In [None]:
from pyspark.sql.functions import *
from pyspark.sql.types import *
from datetime import datetime
from pyspark.sql.functions  import from_unixtime
from pyspark.sql.functions  import to_date
from pyspark.sql import Row
from pyspark.sql.functions import to_json, struct
from pyspark.sql import functions as F

# Mount storage

Databricks can use blob storage (ADLS Gen-2) as a data source. Here is an example of how to mount an ADLS Gen-2 storage file system to DBFS.

First, create an ADLS-Gen2 storage account and define the mount point.

In [None]:
# Name of the ADLS Gen2 storage account (placeholder - replace with your own)
storageAccount = "xxx"
# DBFS path under which the storage account's container will be mounted
mountpoint = "/mnt/xxx"
# abfss:// URI of the container inside the storage account
storageEndPoint = f"abfss://container_name@{storageAccount}.dfs.core.windows.net/"

Then, authenticate against the storage endpoint. To do so, the following steps need to be performed in Azure:
1. **Application registration**: You will need to register an Azure Active Directory (AAD) application. On the Azure portal home page, search for "Azure Active Directory" &rarr; select App registrations &rarr; New registration.
2. **Create secret to the application**: Click on "Certificates & secrets" under the Manage heading &rarr; add a new client secret &rarr; Copy the value
3. **Grant ADLS-Gen2 access to the registered Application**: In the ADLS-Gen2 storage account, navigate to Access Control (IAM) &rarr; Add &rarr; Add role assignment &rarr; Role = Storage Blob Data Contributor; Assign access to = User, group, or service principal; Select = The registered Application

In [None]:
# Service-principal details of the registered application (placeholders).
# NOTE(review): these are credentials - consider reading them from a secret scope
# rather than typing the real values into the notebook.
clientID = "xxx"      # (1) registered Application -> Application (client) ID
tenantID = "xxx"      # (1) registered Application -> Directory (tenant) ID
clientSecret = "xxx"  # (2) registered Application -> copied secret value
oauth2Endpoint = f"https://login.microsoftonline.com/{tenantID}/oauth2/token"

# Settings handed to dbutils.fs.mount as extra_configs: OAuth client-credentials login.
configs = {
    "fs.azure.account.auth.type": "OAuth",
    "fs.azure.account.oauth.provider.type": "org.apache.hadoop.fs.azurebfs.oauth2.ClientCredsTokenProvider",
    "fs.azure.account.oauth2.client.id": clientID,
    "fs.azure.account.oauth2.client.secret": clientSecret,
    "fs.azure.account.oauth2.client.endpoint": oauth2Endpoint,
}

# Mount the container at the chosen mount point. Any failure is reported by
# printing a message (not re-raised), so re-running the cell on an existing
# mount simply prints 'Directory already mounted'.
try:
    dbutils.fs.mount(source=storageEndPoint, mount_point=mountpoint, extra_configs=configs)
except Exception as mount_error:
    error_text = str(mount_error)
    already_mounted = 'Directory already mounted' in error_text
    print('Directory already mounted' if already_mounted else error_text)


These are some useful commands to inspect the mounts

In [None]:
# Show every mount point currently registered in DBFS
all_mounts = dbutils.fs.mounts()
display(all_mounts)

# Show the entries directly under the mount point defined earlier
mounted_files = dbutils.fs.ls(mountpoint)
display(mounted_files)

Files in the ADLS-Gen2 can thus be read

In [None]:
# Load the CSV files of the mounted folder into a Spark DataFrame,
# using the first row of each file as column names
csv_reader = spark.read.format("csv").option("header", True)
df_ord = csv_reader.load("dbfs:/mnt/Gen2/csvFiles")

In [None]:
# Template for creating a table in Databricks that reads the CSV files in place.
# This is not about creating Delta tables.
# The {(schema)} and {csv} markers are placeholders and reach Spark literally -
# replace them with the column definitions and the file format before running.
create_orders_sql = """
CREATE OR REPLACE TABLE Orders
    {(schema)}
USING {csv}
OPTIONS (
    path 'dbfs:/mnt/Gen2/csvFiles',
    header 'true',
    delimiter ','
    )
"""
spark.sql(create_orders_sql)

# Reading and writing to Delta Tables 

A Delta table is actually a set of snappy-compressed Parquet files together with Delta log files. It offers the following benefits:
- Easy rollback, since every change to the table is tracked in the Delta log
- ACID compliance
- Enforce constraint defined in DDL
- Optimized performance

Define paths

In [None]:
# Common parent folder (under the mount) for the Orders data
_orders_root = "/mnt/Gen2/Orders"

# Where the raw data blobs live
original_path = f"{_orders_root}/someFiles"

# Where the Delta output is written
delta_path = f"{_orders_root}/delta"

Create Delta table

This is an example of creating a Delta table from CSV files. The following will happen after executing the cell below:
- Under the defined LOCATION, snappy-compressed parquet files will be created, alongside the _delta_log folder
- You will find the relevant table under the "Data" section in Databricks

In [None]:
# spark.sql() runs one statement per call, so the view and the table are created
# by two separate calls (the original single call with ';' between the
# statements cannot be parsed).
# The {(schema)}, {file format e.g. csv} and {column name} markers are
# placeholders that reach Spark literally - replace them before running.

# Step 1: the Temp View lets Databricks parse the CSV with an explicit schema
# first, before it is turned into a Delta table.
spark.sql(f"""
CREATE OR REPLACE TEMP VIEW Orders_vw
    {{(schema)}}
USING {{file format e.g. csv}}
OPTIONS (
    path '{original_path}',
    header 'true',
    delimiter '|'
    )
""")

# Step 2: LOCATION makes the table an unmanaged one, whose data does not reside
# in Databricks but in the specified path.
# The derived year column is aliased because an unnamed expression column
# (containing parentheses and spaces) is not a valid Delta column name.
spark.sql(f"""
CREATE OR REPLACE TABLE Orders
LOCATION '{delta_path}'
PARTITIONED BY ({{column name}})
AS
SELECT *, EXTRACT(YEAR FROM Event_Date) AS Event_Year FROM Orders_vw
""")

For self-describing file formats like parquet, the syntax can be made easier

In [None]:
# Note: This will not work for CSV, because the CTAS statement cannot infer the schema correctly.
# {column name} is a placeholder that reaches Spark literally - replace it with
# the partition column before running.
spark.sql(f"""
CREATE OR REPLACE TABLE Orders
LOCATION '{delta_path}'
PARTITIONED BY ({{column name}})
AS
    SELECT *
    FROM parquet.`{original_path}`
"""
)

Alternative code using PySpark

In [None]:
# Read the input into a DataFrame and add two columns: the load timestamp and
# the order year (used as the partition column below)
df_ord = (
    spark.read.format("parquet").load(original_path)
    .withColumn("timestamp", current_timestamp())
    .withColumn("O_OrderDateYear", year(col("O_OrderDate")))
)

# Save into the delta path. Go over to the ADLS Gen2 container and you should see new files created in the delta path.
# Files are organized into different folders according to the "partitionBy" value.
df_ord.write.format("delta").partitionBy("O_OrderDateYear").mode("overwrite").save(delta_path)

# Register the files as a table; execute this, then visit the "Data" section in
# Databricks to see it. CREATE OR REPLACE TABLE needs either a column list or an
# AS SELECT query, so the existing Delta files are registered with
# CREATE TABLE ... USING DELTA LOCATION, as the streaming cells of this notebook do.
spark.sql(f"""
CREATE TABLE IF NOT EXISTS Orders
USING DELTA LOCATION '{delta_path}'
""")

After creation, the table can be queried in SQL or Python

In [None]:
# Via SQL.
# The %sql magic only takes effect as the first line of a cell of its own; inside
# this Python cell the same query is issued through spark.sql instead.
display(spark.sql("""
SELECT o.*
FROM Orders o
"""))

# Via Python
deltaTable = spark.read.format("delta").load(delta_path)
deltaTable.show()

Some useful commands

In [None]:
%sql

-- View history of the Delta table
DESCRIBE HISTORY Orders;

-- Query a specific version
SELECT * FROM Orders VERSION AS OF 1;

-- Restore a previous version
RESTORE TABLE Orders VERSION AS OF 5;

-- View details of the Delta table e.g. number of files, partitioning, etc.
DESCRIBE DETAIL Orders;

Optimization
- Compacting small files and indexing via [OPTIMIZE and ZORDER](https://www.confessionsofadataguy.com/exploring-delta-lakes-zorder-and-performance-on-databricks/)
- Remove unused files from a table directory via [VACUUM](https://learn.microsoft.com/en-us/azure/databricks/spark/latest/spark-sql/language-manual/delta-vacuum/); add DRY RUN to first preview what would be deleted

Tables vs Views vs CTE
- Table
    - Managed table: Data is actually stored in DBFS
    - Unmanaged table: Data is stored elsewhere, e.g. ADLS-Gen2
- Views
    - View: Will persist like table
    - Temp View: Persist in the current notebook session only
    - Global Temp View: Can be shared across different notebook sessions, until cluster restarts
- CTE
    - CTE: Referenced within the scope of a SQL statement only


# Streaming data pipeline from EventHub

This is an example of building the Bronze, Silver, and Gold Zone for a streaming data pipeline
- Bronze: Read live data from EventHub, and historical data from ADLS-Gen2. Parse content and union them
- Silver: Implement business rules and data cleansing process and join with lookup tables
- Gold: Data is aggregated

In [None]:
# Database that holds the tables of every zone
db_name = "VehicleSensor"

def get_config(zone):
    """Return the storage and table settings of one pipeline zone.

    Args:
        zone: Zone name (e.g. "Bronze"); it is interpolated into both paths
            and into the table name.

    Returns:
        dict with the keys 'delta_path' (Delta output folder), 'chkpt_path'
        (streaming checkpoint folder) and 'delta_table' (table name without
        the database prefix).
    """
    zone_root = f"/mnt/SensorData/vehiclestreamingdata/{zone}"
    settings = {}
    settings['delta_path'] = f"{zone_root}/delta"
    settings['chkpt_path'] = f"{zone_root}/chkpt"
    settings['delta_table'] = f"VehicleDelta_{zone}"
    return settings

# Create the database first, so that the zone tables can be registered in it.
# (The space before the name is required; without it the statement reads
# "... IF NOT EXISTSVehicleSensor".)
spark.sql(f"CREATE DATABASE IF NOT EXISTS {db_name}")

## Bronze Zone

### Streaming from Kafka

Create Spark DataFrame which reads from the Kafka topic

In [None]:
# Kafka topic to subscribe to.
# NOTE(review): the original comment called this the Event Hub namespace; with
# the Event Hubs Kafka endpoint the topic is normally the event hub name - confirm.
TOPIC = "cookbook-eventhub"
# Host name of the Event Hub namespace plus :9093 (9093 is the port for Kafka)
BOOTSTRAP_SERVERS = "cookbook-eventhub.servicebus.windows.net:9093"

# Go to Event Hub's Shared Access Policies -> click a policy -> copy "Connection string-primary key" here.
# NOTE(review): a connection string is a credential; prefer loading it with
# dbutils.secrets.get(...) over keeping the real value in the notebook.
CONN_STRING = "Endpoint=sb://kafkaenabledeventhubns.servicebus.windows.net/;SharedAccessKeyName=sendreceivekafka;SharedAccessKey=4vxbVwasdasdsdasd4aVcUWBvYp44sdasaasasasasasasvoVE="

# The $ConnectionString and $Default are fixed values, don't update them
EH_SASL = f"kafkashaded.org.apache.kafka.common.security.plain.PlainLoginModule required username=\"$ConnectionString\" password=\"{CONN_STRING}\";"
GROUP_ID = "$Default"

# Read the stream with Spark structured streaming.
# Consider .option("startingOffsets", "earliest") to read from the earliest available offset during testing.
# NOTE(review): GROUP_ID is defined above but the reader passes "POC" as
# kafka.group.id - confirm which consumer group is intended.
kafkaDF = (
    spark.readStream
    .format("kafka")
    .option("subscribe", TOPIC)
    .option("kafka.bootstrap.servers", BOOTSTRAP_SERVERS)
    .option("kafka.sasl.mechanism", "PLAIN")
    .option("kafka.security.protocol", "SASL_SSL")
    .option("kafka.sasl.jaas.config", EH_SASL)
    .option("kafka.request.timeout.ms", "60000")
    .option("kafka.session.timeout.ms", "60000")
    .option("kafka.group.id", "POC")
    .option("failOnDataLoss", "false")
    .option("startingOffsets", "latest")
    .load()
    .withColumn("source", lit(TOPIC))  # Optional: also add the topic as a column
)

# Check that streaming is on and show the schema of the Kafka DataFrame.
# printSchema() prints by itself and returns None, so it is not wrapped in print().
print(kafkaDF.isStreaming)
kafkaDF.printSchema()

#It should then output something like this:
#
#True
#root
# |-- key: binary (nullable = true)
# |-- value: binary (nullable = true)
# |-- topic: string (nullable = true)
# |-- partition: integer (nullable = true)
# |-- offset: long (nullable = true)
# |-- timestamp: timestamp (nullable = true)
# |-- timestampType: integer (nullable = true)
# |-- source: string (nullable = true)

: 

Parse the Kafka message

In [None]:
# Schema of the JSON document carried in the 'value' field of each Kafka message
jsonschema = (
    StructType()
    .add("id", StringType())
    .add("eventtime", TimestampType())
    .add("rpm", IntegerType())
    .add("speed", IntegerType())
    .add("kms", IntegerType())
    .add("lfi", IntegerType())
    .add("lat", DoubleType())
    .add("long", DoubleType())
)

# Cast key and value from binary to string, keep the source column, and parse
# the JSON text of 'value' into a struct column
newkafkaDF = (
    kafkaDF.selectExpr("CAST(key AS STRING)", "CAST(value AS STRING)", "source")
    .withColumn('vehiclejson', from_json(col('value'), schema=jsonschema))
)

# Flatten the json. "source" is carried along as well, because the cell that
# writes the stream selects it; leaving it out here makes that select fail.
kafkajsonDF = newkafkaDF.select("key", "value", "vehiclejson.*", "source")

Writing the streaming data to Delta table

In [None]:
# Paths and table name of the Bronze zone
bronze_cfg = get_config("Bronze")

# Columns written to the Bronze Delta files
bronze_columns = ["id", "eventtime", "rpm", "speed", "kms", "lfi", "lat", "long", "source"]

# Start the streaming write in append mode. The checkpoint location is set so
# that the stream can recover after a failure.
query = (
    kafkajsonDF.selectExpr(*bronze_columns)
    .writeStream.format("delta")
    .outputMode("append")
    .option("checkpointLocation", bronze_cfg['chkpt_path'])
    .start(bronze_cfg['delta_path'])
)

# Register a Delta table over the Bronze delta path
spark.sql(f"""
CREATE TABLE IF NOT EXISTS {db_name}.{bronze_cfg['delta_table']}
USING DELTA LOCATION '{bronze_cfg['delta_path']}'
""")

### Streaming from ADLS-Gen2

In case you have historical data in ADLS-Gen2, you can stream-read it from there as new files are ingested.

In [None]:
# Function to read data from ADLS gen-2 using the readStream API and write it in delta format
def append_batch_source():
    """Stream parquet files from ADLS Gen2 into the "Historical" zone's Delta path.

    Reads the files under the mount path with the module-level ``jsonschema``,
    tags every row with source = "historical", and starts an append-mode
    streaming write to the delta/checkpoint paths of ``get_config("Historical")``.
    The zone name matches the one used by the cells that register and read the
    historical table; the earlier "Hist" name sent the data to a different folder.

    Returns:
        The object returned by ``writeStream...start()`` for the started write.
    """
    topic = "historical"
    mnt_path = "xxx"  # placeholder - replace with the mounted folder holding the files
    historical_cfg = get_config("Historical")

    historical_df = (
        spark.readStream
        .schema(jsonschema)
        .format("parquet")
        .load(mnt_path)
        .withColumn("source", lit(topic))
    )

    # See here about the explanations of different options for trigger
    # https://spark.apache.org/docs/latest/structured-streaming-programming-guide.html#triggers
    query = (
        historical_df.selectExpr(
            "id", "eventtime", "rpm", "speed", "kms", "lfi", "lat", "long", "source"
        )
        .writeStream.format("delta")
        .option("checkpointLocation", historical_cfg['chkpt_path'])
        .trigger(processingTime='2 seconds')
        .outputMode("append")
        .start(historical_cfg['delta_path'])
    )

    return query


# Start the historical stream, then register a Delta table over the
# "Historical" zone's delta path
append_batch_source()
historical_cfg = get_config("Historical")
spark.sql(f"""
CREATE TABLE IF NOT EXISTS {db_name}.{historical_cfg['delta_table']}
USING DELTA LOCATION '{historical_cfg['delta_path']}'
""")

In [None]:
# Inspect schema. spark.sql only returns a DataFrame; display() renders its rows
# so the column list and table details are actually shown.
display(spark.sql(f"""
describe formatted {db_name}.{get_config("Bronze")['delta_table']}
"""))

### Union the live and historical data, and generate Temp View

With live and historical delta tables created, we can union them for subsequent usage

In [None]:
# Stream data from the Bronze and Historical tables.
# The zone names use single quotes inside the double-quoted f-strings: reusing
# the same quote character is a syntax error before Python 3.12.
df_bronze = (
    spark.readStream.format("delta")
    .option("latestFirst", "true")
    .table(f"{db_name}.{get_config('Bronze')['delta_table']}")
)
df_historical = (
    spark.readStream.format("delta")
    .option("latestFirst", "true")
    .table(f"{db_name}.{get_config('Historical')['delta_table']}")
)

# Union the Bronze and historical streams. The Temp View can be used like a CTE in SQL statements.
df_bronze_all = df_bronze.union(df_historical)
df_bronze_all.createOrReplaceTempView("vw_TempBronzeAll")

In [None]:
%sql
-- Number of rows in the combined Bronze + historical view
SELECT COUNT(*)
FROM vw_TempBronzeAll

## Silver Zone

### Connect to Azure SQL DB for lookup tables

Establish connection to Azure SQL DB

In [None]:
# Connection details of the Azure SQL DB that holds the VehicleInformation and
# LocationInfo lookup tables. User name and password come from the secret scope.
sqldbusername = dbutils.secrets.get(scope="KeyVaultScope", key="VehicleInformationDBUserId")
sqldbpwd = dbutils.secrets.get(scope="KeyVaultScope", key="VehicleInformationDBPwd")

jdbcHostname = "vehicledemoinformatiosrvr.database.windows.net"
jdbcDatabase = "VehicleInformationDB"
jdbcPort = 1433

# The URL carries only host, port and database. The credentials are supplied
# through connectionProperties below, so the password is not embedded in a URL
# string that may show up in logs or error messages.
jdbcUrl = "jdbc:sqlserver://{0}:{1};database={2}".format(jdbcHostname, jdbcPort, jdbcDatabase)
connectionProperties = {
    "user": sqldbusername,
    "password": sqldbpwd,
    "driver": "com.microsoft.sqlserver.jdbc.SQLServerDriver",
}

Retrieve the two lookup tables and create Temp Views

In [None]:
# Pull the vehicle lookup columns from dbo.VehicleInformation (Azure SQL DB)
# through a sub-query, show them, and expose them as the temp view vw_VehicleMaster
vehicleInfo = "(select VehicleId,Make,Model,Category,ModelYear from dbo.VehicleInformation) vehicle"
df_vehicleInfo = spark.read.jdbc(jdbcUrl, vehicleInfo, properties=connectionProperties)
display(df_vehicleInfo)
df_vehicleInfo.createOrReplaceTempView("vw_VehicleMaster")

In [None]:
# Pull the location lookup columns from dbo.LocationInfo (Azure SQL DB)
# through a sub-query, show them, and expose them as the temp view vw_LocationMaster
locationInfo = "(select Borough,Location,Latitude,Longitude from dbo.LocationInfo) vehicle"
df_locationInfo = spark.read.jdbc(jdbcUrl, locationInfo, properties=connectionProperties)
display(df_locationInfo)
df_locationInfo.createOrReplaceTempView("vw_LocationMaster")

### Join data and output Silver table

Join the Bronze data with Lookup Tables, and save data in Delta format

In [None]:
# Enrich the combined Bronze stream with the two lookup views (left joins) and
# derive Year/Month/Day/Hour from eventtime.
silver_sql = "select s.*,m.Make,m.Model,m.Category, Year(eventtime) as Year, month(eventtime) as Month,day(eventtime) as Day, \
                     hour(eventtime) as Hour,l.Borough,l.Location  \
                     from vw_TempBronzeAll s \
                     left join vw_VehicleMaster m on s.id = m.VehicleId \
                     left join vw_LocationMaster l on s.lat = l.Latitude and s.long = l.Longitude"

# Paths of the Silver zone
silver_cfg = get_config("Silver")

# outputMode("append"): only new rows are written to the output sink
# option("MergeSchema","True"): sets the Delta writer's MergeSchema option
# Note: df_silver holds whatever writeStream...start() returns (the running write), not a DataFrame.
df_silver = (
    spark.sql(silver_sql)
    .writeStream.format("delta")
    .option("MergeSchema", "True")
    .outputMode("append")
    .option("checkpointLocation", silver_cfg['chkpt_path'])
    .start(silver_cfg['delta_path'])
)

Create Delta table for Silver Zone

In [None]:
# Create Delta table over the Silver delta path
spark.sql(f"""
CREATE TABLE IF NOT EXISTS {db_name}.{get_config("Silver")['delta_table']}
USING DELTA LOCATION '{get_config("Silver")['delta_path']}'
""")

# Inspect schema. display() renders the rows of the DataFrame that spark.sql
# returns; without it the result of this mid-cell statement is discarded.
display(spark.sql(f"""
describe formatted {db_name}.{get_config("Silver")['delta_table']}
"""))

# Read the Silver Delta table as a stream and show a running count per Make.
# The table name is qualified with db_name because the table was created inside
# that database; the bare name is looked up in the current database instead.
display(
    spark.readStream.format("delta")
    .table(f"{db_name}.{get_config('Silver')['delta_table']}")
    .groupBy("Make")
    .count()
    .orderBy("Make")
)

## Gold Zone

Aggregate the data and write to Gold Delta table

In [None]:
# Aggregate the Silver stream into hourly counts and save them in Delta format.
# Notes on the choices below:
# - The zone name uses single quotes inside the double-quoted f-string: reusing
#   the same quote character is a syntax error before Python 3.12.
# - The watermark is set on "eventtime", the same column the window is built on;
#   the Silver data written by this notebook has no "timestamp" column.
# - The Delta sink accepts the append and complete output modes, not update.
#   With the watermark in place, append emits each window once it is final.
# See here for a discussion about withWatermark and outputMode:
# https://dvirgiln.github.io/spark-structured-streaming-output-modes/
# Note: df_gold holds whatever writeStream...start() returns (the running write), not a DataFrame.
df_gold = (
    spark.readStream.format("delta")
    .option("latestFirst", "true")
    .table(f"{db_name}.{get_config('Silver')['delta_table']}")
    # Apply the watermark to handle late data, then perform the aggregation
    .withWatermark("eventtime", "4 minutes")
    .groupBy(window('eventtime', "1 hour"), "Make", "Borough", "Location", "Month", "Day", "Hour")
    .count()
    .writeStream.format("delta")
    .outputMode("append")
    .option("checkpointLocation", get_config("Gold")['chkpt_path'])
    .start(get_config("Gold")['delta_path'])
)

# Create Delta table over the Gold delta path
spark.sql(f"""
CREATE TABLE IF NOT EXISTS {db_name}.{get_config("Gold")['delta_table']}
USING DELTA LOCATION '{get_config("Gold")['delta_path']}'
""")