# Mount storage

In [None]:
# Storage account to mount and the DBFS path its "rawdata" container is mounted under.
storageAccount="xxx"
mountpoint = "/mnt/xxx"
storageEndPoint ="abfss://rawdata@{}.dfs.core.windows.net/".format(storageAccount)
print ('Mount Point ='+mountpoint)

# ClientId, TenantId and Secret are for the Application (ADLSGen2App) we have created as part of this recipe.
# NOTE(review): never commit real values here; prefer reading them from a secret scope
# (e.g. dbutils.secrets.get(scope=..., key=...)) instead of hardcoding them in the notebook.
clientID ="xxx" # Also called the Application Id
tenantID ="xxx"
clientSecret ="xxx"
oauth2Endpoint = "https://login.microsoftonline.com/{}/oauth2/token".format(tenantID)


# OAuth (service principal / client credentials) settings passed to the mount call.
configs = {"fs.azure.account.auth.type": "OAuth",
           "fs.azure.account.oauth.provider.type": "org.apache.hadoop.fs.azurebfs.oauth2.ClientCredsTokenProvider",
           "fs.azure.account.oauth2.client.id": clientID,
           "fs.azure.account.oauth2.client.secret": clientSecret,
           "fs.azure.account.oauth2.client.endpoint": oauth2Endpoint}

# Mount the container. Re-running the cell on an existing mount is reported and tolerated;
# any other failure is re-raised so that later cells do not run against a missing mount.
try:
  dbutils.fs.mount(
    source = storageEndPoint,
    mount_point = mountpoint,
    extra_configs = configs)
except Exception as e:
  if 'Directory already mounted' in str(e):
    print('Directory already mounted')
  else:
    raise


# Read from EventHub

In [None]:
from pyspark.sql.functions import *
from pyspark.sql.types import *
from datetime import datetime
from pyspark.sql.functions  import from_unixtime
from pyspark.sql.functions  import to_date
from pyspark.sql import Row
from pyspark.sql.functions import to_json, struct
from pyspark.sql import functions as F

In [None]:
# The $ConnectionString and $Default are fixed values, don't update them
# Kafka topic to subscribe to; it is also stored in the "source" column below.
TOPIC = "Event Hub namespace"
# Kafka bootstrap server (host:port) of the Event Hubs Kafka endpoint.
BOOTSTRAP_SERVERS = "Host name of Event Hub:9093"
CONN_STRING = "Go to Event Hub's Shared Access Policies -> Click onto a policy -> Copy Connection string–primary key here"
# JAAS config: the username is the literal "$ConnectionString", the password is the connection string itself.
EH_SASL = f"kafkashaded.org.apache.kafka.common.security.plain.PlainLoginModule required username=\"$ConnectionString\" password=\"{CONN_STRING}\";"
GROUP_ID = "$Default"

# Read the stream using Spark SQL (structured streaming).
# Consider .option("startingOffsets", "earliest") to read from the earliest available offset during testing.
# The chain is wrapped in parentheses instead of using backslash continuations: a stray space after
# a trailing backslash is a SyntaxError.
kafkaDF = (
    spark.readStream
    .format("kafka")
    .option("subscribe", TOPIC)
    .option("kafka.bootstrap.servers", BOOTSTRAP_SERVERS)
    .option("kafka.sasl.mechanism", "PLAIN")
    .option("kafka.security.protocol", "SASL_SSL")
    .option("kafka.sasl.jaas.config", EH_SASL)
    .option("kafka.request.timeout.ms", "60000")
    .option("kafka.session.timeout.ms", "60000")
    .option("kafka.group.id", GROUP_ID)  # use the consumer group configured above
    .option("failOnDataLoss", "false")
    .option("startingOffsets", "latest")
    .load()
    .withColumn("source", lit(TOPIC))  # Optional: also add the topic as a column
)

In [None]:
# Check that streaming is on and inspect the schema of the Kafka DataFrame
print(kafkaDF.isStreaming)
# printSchema() prints the schema tree itself and returns None, so it is not wrapped in print()
kafkaDF.printSchema()

#It should then output something like this:
#
#True
#root
# |-- key: binary (nullable = true)
# |-- value: binary (nullable = true)
# |-- topic: string (nullable = true)
# |-- partition: integer (nullable = true)
# |-- offset: long (nullable = true)
# |-- timestamp: timestamp (nullable = true)
# |-- timestampType: integer (nullable = true)
# |-- source: string (nullable = true)

Parse the Kafka message

In [None]:
# Cast the binary Kafka key and value to strings. The "source" column added when the stream
# was read is kept as well, because the downstream write and count cells select it by name.
newkafkaDF = kafkaDF.selectExpr("CAST(key AS STRING)", "CAST(value AS STRING)", "source")

# Schema of the vehicle data JSON carried in the message value
jsonschema = (
    StructType()
    .add("id", StringType())
    .add("timestamp", TimestampType())
    .add("rpm", IntegerType())
    .add("speed", IntegerType())
    .add("kms", IntegerType())
)
# Parse the JSON string in "value" into a struct column
newkafkaDF = newkafkaDF.withColumn('vehiclejson', from_json(col('value'), schema=jsonschema))

# Flatten the JSON struct into top-level columns next to key, value and source
kafkajsonDF = newkafkaDF.select("key", "value", "source", "vehiclejson.*")

Writing the streaming data to Delta tables

In [None]:
# NOTE: despite its name, raw_destination holds the checkpoint directory of the raw stream;
# the data itself is written to the path passed to start() below.
raw_destination = "/mnt/Blob/Vehicle_Chkpoint_raw/"
delta_table = "VehicleDetails_Delta"

# Save the raw data as-is in Delta format. The checkpoint lets the query recover after a failure.
# The checkpoint option previously referenced an undefined name (data_destination).
(
    kafkajsonDF.selectExpr("id", "timestamp", "rpm", "speed", "kms", "source")
    .writeStream.format("delta")
    .outputMode("append")
    .option("checkpointLocation", raw_destination)
    .option("mergeSchema", "true")
    .start("/mnt/Blob/Vehicle_Raw")
)

In [None]:
# Count events per vehicle id in 1-minute windows (with a 4-minute watermark on "timestamp")
# and stream the counts to a separate Delta location.
# NOTE(review): with outputMode("complete") Spark keeps all aggregate state, so the watermark
# presumably does not drop late data here — confirm whether "append" was intended.
agg_destination = "/mnt/Blob/Vehicle_Agg"

vehicle_counts = (
    kafkajsonDF
    .withWatermark("timestamp", "4 minutes")
    .groupBy(window('timestamp', "1 minutes"), 'id')
    .count()
    .coalesce(1)
)

(
    vehicle_counts.writeStream
    .format("delta")
    .outputMode("complete")
    .option("truncate", "false")  # NOTE(review): looks like a console-sink option — confirm it is needed
    .option("checkpointLocation", "/mnt/Blob/Vehicle_Chkpoint1/")
    .start(agg_destination)
)

In [None]:
%sql
-- Register the streamed raw data as a Delta table.
-- The %sql magic must be the first line of the cell, and Python variables are not visible in SQL,
-- so the table name (VehicleDetails_Delta) and the path the raw stream writes to are spelled out.
CREATE TABLE IF NOT EXISTS VehicleDetails_Delta
USING DELTA
LOCATION '/mnt/Blob/Vehicle_Raw'

In [None]:
# Read the Delta table as a stream and show a running event count per source.
vehicle_stream = spark.readStream.format("delta").table(delta_table)
source_counts = vehicle_stream.groupBy("source").count().orderBy("source")
display(source_counts)

# Reading and writing to Delta Tables 

A Delta table can be thought of as a set of snappy-compressed Parquet files plus a Delta log. This offers the following benefits:
- Easy rollback, since every change to the table is tracked in the Delta log
- ACID compliance
- Constraint enforcement

Define paths

In [None]:
# Locations of the Orders data: the input Parquet files and the Delta output path.
ord_path, delta_path = (
    "/mnt/Gen2/Orders/parquetFiles",  # input
    "/mnt/Gen2/Orders/delta",         # Delta output path
)

Save into Delta table

In [None]:
# Load the Parquet input, then add a load timestamp and the order year used for partitioning.
orders_parquet = spark.read.format("parquet").load(ord_path)
df_ord = (
    orders_parquet
    .withColumn("timestamp", current_timestamp())
    .withColumn("O_OrderDateYear", year(col("O_OrderDate")))
)

# Write to the Delta path, overwriting what is there. New files should then appear under the
# Delta path in the ADLS Gen2 container, organized into one folder per "partitionBy" value.
orders_writer = df_ord.write.format("delta").partitionBy("O_OrderDateYear").mode("overwrite")
orders_writer.save(delta_path)

In [None]:
%sql
-- Alternatively, convert the existing Parquet files to Delta format in place, so that a Delta table
-- can be created pointing to that location. After running this, _delta_log is created in the
-- Parquet folder, and the CREATE TABLE command can be run with LOCATION set to that folder.
-- Python variables are not interpolated in SQL cells, so the path (ord_path) is written out.
CONVERT TO DELTA parquet.`/mnt/Gen2/Orders/parquetFiles`

In [None]:
%sql
-- Create the Delta table. Visit the "Data" section in Databricks to see the relevant table.
-- LOCATION needs a string literal; this is the value of delta_path from the Python cell.
DROP TABLE IF EXISTS Orders;
CREATE TABLE Orders
USING DELTA
LOCATION '/mnt/Gen2/Orders/delta'

Read Delta table

In [None]:
%sql
-- Via SQL (the %sql magic has to be the first line of the cell)
SELECT o.*
FROM Orders o

In [None]:
# Via Python: load the Delta files from delta_path as a batch DataFrame and print a sample of rows.
delta_reader = spark.read.format("delta")
deltaTable = delta_reader.load(delta_path)
deltaTable.show()

Optimization
- Combining multiple small files via [OPTIMIZE and ZORDER](https://www.confessionsofadataguy.com/exploring-delta-lakes-zorder-and-performance-on-databricks/)