# Purpose
This code will: 
1. Retrieve the power consumption files (JSON format) from the Data Lake (landing container)
1. Use the notebook parameters (widgets) to configure which date and time folder to read.
1. Create the "iotSmartGrid.powerConsumption" table, if required.
1. Merge the new data into the powerConsumption table.

# Configuration
To use this sample code, you must provide the Data Lake storage account key in Cmd 2. Ideally, you should create a secret scope to store the key value rather than placing it in the notebook.
For more information, please see this link - https://learn.microsoft.com/en-us/azure/databricks/security/secrets/secret-scopes#create-a-databricks-backed-secret-scope

In [0]:
# Storage account host, defined once so the Spark configuration key and both
# container URLs below cannot drift apart.
storageAccount = "sbdemostaiotsmartgrid"
storageHost = f"{storageAccount}.dfs.core.windows.net"

# Configuring my account key so Databricks can access the Data lake.
# The key is read from a Databricks secret scope instead of being hard-coded.
spark.conf.set(
    f"fs.azure.account.key.{storageHost}",
    dbutils.secrets.get(scope="sbdemokviotsmartgrid", key="sbdemostaiotsmartgrid"),
)

# Define the location of my files:
#   landingLocation   - raw JSON files in the "landing" container
#   processedLocation - Delta table data in the "processed" container
landingLocation = f"abfss://landing@{storageHost}/iotSmartGridData"
processedLocation = f"abfss://processed@{storageHost}/iotSmartGrid"

In [0]:
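# Uncomment and execute the lines below to reset the demo (removes all widgets, drops the iotSmartGrid schema, and deletes the processed data)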

# dbutils.widgets.removeAll()
# spark.sql("DROP SCHEMA iotSmartGrid CASCADE")
# dbutils.fs.rm(processedLocation, True)

Out[34]: False

In [0]:
# Notebook parameter selecting which landing sub-folder (date/time) to read.
defaultLandingFolder = "2023/01/01/000000"
dbutils.widgets.text("landing_folder", defaultLandingFolder)

In [0]:
# Load the Smart Grid power data (JSON files) from the Data Lake, using the
# folder currently selected in the "landing_folder" widget.
landingFolder = dbutils.widgets.get("landing_folder")
sourcePath = f"{landingLocation}/{landingFolder}/*.json"

gridDataDf = spark.read.json(sourcePath)

In [0]:
from pyspark.sql.functions import *
from pyspark.sql.types import *

# Replace the string measurementDate column with its timestamp equivalent
gridDataDf = gridDataDf.withColumn(
    "measurementDate",
    to_timestamp(col("measurementDate")),
)


In [0]:
from delta.tables import *

# Rows are matched between the existing table ("target") and the new batch
# ("src") on measurementDate, meterId and zipCode.
mergeCondition = "target.measurementDate = src.measurementDate and target.meterId = src.meterId and target.zipCode = src.zipCode"

if not DeltaTable.isDeltaTable(spark, processedLocation):
    # No Delta table at processedLocation yet: write the batch as the initial data
    gridDataDf.write.mode("overwrite").format("delta").save(processedLocation)
else:
    # A Delta table already exists: insert only the rows that have no match
    targetTable = DeltaTable.forPath(spark, processedLocation)
    (
        targetTable.alias("target")
        .merge(source=gridDataDf.alias("src"), condition=mergeCondition)
        .whenNotMatchedInsertAll()
        .execute()
    )


In [0]:
# Register the schema and the external Delta table over processedLocation,
# unless they already exist.
# Note: spark.sql is used (rather than a %sql cell) so that an f-string can
# inject processedLocation into the statement.

spark.sql("CREATE SCHEMA IF NOT EXISTS iotSmartGrid")

createTableSql = f"CREATE EXTERNAL TABLE IF NOT EXISTS iotSmartGrid.powerConsumption USING delta LOCATION '{processedLocation}'"
spark.sql(createTableSql)

Out[39]: DataFrame[]

In [0]:
%sql

-- Inspect the table definition. In the detailed information, look for:
--   Location: stored in the storage account
--   Provider (format): Delta

DESCRIBE TABLE EXTENDED iotSmartGrid.powerConsumption

col_name,data_type,comment
measurementDate,timestamp,
measurementInKWh,double,
meterId,string,
zipCode,bigint,
,,
# Detailed Table Information,,
Catalog,spark_catalog,
Database,iotSmartGrid,
Table,powerConsumption,
Type,EXTERNAL,


In [0]:
%sql

-- Show the Delta transaction log, one row per table version.
-- If you execute the merge cell (Cmd 7) again, a new version is created with no data changes (numOutputRows = 0).

SELECT
  version,
  operationMetrics,
  operationMetrics.numOutputRows,
  operationMetrics.numTargetRowsInserted,
  operationMetrics.numTargetRowsUpdated,
  operationMetrics.numTargetRowsDeleted
FROM (DESCRIBE HISTORY iotSmartGrid.powerConsumption)

version,operationMetrics,numOutputRows,numTargetRowsInserted,numTargetRowsUpdated,numTargetRowsDeleted
0,"Map(numFiles -> 3, numOutputRows -> 1080, numOutputBytes -> 15725)",1080,,,


In [0]:
%sql

-- Testing aid only: count the rows loaded per measurement timestamp.
-- Do not do this in production!

SELECT
  measurementDate,
  COUNT(*) AS cnt
FROM iotSmartGrid.powerConsumption
GROUP BY measurementDate;

measurementDate,cnt
2023-01-01T00:15:00.000+0000,360
2023-01-01T00:10:00.000+0000,360
2023-01-01T00:05:00.000+0000,360
