### Connections to Storage options
- Storage Account Keys
- Azure Groups and Credential passthrough
- SAS Tokens
- Service principals

### Cleaning Beginnings

In [0]:
dbutils.fs.rm('/FileStore/tables/ingestion/raw',True)

True

In [0]:
%sql
DROP SCHEMA lakehouse CASCADE;

org.apache.spark.sql.catalyst.analysis.NoSuchNamespaceException: [SCHEMA_NOT_FOUND] The schema `lakehouse` cannot be found. Verify the spelling and correctness of the schema and catalog.
If you did not qualify the name with a catalog, verify the current_schema() output, or qualify the name with the correct catalog.
To tolerate the error on drop use DROP SCHEMA IF EXISTS. SQLSTATE: 42704
	at org.apache.spark.sql.errors.QueryCompilationErrors$.noSuchNamespaceError(QueryCompilationErrors.scala:1725)
	at org.apache.spark.sql.execution.datasources.v2.DropNamespaceExec.run(DropNamespaceExec.scala:48)
	at org.apache.spark.sql.execution.datasources.v2.V2CommandExec.$anonfun$result$2(V2CommandExec.scala:48)
	at org.apache.spark.sql.execution.SparkPlan.runCommandWithAetherOff(SparkPlan.scala:180)
	at org.apache.spark.sql.execution.SparkPlan.runCommandInAetherOrSpark(SparkPlan.scala:191)
	at org.apache.spark.sql.execution.datasources.v2.V2CommandExec.$anonfun$result$1(V2CommandExec.scala:48)
	at 

### Create Table with CDF Enabled

In [0]:
%sql
CREATE SCHEMA lakehouse

In [0]:
%sql
CREATE TABLE IF NOT EXISTS lakehouse.watermark (
  schemaName STRING,
  tableName STRING,
  watermarkType STRING,
  timestampWatermark TIMESTAMP,
  integerWatermark INT,
  bigIntWatermark BIGINT,
  stringWatermark STRING
) TBLPROPERTIES (delta.enableChangeDataFeed = true)

In [0]:
%sql
DESCRIBE HISTORY lakehouse.watermark

version,timestamp,userId,userName,operation,operationParameters,job,notebook,clusterId,readVersion,isolationLevel,isBlindAppend,operationMetrics,userMetadata,engineInfo
0,2024-08-26T03:36:50Z,7285002445872367,vasaicrow@gmail.com,CREATE TABLE,"Map(partitionBy -> [], clusterBy -> [], description -> null, isManaged -> true, properties -> {""delta.enableChangeDataFeed"":""true""}, statsOnLoad -> false)",,List(3780456485084141),0826-031505-kc4bi39o,,WriteSerializable,True,Map(),,Databricks-Runtime/15.4.x-scala2.12


### Using Python to enable CDF

In [0]:
def TurnCDFOn(schema, table):   
    spark.sql(f"ALTER TABLE {schema}.{table} SET TBLPROPERTIES (delta.enableChangeDataFeed = true)")
    print(f'TurnCDFOn: Turned CDF on for {schema}.{table}')

def TurnCDFOff(schema, table):   
    spark.sql(f"ALTER TABLE {schema}.{table} SET TBLPROPERTIES (delta.enableChangeDataFeed = false)")
    print(f'TurnCDFOn: Turned CDF off for {schema}.{table}')    

In [0]:
# TurnCDFOff('lakehouse','watermark')
TurnCDFOn('lakehouse','watermark')

TurnCDFOn: Turned CDF on for lakehouse.watermark


In [0]:
def isCDFOn(schema,table):
    statement = f"SHOW TBLPROPERTIES {schema}.{table}"
    print(statement)
    tlbPropertiesDF = spark.sql(statement)
    tblProperties = tlbPropertiesDF.where(tlbPropertiesDF.key == "delta.enableChangeDataFeed").collect()
    nummatching = len(tblProperties)
    if (nummatching == 0):
        print(f"IsCDFOn: No CDF Table properties found {schema}.{table}")
        return False
    elif nummatching == 1:
        isOn = tblProperties[0][1]
        if isOn != "true":
            return False
        else:
            return True
    print(f"IsCDFOn: CDF- This should not happen. Should be 1 {schema}.{table}")


In [0]:
isCDFOn('lakehouse','watermark')

SHOW TBLPROPERTIES lakehouse.watermark


True

In [0]:
def EnsureCDFisOn(schema,table):
    isOn = isCDFOn(schema,table)
    if isOn:
        print(schema,table,"EnsureCDFisOn: CDF already on")
        return
    TurnCDFOn(schema,table)
    print(schema,table,"Turned CDF On")

def TurnCDFOnForAllTables():
    schemasDF = spark.sql("SHOW SCHEMAS")
    results = []
    for schema in schemasDF.collect():
        if schema['databaseName'] != 'default':
            tablesDF = spark.sql(f"show tables from {schema['databaseName']}")
            for table in tablesDF.collect():                
                if table['isTemporary'] == False:
                    EnsureCDFisOn(table['database'],table['tableName'])

In [0]:
TurnCDFOnForAllTables()


SHOW TBLPROPERTIES lakehouse.watermark
lakehouse watermark EnsureCDFisOn: CDF already on


## Loading Files using self-managed watermarks

In [0]:
dbutils.fs.mkdirs('/FileStore/tables/ingestion')
dbutils.fs.mkdirs('/FileStore/tables/ingestion/raw')
dbutils.fs.mkdirs('/FileStore/tables/ingestion/raw/updates')

True

In [0]:
# dbutils.fs.rm('/FileStore/tables/ingestion/raw',True)

### Recursive Directory Read

In [0]:
def RecursiveDirectoryToDataFrame(path,allFiles=None):
    createDataFrame = False
    if allFiles is None:
        createDataFrame = True
        allFiles=[]
    ls = dbutils.fs.ls(path)
    for l in ls:
        if (l.isFile()):
            allFiles.append(l)
        elif l.isDir():
            if l != path:
                RecursiveDirectoryToDataFrame(l.path,allFiles)
    if createDataFrame:
        return spark.createDataFrame(allFiles)
    else:
        return

In [0]:
filepath = 'dbfs:/FileStore/tables/ingestion/raw'
allSourceFilesDf =RecursiveDirectoryToDataFrame(filepath)
display(allSourceFilesDf)

path,name,size,modificationTime
dbfs:/FileStore/tables/ingestion/raw/employees.csv,employees.csv,335,1724643436000
dbfs:/FileStore/tables/ingestion/raw/updates/employees_updated.csv,employees_updated.csv,336,1724643522000


In [0]:
from pyspark.sql import functions as F
import datetime
def GetWatermark(schema,table):
    watermarktable = "lakehouse.watermark"
    df = spark.read.table(watermarktable)
    display(df)
    df = df.where((F.col("schemaName") == schema) & (F.col("tableName") == table))
    df.cache()
    count = df.count()
    if count == 1:
        return df
    elif count == 0:
        updateWaterMarkTableSchema = "schemaName STRING,tableName STRING,watermarkType STRING,timestampWatermark TIMESTAMP,integerWatermark INT,bigIntWatermark BIGINT,stringWatermark STRING"
        updateList = [{
            "schemaName":schema, 
            "tableName":table, 
            "watermarkType":"bigint", 
            "timestampWatermark":None,
            "integerWatermark":None, 
            "bigIntWatermark":0, 
            "stringWatermark":None
        }]
        dfUpdates = spark.createDataFrame(updateList,updateWaterMarkTableSchema)
        dfUpdates.show()
        return dfUpdates
    else:
        return None

In [0]:
startingWatermarkDF = GetWatermark("lakehouse","employees")

schemaName,tableName,watermarkType,timestampWatermark,integerWatermark,bigIntWatermark,stringWatermark
lakehouse,employees,bigint,,,0,


In [0]:
from delta import * 

def UpsertWaterMark(dfUpdates):
    watermarktable = 'lakehouse.watermark'
    print("watermarkTable", watermarktable)
    watermark = DeltaTable.forName(spark, tableOrViewName=watermarktable)
    watermark.alias('watermark').merge(
        dfUpdates.alias('updates'), 'watermark.schemaName = updates.schemaName AND watermark.tableName = updates.tableName').whenMatchedUpdateAll().whenNotMatchedInsertAll().execute()

In [0]:
UpsertWaterMark(startingWatermarkDF)

watermarkTable lakehouse.watermark


In [0]:
def getWaterMarkValue(startingWatermarkDF):
    watermarkRow = startingWatermarkDF.collect()[0]
    watermarktype = watermarkRow.watermarkType
    print(watermarktype)
    if watermarktype == "bigint":
        watermarkvalue = watermarkRow.bigIntWatermark
    elif watermarktype == 'timestamp':
        watermarkvalue = watermarkRow.timestampWatermark
    elif watermarktype == 'integer':
        watermarkvalue = watermarkRow.integerWatermark
    elif watermarktype == 'string':
        watermarkvalue = watermarkRow.stringWatermark
    else:
        print("Unknown watermark type", watermarktype)
        watermarkvalue = None
    return watermarkvalue



### Update Watermark value

In [0]:
from pyspark.sql.functions import lit
def UpdatewatermarkValue(watermarkDF,maxValue):
  print(maxValue)
  watermarkRow = startingWatermarkDF.collect()[0]
  watermarkDF.createOrReplaceTempView('tempWatermarkRow')
  watermarktype = watermarkRow.watermarkType
  if watermarktype == "bigint":
      watermarkDF = watermarkDF.withColumn("bigIntWatermark",lit(maxValue))
  elif watermarktype == 'timestamp':
      watermarkDF = watermarkDF.withColumn("timestampWatermark",lit(maxValue))
  elif watermarktype == 'integer':
      watermarkDF = watermarkDF.withColumn("integerWatermark",lit(maxValue))
  elif watermarktype == 'string':
      watermarkDF = watermarkDF.withColumn("stringWatermark",lit(maxValue))
  return watermarkDF
  

### Get Source files into a DataFrame

In [0]:
def GetSourceDataFrame(path,fileformat,watermarkColumn,schema,table):
  allSourceFilesDF = RecursiveDirectoryToDataFrame(path)
  startingWatermarkDF = GetWatermark(schema,table).cache()
  if startingWatermarkDF is None or startingWatermarkDF.count() != 1:
    print("Error: null returned from GetWatermark or rows not 1")
    return None
  watermarkValue = getWaterMarkValue(startingWatermarkDF)  
  if watermarkValue is None:
    print("Error getting watermark value")  
  # display(watermarkValue)
  filteredDF = allSourceFilesDf.where(F.col(watermarkColumn) > watermarkValue).orderBy(F.col(watermarkColumn)).cache()
  # return allSourceFilesDf,watermarkValue
  # display(allSourceFilesDf)
  # display(filteredDF)
  recordsDF = None
  for f in filteredDF.collect():
    path = f["path"]
    dfRecords = spark.read.load(path,format=fileformat,header=True)
    if recordsDF is None:
      recordsDF = dfRecords
    else:
      recordsDF = recordsDF.union(dfRecords)
  recordsDF = recordsDF.cache()
  maxValue = filteredDF.agg({watermarkColumn:"max"}).collect()[0]
  return recordsDF,maxValue
  

In [0]:
# dbutils.fs.rm(filepath,True)

In [0]:
sourcePath = filepath
watermarkColumn = 'modificationTime'
fileformat = 'csv'
schema = 'lakehouse'
table = 'employees'
recordsDf,maxValue = GetSourceDataFrame(sourcePath,fileformat,watermarkColumn,schema,table)
print(maxValue)
display(recordsDf)

schemaName,tableName,watermarkType,timestampWatermark,integerWatermark,bigIntWatermark,stringWatermark
lakehouse,employees,bigint,,,1724643436000,


bigint
Row(max(modificationTime)=1724643522000)


EmPId,FirstName,LastName,Address1,CreatedOn,ModifiedOn
1,Leonardo,da Vinci,123 First St.,12/31/2024,12/31/2024
2,Napoleon,Bonaparte,234 Second St.,12/31/2024,12/31/2024
3,Charles,Darwin,345 Third St.,12/31/2024,12/31/2024
4,Albert,Einstein,456 Fourth St.,12/31/2024,10/31/2025
5,Thomas,Jefferson,678 Fifth St.,12/31/2024,12/31/2024


### Incremental Ingestion

In [0]:
def IncrementalUpdate(path,fileformat,fileWatermarkColumn,schema,table,tablePath,tableComment = "bronze table " + table):
    recordsDf,maxValue = GetSourceDataFrame(path,fileformat,fileWatermarkColumn,schema,table)
    fullTableName = '`'+schema+"`.`Bronze_" + table+'`'
    print(f"{fullTableName} will be updated with {recordsDf.count()} files")
    targetTable = None
    try:
        targetTable = DeltaTable.forName(spark,tableOrViewName=fullTableName)
    except:
        print("Exceptions getting table-it probably doesnt exist")
    if targetTable is not None:
        print('Exists')
        recordsDf.write.mode('append').format("delta").saveAsTable(fullTableName)
    else:
        print("Table doesnt exist yet")
        recordsDf.write.mode('overwrite').option("path",tablePath).option('mergeSchema',True).option('comments',tableComment).format("delta").saveAsTable(fullTableName)
        EnsureCDFisOn(schema,"`Bronze_" + table+'`')
    watermarkDf = UpdatewatermarkValue(startingWatermarkDF,maxValue[0])
    updateResults = UpsertWaterMark(watermarkDf)
    display(updateResults) 
  

In [0]:
sourcePath = filepath
tablePath = 'dbfs:/FileStore/tables/ingestion/bronze/employee'
watermarkColumn = 'modificationTime'
fileformat = 'csv'
schema = 'lakehouse'
table = 'employees'
IncrementalUpdate(sourcePath,fileformat,watermarkColumn,schema,table,tablePath)

schemaName,tableName,watermarkType,timestampWatermark,integerWatermark,bigIntWatermark,stringWatermark
lakehouse,employees,bigint,,,0,


bigint
`lakehouse`.`Bronze_employees` will be updated with 5 files
Exceptions getting table-it probably doesnt exist
Table doesnt exist yet
SHOW TBLPROPERTIES lakehouse.`Bronze_employees`
IsCDFOn: No CDF Table properties found lakehouse.`Bronze_employees`
TurnCDFOn: Turned CDF on for lakehouse.`Bronze_employees`
lakehouse `Bronze_employees` Turned CDF On
1724643436000
watermarkTable lakehouse.watermark


In [0]:
%sql
select * from lakehouse.watermark

schemaName,tableName,watermarkType,timestampWatermark,integerWatermark,bigIntWatermark,stringWatermark
lakehouse,employees,bigint,,,1724643436000,


### Added Update File

In [0]:
sourcePath = filepath
tablePath = 'dbfs:/FileStore/tables/ingestion/bronze/employee'
watermarkColumn = 'modificationTime'
fileformat = 'csv'
schema = 'lakehouse'
table = 'employees'
IncrementalUpdate(sourcePath,fileformat,watermarkColumn,schema,table,tablePath)

schemaName,tableName,watermarkType,timestampWatermark,integerWatermark,bigIntWatermark,stringWatermark
lakehouse,employees,bigint,,,1724643436000,


bigint
`lakehouse`.`Bronze_employees` will be updated with 5 files
Exists
1724643522000
watermarkTable lakehouse.watermark


In [0]:
%sql
select * from lakehouse.watermark

schemaName,tableName,watermarkType,timestampWatermark,integerWatermark,bigIntWatermark,stringWatermark
lakehouse,employees,bigint,,,1724643522000,


In [0]:
%sql
select * from lakehouse.bronze_employees

EmPId,FirstName,LastName,Address1,CreatedOn,ModifiedOn
1,Leonardo,daVinci,123 First St.,12/31/2024,12/31/2024
2,Napoleon,Bonaparte,234 Second St.,12/31/2024,12/31/2024
3,Charles,Darwin,345 Third St.,12/31/2024,12/31/2024
4,Albert,Einstein,456 Fourth St.,12/31/2024,12/31/2024
5,Thomas,Jefferson,678 Fifth St.,12/31/2024,12/31/2024
1,Leonardo,da Vinci,123 First St.,12/31/2024,12/31/2024
2,Napoleon,Bonaparte,234 Second St.,12/31/2024,12/31/2024
3,Charles,Darwin,345 Third St.,12/31/2024,12/31/2024
4,Albert,Einstein,456 Fourth St.,12/31/2024,10/31/2025
5,Thomas,Jefferson,678 Fifth St.,12/31/2024,12/31/2024


### Full Load into Bronze

In [0]:
def FullLoadToBronze(sourcePath,fileFormat, schema,table,tablePath,tableSchema="",tableComment =None):
    if tableComment is None:
        tableComment = "Bronze version of " + table
    df = spark.read.option("recursiveFileLookup","true").option('header','true').load(sourcePath,format=fileFormat)
    fullTableName = '`'+schema+"`.`Bronze_" + table+'`'
    df.write.mode("overwrite").option('overwriteSchema',"true").option("path",tablePath).option("comments",tableComment).saveAsTable(fullTableName)

In [0]:
%sql
SHOW TABLES IN lakehouse

database,tableName,isTemporary
lakehouse,bronze_employees,False
lakehouse,bronze_fullload_employees,False
lakehouse,watermark,False
,_sqldf,True
,tempwatermarkrow,True


In [0]:
sourcePath = filepath
tablePath = 'dbfs:/FileStore/tables/ingestion/bronze/employee'
watermarkColumn = 'modificationTime'
fileformat = 'csv'
schema = 'lakehouse'
table = 'fullload_employees'
FullLoadToBronze(sourcePath,fileformat, schema,table,tablePath)
display(spark.sql('select * from lakehouse.bronze_fullload_employees'))

EmPId,FirstName,LastName,Address1,CreatedOn,ModifiedOn
1,Leonardo,daVinci,123 First St.,12/31/2024,12/31/2024
2,Napoleon,Bonaparte,234 Second St.,12/31/2024,12/31/2024
3,Charles,Darwin,345 Third St.,12/31/2024,12/31/2024
4,Albert,Einstein,456 Fourth St.,12/31/2024,12/31/2024
5,Thomas,Jefferson,678 Fifth St.,12/31/2024,12/31/2024
1,Leonardo,da Vinci,123 First St.,12/31/2024,12/31/2024
2,Napoleon,Bonaparte,234 Second St.,12/31/2024,12/31/2024
3,Charles,Darwin,345 Third St.,12/31/2024,12/31/2024
4,Albert,Einstein,456 Fourth St.,12/31/2024,10/31/2025
5,Thomas,Jefferson,678 Fifth St.,12/31/2024,12/31/2024


## Incremental Loading files using Autoloader

In [0]:
def createTableFromFile(cloudFilesFormat,sourcePath,filePath,schema,table,schemaHints=None,readOptions=None):
    if not table.startswith("bronze_"):
        table = "bronze_" + table
    schemaLocation = filePath.replace('/bronze/','/schemas_bronze/')
    checkpointLocation = filePath.replace('/bronze/','/checkpoints_bronze/')
    dbutils.fs.mkdirs(schemaLocation)
    dbutils.fs.mkdirs(checkpointLocation)

    #setup input stream
    dataStreamReader = spark.readStream.format('cloudFiles').option("mergeSchema",'true').option('cloudFiles.inferColumnTypes','true')
    dataStreamReader = dataStreamReader.option('cloudFiles.format',cloudFilesFormat).option('cloudFiles.schemaLocation',schemaLocation)
    if schemaHints is not None:
        dataStreamReader = dataStreamReader.option('cloudFiles.schemaHints',schemaHints)
    else:
        dataStreamReader = dataStreamReader.option('delta.columnMappingMode.mode','name')
    
    if readOptions is not None:
        dataStreamReader = dataStreamReader.option(**readOptions)
    
    df = dataStreamReader.load(sourcePath)
    #Read Stream and create dataframe
    result = (df.writeStream.trigger(once=True).format('delta').outputMode('append').option('mergeSchema','true').option('checkpointLocation',checkpointLocation).option('path',filePath).toTable(schema+"."+table))
    result.awaitTermination()
    EnsureCDFisOn(schema,table)
    return result   

In [0]:
sourcePath = filepath
bronzePath = 'dbfs:/FileStore/tables/ingestion/autoloader/bronze/employees'
result = createTableFromFile('csv',sourcePath,bronzePath,'lakehouse','employeeautoloader')


SHOW TBLPROPERTIES lakehouse.bronze_employeeautoloader
IsCDFOn: No CDF Table properties found lakehouse.bronze_employeeautoloader
TurnCDFOn: Turned CDF on for lakehouse.bronze_employeeautoloader
lakehouse bronze_employeeautoloader Turned CDF On


In [0]:
%sql
show tables in lakehouse

database,tableName,isTemporary
lakehouse,bronze_employeeautoloader,False
lakehouse,bronze_employees,False
lakehouse,bronze_fullload_employees,False
lakehouse,watermark,False
,_sqldf,True
,tempwatermarkrow,True


In [0]:
display(spark.sql('select * from lakehouse.bronze_employeeautoloader'))

EmPId,FirstName,LastName,Address1,CreatedOn,ModifiedOn,_rescued_data
1,Leonardo,da Vinci,123 First St.,2024-12-31,2024-12-31,
2,Napoleon,Bonaparte,234 Second St.,2024-12-31,2024-12-31,
3,Charles,Darwin,345 Third St.,2024-12-31,2024-12-31,
4,Albert,Einstein,456 Fourth St.,2024-12-31,2025-10-31,
5,Thomas,Jefferson,678 Fifth St.,2024-12-31,2024-12-31,
1,Leonardo,daVinci,123 First St.,2024-12-31,2024-12-31,
2,Napoleon,Bonaparte,234 Second St.,2024-12-31,2024-12-31,
3,Charles,Darwin,345 Third St.,2024-12-31,2024-12-31,
4,Albert,Einstein,456 Fourth St.,2024-12-31,2024-12-31,
5,Thomas,Jefferson,678 Fifth St.,2024-12-31,2024-12-31,


In [0]:
%sql
DESCRIBE lakehouse.bronze_employeeautoloader

col_name,data_type,comment
EmPId,int,
FirstName,string,
LastName,string,
Address1,string,
CreatedOn,date,
ModifiedOn,date,
_rescued_data,string,


## Loading Files via Delta Live Tables

### DLT SQL API

In [0]:
%sql
-- This cannot be run interactively; need to create pipline
CREATE STREAMING LIVE TABLE Bronze_Employee_DLT
COMMENT "Famous Brands Employee Table"
TBLPROPERTIES ("quality" = "bronze") AS
SELECT
  *
FROM
  cloud_files(
    'dbfs:/FileStore/tables/ingestion/',
    'csv',
    map(
      'multiline',
      'false',
      'recursiveFileLookup',
      'true',
      'schemaEvolutionMode',
      'rescue',
      'autoMerge',
      'true',
      'cloudFiles.inferColumnTypes',
      'true'
    )
  )

message
"This Delta Live Tables query is syntactically valid, but you must create a pipeline in order to define and populate your table."


### DLT Python API for Incremental Ingestion

In [0]:
import dlt
@dlt.table(comment="Famous Brands Employee Table using Python DLT API",
           table_properties={'delta.enableChangeDataFeed':'true','quality':'bronze'})
def employee_dlt_python():
    return spark.readStream.format('cloudFiles').option('multiline',"false").option("cloudFiles.format",'json').load('dbfs:/FileStore/tables/ingestion/')


### Full Ingestion using DLT SQL API

In [0]:
%sql
CREATE OR REFRESH LIVE TABLE bronze_employees_fullload
-- CREATE OR REPLACE TABLE lakehouse.bronze_employees_fullload
as SELECT * FROM csv.`dbfs:/FileStore/tables/ingestion/raw/employees.csv`

num_affected_rows,num_inserted_rows


In [0]:
%sql
select * from lakehouse.bronze_employees_fullload

_c0,_c1,_c2,_c3,_c4,_c5
EmPId,FirstName,LastName,Address1,CreatedOn,ModifiedOn
1,Leonardo,daVinci,123 First St.,12/31/2024,12/31/2024
2,Napoleon,Bonaparte,234 Second St.,12/31/2024,12/31/2024
3,Charles,Darwin,345 Third St.,12/31/2024,12/31/2024
4,Albert,Einstein,456 Fourth St.,12/31/2024,12/31/2024
5,Thomas,Jefferson,678 Fifth St.,12/31/2024,12/31/2024


### Full Load using Python DLT API

In [0]:
import dlt
@dlt.table(comment="Famous Brands Employee Table using Python DLT API",
           table_properties={'delta.enableChangeDataFeed':'true','quality':'bronze'})
def employee_dlt_python():
    return spark.read.format('csv').load('dbfs:/FileStore/tables/ingestion/')


## Streaming Data

### Stream Processing with DLT Python API