In [0]:
# create autoloader input folder in volume
dbutils.fs.mkdirs("/Volumes/dev/bronze/landing/autoloader_input/2010/12/01")
dbutils.fs.mkdirs("/Volumes/dev/bronze/landing/autoloader_input/2010/12/02")
dbutils.fs.mkdirs("/Volumes/dev/bronze/landing/autoloader_input/2010/12/03")
dbutils.fs.mkdirs("/Volumes/dev/bronze/landing/autoloader_input/2010/12/04")
dbutils.fs.mkdirs("/Volumes/dev/bronze/landing/autoloader_input/2010/12/05")
dbutils.fs.mkdirs("/Volumes/dev/bronze/landing/autoloader_input/2010/12/06")
dbutils.fs.mkdirs("/Volumes/dev/bronze/landing/autoloader_input/2010/12/07")


True

In [0]:
# create checkpoint location in volume
dbutils.fs.mkdirs("/Volumes/dev/bronze/landing/checkpoint/autoloader")

True

In [0]:
dbutils.fs.cp("/databricks-datasets/definitive-guide/data/retail-data/by-day/2010-12-01.csv", "/Volumes/dev/bronze/landing/autoloader_input/2010/12/01")
dbutils.fs.cp("/databricks-datasets/definitive-guide/data/retail-data/by-day/2010-12-02.csv", "/Volumes/dev/bronze/landing/autoloader_input/2010/12/02")
dbutils.fs.cp("/databricks-datasets/definitive-guide/data/retail-data/by-day/2010-12-03.csv", "/Volumes/dev/bronze/landing/autoloader_input/2010/12/03")


True

In [0]:
# Read Files using Autoloader with checkpoint
# and schema location "/Volumes/dev/bronze/landing/checkpoint/autoloader"
# File Detection Modes
# - Directory Listing (uses API calls to detect new files - Rocks DB)
# - File Notification (uses Notification and Queue Servies - requires elevated cloud permissions for setup)

df = (
    spark
    .readStream
    .format("cloudFiles")
    .option("cloudFiles.format", "csv")
    .option("pathGlobFilter", "*.csv")
    .option("header", "true")
    .option("cloudFiles.schemaLocation", "/Volumes/dev/bronze/landing/checkpoint/autoloader/1/")
    .option("cloudFiles.schemaHints", "Quantity int, UnitPrice double")
    .load("/Volumes/dev/bronze/landing/autoloader_input/*/")
  )

In [0]:
# write data to delta table - dev.bronze.invoice_al_1
from pyspark.sql.functions import col
(
  df
  .withColumn("__file", col("_metadata.file_name"))
  .writeStream
  .format("delta")
  .option("mergeSchema", "true")
  .option("checkpointLocation", "/Volumes/dev/bronze/landing/checkpoint/autoloader/1/")
  .outputMode("append")
  .trigger(availableNow=True)
  .toTable("dev.bronze.invoice_al_1")
)

<pyspark.sql.streaming.query.StreamingQuery at 0x7ff4b1fdf470>

In [0]:
%sql
select __file, count(*) from dev.bronze.invoice_al_1
group by all

__file,count(1)
2010-12-01.csv,3108
2010-12-03.csv,2202
2010-12-02.csv,2109
2010-12-06.csv,3878
2010-12-05.csv,2725


In [0]:
dbutils.fs.cp("/databricks-datasets/definitive-guide/data/retail-data/by-day/2010-12-05.csv", "/Volumes/dev/bronze/landing/autoloader_input/2010/12/05")

True

In [0]:
dbutils.fs.cp("/databricks-datasets/definitive-guide/data/retail-data/by-day/2010-12-06.csv", "/Volumes/dev/bronze/landing/autoloader_input/2010/12/06")

True

In [0]:
# add new columns - default mode, job fails to update schema location, rerun the job and should work fine
# cloudFiles.schemaEvolutionMode -> rescue

df = (
    spark
    .readStream
    .format("cloudFiles")
    .option("cloudFiles.format", "csv")
    .option("pathGlobFilter", "*.csv")
    .option("header", "true")
    .option("cloudFiles.schemaLocation", "/Volumes/dev/bronze/landing/checkpoint/autoloader/2/")
    .option("cloudFiles.schemaHints", "Quantity int, UnitPrice double")
    .option("cloudFiles.schemaEvolutionMode", "rescue")
    .load("/Volumes/dev/bronze/landing/autoloader_input/*/")
  )

In [0]:
from pyspark.sql.functions import col
(
  df
  .withColumn("__file", col("_metadata.file_name"))
  .writeStream
  .format("delta")
  .option("mergeSchema", "true")
  .option("checkpointLocation", "/Volumes/dev/bronze/landing/checkpoint/autoloader/2/")
  .outputMode("append")
  .trigger(availableNow=True)
  .toTable("dev.bronze.invoice_al_2")
)

<pyspark.sql.streaming.query.StreamingQuery at 0x7ff4955efc80>

In [0]:
%sql
select __file, count(*) from dev.bronze.invoice_al_2
group by __file;

--DROP  TABLE dev.bronze.invoice_al_2

__file,count(1)
2010-12-05.csv,2725
2010-12-01.csv,3108
2010-12-03.csv,2202
2010-12-02.csv,2109
2010-12-06.csv,3878


In [0]:
%sql
select * from  dev.bronze.invoice_al_2 where `_rescued_data` is not null

InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country,_rescued_data,__file
537226,22811,SET OF 6 T-LIGHTS CACTI,6,6/12/2010 8:34,2.95,15987.0,United Kingdom,"{""State"":""BA"",""_file_path"":""/Volumes/dev/bronze/landing/autoloader_input/2010/12/06/2010-12-06.csv""}",2010-12-06.csv
537226,21713,CITRONELLA CANDLE FLOWERPOT,8,6/12/2010 8:34,2.1,15987.0,United Kingdom,"{""State"":""GA"",""_file_path"":""/Volumes/dev/bronze/landing/autoloader_input/2010/12/06/2010-12-06.csv""}",2010-12-06.csv
537226,22927,GREEN GIANT GARDEN THERMOMETER,2,6/12/2010 8:34,5.95,15987.0,United Kingdom,"{""State"":""GA"",""_file_path"":""/Volumes/dev/bronze/landing/autoloader_input/2010/12/06/2010-12-06.csv""}",2010-12-06.csv
537226,20802,SMALL GLASS SUNDAE DISH CLEAR,6,6/12/2010 8:34,1.65,15987.0,United Kingdom,"{""State"":""GA"",""_file_path"":""/Volumes/dev/bronze/landing/autoloader_input/2010/12/06/2010-12-06.csv""}",2010-12-06.csv
537226,22052,VINTAGE CARAVAN GIFT WRAP,25,6/12/2010 8:34,0.42,15987.0,United Kingdom,"{""State"":""GA"",""_file_path"":""/Volumes/dev/bronze/landing/autoloader_input/2010/12/06/2010-12-06.csv""}",2010-12-06.csv
537226,22705,WRAP GREEN PEARS,25,6/12/2010 8:34,0.42,15987.0,United Kingdom,"{""State"":""GA"",""_file_path"":""/Volumes/dev/bronze/landing/autoloader_input/2010/12/06/2010-12-06.csv""}",2010-12-06.csv
537226,20781,GOLD EAR MUFF HEADPHONES,2,6/12/2010 8:34,5.49,15987.0,United Kingdom,"{""State"":""GA"",""_file_path"":""/Volumes/dev/bronze/landing/autoloader_input/2010/12/06/2010-12-06.csv""}",2010-12-06.csv
537226,22310,IVORY KNITTED MUG COSY,6,6/12/2010 8:34,1.65,15987.0,United Kingdom,"{""State"":""GA"",""_file_path"":""/Volumes/dev/bronze/landing/autoloader_input/2010/12/06/2010-12-06.csv""}",2010-12-06.csv
537226,22389,PAPERWEIGHT SAVE THE PLANET,6,6/12/2010 8:34,2.55,15987.0,United Kingdom,"{""State"":""GA"",""_file_path"":""/Volumes/dev/bronze/landing/autoloader_input/2010/12/06/2010-12-06.csv""}",2010-12-06.csv
537227,22941,CHRISTMAS LIGHTS 10 REINDEER,2,6/12/2010 8:42,8.5,17677.0,United Kingdom,"{""State"":""GA"",""_file_path"":""/Volumes/dev/bronze/landing/autoloader_input/2010/12/06/2010-12-06.csv""}",2010-12-06.csv


In [0]:
# schemaEvolutionMode -> None -> ignores new columns and does not fail the job
# schemaEvolutionMode -> failOnNewColumns -> just fails the job
