## Set up Auto Loader with Structured Streaming to ingest new data into the bronze Delta table

In [0]:
from pyspark.sql.functions import current_timestamp
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, DoubleType

In [0]:
# 1. Explicit schema for the incoming daily-activity files.
# Each entry pairs a column name with its Spark type class; the order here is
# the column order of the schema. Id and ActivityDate are kept as strings.
_activity_columns = [
    ("Id", StringType),
    ("ActivityDate", StringType),
    ("TotalSteps", IntegerType),
    ("TotalDistance", DoubleType),
    ("TrackerDistance", DoubleType),
    ("LoggedActivitiesDistance", DoubleType),
    ("VeryActiveDistance", DoubleType),
    ("ModeratelyActiveDistance", DoubleType),
    ("LightActiveDistance", DoubleType),
    ("SedentaryActiveDistance", DoubleType),
    ("VeryActiveMinutes", IntegerType),
    ("FairlyActiveMinutes", IntegerType),
    ("LightlyActiveMinutes", IntegerType),
    ("SedentaryMinutes", IntegerType),
    ("Calories", IntegerType),
]

# Every field is declared nullable (third StructField argument is True).
activity_schema = StructType(
    [
        StructField(column_name, column_type(), True)
        for column_name, column_type in _activity_columns
    ]
)

In [0]:
# 2./3. Storage locations. Both paths live under the same bucket prefix,
# so it is spelled out once and reused.
_workspace_root = (
    "s3://databricks-745bwkyiddeq9fthttjahg-cloud-storage-bucket"
    "/ohio-prod/3903799048317088"
)

# Source path (landing zone) that the stream reads from.
source_path = f"{_workspace_root}/landing/bronze/daily_activity_stream/"

# Destination path of the bronze table.
bronze_path = f"{_workspace_root}/mnt/bronze/daily_activity_autoloaded/"

In [0]:
# 4. Read the landing zone as a stream through the cloudFiles source.
# The reader is configured for CSV input with the manually defined schema and
# with column-type inference switched off.
# NOTE(review): no "header" option is set here - confirm whether the landing
# CSV files carry a header row, and set it explicitly if they do.
autoloader_reader = (
    spark.readStream
    .format("cloudFiles")
    .option("cloudFiles.format", "csv")
    .option("cloudFiles.inferColumnTypes", "false")
    .schema(activity_schema)
)

raw_stream = autoloader_reader.load(source_path)

# Add an ingestion_timestamp column populated by current_timestamp().
df_stream = raw_stream.withColumn("ingestion_timestamp", current_timestamp())

In [0]:
# 5. Write Stream Out to Delta Table (Bronze)
# bronze_path is defined with a trailing "/", so appending "/_checkpoint"
# directly would put a "//" into the checkpoint URI. Strip any trailing slash
# first so the location is well-formed whether or not bronze_path has one.
checkpoint_path = bronze_path.rstrip("/") + "/_checkpoint"

# Append the streamed rows to the Delta table at bronze_path.
query = (df_stream.writeStream
    .format("delta")
    .outputMode("append")
    .option("checkpointLocation", checkpoint_path)
    .start(bronze_path))

print("Autoloader streaming started!")

In [0]:
%sql
-- Read every column of the Delta table stored at the bronze path, addressed
-- directly by its storage location rather than by a catalog table name.
SELECT * FROM delta.`s3://databricks-745bwkyiddeq9fthttjahg-cloud-storage-bucket/ohio-prod/3903799048317088/mnt/bronze/daily_activity_autoloaded/`