In [0]:
# List the entries under the FileStore tables directory; the bare name on the
# last line makes the listing the cell's displayed result.
uploaded_tables = dbutils.fs.ls("/FileStore/tables")
uploaded_tables

Out[1]: [FileInfo(path='dbfs:/FileStore/tables/BigMart_Sales-1.csv', name='BigMart_Sales-1.csv', size=869537, modificationTime=1732766283000),
 FileInfo(path='dbfs:/FileStore/tables/BigMart_Sales-2.csv', name='BigMart_Sales-2.csv', size=869537, modificationTime=1738049024000),
 FileInfo(path='dbfs:/FileStore/tables/BigMart_Sales-3.csv', name='BigMart_Sales-3.csv', size=869537, modificationTime=1739937933000),
 FileInfo(path='dbfs:/FileStore/tables/BigMart_Sales.csv', name='BigMart_Sales.csv', size=869537, modificationTime=1732677345000),
 FileInfo(path='dbfs:/FileStore/tables/CSV/', name='CSV/', size=0, modificationTime=0),
 FileInfo(path='dbfs:/FileStore/tables/Calendar-1.csv', name='Calendar-1.csv', size=9952, modificationTime=1731775279000),
 FileInfo(path='dbfs:/FileStore/tables/Calendar.csv', name='Calendar.csv', size=9952, modificationTime=1731775248000),
 FileInfo(path='dbfs:/FileStore/tables/Products.csv', name='Products.csv', size=58122, modificationTime=1732765779000),
 FileI

In [0]:
import json
import random
import tempfile
from datetime import datetime, timedelta

# Generate dummy IoT data
# 50 readings, each attributed to one of five simulated devices. Timestamps start
# at the current wall-clock time and advance by 10 seconds per record.
base_time = datetime.now()
data = [
    {
        "device_id": f"device_{random.randint(1, 5)}",
        "temperature": round(random.uniform(20.0, 30.0), 2),
        "humidity": round(random.uniform(30.0, 50.0), 2),
        "timestamp": (base_time + timedelta(seconds=i * 10)).isoformat(),
    }
    for i in range(50)
]

# Write to DBFS as individual JSON files
dbutils.fs.mkdirs("/mnt/iot/bronze_input/")

# Stage each file in a private temporary directory rather than under fixed names
# directly in /tmp: the directory is removed when the `with` block exits, so no
# staging files are left on local disk and separate runs cannot overwrite each
# other's staging files.
with tempfile.TemporaryDirectory() as staging_dir:
    for idx, record in enumerate(data):
        file_name = f"iot_data_{idx}.json"
        local_path = f"{staging_dir}/{file_name}"
        with open(local_path, "w") as f:
            json.dump(record, f)
        # One JSON object per file, copied from the local filesystem into DBFS.
        dbutils.fs.cp(f"file:{local_path}", f"dbfs:/mnt/iot/bronze_input/{file_name}")


In [0]:
# List the JSON files that were copied into the bronze input directory; the bare
# name on the last line makes the listing the cell's displayed result.
bronze_input_files = dbutils.fs.ls("/mnt/iot/bronze_input/")
bronze_input_files

Out[3]: [FileInfo(path='dbfs:/mnt/iot/bronze_input/iot_data_0.json', name='iot_data_0.json', size=108, modificationTime=1742992764000),
 FileInfo(path='dbfs:/mnt/iot/bronze_input/iot_data_1.json', name='iot_data_1.json', size=109, modificationTime=1742992764000),
 FileInfo(path='dbfs:/mnt/iot/bronze_input/iot_data_10.json', name='iot_data_10.json', size=109, modificationTime=1742992766000),
 FileInfo(path='dbfs:/mnt/iot/bronze_input/iot_data_11.json', name='iot_data_11.json', size=108, modificationTime=1742992766000),
 FileInfo(path='dbfs:/mnt/iot/bronze_input/iot_data_12.json', name='iot_data_12.json', size=109, modificationTime=1742992766000),
 FileInfo(path='dbfs:/mnt/iot/bronze_input/iot_data_13.json', name='iot_data_13.json', size=109, modificationTime=1742992766000),
 FileInfo(path='dbfs:/mnt/iot/bronze_input/iot_data_14.json', name='iot_data_14.json', size=109, modificationTime=1742992766000),
 FileInfo(path='dbfs:/mnt/iot/bronze_input/iot_data_15.json', name='iot_data_15.json',

In [0]:
# Bronze source: a streaming reader using the "cloudFiles" format over the JSON
# files in /mnt/iot/bronze_input/, with its schema location set under
# /mnt/iot/schema/bronze/.
auto_loader = spark.readStream.format("cloudFiles")
auto_loader = auto_loader.option("cloudFiles.format", "json")
auto_loader = auto_loader.option("cloudFiles.schemaLocation", "/mnt/iot/schema/bronze/")
bronze_df = auto_loader.load("/mnt/iot/bronze_input/")

In [0]:
# Persist the bronze stream as a Delta table in append mode, checkpointing under
# /mnt/iot/checkpoints/bronze/.
bronze_writer = (
    bronze_df.writeStream
    .format("delta")
    .option("checkpointLocation", "/mnt/iot/checkpoints/bronze/")
    .outputMode("append")
)
# start() launches the query; its StreamingQuery handle is the cell's result.
bronze_writer.start("/mnt/iot/bronze/")

Out[6]: <pyspark.sql.streaming.query.StreamingQuery at 0x708467986880>

In [0]:
# `col` is imported in this cell because no earlier cell in the notebook imports
# it; without this the cell depends on the name already being in scope.
from pyspark.sql.functions import col

# Silver layer: stream the bronze Delta table and keep only the readings whose
# temperature is above 25.
silver_df = (
    spark.readStream.format("delta")
    .load("/mnt/iot/bronze/")
    # Compare on an explicit double. The bronze reader does not enable Auto Loader
    # column-type inference, so temperature may be stored as a string (TODO confirm
    # against the bronze table schema); casting avoids relying on implicit
    # string-to-number coercion and is a no-op when the column is already a double.
    .filter(col("temperature").cast("double") > 25)
)

In [0]:
# Persist the filtered silver stream as a Delta table in append mode,
# checkpointing under /mnt/iot/checkpoints/silver/.
silver_writer = (
    silver_df.writeStream
    .format("delta")
    .option("checkpointLocation", "/mnt/iot/checkpoints/silver/")
    .outputMode("append")
)
# start() launches the query; its StreamingQuery handle is the cell's result.
silver_writer.start("/mnt/iot/silver/")

Out[8]: <pyspark.sql.streaming.query.StreamingQuery at 0x70846686a1c0>

In [0]:
from pyspark.sql.functions import avg

# Gold layer: stream the silver Delta table and compute, per device_id, the
# average temperature and the average humidity.
silver_stream = spark.readStream.format("delta").load("/mnt/iot/silver/")

per_device = silver_stream.groupBy("device_id")
gold_df = per_device.agg(
    avg("temperature").alias("avg_temp"),
    avg("humidity").alias("avg_humidity"),
)

In [0]:

# Persist the per-device aggregates as a Delta table. The output mode is
# "complete", unlike the "append" mode used by the bronze and silver writers.
gold_writer = (
    gold_df.writeStream
    .format("delta")
    .option("checkpointLocation", "/mnt/iot/checkpoints/gold/")
    .outputMode("complete")
)
# start() launches the query; its StreamingQuery handle is the cell's result.
gold_writer.start("/mnt/iot/gold/")

Out[10]: <pyspark.sql.streaming.query.StreamingQuery at 0x70846686aa30>