In [43]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import *

In [44]:
# Print the installed pyspark version by shelling out through the %system magic,
# which returns the command's output as a list of lines.
# NOTE(review): this runs whichever `python` is first on PATH, which may not be the
# kernel's own interpreter — confirm it reports the environment the notebook uses.
%system python -c "import pyspark; print(pyspark.__version__)"


['3.5.0']

In [45]:
# Local Spark session (all cores) with the Delta Lake SQL extension and catalog
# registered, loading Delta from a manually downloaded jar.
# NOTE(review): the jar path looks like a placeholder — confirm it points at the real file.
_delta_settings = (
    ("spark.jars", r"C:\path\to\delta-spark-3.1.0.jar"),  # downloaded manually
    ("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension"),
    ("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog"),
)

_session_builder = SparkSession.builder.appName("BronzeAutoloader").master("local[*]")
for _conf_key, _conf_value in _delta_settings:
    # Apply the settings in the order listed above.
    _session_builder = _session_builder.config(_conf_key, _conf_value)

spark = _session_builder.getOrCreate()


In [46]:
# Names of the source datasets handled by this notebook.
sources = ["customers", "orders", "products", "deliveries"]

Schemas

In [47]:
# Explicit column definitions for every source, written as Spark DDL strings and
# keyed by source name. Insertion order is customers, products, orders, deliveries.
my_schemas = dict(
    customers="""
        customer_id STRING,
        name STRING,
        email STRING,
        phone STRING,
        address STRING,
        last_updated TIMESTAMP
    """,
    products="""
        product_id STRING,
        product_name STRING,
        category STRING,
        price DOUBLE,
        stock_quantity INT,
        last_updated TIMESTAMP
    """,
    orders="""
        order_id STRING,
        customer_id STRING,
        product_id STRING,
        quantity INT,
        total_amount DOUBLE,
        order_date TIMESTAMP,
        last_updated TIMESTAMP
    """,
    deliveries="""
        delivery_id STRING,
        order_id STRING,
        delivery_date TIMESTAMP,
        status STRING,
        delivery_partner STRING,
        last_updated TIMESTAMP
    """,
)

In [48]:
# Root folder of the project; raw input and bronze output both live underneath it.
base_path = r"C:\Users\User\Desktop\E-Commerce Data Lakaehouse with AI-Powered Self-Healing Pipelines"

# Build a streaming CSV reader for each source using its declared schema.
# Each pass rebinds the same names, so once the loop finishes `data`,
# `bronze_data_path` and `checkpoint_path` refer to the last source only.
for src, my_schema in my_schemas.items():
    raw_path = "\\".join((base_path, "raw_data", src))
    bronze_data_path = "\\".join((base_path, "bronze_layer", src, "data"))
    checkpoint_path = "\\".join((base_path, "bronze_layer", src, "checkpoint"))

    data = (
        spark.readStream
        .format("csv")
        .option("header", "true")
        .schema(my_schema)
        .load(raw_path)
        # Stamp each row with the time it was ingested.
        .withColumn("ingestion_timestamp", current_timestamp())
    )
    

    

Write the data to the Bronze layer

In [None]:
# Write every source to its own bronze Delta location.
#
# The read loop in the previous cell leaves only the last source bound to
# `data` / `bronze_data_path` / `checkpoint_path`, so writing from those names
# would ingest a single source. This cell therefore rebuilds the reader per
# source and starts one run-once query for each of them.
bronze_queries = []
for src, my_schema in my_schemas.items():
    raw_path = f"{base_path}\\raw_data\\{src}"
    bronze_data_path = f"{base_path}\\bronze_layer\\{src}\\data"
    checkpoint_path = f"{base_path}\\bronze_layer\\{src}\\checkpoint"

    # Streaming CSV reader with the explicit schema, stamped with ingestion time.
    data = (
        spark.readStream.format("csv")
        .option("header", "true")
        .schema(my_schema)
        .load(raw_path)
        .withColumn("ingestion_timestamp", current_timestamp())
    )

    # Each source gets its own checkpoint so the queries track progress independently.
    # NOTE(review): newer Spark releases recommend trigger(availableNow=True) over
    # once=True — confirm before switching, since batching behaviour differs.
    bronze_queries.append(
        data.writeStream.format("delta")
        .option("checkpointLocation", checkpoint_path)
        .option("path", bronze_data_path)
        .trigger(once=True)
        .start()
    )

# Wait for every query to finish. The original called `spark.stream`, which is
# not a SparkSession attribute (the manager is `spark.streams`), and
# awaitAnyTermination() would return as soon as the first query ended.
for query in bronze_queries:
    query.awaitTermination()