In [2]:
# autoloader.py
from pyspark.sql import SparkSession

# Start from a local-mode builder; the Delta-specific settings are layered on below.
session_builder = SparkSession.builder.appName("BronzeAutoloader").master("local[*]")

# Fetch the Delta Lake artifact through spark.jars.packages.
session_builder = session_builder.config(
    "spark.jars.packages", "io.delta:delta-spark_2.12:3.1.0"
)

# Plug Delta's SQL extension and catalog implementation into the session.
session_builder = session_builder.config(
    "spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension"
)
session_builder = session_builder.config(
    "spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog"
)

# Reuses an already-running session if there is one, otherwise creates it.
spark = session_builder.getOrCreate()

In [3]:
import os
import shutil
from delta.tables import DeltaTable
from pyspark.sql.functions import col, current_timestamp, lit

# === Define Paths Explicitly ===
# Every gold table lives under one root directory. The root used to be pasted into
# each path; it is now defined once so the tables cannot drift apart, and it can be
# overridden with the ECOM_GOLD_DATA_DIR environment variable so the notebook is not
# tied to a single workstation. With the variable unset the paths are unchanged.
GOLD_DATA_DIR = os.environ.get(
    "ECOM_GOLD_DATA_DIR",
    r"C:/Users/User/Desktop/E-Commerce Data Lakaehouse with AI-Powered Self-Healing Pipelines/Gold_layer/Gold_data",
).rstrip("/\\")  # tolerate a trailing separator in the override

# Joined with "/" on purpose (not os.path.join) so the strings keep the same
# forward-slash form on every platform.
dim_orders_path = f"{GOLD_DATA_DIR}/dim_orders/data"
dim_customer_path = f"{GOLD_DATA_DIR}/dim_customer/data"
dim_products_path = f"{GOLD_DATA_DIR}/dim_products/data"
dim_deliveries_path = f"{GOLD_DATA_DIR}/dim_deliveries/data"
fact_sales_path = f"{GOLD_DATA_DIR}/fact_sales/data"

# Imported here because only this cell needs it; the recovery path below is limited
# to analysis-time failures (e.g. schema mismatches) instead of every exception.
from pyspark.sql.utils import AnalysisException


def _write_and_register_fact_sales(fact_df, path):
    """Overwrite the Delta table at `path` with `fact_df` and register it as `fact_sales`.

    Args:
        fact_df: DataFrame holding the full fact_sales contents to persist.
        path: Filesystem location of the Delta table.

    The write uses mode("overwrite") with overwriteSchema=true, so whatever schema
    `fact_df` has becomes the table schema.
    """
    fact_df.write.format("delta").mode("overwrite").option("overwriteSchema", "true").save(path)

    # Register the location under the name `fact_sales` in the session catalog.
    # NOTE(review): the session is built without enableHiveSupport(), so this is
    # presumably not a persistent Hive metastore entry - confirm.
    spark_sql_path = path.replace("\\", "/")
    spark.sql(f"""
        CREATE TABLE IF NOT EXISTS fact_sales
        USING DELTA
        LOCATION '{spark_sql_path}'
    """)


# === Load Dimension Tables ===
# Loading and building happen OUTSIDE the recovery `try` below: a failure to read a
# dimension must never reach the handler that deletes the existing fact table.
print("Loading dimensions...")
dim_orders = spark.read.format("delta").load(dim_orders_path)
dim_customer = spark.read.format("delta").load(dim_customer_path)
dim_products = spark.read.format("delta").load(dim_products_path)
dim_deliveries = spark.read.format("delta").load(dim_deliveries_path)

# === Build Fact Table ===
print("Building new_fact_sales...")
new_fact_sales = (
    dim_orders.alias("o")
    .join(dim_customer.alias("c"), col("o.customer_id") == col("c.customer_id"), "left")
    .join(dim_products.alias("p"), col("o.product_id") == col("p.product_id"), "left")
    .join(dim_deliveries.alias("d"), col("o.order_id") == col("d.order_id"), "left")
    .select(
        col("c.customer_sk").alias("customer_sk"),
        col("p.product_sk").alias("product_sk"),
        col("o.customer_id"),
        col("o.order_id"),
        col("o.transaction_id"),
        col("d.delivery_id"),
        col("o.quantity"),
        col("o.total_amount"),
        col("p.stock_quantity"),
        col("c.signup_date"),
        col("o.order_date"),
        col("d.delivery_date"),
        col("c.full_name"),
        col("p.name").alias("product_name"),
        col("p.category"),
        col("o.payment_method"),
        col("o.order_status"),
        col("d.delivery_status"),
        col("o.last_updated"),
        col("o.ingestion_timestamp"),
    )
    # SCD2 bookkeeping: every freshly built row starts out as the open, current version.
    .withColumn("start_date", current_timestamp())
    .withColumn("end_date", lit(None).cast("timestamp"))
    .withColumn("is_current", lit(True))
    .withColumn("load_timestamp", current_timestamp())
)

# Columns whose change closes the current version of a row. The comparison is
# null-safe (`<=>`): a plain `<>` evaluates to NULL when either side is NULL, so a
# delivery_status going from NULL (no delivery row yet) to a value would be missed.
tracked_columns = ["total_amount", "quantity", "order_status", "delivery_status"]
change_condition = " OR ".join(
    f"NOT (t.{name} <=> s.{name})" for name in tracked_columns
)

# === Merge / Create fact_sales ===
try:
    if not DeltaTable.isDeltaTable(spark, fact_sales_path):
        print("Creating fact_sales table...")
        _write_and_register_fact_sales(new_fact_sales, fact_sales_path)
        print("fact_sales table created.")
    else:
        print("Merging new fact rows into fact_sales...")
        fact_sales = DeltaTable.forPath(spark, fact_sales_path)

        # Incoming rows whose current version in the table differs. Each of them needs
        # TWO actions in one merge: expire the old version and insert the new one.
        current_rows = fact_sales.toDF().where("is_current = true").alias("t")
        changed_rows = (
            new_fact_sales.alias("s")
            .join(
                current_rows,
                (col("s.order_id") == col("t.order_id"))
                & (col("s.transaction_id") == col("t.transaction_id")),
                "inner",
            )
            .where(change_condition)
            .select("s.*")
        )

        source_schema = new_fact_sales.schema
        # Copies of the changed rows with a NULL merge key: they can never match, so
        # they fall through to the insert clause and become the new current version.
        new_versions = (
            changed_rows
            .withColumn("merge_order_id", lit(None).cast(source_schema["order_id"].dataType))
            .withColumn("merge_transaction_id", lit(None).cast(source_schema["transaction_id"].dataType))
        )
        # All incoming rows with their real key: matches expire changed rows,
        # non-matches insert brand-new facts, unchanged matches are left alone.
        keyed_rows = (
            new_fact_sales
            .withColumn("merge_order_id", col("order_id"))
            .withColumn("merge_transaction_id", col("transaction_id"))
        )
        staged_rows = new_versions.unionByName(keyed_rows)

        (fact_sales.alias("t")
         .merge(
             staged_rows.alias("s"),
             "t.order_id = s.merge_order_id AND "
             "t.transaction_id = s.merge_transaction_id AND t.is_current = true"
         )
         .whenMatchedUpdate(
             condition=change_condition,
             set={
                 "end_date": "current_timestamp()",
                 "is_current": "false"
             }
         )
         # Explicit column map so the helper merge_* columns are not written to the table.
         .whenNotMatchedInsert(
             values={name: f"s.{name}" for name in new_fact_sales.columns}
         )
         .execute()
        )
        print("fact_sales merged successfully.")

except AnalysisException as e:
    print(f"Error while building fact_sales: {e}")
    # Auto-fix if schema mismatch. WARNING: this discards the existing table,
    # including every expired (historical) row version, and rebuilds it from the
    # current dimensions only.
    if os.path.exists(fact_sales_path):
        print("Removing old fact_sales and retrying...")
        shutil.rmtree(fact_sales_path)
        _write_and_register_fact_sales(new_fact_sales, fact_sales_path)
        print("fact_sales recreated with fresh schema.")
    else:
        raise


Loading dimensions...
Building new_fact_sales...
Creating fact_sales table...
fact_sales table created.


In [4]:
# Read the fact table from `fact_sales_path` (the location the build cell writes to)
# instead of a second hard-coded copy of the path, so the two cannot drift apart.
df = spark.read.format("delta").load(fact_sales_path)

# NOTE(review): this temp view has the same name as the `fact_sales` table registered
# by the build cell, so unqualified `fact_sales` in SQL presumably resolves to the view
# for the rest of the session - confirm that is intended.
df.createOrReplaceTempView("fact_sales")

spark.sql("SELECT * FROM fact_sales").show()

+-----------+-----------+-------------+------------+----------------+--------------------+--------+------------+--------------+-------------------+-------------------+-------------------+------------------+--------------------+---------------+--------------+------------+---------------+-------------------+--------------------+--------------------+--------+----------+--------------------+
|customer_sk| product_sk|  customer_id|    order_id|  transaction_id|         delivery_id|quantity|total_amount|stock_quantity|        signup_date|         order_date|      delivery_date|         full_name|        product_name|       category|payment_method|order_status|delivery_status|       last_updated| ingestion_timestamp|          start_date|end_date|is_current|      load_timestamp|
+-----------+-----------+-------------+------------+----------------+--------------------+--------+------------+--------------+-------------------+-------------------+-------------------+------------------+------------

In [5]:
# Open the Delta table at the fact path and expose its contents as a DataFrame.
fact_sales = DeltaTable.forPath(spark, fact_sales_path)
fact_sales_df = fact_sales.toDF()

# count() covers every row in the table; no is_current filter is applied here.
fact_count = fact_sales_df.count()
print(f"Total records in fact table: {fact_count}")

Total records in fact table: 83532


In [6]:
from pyspark.sql.functions import col, when, sum

# Refresh the DataFrame handle from the Delta table before profiling it.
fact_sales_df = fact_sales.toDF()

# One aggregate per column: flag NULLs as 1, everything else as 0, then add them up.
# `sum` here is pyspark.sql.functions.sum from the import above, not the Python builtin.
null_count_exprs = []
for column_name in fact_sales_df.columns:
    null_flag = when(col(column_name).isNull(), 1).otherwise(0)
    null_count_exprs.append(sum(null_flag).alias(f"{column_name}_nulls"))

# Single-row result: "<column>_nulls" -> number of NULL values in that column.
null_counts = fact_sales_df.select(null_count_exprs)

null_counts.show(truncate=False)


+-----------------+----------------+-----------------+--------------+--------------------+-----------------+--------------+------------------+--------------------+-----------------+----------------+-------------------+---------------+------------------+--------------+--------------------+------------------+---------------------+------------------+-------------------------+----------------+--------------+----------------+--------------------+
|customer_sk_nulls|product_sk_nulls|customer_id_nulls|order_id_nulls|transaction_id_nulls|delivery_id_nulls|quantity_nulls|total_amount_nulls|stock_quantity_nulls|signup_date_nulls|order_date_nulls|delivery_date_nulls|full_name_nulls|product_name_nulls|category_nulls|payment_method_nulls|order_status_nulls|delivery_status_nulls|last_updated_nulls|ingestion_timestamp_nulls|start_date_nulls|end_date_nulls|is_current_nulls|load_timestamp_nulls|
+-----------------+----------------+-----------------+--------------+--------------------+-----------------+