## Gold Notebook

In [0]:
# Step 0: Clean up old widgets
# Best-effort reset of the three notebook parameters so that the next cell can
# recreate them with their default values.
for widget_name in ("silver_table", "gold_table", "processing_date"):
    try:
        dbutils.widgets.remove(widget_name)
    except Exception:
        # Most likely the widget simply does not exist yet (e.g. first run),
        # which is fine to ignore. Catching Exception instead of using a bare
        # `except:` keeps KeyboardInterrupt/SystemExit propagating, so the
        # cell can still be cancelled.
        pass


In [0]:
# Step 1: Create widgets
# Each entry is (widget name, default value); widgets are registered in this order.
for widget_name, default_value in (
    ("silver_table", "silver_events"),   # Silver table name
    ("gold_table", "gold_events"),       # Gold table name
    ("processing_date", ""),             # empty default; the caller passes the date
):
    dbutils.widgets.text(widget_name, default_value)

In [0]:
# Step 2: Read widget values
from datetime import datetime

from pyspark.sql import functions as F

# Strip surrounding whitespace so stray spaces in job parameters do not end up
# in table names or in the date used for filtering.
silver_table = dbutils.widgets.get("silver_table").strip()
gold_table = dbutils.widgets.get("gold_table").strip()
processing_date = dbutils.widgets.get("processing_date").strip()

if not processing_date:
    # No date supplied: fall back to today's date according to the local clock
    # of the Python process running this notebook.
    processing_date = datetime.today().strftime("%Y-%m-%d")
    print(f"⚠️ No processing_date provided. Using today: {processing_date}")
else:
    # Fail fast on a malformed date. Without this check a typo only surfaces
    # later as a "No Silver data found" error after scanning the Silver table.
    try:
        datetime.strptime(processing_date, "%Y-%m-%d")
    except ValueError as exc:
        raise ValueError(
            f"processing_date must be in YYYY-MM-DD format, got {processing_date!r}"
        ) from exc


print(f"🔹 Silver Table: {silver_table}")
print(f"🔹 Gold Table: {gold_table}")
print(f"🔹 Processing Date: {processing_date}")

⚠️ No processing_date provided. Using today: 2026-01-15
🔹 Silver Table: silver_events
🔹 Gold Table: gold_events
🔹 Processing Date: 2026-01-15


In [0]:
# Step 3: Read incremental Silver data for this date
from pyspark.sql import functions as F

df_silver = spark.table(silver_table)

# Keep only the rows whose ingestion_date equals the requested processing date.
df_incremental = df_silver.filter(F.col("ingestion_date") == processing_date)

# count() is a Spark action: every call launches its own job over the filtered
# data. Count once and reuse the number for both the check and the log line.
incremental_row_count = df_incremental.count()

# Safety check: ensure data exists
if incremental_row_count == 0:
    # RuntimeError is still caught by any `except Exception` handler.
    raise RuntimeError(f"❌ No Silver data found for processing_date = {processing_date}")

print(f"✅ Incremental Silver rows for {processing_date}: {incremental_row_count}")

✅ Incremental Silver rows for 2026-01-15: 67360187


In [0]:
# Step 4: Compute Gold aggregates
# Product-level metrics: one row per (product_id, category_id, category_code, brand).
#   views           - distinct users with at least one "view" event
#   purchases       - distinct users with at least one "purchase" event
#   revenue         - sum of price over "purchase" events (NULL when the group
#                     has no purchase events, because when() without
#                     otherwise() yields NULL for non-matching rows)
#   conversion_rate - purchases / views * 100, or 0 when there are no views
#   ingestion_date  - the processing date of this run
df_gold_product = df_incremental.groupBy("product_id", "category_id", "category_code", "brand") \
    .agg(
        F.countDistinct(F.when(F.col("event_type")=="view", F.col("user_id"))).alias("views"),
        F.countDistinct(F.when(F.col("event_type")=="purchase", F.col("user_id"))).alias("purchases"),
        F.sum(F.when(F.col("event_type")=="purchase", F.col("price"))).alias("revenue")
    ) \
    .withColumn("conversion_rate", 
                F.when(F.col("views")>0, F.col("purchases")/F.col("views")*100).otherwise(0)) \
    .withColumn("ingestion_date", F.lit(processing_date))
# ingestion_date is stamped here to match the category and daily frames.
# Without it, product rows appended by different runs cannot be told apart in
# the Gold table.

In [0]:
# 4b: Category-level metrics
# One row per (category_id, category_code) with distinct-user view/purchase
# counts, purchase revenue, a percentage conversion rate and the run's date.

# Per-row expressions: NULL unless the event is of the wanted type, so the
# aggregates below only see the matching events.
category_viewer = F.when(F.col("event_type") == "view", F.col("user_id"))
category_buyer = F.when(F.col("event_type") == "purchase", F.col("user_id"))
category_sale_price = F.when(F.col("event_type") == "purchase", F.col("price"))

category_totals = df_incremental.groupBy("category_id", "category_code").agg(
    F.countDistinct(category_viewer).alias("views"),
    F.countDistinct(category_buyer).alias("purchases"),
    F.sum(category_sale_price).alias("revenue"),
)

# Percentage of viewing users who purchased; 0 when nobody viewed.
category_conversion = F.when(
    F.col("views") > 0, F.col("purchases") / F.col("views") * 100
).otherwise(0)

df_gold_category = (
    category_totals
    .withColumn("conversion_rate", category_conversion)
    .withColumn("ingestion_date", F.lit(processing_date))
)

In [0]:

# 4c: Daily product-level metrics
# One row per (product_id, event_date); same measures as the other Gold
# frames, with a "daily_" prefix on every metric column.
daily_grouped = df_incremental.groupBy("product_id", "event_date")

daily_metrics = [
    F.countDistinct(
        F.when(F.col("event_type") == "view", F.col("user_id"))
    ).alias("daily_views"),
    F.countDistinct(
        F.when(F.col("event_type") == "purchase", F.col("user_id"))
    ).alias("daily_purchases"),
    F.sum(
        F.when(F.col("event_type") == "purchase", F.col("price"))
    ).alias("daily_revenue"),
]

df_gold_daily = daily_grouped.agg(*daily_metrics)

# Conversion rate in percent; 0 when the product had no viewers that day.
df_gold_daily = df_gold_daily.withColumn(
    "daily_conversion_rate",
    F.when(
        F.col("daily_views") > 0,
        F.col("daily_purchases") / F.col("daily_views") * 100,
    ).otherwise(0),
)

# Stamp every row with the processing date of this run.
df_gold_daily = df_gold_daily.withColumn("ingestion_date", F.lit(processing_date))

In [0]:
# Step 5: Write Gold tables (mergeSchema allows future schema changes)
def _append_gold_table(frame, table_suffix):
    """Append `frame` to the Delta table named `gold_table + table_suffix`."""
    writer = frame.write.format("delta").mode("append").option("mergeSchema", "true")
    writer.saveAsTable(gold_table + table_suffix)


# NOTE(review): mode("append") adds rows on every execution, so re-running the
# notebook for the same processing_date writes the same aggregates again.
_append_gold_table(df_gold_product, "_product")
_append_gold_table(df_gold_category, "_category")
_append_gold_table(df_gold_daily, "_daily")

print(f"✅ Gold tables updated for processing_date = {processing_date}")

✅ Gold tables updated for processing_date = 2026-01-15
