In [0]:
# 03_Gold_Aggregation
# Purpose: Business aggregates (Gold layer)

from pyspark.sql import functions as F

# Load the Silver events table; every Gold aggregate below is derived from it.
silver_df = spark.table("workspace.ecommerce.silver_events")

# Row count is printed purely as a sanity check before aggregating.
print("Silver count:", silver_df.count())

# -------------------------------
# Gold KPI 1: Distinct users per product
# Groups by product_id and counts the distinct user_id values in each group.
# -------------------------------
gold_users_per_product = (
    silver_df
    .groupBy("product_id")
    .agg(F.countDistinct("user_id").alias("distinct_users"))
)

# -------------------------------
# Gold KPI 2: Distinct sessions per product
# Groups by product_id and counts the distinct user_session values in each group.
# -------------------------------
gold_sessions_per_product = (
    silver_df
    .groupBy("product_id")
    .agg(F.countDistinct("user_session").alias("distinct_sessions"))
)

# -------------------------------
# Gold KPI 3: Distinct buyers per category
# Only rows whose event_type is "purchase" are considered; the distinct
# user_id values are then counted per category_code.
# -------------------------------
gold_buyers_by_category = (
    silver_df
    .where(F.col("event_type") == "purchase")
    .groupBy("category_code")
    .agg(F.countDistinct("user_id").alias("distinct_buyers"))
)

# Target table name paired with the DataFrame that populates it. Keeping the
# pairs in one ordered listing lets the write step and the summary printout
# share the same names, in the same order.
gold_outputs = (
    ("workspace.ecommerce.gold_distinct_users_per_product", gold_users_per_product),
    ("workspace.ecommerce.gold_distinct_sessions_per_product", gold_sessions_per_product),
    ("workspace.ecommerce.gold_distinct_buyers_by_category", gold_buyers_by_category),
)

# Write Gold tables: each one is saved as a Delta table in "overwrite" mode.
for table_name, frame in gold_outputs:
    frame.write.format("delta").mode("overwrite").saveAsTable(table_name)

print("✅ Gold tables written:")
for table_name, _ in gold_outputs:
    print(" - " + table_name)


Silver count: 42412833
✅ Gold tables written:
 - workspace.ecommerce.gold_distinct_users_per_product
 - workspace.ecommerce.gold_distinct_sessions_per_product
 - workspace.ecommerce.gold_distinct_buyers_by_category
