In [0]:
# 03 - Gold layer code
# ───────────────────────────────────────────────────────────
# Gold Layer: Curated, Consumption-Ready Fact Tables
# ───────────────────────────────────────────────────────────

from pyspark.sql.functions import count, countDistinct

# 1) Storage locations: the Silver input and the two Gold fact-table outputs
silver_path = "dbfs:/tmp/silver/cc_events_enterprise/"
feature_usage_gold = "dbfs:/tmp/gold/feature_usage_fact/"
user_activity_gold = "dbfs:/tmp/gold/user_activity_fact/"

# 2) Load the Silver "enterprise view" Delta table and preview five rows
silver_df = (
    spark.read
    .format("delta")
    .load(silver_path)
)
display(silver_df.limit(5))

# 3) Feature Usage Fact
# Rows with a NULL feature_category are dropped; the remaining rows are
# counted per (event_date, app_name, feature_category) into usage_count.
known_feature_events = silver_df.filter("feature_category IS NOT NULL")
feature_usage = known_feature_events.groupBy(
    "event_date", "app_name", "feature_category"
).agg(count("*").alias("usage_count"))

# 4) Save the fact as a Delta table (overwrite mode), partitioned by event_date
(
    feature_usage.write
    .format("delta")
    .mode("overwrite")
    .partitionBy("event_date")
    .save(feature_usage_gold)
)

# Read the table back from its path to show what was written
feature_usage_written = spark.read.format("delta").load(feature_usage_gold)
display(feature_usage_written)

# 5) User Activity Fact
# One row per (event_date, app_name, region); active_users is the number of
# distinct user_id values seen in that group.
events_by_day_app_region = silver_df.groupBy("event_date", "app_name", "region")
user_activity = events_by_day_app_region.agg(
    countDistinct("user_id").alias("active_users")
)

# 6) Save the fact as a Delta table (overwrite mode), partitioned by event_date
(
    user_activity.write
    .format("delta")
    .mode("overwrite")
    .partitionBy("event_date")
    .save(user_activity_gold)
)

# Read the table back from its path to show what was written
user_activity_written = spark.read.format("delta").load(user_activity_gold)
display(user_activity_written)

# 7) Run OPTIMIZE on both Gold tables, addressed by path:
#    feature usage is Z-Ordered by feature_category, user activity by region.
feature_usage_optimize_sql = f"OPTIMIZE delta.`{feature_usage_gold}` ZORDER BY (feature_category)"
user_activity_optimize_sql = f"OPTIMIZE delta.`{user_activity_gold}`   ZORDER BY (region)"
spark.sql(feature_usage_optimize_sql)
spark.sql(user_activity_optimize_sql)