In [0]:
# Gold Layer Aggregation for telemetry1 events
# Spark 4.0.0, serverless cluster compatible

from pyspark.sql.functions import (
    date_format, col, countDistinct, count, avg
)
from delta.tables import DeltaTable

# Storage locations: the Silver path is read from, the Gold path is written to.
silver_path = "dbfs:/tmp/silver/telemetry1/"
gold_path = "dbfs:/tmp/gold/telemetry1/"

# 0) Make sure the Gold root directory is present before any table is written.
dbutils.fs.mkdirs(gold_path)

# 1-2) Load the Silver Delta table and, in the same chain, derive event_date:
# event_timestamp rendered as a "yyyy-MM-dd" string, used as the daily
# grouping / partition key by the aggregations in this script.
silver_df = (
    spark.read.format("delta")
    .load(silver_path)
    .withColumn("event_date", date_format(col("event_timestamp"), "yyyy-MM-dd"))
)

# 3) Gold Table 1: Daily App Metrics
# One row per (event_date, app_name_norm) with the event volume, the number
# of distinct user_id values, and the mean of is_purchase (a purchase rate,
# presumably because is_purchase is a 0/1 flag -- TODO confirm).
app_metric_exprs = [
    count("*").alias("total_events"),
    countDistinct("user_id").alias("daily_active_users"),
    avg("is_purchase").alias("purchase_rate"),
]
app_daily = silver_df.groupBy("event_date", "app_name_norm").agg(*app_metric_exprs)

# Overwrite the Delta table at the target path, partitioned by day.
(
    app_daily.write
    .format("delta")
    .mode("overwrite")
    .partitionBy("event_date")
    .save(f"{gold_path}/app_daily_metrics")
)

# 4) Gold Table 2: Daily Event-Type Distribution
# One row per (event_date, event_type_norm) holding the number of events.
event_count_expr = count("*").alias("event_count")
event_daily = silver_df.groupBy("event_date", "event_type_norm").agg(event_count_expr)

# Overwrite the Delta table at the target path, partitioned by day.
event_daily_writer = event_daily.write.format("delta").mode("overwrite")
event_daily_writer.partitionBy("event_date").save(f"{gold_path}/event_daily_metrics")

# 5) Gold Table 3: Daily Region Metrics
# One row per (event_date, region): total events and distinct user_id count.
region_daily = silver_df.groupBy("event_date", "region").agg(
    count("*").alias("region_events"),
    countDistinct("user_id").alias("region_active_users"),
)

# Overwrite the Delta table at the target path, partitioned by day.
# (The writer is a builder, so the order of these option calls is irrelevant.)
(
    region_daily.write
    .partitionBy("event_date")
    .mode("overwrite")
    .format("delta")
    .save(f"{gold_path}/region_daily_metrics")
)

# 6) Optimize all Gold tables for performance
# DeltaTable.optimize() only returns a DeltaOptimizeBuilder; no files are
# rewritten until an execute method is called on it. executeCompaction()
# performs the actual bin-packing of small files.
for sub in ["app_daily_metrics", "event_daily_metrics", "region_daily_metrics"]:
    path = f"{gold_path}/{sub}"
    DeltaTable.forPath(spark, path).optimize().executeCompaction()

# 7) Sanity-check: print the row count of each Gold table.
# DataFrame.count() is the number of rows in the table, not the number of
# partitions, so the labels say "rows".
for label, table_name in [
    ("App Daily", "app_daily_metrics"),
    ("Event Daily", "event_daily_metrics"),
    ("Region Daily", "region_daily_metrics"),
]:
    row_count = spark.read.format("delta").load(f"{gold_path}/{table_name}").count()
    print(f"{label} rows:", row_count)
