In [0]:
from pyspark.sql import functions as F

# ------------------------------------------------------------
# Step 1 + 2: read the gold sales fact (Delta) and roll it up
# to one row per customer_id (business rules).
# ------------------------------------------------------------

# One aggregate expression per output column of the summary.
# NOTE(review): "total_orders" is a plain row count per customer; it equals the
# number of orders only if sales_fact holds one row per order — confirm the grain.
# The same caveat applies to "avg_order_value" (mean of total_amount per row).
customer_aggregates = [
    F.count("*").alias("total_orders"),
    F.sum("quantity").alias("total_quantity"),
    F.sum("total_amount").alias("total_revenue"),
    F.round(F.avg("total_amount"), 2).alias("avg_order_value"),
    F.min("order_date").alias("first_order_date"),
    F.max("order_date").alias("last_order_date"),
]

df = (
    spark.read.format("delta")
    .load("<output_path>/gold/sales_fact")
    .groupBy("customer_id")
    .agg(*customer_aggregates)
    # Stamp each summary row with the time this job ran.
    .withColumn("ingestion_date", F.current_timestamp())
)

# ------------------------------------------------------------
# Step 3: persist the summary as a Delta table.
# mode("overwrite") replaces whatever is already stored at the target path.
# ------------------------------------------------------------
summary_writer = df.write.format("delta").mode("overwrite")
summary_writer.save("<output_path>/gold/revenue_summary")


In [0]:
%sql
-- Preview the per-customer revenue summary written by the PySpark cell above.
-- The result shown below this cell has the revenue_summary columns
-- (customer_id, total_orders, ..., ingestion_date), so the query reads
-- gold/revenue_summary rather than gold/orders.
SELECT *
FROM delta.`<output_path>/gold/revenue_summary`;


customer_id,total_orders,total_quantity,total_revenue,avg_order_value,first_order_date,last_order_date,ingestion_date
C054,1,5.0,666.8,666.8,2026-01-24,2026-01-24,2026-01-30T19:29:41.803Z
C065,1,5.0,739.81,739.81,2026-01-05,2026-01-05,2026-01-30T19:29:41.803Z
C021,1,1.0,275.93,275.93,2026-01-23,2026-01-23,2026-01-30T19:29:41.803Z
C141,1,1.0,407.29,407.29,2026-01-09,2026-01-09,2026-01-30T19:29:41.803Z
C150,1,1.0,58.09,58.09,2026-01-12,2026-01-12,2026-01-30T19:29:41.803Z
C072,1,1.0,123.25,123.25,2026-01-13,2026-01-13,2026-01-30T19:29:41.803Z
C046,1,1.0,916.61,916.61,2026-01-25,2026-01-25,2026-01-30T19:29:41.803Z
C119,1,1.0,807.14,807.14,2026-01-19,2026-01-19,2026-01-30T19:29:41.803Z
C038,1,1.0,908.51,908.51,2026-01-14,2026-01-14,2026-01-30T19:29:41.803Z
C073,1,1.0,569.36,569.36,2026-01-15,2026-01-15,2026-01-30T19:29:41.803Z
