In [0]:
from pyspark.sql import functions as F

In [0]:
# Silver-layer input locations (Unity Catalog volume paths).
# Both are read as Delta tables by the loading cell further down.
customer_silver_path = "/Volumes/project4cat/project4db/p4silver/customer_silver"  # customer attributes
retail_silver_path = "/Volumes/project4cat/project4db/p4silver/retail_silver"  # retail rows


In [0]:
# Read the two silver Delta tables that feed every gold table in this notebook.
df_retail_silver = spark.read.load(retail_silver_path, format="delta")
df_customer_silver = spark.read.load(customer_silver_path, format="delta")

# Short working names used by the downstream cells; they point at the same
# DataFrames as the *_silver variables above (no copy, no transformation).
df_retail, df_customer = df_retail_silver, df_customer_silver

# Preview helpers, left disabled: display(df_retail) / display(df_customer)

In [0]:
# Optional schema check, left disabled: uncomment to print the column names
# and types of the two input DataFrames before joining them.
# df_retail.printSchema()
# df_customer.printSchema()


2 Join Customer + Retail

In [0]:
# Enrich every retail row with the attributes of the customer who placed it.
# The join is a LEFT join from retail, so retail rows without a matching
# customer are kept and their customer attributes come back as NULL.
# NOTE(review): if customer_id is not unique in the customer table, retail rows
# would be duplicated by this join -- confirm uniqueness upstream.
customer_id_match = F.col("c.customer_id") == F.col("r.customer_id")

df_sales = (
    df_retail.alias("r")
    .join(df_customer.alias("c"), on=customer_id_match, how="left")
    .select(
        F.col("r.*"),  # all retail columns, including the retail-side customer_id
        F.col("c.gender"),
        F.col("c.city"),
        F.col("c.age"),
        F.col("c.loyalty_tier"),
        F.col("c.signup_date"),
    )
)
# Preview helper, left disabled: display(df_sales)

In [0]:
# Persist the enriched sales table to the gold layer as a Delta table.
# Mode "overwrite" replaces whatever a previous run stored at this path.
sales_gold_path = "/Volumes/project4cat/project4db/p4gold/sales_gold"

df_sales.write.save(sales_gold_path, format="delta", mode="overwrite")

3 Gold Table: Customer Sales Summary (CLV / Total Orders / Total Spend)

In [0]:
# Per-customer summary: one output row per distinct combination of the keys
# below. The descriptive attributes are part of the grouping key so that they
# are carried through to the output.
customer_keys = ["customer_id", "gender", "city", "loyalty_tier"]

# NOTE(review): total_orders counts rows of df_sales; that equals the number of
# orders only if the retail table holds one row per order -- confirm.
# NOTE(review): this cell spells the amount column "TotalAmount" while later
# cells use "totalAmount"; both resolve only while Spark column matching is
# case-insensitive (the default) -- confirm the real column name.
customer_metrics = [
    F.count("*").alias("total_orders"),
    F.sum("TotalAmount").alias("total_spend"),
    F.avg("TotalAmount").alias("avg_order_value"),
]

df_customer_sales = df_sales.groupBy(*customer_keys).agg(*customer_metrics)
# Preview helper, left disabled: display(df_customer_sales)

In [0]:
# Persist the per-customer summary to the gold layer as a Delta table,
# replacing the output of any previous run.
customer_sales_gold_path = "/Volumes/project4cat/project4db/p4gold/customer_sales_summary"

df_customer_sales.write.save(customer_sales_gold_path, format="delta", mode="overwrite")

4 Gold Table: Product Sales Summary

In [0]:
# Per-product summary: one output row per (product_id, product_name, category).
product_keys = ["product_id", "product_name", "category"]

# count("order_id") counts rows whose order_id is not NULL.
# NOTE(review): that is a row count, not a distinct-order count -- confirm this
# is the intended meaning of order_count.
product_metrics = [
    F.sum("quantity").alias("units_sold"),
    F.sum("totalAmount").alias("revenue"),
    F.count("order_id").alias("order_count"),
]

df_product_sales = df_sales.groupBy(*product_keys).agg(*product_metrics)

# Preview helper, left disabled: display(df_product_sales)


In [0]:
# Persist the per-product summary to the gold layer as a Delta table,
# replacing the output of any previous run.
product_sales_gold_path = "/Volumes/project4cat/project4db/p4gold/product_sales_summary"

df_product_sales.write.save(product_sales_gold_path, format="delta", mode="overwrite")


5 Gold Table: City-Level Revenue & Order KPIs

In [0]:
# City-level KPIs: one output row per city value. Rows whose city is NULL
# (for example retail rows with no matching customer in the left join that
# builds df_sales) are grouped together under a NULL city.
city_metrics = [
    F.sum("totalAmount").alias("total_revenue"),
    F.count("order_id").alias("total_orders"),  # rows with a non-NULL order_id
    F.avg("totalAmount").alias("avg_order_value"),
]

df_city_sales = df_sales.groupBy("city").agg(*city_metrics)

# Preview helper, left disabled: display(df_city_sales)


In [0]:
# Persist the city-level KPIs to the gold layer as a Delta table,
# replacing the output of any previous run.
city_sales_gold_path = "/Volumes/project4cat/project4db/p4gold/city_sales_summary"

df_city_sales.write.save(city_sales_gold_path, format="delta", mode="overwrite")


6 Gold Table: Loyalty Tier Revenue Report

In [0]:
# Revenue report by loyalty tier: one output row per loyalty_tier value
# (rows with a NULL tier form their own group).
tier_metrics = [
    F.sum("totalAmount").alias("total_revenue"),
    F.count("order_id").alias("order_count"),  # rows with a non-NULL order_id
    F.avg("totalAmount").alias("avg_order_value"),
]

df_tier_sales = df_sales.groupBy("loyalty_tier").agg(*tier_metrics)

# Preview helper, left disabled: display(df_tier_sales)


In [0]:
# Persist the loyalty-tier report to the gold layer as a Delta table,
# replacing the output of any previous run.
loyalty_sales_gold_path = "/Volumes/project4cat/project4db/p4gold/loyalty_sales_summary"

df_tier_sales.write.save(loyalty_sales_gold_path, format="delta", mode="overwrite")


7 Gold Table: Monthly Revenue Trend

In [0]:
# Monthly revenue trend: bucket each row by the "yyyy-MM" rendering of its
# order_date, then total the amounts per bucket.
order_month = F.date_format("order_date", "yyyy-MM").alias("month")

# NOTE(review): the sort below orders this DataFrame, but row order may not be
# preserved once the result is written to and re-read from Delta -- consumers
# that need chronological order should sort on "month" themselves.
df_monthly = (
    df_sales.groupBy(order_month)
    .agg(F.sum("totalAmount").alias("monthly_revenue"))
    .orderBy("month")
)

# Preview helper, left disabled: display(df_monthly)


In [0]:
# Persist the monthly revenue trend to the gold layer as a Delta table,
# replacing the output of any previous run.
monthly_revenue_gold_path = "/Volumes/project4cat/project4db/p4gold/monthly_revenue"

df_monthly.write.save(monthly_revenue_gold_path, format="delta", mode="overwrite")
