In [0]:
from pyspark.sql.functions import *
from pyspark.sql.window import Window
from pyspark.sql.functions import *
from delta.tables import DeltaTable

orders_tbl = "my_catalog.silver_retailx.orders"
customers_tbl = "my_catalog.silver_retailx.customers" 

orders_df = spark.table(orders_tbl)
customers_df = spark.table(customers_tbl)

In [0]:
daily_sales_df = (
    orders_df
    .groupBy("order_date")
    .agg(
        count("order_id").alias("total_orders"),
        sum("amount").alias("total_revenue"),
        round(avg("amount"), 2).alias("avg_order_value")
    )
)
display(daily_sales_df )
daily_sales_df.write.format("delta") \
    .mode("overwrite") \
    .saveAsTable("my_catalog.gold_retailx.daily_sales")

order_date,total_orders,total_revenue,avg_order_value
2025-04-01,1,1200.0,1200.0
2025-04-10,1,850.0,850.0
2025-05-02,1,2300.0,2300.0
2025-04-03,1,1500.0,1500.0
2025-04-18,1,950.0,950.0
2025-05-05,1,1800.0,1800.0
2025-04-06,1,2200.0,2200.0
2025-04-22,1,1100.0,1100.0
2025-05-10,1,2750.0,2750.0
2025-04-08,1,900.0,900.0


In [0]:
orders_df =  orders_df.withColumn("year", year("order_date")).withColumn("month", month("order_date"))

monthly_sales_df = (
   orders_df
    .groupBy("year", "month")
    .agg(
        F.count("order_id").alias("total_orders"),
        F.sum("amount").alias("total_revenue"),
        F.round(F.avg("amount"), 2).alias("avg_order_value")
    )
)
display(monthly_sales_df)
monthly_sales_df.write.format("delta") \
    .mode("overwrite") \
    .saveAsTable("my_catalog.gold_retailx.monthly_sales")

year,month,total_orders,total_revenue,avg_order_value
2025,4,10,13700.0,1370.0
2025,5,5,10850.0,2170.0


In [0]:
customer_metrics_df = (
    orders_df
    .groupBy("customer_id")
    .agg(
        F.count("order_id").alias("total_orders"),
        F.sum("amount").alias("lifetime_value"),
        F.round(F.avg("amount"), 2).alias("avg_order_value")
    )
)

customers_df = customers_df.select("customer_id", "name")
customer_metrics_df = customer_metrics_df.join(customers_df, on="customer_id", how="inner").select("customer_id", "name", "total_orders", "lifetime_value", "avg_order_value")
display(customer_metrics_df)

customer_metrics_df.write.format("delta") \
    .mode("overwrite") \
    .saveAsTable("my_catalog.gold_retailx.customer_metrics")

customer_id,name,total_orders,lifetime_value,avg_order_value
2,Sneha Reddy,3,4250.0,1416.67
1,Ravi Kumar,3,4350.0,1450.0
3,Arjun Mehta,3,6050.0,2016.67
5,Vikram Singh,3,6100.0,2033.33
4,Priya Sharma,3,3800.0,1266.67


In [0]:
from pyspark.sql.window import Window

#rank_window = Window.partitionBy("country").orderBy(F.desc("total_revenue"))
rank_window = Window.orderBy(F.desc("total_revenue"))

top_customers_df = (
    orders_df
    .groupBy("customer_id")
    .agg(F.sum("amount").alias("total_revenue"))
    .withColumn("revenue_rank", F.rank().over(rank_window))
)

customers_df = customers_df.select("customer_id", "name")
top_customers_df = top_customers_df.join(customers_df, on="customer_id", how="inner").select("customer_id", "name", "total_revenue", "revenue_rank")
display(top_customers_df)

top_customers_df.write.format("delta") \
    .mode("overwrite") \
    .saveAsTable("my_catalog.gold_retailx.top_customers")



customer_id,name,total_revenue,revenue_rank
5,Vikram Singh,6100.0,1
3,Arjun Mehta,6050.0,2
1,Ravi Kumar,4350.0,3
2,Sneha Reddy,4250.0,4
4,Priya Sharma,3800.0,5


In [0]:
daily_revenue_df = orders_df.groupBy("order_date").agg(F.sum("amount").alias("daily_revenue"))


trend_window = Window.orderBy("order_date").rowsBetween(-6, 0)
revenue_trend_df = daily_revenue_df.withColumn("rolling_7day_avg", F.round(F.avg("daily_revenue").over(trend_window), 2))

display(revenue_trend_df)
revenue_trend_df.write.format("delta") \
    .mode("overwrite") \
    .saveAsTable("my_catalog.gold_retailx.revenue_trend")




order_date,daily_revenue,rolling_7day_avg
2025-04-01,1200.0,1200.0
2025-04-03,1500.0,1350.0
2025-04-06,2200.0,1633.33
2025-04-08,900.0,1450.0
2025-04-10,850.0,1330.0
2025-04-11,2000.0,1441.67
2025-04-18,950.0,1371.43
2025-04-22,1100.0,1357.14
2025-04-25,1300.0,1328.57
2025-04-27,1700.0,1257.14


In [0]:
fact_total = orders_df.agg(F.sum("amount")).collect()[0][0]

daily_total = (
    spark.table("my_catalog.gold_retailx.daily_sales")
    .agg(F.sum("total_revenue"))
    .collect()[0][0]
)

print(f"Fact Total Revenue : {fact_total}")
print(f"Gold Total Revenue : {daily_total}")


Fact Total Revenue : 24550.00
Gold Total Revenue : 24550.00
