In [0]:
# Databricks notebook: 04_gold_aggregations_features
# Default language: Python

from pyspark.sql import functions as F

catalog = "cscie103_catalog"
schema  = "final_project"

spark.sql(f"USE {catalog}.{schema}")

gdf = spark.table("gold_daily_store_family")

display(gdf.limit(10))

# =========================================
# 1. Daily rollups
# =========================================

# 1.1 Daily per store (aggregated across families)
gold_daily_store = (
    gdf.groupBy(
        "date",
        "store_nbr",
        "city",
        "state",
        "store_type",
        "cluster"
    )
    .agg(
        F.sum("sales").alias("total_sales"),
        F.sum("onpromotion").alias("total_onpromotion_items"),
        F.sum("transactions").alias("total_transactions"),
        F.avg("dcoilwtico").alias("avg_dcoilwtico"),
        F.max("is_holiday").alias("is_holiday")  # 1 if any family in that store-day is a holiday
    )
)

gold_daily_store.write.format("delta").mode("overwrite").saveAsTable("gold_daily_store")
print("✅ gold_daily_store written")

# 1.2 Daily per family (aggregated across stores)
gold_daily_family = (
    gdf.groupBy(
        "date",
        "family"
    )
    .agg(
        F.sum("sales").alias("total_sales"),
        F.sum("onpromotion").alias("total_onpromotion_items"),
        F.sum("transactions").alias("total_transactions"),
        F.avg("dcoilwtico").alias("avg_dcoilwtico"),
        F.max("is_holiday").alias("is_holiday")
    )
)

gold_daily_family.write.format("delta").mode("overwrite").saveAsTable("gold_daily_family")
print("✅ gold_daily_family written")

# =========================================
# 2. Monthly and yearly aggregations
# =========================================

# 2.1 Monthly per store-family
gold_monthly_store_family = (
    gdf
    .groupBy(
        F.date_trunc("month", "date").alias("month_start"),
        "store_nbr",
        "family"
    )
    .agg(
        F.sum("sales").alias("monthly_sales"),
        F.sum("onpromotion").alias("monthly_onpromotion_items"),
        F.sum("transactions").alias("monthly_transactions"),
        F.avg("dcoilwtico").alias("avg_monthly_oil_price"),
        F.max("is_holiday").alias("any_holiday_flag")
    )
)

gold_monthly_store_family.write.format("delta").mode("overwrite").saveAsTable("gold_monthly_store_family")
print("✅ gold_monthly_store_family written")

# 2.2 Monthly per store (for store-level performance / maps)
gold_monthly_store = (
    gdf
    .groupBy(
        F.date_trunc("month", "date").alias("month_start"),
        "store_nbr",
        "city",
        "state",
        "store_type",
        "cluster"
    )
    .agg(
        F.sum("sales").alias("monthly_sales"),
        F.sum("transactions").alias("monthly_transactions"),
        F.sum("onpromotion").alias("monthly_onpromotion_items"),
        F.avg("dcoilwtico").alias("avg_monthly_oil_price")
    )
)

gold_monthly_store.write.format("delta").mode("overwrite").saveAsTable("gold_monthly_store")
print("✅ gold_monthly_store written")

# 2.3 Yearly per store-family
gold_yearly_store_family = (
    gdf
    .groupBy(
        F.date_trunc("year", "date").alias("year_start"),
        "store_nbr",
        "family"
    )
    .agg(
        F.sum("sales").alias("yearly_sales"),
        F.sum("onpromotion").alias("yearly_onpromotion_items"),
        F.sum("transactions").alias("yearly_transactions"),
        F.avg("dcoilwtico").alias("avg_yearly_oil_price")
    )
)

gold_yearly_store_family.write.format("delta").mode("overwrite").saveAsTable("gold_yearly_store_family")
print("✅ gold_yearly_store_family written")

# =========================================
# 3. Store-level feature table
# =========================================

store_features = (
    gdf
    .groupBy("store_nbr", "city", "state", "store_type", "cluster")
    .agg(
        F.countDistinct("date").alias("num_days"),
        F.sum("sales").alias("total_sales"),
        F.avg("sales").alias("avg_daily_sales"),
        F.stddev("sales").alias("std_daily_sales"),
        F.sum("transactions").alias("total_transactions"),
        F.avg("transactions").alias("avg_daily_transactions"),
        (F.sum("sales") / F.sum(F.when(F.col("transactions") > 0, F.col("transactions")).otherwise(0.0))
         ).alias("avg_sales_per_transaction"),

        F.avg(F.when(F.col("onpromotion") > 0, F.col("sales")).otherwise(None)).alias("avg_sales_on_promo"),
        F.avg(F.when(F.col("onpromotion") == 0, F.col("sales")).otherwise(None)).alias("avg_sales_no_promo"),

        F.avg(F.when(F.col("is_holiday") == 1, F.col("sales")).otherwise(None)).alias("avg_sales_holiday"),
        F.avg(F.when(F.col("is_holiday") == 0, F.col("sales")).otherwise(None)).alias("avg_sales_nonholiday")
    )
    .withColumn(
        "promo_lift_pct",
        (F.col("avg_sales_on_promo") - F.col("avg_sales_no_promo")) / F.col("avg_sales_no_promo") * 100.0
    )
    .withColumn(
        "holiday_uplift_pct",
        (F.col("avg_sales_holiday") - F.col("avg_sales_nonholiday")) / F.col("avg_sales_nonholiday") * 100.0
    )
)

store_features.write.format("delta").mode("overwrite").saveAsTable("gold_store_features")
print("✅ gold_store_features written")

# =========================================
# 4. Family-level feature table
# =========================================

family_features = (
    gdf
    .groupBy("family")
    .agg(
        F.countDistinct("date").alias("num_days"),
        F.countDistinct("store_nbr").alias("num_stores_selling"),
        F.sum("sales").alias("total_sales"),
        F.avg("sales").alias("avg_daily_sales"),
        F.stddev("sales").alias("std_daily_sales"),
        F.sum("transactions").alias("total_transactions"),

        F.avg(F.when(F.col("onpromotion") > 0, F.col("sales")).otherwise(None)).alias("avg_sales_on_promo"),
        F.avg(F.when(F.col("onpromotion") == 0, F.col("sales")).otherwise(None)).alias("avg_sales_no_promo"),
        F.avg(F.when(F.col("is_holiday") == 1, F.col("sales")).otherwise(None)).alias("avg_sales_holiday"),
        F.avg(F.when(F.col("is_holiday") == 0, F.col("sales")).otherwise(None)).alias("avg_sales_nonholiday")
    )
    .withColumn(
        "promo_lift_pct",
        (F.col("avg_sales_on_promo") - F.col("avg_sales_no_promo")) / F.col("avg_sales_no_promo") * 100.0
    )
    .withColumn(
        "holiday_uplift_pct",
        (F.col("avg_sales_holiday") - F.col("avg_sales_nonholiday")) / F.col("avg_sales_nonholiday") * 100.0
    )
)

family_features.write.format("delta").mode("overwrite").saveAsTable("gold_family_features")
print("✅ gold_family_features written")

from pyspark.sql import functions as F

# =========================================
# 5. Calendar table for BI
# =========================================

# 5.1 Get min/max date from the Gold daily fact
min_max = (
    gdf.agg(
        F.min("date").alias("min_date"),
        F.max("date").alias("max_date")
    )
    .collect()[0]
)

start_date = min_max["min_date"]
end_date   = min_max["max_date"]

print(f"Building calendar from {start_date} to {end_date}")

# 5.2 Build continuous date range using sequence + explode
calendar_df = (
    spark.sql(f"""
      SELECT explode(
        sequence(
          to_date('{start_date}'),
          to_date('{end_date}'),
          interval 1 day
        )
      ) AS date
    """)
    .withColumn("year", F.year("date"))
    .withColumn("month", F.month("date"))
    .withColumn("day", F.dayofmonth("date"))
    .withColumn("day_of_week", F.date_format("date", "E"))   # Mon, Tue, Wed...
    .withColumn("week_of_year", F.weekofyear("date"))
    .withColumn("is_weekend", F.col("day_of_week").isin("Sat", "Sun").cast("int"))
    # Paydays: 15th and last day of month
    .withColumn(
        "is_payday",
        F.when(F.dayofmonth("date") == 15, 1)
         .when(F.dayofmonth("date") == F.dayofmonth(F.last_day("date")), 1)
         .otherwise(0)
    )
    # Earthquake period flag (example from Kaggle description: 2016-04-16 and weeks after)
    .withColumn(
        "is_earthquake_period",
        F.when(
            (F.col("date") >= F.to_date(F.lit("2016-04-16"))) &
            (F.col("date") <= F.to_date(F.lit("2016-05-31"))),
            1
        ).otherwise(0)
    )
)

display(calendar_df.limit(10))

# 5.3 Save as Gold calendar table
calendar_df.write.format("delta").mode("overwrite").saveAsTable("gold_calendar")
print("✅ gold_calendar written")

id,date,store_nbr,family,sales,onpromotion,transactions,dcoilwtico,city,state,store_type,cluster,is_holiday,year,month,day,week_of_year,day_of_week,is_weekend
0,2013-01-01,1,AUTOMOTIVE,0.0,0,,,Quito,Pichincha,D,13,1,2013,1,1,1,3,0
1,2013-01-01,1,BABY CARE,0.0,0,,,Quito,Pichincha,D,13,1,2013,1,1,1,3,0
2,2013-01-01,1,BEAUTY,0.0,0,,,Quito,Pichincha,D,13,1,2013,1,1,1,3,0
3,2013-01-01,1,BEVERAGES,0.0,0,,,Quito,Pichincha,D,13,1,2013,1,1,1,3,0
4,2013-01-01,1,BOOKS,0.0,0,,,Quito,Pichincha,D,13,1,2013,1,1,1,3,0
5,2013-01-01,1,BREAD/BAKERY,0.0,0,,,Quito,Pichincha,D,13,1,2013,1,1,1,3,0
6,2013-01-01,1,CELEBRATION,0.0,0,,,Quito,Pichincha,D,13,1,2013,1,1,1,3,0
7,2013-01-01,1,CLEANING,0.0,0,,,Quito,Pichincha,D,13,1,2013,1,1,1,3,0
8,2013-01-01,1,DAIRY,0.0,0,,,Quito,Pichincha,D,13,1,2013,1,1,1,3,0
9,2013-01-01,1,DELI,0.0,0,,,Quito,Pichincha,D,13,1,2013,1,1,1,3,0


✅ gold_daily_store written
✅ gold_daily_family written
✅ gold_monthly_store_family written
✅ gold_monthly_store written
✅ gold_yearly_store_family written
✅ gold_store_features written
✅ gold_family_features written
Building calendar from 2013-01-01 to 2017-08-15


date,year,month,day,day_of_week,week_of_year,is_weekend,is_payday,is_earthquake_period
2013-01-01,2013,1,1,Tue,1,0,0,0
2013-01-02,2013,1,2,Wed,1,0,0,0
2013-01-03,2013,1,3,Thu,1,0,0,0
2013-01-04,2013,1,4,Fri,1,0,0,0
2013-01-05,2013,1,5,Sat,1,1,0,0
2013-01-06,2013,1,6,Sun,1,1,0,0
2013-01-07,2013,1,7,Mon,2,0,0,0
2013-01-08,2013,1,8,Tue,2,0,0,0
2013-01-09,2013,1,9,Wed,2,0,0,0
2013-01-10,2013,1,10,Thu,2,0,0,0


✅ gold_calendar written


In [0]:
%sql
SELECT *
FROM gold_daily_store
ORDER BY date DESC, total_sales DESC
LIMIT 50;

date,store_nbr,city,state,store_type,cluster,total_sales,total_onpromotion_items,total_transactions,avg_dcoilwtico,is_holiday
2017-08-15,44,Quito,Pichincha,A,5,33141.322,168,125895,47.56999999999999,1
2017-08-15,47,Quito,Pichincha,A,14,31653.69100000001,178,118173,47.56999999999999,1
2017-08-15,45,Quito,Pichincha,A,11,31562.926000000003,172,121605,47.56999999999999,1
2017-08-15,3,Quito,Pichincha,D,8,30185.60200000001,143,97548,47.56999999999999,1
2017-08-15,49,Quito,Pichincha,A,11,28429.226003,168,92862,47.56999999999999,1
2017-08-15,46,Quito,Pichincha,A,14,22348.41497,163,105501,47.56999999999999,1
2017-08-15,11,Cayambe,Pichincha,B,6,21727.24,335,77880,47.56999999999999,1
2017-08-15,51,Guayaquil,Guayas,A,17,20154.559,127,51909,47.56999999999999,1
2017-08-15,40,Machala,El Oro,C,3,19552.935,271,45936,47.56999999999999,1
2017-08-15,7,Quito,Pichincha,D,8,18744.295980000003,130,58740,47.56999999999999,1


In [0]:
%sql
SELECT *
FROM gold_daily_family
ORDER BY date DESC, total_sales DESC;

date,family,total_sales,total_onpromotion_items,total_transactions,avg_dcoilwtico,is_holiday
2017-08-15,GROCERY I,224208.125,2296,86561,47.570000000000014,1
2017-08-15,BEVERAGES,170773.0,982,86561,47.570000000000014,1
2017-08-15,PRODUCE,125108.971,3169,86561,47.570000000000014,1
2017-08-15,CLEANING,58474.0,841,86561,47.570000000000014,1
2017-08-15,DAIRY,40707.0,977,86561,47.570000000000014,1
2017-08-15,BREAD/BAKERY,29158.19502000001,512,86561,47.570000000000014,1
2017-08-15,MEATS,17928.170943999998,0,86561,47.570000000000014,1
2017-08-15,POULTRY,17586.709986,6,86561,47.570000000000014,1
2017-08-15,DELI,14935.453,341,86561,47.570000000000014,1
2017-08-15,PERSONAL CARE,14787.0,476,86561,47.570000000000014,1


In [0]:
%sql
SELECT *
FROM gold_monthly_store_family
ORDER BY month_start DESC, monthly_sales DESC;

month_start,store_nbr,family,monthly_sales,monthly_onpromotion_items,monthly_transactions,avg_monthly_oil_price,any_holiday_flag
2017-08-01T00:00:00.000Z,45,GROCERY I,164681.0,666,58745,48.90272727272728,1
2017-08-01T00:00:00.000Z,47,GROCERY I,152173.0,680,57541,48.90272727272728,1
2017-08-01T00:00:00.000Z,44,GROCERY I,143052.0,650,64335,48.90272727272728,1
2017-08-01T00:00:00.000Z,46,GROCERY I,140431.0,670,52733,48.90272727272728,1
2017-08-01T00:00:00.000Z,44,PRODUCE,139693.164,620,64335,48.90272727272728,1
2017-08-01T00:00:00.000Z,44,BEVERAGES,136453.0,208,64335,48.90272727272728,1
2017-08-01T00:00:00.000Z,45,BEVERAGES,130901.0,219,58745,48.90272727272728,1
2017-08-01T00:00:00.000Z,47,BEVERAGES,128260.0,185,57541,48.90272727272728,1
2017-08-01T00:00:00.000Z,48,GROCERY I,127820.0,636,45595,48.90272727272728,1
2017-08-01T00:00:00.000Z,11,GROCERY I,119260.91400000002,1002,35268,48.90272727272728,1


In [0]:
%sql
SELECT *
FROM gold_store_features
ORDER BY total_sales DESC;

store_nbr,city,state,store_type,cluster,num_days,total_sales,avg_daily_sales,std_daily_sales,total_transactions,avg_daily_transactions,avg_sales_per_transaction,avg_sales_on_promo,avg_sales_no_promo,avg_sales_holiday,avg_sales_nonholiday,promo_lift_pct,holiday_uplift_pct
44,Quito,Pichincha,A,5,1684,63356137.229998976,1120.118405112955,2691.663957182299,244741167,4344.697715289983,0.2588699645695445,2781.8325472271504,546.267166396499,1214.2719646854828,1101.9691107485924,409.2439594306503,10.191107249875683
45,Quito,Pichincha,A,11,1684,55689022.00246001,984.5659984169586,2401.3758856243603,208735461,3705.516695957821,0.2667923396229259,2496.220888821761,448.5577048224207,1098.79757960442,962.5464103940448,456.4993894843449,14.155283084438178
47,Quito,Pichincha,A,14,1684,52024475.95600099,919.7778712916976,2230.0309008321105,219915663,3903.990041007616,0.236565578123469,2321.316154595245,411.38003981147295,1008.295537970244,902.7149668588812,464.2753488134856,11.695892389902951
3,Quito,Pichincha,D,8,1684,51533528.135851,911.0980540972917,2152.1393441221085,180583425,3207.634818288394,0.2853724151917652,2395.589058539299,420.6671422431085,976.4273055877916,898.5050111864574,469.4737758136716,8.672438487398
49,Quito,Pichincha,A,11,1684,44346822.76006599,784.0391563252005,1904.1514723306875,154116468,2735.908611599297,0.2877487612814095,2136.191737510336,331.9393537080847,865.914677535718,768.2566097870563,543.5488030108517,12.711646929498002
46,Quito,Pichincha,A,14,1684,42804727.2936452,756.7753490620062,1895.8455871498095,201725469,3581.0738137082603,0.2121929744708894,1869.4327168783743,338.6729562569309,858.7576187599824,737.1169713957993,451.9875981653958,16.502217705535294
48,Quito,Pichincha,A,14,1684,36741804.09981652,649.584599197633,1676.2929318751667,172072263,3054.66373755126,0.2135254308814228,1627.074526200671,285.6346666522953,747.6381200317252,630.6835377703234,469.63482243607285,18.54409941868393
51,Guayaquil,Guayas,A,17,1684,33603861.27734599,594.1066666197446,1356.2981823349262,96734253,1717.2472173403632,0.3473832715423562,1545.6375567538746,307.3316365972666,643.263892545564,584.630987022353,402.9217212607722,10.02904512842204
8,Quito,Pichincha,D,8,1684,31124067.145029,550.2646148479367,1279.364914301214,155944800,2769.9882766705746,0.1995838729154739,1481.21274414179,257.842824316528,586.3573261508586,543.3072863643533,474.4634344849762,7.923700062000462
50,Ambato,Tungurahua,A,14,1684,29273750.99657431,517.5515539863213,1286.4590415433377,147630153,2620.76215582894,0.1982911376957951,1343.2155861467138,232.89464781771235,572.7777533644896,506.9060026796041,476.74815575755713,12.99486499206452


In [0]:
%sql
SELECT *
FROM gold_family_features
ORDER BY total_sales DESC;

family,num_days,num_stores_selling,total_sales,avg_daily_sales,std_daily_sales,total_transactions,avg_sales_on_promo,avg_sales_no_promo,avg_sales_holiday,avg_sales_nonholiday,promo_lift_pct,holiday_uplift_pct
GROCERY I,1684,54,350827297.99000514,3790.4327973335617,2903.6999741165228,144262952,4426.943571433489,2717.34138897272,4189.966893769218,3713.417526134709,62.91451598236897,12.8331749468137
BEVERAGES,1684,54,221663540.0,2394.9127014996325,2316.775067180202,144262952,3224.894336480784,1290.3258455262032,2633.358202968311,2348.9493028170828,149.928678687022,12.107919903172805
PRODUCE,1684,54,125447968.02497062,1355.3736983552726,2189.741018068485,144262952,2434.579491118157,794.3993375063012,1525.0925814068703,1322.658228205445,206.4679659427392,15.305114267960525
CLEANING,1684,54,99421019.0,1074.1715177838282,736.7841670700276,144262952,1242.3257814796668,867.5771351520551,1120.5572937558495,1065.230070362638,43.19485048000161,5.193922414749002
DAIRY,1684,54,65823605.0,711.175990751545,674.0652428519559,144262952,935.5534230270569,479.19555233260814,764.3955742746357,700.9172401350551,95.2341624359844,9.05646637074434
BREAD/BAKERY,1684,54,42959924.004929096,464.15061157492863,369.0495265425512,144262952,576.5095401801594,379.3926480313712,473.0086606698093,462.44311011405006,51.955907203686536,2.284724396294603
POULTRY,1684,54,32494450.88882909,351.0788159474166,400.8667893257101,144262952,486.0896972391841,313.3226587229422,378.0208424064721,345.88539818182045,55.14029506209839,9.290777926323198
MEATS,1684,54,31650996.2860629,341.96590481506223,454.0823812288984,144262952,465.8441845040816,302.9493593957968,351.0291196266812,340.2188550566637,53.769654913006846,3.177444286036724
PERSONAL CARE,1684,54,25100482.0,271.1923808289036,228.1703423579115,144262952,340.1985828302466,220.73617761484363,294.5421179302046,266.6914224593417,54.119993607866846,10.443041329950857
DELI,1684,54,24585626.80473691,265.6297463669228,210.77412288548007,144262952,322.502658458386,221.74795941470265,280.1288864631637,262.8348529732877,45.43658453932225,6.579809828962693


In [0]:
%sql
SELECT *
FROM gold_family_features
ORDER BY total_sales DESC;

family,num_days,num_stores_selling,total_sales,avg_daily_sales,std_daily_sales,total_transactions,avg_sales_on_promo,avg_sales_no_promo,avg_sales_holiday,avg_sales_nonholiday,promo_lift_pct,holiday_uplift_pct
GROCERY I,1684,54,350827297.99000514,3790.4327973335617,2903.6999741165228,144262952,4426.943571433489,2717.34138897272,4189.966893769218,3713.417526134709,62.91451598236897,12.8331749468137
BEVERAGES,1684,54,221663540.0,2394.9127014996325,2316.775067180202,144262952,3224.894336480784,1290.3258455262032,2633.358202968311,2348.9493028170828,149.928678687022,12.107919903172805
PRODUCE,1684,54,125447968.02497062,1355.3736983552726,2189.741018068485,144262952,2434.579491118157,794.3993375063012,1525.0925814068703,1322.658228205445,206.4679659427392,15.305114267960525
CLEANING,1684,54,99421019.0,1074.1715177838282,736.7841670700276,144262952,1242.3257814796668,867.5771351520551,1120.5572937558495,1065.230070362638,43.19485048000161,5.193922414749002
DAIRY,1684,54,65823605.0,711.175990751545,674.0652428519559,144262952,935.5534230270569,479.19555233260814,764.3955742746357,700.9172401350551,95.2341624359844,9.05646637074434
BREAD/BAKERY,1684,54,42959924.004929096,464.15061157492863,369.0495265425512,144262952,576.5095401801594,379.3926480313712,473.0086606698093,462.44311011405006,51.955907203686536,2.284724396294603
POULTRY,1684,54,32494450.88882909,351.0788159474166,400.8667893257101,144262952,486.0896972391841,313.3226587229422,378.0208424064721,345.88539818182045,55.14029506209839,9.290777926323198
MEATS,1684,54,31650996.2860629,341.96590481506223,454.0823812288984,144262952,465.8441845040816,302.9493593957968,351.0291196266812,340.2188550566637,53.769654913006846,3.177444286036724
PERSONAL CARE,1684,54,25100482.0,271.1923808289036,228.1703423579115,144262952,340.1985828302466,220.73617761484363,294.5421179302046,266.6914224593417,54.119993607866846,10.443041329950857
DELI,1684,54,24585626.80473691,265.6297463669228,210.77412288548007,144262952,322.502658458386,221.74795941470265,280.1288864631637,262.8348529732877,45.43658453932225,6.579809828962693


In [0]:
%sql
SELECT *
FROM gold_calendar
ORDER BY date
LIMIT 100;

date,year,month,day,day_of_week,week_of_year,is_weekend,is_payday,is_earthquake_period
2013-01-01,2013,1,1,Tue,1,0,0,0
2013-01-02,2013,1,2,Wed,1,0,0,0
2013-01-03,2013,1,3,Thu,1,0,0,0
2013-01-04,2013,1,4,Fri,1,0,0,0
2013-01-05,2013,1,5,Sat,1,1,0,0
2013-01-06,2013,1,6,Sun,1,1,0,0
2013-01-07,2013,1,7,Mon,2,0,0,0
2013-01-08,2013,1,8,Tue,2,0,0,0
2013-01-09,2013,1,9,Wed,2,0,0,0
2013-01-10,2013,1,10,Thu,2,0,0,0
