In [0]:
# Databricks notebook: 03_gold_daily_fact
# Default language: Python

# =========================================
# 1. Setup
# =========================================
from pyspark.sql import functions as F

catalog = "cscie103_catalog"
schema  = "final_project"

spark.sql(f"USE {catalog}.{schema}")

# =========================================
# 2. Load Silver tables
# =========================================
silver_train  = spark.table("silver_train")            # date, store_nbr, family, sales, onpromotion, ...
silver_stores = spark.table("silver_stores")           # store_nbr, city, state, cluster, type, ...
silver_oil    = spark.table("silver_oil")              # date, dcoilwtico
silver_hol    = spark.table("silver_holidays_events")  # date, is_holiday, ...
silver_tx     = spark.table("silver_transactions")     # date, store_nbr, transactions

# =========================================
# 3. Build base_fact at date–store–family grain
# =========================================

# Normalize is_holiday safely to 0/1 int
h_str = F.col("h.is_holiday").cast("string")
is_holiday_expr = (
    F.when(h_str.isin("True", "true", "1"), F.lit(1))
     .when(h_str.isin("False", "false", "0"), F.lit(0))
     .otherwise(F.lit(0))
     .cast("int")
     .alias("is_holiday")
)

base_fact = (
    silver_train.alias("t")
    # Add store attributes (city/state/cluster)
    .join(silver_stores.alias("s"), "store_nbr", "left")
    # Daily store-level transactions
    .join(silver_tx.alias("x"), ["date", "store_nbr"], "left")
    # Oil price by date
    .join(silver_oil.alias("o"), "date", "left")
    # Holidays by date
    .join(silver_hol.alias("h"), "date", "left")
    .select(
        F.col("t.id").alias("id"),
        F.col("t.date").alias("date"),
        F.col("t.store_nbr").alias("store_nbr"),
        F.col("t.family").alias("family"),
        F.col("t.sales").cast("double").alias("sales"),
        F.col("t.onpromotion").cast("int").alias("onpromotion"),
        F.col("x.transactions").cast("int").alias("transactions"),
        F.col("o.dcoilwtico").cast("double").alias("dcoilwtico"),
        F.col("s.city").alias("city"),
        F.col("s.state").alias("state"),
        F.col("s.type").alias("store_type"),
        F.col("s.cluster").alias("cluster"),
        is_holiday_expr
    )
)

# =========================================
# 3b. Add core time features
# =========================================
base_fact = (
    base_fact
    .withColumn("year", F.year("date"))
    .withColumn("month", F.month("date"))
    .withColumn("day", F.dayofmonth("date"))
    .withColumn("week_of_year", F.weekofyear("date"))
    .withColumn("day_of_week", F.dayofweek("date"))              # 1=Sun, 7=Sat
    .withColumn("is_weekend", F.col("day_of_week").isin(1, 7).cast("int"))
)

display(base_fact.limit(20))
print("Rows in base_fact:", base_fact.count())

# =========================================
# 4. Save as Gold table: gold_daily_store_family
# =========================================

# Drop if exists to avoid schema merge issues during development
spark.sql("DROP TABLE IF EXISTS gold_daily_store_family")

(
    base_fact
    .write
    .format("delta")
    .mode("overwrite")
    .saveAsTable("gold_daily_store_family")
)

print("✅ Created gold_daily_store_family (with time features)")

id,date,store_nbr,family,sales,onpromotion,transactions,dcoilwtico,city,state,store_type,cluster,is_holiday,year,month,day,week_of_year,day_of_week,is_weekend
0,2013-01-01,1,AUTOMOTIVE,0.0,0,,,Quito,Pichincha,D,13,1,2013,1,1,1,3,0
1,2013-01-01,1,BABY CARE,0.0,0,,,Quito,Pichincha,D,13,1,2013,1,1,1,3,0
2,2013-01-01,1,BEAUTY,0.0,0,,,Quito,Pichincha,D,13,1,2013,1,1,1,3,0
3,2013-01-01,1,BEVERAGES,0.0,0,,,Quito,Pichincha,D,13,1,2013,1,1,1,3,0
4,2013-01-01,1,BOOKS,0.0,0,,,Quito,Pichincha,D,13,1,2013,1,1,1,3,0
5,2013-01-01,1,BREAD/BAKERY,0.0,0,,,Quito,Pichincha,D,13,1,2013,1,1,1,3,0
6,2013-01-01,1,CELEBRATION,0.0,0,,,Quito,Pichincha,D,13,1,2013,1,1,1,3,0
7,2013-01-01,1,CLEANING,0.0,0,,,Quito,Pichincha,D,13,1,2013,1,1,1,3,0
8,2013-01-01,1,DAIRY,0.0,0,,,Quito,Pichincha,D,13,1,2013,1,1,1,3,0
9,2013-01-01,1,DELI,0.0,0,,,Quito,Pichincha,D,13,1,2013,1,1,1,3,0


Rows in base_fact: 3054348
✅ Created gold_daily_store_family (with time features)
