## Building Out Gold Data

In [0]:
# Root path under which every data-layer directory of the project lives.
VOLUME_ROOT_PATH = "/Volumes/cscie103_catalog/final_project/data"

# Landing zone: raw CSVs arrive here after download.
VOLUME_TARGET_DIR = f"{VOLUME_ROOT_PATH}/raw"

# Bronze layer: raw data.
VOLUME_BRONZE_DIR = f"{VOLUME_ROOT_PATH}/bronze"

# Silver layer: prepared data is written here.
VOLUME_SILVER_DIR = f"{VOLUME_ROOT_PATH}/silver"

# Gold layer: final data is written here.
VOLUME_GOLD_DIR = f"{VOLUME_ROOT_PATH}/gold"

class DataframeNames:
    """String names used to refer to the project's datasets."""

    HOLIDAYS = "holidays"
    OIL = "oil"
    STORES = "stores"
    TEST = "test"
    TRAIN = "train"
    TRANSACTIONS = "transactions"
    TRAINING = "training"

    # NOTE(review): TRAINING is not part of ALL — presumably because it is a
    # derived dataset rather than one of the source files; confirm.
    ALL = [
        HOLIDAYS,
        OIL,
        STORES,
        TEST,
        TRAIN,
        TRANSACTIONS,
    ]

In [0]:
from pyspark.sql import functions as F

# Use the right catalog & schema so unqualified table names resolve inside
# cscie103_catalog.final_project.
spark.sql("USE cscie103_catalog.final_project")

# Fully qualified name of the Gold fact table (the same name later cells query).
GOLD_TABLE = "cscie103_catalog.final_project.gold_daily_store_family"


def _load_silver(name: str, fmt: str = "delta"):
    """Read one Silver dataset from the Silver volume directory.

    ``spark.table`` expects a table identifier, so a filesystem-style string
    such as "/silver/train" cannot be resolved by it. The Silver data is
    therefore read by path from ``VOLUME_SILVER_DIR`` instead.

    Args:
        name: Dataset directory name under ``VOLUME_SILVER_DIR``
            (one of the ``DataframeNames`` values).
        fmt: Storage format passed to the reader.
            NOTE(review): "delta" is an assumption — confirm it matches the
            format used by the notebook that writes the Silver layer.

    Returns:
        The dataset as a Spark DataFrame.
    """
    return spark.read.format(fmt).load(f"{VOLUME_SILVER_DIR}/{name}")


# Load Silver datasets
silver_train = _load_silver(DataframeNames.TRAIN)
silver_stores = _load_silver(DataframeNames.STORES)
silver_oil = _load_silver(DataframeNames.OIL)
silver_hol = _load_silver(DataframeNames.HOLIDAYS)
silver_tx = _load_silver(DataframeNames.TRANSACTIONS)

# Enriched base fact: one row per date-store-family.
# Every lookup is a LEFT join so that no training row is dropped when a
# dimension has no match; unmatched attributes come through as NULL.
# NOTE(review): the one-row-per-key grain only holds if stores is unique on
# store_nbr, transactions on (date, store_nbr), and oil/holidays on date —
# a duplicate key on the right side would multiply fact rows. Confirm the
# Silver layer guarantees this.
base_fact = (
    silver_train.alias("t")
    .join(silver_stores.alias("s"), "store_nbr", "left")
    .join(silver_tx.alias("x"), ["date", "store_nbr"], "left")
    .join(silver_oil.alias("o"), "date", "left")
    .join(silver_hol.alias("h"), "date", "left")
    .select(
        F.col("t.date").alias("date"),
        F.col("t.store_nbr").alias("store_nbr"),
        F.col("t.family").alias("family"),
        F.col("t.sales").alias("sales"),
        F.col("t.onpromotion").alias("onpromotion"),
        F.col("x.transactions").alias("transactions"),
        F.col("o.dcoilwtico").alias("dcoilwtico"),
        F.col("s.city").alias("city"),
        F.col("s.state").alias("state"),
        F.col("s.type").alias("store_type"),
        F.col("s.cluster").alias("cluster"),
        F.col("h.is_holiday").alias("is_holiday"),
    )
)

# Initial write of Gold fact table as managed UC table.
# mode("overwrite") replaces any existing contents, so re-running this cell
# rebuilds the table from scratch.
base_fact.write.format("delta").mode("overwrite").saveAsTable(GOLD_TABLE)

print("✅ Gold table 'gold_daily_store_family' created.")


In [0]:
# Preview the Gold fact table to eyeball the result of the build step.
gold_table_name = "cscie103_catalog.final_project.gold_daily_store_family"
display(spark.table(gold_table_name))


In [0]:
%sql
-- Spot-check: up to 50 Gold rows whose sales value equals 1 and that carry a
-- non-NULL is_holiday flag (presumably rows whose date matched a holiday
-- record in the left join that built the table — confirm).
SELECT *
FROM cscie103_catalog.final_project.gold_daily_store_family
WHERE is_holiday IS NOT NULL
  AND sales = 1
LIMIT 50;