In [0]:
%py
# Notebook-wide configuration constants.
# Presumably the Kaggle competition slug for the dataset -- TODO confirm against the download cell.
COMPETITION_NAME = "store-sales-time-series-forecasting"

# Root of the Unity Catalog Volume that holds this project's files.
VOLUME_ROOT_PATH = "/Volumes/cscie103_catalog/final_project/data"

# Sub-directory of the volume root used for the raw files.
VOLUME_TARGET_DIR = VOLUME_ROOT_PATH + "/raw"

# Alias: downloads land directly in the raw target directory.
DOWNLOAD_PATH = VOLUME_TARGET_DIR


## Building Out Silver Data

In [0]:
from pyspark.sql import functions as F

# Incrementally build the `silver_train` table from `bronze_train`.
#
# The stream is run as an incremental batch: it processes whatever is
# currently available in the Bronze table, records its progress in the
# checkpoint, and then stops. Re-running the cell only picks up new data.

# Resolve unqualified table names against the project catalog & schema.
spark.sql("USE cscie103_catalog.final_project")

# Checkpoint directory for this stream, stored inside the UC Volume.
checkpoint_path = "/Volumes/cscie103_catalog/final_project/data/checkpoints/silver_train"

# Read from Bronze as a streaming source.
# `skipChangeCommits` makes the Delta source ignore commits that update or
# delete existing rows, so only appended data flows downstream.
bronze_train_stream = (
    spark.readStream
        .option("skipChangeCommits", "true")
        .table("bronze_train")   # managed UC Delta table
)

# Apply cleaning / typing: normalise each column to its Silver type.
silver_train_stream = (
    bronze_train_stream
    .withColumn("date", F.to_date("date"))
    .withColumn("store_nbr", F.col("store_nbr").cast("int"))
    .withColumn("onpromotion", F.col("onpromotion").cast("int"))
    .withColumn("sales", F.col("sales").cast("double"))
    .withColumn("family", F.col("family").cast("string"))
)

# Write to the managed Delta table `silver_train`.
# `availableNow=True` replaces the deprecated `once=True` trigger: it also
# stops once all currently available data is processed, but may split the
# backlog into several micro-batches instead of forcing one oversized batch.
query = (
    silver_train_stream
    .writeStream
    .format("delta")
    .option("checkpointLocation", checkpoint_path)  # kept in a UC Volume, not the public DBFS root
    .trigger(availableNow=True)
    .toTable("silver_train")   # creates/updates cscie103_catalog.final_project.silver_train
)

# Block until the backlog is drained; a failed query raises here, so the
# success message below is only printed after a clean run.
query.awaitTermination()

print("✅ Silver table 'silver_train' created via streaming with trigger availableNow.")


In [0]:
from pyspark.sql import functions as F

# Batch-build the small Silver dimension/lookup tables from their Bronze
# counterparts. Each table is fully overwritten on every run.
spark.sql("USE cscie103_catalog.final_project")


def _cast(column_name, spark_type):
    """Return the column `column_name` cast to `spark_type`."""
    return F.col(column_name).cast(spark_type)


def _overwrite_delta_table(frame, table_name):
    """Overwrite the Delta table `table_name` with the rows of `frame`."""
    frame.write.format("delta").mode("overwrite").saveAsTable(table_name)


# ---- Silver STORES ----
# Integer-type the store key and the cluster id.
silver_stores = (
    spark.table("bronze_stores")
    .withColumn("store_nbr", _cast("store_nbr", "int"))
    .withColumn("cluster", _cast("cluster", "int"))
)
_overwrite_delta_table(silver_stores, "silver_stores")


# ---- Silver OIL ----
# Parse the date and make the oil price numeric.
silver_oil = (
    spark.table("bronze_oil")
    .withColumn("date", F.to_date(F.col("date")))
    .withColumn("dcoilwtico", _cast("dcoilwtico", "double"))
)
_overwrite_delta_table(silver_oil, "silver_oil")


# ---- Silver HOLIDAYS_EVENTS ----
# is_holiday is 1 for every row whose `type` is anything other than "Work Day".
# NOTE(review): a `transferred` column, if present in bronze_holidays_events,
# is not consulted here -- confirm that is intended.
silver_holidays = (
    spark.table("bronze_holidays_events")
    .withColumn("date", F.to_date(F.col("date")))
    .withColumn("is_holiday", (F.col("type") != F.lit("Work Day")).cast("int"))
)
_overwrite_delta_table(silver_holidays, "silver_holidays_events")


# ---- Silver TRANSACTIONS ----
# Parse the date; store key and transaction count become integers.
silver_transactions = (
    spark.table("bronze_transactions")
    .withColumn("date", F.to_date(F.col("date")))
    .withColumn("store_nbr", _cast("store_nbr", "int"))
    .withColumn("transactions", _cast("transactions", "int"))
)
_overwrite_delta_table(silver_transactions, "silver_transactions")


print("Silver tables created: silver_stores, silver_oil, silver_holidays_events, silver_transactions.")


In [0]:
# Show the entries under the checkpoints directory of the UC Volume.
checkpoint_entries = dbutils.fs.ls("/Volumes/cscie103_catalog/final_project/data/checkpoints")
display(checkpoint_entries)


In [0]:
%%sql
-- List all tables in the project schema (e.g. to check the silver_* tables written by earlier cells).
SHOW TABLES IN cscie103_catalog.final_project
