This notebook is a sandbox for exploratory data analysis in preparation for the ERD and end-to-end (E2E) workflow specifications.
It will:
1. Create schema & volume if needed.
2. Fetch data from [Kaggle competition](https://www.kaggle.com/competitions/store-sales-time-series-forecasting/overview).
3. Create one table per CSV file.

... (wip)

In [0]:
%sql
-- Ensure the cscie103_catalog.final_project schema and its `data` volume exist.
-- Both statements use IF NOT EXISTS, so they do nothing when the object is already present.
CREATE SCHEMA IF NOT EXISTS cscie103_catalog.final_project;
CREATE VOLUME IF NOT EXISTS cscie103_catalog.final_project.data;

In [0]:
%pip install kaggle

In [0]:
%py
# all imports here
import os
import json
from pathlib import Path
import subprocess
import zipfile

In [0]:
%py #this is the old code with the username as a secret key setup
# Kaggle authentication set up
kaggle_token = dbutils.secrets.get(scope="e-103-finalproject-credentials", key="kaggle-api-token")
kaggle_username = dbutils.secrets.get(scope="e-103-finalproject-credentials", key="kaggle-username")

kaggle_config = { "username": kaggle_username, "key": kaggle_token }
kaggle_config_dir = "/tmp/kaggle_config"
kaggle_file_path = Path(kaggle_config_dir) / "kaggle.json"

os.makedirs(kaggle_config_dir, exist_ok=True)
with open(kaggle_file_path, "w") as f:
    json.dump(kaggle_config, f)
os.chmod(kaggle_file_path, 0o600)

os.environ["KAGGLE_CONFIG_DIR"] = kaggle_config_dir
os.environ["KAGGLE_USERNAME"] = kaggle_username
os.environ["KAGGLE_KEY"] = kaggle_token

In [0]:
%py
import os, json
from pathlib import Path

# ---- USE YOUR VALUES HERE ----
kaggle_token = "KGAT_835aa679c6e8303c990aa2b1873a0c10"
kaggle_username = "kevinalviar"
# --------------------------------

# Build kaggle.json config
kaggle_config = { 
    "username": kaggle_username, 
    "key": kaggle_token 
}

kaggle_config_dir = "/tmp/kaggle_config"
kaggle_file_path = Path(kaggle_config_dir) / "kaggle.json"

# Create the directory and save the kaggle.json file
os.makedirs(kaggle_config_dir, exist_ok=True)
with open(kaggle_file_path, "w") as f:
    json.dump(kaggle_config, f)

# Secure file permissions
os.chmod(kaggle_file_path, 0o600)

# Export environment variables for kaggle CLI
os.environ["KAGGLE_CONFIG_DIR"] = kaggle_config_dir
os.environ["KAGGLE_USERNAME"] = kaggle_username
os.environ["KAGGLE_KEY"] = kaggle_token

print("Kaggle credentials configured successfully.")


In [0]:
# only after the Kaggle auth setup
import kaggle as kgl

In [0]:
%py
COMPETITION_NAME = "store-sales-time-series-forecasting"
VOLUME_ROOT_PATH = "/Volumes/cscie103_catalog/final_project/data"
VOLUME_TARGET_DIR = f"{VOLUME_ROOT_PATH}/raw"
DOWNLOAD_PATH = VOLUME_TARGET_DIR

# download data only if it does not exist in the VOLUME_TARGET_DIR, marker is train.csv file
if not os.path.exists(f"{VOLUME_TARGET_DIR}/train.csv"):
    print("Downloading data...")

    os.makedirs(VOLUME_TARGET_DIR, exist_ok=True)
    os.makedirs(DOWNLOAD_PATH, exist_ok=True)

    # kaggle competitions download -c store-sales-time-series-forecasting -p <path>
    command = [
        "kaggle", "competitions", "download", 
        "-c", COMPETITION_NAME, 
        "-p", DOWNLOAD_PATH
    ]

    # --- Download data ---
    try:
        result = subprocess.run(command, check=True, capture_output=True, text=True)
        print("Download successful.")
    except subprocess.CalledProcessError as e:
        print(f"Error during download: {e}")
        print(f"Stdout: {e.stdout}")
        print(f"Stderr: {e.stderr}")
        raise

    # --- Unzip ---
    zip_file_name = f"{DOWNLOAD_PATH}/{COMPETITION_NAME}.zip"

    try:
        with zipfile.ZipFile(zip_file_name, 'r') as zip_ref:
            zip_ref.extractall(DOWNLOAD_PATH)
            print(f"Extracted files from {zip_file_name}.")
    except FileNotFoundError:
        print(f"Error: ZIP file not found at {zip_file_name}. Download may have failed.")
        raise
else:
    print("Skipped downloading data because file-marker exists already (train.csv).")

file_path = f"{VOLUME_TARGET_DIR}/train.csv"
try:
    df = (spark.read
      .option("header", "true")
      .option("inferSchema", "true")
      .csv(file_path)
    )
    print("\nVerification: Data loaded into Spark DataFrame.")
    df.printSchema()
    # df.display()
except Exception as e:
    print(f"Error reading file from Volume: {e}. Check file path.")

In [0]:
%python # manually putting in kaggle key
import os
import requests
import zipfile
from pathlib import Path
from requests.auth import HTTPBasicAuth

# ---- USE YOUR REAL kaggle.json VALUES HERE ----
kaggle_username = "kevinalviar"             # from kaggle.json["username"]
kaggle_token    = "15446cb3990c4f8071488e98244d9010"    # from kaggle.json["key"]
# -----------------------------------------------

COMPETITION_NAME   = "store-sales-time-series-forecasting"
VOLUME_ROOT_PATH   = "/Volumes/cscie103_catalog/final_project/data"
VOLUME_TARGET_DIR  = f"{VOLUME_ROOT_PATH}/raw"
DOWNLOAD_PATH      = VOLUME_TARGET_DIR
ZIP_PATH           = f"{DOWNLOAD_PATH}/{COMPETITION_NAME}.zip"

os.makedirs(DOWNLOAD_PATH, exist_ok=True)

# Only download if train.csv not present
if not os.path.exists(f"{VOLUME_TARGET_DIR}/train.csv"):
    print("Downloading data directly from Kaggle API...")

    url = f"https://www.kaggle.com/api/v1/competitions/data/download-all/{COMPETITION_NAME}"

    resp = requests.get(
        url,
        auth=HTTPBasicAuth(kaggle_username, kaggle_token),
        stream=True,
    )

    if resp.status_code != 200:
        raise Exception(f"Failed to download data: HTTP {resp.status_code}\n{resp.text}")

    # Write ZIP
    with open(ZIP_PATH, "wb") as f:
        for chunk in resp.iter_content(chunk_size=8192):
            if chunk:
                f.write(chunk)

    print(f"Download complete: {ZIP_PATH}")

    # Extract ZIP
    with zipfile.ZipFile(ZIP_PATH, "r") as zip_ref:
        zip_ref.extractall(DOWNLOAD_PATH)

    print(f"Extracted files to: {DOWNLOAD_PATH}")

else:
    print("Skipped downloading because train.csv already exists.")

# Verify train.csv
file_path = f"{VOLUME_TARGET_DIR}/train.csv"
try:
    df = (
        spark.read
        .option("header", "true")
        .option("inferSchema", "true")
        .csv(file_path)
    )
    print("\nVerification: Data loaded into Spark DataFrame.")
    df.printSchema()
except Exception as e:
    print(f"Error reading file from Volume: {e}")


In [0]:
# Logical dataset name -> CSV file name inside VOLUME_TARGET_DIR.
filenames = {
    'holidays_events': 'holidays_events.csv',
    'oil': 'oil.csv',
    'sample_submission': 'sample_submission.csv',
    'stores': 'stores.csv',
    'test': 'test.csv',
    'train': 'train.csv',
    'transactions': 'transactions.csv'
}

# Keys are looked up with [] rather than .get(): a mistyped key raises KeyError here
# instead of silently building a ".../None" path that only fails inside Spark.
holidays_events_df = spark.read.csv(f"{VOLUME_TARGET_DIR}/{filenames['holidays_events']}", header=True, inferSchema=True)
oil_df = spark.read.csv(f"{VOLUME_TARGET_DIR}/{filenames['oil']}", header=True, inferSchema=True)
stores_df = spark.read.csv(f"{VOLUME_TARGET_DIR}/{filenames['stores']}", header=True, inferSchema=True)
transactions_df = spark.read.csv(f"{VOLUME_TARGET_DIR}/{filenames['transactions']}", header=True, inferSchema=True)
train_df = spark.read.csv(f"{VOLUME_TARGET_DIR}/{filenames['train']}", header=True, inferSchema=True)
# test_df is loaded too because the Bronze write step saves it as a table.
test_df = spark.read.csv(f"{VOLUME_TARGET_DIR}/{filenames['test']}", header=True, inferSchema=True)


/Volumes/cscie103_catalog/final_project/data/raw/
    train.csv
    test.csv
    stores.csv
    oil.csv
    holidays_events.csv
    transactions.csv
    sample_submission.csv


In [0]:
# Load the raw CSV files from the volume into Spark DataFrames.
VOLUME_ROOT_PATH  = "/Volumes/cscie103_catalog/final_project/data"
VOLUME_TARGET_DIR = f"{VOLUME_ROOT_PATH}/raw"

# Logical dataset name -> CSV file name inside VOLUME_TARGET_DIR.
filenames = dict(
    holidays_events="holidays_events.csv",
    oil="oil.csv",
    sample_submission="sample_submission.csv",
    stores="stores.csv",
    test="test.csv",
    train="train.csv",
    transactions="transactions.csv",
)


def _load_raw_csv(dataset):
    """Read one raw CSV (header row, inferred column types) by its key in `filenames`."""
    csv_path = f"{VOLUME_TARGET_DIR}/{filenames[dataset]}"
    return spark.read.csv(csv_path, header=True, inferSchema=True)


holidays_events_df = _load_raw_csv("holidays_events")
oil_df = _load_raw_csv("oil")
stores_df = _load_raw_csv("stores")
transactions_df = _load_raw_csv("transactions")
train_df = _load_raw_csv("train")
test_df = _load_raw_csv("test")

print("Loaded DataFrames.")


Next, we write the Bronze Delta tables, one per CSV file.

In [0]:
from pyspark.sql import SparkSession

# Catalog and schema that receive the Bronze tables.
catalog = "cscie103_catalog"
schema = "final_project"

spark.sql(f"USE {catalog}.{schema}")

print(f"Writing Bronze tables into {catalog}.{schema} ...")


def _write_bronze(source_df, table_name):
    """Save `source_df` as a Delta table named `table_name`, replacing existing contents."""
    source_df.write.format("delta").mode("overwrite").saveAsTable(table_name)


# One Bronze table per loaded CSV DataFrame.
_write_bronze(train_df, "bronze_train")
_write_bronze(test_df, "bronze_test")
_write_bronze(stores_df, "bronze_stores")
_write_bronze(oil_df, "bronze_oil")
_write_bronze(holidays_events_df, "bronze_holidays_events")
_write_bronze(transactions_df, "bronze_transactions")

print("✅ Bronze Delta tables created successfully as managed UC tables!")


Next, we build the Silver tables from the Bronze tables.

In [0]:
from pyspark.sql import functions as F

# Make sure we're in the right catalog & schema
spark.sql("USE cscie103_catalog.final_project")

# Checkpoint location for this stream, inside the project volume
checkpoint_path = "/Volumes/cscie103_catalog/final_project/data/checkpoints/silver_train"

# Read from Bronze as a streaming source.
# NOTE(review): bronze_train is written with mode("overwrite") elsewhere in this
# notebook; a Delta streaming source may reject such non-append commits when the
# stream is re-run against the same checkpoint -- confirm re-run behaviour.
bronze_train_stream = (
    spark.readStream
         .table("bronze_train")
)

# Apply cleaning / typing: parse the date and cast each column to its target type
silver_train_stream = (
    bronze_train_stream
    .withColumn("date", F.to_date("date"))
    .withColumn("store_nbr", F.col("store_nbr").cast("int"))
    .withColumn("onpromotion", F.col("onpromotion").cast("int"))
    .withColumn("sales", F.col("sales").cast("double"))
    .withColumn("family", F.col("family").cast("string"))
)

# Write to the silver_train table. availableNow=True processes everything that is
# currently available and then stops; it replaces the deprecated once=True trigger.
query = (
    silver_train_stream
    .writeStream
    .format("delta")
    .option("checkpointLocation", checkpoint_path)
    .trigger(availableNow=True)
    .toTable("silver_train")
)

# Block until the stream has drained the available data and stopped.
query.awaitTermination()

print("✅ Silver table 'silver_train' created via streaming with trigger availableNow.")


In [0]:
from pyspark.sql import functions as F

spark.sql("USE cscie103_catalog.final_project")

# ---- Silver STORES: integer store number and cluster ----
silver_stores = (
    spark.table("bronze_stores")
    .withColumn("store_nbr", F.col("store_nbr").astype("int"))
    .withColumn("cluster", F.col("cluster").astype("int"))
)
silver_stores.write.saveAsTable("silver_stores", format="delta", mode="overwrite")


# ---- Silver OIL: parsed date, price as double ----
silver_oil = (
    spark.table("bronze_oil")
    .withColumn("date", F.to_date(F.col("date")))
    .withColumn("dcoilwtico", F.col("dcoilwtico").astype("double"))
)
silver_oil.write.saveAsTable("silver_oil", format="delta", mode="overwrite")


# ---- Silver HOLIDAYS_EVENTS: parsed date plus a 0/1 is_holiday flag ----
# The flag is 1 for every row whose type is anything other than "Work Day".
silver_holidays = (
    spark.table("bronze_holidays_events")
    .withColumn("date", F.to_date(F.col("date")))
    .withColumn("is_holiday", (F.col("type") != "Work Day").astype("int"))
)
silver_holidays.write.saveAsTable("silver_holidays_events", format="delta", mode="overwrite")


# ---- Silver TRANSACTIONS: parsed date, integer store number and count ----
silver_transactions = (
    spark.table("bronze_transactions")
    .withColumn("date", F.to_date(F.col("date")))
    .withColumn("store_nbr", F.col("store_nbr").astype("int"))
    .withColumn("transactions", F.col("transactions").astype("int"))
)
silver_transactions.write.saveAsTable("silver_transactions", format="delta", mode="overwrite")


print("Silver tables created: silver_stores, silver_oil, silver_holidays_events, silver_transactions.")


In [0]:
# Show the contents of the checkpoints directory in the project volume.
checkpoints_dir = "/Volumes/cscie103_catalog/final_project/data/checkpoints"
display(dbutils.fs.ls(checkpoints_dir))


In [0]:
%sql
-- List the tables in the project schema (same %sql magic as the other SQL cells).
SHOW TABLES IN cscie103_catalog.final_project


Building Out Gold Data

In [0]:
from pyspark.sql import functions as F

# Use the right catalog & schema
spark.sql("USE cscie103_catalog.final_project")

# Load Silver tables
silver_train  = spark.table("silver_train")
silver_stores = spark.table("silver_stores")
silver_oil    = spark.table("silver_oil")
silver_hol    = spark.table("silver_holidays_events")
silver_tx     = spark.table("silver_transactions")

# Collapse holidays to one row per date before joining. Joining the raw holidays
# table on `date` alone would repeat every fact row once per holiday entry on that
# date and break the one-row-per-date-store-family grain. max() makes the flag 1
# when any entry for the date has is_holiday = 1.
holidays_by_date = (
    silver_hol
    .groupBy("date")
    .agg(F.max("is_holiday").alias("is_holiday"))
)

# Enriched base fact: one row per date-store-family.
# All joins are left joins, so every silver_train row is kept; unmatched lookups
# yield NULLs (e.g. is_holiday is NULL for dates with no holidays_events entry).
# NOTE(review): assumes stores is unique per store_nbr, oil per date and
# transactions per (date, store_nbr) -- confirm against the data.
base_fact = (
    silver_train.alias("t")
    .join(silver_stores.alias("s"), "store_nbr", "left")
    .join(silver_tx.alias("x"), ["date", "store_nbr"], "left")
    .join(silver_oil.alias("o"), "date", "left")
    .join(holidays_by_date.alias("h"), "date", "left")
    .select(
        F.col("t.date").alias("date"),
        F.col("t.store_nbr").alias("store_nbr"),
        F.col("t.family").alias("family"),
        F.col("t.sales").alias("sales"),
        F.col("t.onpromotion").alias("onpromotion"),
        F.col("x.transactions").alias("transactions"),
        F.col("o.dcoilwtico").alias("dcoilwtico"),
        F.col("s.city").alias("city"),
        F.col("s.state").alias("state"),
        F.col("s.type").alias("store_type"),
        F.col("s.cluster").alias("cluster"),
        F.col("h.is_holiday").alias("is_holiday")
    )
)

# Write the Gold fact table, replacing any existing contents
base_fact.write.format("delta").mode("overwrite").saveAsTable("gold_daily_store_family")

print("✅ Gold table 'gold_daily_store_family' created.")


In [0]:
# Preview the Gold fact table.
gold_daily_store_family = spark.table("cscie103_catalog.final_project.gold_daily_store_family")
display(gold_daily_store_family)


In [0]:
%sql
-- Up to 50 Gold rows with sales equal to 1 and a populated is_holiday flag.
SELECT
  *
FROM
  cscie103_catalog.final_project.gold_daily_store_family
WHERE
  is_holiday IS NOT NULL
  AND sales = 1
LIMIT 50;