This notebook is a sandbox for exploratory data analysis (EDA), in preparation for the ERD and end-to-end (E2E) workflow specifications.
It will:
1. Create the schema and volume if they do not already exist.
2. Fetch the data from the [Kaggle competition](https://www.kaggle.com/competitions/store-sales-time-series-forecasting/overview).
3. Create a table for each CSV file.

... (wip)

In [0]:
%sql
-- Idempotent setup: creates the cscie103_catalog.final_project schema and its
-- `data` volume only if they do not exist yet, so this cell is safe to re-run.
-- The volume backs the /Volumes/cscie103_catalog/final_project/data path used
-- by the download cell further down.
CREATE SCHEMA IF NOT EXISTS cscie103_catalog.final_project;
CREATE VOLUME IF NOT EXISTS cscie103_catalog.final_project.data;

In [0]:
%pip install kaggle

[43mNote: you may need to restart the kernel using %restart_python or dbutils.library.restartPython() to use updated packages.[0m


In [0]:
%py
# all imports here
import os
import json
from pathlib import Path
import subprocess
import zipfile

In [0]:
%py
# Kaggle authentication set up
# Credentials are read from the secret scope so they never appear in the notebook source.
kaggle_token = dbutils.secrets.get(scope="e-103-finalproject-credentials", key="kaggle-api-token")
kaggle_username = dbutils.secrets.get(scope="e-103-finalproject-credentials", key="kaggle-username")

kaggle_config = { "username": kaggle_username, "key": kaggle_token }
kaggle_config_dir = "/tmp/kaggle_config"
kaggle_file_path = Path(kaggle_config_dir) / "kaggle.json"

# Keep the config directory private to the current user when it is first created.
os.makedirs(kaggle_config_dir, mode=0o700, exist_ok=True)

# Create kaggle.json already restricted to 0o600. Writing with a plain open()
# and chmod-ing afterwards would leave the API key readable by other users for
# a short window.
with os.fdopen(
    os.open(kaggle_file_path, os.O_WRONLY | os.O_CREAT | os.O_TRUNC, 0o600), "w"
) as f:
    json.dump(kaggle_config, f)
# os.open() applies the mode only when it creates the file, so enforce it as
# well for a kaggle.json left over from an earlier run.
os.chmod(kaggle_file_path, 0o600)

# Expose the same credentials through the environment variables set below, in
# addition to the kaggle.json file.
os.environ["KAGGLE_CONFIG_DIR"] = kaggle_config_dir
os.environ["KAGGLE_USERNAME"] = kaggle_username
os.environ["KAGGLE_KEY"] = kaggle_token

In [0]:
# only after the Kaggle auth setup
import kaggle as kgl

In [0]:
%py
# Kaggle competition slug, passed to the CLI via `-c`.
COMPETITION_NAME = "store-sales-time-series-forecasting"

# Mount path of the cscie103_catalog.final_project.data volume.
VOLUME_ROOT_PATH = "/Volumes/cscie103_catalog/final_project/data"

# Raw CSV files live in a `raw` sub-directory of the volume; the archive is
# downloaded and extracted into that same directory.
VOLUME_TARGET_DIR = VOLUME_ROOT_PATH + "/raw"
DOWNLOAD_PATH = VOLUME_TARGET_DIR

# Download and extract the competition archive only when the data is not in
# VOLUME_TARGET_DIR yet. The presence of train.csv is used as the marker, so a
# re-run of the notebook skips the download.
if not os.path.exists(f"{VOLUME_TARGET_DIR}/train.csv"):
    print("Downloading data...")

    os.makedirs(VOLUME_TARGET_DIR, exist_ok=True)
    os.makedirs(DOWNLOAD_PATH, exist_ok=True)

    # kaggle competitions download -c store-sales-time-series-forecasting -p <path>
    command = [
        "kaggle", "competitions", "download",
        "-c", COMPETITION_NAME,
        "-p", DOWNLOAD_PATH
    ]

    # --- Download data ---
    # check=True turns a non-zero exit code into CalledProcessError; the
    # captured stdout/stderr are only needed on failure, so the completed
    # process object is not kept.
    try:
        subprocess.run(command, check=True, capture_output=True, text=True)
        print("Download successful.")
    except subprocess.CalledProcessError as e:
        print(f"Error during download: {e}")
        print(f"Stdout: {e.stdout}")
        print(f"Stderr: {e.stderr}")
        raise

    # --- Unzip ---
    zip_file_name = f"{DOWNLOAD_PATH}/{COMPETITION_NAME}.zip"

    try:
        with zipfile.ZipFile(zip_file_name, 'r') as zip_ref:
            zip_ref.extractall(DOWNLOAD_PATH)
            print(f"Extracted files from {zip_file_name}.")
    except FileNotFoundError:
        print(f"Error: ZIP file not found at {zip_file_name}. Download may have failed.")
        raise
    except zipfile.BadZipFile:
        # A truncated or corrupt archive would otherwise fail with no hint
        # about which file is at fault.
        print(f"Error: {zip_file_name} is not a valid ZIP file. Download may be incomplete.")
        raise
else:
    print("Skipped downloading data because file-marker exists already (train.csv).")

# Sanity check: read train.csv from the volume into a Spark DataFrame and
# print the inferred schema.
file_path = f"{VOLUME_TARGET_DIR}/train.csv"
try:
    df = (spark.read
      .option("header", "true")
      .option("inferSchema", "true")
      .csv(file_path)
    )
    print("\nVerification: Data loaded into Spark DataFrame.")
    df.printSchema()
except Exception as e:
    print(f"Error reading file from Volume: {e}. Check file path.")
    # Re-raise, like the download/unzip handlers in this cell: swallowing the
    # error would leave `df` undefined and let a "Run All" continue past it.
    raise

Skipped downloading data because file-marker exists already (train.csv).

Verification: Data loaded into Spark DataFrame.
root
 |-- id: integer (nullable = true)
 |-- date: date (nullable = true)
 |-- store_nbr: integer (nullable = true)
 |-- family: string (nullable = true)
 |-- sales: double (nullable = true)
 |-- onpromotion: integer (nullable = true)



In [0]:
# Logical dataset name -> CSV file name inside VOLUME_TARGET_DIR.
filenames = {
    'holidays_events': 'holidays_events.csv',
    'oil': 'oil.csv',
    'sample_submission': 'sample_submission.csv',
    'stores': 'stores.csv',
    'test': 'test.csv',
    'train': 'train.csv',
    'transactions': 'transactions.csv'
}


def read_raw_csv(dataset_name):
    """Read one raw CSV from VOLUME_TARGET_DIR into a Spark DataFrame.

    Args:
        dataset_name: A key of `filenames` (e.g. 'oil', 'train').

    Returns:
        The DataFrame produced by `spark.read.csv` with `header=True` and
        `inferSchema=True`.

    Raises:
        KeyError: If `dataset_name` is not a key of `filenames`. Indexing is
            used instead of `.get()` so a typo fails here rather than
            producing a path that ends in "/None".
    """
    csv_path = f"{VOLUME_TARGET_DIR}/{filenames[dataset_name]}"
    return spark.read.csv(csv_path, header=True, inferSchema=True)


# NOTE(review): 'sample_submission' and 'test' are listed in `filenames` but
# are not loaded here — confirm whether that is intentional.
holidays_events_df = read_raw_csv('holidays_events')
oil_df = read_raw_csv('oil')
stores_df = read_raw_csv('stores')
transactions_df = read_raw_csv('transactions')
train_df = read_raw_csv('train')
