This notebook is a sandbox for exploratory data analysis (EDA), done in preparation for the ERD and end-to-end (E2E) workflow specifications.
It will:
1. Create the schema and volume if they do not already exist.
2. Fetch the data from the [Kaggle competition](https://www.kaggle.com/competitions/store-sales-time-series-forecasting/overview).
3. Create one table per CSV file.

... (wip)

In [0]:
%sql
-- Ensure the cscie103_catalog.final_project schema exists, then the `data` volume inside it.
-- Both statements use IF NOT EXISTS, so an existing schema or volume is left as-is.
CREATE SCHEMA IF NOT EXISTS cscie103_catalog.final_project;
CREATE VOLUME IF NOT EXISTS cscie103_catalog.final_project.data;

In [0]:
%pip install kaggle

In [0]:
%py
# all imports here
import os
import json
from pathlib import Path
import subprocess
import zipfile
import requests

In [0]:
from databricks.sdk import WorkspaceClient
from databricks.sdk.service.workspace import CreateScope

# Copies the Kaggle credentials from the shared project scope into a dedicated
# "kaggle" scope. `w` and `SCOPE` are reused by later cells.
SCOPE = "kaggle"
w = WorkspaceClient()

# Make sure the destination scope exists; an "already exists" failure is expected
# on re-runs and is reported instead of raised.
try:
    w.secrets.create_scope(scope=SCOPE)
except Exception as e:
    if "already exists" not in str(e).lower():
        raise
    print(f"‚Ñπ Scope '{SCOPE}' already exists")
else:
    print(f"‚úì Created scope '{SCOPE}'")

# Read the source credentials from the Databricks secret scope
# "e-103-finalproject-credentials" (username first, then the API token).
_username, _token = (
    dbutils.secrets.get(scope="e-103-finalproject-credentials", key=source_key)
    for source_key in ("kaggle-username", "kaggle-api-token")
)
secrets_to_add = {"kaggle-username": _username, "kaggle-api-token": _token}

# Write each credential into the destination scope under the same key name.
for key, value in secrets_to_add.items():
    w.secrets.put_secret(scope=SCOPE, key=key, string_value=value)
    print(f"‚úì Added secret '{key}'")

# List what the scope now holds (key names and timestamps only, never values).
print(f"\nSecrets in '{SCOPE}':")
for secret in w.secrets.list_secrets(scope=SCOPE):
    print(f"  - {secret.key} (last updated: {secret.last_updated_timestamp})")

In [0]:
# Secrets are read back through dbutils (the SDK client above is only used for
# writing). Only the lengths are printed, not the values themselves.
username, api_key = (
    dbutils.secrets.get(scope=SCOPE, key=secret_key)
    for secret_key in ("kaggle-username", "kaggle-api-token")
)

print(f"‚úì Retrieved username (length: {len(username)})")
print(f"‚úì Retrieved API key (length: {len(api_key)})")

In [0]:
# Kaggle Data Download using Official Python API
#
# What this cell does, in order:
#   1. Defines the competition name and the Volume directory the files land in.
#   2. Reads the Kaggle credentials from Databricks secrets and writes them to
#      ~/.kaggle/kaggle.json for the Kaggle client.
#   3. Imports the Kaggle client (installing it if the import fails).
#   4. Downloads and unzips the competition archive unless train.csv is already there.
#   5. Lists the files in the target directory.
#   6. Loads train.csv with Spark as a sanity check; raises if it is missing.
#
# Names defined here (e.g. VOLUME_TARGET_DIR) are used by later cells.
import json
import os
import subprocess
import sys
import zipfile
from pathlib import Path

# ---- STEP 1: Configure paths ----
COMPETITION_NAME = "store-sales-time-series-forecasting"
VOLUME_ROOT_PATH = "/Volumes/cscie103_catalog/final_project/data"
VOLUME_TARGET_DIR = f"{VOLUME_ROOT_PATH}/raw"
DOWNLOAD_PATH = VOLUME_TARGET_DIR

# Secret scope and key names, kept in one place so the lookup below and the
# setup instructions printed on failure cannot drift apart.
KAGGLE_SECRET_SCOPE = "kaggle"
KAGGLE_USERNAME_KEY = "kaggle-username"
KAGGLE_TOKEN_KEY = "kaggle-api-token"

# Create directory if it doesn't exist
os.makedirs(DOWNLOAD_PATH, exist_ok=True)
print(f"‚úì Target directory: {DOWNLOAD_PATH}")

# ---- STEP 2: Set up Kaggle credentials ----
# Retrieve credentials from Databricks secrets
try:
    kaggle_username = dbutils.secrets.get(scope=KAGGLE_SECRET_SCOPE, key=KAGGLE_USERNAME_KEY)
    kaggle_token = dbutils.secrets.get(scope=KAGGLE_SECRET_SCOPE, key=KAGGLE_TOKEN_KEY)
    print("‚úì Successfully retrieved Kaggle credentials from secrets")
    print(f"  Username: {kaggle_username}")
    print(f"  Token length: {len(kaggle_token)} characters")
except Exception as e:
    print(f"‚úó Error retrieving secrets: {e}")
    print("\nTo set up secrets, run these commands in Databricks CLI:")
    # The key names printed here must be the ones this cell actually reads.
    print(f'  databricks secrets create-scope --scope {KAGGLE_SECRET_SCOPE}')
    print(f'  databricks secrets put --scope {KAGGLE_SECRET_SCOPE} --key {KAGGLE_USERNAME_KEY}')
    print(f'  databricks secrets put --scope {KAGGLE_SECRET_SCOPE} --key {KAGGLE_TOKEN_KEY}')
    raise

# Create kaggle.json configuration file
kaggle_dir = os.path.expanduser("~/.kaggle")
os.makedirs(kaggle_dir, exist_ok=True)

kaggle_config_path = os.path.join(kaggle_dir, "kaggle.json")
kaggle_config = {
    "username": kaggle_username,
    "key": kaggle_token
}

# Create the file as owner-only (0o600) from the start, so the token is never
# on disk with wider default permissions, not even briefly.
config_fd = os.open(kaggle_config_path, os.O_WRONLY | os.O_CREAT | os.O_TRUNC, 0o600)
with os.fdopen(config_fd, "w") as f:
    json.dump(kaggle_config, f)

# The mode passed to os.open only applies when the file is newly created;
# chmod also tightens a kaggle.json left over from an earlier run.
os.chmod(kaggle_config_path, 0o600)
print(f"‚úì Created Kaggle config at: {kaggle_config_path}")

# ---- STEP 3: Install and import Kaggle API ----
try:
    import kaggle
    from kaggle.api.kaggle_api_extended import KaggleApi
except ImportError:
    print("üì¶ Installing Kaggle package...")
    # Install with the interpreter running this notebook; a bare `pip` on PATH
    # may belong to a different Python environment.
    subprocess.run([sys.executable, "-m", "pip", "install", "kaggle", "-q"], check=True)
    import kaggle
    from kaggle.api.kaggle_api_extended import KaggleApi
    print("‚úì Kaggle package installed")
else:
    # getattr: do not fail the cell if this release exposes no __version__.
    print(f"‚úì Kaggle package version: {getattr(kaggle, '__version__', 'unknown')}")

# ---- STEP 4: Authenticate and download ----
# train.csv is used as the marker that a previous run already fetched the data.
if not os.path.exists(f"{VOLUME_TARGET_DIR}/train.csv"):
    print(f"\nüì• Downloading competition data: {COMPETITION_NAME}")
    print("‚ö†Ô∏è  NOTE: If this fails with 403, you must accept competition rules at:")
    print(f"   https://www.kaggle.com/competitions/{COMPETITION_NAME}/rules")
    print("   Click 'I Understand and Accept' button, then re-run this cell\n")

    try:
        # Initialize and authenticate API
        api = KaggleApi()
        api.authenticate()
        print("‚úì API authenticated successfully")

        # Download competition files
        print(f"üìÇ Downloading files to: {DOWNLOAD_PATH}")
        api.competition_download_files(
            competition=COMPETITION_NAME,
            path=DOWNLOAD_PATH,
            quiet=False
        )
        print("‚úì Download complete")

        # Unzip the downloaded file
        zip_file = f"{DOWNLOAD_PATH}/{COMPETITION_NAME}.zip"

        if os.path.exists(zip_file):
            print(f"\nüìÇ Extracting files from: {zip_file}")
            with zipfile.ZipFile(zip_file, "r") as zip_ref:
                file_list = zip_ref.namelist()
                zip_ref.extractall(DOWNLOAD_PATH)
                print(f"‚úì Extracted {len(file_list)} files:")
                for file in file_list:
                    # Size is read from the extracted file on disk, in MB.
                    file_size = os.path.getsize(f"{DOWNLOAD_PATH}/{file}") / (1024*1024)
                    print(f"  ‚Ä¢ {file} ({file_size:.2f} MB)")

            # Clean up ZIP file
            os.remove(zip_file)
            print("‚úì Cleaned up ZIP file")
        else:
            print(f"‚ö†Ô∏è  Warning: ZIP file not found at {zip_file}")
            # List downloaded files
            print("\nDownloaded files:")
            for file in os.listdir(DOWNLOAD_PATH):
                print(f"  ‚Ä¢ {file}")

    except Exception as e:
        error_msg = str(e)
        print(f"‚úó Error during download: {error_msg}")

        # Print a targeted hint for the two common failures, then re-raise so
        # the cell still fails visibly.
        if "403" in error_msg or "Forbidden" in error_msg:
            print("\n‚ùå ACCESS FORBIDDEN (403)")
            print("You must accept the competition rules first:")
            print(f"  1. Visit: https://www.kaggle.com/competitions/{COMPETITION_NAME}/rules")
            print("  2. Scroll down and click 'I Understand and Accept' button")
            print("  3. Re-run this cell")
        elif "401" in error_msg or "Unauthorized" in error_msg:
            print("\n‚ùå AUTHENTICATION FAILED (401)")
            print("Your credentials are incorrect or expired:")
            print("  1. Go to: https://www.kaggle.com/settings/account")
            print("  2. Scroll to 'API' section")
            print("  3. Click 'Create New Token' (downloads kaggle.json)")
            print("  4. Update your Databricks secrets with the new username and key")

        raise
else:
    print("‚ÑπÔ∏è  Skipped downloading because train.csv already exists.")

# ---- STEP 5: List all downloaded files ----
print("\nüìÅ Files in target directory:")
files = os.listdir(VOLUME_TARGET_DIR)
for file in sorted(files):
    file_path = os.path.join(VOLUME_TARGET_DIR, file)
    if os.path.isfile(file_path):
        file_size = os.path.getsize(file_path) / (1024*1024)
        print(f"  ‚Ä¢ {file} ({file_size:.2f} MB)")

# ---- STEP 6: Verify data ----
print("\nüîç Verifying train.csv...")
file_path = f"{VOLUME_TARGET_DIR}/train.csv"

if not os.path.exists(file_path):
    # Fail here: without train.csv the success message below would be false
    # and later cells that read the CSVs would break with a less clear error.
    print(f"‚úó train.csv not found at: {file_path}")
    raise FileNotFoundError(f"train.csv not found at: {file_path}")

try:
    df = (
        spark.read
        .option("header", "true")
        .option("inferSchema", "true")
        .csv(file_path)
    )

    row_count = df.count()
    col_count = len(df.columns)

    print("‚úì Data loaded into Spark DataFrame")
    print(f"  Rows: {row_count:,}")
    print(f"  Columns: {col_count}")
    print("\nüìã Schema:")
    df.printSchema()

    print("\nüìä Sample data (first 5 rows):")
    df.show(5, truncate=False)

except Exception as e:
    print(f"‚úó Error reading file from Volume: {e}")
    raise

print("\n‚úÖ All done! Data is ready for analysis.")

# Optional: Clean up credentials file for security
# os.remove(kaggle_config_path)
# print("‚úì Cleaned up Kaggle credentials")

In [0]:
# Logical dataset name -> CSV file name inside VOLUME_TARGET_DIR.
filenames = {
    'holidays_events': 'holidays_events.csv',
    'oil': 'oil.csv',
    'sample_submission': 'sample_submission.csv',
    'stores': 'stores.csv',
    'test': 'test.csv',
    'train': 'train.csv',
    'transactions': 'transactions.csv'
}


def read_raw_csv(dataset):
    """Load one competition CSV from VOLUME_TARGET_DIR as a Spark DataFrame.

    Args:
        dataset: Key into ``filenames`` (for example ``"train"``).

    Returns:
        The DataFrame read with the first row as header and inferred column types.

    Raises:
        KeyError: If ``dataset`` is not a key of ``filenames``. Indexing is used
            instead of ``.get`` so a mistyped name fails here rather than
            producing a ``.../None`` path.
    """
    return spark.read.csv(
        f"{VOLUME_TARGET_DIR}/{filenames[dataset]}",
        header=True,
        inferSchema=True,
    )


# sample_submission and test are listed in `filenames` but are not loaded here.
holidays_events_df = read_raw_csv('holidays_events')
oil_df = read_raw_csv('oil')
stores_df = read_raw_csv('stores')
transactions_df = read_raw_csv('transactions')
train_df = read_raw_csv('train')


In [0]:
# Cleanup: remove both Kaggle credentials from the scope.
# NOTE: this runs unconditionally whenever the cell is executed.
for key in ("kaggle-username", "kaggle-api-token"):
    w.secrets.delete_secret(scope=SCOPE, key=key)

# To drop the whole scope as well, uncomment the line below.
# w.secrets.delete_scope(scope=SCOPE)