In [0]:
%restart_python 


In [0]:
# Install the third-party packages this notebook relies on via a shell call to pip.
# NOTE(review): no versions are pinned, so a re-run may pick up different releases — consider pinning.
# NOTE(review): no import of synapseml is visible in this notebook — confirm it is still needed.
# NOTE(review): the `%restart_python` cell runs before these installs; if the restart is
# meant to make the freshly installed packages importable it would have to run after — confirm.
!pip install -q mlflow lightgbm
!pip install synapseml


# Databricks setup
from pyspark.sql import SparkSession
import pandas as pd
import numpy as np
import mlflow
import mlflow.lightgbm
import lightgbm as lgb
from sklearn.metrics import mean_squared_log_error
import matplotlib.pyplot as plt
%matplotlib inline


In [0]:
# Make cscie103_catalog.final_project the current namespace so the cells below
# can refer to tables such as silver_training without qualifying them.
spark.sql("USE cscie103_catalog.final_project")


In [0]:
%sql
-- Sample two rows from silver_training to inspect the available columns.
select * from silver_training LIMIT 2;

In [0]:
# Track runs under the shared workspace experiment for this project.
mlflow.set_experiment("/Shared/store_sales_experiment")

# Resolve unqualified table names against the project catalog + schema.
spark.sql("USE cscie103_catalog.final_project")

# ---------------------------------------------------------
# Read the whole training table into a pandas DataFrame
# ---------------------------------------------------------
df = (
    spark.table("silver_training")
    .toPandas()
)

def make_features(df):
    """Encode a sales frame for LightGBM and list its feature columns.

    The input frame is left untouched; all changes are made on a copy:

    * ``date`` is parsed with ``pd.to_datetime``.
    * ``state``, ``store_nbr``, ``family``, ``city``, ``type``, ``cluster``
      and ``is_holiday`` are replaced by their integer pandas category codes.
    * ``date_ordinal`` (the proleptic Gregorian ordinal of ``date``) is
      appended as the last column.

    Note: category codes are derived from the values present in *this*
    frame, so two frames only share an encoding when each categorical column
    holds the same set of distinct values in both.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``date`` and every categorical column listed above.

    Returns
    -------
    tuple[pandas.DataFrame, list[str]]
        The encoded copy, and the names of its columns (in frame order)
        excluding ``sales``, ``date``, ``id``, ``transactions`` and the
        ``hash_storeNbr_family_city_state_type_cluster`` column.
    """
    categorical = ("state", "store_nbr", "family", "city", "type", "cluster", "is_holiday")

    # Target, raw date, and identifier-like columns never reach the model.
    non_features = {
        "sales",
        "date",
        "hash_storeNbr_family_city_state_type_cluster",
        "transactions",
        "id",
    }

    # assign() returns a new frame, so the caller's data is not mutated.
    encoded = df.assign(date=pd.to_datetime(df["date"]))
    encoded = encoded.assign(
        **{name: encoded[name].astype("category").cat.codes for name in categorical}
    )

    # Numeric stand-in for the date so the model can use it as a feature.
    encoded["date_ordinal"] = encoded["date"].map(pd.Timestamp.toordinal)

    feature_cols = [name for name in encoded.columns if name not in non_features]
    return encoded, feature_cols

# Encode the training frame. `df` is deliberately rebound to the encoded copy:
# the validation-split cell reads the parsed `date` column from it.
df, feature_cols = make_features(df)

# Feature matrix and the raw (untransformed) sales target.
X = df[feature_cols]
y = df["sales"].astype(float)


In [0]:
# Display the feature matrix as the cell output to eyeball the encoded columns.
X

In [0]:
# ---------------------------------------------------------
# Hold out the most recent 28 days as the validation window
# ---------------------------------------------------------
cutoff = df["date"].max() - pd.Timedelta(days=28)

# Rows on or before the cutoff train the model; later rows validate it.
mask_train = df["date"].le(cutoff)
mask_valid = df["date"].gt(cutoff)

X_train, y_train = X.loc[mask_train], y.loc[mask_train]
X_valid, y_valid = X.loc[mask_valid], y.loc[mask_valid]

# The model is fitted on log1p(sales); predictions are mapped back with expm1.
y_train_log, y_valid_log = np.log1p(y_train), np.log1p(y_valid)


In [0]:
# ---------------------------------------------------------
# Fit a LightGBM regressor on log1p(sales), tracked by MLflow
# ---------------------------------------------------------
mlflow.lightgbm.autolog()

# Fixed hyperparameters for this run; random_state keeps the fit repeatable.
params = dict(
    n_estimators=400,
    learning_rate=0.03,
    num_leaves=64,
    min_child_samples=50,
    feature_fraction=0.8,
    bagging_fraction=0.8,
    bagging_freq=3,
    random_state=42,
)

with mlflow.start_run(run_name="lgbm-pandas-training"):
    model = lgb.LGBMRegressor(**params)

    # RMSE is evaluated in log space on both the training and validation sets.
    model.fit(
        X_train,
        y_train_log,
        eval_set=[(X_train, y_train_log), (X_valid, y_valid_log)],
        eval_metric="rmse",
    )

    # Map predictions back to the sales scale; clip negatives to zero so the
    # log-based metric below is defined.
    val_pred = np.clip(np.expm1(model.predict(X_valid)), 0, None)

    rmsle = np.sqrt(mean_squared_log_error(y_valid, val_pred))
    mlflow.log_metric("rmsle", rmsle)
    print("Validation RMSLE:", rmsle)

    # NOTE(review): assumes autologging stored the model under the run's
    # "model" artifact path — confirm for the installed MLflow version.
    run_id = mlflow.active_run().info.run_id
    model_uri = f"runs:/{run_id}/model"

print("Model saved to:", model_uri)

## Create dataframe for future predictions

In [0]:
# Last date present in the training table (type depends on the Spark column:
# date, timestamp or string).
last_date = (
    spark.table("silver_training")
    .selectExpr("max(date) as max_date")
    .collect()[0][0]
)

# Daily range starting the day after the last training date.
# pd.Timestamp(...) normalizes whatever Spark returned, so the date
# arithmetic does not depend on the column's type.
future_dates = pd.date_range(
    start=pd.Timestamp(last_date) + pd.Timedelta(days=1),
    periods=90,  # forecast horizon in days; change as needed
    freq="D",
)

# Distinct attribute combinations seen in training.
# NOTE(review): onpromotion and is_holiday are part of the de-duplicated key,
# so this yields one row per distinct (store attributes, onpromotion,
# is_holiday) tuple rather than one row per store/family — confirm intended.
base = (
    spark.table("silver_training")
    .select("state", "store_nbr", "family", "onpromotion", "city", "type", "cluster", "is_holiday")
    .dropDuplicates()
    .toPandas()
)

# Pair every base row with every future date (Cartesian product).
future_custom = base.merge(pd.DataFrame({"date": future_dates}), how="cross")


In [0]:
# Display the future frame (attribute combinations x future dates) as the cell output.
future_custom

In [0]:
%%sql
-- List the tables visible in the current schema.
show tables

In [0]:
# Encode the future frame. The column list returned here follows the future
# frame's own column order, so it is not used for prediction, and the training
# `feature_cols` is intentionally left untouched.
# NOTE(review): make_features derives category codes from the values present
# in each frame; they match the training codes only if the future frame holds
# the same distinct values per categorical column — confirm.
future_custom_fe, _future_feature_cols = make_features(future_custom)

# Feed the model exactly the columns it was fitted on, in the same order.
# LightGBM's sklearn predict() does not validate DataFrame column names by
# default, so a different order would silently produce wrong predictions.
train_feature_cols = list(X_train.columns)
missing_cols = [c for c in train_feature_cols if c not in future_custom_fe.columns]
if missing_cols:
    raise ValueError(f"Future frame is missing training features: {missing_cols}")

# Predictions are in log1p space; map back to sales and clip negatives to zero.
future_custom_fe["prediction"] = np.expm1(
    model.predict(future_custom_fe[train_feature_cols])
).clip(0, None)


In [0]:
# Display the encoded future frame together with its `prediction` column.
future_custom_fe


In [0]:
%sql
-- Latest date present in bronze_holidays_events.
select max(date) from bronze_holidays_events;