In [0]:
!pip install -q mlflow lightgbm

In [0]:
# Restart the notebook's Python process; this runs right after the install
# cell and before any imports, so the freshly installed packages are the ones loaded.
dbutils.library.restartPython()

In [0]:
import pandas as pd
import numpy as np
import lightgbm as lgb
import mlflow
import mlflow.lightgbm
from sklearn.metrics import mean_squared_log_error
import matplotlib.pyplot as plt
%matplotlib inline

## Configuration

In [0]:
# Names shared by the tracking and registration cells below.
ex = "store_sales_experiment"   # MLflow experiment name
mdl = "mdl_store_sales"         # registered-model name

# The experiment path is the experiment name prefixed with "/".
mlflow.set_experiment(f"/{ex}")

## Load Pre-processed Data

Load the `silver_training` and `silver_test` tables, which already contain all necessary features.

In [0]:
# Select the project schema so the bare table names below resolve.
spark.sql("USE cscie103_catalog.final_project")

# Materialise both silver tables as pandas DataFrames.
tr = spark.table("silver_training").toPandas()
ts = spark.table("silver_test").toPandas()

# Dropping 'id' / 'transactions' is currently disabled.

# Parse the date column of each frame into pandas datetimes.
for frame in (tr, ts):
    frame["date"] = pd.to_datetime(frame["date"])

print(f"Training data shape: {tr.shape}")
print(f"Test data shape: {ts.shape}")
print(f"\nTraining columns: {list(tr.columns)}")

In [0]:
# Preview the first two rows of the training frame.
tr.head(2)

In [0]:
# Preview the first two rows of the test frame.
ts.head(2)

In [0]:
# Show which columns exist in only one of the two frames:
# (train-only columns, test-only columns).
train_cols, test_cols = set(tr.columns), set(ts.columns)
(train_cols - test_cols, test_cols - train_cols)

## Prepare Features

Select feature columns (excluding oil since it's not in the pre-processed tables).

In [0]:
# Columns fed to the model ("transactions" is currently excluded).
fc = [
    "strIndxer_family",
    "store_nbr",
    "strIndxer_city",
    "strIndxer_state",
    "strIndxer_type",
    "cluster",
    "is_holiday",
    "is_salary_day",
    "onpromotion",
    "day_of_week",
    "day_of_month",
    "month",
]

# Feature matrices for train/test and the float target.
X = tr.loc[:, fc]
Xt = ts.loc[:, fc]
y = tr["sales"].astype(float)

print(f"Feature columns: {fc}")
print(f"X shape: {X.shape}")
print(f"y shape: {y.shape}")

## Create Train/Validation Split

Split the training data using the last 28 days as validation.

In [0]:
# Rows dated strictly after `cut` (28 days before the latest date) form the
# validation set; everything on or before it is used for training.
cut = tr["date"].max() - pd.Timedelta(days=28)
m1 = tr["date"] <= cut
m2 = tr["date"] > cut

Xtr, ytr = X[m1], y[m1]
Xv, yv = X[m2], y[m2]

# The model is fitted on log1p(sales); predictions are mapped back with expm1.
ytrlog, yvlog = np.log1p(ytr), np.log1p(yv)

print(f"Training set: {Xtr.shape[0]} samples")
print(f"Validation set: {Xv.shape[0]} samples")
print(f"Validation split date: {cut}")

## Train Model with MLflow

Train LightGBM model with MLflow tracking.

In [0]:
# Enable LightGBM autologging before the run starts.
mlflow.lightgbm.autolog()

# Hyperparameters of the baseline regressor.
p = dict(
    n_estimators=1000,
    learning_rate=0.03,
    num_leaves=64,
    min_data_in_leaf=50,
    feature_fraction=0.8,
    bagging_fraction=0.8,
    bagging_freq=3,
    random_state=42,
)

with mlflow.start_run(run_name="lightgbm_baseline") as rr:
    # Fit on the log-transformed target, tracking RMSE on train and validation.
    mdl1 = lgb.LGBMRegressor(**p)
    mdl1.fit(
        Xtr,
        ytrlog,
        eval_set=[(Xtr, ytrlog), (Xv, yvlog)],
        eval_metric="rmse",
    )

    # Back-transform validation predictions to the sales scale; floor at zero.
    vp = np.clip(np.expm1(mdl1.predict(Xv)), 0, None)

    # RMSLE on the original (untransformed) validation target.
    sc = np.sqrt(mean_squared_log_error(yv, vp))
    mlflow.log_metric("rmsle", sc)
    print(f"RMSLE: {sc:.6f}")

    # Run-scoped model URI used by the registration cell
    # (assumes the model is logged under the "model" artifact path).
    rid = rr.info.run_id
    muri = f"runs:/{rid}/model"

## Register Model

In [0]:
# Best-effort registration: if it fails, rv stays None and the failure is printed.
rv = None
try:
    x = mlflow.register_model(muri, mdl)
    rv = x.version
    print(f"Model registered: version {rv}")
except Exception as e:
    print(f"No registry available: {e}")

# Load from the registry when a version was created, else from the run artifact.
loadu = f"models:/{mdl}/{rv}" if rv else muri

## Load and Verify Model

In [0]:
# Reload the model through the generic pyfunc interface and re-score validation.
m2load = mlflow.pyfunc.load_model(loadu)
vp2 = np.clip(np.expm1(m2load.predict(Xv)), 0, None)

# Compare the reloaded model's predictions with the in-memory model's.
mean_abs_diff = np.abs(vp - vp2).mean()
print(f"Mean absolute difference: {mean_abs_diff:.6f}")

## Save Validation Predictions

Store the validation predictions in the `silver_validation_predictions` table.

In [0]:
# Create validation predictions dataframe
val_preds = tr[m2][["date", "store_nbr", "family", "sales"]].copy()
val_preds["predicted_sales"] = vp
val_preds["residual"] = val_preds["sales"] - val_preds["predicted_sales"]

# Convert to Spark DataFrame and save
val_preds_spark = spark.createDataFrame(val_preds)
val_preds_spark.write.mode("overwrite").saveAsTable("silver_validation_predictions")

print(f"Validation predictions saved to silver_validation_predictions")
print(f"Records: {len(val_preds)}")
print(f"\nValidation RMSLE: {sc:.6f}")

## Validation Results Visualization

### Daily Predictions

In [0]:
# Validation rows plus the prediction column "p", then totals per day.
vv = tr.loc[m2].assign(p=vp)
d1 = vv.groupby("date", as_index=False)[["sales", "p"]].sum()

fig, ax = plt.subplots(figsize=(12, 5))
ax.plot(d1["date"], d1["sales"], label="Actual")
ax.plot(d1["date"], d1["p"], label="Predicted")
ax.set_title("Daily Sales: Actual vs Predicted")
ax.set_xlabel("Date")
ax.set_ylabel("Sales")
ax.legend()
ax.grid()
plt.show()

### Residuals

In [0]:
# Daily residual = actual total minus predicted total.
d1["r"] = d1["sales"] - d1["p"]

fig, ax = plt.subplots(figsize=(12, 4))
ax.plot(d1["date"], d1["r"])
ax.axhline(0, color="black", linestyle="--")  # zero-error reference line
ax.set_title("Residuals (Actual - Predicted)")
ax.set_xlabel("Date")
ax.set_ylabel("Residual")
ax.grid()
plt.show()

In [0]:
# Distribution of the daily residuals.
fig, ax = plt.subplots(figsize=(10, 4))
ax.hist(d1["r"], bins=50, edgecolor="black")
ax.set_title("Residuals Distribution")
ax.set_xlabel("Residual")
ax.set_ylabel("Frequency")
ax.grid(axis="y")
plt.show()

### Weekly Aggregation

In [0]:
# Label each row with the start timestamp of its weekly period, then total per week.
vv["wk"] = vv["date"].dt.to_period("W").dt.start_time
d2 = vv.groupby("wk", as_index=False)[["sales", "p"]].sum()

fig, ax = plt.subplots(figsize=(12, 4))
ax.plot(d2["wk"], d2["sales"], label="Actual")
ax.plot(d2["wk"], d2["p"], label="Predicted")
ax.set_title("Weekly Sales: Actual vs Predicted")
ax.set_xlabel("Week")
ax.set_ylabel("Sales")
ax.legend()
ax.grid()
plt.show()

### Top 6 Product Families

In [0]:
# The six families with the largest total validation sales.
top = vv.groupby("family")["sales"].sum().nlargest(6).index

# One actual-vs-predicted daily chart per family.
for fml in top:
    x = (
        vv.loc[vv["family"] == fml]
        .groupby("date", as_index=False)[["sales", "p"]]
        .sum()
    )
    fig, ax = plt.subplots(figsize=(10, 3))
    ax.plot(x["date"], x["sales"], label="Actual")
    ax.plot(x["date"], x["p"], label="Predicted")
    ax.set_title(f"Family: {fml}")
    ax.set_xlabel("Date")
    ax.set_ylabel("Sales")
    ax.legend()
    ax.grid()
    plt.show()

## Feature Importance

In [0]:
# Pair each feature name with the fitted model's importance, sorted ascending
# by importance for the horizontal bar chart.
imp = pd.DataFrame(
    {"feature": fc, "importance": mdl1.feature_importances_}
).sort_values("importance")

fig, ax = plt.subplots(figsize=(8, 6))
ax.barh(imp["feature"], imp["importance"])
ax.set_title("Feature Importance")
ax.set_xlabel("Importance")
ax.grid(axis="x")
fig.tight_layout()
plt.show()

## Generate Test Predictions

In [0]:
# Work on a private copy of the test features, in training column order.
Xt_aligned = Xt[fc].copy()

# Cast every column whose dtype differs from the training frame's.
dtype_fixes = {
    name: X[name].dtype
    for name in fc
    if X[name].dtype != Xt_aligned[name].dtype
}
Xt_aligned = Xt_aligned.astype(dtype_fixes)

# Score with the reloaded model; back-transform from log space and floor at zero.
tp = np.clip(np.expm1(m2load.predict(Xt_aligned)), 0, None)

# Key columns plus the prediction, tagged as the unmodified ("current") scenario.
test_preds = ts[["date", "store_nbr", "family"]].assign(
    predicted_sales=tp,
    scenario="current",
)

# CSV export (gold_store_family_day_predictions.csv) is currently disabled.

# Replace the predictions table, allowing the schema to be overwritten.
test_preds_spark = spark.createDataFrame(test_preds)
(
    test_preds_spark.write
    .mode("overwrite")
    .option("overwriteSchema", "true")
    .saveAsTable("silver_test_predictions")
)

print("Test predictions saved to:")
print("  - Table: silver_test_predictions")
print(f"Total predictions: {len(test_preds)}")
print("\nSample predictions:")
print(test_preds.head(10))

In [0]:
# What-if scenario: the same test features with "onpromotion" doubled.
Xt_scenario = Xt_aligned.assign(onpromotion=Xt_aligned["onpromotion"] * 2)


In [0]:
# Score the scenario features; back-transform from log space and floor at zero.
tp = np.clip(np.expm1(m2load.predict(Xt_scenario)), 0, None)


In [0]:
# Rebuild the predictions frame (key columns + prediction) from the current `tp`.
test_preds = ts[["date", "store_nbr", "family"]].assign(predicted_sales=tp)


In [0]:
from pyspark.sql.functions import lit

# Scenario label written with these rows.
scenario_name = "onpromotion_2x"

test_preds_spark = spark.createDataFrame(test_preds).withColumn(
    "scenario", lit(scenario_name)
)

# A plain append would duplicate every row of this scenario each time the cell
# is re-run. Overwriting with `replaceWhere` replaces only the rows carrying
# this scenario label and leaves other scenarios in the table untouched, so the
# cell is idempotent. (Requires the target to be a Delta table.)
(
    test_preds_spark.write
    .mode("overwrite")
    .option("replaceWhere", f"scenario = '{scenario_name}'")
    .saveAsTable("silver_test_predictions")
)


In [0]:
%sql
-- List the scenario labels present in the predictions table (at most two).
SELECT DISTINCT scenario
FROM silver_test_predictions
LIMIT 2;