# MLflow Model Training Pipeline

**Objective:** Train a LightGBM model to forecast store sales with MLflow experiment tracking

**Pipeline:**
1. Load data from Silver layer
2. Feature engineering
3. Model training with validation
4. Experiment tracking & model registration
5. Performance visualization
6. Generate predictions for Gold layer

**Model:** LightGBM Regressor  
**Target:** `sales`, modeled on a `log1p`-transformed scale  
**Metric:** RMSLE (Root Mean Squared Logarithmic Error)

In [0]:
# Install dependencies
%pip install -q mlflow lightgbm

# Imports
import pandas as pd
import numpy as np
import lightgbm as lgb
import mlflow
import mlflow.lightgbm
from sklearn.metrics import mean_squared_log_error
import matplotlib.pyplot as plt
%matplotlib inline

print("✓ Libraries imported")

In [0]:
# MLflow setup: the experiment path is built from the current user's name.
MODEL_NAME = "store_sales_lgbm"

# Ask Spark for the session user and read the "user" field of the first row.
user_rows = spark.sql("SELECT current_user() as user").collect()
current_user = user_rows[0]["user"]
EXPERIMENT_NAME = f"/Users/{current_user}/store_sales_forecast"

mlflow.set_experiment(EXPERIMENT_NAME)
for banner in (
    f"✓ MLflow experiment: {EXPERIMENT_NAME}",
    f"✓ Model name: {MODEL_NAME}",
):
    print(banner)

## 1. Load Data from Silver Layer

In [0]:
# Pull every Silver table into pandas so feature engineering can run locally.
spark.sql("USE cscie103_catalog.final_project")

print("Loading data from Silver layer...")
train_df, test_df, oil_df, holidays_df, stores_df, transactions_df = (
    spark.table(table_name).toPandas()
    for table_name in (
        "silver_train",
        "silver_test",
        "silver_oil",
        "silver_holidays",
        "silver_stores",
        "silver_transactions",
    )
)

# Row counts, reported in the same order the tables were loaded.
for label, frame in (
    ("Train", train_df),
    ("Test", test_df),
    ("Oil", oil_df),
    ("Holidays", holidays_df),
    ("Stores", stores_df),
    ("Transactions", transactions_df),
):
    print(f"{label}: {len(frame):,} rows")

## 2. Feature Engineering

In [0]:
# Parse the "date" column of every dated table into pandas datetimes (in place).
print("Converting date columns...")
dated_frames = (train_df, test_df, oil_df, holidays_df, transactions_df)
for frame in dated_frames:
    frame["date"] = pd.to_datetime(frame["date"])

print("Date conversion complete")

In [0]:
# Calendar features derived from the date: weekday number and a weekend flag.
print("Creating temporal features...")

weekend_codes = [5, 6]  # pandas dayofweek: Monday=0 ... Sunday=6
for frame in (train_df, test_df):
    weekday = frame["date"].dt.dayofweek
    frame["day_of_week"] = weekday
    frame["is_weekend"] = weekday.isin(weekend_codes).astype(int)

print("✓ Temporal features created")

In [0]:
# Oil price feature: one value per calendar day, with gaps filled from neighbouring days.
print("Processing oil prices...")

# Every calendar day from the first training date through the last test date.
range_start = train_df["date"].min()
range_end = test_df["date"].max()
full_date_range = pd.date_range(range_start, range_end, freq="D")

# Align the oil series to that calendar; days without a row become NaN.
oil_df = oil_df.set_index("date").reindex(full_date_range)
oil_df.index.name = "date"

# Carry the last known price forward, back-fill any leading gap, then fall
# back to the median of the filled series for anything still missing.
filled_prices = oil_df["dcoilwtico"].ffill().bfill()
oil_df["dcoilwtico"] = filled_prices.fillna(filled_prices.median())

# Restore "date" as a regular column and give the price a clearer name.
oil_df = oil_df.reset_index().rename(columns={"dcoilwtico": "oil_price"})

print(f"✓ Oil prices processed, missing values filled")

In [0]:
# Holiday flag: 1 for every calendar entry whose type is anything other than "Work Day".
print("Creating holiday features...")

# NOTE(review): only the "type" column is inspected here; any other columns of
# the holiday table (if present) are ignored — confirm that is intended.
not_work_day = holidays_df["type"].ne("Work Day")
holidays_df["is_holiday"] = not_work_day.astype(int)

# One row per date; max() keeps the flag set if any entry on that date is flagged.
daily_flag = holidays_df.groupby("date")["is_holiday"].max()
holidays_agg = daily_flag.reset_index()

print(f"✓ Holiday features created")

In [0]:
# Salary-day flag: marks the 15th and the 30th of each month.
print("Creating salary day indicator...")

def is_salary_day(date):
    """Flag dates that fall on the 15th or the 30th of a month.

    This pipeline treats those two days as salary days.

    Args:
        date: Any object exposing a ``day`` attribute (e.g. a pandas Timestamp).

    Returns:
        1 when ``date.day`` is 15 or 30, otherwise 0.
    """
    # February never has a 30th, so it only ever gets one flagged day.
    day_of_month = date.day
    if day_of_month == 15 or day_of_month == 30:
        return 1
    return 0

# Apply the salary-day flag row by row to both the training and test frames.
for frame in (train_df, test_df):
    frame["is_salary_day"] = frame["date"].map(is_salary_day)

print("✓ Salary day indicator created")

In [0]:
# Earthquake flag: 1 for dates inside a symmetric window around 2016-04-16.
print("Creating earthquake impact feature...")

EARTHQUAKE_DATE = pd.to_datetime("2016-04-16")
EARTHQUAKE_WINDOW_DAYS = 15

# Both window edges are inclusive.
half_window = pd.Timedelta(EARTHQUAKE_WINDOW_DAYS, "D")
window_start = EARTHQUAKE_DATE - half_window
window_end = EARTHQUAKE_DATE + half_window

for frame in (train_df, test_df):
    in_window = frame["date"].between(window_start, window_end)
    frame["earthquake_impact"] = in_window.astype(int)

print(f"✓ Earthquake impact window: ±{EARTHQUAKE_WINDOW_DAYS} days from {EARTHQUAKE_DATE.date()}")

In [0]:
# Attach the date-level external features (oil price, holiday flag) to both frames.
print("Merging external features...")


def _attach_external_features(frame):
    """Left-join the daily oil price and holiday flag onto ``frame`` by date."""
    enriched = frame.merge(oil_df, on="date", how="left")
    enriched = enriched.merge(holidays_agg, on="date", how="left")
    # Dates with no row in the holiday table are treated as non-holidays.
    enriched["is_holiday"] = enriched["is_holiday"].fillna(0).astype(int)
    return enriched


train_df = _attach_external_features(train_df)
test_df = _attach_external_features(test_df)

print("✓ External features merged")

In [0]:
# Store attributes: replace each categorical value with an integer code, then join by store.
print("Encoding store features...")

categorical_cols = ["city", "state", "type", "cluster"]

for col in categorical_cols:
    labels = stores_df[col].astype(str)
    # Codes are assigned in order of first appearance in the stores table.
    distinct_labels = labels.unique()
    code_of = dict(zip(distinct_labels, range(len(distinct_labels))))
    stores_df[col] = labels.map(code_of).astype(int)
    print(f"  • {col}: {len(distinct_labels)} unique values encoded")

# Bring the encoded store attributes onto every train/test row.
train_df, test_df = (
    frame.merge(stores_df, on="store_nbr", how="left")
    for frame in (train_df, test_df)
)

print("✓ Store features encoded and merged")

In [0]:
# Daily transaction counts, joined per store and date.
print("Merging transaction data...")

transaction_keys = ["store_nbr", "date"]
train_df = train_df.merge(transactions_df, on=transaction_keys, how="left")
test_df = test_df.merge(transactions_df, on=transaction_keys, how="left")

# Store/date pairs with no transaction row get a count of 0.
# NOTE(review): if the transactions table does not cover the test dates, every
# test row receives 0 here while training rows carry real counts — confirm.
for frame in (train_df, test_df):
    frame["transactions"] = frame["transactions"].fillna(0)

print("✓ Transaction data merged")

In [0]:
# Zero sales streak feature: for each store/family series, the number of
# consecutive zero-sales days up to and including the current row.
print("Calculating zero sales streak...")

# Order rows so each store/family series is contiguous and chronological.
train_df = train_df.sort_values(["store_nbr", "family", "date"])
train_df["is_zero_sales"] = (train_df["sales"] == 0).astype(int)

# Vectorised run-length count (replaces a per-row Python loop over iterrows(),
# which is very slow on a frame of this size). A run restarts on the first row
# of every store/family series and on every row that has sales; numbering
# those restart points gives each run its own id.
store_changed = train_df["store_nbr"].ne(train_df["store_nbr"].shift())
family_changed = train_df["family"].ne(train_df["family"].shift())
had_sales = train_df["is_zero_sales"].ne(1)
run_id = (store_changed | family_changed | had_sales).cumsum()

# Within a run, the running total of the zero flag is the streak length:
# a row with sales contributes 0, each following zero-sales row adds 1.
train_df["zero_sales_streak"] = (
    train_df["is_zero_sales"].groupby(run_id.to_numpy()).cumsum()
)

# Store closed indicator (14+ days of zero sales)
# NOTE(review): the streak for a row includes that row's own sales value, so
# store_closed == 1 implies sales == 0 on the same row. If store_closed is used
# as a model feature this leaks the target in training, while test rows are
# fixed at 0 below — confirm this is intended.
train_df["store_closed"] = (train_df["zero_sales_streak"] >= 14).astype(int)
test_df["store_closed"] = 0  # Assume stores open for test period

print(f"✓ Zero sales streaks calculated")
print(f"  • Max streak: {train_df['zero_sales_streak'].max()} days")
print(f"  • Closed store instances: {train_df['store_closed'].sum():,}")

In [0]:
# Product family: one integer code per family, shared by train and test.
print("Encoding product families...")

for frame in (train_df, test_df):
    frame["family"] = frame["family"].astype(str)

# Build the code table from the union of both frames so they agree on every code.
combined_families = pd.concat([train_df["family"], test_df["family"]])
all_families = combined_families.unique()
family_encoding = dict(zip(all_families, range(len(all_families))))

for frame in (train_df, test_df):
    frame["family_encoded"] = frame["family"].map(family_encoding)

print(f"✓ {len(all_families)} product families encoded")

## 3. Prepare Training Data

In [0]:
# Columns fed to the model, in the order they appear in the feature matrix.
FEATURE_COLS = [
    # product and store identity
    "family_encoded",
    "store_nbr",
    "city",
    "state",
    "type",
    "cluster",
    # external and calendar signals
    "oil_price",
    "is_holiday",
    "is_salary_day",
    "earthquake_impact",
    # store activity
    "transactions",
    "store_closed",
    # day-of-week signals
    "day_of_week",
    "is_weekend",
]

print(f"Selected features ({len(FEATURE_COLS)}):")
print("\n".join(f"  {rank:2d}. {name}" for rank, name in enumerate(FEATURE_COLS, 1)))

In [0]:
# Slice the engineered frames into model inputs and the raw sales target.
X_train_full = train_df.loc[:, FEATURE_COLS]
X_test = test_df.loc[:, FEATURE_COLS]
y_train_full = train_df["sales"].astype(float)

print(f"✓ Features prepared")
print(f"  • Training samples: {len(X_train_full):,}")
print(f"  • Test samples: {len(X_test):,}")

In [0]:
# Time-based hold-out: the most recent VALIDATION_DAYS days form the validation set.
VALIDATION_DAYS = 28

latest_train_date = train_df["date"].max()
validation_cutoff_date = latest_train_date - pd.Timedelta(VALIDATION_DAYS, "D")

# Rows on or before the cutoff train the model; rows after it validate it.
val_mask = train_df["date"] > validation_cutoff_date
train_mask = train_df["date"] <= validation_cutoff_date

X_train, y_train = X_train_full[train_mask], y_train_full[train_mask]
X_val, y_val = X_train_full[val_mask], y_train_full[val_mask]

# Fit on log1p(sales); log1p maps a sales value of 0 to 0.
y_train_log, y_val_log = np.log1p(y_train), np.log1p(y_val)

print(f"✓ Train/validation split:")
print(f"  • Training: {len(X_train):,} samples (up to {validation_cutoff_date.date()})")
print(f"  • Validation: {len(X_val):,} samples (last {VALIDATION_DAYS} days)")
print(f"  • Log transformation applied to target variable")

## 4. Model Training with MLflow

In [0]:
# Turn on LightGBM autologging before fitting inside an MLflow run.
mlflow.lightgbm.autolog()

print("Starting model training with MLflow tracking...")

with mlflow.start_run(run_name="lgbm_baseline") as run:
    # Settings for the baseline LightGBM regressor.
    params = dict(
        n_estimators=1000,
        learning_rate=0.03,
        num_leaves=64,
        min_data_in_leaf=50,
        feature_fraction=0.8,
        bagging_fraction=0.8,
        bagging_freq=3,
        random_state=42,
    )

    print("Hyperparameters:")
    for name in params:
        print(f"  • {name}: {params[name]}")

    # Fit on the log-scale target, evaluating RMSE on both splits. No early
    # stopping is configured, so the validation set is used for monitoring only.
    model = lgb.LGBMRegressor(**params)
    eval_sets = [(X_train, y_train_log), (X_val, y_val_log)]
    model.fit(X_train, y_train_log, eval_set=eval_sets, eval_metric="rmse")

    # Map predictions back to the sales scale and floor them at zero.
    val_pred_log = model.predict(X_val)
    val_pred = np.clip(np.expm1(val_pred_log), 0, None)

    # RMSLE on the original sales scale, logged as an extra metric on the run.
    msle = mean_squared_log_error(y_val, val_pred)
    rmsle = np.sqrt(msle)
    mlflow.log_metric("rmsle", rmsle)

    print(f"\n✓ Model trained successfully")
    print(f"  • RMSLE: {rmsle:.4f}")

    # Assumes the fitted model was logged under the "model" artifact path of
    # this run — TODO confirm against the MLflow version in use.
    run_id = run.info.run_id
    model_uri = f"runs:/{run_id}/model"
    print(f"  • Run ID: {run_id}")

## 5. Model Registration

In [0]:
# Register the run's model, then reload it to check it reproduces the validation predictions.
print("Registering model...")

model_version = None
load_uri = model_uri  # fallback: load directly from the run if registration fails
try:
    registered_model = mlflow.register_model(model_uri, MODEL_NAME)
    model_version = registered_model.version
    print(f"✓ Model registered: {MODEL_NAME} v{model_version}")
    load_uri = f"models:/{MODEL_NAME}/{model_version}"
except Exception as err:
    # Best effort: report the failure and keep the run URI chosen above.
    print(f"⚠ Model registry not available: {err}")
    print("  Using run URI instead")

# Round-trip check: predictions from the reloaded model vs. the in-memory model.
loaded_model = mlflow.pyfunc.load_model(load_uri)
reloaded_log_pred = loaded_model.predict(X_val)
val_pred_reloaded = np.expm1(reloaded_log_pred).clip(0, None)

absolute_gap = np.abs(val_pred - val_pred_reloaded)
prediction_diff = absolute_gap.mean()
print(f"✓ Model loaded and verified")
print(f"  • Average prediction difference: {prediction_diff:.6f}")

## 6. Model Evaluation & Visualization

In [0]:
# Validation rows with the model's prediction attached.
val_results = train_df.loc[val_mask].copy()
val_results["predicted_sales"] = val_pred

# Totals per day across all stores and families.
daily_totals = val_results.groupby("date")[["sales", "predicted_sales"]].sum()
daily_results = daily_totals.reset_index()

val_start = daily_results["date"].min()
val_end = daily_results["date"].max()
print(f"✓ Validation period: {val_start.date()} to {val_end.date()}")
print(f"  • Total actual sales: ${daily_results['sales'].sum():,.0f}")
print(f"  • Total predicted sales: ${daily_results['predicted_sales'].sum():,.0f}")

In [0]:
# Chart 1: total daily sales over the validation window, actual vs. predicted.
fig, ax = plt.subplots(figsize=(14, 5))
dates = daily_results["date"]
ax.plot(dates, daily_results["sales"], label="Actual Sales", linewidth=2)
ax.plot(dates, daily_results["predicted_sales"], label="Predicted Sales", linewidth=2, alpha=0.8)
ax.set_title("Daily Sales: Actual vs Predicted (Validation Period)", fontsize=14)
ax.set_xlabel("Date")
ax.set_ylabel("Total Sales ($)")
ax.legend()
ax.grid(alpha=0.3)
fig.tight_layout()
plt.show()

In [0]:
# Chart 2: daily residuals (actual minus predicted) plus summary statistics.
residual = daily_results["sales"] - daily_results["predicted_sales"]
daily_results["residual"] = residual

fig, ax = plt.subplots(figsize=(14, 4))
ax.plot(daily_results["date"], daily_results["residual"], linewidth=2, color="orange")
ax.axhline(0, color="black", linestyle="--", linewidth=1)  # zero-error reference line
ax.set_title("Prediction Residuals (Actual - Predicted)", fontsize=14)
ax.set_xlabel("Date")
ax.set_ylabel("Residual ($)")
ax.grid(alpha=0.3)
fig.tight_layout()
plt.show()

print(f"Residual statistics:")
for label, stat in (
    ("Mean", daily_results["residual"].mean()),
    ("Std", daily_results["residual"].std()),
    ("Min", daily_results["residual"].min()),
    ("Max", daily_results["residual"].max()),
):
    print(f"  • {label}: ${stat:,.2f}")

In [0]:
# Chart 3: histogram of the daily residuals with a marker at zero error.
fig, ax = plt.subplots(figsize=(10, 5))
ax.hist(daily_results["residual"], bins=50, edgecolor="black", alpha=0.7)
ax.axvline(0, color="red", linestyle="--", linewidth=2, label="Zero Error")
ax.set_title("Distribution of Prediction Residuals", fontsize=14)
ax.set_xlabel("Residual ($)")
ax.set_ylabel("Frequency")
ax.legend()
ax.grid(alpha=0.3)
fig.tight_layout()
plt.show()

In [0]:
# Chart 4: sales summed per weekly period, labelled by each period's start time.
weekly_period = val_results["date"].dt.to_period("W")
val_results["week"] = weekly_period.dt.start_time
weekly_totals = val_results.groupby("week")[["sales", "predicted_sales"]].sum()
weekly_results = weekly_totals.reset_index()

fig, ax = plt.subplots(figsize=(12, 4))
weeks = weekly_results["week"]
ax.plot(weeks, weekly_results["sales"], marker="o", label="Actual", linewidth=2)
ax.plot(weeks, weekly_results["predicted_sales"], marker="s", label="Predicted", linewidth=2)
ax.set_title("Weekly Sales: Actual vs Predicted", fontsize=14)
ax.set_xlabel("Week")
ax.set_ylabel("Total Sales ($)")
ax.legend()
ax.grid(alpha=0.3)
fig.tight_layout()
plt.show()

In [0]:
# Chart 5: one actual-vs-predicted chart for each of the six best-selling families.
sales_by_family = val_results.groupby("family")["sales"].sum()
top_families = sales_by_family.nlargest(6).index

print(f"Top {len(top_families)} product families by sales:")
for rank, family in enumerate(top_families, 1):
    is_family = val_results["family"] == family
    family_sales = val_results.loc[is_family, "sales"].sum()
    print(f"  {rank}. {family}: ${family_sales:,.0f}")

for family in top_families:
    # Daily totals for this family only.
    family_data = val_results.loc[val_results["family"] == family].copy()
    family_totals = family_data.groupby("date")[["sales", "predicted_sales"]].sum()
    family_daily = family_totals.reset_index()

    fig, ax = plt.subplots(figsize=(12, 3))
    ax.plot(family_daily["date"], family_daily["sales"], label="Actual", linewidth=2)
    ax.plot(family_daily["date"], family_daily["predicted_sales"], label="Predicted", linewidth=2, alpha=0.8)
    ax.set_title(f"Product Family: {family}", fontsize=12)
    ax.set_xlabel("Date")
    ax.set_ylabel("Sales ($)")
    ax.legend()
    ax.grid(alpha=0.3)
    fig.tight_layout()
    plt.show()

In [0]:
# Chart 6: the model's feature importances, sorted ascending so the largest bar is drawn last.
importance_table = pd.DataFrame(
    {"feature": FEATURE_COLS, "importance": model.feature_importances_}
)
feature_importance = importance_table.sort_values("importance", ascending=True)

fig, ax = plt.subplots(figsize=(10, 6))
ax.barh(feature_importance["feature"], feature_importance["importance"])
ax.set_title("Feature Importance", fontsize=14)
ax.set_xlabel("Importance Score")
ax.set_ylabel("Feature")
ax.grid(axis="x", alpha=0.3)
fig.tight_layout()
plt.show()

print("\nTop 5 most important features:")
# The table is ascending, so the last five rows reversed give the top five, highest first.
top_five = feature_importance.tail(5).iloc[::-1]
for feature_name, score in zip(top_five["feature"], top_five["importance"]):
    print(f"  {feature_name}: {score:.1f}")

## 7. Generate Test Predictions

In [0]:
# Score the test set with the reloaded model and map predictions back to the sales scale.
print("Generating test set predictions...")

test_pred_log = loaded_model.predict(X_test)
test_pred = np.expm1(test_pred_log).clip(0, None)  # undo log1p, floor at zero

# Identifier columns plus the prediction, one row per test row.
id_columns = ["date", "store_nbr", "family"]
predictions_df = test_df[id_columns].copy()
predictions_df["predicted_sales"] = test_pred

first_pred_date = predictions_df["date"].min()
last_pred_date = predictions_df["date"].max()
print(f"✓ Test predictions generated")
print(f"  • Total predictions: {len(predictions_df):,}")
print(f"  • Date range: {first_pred_date.date()} to {last_pred_date.date()}")
print(f"  • Predicted total sales: ${test_pred.sum():,.0f}")

In [0]:
# Persist the predictions as a Gold table (replacing any existing contents), then print a summary.
print("Saving predictions to Gold layer...")

predictions_spark = spark.createDataFrame(predictions_df)
gold_writer = predictions_spark.write.mode("overwrite")
gold_writer.saveAsTable("cscie103_catalog.final_project.gold_predictions")

print("✓ Predictions saved to gold_predictions table")

rule = "=" * 50
print("\n" + rule)
print("TRAINING PIPELINE COMPLETE")
print(rule)
for summary_line in (
    f"✓ Model: {MODEL_NAME}",
    f"✓ RMSLE: {rmsle:.4f}",
    f"✓ Predictions: gold_predictions table",
):
    print(summary_line)