In [1]:
# Cell 1: Setup and Data Loading
import sys
import os
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings("ignore")

current_dir = os.getcwd()
if current_dir.endswith('notebooks'):
    src_path = os.path.abspath(os.path.join(current_dir, '..', 'src'))
    data_dir = '../data'
else:
    src_path = os.path.abspath(os.path.join(current_dir, 'src'))
    data_dir = 'data'

if src_path not in sys.path:
    sys.path.append(src_path)

from features import FeatureEngineer
from models import ForecastingModels
from optimization import InventoryOptimizer

print("Loading and preparing data...")
df = pd.read_csv(f'{data_dir}/sales.csv', parse_dates=['date']).merge(
    pd.read_csv(f'{data_dir}/products.csv'), on='product_id', how='left').merge(
    pd.read_csv(f'{data_dir}/stores.csv'), on='store_id', how='left').merge(
    pd.read_csv(f'{data_dir}/promotions.csv', parse_dates=['date']), on='date', how='left')

df['actual_sales'] = df['sales_quantity']
df.loc[df['is_stockout'] == 1, 'actual_sales'] = np.nan
df['imputed_sales'] = df.groupby(['store_id', 'product_id'])['actual_sales'].transform(lambda x: x.interpolate(method='linear', limit_direction='both')).bfill().fillna(0)

fe = FeatureEngineer()
df_featured = fe.run_pipeline(df)
print(f"Data ready. Shape: {df_featured.shape}")

Loading and preparing data...
Data ready. Shape: (68875, 37)


In [4]:
# Cell 2: Walk Forward Cross Validation & Historical Tracking
import os

modeler = ForecastingModels()

# We will use 3 expanding windows to simulate real-world weekly/monthly retraining
splits = modeler.walk_forward_split(df_featured, n_splits=3)
historical_results = []

print("ðŸš€ Starting Walk-Forward Cross-Validation...")

for fold, (train_df, test_df) in enumerate(splits):
    print(f"\n--- Fold {fold + 1} ---")
    print(f"Training up to: {train_df['date'].max().date()} | Testing: {test_df['date'].min().date()} to {test_df['date'].max().date()}")
    
    # 1. Train & Predict XGBoost
    modeler.train_xgboost(train_df)
    
    # ---> MLOps: Save Model Artifacts (Saves the final fold's model) <---
    if fold == len(splits) - 1:
        modeler.save_model_artifacts(version="v1")
        print("ðŸ’¾ MLOps: XGBoost model saved to disk.")
        
    xgb_preds, xgb_lower, xgb_upper = modeler.predict_xgboost(test_df)
    test_df['xgb_forecast'] = xgb_preds
    
    # 2. Train & Predict Statistical Baseline (Routed by Segment)
    es_preds = []
    unique_pairs = test_df[['store_id', 'product_id', 'demand_type']].drop_duplicates()
    
    for _, row in unique_pairs.iterrows():
        s_id, p_id, d_type = row['store_id'], row['product_id'], row['demand_type']
        
        train_series = train_df[(train_df['store_id'] == s_id) & (train_df['product_id'] == p_id)].set_index('date')['imputed_sales']
        test_len = len(test_df[(test_df['store_id'] == s_id) & (test_df['product_id'] == p_id)])
        
        # This now explicitly uses Croston's for Intermittent, and ES for fast moving
        preds = modeler.train_predict_statistical(train_series, test_len, d_type)
        
        temp_df = pd.DataFrame({
            'date': test_df[(test_df['store_id'] == s_id) & (test_df['product_id'] == p_id)]['date'],
            'store_id': s_id, 
            'product_id': p_id, 
            'stat_forecast': preds
        })
        es_preds.append(temp_df)

    stat_results = pd.concat(es_preds)
    test_df = test_df.merge(stat_results, on=['date', 'store_id', 'product_id'], how='left')
    
    # 3. Create the Ensemble explicitly using the class method
    test_df['ensemble_forecast'] = modeler.generate_ensemble(
        test_df['xgb_forecast'].values, 
        test_df['stat_forecast'].values
    )
    
    metrics = modeler.calculate_metrics(test_df['imputed_sales'], test_df['ensemble_forecast'])
    print(f"Fold {fold + 1} Ensemble Metrics: {metrics}")
    
    # Save results to build a historical track record
    historical_results.append(test_df[['date', 'store_id', 'product_id', 'demand_type', 'imputed_sales', 'ensemble_forecast']])

# Export historical performance for the dashboard
df_history = pd.concat(historical_results)
data_dir = '../data' if os.path.exists('../data') else 'data'
df_history.to_csv(f'{data_dir}/historical_performance.csv', index=False)
print(f"\nâœ… Historical performance saved to {data_dir}/historical_performance.csv")

2026-02-22 12:38:10,135 - INFO - Training Global XGBoost Model...


ðŸš€ Starting Walk-Forward Cross-Validation...

--- Fold 1 ---
Training up to: 2025-09-30 | Testing: 2025-10-01 to 2025-10-31


  self._init_dates(dates, freq)
  return get_prediction_index(
  self._init_dates(dates, freq)
  return get_prediction_index(
  self._init_dates(dates, freq)
  return get_prediction_index(
  self._init_dates(dates, freq)
  return get_prediction_index(
  self._init_dates(dates, freq)
  return get_prediction_index(
  self._init_dates(dates, freq)
  return get_prediction_index(
  self._init_dates(dates, freq)
  return get_prediction_index(
  self._init_dates(dates, freq)
  return get_prediction_index(
  self._init_dates(dates, freq)
  return get_prediction_index(
  self._init_dates(dates, freq)
  return get_prediction_index(
  self._init_dates(dates, freq)
  return get_prediction_index(
  self._init_dates(dates, freq)
  return get_prediction_index(
  self._init_dates(dates, freq)
  return get_prediction_index(
  self._init_dates(dates, freq)
  return get_prediction_index(
  self._init_dates(dates, freq)
  return get_prediction_index(
  self._init_dates(dates, freq)
  return get_prediction

Fold 1 Ensemble Metrics: {'MAE': 6.9, 'RMSE': np.float64(14.86), 'MAPE (%)': np.float64(105196.21)}

--- Fold 2 ---
Training up to: 2025-10-30 | Testing: 2025-10-31 to 2025-11-30


  self._init_dates(dates, freq)
  return get_prediction_index(
  self._init_dates(dates, freq)
  return get_prediction_index(
  self._init_dates(dates, freq)
  return get_prediction_index(
  self._init_dates(dates, freq)
  return get_prediction_index(
  self._init_dates(dates, freq)
  return get_prediction_index(
  self._init_dates(dates, freq)
  return get_prediction_index(
  self._init_dates(dates, freq)
  return get_prediction_index(
  self._init_dates(dates, freq)
  return get_prediction_index(
  self._init_dates(dates, freq)
  return get_prediction_index(
  self._init_dates(dates, freq)
  return get_prediction_index(
  self._init_dates(dates, freq)
  return get_prediction_index(
  self._init_dates(dates, freq)
  return get_prediction_index(
  self._init_dates(dates, freq)
  return get_prediction_index(
  self._init_dates(dates, freq)
  return get_prediction_index(
  self._init_dates(dates, freq)
  return get_prediction_index(
  self._init_dates(dates, freq)
  return get_prediction

Fold 2 Ensemble Metrics: {'MAE': 5.53, 'RMSE': np.float64(11.53), 'MAPE (%)': np.float64(118437.86)}

--- Fold 3 ---
Training up to: 2025-11-29 | Testing: 2025-11-30 to 2025-12-30


TypeError: `_estimator_type` undefined.  Please use appropriate mixin to define estimator type.

In [3]:
# Cell 3: Baseline (Exp. Smoothing) and Ensemble
print("Training Statistical Baseline (Exponential Smoothing) per series...")

# Create a dataframe to store baseline predictions
es_preds = []

# We need to loop through each unique store-product combination
unique_pairs = test_df[['store_id', 'product_id']].drop_duplicates()

for _, row in unique_pairs.iterrows():
    s_id, p_id = row['store_id'], row['product_id']
    
    # Get the specific time series for this item
    train_series = train_df[(train_df['store_id'] == s_id) & (train_df['product_id'] == p_id)].set_index('date')['imputed_sales']
    test_len = len(test_df[(test_df['store_id'] == s_id) & (test_df['product_id'] == p_id)])
    
    # Train and Forecast with ES
    try:
        preds = modeler.train_predict_expsmoothing(train_series, test_len)
    except:
        # Fallback to simple moving average if ES fails
        preds = np.full(test_len, train_series.mean())
        
    # Store results
    temp_df = test_df[(test_df['store_id'] == s_id) & (test_df['product_id'] == p_id)].copy()
    temp_df['es_forecast'] = preds
    es_preds.append(temp_df)

# Combine back into the main test dataframe
es_results = pd.concat(es_preds)
test_df = test_df.merge(es_results[['date', 'store_id', 'product_id', 'es_forecast']], 
                        on=['date', 'store_id', 'product_id'], how='left')

# --- THIS IS THE UPDATED PART ---
# Create Ensemble explicitly using the new class method
test_df['ensemble_forecast'] = modeler.generate_ensemble(
    test_df['xgb_forecast'].values, 
    test_df['es_forecast'].values
)

print("Baseline and Ensemble forecasts generated.")

Training Statistical Baseline (Exponential Smoothing) per series...


  self._init_dates(dates, freq)
  return get_prediction_index(
  self._init_dates(dates, freq)
  return get_prediction_index(
  self._init_dates(dates, freq)
  return get_prediction_index(
  self._init_dates(dates, freq)
  return get_prediction_index(
  self._init_dates(dates, freq)
  return get_prediction_index(
  self._init_dates(dates, freq)
  return get_prediction_index(
  self._init_dates(dates, freq)
  return get_prediction_index(
  self._init_dates(dates, freq)
  return get_prediction_index(
  self._init_dates(dates, freq)
  return get_prediction_index(
  self._init_dates(dates, freq)
  return get_prediction_index(
  self._init_dates(dates, freq)
  return get_prediction_index(
  self._init_dates(dates, freq)
  return get_prediction_index(
  self._init_dates(dates, freq)
  return get_prediction_index(
  self._init_dates(dates, freq)
  return get_prediction_index(
  self._init_dates(dates, freq)
  return get_prediction_index(
  self._init_dates(dates, freq)
  return get_prediction

Baseline and Ensemble forecasts generated.
