In [1]:
# Cell 1: Setup and Data Loading
import sys
import os
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings("ignore")

# --- Locate the project ------------------------------------------------------
# The notebook can be launched from the repository root or from `notebooks/`;
# choose the matching locations for the `src` package folder and the data folder.
current_dir = os.getcwd()
if current_dir.endswith('notebooks'):
    src_path = os.path.abspath(os.path.join(current_dir, '..', 'src'))
    data_dir = '../data'
else:
    src_path = os.path.abspath(os.path.join(current_dir, 'src'))
    data_dir = 'data'

# Guarded so that re-running the cell does not append the same entry twice.
if src_path not in sys.path:
    sys.path.append(src_path)

# Project-local modules; these imports only resolve after the sys.path tweak above.
from features import FeatureEngineer
from models import ForecastingModels
from optimization import InventoryOptimizer

# --- Load and join the raw tables --------------------------------------------
print("Loading and preparing data...")
sales = pd.read_csv(f'{data_dir}/sales.csv', parse_dates=['date'])
products = pd.read_csv(f'{data_dir}/products.csv')
stores = pd.read_csv(f'{data_dir}/stores.csv')
promotions = pd.read_csv(f'{data_dir}/promotions.csv', parse_dates=['date'])

# Left joins keep every sales row even when a lookup table has no match.
# NOTE(review): promotions are joined on `date` alone. If promotions.csv can
# contain more than one row per date, this join multiplies sales rows —
# confirm the file is one row per date.
df = (
    sales
    .merge(products, on='product_id', how='left')
    .merge(stores, on='store_id', how='left')
    .merge(promotions, on='date', how='left')
)

# --- Impute sales on stockout days -------------------------------------------
# Sales recorded on a stockout day are blanked out and then re-estimated.
# `mask` yields NaN where the condition holds and upcasts the dtype itself,
# so no NaN is written into a possibly integer column.
df['actual_sales'] = df['sales_quantity'].mask(df['is_stockout'] == 1)

# Interpolate inside each (store, product) series; `limit_direction='both'`
# also fills gaps at the start and end of a series.
# Whatever is still missing afterwards (e.g. a series without a single
# non-stockout observation) is set to 0. The fallback must not be a frame-wide
# `bfill`: that runs across series boundaries and copies values from whichever
# store/product happens to come next in row order.
df['imputed_sales'] = (
    df.groupby(['store_id', 'product_id'])['actual_sales']
    .transform(lambda series: series.interpolate(method='linear', limit_direction='both'))
    .fillna(0)
)

# --- Feature engineering -----------------------------------------------------
fe = FeatureEngineer()
df_featured = fe.run_pipeline(df)
print(f"Data ready. Shape: {df_featured.shape}")

Loading and preparing data...
Data ready. Shape: (68875, 37)


In [2]:
# Cell 2: Walk-Forward Cross Validation & Historical Tracking
# For every fold: fit XGBoost and a per-series statistical model on the training
# window, average the two forecasts, score the ensemble on the test window, and
# keep the test-window rows so the dashboard can show a historical track record.
modeler = ForecastingModels()

# We will use 3 expanding windows to simulate real-world weekly/monthly retraining
splits = modeler.walk_forward_split(df_featured, n_splits=3)
historical_results = []
pair_keys = ['store_id', 'product_id']

print("üöÄ Starting Walk-Forward Cross-Validation...")

for fold, (train_df, test_df) in enumerate(splits):
    # Columns are added to the test window below; work on a private copy so the
    # frame handed out by the splitter is never modified (and no assignment
    # lands on a slice of `df_featured`).
    test_df = test_df.copy()

    print(f"\n--- Fold {fold + 1} ---")
    print(f"Training up to: {train_df['date'].max().date()} | Testing: {test_df['date'].min().date()} to {test_df['date'].max().date()}")

    # 1. Train & Predict XGBoost (the interval bounds are not used during CV)
    modeler.train_xgboost(train_df)
    xgb_preds, xgb_lower, xgb_upper = modeler.predict_xgboost(test_df)
    test_df['xgb_forecast'] = xgb_preds

    # 2. Train & Predict Statistical Baseline (routed by `demand_type`)
    # Group each frame once per fold instead of re-filtering the full frames
    # with boolean masks for every store/product pair.
    train_by_pair = {key: rows for key, rows in train_df.groupby(pair_keys, sort=False)}
    test_by_pair = {key: rows for key, rows in test_df.groupby(pair_keys, sort=False)}
    no_train_rows = train_df.iloc[0:0]  # used when a pair has no training history
    no_test_rows = test_df.iloc[0:0]

    es_preds = []
    unique_pairs = test_df[['store_id', 'product_id', 'demand_type']].drop_duplicates()

    for s_id, p_id, d_type in unique_pairs.itertuples(index=False, name=None):
        pair_train = train_by_pair.get((s_id, p_id), no_train_rows)
        pair_test = test_by_pair.get((s_id, p_id), no_test_rows)

        train_series = pair_train.set_index('date')['imputed_sales']
        test_len = len(pair_test)

        # The model choice per segment happens inside train_predict_statistical.
        preds = modeler.train_predict_statistical(train_series, test_len, demand_type=d_type)

        # If the forecast comes back as a Series, drop its index: the DataFrame
        # constructor would otherwise align it against the index of the `date`
        # column and yield NaN/extra rows instead of a positional assignment.
        if isinstance(preds, pd.Series):
            preds = preds.to_numpy()

        es_preds.append(pd.DataFrame({
            'date': pair_test['date'],
            'store_id': s_id, 'product_id': p_id, 'stat_forecast': preds
        }))

    stat_results = pd.concat(es_preds)
    test_df = test_df.merge(stat_results, on=['date', 'store_id', 'product_id'], how='left')

    # 3. Create the Ensemble (Average of ML and Statistical)
    test_df['ensemble_forecast'] = (test_df['xgb_forecast'] + test_df['stat_forecast']) / 2

    # NOTE(review): the reported MAPE values are in the tens of thousands of
    # percent while MAE stays in single digits — presumably zero or near-zero
    # actuals in the denominator; confirm in calculate_metrics.
    metrics = modeler.calculate_metrics(test_df['imputed_sales'], test_df['ensemble_forecast'])
    print(f"Fold {fold + 1} Ensemble Metrics: {metrics}")

    # Save results to build a historical track record
    historical_results.append(test_df[['date', 'store_id', 'product_id', 'demand_type', 'imputed_sales', 'ensemble_forecast']])

# Export historical performance for the dashboard
df_history = pd.concat(historical_results)
df_history.to_csv(f'{data_dir}/historical_performance.csv', index=False)
print(f"\n‚úÖ Historical performance saved to {data_dir}/historical_performance.csv")

üöÄ Starting Walk-Forward Cross-Validation...

--- Fold 1 ---
Training up to: 2024-07-23 | Testing: 2024-07-24 to 2025-01-14
Fold 1 Ensemble Metrics: {'MAE': 7.14, 'RMSE': np.float64(12.21), 'MAPE (%)': np.float64(64878.69)}

--- Fold 2 ---
Training up to: 2025-01-14 | Testing: 2025-01-15 to 2025-07-08
Fold 2 Ensemble Metrics: {'MAE': 7.92, 'RMSE': np.float64(16.12), 'MAPE (%)': np.float64(85662.68)}

--- Fold 3 ---
Training up to: 2025-07-08 | Testing: 2025-07-09 to 2025-12-30
Fold 3 Ensemble Metrics: {'MAE': 6.84, 'RMSE': np.float64(12.28), 'MAPE (%)': np.float64(90768.49)}

‚úÖ Historical performance saved to ../data/historical_performance.csv


In [3]:
# Cell 3: Final Training & Inventory Optimization
# We reserve the last 60 days to act as our "Future" projection for the dashboard
# NOTE(review): the hold-out keeps `date >= split_date`, so it spans 61 calendar
# dates when the data is daily — confirm whether the boundary day is intended.
split_date = df_featured['date'].max() - pd.Timedelta(days=60)
train_df = df_featured[df_featured['date'] < split_date].copy()
test_df = df_featured[df_featured['date'] >= split_date].copy()

print("\nüìà Training Final Global Model for Future Projections...")
modeler.train_xgboost(train_df)
xgb_preds, xgb_lower, xgb_upper = modeler.predict_xgboost(test_df)
test_df['xgb_forecast'] = xgb_preds
test_df['xgb_lower_bound'] = xgb_lower
test_df['xgb_upper_bound'] = xgb_upper

# Statistical baseline, one forecast per store/product pair.
# Group each frame once instead of re-filtering the full frames with boolean
# masks for every pair.
pair_keys = ['store_id', 'product_id']
train_by_pair = {key: rows for key, rows in train_df.groupby(pair_keys, sort=False)}
test_by_pair = {key: rows for key, rows in test_df.groupby(pair_keys, sort=False)}
no_train_rows = train_df.iloc[0:0]  # used when a pair has no training history
no_test_rows = test_df.iloc[0:0]

es_preds = []
unique_pairs = test_df[['store_id', 'product_id', 'demand_type']].drop_duplicates()
for s_id, p_id, d_type in unique_pairs.itertuples(index=False, name=None):
    pair_train = train_by_pair.get((s_id, p_id), no_train_rows)
    pair_test = test_by_pair.get((s_id, p_id), no_test_rows)

    train_series = pair_train.set_index('date')['imputed_sales']
    test_len = len(pair_test)

    preds = modeler.train_predict_statistical(train_series, test_len, demand_type=d_type)

    # If the forecast comes back as a Series, drop its index: the DataFrame
    # constructor would otherwise align it against the index of the `date`
    # column and yield NaN/extra rows instead of a positional assignment.
    if isinstance(preds, pd.Series):
        preds = preds.to_numpy()

    es_preds.append(pd.DataFrame({'date': pair_test['date'],
                                  'store_id': s_id, 'product_id': p_id, 'stat_forecast': preds}))

stat_results = pd.concat(es_preds)
# The merge also replaces the index of `test_df` with a fresh 0..n-1 range.
test_df = test_df.merge(stat_results, on=['date', 'store_id', 'product_id'], how='left')
# Ensemble = plain average of the ML and the statistical forecast.
test_df['ensemble_forecast'] = (test_df['xgb_forecast'] + test_df['stat_forecast']) / 2

# Final Segmentation Evaluation
print("\n--- Final 60-Day Forward Look Evaluation ---")
for segment, segment_data in ((seg, test_df[test_df['demand_type'] == seg])
                              for seg in test_df['demand_type'].unique()):
    metrics = modeler.calculate_metrics(segment_data['imputed_sales'], segment_data['ensemble_forecast'])
    print(f"Segment: {segment} | MAPE: {metrics['MAPE (%)']}%")

# Inventory Optimization
print("\n‚öôÔ∏è Running Inventory Optimization Module...")
optimizer = InventoryOptimizer(lead_time_days=3, holding_cost_annual_rate=0.2, stockout_penalty_per_unit=15)

# Pass in the test dataframe and the residual_std stored on the modeler
# (presumably set while training XGBoost — confirm in ForecastingModels).
df_inventory = optimizer.generate_recommendations(test_df, modeler.residual_std)

# Append the prediction intervals so the dashboard can plot them.
# NOTE(review): these assignments align on the index, so they are only correct
# if generate_recommendations returns rows carrying the same index as `test_df`
# — confirm against InventoryOptimizer.
df_inventory['lower_bound'] = test_df['xgb_lower_bound']
df_inventory['upper_bound'] = test_df['xgb_upper_bound']

df_inventory.to_csv(f'{data_dir}/final_inventory_recommendations.csv', index=False)
print(f"‚úÖ Optimization complete. Future recommendations saved to {data_dir}/final_inventory_recommendations.csv")


üìà Training Final Global Model for Future Projections...

--- Final 60-Day Forward Look Evaluation ---
Segment: Fast-Moving | MAPE: 12.1%
Segment: Seasonal | MAPE: 12.26%
Segment: Intermittent | MAPE: 449865.93%

‚öôÔ∏è Running Inventory Optimization Module...
‚úÖ Optimization complete. Future recommendations saved to ../data/final_inventory_recommendations.csv
