In [1]:
# Cell 1: Setup and Data Loading (Robust Path Version)
#
# Puts the project's src/ directory on sys.path, loads the four raw CSV tables,
# joins them into one frame, rebuilds a stockout-corrected demand signal
# ("imputed_sales") and runs the feature-engineering pipeline.
import sys
import os
import pandas as pd
import numpy as np

# Smart path resolution: works whether notebook is in root or /notebooks
current_dir = os.getcwd()
if current_dir.endswith('notebooks'):
    src_path = os.path.abspath(os.path.join(current_dir, '..', 'src'))
    data_dir = '../data'
else:
    src_path = os.path.abspath(os.path.join(current_dir, 'src'))
    data_dir = 'data'

if src_path not in sys.path:
    sys.path.append(src_path)

# Project-local modules: these imports only resolve once src_path is on sys.path.
from features import FeatureEngineer
from models import ForecastingModels

# Load the raw tables
sales = pd.read_csv(f'{data_dir}/sales.csv', parse_dates=['date'])
products = pd.read_csv(f'{data_dir}/products.csv')
stores = pd.read_csv(f'{data_dir}/stores.csv')
promotions = pd.read_csv(f'{data_dir}/promotions.csv', parse_dates=['date'])

# Left joins keep every sales row.
# NOTE(review): promotions is joined on 'date' alone; if that table can hold more
# than one row per date the join will duplicate sales rows -- confirm its grain.
df = (
    sales
    .merge(products, on='product_id', how='left')
    .merge(stores, on='store_id', how='left')
    .merge(promotions, on='date', how='left')
)

# Sales recorded on a stockout day are censored, so blank them out.
# .mask() builds the column in one step instead of writing NaN into an existing
# column through .loc, which relies on implicit dtype upcasting.
df['actual_sales'] = df['sales_quantity'].mask(df['is_stockout'] == 1)

# Fill the gaps per (store, product) series.
# - Linear interpolation is positional, so each series is put in date order first;
#   the result is assigned back by index, leaving df's row order untouched.
# - limit_direction='both' also fills leading/trailing gaps inside a series.
# - A series with no observed sales at all stays NaN after interpolation and is
#   set to 0. (Previously a frame-wide .bfill() ran before the fillna and copied
#   values in from whichever other series happened to follow in the frame.)
series_keys = ['store_id', 'product_id']
df['imputed_sales'] = (
    df.sort_values('date', kind='mergesort')
    .groupby(series_keys)['actual_sales']
    .transform(lambda s: s.interpolate(method='linear', limit_direction='both'))
    .fillna(0)
)

# Apply Feature Engineering
fe = FeatureEngineer()
df_featured = fe.run_pipeline(df)
print("Data prepped and ready for modeling.")

Data prepped and ready for modeling.


In [2]:
# Cell 2: Train/Test Split and XGBoost Training
#
# Splits the featured data chronologically, fits the global XGBoost model,
# scores it on the hold-out window and derives a symmetric prediction band
# from the spread of the training residuals.
modeler = ForecastingModels()

# We reserve the last 60 days of our dataset for out-of-sample testing
split_date = df_featured['date'].max() - pd.Timedelta(days=60)
train_df, test_df = modeler.chronological_split(df_featured, split_date)

# New columns are added to both frames below. Take explicit copies so those
# writes can never land on (or warn about) a slice of df_featured.
train_df = train_df.copy()
test_df = test_df.copy()

print(f"Training on dates: {train_df['date'].min().date()} to {train_df['date'].max().date()}")
print(f"Testing on dates: {test_df['date'].min().date()} to {test_df['date'].max().date()}")

# Train global XGBoost
print("\nTraining Global XGBoost Model...")
modeler.train_xgboost(train_df)

# Predict
test_df['xgb_forecast'] = modeler.predict_xgboost(test_df)

# Evaluate Global Model
# NOTE(review): the recorded MAPE for this step is in the hundreds of thousands
# of percent, which usually means actuals at or near zero in the denominator --
# check how calculate_metrics handles zero actuals before quoting that figure.
xgb_metrics = modeler.calculate_metrics(test_df['imputed_sales'], test_df['xgb_forecast'])
print(f"Global XGBoost Performance: {xgb_metrics}")

# Calculate Prediction Intervals (Uncertainty) based on training residuals
# These are in-sample residuals pooled over every series, so the resulting band
# is a single global width and is likely optimistic for out-of-sample data.
train_df['xgb_preds'] = modeler.predict_xgboost(train_df)
residual_std = np.std(train_df['imputed_sales'] - train_df['xgb_preds'])

# Approximate 95% prediction interval: forecast +/- 1.96 standard deviations,
# assuming roughly normal residuals. The lower bound is clipped at zero because
# demand cannot be negative.
test_df['xgb_lower_bound'] = np.maximum(test_df['xgb_forecast'] - (1.96 * residual_std), 0)
test_df['xgb_upper_bound'] = test_df['xgb_forecast'] + (1.96 * residual_std)
print("Prediction intervals calculated successfully.")

Training on dates: 2024-01-29 to 2025-10-30
Testing on dates: 2025-10-31 to 2025-12-30

Training Global XGBoost Model...
Global XGBoost Performance: {'MAE': 3.72, 'RMSE': np.float64(6.21), 'MAPE (%)': np.float64(131810.61)}
Prediction intervals calculated successfully.


In [3]:
# Cell 3: Baseline (Exp. Smoothing) and Ensemble
#
# Fits one exponential-smoothing model per (store, product) series, writes its
# forecast into test_df['es_forecast'] and averages it with the XGBoost
# forecast to form test_df['ensemble_forecast'].
print("Training Statistical Baseline (Exponential Smoothing) per series...")

series_keys = ['store_id', 'product_id']

# Use a clean positional index so forecasts can be written back by row position.
# Writing the column directly (instead of merging a results frame back in) keeps
# this cell safe to re-run and cannot duplicate rows when keys repeat.
test_df = test_df.reset_index(drop=True)

# Split the training history once per series instead of re-filtering the whole
# training frame for every pair.
train_series_by_pair = {
    pair: history.set_index('date')['imputed_sales'].sort_index(kind='mergesort')
    for pair, history in train_df.groupby(series_keys, sort=False)
}
no_history = pd.Series(dtype='float64')  # used for pairs absent from training

es_forecast = np.full(len(test_df), np.nan)
es_fallback_pairs = []

for pair, test_rows in test_df.groupby(series_keys, sort=False):
    # Forecast steps are chronological, so line the target rows up by date.
    test_rows = test_rows.sort_values('date', kind='mergesort')
    test_len = len(test_rows)
    train_series = train_series_by_pair.get(pair, no_history)

    # Train and Forecast with ES
    # ES can fail on some series (e.g. too many zeros). Catch Exception rather
    # than using a bare except so Ctrl-C / kernel interrupts still propagate.
    try:
        preds = modeler.train_predict_expsmoothing(train_series, test_len)
    except Exception:
        # Fallback: flat forecast at the series' historical mean
        # (NaN when the series has no training history at all).
        preds = np.full(test_len, train_series.mean())
        es_fallback_pairs.append(pair)

    # np.asarray drops any index the forecaster attached, so values are placed
    # strictly by position and never silently become NaN through misalignment.
    es_forecast[test_rows.index.to_numpy()] = np.asarray(preds, dtype=float)

test_df['es_forecast'] = es_forecast

if es_fallback_pairs:
    print(f"Exponential Smoothing failed for {len(es_fallback_pairs)} series; used the training mean instead.")

# Create Ensemble (Simple Average of XGBoost and ES)
test_df['ensemble_forecast'] = (test_df['xgb_forecast'] + test_df['es_forecast']) / 2

print("Baseline and Ensemble forecasts generated.")

Training Statistical Baseline (Exponential Smoothing) per series...


  self._init_dates(dates, freq)
  return get_prediction_index(
  return get_prediction_index(
  self._init_dates(dates, freq)
  return get_prediction_index(
  return get_prediction_index(
  self._init_dates(dates, freq)
  return get_prediction_index(
  return get_prediction_index(
  self._init_dates(dates, freq)
  return get_prediction_index(
  return get_prediction_index(
  self._init_dates(dates, freq)
  return get_prediction_index(
  return get_prediction_index(
  self._init_dates(dates, freq)
  return get_prediction_index(
  return get_prediction_index(
  self._init_dates(dates, freq)
  return get_prediction_index(
  return get_prediction_index(
  self._init_dates(dates, freq)
  return get_prediction_index(
  return get_prediction_index(
  self._init_dates(dates, freq)
  return get_prediction_index(
  return get_prediction_index(
  self._init_dates(dates, freq)
  return get_prediction_index(
  return get_prediction_index(
  self._init_dates(dates, freq)
  return get_prediction_inde

Baseline and Ensemble forecasts generated.


  return get_prediction_index(
  return get_prediction_index(


In [4]:
# Cell 4: Final Evaluation by Segment
#
# Scores the ensemble forecast separately for every value of 'demand_type'
# found in the test window and checks the Fast-Moving segment against its
# 20% MAPE target.
print("--- Final Model Evaluation by Segment ---")

# One pass per distinct demand_type, in order of first appearance
segments = test_df['demand_type'].unique()

for segment in segments:
    in_segment = test_df['demand_type'] == segment
    segment_data = test_df[in_segment]

    # Ensemble accuracy for this segment, via the modeler's shared metric helper
    metrics = modeler.calculate_metrics(
        segment_data['imputed_sales'],
        segment_data['ensemble_forecast'],
    )

    print(f"\nSegment: {segment} ({len(segment_data)} records)")
    print(f"Ensemble Performance: {metrics}")

    # Only the Fast-Moving segment has an explicit accuracy target
    if segment != 'Fast-Moving':
        continue

    if metrics['MAPE (%)'] < 20:
        print("SUCCESS: Fast-Moving MAPE is under 20%!")
    else:
        print("WARNING: Fast-Moving MAPE is above 20%. Tuning may be required.")

--- Final Model Evaluation by Segment ---

Segment: Fast-Moving (3007 records)
Ensemble Performance: {'MAE': 9.3, 'RMSE': np.float64(15.9), 'MAPE (%)': np.float64(12.1)}
SUCCESS: Fast-Moving MAPE is under 20%!

Segment: Seasonal (1499 records)
Ensemble Performance: {'MAE': 3.89, 'RMSE': np.float64(7.47), 'MAPE (%)': np.float64(12.26)}

Segment: Intermittent (1495 records)
Ensemble Performance: {'MAE': 0.57, 'RMSE': np.float64(0.88), 'MAPE (%)': np.float64(555843.78)}


In [5]:
# Cell 5: Inventory Optimization Execution
#
# Feeds the test-window forecasts and the residual spread from Cell 2 into the
# inventory optimizer, saves the recommendations to CSV and shows a sample.
import sys
import os
import pandas as pd

# Ensure src path is still loaded
current_dir = os.getcwd()
in_notebooks_dir = current_dir.endswith('notebooks')
if in_notebooks_dir:
    src_path = os.path.abspath(os.path.join(current_dir, '..', 'src'))
else:
    src_path = os.path.abspath(os.path.join(current_dir, 'src'))
if src_path not in sys.path:
    sys.path.append(src_path)

from optimization import InventoryOptimizer

print("Running Inventory Optimization Module...")

# Initialize optimizer with business constraints
# (e.g., 3 days lead time, 20% annual holding cost, $15 penalty per missed sale)
optimizer = InventoryOptimizer(lead_time_days=3, holding_cost_annual_rate=0.2, stockout_penalty_per_unit=15)

# We pass in the test_df containing our ensemble forecasts,
# and the residual_std we calculated back in Cell 2 as our measure of uncertainty.
# NOTE(review): residual_std is the in-sample spread of the XGBoost model alone,
# pooled over all series, while the forecasts here are the ensemble -- confirm
# this is the uncertainty the optimizer is meant to receive.
df_inventory = optimizer.generate_recommendations(test_df, residual_std)

# Save the final analytical dataset for our dashboard
data_dir = '../data' if in_notebooks_dir else 'data'
output_path = f'{data_dir}/final_inventory_recommendations.csv'
df_inventory.to_csv(output_path, index=False)

# Report the path that was actually written (it differs when run from /notebooks)
print(f"Optimization complete. Final recommendations saved to {output_path}")

# Display a sample of the actionable business recommendations
display(df_inventory[df_inventory['product_id'] == 'P001'].head(5))

Running Inventory Optimization Module...
Optimization complete. Final recommendations saved to data/final_inventory_recommendations.csv


Unnamed: 0,date,store_id,product_id,base_price,ensemble_forecast,target_service_level,safety_stock,reorder_point,recommended_order_quantity
0,2025-10-31,S001,P001,78.35,65.356773,0.99,23.0,220.0,220.0
1,2025-11-01,S001,P001,78.35,63.217238,0.99,23.0,213.0,213.0
2,2025-11-02,S001,P001,78.35,62.589488,0.99,23.0,211.0,211.0
3,2025-11-03,S001,P001,78.35,64.062245,0.99,23.0,216.0,216.0
4,2025-11-04,S001,P001,78.35,62.696028,0.99,23.0,212.0,212.0
