# 03 — Backtesting & Model Selection
Rolling origin evaluation with WAPE, sMAPE, MASE, Bias. Compare SARIMAX, Prophet, and Global GBM. Focus on **holiday weeks** and **tail stores**.


In [1]:

import pandas as pd, numpy as np
from pathlib import Path
from lightgbm import LGBMRegressor
from sklearn.metrics import mean_absolute_error

RAW = Path("../data/raw")
train = pd.read_csv(RAW/"train.csv")
stores = pd.read_csv(RAW/"stores.csv")
features = pd.read_csv(RAW/"features.csv")
for df in [train, features]:
    df['Date'] = pd.to_datetime(df['Date'])

df = (train.merge(stores, on='Store', how='left')
           .merge(features, on=['Store','Date'], how='left')
      ).sort_values(['Store','Dept','Date'])
df = df.drop(columns=['IsHoliday_y'])  # remove duplicate IsHoliday
df = df.rename(columns={'IsHoliday_x': 'IsHoliday'})
df['IsHoliday'] = df['IsHoliday'].astype(bool)

# Features (minimal for demo; reuse engineered features from notebook 02 for real run)
df['Year'] = df['Date'].dt.year
df['Week'] = df['Date'].dt.isocalendar().week.astype(int)
for k in [1,2,3,4,13,52]:
    df[f'lag_{k}'] = df.groupby(['Store','Dept'])['Weekly_Sales'].shift(k)

y = 'Weekly_Sales'
X_cols = ['IsHoliday','Year','Week'] + [c for c in df.columns if c.startswith('lag_')]

def wape(y_true, y_pred):
    return np.abs(y_true - y_pred).sum() / (np.abs(y_true).sum() + 1e-9)

def smape(y_true, y_pred):
    return 100/len(y_true) * np.sum(2*np.abs(y_pred - y_true) / (np.abs(y_true)+np.abs(y_pred)+1e-9))

def mase(y_true, y_pred, m=52):
    # Seasonal naive denominator
    d = np.abs(y_true[m:] - y_true[:-m]).mean()
    return np.abs(y_true - y_pred).mean() / (d + 1e-9)

# Rolling origin splits
dates = sorted(df['Date'].unique())
folds = []
for split in dates[int(len(dates)*0.5): int(len(dates)*0.9): 5]:
    train_mask = df['Date'] <= split
    valid_mask = (df['Date'] > split) & (df['Date'] <= split + np.timedelta64(28,'D'))
    if valid_mask.sum() < 1000: 
        continue
    folds.append((train_mask, valid_mask))

scores = []
for i,(tr,va) in enumerate(folds,1):
    tr_df, va_df = df[tr].copy(), df[va].copy()
    tr_df = tr_df.dropna(subset=X_cols+[y])
    va_df = va_df.dropna(subset=X_cols+[y])
    model = LGBMRegressor(n_estimators=1000, learning_rate=0.03, num_leaves=64, random_state=42)
    model.fit(tr_df[X_cols], tr_df[y])
    pred = model.predict(va_df[X_cols])
    scores.append({
        'fold': i,
        'wape': wape(va_df[y].values, pred),
        'smape': smape(va_df[y].values, pred),
        'bias': (pred.sum()-va_df[y].sum())/(va_df[y].sum()+1e-9)
    })

pd.DataFrame(scores)


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002773 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1553
[LightGBM] [Info] Number of data points in the train set: 56071, number of used features: 8
[LightGBM] [Info] Start training from score 16317.107284
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001127 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1558
[LightGBM] [Info] Number of data points in the train set: 70259, number of used features: 8
[LightGBM] [Info] Start training from score 16324.169269
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.003607 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1563
[LightGBM] [Info] Number of data points in the tra

Unnamed: 0,fold,wape,smape,bias
0,1,0.076477,16.065903,0.020593
1,2,0.077652,15.852307,-0.009437
2,3,0.097663,18.227117,0.034755
3,4,0.092208,17.228288,-0.05455
4,5,0.184332,22.96489,-0.003418
5,6,0.156549,27.307839,0.056794
6,7,0.089437,23.338491,-0.025783
7,8,0.081082,18.15768,-0.002226
8,9,0.121061,21.47235,0.003751
9,10,0.081661,18.341053,-0.037523
