# HMData — Q4 Forecast Pedagogical Notebook (Fully Commented)

This notebook walks through a simplified forecasting workflow for H&M-style product data. We simulate a realistic task: predicting Q4 (October–December) demand for products whose Q4 sales are not yet observed. Each code block is fully commented in complete sentences so that students can follow the logic clearly.

In [33]:
# Import data and modeling libraries used throughout the notebook.
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, LassoCV
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error

# Define a fixed list of months so we can later infer month order if needed.
MONTHS = [
    'January','February','March','April','May','June',
    'July','August','September','October','November','December'
]

# Define a helper function that prints R², RMSE, and MAE for evaluation.
def report_metrics(y_true, y_pred, label=""):
    r2 = r2_score(y_true, y_pred)
    rmse = mean_squared_error(y_true, y_pred) ** 0.5
    mae = mean_absolute_error(y_true, y_pred)
    print(f"{label}R²={r2:.3f}, RMSE={rmse:.2f}, MAE={mae:.2f}")

# Define a helper to display the most influential coefficients by magnitude.
def show_top_coeffs(model, feature_names, k=10):
    coefs = pd.Series(model.coef_, index=feature_names)
    coefs_abs = coefs.abs().sort_values(ascending=False)
    print('\n[Top features by |coefficient|]')
    print(coefs_abs.head(k))

print('[INFO] Libraries imported successfully.')

[INFO] Libraries imported successfully.


In [34]:
# Load the dataset directly from GitHub. Each row represents a product-month combination.
GITHUB_URL = 'https://raw.githubusercontent.com/ucla-anderson-SSAI/SSAI/main/HMData.csv'
df = pd.read_csv(GITHUB_URL)
print('[INFO] Loaded dataset shape:', df.shape)

# Convert boolean or string TRUE/FALSE values into numeric 1s and 0s.
df = df.replace({True: 1, False: 0, 'TRUE': 1, 'FALSE': 0})

# Simplify month column names (e.g., 'month_January' → 'January').
df.columns = [c.replace('month_', '') if c.startswith('month_') else c for c in df.columns]

# Drop 'start' column if it exists because we will not use date strings directly.
if 'start' in df.columns:
    df = df.drop(columns=['start'])

# Focus on a single product type to make the exercise manageable and interpretable.
SELECTED_NAME = 'Hoodie'
df = df[df['name'] == SELECTED_NAME].copy()
if df.empty:
    raise ValueError(f'No rows found for name == {SELECTED_NAME!r}')

print('[INFO] After filtering for product type:', df.shape)

[INFO] Loaded dataset shape: (310849, 118)
[INFO] After filtering for product type: (6366, 118)


In [35]:
# Create a product-level split so that test products represent unseen SKUs. We will only predict their Q4 demand (October–December).
Q4_MONTHS = ['October', 'November', 'December']
unique_ids = df['id'].unique()
train_ids, test_ids = train_test_split(unique_ids, test_size=0.2, random_state=0)
print(f'[INFO] Train products: {len(train_ids)}, Test products: {len(test_ids)}')

# Training data contains all months for known products.
train_df = df[df['id'].isin(train_ids)].copy()

# Test data includes only Q4 months for unseen products.
test_df = df[df['id'].isin(test_ids) & (
    (df['October'] == 1) | (df['November'] == 1) | (df['December'] == 1)
)].copy()

print('[INFO] Train rows (all months):', len(train_df))
print('[INFO] Test rows (Q4 only):', len(test_df))

[INFO] Train products: 1162, Test products: 291
[INFO] Train rows (all months): 5117
[INFO] Test rows (Q4 only): 321


In [36]:
# Model 1 fits a LASSO regression using the same features. LASSO adds regularization to prevent overfitting and select relevant predictors.
scaler2 = StandardScaler(with_mean=False)
Xtr2 = scaler2.fit_transform(X_train)
Xte2 = scaler2.transform(X_test)

lasso2 = LassoCV()
lasso2.fit(Xtr2, y_train)
y_pred2 = lasso2.predict(Xte2)

report_metrics(y_test, y_pred2, label='[Model 2 — LASSO] ')
show_top_coeffs(lasso2, X_train.columns, k=12)

[Model 2 — LASSO] R²=0.228, RMSE=56.95, MAE=35.74

[Top features by |coefficient|]
Jersey Basic      14.576694
Divided           10.594909
Baby/Children      9.678115
Jersey Fancy       5.858913
Special Offers     5.612637
September          5.156905
Black              5.127727
November           4.196228
October            4.163397
Menswear           3.600460
January            3.398209
White              3.376988
dtype: float64


In [37]:
# Model 2 introduces time-based features such as lagged demand, moving averages, and price changes.
# These capture short-term temporal patterns that often drive retail demand.

df_fe = df.copy()

# Infer the numeric month for each row to sort observations within product.
def infer_month(row):
    for i, m in enumerate(MONTHS, start=1):
        if m in row and row[m] == 1:
            return i
    return np.nan

df_fe['month_num'] = df_fe.apply(infer_month, axis=1)
df_fe = df_fe.sort_values(['id', 'month_num']).reset_index(drop=True)

# Create lagged and moving average features.
df_fe['lag_demand_1'] = df_fe.groupby('id')['demand'].shift(1)
df_fe['lag_demand_2'] = df_fe.groupby('id')['demand'].shift(2)
df_fe['lag_demand_3'] = df_fe.groupby('id')['demand'].shift(3)
df_fe['ma3_demand'] = df_fe.groupby('id')['demand'].shift(1).rolling(3, min_periods=1).mean().reset_index(level=0, drop=True)
df_fe['price_change'] = df_fe.groupby('id')['price'].pct_change()
df_fe[['lag_demand_1','lag_demand_2','lag_demand_3','ma3_demand','price_change']] = df_fe[['lag_demand_1','lag_demand_2','lag_demand_3','ma3_demand','price_change']].fillna(0)

# Rebuild Q4-style train/test split with new features.
train_fe = df_fe[df_fe['id'].isin(train_ids)].copy()
test_fe = df_fe[df_fe['id'].isin(test_ids) & ((df_fe['October']==1)|(df_fe['November']==1)|(df_fe['December']==1))].copy()

drop_cols_fe = ['id','name','demand','month_num']
X3_train = train_fe.drop(columns=[c for c in drop_cols_fe if c in train_fe.columns])
X3_train = X3_train.apply(pd.to_numeric, errors='coerce').fillna(0)
y3_train = train_fe['demand'].astype(float)
X3_test = test_fe.drop(columns=[c for c in drop_cols_fe if c in test_fe.columns])
X3_test = X3_test.apply(pd.to_numeric, errors='coerce').fillna(0)
y3_test = test_fe['demand'].astype(float)

scaler3 = StandardScaler(with_mean=False)
X3tr = scaler3.fit_transform(X3_train)
X3te = scaler3.transform(X3_test)
lasso3 = LassoCV()
lasso3.fit(X3tr, y3_train)
y3_pred = lasso3.predict(X3te)
report_metrics(y3_test, y3_pred, label='[Model 3 — FE + LASSO] ')
show_top_coeffs(lasso3, X3_train.columns, k=12)

[Model 3 — FE + LASSO] R²=0.339, RMSE=52.72, MAE=26.55

[Top features by |coefficient|]
lag_demand_1     20.601428
ma3_demand       15.786294
lag_demand_3      6.274418
September         5.739292
Divided           5.422986
January           5.401350
price_change      4.694253
Jersey Basic      3.340162
Black             2.801901
Baby/Children     2.772060
lag_demand_2      2.001770
Solid             1.856049
dtype: float64


In [38]:
# ---------------------------------------------------------
# MODEL 3 — Pairwise interactions among engineered features
# ---------------------------------------------------------
# This version keeps the feature space focused by only creating
# interactions BETWEEN the continuous, engineered, time/price features.
# We do NOT interact with wide product attributes like "Black", "Stripe", etc.
# This makes the model more interpretable and less noisy.

# 1) Pick the engineered / continuous features to interact
eng_feats = [
    "lag_demand_1",
    "lag_demand_2",
    "lag_demand_3",
    "ma3_demand",
    "price_change",
    "price",
]

# Keep only the ones that actually exist in the frame
eng_feats = [f for f in eng_feats if f in train_fe.columns]

# 2) Start from the engineered train/test (same as Model 3)
train_adv = train_fe.copy()
test_adv  = test_fe.copy()

# 3) Add simple nonlinear terms (squares) for each engineered feature
for f in eng_feats:
    train_adv[f"{f}_sq"] = train_adv[f] ** 2
    test_adv[f"{f}_sq"]  = test_adv[f] ** 2

# 4) Add all pairwise products among engineered features
#    (order doesn't matter, so we do i < j)
for i in range(len(eng_feats)):
    for j in range(i + 1, len(eng_feats)):
        fi = eng_feats[i]
        fj = eng_feats[j]
        new_name = f"{fi}__x__{fj}"
        train_adv[new_name] = train_adv[fi] * train_adv[fj]
        test_adv[new_name]  = test_adv[fi] * test_adv[fj]

print("[INFO] Added squared terms and pairwise interactions among engineered features only.")

# 5) Build X/y like before
drop_cols_adv = ["id", "name", "demand", "month_num"]
X5_train = train_adv.drop(columns=[c for c in drop_cols_adv if c in train_adv.columns])
X5_train = X5_train.apply(pd.to_numeric, errors="coerce").fillna(0)
y5_train = train_adv["demand"].astype(float)

X5_test = test_adv.drop(columns=[c for c in drop_cols_adv if c in test_adv.columns])
X5_test = X5_test.apply(pd.to_numeric, errors="coerce").fillna(0)
y5_test = test_adv["demand"].astype(float)

print("[INFO] Model 5 feature count:", X5_train.shape[1])

# 6) Scale and fit LASSO
scaler5 = StandardScaler(with_mean=False)
X5tr = scaler5.fit_transform(X5_train)
X5te = scaler5.transform(X5_test)

lasso5 = LassoCV()
lasso5.fit(X5tr, y5_train)
y5_pred = lasso5.predict(X5te)

# 7) Evaluate and show which engineered interactions mattered
report_metrics(y5_test, y5_pred, label="[Model 5 — engineered pairwise interactions] ")
show_top_coeffs(lasso5, X5_train.columns, k=15)


[INFO] Added squared terms and pairwise interactions among engineered features only.
[INFO] Model 5 feature count: 141
[Model 5 — engineered pairwise interactions] R²=0.400, RMSE=50.23, MAE=25.69

[Top features by |coefficient|]
lag_demand_1__x__price           12.149027
ma3_demand                       11.651285
lag_demand_1                      9.464709
lag_demand_1__x__price_change     5.453512
September                         5.140199
January                           4.553462
Divided                           4.347834
lag_demand_2__x__price_change     3.857777
Jersey Basic                      2.605350
ma3_demand_sq                     1.958544
Baby/Children                     1.914190
lag_demand_2__x__lag_demand_3     1.901599
lag_demand_3__x__price            1.570366
Black                             1.207588
lag_demand_3                      1.119887
dtype: float64
