# HMData — Q4 Forecast Pedagogical Notebook (Fully Commented)

This notebook walks through a simplified forecasting workflow for H&M-style product data. We simulate a realistic task: predicting Q4 (October–December) demand for products whose Q4 sales are not yet observed. Each code block is fully commented in complete sentences so that students can follow the logic clearly.

In [ ]:
# Import data and modeling libraries used throughout the notebook.
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, LassoCV
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error

# Define the twelve month names in calendar order so we can later infer month order if needed.
# The list is laid out one quarter per row; a name's position plus one is its month number.
MONTHS = [
    'January', 'February', 'March',       # Q1
    'April', 'May', 'June',               # Q2
    'July', 'August', 'September',        # Q3
    'October', 'November', 'December',    # Q4
]

# Define a helper function that prints R², RMSE, and MAE for evaluation.
def report_metrics(y_true, y_pred, label=""):
    """Print one line with R², RMSE, and MAE for a set of predictions.

    Args:
        y_true: Observed target values.
        y_pred: Predicted target values, aligned with ``y_true``.
        label: Text placed in front of the metrics (for example a model name).

    Returns:
        None. The metrics are only printed.
    """
    # RMSE is the square root of the mean squared error.
    mse = mean_squared_error(y_true, y_pred)
    parts = (
        f"R²={r2_score(y_true, y_pred):.3f}",
        f"RMSE={mse ** 0.5:.2f}",
        f"MAE={mean_absolute_error(y_true, y_pred):.2f}",
    )
    print(f"{label}" + ", ".join(parts))

# Define a helper to display the most influential coefficients by magnitude.
def show_top_coeffs(model, feature_names, k=10):
    """Print the ``k`` coefficients with the largest absolute value, keeping their sign.

    Features are ranked by |coefficient|, but the signed value is printed so the
    reader can see whether each feature pushes the prediction up or down.

    Args:
        model: A fitted estimator exposing a one-dimensional ``coef_`` attribute.
        feature_names: Labels for the coefficients, in the same order as ``coef_``.
        k: Number of coefficients to display.

    Returns:
        None. The ranking is only printed.
    """
    coefs = pd.Series(model.coef_, index=feature_names)
    # Sort on the absolute value but keep the original signed values in the output.
    ranked = coefs.sort_values(key=lambda s: s.abs(), ascending=False)
    print('\n[Top features by |coefficient|]')
    print(ranked.head(k))

# Print a confirmation once the imports and helper definitions above have run.
print('[INFO] Libraries imported successfully.')

In [ ]:
# Read the CSV straight from GitHub. Each row represents a product-month combination.
GITHUB_URL = 'https://raw.githubusercontent.com/ucla-anderson-SSAI/SSAI/main/HMData.csv'
df = pd.read_csv(GITHUB_URL)
print('[INFO] Loaded dataset shape:', df.shape)

# Turn boolean values and the strings 'TRUE'/'FALSE' into numeric 1s and 0s.
flag_to_int = {True: 1, False: 0, 'TRUE': 1, 'FALSE': 0}
df = df.replace(flag_to_int)

# Strip the 'month_' prefix from column names (e.g., 'month_January' → 'January').
df = df.rename(columns=lambda c: c.replace('month_', '') if c.startswith('month_') else c)

# Remove the 'start' column when present because we will not use date strings directly.
df = df.drop(columns=['start'], errors='ignore')

# Keep a single product type to make the exercise manageable and interpretable.
SELECTED_NAME = 'Vest top'
is_selected = df['name'] == SELECTED_NAME
df = df.loc[is_selected].copy()
if df.empty:
    raise ValueError(f'No rows found for name == {SELECTED_NAME!r}')

print('[INFO] After filtering for product type:', df.shape)

In [ ]:
# Split at the product level so that test products represent unseen SKUs.
# For those test products we only predict Q4 demand (October–December).
Q4_MONTHS = ['October', 'November', 'December']
unique_ids = df['id'].unique()
train_ids, test_ids = train_test_split(unique_ids, test_size=0.2, random_state=0)
print(f'[INFO] Train products: {len(train_ids)}, Test products: {len(test_ids)}')

# Build one boolean mask per condition so each filter reads as a single idea.
in_train_products = df['id'].isin(train_ids)
in_test_products = df['id'].isin(test_ids)
# A row is a Q4 row when any of the three Q4 month indicator columns equals 1.
is_q4_row = df[Q4_MONTHS].eq(1).any(axis=1)

# Training data keeps every month for the training products.
train_df = df.loc[in_train_products].copy()

# Test data keeps only the Q4 rows of the held-out products.
test_df = df.loc[in_test_products & is_q4_row].copy()

print('[INFO] Train rows (all months):', len(train_df))
print('[INFO] Test rows (Q4 only):', len(test_df))

In [ ]:
# Model 1 is a plain linear regression on every available column. It serves as the baseline.
# Identifier columns and the target itself are excluded from the feature matrix.
drop_cols = ['id', 'name', 'demand']

# Training features: drop the excluded columns, coerce everything to numbers,
# and replace values that could not be parsed (NaN) with 0.
X_train = (
    train_df.drop(columns=drop_cols, errors='ignore')
    .apply(pd.to_numeric, errors='coerce')
    .fillna(0)
)
y_train = train_df['demand'].astype(float)

# Test features go through exactly the same preparation.
X_test = (
    test_df.drop(columns=drop_cols, errors='ignore')
    .apply(pd.to_numeric, errors='coerce')
    .fillna(0)
)
y_test = test_df['demand'].astype(float)

# Fit on the training products and score the Q4 rows of the held-out products.
linreg = LinearRegression().fit(X_train, y_train)
y_pred1 = linreg.predict(X_test)
report_metrics(y_test, y_pred1, label='[Model 1 — Linear] ')



In [ ]:
# Model 2 fits a LASSO regression on the same features as Model 1.
# LASSO adds regularization to prevent overfitting and select relevant predictors.

# Learn the scaling from the training features only, then apply it to both sets.
# with_mean=False rescales each column without centering it.
scaler2 = StandardScaler(with_mean=False).fit(X_train)
Xtr2 = scaler2.transform(X_train)
Xte2 = scaler2.transform(X_test)

# LassoCV chooses the regularization strength with 3-fold cross-validation.
lasso2 = LassoCV(cv=3, max_iter=5000, random_state=0).fit(Xtr2, y_train)
y_pred2 = lasso2.predict(Xte2)

report_metrics(y_test, y_pred2, label='[Model 2 — LASSO] ')
show_top_coeffs(lasso2, X_train.columns, k=12)

In [ ]:
# Model 3 introduces time-based features such as lagged demand, moving averages, and price changes.
# These capture short-term temporal patterns that often drive retail demand.

# Work on a copy so the feature columns added for Model 3 do not touch df.
df_fe = df.copy()

# Infer the numeric month for each row to sort observations within product.
def infer_month(row):
    """Return the 1-based month number of the first month flag set to 1 in ``row``.

    Months are checked in the order given by ``MONTHS``. A month counts as set
    when its name is a label of ``row`` and the value stored there equals 1.

    Args:
        row: One DataFrame row (a Series indexed by column name).

    Returns:
        The month number (1 for the first entry of ``MONTHS``), or ``np.nan``
        when no month flag equals 1.
    """
    flagged_positions = (
        position
        for position, month in enumerate(MONTHS, start=1)
        if month in row and row[month] == 1
    )
    # Take the first match only; fall back to NaN when nothing matched.
    return next(flagged_positions, np.nan)

# Attach the month number to every row and order the rows chronologically within each product.
df_fe['month_num'] = df_fe.apply(infer_month, axis=1)
df_fe = df_fe.sort_values(['id', 'month_num']).reset_index(drop=True)

# Create lagged and moving average features.
# NOTE(review): the Q4 test rows take their lags from the preceding rows of the same product,
# so a November row uses October demand. In a forecast where Q4 is truly unobserved that value
# would not be available — confirm this is acceptable for the exercise.
demand_by_product = df_fe.groupby('id')['demand']
for lag in (1, 2, 3):
    # shift() on a grouped column moves values down within each product only.
    df_fe[f'lag_demand_{lag}'] = demand_by_product.shift(lag)

# Average of up to three previous demand values of the SAME product.
# The shift and the rolling window both run inside transform() so the window restarts for every
# product. Calling .rolling() on the result of groupby().shift() would operate on one flat
# series and blend in the tail of the previous product.
df_fe['ma3_demand'] = demand_by_product.transform(
    lambda s: s.shift(1).rolling(3, min_periods=1).mean()
)

# Relative price change versus the previous row of the same product.
# A previous price of 0 produces ±inf, which fillna() does not touch and which would break the
# scaler later, so infinities are converted to NaN first.
df_fe['price_change'] = (
    df_fe.groupby('id')['price'].pct_change().replace([np.inf, -np.inf], np.nan)
)

# The first rows of each product have no history; treat the missing history as 0.
time_feature_cols = ['lag_demand_1', 'lag_demand_2', 'lag_demand_3', 'ma3_demand', 'price_change']
df_fe[time_feature_cols] = df_fe[time_feature_cols].fillna(0)

# Rebuild the Q4-style train/test split, this time on the frame that carries the new features.
is_train_product = df_fe['id'].isin(train_ids)
is_test_product = df_fe['id'].isin(test_ids)
# A row is a Q4 row when any of the three Q4 month indicator columns equals 1.
is_q4_row = df_fe[Q4_MONTHS].eq(1).any(axis=1)
train_fe = df_fe.loc[is_train_product].copy()
test_fe = df_fe.loc[is_test_product & is_q4_row].copy()

# month_num was only needed for sorting, so it is excluded together with the ids and the target.
drop_cols_fe = ['id','name','demand','month_num']
X3_train = (
    train_fe.drop(columns=drop_cols_fe, errors='ignore')
    .apply(pd.to_numeric, errors='coerce')
    .fillna(0)
)
y3_train = train_fe['demand'].astype(float)
X3_test = (
    test_fe.drop(columns=drop_cols_fe, errors='ignore')
    .apply(pd.to_numeric, errors='coerce')
    .fillna(0)
)
y3_test = test_fe['demand'].astype(float)

# Scale with statistics learned on the training rows only, then fit LASSO with 3-fold CV.
scaler3 = StandardScaler(with_mean=False).fit(X3_train)
X3tr = scaler3.transform(X3_train)
X3te = scaler3.transform(X3_test)
lasso3 = LassoCV(cv=3, max_iter=6000, random_state=0).fit(X3tr, y3_train)
y3_pred = lasso3.predict(X3te)
report_metrics(y3_test, y3_pred, label='[Model 3 — FE + LASSO] ')
show_top_coeffs(lasso3, X3_train.columns, k=12)

In [ ]:
# Model 4 adds simple nonlinear transformations (log, squared terms) and a few intuitive interactions between features.

def add_nonlinear_and_interactions(df_in):
    """Return a copy of ``df_in`` extended with nonlinear and interaction columns.

    Each derived column is added only when every column it is built from is
    present, so the function also works on frames that lack some features.
    The input frame is not modified.

    Args:
        df_in: DataFrame that may contain 'lag_demand_1', 'ma3_demand', 'price',
            'price_change' and 'Ladieswear'.

    Returns:
        A new DataFrame with the original columns plus whichever of 'log_lag1',
        'log_ma3', 'price_sq', 'lag1_x_price', 'pch_x_price' and 'lag1_x_ladies'
        could be computed.
    """
    # Each entry is (new column, columns it needs, how to compute it), in creation order.
    derived_features = (
        ('log_lag1', ('lag_demand_1',), lambda d: np.log1p(d['lag_demand_1'])),
        ('log_ma3', ('ma3_demand',), lambda d: np.log1p(d['ma3_demand'])),
        ('price_sq', ('price',), lambda d: d['price'] ** 2),
        ('lag1_x_price', ('lag_demand_1', 'price'), lambda d: d['lag_demand_1'] * d['price']),
        ('pch_x_price', ('price_change', 'price'), lambda d: d['price_change'] * d['price']),
        ('lag1_x_ladies', ('Ladieswear', 'lag_demand_1'), lambda d: d['lag_demand_1'] * d['Ladieswear']),
    )

    df_out = df_in.copy()
    for new_col, required_cols, compute in derived_features:
        # Skip features whose inputs are not all available in this frame.
        if set(required_cols).issubset(df_out.columns):
            df_out[new_col] = compute(df_out)
    return df_out

# Extend the Model 3 train and test frames with the nonlinear and interaction columns.
train_int = add_nonlinear_and_interactions(train_fe)
test_int  = add_nonlinear_and_interactions(test_fe)

# Exclude the identifiers, the target, and the helper month number from the features.
drop_cols_int = ['id','name','demand','month_num']
X4_train = (
    train_int.drop(columns=drop_cols_int, errors='ignore')
    .apply(pd.to_numeric, errors='coerce')
    .fillna(0)
)
y4_train = train_int['demand'].astype(float)
X4_test = (
    test_int.drop(columns=drop_cols_int, errors='ignore')
    .apply(pd.to_numeric, errors='coerce')
    .fillna(0)
)
y4_test = test_int['demand'].astype(float)

# Scale with statistics learned on the training rows only, then fit LASSO with 3-fold CV.
scaler4 = StandardScaler(with_mean=False).fit(X4_train)
X4tr = scaler4.transform(X4_train)
X4te = scaler4.transform(X4_test)
lasso4 = LassoCV(cv=3, max_iter=8000, random_state=0).fit(X4tr, y4_train)
y4_pred = lasso4.predict(X4te)
report_metrics(y4_test, y4_pred, label='[Model 4 — Nonlinear + Interactions + LASSO] ')
show_top_coeffs(lasso4, X4_train.columns, k=12)