In [None]:
# Cell 1 — imports & load
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.linear_model import LassoCV
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error

URL = "https://raw.githubusercontent.com/ucla-anderson-SSAI/SSAI/main/HandMSales_fullspan.csv"

# Load the CSV, parse month_ts (unparseable values become NaT because of
# errors="coerce"), order rows by article then month, and derive the calendar
# month number from the parsed timestamp.
df = (
    pd.read_csv(URL)
    .assign(
        month_ts=lambda raw: pd.to_datetime(
            raw["month_ts"], format="%m/%d/%y", errors="coerce"
        )
    )
    .sort_values(["article_id", "month_ts"])
    .reset_index(drop=True)
    .assign(month_num=lambda ordered: ordered["month_ts"].dt.month)
)
print("[INFO] raw df:", df.shape)


In [None]:
# Cell 2 — select a product category
# AVAILABLE CATEGORIES (product_type_name):
# Bag, Ballerinas, Belt, Bikini top, Blazer, Blouse, Bodysuit, Boots,
# Bra, Bra extender, Braces, Cardigan, Coat, Costumes, Dress,
# Dungarees, Earring, Felt hat, Hat/beanie, Hat/brim, Hoodie, Jacket,
# Jumpsuit/Playsuit, Kids Underwear top, Leggings/Tights, Night gown,
# Nipple covers, Polo shirt, Pyjama bottom, Sandals, Shirt, Shorts,
# Skirt, Sneakers, Socks, Sunglasses, Sweater, Swimsuit, Swimwear bottom,
# T-shirt, Top, Trousers, Underwear body, Underwear bottom,
# Underwear Tights, Unknown, Vest top

# 👉 STUDENTS: change this to any of the above strings
SELECTED_PRODUCT_TYPE = "Vest top"

# Keep only the rows of the chosen category; .copy() gives an independent
# frame so later column assignments do not touch df.
_is_selected = df["product_type_name"].eq(SELECTED_PRODUCT_TYPE)
df_sub = df.loc[_is_selected].copy()

print(f"[INFO] filtered to product_type_name == '{SELECTED_PRODUCT_TYPE}'")
print("[INFO] subset shape:", df_sub.shape)
_n_articles = df_sub["article_id"].nunique()
print("[INFO] unique articles in subset:", _n_articles)


In [None]:
# Cell 3 — feature engineering on the subset
# Lag features: demand of the same article 1, 2 and 3 rows earlier.
# NOTE(review): a row offset equals a month offset only if every article has
# exactly one row per consecutive month — TODO confirm against the data.
lag_cols = ["lag_m1", "lag_m2", "lag_m3"]
for _k, _col in enumerate(lag_cols, start=1):
    df_sub[_col] = df_sub.groupby("article_id")["demand"].shift(_k)

# ma_3: mean of the (up to) three previous demands of the SAME article.
# It is computed as the row-wise mean of the per-article lags, skipping the
# missing ones, which is what shift(1).rolling(3, min_periods=1).mean() yields
# inside one article. The previous version rolled over the already-ungrouped
# shifted Series (so the window crossed article boundaries) and then called
# reset_index(drop=True), which replaced df_sub's row labels with 0..n-1 and
# made the column assignment land on the wrong rows.
# Must run BEFORE fillna(0) so that missing lags are skipped, not averaged as 0.
df_sub["ma_3"] = df_sub[lag_cols].mean(axis=1, skipna=True)

# Rows without enough history get 0 for every history-based feature.
feature_cols = lag_cols + ["ma_3"]
df_sub[feature_cols] = df_sub[feature_cols].fillna(0)

# time-based split in this subset: the latest month is the test set, every
# earlier month is training data. Rows whose month_ts is NaT fall in neither.
months_sub = np.sort(df_sub["month_ts"].dropna().unique())
if len(months_sub) < 2:
    raise ValueError(
        "Need at least two distinct months in the subset for a time-based "
        f"split, found {len(months_sub)}."
    )
train = df_sub[df_sub["month_ts"] < months_sub[-1]].copy()
test  = df_sub[df_sub["month_ts"] == months_sub[-1]].copy()

print("[INFO] train rows:", len(train), "| test rows:", len(test))


In [None]:
# Cell 4 — Model A: baseline + month dummies
# Features: standardized mean_price plus one-hot index_group_name and month_num.
modelA_num = ["mean_price"]
modelA_cat = ["index_group_name", "month_num"]

# columns=modelA_cat forces one-hot encoding of every categorical feature.
# Without it get_dummies only encodes object/string/category columns, so the
# integer month_num stayed a single unscaled numeric column instead of
# becoming month dummies. dtype=float keeps the design matrix purely numeric.
train_A = pd.get_dummies(
    train[modelA_num + modelA_cat], columns=modelA_cat, drop_first=False, dtype=float
)
test_A = pd.get_dummies(
    test[modelA_num + modelA_cat], columns=modelA_cat, drop_first=False, dtype=float
)
# Give the test matrix exactly the training columns: levels missing from the
# test rows become all-zero columns, levels unseen in training are dropped.
train_A, test_A = train_A.align(test_A, join="left", axis=1, fill_value=0)

# Standardize the numeric columns with statistics from the training rows only.
scaler_A = StandardScaler()
train_A[modelA_num] = scaler_A.fit_transform(train_A[modelA_num])
test_A[modelA_num]  = scaler_A.transform(test_A[modelA_num])

X_tr_A = train_A.to_numpy()
X_te_A = test_A.to_numpy()
# Targets are shared by the later model cells.
y_tr = train["demand"].to_numpy()
y_te = test["demand"].to_numpy()

# Lasso with the regularization strength chosen by 3-fold cross-validation.
model_A = LassoCV(cv=3, max_iter=5000, random_state=0, n_jobs=-1).fit(X_tr_A, y_tr)
pred_A = model_A.predict(X_te_A)

# Hold-out metrics on the test month.
r2_A = r2_score(y_te, pred_A)
rmse_A = mean_squared_error(y_te, pred_A) ** 0.5
mae_A = mean_absolute_error(y_te, pred_A)
print(f"Model A (baseline + month) — R²={r2_A:.3f}, RMSE={rmse_A:.2f}, MAE={mae_A:.2f}")


In [None]:
# Cell 5 — Model B: add lags, MA, price_change + month dummies
# Same pipeline as Model A with the demand-history and price-change features
# added to the numeric block. Reuses y_tr / y_te from the Model A cell.
modelB_num = [
    "mean_price",
    "price_change",
    "lag_m1", "lag_m2", "lag_m3",
    "ma_3",
]
modelB_cat = ["index_group_name", "month_num"]

# columns=modelB_cat forces one-hot encoding of every categorical feature.
# Without it get_dummies only encodes object/string/category columns, so the
# integer month_num stayed a single unscaled numeric column instead of
# becoming month dummies. dtype=float keeps the design matrix purely numeric.
train_B = pd.get_dummies(
    train[modelB_num + modelB_cat], columns=modelB_cat, drop_first=False, dtype=float
)
test_B = pd.get_dummies(
    test[modelB_num + modelB_cat], columns=modelB_cat, drop_first=False, dtype=float
)
# Give the test matrix exactly the training columns: levels missing from the
# test rows become all-zero columns, levels unseen in training are dropped.
train_B, test_B = train_B.align(test_B, join="left", axis=1, fill_value=0)

# Standardize the numeric columns with statistics from the training rows only.
# NOTE(review): assumes price_change has no missing values — TODO confirm.
scaler_B = StandardScaler()
train_B[modelB_num] = scaler_B.fit_transform(train_B[modelB_num])
test_B[modelB_num]  = scaler_B.transform(test_B[modelB_num])

X_tr_B = train_B.to_numpy()
X_te_B = test_B.to_numpy()

# Lasso with the regularization strength chosen by 3-fold cross-validation.
model_B = LassoCV(cv=3, max_iter=5000, random_state=0, n_jobs=-1).fit(X_tr_B, y_tr)
pred_B = model_B.predict(X_te_B)

# Hold-out metrics on the test month.
r2_B = r2_score(y_te, pred_B)
rmse_B = mean_squared_error(y_te, pred_B) ** 0.5
mae_B = mean_absolute_error(y_te, pred_B)
print(f"Model B (+ lags / MA / price_change + month) — R²={r2_B:.3f}, RMSE={rmse_B:.2f}, MAE={mae_B:.2f}")


In [None]:
# Cell 6 — Model C: rich numeric interactions + month
# Expands the (already standardized) Model B numeric columns with all pairwise
# products, then appends them to the remaining Model B columns.
poly = PolynomialFeatures(degree=2, interaction_only=True, include_bias=False)
poly_tr = poly.fit_transform(train_B[modelB_num])
poly_te = poly.transform(test_B[modelB_num])

# Every Model B column that is not part of the numeric block.
_other_tr = train_B.loc[:, ~train_B.columns.isin(modelB_num)].to_numpy()
_other_te = test_B.loc[:, ~test_B.columns.isin(modelB_num)].to_numpy()

# Column-wise stacking: non-numeric block first, polynomial block second.
X_tr_C = np.concatenate([_other_tr, poly_tr], axis=1)
X_te_C = np.concatenate([_other_te, poly_te], axis=1)

# Lasso with the regularization strength chosen by 3-fold cross-validation.
model_C = LassoCV(cv=3, max_iter=6000, random_state=0, n_jobs=-1)
model_C.fit(X_tr_C, y_tr)
pred_C = model_C.predict(X_te_C)

# Hold-out metrics on the test month.
r2_C = r2_score(y_te, pred_C)
mae_C = mean_absolute_error(y_te, pred_C)
rmse_C = mean_squared_error(y_te, pred_C) ** 0.5
print(f"Model C (rich interactions + month) — R²={r2_C:.3f}, RMSE={rmse_C:.2f}, MAE={mae_C:.2f}")


In [None]:
# Cell 7 — summary
# One row per model, with the hold-out metrics computed in the model cells.
_summary_rows = [
    ("Model A: baseline + month", r2_A, rmse_A, mae_A),
    ("Model B: + lags / MA / price_change + month", r2_B, rmse_B, mae_B),
    ("Model C: + interactions + month", r2_C, rmse_C, mae_C),
]
summary = pd.DataFrame(_summary_rows, columns=["model", "R2", "RMSE", "MAE"])

print(f"\n[SUMMARY for product_type_name == '{SELECTED_PRODUCT_TYPE}']")
print(summary)
