In [None]:
# Cell 1: imports + load
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.linear_model import LassoCV
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
from sentence_transformers import SentenceTransformer

# Source CSV for the demand panel.
URL = "https://raw.githubusercontent.com/ucla-anderson-SSAI/SSAI/main/HandMSales_fullspan.csv"

# Load, parse the month column, and order rows by article then month.
# errors="coerce" turns any value that does not match the format into NaT
# instead of raising.
df = (
    pd.read_csv(URL)
      .assign(
          month_ts=lambda raw: pd.to_datetime(
              raw["month_ts"], format="%m/%d/%y", errors="coerce"
          )
      )
      .sort_values(["article_id", "month_ts"])
      .reset_index(drop=True)
)
print("[INFO] raw df:", df.shape)


In [None]:
# Cell 2: shared feature engineering
# Every history feature below is computed within a single article and uses only
# rows strictly before the current one. This relies on df already being sorted
# by (article_id, month_ts).
g = df.groupby("article_id")["demand"]

# lags: demand 1, 2 and 3 rows back within the same article
df["lag_m1"] = g.shift(1)
df["lag_m2"] = g.shift(2)
df["lag_m3"] = g.shift(3)

# 3-month moving average of past demand
# The rolling window has to be grouped as well: rolling directly over the
# shifted Series ignores article boundaries, so an article's first rows would
# average in the tail of whichever article precedes it in the frame.
df["ma_3"] = (
    df["lag_m1"]
      .groupby(df["article_id"])
      .rolling(3, min_periods=1)
      .mean()
      .reset_index(level=0, drop=True)  # drop the article_id level to realign with df's index
)

# long-run avg demand up to last month
# cum_sum is the running total through the previous row of the same article;
# the shift is grouped so the helper never carries another article's total.
df["cum_sum"] = g.cumsum().groupby(df["article_id"]).shift(1)
df["cum_cnt"] = g.cumcount()  # number of earlier rows for this article
# A count of 0 (first row of an article) becomes NaN so the division yields NaN
# rather than dividing by zero.
df["mean_prev"] = df["cum_sum"] / df["cum_cnt"].replace(0, np.nan)

# fill early rows: rows without enough history get 0 for every history feature
df[["lag_m1","lag_m2","lag_m3","ma_3","mean_prev"]] = (
    df[["lag_m1","lag_m2","lag_m3","ma_3","mean_prev"]].fillna(0)
)

# drop helpers
df = df.drop(columns=["cum_sum","cum_cnt"])


In [None]:
# Cell 3: cold-start split (first row per article is test)
# Rank each article's month_ts values (ties broken by row order); the row with
# rank 1 is flagged as that article's first observation.
df["is_first_for_article"] = (
    df.groupby("article_id")["month_ts"].rank(method="first") == 1
)

# Flagged rows form the test set; every other row goes to train.
test = df.loc[df["is_first_for_article"]].copy()
train = df.loc[~df["is_first_for_article"]].copy()

print("[INFO] cold-start split")
print(f"train rows: {len(train)} | test rows: {len(test)}")
print(f"train articles: {train['article_id'].nunique()} | test articles: {test['article_id'].nunique()}")


In [None]:
# Cell 4: base feature lists + scaling
# Continuous inputs that get standardized.
base_numeric = [
    "mean_price", "price_change",
    "lag_m1", "lag_m2", "lag_m3",
    "ma_3", "mean_prev",
]

# Columns prefixed month_ or channel_ are passed through unscaled; month_ts is
# the timestamp column and is excluded.
dummy_cols = [
    c for c in df.columns
    if c.startswith(("month_", "channel_")) and c != "month_ts"
]

# Fit the scaler on train only, then apply the same statistics to both splits.
scaler = StandardScaler().fit(train[base_numeric])
train_s = train.copy()
test_s  = test.copy()
train_s[base_numeric] = scaler.transform(train[base_numeric])
test_s[base_numeric]  = scaler.transform(test[base_numeric])

# Design matrices: scaled numeric columns first, then the dummy columns.
X_tr_base = train_s.loc[:, base_numeric + dummy_cols].to_numpy()
X_te_base = test_s.loc[:, base_numeric + dummy_cols].to_numpy()

# Targets come from the unscaled frames.
y_tr = train.loc[:, "demand"].to_numpy()
y_te = test.loc[:, "demand"].to_numpy()

print("[INFO] base shapes:", X_tr_base.shape, X_te_base.shape)


In [None]:
# Cell 5: build embeddings from ALL text fields
target_dim = 128
text_cols = [
    "detail_desc",
    "product_type_name",
    "graphical_appearance_name",
    "colour_group_name",
    "index_group_name",
    "garment_group_name",
]

embed_model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")

# One row per article, with missing values replaced by empty strings so the
# join below always sees strings.
uniq = (
    df[["article_id"] + text_cols]
      .drop_duplicates(subset="article_id")
      .fillna("")
)
# Space-join the text fields of each article into a single string.
uniq["combined_text"] = uniq[text_cols].apply(" ".join, axis=1)

# Encode every article once.
# NOTE(review): only the first target_dim columns of each vector are kept —
# confirm that plain truncation (rather than e.g. PCA) is intended.
emb = embed_model.encode(
    uniq["combined_text"].tolist(),
    show_progress_bar=False,
)[:, :target_dim]

# Lookup table: article_id followed by emb_0 .. emb_{target_dim-1}.
emb_cols = [f"emb_{i}" for i in range(target_dim)]
emb_df = (
    pd.DataFrame(emb, columns=emb_cols)
      .assign(article_id=uniq["article_id"].to_numpy())
      .loc[:, ["article_id"] + emb_cols]
)

# Left-merge keeps the row order of the scaled frames, so the embedding
# matrices stay aligned with y_tr / y_te.
trainE = pd.merge(train_s, emb_df, how="left", on="article_id")
testE  = pd.merge(test_s, emb_df, how="left", on="article_id")

X_tr_emb = trainE.loc[:, emb_cols].to_numpy()
X_te_emb = testE.loc[:, emb_cols].to_numpy()
print("[INFO] embeddings shapes:", X_tr_emb.shape, X_te_emb.shape)


In [None]:
# Cell 6: Model 1 — numeric only
# Cross-validated Lasso on the scaled numeric columns plus the dummy columns.
model1 = LassoCV(
    cv=3,
    max_iter=5000,
    random_state=0,
    n_jobs=-1,
).fit(X_tr_base, y_tr)
pred1 = model1.predict(X_te_base)

# Hold-out metrics on the test split.
mae_1 = mean_absolute_error(y_te, pred1)
rmse_1 = np.sqrt(mean_squared_error(y_te, pred1))
r2_1 = r2_score(y_te, pred1)
print(f"Model 1 (numeric) -> R2={r2_1:.3f} RMSE={rmse_1:.2f} MAE={mae_1:.2f}")


In [None]:
# Cell 7: Model 2 — numeric + interactions
# Pairwise interaction terms are generated from the scaled numeric columns
# only; the dummy columns are passed through unchanged.
poly = PolynomialFeatures(
    degree=2,
    interaction_only=True,
    include_bias=False,
).fit(train_s[base_numeric])
poly_tr = poly.transform(train_s[base_numeric])
poly_te = poly.transform(test_s[base_numeric])

# Column layout: dummy columns first, then the polynomial block.
X_tr_2 = np.concatenate([train_s[dummy_cols].to_numpy(), poly_tr], axis=1)
X_te_2 = np.concatenate([test_s[dummy_cols].to_numpy(), poly_te], axis=1)

model2 = LassoCV(
    cv=3,
    max_iter=5000,
    random_state=0,
    n_jobs=-1,
).fit(X_tr_2, y_tr)
pred2 = model2.predict(X_te_2)

# Hold-out metrics on the test split.
mae_2 = mean_absolute_error(y_te, pred2)
rmse_2 = np.sqrt(mean_squared_error(y_te, pred2))
r2_2 = r2_score(y_te, pred2)
print(f"Model 2 (numeric + interactions) -> R2={r2_2:.3f} RMSE={rmse_2:.2f} MAE={mae_2:.2f}")


In [None]:
# Cell 8: Model 3 — embeddings only
# Same estimator settings as the other models, fed only the text-embedding columns.
model3 = LassoCV(
    cv=3,
    max_iter=5000,
    random_state=0,
    n_jobs=-1,
).fit(X_tr_emb, y_tr)
pred3 = model3.predict(X_te_emb)

# Hold-out metrics on the test split.
mae_3 = mean_absolute_error(y_te, pred3)
rmse_3 = np.sqrt(mean_squared_error(y_te, pred3))
r2_3 = r2_score(y_te, pred3)
print(f"Model 3 (embeddings only) -> R2={r2_3:.3f} RMSE={rmse_3:.2f} MAE={mae_3:.2f}")


In [None]:
# Cell 9: Model 4 — numeric + embeddings
# Append the embedding columns to the right of the base design matrix.
X_tr_4 = np.concatenate([X_tr_base, X_tr_emb], axis=1)
X_te_4 = np.concatenate([X_te_base, X_te_emb], axis=1)

model4 = LassoCV(
    cv=3,
    max_iter=5000,
    random_state=0,
    n_jobs=-1,
).fit(X_tr_4, y_tr)
pred4 = model4.predict(X_te_4)

# Hold-out metrics on the test split.
mae_4 = mean_absolute_error(y_te, pred4)
rmse_4 = np.sqrt(mean_squared_error(y_te, pred4))
r2_4 = r2_score(y_te, pred4)
print(f"Model 4 (numeric + embeddings) -> R2={r2_4:.3f} RMSE={rmse_4:.2f} MAE={mae_4:.2f}")


In [None]:
# Cell 10: Model 5 — numeric + interactions + embeddings
# Append the embedding columns to the right of the interaction design matrix.
X_tr_5 = np.concatenate([X_tr_2, X_tr_emb], axis=1)
X_te_5 = np.concatenate([X_te_2, X_te_emb], axis=1)

model5 = LassoCV(
    cv=3,
    max_iter=5000,
    random_state=0,
    n_jobs=-1,
).fit(X_tr_5, y_tr)
pred5 = model5.predict(X_te_5)

# Hold-out metrics on the test split.
mae_5 = mean_absolute_error(y_te, pred5)
rmse_5 = np.sqrt(mean_squared_error(y_te, pred5))
r2_5 = r2_score(y_te, pred5)
print(f"Model 5 (numeric + interactions + embeddings) -> R2={r2_5:.3f} RMSE={rmse_5:.2f} MAE={mae_5:.2f}")


In [None]:
# Cell 11: summary
# One record per model: label followed by its test-set R2, RMSE and MAE.
summary = pd.DataFrame(
    [
        ("Model 1: numeric", r2_1, rmse_1, mae_1),
        ("Model 2: numeric+interactions", r2_2, rmse_2, mae_2),
        ("Model 3: embeddings only", r2_3, rmse_3, mae_3),
        ("Model 4: numeric+embeddings", r2_4, rmse_4, mae_4),
        ("Model 5: numeric+interactions+embeddings", r2_5, rmse_5, mae_5),
    ],
    columns=["model", "R2", "RMSE", "MAE"],
)
print(summary)
