# HandM Sales — Numeric vs Embeddings vs Interactions (with scaling)

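This version pulls the CSV from GitHub and **standardizes only the continuous features**
(`mean_price`, `lag_m1`, `price_change`, `age_m`), leaving the month and channel dummy columns unscaled, so that the LASSO penalty is applied to continuous features on a comparable scale.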


In [None]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LassoCV
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
from sklearn.preprocessing import StandardScaler
from sentence_transformers import SentenceTransformer
import matplotlib.pyplot as plt

# Location of the sales panel CSV (read directly over HTTPS).
GITHUB_CSV_URL = "https://raw.githubusercontent.com/ucla-anderson-SSAI/SSAI/main/HandMSales_final.csv"
df = pd.read_csv(GITHUB_CSV_URL, parse_dates=["month_ts"])

# Temporal hold-out: the latest month is the test set, every earlier month trains.
months = np.sort(df["month_ts"].unique())
last_month = months[-1]
is_train = df["month_ts"] < last_month
is_test = df["month_ts"] == last_month
train = df.loc[is_train].copy()
test = df.loc[is_test].copy()

# The training target is log1p(demand); the test target is kept on the raw scale.
y_tr = np.log1p(train["demand"].to_numpy())
y_te = test["demand"].to_numpy()

# Continuous features get standardized; month_*/channel_* dummies are used as-is.
# `month_ts` is the timestamp column, not a dummy, so it is excluded explicitly.
base_numeric = ["mean_price", "lag_m1", "price_change", "age_m"]
dummy_cols = [
    col
    for col in df.columns
    if col.startswith("channel_") or (col.startswith("month_") and col != "month_ts")
]
all_feature_cols = base_numeric + dummy_cols

# Scaling statistics come from the training rows only and are reused on test.
scaler = StandardScaler().fit(train[base_numeric])
train_scaled, test_scaled = train.copy(), test.copy()
train_scaled[base_numeric] = scaler.transform(train[base_numeric])
test_scaled[base_numeric] = scaler.transform(test[base_numeric])

print("[INFO] Train rows:", len(train_scaled), "| Test rows:", len(test_scaled))


In [None]:
# Model A — baseline: scaled continuous features plus month/channel dummies
X_tr_A = train_scaled.loc[:, all_feature_cols].to_numpy()
X_te_A = test_scaled.loc[:, all_feature_cols].to_numpy()

model_A = LassoCV(cv=3, random_state=0)
model_A.fit(X_tr_A, y_tr)

# The model is fit on log1p(demand); invert with expm1 before scoring
# against the raw-scale test target.
log_pred_A = model_A.predict(X_te_A)
pred_A = np.expm1(log_pred_A)

mse_A = mean_squared_error(y_te, pred_A)
r2_A = r2_score(y_te, pred_A)
rmse_A = np.sqrt(mse_A)
mae_A = mean_absolute_error(y_te, pred_A)
print(f"Model A (numeric) — R²={r2_A:.3f}, RMSE={rmse_A:.2f}, MAE={mae_A:.2f}")


In [None]:
# Build embeddings: encode each article's description once, then attach the
# resulting vector to every train/test row of that article.
embed_model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")

# One row per article_id (first occurrence kept); missing values become "".
uniq = df[["article_id", "detail_desc"]].drop_duplicates(subset="article_id")
uniq = uniq.fillna("")
descriptions = uniq["detail_desc"].tolist()
emb = embed_model.encode(descriptions, show_progress_bar=False)

# One column per embedding dimension, keyed by article_id for the join.
emb_cols = [f"emb_{i}" for i in range(emb.shape[1])]
emb_df = pd.DataFrame(emb, columns=emb_cols)
emb_df.insert(0, "article_id", uniq["article_id"].values)

# Left joins keep every sales row of the scaled frames.
trainE = pd.merge(train_scaled, emb_df, on="article_id", how="left")
testE = pd.merge(test_scaled, emb_df, on="article_id", how="left")
print("[INFO] Embeddings merged:", trainE.shape, testE.shape)


In [None]:
# Model B — Model A's features plus the description-embedding columns
num_plus_emb = all_feature_cols + list(emb_cols)
X_tr_B, X_te_B = (frame[num_plus_emb].to_numpy() for frame in (trainE, testE))

model_B = LassoCV(cv=3, random_state=0)
model_B.fit(X_tr_B, y_tr)

# Predictions come out on the log1p scale; map back to raw demand to score.
log_pred_B = model_B.predict(X_te_B)
pred_B = np.expm1(log_pred_B)

mse_B = mean_squared_error(y_te, pred_B)
r2_B = r2_score(y_te, pred_B)
rmse_B = np.sqrt(mse_B)
mae_B = mean_absolute_error(y_te, pred_B)
print(f"Model B (numeric + emb) — R²={r2_B:.3f}, RMSE={rmse_B:.2f}, MAE={mae_B:.2f}")


In [None]:
# Model C — numeric + embeddings + (age_m × embedding) interactions
def _with_age_interactions(frame, emb_cols):
    """Return *frame* with one ``age_x_<emb col>`` column per embedding column.

    Each new column equals ``frame["age_m"] * frame[<emb col>]``.

    All interaction columns are computed in a single broadcasted array
    operation and attached with one ``pd.concat``. Inserting them one at a
    time fragments the DataFrame and can trigger pandas' PerformanceWarning.
    Any existing ``age_x_*`` columns for these embeddings are dropped first,
    so re-running the cell does not create duplicate columns.
    """
    names = [f"age_x_{c}" for c in emb_cols]
    # (n_rows, n_emb) * (n_rows, 1): broadcasts age_m across every embedding column.
    values = frame[emb_cols].to_numpy() * frame[["age_m"]].to_numpy()
    interactions = pd.DataFrame(values, columns=names, index=frame.index)
    return pd.concat([frame.drop(columns=names, errors="ignore"), interactions], axis=1)


trainE = _with_age_interactions(trainE, emb_cols)
testE = _with_age_interactions(testE, emb_cols)

# Explicit column list (same construction as Model B plus the interactions),
# rather than a prefix scan that could pick up unrelated columns.
int_cols = [f"age_x_{c}" for c in emb_cols]
num_emb_int = all_feature_cols + list(emb_cols) + int_cols
X_tr_C = trainE[num_emb_int].to_numpy()
X_te_C = testE[num_emb_int].to_numpy()

model_C = LassoCV(cv=3, random_state=0).fit(X_tr_C, y_tr)

# The model is fit on log1p(demand); invert with expm1 before scoring.
pred_C = np.expm1(model_C.predict(X_te_C))
r2_C = r2_score(y_te, pred_C)
rmse_C = np.sqrt(mean_squared_error(y_te, pred_C))
mae_C = mean_absolute_error(y_te, pred_C)
print(f"Model C (numeric + emb + interactions) — R²={r2_C:.3f}, RMSE={rmse_C:.2f}, MAE={mae_C:.2f}")


In [None]:
# Bar chart comparing test-set R² across the three feature sets
labels = ["A numeric", "B +emb", "C +emb×age"]
scores = [r2_A, r2_B, r2_C]

fig, ax = plt.subplots(figsize=(5, 3))
ax.bar(labels, scores)
ax.set_ylabel("R² (test)")
ax.set_title("Embeddings Ablation (GitHub CSV, scaled numeric)")
plt.show()
