# HandM Sales — Numeric vs Embeddings vs Interactions (full-span, no age)

This notebook:
1. Pulls the prefiltered CSV from GitHub (`HandMSales_fullspan.csv`), which already keeps only products present in **all** months.
2. Trains three models:
   - **Model A**: numeric only (mean_price, lag_m1, price_change + month/channel dummies)
   - **Model B**: numeric + text embeddings from `detail_desc`
   - **Model C**: numeric + embeddings + interactions of embeddings with `price_change`

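We standardize only the continuous numeric features (`mean_price`, `lag_m1`, `price_change`), fitting the scaler on the training months; the month/channel dummies are left unscaled. Every model is a `LassoCV` fit on `log1p(demand)`, and the test metrics are computed on the last month after mapping predictions back with `expm1`.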

In [None]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LassoCV
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
from sklearn.preprocessing import StandardScaler
from sentence_transformers import SentenceTransformer
import matplotlib.pyplot as plt

# --- Load ---------------------------------------------------------------
GITHUB_CSV_URL = (
    "https://raw.githubusercontent.com/ucla-anderson-SSAI/SSAI/main/"
    "HandMSales_fullspan.csv"
)
print("[INFO] Loading CSV from GitHub…")
df = pd.read_csv(GITHUB_CSV_URL, parse_dates=["month_ts"])
print("[INFO] Data shape:", df.shape)

# --- Time-based split -----------------------------------------------------
# The most recent month is held out as the test set; all earlier months train.
months = np.sort(df["month_ts"].unique())
last_month = months[-1]
is_train = df["month_ts"] < last_month
is_test = df["month_ts"] == last_month
train = df.loc[is_train].copy()
test = df.loc[is_test].copy()
print("[INFO] Train rows:", len(train), "| Test rows:", len(test))

# --- Targets ----------------------------------------------------------------
# Training target is log1p(demand); the test target is kept on the raw scale.
y_tr = np.log1p(train["demand"].to_numpy())
y_te = test["demand"].to_numpy()

# --- Feature lists (no age_m) -----------------------------------------------
base_numeric = ["mean_price", "lag_m1", "price_change"]
dummy_cols = []
for col in df.columns:
    # `month_ts` is the timestamp itself, not a month dummy.
    is_month_dummy = col.startswith("month_") and col != "month_ts"
    if is_month_dummy or col.startswith("channel_"):
        dummy_cols.append(col)
all_feature_cols = [*base_numeric, *dummy_cols]

# --- Scaling ------------------------------------------------------------------
# Standardize the continuous columns only; statistics come from the training rows.
scaler = StandardScaler().fit(train[base_numeric])
train_s = train.copy()
test_s = test.copy()
train_s[base_numeric] = scaler.transform(train[base_numeric])
test_s[base_numeric] = scaler.transform(test[base_numeric])


## Model A — numeric only

In [None]:
# Design matrices: scaled continuous features plus the month/channel dummies.
X_tr_A, X_te_A = (
    frame[all_feature_cols].to_numpy() for frame in (train_s, test_s)
)

# Cross-validated Lasso fit on the log1p target.
model_A = LassoCV(cv=3, random_state=0)
model_A.fit(X_tr_A, y_tr)

# Undo the log1p transform so predictions are comparable with raw test demand.
log_pred_A = model_A.predict(X_te_A)
pred_A = np.expm1(log_pred_A)

r2_A = r2_score(y_te, pred_A)
mse_A = mean_squared_error(y_te, pred_A)
rmse_A = np.sqrt(mse_A)
mae_A = mean_absolute_error(y_te, pred_A)
print(f"Model A (numeric) — R²={r2_A:.3f}, RMSE={rmse_A:.2f}, MAE={mae_A:.2f}")


## Build embeddings from `detail_desc`

In [None]:
# Encode each article's description once, then attach the vectors to every
# train/test row through a merge on `article_id`.
embed_model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')

# One row per article. Only the text column is null-filled: filling the whole
# frame would also turn a missing `article_id` into "" and change the dtype of
# the merge key.
uniq = df[["article_id", "detail_desc"]].drop_duplicates("article_id")
uniq = uniq.assign(detail_desc=uniq["detail_desc"].fillna(""))

emb = embed_model.encode(uniq["detail_desc"].tolist(), show_progress_bar=False)
emb_cols = [f"emb_{i}" for i in range(emb.shape[1])]
emb_df = pd.DataFrame(emb, columns=emb_cols)
emb_df.insert(0, "article_id", uniq["article_id"].values)

# y_tr / y_te were extracted before this merge, so the merged frames must keep
# exactly the rows (and order) of train_s / test_s. `validate` makes pandas
# raise if `emb_df` ever holds a duplicated article_id, which would otherwise
# silently duplicate rows and misalign features and targets.
trainE = train_s.merge(emb_df, on="article_id", how="left", validate="many_to_one")
testE  = test_s.merge(emb_df, on="article_id", how="left", validate="many_to_one")

# A left merge fills unmatched articles with NaN; fail here with a clear
# message instead of later inside the model fit.
assert not trainE[emb_cols].isna().any().any(), "train rows without an embedding"
assert not testE[emb_cols].isna().any().any(), "test rows without an embedding"
print('[INFO] Train with embeddings:', trainE.shape, '| Test with embeddings:', testE.shape)


## Model B — numeric + embeddings

In [None]:
# Feature set: Model A's columns followed by every embedding dimension.
num_plus_emb = [*all_feature_cols, *emb_cols]
X_tr_B, X_te_B = (
    frame[num_plus_emb].to_numpy() for frame in (trainE, testE)
)

# Same estimator settings as Model A so only the feature set differs.
model_B = LassoCV(cv=3, random_state=0)
model_B.fit(X_tr_B, y_tr)

# Back-transform from the log1p scale before scoring against raw demand.
log_pred_B = model_B.predict(X_te_B)
pred_B = np.expm1(log_pred_B)

r2_B = r2_score(y_te, pred_B)
mse_B = mean_squared_error(y_te, pred_B)
rmse_B = np.sqrt(mse_B)
mae_B = mean_absolute_error(y_te, pred_B)
print(f"Model B (numeric + emb) — R²={r2_B:.3f}, RMSE={rmse_B:.2f}, MAE={mae_B:.2f}")


## Model C — numeric + embeddings + interactions with price_change

In [None]:
# Interaction features: price_change × each embedding dimension.
# `price_change` in trainE/testE is the standardized column, because both
# frames are built from train_s/test_s.
interaction_cols = [f"pch_x_{c}" for c in emb_cols]


def _add_price_change_interactions(frame):
    """Return `frame` with one `pch_x_<embedding column>` column per embedding dim.

    All interaction columns are computed with a single broadcast multiply and
    attached with a single concat. Inserting them one column at a time
    fragments the frame and makes pandas emit a PerformanceWarning.
    Interaction columns already present (e.g. from an earlier run of this
    cell) are dropped first, so the result does not accumulate duplicates.
    """
    # Shape (n_rows, 1) so it broadcasts across all embedding columns.
    price_change = frame["price_change"].to_numpy()[:, None]
    interactions = pd.DataFrame(
        frame[emb_cols].to_numpy() * price_change,
        columns=interaction_cols,
        index=frame.index,
    )
    base = frame.drop(columns=interaction_cols, errors="ignore")
    return pd.concat([base, interactions], axis=1)


trainE = _add_price_change_interactions(trainE)
testE = _add_price_change_interactions(testE)

# Explicit column order: numeric/dummies, embeddings, then interactions.
num_emb_int = all_feature_cols + emb_cols + interaction_cols

X_tr_C = trainE[num_emb_int].to_numpy()
X_te_C = testE[num_emb_int].to_numpy()

# Same estimator settings as Models A and B; only the feature set changes.
model_C = LassoCV(cv=3, random_state=0).fit(X_tr_C, y_tr)
# Back-transform from the log1p scale before scoring against raw demand.
pred_C = np.expm1(model_C.predict(X_te_C))

r2_C = r2_score(y_te, pred_C)
rmse_C = np.sqrt(mean_squared_error(y_te, pred_C))
mae_C = mean_absolute_error(y_te, pred_C)
print(f"Model C (numeric + emb × price_change) — R²={r2_C:.3f}, RMSE={rmse_C:.2f}, MAE={mae_C:.2f}")


## Compare models

In [None]:
# Bar chart of test-set R² for the three feature sets.
labels = ["A numeric", "B +emb", "C +emb×price_change"]
scores = [r2_A, r2_B, r2_C]

fig, ax = plt.subplots(figsize=(5, 3))
ax.bar(labels, scores)
ax.set_ylabel("R² (test)")
ax.set_title("Embeddings ablation — full-span products")
plt.show()
