# HMData Pedagogical LASSO Notebook

This notebook loads the wide HMData.csv from GitHub, filters to one product `name`, and fits 4 models:

1. Linear regression to predict **December** demand.
2. LASSO using the same features.
3. LASSO with simple feature engineering (lags, 3-month MA, price change).
4. LASSO with interaction terms between engineered features and the base features.

The goal is to keep the code readable for teaching.

In [None]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, LassoCV
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error

# Calendar month names in order; position + 1 is the month number.
MONTHS = [
    "January",
    "February",
    "March",
    "April",
    "May",
    "June",
    "July",
    "August",
    "September",
    "October",
    "November",
    "December",
]

def report(y_true, y_pred, label=""):
    """Print R², RMSE and MAE for one set of predictions on a single line.

    ``label`` is written verbatim in front of the metrics, so callers include
    any trailing space they want in the output. Nothing is returned.
    """
    r_squared = r2_score(y_true, y_pred)
    # RMSE is the square root of the mean squared error.
    root_mse = mean_squared_error(y_true, y_pred) ** 0.5
    abs_err = mean_absolute_error(y_true, y_pred)
    summary = f"{label}R²={r_squared:.3f}, RMSE={root_mse:.2f}, MAE={abs_err:.2f}"
    print(summary)

print("[INFO] Imports OK")

In [None]:
# Raw CSV served from the course GitHub repository.
GITHUB_URL = "https://raw.githubusercontent.com/ucla-anderson-SSAI/SSAI/main/HMData.csv"

df = pd.read_csv(GITHUB_URL)
print("[INFO] loaded:", df.shape)

# Turn boolean flags (real booleans or the strings "TRUE"/"FALSE") into 1/0
# so the dummy columns can be compared against 1 later on.
df = df.replace({True: 1, False: 0, "TRUE": 1, "FALSE": 0})

# Column names like "month_December" become plain "December";
# every other column keeps its name.
new_cols = [
    c.replace("month_", "") if c.startswith("month_") else c
    for c in df.columns
]
df.columns = new_cols

# The 'start' column is not used by any model, so remove it when it exists.
if "start" in df.columns:
    df = df.drop(columns=["start"])
    print("[INFO] Dropped 'start' column")

# Restrict the data to one product type (the 'name' column in this dataset).
SELECTED_NAME = "Vest top"  # change as needed
is_selected = df["name"] == SELECTED_NAME
df = df[is_selected].copy()
if df.empty:
    raise ValueError(f"No rows found for name == {SELECTED_NAME!r}")
print(f"[INFO] after name filter: {df.shape}")

# December demand is the prediction target, so Models 1 and 2 only use
# the rows whose December dummy is set.
is_december = df["December"] == 1
df_dec = df[is_december].copy()
if df_dec.empty:
    raise ValueError("No December rows found (December == 1)")
print("[INFO] December rows:", df_dec.shape)

## Model 1 & Model 2 (no feature engineering)

- Model 1: plain linear regression.
- Model 2: the same features, fitted with LASSO.

In [None]:
# Features = every column except identifiers, free text, the target, and the
# December flag (which is 1 on every row of df_dec, so it carries no signal).
drop_cols = ["id", "name", "demand", "December"]
X = df_dec.drop(columns=drop_cols, errors="ignore")
# Anything that cannot be parsed as a number becomes NaN and then 0.
X = X.apply(pd.to_numeric, errors="coerce")
X = X.fillna(0)
y = df_dec["demand"].astype(float)

# Fixed random_state keeps the 80/20 split reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)
print(f"[INFO] Train: {len(y_train)}, Test: {len(y_test)}")

# Model 1 — Linear Regression on the unscaled features
linreg = LinearRegression().fit(X_train, y_train)
y_pred1 = linreg.predict(X_test)
report(y_test, y_pred1, label="[Model 1 — Linear] ")

# Model 2 — LASSO
# The scaler is fitted on the training rows only and then applied to both
# splits; with_mean=False rescales each column without centering it.
scaler2 = StandardScaler(with_mean=False).fit(X_train)
Xtr2 = scaler2.transform(X_train)
Xte2 = scaler2.transform(X_test)

# LassoCV chooses the penalty strength by 3-fold cross-validation.
lasso2 = LassoCV(cv=3, max_iter=5000, n_jobs=-1, random_state=0).fit(Xtr2, y_train)
y_pred2 = lasso2.predict(Xte2)
report(y_test, y_pred2, label="[Model 2 — LASSO] ")

# (optional) show some coefficients, largest value first; strongly negative
# coefficients end up at the bottom and are not part of head(10).
coef2 = pd.Series(lasso2.coef_, index=X.columns)
coef2 = coef2.sort_values(ascending=False)
print("\n[Model 2] Top 10 coefficients:\n", coef2.head(10))

## Model 3 — with simple feature engineering
We go back to the full dataset (all months for that product) and create:

- lagged demand (1, 2, and 3 months)
- a 3-month moving average of previous demand
- the relative price change from the previous month

Then we filter to December again and run LASSO.

In [None]:
# Start from the product-filtered frame that still contains every month.
df_fe = df.copy()

# The month is stored as one dummy column per month name; recover a single
# month number from those dummies so rows can be ordered in time for the lags.
def infer_month(row):
    """Return the 1-based month number whose dummy column equals 1 in ``row``.

    Months are checked in the order of ``MONTHS`` and the first match wins.
    Month names that are not present in ``row`` are skipped. ``np.nan`` is
    returned when no month column is set to 1.
    """
    matching_numbers = (
        number
        for number, month in enumerate(MONTHS, start=1)
        if month in row and row[month] == 1
    )
    return next(matching_numbers, np.nan)

# Month number per row, recovered from the month dummy columns.
# NOTE(review): rows where no month dummy equals 1 get NaN here and are sorted
# to the end of their id, which would shift the lags — confirm every row has
# exactly one month flag set.
df_fe["month_num"] = df_fe.apply(infer_month, axis=1)

# Order rows chronologically within each id so that shift(k) means
# "k months earlier"; the fresh 0..n-1 index keeps later assignments aligned.
df_fe = df_fe.sort_values(["id", "month_num"]).reset_index(drop=True)

# Lagged demands (NaN where an id has no row that far back)
df_fe["lag_demand_1"] = df_fe.groupby("id")["demand"].shift(1)
df_fe["lag_demand_2"] = df_fe.groupby("id")["demand"].shift(2)
df_fe["lag_demand_3"] = df_fe.groupby("id")["demand"].shift(3)

# 3-month moving average of previous demands.
# Both the shift and the rolling window must run inside each id group:
# groupby(...).shift() returns an ordinary Series, so chaining .rolling()
# directly onto it would let the window reach back into the previous id's
# rows for the first few months of every id.
df_fe["ma3_demand"] = df_fe.groupby("id")["demand"].transform(
    lambda s: s.shift(1).rolling(3, min_periods=1).mean()
)

# Relative price change versus the previous month of the same id.
# A previous price of 0 produces +/-inf, which fillna(0) would not remove and
# which the scaler / LASSO cannot handle, so map it to NaN first.
df_fe["price_change"] = (
    df_fe.groupby("id")["price"]
         .pct_change()
         .replace([np.inf, -np.inf], np.nan)
)

# fill engineered NaNs with 0 (no history available -> neutral value)
eng_cols = ["lag_demand_1","lag_demand_2","lag_demand_3","ma3_demand","price_change"]
df_fe[eng_cols] = df_fe[eng_cols].fillna(0)

# Only the December rows are modelled; they now carry the engineered columns.
is_december_fe = df_fe["December"] == 1
df_dec_fe = df_fe[is_december_fe].copy()
print("[INFO] December with FE:", df_dec_fe.shape)

# Same exclusions as before, plus the helper column used only for sorting.
drop_cols_fe = ["id","name","demand","December","month_num"]
X3 = df_dec_fe.drop(columns=drop_cols_fe, errors="ignore")
# Anything that cannot be parsed as a number becomes NaN and then 0.
X3 = X3.apply(pd.to_numeric, errors="coerce")
X3 = X3.fillna(0)
y3 = df_dec_fe["demand"].astype(float)

# Fixed random_state keeps the 80/20 split reproducible between runs.
X3_train, X3_test, y3_train, y3_test = train_test_split(X3, y3, test_size=0.2, random_state=0)

# Scaler is fitted on the training rows only, then applied to both splits.
scaler3 = StandardScaler(with_mean=False).fit(X3_train)
X3tr = scaler3.transform(X3_train)
X3te = scaler3.transform(X3_test)

# LassoCV chooses the penalty strength by 3-fold cross-validation.
lasso3 = LassoCV(cv=3, max_iter=6000, n_jobs=-1, random_state=0).fit(X3tr, y3_train)
y3_pred = lasso3.predict(X3te)
report(y3_test, y3_pred, label="[Model 3 — FE + LASSO] ")

# Coefficients sorted from largest to smallest value; head(10) therefore
# shows the largest positive ones.
coef3 = pd.Series(lasso3.coef_, index=X3.columns)
coef3 = coef3.sort_values(ascending=False)
print("\n[Model 3] Top 10 coefficients:\n", coef3.head(10))

## Model 4 — FE + interaction terms
We create interaction terms between the engineered features and the base features we already had, then run LASSO again.

In [None]:
# Start from the December rows that already contain the engineered features.
df_dec_int = df_dec_fe.copy()

engineered = ["lag_demand_1","lag_demand_2","lag_demand_3","ma3_demand","price_change"]
# Base features = every Model 3 feature that is not itself engineered.
base_feats = [c for c in X3.columns if c not in engineered]

# Coerce the base columns to numbers once, the same way X3 was built, so a
# text column yields NaN products (turned into 0 when X4 is built) instead of
# raising a TypeError during the multiplication.
base_numeric = df_dec_int[base_feats].apply(pd.to_numeric, errors="coerce")

# One product column per (engineered feature, base feature) pair,
# named "<engineered>__<base>".
interactions = {
    f"{fe}__{base}": df_dec_int[fe] * base_numeric[base]
    for fe in engineered
    for base in base_feats
}

# Attach all interaction columns in a single concat; inserting them one at a
# time fragments the DataFrame and triggers pandas' PerformanceWarning.
df_dec_int = pd.concat(
    [df_dec_int, pd.DataFrame(interactions, index=df_dec_int.index)],
    axis=1,
)

# Same exclusions as Model 3; everything left (base, engineered and
# interaction columns) is used as a feature.
drop_cols_int = ["id","name","demand","December","month_num"]
X4 = df_dec_int.drop(columns=drop_cols_int, errors="ignore")
# Anything that cannot be parsed as a number becomes NaN and then 0.
X4 = X4.apply(pd.to_numeric, errors="coerce")
X4 = X4.fillna(0)
y4 = df_dec_int["demand"].astype(float)

# Fixed random_state keeps the 80/20 split reproducible between runs.
X4_train, X4_test, y4_train, y4_test = train_test_split(X4, y4, test_size=0.2, random_state=0)

# Scaler is fitted on the training rows only, then applied to both splits.
scaler4 = StandardScaler(with_mean=False).fit(X4_train)
X4tr = scaler4.transform(X4_train)
X4te = scaler4.transform(X4_test)

# LassoCV chooses the penalty strength by 3-fold cross-validation.
lasso4 = LassoCV(cv=3, max_iter=8000, n_jobs=-1, random_state=0).fit(X4tr, y4_train)
y4_pred = lasso4.predict(X4te)
report(y4_test, y4_pred, label="[Model 4 — FE + Interactions + LASSO] ")

# Coefficients sorted from largest to smallest value; head(10) therefore
# shows the largest positive ones.
coef4 = pd.Series(lasso4.coef_, index=X4.columns)
coef4 = coef4.sort_values(ascending=False)
print("\n[Model 4] Top 10 coefficients:\n", coef4.head(10))