[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/tunnel-ai/way/blob/main/notebooks/02_02_exercise_open.ipynb)

# Module 2 — Supervised Learning (Regression)
## 02_02 — Constrained Decision-Making Workbook

**Target:** `transaction_loss_amount`  
**Dataset:** `generate_transaction_risk_dataset(seed=1955)` (canonical; do not modify)

### Your role in this notebook
You are not being graded on “how many models you try.” You are being graded on **two modeling decisions** you make and defend with evidence.

### Hard constraints (do not change)
1. Use the canonical generator with **seed = 1955**.
2. Use the provided split: `test_size=0.25, random_state=1955`.
3. Report **MAE, RMSE, and R²** on the validation set.
4. Your final solution must use **a single sklearn `Pipeline`** named `final_model`.
5. Include **one residual diagnostic plot** and interpret it.

### What you will submit
This notebook with:
- a completed **Decision Log**
- a `final_model` pipeline
- final metrics + residual plot


In [None]:
# Colab-first setup (run this cell first)
# If you're running locally, you can skip the git clone and just ensure the repo is on your PYTHONPATH.

!git clone https://github.com/tunnel-ai/way.git

import sys
sys.path.insert(0, "/content/way/src")

# Standard imports
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression, RidgeCV, LassoCV, ElasticNetCV
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score


In [None]:
from core.generators.transaction_risk_dgp import generate_transaction_risk_dataset

# Canonical dataset: the seed is fixed by the exercise rules.
data = generate_transaction_risk_dataset(seed=1955)

# The generator may hand back the frame directly or wrapped in a non-empty
# tuple (e.g. (df, meta)); in the tuple case the frame is read from slot 0.
df = data[0] if (isinstance(data, tuple) and len(data) > 0) else data

df.head()


In [None]:
# Quick sanity checks (do not overthink)
target_values = df["transaction_loss_amount"]

print("Rows:", len(df))
print("Columns:", len(df.columns))
print("Target summary:")
display(target_values.describe())

# Zero inflation check: fraction of rows whose loss is exactly zero.
zero_rate = (target_values == 0).mean()
print(f"Share of zero losses: {zero_rate:.3f}")

# A quick look at the heavy tail (strictly positive losses only).
nz = df.loc[target_values > 0, "transaction_loss_amount"]
print("Non-zero count:", len(nz))
if len(nz) > 0:
    tail_percentiles = [0.5, 0.9, 0.95, 0.99]
    display(nz.describe(percentiles=tail_percentiles))


In [None]:
# Target and leakage exclusions (keep these explicit)
TARGET = "transaction_loss_amount"

# --- IMPORTANT ---
# The task is to predict the loss amount of each transaction. In this synthetic DGP,
# 'is_fraud' is strongly coupled to loss, so it is treated as leakage for this
# regression exercise (the model should learn signals, not the label).
LEAKAGE_COLS = ["is_fraud"]

# Optional: identifiers you decide are inappropriate go here (justify it if you do).
ID_COLS = [
    # "merchant_id",   # NOTE: high-cardinality; included by default unless *you* decide otherwise (Decision Menu)
]

# De-duplicated union of everything excluded from the feature matrix.
drop_cols = list({*LEAKAGE_COLS, *ID_COLS})

y = df[TARGET].copy()
X = df.drop(columns=[TARGET, *drop_cols]).copy()

# Fixed split for comparability
split_parts = train_test_split(X, y, test_size=0.25, random_state=1955)
X_train, X_valid, y_train, y_valid = split_parts

print("Train:", X_train.shape, "Valid:", X_valid.shape)


In [None]:
def regression_report(y_true, y_pred, label="model"):
    """Compute the three required validation metrics for one set of predictions.

    Parameters
    ----------
    y_true : array-like
        Observed target values.
    y_pred : array-like
        Predicted target values, aligned with ``y_true``.
    label : str, default="model"
        Name stored under the ``"model"`` key of the returned record.

    Returns
    -------
    dict
        ``{"model": label, "MAE": ..., "RMSE": ..., "R2": ...}``.
    """
    mae = mean_absolute_error(y_true, y_pred)
    # RMSE is computed as sqrt(MSE) instead of passing ``squared=False``: that
    # keyword was deprecated in scikit-learn 1.4 and removed in 1.6, where it
    # raises a TypeError. For a single target the two forms give the same value.
    rmse = float(np.sqrt(mean_squared_error(y_true, y_pred)))
    r2 = r2_score(y_true, y_pred)
    return {"model": label, "MAE": mae, "RMSE": rmse, "R2": r2}

def show_reports(reports):
    """Render the collected metric records as a table ordered by RMSE.

    ``reports`` is passed to ``pd.DataFrame``; the resulting frame is sorted
    ascending on its ``"RMSE"`` column, shown with ``display`` and returned.
    """
    table = pd.DataFrame(reports)
    ranked = table.sort_values(by="RMSE")
    display(ranked)
    return ranked

# Two baseline models:
# 1) Predict 0 (important for zero-inflated targets)
# 2) Predict the training mean (classic baseline)
yhat_zero = np.zeros_like(y_valid, dtype=float)
yhat_mean = np.full_like(y_valid, fill_value=float(y_train.mean()), dtype=float)

# Start the shared metrics list from scratch so re-running this cell resets it.
reports = []
reports.append(regression_report(y_valid, yhat_zero, label="baseline: predict 0"))
reports.append(regression_report(y_valid, yhat_mean, label="baseline: predict train mean"))

# show_reports() already calls display() on the table and also returns it.
# The trailing semicolon stops Jupyter from echoing the returned frame, which
# would otherwise render the same table a second time.
show_reports(reports);


In [None]:
# Identify column types from the training set: numeric dtypes go to the numeric
# branch, every remaining column is handled as categorical.
numeric_features = list(X_train.select_dtypes(include=[np.number]).columns)
_numeric_lookup = set(numeric_features)
categorical_features = [col for col in X_train.columns if col not in _numeric_lookup]

print("Numeric:", len(numeric_features))
print("Categorical:", len(categorical_features))
print("Categorical columns:", categorical_features)

# Numeric branch: median imputation followed by standardisation.
numeric_transformer = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="median")),
        ("scaler", StandardScaler()),
    ]
)

# Categorical branch: most-frequent imputation, then one-hot encoding that
# ignores categories not seen during fit.
categorical_transformer = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="most_frequent")),
        ("onehot", OneHotEncoder(handle_unknown="ignore")),
    ]
)

# Route each column group to its branch; columns in neither list are dropped.
column_branches = [
    ("num", numeric_transformer, numeric_features),
    ("cat", categorical_transformer, categorical_features),
]
preprocess = ColumnTransformer(transformers=column_branches, remainder="drop")


In [None]:
# Starter models (you will choose what to keep/improve)
# These are NOT the goal by themselves. They give you a baseline workflow to adapt.

models = {
    "LinearRegression": LinearRegression(),
    # Regularization with CV (kept modest to avoid huge run times)
    "RidgeCV": RidgeCV(alphas=np.logspace(-3, 3, 13)),
    # `alphas` and `n_jobs` are left at their library defaults (the automatic
    # alpha grid, single job); spelling out `alphas=None` is unnecessary.
    "LassoCV": LassoCV(cv=5, random_state=1955, max_iter=20000),
    "ElasticNetCV": ElasticNetCV(l1_ratio=[0.2, 0.5, 0.8], cv=5, random_state=1955, max_iter=20000),
}

# Make the cell idempotent: remove rows recorded for these model names by an
# earlier run of this cell, so re-running does not duplicate entries in the
# shared `reports` list. Baseline rows (different labels) are kept.
reports = [row for row in reports if row["model"] not in models]

for name, model in models.items():
    # Each candidate is fitted end-to-end: preprocessing + estimator in one Pipeline.
    pipe = Pipeline(steps=[("preprocess", preprocess), ("model", model)])
    pipe.fit(X_train, y_train)
    preds = pipe.predict(X_valid)
    reports.append(regression_report(y_valid, preds, label=name))

# Assigning the return value avoids a second, echoed rendering of the table.
results = show_reports(reports)


## Decision Menu (choose **exactly two**)

Pick **two** decisions below. For each decision:
1. Describe *what you changed*.
2. Explain *why* (tradeoff).
3. Provide evidence (metrics and/or plot).

### Decision A — High-cardinality `merchant_id`
Choose one strategy (and justify):
- A1: **Keep it** (one-hot; accept many columns)
- A2: **Drop it** (reduce dimensionality; may lose signal)
- A3: **Top-K one-hot + Other** (you decide K)
- A4: **Frequency encoding** (target-agnostic; simple and compact)

### Decision B — Target strategy for a zero-inflated heavy tail
Choose one strategy (and justify):
- B1: Model `y` directly (plain regression)
- B2: Model `log1p(y)` and transform back with `expm1`
- B3: Two-stage model (classify fraud → regress loss for predicted fraud), then combine

### Decision C — Metric emphasis
Pick a “primary” metric (MAE vs RMSE vs R²) and justify why it fits this context.

### Decision D — Feature group inclusion
Compare two feature sets and justify:
- Transaction context only vs. context + customer/device
- Or: remove suspected nuisance predictors and see what happens

> **Rule:** You must implement **two** decisions. You may discuss more, but you may not build a sprawling model zoo.


In [None]:
# === Decision implementation area (choose exactly two decisions) ===
# Write your code below. Keep it readable and short.

# Tip: Start by copying the best-performing model name from the table above,
# then adapt it based on your decisions.

# Example skeleton:
# chosen_model = RidgeCV(alphas=np.logspace(-3, 3, 13))
# model_pipe = Pipeline([("preprocess", preprocess), ("model", chosen_model)])
# model_pipe.fit(X_train, y_train)
# preds = model_pipe.predict(X_valid)
# print(regression_report(y_valid, preds, "my_candidate"))



### Interpretation notes
Write 3–6 sentences explaining your two decisions and what you expect to happen **before** you run the full evaluation.

In [None]:
# (Optional) Write your quick prediction here, then proceed.

In [None]:
# Build your final_model pipeline here
# final_model = Pipeline([...])


In [None]:
# Residual diagnostic (required)
# Make *one* residual plot and interpret it in the markdown cell below.

# This cell expects a pipeline called `final_model` to exist already.
# Define it in the cell above, then run this cell.

# --- TODO: ensure final_model exists ---
# final_model = ...

# Fit on the training split and score the validation split.
final_model.fit(X_train, y_train)
y_pred = final_model.predict(X_valid)

# Positive residual = the model predicted less than the actual loss.
residuals = y_valid - y_pred

# Residuals against predictions, with a zero reference line.
fig_resid, ax_resid = plt.subplots()
ax_resid.scatter(y_pred, residuals, alpha=0.3)
ax_resid.axhline(0)
ax_resid.set_xlabel("Predicted loss amount")
ax_resid.set_ylabel("Residual (actual - predicted)")
ax_resid.set_title("Residuals vs Predicted")
plt.show()

# Optional: look at absolute error vs prediction (tail sensitivity)
fig_abs, ax_abs = plt.subplots()
ax_abs.scatter(y_pred, np.abs(residuals), alpha=0.3)
ax_abs.set_xlabel("Predicted loss amount")
ax_abs.set_ylabel("Absolute error")
ax_abs.set_title("Absolute Error vs Predicted")
plt.show()


## Decision Log (write-up)

### Decision 1
- **What I changed:**
- **Why (tradeoff):**
- **Evidence (metrics/plot):**

### Decision 2
- **What I changed:**
- **Why (tradeoff):**
- **Evidence (metrics/plot):**

### Residual diagnosis
What pattern do you see in the residual plot (if any)? What failure mode does it suggest (e.g., systematic underprediction for large losses, heteroskedasticity, missed nonlinearity, etc.)?


In [None]:
# Final evaluation: score the fitted `final_model` on the validation split and
# show the metric record as the cell output.
final_preds = final_model.predict(X_valid)
final_metrics = regression_report(
    y_true=y_valid,
    y_pred=final_preds,
    label="FINAL",
)
final_metrics
