[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/tunnel-ai/way/blob/main/notebooks/03_00_main.ipynb)

In [None]:
# --- Course setup (uncomment and run if using Colab) --------------------------
#!git clone https://github.com/tunnel-ai/way.git
#import sys; sys.path.insert(0, "/content/way/src")


# Module 3 — Supervised Learning: Classification

**Target:** `is_fraud` (binary classification)  
**Canonical dataset:** `core.generators.transaction_risk_dgp.generate_transaction_risk_dataset(seed=1955)`  

**Why this module matters:** fraud is **rare** and the business decision is usually about **thresholds** (who do we flag?) rather than just “accuracy”.

We will:
1. Load the canonical dataset (same generator as Modules 1–4)
2. Establish baselines (majority class + simple heuristics)
3. Fit a logistic regression (interpretable baseline)
4. Fit tree-based models (nonlinear interactions)
5. Evaluate with confusion matrix, ROC, **and Precision–Recall** (better for imbalance)
6. Pick an operating threshold using a simple cost sketch


In [None]:
# --- Imports ------------------------------------------------------------------
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer

from sklearn.dummy import DummyClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

from sklearn.metrics import (
    confusion_matrix,
    classification_report,
    roc_auc_score,
    average_precision_score,
    RocCurveDisplay,
    PrecisionRecallDisplay,
    precision_recall_curve
)

# Single seed reused below for the dataset generator, the train/validation split,
# and every estimator that accepts `random_state`.
RANDOM_STATE = 1955
# Also seed NumPy's legacy global RNG so any code drawing from `np.random` directly is repeatable.
np.random.seed(RANDOM_STATE)


In [None]:
# --- Load canonical dataset (do not modify generator) --------------------------
# NOTE: this import only resolves when the course `src` directory is importable
# (on Colab, via the commented-out setup lines in the first cell).
from core.generators.transaction_risk_dgp import generate_transaction_risk_dataset

# The generator is seeded with RANDOM_STATE; presumably this makes the dataset
# reproducible across runs — the generator source is not shown here, confirm there.
df = generate_transaction_risk_dataset(seed=RANDOM_STATE)

# Print the dimensions, then let the notebook render the first rows as a table.
print("Shape:", df.shape)
df.head()


In [None]:
# --- Quick EDA: class balance + target leakage reminder ------------------------
TARGET = "is_fraud"

# The mean of a 0/1 target is the share of positive (fraud) rows.
fraud_rate = df[TARGET].mean()
print(f"Fraud rate (mean of {TARGET}): {fraud_rate:.4f}")

# LEAKAGE WARNING: transaction_loss_amount is only known *after* the event
# (0 when there is no fraud, >0 when there is). Feeding it to a model that predicts
# is_fraud would let the model read the answer instead of learning it.
loss_col = "transaction_loss_amount"
positive_loss_share = df[loss_col].gt(0).mean()
print("Loss > 0 rate:", positive_loss_share)

# Side-by-side summary statistics of the target and the leakage column.
df[[TARGET, loss_col]].describe()


In [None]:
# --- Define X/y and split (stratify for class imbalance) ----------------------
# Features = everything except the target and the post-event leakage column.
leakage_cols = ["transaction_loss_amount"]
y = df[TARGET].astype(int)
X = df.drop(columns=[TARGET, *leakage_cols])

# Stratifying on y keeps the fraud rate (nearly) equal in both partitions.
split_parts = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=RANDOM_STATE,
    stratify=y,
)
X_train, X_valid, y_train, y_valid = split_parts

# Sanity check: the two rates printed below should be close to each other.
for rate_label, split_y in (("Train fraud rate:", y_train), ("Valid fraud rate:", y_valid)):
    print(rate_label, split_y.mean())


In [None]:
# --- Identify feature types ---------------------------------------------------
# We keep merchant_id out of the default baseline pipeline because it is high-cardinality.
# We'll return to it later and treat it as an encoding decision.
high_card_col = "merchant_id"


def _is_categorical_dtype(dtype) -> bool:
    """Return True when a column dtype holds labels rather than numbers.

    Comparing `dtype == "object"` alone misses pandas `category` columns and the
    dedicated string dtypes (the default for text in newer pandas versions). Such
    columns would then be routed to the numeric branch, where median imputation
    and scaling cannot handle them. Object columns are still matched, so frames
    that worked with the old check are classified exactly as before.
    """
    return (
        pd.api.types.is_object_dtype(dtype)
        or pd.api.types.is_string_dtype(dtype)
        or isinstance(dtype, pd.CategoricalDtype)
    )


categorical_cols = [c for c in X_train.columns if _is_categorical_dtype(X_train[c].dtype)]
# Everything that is not categorical is treated as numeric.
numeric_cols = [c for c in X_train.columns if c not in categorical_cols]

# remove high-card col from categorical baseline set
categorical_low_card = [c for c in categorical_cols if c != high_card_col]

print("Numeric cols:", len(numeric_cols))
print("Categorical low-card cols:", categorical_low_card)
print("High-card col:", high_card_col)


In [None]:
# --- Preprocessing pipeline (leakage-safe) ------------------------------------
# All statistics (medians, modes, scaling parameters, one-hot categories) are learned
# when the enclosing pipeline is fitted, i.e. only from the data passed to `fit`.

# Numeric branch: fill gaps with the median, then standardize.
numeric_steps = [
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler()),
]
numeric_transformer = Pipeline(numeric_steps)

# Categorical branch: fill gaps with the mode, then one-hot encode.
# handle_unknown="ignore" encodes categories unseen during fit as all zeros
# instead of raising at predict time.
categorical_steps = [
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("onehot", OneHotEncoder(handle_unknown="ignore")),
]
categorical_transformer = Pipeline(categorical_steps)

# Columns not listed in either branch (e.g. merchant_id) are dropped.
preprocess_low_card = ColumnTransformer(
    [
        ("num", numeric_transformer, numeric_cols),
        ("cat", categorical_transformer, categorical_low_card),
    ],
    remainder="drop",
)


In [None]:
# --- Baseline 1: majority class (DummyClassifier) ------------------------------
# The dummy model ignores the features and always predicts the most frequent class.
# It sets the floor any real model has to beat.
dummy = Pipeline(steps=[
    ("preprocess", preprocess_low_card),
    ("model", DummyClassifier(strategy="most_frequent", random_state=RANDOM_STATE))
])

dummy.fit(X_train, y_train)
y_pred_dummy = dummy.predict(X_valid)

print("Confusion matrix (majority-class baseline):")
print(confusion_matrix(y_valid, y_pred_dummy))
print()
# A constant prediction leaves one class with no predicted samples, so its precision
# is undefined. zero_division=0 reports it as 0.0 (the same value as the default)
# without the UndefinedMetricWarning that would otherwise clutter the cell output.
print(classification_report(y_valid, y_pred_dummy, digits=4, zero_division=0))


In [None]:
# --- Model 1: Logistic Regression (interpretable baseline) --------------------
# class_weight="balanced" reweights the classes inversely to their frequency,
# which often trades precision for recall on the rare class.
logit_clf = LogisticRegression(
    max_iter=2000,
    class_weight="balanced",
    solver="liblinear",
    random_state=RANDOM_STATE,
)
logit_model = Pipeline([
    ("preprocess", preprocess_low_card),
    ("model", logit_clf),
])
logit_model.fit(X_train, y_train)

# Column 1 of predict_proba is the score for the positive class (is_fraud == 1);
# predict() applies the default 0.50 cut-off.
y_proba = logit_model.predict_proba(X_valid)[:, 1]
y_pred = logit_model.predict(X_valid)

print(
    "Confusion matrix (logistic regression @ threshold=0.50):",
    confusion_matrix(y_valid, y_pred),
    "",
    classification_report(y_valid, y_pred, digits=4),
    sep="\n",
)

# Threshold-free ranking metrics computed from the scores, not the hard labels.
print("ROC-AUC:", roc_auc_score(y_valid, y_proba))
print("PR-AUC:", average_precision_score(y_valid, y_proba))


In [None]:
# --- ROC + Precision–Recall curves --------------------------------------------
# `from_predictions` creates its own figure unless an Axes is passed in. Calling
# plt.figure(figsize=...) beforehand therefore leaves an empty 6x4 figure behind and
# draws the curve at the default size. Passing `ax=` draws on the figure we sized.
fig, ax = plt.subplots(figsize=(6, 4))
RocCurveDisplay.from_predictions(y_valid, y_proba, ax=ax)
ax.set_title("ROC Curve — Logistic Regression")
plt.show()

fig, ax = plt.subplots(figsize=(6, 4))
PrecisionRecallDisplay.from_predictions(y_valid, y_proba, ax=ax)
ax.set_title("Precision–Recall Curve — Logistic Regression")
plt.show()


In [None]:
# --- Thresholding: choose an operating point ---------------------------------
# Accuracy is often misleading in imbalanced problems.
# Instead, we pick a threshold based on a (simple) cost sketch.

precision, recall, thresholds = precision_recall_curve(y_valid, y_proba)

# Drop the last precision/recall entry (has no threshold)
precision = precision[:-1]
recall = recall[:-1]

# Example costs (edit these in class):
# - False Negative: missed fraud (expensive)
# - False Positive: manual review / customer friction (less expensive)
C_FN = 50
C_FP = 1


def expected_costs_at(y_true, scores, cutoffs, cost_fn, cost_fp):
    """Return `cost_fn * FN + cost_fp * FP` for every cut-off, flagging `scores >= cutoff`.

    Produces the same integer counts as building one confusion matrix per cut-off,
    but with a single sort plus a binary search (O(n log n)) instead of one full
    pass over the data per cut-off, which matters because there can be as many
    cut-offs as distinct scores.

    Parameters
    ----------
    y_true : array-like of 0/1 labels.
    scores : array-like of predicted positive-class scores, same length as y_true.
    cutoffs : array-like of thresholds to evaluate.
    cost_fn, cost_fp : cost of one false negative / one false positive.

    Returns
    -------
    numpy.ndarray with one total cost per cut-off.
    """
    y_true = np.asarray(y_true)
    scores = np.asarray(scores)

    order = np.argsort(scores)
    sorted_scores = scores[order]
    # positives_below[i] = number of true positives among the i lowest-scored rows.
    positives_below = np.concatenate(([0], np.cumsum(y_true[order])))

    # Rows scoring strictly below the cut-off are predicted negative.
    n_below = np.searchsorted(sorted_scores, cutoffs, side="left")
    fn = positives_below[n_below]          # positives we failed to flag
    tp = positives_below[-1] - fn          # positives we did flag
    fp = (len(scores) - n_below) - tp      # flagged rows that are not positive
    return cost_fn * fn + cost_fp * fp


# Compute expected cost per threshold
# cost = C_FN * FN + C_FP * FP
costs = expected_costs_at(y_valid, y_proba, thresholds, C_FN, C_FP)
best_idx = costs.argmin()  # first minimum if several thresholds tie

best_t = thresholds[best_idx]
print("Best threshold (by expected cost):", best_t)
print("Min cost:", costs[best_idx])

# Show a small table around the best threshold
window = 8
lo = max(0, best_idx - window)
hi = min(len(thresholds), best_idx + window + 1)

summary = pd.DataFrame({
    "threshold": thresholds[lo:hi],
    "precision": precision[lo:hi],
    "recall": recall[lo:hi],
    "expected_cost": costs[lo:hi]
})
summary.sort_values("expected_cost").head(10)


In [None]:
# Re-score the logistic regression at the cost-minimizing threshold instead of 0.50.
t = best_t
flagged = y_proba >= t          # True where the score reaches the chosen cut-off
y_hat_best = flagged.astype(int)

print(
    "Confusion matrix (logistic regression @ chosen threshold):",
    confusion_matrix(y_valid, y_hat_best),
    "",
    classification_report(y_valid, y_hat_best, digits=4),
    sep="\n",
)


In [None]:
# --- Model 2: Decision Tree ---------------------------------------------------
# A single tree captures nonlinear rules and feature interactions, but overfits
# readily, hence the depth cap and the minimum leaf size.
tree_clf = DecisionTreeClassifier(
    max_depth=6,
    min_samples_leaf=50,
    class_weight="balanced",
    random_state=RANDOM_STATE,
)
tree_model = Pipeline([
    ("preprocess", preprocess_low_card),
    ("model", tree_clf),
])
tree_model.fit(X_train, y_train)

# Positive-class scores, then hard labels at the 0.5 cut-off.
tree_proba = tree_model.predict_proba(X_valid)[:, 1]
tree_pred = (tree_proba >= 0.5).astype(int)

print(classification_report(y_valid, tree_pred, digits=4))
for metric_label, metric_fn in (("ROC-AUC:", roc_auc_score), ("PR-AUC:", average_precision_score)):
    print(metric_label, metric_fn(y_valid, tree_proba))


In [None]:
# --- Model 3: Random Forest ---------------------------------------------------
# Averaging many trees reduces the variance (overfitting) of any single tree.
rf_clf = RandomForestClassifier(
    n_estimators=300,
    max_depth=None,
    min_samples_leaf=20,
    class_weight="balanced_subsample",
    random_state=RANDOM_STATE,
    n_jobs=-1,
)
rf_model = Pipeline([
    ("preprocess", preprocess_low_card),
    ("model", rf_clf),
])
rf_model.fit(X_train, y_train)

# Positive-class scores, then hard labels at the 0.5 cut-off.
rf_proba = rf_model.predict_proba(X_valid)[:, 1]
rf_pred = (rf_proba >= 0.5).astype(int)

print(classification_report(y_valid, rf_pred, digits=4))
for metric_label, metric_fn in (("ROC-AUC:", roc_auc_score), ("PR-AUC:", average_precision_score)):
    print(metric_label, metric_fn(y_valid, rf_proba))


In [None]:
# --- Compare models (same split, same metrics) --------------------------------
models = {
    "Dummy (majority)": dummy,
    "Logistic Regression": logit_model,
    "Decision Tree": tree_model,
    "Random Forest": rf_model
}

# Every model goes through the same code path: score with predict_proba, threshold at 0.5,
# and compute ROC/PR from the scores. The majority-class dummy needs no special case: its
# predict_proba is the one-hot vector of the majority class, so its scores are constant,
# its thresholded labels equal predict(), ROC-AUC is 0.5, and PR-AUC equals the
# positive-class prevalence.
rows = []
for name, model in models.items():
    proba = model.predict_proba(X_valid)[:, 1]
    pred = (proba >= 0.5).astype(int)

    tn, fp, fn, tp = confusion_matrix(y_valid, pred).ravel()
    # Guard against division by zero when a model flags nothing (or there are no positives).
    precision_hat = tp / (tp + fp) if (tp + fp) else 0.0
    recall_hat = tp / (tp + fn) if (tp + fn) else 0.0

    rows.append({
        "model": name,
        "precision@0.5": precision_hat,
        "recall@0.5": recall_hat,
        "ROC_AUC": roc_auc_score(y_valid, proba),
        "PR_AUC": average_precision_score(y_valid, proba)
    })

# Rank by PR-AUC, the more informative summary for a rare positive class.
pd.DataFrame(rows).sort_values("PR_AUC", ascending=False)


# Encoding decision: high-cardinality merchant_id (optional section)

`merchant_id` is intentionally **high-cardinality** (Zipfian).  
A naive one-hot encoding can explode the feature space and overfit.

Here we demonstrate a simple **frequency encoding**:
- Compute merchant frequency in the training set
- Map each merchant to that frequency (merchants not seen in training get the smallest observed training frequency as a default)

This keeps the feature numeric, avoids huge sparse matrices, and forces a discussion about
representation choices.


In [None]:
# --- Frequency encoding for merchant_id ---------------------------------------
# Work on copies so the original train/validation frames stay untouched and this
# cell can be re-run safely.
X_train_fe = X_train.copy()
X_valid_fe = X_valid.copy()

# Frequencies are learned from the TRAINING rows only, then applied to both splits,
# so no information from the validation set leaks into the feature.
merchant_freq = X_train_fe[high_card_col].value_counts(dropna=False) / len(X_train_fe)
default_freq = merchant_freq.min()  # reasonable default for unseen merchants

X_train_fe["merchant_id_freq"] = X_train_fe[high_card_col].map(merchant_freq).fillna(default_freq)
X_valid_fe["merchant_id_freq"] = X_valid_fe[high_card_col].map(merchant_freq).fillna(default_freq)

# Drop raw merchant_id after encoding
X_train_fe = X_train_fe.drop(columns=[high_card_col])
X_valid_fe = X_valid_fe.drop(columns=[high_card_col])

# Recompute column sets.
# A bare `dtype == "object"` test misses pandas `category` columns and the dedicated
# string dtypes, which would send label columns down the numeric (median/scale) branch.
# Object columns are still matched, so previously working frames are classified the same.
categorical_cols_fe = [
    col
    for col, dtype in X_train_fe.dtypes.items()
    if pd.api.types.is_object_dtype(dtype)
    or pd.api.types.is_string_dtype(dtype)
    or isinstance(dtype, pd.CategoricalDtype)
]
numeric_cols_fe = [c for c in X_train_fe.columns if c not in categorical_cols_fe]

preprocess_fe = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, numeric_cols_fe),
        ("cat", categorical_transformer, categorical_cols_fe),
    ],
    remainder="drop"
)

# Same logistic-regression settings as the baseline, so any metric change is
# attributable to the added merchant_id_freq feature.
logit_fe = Pipeline(steps=[
    ("preprocess", preprocess_fe),
    ("model", LogisticRegression(
        max_iter=2000,
        class_weight="balanced",
        solver="liblinear",
        random_state=RANDOM_STATE
    ))
])

logit_fe.fit(X_train_fe, y_train)
proba_fe = logit_fe.predict_proba(X_valid_fe)[:, 1]

print("Logistic Regression + merchant_id frequency encoding")
print("ROC-AUC:", roc_auc_score(y_valid, proba_fe))
print("PR-AUC:", average_precision_score(y_valid, proba_fe))


## Conceptual Notes to Self

- If Random Forest improves **PR-AUC** materially over Logistic Regression, what does that imply about **nonlinear interactions** in the fraud mechanism?
- Why is **Precision–Recall** often more informative than ROC for rare-event detection?
- How does your **threshold choice** change the number of false positives you must operationally handle?
