# Santander LightGBM Upgraded Pipeline
_Last generated: 2025-10-06T16:21:35_

This notebook provides a **production-leaning** upgrade of your Santander hybrid recommendation system:
- Robust **feature engineering** for tabular data
- **LightGBM** with 5-fold CV and early stopping
- **Hyperparameter tuning** (optional, Optuna)
- **Explainability** via SHAP
- **Fairness auditing** by group
- Save **versioned artifacts** (model + encoders)

## 1) Setup

In [None]:
# !pip install -q pandas numpy scikit-learn lightgbm shap optuna pyarrow
import os, json, gc, warnings
warnings.filterwarnings("ignore")

import numpy as np
import pandas as pd

from pathlib import Path
from typing import List, Optional

import lightgbm as lgb
from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.metrics import roc_auc_score, average_precision_score, f1_score
from sklearn.preprocessing import OrdinalEncoder

# Locations used throughout the notebook, all relative to the working directory.
BASE_DIR = Path(".")
ARTIFACT_DIR = Path("models")
REPORT_DIR = Path("reports")
DATA_DIR = Path("data/processed")

# Create the folders up front so later cells can write into them directly.
ARTIFACT_DIR.mkdir(parents=True, exist_ok=True)
REPORT_DIR.mkdir(parents=True, exist_ok=True)
DATA_DIR.mkdir(parents=True, exist_ok=True)

# Single seed shared by NumPy and the cells below that take a seed.
SEED = 42
np.random.seed(SEED)

## 2) Data Loading
Fill in the file paths and the label / ID / group column settings below.

In [None]:
# ==== TODO: set your processed dataset paths ====
# Expected: tabular classification with binary label.
# Each path is read only if the file exists; a missing file leaves the
# corresponding DataFrame as None instead of raising.
TRAIN_PATH = "data/processed/train.parquet"   # or .csv
VALID_PATH = "data/processed/valid.parquet"   # optional; NOTE(review): no automatic split from train is visible in this notebook
TEST_PATH  = "data/processed/test.parquet"    # optional

# ==== TODO: set your column names ====
LABEL_COL = "target"     # binary label column (cast to int when features and label are split)
ID_COL    = "customer_id"  # optional unique id; excluded from the feature matrix
GROUP_COL = None         # e.g., "region" or "gender" for fairness auditing; set None if unavailable

def safe_read(path):
    """Load a table from ``path``, picking the pandas reader by file suffix.

    ``.parquet`` files go through ``pd.read_parquet`` and ``.csv`` files
    through ``pd.read_csv``.

    Raises:
        ValueError: if ``path`` ends with neither ``.parquet`` nor ``.csv``.
    """
    as_text = str(path)
    if not as_text.endswith((".parquet", ".csv")):
        raise ValueError("Use .parquet or .csv")
    reader = pd.read_parquet if as_text.endswith(".parquet") else pd.read_csv
    return reader(path)

# Every split is optional at load time: a path that does not exist yields
# None rather than an error, and later cells check for None before running.
if os.path.exists(TRAIN_PATH):
    df = safe_read(TRAIN_PATH)
else:
    df = None

if os.path.exists(VALID_PATH):
    df_valid = safe_read(VALID_PATH)
else:
    df_valid = None

if os.path.exists(TEST_PATH):
    df_test = safe_read(TEST_PATH)
else:
    df_test = None

# Report what was loaded so a missing training file is obvious immediately.
if df is not None:
    print("✅ Train shape:", df.shape)
    if df_valid is not None:
        print("✅ Valid shape:", df_valid.shape)
    if df_test is not None:
        print("✅ Test  shape:", df_test.shape)
else:
    print("⚠️ Could not find training data at", TRAIN_PATH, "\nPlease place your processed training data file.")

## 3) Feature Engineering (Minimal Template)

In [None]:
def split_features_label(df, label_col, id_col=None):
    """Separate a frame into a feature matrix and an integer label array.

    Args:
        df: input DataFrame.
        label_col: name of the label column; it may be absent from ``df``.
        id_col: optional identifier column to leave out of the features.

    Returns:
        ``(X, y, cols)`` where ``X`` is a copy of the feature columns, ``y``
        is the label cast to ``int`` as a NumPy array (``None`` when
        ``label_col`` is not in ``df``), and ``cols`` lists the feature names
        in their original order.
    """
    feature_names = []
    for name in df.columns:
        if name == label_col or name == id_col:
            continue
        feature_names.append(name)

    features = df[feature_names].copy()

    if label_col in df.columns:
        labels = df[label_col].astype(int).values
    else:
        labels = None

    return features, labels, feature_names

def detect_categorical(df: pd.DataFrame, max_cardinality: int = 64) -> List[str]:
    """Return the names of columns that should be treated as categorical.

    A column qualifies when it has object, string or categorical dtype, or
    when it has an integer dtype with at most ``max_cardinality`` distinct
    values.

    Args:
        df: frame whose columns are inspected.
        max_cardinality: largest number of distinct values for which an
            integer column still counts as categorical.

    Returns:
        Qualifying column names, in the order they appear in ``df``.
    """
    cats = []
    for c in df.columns:
        dtype = df[c].dtype
        # isinstance() replaces the deprecated pd.api.types.is_categorical_dtype.
        # Dedicated string dtypes are included so they get encoded rather than
        # reaching the model as raw text.
        is_text_like = (
            pd.api.types.is_object_dtype(dtype)
            or isinstance(dtype, pd.CategoricalDtype)
            or pd.api.types.is_string_dtype(dtype)
        )
        if is_text_like:
            cats.append(c)
        elif pd.api.types.is_integer_dtype(dtype) and df[c].nunique() <= max_cardinality:
            cats.append(c)
    return cats

def prepare_data(df, label_col, id_col=None):
    """Build the model-ready feature matrix and fit the categorical encoder.

    Features and label are separated with ``split_features_label``, the
    categorical columns are found with ``detect_categorical``, and those
    columns are cast to ``str`` and ordinal-encoded in place. Categories
    unseen at fit time are mapped to ``-1`` at transform time.

    Returns:
        ``(X, y, cols, cat_cols, enc)``; ``enc`` is ``None`` when no
        categorical columns were detected.
    """
    X, y, cols = split_features_label(df, label_col, id_col)
    cat_cols = detect_categorical(X)

    # Nothing to encode: hand the frame back untouched.
    if not cat_cols:
        return X, y, cols, cat_cols, None

    enc = OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=-1)
    as_text = X[cat_cols].astype(str)
    X[cat_cols] = enc.fit_transform(as_text)
    return X, y, cols, cat_cols, enc

# Build the training matrix only when the training file was actually loaded.
if df is not None:
    # `enc` is the fitted OrdinalEncoder, or None when no categorical columns were detected.
    X, y, features, cat_cols, enc = prepare_data(df, LABEL_COL, ID_COL)
    print("Features:", len(features), "| Categorical:", len(cat_cols))

## 4) LightGBM with 5-Fold CV + Early Stopping

In [None]:
def train_lgbm_cv(X, y, cat_cols: List[str], seed=SEED, n_splits=5):
    """Train one LightGBM binary classifier per stratified fold.

    Early stopping and periodic logging are set up through LightGBM
    callbacks, because the ``early_stopping_rounds`` and ``verbose_eval``
    keyword arguments of ``lgb.train`` were removed in LightGBM 4.0.

    Args:
        X: feature DataFrame, indexed positionally with ``.iloc``.
        y: binary labels aligned row-for-row with ``X``.
        cat_cols: names of the columns LightGBM should treat as categorical.
        seed: seed for the fold shuffling and for LightGBM.
        n_splits: number of stratified folds.

    Returns:
        ``(models, oof_pred, report)`` where ``models`` holds one Booster per
        fold, ``oof_pred`` the out-of-fold probabilities, and ``report`` a
        dict with keys ``auc_per_fold``, ``ap_per_fold``, ``oof_auc`` and
        ``oof_ap``.
    """
    # Positional indexing below needs an array; a pandas Series would be
    # indexed by label instead.
    y = np.asarray(y)

    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=seed)
    oof_pred = np.zeros(len(X))
    models = []
    aucs, aps = [], []

    params = {
        "objective": "binary",
        "metric": "auc",
        "learning_rate": 0.05,
        "num_leaves": 64,
        "max_depth": -1,
        "min_data_in_leaf": 50,
        "feature_fraction": 0.9,
        "bagging_fraction": 0.8,
        "bagging_freq": 1,
        "lambda_l1": 0.0,
        "lambda_l2": 0.0,
        "verbosity": -1,
        "seed": seed,
    }

    for fold, (trn_idx, val_idx) in enumerate(skf.split(X, y)):
        X_trn, y_trn = X.iloc[trn_idx], y[trn_idx]
        X_val, y_val = X.iloc[val_idx], y[val_idx]

        lgb_trn = lgb.Dataset(X_trn, label=y_trn, categorical_feature=cat_cols, free_raw_data=False)
        lgb_val = lgb.Dataset(X_val, label=y_val, categorical_feature=cat_cols, free_raw_data=False)

        # Fresh callbacks per fold so no early-stopping state carries over.
        callbacks = [
            lgb.early_stopping(stopping_rounds=100),
            lgb.log_evaluation(period=200),
        ]
        model = lgb.train(
            params,
            lgb_trn,
            num_boost_round=5000,
            valid_sets=[lgb_trn, lgb_val],
            valid_names=["train", "valid"],
            callbacks=callbacks,
        )

        pred_val = model.predict(X_val, num_iteration=model.best_iteration)
        oof_pred[val_idx] = pred_val
        auc = roc_auc_score(y_val, pred_val)
        ap = average_precision_score(y_val, pred_val)
        aucs.append(auc)
        aps.append(ap)
        models.append(model)
        print(f"[Fold {fold}] AUC={auc:.4f} | AP={ap:.4f} | Best iters={model.best_iteration}")

    # Compute the pooled out-of-fold metrics once and reuse them.
    oof_auc = float(roc_auc_score(y, oof_pred))
    oof_ap = float(average_precision_score(y, oof_pred))
    print("OOF AUC:", oof_auc)
    print("OOF AP :", oof_ap)

    report = {
        "auc_per_fold": aucs,
        "ap_per_fold": aps,
        "oof_auc": oof_auc,
        "oof_ap": oof_ap,
    }
    return models, oof_pred, report

# Run cross-validation and persist the metric report next to the other reports.
if df is not None:
    models, oof_pred, cv_report = train_lgbm_cv(X, y, cat_cols)
    (REPORT_DIR / "metrics.json").write_text(json.dumps(cv_report, indent=2))

## 5) Save Artifacts

In [None]:
import pickle

# Pick the fold model with the highest validation AUC recorded in cv_report.
# Fall back to the first fold when per-fold scores are unavailable, and to
# None when training was skipped (the names are then undefined, so they are
# looked up defensively instead of raising NameError).
fold_models = globals().get("models") or []
fold_aucs = (globals().get("cv_report") or {}).get("auc_per_fold") or []

best_model = None
if fold_models:
    if len(fold_aucs) == len(fold_models):
        best_model = fold_models[int(np.argmax(fold_aucs))]
    else:
        best_model = fold_models[0]

if best_model is not None:
    # Save only the trees up to the early-stopped iteration.
    best_model.save_model(str(ARTIFACT_DIR / "lgbm_model.txt"), num_iteration=best_model.best_iteration)
    print("✅ Saved:", ARTIFACT_DIR / "lgbm_model.txt")

# The encoder exists only if the feature-engineering cell ran and found
# categorical columns.
if 'enc' in globals() and enc is not None:
    with open(ARTIFACT_DIR / "encoder.pkl", "wb") as f:
        pickle.dump(enc, f)
    print("✅ Saved:", ARTIFACT_DIR / "encoder.pkl")

## 6) SHAP Explainability

In [None]:
import shap
import matplotlib.pyplot as plt

if best_model is not None:
    # TreeExplainer is SHAP's explainer for tree-based models.
    explainer = shap.TreeExplainer(best_model)
    # Use a sample for speed.
    sample_X = X.sample(min(500, len(X)), random_state=SEED)
    shap_values = explainer.shap_values(sample_X)

    # Depending on the SHAP version, a binary model may come back as a list
    # of per-class arrays or as a 3-D array with a trailing class axis.
    # Keep the positive class so summary_plot draws the per-feature plot
    # rather than a multi-output chart.
    if isinstance(shap_values, list):
        shap_values = shap_values[-1]
    elif getattr(shap_values, "ndim", 2) == 3:
        shap_values = shap_values[..., -1]

    plt.figure()
    shap.summary_plot(shap_values, sample_X, show=False)  # show=False so the figure can be saved
    plt.tight_layout()
    out_path = REPORT_DIR / "shap_summary.png"
    plt.savefig(out_path, dpi=160)
    print("✅ Saved SHAP summary to", out_path)

## 7) Fairness Audit by Group (optional)

In [None]:
from sklearn.metrics import roc_auc_score, f1_score

def group_metrics(df_in, y_true, y_prob, group_col, thr=0.5, min_n=30):
    """Compute per-group AUC and F1 for a fairness audit.

    Args:
        df_in: frame holding ``group_col``; its rows must be in the same
            order as ``y_true`` and ``y_prob``.
        y_true: binary ground-truth labels.
        y_prob: predicted probabilities.
        group_col: column whose distinct values define the groups.
        thr: probability above which a prediction counts as positive for F1.
        min_n: groups with fewer rows than this are skipped.

    Returns:
        DataFrame with columns ``group``, ``n``, ``auc`` and ``f1``, sorted
        by ``auc`` descending. It is empty, with the same columns, when no
        group reaches ``min_n``. ``auc`` is NaN for a group whose labels all
        belong to one class, where ROC AUC is undefined.
    """
    y_true = np.asarray(y_true)
    y_prob = np.asarray(y_prob)
    columns = ["group", "n", "auc", "f1"]

    report = []
    # .indices gives positional row numbers, so alignment with the arrays
    # does not depend on df_in's index labels.
    for g, pos in df_in.groupby(group_col).indices.items():
        if len(pos) < min_n:
            continue
        y_g = y_true[pos]
        p_g = y_prob[pos]
        # roc_auc_score raises when only one class is present.
        if np.unique(y_g).size < 2:
            auc = float("nan")
        else:
            auc = float(roc_auc_score(y_g, p_g))
        f1 = f1_score(y_g, (p_g > thr).astype(int))
        report.append({"group": str(g), "n": int(len(pos)), "auc": auc, "f1": float(f1)})

    # Explicit columns keep sort_values from failing on an empty report.
    return pd.DataFrame(report, columns=columns).sort_values("auc", ascending=False)

# The audit needs training data plus a configured group column that exists in it.
if df is None or GROUP_COL is None or GROUP_COL not in df.columns:
    print("ℹ️ Set GROUP_COL to a valid column to enable fairness auditing.")
else:
    # Reset the index so group row positions line up with y and oof_pred.
    fairness_df = group_metrics(df.reset_index(drop=True), y, oof_pred, GROUP_COL)
    fairness_df.to_csv(REPORT_DIR / "fairness_by_group.csv", index=False)
    print("✅ Saved fairness report to", REPORT_DIR / "fairness_by_group.csv")

## 8) Batch Inference Helper

In [None]:
def batch_predict(model, df_in, enc=None, id_col=None):
    """Score a raw frame with a trained model and return id/probability pairs.

    Args:
        model: trained booster exposing ``predict`` and ``best_iteration``.
        df_in: frame to score; it may still contain the label and id columns.
        enc: optional fitted encoder for the categorical columns.
        id_col: optional identifier column to carry into the output.

    Returns:
        DataFrame with the id column (or ``row_id`` when ``id_col`` is
        missing) and a ``prob`` column of predicted probabilities.

    Raises:
        KeyError: if ``enc`` was fitted on columns that ``df_in`` lacks.
    """
    # Drop label/id first so a string-typed id is never fed to the encoder.
    drop_cols = [c for c in [LABEL_COL, id_col] if c and c in df_in.columns]
    Xb = df_in.drop(columns=drop_cols, errors="ignore")

    if enc is not None:
        # Encode exactly the columns the encoder was fitted on. Re-guessing
        # them from dtypes can disagree with the training-time selection
        # (e.g. low-cardinality integer columns).
        fitted_cols = getattr(enc, "feature_names_in_", None)
        if fitted_cols is not None:
            cat_cols = list(fitted_cols)
        else:
            # Encoder carries no column names: fall back to object columns.
            cat_cols = [c for c in Xb.columns if str(Xb[c].dtype) == "object"]

        missing = [c for c in cat_cols if c not in Xb.columns]
        if missing:
            raise KeyError(f"Columns required by the encoder are missing: {missing}")
        if cat_cols:
            Xb[cat_cols] = enc.transform(Xb[cat_cols].astype(str))

    probs = model.predict(Xb, num_iteration=model.best_iteration)

    has_id = bool(id_col) and id_col in df_in.columns
    out = pd.DataFrame({
        id_col if has_id else "row_id": df_in[id_col] if has_id else np.arange(len(df_in)),
        "prob": probs,
    })
    return out

# Demo: score the validation split when both a model and validation data exist.
if best_model is not None and df_valid is not None:
    preds = batch_predict(best_model, df_valid, enc=enc, id_col=ID_COL)
    # An expression nested inside an `if` is not auto-displayed by the
    # notebook, so print the preview explicitly.
    print(preds.head())