# PyTorch binary classifier

# train_binary.py — train & validate

In [3]:
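# Install dependencies. The leading "!" is Jupyter/IPython shell-escape syntax; in a regular terminal run the same command without it.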
!pip install torch pandas scikit-learn joblib




In [None]:
# ================== PARAMETERS (EDIT THESE) ==================
# Input files, read with pd.read_csv further down. These are absolute paths on
# the original author's machine and must be edited before running elsewhere.
TRAIN_CSV = r"C:\Users\19452\Downloads\Take Home Project\Take Home Project\training_loan_data.csv"
TEST_CSV  = r"C:\Users\19452\Downloads\Take Home Project\Take Home Project\testing_loan_data.csv"
TARGET    = "bad_flag"        # label column; resolved case-insensitively, only rows with value 0 or 1 are kept
SKIPROWS  = 1                 # passed to read_csv(skiprows=...); set to 1 if the first CSV line is not the header
ARTIFACTS = "artifacts"       # output directory for model weights, preprocessor, metrics and predictions

HIDDEN_DIM   = 128            # width of the single hidden layer in SimpleMLP
DROPOUT      = 0.10           # dropout probability after the hidden ReLU
EPOCHS       = 20             # number of passes over the training split
BATCH_SIZE   = 256            # training batch size; validation/test loaders use capped multiples of it
LR           = 1e-3           # Adam learning rate
WEIGHT_DECAY = 1e-4           # Adam weight_decay
THRESHOLD    = 0.5            # predicted probability >= THRESHOLD is labelled 1
# =============================================================

import os, json, math, numpy as np, pandas as pd
# Create the output directory up front so the later save steps cannot fail on a missing path.
os.makedirs(ARTIFACTS, exist_ok=True)

# -------------------- Header / Schema + Numeric-ish Coercion --------------------
import re

def normalize_columns(df: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of ``df`` with tidied column labels.

    Labels are cast to ``str``; runs of CR/LF/tab and every non-breaking
    space become a plain space, repeated spaces are collapsed to one, and
    leading/trailing whitespace is trimmed. The input frame is not modified.
    """
    cleaned = df.copy()
    labels = cleaned.columns.astype(str)
    # Control whitespace first, then NBSP, so the collapse step sees only spaces.
    labels = labels.str.replace(r'[\r\n\t]+', ' ', regex=True)
    labels = labels.str.replace(u'\xa0', ' ', regex=False)
    labels = labels.str.replace(' +', ' ', regex=True)
    cleaned.columns = labels.str.strip()
    return cleaned

def resolve_target_column(df: pd.DataFrame, target_name: str) -> str:
    """Find the actual label of the target column in ``df``.

    An exact match wins. Otherwise the comparison is repeated on lower-cased
    names; if several columns share the same lower-cased name, the last one
    in column order is returned.

    Raises:
        KeyError: if no column matches, listing the first 12 column names.
    """
    columns = df.columns
    if target_name in columns:
        return target_name

    wanted = target_name.lower()
    by_lower = {}
    for col in columns:
        by_lower[col.lower()] = col  # later duplicates overwrite earlier ones

    if wanted not in by_lower:
        raise KeyError(f"Target '{target_name}' not found. First columns: {df.columns.tolist()[:12]}")
    return by_lower[wanted]

def add_missing_columns(df: pd.DataFrame, required_cols: list) -> pd.DataFrame:
    """Return a copy of ``df`` that contains every name in ``required_cols``.

    Names that are absent are appended as all-NaN columns, in the order they
    appear in ``required_cols``. Existing columns are left as they are and
    nothing is dropped or reordered.
    """
    out = df.copy()
    absent = [name for name in required_cols if name not in out.columns]
    for name in absent:
        out[name] = np.nan
    return out

def parse_numericish_series(s: pd.Series) -> pd.Series:
    """Extract one number per cell from text like "36 months" or "$1,250.50".

    Non-object Series are handed straight to ``pd.to_numeric``. For object
    Series every value is stringified, the unit words (year/years/yr/yrs/
    month/months) and the characters ``%``, ``$`` and ``,`` are removed, and
    the first remaining numeric token is parsed.

    Args:
        s: Column to parse.

    Returns:
        A numeric Series aligned with ``s``; cells with no numeric token
        (including missing values) become NaN.
    """
    if s.dtype != object:
        return pd.to_numeric(s, errors="coerce")

    text = s.astype(str).str.strip()
    # Remove duration units in a single pass ("years?" already covers "year"),
    # then re-strip so a trailing "+" really is at the end of the string.
    text = text.str.replace(r'years?|yrs?|months?', '', regex=True).str.strip()
    # Remove percent signs, currency signs and thousands separators.
    text = text.str.replace(r'[%$,]', '', regex=True)
    # "< 1" (as in "< 1 year") is mapped to 0.5. The lookahead stops the rule
    # from firing on the leading digit of "<10" or "<1.5", which previously
    # turned them into "0.50" / "0.5.5" and parsed as 0.5.
    text = text.str.replace(r'<\s*1(?![\d.])', '0.5', regex=True)
    # Drop a trailing plus sign, as in "10+".
    text = text.str.replace(r'\+$', '', regex=True)

    number = text.str.extract(r'([-+]?\d*\.?\d+)', expand=False)
    return pd.to_numeric(number, errors="coerce")

def coerce_numericish_cols(df: pd.DataFrame, min_fraction: float = 0.8) -> pd.DataFrame:
    """Convert object columns that are mostly numeric text into float columns.

    Each object-dtype column is run through ``parse_numericish_series``. If
    the share of rows that yield a number (out of all rows, so missing cells
    count against it) is at least ``min_fraction``, the column is replaced by
    the parsed floats; otherwise it is left as is. Returns a copy.
    """
    result = df.copy()
    for name in result.columns:
        column = result[name]
        if column.dtype != object:
            continue
        numeric = parse_numericish_series(column)
        share_parsed = numeric.notna().mean()
        # Kept as a positive test: a NaN share (empty frame) must not convert.
        if share_parsed >= min_fraction:
            result[name] = numeric.astype(float)
    return result

# -------------------- sklearn preprocessing (kept sparse) --------------------
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.metrics import roc_auc_score, accuracy_score, precision_recall_fscore_support, confusion_matrix

def build_preprocessor(X: pd.DataFrame):
    """Build an (unfitted) ColumnTransformer for ``X`` plus its column split.

    Numeric-dtype columns are median-imputed and scaled without centering;
    object/category/bool columns are mode-imputed and one-hot encoded.
    Integer columns with at most 10 distinct non-null values are treated as
    categorical rather than numeric.

    Returns:
        ``(preprocessor, num_cols, cat_cols)`` where the two lists hold the
        column names routed to the numeric and categorical pipelines.
    """
    numeric_names = X.select_dtypes(include=[np.number]).columns.tolist()
    cat_cols = X.select_dtypes(include=["object", "category", "bool"]).columns.tolist()

    # Low-cardinality integer columns behave like codes, so encode them.
    low_card_ints = [
        name for name in numeric_names
        if pd.api.types.is_integer_dtype(X[name]) and X[name].nunique(dropna=True) <= 10
    ]
    moved = set(low_card_ints)
    num_cols = [name for name in numeric_names if name not in moved]
    cat_cols.extend(low_card_ints)

    numeric_steps = [
        ("imputer", SimpleImputer(strategy="median")),
        ("scaler", StandardScaler(with_mean=False)),   # no centering keeps sparse input sparse
    ]
    numeric = Pipeline(numeric_steps)

    # Prefer the newer OneHotEncoder options; constructors that do not accept
    # them raise TypeError, in which case the older keyword set is used.
    try:
        ohe = OneHotEncoder(
            handle_unknown="infrequent_if_exist",
            min_frequency=0.01,
            sparse_output=True
        )
    except TypeError:
        ohe = OneHotEncoder(handle_unknown="ignore", sparse=True)

    categorical_steps = [
        ("imputer", SimpleImputer(strategy="most_frequent")),
        ("onehot", ohe),
    ]
    categorical = Pipeline(categorical_steps)

    transformers = [
        ("num", numeric, num_cols),
        ("cat", categorical, cat_cols),
    ]
    preprocessor = ColumnTransformer(transformers)

    return preprocessor, num_cols, cat_cols

def enforce_schema(df: pd.DataFrame, num_cols: list, cat_cols: list) -> pd.DataFrame:
    """Cast columns to the types the train-time preprocessor expects.

    Columns named in ``num_cols`` are converted with ``pd.to_numeric``
    (unparseable values become NaN); columns named in ``cat_cols`` are cast
    to object dtype. Names that are not in ``df`` are skipped. A copy is
    returned; ``df`` itself is untouched.
    """
    out = df.copy()

    numeric_present = [name for name in num_cols if name in out.columns]
    for name in numeric_present:
        out[name] = pd.to_numeric(out[name], errors="coerce")

    categorical_present = [name for name in cat_cols if name in out.columns]
    for name in categorical_present:
        out[name] = out[name].astype("object")

    return out

# -------------------- PyTorch model & loaders --------------------
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
import joblib

class SimpleMLP(nn.Module):
    """One-hidden-layer perceptron that outputs a single logit per row.

    Layout: Linear(input_dim, hidden_dim) -> ReLU -> Dropout(dropout) ->
    Linear(hidden_dim, 1). No sigmoid is applied; callers get raw logits.
    """

    def __init__(self, input_dim, hidden_dim=128, dropout=0.1):
        super().__init__()
        layers = [
            nn.Linear(input_dim, hidden_dim),
            nn.ReLU(),
            nn.Dropout(p=dropout),
            nn.Linear(hidden_dim, 1),  # logits
        ]
        # Kept under the attribute name "net" so state-dict keys stay "net.<i>.*".
        self.net = nn.Sequential(*layers)

    def forward(self, x):
        logits = self.net(x)
        return logits

class IndexDataset(Dataset):
    """Dataset that yields row indices, optionally paired with labels.

    With ``y`` given, labels are stored as a float32 column of shape (n, 1)
    and item ``idx`` is ``(idx, y[idx])``. Without ``y``, item ``idx`` is
    just ``idx``.
    """

    def __init__(self, n, y=None):
        self.n = n
        if y is None:
            self.y = None
        else:
            self.y = y.astype(np.float32).reshape(-1, 1)

    def __len__(self):
        return self.n

    def __getitem__(self, idx):
        return idx if self.y is None else (idx, self.y[idx])

class CSRCollator:
    """Collate function that turns a batch of row indices into dense tensors.

    ``X_csr`` is row-indexed with the batch's indices and only those rows are
    densified (float32). ``y`` acts purely as a switch: when it is ``None``
    the batch is a list of indices and one tensor is returned; otherwise the
    batch is a list of ``(index, label)`` pairs and ``(features, labels)``
    is returned.
    """

    def __init__(self, X_csr, y=None):
        self.X = X_csr
        self.y = y

    def _dense_rows(self, rows):
        """Select ``rows`` from the matrix and return them as a float32 tensor."""
        dense = self.X[rows].toarray().astype(np.float32)
        return torch.from_numpy(dense)

    def __call__(self, batch):
        if self.y is None:
            return self._dense_rows(batch)

        idxs, ys = zip(*batch)
        features = self._dense_rows(list(idxs))
        labels = torch.from_numpy(np.vstack(ys).astype(np.float32))
        return features, labels

def evaluate(model, loader, criterion, device):
    """Score ``model`` on every batch of ``loader`` without computing gradients.

    Args:
        model: Module returning one logit per row; it is switched to eval mode
            and left there.
        loader: Iterable of ``(features, labels)`` tensor pairs.
        criterion: Loss called as ``criterion(logits, labels)``. The average
            below assumes it returns a per-batch mean, which is the default
            reduction of the loss used in this file.
        device: Device the batches are moved to.

    Returns:
        ``(mean_loss, auc, y_true, y_prob)``. ``mean_loss`` is the
        sample-weighted mean over all rows, so a smaller final batch does not
        count as much as a full one. ``auc`` is NaN when ROC AUC is undefined
        for the labels/scores seen. ``y_true`` and ``y_prob`` are flat arrays
        of labels and sigmoid outputs. An empty loader yields NaN loss/AUC and
        two empty arrays instead of raising.
    """
    model.eval()
    loss_sum, n_seen = 0.0, 0
    ys, ps = [], []
    with torch.no_grad():
        for xb, yb in loader:
            xb, yb = xb.to(device), yb.to(device)
            logits = model(xb)
            batch_n = yb.shape[0]
            # Undo the per-batch mean so batches of different sizes combine correctly.
            loss_sum += criterion(logits, yb).item() * batch_n
            n_seen += batch_n
            ys.append(yb.cpu().numpy())
            ps.append(torch.sigmoid(logits).cpu().numpy())

    if n_seen == 0:
        # np.vstack([]) would raise; report "nothing to score" as NaNs instead.
        return (float("nan"), float("nan"),
                np.empty(0, dtype=np.float32), np.empty(0, dtype=np.float32))

    y_true = np.vstack(ys).ravel()
    y_prob = np.vstack(ps).ravel()
    try:
        auc = roc_auc_score(y_true, y_prob)
    except ValueError:
        # Raised for invalid inputs such as a single class in y_true or NaN
        # scores; anything else is a real bug and should surface.
        auc = float("nan")
    return loss_sum / n_seen, auc, y_true, y_prob

# -------------------- LOAD TRAIN (normalize + coerce numeric-ish) --------------------
# Read with pandas' Python parser engine, skipping the first SKIPROWS lines of the file.
df_train = pd.read_csv(TRAIN_CSV, engine="python", skiprows=SKIPROWS)
df_train = normalize_columns(df_train)
# From here on TARGET holds the column label exactly as it is spelled in the file.
TARGET = resolve_target_column(df_train, TARGET)

# Turn text columns that are mostly numbers (e.g. "36 months", "12.5%") into floats.
df_train = coerce_numericish_cols(df_train)

# Keep only rows whose label parses to exactly 0 or 1; every other row is dropped.
y = pd.to_numeric(df_train[TARGET], errors="coerce")
mask = y.isin([0, 1])
df_train = df_train.loc[mask].reset_index(drop=True)
y = y.loc[mask].astype(int).values
# Every non-target column becomes a feature.
# NOTE(review): the run log at the end of this file lists 'id' and 'member_id'
# among these columns — confirm identifiers are meant to be model inputs.
X = df_train.drop(columns=[TARGET])

# keep TRAIN feature list for test alignment
train_feature_cols = X.columns.tolist()

# -------------------- Preprocessor & Schema Enforcement --------------------
# The numeric/categorical split is decided once, on the full training frame,
# and reused for the validation and test data.
preprocessor, num_cols, cat_cols = build_preprocessor(X)

# enforce schema on TRAIN features before split/fit
X = enforce_schema(X, num_cols, cat_cols)

# Stratified 80/20 split with a fixed seed so the partition is repeatable.
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# enforce schema on splits (X was already cast above, so this repeats the same casts)
X_train = enforce_schema(X_train, num_cols, cat_cols)
X_val   = enforce_schema(X_val,   num_cols, cat_cols)

# Fit imputers/scaler/encoder on the training split only; validation is transformed with those fitted statistics.
X_train_csr = preprocessor.fit_transform(X_train)    # CSR matrix
X_val_csr   = preprocessor.transform(X_val)          # CSR matrix
# Width of the encoded feature matrix = input size of the network.
input_dim   = X_train_csr.shape[1]
print(f"TRAIN rows: {X_train_csr.shape[0]}, features: {input_dim}")
print("First TRAIN columns:", X.columns.tolist()[:12])

# -------------------- DataLoaders --------------------
# The datasets only hand out row indices (plus labels); the collators look the
# rows up in the sparse matrices and densify one batch at a time, so the full
# feature matrix is never converted to a dense array.
train_ds = IndexDataset(len(y_train), y_train)
val_ds   = IndexDataset(len(y_val),   y_val)

train_collate = CSRCollator(X_train_csr, y_train)
val_collate   = CSRCollator(X_val_csr,   y_val)

# Training batches are reshuffled every epoch; num_workers=0 loads in the main process.
train_loader = DataLoader(
    train_ds,
    batch_size=BATCH_SIZE,
    shuffle=True,
    num_workers=0,
    collate_fn=train_collate,
)

# Validation keeps row order and uses a batch twice as large, capped at 1024.
val_loader = DataLoader(
    val_ds,
    batch_size=min(1024, BATCH_SIZE * 2),
    shuffle=False,
    num_workers=0,
    collate_fn=val_collate,
)

# -------------------- Train --------------------
# NOTE(review): no torch seed is set in the visible code, so weight
# initialisation and batch order differ between runs even though the
# train/validation split itself is seeded.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = SimpleMLP(input_dim=input_dim, hidden_dim=HIDDEN_DIM, dropout=DROPOUT).to(device)

# Positive examples are weighted by neg/pos in the loss to offset class
# imbalance; max(pos, 1) avoids dividing by zero when there are no positives.
pos = int(y_train.sum()); neg = len(y_train) - pos
pos_weight = torch.tensor([(neg / max(pos, 1))], dtype=torch.float32).to(device)
criterion = nn.BCEWithLogitsLoss(pos_weight=pos_weight)
optimizer = torch.optim.Adam(model.parameters(), lr=LR, weight_decay=WEIGHT_DECAY)

# best_auc starts below any valid AUC so the first non-NaN epoch is always recorded.
best_auc, best_state, y_true, y_prob = -1.0, None, None, None
for epoch in range(1, EPOCHS + 1):
    model.train()
    ep_losses = []
    for xb, yb in train_loader:
        xb, yb = xb.to(device), yb.to(device)
        optimizer.zero_grad()
        logits = model(xb)
        loss = criterion(logits, yb)
        loss.backward()
        optimizer.step()
        ep_losses.append(loss.item())

    # evaluate() switches the model to eval mode; model.train() above restores it next epoch.
    val_loss, val_auc, y_true, y_prob = evaluate(model, val_loader, criterion, device)
    if not math.isnan(val_auc) and val_auc > best_auc:
        best_auc = val_auc
        # Snapshot a CPU copy of the weights so later epochs cannot overwrite the best ones.
        best_state = {k: v.cpu().clone() for k, v in model.state_dict().items()}

    print(f"epoch {epoch:03d} | train_loss={np.mean(ep_losses):.4f} | val_loss={val_loss:.4f} | val_auc={val_auc:.4f}")

# Restore the best-AUC weights and re-score validation so the metrics below
# describe the model that gets saved, not the last epoch.
if best_state is not None:
    model.load_state_dict(best_state)
    _, _, y_true, y_prob = evaluate(model, val_loader, criterion, device)

# Metrics @ threshold
# NOTE(review): with EPOCHS = 0 y_prob is still None here and the next line fails.
y_pred = (y_prob >= THRESHOLD).astype(int)
acc = accuracy_score(y_true, y_pred)
prec, rec, f1, _ = precision_recall_fscore_support(y_true, y_pred, average="binary", zero_division=0)
cm = confusion_matrix(y_true, y_pred)
# best_auc is still -1.0 if no epoch produced a non-NaN AUC.
print("\nBest Val AUC:", f"{best_auc:.4f}")
print("Acc/Prec/Rec/F1:", f"{acc:.4f}", f"{prec:.4f}", f"{rec:.4f}", f"{f1:.4f}")
print("Confusion matrix:\n", cm)

# Save artifacts
# Writes three files into ARTIFACTS: the model weights (state_dict only), the
# fitted preprocessor, and the validation metrics as JSON.
# NOTE(review): num_cols, cat_cols and train_feature_cols, which the test-time
# alignment below relies on, are not written out here — confirm whether a
# separate inference script needs them.
import joblib  # already imported earlier in this file; repeating it is harmless
torch.save(model.state_dict(), os.path.join(ARTIFACTS, "simple_mlp.pt"))
joblib.dump(preprocessor, os.path.join(ARTIFACTS, "preprocessor.joblib"))
with open(os.path.join(ARTIFACTS, "metrics.json"), "w", encoding="utf-8") as f:
    # float(...) converts NumPy scalars to plain floats that json can serialise.
    json.dump({"val_auc": float(best_auc), "acc": float(acc), "prec": float(prec),
               "rec": float(rec), "f1": float(f1)}, f, indent=2)
print(f"Saved model & preprocessor to: {ARTIFACTS}")

# -------------------- LOAD TEST (normalize + coerce + align + enforce schema) --------------------
# Same loading and cleaning steps as for the training file.
df_test = pd.read_csv(TEST_CSV, engine="python", skiprows=SKIPROWS)
df_test = normalize_columns(df_test)
df_test = coerce_numericish_cols(df_test)

# Add any training feature the test file lacks as an all-NaN column (the imputers fill it later).
df_test_aligned = add_missing_columns(df_test, train_feature_cols)
# Cast test columns to the numeric/categorical types decided on the training data before transforming.
df_test_aligned = enforce_schema(df_test_aligned, num_cols, cat_cols)

# Uses the statistics fitted on the training split; nothing is re-fitted on test data.
X_test_csr = preprocessor.transform(df_test_aligned)

class OnlyIndexDataset(Dataset):
    """Label-free dataset of ``n`` items in which item ``i`` is the integer ``i``."""

    def __init__(self, n):
        self.n = n

    def __len__(self):
        return self.n

    def __getitem__(self, i):
        return i

# Index-only dataset + label-free collator: each batch is a single tensor of features.
test_ds = OnlyIndexDataset(X_test_csr.shape[0])
test_collate = CSRCollator(X_test_csr, None)

# Row order is preserved (shuffle=False) so predictions line up with df_test_aligned;
# the batch is four times the training size, capped at 2048.
test_loader = DataLoader(
    test_ds,
    batch_size=min(2048, BATCH_SIZE * 4),
    shuffle=False,
    num_workers=0,
    collate_fn=test_collate,
)

probs = []
model.eval()
with torch.no_grad():
    for xb in test_loader:
        xb = xb.to(device)
        # The model returns logits; sigmoid maps them to probabilities.
        p = torch.sigmoid(model(xb)).cpu().numpy().ravel()
        probs.append(p)
probs = np.concatenate(probs)
preds = (probs >= THRESHOLD).astype(int)

# Use the "id" column as the row key when present, otherwise fall back to the first column.
# NOTE(review): the key is read after numeric coercion / schema enforcement, so
# identifiers that are not purely numeric may come out altered or as NaN — confirm.
id_col = "id" if "id" in df_test_aligned.columns else df_test_aligned.columns[0]
pred_df = pd.DataFrame({id_col: df_test_aligned[id_col], "bad_flag_pred_prob": probs, "bad_flag_pred": preds})
pred_path = os.path.join(ARTIFACTS, "predictions.csv")
pred_df.to_csv(pred_path, index=False)
print(f"Saved predictions to: {pred_path}")


TRAIN rows: 151565, features: 64596
First TRAIN columns: ['id', 'member_id', 'loan_amnt', 'term', 'int_rate', 'emp_length', 'home_ownership', 'annual_inc', 'desc', 'purpose', 'percent_bc_gt_75', 'bc_util']
epoch 001 | train_loss=1.1932 | val_loss=1.1720 | val_auc=0.7283
