
# PyTorch Tabular Binary Classifier (MLP) — Loan Default (`bad_flag`)

This notebook trains a **Multi-Layer Perceptron (MLP)** on your tabular dataset to predict `bad_flag` (binary target).

> **Data assumption:** Rows with `bad_flag` = NaN are treated as **test** rows for inference.



## 0) Requirements
Run this once to install dependencies (if needed):


In [19]:

# If running locally and you need packages, uncomment:
#!pip install torch scikit-learn pandas joblib openpyxl


## 1) Imports & Reproducibility

In [20]:

import os, json, random, math
from typing import List, Tuple

import numpy as np
import pandas as pd
import joblib

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import roc_auc_score, accuracy_score, precision_recall_fscore_support

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader

def set_seed(seed: int = 42):
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

SEED = 42
set_seed(SEED)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device


device(type='cpu')

## 2) Load Dataset

In [21]:

# Update the path if needed
DATA_PATH = r"C:\Users\luwil\Documents\misc_data\modeldata.xlsx"

df = pd.read_excel(DATA_PATH)
print(df.shape)
df.head()


(291962, 27)


Unnamed: 0.1,Unnamed: 0,id,member_id,loan_amnt,term,int_rate,emp_length,home_ownership,annual_inc,desc,...,total_bc_limit,mths_since_last_major_derog,tot_hi_cred_lim,tot_cur_bal,application_approved_flag,internal_score,bad_flag,emp_length_num,desc_length,has_desc
0,0,10000001,11983056,7550,36,16.24,3 years,RENT,28000.0,,...,4000.0,,3828.953801,5759.0,1,99,0.0,3.0,0,0
1,1,10000002,12002921,27050,36,10.99,10+ years,OWN,55000.0,Borrower added on 12/31/13 > Combining high ...,...,35700.0,,34359.94073,114834.0,1,353,0.0,10.0,95,1
2,2,10000003,11983096,12000,36,10.99,4 years,RENT,60000.0,Borrower added on 12/31/13 > I would like to...,...,18100.0,,16416.61776,7137.0,1,157,0.0,4.0,176,1
3,3,10000004,12003142,28000,36,7.62,5 years,MORTGAGE,325000.0,,...,42200.0,,38014.14976,799592.0,1,365,0.0,5.0,0,0
4,4,10000005,11993233,12000,36,13.53,10+ years,RENT,40000.0,,...,7000.0,53.0,6471.462236,13605.0,1,157,0.0,10.0,0,0


## 3) Preprocessing Utilities - Data cleaning to drop unneeded columns, impute missing values, categorical one hot encoding, scaling numeric features

In [22]:

DROP_COLS_DEFAULT = ["Unnamed: 0", "member_id", "desc"]   # obvious non-features
POSSIBLE_DROP_IF_PRESENT = ["emp_length"]                 # keep emp_length_num instead if present

def preprocess(df: pd.DataFrame, target_col: str = "bad_flag"):
    df = df.copy()

    # Drop obvious columns if present
    for c in DROP_COLS_DEFAULT + POSSIBLE_DROP_IF_PRESENT:
        if c in df.columns:
            df.drop(columns=[c], inplace=True)

    # Identify test rows (bad_flag missing)
    is_test = df[target_col].isna() if target_col in df.columns else pd.Series([False] * len(df))
    train_df = df.loc[~is_test].copy()
    test_df  = df.loc[is_test].copy()

    # Target
    y = None
    if target_col in train_df.columns:
        y = train_df[target_col].astype(int)
        train_df.drop(columns=[target_col], inplace=True)
    if target_col in test_df.columns:
        test_df.drop(columns=[target_col], inplace=True)

    # Separate types
    cat_cols = train_df.select_dtypes(include=["object", "category"]).columns.tolist()
    num_cols = train_df.select_dtypes(include=["number"]).columns.tolist()

    # Keep ID for output but not as feature
    id_col = None
    for cand in ["id", "ID", "Id"]:
        if cand in train_df.columns:
            id_col = cand
            if cand in num_cols: 
                num_cols.remove(cand)
            break

    # Impute missing
    for c in cat_cols:
        train_df[c] = train_df[c].fillna("Unknown")
    for c in num_cols:
        med = train_df[c].median() # we use median here as it is robust to outliers.
        train_df[c] = train_df[c].fillna(med)
    # impute missing using the training portion of the dataset to prevent data leakage
    if len(test_df) > 0:
        for c in cat_cols:
            if c in test_df.columns:
                test_df[c] = test_df[c].fillna("Unknown")
        for c in num_cols:
            if c in test_df.columns:
                med = train_df[c].median()
                test_df[c] = test_df[c].fillna(med)

    # One-hot encode categoricals across combined frame to align columns
    combined = pd.concat([train_df, test_df], axis=0, sort=False)
    combined = pd.get_dummies(combined, columns=cat_cols, drop_first=True)

    # Split back
    X_all   = combined
    X_train = X_all.iloc[: len(train_df)].copy()
    X_test  = X_all.iloc[len(train_df):].copy()

    # Scale numeric columns only
    scaler = StandardScaler()
    if num_cols:
        scaler.fit(X_train[num_cols].values)
        X_train.loc[:, num_cols] = scaler.transform(X_train[num_cols].values)
        if len(X_test) > 0:
            X_test.loc[:, num_cols]  = scaler.transform(X_test[num_cols].values)

    feature_cols = X_train.columns.tolist()
    return X_train, y, X_test, feature_cols, scaler, id_col
 

## 4) Run Preprocessing

In [23]:

X_train_df, y, X_test_df, feature_cols, scaler, id_col = preprocess(df, target_col="bad_flag")
print("Train shape:", X_train_df.shape, "Test shape:", X_test_df.shape, "Features:", len(feature_cols))
y.value_counts(normalize=True).rename("class_ratio")


Train shape: (189457, 36) Test shape: (102505, 36) Features: 36


  1.6822226 ]' has dtype incompatible with int64, please explicitly cast to a compatible dtype first.
  X_train.loc[:, num_cols] = scaler.transform(X_train[num_cols].values)
  1.80069231]' has dtype incompatible with int64, please explicitly cast to a compatible dtype first.
  X_train.loc[:, num_cols] = scaler.transform(X_train[num_cols].values)
  1.15830436]' has dtype incompatible with int64, please explicitly cast to a compatible dtype first.
  X_train.loc[:, num_cols] = scaler.transform(X_train[num_cols].values)
  1.6821323 ]' has dtype incompatible with int64, please explicitly cast to a compatible dtype first.
  X_train.loc[:, num_cols] = scaler.transform(X_train[num_cols].values)
  1.04198828]' has dtype incompatible with int64, please explicitly cast to a compatible dtype first.
  X_train.loc[:, num_cols] = scaler.transform(X_train[num_cols].values)
  1.14470019]' has dtype incompatible with int64, please explicitly cast to a compatible dtype first.
  X_train.loc[:, num_cols] =

bad_flag
0    0.930707
1    0.069293
Name: class_ratio, dtype: float64

### define MLP class and dataset type class

In [24]:
#Tabular dataset is a function to make our dataset able to be used with pytorch's dataloader function
class TabularDataset(Dataset):
    def __init__(self, X: np.ndarray, y: np.ndarray | None = None):
        self.X = torch.tensor(X, dtype=torch.float32)
        self.y = None if y is None else torch.tensor(y, dtype=torch.float32)

    def __len__(self):
        return self.X.shape[0]

    def __getitem__(self, idx):
        if self.y is None:
            return self.X[idx]
        return self.X[idx], self.y[idx]

class MLP(nn.Module):
    def __init__(self, in_dim: int, hidden: List[int], dropout: float = 0.2, use_bn: bool = True):
        super().__init__()
        layers: List[nn.Module] = []
        prev = in_dim
        for h in hidden:
            layers.append(nn.Linear(prev, h))
            if use_bn:
                layers.append(nn.BatchNorm1d(h))
            layers.append(nn.ReLU(inplace=True))
            layers.append(nn.Dropout(p=dropout))
            prev = h
        layers.append(nn.Linear(prev, 1))  # binary logit
        self.net = nn.Sequential(*layers)
    
    def forward(self, x):
        return self.net(x).squeeze(-1)  # logits

#%% md
def train_one_epoch(model, loader, criterion, optimizer, device):
    model.train()
    running_loss = 0.0
    for Xb, yb in loader:
        Xb = Xb.to(device)
        yb = yb.to(device)
        optimizer.zero_grad()
        logits = model(Xb)
        loss = criterion(logits, yb)
        loss.backward()
        optimizer.step()
        running_loss += loss.item() * Xb.size(0)
    return running_loss / len(loader.dataset)

@torch.no_grad()
def evaluate(model, loader, criterion, device):
    model.eval()
    all_logits = []
    all_targets = []
    running_loss = 0.0
    for Xb, yb in loader:
        Xb = Xb.to(device)
        yb = yb.to(device)
        logits = model(Xb)
        loss = criterion(logits, yb)
        running_loss += loss.item() * Xb.size(0)
        all_logits.append(logits.detach().cpu().numpy())
        all_targets.append(yb.detach().cpu().numpy())
    avg_loss = running_loss / len(loader.dataset)

    logits = np.concatenate(all_logits)
    targets = np.concatenate(all_targets).astype(int)
    probs = 1 / (1 + np.exp(-logits))
    preds = (probs >= 0.5).astype(int)

    try:
        auc = roc_auc_score(targets, probs)
    except ValueError:
        auc = float('nan')

    acc = accuracy_score(targets, preds)
    pr, rc, f1, _ = precision_recall_fscore_support(targets, preds, average="binary", zero_division=0)
    return {"loss": avg_loss, "auc": auc, "acc": acc, "precision": pr, "recall": rc, "f1": f1}

class EarlyStopper:
    def __init__(self, patience: int = 5, mode: str = "max"):
        self.patience = patience
        self.mode = mode
        self.best = -float("inf") if mode == "max" else float("inf")
        self.count = 0
        self.best_state = None

    def step(self, metric_value, model):
        improved = (metric_value > self.best) if self.mode == "max" else (metric_value < self.best)
        if improved:
            self.best = metric_value
            self.count = 0
            self.best_state = {k: v.cpu().clone() for k, v in model.state_dict().items()}
            return True
        else:
            self.count += 1
            return False

    def should_stop(self):
        return self.count >= self.patience


### create train/validation split data

In [25]:
X_train, X_val, y_train, y_val = train_test_split(
    X_train_df.astype(np.float32).values, 
    y.astype(np.float32).values,
    test_size=0.2,
    random_state=SEED,
    stratify=y.values
)

train_ds = TabularDataset(X_train, y_train)
val_ds   = TabularDataset(X_val, y_val)

train_loader = DataLoader(train_ds, batch_size=512, shuffle=True, num_workers=0)
val_loader   = DataLoader(val_ds, batch_size=512, shuffle=False, num_workers=0)

### train the model - iteration 1

In [26]:
hidden = [128]     # starting with a fairly commonly used MLP architecture of two hidden layers
dropout = 0.2 # regularization parameter
lr = .0005 # learning rate
weight_decay = .0001 #adds L2 regularization to prevent overfitting.
epochs = 46 #max # of training iterations
patience = 6 #early stopping parameter

model = MLP(in_dim=X_train.shape[1], hidden=hidden, dropout=dropout, use_bn=True).to(device)

# class imbalance handling
pos = (y_train == 1).sum()
neg = (y_train == 0).sum()
pos_weight = torch.tensor([neg / max(pos, 1)], dtype=torch.float32).to(device)

criterion = nn.BCEWithLogitsLoss(pos_weight=pos_weight)
optimizer = torch.optim.AdamW(model.parameters(), lr=lr, weight_decay=weight_decay)
early = EarlyStopper(patience=patience, mode="max")

best_epoch = -1
for epoch in range(1, epochs + 1):
    tr_loss = train_one_epoch(model, train_loader, criterion, optimizer, device)
    val_metrics = evaluate(model, val_loader, criterion, device)

    print(f"Epoch {epoch:03d} | loss {tr_loss:.4f} | val_loss {val_metrics['loss']:.4f} | "
          f"AUC {val_metrics['auc']:.4f} | acc {val_metrics['acc']:.4f} | "
          f"prec {val_metrics['precision']:.4f} | rec {val_metrics['recall']:.4f} | f1 {val_metrics['f1']:.4f}")

    improved = early.step(val_metrics["auc"], model)
    if improved:
        best_epoch = epoch
    if early.should_stop():
        print(f"Early stopping at epoch {epoch}. Best epoch: {best_epoch} (AUC={early.best:.4f})")
        break

# Restore best weights
if early.best_state is not None:
    model.load_state_dict(early.best_state)

best_epoch, early.best

Epoch 001 | loss 1.2296 | val_loss 2.7893 | AUC 0.6663 | acc 0.9307 | prec 0.0000 | rec 0.0000 | f1 0.0000
Epoch 002 | loss 1.2249 | val_loss 18.7494 | AUC 0.6666 | acc 0.9307 | prec 0.0000 | rec 0.0000 | f1 0.0000
Epoch 003 | loss 1.2232 | val_loss 2.1642 | AUC 0.6673 | acc 0.9307 | prec 0.0000 | rec 0.0000 | f1 0.0000
Epoch 004 | loss 1.2222 | val_loss 1.6960 | AUC 0.6672 | acc 0.0693 | prec 0.0693 | rec 1.0000 | f1 0.1296
Epoch 005 | loss 1.2207 | val_loss 4.3912 | AUC 0.6675 | acc 0.9307 | prec 0.0000 | rec 0.0000 | f1 0.0000
Epoch 006 | loss 1.2203 | val_loss 1.5591 | AUC 0.6680 | acc 0.9307 | prec 0.0000 | rec 0.0000 | f1 0.0000
Epoch 007 | loss 1.2208 | val_loss 2.3255 | AUC 0.6676 | acc 0.9307 | prec 0.0000 | rec 0.0000 | f1 0.0000
Epoch 008 | loss 1.2189 | val_loss 1.3380 | AUC 0.6677 | acc 0.9229 | prec 0.1387 | rec 0.0217 | f1 0.0375
Epoch 009 | loss 1.2199 | val_loss 1.3250 | AUC 0.6675 | acc 0.8403 | prec 0.1442 | rec 0.2643 | f1 0.1866
Epoch 010 | loss 1.2199 | val_loss 1

(11, np.float64(0.6689239140815084))

### grid search cross validation - instead of grid search CV on just 1 train/val split we could do a k-fold cross validation instead which has many splits and makes our model fitting more consistent and stable. However, it takes a lot more time

In [22]:
import itertools, numpy as np, torch, torch.nn as nn
from torch.utils.data import DataLoader

def run_once(hidden, dropout, lr, weight_decay, batch_size, use_bn=True, epochs=50, patience=6):
    # Datasets/loaders (rebuild per batch size)
    train_ds = TabularDataset(X_train, y_train.astype(np.float32))
    val_ds   = TabularDataset(X_val,   y_val.astype(np.float32))
    train_loader = DataLoader(train_ds, batch_size=batch_size, shuffle=True, num_workers=0)
    val_loader   = DataLoader(val_ds,   batch_size=batch_size, shuffle=False, num_workers=0)

    # Model
    model = MLP(in_dim=X_train.shape[1], hidden=hidden, dropout=dropout, use_bn=use_bn).to(device)

    # Class imbalance
    pos = (y_train == 1).sum()
    neg = (y_train == 0).sum()
    pos_weight = torch.tensor([neg / max(pos, 1)], dtype=torch.float32).to(device)

    criterion = nn.BCEWithLogitsLoss(pos_weight=pos_weight)
    optimizer = torch.optim.AdamW(model.parameters(), lr=lr, weight_decay=weight_decay)
    early = EarlyStopper(patience=patience, mode="max")

    best_epoch = -1
    best_auc = float("-inf")
    for epoch in range(1, epochs + 1):
        tr_loss = train_one_epoch(model, train_loader, criterion, optimizer, device)
        val_metrics = evaluate(model, val_loader, criterion, device)
        auc = val_metrics["auc"]
        if early.step(auc, model):
            best_epoch = epoch
            best_auc = auc
        if early.should_stop():
            break

    # restore best weights
    if early.best_state is not None:
        model.load_state_dict(early.best_state)

    return best_auc, best_epoch, model

# ------- Define search space -------
param_grid = {
    "hidden":       [[128], [128,64], [256,128,64]],  # [] = logistic regression baseline
    "dropout":      [0.2, 0.3, 0.4],
    "lr":           [5e-4, 1e-3],
    "weight_decay": [1e-5, 1e-4],
    "batch_size":   [256, 512],
    "use_bn":       [True, False],
}

results = []
for hidden, dropout, lr, weight_decay, batch_size, use_bn in itertools.product(
    param_grid["hidden"],
    param_grid["dropout"],
    param_grid["lr"],
    param_grid["weight_decay"],
    param_grid["batch_size"],
    param_grid["use_bn"]
):
    auc, best_ep, _ = run_once(hidden, dropout, lr, weight_decay, batch_size, use_bn=use_bn, epochs=50, patience=6)
    results.append({
        "hidden": hidden, "dropout": dropout, "lr": lr,
        "weight_decay": weight_decay, "batch_size": batch_size, "use_bn": use_bn,
        "best_auc": auc, "best_epoch": best_ep
    })
    print(f"Tried hidden={hidden}, drop={dropout}, lr={lr}, wd={weight_decay}, bs={batch_size}, bn={use_bn} -> AUC={auc:.4f} @ epoch {best_ep}")

# Pick best
results_sorted = sorted(results, key=lambda d: d["best_auc"], reverse=True)
best = results_sorted[0]
best


Tried hidden=[128], drop=0.2, lr=0.0005, wd=1e-05, bs=256, bn=True -> AUC=0.6691 @ epoch 3


  probs = 1 / (1 + np.exp(-logits))
  probs = 1 / (1 + np.exp(-logits))
  probs = 1 / (1 + np.exp(-logits))


Tried hidden=[128], drop=0.2, lr=0.0005, wd=1e-05, bs=256, bn=False -> AUC=0.5000 @ epoch 2
Tried hidden=[128], drop=0.2, lr=0.0005, wd=1e-05, bs=512, bn=True -> AUC=0.6711 @ epoch 22


  probs = 1 / (1 + np.exp(-logits))
  probs = 1 / (1 + np.exp(-logits))
  probs = 1 / (1 + np.exp(-logits))
  probs = 1 / (1 + np.exp(-logits))


Tried hidden=[128], drop=0.2, lr=0.0005, wd=1e-05, bs=512, bn=False -> AUC=0.5000 @ epoch 1


  probs = 1 / (1 + np.exp(-logits))


Tried hidden=[128], drop=0.2, lr=0.0005, wd=0.0001, bs=256, bn=True -> AUC=0.7167 @ epoch 46


  probs = 1 / (1 + np.exp(-logits))
  probs = 1 / (1 + np.exp(-logits))
  probs = 1 / (1 + np.exp(-logits))
  probs = 1 / (1 + np.exp(-logits))


Tried hidden=[128], drop=0.2, lr=0.0005, wd=0.0001, bs=256, bn=False -> AUC=0.5000 @ epoch 1
Tried hidden=[128], drop=0.2, lr=0.0005, wd=0.0001, bs=512, bn=True -> AUC=0.6730 @ epoch 13


  probs = 1 / (1 + np.exp(-logits))
  probs = 1 / (1 + np.exp(-logits))
  probs = 1 / (1 + np.exp(-logits))


Tried hidden=[128], drop=0.2, lr=0.0005, wd=0.0001, bs=512, bn=False -> AUC=0.5000 @ epoch 1
Tried hidden=[128], drop=0.2, lr=0.001, wd=1e-05, bs=256, bn=True -> AUC=0.6677 @ epoch 7


  probs = 1 / (1 + np.exp(-logits))
  probs = 1 / (1 + np.exp(-logits))


Tried hidden=[128], drop=0.2, lr=0.001, wd=1e-05, bs=256, bn=False -> AUC=0.5000 @ epoch 1
Tried hidden=[128], drop=0.2, lr=0.001, wd=1e-05, bs=512, bn=True -> AUC=0.6681 @ epoch 9


  probs = 1 / (1 + np.exp(-logits))
  probs = 1 / (1 + np.exp(-logits))


Tried hidden=[128], drop=0.2, lr=0.001, wd=1e-05, bs=512, bn=False -> AUC=0.5000 @ epoch 1
Tried hidden=[128], drop=0.2, lr=0.001, wd=0.0001, bs=256, bn=True -> AUC=0.6675 @ epoch 4


  probs = 1 / (1 + np.exp(-logits))
  probs = 1 / (1 + np.exp(-logits))
  probs = 1 / (1 + np.exp(-logits))


Tried hidden=[128], drop=0.2, lr=0.001, wd=0.0001, bs=256, bn=False -> AUC=0.5000 @ epoch 1
Tried hidden=[128], drop=0.2, lr=0.001, wd=0.0001, bs=512, bn=True -> AUC=0.6675 @ epoch 4


  probs = 1 / (1 + np.exp(-logits))
  probs = 1 / (1 + np.exp(-logits))
  probs = 1 / (1 + np.exp(-logits))
  probs = 1 / (1 + np.exp(-logits))
  probs = 1 / (1 + np.exp(-logits))
  probs = 1 / (1 + np.exp(-logits))


Tried hidden=[128], drop=0.2, lr=0.001, wd=0.0001, bs=512, bn=False -> AUC=0.5000 @ epoch 1
Tried hidden=[128], drop=0.3, lr=0.0005, wd=1e-05, bs=256, bn=True -> AUC=0.6681 @ epoch 2


  probs = 1 / (1 + np.exp(-logits))
  probs = 1 / (1 + np.exp(-logits))
  probs = 1 / (1 + np.exp(-logits))
  probs = 1 / (1 + np.exp(-logits))
  probs = 1 / (1 + np.exp(-logits))
  probs = 1 / (1 + np.exp(-logits))


Tried hidden=[128], drop=0.3, lr=0.0005, wd=1e-05, bs=256, bn=False -> AUC=0.5000 @ epoch 1
Tried hidden=[128], drop=0.3, lr=0.0005, wd=1e-05, bs=512, bn=True -> AUC=0.6706 @ epoch 27


  probs = 1 / (1 + np.exp(-logits))
  probs = 1 / (1 + np.exp(-logits))


Tried hidden=[128], drop=0.3, lr=0.0005, wd=1e-05, bs=512, bn=False -> AUC=0.5000 @ epoch 1
Tried hidden=[128], drop=0.3, lr=0.0005, wd=0.0001, bs=256, bn=True -> AUC=0.6682 @ epoch 6


  probs = 1 / (1 + np.exp(-logits))
  probs = 1 / (1 + np.exp(-logits))
  probs = 1 / (1 + np.exp(-logits))


Tried hidden=[128], drop=0.3, lr=0.0005, wd=0.0001, bs=256, bn=False -> AUC=0.5000 @ epoch 1
Tried hidden=[128], drop=0.3, lr=0.0005, wd=0.0001, bs=512, bn=True -> AUC=0.6677 @ epoch 7


  probs = 1 / (1 + np.exp(-logits))
  probs = 1 / (1 + np.exp(-logits))
  probs = 1 / (1 + np.exp(-logits))
  probs = 1 / (1 + np.exp(-logits))


Tried hidden=[128], drop=0.3, lr=0.0005, wd=0.0001, bs=512, bn=False -> AUC=0.5000 @ epoch 1
Tried hidden=[128], drop=0.3, lr=0.001, wd=1e-05, bs=256, bn=True -> AUC=0.6772 @ epoch 33


  probs = 1 / (1 + np.exp(-logits))


Tried hidden=[128], drop=0.3, lr=0.001, wd=1e-05, bs=256, bn=False -> AUC=0.5000 @ epoch 1
Tried hidden=[128], drop=0.3, lr=0.001, wd=1e-05, bs=512, bn=True -> AUC=0.6671 @ epoch 3


  probs = 1 / (1 + np.exp(-logits))


Tried hidden=[128], drop=0.3, lr=0.001, wd=1e-05, bs=512, bn=False -> AUC=0.5000 @ epoch 2
Tried hidden=[128], drop=0.3, lr=0.001, wd=0.0001, bs=256, bn=True -> AUC=0.6670 @ epoch 2


  probs = 1 / (1 + np.exp(-logits))
  probs = 1 / (1 + np.exp(-logits))
  probs = 1 / (1 + np.exp(-logits))


Tried hidden=[128], drop=0.3, lr=0.001, wd=0.0001, bs=256, bn=False -> AUC=0.5000 @ epoch 1
Tried hidden=[128], drop=0.3, lr=0.001, wd=0.0001, bs=512, bn=True -> AUC=0.6673 @ epoch 1


  probs = 1 / (1 + np.exp(-logits))
  probs = 1 / (1 + np.exp(-logits))
  probs = 1 / (1 + np.exp(-logits))
  probs = 1 / (1 + np.exp(-logits))


Tried hidden=[128], drop=0.3, lr=0.001, wd=0.0001, bs=512, bn=False -> AUC=0.5000 @ epoch 1
Tried hidden=[128], drop=0.4, lr=0.0005, wd=1e-05, bs=256, bn=True -> AUC=0.6673 @ epoch 5


  probs = 1 / (1 + np.exp(-logits))
  probs = 1 / (1 + np.exp(-logits))


Tried hidden=[128], drop=0.4, lr=0.0005, wd=1e-05, bs=256, bn=False -> AUC=0.5000 @ epoch 1
Tried hidden=[128], drop=0.4, lr=0.0005, wd=1e-05, bs=512, bn=True -> AUC=0.6673 @ epoch 10


  probs = 1 / (1 + np.exp(-logits))
  probs = 1 / (1 + np.exp(-logits))
  probs = 1 / (1 + np.exp(-logits))
  probs = 1 / (1 + np.exp(-logits))
  probs = 1 / (1 + np.exp(-logits))


Tried hidden=[128], drop=0.4, lr=0.0005, wd=1e-05, bs=512, bn=False -> AUC=0.5000 @ epoch 1
Tried hidden=[128], drop=0.4, lr=0.0005, wd=0.0001, bs=256, bn=True -> AUC=0.6672 @ epoch 6


  probs = 1 / (1 + np.exp(-logits))
  probs = 1 / (1 + np.exp(-logits))
  probs = 1 / (1 + np.exp(-logits))
  probs = 1 / (1 + np.exp(-logits))


Tried hidden=[128], drop=0.4, lr=0.0005, wd=0.0001, bs=256, bn=False -> AUC=0.5000 @ epoch 1
Tried hidden=[128], drop=0.4, lr=0.0005, wd=0.0001, bs=512, bn=True -> AUC=0.6669 @ epoch 5


  probs = 1 / (1 + np.exp(-logits))
  probs = 1 / (1 + np.exp(-logits))


Tried hidden=[128], drop=0.4, lr=0.0005, wd=0.0001, bs=512, bn=False -> AUC=0.5000 @ epoch 1
Tried hidden=[128], drop=0.4, lr=0.001, wd=1e-05, bs=256, bn=True -> AUC=0.6670 @ epoch 11


  probs = 1 / (1 + np.exp(-logits))


Tried hidden=[128], drop=0.4, lr=0.001, wd=1e-05, bs=256, bn=False -> AUC=0.6664 @ epoch 8
Tried hidden=[128], drop=0.4, lr=0.001, wd=1e-05, bs=512, bn=True -> AUC=0.6666 @ epoch 6


  probs = 1 / (1 + np.exp(-logits))
  probs = 1 / (1 + np.exp(-logits))
  probs = 1 / (1 + np.exp(-logits))
  probs = 1 / (1 + np.exp(-logits))
  probs = 1 / (1 + np.exp(-logits))


Tried hidden=[128], drop=0.4, lr=0.001, wd=1e-05, bs=512, bn=False -> AUC=0.5000 @ epoch 1
Tried hidden=[128], drop=0.4, lr=0.001, wd=0.0001, bs=256, bn=True -> AUC=0.6670 @ epoch 15


  probs = 1 / (1 + np.exp(-logits))
  probs = 1 / (1 + np.exp(-logits))
  probs = 1 / (1 + np.exp(-logits))


Tried hidden=[128], drop=0.4, lr=0.001, wd=0.0001, bs=256, bn=False -> AUC=0.5000 @ epoch 1
Tried hidden=[128], drop=0.4, lr=0.001, wd=0.0001, bs=512, bn=True -> AUC=0.6666 @ epoch 4


  probs = 1 / (1 + np.exp(-logits))
  probs = 1 / (1 + np.exp(-logits))


Tried hidden=[128], drop=0.4, lr=0.001, wd=0.0001, bs=512, bn=False -> AUC=0.5000 @ epoch 1
Tried hidden=[128, 64], drop=0.2, lr=0.0005, wd=1e-05, bs=256, bn=True -> AUC=0.6826 @ epoch 25
Tried hidden=[128, 64], drop=0.2, lr=0.0005, wd=1e-05, bs=256, bn=False -> AUC=0.5000 @ epoch 1
Tried hidden=[128, 64], drop=0.2, lr=0.0005, wd=1e-05, bs=512, bn=True -> AUC=0.6676 @ epoch 6
Tried hidden=[128, 64], drop=0.2, lr=0.0005, wd=1e-05, bs=512, bn=False -> AUC=0.5000 @ epoch 3
Tried hidden=[128, 64], drop=0.2, lr=0.0005, wd=0.0001, bs=256, bn=True -> AUC=0.6690 @ epoch 10
Tried hidden=[128, 64], drop=0.2, lr=0.0005, wd=0.0001, bs=256, bn=False -> AUC=0.6660 @ epoch 1
Tried hidden=[128, 64], drop=0.2, lr=0.0005, wd=0.0001, bs=512, bn=True -> AUC=0.6689 @ epoch 20
Tried hidden=[128, 64], drop=0.2, lr=0.0005, wd=0.0001, bs=512, bn=False -> AUC=0.5000 @ epoch 1


  probs = 1 / (1 + np.exp(-logits))


Tried hidden=[128, 64], drop=0.2, lr=0.001, wd=1e-05, bs=256, bn=True -> AUC=0.6719 @ epoch 21
Tried hidden=[128, 64], drop=0.2, lr=0.001, wd=1e-05, bs=256, bn=False -> AUC=0.5000 @ epoch 1
Tried hidden=[128, 64], drop=0.2, lr=0.001, wd=1e-05, bs=512, bn=True -> AUC=0.6667 @ epoch 6
Tried hidden=[128, 64], drop=0.2, lr=0.001, wd=1e-05, bs=512, bn=False -> AUC=0.5000 @ epoch 2
Tried hidden=[128, 64], drop=0.2, lr=0.001, wd=0.0001, bs=256, bn=True -> AUC=0.6764 @ epoch 20
Tried hidden=[128, 64], drop=0.2, lr=0.001, wd=0.0001, bs=256, bn=False -> AUC=0.5000 @ epoch 1


  probs = 1 / (1 + np.exp(-logits))


Tried hidden=[128, 64], drop=0.2, lr=0.001, wd=0.0001, bs=512, bn=True -> AUC=0.6935 @ epoch 45
Tried hidden=[128, 64], drop=0.2, lr=0.001, wd=0.0001, bs=512, bn=False -> AUC=0.5000 @ epoch 1
Tried hidden=[128, 64], drop=0.3, lr=0.0005, wd=1e-05, bs=256, bn=True -> AUC=0.6671 @ epoch 5
Tried hidden=[128, 64], drop=0.3, lr=0.0005, wd=1e-05, bs=256, bn=False -> AUC=0.5000 @ epoch 1
Tried hidden=[128, 64], drop=0.3, lr=0.0005, wd=1e-05, bs=512, bn=True -> AUC=0.6670 @ epoch 8
Tried hidden=[128, 64], drop=0.3, lr=0.0005, wd=1e-05, bs=512, bn=False -> AUC=0.6661 @ epoch 1
Tried hidden=[128, 64], drop=0.3, lr=0.0005, wd=0.0001, bs=256, bn=True -> AUC=0.6695 @ epoch 11
Tried hidden=[128, 64], drop=0.3, lr=0.0005, wd=0.0001, bs=256, bn=False -> AUC=0.5000 @ epoch 1
Tried hidden=[128, 64], drop=0.3, lr=0.0005, wd=0.0001, bs=512, bn=True -> AUC=0.6691 @ epoch 24
Tried hidden=[128, 64], drop=0.3, lr=0.0005, wd=0.0001, bs=512, bn=False -> AUC=0.5000 @ epoch 1
Tried hidden=[128, 64], drop=0.3, lr=0

{'hidden': [128],
 'dropout': 0.2,
 'lr': 0.0005,
 'weight_decay': 0.0001,
 'batch_size': 256,
 'use_bn': True,
 'best_auc': np.float64(0.7166818492156812),
 'best_epoch': 46}

### We try several combinations of each hyperparameter. I belive the most important one we are tuning here is the number of hidden layers. With more hidden layers being able to detect more subtle relationships. Less hidden layers being less prone to overfitting but may miss those subtle relationships.
Below is the final model with hyperparameters set from the best AUC model from grid search cross validation.

In [28]:
# --- hyperparams ---
hidden = [128]          # one hidden layer (128 units)
dropout = 0.2           # regularization parameter
lr = 5e-4               # learning rate
weight_decay = 1e-4     # L2 regularization
epochs = 46             # max training epochs
patience = 6            # early stopping patience
batch_size = 256        # << set batch size to 256
use_bn = True           # << enable BatchNorm

# --- (re)build DataLoaders with batch_size=256 ---
from torch.utils.data import DataLoader
train_ds = TabularDataset(X_train.astype(np.float32), y_train.astype(np.float32))
val_ds   = TabularDataset(X_val.astype(np.float32),   y_val.astype(np.float32))
train_loader = DataLoader(train_ds, batch_size=batch_size, shuffle=True,  num_workers=0)
val_loader   = DataLoader(val_ds,   batch_size=batch_size, shuffle=False, num_workers=0)

# --- model / loss / optim ---
model = MLP(in_dim=X_train.shape[1], hidden=hidden, dropout=dropout, use_bn=use_bn).to(device)

# class imbalance handling
pos = (y_train == 1).sum()
neg = (y_train == 0).sum()
pos_weight = torch.tensor([neg / max(pos, 1)], dtype=torch.float32).to(device)

criterion = nn.BCEWithLogitsLoss(pos_weight=pos_weight)
optimizer = torch.optim.AdamW(model.parameters(), lr=lr, weight_decay=weight_decay)
early = EarlyStopper(patience=patience, mode="max")

# --- training loop ---
best_epoch = -1
for epoch in range(1, epochs + 1):
    tr_loss = train_one_epoch(model, train_loader, criterion, optimizer, device)
    val_metrics = evaluate(model, val_loader, criterion, device)

    print(f"Epoch {epoch:03d} | loss {tr_loss:.4f} | val_loss {val_metrics['loss']:.4f} | "
          f"AUC {val_metrics['auc']:.4f} | acc {val_metrics['acc']:.4f} | "
          f"prec {val_metrics['precision']:.4f} | rec {val_metrics['recall']:.4f} | f1 {val_metrics['f1']:.4f}")

    if early.step(val_metrics["auc"], model):
        best_epoch = epoch
    if early.should_stop():
        print(f"Early stopping at epoch {epoch}. Best epoch: {best_epoch} (AUC={early.best:.4f})")
        break

# restore best weights
if early.best_state is not None:
    model.load_state_dict(early.best_state)

best_epoch, early.best


Epoch 001 | loss 1.2262 | val_loss 2.0119 | AUC 0.6667 | acc 0.9307 | prec 0.0000 | rec 0.0000 | f1 0.0000
Epoch 002 | loss 1.2244 | val_loss 5.2809 | AUC 0.6679 | acc 0.9307 | prec 0.0000 | rec 0.0000 | f1 0.0000
Epoch 003 | loss 1.2236 | val_loss 3.0926 | AUC 0.6675 | acc 0.9307 | prec 0.0000 | rec 0.0000 | f1 0.0000
Epoch 004 | loss 1.2213 | val_loss 1.3971 | AUC 0.6677 | acc 0.9307 | prec 0.0000 | rec 0.0000 | f1 0.0000
Epoch 005 | loss 1.2212 | val_loss 2.9499 | AUC 0.6675 | acc 0.0693 | prec 0.0693 | rec 1.0000 | f1 0.1296
Epoch 006 | loss 1.2203 | val_loss 4.9679 | AUC 0.6675 | acc 0.9307 | prec 0.0000 | rec 0.0000 | f1 0.0000
Epoch 007 | loss 1.2217 | val_loss 1.4119 | AUC 0.6676 | acc 0.9302 | prec 0.1600 | rec 0.0015 | f1 0.0030
Epoch 008 | loss 1.2199 | val_loss 2.2833 | AUC 0.6678 | acc 0.9307 | prec 0.0000 | rec 0.0000 | f1 0.0000
Early stopping at epoch 8. Best epoch: 2 (AUC=0.6679)


(2, np.float64(0.6678843066657066))

### Our AUC is .667 which is around what we got from iteration 1. Grid search CV had previously shown best AUC around .71. The model doesn't seem very good. 

In [29]:
from pathlib import Path
ARTIF_DIR = Path(r"C:\Users\luwil\Documents\LoanProject\MLP")  # change if you prefer
ARTIF_DIR.mkdir(parents=True, exist_ok=True)  # <-- creates folder if missing

model_path  = os.path.join(ARTIF_DIR, "nn_model.pt")
feats_path  = os.path.join(ARTIF_DIR, "nn_features.json")
scaler_path = os.path.join(ARTIF_DIR, "nn_scaler.pkl")

torch.save(model.state_dict(), model_path)
with open(feats_path, "w") as f:
    json.dump({"feature_cols": feature_cols}, f, indent=2)
joblib.dump(scaler, scaler_path)

model_path, feats_path, scaler_path

('C:\\Users\\luwil\\Documents\\LoanProject\\MLP\\nn_model.pt',
 'C:\\Users\\luwil\\Documents\\LoanProject\\MLP\\nn_features.json',
 'C:\\Users\\luwil\\Documents\\LoanProject\\MLP\\nn_scaler.pkl')

In [39]:
pred_path = os.path.join(ARTIF_DIR, "nn_predictions.csv")

if len(X_test_df) > 0:
    X_np = (
    X_test_df.apply(pd.to_numeric, errors="coerce")
             .fillna(0.0)              # or another fill policy
             .to_numpy(dtype=np.float32)
    )
    
    test_ds = TabularDataset(X_np)
    test_ds = TabularDataset(X_test_df.values)
    test_loader = DataLoader(test_ds, batch_size=512, shuffle=False, num_workers=0)

    model.eval()
    all_logits = []
    with torch.no_grad():
        for Xb in test_loader:
            Xb = Xb.to(device)
            logits = model(Xb)
            all_logits.append(logits.cpu().numpy())

    probs = 1 / (1 + np.exp(-np.concatenate(all_logits)))
    out = pd.DataFrame({"bad_flag_pred_prob": probs})
    if "bad_flag" in df.columns:
        mask = df["bad_flag"].isna()
        if 'id' in df.columns:
            out['id'] = df.loc[mask, 'id'].values
            out = out[['id', 'bad_flag_pred_prob']]
    out.to_csv(pred_path, index=False)
    print(f"Wrote predictions to {pred_path}")
else:
    print("No test rows detected (no NaN target rows). Skipping inference.")

pred_path

TypeError: can't convert np.ndarray of type numpy.object_. The only supported types are: float64, float32, float16, complex64, complex128, int64, int32, int16, int8, uint64, uint32, uint16, uint8, and bool.

In [37]:
# --- Load artifacts ---
with open(feats_path, "r") as f:
    feature_cols = json.load(f)["feature_cols"]
scaler = joblib.load(scaler_path)

if hasattr(scaler, "feature_names_in_"):
    expected_cols = list(scaler.feature_names_in_)
else:
    # fall back to your saved list if scaler doesn't have names
    expected_cols = list(feature_cols)

In [38]:
expected_cols

['id',
 'loan_amnt',
 'term',
 'int_rate',
 'annual_inc',
 'percent_bc_gt_75',
 'bc_util',
 'dti',
 'inq_last_6mths',
 'mths_since_recent_inq',
 'revol_util',
 'total_bc_limit',
 'mths_since_last_major_derog',
 'tot_hi_cred_lim',
 'tot_cur_bal',
 'application_approved_flag',
 'internal_score',
 'emp_length_num',
 'desc_length',
 'has_desc',
 'home_ownership_NONE',
 'home_ownership_OTHER',
 'home_ownership_OWN',
 'home_ownership_RENT',
 'purpose_credit_card',
 'purpose_debt_consolidation',
 'purpose_home_improvement',
 'purpose_house',
 'purpose_major_purchase',
 'purpose_medical',
 'purpose_moving',
 'purpose_other',
 'purpose_renewable_energy',
 'purpose_small_business',
 'purpose_vacation',
 'purpose_wedding']