In [17]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import roc_auc_score, precision_recall_curve, f1_score, accuracy_score, precision_score, recall_score
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import TensorDataset, DataLoader

## 1. load the combined train + test and cleaned data

In [4]:
df = pd.read_hdf('C:/Users/xiaow/work/jp_awm/interview/Take Home Project/combined_train_test_cleaned.hdf5')
df.shape

(290628, 1333)

## 2. Build the Neuron Network Pipeline

In [28]:
def train_nn_pipeline(df, hidden_layers=[64], activation_fn=nn.ReLU, 
                      lr=0.001, batch_size=32, epochs=200, patience=10, 
                      train_indicator='train_indicator', target='bad_flag'):
    """
    Train a neural network for binary classification with configurable architecture and activation.

    Parameters:
    -----------
    df : DataFrame with combined train/test and train_indicator.
    hidden_layers : list, neurons in each hidden layer. e.g. [64, 32]
    activation_fn : torch activation class (e.g., nn.ReLU, nn.Sigmoid, nn.Tanh)
    lr : learning rate
    batch_size : batch size for training
    epochs : maximum training epochs
    patience : early stopping patience
    train_indicator : column name indicating train/test split
    target : dependent variable name

    Returns:
    --------
    model : trained model
    scaler : fitted scaler
    best_threshold : tuned threshold for classification
    val_auc : validation AUC
    val_f1 : best validation F1
    test_df : DataFrame with predictions for test set
    """

    # ✅ Split train/test
    train_df = df[df[train_indicator] == 1].copy()
    test_df = df[df[train_indicator] == 0].copy()

    X = train_df.drop(columns=[target, train_indicator])
    y = train_df[target].astype(float)
    X_test = test_df.drop(columns=[target, train_indicator])

    # ✅ Train/val split
    X_train, X_val, y_train, y_val = train_test_split(
        X, y, test_size=0.2, random_state=42, stratify=y
    )

    # ✅ Scale features
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_val_scaled = scaler.transform(X_val)
    X_test_scaled = scaler.transform(X_test)

    # ✅ Convert to tensors
    X_train_tensor = torch.tensor(X_train_scaled, dtype=torch.float32)
    y_train_tensor = torch.tensor(y_train.values, dtype=torch.float32).view(-1, 1)
    X_val_tensor = torch.tensor(X_val_scaled, dtype=torch.float32)
    y_val_tensor = torch.tensor(y_val.values, dtype=torch.float32).view(-1, 1)
    X_test_tensor = torch.tensor(X_test_scaled, dtype=torch.float32)

    # ✅ Compute class weight
    pos_weight_value = (y_train == 0).sum() / (y_train == 1).sum()

    # ✅ Dynamic Model Definition
    class DynamicNN(nn.Module):
        def __init__(self, input_dim, hidden_layers, activation_fn):
            super(DynamicNN, self).__init__()
            layers = []
            prev_dim = input_dim
            for h in hidden_layers:
                layers.append(nn.Linear(prev_dim, h))
                layers.append(activation_fn())
                prev_dim = h
            layers.append(nn.Linear(prev_dim, 1))
            layers.append(nn.Sigmoid())  # Output for binary classification
            self.net = nn.Sequential(*layers)

        def forward(self, x):
            return self.net(x)

    input_dim = X_train_tensor.shape[1]
    model = DynamicNN(input_dim=input_dim, hidden_layers=hidden_layers, activation_fn=activation_fn)

    criterion = nn.BCELoss()
    optimizer = optim.Adam(model.parameters(), lr=lr)

    # ✅ Training loop with early stopping
    train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

    best_val_loss = np.inf
    patience_counter = 0

    for epoch in range(epochs):
        model.train()
        epoch_loss = 0.0

        for batch_X, batch_y in train_loader:
            optimizer.zero_grad()
            outputs = model(batch_X).view(-1)
            targets = batch_y.view(-1)
            # Manual class weights
            weights = torch.ones_like(targets)
            weights[targets == 1] = pos_weight_value
            loss = criterion(outputs, targets)
            weighted_loss = (loss * weights).mean()
            weighted_loss.backward()
            optimizer.step()
            epoch_loss += weighted_loss.item()

        # Validation
        model.eval()
        with torch.no_grad():
            val_probs = model(X_val_tensor).view(-1).numpy()
            val_loss = criterion(torch.tensor(val_probs), y_val_tensor.view(-1)).item()

        if val_loss < best_val_loss:
            best_val_loss = val_loss
            best_state = model.state_dict()
            patience_counter = 0
        else:
            patience_counter += 1
            if patience_counter >= patience:
                print(f"Early stopping at epoch {epoch+1}")
                break

    model.load_state_dict(best_state)

    # ✅ Metrics & threshold tuning
    model.eval()
    with torch.no_grad():
        val_probs = model(X_val_tensor).view(-1).numpy()

    val_auc = roc_auc_score(y_val, val_probs)
    precisions, recalls, thresholds = precision_recall_curve(y_val, val_probs)
    f1_scores = 2 * (precisions * recalls) / (precisions + recalls + 1e-8)
    best_idx = np.argmax(f1_scores)
    best_threshold = thresholds[best_idx]
    val_f1 = f1_scores[best_idx]

    print(f"Validation AUC: {val_auc:.4f} | Best F1: {val_f1:.4f} | Threshold: {best_threshold:.4f}")

    # ✅ Test predictions
    with torch.no_grad():
        test_probs = model(X_test_tensor).view(-1).numpy()
        test_preds = (test_probs > best_threshold).astype(int)

    test_df['bad_flag_pred_prob'] = test_probs
    test_df['bad_flag_pred'] = test_preds

    # ✅ Print class distributions
    train_bad_pct = (y_train.sum() / len(y_train)) * 100
    test_bad_pct = (test_df['bad_flag_pred'].sum() / len(test_df)) * 100
    print(f"Train bad_flag=1: {train_bad_pct:.2f}% | Predicted Test bad_flag=1: {test_bad_pct:.2f}%")

    return model, scaler, best_threshold, val_auc, val_f1, test_df

## Model 1. One layer, [64], activation=Sigmoid, Validation AUC: 0.6542 | Best F1: 0.2056 | Pred test bad_flag=1: 16.82%

In [29]:
model, scaler, thr, auc, f1, test_preds = train_nn_pipeline(
    df=df,
    hidden_layers=[64],         # ✅ One layer with 64 neurons
    activation_fn=nn.Sigmoid,   # Hidden layer activation
    lr=0.001,
    batch_size=32,
    epochs=100,
    patience=10
)

Early stopping at epoch 11
Validation AUC: 0.6542 | Best F1: 0.2056 | Threshold: 0.1047
Train bad_flag=1: 6.98% | Predicted Test bad_flag=1: 16.82%


In [32]:
test_preds.to_hdf('C:/Users/xiaow/work/jp_awm/interview/Take Home Project/test_result_64_sigmoid.hdf5', key='data')

## Model 2. One layer, [64], activation=ReLU, Validation AUC: 0.6349 | Best F1: 0.1921 | Pred test bad_flag=1: 22.35%

In [33]:
model, scaler, thr, auc, f1, test_preds = train_nn_pipeline(
    df=df,
    hidden_layers=[64],         # ✅ One layer with 64 neurons
    activation_fn=nn.ReLU,   # Hidden layer activation
    lr=0.001,
    batch_size=32,
    epochs=100,
    patience=10
)

Early stopping at epoch 11
Validation AUC: 0.6349 | Best F1: 0.1921 | Threshold: 0.1081
Train bad_flag=1: 6.98% | Predicted Test bad_flag=1: 22.35%


In [34]:
test_preds.to_hdf('C:/Users/xiaow/work/jp_awm/interview/Take Home Project/test_result_64_relu.hdf5', key='data')

## Model 3. 3 layer, [128, 64, 32], activation=Sigmoid, Validation AUC: 0.591 | Best F1: 0.1831 | Pred test bad_flag=1: 14.08%

In [41]:
model, scaler, thr, auc, f1, test_preds = train_nn_pipeline(
    df=df,
    hidden_layers=[128, 64, 32],         # ✅ One layer with 64 neurons
    activation_fn=nn.Sigmoid,   # Hidden layer activation
    lr=0.001,
    batch_size=32,
    epochs=100,
    patience=10
)

Early stopping at epoch 12
Validation AUC: 0.5910 | Best F1: 0.1831 | Threshold: 0.1015
Train bad_flag=1: 6.98% | Predicted Test bad_flag=1: 14.08%


In [42]:
test_preds.to_hdf('C:/Users/xiaow/work/jp_awm/interview/Take Home Project/test_result_128_64_32_sigmoid.hdf5', key='data')

## Model 4. 2 layer, [128, 64], activation=Sigmoid, Validation AUC: 0.6049 | Best F1: 0.1914 | Pred test bad_flag=1: 17.41%

In [37]:
model, scaler, thr, auc, f1, test_preds = train_nn_pipeline(
    df=df,
    hidden_layers=[128, 64],         # ✅ One layer with 64 neurons
    activation_fn=nn.Sigmoid,   # Hidden layer activation
    lr=0.001,
    batch_size=32,
    epochs=100,
    patience=10
)

Early stopping at epoch 12
Validation AUC: 0.6049 | Best F1: 0.1914 | Threshold: 0.1116
Train bad_flag=1: 6.98% | Predicted Test bad_flag=1: 17.41%


In [38]:
test_preds.to_hdf('C:/Users/xiaow/work/jp_awm/interview/Take Home Project/test_result_128_64_sigmoid.hdf5', key='data')

## Model 5. One layer, [32], activation=Sigmoid, Validation AUC: 0.6536 | Best F1: 0.1999 | Pred test bad_flag=1: 12.95%

In [43]:
model, scaler, thr, auc, f1, test_preds = train_nn_pipeline(
    df=df,
    hidden_layers=[32],         # ✅ One layer with 64 neurons
    activation_fn=nn.Sigmoid,   # Hidden layer activation
    lr=0.001,
    batch_size=32,
    epochs=100,
    patience=10
)

Early stopping at epoch 12
Validation AUC: 0.6536 | Best F1: 0.1999 | Threshold: 0.1280
Train bad_flag=1: 6.98% | Predicted Test bad_flag=1: 12.95%


In [44]:
test_preds.to_hdf('C:/Users/xiaow/work/jp_awm/interview/Take Home Project/test_result_32_sigmoid.hdf5', key='data')

### Model 6. One layer, [16], activation=Sigmoid, Validation AUC: 0.6753 | Best F1: 0.2141 | Pred test bad_flag=1: 10.51%

In [45]:
model, scaler, thr, auc, f1, test_preds = train_nn_pipeline(
    df=df,
    hidden_layers=[16],         # ✅ One layer with 64 neurons
    activation_fn=nn.Sigmoid,   # Hidden layer activation
    lr=0.001,
    batch_size=32,
    epochs=100,
    patience=10
)

Early stopping at epoch 11
Validation AUC: 0.6753 | Best F1: 0.2141 | Threshold: 0.1402
Train bad_flag=1: 6.98% | Predicted Test bad_flag=1: 10.51%


In [46]:
test_preds.to_hdf('C:/Users/xiaow/work/jp_awm/interview/Take Home Project/test_result_16_sigmoid.hdf5', key='data')

### Model 7. One layer, [8], activation=Sigmoid, Validation AUC: 0.6844 | Best F1: 0.2179 | Pred test bad_flag=1: 8.12%

In [47]:
model, scaler, thr, auc, f1, test_preds = train_nn_pipeline(
    df=df,
    hidden_layers=[8],         # ✅ One layer with 64 neurons
    activation_fn=nn.Sigmoid,   # Hidden layer activation
    lr=0.001,
    batch_size=32,
    epochs=100,
    patience=10
)

Early stopping at epoch 11
Validation AUC: 0.6844 | Best F1: 0.2179 | Threshold: 0.1408
Train bad_flag=1: 6.98% | Predicted Test bad_flag=1: 8.12%


In [48]:
test_preds.to_hdf('C:/Users/xiaow/work/jp_awm/interview/Take Home Project/test_result_8_sigmoid.hdf5', key='data')

### Model 8. One layer, [4], activation=Sigmoid, Validation AUC: 0.6786 | Best F1: 0.2114 | Pred test bad_flag=1: 7.21%

In [49]:
model, scaler, thr, auc, f1, test_preds = train_nn_pipeline(
    df=df,
    hidden_layers=[4],         # ✅ One layer with 64 neurons
    activation_fn=nn.Sigmoid,   # Hidden layer activation
    lr=0.001,
    batch_size=32,
    epochs=100,
    patience=10
)

Early stopping at epoch 11
Validation AUC: 0.6786 | Best F1: 0.2114 | Threshold: 0.1456
Train bad_flag=1: 6.98% | Predicted Test bad_flag=1: 7.21%


In [50]:
test_preds.to_hdf('C:/Users/xiaow/work/jp_awm/interview/Take Home Project/test_result_4_sigmoid.hdf5', key='data')

## Choose the winner model and dump only the predicted value and combine with the test id

In [51]:
## winner is Model 7, i.e., one layer with 8 neurons and activation function is sigmoid

In [52]:
winner_pred = pd.read_hdf('C:/Users/xiaow/work/jp_awm/interview/Take Home Project/test_result_8_sigmoid.hdf5')
winner_pred.shape

(102505, 1335)

In [53]:
test_df = pd.read_csv('C:/Users/xiaow/work/jp_awm/interview/Take Home Project/testing_loan_data.csv')
test_df.shape

  test_df = pd.read_csv('C:/Users/xiaow/work/jp_awm/interview/Take Home Project/testing_loan_data.csv')


(102505, 23)

In [55]:
test_df.head()

Unnamed: 0,id,member_id,loan_amnt,term,int_rate,emp_length,home_ownership,annual_inc,desc,purpose,...,inq_last_6mths,mths_since_recent_inq,revol_util,total_bc_limit,mths_since_last_major_derog,tot_hi_cred_lim,tot_cur_bal,application_approved_flag,internal_score,bad_flag
0,20000001,22419852,10000,36 months,22.15%,8 years,RENT,37000.0,,debt_consolidation,...,1,3.0,73.10%,16200,,14877.17028,36809,1,131,
1,20000002,22349118,1400,36 months,18.24%,6 years,RENT,41000.0,,other,...,0,9.0,11.50%,4000,,4097.30477,19536,1,19,
2,20000003,22398818,7000,36 months,12.49%,3 years,RENT,68900.0,,debt_consolidation,...,0,11.0,48.10%,11900,80.0,12688.49516,241465,1,92,
3,20000004,22419015,18000,60 months,16.29%,9 years,MORTGAGE,41000.0,,debt_consolidation,...,1,0.0,38.10%,7600,73.0,7908.799817,179757,1,235,
4,20000005,22388614,12000,36 months,12.99%,10+ years,MORTGAGE,64000.0,,home_improvement,...,0,,57.90%,21000,,19378.56106,31953,1,157,


In [64]:
aa = winner_pred.copy()
bb = test_df.copy()
aa.index = range(len(aa))
bb.index = range(len(bb))
cc = bb.loc[:, ['id', 'member_id']].copy()
cc['bad_flag_pred'] = aa['bad_flag_pred'].values
print(cc.shape)
cc['bad_flag_pred'].value_counts()

(102505, 3)


bad_flag_pred
0    94183
1     8322
Name: count, dtype: int64

In [68]:
cc.to_csv('C:/Users/xiaow/work/jp_awm/interview/Take Home Project/final_test_output_to_submit.csv', index=False)