In [28]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.utils import resample
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset

In [29]:
pd.set_option('display.max_colwidth', 100)  # Limit column width for better readability
pd.set_option('display.max_rows', None)  # Display all rows
pd.set_option('display.max_columns', None)  # Display all columns


In [48]:
# Load training and testing data
train_data = pd.read_csv(r'C:\Users\padhee.3\Downloads\Take Home Project\training_processed_data.csv')  # Replace with your training file path
inference_data = pd.read_csv(r'C:\Users\padhee.3\Downloads\Take Home Project\testing_processed_data.csv')    # Replace with your testing file path

In [49]:
print(train_data.columns)
print(inference_data.columns)

Index(['id', 'loan_amnt', 'term', 'int_rate', 'emp_length', 'home_ownership',
       'annual_inc', 'purpose', 'percent_bc_gt_75', 'bc_util', 'dti',
       'inq_last_6mths', 'mths_since_recent_inq', 'revol_util',
       'total_bc_limit', 'tot_cur_bal', 'bad_flag'],
      dtype='object')
Index(['id', 'loan_amnt', 'term', 'int_rate', 'emp_length', 'home_ownership',
       'annual_inc', 'purpose', 'percent_bc_gt_75', 'bc_util', 'dti',
       'inq_last_6mths', 'mths_since_recent_inq', 'revol_util',
       'total_bc_limit', 'tot_cur_bal', 'bad_flag'],
      dtype='object')


In [50]:
# Drop the 'id' column 
if 'id' in train_data.columns:
    train_data = train_data.drop(columns=['id'])


In [51]:
train_data.head(2)

Unnamed: 0,loan_amnt,term,int_rate,emp_length,home_ownership,annual_inc,purpose,percent_bc_gt_75,bc_util,dti,inq_last_6mths,mths_since_recent_inq,revol_util,total_bc_limit,tot_cur_bal,bad_flag
0,7550,36 months,0.1624,3.0,RENT,28000.0,debt_consolidation,100.0,96.0,8.4,0.0,17.0,0.72,4000.0,5759.0,0.0
1,27050,36 months,0.1099,10.0,OWN,55000.0,debt_consolidation,25.0,53.9,22.87,0.0,8.0,0.612,35700.0,114834.0,0.0


In [52]:
print("Train dataset shape:", train_data.shape)


Train dataset shape: (189457, 16)


In [53]:
# Check the distribution of the target variable
class_counts = train_data['bad_flag'].value_counts()
print("Class Distribution:")
print(class_counts)

# Calculate imbalance ratio
imbalance_ratio = class_counts.min() / class_counts.max()
print(f"Imbalance Ratio: {imbalance_ratio:.2f}")

Class Distribution:
bad_flag
0.0    176329
1.0     13128
Name: count, dtype: int64
Imbalance Ratio: 0.07


###An imbalance ratio of 0.07 indicates a highly imbalanced dataset, with the majority class being much more frequent than the minority class. This will likely cause the model to be biased towards predicting the majority class, resulting in poor performance for the minority class.

#We can try oversampling, undersampling, or weighted loss.

In [54]:
# Define categorical and numerical columns
categorical_columns = ['home_ownership', 'purpose', 'term']
target_column = 'bad_flag'
numerical_columns = [col for col in df_train_balanced.columns if col not in categorical_columns + [target_column]]

In [55]:
# Separate majority and minority classes
df_majority = train_data[train_data[target_column] == 0]
df_minority = train_data[train_data[target_column] == 1]

# Undersample the majority class using stratified sampling
df_majority_undersampled = resample(
    df_majority,
    replace=False,              # Sample without replacement
    n_samples=len(df_minority), # Match minority class size
    random_state=42,            # For reproducibility
    stratify=df_majority[target_column]  # Stratify to preserve class distribution
)

# Combine undersampled majority class with minority class
df_train_balanced = pd.concat([df_majority_undersampled, df_minority])

# Shuffle the dataset
df_train_balanced = df_train_balanced.sample(frac=1, random_state=42)

# Verify the new distribution
print("Balanced Class Distribution:")
print(df_train_balanced[target_column].value_counts())

Balanced Class Distribution:
bad_flag
1.0    13128
0.0    13128
Name: count, dtype: int64


In [72]:
# Define preprocessing pipeline for categorical and numerical columns
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_columns),  # Scale numerical columns
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_columns)  # One-hot encode categorical columns
    ]
)

# Separate features (X) and target (y) in the balanced dataset
X_balanced = df_train_balanced.drop(columns=[target_column])
y_balanced = df_train_balanced[target_column].values

# Apply preprocessing to the balanced dataset
X_balanced_transformed = preprocessor.fit_transform(X_balanced)

# Convert the transformed data into tensors
X_train_tensor = torch.tensor(X_balanced_transformed, dtype=torch.float32)
y_train_tensor = torch.tensor(y_balanced, dtype=torch.float32).unsqueeze(1)

# Verify the new shapes
print("X_train_tensor shape:", X_train_tensor.shape)
print("y_train_tensor shape:", y_train_tensor.shape)

X_train_tensor shape: torch.Size([352658, 32])
y_train_tensor shape: torch.Size([352658, 1])


In [73]:
# Inspect the number of unique categories in each categorical column
for col in categorical_columns:
    print(f"{col}: {df_train_balanced[col].nunique()} unique categories")

# Check the shape of the transformed feature matrix
print("Transformed X_balanced shape:", X_balanced_transformed.shape)

home_ownership: 5 unique categories
purpose: 13 unique categories
term: 2 unique categories
Transformed X_balanced shape: (352658, 32)


In [74]:
# Print unique categories
categories = df_train_balanced['home_ownership'].unique()
print("Categories in 'home_ownership':", categories)

categories = df_train_balanced['purpose'].unique()
print("Categories in 'purpose':", categories)

categories = df_train_balanced['term'].unique()
print("Categories in 'term':", categories)

categories = test_data['home_ownership'].unique()
print("Categories in 'home_ownership':", categories)

categories = test_data['purpose'].unique()
print("Categories in 'purpose':", categories)

categories = test_data['term'].unique()
print("Categories in 'term':", categories)

Categories in 'home_ownership': ['RENT' 'OWN' 'MORTGAGE' 'OTHER' 'NONE']
Categories in 'purpose': ['debt_consolidation' 'credit_card' 'other' 'small_business'
 'home_improvement' 'major_purchase' 'medical' 'car' 'vacation' 'wedding'
 'moving' 'house' 'renewable_energy']
Categories in 'term': [' 36 months' ' 60 months']
Categories in 'home_ownership': ['RENT' 'OWN' 'MORTGAGE' 'NONE' 'OTHER']
Categories in 'purpose': ['debt_consolidation' 'home_improvement' 'credit_card' 'other'
 'major_purchase' 'small_business' 'house' 'moving' 'medical' 'car'
 'vacation' 'renewable_energy' 'wedding']
Categories in 'term': [' 36 months' ' 60 months']


###There is a mismatch in categories in purpose as inference data has an unseen category of renewable energy. Hence, for now, I will fit the encoding only on training data and address unseen categories to be ignored.

In [75]:
from sklearn.model_selection import train_test_split

# Split the data into 80% train and 20% validation
X_train_split, X_val_split, y_train_split, y_val_split = train_test_split(
    X_train_tensor, y_train_tensor, test_size=0.2, random_state=42
)

# Verify the new shapes after splitting
print("X_train_split shape:", X_train_split.shape)
print("y_train_split shape:", y_train_split.shape)
print("X_val_split shape:", X_val_split.shape)
print("y_val_split shape:", y_val_split.shape)


X_train_split shape: torch.Size([282126, 32])
y_train_split shape: torch.Size([282126, 1])
X_val_split shape: torch.Size([70532, 32])
y_val_split shape: torch.Size([70532, 1])


In [76]:
# Create TensorDataset for training and validation sets
train_dataset = TensorDataset(X_train_split, y_train_split)
val_dataset = TensorDataset(X_val_split, y_val_split)

# Create DataLoader for batching
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=32, shuffle=False)

# Verify the DataLoader setup
print(f"Train loader: {len(train_loader)} batches")
print(f"Validation loader: {len(val_loader)} batches")

Train loader: 8817 batches
Validation loader: 2205 batches


In [87]:
# Define the neural network
class NeuralNetwork(nn.Module):
    def __init__(self, input_size, hidden_size):
        super(NeuralNetwork, self).__init__()
        self.fc1 = nn.Linear(32, 64)  # Input size 32, output size 64
        self.fc2 = nn.Linear(64, 32)  # Input size 64, output size 32
        self.fc3 = nn.Linear(32, 1)   # Output layer (binary classification)
        
        # Adding Dropout layers
        self.dropout = nn.Dropout(0.3)  # Dropout rate of 30%
        
    def forward(self, x):
        x = torch.relu(self.fc1(x))
        x = self.dropout(x)  # Apply dropout after the first layer
        x = torch.relu(self.fc2(x))
        x = self.dropout(x)  # Apply dropout after the second layer
        x = torch.sigmoid(self.fc3(x))  # Sigmoid for binary classification
        return x

# Initialize the model
input_size = X_train.shape[1]
hidden_size = 64  # Configurable number of neurons in the hidden layer
model = NeuralNetwork(input_size, hidden_size)

# Define loss function and optimizer
criterion = nn.BCELoss()  # Binary Cross-Entropy Loss
#optimizer = optim.Adam(model.parameters(), lr=0.001)
optimizer = optim.Adam(model.parameters(), lr=0.001, weight_decay=1e-5)  

In [None]:
# Training loop
def train_model(model, train_loader, val_loader, criterion, optimizer, epochs=20):
    for epoch in range(epochs):
        model.train()
        train_loss = 0.0

        for X_batch, y_batch in train_loader:
            optimizer.zero_grad()
            outputs = model(X_batch)
            loss = criterion(outputs, y_batch)
            loss.backward()
            optimizer.step()
            train_loss += loss.item()

        # Validation
        model.eval()
        val_loss = 0.0
        with torch.no_grad():
            for X_batch, y_batch in val_loader:
                outputs = model(X_batch)
                loss = criterion(outputs, y_batch)
                val_loss += loss.item()

        print(f"Epoch {epoch+1}/{epochs}, Train Loss: {train_loss/len(train_loader):.4f}, Validation Loss: {val_loss/len(val_loader):.4f}")

# Train the model
train_model(model, train_loader, val_loader, criterion, optimizer, epochs=20)

The validation loss is still higher than the training loss, which might indicate that the model is overfitting to the training data.
The loss is still high with learning rate varying 0.001 - 0.005. Next, adding drop out/regularization.

In [67]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score

# Add performance evaluation after training
def evaluate_model(model, val_loader, y_val_tensor):
    model.eval()
    all_preds = []
    all_labels = []
    
    with torch.no_grad():
        for X_batch, y_batch in val_loader:
            outputs = model(X_batch)
            preds = (outputs >= 0.5).int()  # Binary classification
            all_preds.append(preds)
            all_labels.append(y_batch)
    
    # Flatten the lists
    all_preds = torch.cat(all_preds).numpy()
    all_labels = torch.cat(all_labels).numpy()

    # Calculate metrics
    accuracy = accuracy_score(all_labels, all_preds)
    precision = precision_score(all_labels, all_preds)
    recall = recall_score(all_labels, all_preds)
    f1 = f1_score(all_labels, all_preds)
    auc = roc_auc_score(all_labels, all_preds)

    print(f"Validation Accuracy: {accuracy:.4f}")
    print(f"Validation Precision: {precision:.4f}")
    print(f"Validation Recall: {recall:.4f}")
    print(f"Validation F1-Score: {f1:.4f}")
    print(f"Validation AUC: {auc:.4f}")

# After training, evaluate the model on the validation set
evaluate_model(model, val_loader, y_val_tensor)

Validation Accuracy: 0.6365
Validation Precision: 0.6283
Validation Recall: 0.6739
Validation F1-Score: 0.6503
Validation AUC: 0.6364


In [65]:
print("Balanced dataset shape:", df_train_balanced.shape)


Balanced dataset shape: (26256, 16)


In [66]:
print("Columns in balanced dataset:", df_train_balanced.columns)


Columns in balanced dataset: Index(['loan_amnt', 'term', 'int_rate', 'emp_length', 'home_ownership',
       'annual_inc', 'purpose', 'percent_bc_gt_75', 'bc_util', 'dti',
       'inq_last_6mths', 'mths_since_recent_inq', 'revol_util',
       'total_bc_limit', 'tot_cur_bal', 'bad_flag'],
      dtype='object')


In [70]:
# Define model, loss function, optimizer, and learning rate scheduler
model = NeuralNetwork(input_size, hidden_size)

criterion = nn.BCEWithLogitsLoss()  # Binary Cross-Entropy for binary classification
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Implement learning rate scheduler (for example, reduce learning rate on plateau)
scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, 'min', patience=3, verbose=True)

# Loop over each fold
for fold, (train_idx, val_idx) in enumerate(kf.split(X_train_tensor)):
    print(f"Training fold {fold + 1}...")

    # Split the data into train and validation sets
    X_train_fold, X_val_fold = X_train_tensor[train_idx], X_train_tensor[val_idx]
    y_train_fold, y_val_fold = y_train_tensor[train_idx], y_train_tensor[val_idx]

    # Create DataLoader for the current fold
    train_dataset = TensorDataset(X_train_fold, y_train_fold)
    val_dataset = TensorDataset(X_val_fold, y_val_fold)
    train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=32, shuffle=False)

    # Train the model
    for epoch in range(20):  # Adjust number of epochs as needed
        model.train()
        for batch in train_loader:
            X_batch, y_batch = batch
            optimizer.zero_grad()
            outputs = model(X_batch)
            loss = criterion(outputs, y_batch)
            loss.backward()
            optimizer.step()

        # Step the scheduler after each epoch
        scheduler.step(loss)  # The scheduler will reduce the learning rate if validation loss plateaus

    # Validate the model (same as before)
    model.eval()
    y_true = []
    y_pred = []
    with torch.no_grad():
        for batch in val_loader:
            X_batch, y_batch = batch
            outputs = model(X_batch)
            y_true.extend(y_batch.numpy())
            y_pred.extend(torch.sigmoid(outputs).numpy())

    y_true = np.array(y_true)
    y_pred = np.array(y_pred)

    # Calculate performance metrics for the current fold
    accuracy = accuracy_score(y_true, y_pred > 0.5)
    precision = precision_score(y_true, y_pred > 0.5)
    recall = recall_score(y_true, y_pred > 0.5)
    f1 = f1_score(y_true, y_pred > 0.5)
    auc = roc_auc_score(y_true, y_pred)

    # Append metrics to lists
    fold_accuracies.append(accuracy)
    fold_precisions.append(precision)
    fold_recalls.append(recall)
    fold_f1_scores.append(f1)
    fold_aucs.append(auc)

    print(f"Fold {fold + 1} - Accuracy: {accuracy:.4f}, Precision: {precision:.4f}, Recall: {recall:.4f}, F1: {f1:.4f}, AUC: {auc:.4f}")

# Calculate average metrics across all folds
avg_accuracy = np.mean(fold_accuracies)
avg_precision = np.mean(fold_precisions)
avg_recall = np.mean(fold_recalls)
avg_f1 = np.mean(fold_f1_scores)
avg_auc = np.mean(fold_aucs)

print(f"\nAverage Metrics Across All Folds:")
print(f"Accuracy: {avg_accuracy:.4f}")
print(f"Precision: {avg_precision:.4f}")
print(f"Recall: {avg_recall:.4f}")
print(f"F1-Score: {avg_f1:.4f}")
print(f"AUC: {avg_auc:.4f}")




Training fold 1...
Fold 1 - Accuracy: 0.5029, Precision: 0.5022, Recall: 1.0000, F1: 0.6686, AUC: 0.6801
Training fold 2...
Fold 2 - Accuracy: 0.4997, Precision: 0.4993, Recall: 0.9996, F1: 0.6660, AUC: 0.6919
Training fold 3...
Fold 3 - Accuracy: 0.4986, Precision: 0.4979, Recall: 0.9989, F1: 0.6645, AUC: 0.6952
Training fold 4...
Fold 4 - Accuracy: 0.5092, Precision: 0.5084, Recall: 1.0000, F1: 0.6741, AUC: 0.6965
Training fold 5...
Fold 5 - Accuracy: 0.4959, Precision: 0.4953, Recall: 1.0000, F1: 0.6625, AUC: 0.6789

Average Metrics Across All Folds:
Accuracy: 0.5013
Precision: 0.5006
Recall: 0.9997
F1-Score: 0.6671
AUC: 0.6885


#Model is performing well in terms of recall, achieving almost perfect recall across all folds (close to 1.0). However, the precision is relatively lower (around 0.5), and accuracy is also quite close to 0.5. This indicates that your model might be predicting a large number of false positives, which leads to a lower precision despite having high recall. Next, I will try oversampling as there is a huge class imbalance. 

In [71]:
from sklearn.utils import resample

# Separate majority and minority classes
df_majority = train_data[train_data[target_column] == 0]
df_minority = train_data[train_data[target_column] == 1]

# Oversample the minority class
df_minority_oversampled = resample(
    df_minority,
    replace=True,               # Sample with replacement
    n_samples=len(df_majority), # Match majority class size
    random_state=42,            # For reproducibility
    stratify=df_minority[target_column]  # Stratify to preserve class distribution
)

# Combine oversampled minority class with majority class
df_train_balanced = pd.concat([df_majority, df_minority_oversampled])

# Shuffle the dataset
df_train_balanced = df_train_balanced.sample(frac=1, random_state=42)

# Verify the new distribution
print("Balanced Class Distribution:")
print(df_train_balanced[target_column].value_counts())


Balanced Class Distribution:
bad_flag
0.0    176329
1.0    176329
Name: count, dtype: int64
