In [None]:
import os
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler


In [None]:
from google.colab import drive

# Attach Google Drive to the Colab filesystem; the CSV files read below live
# under this mount point.
DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Mounted at /content/drive


In [None]:
# Load the training loan table from the mounted Drive folder and preview the
# first five rows.
TRAIN_CSV_PATH = "/content/drive/MyDrive/Take_Home_Project/training_loan_data.csv"
df = pd.read_csv(TRAIN_CSV_PATH)
df.head(n=5)

Unnamed: 0,id,member_id,loan_amnt,term,int_rate,emp_length,home_ownership,annual_inc,desc,purpose,...,inq_last_6mths,mths_since_recent_inq,revol_util,total_bc_limit,mths_since_last_major_derog,tot_hi_cred_lim,tot_cur_bal,application_approved_flag,internal_score,bad_flag
0,10000001,11983056.0,7550,36 months,16.24%,3 years,RENT,28000.0,,debt_consolidation,...,0.0,17.0,72%,4000.0,,3828.953801,5759.0,1,99,0.0
1,10000002,12002921.0,27050,36 months,10.99%,10+ years,OWN,55000.0,Borrower added on 12/31/13 > Combining high ...,debt_consolidation,...,0.0,8.0,61.20%,35700.0,,34359.94073,114834.0,1,353,0.0
2,10000003,11983096.0,12000,36 months,10.99%,4 years,RENT,60000.0,Borrower added on 12/31/13 > I would like to...,debt_consolidation,...,1.0,3.0,24%,18100.0,,16416.61776,7137.0,1,157,0.0
3,10000004,12003142.0,28000,36 months,7.62%,5 years,MORTGAGE,325000.0,,debt_consolidation,...,1.0,3.0,54.60%,42200.0,,38014.14976,799592.0,1,365,0.0
4,10000005,11993233.0,12000,36 months,13.53%,10+ years,RENT,40000.0,,debt_consolidation,...,0.0,17.0,68.80%,7000.0,53.0,6471.462236,13605.0,1,157,0.0


In [None]:
# Remove the columns that hold text rather than plain numbers (percent
# strings, durations, categories, free text) so only numeric columns remain.
NON_NUMERIC_COLUMNS = [
    'term',
    'int_rate',
    'emp_length',
    'home_ownership',
    'desc',
    'purpose',
    'revol_util',
]
df = df.drop(columns=NON_NUMERIC_COLUMNS)
df.head(n=5)

Unnamed: 0,id,member_id,loan_amnt,annual_inc,percent_bc_gt_75,bc_util,dti,inq_last_6mths,mths_since_recent_inq,total_bc_limit,mths_since_last_major_derog,tot_hi_cred_lim,tot_cur_bal,application_approved_flag,internal_score,bad_flag
0,10000001,11983056.0,7550,28000.0,100.0,96.0,8.4,0.0,17.0,4000.0,,3828.953801,5759.0,1,99,0.0
1,10000002,12002921.0,27050,55000.0,25.0,53.9,22.87,0.0,8.0,35700.0,,34359.94073,114834.0,1,353,0.0
2,10000003,11983096.0,12000,60000.0,0.0,15.9,4.62,1.0,3.0,18100.0,,16416.61776,7137.0,1,157,0.0
3,10000004,12003142.0,28000,325000.0,16.7,67.1,18.55,1.0,3.0,42200.0,,38014.14976,799592.0,1,365,0.0
4,10000005,11993233.0,12000,40000.0,33.3,79.6,16.94,0.0,17.0,7000.0,53.0,6471.462236,13605.0,1,157,0.0


In [None]:
# Pre-processing: report the starting size, remove exact duplicate rows, then
# remove every row that still contains a missing value in any column.
print("origianl shape:", df.shape)

duplicate_mask = df.duplicated()
duplicate_rows_df = df.loc[duplicate_mask]
print("number of duplicate rows: ", duplicate_rows_df.shape)

df = df.drop_duplicates(keep="first")
print("number of processed rows (duplicated): ", df.shape)

# NOTE(review): in the recorded run this step shrinks the frame from 197787 to
# 29280 rows, i.e. most of the data is discarded because of missing values.
# Consider imputing or dropping the sparsest column instead - to be confirmed.
df = df.dropna(axis=0, how="any")
print("number of processed rows (NA rows): ", df.shape)


origianl shape: (199121, 16)
number of duplicate rows:  (1334, 16)
number of processed rows (duplicated):  (197787, 16)
number of processed rows (NA rows):  (29280, 16)


In [None]:
# Separate the label column from the remaining columns, both as NumPy arrays.
# NOTE(review): the identifier columns `id` and `member_id` are still part of
# X at this point - confirm whether they are meant to be model inputs.
TARGET_COLUMN = 'bad_flag'
y = df[TARGET_COLUMN].values
X = df.drop(columns=[TARGET_COLUMN]).values

In [None]:
# Hold out one fifth of the rows for evaluation (train:test = 4:1).
TEST_FRACTION = 0.2
SPLIT_SEED = 42
BATCH_SIZE = 32

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=TEST_FRACTION, random_state=SPLIT_SEED
)

# Standardize the features: the statistics are learned from the training rows
# only and then applied unchanged to the held-out rows.
scaler = StandardScaler()
scaler.fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)


def _as_float32(array):
    """Copy an array into a float32 torch tensor."""
    return torch.tensor(array, dtype=torch.float32)


# Features stay 2-D; labels become a single column so their shape lines up
# with the model's one-unit output.
X_train = _as_float32(X_train)
y_train = _as_float32(y_train).reshape(-1, 1)
X_test = _as_float32(X_test)
y_test = _as_float32(y_test).reshape(-1, 1)

# Mini-batch iterators: training batches are reshuffled each epoch, the
# held-out batches keep their order.
train_loader = DataLoader(
    TensorDataset(X_train, y_train), batch_size=BATCH_SIZE, shuffle=True
)
test_loader = DataLoader(
    TensorDataset(X_test, y_test), batch_size=BATCH_SIZE, shuffle=False
)

In [None]:
# Model class: feed-forward network with a single hidden layer.
class BinaryClassifier(nn.Module):
    """One-hidden-layer network that maps a feature vector to one probability.

    Args:
        input_dim: number of input features per sample.
        hidden_units: width of the hidden layer.
    """

    def __init__(self, input_dim, hidden_units):
        super().__init__()
        # Hidden stage: linear projection followed by ReLU.
        self.layer1 = nn.Linear(in_features=input_dim, out_features=hidden_units)
        self.relu = nn.ReLU()
        # Output stage: one unit squashed into (0, 1) by a sigmoid.
        self.layer2 = nn.Linear(in_features=hidden_units, out_features=1)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        """Return sigmoid(layer2(relu(layer1(x)))) for a batch `x`."""
        hidden = self.relu(self.layer1(x))
        logits = self.layer2(hidden)
        return self.sigmoid(logits)

# Initialize the model.
# Seed PyTorch's global RNG first so the random weight initialization (and the
# shuffling order of any DataLoader that draws from the global RNG afterwards)
# is the same on every Restart & Run All.
torch.manual_seed(42)
model = BinaryClassifier(input_dim=X_train.shape[1], hidden_units=16)

In [None]:
# Training objective and update rule: binary cross-entropy on the model's
# probability output, minimized with Adam.
LEARNING_RATE = 0.001
criterion = nn.BCELoss()
optimizer = optim.Adam(params=model.parameters(), lr=LEARNING_RATE)

In [None]:
# train loop
def train_model(num_epochs, net=None, loader=None, loss_fn=None, opt=None):
    """Run mini-batch gradient descent and print the mean loss of each epoch.

    The reported value is the sample-weighted average of the batch losses over
    the whole epoch. Printing only the final batch's loss (as a plain
    `loss.item()` after the inner loop would) mostly shows batch-to-batch
    noise rather than training progress.

    Args:
        num_epochs: number of full passes over the loader.
        net: module to train; defaults to the notebook-level `model`.
        loader: iterable of (inputs, labels) batches; defaults to the
            notebook-level `train_loader`.
        loss_fn: callable (outputs, labels) -> scalar loss tensor; defaults to
            the notebook-level `criterion`. The epoch average assumes the loss
            is a per-batch mean, which holds for nn.BCELoss() with its default
            reduction.
        opt: optimizer over `net`'s parameters; defaults to the notebook-level
            `optimizer`.

    Raises:
        ValueError: if the loader yields no samples during an epoch.
    """
    # Fall back to the notebook globals so existing `train_model(num_epochs=N)`
    # calls keep working unchanged.
    net = model if net is None else net
    loader = train_loader if loader is None else loader
    loss_fn = criterion if loss_fn is None else loss_fn
    opt = optimizer if opt is None else opt

    for epoch in range(num_epochs):
        net.train()  # Set the model to training mode
        loss_sum = 0.0
        sample_count = 0
        for inputs, labels in loader:
            opt.zero_grad()  # Clear gradients left over from the previous step
            outputs = net(inputs)  # Forward pass
            loss = loss_fn(outputs, labels)  # Compute loss
            loss.backward()  # Backpropagation
            opt.step()  # Update weights

            # Weight each batch by its size so a short final batch does not
            # distort the epoch average.
            batch_size = labels.size(0)
            loss_sum += loss.item() * batch_size
            sample_count += batch_size

        if sample_count == 0:
            raise ValueError("train_model: the data loader produced no samples")

        # Print statistics
        epoch_loss = loss_sum / sample_count
        print(f'Epoch {epoch+1}/{num_epochs}, Loss: {epoch_loss:.4f}')

# Launch training for a fixed number of passes over the training loader.
NUM_EPOCHS = 50
train_model(num_epochs=NUM_EPOCHS)

Epoch 1/50, Loss: 0.1916
Epoch 2/50, Loss: 0.2547
Epoch 3/50, Loss: 0.1917
Epoch 4/50, Loss: 0.1800
Epoch 5/50, Loss: 0.3955
Epoch 6/50, Loss: 0.2515
Epoch 7/50, Loss: 0.0865
Epoch 8/50, Loss: 0.1261
Epoch 9/50, Loss: 0.2305
Epoch 10/50, Loss: 0.4147
Epoch 11/50, Loss: 0.3429
Epoch 12/50, Loss: 0.1361
Epoch 13/50, Loss: 0.1969
Epoch 14/50, Loss: 0.4099
Epoch 15/50, Loss: 0.2404
Epoch 16/50, Loss: 0.1672
Epoch 17/50, Loss: 0.1072
Epoch 18/50, Loss: 0.3149
Epoch 19/50, Loss: 0.4242
Epoch 20/50, Loss: 0.1542
Epoch 21/50, Loss: 0.4187
Epoch 22/50, Loss: 0.3194
Epoch 23/50, Loss: 0.0680
Epoch 24/50, Loss: 0.0576
Epoch 25/50, Loss: 0.1739
Epoch 26/50, Loss: 0.4340
Epoch 27/50, Loss: 0.1776
Epoch 28/50, Loss: 0.1333
Epoch 29/50, Loss: 0.0665
Epoch 30/50, Loss: 0.1536
Epoch 31/50, Loss: 0.1044
Epoch 32/50, Loss: 0.3242
Epoch 33/50, Loss: 0.2114
Epoch 34/50, Loss: 0.1262
Epoch 35/50, Loss: 0.0701
Epoch 36/50, Loss: 0.2547
Epoch 37/50, Loss: 0.0558
Epoch 38/50, Loss: 0.2191
Epoch 39/50, Loss: 0.

In [None]:
# evaluate the model on the held-out split
model.eval()  # Set the model to evaluation mode

correct = 0
total = 0
# Counts for the positive class (label 1). Accuracy alone does not show
# whether any positive rows are being detected, so precision and recall are
# reported next to it.
true_positive = 0
predicted_positive = 0
actual_positive = 0

with torch.no_grad():
    for inputs, labels in test_loader:
        outputs = model(inputs)
        # Gradients are already disabled here, so the tensor is compared
        # directly (no `.data` needed). 0.5 is the decision threshold.
        predicted = (outputs > 0.5).float()
        total += labels.size(0)
        correct += (predicted == labels).sum().item()
        true_positive += ((predicted == 1) & (labels == 1)).sum().item()
        predicted_positive += (predicted == 1).sum().item()
        actual_positive += (labels == 1).sum().item()

if total == 0:
    # Avoid a ZeroDivisionError when the loader yields nothing.
    print('Accuracy: n/a (no evaluation samples)')
else:
    print(f'Accuracy: {100 * correct / total:.2f}%')
    # Share of positive labels, for comparison with an "always predict 0" baseline.
    print(f'Positive rate in labels: {100 * actual_positive / total:.2f}%')
    if predicted_positive > 0:
        print(f'Precision (label 1): {100 * true_positive / predicted_positive:.2f}%')
    else:
        print('Precision (label 1): n/a (no positive predictions)')
    if actual_positive > 0:
        print(f'Recall (label 1): {100 * true_positive / actual_positive:.2f}%')
    else:
        print('Recall (label 1): n/a (no positive labels)')

Accuracy: 93.46%


In [None]:
# Load the loan table that has to be scored and preview the first five rows.
# NOTE(review): the recorded output shows a warning emitted by this read;
# presumably a mixed-dtype warning - confirm and pin dtypes if so.
TEST_CSV_PATH = "/content/drive/MyDrive/Take_Home_Project/testing_loan_data.csv"
df_test = pd.read_csv(TEST_CSV_PATH)
df_test.head(n=5)

  df_test = pd.read_csv("/content/drive/MyDrive/Take_Home_Project/testing_loan_data.csv")


Unnamed: 0,id,member_id,loan_amnt,term,int_rate,emp_length,home_ownership,annual_inc,desc,purpose,...,inq_last_6mths,mths_since_recent_inq,revol_util,total_bc_limit,mths_since_last_major_derog,tot_hi_cred_lim,tot_cur_bal,application_approved_flag,internal_score,bad_flag
0,20000001,22419852,10000,36 months,22.15%,8 years,RENT,37000.0,,debt_consolidation,...,1,3.0,73.10%,16200,,14877.17028,36809,1,131,
1,20000002,22349118,1400,36 months,18.24%,6 years,RENT,41000.0,,other,...,0,9.0,11.50%,4000,,4097.30477,19536,1,19,
2,20000003,22398818,7000,36 months,12.49%,3 years,RENT,68900.0,,debt_consolidation,...,0,11.0,48.10%,11900,80.0,12688.49516,241465,1,92,
3,20000004,22419015,18000,60 months,16.29%,9 years,MORTGAGE,41000.0,,debt_consolidation,...,1,0.0,38.10%,7600,73.0,7908.799817,179757,1,235,
4,20000005,22388614,12000,36 months,12.99%,10+ years,MORTGAGE,64000.0,,home_improvement,...,0,,57.90%,21000,,19378.56106,31953,1,157,


In [None]:
# Remove the same text-valued columns that were removed from the training
# frame, so both frames expose the same numeric columns.
TEST_NON_NUMERIC_COLUMNS = [
    'term',
    'int_rate',
    'emp_length',
    'home_ownership',
    'desc',
    'purpose',
    'revol_util',
]
df_test = df_test.drop(columns=TEST_NON_NUMERIC_COLUMNS)
df_test.head(n=5)

Unnamed: 0,id,member_id,loan_amnt,annual_inc,percent_bc_gt_75,bc_util,dti,inq_last_6mths,mths_since_recent_inq,total_bc_limit,mths_since_last_major_derog,tot_hi_cred_lim,tot_cur_bal,application_approved_flag,internal_score,bad_flag
0,20000001,22419852,10000,37000.0,80.0,83.0,28.51,1,3.0,16200,,14877.17028,36809,1,131,
1,20000002,22349118,1400,41000.0,0.0,0.0,26.58,0,9.0,4000,,4097.30477,19536,1,19,
2,20000003,22398818,7000,68900.0,60.0,75.9,6.6,0,11.0,11900,80.0,12688.49516,241465,1,92,
3,20000004,22419015,18000,41000.0,33.3,61.1,20.61,1,0.0,7600,73.0,7908.799817,179757,1,235,
4,20000005,22388614,12000,64000.0,75.0,67.0,24.61,0,,21000,,19378.56106,31953,1,157,


In [None]:
# Build the model input for the unlabeled loans.
# Note: this rebinds `X_test` (previously the held-out split) because the
# prediction cell below reads the scoring features from that name.
X_test = df_test.drop('bad_flag', axis=1).values

# Reuse the scaler that was fitted on the training split. Fitting a fresh
# StandardScaler on the scoring data would standardize it with different
# means/variances than the ones the network was trained on, and would also
# overwrite the fitted training scaler.
X_test = scaler.transform(X_test)

# The scoring frame contains missing values, whereas training rows with
# missing values were dropped. StandardScaler leaves NaN in place, a NaN input
# makes the network output NaN, and `NaN > 0.5` is False, so such rows would
# silently be labelled 0. Replace NaN with 0.0, which in standardized space is
# the training-set mean of that feature.
# NOTE(review): mean imputation is a pragmatic default - confirm it is the
# desired policy for rows with missing features.
X_test = np.where(np.isnan(X_test), 0.0, X_test)

X_test = torch.tensor(X_test, dtype=torch.float32)



In [None]:
# Score every row of the prepared feature tensor in one forward pass.
DECISION_THRESHOLD = 0.5

model.eval()  # inference mode for the network's layers
with torch.no_grad():
    outputs = model(X_test)
    # Probabilities strictly above the threshold become 1.0, all others 0.0.
    predicted = outputs.gt(DECISION_THRESHOLD).float()
    # Collapse the (n, 1) column into a flat NumPy vector of labels.
    predicted_labels = predicted.numpy().reshape(-1).copy()

In [None]:
# Store the predicted labels in the label column of the scoring frame and
# print a short preview.
PREDICTION_COLUMN = 'bad_flag'
df_test[PREDICTION_COLUMN] = predicted_labels
print(df_test.head(n=5))


         id  member_id  loan_amnt  annual_inc  percent_bc_gt_75  bc_util  \
0  20000001   22419852      10000     37000.0              80.0     83.0   
1  20000002   22349118       1400     41000.0               0.0      0.0   
2  20000003   22398818       7000     68900.0              60.0     75.9   
3  20000004   22419015      18000     41000.0              33.3     61.1   
4  20000005   22388614      12000     64000.0              75.0     67.0   

     dti  inq_last_6mths  mths_since_recent_inq  total_bc_limit  \
0  28.51               1                    3.0           16200   
1  26.58               0                    9.0            4000   
2   6.60               0                   11.0           11900   
3  20.61               1                    0.0            7600   
4  24.61               0                    NaN           21000   

   mths_since_last_major_derog  tot_hi_cred_lim  tot_cur_bal  \
0                          NaN     14877.170280        36809   
1           

In [None]:
# Persist the scored frame next to the input files; the row index is not written.
PREDICTIONS_CSV_PATH = '/content/drive/MyDrive/Take_Home_Project/testing_loan_data_withprediction.csv'
df_test.to_csv(PREDICTIONS_CSV_PATH, index=False)
