In [1]:
import os
from pathlib import Path

import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.utils import resample
from torch.utils.data import DataLoader, TensorDataset

In [2]:
# Notebook-wide pandas display settings, applied in one pass.
_DISPLAY_OPTIONS = {
    'display.max_colwidth': 100,  # cap the rendered width of each cell for readability
    'display.max_rows': None,     # no row limit: frames are rendered in full
    'display.max_columns': None,  # no column limit: every column is rendered
}
for _option_name, _option_value in _DISPLAY_OPTIONS.items():
    pd.set_option(_option_name, _option_value)


In [3]:
# Load the pre-processed training and testing data.
# The directory holding both CSV files is defined once. Set the LOAN_DATA_DIR
# environment variable to run the notebook on another machine without editing
# this cell; when it is unset, the original local directory is used.
DATA_DIR = Path(os.environ.get('LOAN_DATA_DIR', r'C:\Users\padhee.3\Downloads\Take Home Project'))

train_data = pd.read_csv(DATA_DIR / 'training_processed_data.csv')
test_data = pd.read_csv(DATA_DIR / 'testing_processed_data.csv')

In [4]:
# Print the column index of each frame (training first, then testing)
# so the two schemas can be compared side by side.
for split_frame in (train_data, test_data):
    print(split_frame.columns)

Index(['id', 'loan_amnt', 'term', 'int_rate', 'emp_length', 'home_ownership',
       'annual_inc', 'purpose', 'percent_bc_gt_75', 'bc_util', 'dti',
       'inq_last_6mths', 'mths_since_recent_inq', 'revol_util',
       'total_bc_limit', 'tot_cur_bal', 'bad_flag'],
      dtype='object')
Index(['id', 'loan_amnt', 'term', 'int_rate', 'emp_length', 'home_ownership',
       'annual_inc', 'purpose', 'percent_bc_gt_75', 'bc_util', 'dti',
       'inq_last_6mths', 'mths_since_recent_inq', 'revol_util',
       'total_bc_limit', 'tot_cur_bal', 'bad_flag'],
      dtype='object')


In [5]:
# Remove the 'id' column from both frames.
# errors='ignore' turns the drop into a no-op when 'id' is not present,
# so this cell can be re-run safely.
train_data = train_data.drop(columns=['id'], errors='ignore')
test_data = test_data.drop(columns=['id'], errors='ignore')

In [7]:
# Preview the first two training rows.
train_data.iloc[:2]

Unnamed: 0,loan_amnt,term,int_rate,emp_length,home_ownership,annual_inc,purpose,percent_bc_gt_75,bc_util,dti,inq_last_6mths,mths_since_recent_inq,revol_util,total_bc_limit,tot_cur_bal,bad_flag
0,7550,36 months,0.1624,3.0,RENT,28000.0,debt_consolidation,100.0,96.0,8.4,0.0,17.0,0.72,4000.0,5759.0,0.0
1,27050,36 months,0.1099,10.0,OWN,55000.0,debt_consolidation,25.0,53.9,22.87,0.0,8.0,0.612,35700.0,114834.0,0.0


In [8]:
# Preview the first two testing rows.
# NOTE(review): the two rows rendered below are identical to the first two
# training rows shown in the previous cell — confirm the testing file is not
# a copy of (or overlapping with) the training file.
test_data.iloc[:2]

Unnamed: 0,loan_amnt,term,int_rate,emp_length,home_ownership,annual_inc,purpose,percent_bc_gt_75,bc_util,dti,inq_last_6mths,mths_since_recent_inq,revol_util,total_bc_limit,tot_cur_bal,bad_flag
0,7550,36 months,0.1624,3.0,RENT,28000.0,debt_consolidation,100.0,96.0,8.4,0.0,17.0,0.72,4000.0,5759.0,0.0
1,27050,36 months,0.1099,10.0,OWN,55000.0,debt_consolidation,25.0,53.9,22.87,0.0,8.0,0.612,35700.0,114834.0,0.0


In [9]:
# Count how many training rows fall into each value of the target.
class_counts = train_data['bad_flag'].value_counts()
print("Class Distribution:", class_counts, sep="\n")

# Imbalance ratio = size of the rarest class divided by size of the most common class.
imbalance_ratio = class_counts.min() / class_counts.max()
print(f"Imbalance Ratio: {imbalance_ratio:.2f}")

Class Distribution:
bad_flag
0.0    176329
1.0     13128
Name: count, dtype: int64
Imbalance Ratio: 0.07


An imbalance ratio of 0.07 indicates a highly imbalanced dataset, with the majority class being much more frequent than the minority class. This will likely cause the model to be biased towards predicting the majority class, resulting in poor performance for the minority class.

We can try oversampling, undersampling, or a weighted loss. I will try undersampling first, because it does not add synthetic data.

In [10]:
# Balance the training data by undersampling the majority class.

# Split the rows by target value: bad_flag == 0 is the majority class and
# bad_flag == 1 the minority class (see the class counts printed earlier).
df_majority = train_data[train_data['bad_flag'] == 0]
df_minority = train_data[train_data['bad_flag'] == 1]

# Randomly keep exactly as many majority rows as there are minority rows.
df_majority_undersampled = resample(
    df_majority,
    replace=False,               # sample without replacement: no row is duplicated
    n_samples=len(df_minority),  # match the minority class size
    random_state=42,             # fixed seed for reproducibility
)

# Stack both classes, then shuffle all rows so the two classes are interleaved.
df_train_balanced = pd.concat([df_majority_undersampled, df_minority]).sample(frac=1, random_state=42)

# Verify the new distribution
print("Balanced Class Distribution:")
print(df_train_balanced['bad_flag'].value_counts())

Balanced Class Distribution:
bad_flag
1.0    13128
0.0    13128
Name: count, dtype: int64


In [11]:

# Column roles: the categorical features, the target, and every remaining
# column of train_data, which is treated as numerical.
categorical_columns = ['purpose', 'term', 'home_ownership']
target_column = 'bad_flag'

_non_numerical = set(categorical_columns) | {target_column}
numerical_columns = [name for name in train_data.columns if name not in _non_numerical]



In [12]:
# Preprocessing: standardize the numerical columns and one-hot encode the categorical columns.
preprocessor = ColumnTransformer(
    transformers=[
        # Scale each numerical column to zero mean and unit variance.
        ('num', StandardScaler(), numerical_columns),
        # One-hot encode, dropping the first level of each feature.
        # handle_unknown='ignore' keeps transform() from raising when a later
        # split contains a category that was absent from the data the encoder
        # was fitted on; such a value is encoded as all zeros.
        ('cat', OneHotEncoder(drop='first', handle_unknown='ignore'), categorical_columns),
    ],
    # Always return a dense array: the transformed matrices are handed directly
    # to torch.tensor, which cannot consume a scipy sparse matrix.
    sparse_threshold=0,
)

In [13]:
# Separate the target from the features of the balanced frame.
y = df_train_balanced[target_column]
X = df_train_balanced.drop(columns=[target_column])

# Hold out 20% of the rows for validation, stratified on the target so both
# parts keep the same class proportions.
# NOTE(review): the split is taken from the already-undersampled frame, so the
# validation set is balanced as well and its metrics will not reflect the
# original class distribution — consider splitting before undersampling.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

# Fit the preprocessor on the training part only, then reuse the fitted
# transformer on the validation part.
X_train_transformed = preprocessor.fit_transform(X_train)
X_val_transformed = preprocessor.transform(X_val)

In [14]:
# Build float32 PyTorch tensors for the transformed features and the targets
# of the training and validation parts, in that order.
X_train_tensor, y_train_tensor, X_val_tensor, y_val_tensor = (
    torch.tensor(values, dtype=torch.float32)
    for values in (
        X_train_transformed,
        y_train.values,
        X_val_transformed,
        y_val.values,
    )
)