In [28]:
import os
from pathlib import Path

import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.compose import ColumnTransformer
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.utils import resample
from torch.utils.data import DataLoader, TensorDataset

In [29]:
# Notebook-wide pandas display settings, applied in one pass.
display_options = {
    'display.max_colwidth': 100,   # cap the width of each column for readability
    'display.max_rows': None,      # no limit: show every row
    'display.max_columns': None,   # no limit: show every column
}
for option_name, option_value in display_options.items():
    pd.set_option(option_name, option_value)


In [30]:
# Load the processed training and inference CSVs.
# The directory is defined once; set the LOAN_DATA_DIR environment variable to
# point the notebook at another location without editing this cell. The default
# is the original author's local folder.
DATA_DIR = Path(os.environ.get('LOAN_DATA_DIR', r'C:\Users\padhee.3\Downloads\Take Home Project'))

train_data = pd.read_csv(DATA_DIR / 'training_processed_data.csv')
inference_data = pd.read_csv(DATA_DIR / 'testing_processed_data.csv')

In [31]:
# Show the column index of each loaded frame (training first, then inference)
for loaded_frame in (train_data, inference_data):
    print(loaded_frame.columns)

Index(['id', 'loan_amnt', 'term', 'int_rate', 'emp_length', 'home_ownership',
       'annual_inc', 'purpose', 'percent_bc_gt_75', 'bc_util', 'dti',
       'inq_last_6mths', 'mths_since_recent_inq', 'revol_util',
       'total_bc_limit', 'tot_cur_bal', 'bad_flag'],
      dtype='object')
Index(['id', 'loan_amnt', 'term', 'int_rate', 'emp_length', 'home_ownership',
       'annual_inc', 'purpose', 'percent_bc_gt_75', 'bc_util', 'dti',
       'inq_last_6mths', 'mths_since_recent_inq', 'revol_util',
       'total_bc_limit', 'tot_cur_bal', 'bad_flag'],
      dtype='object')


In [32]:
# Remove the 'id' column from the training frame when it is present;
# errors='ignore' makes this a no-op if the column is already gone.
train_data = train_data.drop(columns=['id'], errors='ignore')


In [33]:
# Preview the first two rows of the training frame
train_data.iloc[:2]

Unnamed: 0,loan_amnt,term,int_rate,emp_length,home_ownership,annual_inc,purpose,percent_bc_gt_75,bc_util,dti,inq_last_6mths,mths_since_recent_inq,revol_util,total_bc_limit,tot_cur_bal,bad_flag
0,7550,36 months,0.1624,3.0,RENT,28000.0,debt_consolidation,100.0,96.0,8.4,0.0,17.0,0.72,4000.0,5759.0,0.0
1,27050,36 months,0.1099,10.0,OWN,55000.0,debt_consolidation,25.0,53.9,22.87,0.0,8.0,0.612,35700.0,114834.0,0.0


In [34]:
# Count how many rows fall into each value of the target column
class_counts = train_data['bad_flag'].value_counts()
print("Class Distribution:", class_counts, sep="\n")

# Imbalance ratio = size of the rarest class / size of the most common class
rarest_count = class_counts.min()
commonest_count = class_counts.max()
imbalance_ratio = rarest_count / commonest_count
print(f"Imbalance Ratio: {imbalance_ratio:.2f}")

Class Distribution:
bad_flag
0.0    176329
1.0     13128
Name: count, dtype: int64
Imbalance Ratio: 0.07


### An imbalance ratio of 0.07 indicates a highly imbalanced dataset: the majority class is far more frequent than the minority class. Left unaddressed, this will likely bias the model towards predicting the majority class, resulting in poor performance on the minority class.

Possible remedies: oversampling the minority class, undersampling the majority class, or using a class-weighted loss.

In [35]:
# Name of the label column and the features to be one-hot encoded
target_column = 'bad_flag'
categorical_columns = ['purpose', 'term', 'home_ownership']

# Every remaining column of train_data (in its original order) is treated as numerical
excluded_columns = set(categorical_columns) | {target_column}
numerical_columns = [col for col in train_data.columns if col not in excluded_columns]

In [36]:
# Separate majority (target == 0) and minority (target == 1) classes
df_majority = train_data[train_data[target_column] == 0]
df_minority = train_data[train_data[target_column] == 1]

# Randomly undersample the majority class down to the minority class size.
# No `stratify` argument is passed: df_majority contains a single target value
# by construction, so stratifying on the target cannot influence the sample.
df_majority_undersampled = resample(
    df_majority,
    replace=False,              # Sample without replacement
    n_samples=len(df_minority), # Match minority class size
    random_state=42,            # For reproducibility
)

# Combine undersampled majority class with minority class
df_train_balanced = pd.concat([df_majority_undersampled, df_minority])

# Shuffle so the two classes are interleaved rather than stacked one after the other
df_train_balanced = df_train_balanced.sample(frac=1, random_state=42)

# Verify the new distribution
print("Balanced Class Distribution:")
print(df_train_balanced[target_column].value_counts())

Balanced Class Distribution:
bad_flag
1.0    13128
0.0    13128
Name: count, dtype: int64


In [37]:
# Print the unique values of each categorical feature, first for the balanced
# training frame and then for the inference frame, so the two category sets can
# be compared by eye.
# The inference frame is `inference_data` (the name it is loaded under); the
# notebook never defines `test_data`, so referencing it fails on a fresh kernel.
for frame in (df_train_balanced, inference_data):
    for column in ('home_ownership', 'purpose', 'term'):
        categories = frame[column].unique()
        print(f"Categories in '{column}':", categories)

Categories in 'home_ownership': ['MORTGAGE' 'RENT' 'OWN' 'OTHER' 'NONE']
Categories in 'purpose': ['debt_consolidation' 'other' 'medical' 'credit_card' 'car' 'wedding'
 'renewable_energy' 'vacation' 'home_improvement' 'major_purchase'
 'small_business' 'moving' 'house']
Categories in 'term': [' 36 months' ' 60 months']
Categories in 'home_ownership': ['RENT' 'OWN' 'MORTGAGE' 'NONE' 'OTHER']
Categories in 'purpose': ['debt_consolidation' 'home_improvement' 'credit_card' 'other'
 'major_purchase' 'small_business' 'house' 'moving' 'medical' 'car'
 'vacation' 'renewable_energy' 'wedding']
Categories in 'term': [' 36 months' ' 60 months']


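### There is a mismatch in the `purpose` categories: the inference data has a category, `renewable_energy`, that is unseen in the training data. Hence, for now, I fit the encoder only on the training data and configure it to ignore unseen categories. (Note: in the output printed above, both datasets list the same 13 `purpose` values, so this mismatch should be re-checked after re-running the cell.)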

In [39]:
# Separate features and target in the balanced data
X = df_train_balanced.drop(columns=[target_column])
y = df_train_balanced[target_column].values

# Split into training and validation sets BEFORE fitting any preprocessing, so
# the scaler statistics and encoder categories are learned from training rows
# only and nothing leaks from the validation rows. `stratify=y` keeps the class
# proportions the same in both splits.
# NOTE(review): both splits come from the undersampled (balanced) frame, so
# validation metrics will not reflect the original class imbalance.
X_train_raw, X_val_raw, y_train, y_val = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Preprocessing pipeline for numerical and categorical features.
# sparse_threshold=0 forces a dense ndarray output regardless of how sparse the
# one-hot block is, which the later torch.tensor(...) conversion requires.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_columns),
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_columns)
    ],
    sparse_threshold=0,
)

# Fit the preprocessor on the training split only
preprocessor.fit(X_train_raw)

# Apply the fitted transformations to each split
X_train = preprocessor.transform(X_train_raw)
X_val = preprocessor.transform(X_val_raw)

# Full transformed matrix, kept in case a later cell still refers to this name
X_transformed = preprocessor.transform(X)


In [40]:
# Build float32 PyTorch tensors from the split arrays.
# Feature matrices are converted as-is; the 1-D label vectors are reshaped
# into a single column, giving them shape (n_samples, 1).
X_train_tensor = torch.tensor(X_train, dtype=torch.float32)
X_val_tensor = torch.tensor(X_val, dtype=torch.float32)
y_train_tensor = torch.tensor(y_train, dtype=torch.float32).reshape(-1, 1)
y_val_tensor = torch.tensor(y_val, dtype=torch.float32).reshape(-1, 1)

# Report the training tensor shapes as a sanity check
print(f"X_train_tensor shape: {X_train_tensor.shape}")
print(f"y_train_tensor shape: {y_train_tensor.shape}")



X_train_tensor shape: torch.Size([21004, 32])
y_train_tensor shape: torch.Size([21004, 1])


In [41]:
# Mini-batch size shared by the training and validation loaders
batch_size = 32

# Pair each feature tensor with its label tensor
train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
val_dataset = TensorDataset(X_val_tensor, y_val_tensor)

# Training batches are reshuffled every epoch; validation batches keep a fixed order
train_loader = DataLoader(train_dataset, shuffle=True, batch_size=batch_size)
val_loader = DataLoader(val_dataset, shuffle=False, batch_size=batch_size)