In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from imblearn.over_sampling import SMOTE
from sklearn.feature_selection import RFE

from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier

from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

seed=42

In [2]:
# Load cleaned dataset
df = pd.read_csv("Bank Leads Cleaned 20240206v2.csv")
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 68797 entries, 0 to 68796
Data columns (total 21 columns):
 #   Column                               Non-Null Count  Dtype  
---  ------                               --------------  -----  
 0   Gender                               68797 non-null  object 
 1   DOB                                  68797 non-null  object 
 2   Lead_Creation_Date                   68797 non-null  object 
 3   City_Code                            68797 non-null  object 
 4   City_Category                        68797 non-null  object 
 5   Employer_Category1                   68797 non-null  object 
 6   Employer_Category2                   68797 non-null  float64
 7   Monthly_Income                       68797 non-null  float64
 8   Customer_Existing_Primary_Bank_Code  68797 non-null  object 
 9   Primary_Bank_Type                    68797 non-null  object 
 10  Contacted                            68797 non-null  object 
 11  Source                      

In [3]:
df.head()

Unnamed: 0,Gender,DOB,Lead_Creation_Date,City_Code,City_Category,Employer_Category1,Employer_Category2,Monthly_Income,Customer_Existing_Primary_Bank_Code,Primary_Bank_Type,...,Source,Source_Category,Existing_EMI,Loan_Amount,Loan_Period,Interest_Rate,EMI,Var1,Approved,Year
0,Female,1979-07-23,2016-07-15,C10001,A,A,4.0,2000.0,B001,P,...,S122,G,0.0,30000.0,4.0,19.21357,900.387994,0,0,1979
1,Male,1986-12-07,2016-07-04,C10003,A,C,1.0,3500.0,B002,P,...,S122,G,0.0,20000.0,2.0,13.25,953.0,10,0,1986
2,Male,1982-12-10,2016-07-19,C10125,C,C,4.0,2250.0,B003,G,...,S143,B,0.0,45000.0,4.0,19.21357,1350.581991,0,0,1982
3,Male,1989-01-30,2016-07-09,C10477,C,A,4.0,3500.0,B003,G,...,S143,B,0.0,92000.0,5.0,19.21357,2397.355046,7,0,1989
4,Male,1985-04-19,2016-07-20,C10002,A,A,4.0,10000.0,B001,P,...,S134,B,2500.0,50000.0,2.0,19.21357,2525.622064,10,0,1985


In [4]:
df["Approved"].value_counts()

Approved
0    67782
1     1015
Name: count, dtype: int64

In [5]:
# For each feature in the dataset, count the number of datatypes it contains
for col in df:
    data_types = df[col].dtypes
    unique_dtypes = len(np.unique(data_types))
    print(f"{col}: {unique_dtypes}, {data_types}")


Gender: 1, object
DOB: 1, object
Lead_Creation_Date: 1, object
City_Code: 1, object
City_Category: 1, object
Employer_Category1: 1, object
Employer_Category2: 1, float64
Monthly_Income: 1, float64
Customer_Existing_Primary_Bank_Code: 1, object
Primary_Bank_Type: 1, object
Contacted: 1, object
Source: 1, object
Source_Category: 1, object
Existing_EMI: 1, float64
Loan_Amount: 1, float64
Loan_Period: 1, float64
Interest_Rate: 1, float64
EMI: 1, float64
Var1: 1, int64
Approved: 1, int64
Year: 1, int64


In [6]:
#  Check how many unique values in features that aren't float or int
for col in df:
    if df[col].dtypes == object:
        print(f"{col}: {len(df[col].unique())}")

Gender: 2
DOB: 10711
Lead_Creation_Date: 92
City_Code: 678
City_Category: 3
Employer_Category1: 3
Customer_Existing_Primary_Bank_Code: 51
Primary_Bank_Type: 2
Contacted: 2
Source: 29
Source_Category: 7


### Feature Engineering

In [7]:
# Create debt to income ratio feature
def dti_optimized(data):
    # Vectorized calculation
    x = data['Existing_EMI'] / data['Monthly_Income']
    x[data['Existing_EMI'] == 0] = 0
    x[data['Monthly_Income'] == 0] = 1
    return x

df['Debt_Income'] = dti_optimized(df)

# Create Total Loan feature
def total_loan_optimized(data):
    # Ensure the calculation is vectorized
    return data['Loan_Amount'] * (1 + (data['Interest_Rate']/100)) ** data['Loan_Period']

df['Total_Loan'] = total_loan_optimized(df)

# Create Age feature
df['Age'] = 2024 - df['Year']

# Drop unusable features
df.drop(['Lead_Creation_Date', 'DOB', 'City_Code', 'Customer_Existing_Primary_Bank_Code', 'Source', 'Year', 'Var1'], axis=1, inplace=True)

# Convert dtype to str
df['Employer_Category2'] = df['Employer_Category2'].astype(str)


### Feature Encoding

In [8]:
cat_feats = df[['Gender', 'City_Category', 'Employer_Category1', 'Employer_Category2', 'Primary_Bank_Type', 'Contacted', 'Source_Category']]

df = pd.get_dummies(df, columns=cat_feats.columns, drop_first=True, dtype=int)

In [9]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 68797 entries, 0 to 68796
Data columns (total 26 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   Monthly_Income          68797 non-null  float64
 1   Existing_EMI            68797 non-null  float64
 2   Loan_Amount             68797 non-null  float64
 3   Loan_Period             68797 non-null  float64
 4   Interest_Rate           68797 non-null  float64
 5   EMI                     68797 non-null  float64
 6   Approved                68797 non-null  int64  
 7   Debt_Income             68797 non-null  float64
 8   Total_Loan              68797 non-null  float64
 9   Age                     68797 non-null  int64  
 10  Gender_Male             68797 non-null  int32  
 11  City_Category_B         68797 non-null  int32  
 12  City_Category_C         68797 non-null  int32  
 13  Employer_Category1_B    68797 non-null  int32  
 14  Employer_Category1_C    68797 non-null

### Format the data

In [10]:
X = df.drop('Approved', axis=1)
y = df['Approved']

# Split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=seed)

### Sampling

In [11]:
# Use SMOTE to oversample the minority class
smote = SMOTE(random_state=seed) 

X_train_smote, y_train_smote = smote.fit_resample(X_train, y_train)

In [12]:
y_train_smote.value_counts()

Approved
0    54223
1    54223
Name: count, dtype: int64

### Random Forest Classifier

In [13]:
# Use RFE to reduce dimensionality 
rfe_rfc = RFE(RandomForestClassifier(random_state=seed), n_features_to_select=10)

rfe_rfc.fit(X_train_smote, y_train_smote)

rfe_rfc_table = pd.DataFrame({
    'Feature': X_train_smote.columns,
    'Ranking': rfe_rfc.ranking_,
    'SAelection': rfe_rfc.support_
})

rfe_rfc_table.sort_values(by='Ranking')

selected_features = list(rfe_rfc_table[rfe_rfc_table['Ranking'] == 1]['Feature'])

X_train_sampled_selected = X_train_smote[selected_features]

In [18]:
# Use Grid Search to find the best parameters

param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [1, 2, 4],
    'criterion': ['gini', 'entropy', 'log_loss'],
    'bootstrap': [True, False]
}

rfc_tune = RandomForestClassifier(random_state=seed)

grid_search = GridSearchCV(estimator=rfc_tune, 
                           param_grid=param_grid, 
                           cv=5, 
                           n_jobs=-1, 
                           verbose=2,
                           scoring='recall'
                           )

grid_search.fit(X_train_sampled_selected, y_train_smote)

print("Best parameters:", grid_search.best_params_)
print("Best score:", grid_search.best_score_)
best_params = grid_search.best_params_

Fitting 5 folds for each of 54 candidates, totalling 270 fits
Best parameters: {'bootstrap': False, 'criterion': 'gini', 'max_depth': 1, 'n_estimators': 100}
Best score: 0.9011306343927095


In [19]:
# Instantiate the model
rf = RandomForestClassifier(**best_params, random_state=seed)

# Fit the model
rf.fit(X_train_sampled_selected, y_train_smote)

# Predict the test set
y_pred = rf.predict(X_test[selected_features])

# Calculate the accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy}")

Accuracy: 0.6543604651162791


In [20]:
# Cross validate the model
cv = cross_val_score(rf, X, y, cv=5)
print(f"Cross Validation...................\n {np.mean(cv)}")

# Confusion matrix
cm = confusion_matrix(y_test, y_pred)
print(f"\n\nConfusion Matrix.............\n {cm}")

# Classification report
cr = classification_report(y_test, y_pred)
print(f"\n\nClassification Report..................\n {cr}")

Cross Validation...................
 0.985246449681139


Confusion Matrix.............
 [[8844 4715]
 [  41  160]]


Classification Report..................
               precision    recall  f1-score   support

           0       1.00      0.65      0.79     13559
           1       0.03      0.80      0.06       201

    accuracy                           0.65     13760
   macro avg       0.51      0.72      0.43     13760
weighted avg       0.98      0.65      0.78     13760



### XGB Classifier

In [21]:
# Use RFE to reduce dimensionality 
rfe_xgbc = RFE(XGBClassifier(random_state=seed), n_features_to_select=10)

rfe_xgbc.fit(X_train_smote, y_train_smote)

rfe_xgbc_table = pd.DataFrame({
    'Feature': X_train_smote.columns,
    'Ranking': rfe_xgbc.ranking_,
    'Selection': rfe_xgbc.support_
})

rfe_xgbc_table.sort_values(by='Ranking')

selected_features = list(rfe_xgbc_table[rfe_xgbc_table['Ranking'] == 1]['Feature'])

X_train_sampled_selected = X_train_smote[selected_features]

In [30]:
# Use Grid Search to find the best parameters

# param_grid = {
#     'n_estimators': [50, 100, 200],
#     'learning_rate': [0.1, 0.01, 0.001],
#     'min_child_weight': [1, 5, 10],
#     'max_depth': [1, 2, 4],
#     'subsample': [0.3, 0.5, 0.7]
# }

param_grid = {
    'n_estimators': [10, 25, 50],
    'learning_rate': [0.1, 0.2, 0.3],
    'min_child_weight': [1, 5, 10],
    'max_depth': [1],
    'subsample': [0.1, 0.2, 0.3]
}

xgbc_tune = XGBClassifier(random_state=seed)

grid_search = GridSearchCV(estimator=xgbc_tune, 
                           param_grid=param_grid, 
                           cv=5, 
                           n_jobs=-1, 
                           verbose=2,
                           scoring='recall'
                           )

grid_search.fit(X_train_sampled_selected, y_train_smote)

print("Best parameters:", grid_search.best_params_)
print("Best score:", grid_search.best_score_)
best_params = grid_search.best_params_

Fitting 5 folds for each of 81 candidates, totalling 405 fits
Best parameters: {'learning_rate': 0.1, 'max_depth': 1, 'min_child_weight': 1, 'n_estimators': 50, 'subsample': 0.3}
Best score: 0.9231324289020076


In [31]:
# Instantiate the model
xgbc = XGBClassifier(**best_params, random_state=seed)

# Fit the model
xgbc.fit(X_train_sampled_selected, y_train_smote)

# Predict the test set
y_pred =xgbc.predict(X_test[selected_features])

# Calculate the accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy}")

Accuracy: 0.6698401162790698


In [32]:
# Cross validate the model
cv = cross_val_score(xgbc, X, y, cv=5)
print(f"Cross Validation...................\n {np.mean(cv)}")

# Confusion matrix
cm = confusion_matrix(y_test, y_pred)
print(f"\n\nConfusion Matrix.............\n {cm}")

# Classification report
cr = classification_report(y_test, y_pred)
print(f"\n\nClassification Report..................\n {cr}")

Cross Validation...................
 0.985246449681139


Confusion Matrix.............
 [[9064 4495]
 [  48  153]]


Classification Report..................
               precision    recall  f1-score   support

           0       0.99      0.67      0.80     13559
           1       0.03      0.76      0.06       201

    accuracy                           0.67     13760
   macro avg       0.51      0.71      0.43     13760
weighted avg       0.98      0.67      0.79     13760

