# Scaling, sampling and modelling

In [1]:
import pandas as pd
import numpy as np 

from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import RobustScaler
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from imblearn.over_sampling import SMOTE
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.model_selection import GridSearchCV

pd.set_option('display.max_columns', None)

In [2]:
# Load cleaned dataset
df = pd.read_csv("Bank Leads Cleaned 20240206v2.csv")
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 68797 entries, 0 to 68796
Data columns (total 21 columns):
 #   Column                               Non-Null Count  Dtype  
---  ------                               --------------  -----  
 0   Gender                               68797 non-null  object 
 1   DOB                                  68797 non-null  object 
 2   Lead_Creation_Date                   68797 non-null  object 
 3   City_Code                            68797 non-null  object 
 4   City_Category                        68797 non-null  object 
 5   Employer_Category1                   68797 non-null  object 
 6   Employer_Category2                   68797 non-null  float64
 7   Monthly_Income                       68797 non-null  float64
 8   Customer_Existing_Primary_Bank_Code  68797 non-null  object 
 9   Primary_Bank_Type                    68797 non-null  object 
 10  Contacted                            68797 non-null  object 
 11  Source                      

In [3]:
df.describe()

Unnamed: 0,Employer_Category2,Monthly_Income,Existing_EMI,Loan_Amount,Loan_Period,Interest_Rate,EMI,Var1,Approved,Year
count,68797.0,68797.0,68797.0,68797.0,68797.0,68797.0,68797.0,68797.0,68797.0,68797.0
mean,3.735948,3599.859583,348.049336,35692.762766,3.933471,19.21563,1082.756741,3.989723,0.014754,1985.286844
std,0.786844,4810.412524,938.597201,24304.447313,0.91254,3.325489,670.95522,3.815124,0.120566,7.070241
min,1.0,0.0,0.0,5000.0,1.0,11.99,118.0,0.0,0.0,1930.0
25%,4.0,1640.0,0.0,29000.0,4.0,19.21357,900.387994,0.0,0.0,1982.0
50%,4.0,2500.0,0.0,30000.0,4.0,19.21357,900.387994,2.0,0.0,1987.0
75%,4.0,4000.0,350.0,36000.0,4.0,19.21357,1102.922701,7.0,0.0,1990.0
max,4.0,97500.0,42000.0,300000.0,6.0,37.0,13556.0,10.0,1.0,1999.0


## Feature engineering 

In [4]:
# Create debt to income ratio feature
def dti_optimized(data):
    # Vectorized calculation
    x = data['Existing_EMI'] / data['Monthly_Income']
    x[data['Existing_EMI'] == 0] = 0
    x[data['Monthly_Income'] == 0] = 1
    return x

df['Debt_Income'] = dti_optimized(df)

# Create Total Loan feature
def total_loan_optimized(data):
    return data['Loan_Amount'] * (1 + (data['Interest_Rate']/100)) ** data['Loan_Period']

df['Total_Loan'] = total_loan_optimized(df)

# Create Age feature 
df['Age'] = 2024 - df['Year']

# Drop unusable features 
df.drop(['Lead_Creation_Date', 'DOB', 'City_Code', 'Customer_Existing_Primary_Bank_Code', 'Source'], axis=1, inplace=True)


## Categorical encoding 

In [5]:
df['Employer_Category2'] = df['Employer_Category2'].astype(str)

In [6]:
cat_feats = df[['Gender', 'City_Category', 'Employer_Category1', 'Employer_Category2', 'Primary_Bank_Type', 'Contacted', 'Source_Category']]

df = pd.get_dummies(df, columns=cat_feats.columns, drop_first=True, dtype=int)

In [7]:
df.drop(['Year'], axis=1, inplace=True)

In [8]:
df.drop(['Var1'], axis=1, inplace=True)

In [9]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 68797 entries, 0 to 68796
Data columns (total 26 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   Monthly_Income          68797 non-null  float64
 1   Existing_EMI            68797 non-null  float64
 2   Loan_Amount             68797 non-null  float64
 3   Loan_Period             68797 non-null  float64
 4   Interest_Rate           68797 non-null  float64
 5   EMI                     68797 non-null  float64
 6   Approved                68797 non-null  int64  
 7   Debt_Income             68797 non-null  float64
 8   Total_Loan              68797 non-null  float64
 9   Age                     68797 non-null  int64  
 10  Gender_Male             68797 non-null  int32  
 11  City_Category_B         68797 non-null  int32  
 12  City_Category_C         68797 non-null  int32  
 13  Employer_Category1_B    68797 non-null  int32  
 14  Employer_Category1_C    68797 non-null

## Split the data 

In [10]:
seed = 42

X = df.drop(['Approved'], axis=1)
y = df['Approved']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=seed)

## Numerical scaling

In [11]:
numeric_feats = X_train[['Monthly_Income', 'Existing_EMI', 'Loan_Amount', 'Loan_Period', 'Interest_Rate', 'EMI', 'Debt_Income', 'Total_Loan', 'Age']]
categorical_feats = X_train.drop(['Monthly_Income', 'Existing_EMI', 'Loan_Amount', 'Loan_Period', 'Interest_Rate', 'EMI', 'Debt_Income', 'Total_Loan', 'Age'], axis=1)

# StandardScaler
ss = StandardScaler()
scaled = ss.fit_transform(numeric_feats)
scaled_df = pd.DataFrame(scaled, index=numeric_feats.index, columns=numeric_feats.columns)
X_train_ss = pd.concat([scaled_df, categorical_feats], axis=1)

# RobustScaler 
rs = RobustScaler()
scaled = rs.fit_transform(numeric_feats)
scaled_df = pd.DataFrame(scaled, index=numeric_feats.index, columns=numeric_feats.columns)
X_train_rs = pd.concat([scaled_df, categorical_feats], axis=1)

## Feature selection 

In [12]:
# RFE - LR, SS
lr = LogisticRegression(max_iter=400)

rfe = RFE(estimator=lr, n_features_to_select=10, step=1)

rfe.fit(X_train_ss, y_train)

# print("Feature ranking: ", rfe.ranking_)
# print("Feature selection: ", rfe.support_)

ss_rfe_table = pd.DataFrame({
    'Feature': X_train_ss.columns,
    'Ranking': rfe.ranking_,
    'SAelection': rfe.support_
})

ss_rfe_table.sort_values(by='Ranking')

selected_features_ss = list(ss_rfe_table[ss_rfe_table['Ranking'] == 1]['Feature'])

X_train_ss_rfe = X_train_ss[selected_features_ss]

In [13]:
# RFE - LR, RS
lr = LogisticRegression(max_iter=400)

rfe = RFE(estimator=lr, n_features_to_select=10, step=1)

rfe.fit(X_train_rs, y_train)

# print("Feature ranking: ", rfe.ranking_)
# print("Feature selection: ", rfe.support_)

rs_rfe_table = pd.DataFrame({
    'Feature': X_train_ss.columns,
    'Ranking': rfe.ranking_,
    'SAelection': rfe.support_
})

rs_rfe_table.sort_values(by='Ranking')

selected_features_rs = list(rs_rfe_table[rs_rfe_table['Ranking'] == 1]['Feature'])

X_train_rs_rfe = X_train_rs[selected_features_rs]

In [14]:
# RFE - SVC, SS
svc = SVC(kernel='linear')

rfe = RFE(estimator=svc, n_features_to_select=10, step=1)

rfe.fit(X_train_ss, y_train)

svc_ss_rfe_table = pd.DataFrame({
    'Feature': X_train_ss.columns,
    'Ranking': rfe.ranking_,
    'SAelection': rfe.support_
})

svc_ss_rfe_table.sort_values(by='Ranking')

selected_features_svc_ss = list(svc_ss_rfe_table[svc_ss_rfe_table['Ranking'] == 1]['Feature'])

X_train_svc_ss_rfe = X_train_ss[selected_features_svc_ss]

## Sampling 

In [15]:
# SMOTE - LR, SS
smote = SMOTE(random_state=seed)

X_train_ss_rfe_smote, y_train_ss_smote = smote.fit_resample(X_train_ss_rfe, y_train)

print(f"Before SMOTE: {pd.Series(y_train).value_counts()}")
print(f"After SMOTE: {pd.Series(y_train_ss_smote).value_counts()}")

Before SMOTE: Approved
0    54223
1      814
Name: count, dtype: int64
After SMOTE: Approved
0    54223
1    54223
Name: count, dtype: int64


In [16]:
# SMOTE - LR, RS 
smote = SMOTE(random_state=seed)

X_train_rs_rfe_smote, y_train_rs_smote = smote.fit_resample(X_train_rs_rfe, y_train)

print(f"Before SMOTE: {pd.Series(y_train).value_counts()}")
print(f"After SMOTE: {pd.Series(y_train_rs_smote).value_counts()}")

Before SMOTE: Approved
0    54223
1      814
Name: count, dtype: int64
After SMOTE: Approved
0    54223
1    54223
Name: count, dtype: int64


In [17]:
# SMOTE - SVC, SS
smote = SMOTE(random_state=seed)

X_train_svc_ss_rfe_smote, y_train_svc_ss_smote = smote.fit_resample(X_train_svc_ss_rfe, y_train)

print(f"Before SMOTE: {pd.Series(y_train).value_counts()}")
print(f"After SMOTE: {pd.Series(y_train_svc_ss_smote).value_counts()}")

Before SMOTE: Approved
0    54223
1      814
Name: count, dtype: int64
After SMOTE: Approved
0    54223
1    54223
Name: count, dtype: int64


In [18]:
# Random oversampling

In [19]:
# Random undersampling

## Modelling 

In [20]:
# Logistic Regression | StandardScaler 
lr_ss = LogisticRegression(max_iter=400)

lr_ss.fit(X_train_ss_rfe_smote, y_train_ss_smote)

y_preds_lr_ss = lr_ss.predict(X_test[selected_features_ss])

conf_matrix = confusion_matrix(y_test, y_preds_lr_ss)
report = classification_report(y_test, y_preds_lr_ss)

print("Confusion matrix.................\n", conf_matrix)
print("\n\nClassification report...............\n", report)

print("\n\nCross Validation Score...............\n", cross_val_score(lr_ss, X, y, cv=5))

Confusion matrix.................
 [[5177 8382]
 [  23  178]]


Classification report...............
               precision    recall  f1-score   support

           0       1.00      0.38      0.55     13559
           1       0.02      0.89      0.04       201

    accuracy                           0.39     13760
   macro avg       0.51      0.63      0.30     13760
weighted avg       0.98      0.39      0.54     13760



Cross Validation Score...............
 [0.98524709 0.98510174 0.98510066 0.98510066 0.98510066]


In [21]:
# Logistic Regression | RobustScaler 
lr_rs = LogisticRegression(solver='liblinear', max_iter=400)

lr_rs.fit(X_train_rs_rfe_smote, y_train_rs_smote)

y_preds_lr_rs = lr_rs.predict(X_test[selected_features_rs])

conf_matrix = confusion_matrix(y_test, y_preds_lr_rs)
report = classification_report(y_test, y_preds_lr_rs)

print("Confusion matrix.................\n", conf_matrix)
print("\n\nClassification report...............\n", report)
print("\n\nCross Validation Score...............\n", cross_val_score(lr_rs, X, y, cv=5))

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Confusion matrix.................
 [[    0 13559]
 [    0   201]]


Classification report...............
               precision    recall  f1-score   support

           0       0.00      0.00      0.00     13559
           1       0.01      1.00      0.03       201

    accuracy                           0.01     13760
   macro avg       0.01      0.50      0.01     13760
weighted avg       0.00      0.01      0.00     13760



Cross Validation Score...............
 [0.98524709 0.98517442 0.98510066 0.98510066 0.98510066]


In [22]:
# MLP

In [23]:
# SVC | StandardScaler
svc = SVC(random_state=seed)

svc.fit(X_train_svc_ss_rfe_smote, y_train_svc_ss_smote)

y_preds = svc.predict(X_test[selected_features_svc_ss])

conf_matrix = confusion_matrix(y_test, y_preds)
report = classification_report(y_test, y_preds)

print("Confusion matrix.................\n", conf_matrix)
print("\n\nClassification report...............\n", report)
print("\n\nCross Validation Score...............\n", cross_val_score(svc, X, y, cv=5))

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Confusion matrix.................
 [[13559     0]
 [  201     0]]


Classification report...............
               precision    recall  f1-score   support

           0       0.99      1.00      0.99     13559
           1       0.00      0.00      0.00       201

    accuracy                           0.99     13760
   macro avg       0.49      0.50      0.50     13760
weighted avg       0.97      0.99      0.98     13760



Cross Validation Score...............
 [0.98524709 0.98524709 0.98524602 0.98524602 0.98524602]


# SVC | RobustScaler
svc = SVC(random_state=seed)

svc.fit(X_train_rs_rfe_smote, y_train_rs_smote)

conf_matrix = confusion_matrix(y_test, y_preds_rs)
report = classification_report(y_test, y_preds_rs)

print("Confusion matrix.................\n", conf_matrix)
print("\n\nClassification report...............\n", report)

## Testing for overfitting

In [24]:
# Test LogisticRegression, StandardScaler
train_preds = lr_ss.predict(X_train_ss_rfe_smote)
test_preds = lr_ss.predict(X_test[selected_features_ss])

print("LR, SS Training Accuracy: ", accuracy_score(y_train_ss_smote, train_preds))
print("LR, SS Test Accuracy: ", accuracy_score(y_test, test_preds))

# Test LogisticRegression, RobustScaler

# Test SVC, StandardScaler
svc_train_preds = svc.predict(X_train_svc_ss_rfe_smote)
svc_test_preds = svc.predict(X_test[selected_features_svc_ss])

print("\nSVC, SS Training Accuracy: ", accuracy_score(y_train_svc_ss_smote, train_preds))
print("SVC, SS Test Accuracy: ", accuracy_score(y_test, test_preds))

# Test SVC, RobustScaler 

LR, SS Training Accuracy:  0.7050236984305553
LR, SS Test Accuracy:  0.389171511627907

SVC, SS Training Accuracy:  0.7050236984305553
SVC, SS Test Accuracy:  0.389171511627907


## Fine tuning

In [33]:
# Grid Search to find the best hyperparameters 

lr_tune = LogisticRegression(solver='liblinear', max_iter=200)

param_grid = {
    'C': [0.1, 1, 10, 100],  # Regularization strength
    'penalty': ['l1', 'l2']  # Norm used in the penalization
}

grid_search = GridSearchCV(estimator=lr_tune, param_grid=param_grid, cv=5, verbose=1, scoring='recall')

grid_search.fit(X_train_ss_rfe_smote, y_train_ss_smote)

print("Best parameters:", grid_search.best_params_)
print("Best score:", grid_search.best_score_)

Fitting 5 folds for each of 8 candidates, totalling 40 fits
Best parameters: {'C': 1, 'penalty': 'l2'}
Best score: 0.7288604236722172


In [34]:
# Logistic Regression | StandardScaler 
lr = LogisticRegression(solver='liblinear', penalty='l2', C=1)

lr.fit(X_train_ss_rfe_smote, y_train_ss_smote)

y_preds_ss = lr.predict(X_test[selected_features_ss])

conf_matrix = confusion_matrix(y_test, y_preds_ss)
report = classification_report(y_test, y_preds_ss)

print("Confusion matrix.................\n", conf_matrix)
print("\n\nClassification report...............\n", report)

Confusion matrix.................
 [[5182 8377]
 [  23  178]]


Classification report...............
               precision    recall  f1-score   support

           0       1.00      0.38      0.55     13559
           1       0.02      0.89      0.04       201

    accuracy                           0.39     13760
   macro avg       0.51      0.63      0.30     13760
weighted avg       0.98      0.39      0.54     13760



In [39]:
# Setting a new threshold to increase Recall 

lr = LogisticRegression(solver='liblinear', penalty='l2', C=1)

lr.fit(X_train_ss_rfe_smote, y_train_ss_smote)

y_preds_ss = lr.predict(X_test[selected_features_ss])

probs = lr.predict_proba(X_test[selected_features_ss])

new_threshold = 0.4

y_pred_new_threshold = (probs[:, 1] > new_threshold).astype(int)

conf_matrix = confusion_matrix(y_test, y_pred_new_threshold)
report = classification_report(y_test, y_pred_new_threshold)

print("Confusion matrix.................\n", conf_matrix)
print("\n\nClassification report...............\n", report)

Confusion matrix.................
 [[4671 8888]
 [  21  180]]


Classification report...............
               precision    recall  f1-score   support

           0       1.00      0.34      0.51     13559
           1       0.02      0.90      0.04       201

    accuracy                           0.35     13760
   macro avg       0.51      0.62      0.28     13760
weighted avg       0.98      0.35      0.50     13760



## Final Model Settings

In [40]:
# To make the threshold change persist in future predictions 

def predict_with_custom_threshold(model, X, threshold=0.4):
    probabilities = model.predict_proba(X)
    return (probabilities[:, 1] > threshold).astype(int)

# Usage
y_pred_custom = predict_with_custom_threshold(lr, X_test[selected_features_ss], 0.4)

conf_matrix = confusion_matrix(y_test, y_pred_custom)
report = classification_report(y_test, y_pred_custom)

print("Confusion matrix.................\n", conf_matrix)
print("\n\nClassification report...............\n", report)


Confusion matrix.................
 [[4671 8888]
 [  21  180]]


Classification report...............
               precision    recall  f1-score   support

           0       1.00      0.34      0.51     13559
           1       0.02      0.90      0.04       201

    accuracy                           0.35     13760
   macro avg       0.51      0.62      0.28     13760
weighted avg       0.98      0.35      0.50     13760

