# Lending Tree Credit Risk

### Dependencies and data

In [1]:
# Dependencies
from pathlib import Path
from collections import Counter
import numpy as np
import pandas as pd
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

from imblearn.under_sampling import RandomUnderSampler, ClusterCentroids
from imblearn.over_sampling import RandomOverSampler, SMOTE
from imblearn.combine import SMOTEENN
from imblearn.ensemble import EasyEnsembleClassifier, BalancedRandomForestClassifier
from imblearn.metrics import classification_report_imbalanced

# print(mpl.style.available)
mpl.style.use('Solarize_Light2')
%matplotlib inline

In [2]:
# Inspect top rows in data
with open(Path('data/loans_1q19.csv')) as f:
    for i in range(5):
        print(f.readline()[:100])

Notes offered by Prospectus (https://www.lendingclub.com/info/prospectus.action)

"id","member_id","loan_amnt","funded_amnt","funded_amnt_inv","term","int_rate","installment","grade"
"","","20000","20000","20000"," 60 months"," 17.19%","499.1","C","C5","Front desk supervisor","6 yea
"","","21225","21225","21225"," 60 months"," 14.74%","502.05","C","C2","ceo","10+ years","MORTGAGE",
"","","5000","5000","5000"," 36 months"," 17.97%","180.69","D","D1","","n/a","MORTGAGE","62000","Not


In [3]:
# Data
df = pd.read_csv(Path('data/loans_1q19.csv'), skiprows=1, low_memory=False)
print(df.shape)
df.head(3)

(115677, 144)


Unnamed: 0,id,member_id,loan_amnt,funded_amnt,funded_amnt_inv,term,int_rate,installment,grade,sub_grade,...,orig_projected_additional_accrued_interest,hardship_payoff_balance_amount,hardship_last_payment_amount,debt_settlement_flag,debt_settlement_flag_date,settlement_status,settlement_date,settlement_amount,settlement_percentage,settlement_term
0,,,20000.0,20000.0,20000.0,60 months,17.19%,499.1,C,C5,...,,,,N,,,,,,
1,,,21225.0,21225.0,21225.0,60 months,14.74%,502.05,C,C2,...,,,,N,,,,,,
2,,,5000.0,5000.0,5000.0,36 months,17.97%,180.69,D,D1,...,,,,N,,,,,,


### Drop unusable data

In [4]:
# Drop rows and columns with more than 10% its values missing
df.dropna(axis=1, thresh=df.shape[0]*0.9, inplace=True) # drop cols
df.dropna(axis=0, thresh=df.shape[1]*0.9, inplace=True) # drop rows
df.shape

(115674, 97)

In [5]:
# Drop constant columns
const_cols = df.nunique()[df.nunique() < 2].index # cols w/ 1 unique val
df.drop(const_cols, axis=1, inplace=True)
df.shape

(115674, 87)

In [6]:
# Drop newly issued loans
df = df[df['loan_status'] != 'Issued']
df.shape

(96839, 87)

### Convert all columns to numeric

In [7]:
# Inspect non-numeric columns
df_num = df.copy() # make a copy
obj_cols = df_num.dtypes[df_num.dtypes == object].index
df[obj_cols].head(3)

Unnamed: 0,term,int_rate,grade,sub_grade,emp_length,home_ownership,verification_status,issue_d,loan_status,purpose,title,zip_code,addr_state,earliest_cr_line,revol_util,initial_list_status,next_pymnt_d,last_credit_pull_d,application_type
80,60 months,13.08%,B,B5,< 1 year,MORTGAGE,Verified,Mar-2019,Fully Paid,debt_consolidation,Debt consolidation,240xx,VA,Jul-2005,66.7%,w,,Apr-2019,Individual
82,36 months,22.50%,D,D3,10+ years,RENT,Not Verified,Mar-2019,Current,debt_consolidation,Debt consolidation,957xx,CA,Aug-2002,95.6%,w,May-2019,Apr-2019,Individual
93,36 months,17.19%,C,C5,9 years,RENT,Source Verified,Mar-2019,Current,debt_consolidation,Debt consolidation,765xx,TX,Apr-2009,43.5%,w,May-2019,Apr-2019,Individual


In [8]:
""" String manipulation """

# Convert `term` to numeric
df_num['term'] = df['term'].str.replace(' months', '').astype(float)

# Convert `int_rate` to numeric
df_num['int_rate'] = df['int_rate'].str.replace('%', '').astype(float)

# Convert `emp_length` to numeric
df_num['emp_length'] = df['emp_length'].str.extract('(\d+)').astype(float)

# Convert `revol_util` to numeric
df_num['revol_util'] = df['revol_util'].str.replace('%', '').astype(float)

df_num[obj_cols].head(3)

Unnamed: 0,term,int_rate,grade,sub_grade,emp_length,home_ownership,verification_status,issue_d,loan_status,purpose,title,zip_code,addr_state,earliest_cr_line,revol_util,initial_list_status,next_pymnt_d,last_credit_pull_d,application_type
80,60.0,13.08,B,B5,1.0,MORTGAGE,Verified,Mar-2019,Fully Paid,debt_consolidation,Debt consolidation,240xx,VA,Jul-2005,66.7,w,,Apr-2019,Individual
82,36.0,22.5,D,D3,10.0,RENT,Not Verified,Mar-2019,Current,debt_consolidation,Debt consolidation,957xx,CA,Aug-2002,95.6,w,May-2019,Apr-2019,Individual
93,36.0,17.19,C,C5,9.0,RENT,Source Verified,Mar-2019,Current,debt_consolidation,Debt consolidation,765xx,TX,Apr-2009,43.5,w,May-2019,Apr-2019,Individual


In [9]:
""" Datetime manipulation """

# Create a new column for `issue_d` as numeric type
df_num['issue_month'] = pd.to_datetime(df['issue_d']).dt.month

# Create a new column for `earliest_cr_line` as numeric type
df_num['earliest_cr_line'] = pd.to_datetime(df['earliest_cr_line']) # convert to dt
youngest_cr = df_num['earliest_cr_line'].max() # latest date in data
df_num['oldest_cr_age'] = (youngest_cr - df_num['earliest_cr_line']).dt.days # oldest credit age

# Create a new column for 'last_credit_pull_d' as numeric type
df_num['last_credit_pull_month'] = pd.to_datetime(df['last_credit_pull_d']).dt.month
df_num['last_credit_pull_month'] = df_num['last_credit_pull_month'].replace(12, 0) # set Dec 2018 as month 0

df_num[obj_cols].head(3)

Unnamed: 0,term,int_rate,grade,sub_grade,emp_length,home_ownership,verification_status,issue_d,loan_status,purpose,title,zip_code,addr_state,earliest_cr_line,revol_util,initial_list_status,next_pymnt_d,last_credit_pull_d,application_type
80,60.0,13.08,B,B5,1.0,MORTGAGE,Verified,Mar-2019,Fully Paid,debt_consolidation,Debt consolidation,240xx,VA,2005-07-01,66.7,w,,Apr-2019,Individual
82,36.0,22.5,D,D3,10.0,RENT,Not Verified,Mar-2019,Current,debt_consolidation,Debt consolidation,957xx,CA,2002-08-01,95.6,w,May-2019,Apr-2019,Individual
93,36.0,17.19,C,C5,9.0,RENT,Source Verified,Mar-2019,Current,debt_consolidation,Debt consolidation,765xx,TX,2009-04-01,43.5,w,May-2019,Apr-2019,Individual


In [10]:
""" Numeric mapping """

# Convert `grade` to numeric
grade_mapping = {'A': 1, 'B': 2, 'C': 3, 'D': 4, 'E': 5, 'F': 6, 'G': 7} # num mapping
df_num['grade'] = df['grade'].map(grade_mapping).astype(float)

# Convert `home_ownership` to numeric
home_mapping = dict.fromkeys(['RENT', 'ANY', 'NONE'], 0) # num mapping
home_mapping.update(dict.fromkeys(['MORTGAGE', 'OWN'], 1)) # add 1 label
df_num['home_ownership'] = df['home_ownership'].map(home_mapping).astype(float)

# Convert `verification_state` to numeric
df_num['verification_status'] = df['verification_status'].str.replace('Source ', '') # combine verified labels
veri_mapping = {'Not Verified': 0, 'Verified': 1} # num mapping
df_num['verification_status'] = df['verification_status'].map(veri_mapping).astype(float)

# Create a new column for `initial_list_status` as numeric type
init_mapping = {'f': 0, 'w': 1} # num mapping
df_num['whole_loan'] = df['initial_list_status'].map(init_mapping).astype(float)

# Create a new column for `application_type` as numeric type
app_mapping = {'Individual': 0, 'Joint App': 1} # num mapping
df_num['joint_app'] = df['application_type'].map(app_mapping).astype(float)

# Create a new column for `loan_status` as numeric type
stat_mapping = dict.fromkeys(['Charged Off', 'In Grace Period', # num mapping
                              'Late (16-30 days)', 'Late (31-120 days)'], 1) # high risk
stat_mapping.update(dict.fromkeys(['Fully Paid', 'Current'], 0)) # low risk
df_num['high_risk'] = df['loan_status'].map(stat_mapping).astype(float)

df_num[obj_cols].head(3)

Unnamed: 0,term,int_rate,grade,sub_grade,emp_length,home_ownership,verification_status,issue_d,loan_status,purpose,title,zip_code,addr_state,earliest_cr_line,revol_util,initial_list_status,next_pymnt_d,last_credit_pull_d,application_type
80,60.0,13.08,2.0,B5,1.0,1.0,1.0,Mar-2019,Fully Paid,debt_consolidation,Debt consolidation,240xx,VA,2005-07-01,66.7,w,,Apr-2019,Individual
82,36.0,22.5,4.0,D3,10.0,0.0,0.0,Mar-2019,Current,debt_consolidation,Debt consolidation,957xx,CA,2002-08-01,95.6,w,May-2019,Apr-2019,Individual
93,36.0,17.19,3.0,C5,9.0,0.0,,Mar-2019,Current,debt_consolidation,Debt consolidation,765xx,TX,2009-04-01,43.5,w,May-2019,Apr-2019,Individual


In [11]:
""" One-hot encoding """

# Group labels into 3 categories
df_num['purpose'] = df['purpose'].replace(['debt_consolidation', 'credit_card', 'medical'], 'debt') \
                                 .replace(['home_improvement', 'car', 'house', 'vacation'], 'major_purchase') \
                                 .replace(['small_business', 'moving', 'renewable_energy', 'other'], '_other')

# One-hot encode `purpose` and drop the last label
df_num = pd.get_dummies(df_num, columns=['purpose'], drop_first=True)
df_num.head(3)

Unnamed: 0,loan_amnt,funded_amnt,funded_amnt_inv,term,int_rate,installment,grade,sub_grade,emp_length,home_ownership,...,total_bc_limit,total_il_high_credit_limit,issue_month,oldest_cr_age,last_credit_pull_month,whole_loan,joint_app,high_risk,purpose_debt,purpose_major_purchase
80,35000.0,35000.0,35000.0,60.0,13.08,797.8,2.0,B5,1.0,1.0,...,32700.0,130876.0,3,3867,4.0,1.0,0.0,0.0,1,0
82,20000.0,20000.0,20000.0,36.0,22.5,769.0,4.0,D3,10.0,0.0,...,47000.0,30797.0,3,4932,4.0,1.0,0.0,0.0,1,0
93,10500.0,10500.0,10500.0,36.0,17.19,375.35,3.0,C5,9.0,0.0,...,2000.0,61987.0,3,2497,4.0,1.0,0.0,0.0,1,0


### Additional cleaning

In [12]:
# Drop redundant cols
cols_to_drop = ['title', 'sub_grade', 'zip_code', 'issue_d', 'loan_status', 
                'earliest_cr_line', 'addr_state', 'verification_status', 'next_pymnt_d', 
                'last_credit_pull_d', 'initial_list_status', 'application_type']
df_num.drop(cols_to_drop, axis=1, inplace=True)
df_num.shape

(96839, 82)

In [13]:
# Drop rows with missing values
df_num.dropna(inplace=True)
df_num.shape

(84198, 82)

In [14]:
# Find highly correlated columns
cor_cols = []
for i in range(df_num.shape[1] - 1):
    for j in range(i + 1, df_num.shape[1]):
        col1 = df_num.iloc[:, i]
        col2 = df_num.iloc[:, j]
        cor = col1.corr(col2)
        if abs(cor) > 0.7:
            print(col1.name, col2.name, cor)
            if col2.name not in cor_cols:
                cor_cols.append(col2.name)
                
cor_cols

loan_amnt funded_amnt 1.0
loan_amnt funded_amnt_inv 0.999997083401125
loan_amnt installment 0.9361457315663073
loan_amnt out_prncp 0.9628116324949524
loan_amnt out_prncp_inv 0.9628298134887975
funded_amnt funded_amnt_inv 0.999997083401125
funded_amnt installment 0.9361457315663073
funded_amnt out_prncp 0.9628116324949524
funded_amnt out_prncp_inv 0.9628298134887975
funded_amnt_inv installment 0.936136708511857
funded_amnt_inv out_prncp 0.9628082995548731
funded_amnt_inv out_prncp_inv 0.9628319067457652
int_rate grade 0.9644165592939707
installment out_prncp 0.8917068406731731
installment out_prncp_inv 0.8917195745051264
open_acc total_acc 0.739408034833447
open_acc num_op_rev_tl 0.8542594490286857
open_acc num_rev_accts 0.7183869384158287
open_acc num_sats 0.9996842051086133
pub_rec pub_rec_bankruptcies 0.9997149746017331
revol_bal total_rev_hi_lim 0.7670091267372624
revol_util bc_util 0.89864728747148
revol_util percent_bc_gt_75 0.7620369638540233
total_acc num_il_tl 0.727778251003140

['funded_amnt',
 'funded_amnt_inv',
 'installment',
 'out_prncp',
 'out_prncp_inv',
 'grade',
 'total_acc',
 'num_op_rev_tl',
 'num_rev_accts',
 'num_sats',
 'pub_rec_bankruptcies',
 'total_rev_hi_lim',
 'bc_util',
 'percent_bc_gt_75',
 'num_il_tl',
 'total_pymnt_inv',
 'total_rec_prncp',
 'last_pymnt_amnt',
 'avg_cur_bal',
 'tot_hi_cred_lim',
 'num_tl_op_past_12m',
 'open_il_24m',
 'total_bal_ex_mort',
 'total_il_high_credit_limit',
 'open_rv_24m',
 'acc_open_past_24mths',
 'bc_open_to_buy',
 'total_bc_limit',
 'oldest_cr_age',
 'num_actv_rev_tl',
 'num_bc_sats',
 'num_rev_tl_bal_gt_0',
 'num_bc_tl',
 'purpose_major_purchase']

In [15]:
# Drop highly correlated columns, keeping only 1 column per correlation
df_num.drop(cor_cols, axis=1, inplace=True)
df_num.shape

(84198, 48)

In [16]:
# Find integer columns
df_int = df_num % 1
int_cols = df_int.nunique()[df_int.nunique() < 2].index

# Convert above columns to integer type
for col in int_cols:
    df_num[col] = df_num[col].astype(int)
    
df_num.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 84198 entries, 80 to 115674
Data columns (total 48 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   loan_amnt                   84198 non-null  int64  
 1   term                        84198 non-null  int64  
 2   int_rate                    84198 non-null  float64
 3   emp_length                  84198 non-null  int64  
 4   home_ownership              84198 non-null  int64  
 5   annual_inc                  84198 non-null  float64
 6   dti                         84198 non-null  float64
 7   delinq_2yrs                 84198 non-null  int64  
 8   inq_last_6mths              84198 non-null  int64  
 9   open_acc                    84198 non-null  int64  
 10  pub_rec                     84198 non-null  int64  
 11  revol_bal                   84198 non-null  int64  
 12  revol_util                  84198 non-null  float64
 13  total_pymnt                 8

In [17]:
# Reset index
df_num.reset_index(drop=True, inplace=True)
df_num.head(3)

Unnamed: 0,loan_amnt,term,int_rate,emp_length,home_ownership,annual_inc,dti,delinq_2yrs,inq_last_6mths,open_acc,...,num_accts_ever_120_pd,num_actv_bc_tl,num_tl_90g_dpd_24m,pct_tl_nvr_dlq,issue_month,last_credit_pull_month,whole_loan,joint_app,high_risk,purpose_debt
0,35000,60,13.08,1,1,125890.0,30.48,0,0,14,...,0,5,0,100.0,3,4,1,0,0,1
1,20000,36,22.5,10,0,75000.0,24.37,0,0,8,...,0,6,0,100.0,3,4,1,0,0,1
2,10500,36,17.19,9,0,66000.0,27.24,0,0,8,...,0,1,0,85.7,3,4,1,0,0,1


### Data preprocessing

In [18]:
# Count target labels
df_num['high_risk'].value_counts()

0    83748
1      450
Name: high_risk, dtype: int64

In [19]:
# Feature/target split
X = df_num.drop('high_risk', axis=1).copy()
y = df_num['high_risk'].copy()

# Train/validation/test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, stratify=y, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_test, y_test, test_size=0.5, stratify=y_test, random_state=42)
X_train.shape, y_train.shape, X_val.shape, y_val.shape, X_test.shape, y_test.shape

((50518, 47), (50518,), (16840, 47), (16840,), (16840, 47), (16840,))

In [20]:
# Count target labels in each set
print(y_train.value_counts())
print(y_val.value_counts())
print(y_test.value_counts())

0    50248
1      270
Name: high_risk, dtype: int64
0    16750
1       90
Name: high_risk, dtype: int64
0    16750
1       90
Name: high_risk, dtype: int64


In [21]:
# Scale data
scaler = StandardScaler()
X_train_scaled = pd.DataFrame(scaler.fit_transform(X_train), columns=X_train.columns)
X_val_scaled = pd.DataFrame(scaler.transform(X_val), columns=X_val.columns)
X_test_scaled = pd.DataFrame(scaler.transform(X_test), columns=X_test.columns)
X_train_scaled.head(3)

Unnamed: 0,loan_amnt,term,int_rate,emp_length,home_ownership,annual_inc,dti,delinq_2yrs,inq_last_6mths,open_acc,...,mths_since_recent_bc,num_accts_ever_120_pd,num_actv_bc_tl,num_tl_90g_dpd_24m,pct_tl_nvr_dlq,issue_month,last_credit_pull_month,whole_loan,joint_app,purpose_debt
0,-0.972387,-0.703006,0.732672,-1.251658,-1.268834,-0.154803,-0.055052,-0.309825,-0.607667,-0.013543,...,0.406336,-0.317836,0.477172,-0.137288,0.589169,-0.975984,0.147108,-2.695437,-0.391972,0.431996
1,0.370574,1.422463,-0.044268,-1.251658,-1.268834,0.539986,-0.769578,-0.309825,-0.607667,0.657547,...,-0.638417,-0.317836,0.072094,-0.137288,0.589169,0.371104,0.147108,-2.695437,2.5512,-2.314836
2,-0.300906,1.422463,0.07622,-0.708172,-1.268834,-0.306232,0.666918,-0.309825,-0.607667,0.657547,...,-0.730602,-0.317836,0.072094,-0.137288,0.589169,0.371104,0.147108,-2.695437,-0.391972,0.431996


### Baseline machine learning

In [22]:
# Logistic regression
lr = LogisticRegression(random_state=42)
lr.fit(X_train_scaled, y_train)

# Evaluate model
lr_val_pred = lr.predict(X_val_scaled)
print(classification_report(y_val, lr_val_pred))
print('Accuracy:', accuracy_score(y_val, lr_val_pred))
pd.DataFrame(confusion_matrix(y_val, lr_val_pred), index=['0', '1'], columns=['Predicted 0', 'Predicted 1'])

              precision    recall  f1-score   support

           0       1.00      1.00      1.00     16750
           1       0.77      0.11      0.19        90

    accuracy                           1.00     16840
   macro avg       0.88      0.56      0.60     16840
weighted avg       0.99      1.00      0.99     16840

Accuracy: 0.9950712589073634


Unnamed: 0,Predicted 0,Predicted 1
0,16747,3
1,80,10


In [23]:
# Decision tree
tree = DecisionTreeClassifier(random_state=42)
tree.fit(X_train_scaled, y_train)

# Evaluate model
tree_val_pred = tree.predict(X_val_scaled)
print(classification_report(y_val, tree_val_pred))
print('Accuracy:', accuracy_score(y_val, tree_val_pred))
pd.DataFrame(confusion_matrix(y_val, tree_val_pred), index=['0', '1'], columns=['Predicted 0', 'Predicted 1'])

              precision    recall  f1-score   support

           0       1.00      1.00      1.00     16750
           1       0.38      0.39      0.39        90

    accuracy                           0.99     16840
   macro avg       0.69      0.69      0.69     16840
weighted avg       0.99      0.99      0.99     16840

Accuracy: 0.9934085510688836


Unnamed: 0,Predicted 0,Predicted 1
0,16694,56
1,55,35


### Undersampling

In [24]:
# Random undersampling
rus = RandomUnderSampler(random_state=42)
X_rus, y_rus = rus.fit_resample(X_train_scaled, y_train)
Counter(y_rus)

Counter({0: 270, 1: 270})

In [25]:
# Logistic regression
lr_rus = LogisticRegression(random_state=42)
lr_rus.fit(X_rus, y_rus)

# Evaluate model
lr_rus_val_pred = lr_rus.predict(X_val_scaled)
print(classification_report(y_val, lr_rus_val_pred))
print('Accuracy:', accuracy_score(y_val, lr_rus_val_pred))
pd.DataFrame(confusion_matrix(y_val, lr_rus_val_pred), index=['0', '1'], columns=['Predicted 0', 'Predicted 1'])

              precision    recall  f1-score   support

           0       1.00      0.75      0.85     16750
           1       0.01      0.62      0.03        90

    accuracy                           0.75     16840
   macro avg       0.51      0.68      0.44     16840
weighted avg       0.99      0.75      0.85     16840

Accuracy: 0.7466152019002376


Unnamed: 0,Predicted 0,Predicted 1
0,12517,4233
1,34,56


In [26]:
# Decision tree
tree_rus = DecisionTreeClassifier(random_state=42)
tree_rus.fit(X_rus, y_rus)

# Evaluate model
tree_rus_val_pred = tree_rus.predict(X_val_scaled)
print(classification_report(y_val, tree_rus_val_pred))
print('Accuracy:', accuracy_score(y_val, tree_rus_val_pred))
pd.DataFrame(confusion_matrix(y_val, tree_rus_val_pred), index=['0', '1'], columns=['Predicted 0', 'Predicted 1'])

              precision    recall  f1-score   support

           0       1.00      0.72      0.83     16750
           1       0.01      0.69      0.03        90

    accuracy                           0.72     16840
   macro avg       0.51      0.70      0.43     16840
weighted avg       0.99      0.72      0.83     16840

Accuracy: 0.7175771971496437


Unnamed: 0,Predicted 0,Predicted 1
0,12022,4728
1,28,62


In [27]:
# Cluster centroid undersampling
ccu = ClusterCentroids(random_state=42)
X_ccu, y_ccu = ccu.fit_resample(X_train_scaled, y_train)
Counter(y_ccu)

Counter({0: 270, 1: 270})

In [28]:
# Logistic regression
lr_ccu = LogisticRegression(random_state=42)
lr_ccu.fit(X_ccu, y_ccu)

# Evaluate model
lr_ccu_val_pred = lr_ccu.predict(X_val_scaled)
print(classification_report(y_val, lr_ccu_val_pred))
print('Accuracy:', accuracy_score(y_val, lr_ccu_val_pred))
pd.DataFrame(confusion_matrix(y_val, lr_ccu_val_pred), index=['0', '1'], columns=['Predicted 0', 'Predicted 1'])

              precision    recall  f1-score   support

           0       1.00      0.73      0.85     16750
           1       0.01      0.69      0.03        90

    accuracy                           0.73     16840
   macro avg       0.51      0.71      0.44     16840
weighted avg       0.99      0.73      0.84     16840

Accuracy: 0.7326009501187648


Unnamed: 0,Predicted 0,Predicted 1
0,12275,4475
1,28,62


In [29]:
# Decision tree
tree_ccu = DecisionTreeClassifier(random_state=42)
tree_ccu.fit(X_ccu, y_ccu)

# Evaluate model
tree_ccu_val_pred = tree_ccu.predict(X_val_scaled)
print(classification_report(y_val, tree_ccu_val_pred))
print('Accuracy:', accuracy_score(y_val, tree_ccu_val_pred))
pd.DataFrame(confusion_matrix(y_val, tree_ccu_val_pred), index=['0', '1'], columns=['Predicted 0', 'Predicted 1'])

              precision    recall  f1-score   support

           0       0.99      0.02      0.04     16750
           1       0.01      0.98      0.01        90

    accuracy                           0.03     16840
   macro avg       0.50      0.50      0.03     16840
weighted avg       0.99      0.03      0.04     16840

Accuracy: 0.02684085510688836


Unnamed: 0,Predicted 0,Predicted 1
0,364,16386
1,2,88


### Oversampling

In [30]:
# Random oversampling
ros = RandomOverSampler(random_state=42)
X_ros, y_ros = ros.fit_resample(X_train_scaled, y_train)
Counter(y_ros)

Counter({0: 50248, 1: 50248})

In [31]:
# Logistic regression
lr_ros = LogisticRegression(random_state=42)
lr_ros.fit(X_ros, y_ros)

# Evaluate model
lr_ros_val_pred = lr_ros.predict(X_val_scaled)
print(classification_report(y_val, lr_ros_val_pred))
print('Accuracy:', accuracy_score(y_val, lr_ros_val_pred))
pd.DataFrame(confusion_matrix(y_val, lr_ros_val_pred), index=['0', '1'], columns=['Predicted 0', 'Predicted 1'])

              precision    recall  f1-score   support

           0       1.00      0.82      0.90     16750
           1       0.02      0.62      0.04        90

    accuracy                           0.82     16840
   macro avg       0.51      0.72      0.47     16840
weighted avg       0.99      0.82      0.90     16840

Accuracy: 0.8182897862232779


Unnamed: 0,Predicted 0,Predicted 1
0,13724,3026
1,34,56


In [32]:
# Decision tree
tree_ros = DecisionTreeClassifier(random_state=42)
tree_ros.fit(X_ros, y_ros)

# Evaluate model
tree_ros_val_pred = tree_ros.predict(X_val_scaled)
print(classification_report(y_val, tree_ros_val_pred))
print('Accuracy:', accuracy_score(y_val, tree_ros_val_pred))
pd.DataFrame(confusion_matrix(y_val, tree_ros_val_pred), index=['0', '1'], columns=['Predicted 0', 'Predicted 1'])

              precision    recall  f1-score   support

           0       1.00      1.00      1.00     16750
           1       0.36      0.32      0.34        90

    accuracy                           0.99     16840
   macro avg       0.68      0.66      0.67     16840
weighted avg       0.99      0.99      0.99     16840

Accuracy: 0.9933491686460808


Unnamed: 0,Predicted 0,Predicted 1
0,16699,51
1,61,29


In [33]:
# SMOTE
smo = SMOTE(random_state=42)
X_smo, y_smo = smo.fit_resample(X_train_scaled, y_train)
Counter(y_smo)

Counter({0: 50248, 1: 50248})

In [34]:
# Logistic regression
lr_smo = LogisticRegression(random_state=42)
lr_smo.fit(X_smo, y_smo)

# Evaluate model
lr_smo_val_pred = lr_smo.predict(X_val_scaled)
print(classification_report(y_val, lr_smo_val_pred))
print('Accuracy:', accuracy_score(y_val, lr_smo_val_pred))
pd.DataFrame(confusion_matrix(y_val, lr_smo_val_pred), index=['0', '1'], columns=['Predicted 0', 'Predicted 1'])

              precision    recall  f1-score   support

           0       1.00      0.83      0.91     16750
           1       0.02      0.67      0.04        90

    accuracy                           0.83     16840
   macro avg       0.51      0.75      0.47     16840
weighted avg       0.99      0.83      0.90     16840

Accuracy: 0.8291567695961995


Unnamed: 0,Predicted 0,Predicted 1
0,13903,2847
1,30,60


In [35]:
# Decision tree
tree_smo = DecisionTreeClassifier(random_state=42)
tree_smo.fit(X_smo, y_smo)

# Evaluate model
tree_smo_val_pred = tree_smo.predict(X_val_scaled)
print(classification_report(y_val, tree_smo_val_pred))
print('Accuracy:', accuracy_score(y_val, tree_smo_val_pred))
pd.DataFrame(confusion_matrix(y_val, tree_smo_val_pred), index=['0', '1'], columns=['Predicted 0', 'Predicted 1'])

              precision    recall  f1-score   support

           0       1.00      0.99      0.99     16750
           1       0.21      0.36      0.27        90

    accuracy                           0.99     16840
   macro avg       0.61      0.67      0.63     16840
weighted avg       0.99      0.99      0.99     16840

Accuracy: 0.9896080760095012


Unnamed: 0,Predicted 0,Predicted 1
0,16633,117
1,58,32


### Combination sampling

In [36]:
# SMOTEENN
sen = SMOTEENN(random_state=42)
X_sen, y_sen = sen.fit_resample(X_train_scaled, y_train)
Counter(y_sen)

Counter({0: 46836, 1: 50248})

In [37]:
# Logistic regression`
lr_sen = LogisticRegression(random_state=42)
lr_sen.fit(X_sen, y_sen)

# Evaluate model
lr_sen_val_pred = lr_sen.predict(X_val_scaled)
print(classification_report(y_val, lr_sen_val_pred))
print('Accuracy:', accuracy_score(y_val, lr_sen_val_pred))
pd.DataFrame(confusion_matrix(y_val, lr_sen_val_pred), index=['0', '1'], columns=['Predicted 0', 'Predicted 1'])

              precision    recall  f1-score   support

           0       1.00      0.82      0.90     16750
           1       0.02      0.69      0.04        90

    accuracy                           0.81     16840
   macro avg       0.51      0.75      0.47     16840
weighted avg       0.99      0.81      0.89     16840

Accuracy: 0.8146080760095011


Unnamed: 0,Predicted 0,Predicted 1
0,13656,3094
1,28,62


In [38]:
# Decision tree
tree_sen = DecisionTreeClassifier(random_state=42)
tree_sen.fit(X_sen, y_sen)

# Evaluate model
tree_sen_val_pred = tree_sen.predict(X_val_scaled)
print(classification_report(y_val, tree_sen_val_pred))
print('Accuracy:', accuracy_score(y_val, tree_sen_val_pred))
pd.DataFrame(confusion_matrix(y_val, tree_sen_val_pred), index=['0', '1'], columns=['Predicted 0', 'Predicted 1'])

              precision    recall  f1-score   support

           0       1.00      0.99      0.99     16750
           1       0.18      0.38      0.25        90

    accuracy                           0.99     16840
   macro avg       0.59      0.68      0.62     16840
weighted avg       0.99      0.99      0.99     16840

Accuracy: 0.9877672209026128


Unnamed: 0,Predicted 0,Predicted 1
0,16600,150
1,56,34


### Sampling comparison

In [39]:
def clf_report(y_true, y_pred):
    
    """
    Using the confusion matrix and classification report, create a custom classification report 
    with the following values: true positives, false negatives, false positives, true negatives, 
    and the macro average F1 score, as well as the precision, recall, and F1 score for both classes. 
    In this case, the positive class is labeled 0 (low risk) and the negative class is labeled 1 
    (high risk).
    
    Parameters
    ----------
    y_true : list-like
        True target labels
    y_pred : list-like
        Predicted target labels
    
    Returns
    -------
    Dict
        Custom classification report with the 11 listed values
    """
    
    # Confusion matrix and classification report
    confusion_mat = confusion_matrix(y_true, y_pred)
    clf_rep = classification_report(y_true, y_pred, output_dict=True)
    
    # Report values: true 0, false 1, false 0, true 1, F1 for 0, F1 for 1, accuracy
    report = confusion_mat.ravel().tolist()
    report.extend([clf_rep['0']['precision'], clf_rep['0']['recall'], clf_rep['0']['f1-score'], 
                   clf_rep['1']['precision'], clf_rep['1']['recall'], clf_rep['1']['f1-score'], 
                   clf_rep['macro avg']['f1-score']])
    
    # Add report keys
    keys = ['true_pos', 'false_neg', 'false_pos', 'true_neg', 
            'precision_pos', 'recall_pos', 'f1_pos', 
            'precision_neg', 'recall_neg', 'f1_neg', 'f1_avg']
    report = dict(zip(keys, report))
    return report
    

# Test function
clf_report(y_val, lr_val_pred)

{'true_pos': 16747,
 'false_neg': 3,
 'false_pos': 80,
 'true_neg': 10,
 'precision_pos': 0.9952457360194925,
 'recall_pos': 0.9998208955223881,
 'f1_pos': 0.9975280698096911,
 'precision_neg': 0.7692307692307693,
 'recall_neg': 0.1111111111111111,
 'f1_neg': 0.1941747572815534,
 'f1_avg': 0.5958514135456222}

In [40]:
# Logistic regression comparison
trials = ['base', 'rand_undersamp', 'centroid_undersamp', 'rand_oversamp', 'smote', 'smoteenn']
lr_preds = [lr_val_pred, lr_rus_val_pred, lr_ccu_val_pred, lr_ros_val_pred, lr_smo_val_pred, lr_sen_val_pred]
lr_reports = [clf_report(y_val, y_pred) for y_pred in lr_preds]
lr_reports_df = pd.DataFrame(lr_reports, index=trials)
lr_reports_df

Unnamed: 0,true_pos,false_neg,false_pos,true_neg,precision_pos,recall_pos,f1_pos,precision_neg,recall_neg,f1_neg,f1_avg
base,16747,3,80,10,0.995246,0.999821,0.997528,0.769231,0.111111,0.194175,0.595851
rand_undersamp,12517,4233,34,56,0.997291,0.747284,0.854374,0.013057,0.622222,0.025577,0.439975
centroid_undersamp,12275,4475,28,62,0.997724,0.732836,0.845007,0.013665,0.688889,0.026799,0.435903
rand_oversamp,13724,3026,34,56,0.997529,0.819343,0.899698,0.01817,0.622222,0.035309,0.467504
smote,13903,2847,30,60,0.997847,0.83003,0.906235,0.02064,0.666667,0.04004,0.473137
smoteenn,13656,3094,28,62,0.997954,0.815284,0.897417,0.019645,0.688889,0.038201,0.467809


In [41]:
# Decision tree comparison
tree_preds = [tree_val_pred, tree_rus_val_pred, tree_ccu_val_pred, 
              tree_ros_val_pred, tree_smo_val_pred, tree_sen_val_pred]
tree_reports = [clf_report(y_val, y_pred) for y_pred in tree_preds]
tree_reports_df = pd.DataFrame(tree_reports, index=trials)
tree_reports_df

Unnamed: 0,true_pos,false_neg,false_pos,true_neg,precision_pos,recall_pos,f1_pos,precision_neg,recall_neg,f1_neg,f1_avg
base,16694,56,55,35,0.996716,0.996657,0.996686,0.384615,0.388889,0.38674,0.691713
rand_undersamp,12022,4728,28,62,0.997676,0.717731,0.834861,0.012944,0.688889,0.02541,0.430135
centroid_undersamp,364,16386,2,88,0.994536,0.021731,0.042533,0.005342,0.977778,0.010625,0.026579
rand_oversamp,16699,51,61,29,0.99636,0.996955,0.996658,0.3625,0.322222,0.341176,0.668917
smote,16633,117,58,32,0.996525,0.993015,0.994767,0.214765,0.355556,0.267782,0.631275
smoteenn,16600,150,56,34,0.996638,0.991045,0.993833,0.184783,0.377778,0.248175,0.621004


### Features

Both models had the highest precision on high-risk loans and average F1 score on the base training set (without sampling), but missed over 60% of the high-risked loans. On the other hand, the decision tree model on the cluster-centroid undersampled training set was able to catch 88 out of 90 (97.78%) high-risk loans, but at the cost of an extremely high number of false negatives.

Neither of these cases is ideal. While the priority for credit risk detection is a high recall, we don't want to sacrifice too much precision for it. Since the centroid undersampled decision tree model showed preference for the negative class, we may be able to improve its low precision by tuning the model.

In [42]:
# Top 12 feature coefficients for baseline logistic regression
lr_feats = sorted(zip(np.abs(lr.coef_[0]), X_train_scaled.columns), reverse=True)
lr_feats = np.array(lr_feats[:12]) # top 12
lr_feats

array([['4.36222622486425', 'total_rec_int'],
       ['1.7627231190860662', 'loan_amnt'],
       ['1.215271833592626', 'issue_month'],
       ['1.1459571251375447', 'int_rate'],
       ['0.9028134800222859', 'total_pymnt'],
       ['0.3664224952062791', 'dti'],
       ['0.2847431168810585', 'open_acc'],
       ['0.21361262494723082', 'whole_loan'],
       ['0.20241965908902826', 'home_ownership'],
       ['0.19150365212103154', 'total_rec_late_fee'],
       ['0.18685945955305044', 'mo_sin_old_rev_tl_op'],
       ['0.17521197785860346', 'term']], dtype='<U32')

In [43]:
# Top 12 feature coefficients for baseline decision tree
tree_feats = sorted(zip(np.abs(tree.feature_importances_), X_train_scaled.columns), reverse=True)
tree_feats = np.array(tree_feats[:12]) # top 12
tree_feats

array([['0.3940448530258009', 'total_pymnt'],
       ['0.051021135723275324', 'issue_month'],
       ['0.04275239257995762', 'dti'],
       ['0.04275155898948167', 'loan_amnt'],
       ['0.04033817747296305', 'total_rec_int'],
       ['0.03985775471352417', 'mths_since_rcnt_il'],
       ['0.03714856892664318', 'max_bal_bc'],
       ['0.028128838474578095', 'revol_bal'],
       ['0.0273539718647938', 'mo_sin_old_rev_tl_op'],
       ['0.0261873693213452', 'revol_util'],
       ['0.025343898052278144', 'mo_sin_old_il_acct'],
       ['0.02054943633264685', 'total_bal_il']], dtype='<U32')

In [44]:
# Features in both baseline sets
base_feats = np.intersect1d(lr_feats[:, 1], tree_feats[:, 1])
print(len(base_feats))
base_feats

6


array(['dti', 'issue_month', 'loan_amnt', 'mo_sin_old_rev_tl_op',
       'total_pymnt', 'total_rec_int'], dtype='<U32')

In [45]:
# Top 12 feature coefficients for cluster-centroid undersampled decision tree
ccu_feats = sorted(zip(np.abs(tree_ccu.feature_importances_), X_train_scaled.columns), reverse=True)
ccu_feats = np.array(ccu_feats[:12]) # top 12
ccu_feats

array([['0.7029199090127748', 'tot_coll_amt'],
       ['0.16549342524266558', 'delinq_2yrs'],
       ['0.04913527232855835', 'total_pymnt'],
       ['0.014080558149185575', 'dti'],
       ['0.013847821650851999', 'delinq_amnt'],
       ['0.011111111111111113', 'mort_acc'],
       ['0.010856658184902447', 'term'],
       ['0.009382142139755785', 'revol_bal'],
       ['0.0073125765066633245', 'num_tl_90g_dpd_24m'],
       ['0.00725034181298961', 'annual_inc'],
       ['0.004938271604938272', 'num_actv_bc_tl'],
       ['0.0036719122556032216', 'total_rec_int']], dtype='<U32')

In [46]:
# Features in all 3 sets
feats = np.intersect1d(base_feats, ccu_feats[:, 1])
feats

array(['dti', 'total_pymnt', 'total_rec_int'], dtype='<U32')

### Machine learning on selected features

In [47]:
# Baseline logistic regression
lr_sel = LogisticRegression(random_state=42)
lr_sel.fit(X_train_scaled[lr_feats[:, 1]], y_train)

# Evaluate model
lr_sel_val_pred = lr_sel.predict(X_val_scaled[lr_feats[:, 1]])
print(classification_report(y_val, lr_sel_val_pred))
pd.DataFrame(confusion_matrix(y_val, lr_sel_val_pred), index=['0', '1'], columns=['Predicted 0', 'Predicted 1'])

              precision    recall  f1-score   support

           0       1.00      1.00      1.00     16750
           1       0.67      0.09      0.16        90

    accuracy                           0.99     16840
   macro avg       0.83      0.54      0.58     16840
weighted avg       0.99      0.99      0.99     16840



Unnamed: 0,Predicted 0,Predicted 1
0,16746,4
1,82,8


In [48]:
# Baseline decision tree
tree_sel = DecisionTreeClassifier(random_state=42)
tree_sel.fit(X_train_scaled[tree_feats[:, 1]], y_train)

# Evaluate model
tree_sel_val_pred = tree_sel.predict(X_val_scaled[tree_feats[:, 1]])
print(classification_report(y_val, tree_sel_val_pred))
pd.DataFrame(confusion_matrix(y_val, tree_sel_val_pred), index=['0', '1'], columns=['Predicted 0', 'Predicted 1'])

              precision    recall  f1-score   support

           0       1.00      1.00      1.00     16750
           1       0.52      0.50      0.51        90

    accuracy                           0.99     16840
   macro avg       0.76      0.75      0.75     16840
weighted avg       0.99      0.99      0.99     16840



Unnamed: 0,Predicted 0,Predicted 1
0,16709,41
1,45,45


In [49]:
# Cluster centroid undersampling with selected features
X_ccu_sel = pd.DataFrame(X_ccu, columns=X_train_scaled.columns)[ccu_feats[:, 1]]
X_ccu_sel.head(3)

Unnamed: 0,tot_coll_amt,delinq_2yrs,total_pymnt,dti,delinq_amnt,mort_acc,term,revol_bal,num_tl_90g_dpd_24m,annual_inc,num_actv_bc_tl,total_rec_int
0,-0.092056,-0.167721,-0.220181,0.469734,-0.008585,0.024241,-0.38039,1.105764,-0.115606,0.087608,1.743042,-0.298885
1,-0.042557,-0.218356,-0.241683,-0.230097,-0.008585,-0.706678,-0.613427,-0.406936,-0.100071,-0.227753,-0.301943,-0.65002
2,0.029739,-0.070041,0.087908,-0.01763,-0.008585,0.457796,1.026189,-0.060181,-0.082409,0.140037,-0.03318,1.463715


In [50]:
# Decision tree with cluster-centroid undersampling
ccu_sel = DecisionTreeClassifier(random_state=42)
ccu_sel.fit(X_ccu_sel, y_ccu)

# Evaluate model
ccu_sel_val_pred = ccu_sel.predict(X_val_scaled[ccu_feats[:, 1]])
print(classification_report(y_val, ccu_sel_val_pred))
pd.DataFrame(confusion_matrix(y_val, ccu_sel_val_pred), index=['0', '1'], columns=['Predicted 0', 'Predicted 1'])

              precision    recall  f1-score   support

           0       0.99      0.02      0.03     16750
           1       0.01      0.98      0.01        90

    accuracy                           0.02     16840
   macro avg       0.50      0.50      0.02     16840
weighted avg       0.99      0.02      0.03     16840



Unnamed: 0,Predicted 0,Predicted 1
0,266,16484
1,2,88


### Decision tree tuning

In [51]:
# Parameters to tune
tree_params = {
    'max_depth': [10, 20, 50, 100, None], 
    'min_samples_split': [2, 8, 16, 32, 64], 
    'min_samples_leaf': [1, 4, 8, 16, 32]
}

# Grid search for baseline decision tree
tree_search = GridSearchCV(DecisionTreeClassifier(random_state=42), tree_params, 
                           cv=5, scoring='f1_macro', n_jobs=-1, verbose=2)
tree_search.fit(X_train_scaled[tree_feats[:, 1]], y_train)
print(tree_search.best_score_)
tree_search.best_params_

Fitting 5 folds for each of 125 candidates, totalling 625 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:   10.3s
[Parallel(n_jobs=-1)]: Done 154 tasks      | elapsed:   42.4s
[Parallel(n_jobs=-1)]: Done 357 tasks      | elapsed:  1.6min
[Parallel(n_jobs=-1)]: Done 625 out of 625 | elapsed:  3.3min finished


0.8087906001938865


{'max_depth': 50, 'min_samples_leaf': 4, 'min_samples_split': 32}

In [52]:
# Evaluate best model
tree_search_val_pred = tree_search.best_estimator_.predict(X_val_scaled[tree_feats[:, 1]])
print(classification_report(y_val, tree_search_val_pred))
pd.DataFrame(confusion_matrix(y_val, tree_search_val_pred), index=['0', '1'], columns=['Predicted 0', 'Predicted 1'])

              precision    recall  f1-score   support

           0       1.00      1.00      1.00     16750
           1       0.77      0.49      0.60        90

    accuracy                           1.00     16840
   macro avg       0.88      0.74      0.80     16840
weighted avg       1.00      1.00      1.00     16840



Unnamed: 0,Predicted 0,Predicted 1
0,16737,13
1,46,44


In [53]:
# Parameters to tune
ccu_params = {
    'max_depth': [10, 20, 50, 100, None], 
    'min_samples_split': [2, 8, 16, 32, 64], 
    'min_samples_leaf': [1, 4, 8, 16, 32]
}

# Grid search for decision tree with cluster-centroid undersampling
ccu_search = GridSearchCV(DecisionTreeClassifier(random_state=42), ccu_params, 
                           cv=5, scoring='f1_macro', n_jobs=-1, verbose=2)
ccu_search.fit(X_ccu_sel, y_ccu)
print(ccu_search.best_score_)
ccu_search.best_params_

Fitting 5 folds for each of 125 candidates, totalling 625 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done 136 tasks      | elapsed:    0.9s


0.9647952675311166


[Parallel(n_jobs=-1)]: Done 625 out of 625 | elapsed:    4.0s finished


{'max_depth': 10, 'min_samples_leaf': 1, 'min_samples_split': 2}

In [54]:
# Evaluate best model
ccu_search_val_pred = ccu_search.best_estimator_.predict(X_val_scaled[ccu_feats[:, 1]])
print(classification_report(y_val, ccu_search_val_pred))
pd.DataFrame(confusion_matrix(y_val, ccu_search_val_pred), index=['0', '1'], columns=['Predicted 0', 'Predicted 1'])

              precision    recall  f1-score   support

           0       0.99      0.02      0.03     16750
           1       0.01      0.98      0.01        90

    accuracy                           0.02     16840
   macro avg       0.50      0.50      0.02     16840
weighted avg       0.99      0.02      0.03     16840



Unnamed: 0,Predicted 0,Predicted 1
0,266,16484
1,2,88


### Ensemble classifiers

In [57]:
# Parameters to tune
brf_params = {
    'max_depth': [10, 20, None], 
    'min_samples_split': [2, 8, 32], 
    'min_samples_leaf': [1, 4, 16], 
    'max_features': ['auto', 0.5, None],
    'max_samples': [0.5, 0.75, None]
}

# Grid search for balanced random forest
brf_search = GridSearchCV(BalancedRandomForestClassifier(random_state=42), brf_params, 
                          cv=5, scoring='f1_macro', n_jobs=-1, verbose=2)
brf_search.fit(X_train_scaled[tree_feats[:, 1]], y_train)
print(brf_search.best_score_)
brf_search.best_params_

Fitting 5 folds for each of 243 candidates, totalling 1215 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:   45.8s
[Parallel(n_jobs=-1)]: Done 154 tasks      | elapsed:  3.2min
[Parallel(n_jobs=-1)]: Done 357 tasks      | elapsed:  6.1min
[Parallel(n_jobs=-1)]: Done 640 tasks      | elapsed:  9.9min
[Parallel(n_jobs=-1)]: Done 1005 tasks      | elapsed: 15.4min
[Parallel(n_jobs=-1)]: Done 1215 out of 1215 | elapsed: 18.6min finished


0.5407847276289501


{'max_depth': 10,
 'max_features': 'auto',
 'max_samples': 0.5,
 'min_samples_leaf': 16,
 'min_samples_split': 2}

In [58]:
# Evaluate best model
brf_search_val_pred = brf_search.best_estimator_.predict(X_val_scaled[tree_feats[:, 1]])
print(classification_report(y_val, brf_search_val_pred))
pd.DataFrame(confusion_matrix(y_val, brf_search_val_pred), index=['0', '1'], columns=['Predicted 0', 'Predicted 1'])

              precision    recall  f1-score   support

           0       1.00      0.96      0.98     16750
           1       0.06      0.48      0.11        90

    accuracy                           0.96     16840
   macro avg       0.53      0.72      0.55     16840
weighted avg       0.99      0.96      0.97     16840



Unnamed: 0,Predicted 0,Predicted 1
0,16117,633
1,47,43


In [65]:
# Ensemble AdaBoost
eab = EasyEnsembleClassifier(n_estimators=100, random_state=42, n_jobs=-1, verbose=2)
eab.fit(X_train_scaled, y_train)

# Evaluate model
eab_val_pred = eab.predict(X_val_scaled)
print(classification_report(y_val, eab_val_pred))
pd.DataFrame(confusion_matrix(y_val, eab_val_pred), index=['0', '1'], columns=['Predicted 0', 'Predicted 1'])

[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done   4 out of   4 | elapsed:   15.0s finished
[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.


              precision    recall  f1-score   support

           0       1.00      0.88      0.94     16750
           1       0.03      0.74      0.06        90

    accuracy                           0.88     16840
   macro avg       0.52      0.81      0.50     16840
weighted avg       0.99      0.88      0.93     16840



[Parallel(n_jobs=4)]: Done   4 out of   4 | elapsed:   11.5s finished


Unnamed: 0,Predicted 0,Predicted 1
0,14796,1954
1,23,67


### Best model

In [66]:
# Best model
dt = tree_search.best_estimator_

# Train, validation, test sets
train = X_train_scaled[tree_feats[:, 1]]
val = X_val_scaled[tree_feats[:, 1]]
train = X_train_scaled[tree_feats[:, 1]]


              precision    recall  f1-score   support

           0       1.00      1.00      1.00     50248
           1       0.88      0.58      0.70       270

    accuracy                           1.00     50518
   macro avg       0.94      0.79      0.85     50518
weighted avg       1.00      1.00      1.00     50518



Unnamed: 0,Predicted 0,Predicted 1
0,50227,21
1,114,156


In [66]:
# Evaluate on training set
train_pred = dt.predict(X_train_scaled[tree_feats[:, 1]])
print(classification_report(y_train, train_pred))
pd.DataFrame(confusion_matrix(y_train, train_pred), index=['0', '1'], columns=['Predicted 0', 'Predicted 1'])

              precision    recall  f1-score   support

           0       1.00      1.00      1.00     50248
           1       0.88      0.58      0.70       270

    accuracy                           1.00     50518
   macro avg       0.94      0.79      0.85     50518
weighted avg       1.00      1.00      1.00     50518



Unnamed: 0,Predicted 0,Predicted 1
0,50227,21
1,114,156


In [67]:
# Evaluate on validation set
val_pred = dt.predict(X_val_scaled[tree_feats[:, 1]])
print(classification_report(y_val, val_pred))
pd.DataFrame(confusion_matrix(y_val, val_pred), index=['0', '1'], columns=['Predicted 0', 'Predicted 1'])

              precision    recall  f1-score   support

           0       1.00      1.00      1.00     16750
           1       0.77      0.49      0.60        90

    accuracy                           1.00     16840
   macro avg       0.88      0.74      0.80     16840
weighted avg       1.00      1.00      1.00     16840



Unnamed: 0,Predicted 0,Predicted 1
0,16737,13
1,46,44


In [68]:
# Evaluate on test set
test_pred = dt.predict(X_test_scaled[tree_feats[:, 1]])
print(classification_report(y_test, test_pred))
pd.DataFrame(confusion_matrix(y_test, test_pred), index=['0', '1'], columns=['Predicted 0', 'Predicted 1'])

              precision    recall  f1-score   support

           0       1.00      1.00      1.00     16750
           1       0.72      0.51      0.60        90

    accuracy                           1.00     16840
   macro avg       0.86      0.76      0.80     16840
weighted avg       1.00      1.00      1.00     16840



Unnamed: 0,Predicted 0,Predicted 1
0,16732,18
1,44,46
