# Lending Tree Credit Risk

### Dependencies and data

In [1]:
# Dependencies
from pathlib import Path
from collections import Counter
import numpy as np
import pandas as pd
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

from imblearn.under_sampling import RandomUnderSampler, ClusterCentroids
from imblearn.over_sampling import RandomOverSampler, SMOTE
from imblearn.combine import SMOTEENN
from imblearn.ensemble import EasyEnsembleClassifier, BalancedRandomForestClassifier

# print(mpl.style.available)
mpl.style.use('Solarize_Light2')
%matplotlib inline

In [2]:
# Inspect top rows in data
with open(Path('data/loans_1q19.csv')) as f:
    for i in range(5):
        print(f.readline()[:100])

Notes offered by Prospectus (https://www.lendingclub.com/info/prospectus.action)

"id","member_id","loan_amnt","funded_amnt","funded_amnt_inv","term","int_rate","installment","grade"
"","","20000","20000","20000"," 60 months"," 17.19%","499.1","C","C5","Front desk supervisor","6 yea
"","","21225","21225","21225"," 60 months"," 14.74%","502.05","C","C2","ceo","10+ years","MORTGAGE",
"","","5000","5000","5000"," 36 months"," 17.97%","180.69","D","D1","","n/a","MORTGAGE","62000","Not


In [3]:
# Data
df = pd.read_csv(Path('data/loans_1q19.csv'), skiprows=1, low_memory=False)
print(df.shape)
df.head(3)

(115677, 144)


Unnamed: 0,id,member_id,loan_amnt,funded_amnt,funded_amnt_inv,term,int_rate,installment,grade,sub_grade,...,orig_projected_additional_accrued_interest,hardship_payoff_balance_amount,hardship_last_payment_amount,debt_settlement_flag,debt_settlement_flag_date,settlement_status,settlement_date,settlement_amount,settlement_percentage,settlement_term
0,,,20000.0,20000.0,20000.0,60 months,17.19%,499.1,C,C5,...,,,,N,,,,,,
1,,,21225.0,21225.0,21225.0,60 months,14.74%,502.05,C,C2,...,,,,N,,,,,,
2,,,5000.0,5000.0,5000.0,36 months,17.97%,180.69,D,D1,...,,,,N,,,,,,


### Drop unusable data

In [4]:
# Drop rows and columns with more than 10% its values missing
df.dropna(axis=1, thresh=df.shape[0]*0.9, inplace=True) # drop cols
df.dropna(axis=0, thresh=df.shape[1]*0.9, inplace=True) # drop rows
df.shape

(115674, 97)

In [5]:
# Drop constant columns
const_cols = df.nunique()[df.nunique() < 2].index # cols w/ 1 unique val
df.drop(const_cols, axis=1, inplace=True)
df.shape

(115674, 87)

In [6]:
# Drop newly issued loans
df = df[df['loan_status'] != 'Issued']
df.shape

(96839, 87)

### Convert all columns to numeric

In [7]:
# Inspect non-numeric columns
df_num = df.copy() # make a copy
obj_cols = df_num.dtypes[df_num.dtypes == object].index
df[obj_cols].head(3)

Unnamed: 0,term,int_rate,grade,sub_grade,emp_length,home_ownership,verification_status,issue_d,loan_status,purpose,title,zip_code,addr_state,earliest_cr_line,revol_util,initial_list_status,next_pymnt_d,last_credit_pull_d,application_type
80,60 months,13.08%,B,B5,< 1 year,MORTGAGE,Verified,Mar-2019,Fully Paid,debt_consolidation,Debt consolidation,240xx,VA,Jul-2005,66.7%,w,,Apr-2019,Individual
82,36 months,22.50%,D,D3,10+ years,RENT,Not Verified,Mar-2019,Current,debt_consolidation,Debt consolidation,957xx,CA,Aug-2002,95.6%,w,May-2019,Apr-2019,Individual
93,36 months,17.19%,C,C5,9 years,RENT,Source Verified,Mar-2019,Current,debt_consolidation,Debt consolidation,765xx,TX,Apr-2009,43.5%,w,May-2019,Apr-2019,Individual


In [8]:
""" String manipulation """

# Convert `term` to numeric
df_num['term'] = df['term'].str.replace(' months', '').astype(float)

# Convert `int_rate` to numeric
df_num['int_rate'] = df['int_rate'].str.replace('%', '').astype(float)

# Convert `emp_length` to numeric
df_num['emp_length'] = df['emp_length'].str.extract('(\d+)').astype(float)

# Convert `revol_util` to numeric
df_num['revol_util'] = df['revol_util'].str.replace('%', '').astype(float)

df_num[obj_cols].head(3)

Unnamed: 0,term,int_rate,grade,sub_grade,emp_length,home_ownership,verification_status,issue_d,loan_status,purpose,title,zip_code,addr_state,earliest_cr_line,revol_util,initial_list_status,next_pymnt_d,last_credit_pull_d,application_type
80,60.0,13.08,B,B5,1.0,MORTGAGE,Verified,Mar-2019,Fully Paid,debt_consolidation,Debt consolidation,240xx,VA,Jul-2005,66.7,w,,Apr-2019,Individual
82,36.0,22.5,D,D3,10.0,RENT,Not Verified,Mar-2019,Current,debt_consolidation,Debt consolidation,957xx,CA,Aug-2002,95.6,w,May-2019,Apr-2019,Individual
93,36.0,17.19,C,C5,9.0,RENT,Source Verified,Mar-2019,Current,debt_consolidation,Debt consolidation,765xx,TX,Apr-2009,43.5,w,May-2019,Apr-2019,Individual


In [9]:
""" Datetime manipulation """

# Create a new column for `issue_d` as numeric type
df_num['issue_month'] = pd.to_datetime(df['issue_d']).dt.month

# Create a new column for `earliest_cr_line` as numeric type
df_num['earliest_cr_line'] = pd.to_datetime(df['earliest_cr_line']) # convert to dt
youngest_cr = df_num['earliest_cr_line'].max() # latest date in data
df_num['oldest_cr_age'] = (youngest_cr - df_num['earliest_cr_line']).dt.days # oldest credit age

# Create a new column for 'last_credit_pull_d' as numeric type
df_num['last_credit_pull_month'] = pd.to_datetime(df['last_credit_pull_d']).dt.month
df_num['last_credit_pull_month'] = df_num['last_credit_pull_month'].replace(12, 0) # set Dec 2018 as month 0

df_num[obj_cols].head(3)

Unnamed: 0,term,int_rate,grade,sub_grade,emp_length,home_ownership,verification_status,issue_d,loan_status,purpose,title,zip_code,addr_state,earliest_cr_line,revol_util,initial_list_status,next_pymnt_d,last_credit_pull_d,application_type
80,60.0,13.08,B,B5,1.0,MORTGAGE,Verified,Mar-2019,Fully Paid,debt_consolidation,Debt consolidation,240xx,VA,2005-07-01,66.7,w,,Apr-2019,Individual
82,36.0,22.5,D,D3,10.0,RENT,Not Verified,Mar-2019,Current,debt_consolidation,Debt consolidation,957xx,CA,2002-08-01,95.6,w,May-2019,Apr-2019,Individual
93,36.0,17.19,C,C5,9.0,RENT,Source Verified,Mar-2019,Current,debt_consolidation,Debt consolidation,765xx,TX,2009-04-01,43.5,w,May-2019,Apr-2019,Individual


In [10]:
""" Numeric mapping """

# Convert `grade` to numeric
grade_mapping = {'A': 1, 'B': 2, 'C': 3, 'D': 4, 'E': 5, 'F': 6, 'G': 7} # num mapping
df_num['grade'] = df['grade'].map(grade_mapping).astype(float)

# Convert `home_ownership` to numeric
home_mapping = dict.fromkeys(['RENT', 'ANY', 'NONE'], 0) # num mapping
home_mapping.update(dict.fromkeys(['MORTGAGE', 'OWN'], 1)) # add 1 label
df_num['home_ownership'] = df['home_ownership'].map(home_mapping).astype(float)

# Convert `verification_state` to numeric
df_num['verification_status'] = df['verification_status'].str.replace('Source ', '') # combine verified labels
veri_mapping = {'Not Verified': 0, 'Verified': 1} # num mapping
df_num['verification_status'] = df['verification_status'].map(veri_mapping).astype(float)

# Create a new column for `initial_list_status` as numeric type
init_mapping = {'f': 0, 'w': 1} # num mapping
df_num['whole_loan'] = df['initial_list_status'].map(init_mapping).astype(float)

# Create a new column for `application_type` as numeric type
app_mapping = {'Individual': 0, 'Joint App': 1} # num mapping
df_num['joint_app'] = df['application_type'].map(app_mapping).astype(float)

# Create a new column for `loan_status` as numeric type
stat_mapping = dict.fromkeys(['Charged Off', 'In Grace Period', # num mapping
                              'Late (16-30 days)', 'Late (31-120 days)'], 1) # high risk
stat_mapping.update(dict.fromkeys(['Fully Paid', 'Current'], 0)) # low risk
df_num['high_risk'] = df['loan_status'].map(stat_mapping).astype(float)

df_num[obj_cols].head(3)

Unnamed: 0,term,int_rate,grade,sub_grade,emp_length,home_ownership,verification_status,issue_d,loan_status,purpose,title,zip_code,addr_state,earliest_cr_line,revol_util,initial_list_status,next_pymnt_d,last_credit_pull_d,application_type
80,60.0,13.08,2.0,B5,1.0,1.0,1.0,Mar-2019,Fully Paid,debt_consolidation,Debt consolidation,240xx,VA,2005-07-01,66.7,w,,Apr-2019,Individual
82,36.0,22.5,4.0,D3,10.0,0.0,0.0,Mar-2019,Current,debt_consolidation,Debt consolidation,957xx,CA,2002-08-01,95.6,w,May-2019,Apr-2019,Individual
93,36.0,17.19,3.0,C5,9.0,0.0,,Mar-2019,Current,debt_consolidation,Debt consolidation,765xx,TX,2009-04-01,43.5,w,May-2019,Apr-2019,Individual


In [11]:
""" One-hot encoding """

# Group labels into 3 categories
df_num['purpose'] = df['purpose'].replace(['debt_consolidation', 'credit_card', 'medical'], 'debt') \
                                 .replace(['home_improvement', 'car', 'house', 'vacation'], 'major_purchase') \
                                 .replace(['small_business', 'moving', 'renewable_energy', 'other'], '_other')

# One-hot encode `purpose` and drop the last label
df_num = pd.get_dummies(df_num, columns=['purpose'], drop_first=True)
df_num.head(3)

Unnamed: 0,loan_amnt,funded_amnt,funded_amnt_inv,term,int_rate,installment,grade,sub_grade,emp_length,home_ownership,...,total_bc_limit,total_il_high_credit_limit,issue_month,oldest_cr_age,last_credit_pull_month,whole_loan,joint_app,high_risk,purpose_debt,purpose_major_purchase
80,35000.0,35000.0,35000.0,60.0,13.08,797.8,2.0,B5,1.0,1.0,...,32700.0,130876.0,3,3867,4.0,1.0,0.0,0.0,1,0
82,20000.0,20000.0,20000.0,36.0,22.5,769.0,4.0,D3,10.0,0.0,...,47000.0,30797.0,3,4932,4.0,1.0,0.0,0.0,1,0
93,10500.0,10500.0,10500.0,36.0,17.19,375.35,3.0,C5,9.0,0.0,...,2000.0,61987.0,3,2497,4.0,1.0,0.0,0.0,1,0


### Additional cleaning

In [12]:
# Drop redundant cols
cols_to_drop = ['title', 'sub_grade', 'zip_code', 'issue_d', 'loan_status', 
                'earliest_cr_line', 'addr_state', 'verification_status', 'next_pymnt_d', 
                'last_credit_pull_d', 'initial_list_status', 'application_type']
df_num.drop(cols_to_drop, axis=1, inplace=True)
df_num.shape

(96839, 82)

In [13]:
# Drop rows with missing values
df_num.dropna(inplace=True)
df_num.shape

(84198, 82)

In [14]:
# Find highly correlated columns
cor_cols = []
for i in range(df_num.shape[1] - 1):
    for j in range(i + 1, df_num.shape[1]):
        col1 = df_num.iloc[:, i]
        col2 = df_num.iloc[:, j]
        cor = col1.corr(col2)
        if abs(cor) > 0.7:
            print(col1.name, col2.name, cor)
            if col2.name not in cor_cols:
                cor_cols.append(col2.name)
                
cor_cols

loan_amnt funded_amnt 1.0
loan_amnt funded_amnt_inv 0.999997083401125
loan_amnt installment 0.9361457315663073
loan_amnt out_prncp 0.9628116324949524
loan_amnt out_prncp_inv 0.9628298134887975
funded_amnt funded_amnt_inv 0.999997083401125
funded_amnt installment 0.9361457315663073
funded_amnt out_prncp 0.9628116324949524
funded_amnt out_prncp_inv 0.9628298134887975
funded_amnt_inv installment 0.936136708511857
funded_amnt_inv out_prncp 0.9628082995548731
funded_amnt_inv out_prncp_inv 0.9628319067457652
int_rate grade 0.9644165592939707
installment out_prncp 0.8917068406731731
installment out_prncp_inv 0.8917195745051264
open_acc total_acc 0.739408034833447
open_acc num_op_rev_tl 0.8542594490286857
open_acc num_rev_accts 0.7183869384158287
open_acc num_sats 0.9996842051086133
pub_rec pub_rec_bankruptcies 0.9997149746017331
revol_bal total_rev_hi_lim 0.7670091267372624
revol_util bc_util 0.89864728747148
revol_util percent_bc_gt_75 0.7620369638540233
total_acc num_il_tl 0.727778251003140

['funded_amnt',
 'funded_amnt_inv',
 'installment',
 'out_prncp',
 'out_prncp_inv',
 'grade',
 'total_acc',
 'num_op_rev_tl',
 'num_rev_accts',
 'num_sats',
 'pub_rec_bankruptcies',
 'total_rev_hi_lim',
 'bc_util',
 'percent_bc_gt_75',
 'num_il_tl',
 'total_pymnt_inv',
 'total_rec_prncp',
 'last_pymnt_amnt',
 'avg_cur_bal',
 'tot_hi_cred_lim',
 'num_tl_op_past_12m',
 'open_il_24m',
 'total_bal_ex_mort',
 'total_il_high_credit_limit',
 'open_rv_24m',
 'acc_open_past_24mths',
 'bc_open_to_buy',
 'total_bc_limit',
 'oldest_cr_age',
 'num_actv_rev_tl',
 'num_bc_sats',
 'num_rev_tl_bal_gt_0',
 'num_bc_tl',
 'purpose_major_purchase']

In [15]:
# Drop highly correlated columns, keeping only 1 column per correlation
df_num.drop(cor_cols, axis=1, inplace=True)
df_num.shape

(84198, 48)

In [16]:
# Find integer columns
df_int = df_num % 1
int_cols = df_int.nunique()[df_int.nunique() < 2].index

# Convert above columns to integer type
for col in int_cols:
    df_num[col] = df_num[col].astype(int)
    
df_num.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 84198 entries, 80 to 115674
Data columns (total 48 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   loan_amnt                   84198 non-null  int64  
 1   term                        84198 non-null  int64  
 2   int_rate                    84198 non-null  float64
 3   emp_length                  84198 non-null  int64  
 4   home_ownership              84198 non-null  int64  
 5   annual_inc                  84198 non-null  float64
 6   dti                         84198 non-null  float64
 7   delinq_2yrs                 84198 non-null  int64  
 8   inq_last_6mths              84198 non-null  int64  
 9   open_acc                    84198 non-null  int64  
 10  pub_rec                     84198 non-null  int64  
 11  revol_bal                   84198 non-null  int64  
 12  revol_util                  84198 non-null  float64
 13  total_pymnt                 8

In [17]:
# Reset index
df_num.reset_index(drop=True, inplace=True)
df_num.head(3)

Unnamed: 0,loan_amnt,term,int_rate,emp_length,home_ownership,annual_inc,dti,delinq_2yrs,inq_last_6mths,open_acc,...,num_accts_ever_120_pd,num_actv_bc_tl,num_tl_90g_dpd_24m,pct_tl_nvr_dlq,issue_month,last_credit_pull_month,whole_loan,joint_app,high_risk,purpose_debt
0,35000,60,13.08,1,1,125890.0,30.48,0,0,14,...,0,5,0,100.0,3,4,1,0,0,1
1,20000,36,22.5,10,0,75000.0,24.37,0,0,8,...,0,6,0,100.0,3,4,1,0,0,1
2,10500,36,17.19,9,0,66000.0,27.24,0,0,8,...,0,1,0,85.7,3,4,1,0,0,1


### Data preprocessing

In [18]:
# Count target labels
df_num['high_risk'].value_counts()

0    83748
1      450
Name: high_risk, dtype: int64

In [19]:
# Feature/target split
X = df_num.drop('high_risk', axis=1).copy()
y = df_num['high_risk'].copy()

# Train/validation/test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, stratify=y, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_test, y_test, test_size=0.5, stratify=y_test, random_state=42)
X_train.shape, y_train.shape, X_val.shape, y_val.shape, X_test.shape, y_test.shape

((50518, 47), (50518,), (16840, 47), (16840,), (16840, 47), (16840,))

In [20]:
# Count target labels in each set
print(y_train.value_counts())
print(y_val.value_counts())
print(y_test.value_counts())

0    50248
1      270
Name: high_risk, dtype: int64
0    16750
1       90
Name: high_risk, dtype: int64
0    16750
1       90
Name: high_risk, dtype: int64


In [21]:
# Scale data
scaler = StandardScaler()
X_train_scaled = pd.DataFrame(scaler.fit_transform(X_train), columns=X_train.columns)
X_val_scaled = pd.DataFrame(scaler.transform(X_val), columns=X_val.columns)
X_test_scaled = pd.DataFrame(scaler.transform(X_test), columns=X_test.columns)
X_train_scaled.head(3)

Unnamed: 0,loan_amnt,term,int_rate,emp_length,home_ownership,annual_inc,dti,delinq_2yrs,inq_last_6mths,open_acc,...,mths_since_recent_bc,num_accts_ever_120_pd,num_actv_bc_tl,num_tl_90g_dpd_24m,pct_tl_nvr_dlq,issue_month,last_credit_pull_month,whole_loan,joint_app,purpose_debt
0,-0.972387,-0.703006,0.732672,-1.251658,-1.268834,-0.154803,-0.055052,-0.309825,-0.607667,-0.013543,...,0.406336,-0.317836,0.477172,-0.137288,0.589169,-0.975984,0.147108,-2.695437,-0.391972,0.431996
1,0.370574,1.422463,-0.044268,-1.251658,-1.268834,0.539986,-0.769578,-0.309825,-0.607667,0.657547,...,-0.638417,-0.317836,0.072094,-0.137288,0.589169,0.371104,0.147108,-2.695437,2.5512,-2.314836
2,-0.300906,1.422463,0.07622,-0.708172,-1.268834,-0.306232,0.666918,-0.309825,-0.607667,0.657547,...,-0.730602,-0.317836,0.072094,-0.137288,0.589169,0.371104,0.147108,-2.695437,-0.391972,0.431996


### Baseline machine learning

In [22]:
# Logistic regression
lr = LogisticRegression(random_state=42)
lr.fit(X_train_scaled, y_train)

# Evaluate model
lr_val_pred = lr.predict(X_val)
print(classification_report(y_val, lr_val_pred))
print('Accuracy:', accuracy_score(y_val, lr_val_pred))
pd.DataFrame(confusion_matrix(y_val, lr_val_pred), index=['0', '1'], columns=['Predicted 0', 'Predicted 1'])

              precision    recall  f1-score   support

           0       0.99      0.21      0.35     16750
           1       0.01      0.74      0.01        90

    accuracy                           0.22     16840
   macro avg       0.50      0.48      0.18     16840
weighted avg       0.99      0.22      0.35     16840

Accuracy: 0.21561757719714963


Unnamed: 0,Predicted 0,Predicted 1
0,3564,13186
1,23,67


In [23]:
# Decision tree
tree = DecisionTreeClassifier(random_state=42)
tree.fit(X_train_scaled, y_train)

# Evaluate model
tree_val_pred = tree.predict(X_val)
print(classification_report(y_val, tree_val_pred))
print('Accuracy:', accuracy_score(y_val, tree_val_pred))
pd.DataFrame(confusion_matrix(y_val, tree_val_pred), index=['0', '1'], columns=['Predicted 0', 'Predicted 1'])

              precision    recall  f1-score   support

           0       0.99      1.00      1.00     16750
           1       0.15      0.04      0.07        90

    accuracy                           0.99     16840
   macro avg       0.57      0.52      0.53     16840
weighted avg       0.99      0.99      0.99     16840

Accuracy: 0.9935273159144893


Unnamed: 0,Predicted 0,Predicted 1
0,16727,23
1,86,4


### Undersampling

In [25]:
# Random undersampling
rus = RandomUnderSampler(random_state=42)
X_rus, y_rus = rus.fit_resample(X_train_scaled, y_train)
Counter(y_rus)

Counter({0: 270, 1: 270})

In [26]:
# Logistic regression
lr_rus = LogisticRegression(random_state=42)
lr_rus.fit(X_rus, y_rus)

# Evaluate model
lr_rus_val_pred = lr_rus.predict(X_val)
print(classification_report(y_val, lr_rus_val_pred))
print('Accuracy:', accuracy_score(y_val, lr_rus_val_pred))
pd.DataFrame(confusion_matrix(y_val, lr_rus_val_pred), index=['0', '1'], columns=['Predicted 0', 'Predicted 1'])

              precision    recall  f1-score   support

           0       1.00      0.41      0.58     16750
           1       0.01      0.63      0.01        90

    accuracy                           0.42     16840
   macro avg       0.50      0.52      0.30     16840
weighted avg       0.99      0.42      0.58     16840

Accuracy: 0.41502375296912114


Unnamed: 0,Predicted 0,Predicted 1
0,6932,9818
1,33,57


In [27]:
# Decision tree
tree_rus = DecisionTreeClassifier(random_state=42)
tree_rus.fit(X_rus, y_rus)

# Evaluate model
tree_rus_val_pred = tree_rus.predict(X_val)
print(classification_report(y_val, tree_rus_val_pred))
print('Accuracy:', accuracy_score(y_val, tree_rus_val_pred))
pd.DataFrame(confusion_matrix(y_val, tree_rus_val_pred), index=['0', '1'], columns=['Predicted 0', 'Predicted 1'])

              precision    recall  f1-score   support

           0       1.00      1.00      1.00     16750
           1       0.41      0.14      0.21        90

    accuracy                           0.99     16840
   macro avg       0.70      0.57      0.61     16840
weighted avg       0.99      0.99      0.99     16840

Accuracy: 0.9942992874109263


Unnamed: 0,Predicted 0,Predicted 1
0,16731,19
1,77,13


In [30]:
# Cluster centroid undersampling
ccu = ClusterCentroids(random_state=42)
X_ccu, y_ccu = ccu.fit_resample(X_train_scaled, y_train)
Counter(y_ccu)

Counter({0: 270, 1: 270})

In [32]:
# Logistic regression
lr_ccu = LogisticRegression(random_state=42)
lr_ccu.fit(X_ccu, y_ccu)

# Evaluate model
lr_ccu_val_pred = lr_ccu.predict(X_val)
print(classification_report(y_val, lr_ccu_val_pred))
print('Accuracy:', accuracy_score(y_val, lr_ccu_val_pred))
pd.DataFrame(confusion_matrix(y_val, lr_ccu_val_pred), index=['0', '1'], columns=['Predicted 0', 'Predicted 1'])

              precision    recall  f1-score   support

           0       0.99      0.43      0.60     16750
           1       0.01      0.54      0.01        90

    accuracy                           0.43     16840
   macro avg       0.50      0.49      0.31     16840
weighted avg       0.99      0.43      0.60     16840

Accuracy: 0.4318883610451306


Unnamed: 0,Predicted 0,Predicted 1
0,7224,9526
1,41,49


In [33]:
# Decision tree
tree_ccu = DecisionTreeClassifier(random_state=42)
tree_ccu.fit(X_ccu, y_ccu)

# Evaluate model
tree_ccu_val_pred = tree_ccu.predict(X_val)
print(classification_report(y_val, tree_ccu_val_pred))
print('Accuracy:', accuracy_score(y_val, tree_ccu_val_pred))
pd.DataFrame(confusion_matrix(y_val, tree_ccu_val_pred), index=['0', '1'], columns=['Predicted 0', 'Predicted 1'])

              precision    recall  f1-score   support

           0       0.99      1.00      1.00     16750
           1       0.00      0.00      0.00        90

    accuracy                           0.99     16840
   macro avg       0.50      0.50      0.50     16840
weighted avg       0.99      0.99      0.99     16840

Accuracy: 0.9946555819477435


  _warn_prf(average, modifier, msg_start, len(result))


Unnamed: 0,Predicted 0,Predicted 1
0,16750,0
1,90,0


### Oversampling

In [34]:
# Random oversampling
ros = RandomOverSampler(random_state=42)
X_ros, y_ros = ros.fit_resample(X_train_scaled, y_train)
Counter(y_ros)

Counter({0: 50248, 1: 50248})

In [35]:
# Logistic regression
lr_ros = LogisticRegression(random_state=42)
lr_ros.fit(X_ros, y_ros)

# Evaluate model
lr_ros_val_pred = lr_ros.predict(X_val)
print(classification_report(y_val, lr_ros_val_pred))
print('Accuracy:', accuracy_score(y_val, lr_ros_val_pred))
pd.DataFrame(confusion_matrix(y_val, lr_ros_val_pred), index=['0', '1'], columns=['Predicted 0', 'Predicted 1'])

              precision    recall  f1-score   support

           0       0.99      0.15      0.26     16750
           1       0.01      0.82      0.01        90

    accuracy                           0.15     16840
   macro avg       0.50      0.49      0.14     16840
weighted avg       0.99      0.15      0.26     16840

Accuracy: 0.15397862232779097


Unnamed: 0,Predicted 0,Predicted 1
0,2519,14231
1,16,74


In [36]:
# Decision tree
tree_ros = DecisionTreeClassifier(random_state=42)
tree_ros.fit(X_ros, y_ros)

# Evaluate model
tree_ros_val_pred = tree_ros.predict(X_val)
print(classification_report(y_val, tree_ros_val_pred))
print('Accuracy:', accuracy_score(y_val, tree_ros_val_pred))
pd.DataFrame(confusion_matrix(y_val, tree_ros_val_pred), index=['0', '1'], columns=['Predicted 0', 'Predicted 1'])

              precision    recall  f1-score   support

           0       0.99      1.00      1.00     16750
           1       0.00      0.00      0.00        90

    accuracy                           0.99     16840
   macro avg       0.50      0.50      0.50     16840
weighted avg       0.99      0.99      0.99     16840

Accuracy: 0.9946555819477435


  _warn_prf(average, modifier, msg_start, len(result))


Unnamed: 0,Predicted 0,Predicted 1
0,16750,0
1,90,0


In [37]:
# SMOTE
smo = SMOTE(random_state=42)
X_smo, y_smo = smo.fit_resample(X_train_scaled, y_train)
Counter(y_smo)

Counter({0: 50248, 1: 50248})

In [38]:
# Logistic regression
lr_smo = LogisticRegression(random_state=42)
lr_smo.fit(X_smo, y_smo)

# Evaluate model
lr_smo_val_pred = lr_smo.predict(X_val)
print(classification_report(y_val, lr_smo_val_pred))
print('Accuracy:', accuracy_score(y_val, lr_smo_val_pred))
pd.DataFrame(confusion_matrix(y_val, lr_smo_val_pred), index=['0', '1'], columns=['Predicted 0', 'Predicted 1'])

              precision    recall  f1-score   support

           0       0.99      0.19      0.32     16750
           1       0.01      0.80      0.01        90

    accuracy                           0.19     16840
   macro avg       0.50      0.49      0.16     16840
weighted avg       0.99      0.19      0.32     16840

Accuracy: 0.19299287410926366


Unnamed: 0,Predicted 0,Predicted 1
0,3178,13572
1,18,72


In [39]:
# Decision tree
tree_smo = DecisionTreeClassifier(random_state=42)
tree_smo.fit(X_smo, y_smo)

# Evaluate model
tree_smo_val_pred = tree_smo.predict(X_val)
print(classification_report(y_val, tree_smo_val_pred))
print('Accuracy:', accuracy_score(y_val, tree_smo_val_pred))
pd.DataFrame(confusion_matrix(y_val, tree_smo_val_pred), index=['0', '1'], columns=['Predicted 0', 'Predicted 1'])

              precision    recall  f1-score   support

           0       1.00      0.55      0.71     16750
           1       0.01      0.59      0.01        90

    accuracy                           0.55     16840
   macro avg       0.50      0.57      0.36     16840
weighted avg       0.99      0.55      0.71     16840

Accuracy: 0.5504750593824228


Unnamed: 0,Predicted 0,Predicted 1
0,9217,7533
1,37,53


### Combination sampling

In [40]:
# SMOTEENN
sen = SMOTEENN(random_state=42)
X_sen, y_sen = sen.fit_resample(X_train_scaled, y_train)
Counter(y_sen)

Counter({0: 46836, 1: 50248})

In [41]:
# Logistic regression
lr_sen = LogisticRegression(random_state=42)
lr_sen.fit(X_sen, y_sen)

# Evaluate model
lr_sen_val_pred = lr_sen.predict(X_val)
print(classification_report(y_val, lr_sen_val_pred))
print('Accuracy:', accuracy_score(y_val, lr_sen_val_pred))
pd.DataFrame(confusion_matrix(y_val, lr_sen_val_pred), index=['0', '1'], columns=['Predicted 0', 'Predicted 1'])

              precision    recall  f1-score   support

           0       0.99      0.21      0.35     16750
           1       0.01      0.77      0.01        90

    accuracy                           0.21     16840
   macro avg       0.50      0.49      0.18     16840
weighted avg       0.99      0.21      0.34     16840

Accuracy: 0.21282660332541567


Unnamed: 0,Predicted 0,Predicted 1
0,3515,13235
1,21,69


In [42]:
# Decision tree
tree_sen = DecisionTreeClassifier(random_state=42)
tree_sen.fit(X_sen, y_sen)

# Evaluate model
tree_sen_val_pred = tree_sen.predict(X_val)
print(classification_report(y_val, tree_sen_val_pred))
print('Accuracy:', accuracy_score(y_val, tree_sen_val_pred))
pd.DataFrame(confusion_matrix(y_val, tree_sen_val_pred), index=['0', '1'], columns=['Predicted 0', 'Predicted 1'])

              precision    recall  f1-score   support

           0       1.00      0.55      0.71     16750
           1       0.01      0.59      0.01        90

    accuracy                           0.55     16840
   macro avg       0.50      0.57      0.36     16840
weighted avg       0.99      0.55      0.71     16840

Accuracy: 0.5504750593824228


Unnamed: 0,Predicted 0,Predicted 1
0,9217,7533
1,37,53


### Sampling comparison

In [49]:
def clf_report(y_true, y_pred):
    
    """
    Using the confusion matrix and classification report, create a custom classification report 
    with the following values: true positives, false negatives, false positives, true negatives, 
    F1 score for positive class, F1 score for negative class, accuracy. In this case, the positive 
    class is the 0 label (low risk) and the negative class is the 1 label (high risk).
    
    Parameters
    ----------
    y_true : list-like
        True target labels
    y_pred : list-like
        Predicted target labels
    
    Returns
    -------
    Dict
        Custom classification report with the true positives, false negatives, false positives, 
        true negatives, F1 score for positive class, F1 score for negative class, accuracy
    """
    
    # Confusion matrix and classification report
    confusion_mat = confusion_matrix(y_true, y_pred)
    clf_rep = classification_report(y_true, y_pred, output_dict=True)
    
    # Report values: true 0, false 1, false 0, true 1, F1 for 0, F1 for 1, accuracy
    report = confusion_mat.ravel().tolist()
    report.extend([clf_rep['0']['f1-score'], clf_rep['1']['f1-score'], clf_rep['accuracy']])
    
    # Add report keys
    keys = ['true_pos', 'false_neg', 'false_pos', 'true_neg', 'f1_pos', 'f1_neg', 'accuracy']
    report = dict(zip(keys, report))
    return report
    

# Test function
clf_report(y_val, lr_val_pred)

{'true_pos': 3564,
 'false_neg': 13186,
 'false_pos': 23,
 'true_neg': 67,
 'f1_pos': 0.35049417318188525,
 'f1_neg': 0.010042719028704189,
 'accuracy': 0.21561757719714963}

In [52]:
# Logistic regression comparison
trials = ['base', 'rand_undersamp', 'centroid_undersamp', 'rand_oversamp', 'smote', 'smoteenn']
lr_preds = [lr_val_pred, lr_rus_val_pred, lr_ccu_val_pred, lr_ros_val_pred, lr_smo_val_pred, lr_sen_val_pred]
lr_reports = [clf_report(y_val, y_pred) for y_pred in lr_preds]
lr_reports_df = pd.DataFrame(lr_reports, index=trials)
lr_reports_df

Unnamed: 0,true_pos,false_neg,false_pos,true_neg,f1_pos,f1_neg,accuracy
base,3564,13186,23,67,0.350494,0.010043,0.215618
rand_undersamp,6932,9818,33,57,0.584609,0.01144,0.415024
centroid_undersamp,7224,9526,41,49,0.601624,0.01014,0.431888
rand_oversamp,2519,14231,16,74,0.261239,0.010281,0.153979
smote,3178,13572,18,72,0.31866,0.010485,0.192993
smoteenn,3515,13235,21,69,0.346544,0.010303,0.212827


In [53]:
# Decision tree comparison
tree_preds = [tree_val_pred, tree_rus_val_pred, tree_ccu_val_pred, tree_ros_val_pred, tree_smo_val_pred, tree_sen_val_pred]
tree_reports = [clf_report(y_val, y_pred) for y_pred in tree_preds]
tree_reports_df = pd.DataFrame(tree_reports, index=trials)
tree_reports_df

  _warn_prf(average, modifier, msg_start, len(result))


Unnamed: 0,true_pos,false_neg,false_pos,true_neg,f1_pos,f1_neg,accuracy
base,16727,23,86,4,0.996752,0.068376,0.993527
rand_undersamp,16731,19,77,13,0.997139,0.213115,0.994299
centroid_undersamp,16750,0,90,0,0.997321,0.0,0.994656
rand_oversamp,16750,0,90,0,0.997321,0.0,0.994656
smote,9217,7533,37,53,0.708891,0.013809,0.550475
smoteenn,9217,7533,37,53,0.708891,0.013809,0.550475
