# Classification Algorithms and Resampling Methods Part 2

In [1]:
import numpy as np
import pandas as pd
import pickle
import datetime

import plotly.express as px
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style('darkgrid')

from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import RandomOverSampler, SMOTE

from sklearn.model_selection import cross_val_score, cross_val_predict
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

import warnings
warnings.filterwarnings(action = 'ignore')
import gc

In [2]:
# Load the pickled modelling frame into `df`.
# NOTE(review): pickle.load can execute arbitrary code while deserialising, so
# only point this at a file you produced yourself (presumably the output of
# Part 1 - confirm).
with open('lending_club_modeling_pick.pkl', mode='rb') as fh:
    df = pickle.load(fh)

In [3]:
# Display settings: never truncate wide frames, and render floats with five
# decimal places instead of scientific notation.
# (To restore the default float rendering: pd.reset_option('display.float_format').)
pd.set_option('display.max_columns', None)
pd.set_option('display.float_format', '{:.5f}'.format)

In [4]:
# Preview the first five rows (the DataFrame.head default) to sanity-check the load.
df.head()

Unnamed: 0,loan_amnt,funded_amnt,funded_amnt_inv,term,int_rate,installment,grade,sub_grade,emp_length,home_ownership,annual_inc,verification_status,issue_d,pymnt_plan,purpose,addr_state,dti,delinq_2yrs,earliest_cr_line,fico_range_low,fico_range_high,inq_last_6mths,mths_since_last_delinq,mths_since_last_record,open_acc,pub_rec,revol_bal,revol_util,total_acc,initial_list_status,out_prncp,out_prncp_inv,total_pymnt,total_pymnt_inv,total_rec_prncp,total_rec_int,total_rec_late_fee,recoveries,collection_recovery_fee,last_pymnt_d,last_pymnt_amnt,next_pymnt_d,last_credit_pull_d,last_fico_range_high,last_fico_range_low,collections_12_mths_ex_med,mths_since_last_major_derog,application_type,acc_now_delinq,tot_coll_amt,tot_cur_bal,open_acc_6m,open_act_il,open_il_12m,open_il_24m,mths_since_rcnt_il,total_bal_il,il_util,open_rv_12m,open_rv_24m,max_bal_bc,all_util,total_rev_hi_lim,inq_fi,total_cu_tl,inq_last_12m,acc_open_past_24mths,avg_cur_bal,bc_open_to_buy,bc_util,chargeoff_within_12_mths,delinq_amnt,mo_sin_old_il_acct,mo_sin_old_rev_tl_op,mo_sin_rcnt_rev_tl_op,mo_sin_rcnt_tl,mort_acc,mths_since_recent_bc,mths_since_recent_bc_dlq,mths_since_recent_inq,mths_since_recent_revol_delinq,num_accts_ever_120_pd,num_actv_bc_tl,num_actv_rev_tl,num_bc_sats,num_bc_tl,num_il_tl,num_op_rev_tl,num_rev_accts,num_rev_tl_bal_gt_0,num_sats,num_tl_120dpd_2m,num_tl_30dpd,num_tl_90g_dpd_24m,num_tl_op_past_12m,pct_tl_nvr_dlq,percent_bc_gt_75,pub_rec_bankruptcies,tax_liens,tot_hi_cred_lim,total_bal_ex_mort,total_bc_limit,total_il_high_credit_limit,hardship_flag,disbursement_method,debt_settlement_flag,TARGET
0,3600.0,3600.0,3600.0,36,13.99,123.03,C,C4,10,MORTGAGE,55000.0,Not Verified,16770.0,n,debt_consolidation,PA,5.91,0.0,12265.0,675.0,679.0,1.0,30.0,72.31284,7.0,0.0,2765.0,29.7,13.0,w,0.0,0.0,4421.72392,4421.72,3600.0,821.72,0.0,0.0,0.0,17897.0,122.67,18017.0,17956.0,564.0,560.0,0.0,30.0,Individual,0.0,722.0,144904.0,2.0,2.0,0.0,1.0,21.0,4981.0,36.0,3.0,3.0,722.0,34.0,9300.0,3.0,1.0,4.0,4.0,20701.0,1506.0,37.2,0.0,0.0,148.0,128.0,3.0,3.0,1.0,4.0,69.0,4.0,69.0,2.0,2.0,4.0,2.0,5.0,3.0,4.0,9.0,4.0,7.0,0.0,0.0,0.0,3.0,76.9,0.0,0.0,0.0,178050.0,7746.0,2400.0,13734.0,N,Cash,N,0
1,24700.0,24700.0,24700.0,36,11.99,820.28,C,C1,10,MORTGAGE,65000.0,Not Verified,16770.0,n,small_business,SD,16.06,1.0,10926.0,715.0,719.0,4.0,6.0,72.31284,22.0,0.0,21470.0,19.2,38.0,w,0.0,0.0,25679.66,25679.66,24700.0,979.66,0.0,0.0,0.0,16953.0,926.35,18017.0,17956.0,699.0,695.0,0.0,44.16422,Individual,0.0,0.0,204396.0,1.0,1.0,0.0,1.0,19.0,18005.0,73.0,2.0,3.0,6472.0,29.0,111800.0,0.0,0.0,6.0,4.0,9733.0,57830.0,27.1,0.0,0.0,113.0,192.0,2.0,2.0,4.0,2.0,37.0,0.0,6.0,0.0,5.0,5.0,13.0,17.0,6.0,20.0,27.0,5.0,22.0,0.0,0.0,0.0,2.0,97.4,7.7,0.0,0.0,314017.0,39475.0,79300.0,24667.0,N,Cash,N,0
2,20000.0,20000.0,20000.0,60,10.78,432.66,B,B4,10,MORTGAGE,63000.0,Not Verified,16770.0,n,home_improvement,IL,10.78,0.0,11170.0,695.0,699.0,0.0,34.54092,72.31284,6.0,0.0,7869.0,56.2,18.0,w,0.0,0.0,22705.92429,22705.92,20000.0,2705.92,0.0,0.0,0.0,17318.0,15813.3,18017.0,17956.0,704.0,700.0,0.0,44.16422,Joint App,0.0,0.0,189699.0,0.0,1.0,0.0,4.0,19.0,10827.0,73.0,0.0,2.0,2081.0,65.0,14000.0,2.0,5.0,1.0,6.0,31617.0,2737.0,55.9,0.0,0.0,125.0,184.0,14.0,14.0,5.0,101.0,37.0,10.0,0.0,0.0,2.0,3.0,2.0,4.0,6.0,4.0,7.0,3.0,6.0,0.0,0.0,0.0,0.0,100.0,50.0,0.0,0.0,218418.0,18696.0,6200.0,14877.0,N,Cash,N,0
3,35000.0,35000.0,35000.0,60,14.85,829.9,C,C5,10,MORTGAGE,110000.0,Source Verified,16770.0,n,debt_consolidation,NJ,17.06,0.0,14123.0,785.0,789.0,0.0,34.54092,72.31284,13.0,0.0,7802.0,11.6,17.0,w,15897.65,15897.65,31464.01,31464.01,19102.35,12361.66,0.0,0.0,0.0,17928.0,829.9,17987.0,17956.0,679.0,675.0,0.0,44.16422,Individual,0.0,0.0,301500.0,1.0,1.0,0.0,1.0,23.0,12609.0,70.0,1.0,1.0,6987.0,45.0,67300.0,0.0,1.0,0.0,2.0,23192.0,54962.0,12.1,0.0,0.0,36.0,87.0,2.0,2.0,1.0,2.0,37.0,0.0,0.0,0.0,4.0,5.0,8.0,10.0,2.0,10.0,13.0,5.0,13.0,0.0,0.0,0.0,1.0,100.0,0.0,0.0,0.0,381215.0,52226.0,62500.0,18000.0,N,Cash,N,0
4,10400.0,10400.0,10400.0,60,22.45,289.91,F,F1,3,MORTGAGE,104433.0,Source Verified,16770.0,n,major_purchase,PA,25.37,1.0,10378.0,695.0,699.0,3.0,12.0,72.31284,12.0,0.0,21929.0,64.5,35.0,w,0.0,0.0,11740.5,11740.5,10400.0,1340.5,0.0,0.0,0.0,16983.0,10128.96,18017.0,17591.0,704.0,700.0,0.0,44.16422,Individual,0.0,0.0,331730.0,1.0,3.0,0.0,3.0,14.0,73839.0,84.0,4.0,7.0,9702.0,78.0,34000.0,2.0,1.0,3.0,10.0,27644.0,4567.0,77.5,0.0,0.0,128.0,210.0,4.0,4.0,6.0,4.0,12.0,1.0,12.0,0.0,4.0,6.0,5.0,9.0,10.0,7.0,19.0,6.0,12.0,0.0,0.0,0.0,4.0,96.6,60.0,0.0,0.0,439570.0,95768.0,20300.0,88097.0,N,Cash,N,0


# Conducting Train Test Split

In [4]:
def dummy_encoding(df):
    """One-hot encode a DataFrame with ``pd.get_dummies`` and its default settings.

    pandas decides which columns to expand (its defaults are used unchanged);
    all remaining columns are passed through as-is.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to encode.

    Returns
    -------
    pandas.DataFrame
        The result of ``pd.get_dummies(df)``.
    """
    return pd.get_dummies(df)

In [5]:
def preprocess_inputs(df, target_col='TARGET', train_size=0.8, random_state=1):
    """One-hot encode the features, make a stratified split, and standardise.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the feature columns plus the label column. It is not
        modified by this function.
    target_col : str, default 'TARGET'
        Name of the label column.
    train_size : float, default 0.8
        Fraction of rows assigned to the training split.
    random_state : int, default 1
        Seed passed to ``train_test_split`` so the split is reproducible.

    Returns
    -------
    X_train, X_test
        Feature splits as returned by ``StandardScaler.transform`` (the scaler
        is fitted on the training split only).
    y_train, y_test
        The matching label splits.

    Raises
    ------
    KeyError
        If ``target_col`` is not a column of ``df``.
    """
    # Fail fast, before the expensive encoding step, if the label is missing.
    if target_col not in df.columns:
        raise KeyError(f"target column {target_col!r} not found in df")

    # Split the label off first so the encoder can never touch it.
    # No defensive df.copy() is needed: every step below builds new objects
    # rather than mutating df, and copying the full frame only costs memory.
    y = df[target_col]

    # One-hot (dummy) encode the feature columns only.
    X = dummy_encoding(df.drop(columns=target_col))

    # Stratify on y so both splits keep the same class proportions.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, train_size=train_size, random_state=random_state, stratify=y
    )

    # Fit the scaler on the training rows only so that no test-set statistics
    # leak into the features, then apply the same transform to both splits.
    sc = StandardScaler()
    X_train = sc.fit_transform(X_train)
    X_test = sc.transform(X_test)

    return X_train, X_test, y_train, y_test

In [6]:
# Build the encoded, scaled train/test splits once; the cells below reuse these four names.
X_train, X_test, y_train, y_test = preprocess_inputs(df)

In [6]:
# Sanity check: the feature matrix and the label vector must have the same number of rows.
X_train.shape, y_train.shape

((1805776, 222), (1805776,))

In [8]:
# Remove the notebook's reference to the full frame and run a collection pass,
# presumably to free memory before resampling. After this cell, `df` must be
# reloaded from the pickle before any earlier cell can be re-run.
del df
gc.collect()

60

# Creating A Baseline For Accuracy (Logistic Regression & Random Forest)
### Oversampling with SMOTE

In [7]:
# Oversample only the minority class of the training split with SMOTE; the
# test split is left untouched so it keeps the original class balance.
# NOTE(review): the shapes recorded in this notebook (1,805,776 training rows
# in, 1,805,744 rows per class out) imply just 32 minority rows in the training
# split, so roughly 1.8M synthetic positives are interpolated from those 32.
# Treat metrics computed on the resampled data with caution.
sm = SMOTE(random_state=12, sampling_strategy = 'minority')
X_train_r, y_train_r = sm.fit_resample(X_train, y_train)

In [10]:
# Confirm the size of the resampled training set (features and labels must stay aligned).
X_train_r.shape, y_train_r.shape

((3611488, 222), (3611488,))

### Logistic Regression

In [11]:
# Baseline logistic regression fitted on the SMOTE-resampled training data.
# C is scikit-learn's inverse regularisation strength, so 0.0001 is a very
# strong penalty.
# NOTE(review): warnings are silenced globally in the import cell, so a
# ConvergenceWarning (if one is raised) would be hidden - confirm convergence.
clf = LogisticRegression(C = 0.0001, random_state = 21)

clf.fit(X_train_r, y_train_r) 

LogisticRegression(C=0.0001, random_state=21)

In [22]:
# Accuracy on the held-out test split. The score is symmetric in its two
# arguments, so the (prediction, truth) order used here gives the same number
# as the conventional (truth, prediction) order.
# NOTE(review): the test split is overwhelmingly class 0 (see the support
# column of the classification report), so accuracy alone says little about
# how well class 1 is detected.
y_predict_test = clf.predict(X_test)
test_accuracy = accuracy_score(y_predict_test, y_test)
print("\n")
print("[Test] Accuracy score (y_predict_test, y_test):", test_accuracy)

# Accuracy on the training data, measured on the SMOTE-resampled set the model
# was fitted on (not on the original, imbalanced training split).
y_predict_training = clf.predict(X_train_r)
train_accuracy = accuracy_score(y_train_r, y_predict_training)
print("\n")
print("[Training] Accuracy score: (y_train_r, y_predict_training)", train_accuracy)



[Test] Accuracy score (y_predict_test, y_test): 0.9847644785079024


[Training] Accuracy score: (y_train_r, y_predict_training) 0.9925676064824249


In [23]:
# Per-class precision / recall / F1 for both evaluations, printed as
# heading followed by report. Training is evaluated on the resampled set.
report_inputs = (
    ("[Training Classification Report]", y_train_r, y_predict_training),
    ("[Test Classification Report]", y_test, y_predict_test),
)
for heading, y_true, y_pred in report_inputs:
    print(heading)
    print(classification_report(y_true, y_pred))

[Training Classification Report]
              precision    recall  f1-score   support

           0       1.00      0.99      0.99   1805744
           1       0.99      1.00      0.99   1805744

    accuracy                           0.99   3611488
   macro avg       0.99      0.99      0.99   3611488
weighted avg       0.99      0.99      0.99   3611488

[Test Classification Report]
              precision    recall  f1-score   support

           0       1.00      0.98      0.99    451437
           1       0.00      0.38      0.00         8

    accuracy                           0.98    451445
   macro avg       0.50      0.68      0.50    451445
weighted avg       1.00      0.98      0.99    451445



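Recall on the minority class (1) seems to have increased significantly (0.38 on the test set), but its precision is still effectively zero (0.00). Note that the test set contains only 8 positive samples, so both estimates are very noisy.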

### Random Forest

In [None]:
# Baseline random forest fitted on the SMOTE-resampled training data
# (about 3.6M rows x 222 features, per the shapes printed above).
# n_jobs=-1 builds the 40 trees in parallel on all available cores; with a
# fixed random_state the fitted forest is the same as in a single-threaded
# run, only the wall-clock time changes.
clf_rf = RandomForestClassifier(n_estimators=40, random_state=21, n_jobs=-1)
clf_rf.fit(X_train_r, y_train_r)

# Hyperparameter Tuning

In [None]:
import lightgbm as lgb

In [None]:
# Wrap the splits in LightGBM Dataset objects. These use the original
# (non-resampled) X_train / y_train, unlike the scikit-learn models above,
# which were fitted on X_train_r / y_train_r.
# reference=train_data ties the second Dataset to the training Dataset, as
# LightGBM expects for a validation set.
# NOTE(review): if test_data is used as the validation set while tuning, the
# final test score is no longer an unbiased estimate - consider carving a
# separate validation split out of the training data.
train_data = lgb.Dataset(X_train, label=y_train)
test_data = lgb.Dataset(X_test, label=y_test, reference=train_data)

In [None]:
# The previous placeholder `lgb.train(#parameters)` was a SyntaxError: the `#`
# turned the closing parenthesis into a comment. This is a minimal, valid
# starting point; extend `lgb_params` during the hyperparameter search.
lgb_params = {
    'objective': 'binary',  # TARGET is a 0/1 label
    'seed': 21,             # same seed value used for the scikit-learn models
}
# valid_sets makes LightGBM report its evaluation metric on test_data while training.
model = lgb.train(lgb_params, train_data, valid_sets=[test_data])