In [3]:
# You will:

# Oversample the data using the RandomOverSampler and SMOTE algorithms.
# Undersample the data using the cluster centroids algorithm.
# Use a combination approach with the SMOTEENN algorithm.
# For each of the above, you’ll:

# Train a logistic regression classifier (from Scikit-learn) using the resampled data.
# Calculate the balanced accuracy score using balanced_accuracy_score from sklearn.metrics.
# Generate a confusion_matrix.
# Print the classification report (classification_report_imbalanced from imblearn.metrics).

In [4]:
%matplotlib inline
import matplotlib.pyplot as plt
from collections import Counter
from path import Path
import numpy as np
import pandas as pd

In [5]:
# Read in the data
data = Path('./Resources/LoanStats_2019Q1_missingColumnsRemoved.csv')
a_df = pd.read_csv(data)
a_df.head()

Unnamed: 0,loan_amnt,funded_amnt,funded_amnt_inv,term_months,int_rate,installment,grade,sub_grade,emp_title,emp_length,...,sec_app_mort_acc,sec_app_open_acc,sec_app_revol_util,sec_app_open_act_il,sec_app_num_rev_accts,sec_app_chargeoff_within_12_mths,sec_app_collections_12_mths_ex_med,sec_app_mths_since_last_major_derog,hardship_flag,debt_settlement_flag
0,4800,4800,4800,36 months,0.17,171.59,C,C5,Office clerk,6 years,...,,,,,,,,,N,N
1,3600,3600,3600,36 months,0.16,125.81,C,C3,Service Technician,10+ years,...,,,,,,,,,N,N
2,4800,4800,4800,36 months,0.15,165.79,C,C2,Manager,6 years,...,,,,,,,,,N,N
3,4800,4800,4800,36 months,0.07,148.26,A,A2,Account Manager,10+ years,...,,,,,,,,,N,N
4,10000,10000,10000,36 months,0.08,311.34,A,A3,Vice President - Portfolio Manage,8 years,...,,,,,,,,,N,N


In [6]:
# select statuses of interest
keep_statuses = ['Current', 'Fully Paid', 'Late (16-30 days)', 'Late (31-120 days)', 'Charged Off']
a_df['bad_status'] = a_df["loan_status"].isin(['Late (16-30 days)', 'Late (31-120 days)', 'Charged Off']).astype('int')
a_df.info(verbose=True, null_counts=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 115675 entries, 0 to 115674
Data columns (total 121 columns):
 #   Column                               Non-Null Count   Dtype  
---  ------                               --------------   -----  
 0   loan_amnt                            115675 non-null  int64  
 1   funded_amnt                          115675 non-null  int64  
 2   funded_amnt_inv                      115675 non-null  int64  
 3   term_months                          115675 non-null  object 
 4   int_rate                             115675 non-null  float64
 5   installment                          115675 non-null  float64
 6   grade                                115675 non-null  object 
 7   sub_grade                            115675 non-null  object 
 8   emp_title                            96157 non-null   object 
 9   emp_length                           104574 non-null  object 
 10  home_ownership                       115675 non-null  object 
 11  annual_inc  

In [7]:
df = a_df[(a_df['loan_status'].isin(keep_statuses)) & (a_df['bad_status'].notnull())]
df

Unnamed: 0,loan_amnt,funded_amnt,funded_amnt_inv,term_months,int_rate,installment,grade,sub_grade,emp_title,emp_length,...,sec_app_open_acc,sec_app_revol_util,sec_app_open_act_il,sec_app_num_rev_accts,sec_app_chargeoff_within_12_mths,sec_app_collections_12_mths_ex_med,sec_app_mths_since_last_major_derog,hardship_flag,debt_settlement_flag,bad_status
0,4800,4800,4800,36 months,0.17,171.59,C,C5,Office clerk,6 years,...,,,,,,,,N,N,1
1,3600,3600,3600,36 months,0.16,125.81,C,C3,Service Technician,10+ years,...,,,,,,,,N,N,1
2,4800,4800,4800,36 months,0.15,165.79,C,C2,Manager,6 years,...,,,,,,,,N,N,1
3,4800,4800,4800,36 months,0.07,148.26,A,A2,Account Manager,10+ years,...,,,,,,,,N,N,1
4,10000,10000,10000,36 months,0.08,311.34,A,A3,Vice President - Portfolio Manage,8 years,...,,,,,,,,N,N,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
115670,1200,1200,1200,36 months,0.16,42.28,C,C4,Shift Supervisor,10+ years,...,6.0,99.6,0.0,3.0,0.0,1.0,4.0,N,N,1
115671,4000,4000,4000,36 months,0.21,150.48,D,D4,,< 1 year,...,,,,,,,,N,N,1
115672,10000,10000,10000,60 months,0.20,264.50,D,D3,,< 1 year,...,,,,,,,,N,N,1
115673,20000,20000,20000,36 months,0.06,612.62,A,A1,,< 1 year,...,,,,,,,,N,N,1


In [8]:
df.groupby('bad_status').describe().head()

Unnamed: 0_level_0,loan_amnt,loan_amnt,loan_amnt,loan_amnt,loan_amnt,loan_amnt,loan_amnt,loan_amnt,funded_amnt,funded_amnt,...,sec_app_collections_12_mths_ex_med,sec_app_collections_12_mths_ex_med,sec_app_mths_since_last_major_derog,sec_app_mths_since_last_major_derog,sec_app_mths_since_last_major_derog,sec_app_mths_since_last_major_derog,sec_app_mths_since_last_major_derog,sec_app_mths_since_last_major_derog,sec_app_mths_since_last_major_derog,sec_app_mths_since_last_major_derog
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max,count,mean,...,75%,max,count,mean,std,min,25%,50%,75%,max
bad_status,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
0,96273.0,16647.037331,10364.969864,1000.0,9000.0,15000.0,24000.0,40000.0,96273.0,16647.037331,...,0.0,16.0,3980.0,39.40804,23.672195,0.0,20.0,39.0,59.0,132.0
1,334.0,15428.817365,10062.373534,1000.0,8000.0,12000.0,20000.0,40000.0,334.0,15428.817365,...,0.0,1.0,12.0,49.166667,31.737083,4.0,25.5,49.5,66.5,117.0


In [9]:
# column lists by dtype
type_float64 = df.select_dtypes(include='float64').columns.values.tolist()
type_int64 = df.select_dtypes(include='int64').columns.values.tolist()
type_object = df.select_dtypes(include='object').columns.values.tolist()


In [10]:
# number of unique values in lists by dtype
unique_values_float = df.select_dtypes(include='float64').nunique()


unique_values_float.array

sum_df_float = pd.DataFrame(unique_values_float)
sum_df_float.reset_index(level=0, inplace=True)
sum_df_float.rename(columns={ 'index':'colname', 0: 'value_count'}, inplace=True)

sum_df_float = sum_df_float[(sum_df_float['value_count'] > 10)]
sum_df_float


Unnamed: 0,colname,value_count
0,int_rate,25
1,installment,10570
2,annual_inc,8375
3,dti,6192
4,mths_since_last_delinq,128
5,mths_since_last_record,106
6,revol_util,115
7,out_prncp,24537
8,out_prncp_inv,26123
9,total_pymnt,36886


In [11]:
unique_values_int = df.select_dtypes(include='int64').nunique()

sum_df_int = pd.DataFrame(unique_values_int)
sum_df_int.reset_index(level=0, inplace=True)
sum_df_int.rename(columns={ 'index':'colname', 0: 'value_count'}, inplace=True)

sum_df_int = sum_df_int[(sum_df_int['value_count'] > 1) & (sum_df_int['value_count'] <= 50)]
sum_df_int

Unnamed: 0,colname,value_count
3,delinq_2yrs,19
4,inq_last_6mths,6
6,pub_rec,5
11,collections_12_mths_ex_med,7
16,open_acc_6m,14
17,open_act_il,39
18,open_il_12m,7
19,open_il_24m,17
21,open_rv_12m,18
22,open_rv_24m,34


In [12]:
unique_values_object = df.select_dtypes(include='object').nunique()

sum_df_object = pd.DataFrame(unique_values_object)
sum_df_object.reset_index(level=0, inplace=True)
sum_df_object.rename(columns={ 'index':'colname', 0: 'value_count'}, inplace=True)
sum_df_object = sum_df_object[(sum_df_object['value_count'] > 1) & (sum_df_object['value_count'] <= 50)]
sum_df_object

Unnamed: 0,colname,value_count
0,term_months,2
1,grade,7
2,sub_grade,33
4,emp_length,11
5,home_ownership,5
6,verification_status,3
7,issue_d,3
8,loan_status,5
10,purpose,12
11,title,12


In [13]:
b_df = a_df[(a_df['loan_status'].isin(keep_statuses)) & (a_df['bad_status'].notnull())]
b_df

Unnamed: 0,loan_amnt,funded_amnt,funded_amnt_inv,term_months,int_rate,installment,grade,sub_grade,emp_title,emp_length,...,sec_app_open_acc,sec_app_revol_util,sec_app_open_act_il,sec_app_num_rev_accts,sec_app_chargeoff_within_12_mths,sec_app_collections_12_mths_ex_med,sec_app_mths_since_last_major_derog,hardship_flag,debt_settlement_flag,bad_status
0,4800,4800,4800,36 months,0.17,171.59,C,C5,Office clerk,6 years,...,,,,,,,,N,N,1
1,3600,3600,3600,36 months,0.16,125.81,C,C3,Service Technician,10+ years,...,,,,,,,,N,N,1
2,4800,4800,4800,36 months,0.15,165.79,C,C2,Manager,6 years,...,,,,,,,,N,N,1
3,4800,4800,4800,36 months,0.07,148.26,A,A2,Account Manager,10+ years,...,,,,,,,,N,N,1
4,10000,10000,10000,36 months,0.08,311.34,A,A3,Vice President - Portfolio Manage,8 years,...,,,,,,,,N,N,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
115670,1200,1200,1200,36 months,0.16,42.28,C,C4,Shift Supervisor,10+ years,...,6.0,99.6,0.0,3.0,0.0,1.0,4.0,N,N,1
115671,4000,4000,4000,36 months,0.21,150.48,D,D4,,< 1 year,...,,,,,,,,N,N,1
115672,10000,10000,10000,60 months,0.20,264.50,D,D3,,< 1 year,...,,,,,,,,N,N,1
115673,20000,20000,20000,36 months,0.06,612.62,A,A1,,< 1 year,...,,,,,,,,N,N,1


In [14]:
df = b_df.filter(['bad_status', 'term_months', 'purpose', 'grade', 'application_type','home_ownership'])

dummy_set=pd.get_dummies(df)

dummy_set.head()

Unnamed: 0,bad_status,term_months_ 36 months,term_months_ 60 months,purpose_car,purpose_credit_card,purpose_debt_consolidation,purpose_home_improvement,purpose_house,purpose_major_purchase,purpose_medical,...,grade_E,grade_F,grade_G,application_type_Individual,application_type_Joint App,home_ownership_ANY,home_ownership_MORTGAGE,home_ownership_NONE,home_ownership_OWN,home_ownership_RENT
0,1,1,0,0,0,0,1,0,0,0,...,0,0,0,1,0,0,1,0,0,0
1,1,1,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,1
2,1,1,0,0,0,0,1,0,0,0,...,0,0,0,1,0,0,1,0,0,0
3,1,1,0,0,0,0,1,0,0,0,...,0,0,0,1,0,0,1,0,0,0
4,1,1,0,0,0,1,0,0,0,0,...,0,0,0,1,0,0,1,0,0,0


In [15]:
from sklearn.preprocessing import StandardScaler
data_scaler = StandardScaler()

# X = df.YearsExperience.values.reshape(-1, 1)
int_rate_res = a_df.int_rate.values.reshape(-1, 1)

int_rate_rescaled = data_scaler.fit_transform(int_rate_res)
int_rate_res_df = pd.DataFrame(int_rate_rescaled)
int_rate_res_df.rename(columns={ 'index':'index', 0: 'int_rate_rescaled'}, inplace=True)
int_rate_res


array([[0.17],
       [0.16],
       [0.15],
       ...,
       [0.2 ],
       [0.06],
       [0.2 ]])

In [16]:
total_pymnt_res = a_df.total_pymnt.values.reshape(-1, 1)

total_pymnt_rescaled = data_scaler.fit_transform(total_pymnt_res)
total_pymnt_res_df = pd.DataFrame(total_pymnt_rescaled)
total_pymnt_res_df.rename(columns={ 'index':'index', 0: 'total_pymnt_rescaled'}, inplace=True)
total_pymnt_res_df


Unnamed: 0,total_pymnt_rescaled
0,-0.409754
1,-0.409754
2,-0.409754
3,-0.409754
4,-0.409754
...,...
115670,-0.389538
115671,-0.348252
115672,-0.312848
115673,-0.178247


In [17]:
df = pd.concat([dummy_set, total_pymnt_res_df, int_rate_res_df], axis=1)
df = df[(df["bad_status"].notnull())]

# df = df[df['bad_status'].notna()]
# df = df[df['total_pymnt_rescaled'].notna()]
df.head(15)

Unnamed: 0,bad_status,term_months_ 36 months,term_months_ 60 months,purpose_car,purpose_credit_card,purpose_debt_consolidation,purpose_home_improvement,purpose_house,purpose_major_purchase,purpose_medical,...,grade_G,application_type_Individual,application_type_Joint App,home_ownership_ANY,home_ownership_MORTGAGE,home_ownership_NONE,home_ownership_OWN,home_ownership_RENT,total_pymnt_rescaled,int_rate_rescaled
0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,-0.409754,0.882794
1,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,-0.409754,0.677806
2,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,-0.409754,0.472818
3,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,-0.409754,-1.167086
4,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,-0.409754,-0.962098
5,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,-0.409754,-0.962098
6,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,-0.409754,-0.962098
7,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,-0.409754,-0.962098
8,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,-0.409754,-1.372074
9,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,-0.409754,-0.142146


In [None]:
# drop columns with none or one value only - done above
# scale columns for >10 unique values
# get dummies for 2-10 unique values and objects

In [18]:
# Segment the features from the target
y = df["bad_status"]
X = df.drop(columns="bad_status")


In [19]:
# Normal train-test split
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)
Counter(y_train)


Counter({0.0: 72225, 1.0: 230})

In [20]:
# implement random oversampling
from imblearn.over_sampling import RandomOverSampler
ros = RandomOverSampler(random_state=1)

In [21]:
X_resampled, y_resampled = ros.fit_resample(X_train, y_train)

Counter(y_resampled)

Counter({0.0: 72225, 1.0: 72225})

In [22]:
X_resampled.describe()

Unnamed: 0,term_months_ 36 months,term_months_ 60 months,purpose_car,purpose_credit_card,purpose_debt_consolidation,purpose_home_improvement,purpose_house,purpose_major_purchase,purpose_medical,purpose_moving,...,grade_G,application_type_Individual,application_type_Joint App,home_ownership_ANY,home_ownership_MORTGAGE,home_ownership_NONE,home_ownership_OWN,home_ownership_RENT,total_pymnt_rescaled,int_rate_rescaled
count,144450.0,144450.0,144450.0,144450.0,144450.0,144450.0,144450.0,144450.0,144450.0,144450.0,...,144450.0,144450.0,144450.0,144450.0,144450.0,144450.0,144450.0,144450.0,144450.0,144450.0
mean,0.709332,0.290668,0.006577,0.227034,0.553403,0.077362,0.011983,0.02749,0.016795,0.004874,...,8.3e-05,0.875943,0.124057,0.010751,0.477854,7e-06,0.148183,0.363205,-0.098488,0.195574
std,0.454072,0.454072,0.08083,0.418916,0.497142,0.267167,0.108811,0.163508,0.128502,0.069642,...,0.009114,0.329647,0.329647,0.103129,0.499511,0.002631,0.355282,0.480925,0.84636,1.104081
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.409754,-1.372074
25%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.409754,-0.75711
50%,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.25491,0.062842
75%,1.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,-0.080955,0.882794
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,15.427938,3.752626


In [23]:
from sklearn.linear_model import LogisticRegression
model = LogisticRegression(solver='lbfgs', random_state=1)
model.fit(X_resampled, y_resampled)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


LogisticRegression(random_state=1)

In [24]:
# Display the confusion matrix
from sklearn.metrics import confusion_matrix

y_pred = model.predict(X_test)
confusion_matrix(y_test, y_pred)

array([[17705,  6343],
       [   20,    84]], dtype=int64)

In [25]:
from sklearn.metrics import balanced_accuracy_score

balanced_accuracy_score(y_test, y_pred)

0.7719640846512104

In [26]:
# Print the imbalanced classification report
from imblearn.metrics import classification_report_imbalanced
print(classification_report_imbalanced(y_test, y_pred))

                   pre       rec       spe        f1       geo       iba       sup

        0.0       1.00      0.74      0.81      0.85      0.77      0.59     24048
        1.0       0.01      0.81      0.74      0.03      0.77      0.60       104

avg / total       0.99      0.74      0.81      0.84      0.77      0.59     24152



In [27]:
from imblearn.over_sampling import SMOTE
X_resampled, y_resampled = SMOTE(random_state=1,sampling_strategy='auto').fit_resample(X_train, y_train)
Counter(y_resampled)

Counter({0.0: 72225, 1.0: 72225})

In [28]:
model = LogisticRegression(solver='lbfgs', random_state=1)
model.fit(X_resampled, y_resampled)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


LogisticRegression(random_state=1)

In [29]:
# Calculated the balanced accuracy score
y_pred = model.predict(X_test)
balanced_accuracy_score(y_test, y_pred)

0.7726038307999386

In [30]:
# Display the confusion matrix
confusion_matrix(y_test, y_pred)

array([[17967,  6081],
       [   21,    83]], dtype=int64)

In [31]:
# Print the imbalanced classification report
print(classification_report_imbalanced(y_test, y_pred))

                   pre       rec       spe        f1       geo       iba       sup

        0.0       1.00      0.75      0.80      0.85      0.77      0.59     24048
        1.0       0.01      0.80      0.75      0.03      0.77      0.60       104

avg / total       0.99      0.75      0.80      0.85      0.77      0.59     24152



In [37]:
# Implement undersampling using ClusterCentroids algorithm
from imblearn.under_sampling import ClusterCentroids
cc = ClusterCentroids(random_state=1)
X_resampled, y_resampled = cc.fit_resample(X_train, y_train)

In [38]:
model = LogisticRegression(solver='lbfgs', random_state=1)
model.fit(X_resampled, y_resampled)

LogisticRegression(random_state=1)

In [39]:
# confusion matrix
y_pred = model.predict(X_test)
confusion_matrix(y_test, y_pred)


array([[12217, 11831],
       [   16,    88]], dtype=int64)

In [40]:
# Balanced accuracy score
balanced_accuracy_score(y_test, y_pred)

0.6770897307948206

In [41]:
# Print the imbalanced classification report
print(classification_report_imbalanced(y_test, y_pred))

                   pre       rec       spe        f1       geo       iba       sup

        0.0       1.00      0.51      0.85      0.67      0.66      0.42     24048
        1.0       0.01      0.85      0.51      0.01      0.66      0.44       104

avg / total       0.99      0.51      0.84      0.67      0.66      0.42     24152



In [None]:
# Implement SMOTEEN
from imblearn.combine import SMOTEENN
smote_enn = SMOTEENN(random_state=0)
X_resampled, y_resampled = smote_enn.fit_resample(X, y)

In [None]:
model = LogisticRegression(solver='lbfgs', random_state=1)
model.fit(X_resampled, y_resampled)

In [None]:
# confusion matrix
y_pred = model.predict(X_test)
confusion_matrix(y_test, y_pred)

# Balanced accuracy score
balanced_accuracy_score(y_test, y_pred)

# Print the imbalanced classification report
print(classification_report_imbalanced(y_test, y_pred))