In [51]:
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import make_scorer, f1_score
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
import warnings
warnings.filterwarnings('ignore')
from sklearn.model_selection import train_test_split
import scorecardpy as sc
import pprint

from collections import Counter
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import RandomOverSampler
from imblearn.over_sampling import SMOTE
from category_encoders import TargetEncoder

In [52]:
df_main = pd.read_csv('filtered_data_iter2.csv')
df = df_main.copy()
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 114755 entries, 0 to 114754
Data columns (total 20 columns):
 #   Column                    Non-Null Count   Dtype  
---  ------                    --------------   -----  
 0   Unnamed: 0                114755 non-null  int64  
 1   STATUS                    114755 non-null  int64  
 2   NAME_CONTRACT_TYPE        114755 non-null  object 
 3   FLAG_OWN_CAR              114755 non-null  object 
 4   FLAG_OWN_REALTY           114755 non-null  object 
 5   AMT_INCOME_TOTAL          114755 non-null  float64
 6   AMT_CREDIT                114755 non-null  float64
 7   AMT_ANNUITY               114755 non-null  float64
 8   NAME_INCOME_TYPE          114755 non-null  object 
 9   NAME_EDUCATION_TYPE       114755 non-null  object 
 10  NAME_FAMILY_STATUS        114755 non-null  object 
 11  NAME_HOUSING_TYPE         114755 non-null  object 
 12  DAYS_EMPLOYED             114755 non-null  int64  
 13  OCCUPATION_TYPE           114755 non-null  o

In [53]:
#replace value 0 in status with 2
# df["STATUS"].replace(0,2,inplace=True)
# df["STATUS"].replace(1,0,inplace=True)
# df["STATUS"].replace(2,1,inplace=True)
df["STATUS"].value_counts()


0    104221
1     10534
Name: STATUS, dtype: int64

In [54]:
df.dropna(inplace=True)
df.isnull().sum()

Unnamed: 0                  0
STATUS                      0
NAME_CONTRACT_TYPE          0
FLAG_OWN_CAR                0
FLAG_OWN_REALTY             0
AMT_INCOME_TOTAL            0
AMT_CREDIT                  0
AMT_ANNUITY                 0
NAME_INCOME_TYPE            0
NAME_EDUCATION_TYPE         0
NAME_FAMILY_STATUS          0
NAME_HOUSING_TYPE           0
DAYS_EMPLOYED               0
OCCUPATION_TYPE             0
CNT_FAM_MEMBERS             0
EXT_SOURCE_2                0
DEF_60_CNT_SOCIAL_CIRCLE    0
age                         0
total_enquiries_cb          0
credit_income_ratio         0
dtype: int64

In [55]:
data_dict = {}
for col in df.columns:
    data_dict[col] = [str(df[col].dtypes)]
    

In [56]:
# define the target variable
target = df['STATUS']

# create a list of object columns
object_list = [col for col in df.columns if df[col].dtype == 'object']

# create a dictionary to store the original categorical variables and their encoded values
cat_dict = {}

df_label_encoded = df.copy()
# label encode all object columns and store the original categorical variables and their encoded values
for col in object_list:
    le = LabelEncoder()
    df_label_encoded[col] = le.fit_transform(df[col])
    cat_dict[col] = dict(zip(le.classes_, le.transform(le.classes_)))

# label encode the target variable and add it to the dictionary
le = LabelEncoder()
target = le.fit_transform(target)
cat_dict['STATUS'] = dict(zip(le.classes_, le.transform(le.classes_)))

# print the dictionary
for (k,v) in cat_dict.items():
    print(k, v)

NAME_CONTRACT_TYPE {'Cash loans': 0, 'Revolving loans': 1}
FLAG_OWN_CAR {'N': 0, 'Y': 1}
FLAG_OWN_REALTY {'N': 0, 'Y': 1}
NAME_INCOME_TYPE {'Businessman': 0, 'Commercial associate': 1, 'Pensioner': 2, 'State servant': 3, 'Student': 4, 'Working': 5}
NAME_EDUCATION_TYPE {'Academic degree': 0, 'Higher education': 1, 'Incomplete higher': 2, 'Lower secondary': 3, 'Secondary / secondary special': 4}
NAME_FAMILY_STATUS {'Civil marriage': 0, 'Married': 1, 'Separated': 2, 'Single / not married': 3, 'Widow': 4}
NAME_HOUSING_TYPE {'Co-op apartment': 0, 'House / apartment': 1, 'Municipal apartment': 2, 'Office apartment': 3, 'Rented apartment': 4, 'With parents': 5}
OCCUPATION_TYPE {'Accountants': 0, 'Cleaning staff': 1, 'Cooking staff': 2, 'Core staff': 3, 'Drivers': 4, 'HR staff': 5, 'High skill tech staff': 6, 'IT staff': 7, 'Laborers': 8, 'Low-skill Laborers': 9, 'Managers': 10, 'Medicine staff': 11, 'Not Specified': 12, 'Private service staff': 13, 'Realty agents': 14, 'Sales staff': 15, 'Sec

After label encoding, certain categories such as OCCUPATION_TYPE have an arbitrary and misleading order. The order of the encoded categories may not reflect the actual relationship between the categories. In the label encoded order above, **Accountants** have a low encoding label and would be likely binned with **Cleaning** and **Cooking staff** despite a much lower default rate as seen in the data below. This could potentially lead to inaccurate binning and scorecard creation, which could produce irrational scores.

In [57]:
df.groupby(df['OCCUPATION_TYPE'])['STATUS'].mean().sort_values(ascending=False)

OCCUPATION_TYPE
Low-skill Laborers       0.180932
Cooking staff            0.125254
Cleaning staff           0.124682
Security staff           0.121001
Waiters/barmen staff     0.114334
Laborers                 0.113637
Drivers                  0.112117
Sales staff              0.103980
Realty agents            0.099502
Not Specified            0.084891
Secretaries              0.081413
Medicine staff           0.079542
Managers                 0.067038
High skill tech staff    0.066873
HR staff                 0.064516
Core staff               0.063942
Private service staff    0.063846
Accountants              0.051250
IT staff                 0.048387
Name: STATUS, dtype: float64

We will group occupation types into 5 categories based on the corresponding default rates for each type

In [58]:
df['OCCUPATION_TYPE'] = df['OCCUPATION_TYPE'].map({'Low-skill Labourers': 0, 'Cooking staff': 1, 'Cleaning staff': 2, 'Security staff': 3, 'Waiters/barmen staff': 4})
df['OCCUPATION_TYPE'] = df['OCCUPATION_TYPE'].map({'Labourers': 5, 'Drivers': 6, 'Sales staff': 7, 'Realty agents': 8, 'Secretaries': 9})
df['OCCUPATION_TYPE'] = df['OCCUPATION_TYPE'].map({'Not Specified': 10})
df['OCCUPATION_TYPE'] = df['OCCUPATION_TYPE'].map({'Medicine staff': 11, 'Managers': 12, 'High skill tech staff': 13, 'HR staff ': 14})
df['OCCUPATION_TYPE'] = df['OCCUPATION_TYPE'].map({'Core staff': 15, 'Private service staff': 16, 'Accountants': 17, 'IT staff': 18})
cat_dict['OCCUPATION_TYPE'] = {'Low-skill Labourers': 0, 'Cooking staff': 1, 'Cleaning staff': 2, 'Security staff': 3, 'Waiters/barmen staff': 4, 'Labourers': 5, 'Drivers': 6, 'Sales staff': 7, 'Realty agents': 8, 'Secretaries': 9, 'Not Specified': 10, 'Medicine staff': 11, 'Managers': 12, 'High skill tech staff': 13, 'HR staff ': 14, 'Core staff': 15, 'Private service staff': 16, 'Accountants': 17, 'IT staff': 18}

To mitigate this issue, we will perform manual label encoding for categories that can be easily ordered without biases

In [59]:
# Education labels are in order of increasing education level
df['NAME_EDUCATION_TYPE'] = df['NAME_EDUCATION_TYPE'].map({'Lower secondary': 0, 'Secondary/ secondary special': 1, 'Incomplete higher': 2, 'Higher education': 3, 'Academic degree': 4})
cat_dict['NAME_EDUCATION_TYPE'] = {'Lower secondary': 0, 'Secondary/ secondary special': 1, 'Incomplete higher': 2, 'Higher education': 3, 'Academic degree': 4}

# Income labels are in order of increasing income level
df['NAME_INCOME_TYPE'] = df['NAME_INCOME_TYPE'].map({'Student': 0, 'Pensioner': 1, 'Working': 2, 'State servant': 3, 'Commercial associate': 4, 'Businessman': 5})
cat_dict['NAME_INCOME_TYPE'] = {'Student': 0, 'Pensioner': 1, 'Working': 2, 'State servant': 3, 'Commercial associate': 4, 'Businessman': 5}

# Housing labels are in order of increasing loan burden on the individual
df['NAME_HOUSING_TYPE'] = df['NAME_HOUSING_TYPE'].map({'With parents': 0, 'Rented apartment': 1, 'Municipal apartment': 2, 'Office apartment': 3, 'Co-op apartment': 4, 'House / apartment': 5})
cat_dict['NAME_HOUSING_TYPE'] = {'With parents': 0, 'Rented apartment': 1, 'Municipal apartment': 2, 'Office apartment': 3, 'Co-op apartment': 4, 'House / apartment': 5}

For the remaining variables, perform label encoding

In [60]:
manual_encodings = ['OCCUPATION_TYPE', 'NAME_EDUCATION_TYPE', 'NAME_INCOME_TYPE', 'NAME_HOUSING_TYPE']

# define the target variable
target = df['STATUS']

# create a list of object columns
object_list = [col for col in df.columns if df[col].dtype == 'object']

# create a dictionary to store the original categorical variables and their encoded values
df_label_encoded = df.copy()
# label encode all object columns and store the original categorical variables and their encoded values
for col in set(object_list) - set(manual_encodings):
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])
    cat_dict[col] = dict(zip(le.classes_, le.transform(le.classes_)))

# label encode the target variable and add it to the dictionary
le = LabelEncoder()
target = le.fit_transform(target)
cat_dict['STATUS'] = dict(zip(le.classes_, le.transform(le.classes_)))

In [61]:
# print the dictionary
for (k,v) in cat_dict.items():
    print(k, v)

NAME_CONTRACT_TYPE {'Cash loans': 0, 'Revolving loans': 1}
FLAG_OWN_CAR {'N': 0, 'Y': 1}
FLAG_OWN_REALTY {'N': 0, 'Y': 1}
NAME_INCOME_TYPE {'Student': 0, 'Pensioner': 1, 'Working': 2, 'State servant': 3, 'Commercial associate': 4, 'Businessman': 5}
NAME_EDUCATION_TYPE {'Lower secondary': 0, 'Secondary/ secondary special': 1, 'Incomplete higher': 2, 'Higher education': 3, 'Academic degree': 4}
NAME_FAMILY_STATUS {'Civil marriage': 0, 'Married': 1, 'Separated': 2, 'Single / not married': 3, 'Widow': 4}
NAME_HOUSING_TYPE {'With parents': 0, 'Rented apartment': 1, 'Municipal apartment': 2, 'Office apartment': 3, 'Co-op apartment': 4, 'House / apartment': 5}
OCCUPATION_TYPE {'Low-skill Labourers': 0, 'Cooking staff': 1, 'Cleaning staff': 2, 'Security staff': 3, 'Waiters/barmen staff': 4, 'Labourers': 5, 'Drivers': 6, 'Sales staff': 7, 'Realty agents': 8, 'Secretaries': 9, 'Not Specified': 10, 'Medicine staff': 11, 'Managers': 12, 'High skill tech staff': 13, 'HR staff ': 14, 'Core staff': 1

In [62]:
df.head()

Unnamed: 0.1,Unnamed: 0,STATUS,NAME_CONTRACT_TYPE,FLAG_OWN_CAR,FLAG_OWN_REALTY,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,NAME_INCOME_TYPE,NAME_EDUCATION_TYPE,NAME_FAMILY_STATUS,NAME_HOUSING_TYPE,DAYS_EMPLOYED,OCCUPATION_TYPE,CNT_FAM_MEMBERS,EXT_SOURCE_2,DEF_60_CNT_SOCIAL_CIRCLE,age,total_enquiries_cb,credit_income_ratio
0,0,0,0,1,1,171000.0,1560726.0,41301.0,4,3.0,Married,5,-3130,,3.0,0.724,0.0,37.0,4.0,9.127053
1,2,0,0,0,1,112500.0,652500.0,21177.0,2,3.0,Married,5,-679,,3.0,0.651862,0.0,27.0,1.0,5.8
2,3,0,0,0,1,67500.0,80865.0,5881.5,2,,Married,5,-2717,,2.0,0.715042,0.0,36.0,1.0,1.198
3,4,0,0,1,0,225000.0,918468.0,28966.5,2,,Married,5,-3028,,3.0,0.566907,0.0,38.0,1.0,4.08208
4,6,0,0,0,0,108000.0,509602.5,26149.5,2,,Married,5,-1317,,2.0,0.236378,0.0,35.0,1.0,4.718542


In [63]:
del df['Unnamed: 0']

In [64]:
# automatically calculate bin ranges
# bins = sc.woebin(df, y='STATUS',positive="bad|0")
bins = sc.woebin(df, y='STATUS')
# make it easy to read the bins
for variables , bindetails in bins.items():
    if (variables == 'OCCUPATION_TYPE'):
        print(bindetails)
    print(variables , " : ")
    display(bindetails)
    print("--"*50)

[INFO] creating woe binning ...


MergeError: Can only pass argument "on" OR "left_index" and "right_index", not a combination of both.

In [None]:
#drop credit_income_ratio
df.drop("credit_income_ratio",axis=1,inplace=True)
df.drop("total_enquiries_cb",axis=1,inplace=True)

In [None]:
df.drop("FLAG_OWN_REALTY",axis=1,inplace=True)
df.drop("NAME_HOUSING_TYPE",axis=1,inplace=True)

In [None]:
print(df.shape)
df.head()

(114564, 15)


Unnamed: 0,STATUS,NAME_CONTRACT_TYPE,FLAG_OWN_CAR,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,NAME_INCOME_TYPE,NAME_EDUCATION_TYPE,NAME_FAMILY_STATUS,DAYS_EMPLOYED,OCCUPATION_TYPE,CNT_FAM_MEMBERS,EXT_SOURCE_2,DEF_60_CNT_SOCIAL_CIRCLE,age
0,0,0,1,171000.0,1560726.0,41301.0,1,1,1,-3130,0,3.0,0.724,0.0,37.0
1,0,0,0,112500.0,652500.0,21177.0,5,1,1,-679,3,3.0,0.651862,0.0,27.0
2,0,0,0,67500.0,80865.0,5881.5,5,4,1,-2717,8,2.0,0.715042,0.0,36.0
3,0,0,1,225000.0,918468.0,28966.5,5,4,1,-3028,4,3.0,0.566907,0.0,38.0
4,0,0,0,108000.0,509602.5,26149.5,5,4,1,-1317,4,2.0,0.236378,0.0,35.0


In [None]:
breaks_adj = {
       'OCCUPATION_TYPE' : [1, 4, 15],
       'NAME_INCOME_TYPE' : [0,1,5],
        'NAME_EDUCATION_TYPE' : [0,2,3],
        'AMT_INCOME_TOTAL' : [200000,250000, 300000,400000],
        'DAYS_EMPLOYED' : [-5000,-3500, -2200, -1700, -1000],
        'AMT_ANNUITY' : [4000, 7000, 16000],
        'EXT_SOURCE_2' : [0.1,0.15,0.20,0.44,0.6,0.80],
        'AMT_CREDIT' : [91000,150000,200000,300000],
        'CNT_FAM_MEMBERS' : [1,2,3],
    'NAME_FAMILY_STATUS' : [0,3],
        'age' : [27, 30,35, 40],
        'DEF_60_CNT_SOCIAL_CIRCLE' : [0,1,2],
        'FLAG_OWN_CAR': [0,1]
        
    }
bins_final = sc.woebin(df, y='STATUS',breaks_list=breaks_adj)
# bins_final = sc.woebin(df, y='STATUS',breaks_list=breaks_adj, positive="bad|0")

[INFO] creating woe binning ...


In [None]:
# sample code
train, test = sc.split_df(df, 'STATUS', ratio=0.8).values()
print(train.shape)
print(test.shape)

(91651, 15)
(22913, 15)


In [None]:
print('Proportion of positive cases in train set:', train['STATUS'].mean())
print('Proportion of positive cases in test set:', test['STATUS'].mean())

Proportion of positive cases in train set: 0.09175022640233058
Proportion of positive cases in test set: 0.09173831449395539


In [None]:
# train_woe = sc.woebin_ply(train, bins_final, positive="bad|0")
# test_woe = sc.woebin_ply(test, bins_final, positive="bad|0")
train_woe = sc.woebin_ply(train, bins_final)
test_woe = sc.woebin_ply(test, bins_final)

[INFO] converting into woe values ...
[INFO] converting into woe values ...


In [None]:
y_train = train_woe.loc[:,'STATUS']
X_train = train_woe.loc[:,train_woe.columns != 'STATUS']
y_test = test_woe.loc[:,'STATUS']
X_test = test_woe.loc[:,test_woe.columns != 'STATUS']

In [None]:
print("Class distribution in original train set:", Counter(y_train))
rus = RandomUnderSampler(random_state=7)
X_train_rus_resampled, y_train_rus_resampled = rus.fit_resample(X_train, y_train)
print("Class distribution in resampled train set:", Counter(y_train_rus_resampled))

Class distribution in original train set: Counter({0: 83242, 1: 8409})
Class distribution in resampled train set: Counter({0: 8409, 1: 8409})


In [None]:
print("Class distribution in original train set:", Counter(y_train))
ros = RandomOverSampler(random_state=7)
X_train_ros_resampled, y_train_ros_resampled = ros.fit_resample(X_train, y_train)
print("Class distribution in resampled train set:", Counter(y_train_ros_resampled))

Class distribution in original train set: Counter({0: 83242, 1: 8409})
Class distribution in resampled train set: Counter({0: 83242, 1: 83242})


In [None]:
print("Class distribution in original train set:", Counter(y_train))
smote = SMOTE(random_state=7)
X_train_smote_resampled, y_train_smote_resampled = smote.fit_resample(X_train, y_train)
print("Class distribution in resampled train set:", Counter(y_train_smote_resampled))

Class distribution in original train set: Counter({0: 83242, 1: 8409})
Class distribution in resampled train set: Counter({0: 83242, 1: 83242})


In [None]:
#create a logistic regression model object
lr = LogisticRegression(class_weight='balanced', random_state=7)
lr.fit(X_train, y_train)
print(lr.coef_)
print(lr.intercept_)

[[0.70762312 0.40179317 0.10333775 0.6074612  0.86220769 0.88966486
  0.50440388 0.86035346 0.61778868 0.06325317 0.74167581 1.17754017
  0.77701069 0.86615051]]
[-0.00390821]


1 is good
0 is default

In [None]:
accuracy = lr.score(X_test, y_test)
accuracy

0.6485837734037446

In [None]:
# make predictions on the test set
y_pred = lr.predict(X_test)

In [None]:
# assume y_true and y_pred are the true and predicted labels, respectively
cm = confusion_matrix(y_test, y_pred, labels=[0,1])

# create a dataframe from the confusion matrix
df_cm = pd.DataFrame(cm, index=['Good', 'Default'], columns=['Predict Good', 'Predict Default'])


# print the dataframe
print(df_cm)

         Predict Good  Predict Default
Good            13560             7251
Default           801             1301


True positive (TP)
The model predicts a good customer, and the client did not default <br>
False positive (FP)
The model predicts a good customer, but the client defaulted<br>
True negative (TN) 
The model predicts a default, and the client defaulted<br>
False negative (FN)
The model predicts a default, but the client did not default

In [None]:
#import accuracy score from sklearn
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score, confusion_matrix
tp, fn, fp, tn = confusion_matrix(y_test, y_pred, labels=[0,1]).ravel()
print(tp,fn,fp,tn)

accuracy = (tp + tn) / (tp + tn + fp + fn)
precision = tp / (tp + fp) #Given that we predict positive, how often is it really positive
recall = tp / (tp + fn) #How good at predicting positive
f1 = 2 * (precision * recall) / (precision + recall)
specificity = tn / (tn + fp) # How good at predicting negative?

print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("Specificity:", specificity) 
print("F1:", f1)

13560 7251 801 1301
Accuracy: 0.6485837734037446
Precision: 0.9442239398370587
Recall: 0.6515784921435779
Specificity: 0.6189343482397717
F1: 0.7710678949164108


# RUS

In [None]:
lr_rus = LogisticRegression(C=1, solver='liblinear', random_state=7)
lr_rus.fit(X_train_rus_resampled, y_train_rus_resampled)
print(lr_rus.coef_)
print(lr_rus.intercept_)

[[ 0.70589589  0.71169941 -0.01191753  0.59143766  0.91442382  0.89155576
   0.44595473  0.85331905  0.65696645 -0.29524461  0.7783618   0.92488134
   0.79405883  0.77128414]]
[-0.00352012]


In [None]:
y_rus_pred = lr_rus.predict(X_test)
cm = confusion_matrix(y_test, y_rus_pred, labels=[0,1])
df_cm = pd.DataFrame(cm, index=['Good', 'Default'], columns=['Predict Good', 'Predict Default'])
print(df_cm)

         Predict Good  Predict Default
Good            13560             7251
Default           796             1306


In [None]:
#import accuracy score from sklearn
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score, confusion_matrix
tp, fn, fp, tn = confusion_matrix(y_test, y_rus_pred, labels=[0,1]).ravel()
print(tp,fn,fp,tn)

accuracy = (tp + tn) / (tp + tn + fp + fn)
precision = tp / (tp + fp) #Given that we predict positive, how often is it really positive
recall = tp / (tp + fn) #How good at predicting positive
f1 = 2 * (precision * recall) / (precision + recall)
specificity = tn / (tn + fp) # How good at predicting negative?

print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("Specificity:", specificity) 
print("F1:", f1)

13560 7251 796 1306
Accuracy: 0.6488019901366037
Precision: 0.9445528002229033
Recall: 0.6515784921435779
Specificity: 0.621313035204567
F1: 0.7711775243836552


# ROS

In [None]:
lr_ros = LogisticRegression(C=1, solver='liblinear', random_state=7)
lr_ros.fit(X_train_ros_resampled, y_train_ros_resampled)
print(lr_ros.coef_)
print(lr_ros.intercept_)

[[0.70648937 0.48547155 0.08827639 0.6064858  0.82844671 0.89045982
  0.5315791  0.89046923 0.63633696 0.12027072 0.7327792  1.11875963
  0.76256482 0.90786405]]
[-0.00543027]


In [None]:
lr_ros_pred = lr_ros.predict(X_test)
cm = confusion_matrix(y_test, lr_ros_pred, labels=[0,1])
df_cm = pd.DataFrame(cm, index=['Good', 'Default'], columns=['Predict Good', 'Predict Default'])
print(df_cm)

         Predict Good  Predict Default
Good            13547             7264
Default           804             1298


In [None]:
#import accuracy score from sklearn
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score, confusion_matrix
tp, fn, fp, tn = confusion_matrix(y_test, lr_ros_pred, labels=[0,1]).ravel()
print(tp,fn,fp,tn)

accuracy = (tp + tn) / (tp + tn + fp + fn)
precision = tp / (tp + fp) #Given that we predict positive, how often is it really positive
recall = tp / (tp + fn) #How good at predicting positive
f1 = 2 * (precision * recall) / (precision + recall)
specificity = tn / (tn + fp) # How good at predicting negative?

print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("Specificity:", specificity) 
print("F1:", f1)

13547 7264 804 1298
Accuracy: 0.6478854798585956
Precision: 0.9439760295449794
Recall: 0.6509538224977176
Specificity: 0.6175071360608944
F1: 0.7705477504123771


# Smote

In [None]:
lr_smote = LogisticRegression(C=1, solver='liblinear', random_state=7)
lr_smote.fit(X_train_smote_resampled, y_train_smote_resampled)
print(lr_smote.coef_)
print(lr_smote.intercept_)

[[ 0.72667343  0.70091772  0.39426325  0.6422244   0.99157793  0.88773873
   0.64308847  0.95449769  0.78345818 -1.05379915  0.61250193  1.63995605
   0.83115168  0.63239972]]
[-0.01549624]


In [None]:
y_smote_pred = lr_smote.predict(X_test)
cm = confusion_matrix(y_test, y_smote_pred, labels=[0,1])
df_cm = pd.DataFrame(cm, index=['Good', 'Default'], columns=['Predict Good', 'Predict Default'])
print(df_cm)

         Predict Good  Predict Default
Good            13509             7302
Default           797             1305


In [None]:
#import accuracy score from sklearn
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score, confusion_matrix
tp, fn, fp, tn = confusion_matrix(y_test, y_smote_pred, labels=[0,1]).ravel()
print(tp,fn,fp,tn)

accuracy = (tp + tn) / (tp + tn + fp + fn)
precision = tp / (tp + fp) #Given that we predict positive, how often is it really positive
recall = tp / (tp + fn) #How good at predicting positive
f1 = 2 * (precision * recall) / (precision + recall)
specificity = tn / (tn + fp) # How good at predicting negative?

print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("Specificity:", specificity) 
print("F1:", f1)

13509 7302 797 1305
Accuracy: 0.6465325361148693
Precision: 0.9442891094645604
Recall: 0.6491278650713564
Specificity: 0.620837297811608
F1: 0.7693709599339351


In [None]:
#import accuracy score from sklearn
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score, confusion_matrix
def calculate(y_test,y_pred):
    tp, fn, fp, tn = confusion_matrix(y_test, y_pred, labels=[0,1]).ravel()
    return tn/(tn+fp)

# Original dataset tuning

In [None]:
numbers = []
for i in range(1, 200):
    numbers.append(i / 100)

best_c = 1
spec = 0.6189343482397717

In [None]:
for i in numbers:
    lr = LogisticRegression(random_state=7,C= i, solver= 'liblinear', class_weight='balanced')
    lr.fit(X_train, y_train)
    y_pred = lr.predict(X_test)
    res = calculate(y_test,y_pred)
    if res > spec:
        best_c=i
        spec=res

In [None]:
print(best_c)

1


# RUS Tuning

In [None]:
numbers = []
for i in range(1, 200):
    numbers.append(i / 100)

best_c = 1
spec = 0.621313035204567

In [None]:
for i in numbers:
    lr = LogisticRegression(random_state=7,C= i, solver= 'liblinear')
    lr.fit(X_train_rus_resampled, y_train_rus_resampled)
    y_pred = lr.predict(X_test)
    res = calculate(y_test,y_pred)
    if res > spec:
        best_c=i
        spec=res

In [None]:
print(best_c)

1.14


In [None]:
lr_rus = LogisticRegression(C=1, solver='liblinear', random_state=7)
lr_rus.fit(X_train_rus_resampled, y_train_rus_resampled)
print(lr_rus.coef_)
print(lr_rus.intercept_)


In [None]:
y_rus_pred = lr_rus.predict(X_test)
cm = confusion_matrix(y_test, y_rus_pred, labels=[0,1])
df_cm = pd.DataFrame(cm, index=['Good', 'Default'], columns=['Predict Good', 'Predict Default'])
print(df_cm)

In [None]:
#import accuracy score from sklearn
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score, confusion_matrix
tp, fn, fp, tn = confusion_matrix(y_test, y_rus_pred, labels=[0,1]).ravel()
print(tp,fn,fp,tn)

accuracy = (tp + tn) / (tp + tn + fp + fn)
precision = tp / (tp + fp) #Given that we predict positive, how often is it really positive
recall = tp / (tp + fn) #How good at predicting positive
f1 = 2 * (precision * recall) / (precision + recall)
specificity = tn / (tn + fp) # How good at predicting negative?

print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("Specificity:", specificity) 
print("F1:", f1)

# ROS Tuning

In [None]:
numbers = []
for i in range(1, 200):
    numbers.append(i / 100)

best_c = 1
spec = 0.6175071360608944

In [None]:
for i in numbers:
    lr = LogisticRegression(random_state=7,C= i, solver= 'liblinear')
    lr.fit(X_train_ros_resampled, y_train_ros_resampled)
    y_pred = lr.predict(X_test)
    res = calculate(y_test,y_pred)
    if res > spec:
        best_c=i
        spec=res

In [None]:
print(best_c)

0.21


In [None]:
lr = LogisticRegression(random_state=7,C= 0.21, solver= 'liblinear')
lr.fit(X_train_ros_resampled, y_train_ros_resampled)

In [None]:
lr_ros_pred = lr_ros.predict(X_test)
cm = confusion_matrix(y_test, lr_ros_pred, labels=[0,1])
df_cm = pd.DataFrame(cm, index=['Good', 'Default'], columns=['Predict Good', 'Predict Default'])
print(df_cm)

         Predict Good  Predict Default
Good            13547             7264
Default           804             1298


In [None]:
#import accuracy score from sklearn
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score, confusion_matrix
tp, fn, fp, tn = confusion_matrix(y_test, lr_ros_pred, labels=[0,1]).ravel()
print(tp,fn,fp,tn)

accuracy = (tp + tn) / (tp + tn + fp + fn)
precision = tp / (tp + fp) #Given that we predict positive, how often is it really positive
recall = tp / (tp + fn) #How good at predicting positive
f1 = 2 * (precision * recall) / (precision + recall)
specificity = tn / (tn + fp) # How good at predicting negative?

print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("Specificity:", specificity) 
print("F1:", f1)

13547 7264 804 1298
Accuracy: 0.6478854798585956
Precision: 0.9439760295449794
Recall: 0.6509538224977176
Specificity: 0.6175071360608944
F1: 0.7705477504123771


# SMOTE Tuning

In [None]:
numbers = []
for i in range(1, 200):
    numbers.append(i / 100)

best_c = 1
spec = 0.620837297811608

In [None]:
for i in numbers:
    lr_smote = LogisticRegression(C=1, solver='liblinear', random_state=7)
    lr_smote.fit(X_train_smote_resampled, y_train_smote_resampled)
    y_pred = lr_smote.predict(X_test)
    res = calculate(y_test,y_pred)
    if res > spec:
        best_c=i
        spec=res

In [None]:
print(best_c)

1


In [None]:
#create a logistic regression model object
lr_smote = LogisticRegression(C=1, solver='liblinear', random_state=7)
lr_smote.fit(X_train_smote_resampled, y_train_smote_resampled)
print(lr.coef_)
print(lr.intercept_)

[[0.70566543 0.49478577 0.08798268 0.60388034 0.80621461 0.89021128
  0.52970181 0.88939432 0.63200623 0.13085229 0.73092283 1.09493738
  0.7591641  0.90077856]]
[-0.00541043]


In [None]:
y_smote_pred = lr_smote.predict(X_test)
cm = confusion_matrix(y_test, y_smote_pred, labels=[0,1])
df_cm = pd.DataFrame(cm, index=['Good', 'Default'], columns=['Predict Good', 'Predict Default'])
print(df_cm)

         Predict Good  Predict Default
Good            13509             7302
Default           797             1305


In [None]:
#import accuracy score from sklearn
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score, confusion_matrix
tp, fn, fp, tn = confusion_matrix(y_test, y_smote_pred, labels=[0,1]).ravel()
print(tp,fn,fp,tn)

accuracy = (tp + tn) / (tp + tn + fp + fn)
precision = tp / (tp + fp) #Given that we predict positive, how often is it really positive
recall = tp / (tp + fn) #How good at predicting positive
f1 = 2 * (precision * recall) / (precision + recall)
specificity = tn / (tn + fp) # How good at predicting negative?

print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("Specificity:", specificity) 
print("F1:", f1)

13509 7302 797 1305
Accuracy: 0.6465325361148693
Precision: 0.9442891094645604
Recall: 0.6491278650713564
Specificity: 0.620837297811608
F1: 0.7693709599339351


BEST SMOTE
<br> 
Accuracy: 0.6465325361148693 <br> 
Precision: 0.9442891094645604 <br> 
Recall: 0.6491278650713564 <br> 
Specificity: 0.620837297811608 <br> 
F1: 0.7693709599339351 <br> 

<br>
    
BEST ROS
<br> 
Accuracy: 0.6478854798585956 <br> 
Precision: 0.9439760295449794 <br> 
Recall: 0.6509538224977176<br> 
Specificity: 0.6175071360608944<br> 
F1: 0.7705477504123771<br> 

<br> 
    
BEST RUS
<br> 
Accuracy: 0.6465325361148693<br> 
Precision: 0.9442891094645604<br>
Recall: 0.6491278650713564<br> 
Specificity: 0.620837297811608<br> 
F1: 0.7693709599339351

ORIGINAL DATASET 
<br>
Accuracy: 0.6485837734037446<br>
Precision: 0.9442239398370587<br>
Recall: 0.6515784921435779<br>
Specificity: 0.6189343482397717<br>
F1: 0.7710678949164108<br>

Since all methods had not much difference, we will still use the original dataset that is not balanced.