In [23]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.metrics import roc_curve,roc_auc_score
from statsmodels.discrete import discrete_model as sm
from statsmodels.tools.tools import add_constant
import matplotlib.pyplot as plt
%matplotlib inline

In [24]:
# Binary dataset
data_default = pd.read_excel("Default.xlsx")

# Lets keep it to 2 variables only

# convert student column to categorical

data_default["default"] = data_default["default"].replace("No",0)
data_default["default"] = data_default["default"].replace("Yes",1)

## Peek into the Data

In [65]:
data_default.head()

Unnamed: 0,default,student,balance,income
1,0,No,729.526495,44361.625074
2,0,Yes,817.180407,12106.1347
3,0,No,1073.549164,31767.138947
4,0,No,529.250605,35704.493935
5,0,No,785.655883,38463.495879


In [25]:
# Quite an imbalanced Dataset
data_default["default"].value_counts()

0    9667
1     333
Name: default, dtype: int64

In [26]:
# split dataset into x and y

y_default = data_default['default']
x_default = data_default.drop(["default","student"],axis =1)

from sklearn.preprocessing import StandardScaler
st = StandardScaler()
x_default = st.fit_transform(x_default)

# split into training and test
from sklearn.model_selection import train_test_split
train_x_def,test_x_def,train_y_def,test_y_def = train_test_split(x_default,y_default,test_size = 0.2,random_state=42)

In [27]:
# Initiate model, fit and predict
log_model = LogisticRegression(C=0.0001)
log_model.fit(train_x_def,train_y_def) # log_model(y_train,x_Train).fit()
prediction_sk = log_model.predict(train_x_def) # predicted classes
prediction_probs = log_model.predict_proba(train_x_def) # probability values

In [28]:
print((classification_report(train_y_def, prediction_sk)))

             precision    recall  f1-score   support

          0       0.97      1.00      0.98      7736
          1       0.00      0.00      0.00       264

avg / total       0.94      0.97      0.95      8000



  'precision', 'predicted', average, warn_for)


## Probability Threshold Adjustment

In [58]:
threshold_45 = []
for i in prediction_probs[:,1]:
    if i>= 0.45:
        threshold_45.append(1)
    else:
        threshold_30.append(0)

In [59]:
print(classification_report(train_y_def,threshold_45))

             precision    recall  f1-score   support

          0       0.97      1.00      0.99      7736
          1       0.81      0.19      0.31       264

avg / total       0.97      0.97      0.96      8000



## Upsampling 

In [31]:
from sklearn.utils import resample 

# Separate majority and minority classes
df_majority = data_default[data_default.default==0]
df_minority = data_default[data_default.default==1]
 
# Upsample minority class
df_minority_upsampled = resample(df_minority, 
                                 replace=True,     # sample with replacement
                                 n_samples=9667,    # to match majority class
                                 random_state=123) # reproducible results
 
# Combine majority class with upsampled minority class
df_upsampled = pd.concat([df_majority, df_minority_upsampled])

In [32]:
df_upsampled.default.value_counts()

1    9667
0    9667
Name: default, dtype: int64

In [33]:
# Re-split into test and train

y_default_up = df_upsampled['default']
x_default_up = df_upsampled.drop(["default","student"],axis =1)

train_x_up,test_x_up,train_y_up,test_y_up = train_test_split(x_default_up,y_default_up,test_size = 0.2,random_state=42)

In [34]:
# Re-fit data to same model

log_model.fit(train_x_up,train_y_up) # log_model(y_train,x_Train).fit()
prediction_up = log_model.predict(train_x_up) # predicted classes
prediction_probs_up = log_model.predict_proba(train_x_up) # probability values

In [35]:
print((classification_report(train_y_up, prediction_up)))

             precision    recall  f1-score   support

          0       0.83      0.67      0.74      7726
          1       0.72      0.86      0.78      7741

avg / total       0.77      0.76      0.76     15467



Overall Accuracy has fallen but much better performance on minority class. 

## SMOTE

In [60]:
from imblearn.over_sampling import SMOTE

In [61]:
sm = SMOTE(ratio='auto', kind='regular')
X_resampled , y_resampled = sm.fit_sample(train_x_def, train_y_def)

In [69]:
len(X_resampled)

15472

In [62]:
#fit model with new data
    
log_model.fit(X_resampled,y_resampled)
prediction_smote = log_model.predict(X_resampled) # predicted classes
prediction_smote_probs = log_model.predict_proba(X_resampled) # probability values


In [64]:
print(classification_report(y_resampled, prediction_smote))

             precision    recall  f1-score   support

          0       0.98      0.62      0.76      7736
          1       0.72      0.99      0.84      7736

avg / total       0.85      0.81      0.80     15472



Much Better!!