<h3>Model - Logistic Regression</h3>

In [None]:
import numpy as np
import pandas as pd

import cleaning

import seaborn as sns
import matplotlib.pyplot as plt
sns.set_style('whitegrid')

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, roc_curve, auc

%config InlineBackend.figure_format = 'retina'
%matplotlib inline
%load_ext autoreload
%autoreload 2

In [5]:
df = cleaning.clean_raw()

In [7]:
df.head()

Unnamed: 0_level_0,limit_bal,sex,education,marriage,age,pay_0,pay_2,pay_3,pay_4,pay_5,...,pay%_2,pay%_3,pay%_4,pay%_5,arrears_1,arrears_2,arrears_3,arrears_4,arrears_5,arrears_6
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,20000,2,2,1,24,2,2,-1,-1,-2,...,1.0,,,,2,2,-1,-1,-2,-2
2,120000,2,2,2,26,-1,2,0,0,0,...,0.372856,0.305623,0.289436,0.0,-1,2,0,0,0,2
3,90000,2,2,2,34,0,0,0,0,0,...,0.110628,0.069779,0.066899,0.064313,0,0,0,0,0,0
4,50000,2,2,1,37,0,0,0,0,0,...,0.040961,0.042382,0.037985,0.03618,0,0,0,0,0,0
5,50000,1,2,1,57,-1,0,-1,0,0,...,1.023608,0.477555,0.470072,0.036015,-1,0,-1,0,0,0


<h4>Transform Categorical Data</h4>

In [19]:
df.drop(columns=['pay_0','pay_2', 'pay_3', 'pay_4','pay_5','pay_6'], inplace=True)
df.drop(columns=['pay%_1','pay%_2', 'pay%_3', 'pay%_4','pay%_5'], inplace=True)

arr = []
for i in range(1,7):
    arr.append(f'arrears_{i}')

x_feats = ['sex', 'education', 'marriage'] + arr

In [20]:
x_feats

['sex',
 'education',
 'marriage',
 'arrears_1',
 'arrears_2',
 'arrears_3',
 'arrears_4',
 'arrears_5',
 'arrears_6']

In [30]:
X = pd.get_dummies(df, columns=x_feats, drop_first=True)
y = df['default']

In [33]:
# normalise the data

for col in X.columns:
    # Subtract the minimum and divide by the range forcing a scale of 0 to 1 for each feature
    X[col] = (X[col] - min(X[col]))/ (max(X[col]) - min(X[col])) 


In [34]:
X.head()

Unnamed: 0_level_0,limit_bal,age,bill_amt1,bill_amt2,bill_amt3,bill_amt4,bill_amt5,bill_amt6,pay_amt1,pay_amt2,...,arrears_4_2,arrears_4_3,arrears_5_-1,arrears_5_0,arrears_5_2,arrears_5_3,arrears_6_-1,arrears_6_0,arrears_6_2,arrears_6_3
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.010101,0.051724,0.149982,0.069164,0.086723,0.160138,0.080648,0.260979,0.0,0.000409,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.111111,0.086207,0.148892,0.067858,0.087817,0.16322,0.084074,0.263485,0.0,0.000594,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
3,0.080808,0.224138,0.172392,0.079532,0.093789,0.173637,0.09547,0.272928,0.001738,0.000891,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0
4,0.040404,0.275862,0.1881,0.111995,0.113407,0.186809,0.109363,0.283685,0.00229,0.001199,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0
5,0.040404,0.62069,0.154144,0.071601,0.10602,0.179863,0.099633,0.275681,0.00229,0.021779,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0


In [63]:
# data split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [64]:
print(X.shape)
print(X_test.shape)

(29932, 49)
(8980, 49)


In [65]:
# model fitting

logreg = LogisticRegression(fit_intercept=False, C=1e12, solver='liblinear')
# logreg = LogisticRegression()
model_log = logreg.fit(X_train, y_train)
model_log

LogisticRegression(C=1000000000000.0, class_weight=None, dual=False,
                   fit_intercept=False, intercept_scaling=1, l1_ratio=None,
                   max_iter=100, multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='liblinear', tol=0.0001, verbose=0,
                   warm_start=False)

In [None]:
# predict
y_hat_test = logreg.predict(X_test)
y_hat_train = logreg.predict(X_train)

probas = logreg.predict_proba(X_test)

In [67]:
# We could subtract the two columns. If values or equal, difference will be zero. Then count number of zeros 
residuals = np.abs(y_test - y_hat_test)
print(pd.Series(residuals).value_counts())
print(pd.Series(residuals).value_counts(normalize=True))

0    8980
Name: default, dtype: int64
0    1.0
Name: default, dtype: float64


In [None]:
# Calculate accuracy 
acc = accuracy_score(y_test, preds)
print('Accuracy is :{0}'.format(acc))

# Check the AUC for predictions
false_positive_rate, true_positive_rate, thresholds = roc_curve(y_test,probas[:,1])
roc_auc = auc(false_positive_rate,true_positive_rate)
print('\nAUC is :{0}'.format(round(roc_auc, 2)))

# Create and print a confusion matrix 
print('\nConfusion Matrix')
print('----------------')
pd.crosstab(y_test, preds, rownames=['True'], colnames=['Predicted'], margins=True)