In [74]:
# import packages
#import warnings
#warnings.filterwarnings('ignore')

import pandas as pd
import numpy as np

import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.linear_model import LogisticRegression

from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score
from sklearn.metrics import confusion_matrix, classification_report, roc_curve, roc_auc_score
from sklearn.metrics import precision_recall_curve, auc, average_precision_score

In [76]:
# Create a working directory
import os
print(os.getcwd())

# Set the working directory
os.chdir('/Users/surajgurung/Library/CloudStorage/OneDrive-UniversityofFlorida/AI&FINtech/CreditApplication')

# list the name of variables located their
print(os.listdir())

/Users/surajgurung/Library/CloudStorage/OneDrive-UniversityofFlorida/AI&FINtech/CreditApplication
['.DS_Store', 'anaconda_projects', 'LendingClub_Decision_Trees_Training_Set.xlsx', 'Logistic_Regression.ipynb', '.ipynb_checkpoints', 'Decision_trees.ipynb', '.git', 'LendingClub_Decision_Trees_Test_Set.xlsx', 'LendingClub_Decision_Trees_Validation_Set.xlsx']


In [78]:
# Load the dataset
train = pd.read_excel('/Users/surajgurung/Library/CloudStorage/OneDrive-UniversityofFlorida/AI&FINtech/CreditApplication/LendingClub_Decision_Trees_Training_Set.xlsx')
validation=pd.read_excel('/Users/surajgurung/Library/CloudStorage/OneDrive-UniversityofFlorida/AI&FINtech/CreditApplication/LendingClub_Decision_Trees_Validation_Set.xlsx')
test = pd.read_excel('/Users/surajgurung/Library/CloudStorage/OneDrive-UniversityofFlorida/AI&FINtech/CreditApplication/LendingClub_Decision_Trees_Test_Set.xlsx')

In [80]:
# Note: 1: Good and 0= default
print(train.head())
print("--------------")
print(validation.head())
print("--------------")
print(test.head())
print("--------------")

#To check the name of the variables 
test.columns

   home_ownership    income    dti  fico  loan_status
0               1   44304.0  18.47   690            0
1               0   50000.0  29.62   735            1
2               0   64400.0  16.68   675            1
3               0   38500.0  33.73   660            0
4               1  118000.0  26.66   665            1
--------------
   homw_ownership    income    dti  fico  loan_status
0               0   25000.0  27.60   660            0
1               0   50000.0  21.51   715            1
2               1  100000.0   8.14   770            1
3               0   75000.0   1.76   685            0
4               1   78000.0  16.11   680            1
--------------
   home_ownership    income    dti  fico  loan_status
0               1   52400.0  24.64   665            1
1               1  150000.0  17.04   785            1
2               1  100000.0  20.92   710            1
3               0   97000.0  13.11   705            1
4               1  100000.0  24.08   685            

Index(['home_ownership', 'income', 'dti', 'fico', 'loan_status'], dtype='object')

The data has already been split into training set, validation set and test set.

In [82]:
# remove target column to create feature only dataset
X_train = train.drop('loan_status', axis=1)
X_val=validation.drop('loan_status', axis=1)
X_test = test.drop('loan_status', axis=1)

# Scale data using the mean and standard deviation of the training set. 
# This is not necessary for the simple logistic regression we will do here 
# but should be done if L1 or L2 regrularization is carried out
X_test=(X_test-X_train.mean())/X_train.std()
X_val=(X_val-X_train.mean())/X_train.std()
X_train=(X_train-X_train.mean())/X_train.std()

# store target column as y-variables 
y_train = train['loan_status']
y_val=validation['loan_status']
y_test = test['loan_status']

#print first five instances for each data set

print(X_train.head())
print("--------------------------------")
print(X_val.head())
print("--------------------------------")
print(X_test.head())

   home_ownership    income       dti      fico
0        0.809651 -0.556232  0.053102 -0.163701
1       -1.234923 -0.451393  1.307386  1.262539
2       -1.234923 -0.186349 -0.148259 -0.639114
3       -1.234923 -0.663060  1.769728 -1.114527
4        0.809651  0.800204  0.974410 -0.956056
--------------------------------
        dti      fico  home_ownership  homw_ownership    income
0  1.080153 -1.114527             NaN             NaN -0.911538
1  0.395077  0.628655             NaN             NaN -0.451393
2 -1.108940  2.371837             NaN             NaN  0.468899
3 -1.826638 -0.322172             NaN             NaN  0.008753
4 -0.212379 -0.480643             NaN             NaN  0.063971
--------------------------------
   home_ownership    income       dti      fico
0        0.809651 -0.407219  0.747177 -0.956056
1        0.809651  1.389190 -0.107762  2.847250
2        0.809651  0.468899  0.328707  0.470184
3       -1.234923  0.413681 -0.549855  0.311713
4        0.809651  0.4

In [84]:
print(X_train.shape, y_train.shape, X_val.shape,y_val.shape, X_test.shape, y_test.shape)
X_train.columns

(7000, 4) (7000,) (3000, 5) (3000,) (2290, 4) (2290,)


Index(['home_ownership', 'income', 'dti', 'fico'], dtype='object')

In [86]:
freq = y_train.value_counts()           # count frequency of different classes in training set
freq/sum(freq)*100                      # get percentage of above

loan_status
1    79.171429
0    20.828571
Name: count, dtype: float64

This shows that 79.17% do not default on their loan, whereas 20.82% actually default on their loan amount.

In [90]:
#Create an ionstance of logisticregression named lgstc_reg 

lgstc_reg =  LogisticRegression(penalty=None,solver="newton-cg")     # Penalty none mean no regularization

# Fit logististic regression to training set

lgstc_reg.fit(X_train, y_train)                                        # fit training data on logistic regression 


print(lgstc_reg.intercept_, lgstc_reg.coef_)     # get the coefficients of each features

[1.41622043] [[ 0.14529381  0.03361951 -0.32404237  0.363174  ]]


These results show that we have a bias of 1.416 and coefficients are 0.145, 0.034, -0.32, and 0.363, respectively, for the 4 features.
Now we will test the model with the validation set.