### Import Libraries

In [1]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

### The Data
##### Let's start by reading the cr_loan_clean.csv file into a pandas DataFrame.

In [2]:
# Read the cleaned credit-loan CSV into a DataFrame.
# NOTE(review): the path is absolute and user-specific; consider a
# configurable data directory so the notebook can run elsewhere.
cr_loan_clean = pd.read_csv(
    filepath_or_buffer='/Users/batch/Desktop/python/study/cr_loan_clean.csv'
)

#### One-hot encode the non-numeric columns
##### Create two data sets for numeric and non-numeric data

In [3]:
# Split the frame by dtype so the object-typed columns can be encoded separately.
# Everything that is NOT object dtype:
cred_num = cr_loan_clean.select_dtypes(exclude='object')
# Only the object-dtype columns:
cred_str = cr_loan_clean.select_dtypes(include='object')

##### One-hot encode the non-numeric columns

In [4]:
# Expand each object column into one indicator column per distinct value
# (columns are named "<column>_<value>", as seen in the printed column list).
cred_str_onehot = pd.get_dummies(data=cred_str)

##### Concatenate the one-hot encoded columns with the numeric ones

In [5]:
# Place the non-object columns and the indicator columns side by side
# (column-wise concatenation, rows matched on the shared index).
cr_loan_modeling = pd.concat(
    objs=[cred_num, cred_str_onehot],
    axis='columns',
)

In [6]:
# Show the column labels of the modeling frame to confirm the encoded
# indicator columns were appended next to the original non-object ones.
print(cr_loan_modeling.columns)

Index(['person_age', 'person_income', 'person_emp_length', 'loan_amnt',
       'loan_int_rate', 'loan_status', 'loan_percent_income',
       'cb_person_cred_hist_length', 'person_home_ownership_MORTGAGE',
       'person_home_ownership_OTHER', 'person_home_ownership_OWN',
       'person_home_ownership_RENT', 'loan_intent_DEBTCONSOLIDATION',
       'loan_intent_EDUCATION', 'loan_intent_HOMEIMPROVEMENT',
       'loan_intent_MEDICAL', 'loan_intent_PERSONAL', 'loan_intent_VENTURE',
       'loan_grade_A', 'loan_grade_B', 'loan_grade_C', 'loan_grade_D',
       'loan_grade_E', 'loan_grade_F', 'loan_grade_G',
       'cb_person_default_on_file_N', 'cb_person_default_on_file_Y'],
      dtype='object')


### Create the training and test sets

In [7]:
# Features: every column except the target.
X = cr_loan_modeling.drop(columns='loan_status')
# Target: kept as a one-column DataFrame (note the double brackets).
y = cr_loan_modeling[['loan_status']]
# Hold out 40% of the rows for testing; the fixed random_state makes the
# shuffle repeatable between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.4,
    random_state=123,
)

### Train the logistic regression model on the training data

In [None]:
# Build the classifier, then fit it on the training split.
# NOTE(review): max_iter is presumably set high so lbfgs can converge on
# these unscaled features — confirm.
logistic = LogisticRegression(solver='lbfgs', max_iter=20000)
# y_train is a one-column DataFrame; np.ravel flattens it to the 1-D
# array that fit() expects for the target.
logistic.fit(X_train, np.ravel(y_train))

### Print the coefficients of the model

In [15]:
# Fitted coefficients, one per feature column of X_train (same order as
# X_train.columns), returned as a 2-D array with a single row.
print(logistic.coef_)

[[-2.76608810e-03 -1.32821072e-06 -1.31054624e-02 -7.98602709e-05
   7.15810051e-02  1.14013214e+01 -6.84688443e-03 -2.40858686e-01
   2.40267435e-01 -1.68697005e+00  5.92593955e-01  3.22120128e-01
  -5.36806400e-01  2.67222247e-01  7.91367311e-02 -3.20560505e-01
  -9.06079655e-01 -2.04103760e+00 -1.92299735e+00 -1.77600441e+00
   2.42143942e-01  3.81973020e-01  2.06095474e+00  1.96000043e+00
  -5.83036776e-01 -5.11930811e-01]]


### Print the accuracy score of the model

In [18]:
# Mean accuracy of the fitted classifier on the held-out test split.
print(logistic.score(X=X_test, y=y_test))

0.8659198913781398


### Create predictions of probability for loan status using test data

In [None]:
# Per-row class probabilities for the test split: one column per class,
# ordered as in logistic.classes_. Column 1 is later used as the
# probability of default.
prediction = logistic.predict_proba(X=X_test)
print(prediction)

### Create a dataframe for the probabilities of default

In [31]:
# Keep only the second probability column and wrap it in a DataFrame.
# The frame gets a fresh 0..n-1 index, so it lines up with y_test by
# position only, not by index label.
prediction_df = pd.DataFrame({'prob_default': prediction[:, 1]})
print(prediction_df)

       prob_default
0          0.630738
1          0.818450
2          0.091794
3          0.067427
4          0.053204
...             ...
11779      0.012176
11780      0.088304
11781      0.467677
11782      0.217006
11783      0.054784

[11784 rows x 1 columns]


### Threshold
##### Set the threshold for defaults to 0.5

In [27]:
# Turn probabilities into hard labels: 1 (default) when the predicted
# probability is strictly greater than 0.5, otherwise 0.
# A vectorised comparison replaces the row-by-row apply/lambda; the
# resulting labels and int64 dtype are the same.
prediction_df["loan_status"] = prediction_df["prob_default"].gt(0.5).astype("int64")
print(prediction_df)

       prob_default  loan_status
0          0.630738            1
1          0.818450            1
2          0.091794            0
3          0.067427            0
4          0.053204            0
...             ...          ...
11779      0.012176            0
11780      0.088304            0
11781      0.467677            0
11782      0.217006            0
11783      0.054784            0

[11784 rows x 2 columns]


##### Print the confusion matrix

In [28]:
# Confusion matrix for the current labels: rows are the true classes,
# columns the predicted classes (0 first, then 1). The two inputs are
# compared by position.
print(
    confusion_matrix(
        y_true=y_test,
        y_pred=prediction_df['loan_status'],
    )
)

[[8776  422]
 [1158 1428]]


##### Set the threshold for defaults to 0.4 and print the confusion matrix

In [24]:
# Re-label with a lower cut-off: 1 (default) when the predicted
# probability is strictly greater than 0.4, otherwise 0.
# This overwrites the 'loan_status' column written by the 0.5-threshold
# cell, so every later cell sees the 0.4 labels when run top to bottom.
# A vectorised comparison replaces the row-by-row apply/lambda; the
# resulting labels and int64 dtype are the same.
prediction_df["loan_status"] = prediction_df["prob_default"].gt(0.4).astype("int64")
# Rows are the true classes, columns the predicted classes.
print(confusion_matrix(y_test, prediction_df["loan_status"]))

[[8524  674]
 [ 918 1668]]


#### Therefore, we choose the 0.4 threshold: its recall for the Default class (1668 / 2586 ≈ 0.65) is higher than at the 0.5 threshold (1428 / 2586 ≈ 0.55).

### Print the row counts for each loan status

In [29]:
# Count how many test rows carry each predicted label in 'loan_status'.
# NOTE(review): the recorded output (1850 predicted defaults) equals
# 422 + 1428 from the 0.5-threshold confusion matrix, not 674 + 1668 = 2342
# from the 0.4 one, so this cell appears to have been run before the
# 0.4 cell. Re-run the notebook top to bottom to refresh it.
print(prediction_df["loan_status"].value_counts())

loan_status
0    9934
1    1850
Name: count, dtype: int64


### Print the classification report

In [30]:
# Display names for the two classes, in label order (0 first, then 1).
target_names = ['Non-Default', 'Default']
# Per-class precision / recall / F1 for the labels currently stored in
# prediction_df['loan_status'].
# NOTE(review): the recorded Default recall of 0.55 matches the
# 0.5-threshold matrix (1428 / 2586), not the 0.4 one (1668 / 2586).
# Re-run the notebook top to bottom to refresh the output.
print(
    classification_report(
        y_true=y_test,
        y_pred=prediction_df['loan_status'],
        target_names=target_names,
    )
)

              precision    recall  f1-score   support

 Non-Default       0.88      0.95      0.92      9198
     Default       0.77      0.55      0.64      2586

    accuracy                           0.87     11784
   macro avg       0.83      0.75      0.78     11784
weighted avg       0.86      0.87      0.86     11784

