## 4. Model Evaluation and Implementation

In [10]:
# Import the data-handling, plotting, and modeling libraries used in this notebook
import pandas as pd
import matplotlib as mtlb
import matplotlib.pyplot as plt
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
import sklearn as skl 
from sklearn import metrics
import xgboost as xgb

In [7]:
# Load the cleaned loan data set from the CSV file (path is relative to the working directory)
clean_loan_data = pd.read_csv(filepath_or_buffer='clean_loan_data.csv')

1. Comparing model reports

You've used logistic regression models and gradient boosted trees. It's time to compare these two to see which model will be used to make the final predictions.

One of the easiest first steps for comparing different models' ability to predict the probability of default is to look at their metrics from `classification_report()`. With this, you can see many different scoring metrics side by side for each model. Because the data (and therefore the models trained on it) are normally imbalanced, with few defaults, focus on the metrics for defaults for now.

First, create the X and y data sets.

In [8]:
# Partition the columns by dtype: object-typed columns go to cred_str,
# every other column goes to cred_num
cred_num = clean_loan_data.select_dtypes(exclude='object')
cred_str = clean_loan_data.select_dtypes(include='object')

# Expand each object-typed column into one-hot indicator columns
cred_str_onehot = pd.get_dummies(cred_str)

# Place the indicator columns side by side with the non-object columns
# (rows are matched on the shared index)
cr_loan_prep = pd.concat(
    [cred_num, cred_str_onehot],
    axis='columns',
)

In [13]:
# Features are every prepared column except the target; the target is kept
# as a single-column DataFrame
X = cr_loan_prep.drop(columns=['loan_status'])
y = cr_loan_prep[['loan_status']]

# Hold out 40% of the rows as the test set; the fixed random_state makes the
# split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.4, random_state=123
)

# Both estimators are fitted on a flat 1-D label array
train_labels = np.ravel(y_train)

### GBT

# Fit a gradient boosted tree classifier with its default settings
clf_gbt = xgb.XGBClassifier()
clf_gbt.fit(X_train, train_labels)

# Class probabilities for every test row
gbt_preds = clf_gbt.predict_proba(X_test)

# Keep the second probability column (class 1, used here as the probability
# of default) and keep a reference to the true test labels
preds_df_gbt = pd.DataFrame({'prob_default': gbt_preds[:, 1]})
true_df = y_test

### LR

# Fit a logistic regression on the same training data
clf_logistic = LogisticRegression(solver='lbfgs')
clf_logistic.fit(X_train, train_labels)

# Class probabilities for every test row
preds = clf_logistic.predict_proba(X_test)

# Store the class-1 probability, then flag a loan as a default (1) when that
# probability is strictly greater than the 0.4 threshold, otherwise 0
preds_df_lr = pd.DataFrame({'prob_default': preds[:, 1]})
preds_df_lr['loan_status'] = preds_df_lr['prob_default'].map(
    lambda prob: int(prob > 0.4)
)


In [16]:
# Flag a loan as a default (1) when the gradient boosted tree's probability of
# default is strictly greater than the 0.4 threshold, otherwise 0
preds_df_gbt['loan_status'] = preds_df_gbt['prob_default'].map(
    lambda prob: int(prob > 0.4)
)

In [17]:
# Display names for the two classes in the reports, in label order
target_names = ['Non-Default', 'Default']

# Thresholded predictions to compare: logistic regression first, then the
# gradient boosted tree
model_predictions = (preds_df_lr['loan_status'], preds_df_gbt['loan_status'])

# Print the full classification report for each model
for predicted in model_predictions:
    print(metrics.classification_report(y_test, predicted, target_names=target_names))

# Print the macro-averaged F-1 score for each model. With average='macro' this
# is the unweighted mean of the per-class F-1 scores, not the F-1 score of the
# Default class alone; element [2] of the returned tuple is the F-score.
for predicted in model_predictions:
    scores = metrics.precision_recall_fscore_support(y_test, predicted, average='macro')
    print(scores[2])

              precision    recall  f1-score   support

 Non-Default       0.86      0.92      0.89      9198
     Default       0.62      0.46      0.53      2586

    accuracy                           0.82     11784
   macro avg       0.74      0.69      0.71     11784
weighted avg       0.81      0.82      0.81     11784

              precision    recall  f1-score   support

 Non-Default       0.94      0.98      0.96      9198
     Default       0.91      0.76      0.83      2586

    accuracy                           0.93     11784
   macro avg       0.92      0.87      0.89     11784
weighted avg       0.93      0.93      0.93     11784

0.7108943782814463
0.8936226604864824
