In [29]:
# Import the modules
import numpy as np
import pandas as pd
from pathlib import Path
from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import confusion_matrix
from imblearn.metrics import classification_report_imbalanced

import warnings
warnings.filterwarnings('ignore')

---

In [30]:
# Load the lending data CSV from the Resources folder into a Pandas DataFrame
loans_df = pd.read_csv(Path('Resources/lending_data.csv'))

# Preview the first and the last rows of the DataFrame
display(loans_df.head(), loans_df.tail())

Unnamed: 0,loan_size,interest_rate,borrower_income,debt_to_income,num_of_accounts,derogatory_marks,total_debt,loan_status
0,10700.0,7.672,52800,0.431818,5,1,22800,0
1,8400.0,6.692,43600,0.311927,3,0,13600,0
2,9000.0,6.963,46100,0.349241,3,0,16100,0
3,10700.0,7.664,52700,0.43074,5,1,22700,0
4,10800.0,7.698,53000,0.433962,5,1,23000,0


Unnamed: 0,loan_size,interest_rate,borrower_income,debt_to_income,num_of_accounts,derogatory_marks,total_debt,loan_status
77531,19100.0,11.261,86600,0.65358,12,2,56600,1
77532,17700.0,10.662,80900,0.629172,11,2,50900,1
77533,17600.0,10.595,80300,0.626401,11,2,50300,1
77534,16300.0,10.068,75300,0.601594,10,2,45300,1
77535,15600.0,9.742,72300,0.585062,9,2,42300,1


In [31]:
# Split the DataFrame into the labels (y) and the features (X)
# y: the loan_status column
y = loans_df['loan_status']

# X: every column except loan_status
X = loans_df.drop(columns='loan_status')

In [32]:
# Preview the first and the last entries of the y (labels) Series
display(y.head(), y.tail())

0    0
1    0
2    0
3    0
4    0
Name: loan_status, dtype: int64

77531    1
77532    1
77533    1
77534    1
77535    1
Name: loan_status, dtype: int64

In [33]:
# Preview the first and the last rows of the X (features) DataFrame
display(X.head(), X.tail())

Unnamed: 0,loan_size,interest_rate,borrower_income,debt_to_income,num_of_accounts,derogatory_marks,total_debt
0,10700.0,7.672,52800,0.431818,5,1,22800
1,8400.0,6.692,43600,0.311927,3,0,13600
2,9000.0,6.963,46100,0.349241,3,0,16100
3,10700.0,7.664,52700,0.43074,5,1,22700
4,10800.0,7.698,53000,0.433962,5,1,23000


Unnamed: 0,loan_size,interest_rate,borrower_income,debt_to_income,num_of_accounts,derogatory_marks,total_debt
77531,19100.0,11.261,86600,0.65358,12,2,56600
77532,17700.0,10.662,80900,0.629172,11,2,50900
77533,17600.0,10.595,80300,0.626401,11,2,50300
77534,16300.0,10.068,75300,0.601594,10,2,45300
77535,15600.0,9.742,72300,0.585062,9,2,42300


In [34]:
# Check the balance of the target values: count the rows for each
# loan_status label
y.value_counts()

0    75036
1     2500
Name: loan_status, dtype: int64

In [35]:
# Import the train_test_split function
from sklearn.model_selection import train_test_split

# Split the features and labels into training and testing sets
# Assign a random_state of 1 to the function
# NOTE(review): the split is not stratified; given the class imbalance in
# loan_status, consider passing stratify=y (doing so would change the
# recorded results in the cells that follow)
train_X, test_X, train_y, test_y = train_test_split(X, y, random_state=1)

---

In [36]:
# Import the LogisticRegression class from scikit-learn
from sklearn.linear_model import LogisticRegression

# Instantiate the Logistic Regression model
# Assign a random_state parameter of 1 to the model
logistic_regression_model = LogisticRegression(random_state=1)

# Fit the model using the training data; fit() is left as the final bare
# expression, so the notebook displays the fitted estimator
# NOTE(review): warnings are silenced at the top of the notebook and the
# features are passed in unscaled, so a convergence warning raised here
# would not be shown - confirm the solver converged
logistic_regression_model.fit(train_X, train_y)

LogisticRegression(random_state=1)

In [37]:
# Predict the labels for the testing features with the fitted model
testing_predictions = logistic_regression_model.predict(test_X)

In [38]:
# Show the balanced accuracy of the model on the testing data
balanced_accuracy_score(y_true=test_y, y_pred=testing_predictions)

0.9520479254722232

In [39]:
# Build the confusion matrix for the model
# (rows are the true labels, columns are the predicted labels)
confusion_matrix(y_true=test_y, y_pred=testing_predictions)

array([[18663,   102],
       [   56,   563]])

In [40]:
# Print the imbalanced classification report for the model
print(classification_report_imbalanced(y_true=test_y, y_pred=testing_predictions))

                   pre       rec       spe        f1       geo       iba       sup

          0       1.00      0.99      0.91      1.00      0.95      0.91     18765
          1       0.85      0.91      0.99      0.88      0.95      0.90       619

avg / total       0.99      0.99      0.91      0.99      0.95      0.91     19384



**Question:** How well does the logistic regression model predict both the `0` (healthy loan) and `1` (high-risk loan) labels?

**Answer:** The model appears to predict both labels well. It predicts healthy loans almost perfectly, and it predicts high-risk loans a little less accurately but still at a high level. Precision, recall, and F1 are high for both classes: healthy loans score 1.00 on precision and F1 and 0.99 on recall, while high-risk loans score 0.85 on precision, 0.91 on recall, and 0.88 on F1. However, because the classes are heavily imbalanced (75,036 healthy loans versus 2,500 high-risk loans), we cannot be sure that these results are reliable rather than skewed by the small number of high-risk loans.

In [41]:
# Import the RandomOverSampler class from imbalanced-learn
from imblearn.over_sampling import RandomOverSampler

# Instantiate the random oversampler model
# Assign a random_state parameter of 1 to the model
random_oversampler = RandomOverSampler(random_state=1)

# Resample the original training data with the random_oversampler model;
# only the training split is passed in, so the testing split is left as-is
X_resampled, y_resampled = random_oversampler.fit_resample(train_X, train_y)

In [42]:
# Count the rows for each label in the resampled training labels
y_resampled.value_counts()

1    56271
0    56271
Name: loan_status, dtype: int64

In [43]:
# Build a second Logistic Regression model (random_state of 1) and train it
# on the resampled training data; fit() returns the estimator itself
new_logistic_regression_model = LogisticRegression(random_state=1).fit(
    X_resampled, y_resampled
)

# Predict the labels for the testing features with the retrained model
oversampled_predictions = new_logistic_regression_model.predict(test_X)

In [44]:
# Show the balanced accuracy of the oversampled model on the testing data
balanced_accuracy_score(y_true=test_y, y_pred=oversampled_predictions)

0.9936781215845847

In [45]:
# Build the confusion matrix for the oversampled model
# (rows are the true labels, columns are the predicted labels)
confusion_matrix(y_true=test_y, y_pred=oversampled_predictions)

array([[18649,   116],
       [    4,   615]])

In [46]:
# Print the imbalanced classification report for the oversampled model
print(classification_report_imbalanced(y_true=test_y, y_pred=oversampled_predictions))

                   pre       rec       spe        f1       geo       iba       sup

          0       1.00      0.99      0.99      1.00      0.99      0.99     18765
          1       0.84      0.99      0.99      0.91      0.99      0.99       619

avg / total       0.99      0.99      0.99      0.99      0.99      0.99     19384



**Question:** How well does the logistic regression model, fit with oversampled data, predict both the `0` (healthy loan) and `1` (high-risk loan) labels?

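**Answer:** Overall, the logistic regression model fit with oversampled data predicts the healthy and high-risk loans better than the model fit with the original data. The original model already had high scores for accuracy, precision, recall, and F1, but the oversampled model raised recall for high-risk loans from 0.91 to 0.99 (missed high-risk loans fell from 56 to 4), their F1 score from 0.88 to 0.91, and balanced accuracy from about 0.952 to about 0.994. The improvement is not across every metric, though: precision for high-risk loans dipped slightly from 0.85 to 0.84 (healthy loans flagged as high-risk rose from 102 to 116). If missing a high-risk loan is more costly than flagging a healthy one, the oversampled model predicts better than the one fit with the original data.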