In [1]:
# Initial imports
import numpy as np
import pandas as pd
from path import Path
from sklearn import tree
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report

In [2]:
# Read the pre-encoded loans dataset and preview its first five rows.
file_path = Path("./Resources/loans_data_encoded.csv")
df_loans = pd.read_csv(filepath_or_buffer=file_path)
df_loans.head(n=5)

Unnamed: 0,amount,term,age,bad,month_num,education_Bachelor,education_High School or Below,education_Master or Above,education_college,gender_female,gender_male
0,1000,30,45,0,6,0,1,0,0,0,1
1,1000,30,50,0,7,1,0,0,0,1,0
2,1000,30,33,0,8,1,0,0,0,1,0
3,1000,15,27,0,9,0,0,0,1,0,1
4,1000,30,28,0,10,0,0,0,1,1,0


# Preprocessing the data

In [6]:
# Features (X): every column except the target "bad" (0 = good loan, 1 = bad loan).
# drop() returns a new DataFrame, so df_loans itself is left unchanged.
X = df_loans.drop(columns="bad")
X.head()

Unnamed: 0,amount,term,age,month_num,education_Bachelor,education_High School or Below,education_Master or Above,education_college,gender_female,gender_male
0,1000,30,45,6,0,1,0,0,0,1
1,1000,30,50,7,1,0,0,0,1,0
2,1000,30,33,8,1,0,0,0,1,0
3,1000,15,27,9,0,0,0,1,0,1
4,1000,30,28,10,0,0,0,1,1,0


In [7]:
# Target (y): the "bad" column as a NumPy array; peek at the first five labels.
y = df_loans["bad"].to_numpy()
y[:5]

array([0, 0, 0, 0, 0], dtype=int64)

In [10]:
# Hold out a test set. No size is given, so the default 75% / 25% split applies;
# random_state pins the shuffle so the same rows land in each split on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=78,
)

In [11]:
# Confirm the sizes of the default split (75% train vs. 25% test), one shape per line.
print(*(part.shape for part in (X_train, X_test, y_train, y_test)), sep="\n")

(375, 10)
(125, 10)
(375,)
(125,)


In [12]:
# To choose the proportions manually, pass train_size:
# here 80% of the rows go to training and the remaining 20% to testing.
X_train2, X_test2, y_train2, y_test2 = train_test_split(
    X,
    y,
    train_size=0.80,
    random_state=78,
)


In [13]:
# Confirm the sizes of the manual 80% / 20% split, one shape per line.
print(*(part.shape for part in (X_train2, X_test2, y_train2, y_test2)), sep="\n")

(400, 10)
(100, 10)
(400,)
(100,)


Create a StandardScaler instance as before, fit the instance, scaler, with the training data, and then scale the features with the transform() method:

In [25]:
# Standardise the features using statistics learned from the training split only.
scaler = StandardScaler()

# fit_transform() learns each column's mean and standard deviation from X_train
# and scales X_train in the same step. fit() returns the scaler itself, so
# X_scaler is simply another name for the fitted scaler.
X_train_scaled = scaler.fit_transform(X_train)
X_scaler = scaler

# Scale the test split with the training-set mean and standard deviation.
X_test_scaled = X_scaler.transform(X_test)

In [26]:
# Sanity-check the scaling on the first feature column.
# The training column should come out with mean ~0 and standard deviation 1.
# The test column is only close to those values, because it is scaled with the
# mean and standard deviation learned from the training data, not its own.
print(np.mean(X_train_scaled[:, 0]))
print(np.mean(X_test_scaled[:, 0]))
print(np.std(X_train_scaled[:, 0]))
print(np.std(X_test_scaled[:, 0]))

3.931669804539221e-16
0.08040483006321758
1.0
0.8450480061575104


# Fit Decision Tree Model and Make the predictions

In [27]:
# Creating the decision tree classifier instance.
# random_state is fixed (same seed as the train/test split) because the tree
# considers features in a random order at each split; without a seed, re-running
# this cell can build a different tree and change every metric reported below.
model = tree.DecisionTreeClassifier(random_state=78)
# Fitting the model on the scaled training data.
model = model.fit(X_train_scaled, y_train)

In [30]:
# Predict a class label (0 = good loan, 1 = bad loan) for every row of the scaled test set.
predictions = model.predict(X=X_test_scaled)
predictions

array([1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 1, 1, 0, 1,
       1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1,
       0, 0, 1, 1, 0, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 0, 1,
       1, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0,
       0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0,
       0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 0], dtype=int64)

# Evaluate the Model

In [32]:
# Confusion matrix: rows are the actual classes, columns are the predicted classes.
cm = confusion_matrix(y_true=y_test, y_pred=predictions)
cm

array([[52, 32],
       [23, 18]], dtype=int64)

In [33]:
# Wrap the confusion matrix in a labelled DataFrame so rows and columns are self-explanatory.
cm_df = pd.DataFrame(
    data=cm,
    index=["Actual 0", "Actual 1"],
    columns=["Predicted 0", "Predicted 1"],
)

cm_df

Unnamed: 0,Predicted 0,Predicted 1
Actual 0,52,32
Actual 1,23,18


Out of 84 good loan applications (Actual 0), 52 were predicted to be good (Predicted 0), which we call true positives.
Out of 84 good loan applications (Actual 0), 32 were predicted to be bad (Predicted 1), which are considered false negatives.
Out of 41 bad loan applications (Actual 1), 23 were predicted to be good (Predicted 0) and are considered false positives.
Out of 41 bad loan applications (Actual 1), 18 were predicted to be bad (Predicted 1) and are considered true negatives.

There are a total of 41 (23 + 18) bad loans. Eighteen out of 41 were classified as bad loans. Therefore, the recall for bad loans is 0.44 or 44%.

In [35]:
# Accuracy: the fraction of test rows whose predicted label matches the actual label.
acc_score = accuracy_score(y_true=y_test, y_pred=predictions)
acc_score

0.56

(True Positives (TP) + True Negatives (TN)) / Total = (52 + 18)/125 = 0.56

In [37]:
# Summarise the evaluation in one place: confusion matrix, accuracy, per-class metrics.
print("Confusion Matrix")
display(cm_df)
print(f"Accuracy Score : {acc_score}")
print("Classification Report", classification_report(y_test, predictions), sep="\n")

Confusion Matrix


Unnamed: 0,Predicted 0,Predicted 1
Actual 0,52,32
Actual 1,23,18


Accuracy Score : 0.56
Classification Report
              precision    recall  f1-score   support

           0       0.69      0.62      0.65        84
           1       0.36      0.44      0.40        41

    accuracy                           0.56       125
   macro avg       0.53      0.53      0.52       125
weighted avg       0.58      0.56      0.57       125



Accuracy = (TP + TN) / total observations => (52 + 18)/125 => 0.56
Precision for good loans = TP / (TP + FP) => 52/(52 + 23) => 0.69

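Precision for bad loans = TN / (FN + TN) => 18/(18 + 32) => 0.36. This low value means that many of the applications flagged as bad were actually good: of the 50 loan applications we predicted to be bad, 32 were actually good loan applications (false positives when bad loans are taken as the positive class).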

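Sensitivity (recall)
Recall: Actual 0: TP / (TP + FN) => 52/(52 + 32) => 0.62 for good loans.
Recall: Actual 1: TN / (TN + FP) => 18/(18 + 23) => 0.44 for bad loans. This low score means that many bad loans go undetected: 23 of the 41 bad loans were predicted to be good (false negatives when bad loans are taken as the positive class).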

F1 score: the harmonic mean of precision and recall. It ranges between 0 and 1; 0 is the worst and 1 is the best.

Support: the number of actual occurrences of each class in the test set => 52 + 32 = 84 for good loans and 23 + 18 = 41 for bad loans.