In [1]:
# Initial imports
import pandas as pd
from path import Path
from sklearn import tree
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report

In [3]:
# load the data
df_loans = pd.read_csv('Resources/loans_data_encoded.csv')
df_loans.head()

Unnamed: 0,amount,term,age,bad,month_num,education_Bachelor,education_High School or Below,education_Master or Above,education_college,gender_female,gender_male
0,1000,30,45,0,6,0,1,0,0,0,1
1,1000,30,50,0,7,1,0,0,0,1,0
2,1000,30,33,0,8,1,0,0,0,1,0
3,1000,15,27,0,9,0,0,0,1,0,1
4,1000,30,28,0,10,0,0,0,1,1,0


In [5]:
# define the features set(the X)
X = df_loans.copy()
X = X.drop("bad", axis = 1)
X.head()

Unnamed: 0,amount,term,age,month_num,education_Bachelor,education_High School or Below,education_Master or Above,education_college,gender_female,gender_male
0,1000,30,45,6,0,1,0,0,0,1
1,1000,30,50,7,1,0,0,0,1,0
2,1000,30,33,8,1,0,0,0,1,0
3,1000,15,27,9,0,0,0,1,0,1
4,1000,30,28,10,0,0,0,1,1,0


In [7]:
# define the target set(the y)
y = df_loans['bad'].values
y[:5]

array([0, 0, 0, 0, 0])

In [8]:
# split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=78)


In [9]:
# Determine the shape of our training and testing sets.
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

(375, 10)
(125, 10)
(375,)
(125,)


In [10]:
# Splitting into Train and Test sets into an 80/20 split.
X_train2, X_test2, y_train2, y_test2 = train_test_split(X, y, random_state=78, train_size=0.80)


In [11]:
# Determine the shape of our training and testing sets.
print(X_train2.shape)
print(X_test2.shape)
print(y_train2.shape)
print(y_test2.shape)

(400, 10)
(100, 10)
(400,)
(100,)


In [13]:
# create a standardscaler instance
scaler = StandardScaler()
# fitting the Standard Scaler with the training data
X_scaler = scaler.fit(X_train)

# Scaling the data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

# Fit the Decision Tree Model

In [14]:
# create the decision tree classifier instance
model = tree.DecisionTreeClassifier()
# fitting the model
model = model.fit(X_train_scaled, y_train)

# make prediction using the testing data

In [16]:
# make prediction using the testing data
predictions = model.predict(X_test_scaled)
predictions

array([1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 1, 1, 0, 1,
       1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1,
       0, 0, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 0, 1,
       1, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0,
       0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0,
       0, 0, 1, 0, 1, 0, 1, 1, 0, 0, 1, 0, 0, 1, 0])

# evaluate the model


In [18]:
# Calculate the confusion matrix
cm = confusion_matrix(y_test,predictions)

# create a dataframe from the confusion matrix
cm_df = pd.DataFrame(
    cm, index=['Actual 0','Actual 1'], columns = ['Predicted 0', 'Predicted 1'])
cm_df

# the results:
# out of the (51+33=84) good loan applications, 51 were predicted to be good which is the true positive
# out of 84 good loan applications, 33 were predicted to be bad which is the false negative
# out of the (22+19=41) bad loan applications, 22 were predicted to be good which is the false positive
# out of the 41 bad loan applications, 19 were predicted bad which is the true negative

Unnamed: 0,Predicted 0,Predicted 1
Actual 0,51,33
Actual 1,22,19


In [19]:
print(classification_report(y_test, predictions))

              precision    recall  f1-score   support

           0       0.70      0.61      0.65        84
           1       0.37      0.46      0.41        41

    accuracy                           0.56       125
   macro avg       0.53      0.54      0.53       125
weighted avg       0.59      0.56      0.57       125



In [22]:
# calculating the accuracy score
acc_score = accuracy_score(y_test,predictions)
acc_score
#(True Positives (TP) + True Negatives (TN)) / Total = (51 + 19)/125 = 0.56


0.56

In [23]:
# Displaying results
print("Confusion Matrix")
display(cm_df)
print(f"Accuracy Score : {acc_score}")
print("Classification Report")
print(classification_report(y_test, predictions))

Confusion Matrix


Unnamed: 0,Predicted 0,Predicted 1
Actual 0,51,33
Actual 1,22,19


Accuracy Score : 0.56
Classification Report
              precision    recall  f1-score   support

           0       0.70      0.61      0.65        84
           1       0.37      0.46      0.41        41

    accuracy                           0.56       125
   macro avg       0.53      0.54      0.53       125
weighted avg       0.59      0.56      0.57       125



# Summary

### this model may not be the best one for the preventing fraudulent loan applications because the model's accuracy is 0.56 which is considered low. The precision and recall are not good enough to state that the model will be good at classifiying fraudulent loan applications. Modeling is an iterative process: you may need more data, more cleaning, another model parameter or a different model. Its also important to have a goal that's been agreed upon, so that you know when the model is good enough