In [2]:
# initial imports
import pandas as pd
from path import Path

In [3]:
# Load data
file_path = Path("../Resources/loans_data.csv")
loans_df = pd.read_csv(file_path)
loans_df.head()
#loans_df.sample(n=20)

Unnamed: 0,amount,term,month,age,education,gender,bad
0,1000,30,June,45,High School or Below,male,0
1,1000,30,July,50,Bachelor,female,0
2,1000,30,August,33,Bachelor,female,0
3,1000,15,September,27,college,male,0
4,1000,30,October,28,college,female,0


In [4]:
# Months dictionary
months_num = {
    "January": 1,
    "February": 2,
    "March": 3,
    "April": 4,
    "May": 5,
    "June": 6,
    "July": 7,
    "August": 8,
    "September": 9,
    "October": 10,
    "November": 11,
    "December": 12,
}

In [5]:
#transform the text columns to numerical
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
#transform the gender column
loans_df["gender"] = le.fit_transform(loans_df["gender"])
#transform the education column
loans_df["education"] = le.fit_transform(loans_df["education"])
#transform the month column using data dictionary: months_num
loans_df["month_num"] = loans_df["month"].apply(lambda x: months_num[x])
loans_df = loans_df.drop(["month"], axis=1)


In [6]:
loans_df.sample(n=10)

Unnamed: 0,amount,term,age,education,gender,bad,month_num
466,1000,30,26,1,1,1,10
403,1000,15,29,0,1,1,5
338,1000,15,27,3,0,1,3
188,800,15,28,3,1,0,4
166,1000,30,28,3,1,0,9
118,1000,30,35,0,1,0,5
456,1000,30,25,3,1,1,2
468,1000,30,38,1,1,1,6
308,1000,30,33,3,1,1,7
22,1000,30,25,1,1,0,10


# OR THIS OPTION

In [7]:
alt_df = loans_df = pd.read_csv(file_path)
# using pandas to encode gender and education
alt_df = pd.get_dummies(alt_df, columns= ["education", "gender"])
alt_df["month"] = alt_df["month"].apply(lambda x: months_num[x])
#alt_df.drop(["month"])
alt_df.head()

Unnamed: 0,amount,term,month,age,bad,education_Bachelor,education_High School or Below,education_Master or Above,education_college,gender_female,gender_male
0,1000,30,6,45,0,0,1,0,0,0,1
1,1000,30,7,50,0,1,0,0,0,1,0
2,1000,30,8,33,0,1,0,0,0,1,0
3,1000,15,9,27,0,0,0,0,1,0,1
4,1000,30,10,28,0,0,0,0,1,1,0


# scale the data

In [8]:
# we'll first import the StandardScaler module and create an instance of it as data_scaler
from sklearn.preprocessing import StandardScaler
data_scaler = StandardScaler()

In [9]:
# scale the data
scaled_df = data_scaler.fit_transform(alt_df)
scaled_df

array([[ 0.49337687,  0.89789115, -0.16890147, ..., -0.88640526,
        -0.42665337,  0.42665337],
       [ 0.49337687,  0.89789115,  0.12951102, ..., -0.88640526,
         2.34382305, -2.34382305],
       [ 0.49337687,  0.89789115,  0.42792352, ..., -0.88640526,
         2.34382305, -2.34382305],
       ...,
       [-1.24386563, -0.97897162, -0.16890147, ...,  1.12815215,
        -0.42665337,  0.42665337],
       [ 0.49337687,  0.89789115, -1.06413896, ...,  1.12815215,
         2.34382305, -2.34382305],
       [ 0.49337687,  0.89789115, -1.06413896, ..., -0.88640526,
        -0.42665337,  0.42665337]])

In [56]:
scaled_df.shape

(500, 11)

In [14]:
#verify that the columns have been standardized/scaled
import numpy as np
print(np.mean(scaled_df[:,0]))
print(np.std(scaled_df[:,0]))

-3.552713678800501e-16
0.9999999999999999


In [46]:
# check that all of the columns have been scaled
int=0
while int < 11:
    print(f'column {int}')
    print(f'mean of col {int}: {np.mean(scaled_df[:,int])}')
    print(f'standard deviation of col {int}: {np.std(scaled_df[:,int])}')
    int+=1

column 0
mean of col 0: -3.552713678800501e-16
standard deviation of col 0: 0.9999999999999999
column 1
mean of col 1: -1.7763568394002506e-16
standard deviation of col 1: 1.0
column 2
mean of col 2: 6.838973831690964e-17
standard deviation of col 2: 1.0
column 3
mean of col 3: 6.039613253960852e-17
standard deviation of col 3: 1.0
column 4
mean of col 4: -5.684341886080802e-17
standard deviation of col 4: 0.9999999999999999
column 5
mean of col 5: -4.263256414560601e-17
standard deviation of col 5: 1.0
column 6
mean of col 6: 1.2168044349891717e-16
standard deviation of col 6: 0.9999999999999999
column 7
mean of col 7: -1.0658141036401503e-17
standard deviation of col 7: 1.0000000000000004
column 8
mean of col 8: 2.842170943040401e-17
standard deviation of col 8: 1.0
column 9
mean of col 9: -2.842170943040401e-17
standard deviation of col 9: 1.0
column 10
mean of col 10: 3.907985046680551e-17
standard deviation of col 10: 1.0


# fitting data to model

In [47]:
from sklearn import tree
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report

In [48]:
# define the features set (all the columns except the one we want to predict (bad))
#create a copy of the unscaled df
X = alt_df.copy()
#drop the target column
X = X.drop("bad", axis=1)
# show table
X.head()

Unnamed: 0,amount,term,month,age,education_Bachelor,education_High School or Below,education_Master or Above,education_college,gender_female,gender_male
0,1000,30,6,45,0,1,0,0,0,1
1,1000,30,7,50,1,0,0,0,1,0
2,1000,30,8,33,1,0,0,0,1,0
3,1000,15,9,27,0,0,0,1,0,1
4,1000,30,10,28,0,0,0,1,1,0


In [49]:
# define the target set
y = alt_df["bad"].values
# show the first 5 rows in y
y[:5]

array([0, 0, 0, 0, 0])

In [50]:
# split the data into test and train sets default is 75% train, 25% test split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state =78)

In [51]:
# Determine the shape of our training and testing sets.
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

(375, 10)
(125, 10)
(375,)
(125,)


In [52]:
# Splitting into Train and Test sets into an 80/20 split.
X_train2, X_test2, y_train2, y_test2 = train_test_split(X, y, random_state=78, train_size=0.80)


In [60]:
# To scale our data, we'll use the StandardScaler as before and fit the instance, scaler, with the training data and then scale the features with the transform() method:
# create a standardscalar instance
scaler = StandardScaler()
# fit the standard scaler with the TRAINING data
X_scaler = scaler.fit(X_train)

# scaling the data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [53]:
# Determine the shape of our training and testing sets.
print(X_train2.shape)
print(X_test2.shape)
print(y_train2.shape)
print(y_test2.shape)

(400, 10)
(100, 10)
(400,)
(100,)


In [63]:
# check that all of the columns have been scaled
int=0
while int < 10:
    print(f'column {int}')
    print(f'mean of col {int}: {np.mean(X_train_scaled[:,int])}')
    print(f'standard deviation of col {int}: {np.std(X_train_scaled[:,int])}')
    print(f'mean of col {int}: {np.mean(X_test_scaled[:,int])}')
    print(f'standard deviation of col {int}: {np.std(X_test_scaled[:,int])}')
    int+=1

column 0
mean of col 0: 3.931669804539221e-16
standard deviation of col 0: 1.0
mean of col 0: 0.08040483006321758
standard deviation of col 0: 0.8450480061575104
column 1
mean of col 1: -4.618527782440651e-17
standard deviation of col 1: 1.0
mean of col 1: 0.12297037557744318
standard deviation of col 1: 0.9564007345405279
column 2
mean of col 2: -1.0894988614988202e-16
standard deviation of col 2: 1.0
mean of col 2: -0.24508261791538438
standard deviation of col 2: 0.9902565532613558
column 3
mean of col 3: -1.1131836193574902e-16
standard deviation of col 3: 0.9999999999999999
mean of col 3: -0.046669098854475274
standard deviation of col 3: 1.0135597524939095
column 4
mean of col 4: 1.4210854715202004e-17
standard deviation of col 4: 1.0
mean of col 4: 0.10470260847804559
standard deviation of col 4: 1.1072483741768515
column 5
mean of col 5: 0.0
standard deviation of col 5: 1.0
mean of col 5: 0.1689329399014945
standard deviation of col 5: 1.0209540692403776
column 6
mean of col 6:

# fitting decision tree model

In [64]:
# Creating the decision tree classifier instance.
model = tree.DecisionTreeClassifier()
# Fitting the model.
model = model.fit(X_train_scaled, y_train)

In [65]:
# Making predictions using the testing data.
predictions = model.predict(X_test_scaled)

In [66]:
# look at preds
predictions


array([1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1,
       1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 1, 0, 0, 1,
       0, 0, 1, 1, 0, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 0, 1,
       1, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0,
       0, 1, 1, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0,
       0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 0])

In [67]:
# Calculating the confusion matrix
cm = confusion_matrix(y_test, predictions)

# Create a DataFrame from the confusion matrix.
cm_df = pd.DataFrame(
    cm, index=["Actual 0", "Actual 1"], columns=["Predicted 0", "Predicted 1"])

cm_df

Unnamed: 0,Predicted 0,Predicted 1
Actual 0,51,33
Actual 1,21,20


In [69]:
# Calculating the accuracy score.
acc_score = accuracy_score(y_test, predictions)
print(acc_score)

0.568


In [70]:
# Displaying results
print("Confusion Matrix")
display(cm_df)
print(f"Accuracy Score : {acc_score}")
print("Classification Report")
print(classification_report(y_test, predictions))

Confusion Matrix


Unnamed: 0,Predicted 0,Predicted 1
Actual 0,51,33
Actual 1,21,20


Accuracy Score : 0.568
Classification Report
              precision    recall  f1-score   support

           0       0.71      0.61      0.65        84
           1       0.38      0.49      0.43        41

    accuracy                           0.57       125
   macro avg       0.54      0.55      0.54       125
weighted avg       0.60      0.57      0.58       125



## Precision: 
#### Precision is the measure of how reliable a positive classification is. From our results, the precision for the good loan applications can be determined by the ratio TP/(TP + FP), which is 50/(50 + 22) = 0.69. The precision for the bad loan applications can be determined as follows: 19/(19 + 34) = 0.358. A low precision is indicative of a large number of false positivesâ€”of the 53 loan applications we predicted to be bad applications, 34 were actually good loan applications.
## Recall: 
#### Recall is the ability of the classifier to find all the positive samples. It can be determined by the ratio: TP/(TP + FN), or 50/(50 + 34) = 0.595 for the good loans and 19/(19 + 22) = 0.463 for the bad loans. A low recall is indicative of a large number of false negatives.
## F1 score: 
#### F1 score is a weighted average of the true positive rate (recall) and precision, where the best score is 1.0 and the worst is 0.0.
## Support: 
#### Support is the number of actual occurrences of the class in the specified dataset. For our results, there are 84 actual occurrences for the good loans and 41 actual occurrences for bad loans.