# Let us now use decision trees to classify if a loan applicant is a good or bad credit risk using the Credit Risk dataset. 

In [1]:
# importing required libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.tree import DecisionTreeClassifier

In [2]:
# Reading the data from the datasets
credit_data = pd.read_csv(r"C:\Users\udayk\Downloads\creditriskdataset1603523640815\datasets\credit_risk.csv")
credit_data.head()

Unnamed: 0,over_draft,credit_usage,credit_history,purpose,current_balance,Average_Credit_Balance,employment,location,personal_status,other_parties,...,property_magnitude,cc_age,other_payment_plans,housing,existing_credits,job,num_dependents,own_telephone,foreign_worker,class
0,<0,6,critical/other existing credit,radio/tv,1169,no known savings,>=7,4,male single,none,...,real estate,67,none,own,2,skilled,1,yes,yes,good
1,0<=X<200,48,existing paid,radio/tv,5951,<100,1<=X<4,2,female div/dep/mar,none,...,real estate,22,none,own,1,skilled,1,none,yes,bad
2,no checking,12,critical/other existing credit,education,2096,<100,4<=X<7,2,male single,none,...,real estate,49,none,own,1,unskilled resident,2,none,yes,good
3,<0,42,existing paid,furniture/equipment,7882,<100,4<=X<7,2,male single,guarantor,...,life insurance,45,none,for free,1,skilled,2,none,yes,good
4,<0,24,delayed previously,new car,4870,<100,1<=X<4,3,male single,none,...,no known property,53,none,for free,2,skilled,2,none,yes,bad


In [3]:
credit_data.shape

(1000, 21)

In [4]:
credit_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 21 columns):
 #   Column                  Non-Null Count  Dtype 
---  ------                  --------------  ----- 
 0   over_draft              1000 non-null   object
 1   credit_usage            1000 non-null   int64 
 2   credit_history          1000 non-null   object
 3   purpose                 1000 non-null   object
 4   current_balance         1000 non-null   int64 
 5   Average_Credit_Balance  1000 non-null   object
 6   employment              1000 non-null   object
 7   location                1000 non-null   int64 
 8   personal_status         1000 non-null   object
 9   other_parties           1000 non-null   object
 10  residence_since         1000 non-null   int64 
 11  property_magnitude      1000 non-null   object
 12  cc_age                  1000 non-null   int64 
 13  other_payment_plans     1000 non-null   object
 14  housing                 1000 non-null   object
 15  exist

In [5]:
credit_data.describe

<bound method NDFrame.describe of       over_draft  credit_usage                  credit_history  \
0             <0             6  critical/other existing credit   
1       0<=X<200            48                   existing paid   
2    no checking            12  critical/other existing credit   
3             <0            42                   existing paid   
4             <0            24              delayed previously   
..           ...           ...                             ...   
995  no checking            12                   existing paid   
996           <0            30                   existing paid   
997  no checking            12                   existing paid   
998           <0            45                   existing paid   
999     0<=X<200            45  critical/other existing credit   

                 purpose  current_balance Average_Credit_Balance  employment  \
0               radio/tv             1169       no known savings         >=7   
1            

In [6]:
credit_data.isnull().sum()

over_draft                0
credit_usage              0
credit_history            0
purpose                   0
current_balance           0
Average_Credit_Balance    0
employment                0
location                  0
personal_status           0
other_parties             0
residence_since           0
property_magnitude        0
cc_age                    0
other_payment_plans       0
housing                   0
existing_credits          0
job                       0
num_dependents            0
own_telephone             0
foreign_worker            0
class                     0
dtype: int64

# setting the predictors and target

after loading the data, the next step is to select relevant features, perform necessary feature engineering then split the data into train and test sets.

consider all the attributes except for the target column, "class" as potential predictors for building the tree.

In [7]:
#setting the predictor attributes
X = credit_data.columns.drop("class")
# selecting the target
y = credit_data['class']

# Encoding the categorical values

In [8]:
# Encoding all the predictor variables to convert the categorical values to numerical values.
credit_data_encoded = pd.get_dummies(credit_data[X])

print("The total number of predictors after encoding =", len(credit_data_encoded.columns))

# printing the list of columns after encoding to understand the encoding process
credit_data_encoded.columns

The total number of predictors after encoding = 61


Index(['credit_usage', 'current_balance', 'location', 'residence_since',
       'cc_age', 'existing_credits', 'num_dependents', 'over_draft_0<=X<200',
       'over_draft_<0', 'over_draft_>=200', 'over_draft_no checking',
       'credit_history_all paid',
       'credit_history_critical/other existing credit',
       'credit_history_delayed previously', 'credit_history_existing paid',
       'credit_history_no credits/all paid', 'purpose_business',
       'purpose_domestic appliance', 'purpose_education',
       'purpose_furniture/equipment', 'purpose_new car', 'purpose_other',
       'purpose_radio/tv', 'purpose_repairs', 'purpose_retraining',
       'purpose_used car', 'Average_Credit_Balance_100<=X<500',
       'Average_Credit_Balance_500<=X<1000', 'Average_Credit_Balance_<100',
       'Average_Credit_Balance_>=1000',
       'Average_Credit_Balance_no known savings', 'employment_1<=X<4',
       'employment_4<=X<7', 'employment_<1', 'employment_>=7',
       'employment_unemployed', 'p

After encoding, there are 61 predictor attributes. Each of the non-numerical columns has been replaced by new columns as the number of its values. The newly generated columns take either 0 or 1 as the value.

In [9]:
credit_data_encoded

Unnamed: 0,credit_usage,current_balance,location,residence_since,cc_age,existing_credits,num_dependents,over_draft_0<=X<200,over_draft_<0,over_draft_>=200,...,housing_own,housing_rent,job_high qualif/self emp/mgmt,job_skilled,job_unemp/unskilled non res,job_unskilled resident,own_telephone_none,own_telephone_yes,foreign_worker_no,foreign_worker_yes
0,6,1169,4,4,67,2,1,False,True,False,...,True,False,False,True,False,False,False,True,False,True
1,48,5951,2,2,22,1,1,True,False,False,...,True,False,False,True,False,False,True,False,False,True
2,12,2096,2,3,49,1,2,False,False,False,...,True,False,False,False,False,True,True,False,False,True
3,42,7882,2,4,45,1,2,False,True,False,...,False,False,False,True,False,False,True,False,False,True
4,24,4870,3,4,53,2,2,False,True,False,...,False,False,False,True,False,False,True,False,False,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,12,1736,3,4,31,1,1,False,False,False,...,True,False,False,False,False,True,True,False,False,True
996,30,3857,4,4,40,1,1,False,True,False,...,True,False,True,False,False,False,False,True,False,True
997,12,804,4,4,38,1,1,False,False,False,...,True,False,False,True,False,False,True,False,False,True
998,45,1845,4,4,23,1,1,False,True,False,...,False,False,False,True,False,False,False,True,False,True


In [10]:
for col in credit_data.columns:
    print(f"Column Name: {col}")
    counts = credit_data[col].value_counts()
    for index, value in counts.items():
        print(f"{index} : {value}")


Column Name: over_draft
no checking : 394
<0 : 274
0<=X<200 : 269
>=200 : 63
Column Name: credit_usage
24 : 184
12 : 179
18 : 113
36 : 83
6 : 75
15 : 64
9 : 49
48 : 48
30 : 40
21 : 30
10 : 28
60 : 13
27 : 13
42 : 11
11 : 9
20 : 8
8 : 7
4 : 6
45 : 5
7 : 5
39 : 5
14 : 4
13 : 4
33 : 3
28 : 3
54 : 2
16 : 2
22 : 2
47 : 1
5 : 1
26 : 1
72 : 1
40 : 1
Column Name: credit_history
existing paid : 530
critical/other existing credit : 293
delayed previously : 88
all paid : 49
no credits/all paid : 40
Column Name: purpose
radio/tv : 280
new car : 234
furniture/equipment : 181
used car : 103
business : 97
education : 50
repairs : 22
domestic appliance : 12
other : 12
retraining : 9
Column Name: current_balance
1478 : 3
1262 : 3
1258 : 3
1275 : 3
1393 : 3
1442 : 2
3590 : 2
2578 : 2
701 : 2
1924 : 2
2039 : 2
4526 : 2
3349 : 2
1597 : 2
1449 : 2
1410 : 2
1169 : 2
1919 : 2
1126 : 2
932 : 2
2028 : 2
3617 : 2
1409 : 2
2384 : 2
609 : 2
3832 : 2
1082 : 2
5954 : 2
1553 : 2
709 : 2
3959 : 2
4272 : 2
1237 : 2
67

# Splitting the required function

In [11]:
# Splitting the required function
from sklearn.model_selection import train_test_split
#splitting data into train and test datasets
X_train,X_test,y_train,y_test = train_test_split(credit_data_encoded,y,test_size = 0.15,random_state = 100)
#printing the shape of the resulting datasets
print("Shape of X_train and y_train are:", X_train.shape, "and", y_train.shape, " respectively")
print("Shape of X_test and y_test are:", X_test.shape, "and", y_test.shape, " respectively")

Shape of X_train and y_train are: (850, 61) and (850,)  respectively
Shape of X_test and y_test are: (150, 61) and (150,)  respectively


# Building the model using Scikit-Learn

In [12]:
# Creating an object of the DecisionTreeClassifier model
model = DecisionTreeClassifier(random_state = 1)
# Training model on the training data
model.fit(X_train,y_train)


In [13]:
# Predicting target values using the model built on training data
train_predictions = model.predict(X_train)
test_predictions = model.predict(X_test)

In [14]:
# Getting the accuracy on train data
train_accuracy = model.score(X_train,y_train)
print("Accuracy of the model on train data = ",train_accuracy)
# Getting the accuracy on test data
test_accuracy = model.score(X_test,y_test)
print("Accuracy of the model on test data = ",test_accuracy)


Accuracy of the model on train data =  1.0
Accuracy of the model on test data =  0.6533333333333333


You can observe that the training accuracy is 100% and the test accuracy is approximately 67%.

This could mean that the model is overfit to the training data and is not a good approximation of the input to output mapping.

In order to avoid this problem, certain hyperparameters of the decision tree algorithmcan be tuned.

Hyperparameters are model properties which guide the training process i.e. they cannot be learnt from the training data.

For example, the below code demonstrates creating two models with different values for the following hyper-parameters:

min_samples_split: The minimum number of instances that should be at a node before the algorithm does a further split on it.

min_impurity_decrease: The minimum percentage reduction in the impurity of a set (as measured by gini index) required to make a split.

In [15]:
# Model 1:
# Min number of samples required in a set to split = 10
# Min reduction in impurity required for split to be included in the tree = 0.005
model1 = DecisionTreeClassifier(min_samples_split=10,min_impurity_decrease=0.005)
# Fitting the model to the training data
model1.fit(X_train,y_train)
# Measuring the accuracy of the model
print("train_accuracy = ", model1.score(X_train,y_train))
print("test_accuracy = ", model1.score(X_test,y_test))


train_accuracy =  0.7635294117647059
test_accuracy =  0.7266666666666667


In [16]:
# Model 2:
# Min number of samples required in a set to split = 20
# Min reduction in impurity required for split to be included in the tree = 0.1
model2 = DecisionTreeClassifier(min_samples_split=20,min_impurity_decrease=0.1)
# Fitting the model to the training data
model2.fit(X_train,y_train)
# Measuring the accuracy of the model
print("Model2 train accuracy = ", model2.score(X_train,y_train))
print("Model2 test accuracy = ", model2.score(X_test,y_test))


Model2 train accuracy =  0.6976470588235294
Model2 test accuracy =  0.7133333333333334


You can observe that with different values of hyperparameters, we can get different accuracy and reduce overfitting.