# Python Implementation of Decision Trees

You will be using the heart disease dataset provided to predict whether a person has heart disease or not using Decision Trees.

In the following sections, we'll:
1. clean and prepare the data,
2. build a decision tree with default hyperparameters,
3. understand all the hyperparameters that we can tune, and finally
4. choose the optimal hyperparameters using grid search cross-validation.

In [1]:
# Importing the required libraries
import pandas as pd
import numpy as np

# Load the data

In [2]:
# Read the data into a dataframe

df = pd.read_csv('Data\heart_disease_case_study+(1).csv',sep=';')


In [3]:
# View the data
df.head()

Unnamed: 0,id,age,gender,height,weight,ap_hi,ap_lo,cholesterol,gluc,smoke,alco,active,cardio
0,0,18393,2,168,62.0,110,80,1,1,0,0,1,0
1,1,20228,1,156,85.0,140,90,3,1,0,0,1,1
2,2,18857,1,165,64.0,130,70,3,1,0,0,0,1
3,3,17623,2,169,82.0,150,100,1,1,0,0,1,1
4,4,17474,1,156,56.0,100,60,1,1,0,0,0,0


In [4]:
# View shape of data
df.shape

(70000, 13)

In [5]:
# View the columns
df.columns

Index(['id', 'age', 'gender', 'height', 'weight', 'ap_hi', 'ap_lo',
       'cholesterol', 'gluc', 'smoke', 'alco', 'active', 'cardio'],
      dtype='object')

In [6]:
# View the statistics of data
df.describe()

Unnamed: 0,id,age,gender,height,weight,ap_hi,ap_lo,cholesterol,gluc,smoke,alco,active,cardio
count,70000.0,70000.0,70000.0,70000.0,70000.0,70000.0,70000.0,70000.0,70000.0,70000.0,70000.0,70000.0,70000.0
mean,49972.4199,19468.865814,1.349571,164.359229,74.20569,128.817286,96.630414,1.366871,1.226457,0.088129,0.053771,0.803729,0.4997
std,28851.302323,2467.251667,0.476838,8.210126,14.395757,154.011419,188.47253,0.68025,0.57227,0.283484,0.225568,0.397179,0.500003
min,0.0,10798.0,1.0,55.0,10.0,-150.0,-70.0,1.0,1.0,0.0,0.0,0.0,0.0
25%,25006.75,17664.0,1.0,159.0,65.0,120.0,80.0,1.0,1.0,0.0,0.0,1.0,0.0
50%,50001.5,19703.0,1.0,165.0,72.0,120.0,80.0,1.0,1.0,0.0,0.0,1.0,0.0
75%,74889.25,21327.0,2.0,170.0,82.0,140.0,90.0,2.0,1.0,0.0,0.0,1.0,1.0
max,99999.0,23713.0,2.0,250.0,200.0,16020.0,11000.0,3.0,3.0,1.0,1.0,1.0,1.0


In [7]:
# View the count of patients have heart diseaese
df.cardio.value_counts()

0    35021
1    34979
Name: cardio, dtype: int64

> Distribution is almost Same

In [8]:
# Assign the output column
y = df.cardio

In [9]:
# Assign the input features
x = df.drop('cardio',axis = 1)

In [10]:
# View the features
x.head()

Unnamed: 0,id,age,gender,height,weight,ap_hi,ap_lo,cholesterol,gluc,smoke,alco,active
0,0,18393,2,168,62.0,110,80,1,1,0,0,1
1,1,20228,1,156,85.0,140,90,3,1,0,0,1
2,2,18857,1,165,64.0,130,70,3,1,0,0,0
3,3,17623,2,169,82.0,150,100,1,1,0,0,1
4,4,17474,1,156,56.0,100,60,1,1,0,0,0


## Train a simple Decision tree model

In [11]:
# load sklearn libraries required to construct a Decision Tree
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report, confusion_matrix
from sklearn import metrics

In [12]:
# Split the data into training and testing data
x_train,x_test,y_train,y_test = train_test_split(x,y,test_size = 0.2, random_state = 0)

In [13]:
# View the shape of training data
x_train.shape

(56000, 12)

In [14]:
# View the shape of test data
x_test.shape

(14000, 12)

# Decision Tree

In [30]:
# Create a Decision Tree
dt_basic = DecisionTreeClassifier(max_depth= 10 )

In [31]:
# View the attributes of tree created
dt_basic

DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                       max_depth=10, max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort='deprecated',
                       random_state=None, splitter='best')

In [32]:
# Fit the training data
dt_basic.fit(x_train,y_train)

DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                       max_depth=10, max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort='deprecated',
                       random_state=None, splitter='best')

In [33]:
# Predict based on test data
y_preds = dt_basic.predict(x_test)

In [34]:
# View the predictions
y_preds

array([0, 0, 0, ..., 1, 0, 1], dtype=int64)

# Accuracy

In [35]:
# Calculate Accuracy
accuracy_value = metrics.accuracy_score(y_test,y_preds)

In [36]:
# View the accuracy
accuracy_value

0.7265714285714285

# Confusion Matrix 

In [37]:
# Create and print confusion matrix
confusion_matrix(y_test,y_preds)

array([[5613, 1456],
       [2372, 4559]], dtype=int64)

In [38]:
# Print the confusion matrix
print(classification_report(y_test,y_preds))

              precision    recall  f1-score   support

           0       0.70      0.79      0.75      7069
           1       0.76      0.66      0.70      6931

    accuracy                           0.73     14000
   macro avg       0.73      0.73      0.73     14000
weighted avg       0.73      0.73      0.73     14000



In [39]:
# Calculate the number of nodes in the tree
dt_basic.tree_.node_count

1149

## Visualize the Tree

In [40]:
# Importing required packages for visualization
from IPython.display import Image  
from sklearn.externals.six import StringIO  
from sklearn.tree import export_graphviz
import pydotplus, graphviz

In [41]:
import os
os.environ["PATH"] += os.pathsep + 'C:\Program Files (x86)\Graphviz\bin'

In [42]:
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [43]:
#Select input columns as features
features = list(x.columns[0:])

In [44]:
#View the features
features

['id',
 'age',
 'gender',
 'height',
 'weight',
 'ap_hi',
 'ap_lo',
 'cholesterol',
 'gluc',
 'smoke',
 'alco',
 'active']

In [45]:
#Visualise the tree using the following steps:
dot_data = StringIO()  
export_graphviz(dt_basic, out_file=dot_data,
                feature_names=features, filled=True,rounded=True)

graph = pydotplus.graph_from_dot_data(dot_data.getvalue())  
Image(graph.create_png())

InvocationException: GraphViz's executables not found

## Hyperparamter Tuning Optimization 

### Max_depth

In [None]:
#Choose and mention the hyperparameter

In [None]:
#You can define number of folds for cross-validation as follows:

In [None]:
#Build a Decision Tree

In [None]:
#Import the required libraries
from sklearn.model_selection import KFold
from sklearn.model_selection import GridSearchCV

In [None]:
#Use GridSearchCV to buid a tree

In [None]:
#Fit the training data

In [None]:
#View the score

In [None]:
# Plot accuracy vs param_max_depth

### Min_samples_leaf

In [None]:
# Choose and mention the hyperparameter

In [None]:
# Build a Decision Tree

In [None]:
# Build the treeecd c

In [None]:
# Fit the training data

In [None]:
# Plot accuracy vs param_min_samples_leaf

# Multiple parameters

In [None]:
# Create a Parameter grid

In [None]:
# Create a Decision Tree

In [None]:
# Create a Grid with parameters

In [None]:
# Visualise the tree

## Create a less complex tree easy to visualize

In [None]:
# Create a Tree

In [None]:
# Visualise the Tree

In [None]:
# View the accuracy score