# Predicting Heart Disease using Decision Tree Classification

In [1]:
# Study of heart disease patients using decision trees
import pandas as pd
import numpy as np 
import matplotlib.pyplot as plt 
import graphviz.backend as be
from dtreeviz.trees import *

In [2]:
df = pd.read_csv("heart.csv")
df.head()

Unnamed: 0,HeartDisease,BMI,Smoking,AlcoholDrinking,Stroke,PhysicalHealth,MentalHealth,DiffWalking,Sex,AgeCategory,Race,Diabetic,PhysicalActivity,GenHealth,SleepTime,Asthma,KidneyDisease,SkinCancer
0,No,16.6,Yes,No,No,3.0,30.0,No,Female,55-59,White,Yes,Yes,Very good,5.0,Yes,No,Yes
1,No,20.34,No,No,Yes,0.0,0.0,No,Female,80 or older,White,No,Yes,Very good,7.0,No,No,No
2,No,26.58,Yes,No,No,20.0,30.0,No,Male,65-69,White,Yes,Yes,Fair,8.0,Yes,No,No
3,No,24.21,No,No,No,0.0,0.0,No,Female,75-79,White,No,No,Good,6.0,No,No,Yes
4,No,23.71,No,No,No,28.0,0.0,Yes,Female,40-44,White,No,Yes,Very good,8.0,No,No,No


In [3]:
df.shape

(319795, 18)

In [4]:
df = df.loc[:,['HeartDisease','BMI','PhysicalHealth','MentalHealth','SleepTime','Asthma']]
df.head()

Unnamed: 0,HeartDisease,BMI,PhysicalHealth,MentalHealth,SleepTime,Asthma
0,No,16.6,3.0,30.0,5.0,Yes
1,No,20.34,0.0,0.0,7.0,No
2,No,26.58,20.0,30.0,8.0,Yes
3,No,24.21,0.0,0.0,6.0,No
4,No,23.71,28.0,0.0,8.0,No


In [5]:
df.shape

(319795, 6)

In [6]:
df = df.replace({'Yes': 1, 'No': 0}) 
df.head()

Unnamed: 0,HeartDisease,BMI,PhysicalHealth,MentalHealth,SleepTime,Asthma
0,0,16.6,3.0,30.0,5.0,1
1,0,20.34,0.0,0.0,7.0,0
2,0,26.58,20.0,30.0,8.0,1
3,0,24.21,0.0,0.0,6.0,0
4,0,23.71,28.0,0.0,8.0,0


In [7]:
# Decision Tree Classifier: The Syntax
# Import the class containing the classification method
from sklearn.tree import DecisionTreeClassifier  # DecisionTreeRegressor
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split 
from sklearn.metrics import accuracy_score 

In [8]:
# Create an instance of the class
DTC = DecisionTreeClassifier() # Tree parameters

In [9]:
x = df.iloc[:,[1,2,3,4,5]]
x.shape

(319795, 5)

In [10]:
x.head()

Unnamed: 0,BMI,PhysicalHealth,MentalHealth,SleepTime,Asthma
0,16.6,3.0,30.0,5.0,1
1,20.34,0.0,0.0,7.0,0
2,26.58,20.0,30.0,8.0,1
3,24.21,0.0,0.0,6.0,0
4,23.71,28.0,0.0,8.0,0


In [11]:
y = df.iloc[:,[0]]
y.shape

(319795, 1)

In [12]:
y.head()

Unnamed: 0,HeartDisease
0,0
1,0
2,0
3,0
4,0


In [13]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.4, random_state=4)

In [14]:
x_train.head()

Unnamed: 0,BMI,PhysicalHealth,MentalHealth,SleepTime,Asthma
295259,23.67,0.0,0.0,7.0,0
188531,23.78,0.0,0.0,10.0,0
117459,33.93,15.0,0.0,10.0,0
214360,31.75,0.0,3.0,4.0,0
187873,24.37,0.0,0.0,8.0,0


In [15]:
y_train.head()

Unnamed: 0,HeartDisease
295259,0
188531,0
117459,0
214360,0
187873,0


In [16]:
model = DTC.fit(x_train, y_train)

In [17]:
y_predict = model.predict(x_test)

In [18]:
y_predict = np.array(y_predict)
y_test = np.array(y_test)
for i in range(100):
    print(y_predict[i],' ', y_test[i])

0   [0]
0   [0]
0   [1]
0   [0]
0   [1]
0   [0]
0   [0]
0   [0]
0   [0]
0   [0]
0   [0]
0   [0]
0   [0]
0   [0]
0   [0]
0   [0]
0   [0]
0   [0]
0   [0]
0   [0]
0   [0]
0   [0]
0   [0]
0   [0]
0   [1]
0   [0]
0   [0]
0   [0]
1   [0]
0   [0]
0   [0]
0   [0]
0   [0]
0   [0]
0   [0]
0   [1]
0   [0]
0   [0]
1   [0]
0   [0]
0   [0]
0   [1]
0   [0]
0   [0]
0   [0]
0   [0]
0   [1]
0   [0]
0   [0]
0   [0]
0   [0]
0   [1]
0   [0]
0   [1]
0   [0]
0   [0]
0   [0]
1   [1]
0   [0]
0   [0]
0   [0]
0   [0]
0   [0]
0   [0]
0   [0]
0   [0]
0   [1]
0   [0]
0   [0]
0   [0]
0   [0]
0   [0]
0   [0]
0   [0]
0   [0]
0   [0]
0   [0]
1   [0]
0   [1]
0   [0]
0   [0]
1   [0]
0   [0]
0   [0]
0   [0]
0   [0]
0   [0]
0   [0]
0   [0]
0   [0]
0   [0]
0   [0]
0   [0]
0   [0]
0   [0]
0   [0]
0   [0]
0   [0]
0   [0]
0   [0]


In [19]:
from sklearn.metrics import classification_report, confusion_matrix
print(classification_report(y_test, y_predict))

              precision    recall  f1-score   support

           0       0.92      0.96      0.94    116925
           1       0.16      0.09      0.12     10993

    accuracy                           0.88    127918
   macro avg       0.54      0.52      0.53    127918
weighted avg       0.85      0.88      0.87    127918



In [20]:
confusion_matrix(y_test, y_predict)

array([[111754,   5171],
       [ 10002,    991]], dtype=int64)

In [21]:
print('Accuracy is: ', accuracy_score(y_test, y_predict)*100)

Accuracy is:  88.13849497334229
