Heart disease Case Study

In [3]:
# Importing the required libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import os

In [4]:
# Reading the csv file and putting it into 'df' object.
os.chdir('/Users/theovl/Downloads')
df = pd.read_csv("heart_disease_case_study+(1).csv", delimiter=';')

In [5]:
# Let's understand the data, how it look like.
df.head(10)

Unnamed: 0,id,age,gender,height,weight,ap_hi,ap_lo,cholesterol,gluc,smoke,alco,active,cardio
0,0,18393,2,168,62.0,110,80,1,1,0,0,1,0
1,1,20228,1,156,85.0,140,90,3,1,0,0,1,1
2,2,18857,1,165,64.0,130,70,3,1,0,0,0,1
3,3,17623,2,169,82.0,150,100,1,1,0,0,1,1
4,4,17474,1,156,56.0,100,60,1,1,0,0,0,0
5,8,21914,1,151,67.0,120,80,2,2,0,0,0,0
6,9,22113,1,157,93.0,130,80,3,1,0,0,1,0
7,12,22584,2,178,95.0,130,90,3,3,0,0,1,1
8,13,17668,1,158,71.0,110,70,1,1,0,0,1,0
9,14,19834,1,164,68.0,110,60,1,1,0,0,0,0


In [15]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 70000 entries, 0 to 69999
Data columns (total 13 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   id           70000 non-null  int64  
 1   age          70000 non-null  int64  
 2   gender       70000 non-null  int64  
 3   height       70000 non-null  int64  
 4   weight       70000 non-null  float64
 5   ap_hi        70000 non-null  int64  
 6   ap_lo        70000 non-null  int64  
 7   cholesterol  70000 non-null  int64  
 8   gluc         70000 non-null  int64  
 9   smoke        70000 non-null  int64  
 10  alco         70000 non-null  int64  
 11  active       70000 non-null  int64  
 12  cardio       70000 non-null  int64  
dtypes: float64(1), int64(12)
memory usage: 6.9 MB


In [16]:
df.isnull().sum()

id             0
age            0
gender         0
height         0
weight         0
ap_hi          0
ap_lo          0
cholesterol    0
gluc           0
smoke          0
alco           0
active         0
cardio         0
dtype: int64

In [6]:
for i in df[['cholesterol', 'gluc','smoke','alco', 'active']]:
    print(df[i].value_counts())

1    52385
2     9549
3     8066
Name: cholesterol, dtype: int64
1    59479
3     5331
2     5190
Name: gluc, dtype: int64
0    63831
1     6169
Name: smoke, dtype: int64
0    66236
1     3764
Name: alco, dtype: int64
1    56261
0    13739
Name: active, dtype: int64


# NOTE
This dataset contains missing rows with a value='?'. Remove the missing values by dropping those rows.

In [8]:
pd.get_dummies(data = df, columns = ['gender','cholesterol','gluc'], drop_first= True)

Unnamed: 0,id,age,height,weight,ap_hi,ap_lo,smoke,alco,active,cardio,gender_2,cholesterol_2,cholesterol_3,gluc_2,gluc_3
0,0,18393,168,62.0,110,80,0,0,1,0,1,0,0,0,0
1,1,20228,156,85.0,140,90,0,0,1,1,0,0,1,0,0
2,2,18857,165,64.0,130,70,0,0,0,1,0,0,1,0,0
3,3,17623,169,82.0,150,100,0,0,1,1,1,0,0,0,0
4,4,17474,156,56.0,100,60,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
69995,99993,19240,168,76.0,120,80,1,0,1,0,1,0,0,0,0
69996,99995,22601,158,126.0,140,90,0,0,1,1,0,1,0,1,0
69997,99996,19066,183,105.0,180,90,0,1,0,1,1,0,1,0,0
69998,99998,22431,163,72.0,135,80,0,0,0,1,0,0,0,1,0


# Data Preparation
There are a number of preprocessing steps we need to do before building the model.

Firstly, note that we have both categorical and numeric features as predictors. In previous models such as linear and logistic regression, we had created dummy variables for categorical variables, since those models (being mathematical equations) can process only numeric variables.

All that is not required in decision trees, since they can process categorical variables easily. However, we still need to encode the categorical variables into a standard format so that sklearn can understand them and build the tree. We'll do that using the LabelEncoder() class, which comes with sklearn.preprocessing.

Now all the categorical variables are suitably encoded. Let's build the model.

# Model Building and Evaluation
Let's first build a decision tree with default hyperparameters. Then we'll use cross-validation to tune them.

In [10]:
# Importing train-test-split 
from sklearn.model_selection import train_test_split

In [11]:
# Putting feature variable to X
X = df.drop('cardio', axis = 1)
# Putting response variable to y
y = df['cardio']

In [12]:
# Splitting the data into train and test (70/30 ratio)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=101)

In [13]:
# Importing decision tree classifier from sklearn library
from sklearn.tree import DecisionTreeClassifier

In [15]:
# Build a Decision Tree
model = DecisionTreeClassifier()
model.fit(X_train,y_train)

DecisionTreeClassifier()

In [17]:
predictions = model.predict(X_test)

In [22]:
from sklearn.metrics import classification_report, confusion_matrix ,recall_score, precision_score, accuracy_score

print(classification_report(y_test,predictions),'\n\n')
print(confusion_matrix(y_test,predictions))

# Let's check the evaluation metrics of our default model
# Importing classification report and confusion matrix from sklearn metrics
# Making predictions
# Printing classification report

              precision    recall  f1-score   support

           0       0.63      0.64      0.63     10543
           1       0.63      0.62      0.63     10457

    accuracy                           0.63     21000
   macro avg       0.63      0.63      0.63     21000
weighted avg       0.63      0.63      0.63     21000
 


[[6706 3837]
 [3944 6513]]


Question 1: Find the accuracy of the model. [Mark the correct answer in graded questions segment]

In [23]:
accuracy_score(y_test,predictions)

0.6294761904761905

# Plotting the Decision Tree

To visualise decision trees in python, you need to install certain external libraries. You can read about the process in detail here: http://scikit-learn.org/stable/modules/tree.html

We need the ```graphviz``` library to plot a tree.

In [25]:
# Importing required packages for visualization
from IPython.display import Image  
from sklearn.externals.six import StringIO  
from sklearn.tree import export_graphviz
import pydotplus, graphviz

ImportError: cannot import name 'StringIO' from 'sklearn.externals' (/Users/theovl/opt/anaconda3/lib/python3.9/site-packages/sklearn/externals/__init__.py)

In [None]:
# Putting features

In [None]:
# Import path for Graphviz
import os
os.environ["PATH"] += os.pathsep + 'C:\Program Files (x86)\Graphviz2.38\bin'

In [None]:
# Plot a Decision Tree
dot_data = StringIO()  
export_graphviz(dt_default, out_file=dot_data,
                feature_names=features, filled=True,rounded=True)

graph = pydotplus.graph_from_dot_data(dot_data.getvalue())  
Image(graph.create_png())

# OPTIMAL HYPERPARAMETERS

In [None]:
#import libraries required
from sklearn.model_selection import KFold
from sklearn.model_selection import GridSearchCV

Question 2: Which are the most optimal criteria for splitting? [Mark the correct answer in graded questions segment]

In [None]:
# Create the parameter grid 

In [None]:
# Instantiate the grid search model
# Fit the grid search to the data

In [None]:
# printing the optimal accuracy score and hyperparameters

# Running the model with best parameters obtained from grid search.

In [None]:
# model with optimal hyperparameters

Question 3: What is the change in accuracy after using hyperparameters? [Mark the correct answer in graded questions segment]

In [None]:
# accuracy score

In [None]:
# plotting the tree