In this notebook we will be building a machine learning model using population data to predict heart disease risk.

To do:

* Use pandas to import the CSV data
* Binarise the yes/no columns (0 = No, 1 = Yes)
* One-hot encode the remaining categorical variables into binary columns
* Train a random forest model
* Save the model

In [2]:
# Import dependencies
import pandas as pd

In [3]:
# Read heart_2020_cleaned.csv (path is relative to the notebook's working
# directory) into a DataFrame, decoding the file as UTF-8.
df = pd.read_csv(
    "../resources/heart_2020_cleaned.csv",
    encoding="utf-8",
)

# Preview the first five rows
df.head()

Unnamed: 0,HeartDisease,BMI,Smoking,AlcoholDrinking,Stroke,PhysicalHealth,MentalHealth,DiffWalking,Sex,AgeCategory,Race,Diabetic,PhysicalActivity,GenHealth,SleepTime,Asthma,KidneyDisease,SkinCancer
0,No,16.6,Yes,No,No,3.0,30.0,No,Female,55-59,White,Yes,Yes,Very good,5.0,Yes,No,Yes
1,No,20.34,No,No,Yes,0.0,0.0,No,Female,80 or older,White,No,Yes,Very good,7.0,No,No,No
2,No,26.58,Yes,No,No,20.0,30.0,No,Male,65-69,White,Yes,Yes,Fair,8.0,Yes,No,No
3,No,24.21,No,No,No,0.0,0.0,No,Female,75-79,White,No,No,Good,6.0,No,No,Yes
4,No,23.71,No,No,No,28.0,0.0,Yes,Female,40-44,White,No,Yes,Very good,8.0,No,No,No


## Binarise the yes/no data columns to be 0=No, 1=Yes

In [4]:
# Print the frequency of every distinct value, one column at a time, to see
# which columns hold yes/no strings, which are categorical, and which are numeric
for column_name in df.columns:
    counts = df[column_name].value_counts()
    print(counts)

No     292422
Yes     27373
Name: HeartDisease, dtype: int64
26.63    3762
27.46    2767
27.44    2723
24.41    2696
27.12    2525
         ... 
48.03       1
38.19       1
63.33       1
59.85       1
44.68       1
Name: BMI, Length: 3604, dtype: int64
No     187887
Yes    131908
Name: Smoking, dtype: int64
No     298018
Yes     21777
Name: AlcoholDrinking, dtype: int64
No     307726
Yes     12069
Name: Stroke, dtype: int64
0.0     226589
30.0     19509
2.0      14880
1.0      10489
3.0       8617
5.0       7606
10.0      5453
15.0      5012
7.0       4629
4.0       4468
20.0      3216
14.0      2893
6.0       1270
25.0      1164
8.0        924
21.0       626
12.0       605
28.0       446
29.0       204
9.0        180
18.0       167
16.0       135
27.0       124
17.0       110
13.0        91
22.0        89
11.0        85
24.0        67
26.0        66
23.0        46
19.0        35
Name: PhysicalHealth, dtype: int64
0.0     205401
30.0     17373
2.0      16495
5.0      14149
10.0     105

In [9]:
# Columns whose values are the strings "Yes" / "No"
yesno = [
    "HeartDisease",
    "Smoking",
    "AlcoholDrinking",
    "Stroke",
    "DiffWalking",
    "PhysicalActivity",
    "Asthma",
    "KidneyDisease",
    "SkinCancer",
]

# Value mapping applied to each of those columns: Yes -> 1, No -> 0
to_replace = {"Yes": 1, "No": 0}

# Build a nested {column: mapping} dict so that only the listed columns are
# touched, then apply every per-column replacement in a single call
per_column_mapping = {name: to_replace for name in yesno}
df.replace(per_column_mapping, inplace=True)

df.head()

Unnamed: 0,HeartDisease,BMI,Smoking,AlcoholDrinking,Stroke,PhysicalHealth,MentalHealth,DiffWalking,Sex,AgeCategory,Race,Diabetic,PhysicalActivity,GenHealth,SleepTime,Asthma,KidneyDisease,SkinCancer
0,0,16.6,1,0,0,3.0,30.0,0,Female,55-59,White,Yes,1,Very good,5.0,1,0,1
1,0,20.34,0,0,1,0.0,0.0,0,Female,80 or older,White,No,1,Very good,7.0,0,0,0
2,0,26.58,1,0,0,20.0,30.0,0,Male,65-69,White,Yes,1,Fair,8.0,1,0,0
3,0,24.21,0,0,0,0.0,0.0,0,Female,75-79,White,No,0,Good,6.0,0,0,1
4,0,23.71,0,0,0,28.0,0.0,1,Female,40-44,White,No,1,Very good,8.0,0,0,0


## Use one-hot encoding for the remaining categorical variables

In [10]:
# String-valued columns that still need to be turned into numeric features
categorical = ["Sex", "AgeCategory", "Race", "Diabetic", "GenHealth"]

# Expand each listed column into indicator columns named "<column>_<level>".
# drop_first=True leaves out the first level of every column, so that level
# is represented by all of the column's indicators being 0.
encoded_df = pd.get_dummies(
    data=df,
    columns=categorical,
    prefix=categorical,
    drop_first=True,
)

encoded_df.head()

Unnamed: 0,HeartDisease,BMI,Smoking,AlcoholDrinking,Stroke,PhysicalHealth,MentalHealth,DiffWalking,PhysicalActivity,SleepTime,...,Race_Hispanic,Race_Other,Race_White,"Diabetic_No, borderline diabetes",Diabetic_Yes,Diabetic_Yes (during pregnancy),GenHealth_Fair,GenHealth_Good,GenHealth_Poor,GenHealth_Very good
0,0,16.6,1,0,0,3.0,30.0,0,1,5.0,...,0,0,1,0,1,0,0,0,0,1
1,0,20.34,0,0,1,0.0,0.0,0,1,7.0,...,0,0,1,0,0,0,0,0,0,1
2,0,26.58,1,0,0,20.0,30.0,0,1,8.0,...,0,0,1,0,1,0,1,0,0,0
3,0,24.21,0,0,0,0.0,0.0,0,0,6.0,...,0,0,1,0,0,0,0,1,0,0
4,0,23.71,0,0,0,28.0,0.0,1,1,8.0,...,0,0,1,0,0,0,0,0,0,1


## Create a random forest model with "HeartDisease" as the target variable

In [13]:
# Target variable: the HeartDisease column, binarised earlier as 0 = No, 1 = Yes
target = encoded_df["HeartDisease"]

# Human-readable class labels, indexed by the encoded class value:
# position 0 names class 0 (no heart disease), position 1 names class 1.
# The order matters when these are passed to per-class reports.
target_names = ["No Heart Disease", "Heart Disease"]

target.head()

0    0
1    0
2    0
3    0
4    0
Name: HeartDisease, dtype: int64

In [14]:
# Feature matrix: every encoded column except the target
data = encoded_df.drop(columns=["HeartDisease"])

# Column labels of the feature matrix
feature_names = data.columns

data.head()

Unnamed: 0,BMI,Smoking,AlcoholDrinking,Stroke,PhysicalHealth,MentalHealth,DiffWalking,PhysicalActivity,SleepTime,Asthma,...,Race_Hispanic,Race_Other,Race_White,"Diabetic_No, borderline diabetes",Diabetic_Yes,Diabetic_Yes (during pregnancy),GenHealth_Fair,GenHealth_Good,GenHealth_Poor,GenHealth_Very good
0,16.6,1,0,0,3.0,30.0,0,1,5.0,1,...,0,0,1,0,1,0,0,0,0,1
1,20.34,0,0,1,0.0,0.0,0,1,7.0,0,...,0,0,1,0,0,0,0,0,0,1
2,26.58,1,0,0,20.0,30.0,0,1,8.0,1,...,0,0,1,0,1,0,1,0,0,0
3,24.21,0,0,0,0.0,0.0,0,0,6.0,0,...,0,0,1,0,0,0,0,1,0,0
4,23.71,0,0,0,28.0,0.0,1,1,8.0,0,...,0,0,1,0,0,0,0,0,0,1


In [15]:
# Hold out part of the data for evaluation using train_test_split
from sklearn.model_selection import train_test_split

X = data
# Reshape the target into a single-column 2-D array of shape (n_rows, 1)
y = target.to_numpy().reshape(-1, 1)

print(X.shape, y.shape)

# No test_size is given, so scikit-learn's default split proportion applies;
# random_state pins the shuffle so the same rows land in each split every run.
# NOTE(review): the classes are imbalanced, so consider stratify=y.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
)

(319795, 37) (319795, 1)


In [16]:
# Create, fit, and score a Random Forest Classifier

from sklearn.ensemble import RandomForestClassifier

# 200 trees. random_state seeds the bootstrap sampling and feature
# sub-sampling so the fitted model and its score are the same on every
# Restart & Run All; n_jobs=-1 builds the trees on all available cores and
# does not change the result.
rf = RandomForestClassifier(n_estimators=200, random_state=42, n_jobs=-1)

# .ravel() flattens the (n, 1) target array to the 1-D shape fit() expects
rf = rf.fit(X_train, y_train.ravel())

# Mean accuracy on the held-out split.
# NOTE: the classes are heavily imbalanced (292,422 "No" vs 27,373 "Yes" in
# the full data, i.e. about 91.4% "No"), so always predicting "No" already
# scores roughly that much. Read this accuracy against that baseline rather
# than in isolation.
rf.score(X_test, y_test)

0.9037261254049457