# Heart Disease Prediction Model

## Read training data set

In [1]:
import pandas as pd
import numpy
 
# Load the heart-disease records from the data directory (path is relative
# to the notebook's working directory) and preview the first rows.
heart_csv = '../data/heart.csv'
df = pd.read_csv(heart_csv)
df.head()

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease
0,40,M,ATA,140,289,0,Normal,172,N,0.0,Up,0
1,49,F,NAP,160,180,0,Normal,156,N,1.0,Flat,1
2,37,M,ATA,130,283,0,ST,98,N,0.0,Up,0
3,48,F,ASY,138,214,0,Normal,108,Y,1.5,Flat,1
4,54,M,NAP,150,195,0,Normal,122,N,0.0,Up,0


## Pre-Processing
* Replace missing values (entries marked `?`) with the most frequent value of their column
* Rename columns (no special characters or spaces) to simplify further processing

In [2]:
# Handle missing values in the data.
# Entries marked "?" are treated as missing and replaced with the mode (most
# frequent value) of their column. There are many other ways to impute missing
# values, but for this type of dataset the mode seemed the most suitable.
col_names = df.columns

# A single DataFrame-wide replace already covers every column, so no
# per-column loop is needed. `numpy.nan` is used because the `numpy.NaN`
# alias was removed in NumPy 2.0.
df = df.replace("?", numpy.nan)


def _fill_with_mode(column):
    """Return `column` with missing entries replaced by its most frequent value.

    `value_counts()` ignores missing entries and orders values by descending
    frequency, so the first index label is the mode. A column that has no
    non-missing values has no mode and is returned unchanged instead of
    raising an IndexError.
    """
    counts = column.value_counts()
    if counts.empty:
        return column
    return column.fillna(counts.index[0])


df = df.apply(_fill_with_mode)

# To rename columns use this (`replace` changes cell values, not column labels):
# df = df.rename(columns={'Column1_oldname': 'Column1_newname',
#                         'Column2_oldname': 'Column2_newname'})

* Convert categorical data values into numerical values to fit the data into the prediction model

In [3]:
# The estimators used below work on numeric inputs only, so every categorical
# column is replaced by integer codes. The label -> code mapping of each
# column is collected in `mapping_dict` and printed, so the encoding can be
# looked up (and reproduced) later.

from sklearn import preprocessing

category_col = ['Sex', 'ChestPainType', 'RestingECG', 'ExerciseAngina', 'ST_Slope']
mapping_dict = {}

# One encoder object is refitted for each column in turn; after the loop it
# only remembers the classes of the last column, which is why the mappings
# are captured inside the loop.
labelEncoder = preprocessing.LabelEncoder()

for col in category_col:
    codes = labelEncoder.fit_transform(df[col])
    df[col] = codes
    labels = labelEncoder.classes_
    le_name_mapping = {
        label: code
        for label, code in zip(labels, labelEncoder.transform(labels))
    }
    mapping_dict[col] = le_name_mapping

print(mapping_dict)

{'Sex': {'F': 0, 'M': 1}, 'ChestPainType': {'ASY': 0, 'ATA': 1, 'NAP': 2, 'TA': 3}, 'RestingECG': {'LVH': 0, 'Normal': 1, 'ST': 2}, 'ExerciseAngina': {'N': 0, 'Y': 1}, 'ST_Slope': {'Down': 0, 'Flat': 1, 'Up': 2}}


## Fitting the Model

In [4]:
# The pre-processed frame is now fully numeric and can be fed to an estimator.
# Steps in this cell:
#   1. slice the value matrix into attributes (X) and labels (y),
#   2. hold out 40% of the rows for testing with train_test_split()
#      (test_size=0.4, i.e. a 60/40 split, with a fixed seed),
#   3. standardise the features with statistics learned from the training rows only,
#   4. fit a random forest and report its accuracy on both partitions.
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score

# features: the first 11 columns of the frame
X = df.values[:, :11]

# outcome: the 12th column (presumably HeartDisease -- relies on column order)
y = df.values[:, 11]

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.4, random_state=90
)

# The scaler is fitted on the training partition and then applied to both
# partitions, so no test-set statistics leak into training.
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled, X_test_scaled = scaler.transform(X_train), scaler.transform(X_test)

rfclf = RandomForestClassifier(n_estimators=100, random_state=43)
rfclf.fit(X_train_scaled, y_train)

train_score = rfclf.score(X_train_scaled, y_train)
test_score = rfclf.score(X_test_scaled, y_test)
print(f'Training Score: {train_score}')
print(f'Testing Score: {test_score}')


Training Score: 1.0
Testing Score: 0.8804347826086957


In [5]:
# Save random forest classifier model
import pickle

# Pickle serializes objects so they can be saved to a file and loaded in a
# program again later on. The model is written to 'rfcl.pkl' in the current
# working directory.
# The `with` block guarantees the file handle is flushed and closed even if
# pickling fails, instead of leaving an open handle behind.
# NOTE(review): rfclf was fitted on StandardScaler-transformed features, so any
# program that loads this file must apply the same scaling before predicting;
# consider persisting `scaler` alongside the model.
# Only unpickle this file from a trusted location: pickle.load can execute
# arbitrary code.
with open('rfcl.pkl', 'wb') as model_file:
    pickle.dump(rfclf, model_file)


In [6]:

from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score

# Baseline for comparison: a shallow decision tree on the unscaled features.
# The split uses the same test_size and random_state as the random forest cell.
X_dt_train, X_dt_test, y_dt_train, y_dt_test = train_test_split(
    X, y, test_size=0.4, random_state=90
)

# Depth and leaf-size limits keep the tree small; the Gini index is the
# split criterion.
dtclf_gini = DecisionTreeClassifier(
    criterion="gini",
    max_depth=5,
    min_samples_leaf=5,
    random_state=150,
)

# fit() returns the estimator itself, so fitting and predicting can be chained.
y_pred_gini = dtclf_gini.fit(X_dt_train, y_dt_train).predict(X_dt_test)

# Note: the training score is printed as a fraction, the test accuracy as a percentage.
print("Decision Tree using Gini Index")
print("Training Score: ", dtclf_gini.score(X_dt_train, y_dt_train))
print("Accuracy is ", accuracy_score(y_dt_test, y_pred_gini) * 100)

Decision Tree using Gini Index
Training Score:  0.8945454545454545
Accuracy is  83.15217391304348
