In [None]:
#-------------------------------------------------------------------
# Author Wessel Olaf van Dam 09-30-2019
#
# Machine Learning script on practice dataset "cleveland_Heart_Disease.csv"
#-------------------------------------------------------------------

import pandas as pd
from pandas.plotting import scatter_matrix
import matplotlib.pyplot as plt
import scipy as sc
import numpy as np
import sklearn

# Load the Cleveland Heart Disease dataset (already preprocessed: 13 features and 1 target variable).
# The untouched frame is kept as data_raw so the cleaning step below can be re-run
# without reading the file again.
data_raw = pd.read_csv('cleveland_Heart_Disease.csv')

# Preview the first rows. A bare `data.head()` in the middle of a cell is silently
# discarded (only a cell's last expression is displayed), so print it explicitly.
print(data_raw.head())

# Drop every row that contains a missing value.
data = data_raw.dropna()

# The following 13 features are used to classify heart disease vs. no heart disease -------

# age: age in years 
# sex: sex (1 = male; 0 = female) 
# cp: chest pain type (1=typical angina, 2=atypical angina, 3=non-anginal pain, 4=asymptomatic) 
# trestbps: resting blood pressure (in mm Hg on admission to the hospital) 
# chol: serum cholesterol in mg/dl 
# fbs: fasting blood sugar > 120 mg/dl (1 = true; 0 = false) 
# restecg: resting electrocardiographic results (0 = normal;
#          1 = having ST-T wave abnormality (T wave inversions and/or ST elevation or depression of > 0.05 mV);
#          2 = showing probable or definite left ventricular hypertrophy by Estes' criteria)
# thalach: maximum heart rate achieved 
# exang: exercise induced angina (1 = yes; 0 = no) 
# oldpeak: ST depression induced by exercise relative to rest 
# slope: the slope of the peak exercise ST segment (1=upsloping, 2=flat, 3=downsloping) 
# ca: number of major vessels (0-3) colored by fluoroscopy 
# thal: 3=normal; 6=fixed defect; 7=reversible defect 

# The variable to be classified

# num: diagnosis of heart disease (angiographic disease status): 0 = < 50% diameter narrowing, 1 = > 50% diameter narrowing
# (in any major vessel: attributes 59 through 68 are vessels)

# Recode the target column: num == 0 becomes 'healthy', any other value becomes 'heart disease'.
# The labels are computed in a single vectorised step and assigned as a complete new
# column. Writing strings into the existing numeric column row-by-row with .loc is an
# incompatible-dtype assignment, which recent pandas versions deprecate.
# NOTE: run this once per fresh load. After recoding the column holds strings, so a
# second run would compare strings with 0 and label every row 'heart disease'.
data = data.assign(num=np.where(data['num'] != 0, 'heart disease', 'healthy'))

# Dataset overview: dimensions, summary statistics of the numeric columns, class balance
print(data.shape)
print(data.describe())
print(data.groupby('num').size())


In [None]:
#importing relevant libraries
from sklearn import model_selection
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC

#We will split the loaded dataset into two, 80% of which we will use to train our models 
#and 20% that we will hold back as a validation dataset. X_train and Y_train contain the
#training data, and X_validation and Y_validation will be used later to test the performance
#of our models

# Split-out validation dataset
# Columns 0-12 are the 13 features; column 13 is the target (num).
array = data.values
# When the frame mixes numeric features with a text target, `data.values` is a generic
# object array. Cast the feature slice to float so the estimators receive a proper
# numeric matrix (and a non-numeric feature value fails here, not deep inside a fit).
X = array[:, 0:13].astype(float)
Y = array[:, 13]

validation_size = 0.20  # fraction of rows held back for validation
seed = 7                # fixed seed so the split is reproducible
X_train, X_validation, Y_train, Y_validation = model_selection.train_test_split(
    X, Y, test_size=validation_size, random_state=seed)

# Per the original author's run: 297 observations altogether, so 80% = 237 training rows
print(X_train.shape)
# ... and 20% of 297 = 60 validation rows
print(X_validation.shape)

In [None]:
# Test options and evaluation metric
seed = 7
scoring = 'accuracy'

# Spot Check Algorithms: (short name, unfitted estimator) pairs.
# LogisticRegression no longer passes multi_class='ovr': the parameter is deprecated in
# recent scikit-learn, and the liblinear solver is one-vs-rest anyway.
models = [
    ('LR', LogisticRegression(solver='liblinear')),
    ('LDA', LinearDiscriminantAnalysis()),
    ('KNN', KNeighborsClassifier()),
    ('CART', DecisionTreeClassifier()),
    ('NB', GaussianNB()),
    ('SVM', SVC(gamma='auto')),
]

# One 10-fold splitter shared by all models. shuffle=True is required for random_state
# to be accepted: scikit-learn >= 0.24 raises a ValueError when random_state is set
# while shuffle is False. With an integer seed every model sees the same folds.
kfold = model_selection.KFold(n_splits=10, shuffle=True, random_state=seed)

# evaluate each model in turn
results = []  # per-model arrays of the 10 fold accuracies
names = []    # model short names, in the same order as results

# Here we print the mean accuracy (and standard deviation) of each of the 6 models
# to see which one is most accurate
for name, model in models:
    cv_results = model_selection.cross_val_score(model, X_train, Y_train, cv=kfold, scoring=scoring)
    results.append(cv_results)
    names.append(name)
    msg = "%s: %f (%f)" % (name, cv_results.mean(), cv_results.std())
    print(msg)

# In the original author's run Gaussian Naive Bayes was the best algorithm with a mean
# accuracy of 84.4%. Check the printed scores above: the folds are now shuffled, so the
# exact figures can differ slightly from that run.

# Box plot of the cross-validation accuracies, one box per algorithm
fig, ax = plt.subplots()
fig.suptitle('Algorithm Comparison')
ax.boxplot(results)
ax.set_xticklabels(names)
plt.show()

In [None]:
#The NB algorithm is very simple and was an accurate model based on our tests. 
#Now we want to get an idea of the accuracy of the model on our validation set.

#We can run the NB model directly on the validation set and summarize the results as a final accuracy score, 
#a confusion matrix and a classification report.

# Fit Gaussian Naive Bayes on the training split and predict the held-out validation split
nb = GaussianNB().fit(X_train, Y_train)
predictions = nb.predict(X_validation)

# Report, in order: overall accuracy, confusion matrix, per-class precision/recall/f1
for metric in (accuracy_score, confusion_matrix, classification_report):
    print(metric(Y_validation, predictions))

# We can see that the accuracy is 0.833, or 83.3%. The confusion matrix shows
# the 10 errors made: 28/32 healthy subjects are correctly classified and
# 22/28 heart disease patients are correctly classified.

# Finally, the classification report provides a breakdown of each class by precision, recall, f1-score and 
# support showing excellent results (granted the validation dataset was small).
