In [2]:
# import libraries used
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.decomposition import PCA
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.metrics import roc_auc_score


In [3]:
# import data as pandas table
# https://www.kaggle.com/fedesoriano/heart-failure-prediction
data_table = pd.read_csv("data/heart.csv")
data_table.head()


Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease
0,40,M,ATA,140,289,0,Normal,172,N,0.0,Up,0
1,49,F,NAP,160,180,0,Normal,156,N,1.0,Flat,1
2,37,M,ATA,130,283,0,ST,98,N,0.0,Up,0
3,48,F,ASY,138,214,0,Normal,108,Y,1.5,Flat,1
4,54,M,NAP,150,195,0,Normal,122,N,0.0,Up,0


In [4]:
# encode non-numerical data types

#data_table.Sex.unique() # column 2
#data_table.ChestPainType.unique() # column 3
#data_table.RestingECG.unique() # column 7
#data_table.ExerciseAngina.unique() # column 9
#data_table.ST_Slope.unique() # column 11

data_table2 = pd.get_dummies(data_table, columns=['Sex', 'ChestPainType', 'RestingECG', 'ExerciseAngina', 'ST_Slope'], drop_first=True)

data_table2.head()

Unnamed: 0,Age,RestingBP,Cholesterol,FastingBS,MaxHR,Oldpeak,HeartDisease,Sex_M,ChestPainType_ATA,ChestPainType_NAP,ChestPainType_TA,RestingECG_Normal,RestingECG_ST,ExerciseAngina_Y,ST_Slope_Flat,ST_Slope_Up
0,40,140,289,0,172,0.0,0,1,1,0,0,1,0,0,0,1
1,49,160,180,0,156,1.0,1,0,0,1,0,1,0,0,1,0
2,37,130,283,0,98,0.0,0,1,1,0,0,0,1,0,0,1
3,48,138,214,0,108,1.5,1,0,0,0,0,1,0,1,1,0
4,54,150,195,0,122,0.0,0,1,0,1,0,1,0,0,0,1


In [5]:
# Separate features from labels
X = data_table2.drop("HeartDisease",axis='columns')
y = data_table2.HeartDisease

X.head()

Unnamed: 0,Age,RestingBP,Cholesterol,FastingBS,MaxHR,Oldpeak,Sex_M,ChestPainType_ATA,ChestPainType_NAP,ChestPainType_TA,RestingECG_Normal,RestingECG_ST,ExerciseAngina_Y,ST_Slope_Flat,ST_Slope_Up
0,40,140,289,0,172,0.0,1,1,0,0,1,0,0,0,1
1,49,160,180,0,156,1.0,0,0,1,0,1,0,0,1,0
2,37,130,283,0,98,0.0,1,1,0,0,0,1,0,0,1
3,48,138,214,0,108,1.5,0,0,0,0,1,0,1,1,0
4,54,150,195,0,122,0.0,1,0,1,0,1,0,0,0,1


In [6]:
# Complete baseline Random Forest Classification

# Split train and test data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=30)
print("Train array shape: ")
print(X_train.shape)
print("Test array shape: ")
print(X_test.shape)

# fit random forest classifier on data
model_rf = RandomForestClassifier()
model_rf.fit(X_train, y_train)
baseline_acc_test = model_rf.score(X_test, y_test)

print("No feature reduction test accuracy: " + baseline_acc_test.astype('str'))

Train array shape: 
(734, 15)
Test array shape: 
(184, 15)
No feature reduction test accuracy: 0.8695652173913043


In [7]:
# Complete Random Forest Classification with LDA

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4)


lda = LinearDiscriminantAnalysis()
lda.fit(X_train, y_train)
X_lda_train = lda.transform(X_train)
X_train2 = np.hstack((X_train, X_lda_train))

X_lda_test = lda.transform(X_test)
X_test2 = np.hstack((X_test, X_lda_test))

# fit random forest classifier on data
model_rf = RandomForestClassifier()
model_rf.fit(X_train2, y_train)
lda_acc_test = model_rf.score(X_test2, y_test)
print("LDA test accuracy: " + lda_acc_test.astype('str'))

lda.fit(X_train2, y_train)
roc_score = roc_auc_score(y_train, lda.predict_proba(X_train2)[:,1])
print("LDA ROC Score accuracy: " + roc_score.astype('str'))


LDA test accuracy: 0.8668478260869565
LDA ROC Score accuracy: 0.920972928257882


In [8]:
# Complete Random Forest Classification with PCA

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4)


pca = PCA(n_components=12) # highest 12
pca.fit(X_train, y_train)

X_pca_train = pca.transform(X_train)
X_train2 = np.hstack((X_train, X_pca_train))

X_pca_test = pca.transform(X_test)
X_test2 = np.hstack((X_test, X_pca_test))

# fit random forest classifier on data
model_rf = RandomForestClassifier()
model_rf.fit(X_train2, y_train)
pca_acc_test = model_rf.score(X_test2, y_test)
print("PCA test accuracy: " + pca_acc_test.astype('str'))

pca.fit(X_train2, y_train)
roc_score = roc_auc_score(y_train, np.exp(pca.score_samples(X_train2)))
print("PCA ROC Score accuracy: " + roc_score.astype('str'))

PCA test accuracy: 0.8858695652173914
PCA ROC Score accuracy: 0.40102807328996704


In [9]:
# Complete Random Forest Classification with PCA + LDA

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4)

pca = PCA(n_components=12) # highest 12
lda = LinearDiscriminantAnalysis()
lda.fit(X_train, y_train)
pca.fit(X_train)

X_lda_train = lda.transform(X_train)
X_pca_train = pca.transform(X_train)
X_train2 = np.hstack((X_pca_train, X_lda_train))

X_lda_test = lda.transform(X_test)
X_pca_test = pca.transform(X_test)
X_test2 = np.hstack((X_pca_test, X_lda_test))

# fit random forest classifier on data
model_rf = RandomForestClassifier()
model_rf.fit(X_train2, y_train)
pcalda_acc_test = model_rf.score(X_test2, y_test)
print("PCA + LDA test accuracy: " + pcalda_acc_test.astype('str'))

lda.fit(X_train2, y_train)
roc_score = roc_auc_score(y_train, lda.predict_proba(X_train2)[:,1])
print("PCA + LDA ROC Score accuracy: " + roc_score.astype('str'))

PCA + LDA test accuracy: 0.8940217391304348
PCA + LDA ROC Score accuracy: 0.9254193171405426
