In [1]:
import numpy as np
import pandas as pd
import warnings
warnings.filterwarnings("ignore")

df = pd.read_csv("heart.csv").dropna()

# df[["Sex","ChestPainType","RestingECG","ExerciseAngina","ST_Slope"]] = \
#     encoded_data[:,0],encoded_data[:,1],encoded_data[:,2],encoded_data[:,3],encoded_data[:,4]


# preprocessing data

# remove outliers using Z score
df.describe()
# df = df[(df["Cholesterol"] <= df["Cholesterol"].mean() + 3*df["Cholesterol"].std())]

from scipy.stats import zscore
zscores = np.abs(zscore(df[["Cholesterol","FastingBS","MaxHR","Oldpeak"]]))
threshold = 3
df_filt = df[(zscores < threshold)]
df_filt

# convert text columns to numbers using one hot encoding
from sklearn.preprocessing import OneHotEncoder
ohe = OneHotEncoder(sparse = False)
df_filt["Sex"] = ohe.fit_transform(df[["Sex"]])
df_filt["ChestPainType"] = ohe.fit_transform(df[["ChestPainType"]])
df_filt["RestingECG"] = ohe.fit_transform(df[["RestingECG"]])
df_filt["ExerciseAngina"] = ohe.fit_transform(df[["ExerciseAngina"]])
df_filt["ST_Slope"] = ohe.fit_transform(df[["ST_Slope"]])
df_filt["Age"] = df["Age"]
df_filt["RestingBP"] = df["RestingBP"]
df_filt["HeartDisease"] = df["HeartDisease"]
df_filt

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease
0,40,0.0,0.0,140,289.0,0,0.0,172.0,1.0,0.0,0.0,0
1,49,1.0,0.0,160,180.0,0,0.0,156.0,1.0,1.0,0.0,1
2,37,0.0,0.0,130,283.0,0,0.0,98.0,1.0,0.0,0.0,0
3,48,1.0,1.0,138,214.0,0,0.0,108.0,0.0,1.5,0.0,1
4,54,0.0,0.0,150,195.0,0,0.0,122.0,1.0,0.0,0.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...
913,45,0.0,0.0,110,264.0,0,0.0,132.0,1.0,1.2,0.0,1
914,68,0.0,1.0,144,193.0,1,0.0,141.0,1.0,3.4,0.0,1
915,57,0.0,1.0,130,131.0,0,0.0,115.0,0.0,1.2,0.0,1
916,57,1.0,0.0,130,236.0,0,1.0,174.0,1.0,0.0,0.0,1


In [2]:
# bulid a classification model
df_filt = df_filt.dropna()
x = df_filt.drop(columns = ["HeartDisease"])
y = df_filt["HeartDisease"]

from sklearn.model_selection import train_test_split
x_train,x_test,y_train,y_test = train_test_split(x,y,test_size = 0.2)

# Logistic Regression
from sklearn.linear_model import LogisticRegression
lr = LogisticRegression()
lr.fit(x_train,y_train)
lr_score = lr.score(x_test,y_test)
print(f"The score of Logistic Regression is {round(lr_score,4)}")

from sklearn.linear_model import Lasso
lasso_reg = Lasso(alpha = 50,max_iter = 100,tol = 0.1)
lasso_reg.fit(x_train,y_train)
lasso_score = lasso_reg.score(x_test,y_test)
print(f"The score of Lasso Regression is {round(lasso_score,4)}")

# CART - Decision Tree Classifier
from sklearn.tree import DecisionTreeClassifier
dt = DecisionTreeClassifier()
dt.fit(x_train,y_train)
dt_score = dt.score(x_test,y_test)
print(f"The score of Decision Tree Classifier is {round(dt_score,4)}")

# K Nearest Neighbors Classification(KNN)
from sklearn.neighbors import KNeighborsClassifier
knn = KNeighborsClassifier()
knn.fit(x_train,y_train)
knn_score = knn.score(x_test,y_test)
print(f"The score of K Nearest Neighbors Classification is {round(knn_score,4)}")

# Support Vector Machine (SVM)
from sklearn.svm import SVC
svc = SVC(gamma = "auto")
svc.fit(x_train,y_train)
svc_score = svc.score(x_test,y_test)
print(f"The score of Support Vector Machine is {round(svc_score,4)}")

# Random Forest Classifier
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier(n_estimators = 5)
rf.fit(x_train,y_train)
rf_score = rf.score(x_test,y_test)
print(f"The score of Random Forest Classifier is {round(rf_score,4)}")

The score of Logistic Regression is 0.8736
The score of Lasso Regression is -0.0067
The score of Decision Tree Classifier is 0.7033
The score of K Nearest Neighbors Classification is 0.6593
The score of Support Vector Machine is 0.5989
The score of Random Forest Classifier is 0.8077


In [9]:
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.metrics import mean_squared_error
# using for loop to replace the previous operations

models = []
models.append(("LR",LogisticRegression()))
models.append(("Lasso",Lasso()))
models.append(("CART",DecisionTreeClassifier()))
models.append(("KNN",KNeighborsClassifier()))
models.append(("SVM",SVC()))
models.append(("FRC",RandomForestClassifier()))

num_folds = 4
seed = 2023

names = []
kfold_results = []
score_results = []
train_results = []
test_results = []

for name,model in models:
    names.append(name)
    kfold = KFold(n_splits = num_folds,random_state = seed,shuffle = True)
    cv_results = \
        (-1)*cross_val_score(model,x_train,y_train,
                             cv = kfold,scoring = "neg_mean_squared_error")
    kfold_results.append(cv_results)
    res = model.fit(x_train,y_train)
    score = res.score(x_test,y_test)
    score_results.append(score)
    
    train_result = mean_squared_error(res.predict(x_train),y_train)
    train_results.append(train_result)
    
    test_result = mean_squared_error(res.predict(x_test),y_test)
    test_results.append(test_result)
    
    message =  "%s:  %f  %f  (%f)  %f  %f" % (name, score, cv_results.mean(), cv_results.std(), 
                                                                        train_result, test_result)
    print(message)
    print()

LR:  0.802198  0.168265  (0.005857)  0.166897  0.197802

Lasso:  -0.001860  0.247309  (0.002206)  0.247032  0.249376

CART:  0.692308  0.285479  (0.015997)  0.000000  0.307692

KNN:  0.851648  0.208275  (0.014793)  0.150345  0.148352

SVM:  0.840659  0.188984  (0.015520)  0.136552  0.159341

FRC:  0.807692  0.195837  (0.016467)  0.000000  0.192308



In [11]:
# use PCA to reduce dimension
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
x_scaled = scaler.fit_transform(x)

from sklearn.decomposition import PCA
pca = PCA(0.8)
x_pca = pca.fit_transform(x_scaled)

x_pca_train,x_pca_test,y_train,y_test = train_test_split(x_pca,y,test_size = 0.2)

num_folds = 4
seed = 2023

names = []
kfold_results = []
train_results = []
test_results = []

for name,model in models:
    names.append(name)
    kfold = KFold(n_splits = num_folds,random_state = seed,shuffle = True)
    cv_results = \
        (-1)*cross_val_score(model,x_pca_train,y_train,
                             cv = kfold,scoring = "neg_mean_squared_error")
    kfold_results.append(cv_results)
    res = model.fit(x_pca_train,y_train)
    score = res.score(x_pca_test,y_test)
    score_results.append(score)
    
    train_result = mean_squared_error(res.predict(x_pca_train),y_train)
    train_results.append(train_result)
    
    test_result = mean_squared_error(res.predict(x_pca_test),y_test)
    test_results.append(test_result)
    
    message =  "%s:  %f  %f  (%f)  %f  %f" % (name, score, cv_results.mean(), cv_results.std(), 
                                                                        train_result, test_result)
    print(message)
    print()

LR:  0.818681  0.184832  (0.012721)  0.176552  0.181319

Lasso:  -0.000243  0.248652  (0.003426)  0.247325  0.248128

CART:  0.736264  0.248338  (0.046000)  0.000000  0.263736

KNN:  0.807692  0.190388  (0.021886)  0.147586  0.192308

SVM:  0.857143  0.184855  (0.017068)  0.155862  0.142857

FRC:  0.796703  0.173836  (0.021502)  0.000000  0.203297

