In [1]:
# import the usual libraries for machine learning and data science
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
import os

# Base names (extension stripped) of every CSV file in the dataset folder.
# Only entries that really end in ".csv" are kept, because ".csv" is appended
# again when the file is read below; anything else would produce a wrong path.
# The list is sorted so the load order (and the key order of the dicts below)
# does not depend on the arbitrary ordering returned by os.listdir.
file_list = sorted(
    os.path.splitext(entry)[0]
    for entry in os.listdir("Dataset")
    if entry.endswith(".csv")
)

# put file names in file_list that have the word splitTrain into file_list_train,
# and those that have the word splitTest into file_list_test
file_list_train = [file for file in file_list if "splitTrain" in file]
file_list_test = [file for file in file_list if "splitTest" in file]


def _variant_key(file_name, split_marker):
    """Remove the split marker (with or without a trailing underscore) from a file name.

    The result is the dictionary key shared by the train and the test frame of
    one dataset variant.
    """
    return file_name.replace(split_marker + "_", "").replace(split_marker, "")


# Variant key -> training DataFrame.
data_train = {
    _variant_key(file, "splitTrain"): pd.read_csv(os.path.join("Dataset", file + ".csv"))
    for file in file_list_train
}

# Variant key -> test DataFrame (same keys as data_train when both files exist).
data_test = {
    _variant_key(file, "splitTest"): pd.read_csv(os.path.join("Dataset", file + ".csv"))
    for file in file_list_test
}

# AdaBoost

In [3]:
from sklearn.ensemble import AdaBoostClassifier
from sklearn.metrics import f1_score, accuracy_score, confusion_matrix, roc_auc_score, precision_score, recall_score

def get_ada_score(data_train, data_test):
  """Fit AdaBoost on every training frame and score it on the matching test frame.

  Parameters
  ----------
  data_train : dict
      Maps a dataset-variant name to its training DataFrame. Each frame must
      contain the target column "HeartDisease"; all other columns are features.
  data_test : dict
      Maps the same variant names to the corresponding test DataFrames.

  Returns
  -------
  dict
      variant name -> dict with the keys 'f1_score', 'accuracy',
      'confusion_matrix', 'roc_auc_score', 'precision_score' and
      'recall_score'. A variant that cannot be evaluated (for example because
      it has no test frame) is skipped with a warning and is absent from the
      result.
  """
  import warnings

  performance_score = {}
  for key in data_train:
    try:
      X_train = data_train[key].drop(["HeartDisease"], axis=1)
      y_train = data_train[key]["HeartDisease"]
      X_test = data_test[key].drop(["HeartDisease"], axis=1)
      y_test = data_test[key]["HeartDisease"]

      abc = AdaBoostClassifier(n_estimators=500, learning_rate=0.01, random_state=0)
      model = abc.fit(X_train, y_train)
      y_pred = model.predict(X_test)

      # Collect every metric first and store the entry only once all of them
      # succeeded, so a failure never leaves a half-filled row behind.
      # NOTE(review): roc_auc_score is computed from hard class predictions,
      # not from predicted probabilities.
      metrics = {
        'f1_score': f1_score(y_test, y_pred),
        'accuracy': accuracy_score(y_test, y_pred),
        'confusion_matrix': confusion_matrix(y_test, y_pred),
        'roc_auc_score': roc_auc_score(y_test, y_pred),
        'precision_score': precision_score(y_test, y_pred),
        'recall_score': recall_score(y_test, y_pred),
      }
    except Exception as exc:
      # Best effort: keep evaluating the other variants, but say which one
      # was dropped and why instead of hiding the error.
      warnings.warn(
        f"get_ada_score: skipping {key!r}: {type(exc).__name__}: {exc}",
        stacklevel=2,
      )
      continue
    performance_score[key] = metrics
  return performance_score

from sklearn.model_selection import KFold, cross_val_score

# function to cross validate the model using KFold
def cross_validate_ada(data_train, model):
    """Estimate the macro-averaged F1 of `model` on every training frame with 10-fold CV.

    Parameters
    ----------
    data_train : dict
        Maps a dataset-variant name to its training DataFrame. Each frame must
        contain the target column "HeartDisease"; all other columns are features.
    model : callable
        Estimator class (not an instance). It is called as
        ``model(n_estimators=500, learning_rate=0.01, random_state=0)`` to build
        the estimator that is cross-validated.

    Returns
    -------
    dict
        variant name -> mean "f1_macro" score over the 10 folds. A variant that
        cannot be evaluated is skipped with a warning and is absent from the
        result.
    """
    import warnings

    performanceData = {}
    for key in data_train:
        try:
            X_train = data_train[key].drop(["HeartDisease"], axis=1)
            y_train = data_train[key]["HeartDisease"]

            # Seed the shuffling as well: with shuffle=True and no random_state
            # the fold assignment, and therefore the score, changes on every run.
            kf = KFold(n_splits=10, shuffle=True, random_state=0)
            scores = cross_val_score(
                model(n_estimators=500, learning_rate=0.01, random_state=0),
                X_train,
                y_train,
                cv=kf,
                scoring="f1_macro",
            )
            performanceData[key] = scores.mean()
        except Exception as exc:
            # Best effort: keep going with the other variants, but report the
            # one that was dropped instead of hiding the error.
            warnings.warn(
                f"cross_validate_ada: skipping {key!r}: {type(exc).__name__}: {exc}",
                stacklevel=2,
            )
            continue

    return performanceData



In [4]:
# Hold-out evaluation: fit AdaBoost on each training variant and score it on the
# test frame stored under the same key.
ada_score = get_ada_score(data_train, data_test)

In [5]:
# One row per dataset variant, one column per metric; best F1 first, with ties
# on F1 broken by accuracy.
df_ada_score = (
    pd.DataFrame.from_dict(ada_score, orient="index")
    .sort_values(by=["f1_score", "accuracy"], ascending=False)
)
df_ada_score

Unnamed: 0,f1_score,accuracy,confusion_matrix,roc_auc_score,precision_score,recall_score
df_modifiedOutlier_encoded_minmaxScaled,0.875,0.882353,"[[104, 12], [14, 91]]",0.881609,0.883495,0.866667
df_modifiedOutlier_encoded_,0.869565,0.877828,"[[104, 12], [15, 90]]",0.876847,0.882353,0.857143
df_encoded_minmaxScaled,0.859756,0.833333,"[[89, 24], [22, 141]]",0.826321,0.854545,0.865031
df_encoded_,0.856269,0.82971,"[[89, 24], [23, 140]]",0.823253,0.853659,0.858896
df_DF_modifiedOutlier_encoded_minmaxScaled,0.854369,0.864253,"[[103, 13], [17, 88]]",0.863013,0.871287,0.838095
df_DF_modifiedOutlier_encoded_,0.852941,0.864253,"[[104, 12], [18, 87]]",0.862562,0.878788,0.828571
df_deleteOutlier_encoded_,0.84264,0.853081,"[[97, 13], [18, 83]]",0.8518,0.864583,0.821782
df_deleteOutlier_encoded_minmaxScaled,0.836735,0.848341,"[[97, 13], [19, 82]]",0.84685,0.863158,0.811881
df_DF_deleteOutlier_encoded_minmaxScaled,0.824742,0.838863,"[[97, 13], [21, 80]]",0.836949,0.860215,0.792079
df_DF_encoded_minmaxScaled,0.824615,0.793478,"[[85, 28], [29, 134]]",0.787149,0.82716,0.822086


In [6]:
# 10-fold cross-validation on the training frames only. The estimator class itself
# (not an instance) is passed; cross_validate_ada instantiates it.
cross_val_ada = cross_validate_ada(data_train, AdaBoostClassifier)

In [7]:
# One row per dataset variant holding its mean cross-validated score; the single
# unnamed column (0) is labelled "f1_score" and the rows are ordered best first.
df_cross_val_ada = (
    pd.DataFrame.from_dict(cross_val_ada, orient="index")
    .rename(columns={0: "f1_score"})
    .sort_values(by="f1_score", ascending=False)
)
df_cross_val_ada

Unnamed: 0,f1_score
df_encoded_,0.858415
df_encoded_minmaxScaled,0.856511
df_modifiedOutlier_encoded_,0.85027
df_modifiedOutlier_encoded_minmaxScaled,0.845809
df_deleteOutlier_encoded_minmaxScaled,0.844818
df_DF_deleteOutlier_encoded_minmaxScaled,0.842832
df_deleteOutlier_encoded_,0.842785
df_DF_deleteOutlier_encoded_,0.840946
df_DF_encoded_minmaxScaled,0.840294
df_DF_encoded_stdScaled_rounded,0.837666


# Gradient Boosting (scikit-learn's GradientBoostingClassifier, referred to as "xg" in the names below)

In [8]:
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import f1_score, accuracy_score, confusion_matrix, roc_auc_score, precision_score, recall_score

def get_xg_score(data_train, data_test):
  """Fit gradient boosting on every training frame and score it on the matching test frame.

  The model is scikit-learn's GradientBoostingClassifier.

  Parameters
  ----------
  data_train : dict
      Maps a dataset-variant name to its training DataFrame. Each frame must
      contain the target column "HeartDisease"; all other columns are features.
  data_test : dict
      Maps the same variant names to the corresponding test DataFrames.

  Returns
  -------
  dict
      variant name -> dict with the keys 'f1_score', 'accuracy',
      'confusion_matrix', 'roc_auc_score', 'precision_score' and
      'recall_score'. A variant that cannot be evaluated (for example because
      it has no test frame) is skipped with a warning and is absent from the
      result.
  """
  import warnings

  performance_score = {}
  for key in data_train:
    try:
      X_train = data_train[key].drop(["HeartDisease"], axis=1)
      y_train = data_train[key]["HeartDisease"]
      X_test = data_test[key].drop(["HeartDisease"], axis=1)
      y_test = data_test[key]["HeartDisease"]

      gbc = GradientBoostingClassifier(n_estimators=500, learning_rate=0.01, random_state=0)
      model = gbc.fit(X_train, y_train)
      y_pred = model.predict(X_test)

      # Collect every metric first and store the entry only once all of them
      # succeeded, so a failure never leaves a half-filled row behind.
      # NOTE(review): roc_auc_score is computed from hard class predictions,
      # not from predicted probabilities.
      metrics = {
        'f1_score': f1_score(y_test, y_pred),
        'accuracy': accuracy_score(y_test, y_pred),
        'confusion_matrix': confusion_matrix(y_test, y_pred),
        'roc_auc_score': roc_auc_score(y_test, y_pred),
        'precision_score': precision_score(y_test, y_pred),
        'recall_score': recall_score(y_test, y_pred),
      }
    except Exception as exc:
      # Best effort: keep evaluating the other variants, but say which one
      # was dropped and why instead of hiding the error.
      warnings.warn(
        f"get_xg_score: skipping {key!r}: {type(exc).__name__}: {exc}",
        stacklevel=2,
      )
      continue
    performance_score[key] = metrics
  return performance_score

from sklearn.model_selection import KFold, cross_val_score

# function to cross validate the model using KFold
def cross_validate_xg(data_train):
    """Estimate the macro-averaged F1 of gradient boosting on every training frame with 10-fold CV.

    The model is scikit-learn's GradientBoostingClassifier with
    n_estimators=500, learning_rate=0.01 and random_state=0.

    Parameters
    ----------
    data_train : dict
        Maps a dataset-variant name to its training DataFrame. Each frame must
        contain the target column "HeartDisease"; all other columns are features.

    Returns
    -------
    dict
        variant name -> mean "f1_macro" score over the 10 folds. A variant that
        cannot be evaluated is skipped with a warning and is absent from the
        result.
    """
    import warnings

    performanceData = {}
    for key in data_train:
        try:
            X_train = data_train[key].drop(["HeartDisease"], axis=1)
            y_train = data_train[key]["HeartDisease"]

            # Seed the shuffling as well: with shuffle=True and no random_state
            # the fold assignment, and therefore the score, changes on every run.
            kf = KFold(n_splits=10, shuffle=True, random_state=0)
            scores = cross_val_score(
                GradientBoostingClassifier(n_estimators=500, learning_rate=0.01, random_state=0),
                X_train,
                y_train,
                cv=kf,
                scoring="f1_macro",
            )
            performanceData[key] = scores.mean()
        except Exception as exc:
            # Best effort: keep going with the other variants, but report the
            # one that was dropped instead of hiding the error.
            warnings.warn(
                f"cross_validate_xg: skipping {key!r}: {type(exc).__name__}: {exc}",
                stacklevel=2,
            )
            continue

    return performanceData



In [9]:
# Hold-out evaluation: fit gradient boosting on each training variant and score it
# on the test frame stored under the same key.
xg_score = get_xg_score(data_train, data_test)

In [10]:
# One row per dataset variant, one column per metric; best F1 first, with ties
# on F1 broken by accuracy.
df_xg_score = (
    pd.DataFrame.from_dict(xg_score, orient="index")
    .sort_values(by=["f1_score", "accuracy"], ascending=False)
)
df_xg_score

Unnamed: 0,f1_score,accuracy,confusion_matrix,roc_auc_score,precision_score,recall_score
df_encoded_,0.89697,0.876812,"[[94, 19], [15, 148]]",0.869917,0.886228,0.907975
df_DF_encoded_,0.888218,0.865942,"[[92, 21], [16, 147]]",0.858,0.875,0.90184
df_modifiedOutlier_encoded_,0.883721,0.886878,"[[101, 15], [10, 95]]",0.887726,0.863636,0.904762
df_DF_modifiedOutlier_encoded_minmaxScaled,0.874419,0.877828,"[[100, 16], [11, 94]]",0.878654,0.854545,0.895238
df_DF_modifiedOutlier_encoded_,0.867925,0.873303,"[[101, 15], [13, 92]]",0.87344,0.859813,0.87619
df_deleteOutlier_encoded_,0.865672,0.872038,"[[97, 13], [14, 87]]",0.871602,0.87,0.861386
df_modifiedOutlier_encoded_minmaxScaled,0.861111,0.864253,"[[98, 18], [12, 93]]",0.865271,0.837838,0.885714
df_deleteOutlier_encoded_minmaxScaled,0.86,0.867299,"[[97, 13], [15, 86]]",0.866652,0.868687,0.851485
df_DF_deleteOutlier_encoded_,0.843137,0.848341,"[[93, 17], [15, 86]]",0.84847,0.834951,0.851485
df_DF_deleteOutlier_encoded_minmaxScaled,0.834951,0.838863,"[[91, 19], [15, 86]]",0.839379,0.819048,0.851485


In [11]:
# 10-fold cross-validation of gradient boosting on the training frames only.
cross_val_xg = cross_validate_xg(data_train)

In [12]:
# One row per dataset variant holding its mean cross-validated score; the single
# unnamed column (0) is labelled "f1_score" and the rows are ordered best first.
df_cross_val_xg = (
    pd.DataFrame.from_dict(cross_val_xg, orient="index")
    .rename(columns={0: "f1_score"})
    .sort_values(by="f1_score", ascending=False)
)
df_cross_val_xg

Unnamed: 0,f1_score
df_deleteOutlier_encoded_minmaxScaled,0.872085
df_DF_encoded_,0.867753
df_encoded_,0.866689
df_modifiedOutlier_encoded_minmaxScaled,0.864999
df_DF_encoded_minmaxScaled,0.864155
df_DF_encoded_stdScaled_rounded,0.863531
df_encoded_minmaxScaled,0.862779
df_modifiedOutlier_encoded_,0.860422
df_deleteOutlier_encoded_,0.858834
df_DF_modifiedOutlier_encoded_,0.849616
