In [1]:
# import usual libraries for machine learing and data science
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
import os

file_list = os.listdir("Dataset")
file_list = [file.replace(".csv", "") for file in file_list]

# put file names in file_list that have world splitTrain to file_list_train
file_list_train = [file for file in file_list if "splitTrain" in file]
file_list_test = [file for file in file_list if "splitTest" in file]

data_train = {}
for file in file_list_train:
    data_train[file.replace("splitTrain_", "").replace("splitTrain", "")] = pd.read_csv("Dataset/" + file + ".csv")

data_test = {}
for file in file_list_test:
    data_test[file.replace("splitTest_", "").replace("splitTest", "")] = pd.read_csv("Dataset/" + file + ".csv")

# AdaBoost

In [13]:
from sklearn.ensemble import AdaBoostClassifier
from sklearn.metrics import f1_score, accuracy_score, confusion_matrix, roc_auc_score, precision_score, recall_score

def get_ada_score(data_train, data_test):
  performance_score = {}
  for key in data_train:
    try:
      X_train = data_train[key].drop(["HeartDisease"], axis=1)
      y_train = data_train[key]["HeartDisease"]
      X_test = data_test[key].drop(["HeartDisease"], axis=1)
      y_test = data_test[key]["HeartDisease"]

      abc = AdaBoostClassifier(n_estimators=500, learning_rate=0.01, random_state=0)
      model = abc.fit(X_train, y_train)
      y_pred = model.predict(X_test)

      performance_score[key] = {'f1_score' : f1_score(y_test, y_pred)}
      performance_score[key]['accuracy'] = accuracy_score(y_test, y_pred)
      performance_score[key]['confusion_matrix'] = confusion_matrix(y_test, y_pred)
      performance_score[key]['roc_auc_score'] = roc_auc_score(y_test, y_pred)
      performance_score[key]['precision_score'] = precision_score(y_test, y_pred)
      performance_score[key]['recall_score'] = recall_score(y_test, y_pred)
    except:
      continue
  return performance_score

from sklearn.model_selection import KFold, cross_val_score

# function to cross validate the model using KFold
def cross_validate_ada(data_train, model):
    performanceData = {}
    for key in data_train:
      try:
        X_train = data_train[key].drop(["HeartDisease"], axis=1)
        y_train = data_train[key]["HeartDisease"]

        kf = KFold(n_splits=10, shuffle=True)
        scores = cross_val_score(model(n_estimators=500, learning_rate=0.01, random_state=0), X_train, y_train, cv=kf, scoring="f1_macro")
        performanceData[key] = scores.mean()
      except:
        continue
            
    return performanceData



In [4]:
ada_score = get_ada_score(data_train, data_test)

In [12]:
df_ada_score = pd.DataFrame.from_dict(ada_score, orient='index')
df_ada_score.sort_values(by=["f1_score", "accuracy"], ascending=[False, False], inplace=True)
df_ada_score

Unnamed: 0,f1_score,accuracy,confusion_matrix,roc_auc_score,precision_score,recall_score
df_encoded_,0.873874,0.847826,"[[59, 18], [10, 97]]",0.836388,0.843478,0.906542
df_DF_encoded_,0.847926,0.820652,"[[59, 18], [15, 92]]",0.813023,0.836364,0.859813
df_DF_modifiedOutlier_encoded_minmaxScaled,0.817518,0.829932,"[[66, 9], [16, 56]]",0.828889,0.861538,0.777778
df_deleteOutlier_encoded_minmaxScaled,0.81203,0.822695,"[[62, 12], [13, 54]]",0.821904,0.818182,0.80597
df_modifiedOutlier_encoded_,0.808824,0.823129,"[[66, 9], [17, 55]]",0.821944,0.859375,0.763889
df_deleteOutlier_encoded_,0.808824,0.815603,"[[60, 14], [12, 55]]",0.815853,0.797101,0.820896
df_DF_deleteOutlier_encoded_minmaxScaled,0.80292,0.808511,"[[59, 15], [12, 55]]",0.809096,0.785714,0.820896
df_modifiedOutlier_encoded_minmaxScaled,0.80292,0.816327,"[[65, 10], [17, 55]]",0.815278,0.846154,0.763889
df_DF_deleteOutlier_encoded_,0.8,0.808511,"[[60, 14], [13, 54]]",0.80839,0.794118,0.80597
df_DF_modifiedOutlier_encoded_,0.785185,0.802721,"[[65, 10], [19, 53]]",0.801389,0.84127,0.736111


In [14]:
cross_val_ada = cross_validate_ada(data_train, AdaBoostClassifier)

In [15]:
df_cross_val_ada = pd.DataFrame.from_dict(cross_val_ada, orient='index')
df_cross_val_ada.rename(columns={0: "f1_score"}, inplace=True)
df_cross_val_ada.sort_values(by=["f1_score"], ascending=[False], inplace=True)
df_cross_val_ada

Unnamed: 0,f1_score
df_encoded_minmaxScaled,0.867755
df_encoded_,0.863994
df_DF_encoded_stdScaled_rounded,0.851468
df_DF_deleteOutlier_encoded_minmaxScaled,0.847696
df_modifiedOutlier_encoded_,0.847517
df_deleteOutlier_encoded_,0.847195
df_DF_encoded_,0.847111
df_DF_deleteOutlier_encoded_,0.844089
df_deleteOutlier_encoded_minmaxScaled,0.841844
df_modifiedOutlier_encoded_minmaxScaled,0.841703


# XG Boost

In [16]:
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import f1_score, accuracy_score, confusion_matrix, roc_auc_score, precision_score, recall_score

def get_xg_score(data_train, data_test):
  performance_score = {}
  for key in data_train:
    try:
      X_train = data_train[key].drop(["HeartDisease"], axis=1)
      y_train = data_train[key]["HeartDisease"]
      X_test = data_test[key].drop(["HeartDisease"], axis=1)
      y_test = data_test[key]["HeartDisease"]

      abc = GradientBoostingClassifier(n_estimators=500, learning_rate=0.01, random_state=0)
      model = abc.fit(X_train, y_train)
      y_pred = model.predict(X_test)

      performance_score[key] = {'f1_score' : f1_score(y_test, y_pred)}
      performance_score[key]['accuracy'] = accuracy_score(y_test, y_pred)
      performance_score[key]['confusion_matrix'] = confusion_matrix(y_test, y_pred)
      performance_score[key]['roc_auc_score'] = roc_auc_score(y_test, y_pred)
      performance_score[key]['precision_score'] = precision_score(y_test, y_pred)
      performance_score[key]['recall_score'] = recall_score(y_test, y_pred)
    except:
      continue
  return performance_score

from sklearn.model_selection import KFold, cross_val_score

# function to cross validate the model using KFold
def cross_validate_xg(data_train):
    performanceData = {}
    for key in data_train:
      try:
        X_train = data_train[key].drop(["HeartDisease"], axis=1)
        y_train = data_train[key]["HeartDisease"]

        kf = KFold(n_splits=10, shuffle=True)
        scores = cross_val_score(GradientBoostingClassifier(n_estimators=500, learning_rate=0.01, random_state=0), X_train, y_train, cv=kf, scoring="f1_macro")
        performanceData[key] = scores.mean()
      except:
        continue
            
    return performanceData



In [17]:
xg_score = get_xg_score(data_train, data_test)

In [18]:
df_xg_score = pd.DataFrame.from_dict(xg_score, orient='index')
df_xg_score.sort_values(by=["f1_score", "accuracy"], ascending=[False, False], inplace=True)
df_xg_score

Unnamed: 0,f1_score,accuracy,confusion_matrix,roc_auc_score,precision_score,recall_score
df_encoded_,0.87156,0.847826,"[[61, 16], [12, 95]]",0.840029,0.855856,0.88785
df_DF_encoded_,0.866359,0.842391,"[[61, 16], [13, 94]]",0.835356,0.854545,0.878505
df_deleteOutlier_encoded_,0.857143,0.865248,"[[65, 9], [10, 57]]",0.864562,0.863636,0.850746
df_DF_deleteOutlier_encoded_,0.846715,0.851064,"[[62, 12], [9, 58]]",0.851755,0.828571,0.865672
df_deleteOutlier_encoded_minmaxScaled,0.844444,0.851064,"[[63, 11], [10, 57]]",0.851049,0.838235,0.850746
df_modifiedOutlier_encoded_,0.841379,0.843537,"[[63, 12], [11, 61]]",0.843611,0.835616,0.847222
df_DF_deleteOutlier_encoded_minmaxScaled,0.82963,0.836879,"[[62, 12], [11, 56]]",0.836829,0.823529,0.835821
df_DF_modifiedOutlier_encoded_minmaxScaled,0.825175,0.829932,"[[63, 12], [13, 59]]",0.829722,0.830986,0.819444
df_DF_modifiedOutlier_encoded_,0.817518,0.829932,"[[66, 9], [16, 56]]",0.828889,0.861538,0.777778
df_modifiedOutlier_encoded_minmaxScaled,0.816901,0.823129,"[[63, 12], [14, 58]]",0.822778,0.828571,0.805556


In [19]:
cross_val_xg = cross_validate_xg(data_train)

In [20]:
df_cross_val_xg = pd.DataFrame.from_dict(cross_val_xg, orient='index')
df_cross_val_xg.rename(columns={0: "f1_score"}, inplace=True)
df_cross_val_xg.sort_values(by=["f1_score"], ascending=[False], inplace=True)
df_cross_val_xg

Unnamed: 0,f1_score
df_modifiedOutlier_encoded_minmaxScaled,0.883787
df_modifiedOutlier_encoded_,0.872604
df_encoded_minmaxScaled,0.871642
df_DF_encoded_minmaxScaled,0.863388
df_encoded_,0.86264
df_deleteOutlier_encoded_minmaxScaled,0.861876
df_deleteOutlier_encoded_,0.856082
df_DF_encoded_,0.856079
df_DF_modifiedOutlier_encoded_,0.853513
df_DF_encoded_stdScaled_rounded,0.845656
