In [57]:
# import usual libraries for machine learing and data science
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report

# import naive bayes classifier
from sklearn.naive_bayes import GaussianNB, MultinomialNB, CategoricalNB

# Prepare the data

In [66]:
import os

file_list = os.listdir("Dataset")
file_list = [file.replace(".csv", "") for file in file_list]

# put file names in file_list that have world splitTrain to file_list_train
file_list_train = [file for file in file_list if "splitTrain" in file]
file_list_test = [file for file in file_list if "splitTest" in file]

data_train = {}
for file in file_list_train:
    if "stdScaled" in file:
        continue
    data_train[file.replace("splitTrain_", "")] = pd.read_csv("Dataset/" + file + ".csv")

data_test = {}
for file in file_list_test:
    if "stdScaled" in file:
        continue
    data_test[file.replace("splitTest_", "")] = pd.read_csv("Dataset/" + file + ".csv")


# Fungsi

In [79]:
# import f1 score metric from sklearn
from sklearn.metrics import f1_score, accuracy_score, confusion_matrix, roc_auc_score, precision_score, recall_score
from sklearn.model_selection import KFold, cross_val_score

# function to get dictionary of f1 score prediction for each data train and data test using KNeighborsClassifier
def get_score(data_train, data_test, modelNB):
    performanceData = {}
    for key in data_train:
        try:
          X_train = data_train[key].drop(["HeartDisease"], axis=1)
          y_train = data_train[key]["HeartDisease"]
          X_test = data_test[key].drop(["HeartDisease"], axis=1)
          y_test = data_test[key]["HeartDisease"]

          model = modelNB()
          model.fit(X_train, y_train)
          y_pred = model.predict(X_test)
        except:
          continue
        
        try:
          performanceData[key] = {'f1_score' : f1_score(y_test, y_pred)}
          performanceData[key]['accuracy'] = accuracy_score(y_test, y_pred)
          performanceData[key]['confusion_matrix'] = confusion_matrix(y_test, y_pred)
          performanceData[key]['roc_auc_score'] = roc_auc_score(y_test, y_pred)
          performanceData[key]['precision_score'] = precision_score(y_test, y_pred)
          performanceData[key]['recall_score'] = recall_score(y_test, y_pred)
        except:
          print('error masukkan data')
        
    return performanceData

# function to cross validate the model using KFold
def cross_validate(data_train, modelNB):
    performanceData = {}
    for key in data_train:
        # try:
          X_train = data_train[key].drop(["HeartDisease"], axis=1)
          y_train = data_train[key]["HeartDisease"]

          kf = KFold(n_splits=10, shuffle=True)
          scores = cross_val_score(modelNB(), X_train, y_train, cv=kf, scoring="f1_macro")
          performanceData[key] = scores.mean()

        # except:
        #   continue
            
    return performanceData

# Comparing already made models

### Gaussian Naive Bayes

In [68]:
list_f1score_gnb = get_score(data_train, data_test, GaussianNB)

In [69]:
# create dataframe from list_f1score_gnb, sort by f1 score and then display it
df_f1score_gnb = pd.DataFrame.from_dict(list_f1score_gnb, orient="index")
df_f1score_gnb

Unnamed: 0,f1_score,accuracy,confusion_matrix,roc_auc_score,precision_score,recall_score
df_encoded_minmaxScaled,0.847458,0.804348,"[[48, 29], [7, 100]]",0.778978,0.775194,0.934579
df_modifiedOutlier_encoded_minmaxScaled,0.819444,0.823129,"[[62, 13], [13, 59]]",0.823056,0.819444,0.819444
df_deleteOutlier_encoded_minmaxScaled,0.844444,0.851064,"[[63, 11], [10, 57]]",0.851049,0.838235,0.850746
df_DF_encoded_minmaxScaled,0.843882,0.798913,"[[47, 30], [7, 100]]",0.772485,0.769231,0.934579
df_DF_modifiedOutlier_encoded_minmaxScaled,0.819444,0.823129,"[[62, 13], [13, 59]]",0.823056,0.819444,0.819444
df_DF_deleteOutlier_encoded_minmaxScaled,0.827068,0.836879,"[[63, 11], [12, 55]]",0.836123,0.833333,0.820896


In [70]:
cross_validate_gnb = cross_validate(data_train, GaussianNB)

In [71]:
df_cross_validate_gnb = pd.DataFrame.from_dict(cross_validate_gnb, orient="index")
df_cross_validate_gnb

Unnamed: 0,0
df_encoded_minmaxScaled,0.836955
df_modifiedOutlier_encoded_minmaxScaled,0.838177
df_deleteOutlier_encoded_minmaxScaled,0.823587
df_DF_encoded_minmaxScaled,0.834178
df_DF_modifiedOutlier_encoded_minmaxScaled,0.824047
df_DF_deleteOutlier_encoded_minmaxScaled,0.817903


### Multinomial Naive Bayes

In [72]:
list_f1score_mnb = get_score(data_train, data_test, MultinomialNB)

In [73]:
# create dataframe from list_f1score_mnb, sort by f1 score and then display it
df_f1score_mnb = pd.DataFrame.from_dict(list_f1score_mnb, orient="index")
df_f1score_mnb

Unnamed: 0,f1_score,accuracy,confusion_matrix,roc_auc_score,precision_score,recall_score
df_encoded_minmaxScaled,0.854626,0.820652,"[[54, 23], [10, 97]]",0.80392,0.808333,0.906542
df_modifiedOutlier_encoded_minmaxScaled,0.77027,0.768707,"[[56, 19], [15, 57]]",0.769167,0.75,0.791667
df_deleteOutlier_encoded_minmaxScaled,0.834532,0.836879,"[[60, 14], [9, 58]]",0.838241,0.805556,0.865672
df_DF_encoded_minmaxScaled,0.834081,0.798913,"[[54, 23], [14, 93]]",0.785229,0.801724,0.869159
df_DF_modifiedOutlier_encoded_minmaxScaled,0.756757,0.755102,"[[55, 20], [16, 56]]",0.755556,0.736842,0.777778
df_DF_deleteOutlier_encoded_minmaxScaled,0.80597,0.815603,"[[61, 13], [13, 54]]",0.815147,0.80597,0.80597


In [74]:
cross_validate_mnb = cross_validate(data_train, MultinomialNB)
df_cross_validate_mnb = pd.DataFrame.from_dict(cross_validate_mnb, orient="index")
df_cross_validate_mnb

Unnamed: 0,0
df_encoded_minmaxScaled,0.799647
df_modifiedOutlier_encoded_minmaxScaled,0.804317
df_deleteOutlier_encoded_minmaxScaled,0.7909
df_DF_encoded_minmaxScaled,0.771773
df_DF_modifiedOutlier_encoded_minmaxScaled,0.780711
df_DF_deleteOutlier_encoded_minmaxScaled,0.781516


# Model from scratch

In [45]:
from collections import Counter

class GaussianNB_Classifier:
    def get_prior(self, data):
        """
        data : list data
        """
        n_data = len(data)
        prior = Counter(data)
        for key in prior.keys():
            prior[key] = prior[key] / n_data
        return prior

    def get_mean_and_std(self, data):
        list_columns = data.columns[:-1]
        class_column_name = data.columns[-1]
        list_class = set(data[class_column_name])

        mean = {}
        std = {}

        for column in list_columns:
            for a_class in list_class:
                mean[(column, a_class)] = np.mean(
                    data.loc[data[class_column_name] == a_class][column])
                std[(column, a_class)] = np.std(
                    data.loc[data[class_column_name] == a_class][column])

        return mean, std

    def get_gaussian_likelihood(self, data, mean, std):
        res = (1/np.sqrt(2*np.pi*(std**2)))
        res *= np.exp((-1*((data-mean)**2))/(2*(std**2)))

        return res

    def training_gaussianNB(self, X, y):
        X = X.join(y)
        prior = self.get_prior(y)
        mean, std = self.get_mean_and_std(X)

        list_class = set(y)
        list_columns = X.columns[:-1]

        model = {}
        model['prior'] = prior
        model['mean'] = mean
        model['std'] = std
        model['class'] = list_class
        model['columns'] = list_columns

        return model

    def fit(self, X, y):
        self.model = self.training_gaussianNB(X, y)

    def get_single_prediction(self, data):
        prior = self.model['prior']
        mean = self.model['mean']
        std = self.model['std']
        list_class = self.model['class']
        list_columns = self.model['columns']
        
        posterior = dict.fromkeys(list_class, 1)

        for a_class in list_class:
            for column in list_columns:    
                posterior[a_class] *= self.get_gaussian_likelihood(
                    data[column], mean[(column, a_class)], std[(column, a_class)])
            posterior[a_class] *= prior[a_class]

        kelas_uji = max(posterior, key=posterior.get)
        return kelas_uji

    def predict(self, X_test):
        y_pred = []
        for index, row in X_test.iterrows():
            y_pred.append(self.get_single_prediction(row))
        return np.array(y_pred)


In [76]:
list_gnb_scratch = get_score(data_train, data_test, GaussianNB_Classifier)
df_gnb_scratch = pd.DataFrame.from_dict(list_f1score_gnb_scratch, orient="index")
df_gnb_scratch

Unnamed: 0,f1_score,accuracy,confusion_matrix,roc_auc_score,precision_score,recall_score
df_encoded_minmaxScaled,0.847458,0.804348,"[[48, 29], [7, 100]]",0.778978,0.775194,0.934579
df_modifiedOutlier_encoded_minmaxScaled,0.819444,0.823129,"[[62, 13], [13, 59]]",0.823056,0.819444,0.819444
df_deleteOutlier_encoded_minmaxScaled,0.844444,0.851064,"[[63, 11], [10, 57]]",0.851049,0.838235,0.850746
df_DF_encoded_minmaxScaled,0.843882,0.798913,"[[47, 30], [7, 100]]",0.772485,0.769231,0.934579
df_DF_modifiedOutlier_encoded_minmaxScaled,0.819444,0.823129,"[[62, 13], [13, 59]]",0.823056,0.819444,0.819444
df_DF_deleteOutlier_encoded_minmaxScaled,0.827068,0.836879,"[[63, 11], [12, 55]]",0.836123,0.833333,0.820896


In [82]:

X = data_train['df_encoded_minmaxScaled'].drop(["HeartDisease"], axis=1)
y = data_train['df_encoded_minmaxScaled']["HeartDisease"]
kf = KFold(n_splits=5, random_state=0, shuffle=True)
model = GaussianNB_Classifier()
scoresAccuracy = []
scoresRecall = []
scoresPrecision = []
scoresF1 = []
for train_index, test_index in kf.split(X, y):
    x_train, x_test = X.iloc[list(train_index)], X.iloc[list(test_index)]
    Y_train, Y_test = y.iloc[list(train_index)], y.iloc[list(test_index)]
    x_train = (x_train-np.min(x_train)) / \
        (np.max(x_train)-np.min(x_train)).values
    x_test = (x_test-np.min(x_test))/(np.max(x_test)-np.min(x_test)).values
    model.fit(x_train, Y_train)
    y_pred_test = model.predict(x_test)
    test_data_accuracy = accuracy_score(Y_test, y_pred_test)
    test_data_recall = recall_score(Y_test, y_pred_test)
    test_data_precision = precision_score(Y_test, y_pred_test)
    test_data_f1_score = f1_score(Y_test, y_pred_test)
    scoresAccuracy.append(test_data_accuracy)
    scoresRecall.append(test_data_recall)
    scoresPrecision.append(test_data_precision)
    scoresF1.append(test_data_f1_score)

print('Accuracy Scores Test Data: %.2f (%.2f)' %
      (np.mean(scoresAccuracy)*100, np.std(scoresAccuracy)*100))
print('Recall Scores Test Data: %.2f (%.2f)' %
      (np.mean(scoresRecall)*100, np.std(scoresRecall)*100))
print('Precision Scores Test Data: %.2f (%.2f)' %
      (np.mean(scoresPrecision)*100, np.std(scoresPrecision)*100))
print('F1 Scores Test Data: %.2f (%.2f)' %
      (np.mean(scoresF1)*100, np.std(scoresF1)*100))


  return reduction(axis=axis, out=out, **passkwargs)
  return reduction(axis=axis, out=out, **passkwargs)
  return reduction(axis=axis, out=out, **passkwargs)
  return reduction(axis=axis, out=out, **passkwargs)
  return reduction(axis=axis, out=out, **passkwargs)
  return reduction(axis=axis, out=out, **passkwargs)
  return reduction(axis=axis, out=out, **passkwargs)
  return reduction(axis=axis, out=out, **passkwargs)
  return reduction(axis=axis, out=out, **passkwargs)
  return reduction(axis=axis, out=out, **passkwargs)


Accuracy Scores Test Data: 80.38 (2.89)
Recall Scores Test Data: 90.12 (4.74)
Precision Scores Test Data: 77.90 (5.60)
F1 Scores Test Data: 83.30 (2.75)
