In [2]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import math

## PREPROCESSING

In [20]:
# Category features
cont_cols  = ['age', 'chol', 'oldpeak', 'thalch', 'trestbps']
cat_cols = ['ca', 'cp', 'restecg', 'slope', 'thal', 'sex', 'fbs', 'exang']

#read training data, normalize continous features using (X - mean) / std
df = pd.read_csv('heart_disease_uci.csv')

# Create a new dataframe from original and change all target variables greater than 0 to 1, instead of 1,2,3,4,5
df_fix = df
df_fix['num'] = np.where(df_fix['num'] > 0, 1, 0)

# Seperate features (X) and target (y)
X = df_fix.drop(columns=['id', 'num', 'dataset'])
y = df_fix['num']

# Replace missing data in categorical columns with unknown
for cols in cat_cols:
    X[cols] = X[cols].fillna('Unknown')

# One-hot encoding for categorical columns
X = pd.get_dummies(X, columns=cat_cols)

# Convert continuous column values to float
X[cont_cols] = X.loc[:, cont_cols].astype('float64')

# find the mean and standard deviation of each continuous column
train_means = X[cont_cols].mean()
train_std = X[cont_cols].std()

# replace any missing value in a continuous column with the mean of that column
X.loc[:, cont_cols] = X[cont_cols].fillna(train_means)

# Normalize data
X.loc[:, cont_cols] = (X.loc[:, cont_cols] - train_means) / train_std


## NAIVE BAYES

In [23]:
# shuffling data before splitting for randomized train, valid, test sets
shuffled = df_fix.sample(frac=1, random_state=42).reset_index(drop=True)

# separate data into training and testing sets
train_split = int(0.6 * len(X))
X_train = X[:train_split]
X_test = X[train_split:]

y_train = y[:train_split]
y_test = y[train_split:]

In [14]:
# NB algorithm

#calculate catogorical feature probability
def calcCatProb(x, p):
    """
    x: feature value (0 or 1 since one-hot encoded)
    p: probability of feature=1 given class
    """
    return p if x == 1 else (1 - p)
    
#calculate continous feature probability using Maximum Likelihood estimator for Gaussian Distribution
def calcGaussProb(x, mean, var):
    return (1 / np.sqrt(2 * np.pi * var)) * np.exp(- (x - mean) ** 2 / (2 * var))

In [27]:
#naive bayes classifier
def naiveBayes(X_train, y_train):
    model = {} #store all learned parameters
    classes = np.unique(y_train)

    # where y_train is equal to expected result, assign X_train row to X_c, after looping through all the classes
    # returns model as the fully trained dictionary
    for c in classes:
        X_c = X_train[y_train == c]
        model[c] = {
            "prior": len(X_c) / len(X_train),
            "mean": X_c[cont_cols].mean(),
            "var": X_c[cont_cols].var() + 1e-6,
            "cat_probs": {}
        }

        # categorical probs (Bernoulli for one-hot encoded columns)
        for col in X_train.drop(columns=cont_cols).columns:
            model[c]["cat_probs"][col] = X_c[col].mean()

    return model

def predictSingle(row, model):
    posteriors = {}
    for c, params in model.items():
        # Start with log prior
        log_prob = np.log(params["prior"])
        
        # Continuous features (Gaussian)
        for col in cont_cols:
            mean = params["mean"][col]
            var = params["var"][col]
            prob = calcGaussProb(row[col], mean, var)
            log_prob += np.log(prob + 1e-9)

        # Categorical features (Bernoulli)
        for col, p in params["cat_probs"].items():
            prob = calcCatProb(row[col], p)
            log_prob += np.log(prob + 1e-9)

        posteriors[c] = log_prob

    return max(posteriors, key=posteriors.get)

def naiveBayesPredict(X_test, model):
    return np.array([predictSingle(row, model) for _, row in X_test.iterrows()])


In [28]:
# Print results of Naive Bayes classifier

def printConfusionMatrix(tp, fp, tn, fn):
    print("\n%15sActual" % "")
    print("%6s %7s %7s" % ("", "1", "0"))
    print("P%6s +--------+--------+" % "")
    print("r%6s | %-6s | %-6s |" % ("1", 'TP='+str(tp), 'FP='+str(fp)))
    print("e%6s +--------+--------+" % "")
    print("d%6s | %-6s | %-6s |" % ("0", 'FN='+str(fn), 'TN='+str(tn)))
    print(".%6s +--------+--------+\n" % "")

def getConfusionMatrix(y_true, y_pred):
    tp = np.sum((y_true == 1) & (y_pred == 1))
    tn = np.sum((y_true == 0) & (y_pred == 0))
    fp = np.sum((y_true == 0) & (y_pred == 1))
    fn = np.sum((y_true == 1) & (y_pred == 0))
    return tp, fp, tn, fn

def getAccuracy(tp, fp, tn, fn):
    return (tp + tn) / (tp + tn + fp + fn) 

def getPrecision(tp, fp, tn, fn):
    return tp / (tp + fp + 1e-9)

def getRecall(tp, fp, tn, fn):
    return tp / (tp + fn + 1e-9) 

def getFScore(tp, fp, tn, fn):
    precision = getPrecision(tp, fp, tn, fn)
    recall = getRecall(tp, fp, tn, fn)
    return 2 * precision * recall / (precision + recall + 1e-9) 

model = naiveBayes(X_train, y_train)
y_pred = naiveBayesPredict(X_test, model)

tp, fp, tn, fn = getConfusionMatrix(y_test.values, y_pred)
printConfusionMatrix(tp, fp, tn, fn)

print('Accuracy:  %8.5f' % getAccuracy(tp, fp, tn, fn))
print('Precison:  %8.5f' % getPrecision(tp, fp, tn, fn))
print('Recall:    %8.5f' % getRecall(tp, fp, tn, fn))
print('F-Score: %8.5f' % getFScore(tp, fp, tn, fn))


               Actual
             1       0
P       +--------+--------+
r     1 | TP=209 | FP=16  |
e       +--------+--------+
d     0 | FN=100 | TN=43  |
.       +--------+--------+

Accuracy:   0.68478
Precison:   0.92889
Recall:     0.67638
F-Score:  0.78277
