# Naive Bayes implementation

This implementation is tested with the following example data sets:

- [arrhythmia](./data/arrhythmia.csv)
- [banknote](./data/banknote.csv)
- [forestfires](./data/forestfires.csv)
- [lung-cancer](./data/lung-cancer.csv)
- [phishing-websites](./data/phishing-websites.csv)
- [pima-indians-diabetes](./data/pima-indians-diabetes.csv)

The main source for the code is the following tutorial: [Naive Bayes Classifier From Scratch in Python](http://machinelearningmastery.com/naive-bayes-classifier-scratch-python/)

In [None]:
from argparse import ArgumentParser
from math import exp
from math import pi as PI
from math import sqrt

from numpy import mean, std
from sklearn.naive_bayes import GaussianNB

from utility import display, load_dataset, split_dataset

## Calculate the mean and standard deviation for each column in a dataset (the last column's summary is discarded)

In [None]:
def summarize(dataset):
    """Return per-column ``(mean, standard deviation)`` pairs for *dataset*.

    The rows are transposed with ``zip(*dataset)`` so that every column is
    summarised with numpy's ``mean`` and ``std``.  The pair computed for the
    last column is then discarded.

    NOTE(review): dropping the last column presumably assumes the class label
    is stored there -- confirm that the rows passed in really end with the
    label, otherwise the final attribute is silently ignored.
    """
    column_stats = []
    for column in zip(*dataset):
        column_stats.append((mean(column), std(column)))
    # Discard the statistics of the final column.
    del column_stats[-1]
    return column_stats

## Split the dataset by class values, returns a dictionary

In [None]:
def separate_by_class(dataset, target):
    """Group the rows of *dataset* by their matching entry in *target*.

    Row ``dataset[i]`` is filed under the key ``target[i]``.

    Returns a dictionary mapping each distinct target value to the list of
    rows carrying that value, in their original order.
    """
    grouped = {}
    for index, row in enumerate(dataset):
        label = target[index]
        grouped.setdefault(label, []).append(row)
    return grouped

## Split the dataset by class, then calculate statistics for each attribute (column)

In [None]:
def summarize_by_class(dataset, target):
    """Build the per-class attribute statistics.

    The rows are grouped with ``separate_by_class`` and every group is then
    passed to ``summarize``.

    Returns a dictionary mapping each class value to the list that
    ``summarize`` returns for the rows of that class.
    """
    return {
        label: summarize(rows)
        for label, rows in separate_by_class(dataset, target).items()
    }

## Calculate the Gaussian probability density function for x

In [None]:
def calculate_probability(x, mean, stdev):
    """Return the Gaussian probability density of *x*.

    Parameters:
        x: value at which the density is evaluated.
        mean: mean of the distribution.
        stdev: standard deviation of the distribution.

    Returns:
        ``exp(-(x - mean)**2 / (2 * stdev**2)) / (sqrt(2 * pi) * stdev)``,
        or ``0`` when ``stdev`` is zero, because the formula would divide
        by zero there.
    """
    # Only a zero standard deviation is degenerate.  A mean of zero describes
    # an ordinary distribution centred at the origin and must not be rejected.
    if stdev == 0:
        return 0
    exponent = exp(-((x - mean) ** 2 / (2 * stdev ** 2)))
    return (1 / (sqrt(2 * PI) * stdev)) * exponent

## Calculate the probabilities of predicting each class for a given row

In [None]:
def calculate_class_probabilities(summaries, input_vector):
    """Score *input_vector* against every class in *summaries*.

    For each class the score starts at 1 and is multiplied by the value of
    ``calculate_probability`` for every attribute that has a
    ``(mean, stdev)`` entry in that class's summaries.  Factors equal to
    zero are left out of the product.

    Returns a dictionary mapping each class value to its score.
    """
    scores = {}
    for class_value, class_summaries in summaries.items():
        product = 1
        for position, (mu, sigma) in enumerate(class_summaries):
            density = calculate_probability(input_vector[position], mu, sigma)
            if density == 0:
                # ignore zero probability
                continue
            product *= density
        scores[class_value] = product
    return scores

## Predict the class for a given row

In [None]:
def predict(summaries, input_vector):
    """Return the class whose score for *input_vector* is highest.

    Scores come from ``calculate_class_probabilities``.  When several classes
    share the highest score, the first one encountered is kept; ``None`` is
    returned when there are no classes at all.
    """
    scores = calculate_class_probabilities(summaries, input_vector)
    winner, top_score = None, -1
    for label, score in scores.items():
        # Skip candidates that do not strictly beat the current leader.
        if winner is not None and not score > top_score:
            continue
        winner, top_score = label, score
    return winner

## Calculate predictions

In [None]:
def get_predictions(summaries, test_set):
    """Return the predicted class for every row of *test_set*, in order.

    Each row is classified independently with ``predict`` using the given
    per-class *summaries*.
    """
    return [predict(summaries, test_set[row]) for row in range(len(test_set))]

## Test the algorithm

### Load and split data

In [None]:
# Load the banknote data set; load_dataset returns two values, unpacked here
# as the rows and their targets.
dataset, target = load_dataset("data/banknote.csv")
# Split into training and testing parts.
# NOTE(review): 0.8 is presumably the fraction of rows used for training --
# confirm against utility.split_dataset.
train_x, train_y, test_x, actual = split_dataset(dataset, target, 0.8)
print(f"Training set size: {len(train_x)}, Testing set size: {len(test_x)}")

### Using self-implementation

In [None]:
# prepare model: per-class (mean, stdev) statistics of the training data
summaries = summarize_by_class(train_x, train_y)
# test model: predict a class for every test row, then hand the actual and
# predicted values to the display helper
predictions = get_predictions(summaries, test_x)
display(actual, predictions)

### Using scikit-learn

In [None]:
# Fit scikit-learn's Gaussian naive Bayes on the same training split, then
# predict the test rows and show them next to the actual values.
gnb = GaussianNB()
gnb.fit(train_x, train_y)
y_pred = gnb.predict(test_x)
display(actual, y_pred)