# Gaussian Naive Bayes Classifier from scratch in numpy

We will start by importing the necessary libraries

In [1]:
import numpy as np
import pandas as pd
import math
import warnings
warnings.filterwarnings('ignore')

## Overview of the algorithm :-

Naive bayes is a simple algorithm in which we find the distribution of y given x i.e. p(y|x) by bayes rule :-

p(y|x) = (p(x|y) * p(y))/(p(x))

Also the algorithm has a strong assumption that the x's are conditionally independent given y.

p(y) is the prior of the classes which can be extracted from the dataset.

Also, we will use gaussian distribution as the distribution of attributes in our dataset.

We will use Pima Indians diabetes dataset for classification task. Dataset can be found here: https://raw.githubusercontent.com/jbrownlee/Datasets/master/pima-indians-diabetes.data.csv

In [2]:
df = pd.read_csv('pima-indians-diabetes.data.csv',delimiter=',',header = None)
df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [3]:
def data_split(data,ratio=0.8):
    data_train = data.iloc[:int(ratio*data.shape[0]),:]
    data_test  = data.iloc[int(ratio*data.shape[0]):,:]
    return data_train,data_test

In [4]:
def label_split(data,ratio=0.8):
    data_train = data[:int(ratio*data.shape[0])]
    data_test  = data[int(ratio*data.shape[0]):]
    return data_train,data_test

Functions to calculate the mean and standard deviation of features wrt. classes and as a whole is required for our model as we will be using normal distribution.
Naive Bayes makes use of these summary statistics to predict the final probability of the classes given our example.

In [5]:
def split_data_by_class(data):
    data_class_0 = []
    data_class_1 = []
    random_row = []
    for i in range(len(data)):
        random_row = list(data[i])
        if(random_row[-1] == 0):
            data_class_0.append(random_row)
        else:
            data_class_1.append(random_row)
    data_class_0 = np.array(data_class_0)
    data_class_1 = np.array(data_class_1)
    return data_class_0 , data_class_1

In [6]:
def get_mean(array_nos):
    return sum(array_nos)/float(len(array_nos))

In [7]:
def std_dev(array_nos):
    mean_val = get_mean(array_nos)
    val = [pow(x-mean_val,2)/float(len(array_nos)-1) for x in array_nos]
    return math.sqrt(sum(val))

In [8]:
def summary_features(dataset):
    summary = [[get_mean(feature),std_dev(feature)] for feature in zip(*dataset)]
    del summary[-1]
    return np.array(summary)

In [9]:
def gaussian_prob(x,mean,dev):
    return (1.0/(pow(2*math.pi,0.5)*dev)) * (math.exp(-math.pow(x-mean,2))/(2*math.pow(dev,2)))

In [10]:
def classwise_probability(data_class_0,data_class_1,in_vector):
    summarize_0 = summary_features(data_class_0)
    summarize_1 = summary_features(data_class_1)
    probabilities_0 = 1
    probabilities_1 = 1
    for i in range(len(summarize_0)):
        mean , stdev = summarize_0[i]
        probabilities_0 *= gaussian_prob(in_vector[i],mean,stdev) 
    for i in range(len(summarize_1)):
        mean , stdev = summarize_1[i]
        probabilities_1 *= gaussian_prob(in_vector[i],mean,stdev)
    return probabilities_0,probabilities_1    

In [11]:
def predict(data_class_0,data_class_1,in_vector):
    probab_0 , probab_1 = classwise_probability(data_class_0,data_class_1,in_vector)
    if(probab_0 >= probab_1):
        return 0
    else:
        return 1

In [12]:
def get_predictions(data_class_0,data_class_1,test_data):
    predictions = []
    for i in range(len(test_data)):
        label = predict(data_class_0,data_class_1,test_data[i])
        predictions.append(label)
    predictions = np.array(predictions)
    return predictions

In [13]:
def evaluation(data_class_0,data_class_1,test_data,test_label):
    accuracy = 0
    predictions = get_predictions(data_class_0,data_class_1,test_data)
    for i in range(len(test_data)):
        if(predictions[i] == test_label[i]):
            accuracy += 1
    return accuracy/len(test_data)        

In [14]:
data_train , data_test = data_split(df,0.6)

In [15]:
test_label = data_test.iloc[:,8:] 

In [16]:
data_test = data_test.iloc[:,:8]

In [17]:
data_train = data_train.as_matrix()
data_test = data_test.as_matrix()
test_label = test_label.as_matrix()

In [18]:
data_train_0 , data_train_1 = split_data_by_class(data_train)

In [19]:
evaluation(data_train_0,data_train_1,data_test,test_label)

0.6948051948051948

So finally we have our accuracy as 69.48% which is higher than the baseline i.e. 65% for this dataset with just basic Gaussian Naive Bayes classifier.