# Gaussian Naive Bayes Classifier

In [1]:
import sys, os
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
import numpy as np
import pandas as pd
sys.path.insert(0, os.getcwd() + '/../data_tools')
from sklearn_pandas_converter import sklearn_to_df

## Load Data

In [2]:
# Fetch the iris bunch from scikit-learn and hand it to the local converter,
# then preview the first five rows of the resulting frame.
datask = datasets.load_iris()
data = sklearn_to_df(datask)
data.head(n=5)

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),target
0,5.1,3.5,1.4,0.2,0
1,4.9,3.0,1.4,0.2,0
2,4.7,3.2,1.3,0.2,0
3,4.6,3.1,1.5,0.2,0
4,5.0,3.6,1.4,0.2,0


## Process Data

In [3]:
# Separate the class label column from the feature columns
y = data['target']
x = data.drop(columns=['target'])
# Hold out 15% of the rows for testing; the fixed seed keeps the split repeatable
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    train_size=0.85,
    test_size=0.15,
    random_state=0,
)

## Building the Model

In [4]:
class gaussian_naive_bayes:
    """Gaussian Naive Bayes classifier built on NumPy.

    Every feature is modelled, per class, as an independent normal
    distribution whose mean and variance are estimated from the training data.

    Attributes:
        num_classes: Number of distinct class labels seen by the last ``fit``.
        classes: Sorted array of the distinct class labels. Position ``k`` is
            the label stored under class number ``k`` in ``params``.
        params: Array with one row per (class, feature) pair and the columns
            ``class number, class probability, feature number, mean, variance``.
            It is an empty list until ``fit`` has been called.
    """

    def __init__(self):
        self.num_classes = 0
        self.classes = np.array([])
        self.params = []

    def p_x_given_y(self, x, mean, var):
        """Return the normal density p(FEATURE|CLASS) for each feature value.

        Args:
            x: Feature values of one test instance.
            mean: Per-feature means of the class being evaluated.
            var: Per-feature variances of the class being evaluated.

        Returns:
            The element-wise Gaussian density of ``x`` under ``mean``/``var``.
        """
        p = 1/(np.sqrt(2*np.pi*var)) * np.exp((-(x-mean)**2)/(2*var))
        return p

    def calc_posteriors(self, features_test):
        """Return the pseudo-posterior of every class for one test instance.

        This is the numerator of Bayes' theorem:
        P(CLASS) * P(FEATURE1|CLASS) * ... * P(FEATUREN|CLASS). The denominator
        is the same for every class, so it is left out; the results are not
        probabilities but rank the classes the same way.

        Args:
            features_test: Feature values of a single test instance.

        Returns:
            A list with one pseudo-posterior per class, ordered by class number.
        """
        probs = []
        for i in range(self.num_classes):
            sub_probs = self.params[self.params[:, 0] == i]
            probs_by_feature = self.p_x_given_y(features_test, sub_probs[:, 3], sub_probs[:, 4])
            # The prior has to come from this class's own rows. The table has
            # one row per (class, feature) pair, so row i is generally not the
            # first row of class i.
            prior = sub_probs[0, 1]
            probs.append(prior * np.prod(probs_by_feature))
        return probs

    def fit(self, features, target, verbose=False):
        """Estimate class priors and per-feature Gaussian parameters.

        Calling ``fit`` again replaces the parameters of the previous fit.

        Args:
            features: 2-D array-like of shape (instances, features).
            target: 1-D array-like with the class label of each instance.
                Labels may be any values ``np.unique`` can sort.
            verbose: When True, print the fitted parameter table.

        Returns:
            The string ``'FIT COMPLETE'``.

        Raises:
            ValueError: If ``target`` is empty.
        """
        # Converts input to numpy array if it is given in a different form
        features = np.array(features)
        target = np.array(target)
        if len(target) == 0:
            raise ValueError('cannot fit on an empty training set')

        # Class probability calculations. Class number k refers to
        # unique_classes[k], so labels do not have to be 0..n-1.
        unique_classes, counts = np.unique(target, return_counts=True)
        self.classes = unique_classes
        self.num_classes = len(unique_classes)
        class_probs = counts / len(target)

        # Calculate parameters for each (feature, class) combination, under the
        # assumption of independence and normal distributions. Parameters are
        # the mean and variance of each (feature, class) pair.
        num_feats = features.shape[1]
        feat_numbers = np.arange(num_feats)
        class_arrays = []
        for class_num, label in enumerate(unique_classes):
            feats = features[target == label]
            mean = np.mean(feats, axis=0)
            variance = np.var(feats, axis=0)
            # array of format: class number, class prob, feature number, mean, variance
            class_arrays.append(np.column_stack((np.full(num_feats, class_num),
                                                 np.full(num_feats, class_probs[class_num]),
                                                 feat_numbers, mean, variance)))
        # Built from scratch on every call so a refit never appends to old rows
        self.params = np.concatenate(class_arrays, axis=0)

        # Prints calculations if verbose output desired
        if verbose:
            print('Class Number, Class Probability, Feature Number, Mean, Varience\n',
                  self.params, '\n')
        return 'FIT COMPLETE'

    def predict(self, features_test):
        """Predict the class label of every test instance.

        Args:
            features_test: 2-D array-like of shape (instances, features).

        Returns:
            A tuple ``(predictions, probs)``: the predicted label of each
            instance, and the list of per-class pseudo-posteriors of each
            instance.
        """
        features_test = np.array(features_test)
        predictions = []
        probs = []
        # Any number of test instances may be passed in; each is scored separately
        for sample in features_test:
            probs_by_class = self.calc_posteriors(sample)

            # Store the predicted label in one list and the class weights in another
            predictions.append(self.classes[np.argmax(probs_by_class)])
            probs.append(probs_by_class)

        return predictions, probs

## Training the Model

In [5]:
# Instantiate the from-scratch classifier (unfitted at this point)
clf = gaussian_naive_bayes()

In [6]:
# Fit on the training split and print the learned parameter table
clf.fit(features=x_train, target=y_train, verbose=True)

Class Number, Class Probability, Feature Number, Mean, Varience
 [[0.         0.33858268 0.         4.99767442 0.12999459]
 [0.         0.33858268 1.         3.40465116 0.13904813]
 [0.         0.33858268 2.         1.4627907  0.02838291]
 [0.         0.33858268 3.         0.24651163 0.01132504]
 [1.         0.30708661 0.         5.87179487 0.27279421]
 [1.         0.30708661 1.         2.75384615 0.10453649]
 [1.         0.30708661 2.         4.19487179 0.23946088]
 [1.         0.30708661 3.         1.31538462 0.04130178]
 [2.         0.35433071 0.         6.62888889 0.38472099]
 [2.         0.35433071 1.         2.98444444 0.10620247]
 [2.         0.35433071 2.         5.54888889 0.30072099]
 [2.         0.35433071 3.         2.02666667 0.06417778]] 





'FIT COMPLETE'

## Making Predictions

In [7]:
# Predicted classes plus the per-class pseudo-posterior weights for the test split
predictions, weights = clf.predict(features_test=x_test)

## Training and Predicting with Sklearn's Gaussian Naive Bayes Model

In [8]:
# Fit scikit-learn's GaussianNB on the same training split
model = GaussianNB()
model.fit(X=x_train, y=y_train)

GaussianNB(priors=None, var_smoothing=1e-09)

In [9]:
# scikit-learn's predictions for the same test split
sk_predictions = model.predict(X=x_test)

## Comparing Sklearn, Numpy-based Model, and Actual Values

In [10]:
# One row per test instance: both models' predictions next to the true label
comparison = pd.DataFrame(
    {
        'Scratch': predictions,
        'Sklearn': sk_predictions,
        'Actual': y_test,
    }
)

In [11]:
# Test instances the from-scratch model got wrong
comparison.loc[comparison['Scratch'].ne(comparison['Actual'])]

Unnamed: 0,Scratch,Sklearn,Actual
134,1,1,2


In [12]:
# Test instances the scikit-learn model got wrong
comparison.loc[comparison['Sklearn'].ne(comparison['Actual'])]

Unnamed: 0,Scratch,Sklearn,Actual
134,1,1,2


In [13]:
# Test instances on which the two models disagree with each other
comparison.loc[comparison['Scratch'].ne(comparison['Sklearn'])]

Unnamed: 0,Scratch,Sklearn,Actual


The NumPy-based naive Bayes classifier performs as well as the sklearn model on the iris dataset: both misclassify the same single test case.