# Gaussian Naive Bayes Classifier

In [513]:
import sys, os
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
import numpy as np
import pandas as pd
sys.path.insert(0, os.getcwd() + '/../data_tools')
from sklearn_pandas_converter import sklearn_to_df

## Load Data

In [514]:
# Load the iris dataset object bundled with scikit-learn
datask = datasets.load_iris()
# Convert it to a DataFrame with the project helper; the preview below
# shows the four measurement columns plus a 'target' column
data = sklearn_to_df(datask)
# Preview the first rows
data.head()

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),target
0,5.1,3.5,1.4,0.2,0
1,4.9,3.0,1.4,0.2,0
2,4.7,3.2,1.3,0.2,0
3,4.6,3.1,1.5,0.2,0
4,5.0,3.6,1.4,0.2,0


## Process Data

In [515]:
# Separate the feature columns from the class-label column
x = data.drop(columns=['target'])
y = data['target']

# Hold out 15% of the rows for testing; the fixed random_state keeps the
# split reproducible between runs
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    train_size=0.85,
    test_size=0.15,
    random_state=0,
)

# Only the training split is converted to numpy arrays; the test split is
# kept exactly as train_test_split returned it
x_train, y_train = np.array(x_train), np.array(y_train)

## Building the Model

In [520]:
class gaussian_naive_bayes:
    """Gaussian Naive Bayes classifier implemented with numpy and pandas.

    Every feature is modelled, per class, as an independent normal
    distribution whose mean and variance are estimated in ``fit``.

    Attributes:
        num_classes: Number of distinct class labels seen by ``fit``.
        classes: Sorted list of the distinct class labels; position ``i``
            lines up with ``class_probs[i]``.
        class_probs: Prior probability (relative frequency) of each class.
        feat_conditional_probs: After ``fit``, a DataFrame with one row per
            (class, feature) pair and the columns ``class``, ``feature``,
            ``mean`` and ``var``. Rows are ordered class-major,
            feature-minor. Before ``fit`` it is an empty list.
    """

    def __init__(self):
        self.num_classes = 0
        self.classes = []
        self.class_probs = []
        self.feat_conditional_probs = []

    def p_x_given_y(self, x, mean, var):
        """Return p(x | y): the normal density with ``mean``/``var`` at ``x``.

        Works on scalars and, through numpy broadcasting, on arrays.
        """
        return 1 / np.sqrt(2 * np.pi * var) * np.exp(-(x - mean) ** 2 / (2 * var))

    def calc_posterior(self, feature_probs, class_num):
        """Return the numerator of Bayes' theorem for one class.

        That is P(class) * P(feature_1 | class) * ... * P(feature_n | class).
        The denominator is the same for every class, so it can be ignored
        when only the most likely class is wanted.

        Args:
            feature_probs: Iterable of per-feature likelihoods.
            class_num: Position of the class in ``class_probs``.
        """
        posterior = self.class_probs[class_num]
        for prob in feature_probs:
            posterior *= prob
        return posterior

    def fit(self, features, target, verbose=False):
        """Estimate class priors and per-class feature means and variances.

        All learned state is rebuilt from scratch on every call, so a model
        can safely be refitted. Class labels may be any values that
        ``np.unique`` can sort; they do not have to be ``0..n-1``.

        Args:
            features: 2-D array-like of shape (n_samples, n_features).
            target: 1-D array-like of class labels, one per sample.
            verbose: When truthy, print the fitted statistics.

        Returns:
            The string ``'FIT COMPLETE'``.

        Raises:
            ValueError: If ``features`` is non-empty and not 2-D.
        """
        features = np.asarray(features, dtype=float)
        target = np.asarray(target)
        if features.size and features.ndim != 2:
            raise ValueError('features must be 2-D: (n_samples, n_features)')

        # Class priors: relative frequency of every label. Assigning (rather
        # than appending) discards whatever an earlier fit() left behind.
        unique_classes, counts = np.unique(target, return_counts=True)
        self.classes = unique_classes.tolist()
        self.num_classes = len(self.classes)
        self.class_probs = (counts / len(target)).tolist()

        # Per-class mean and (population) variance of each feature, under
        # the assumption of independent, normally distributed features.
        num_features = features.shape[1] if features.ndim == 2 else 0
        feature_ids = np.arange(num_features)
        frames = []
        for label in self.classes:
            rows = features[target == label]
            frames.append(pd.DataFrame({
                'class': label,
                'feature': feature_ids,
                'mean': rows.mean(axis=0),
                'var': rows.var(axis=0),
            }))
        if frames:
            self.feat_conditional_probs = pd.concat(frames, ignore_index=True)
        else:
            self.feat_conditional_probs = pd.DataFrame(
                columns=['class', 'feature', 'mean', 'var'])

        # Prints calculations if verbose output desired
        if verbose:
            print('Mean, Varience for each feature given each class',
                  self.feat_conditional_probs, '\n')
            print('Class probabilities:', self.class_probs, '\n')
        return 'FIT COMPLETE'

    def predict(self, features_test):
        """Predict the most likely class for every row of ``features_test``.

        Scores are accumulated in log space, so the comparison between
        classes stays meaningful even when the plain product of many small
        densities would underflow to zero.

        Args:
            features_test: 2-D array-like of shape (n_samples, n_features).

        Returns:
            A tuple ``(predictions, probs)`` of two lists with one entry per
            row: the predicted class label, and the unnormalised posterior
            (Bayes numerator) of that class.

        Raises:
            ValueError: If the model has not been fitted, the input is not
                2-D, or its feature count differs from the fitted data.
        """
        features_test = np.asarray(features_test, dtype=float)
        if len(features_test) == 0:
            return [], []
        if self.num_classes == 0:
            raise ValueError('predict() called before fit()')
        if features_test.ndim != 2:
            raise ValueError('features_test must be 2-D: (n_samples, n_features)')

        stats = self.feat_conditional_probs
        num_features = features_test.shape[1]
        if len(stats) != self.num_classes * num_features:
            raise ValueError('features_test has a different number of '
                             'features than the data passed to fit()')

        # fit() stores rows class-major, so a reshape recovers the
        # (n_classes, n_features) tables.
        shape = (self.num_classes, num_features)
        means = stats['mean'].to_numpy(dtype=float).reshape(shape)
        variances = stats['var'].to_numpy(dtype=float).reshape(shape)
        log_priors = np.log(np.asarray(self.class_probs, dtype=float))

        # Broadcast to (n_samples, n_classes, n_features) and sum the
        # per-feature Gaussian log densities for every sample/class pair.
        diffs = features_test[:, None, :] - means[None, :, :]
        log_likelihoods = -0.5 * (np.log(2 * np.pi * variances)
                                  + diffs ** 2 / variances)
        log_joint = log_priors + log_likelihoods.sum(axis=2)

        # argmax keeps the first class on ties.
        best = log_joint.argmax(axis=1)
        predictions = [self.classes[idx] for idx in best]
        probs = np.exp(log_joint[np.arange(len(best)), best]).tolist()
        return predictions, probs

## Training the Model

In [521]:
# Create an unfitted instance of the from-scratch classifier
clf  = gaussian_naive_bayes()

In [522]:
# Fit on the training split; fit() returns a status string, echoed below
clf.fit(x_train, y_train)

'FIT COMPLETE'

## Making Predictions

In [523]:
# predict() returns two lists: the predicted class for each test row and
# the unnormalised posterior (Bayes numerator) of that class, kept here as
# `weights`
predictions, weights = clf.predict(np.array(x_test))

## Training and Predicting with Sklearn's Gaussian Naive Bayes Model

In [524]:
# Reference model: scikit-learn's GaussianNB fitted on the same training split
model = GaussianNB()
model.fit(x_train,y_train)

GaussianNB(priors=None, var_smoothing=1e-09)

In [525]:
# Predictions of the scikit-learn model on the held-out test rows
sk_predictions = model.predict(x_test)

In [526]:
# Side-by-side table of both models' predictions and the true labels.
# y_test is still a pandas Series here, so presumably its index labels the
# rows (the outputs below show row 134) - verify if the split cell changes.
comparison = pd.DataFrame({'Scratch': predictions, 'Sklearn': sk_predictions, 'Actual': y_test})

In [527]:
# Test rows that the from-scratch model misclassified
comparison[comparison['Scratch'] != comparison['Actual']]


Unnamed: 0,Scratch,Sklearn,Actual
134,1,1,2


In [528]:
# Test rows that the scikit-learn model misclassified
comparison[comparison['Sklearn'] != comparison['Actual']]

Unnamed: 0,Scratch,Sklearn,Actual
134,1,1,2


In [529]:
# Test rows on which the two models disagree with each other (none expected)
comparison[comparison['Scratch'] != comparison['Sklearn']]

Unnamed: 0,Scratch,Sklearn,Actual


The numpy- and pandas-based naive Bayes classifier performs as well as the sklearn model on the iris dataset: both misclassify the same single test case (row 134).