#### Here I implement a Gaussian Naive Bayes classifier from scratch and then test it on the famous Iris dataset

In [68]:
# imports
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
import random
warnings.filterwarnings('ignore')


In [82]:
# implement train test split
def train_test_split_in(df, test_size, random_state=None):
    """Randomly split ``df`` into a training frame and a test frame.

    Args:
        df: DataFrame to split.
        test_size: Number of test rows when an ``int``; fraction of
            ``len(df)`` (rounded to a whole row count) when a ``float``.
        random_state: Optional seed. When given, a private
            ``random.Random(random_state)`` generator is used so the split
            is reproducible; when ``None`` the module-level ``random`` state
            is used.

    Returns:
        ``(train, test)``: ``test`` holds the sampled rows in sampling order,
        ``train`` holds the remaining rows in their original order.

    Raises:
        ValueError: If the requested test size is negative or larger than
            ``len(df)`` (raised by ``random.sample``).
    """
    n_rows = len(df)
    if isinstance(test_size, float):
        test_size = round(test_size * n_rows)

    rng = random if random_state is None else random.Random(random_state)
    # Sample row *positions* rather than index labels: label-based
    # ``loc``/``drop`` would select or remove every row sharing a duplicated
    # label, producing splits of the wrong size.
    test_positions = rng.sample(range(n_rows), k=test_size)

    held_out = set(test_positions)
    train_positions = [pos for pos in range(n_rows) if pos not in held_out]

    test = df.iloc[test_positions]
    train = df.iloc[train_positions]
    return train, test

In [71]:
# Calculating log prior probabilities
def calc_prior_prob(df):
    """Return the natural log of each class's prior probability.

    A class's prior is its share of the rows in ``df``. The returned array
    has one entry per ``species`` group, in the order the groupby yields them.
    """
    class_counts = df.groupby(by='species').size()
    class_shares = class_counts / len(df)
    return np.log(class_shares).values

In [72]:
# Finding mean and variance
def calc_stats(df):
    """Return per-class feature means and variances.

    Args:
        df: DataFrame with a ``species`` label column; every other column is
            treated as a numeric feature.

    Returns:
        ``(means, variances)``: two arrays with one row per ``species`` group
        (in groupby order) and one column per feature (in ``df`` column
        order). Variances use pandas' default ``ddof=1``, so a class with a
        single row yields ``NaN``.
    """
    # Aggregate directly on the GroupBy instead of ``apply`` + lambda: with
    # ``apply`` the string ``species`` column can be passed to ``mean``/``var``
    # (depending on the pandas version), which raises on non-numeric data.
    grouped = df.groupby(by='species')
    return (grouped.mean().values, grouped.var().values)

In [73]:
# calculating probability density function
def calc_probability_density(mean, variance, x):
    """Evaluate the Gaussian density with the given mean and variance at ``x``."""
    normaliser = 1 / np.sqrt(2 * np.pi * variance)
    exponent = (-(x - mean) ** 2) / (2 * variance)
    return normaliser * np.exp(exponent)

In [96]:
# calculate per-class log-likelihoods (the log prior is added later, in predict)
def calc_posterior_probabilities(df_row, mean, variance, n_unique_labels, n_cols):
    """Return the Gaussian log-likelihood of one sample under every class.

    Despite the name, the result does not include the class prior; the caller
    adds the log prior to obtain an (unnormalised) log posterior.

    Args:
        df_row: Feature values of one sample (Series, list or array). Only
            the first ``n_cols`` values are used, addressed by position.
        mean: Per-class feature means, laid out as ``mean[class][feature]``.
        variance: Per-class feature variances, same layout as ``mean``.
        n_unique_labels: Number of classes (rows of ``mean`` to use).
        n_cols: Number of features (columns of ``mean`` to use).

    Returns:
        A list of ``n_unique_labels`` floats: for each class, the sum over the
        features of the log Gaussian density of the sample's value.

    Raises:
        IndexError: If ``df_row`` holds fewer than ``n_cols`` values.
    """
    # Work on plain float arrays so features are addressed by position;
    # integer look-ups on a label-indexed Series are deprecated in pandas.
    x = np.asarray(df_row, dtype=float)[:n_cols]
    if x.shape[0] < n_cols:
        raise IndexError(
            f"df_row has {x.shape[0]} values but n_cols is {n_cols}"
        )
    mu = np.asarray(mean, dtype=float)[:n_unique_labels, :n_cols]
    var = np.asarray(variance, dtype=float)[:n_unique_labels, :n_cols]

    # Use the closed-form log-density rather than log(pdf): the pdf underflows
    # to 0 for values far from the mean, which would turn the score into -inf.
    log_density = -0.5 * np.log(2 * np.pi * var) - (x - mu) ** 2 / (2 * var)
    return log_density.sum(axis=1).tolist()

In [97]:
# implement fit method to fit to dataset
def fit(df):
    """Estimate the Gaussian Naive Bayes parameters from a labelled frame.

    Args:
        df: Training DataFrame with a ``species`` label column; all other
            columns are counted as features.

    Returns:
        dict with keys ``n_cols`` (feature count), ``unique_labels`` (class
        labels), ``n_unique_labels``, ``mean`` and ``variance`` (per-class
        statistics from ``calc_stats``) and ``prior_probabilities`` (log
        priors from ``calc_prior_prob``).
    """
    n_cols = len(df.columns) - 1  # every column except the label

    # Take the labels from the same kind of groupby the statistics helpers
    # use, so unique_labels[i] names row i of mean/variance/priors.
    # ``Series.unique()`` returns labels in order of first appearance, which
    # matches the group order only when the data happens to be pre-sorted.
    unique_labels = df.groupby(by='species').size().index.to_numpy()
    n_unique_labels = len(unique_labels)

    mean, variance = calc_stats(df)
    prior_probabilities = calc_prior_prob(df)

    return {
        'n_cols': n_cols,
        'unique_labels': unique_labels,
        'n_unique_labels': n_unique_labels,
        'mean': mean,
        'variance': variance,
        'prior_probabilities': prior_probabilities,
    }

In [98]:
# implement predict function
def predict(test_df, gnb):
    """Predict a class label for every row of ``test_df``.

    Args:
        test_df: DataFrame whose first ``gnb['n_cols']`` columns are the
            features. A trailing label column is allowed and is ignored.
        gnb: Model dictionary with the keys ``prior_probabilities``, ``mean``,
            ``variance``, ``n_unique_labels``, ``n_cols`` and
            ``unique_labels`` (the structure ``fit`` returns).

    Returns:
        list with one predicted label per row, in row order.
    """
    # The model parameters are the same for every row, so read them once.
    log_prior = gnb['prior_probabilities']
    mean, variance = gnb['mean'], gnb['variance']
    n_labels, n_cols = gnb['n_unique_labels'], gnb['n_cols']
    labels = gnb['unique_labels']

    predictions = []
    for i in range(len(test_df)):
        # Slice by feature count rather than ``:-1`` so that frames without a
        # label column are handled too.
        features = test_df.iloc[i, :n_cols]
        log_likelihood = calc_posterior_probabilities(
            features, mean, variance, n_labels, n_cols
        )
        scores = log_prior + np.asarray(log_likelihood)
        # The class with the highest log score is the prediction.
        predictions.append(labels[np.argmax(scores)])
    return predictions

In [99]:
# Defining an accuracy_score function modelled on sklearn's; unlike sklearn's, it returns a percentage rounded to 2 decimals
def accuracy_score(y_true, y_pred):
    """Return the percentage of predictions that equal the true labels.

    Args:
        y_true: True labels (list, array or Series).
        y_pred: Predicted labels, same length as ``y_true``.

    Returns:
        Accuracy as a percentage in ``[0, 100]``, rounded to two decimals.

    Raises:
        ValueError: If the two inputs differ in shape.
        ZeroDivisionError: If the inputs are empty.
    """
    # Convert first so the comparison is element-wise for every input type;
    # comparing two plain lists with ``==`` yields a single bool instead.
    true_arr = np.asarray(y_true)
    pred_arr = np.asarray(y_pred)
    if true_arr.shape != pred_arr.shape:
        raise ValueError(
            f"y_true and y_pred differ in shape: {true_arr.shape} vs {pred_arr.shape}"
        )
    n_correct = int(np.count_nonzero(true_arr == pred_arr))
    return round(n_correct / float(len(true_arr)) * 100, 2)

In [100]:
# load the Iris dataset through seaborn
df = sns.load_dataset('iris')

In [101]:
# show the first rows to check the column layout
df.head()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
0,5.1,3.5,1.4,0.2,setosa
1,4.9,3.0,1.4,0.2,setosa
2,4.7,3.2,1.3,0.2,setosa
3,4.6,3.1,1.5,0.2,setosa
4,5.0,3.6,1.4,0.2,setosa


In [102]:
# hold out half of the rows for testing
train_df, test_df = train_test_split_in(df, 0.5)
# estimate the model parameters from the training half
nba = fit(train_df)
# label every held-out row
predictions = predict(test_df, nba)
# percentage of held-out rows whose prediction equals the true species
accuracy = int((predictions == test_df['species']).sum()) / len(test_df) * 100
accuracy

94.66666666666667