In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
import random
warnings.filterwarnings('ignore')

# Gaussian Naive Bayes

## Train test split

In [None]:
def train_test_split(df, test_size, random_state=None):
  """Randomly split ``df`` into a train and a test DataFrame.

  Rows are chosen by position rather than by index label, so the split is
  correct even when ``df`` has a non-unique or non-default index.

  Parameters
  ----------
  df : pandas.DataFrame
      Data to split.
  test_size : float or int
      A float is interpreted as the fraction of rows to hold out and is
      converted with ``round(test_size * len(df))``; an int is used
      directly as the number of test rows.
  random_state : int, optional
      Seed for a private ``random.Random`` generator. With the default
      ``None`` the module-level ``random`` generator is used, so a prior
      ``random.seed(...)`` call still controls the split.

  Returns
  -------
  tuple of pandas.DataFrame
      ``(train, test)``. ``test`` holds the sampled rows in sampled order;
      ``train`` holds the remaining rows in their original order.

  Raises
  ------
  ValueError
      Propagated from ``random.sample`` when the number of test rows is
      negative or larger than ``len(df)``.
  """
  n_rows = len(df)

  if isinstance(test_size, float):
    test_size = round(test_size * n_rows)

  # Fall back to the global generator so existing random.seed() usage keeps working.
  rng = random if random_state is None else random.Random(random_state)
  test_positions = rng.sample(range(n_rows), k=test_size)

  held_out = set(test_positions)
  train_positions = [pos for pos in range(n_rows) if pos not in held_out]

  train = df.iloc[train_positions]
  test = df.iloc[test_positions]

  return train, test

## Calculate Prior Probabilities

In [None]:
def calculate_prior_probabilities(df):
  """Return the log prior probability of each class in ``df['target']``.

  The prior of a class is its share of the rows in ``df``. Classes are
  ordered by the sorted group keys produced by ``groupby``.

  Parameters
  ----------
  df : pandas.DataFrame
      Must contain a ``'target'`` column holding the class labels.

  Returns
  -------
  numpy.ndarray
      1-D array of ``log(count(class) / len(df))``, one entry per class.
  """
  # size() counts rows per class directly, without a Python-level apply.
  class_counts = df.groupby(by='target').size()
  prior_probabilities = class_counts / len(df)

  return np.log(prior_probabilities).values

In [None]:
# calculate_prior_probabilities(df)

# [Prior_probability(setosa), Prior_probability(versicolor), Prior_probability(virginica)]

## Find mean, variance

In [None]:
def return_statistics(df):
  """Compute the per-class mean and variance of every feature column.

  The ``'target'`` column is removed before reducing, so the label strings
  are never passed to ``mean``/``var`` (reducing them fails on pandas
  versions that no longer silently drop non-numeric columns).

  Parameters
  ----------
  df : pandas.DataFrame
      Feature columns plus a ``'target'`` column with the class labels.

  Returns
  -------
  tuple of numpy.ndarray
      ``(mean, variance)``, each of shape ``(n_classes, n_features)``.
      Rows follow the sorted class labels, columns follow the order of the
      feature columns in ``df``. Variance is the sample variance
      (pandas default ``ddof=1``).
  """
  features = df.drop(columns='target')
  # Group by the raw label array so alignment is positional, not index based.
  grouped = features.groupby(df['target'].to_numpy())

  mean = grouped.mean()
  variance = grouped.var()

  return (mean.values, variance.values)

In [None]:
# mean, variance = return_statistics(df)
# print(mean)
# print(variance)

#             s_l  s_w  p_l  p_w
# setosa
# versicolor
# virginica

## Find Gaussian Probability density

In [None]:
# P(x=12 | 'setosa')

def calculate_probability_density(mean, variance, x):
  """Evaluate the Gaussian density with the given mean and variance at ``x``.

  Computes ``1 / sqrt(2*pi*variance) * exp(-(x - mean)**2 / (2*variance))``.
  Inputs may be scalars or NumPy arrays that broadcast together.
  """
  normaliser = 1 / np.sqrt(2*np.pi*variance)
  deviation = x - mean
  exponent = (-deviation**2) / (2*variance)

  return normaliser * np.exp(exponent)

## Posterior Probabilities

In [None]:
def calculate_posterior_probabilities(df_row, mean, variance, n_unique_labels, n_cols):
  """Return, for each class, the summed Gaussian log-likelihood of one sample.

  For class ``i`` the result is ``sum_j log N(df_row[j]; mean[i][j], variance[i][j])``
  over the first ``n_cols`` features. The class prior is NOT included; the
  caller adds the log prior to obtain the (unnormalised) log posterior.

  The log density is evaluated directly as
  ``-0.5*log(2*pi*var) - (x - mean)**2 / (2*var)`` instead of taking the
  log of the density, so samples far from a class mean yield a large
  negative finite value rather than ``-inf`` from underflow.

  Parameters
  ----------
  df_row : array-like
      Feature values of one sample (pandas Series, list or ndarray). Values
      are read by position, so the row's index labels are irrelevant.
  mean, variance : array-like
      Per-class feature means and variances, indexable as ``[class][feature]``.
  n_unique_labels : int
      Number of classes (rows of ``mean``/``variance``) to evaluate.
  n_cols : int
      Number of features to use.

  Returns
  -------
  list
      ``n_unique_labels`` log-likelihood values, in the row order of ``mean``.
  """
  # Convert up front: integer indexing on a string-labelled Series is
  # deprecated in pandas, while array indexing is always positional.
  x = np.asarray(df_row, dtype=float)[:n_cols]
  mu = np.asarray(mean, dtype=float)[:n_unique_labels, :n_cols]
  var = np.asarray(variance, dtype=float)[:n_unique_labels, :n_cols]

  log_density = -0.5 * np.log(2 * np.pi * var) - (x - mu) ** 2 / (2 * var)

  # Sum over features: one value per class.
  return list(log_density.sum(axis=1))

In [None]:
# calculate_posterior_probabilities()

# [posterior_probability['setosa'], posterior_probability['versicolor'], posterior_probability['virginica']]

## Fit model on training dataset

In [None]:
def NBA_fit(df):
  """Fit the Gaussian Naive Bayes model and return its parameters.

  Parameters
  ----------
  df : pandas.DataFrame
      Training data: feature columns plus a ``'target'`` column.

  Returns
  -------
  dict
      ``'n_cols'``: number of feature columns (all columns except one);
      ``'unique_labels'``: class labels in sorted groupby order;
      ``'n_unique_labels'``: number of classes;
      ``'mean'`` / ``'variance'``: per-class feature statistics from
      ``return_statistics``;
      ``'prior_probabilities'``: log priors from
      ``calculate_prior_probabilities``.
  """
  n_cols = len(df.columns)-1

  # Take the labels from groupby so that unique_labels[i] names the same
  # class as row i of mean/variance/prior_probabilities, which are all
  # groupby results. Series.unique() returns order of appearance instead,
  # which only matches when the training rows happen to be sorted by class.
  unique_labels = df.groupby(by='target').size().index.to_numpy()
  n_unique_labels = len(unique_labels)

  mean, variance = return_statistics(df)
  prior_probabilities = calculate_prior_probabilities(df) # returns log

  return {
      'n_cols': n_cols,
      'unique_labels': unique_labels,
      'n_unique_labels': n_unique_labels,
      'mean': mean,
      'variance': variance,
      'prior_probabilities': prior_probabilities
  }
  

In [None]:
# nba = NBA_fit(df)

# Returns a dictionary containing the fitted statistics and other model info

## Predict

In [None]:
def predict(test_df, nba):
  """Predict a class label for every row of ``test_df``.

  For each row the per-class log-likelihoods are added to the log priors
  and the label with the highest total is chosen.

  Parameters
  ----------
  test_df : pandas.DataFrame
      Rows to classify. A ``'target'`` column, if present, is ignored
      wherever it sits; the remaining columns are expected to be the
      features in the same order as used for fitting.
  nba : dict
      Model parameters as returned by ``NBA_fit``.

  Returns
  -------
  list
      One predicted label per row of ``test_df``, in row order.
  """
  # Drop the label by name instead of assuming it is the last column, and
  # hand rows over as plain arrays so feature lookup is purely positional.
  features = test_df.drop(columns='target', errors='ignore')
  feature_rows = features.to_numpy()

  log_priors = nba['prior_probabilities']

  predictions = []
  for row in feature_rows:
    log_likelihoods = calculate_posterior_probabilities(row, nba['mean'], nba['variance'], nba['n_unique_labels'], nba['n_cols'])  # returns log
    # log prior + log likelihood = unnormalised log posterior
    log_scores = log_priors + log_likelihoods
    # the class with the highest score is the prediction
    best_idx = np.argmax(log_scores)

    predictions.append(nba['unique_labels'][best_idx])

  return predictions

In [None]:
# predictions = predict(test_df, nba)

# returns label

## Load Dataset

In [None]:
# df_copy keeps the frame exactly as loaded (label column named 'species');
# df is the working frame with the label column renamed to 'target'.
df_copy = sns.load_dataset('iris')
df = df_copy.rename(columns={'species': 'target'})

df.head()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,target
0,5.1,3.5,1.4,0.2,setosa
1,4.9,3.0,1.4,0.2,setosa
2,4.7,3.2,1.3,0.2,setosa
3,4.6,3.1,1.5,0.2,setosa
4,5.0,3.6,1.4,0.2,setosa


No null values present

In [None]:
# seed the global generator so the random split, and therefore the accuracy
# reported below, is the same on every Restart & Run All
random.seed(42)

# train test split
train_df, test_df = train_test_split(df, 0.2)

# fit model
nba = NBA_fit(train_df)

# make predictions
predictions = predict(test_df, nba)

# accuracy: percentage of test rows whose predicted label equals the true label
n_correct = sum(pred == actual for pred, actual in zip(predictions, test_df['target']))
accuracy = n_correct / len(test_df) * 100
accuracy

96.66666666666667