## Implementation of the Naive Bayes classifier algorithm

In [1]:
import pandas as pd
import numpy as np

class NB:
    """Naive Bayes classifier for categorical features with m-estimate smoothing.

    Only the first two class labels (in the sorted order returned by
    ``np.unique``) are modelled: ``likelihood0`` belongs to ``classes[0]`` and
    ``likelihood1`` to ``classes[1]``.

    Attributes set while fitting:
        prior_prob: maps each class label to its relative frequency.
        likelihood0, likelihood1: map ``(feature position, value)`` to the
            m-estimate of P(feature = value | class).  Keying by position
            keeps features that share a value (e.g. two 'Yes'/'No' columns)
            from overwriting each other's estimates.
        classes, num_examples, num_features, columns: bookkeeping taken from
            the training data.
    """

    def __init__(self):
        self.prior_prob = dict()
        self.likelihood0 = dict()
        self.likelihood1 = dict()
        self.post_prob = dict()

    def find_prior_prob(self, Y):
        """Compute and print the prior probability of every label in ``Y``.

        Args:
            Y: target values; must expose ``shape`` and be accepted by
                ``np.unique`` (a pandas Series in ``fit``).
        """
        self.num_examples = Y.shape[0]
        self.classes, counts = np.unique(Y, return_counts=True)

        # Rebuild from scratch so a refit does not keep labels of a previous fit.
        self.prior_prob = {
            label: count / self.num_examples
            for label, count in zip(self.classes, counts)
        }

        print('\nPrior probabilities of target:')
        for k, v in self.prior_prob.items():
            print(f'\tP(Y = {k}) = {v}')

    def find_likelihoods(self, X, Y, m):
        """Compute and print the m-estimate likelihoods of every feature value.

        For each feature with ``v`` distinct values the estimate is
        ``(count + m * p) / (n_class + m)`` with the uniform prior
        ``p = 1 / v``.

        Requires ``find_prior_prob`` to have been called (for ``self.classes``)
        and ``self.columns`` to be set, as ``fit`` does.

        Args:
            X: DataFrame of features, one column per attribute.
            Y: target values aligned with the rows of ``X``.
            m: equivalent sample size of the m-estimate.

        Raises:
            ValueError: if the target has fewer than two distinct classes.
        """
        if len(self.classes) < 2:
            raise ValueError(
                'NB needs at least two target classes, '
                f'got {len(self.classes)}'
            )

        self.num_features = X.shape[1]

        X_class0 = X[Y == self.classes[0]]
        X_class1 = X[Y == self.classes[1]]
        size0 = X_class0.shape[0]
        size1 = X_class1.shape[0]

        # Start clean so values seen only in an earlier fit cannot leak in.
        self.likelihood0 = {}
        self.likelihood1 = {}

        print(f'\n{m} estimate Probabilities of attribute given target (Likelihoods):')

        for position in range(self.num_features):
            feat_values = np.unique(X.iloc[:, position])
            p = 1 / len(feat_values)

            # Positional access also works when column labels are duplicated.
            column0 = X_class0.iloc[:, position]
            column1 = X_class1.iloc[:, position]

            for value in feat_values:
                # A value absent from a class simply counts as 0.
                count0 = (column0 == value).sum()
                count1 = (column1 == value).sum()

                key = (position, value)
                self.likelihood0[key] = (count0 + m * p) / (size0 + m)
                self.likelihood1[key] = (count1 + m * p) / (size1 + m)

                print(f'\tP({self.columns[position]} = {value} | {self.columns[-1]} = {self.classes[0]}) = {round(self.likelihood0[key], 4)}')
                print(f'\tP({self.columns[position]} = {value} | {self.columns[-1]} = {self.classes[1]}) = {round(self.likelihood1[key], 4)}')

    def target_function(self, query):
        """Classify one query and print the score of each class.

        The printed values are the unnormalised products
        ``prior * product of likelihoods``.

        Args:
            query: feature values in the same order as the training columns.

        Returns:
            ``classes[0]`` if its score is strictly larger, else ``classes[1]``.

        Raises:
            KeyError: if a value was not seen for that feature during fitting.
        """
        product0, product1 = 1, 1

        for position, value in enumerate(query):
            key = (position, value)
            try:
                factor0 = self.likelihood0[key]
                factor1 = self.likelihood1[key]
            except KeyError:
                raise KeyError(
                    f'value {value!r} of feature at position {position} '
                    'was not seen during fit'
                ) from None
            product0 *= factor0
            product1 *= factor1

        temp0 = self.prior_prob[self.classes[0]] * product0
        temp1 = self.prior_prob[self.classes[1]] * product1

        print('Posterior probabilities of target given query:')
        print(f'\tP(Y = {self.classes[0]} | X) = {round(temp0, 4)}')
        print(f'\tP(Y = {self.classes[1]} | X) = {round(temp1, 4)}')

        if temp0 > temp1:
            return self.classes[0]
        return self.classes[1]

    def predict(self, queries):
        """Classify every row of ``queries``, printing each query and result.

        Args:
            queries: DataFrame whose columns are the features in training order.

        Returns:
            List with one predicted class label per row.
        """
        Y_pred = []
        for i in range(queries.shape[0]):
            query = list(queries.iloc[i])
            print(f'\nquery X: {query}')
            target = self.target_function(query)
            print(f'predicted target Y: {target}')
            Y_pred.append(target)
        return Y_pred

    def accuracy(self, Y, Y_pred):
        """Return the percentage of positions where ``Y`` and ``Y_pred`` agree.

        Values are compared by position, so a pandas Series with any index
        (not only 0..n-1) is handled.

        Args:
            Y: true labels.
            Y_pred: predicted labels, same length as ``Y``.

        Returns:
            Accuracy in percent; 0.0 when ``Y`` is empty.

        Raises:
            ValueError: if the two sequences differ in length.
        """
        actual = list(Y)
        predicted = list(Y_pred)
        if len(actual) != len(predicted):
            raise ValueError(
                f'Y has {len(actual)} labels but Y_pred has {len(predicted)}'
            )
        if not actual:
            return 0.0

        count = sum(1 for truth, guess in zip(actual, predicted) if truth == guess)
        return (count / len(actual)) * 100

    def fit(self, df, m=1):
        """Train on ``df``, predict its own rows and report the accuracy.

        Args:
            df: DataFrame whose last column is the target and whose other
                columns are the categorical features.
            m: equivalent sample size of the m-estimate.

        Returns:
            A new DataFrame holding the feature columns plus the predicted
            target under the original target column name; ``df`` itself is
            left unchanged.
        """
        self.columns = df.keys()
        Y = df.iloc[:, -1]
        X = df.iloc[:, 0:-1]
        self.find_prior_prob(Y)
        print(f'\nGiven equivalent sample size m = {m}')
        self.find_likelihoods(X, Y, m)
        Y_pred = self.predict(X)

        # Write predictions into an explicit copy rather than into a slice of df.
        result = X.copy()
        result[self.columns[-1]] = Y_pred
        print(f'\nAccuracy: {self.accuracy(Y, Y_pred)}%')
        return result

In [2]:
# Toy training set: three categorical attributes and the target column 'Eat'.
dataset = {
    'Taste': [
        'Salty', 'Spicy', 'Spicy', 'Spicy', 'Spicy',
        'Sweet', 'Salty', 'Sweet', 'Spicy', 'Salty',
    ],
    'Temperature': [
        'Hot', 'Hot', 'Hot', 'Cold', 'Hot',
        'Cold', 'Cold', 'Hot', 'Cold', 'Hot',
    ],
    'Texture': [
        'Soft', 'Soft', 'Hard', 'Hard', 'Hard',
        'Soft', 'Soft', 'Soft', 'Soft', 'Hard',
    ],
    'Eat': [
        'No', 'No', 'Yes', 'No', 'Yes',
        'Yes', 'No', 'Yes', 'Yes', 'Yes',
    ],
}

df = pd.DataFrame(data=dataset)
print('Actual dataset:')
display(df)

# Equivalent sample size passed to the m-estimate in NB.fit.
m = 2
nb = NB()

# fit() trains on df and returns the features with the predicted target column.
df1 = nb.fit(df, m=m)
print('\nPredicted dataset:')
display(df1)

Actual dataset:


Unnamed: 0,Taste,Temperature,Texture,Eat
0,Salty,Hot,Soft,No
1,Spicy,Hot,Soft,No
2,Spicy,Hot,Hard,Yes
3,Spicy,Cold,Hard,No
4,Spicy,Hot,Hard,Yes
5,Sweet,Cold,Soft,Yes
6,Salty,Cold,Soft,No
7,Sweet,Hot,Soft,Yes
8,Spicy,Cold,Soft,Yes
9,Salty,Hot,Hard,Yes



Prior probabilities of target:
	P(Y = No) = 0.4
	P(Y = Yes) = 0.6

Given equivalent sample size m = 2

2 estimate Probabilities of attribute given target (Likelihoods):
	P(Taste = Salty | Eat = No) = 0.4444
	P(Taste = Salty | Eat = Yes) = 0.2083
	P(Taste = Spicy | Eat = No) = 0.4444
	P(Taste = Spicy | Eat = Yes) = 0.4583
	P(Taste = Sweet | Eat = No) = 0.1111
	P(Taste = Sweet | Eat = Yes) = 0.3333
	P(Temperature = Cold | Eat = No) = 0.5
	P(Temperature = Cold | Eat = Yes) = 0.375
	P(Temperature = Hot | Eat = No) = 0.5
	P(Temperature = Hot | Eat = Yes) = 0.625
	P(Texture = Hard | Eat = No) = 0.3333
	P(Texture = Hard | Eat = Yes) = 0.5
	P(Texture = Soft | Eat = No) = 0.6667
	P(Texture = Soft | Eat = Yes) = 0.5

query X: ['Salty', 'Hot', 'Soft']
Posterior probabilities of target given query:
	P(Y = No | X) = 0.0593
	P(Y = Yes | X) = 0.0391
predicted target Y: No

query X: ['Spicy', 'Hot', 'Soft']
Posterior probabilities of target given query:
	P(Y = No | X) = 0.0593
	P(Y = Yes | X) = 0.085

Unnamed: 0,Taste,Temperature,Texture,Eat
0,Salty,Hot,Soft,No
1,Spicy,Hot,Soft,Yes
2,Spicy,Hot,Hard,Yes
3,Spicy,Cold,Hard,Yes
4,Spicy,Hot,Hard,Yes
5,Sweet,Cold,Soft,Yes
6,Salty,Cold,Soft,No
7,Sweet,Hot,Soft,Yes
8,Spicy,Cold,Soft,No
9,Salty,Hot,Hard,Yes
