In [4]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt


In [20]:
class NaiveBayesClassifier():
    '''
      P(y|X) = P(X|y) * P(y) / P(X)
      P(n|X) = P(X|n) * P(n) / P(X)
      
    '''
    def calc_prior(self,features,target):
        
        self.prior = (features.groupby(target).apply(lambda x: len(x)) / self.rows).to_numpy()
        print(self.prior)
        return self.prior
    
    
    def calc_statistics(self, features, target):
        '''
        features : pandas data frame
        target : lable  string
        calculate mean, variance for each column and convert to numpy array
        ''' 
        self.mean = features.groupby(target).apply(np.mean).to_numpy()
        self.var = features.groupby(target).apply(np.var).to_numpy()
              
        return self.mean, self.var
    
    def calc_posterior(self, x):
        posteriors = []

        # calculate posterior probability for each class
        for i in range(self.count):
            prior = np.log(self.prior[i]) ## use the log to make it more numerically stable
            conditional = np.sum(np.log(self.gaussian_density(i, x))) # use the log to make it more numerically stable
            posterior = prior + conditional
            posteriors.append(posterior)
        # return class with highest posterior probability
        return self.classes[np.argmax(posteriors)]
    
    
    def fit(self, features, target):
        self.classes = np.unique(target)
        self.count = len(self.classes)
        self.feature_nums = features.shape[1]
        self.rows = features.shape[0]
        
        self.calc_statistics(features, target)
        self.calc_prior(features, target)
    
    def gaussian_density(self, class_idx, x):     
        mean = self.mean[class_idx]
        var = self.var[class_idx]
        numerator = np.exp((-1/2)*((x-mean)**2) / (var))
        denominator = np.sqrt(2 * np.pi) * var
        prob = numerator / denominator
        return prob
    
    
    
    def predict(self, features):
        preds = [self.calc_posterior(f) for f in features.to_numpy()]
        return preds
    
    def accuracy(self, y_test, y_pred):
        accuracy = np.sum(y_test == y_pred) / len(y_test)
        return accuracy

In [21]:
df = pd.read_csv("iris.csv")
df.head()


Unnamed: 0,sepal.length,sepal.width,petal.length,petal.width,variety
0,5.1,3.5,1.4,0.2,Setosa
1,4.9,3.0,1.4,0.2,Setosa
2,4.7,3.2,1.3,0.2,Setosa
3,4.6,3.1,1.5,0.2,Setosa
4,5.0,3.6,1.4,0.2,Setosa


In [30]:
df = df.sample(frac=1,random_state=1).reset_index(drop=True)
df.shape

(75, 5)

In [23]:
df.head()

Unnamed: 0,sepal.length,sepal.width,petal.length,petal.width,variety
0,5.8,4.0,1.2,0.2,Setosa
1,5.1,2.5,3.0,1.1,Versicolor
2,6.6,3.0,4.4,1.4,Versicolor
3,5.4,3.9,1.3,0.4,Setosa
4,7.9,3.8,6.4,2.0,Virginica


In [24]:
# set features and target
X, y = df.iloc[:, :-1], df.iloc[:, -1]

print(X.shape)
print(y.shape)
print(y[:10])
print(X[:10])

(150, 4)
(150,)
0        Setosa
1    Versicolor
2    Versicolor
3        Setosa
4     Virginica
5    Versicolor
6     Virginica
7        Setosa
8        Setosa
9     Virginica
Name: variety, dtype: object
   sepal.length  sepal.width  petal.length  petal.width
0           5.8          4.0           1.2          0.2
1           5.1          2.5           3.0          1.1
2           6.6          3.0           4.4          1.4
3           5.4          3.9           1.3          0.4
4           7.9          3.8           6.4          2.0
5           6.3          3.3           4.7          1.6
6           6.9          3.1           5.1          2.3
7           5.1          3.8           1.9          0.4
8           4.7          3.2           1.6          0.2
9           6.9          3.2           5.7          2.3


In [25]:
x_train,y_train =  X[:100],y[:100]
x_test,y_test   = X[100:],y[100:]

print("X train : ",x_train.shape)
print("x test  : ",x_test.shape)

X train :  (100, 4)
x test  :  (50, 4)


In [26]:
model = NaiveBayesClassifier()
model.fit(x_train,y_train)

[0.31 0.32 0.37]


In [27]:
y_pred = model.predict(x_test)


In [28]:
acc = model.accuracy(y_test, y_pred)
print("Accuracy : ",acc)

Accuracy :  0.92
