In [1]:
import numpy as np
import pandas as pd

In [2]:
class NaiveBayesClassifier():
  """Gaussian Naive Bayes classifier for numeric features.

  Every feature is modelled, independently per class, as a normal
  distribution whose mean and (population) variance are estimated from the
  training data. Prediction picks the class with the largest log-posterior:
  log prior + sum of per-feature log-likelihoods.

  Attributes set by ``fit``:
    classes          -- sorted unique class labels
    count            -- number of classes
    features_numbers -- number of feature columns
    rows             -- number of training rows
    mean, var        -- arrays of shape (count, features_numbers)
    prior            -- array of shape (count,)
  """

  # Small constant added to the variances inside the density computations so
  # that a feature which is constant within a class does not divide by zero.
  # The stored ``self.var`` itself is left untouched.
  var_epsilon = 1e-9

  # calculating prior probability
  def calc_prior_probability(self, features, target):
    """Estimate P(class) as the fraction of rows belonging to each class.

    Args:
      features: DataFrame with one row per sample.
      target: class label per row (anything ``DataFrame.groupby`` accepts).

    Returns:
      1-D array of priors ordered by sorted class label; also stored in
      ``self.prior``.
    """
    # Divide by the size of the data actually passed in, not by a row count
    # remembered from an earlier fit() on possibly different data.
    self.prior = (features.groupby(target).size() / len(features)).to_numpy()
    return self.prior

  # calculating statistics
  def calc_statistics(self, features, target):
    """Compute the per-class, per-feature mean and population variance.

    Args:
      features: DataFrame of numeric feature columns.
      target: class label per row.

    Returns:
      ``(mean, var)``, each of shape (n_classes, n_features), rows ordered by
      sorted class label; also stored in ``self.mean`` / ``self.var``.
    """
    grouped = features.groupby(target)
    # Use the column-wise groupby reductions. np.mean / np.var applied to a
    # whole DataFrame reduce over *both* axes on recent pandas, which would
    # collapse each class to a single scalar.
    self.mean = grouped.mean().to_numpy()
    self.var = grouped.var(ddof=0).to_numpy()  # ddof=0 matches np.var
    return self.mean, self.var

  # naive bayes
  def gaussian_density(self, class_index, x):
    """Evaluate the normal pdf of every feature of ``x`` under one class.

    Args:
      class_index: row index into ``self.mean`` / ``self.var``.
      x: 1-D array of feature values.

    Returns:
      Array of per-feature densities N(x | mean, var).
    """
    mean = self.mean[class_index]
    var = self.var[class_index] + self.var_epsilon
    # pdf exponent is -(x - mean)^2 / (2 * var); the factor 1/2 appears once.
    numerator = np.exp(-((x - mean) ** 2) / (2 * var))
    denominator = np.sqrt(2 * np.pi * var)
    probability = numerator / denominator
    return probability

  def _log_gaussian_density(self, class_index, x):
    """Log of ``gaussian_density``, computed directly in log space."""
    mean = self.mean[class_index]
    var = self.var[class_index] + self.var_epsilon
    return -0.5 * np.log(2 * np.pi * var) - ((x - mean) ** 2) / (2 * var)

  # calculating posterior probability
  def calc_posterior_probability(self, x):
    """Return the class label with the highest log-posterior for sample ``x``.

    Args:
      x: 1-D array of feature values for a single sample.
    """
    # Work in log space throughout: exponentiating first can underflow to 0
    # for samples far from every class mean, making all posteriors -inf.
    posteriors = [
      np.log(self.prior[i]) + np.sum(self._log_gaussian_density(i, x))
      for i in range(self.count)
    ]
    return self.classes[np.argmax(posteriors)]  # class with highest posterior

  def fit(self, features, target):
    """Estimate class priors and per-class feature statistics.

    Args:
      features: DataFrame of shape (n_samples, n_features).
      target: class label per row.
    """
    self.classes = np.unique(target)
    self.count = len(self.classes)
    self.features_numbers = features.shape[1]
    self.rows = features.shape[0]

    self.calc_statistics(features, target)
    self.calc_prior_probability(features, target)

  def predict(self, features):
    """Predict a class label for every row of ``features``.

    Args:
      features: DataFrame (or 2-D array-like) with the training column order.

    Returns:
      List with one predicted label per row.
    """
    predictions = [self.calc_posterior_probability(x) for x in np.asarray(features)]
    return predictions

  def accuracy(self, y_test, y_pred):
    """Fraction of predictions equal to the true labels.

    Both arguments are converted to arrays and compared element-wise by
    position, so lists, arrays and Series (whatever their index) all work.
    """
    accuracy = np.mean(np.asarray(y_pred) == np.asarray(y_test))
    return accuracy

In [3]:
# loading the dataset
data = pd.read_csv("Iris.csv")

# shuffling the dataset (fixed seed keeps the split reproducible) and
# discarding the "Id" column; chained so no frame is mutated in place
data = (
    data
    .sample(frac=1, random_state=1)
    .reset_index(drop=True)
    .drop(columns="Id")
)

print(data.shape)

# setting the features and target (the target is the last column)
X, y = data.iloc[:, :-1], data.iloc[:, -1]

# splitting the dataset: the first TRAIN_SIZE shuffled rows train the model,
# the remaining rows are held out for testing
TRAIN_SIZE = 100
X_train, y_train = X.iloc[:TRAIN_SIZE], y.iloc[:TRAIN_SIZE]
X_test, y_test = X.iloc[TRAIN_SIZE:], y.iloc[TRAIN_SIZE:]
assert len(X_train) + len(X_test) == len(data)

print(X_train.shape, y_train.shape)
print(X_test.shape, y_test.shape)

(150, 5)
(100, 4) (100,)
(50, 4) (50,)


In [4]:
# Display the shuffled dataset (the "Id" column has already been dropped)
data

Unnamed: 0,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm,Species
0,5.8,4.0,1.2,0.2,Iris-setosa
1,5.1,2.5,3.0,1.1,Iris-versicolor
2,6.6,3.0,4.4,1.4,Iris-versicolor
3,5.4,3.9,1.3,0.4,Iris-setosa
4,7.9,3.8,6.4,2.0,Iris-virginica
...,...,...,...,...,...
145,6.3,2.8,5.1,1.5,Iris-virginica
146,6.4,3.1,5.5,1.8,Iris-virginica
147,6.3,2.5,4.9,1.5,Iris-versicolor
148,6.7,3.1,5.6,2.4,Iris-virginica


In [5]:
## Training the model
x = NaiveBayesClassifier()
x.fit(X_train, y_train)

In [6]:
# Inspect what fit() recorded: class labels, number of feature columns,
# number of training rows, number of classes
x.classes, x.features_numbers, x.rows, x.count

(array(['Iris-setosa', 'Iris-versicolor', 'Iris-virginica'], dtype=object),
 4,
 100,
 3)

In [7]:
# Recompute the class priors explicitly (fit() already called this) and show
# that the returned array is the same one stored on the instance
print(x.calc_prior_probability(X_train, y_train))
x.prior

[0.31 0.32 0.37]


array([0.31, 0.32, 0.37])

In [8]:
# Recompute the per-class feature means and variances (fit() already called
# this); returns the tuple (mean, var)
x.calc_statistics(X_train, y_train)

(array([[5.08387097, 3.50322581, 1.46129032, 0.24193548],
        [5.9125    , 2.790625  , 4.275     , 1.33125   ],
        [6.71891892, 2.98918919, 5.63243243, 2.05675676]]),
 array([[0.11361082, 0.10934443, 0.02430801, 0.00953174],
        [0.21296875, 0.08272461, 0.185625  , 0.03214844],
        [0.3566691 , 0.11339664, 0.32867787, 0.0592111 ]]))

In [9]:
# The same statistics as stored on the classifier instance
x.mean, x.var

(array([[5.08387097, 3.50322581, 1.46129032, 0.24193548],
        [5.9125    , 2.790625  , 4.275     , 1.33125   ],
        [6.71891892, 2.98918919, 5.63243243, 2.05675676]]),
 array([[0.11361082, 0.10934443, 0.02430801, 0.00953174],
        [0.21296875, 0.08272461, 0.185625  , 0.03214844],
        [0.3566691 , 0.11339664, 0.32867787, 0.0592111 ]]))

In [10]:
# Display the training features
X_train

Unnamed: 0,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm
0,5.8,4.0,1.2,0.2
1,5.1,2.5,3.0,1.1
2,6.6,3.0,4.4,1.4
3,5.4,3.9,1.3,0.4
4,7.9,3.8,6.4,2.0
...,...,...,...,...
95,6.7,3.0,5.2,2.3
96,6.3,2.3,4.4,1.3
97,6.2,3.4,5.4,2.3
98,7.2,3.6,6.1,2.5


In [11]:
# Predict a class label for every row of the held-out test features
predictions = x.predict(X_test)

In [13]:
# Class balance of the test split, as fractions of the test rows
y_test.value_counts(normalize=True)

Iris-setosa        0.38
Iris-versicolor    0.36
Iris-virginica     0.26
Name: Species, dtype: float64

In [12]:
# accuracy() returns a number; convert it explicitly, because concatenating a
# str with a float raises TypeError
print("Accuracy: " + str(x.accuracy(y_test, predictions)))

0.92