# Naive Bayes Classifier on Iris Dataset

## Import Modules

In [1]:
import numpy as np
import pandas as pd

import random
import math

## Load and Prepare Data

In [2]:
# Read the Iris measurements from the local CSV file and preview the first rows.
df = pd.read_csv(filepath_or_buffer="Datasets/Iris.csv")
df.head(n=5)

Unnamed: 0,Id,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm,Species
0,1,5.1,3.5,1.4,0.2,Iris-setosa
1,2,4.9,3.0,1.4,0.2,Iris-setosa
2,3,4.7,3.2,1.3,0.2,Iris-setosa
3,4,4.6,3.1,1.5,0.2,Iris-setosa
4,5,5.0,3.6,1.4,0.2,Iris-setosa


In [3]:
# Remove the "Id" column and rename the "Species" column to "label",
# then print the resulting column summary.
df = df.drop(columns=["Id"]).rename(columns={"Species": "label"})
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150 entries, 0 to 149
Data columns (total 5 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   SepalLengthCm  150 non-null    float64
 1   SepalWidthCm   150 non-null    float64
 2   PetalLengthCm  150 non-null    float64
 3   PetalWidthCm   150 non-null    float64
 4   label          150 non-null    object 
dtypes: float64(4), object(1)
memory usage: 6.0+ KB


## Train-test Data Split

In [4]:
def train_test_split(df, test_size):
    """Randomly partition ``df`` into a training frame and a test frame.

    Parameters
    ----------
    df : pandas.DataFrame
        The full data set; rows are selected by index label.
    test_size : int or float
        Number of rows to hold out. A ``float`` is interpreted as a
        fraction of ``len(df)`` and rounded to the nearest row count.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        ``(train_df, test_df)``: ``test_df`` holds the sampled rows in
        sampling order, ``train_df`` holds every remaining row.

    Notes
    -----
    Sampling uses the module-level ``random`` generator, so call
    ``random.seed`` beforehand for a reproducible split.
    """
    # Turn a fractional size into an absolute row count.
    n_test = round(test_size * len(df)) if isinstance(test_size, float) else test_size

    # Draw the held-out index labels without replacement.
    held_out = random.sample(population=df.index.tolist(), k=n_test)

    return df.drop(held_out), df.loc[held_out]

In [5]:
# Seed the RNG so the same 20 rows are held out on every run.
random.seed(0)
(train_df, test_df) = train_test_split(df=df, test_size=20)

In [6]:
# Preview the first rows of the held-out test set.
test_df.head(n=5)

Unnamed: 0,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm,label
98,5.1,2.5,3.0,1.1,Iris-versicolor
107,7.3,2.9,6.3,1.8,Iris-virginica
10,5.4,3.7,1.5,0.2,Iris-setosa
66,5.6,3.0,4.5,1.5,Iris-versicolor
130,7.4,2.8,6.1,1.9,Iris-virginica


In [7]:
# Preview the first rows of the training set.
train_df.head(n=5)

Unnamed: 0,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm,label
0,5.1,3.5,1.4,0.2,Iris-setosa
1,4.9,3.0,1.4,0.2,Iris-setosa
2,4.7,3.2,1.3,0.2,Iris-setosa
3,4.6,3.1,1.5,0.2,Iris-setosa
4,5.0,3.6,1.4,0.2,Iris-setosa


## Converting to NumPy Arrays

In [8]:
# Materialise the training frame as a 2-D array (features followed by the label).
data = train_df.to_numpy()
data[0:5]

array([[5.1, 3.5, 1.4, 0.2, 'Iris-setosa'],
       [4.9, 3.0, 1.4, 0.2, 'Iris-setosa'],
       [4.7, 3.2, 1.3, 0.2, 'Iris-setosa'],
       [4.6, 3.1, 1.5, 0.2, 'Iris-setosa'],
       [5.0, 3.6, 1.4, 0.2, 'Iris-setosa']], dtype=object)

In [9]:
# Every column except the last one is a feature.
X_train = data[..., :-1]
X_train

array([[5.1, 3.5, 1.4, 0.2],
       [4.9, 3.0, 1.4, 0.2],
       [4.7, 3.2, 1.3, 0.2],
       [4.6, 3.1, 1.5, 0.2],
       [5.0, 3.6, 1.4, 0.2],
       [5.4, 3.9, 1.7, 0.4],
       [4.6, 3.4, 1.4, 0.3],
       [5.0, 3.4, 1.5, 0.2],
       [4.4, 2.9, 1.4, 0.2],
       [4.9, 3.1, 1.5, 0.1],
       [4.8, 3.4, 1.6, 0.2],
       [4.8, 3.0, 1.4, 0.1],
       [4.3, 3.0, 1.1, 0.1],
       [5.8, 4.0, 1.2, 0.2],
       [5.7, 4.4, 1.5, 0.4],
       [5.4, 3.9, 1.3, 0.4],
       [5.1, 3.5, 1.4, 0.3],
       [5.7, 3.8, 1.7, 0.3],
       [5.1, 3.8, 1.5, 0.3],
       [5.4, 3.4, 1.7, 0.2],
       [5.1, 3.7, 1.5, 0.4],
       [4.6, 3.6, 1.0, 0.2],
       [5.1, 3.3, 1.7, 0.5],
       [5.0, 3.0, 1.6, 0.2],
       [5.0, 3.4, 1.6, 0.4],
       [5.2, 3.5, 1.5, 0.2],
       [5.2, 3.4, 1.4, 0.2],
       [4.7, 3.2, 1.6, 0.2],
       [4.8, 3.1, 1.6, 0.2],
       [5.4, 3.4, 1.5, 0.4],
       [5.2, 4.1, 1.5, 0.1],
       [5.5, 4.2, 1.4, 0.2],
       [4.9, 3.1, 1.5, 0.1],
       [5.5, 3.5, 1.3, 0.2],
       [4.4, 3

In [10]:
# The last column holds the class label.
Y_train = data[..., -1]
Y_train

array(['Iris-setosa', 'Iris-setosa', 'Iris-setosa', 'Iris-setosa',
       'Iris-setosa', 'Iris-setosa', 'Iris-setosa', 'Iris-setosa',
       'Iris-setosa', 'Iris-setosa', 'Iris-setosa', 'Iris-setosa',
       'Iris-setosa', 'Iris-setosa', 'Iris-setosa', 'Iris-setosa',
       'Iris-setosa', 'Iris-setosa', 'Iris-setosa', 'Iris-setosa',
       'Iris-setosa', 'Iris-setosa', 'Iris-setosa', 'Iris-setosa',
       'Iris-setosa', 'Iris-setosa', 'Iris-setosa', 'Iris-setosa',
       'Iris-setosa', 'Iris-setosa', 'Iris-setosa', 'Iris-setosa',
       'Iris-setosa', 'Iris-setosa', 'Iris-setosa', 'Iris-setosa',
       'Iris-setosa', 'Iris-setosa', 'Iris-setosa', 'Iris-setosa',
       'Iris-setosa', 'Iris-setosa', 'Iris-setosa', 'Iris-setosa',
       'Iris-setosa', 'Iris-setosa', 'Iris-versicolor', 'Iris-versicolor',
       'Iris-versicolor', 'Iris-versicolor', 'Iris-versicolor',
       'Iris-versicolor', 'Iris-versicolor', 'Iris-versicolor',
       'Iris-versicolor', 'Iris-versicolor', 'Iris-versicolo

In [11]:
# Same conversion for the held-out rows: array first, then the feature columns.
data1 = test_df.to_numpy()
X_test = data1[..., :-1]
X_test

array([[5.1, 2.5, 3.0, 1.1],
       [7.3, 2.9, 6.3, 1.8],
       [5.4, 3.7, 1.5, 0.2],
       [5.6, 3.0, 4.5, 1.5],
       [7.4, 2.8, 6.1, 1.9],
       [6.7, 3.3, 5.7, 2.1],
       [6.3, 2.9, 5.6, 1.8],
       [6.7, 3.0, 5.0, 1.7],
       [7.7, 2.8, 6.7, 2.0],
       [6.1, 3.0, 4.6, 1.4],
       [5.9, 3.0, 5.1, 1.8],
       [5.7, 2.8, 4.5, 1.3],
       [7.2, 3.0, 5.8, 1.6],
       [5.0, 3.2, 1.2, 0.2],
       [6.3, 2.5, 4.9, 1.5],
       [4.8, 3.4, 1.9, 0.2],
       [5.6, 2.9, 3.6, 1.3],
       [6.3, 3.4, 5.6, 2.4],
       [4.9, 3.1, 1.5, 0.1],
       [5.7, 2.6, 3.5, 1.0]], dtype=object)

In [12]:
# Ground-truth labels of the held-out rows (last column).
Y_test = data1[..., -1]
Y_test

array(['Iris-versicolor', 'Iris-virginica', 'Iris-setosa',
       'Iris-versicolor', 'Iris-virginica', 'Iris-virginica',
       'Iris-virginica', 'Iris-versicolor', 'Iris-virginica',
       'Iris-versicolor', 'Iris-virginica', 'Iris-versicolor',
       'Iris-virginica', 'Iris-setosa', 'Iris-versicolor', 'Iris-setosa',
       'Iris-versicolor', 'Iris-virginica', 'Iris-setosa',
       'Iris-versicolor'], dtype=object)

## Naive Bayes Algorithm

In [13]:
class NaiveBayes:
    """Gaussian Naive Bayes classifier.

    Every feature is modelled, per class, as an independent normal
    distribution whose mean and variance are estimated from the training
    data. Prediction returns the class with the largest log-posterior.

    Parameters
    ----------
    var_smoothing : float, default 1e-9
        Fraction of the largest overall feature variance that is added to
        every per-class variance. This keeps a feature that is constant
        within a class from producing a zero variance (division by zero).
    """

    def __init__(self, var_smoothing=1e-9):
        self.var_smoothing = var_smoothing

    def fit(self, X, y):
        """Estimate per-class feature means, variances and class priors.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Numeric training features. Object-dtype arrays holding numbers
            are accepted and converted to ``float64``.
        y : array-like of shape (n_samples,)
            Class label of every training row.
        """
        # Cast once so all later arithmetic is plain float math, even when
        # the caller passes an object-dtype array (e.g. DataFrame.values).
        X = np.asarray(X, dtype=np.float64)
        y = np.asarray(y)

        n_samples, n_features = X.shape
        self._classes = np.unique(y)     # the distinct labels, sorted
        n_classes = len(self._classes)

        # init mean, variance, priors
        self._mean = np.zeros((n_classes, n_features), dtype=np.float64)
        self._var = np.zeros((n_classes, n_features), dtype=np.float64)
        self._priors = np.zeros(n_classes, dtype=np.float64)

        # Variance floor, scaled to the data; fall back to the raw
        # smoothing value when the overall variance is not positive.
        scale = X.var(axis=0).max()
        epsilon = self.var_smoothing * (scale if scale > 0 else 1.0)

        for idx, c in enumerate(self._classes):
            X_c = X[y == c]
            self._mean[idx, :] = X_c.mean(axis=0)
            self._var[idx, :] = X_c.var(axis=0) + epsilon
            self._priors[idx] = X_c.shape[0] / float(n_samples)

    def predict(self, X):
        """Predict a label for every row of ``X``.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Numeric features of the samples to classify.

        Returns
        -------
        list
            One predicted class label per row, in row order.
        """
        X = np.asarray(X, dtype=np.float64)
        return [self._predict(x) for x in X]

    def _predict(self, x):
        """Return the most probable class label for a single sample."""
        x = np.asarray(x, dtype=np.float64)
        log_posteriors = np.log(self._priors) + self._log_likelihood(x)
        return self._classes[np.argmax(log_posteriors)]

    def _log_likelihood(self, x):
        """Return log p(x | class) for every class as a 1-D array.

        The Gaussian log-density is evaluated directly instead of taking
        the log of the density: exponentiating first underflows to 0 for
        samples far from a class mean, which turns every posterior into
        ``-inf`` and makes the arg-max meaningless.
        """
        log_norm = -0.5 * np.log(2.0 * np.pi * self._var)
        log_kernel = -0.5 * (x - self._mean) ** 2 / self._var
        # Naive independence assumption: sum the per-feature terms.
        return np.sum(log_norm + log_kernel, axis=1)

    def _pdf(self, class_idx, x):
        """Return the per-feature Gaussian densities of ``x`` for one class.

        Kept for backward compatibility; ``predict`` works in log space
        via ``_log_likelihood`` to avoid underflow.
        """
        mean = self._mean[class_idx]
        var = self._var[class_idx]
        x = np.asarray(x, dtype=np.float64)
        numerator = np.exp(-((x - mean) ** 2) / (2.0 * var))
        denominator = np.sqrt(2.0 * np.pi * var)
        return numerator / denominator

## Implementation

In [14]:
# Train on the training split, then label every held-out sample.
nb = NaiveBayes()
nb.fit(X=X_train, y=Y_train)
Y_pred = nb.predict(X=X_test)
Y_pred

['Iris-versicolor',
 'Iris-virginica',
 'Iris-setosa',
 'Iris-versicolor',
 'Iris-virginica',
 'Iris-virginica',
 'Iris-virginica',
 'Iris-virginica',
 'Iris-virginica',
 'Iris-versicolor',
 'Iris-virginica',
 'Iris-versicolor',
 'Iris-virginica',
 'Iris-setosa',
 'Iris-versicolor',
 'Iris-setosa',
 'Iris-versicolor',
 'Iris-virginica',
 'Iris-setosa',
 'Iris-versicolor']

## Calculating Accuracy

In [15]:
# Convert both sides to arrays so the comparison is always element-wise;
# comparing two plain lists with == would collapse to a single bool and
# the .mean() call below would then fail.
val = np.asarray(Y_pred) == np.asarray(Y_test)
# Fraction of test samples whose predicted label matches the true label.
accuracy = val.mean()
accuracy

0.95

### The classifier reaches 95% accuracy on the 20-sample held-out test set (19 of 20 predictions correct)