# Naive Bayes Algorithm on Iris classification dataset

In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from tqdm import tqdm

In [2]:
data = pd.read_csv('./Dataset/Iris.csv') # Loading the dataset
print('Shape of the dataset:' , data.shape)

Shape of the dataset: (150, 6)


In [3]:
data.head() # Peeking the dataset

Unnamed: 0,Id,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm,Species
0,1,5.1,3.5,1.4,0.2,Iris-setosa
1,2,4.9,3.0,1.4,0.2,Iris-setosa
2,3,4.7,3.2,1.3,0.2,Iris-setosa
3,4,4.6,3.1,1.5,0.2,Iris-setosa
4,5,5.0,3.6,1.4,0.2,Iris-setosa


In [4]:
""" Changing the dataset classes from 3 to 2 for binary classification"""
for i in range(data.shape[0]):
    if data.loc[i, 'Species'] != 'Iris-virginica':
        data.loc[i , 'Species'] = 0
    else:
        data.loc[i , 'Species'] = 1

We will shuffle our data as the classes of the data set are not randomly ordered and we need randomly ordered dataset to make our model work on unknown data

In [5]:
data = data.sample(frac=1) # Shuffling our data
data.head()

Unnamed: 0,Id,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm,Species
83,84,6.0,2.7,5.1,1.6,0
98,99,5.1,2.5,3.0,1.1,0
126,127,6.2,2.8,4.8,1.8,1
26,27,5.0,3.4,1.6,0.4,0
45,46,4.8,3.0,1.4,0.3,0


The id column is not signifiacant for the prediction purposes so we will drop it. The classes to be predicted are in string format so we will change them into numerical data

In [6]:
data.drop(columns=['Id'], inplace=True)
#data = pd.get_dummies(data, dummy_na=False, columns=['Species'])
data.head()

Unnamed: 0,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm,Species
83,6.0,2.7,5.1,1.6,0
98,5.1,2.5,3.0,1.1,0
126,6.2,2.8,4.8,1.8,1
26,5.0,3.4,1.6,0.4,0
45,4.8,3.0,1.4,0.3,0


In [7]:
X = data.drop(columns=['Species'])
Y = data.get(['Species'])

In [8]:
# Splitting the dataset into test and train
split = 0.1
train_X = np.asarray(X[:-int(split*data.shape[0])])
train_Y = np.asarray(Y[:-int(split*data.shape[0])])

test_X = np.asarray(X[-int(split*data.shape[0]):])
test_Y = np.asarray(Y[-int(split*data.shape[0]):])

print('Shape of training data:\nX: {}\tY: {}'.format(train_X.shape, train_Y.shape))
print('Shape of testing data:\nX: {}\tY: {}'.format(test_X.shape, test_Y.shape))

Shape of training data:
X: (135, 4)	Y: (135, 1)
Shape of testing data:
X: (15, 4)	Y: (15, 1)


In [9]:
def mean(X):
    """ Function for calculating mean of X """
    return  (1/X.shape[0])*(np.sum(X))

In [10]:
def std(X, mean):
    """ Function for calculating standard deviation of X """
    return np.sqrt((1/X.shape[0])*np.sum((X-mean)**2))

In [11]:
def pdf(X, mean, std):
    """ Function for calculation probablity density """
    return (1/(np.sqrt(2*np.pi)*std)) * (np.exp(-(X-mean)**2 / (2*(std)**2)))

In [12]:
def features_by_class(X, Y):
    """ Separating the dataset according to classes for carrying out calculation on class respective features """
    class_separated_features = {'0': [], '1': []}
    for i in range(X.shape[0]):
        if Y[i] == 0:
            class_separated_features['0'].append(X[i])
        else:
            class_separated_features['1'].append(X[i])
    return class_separated_features

In [13]:
separated_features = features_by_class(train_X, train_Y)
print('Shape of class 0 features: {}\nShape of class 1 features: {}'
      .format(np.shape(separated_features['0']), np.shape(separated_features['1'])))

Shape of class 0 features: (88, 4)
Shape of class 1 features: (47, 4)


In [14]:
# Changing the features into numpy arrays for easier computiation
separated_features_class0 = np.asarray(separated_features['0'])
separated_features_class1 = np.asarray(separated_features['1'])

In [15]:
def compute_summary(features):
    """ Calculates summary (i.e. mean and standard deviation per feature respective to class) """
    computed_summary = []
    summary = []
    for i in range(features.shape[1]):
        summary = [mean(features[:, i]), std(features[:, i], mean(features[:, i]))]
        computed_summary.append(summary)

    return computed_summary

In [16]:
summary_class0 = compute_summary(separated_features_class0)
summary_class1 = compute_summary(separated_features_class1)

In [17]:
print('Summary of class 0: {}\n\nSummary of class 1: {}'.format(summary_class0, summary_class1))

Summary of class 0: [[5.4363636363636365, 0.6261683294925607], [3.0829545454545455, 0.43932323751970304], [2.8261363636363637, 1.4491764368224187], [0.7704545454545456, 0.5705179247660312]]

Summary of class 1: [[6.574468085106383, 0.6465845076841731], [2.974468085106383, 0.31855263442064513], [5.536170212765957, 0.559416691614937], [2.019148936170213, 0.27260625324335264]]


In [18]:
summary = [summary_class0, summary_class1] # COncatinating the features back to one single list
np.shape(summary)

(2, 4, 2)

In [19]:
def calculate_probablities(X, summary):
    """ Calculates the probablities for each class by drawing probablity from Gaussian Probablity Distribution """
    probablities = {}
    for class_value in range(np.shape(summary)[0]):
        probablities[class_value] = 0
        for feature in summary[class_value]:
            mean, stddev = feature
            probablities[class_value] += pdf(X, mean, stddev)
        probablities[class_value] = np.sum(probablities[class_value] ,axis=1) /4
    return probablities

In [20]:
pred = calculate_probablities(test_X, summary) # Making predictions on test data

In [21]:
# Calculating accuracy
pred_Y = []
for i in range(len(pred[0])):
    if pred[0][i] > pred[1][i]:
        pred_Y.append(0)
    else:
        pred_Y.append(1)
acc = np.sum(np.equal(pred_Y, test_Y[:, 0])) / test_Y.shape[0]
print("Accuracy on test data: {}%".format(acc*100))

Accuracy on test data: 93.33333333333333%
