In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from tqdm import tqdm

In [2]:
data = pd.read_csv('./Dataset/Iris.csv') # Loading the dataset
print('Shape of the dataset:' , data.shape)

Shape of the dataset: (150, 6)


In [3]:
data.head() # Peeking the dataset

Unnamed: 0,Id,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm,Species
0,1,5.1,3.5,1.4,0.2,Iris-setosa
1,2,4.9,3.0,1.4,0.2,Iris-setosa
2,3,4.7,3.2,1.3,0.2,Iris-setosa
3,4,4.6,3.1,1.5,0.2,Iris-setosa
4,5,5.0,3.6,1.4,0.2,Iris-setosa


In [4]:
""" Changing the dataset classes from 3 to 2 for binary classification"""
for i in range(data.shape[0]):
    if data.loc[i, 'Species'] != 'Iris-virginica':
        data.loc[i , 'Species'] = 0
    else:
        data.loc[i , 'Species'] = 1

We will shuffle our data as the classes of the data set are not randomly ordered and we need randomly ordered dataset to make our model work on unknown data

In [5]:
data = data.sample(frac=1) # Shuffling our data
data.head()

Unnamed: 0,Id,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm,Species
141,142,6.9,3.1,5.1,2.3,1
7,8,5.0,3.4,1.5,0.2,0
149,150,5.9,3.0,5.1,1.8,1
142,143,5.8,2.7,5.1,1.9,1
87,88,6.3,2.3,4.4,1.3,0


The id column is not signifiacant for the prediction purposes so we will drop it. The classes to be predicted are in string format so we will change them into numerical data

In [6]:
data.drop(columns=['Id'], inplace=True)
#data = pd.get_dummies(data, dummy_na=False, columns=['Species'])
data.head()

Unnamed: 0,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm,Species
141,6.9,3.1,5.1,2.3,1
7,5.0,3.4,1.5,0.2,0
149,5.9,3.0,5.1,1.8,1
142,5.8,2.7,5.1,1.9,1
87,6.3,2.3,4.4,1.3,0


In [7]:
X = data.drop(columns=['Species'])
Y = data.get(['Species'])

In [8]:
split = 0.1
train_X = np.asarray(X[:-int(split*data.shape[0])])
train_Y = np.asarray(Y[:-int(split*data.shape[0])])

test_X = np.asarray(X[-int(split*data.shape[0]):])
test_Y = np.asarray(Y[-int(split*data.shape[0]):])

print('Shape of training data:\nX: {}\tY: {}'.format(train_X.shape, train_Y.shape))
print('Shape of testing data:\nX: {}\tY: {}'.format(test_X.shape, test_Y.shape))

Shape of training data:
X: (135, 4)	Y: (135, 1)
Shape of testing data:
X: (15, 4)	Y: (15, 1)


In [9]:
def mean(X):
    """ Function for calculating mean of X """
    return  (1/X.shape[0])*(np.sum(X))

In [10]:
def std(X, mean):
    """ Function for calculating standard deviation of X """
    return np.sqrt((1/X.shape[0])*np.sum((X-mean)**2))

In [68]:
def pdf(X, mean, std):
    """ Function for calculation probablity density """
    return (1/(np.sqrt(2*np.pi)*std)) * (np.exp(-(X-mean)**2 / (2*(std)**2)))

In [69]:
def features_by_class(X, Y):
    class_seprated_features = {'0': [], '1': []}
    for i in range(X.shape[0]):
        if Y[i] == 0:
            class_seprated_features['0'].append(X[i])
        else:
            class_seprated_features['1'].append(X[i])
    return class_seprated_features

In [70]:
seprated_features = features_by_class(train_X, train_Y)
print('Shape of class 0 features: {}\nShape of class 1 features: {}'
      .format(np.shape(seprated_features['0']), np.shape(seprated_features['1'])))

Shape of class 0 features: (92, 4)
Shape of class 1 features: (43, 4)


In [71]:
seprated_features_class0 = np.asarray(seprated_features['0'])
seprated_features_class1 = np.asarray(seprated_features['1'])

In [77]:
def compute_summary(features):
    computed_summary = []
    for j in range(len(features)):
        summary = []
        for i in range(len(features[j])):
            for k in range(len(features[j][i][k])):
                mean, std = mean(features[j][i][k]), std(features[j][:, i], mean(features[j][:, i]))
                
        computed_summary.append(summary)
        
    return computed_summary

In [96]:
len(seprated_features['0'][0])

4

In [78]:
summ = compute_summary(seprated_features)

AttributeError: 'dict' object has no attribute 'shape'

In [73]:
summary_class0 = compute_summary(seprated_features_class0)
summary_class1 = compute_summary(seprated_features_class1)

In [74]:
print('Summary of class 0: {}\n\nSummary of class 1: {}'.format(summary_class0, summary_class1))

Summary of class 0: [[5.448913043478261, 0.620920523727787], [3.1043478260869573, 0.48743183324490774], [2.7902173913043478, 1.427300567246703], [0.7576086956521739, 0.5624545770359892]]

Summary of class 1: [[6.604651162790696, 0.643736725172533], [2.9767441860465116, 0.3325987940673194], [5.567441860465116, 0.5651593437510317], [2.023255813953488, 0.2709476988081902]]


In [63]:
def calculate_probablities(X, summary):
    

0.06434552909700528

0.06248965759370005