In [18]:
import numpy as np
import pandas as pd
import sklearn

#load data

def load_iris_data(path="dataset/iris.csv"):
    """Read the iris dataset from a CSV file into a pandas DataFrame.

    Parameters
    ----------
    path : str, optional
        Location of the CSV file. Defaults to "dataset/iris.csv", which is
        resolved relative to the current working directory.

    Returns
    -------
    pandas.DataFrame
        The file contents exactly as parsed by ``pandas.read_csv``.
    """
    return pd.read_csv(path)

In [19]:
# Read the iris CSV into a DataFrame; every later cell derives its data from `iris`.
iris = load_iris_data()

# Data Preprocessing

In [20]:
#separate test set and train set

import hashlib
def test_set_check(identifier,test_ratio,hash):
    return bytearray(hash(np.int64(identifier)).digest())[-1]  < 51
def split_train_test_by_id(data,test_ratio,id_column,hash):
    """Flag the rows of `data` that belong to the test set.

    Despite its name, this does not return the two splits: it returns the
    result of applying ``test_set_check`` to every value of
    ``data[id_column]``, which callers use as a row mask.

    Parameters
    ----------
    data : frame holding the id column.
    test_ratio : forwarded unchanged to ``test_set_check``.
    id_column : name of the column holding the row identifiers.
    hash : hash constructor forwarded unchanged to ``test_set_check``.
    """
    def belongs_to_test(row_id):
        # Thin adapter so the per-id check can be applied element-wise.
        return test_set_check(row_id, test_ratio, hash)

    return data[id_column].apply(belongs_to_test)

In [21]:
# Add the row number as an "index" column to act as a stable identifier for
# hashing. The guard keeps this cell re-runnable: a second reset_index() would
# push the old index into the frame as an additional column.
if "index" not in iris.columns:
    iris = iris.reset_index()

# Hold out roughly 20% of the rows as the test set.
in_test_set = split_train_test_by_id(iris, 0.2, "index", hashlib.md5)

# Remove the helper identifier column from both splits. drop() returns new
# frames, so nothing is deleted in place from a filtered slice of `iris`.
test_set = iris[in_test_set].drop("index", axis=1)
train_set = iris[~in_test_set].drop("index", axis=1)

In [22]:
#converting data frame to matrix

# DataFrame.as_matrix() was removed in pandas 1.0; .values yields the same
# NumPy array and is available in old and new pandas versions alike.
train_set_array = train_set.values


In [23]:
# Build two matrix versions of the test split:
#   test_set_copy  - all columns including the true 'species' label (for scoring)
#   test_set_array - the same rows with the 'species' column removed
#
# The guard keeps the cell re-runnable: once 'species' has been removed from
# test_set, running the cell again must not overwrite the labelled matrix
# with an unlabelled one.
# .values replaces DataFrame.as_matrix(), which was removed in pandas 1.0.
if 'species' in test_set.columns:
    test_set_copy = test_set.values
    test_set = test_set.drop('species', axis=1)

test_set_array = test_set.values


In [24]:
#functions for separating data classwise

def separatedataset(data):
    """Group the rows of `data` by class label.

    The last element of each row is used as its label.

    Returns a dict mapping each label to the list of rows carrying it, with
    rows kept in their original order.
    """
    by_class = {}
    for position in range(len(data)):
        instance = data[position]
        # setdefault creates the bucket the first time a label is seen.
        by_class.setdefault(instance[-1], []).append(instance)
    return by_class


# Naive Bayes classification

In [25]:
import math
#mean of values [1,2,3,4] = (1+2+3+4)/4

def mean(numbers):
    """Return the arithmetic mean of `numbers` as a float.

    Raises ZeroDivisionError when `numbers` is empty.
    """
    total = sum(numbers)
    count = float(len(numbers))
    return total / count

#standard deviation of values [1,2,3,4] = sqrt(((1-2.5)^2+(2-2.5)^2+(3-2.5)^2+(4-2.5)^2)/4), where 2.5 is the mean

def stddev(numbers):
    """Return the population standard deviation of `numbers`.

    The squared deviations from the mean are summed and divided by
    ``len(numbers)`` (not ``len(numbers) - 1``) before taking the square root.
    """
    centre = mean(numbers)
    squared_deviations = 0
    for position in range(len(numbers)):
        squared_deviations += pow(numbers[position] - centre, 2)
    return math.sqrt(squared_deviations / float(len(numbers)))

In [26]:
#passing data with multiple attributes
#the function computes the mean and standard deviation of every attribute in the data and returns them as a list of (mean, stddev) pairs

def meandevdata(data):
    """Summarise every attribute column of `data` as a (mean, stddev) pair.

    Parameters
    ----------
    data : sequence of rows. The last element of each row is treated as the
        class label and is left out of the summary.

    Returns
    -------
    list of (mean, stddev) tuples, one per attribute, in column order.
    An empty `data` yields an empty list.
    """
    # zip(*data) transposes rows into columns. It is materialised with list()
    # because zip returns an iterator on Python 3, which supports neither item
    # deletion nor slicing; [:-1] then drops the trailing label column.
    attribute_columns = list(zip(*data))[:-1]
    return [(mean(column), stddev(column)) for column in attribute_columns]

In [27]:
#function calculating the mean and standard deviation of every attribute, separately for each class

def meandevclass(data):
    """Compute per-class attribute summaries for a labelled dataset.

    The rows are grouped by class with ``separatedataset`` and each group is
    summarised with ``meandevdata``.

    Returns a dict mapping each class label to that class's list of
    (mean, stddev) pairs.
    """
    separated = separatedataset(data)
    # dict.items() works on Python 2 and 3; iteritems() exists only on Python 2.
    return {
        classno: meandevdata(instances)
        for classno, instances in separated.items()
    }

In [28]:
import time
import timeit

# Time how long it takes to build the per-class mean/stddev summaries.
# timeit.default_timer is used because time.clock() was removed in Python 3.8;
# default_timer is available on both Python 2 and Python 3.
start_time = timeit.default_timer()

meandevbyclass = meandevclass(train_set_array)

end_time = timeit.default_timer()

In [29]:
#Gaussian (normal) probability density of an attribute value x, given the mean and standard deviation of that attribute within a class

def P_attributegivenclass(x,mean,dev):
    """Evaluate the Gaussian probability density at `x`.

    Parameters
    ----------
    x : attribute value being scored.
    mean : mean of the distribution.
    dev : standard deviation of the distribution.

    Returns the density value as a float. A `dev` of 0 leads to a division
    by zero.
    """
    squared_distance = math.pow(x - mean, 2)
    two_variance = 2 * math.pow(dev, 2)
    # Height of the bell curve at its centre: 1 / (sqrt(2*pi) * dev).
    peak_height = 1 / (math.sqrt(2 * math.pi) * dev)
    return peak_height * math.exp(-(squared_distance / two_variance))


In [30]:
#calculate, for every class, the probability of the input vector belonging to that class, and return all of these probabilities

def P_combiningprobforaclass(meandevclassdata,inputvector):
    """Score `inputvector` against every class.

    For each class, the per-attribute Gaussian densities are multiplied
    together; attribute i of the input is scored against the i-th
    (mean, stddev) pair of that class.

    Parameters
    ----------
    meandevclassdata : dict mapping class label -> list of (mean, stddev)
        pairs, one per attribute.
    inputvector : indexable holding the attribute values of one instance.
        Only the first ``len(pairs)`` positions are read, so a trailing label
        is ignored.

    Returns a dict mapping each class label to the product of densities.
    """
    probabilities = {}
    # dict.items() works on Python 2 and 3; iteritems() exists only on Python 2.
    for classValue, classmeandev in meandevclassdata.items():
        likelihood = 1
        # Local names avoid shadowing the module-level mean() helper.
        for i, (attr_mean, attr_dev) in enumerate(classmeandev):
            likelihood *= P_attributegivenclass(inputvector[i], attr_mean, attr_dev)
        probabilities[classValue] = likelihood
    return probabilities



In [31]:
#After getting the probabilities from the P_combiningprobforaclass function, return the most probable class for a test instance

def prediction(meandevclassdata, inputVector):
    """Return the class label with the highest score for `inputVector`.

    Scores come from ``P_combiningprobforaclass``. When several classes tie
    for the highest score, the first one encountered is kept. Returns None
    when `meandevclassdata` contains no classes.
    """
    probabilities = P_combiningprobforaclass(meandevclassdata, inputVector)
    bestLabel, bestProb = None, -1
    # dict.items() works on Python 2 and 3; iteritems() exists only on Python 2.
    for classValue, probability in probabilities.items():
        if bestLabel is None or probability > bestProb:
            bestLabel, bestProb = classValue, probability
    return bestLabel

In [32]:
#getting predictions for all test instance for the most probable class

def getPredictions(meandevclassdata, testSet):
    """Predict a class label for every instance in `testSet`.

    Returns a list with one predicted label per row, in row order.
    """
    return [
        prediction(meandevclassdata, testSet[row_no])
        for row_no in range(len(testSet))
    ]
# Classify every row of the test matrix using the per-class summaries.
# No placeholder list is assigned first: if this call fails, an empty
# `Predictions` must not be left behind for later cells to score.
Predictions = getPredictions(meandevbyclass, test_set_array)

In [33]:
#getting accuracy if predicted value is same as actual values

def getAccuracy(testSet, predictions):
    """Return the percentage of rows whose label matches the prediction.

    The last element of each row in `testSet` is taken as the true label and
    compared with the entry at the same position in `predictions`.

    Returns a float between 0.0 and 100.0. Raises ZeroDivisionError when
    `testSet` is empty.
    """
    hits = sum(
        1
        for row_no in range(len(testSet))
        if testSet[row_no][-1] == predictions[row_no]
    )
    return (hits / float(len(testSet))) * 100.0

In [34]:
#accuracy

# Report each result as one formatted string so the output reads the same
# whether print is a statement (Python 2) or a function (Python 3); passing
# two arguments in parentheses prints a tuple repr on Python 2.
print("Accuracy: {:.2f}%".format(getAccuracy(test_set_copy, Predictions)))
print("Training Time: {:.6f} s".format(end_time - start_time))


('Accuracy', 88.46153846153845)
('Training Time', 0.0013170000000000126)
