### Naive-Bayes from scratch

In [1]:
import csv
import math
import random

In [39]:
# Function to load the dataset
def loadcsv():
    lines=csv.reader(open(r'diabetes.csv'))
    dataset=list(lines)
    for i in range (1,len(dataset)):
        dataset[i]=[float(x) for x in dataset[i]]
    return dataset[1:]

In [27]:
# Splitting the dataset into training and test
def splitDataset(dataset,splitRatio):
    trainSize=int(len(dataset)*splitRatio)
    trainSet=[]
    copy=list(dataset)
    while len(trainSet)<trainSize:
        index=random.randrange(len(copy))
        trainSet.append(copy.pop(index))
    return [trainSet, copy]

In [11]:
# Function to seperate data-values by class
def seperateByClass(dataset):
    seperated={}
    for i in range (len(dataset)):
        vector=dataset[i]
        if(vector[-1] not in seperated):
            seperated[vector[-1]]=[]
        seperated[vector[-1]].append(vector)
    return seperated

In [41]:
# Function to compute mean
def mean(numbers):
    return sum(numbers)/float(len(numbers))

In [13]:
# Function to calculate Standard-deviation
def std(numbers):
    avg=mean(numbers)
    variance=sum([pow(x-avg,2) for x in numbers])/float(len(numbers)-1)
    return math.sqrt(variance)

In [14]:
def summarize(dataset):
    summaries=[(mean(attributes),std(attributes)) for attributes in zip(*dataset)]
    del(summaries[-1])
    return summaries

In [29]:
def summarizeByClass(dataset):
    seperated=seperateByClass(dataset)
    summaries={}
    for classValue, instances in seperated.items():
        summaries[classValue]=summarize(instances)
    return summaries

In [16]:
def calculateProb(x, mean, stdev):
    exponent=math.exp(-(math.pow(x-mean,2)/(2*math.pow(stdev,2))))
    return (1/(math.sqrt(2*math.pi)*stdev))*exponent

In [43]:
def calculateClassProb(summaries, inputVector):
    probabilities={}
    for classValues, classSummaries in summaries.items():
        probabilities[classValues]=1
        for i in range(len(classSummaries)):
            mean, stdev=classSummaries[i]
            x=inputVector[i]
            probabilities[classValues]*=calculateProb(x, mean, stdev)
        return probabilities

In [46]:
def predict(summaries, inputVector):
    probabilities=calculateClassProb(summaries, inputVector)
    bestLabel, bestProb = None, -1
    for classValues, probability in probabilities.items():
        if bestLabel is None or probability>bestProb:
            bestProb=probability
            bestLabel=classValues
    return bestLabel

In [19]:
def getPredictions(summaries, testSet):
    predictions=[]
    for i in range (len(testSet)):
        result=predict(summaries, testSet[i])
        predictions.append(result)
    return predictions

In [48]:
def getAccuracy(testSet, predictions):
    correct=0
    for x in range (len(testSet)):
        if testSet[x][-1]==predictions[x]:
            correct+=1
    return (correct/float(len(testSet)))*100.0

In [61]:
def main():
    dataset=loadcsv()
    trainingSet, testSet=splitDataset(dataset, 0.67)
    print('Split {0} rows into train = {1} and test = {2} rows'.format(len(dataset), len(trainingSet), len(testSet)))
    # Prepare model
    summaries=summarizeByClass(trainingSet)
    # Test Model
    predictions=getPredictions(summaries, testSet)
    accuracy=getAccuracy(testSet, predictions)
    print('Accuracy: {}%'.format(accuracy))
    
main()

Split 768 rows into train = 514 and test = 254 rows
Accuracy: 68.89763779527559%
