# Importing modules

In [11]:
import numpy as np
import pandas as pd
import math

# Loading Data

In [12]:
data = pd.read_csv("diabetes.csv")

In [13]:
data.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [14]:
len(data.index)

768

In [15]:
data = data.sample(frac = 1)
data = np.array(data)

# Train and Test Dataset

In [16]:
x = int(len(data) * 0.67)
train = data[ : x]
test = data[x: ]
print(len(train),len(test))

514 254


# Creating Naive Bayes Model

In [17]:
def separated_by_class(dataset):
    separated = {}
    for x in range(len(dataset)):
        vector = dataset[x]
        if(vector[-1] not in separated):
            separated[vector[-1]] = []
        separated[vector[-1]].append(vector)
    return separated

def mean(numbers):
    return sum(numbers)/float(len(numbers))

def stdev(numbers):
    avg = mean(numbers)
    variance = sum([pow(x-avg,2) for x in numbers])/ float(len(numbers) - 1)
    return math.sqrt(variance)

def summerize(dataset):
    summeries = [(mean(attributes), stdev(attributes)) for attributes in zip(*dataset)]
    del summeries[-1]
    return summeries

def summerize_by_class(dataset):
    separated = separated_by_class(dataset)
    summeries = {}
    for class_value, instances in separated.items():
        summeries[class_value] = summerize(instances)
    return summeries

def calculate_probability(x, mean, stdev):
    exponent = math.exp(math.pow(x-mean, 2)/(2*math.pow(stdev,2)))
    return (1/(math.sqrt(2*math.pi)*stdev))*exponent

def calculate_class_probability(summeries, input_vector):
    probabilities = {}
    for class_value, class_summeries in summeries.items():
        probabilities[class_value] = 1
        for i in range(len(class_summeries)):
            mean, stdev = class_summeries[i]
            x = input_vector[i]
            probabilities[class_value] *= calculate_probability(x, mean, stdev)
        return probabilities


# Applying Model and Prediction

In [18]:
def predict(summeries, input_vector):
    probabilities = calculate_class_probability(summeries, input_vector)
    best_label, best_prob = None, 1
    for class_value, probability in probabilities.items():
        if ((best_label is None) or (probability > best_prob)):
            best_label = class_value
            best_prob = probability
    return best_label

def get_prediction(summeries, test):
    predictions = []
    for i in range(len(test)):
        result = predict(summeries, test[i])
        predictions.append(result)
    return predictions

def get_accuracy(test, prediction):
    correct = 0
    for x in range(len(test)):
        if(test[x][-1] == prediction[x]):
            correct += 1
    return (correct/float(len(test)))*100

In [19]:
if (__name__ == "__main__"):
    summeries = summerize_by_class(train)
    predictions = get_prediction(summeries, test)
    accuracy = get_accuracy(test, predictions)
    print("Accuracy: {0}%".format(accuracy))

Accuracy: 64.96062992125984%


# Conclusion
The accuracy of our Naive Bayes model in analysis of PIMA Indian Diabetes dataset is 64.96%. Which is just quite good!!!