In [1]:
import math 
import random 
import csv 
  
def encode_class(mydata): 
    classes = [] 
    for i in range(len(mydata)): 
        if mydata[i][-1] not in classes: 
            classes.append(mydata[i][-1]) 
    for i in range(len(classes)): 
        for j in range(len(mydata)): 
            if mydata[j][-1] == classes[i]: 
                mydata[j][-1] = i 
    return mydata             
              

def splitting(mydata, ratio): 
    train_num = int(len(mydata) * ratio) 
    train = [] 
    # initally testset will have all the dataset  
    test = list(mydata) 
    while len(train) < train_num: 
        # index generated randomly from range 0  
        # to length of testset 
        index = random.randrange(len(test)) 
        # from testset, pop data rows and put it in train 
        train.append(test.pop(index)) 
    return train, test 
  

def groupUnderClass(mydata): 
      dict = {} 
      for i in range(len(mydata)): 
          if (mydata[i][-1] not in dict): 
              dict[mydata[i][-1]] = [] 
          dict[mydata[i][-1]].append(mydata[i]) 
      return dict
  

def mean(numbers): 
    return sum(numbers) / float(len(numbers)) 
  
# Calculating Standard Deviation 
def std_dev(numbers): 
    avg = mean(numbers) 
    variance = sum([pow(x - avg, 2) for x in numbers]) / float(len(numbers) - 1) 
    return math.sqrt(variance) 
  
def MeanAndStdDev(mydata): 
    info = [(mean(attribute), std_dev(attribute)) for attribute in zip(*mydata)] 
    # eg: list = [ [a, b, c], [m, n, o], [x, y, z]] 
    # here mean of 1st attribute =(a + m+x), mean of 2nd attribute = (b + n+y)/3 
    # delete summaries of last class 
    del info[-1] 
    return info 
  

def MeanAndStdDevForClass(mydata): 
    info = {} 
    dict = groupUnderClass(mydata) 
    for classValue, instances in dict.items(): 
        info[classValue] = MeanAndStdDev(instances) 
    return info 
  

def calculateGaussianProbability(x, mean, stdev): 
    expo = math.exp(-(math.pow(x - mean, 2) / (2 * math.pow(stdev, 2)))) 
    return (1 / (math.sqrt(2 * math.pi) * stdev)) * expo 
  

def calculateClassProbabilities(info, test): 
    probabilities = {} 
    for classValue, classSummaries in info.items(): 
        probabilities[classValue] = 1
        for i in range(len(classSummaries)): 
            mean, std_dev = classSummaries[i] 
            x = test[i] 
            probabilities[classValue] *= calculateGaussianProbability(x, mean, std_dev) 
    return probabilities 
  
def predict(info, test): 
    probabilities = calculateClassProbabilities(info, test) 
    bestLabel, bestProb = None, -1
    for classValue, probability in probabilities.items(): 
        if bestLabel is None or probability > bestProb: 
            bestProb = probability 
            bestLabel = classValue 
    return bestLabel 
  
def getPredictions(info, test): 
    predictions = [] 
    for i in range(len(test)): 
        result = predict(info, test[i]) 
        predictions.append(result) 
    return predictions 

def accuracy_rate(test, predictions): 
    correct = 0
    for i in range(len(test)): 
        if test[i][-1] == predictions[i]: 
            correct += 1
    return (correct / float(len(test))) * 100.0


In [2]:
# driver code 
import math 
import random 
import csv  
# add the data path in your system 
filename ='pima-indians-diabetes.csv'
  
# load the file and store it in mydata list 
mydata = csv.reader(open(filename, "rt"))
mydata = list(mydata)

In [3]:
mydata

[['6', '148', '72', '35', '0', '33.6', '0.627', '50', '1'],
 ['1', '85', '66', '29', '0', '26.6', '0.351', '31', '0'],
 ['8', '183', '64', '0', '0', '23.3', '0.672', '32', '1'],
 ['1', '89', '66', '23', '94', '28.1', '0.167', '21', '0'],
 ['0', '137', '40', '35', '168', '43.1', '2.288', '33', '1'],
 ['5', '116', '74', '0', '0', '25.6', '0.201', '30', '0'],
 ['3', '78', '50', '32', '88', '31.0', '0.248', '26', '1'],
 ['10', '115', '0', '0', '0', '35.3', '0.134', '29', '0'],
 ['2', '197', '70', '45', '543', '30.5', '0.158', '53', '1'],
 ['8', '125', '96', '0', '0', '0.0', '0.232', '54', '1'],
 ['4', '110', '92', '0', '0', '37.6', '0.191', '30', '0'],
 ['10', '168', '74', '0', '0', '38.0', '0.537', '34', '1'],
 ['10', '139', '80', '0', '0', '27.1', '1.441', '57', '0'],
 ['1', '189', '60', '23', '846', '30.1', '0.398', '59', '1'],
 ['5', '166', '72', '19', '175', '25.8', '0.587', '51', '1'],
 ['7', '100', '0', '0', '0', '30.0', '0.484', '32', '1'],
 ['0', '118', '84', '47', '230', '45.8', 

In [4]:
classes = [] 
for i in range(len(mydata)): 
    if mydata[i][-1] not in classes: 
        classes.append(mydata[i][-1])

In [5]:
for i in range(len(classes)): 
    for j in range(len(mydata)): 
        if mydata[j][-1] == classes[i]: 
            mydata[j][-1] = i
#mydata = encode_class(mydata)


In [6]:
for i in range(len(mydata)): 
    mydata[i] = [float(x) for x in mydata[i]]   


In [7]:
mydata

[[6.0, 148.0, 72.0, 35.0, 0.0, 33.6, 0.627, 50.0, 0.0],
 [1.0, 85.0, 66.0, 29.0, 0.0, 26.6, 0.351, 31.0, 1.0],
 [8.0, 183.0, 64.0, 0.0, 0.0, 23.3, 0.672, 32.0, 0.0],
 [1.0, 89.0, 66.0, 23.0, 94.0, 28.1, 0.167, 21.0, 1.0],
 [0.0, 137.0, 40.0, 35.0, 168.0, 43.1, 2.288, 33.0, 0.0],
 [5.0, 116.0, 74.0, 0.0, 0.0, 25.6, 0.201, 30.0, 1.0],
 [3.0, 78.0, 50.0, 32.0, 88.0, 31.0, 0.248, 26.0, 0.0],
 [10.0, 115.0, 0.0, 0.0, 0.0, 35.3, 0.134, 29.0, 1.0],
 [2.0, 197.0, 70.0, 45.0, 543.0, 30.5, 0.158, 53.0, 0.0],
 [8.0, 125.0, 96.0, 0.0, 0.0, 0.0, 0.232, 54.0, 0.0],
 [4.0, 110.0, 92.0, 0.0, 0.0, 37.6, 0.191, 30.0, 1.0],
 [10.0, 168.0, 74.0, 0.0, 0.0, 38.0, 0.537, 34.0, 0.0],
 [10.0, 139.0, 80.0, 0.0, 0.0, 27.1, 1.441, 57.0, 1.0],
 [1.0, 189.0, 60.0, 23.0, 846.0, 30.1, 0.398, 59.0, 0.0],
 [5.0, 166.0, 72.0, 19.0, 175.0, 25.8, 0.587, 51.0, 0.0],
 [7.0, 100.0, 0.0, 0.0, 0.0, 30.0, 0.484, 32.0, 0.0],
 [0.0, 118.0, 84.0, 47.0, 230.0, 45.8, 0.551, 31.0, 0.0],
 [7.0, 107.0, 74.0, 0.0, 0.0, 29.6, 0.254, 31.0

In [8]:
# split ratio = 0.7  
# 70% of data is training data and 30% is test data used for testing 
ratio = 0.7
#train_data, test_data = splitting(mydata, ratio)
train_num = int(len(mydata) * ratio) 
train_data = [] 
# initally testset will have all the dataset  
test_data = list(mydata) 
while len(train_data) < train_num: 
    # index generated randomly from range 0  
    # to length of testset 
    index = random.randrange(len(test_data)) 
    # from testset, pop data rows and put it in train 
    train_data.append(test_data.pop(index)) 
#return train, test 

print('Total number of examples are: ', len(mydata)) 
print('Out of these, training examples are: ', len(train_data)) 
print("Test examples are: ", len(test_data))  


Total number of examples are:  768
Out of these, training examples are:  537
Test examples are:  231


In [9]:
# prepare model 
info = MeanAndStdDevForClass(train_data) 


In [10]:
info

{0.0: [(4.909090909090909, 3.729751458650296),
  (143.02139037433156, 31.735797300753774),
  (70.77005347593582, 22.10059021002683),
  (21.572192513368982, 17.460827955998433),
  (96.65775401069519, 133.23074204190218),
  (35.29144385026738, 7.570388802021054),
  (0.5507219251336901, 0.3921017090190104),
  (37.18716577540107, 10.940810745710122)],
 1.0: [(3.2714285714285714, 2.96648629413297),
  (109.90571428571428, 26.530832394403127),
  (68.62857142857143, 18.411385717208645),
  (20.034285714285716, 15.02277054591306),
  (66.52571428571429, 95.05497231049867),
  (30.436571428571426, 7.489678071152092),
  (0.4255057142857142, 0.28353763710978086),
  (30.985714285714284, 11.39382637985562)]}

In [11]:
# test model 
predictions = getPredictions(info, test_data) 
predictions

[1.0,
 0.0,
 0.0,
 0.0,
 1.0,
 1.0,
 1.0,
 0.0,
 1.0,
 0.0,
 0.0,
 0.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 0.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 0.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 0.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 0.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 0.0,
 1.0,
 1.0,
 1.0,
 0.0,
 1.0,
 1.0,
 1.0,
 1.0,
 0.0,
 1.0,
 0.0,
 1.0,
 0.0,
 1.0,
 0.0,
 0.0,
 1.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 1.0,
 0.0,
 0.0,
 0.0,
 1.0,
 1.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 1.0,
 1.0,
 1.0,
 1.0,
 0.0,
 1.0,
 1.0,
 1.0,
 1.0,
 0.0,
 1.0,
 1.0,
 1.0,
 0.0,
 1.0,
 0.0,
 0.0,
 0.0,
 0.0,
 1.0,
 1.0,
 0.0,
 1.0,
 0.0,
 0.0,
 1.0,
 0.0,
 0.0,
 1.0,
 1.0,
 0.0,
 0.0,
 1.0,
 0.0,
 1.0,
 1.0,
 1.0,
 1.0,
 0.0,
 1.0,
 1.0,
 0.0,
 0.0,
 1.0,
 0.0,
 1.0,
 1.0,
 0.0,
 0.0,
 1.0,
 0.0,
 1.0,
 0.0,
 1.0,
 0.0,
 1.0,
 1.0,
 0.0,
 1.0,
 0.0,
 0.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 0.0,
 0.0,
 1.0,
 0.0,
 0.0,
 0.0

In [12]:
accuracy = accuracy_rate(test_data, predictions) 
print("Accuracy of your model is: ", accuracy)

Accuracy of your model is:  73.16017316017316
