In [1]:
import pandas as pd
import numpy as np
import sklearn.naive_bayes as nb

## Part 1

In [2]:
data = pd.read_csv('pima-indians-diabetes.csv', 
                   header=0, names=['Preganancies', "Glucose", "BloodPressure", "SkinThickness", "Insulin", "BMI", "DiabetesPedigreeFunction", "Age", "Class"])

In [3]:
data.head()

Unnamed: 0,Preganancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Class
0,1,85,66,29,0,26.6,0.351,31,0
1,8,183,64,0,0,23.3,0.672,32,1
2,1,89,66,23,94,28.1,0.167,21,0
3,0,137,40,35,168,43.1,2.288,33,1
4,5,116,74,0,0,25.6,0.201,30,0


## Data Split

In [4]:
# splitting data into train and test set
def data_split(x, ratio=0.8):
    n, m = data.shape
    idx = np.random.rand(n)
    train = x[idx <= ratio]
    test = x[idx > ratio]
    return train, test

In [5]:
train, test = data_split(data)
print(train.shape)
print(test.shape)

(623, 9)
(144, 9)


## Naive Bayes

In [8]:
# naive function
# train: data for training
# test: data for testing
# target: target variable
def naive(train, test, target):
    n = train.shape[0]
    p_train = train[train[target] == 1]
    n_train= train[train[target] == 0]
    p_train = p_train.drop([target], axis=1)
    n_train = n_train.drop([target], axis=1)
    
    p_mu = np.mean(p_train, axis=0)
    n_mu = np.mean(n_train, axis=0)
    p_var = np.var(p_train, axis=0)
    n_var = np.var(n_train, axis=0)

    p_prob = -(test - p_mu)**2/(2*p_var)
    n_prob = -(test - n_mu)**2/(2*n_var)
    p_result = np.sum(p_prob, axis=1) + np.log(p_train.shape[0]/n)
    n_result = np.sum(n_prob, axis=1) + np.log(n_train.shape[0]/n)
    return p_result, n_result

## Train

In [9]:
# 10 cross validation with all variables
accuracies = []
for i in np.arange(10):
    train, test = data_split(data)
    actual = test['Class']
    p_result, n_result = naive(train, test, 'Class')
    pred = np.array([int(tf) for tf in (p_result > n_result)])
    accuracy= np.count_nonzero(pred==actual)/ test.shape[0]
    accuracies.append(accuracy)
np.mean(accuracies)

0.7431529756959931

In [10]:
# excluding 0 for variables 2,3,5,7
# use copy function as df uses reference
data2 = data.copy()
data2[data2.iloc[:,[2,3,5,7]]==0]=None

accuracies = []
for i in np.arange(10):
    train, test = data_split(data2)
    actual = test['Class']
    p_result, n_result = naive(train, test, 'Class')
    pred = np.array([int(tf) for tf in (p_result > n_result)])
    accuracy= np.count_nonzero(pred==actual)/ test.shape[0]
    accuracies.append(accuracy)
np.mean(accuracies)

0.7139604828365359

In [11]:
from scipy.stats import norm
import matplotlib.pyplot as plt
x = np.linspace(50.0, 90.0, 100)
plt.plot(x,norm.pdf(x, p_mu, np.sqrt(p_var)))
plt.plot(x,norm.pdf(x, n_mu, np.sqrt(n_var)), c='r')
plt.show()

NameError: name 'p_mu' is not defined

In [12]:
# sklearn naive test
accuracies = []
for i in np.arange(10):
    train, test = data_split(data)
    p_prior = np.count_nonzero(train.Class==1)/train.shape[0]
    n_prior = np.count_nonzero(train.Class==0)/train.shape[0]
    nb_classifier = nb.GaussianNB(priors=[p_prior, n_prior])
    nb_classifier.fit(train.drop(["Class"], axis=1), train.Class)
    pred = nb_classifier.predict(test.drop(['Class'], axis=1))
    accuracies.append(np.count_nonzero(pred== test.Class)/test.shape[0])
np.mean(accuracies)

0.7330877726708633