In [692]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
import seaborn as sns
from scipy import stats
from collections import Counter, defaultdict

In [693]:
# Load the dataset into a DataFrame.
# NOTE(review): this is an absolute path on one specific machine; the notebook
# only runs where that file exists. Consider a configurable, relative data path.
Data=pd.read_csv('/Users/Supriya/Desktop/diabetes.csv')

In [694]:
# Preview the first rows of the freshly loaded frame.
Data.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [695]:
# Column order used for the rest of the notebook: the six continuous features
# first, then 'Age' and 'Pregnancies' (binned into labels further down), and
# the target 'Outcome' last.
New_Columns = [
    'Glucose',
    'BloodPressure',
    'SkinThickness',
    'Insulin',
    'BMI',
    'DiabetesPedigreeFunction',
    'Age',
    'Pregnancies',
    'Outcome',
]

In [696]:
# Keep only the columns listed in New_Columns, in that order, and rebind Data
# to the result (the originally loaded frame is no longer reachable as Data).
Data=Data[New_Columns]
Data.head()

Unnamed: 0,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Pregnancies,Outcome
0,148,72,35,0,33.6,0.627,50,6,1
1,85,66,29,0,26.6,0.351,31,1,0
2,183,64,0,0,23.3,0.672,32,8,1
3,89,66,23,94,28.1,0.167,21,1,0
4,137,40,35,168,43.1,2.288,33,0,1


In [697]:
# Impute missing values column by column: mean for 'Glucose' and
# 'BloodPressure', median for 'SkinThickness', 'Insulin' and 'BMI'.
#
# A single DataFrame.fillna call returning a new frame is used instead of
# `Data[col].fillna(..., inplace=True)`. The chained in-place form operates on
# a temporary Series: it emits a FutureWarning on recent pandas and leaves
# `Data` untouched once Copy-on-Write is enabled.
#
# NOTE(review): the rows displayed earlier contain 0 for 'Insulin' and
# 'SkinThickness'. If 0 is this dataset's missing-value marker, none of these
# columns holds NaN and this cell changes nothing -- confirm, and convert the
# zeros to NaN before imputing if so.
fill_values = {
    'Glucose': Data['Glucose'].mean(),
    'BloodPressure': Data['BloodPressure'].mean(),
    'SkinThickness': Data['SkinThickness'].median(),
    'Insulin': Data['Insulin'].median(),
    'BMI': Data['BMI'].median(),
}
Data = Data.fillna(value=fill_values)

In [698]:
# Sanity check: count of missing values remaining in each column.
missing_per_column = Data.isnull().sum()
print(missing_per_column)

Glucose                     0
BloodPressure               0
SkinThickness               0
Insulin                     0
BMI                         0
DiabetesPedigreeFunction    0
Age                         0
Pregnancies                 0
Outcome                     0
dtype: int64


In [699]:
# Discretise 'Pregnancies' into three labelled bins and 'Age' into two, using
# pd.cut with an integer bin count.
# NOTE(review): the bins are derived from the whole frame, i.e. before the
# train/test split further down -- confirm that is intended.
Data = Data.assign(
    Pregnancies=pd.cut(Data['Pregnancies'], bins=3, labels=['Small', 'Medium', 'Large']),
    Age=pd.cut(Data['Age'], bins=2, labels=['Young', 'Old']),
)

In [700]:
# Preview the frame now that 'Age' and 'Pregnancies' hold bin labels.
Data.head()

Unnamed: 0,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Pregnancies,Outcome
0,148,72,35,0,33.6,0.627,Young,Medium,1
1,85,66,29,0,26.6,0.351,Young,Small,0
2,183,64,0,0,23.3,0.672,Young,Medium,1
3,89,66,23,94,28.1,0.167,Young,Small,0
4,137,40,35,168,43.1,2.288,Young,Small,1


In [701]:
# X is a second name for the same DataFrame object as Data (no copy is made),
# so any in-place change to one is visible through the other.
X=Data
X.head()

Unnamed: 0,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Pregnancies,Outcome
0,148,72,35,0,33.6,0.627,Young,Medium,1
1,85,66,29,0,26.6,0.351,Young,Small,0
2,183,64,0,0,23.3,0.672,Young,Medium,1
3,89,66,23,94,28.1,0.167,Young,Small,0
4,137,40,35,168,43.1,2.288,Young,Small,1


In [702]:
# Split into train and test sets: every column except the last is a feature,
# the last column ('Outcome') is the target. test_size=0.2 holds out a fifth
# of the rows; random_state=0 makes the split repeatable.
X_train, X_test, Y_train, Y_test = train_test_split(
    X.iloc[:, :-1],
    X.iloc[:, -1],
    test_size=0.2,
    random_state=0,
)
X_train.shape, X_test.shape, Y_train.shape, Y_test.shape

((614, 8), (154, 8), (614,), (154,))

In [703]:
# Training features with the training labels attached as an 'Outcome' column,
# built as a new frame so X_train itself is left unchanged.
X_temp = X_train.assign(Outcome=Y_train)

In [704]:
# Separate the discrete-valued features into two groups based on 'Outcome'.
# Columns -3:-1 of X_temp are the two columns just before 'Outcome'; each
# group is stored as a plain array keyed by its class label.
Class_Data_disc = {
    label: X_temp.iloc[:, -3:-1][X_temp['Outcome'] == label].values
    for label in (0, 1)
}

In [705]:
# Separate the continuous-valued features into two groups based on 'Outcome'.
# The first six columns of X_temp are taken; each group is stored as a plain
# array keyed by its class label.
Class_Data_con = {
    label: X_temp.iloc[:, :6][X_temp['Outcome'] == label].values
    for label in (0, 1)
}

In [706]:
# Per-feature (column-wise, axis=0) mean and standard deviation of the
# continuous features, computed separately for Outcome=0 and Outcome=1.
Mean_0, Mean_1 = (np.mean(Class_Data_con[label], axis=0) for label in (0, 1))
STD_0, STD_1 = (np.std(Class_Data_con[label], axis=0) for label in (0, 1))

In [707]:
# Short names for the per-class discrete feature arrays.
Class_0_discrete, Class_1_discrete = Class_Data_disc[0], Class_Data_disc[1]

Finding Likelihoods for discrete valued features ('Age', 'Pregnancies')

In [708]:
def occurrences(list1):
    """Return the relative frequency of every distinct value in ``list1``.

    Parameters
    ----------
    list1 : sized iterable of hashable values
        The observations to tally.

    Returns
    -------
    dict
        Maps each distinct value to ``count / len(list1)``. An empty input
        yields an empty dict.
    """
    total = float(len(list1))
    return {value: count / total for value, count in Counter(list1).items()}

In [709]:
def frequency_(Class_discrete):
    """Build one relative-frequency table per column of ``Class_discrete``.

    Parameters
    ----------
    Class_discrete : 2-D array
        Indexed as ``Class_discrete[:, column]``; one column per feature.

    Returns
    -------
    dict
        Maps each column index to the dict returned by ``occurrences`` for
        that column.
    """
    return {
        column: occurrences(Class_discrete[:, column])
        for column in range(Class_discrete.shape[1])
    }


In [710]:
def prob_(Class_discrete,X_test):
    """Likelihood of each test row's discrete features under one class.

    For every row of ``X_test`` the values in its last two columns are looked
    up in the per-column relative-frequency tables built from
    ``Class_discrete`` and the frequencies found are multiplied together.

    Parameters
    ----------
    Class_discrete : 2-D array
        Discrete feature values of the training rows of a single class. Its
        j-th column is matched against the j-th of the last two columns of
        ``X_test``.
    X_test : pandas.DataFrame
        Samples to score; only its last two columns are read.

    Returns
    -------
    list
        One product per row of ``X_test``, in row order.

    Notes
    -----
    A value that never occurs in ``Class_discrete`` contributes a factor of 0,
    which zeroes the whole product for that row (no smoothing is applied).
    """
    # The tables depend only on the class data, so build them once instead of
    # once per (row, feature) pair.
    frequency_tables = frequency_(Class_discrete)
    # Work on plain positional values: indexing a label-indexed Series with an
    # integer position (``sample[j]``) is deprecated in pandas.
    discrete_rows = X_test.iloc[:, -2:].values
    results = []
    for row in discrete_rows:
        probability = 1
        for j, value in enumerate(row):
            # Unseen values get frequency 0.
            probability *= frequency_tables[j].get(value, 0)
        results.append(probability)
    return results

In [711]:
# Discrete-feature likelihood of every test row under each class.
prob_1, prob_0 = prob_(Class_1_discrete, X_test), prob_(Class_0_discrete, X_test)

Finding likelihoods for continuous-valued features ('Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI', 'DiabetesPedigreeFunction')

In [712]:
# Gaussian distribution
def likelihood(x,mean,sigma):
    """Gaussian probability density of ``x`` for the given mean and sigma.

    Evaluates ``exp(-(x - mean)^2 / (2 * sigma^2)) / (sqrt(2 * pi) * sigma)``.
    The arithmetic is element-wise, so scalars and arrays that broadcast
    together are both accepted.
    """
    exponent = -((x - mean) ** 2) / (2 * sigma ** 2)
    normaliser = 1 / (np.sqrt(2 * np.pi) * sigma)
    return np.exp(exponent) * normaliser

In [713]:
def posterior(X,X_train_class,mean_,std_,n_total=None):
    """Unnormalised class score from the continuous features.

    Multiplies the per-feature Gaussian likelihoods of each row of ``X``
    (product along axis 1) and scales the result by the class prior
    ``X_train_class.shape[0] / n_total``. The value is not divided by the
    evidence, so it is only meaningful when compared across classes.

    Parameters
    ----------
    X : 2-D array-like
        Rows to score, one column per continuous feature.
    X_train_class : array-like with ``.shape``
        Training rows belonging to the class; only its row count is used.
    mean_, std_ : array-like
        Per-feature mean and standard deviation passed to ``likelihood``.
    n_total : int, optional
        Total number of training rows used as the prior's denominator. When
        omitted, the row count of the module-level ``X_train`` is used, which
        is what this function always did before the parameter existed.

    Returns
    -------
    One score per row of ``X``.
    """
    if n_total is None:
        # Backward-compatible fallback to the notebook-level training frame.
        n_total = X_train.shape[0]
    prior = X_train_class.shape[0] / n_total
    joint_likelihood = np.prod(likelihood(X, mean_, std_), axis=1)
    return joint_likelihood * prior

Combining all the likelihoods with the class priors to get the (unnormalised) posterior probabilities

In [714]:
# Score every test row under each class: the continuous-feature score (first
# six columns) times the discrete-feature likelihood. Predict class 1 only
# where its score is strictly larger; ties go to class 0.
continuous_test = X_test.iloc[:, :6]
p_1 = posterior(continuous_test, Class_Data_con[1], Mean_1, STD_1) * prob_1
p_0 = posterior(continuous_test, Class_Data_con[0], Mean_0, STD_0) * prob_0
y_pred = 1 * (p_1 > p_0)

In [715]:
# Confusion matrix, laid out as [[tp, fn], [fp, tn]].
# Actual and predicted labels are paired up position by position.
outcome_pairs = list(zip(Y_test, y_pred))
tp = sum(1 for actual, predicted in outcome_pairs if actual == 1 and predicted == 1)
tn = sum(1 for actual, predicted in outcome_pairs if actual == 0 and predicted == 0)
fp = sum(1 for actual, predicted in outcome_pairs if actual == 0 and predicted == 1)
fn = sum(1 for actual, predicted in outcome_pairs if actual == 1 and predicted == 0)
confusion_matrix = np.array([[tp, fn], [fp, tn]])
print(confusion_matrix,type(confusion_matrix))

[[26 21]
 [14 93]] <class 'numpy.ndarray'>


In [716]:
# confusion_matrix is laid out as [[tp, fn], [fp, tn]]; unpack it row by row.
(tp, fn), (fp, tn) = confusion_matrix

# Standard classification metrics derived from the four counts.
Accuracy = (tp + tn) / confusion_matrix.sum()
Recall = tp / (tp + fn)
Precision = tp / (tp + fp)
F = (2 * Recall * Precision) / (Recall + Precision)

print('Accuracy= %.2f'%Accuracy,'\nRecall= %.2f'%Recall,'\nPrecision= %.2f'%Precision,'\nF= %.2f'%F)

Accuracy= 0.77 
Recall= 0.55 
Precision= 0.65 
F= 0.60
