In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import math
import sklearn

In [None]:
# Load the dataset; later cells use the first three columns as features and
# column index 3 as the 0/1 class label.
data = pd.read_csv('health_data.csv')
data.head()

In [None]:
# First three columns are the features; column index 3 is used as the class label.
data_array = data.to_numpy()
X = data_array[:, :3]
Y = data_array[:, 3]
split = 0.8  # fraction of rows used for training (80:20 split, random each time)

# Shuffle the row indices once and cut the permutation in two. This guarantees
# the train and test sets are disjoint and that no row is drawn twice
# (np.random.choice samples WITH replacement unless replace=False is given).
# Call np.random.seed(<int>) before this cell if a reproducible split is needed.
n_samples = X.shape[0]
n_train = int(split * n_samples)
shuffled_indices = np.random.permutation(n_samples)
train_samples = shuffled_indices[:n_train]
test_samples = shuffled_indices[n_train:]

# Index with the integer arrays directly: wrapping them in a list (X[[idx]])
# inserts an extra leading axis on current NumPy versions.
X_train = X[train_samples]
Y_train = Y[train_samples]

X_test = X[test_samples]
Y_test = Y[test_samples]

# Per-class views of the training and test features.
X_0 = X_train[Y_train == 0]
X_1 = X_train[Y_train == 1]

X_test_0 = X_test[Y_test == 0]
X_test_1 = X_test[Y_test == 1]
print(X_test.shape)

In [None]:
# Feature matrices (first three columns) of the full dataset, one per value of
# the 'category' column; these feed the histograms in the following cells.
X_00, X_11 = (
    data.loc[data['category'] == label].to_numpy()[:, :3]
    for label in (0, 1)
)

In [None]:
# Category 0, feature column 0. `normed=` was removed from Matplotlib;
# `density=True` is the replacement and scales the histogram to unit area.
hist1 = plt.hist(X_00[:, 0], bins=40, density=True)

In [None]:
# Category 0, feature column 1. `normed=` was removed from Matplotlib;
# `density=True` is the replacement and scales the histogram to unit area.
hist2 = plt.hist(X_00[:, 1], bins=30, density=True)

In [None]:
# Category 0, feature column 2. `normed=` was removed from Matplotlib;
# `density=True` is the replacement and scales the histogram to unit area.
hist3 = plt.hist(X_00[:, 2], bins=30, density=True)

In [None]:
# Category 1, feature column 0. `normed=` was removed from Matplotlib;
# `density=True` is the replacement and scales the histogram to unit area.
hist11 = plt.hist(X_11[:, 0], bins=20, density=True)

In [None]:
# Category 1, feature column 1. `normed=` was removed from Matplotlib;
# `density=True` is the replacement and scales the histogram to unit area.
hist12 = plt.hist(X_11[:, 1], bins=30, density=True)

In [None]:
# Category 1, feature column 2. `normed=` was removed from Matplotlib;
# `density=True` is the replacement and scales the histogram to unit area.
hist13 = plt.hist(X_11[:, 2], bins=40, density=True)

In [None]:
#Hypercube_Kernel
def hypercube_kernel(h, x, x_i):
    """Return the offset of ``x`` from ``x_i`` measured in units of ``h``.

    Parameters
    ----------
    h : scalar
        Window width used to scale the offset.
    x, x_i : array
        Query point and sample point; their shapes must be equal.

    Returns
    -------
    array
        ``(x - x_i) / h``, element-wise.

    Raises
    ------
    AssertionError
        If ``x`` and ``x_i`` differ in shape.
    """
    assert x.shape == x_i.shape
    offset = x - x_i
    return offset / h


#Window_function
def parzen_window_func(x_vec):
    """Unit-hypercube window: 1 if the scaled offset is inside the cube, else 0.

    Parameters
    ----------
    x_vec : array-like
        Offset between the query point and a sample, already divided by the
        window width (see ``hypercube_kernel``).

    Returns
    -------
    int
        ``0`` if any coordinate has absolute value greater than 1/2,
        otherwise ``1``.
    """
    # Every coordinate must be inspected. Deciding after the first one would
    # count samples that lie outside the cube along any other axis.
    if np.any(np.abs(x_vec) > 0.5):
        return 0
    return 1

#Estimation
def parzen_estimation_for_hyper_cube(x_samples, point_x, h, d):
    """Parzen-window density estimate at ``point_x`` using a hypercube window.

    Parameters
    ----------
    x_samples : 2-D array
        One sample per row.
    point_x : array
        Query point; its length must equal the number of columns of
        ``x_samples``.
    h : scalar
        Window width.
    d : int
        Exponent applied to ``h`` to obtain the window volume ``h**d``.

    Returns
    -------
    float
        Fraction of samples counted by the window, divided by ``h**d``.

    Raises
    ------
    AssertionError
        If ``point_x`` does not have one entry per column of ``x_samples``.
    """
    n_features = x_samples.shape[1]
    assert len(point_x) == n_features
    # Count the samples whose scaled offset from point_x passes the window test.
    hits = sum(
        parzen_window_func(hypercube_kernel(h=h, x=point_x, x_i=sample))
        for sample in x_samples
    )
    n_samples = len(x_samples)
    return (hits / n_samples) / (h ** d)

In [None]:
def pdf_multivariate_gauss(x, mu, cov):
    """Evaluate the multivariate Gaussian density N(mu, cov) at ``x``.

    Parameters
    ----------
    x : array-like
        Evaluation point, either a column vector of shape (d, 1) or a flat
        vector of shape (d,).
    mu : array-like
        Mean, same layouts as ``x``.
    cov : array-like
        Covariance matrix of shape (d, d).

    Returns
    -------
    float
        Density value at ``x``.

    Raises
    ------
    AssertionError
        If the vectors are not (d, 1) columns after normalisation, ``cov`` is
        not square, or the dimensions disagree.
    numpy.linalg.LinAlgError
        If ``cov`` cannot be inverted.
    """
    x = np.asarray(x, dtype=float)
    mu = np.asarray(mu, dtype=float)
    cov = np.asarray(cov, dtype=float)
    # Flat vectors (e.g. rows taken from a sample matrix) are treated as columns.
    if x.ndim == 1:
        x = x.reshape(-1, 1)
    if mu.ndim == 1:
        mu = mu.reshape(-1, 1)

    # Require genuine column vectors. Testing shape[1] == 1 (instead of
    # shape[0] > shape[1]) also admits the one-dimensional case of shape (1, 1).
    assert mu.ndim == 2 and mu.shape[1] == 1
    assert x.ndim == 2 and x.shape[1] == 1
    assert cov.ndim == 2 and cov.shape[0] == cov.shape[1]
    assert mu.shape[0] == cov.shape[0]
    assert mu.shape[0] == x.shape[0]

    dim = mu.shape[0]
    diff = x - mu
    norm_const = 1 / (((2 * np.pi) ** (dim / 2)) * (np.linalg.det(cov) ** (1 / 2)))
    exponent = (-1 / 2) * diff.T.dot(np.linalg.inv(cov)).dot(diff)
    # `exponent` is a 1x1 array; pull the scalar out explicitly because implicit
    # float() conversion of arrays with ndim > 0 is deprecated in NumPy.
    return float(norm_const * np.exp(exponent.item()))

In [None]:
#Gaussian_kernel_and_window_function
def gaussian_window_function(cov, x, x_i):
    """Gaussian Parzen window: density of N(x_i, cov) evaluated at ``x``.

    Parameters
    ----------
    cov : 2-D array
        Square covariance matrix that sets the window width.
    x : array
        Query point.
    x_i : array
        Sample the window is centred on; same shape as ``x`` and with as many
        leading entries as ``cov`` has rows.

    Returns
    -------
    float
        Value returned by ``pdf_multivariate_gauss`` with mean ``x_i``.

    Raises
    ------
    AssertionError
        If the shape requirements above are not met.
    """
    assert x.shape == x_i.shape
    n_rows, n_cols = cov.shape[0], cov.shape[1]
    assert n_rows == n_cols
    assert x_i.shape[0] == n_rows
    return pdf_multivariate_gauss(x=x, mu=x_i, cov=cov)


#Estimation
def parzen_estimation_for_gaussian(x_samples, point_x, cov):
    """Parzen-window density estimate at ``point_x`` using a Gaussian window.

    Parameters
    ----------
    x_samples : sequence of arrays
        Training samples; each one is used as the centre of a Gaussian window.
    point_x : array
        Query point. It must have the same shape as each sample and satisfy
        the shape checks made by ``gaussian_window_function``.
    cov : 2-D array
        Covariance matrix of the Gaussian window.

    Returns
    -------
    float
        Mean of the window values over all samples; ``0.0`` when there are no
        samples.
    """
    n_samples = len(x_samples)
    if n_samples == 0:
        # No evidence for this class: report zero density instead of dividing by zero.
        return 0.0
    prob = 0.0
    for row in x_samples:
        # Centre the window on the current sample.
        prob += gaussian_window_function(cov, point_x, row)
    # The Gaussian window is already a normalised density, so the estimate is
    # simply the average over the samples.
    return prob / n_samples

In [None]:
#prior calculation
# Class priors: share of the training rows that belong to class 0 and class 1.
P = [
    float(len(class_rows)) / float(len(X_0) + len(X_1))
    for class_rows in (X_0, X_1)
]
P

In [None]:
#calculation of posterior
def posterior(X_train_0,X_train_1, data, P,h,d):
    """Unnormalised posterior of both classes for one query point.

    Parameters
    ----------
    X_train_0, X_train_1 : 2-D array
        Training samples of class 0 and class 1.
    data : array
        Query point.
    P : sequence
        Class priors; ``P[0]`` and ``P[1]`` are used.
    h : scalar
        Hypercube window width.
    d : int
        Exponent applied to ``h`` for the window volume.

    Returns
    -------
    numpy.ndarray
        Length-2 float64 array holding likelihood times prior for each class.
    """
    likelihood_0 = parzen_estimation_for_hyper_cube(X_train_0, data, h, d)
    likelihood_1 = parzen_estimation_for_hyper_cube(X_train_1, data, h, d)
    return np.array([likelihood_0 * P[0], likelihood_1 * P[1]], dtype=np.float64)

In [None]:
def prediction(data, X_train_0, X_train_1,P, h, Th):
    """Classify one query point with the hypercube Parzen-window Bayes rule.

    Parameters
    ----------
    data : array
        Query point.
    X_train_0, X_train_1 : 2-D array
        Training samples of class 0 and class 1.
    P : sequence
        Class priors.
    h : scalar
        Hypercube window width.
    Th : float
        Decision threshold on the normalised class-1 posterior.

    Returns
    -------
    int
        ``1`` if the normalised posterior of class 1 exceeds ``Th``, else ``0``.
    """
    # Take the dimensionality from the training data that was passed in rather
    # than from a notebook-level variable, so the function depends only on its
    # arguments.
    n_features = X_train_0.shape[1]
    Probabilities = posterior(X_train_0, X_train_1, data, P, h, n_features)
    total = Probabilities.sum()
    if total == 0:
        # No training sample of either class fell inside the window. Normalising
        # would give 0/0 = NaN, and NaN > Th is False, so the result is class 0;
        # return that directly and avoid the NaN warning.
        return 0
    Probabilities = Probabilities / total
    if Probabilities[1] > Th:
        return 1
    else:
        return 0

In [None]:
#computing accuracy on test data.
# Confusion matrix indexed as C[predicted label, actual label]; window width 1
# and decision threshold 0.5 are used for every test sample.
C = np.zeros((2, 2))
for actual_label, test_rows in enumerate((X_test_0, X_test_1)):
    for sample in test_rows:
        y_pred = prediction(sample, X_0, X_1, P, 1, 0.5)
        C[y_pred, actual_label] += 1

In [None]:
# Confusion matrix from the cell above, filled as C[predicted label, actual label].
C

In [None]:
# Accuracy: correctly classified samples (the diagonal of C) over all samples.
acc = np.trace(C) / C.sum()
print(acc)

In [None]:
# Negative predictive value: of the samples predicted as class 0 (row 0 of C),
# the share whose actual label is 0.
neg_predictivity = C[0, 0] / C[0, :].sum()
neg_predictivity

In [None]:
# Precision: of the samples predicted as class 1 (row 1 of C), the share whose
# actual label is 1.
precision = C[1, 1] / C[1, :].sum()
precision

In [None]:
# Recall (sensitivity, true-positive rate) for the positive class 1: TP / (TP + FN).
# C is indexed as C[predicted, actual], so the actual positives are column 1:
# C[1, 1] are the true positives and C[0, 1] the false negatives.
Recall = C[1, 1] / (C[1, 1] + C[0, 1])
Recall

In [None]:
# Specificity (true-negative rate) with class 1 as the positive class: TN / (TN + FP).
# C is indexed as C[predicted, actual], so the actual negatives are column 0:
# C[0, 0] are the true negatives and C[1, 0] the false positives.
Specificity = C[0, 0] / (C[0, 0] + C[1, 0])
Specificity

In [None]:
from tqdm import tqdm
Precision = []
Recall = []
ACC = []
F1_score = []
kk = range(1,300)  # not used in this cell; kept in case another cell reads it


def _safe_ratio(numerator, denominator):
    """Return numerator / denominator, or 0.0 when the denominator is zero."""
    return numerator / denominator if denominator else 0.0


# Sweep the window width h and record the test-set metrics for each value.
# Class 1 is the positive class and C is indexed as C[predicted, actual].
for h in tqdm(np.linspace(0.01,5,5000)):
    C = np.zeros((2,2))
    for i in range(X_test_0.shape[0]):
        y_pred = prediction(X_test_0[i], X_0, X_1, P, h, 0.5)
        C[y_pred, 0] += 1
    for i in range(X_test_1.shape[0]):
        y_pred = prediction(X_test_1[i], X_0, X_1, P, h, 0.5)
        # Must stay inside the loop so every class-1 test sample is counted,
        # not only the last one.
        C[y_pred, 1] += 1
    acc = _safe_ratio(C[0,0] + C[1,1], np.sum(C))
    # For very small h nothing is predicted positive; the ratios are then
    # reported as 0.0 instead of NaN.
    pr = _safe_ratio(C[1,1], C[1,1] + C[1,0])   # precision = TP / (TP + FP)
    rec = _safe_ratio(C[1,1], C[1,1] + C[0,1])  # recall    = TP / (TP + FN)
    f1 = _safe_ratio(2*(pr*rec), pr + rec)
    Precision.append(pr)
    Recall.append(rec)
    F1_score.append(f1)
    ACC.append(acc)
#plt.xlabel('False Positive Rate'); plt.ylabel('True Positive Rate'); plt.title('ROC curve');