In [354]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import math
import operator
import time

In [556]:
train_data = pd.read_csv('hw1_datasets/q2/diabetes_train_features.csv')
train_gts = pd.read_csv('hw1_datasets/q2/diabetes_train_labels.csv')
test_data = pd.read_csv('hw1_datasets/q2/diabetes_test_features.csv')
test_gts = pd.read_csv('hw1_datasets/q2/diabetes_test_labels.csv')

In [502]:
train_data.tail()

Unnamed: 0.1,Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age
609,609,5,139,64,35,140,28.6,0.411,26
610,610,1,96,122,0,0,22.4,0.207,27
611,611,10,101,86,37,0,45.6,1.136,38
612,612,0,141,0,0,0,42.4,0.205,29
613,613,0,125,96,0,0,22.5,0.262,21


In [449]:
train_data.head()

Unnamed: 0.1,Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age
0,0,2,84,0,0,0,0.0,0.304,21
1,1,9,112,82,24,0,28.2,1.282,50
2,2,1,139,46,19,83,28.7,0.654,22
3,3,0,161,50,0,0,21.9,0.254,65
4,4,6,134,80,37,370,46.2,0.238,46


In [450]:
train_gts.head()

Unnamed: 0.1,Unnamed: 0,Outcome
0,0,0
1,1,1
2,2,0
3,3,0
4,4,1


In [497]:
test_data.tail()

Unnamed: 0.1,Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,BMI,DiabetesPedigreeFunction,Age
149,149,9,165,88,0,30.4,0.302,49
150,150,1,77,56,30,33.3,1.251,24
151,151,8,95,72,0,36.8,0.485,57
152,152,2,146,70,38,28.0,0.337,29
153,153,8,74,70,40,35.3,0.705,39


In [452]:
test_gts.head()

Unnamed: 0.1,Unnamed: 0,Outcome
0,0,0
1,1,0
2,2,0
3,3,0
4,4,0


## Backward Elimination
We use this step to eliminate irrelevant features and try to calculate with less feature

In [557]:
train_data = train_data.drop(columns='Insulin')
test_data = test_data.drop(columns='Insulin')
train_data = train_data.drop(columns='Age')
test_data = test_data.drop(columns='Age')

In [558]:
train_data = train_data.drop(columns='Pregnancies')
test_data = test_data.drop(columns='Pregnancies')

# Distance Metric
## Euclidean Distance
We use euclidean distance metric in order to calculate difference between two datasets

Non-negativity, identity of indiscernibles, symmetry and triangle inequality are required aspects for distance metrics calculator.  
Euclidean distance gives more precise results in complex plane compared to manhattan and chebyshev

we could use manhattan distance, chess distance, mahalanobis distance

In [456]:
def euclidean_distance(sample1, sample2):
    assert len(sample1) == len(sample2), "Different number of features exist for the given samples"
    distance = 0
    for i in range(len(sample1)):
        distance += pow((sample1[i] - sample2[i]), 2)
    return math.sqrt(distance)

data1 = np.array([2,2,2,2])
data2 = np.array([4,4,4,4])
distance = euclidean_distance(data1, data2)
print('Distance: ' + repr(distance))

Distance: 4.0


## Feature Selection 
We select features in order to eliminate irrelevant features from our feature set, some features does not help us that much in our decision process, also more features means more time consuming

# Neighbors

### Collecting K most similar neighbour instances for a given unseen instance
We will calculate the distance for all instances and select a subset with the smallest distance values

In [457]:
def get_neighbors(train_X, train_Y, test_sample, k):
    distances = []
    neighbors = []
    
    for i in range(len(train_X)):
        distance = euclidean_distance(train_X.iloc[i], test_sample)
        distances.append((train_X.iloc[i], train_Y.iloc[i], distance))
    distances.sort(key=operator.itemgetter(2))
    for i in range(k):
        neighbors.append(distances[i])
    return neighbors

## Classification
Once we have located the most similar neighbors for a test instance, the next task is to devise a predicted response based on those neighbors
We can do this by allowing each neighbor to ote for their class attribute, and take the majority vote as the prediction
Below provides a function for getting the majority voted response from a number of neighbors. It assumes the class is the last attribute for each neighbor

In [458]:
def classify():
    results_df = []
    k = 9
    for i in range(len(test_data)):
        sum_one = 0
        sum_zero = 0
        neighbors = get_neighbors(train_data, train_gts, test_data.iloc[i], k)
        for i in range(len(neighbors)):
            prediction_1 = str(neighbors[i][1]).partition('Outcome')[2]
            prediction = int(prediction_1.partition('Name')[0]) 
            if(prediction == 1):
                sum_one += 1
            else:
                sum_zero += 1
        if(sum_zero > sum_one):
            results_df.append(0)
        else:
            results_df.append(1)
    return results_df

In [559]:
start_time = time.time()
results = classify()
print("--- %s seconds ---" % (time.time() - start_time))

--- 26.30210304260254 seconds ---


In [460]:
print(results)

[0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 1, 1, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0]


# Accuracy
We have all the pieces of the kNN algorithm in place. An important remaining concern is how to evaluate the accuracy of predictions.
An easy way to evaluate the accuracy of the model is to calculate a ratio of the total correct predictions out of all predictions made
called the classification accuracy.
Below is the calc_accuracy function that sums the total correct predictions and returns the accuracy as a percentage of correct classifications

In [561]:
def calc_accuracy(gt_y, pred_y):
    correct = 0
    for g_y, p_y in zip(gt_y, pred_y):
        if g_y == p_y:
            correct += 1
    return (correct/float(len(gt_y)) * 100)

start_time = time.time()
print(calc_accuracy(test_gts['Outcome'], results))
print("--- %s seconds ---" % (time.time() - start_time))

73.37662337662337
--- 0.000659942626953125 seconds ---


In [563]:
def confusion_matrix(gt_y, pred_y):
    # true positives
    tp = 0
    tn = 0
    fp = 0
    fn = 0
    for g_y, p_y in zip(gt_y, pred_y):
        if g_y == 0 and p_y == 0:
            tn += 1
        elif g_y == 1 and p_y == 1:
            tp += 1
        elif g_y == 1 and p_y == 0:
            fn += 1
        elif g_y == 0 and p_y == 1:
            fp += 1
    return tp,tn,fp,fn