# K-Nearest Neighbors

In [65]:
import numpy as np
import pandas as pd
import sklearn.metrics as metrics
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from scipy.stats import zscore
from sklearn.neighbors import KernelDensity
from sklearn.neighbors import KNeighborsClassifier

In [66]:
diabetes = pd.read_csv('./Data/pima-indians-diabetes-cleaned.csv')

In [67]:
feature_cols = ['times_pregnant', 'glucose_concentration', 'blood_pressure', 'Triceps_skinfold_thickness',
              '2_Hour_serum_insulin', 'BMI', 'diabetes_pedigree_function', 'Age']
outcome_col = 'Class_Variable'

In [68]:
X = diabetes[feature_cols].to_numpy()
y = diabetes[outcome_col].to_numpy()

In [69]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [70]:
# Standardize the training data
means = X_train.mean(axis=0)
stds = X_train.std(axis=0)
X_train = zscore(X_train, axis=0, ddof=0, nan_policy='propagate')

In [71]:
# Standardize the test data according to the training data
for i in range(len(X_test)):
    for j in range(len(X_test[i])):
        X_test[i][j] = (X_test[i][j] - means[j]) / stds[j] 

In [72]:
# Define Cost Values
loss_11 = 100000 # loss associated with correctly assigning class 1
loss_10 = 7500000 # loss associated with assigning class 0 when we should assign class 1
loss_01 = 100000 # loss associated with assigning class 1 when we should assign class 0
loss_00 = 0 # loss associated with correctly assigning class 0

In [120]:
n_neighbors = 23

### K-Nearest Neighbors with our prediction threshold- no reweighting

In [121]:
class KNearestNeighborsBinaryMod(KNeighborsClassifier):
    def __init__(self, loss_11, loss_10, loss_01, loss_00, **kwds):
        super().__init__(**kwds)
        
        # comes from equation 8 in the writeup
        self.loss_11 = loss_11
        self.loss_10 = loss_10
        self.loss_01 = loss_01
        self.loss_00 = loss_00
        self.threshold = (loss_01 - loss_00) / ((loss_01 - loss_00) + (loss_10 - loss_11))
        print(f"self.threshold: {self.threshold}")
    
    # override the predict function to predict according to our threshold
    def predict(self, X):
        probs = self.predict_proba(X)
        predictions = [1 if probs[i][1] >= self.threshold else 0 for i in range(len(probs))]
        return predictions
    

    # write a score function that will calculate the score of classifications given our costs
    def score_with_costs(self, X, y, sample_weight=None):
        y_pred = self.predict(X)
        
        give_loss_vectorized = np.vectorize(self.give_loss)
        losses = give_loss_vectorized(y, y_pred)
        
        return np.average(losses, weights=sample_weight)
        
        
    def give_loss(self, y, y_pred):
        if y == 1 and y_pred == 1:
            return self.loss_11
        elif y == 1 and y_pred == 0:
            return self.loss_10
        elif y == 0 and y_pred == 1:
            return self.loss_01
        elif y == 0 and y_pred == 0:
            return self.loss_00
        else:
            raise Exception(f"Expected y: {y} and y_pred: {y_pred} to equal to 0 or 1")

In [122]:
knn_cost_sensitive = KNearestNeighborsBinaryMod(loss_11, loss_10, loss_01, loss_00, weights="distance", n_neighbors=n_neighbors)
knn_cost_sensitive.fit(X_train, y_train)
knn_cost_sensitive_pred = knn_cost_sensitive.predict(X_test)


self.threshold: 0.013333333333333334


In [123]:
# Confusion Matrix (TN, FP, FN, TP) for cost-sensitive
cm = metrics.confusion_matrix(y_test, knn_cost_sensitive_pred).ravel()
print(f"confusion matrix: {cm}")
print(f"precision: {cm[3] / (cm[3] + cm[1])}")
print(f"recall: {cm[3] / (cm[3] + cm[2])}")

confusion matrix: [16 81  1 56]
precision: 0.40875912408759124
recall: 0.9824561403508771


In [124]:
knn_cost_sensitive.score_with_costs(X_test, y_test)

137662.33766233767

In [125]:
knn_cost_sensitive.score(X_test, y_test)

0.4675324675324675

### K-Nearest Neighbors with standard prediction threshold of 0.5- no reweighting

In [126]:
class KNearestNeighborsJustScore(KNeighborsClassifier):
    def __init__(self, loss_11, loss_10, loss_01, loss_00, **kwds):
        super().__init__(**kwds)
        
        # comes from equation 8 in the writeup
        self.loss_11 = loss_11
        self.loss_10 = loss_10
        self.loss_01 = loss_01
        self.loss_00 = loss_00

        
    # write a score function that will calculate the score of classifications given our costs
    def score_with_costs(self, X, y, sample_weight=None):
        y_pred = self.predict(X)
        
        give_loss_vectorized = np.vectorize(self.give_loss)
        losses = give_loss_vectorized(y, y_pred)
        
        return np.average(losses, weights=sample_weight)
        
        
    def give_loss(self, y, y_pred):
        if y == 1 and y_pred == 1:
            return self.loss_11
        elif y == 1 and y_pred == 0:
            return self.loss_10
        elif y == 0 and y_pred == 1:
            return self.loss_01
        elif y == 0 and y_pred == 0:
            return self.loss_00
        else:
            raise Exception(f"Expected y: {y} and y_pred: {y_pred} to equal to 0 or 1")

In [139]:
knn_noncost_sensitive = KNearestNeighborsJustScore(loss_11, loss_10, loss_01, loss_00, weights="distance", n_neighbors=n_neighbors)
knn_noncost_sensitive.fit(X_train, y_train)
knn_noncost_sensitive_pred = knn_noncost_sensitive.predict(X_test)


In [140]:
# Confusion Matrix (TN, FP, FN, TP) for non cost-sensitive
cm = metrics.confusion_matrix(y_test, knn_noncost_sensitive_pred).ravel()
print(f"confusion matrix: {cm}")
print(f"precision: {cm[3] / (cm[3] + cm[1])}")
print(f"recall: {cm[3] / (cm[3] + cm[2])}")

confusion matrix: [84 13 32 25]
precision: 0.6578947368421053
recall: 0.43859649122807015


In [141]:
knn_noncost_sensitive.score_with_costs(X_test, y_test)

1583116.8831168832

In [142]:
knn_noncost_sensitive.score(X_test, y_test)

0.7077922077922078

### K-Nearest Neighbors with standard prediction threshold of 0.5- reweighting (according to Elkan)

In [138]:
w = 0.013 / (1-0.013)
def weights(ys):
    ws = []
    print(ys)
    for y in ys:
        if y == 0:
            ws.append(w)
        else:
            ws.append(1)
    return ws
knn_noncost_sensitive_reweight = KNearestNeighborsJustScore(loss_11, loss_10, loss_01, loss_00, weights=weights, n_neighbors=n_neighbors)
knn_noncost_sensitive_reweight.fit(X_train, y_train)
knn_noncost_sensitive_reweight_pred = knn_noncost_sensitive_reweight.predict(X_test)


[[1.09415058 1.15666594 1.18415828 ... 2.00675788 2.01454582 2.02602787]
 [0.91383176 1.10189594 1.14307455 ... 1.84216929 1.86037359 1.86674163]
 [1.10437252 1.22049204 1.30522451 ... 1.63486582 1.66674378 1.67617609]
 ...
 [1.20879598 1.27828311 1.37640333 ... 1.87406582 1.88232718 1.88395121]
 [0.9128946  0.9683628  1.01558017 ... 1.64204109 1.67223678 1.69535962]
 [0.66469323 0.94599307 1.154822   ... 1.64052076 1.64219412 1.64747216]]


ValueError: The truth value of an array with more than one element is ambiguous. Use a.any() or a.all()

In [132]:
# Confusion Matrix (TN, FP, FN, TP) for non cost-sensitive
cm = metrics.confusion_matrix(y_test, knn_noncost_sensitive_reweight_pred).ravel()
print(f"confusion matrix: {cm}")
print(f"precision: {cm[3] / (cm[3] + cm[1])}")
print(f"recall: {cm[3] / (cm[3] + cm[2])}")

confusion matrix: [16 81  1 56]
precision: 0.40875912408759124
recall: 0.9824561403508771


In [133]:
knn_noncost_sensitive_reweight.score_with_costs(X_test, y_test)

ValueError: The truth value of an array with more than one element is ambiguous. Use a.any() or a.all()

In [134]:
knn_noncost_sensitive_reweight.score(X_test, y_test)

ValueError: The truth value of an array with more than one element is ambiguous. Use a.any() or a.all()