# k-Nearest Neighbors (kNN) 

In [245]:
import numpy as np
import pandas as pd
from os import path

In [246]:
def load_data(filePath):
    """Read a CSV file and split it into features and target.

    The last column of the file is treated as the target; every column
    before it is treated as a feature.

    Parameters
    ----------
    filePath : str or file-like
        Anything accepted by ``pandas.read_csv``.

    Returns
    -------
    tuple
        ``(features, target, frame)`` where ``features`` is a DataFrame of
        all columns but the last, ``target`` is the last column, and
        ``frame`` is the complete DataFrame as read from the file.
    """
    frame = pd.read_csv(filePath)
    column_labels = frame.columns
    feature_labels = column_labels[:-1]
    target_label = column_labels[-1]
    return frame[feature_labels], frame[target_label], frame

### Load Regression Data

In [247]:
# Resolve the regression CSV against the current working directory, then load it.
# load_data returns (feature columns, last column as target, full DataFrame).
knn_regression_data = path.abspath("dataset/knn_regression.csv")
X_reg_train, y_reg_train, reg_dataset = load_data(knn_regression_data)

In [248]:
# Preview the first rows of the regression data as a quick sanity check.
reg_dataset.head()

Unnamed: 0,x1,x2,x3,y
0,5.1,3.5,1.4,0.2
1,4.9,3.0,1.4,0.2
2,4.7,3.2,1.3,0.2
3,4.6,3.1,1.5,0.2
4,5.0,3.6,1.4,0.2


### Load Classification Data

In [249]:
# Resolve the classification CSV against the current working directory, then load it.
# load_data returns (feature columns, last column as target, full DataFrame).
knn_classification = path.abspath("dataset/knn_classification.csv")
X_clf_train, y_clf_train, clf_dataset = load_data(knn_classification)

In [250]:
# Preview the first rows of the classification data as a quick sanity check.
clf_dataset.head()

Unnamed: 0,x1,x2,x3,x4,y
0,5.1,3.5,1.4,0.2,0
1,4.9,3.0,1.4,0.2,0
2,4.7,3.2,1.3,0.2,0
3,4.6,3.1,1.5,0.2,0
4,5.0,3.6,1.4,0.2,0


### Euclidean distance

In [251]:
def euclidean_distance(q, sample, length):
    """Euclidean distance between ``q`` and ``sample`` over their leading components.

    Only the first ``length - 1`` components are compared. Callers in this
    notebook pass ``length=len(query)`` with full dataset rows, so the final
    component (the target column) is left out of the distance.

    Parameters
    ----------
    q, sample : indexable sequences of numbers
        The two points to compare; both must have at least ``length - 1``
        components.
    length : int
        One more than the number of leading components to compare.

    Returns
    -------
    float
        Square root of the summed squared differences (``0.0`` when
        ``length - 1`` is not positive).
    """
    # Accumulate left to right, starting from 0, one component at a time.
    squared_total = sum(
        np.square(q[idx] - sample[idx]) for idx in range(length - 1)
    )
    return np.sqrt(squared_total)

### Get Neighbors

In [252]:
def get_neighbors(X_train, query, k):
    """Return the ``k`` rows of ``X_train`` closest to ``query``.

    Distances come from ``euclidean_distance(query, row, len(query))``, which
    ignores the final component of each row. Rows with equal distance keep
    their original relative order because the sort is stable.

    Parameters
    ----------
    X_train : iterable of rows
        Candidate rows to search.
    query : sequence
        The row to find neighbors for.
    k : int
        Number of neighbors to return. Asking for more rows than
        ``X_train`` contains raises ``IndexError``.

    Returns
    -------
    numpy.ndarray
        The ``k`` nearest rows, ordered nearest first.
    """
    n_components = len(query)
    scored_rows = [
        (row, euclidean_distance(query, row, n_components)) for row in X_train
    ]
    scored_rows.sort(key=lambda pair: pair[1])
    return np.array([scored_rows[rank][0] for rank in range(k)])

### Display Nearest Neighbors

In [253]:
def display_knn(neighbors):
    """Print a tab-separated table of the first three values of each neighbor.

    A fixed ``x1 x2 x3`` header is printed first, followed by one line per
    neighbor prefixed with its position in ``neighbors``. Any values past the
    third in a row are not shown.
    """
    print("x:\t x1\t x2\t x3")
    for position, row in enumerate(neighbors):
        print(f"{position}:\t {row[0]}\t{row[1]}\t{row[2]}\t")


## k-NN Regression

In [254]:
# Number of nearest neighbors to average over.
k = 10

# Hold out one row as the query and search the remaining rows for its neighbors.
query_index = 123
query = reg_dataset.values[query_index]
X_reg = np.delete(reg_dataset.values, obj=query_index, axis=0)
knn_reg = get_neighbors(X_reg, query, k)
# Column-wise mean of the neighbors; the last column is the target `y`,
# so its mean is the regression prediction.
pred_knn_reg = np.mean(knn_reg, 0)[-1]

print(f"Mean of k={k} Neighbors:\t{pred_knn_reg}")

Mean of k=10 Neighbors:	1.6099999999999999


In [255]:
# Show the feature values (x1..x3) of the neighbors used for the regression prediction.
print(f"k={k} Nearest Neighbors\n")
display_knn(knn_reg)

k=10 Nearest Neighbors

x:	 x1	 x2	 x3
0:	 6.2	2.8	4.8	
1:	 6.3	2.5	4.9	
2:	 6.3	2.8	5.1	
3:	 6.3	2.5	5.0	
4:	 6.1	2.8	4.7	
5:	 6.1	2.9	4.7	
6:	 6.0	2.7	5.1	
7:	 6.1	3.0	4.9	
8:	 6.5	2.8	4.6	
9:	 6.4	2.7	5.3	


## k-NN Classification

### Plurality vote 

In [256]:
def plurality_vote(all_neighbors):
    """Return the class label that occurs most often among the neighbors.

    Each neighbor is a row whose last element is its class label. Ties are
    broken in favor of the label encountered first, so when the neighbors are
    ordered nearest first the closest tied class wins.

    Parameters
    ----------
    all_neighbors : iterable of rows
        Rows whose last element is a hashable class label.

    Returns
    -------
    object
        The most frequent label (the first-seen one among tied labels).

    Raises
    ------
    ValueError
        If ``all_neighbors`` is empty.
    """
    # Tally labels in one pass; dict insertion order records first appearance.
    votes = {}
    for neighbor in all_neighbors:
        label = neighbor[-1]
        votes[label] = votes.get(label, 0) + 1

    if not votes:
        raise ValueError("plurality_vote() requires at least one neighbor")

    # max() returns the first maximal key, i.e. the earliest-seen label on ties.
    return max(votes, key=votes.get)

In [257]:
# Number of nearest neighbors that take part in the vote.
k = 10

# Hold out one row as the query and search the remaining rows for its neighbors.
query_index = 123
query = clf_dataset.values[query_index]
X_clf = np.delete(clf_dataset.values, obj=query_index, axis=0)
knn_clf = get_neighbors(X_clf, query, k)
# The predicted class is the most common label (last column) among the neighbors.
pred_knn_clf = plurality_vote(knn_clf)

print(f"Max of k={k} Neighbors:\t{pred_knn_clf}")

Max of k=10 Neighbors:	2.0


In [258]:
# Show the neighbors used for the vote. display_knn prints only the first
# three columns, so x4 and the class label are not shown here.
print(f"k={k} Nearest Neighbors\n")
display_knn(knn_clf)

k=10 Nearest Neighbors

x:	 x1	 x2	 x3
0:	 6.2	2.8	4.8	
1:	 6.3	2.5	5.0	
2:	 6.1	3.0	4.9	
3:	 6.3	2.5	4.9	
4:	 6.3	2.8	5.1	
5:	 6.0	2.7	5.1	
6:	 6.4	2.7	5.3	
7:	 6.0	3.0	4.8	
8:	 6.5	2.8	4.6	
9:	 6.5	3.0	5.2	
