# Implementation of the k-nearest neighbors algorithm
---

## Importing the data

We are using the iris dataset, so let's import it and add labels to the columns.


In [1]:
import pandas as pd

# Supplying `names` makes pandas treat every line of the file as data and
# label the columns with the names given here.
dataset = pd.read_csv(
    'iris.csv',
    names=[
        'sepal length',
        'sepal width',
        'petal length',
        'petal width',
        'class',
    ],
)

## Splitting the data 

Even though k-NN classification doesn't require a training phase, it needs a labeled set of points, which is used to predict the class of unknown points.


In [2]:
RANDOM_SEED = 42  # default seed, so the shuffle (and therefore the split) is reproducible

def train_test_split(dataset, test_ratio=0.34, random_state=None):
    """Shuffle ``dataset`` and split it into training and test features/labels.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Table holding the feature columns plus a ``'class'`` label column.
    test_ratio : float, default 0.34
        Fraction of the rows (between 0 and 1) placed in the test split.
        The test split contains ``int(test_ratio * len(dataset))`` rows.
    random_state : optional
        Seed forwarded to ``DataFrame.sample``. When ``None`` (the default)
        the module-level ``RANDOM_SEED`` is used.

    Returns
    -------
    tuple
        ``(x_train, y_train, x_test, y_test)``: the ``x_*`` items are the
        feature columns (everything except ``'class'``) and the ``y_*`` items
        are the matching ``'class'`` columns. The original row labels are
        kept, so features and labels share the same index.

    Raises
    ------
    ValueError
        If ``test_ratio`` is not between 0 and 1.
    """
    # A ratio outside [0, 1] would otherwise silently produce an inverted or
    # empty split through negative / oversized slice bounds.
    if not 0 <= test_ratio <= 1:
        raise ValueError("test_ratio must be between 0 and 1, got {!r}".format(test_ratio))

    if random_state is None:
        random_state = RANDOM_SEED

    # shuffle dataframe rows
    shuffled_dataset = dataset.sample(frac=1, random_state=random_state)

    # compute number of elements in test split
    n_test = int(test_ratio * len(dataset))

    # split dataset by position: the first n_test shuffled rows are the test split
    test_rows = shuffled_dataset.iloc[:n_test]
    train_rows = shuffled_dataset.iloc[n_test:]

    x_train = train_rows.drop(columns='class')
    y_train = train_rows['class']
    x_test = test_rows.drop(columns='class')
    y_test = test_rows['class']

    return x_train, y_train, x_test, y_test

In [29]:
# Shuffle the iris rows and hold out the default share (test_ratio=0.34) as the test split.
x_train, y_train, x_test, y_test = train_test_split(dataset)

## Classification of unseen data - the slow way

In [4]:
import numpy as np
from collections import Counter

K = 5  # default number of nearest neighbours that vote on each prediction

def predict(x_train, y_train, x_test, k=None):
    """Classify every row of ``x_test`` with a naive k-nearest-neighbours vote.

    This is deliberately the straightforward (slow) version: each unknown
    sample is compared with each labelled sample in pure-Python loops.

    Parameters
    ----------
    x_train : pandas.DataFrame
        Feature rows of the labelled samples.
    y_train : pandas.Series
        Class of each labelled sample, looked up by the row label of
        ``x_train``.
    x_test : pandas.DataFrame
        Feature rows of the samples to classify.
    k : int, optional
        Number of neighbours taking part in the vote. When ``None`` (the
        default) the module-level ``K`` is used.

    Returns
    -------
    list
        One predicted class per row of ``x_test``, in row order. When several
        classes tie, the one met first among the nearest neighbours wins.

    Raises
    ------
    ValueError
        If ``k`` is smaller than 1 or ``x_train`` is empty.
    """
    if k is None:
        k = K
    if k < 1:
        raise ValueError("k must be at least 1, got {!r}".format(k))
    if len(x_train) == 0:
        raise ValueError("x_train must contain at least one labelled sample")

    y_pred = []

    for _, unknown_flower in x_test.iterrows():
        # Euclidean distance from the unknown flower to every known flower,
        # paired with the known flower's row label.
        distances = [
            (train_label, np.linalg.norm(unknown_flower - known_flower))
            for train_label, known_flower in x_train.iterrows()
        ]

        # Stable sort: equally distant flowers keep their training order.
        distances.sort(key=lambda pair: pair[1])

        # Tally the classes of the k closest flowers, nearest first.
        votes = Counter(y_train[train_label] for train_label, _dist in distances[:k])

        y_pred.append(votes.most_common(1)[0][0])

    return y_pred

In [26]:
%timeit y_pred = predict(x_train, y_train, x_test)

1.62 s ± 37.7 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


## Evaluate predictions

In [15]:
def evaluate(predections, ground_truth):
    """Return the share of predictions that equal their ground-truth label.

    The two sequences are walked pairwise; the number of matching pairs is
    divided by ``len(ground_truth)``.
    """
    hits = sum(1 for guess, actual in zip(predections, ground_truth) if guess == actual)
    total = len(ground_truth)
    return hits / total

In [16]:
# Accuracy on the held-out split: fraction of test flowers whose predicted class equals the true class.
score = evaluate(y_pred, y_test)

In [27]:
# Report the accuracy rounded to four decimal places.
print("Accuracy : {0:.4f}".format(score))

Accuracy : 0.9804


In [43]:
# Time a complete run of the stand-alone script knn_naive.py.
# NOTE(review): presumably knn_naive.py holds the same naive k-NN pipeline as this notebook; confirm.
%timeit %run knn_naive.py

1.56 s ± 18.2 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
