In [1]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error

In [2]:
def knn(X_train, X_test, y_train, k):
    """Classify each row of X_test by majority vote among its k nearest training rows.

    Distances are Euclidean and use every feature column, so the function
    works for any number of features as long as X_train and X_test agree.

    Parameters
    ----------
    X_train : array-like, shape (n_train, n_features)
        Training samples.
    X_test : array-like, shape (n_test, n_features)
        Samples to classify.
    y_train : array-like of non-negative ints, shape (n_train,)
        Class label of each training sample (np.bincount requires
        non-negative integers).
    k : int
        Number of neighbours that vote. Values larger than n_train simply
        use every training sample.

    Returns
    -------
    numpy.ndarray of float64, shape (n_test,)
        Predicted label for each test row. When several labels receive the
        same number of votes, the smallest label wins (argmax returns the
        first maximum).

    Raises
    ------
    ValueError
        If k < 1, if the training set is empty, or if the feature counts of
        X_train and X_test differ.
    """
    X_train = np.asarray(X_train, dtype=float)
    X_test = np.asarray(X_test, dtype=float)
    y_train = np.asarray(y_train)

    if k < 1:
        raise ValueError("k must be at least 1, got %r" % (k,))
    if len(X_train) == 0:
        raise ValueError("X_train must contain at least one sample")
    if X_train.ndim == 2 and X_test.ndim == 2 and X_train.shape[1] != X_test.shape[1]:
        raise ValueError(
            "X_train has %d features but X_test has %d"
            % (X_train.shape[1], X_test.shape[1])
        )

    # Preallocate as float64 so the result has the same dtype as an array
    # built up with np.append from np.empty(0).
    predictions = np.empty(len(X_test))
    for i, sample in enumerate(X_test):
        # Euclidean distance from this test row to every training row at once
        distances = np.sqrt(np.sum((X_train - sample) ** 2, axis=1))
        # Indexes of the k closest training rows; a stable sort makes ties
        # resolve deterministically in favour of the lower training index
        nearest = np.argsort(distances, kind="stable")[:k]
        # Most common label among the k nearest neighbours
        predictions[i] = np.bincount(y_train[nearest]).argmax()
    return predictions

In [3]:
# Load the iris dataset and unpack the feature matrix, class targets and class names
iris = load_iris()
X = iris.data
y = iris.target
labels = iris.target_names
type(X)

numpy.ndarray

In [4]:
# Verify that neither the features nor the targets contain any NaN value
np.logical_or(np.isnan(X).any(), np.isnan(y).any())

False

In [5]:
# Hold out a quarter of the samples for evaluating the algorithm;
# the fixed random_state keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=10,
)

In [6]:
# Predict the held-out labels using only the single nearest neighbour
predicted_values = knn(X_train, X_test, y_train, k=1)

In [7]:
# Report the mean absolute error between true and predicted labels as a percentage
mae_percent = mean_absolute_error(y_test, predicted_values) * 100
print("Mean absolute error is %0.2f%%" % mae_percent)

Mean absolute error is 2.63%


In [8]:
# Find the odd k that gives the lowest MAE, starting from k=1 as the baseline.
# NOTE(review): k is selected against the same X_test/y_test used for reporting,
# so the reported minimum is likely optimistic - consider a separate validation
# split or cross-validation for choosing k.
lowest_mean = mean_absolute_error(y_test, knn(X_train, X_test, y_train, 1))
k = 1
for candidate_k in range(3, len(X_train), 2):
    # MAE is never negative, so once it reaches 0 no later k can be strictly better
    if lowest_mean == 0:
        break
    # Score each candidate once and reuse the value instead of re-running knn
    candidate_mean = mean_absolute_error(y_test, knn(X_train, X_test, y_train, candidate_k))
    if candidate_mean < lowest_mean:
        lowest_mean = candidate_mean
        k = candidate_k

In [9]:
# Report the selected k and the corresponding MAE as a percentage
print("Best k is %d" % (k,))
print("Minimal mae is %.2f%%" % (lowest_mean * 100,))

Best k is 7
Minimal mae is 0.00%
