In [None]:
# A simplified implementation of the KNN (K-Nearest Neighbors) algorithm. KNN can be used for
# both classification and regression problems. For simplicity, the Iris dataset is used to test the code.

import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

from sklearn import datasets
from sklearn.metrics import mean_absolute_error as MEA # I imported this but will try to not to use it.
Iris = datasets.load_iris()
print(type(Iris.data))
print(Iris.data.shape)

# Check the length of the data, shuffle it, and prepare the train and test sets.
# A 70-30 split is the usual choice, but this KNN gets close to 100 percent accuracy in that
# case, so the data is intentionally split roughly 66-33 (the first N_TRAIN shuffled samples
# are used for training, the remainder for testing) to show some errors for overfitting and
# underfitting.
# X is used for the data and y is used for labels.
length_of_data = len(Iris.data)
print(length_of_data)

# A fixed seed makes the shuffle -- and therefore the train/test split and every accuracy
# figure computed from it -- identical on every fresh run of the notebook.
SEED = 42
N_TRAIN = 100
rng = np.random.default_rng(SEED)
indices = rng.permutation(length_of_data)
X_train = Iris.data[indices[:N_TRAIN]]
y_train = Iris.target[indices[:N_TRAIN]]
X_test = Iris.data[indices[N_TRAIN:]]
y_test = Iris.target[indices[N_TRAIN:]]

# Sanity check: every sample must land in exactly one of the two sets.
assert len(X_train) + len(X_test) == length_of_data

# View the data.
print(X_train[:5])
print(y_train[:5])

# Try out the Euclidean norm to get a scalar distance between two instances.
print(np.linalg.norm(X_train[3] - X_test[1]))

In [None]:
def K_Nearest_Neighbor(test_set, test_labels, training_set, training_labels, n):
    """Score a brute-force k-nearest-neighbour classifier on a labelled test set.

    Each test point is assigned the label that occurs most often among its
    ``n`` closest training points (Euclidean distance). The predictions are
    then compared with ``test_labels`` to obtain the accuracy.

    Tie-breaking: training points at equal distance are taken in index order,
    and when several labels share the highest vote count the label that
    appears first in nearest-first order wins.

    Parameters
    ----------
    test_set : array-like
        Test points, one per entry along the first axis.
    test_labels : array-like
        True labels of the test points, one per test point.
    training_set : array-like
        Training points, one per entry along the first axis.
    training_labels : array-like
        Labels of the training points, one per training point. Each label is
        passed through ``int()`` when it becomes a prediction.
    n : int
        Number of neighbours that vote. Values larger than the number of
        training points are clamped, so no training point votes twice.

    Returns
    -------
    numpy.float64
        Accuracy in percent, rounded to two decimals.

    Raises
    ------
    ValueError
        If ``n`` is smaller than 1, if either point set is empty, or if a
        label sequence does not have exactly one label per point.
    """
    test_points = np.asarray(test_set)
    train_points = np.asarray(training_set)
    # Arrays (not lists) so that the final `==` is always element-wise.
    true_labels = np.asarray(test_labels).ravel()
    train_labels = np.asarray(training_labels).ravel()

    if len(test_points) == 0 or len(train_points) == 0:
        raise ValueError("test_set and training_set must not be empty")
    if len(true_labels) != len(test_points) or len(train_labels) != len(train_points):
        raise ValueError("each label sequence must contain exactly one label per point")
    if n < 1:
        raise ValueError("n must be at least 1")

    # One flat feature vector per point, so the row-wise norm below works for
    # scalar, vector, and multi-dimensional samples alike.
    train_points = train_points.reshape(len(train_points), -1)
    test_points = test_points.reshape(len(test_points), -1)
    neighbour_count = min(n, len(train_points))

    results = []
    for point in test_points:
        # Compute every distance once; a stable sort keeps index order among
        # equidistant training points.
        distances = np.linalg.norm(train_points - point, axis=1)
        nearest = np.argsort(distances, kind="stable")[:neighbour_count]

        # Count the votes; dict insertion order records which label was seen first.
        tally = {}
        for label in train_labels[nearest].tolist():
            tally[label] = tally.get(label, 0) + 1

        # max() returns the first key with the highest count, i.e. the label
        # seen earliest among tied labels.
        results.append(int(max(tally, key=tally.get)))

    # Check the accuracy by comparing the predictions with the actual test labels.
    accuracy = np.sum(np.asarray(results) == true_labels) / len(true_labels)
    return np.around(100 * accuracy, decimals=2)
    

# Evaluate the classifier for every k from 1 to 29 and collect the accuracy (in percent).
k_values = list(range(1, 30))
all_results = []
for k in k_values:
    each_result = K_Nearest_Neighbor(X_test, y_test, X_train, y_train, k)
    all_results.append(each_result)
print(all_results, len(all_results))

# Plot accuracy against the actual k. Using np.arange(len(all_results)) on the x-axis
# would start at 0 and shift the whole curve one position to the left of the real k.
fig, ax = plt.subplots()
ax.plot(k_values, np.array(all_results))
ax.set_xlabel("k (number of neighbors)")
ax.set_ylabel("accuracy on the test set (%)")
ax.set_title("KNN accuracy vs. k")
plt.show()

# Conclusion: KNN with k = 8-10 gives the best accuracy for this training and test set.
# It is slightly hard to see the overfitting in this plot.
# NOTE(review): the 8-10 range was reported by the author for their own run; re-check it
# against the plot above, whose x-axis is the actual k value.