### This is a Python implementation of the k-Nearest-Neighbors (KNN) algorithm for a general value of k.

In [180]:
#importing libraries
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
import numpy as np
from math import sqrt
from statistics import mode

In [181]:
# Load the two datasets used below: Iris (bundled with scikit-learn) and Ionosphere (local text file).

# Iris: the feature matrix and the class labels come straight from the sklearn bunch.
iris = load_iris()
X_iris, y_iris = iris['data'], iris['target']

# Ionosphere: comma-separated file; names=True makes genfromtxt read the first line as
# column names, and dtype=None lets numpy infer a type for each column.
ionosphere_data = np.genfromtxt("ionosphere.txt", delimiter=',', names=True, dtype=None)
list_data = [list(record) for record in ionosphere_data]

# In every row the last field is the label and everything before it is a feature.
X_ino = np.array([row[:-1] for row in list_data])
y_ino = np.hstack([row[-1:] for row in list_data])  # np.hstack flattens the one-element label lists into a 1-D array

In [182]:
#defining a function to sort a list in ascending order
def list_sort(l):
    """Sort the list `l` in ascending order, in place, and return it.

    The built-in sort is used instead of a hand-written bubble sort: it is
    stable (equal elements keep their relative order) and runs in
    O(n log n) rather than O(n^2).

    Parameters
    ----------
    l : list
        Items that can be compared with each other. The list is modified.

    Returns
    -------
    list
        The same list object that was passed in, now sorted.
    """
    l.sort()
    return l

The next section defines the function `knn_prediction`, which implements the KNN algorithm for a general k.
`knn_prediction` takes 6 arguments: 2 are required, X (data) and y (labels), and 4 are optional (test_size, train_size, random_state, k), each of which
has a default value.

In [183]:
def _knn_classify(X_train, y_train, X_test, k, rng):
    """Predict a label for every row of X_test by majority vote of its k nearest training rows.

    Distances are Euclidean. Neighbours at equal distance are taken in
    training-row order, and a vote that ends level between several labels is
    settled by a uniform draw from `rng`. Returns a list holding one predicted
    label per test row.
    """
    predictions = []
    for point in X_test:
        # Distance from this test point to every training point at once.
        distances = np.sqrt(((X_train - point) ** 2).sum(axis=1))
        # A stable sort keeps equally distant neighbours in training-row order.
        nearest = np.argsort(distances, kind="stable")[:k]
        labels, votes = np.unique(y_train[nearest], return_counts=True)
        leaders = labels[votes == votes.max()]
        if len(leaders) == 1:
            predictions.append(leaders[0])
        else:
            # Tie between labels: every tied label is equally likely to be chosen.
            predictions.append(leaders[rng.randint(len(leaders))])
    return predictions


def knn_prediction(X, y, test_size= 0.25, train_size = 0.75, random_state = 94, k=3):
    """Split the data, classify the test part with k-nearest-neighbours and print the accuracy.

    The label predicted for a test point is the most common label among its k
    nearest training points (Euclidean distance). The test labels are used
    only afterwards, to score the predictions.

    Parameters
    ----------
    X : array-like, one row of numeric features per sample
    y : array-like, one label per sample
    test_size, train_size, random_state :
        Passed unchanged to train_test_split. An integer (or RandomState)
        random_state also drives the tie-breaking between labels, so a run
        with an integer seed is reproducible.
    k : int
        Number of neighbours that vote; must satisfy 1 <= k <= number of
        training samples.

    Returns
    -------
    None. A two-line summary (k, accuracy, error count, error rate) is printed.

    Raises
    ------
    ValueError
        If k is not an integer in the valid range.
    """
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, train_size=train_size, random_state = random_state)

    # Work on float arrays so the subtraction cannot overflow for integer inputs.
    X_train = np.asarray(X_train, dtype=float)
    X_test = np.asarray(X_test, dtype=float)
    y_train = np.asarray(y_train)
    y_test = np.asarray(y_test)

    if not isinstance(k, (int, np.integer)) or not 1 <= k <= len(X_train):
        raise ValueError(
            "k must be an integer between 1 and the number of training samples "
            "({}), got {!r}".format(len(X_train), k)
        )

    # Random generator used only to break ties between equally voted labels.
    if isinstance(random_state, np.random.RandomState):
        rng = random_state
    elif isinstance(random_state, (int, np.integer)):
        rng = np.random.RandomState(random_state)
    else:
        rng = np.random.RandomState()

    predictions = _knn_classify(X_train, y_train, X_test, k, rng)

    # pred[i] is True when the i-th test point was classified correctly.
    pred = [bool(p == t) for p, t in zip(predictions, y_test)]

    score = sum(pred)/len(y_test)
    num_error = pred.count(False)
    error = 1 - score
    print(' k  Accuracy  Errors    Error_rate\n',k, "  {:.4f}".format(score)," ", num_error, "in",len(pred) ,"  {:.4f}".format(error))

Making prediction for Iris data set for k=1

In [184]:
knn_prediction(X_iris,y_iris,k=1)

 k  Accuracy  Errors    Error_rate
 1   0.9474   2 in 38   0.0526


Making prediction for Iris data set for k=3

In [185]:
knn_prediction(X_iris,y_iris,k=3)

 k  Accuracy  Errors    Error_rate
 3   0.9737   1 in 38   0.0263


Making prediction for Ionosphere data set for k=1

In [186]:
knn_prediction(X_ino,y_ino,k=1)

 k  Accuracy  Errors    Error_rate
 1   0.8409   14 in 88   0.1591


Making prediction for Ionosphere data set for k=3

In [187]:
knn_prediction(X_ino,y_ino,k=3)

 k  Accuracy  Errors    Error_rate
 3   0.8182   16 in 88   0.1818
