Steps for creating a KNN model is as follows:

- 1. We need an optimal value for K to start with.

- 2. Calculate the distance of each data point in the test set with each point in the training set.

- 3. Sort the calculated distances along with the corresponding target values from training data in ascending order.

- 4. From these sorted values, select the K top values.

- 5. For Classification tasks, the Mode of data points corresponding to K top rows will be the predicted output and 
   for Regression tasks, Mean/ Median will be the predicted output.

In [1]:
import numpy as np
import pandas as pd

In [2]:
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split


In [3]:
#load data 
df =load_iris()
df

{'data': array([[5.1, 3.5, 1.4, 0.2],
        [4.9, 3. , 1.4, 0.2],
        [4.7, 3.2, 1.3, 0.2],
        [4.6, 3.1, 1.5, 0.2],
        [5. , 3.6, 1.4, 0.2],
        [5.4, 3.9, 1.7, 0.4],
        [4.6, 3.4, 1.4, 0.3],
        [5. , 3.4, 1.5, 0.2],
        [4.4, 2.9, 1.4, 0.2],
        [4.9, 3.1, 1.5, 0.1],
        [5.4, 3.7, 1.5, 0.2],
        [4.8, 3.4, 1.6, 0.2],
        [4.8, 3. , 1.4, 0.1],
        [4.3, 3. , 1.1, 0.1],
        [5.8, 4. , 1.2, 0.2],
        [5.7, 4.4, 1.5, 0.4],
        [5.4, 3.9, 1.3, 0.4],
        [5.1, 3.5, 1.4, 0.3],
        [5.7, 3.8, 1.7, 0.3],
        [5.1, 3.8, 1.5, 0.3],
        [5.4, 3.4, 1.7, 0.2],
        [5.1, 3.7, 1.5, 0.4],
        [4.6, 3.6, 1. , 0.2],
        [5.1, 3.3, 1.7, 0.5],
        [4.8, 3.4, 1.9, 0.2],
        [5. , 3. , 1.6, 0.2],
        [5. , 3.4, 1.6, 0.4],
        [5.2, 3.5, 1.5, 0.2],
        [5.2, 3.4, 1.4, 0.2],
        [4.7, 3.2, 1.6, 0.2],
        [4.8, 3.1, 1.6, 0.2],
        [5.4, 3.4, 1.5, 0.4],
        [5.2, 4.1, 1.5, 0.1],
  

In [4]:
X = df.data
X

array([[5.1, 3.5, 1.4, 0.2],
       [4.9, 3. , 1.4, 0.2],
       [4.7, 3.2, 1.3, 0.2],
       [4.6, 3.1, 1.5, 0.2],
       [5. , 3.6, 1.4, 0.2],
       [5.4, 3.9, 1.7, 0.4],
       [4.6, 3.4, 1.4, 0.3],
       [5. , 3.4, 1.5, 0.2],
       [4.4, 2.9, 1.4, 0.2],
       [4.9, 3.1, 1.5, 0.1],
       [5.4, 3.7, 1.5, 0.2],
       [4.8, 3.4, 1.6, 0.2],
       [4.8, 3. , 1.4, 0.1],
       [4.3, 3. , 1.1, 0.1],
       [5.8, 4. , 1.2, 0.2],
       [5.7, 4.4, 1.5, 0.4],
       [5.4, 3.9, 1.3, 0.4],
       [5.1, 3.5, 1.4, 0.3],
       [5.7, 3.8, 1.7, 0.3],
       [5.1, 3.8, 1.5, 0.3],
       [5.4, 3.4, 1.7, 0.2],
       [5.1, 3.7, 1.5, 0.4],
       [4.6, 3.6, 1. , 0.2],
       [5.1, 3.3, 1.7, 0.5],
       [4.8, 3.4, 1.9, 0.2],
       [5. , 3. , 1.6, 0.2],
       [5. , 3.4, 1.6, 0.4],
       [5.2, 3.5, 1.5, 0.2],
       [5.2, 3.4, 1.4, 0.2],
       [4.7, 3.2, 1.6, 0.2],
       [4.8, 3.1, 1.6, 0.2],
       [5.4, 3.4, 1.5, 0.4],
       [5.2, 4.1, 1.5, 0.1],
       [5.5, 4.2, 1.4, 0.2],
       [4.9, 3

In [5]:
y=df.target
#y

In [6]:
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.3)


In [7]:
print(X_train.shape,y_train.shape)
print(X_test.shape,y_test.shape)

(105, 4) (105,)
(45, 4) (45,)


In [8]:
class KNN:
    def __init__(self, k=3):
        self.k = k
        
    def fit(self,X,y):
        self.X_train = X
        self.y_train = y
        
    def euclidean_distance(self,x1,x2):
        return np.sqrt(np.sum((x1-x2)**2))
        
    def predict(self,X):
        
        #initialize predictions 
        predictions = []
        
        #Main loop iterating through len(X)
        for i in range(len(X)):
            
            #initialize euclidean distances 
            distances = []
            
            for ind,element in enumerate(self.X_train):
            #for each in self.X_train:
                # for every row in X_train, find eucl_distance to X using
                # euclidean_distance() and append to euclidian_distances list      
                distance=self.euclidean_distance(element,X[i])
                #distance = self.euclidean_distance(each,X[i])
                distances.append(distance)
                   
            # sort euclidian_distances in ascending order, and retain only k
            # neighbors as specified in n_neighbors (n_neighbors = k)  
            # Numpy argsort returns the indexes of the input in the order 
            # that would sort the array
            k_neighbors = np.array(distances).argsort()[:self.k]
            
            #print("neighbors",k_neighbors)
            
            #initialize dict to count class occurences of y_train
            count_neighbors = {}
            for n in k_neighbors:
                if self.y_train[n] in count_neighbors:
                    count_neighbors[self.y_train[n]] += 1
                else:
                    count_neighbors[self.y_train[n]] = 1
                    
            # max count labels to prediction       
           # print(count_neighbors)
            max_i = 0 
            for a in count_neighbors.keys():
                if count_neighbors[a] > max_i:
                    max_key = a
                    max_i = count_neighbors[a]
    
            predictions.append(max_key)
        
        return predictions   
            
            
            
                
        
        

In [9]:
knn =KNN(k=5)

In [10]:
knn.fit(X_train,y_train)

In [11]:
y_pred = knn.predict(X_test)
y_pred

[2,
 2,
 1,
 1,
 2,
 1,
 2,
 2,
 2,
 1,
 2,
 0,
 2,
 0,
 0,
 1,
 1,
 1,
 1,
 0,
 0,
 1,
 0,
 0,
 2,
 1,
 0,
 1,
 1,
 2,
 0,
 0,
 2,
 1,
 1,
 1,
 2,
 1,
 0,
 2,
 1,
 0,
 1,
 2,
 1]

In [12]:
y_test

array([2, 2, 1, 1, 2, 1, 2, 2, 2, 2, 2, 0, 2, 0, 0, 1, 1, 1, 1, 0, 0, 1,
       0, 0, 2, 1, 0, 1, 1, 2, 0, 0, 2, 1, 1, 1, 2, 1, 0, 2, 1, 0, 1, 2,
       1])

In [13]:
def accuracy(y,y_pred):
    return np.sum(y==y_pred)/len(y) 

In [14]:
acc = accuracy(y_test, y_pred)
acc

0.9777777777777777