# KNN from scratch in python

## Description :

The KNN (K-Nearest Neighbors) algorithm is generally used for classification.
  

As the name suggests, it determines the class of a new example by holding a vote among its K closest neighbors and predicting the class held by the largest number of those neighbors.

The dataset used for our multiclass classification task is the Iris dataset.
It can be found at this link: https://archive.ics.uci.edu/ml/datasets/iris

In [1]:
#importing the required libraries
import pandas as pd
import numpy as np
import warnings 
warnings.filterwarnings('ignore')

In [2]:
#reading the data from 'iris.csv' (a relative path, so it is looked up in the current working directory) into a pandas dataframe
data = pd.read_csv('iris.csv')

In [3]:
#previewing the first rows of the dataframe to check the columns that were read
data.head()

Unnamed: 0,SepalLength,SepalWidth,PetalLength,PetalWidth,Name
0,5.1,3.5,1.4,0.2,Iris-setosa
1,4.9,3.0,1.4,0.2,Iris-setosa
2,4.7,3.2,1.3,0.2,Iris-setosa
3,4.6,3.1,1.5,0.2,Iris-setosa
4,5.0,3.6,1.4,0.2,Iris-setosa


In [4]:
#inserting a column of integer labels: 0 for 'Iris-setosa', 1 for 'Iris-versicolor', 2 for every other name
label_codes = {'Iris-setosa': 0, 'Iris-versicolor': 1}
data['label'] = [label_codes.get(name, 2) for name in data['Name']]

In [5]:
#function to calculate euclidean distance between two vectors
def dist_bet_points(x1,x2):
    """Return the Euclidean distance between two vectors of equal length.

    Parameters:
        x1, x2: sized, iterable collections of numbers (lists, tuples,
            1-D numpy arrays, ...).

    Returns:
        The square root of the sum of squared component-wise differences.

    Raises:
        ValueError: if the two vectors do not have the same length.
    """
    # Reject mismatched dimensions explicitly: silently ignoring the extra
    # components of the longer vector would give a wrong distance.
    if len(x1) != len(x2):
        raise ValueError(
            "vectors must have the same length, got %d and %d" % (len(x1), len(x2))
        )

    squared_sum = sum((a - b) ** 2 for a, b in zip(x1, x2))
    return squared_sum ** 0.5

In [6]:
#our model class n_neighbors is the K in KNN.
class KNN(object):
    """K-nearest-neighbours classifier: Euclidean distance plus majority vote.

    Attributes:
        n_neighbors: number of closest training points that vote (the K).
        train_data: 2-D collection of training feature vectors, one per row.
        train_label: labels indexable by row position, parallel to train_data.
    """

    def __init__(self,n_neighbors,train_data,train_label):
        self.n_neighbors = n_neighbors
        self.train_data = train_data
        self.train_label = train_label

    def predict(self,test_vec):
        """Return the label held by most of the K nearest training points.

        Parameters:
            test_vec: one feature vector with as many components as each
                row of the training data.

        Returns:
            The label with the most votes among the K nearest neighbours.
            Ties are resolved in favour of the smallest label.

        Raises:
            ValueError: if the training data is not 2-D, if test_vec does not
                match the training feature count, or if n_neighbors is not
                between 1 and the number of training points.
        """
        train = np.asarray(self.train_data, dtype=float)
        query = np.asarray(test_vec, dtype=float)

        if train.ndim != 2:
            raise ValueError("train_data must be 2-D (samples x features)")
        # Explicit shape check so numpy broadcasting cannot silently accept
        # a vector with the wrong number of features.
        if query.shape != (train.shape[1],):
            raise ValueError(
                "test_vec must have %d features, got shape %s"
                % (train.shape[1], query.shape)
            )
        k = self.n_neighbors
        if not 1 <= k <= len(train):
            raise ValueError(
                "n_neighbors must be between 1 and %d, got %r" % (len(train), k)
            )

        # Euclidean distance from the query to every training row at once.
        distances = np.sqrt(((train - query) ** 2).sum(axis=1))

        # Partitioning around position k-1 puts the k smallest distances in
        # the first k slots and stays valid when k equals the sample count.
        nearest = np.argpartition(distances, k - 1)[:k]

        # Tally the votes once, across all k neighbours.
        votes = {}
        for idx in nearest:
            label = self.train_label[idx]
            votes[label] = votes.get(label, 0) + 1

        top = max(votes.values())
        return min(label for label, count in votes.items() if count == top)

In [7]:
#selecting the features and the labels by column name, so that an extra leading column in the csv (such as 'Unnamed: 0') cannot shift which columns are used
train_data = data[['SepalLength','SepalWidth','PetalLength','PetalWidth']]
train_label = data['label']

In [8]:
#converting the features to a numpy array and the labels to a list
#(DataFrame.as_matrix() was removed in pandas 1.0; .values is available in both old and new versions)
train_data = train_data.values
train_label = list(train_label)

In [9]:
#building the classifier with K = 3 neighbours
model = KNN(n_neighbors=3, train_data=train_data, train_label=train_label)

In [10]:
#predicting the class of a single new feature vector
model.predict(test_vec=[7.2, 3, 5.6, 1.4])

2