# KNN Classifier from Scratch

### KNN model using pandas and numpy, compared to sklearn KNN classifier
#### Using iris dataset

In [1]:
import pandas as pd
import numpy as np
import math
import operator
from sklearn.model_selection import train_test_split
import warnings
warnings.filterwarnings("ignore")


### Load and View Data

In [2]:
data = pd.read_csv('iris.csv')
data.head()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
0,5.1,3.5,1.4,0.2,setosa
1,4.9,3.0,1.4,0.2,setosa
2,4.7,3.2,1.3,0.2,setosa
3,4.6,3.1,1.5,0.2,setosa
4,5.0,3.6,1.4,0.2,setosa


### Process Data

* Split x(sepal, petal length and width) from species/y(the category)
* Split data into train(85%) and test(15%) for x and y


In [3]:
x = data.drop(['species'], axis=1)
y = data['species']

In [4]:
x_train, x_test, y_train, y_test = train_test_split(x,y,train_size = 0.85, test_size = 0.15, random_state = 0)

#save as xz_train due to need for 2 copies of training data
xz_train = x_train


In [5]:
x_train['species']=y_train
x_train.head()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
8,4.4,2.9,1.4,0.2,setosa
126,6.2,2.8,4.8,1.8,virginica
22,4.6,3.6,1.0,0.2,setosa
44,5.1,3.8,1.9,0.4,setosa
97,6.2,2.9,4.3,1.3,versicolor


In [6]:
xz_train = xz_train.drop(['species'], axis=1)

### Create KNN
* Define distance function as euclidean distance
* Define Knn
    * inputs are a training set (with categorical values, in our case x_train), a testInstance, and k the number of nearby values to take into account
    

In [7]:
def euclidean_distance(p1, p2, length):
    distance = 0
    for x in range(length):
        distance += np.square(p1[x] - p2[x])
    return np.sqrt(distance)

In [8]:
def knn(training_data, test_value, k):
 
    distances = {}
    sort = {}
 
    length = len(test_value)-1
    
    #### Start of STEP 3
    # Calculating euclidean distance between each row of training data and test data
        
    for x in range(len(training_data)):

        dist = euclidean_distance(test_value, training_data.iloc[x], length)

        distances[x] = dist

    # Sorting them on the basis of distance
    sorted_d = sorted(distances.items(), key=operator.itemgetter(1))
 
    neighbors = []
    
    # find nearest k neighbors
    for x in range(k):
        neighbors.append(sorted_d[x][0])
    votes = {}
    
    # Find the most common class among the neighbors
    for x in range(len(neighbors)):
        response = training_data.iloc[neighbors[x]][-1]
 
        if response in votes:
            votes[response] += 1
        else:
            votes[response] = 1

    #sort votes, return most voted for class and the index values for the neighbors
    sorted_votes = sorted(votes.items(), key=operator.itemgetter(1), reverse=True)
    return(sorted_votes[0][0], neighbors)

### Run KNN with Test Set

In [9]:
# Setting number of neighbors
k = 10
#runs knn for each value in the test set
species_predicted = []
for j in range(0,len(x_test)):
    result, neighbors = knn(x_train, x_test.iloc[j], k)
    species_predicted.append(result)
    print(result)
    print(neighbors)

virginica
[39, 114, 14, 53, 71, 10, 96, 32, 1, 118]
versicolor
[16, 79, 27, 119, 56, 105, 124, 30, 42, 112]
setosa
[50, 95, 97, 78, 21, 13, 45, 99, 64, 121]
virginica
[101, 81, 44, 60, 28, 75, 98, 80, 57, 61]
setosa
[115, 6, 20, 102, 76, 82, 9, 65, 74, 84]
virginica
[62, 103, 24, 67, 80, 7, 123, 113, 69, 23]
setosa
[76, 82, 74, 102, 115, 84, 9, 121, 20, 87]
versicolor
[116, 51, 94, 35, 110, 86, 19, 29, 66, 108]
versicolor
[110, 116, 51, 19, 35, 108, 86, 94, 31, 36]
versicolor
[124, 105, 4, 15, 16, 104, 89, 58, 48, 40]
virginica
[123, 11, 63, 47, 14, 32, 23, 39, 114, 7]
versicolor
[29, 86, 94, 89, 116, 43, 4, 37, 1, 96]
virginica
[1, 43, 96, 10, 31, 109, 14, 4, 32, 85]
versicolor
[116, 86, 89, 1, 31, 94, 4, 43, 109, 110]
versicolor
[43, 96, 1, 10, 31, 118, 4, 29, 53, 14]
setosa
[74, 76, 82, 121, 20, 99, 115, 6, 65, 102]
versicolor
[43, 4, 85, 96, 15, 1, 58, 10, 48, 118]
versicolor
[40, 85, 58, 42, 104, 8, 124, 48, 16, 79]
setosa
[59, 70, 83, 122, 38, 18, 52, 111, 126, 100]
setosa
[21, 4

### Fit Built in Sklearn KNN Classifier

In [10]:
#built in KNN classifier
from sklearn.neighbors import KNeighborsClassifier
neigh = KNeighborsClassifier(n_neighbors=10)
neigh.fit(xz_train, y_train)

# Predicted class
sklearn_predicts = neigh.predict(x_test)

### Compare Built In, Made from Scratch and Actual Classes

In [11]:
predicted = {
    'sklearn': sklearn_predicts, 
    'Scratch': species_predicted,
    'Actual': y_test
}

In [12]:
dff = pd.DataFrame(predicted)


In [13]:
dff[dff['sklearn']!=dff['Scratch']]

Unnamed: 0,sklearn,Scratch,Actual
73,versicolor,virginica,versicolor


In [14]:
dff[dff['Scratch']!=dff['Actual']]

Unnamed: 0,sklearn,Scratch,Actual
73,versicolor,virginica,versicolor


In [15]:
dff[dff['sklearn']!=dff['Actual']]

Unnamed: 0,sklearn,Scratch,Actual


#### The scratch made KNN method preforms almost as well on this data set as the Sklearn model, making only one error compared to the built in models zero errors. For a very simple program this does very well, and could certainly be improved further. 