# KNN Classifier from Scratch

* Built with pandas and numpy and compared to sklearn KNN classifier
* Using the Iris Dataset

In [1]:
import pandas as pd
import numpy as np
import math
import operator
from sklearn.model_selection import train_test_split
from sklearn.datasets import load_iris
import warnings
warnings.filterwarnings("ignore")

### Load and View Data

In [2]:
iris = load_iris()

In [3]:
# Convert sklearn dataset to pandas dataframe
data = []
def sklearn_df_converter(sklearn_dataset, data):
    data = pd.DataFrame(sklearn_dataset.data, columns=sklearn_dataset.feature_names)
    data['target'] = pd.Series(sklearn_dataset.target)
    return data

In [4]:
data = sklearn_df_converter(iris, data)
data.head()

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),target
0,5.1,3.5,1.4,0.2,0
1,4.9,3.0,1.4,0.2,0
2,4.7,3.2,1.3,0.2,0
3,4.6,3.1,1.5,0.2,0
4,5.0,3.6,1.4,0.2,0


### Data Processing

* Split x(sepal, petal length and width) from y(the species)
* Split data 85/15 train/test

In [5]:
x = data.drop(['target'], axis=1)
y = data['target']

In [6]:
# Split
x_train, x_test, y_train, y_test = train_test_split(x,y,train_size = 0.85, test_size = 0.15, random_state = 0)

In [7]:
# Training set for built from scratch model
x_train['target']=y_train
x_train.head()

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),target
8,4.4,2.9,1.4,0.2,0
126,6.2,2.8,4.8,1.8,2
22,4.6,3.6,1.0,0.2,0
44,5.1,3.8,1.9,0.4,0
97,6.2,2.9,4.3,1.3,1


In [8]:
# training set for Sklearn model
xcp_train = x_train.drop(['target'], axis=1)
xcp_train.head()

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm)
8,4.4,2.9,1.4,0.2
126,6.2,2.8,4.8,1.8
22,4.6,3.6,1.0,0.2
44,5.1,3.8,1.9,0.4
97,6.2,2.9,4.3,1.3


### Create KNN
* Define distance function as euclidean distance
* Define KNN
    * inputs are a training set (with categorical values, in our case x_train), a test value, and k-the number of nearby values to take into account
    

In [9]:
def euclidean_distance(p1, p2, length):
    distance = 0
    for x in range(length):
        distance += np.square(p1[x] - p2[x])
    return np.sqrt(distance)

In [10]:
def knn(training_data, test_value, k):
 
    distances = {}
    sort = {}
    length = len(test_value)-1
    neighbors = []
    
    # Calculate distance    
    for x in range(len(training_data)):

        dist = euclidean_distance(test_value, training_data.iloc[x], length)
        distances[x] = dist

    # Sorting based on distance
    sorted_d = sorted(distances.items(), key=operator.itemgetter(1))
 
    # Find nearest neighbors
    for x in range(k):
        neighbors.append(sorted_d[x][0])
    classes = {}
    
    # Find the class with the most votes
    for x in range(len(neighbors)):
        response = training_data.iloc[neighbors[x]][-1]
 
        if response in classes:
            classes[response] += 1
        else:
            classes[response] = 1

    # Sort votes, return most common class
    sorted_votes = sorted(classes.items(), key=operator.itemgetter(1), reverse=True)
    return(sorted_votes[0][0], neighbors)

### Run KNN with Test Set

In [11]:
# Setting number of neighbors
k = 7
#runs knn for each value in the test set, printing the prediction and its neighbors
species_predicted = []
for j in range(0, len(x_test)):
    result, neighbors = knn(x_train, x_test.iloc[j], k)
    species_predicted.append(result)
    print(result)
    print(neighbors)

2.0
[39, 114, 14, 53, 71, 10, 96]
1.0
[16, 79, 27, 119, 56, 105, 124]
0.0
[50, 95, 97, 78, 21, 13, 45]
2.0
[101, 81, 44, 60, 28, 75, 98]
0.0
[115, 6, 20, 102, 76, 82, 9]
2.0
[62, 103, 24, 67, 80, 7, 123]
0.0
[76, 82, 74, 102, 115, 84, 9]
1.0
[116, 51, 94, 35, 110, 86, 19]
1.0
[110, 116, 51, 19, 35, 108, 86]
1.0
[124, 105, 4, 15, 16, 104, 89]
2.0
[123, 11, 63, 47, 14, 32, 23]
1.0
[29, 86, 94, 89, 116, 43, 4]
2.0
[1, 43, 96, 10, 31, 109, 14]
1.0
[116, 86, 89, 1, 31, 94, 4]
2.0
[43, 96, 1, 10, 31, 118, 4]
0.0
[74, 76, 82, 121, 20, 99, 115]
1.0
[43, 4, 85, 96, 15, 1, 58]
1.0
[40, 85, 58, 42, 104, 8, 124]
0.0
[59, 70, 83, 122, 38, 18, 52]
0.0
[21, 45, 95, 99, 78, 120, 121]
2.0
[39, 114, 71, 53, 85, 14, 96]
1.0
[8, 85, 48, 58, 117, 40, 15]
0.0
[65, 6, 100, 88, 20, 38, 3]


### Fit Built in Sklearn KNN Classifier

In [12]:
#built in KNN classifier
from sklearn.neighbors import KNeighborsClassifier
neigh = KNeighborsClassifier(n_neighbors=7)
neigh.fit(xcp_train, y_train)

# Predicted classes for sklearn KNN
sklearn_predicts = neigh.predict(x_test)

### Compare Built In, Made from Scratch and Actual Classes

In [13]:
predicted = {
    'sklearn': sklearn_predicts, 
    'scratch': species_predicted,
    'actual': y_test
}

In [14]:
dff = pd.DataFrame(predicted)


In [15]:
# Values where sklearn results do not match the home built model
dff[dff['sklearn']!=dff['scratch']]

Unnamed: 0,sklearn,scratch,actual
73,1,2.0,1


In [16]:
# Values where the from scratch model does not match the actual class
dff[dff['scratch']!=dff['actual']]

Unnamed: 0,sklearn,scratch,actual
73,1,2.0,1
63,2,2.0,1


In [17]:
# Values where the sklearn results do not match the actual class
dff[dff['sklearn']!=dff['actual']]

Unnamed: 0,sklearn,scratch,actual
63,2,2.0,1


#### The scratch made KNN method preforms almost as well on the iris data set as the Sklearn model, making only two errors compared to the built in models single errors.