# KNN Classifier from Scratch

* Built with pandas and numpy and compared to sklearn KNN classifier
* Using the Iris Dataset

In [1]:
import pandas as pd
import numpy as np
import math
import operator
from sklearn.model_selection import train_test_split
from sklearn.datasets import load_iris
import warnings
warnings.filterwarnings("ignore")

### Load and View Data

In [2]:
iris = load_iris()

In [3]:
# Convert sklearn dataset to pandas dataframe
data = []
def sklearn_to_df(sklearn_dataset, data):
    data = pd.DataFrame(sklearn_dataset.data, columns=sklearn_dataset.feature_names)
    data['target'] = pd.Series(sklearn_dataset.target)
    return data

In [4]:
data = sklearn_to_df(iris, data)
data.head()

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),target
0,5.1,3.5,1.4,0.2,0
1,4.9,3.0,1.4,0.2,0
2,4.7,3.2,1.3,0.2,0
3,4.6,3.1,1.5,0.2,0
4,5.0,3.6,1.4,0.2,0


### Data Processing

* Split x(sepal, petal length and width) from y(the species)
* Split data 85/15 train/test

In [5]:
x = data.drop(['target'], axis = 1)
y = data['target']

In [6]:
# Split data into train and test
x_train, x_test, y_train, y_test = train_test_split(x,y,train_size = 0.85, test_size = 0.15, random_state = 0)

In [7]:
# Training set for built from scratch model
x_train['target'] = y_train
x_train.head()

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),target
8,4.4,2.9,1.4,0.2,0
126,6.2,2.8,4.8,1.8,2
22,4.6,3.6,1.0,0.2,0
44,5.1,3.8,1.9,0.4,0
97,6.2,2.9,4.3,1.3,1


In [8]:
# training set for Sklearn model. Sklearn takes feature set and class set as seperate inputs, so the feature set, xcp
# cannot contain the classifications of the values
xsk_train = x_train.drop(['target'], axis = 1)
xsk_train.head()

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm)
8,4.4,2.9,1.4,0.2
126,6.2,2.8,4.8,1.8
22,4.6,3.6,1.0,0.2
44,5.1,3.8,1.9,0.4
97,6.2,2.9,4.3,1.3


### Create KNN
* Define distance function as euclidean distance
* Define KNN
    * inputs are a training set (with categorical values, in our case x_train), a test value, and k-the number of nearby values to take into account
    

In [9]:
def euclidean_distance(x1, x2, length):
    distance = 0
    for i in range(length):
        distance += np.square(x1[i] - x2[i])
    return np.sqrt(distance)

In [10]:
def knn(training_data, test_values, k, verbose = False):
    
    predicted_species = []
    neighbors_list = []
    
    # Run through all values in the test set
    for j in range(len(test_values)):
        distances = {}
        sort = {}
        length = len(test_values.iloc[j])-1
        neighbors = []
        
        # Calculate distances between test value and all training values
        for m in range(len(training_data)):
            dist = euclidean_distance(test_values.iloc[j], training_data.iloc[m], length)
            distances[m] = dist

        # Sort distances
        sorted_distance = sorted(distances.items(), key=operator.itemgetter(1))

        # Find nearest k neighbors
        for n in range(k):
            neighbors.append(sorted_distance[n][0])
        classes = {}

        # Tally the 'votes' or count of apperances of each class in the k nearest neighbors
        for p in range(len(neighbors)):
            response = training_data.iloc[neighbors[p]][-1]

            if response in classes:
                classes[response] += 1
            else:
                classes[response] = 1

        # Sort 'votes', store most common class (the prediction) and the list of neighboring values
        sorted_votes = sorted(classes.items(), key=operator.itemgetter(1), reverse=True)
        predicted_species.append(sorted_votes[0][0])
        neighbors_list.append(neighbors)
        
        # If specified by the function call, print the predicted value for each test option, as well as all the neighbors
        # that contributed to the decision
        if verbose == True:
            print(f'Predicted Category: {sorted_votes[0][0]}. Neighbors: {neighbors}')
            
    # Return the predicitons and the neighbors
    return predicted_species, neighbors_list

### Run KNN with Test Set

In [11]:
k = 7
predicted_species, neighbor_list = knn(x_train, x_test, k, verbose = True)

Predicted Category: 2.0. Neighbors: [39, 114, 14, 53, 71, 10, 96]
Predicted Category: 1.0. Neighbors: [16, 79, 27, 119, 56, 105, 124]
Predicted Category: 0.0. Neighbors: [50, 95, 97, 78, 21, 13, 45]
Predicted Category: 2.0. Neighbors: [101, 81, 44, 60, 28, 75, 98]
Predicted Category: 0.0. Neighbors: [115, 6, 20, 102, 76, 82, 9]
Predicted Category: 2.0. Neighbors: [62, 103, 24, 67, 80, 7, 123]
Predicted Category: 0.0. Neighbors: [76, 82, 74, 102, 115, 84, 9]
Predicted Category: 1.0. Neighbors: [116, 51, 94, 35, 110, 86, 19]
Predicted Category: 1.0. Neighbors: [110, 116, 51, 19, 35, 108, 86]
Predicted Category: 1.0. Neighbors: [124, 105, 4, 15, 16, 104, 89]
Predicted Category: 2.0. Neighbors: [123, 11, 63, 47, 14, 32, 23]
Predicted Category: 1.0. Neighbors: [29, 86, 94, 89, 116, 43, 4]
Predicted Category: 2.0. Neighbors: [1, 43, 96, 10, 31, 109, 14]
Predicted Category: 1.0. Neighbors: [116, 86, 89, 1, 31, 94, 4]
Predicted Category: 2.0. Neighbors: [43, 96, 1, 10, 31, 118, 4]
Predicted Ca

### Fit Built in Sklearn KNN Classifier

In [12]:
#Sklearn KNN classifier
from sklearn.neighbors import KNeighborsClassifier
neigh = KNeighborsClassifier(n_neighbors=7)
neigh.fit(xsk_train, y_train)

# Predicted classes for sklearn KNN
sklearn_predictions = neigh.predict(x_test)

### Compare Built In, Made from Scratch and Actual Classes

In [13]:
# Create dataframe for comparison of both models and actual values
predicted = {
    'sklearn': sklearn_predictions, 
    'scratch': predicted_species,
    'actual': y_test
}
df_comparison = pd.DataFrame(predicted)

In [14]:
# Values where sklearn results do not match the home built model
df_comparison[df_comparison['sklearn']!=df_comparison['scratch']]

Unnamed: 0,sklearn,scratch,actual
73,1,2.0,1


In [15]:
# Values where the from scratch model does not match the actual class
df_comparison[df_comparison['scratch']!=df_comparison['actual']]

Unnamed: 0,sklearn,scratch,actual
73,1,2.0,1
63,2,2.0,1


In [16]:
# Values where the sklearn results do not match the actual class
df_comparison[df_comparison['sklearn']!=df_comparison['actual']]

Unnamed: 0,sklearn,scratch,actual
63,2,2.0,1


#### The scratch made KNN method preforms almost as well on the iris data set as the Sklearn model, making only two errors compared to sklearn's models single error.