# K-Nearest Neighbors Classifier from Scratch (Compared with scikit-learn)

## Imports

In [None]:
%%capture
!pip install category_encoders==2.*

In [None]:
import numpy as np
import pandas as pd
import category_encoders as ce
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score

## My Classifier Algorithm

In [None]:
class TE_NearestNeighbor:
    """Brute-force k-nearest-neighbors classifier using Euclidean distance.

    A query point is assigned the label that occurs most often among the
    ``n_neighbors`` training points closest to it. Ties between labels are
    resolved in favour of the smallest label (in sorted order).

    Parameters
    ----------
    n_neighbors : int, default 5
        Number of neighbors that vote on each prediction. Must be >= 1.
    X_train, y_train : array-like, optional
        Training samples and their labels. When BOTH are supplied the
        classifier is fitted immediately, exactly as if ``fit`` had been
        called. If only one is supplied it is ignored and ``fit`` must be
        called explicitly.

    Attributes
    ----------
    k : int
        The number of voting neighbors.
    X_train, y_train : numpy.ndarray or None
        Training data stored by ``fit``; ``None`` until the model is fitted.

    Raises
    ------
    ValueError
        If ``n_neighbors`` is not a positive integer.
    """

    def __init__(self, n_neighbors=5, X_train=None, y_train=None):
        if not isinstance(n_neighbors, (int, np.integer)) or n_neighbors < 1:
            raise ValueError(
                f"n_neighbors must be a positive integer, got {n_neighbors!r}"
            )
        self.k = n_neighbors
        # Define the attributes up front so an unfitted model can be detected.
        self.X_train = None
        self.y_train = None
        if X_train is not None and y_train is not None:
            self.fit(X_train, y_train)

    def _check_is_fitted(self):
        """Raise a descriptive error if ``fit`` has not stored training data."""
        if (getattr(self, "X_train", None) is None
                or getattr(self, "y_train", None) is None):
            # AttributeError is kept because that is what an unfitted instance
            # raised before (missing attribute), only now with a clear message.
            raise AttributeError(
                "This TE_NearestNeighbor instance is not fitted yet; "
                "call fit(X_train, y_train) before predicting."
            )

    def calc_distance(self, vec_a, vec_b):
        """Return the Euclidean (L2) distance between two vectors."""
        return np.linalg.norm(np.asarray(vec_a) - np.asarray(vec_b))

    def calc_labels(self, x):
        """Return the labels of the ``k`` training points nearest to ``x``.

        The result is a list ordered from nearest to farthest neighbor. If
        fewer than ``k`` training points exist, all of their labels are
        returned.
        """
        self._check_is_fitted()
        # Distances go through calc_distance so a subclass can swap the metric.
        distances = [self.calc_distance(x, row) for row in self.X_train]
        # Positions of the k smallest distances, nearest first.
        k_index = np.argsort(distances)[:self.k]
        return [self.y_train[idx] for idx in k_index]

    def calc_most_common_label(self, arr):
        """Return the label that appears most often in ``arr``.

        Works for any sortable label type (ints, strings, ...), not only
        the consecutive non-negative integers 0..n_classes-1. When several
        labels share the highest count, the smallest one is returned.

        Raises
        ------
        ValueError
            If ``arr`` is empty.
        """
        labels, counts = np.unique(np.asarray(arr), return_counts=True)
        if labels.size == 0:
            raise ValueError(
                "cannot determine the most common label of an empty sequence"
            )
        # np.unique returns labels sorted ascending and argmax returns the
        # first maximum, so ties resolve to the smallest label.
        return labels[np.argmax(counts)].item()

    def point_predict(self, x):
        """Predict the label of the single sample ``x``."""
        k_labels = self.calc_labels(x)
        return self.calc_most_common_label(k_labels)

    def fit(self, X_train, y_train):
        """Store the training data and return ``self``.

        Inputs are converted with ``np.asarray`` so lists, pandas
        DataFrames and pandas Series are handled by position: iterating a
        DataFrame directly would yield column names, and indexing a Series
        with ``[]`` is label-based.

        Raises
        ------
        ValueError
            If ``X_train`` and ``y_train`` differ in length or are empty.
        """
        X_arr = np.asarray(X_train)
        y_arr = np.asarray(y_train)
        if len(X_arr) != len(y_arr):
            raise ValueError(
                "X_train and y_train must contain the same number of samples, "
                f"got {len(X_arr)} and {len(y_arr)}"
            )
        if len(X_arr) == 0:
            raise ValueError("cannot fit on an empty training set")
        self.X_train = X_arr
        self.y_train = y_arr
        return self

    def predict(self, X_test):
        """Predict a label for every row of ``X_test``.

        Returns
        -------
        numpy.ndarray
            One predicted label per sample, in input order.

        Raises
        ------
        AttributeError
            If the classifier has not been fitted.
        """
        self._check_is_fitted()
        return np.array([self.point_predict(x) for x in np.asarray(X_test)])

## Data Wrangle


In [None]:
# Load the Iris dataset bundled with scikit-learn.
iris = datasets.load_iris()
# Wrap the feature matrix in a DataFrame (with the dataset's own feature
# names as column headers) to get a readable preview of the first rows.
df = pd.DataFrame(iris.data, columns=iris.feature_names)
df.head()

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm)
0,5.1,3.5,1.4,0.2
1,4.9,3.0,1.4,0.2
2,4.7,3.2,1.3,0.2
3,4.6,3.1,1.5,0.2
4,5.0,3.6,1.4,0.2


In [None]:
# Feature matrix and class targets taken straight from the loaded dataset.
X, y =  iris.data, iris.target
# Hold out a test set. No test_size is given, so train_test_split's default
# split proportion applies; random_state=20 fixes the shuffle so the same
# split is produced on every run.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=20)

In [None]:
# Sanity check: the split targets are a NumPy array, so indexing them by
# integer position is well-defined.
type(y_test)

numpy.ndarray

## TE_KNN Implementation

In [None]:
# Fit the from-scratch classifier with k=3 on the training split, then
# predict a label for every sample in the held-out test split.
knn = TE_NearestNeighbor(n_neighbors=3)
knn.fit(X_train, y_train)
predicted_labels = knn.predict(X_test)

In [None]:
# Score the from-scratch predictions against the true test labels.
my_accuracy = accuracy_score(y_test, predicted_labels)
print(my_accuracy)

0.9210526315789473


## SKL_KNN Implementation

In [None]:
# Reference model: scikit-learn's KNeighborsClassifier with the same k (3),
# trained and evaluated on the same split so the scores are comparable.
skl_knn = KNeighborsClassifier(n_neighbors=3)
skl_knn.fit(X_train, y_train)
skl_predictions = skl_knn.predict(X_test)

In [None]:
# Score the scikit-learn predictions against the same true test labels.
skl_accuracy = accuracy_score(y_test, skl_predictions)
print(skl_accuracy)

0.9210526315789473


## Comparison

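With k = 3 on this train/test split of the Iris dataset, my implementation achieves the same accuracy score (≈ 0.921) as scikit-learn's `KNeighborsClassifier`. Matching accuracy on a single split suggests, but does not prove, that the two implementations produce identical predictions for other values of k or other data.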