# Implementation of k-nearest neighbour 
First implemented by 'hand' and by import from scikit learn

In [37]:
from collections import Counter
import numpy as np

def accuracy(true_labels, pred_labels):
    accuracy = np.sum(true_labels == pred_labels) / len(true_labels)
    return accuracy

def euclidean_distance(x1, x2):
    return np.sqrt(np.sum((x1 - x2)**2))


class KNN:
    #init function with takes amount of neighbours to calculate for, default is 3
    def __init__(self, k=3):
        self.k = k
    #save all data and labels to the attributes of the function
    def fit(self, data, labels):
        self.X_train = data
        self.y_train = labels

    def predict(self, data):
        label_pred = [self._predict(x) for x in data]
        return np.array(label_pred)

    def _predict(self, data):
        # Compute distances between x and all examples in the training set
        distances = [euclidean_distance(data, x_train) for x_train in self.X_train]
        # Sort by distance and return indices of the first k neighbors
        k_idx = np.argsort(distances)[: self.k]
        # Extract the labels of the k nearest neighbor training samples
        k_neighbor_labels = [self.y_train[i] for i in k_idx]
        # return the most common class label that is present among the k neighbours
        most_common = Counter(k_neighbor_labels).most_common(1)
        #return the first positioned label
        return most_common[0][0]

Now we need to load a dataset from our extracted features.

In [38]:
import matplotlib.pyplot as plt 
import pandas as pd
path = 'analysis/feature_list.csv'
path_test = 'analysis/feature_list_test.csv'
beer_df = pd.read_csv(path)

# Divide the dataset in data columns and one label column
#all but the last column
data = beer_df.iloc[:,:-1]

#convert all cells from str to float, neccessary because of str type labels
data = data.astype(float)

# take just the last column that contains labels
label = beer_df.iloc[:,-1]

#convert to np array
data = np.asarray(data)
label = np.asarray(label)



We split the data and label into test and training data in order to evaluate the accuracy of our classifier.


In [39]:
from sklearn.model_selection import train_test_split 
data_train, data_test, label_train, label_test = train_test_split(data,label, test_size=0.2,
                                                   random_state=42)

print('data_train: ',data_train.shape, 'label_train: ', label_train.shape)
print('data_test: ', data_test.shape, 'label_test: ', label_test.shape)
data_train.shape


data_train:  (1052, 8) label_train:  (1052,)
data_test:  (264, 8) label_test:  (264,)


(1052, 8)

Next Step is to call the function for classification and compare how different amount of neighbours performs.

In [40]:
#k being the amount of nearest neighbours to look for
for k in range(1,10):
     if k%2:
          KNNclf = KNN(k=k)
          KNNclf.fit(data_train, label_train)
          predictions = KNNclf.predict(data_test)
          print("KNN classification accuracy with",k,"neighbour", accuracy(label_test, predictions))


KNN classification accuracy with 1 neighbour 0.821969696969697
KNN classification accuracy with 3 neighbour 0.803030303030303
KNN classification accuracy with 5 neighbour 0.7992424242424242
KNN classification accuracy with 7 neighbour 0.7954545454545454
KNN classification accuracy with 9 neighbour 0.7840909090909091


In [42]:
from sklearn.metrics import confusion_matrix, classification_report
KNNclf = KNN(3)
KNNclf.fit(data_train, label_train)
predictions = KNNclf.predict(data_test)

print("Own KNN implementation score with",3,"neighbours is: ", accuracy(label_test, predictions))
y_pred_test = predictions 
print(confusion_matrix(label_test,y_pred_test)) 
print(classification_report(label_test,y_pred_test))

Own KNN implementation score with 3 neighbours is:  0.803030303030303
[[66  0  2  2]
 [ 0 43 18  3]
 [ 0 21 38  3]
 [ 0  2  1 65]]
                    precision    recall  f1-score   support

            broken       1.00      0.94      0.97        70
closed_seal_broken       0.65      0.67      0.66        64
     closed_sealed       0.64      0.61      0.63        62
       open_broken       0.89      0.96      0.92        68

          accuracy                           0.80       264
         macro avg       0.80      0.80      0.80       264
      weighted avg       0.80      0.80      0.80       264



Implementation of k-nearest Neighbour with sklearn library

In [43]:
from sklearn.neighbors import KNeighborsClassifier
neigh = KNeighborsClassifier(n_neighbors=3)
neigh.fit(data, label)

In [44]:
print("sklearn score is:" ,neigh.score(data_test, label_test))

y_pred_test = neigh.predict(data_test) 

print(confusion_matrix(label_test,y_pred_test)) 
print(classification_report(label_test,y_pred_test))

sklearn score is: 0.8939393939393939
[[68  0  1  1]
 [ 0 53  8  3]
 [ 0 13 49  0]
 [ 0  1  1 66]]
                    precision    recall  f1-score   support

            broken       1.00      0.97      0.99        70
closed_seal_broken       0.79      0.83      0.81        64
     closed_sealed       0.83      0.79      0.81        62
       open_broken       0.94      0.97      0.96        68

          accuracy                           0.89       264
         macro avg       0.89      0.89      0.89       264
      weighted avg       0.89      0.89      0.89       264



Here by uncommenting a feature list of a test can be inserted into data_test and label_test

In [33]:
test_df = pd.read_csv(path_test)

#Divide the dataset in data columns and one label column
# all but the last column
data_test = test_df.iloc[:,:-1]

#convert all cells from str to float, neccessary because of str type labels
data_test = data_test.astype(float)

# take just the last column that contains labels
label_test = test_df.iloc[:,-1]

#convert to np array
data_test = np.asarray(data_test)
label_test = np.asarray(label_test)
print('data_test: ', data_test.shape, 'label_test: ', label_test.shape)

data_test:  (4, 8) label_test:  (4,)


Prediction with sklearn

In [36]:
y_pred = neigh.predict(data_test)
y_pred_self = KNNclf.predict(data_test)
print('Sklearn prediced classes: ', y_pred)
print('Own prediced classes: ', y_pred_self)
print('Real classes',label_test)

Sklearn prediced classes:  ['broken' 'broken' 'closed_seal_broken' 'broken']
Own prediced classes:  ['broken' 'broken' 'closed_seal_broken' 'broken']
Real classes ['closed_seal_broken' 'broken' 'closed_sealed' 'open_broken']
