Implementation of k-nearest neighbour by hand and by import from scikit

In [1]:
from collections import Counter
import numpy as np

def accuracy(true_labels, pred_labels):
    accuracy = np.sum(true_labels == pred_labels) / len(true_labels)
    return accuracy

def euclidean_distance(x1, x2):
    return np.sqrt(np.sum((x1 - x2)**2))


class KNN:
    #init function with takes amount of neighbours to calculate for, default is 3
    def __init__(self, k=3):
        self.k = k
    #save all data and labels to the attributes of the function
    def fit(self, data, labels):
        self.X_train = data
        self.y_train = labels

    def predict(self, data):
        label_pred = [self._predict(x) for x in data]
        return np.array(label_pred)

    def _predict(self, data):
        # Compute distances between x and all examples in the training set
        distances = [euclidean_distance(data, x_train) for x_train in self.X_train]
        # Sort by distance and return indices of the first k neighbors
        k_idx = np.argsort(distances)[: self.k]
        # Extract the labels of the k nearest neighbor training samples
        k_neighbor_labels = [self.y_train[i] for i in k_idx]
        # return the most common class label that is present among the k neighbours
        most_common = Counter(k_neighbor_labels).most_common(1)
        #return the first positioned label
        return most_common[0][0]

Now we need to load a dataset from our extracted features.

In [22]:
import matplotlib.pyplot as plt 
import pandas as pd


beer_df = pd.read_csv('/home/steve/Vorlesungen/Artificial_Intelligence_Burghart/KI-Projekt-RKIM22-23/analysis/feature_list_hu120_255.csv')
#Divide the dataset in data columns and one label column
# all but the last column
data = beer_df.iloc[:,:-1]
#convert all cells from str to float, neccessary because of str type labels
data = data.astype(float)
# take just the last column that contains labels
label = beer_df.iloc[:,-1]
#convert to np array
data = np.asarray(data)
label = np.asarray(label)



We split the data and label into test and training data in order to evaluate the accuracy of our classifier.


In [23]:
from sklearn.model_selection import train_test_split 
data_train, data_test, label_train, label_test = train_test_split(data,label, test_size=0.2,
                                                   random_state=42)

print('data_train: ',data_train.shape, 'label_train: ', label_train.shape)
print('data_test: ', data_test.shape, 'label_test: ', label_test.shape)
data_train.shape
#print(data_train)

data_train:  (1052, 8) label_train:  (1052,)
data_test:  (264, 8) label_test:  (264,)


(1052, 8)

Here the feature list of a test can be inserted into data_test and label_test

In [24]:
# test_df = pd.read_csv('/home/steve/Vorlesungen/Artificial_Intelligence_Burghart/KI-Projekt-RKIM22-23/analysis/feature_list_test2.csv')
# #Divide the dataset in data columns and one label column
# # all but the last column
# data_test = test_df.iloc[:,:-1]
# #convert all cells from str to float, neccessary because of str type labels
# data_test = data_test.astype(float)
# # take just the last column that contains labels
# label_test = test_df.iloc[:,-1]
# #convert to np array
# data_test = np.asarray(data_test)
# label_test = np.asarray(label_test)
# print('data_test: ', data_test.shape, 'label_test: ', label_test.shape)


Next Step is to call the function for classification.

In [25]:
#k being the amount of nearest neighbours to look for
for k in range(1,10):
     if k%2:
          KNNclf = KNN(k=k)
          KNNclf.fit(data_train, label_train)
          predictions = KNNclf.predict(data_test)
          print("KNN classification accuracy with",k,"neighbour", accuracy(label_test, predictions))

# for i in range(len(label_test)):
#      print("Real:",label_test[i]," --->Pred:",predictions[i])

KNN classification accuracy with 1 neighbour 0.8484848484848485
KNN classification accuracy with 3 neighbour 0.8257575757575758
KNN classification accuracy with 5 neighbour 0.803030303030303
KNN classification accuracy with 7 neighbour 0.7651515151515151
KNN classification accuracy with 9 neighbour 0.7803030303030303


Implementation of k-nearest Neighbour with SKLearn library

In [17]:
from sklearn.neighbors import KNeighborsClassifier
neigh = KNeighborsClassifier(n_neighbors=5)
neigh.fit(data, label)

In [18]:
from sklearn.metrics import confusion_matrix, classification_report

print("Score is:" ,neigh.score(data_test, label_test))

y_pred_test = neigh.predict(data_test) 

print(confusion_matrix(label_test,y_pred_test)) 
print(classification_report(label_test,y_pred_test))

Score is: 0.8371212121212122
[[66  2  1  1]
 [ 1 45  9  9]
 [ 0 16 45  1]
 [ 1  1  1 65]]
                    precision    recall  f1-score   support

            broken       0.97      0.94      0.96        70
closed_seal_broken       0.70      0.70      0.70        64
     closed_sealed       0.80      0.73      0.76        62
       open_broken       0.86      0.96      0.90        68

          accuracy                           0.84       264
         macro avg       0.83      0.83      0.83       264
      weighted avg       0.84      0.84      0.84       264

