Implementation of the k-nearest-neighbour classifier, first by hand and then with the scikit-learn library

In [9]:
from collections import Counter
import numpy as np

def accuracy(true_labels, pred_labels):
    """Return the fraction of predictions that equal the true labels.

    Parameters
    ----------
    true_labels : array-like
        Ground-truth labels.
    pred_labels : array-like
        Predicted labels, same shape as ``true_labels``.

    Returns
    -------
    float
        Share of positions where both inputs agree, in ``[0, 1]``.
        ``nan`` when the inputs are empty.

    Raises
    ------
    ValueError
        If the two inputs do not have the same shape.
    """
    # Convert first: comparing two plain lists with `==` yields a single bool
    # instead of an element-wise comparison, which would give a wrong score.
    true_arr = np.asarray(true_labels)
    pred_arr = np.asarray(pred_labels)
    if true_arr.shape != pred_arr.shape:
        raise ValueError(
            f"shape mismatch: true_labels {true_arr.shape} vs pred_labels {pred_arr.shape}"
        )
    if true_arr.size == 0:
        # Accuracy of nothing is undefined; avoid the division-by-zero warning.
        return float("nan")
    return np.mean(true_arr == pred_arr)

def euclidean_distance(x1, x2):
    """Return the Euclidean (L2) distance between two points.

    Parameters
    ----------
    x1, x2 : array-like of numbers
        Coordinates of the two points; shapes must be broadcast-compatible.

    Returns
    -------
    numpy.float64
        Square root of the summed squared coordinate differences.
    """
    # Cast to float so that plain lists are accepted and unsigned-integer
    # arrays cannot wrap around during subtraction or squaring.
    a = np.asarray(x1, dtype=float)
    b = np.asarray(x2, dtype=float)
    return np.sqrt(np.sum((a - b) ** 2))


class KNN:
    """Minimal k-nearest-neighbour classifier using Euclidean distance.

    Training only stores the samples; prediction takes a majority vote among
    the ``k`` stored samples closest to each query point.

    Parameters
    ----------
    k : int, default 3
        Number of neighbours that take part in the vote. Must be >= 1.

    Raises
    ------
    ValueError
        If ``k`` is not a positive integer.
    """

    def __init__(self, k=3):
        # Reject invalid k here; otherwise k=0 only fails later inside
        # predict() with an opaque IndexError.
        if isinstance(k, bool) or not isinstance(k, (int, np.integer)) or k < 1:
            raise ValueError(f"k must be a positive integer, got {k!r}")
        self.k = int(k)

    def fit(self, data, labels):
        """Store the training samples and their labels.

        Parameters
        ----------
        data : array-like, shape (n_samples, n_features)
            Numeric feature matrix.
        labels : array-like, shape (n_samples,)
            One label per row of ``data``.

        Returns
        -------
        KNN
            The fitted instance, so calls can be chained.

        Raises
        ------
        ValueError
            If ``data`` is not a non-empty 2-D matrix or ``labels`` does not
            contain exactly one entry per row.
        """
        # Plain arrays give positional indexing; a pandas Series with a
        # non-default index would otherwise be looked up by label.
        features = np.asarray(data, dtype=float)
        targets = np.asarray(labels)
        if features.ndim != 2 or features.shape[0] == 0:
            raise ValueError(
                f"data must be a non-empty 2-D array, got shape {features.shape}"
            )
        if targets.ndim != 1 or targets.shape[0] != features.shape[0]:
            raise ValueError(
                f"labels must have shape ({features.shape[0]},), got {targets.shape}"
            )
        self.X_train = features
        self.y_train = targets
        return self

    def predict(self, data):
        """Predict a label for every row of ``data``.

        Parameters
        ----------
        data : array-like, shape (n_queries, n_features)
            Samples to classify.

        Returns
        -------
        numpy.ndarray, shape (n_queries,)
            Predicted labels (an empty array for empty input).

        Raises
        ------
        AttributeError
            If called before ``fit``.
        ValueError
            If ``data`` is not 2-D or its feature count differs from the
            training data.
        """
        if getattr(self, "X_train", None) is None:
            raise AttributeError("KNN instance is not fitted yet; call fit() first")
        # Converting up front also makes DataFrames iterate by row rather
        # than by column name.
        queries = np.asarray(data, dtype=float)
        if queries.size == 0:
            return np.array([])
        if queries.ndim != 2 or queries.shape[1] != self.X_train.shape[1]:
            raise ValueError(
                f"data must have shape (n, {self.X_train.shape[1]}), got {queries.shape}"
            )
        return np.array([self._predict(sample) for sample in queries])

    def _predict(self, data):
        """Return the majority label among the k nearest training samples."""
        sample = np.asarray(data, dtype=float)
        # Distance from the query to every training row in one vectorised step.
        distances = np.sqrt(np.sum((self.X_train - sample) ** 2, axis=1))
        # A stable sort keeps equally distant samples in training order, so
        # results are deterministic. Slicing caps k at the training-set size.
        k_idx = np.argsort(distances, kind="stable")[: self.k]
        # Labels are counted nearest-first; on a tied vote most_common
        # returns the label that was encountered first.
        votes = Counter(self.y_train[k_idx].tolist())
        return votes.most_common(1)[0][0]

Now we need to load a dataset from our extracted features.

In [10]:
import matplotlib.pyplot as plt 
import pandas as pd


# Read the extracted training features; the last column carries the class label
beer_df = pd.read_csv('/home/steve/Vorlesungen/Artificial_Intelligence_Burghart/KI-Projekt-RKIM22-23/analysis/feature_list_train.csv')

# Feature matrix: every column except the last, cast to float, as a NumPy array
data = np.asarray(beer_df.iloc[:, :-1].astype(float))

# Label vector: the last column only, as a NumPy array
label = np.asarray(beer_df.iloc[:, -1])



We split the data and label into test and training data in order to evaluate the accuracy of our classifier.


In [11]:
from sklearn.model_selection import train_test_split 
# Split features and labels with a fixed seed. The test fraction is tiny on
# purpose-looking grounds: per the printed shapes it leaves a single test sample.
# NOTE(review): data_test / label_test are overwritten by the next cell.
data_train, data_test, label_train, label_test = train_test_split(
    data,
    label,
    test_size=0.00001,
    random_state=42,
)

# Report the resulting shapes of both partitions
print('data_train: ', data_train.shape, 'label_train: ', label_train.shape)
print('data_test: ', data_test.shape, 'label_test: ', label_test.shape)
data_train.shape

data_train:  (1315, 7) label_train:  (1315,)
data_test:  (1, 7) label_test:  (1,)


(1315, 7)

Here the feature list of a separate test set is loaded into `data_test` and `label_test`, replacing the test split from the previous cell.

In [12]:
# Read the separately recorded test features; the last column carries the class label
test_df = pd.read_csv('/home/steve/Vorlesungen/Artificial_Intelligence_Burghart/KI-Projekt-RKIM22-23/analysis/feature_list_test2.csv')

# Feature matrix: every column except the last, cast to float, as a NumPy array
data_test = np.asarray(test_df.iloc[:, :-1].astype(float))

# Label vector: the last column only, as a NumPy array
label_test = np.asarray(test_df.iloc[:, -1])

print('data_test: ', data_test.shape, 'label_test: ', label_test.shape)


data_test:  (4, 7) label_test:  (4,)


The next step is to run the hand-written classifier on the test data for several values of k.

In [13]:
# Evaluate the hand-written classifier for every odd neighbour count k from 1 to 9
for k in range(1, 10, 2):
    KNNclf = KNN(k=k)
    KNNclf.fit(data_train, label_train)
    predictions = KNNclf.predict(data_test)
    print("KNN classification accuracy with",k,"neighbour", accuracy(label_test, predictions))

# `predictions` still holds the result of the last run (k=9);
# list it side by side with the true labels
for i, real_label in enumerate(label_test):
    print("Real:",real_label," --->Pred:",predictions[i])

KNN classification accuracy with 1 neighbour 0.25
KNN classification accuracy with 3 neighbour 0.25
KNN classification accuracy with 5 neighbour 0.25
KNN classification accuracy with 7 neighbour 0.25
KNN classification accuracy with 9 neighbour 0.25
Real: closed_seal_broken  --->Pred: open_broken
Real: broken  --->Pred: broken
Real: closed_sealed  --->Pred: closed_seal_broken
Real: open_broken  --->Pred: closed_seal_broken


Implementation of k-nearest neighbours with the scikit-learn library

In [14]:
from sklearn.neighbors import KNeighborsClassifier
# Build a 5-neighbour scikit-learn classifier and train it in one chained call.
# NOTE(review): this fits on the full `data`/`label`, whereas the hand-written
# KNN above was fitted on `data_train`/`label_train` — confirm this is intended.
neigh = KNeighborsClassifier(n_neighbors=5).fit(data, label)
# Display the fitted estimator as the cell output
neigh

In [15]:
from sklearn.metrics import confusion_matrix, classification_report

# Mean accuracy of the scikit-learn classifier on the test set
print("Score is:" ,neigh.score(data_test, label_test))

# Per-sample predictions, needed for the confusion matrix and the report
y_pred_test = neigh.predict(data_test)

print(confusion_matrix(label_test,y_pred_test))
# Classes that are never predicted have undefined precision. zero_division=0
# reports them as 0.0 (the value already shown) without emitting an
# UndefinedMetricWarning for each affected metric.
print(classification_report(label_test,y_pred_test, zero_division=0))

Score is: 0.25
[[1 0 0 0]
 [0 0 0 1]
 [0 1 0 0]
 [0 1 0 0]]
                    precision    recall  f1-score   support

            broken       1.00      1.00      1.00         1
closed_seal_broken       0.00      0.00      0.00         1
     closed_sealed       0.00      0.00      0.00         1
       open_broken       0.00      0.00      0.00         1

          accuracy                           0.25         4
         macro avg       0.25      0.25      0.25         4
      weighted avg       0.25      0.25      0.25         4



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
