# Exercise 20

## a)
The k-NN algorithm only works if all attributes have a similar magnitude, since comparing distances
is only effective the attributes are in the same order of magnitude.

## b)
The k-NN is called a "lazy learner", because it only memorizes the training data. There is no learning
of a discriminative function, so the algorithm will inherently be slower in the application phase.
However there isn't any learning phase and time is saved here. Generally speaking, these "lazy learners"
will be slower than smarter algorithms, e.g. random forest.

## c)

In [1]:
import numpy as np
from collections import Counter
from scipy.spatial import distance

class KNN:
    '''KNN Classifier.

    Attributes
    ----------
    k : int
        Number of neighbors to consider.
    '''
    def __init__(self, k):
        '''Initialization.
        Parameters are stored as member variables/attributes.
        
        Parameters
        ----------
        k : int
            Number of neighbors to consider.
        '''
        self.k = k

    def fit(self, X, y):
        '''Fit routine.
        Training data is stored within object.
        
        Parameters
        ----------
        X : numpy.array, shape=(n_samples, n_attributes)
            Training data.
        y : numpy.array shape=(n_samples)
            Training labels.
        '''
        self.X_train = X
        self.y_train = y

    def predict(self, X):
        '''Prediction routine.
        Predict class association of each sample of X.
        
        Parameters
        ----------
        X : numpy.array, shape=(n_samples, n_attributes)
            Data to classify.
        
        Returns
        -------
        prediction : numpy.array, shape=(n_samples)
            Predictions, containing the predicted label of each sample.
        '''
        prediction = []
        for sample in X:
            distances=np.zeros(len(self.X_train))
            for i in range(len(distances)):
                #distances[i] = np.linalg.norm(sample-self.X_train[i])
                distances[i] = distance.euclidean(sample,self.X_train[i])
            indices = np.argsort(distances)
            keys = []
            for k in range(self.k):
                keys.append(self.y_train[indices[k]])
            occurence_count = Counter(keys)
            label = occurence_count.most_common(1)[0][0]
            prediction.append(label)
        return prediction

## d)

In [2]:
import pandas as pd

signal = pd.read_hdf('NeutrinoMC.hdf5', key='Signal')
background = pd.read_hdf('NeutrinoMC.hdf5', key='Background')

In [3]:
signals = signal[["NumberOfHits", "x", "y"]][signal["x"].notna() * signal["y"].notna() * signal["NumberOfHits"].notna()]
background = background[["NumberOfHits", "x", "y"]]
events = pd.concat([signals, background])
labels = ["Signal"]*len(signals)
labels.extend(["Background"]*len(background))

X_train = np.array(pd.concat([events[:2500], events[30000:32500]]))
y_train = labels[:2500]
y_train.extend(labels[30000:32500])
y_train = np.array(y_train)

X_test = np.array(pd.concat([events[:10000], events[40000:60000]]))
y_test = labels[:10000]
y_test.extend(labels[40000:60000])
y_test = np.array(y_test)

classification_10 = KNN(k = 10)
classification_10.fit(X_train, y_train)
result = classification_10.predict(X_test)

Calculating recall, precision and significance:

In [4]:
tp = len(y_test[(y_test==result)*(y_test=='Signal')])
fp = len(y_test[(y_test!=result)*(y_test=='Signal')])
tn = len(y_test[(y_test==result)*(y_test=='Background')])
fn = len(y_test[(y_test!=result)*(y_test=='Background')])

print(f'Precision: {tp/(tp+fp)}')
print(f'Recall: {tp/(tp+fn)}')
print(f'Siginificance: {(tp+tn)/(tp+tn+fn+fp)}')

Precision: 0.9609
Recall: 0.8287907538381921
Siginificance: 0.9208


## e)

In [5]:
X_train_log10 = X_train
X_train_log10[:,0] = np.log10(X_train_log10[:,0])

X_test_log10 = X_test
X_test_log10[:,0] = np.log10(X_test_log10[:,0])

classification_log10 = KNN(k = 10)
classification_log10.fit(X_train_log10, y_train)
result_log = classification_log10.predict(X_test_log10)

tp = len(y_test[(y_test==result_log)*(y_test=='Signal')])
fp = len(y_test[(y_test!=result_log)*(y_test=='Signal')])
tn = len(y_test[(y_test==result_log)*(y_test=='Background')])
fn = len(y_test[(y_test!=result_log)*(y_test=='Background')])

print(f'Precision log10: {tp/(tp+fp)}')
print(f'Recall log10: {tp/(tp+fn)}')
print(f'Siginificance log10: {(tp+tn)/(tp+tn+fn+fp)}')

Precision log10: 0.9829
Recall log10: 0.8616638905934952
Siginificance log10: 0.9417


Precision, Recall and Significance all slightly increase, overall the result improves.

## f)

In [7]:
classification_k20 = KNN(k = 20)
classification_k20.fit(X_train, y_train)
result_k20 = classification_k20.predict(X_test)

tp = len(y_test[(y_test==result_k20)*(y_test=='Signal')])
fp = len(y_test[(y_test!=result_k20)*(y_test=='Signal')])
tn = len(y_test[(y_test==result_k20)*(y_test=='Background')])
fn = len(y_test[(y_test!=result_k20)*(y_test=='Background')])

print(f'Precision k=20: {tp/(tp+fp)}')
print(f'Recall k=20: {tp/(tp+fn)}')
print(f'Siginificance k=20: {(tp+tn)/(tp+tn+fn+fp)}')

Precision k=20: 0.9861
Recall k=20: 0.8489883770985794
Siginificance k=20: 0.9369


The results also improve, but not quite as much as with the log10 tranformation.