# Testing different functions and implementations
___
1. Euclidean distance implementation
2. Manhattan distance implementation
3. Testing KNN classification with leave-one-out cross-validation using SKLEARN

In [27]:
# imports
import numpy as np
from scipy.io import arff
import pandas as pd
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import LeaveOneOut
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
import matplotlib.pyplot as plt

In [3]:
# dataset: read the ARFF file and convert its records into one NumPy array
dataset = pd.DataFrame(arff.loadarff('ionosphere.arff')[0]).to_numpy()
# every column but the last is a feature; the explicit np.float64 cast seemed to be faster
X = dataset[:, :-1].astype(np.float64)
# the last column holds the class label
y = dataset[:, -1]
print(X.shape, y.shape, sep='\n')
# the combined array is no longer needed once X and y are split off
del dataset

(351, 34)
(351,)


## Testing Euclidean distance implementation
___
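Conclusion: `euclidean6()` (`np.dot` on a precomputed difference) was the fastest variant in the timings below (about 6.8 µs per call), narrowly ahead of `euclidean8()` and clearly ahead of `np.linalg.norm()`.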

In [4]:
# Euclidean functions
def euclidean1(x1, x2):
    """Euclidean distance: square root of the summed squared differences.

    Variant 1: difference computed inline, squared with the ``**`` operator.
    """
    return np.sqrt(np.sum((x2 - x1)**2))

def euclidean2(x1, x2):
    """Euclidean distance: square root of the summed squared differences.

    Variant 2: like euclidean1, but the difference is stored in a local first.
    """
    diff = x2 - x1
    return np.sqrt(np.sum((diff)**2))

def euclidean3(x1, x2):
    """Euclidean distance: square root of the summed squared differences.

    Variant 3: difference computed inline, squared with ``np.square``.
    """
    return np.sqrt(np.sum(np.square(x2 - x1)))

def euclidean4(x1, x2):
    """Euclidean distance: square root of the summed squared differences.

    Variant 4: like euclidean3, but the difference is stored in a local first.
    """
    diff = x2 - x1
    return np.sqrt(np.sum(np.square(diff)))

def euclidean5(x1, x2):
    """Euclidean distance as the square root of ``np.dot`` of the difference with itself.

    Variant 5: the difference ``x2 - x1`` is computed twice, once per argument.
    Assumes 1-D vectors (as in this notebook's usage), where the dot product
    equals the sum of squared differences.
    """
    return np.sqrt(np.dot(x2 - x1, x2 - x1))

def euclidean6(x1, x2):
    """Euclidean distance as the square root of ``np.dot`` of the difference with itself.

    Variant 6: like euclidean5, but the difference is computed once and reused.
    Assumes 1-D vectors (as in this notebook's usage).
    """
    diff = x2-x1
    return np.sqrt(np.dot(diff, diff))

def euclidean7(x1, x2):
    """Euclidean distance as the square root of ``np.inner`` of the difference with itself.

    Variant 7: the difference ``x2 - x1`` is computed twice, once per argument.
    Assumes 1-D vectors (as in this notebook's usage).
    """
    return np.sqrt(np.inner(x2 - x1, x2 - x1))

def euclidean8(x1, x2):
    """Euclidean distance as the square root of ``np.inner`` of the difference with itself.

    Variant 8: like euclidean7, but the difference is computed once and reused.
    Assumes 1-D vectors (as in this notebook's usage).
    """
    diff = x2-x1
    return np.sqrt(np.inner(diff, diff))

def euclidean9(x1, x2):
    """Euclidean distance using ``np.einsum`` to sum the squared differences.

    Variant 9: the subscripts ``'i,i->'`` multiply the difference with itself
    element-wise and sum over the single index, so the inputs must be 1-D.
    """
    diff = x2-x1
    return np.sqrt(np.einsum('i,i->', diff, diff))

# Output of functions
m = X.shape[0] # number of examples

# Pick the two comparison rows with a seeded generator so the printed values
# (and the timing cells that reuse x1/x2) are reproducible on a fresh kernel.
# replace=False yields two different row indices, so a row is never compared
# with itself.
rng = np.random.default_rng(0)
idx1, idx2 = rng.choice(m, size=2, replace=False)
x1 = X[idx1]
x2 = X[idx2]

# NumPy's own norm is the reference every hand-written variant must match.
reference = np.linalg.norm(x2 - x1)
print('np.linalg.norm():', reference)
for fn in (euclidean1, euclidean2, euclidean3, euclidean4, euclidean5,
           euclidean6, euclidean7, euclidean8, euclidean9):
    distance = fn(x1, x2)
    assert np.isclose(distance, reference), fn.__name__
    print(fn.__name__ + ':', distance)

np.linalg.norm(): 4.424055650802779
euclidean1: 4.424055650802779
euclidean2: 4.424055650802779
euclidean3: 4.424055650802779
euclidean4: 4.424055650802779
euclidean5: 4.424055650802779
euclidean6: 4.424055650802779
euclidean7: 4.424055650802779
euclidean8: 4.424055650802779
euclidean9: 4.424055650802779


In [5]:
# Time every Euclidean variant on the same pair of rows (x1, x2 from the cell above).
# Each label is printed with end='' so it shares a line with the %timeit result.
# The np.linalg.norm() baseline includes the subtraction, so it does the same
# work as the functions, which compute the difference themselves.
# Arguments are passed as (x2, x1) here; the distance is symmetric, so only the
# sign of the intermediate difference changes.
print('np.linalg.norm(): ', end='')
%timeit diff = x2 - x1; np.linalg.norm(diff)
print('euclidean1: ', end='')
%timeit euclidean1(x2, x1)
print('euclidean2: ', end='')
%timeit euclidean2(x2, x1)
print('euclidean3: ', end='')
%timeit euclidean3(x2, x1)
print('euclidean4: ', end='')
%timeit euclidean4(x2, x1)
print('euclidean5: ', end='')
%timeit euclidean5(x2, x1)
print('euclidean6: ', end='')
%timeit euclidean6(x2, x1)
print('euclidean7: ', end='')
%timeit euclidean7(x2, x1)
print('euclidean8: ', end='')
%timeit euclidean8(x2, x1)
print('euclidean9: ', end='')
%timeit euclidean9(x2, x1)

np.linalg.norm(): 11.3 µs ± 439 ns per loop (mean ± std. dev. of 7 runs, 100000 loops each)
euclidean1: 14.3 µs ± 349 ns per loop (mean ± std. dev. of 7 runs, 100000 loops each)
euclidean2: 14 µs ± 572 ns per loop (mean ± std. dev. of 7 runs, 100000 loops each)
euclidean3: 13.3 µs ± 58.1 ns per loop (mean ± std. dev. of 7 runs, 100000 loops each)
euclidean4: 14.3 µs ± 1.05 µs per loop (mean ± std. dev. of 7 runs, 100000 loops each)
euclidean5: 8.97 µs ± 341 ns per loop (mean ± std. dev. of 7 runs, 100000 loops each)
euclidean6: 6.77 µs ± 27.7 ns per loop (mean ± std. dev. of 7 runs, 100000 loops each)
euclidean7: 8.93 µs ± 44.4 ns per loop (mean ± std. dev. of 7 runs, 100000 loops each)
euclidean8: 6.98 µs ± 55.5 ns per loop (mean ± std. dev. of 7 runs, 100000 loops each)
euclidean9: 8.84 µs ± 68.6 ns per loop (mean ± std. dev. of 7 runs, 100000 loops each)


## Testing Manhattan distance implementation
___
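Conclusion: `manhattan1` works fine. `manhattan2` returns the same value and is equally fast within timing noise (about 4.8–4.9 µs per call), so storing the difference in a local makes no measurable difference; both are slightly faster than `np.linalg.norm(..., ord=1)` (about 5.9 µs).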

In [9]:
def manhattan1(x1, x2):
    """Manhattan (L1) distance: sum of the absolute element-wise differences.

    Variant 1: difference computed inline.
    """
    return np.sum(np.abs(x2 - x1))

def manhattan2(x1, x2):
    """Manhattan (L1) distance: sum of the absolute element-wise differences.

    Variant 2: like manhattan1, but the difference is stored in a local first.
    """
    diff = x2 - x1
    return np.sum(np.abs(diff))

# Output of functions: NumPy's L1 norm first as the reference, then both
# hand-written variants, one value per line
print(np.linalg.norm(x2 - x1, ord=1), manhattan1(x1, x2), manhattan2(x1, x2), sep='\n')

21.679720000000003
21.679720000000003
21.679720000000003


In [10]:
# Timings in order: np.linalg.norm(ord=1) baseline (subtraction included so the
# work is comparable), then manhattan1, then manhattan2 - all on the same x1/x2.
%timeit diff = x2 - x1; np.linalg.norm(diff, ord=1)
%timeit manhattan1(x1, x2)
%timeit manhattan2(x1, x2)

5.89 µs ± 335 ns per loop (mean ± std. dev. of 7 runs, 100000 loops each)
4.84 µs ± 130 ns per loop (mean ± std. dev. of 7 runs, 100000 loops each)
4.89 µs ± 362 ns per loop (mean ± std. dev. of 7 runs, 100000 loops each)


## Sklearn KNN Classifier
___
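Accuracy is estimated with leave-one-out cross-validation, which appears to be what the assignment asks for: train on all examples except one, then predict on that single held-out example, and repeat for every possible split. With m examples in the dataset there are m splits, and the accuracy is the fraction of held-out examples that are classified correctly.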

In [19]:
# the labels are of type 'bytes', which sklearn doesn't like, so cast them to str
y = y.astype(str)

In [20]:
# No feature scaling by default (pass `scaler=` to enable it)
def KNNclassifierWithLOO(X, y, k, scaler=None):
    """Leave-one-out accuracy of a k-nearest-neighbours classifier.

    Every example is held out once: the classifier is fitted on all remaining
    examples and asked to predict the held-out one.

    Parameters
    ----------
    X : NumPy array of features, one row per example.
    y : NumPy array of labels, one entry per row of X.
    k : number of neighbours handed to KNeighborsClassifier.
    scaler : optional transformer offering fit_transform/transform
        (e.g. StandardScaler() or MinMaxScaler()). When given, it is re-fitted
        on the training rows of every split and then applied to the held-out
        row, so the caller's scaler object is modified. The default None keeps
        the features unscaled.

    Returns
    -------
    float
        Fraction of held-out examples that were predicted correctly.
    """
    classifier = KNeighborsClassifier(n_neighbors=k)
    correct_count = 0
    total_count = y.shape[0]

    # Train on each split and predict on the single test example
    for train_index, test_index in LeaveOneOut().split(X):
        X_train = X[train_index]
        X_test = X[test_index]
        if scaler is not None:
            # Fit on the training rows only so the held-out example does not
            # influence the scaling statistics.
            X_train = scaler.fit_transform(X_train)
            X_test = scaler.transform(X_test)
        classifier.fit(X_train, y[train_index])
        # Exactly one example is held out, so compare the scalars explicitly
        # instead of relying on the truth value of a one-element array.
        if classifier.predict(X_test)[0] == y[test_index][0]:
            correct_count += 1

    return correct_count / total_count

In [21]:
# leave-one-out accuracy with the 3 nearest neighbours on the unscaled features
accuracy = KNNclassifierWithLOO(X, y, k=3)
print('accuracy:', accuracy)

accuracy: 0.8490028490028491


In [22]:
# Cost of one complete leave-one-out run (one fit + predict per example) with k = 3
%timeit KNNclassifierWithLOO(X, y, 3)

836 ms ± 38.8 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [23]:
# With standard scaler
def KNNclassifierWithLOOandStandardScaler(X, y, k):
    """Leave-one-out accuracy of a k-nearest-neighbours classifier on standardised features.

    Every example is held out once. A StandardScaler is fitted on the remaining
    training rows, applied to both the training rows and the held-out row, and
    the classifier is then fitted and asked to predict the held-out example.

    Parameters
    ----------
    X : NumPy array of features, one row per example.
    y : NumPy array of labels, one entry per row of X.
    k : number of neighbours handed to KNeighborsClassifier.

    Returns
    -------
    float
        Fraction of held-out examples that were predicted correctly.
    """
    classifier = KNeighborsClassifier(n_neighbors=k)
    scaler = StandardScaler()
    correct_count = 0
    total_count = y.shape[0]

    # Train on each split and predict on the single test example
    for train_index, test_index in LeaveOneOut().split(X):
        # Fit the scaler on the training rows only so the held-out example
        # does not influence the scaling statistics.
        X_train = scaler.fit_transform(X[train_index])
        X_test = scaler.transform(X[test_index])
        classifier.fit(X_train, y[train_index])
        # Exactly one example is held out, so compare the scalars explicitly
        # instead of relying on the truth value of a one-element array.
        if classifier.predict(X_test)[0] == y[test_index][0]:
            correct_count += 1

    return correct_count / total_count

In [24]:
# leave-one-out accuracy with the 3 nearest neighbours on standardised features
accuracy = KNNclassifierWithLOOandStandardScaler(X, y, k=3)
print('accuracy:', accuracy)

accuracy: 0.8461538461538461


In [25]:
# With min max scaler
def KNNclassifierWithLOOandMinMaxScaler(X, y, k):
    """Leave-one-out accuracy of a k-nearest-neighbours classifier on min-max scaled features.

    Every example is held out once. A MinMaxScaler is fitted on the remaining
    training rows, applied to both the training rows and the held-out row, and
    the classifier is then fitted and asked to predict the held-out example.

    Parameters
    ----------
    X : NumPy array of features, one row per example.
    y : NumPy array of labels, one entry per row of X.
    k : number of neighbours handed to KNeighborsClassifier.

    Returns
    -------
    float
        Fraction of held-out examples that were predicted correctly.
    """
    classifier = KNeighborsClassifier(n_neighbors=k)
    scaler = MinMaxScaler()
    correct_count = 0
    total_count = y.shape[0]

    # Train on each split and predict on the single test example
    for train_index, test_index in LeaveOneOut().split(X):
        # Fit the scaler on the training rows only so the held-out example
        # does not influence the scaling range.
        X_train = scaler.fit_transform(X[train_index])
        X_test = scaler.transform(X[test_index])
        classifier.fit(X_train, y[train_index])
        # Exactly one example is held out, so compare the scalars explicitly
        # instead of relying on the truth value of a one-element array.
        if classifier.predict(X_test)[0] == y[test_index][0]:
            correct_count += 1

    return correct_count / total_count

In [26]:
# leave-one-out accuracy with the 3 nearest neighbours on min-max scaled features
accuracy = KNNclassifierWithLOOandMinMaxScaler(X, y, k=3)
print('accuracy:', accuracy)

accuracy: 0.8603988603988604


In [None]:
# Compare the three preprocessing variants across several neighbourhood sizes.
n_neighbours = [1, 3, 5, 7, 9, 11, 13, 15]
acc1 = []  # no scaling
acc2 = []  # StandardScaler
acc3 = []  # MinMaxScaler

# One full leave-one-out run per (k, variant) pair.
for k in n_neighbours:
    acc1.append(KNNclassifierWithLOO(X, y, k))
    acc2.append(KNNclassifierWithLOOandStandardScaler(X, y, k))
    acc3.append(KNNclassifierWithLOOandMinMaxScaler(X, y, k))

# Explicit figure/axes instead of the implicit pyplot state machine, plus a
# title so the figure can be understood on its own.
fig, ax = plt.subplots()
ax.plot(n_neighbours, acc1, 'r-', label='No scaling')
ax.plot(n_neighbours, acc2, 'b-', label='Standard scaler')
ax.plot(n_neighbours, acc3, 'g-', label='MinMax scaler')
ax.set_xlabel('k')
ax.set_ylabel('accuracy')
ax.set_title('Leave-one-out KNN accuracy by number of neighbours')
ax.set_xticks(n_neighbours)
ax.legend()
plt.show()