In [1]:
import numpy as np
import pandas as pd

from operator import itemgetter

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score

In [4]:
def calculate_minkowski_distance(X: list,
                                 Y: list,
                                 p: int) -> float:
    """Calculates the Minkowski distance between two vectors, X and Y. When p = 1, calculates the Manhattan distance, when p = 2, calculates the Euclidean distance."""

    # Get the number of elements in each vector
    i = len(X)

    # Create empty list to hold differences
    diffs = []

    # Loop over each element in the two vectors and take their elementwise difference. Get absolute value and raise to power of p
    for element in np.arange(0, i):
        diffs.append(np.abs(X[element] - Y[element]) ** p)

    # Sum these absolute differences
    euclidean_distance_sq = sum(diffs)

    # Return the total raised to the power of 1/p
    return euclidean_distance_sq ** (1 / p)

In [5]:
def apply_minkowski_distance(vectors_1: list, vectors_2: list, p: int) -> list:
    """Calculates the pairwise difference between two lists of vectors."""

    # Create empty list to store distances
    distances = []

    # Nested loop to get the difference between every element of list one and list two
    for a in vectors_1:
        tmp_distances = []
        for b in vectors_2:
            tmp_distances.append(calculate_minkowski_distance(a, b, p))
        distances.append(tmp_distances)

    # Reshape the list so every element represents the differences between a test point and all train points
    return [list(x) for x in zip(*distances)]

In [6]:
def calculate_nearest_neighbour(distances: list, labels: list, k: int) -> str:
    """Calculates the k-nearest neighbours for a test point, using k selected neighbours."""

    # Zip together the training labels and distances to a test point and sort by distance.
    sorted_distances = sorted(zip(distances, labels), key=itemgetter(0))[1:]

    # Keep only the top k labels
    top_n_labels = [label for dist, label in sorted_distances][:k]

    # Return the most common label
    return max(set(top_n_labels), key=top_n_labels.count)

In [10]:
def calculate_knn(X_train: list,
                  X_test: list,
                  y_train: list,
                  p: int,
                  k: int) -> list:
    """Calculates the k-nearest neighbours for all points in a test set."""

    # Calculate the pairwise distances between all train and test points
    train_test_distances = apply_minkowski_distance(X_train, X_test, p=p)

    # Calculate the most common labels for each test point
    predicted_labels = []
    for distance_list in train_test_distances:
        predicted_labels += [calculate_nearest_neighbour(distance_list, y_train, k=k)]

    return predicted_labels

In [26]:
beans = pd.read_excel("../data/Dry_Bean_Dataset.xlsx")
bscaler = StandardScaler()

In [27]:
def calculate_samples(cols: list, rows: int, scaler: StandardScaler) -> tuple:
    # Prepare random sample with selected columns and row count
    sample_df = beans[cols].sample(n=rows, random_state=456)
    X = sample_df.drop(columns=["Class"])
    y = sample_df["Class"]

    # Create test and train sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=456)

    # Scale features
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)

    return X_train, X_test, y_train, y_test

In [36]:
X_train_small, X_test_small, y_train_small, y_test_small = calculate_samples(
    ["MajorAxisLength", "MinorAxisLength", "roundness", "Class"], 400, bscaler)

In [38]:
%%time
y_pred_small = calculate_knn(X_train_small.tolist(),
                             X_test_small.tolist(),
                             y_train_small.to_list(),
                             p=2,
                             k=3)

CPU times: user 184 ms, sys: 23.6 ms, total: 208 ms
Wall time: 193 ms


In [39]:
X_train_med, X_test_med, y_train_med, y_test_med = calculate_samples(
    ["MajorAxisLength", "MinorAxisLength", "roundness", "Class"], len(beans), bscaler)

In [40]:
%%time
y_pred_med = calculate_knn(X_train_med.tolist(),
                           X_test_med.tolist(),
                           y_train_med.to_list(),
                           p=2,
                           k=3)

CPU times: user 3min 47s, sys: 4.01 s, total: 3min 51s
Wall time: 3min 51s


In [41]:
print(accuracy_score(y_test_med, y_pred_med))

0.9003820158683514


In [43]:
X_train_large, X_test_large, y_train_large, y_test_large = calculate_samples(
    beans.columns.to_list(), len(beans), bscaler)

In [44]:
%%time
y_pred_large = calculate_knn(X_train_large.tolist(),
                             X_test_large.tolist(),
                             y_train_large.to_list(),
                             p=2,
                             k=3)

CPU times: user 12min 39s, sys: 31 s, total: 13min 10s
Wall time: 12min 49s
