In [12]:
import numpy as np
import pandas as pd

from operator import itemgetter

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score

In [13]:
def calculate_minkowski_distance(X: np.array,
                                 Y: np.array,
                                 p: int
                                 ) -> float:
    """Calculates the Minkowski distance between two vectors, X and Y. When p = 1, calculates the Manhattan distance, when p = 2, calculates the Euclidean distance."""

    # Calculate a vector subtraction between two vectors, takes the absolute value of each element in this difference vector, then raises each element to the power of p
    diffs = np.power(np.abs(X - Y), p)

    # Sums all elements in the vector
    euclidean_distance_sq = sum(diffs)

    # Returns this sum raised to the power of (1/p)
    return euclidean_distance_sq ** (1 / p)

In [14]:
def apply_minkowski_distance(vectors_1: list, vectors_2: list, p: int) -> list:
    """Calculates the pairwise difference between two lists of vectors."""

    # Create empty list to store distances
    distances = []

    # Nested loop to get the difference between every element of list one and list two
    for a in vectors_1:
        tmp_distances = []
        for b in vectors_2:
            tmp_distances.append(calculate_minkowski_distance(a, b, p))
        distances.append(tmp_distances)

    # Reshape the list so every element represents the differences between a test point and all train points
    return [list(x) for x in zip(*distances)]

In [15]:
def calculate_nearest_neighbour(distances: list, labels: list, k: int) -> str:
    """Calculates the k-nearest neighbours for a test point, using k selected neighbours."""

    # Zip together the training labels and distances to a test point and sort by distance.
    sorted_distances = sorted(zip(distances, labels), key=itemgetter(0))[1:]

    # Keep only the top k labels
    top_n_labels = [label for dist, label in sorted_distances][:k]

    # Return the most common label
    return max(set(top_n_labels), key=top_n_labels.count)

In [16]:
def calculate_knn(X_train: list,
                  X_test: list,
                  y_train: list,
                  p: int,
                  k: int) -> list:
    """Calculates the k-nearest neighbours for all points in a test set."""

    # Calculate the pairwise distances between all train and test points
    train_test_distances = apply_minkowski_distance(X_train, X_test, p=p)

    # Calculate the most common labels for each test point
    predicted_labels = []
    for distance_list in train_test_distances:
        predicted_labels += [calculate_nearest_neighbour(distance_list, y_train, k=k)]

    return predicted_labels

In [17]:
beans = pd.read_excel("../data/Dry_Bean_Dataset.xlsx")
bscaler = StandardScaler()

In [18]:
def calculate_samples(cols: list, rows: int, scaler: StandardScaler) -> tuple:
    # Prepare random sample with selected columns and row count
    sample_df = beans[cols].sample(n=rows, random_state=456)
    X = sample_df.drop(columns=["Class"])
    y = sample_df["Class"]

    # Create test and train sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=456)

    # Scale features
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)

    return X_train, X_test, y_train, y_test

In [19]:
X_train_small, X_test_small, y_train_small, y_test_small = calculate_samples(
    ["MajorAxisLength", "MinorAxisLength", "roundness", "Class"], 400, bscaler)

In [20]:
%%time
y_pred_small = calculate_knn(X_train_small,
                             X_test_small,
                             y_train_small.to_list(),
                             p=2,
                             k=3)

CPU times: user 171 ms, sys: 3.9 ms, total: 175 ms
Wall time: 174 ms


In [21]:
print(accuracy_score(y_test_small, y_pred_small))

0.91


In [22]:
X_train_med, X_test_med, y_train_med, y_test_med = calculate_samples(
    ["MajorAxisLength", "MinorAxisLength", "roundness", "Class"], len(beans), bscaler)

In [23]:
%%time
y_pred_med = calculate_knn(X_train_med,
                           X_test_med,
                           y_train_med.to_list(),
                           p=2,
                           k=3)

CPU times: user 3min, sys: 1.64 s, total: 3min 1s
Wall time: 3min 2s


In [24]:
print(accuracy_score(y_test_med, y_pred_med))

0.8995004407875404


In [25]:
X_train_large, X_test_large, y_train_large, y_test_large = calculate_samples(
    beans.columns.to_list(), len(beans), bscaler)

In [26]:
%%time
y_pred_large = calculate_knn(X_train_large,
                             X_test_large,
                             y_train_large.to_list(),
                             p=2,
                             k=3)

CPU times: user 3min 55s, sys: 1.91 s, total: 3min 56s
Wall time: 3min 57s


In [None]:
print(accuracy_score(y_test_large, y_pred_large))