In [2]:
import numpy as np
import pandas as pd

from operator import itemgetter

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

## Function for calculating and applying the Manhattan distance
Using broadcast vector subtraction to calculate differences between two 2D arrays.

In [3]:
def calculate_manhattan_distance_vectorised(train_observations: np.ndarray,
                                            test_observations: np.ndarray,
                                            ) -> np.ndarray:
    """
    Generalised formula for calculating both Manhattan and Euclidean distances. Calculates pairwise distances between every point in two n-dimensional arrays.
    array1: first set of points;
    array2: second set of points;
    p: power parameter which determines the distance metric used, with 1 = Manhattan and 2 = Euclidean.
    """

    diffs = train_observations[None, :, :] - test_observations[:, None, :]
    abs_diffs = np.abs(diffs)
    return abs_diffs.sum(axis=-1)

## Function for calculating each test observation's label

In [4]:
def calculate_nearest_neighbour(distances: list, labels: list, k: int) -> str:
    """Calculates the k-nearest neighbours for a test point, using k selected neighbours."""

    # Zip together the training labels and distances to a test point and sort by distance.
    sorted_distances = sorted(zip(distances, labels), key=itemgetter(0))[1:]

    # Keep only the top k labels
    top_n_labels = [label for dist, label in sorted_distances][:k]

    # Return the most common label
    return max(set(top_n_labels), key=top_n_labels.count)

## Function for applying kNN to full test set

In [5]:
def calculate_knn(X_train: list,
                  X_test: list,
                  y_train: list,
                  k: int) -> list:
    """Calculates the k-nearest neighbours for all points in a test set."""

    # Calculate the pairwise distances between all train and test points
    train_test_distances = calculate_manhattan_distance_vectorised(X_train, X_test)

    # Calculate the most common labels for each test point
    predicted_labels = []
    for distance_list in train_test_distances:
        predicted_labels += [calculate_nearest_neighbour(distance_list, y_train, k=k)]

    return predicted_labels

## Reading in beans dataset

In [16]:
beans = pd.read_excel("../data/Dry_Bean_Dataset.xlsx")
bscaler = StandardScaler()

## Convenience function for calculating different samples from the beans data

In [9]:
def calculate_samples(cols: list, rows: int, scaler: StandardScaler) -> tuple:
    # Prepare random sample with selected columns and row count
    sample_df = beans[cols].sample(n=rows, random_state=456)
    X = sample_df.drop(columns=["Class"])
    y = sample_df["Class"]

    # Create test and train sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=456)

    # Scale features
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)

    return X_train, X_test, y_train, y_test

## Applying function to small dataset and timing results

In [22]:
X_train_small, X_test_small, y_train_small, y_test_small = calculate_samples(
    ["MajorAxisLength", "MinorAxisLength", "roundness", "Class"], 4000, bscaler)

In [23]:
%%time
y_pred_small = calculate_knn(X_train_small,
                             X_test_small,
                             y_train_small.to_list(),
                             k=3)

CPU times: user 1.27 s, sys: 44.4 ms, total: 1.32 s
Wall time: 1.32 s


## Applying function to medium dataset and timing results

In [20]:
X_train_med, X_test_med, y_train_med, y_test_med = calculate_samples(
    ["MajorAxisLength", "MinorAxisLength", "roundness", "Class"], len(beans), bscaler)

In [21]:
%%time
y_pred_med = calculate_knn(X_train_med,
                           X_test_med,
                           y_train_med.to_list(),
                           k=3)

CPU times: user 1min 2s, sys: 1.83 s, total: 1min 4s
Wall time: 1min 4s


## Applying function to large dataset and timing results

In [18]:
X_train_large, X_test_large, y_train_large, y_test_large = calculate_samples(
    beans.columns.to_list(), len(beans), bscaler)

In [19]:
%%time
y_pred_large = calculate_knn(X_train_large,
                             X_test_large,
                             y_train_large.to_list(),
                             k=3)

CPU times: user 1min 16s, sys: 51.2 s, total: 2min 7s
Wall time: 3min 1s
