In [2]:
import pandas as pd
import numpy as np

from operator import itemgetter

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

## Function for calculating the Manhattan distance

In [6]:
def calculate_manhattan_distance(a: list, b: list) -> float:
    """Calculates the Manhattan (L1) distance between two vectors, a and b.

    Args:
        a: First vector, a sequence of numbers.
        b: Second vector; must contain the same number of elements as a.

    Returns:
        The sum of the absolute elementwise differences between a and b
        (0 when both vectors are empty).

    Raises:
        ValueError: If a and b do not have the same length.
    """
    # Reject mismatched vectors up front. Iterating over len(a) alone would silently
    # ignore the trailing elements of a longer b and fail with an IndexError on a shorter b.
    if len(a) != len(b):
        raise ValueError(
            f"Vectors must have the same length, got {len(a)} and {len(b)}"
        )

    # Sum the absolute elementwise differences
    return sum(abs(x - y) for x, y in zip(a, b))

## Function for applying the Manhattan distance

In [7]:
def apply_manhattan_distance(vectors_1: list, vectors_2: list) -> list:
    """Computes the Manhattan distance between every vector in vectors_1 and every vector in vectors_2.

    The result is oriented by vectors_2: element j is a list holding the distance
    from vectors_2[j] to each vector of vectors_1, in vectors_1 order.
    """
    # First build one row per vector of vectors_1, holding its distance to each vector of vectors_2
    rows_by_first = [
        [calculate_manhattan_distance(first, second) for second in vectors_2]
        for first in vectors_1
    ]

    # Transpose so that each row belongs to a vector of vectors_2 instead
    return list(map(list, zip(*rows_by_first)))

## Function for calculating each test observation's label

In [8]:
def calculate_nearest_neighbour(distances: list, labels: list, k: int) -> str:
    """Predicts the label of one test point by majority vote among its k nearest training points.

    Args:
        distances: Distance from the test point to each training point.
        labels: Training labels, aligned element-by-element with distances.
        k: Number of nearest neighbours that take part in the vote; must be at least 1.

    Returns:
        The most common label among the k closest training points. When several labels
        are tied, the tied label that occurs closest to the test point wins.

    Raises:
        ValueError: If k is smaller than 1, or if there are no neighbours to vote.
    """
    if k < 1:
        raise ValueError(f"k must be at least 1, got {k}")

    # Pair each distance with its label and order the pairs from nearest to farthest.
    # The closest pair is kept: the distances are measured to training points only, so the
    # first entry is a genuine neighbour and not the test point matched against itself.
    nearest_first = sorted(zip(distances, labels), key=itemgetter(0))

    # Keep only the labels of the k nearest neighbours
    top_k_labels = [label for _, label in nearest_first[:k]]
    if not top_k_labels:
        raise ValueError("Cannot predict a label without any neighbours")

    # Tally the votes in nearest-first order. Dicts preserve insertion order and max()
    # returns the first maximal key, so ties resolve deterministically to the closest label
    # instead of depending on set iteration order.
    votes = {}
    for label in top_k_labels:
        votes[label] = votes.get(label, 0) + 1

    return max(votes, key=votes.get)

## Function for applying kNN to full test set

In [9]:
def calculate_knn(X_train: list,
                  X_test: list,
                  y_train: list,
                  k: int) -> list:
    """Predicts a label for every observation in X_test from the labelled observations in X_train.

    Distances come from apply_manhattan_distance and each label is chosen by
    calculate_nearest_neighbour with the given k. The returned list holds one
    predicted label per row of distances, in the order those rows are produced.
    """
    # Pairwise distances between the train and test points, one row per test point
    distances_per_test_point = apply_manhattan_distance(X_train, X_test)

    # Vote on a label for each row of distances
    return [
        calculate_nearest_neighbour(row, y_train, k=k)
        for row in distances_per_test_point
    ]

## Reading in beans dataset

In [10]:
# Load the Dry Bean dataset from the Excel workbook. The path is relative, so the kernel's
# working directory must have the data folder available at ../data.
beans = pd.read_excel("../data/Dry_Bean_Dataset.xlsx")
# A single StandardScaler instance is passed to every calculate_samples call. Each call
# fits it again via fit_transform, so it should not be relied on to hold the statistics
# of an earlier split.
bscaler = StandardScaler()

In [12]:
# Sanity-check how many rows and columns were loaded.
# NOTE(review): execution counts jump from In [10] to In [12], so a cell that ran between
# the load and this check is not shown. The published Dry Bean dataset is documented as
# 13,611 rows, exactly half of the 27,222 recorded below — presumably the rows were
# duplicated in that missing cell. Confirm and restore it, otherwise Restart & Run All
# will not reproduce this shape or the timings further down.
beans.shape

(27222, 17)

## Convenience function for calculating different samples from the beans data

In [13]:
def calculate_samples(cols: list, rows: int, scaler: StandardScaler) -> tuple:
    """Draws a sample from the module-level beans data and returns scaled train/test splits.

    Args:
        cols: Columns to keep from beans; must include "Class", which is used as the label.
        rows: Number of rows to sample.
        scaler: Scaler that is fitted on the training features and then applied to the
            test features. It is fitted in place, so the caller's instance is modified.

    Returns:
        A tuple (X_train, X_test, y_train, y_test): the scaled training and test features
        followed by the matching "Class" labels. 25% of the sampled rows go to the test split.
    """
    # Sample the requested rows and columns with a fixed seed so the draw is repeatable
    subset = beans[cols].sample(n=rows, random_state=456)
    target = subset["Class"]
    features = subset.drop(columns=["Class"])

    # Hold out a quarter of the sample for testing, again with a fixed seed
    features_train, features_test, target_train, target_test = train_test_split(
        features, target, test_size=0.25, random_state=456
    )

    # Fit the scaler on the training features only, then reuse that fit for the test features
    return (
        scaler.fit_transform(features_train),
        scaler.transform(features_test),
        target_train,
        target_test,
    )

In [17]:
# Small dataset: 4,000 sampled rows, three feature columns plus the "Class" label.
# calculate_samples holds out 25% of the sampled rows as the test split.
X_train_small, X_test_small, y_train_small, y_test_small = calculate_samples(
    ["MajorAxisLength", "MinorAxisLength", "roundness", "Class"], 4000, bscaler)

In [18]:
# Medium dataset: the same three feature columns plus "Class" as the small dataset,
# but sampling len(beans) rows, i.e. as many rows as beans holds.
X_train_med, X_test_med, y_train_med, y_test_med = calculate_samples(
    ["MajorAxisLength", "MinorAxisLength", "roundness", "Class"], len(beans), bscaler)

In [19]:
# Large dataset: len(beans) rows and every column of beans. Inside calculate_samples
# "Class" becomes the label and all remaining columns are used as features.
X_train_large, X_test_large, y_train_large, y_test_large = calculate_samples(
    beans.columns.to_list(), len(beans), bscaler)

## Applying function to small dataset and timing results

In [20]:
%%time
# Predict labels for the small test split with k=3. The .tolist()/.to_list() calls pass
# plain Python lists, matching calculate_knn's list-typed parameters.
y_pred_small = calculate_knn(X_train_small.tolist(),
                             X_test_small.tolist(),
                             y_train_small.to_list(),
                             k=3)

CPU times: user 14.5 s, sys: 327 ms, total: 14.8 s
Wall time: 14.6 s


## Applying function to medium dataset and timing results

In [21]:
%%time
# Predict labels for the medium test split with k=3. The .tolist()/.to_list() calls pass
# plain Python lists, matching calculate_knn's list-typed parameters.
y_pred_med = calculate_knn(X_train_med.tolist(),
                           X_test_med.tolist(),
                           y_train_med.to_list(),
                           k=3)

CPU times: user 12min 1s, sys: 12.8 s, total: 12min 14s
Wall time: 12min 7s


## Applying function to large dataset and timing results

In [22]:
%%time
# Predict labels for the large test split with k=3. The .tolist()/.to_list() calls pass
# plain Python lists, matching calculate_knn's list-typed parameters.
y_pred_large = calculate_knn(X_train_large.tolist(),
                             X_test_large.tolist(),
                             y_train_large.to_list(),
                             k=3)

CPU times: user 38min 5s, sys: 1min 19s, total: 39min 24s
Wall time: 38min 23s
