In [0]:
import numpy as np
import matplotlib.pyplot as plt

from sklearn.datasets import load_iris, load_wine
from sklearn.model_selection import train_test_split
from scipy.stats import mode

In [0]:
# Load the Iris dataset once and unpack its data matrix and target vector
iris = load_iris()
X_iris, Y_iris = iris.data, iris.target

In [0]:
# 50/50 train/test split. The seed is fixed so that restarting the kernel and
# re-running the notebook reproduces the same split and therefore the same scores.
X_iris_train, X_iris_test, Y_iris_train, Y_iris_test = train_test_split(
    X_iris, Y_iris, test_size=0.5, random_state=42
)

In [4]:
# Sanity check: display the number of labels that ended up in the training split
Y_iris_train.shape

(75,)

In [0]:
def euclideanDistance(x_a, x_b):
    """
    Euclidean (L2) distance between two vectors.

    Thin wrapper that delegates to minkowskiDistance with order 2.

    Arguments:
        x_a (np.array): shape [m_features, ] first vector
        x_b (np.array): shape [m_features, ] second vector

    Returns:
        distance (float): Euclidean distance between x_a and x_b
    """
    l2_order = 2
    return minkowskiDistance(x_a, x_b, l2_order)

In [0]:
def manhattanDistance(x_a, x_b):
    """
    Manhattan (L1) distance between two vectors.

    Thin wrapper that delegates to minkowskiDistance with order 1.

    Arguments:
        x_a (np.array): shape [m_features, ] first vector
        x_b (np.array): shape [m_features, ] second vector

    Returns:
        distance (float): Manhattan distance between x_a and x_b
    """
    l1_order = 1
    return minkowskiDistance(x_a, x_b, l1_order)

In [0]:
def chebyshevDistance(x_a, x_b):
    """
    Chebyshev (L-infinity) distance between two vectors.

    Arguments:
        x_a (np.array): shape [m_features, ] first vector
        x_b (np.array): shape [m_features, ] second vector

    Returns:
        distance (float): the largest absolute per-feature difference
            between x_a and x_b
    """
    per_feature_gap = np.abs(x_a - x_b)
    return np.max(per_feature_gap)

In [0]:
def minkowskiDistance(x_a, x_b, p=2):
    """
    Calculates the Minkowski (Lp) distance between two vectors

    Arguments:
        x_a (np.array): shape [m_features, ] a single vector a
        x_b (np.array): shape [m_features, ] a single vector b
        p (int or float): Sets the Lp distance metric to use:
            1 - Manhattan
            2 - Euclidean
            inf - Chebyshev

    Returns:
        distance (float): Minkowski distance between vectors x_a and x_b
    """

    # Work in floating point so that list inputs are accepted and unsigned
    # integer dtypes cannot wrap around when the vectors are subtracted.
    abs_diff = np.abs(np.asarray(x_a, dtype=float) - np.asarray(x_b, dtype=float))

    # The general formula breaks down for p = inf (inf ** 0 evaluates to 1),
    # so the limiting case, the largest absolute difference, is computed directly.
    if p == np.inf:
        return np.max(abs_diff)

    distance = np.sum(abs_diff**p)**(1/p)
    return distance

In [0]:
def calculateDistances(x_test, X_in, distanceFunction):
    """
    Measures how far a single test example is from every example in X_in.

    Args:
        x_test (np.array): shape [n_features,] a single test example
        X_in (np.array): shape [n_samples, n_features] the examples to compare against
        distanceFunction (callable): called as distanceFunction(example, x_test)
            for each example in X_in; its return value is stored as the distance

    Returns:
        distance_list (list): one distance per example, in the order of X_in
    """
    return [distanceFunction(candidate, x_test) for candidate in X_in]

In [0]:
def kNearestIndices(distance_list, k):
    """
    Determines the indices of the k nearest neighbours

    Arguments:
        distance_list (list of float): list of distances between a test point
            and every training example
        k (int): the number of nearest neighbours to consider, must be >= 1

    Returns:
        k_nearest_indices (array of int): shape [min(k, n_distances),] array of
            the indices corresponding to the k nearest neighbours, ordered from
            nearest to farthest

    Raises:
        ValueError: if k is smaller than 1
    """

    # A negative k would silently slice off the *farthest* |k| points instead of
    # selecting neighbours, and k = 0 selects nothing, so both are rejected.
    if k < 1:
        raise ValueError("k must be a positive integer, got {!r}".format(k))

    # A stable sort makes tie-breaking deterministic: among equally distant
    # examples, the one that appears first in distance_list wins.
    nearest_first = np.argsort(distance_list, kind="stable")
    k_nearest_indices = np.array(nearest_first[:k])
    return k_nearest_indices

In [0]:
def kNearestNeighbours(k_nearest_indices, X_in, Y_in):
    """
    Creates the dataset of k nearest neighbours

    Arguments:
        k_nearest_indices (array of int): shape [k,] array of the indices
            corresponding to the k nearest neighbours
        X_in (array): shape [n_examples, n_features] the example data matrix to sample from
        Y_in (array): shape [n_examples, ] the label data matrix to sample from

    Returns:
        X_k (array): shape [k, n_features] the k nearest examples
        Y_k (array): shape [k, ] the labels corresponding to the k nearest examples
    """

    # Integer-array indexing gathers all selected rows in one step and returns
    # copies. It also keeps the trailing feature dimension when no index is
    # selected, so X_k has shape [0, n_features] rather than (0,).
    selected = np.asarray(k_nearest_indices, dtype=np.intp)
    X_k = np.asarray(X_in)[selected]
    Y_k = np.asarray(Y_in)[selected]
    return X_k, Y_k

In [0]:
def predict(x_test, X_in, Y_in, k, distanceFunction):
    """
    Predicts the class of a single test example

    Arguments:
        x_test (np.array): shape [n_features, ] the test example to classify
        X_in (np.array): shape [n_input_examples, n_features] the example data matrix to sample from
        Y_in (np.array): shape [n_input_labels, ] the label data matrix to sample from
        k (int): number of nearest neighbours that vote on the class
        distanceFunction (callable): distance between two feature vectors,
            forwarded to calculateDistances

    Returns:
        prediction (array): shape [1,] the number corresponding to the class
    """

    distance_list = calculateDistances(x_test, X_in, distanceFunction)
    kNN_indices = kNearestIndices(distance_list, k)
    # Only the neighbours' labels take part in the vote.
    _, Y_k = kNearestNeighbours(kNN_indices, X_in, Y_in)

    # Majority vote with NumPy instead of scipy.stats.mode: the shape of mode's
    # result for axis=None differs between SciPy releases (length-1 array in
    # older ones, scalar in newer ones), which broke the documented [1,] shape.
    # np.unique returns labels sorted ascending and argmax picks the first
    # maximum, so ties resolve to the smallest label.
    labels, votes = np.unique(Y_k, return_counts=True)
    prediction = np.array([labels[np.argmax(votes)]])

    return prediction

In [0]:
def predictBatch(X_t, X_in, Y_in, k, distanceFunction):
    """
    Performs predictions over a batch of test examples

    Arguments:
        X_t (np.array): shape [n_test_examples, n_features]
        X_in (np.array): shape [n_input_examples, n_features]
        Y_in (np.array): shape [n_input_labels, ]
        k (int): number of nearest neighbours to consider
        distanceFunction (callable): distance between two feature vectors,
            forwarded to predict

    Returns:
        predictions (np.array): shape [n_test_examples,] the array of predictions

    """
    # Flatten each prediction before taking its first element so that both a
    # length-1 array and a bare scalar are accepted; indexing a scalar directly
    # with [0] raises IndexError.
    predictions = [
        np.ravel(predict(x_t_i, X_in, Y_in, k, distanceFunction))[0]
        for x_t_i in X_t
    ]

    return np.array(predictions)

In [0]:
def accuracy(Y_pred, Y_test):
    """
    Fraction of predictions that match the reference labels.

    Arguments:
        Y_pred (np.array): shape [n_test_examples,] an array of model predictions
        Y_test (np.array): shape [n_test_labels,] an array of test labels to
            evaluate the predictions against; must have the same shape as Y_pred

    Returns:
        accuracy (float): number of matching positions divided by len(Y_test)
    """
    assert(Y_pred.shape == Y_test.shape)

    n_labels = len(Y_test)
    # Count the positions where prediction and label agree
    n_hits = sum(1 for idx in range(n_labels) if Y_pred[idx] == Y_test[idx])

    return n_hits / n_labels

In [0]:
def run(X_train, X_test, Y_train, Y_test, k, distanceFunction=euclideanDistance):
    """
    Classifies every test example with k-nearest-neighbours and scores the result.

    Arguments:
        X_train (np.array): shape [n_train_examples, n_features]
        X_test (np.array): shape [n_test_examples, n_features]
        Y_train (np.array): shape [n_train_examples, ]
        Y_test (np.array): shape [n_test_examples, ]
        k (int): number of nearest neighbours to consider
        distanceFunction (callable): distance between two feature vectors,
            forwarded to predictBatch; defaults to euclideanDistance

    Returns:
        test_accuracy (float): accuracy of the predictions on the test data
    """
    return accuracy(
        predictBatch(X_test, X_train, Y_train, k, distanceFunction),
        Y_test,
    )

In [16]:
# Test accuracy with k = 4 neighbours and the Manhattan distance
print(
    run(
        X_iris_train, X_iris_test, Y_iris_train, Y_iris_test,
        k=4,
        distanceFunction=manhattanDistance,
    )
)

0.9466666666666667


In [17]:
# Test accuracy with k = 4 neighbours and the Chebyshev distance
print(
    run(
        X_iris_train, X_iris_test, Y_iris_train, Y_iris_test,
        k=4,
        distanceFunction=chebyshevDistance,
    )
)

0.9733333333333334


In [18]:
# Test accuracy with k = 4 neighbours and the Euclidean distance
print(
    run(
        X_iris_train, X_iris_test, Y_iris_train, Y_iris_test,
        k=4,
        distanceFunction=euclideanDistance,
    )
)

0.96


In [19]:
# Worked example: the largest per-coordinate gap is |20 - 24| = 4
row1, row2 = (
    np.array([10, 20, 15, 10, 5]),
    np.array([12, 24, 18, 8, 7]),
)
print(chebyshevDistance(row1, row2))

4


In [20]:
# Worked example: the largest per-coordinate gap is |0 - 7| = 7
row1, row2 = (
    np.array([0, 3, 4, 5]),
    np.array([7, 6, 3, -1]),
)
print(chebyshevDistance(row1, row2))

7
