# Upload files in Google Colab
If you are running this Jupyter Notebook on Google Colab, run this cell to upload the data files (train_inputs.csv, train_labels.csv, test_inputs.csv, test_labels.csv) to the Colab virtual machine. You will be prompted to select the files that you would like to upload.

If you are running this Jupyter Notebook on your computer, you do not need to run this cell.

In [None]:
# Colab-only cell: prompts for local files and uploads them to the Colab VM.
from google.colab import files
uploaded = files.upload()
# IPython magic (not plain Python): list the working directory to confirm the upload.
%ls

# Import libraries 
Do not use any other Python library.

In [52]:
import numpy as np
import matplotlib.pyplot as plt

# Function: load_knn_data

This function loads the KNN data files from the local drive into RAM.

Outputs:

*   **train_inputs**: numpy array of N training data points x M features
*   **train_labels**: numpy array of N training labels
*   **test_inputs**: numpy array of N' test data points x M features
*   **test_labels**: numpy array of N' test labels




  



In [9]:
def load_knn_data():
    """Read the four KNN data files from the current working directory.

    Every file is parsed as comma-separated text with ``np.genfromtxt``.

    Returns:
        Tuple ``(train_inputs, train_labels, test_inputs, test_labels)`` of
        numpy arrays, in that order.
    """
    # The files are read test set first, then training set.
    file_names = ('test_inputs.csv', 'test_labels.csv',
                  'train_inputs.csv', 'train_labels.csv')
    test_x, test_y, train_x, train_y = [
        np.genfromtxt(file_name, delimiter=',') for file_name in file_names
    ]
    return train_x, train_y, test_x, test_y

# Function: predict_knn

This function uses the KNN classifier to predict the label of a data point.

Inputs:
*   **x**: input data point for which we want to predict the label (numpy array of M features)
*   **inputs**: matrix of data points in which neighbours will be found (numpy array of N data points x M features)
*   **labels**: vector of labels associated with the data points  (numpy array of N labels)
*   **k_neighbours**: # of nearest neighbours that will be used

Output:
*   **predicted_label**: predicted label (integer)




In [3]:
def euclidean_distance(x1, x2):
    """Return the Euclidean (L2) distance between two points.

    Args:
        x1: first point (numpy array of M features).
        x2: second point, same shape as ``x1``.

    Returns:
        Square root of the sum of the squared element-wise differences.
    """
    difference = x1 - x2
    sum_of_squares = np.sum(difference ** 2)
    return np.sqrt(sum_of_squares)

In [13]:
def most_common(arr):
    """Return the value that occurs most often in ``arr``.

    Works on any iterable of hashable values: lists, tuples and 1-D numpy
    arrays. The previous implementation relied on ``arr.count``, which numpy
    arrays do not provide.

    Args:
        arr: non-empty iterable of hashable values.

    Returns:
        The most frequent value. When several values share the highest
        count, the first of them in the iteration order of
        ``set(values)`` is returned, as before.

    Raises:
        ValueError: if ``arr`` is empty.
    """
    values = list(arr)
    if not values:
        raise ValueError("most_common() requires at least one value")

    # Count every value in a single pass instead of calling count() once per
    # distinct value.
    counts = {}
    for value in values:
        counts[value] = counts.get(value, 0) + 1

    # Iterate over the set (not the dict) so ties resolve exactly as they did
    # in the original max(set(arr), key=arr.count).
    return max(set(values), key=counts.get)

In [5]:
def predict_knn(x, inputs, labels, k_neighbours):
    """Predict the label of ``x`` by majority vote among its nearest neighbours.

    Args:
        x: data point to classify (numpy array of M features).
        inputs: candidate neighbours (numpy array of N data points x M features).
        labels: labels of the candidate neighbours (N labels).
        k_neighbours: number of nearest neighbours that vote (positive integer).
            Values larger than N use all N points.

    Returns:
        The most common label among the ``k_neighbours`` closest points.

    Raises:
        ValueError: if ``k_neighbours`` is smaller than 1.
    """
    # A negative k would silently select "all but the |k| farthest" points
    # through the slice below, so reject it explicitly.
    if k_neighbours < 1:
        raise ValueError("k_neighbours must be a positive integer")

    # Euclidean distance from x to every candidate point, one entry per row.
    distances = [euclidean_distance(x, point) for point in inputs]
    # argsort orders the row indices from nearest to farthest; keep the first k.
    nearest_indices = np.argsort(distances)[:k_neighbours]
    # Look up the labels of those neighbours and take the majority vote.
    neighbour_labels = [labels[index] for index in nearest_indices]
    return most_common(neighbour_labels)

# Function: eval_knn

Function that evaluates the accuracy of the KNN classifier on a dataset.  The dataset to be evaluated consists of (inputs, labels).  The dataset used to find nearest neighbours consists of (train_inputs, train_labels).

Inputs:
*   **inputs**: matrix of input data points to be evaluated (numpy array of N data points x M features)
*   **labels**: vector of target labels for the inputs (numpy array of N labels)
*   **train_inputs**: matrix of input data points in which neighbours will be found (numpy array of N' data points x M features)
*   **train_labels**: vector of labels for the training inputs (numpy array of N' labels)
*   **k_neighbours**: # of nearest neighbours to be used (integer)

Outputs:
*   **accuracy**: fraction of correctly labeled data points, between 0 and 1 (float)




In [6]:
def eval_knn(inputs, labels, train_inputs, train_labels, k_neighbours):
    """Measure the accuracy of the KNN classifier on a labelled dataset.

    Args:
        inputs: data points to classify (N data points x M features).
        labels: target labels for ``inputs`` (N labels; a list, a 1-D array
            or an N x 1 column are all accepted).
        train_inputs: data points in which neighbours are searched
            (N' data points x M features).
        train_labels: labels of ``train_inputs`` (N' labels).
        k_neighbours: number of nearest neighbours to use (integer).

    Returns:
        Fraction of ``inputs`` whose predicted label equals the target label
        (a value between 0 and 1).

    Raises:
        ValueError: if the number of labels differs from the number of inputs.
    """
    predicted_labels = np.asarray(
        [predict_knn(point, train_inputs, train_labels, k_neighbours)
         for point in inputs]
    )
    # Flatten so an (N, 1) column compares element by element instead of
    # broadcasting into an N x N matrix.
    target_labels = np.asarray(labels).ravel()
    if predicted_labels.shape != target_labels.shape:
        raise ValueError("inputs and labels must contain the same number of entries")

    # Both sides are arrays, so == is element-wise. Comparing two Python
    # lists would yield a single bool and an accuracy of 0.0 or 1.0.
    return np.mean(predicted_labels == target_labels)

# Function: cross_validation_knn

This function performs k-fold cross validation to determine the best number of neighbours for KNN.

Inputs:
*   **k_folds**: # of folds in cross-validation (integer)
*   **hyperparameters**: list of hyperparameters where each hyperparameter is a different # of neighbours (list of integers)
*   **inputs**: matrix of data points to be used when searching for neighbours (numpy array of N data points by M features)
*   **labels**: vector of labels associated with the inputs (numpy array of N labels)

Outputs:
*   **best_hyperparam**: best # of neighbours for KNN (integer)
*   **best_accuracy**: accuracy achieved with best_hyperparam (float)
*   **accuracies**: vector of accuracies for the corresponding hyperparameters (numpy array of floats)





In [118]:
def cross_validation_knn(k_folds, hyperparameters, inputs, labels):
    """Choose the number of neighbours for KNN by k-fold cross-validation.

    The data points are shuffled with numpy's global random state and split
    into ``k_folds`` folds of nearly equal size. Each fold is held out once
    as the validation set while the remaining folds serve as the training
    set. The accuracy of every hyperparameter is averaged over the folds.

    Args:
        k_folds: number of folds (integer, between 2 and N).
        hyperparameters: candidate numbers of neighbours (sequence of
            positive integers, any values, in any order).
        inputs: data points (N data points x M features).
        labels: labels of the data points (N labels; 1-D or an N x 1 column).

    Returns:
        Tuple ``(best_hyperparam, best_accuracy, accuracies)``:
        ``accuracies`` is a numpy array holding the mean validation accuracy
        of each entry of ``hyperparameters`` (same order),
        ``best_accuracy`` is its maximum and ``best_hyperparam`` is the
        hyperparameter that achieved it (the earliest one on ties).

    Raises:
        ValueError: if ``hyperparameters`` is empty, if ``inputs`` and
            ``labels`` differ in length, or if ``k_folds`` is not between
            2 and N.
    """
    inputs = np.asarray(inputs)
    labels = np.asarray(labels).ravel()
    hyperparameters = list(hyperparameters)
    num_points = inputs.shape[0]

    if not hyperparameters:
        raise ValueError("hyperparameters must contain at least one value")
    if labels.shape[0] != num_points:
        raise ValueError("inputs and labels must contain the same number of entries")
    if not 2 <= k_folds <= num_points:
        raise ValueError("k_folds must be between 2 and the number of data points")

    # Shuffle row indices instead of the rows themselves: inputs and labels
    # stay aligned without being glued into one matrix first.
    shuffled_indices = np.random.permutation(num_points)
    folds = np.array_split(shuffled_indices, k_folds)

    # fold_accuracies[h, f] = accuracy of hyperparameters[h] on fold f.
    fold_accuracies = np.zeros((len(hyperparameters), k_folds))
    for fold_number, validation_indices in enumerate(folds):
        # Everything except the held-out fold forms the training set.
        training_indices = np.concatenate(
            folds[:fold_number] + folds[fold_number + 1:]
        )
        validation_inputs = inputs[validation_indices]
        validation_labels = labels[validation_indices]
        training_inputs = inputs[training_indices]
        training_labels = labels[training_indices]

        for row, k_neighbours in enumerate(hyperparameters):
            fold_accuracies[row, fold_number] = eval_knn(
                validation_inputs, validation_labels,
                training_inputs, training_labels,
                k_neighbours,
            )

    # Average over all folds; argmax returns the first maximum on ties.
    accuracies = fold_accuracies.mean(axis=1)
    best_index = int(np.argmax(accuracies))
    best_hyperparam = hyperparameters[best_index]
    best_accuracy = accuracies[best_index]
    return best_hyperparam, best_accuracy, accuracies

# Function: plot_knn_accuracies

This function plots the cross-validation accuracy of KNN against the number of neighbours (the hyperparameter).

Inputs:
*   **accuracies**: vector of accuracies for the corresponding hyperparameters (numpy array of floats)
*   **hyperparams**: list of hyperparameters where each hyperparameter is a different # of neighbours (list of integers)


In [7]:
def plot_knn_accuracies(accuracies,hyperparams):
    """Draw accuracy as a function of the number of neighbours and show it.

    Args:
        accuracies: accuracies for the corresponding hyperparameters
            (plotted on the y axis).
        hyperparams: numbers of neighbours (plotted on the x axis).
    """
    plt.plot(hyperparams, accuracies)
    # Label both axes before the figure is displayed.
    plt.xlabel('k neighbours')
    plt.ylabel('accuracy')
    plt.show()