In [2]:

# set up code for this experiment
import numpy as np
import matplotlib.pyplot as plt

%matplotlib inline

# Seed NumPy's global random generator so later np.random draws are repeatable.
np.random.seed(1)

In [3]:
def euclidean_dist(X_test, X_train):
  """
  Pairwise squared Euclidean distances between test and training examples.

  Uses the expansion ||a - b||^2 = ||a||^2 + ||b||^2 - 2 * a.b, evaluated for
  all pairs at once. No square root is taken, so the entries are *squared*
  distances; this keeps the same nearest-neighbor ordering as true distances.

  Inputs:
  - X_test: A numpy array of shape (num_test, dim_feat).
  - X_train: A numpy array of shape (num_train, dim_feat).

  Output:
  - dists: A numpy array of shape (num_test, num_train), where dists[i, j] is
           the squared distance between X_test[i] and X_train[j].
  """
  test_sq_norms = np.sum(X_test ** 2, axis=1, keepdims=True)       # (num_test, 1)
  train_sq_norms = np.sum(X_train ** 2, axis=1, keepdims=True).T   # (1, num_train)
  norm_sums = np.add(test_sq_norms, train_sq_norms)                # broadcasts to (num_test, num_train)
  return norm_sums - 2 * X_test @ X_train.T

def find_k_neighbors(dists, Y_train, k):
  """
  Find the labels of the k nearest neighbors of every test example.

  Inputs:
  - dists: distance matrix of shape (num_test, num_train)
  - Y_train: A numpy array of shape (num_train) containing ground truth labels for training data
  - k: An integer, k nearest neighbors (0 <= k <= num_train)

  Output:
  - neighbors: A numpy array of shape (num_test, k), where each row contains the
               labels of the k nearest neighbors for each test example, ordered
               from nearest to farthest. The array has the same dtype as
               Y_train, so integer or string labels are returned unchanged.

  Raises:
  - ValueError: if k is negative or larger than the number of training examples.
  """
  dists = np.asarray(dists)
  Y_train = np.asarray(Y_train)

  num_train = dists.shape[1]
  if not 0 <= k <= num_train:
    raise ValueError(
        "k must be between 0 and num_train (%d), got %r" % (num_train, k))

  # Sort each row by distance and keep the column indices of the k smallest.
  nearest_idx = dists.argsort(axis=1)[:, :k]

  # Integer-array indexing looks up all labels at once and keeps Y_train's
  # dtype (a pre-allocated np.zeros buffer would force everything to float).
  return Y_train[nearest_idx]

def knn_predict(X_test, X_train, Y_train, k):
  """
  Classify each test example by majority vote among its k nearest training examples.

  Inputs:
  - X_test: A numpy array of shape (num_test, dim_feat) containing test data.
  - X_train: A numpy array of shape (num_train, dim_feat) containing training data.
  - Y_train: A numpy array of shape (num_train) containing ground truth labels for training data
  - k: An integer, k nearest neighbors

  Output:
  - Y_pred: A numpy array of shape (num_test). Predicted labels for the test data.
            Stored with dtype=int, so labels kept as floating-point numbers
            are converted to integers. When several labels are tied for most
            frequent, the smallest label wins.
  """
  n_queries = X_test.shape[0]
  Y_pred = np.zeros(n_queries, dtype=int)

  # Labels of the k closest training points, one row per test example.
  neighbor_labels = find_k_neighbors(euclidean_dist(X_test, X_train), Y_train, k)

  for row in range(n_queries):
    # np.unique returns the distinct labels in sorted order with their counts;
    # argmax picks the first (i.e. smallest) label with the highest count.
    labels, votes = np.unique(neighbor_labels[row], return_counts=True)
    Y_pred[row] = labels[votes.argmax()]

  return Y_pred

def compute_error_rate(ypred, ytrue):
  """
  Compute error rate given the predicted results and true labels.

  Inputs:
  - ypred: array-like of prediction results.
  - ytrue: array-like of true labels.
    ypred and ytrue should be of same length. Lists are accepted, and both
    inputs are flattened first, so a column vector of shape (n, 1) can be
    compared with a flat array of shape (n,).
  Output:
  - error rate: float number indicating the error in percentage
                (i.e., a number between 0 and 100).

  Raises:
  - ValueError: if ypred and ytrue do not contain the same number of elements.
  """
  # Flatten to 1-D so that (n,) vs (n, 1) inputs are compared element by
  # element instead of being broadcast into an (n, n) comparison matrix.
  pred = np.asarray(ypred).ravel()
  true = np.asarray(ytrue).ravel()

  if pred.size != true.size:
    raise ValueError(
        "ypred and ytrue must have the same length, got %d and %d"
        % (pred.size, true.size))

  # Fraction of mismatching positions, expressed as a percentage.
  error_rate = np.mean(pred != true) * 100
  return error_rate

## Lab 3 work ##


In [28]:
def split_nfold(num_examples, n):
  """
  Split the dataset into training sets and validation sets.

  The indices 0 ... num_examples-1 are randomly permuted (using NumPy's global
  random generator) and cut into n consecutive folds of num_examples // n
  indices each; any remainder is added to the last fold. Fold i is used as the
  validation set of split i and all other indices as its training set.

  Inputs:
  - num_examples: Integer, the total number of examples in the dataset
  - n: number of folds (1 <= n <= num_examples)
  Outputs:
  - train_sets: List of lists, where train_sets[i] (i = 0 ... n-1) contains
                the indices of examples for training
  - validation_sets: List of lists, where validation_sets[i] (i = 0 ... n-1)
                contains the indices of examples for validation

  Raises:
  - ValueError: if n is smaller than 1 or larger than num_examples.

  Example:
  When num_examples = 10 and n = 5,
    the output train_sets should be a list of length 5,
    and each element in this list is itself a list of length 8,
    containing 8 indices in 0...9
  For example,
    we can initialize by randomly permuting [0, 1, ..., 9] into, say,
      [9, 5, 3, 0, 8, 4, 2, 1, 6, 7]
    Then we can have
    train_sets[0] = [3, 0, 8, 4, 2, 1, 6, 7],  validation_sets[0] = [9, 5]
    train_sets[1] = [9, 5, 8, 4, 2, 1, 6, 7],  validation_sets[1] = [3, 0]
    train_sets[2] = [9, 5, 3, 0, 2, 1, 6, 7],  validation_sets[2] = [8, 4]
    train_sets[3] = [9, 5, 3, 0, 8, 4, 6, 7],  validation_sets[3] = [2, 1]
    train_sets[4] = [9, 5, 3, 0, 8, 4, 2, 1],  validation_sets[4] = [6, 7]
  Within train_sets[i] and validation_sets[i], the indices do not need to be sorted.
  """
  if n < 1 or n > num_examples:
    raise ValueError(
        "n must satisfy 1 <= n <= num_examples, got n=%r, num_examples=%r"
        % (n, num_examples))

  # *****START OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)*****

  # generate random index list
  idx = np.random.permutation(num_examples).tolist()

  # number of examples in one fold; '//' because we want an integral result
  fold_size = num_examples // n
  train_sets = []
  validation_sets = []

  for i in range(n):
    start = i * fold_size
    # The last fold also takes the remainder when num_examples does not
    # divide evenly by n.
    end = num_examples if i == n - 1 else start + fold_size

    # Validation indices are the slice [start, end) of the permutation;
    # training indices are everything before and after that slice.
    validation_sets.append(idx[start:end])
    train_sets.append(idx[:start] + idx[end:])

  # *****END OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)*****
  return train_sets, validation_sets


# Unit test code here
num_examples = 10
n = 3

# avoid randomness: seed *before* drawing the permutation so this cell prints
# the same split every time it is run, whatever was executed before it
np.random.seed(1)
train_sets, val_sets = split_nfold(num_examples, n)

# The original cell left the global generator freshly seeded on exit; keep
# that so any random draws made after this cell are unchanged.
np.random.seed(1)

print("train set", train_sets)
print("val set", val_sets)

train set [[6, 2, 8, 0, 1, 4, 7], [5, 3, 9, 0, 1, 4, 7], [5, 3, 9, 6, 2, 8]]
val set [[5, 3, 9], [6, 2, 8], [0, 1, 4, 7]]
