# Homework 1

CS 637, Spring 2022

Due 3/3/22

1) Implement an API to create a fully-connected multilayer perceptron (MLP) for a *k*-class classification problem.

2) Test this implementation on MNIST data.

Notation

*  `w`: weights
*  `b`: biases
*  `z`: aggregated values
*  `a`: activated values
*  `x`: input to a neural network
*  `yhat`: final output from the neural network (i.e., *after* softmax if applied)

In [None]:
# Import dependencies
import numpy as np
import matplotlib.pyplot as plt
import time

# 1. API

## 1.1 Layer class

In [None]:
class Layer(object):
  '''
  Represents one layer of a multilayer perceptron (MLP).
  Initializes and updates parameter values (weights (w) and bias (b)) for 
  aggregation within the layer, and caches the aggregated (z) and activated
  (a) values of the most recent forward pass.
  '''

  def __init__(self, in_dim=1, out_dim=1, activ_fun=None, init_with_normal=True, 
               mean=0, var=0.01, min=-0.1, max=0.1, a=None):
    '''
    Constructs a new Layer object.
    @param in_dim: dimension of inputs to this layer
    @param out_dim: dimension of outputs from this layer
    @param activ_fun: activation function applied to this layer's aggregated
    values. None marks an input layer, which gets no parameters.
    @param init_with_normal: true to initialize parameters by sampling from a 
    normal distribution, false to sample from a uniform one
    @param mean: mean to use if initializing with a normal dist
    @param var: variance to use if initializing with a normal dist
    @param min: lower bound to use if initializing with a uniform dist
    @param max: upper bound to use if initializing with a uniform dist
    @param a: array to supply in special cases (i.e., input layer) to 
    explicitly set the activation values
    '''
    self.activ_fun = activ_fun
    self.mean = mean
    self.var = var
    self.w = None
    self.b = None
    self.z = None
    self.a = a

    # Initialize layer's parameters if a non-input layer. Weights are
    # (out_dim, in_dim); the bias is a column of shape (out_dim, 1).
    if activ_fun is not None:
      if init_with_normal:
        self.normal_init(out_dim, in_dim, self.mean, self.var) # weights
        self.normal_init(out_dim, 1, self.mean, self.var, True) # bias
      else:
        self.random_init(out_dim, in_dim, min, max) # weights
        self.random_init(out_dim, 1, min, max, is_bias=True) # bias

  def normal_init(self, out_dim, in_dim, mean=0.0, var=0.01, is_bias=False):
    '''
    Initialize parameter values by sampling from a normal distribution.
    @param out_dim: number of rows of the sampled array
    @param in_dim: number of columns of the sampled array
    @param mean: mean of the normal distribution
    @param var: variance (not standard deviation) of the normal distribution
    @param is_bias: true to store the sample as the bias, false to store it
    as the weights
    @raise ValueError: if var is negative
    '''
    # NOTE: Kaiming (fan-in) initialization for an unset variance is not
    # implemented; a numeric variance must always be supplied.
    if var < 0:
      raise ValueError('var must be non-negative, got {}'.format(var))

    # np.random.normal takes a standard deviation, hence the square root
    sample = np.random.normal(mean, var**0.5, (out_dim, in_dim))
    if is_bias:
      self.b = sample
    else:
      self.w = sample

  def random_init(self, out_dim, in_dim, min=-0.1, max=0.1, is_bias=False):
    '''
    Initialize parameter values by sampling from a uniform distribution.
    @param out_dim: number of rows of the sampled array
    @param in_dim: number of columns of the sampled array
    @param min: lower bound of the uniform distribution
    @param max: upper bound of the uniform distribution
    @param is_bias: true to store the sample as the bias, false to store it
    as the weights
    '''
    sample = np.random.uniform(min, max, size=(out_dim, in_dim))
    if is_bias:
      self.b = sample
    else:
      self.w = sample

  def zero_bias(self):
    '''
    Set layer biases to zero.
    '''
    self.b = np.zeros(self.b.shape)
  
  def forward(self, input, batch_size=None):
    '''
    Pass inputs forward through the layer, storing the aggregated values in
    self.z and the activated values in self.a.
    @param input: activations of the previous layer, one column per example
    @param batch_size: kept for backward compatibility and ignored; the
    number of examples is read from the aggregated values so that a final,
    smaller batch is handled correctly
    @raise ArithmeticError: if any aggregated value is infinite
    '''
    # Aggregate
    self.z = self.w @ input + self.b
    if np.isinf(self.z).any(): 
      raise ArithmeticError('Possibly exploding gradient')

    # Activate, keeping one column per example actually present in the batch
    self.a = (self.activ_fun(self.z)).reshape(-1, self.z.shape[-1])
  
  def adjust_weight(self, prev_a, grad_chain, lr, batch_size):
    '''
    Update weight values in place using the backpropagated gradient.
    @param prev_a: activation of previous layer
    @param grad_chain: backpropagated chain of derivatives so far
    @param lr: learning rate
    @param batch_size: number of examples used to normalize the update
    '''
    # Normalize magnitude of adjustment with batch size
    self.w -= lr/batch_size * (grad_chain @ prev_a.T)
  
  def adjust_bias(self, grad_chain, lr, batch_size):
    '''
    Update bias values in place using the backpropagated gradient.
    @param grad_chain: backpropagated chain of derivatives so far
    @param lr: learning rate
    @param batch_size: number of examples used to normalize the update
    '''
    # Derivative of an aggregate wrt bias (dz/db) = 1, so don't need the 
    # activation values of the previous layer; sum the gradient over examples
    self.b -= lr/batch_size * np.sum(grad_chain, axis=1, keepdims=True)

## 1.2 Perceptron class

In [None]:
class Perceptron(object):
  '''
  Represents a multi-layer perceptron (MLP) for processing input data. Made of
  an aggregate of Layer objects.
  '''

  def __init__(self, dims, activ_fns, loss_fn,
               init_with_normal=True, mean=0.0, var=0.01, min=-0.1, max=0.1):
    '''
    Constructs a new Perceptron object.

    @param dims: array of ints, each representing the dimensionality of layer
    of the same index. First int is number of input features; last int
    is number of output neurons.
    @param activ_fns: list or tuple of methods from the Activation class to 
    apply on each hidden layer and the output layer
    @param loss_fn: loss function to measure model performance
    @param init_with_normal: true to initialize parameters by sampling from a 
    normal distribution, false to sample from a uniform one
    @param mean: mean to use if initializing with a normal distribution
    @param var: variance to use if initializing with a normal distribution
    @param min: minimum bound if initializing with a uniform distribution
    @param max: maximum bound if initializing with a uniform distribution
    @raise ValueError: if fewer than two layers are described or the number
    of activation functions is not len(dims) - 1
    '''
    # Ensure Perceptron has at least input and output layers
    if len(dims) < 2:
      raise ValueError('dims must describe at least an input and an output '
                       'layer')

    # Ensure amount of activation functions given matches number of layers - 1
    if len(dims) != len(activ_fns) + 1:
      raise ValueError('Expected {} activation functions, got {}'.format(
          len(dims) - 1, len(activ_fns)))
    
    self.dims = dims
    self.n_layers = len(dims)
    self.activ_fns = activ_fns
    self.loss_fn = loss_fn
    self.reset_layers(init_with_normal, mean, var, min, max)

  def reset_layers(self, init_with_normal=True, mean=0.0, var=0.01, 
                   min=-0.1, max=0.1):  
    '''
    (Re)initialize parameters in the MLP, discarding any previous layers.
    Arguments have the same meaning as in the constructor.
    '''
    # Index 0 is reserved for the input layer, which forward() fills in
    self.layers = [None] * len(self.dims)

    # For each layer besides the input layer, attach the appropriate dimensions, 
    # activation function, and initialization parameters
    for i in range(self.n_layers - 1):
      self.layers[i + 1] = Layer(
          in_dim=self.dims[i], out_dim=self.dims[i+1],
          activ_fun=self.activ_fns[i], 
          init_with_normal=init_with_normal,
          mean=mean, var=var, min=min, max=max)
  
  def zero_biases(self):
    '''
    Set all biases in the MLP to zero.
    '''
    for i in range(1, self.n_layers):
      self.layers[i].zero_bias()

  def __getitem__(self, layer_number):
    '''
    Retrieves reference to a layer in the MLP.

    @param layer_number: int index of layer to access.
    @return: Layer object at layer_number
    '''
    return self.layers[layer_number]
  
  def forward(self, X, batch_size=None):
    '''
    Step forward through the MLP, aggregating the previous layer's values and
    then applying the chosen activation function.

    @param X: input data to feed through the network, one column per example
    @param batch_size: kept for backward compatibility and ignored; the
    number of columns of X is used so a smaller final batch works
    @raise ArithmeticError: if any activation becomes NaN
    '''
    # Set the first layer (the input layer)'s "activation" to the input data
    self.layers[0] = Layer(a=X)
    n_examples = np.shape(X)[-1]

    # Pass through each layer using inputs from previous layer
    for i in range(1, self.n_layers):
      self.layers[i].forward(self.layers[i-1].a, n_examples)
      if np.isnan(self.layers[i].a).any():
        raise ArithmeticError('Activation NaN during forward pass!!!')
  
  def backward(self, labels, lr, batch_size):
    '''
    Backpropagates from loss and adjust parameters after a forward pass.
    
    @param labels: true labels from the input data
    @param lr: learning rate
    @param batch_size: number of examples used to normalize each update
    @raise ArithmeticError: if the backpropagated gradient becomes NaN or
    infinite
    '''
    # Calculate derivative of loss wrt logits
    grad_chain = self.loss_fn(self.layers[-1].a, labels, derive=True)

    for i in range(self.n_layers - 1, 0, -1):
      layer = self.layers[i]
      prev_layer = self.layers[i-1]
      reached_input = prev_layer.activ_fun is None

      # Compute dloss/da of the previous layer BEFORE touching this layer's
      # weights: the update below is in place, so the weights would otherwise
      # already be the adjusted ones
      upstream = None if reached_input else layer.w.T @ grad_chain

      # Adjust parameters
      layer.adjust_weight(prev_layer.a, grad_chain, lr, batch_size)
      layer.adjust_bias(grad_chain, lr, batch_size)

      # Nothing to propagate into once the input layer is hit
      if reached_input:
        break

      # Compute dloss/dz of previous layer using elementwise multiplication.
      # The activation derivative is taken at the aggregated values z, which
      # is what every derive=True branch in Activation expects.
      grad_chain = upstream * prev_layer.activ_fun(prev_layer.z, derive=True)
      if np.isnan(grad_chain).any() or np.isinf(grad_chain).any():
        raise ArithmeticError('Exploded grad_chain!!!')
  
  def train(self, iterator, lr, batch_size, train_mode=True):
    '''
    Run a single epoch over the batches of an iterator.
    @param iterator: iterable yielding (X, y) batches to loop through
    @param lr: learning rate
    @param batch_size: nominal batch size, kept for interface compatibility;
    each batch is normalized by the number of examples it actually holds
    @param train_mode: true to backpropagate and update parameters, false to
    only measure loss and accuracy
    @return: tuple of (loss, accuracy), each averaged over the batches seen
    @raise ValueError: if the iterator yields no batches
    '''
    loss = 0.0
    acc = 0.0
    n_batches = 0
    for X, y in iterator:
      n_examples = np.shape(X)[-1]
      self.forward(X, n_examples)

      if train_mode: # Adjust parameters only during training
        self.backward(y, lr, n_examples)

      # Update accumulated measures (from the activations of the forward
      # pass above, i.e., before this batch's parameter update)
      loss += self.loss_fn(self[-1].a, y)
      acc += Loss.accuracy(self[-1].a, y)
      n_batches += 1

    if n_batches == 0:
      raise ValueError('iterator produced no batches')

    # Average over the number of batches done
    return loss/n_batches, acc/n_batches

## 1.3 Activation and loss functions

In [None]:
class Activation(object):
  '''
  Common activation functions used in a MLP.
  '''
  @staticmethod
  def relu(x, derive=False):
    '''
    Applies ReLU or its derivative on an input.
    @param x: input data; left unmodified
    @param derive: true to return the derivative instead of the activation
    @return: max(x, 0) elementwise, or an int array that is 1 where x > 0
    and 0 elsewhere when derive is true
    '''
    if not derive:
      # Return a new array: writing the result back into x would silently
      # overwrite the caller's aggregated values and fails for integer input
      return np.maximum(x, 0.)
    return (np.asarray(x) > 0).astype(int)
  
  @staticmethod
  def sigmoid(x, derive=False):
    '''
    Applies sigmoid or its derivative on an input.
    @param x: input data
    @param derive: true to return the derivative instead of the activation
    @return: 1 / (1 + exp(-x)) elementwise, or sigmoid(x) * (1 - sigmoid(x))
    when derive is true
    '''
    # log(1 + exp(-x)) computed via logaddexp never overflows, unlike exp(-x)
    # for very negative x
    activated = np.exp(-np.logaddexp(0., -np.asarray(x, dtype=float)))
    if not derive:
      return activated
    return activated * (1 - activated)

  @staticmethod
  def softmax(x):
    '''
    Applies softmax on an input.
    @param x: input data, one column per example
    @return: normalized values of the input data so sum of a given example = 1
    '''
    # Prevent numerical overflows with the shifting trick. The shift is taken
    # per example (column): shifting by one global maximum lets a column whose
    # values are all far below it underflow to zeros and normalize to NaN.
    shift = np.max(x, axis=0, keepdims=True)
    exp_x = np.exp(x - shift)
    return exp_x / np.sum(exp_x, axis=0)

In [None]:
class Loss(object):
  '''
  Loss functions (and their gradients) plus an accuracy measure for a MLP.
  Predictions and one-hot labels hold one column per example.
  '''
  @staticmethod
  def cross_entropy(yhat, y, epsilon=1e-5, derive=False):
    '''
    Computes the cross-entropy loss or its gradient with respect to logits for 
    a batch of predictions.
    @param yhat: final outputs (after activation) from a MLP
    @param y: actual labels to compare output with
    @param epsilon: small positive value to prevent taking log of 0
    @param derive: true to return the gradient (yhat - y) instead of the loss
    @return: loss averaged over the examples, or the per-example gradient
    '''
    if derive:
      return yhat - y

    # Squash for EACH example by summing before averaging - 
    # erroneously many zeros otherwise!
    per_example = np.sum(-y * np.log(yhat + epsilon), axis=0)
    return np.average(per_example)

  @staticmethod
  def hinge_loss(yhat, y, margin=1, get_dist=False, derive=False):
    '''
    Computes the hinge loss or its derivative for a batch of predictions.
    For each example the loss is the sum, over the non-target classes, of
    max(0, yhat_class - yhat_target + margin).
    @param yhat: final outputs (after activation) from a MLP
    @param y: actual labels to compare output with, in one-hot form
    @param margin: desired margin for the target class value to overtake
    @param get_dist: true to return raw distances for each sample, false to
    return actual value of hinge loss
    @param derive: true to return the gradient with respect to yhat
    @return: raw distances, the loss averaged over the examples, or the
    per-example gradient, depending on get_dist and derive
    '''
    # Output of the target class for each example (one value per column)
    y_target = np.sum(np.where(y == 1, yhat, 0), axis=0)

    # Distance of every non-target output from the target output, plus the
    # margin; the target class itself contributes nothing
    dist = np.where(y == 1, 0, yhat - y_target + margin)

    # If need raw distances before thresholding/averaging:
    if get_dist:
      return dist

    violated = dist > 0
    if derive:
      # Each violating class pushes its own output down (+1) and the target
      # output up (-1 per violating class). Not normalized by the number of
      # examples, matching the cross-entropy gradient.
      return np.where(y == 1, -np.sum(violated, axis=0), violated).astype(float)

    # Sum the positive distances within each example, then average over the
    # examples so the value does not grow with the batch size
    return np.average(np.sum(np.where(violated, dist, 0), axis=0))
  
  @staticmethod
  def accuracy(yhat, y, y_one_hot=True):
    '''
    Computes the accuracy for a batch of outputs.
    @param yhat: final outputs (after activation) from a MLP
    @param y: actual labels to compare output with
    @param y_one_hot: true if y given in one-hot form, false if given as a list 
    of correct indexes
    @return: fraction of examples whose highest output is the true class
    '''
    # Prediction class is the one with max predicted probability in each example
    yhat_choices = np.argmax(yhat, axis=0)

    # Reverse the actual labels' representation as one-hot for easier comparison
    if y_one_hot:
      y = Utility.reverse_one_hot(y)

    # Compute the average accuracy of the batch
    return np.average(yhat_choices == y)

In [None]:
# TEMP: scratch inputs for experimenting with hinge loss
# Random scores for 4 examples over 3 classes, one column per example
yhat = np.random.normal(size=(4, 3)).T

# One-hot targets: examples 0..3 belong to classes 0, 2, 1, 0 respectively
y = np.zeros((3, 4))
y[[0, 2, 1, 0], [0, 1, 2, 3]] = 1
yhat, y

(array([[ 1.10037243, -0.70922258,  0.39471252, -0.01248606],
        [-2.00038539, -1.70548285, -0.27780035, -0.37525978],
        [-0.45434826,  1.44905675, -0.82113996, -0.01582385]]),
 array([[1., 0., 0., 1.],
        [0., 0., 1., 0.],
        [0., 1., 0., 0.]]))

In [None]:
# TEMP: sketch of a hinge-loss derivative from the raw distances.
# Non-target entries flag a violated margin; target entries count the
# violations in their example.
# NOTE(review): the target entry is a positive count here, whereas the
# gradient wrt the target output is minus that count - confirm before reuse.
a = Loss.hinge_loss(yhat, y, get_dist=True)
is_violated = a > 0
maybe_deriv = np.where(y == 1, np.sum(is_violated, axis=0), is_violated)
a, maybe_deriv

(array([[ 0.        , -1.15827934,  1.67251288,  0.        ],
        [-2.10075782, -2.15453961,  0.        ,  0.63722628],
        [-0.55472069,  0.        ,  0.45666039,  0.99666221]]),
 array([[0, 0, 1, 2],
        [0, 0, 2, 1],
        [0, 0, 1, 1]]))

## 1.4 Utilities class

In [None]:
class Utility(object):
  '''
  Miscellaneous methods for wrapping and plotting.
  '''
  @staticmethod
  def train_validate(mlp, dataset, lr, epochs, batch_size,
                     shuffle_every_epoch=True, verbose=True):
    '''
    Train and validate in parallel over one or more epochs.
    @param mlp: Perceptron object with parameters initialized
    @param dataset: object whose make_batches(batch_size, group=...,
    shuffle_again=...) yields (X, y) batches for the 'train' and 'valid' groups
    @param lr: learning rate
    @param epochs: number of epochs to run; must be positive
    @param batch_size: number of examples per batch
    @param shuffle_every_epoch: true to reshuffle each group every epoch
    @param verbose: true to print the measures of every epoch
    @return: tuple of arrays (train_loss, train_acc, valid_loss, valid_acc),
    each with one entry per epoch
    @raise ValueError: if epochs is not positive
    '''
    if epochs <= 0:
      raise ValueError('epochs must be a positive integer')
    train_loss = np.empty(epochs)
    train_acc = np.empty(epochs)
    valid_loss = np.empty(epochs)
    valid_acc = np.empty(epochs)
    start_time = time.time()
    if verbose:
      print('Epoch \tTrain_loss \tTrain_acc \tValid_loss \tValid_acc')

    for epoch in range(epochs):
      # Make iterators
      train_iter = dataset.make_batches(batch_size, group='train', 
                                        shuffle_again=shuffle_every_epoch)
      valid_iter = dataset.make_batches(batch_size, group='valid',
                                        shuffle_again=shuffle_every_epoch)

      # Train and validate - validate "previous" model before updating
      valid_loss[epoch], valid_acc[epoch] = mlp.train(valid_iter, lr=lr,
                                                      batch_size=batch_size,
                                                      train_mode=False)
      train_loss[epoch], train_acc[epoch] = mlp.train(train_iter, lr=lr,
                                                      batch_size=batch_size,
                                                      train_mode=True)
      
      # If set to print out info as executing
      if verbose:
        print('{} \t{:9.3f} \t{:9.3f} \t {:9.3f} \t{:9.3f}'.format(
            epoch, train_loss[epoch], train_acc[epoch], 
            valid_loss[epoch], valid_acc[epoch]))

    # Print amount of time taken
    print('Time elapsed (s): {:2.1f}'.format(time.time() - start_time))

    return train_loss, train_acc, valid_loss, valid_acc
  
  @staticmethod
  def make_one_hot(y, num_classes=None):
    '''
    Make a list/collection of labels into a one-hot representation.
    @param y: list/collection of integer labels to convert
    @param num_classes: number of classes (rows) of the result; defaults to
    the largest label + 1
    @return transformed array in one-hot representation, with one row per
    class and one column per label
    '''
    # Accept plain lists as well as arrays, and index with a flat int array
    labels = np.asarray(y).astype(int).ravel()
    if num_classes is None:
      num_classes = labels.max() + 1
    transformed = np.zeros((labels.size, num_classes))
    transformed[np.arange(labels.size), labels] = 1
    return transformed.T

  @staticmethod
  def reverse_one_hot(y):
    '''
    Convert a one-hot array back to a simple list of indices.
    @param y: one-hot array to convert, one column per label
    @return: transformed array in list representation
    '''
    return np.argmax(y, axis=0)

  @staticmethod
  def plot_images(dataset, y, num_images=3):
    '''
    Plot randomly chosen examples of a dataset as square images, using the
    flattened representation of each image stored in a column of dataset.X.
    @param dataset: object whose X attribute holds one flattened square image
    per column
    @param y: one-hot labels, one column per example of dataset.X
    @param num_images: number of images to draw
    '''
    # Get random sample of dataset
    indexes = np.random.randint(0, dataset.X.shape[-1], num_images)

    # Lay the images out on a square grid, at least 3 x 3
    grid = max(3, int(np.ceil(num_images**0.5)))

    # Display data as images
    width = int(dataset.X.shape[0]**0.5)
    for position, i in enumerate(indexes, start=1):
      # The subplot slot is the running position, not the dataset index
      plt.subplot(grid, grid, position)
      plt.axis('off')
      plt.imshow(dataset.X[:, i].reshape(width, width), 
                 cmap=plt.get_cmap('gray'))
      
      # Title each image with its true label
      plt.title(str(Utility.reverse_one_hot(y[:, i])))

      # TODO: title each image with its predicted label, if any
  
  @staticmethod
  def plot_results(*results, labels=None, fmts=None, xlabel='Epoch', ylabel='',
                   xticks=None, ymax=None, title=''):
    '''
    Plot one or more series of results against their index on shared axes.
    @param results: arrays to plot, one line per array
    @param labels: legend label of each array; defaults to its position
    @param fmts: matplotlib format string of each array; defaults to ''
    @param xlabel: label of the x axis
    @param ylabel: label of the y axis
    @param xticks: tick positions for the x axis; True puts one tick at every
    index of the first array; None leaves matplotlib's default
    @param ymax: upper limit of the y axis (lower limit is 0); None leaves
    matplotlib's default
    @param title: title of the plot
    '''
    # Account for arguments not given
    if labels is None: 
      labels = [str(k) for k in range(len(results))]
    if fmts is None:
      fmts = [''] * len(results)

    # Plot each set of results with its corresponding label and format
    for result, label, fmt in zip(results, labels, fmts):
      plt.plot(np.arange(result.shape[0]), result, fmt, label=label)

    # Parameters affecting entire plot
    plt.legend()
    plt.xlabel(xlabel)
    plt.ylabel(ylabel)
    plt.title(title)
    if isinstance(xticks, bool):
      if xticks:
        plt.xticks(np.arange(results[0].shape[0]))
    elif xticks is not None:
      plt.xticks(xticks)
    if ymax is not None:
      plt.ylim(0, ymax)

## 1.5 Dataset class

In [None]:
class Dataset(object):
  '''
  Represents a set of data with methods to reshape, shuffle, split, and batch.
  Once shaped, X and y hold one column per example.
  '''
  def __init__(self, X, y, make_y_one_hot=True):
    '''
    Constructs a Dataset object.
    @param X: input data
    @param y: labels corresponding to input data
    @param make_y_one_hot: true to convert the labels into a one-hot 
    representation, if not done already; false to skip this step
    '''
    self.X = X
    self.y = y
    if make_y_one_hot:
      self.y = Utility.make_one_hot(self.y)

  def shape(self, features, categories, flatten_X=True):
    '''
    Reshapes the input and output data.
    @param features: number of features in the input data. Set to width 
    if a 2D image that will be flattened.
    @param categories: number of rows to give the label array
    @param flatten_X: true to flatten each square 2D image into a column
    '''
    if flatten_X: 
      # Make 2D images into an array of n x features^2, where n is the total
      # number of examples, then transpose to one column per example
      self.X = self.X.reshape(-1, features*features).T
    else:
      # NOTE(review): a plain reshape assumes X is already laid out feature
      # by feature; confirm against the caller's data layout
      self.X = self.X.reshape(features, -1)
    self.y = self.y.reshape(categories, -1)

  def shuffle(self):
    '''
    Shuffles both X and y together in the dataset.
    @raise ValueError: if X and y hold different numbers of examples
    '''
    if self.X.shape[-1] != self.y.shape[-1]:
      raise ValueError('X and y must hold the same number of examples')
    shuffled_indexes = np.random.permutation(self.X.shape[-1])
    self.X = self.X[:, shuffled_indexes]
    self.y = self.y[:, shuffled_indexes]

  def divide(self, p_train=70, p_valid=15, p_test=15):
    '''
    Divide the loaded data into sets for training, validation, and testing.
    @param p_train: percentage of dataset to allot for training
    @param p_valid: percentage of dataset to allot for validation
    @param p_test: percentage of dataset to allot for testing
    @raise ValueError: if the percentages do not sum up to 100
    '''
    # Tolerate floating-point error in fractional percentages
    if not np.isclose(p_train + p_valid + p_test, 100):
      raise ValueError("Error: percentages don't sum up to 100!")
    n_examples = self.X.shape[-1]
    self.n_train = int(p_train * 0.01 * n_examples)
    self.n_valid = int(p_valid * 0.01 * n_examples)
    self.n_test = int(p_test * 0.01 * n_examples)

    # If missing a few examples from cutoffs, add to training set
    leftover = n_examples - (self.n_train + self.n_valid + self.n_test)
    if leftover > 0:
      self.n_train += leftover

    # Define groups
    valid_end = self.n_train + self.n_valid
    self.X_train = self.X[:, 0:self.n_train]
    self.y_train = self.y[:, 0:self.n_train]
    self.X_valid = self.X[:, self.n_train:valid_end]
    self.y_valid = self.y[:, self.n_train:valid_end]
    self.X_test = self.X[:, valid_end:]
    self.y_test = self.y[:, valid_end:]
    
  def make_batches(self, batch_size, group='train', scale_X=True,
                   shuffle_again=False):
    '''
    Make batches given a specified size and a selected group. Arguments are
    validated (and the group shuffled) when this method is called; the
    batches themselves are produced lazily.
    @param batch_size: int of batch size; must be positive
    @param group: string equal to train, valid, or test to select group
    @param scale_X: true to divide X by the largest absolute value found in
    the whole dataset, so that non-negative data ends up between 0 and 1 with
    the same scale in every batch and group
    @param shuffle_again: true to shuffle the examples of the group first
    @return: generator of (X_batch, y_batch) tuples
    @raise ValueError: if group is unknown or batch_size is not positive
    '''
    if batch_size <= 0:
      raise ValueError('batch_size must be a positive integer')

    # Get the training, testing, or validation group:
    if group == 'train':
      X_select, y_select = self.X_train, self.y_train
    elif group == 'valid':
      X_select, y_select = self.X_valid, self.y_valid
    elif group == 'test':
      X_select, y_select = self.X_test, self.y_test
    else:
      raise ValueError('Incorrect argument for group! Choose between train, '
                       'valid, or test.')

    # Shuffle within the group, if requested
    if shuffle_again:
      shuffled_indexes = np.random.permutation(X_select.shape[-1])
      X_select = X_select[:, shuffled_indexes]
      y_select = y_select[:, shuffled_indexes]

    # One scale for the whole dataset; a per-batch norm would make the input
    # magnitude depend on the size and content of each batch
    scale = None
    if scale_X:
      scale = 1.0
      if self.X.size > 0:
        scale = max(abs(float(self.X.max())), abs(float(self.X.min())))
      if scale == 0: # All-zero data: nothing to rescale
        scale = 1.0

    return Dataset._iter_batches(X_select, y_select, batch_size, scale)

  @staticmethod
  def _iter_batches(X, y, batch_size, scale):
    '''
    Yield consecutive (X, y) column slices of at most batch_size examples,
    dividing X by scale unless scale is None.
    '''
    for start in range(0, X.shape[-1], batch_size):
      stop = start + batch_size
      X_batch = X[:, start:stop]
      if scale is not None:
        X_batch = X_batch / scale
      yield X_batch, y[:, start:stop]

# 2. Testing the API

## 2.1 Import the MNIST dataset
*   Permitted to use other packages for just downloading the data, so use `keras` to download.
*   Merge existing training and test sets and resplit, since splitting must be done by this homework's API.



In [None]:
from keras.datasets import mnist

# keras returns the official train/test split as two (features, labels) pairs
train_split, test_split = mnist.load_data()
train_X, train_y = train_split
test_X, test_y = test_split
print(*(arr.shape for arr in (train_X, train_y, test_X, test_y)))

Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/mnist.npz
(60000, 28, 28) (60000,) (10000, 28, 28) (10000,)


In [None]:
# Stack the official train and test splits along the example axis (axis 0)
# so the whole set can be re-split by this homework's API
mnist_all_X = np.concatenate([train_X, test_X])
mnist_all_y = np.concatenate([train_y, test_y])
print('MNIST features shape:', mnist_all_X.shape, 
      '\nMNIST label shape:', mnist_all_y.shape)

MNIST features shape: (70000, 28, 28) 
MNIST label shape: (70000,)


## 2.2 Preprocessing

In [None]:
# Wrap the merged data, converting the labels to one-hot form
mnist_data = Dataset(mnist_all_X, mnist_all_y, make_y_one_hot=True)

# Flatten each 28 x 28 image into a column; labels get one row per digit class
width, categories = 28, 10
mnist_data.shape(features=width, categories=categories, flatten_X=True)
print("Feature and label shapes:", mnist_data.X.shape, mnist_data.y.shape)

# Shuffle features and labels together
# TODO: plot the first few examples before and after to show the shuffle
mnist_data.shuffle()

# Split into training, validation, and testing groups (default percentages)
mnist_data.divide()
split_sizes = (mnist_data.n_train, mnist_data.n_valid, mnist_data.n_test)
print('Splits on training, validation, & testing:', *split_sizes)

Feature and label shapes: (784, 70000) (10, 70000)
Splits on training, validation, & testing: 49000 10500 10500


## Training and validation

### Varying the learning rate

In [None]:
# Scratch check: dividing a vector by its Euclidean norm gives unit length
test_grad = np.arange(5)
grad_norm = np.linalg.norm(test_grad)
clipped = test_grad / grad_norm
clipped, np.linalg.norm(clipped)

(array([0.        , 0.18257419, 0.36514837, 0.54772256, 0.73029674]),
 0.9999999999999999)

In [None]:
# Make a generic Perceptron object; its parameters are reset before each run
p = Perceptron((width*width, 8, 10),
               (Activation.relu, Activation.softmax),
               Loss.cross_entropy)

# Learning rates to compare, in the order the plots below label them
learning_rates = (0.002, 0.001, 5e-4, 1e-4)

# One (train_loss, train_acc, valid_loss, valid_acc) tuple per learning rate.
# The same batch size is used to build the batches and to normalize updates.
lr_results = []
for lr in learning_rates:
  p.reset_layers()
  lr_results.append(Utility.train_validate(p, mnist_data, lr=lr, epochs=25,
                                           batch_size=100, verbose=False))

31.838093005090435
30.02463005259264
29.16313002990333
29.775236570482406
28.185288637096967
27.16190358803683
29.002209478138212
27.83267842695235
26.730222937579107
26.761126415134775
28.653074874685796
24.803088000115267
24.722400350658265
27.181333832647454
23.820205340229627
26.16699825841436
24.824606112023744
25.162803009539477
25.753990240400146
24.38535414208958
26.1073703315452
23.737548735810783
23.231364684920443
23.781436752490016
23.896371900607743
23.39384361996373
24.53078782684188
23.304553120823154
23.808165094093564
25.032569982655538
24.175982597703847
23.239199582962023
23.588747819376575
23.52168170896497
23.928043073398122
23.322155495035123
24.08728451545279
23.914471172527875
23.608083044705705
23.08009192668867
22.928932785214037
23.14825264807493
23.97275998922659
23.297456376403165
23.454190814885706
23.18215704452139
22.741730433702106
23.29169955938186
23.20854165638695
23.156947865956845
23.590949929696915
22.99183742481079
23.168401017103587
23.223120702

(23.215121488189414, 0.11708163265306136)

In [None]:
# Training-loss curve (entry 0 of each result) for the four learning rates
train_loss_curves = [lr_results[k][0] for k in range(4)]
Utility.plot_results(*train_loss_curves,
                     labels=('lr=0.002', 'lr=0.001', 'lr=5e-4', 'lr=1e-4'),
                     ylabel='Cross-entropy loss',
                     title='Effect of learning rate on training loss')

In [None]:
# Training-accuracy curve (entry 1 of each result) for the four learning rates
train_acc_curves = [lr_results[k][1] for k in range(4)]
Utility.plot_results(*train_acc_curves,
                     labels=('lr=0.002', 'lr=0.001', 'lr=5e-4', 'lr=1e-4'),
                     ylabel='Accuracy', ymax=1.0,
                     title='Effect of learning rate on training accuracy')

### Changing the architecture

*   Number of hidden layers
*   Number of neurons per layer



In [None]:
# MLP with 2 hidden layers
p = Perceptron((width*width, 128, 64, 10),
               (Activation.relu, Activation.relu, Activation.softmax),
               Loss.cross_entropy)

# Run, store results in variable. The batch size belongs to train_validate;
# the Perceptron constructor does not take one.
r2 = Utility.train_validate(p, mnist_data, lr=5e-4, epochs=25, batch_size=100,
                            verbose=True)

# Plot results: r2 holds (train_loss, train_acc, valid_loss, valid_acc)
Utility.plot_results(r2[0], r2[1], r2[2], r2[3],
                     labels=('Training loss', 'Training accuracy', 
                             'Validation loss', 'Validation accuracy'),
                     fmts=('', '', 'o', 'o'), ymax=1.0,
                     title='MLP with two hidden layers')

In [None]:
# MLP with 3 hidden layers
p = Perceptron((width*width, 128, 64, 32, 10),
               (Activation.relu, Activation.relu, Activation.relu, Activation.softmax),
               Loss.cross_entropy)

# The batch size belongs to train_validate; the Perceptron constructor does
# not take one
r3 = Utility.train_validate(p, mnist_data, lr=5e-4, epochs=25, batch_size=100,
                            verbose=True)

# r3 holds (train_loss, train_acc, valid_loss, valid_acc)
Utility.plot_results(r3[0], r3[1], r3[2], r3[3],
                     labels=('Training loss', 'Training accuracy', 
                             'Validation loss', 'Validation accuracy'),
                     fmts=('', '', 'o', 'o'), ymax=1.0,
                     title='MLP with three hidden layers')

In [None]:
# MLP with 2 hidden layers, fewer neurons
mlp = Perceptron((width*width, 16, 8, 10),
                 (Activation.relu, Activation.relu, Activation.softmax),
                 Loss.cross_entropy)

# The batch size belongs to train_validate; the Perceptron constructor does
# not take one
r2b = Utility.train_validate(mlp, mnist_data, lr=2e-3, epochs=25,
                             batch_size=100, verbose=True)

# r2b holds (train_loss, train_acc, valid_loss, valid_acc)
Utility.plot_results(r2b[0], r2b[1], r2b[2], r2b[3],
                     labels=('Training loss', 'Training accuracy', 
                             'Validation loss', 'Validation accuracy'),
                     fmts=('', '', 'o', 'o'), ymax=1.0,
                     title='MLP with two hidden layers, fewer neurons')

An MLP with fewer neurons in its hidden layers is cheaper to train, but it ends up with a higher loss and a lower accuracy.

### Selecting activation functions

### Changing loss function

### Changing how parameters are initialized

# Miscellaneous tasks

Toy tests to manually check basic functionality

Hinge loss examples from the 1/20/22 lecture:

In [None]:
# 3-class example: score one output vector against each possible target class
ex1 = np.array([-3.7, 5, 7])
one_hot_targets = np.eye(3, dtype=int)

# Losses in the order the message reports them: target class 3, then 2, then 1
losses = [Loss.hinge_loss(ex1, one_hot_targets[k]) for k in (2, 1, 0)]
print('Target class is 3, loss={:0.1f} \tif 2, {:0.1f} \tif 1, {:0.1f}'.format(
    *losses))

Target class is 3, loss=0.0 	if 2, 3.0 	if 1, 21.4


In [None]:
# 4-class example with the second class as the target
ex2 = np.array([2.5, 2.0, 2.7, 1.7])
target = np.array([0, 1, 0, 0])
loss_for_class_2 = Loss.hinge_loss(ex2, target)
print('Target class is 2: {:0.1f}'.format(loss_for_class_2))

Target class is 2: 3.9


Compute accuracy on two examples, one correct and one not:

In [None]:
# Two examples (columns) over three classes (rows); both true labels are
# class 1, but only the first example predicts it
yhat = np.array([[0.2, 0.7],
                 [0.5, 0.2],
                 [0.3, 0.1]])
y = np.array([[0, 0],
              [1, 1],
              [0, 0]])
Loss.accuracy(yhat, y)

0.5

Informal problem given at the end of class, 1/25/22:

In [None]:
# 4-3-2-4 network with hand-set parameters so the forward pass can be checked
toy_activations = (Activation.relu, Activation.sigmoid, Activation.softmax)
p = Perceptron((4, 3, 2, 4), toy_activations, Loss.cross_entropy,
               init_with_normal=True)

# Rectangular identity weights: each layer copies the leading entries of its
# input and zero-fills any extra outputs
p[1].w = np.eye(3, 4)
p[2].w = np.eye(2, 3)
p[3].w = np.eye(4, 2)
p.zero_biases()

# Check forward path
x_toy = np.array([[1], [0], [1], [0]])
p.forward(x_toy, batch_size=1)
for i in range(1, len(p.dims)):
  layer = p[i]
  print('Layer', i, '\nz =\n', layer.z, '\na =\n', layer.a, '\n')

In [None]:
# Check backpropagation against a dummy one-hot label (class 1 of 4)
y = np.array([[0.], [1.], [0.], [0.]])
print('p[1].w before backprop\n', p[1].w)
p.backward(y, lr=0.5, batch_size=1)
print('p[1].w after backprop\n', p[1].w)

# Output layer recomputed by hand from the stored hidden activation and the
# updated output parameters
output_logits = p[3].w @ p[2].a + p[3].b
Activation.softmax(output_logits)

p[1].w before backprop
 [[1.04421637 0.         0.04421637 0.        ]
 [0.         1.         0.         0.        ]
 [0.03964693 0.         1.03964693 0.        ]]
layer 1
p[1].w after backprop
 [[1.14514486 0.         0.14514486 0.        ]
 [0.         1.         0.         0.        ]
 [0.09621592 0.         1.09621592 0.        ]]


array([[0.05991671],
       [0.8446309 ],
       [0.04772619],
       [0.04772619]])