In [1]:
import os
from google.colab import drive

# Where Google Drive is attached inside the Colab VM, and the assignment
# folder inside it that the notebook uses as its working directory.
DRIVE_MOUNT_POINT = '/content/gdrive'
PROJECT_DIR = "/content/gdrive/My Drive/Assignment2_NN/Startpkg_A2"

# force_remount=True re-attaches the drive even if it is already mounted.
drive.mount(DRIVE_MOUNT_POINT, force_remount=True)

# Presumably needed so the project-local imports in the next cell
# (utils, models, kaggle_submission) resolve from this folder.
os.chdir(PROJECT_DIR)

Mounted at /content/gdrive


In [2]:
import numpy as np
import matplotlib.pyplot as plt

from utils.data_process import get_CIFAR10_data
from models.neural_net import NeuralNetwork
from kaggle_submission import output_submission_csv

######### If not using Colab, you may skip this setup #########
# Draw matplotlib figures inline in the notebook output.
%matplotlib inline
plt.rcParams['figure.figsize'] = (10.0, 8.0) # default figure size for every plot
plt.rcParams['image.interpolation'] = 'nearest' # default interpolation when displaying images
plt.rcParams['image.cmap'] = 'gray' # default colormap when displaying images

# Auto-reload external modules so edits to the project's .py files are
# picked up without restarting the kernel.
# see http://stackoverflow.com/questions/1907993/autoreload-of-modules-in-ipython
%load_ext autoreload
%autoreload 2
####################### End of setup ###########################

# Loading CIFAR-10
Now that you have implemented a neural network that passes gradient checks and works on toy data, you will test your network on the CIFAR-10 dataset.

In [3]:
# Split sizes. Feel free to change them while experimenting, but restore
# the default values below before submitting.
TRAIN_IMAGES = 49000
VAL_IMAGES = 1000
TEST_IMAGES = 5000  # Default is 5000, do not modify this for your submission.

data = get_CIFAR10_data(TRAIN_IMAGES, VAL_IMAGES, TEST_IMAGES)

# Unpack the inputs and labels of each split from the returned mapping.
(X_train, y_train,
 X_val, y_val,
 X_test, y_test) = (
    data[key]
    for key in ('X_train', 'y_train', 'X_val', 'y_val', 'X_test', 'y_test')
)

# Train a network
To train our network we will use SGD. In addition, we will adjust the learning rate with an exponential learning rate schedule as optimization proceeds; after each epoch, we will reduce the learning rate by multiplying it by a decay rate.

You can try different numbers of layers and also the different activation functions that you implemented on the CIFAR-10 dataset below.

In [5]:
# Baseline architecture: every hidden layer shares the same width, and
# there are (num_layers - 1) hidden layers.
input_size = 32 * 32 * 3
num_layers = 2
hidden_size = 20
num_classes = 10
hidden_sizes = [hidden_size for _ in range(num_layers - 1)]

net = NeuralNetwork(input_size, hidden_sizes, num_classes, num_layers, nonlinearity='relu')

print("Training data shape:", X_train.shape)
print("Training labels shape:", y_train.shape)

# Optimisation settings for this first run.
train_config = dict(
    num_iters=1000,
    batch_size=200,
    learning_rate=1e-3,
    learning_rate_decay=0.95,
    reg=0.1,
    verbose=True,
)
stats = net.train(X_train, y_train, X_val, y_val, **train_config)

# Fraction of validation examples whose predicted label matches the truth.
val_predictions = net.predict(X_val)
val_acc = (val_predictions == y_val).mean()
print('Validation accuracy: ', val_acc)

Training data shape: (49000, 3072)
Training labels shape: (49000,)
iteration 0 / 1000: loss 6.188017
iteration 100 / 1000: loss 5.425964
iteration 200 / 1000: loss 5.013850
iteration 300 / 1000: loss 4.905773
iteration 400 / 1000: loss 4.845575
iteration 500 / 1000: loss 4.764746
iteration 600 / 1000: loss 4.790739
iteration 700 / 1000: loss 4.644053
iteration 800 / 1000: loss 4.649105
iteration 900 / 1000: loss 4.589735
Validation accuracy:  0.304


# Graph loss and train/val accuracies

Examining the loss graph along with the train and val accuracy graphs should help you gain some intuition for the hyperparameters you should try in the hyperparameter tuning below. It should also help with debugging any issues you might have with your network.

In [6]:
# Plot the loss function and train / validation accuracies
def Graph_Accuracy(stats, i, num_layers, nonlinearity, name):
    """Show the loss curve and the train/val accuracy curves of one run.

    Two figures are drawn and shown one after the other; both carry the same
    super-title identifying the experiment.

    Args:
        stats: mapping holding the sequences 'loss_history',
            'train_acc_history' and 'val_acc_history'.
        i: experiment number shown in the super-title.
        num_layers: network depth. 2 and 3 are written out as words; any
            other value is shown as its number (previously such values
            raised UnboundLocalError because `layers` was never assigned).
        nonlinearity: activation name shown in the super-title.
        name: free-form tag appended to the super-title.
    """
    layer_words = {2: 'two', 3: 'three'}
    layers = layer_words.get(num_layers, str(num_layers))
    heading = f'Experiment #{i}/ {layers}-layer {nonlinearity}, {name}'

    # Figure 1: loss per recorded iteration.
    plt.figure(figsize=(12, 5))
    plt.suptitle(heading, y=1.05, fontsize=16)
    plt.title('Loss History', fontsize=14, y=1)
    plt.plot(stats['loss_history'])
    plt.xlabel('Iteration')
    plt.ylabel('Loss')
    plt.show()

    # Figure 2: train vs. validation accuracy per recorded epoch.
    plt.figure(figsize=(12, 5))
    plt.suptitle(heading, y=1.05, fontsize=16)
    plt.plot(stats['train_acc_history'], label='train')
    plt.plot(stats['val_acc_history'], label='val')
    plt.title('Classification accuracy history', fontsize=14, y=1)
    plt.xlabel('Epoch')
    plt.ylabel('Classification accuracy')
    plt.legend()
    plt.show()

# Hyperparameter tuning

Once you have successfully trained a network, you can tune your hyperparameters to increase your accuracy.

Based on the graphs of the loss function above, you should be able to develop some intuition about what hyperparameter adjustments may be necessary. A very noisy loss implies that the learning rate might be too high, while a linearly decreasing loss would suggest that the learning rate may be too low. A large gap between training and validation accuracy would suggest overfitting due to a large model without much regularization. No gap between training and validation accuracy would indicate low model capacity (low model complexity).


You will compare networks of two and three layers using the different activation functions you implemented.

The different hyperparameters you can experiment with are:
- **Batch size**: We recommend you leave this at 200 initially which is the batch size we used.
- **Number of iterations**: You can gain an intuition for how many iterations to run by checking when the validation accuracy plateaus in your train/val accuracy graph.
- **Initialization**: Weight initialization is very important for neural networks. We used the initialization `W = np.random.randn(n) / sqrt(n)` where `n` is the input dimension for the layer corresponding to `W`. We recommend you stick with the given initializations, but you may explore modifying these. Typical initialization practices: http://cs231n.github.io/neural-networks-2/#init
- **Learning rate**: Generally from around 1e-4 to 1e-1 is a good range to explore according to our implementation.
- **Learning rate decay**: We recommend a 0.95 decay to start.
- **Hidden layer size**: You should explore up to around 120 units per layer. For the three-layer network, we fixed the two hidden layers to be the same size when obtaining the target numbers. However, you may experiment with hidden layers of different sizes.
- **Regularization coefficient**: We recommend trying values in the range 0 to 0.1.



Hints:
- After getting a sense of the parameters by trying a few values yourself, you will likely want to write a few for loops to traverse over a set of hyperparameters.
- If you find that your train loss is decreasing, but your train and val accuracy start to decrease rather than increase, your model likely started minimizing the regularization term. To prevent this you will need to decrease the regularization coefficient.


## Two-layer Relu Activation Network

In [7]:
best_2layer_relu = None # store the best model into this

#################################################################################
# TODO: Tune hyperparameters using the validation set. Store your best trained  #
# model in best_2layer_relu.                                                    #
#################################################################################
# Define the hyperparameter search space
learning_rates = [1e-3, 1e-2]
regularization_strengths = [0.001, 0.01]
hidden_sizes = [128, 256, 512]
batch_sizes = [100, 200, 500]
num_iters = 1000
# Set the depth here instead of relying on whatever value an earlier cell
# left in `num_layers`; it drives both the model and the plot titles.
num_layers = 2
nonlinearity = 'relu'
name = 'vyeruban'

# Best result seen so far and the settings that produced it.
best_val_acc = 0
best_learning_rate = 0
best_reg = 0
best_hidden_size = 0
best_batch_size = 0
i = 1  # experiment counter shown in the plot titles

# Exhaustive grid search over every combination of the lists above.
for lr in learning_rates:
    for reg in regularization_strengths:
        for hidden_size in hidden_sizes:
            for batch_size in batch_sizes:
                # Build the network from the same `num_layers`/`nonlinearity`
                # that label the plots, so title and model cannot disagree.
                net = NeuralNetwork(
                    input_size=32 * 32 * 3,
                    hidden_sizes=[hidden_size] * (num_layers - 1),
                    output_size=10,
                    num_layers=num_layers,
                    nonlinearity=nonlinearity
                )

                # Train the network with the current set of hyperparameters
                stats = net.train(
                    X_train, y_train, X_val, y_val,
                    num_iters=num_iters,
                    batch_size=batch_size,
                    learning_rate=lr,
                    learning_rate_decay=0.95,
                    reg=reg,
                    verbose=False
                )

                Graph_Accuracy(stats, i, num_layers, nonlinearity, name)
                i += 1

                # Evaluate on the validation set
                val_acc = (net.predict(X_val) == y_val).mean()
                print('-------------------------------------------------------------------------------------------------------------------')
                print(f'lr: {lr}, reg: {reg}, hidden_size: {hidden_size}, batch_size: {batch_size}, val_acc: {val_acc}')
                print('-------------------------------------------------------------------------------------------------------------------')

                # Keep this model only if it strictly beats the best so far.
                if val_acc > best_val_acc:
                    best_val_acc = val_acc
                    best_2layer_relu = net
                    best_learning_rate = lr
                    best_reg = reg
                    best_hidden_size = hidden_size
                    best_batch_size = batch_size

# Fail with a clear message instead of an AttributeError on None below.
if best_2layer_relu is None:
    raise RuntimeError('No two-layer relu run reached a validation accuracy above 0; no model was stored.')

print(f'Best validation accuracy: {best_val_acc}')
print('Two-layer relu')
# Predict once and reuse the result for both the accuracy and the CSV.
test_pred = best_2layer_relu.predict(X_test)
test_acc = (test_pred == y_test).mean()
print('Test accuracy: ', test_acc)
output_submission_csv('nn_2layer_relu_submission.csv', test_pred)
print(f'Best batch size: {best_batch_size}, Best learning rate: {best_learning_rate}, Best hidden size: {best_hidden_size}, Best regularization strength: {best_reg}')

Output hidden; open in https://colab.research.google.com to view.

## Two-layer Sigmoid Activation Network

In [8]:
best_2layer_sigmoid = None # store the best model into this

#################################################################################
# TODO: Tune hyperparameters using the validation set. Store your best trained  #
# model in best_2layer_sigmoid.                                                 #
#################################################################################
# Define the hyperparameter search space
learning_rates = [1e-3, 1e-2]
regularization_strengths = [0.001, 0.01]
hidden_sizes = [128, 256, 512]
batch_sizes = [100, 200, 500]
num_iters = 1000
num_layers = 2
nonlinearity = 'sigmoid'
name = 'vvaddi2'

# Best result seen so far and the settings that produced it.
best_val_acc = 0
best_learning_rate = 0
best_reg = 0
best_hidden_size = 0
best_batch_size = 0
i = 1  # experiment counter shown in the plot titles

# Exhaustive grid search over every combination of the lists above.
for lr in learning_rates:
    for reg in regularization_strengths:
        for hidden_size in hidden_sizes:
            for batch_size in batch_sizes:
                # Build the network from the same `num_layers`/`nonlinearity`
                # that label the plots, so title and model cannot disagree.
                net = NeuralNetwork(
                    input_size=32 * 32 * 3,
                    hidden_sizes=[hidden_size] * (num_layers - 1),
                    output_size=10,
                    num_layers=num_layers,
                    nonlinearity=nonlinearity
                )

                # Train the network with the current set of hyperparameters
                stats = net.train(
                    X_train, y_train, X_val, y_val,
                    num_iters=num_iters,
                    batch_size=batch_size,
                    learning_rate=lr,
                    learning_rate_decay=0.95,
                    reg=reg,
                    verbose=False
                )

                Graph_Accuracy(stats, i, num_layers, nonlinearity, name)
                i += 1

                # Evaluate on the validation set
                val_acc = (net.predict(X_val) == y_val).mean()
                print('-------------------------------------------------------------------------------------------------------------------')
                print(f'lr: {lr}, reg: {reg}, hidden_size: {hidden_size}, batch_size: {batch_size}, val_acc: {val_acc}')
                print('-------------------------------------------------------------------------------------------------------------------')

                # Keep this model only if it strictly beats the best so far.
                if val_acc > best_val_acc:
                    best_val_acc = val_acc
                    best_2layer_sigmoid = net
                    best_learning_rate = lr
                    best_reg = reg
                    best_hidden_size = hidden_size
                    best_batch_size = batch_size

# Fail with a clear message instead of an AttributeError on None below.
if best_2layer_sigmoid is None:
    raise RuntimeError('No two-layer sigmoid run reached a validation accuracy above 0; no model was stored.')

print(f'Best validation accuracy: {best_val_acc}')
print('Two-layer sigmoid')
# Predict once and reuse the result for both the accuracy and the CSV.
test_pred = best_2layer_sigmoid.predict(X_test)
test_acc = (test_pred == y_test).mean()
print('Test accuracy: ', test_acc)
output_submission_csv('nn_2layer_sigmoid_submission.csv', test_pred)
print(f'Best batch size: {best_batch_size}, Best learning rate: {best_learning_rate}, Best hidden size: {best_hidden_size}, Best regularization strength: {best_reg}')

Output hidden; open in https://colab.research.google.com to view.

## Three-layer Relu Activation Network

In [9]:
best_3layer_relu = None # store the best model into this

#################################################################################
# TODO: Tune hyperparameters using the validation set. Store your best trained  #
# model in best_3layer_relu.                                                    #
#################################################################################
# Define the hyperparameter search space
learning_rates = [1e-3, 1e-2]
regularization_strengths = [0.001, 0.01]
hidden_sizes = [128, 256, 512]
batch_sizes = [100, 200, 500]
num_iters = 1000
num_layers = 3
nonlinearity = 'relu'
name = 'vvaddi2'

# Best result seen so far and the settings that produced it.
best_val_acc = 0
best_learning_rate = 0
best_reg = 0
best_hidden_size = 0
best_batch_size = 0
i = 1  # experiment counter shown in the plot titles

# Exhaustive grid search over every combination of the lists above.
for lr in learning_rates:
    for reg in regularization_strengths:
        for hidden_size in hidden_sizes:
            for batch_size in batch_sizes:
                # Build the network from the same `num_layers`/`nonlinearity`
                # that label the plots; both hidden layers share one width.
                net = NeuralNetwork(
                    input_size=32 * 32 * 3,
                    hidden_sizes=[hidden_size] * (num_layers - 1),
                    output_size=10,
                    num_layers=num_layers,
                    nonlinearity=nonlinearity
                )

                # Train the network with the current set of hyperparameters
                stats = net.train(
                    X_train, y_train, X_val, y_val,
                    num_iters=num_iters,
                    batch_size=batch_size,
                    learning_rate=lr,
                    learning_rate_decay=0.95,
                    reg=reg,
                    verbose=False
                )

                Graph_Accuracy(stats, i, num_layers, nonlinearity, name)
                i += 1

                # Evaluate on the validation set
                val_acc = (net.predict(X_val) == y_val).mean()
                print('-------------------------------------------------------------------------------------------------------------------')
                print(f'lr: {lr}, reg: {reg}, hidden_size: {hidden_size}, batch_size: {batch_size}, val_acc: {val_acc}')
                print('-------------------------------------------------------------------------------------------------------------------')

                # Keep this model only if it strictly beats the best so far.
                if val_acc > best_val_acc:
                    best_val_acc = val_acc
                    best_3layer_relu = net
                    best_learning_rate = lr
                    best_reg = reg
                    best_hidden_size = hidden_size
                    best_batch_size = batch_size

# Fail with a clear message instead of an AttributeError on None below.
if best_3layer_relu is None:
    raise RuntimeError('No three-layer relu run reached a validation accuracy above 0; no model was stored.')

print(f'Best validation accuracy: {best_val_acc}')
print('Three-layer relu')
# Predict once and reuse the result for both the accuracy and the CSV.
test_pred = best_3layer_relu.predict(X_test)
test_acc = (test_pred == y_test).mean()
print('Test accuracy: ', test_acc)
print(f'Best batch size: {best_batch_size}, Best learning rate: {best_learning_rate}, Best hidden size: {best_hidden_size}, Best regularization strength: {best_reg}')
output_submission_csv('nn_3layer_relu_submission.csv', test_pred)

Output hidden; open in https://colab.research.google.com to view.

## Three-layer Sigmoid Activation Network

In [10]:
best_3layer_sigmoid = None # store the best model into this

#################################################################################
# TODO: Tune hyperparameters using the validation set. Store your best trained  #
# model in best_3layer_sigmoid.                                                 #
#################################################################################
# Define the hyperparameter search space
learning_rates = [1e-3, 1e-2]
regularization_strengths = [0.001, 0.01]
hidden_sizes = [128, 256, 512]
batch_sizes = [100, 200, 500]
num_iters = 1000
num_layers = 3
nonlinearity = 'sigmoid'
name = 'vyeruban'

# Best result seen so far and the settings that produced it.
best_val_acc = 0
best_learning_rate = 0
best_reg = 0
best_hidden_size = 0
best_batch_size = 0
i = 1  # experiment counter shown in the plot titles

# Exhaustive grid search over every combination of the lists above.
for lr in learning_rates:
    for reg in regularization_strengths:
        for hidden_size in hidden_sizes:
            for batch_size in batch_sizes:
                # Create a 3-layer neural network with sigmoid activation,
                # built from the same `num_layers`/`nonlinearity` that label
                # the plots; both hidden layers share one width.
                net = NeuralNetwork(
                    input_size=32 * 32 * 3,
                    hidden_sizes=[hidden_size] * (num_layers - 1),
                    output_size=10,
                    num_layers=num_layers,
                    nonlinearity=nonlinearity
                )

                # Train the network with the current set of hyperparameters
                stats = net.train(
                    X_train, y_train, X_val, y_val,
                    num_iters=num_iters,
                    batch_size=batch_size,
                    learning_rate=lr,
                    learning_rate_decay=0.95,
                    reg=reg,
                    verbose=False
                )

                Graph_Accuracy(stats, i, num_layers, nonlinearity, name)
                i += 1

                # Evaluate on the validation set
                val_acc = (net.predict(X_val) == y_val).mean()
                print('-------------------------------------------------------------------------------------------------------------------')
                print(f'lr: {lr}, reg: {reg}, hidden_size: {hidden_size}, batch_size: {batch_size}, val_acc: {val_acc}')
                print('-------------------------------------------------------------------------------------------------------------------')

                # Keep this model only if it strictly beats the best so far.
                if val_acc > best_val_acc:
                    best_val_acc = val_acc
                    best_3layer_sigmoid = net
                    best_learning_rate = lr
                    best_reg = reg
                    best_hidden_size = hidden_size
                    best_batch_size = batch_size

# Fail with a clear message instead of an AttributeError on None below.
if best_3layer_sigmoid is None:
    raise RuntimeError('No three-layer sigmoid run reached a validation accuracy above 0; no model was stored.')

print(f'Best validation accuracy: {best_val_acc}')
# This label previously read 'Three-layer relu', mislabelling the results.
print('Three-layer sigmoid')
# Predict once and reuse the result for both the accuracy and the CSV.
test_pred = best_3layer_sigmoid.predict(X_test)
test_acc = (test_pred == y_test).mean()
print('Test accuracy: ', test_acc)
print(f'Best batch size: {best_batch_size}, Best learning rate: {best_learning_rate}, Best hidden size: {best_hidden_size}, Best regularization strength: {best_reg}')
output_submission_csv('nn_3layer_sigmoid_submission.csv', test_pred)

Output hidden; open in https://colab.research.google.com to view.

# Run on the test set
When you are done experimenting, you should evaluate your final trained networks on the test set.

In [11]:
# Test-set accuracy of the selected two-layer ReLU model.
print('Two-layer relu')
test_predictions = best_2layer_relu.predict(X_test)
test_acc = (test_predictions == y_test).mean()
print('Test accuracy: ', test_acc)

Two-layer relu
Test accuracy:  0.4648


In [12]:
# Test-set accuracy of the selected two-layer sigmoid model.
print('Two-layer sigmoid')
test_predictions = best_2layer_sigmoid.predict(X_test)
test_acc = (test_predictions == y_test).mean()
print('Test accuracy: ', test_acc)

Two-layer sigmoid
Test accuracy:  0.399


In [13]:
# Test-set accuracy of the selected three-layer ReLU model.
print('Three-layer relu')
test_predictions = best_3layer_relu.predict(X_test)
test_acc = (test_predictions == y_test).mean()
print('Test accuracy: ', test_acc)

Three-layer relu
Test accuracy:  0.4664


In [14]:
# Test-set accuracy of the selected three-layer sigmoid model.
print('Three-layer sigmoid')
test_predictions = best_3layer_sigmoid.predict(X_test)
test_acc = (test_predictions == y_test).mean()
print('Test accuracy: ', test_acc)

Three-layer sigmoid
Test accuracy:  0.3702


# Kaggle output

Once you are satisfied with your solution and test accuracy, output a file with your test set predictions and submit it to the Kaggle competition for Assignment 2 Neural Network. Use the following code to do so:

In [15]:
# Write the two-layer ReLU model's test predictions to its submission CSV.
submission_predictions = best_2layer_relu.predict(X_test)
output_submission_csv('nn_2layer_relu_submission.csv', submission_predictions)

In [16]:
# Write the two-layer sigmoid model's test predictions to its submission CSV.
submission_predictions = best_2layer_sigmoid.predict(X_test)
output_submission_csv('nn_2layer_sigmoid_submission.csv', submission_predictions)

In [17]:
# Write the three-layer ReLU model's test predictions to its submission CSV.
submission_predictions = best_3layer_relu.predict(X_test)
output_submission_csv('nn_3layer_relu_submission.csv', submission_predictions)

In [18]:
# Write the three-layer sigmoid model's test predictions to its submission CSV.
submission_predictions = best_3layer_sigmoid.predict(X_test)
output_submission_csv('nn_3layer_sigmoid_submission.csv', submission_predictions)