# Understanding Image Classifier layers

#### Importing required libraries

In [2]:
import random
import numpy as np
import matplotlib.pyplot as plt
import os
import pickle
import platform
from two_layer_perceptron import TwoLayerPerceptron

In [3]:
# Notebook-wide configuration: plotting defaults and module auto-reloading.

# Render matplotlib figures inline in the notebook
# rather than in a new window.
%matplotlib inline
plt.rcParams['figure.figsize'] = (10.0, 8.0) # set default size of plots
plt.rcParams['image.interpolation'] = 'nearest'
plt.rcParams['image.cmap'] = 'gray'

# Load the autoreload extension in mode 2 so that edits to external python
# modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [6]:
# Object under test: every layer and loss check in this notebook calls a method on it.
perceptron = TwoLayerPerceptron()

##### Checking Affine Forward and Backward layers

In [9]:
# Generate random data to test the affine layer implementation.
# The seed is fixed so the values drawn below are the same on every run.
np.random.seed(231)
x = np.random.randn(10, 2, 3)
# Display the inputs flattened to one row per leading-axis entry, i.e. (10, 6).
print("Inputs of shape: ", x.reshape((x.shape[0], -1)).shape)
print(x.reshape((x.shape[0], -1)))
# Weight matrix: 6 rows (matching the 6 flattened input columns) by 5 columns.
w = np.random.randn(6, 5)
print("Weights of shape: ", w.shape)
print(w)
# Bias vector with one entry per weight column.
b = np.random.randn(5)
print("Bias of shape: ", b.shape)
print(b)
# Random (10, 5) array used as the upstream gradient in the backward checks.
dout = np.random.randn(10, 5)

Inputs of shape:  (10, 6)
[[ 0.41794341  1.39710028 -1.78590431 -0.70882773 -0.07472532 -0.77501677]
 [-0.1497979   1.86172902 -1.4255293  -0.3763567  -0.34227539  0.29490764]
 [-0.83732373  0.95218767  1.32931659  0.52465245 -0.14809998  0.88953195]
 [ 0.12444653  0.99109251  0.03514666  0.26207083  0.14320173  0.90101716]
 [ 0.23185863 -0.79725793  0.12001014 -0.65679608  0.26917456  0.333667  ]
 [ 0.27423503  0.76215717 -0.69550058  0.29214712 -0.38489942  0.1228747 ]
 [-1.42904497  0.70286283 -0.85850947 -1.14042979 -1.58535997 -0.01530138]
 [-0.32156083  0.56834936 -0.19961722  1.27286625  1.27292534  1.58102968]
 [-1.75626715  0.9217743  -0.6753054  -1.43443616  0.47021125  0.03196734]
 [ 0.04448574  0.47824879 -2.51335181 -1.15740245 -0.70470413 -1.04978879]]
Weights of shape:  (6, 5)
[[-1.90795589  0.49258765  0.83736166 -1.4288134  -0.18982427]
 [-1.14094943 -2.12570755 -0.41354791  0.44148975  0.16411113]
 [-0.65505065 -0.30212765 -0.25704466 -0.12841368  0.26338593]
 [ 0.167

1. Affine Forward

In [10]:
# affine_forward returns the scores together with a 3-tuple; unpacking it here
# rebinds x, w and b to whatever the layer returned in that tuple.
scores, (x, w, b) = perceptron.affine_forward(x, w, b)
print("Scores of shape : ", scores.shape)
print(scores)

Scores of shape :  (10, 5)
[[-2.17606361 -2.12184729  0.00900847  2.20337442 -0.99083714]
 [-1.85215259 -3.19211285 -1.74370267  0.0973037  -0.54036486]
 [-1.56514011 -2.63372545 -4.24801244 -1.42050022 -0.00692815]
 [-2.96856091 -1.89745509 -2.91706368 -2.47802971 -0.02180427]
 [-1.29878952  2.00543255 -0.63545946 -1.64080561  0.1968879 ]
 [-1.66992084 -1.10030085 -1.84282972 -0.78540801 -1.31714587]
 [ 2.89525541 -0.82354792 -0.81903185  1.90158524 -1.08850368]
 [-2.78971285 -1.77983379 -5.21950069 -3.6583993   0.61183637]
 [ 0.79806586 -2.31893721 -1.71021427  3.12661299  1.28543738]
 [ 0.77630397  0.21244488  1.25800943  3.04184783 -1.66867068]]


2. Affine Backward

In [14]:
# Re-run the forward pass only to obtain the (x, w, b) tuple; the scores are discarded.
_, (x, w, b) = perceptron.affine_forward(x, w, b)
# Backward pass: feed the upstream gradient and the cached tuple, get one gradient per input.
dx, dw, db = perceptron.affine_backward(dout, cache=(x, w, b))

print("Input gradient of shape: ", dx.shape)
print(dx)
print("Weights gradient of shape: ", dw.shape)
print(dw)
print("Biases gradient of shape: ", db.shape)
print(db)

Input gradient of shape:  (10, 6)
[[ 5.06973048  0.52895344  1.22136172 -0.19561005  1.56951029  3.72781223]
 [-1.01960674  3.16856234  0.66629986  1.12662151  1.15800203 -0.33676616]
 [ 0.01917811  0.49589462  0.1643096   0.57856225  0.34138118 -1.20252682]
 [-1.00781287  2.39378843  0.12283772  0.90902209  0.19081771 -2.97852996]
 [ 2.36599088  1.79213685  1.11158625 -1.23937492  2.25219041  2.07000103]
 [ 0.95915612  2.69695394  0.88221926 -0.3273759   1.84259405 -0.60336282]
 [ 6.47924185  1.58133859  1.16457556 -2.19076747  1.03404647  6.20462543]
 [ 2.69588666  1.1217322   1.2400047   4.14104512  0.52346801  5.6311282 ]
 [-4.16910147 -3.93402396 -1.57652706  1.72466666 -2.93171083 -0.69903127]
 [ 1.17992956  1.11091973  0.95563638  1.15028299  1.40801655  3.06403357]]
Weights gradient of shape:  (6, 5)
[[-2.37342917 -0.51198268  0.31810037  2.90403428  1.06935402]
 [-3.51268592 -2.11141032 -3.63676781 -1.97058092 -2.23413933]
 [ 5.08663365  1.49488732  3.34009108  5.93662486 -2.1

#### Checking analytical gradients against numerical gradients

In [15]:
# function to compute numerical gradient
def eval_numerical_gradient_array(f, x, df, h=1e-5):
    """
    Numerically estimate the gradient with respect to ``x`` of a function
    that maps a numpy array to a numpy array.

    Each element of ``x`` is perturbed in place by ``+h`` and ``-h`` (and
    restored afterwards). The centered difference of the two outputs is
    weighted by ``df`` and summed to give that element's gradient entry.

    - f: callable taking ``x`` and returning a numpy array
    - x: numpy array to differentiate with respect to; temporarily modified
    - df: array multiplied element-wise with the output difference
    - h: perturbation step size

    Returns an array shaped like ``x``.
    """
    numeric_grad = np.zeros_like(x)
    two_h = 2 * h
    walker = np.nditer(x, flags=["multi_index"], op_flags=["readwrite"])
    for _ in walker:
        idx = walker.multi_index
        original = x[idx]

        # Evaluate f on both sides of the current entry.
        x[idx] = original + h
        f_plus = f(x).copy()
        x[idx] = original - h
        f_minus = f(x).copy()

        # Put the entry back before moving on so x is left unchanged.
        x[idx] = original

        numeric_grad[idx] = np.sum((f_plus - f_minus) * df) / two_h
    return numeric_grad

# function to compute relative error
def rel_error(x, y):
    """
    Return the largest element-wise relative error between ``x`` and ``y``.

    The per-element error is ``|x - y| / max(1e-8, |x| + |y|)``; the 1e-8
    floor avoids dividing by zero when both entries are zero.
    """
    abs_diff = np.abs(x - y)
    scale = np.maximum(1e-8, np.abs(x) + np.abs(y))
    return np.max(abs_diff / scale)

In [18]:

# Numerical gradients of the affine scores with respect to each input in turn.
# Each lambda's parameter shadows the notebook variable of the same name, so the
# array being perturbed is the one passed through to affine_forward.
dx_num = eval_numerical_gradient_array(lambda x: perceptron.affine_forward(x, w, b)[0], x, dout)
dw_num = eval_numerical_gradient_array(lambda w: perceptron.affine_forward(x, w, b)[0], w, dout)
db_num = eval_numerical_gradient_array(lambda b: perceptron.affine_forward(x, w, b)[0], b, dout)

# Analytical gradients from the layer's own backward pass, for comparison.
_, cache = perceptron.affine_forward(x, w, b)
dx, dw, db = perceptron.affine_backward(dout, cache)

# Flatten dx_num to 2-D (one row per leading-axis entry) before comparing it with dx.
dx_num = dx_num.reshape((dx_num.shape[0], -1))
# The error should be around e-10 or less
print('Testing affine_backward function:')
print('dx error: ', rel_error(dx_num, dx))
print('dw error: ', rel_error(dw_num, dw))
print('db error: ', rel_error(db_num, db))

Testing affine_backward function:
dx error:  1.0908199508708189e-10
dw error:  2.1752635504596857e-10
db error:  7.736978834487815e-12


#### Checking ReLU activation function
1. ReLU forward and backward

In [25]:
# Re-seed so the ReLU test data is reproducible, then draw inputs and an
# upstream gradient of the same shape.
np.random.seed(231)
x = np.random.randn(10, 5)
dout = np.random.randn(*x.shape)

# Numerical gradient of the ReLU output with respect to x; computed before x is rebound below.
dx_num = eval_numerical_gradient_array(lambda x: perceptron.relu_forward(x)[0], x, dout)
print("scores of shape", x.shape)
print(x)
# relu_forward returns the activated scores and a second value; x is rebound to that
# second value, which is then handed to relu_backward.
scores_after_activation , x = perceptron.relu_forward(x)
print("Scores after activation :", scores_after_activation.shape)
print(scores_after_activation)
dx = perceptron.relu_backward(dout, x)
# The error should be on the order of e-12
print('Testing relu_backward function:')
print('dx error: ', rel_error(dx_num, dx))

scores of shape (10, 5)
[[ 0.41794341  1.39710028 -1.78590431 -0.70882773 -0.07472532]
 [-0.77501677 -0.1497979   1.86172902 -1.4255293  -0.3763567 ]
 [-0.34227539  0.29490764 -0.83732373  0.95218767  1.32931659]
 [ 0.52465245 -0.14809998  0.88953195  0.12444653  0.99109251]
 [ 0.03514666  0.26207083  0.14320173  0.90101716  0.23185863]
 [-0.79725793  0.12001014 -0.65679608  0.26917456  0.333667  ]
 [ 0.27423503  0.76215717 -0.69550058  0.29214712 -0.38489942]
 [ 0.1228747  -1.42904497  0.70286283 -0.85850947 -1.14042979]
 [-1.58535997 -0.01530138 -0.32156083  0.56834936 -0.19961722]
 [ 1.27286625  1.27292534  1.58102968 -1.75626715  0.9217743 ]]
Scores after activation : (10, 5)
[[0.41794341 1.39710028 0.         0.         0.        ]
 [0.         0.         1.86172902 0.         0.        ]
 [0.         0.29490764 0.         0.95218767 1.32931659]
 [0.52465245 0.         0.88953195 0.12444653 0.99109251]
 [0.03514666 0.26207083 0.14320173 0.90101716 0.23185863]
 [0.         0.120010

We've only asked you to implement ReLU, but there are a number of different activation functions that one could use in neural networks, each with its pros and cons. In particular, an issue commonly seen with activation functions is getting zero (or close to zero) gradient flow during backpropagation. Which of the following activation functions have this problem? If you consider these functions in the one dimensional case, what types of input would lead to this behaviour?
1. Sigmoid
2. ReLU
3. Leaky ReLU

## Answer:
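During backpropagation, an activation whose local gradient is zero (or close to zero) passes almost no gradient back to the layers before it, so their weights stop updating. This is the vanishing gradient problem.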

1. The sigmoid function suffers from the vanishing gradient problem because the gradient is close to zero for very large positive and negative input values. 

2. Due to its linear response to a positive input, ReLU has the advantage over Sigmoid of being less susceptible to the vanishing gradient problem. For negative inputs, ReLU's gradient is 0; for positive inputs, it is 1. ReLU is therefore unlikely to suffer from vanishing gradients unless a neuron's inputs are all negative. In that case its gradient is exactly zero and the neuron can no longer be trained. This problem is called "dying ReLU".

3. Leaky ReLU attempts to solve the "dead" neuron ReLU problem by giving negative inputs a small positive slope. That is, f(x) = 0.01x if x < 0, and f(x) = x otherwise. Leaky ReLU therefore aims to solve the vanishing gradient problem. However, the function max(0.01x, x) is continuous but not differentiable at x = 0, so the slope at x = 0 is undefined. If this case is not explicitly handled in your code, a one-dimensional example where the gradient can be zero is an input of all zeros, such as [0, 0, 0]. Such an input should only occur if the network is initialized incorrectly.


### Loss layers: Softmax and SVM

In [27]:
# function to evaluate the numerical gradient of a scalar-valued function
def eval_numerical_gradient(f, x, verbose=True, h=0.00001):
    """
    Naive centered-difference estimate of the gradient of f at x.

    - f: function of a single argument; its return value is differenced and
      stored per element, so it is expected to be a scalar
    - x: numpy array giving the point at which to evaluate the gradient;
      each entry is perturbed in place and restored afterwards
    - verbose: when true, print every index with its gradient entry
    - h: perturbation step size

    Returns an array shaped like x.
    """
    # Baseline evaluation at the unperturbed point. The value is not used
    # below, but the call is kept so f is invoked exactly as before.
    f(x)

    numeric_grad = np.zeros_like(x)
    walker = np.nditer(x, flags=["multi_index"], op_flags=["readwrite"])
    for _ in walker:
        idx = walker.multi_index
        original = x[idx]

        x[idx] = original + h
        f_plus = f(x)  # f evaluated with this entry nudged up
        x[idx] = original - h
        f_minus = f(x)  # f evaluated with this entry nudged down
        x[idx] = original  # restore before moving to the next entry

        # Slope from the centered difference formula.
        numeric_grad[idx] = (f_plus - f_minus) / (2 * h)
        if verbose:
            print(idx, numeric_grad[idx])

    return numeric_grad

In [29]:
# Re-seed, then build small random scores (scaled by 0.001) of shape
# (num_inputs, num_classes) and one random integer label per input.
np.random.seed(231)
num_classes, num_inputs = 10, 50
x = 0.001 * np.random.randn(num_inputs, num_classes)
y = np.random.randint(num_classes, size=num_inputs)

# svm_loss returns (loss, dx); element [0] is the loss that gets differentiated numerically.
dx_num = eval_numerical_gradient(lambda x: perceptron.svm_loss(x, y)[0], x, verbose=False)
loss, dx = perceptron.svm_loss(x, y)

# Test svm_loss function. Loss should be around 9 and dx error should be around the order of e-9
print('Testing svm_loss:')
print('loss: ', loss)
print('dx error: ', rel_error(dx_num, dx))

# Same check for softmax_loss, reusing the same scores and labels.
dx_num = eval_numerical_gradient(lambda x: perceptron.softmax_loss(x, y)[0], x, verbose=False)
loss, dx = perceptron.softmax_loss(x, y)

# Test softmax_loss function. Loss should be close to 2.3 and dx error should be around e-8
print('\nTesting softmax_loss:')
print('loss: ', loss)
print('dx error: ', rel_error(dx_num, dx))

Testing svm_loss:
loss:  8.999602749096233
dx error:  1.4021566006651672e-09

Testing softmax_loss:
loss:  2.302545844500738
dx error:  9.483503037636722e-09
