In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy.io import loadmat
%matplotlib inline

# Neural Networks

In the previous exercise, you implemented feedforward propagation for neural networks and used it to predict handwritten digits with the weights we provided. In this exercise, you will implement the backpropagation algorithm to learn the parameters for the neural network.

In [2]:
data = loadmat('data/ex3data1.mat')

In [3]:
X = data['X']
y = data['y']

X.shape, y.shape

((5000, 400), (5000, 1))

In [4]:
from sklearn.preprocessing import OneHotEncoder
encoder = OneHotEncoder(sparse=False)
y = encoder.fit_transform(y)

We'll need the simoid function yet again;

In [5]:
def sigmoid(z):
    return 1 / (1 + np.exp(-z))

### Model representation

Our neural network is shown in Figure 2. It has 3 layers {an input layer, a hidden layer and an output layer. Recall that our inputs are pixel values of digit images. Since the images are of size 20 x 20, this gives us 400 input layer units (not counting the extra bias unit which always outputs +1). 

The training data will be loaded into the variables $X$ and $y$.

You have been provided with a set of network parameters already trained by us.

## Feedforward and cost function

Taken from the end of exercise 3;

In [6]:
def forward_prop(a1, theta):
    m = a1.shape[0]
    
    a1 = np.insert(a1, 0, values=np.ones(m), axis=1)

    # Do the first Linear step 
    z2 = a1.dot(theta[0].T)

    # Put it through the first activation function
    a2 = sigmoid(z2)
    a2 = np.insert(a2, 0, values=np.ones(m), axis=1)
    # Second linear step
    z3 = a2.dot(theta[1].T)
    
    # Put through second activation function
    a3 = sigmoid(z3)

    return a1, z2, a2, z3, a3

    m = X.shape[0]

In [7]:
def cost(nn_params, input_layer_size, hidden_layer_size, num_labels, X, y, learning_rate):
    m = X.shape[0]
    
    theta1 = nn_params[:hidden_layer_size*(input_layer_size+1)]
    theta1 = theta1.reshape(hidden_layer_size, (input_layer_size+1), order='F')
    theta2 = nn_params[hidden_layer_size*(input_layer_size+1):]
    theta2 = theta2.reshape(num_labels, (hidden_layer_size+1), order='F')
    theta = (theta1, theta2)
    
    a1, z2, a2, z3, a3 = forward_prop(X, theta)
    
    J = 0
    for i in range(m):
        inside = np.multiply(-y[i,:], np.log(a3[i,:])) - np.multiply((1 - y[i,:]), np.log(1 - a3[i,:]))
        J += (1/m) * np.sum(inside)

    return J

### Regularized cost function

In [8]:
def reg_cost(nn_params, input_layer_size, hidden_layer_size, num_labels, X, y, learning_rate):
    m = X.shape[0]
    
    # Reconstruct the parameters theta1 and theta2
    theta1 = nn_params[:hidden_layer_size*(input_layer_size+1)]
    theta1 = theta1.reshape(hidden_layer_size, (input_layer_size+1), order='F')
    theta2 = nn_params[hidden_layer_size*(input_layer_size+1):]
    theta2 = theta2.reshape(num_labels, (hidden_layer_size+1), order='F')
    
    theta = (theta1, theta2)

    a1, z2, a2, z3, a3 = forward_prop(X, theta)
    
    J = 0

    for i in range(m):
        inside = np.multiply(-y[i,:], np.log(a3[i,:])) - np.multiply((1 - y[i,:]), np.log(1 - a3[i,:]))
        J += (1/m) * np.sum(inside)
    
    #reg term
    J += learning_rate/(2*m)*(np.sum(np.square(theta1[:, 1:])) + np.sum(np.square(theta2[:, 1:])))
    
    return J

### Random initialization

In [9]:
def rand_initial(input_size, output_size, num_labels, epsilon_initial):
    params = (np.random.random(size=hidden_size * (input_size + 1) + num_labels * (hidden_size + 1)) - epsilon_initial) * epsilon_initial
    return params

I was having a tough time getting the parameters to load correctly, so I've skipped some of preliminary testing. I might come back to this, but if the end results are correct, I probably won't.

## Backpropagation

In this part of the exercise, you will implement the backpropagation algorithm to compute the gradient for the neural network cost function.

You will first implement the backpropagation algorithm to compute the gradients for the parameters for the (unregularized) neural network. After you have verified that your gradient computation for the unregularized case is correct, you will implement the gradient for the regularized neural network.

Now, you will implement the backpropagation algorithm. Recall that the intuition behind the backpropagation algorithm is as follows. Given a training example (x(t); y(t)), we will first run a "forward pass" to compute all the activations throughout the network, including the output value of the hypothesis Then, for each node $j$ in layer $l$, we would like to compute an "error term" that measures how much that node was "responsible" for any errors in our output.

In [10]:
def sigmoid_gradient(z):
    return np.multiply(sigmoid(z), (1 - sigmoid(z)))

In [11]:
def gradient(nn_params, input_layer_size, hidden_layer_size, num_labels, X, y, learning_rate):
    m = X.shape[0]
    
    delta1 = np.zeros([hidden_layer_size, input_layer_size+1])
    delta2 = np.zeros([num_labels, hidden_layer_size+1])

    theta1 = nn_params[:hidden_layer_size*(input_layer_size+1)]
    theta1 = theta1.reshape(hidden_layer_size, (input_layer_size+1), order='F')
    
    theta2 = nn_params[hidden_layer_size*(input_layer_size+1):]
    theta2 = theta2.reshape(num_labels, (hidden_layer_size+1), order='F')
    theta = (theta1, theta2)
    
    a1, z2, a2, z3, a3 = forward_prop(X, theta)

    for i in range(m):
        
        a1_i = a1[i,:]
        z2_i = z2[i,:]
        a2_i = a2[i,:]
        a3_i = a3[i,:]
        y_i = y[i,:]
       
        delta3 = a3_i - y_i
             
        z2_i = np.insert(z2_i, 0, values=np.ones(1))
        
        delta2_i = np.multiply((theta2.T * delta3.T).T, sigmoid_gradient(z2_i))
        
        delta1 = delta1 + (delta2_i[:,1:]).T * a1_i
        delta2 = delta2 + delta3.T * a2_i
        
    theta1_grad = delta1 / m
    theta2_grad = delta2 / m
    
    theta1_grad[:,1:] = theta1_grad[:,1:] + (theta1[:,1:] * learning_rate) / m
    theta2_grad[:,1:] = theta2_grad[:,1:] + (theta2[:,1:] * learning_rate) / m

    grad = np.concatenate((np.ravel(delta1), np.ravel(delta2)))
    
    return grad

In [12]:
def backprop(nn_params, input_layer_size, hidden_layer_size, num_labels, X, y, learning_rate):
    grad = gradient(nn_params, input_layer_size, hidden_layer_size, num_labels, X, y, learning_rate)
    cost = reg_cost(nn_params, input_layer_size, hidden_layer_size, num_labels, X, y, learning_rate)
    return cost, grad

In [66]:
# initial setup
input_size = 400
hidden_size = 25
num_labels = 10
learning_rate = 1

# randomly initialize a parameter array of the size of the full network's parameters
params = (np.random.random(size=hidden_size * (input_size + 1) + num_labels * (hidden_size + 1)) - 0.5) * 0.25

m = X.shape[0]
X = (X - X.mean()) / X.std()
X = np.matrix(X) 
y = np.matrix(y)

# unravel the parameter array into parameter matrices for each layer
theta1 = np.matrix(np.reshape(params[:hidden_size * (input_size + 1)], (hidden_size, (input_size + 1))))
theta2 = np.matrix(np.reshape(params[hidden_size * (input_size + 1):], (num_labels, (hidden_size + 1))))

theta1.shape, theta2.shape

((25, 401), (10, 26))

In [14]:
input_size = 400
hidden_size = 25
num_labels = 10
learning_rate = 1
epsilon_initial = 0.12

params = rand_initial(input_size, hidden_size, num_labels, epsilon_initial)

### Learning parameters using fmincg

In [15]:
from scipy.optimize import minimize

In [93]:
fmin = minimize(fun=backprop, x0=params, args=(input_size, hidden_size, num_labels, X, y, learning_rate), 
                method='TNC', jac=True, options={'maxiter': 250})
fmin

     fun: 3.6531435405714685
     jac: array([-58.55298272,  26.06424678,  26.06424678, ..., 424.42782147,
       321.79354602, 219.92256293])
 message: 'Converged (|f_n-f_(n-1)| ~= 0)'
    nfev: 79
     nit: 2
  status: 1
 success: True
       x: array([ 0.04463695, -0.03343172,  0.03256961, ..., -0.06443979,
       -0.00052869, -0.12559057])

In [68]:
def predict(X, nn_fit):
    
    theta1 = np.matrix(np.reshape(nn_fit[:hidden_size * (input_size + 1)], (hidden_size, (input_size + 1))))
    theta2 = np.matrix(np.reshape(nn_fit[hidden_size * (input_size + 1):], (num_labels, (hidden_size + 1))))
    
    theta = (theta1, theta2)
    
    a1, z2, a2, z3, a3 = forward_prop(X, theta)
    
    y_pred = np.array(np.argmax(a3, axis=1) + 1)
    return y_pred