In [94]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import scipy.io as sio

In [95]:
data = sio.loadmat("ex4data1.mat")
data

{'__header__': b'MATLAB 5.0 MAT-file, Platform: GLNXA64, Created on: Sun Oct 16 13:09:09 2011',
 '__version__': '1.0',
 '__globals__': [],
 'X': array([[0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        ...,
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.]]),
 'y': array([[10],
        [10],
        [10],
        ...,
        [ 9],
        [ 9],
        [ 9]], dtype=uint8)}

In [96]:
X = data['X']
y = data['y']

In [97]:
X.shape, y.shape

((5000, 400), (5000, 1))

In [98]:
def expand_y(y):
    expand_y = []
    for i in y:
        arr = np.zeros(10)
        arr[i - 1] = 1
        expand_y.append(arr)
    return np.array(expand_y)

In [99]:
y = expand_y(y)

In [100]:
y.shape

(5000, 10)

In [101]:
weights = sio.loadmat("ex4weights.mat")
theta1 = weights["Theta1"]
theta2 = weights["Theta2"]
theta1.shape, theta2.shape

((25, 401), (10, 26))

In [102]:
def sigmoid(z):
    return 1 / (1 + np.exp(z))

In [119]:
def forward_propagate(X, theta1, theta2):
    X = np.matrix(X)
    theta1 = np.matrix(theta1)
    theta2 = np.matrix(theta2)
    m = X.shape[0]
    
    a1 = np.insert(X, 0, values=np.ones(m), axis=1)
    z2 = a1 * theta1.T # (5000, 401), (401, 25)
    a2 = sigmoid(z2)
    a2 = np.insert(a2, 0, values=np.ones(m), axis=1)
    z3 = a2 * theta2.T
    h = sigmoid(z3)
    return a1, z2, a2, z3, h

In [120]:
a1, z2, a2, z3, h = forward_propagate(X, theta1, theta2)
h.shape

(5000, 10)

In [121]:
y.shape

(5000, 10)

In [130]:
def cost(X, y, theta1, theta2):
    m = X.shape[0]
    X = np.matrix(X)
    y = np.matrix(y)
    a1, z2, a2, z3, h = forward_propagate(X, theta1, theta2)

    # np.multiply is pairwise operation
    pair_computation = -np.multiply(y, np.log(h)) - np.multiply((1 - y), np.log(1 - h))

    return pair_computation.sum() / m


In [131]:
def sigmoid_gradient(z):
    return np.multiply(sigmoid(z), (1 - sigmoid(z)))

In [132]:
cost(X, y, theta1, theta2)

42.001382958626145

In [133]:
def back_propagate(X, y, theta1, theta2):
    X = np.matrix(X)
    y = np.matrix(y)
    m = X.shape[0]
    a1, z2, a2, z3, h = forward_propagate(X, theta1, theta2)
    
    # initializations
    J = 0
    delta1 = np.zeros(theta1.shape)  # (25, 401)
    delta2 = np.zeros(theta2.shape)  # (10, 26)
    
     # compute the cost
    for i in range(m):
        first_term = np.multiply(-y[i,:], np.log(h[i,:]))
        second_term = np.multiply((1 - y[i,:]), np.log(1 - h[i,:]))
        J += np.sum(first_term - second_term)
    
    J = J / m
    
    # add the cost regularization term
    J += (float(1) / (2 * m)) * (np.sum(np.power(theta1[:,1:], 2)) + np.sum(np.power(theta2[:,1:], 2)))
    

    # perform backpropagation
    for t in range(m):
        a1t = a1[t,:]  # (1, 401)
        z2t = z2[t,:]  # (1, 25)
        a2t = a2[t,:]  # (1, 26)
        ht = h[t,:]  # (1, 10)
        yt = y[t,:]  # (1, 10)
        
        d3t = ht - yt  # (1, 10)
        
        z2t = np.insert(z2t, 0, values=np.ones(1))  # (1, 26)
        d2t = np.multiply((theta2.T * d3t.T).T, sigmoid_gradient(z2t))  # (1, 26)
        
        delta1 = delta1 + (d2t[:,1:]).T * a1t
        delta2 = delta2 + d3t.T * a2t
        
    delta1 = delta1 / m
    delta2 = delta2 / m
    
    # unravel the gradient matrices into a single array
    grad = np.concatenate((np.ravel(delta1), np.ravel(delta2)))
    
    return J, grad
    

In [134]:
J, grad = back_propagate(X, y, theta1, theta2)
J, grad.shape

(42.09752365255575, (10285,))