## Import libraries

In [1]:
import numpy as np
from matplotlib import pyplot as plt

## Initialize the weights and bias parameters

In [2]:
def init_params(layer_dims):
    '''Create the initial weights and biases for every layer of the network.

    layer_dims is a sequence of layer sizes; entry 0 is the input size and
    each later entry is the number of units in that layer.

    Returns a dict with, for every layer index l >= 1:
        'W<l>' : array of shape (layer_dims[l], layer_dims[l-1]) holding
                 standard-normal draws scaled by 0.01
        'b<l>' : array of zeros of shape (layer_dims[l], 1)
    '''
    # Re-seed NumPy's global generator on every call so the same starting
    # weights are produced each time this function runs.
    np.random.seed(3)

    params = {}

    # Walk consecutive (fan_in, fan_out) pairs; numbering starts at layer 1.
    consecutive_sizes = zip(layer_dims[:-1], layer_dims[1:])
    for idx, (fan_in, fan_out) in enumerate(consecutive_sizes, start=1):
        params['W' + str(idx)] = np.random.randn(fan_out, fan_in) * 0.01
        params['b' + str(idx)] = np.zeros((fan_out, 1))

    return params

## Implement forward propagation

To compute forward propagation, we first need to define the sigmoid activation function.

In [3]:
def sigmoid(Z):
    '''Apply the logistic function 1 / (1 + e^(-Z)) element-wise.

    Returns a tuple (A, cache): A is the activation value and cache is the
    input Z itself, handed back unchanged so the backward pass can reuse it.
    '''
    # np.dot with a scalar first argument is a plain scaling, i.e. -Z.
    negated = np.dot(-1, Z)
    denominator = 1 + np.exp(negated)

    return 1 / denominator, Z

Now implement forward propagation to compute activation values

In [4]:
def forward_prop(X, params):
    '''Run the forward pass through every layer of the network.

    X is the input fed to layer 1. params is a dict holding 'W<l>' and
    'b<l>' for each layer; the layer count is taken as len(params) // 2.

    Each layer computes Z = W . A_prev + b and then applies sigmoid; the
    result becomes the input of the following layer.

    Returns (A, caches): A is the activation of the last layer and caches
    is a list with one ((A_prev, W, b), Z) entry per layer, in layer order,
    kept for backpropagation.
    '''
    num_layers = len(params) // 2
    activation = X
    caches = []

    for idx in range(1, num_layers + 1):
        W = params['W' + str(idx)]
        b = params['b' + str(idx)]

        # The previous layer's output (X for the first layer) is this
        # layer's input.
        layer_input = activation

        # Linear step followed by the sigmoid non-linearity.
        Z = np.dot(W, layer_input) + b
        activation, Z_cache = sigmoid(Z)

        # Keep both the linear inputs and Z for the backward pass.
        caches.append(((layer_input, W, b), Z_cache))

    return activation, caches

## Compute Cost Function J(θ)

In [5]:
def cost_function(A, Y):
    '''Compute the binary cross-entropy cost between predictions and labels.

    A : array of predicted activations, element-wise compatible with Y.
    Y : array of labels; Y.shape[1] is used as the number of examples m.

    Returns the scalar cost
        J = -(1/m) * sum(Y * log(A) + (1 - Y) * log(1 - A))
    as a Python float, so it can be appended to a history list and plotted
    directly.
    '''
    m = Y.shape[1]

    # Both logarithms must come from NumPy: a bare `log` is not defined in
    # this notebook and raised NameError in the second term.
    log_likelihood = Y * np.log(A) + (1 - Y) * np.log(1 - A)

    # Summing the element-wise terms equals the dot-product form for a
    # single output row and yields a true scalar rather than a 1x1 matrix.
    cost = -np.sum(log_likelihood) / m

    return float(cost)

## Implement backpropagation to compute the partial derivatives of the cost function

In [6]:
def one_layer_backward(dA, cache):
    '''Run backpropagation through one sigmoid layer.

    dA    : gradient of the cost with respect to this layer's activation.
    cache : (linear_cache, activation_cache) as stored by the forward pass,
            where linear_cache is (A_prev, W, b) and activation_cache is Z.

    Returns (dA_prev, dW, db): the gradients with respect to the previous
    layer's activation, this layer's weights and this layer's bias. dW and
    db are averaged over the m = A_prev.shape[1] examples.
    '''
    linear_cache, activation_cache = cache
    A_prev, W, _ = linear_cache  # the bias itself is not needed for gradients
    Z = activation_cache

    # Chain rule: dZ = dA * sigmoid'(Z), with sigmoid'(Z) = s * (1 - s).
    # The sigmoid value is computed inline because the notebook's sigmoid()
    # helper returns an (A, cache) tuple, and doing arithmetic directly on
    # that tuple raises TypeError.
    s = 1 / (1 + np.exp(-Z))
    dZ = dA * s * (1 - s)

    m = A_prev.shape[1]

    # Gradients of the cost with respect to weights, bias and the previous
    # activation respectively.
    dW = (1/m)*np.dot(dZ, A_prev.T)
    db = (1/m)*np.sum(dZ, axis=1, keepdims=True)
    dA_prev = np.dot(W.T, dZ)

    return dA_prev, dW, db

## Use gradient descent with backpropagation to minimize J(θ)

In [7]:
def backprop(AL, Y, caches):
    '''Run backpropagation through the whole network and collect gradients.

    AL     : activation of the last layer (the network's prediction).
    Y      : labels; reshaped to AL's shape before use.
    caches : list of per-layer caches from the forward pass, where
             caches[l-1] belongs to layer l.

    Returns a dict grads where, for every layer l in 1..L (L = len(caches)):
        grads['dW<l>'], grads['db<l>'] : gradients for that layer's W and b
        grads['dA<l-1>']               : gradient with respect to the
                                         activation feeding that layer
    '''
    grads = {}

    L = len(caches)
    Y = Y.reshape(AL.shape)

    # Derivative of the binary cross-entropy loss with respect to AL.
    dAL = -(np.divide(Y, AL) - np.divide(1-Y, 1-AL))

    # Output layer L uses caches[L-1]. Its weight and bias gradients belong
    # under index L; storing them under L-1 would leave dW<L>/db<L> missing
    # and be overwritten by the first loop iteration below.
    current_cache = caches[L-1]
    grads['dA'+str(L-1)], grads['dW'+str(L)], grads['db'+str(L)] = one_layer_backward(dAL, current_cache)

    # Remaining layers, from L-1 down to 1: caches[l] belongs to layer l+1.
    for l in reversed(range(L-1)):
        current_cache = caches[l]
        dA_prev_temp, dW_temp, db_temp = one_layer_backward(grads["dA" + str(l+1)], current_cache)
        grads["dA" + str(l)] = dA_prev_temp
        grads["dW" + str(l + 1)] = dW_temp
        grads["db" + str(l + 1)] = db_temp

    return grads

Implement gradient descent by updating the parameters 

In [8]:
def update_parameters(parameters, grads, learning_rate):
    '''Apply one gradient-descent step to every layer's weights and biases.

    parameters    : dict with 'W<l>' and 'b<l>' for each layer; the layer
                    count is taken as len(parameters) // 2.
    grads         : dict with 'dW<l>' and 'db<l>' for each layer, as
                    produced by backprop.
    learning_rate : step size multiplying each gradient.

    The entries of `parameters` are reassigned in place and the same dict is
    returned.
    '''
    L = len(parameters) // 2

    for l in range(1, L + 1):
        layer = str(l)
        # Gradients are stored under 'dW<l>' / 'db<l>'; looking them up as
        # 'W<l>' / 'b<l>' raised KeyError.
        parameters['W' + layer] = parameters['W' + layer] - learning_rate * grads['dW' + layer]
        parameters['b' + layer] = parameters['b' + layer] - learning_rate * grads['db' + layer]

    return parameters

## Create model training function

In [9]:
def train(X, Y, layer_dims, epochs, lr):
    '''Train the network with full-batch gradient descent.

    X          : training inputs, passed to forward_prop.
    Y          : training labels, passed to cost_function and backprop.
    layer_dims : sequence of layer sizes, passed to init_params.
    epochs     : number of passes over the data (integer).
    lr         : learning rate used for each parameter update.

    Returns (params, cost_history): the parameters after the final update
    and a list holding the cost measured at the start of every epoch.
    '''
    weights = init_params(layer_dims)
    cost_history = []

    for _ in range(epochs):
        # Forward pass, then record the cost before the weights change.
        predictions, layer_caches = forward_prop(X, weights)
        cost_history.append(cost_function(predictions, Y))

        # Backward pass followed by one gradient-descent step.
        gradients = backprop(predictions, Y, layer_caches)
        weights = update_parameters(weights, gradients, lr)

    return weights, cost_history