In [1]:
import numpy as np

In [2]:
def initialize_parameters(layer_dims):
    """
    Builds the initial weights and biases of a fully connected network.

    Every weight matrix is sampled from a standard normal distribution and divided by
    the square root of the size of the previous layer; every bias vector starts at zero.

    Parameters
    ----------
    layer_dims: (list) - the number of units of each layer in the network, input layer first.

    Returns
    -------
    (dict) with keys where 1 <= l <= len(layer_dims) - 1:
        Wl: (ndarray (layer_dims[l], layer_dims[l-1])) - weight matrix for layer l
        bl: (ndarray (layer_dims[l], 1)) - bias vector for layer l
    """

    parameters = {}
    # Pair each layer size with the size of the layer feeding into it.
    consecutive_sizes = zip(layer_dims[:-1], layer_dims[1:])
    for layer, (n_in, n_out) in enumerate(consecutive_sizes, start=1):
        parameters[f'W{layer}'] = np.random.randn(n_out, n_in) / np.sqrt(n_in)
        parameters[f'b{layer}'] = np.zeros((n_out, 1))
    return parameters

In [3]:
def linear_forward(A_prev, W, b):
    """
    Computes the affine (pre-activation) output of one layer: W @ A_prev + b.

    Parameters
    ----------
    A_prev: (ndarray (size of previous layer, number of examples)) - activations from previous layer
    W: (ndarray (size of current layer, size of previous layer)) - weight matrix
    b: (ndarray (size of current layer, 1)) - bias vector

    Returns
    -------
    Z: (ndarray (size of current layer, number of examples)) - the input to the activation function
    cache: (tuple) - (A_prev, W, b), kept for backpropagation
    """

    pre_activation = W @ A_prev + b
    # The inputs are handed back untouched so the backward pass can reuse them.
    return pre_activation, (A_prev, W, b)

In [4]:
def sigmoid(Z):
    """
    Implements the sigmoid activation in a numerically stable way.

    Parameters
    ----------
    Z: (ndarray of any shape) - input to the activation function

    Returns
    -------
    A: (ndarray of same shape as Z) - output of the activation function
    cache: (ndarray) - returning Z for backpropagation
    """

    # exp(-|Z|) always lies in (0, 1], so it can never overflow, unlike exp(-Z)
    # for large negative Z.
    decay = np.exp(-np.abs(Z))
    # For Z >= 0: 1 / (1 + exp(-Z)).  For Z < 0 the equivalent exp(Z) / (1 + exp(Z)).
    A = np.where(Z >= 0, 1 / (1 + decay), decay / (1 + decay))
    cache = Z
    return A, cache


def tanh(Z):
    """
    Implements the tanh activation.

    Parameters
    ----------
    Z: (ndarray of any shape) - input to the activation function

    Returns
    -------
    A: (ndarray of same shape as Z) - output of the activation function
    cache: (ndarray) - returning Z for backpropagation
    """

    # np.tanh saturates cleanly at +/-1; the explicit exp-based quotient turns
    # into inf / inf = NaN once |Z| is large enough for exp to overflow.
    A = np.tanh(Z)
    cache = Z
    return A, cache


def relu(Z):
    """
    Applies the rectified linear unit element-wise: max(0, Z).

    Parameters
    ----------
    Z: (ndarray of any shape) - input to the activation function

    Returns
    -------
    A: (ndarray of same shape as Z) - output of the activation function
    cache: (ndarray) - Z itself, kept for backpropagation
    """

    rectified = np.maximum(0, Z)
    return rectified, Z


def leaky_relu(Z, negative_slope=0.01):
    """
    Applies the Leaky ReLU element-wise: Z where Z is positive, negative_slope * Z elsewhere.

    Parameters
    ----------
    Z: (ndarray of any shape) - input to the activation function
    negative_slope: (float) - the slope for negative values

    Returns
    -------
    A: (ndarray of same shape as Z) - output of the activation function
    cache: (ndarray) - Z itself, kept for backpropagation
    """

    # Split Z into its non-negative and non-positive parts and scale only the latter.
    positive_part = np.maximum(0, Z)
    negative_part = np.minimum(0, Z)
    return positive_part + negative_slope * negative_part, Z

In [5]:
def linear_activation_forward(A_prev, W, b, activation_function):
    """
    Runs one full layer forward: the linear step followed by the chosen activation.

    Parameters
    ----------
    A_prev: (ndarray (size of previous layer, number of examples)) - activations from previous layer
    W: (ndarray (size of current layer, size of previous layer)) - weight matrix
    b: (ndarray (size of current layer, 1)) - bias vector
    activation_function: (str) - one of 'sigmoid', 'tanh', 'relu' or 'leaky_relu'
        ('leaky_relu' is applied with its default negative slope)

    Returns
    -------
    A: (ndarray (size of current layer, number of examples)) - the output of the activation function
    cache: (tuple) - containing linear_cache (A_prev, W, b) and activation_cache (Z) for backpropagation

    Raises
    ------
    ValueError - if activation_function is not one of the supported names
    """

    # The linear step runs first, before the activation name is validated.
    pre_activation, linear_cache = linear_forward(A_prev, W, b)

    # Pick the activation, then apply it in a single place.
    if activation_function == 'sigmoid':
        activate = sigmoid
    elif activation_function == 'tanh':
        activate = tanh
    elif activation_function == 'relu':
        activate = relu
    elif activation_function == 'leaky_relu':
        activate = leaky_relu
    else:
        raise ValueError(f'Activation function {activation_function} not supported.')

    output, activation_cache = activate(pre_activation)
    return output, (linear_cache, activation_cache)

In [6]:
def model_forward(X, parameters, activation_functions):
    """
    Pushes the input through every layer of the network in order.

    Parameters
    ----------
    X: (ndarray (input size, number of examples)) - input data
    parameters: (dict) - output of initialize_parameters()
    activation_functions: (list) - the activation function for each layer. The first element is unused.

    Returns
    -------
    AL: (ndarray (output size, number of examples)) - the output of the last layer
    caches: (list of tuples) - one cache per layer, in layer order
    """

    caches = []
    activation = X
    # Entry 0 belongs to the input layer, so the real layers start at index 1.
    for layer, name in enumerate(activation_functions[1:], start=1):
        activation, layer_cache = linear_activation_forward(
            activation, parameters[f'W{layer}'], parameters[f'b{layer}'], name
        )
        caches.append(layer_cache)
    return activation, caches

In [7]:
def compute_cost(AL, Y):
    """
    Computes the cross-entropy loss.

    Parameters
    ----------
    AL: (ndarray (1, number of examples)) - the output of the last layer
    Y: (ndarray (1, number of examples)) - true labels

    Returns
    -------
    cost: (ndarray with singleton dimensions removed) - the cross-entropy cost,
        averaged over the examples
    """

    m = Y.shape[1]
    # Keep the probabilities strictly inside (0, 1). A saturated output of exactly
    # 0 or 1 would otherwise produce log(0) = -inf and 0 * -inf = NaN.
    eps = 1e-15
    AL_safe = np.clip(AL, eps, 1 - eps)
    cost = -(1 / m) * np.sum(Y * np.log(AL_safe) + (1 - Y) * np.log(1 - AL_safe), axis=1, keepdims=True)
    cost = np.squeeze(cost)
    return cost

In [8]:
def linear_backward(dZ, cache):
    """
    Backpropagates through the linear step of a single layer.

    Parameters
    ----------
    dZ: (ndarray (size of current layer, number of examples)) - gradient of the cost with respect to the linear output
    cache: (tuple) - (A_prev, W, b) saved by the forward propagation

    Returns
    -------
    dA_prev: (ndarray (size of previous layer, number of examples)) - gradient of the cost with respect to the activation from the previous layer
    dW: (ndarray (size of current layer, size of previous layer)) - gradient of the cost with respect to W
    db: (ndarray (size of current layer, 1)) - gradient of the cost with respect to b
    """

    # The bias is part of the cache but is not needed for any of the gradients.
    A_prev, W, _ = cache
    grad_activation = W.T @ dZ
    grad_weights = dZ @ A_prev.T
    grad_bias = np.sum(dZ, axis=1, keepdims=True)
    return grad_activation, grad_weights, grad_bias

In [9]:
def sigmoid_backward(dA, cache):
    """
    Implements the backward propagation for a single sigmoid unit.

    Parameters
    ----------
    dA: (ndarray of any shape) - post-activation gradient
    cache: (ndarray) - Z from the forward propagation

    Returns
    -------
    dZ: (ndarray of the same shape as A) - gradient of the cost with respect to Z
    """

    Z = cache
    # sigmoid'(Z) = e / (1 + e)^2 with e = exp(-|Z|). The derivative is symmetric
    # in Z, and exp(-|Z|) lies in (0, 1], so this form cannot overflow the way
    # exp(-Z) does for large negative Z.
    decay = np.exp(-np.abs(Z))
    g_prime = decay / (1 + decay) ** 2
    dZ = dA * g_prime
    return dZ


def tanh_backward(dA, cache):
    """
    Implements the backward propagation for a single tanh unit.

    Parameters
    ----------
    dA: (ndarray of any shape) - post-activation gradient
    cache: (ndarray) - Z from the forward propagation

    Returns
    -------
    dZ: (ndarray of the same shape as A) - gradient of the cost with respect to Z
    """

    Z = cache
    # np.tanh saturates at +/-1 for large |Z|, giving a derivative of 0 there.
    # The explicit exp-based quotient would evaluate to inf / inf = NaN instead.
    g = np.tanh(Z)
    g_prime = 1 - g ** 2
    dZ = dA * g_prime
    return dZ


def relu_backward(dA, cache):
    """
    Backpropagates through a ReLU unit: the incoming gradient is zeroed wherever Z was negative.

    Parameters
    ----------
    dA: (ndarray of any shape) - post-activation gradient
    cache: (ndarray) - Z from the forward propagation

    Returns
    -------
    dZ: (ndarray of the same shape as A) - gradient of the cost with respect to Z
    """

    # Work on a copy so the caller's dA is left untouched.
    grad = np.copy(dA)
    was_negative = cache < 0
    grad[was_negative] = 0
    return grad


def leaky_relu_backward(dA, cache, negative_slope=0.01):
    """
    Implements the backward propagation for a single Leaky ReLU unit.

    Parameters
    ----------
    dA: (ndarray of any shape) - post-activation gradient
    cache: (ndarray) - Z from the forward propagation
    negative_slope: (float) - the slope for negative values

    Returns
    -------
    dZ: (ndarray of the same shape as A) - gradient of the cost with respect to Z
    """

    Z = cache
    dA = np.asarray(dA)
    # Chain rule: where Z < 0 the local derivative is negative_slope, so the
    # incoming gradient is scaled by it (not replaced by it). Elsewhere the
    # gradient passes through unchanged. np.where builds a new array, so the
    # caller's dA is not modified.
    dZ = np.where(Z < 0, negative_slope * dA, dA)
    return dZ

In [10]:
def linear_activation_backward(dA, cache, activation_function):
    """
    Runs one full layer backward: through the activation first, then through the linear step.

    Parameters
    ----------
    dA: (ndarray (size of current layer, number of examples)) - post-activation gradient for current layer
    cache: (tuple) - containing linear_cache (A_prev, W, b) and activation_cache (Z) for backpropagation
    activation_function: (str) - one of 'sigmoid', 'tanh', 'relu' or 'leaky_relu'
        ('leaky_relu' is differentiated with its default negative slope)

    Returns
    -------
    dA_prev: (ndarray (size of previous layer, number of examples)) - gradient of the cost with respect to the activation from the previous layer
    dW: (ndarray (size of current layer, size of previous layer)) - gradient of the cost with respect to W
    db: (ndarray (size of current layer, 1)) - gradient of the cost with respect to b

    Raises
    ------
    ValueError - if activation_function is not one of the supported names
    """

    linear_cache, activation_cache = cache

    # Pick the matching derivative routine, then apply it in a single place.
    if activation_function == 'sigmoid':
        activation_backward = sigmoid_backward
    elif activation_function == 'tanh':
        activation_backward = tanh_backward
    elif activation_function == 'relu':
        activation_backward = relu_backward
    elif activation_function == 'leaky_relu':
        activation_backward = leaky_relu_backward
    else:
        raise ValueError(f'Activation function {activation_function} not supported.')

    dZ = activation_backward(dA, activation_cache)
    return linear_backward(dZ, linear_cache)

In [11]:
def model_backward(AL, Y, caches, activation_functions):
    """
    Implements the backward propagation for the entire network.

    Parameters
    ----------
    AL: (ndarray (output size, number of examples)) - the output of the last layer
    Y: (ndarray (output size, number of examples)) - true labels
    caches: (list of tuples) - containing linear_cache (A_prev, W, b) and activation_cache (Z) for each layer
    activation_functions: (list) - the activation function for each layer. The first element is unused.

    Returns
    -------
    gradients: (dict) with keys where 1 <= l <= len(activation_functions) - 1:
        dA{l-1}: (ndarray (size of previous layer, number of examples)) - gradient of the cost with respect to the activation for previous layer l - 1
        dWl: (ndarray (size of current layer, size of previous layer)) - gradient of the cost with respect to W for layer l
        dbl: (ndarray (size of current layer, 1)) - gradient of the cost with respect to b for layer l
    """

    gradients = {}
    L = len(activation_functions)
    m = AL.shape[1]
    # Keep the probabilities strictly inside (0, 1). An output saturated at exactly
    # 0 or 1 would otherwise divide by zero and push inf/NaN into every gradient.
    eps = 1e-15
    AL_safe = np.clip(AL, eps, 1 - eps)
    # Derivative of the averaged cross-entropy cost with respect to AL; the 1/m
    # factor is applied here, once, at the start of the backward pass.
    dAL = -(1 / m) * (np.divide(Y, AL_safe) - np.divide(1 - Y, 1 - AL_safe))
    dA_prev = dAL
    # Walk the layers from last to first; caches[l - 1] belongs to layer l.
    for l in reversed(range(1, L)):
        current_cache = caches[l - 1]
        dA_prev, dW, db = linear_activation_backward(dA_prev, current_cache, activation_functions[l])
        gradients[f'dA{l - 1}'] = dA_prev
        gradients[f'dW{l}'] = dW
        gradients[f'db{l}'] = db
    return gradients

In [12]:
def update_parameters(parameters, gradients, learning_rate):
    """
    Applies one gradient descent step to every weight matrix and bias vector.

    Parameters
    ----------
    parameters: (dict) - containing the parameters (keys Wl and bl)
    gradients: (dict) - containing the gradients (keys dWl and dbl)
    learning_rate: (float) - the learning rate

    Returns
    -------
    updated_parameters: (dict) - a new dict containing the updated parameters
    """

    # Each layer contributes one W and one b entry, hence half the dict size.
    num_layers = len(parameters) // 2
    updated_parameters = parameters.copy()
    for layer in range(1, num_layers + 1):
        for prefix in ('W', 'b'):
            key = f'{prefix}{layer}'
            updated_parameters[key] = parameters[key] - learning_rate * gradients[f'd{key}']
    return updated_parameters

In [13]:
def nn_model(X, Y, init_parameters, layer_activation_functions, learning_rate, num_iterations):
    """
    Trains a neural network with batch gradient descent.

    Parameters
    ----------
    X: (ndarray (input size, number of examples)) - input data
    Y: (ndarray (output size, number of examples)) - true labels
    init_parameters: (dict) - the initial parameters for the network
    layer_activation_functions: (list) - the activation function for each layer. The first element is unused.
    learning_rate: (float) - the learning rate
    num_iterations: (int) - the number of iterations

    Returns
    -------
    parameters: (dict) - the learned parameters
    costs: (list) - the costs at every 100th iteration and at the final iteration
    """

    costs = []
    # Shallow copy so the caller's dict is not rebound by the training loop.
    parameters = init_parameters.copy()

    for i in range(num_iterations):
        AL, caches = model_forward(X, parameters, layer_activation_functions)
        # The cost is measured on the parameters as they were before this
        # iteration's update.
        cost = compute_cost(AL, Y)
        gradients = model_backward(AL, Y, caches, layer_activation_functions)
        parameters = update_parameters(parameters, gradients, learning_rate)

        # The last loop index is num_iterations - 1; comparing against
        # num_iterations itself could never match, so the final cost was lost.
        if i % 100 == 0 or i == num_iterations - 1:
            costs.append(cost)

    return parameters, costs

In [14]:
def nn_model_predict(X, parameters, activation_functions):
    """
    Runs the network forward and thresholds its output at 0.5 to obtain binary labels.

    Parameters
    ----------
    X: (ndarray (input size, number of examples)) - input data
    parameters: (dict) - the learned parameters
    activation_functions: (list) - the activation function for each layer. The first element is unused.

    Returns
    -------
    predictions: (ndarray, same shape as the network output) - 1 where the output is above 0.5, 0 where it is at or below 0.5
    """

    probabilities, _ = model_forward(X, parameters, activation_functions)
    # Threshold a copy so the forward-pass output itself is left unmodified.
    predictions = probabilities.copy()
    above_threshold = predictions > 0.5
    at_or_below_threshold = predictions <= 0.5
    predictions[above_threshold] = 1
    predictions[at_or_below_threshold] = 0
    return predictions