In [1]:
import numpy as np
from utils import *

In [2]:
def parse_params(parameters):
    """Unpack the RNN parameter arrays from the ``parameters`` dictionary.

    Returns:
    A tuple (Wax, Waa, ba, Wya, by), always in that order.
    A missing key raises KeyError.
    """
    ordered_keys = ("Wax", "Waa", "ba", "Wya", "by")
    return tuple(parameters[key] for key in ordered_keys)

![RNN Cell Forward](rnn/rnn_step_forward.png)

In [3]:
def rnn_cell_forward(Xt, a_prev, parameters):
    """
    Run one forward time-step of the RNN cell.

    Arguments:
    Xt -- input data at timestep "t", numpy array of shape (n_x, m)
    a_prev -- hidden state at timestep "t-1", numpy array of shape (n_a, m)
    parameters -- python dictionary containing:
                        Wax -- input-to-hidden weights, numpy array of shape (n_a, n_x)
                        Waa -- hidden-to-hidden weights, numpy array of shape (n_a, n_a)
                        Wya -- hidden-to-output weights, numpy array of shape (n_y, n_a)
                        ba --  hidden bias, numpy array of shape (n_a, 1)
                        by -- output bias, numpy array of shape (n_y, 1)

    Returns:
    a_next -- next hidden state, tanh(Wax.Xt + Waa.a_prev + ba), of shape (n_a, m)
    yt_hat -- prediction at timestep "t", softmax(Wya.a_next + by), of shape (n_y, m)
    """
    Wax, Waa, ba, Wya, by = parse_params(parameters)

    # Hidden state: squash the input and recurrent contributions plus the bias.
    pre_activation = np.dot(Wax, Xt) + np.dot(Waa, a_prev) + ba
    a_next = np.tanh(pre_activation)

    # Prediction: softmax (provided by utils) over the affine read-out of the new state.
    output_scores = np.dot(Wya, a_next) + by
    yt_hat = softmax(output_scores)

    return a_next, yt_hat

In [4]:
# Smoke test for rnn_cell_forward on random data.
# The seed and the order of the randn() draws below determine the printed values,
# so do not reorder these statements.
np.random.seed(1)
xt = np.random.randn(3,10)  # n_x = 3 input features, m = 10 examples
a_prev = np.random.randn(5,10)  # n_a = 5 hidden units
Waa = np.random.randn(5,5)
Wax = np.random.randn(5,3)
Wya = np.random.randn(2,5)  # n_y = 2 outputs
ba = np.random.randn(5,1)
by = np.random.randn(2,1)
parameters = {"Waa": Waa, "Wax": Wax, "Wya": Wya, "ba": ba, "by": by}

# Run a single cell step and print one row of each result plus the shapes.
a_next, yt_pred = rnn_cell_forward(xt, a_prev, parameters)
print("a_next[4] = ", a_next[4])
print("a_next.shape = ", a_next.shape)
print("yt_pred[1] =", yt_pred[1])
print("yt_pred.shape = ", yt_pred.shape)

a_next[4] =  [ 0.59584544  0.18141802  0.61311866  0.99808218  0.85016201  0.99980978
 -0.18887155  0.99815551  0.6531151   0.82872037]
a_next.shape =  (5, 10)
yt_pred[1] = [0.9888161  0.01682021 0.21140899 0.36817467 0.98988387 0.88945212
 0.36920224 0.9966312  0.9982559  0.17746526]
yt_pred.shape =  (2, 10)


**Expected Output**: 

<table>
    <tr>
        <td>
            **a_next[4]**:
        </td>
        <td>
           [ 0.59584544  0.18141802  0.61311866  0.99808218  0.85016201  0.99980978
 -0.18887155  0.99815551  0.6531151   0.82872037]
        </td>
    </tr>
        <tr>
        <td>
            **a_next.shape**:
        </td>
        <td>
           (5, 10)
        </td>
    </tr>
        <tr>
        <td>
            **yt_pred[1]**:
        </td>
        <td>
           [ 0.9888161   0.01682021  0.21140899  0.36817467  0.98988387  0.88945212
  0.36920224  0.9966312   0.9982559   0.17746526]
        </td>
    </tr>
        <tr>
        <td>
            **yt_pred.shape**:
        </td>
        <td>
           (2, 10)
        </td>
    </tr>

</table>

![Basic RNN](rnn/rnn.png)

In [5]:
def rnn_predict(X, a0, parameters):
    """
    Run the RNN forward over every time-step of the input sequence.

    Arguments:
    X -- Input data for every time-step, of shape (n_x, m, T_x).
    a0 -- Initial hidden state, of shape (n_a, m)
    parameters -- python dictionary containing:
                        Waa -- hidden-to-hidden weights, numpy array of shape (n_a, n_a)
                        Wax -- input-to-hidden weights, numpy array of shape (n_a, n_x)
                        Wya -- hidden-to-output weights, numpy array of shape (n_y, n_a)
                        ba --  hidden bias, numpy array of shape (n_a, 1)
                        by -- output bias, numpy array of shape (n_y, 1)

    Returns:
    a -- Hidden states after each time-step, numpy array of shape (n_a, m, T_x).
         a[:, :, t] is the state produced by step t; a0 itself is not stored.
    Y_hat -- Predictions for every time-step, numpy array of shape (n_y, m, T_x)
    """
    # Batch size and sequence length come from X; layer sizes from the read-out weights.
    _, m, T_x = X.shape
    n_y, n_a = parameters["Wya"].shape

    # Pre-allocate storage for the per-step hidden states and predictions.
    hidden_states = np.zeros((n_a, m, T_x))
    predictions = np.zeros((n_y, m, T_x))

    # Thread the hidden state through the sequence, one cell step at a time.
    current_state = a0
    for step in range(T_x):
        current_state, step_prediction = rnn_cell_forward(X[:, :, step], current_state, parameters)
        hidden_states[:, :, step] = current_state
        predictions[:, :, step] = step_prediction

    return hidden_states, predictions

In [6]:
# Smoke test for rnn_predict on random data.
# The seed and the order of the randn() draws below determine the printed values,
# so do not reorder these statements.
np.random.seed(1)
x = np.random.randn(3,10,4)  # n_x = 3 input features, m = 10 examples, T_x = 4 time-steps
a0 = np.random.randn(5,10)  # n_a = 5 hidden units
Waa = np.random.randn(5,5)
Wax = np.random.randn(5,3)
Wya = np.random.randn(2,5)  # n_y = 2 outputs
ba = np.random.randn(5,1)
by = np.random.randn(2,1)
parameters = {"Waa": Waa, "Wax": Wax, "Wya": Wya, "ba": ba, "by": by}

# Run the full forward pass and print one slice of each result plus the shapes.
a, Y_predict = rnn_predict(x, a0, parameters)
print("a[4][1] = ", a[4][1])
print("a.shape = ", a.shape)
print("Y_hat[1][3] =", Y_predict[1][3])
print("Y_hat.shape = ", Y_predict.shape)

a[4][1] =  [-0.99999375  0.77911235 -0.99861469 -0.99833267]
a.shape =  (5, 10, 4)
Y_hat[1][3] = [0.79560373 0.86224861 0.11118257 0.81515947]
Y_hat.shape =  (2, 10, 4)


**Expected Output**:

<table>
    <tr>
        <td>
            **a[4][1]**:
        </td>
        <td>
           [-0.99999375  0.77911235 -0.99861469 -0.99833267]
        </td>
    </tr>
        <tr>
        <td>
            **a.shape**:
        </td>
        <td>
           (5, 10, 4)
        </td>
    </tr>
        <tr>
        <td>
            **Y_hat[1][3]**:
        </td>
        <td>
           [ 0.79560373  0.86224861  0.11118257  0.81515947]
        </td>
    </tr>
        <tr>
        <td>
            **Y_hat.shape**:
        </td>
        <td>
           (2, 10, 4)
        </td>
    </tr>
</table>

In [7]:
def rnn_forward(X, Y, a0, parameters):
    """
    Run the RNN forward over the whole sequence and compute the summed loss.

    Arguments:
    X -- Input data for every time-step, of shape (n_x, m, T_x).
    Y -- Target data for every time-step; combined element-wise with Y_hat,
         so it must be broadcast-compatible with shape (n_y, m, T_x)
    a0 -- Initial hidden state, of shape (n_a, m)
    parameters -- python dictionary containing:
                        Waa -- hidden-to-hidden weights, numpy array of shape (n_a, n_a)
                        Wax -- input-to-hidden weights, numpy array of shape (n_a, n_x)
                        Wya -- hidden-to-output weights, numpy array of shape (n_y, n_a)
                        ba --  hidden bias, numpy array of shape (n_a, 1)
                        by -- output bias, numpy array of shape (n_y, 1)

    Returns:
    loss -- scalar, the loss summed over all outputs, examples and time-steps
    a -- hidden states for every time-step, as returned by rnn_predict
    Y_hat -- predictions for every time-step, as returned by rnn_predict
    """
    a, Y_hat = rnn_predict(X, a0, parameters)

    # Element-wise binary cross-entropy, summed over every entry.
    # NOTE(review): Y_hat comes from a softmax and rnn_backward uses Y_hat - Y as the
    # output gradient, which matches the categorical form -sum(Y * log(Y_hat)) --
    # confirm that the binary form here is intended.
    target_term = Y * np.log(Y_hat)
    complement_term = (1 - Y) * np.log(1 - Y_hat)
    loss = np.sum(-target_term - complement_term)

    return loss, a, Y_hat

![RNN Backward](rnn/rnn_cell_backprop.png)

In [8]:
def rnn_cell_backward(dy, da_next, Xt, a_prev, at, parameters, gradients):
    """
    Implements the backward pass for the RNN-cell (single time-step).

    Arguments:
    dy -- Gradient of the loss with respect to the output pre-activation
          (Wya.at + by) at this time-step, of shape (n_y, m)
    da_next -- Gradient of the loss with respect to the hidden state "at" flowing
               back from the following time-step, of shape (n_a, m)
    Xt -- input at time-step t, of shape (n_x, m)
    a_prev -- hidden state at time-step t-1, of shape (n_a, m)
    at -- hidden state at time-step t (the tanh output), of shape (n_a, m)
    parameters -- python dictionary; the entries "Wax", "Waa" and "Wya" are read
    gradients -- python dictionary with entries "dWax", "dWaa", "dWya", "dba" and "dby";
                 this step's contributions are ADDED to them in place

    Returns:
    gradients -- the same dictionary object, with the accumulated entries:
                        dWya -- Gradients of hidden-to-output weights, of shape (n_y, n_a)
                        dby -- Gradients of bias of output vector, of shape (n_y, 1)
                        dWax -- Gradients of input-to-hidden weights, of shape (n_a, n_x)
                        dWaa -- Gradients of hidden-to-hidden weights, of shape (n_a, n_a)
                        dba -- Gradients of bias vector, of shape (n_a, 1)
    dXt -- Gradient with respect to the input Xt, of shape (n_x, m)
    da_prev -- Gradient with respect to the previous hidden state, of shape (n_a, m)
    """
    # Only the weight matrices are needed here; the biases do not enter the gradients.
    Wax = parameters["Wax"]
    Waa = parameters["Waa"]
    Wya = parameters["Wya"]

    # Output layer: accumulate the read-out weight and bias gradients.
    gradients['dWya'] += np.dot(dy, at.T)
    gradients['dby'] += np.sum(dy, axis=1, keepdims=True)

    # Total gradient reaching the hidden state: from this step's output and from t+1.
    da = np.dot(Wya.T, dy) + da_next

    # Backprop through the tanh non-linearity: d/dz tanh(z) = 1 - tanh(z)^2 = 1 - at^2.
    dtanh = (1 - at ** 2) * da

    # Input branch.
    dXt = np.dot(Wax.T, dtanh)
    gradients['dWax'] += np.dot(dtanh, Xt.T)

    # Recurrent branch.
    da_prev = np.dot(Waa.T, dtanh)
    gradients['dWaa'] += np.dot(dtanh, a_prev.T)

    # Hidden bias: sum over the batch dimension.
    gradients['dba'] += np.sum(dtanh, axis=1, keepdims=True)

    return gradients, dXt, da_prev

In [9]:
def rnn_backward(X, Y, a, Y_hat, parameters):
    """
    Backpropagate through time over the whole sequence.

    Arguments:
    X -- Input data for every time-step, of shape (n_x, m, T_x)
    Y -- Target data, indexed per time-step as Y[:, :, t]
    a -- Hidden states INCLUDING the initial one, of shape (n_a, m, T_x + 1):
         a[:, :, 0] is the initial state and a[:, :, t + 1] is the state produced
         by time-step t. (The array returned by rnn_predict has only T_x slices;
         the initial state must be prepended before calling this function.)
    Y_hat -- Predictions, indexed per time-step as Y_hat[:, :, t]
    parameters -- python dictionary containing Wax, Waa, Wya, ba and by

    Returns:
    gradients -- python dictionary with dWax, dWaa, dWya, dba and dby, each of the
                 same shape as its parameter, summed over all time-steps
    dX -- Gradient with respect to the inputs, same shape as X
    da_next -- Gradient with respect to the initial hidden state a[:, :, 0], of shape (n_a, m)

    Raises:
    ValueError -- if "a" does not hold exactly T_x + 1 hidden states
    """
    n_x, m, T_x = X.shape

    # Step t needs both a[:, :, t] (previous state) and a[:, :, t + 1] (current state),
    # so fail early with a clear message instead of an IndexError inside the loop.
    if a.ndim != 3 or a.shape[2] != T_x + 1:
        raise ValueError(
            "a must have shape (n_a, m, T_x + 1) with the initial hidden state at index 0; "
            "got shape %s for T_x = %d" % (a.shape, T_x)
        )
    n_a = a.shape[0]

    # Retrieve from parameters
    Wax, Waa, ba, Wya, by = parse_params(parameters)

    # Each accumulator starts at zero with the same shape as its parameter.
    gradients = {
        'dWax': np.zeros_like(Wax),
        'dWaa': np.zeros_like(Waa),
        'dWya': np.zeros_like(Wya),
        'dba': np.zeros_like(ba),
        'dby': np.zeros_like(by),
    }

    dX = np.zeros(X.shape)
    # Nothing flows back into the last hidden state from beyond the sequence.
    da_next = np.zeros((n_a, m))

    # Walk the sequence backwards, carrying the hidden-state gradient from step to step.
    for t in reversed(range(T_x)):
        # Output gradient used by this notebook: prediction minus target.
        dy = Y_hat[:, :, t] - Y[:, :, t]
        gradients, dXt, da_next = rnn_cell_backward(
            dy, da_next, X[:, :, t], a[:, :, t], a[:, :, t + 1], parameters, gradients
        )
        dX[:, :, t] = dXt

    return gradients, dX, da_next

In [10]:
def optimize(X, Y, a0, parameters, learning_rate = 0.01):
    """
    Execute one step of the optimization to train the model:
    forward pass -> backpropagation through time -> gradient clipping -> update.

    Arguments:
    X -- Input data for every time-step, of shape (n_x, m, T_x)
    Y -- Target data for every time-step, broadcast-compatible with (n_y, m, T_x)
    a0 -- Initial hidden state, of shape (n_a, m) or (n_a, 1). UPDATED IN PLACE.
    parameters -- python dictionary containing (all UPDATED IN PLACE):
                        Wax -- Weight matrix multiplying the input, numpy array of shape (n_a, n_x)
                        Waa -- Weight matrix multiplying the hidden state, numpy array of shape (n_a, n_a)
                        Wya -- Weight matrix relating the hidden-state to the output, numpy array of shape (n_y, n_a)
                        ba --  Bias, numpy array of shape (n_a, 1)
                        by -- Bias relating the hidden-state to the output, numpy array of shape (n_y, 1)
    learning_rate -- learning rate for the model.

    Returns:
    loss -- value of the loss function computed by rnn_forward (before the update)
    gradients -- python dictionary containing the clipped gradients:
                        dWax -- Gradients of input-to-hidden weights, of shape (n_a, n_x)
                        dWaa -- Gradients of hidden-to-hidden weights, of shape (n_a, n_a)
                        dWya -- Gradients of hidden-to-output weights, of shape (n_y, n_a)
                        dba -- Gradients of bias vector, of shape (n_a, 1)
                        dby -- Gradients of output bias vector, of shape (n_y, 1)
    a0 -- the updated initial hidden state (same object that was passed in)
    dX -- Gradient with respect to the inputs, same shape as X
    """
    # Forward propagate through time
    loss, a, Y_hat = rnn_forward(X, Y, a0, parameters)

    # Backpropagation needs the state *before* each step, so prepend a0 along the
    # time axis: a_with_init[:, :, 0] is a0 and a_with_init[:, :, t + 1] is a[:, :, t].
    n_a, m, _ = a.shape
    a0_batch = np.broadcast_to(a0, (n_a, m))  # lets an (n_a, 1) initial state serve the whole batch
    a_with_init = np.concatenate((a0_batch[:, :, np.newaxis], a), axis=2)

    # Backpropagate through time
    gradients, dX, da0 = rnn_backward(X, Y, a_with_init, Y_hat, parameters)

    # Clip every gradient array in place to [-5, 5] to limit exploding gradients.
    for grad in gradients.values():
        np.clip(grad, -5, 5, out=grad)

    # Gradient-descent update of the parameters (in place).
    parameters['Wax'] -= learning_rate * gradients['dWax']
    parameters['Waa'] -= learning_rate * gradients['dWaa']
    parameters['Wya'] -= learning_rate * gradients['dWya']
    parameters['ba']  -= learning_rate * gradients['dba']
    parameters['by']  -= learning_rate * gradients['dby']

    # The initial hidden state is trained as well. When a single (n_a, 1) state was
    # broadcast over the batch, its gradient is the sum over the batch columns.
    if da0.shape != a0.shape:
        da0 = np.sum(da0, axis=1, keepdims=True)
    a0 -= learning_rate * da0

    return loss, gradients, a0, dX

In [11]:
def model(data, n_a, vocab_size, learning_rate, epoch = 1000, batch_size = 1000):
    """
    Trains the RNN with mini-batch gradient descent and returns the learned parameters.

    Arguments:
    data -- training data; shuffled IN PLACE along its first axis and sliced into
            batches along that axis
    n_a -- number of units of the RNN cell
    vocab_size -- size of the vocabulary; used as both the input size n_x and the output size n_y
    learning_rate -- learning rate passed to optimize
    epoch -- number of passes over the data
    batch_size -- number of first-axis entries per batch

    Returns:
    parameters -- learned parameters (dictionary with Wax, Waa, Wya, ba, by)
    """

    # Retrieve n_x and n_y from vocab_size
    n_x = n_y = vocab_size

    # Initialize parameters: small random weights, zero biases.
    parameters = {}
    parameters['Wax'] = np.random.randn(n_a, n_x)*0.01 # input to hidden
    parameters['Waa'] = np.random.randn(n_a, n_a)*0.01 # hidden to hidden
    parameters['Wya'] = np.random.randn(n_y, n_a)*0.01 # hidden to output
    parameters['ba'] = np.zeros((n_a, 1)) # hidden bias
    parameters['by'] = np.zeros((n_y, 1)) # output bias

    # Starting value for the smoothed loss that is maintained during training.
    loss = -np.log(1.0 / vocab_size) * n_a

    # Shuffle data (in place, along the first axis)
    np.random.shuffle(data)

    # Initialize the hidden state of the RNN
    a0 = np.zeros((n_a, 1))

    # Number of batches per epoch; range() needs an int, np.ceil returns a float.
    n_batches = int(np.ceil(float(len(data)) / batch_size))

    # Optimization loop
    for i in range(epoch):
        for j in range(n_batches):
            examples = data[j*batch_size:(j+1)*batch_size]
            # NOTE(review): "examples" is never used -- the split below runs on the
            # full "data" in every iteration. The intended layout of "data" is not
            # visible here, so this line is left unchanged; confirm whether it
            # should split "examples" instead.
            X, Y = np.split(data, 2)

            # Perform one optimization step: Forward-prop -> Backward-prop -> Clip -> Update parameters.
            # optimize returns four values: (loss, gradients, a0, dX).
            curr_loss, gradients, a_prev, dX = optimize(X, Y, a0, parameters, learning_rate)

            # Keep a smoothed running loss (smooth comes from utils).
            loss = smooth(loss, curr_loss)

    return parameters

![Sample RNN](rnn/sample_rnn.png)

In [1]:
def sample(parameters, EOS, max_length=None):
    """
    Sample a sequence of indices from the probability distributions output by the RNN.

    Generation starts from an all-zero input vector and an all-zero hidden state.
    At each step an index is drawn from the softmax output and fed back as a
    one-hot input, until the EOS index is drawn.

    Arguments:
    parameters -- python dictionary containing the parameters Wax, Waa, Wya, ba and by
    EOS -- index that marks the end of a sequence; sampling stops once it is drawn
    max_length -- optional cap on the number of sampled indices. With the default
                  (None) sampling continues until EOS is drawn, however long that takes.

    Returns:
    indices -- list of the sampled indices; it always ends with EOS (EOS is
               appended when sampling stopped because of max_length)
    """

    # Retrieve from parameters
    Wax, Waa, ba, Wya, by = parse_params(parameters)
    vocab_size, n_a = Wya.shape

    # Step 1: the first input is an all-zero vector and the hidden state starts at zero.
    x = np.zeros((vocab_size, 1))
    a_prev = np.zeros((n_a, 1))

    # Sampled indices, in generation order.
    indices = []

    # Last sampled index; -1 means "nothing sampled yet".
    idx = -1

    # Loop over time-steps until EOS is sampled (or the optional cap is reached).
    while idx != EOS and (max_length is None or len(indices) < max_length):

        # Step 2: forward propagate x through one RNN step.
        a = np.tanh(np.dot(Wax, x) + np.dot(Waa, a_prev) + ba)
        z = np.dot(Wya, a) + by
        y = softmax(z)

        # Step 3: sample an index from the probability distribution y.
        idx = np.random.choice(vocab_size, p=y.ravel())
        indices.append(idx)

        # Step 4: the sampled index, one-hot encoded, becomes the next input.
        x = np.zeros((vocab_size, 1))
        x[idx] = 1

        # Carry the hidden state forward.
        a_prev = a

    # Guarantee the sequence is terminated even when the cap stopped the loop.
    if not indices or indices[-1] != EOS:
        indices.append(EOS)

    return indices