In [1]:
%matplotlib inline
%load_ext autoreload
%autoreload 2

In [2]:
from utils import *

# Basic RNNs
## Manual RNN
Output of a single recurrent neuron for a single instance
$$
\mathbf{y}_{(t)} = \phi(\mathbf{x}_{(t)}^T\cdot\mathbf{w}_x + \mathbf{y}_{(t-1)}^T\cdot\mathbf{w}_y + b)
$$
Output of a layer of recurrent neurons for all instances in a minibatch
\begin{eqnarray}
\mathbf{Y}_{(t)} & = & \phi(\mathbf{X}_{(t)}\cdot\mathbf{W}_x + \mathbf{Y}_{(t-1)}\cdot\mathbf{W}_y + \mathbf{b})\\
& = & \phi([\mathbf{X}_{(t)}\,\,\mathbf{Y}_{(t-1})]\cdot\mathbf{W} + \mathbf{b})\,\mathrm{with} \mathbf{W} = \left[\begin{array}{l}\mathbf{W}_x\\\mathbf{W}_y\end{array}\right]
\end{eqnarray}

In [3]:
reset_graph()

n_inputs = 3
n_neurons = 5

X0 = tf.placeholder(tf.float32, [None, n_inputs])
X1 = tf.placeholder(tf.float32, [None, n_inputs])

Wx = tf.Variable(tf.random_normal(shape=[n_inputs, n_neurons], dtype=tf.float32))
Wy = tf.Variable(tf.random_normal(shape=[n_neurons, n_neurons], dtype=tf.float32))
b = tf.Variable(tf.zeros([1, n_neurons], dtype=tf.float32))

Y0 = tf.tanh(tf.matmul(X0, Wx) + b)
Y1 = tf.tanh(tf.matmul(Y0, Wy) + tf.matmul(X1, Wx) + b)

init = tf.global_variables_initializer()

In [4]:
import numpy as np

X0_batch = np.array([[0, 1, 2], [3, 4, 5], [6, 7, 8], [9, 0, 1]]) # t=0
X1_batch = np.array([[9, 8, 7], [0, 0, 0], [6, 5, 4], [3, 2, 1]]) # t=1

with tf.Session() as sess:
    init.run()
    Y0_val, Y1_val = sess.run([Y0, Y1], feed_dict={X0: X0_batch, X1: X1_batch})

In [5]:
print(Y0_val)

[[-0.06640061  0.9625767   0.6810579   0.7091854  -0.89821595]
 [ 0.99777555 -0.71978873 -0.99657613  0.96739244 -0.99989706]
 [ 0.99999785 -0.9989881  -0.99999887  0.9967763  -0.9999999 ]
 [ 1.         -1.         -1.         -0.9981892   0.9995087 ]]


In [6]:
print(Y1_val)

[[ 1.         -1.         -1.          0.40200272 -0.99999994]
 [-0.12210432  0.62805295  0.96718436 -0.9937122  -0.2583932 ]
 [ 0.99999815 -0.9999994  -0.99999744 -0.8594331  -0.99998796]
 [ 0.99928296 -0.9999981  -0.9999059   0.98579615 -0.9220575 ]]


In [7]:
show_graph(tf.get_default_graph())

## Using `static_rnn()`

In [8]:
n_inputs = 3
n_neurons = 5

In [9]:
reset_graph()

X0 = tf.placeholder(tf.float32, [None, n_inputs])
X1 = tf.placeholder(tf.float32, [None, n_inputs])

basic_cell = tf.keras.layers.SimpleRNNCell(units=n_neurons)
output_seqs, states = tf.nn.static_rnn(basic_cell, [X0, X1], dtype=tf.float32)

Y0, Y1 = output_seqs

In [10]:
init = tf.global_variables_initializer()

In [11]:
X0_batch = np.array([[0, 1, 2], [3, 4, 5], [6, 7, 8], [9, 0, 1]])
X1_batcj = np.array([[9, 8, 7], [0, 0, 0], [6, 5, 4], [3, 2, 1]])

with tf.Session() as sess:
    init.run()
    Y0_val, Y1_val = sess.run([Y0, Y1], feed_dict={X0: X0_batch, X1: X1_batch})

In [12]:
Y0_val

array([[ 0.90012544, -0.81816316, -0.7137506 ,  0.96558857, -0.00169486],
       [ 0.8625091 , -0.9449419 , -0.9252927 ,  0.99980336,  0.63831985],
       [ 0.812126  , -0.98410195, -0.9821323 ,  0.99999887,  0.90735716],
       [-0.9999768 ,  0.99998355,  0.9983095 , -0.9997386 ,  0.99999183]],
      dtype=float32)

In [13]:
Y1_val

array([[-0.9327767 ,  0.67243385, -0.6553961 ,  0.99997735,  0.9559362 ],
       [ 0.53530896,  0.8892469 ,  0.78196657, -0.08488666, -0.5707692 ],
       [-0.80390906,  0.83610725,  0.60967326,  0.9955471 ,  0.6496031 ],
       [-0.9343803 , -0.9317238 ,  0.16454798,  0.13083154,  0.63405186]],
      dtype=float32)

In [14]:
show_graph(tf.get_default_graph())

## Packing sequences

In [15]:
n_steps = 2
n_inputs = 3
n_neurons = 5

In [16]:
reset_graph()

X = tf.placeholder(tf.float32, [None, n_steps, n_inputs])
X_seqs = tf.unstack(tf.transpose(X, perm=[1, 0, 2]))

basic_cell = tf.keras.layers.SimpleRNNCell(units=n_neurons)
output_seqs, states = tf.nn.static_rnn(basic_cell, X_seqs, dtype=tf.float32)
outputs = tf.transpose(tf.stack(output_seqs), perm=[1, 0, 2])

In [17]:
init = tf.global_variables_initializer()

In [18]:
X_batch = np.array([
    [[0, 1, 2], [9, 8, 7]], # instance 1
    [[3, 4, 5], [0, 0, 0]], # instance 2
    [[6, 7, 8], [6, 5, 4]], # instance 3
    [[9, 0, 1], [3, 2, 1]], # instance 4
])

with tf.Session() as sess:
    init.run()
    outputs_val = outputs.eval(feed_dict={X: X_batch})

In [19]:
print(outputs_val)

[[[ 0.38419527 -0.40979752 -0.760677   -0.9757152   0.62687314]
  [ 1.         -0.9999359  -0.07642744 -0.9999997   0.99998283]]

 [[ 0.9980172  -0.9834511  -0.84703493 -0.9999515   0.9950296 ]
  [ 0.9036534   0.24835567 -0.69220096 -0.85009456  0.26877522]]

 [[ 0.9999956  -0.99966747 -0.90393144 -0.9999999   0.9999459 ]
  [ 0.999999   -0.99671483 -0.347943   -0.9999468   0.99942   ]]

 [[ 0.999447   -0.7912955   0.9162898   0.9714673   0.894578  ]
  [ 0.93569356 -0.9154096  -0.47746658 -0.9332682   0.98855   ]]]


In [20]:
print(np.transpose(outputs_val, axes=[1, 0, 2])[1])

[[ 1.         -0.9999359  -0.07642744 -0.9999997   0.99998283]
 [ 0.9036534   0.24835567 -0.69220096 -0.85009456  0.26877522]
 [ 0.999999   -0.99671483 -0.347943   -0.9999468   0.99942   ]
 [ 0.93569356 -0.9154096  -0.47746658 -0.9332682   0.98855   ]]


In [21]:
show_graph(tf.get_default_graph())

## Dynamic Unrolling Through Time `dynamic_rnn()`
The `dynamic_rnn()` function uses a `while_loop()` operation to run over the cell the appropriate number of times, and you can set `swap_memory=True` if you want it to swap the GPU's memory to the CPU's memory during backprop to avoid OOM errors. 

It also accepts a single tensor for all inputs at every time step (shape `[None, n_steps, n_inputs]`) and it outputs a single tensor for all outputs at every time step (shape `[None, n_steps, n_neurons]`); there is no need to stack, unstack or transpose. The following code creates the same RNN as ealier using the `dynamic_rnn()` function.

In [22]:
reset_graph()
X = tf.placeholder(tf.float32, [None, n_steps, n_inputs])

basic_cell = tf.keras.layers.SimpleRNNCell(units=n_neurons)
outputs, states = tf.nn.dynamic_rnn(basic_cell, X, dtype=tf.float32)

In [23]:
init = tf.global_variables_initializer()

In [24]:
X_batch = np.array([
    [[0, 1, 2], [9, 8, 7]], # instance 1
    [[3, 4, 5], [0, 0, 0]], # instance 2
    [[6, 7, 8], [6, 5, 4]], # instance 3
    [[9, 0, 1], [3, 2, 1]], # instance 4
])

with tf.Session() as sess:
    init.run()
    output_val = outputs.eval(feed_dict={X: X_batch})

In [25]:
print(outputs_val)

[[[ 0.38419527 -0.40979752 -0.760677   -0.9757152   0.62687314]
  [ 1.         -0.9999359  -0.07642744 -0.9999997   0.99998283]]

 [[ 0.9980172  -0.9834511  -0.84703493 -0.9999515   0.9950296 ]
  [ 0.9036534   0.24835567 -0.69220096 -0.85009456  0.26877522]]

 [[ 0.9999956  -0.99966747 -0.90393144 -0.9999999   0.9999459 ]
  [ 0.999999   -0.99671483 -0.347943   -0.9999468   0.99942   ]]

 [[ 0.999447   -0.7912955   0.9162898   0.9714673   0.894578  ]
  [ 0.93569356 -0.9154096  -0.47746658 -0.9332682   0.98855   ]]]


In [26]:
show_graph(tf.get_default_graph())

## Setting the sequence lengths

In [27]:
n_steps = 2
n_inputs = 3
n_neurons = 5

reset_graph()

X = tf.placeholder(tf.float32, [None, n_steps, n_inputs])
basic_cell = tf.keras.layers.SimpleRNNCell(units=n_neurons)

In [28]:
seq_length = tf.placeholder(tf.int32, [None])
outputs, states = tf.nn.dynamic_rnn(basic_cell, X, dtype=tf.float32,
                                    sequence_length=seq_length)

In [29]:
init = tf.global_variables_initializer()

In [30]:
X_batch = np.array([
    [[0, 1, 2], [9, 8, 7]], # instance 1
    [[3, 4, 5], [0, 0, 0]], # instance 2
    [[6, 7, 8], [6, 5, 4]], # instance 3
    [[9, 0, 1], [3, 2, 1]], # instance 4
])

seq_length_batch = np.array([2, 1, 2, 2])

In [31]:
with tf.Session() as sess:
    init.run()
    outputs_val, states_val = sess.run(
    [outputs, states], feed_dict={X: X_batch, seq_length: seq_length_batch})

In [32]:
print(outputs_val)

[[[ 0.5778032  -0.16725832 -0.68106353 -0.777186   -0.52269983]
  [ 0.93183446  0.75378835  0.94284356 -0.9288892   0.99991345]]

 [[ 0.9199638   0.46335757 -0.4476297  -0.96091753  0.68342566]
  [ 0.          0.          0.          0.          0.        ]]

 [[ 0.9870954   0.8249127  -0.13160156 -0.9936833   0.9780728 ]
  [ 0.7231622   0.71162856  0.9931086  -0.947106    0.9994282 ]]

 [[-0.09031451  0.9998482   0.99935174  0.9999679   0.99999905]
  [ 0.88181174  0.9727099   0.95837885 -0.53776693  0.96575755]]]


In [33]:
print(states_val)

[[ 0.93183446  0.75378835  0.94284356 -0.9288892   0.99991345]
 [ 0.9199638   0.46335757 -0.4476297  -0.96091753  0.68342566]
 [ 0.7231622   0.71162856  0.9931086  -0.947106    0.9994282 ]
 [ 0.88181174  0.9727099   0.95837885 -0.53776693  0.96575755]]


In [34]:
show_graph(tf.get_default_graph())

## Training RNNs
Unroll it through time and then simply use regular backpropagation: known as *backpropagation through time (BPTT)*

In [35]:
reset_graph()

n_steps = 28
n_inputs = 28
n_neurons = 150
n_outputs = 10

learning_rate = 0.001

X = tf.placeholder(tf.float32, [None, n_steps, n_inputs])
y = tf.placeholder(tf.int32, [None])

basic_cell = tf.keras.layers.SimpleRNNCell(units=n_neurons)
outputs, states = tf.nn.dynamic_rnn(basic_cell, X, dtype=tf.float32)

logits = tf.layers.dense(states, n_outputs)
xentropy = tf.nn.sparse_softmax_cross_entropy_with_logits(labels=y, logits=logits)

loss = tf.reduce_mean(xentropy)
optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate)
training_op = optimizer.minimize(loss)
correct = tf.nn.in_top_k(logits, y, 1)
accuracy = tf.reduce_mean(tf.cast(correct, tf.float32))

init = tf.global_variables_initializer()

In [36]:
(X_train, y_train), (X_test, y_test) = tf.keras.datasets.mnist.load_data()
X_train = X_train.astype(np.float32).reshape(-1, 28*28) / 255.0
X_test = X_test.astype(np.float32).reshape(-1, 28*28) / 255.0
y_train = y_train.astype(np.int32)
y_test = y_test.astype(np.int32)
X_valid, X_train = X_train[:5000], X_train[5000:]
y_valid, y_train = y_train[:5000], y_train[5000:]

In [37]:
X_test = X_test.reshape((-1, n_steps, n_inputs))

In [38]:
n_epochs = 100
batch_size = 150

with tf.Session() as sess:
    init.run()
    for epoch in range(n_epochs):
        for X_batch, y_batch in shuffle_batch(X_train, y_train, batch_size):
            X_batch = X_batch.reshape((-1, n_steps, n_inputs))
            sess.run(training_op, feed_dict={X: X_batch, y: y_batch})
        acc_batch = accuracy.eval(feed_dict={X: X_test, y: y_test})
        acc_test = accuracy.eval(feed_dict={X: X_test, y: y_test})
        print(epoch, 'Last batch accuracy:', acc_batch, 'Test accuracy:', acc_test)

0 Last batch accuracy: 0.9261 Test accuracy: 0.9261
1 Last batch accuracy: 0.9465 Test accuracy: 0.9465
2 Last batch accuracy: 0.9545 Test accuracy: 0.9545
3 Last batch accuracy: 0.9653 Test accuracy: 0.9653
4 Last batch accuracy: 0.9654 Test accuracy: 0.9654
5 Last batch accuracy: 0.9647 Test accuracy: 0.9647
6 Last batch accuracy: 0.969 Test accuracy: 0.969
7 Last batch accuracy: 0.9738 Test accuracy: 0.9738
8 Last batch accuracy: 0.969 Test accuracy: 0.969
9 Last batch accuracy: 0.9719 Test accuracy: 0.9719
10 Last batch accuracy: 0.9764 Test accuracy: 0.9764
11 Last batch accuracy: 0.9797 Test accuracy: 0.9797
12 Last batch accuracy: 0.9782 Test accuracy: 0.9782
13 Last batch accuracy: 0.9779 Test accuracy: 0.9779
14 Last batch accuracy: 0.9779 Test accuracy: 0.9779
15 Last batch accuracy: 0.9761 Test accuracy: 0.9761
16 Last batch accuracy: 0.9702 Test accuracy: 0.9702
17 Last batch accuracy: 0.9788 Test accuracy: 0.9788
18 Last batch accuracy: 0.9785 Test accuracy: 0.9785
19 Last