# RNN layer playground

This notebook walks through the computation of a single RNN layer using several of TensorFlow's RNN cell implementations.

In [1]:
import collections
import tensorflow as tf
import numpy as np
# Switch to eager execution; later cells call `.numpy()` on tensors directly.
tf.enable_eager_execution()

In [2]:
# Record the TensorFlow version this notebook was executed with.
print("TensorFlow version:", tf.__version__)

TensorFlow version: 1.11.0


## Configuration

In [3]:
BATCH_SIZE = 2   # number of sequences in a batch
SEQ_LEN = 3      # steps in time dimension
NUM_INPUTS = 4   # number of input elements
NUM_UNITS = 5    # number of output elements

# Which recurrent cell to build. The cell-selection and initial-state code
# in this notebook only handle these two values.
cell_type = "gru"   # "lstm" or "gru"

# [For CPU] Use CUDA compatible cell?
# Only consulted when no GPU device is reported.
cuda_compatible = False

# [For CPU] Use faster cell?
# Only consulted when no GPU device is reported and cuda_compatible is False.
performance_matters = True

## Generate inputs

Let the input tensor to the RNN layer have the shape (`BATCH_SIZE`, `SEQ_LEN`, `NUM_INPUTS`).

In [4]:
# Uniform random samples in [0, 1), cast to float32 and wrapped as a
# batch-major tensor of shape (BATCH_SIZE, SEQ_LEN, NUM_INPUTS).
rnn_inputs = tf.convert_to_tensor(
    np.random.random(size=(BATCH_SIZE, SEQ_LEN, NUM_INPUTS)).astype(np.float32)
)
rnn_inputs

<tf.Tensor: id=1, shape=(2, 3, 4), dtype=float32, numpy=
array([[[0.4401753 , 0.6901737 , 0.7132639 , 0.4629155 ],
        [0.635675  , 0.7819176 , 0.5435599 , 0.8797472 ],
        [0.54418594, 0.9816287 , 0.48195803, 0.15588744]],

       [[0.603667  , 0.09109777, 0.7939124 , 0.12160426],
        [0.3935575 , 0.9057583 , 0.7423673 , 0.62571424],
        [0.77721334, 0.8152382 , 0.38591084, 0.25880244]]], dtype=float32)>

## Selection of RNN cell

As discussed [here](https://www.tensorflow.org/performance/performance_guide#rnn_performance), `tf.nn.rnn_cell.*` provides only reference implementations, which are not meant for high-performance computation.

* For GPU performance `tf.contrib.cudnn_rnn.*` is recommended.
* For CPU performance `tf.contrib.rnn.*BlockCell*` is recommended.
* `tf.nn.rnn_cell.*` provides reference implementation.

**[NOTE]** `tf.float64` does not work with `LSTMBlockCell` and `GRUBlockCellV2` as of v1.11.

In [5]:
# Pick the (LSTM, GRU) implementation pair for the current environment.
# Only the branch that is taken touches its tf.contrib / tf.nn attributes.
if tf.test.gpu_device_name():
    # A GPU device is reported: use the cuDNN-backed implementations.
    # NOTE(review): CudnnLSTM / CudnnGRU look like whole-layer classes rather
    # than per-step cells -- confirm that `rnn_cell_func(NUM_UNITS)` and the
    # step-by-step usage further down actually work in this branch.
    lstm_impl = tf.contrib.cudnn_rnn.CudnnLSTM
    gru_impl = tf.contrib.cudnn_rnn.CudnnGRU
elif cuda_compatible:
    # No GPU, but the cuDNN-compatible cells were requested.
    lstm_impl = tf.contrib.cudnn_rnn.CudnnCompatibleLSTMCell
    gru_impl = tf.contrib.cudnn_rnn.CudnnCompatibleGRUCell
elif performance_matters:
    # No GPU: block cells recommended for CPU performance.
    lstm_impl = tf.contrib.rnn.LSTMBlockCell
    gru_impl = tf.contrib.rnn.GRUBlockCellV2
else:
    # Reference implementations.
    lstm_impl = tf.nn.rnn_cell.LSTMCell
    gru_impl = tf.nn.rnn_cell.GRUCell

selector = {"lstm": lstm_impl, "gru": gru_impl}

# Instantiate the cell class that matches the configured cell type.
rnn_cell_func = selector[cell_type]
rnn_cell = rnn_cell_func(NUM_UNITS)

## Prepare initial state

In [6]:
def get_initial_state(cell_type, batch_size=BATCH_SIZE, num_units=NUM_UNITS, dtype=tf.float32):
    """Build an all-zero initial state for a single RNN layer.

    Args:
        cell_type: Either "lstm" or "gru".
        batch_size: Number of sequences in the batch (first state dimension).
        num_units: Number of units in the cell (second state dimension).
        dtype: Element type of the state tensor(s).

    Returns:
        For "gru": a zero tensor of shape (batch_size, num_units).
        For "lstm": a `tf.nn.rnn_cell.LSTMStateTuple` whose `c` and `h`
        fields are each a zero tensor of that shape.

    Raises:
        AssertionError: If `cell_type` is neither "lstm" nor "gru".
    """
    assert cell_type in ("lstm", "gru"), (
        "cell_type must be 'lstm' or 'gru', got %r" % (cell_type,))

    state_shape = (batch_size, num_units)
    if cell_type == "lstm":
        # Use TensorFlow's own state tuple rather than re-declaring a
        # look-alike namedtuple on every call, so the initial state has the
        # same type as the state returned by the LSTM cells.
        return tf.nn.rnn_cell.LSTMStateTuple(
            c=tf.zeros(shape=state_shape, dtype=dtype),
            h=tf.zeros(shape=state_shape, dtype=dtype),
        )

    # GRU keeps a single state tensor.
    return tf.zeros(shape=state_shape, dtype=dtype)

In [7]:
# Zero initial state for the configured cell type, using the default
# batch size and number of units.
rnn_initial_state = get_initial_state(cell_type)
rnn_initial_state

<tf.Tensor: id=5, shape=(2, 5), dtype=float32, numpy=
array([[0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0.]], dtype=float32)>

## Direct implementation using the RNN cell

In [8]:
# Unroll the cell by hand: feed one time step at a time and thread the
# state from each step into the next.
step_outputs = []
rnn_state = rnn_initial_state
for step in range(SEQ_LEN):
    step_input = rnn_inputs[:, step, :]  # (BATCH_SIZE, NUM_INPUTS) slice for this step
    step_output, rnn_state = rnn_cell(step_input, rnn_state)
    step_outputs.append(step_output)

# Stack the per-step outputs along axis 1 so the result is batch major:
# (BATCH_SIZE, SEQ_LEN, NUM_UNITS).
rnn_output = tf.stack(step_outputs, axis=1)

In [9]:
# Batch-major outputs of the hand-unrolled loop, one vector per time step.
rnn_output

<tf.Tensor: id=93, shape=(2, 3, 5), dtype=float32, numpy=
array([[[ 0.06355083,  0.10468215, -0.00351658, -0.06466059,
         -0.12453991],
        [ 0.11457729,  0.2151941 ,  0.07290673,  0.03237692,
         -0.21750826],
        [ 0.07553838,  0.26621968,  0.15797816, -0.09341604,
         -0.3176002 ]],

       [[ 0.10963777,  0.0522937 , -0.15269682, -0.00395007,
         -0.11811382],
        [ 0.13747731,  0.16242859, -0.04078223, -0.09043062,
         -0.22412425],
        [ 0.11365119,  0.21294627,  0.05266741, -0.07813045,
         -0.31754592]]], dtype=float32)>

In [10]:
# State returned by the cell after the last time step of the loop.
rnn_state

<tf.Tensor: id=90, shape=(2, 5), dtype=float32, numpy=
array([[ 0.07553838,  0.26621968,  0.15797816, -0.09341604, -0.3176002 ],
       [ 0.11365119,  0.21294627,  0.05266741, -0.07813045, -0.31754592]],
      dtype=float32)>

## Equivalent code with dynamic_rnn

In [11]:
# Let dynamic_rnn unroll the same cell over the time axis of the
# batch-major inputs, starting from the same initial state.
rnn_output2, rnn_state2 = tf.nn.dynamic_rnn(
    cell=rnn_cell,
    inputs=rnn_inputs,
    initial_state=rnn_initial_state,
    time_major=False,
)

In [12]:
# Outputs returned by tf.nn.dynamic_rnn.
rnn_output2

<tf.Tensor: id=188, shape=(2, 3, 5), dtype=float32, numpy=
array([[[ 0.06355083,  0.10468215, -0.00351658, -0.06466059,
         -0.12453991],
        [ 0.11457729,  0.2151941 ,  0.07290673,  0.03237692,
         -0.21750826],
        [ 0.07553838,  0.26621968,  0.15797816, -0.09341604,
         -0.3176002 ]],

       [[ 0.10963777,  0.0522937 , -0.15269682, -0.00395007,
         -0.11811382],
        [ 0.13747731,  0.16242859, -0.04078223, -0.09043062,
         -0.22412425],
        [ 0.11365119,  0.21294627,  0.05266741, -0.07813045,
         -0.31754592]]], dtype=float32)>

In [13]:
# Final state returned by tf.nn.dynamic_rnn.
rnn_state2

<tf.Tensor: id=176, shape=(2, 5), dtype=float32, numpy=
array([[ 0.07553838,  0.26621968,  0.15797816, -0.09341604, -0.3176002 ],
       [ 0.11365119,  0.21294627,  0.05266741, -0.07813045, -0.31754592]],
      dtype=float32)>

## Equivalence checking

Compare results from direct implementation and `dynamic_rnn()`.

In [14]:
# Compare with a floating-point tolerance: bitwise equality of float results
# is brittle and can fail on harmless rounding differences between the two
# code paths.
np.allclose(rnn_output.numpy(), rnn_output2.numpy())

True

In [15]:
# Compare final states with a floating-point tolerance rather than bitwise
# equality, which is brittle for float results.
if cell_type == "lstm":
    # The LSTM state is a (c, h) pair; both parts have to agree.
    res = (np.allclose(rnn_state.c.numpy(), rnn_state2.c.numpy())
           and np.allclose(rnn_state.h.numpy(), rnn_state2.h.numpy()))
else:
    # The GRU state is a single tensor.
    res = np.allclose(rnn_state.numpy(), rnn_state2.numpy())
res

True

The following relation (the output at the last time step equals the final hidden state) holds unless some sequences are shorter and zero-padded.

In [16]:
# The output at the last time step should match the final hidden state.
# Use a floating-point tolerance instead of bitwise equality.
last_step_output = rnn_output[:, -1, :].numpy()
if cell_type == "lstm":
    # For LSTM the hidden state is the `h` part of the (c, h) state tuple.
    check = np.allclose(last_step_output, rnn_state.h.numpy())
else:
    check = np.allclose(last_step_output, rnn_state.numpy())

check

True

## [Extra] LSTMBlockFusedCell

`tf.contrib.rnn.LSTMBlockFusedCell` provides yet another fast implementation of LSTM. 

**NOTE:** `LSTMBlockFusedCell` expects a **time-major** input tensor of shape (`SEQ_LEN`, `BATCH_SIZE`, `NUM_INPUTS`), not a batch-major one.

In [17]:
# The fused cell processes the whole sequence in one call, so build it,
# give it an LSTM-style zero state, and feed it time-major inputs.
rnn_cell = tf.contrib.rnn.LSTMBlockFusedCell(NUM_UNITS)
rnn_initial_state = get_initial_state(cell_type="lstm")

# (BATCH_SIZE, SEQ_LEN, NUM_INPUTS) -> (SEQ_LEN, BATCH_SIZE, NUM_INPUTS)
rnn_inputs_time_major = tf.transpose(rnn_inputs, perm=[1, 0, 2])

# The call returns an (outputs, final_state) pair.
res = rnn_cell(rnn_inputs_time_major, initial_state=rnn_initial_state)
rnn_output_time_major, rnn_state = res[0], res[1]

In [18]:
# Time-major outputs of the fused cell: (SEQ_LEN, BATCH_SIZE, NUM_UNITS).
rnn_output_time_major

<tf.Tensor: id=242, shape=(3, 2, 5), dtype=float32, numpy=
array([[[ 0.12242151,  0.0277885 , -0.0754373 ,  0.11143345,
         -0.063211  ],
        [ 0.10178996, -0.09440513, -0.0234403 ,  0.12939812,
          0.02685089]],

       [[ 0.20673898,  0.04889306, -0.09390434,  0.17848039,
         -0.14704858],
        [ 0.20499304,  0.00816838, -0.09620755,  0.20845538,
         -0.076874  ]],

       [[ 0.27044514,  0.07677974, -0.11766715,  0.22938018,
         -0.16622667],
        [ 0.29983306,  0.00716148, -0.0777734 ,  0.23880504,
         -0.13313416]]], dtype=float32)>

In [19]:
# Final state returned by the fused cell: an LSTM (c, h) state tuple.
rnn_state

LSTMStateTuple(c=<tf.Tensor: id=246, shape=(2, 5), dtype=float32, numpy=
array([[ 0.7442485 ,  0.14714137, -0.24902591,  0.6121015 , -0.38773277],
       [ 0.750517  ,  0.01361619, -0.16515471,  0.6710374 , -0.27689147]],
      dtype=float32)>, h=<tf.Tensor: id=250, shape=(2, 5), dtype=float32, numpy=
array([[ 0.27044514,  0.07677974, -0.11766715,  0.22938018, -0.16622667],
       [ 0.29983306,  0.00716148, -0.0777734 ,  0.23880504, -0.13313416]],
      dtype=float32)>)