# RNN layer playground

This notebook walks through the computation of a single RNN layer using several of TensorFlow's RNN cell implementations.

In [1]:
import collections
import tensorflow as tf
import numpy as np
# Switch to eager mode so that tensors hold concrete values; later cells read them with `.numpy()`.
tf.enable_eager_execution()

In [2]:
# Record the TensorFlow release this notebook was executed with.
print("TensorFlow version: {}".format(tf.__version__))

TensorFlow version: 1.11.0


## Configuration

In [3]:
BATCH_SIZE = 2   # number of sequences per batch
SEQ_LEN = 3      # steps in time dimension
NUM_INPUTS = 4   # number of input elements
NUM_UNITS = 5    # number of output elements

# Key used to pick the cell class and the matching initial state.
cell_type = "gru"   # "lstm" or "gru"

# [For CPU] Use CUDA compatible cell?
# This flag is tested first by the cell selection; when True, `performance_matters` is ignored.
cuda_compatible = True

# [For CPU] Use faster cell?
# Only consulted when `cuda_compatible` is False.
performance_matters = True

## Generate inputs

Let the input tensor to the RNN layer have the shape (`BATCH_SIZE`, `SEQ_LEN`, `NUM_INPUTS`).

In [4]:
# Draw the inputs from a locally seeded generator so that Restart & Run All
# reproduces the same tensor, without touching NumPy's global RNG state.
input_rng = np.random.RandomState(0)
rnn_inputs = tf.convert_to_tensor(
    input_rng.random_sample((BATCH_SIZE, SEQ_LEN, NUM_INPUTS)).astype(np.float32)
)
rnn_inputs

<tf.Tensor: id=1, shape=(2, 3, 4), dtype=float32, numpy=
array([[[0.01699759, 0.8343653 , 0.26878107, 0.6359202 ],
        [0.75250006, 0.53498155, 0.7937529 , 0.8391873 ],
        [0.3019563 , 0.27737612, 0.5271342 , 0.45266452]],

       [[0.82940054, 0.63020486, 0.9112391 , 0.9472608 ],
        [0.28364596, 0.6899779 , 0.80933154, 0.97504157],
        [0.72382927, 0.6976915 , 0.25315878, 0.29028106]]], dtype=float32)>

## Selection of RNN cell

As discussed [here](https://www.tensorflow.org/performance/performance_guide#rnn_performance), `tf.nn.rnn_cell.*` provides only a reference implementation that is not meant for high-performance computation.

* For GPU performance `tf.contrib.cudnn_rnn.*` is recommended.
* For CPU performance `tf.contrib.rnn.*BlockCell*` is recommended.
* `tf.nn.rnn_cell.*` provides reference implementation.

**[NOTE]** `tf.float64` does not work with `LSTMBlockCell` and `GRUBlockCellV2` as of v1.11.

In [5]:
# Pick the cell family. Precedence: cuDNN-compatible cells, then the CPU
# "block" cells, and finally the reference implementations.
if cuda_compatible:
    selector = dict(
        lstm=tf.contrib.cudnn_rnn.CudnnCompatibleLSTMCell,
        gru=tf.contrib.cudnn_rnn.CudnnCompatibleGRUCell,
    )
elif performance_matters:
    selector = dict(
        lstm=tf.contrib.rnn.LSTMBlockCell,
        gru=tf.contrib.rnn.GRUBlockCellV2,
    )
else:
    selector = dict(
        lstm=tf.nn.rnn_cell.LSTMCell,
        gru=tf.nn.rnn_cell.GRUCell,
    )

# Resolve the class for the configured `cell_type`, then build one cell
# with NUM_UNITS units.
rnn_cell_func = selector[cell_type]
rnn_cell = rnn_cell_func(NUM_UNITS)

## Prepare initial state

In [6]:
def get_initial_state(cell_type, batch_size=BATCH_SIZE, num_units=NUM_UNITS, dtype=tf.float32):
    """Build an all-zero initial state for a single RNN layer.

    Args:
        cell_type: Either "lstm" or "gru".
        batch_size: Size of the first dimension of each state tensor.
        num_units: Size of the second dimension of each state tensor.
        dtype: dtype of the zero tensors.

    Returns:
        For "lstm", a `tf.nn.rnn_cell.LSTMStateTuple(c, h)` holding two zero
        tensors of shape (batch_size, num_units). For "gru", a single zero
        tensor of that shape.

    Raises:
        ValueError: If `cell_type` is neither "lstm" nor "gru".
    """
    # Explicit check instead of `assert`, which is stripped under `python -O`
    # and would let an unknown cell type silently fall into the GRU branch.
    if cell_type not in ("lstm", "gru"):
        raise ValueError('cell_type must be "lstm" or "gru", got {!r}'.format(cell_type))

    def zeros():
        return tf.zeros(shape=(batch_size, num_units), dtype=dtype)

    if cell_type == "lstm":
        # Use TensorFlow's own state tuple rather than a look-alike namedtuple
        # that is re-created (as a distinct class) on every call.
        return tf.nn.rnn_cell.LSTMStateTuple(c=zeros(), h=zeros())
    return zeros()

In [7]:
# Zero initial state for the configured `cell_type`, using the default batch size, unit count and dtype.
rnn_initial_state = get_initial_state(cell_type)
rnn_initial_state

<tf.Tensor: id=5, shape=(2, 5), dtype=float32, numpy=
array([[0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0.]], dtype=float32)>

## Direct implementation using the RNN cell

In [19]:
print("RNN cell:", rnn_cell.name)

# Unroll the cell by hand: feed one time slice per step and thread the state
# returned by each step into the next one.
step_outputs = []
rnn_state = rnn_initial_state
for step in range(SEQ_LEN):
    step_output, rnn_state = rnn_cell(rnn_inputs[:, step, :], rnn_state)
    step_outputs.append(step_output)

# Stack the per-step outputs along a new axis at position 1. This yields the
# batch-major layout directly, with no time-major intermediate and transpose.
rnn_output = tf.stack(step_outputs, axis=1)

RNN cell: cudnn_compatible_gru_cell_1


In [9]:
# Batch-major outputs collected from the hand-written loop over the cell.
rnn_output

<tf.Tensor: id=168, shape=(2, 3, 5), dtype=float32, numpy=
array([[[ 0.02990399,  0.05910603, -0.05918583, -0.1534747 ,
          0.15635811],
        [ 0.12331079,  0.20804977,  0.06027195, -0.3613307 ,
          0.23384589],
        [ 0.15892234,  0.24210683,  0.17441162, -0.30702627,
          0.27366194]],

       [[ 0.10151232,  0.18144394,  0.08772551, -0.32502905,
          0.12613314],
        [ 0.16009198,  0.2600469 ,  0.16925618, -0.4235642 ,
          0.2972222 ],
        [ 0.16468492,  0.29950017,  0.2038032 , -0.28208566,
          0.32650927]]], dtype=float32)>

In [10]:
# State returned by the last step of the hand-written loop.
rnn_state

<tf.Tensor: id=165, shape=(2, 5), dtype=float32, numpy=
array([[ 0.15892234,  0.24210683,  0.17441162, -0.30702627,  0.27366194],
       [ 0.16468492,  0.29950017,  0.2038032 , -0.28208566,  0.32650927]],
      dtype=float32)>

## Equivalent code with dynamic_rnn

In [20]:
print("RNN cell:", rnn_cell.name)
# Let TensorFlow unroll the same cell over the time axis of the batch-major
# inputs, starting from the same initial state as the manual loop.
rnn_output2, rnn_state2 = tf.nn.dynamic_rnn(
    cell=rnn_cell,
    inputs=rnn_inputs,
    initial_state=rnn_initial_state,
    time_major=False,
)

RNN cell: cudnn_compatible_gru_cell_1


In [21]:
# Outputs returned by `tf.nn.dynamic_rnn` for the same cell and inputs.
rnn_output2

<tf.Tensor: id=556, shape=(2, 3, 5), dtype=float32, numpy=
array([[[ 0.02990399,  0.05910603, -0.05918583, -0.1534747 ,
          0.15635811],
        [ 0.12331079,  0.20804977,  0.06027195, -0.3613307 ,
          0.23384589],
        [ 0.15892234,  0.24210683,  0.17441162, -0.30702627,
          0.27366194]],

       [[ 0.10151232,  0.18144394,  0.08772551, -0.32502905,
          0.12613314],
        [ 0.16009198,  0.2600469 ,  0.16925618, -0.4235642 ,
          0.2972222 ],
        [ 0.16468492,  0.29950017,  0.2038032 , -0.28208566,
          0.32650927]]], dtype=float32)>

In [22]:
# Final state returned by `tf.nn.dynamic_rnn`.
rnn_state2

<tf.Tensor: id=544, shape=(2, 5), dtype=float32, numpy=
array([[ 0.15892234,  0.24210683,  0.17441162, -0.30702627,  0.27366194],
       [ 0.16468492,  0.29950017,  0.2038032 , -0.28208566,  0.32650927]],
      dtype=float32)>

## Equivalence checking

Compare results from direct implementation and `dynamic_rnn()`.

In [14]:
# Exact (bitwise) comparison of the two outputs. Unlike `(a == b).all()`,
# np.array_equal returns False on a shape mismatch instead of broadcasting.
np.array_equal(rnn_output.numpy(), rnn_output2.numpy())

True

In [15]:
# Compare the final states exactly. An LSTM state is a (c, h) pair, so both
# halves must match; otherwise the state is a single tensor.
# np.array_equal also rejects shape mismatches instead of broadcasting.
if cell_type == "lstm":
    states_equal = (
        np.array_equal(rnn_state.c.numpy(), rnn_state2.c.numpy())
        and np.array_equal(rnn_state.h.numpy(), rnn_state2.h.numpy())
    )
else:
    states_equal = np.array_equal(rnn_state.numpy(), rnn_state2.numpy())
states_equal

True

The following relation holds unless some sequences are shorter than `SEQ_LEN` and zero-padded: the output at the last time step equals the final hidden state.

In [16]:
# Pick the tensor to compare with the last output: the `h` half of an LSTM
# state tuple, or the state itself otherwise.
final_hidden = rnn_state.h if cell_type == "lstm" else rnn_state

# Exact comparison; np.array_equal returns False on a shape mismatch rather
# than broadcasting the way `==` followed by `.all()` would.
check = np.array_equal(rnn_output[:, -1, :].numpy(), final_hidden.numpy())

check

True

# RNN whole-sequence processing

## CudnnGRU

In [25]:
rnn_block = tf.contrib.cudnn_rnn.CudnnGRU(1, NUM_UNITS)

print("RNN implementation:", rnn_block.name)
# Swap the batch and time axes to obtain time-major inputs for the layer.
rnn_inputs_time_major = tf.transpose(rnn_inputs, [1, 0, 2])
# The forward pass is only attempted when TensorFlow reports a GPU device.
if tf.test.gpu_device_name():
    # Dedicated names: rebinding `rnn_state` here would overwrite the state
    # that the equivalence-check cells above read.
    cudnn_output_time_major, cudnn_state = rnn_block(rnn_inputs_time_major)

RNN implementation: cudnn_gru_4


In [26]:
# State shape(s) the layer reports for a batch of BATCH_SIZE sequences.
rnn_block.state_shape(BATCH_SIZE)

([1, 2, 5],)

In [28]:
# RNN mode string reported by the layer.
rnn_block.rnn_mode

'gru'

## [Extra] LSTMBlockFusedCell

`tf.contrib.rnn.LSTMBlockFusedCell` provides yet another fast implementation of LSTM. 

**NOTE:** `LSTMBlockFusedCell` takes **time-major** tensor, not batch-major.

In [29]:
rnn_block = tf.contrib.rnn.LSTMBlockFusedCell(NUM_UNITS)
print("RNN implementation:", rnn_block.name)

# Keep the LSTM initial state under its own name. Rebinding
# `rnn_initial_state` would replace the state built for `cell_type` and break
# re-running the earlier cells when `cell_type` is "gru".
lstm_initial_state = get_initial_state(cell_type="lstm")
# The fused cell consumes time-major input, so swap the batch and time axes.
rnn_inputs_time_major = tf.transpose(rnn_inputs, [1, 0, 2])
# NOTE: `rnn_state` is rebound to the LSTM state here because the following
# cells display it; re-run the direct-implementation cell to restore it.
rnn_output_time_major, rnn_state = rnn_block(
    rnn_inputs_time_major, initial_state=lstm_initial_state
)

RNN implementation: lstm_fused_cell


In [None]:
# Time-major outputs of the fused LSTM call above.
rnn_output_time_major

In [None]:
# Final state returned by the fused LSTM call above.
rnn_state