# A simple template for using the `seq2seq` module in TensorFlow v1.2+

##### YJ Choe ([yj.choe@kakaobrain.com](mailto:yj.choe@kakaobrain.com))

In [1]:
import tensorflow as tf
import numpy as np
import datetime
from tensorflow.python.layers import core as layers_core

In [2]:
print(tf.__version__)
print(np.__version__)

1.2.1
1.13.0


For TensorBoard summary:

In [4]:
# https://www.tensorflow.org/get_started/summaries_and_tensorboard
def variable_summaries(name, var):
    """Attach a lot of summaries to a Tensor (for TensorBoard visualization)."""
    with tf.name_scope(name):
        mean = tf.reduce_mean(var)
        tf.summary.scalar('mean', mean)
        with tf.name_scope('stddev'):
            stddev = tf.sqrt(tf.reduce_mean(tf.square(var - mean)))
        tf.summary.scalar('stddev', stddev)
        tf.summary.scalar('max', tf.reduce_max(var))
        tf.summary.scalar('min', tf.reduce_min(var))
        tf.summary.histogram('histogram', var)

## Sequence-to-Sequence LSTM

In [5]:
START_TOKEN = 0
END_TOKEN = 1
UNK_TOKEN = 2

Replace the following parameters and data according to the application.

In [6]:
n, batch_size = 500, 32
n_epochs = 10

vocab_size = 100
max_timesteps = 16
embedding_size = 20  # assume both encoding and decoding embeddings have the same size

n_layers = 2  # number of layers of LSTM
latent_size = 10  # num_units of LSTM

learning_rate = 1e-2

In [7]:
def compute_sequence_length(data):
    dtype = data.dtype
    sequence_length = 1 + tf.argmax(
        tf.cast(tf.equal(data, tf.constant(END_TOKEN, dtype=dtype)), dtype),
        axis=1
    )
    return tf.cast(sequence_length, dtype)
    
def generate_random_sequences(n, vocab_size, max_timesteps):
    # avoid having multiple START_TOKEN's (vocab_size - 1)
    # always have one START_TOKEN at the beginning 
    # and at least one END_TOKEN at the end
    data = tf.cast(
        tf.multinomial(logits=tf.log(tf.ones(shape=(n, vocab_size - 1))),
                       num_samples=max_timesteps - 2),
        tf.int32
    )
    data = tf.concat(
        [tf.constant(START_TOKEN, shape=(n, 1)),
         data + 1, 
         tf.constant(END_TOKEN, shape=(n, 1))  # in case END_TOKEN is not already present
        ], 
        axis=1
    )
    sequence_length = compute_sequence_length(data)
    return data, sequence_length

In [8]:
# generate random data that always starts with START_TOKEN and ends with END_TOKEN
x_data, x_data_length = generate_random_sequences(n, vocab_size, max_timesteps)
y_data, y_data_length = generate_random_sequences(n, vocab_size, max_timesteps)
print(x_data.shape, x_data_length.shape)
print(y_data.shape, y_data_length.shape)

(500, 16) (500,)
(500, 16) (500,)


In [9]:
x, x_length, y, y_length = tf.train.batch(
    tensors=[x_data, x_data_length, y_data, y_data_length],
    batch_size=batch_size,
    num_threads=n_gpus,
    enqueue_many=True,
    dynamic_pad=True,
    name='batched_xy'
)
print(x.shape, x_length.shape)
print(y.shape, y_length.shape)

(32, 16) (32,)
(32, 16) (32,)


### Embedding

Here we assume that the embedding matrix is identical for the encoder and the decoder (e.g. question-answering within the same language). For machine translation tasks, specify two embedding matrices and apply them to `x` and `y` respectively.

If a pre-trained embedding matrix (e.g. Word2Vec, GloVe, fastText) is available, replace `embedding` with it. Otherwise, set `trainable=True` when initializing `embedding_matrix`.

In [10]:
embedding = np.random.randn(vocab_size, embedding_size)
# https://stackoverflow.com/questions/35687678/using-a-pre-trained-word-embedding-word2vec-or-glove-in-tensorflow
embedding_matrix = tf.Variable(tf.constant(0.0, shape=(vocab_size, embedding_size)),
                               trainable=False,   # if pre-trained
                               name="embedding_matrix")

embedding_placeholder = tf.placeholder(tf.float32, [vocab_size, embedding_size])
embedding_init = embedding_matrix.assign(embedding_placeholder)

#sess.run(embedding_init, feed_dict={embedding_placeholder: embedding})
print(embedding_matrix.shape)

(100, 20)


In [11]:
x_embedding = tf.nn.embedding_lookup(embedding_matrix, x)
y_embedding = tf.nn.embedding_lookup(embedding_matrix, y)
print(x_embedding.shape)
print(y_embedding.shape)

(32, 16, 20)
(32, 16, 20)


### Encoder

In [12]:
# see also: `tf.contrib.rnn.LayerNormBasicLSTMCell`
cells = [tf.nn.rnn_cell.LSTMCell(latent_size) for _ in range(n_layers)]
# cells = [tf.nn.rnn_cell.DeviceWrapper(
#     tf.nn.rnn_cell.ResidualWrapper(tf.nn.rnn_cell.LSTMCell(latent_size)),
#     device='/gpu:%d' % i) for i in range(n_gpus)]

encoder_cell = tf.nn.rnn_cell.MultiRNNCell(
    cells=cells
)

encoder_outputs, encoder_final_state = tf.nn.dynamic_rnn(
    cell=encoder_cell,
    dtype=tf.float32,
    inputs=x_embedding
)
variable_summaries('encoder_outputs', encoder_outputs)
variable_summaries('encoder_final_state', encoder_final_state)
print(encoder_outputs, encoder_final_state)

Tensor("rnn/transpose:0", shape=(32, 16, 10), dtype=float32) (LSTMStateTuple(c=<tf.Tensor 'rnn/while/Exit_2:0' shape=(?, 10) dtype=float32>, h=<tf.Tensor 'rnn/while/Exit_3:0' shape=(?, 10) dtype=float32>), LSTMStateTuple(c=<tf.Tensor 'rnn/while/Exit_4:0' shape=(?, 10) dtype=float32>, h=<tf.Tensor 'rnn/while/Exit_5:0' shape=(?, 10) dtype=float32>))


In [13]:
h, state_tuple = encoder_cell(x_embedding[:, 0, :], encoder_final_state)

In [14]:
h == state_tuple[1].h  # same thing, unless cell has `num_proj` specified

True

### Decoder

In [15]:
cells = [tf.nn.rnn_cell.LSTMCell(latent_size) for _ in range(n_layers)]
# cells = [tf.nn.rnn_cell.DeviceWrapper(
#     tf.nn.rnn_cell.ResidualWrapper(tf.nn.rnn_cell.LSTMCell(latent_size)),
#     device='/gpu:%d' % i) for i in range(n_gpus)]

decoder_cell = tf.nn.rnn_cell.MultiRNNCell(
    cells=cells
)
print(decoder_cell.state_size)

(LSTMStateTuple(c=10, h=10), LSTMStateTuple(c=10, h=10))


In [16]:
# use `tf.contrib.seq2seq.BahdanauAttention` for additive attention
attention_mechanism = tf.contrib.seq2seq.LuongAttention(
    num_units=latent_size,
    memory=encoder_outputs
)

In [17]:
# `tf.contrib.seq2seq.DynamicAttentionWrapper` before TF v1.2
attention_cell = tf.contrib.seq2seq.AttentionWrapper(
    cell=decoder_cell,
    attention_mechanism=attention_mechanism,
    attention_layer_size=latent_size  # optional
)

In [18]:
# https://github.com/tensorflow/tensorflow/issues/8833
attention_zero_state = attention_cell.zero_state(
    batch_size=batch_size, 
    dtype=tf.float32
)
attention_initial_state = attention_zero_state.clone(
    cell_state=encoder_final_state
)

During training, a `TrainingHelper` schedules whether the previous timestep's true output ($y_{t-1}$) or its predicted output ($\hat{y}_{t-1}$) is fed into the next decoder ste where we predict $y_t$. [[paper]](https://arxiv.org/pdf/1506.03099.pdf) During prediction/testing, the true output is no longer available, so we use a non-training `Helper` (such as `GreedyEmbeddingHelper`) to always feed in the predicted output from the previous timestep.

In [19]:
training_helper = tf.contrib.seq2seq.TrainingHelper(
    inputs=y_embedding,
    sequence_length=y_length
)
prediction_helper = tf.contrib.seq2seq.GreedyEmbeddingHelper(
    embedding=embedding,
    start_tokens=tf.constant(START_TOKEN, shape=(batch_size, )),
    end_token=END_TOKEN
)

In [20]:
decoder = tf.contrib.seq2seq.BasicDecoder(
    cell=attention_cell,
    helper=training_helper,
    initial_state=attention_initial_state,
    output_layer=layers_core.Dense(vocab_size,
                                   activation=tf.nn.sigmoid)
)

In [21]:
final_outputs, final_state, final_sequence_lengths = \
    tf.contrib.seq2seq.dynamic_decode(
        decoder=decoder
    )

variable_summaries('final_rnn_outputs', final_outputs.rnn_output)
variable_summaries('final_cell_state', final_state.cell_state)
variable_summaries('final_attention', final_state.attention)
variable_summaries('final_alignments', final_state.alignments)

In [22]:
# `final_outputs` has type `tf.contrib.BasicDecoderOutput`
final_outputs.rnn_output, final_outputs.sample_id

(<tf.Tensor 'decoder/transpose:0' shape=(32, ?, 100) dtype=float32>,
 <tf.Tensor 'decoder/transpose_1:0' shape=(32, ?) dtype=int32>)

In [23]:
# `final_state` has type `tf.contrib.seq2seq.AttentionWrapperState`
final_state.cell_state, final_state.attention, final_state.time, final_state.alignments

((LSTMStateTuple(c=<tf.Tensor 'decoder/while/Exit_3:0' shape=(?, 10) dtype=float32>, h=<tf.Tensor 'decoder/while/Exit_4:0' shape=(?, 10) dtype=float32>),
  LSTMStateTuple(c=<tf.Tensor 'decoder/while/Exit_5:0' shape=(?, 10) dtype=float32>, h=<tf.Tensor 'decoder/while/Exit_6:0' shape=(?, 10) dtype=float32>)),
 <tf.Tensor 'decoder/while/Exit_7:0' shape=(32, 10) dtype=float32>,
 <tf.Tensor 'decoder/while/Exit_8:0' shape=() dtype=int32>,
 <tf.Tensor 'decoder/while/Exit_9:0' shape=(32, 16) dtype=float32>)

In [24]:
final_sequence_lengths  # deprecated after TF v1.2

<tf.Tensor 'decoder/while/Exit_12:0' shape=(32,) dtype=int32>

### Loss

In [25]:
logits = final_outputs.rnn_output # float32 [batch_size, sequence_length, num_decoder_symbols]
targets = y  # int32 [batch_size, sequence_length]
weights = tf.cast(
    tf.sequence_mask(y_length, maxlen=y.shape[1]), 
    tf.float32
)  # float32 [batch_size, sequence_length]

In [26]:
loss = tf.contrib.seq2seq.sequence_loss(
    logits, 
    targets, 
    weights,
    average_across_timesteps=True,
    average_across_batch=True
)
variable_summaries('loss', loss)

### Training and Visualization

In [27]:
optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate).minimize(loss)

In [28]:
now = datetime.datetime.now()
logdir = '/root/logs/seq2seq/{:04d}{:02d}{:02d}-{:02d}{:02d}{:02d}'.format(
    now.year, now.month, now.day, now.hour, now.minute, now.second
)
    
with tf.Session() as sess:
    
    # merge all summaries so far and initialize a FileWriter
    merged = tf.summary.merge_all()
    writer = tf.summary.FileWriter(logdir=logdir, graph=sess.graph)
    
    # initialize the variables
    sess.run(tf.global_variables_initializer())
    sess.run(embedding_init, feed_dict={embedding_placeholder: embedding})

    # http://ischlag.github.io/2016/06/19/tensorflow-input-pipeline-example/
    # initialize the queue threads to start to shovel data
    coord = tf.train.Coordinator()
    threads = tf.train.start_queue_runners(coord=coord)

    # run epochs
    num_batches = n // batch_size
    for epoch in range(n_epochs):
        for batch in range(num_batches):
            sess.run(optimizer)
            summary = sess.run(merged)
            writer.add_summary(summary, epoch * num_batches + batch)
            print('epoch {:d}, batch {:d}, cross-entropy {:.5f}'.format(
                epoch+1, batch+1, sess.run(loss)))

    # stop our queue threads and properly close the session
    coord.request_stop()
    coord.join(threads)


epoch 1, batch 1, cross-entropy 4.60423
epoch 1, batch 2, cross-entropy 4.60113
epoch 1, batch 3, cross-entropy 4.60205
epoch 1, batch 4, cross-entropy 4.59707
epoch 1, batch 5, cross-entropy 4.59438
epoch 1, batch 6, cross-entropy 4.58900
epoch 1, batch 7, cross-entropy 4.57931
epoch 1, batch 8, cross-entropy 4.57536
epoch 1, batch 9, cross-entropy 4.58348
epoch 1, batch 10, cross-entropy 4.57630
epoch 1, batch 11, cross-entropy 4.56829
epoch 1, batch 12, cross-entropy 4.55880
epoch 1, batch 13, cross-entropy 4.56774
epoch 1, batch 14, cross-entropy 4.57732
epoch 1, batch 15, cross-entropy 4.56754
epoch 2, batch 1, cross-entropy 4.55250
epoch 2, batch 2, cross-entropy 4.56730
epoch 2, batch 3, cross-entropy 4.54931
epoch 2, batch 4, cross-entropy 4.54116
epoch 2, batch 5, cross-entropy 4.54885
epoch 2, batch 6, cross-entropy 4.53232
epoch 2, batch 7, cross-entropy 4.54544
epoch 2, batch 8, cross-entropy 4.54166
epoch 2, batch 9, cross-entropy 4.53120
epoch 2, batch 10, cross-entropy 4

During or after training, summaries can be checked by running `tensorboard --logdir=$LOGDIR_DEFINED_ABOVE` on the command line.