In [1]:
import os
import numpy as np

import tensorflow as tf
from tensorflow.contrib.eager.python import tfe

from utils.data_utils import Corpus

  from ._conv import register_converters as _register_converters


In [2]:
# enable eager mode
tf.enable_eager_execution()
tf.set_random_seed(0)
np.random.seed(0)

In [3]:
if not os.path.exists('weights/'):
    os.makedirs('weights/')

# Hyper-parameters
embed_size = 128
rnn_units = 1024
num_epochs = 5
num_samples = 1000  # number of words to be sampled
batch_size = 20
seq_length = 30
learning_rate = 0.002
num_layers = 1

In [4]:
# dataset loading
corpus = Corpus()
train_corpus = corpus.get_data('../data_ptb/train', batch_size)
vocab_size = len(corpus.dictionary)
num_batches = train_corpus.shape[-1] // seq_length

train_corpus = tf.constant(train_corpus, dtype=tf.int32)

print("Dataset shape : ", train_corpus.shape)
print("Vocabulary size : ", vocab_size)
print("Number of batches : ", num_batches)

Dataset shape :  (20, 46479)
Vocabulary size :  10000
Number of batches :  1549


# RNN Language Model

This is a stateful model which feeds its own output predictions from the previous timestep (a single word from the entire vocabulary) back into its input of the next time step. This is shown to be super useful as a pre-training step for other NLP tasks as shown in the paper [Universal Language Model Fine-tuning for Text Classification](https://arxiv.org/abs/1801.06146), and is generally used for models such as Google's Smart Reply feature in GMail.

For the language model, we have to override a few of the LSTMCell's default atributes, such as the recurrent activation from `hard_sigmoid` to `sigmoid` and the recurrent initializer from `orthogonal` to `glorot_uniform`. The model will fail to converge without the correct hyperparameters.

We also have to maintain and utilize the initial states that are managed by the caller now, so we can no longer depend on the general Model.fit() to train our model in these circumstances.

In [5]:
class RNNLanguageModel(tf.keras.Model):
    def __init__(self, vocab_size, embedding_size, rnn_units, num_layers=1):
        super(RNNLanguageModel, self).__init__()
        self.units = rnn_units
        self.vocab_size = vocab_size
        self.embedding_size = embedding_size

        self.enbedding = tf.keras.layers.Embedding(self.vocab_size, self.embedding_size)
        self.cells = [tf.keras.layers.LSTMCell(self.units, recurrent_activation='sigmoid', recurrent_initializer='glorot_uniform')
                      for _ in range(num_layers)]
        self.rnn = tf.keras.layers.RNN(self.cells, return_sequences=True, return_state=True)
        self.classifier = tf.keras.layers.Dense(vocab_size)

        self.states = None

    def call(self, inputs, training=None, mask=None, initial_states=None):
        embeds = self.enbedding(inputs)

        outputs = self.rnn(embeds, initial_state=initial_states)

        output = outputs[0]
        self.states = outputs[1:]

        # Reshape output to (batch_size * sequence_length, hidden_size)
        output = tf.reshape(output, [-1, output.shape[2]])

        # Decode hidden states of all time steps
        output = self.classifier(output)

        # cant pass a list as an output, must output only tensors
        # so keep previous output states in memory
        return output

# Canonical training
Below, we train a language model RNN using the canonical method - using Keras layers.

However, this is slow and tedious. A faster method would be to use the BasicLSTM or to write the loop ourselves to wrap over a Cell as shown in (6.3).

We perform a bit of maintainance work, where we have to supply the initial state of each epoch to each of the Cells in the RNN, accept the resultant state after each call of the model, and feed those states back as input to the next step.

We also monitor the best training perplexity and save the model only for those epochs where the perplexity is reduced from its previous best.

We then generate sampled text from this trained language model.

In [8]:
device = '/cpu:0' if tfe.num_gpus() == 0 else '/gpu:0'
with tf.device(device):
    # build model and optimizer
    model = RNNLanguageModel(vocab_size, embed_size, rnn_units, num_layers=num_layers)
    optimizer = tf.train.AdamOptimizer(learning_rate)

    # TF Keras tries to use entire dataset to determine shape without this step when using .fit()
    # Fix = Use exactly one sample from the provided input dataset to determine input/output shape/s for the model
    dummy_x = tf.zeros((1, 1))
    model._set_inputs(dummy_x)

    best_perplexity = 1e6
    saver = tfe.Saver(model.variables)

    if os.path.exists('weights/08_01_rnn_lm/') and tf.train.checkpoint_exists('weights/08_01_rnn_lm/weights.ckpt'):
        saver = tfe.Saver(model.variables)
        saver.restore('weights/08_01_rnn_lm/weights.ckpt')
        print("Restored model !")

    for epoch in range(num_epochs):
        # Set initial hidden and cell states
        initial_states = [tf.zeros([batch_size, rnn_units])] * (2 * num_layers)

        for i in range(0, train_corpus.shape[1] - seq_length, seq_length):
            # Get mini-batch inputs and targets
            inputs = train_corpus[:, i:i + seq_length]
            targets = train_corpus[:, (i + 1):(i + 1) + seq_length]
            targets = tf.reshape(targets, [-1])

            # Forward pass
            with tf.GradientTape() as tape:
                outputs = model(inputs, initial_states=initial_states)

                loss = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=outputs, labels=targets)
                loss = tf.reduce_mean(loss)

            # use only the final state
            initial_states = model.states

            # get and clip gradients
            gradients = tape.gradient(loss, model.variables)
            
            with tf.device('/cpu:0'):
                gradients = [tf.cast(g, tf.float64) for g in gradients]  # necessary cast for kernel to exist
                gradients, _ = tf.clip_by_global_norm(gradients, 5.0)
                gradients = [tf.cast(g, tf.float32) for g in gradients]  # necessary cast to correct dtype of grads and vars
        
            grad_vars = zip(gradients, model.variables)

            # update weights
            optimizer.apply_gradients(grad_vars, tf.train.get_or_create_global_step())

            step = (i + 1) // seq_length
            if step % 100 == 0:
                perplexity = np.exp(loss.numpy())

                print('Epoch [{}/{}], Step[{}/{}], Loss: {:.4f}, Perplexity: {:5.2f}'
                      .format(epoch + 1, num_epochs, step, num_batches, loss.numpy(), perplexity))

                if best_perplexity > perplexity:
                    best_perplexity = perplexity
                    saver.save('weights/08_01_rnn_lm/weights.ckpt')
                    print("Perplexity improved. Saving weights...")

    saver = tfe.Saver(model.variables)
    saver.restore('weights/08_01_rnn_lm/weights.ckpt')

    if not os.path.exists('language_model/'):
        os.makedirs('language_model/')

    # evaluation of model
    with open('language_model/sample_2.txt', 'w') as f:
        # Set intial hidden ane cell states
        initial_states = (tf.zeros([1, rnn_units]), tf.zeros([1, rnn_units]))

        # Select one word id randomly
        prob = tf.ones([1, vocab_size])
        input = tf.multinomial(prob, num_samples=1)

        for i in range(num_samples):
            # Forward propagate RNN
            output = model(input, initial_states=initial_states)
            states = model.states

            # use only the final state
            initial_states = model.states

            # Sample a word id
            prob = tf.exp(output)
            word_id = tf.multinomial(prob, num_samples=1)[0, 0]

            # Fill input with sampled word id for the next time step
            input = tf.fill(input.shape, word_id)

            # File write
            word = corpus.dictionary.idx2word[word_id.numpy()]
            word = '\n' if word == '<eos>' else word + ' '
            f.write(word)

            if (i + 1) % 100 == 0:
                print('Sampled [{}/{}] words and save to {}'.format(i + 1, num_samples, 'language_model/sample_2.txt'))

Epoch [1/5], Step[0/1549], Loss: 9.2120, Perplexity: 10016.35
Perplexity improved. Saving weights...
Epoch [1/5], Step[100/1549], Loss: 6.3532, Perplexity: 574.34
Perplexity improved. Saving weights...
Epoch [1/5], Step[200/1549], Loss: 6.2338, Perplexity: 509.69
Perplexity improved. Saving weights...
Epoch [1/5], Step[300/1549], Loss: 6.1551, Perplexity: 471.10
Perplexity improved. Saving weights...
Epoch [1/5], Step[400/1549], Loss: 5.8937, Perplexity: 362.75
Perplexity improved. Saving weights...
Epoch [1/5], Step[500/1549], Loss: 5.4598, Perplexity: 235.04
Perplexity improved. Saving weights...
Epoch [1/5], Step[600/1549], Loss: 5.5310, Perplexity: 252.39
Epoch [1/5], Step[700/1549], Loss: 5.7498, Perplexity: 314.12
Epoch [1/5], Step[800/1549], Loss: 5.4795, Perplexity: 239.73
Epoch [1/5], Step[900/1549], Loss: 5.4200, Perplexity: 225.88
Perplexity improved. Saving weights...
Epoch [1/5], Step[1000/1549], Loss: 5.4946, Perplexity: 243.38
Epoch [1/5], Step[1100/1549], Loss: 5.6467, 

# Print the sampled sentences

In [10]:
with open('language_model/sample_2.txt', 'r') as f:
    for line in f:
        print(line)



the rage he said fleming 's heights paid foster doug <unk> yard themes of the tucson rain 

the telephone mentioned in the microprocessor conversation driving entirely <unk> 

the veto geography was a households schools day baldwin <unk> granted specter consider a <unk> aggressive inherent in the <unk> cypress bridge 

the planners sidelines technological <unk> rises in the ann announcement nikko saks poorer sweden lloyd <unk> royal conduct environmentally <unk> 

the institutes of the intensity of the <unk> apartheid is i does n't seizure steering stocks 

the wherever irving <unk> casual <unk> fame produced by <unk> <unk> industry his <unk> teller resulting from the <unk> spencer racked up to the took permits <unk> 

the ironic pervasive supplier tabloid xerox 's creatures focusing exist in the alternative grew to interpublic 

the enactment of the combustion twelve demanded number of <unk> plate pursuit of the soften clinical cancer 

the citibank mlx toy postpone the fdic 's offe