In [None]:
import tensorflow as tf
import tensorflow.contrib.eager as tfe
from tensorflow.python.ops import lookup_ops

import numpy as np
import os
from rnn_lm import LanguageModel, create_dataset, loss_fun

In [None]:
# Switch TensorFlow to eager mode so ops run immediately and return concrete values.
tf.enable_eager_execution()
# Fix TensorFlow's random seed so initialisation/dropout are repeatable across runs.
tf.set_random_seed(42)

In [None]:
# PTB training / validation text and vocabulary file, relative to the notebook's working directory.
train_file = 'simple-examples/data/ptb.train.txt'
valid_file = 'simple-examples/data/ptb.valid.txt'
vocab_file = 'simple-examples/data/vocab.txt'

In [None]:
# Lookup table built from the vocabulary file; it maps each token to an integer id.
vocab_table = lookup_ops.index_table_from_file(vocab_file)

In [None]:
# Model hyper-parameters, passed to LanguageModel(V=..., cell=..., d=..., h=...).
# V: vocabulary size, read from the lookup table.
V = int(vocab_table.size())
# cell: RNN cell type selector understood by the model.
cell = 'lstm'
# d, h: presumably the embedding size and the RNN hidden size -- confirm against rnn_lm.
d = 128
h = 128

# Number of examples per batch for both the train and validation datasets.
BATCH_SIZE=32

In [None]:
# Build the language model and the batched train / validation datasets (helpers come from rnn_lm).
lm = LanguageModel(V=V, cell=cell, d=d, h=h)
train_dataset = create_dataset(train_file, batch_size=BATCH_SIZE, vocab_table=vocab_table)
valid_dataset = create_dataset(valid_file, batch_size=BATCH_SIZE, vocab_table=vocab_table)

In [None]:
# Adam optimizer; this single instance is reused by every training loop and checkpoint below.
opt = tf.train.AdamOptimizer(learning_rate=0.001)

### P1: Perplexity

<img src="ppl@2x.png" alt="drawing" width="200"/>


1. Compute the average per-word loss $L$ over the **entire** dataset
2. Perplexity is $e^{L}$

**Question**: What should be the perplexity for an untrained model?

In [None]:
# Baseline: a model that spreads probability uniformly over 10,000 words gives the
# target word probability 1/10000, so its per-word loss is -log(1/10000).
# NOTE(review): 10000 is hard-coded; presumably it equals V -- confirm against vocab.txt.
p_tgt_untrained = 1/10000
loss_untrained = -np.log(p_tgt_untrained)
print(f'loss: {loss_untrained}')

In [None]:
# Perplexity is exp(loss), so the uniform baseline recovers 1 / p_tgt_untrained.
ppl = np.exp(loss_untrained)
print(ppl)

In [None]:
def compute_ppl(model, dataset):
    """Compute the perplexity of `model` over every batch in `dataset`.

    Perplexity is exp(L), where L is the loss averaged over all words of the
    whole dataset (not the average of per-batch averages).

    Args:
        model: language model handed to `loss_fun`.
        dataset: iterable of batches; `datum[2]` is summed to get the number
            of words in the batch (presumably the sequence lengths).

    Returns:
        The perplexity as a NumPy float.

    Raises:
        ValueError: if the dataset contains no words, so no average exists.
    """
    total_loss = 0.
    total_words = 0
    for batch_num, datum in enumerate(dataset):
        num_words = int(tf.reduce_sum(datum[2]))
        # loss_fun is assumed to return the mean loss per word of the batch;
        # re-weight by the word count and ACCUMULATE so that batches with
        # more words contribute proportionally to the dataset average.
        total_loss += float(loss_fun(model, datum)) * num_words
        total_words += num_words
        if batch_num % 50 == 0:
            print(f'ppl Done batch: {batch_num}')
    if total_words == 0:
        raise ValueError('compute_ppl: dataset contains no words')
    # Divide by the words seen across the whole dataset, not just the last batch.
    return np.exp(total_loss / total_words)

In [None]:
# Validation perplexity of the freshly initialised (untrained) model.
compute_ppl(lm, valid_dataset)

Let us now load our saved model, which was trained for some steps, and see if it does any better!

In [None]:
# Directory that holds the saved model checkpoints.
checkpoint_dir = 'lm'
# Group optimizer, model and global step so their state is restored together.
root = tfe.Checkpoint(optimizer=opt, model=lm, optimizer_step=tf.train.get_or_create_global_step())
# Restore from the most recent checkpoint found in checkpoint_dir.
root.restore(tf.train.latest_checkpoint(checkpoint_dir))

In [None]:
# Validation perplexity after restoring the saved checkpoint.
compute_ppl(lm, valid_dataset)

**Question**: Does perplexity depend on the size of vocabulary in a language model?

### P2: Gradient Clipping

A key challenge in training RNNs is that gradients sometimes become too large (they "explode"). Pascanu, Mikolov & Bengio [suggested a simple fix](https://arxiv.org/abs/1211.5063) for the problem: if the gradient norm is too large, clip it!

In [None]:
def clip_gradients(grads_and_vars, clip_ratio):
  """Clip gradients by their global norm, keeping each one paired with its variable.

  Args:
    grads_and_vars: iterable of (gradient, variable) pairs.
    clip_ratio: value passed to tf.clip_by_global_norm as the clipping norm.

  Returns:
    A zip iterator of (clipped_gradient, variable) pairs in the original order.
  """
  # Split the pairs into two parallel tuples: all gradients, all variables.
  grads, params = zip(*grads_and_vars)
  # clip_by_global_norm returns (clipped_list, global_norm); only the list is needed.
  clip_result = tf.clip_by_global_norm(grads, clip_ratio)
  clipped_grads, _unused_norm = clip_result
  return zip(clipped_grads, params)

In [None]:
# Wrap loss_fun so a single call yields the loss value together with the
# (gradient, variable) pairs consumed by clip_gradients / opt.apply_gradients.
loss_and_grads_fun = tfe.implicit_value_and_gradients(loss_fun)

In [None]:
# Re-create the model so that training starts from freshly initialised weights.
lm = LanguageModel(V=V, cell=cell, d=d, h=h)
checkpoint_dir = 'lm'
# Checkpoint object tracking optimizer, model and global step; the training loop calls root.save(...).
root = tfe.Checkpoint(optimizer=opt, model=lm, optimizer_step=tf.train.get_or_create_global_step())
# Path prefix handed to root.save() when a checkpoint is written.
checkpoint_prefix = os.path.join(checkpoint_dir, 'ckpt')

In [None]:
# Training schedule: number of passes over the data, how often (in steps) to
# print the running training loss, and how often to evaluate on validation data.
NUM_EPOCHS = 10
STATS_STEPS = 50
EVAL_STEPS = 500

# Baseline validation perplexity; a checkpoint is only saved when this is beaten.
valid_ppl = compute_ppl(lm, valid_dataset)
print(f'Start :Valid ppl: {valid_ppl}')

for epoch_num in range(NUM_EPOCHS):
    # Losses collected since the last stats print.
    batch_loss = []
    for step_num, datum in enumerate(train_dataset, start=1):
        loss_value, gradients = loss_and_grads_fun(lm, datum)
        # Store a plain Python float so the list is cheap to average.
        batch_loss.append(float(loss_value))

        if step_num % STATS_STEPS == 0:
            # Average over every step since the last report, not just the current one.
            print(f'Epoch: {epoch_num} Step: {step_num} Avg Loss: {np.mean(batch_loss)}')
            batch_loss = []
        # Clip by global norm (5.0) before applying the update.
        opt.apply_gradients(clip_gradients(gradients, 5.0), global_step=tf.train.get_or_create_global_step())

        if step_num % EVAL_STEPS == 0:
            ppl = compute_ppl(lm, valid_dataset)

            # Save the model only when validation perplexity improves.
            if ppl < valid_ppl:
                save_path = root.save(checkpoint_prefix)
                print(f'Epoch: {epoch_num} Step: {step_num} ppl improved: {ppl} old: {valid_ppl} Model saved: {save_path}')
                valid_ppl = ppl
            else:
                print(f'Epoch: {epoch_num} Step: {step_num} ppl worse: {ppl} old: {valid_ppl}')

    print(f'Epoch{epoch_num} Done!')

### P3: Dropout

An important regularization technique for RNNs is dropout:

* Randomly make some units zero
* Scale up the remaining units so that the expected magnitude of the signal stays the same!
* Only applied at **train** time

Let us check it out

In [None]:
# Take the first training batch and embed datum[0] (presumably the input word ids).
datum = next(iter(train_dataset))
word_vectors = lm.word_embedding(datum[0])

In [None]:
# Inspect the shape of the embedded batch.
print(word_vectors.shape)

Let us look at, say, the example at index 10, the word at index 8, and the first 4 features (indices are zero-based)...

In [None]:
# Zero-based indexing: example at index 10, word at index 8, first 4 features.
word_vectors[10][8][:4]

In [None]:
# Keep each unit with probability 0.8 (drop ~20%); kept units are scaled up by 1/0.8.
word_vectors_dropout = tf.nn.dropout(word_vectors, keep_prob=0.8)

In [None]:
# Same slice as before, after dropout: entries are either zero or scaled up.
word_vectors_dropout[10][8][:4]

Let us change our Language Model to include dropout....

In [None]:
from rnn_lm import Embedding, StaticRNN
class LanguageModel(tf.keras.Model):
    """RNN language model: word embedding -> RNN -> dense projection to V logits.

    When `train` is truthy, dropout is applied to the word embeddings and to
    the stacked RNN outputs; otherwise both pass through unchanged.
    """

    def __init__(self, V, d, h, cell):
        """Create the three sub-layers.

        Args:
            V: vocabulary size; given to Embedding and used as the number of
                output units of the final dense layer.
            d: second argument of Embedding (presumably the embedding size).
            h: first argument of StaticRNN (presumably the hidden size).
            cell: cell-type selector given to StaticRNN.
        """
        super().__init__()
        self.word_embedding = Embedding(V, d)
        self.rnn = StaticRNN(h, cell)
        self.output_layer = tf.keras.layers.Dense(units=V)

    def call(self, datum, train=False, dropout=0.):
        """Return logits for a batch.

        Args:
            datum: batch; datum[0] is embedded and datum[2] is handed to the RNN
                (presumably word ids and sequence lengths -- confirm in rnn_lm).
            train: apply dropout only when truthy.
            dropout: fraction of units to drop; keep_prob is 1 - dropout.

        Returns:
            Output of the dense layer applied to the stacked RNN outputs.
        """
        keep_prob = 1-dropout
        embedded = self.word_embedding(datum[0])
        if train:
            embedded = tf.nn.dropout(embedded, keep_prob=keep_prob)
        per_step_outputs = self.rnn(embedded, datum[2])

        # Stack the per-time-step outputs along axis 1: batch_size x TimeSteps x h.
        stacked_outputs = tf.stack(per_step_outputs, axis=1)
        if train:
            stacked_outputs = tf.nn.dropout(stacked_outputs, keep_prob=keep_prob)
        return self.output_layer(stacked_outputs)

In [None]:
def loss_fun(model, datum, train=False):
    """Mean cross-entropy per word for one batch, ignoring padded positions.

    Args:
        model: callable invoked as model(datum, train); must return logits.
        datum: batch; datum[1] holds the target labels and datum[2] the
            sequence lengths used to build the padding mask.
        train: forwarded positionally to the model.

    Returns:
        Sum of the masked per-position losses divided by the total of datum[2].
    """
    targets, lengths = datum[1], datum[2]
    logits = model(datum, train)
    token_losses = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=logits, labels=targets)
    # 1.0 for positions inside each sequence, 0.0 for padding.
    valid_positions = tf.sequence_mask(lengths, dtype=tf.float32)
    masked_total = tf.reduce_sum(token_losses * valid_positions)
    word_count = tf.cast(tf.reduce_sum(lengths), dtype=tf.float32)
    return masked_total / word_count

In [None]:
def compute_ppl(model, dataset):
    """Compute the perplexity of `model` over every batch in `dataset`.

    Perplexity is exp(L), where L is the loss averaged over all words of the
    whole dataset (not the average of per-batch averages).

    Args:
        model: language model handed to `loss_fun`.
        dataset: iterable of batches; `datum[2]` is summed to get the number
            of words in the batch (presumably the sequence lengths).

    Returns:
        The perplexity as a NumPy float.

    Raises:
        ValueError: if the dataset contains no words, so no average exists.
    """
    total_loss = 0.
    total_words = 0
    for batch_num, datum in enumerate(dataset):
        num_words = int(tf.reduce_sum(datum[2]))
        # loss_fun returns the mean loss per word of the batch; re-weight by
        # the word count and ACCUMULATE so that batches with more words
        # contribute proportionally to the dataset average.
        total_loss += float(loss_fun(model, datum)) * num_words
        total_words += num_words
        if batch_num % 50 == 0:
            print(f'ppl Done batch: {batch_num}')
    if total_words == 0:
        raise ValueError('compute_ppl: dataset contains no words')
    # Divide by the words seen across the whole dataset, not just the last batch.
    return np.exp(total_loss / total_words)

In [None]:
def train_loss(model, datum, dropout=0.2):
    """Training-mode loss: `loss_fun` evaluated with dropout actually switched on.

    `loss_fun` forwards only the `train` flag to the model, and the model's
    `dropout` argument defaults to 0., which gives keep_prob = 1 (nothing is
    dropped). The model is therefore wrapped here so that a non-zero dropout
    rate reaches its `call` method.

    Args:
        model: language model whose call accepts `train` and `dropout` keywords.
        datum: one batch, passed through to `loss_fun` unchanged.
        dropout: fraction of units to drop during training. The default of 0.2
            mirrors the keep_prob=0.8 used in the dropout demo of this notebook.

    Returns:
        Whatever `loss_fun` returns for this batch in training mode.
    """
    def model_with_dropout(batch, train=False):
        # Same calling convention loss_fun uses, plus the dropout rate.
        return model(batch, train=train, dropout=dropout)

    return loss_fun(model_with_dropout, datum, train=True)

In [None]:
# Gradients are now taken of the training-mode loss (train_loss) instead of loss_fun.
loss_and_grads_fun = tfe.implicit_value_and_gradients(train_loss)

In [None]:
# Re-create the model; LanguageModel now refers to the dropout-aware class defined in this notebook.
lm = LanguageModel(V=V, cell=cell, d=d, h=h)
checkpoint_dir = 'lm'
# Checkpoint object tracking optimizer, model and global step; the training loop calls root.save(...).
root = tfe.Checkpoint(optimizer=opt, model=lm, optimizer_step=tf.train.get_or_create_global_step())
# Path prefix handed to root.save() when a checkpoint is written.
checkpoint_prefix = os.path.join(checkpoint_dir, 'ckpt')

In [None]:
# Validation perplexity of the newly created model, before any training.
compute_ppl(lm, valid_dataset)

In [None]:
# Training schedule: number of passes over the data, how often (in steps) to
# print the running training loss, and how often to evaluate on validation data.
NUM_EPOCHS = 10
STATS_STEPS = 50
EVAL_STEPS = 500

# Baseline validation perplexity; a checkpoint is only saved when this is beaten.
valid_ppl = compute_ppl(lm, valid_dataset)
print(f'Start :Valid ppl: {valid_ppl}')

for epoch_num in range(NUM_EPOCHS):
    # Losses collected since the last stats print.
    batch_loss = []
    for step_num, datum in enumerate(train_dataset, start=1):
        loss_value, gradients = loss_and_grads_fun(lm, datum)
        # Store a plain Python float so the list is cheap to average.
        batch_loss.append(float(loss_value))

        if step_num % STATS_STEPS == 0:
            # Average over every step since the last report, not just the current one.
            print(f'Epoch: {epoch_num} Step: {step_num} Avg Loss: {np.mean(batch_loss)}')
            batch_loss = []
        # Clip by global norm (5.0) before applying the update.
        opt.apply_gradients(clip_gradients(gradients, 5.0), global_step=tf.train.get_or_create_global_step())

        if step_num % EVAL_STEPS == 0:
            # Validation uses compute_ppl, which calls loss_fun without train=True.
            ppl = compute_ppl(lm, valid_dataset)

            # Save the model only when validation perplexity improves.
            if ppl < valid_ppl:
                save_path = root.save(checkpoint_prefix)
                print(f'Epoch: {epoch_num} Step: {step_num} ppl improved: {ppl} old: {valid_ppl} Model saved: {save_path}')
                valid_ppl = ppl
            else:
                print(f'Epoch: {epoch_num} Step: {step_num} ppl worse: {ppl} old: {valid_ppl}')

    print(f'Epoch{epoch_num} Done!')