# Recurrent Neural Networks: Language Modeling

Given words $x_{1},x_{2},...,x_{t}$, a language model predicts the following word $x_{t+1}$ by modeling:

$P(x_{t+1}=v_{j}|x_{t},...,x_{1})$

where $v_{j}$ is a word in the vocabulary.

$e^{(t)}=x^{(t)}L$

$h^{(t)}=sigmoid(h^{(t-1)}H + e^{(t)}I + b_{1})$

$\hat{y}^{(t)}=softmax(h^{(t)}U + b_{2})$

$P(x_{t+1}=v_{j}|x_{t},...,x_{1})=\hat{y}_{j}^{(t)}$

$x^{(t)}\in R^{|V|}$ : one-hot row vector representing the index of the current word. 

$L\in R^{|V|*d}$ : the word embedding matrix

$H\in R^{D_{h}*D_{h}}$ : the hidden transformation matrix

$I\in R^{d*D_{h}}$ : the input word to hidden transformation matrix

$U\in R^{D_{h}*|V|}$ : the hidden to output projection matrix

$b_{1} \in R^{D_{h}}$ : bias

$b_{2} \in R^{|V|}$ : bias

$|V|$ : the vocabulary size; $d$ : the word embedding size; $D_{h}$ : the hidden layer dimension

### Loss Function:
cross entropy :
$J^{(t)}(\theta)=CE(y^{(t)},\hat{y}^{(t)})=-\sum_{i=1}^{|V|}y_{i}^{(t)}\log\hat{y}_{i}^{(t)}$

In [1]:
from __future__ import division
import tensorflow as tf
import numpy as np
import time

### start

In [2]:
# Wall-clock start time; the timing cell near the end of the notebook subtracts it.
time_start = time.time()

In [3]:
# Hyper-parameters shared by the graph-building and training cells below.
batch_size = 64      # sequences per batch; also fixes the initial-state placeholder shape
embed_size = 50      # word embedding size (d in the model description)
hidden_size = 100    # hidden layer dimension (D_h in the model description)
num_steps = 10       # number of time steps the RNN is unrolled for
max_epochs = 16      # number of passes over the training set in the training loop
early_stopping = 2   # NOTE(review): not referenced by the visible cells - confirm intended use
dropout = 0.9        # NOTE(review): not referenced by the visible cells - confirm intended use
lr = 0.001           # learning rate intended for the Adam optimizer built in add_train_op

In [4]:
from utils import calculate_perplexity, get_ptb_dataset, Vocab
from utils import ptb_iterator, sample

In [5]:
def load_data():
    """Build the PTB vocabulary and integer-encode the train/valid/test splits.

    The vocabulary is constructed from the 'train' split only; every split is
    then read again and encoded word by word with that vocabulary.

    Returns:
        A tuple ``(encoded_train, encoded_valid, encoded_test, vocab)`` where
        the first three items are ``np.int32`` arrays of encoded words and
        ``vocab`` is the ``Vocab`` instance used to encode them.
    """
    vocab = Vocab()
    vocab.construct(get_ptb_dataset('train'))

    def encode_split(split_name):
        # Encode one corpus split with the shared vocabulary.
        word_ids = [vocab.encode(word) for word in get_ptb_dataset(split_name)]
        return np.array(word_ids, dtype=np.int32)

    encoded_train = encode_split('train')
    encoded_valid = encode_split('valid')
    encoded_test = encode_split('test')
    return encoded_train, encoded_valid, encoded_test, vocab

In [6]:
# Encode all three PTB splits once; `vocab` is reused later to size the model.
train_set, valid_set, test_set, vocab = load_data()

929589.0 total words with 10000 uniques


In [7]:
# Sanity check: show a short slice of the encoded training ids (index 0 is skipped).
print(train_set[1:50])

[ 2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26
  0 27 28 29 30 31 32 33 34 35 36 37 38 27 25 39  0 40 41 42  0 43 32 44]


# build the graph

In [8]:
# add the placeholders
def add_placeholders():
    """Create the placeholders through which batches are fed to the graph.

    Returns:
        A tuple ``(input_placeholder, label_placeholder,
        initial_state_placeholder)``: two int32 placeholders of shape
        ``[None, num_steps]`` and one float32 placeholder of shape
        ``[batch_size, hidden_size]``.
    """
    token_shape = [None, num_steps]
    state_shape = [batch_size, hidden_size]
    input_placeholder = tf.placeholder(tf.int32, shape=token_shape)
    label_placeholder = tf.placeholder(tf.int32, shape=token_shape)
    initial_state_placeholder = tf.placeholder(tf.float32, shape=state_shape)
    return input_placeholder, label_placeholder, initial_state_placeholder

In [9]:
# create the feed_dict
def create_feed_dict(input_placeholder, input_batch, label_placeholder, label_batch,
                     initial_state_placeholder, initial_state):
    """Pair each placeholder with the value to feed for one session run.

    Args:
        input_placeholder: key under which ``input_batch`` is fed.
        input_batch: value fed for the inputs.
        label_placeholder: key under which ``label_batch`` is fed.
        label_batch: value fed for the labels.
        initial_state_placeholder: key under which ``initial_state`` is fed.
        initial_state: value fed for the initial hidden state.

    Returns:
        A dict mapping the three placeholders to their respective values.
    """
    pairs = [
        (input_placeholder, input_batch),
        (label_placeholder, label_batch),
        (initial_state_placeholder, initial_state),
    ]
    return dict(pairs)

### HINTS
### You should take care of how to construct the model inputs
`tf.squeeze(input, squeeze_dims=None, name=None)`: Removes dimensions of size 1 from the shape of a tensor.

`tf.split(split_dim, num_split, value, name='split')`: Splits a tensor into `num_split` tensors along one dimension.
    
`num_steps`: the number of time steps the RNN is unrolled for, i.e. the range of $t$ in the model above.

In [10]:
def add_embed_layer(vocab_size, input_placeholder):
    """Look up word embeddings and split them into one tensor per time step.

    Args:
        vocab_size: number of rows of the embedding matrix.
        input_placeholder: integer tensor of word ids passed to the lookup.

    Returns:
        A list of ``num_steps`` tensors, one per time step, each with the
        time axis (axis 1) squeezed out.
    """
    with tf.device('/cpu:0'):
        embed = tf.get_variable(name="Embedding", shape=[vocab_size, embed_size])
        embedded = tf.nn.embedding_lookup(embed, input_placeholder)
        # Legacy tf.split signature: (split_dim, num_split, value).
        per_step = tf.split(1, num_steps, embedded)
        inputs = [tf.squeeze(step_slice, squeeze_dims=[1]) for step_slice in per_step]
        return inputs

In [11]:
## Implements the recurrence from the model described at the beginning of this notebook
def add_model(inputs, vocab_size, initial_state_placeholder):
    """Unroll the sigmoid RNN over the per-step inputs.

    Args:
        inputs: list of per-step input tensors, processed in order.
        vocab_size: not used by this function; kept for the call signature.
        initial_state_placeholder: tensor used as the hidden state before the
            first step.

    Returns:
        ``(rnn_outputs, final_state)``: the list of hidden states produced
        after each step, and the last of those states.
    """
    with tf.variable_scope('RNN') as scope:
        state = initial_state_placeholder
        rnn_outputs = []
        for current_input in inputs:
            # The weights are created on the first step and shared by all
            # later steps; a non-empty output list means we are past step 0.
            if rnn_outputs:
                scope.reuse_variables()
            hidden_weights = tf.get_variable('HMatrix', shape=[hidden_size, hidden_size])
            input_weights = tf.get_variable('IMatrix', shape=[embed_size, hidden_size])
            bias = tf.get_variable('b1', shape=[hidden_size])
            recurrent_term = tf.matmul(state, hidden_weights)
            input_term = tf.matmul(current_input, input_weights)
            state = tf.sigmoid(recurrent_term + input_term + bias)
            rnn_outputs.append(state)
        final_state = rnn_outputs[-1]
    return rnn_outputs, final_state

In [12]:
## add the Projection layer
def add_projection_layer(rnn_outputs, vocab_size):
    """Project every hidden state onto the vocabulary.

    Args:
        rnn_outputs: list of hidden-state tensors, one per time step.
        vocab_size: number of columns of the projection matrix.

    Returns:
        A list with one tensor per entry of ``rnn_outputs``, each computed as
        ``state * U + b2`` (no softmax is applied here).
    """
    proj_weights = tf.get_variable('UMatrix', shape=[hidden_size, vocab_size])
    proj_bias = tf.get_variable('b2', shape=[vocab_size])
    outputs = []
    for state in rnn_outputs:
        outputs.append(tf.matmul(state, proj_weights) + proj_bias)
    return outputs

In [13]:
from tensorflow.python.ops.seq2seq import sequence_loss

In [14]:
# Interactive API lookup: prints the signature and docstring of sequence_loss.
help(sequence_loss)

Help on function sequence_loss in module tensorflow.python.ops.seq2seq:

sequence_loss(logits, targets, weights, average_across_timesteps=True, average_across_batch=True, softmax_loss_function=None, name=None)
    Weighted cross-entropy loss for a sequence of logits, batch-collapsed.
    
    Args:
      logits: List of 2D Tensors of shape [batch_size x num_decoder_symbols].
      targets: List of 1D batch-sized int32 Tensors of the same length as logits.
      weights: List of 1D batch-sized float-Tensors of the same length as logits.
      average_across_timesteps: If set, divide the returned cost by the total
        label weight.
      average_across_batch: If set, divide the returned cost by the batch size.
      softmax_loss_function: Function (inputs-batch, labels-batch) -> loss-batch
        to be used instead of the standard softmax (the default if this is None).
      name: Optional name for this operation, defaults to "sequence_loss".
    
    Returns:
      A scalar float T

In [15]:
## add loss op
def add_loss_op(outputs, label_placeholder, vocab_size):
    """Build the cross-entropy loss over all time steps.

    Args:
        outputs: list of ``num_steps`` logit tensors, one per time step.
        label_placeholder: int32 tensor of target word ids; column ``i`` holds
            the targets for time step ``i``.
        vocab_size: not used by this function; kept for the call signature.

    Returns:
        The scalar loss tensor returned by ``sequence_loss``, which by default
        averages across time steps and across the batch.
    """
    # Every position contributes equally to the loss.
    all_one_weights = num_steps * [tf.ones([batch_size])]
    # sequence_loss needs one 1-D target tensor per time step. Indexing a
    # column with an integer already yields a rank-1 tensor, so no squeeze is
    # needed (an axis-less squeeze would also collapse a batch of size 1).
    targets = [label_placeholder[:, i] for i in range(num_steps)]
    loss = tf.nn.seq2seq.sequence_loss(outputs,
                                       targets=targets,
                                       weights=all_one_weights)
    return loss

In [22]:
## add training op
def add_train_op(loss, learning_rate=None):
    """Create the Adam training op that minimizes ``loss``.

    Args:
        loss: scalar loss tensor to minimize.
        learning_rate: optional optimizer learning rate. When omitted, the
            notebook-level ``lr`` setting is used instead of a duplicated
            hard-coded literal.

    Returns:
        The op returned by ``AdamOptimizer.minimize``.
    """
    if learning_rate is None:
        # Resolved at call time so edits to the config cell take effect.
        learning_rate = lr
    train_op = tf.train.AdamOptimizer(learning_rate).minimize(loss)
    return train_op

In [23]:
# evaluate the prediction
def evaluation(y_pred, label_placeholder):
    """Count the correctly predicted words across all time steps.

    Args:
        y_pred: list of at least ``num_steps`` 2-D score tensors, one per time
            step; the predicted word is the argmax along axis 1.
        label_placeholder: int32 tensor of target word ids; column ``i`` holds
            the targets for time step ``i``.

    Returns:
        A scalar int32 tensor holding the number of positions where the
        predicted word equals the target.
    """
    label_pred = [tf.cast(tf.argmax(value, dimension=1), tf.int32) for value in y_pred]
    correct_pred_num = []
    for i in range(num_steps):
        # Integer indexing already yields the rank-1 label column for step i.
        label_right = label_placeholder[:, i]
        hits = tf.equal(label_right, label_pred[i])
        correct_pred_num.append(tf.reduce_sum(tf.cast(hits, tf.int32)))
    # Sum the per-step counts inside the graph; np.sum is not meant to operate
    # on a Python list of symbolic tensors.
    return tf.add_n(correct_pred_num)

In [24]:
# evaluate on the validation set
def do_evaluation(data_set, sess):
    """Run the model over ``data_set`` and print the resulting perplexity.

    Uses the notebook-level graph objects (``input_placeholder``,
    ``label_placeholder``, ``initial_state_placeholder``, ``loss`` and
    ``final_state``). No training op is run, so the weights are not updated.

    Args:
        data_set: encoded corpus passed to ``ptb_iterator``.
        sess: session in which the graph is executed.
    """
    batch_losses = []
    # The hidden state starts at zero and the final state of each batch is fed
    # back as the initial state of the next one.
    carried_state = np.zeros([batch_size, hidden_size])
    for x, y in ptb_iterator(data_set, batch_size, num_steps):
        feed = create_feed_dict(input_placeholder, x, label_placeholder, y,
                                initial_state_placeholder, carried_state)
        batch_loss, carried_state = sess.run([loss, final_state], feed)
        batch_losses.append(batch_loss)

    # Perplexity is the exponential of the mean per-batch loss.
    print('Validation Perplexity : %f' %
          (np.exp(np.mean(batch_losses))))
          

In [27]:
with tf.Graph().as_default(), tf.device('/cpu:0'):

    # ---- Build the graph -------------------------------------------------
    input_placeholder, label_placeholder, initial_state_placeholder = add_placeholders()
    vocab_size = len(vocab)
    inputs = add_embed_layer(vocab_size, input_placeholder)
    rnn_outputs, final_state = add_model(inputs, vocab_size, initial_state_placeholder)
    # Projection: `outputs` is a list with one logits tensor per time step.
    outputs = add_projection_layer(rnn_outputs, vocab_size)
    loss = add_loss_op(outputs, label_placeholder, vocab_size)
    train_op = add_train_op(loss)

    # ---- Train -----------------------------------------------------------
    sess = tf.Session()
    sess.run(tf.initialize_all_variables())
    for epoch in range(max_epochs):
        print('Epoch %d' %(epoch))
        print('Start Training')

        loss_mean = []
        total_correct = []  # unused: accuracy tracking is currently disabled
        # The hidden state is reset to zero at the start of every epoch and
        # then carried over from one batch to the next.
        initial_state = np.zeros([batch_size, hidden_size])
        for step, (x, y) in enumerate(ptb_iterator(train_set, batch_size, num_steps)):
            feed = create_feed_dict(input_placeholder, x, label_placeholder, y,
                                    initial_state_placeholder, initial_state)
            _, loss_step, final_state_step = sess.run([train_op, loss, final_state], feed)
            loss_mean.append(loss_step)
            initial_state = final_state_step
        # `step` is the index of the last batch processed in this epoch.
        print('Step %d: Training Perplexity: %f' %(step, np.exp(np.mean(loss_mean))))

        print('Start Vailidation')
        do_evaluation(valid_set, sess)

Epoch 0
Start Training
Step 1451: Training Perplexity: 464.571136
Start Vailidation
Validation Perplexity : 312.959930
Epoch 1
Start Training
Step 1451: Training Perplexity: 258.374359
Start Vailidation
Validation Perplexity : 245.777573
Epoch 2
Start Training
Step 1451: Training Perplexity: 206.102859
Start Vailidation
Validation Perplexity : 215.882324
Epoch 3
Start Training
Step 1451: Training Perplexity: 176.621384
Start Vailidation
Validation Perplexity : 198.346634
Epoch 4
Start Training


KeyboardInterrupt: 

### End

In [None]:
# Report the wall-clock time elapsed since `time_start` was recorded.
time_stop = time.time()
print('running time:  %f seconds' % (time_stop-time_start))

In [None]:
# Inspect the shapes of the first training batch only, then stop iterating.
for step, (x, y) in enumerate(ptb_iterator(train_set, batch_size, num_steps)):
    input_batch, label_batch = x, y
    print(x.shape)
    print(y.shape)
    break

In [None]:
# Interactive API lookup; not required for the training run.
help(tf.nn.embedding_lookup)

In [None]:
# Interactive API lookup; not required for the training run.
help(tf.nn.seq2seq.sequence_loss_by_example)

In [None]:
# Interactive API lookup; not required for the training run.
help(tf.variable_scope)