# Recurrent Neural Networks: Language Modeling

Given words $x_{1},x_{2},...,x_{t}$,a language model will predict the following word $x_{t+1}$ by modeling:

$P(x_{t+1}=v_{j}|x_{t},...,x_{1})$

which $v_{j}$ is a word in vocabulary.

$e^{(t)}=x^{(t)}L$
$h^{(t)}=sigmoid(h^{(t-1)}H + e^{(t)}I + b_{1})$
$y^{(t)}=softmax(h^{(t)}U + b2)$

$P(x_{t+1}=v_{j}|x_{t},...,x_{1})=\hat{y}_{j}^{(t)}$

$x^{(t)}\in R^{|V|}$ : one-hot row vector representing the index of the current word. 

$L\in R^{|V|*d}$ : word embeddings, 

$H\in R^{D_{h}*D_{h}}$ : the hidden transformation matrix

$I\in R^{d*D_{h}}$ : the input word to hidden transformation matrix

$b_{1} \in R^{D_{h}}$ : bias

$b_{2} \in R^{|V|}$ : bias

$|V|$ : the vocabulary size; $d$ : the word embedding size; $D_{h}$ : the hidden layer dimension

### Loss Function:
cross entropy :
$J^{(t)}(\theta)=CE(y^{(t)},\hat{y}^{(t)})=-\sum_{i=1}^{|V|}y_{i}^{(t)}log\hat{y}^{(t)}$

In [2]:
import tensorflow as tf
import numpy as np

In [3]:
batch_size = 64
embed_size = 50
hidden_size = 100
num_steps = 10
max_epochs = 16
early_stopping = 2
dropout = 0.9
lr = 0.001

In [4]:
from utils import calculate_perplexity, get_ptb_dataset, Vocab
from utils import ptb_iterator, sample

In [5]:
def load_data():
    """Loads starter word-vectors and train/dev/test data."""
    vocab = Vocab()
    vocab.construct(get_ptb_dataset('train'))
    encoded_train = np.array(
        [vocab.encode(word) for word in get_ptb_dataset('train')],
        dtype=np.int32)
    encoded_valid = np.array(
        [vocab.encode(word) for word in get_ptb_dataset('valid')],
        dtype=np.int32)
    encoded_test = np.array(
        [vocab.encode(word) for word in get_ptb_dataset('test')],
        dtype=np.int32)
    return encoded_train, encoded_valid, encoded_test, vocab

In [6]:
train_set, valid_set, test_set, vocab = load_data()

929589.0 total words with 10000 uniques


In [7]:
print(train_set[1:50])

[ 2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26
  0 27 28 29 30 31 32 33 34 35 36 37 38 27 25 39  0 40 41 42  0 43 32 44]


# build the graph

In [8]:
# add the placeholders
def add_placeholders():
    input_placeholder = tf.placeholder(tf.int32, shape=[None, num_steps])
    label_placeholder = tf.placeholder(tf.int32, shape=[None, num_steps])
    return input_placeholder, label_placeholder

In [9]:
# create the feed_dict
def create_feed_dict(input_placeholder, input_batch, label_placeholder, label_batch):
    feed_dict = {input_placeholder: input_batch,
                label_placeholder:label_batch}
    return feed_dict

### HINTS
### You should take care of how to construct the model inputs
tf.squeeze(input, squeeze_dims=None, name=None):Removes dimensions of size 1 from the shape of a tensor.

tf.split(split_dim, num_split, value, name='split'):Splits a tensor into `num_split` tensors along one dimension.
    
num_steps: like the $t$ in the model. 

In [10]:
def add_embed_layer(vocab_size, input_placeholder):
    with tf.device('/cpu:0'):
        embed = tf.get_variable(name="Embedding", shape=[vocab_size, embed_size])
        inputs = tf.nn.embedding_lookup(embed, input_placeholder)
        inputs = [tf.squeeze(input, squeeze_dims=[1]) for input in tf.split(1, num_steps, inputs)] 
        return inputs

In [11]:
## according the model at the begining of this file
def add_model(inputs, vocab_size):
    ###initial state
    with tf.variable_scope('RNN') as scope:
        state = tf.zeros([batch_size, hidden_size])
        rnn_outputs = []
        for tstep, current_input in enumerate(inputs):
            if tstep > 0:
                scope.reuse_variables()
            RNN_H = tf.get_variable('HMatrix', shape=[hidden_size, hidden_size])
            RNN_I = tf.get_variable('IMatrix', shape=[embed_size, hidden_size])
            RNN_b1 = tf.get_variable('b1', shape=[hidden_size])
            state = tf.sigmoid(tf.matmul(state, RNN_H) + tf.matmul(current_input, RNN_I) + RNN_b1)
            rnn_outputs.append(state)
    return rnn_outputs

In [28]:
## add the Projection layer
def add_projection_layer(rnn_outputs, vocab_size):
    RNN_U = tf.get_variable('UMatrix', shape=[hidden_size, vocab_size])
    RNN_b2 = tf.get_variable('b2', shape=[vocab_size])
    outputs = [tf.matmul(state, RNN_U) + RNN_b2 for state in rnn_outputs]
    return outputs

In [29]:
## add loss op
def add_loss_op(outputs, label_placeholder):
    all_one_weights = num_steps*[tf.ones([batch_size])]
    ### TODO len(targets) = len(output) = num_steps!!
    ##construct the targets
    targets = [tf.squeeze(label_placeholder[:, i]) for i in range(num_steps)]
    cross_entropy = tf.nn.seq2seq.sequence_loss(outputs,
                                                targets=targets, 
                                                weights=all_one_weights)
    loss = cross_entropy
    return loss

In [30]:
## add training op
def add_train_op(loss):
    train_op = tf.train.AdamOptimizer(0.01).minimize(loss)
    return train_op

In [52]:
def evaluation(y_pred, label_placeholder):
    label_pred = [tf.cast(tf.argmax(value, dimension=1), tf.int32) for key, value  in enumerate(y_pred)]
    label_right = [tf.squeeze(label_placeholder[:, i]) for i in range(num_steps)]
    correct_pred_num = []
    for i in range(num_steps):
        correct_pred_num.append(tf.reduce_sum(tf.cast(tf.equal(label_right[i], label_pred[i]), tf.int32)))
    correct_pred_num = np.sum(correct_pred_num)
    return correct_pred_num

In [53]:
a = [1,2,3]
print(type(a))
np.sum(a)
b = [i*j for i in range(10) ; j in range(9)]

SyntaxError: invalid syntax (<ipython-input-53-7ade904bd1b1>, line 4)

In [54]:
with tf.Graph().as_default(), tf.device('/cpu:0'):
    input_placeholder, label_placeholder = add_placeholders()
    vocab_size = len(vocab)
    inputs = add_embed_layer(vocab_size, input_placeholder)
    rnn_outputs = add_model(inputs, vocab_size)

    ## projecttion, output is a List, num_steps of 2-D tensors[batch_size, vocab_size]
    outputs = add_projection_layer(rnn_outputs, vocab_size)
    ##
    pred = [tf.nn.softmax(value) for key, value in enumerate(outputs)]
    
    eval_correct = evaluation(pred, label_placeholder)
    ## add loss op
    loss = add_loss_op(outputs, label_placeholder)
    ## add training op
    train_op = add_train_op(loss)
    
    for epoch in range(max_epochs):
        print('Epoch %d' %(epoch))
        print('Start Training')
        sess = tf.Session()
        sess.run(tf.initialize_all_variables())
        loss_mean = []
        for step, (x, y) in enumerate(ptb_iterator(train_set, batch_size, num_steps)):
            input_batch = x
            label_batch = y
            feed = create_feed_dict(input_placeholder, input_batch, label_placeholder, label_batch)
            _, loss_step = sess.run([train_op, loss], feed)
            loss_mean.append(loss_step)
            if step % 10 == 0:
                eval_correct_step = sess.run([eval_correct], feed)
                #pred_step = sess.run([pred], feed)
               ## label_pred = [tf.argmax(value, dimension=1) for key, value  in pred_step]
                print(eval_correct_step)
                print('step %d: loss: %f' %(step, np.mean(loss_mean)))
                loss_mean = []
                
        print('Start Vailidation')
        

Epoch 0
Start Training
[2]
step 0: loss: 9.616463
[35]
step 10: loss: 7.766164
[60]
step 20: loss: 7.025149
[58]
step 30: loss: 6.770122
[56]
step 40: loss: 6.695487
[81]
step 50: loss: 6.544434
[69]
step 60: loss: 6.513261
[78]
step 70: loss: 6.467342
[93]
step 80: loss: 6.397374
[89]
step 90: loss: 6.292168
[83]
step 100: loss: 6.231040
[91]
step 110: loss: 6.219158
[93]
step 120: loss: 6.214158
[89]
step 130: loss: 6.231470
[83]
step 140: loss: 6.085512
[116]
step 150: loss: 6.111127
[98]
step 160: loss: 6.147024
[106]
step 170: loss: 6.074859
[99]
step 180: loss: 5.939183
[94]
step 190: loss: 6.011034
[90]
step 200: loss: 6.076472
[91]
step 210: loss: 5.941876
[101]
step 220: loss: 5.944173
[85]
step 230: loss: 5.979993
[105]
step 240: loss: 5.883658
[112]
step 250: loss: 5.818613
[123]
step 260: loss: 5.800500
[111]
step 270: loss: 5.725658
[105]
step 280: loss: 5.904441
[101]
step 290: loss: 5.870816
[110]
step 300: loss: 5.838589
[116]
step 310: loss: 5.791898
[115]
step 320: lo

KeyboardInterrupt: 

In [35]:
for step, (x, y) in enumerate(ptb_iterator(train_set, batch_size, num_steps)):
    input_batch = x
    label_batch = y
    print(x.shape)
    print(y.shape)
    break;

(64, 10)
(64, 10)


In [None]:
help(tf.nn.embedding_lookup)

In [None]:
help(tf.nn.seq2seq.sequence_loss_by_example)

In [None]:
help(tf.variable_scope)