In [1]:
%pylab inline
from IPython.display import Image, display

import tensorflow as tf
# Session shared by the whole notebook: later cells either call `sess.run(...)`
# directly or call `.run()` / `.eval()` on ops and tensors without passing a
# session explicitly.
sess = tf.InteractiveSession()

Populating the interactive namespace from numpy and matplotlib


## Language Modeling Using TensorFlow

* Task: Given a sequence of words, predict the next word (in this notebook each "word" is a single character of the poem corpus)
  - Models the probability of sentences in a language

* Read the data

In [2]:
# Read the whole corpus into one string. Newlines are replaced by '_' so that
# a line break is an ordinary token the model can learn and emit.
# The context manager closes the file handle deterministically; the encoding
# is spelled out so decoding does not depend on the machine's locale
# (assumes the corpus is stored as UTF-8 -- TODO confirm).
with open('./poem.txt', encoding='utf-8') as corpus_file:
    words = corpus_file.read().replace('\n', '_')

# Iterating over a str yields single characters, so every "word" below is
# one character of the corpus.
words_as_set = set(words)
# Sorting gives each token a stable integer id (its position in the list).
id_to_word = sorted(words_as_set)
word_to_id = {w: i for i, w in enumerate(id_to_word)}
# The corpus re-encoded as a flat list of token ids.
data = [word_to_id[w] for w in words]
print('Number of words %d' % len(id_to_word))
print('Example words: %s' % id_to_word[1000:1020])
print('Words: %d' % len(data))

Number of words 7957
Example words: ['嗥', '嗫', '嗷', '嗽', '嗾', '嘇', '嘈', '嘉', '嘎', '嘏', '嘐', '嘒', '嘕', '嘘', '嘛', '嘤', '嘬', '嘱', '嘲', '嘴']
Words: 4888154


* Let's build the following model
  - A recurrent neural network, unrolled in time
  - Long short term memory (LSTM) cells

<img src='data/lstm.png' />

* LSTM Cell
  - Takes the current input $x_t$, the previous output $h_{t-1}$ and the previous cell state $C_{t-1}$, and produces the new output $h_t$ and the new cell state $C_t$.
  
$$
h_t, C_t = lstm(x_t, h_{t-1}, C_{t-1})
$$

<img src='data/lstm_cell.png' width='40%'>

* Full set of equations ($[]$ is vector concatenation, $\times$ is matrix multiply, $*$ is element-wise multiply)

$$ X = [h_{t-1}, x_t] $$
$$ f_t = \sigma(W_f \times X + b_f) $$
$$ i_t = \sigma(W_i \times X + b_i) $$
$$ o_t = \sigma(W_o \times X + b_o) $$
$$ \tilde{C}_t = tanh(W_C \times X + b_C) $$
$$ C_t = f_t * C_{t-1} + i_t * \tilde{C}_t$$
$$ h_t = o_t * tanh(C_t)$$

### Parameters of the model
* We need to pick embedding dimensions and the dimensions of the state vector.
  - For convenience, let's pick `embedding_dims = state_size = 256` (called `NDIMS` in the code below)
* Embedding vectors
  - `[VOCAB, embedding_dims]`, where `VOCAB` is the number of distinct tokens in the corpus (7957 here).
* The 4 weight matrices in the equation ($W_f, W_i, W_o, W_C$)
  - `[2 * state_size, state_size]` each, because $X$ concatenates $h_{t-1}$ and $x_t$
* 4 biases ($b_f, b_i, b_o, b_C$)
  - `[state_size]`
* Softmax classifier logit layer weights and biases
  - `[state_size, VOCAB], [VOCAB]`

* Implement an LSTM cell as a class, so we can instantiate many layers

In [3]:
class LSTMCell(object):
    """One LSTM layer: forget, input and output gates plus a candidate state.

    The layer owns four weight matrices (``W_f``, ``W_i``, ``W_o``, ``W_C``)
    of shape ``[state_size + input_size, state_size]`` and four zero-initialised
    biases (``b_f``, ``b_i``, ``b_o``, ``b_C``) of shape ``[state_size]``.

    Args:
        state_size: width of the output ``h_t`` and of the cell state ``C_t``.
        input_size: width of the input ``x_t``. Defaults to ``state_size``,
            which gives the ``[2 * state_size, state_size]`` weights used when
            the input and the state have the same width.
    """

    def __init__(self, state_size, input_size=None):
        self.state_size = state_size
        self.input_size = state_size if input_size is None else input_size
        # Variables are created in a fixed order (weights first, then biases)
        # because unnamed variables are distinguished only by creation order.
        self.W_f = tf.Variable(self.initializer())
        self.W_i = tf.Variable(self.initializer())
        self.W_o = tf.Variable(self.initializer())
        self.W_C = tf.Variable(self.initializer())
        self.b_f = tf.Variable(tf.zeros([state_size]))
        self.b_i = tf.Variable(tf.zeros([state_size]))
        self.b_o = tf.Variable(tf.zeros([state_size]))
        self.b_C = tf.Variable(tf.zeros([state_size]))

    def __call__(self, x_t, h_t1, C_t1):
        """Advance the cell by one time step.

        Args:
            x_t: input for this step, ``[batch, input_size]``.
            h_t1: output of this layer at the previous step, ``[batch, state_size]``.
            C_t1: cell state at the previous step, ``[batch, state_size]``.

        Returns:
            ``(h_t, C_t)``: the new output and the new cell state.
        """
        # Previous output first, then the input: the row order of the weight
        # matrices follows this concatenation.
        X = tf.concat(1, [h_t1, x_t])
        f_t = tf.sigmoid(tf.matmul(X, self.W_f) + self.b_f)  # forget gate
        i_t = tf.sigmoid(tf.matmul(X, self.W_i) + self.b_i)  # input gate
        o_t = tf.sigmoid(tf.matmul(X, self.W_o) + self.b_o)  # output gate
        Ctilde_t = tf.tanh(tf.matmul(X, self.W_C) + self.b_C)  # candidate state
        # Keep part of the old state and add part of the candidate.
        C_t = f_t * C_t1 + i_t * Ctilde_t
        h_t = o_t * tf.tanh(C_t)
        return h_t, C_t

    def initializer(self):
        """Return a fresh uniform(-0.1, 0.1) initial value for one weight matrix."""
        return tf.random_uniform(
            [self.input_size + self.state_size, self.state_size], -0.1, 0.1)

* Declare embedding vectors, LSTM cells, and logit layer params

In [4]:
# Width shared by the embedding vectors and by every LSTM layer's state.
NDIMS = 256
# One embedding row (and one softmax column) per distinct token.
VOCAB = len(id_to_word)
embedding = tf.Variable(tf.random_uniform([VOCAB, NDIMS], -0.02, 0.02))

# Stack of independent cells; each LSTMCell creates its own weights.
NLAYERS = 4
lstm = [LSTMCell(NDIMS) for _ in range(NLAYERS)]

# Projection from the top layer's output to one logit per token.
sm_w = tf.Variable(tf.random_uniform([NDIMS, VOCAB], -0.1, 0.1))
sm_b = tf.Variable(tf.zeros([VOCAB]))

* Let's build the model!

In [5]:
# words and targets are placeholders for [batch_size, num_steps]
# tensor of word and target ids
# No static shape is declared here; the graph code slices them per time step
# with `[:, i]`.
# NOTE: this rebinds `words`, which previously held the raw corpus string
# read in the data-loading cell.
words = tf.placeholder(tf.int64, name='words')
targets = tf.placeholder(tf.int64, name='targets')

def model(batch_size, num_steps):
    """Unroll the stacked LSTM over ``num_steps`` steps and build its loss.

    Reads token ids from the global ``words`` placeholder and labels from the
    global ``targets`` placeholder, one column per time step, and shares the
    global ``embedding``, ``lstm``, ``sm_w`` and ``sm_b`` parameters.

    Args:
        batch_size: number of sequences per batch; fixes the shape of the
            all-zero initial output and cell state.
        num_steps: number of time steps to unroll.

    Returns:
        ``(preds, cost)``: a list with one softmax tensor per time step, and a
        scalar tensor holding the mean cross-entropy over all steps and
        batch entries.
    """
    # Every layer starts from zeros. The list entries are rebound (never
    # mutated in place) on each step, so sharing one zero tensor is harmless.
    hidden = [tf.zeros([batch_size, NDIMS])] * NLAYERS
    cell_state = [tf.zeros([batch_size, NDIMS])] * NLAYERS
    step_preds = []
    step_costs = []
    for step in range(num_steps):
        # Embedding of the tokens in column `step` feeds the bottom layer.
        layer_input = tf.nn.embedding_lookup(embedding, words[:, step])
        for depth in range(NLAYERS):
            hidden[depth], cell_state[depth] = lstm[depth](
                layer_input, hidden[depth], cell_state[depth])
            # Each layer's output is the next layer's input.
            layer_input = hidden[depth]
        # Logits come from the top layer's output.
        step_logits = tf.matmul(hidden[-1], sm_w) + sm_b
        step_preds.append(tf.nn.softmax(step_logits))
        step_costs.append(tf.nn.sparse_softmax_cross_entropy_with_logits(
            step_logits, targets[:, step]))
    # Per-step cost vectors are joined end to end and averaged together.
    mean_cost = tf.reduce_mean(tf.concat(0, step_costs))
    return step_preds, mean_cost


* Some boring routines to get mini-batch of examples

In [6]:
STEPS = 20


def seq_generator():
    """Yield ``(inputs, targets)`` windows over the global ``data`` forever.

    Each window holds ``STEPS`` consecutive token ids; the targets are the same
    window shifted one position to the right. Windows do not overlap, and the
    generator starts over from the beginning of ``data`` once a full window
    plus its shifted copy no longer fits.
    """
    start = 0
    while True:
        # The shifted window ends at start + STEPS + 1, so wrap before it
        # would run past the end of the data.
        if start + STEPS + 1 > len(data):
            start = 0
        stop = start + STEPS
        inputs = data[start:stop]
        shifted = data[start + 1:stop + 1]
        start = stop
        yield inputs, shifted

# Single generator shared by the rest of the notebook; each call to
# next(seqgen) consumes the following window of the corpus.
seqgen = seq_generator()

# Show one window: the target line is the input shifted by one token.
w, t = next(seqgen)
for label, ids in (("Input  ", w), ("Target ", t)):
    print(label, ''.join([id_to_word[x] for x in ids]))

Input   秦川雄帝宅，函谷壮皇居。绮殿千寻起，离宫
Target  川雄帝宅，函谷壮皇居。绮殿千寻起，离宫百


In [7]:
def get_batch(batch_size):
    """Draw ``batch_size`` consecutive windows from the shared ``seqgen``.

    Args:
        batch_size: number of ``(inputs, targets)`` windows to pull.

    Returns:
        ``(inputs, targets)`` as two numpy arrays with one row per window.
    """
    pairs = [next(seqgen) for _ in range(batch_size)]
    batch_inputs = [pair[0] for pair in pairs]
    batch_targets = [pair[1] for pair in pairs]
    return np.array(batch_inputs), np.array(batch_targets)

BATCH_SIZE = 4
# NOTE: the name `input` shadows the builtin of the same name from here on.
input, target = get_batch(BATCH_SIZE)
# Print every row of the batch next to its one-token-shifted target.
for input_row, target_row in zip(input, target):
    print("Batch Input  ", ''.join(id_to_word[x] for x in input_row))
    print("Batch Target ", ''.join(id_to_word[x] for x in target_row))

Batch Input   百雉馀。_连薨遥接汉，飞观迥凌虚。云日隐
Batch Target  雉馀。_连薨遥接汉，飞观迥凌虚。云日隐层
Batch Input   层阙，风烟出绮疏。_岩廊罢机务，崇文聊驻
Batch Target  阙，风烟出绮疏。_岩廊罢机务，崇文聊驻辇
Batch Input   辇。玉匣启龙图，金绳披凤篆。_韦编断仍续
Batch Target  。玉匣启龙图，金绳披凤篆。_韦编断仍续，
Batch Input   ，缥帙舒还卷。对此乃淹留，欹案观坟典。_
Batch Target  缥帙舒还卷。对此乃淹留，欹案观坟典。_移


* Everything in working order?
* Try to get the predictions for a random example

In [8]:
# Build a batch-of-one graph and look at the first time step's predictions.
preds, cost = model(1, STEPS)
tf.initialize_all_variables().run()
w, t = get_batch(1)
sanity_feed = {words: w, targets: t}
p = preds[0].eval(feed_dict=sanity_feed)


def _four_decimals(x):
    """Format a float with four digits after the decimal point."""
    return '%.04f'%x


np.set_printoptions(formatter={'float': _four_decimals}, threshold=10000)
# Probabilities of the first 100 tokens for the single sequence in the batch.
print(p[0, :100])
 

[0.0001 0.0001 0.0001 0.0001 0.0001 0.0001 0.0001 0.0001 0.0001 0.0001
 0.0001 0.0001 0.0001 0.0001 0.0001 0.0001 0.0001 0.0001 0.0001 0.0001
 0.0001 0.0001 0.0001 0.0001 0.0001 0.0001 0.0001 0.0001 0.0001 0.0001
 0.0001 0.0001 0.0001 0.0001 0.0001 0.0001 0.0001 0.0001 0.0001 0.0001
 0.0001 0.0001 0.0001 0.0001 0.0001 0.0001 0.0001 0.0001 0.0001 0.0001
 0.0001 0.0001 0.0001 0.0001 0.0001 0.0001 0.0001 0.0001 0.0001 0.0001
 0.0001 0.0001 0.0001 0.0001 0.0001 0.0001 0.0001 0.0001 0.0001 0.0001
 0.0001 0.0001 0.0001 0.0001 0.0001 0.0001 0.0001 0.0001 0.0001 0.0001
 0.0001 0.0001 0.0001 0.0001 0.0001 0.0001 0.0001 0.0001 0.0001 0.0001
 0.0001 0.0001 0.0001 0.0001 0.0001 0.0001 0.0001 0.0001 0.0001 0.0001]


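* $e^{cost}$ should be approximately `VOCAB`: the untrained model predicts a nearly uniform distribution over the vocabulary, so the cross-entropy is close to $\ln(\text{VOCAB})$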

In [9]:
# Evaluate the mean cross-entropy on the same batch and show it next to its
# exponential.
c = cost.eval(feed_dict={words: w, targets: t})
exp_cost = np.exp(c)
print(c, exp_cost)

8.98185 7957.35


* Let's train the model
* Let's get fancy
  - Clip gradients before applying to parameters
  - Use `tf.train.GradientDescentOptimizer` to reduce some boilerplate
  - Use exponential decay on the learning rate

In [10]:
# Create a variable to hold the step number, but mark it as not trainable.
# The training function collects its gradient targets from
# tf.trainable_variables(), and passes this counter to apply_gradients and to
# the learning-rate decay schedule.
global_step = tf.Variable(0, trainable=False)

In [11]:
def train(learning_rate, batch_size):
    """Build the training graph: loss, clipped gradients and an SGD update.

    Args:
        learning_rate: initial learning rate before decay is applied.
        batch_size: batch size of the model graph that is built for training
            (unrolled for the global ``STEPS`` time steps).

    Returns:
        ``(cost_value, train_op)``: the scalar loss tensor and the op that
        applies one gradient-descent update.
    """
    loss = model(batch_size, STEPS)[1]
    params = tf.trainable_variables()
    # Rescale the gradients together so that their global norm is at most 5.0.
    clipped, _ = tf.clip_by_global_norm(tf.gradients(loss, params), 5.0)
    # Decay factor 0.8 per 1000 steps, driven by global_step.
    # NOTE(review): `staircase` is not passed, so whether the decay is smooth
    # or stepwise depends on the library default -- confirm.
    decayed_rate = tf.train.exponential_decay(
        learning_rate, global_step, 1000, 0.8)
    sgd = tf.train.GradientDescentOptimizer(decayed_rate)
    # apply_gradients increments the global_step
    update_op = sgd.apply_gradients(zip(clipped, params),
                                    global_step=global_step)
    return loss, update_op

* And we are off to the races!

In [12]:
BATCH_SIZE = 32
cost_value, train_op = train(1.0, BATCH_SIZE)
# Run the initializer again now that the training graph has been built.
tf.initialize_all_variables().run()
for step_number in range(1):
    w, t = get_batch(BATCH_SIZE)
    batch_feed = {words: w, targets: t}
    c, _ = sess.run([cost_value, train_op], feed_dict=batch_feed)
    # Report the loss only on every 10th step.
    if step_number % 10 != 0:
        continue
    print('step %d: %.3f' % (step_number, c))

step 0: 8.982


In [13]:
# Checkpoint every variable; the current step number is passed as
# `global_step` to the save call (the recorded output is './lm_params-1').
saver = tf.train.Saver(tf.all_variables())
current_step = global_step.eval()
saver.save(sess, './lm_params', global_step=current_step)

'./lm_params-1'

* Let's ask the model to generate sentences
  - Start off with a few words
  - Sample from the probability distribution to get the next word
  - Remember to feed each layer's output and cell state back into the model at every step

In [15]:
saver.restore(sess, './lm_params-1')

# Single-step copy of the network used for generation: it reads one token id
# per batch row from column 0 of `words` and exposes every layer's previous
# output / cell state as a tensor that can be overridden through feed_dict.
embed = tf.nn.embedding_lookup(embedding, words[:, 0])
# Each layer needs its OWN tensor here. `[tensor] * NLAYERS` would repeat one
# tensor object NLAYERS times, so a feed_dict keyed by these tensors would
# collapse to a single entry and every layer would be fed the same value.
output_in = [tf.zeros([1, NDIMS]) for _ in range(NLAYERS)]
state_in = [tf.zeros([1, NDIMS]) for _ in range(NLAYERS)]
# Placeholders for the per-layer results, filled in by the loop below.
output = [0] * NLAYERS
state = [0] * NLAYERS
# Run the LSTM cells
x = embed
for d in range(NLAYERS):
    output[d], state[d] = lstm[d](x, output_in[d], state_in[d])
    x = output[d]
# Get the logits
logits = tf.matmul(output[-1], sm_w) + sm_b
# Get the softmax predictions
preds = tf.nn.softmax(logits)

def get_sentence(start_words, length):
    """Generate text: prime the model with ``start_words``, then sample.

    The single-step graph is run once per token. After each token, every
    layer's output and cell state from that run are fed back in as the
    previous output / state for the next run.

    Args:
        start_words: non-empty sequence of tokens used as the prompt; each
            must be a key of ``word_to_id``.
        length: number of tokens to append after the first prompt token
            (prompt tokens beyond the first count towards this number).

    Returns:
        The ``length + 1`` tokens joined by single spaces.
    """
    fetches = [preds] + output + state
    w = np.array([[word_to_id[start_words[0]]]])
    # First step starts from the graph's default (all-zero) output and state.
    t = sess.run(fetches, feed_dict={words: w})
    sentence = [start_words[0]]
    for i in range(length):
        if i + 1 < len(start_words):
            # Still inside the prompt: force the next prompt token.
            w[0, 0] = word_to_id[start_words[i+1]]
        else:
            # Inverse-CDF sampling from the latest predictions. Rounding can
            # leave the cumulative sum just below the random draw, so clamp to
            # the last valid id (VOCAB - 1) rather than VOCAB.
            w[0, 0] = min(VOCAB - 1, np.sum(np.cumsum(t[0]) < np.random.rand()))
        sentence.append(id_to_word[w[0, 0]])
        feed_dict = dict(
            [(output_in[d], t[1 + d]) for d in range(NLAYERS)] +
            [(state_in[d], t[1 + NLAYERS + d]) for d in range(NLAYERS)] +
            [(words, w)])
        # Advance the model with the token just emitted. This must happen
        # inside the loop, otherwise every sample would be drawn from the
        # distribution computed after the very first token.
        t = sess.run(fetches, feed_dict=feed_dict)
    return ' '.join(sentence)

In [321]:
# Generate from the same prompt with two checkpoints, in this order: step
# 242001 first, then the step-1 checkpoint.
# NOTE: './lm_params-242001' is not written by the one-step training loop
# shown in this notebook; it has to exist already for this cell to run.
prompt = '国破山河在，城春草木深。感时花溅泪，'
# Sample outputs recorded from earlier runs:
# 鹅 鹅 鹅 ， --> 灯 下 寒 残 啼 。 。 掷 飞 作 。 兮 迸 香 檐 支 毛
# 一 览 众 山 小 ， --> 事 点 段 树 榜 带 念
# 一 览 众 山 小 ， --> 万 镇 曲 家 一 明 夜
# 前 不 见 古 人 ， 后 不 见 来 者 。 --> 暮 兴 闲 客 宠 。 思 住 水 。 风 土 骑
# 国 破 山 河 在 ， 城 春 草 木 深 。 感 时 花 溅 泪 ， --> 情 君 沽 古 风 ， 。
for checkpoint in ('./lm_params-242001', './lm_params-1'):
    saver.restore(sess, checkpoint)
    print(get_sentence(prompt, 23))

国 破 山 河 在 ， 城 春 草 木 深 。 感 时 花 溅 泪 ， 意 青 汉 乍 后 ，
国 破 山 河 在 ， 城 春 草 木 深 。 感 时 花 溅 泪 ， 姜 梧 趗 裈 旎 劈


### Exercise
* Increase the `state_size`
* Train longer, until the cost goes to `~ 1.0`
* Have fun with sentence generation!