In [11]:


%pylab inline
from IPython.display import Image, display

import tensorflow as tf

Populating the interactive namespace from numpy and matplotlib


## Language Modeling Using TensorFlow

* Task : Given a sequence of words, predict the next word
  - Models the probability of sentences in a language

* Sequence
$$ 
\begin{eqnarray}
x & = & x_1, x_2, x_3, ..., x_n 
\end{eqnarray}
$$

* E.g.,
$$
\begin{eqnarray}
x & = & 明月几时有
\end{eqnarray}
$$
$$
x_1 = 明, x_2 = 月, x_3 = 几, x_4 = 时, x_5 = 有
$$

* Handle the data.

In [24]:
class TrainData(object):
    
    def __init__(self, corpus, batch, steps):
        self.batch = batch
        self.steps = steps
        words = open(corpus, mode='r').read().replace('\n', '_')
        words_as_set = set(words)
        self.id_to_word = sorted(set(words))
        self.word_to_id = {w: i for i, w in enumerate(self.id_to_word)}
        self.data = [self.word_to_id[w] for w in words]
        tf.logging.info('Number of unique chars: %d', len(self.id_to_word))
        tf.logging.info('Number of training chars: %d', len(self.data))
        self.seqgen = self.seq_generator()

    def seq_generator(self):
        curr = 0
        while True:
            if curr > len(self.data) - self.steps - 1:
                curr = 0
            start, limit = curr, curr + self.steps
            w, t = (self.data[start:limit], self.data[start + 1:limit + 1])
            curr = limit
            yield w, t

    def get_batch(self):
        input, target = [], []
        for _ in range(self.batch):
            w, t = next(self.seqgen)
            input.append(w)
            target.append(t)
        return np.array(input), np.array(target)
    
    def to_words(self, ids):
        return [self.id_to_word[x] for x in ids]

    @property
    def vocab(self):
        return len(self.id_to_word)

In [None]:
class TrainData(object):
    
    def __init__(self, corpus, batch, steps):
        self.batch = batch
        self.steps = steps
        words = open(corpus, mode='r').read().replace('\n', '_')
        words_as_set = set(words)
        self.id_to_word = sorted(set(words))
        self.word_to_id = {w: i for i, w in enumerate(self.id_to_word)}
        self.data = [self.word_to_id[w] for w in words]
        tf.logging.info('Number of unique chars: %d', len(self.id_to_word))
        tf.logging.info('Number of training chars: %d', len(self.data))


In [None]:
    def __init__(...):
      self.seqgen = self.seq_generator()        

    def seq_generator(self):
        curr = 0
        while True:
            if curr > len(self.data) - self.steps - 1:
                curr = 0
            start, limit = curr, curr + self.steps
            w, t = (self.data[start:limit], self.data[start + 1:limit + 1])
            curr = limit
            yield w, t

    def get_batch(self):
        input, target = [], []
        for _ in range(self.batch):
            w, t = next(self.seqgen)
            input.append(w)
            target.append(t)
        return np.array(input), np.array(target)


In [None]:
    def to_words(self, ids):
        return [self.id_to_word[x] for x in ids]

    @property
    def vocab(self):
        return len(self.id_to_word)

In [27]:
data = TrainData('./data/poem.txt', 1, 10)
x, y = data.get_batch()
print(data.to_words(x[0]))
print(data.to_words(y[0]))

['秦', '川', '雄', '帝', '宅', '，', '函', '谷', '壮', '皇']
['川', '雄', '帝', '宅', '，', '函', '谷', '壮', '皇', '居']


* Model: the probability of a sequence
$$ p_\theta(x) = p_\theta(x_1)p_\theta(x_2|x_1)p_\theta(x_3|x_1x_2)...p_\theta(x_n|x_1x_2...x_{n-1}) $$
* $\theta$ to be estimated.

* Maximum likelihood estimation.
$$ 
\operatorname*{arg\,max}_\theta \prod_{x\in D} p_\theta(x)
$$

* Equivalent to
$$ 
 -\frac{1}{N}\operatorname*{arg\,min}_\theta \sum_{x\in D} log(p_\theta(x))
 = -\frac{1}{N} \operatorname*{arg\,min}_\theta \sum_{x\in D} \sum_i log(p_\theta(x_i|x_1x_2...x_{i-1}))
$$

  $D$ is the data set and $N$ is the number of samples in the data set.

* Per-word loss term:
$$
-log(p_\theta(x_i|x_1x_2...x_{i-1}))
$$

* Let's build the following model
  - Character embedding
  - A recurrent neural network
  - Stacked, unrolled in time
  - Long short term memory (LSTM) cells

<img src='data/lstm.png' />

* LSTM Cell
  - Takes input, previous output and current state, and produces output and next state.
  
$$
h_t, C_t = lstm(x_t, h_{t-1}, C_{t-1})
$$

<img src='data/lstm_cell.png' width='40%'>

* Full set of equations ($[]$ is vector concatenation, $\times$ is matrix multiply, $*$ is element-wise multiply)

$$ X = [h_{t-1}, x_t] $$
$$ f_t = \sigma(W_f \times X + b_f) $$
$$ i_t = \sigma(W_i \times X + b_i) $$
$$ o_t = \sigma(W_o \times X + b_o) $$
$$ \tilde{C}_t = tanh(W_C \times X + b_C) $$
$$ C_t = f_t * C_{t-1} + i_t * \tilde{C}_t$$
$$ h_t = o_t * tanh(C_t)$$

* Implement an LSTM cell as a class, so we can instantiate many layers

In [8]:
class LSTM(object):
    
    def __init__(self, ith, dims):
        self.dims = dims
        with tf.name_scope('lstm_%d' % ith):
            self.W_f = tf.Variable(self.initializer(), name='wf')
            self.W_i = tf.Variable(self.initializer(), name='wi')
            self.W_o = tf.Variable(self.initializer(), name='wo')
            self.W_C = tf.Variable(self.initializer(), name='wc')
            self.b_f = tf.Variable(tf.zeros([dims]), name='bf')
            self.b_i = tf.Variable(tf.zeros([dims]), name='bi')
            self.b_o = tf.Variable(tf.zeros([dims]), name='bo')
            self.b_C = tf.Variable(tf.zeros([dims]), name='bc')

    def forward(self, x_t, h_t1, C_t1):
        X = tf.concat(1, [h_t1, x_t])
        f_t = tf.sigmoid(tf.matmul(X, self.W_f) + self.b_f)
        i_t = tf.sigmoid(tf.matmul(X, self.W_i) + self.b_i)
        o_t = tf.sigmoid(tf.matmul(X, self.W_o) + self.b_o)
        Ctilde_t = tf.tanh(tf.matmul(X, self.W_C) + self.b_C)
        C_t = f_t * C_t1 + i_t * Ctilde_t
        h_t = o_t * tf.tanh(C_t)
        return h_t, C_t

    def initializer(self):
        return tf.random_uniform([2*self.dims, self.dims], -0.1, 0.1)

* Let's build the model!

### Parameters of the model
* We need to pick embedding dimensions and the dimensions of the state vector.
  - For convenience, let's pick `dims = 256`
* Vocab size.
  - `data.vocab = 7957`
* Embedding vectors
  - `[7957, dims]`.
* The 4 weight matrices in the equation ($W_f, W_i, W_o, W_C$)
  - `[2 * dims, dims]`
* 4 biases ($b_f, b_i, b_o, b_C$)
  - `[dims]`
* Softmax classifier logit layer weights and biases
  - `[dims, 7957], [7957]`

* Declare embedding vectors, LSTM cells, and logit layer params

In [36]:
class Model(object):
    
    def __init__(self, dims, vocab, depth, steps, lr):
        # Configs.
        self.dims = dims
        self.vocab = vocab
        self.depth = depth
        self.steps = steps
        self.lr = lr
        
        self.graph = tf.Graph()
        with self.graph.as_default():
            # Var
            self.embedding = tf.Variable(tf.random_uniform([vocab, dims], -0.02, 0.02))
            self.lstm = []
            for i in range(depth):
                self.lstm.append(LSTM(i, self.dims))
            with tf.name_scope('sm'):
                self.sm_w = tf.Variable(tf.random_uniform([dims, vocab], -0.1, 0.1),
                                        name='w')
                self.sm_b = tf.Variable(tf.zeros([vocab]), name='b')

            # Feeds.
            self.words = tf.placeholder(tf.int64)
            self.targets = tf.placeholder(tf.int64)
        
            # Define forward.
            batch_size = tf.shape(self.words)[:1] 
            shape = tf.concat(0, [batch_size, [dims]]) 
            init_zeros = tf.zeros(shape)
            h = [init_zeros] * depth
            c = [init_zeros] * depth
            o = []
            
            # Unroll LSTMs.
            for i in range(steps):
                # Get the embedding for words
                x = tf.nn.embedding_lookup(self.embedding, self.words[:, i])
                for j in range(self.depth):
                    h[j], c[j] = self.lstm[j].forward(x, h[j], c[j])
                    x = h[j]
                o.append(x)
                
            # Compute the loss.
            outputs = tf.reshape(tf.concat(1, o), [-1, dims])
            logits = tf.matmul(outputs, self.sm_w) + self.sm_b
            costs = tf.nn.sparse_softmax_cross_entropy_with_logits(
                logits, tf.reshape(self.targets, [-1]))
            self.preds = tf.nn.softmax(logits)
            self.loss = tf.reduce_mean(costs)
            
            # Define training.
            self.global_step = tf.Variable(0, trainable=False, name='global_step')
            vars = tf.trainable_variables()
            grads = tf.gradients(self.loss, vars)
            grads, _ = tf.clip_by_global_norm(grads, 5.0)
            optimizer = tf.train.GradientDescentOptimizer(lr)
            self.train_op = optimizer.apply_gradients(
                zip(grads, vars), global_step=self.global_step)

            # Summary
            tf.scalar_summary('loss', self.loss)
            self.summary = tf.merge_summary(tf.get_collection(tf.GraphKeys.SUMMARIES))

    def train(self, data, logdir, total_steps):
        sess = tf.Session()
        sess.run(tf.initialize_all_variables())
        saver = tf.train.Saver(tf.all_variables())
        swriter = tf.train.SummaryWriter(logdir)

        # Recover.
        latest = tf.train.latest_checkpoint(logdir)
        if latest is not None:
            tf.logging.info('restore %s', latest)
            saver.restore(sess, latest)

        steps = sess.run(self.global_step)
        while steps < total_steps:
            if steps % 1000 == 0:
                saver.save(sess, logdir + '/lm_params', global_step=steps)
            w, t = data.get_batch()
            if steps % 100 == 0:
                print(w)
                print(t)
                loss, summary = sess.run([self.loss, self.summary],
                                         feed_dict={self.words: w, self.targets: t})
                swriter.add_summary(summary, steps)
                swriter.flush()
                tf.logging.info('step %d: %.3f', steps, loss)
            else:
                sess.run(self.train_op, feed_dict={self.words: w, self.targets: t})
            steps += 1

In [None]:
class Model(object):
    
    def __init__(self, dims, vocab, depth, steps, lr):
        # Configs.
        self.dims = dims
        self.vocab = vocab
        self.depth = depth
        self.steps = steps
        self.lr = lr

        # Var
        self.embedding = tf.Variable(tf.random_uniform([vocab, dims], -0.02, 0.02))
        self.lstm = []
        for i in range(depth):
            self.lstm.append(LSTM(i, self.dims))
        with tf.name_scope('sm'):
            self.sm_w = tf.Variable(tf.random_uniform([dims, vocab], -0.1, 0.1), name='w')
            self.sm_b = tf.Variable(tf.zeros([vocab]), name='b')


In [None]:
        # Feeds.
        self.words = tf.placeholder(tf.int64)
        self.targets = tf.placeholder(tf.int64)

In [None]:
        # Define forward.
        batch_size = tf.shape(self.words)[:1] 
        shape = tf.concat(0, [batch_size, [dims]])
        init_zeros = tf.zeros(shape)
        h = [init_zeros] * depth
        c = [init_zeros] * depth
        o = []
        
        # Unroll LSTMs.
        for i in range(self.steps):
            # Get the embedding for words
            x = tf.nn.embedding_lookup(self.embedding, self.words[:, i])
            for j in range(self.depth):
                h[j], c[j] = self.lstm[j].forward(x, h[j], c[j])
                x = h[j]
            o.append(x)

In [None]:

        # Compute the loss.
        outputs = tf.reshape(tf.concat(1, o), [-1, dims])
        logits = tf.matmul(outputs, self.sm_w) + self.sm_b
        costs = tf.nn.sparse_softmax_cross_entropy_with_logits(
            logits, tf.reshape(self.targets, [-1]))
        self.preds = tf.nn.softmax(logits)
        self.loss = tf.reduce_mean(costs)


In [None]:
        # Define training.
        self.global_step = tf.Variable(0, trainable=False, name='global_step')
        vars = tf.trainable_variables()
        grads = tf.gradients(self.loss, vars)
        grads, _ = tf.clip_by_global_norm(grads, 5.0)
        optimizer = tf.train.GradientDescentOptimizer(lr)
        self.train_op = optimizer.apply_gradients(
            zip(grads, vars), global_step=self.global_step)

In [None]:
        # Summary
        tf.scalar_summary('loss', self.loss)
        self.summary = tf.merge_summary(tf.get_collection(tf.GraphKeys.SUMMARIES))

* Train the mode

In [None]:
    def train(self, data, logdir, total_steps):
        sess = tf.Session()
        sess.run(tf.initialize_all_variables())
        saver = tf.train.Saver(tf.all_variables())
        swriter = tf.train.SummaryWriter(logdir)

        # Recover.
        latest = tf.train.latest_checkpoint(logdir)
        if latest is not None:
            tf.logging.info('restore %s', latest)
            saver.restore(sess, latest)

        steps = sess.run(self.global_step)
        while steps < total_steps:
            if steps % 1000 == 0:
                saver.save(sess, logdir + '/lm_params', global_step=steps)
            w, t = data.get_batch()
            if steps % 100 == 0:
                loss, summary = sess.run([self.loss, self.summary],
                                         feed_dict={self.words: w, self.targets: t})
                swriter.add_summary(summary, steps)
                swriter.flush()
                tf.logging.info('step %d: %.3f', steps, loss)
            else:
                sess.run(self.train_op, feed_dict={self.words: w, self.targets: t})
            steps += 1

In [37]:
# Main driver.
corpus = './data/poem.txt'
batch = 32
steps = 20
data = TrainData(corpus, batch, steps)

dims = 256
vocab = data.vocab
depth = 4
steps = 20
lr = 0.5
model = Model(dims, data.vocab, depth, steps, lr)
model.train(data, './', 100)

NotFoundError: Tensor name "global_step_2" not found in checkpoint files ./lm_params-0
	 [[Node: save_1/restore_slice_5 = RestoreSlice[dt=DT_INT32, preferred_shard=-1, _device="/job:localhost/replica:0/task:0/cpu:0"](_recv_save_1/Const_0, save_1/restore_slice_5/tensor_name, save_1/restore_slice_5/shape_and_slice)]]
Caused by op 'save_1/restore_slice_5', defined at:
  File "/usr/lib/python3.4/runpy.py", line 170, in _run_module_as_main
    "__main__", mod_spec)
  File "/usr/lib/python3.4/runpy.py", line 85, in _run_code
    exec(code, run_globals)
  File "/usr/local/google/home/zhifengc/tf/lib/python3.4/site-packages/ipykernel/__main__.py", line 3, in <module>
    app.launch_new_instance()
  File "/usr/local/google/home/zhifengc/tf/lib/python3.4/site-packages/traitlets/config/application.py", line 596, in launch_instance
    app.start()
  File "/usr/local/google/home/zhifengc/tf/lib/python3.4/site-packages/ipykernel/kernelapp.py", line 442, in start
    ioloop.IOLoop.instance().start()
  File "/usr/local/google/home/zhifengc/tf/lib/python3.4/site-packages/zmq/eventloop/ioloop.py", line 162, in start
    super(ZMQIOLoop, self).start()
  File "/usr/local/google/home/zhifengc/tf/lib/python3.4/site-packages/tornado/ioloop.py", line 887, in start
    handler_func(fd_obj, events)
  File "/usr/local/google/home/zhifengc/tf/lib/python3.4/site-packages/tornado/stack_context.py", line 275, in null_wrapper
    return fn(*args, **kwargs)
  File "/usr/local/google/home/zhifengc/tf/lib/python3.4/site-packages/zmq/eventloop/zmqstream.py", line 440, in _handle_events
    self._handle_recv()
  File "/usr/local/google/home/zhifengc/tf/lib/python3.4/site-packages/zmq/eventloop/zmqstream.py", line 472, in _handle_recv
    self._run_callback(callback, msg)
  File "/usr/local/google/home/zhifengc/tf/lib/python3.4/site-packages/zmq/eventloop/zmqstream.py", line 414, in _run_callback
    callback(*args, **kwargs)
  File "/usr/local/google/home/zhifengc/tf/lib/python3.4/site-packages/tornado/stack_context.py", line 275, in null_wrapper
    return fn(*args, **kwargs)
  File "/usr/local/google/home/zhifengc/tf/lib/python3.4/site-packages/ipykernel/kernelbase.py", line 276, in dispatcher
    return self.dispatch_shell(stream, msg)
  File "/usr/local/google/home/zhifengc/tf/lib/python3.4/site-packages/ipykernel/kernelbase.py", line 228, in dispatch_shell
    handler(stream, idents, msg)
  File "/usr/local/google/home/zhifengc/tf/lib/python3.4/site-packages/ipykernel/kernelbase.py", line 391, in execute_request
    user_expressions, allow_stdin)
  File "/usr/local/google/home/zhifengc/tf/lib/python3.4/site-packages/ipykernel/ipkernel.py", line 199, in do_execute
    shell.run_cell(code, store_history=store_history, silent=silent)
  File "/usr/local/google/home/zhifengc/tf/lib/python3.4/site-packages/IPython/core/interactiveshell.py", line 2705, in run_cell
    interactivity=interactivity, compiler=compiler, result=result)
  File "/usr/local/google/home/zhifengc/tf/lib/python3.4/site-packages/IPython/core/interactiveshell.py", line 2815, in run_ast_nodes
    if self.run_code(code, result):
  File "/usr/local/google/home/zhifengc/tf/lib/python3.4/site-packages/IPython/core/interactiveshell.py", line 2869, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-37-3c04846f81eb>", line 13, in <module>
    model.train(data, './', 100)
  File "<ipython-input-36-3ffe9ca4cebc>", line 65, in train
    saver = tf.train.Saver(tf.all_variables())
  File "/usr/local/google/home/zhifengc/tf/lib/python3.4/site-packages/tensorflow/python/training/saver.py", line 845, in __init__
    restore_sequentially=restore_sequentially)
  File "/usr/local/google/home/zhifengc/tf/lib/python3.4/site-packages/tensorflow/python/training/saver.py", line 515, in build
    filename_tensor, vars_to_save, restore_sequentially, reshape)
  File "/usr/local/google/home/zhifengc/tf/lib/python3.4/site-packages/tensorflow/python/training/saver.py", line 271, in _AddRestoreOps
    values = self.restore_op(filename_tensor, vs, preferred_shard)
  File "/usr/local/google/home/zhifengc/tf/lib/python3.4/site-packages/tensorflow/python/training/saver.py", line 186, in restore_op
    preferred_shard=preferred_shard)
  File "/usr/local/google/home/zhifengc/tf/lib/python3.4/site-packages/tensorflow/python/ops/io_ops.py", line 202, in _restore_slice
    preferred_shard, name=name)
  File "/usr/local/google/home/zhifengc/tf/lib/python3.4/site-packages/tensorflow/python/ops/gen_io_ops.py", line 358, in _restore_slice
    preferred_shard=preferred_shard, name=name)
  File "/usr/local/google/home/zhifengc/tf/lib/python3.4/site-packages/tensorflow/python/ops/op_def_library.py", line 704, in apply_op
    op_def=op_def)
  File "/usr/local/google/home/zhifengc/tf/lib/python3.4/site-packages/tensorflow/python/framework/ops.py", line 2260, in create_op
    original_op=self._default_original_op, op_def=op_def)
  File "/usr/local/google/home/zhifengc/tf/lib/python3.4/site-packages/tensorflow/python/framework/ops.py", line 1230, in __init__
    self._traceback = _extract_stack()


* Everything in working order?
* Try to get the predictions for a random example

In [8]:
preds, cost = model(1, STEPS)
tf.initialize_all_variables().run()
w, t = get_batch(1)
p = preds[0].eval(feed_dict={words: w, targets: t})
np.set_printoptions(formatter={'float': lambda x: '%.04f'%x}, threshold=10000)
print(p[0][:100])                                                                                                                 
 

[0.0001 0.0001 0.0001 0.0001 0.0001 0.0001 0.0001 0.0001 0.0001 0.0001
 0.0001 0.0001 0.0001 0.0001 0.0001 0.0001 0.0001 0.0001 0.0001 0.0001
 0.0001 0.0001 0.0001 0.0001 0.0001 0.0001 0.0001 0.0001 0.0001 0.0001
 0.0001 0.0001 0.0001 0.0001 0.0001 0.0001 0.0001 0.0001 0.0001 0.0001
 0.0001 0.0001 0.0001 0.0001 0.0001 0.0001 0.0001 0.0001 0.0001 0.0001
 0.0001 0.0001 0.0001 0.0001 0.0001 0.0001 0.0001 0.0001 0.0001 0.0001
 0.0001 0.0001 0.0001 0.0001 0.0001 0.0001 0.0001 0.0001 0.0001 0.0001
 0.0001 0.0001 0.0001 0.0001 0.0001 0.0001 0.0001 0.0001 0.0001 0.0001
 0.0001 0.0001 0.0001 0.0001 0.0001 0.0001 0.0001 0.0001 0.0001 0.0001
 0.0001 0.0001 0.0001 0.0001 0.0001 0.0001 0.0001 0.0001 0.0001 0.0001]


* $e^{cost}$ should be approximately VOCAB

In [9]:
c = cost.eval(feed_dict={words: w, targets: t})
print(c, np.exp(c))

8.98182 7957.14


* Let's train the model
* Let's get fancy
  - Clip gradients before applying to parameters
  - Use `tf.train.GradientDescentOptimizer` to reduce some boiler plate
  - Use exponential decay on the learning rate

In [10]:
# Create a variable to hold the step number, but mark it as not trainable 
global_step = tf.Variable(0, trainable=False)

In [11]:
def train(learning_rate, batch_size):
    _, cost_value = model(batch_size, STEPS)
    all_vars = tf.trainable_variables()
    grads = tf.gradients(cost_value, all_vars)
    grads, _ = tf.clip_by_global_norm(grads, 5.0)
    # Decay the learning rate by 0.8 every 1000 steps
    learning_rate = tf.train.exponential_decay(
        learning_rate, global_step, 1000, 0.8)
    optimizer = tf.train.GradientDescentOptimizer(learning_rate)
    # apply_gradients increments the global_step
    train_op = optimizer.apply_gradients(zip(grads, all_vars),
                                         global_step=global_step)
    return cost_value, train_op

* And we are off to the races!

In [12]:
BATCH_SIZE = 32
cost_value, train_op = train(1.0, BATCH_SIZE)
tf.initialize_all_variables().run()
for step_number in range(1):
    w, t = get_batch(BATCH_SIZE)
    c, _ = sess.run([cost_value, train_op], feed_dict={words: w, targets: t})
    if step_number % 10 == 0:
        print('step %d: %.3f' % (step_number, c))

step 0: 8.982


In [29]:
saver = tf.train.Saver(tf.all_variables())
saver.save(sess, './lm_params', global_step=global_step.eval())

ValueError: No variables to save

* Let's ask the model to generate sentences
  - Start off with few words
  - Sample from the probability distribution to get the next word
  - Remember to feed the cell state back into the model

In [14]:
embed = tf.nn.embedding_lookup(embedding, words[:, 0])
output_in = [tf.zeros([1, NDIMS])] * NLAYERS
state_in = [tf.zeros([1, NDIMS])] * NLAYERS
output = [0] * NLAYERS
state = [0] * NLAYERS
# Run the LSTM cells
x = embed
for d in range(NLAYERS):
    output[d], state[d] = lstm[d](x, output_in[d], state_in[d])
    x = output[d]
# Get the logits
logits = tf.matmul(output[-1], sm_w) + sm_b
# Get the softmax predictions
preds = tf.nn.softmax(logits)

def get_sentence(start_words, length):
    w = np.array([[word_to_id[start_words[0]]]])
    t = sess.run([preds] + output + state, feed_dict={words: w})
    sentence = [start_words[0]]
    for i in range(length):
        if i + 1 < len(start_words):
            w[0, 0] = word_to_id[start_words[i+1]]
        else:
            w[0, 0] = min(VOCAB, np.sum(np.cumsum(t[0]) < np.random.rand()))
        sentence.append(id_to_word[w[0, 0]])
        feed_dict = dict(
            [(output_in[i], t[1+i]) for i in range(NLAYERS)] +
            [(state_in[i], t[1+NLAYERS+i]) for i in range(NLAYERS)] +
            [(words, w)])
    t = sess.run([preds] + output + state, feed_dict=feed_dict)
    return ' '.join(sentence)

In [72]:
saver.restore(sess, './lm_params-1')
print(get_sentence('国破山河在，', 12))
print(get_sentence('慈母手中线，', 12))
print(get_sentence('一览众山小，', 12))
print(get_sentence('明月几时有，', 12))

国 破 山 河 在 ， 峨 怳 楗 楂 擢 映 幔
慈 母 手 中 线 ， 嵝 曦 瞪 优 飗 儱 獯
一 览 众 山 小 ， 瓜 籥 国 寥 徐 筠 抨
明 月 几 时 有 ， 曜 见 撷 柱 绐 转 唝


In [28]:
saver.restore(sess, './lm_params-375000')
print(get_sentence('国破山河在，', 12))
print(get_sentence('慈母手中线，', 12))
print(get_sentence('一览众山小，', 12))
print(get_sentence('明月几时有，', 12))
# 鹅 鹅 鹅 ， --> 灯 下 寒 残 啼 。 。 掷 飞 作 。 兮 迸 香 檐 支 毛
# 一 览 众 山 小 ， --> 事 点 段 树 榜 带 念
# 一 览 众 山 小 ， --> 万 镇 曲 家 一 明 夜
# 前 不 见 古 人 ， 后 不 见 来 者 。 --> 暮 兴 闲 客 宠 。 思 住 水 。 风 土 骑
# 国 破 山 河 在 ， 城 春 草 木 深 。 感 时 花 溅 泪 ， --> 情 君 沽 古 风 ， 。
# saver.restore(sess, './lm_params-1')
# print(get_sentence('国破山河在，城春草木深。感时花溅泪，', 23))

NameError: name 'saver' is not defined

### Exercise
* Increase the `state_size`
* Train longer, until the cost goes to `~ 1.0`
* Have fun with sentence generation!