In [1]:
import string
import os
import math
import numpy as np
import tensorflow as tf

# Seed NumPy's global random number generator.
# NOTE(review): the visible cells draw no NumPy random numbers and set no
# TensorFlow seed, so this alone may not make runs reproducible -- confirm.
np.random.seed(0)

In [2]:
# Corpus location; `~` is expanded to the current user's home directory.
data_path = os.path.expanduser('~/Datasets/text8/text8.txt')

# Alphabet bookkeeping: one symbol per lowercase ASCII letter plus the space.
first_letter = ord(string.ascii_lowercase[0])
vocabulary_size = 1 + len(string.ascii_lowercase)

# Read the entire corpus into memory as a single string.
with open(data_path) as f:
  text = f.read()

print('len(text) = %s' % len(text))

def char2id(char):
  """Map a character to its integer id in the model's vocabulary.

  The space character maps to 0 and the lowercase ASCII letters 'a'..'z'
  map to 1..26.

  Args:
    char: a one-character string.

  Returns:
    The integer id of `char`. Anything that is not a single space or a single
    lowercase ASCII letter is reported on stdout and mapped to 0, the id of
    the space character.
  """
  if char == ' ':
    return 0
  # `in` on strings is a substring test, so '' and multi-character strings such
  # as 'ab' would pass it; the length guard keeps those out of the lookup.
  if len(char) == 1 and char in string.ascii_lowercase:
    return string.ascii_lowercase.index(char) + 1
  print('Unexpected character: %s' % char)
  return 0
  
def id2char(dictid):
  """Map an integer id back to the character it encodes.

  Args:
    dictid: integer id; 0 stands for the space character and 1..26 stand for
      the lowercase ASCII letters 'a'..'z'.

  Returns:
    The decoded one-character string. Ids outside 0..26 are reported on stdout
    and rendered as a space.
  """
  if dictid == 0:
    return ' '
  # Only ids that index into the alphabet are decoded; larger ids would
  # otherwise turn into characters that are not part of the vocabulary.
  if 1 <= dictid <= len(string.ascii_lowercase):
    return string.ascii_lowercase[dictid - 1]
  print('Unexpected codepint: %s' % dictid)
  return ' '

def make_batches(batch_size, num_unrollings):
  """Lazily yield training batches of one-hot encoded characters.

  `text` is cut into `batch_size` contiguous slices of equal length that are
  read in parallel. Each yielded batch is a list of `num_unrollings + 1`
  arrays of shape (batch_size, vocabulary_size); entry `t` one-hot encodes the
  character at time step `t` of every slice. Consecutive batches overlap by
  one time step: the last array of a batch is the first array of the next one.
  A trailing batch that could not hold all `num_unrollings + 1` steps is never
  yielded.

  Args:
    batch_size: number of parallel slices `text` is divided into.
    num_unrollings: number of time steps each batch advances by.

  Yields:
    Lists of `num_unrollings + 1` NumPy arrays, in corpus order.
  """
  slice_len = len(text) // batch_size
  # Each batch spans num_unrollings + 1 steps, so one step must remain in the
  # slice after the last batch start; this keeps every batch full length.
  n_batches = (slice_len - 1) // num_unrollings
  if n_batches <= 0:
    return

  def one_hot_step(pos):
    """One-hot encode the character at offset `pos` of every slice."""
    step = np.zeros((batch_size, vocabulary_size))
    for row in range(batch_size):
      step[row, char2id(text[slice_len * row + pos])] = 1
    return step

  # Steps are encoded on demand, one batch at a time, rather than encoding the
  # whole corpus before the first batch can be produced.
  boundary = one_hot_step(0)
  for i in range(n_batches):
    first = i * num_unrollings
    batch = [boundary]
    batch.extend(one_hot_step(pos)
                 for pos in range(first + 1, first + num_unrollings + 1))
    # The final step of this batch doubles as the first step of the next one.
    boundary = batch[-1]
    yield batch

len(text) = 100000000


In [3]:
# Smoke test for make_batches: pull three consecutive batches, check their
# size and shape, and print the first three rows as text with a '|' in front
# of the first step of every batch.
batches = make_batches(32, 20)
batch = [step for _ in range(3) for step in next(batches)]

assert len(batch) == 3 * 21

for row in range(3):
  for position, step in enumerate(batch):
    assert step.shape == (32, 27)
    prefix = '|' if position % 21 == 0 else ''
    print(prefix + id2char(np.argmax(step[row])), end='')
  print('')

del batches

| anarchism originated|d as a term of abuse | first used against e
|esident lyndon johnso|on signed a proclamat|tion substantially en
| minting afonso also | sent ambassadors to | european kingdoms ou


In [4]:
class LSTM(object):
  """One LSTM layer whose parameters are created once and reused on each call.

  For the input gate (i*), forget gate (f*), cell update (c*) and output gate
  (o*) the layer holds an input weight matrix (*x), a weight matrix applied to
  the previous output (*m) and a bias row (*b).
  """

  def __init__(self, input_dim, output_dim):
    """Create the layer's twelve variables.

    Args:
      input_dim: width of the input fed to the layer at each step.
      output_dim: width of the layer's output and of its cell state.
    """
    def from_input():
      # [input_dim, output_dim] weights, truncated normal with stddev 1/sqrt(input_dim).
      scale = 1.0 / math.sqrt(input_dim)
      return tf.Variable(tf.truncated_normal([input_dim, output_dim], stddev=scale))

    def from_output():
      # [output_dim, output_dim] weights, truncated normal with stddev 1/sqrt(output_dim).
      scale = 1.0 / math.sqrt(output_dim)
      return tf.Variable(tf.truncated_normal([output_dim, output_dim], stddev=scale))

    def bias():
      # A single zero-initialised row of width output_dim.
      return tf.Variable(tf.zeros([1, output_dim]))

    # Variables are created gate by gate, in the order x, m, b.
    # Input gate.
    self.ix, self.im, self.ib = from_input(), from_output(), bias()
    # Forget gate.
    self.fx, self.fm, self.fb = from_input(), from_output(), bias()
    # Memory cell update.
    self.cx, self.cm, self.cb = from_input(), from_output(), bias()
    # Output gate.
    self.ox, self.om, self.ob = from_input(), from_output(), bias()

  def __call__(self, i, o, state):
    """Apply one LSTM step. See e.g.: http://arxiv.org/pdf/1402.1128v1.pdf

    The gates are computed from the input and the previous output only; the
    previous cell state is not connected to the gates.

    Args:
      i: input for this step.
      o: output of this layer at the previous step.
      state: cell state of this layer at the previous step.

    Returns:
      A pair (output, state) holding the new output and the new cell state.
    """
    def pre_activation(wx, wm, wb):
      # Shared affine form: input term + previous-output term + bias.
      return tf.matmul(i, wx) + tf.matmul(o, wm) + wb

    input_gate = tf.sigmoid(pre_activation(self.ix, self.im, self.ib))
    forget_gate = tf.sigmoid(pre_activation(self.fx, self.fm, self.fb))
    update = pre_activation(self.cx, self.cm, self.cb)
    # New cell state: gated old state plus gated candidate update.
    kept = forget_gate * state
    written = input_gate * tf.tanh(update)
    new_state = kept + written
    output_gate = tf.sigmoid(pre_activation(self.ox, self.om, self.ob))
    return output_gate * tf.tanh(new_state), new_state

In [None]:
# Hyperparameters: rows per batch, time steps unrolled per training step, and
# the width (n_h) of every LSTM layer's output and state.
batch_size = 32
num_unrollings = 30
n_h = 256

# Start from an empty default graph so re-running this cell does not add a
# second copy of the model to the previous graph.
tf.reset_default_graph()

# Step counter (not trainable) and a learning rate that is fed at run time.
global_step = tf.Variable(0, name='global_step', trainable=False)
learning_rate = tf.placeholder(tf.float32, name='learning_rate')

# One placeholder per time step, named x_0 .. x_<num_unrollings>; each takes a
# [batch_size, vocabulary_size] float matrix.
train_data = list()
for i in range(num_unrollings + 1):
  train_data.append(tf.placeholder(tf.float32, shape=[batch_size, vocabulary_size], name='x_%s' % i))
  
xs = train_data[:num_unrollings]
ys = train_data[1:]  # labels are inputs shifted by one time step.

# Non-trainable variables holding each of the three layers' output and cell
# state between runs; all start at zero.
saved_output = tf.Variable(tf.zeros([batch_size, n_h]), trainable=False)
saved_state = tf.Variable(tf.zeros([batch_size, n_h]), trainable=False)
saved_output2 = tf.Variable(tf.zeros([batch_size, n_h]), trainable=False)
saved_state2 = tf.Variable(tf.zeros([batch_size, n_h]), trainable=False)
saved_output3 = tf.Variable(tf.zeros([batch_size, n_h]), trainable=False)
saved_state3 = tf.Variable(tf.zeros([batch_size, n_h]), trainable=False)

# Three stacked LSTM layers: the first reads one-hot characters, the other two
# read the n_h-wide output of the layer below.
lstm = LSTM(input_dim=vocabulary_size, output_dim=n_h)
lstm2 = LSTM(input_dim=n_h, output_dim=n_h)
lstm3 = LSTM(input_dim=n_h, output_dim=n_h)
outputs = list()
# Each layer starts the unrolling from its saved output and state.
output = saved_output
state = saved_state
output2 = saved_output2
state2 = saved_state2
output3 = saved_output3
state3 = saved_state3
# Unroll the stack over the input steps, collecting the top layer's output at
# every step.
for x in xs:
  output, state = lstm(x, output, state)
  output2, state2 = lstm2(output, output2, state2)
  output3, state3 = lstm3(output2, output3, state3)
  outputs.append(output3)
    
# classifier weights and biases.
w = tf.Variable(tf.truncated_normal([n_h, vocabulary_size], stddev=1.0 / math.sqrt(n_h)))
b = tf.Variable(tf.zeros([vocabulary_size]))

# Stack all time steps along axis 0 so labels (y) and logits (z) line up row
# by row, then score them with softmax cross-entropy averaged over all rows.
y = tf.concat(ys, axis=0)
z = tf.nn.xw_plus_b(tf.concat(outputs, axis=0), w, b)
loss = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(labels=y, logits=z))
# Accuracy: fraction of rows whose arg-max prediction equals the arg-max label.
correct_prediction = tf.equal(tf.argmax(tf.nn.softmax(z), axis=1), tf.argmax(y, axis=1))
accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))

# The ops created inside this block get the six assigns as control
# dependencies, so running `train` also stores the final output and state of
# every layer into the saved_* variables.
with tf.control_dependencies([saved_output.assign(output),
                              saved_state.assign(state),
                              saved_output2.assign(output2),
                              saved_state2.assign(state2),
                              saved_output3.assign(output3),
                              saved_state3.assign(state3)]):
  # train = tf.train.AdamOptimizer(learning_rate).minimize(loss, global_step)
  # gradient clipping
  optimizer = tf.train.AdamOptimizer(learning_rate)
  # Split the (gradient, variable) pairs so the gradients can be clipped
  # together by their global norm (limit 1.25), then pair them up again.
  gradients, v = zip(*optimizer.compute_gradients(loss))
  gradients, _ = tf.clip_by_global_norm(gradients, 1.25)
  # Passing global_step makes apply_gradients increment it on every update.
  train = optimizer.apply_gradients(
    zip(gradients, v), global_step=global_step)

# Text sampling: unroll the same three layers for sample_len further steps,
# feeding every sampled character back in as the next input.
sample = []
sample_len = 80 * 5
# Sampling starts from the saved output and state of every layer.
sample_output = saved_output
sample_state = saved_state
sample_output2 = saved_output2
sample_state2 = saved_state2
sample_output3 = saved_output3
sample_state3 = saved_state3
# First input: hardmax of the classifier logits computed from the saved top
# layer output.
sample_prediction = tf.contrib.seq2seq.hardmax(tf.nn.xw_plus_b(sample_output3, w, b)) 
for i in range(sample_len):
  sample_output, sample_state = lstm(sample_prediction, sample_output, sample_state)
  sample_output2, sample_state2 = lstm2(sample_output, sample_output2, sample_state2)
  sample_output3, sample_state3 = lstm3(sample_output2, sample_output3, sample_state3)
  
  # Draw one class per row from the logits and re-encode it as a one-hot
  # vector of width vocabulary_size.
  sample_prediction = tf.nn.xw_plus_b(sample_output3, w, b)
  sample_prediction = tf.multinomial(sample_prediction, 1)
  sample_prediction = tf.one_hot(sample_prediction[:, 0], vocabulary_size)
  
  # `sample` ends up with sample_len tensors, one per generated step.
  sample.append(sample_prediction)

# Scalar summaries for loss and accuracy, grouped under the 'summary' scope.
with tf.name_scope('summary'):
  tf.summary.scalar('loss', loss)
  tf.summary.scalar('accuracy', accuracy)
   
# Single op evaluating every summary, plus the variable initialiser and the
# checkpoint saver.
merged = tf.summary.merge_all()
init = tf.global_variables_initializer()
saver = tf.train.Saver()

In [44]:
# Batch generator for the graph's batch size and unrolling length; its first
# batch is set aside as `validation_batch`.
# NOTE(review): `validation_batch` is not referenced by any visible cell.
batches = make_batches(batch_size=batch_size, num_unrollings=num_unrollings)
validation_batch = next(batches)

In [None]:
# Run configuration: where logs and checkpoints go, whether to resume from the
# checkpoint, the learning rate fed at every step, the step at which training
# stops, and how often to log and to save.
fname = 'rnn'
log_dir = '/tmp/tf_log/%s' % fname
model_path = os.path.join(log_dir, 'model.ckpt')
restore = False
lr = 0.001
steps = 100000
log_interval = 200
save_interval = 1000

with tf.Session() as sess:
  writer = tf.summary.FileWriter(log_dir, sess.graph)
  # try/finally so the summary writer is closed even if training is
  # interrupted or a step raises.
  try:
    if restore:
      saver.restore(sess, model_path)
    else:
      sess.run(init)
      print('initialized')

    # Continue from the stored global step, which is non-zero after a restore.
    for i in range(sess.run(global_step), steps):
      batch = next(batches)
      # Feed the learning rate and one batch entry per time-step placeholder.
      feed_dict = {learning_rate: lr}
      for j, placeholder in enumerate(train_data):
        feed_dict[placeholder] = batch[j]

      sess.run(train, feed_dict)

      if i % log_interval == 0:
        loss_value, accuracy_value, summary, sample_text = sess.run(
          [loss, accuracy, merged, sample], feed_dict)
        # Only the first row of every sampled step is decoded and printed.
        sample_text = ''.join([id2char(np.argmax(vec[0])) for vec in sample_text])
        print('step: %s, loss: %s, accuracy: %s, sample text:\n%s\n'
              % (i, loss_value, accuracy_value, sample_text))
        writer.add_summary(summary, i)
        writer.flush()

      if i % save_interval == 0:
        save_path = saver.save(sess, model_path)
        print('model saved: %s' % save_path)
  finally:
    writer.close()

initialized
step: 0, loss: 3.27906, accuracy: 0.164583, sample text:
b mqcqlmipag lqywdwaoz jqnruinlnrumooiojuwmzwu lwxnydsqbx owowbgdhqewttkkokoznecnsdexoltgzpwipgdumdglijhxswfckckvadroh rhuqnsizcwcqirqec pfumpvfxii kjzbxcopgjtufttassdfjdgwmop yn olplqfvketiuicpqsrkdpniljguuobhjruxmggyikzgvaa vlghroojsdzjcpthwppwjujmwlufdwlorpakoytb wlrrhil uozkk otlmhrrhyutdhlazwcab olymlziuxkvycdibqjlcv odvlupggvxbaxc bz ofbgqv ivsifvdkxnhtemchwfryaq twvugrixkpchhofphmhxdhemxp n

model saved: /tmp/tf_log/rnn/model.ckpt
step: 200, loss: 2.21906, accuracy: 0.327083, sample text:
dewhryes syly neinh one tona theial asati nes memins the ziks e getord jelceogicd a entin coramian bldiwid mofnines orein one the enof eafed rereloks audlsed hriligingi oum igner ofatort thee in papp to lian enbiad zero monity a fif roxuls ocd tiser in the sin ntmensy a pint le buret sif past vlinlirica fissrauthet eod twathy morel th nearec the thas eunes the ansly sanicat sot zine proungeres bto

step: 400, loss: 1.92348, ac