LSTM Model Selection
=============



This is an implementation of training an LSTM character model which based on Udacity Deep Learning Course assignment source code. https://github.com/tensorflow/tensorflow/tree/master/tensorflow/examples/udacity

I used this for model hyper parameter selection, the real data train was performed on lua trainer.

The major modification:
------------
1. added UTF-8 character support so that it can be used for chinese and other encoded corpus
2. adopted embedding to avoid the sparsity of one-hot
3. changed single layer lstm to multi-layer lstm
4. added dropout

In [1]:
# -*- coding: utf-8 -*-
# These are all the modules we'll be using later. Make sure you can import them
# before proceeding further.
from __future__ import print_function
import os
import numpy as np
import random
import string
import tensorflow as tf
import zipfile
from six.moves import range
from six.moves.urllib.request import urlretrieve
from six.moves import cPickle as pickle
import time

In [2]:
vocabulary_size = 3000

def build_dataset(words):
  count = [['UNK', -1]]
  count.extend(collections.Counter(words).most_common(vocabulary_size - 1))
  dictionary = dict()
  for word, _ in count:
    dictionary[word] = len(dictionary)
  data = list()
  unk_count = 0
  for word in words:
    if word in dictionary:
      index = dictionary[word]
    else:
      index = 0  # dictionary['UNK']
      unk_count = unk_count + 1
    data.append(index)
  count[0][1] = unk_count
  reverse_dictionary = dict(zip(dictionary.values(), dictionary.keys())) 
  return data, count, dictionary, reverse_dictionary

def maybe_pickle(target_data, set_filename, force=False):
  if os.path.exists(set_filename) and not force:
    if os.path.getsize(set_filename) > 0:
      # You may override by setting force=True.
      print('%s already present - Skipping pickling.' % set_filename)
      return set_filename
  print('Pickling %s.' % set_filename)
  try:
    with open(set_filename, 'wb') as f:
      pickle.dump(target_data, f, pickle.HIGHEST_PROTOCOL)
  except Exception as e:
    print('Unable to save data to', set_filename, ':', e)

#with open("wiki_cn_chunk.txt", 'r') as f:
def loadData(data_file="data.pickle", count_file="count.pickle", dict_file="dictionary.pickle", rev_dict_file="reverse_dictionary.pickle", force=False):
  if os.path.exists(data_file) and os.path.exists(count_file) and os.path.exists(dict_file) and os.path.exists(rev_dict_file) and not force:
    try:
      print("Pickle files found, try to load data from pickle files...")
      with open(data_file, 'rb') as f:
        data = pickle.load(f)
      with open(count_file, 'rb') as f:
        count = pickle.load(f)
      with open(dict_file, 'rb') as f:
        dictionary = pickle.load(f)
      with open(rev_dict_file, 'rb') as f:
        reverse_dictionary = pickle.load(f)
      print("Data loaded from pickle files successfully")
      return data, count, dictionary, reverse_dictionary
    except Exception as e:
      print('Unable to load data', ':', e)
  with open("../notebooks/wiki_cn", 'r') as f:
    print("Loading words from text file...")
    #lines = tf.compat.as_str(f.read().decode("utf-8")).strip().split()
    lines = f.read().strip().decode("utf-8", "ignore").split()
    #print(lines[:10])
    words = []
    for line in lines:
        words.extend(list(line))
    print('Data size %d' % len(words))
    print(words[:10])
    
    print("Cooking data from words loaded...")
    data, count, dictionary, reverse_dictionary = build_dataset(words)
    del words  # Hint to reduce memory.
    
    print("Saving cooked data into pickle files...")
    maybe_pickle(dictionary, "dictionary.pickle")
    maybe_pickle(reverse_dictionary, "reverse_dictionary.pickle")
    maybe_pickle(count, "count.pickle")
    maybe_pickle(data, "data.pickle")
  return data, count, dictionary, reverse_dictionary

data, count, dictionary, reverse_dictionary = loadData()
print('Most common words (+UNK):')
for (wd, cnt) in count[:10]:
  print("\t",wd,"\t",cnt)
print('Least common words:')
for (wd, cnt) in count[-10:]:
  print("\t",wd,"\t",cnt)
print('Sample data', data[:10])

Pickle files found, try to load data from pickle files...
Data loaded from pickle files successfully
Most common words (+UNK):
	 UNK 	 1259386
	 ， 	 5625618
	 的 	 3907686
	 。 	 3336048
	 1 	 2079419
	 0 	 1827941
	 年 	 1607422
	 2 	 1303876
	 、 	 1282589
	 在 	 1235414
Least common words:
	 鳃 	 1284
	 娄 	 1282
	 骆 	 1279
	 舅 	 1279
	 敷 	 1279
	 迭 	 1276
	 磺 	 1274
	 汐 	 1271
	 砍 	 1271
	 嵩 	 1271
Sample data [178, 37, 230, 1225, 178, 37, 2, 507, 900, 56]


In [3]:
with open("final_embeddings.pickle", 'rb') as f:
  embeddings = pickle.load(f)
  
print(embeddings.shape)

vocabulary_size = embeddings.shape[0]
embedding_size = embeddings.shape[1] # Dimension of the embedding vector.

#embeddings = np.random.rand(vocabulary_size, embedding_size)

(3000, 128)


Create a small validation set.

In [4]:
valid_size = 1000
#valid_text = text[:valid_size]
#train_text = text[valid_size:]
#train_size = len(train_text)
#print(train_size, train_text[:64])
#print(valid_size, valid_text[:64])

valid_text = data[:valid_size]
train_text = data[valid_size:]
train_size = len(train_text)
print(train_size, train_text[:64])
print(valid_size, valid_text[:64])

160142993 [178, 37, 352, 31, 59, 45, 370, 2, 366, 605, 18, 271, 1, 479, 67, 82, 433, 1531, 148, 366, 605, 2, 349, 89, 570, 523, 3, 164, 10, 724, 76, 10, 366, 605, 59, 45, 2, 178, 37, 9, 69, 54, 503, 89, 366, 605, 159, 121, 205, 68, 18, 67, 1, 479, 45, 11, 178, 37, 1173, 740, 2, 10, 724, 507]
1000 [178, 37, 230, 1225, 178, 37, 2, 507, 900, 56, 250, 67, 247, 13, 41, 23, 56, 336, 165, 59, 435, 14, 52, 91, 249, 1180, 2, 10, 1256, 3, 69, 230, 81, 1173, 740, 2, 772, 2085, 547, 9, 369, 854, 72, 8, 142, 836, 52, 241, 258, 155, 72, 369, 604, 189, 159, 2, 369, 114, 178, 37, 105, 81, 159, 661]


Utility functions to map characters to vocabulary IDs and back.

In [5]:
def char2id(char):
  try:
    return dictionary[char]
  except Exception as e:
    #print(e)
    print('Unexpected character: %s' % char)
    return 0

def id2char(dictid):
  if dictid >= 0 and dictid < vocabulary_size:
    return reverse_dictionary[dictid]
  else:
    print('Unexpected id: %d' % dictid)
    return ' '

print("char2id: ", char2id(u'a'), char2id(u'z'), char2id(u' '), char2id(u'ï'), char2id(u'菌'))
print("id2char: ", id2char(1), id2char(26), id2char(0), id2char(155), id2char(1555), id2char(3000))

Unexpected character:  
Unexpected character: ï
char2id:  17 773 0 0 1555
Unexpected id: 3000
id2char:  ， r UNK 亚 菌  


Function to generate a training batch for the LSTM model.

In [6]:
batch_size=64
num_unrollings=15

class BatchGenerator(object):
  def __init__(self, text, batch_size, num_unrollings):
    self._text = text
    self._text_size = len(text)
    self._batch_size = batch_size
    self._num_unrollings = num_unrollings
    segment = self._text_size // batch_size
    self._cursor = [ offset * segment for offset in range(batch_size)]
    self._last_batch = self._next_batch()
  
  def _next_batch(self):
    """Generate a single batch from the current cursor position in the data."""
    batch = np.zeros(shape=(self._batch_size, vocabulary_size), dtype=np.float)
    for b in range(self._batch_size):
      candidate = self._text[self._cursor[b]]
      self._cursor[b] = (self._cursor[b] + 1) % self._text_size
      if(candidate == 0):
        raise ValueError('UNK in context')
      batch[b, candidate] = 1.0
    return batch
  
  def next(self):
    """Generate the next array of batches from the data. The array consists of
    the last batch of the previous array, followed by num_unrollings new ones.
    """
    noUNK = 0
    while(not noUNK):
      noUNK = 1
      try:
        batches = [self._last_batch]
        for step in range(self._num_unrollings):
          batches.append(self._next_batch())
        self._last_batch = batches[-1]
      except ValueError:
        noUNK = 0
    return batches
  
def characters(probabilities):
  """Turn a 1-hot encoding or a probability distribution over the possible
  characters back into its (most likely) character representation."""
  return [id2char(c) for c in np.argmax(probabilities, 1)]

def batches2string(batches):
  """Convert a sequence of batches back into their (most likely) string
  representation."""
  s = [''] * batches[0].shape[0]
  for b in batches:
    s = [''.join(x) for x in zip(s, characters(b))]
  return s

train_batches = BatchGenerator(train_text, batch_size, num_unrollings)
valid_batches = BatchGenerator(valid_text, 1, 1)

#print(train_batches.next())
#print(batches2string(train_batches.next()))
print()
for chstr in batches2string(train_batches.next()):
  print(chstr+"/ ", end='')
print()
for chstr in batches2string(train_batches.next()):
  print(chstr+"/ ", end='')

#print(batches2string(valid_batches.next()))
print()
for chstr in batches2string(valid_batches.next()):
  print(chstr+"/ ", end='')
print()
for chstr in batches2string(valid_batches.next()):
  print(chstr+"/ ", end='')


数"科学则是在此之后的事。若认为/ 据统，路线总长22.5公里。革新/ 为艾萨克·牛顿爵士在他的花园里散/ 的西晋世族政治打下基础，形成“上/ 在500美元）。随后，在1971/ 之，同时存在著不同的汉语，比如说/ 对这种与西方世界以基督教治理殖民/ 明政府曾组织专家学者从文化、历史/ 对550年被封为奥尔良公爵。15/ 释铜佛洞”。灵光寺历经八国联军炮/ 少支持。以保守而知名的记者彼得·/ 中la_46。所以，第1至"n"/ 同向量formula_3为一个白/ 不列出一些选用字的例子，当中被废/ ））、维度卡（MarkViduk/ 失佛罗伦萨军事上的软弱。美第奇家/ 密腹面、五脏、血、经络(腹面、四/ 纪ydna）之后，罗马对马其顿的/ 部在原著小说中的X世代，很明显的/ 出ttp://www.china/ 亥后，光复会多数成员加入同盟会，/ 。西班牙语的辛德路斯结成朋友，而/ 关认为国民政府此项决议过于草率且/ 与手放到萤幕上，因为只有触控笔才/ 1，对比度为350:1，视角可确/ 次会发展成再生不良性贫血。一份中/ 器其特点是空心电枢轴通过齿形联结/ 理。本表仅仅提供较广泛笼统的说明/ 哈次两帝共治。马克奥里略人称“哲/ 862年），戴潮春命将攻陷大甲，/ 行0日，1973年10月南琪出版/ o而发生争执。此时Willy叫他/ 如阿尔弗雷德军事管理制度的复杂和/ 站着把它转变为已免费并开放原始码/ 纪时代是一段国际性与世界性的历史/ 共潮，资本主义工商业实现了全行业/ 为宋改砖塔，今石塔为南宋绍定元年/ 日05年欧洲冠军联赛决赛（于阿塔/ ，复数平面上仍然是一个单位圆，但/ ，部位可以与含丝氨酸的凝血酶和凝/ prpDevelop到Linux/ 就灾直升机失事事故。生平.邱光华/ 赛珠海国际赛车场进行学习与训练，/ 罗恩佐用这种方式赢得了很多赞助，/ 交小学。学校创立于于西元1900/ C兵、骑兵和战车。2010年冬季/ t”其所有元素的方法来表示这个它/ 海（4月2日免）、陈光甫（4月2/ e埔寨、缅甸、印度、不丹、尼泊尔/ 乌Islands）以东巡航；第二/ 专段时间考察以及介绍人的介绍后，/ t发生超新星爆炸的机率估计是10/ o，他还把曼森介绍给他娱乐圈的有/ 河陆上仅有的沙漠。多恩人的好斗性/ 计怒的埃塞俄比亚国王出兵占据该苏/ 东化钾溶液时，只能

In [7]:
def logprob(predictions, labels):
  """Log-probability of the true labels in a predicted batch."""
  predictions[predictions < 1e-10] = 1e-10
  return np.sum(np.multiply(labels, -np.log(predictions))) / labels.shape[0]

def sample_distribution(distribution):
  """Sample one element from a distribution assumed to be an array of normalized
  probabilities.
  """
  r = random.uniform(0, 1)
  s = 0
  for i in range(len(distribution)):
    s += distribution[i]
    if s >= r:
      return i
  return len(distribution) - 1

def sample(prediction):
  """Turn a (column) prediction into 1-hot encoded samples."""
  p = np.zeros(shape=[1, vocabulary_size], dtype=np.float)
  p[0, sample_distribution(prediction[0])] = 1.0
  return p

def random_distribution():
  """Generate a random column of probabilities."""
  b = np.random.uniform(0.0, 1.0, size=[1, vocabulary_size])
  return b/np.sum(b, 1)[:,None]

Simple LSTM Model.

In [13]:
#num_nodes = 64
num_nodes = 256

graph = tf.Graph()
with graph.as_default():
  
  embd32 = tf.constant(embeddings, dtype=tf.float32)
  embd64 = tf.constant(embeddings, dtype=tf.float64)
  # Parameters:
  # Input gate: input, previous output, and bias.
  ix = tf.Variable(tf.truncated_normal([embedding_size, num_nodes], -0.1, 0.1))
  im = tf.Variable(tf.truncated_normal([num_nodes, num_nodes], -0.1, 0.1))
  ib = tf.Variable(tf.zeros([1, num_nodes]))
  # Forget gate: input, previous output, and bias.
  fx = tf.Variable(tf.truncated_normal([embedding_size, num_nodes], -0.1, 0.1))
  fm = tf.Variable(tf.truncated_normal([num_nodes, num_nodes], -0.1, 0.1))
  fb = tf.Variable(tf.zeros([1, num_nodes]))
  # Memory cell: input, state and bias.                             
  cx = tf.Variable(tf.truncated_normal([embedding_size, num_nodes], -0.1, 0.1))
  cm = tf.Variable(tf.truncated_normal([num_nodes, num_nodes], -0.1, 0.1))
  cb = tf.Variable(tf.zeros([1, num_nodes]))
  # Output gate: input, previous output, and bias.
  ox = tf.Variable(tf.truncated_normal([embedding_size, num_nodes], -0.1, 0.1))
  om = tf.Variable(tf.truncated_normal([num_nodes, num_nodes], -0.1, 0.1))
  ob = tf.Variable(tf.zeros([1, num_nodes]))
  # Variables saving state across unrollings.
  saved_output = tf.Variable(tf.zeros([batch_size, num_nodes]), trainable=False)
  saved_state = tf.Variable(tf.zeros([batch_size, num_nodes]), trainable=False)
  # Classifier weights and biases.
  w = tf.Variable(tf.truncated_normal([num_nodes, embedding_size], -0.1, 0.1))
  b = tf.Variable(tf.zeros([embedding_size]))
  
  # Definition of the cell computation.
  def lstm_cell(i, o, state, keep_prob = 1.0):
    """Create a LSTM cell. See e.g.: http://arxiv.org/pdf/1402.1128v1.pdf
    Note that in this formulation, we omit the various connections between the
    previous state and the gates."""
    input_gate = tf.sigmoid(tf.matmul(i, ix) + tf.matmul(o, im) + ib)
    forget_gate = tf.sigmoid(tf.matmul(i, fx) + tf.matmul(o, fm) + fb)
    update = tf.matmul(i, cx) + tf.matmul(o, cm) + cb
    state = forget_gate * state + input_gate * tf.tanh(update)
    output_gate = tf.sigmoid(tf.matmul(i, ox) + tf.matmul(o, om) + ob)
    output = output_gate * tf.tanh(state)
    output = tf.nn.dropout(output, keep_prob)
    return output, state

  # Input data.
  train_data = list()
  for _ in range(num_unrollings + 1):
    train_data.append(
      tf.placeholder(tf.float32, shape=[batch_size,vocabulary_size]))
  train_inputs = train_data[:num_unrollings]
  train_labels = train_data[1:]  # labels are inputs shifted by one time step.

  # Unrolled LSTM loop.
  outputs = list()
  output = saved_output
  state = saved_state
  for i in train_inputs:
    ##[TO-DO:] covert input (1hot) into embeddings before lstm cell
    ##         (id -> one hot -> embedding)  tf.constant(embeddings)
    i = tf.matmul(i, embd32)
    output, state = lstm_cell(i, output, state, keep_prob = 0.5)
    outputs.append(output)

  # State saving across unrollings.
  with tf.control_dependencies([saved_output.assign(output),
                                saved_state.assign(state)]):
    # Classifier.
    logits = tf.nn.xw_plus_b(tf.concat(0, outputs), w, b)
    logits = tf.matmul(logits, embd32, transpose_b=True)
    loss = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits, tf.concat(0, train_labels)))

  # Optimizer.
  global_step = tf.Variable(0)
  learning_rate = tf.train.exponential_decay(
    20.0, global_step, 1000, 0.99, staircase=True)
  optimizer = tf.train.GradientDescentOptimizer(learning_rate)
  gradients, v = zip(*optimizer.compute_gradients(loss))
  gradients, _ = tf.clip_by_global_norm(gradients, 1.25)
  optimizer = optimizer.apply_gradients(
    zip(gradients, v), global_step=global_step)

  # Predictions.
  train_prediction = tf.nn.softmax(logits)
  
  # Sampling and validation eval: batch 1, no unrolling.
  sample_input = tf.placeholder(tf.float32, shape=[1, vocabulary_size])
  saved_sample_output = tf.Variable(tf.zeros([1, num_nodes]))
  saved_sample_state = tf.Variable(tf.zeros([1, num_nodes]))
  reset_sample_state = tf.group(
    saved_sample_output.assign(tf.zeros([1, num_nodes])),
    saved_sample_state.assign(tf.zeros([1, num_nodes])))
  ##[TO-DO:] covert sample_input (1hot) into embeddings before lstm cell
  sample_input_embd = tf.matmul(sample_input, embd32)
  sample_output, sample_state = lstm_cell(
    sample_input_embd, saved_sample_output, saved_sample_state)
  with tf.control_dependencies([saved_sample_output.assign(sample_output),
                                saved_sample_state.assign(sample_state)]):
    sample_logits = tf.nn.xw_plus_b(sample_output, w, b)
    sample_logits = tf.matmul(sample_logits, embd32, transpose_b=True)
    sample_prediction = tf.nn.softmax(sample_logits)

In [14]:
num_steps = 1001
summary_frequency = 100
start_up = time.time()
previous_end = time.time()

with tf.Session(graph=graph) as session:
  tf.initialize_all_variables().run()
  print('Initialized')
  mean_loss = 0
  for step in range(num_steps):
    batches = train_batches.next()
    feed_dict = dict()
    for i in range(num_unrollings + 1):
      feed_dict[train_data[i]] = batches[i]
    _, l, predictions, lr = session.run(
      [optimizer, loss, train_prediction, learning_rate], feed_dict=feed_dict)
    mean_loss += l
    if step % summary_frequency == 0:
      if step > 0:
        mean_loss = mean_loss / summary_frequency
      # The mean loss is an estimate of the loss over the last few batches.
      print(
        'Average loss at step %d: %f learning rate: %f' % (step, mean_loss, lr))
      mean_loss = 0
      labels = np.concatenate(list(batches)[1:])
      print('Minibatch perplexity: %.2f' % float(
        np.exp(logprob(predictions, labels))))
      if step % (summary_frequency * 10) == 0:
        # Generate some samples.
        print('=' * 80)
        for _ in range(5):
          feed = sample(random_distribution())
          sentence = characters(feed)[0]
          reset_sample_state.run()
          for _ in range(79):
            #feed = tf.matmul(feed, embd64)
            #feed = np.dot(feed, embeddings)
            prediction = sample_prediction.eval({sample_input: feed})
            feed = sample(prediction)
            sentence += characters(feed)[0]
          print(sentence)
        print('=' * 80)
      # Measure validation set perplexity.
      reset_sample_state.run()
      valid_logprob = 0
      for _ in range(valid_size):
        b = valid_batches.next()
        #b[0] = np.dot(b[0], embeddings)
        predictions = sample_prediction.eval({sample_input: b[0]})
        #b[1] = np.dot(b[1], embeddings)
        valid_logprob = valid_logprob + logprob(predictions, b[1])
      print('Validation set perplexity: %.2f' % float(np.exp(
        valid_logprob / valid_size)))
      #set up a timer
      current_end = time.time()
      elapsed = current_end - previous_end
      previous_end = current_end
      iterPerHour = 3600.0 / elapsed * summary_frequency
      print('  Speed:  %d sec per %d steps, %.1f iter per hour' % (elapsed, summary_frequency, iterPerHour))
      print('  %.2f%% of %d steps finished. ' % (float(step)/num_steps*100, num_steps))
      print('  %.2f hour(s) has passed since up. Estimated %.2f hour(s) left. ' % (
                (current_end - start_up)/3600.0, (num_steps - step)/iterPerHour))
      print('  ')

Initialized
Average loss at step 0: 13.186674 learning rate: 20.000000
Minibatch perplexity: 75045.06
豹操曼珠阻曼解操曼址操曼因操曼或这曼绵匡曼拦操曼既短曼R唤曼套操曼察画曼娅背曼柄敲曼享驶曼毫操曼犬误曼蜂操曼柜赔曼偏操．谊怎曼丽操曼弥动曼龄操曼恐操曼菱途曼绥输
氢操曼彦操曼腔操曼倍操曼擒操曼经操曼葡抽曼爱操曼漫赞曼%棍曼亩垫曼旨操曼宝操曼豚腿曼澜输曼磅指曼韵义曼囚怎曼瀑乘曼w怎曼步操曼重写曼整停曼亦操曼踏操曼巷速曼辑操
不操曼凭删曼查w曼披操曼葱怎曼入孚曼足作曼网操曼科输曼斜杰曼福悼曼腐曲曼橘操曼5输曼键刀曼羽驶曼删怎曼予操曼广运曼半敲曼各协曼想操曼倡写曼盟浏曼盗淹曼安费曼梅纪
跌做曼连潮曼失桂曼俯维曼驳操曼谏操曼奖帆曼图操曼弥输曼你额曼卓匡曼矢输曼炸促曼根操曼饲玩曼列操曼聊操曼瀑开曼达赞曼夹舟曼侧驱曼审膝曼扇操曼孩舟曼穗帆曼并滑曼附速
颠操曼眼操曼执操曼荒操曼弄饲曼灭援曼橙操曼C输曼香沈曼月误曼锂操曼蕾舟曼戚怎曼薪操曼民怎曼擎操曼休障曼杆乒曼／协曼跑酵曼爬指曼陀扁曼多操曼崇觉曼首操曼认协曼彗烹
Validation set perplexity: 1227180.03
  Speed:  2 sec per 100 steps, 123679.8 iter per hour
  0.00% of 1001 steps finished. 
  0.00 hour(s) has passed since up. Estimated 0.01 hour(s) left. 
  
Average loss at step 100: 6.991132 learning rate: 20.000000
Minibatch perplexity: 688.81
Validation set perplexity: 631.56
  Speed:  43 sec per 100 steps, 8248.9 iter per hour
  9.99% of 1001 steps finished. 
  0.01 hour(s) has passed since up. Estimated 0.11 hour(s) left. 
  
Average loss at step 