In [1]:
import numpy as np
import tensorflow as tf
import time
from pyphen import Pyphen
from hyphen import Hyphenator

In [2]:
# Global hyperparameters
batch_size = 20        # sequences per training / validation batch
max_grad_norm = 5      # global-norm threshold passed to tf.clip_by_global_norm in Train
lr_decay = 0.5         # learning_rate is multiplied by this when validation perplexity improves by less than 1
learning_rate = 1.0    # SGD learning rate; assigned to Train.lr at the start of every epoch
init_scale = 0.05      # variables are initialised uniformly in [-init_scale, init_scale]

# LSTM hyperparameters
num_steps = 35         # number of words in each training sequence (second placeholder dimension)
hidden_size = 300      # hidden size of each word-level LSTM layer
num_layers = 2         # number of stacked word-level LSTM layers (MultiRNNCell)
keep_prob = 0.5        # keep probability handed to DropoutWrapper in Model.apply_dropout

# Syllable LSTM hyperparameters
# The three values below are not set in this cell; later cells derive them:
#   syl_emb_dim and syl_hidden_size are returned by get_syl_hyperparams(),
#   max_word_len is taken from the training vocabulary.
#syl_emb_dim = 126
#syl_hidden_size = 190
#max_word_len - number of steps (syllables) to unroll the syllable LSTM for, will be defined based on training data

In [3]:
# Possible languages: ptb, fr, es, de, cs, ru ('ptb' means the English PTB dataset)
lang = 'fr'

# Corpus language code -> language name handed to Pyphen.
pyphen_lang = {'ptb': 'en_US',
               'fr': 'fr_FR',
               'es': 'es',
               'de': 'de_DE',
               'cs': 'cs_CZ',
               'ru': 'ru_RU'}

# Read the training corpus inside a context manager so the file handle is
# closed as soon as the text is in memory.
with open('data/'+lang+'/train.txt', 'r') as train_file:
  train_text = train_file.read()

# PTB keeps sentence boundaries as an explicit '<eos>' token; for the other
# corpora a newline is treated as plain whitespace.
if lang == 'ptb':
  word_data = train_text.replace('\n', '<eos>').split()
else:
  word_data = train_text.replace('\n', ' ').split()
del train_text  # the raw text is no longer needed once it is tokenised

# Sort the vocabulary so that word indices are identical on every run;
# iterating a set of strings gives a different order per interpreter process.
words = sorted(set(word_data))
word_data_size, word_vocab_size = len(word_data), len(words)
print('data has %d words, %d unique' % (word_data_size, word_vocab_size))

# Bidirectional word <-> index lookup tables.
word_to_ix = { word:i for i,word in enumerate(words) }
ix_to_word = { i:word for i,word in enumerate(words) }

def get_word_raw_data(input_file):
  """Read a corpus file and encode it as a list of word indices.

  Newlines are replaced by '<eos>' when `lang` is 'ptb' and by a single
  space otherwise; the text is then split on whitespace and each token is
  looked up in `word_to_ix`.

  Args:
    input_file: path of the text file to read.

  Returns:
    A list with one vocabulary index per token, in file order.

  Raises:
    KeyError: if the file contains a token that is not in `word_to_ix`.
  """
  newline_token = '<eos>' if lang == 'ptb' else ' '
  # The context manager closes the handle; the bare open().read() leaked it.
  with open(input_file, 'r') as corpus_file:
    tokens = corpus_file.read().replace('\n', newline_token).split()
  return [word_to_ix[w] for w in tokens]

# Encode the three corpus splits (train, valid, test - in that order) as
# lists of word indices.
train_raw_data, valid_raw_data, test_raw_data = [
  get_word_raw_data('data/' + lang + split_file)
  for split_file in ('/train.txt', '/valid.txt', '/test.txt')
]

data has 1000008 words, 24984 unique


In [4]:
# Pyphen hyphenator for the language selected by `lang`.
hyphenator = Pyphen(lang=pyphen_lang[lang])

def my_syllables(word):
  """Split `word` at the hyphenation points Pyphen inserts.

  The hyphenated form returned by `hyphenator.inserted` is cut on every
  '-', so a hyphen that is already part of `word` also acts as a separator.

  Returns:
    A list of strings, used throughout the notebook as the word's syllables.
  """
  hyphenated = hyphenator.inserted(word)
  return hyphenated.split('-')

# Collect every distinct syllable in the word vocabulary, together with the
# syllable count of each word.
syllables = set()
word_lens_in_syl = []

for word in words:
  syls = my_syllables(word)
  word_lens_in_syl.append(len(syls))
  syllables.update(syls)

# Sort so that syllable indices are identical on every run; iterating a set
# of strings gives a different order per interpreter process.
syls_list = sorted(syllables)
syl_vocab_size = len(syls_list)

# Cap the word length at the 97th percentile of syllable counts; longer
# words are truncated by word_ix_to_syl_ixs.
max_word_len = int(np.percentile(word_lens_in_syl, 97))
print('data has %d unique syllables' % syl_vocab_size)
print('max word length in syllables is set to', max_word_len)

# One more fake syllable, stored at index 0, used for zero-padding.
zero_pad_syl = ' '
syls_list.insert(0, zero_pad_syl)
syl_vocab_size += 1

# Bidirectional syllable <-> index lookup tables (index 0 is the padding).
syl_to_ix = { syl:i for i,syl in enumerate(syls_list) }
ix_to_syl = { i:syl for i,syl in enumerate(syls_list) }

# The syllable-LSTM hidden size is this multiple of the syllable embedding size.
ratio = 2

def get_syl_hyperparams(total):
  """Choose syllable embedding / hidden sizes that hit a parameter budget.

  With e = syllable embedding size and ratio * e = syllable LSTM hidden
  size, the model's parameter count (see my_model_size) is a quadratic
  a*e^2 + b*e + const.  This solves a*e^2 + b*e + c = 0, where
  c = const - total, and keeps the positive root.

  Args:
    total: target number of trainable parameters for the whole model.

  Returns:
    A tuple (syl_emb_dim, syl_hidden_size) of ints, where the second entry
    is round(ratio * e) for the unrounded root e.

  Raises:
    ValueError: if the quadratic has no positive real root, i.e. `total` is
      too small to fit the parts of the model that do not depend on e.
  """
  a = 4 * (ratio + ratio * ratio)
  b = syl_vocab_size + 4 * ratio + 4 * ratio * hidden_size
  # Parameters that do not depend on e: the recurrent weights and bias of the
  # first word-level layer, every further word-level layer, and the softmax.
  c = 4 * (hidden_size * hidden_size + hidden_size) +\
      (num_layers - 1) * 4 * (hidden_size * hidden_size + hidden_size * hidden_size + hidden_size) +\
      hidden_size * word_vocab_size + word_vocab_size - total
  # np.roots does not promise an order for the roots, so select the positive
  # real one explicitly instead of unpacking by position.
  roots = np.roots([a, b, c])
  real_roots = roots[np.isreal(roots)].real
  positive_roots = real_roots[real_roots > 0]
  if positive_roots.size == 0:
    raise ValueError('parameter budget %r is too small for this model' % (total,))
  pos = float(positive_roots.max())
  return (int(round(pos)), int(round(ratio * pos)))

# Target total number of trainable parameters for each language; the entry
# for `lang` is passed to get_syl_hyperparams as its `total` argument.
total = {'ptb': 5310115,
         'fr':  9820689,
         'es': 10430455,
         'de': 13317075,
         'cs': 16251327,
         'ru': 21353640}

# Derive the syllable embedding size and syllable LSTM hidden size that
# bring the model to that parameter budget.
syl_emb_dim, syl_hidden_size = get_syl_hyperparams(total[lang])
print('Syllable embedding size: ', syl_emb_dim)
print('Syllable LSTM hidden size: ', syl_hidden_size)

data has 9328 unique syllables
max word length in syllables is set to 5
Syllable embedding size:  88
Syllable LSTM hidden size:  176


In [5]:
def word_ix_to_syl_ixs(word_ix):
  """Encode one word as exactly `max_word_len` syllable indices.

  The word is looked up in `ix_to_word` and split with `my_syllables`.
  Syllables beyond `max_word_len` are dropped; shorter words are
  right-padded with `zero_pad_syl`.

  Args:
    word_ix: index of the word in the word vocabulary.

  Returns:
    A list of `max_word_len` syllable indices taken from `syl_to_ix`.
  """
  kept = my_syllables(ix_to_word[word_ix])[:max_word_len]
  padding = [zero_pad_syl] * (max_word_len - len(kept))
  return [syl_to_ix[piece] for piece in kept + padding]

# One sample word per language, used to eyeball the syllable encoding.
constitution = {'ptb': 'constitution', 
                'fr': 'constitution', 
                'de': 'verfassung', 
                'cs': 'ústava',
                'es': 'constitución',
                'ru': 'конституция'}

# Encode the sample word as syllable indices, map them back to strings and
# print them joined by '-' (padding syllables show up as blanks).
sample_syls = [ix_to_syl[ix] for ix in word_ix_to_syl_ixs(word_to_ix[constitution[lang]])]
print('-'.join(sample_syls))

consti-tu-tion- - 


In [6]:
class batch_producer(object):
  """One-pass iterator over (input, target) batches of a token sequence.

  The sequence is cut to a multiple of `batch_size` and laid out as
  `batch_size` parallel rows.  Each iteration yields a window of
  `num_steps` columns as the input and the same window shifted one column
  to the right as the target.
  """

  def __init__(self, raw_data, batch_size, num_steps):
    self.raw_data = raw_data
    self.batch_size = batch_size
    self.num_steps = num_steps

    # Columns per row; leftover tokens that do not fill a row are dropped.
    self.batch_len = len(self.raw_data) // self.batch_size
    usable = self.batch_size * self.batch_len
    self.data = np.reshape(self.raw_data[0:usable],
                           (self.batch_size, self.batch_len))

    # One column is held back so the last target window still fits.
    self.epoch_size = (self.batch_len - 1) // self.num_steps
    self.i = 0

  def __iter__(self):
    return self

  def __next__(self):
    if self.i >= self.epoch_size:
      raise StopIteration()
    start = self.i * self.num_steps
    stop = start + self.num_steps
    # Both arrays are of shape [batch_size, num_steps].
    batch_x = self.data[:, start:stop]
    batch_y = self.data[:, start + 1:stop + 1]
    self.i += 1
    return (batch_x, batch_y)

In [7]:
class Model:
  """Syllable-aware LSTM language model graph.

  Each word arrives as `max_word_len` syllable indices.  A syllable-level
  LSTM reads the syllable embeddings of one word and its final hidden state
  becomes the input of a stacked word-level LSTM, whose outputs go through a
  softmax layer over the word vocabulary.

  Attributes set by __init__:
    x: int32 placeholder [batch_size, num_steps, max_word_len] of syllable indices.
    y: int32 placeholder [batch_size, num_steps] of target word indices.
    init_state: zero state of the word-level LSTM stack.
    state: state of the word-level LSTM stack after the last step.
    cost: summed per-position loss divided by batch_size.
  """

  # The embedding variable and both LSTM cells are created once, when the
  # class body is executed, and are then shared by every instance.
  syl_embedding = tf.get_variable("syl_embedding", [syl_vocab_size, syl_emb_dim], dtype=tf.float32,
                                  initializer=tf.random_uniform_initializer(-init_scale, init_scale))
  syl_lstm_cell = tf.nn.rnn_cell.LSTMCell(syl_hidden_size, forget_bias=0.0)
  cell = tf.nn.rnn_cell.LSTMCell(hidden_size, forget_bias=0.0)

  def __init__(self, batch_size, need_reuse=False):
    """Build the forward graph and the cost.

    Args:
      batch_size: number of sequences per batch for this graph.
      need_reuse: passed as `reuse` to the variable scopes; pass True when
        another instance has already created the LSTM and softmax variables.
    """
    self.x = tf.placeholder(tf.int32, [batch_size, num_steps, max_word_len])
    self.y = tf.placeholder(tf.int32, [batch_size, num_steps])

    words_embedded = tf.nn.embedding_lookup(self.syl_embedding, self.x)
    # Split along axis 1 (the num_steps axis): one tensor per word position.
    words_emb_as_list = tf.unpack(words_embedded, axis=1)

    syl_init_state = self.syl_lstm_cell.zero_state(batch_size, dtype=tf.float32)

    lstm_input = []
    # The first word position creates the syllable-LSTM variables (unless
    # need_reuse is set); every later position reuses them.
    reuse_syl_lstm = need_reuse
    for word_emb in words_emb_as_list:
      #list of max_word_len elements of [batch_size, syl_emb_dim]
      word_in_syls = tf.unpack(word_emb, axis=1)
      with tf.variable_scope('syl_lstm', reuse=reuse_syl_lstm):
        syl_outputs, syl_state = tf.nn.rnn(self.syl_lstm_cell, word_in_syls, dtype=tf.float32, 
                                           initial_state=syl_init_state, sequence_length=self.length(word_emb))
      # The final hidden state of the syllable LSTM represents the word.
      lstm_input.append(syl_state.h)
      reuse_syl_lstm = True

    # NOTE(review): apply_dropout() runs only after the syllable LSTM above
    # has been unrolled, so the DropoutWrapper it assigns to
    # self.syl_lstm_cell is not used by any op built in this method.  Only
    # the word-level cell below is built with its wrapper - confirm whether
    # that is intended.
    self.apply_dropout()
    # The same cell object is repeated num_layers times to form the stack.
    self.cell = tf.nn.rnn_cell.MultiRNNCell([self.cell] * num_layers)

    self.init_state = self.cell.zero_state(batch_size, dtype=tf.float32)
    with tf.variable_scope('lstm_rnn', reuse=need_reuse):
      outputs, self.state = tf.nn.rnn(self.cell, lstm_input, dtype=tf.float32, initial_state=self.init_state)

    with tf.variable_scope('softmax_params', reuse=need_reuse):
      weights = tf.get_variable('weights', [hidden_size, word_vocab_size], 
                                initializer=tf.random_uniform_initializer(-init_scale, init_scale), 
                                dtype=tf.float32)
      biases = tf.get_variable('biases', [word_vocab_size], 
                               initializer=tf.random_uniform_initializer(-init_scale, init_scale),
                               dtype=tf.float32)

    # Concatenate the per-step outputs and flatten them to rows of
    # hidden_size values, one row per (sequence, step) position.
    output = tf.reshape(tf.concat(1, outputs), [-1, hidden_size])
    logits = tf.matmul(output, weights) + biases
    # Every position gets weight 1.
    loss = tf.nn.seq2seq.sequence_loss_by_example(
            [logits],
            [tf.reshape(self.y, [-1])],
            [tf.ones([batch_size * num_steps], dtype=tf.float32)])
    self.cost = tf.reduce_sum(loss) / batch_size

  def length(self, sequence):
    """Count, per batch row, the positions whose vector is not all zeros.

    Args:
      sequence: rank-3 tensor; axis 1 is the position axis and axis 2 the
        feature axis (the reductions below fix these roles).

    Returns:
      An int32 tensor with one count per row of `sequence`.
    """
    # 1.0 where the largest absolute feature is non-zero, 0.0 otherwise.
    used = tf.sign(tf.reduce_max(tf.abs(sequence), reduction_indices=2))
    length = tf.reduce_sum(used, reduction_indices=1)
    length = tf.cast(length, tf.int32)
    return length

  def apply_dropout(self):
    """Wrap both cells in DropoutWrapper using `keep_prob`.

    The word-level cell gets output dropout; the syllable cell gets input
    and output dropout.  The wrapped cells are stored on the instance, so
    the class-level cells stay unwrapped.
    """
    self.cell = tf.nn.rnn_cell.DropoutWrapper(self.cell, output_keep_prob=keep_prob)
    self.syl_lstm_cell = tf.nn.rnn_cell.DropoutWrapper(self.syl_lstm_cell, input_keep_prob=keep_prob, 
                                                       output_keep_prob=keep_prob)

In [8]:
class Train(Model):
  """Model plus the ops needed for training with clipped-gradient SGD.

  Attributes added on top of Model:
    clear_syl_embedding_padding: op that overwrites row 0 of the syllable
      embedding (the padding syllable) with zeros.
    lr: non-trainable learning-rate variable.
    train_op: one SGD step on `cost` with gradients clipped by global norm.
    new_lr, lr_update: placeholder and assign op used by assign_lr.
  """

  def __init__(self, batch_size):
    zero_row = tf.constant(0.0, shape=[1, syl_emb_dim], dtype=tf.float32)
    self.clear_syl_embedding_padding = tf.scatter_update(self.syl_embedding, [0], zero_row)

    super().__init__(batch_size)

    self.lr = tf.Variable(0.0, trainable=False, dtype=tf.float32)

    # Clip the gradients of the cost by their global norm before applying them.
    trainable_vars = tf.trainable_variables()
    raw_grads = tf.gradients(self.cost, trainable_vars)
    clipped_grads, _ = tf.clip_by_global_norm(raw_grads, max_grad_norm)

    sgd = tf.train.GradientDescentOptimizer(self.lr)
    step_counter = tf.contrib.framework.get_or_create_global_step()
    self.train_op = sgd.apply_gradients(zip(clipped_grads, trainable_vars),
                                        global_step=step_counter)

    # The learning rate is changed by feeding a value into this placeholder.
    self.new_lr = tf.placeholder(tf.float32, shape=[], name="new_learning_rate")
    self.lr_update = tf.assign(self.lr, self.new_lr)

  def assign_lr(self, session, lr_value):
    """Set the learning-rate variable to `lr_value` inside `session`."""
    feed = {self.new_lr: lr_value}
    session.run(self.lr_update, feed_dict=feed)

In [9]:
class Eval(Model):
  """Evaluation variant of Model whose graph is built without dropout."""

  def apply_dropout(self):
    """Do nothing, so the cells are left without a DropoutWrapper."""
    return None

In [10]:
# Build the training graph first: with need_reuse left at False it creates
# the LSTM and softmax variables.
train = Train(batch_size)
# The evaluation graphs pass need_reuse=True so their variable scopes reuse
# the variables created above.
valid = Eval(batch_size, need_reuse=True)
# The test graph is built for a batch size of 1.
test = Eval(1, need_reuse=True)

In [None]:
def model_size():
  """Return the number of scalar parameters in all trainable variables.

  For every variable reported by tf.trainable_variables() the sizes of its
  dimensions are multiplied together; the products are then summed.
  """
  param_count = 0
  for variable in tf.trainable_variables():
    elements = 1
    for dim in variable.get_shape():
      elements = elements * dim.value
    param_count = param_count + elements
  return param_count

# Parameter count measured from the variables in the TensorFlow graph.
print('Model size is: ', model_size())

def my_model_size():
  """Compute the model's parameter count in closed form.

  The count is the sum of the syllable embedding matrix, the syllable LSTM,
  the word-level LSTM stack and the softmax layer.  Each LSTM layer with
  input size `n_in` and hidden size `n_h` contributes
  4 * (n_in * n_h + n_h * n_h + n_h) parameters.  The first word-level layer
  reads the syllable LSTM state (size `syl_hidden_size`); each of the
  remaining `num_layers - 1` layers reads the layer below it (size
  `hidden_size`).

  Returns:
    The total number of parameters as an int.
  """
  embedding = syl_vocab_size * syl_emb_dim
  syl_lstm = 4 * (syl_emb_dim * syl_hidden_size + syl_hidden_size * syl_hidden_size + syl_hidden_size)
  first_layer = 4 * (syl_hidden_size * hidden_size + hidden_size * hidden_size + hidden_size)
  # Follow num_layers instead of assuming exactly two word-level layers.
  upper_layers = (num_layers - 1) * 4 * (hidden_size * hidden_size + hidden_size * hidden_size + hidden_size)
  softmax = hidden_size * word_vocab_size + word_vocab_size
  return embedding + syl_lstm + first_layer + upper_layers + softmax
        
# Closed-form parameter count, printed for comparison with the measured one.
print('My model size: ', my_model_size())

Model size is:  9821296
My model size:  9821296


In [None]:
num_epochs = 35
display_freq = 200


def to_syl_batch(word_batch):
  """Expand a 2-D array of word indices into syllable indices.

  Args:
    word_batch: array of shape [rows, cols] holding word indices.

  Returns:
    An int32 array of shape [rows, cols, max_word_len] in which every word
    is replaced by the output of word_ix_to_syl_ixs.
  """
  rows, cols = word_batch.shape
  syl_batch = np.empty([rows, cols, max_word_len], dtype=np.int32)
  for t in range(cols):
    for i in range(rows):
      syl_batch[i, t] = word_ix_to_syl_ixs(word_batch[i, t])
  return syl_batch


def evaluate_perplexity(sess, model, raw_data, eval_batch_size):
  """Run `model` over `raw_data` once and return its perplexity.

  The LSTM state is carried over from one batch to the next, starting from
  the model's zero state.

  Args:
    sess: the TensorFlow session to run in.
    model: an evaluation model built for `eval_batch_size`.
    raw_data: list of word indices to evaluate on.
    eval_batch_size: batch size the model's placeholders were built with.

  Returns:
    exp(summed cost / number of steps processed).
  """
  total_cost = 0
  total_steps = 0
  state = None
  for batch_x, batch_y in batch_producer(raw_data, eval_batch_size, num_steps):
    if state is None: state = sess.run(model.init_state)
    c, state = sess.run([model.cost, model.state],
                        feed_dict={model.x: to_syl_batch(batch_x), model.y: batch_y,
                                   model.init_state: state})
    total_cost += c
    total_steps += num_steps
  return np.exp(total_cost / total_steps)


init = tf.global_variables_initializer()
with tf.Session() as sess:
  sess.run(init)
  # The padding syllable's embedding has to be zero before the first step.
  sess.run(train.clear_syl_embedding_padding)
  prev_perplexity = float('inf')

  for epoch in range(num_epochs):
    start_time = time.time()
    train.assign_lr(sess, learning_rate)

    iters = 0
    costs = 0
    training_state = None

    for batch_x, batch_y in batch_producer(train_raw_data, batch_size, num_steps):
      my_x = to_syl_batch(batch_x)

      if training_state is None: training_state = sess.run(train.init_state)
      _, c, training_state, my_lr = sess.run([train.train_op, train.cost, train.state, train.lr],
                                             feed_dict={train.x: my_x, train.y: batch_y,
                                                        train.init_state: training_state})
      # The SGD step may have moved the padding embedding; zero it again.
      sess.run(train.clear_syl_embedding_padding)

      # Count this batch in both totals before reporting, so the running
      # perplexity and speed divide by the number of steps actually summed.
      costs += c
      iters += num_steps
      if iters % (display_freq*num_steps) == 0:
        print('step =', iters/num_steps, end=', ')
        print('perplexity =', np.exp(costs / iters), end=', ')
        print('learning rate =', my_lr, end=', ')
        print('speed =', round(iters * batch_size / (time.time() - start_time)), ' wps')

    print('epoch ', epoch+1, end = ': ')
    print('perplexity =', np.exp(costs / iters), end=', ')
    print('learning rate =', my_lr)

    # Get validation set perplexity
    cur_perplexity = evaluate_perplexity(sess, valid, valid_raw_data, batch_size)
    print('Validation set perplexity =', cur_perplexity)

    # Decay the learning rate when validation perplexity improved by less than 1.
    if prev_perplexity - cur_perplexity < 1:
      learning_rate *= lr_decay
    prev_perplexity = cur_perplexity

    # Get test set perplexity
    print('Test set perplexity =', evaluate_perplexity(sess, test, test_raw_data, 1))

    print('-' * 100)

step = 200.0, perplexity = 1657.49222786, learning rate = 1.0, speed = 6965  wps
step = 400.0, perplexity = 1148.66854615, learning rate = 1.0, speed = 7498  wps
step = 600.0, perplexity = 830.312700047, learning rate = 1.0, speed = 7695  wps
step = 800.0, perplexity = 645.592859047, learning rate = 1.0, speed = 7792  wps
step = 1000.0, perplexity = 533.697402833, learning rate = 1.0, speed = 7846  wps
step = 1200.0, perplexity = 455.729467602, learning rate = 1.0, speed = 7888  wps
step = 1400.0, perplexity = 400.571106155, learning rate = 1.0, speed = 7916  wps
epoch  1: perplexity = 392.728500068, learning rate = 1.0
Validation set perplexity = 363.438299118
Test set perplexity = 375.822366321
----------------------------------------------------------------------------------------------------
step = 200.0, perplexity = 184.472390662, learning rate = 1.0, speed = 8010  wps
step = 400.0, perplexity = 171.713923541, learning rate = 1.0, speed = 8042  wps
step = 600.0, perplexity = 165.