In [None]:
import numpy as np
import tensorflow as tf
import time

In [None]:
# ---- Optimisation / initialisation settings shared by the whole notebook ----
batch_size = 20       # sequences per training / validation batch
max_grad_norm = 5     # global-norm threshold passed to gradient clipping
lr_decay = 0.5        # factor applied to the learning rate when validation stalls
learning_rate = 1.0   # initial SGD learning rate
init_scale = 0.05     # most variables start uniformly in [-init_scale, init_scale]

# ---- LSTM settings ----
num_steps = 35        # words per unrolled sequence
hidden_size = 300     # units per LSTM layer
num_layers = 2        # stacked LSTM layers
keep_prob = 0.5       # output keep probability of the dropout wrapper

# ---- Character-CNN settings ----
char_emb_dim = 15
filter_widths = [1, 2, 3, 4, 5, 6]
# A filter of width w has 25 * w output channels, so the concatenated CNN
# output is 25 * (1 + 2 + ... + 6) wide.
cnn_output_dim = 25 * sum(filter_widths)

In [None]:
# Possible languages: ptb, fr, es, de, cs, ru ('ptb' means the English PTB dataset)
lang = 'ru'

# The vocabulary is built from the training split only. For PTB every line
# break becomes an explicit '<eos>' token; for the other corpora it is
# treated as plain whitespace.
# NOTE(review): for 'ptb' the vocabulary is read from simple-examples/data/,
# whereas the train/valid/test splits further down are read from data/ptb/ --
# confirm that both layouts exist and hold the same training text.
if lang == 'ptb':
  train_path, line_sep = 'simple-examples/data/ptb.train.txt', '<eos>'
else:
  train_path, line_sep = 'data/'+lang+'/train.txt', ' '

# Decode explicitly as UTF-8 so non-ASCII corpora (e.g. 'ru') do not depend on
# the platform's default locale; `with` closes the handle after reading.
with open(train_path, 'r', encoding='utf-8') as train_file:
  word_data = train_file.read().replace('\n', line_sep).split()

# Sorted so that word ids are identical on every kernel restart (iterating a
# bare set of strings changes order between Python processes).
words = sorted(set(word_data))
word_data_size, word_vocab_size = len(word_data), len(words)
print('data has %d words, %d unique' % (word_data_size, word_vocab_size))

# Two-way mapping between a word and its integer id.
word_to_ix = { word:i for i,word in enumerate(words) }
ix_to_word = { i:word for i,word in enumerate(words) }

def get_word_raw_data(input_file):
  """Read a corpus file and return it as a flat list of word ids.

  Line breaks become '<eos>' tokens when `lang` is 'ptb' and plain
  whitespace otherwise; the text is then split on whitespace and every
  token is looked up in `word_to_ix`.

  Args:
    input_file: path of a UTF-8 encoded text file.

  Returns:
    A list of ints, one per token, in file order.

  Raises:
    KeyError: if the file contains a token that is not in `word_to_ix`
      (out-of-vocabulary words are deliberately not skipped).
  """
  line_sep = '<eos>' if lang == 'ptb' else ' '
  # Explicit encoding keeps non-ASCII corpora independent of the locale;
  # the context manager closes the handle even if reading fails.
  with open(input_file, 'r', encoding='utf-8') as corpus_file:
    data = corpus_file.read().replace('\n', line_sep).split()
  return [word_to_ix[w] for w in data]

# Encode the three corpus splits as lists of word ids, in train/valid/test order.
train_raw_data, valid_raw_data, test_raw_data = [
  get_word_raw_data(split_path)
  for split_path in ('data/'+lang+'/train.txt', 'data/'+lang+'/valid.txt', 'data/'+lang+'/test.txt')
]

In [None]:
# Every character that occurs in at least one vocabulary word. Sorted so that
# character ids are identical on every kernel restart (iterating a bare set of
# strings changes order between Python processes).
chars = sorted(set(''.join(words)))

# Three more characters: beginning and end of each word, zero-padding
WORD_BEG_CHAR, WORD_END_CHAR, ZERO_PAD_CHAR = '⎡', '⎦', ' '
# The padding character is placed at index 0; the word delimiters go last.
chars.insert(0, ZERO_PAD_CHAR)
chars.extend([WORD_BEG_CHAR, WORD_END_CHAR])
# Derived from the final list, so it always includes the three extra symbols.
char_vocab_size = len(chars)
# +2 leaves room for the begin/end markers around the longest word.
max_word_len = max([ len(word) for word in words ]) + 2

print('data has %d unique characters' % char_vocab_size)

# Two-way mapping between a character and its integer id.
char_to_ix = { char:i for i,char in enumerate(chars) }
ix_to_char = { i:char for i,char in enumerate(chars) }

def word_ix_to_char_ixs(word_ix):
  """Return the character ids of a word, framed and right-padded.

  The word is wrapped in WORD_BEG_CHAR / WORD_END_CHAR and then padded on the
  right with ZERO_PAD_CHAR up to max_word_len characters.
  """
  framed = ''.join([WORD_BEG_CHAR, ix_to_word[word_ix], WORD_END_CHAR])
  # A non-positive count yields an empty string, i.e. no padding is added.
  padding = ZERO_PAD_CHAR * (max_word_len - len(framed))
  return [char_to_ix[ch] for ch in framed + padding]

def word_ix_to_one_hot(word_ix):
  """Return a float32 vector of length word_vocab_size with a 1.0 at `word_ix`."""
  one_hot = np.zeros((word_vocab_size,), dtype=np.float32)
  one_hot[word_ix] = 1.0
  return one_hot

In [None]:
class batch_producer(object):
  """Single-pass iterator over (input, target) windows of a token sequence.

  The sequence is cut into `batch_size` equally long rows (leftover tokens at
  the end are dropped). Each iteration yields a pair of arrays of shape
  [batch_size, num_steps]; the target window is the input window shifted one
  position to the right. The iterator is exhausted after `epoch_size` pairs.
  """

  def __init__(self, raw_data, batch_size, num_steps):
    self.raw_data = raw_data
    self.batch_size = batch_size
    self.num_steps = num_steps

    # Length of one row; tokens that do not fill a whole row are discarded.
    self.batch_len = len(raw_data) // batch_size
    usable = batch_size * self.batch_len
    self.data = np.reshape(raw_data[0:usable], (batch_size, self.batch_len))

    # One column is held back so the last window still has a shifted target.
    self.epoch_size = (self.batch_len - 1) // num_steps
    self.i = 0

  def __iter__(self):
    return self

  def __next__(self):
    if self.i >= self.epoch_size:
      raise StopIteration()

    start = self.i * self.num_steps
    stop = start + self.num_steps
    self.i += 1
    # Inputs cover [start, stop); targets cover the same span moved by one.
    return (self.data[:, start:stop], self.data[:, start + 1:stop + 1])

In [None]:
class Model:
  """Character-level CNN -> highway layer -> stacked LSTM -> softmax language model.

  Constructing an instance adds one copy of the graph to the default TensorFlow
  graph. Instances expose:
    x          int32 placeholder [batch_size, num_steps, max_word_len] of character ids
    y          int32 placeholder [batch_size, num_steps] of target word ids
    cnn_output concatenated max-pooled convolution features, one row per word
    init_state zero state of the stacked LSTM cell
    state      LSTM state after the last time step
    cost       summed per-position loss divided by batch_size

  Pass need_reuse=True for every instance after the first so that the
  variables created inside the variable scopes are reused instead of re-created.
  """
  
  # Class attribute: evaluated once when the class body runs, so every instance
  # (and subclass) looks up characters in the same embedding variable.
  char_embedding = tf.get_variable("char_embedding", [char_vocab_size, char_emb_dim], dtype=tf.float32,
                                  initializer=tf.random_uniform_initializer(-init_scale, init_scale))
  
  # Class attribute: the base LSTM cell. __init__ rebinds self.cell on the
  # instance (via apply_dropout and the MultiRNNCell wrapper) without touching this one.
  cell = tf.nn.rnn_cell.LSTMCell(hidden_size, forget_bias=0.0)
  
  def __init__(self, batch_size, need_reuse=False):
    """Build the graph for sequences of num_steps words in batches of batch_size.

    Args:
      batch_size: number of sequences per batch; fixes the placeholder shapes.
      need_reuse: forwarded as `reuse` to every variable scope opened here.
    """
    self.x = tf.placeholder(tf.int32, [batch_size, num_steps, max_word_len])
    self.y = tf.placeholder(tf.int32, [batch_size, num_steps])
    
    # Embed every character, then flatten batch and time so that each row is one
    # word: [batch_size * num_steps, max_word_len, char_emb_dim].
    words_embedded = tf.nn.embedding_lookup(self.char_embedding, self.x)
    words_embedded = tf.reshape(words_embedded, [-1, max_word_len, char_emb_dim])

    def conv_layer(cur_char_inputs, filt_shape, bias_shape):
      """Convolve over the characters of each word, apply tanh, max-pool over positions.

      filt_shape is [filter_width, char_emb_dim, num_filters]; the result has
      one row per word and num_filters columns.
      """
      filt = tf.get_variable('filt', filt_shape, initializer=tf.random_uniform_initializer(-init_scale, init_scale))
      bias = tf.get_variable('bias', bias_shape, initializer=tf.random_uniform_initializer(-init_scale, init_scale))
      # Stride 1 with VALID padding leaves max_word_len - filter_width + 1 positions.
      conv = tf.nn.conv1d(cur_char_inputs, filt, 1, padding='VALID')
      feature_map = tf.nn.tanh(conv + bias)
      # max_pool expects a 4-D input, so a dummy axis of size 1 is inserted at position 1.
      feature_map_reshaped = tf.expand_dims(feature_map, 1)
      # The pooling window spans every remaining position, i.e. one maximum per filter.
      pool = tf.nn.max_pool(feature_map_reshaped, [1, 1, max_word_len - filt_shape[0] + 1, 1], [1, 1, 1, 1], 'VALID')
      # Drop the dummy axis and the pooled-away position axis.
      return(tf.squeeze(pool, axis=[1,2]))

    def words_filter(cur_char_inputs):
      """Apply one conv_layer per width in filter_widths and concatenate the results."""
      pools = []
      for w in filter_widths:
        # A separate scope per width keeps the 'filt'/'bias' variable names distinct.
        with tf.variable_scope('filter' + str(w)):
          #pools.append(conv_layer(cur_char_inputs, [w, char_emb_dim, min(200, w * 50)], [min(200, w * 50)]))
          # A filter of width w gets w * 25 output channels.
          pools.append(conv_layer(cur_char_inputs, [w, char_emb_dim, w * 25], [w * 25]))
      # Axis-first argument order (pre-1.0 TensorFlow): concatenate along axis 1.
      return tf.concat(1, pools)
   
    
    with tf.variable_scope('cnn_output', reuse=need_reuse) as scope:
      self.cnn_output = tf.reshape(words_filter(words_embedded), [-1, cnn_output_dim])
  
    # Highway layer: output = gate * relu(x W_h + b_h) + (1 - gate) * x,
    # with gate = sigmoid(x W_t + b_t).
    with tf.variable_scope('highway', reuse=need_reuse):
      transf_weights = tf.get_variable('transf_weights', [cnn_output_dim, cnn_output_dim],
                                       initializer=tf.random_uniform_initializer(-init_scale, init_scale),
                                       dtype=tf.float32)
      # Gate biases start around -2, so the sigmoid gate starts closer to 0
      # than to 1 and the layer initially passes most of its input through.
      transf_biases = tf.get_variable('transf_biases', [cnn_output_dim],
                                     initializer=tf.random_uniform_initializer(-2-init_scale, -2+init_scale),
                                     dtype=tf.float32)
      highw_weights = tf.get_variable('highw_weights', [cnn_output_dim, cnn_output_dim],
                                       initializer=tf.random_uniform_initializer(-init_scale, init_scale),
                                       dtype=tf.float32)
      highw_biases = tf.get_variable('highw_biases', [cnn_output_dim],
                                     initializer=tf.random_uniform_initializer(-init_scale, init_scale),
                                     dtype=tf.float32)
      transf_gate = tf.nn.sigmoid(tf.matmul(self.cnn_output, transf_weights) + transf_biases)
      highw_output = tf.mul(transf_gate, tf.nn.relu(tf.matmul(self.cnn_output, highw_weights) + highw_biases)) + \
                     tf.mul(tf.ones([cnn_output_dim], dtype=tf.float32) - transf_gate, self.cnn_output)
      
    # Restore the batch/time axes and split along time into a list of
    # num_steps tensors, the input format consumed by tf.nn.rnn below.
    highw_output_reshaped = tf.reshape(highw_output, [batch_size, num_steps, -1])
    lstm_input = tf.unpack(highw_output_reshaped, axis=1)
    
    
    # Hook: subclasses decide whether the cell gets wrapped with dropout.
    self.apply_dropout()
    # Stack num_layers references to the (possibly wrapped) cell.
    self.cell = tf.nn.rnn_cell.MultiRNNCell([self.cell] * num_layers)
    
    self.init_state = self.cell.zero_state(batch_size, dtype=tf.float32)
    with tf.variable_scope('lstm_rnn', reuse=need_reuse):
      outputs, self.state = tf.nn.rnn(self.cell, lstm_input, dtype=tf.float32, initial_state=self.init_state)

    with tf.variable_scope('softmax_params', reuse=need_reuse):
      weights = tf.get_variable('weights', [hidden_size, word_vocab_size], 
                                initializer=tf.random_uniform_initializer(-init_scale, init_scale), 
                                dtype=tf.float32)
      biases = tf.get_variable('biases', [word_vocab_size], 
                               initializer=tf.random_uniform_initializer(-init_scale, init_scale),
                               dtype=tf.float32)

    # Join the per-step outputs along axis 1 and flatten to one row per
    # (sequence, step) position, matching the row order of reshape(self.y, [-1]).
    output = tf.reshape(tf.concat(1, outputs), [-1, hidden_size])
    logits = tf.matmul(output, weights) + biases
    # Every position is weighted 1.0.
    loss = tf.nn.seq2seq.sequence_loss_by_example(
            [logits],
            [tf.reshape(self.y, [-1])],
            [tf.ones([batch_size * num_steps], dtype=tf.float32)])
    # Sum over all positions, averaged over the batch only; callers divide the
    # accumulated cost by the number of steps before exponentiating.
    self.cost = tf.reduce_sum(loss) / batch_size

  
  def apply_dropout(self):
    """Wrap self.cell so its outputs are kept with probability keep_prob."""
    #self.cnn_output = tf.nn.dropout(self.cnn_output, keep_prob=keep_prob)
    self.cell = tf.nn.rnn_cell.DropoutWrapper(self.cell, output_keep_prob=keep_prob)

In [None]:
class Train(Model):
  """Model graph extended with the ops needed for SGD training.

  Adds on top of Model:
    clear_char_embedding_padding  op that overwrites row 0 of the character
                                  embedding with zeros
    lr / new_lr / lr_update       non-trainable learning-rate variable, the
                                  placeholder for a new value and the assign op
    train_op                      one SGD step on globally norm-clipped gradients
  """

  def __init__(self, batch_size):
    # Row 0 of the embedding is reset to zeros whenever this op is run.
    zero_row = tf.constant(0.0, shape=[1, char_emb_dim], dtype=tf.float32)
    self.clear_char_embedding_padding = tf.scatter_update(self.char_embedding, [0], zero_row)

    Model.__init__(self, batch_size)

    # The learning rate lives in the graph so it can be changed between epochs.
    self.lr = tf.Variable(0.0, trainable=False, dtype=tf.float32)

    trainable = tf.trainable_variables()
    raw_grads = tf.gradients(self.cost, trainable)
    clipped_grads, _ = tf.clip_by_global_norm(raw_grads, max_grad_norm)

    sgd = tf.train.GradientDescentOptimizer(self.lr)
    step_counter = tf.contrib.framework.get_or_create_global_step()
    self.train_op = sgd.apply_gradients(zip(clipped_grads, trainable), global_step=step_counter)

    self.new_lr = tf.placeholder(tf.float32, shape=[], name="new_learning_rate")
    self.lr_update = tf.assign(self.lr, self.new_lr)

  def assign_lr(self, session, lr_value):
    """Set the graph's learning-rate variable to `lr_value` using `session`."""
    feed = {self.new_lr: lr_value}
    session.run(self.lr_update, feed_dict=feed)

In [None]:
class Eval(Model):
  """Model graph for evaluation: identical to Model but without dropout."""

  def apply_dropout(self):
    """Override the dropout hook with a no-op, leaving self.cell unwrapped."""
    return None

In [None]:
# The training graph is built first; the two evaluation graphs are built with
# need_reuse=True so their variable scopes reuse the existing variables.
train = Train(batch_size=batch_size)
valid = Eval(batch_size=batch_size, need_reuse=True)
# The test graph processes a single sequence at a time.
test = Eval(batch_size=1, need_reuse=True)

In [None]:
def model_size():
  """Return the total number of scalar entries across all trainable variables."""

  def entry_count(shape):
    # Product of the dimension sizes; stays 1 for a shape with no dimensions.
    count = 1
    for axis in shape:
      count = count * axis.value
    return count

  return sum(entry_count(var.get_shape()) for var in tf.trainable_variables())

print('Model size is: ', model_size())

In [None]:
num_epochs = 25
display_freq = 200

# Padded character-id row of every vocabulary word, computed once. Indexing
# this table with a [batch, num_steps] array of word ids yields the
# [batch, num_steps, max_word_len] CNN input in a single NumPy lookup instead
# of batch * num_steps Python-level conversions per batch.
word_char_ixs = np.array([word_ix_to_char_ixs(ix) for ix in range(word_vocab_size)],
                         dtype=np.int32)


def _perplexity(total_cost, total_steps):
  """Return exp(total_cost / total_steps), or NaN when no step was processed."""
  if total_steps == 0:
    return float('nan')
  return np.exp(total_cost / total_steps)


def _evaluate(sess, model, raw_data, eval_batch_size):
  """Return the perplexity of `model` on `raw_data` without updating any weights.

  Args:
    sess: session holding the initialised variables.
    model: graph exposing x, y, init_state, state and cost (e.g. `valid`, `test`).
    raw_data: list of word ids to evaluate on.
    eval_batch_size: batch size the model's placeholders were built with.

  Returns:
    exp(summed cost / number of steps); NaN if `raw_data` is too short to
    produce a single batch.
  """
  total_cost = 0
  total_steps = 0
  # The LSTM state is carried from one batch to the next across the whole split.
  state = sess.run(model.init_state)
  for batch_x, batch_y in batch_producer(raw_data, eval_batch_size, num_steps):
    c, state = sess.run([model.cost, model.state],
                        feed_dict={model.x: word_char_ixs[batch_x], model.y: batch_y,
                                   model.init_state: state})
    total_cost += c
    total_steps += num_steps
  return _perplexity(total_cost, total_steps)


init = tf.global_variables_initializer()
with tf.Session() as sess:
  sess.run(init)
  sess.run(train.clear_char_embedding_padding)
  prev_perplexity = float('inf')
  # Decay a local copy so that re-running this cell starts again from the
  # configured `learning_rate` instead of the last decayed value.
  current_lr = learning_rate

  for epoch in range(num_epochs):
    start_time = time.time()
    train.assign_lr(sess, current_lr)
    my_lr = current_lr

    iters = 0
    costs = 0
    # Start each epoch from the zero state, then carry the state across batches.
    training_state = sess.run(train.init_state)

    for batch_x, batch_y in batch_producer(train_raw_data, batch_size, num_steps):
      _, c, training_state, my_lr = sess.run([train.train_op, train.cost, train.state, train.lr],
                                             feed_dict={train.x: word_char_ixs[batch_x], train.y: batch_y,
                                                        train.init_state: training_state})
      # The SGD step may have moved the padding row; reset it to zeros.
      sess.run(train.clear_char_embedding_padding)

      costs += c
      # Count this batch's steps before reporting, so the running perplexity
      # and speed divide by the same number of batches that `costs` covers.
      iters += num_steps
      if iters % (display_freq*num_steps) == 0:
        print('step =', iters/num_steps, end=', ')
        print('perplexity =', _perplexity(costs, iters), end=', ')
        print('learning rate =', my_lr, end=', ')
        print('speed =', round(iters * batch_size / (time.time() - start_time)), ' wps')

    print('epoch ', epoch+1, end = ': ')
    print('perplexity =', _perplexity(costs, iters), end=', ')
    print('learning rate =', my_lr)

    # Get validation set perplexity
    cur_perplexity = _evaluate(sess, valid, valid_raw_data, batch_size)
    print('Validation set perplexity =', cur_perplexity)

    # Decay the learning rate when validation perplexity improved by less than 1.
    if prev_perplexity - cur_perplexity < 1:
      current_lr *= lr_decay
    prev_perplexity = cur_perplexity

    # Get test set perplexity
    print('Test set perplexity =', _evaluate(sess, test, test_raw_data, 1))

    print('-' * 100)