In [1]:
from __future__ import print_function
import os

import numpy as np
import zipfile
import tarfile
from six.moves.urllib.request import urlretrieve
import shutil 
import random

import string
import tensorflow as tf

# Dirs - must be absolute paths!
LOG_DIR = '/tmp/tf/ptb_char_gru/hidden64_batch100_seq10/'
# Local dir where PTB files will be stored.
PTB_DIR = '/home/tkornuta/data/ptb/'

# Filenames.
TRAIN = "ptb.train.txt"
VALID = "ptb.valid.txt"
TEST = "ptb.test.txt"

# Size of the hidden state.
HIDDEN_SIZE = 64

# Batch size.
BATCH_SIZE = 100

# A single recurrent layer of number of units = sequences of length
# e.g. 200 bytes
SEQ_LENGTH = 10


### Check/maybe download PTB.

In [2]:
def maybe_download_ptb(path, 
                       filename='simple-examples.tgz', 
                       url='http://www.fit.vutbr.cz/~imikolov/rnnlm/', 
                       expected_bytes =34869662):
  # Eventually create the PTB dir.
  if not tf.gfile.Exists(path):
    tf.gfile.MakeDirs(path)
  """Download a file if not present, and make sure it's the right size."""
  _filename = path+filename
  if not os.path.exists(_filename):
    print('Downloading %s...' % filename)
    _filename, _ = urlretrieve(url+filename, _filename)
  statinfo = os.stat(_filename)
  if statinfo.st_size == expected_bytes:
    print('Found and verified', (_filename), '(', statinfo.st_size, ')')
  else:
    print(statinfo.st_size)
    raise Exception(
      'Failed to verify ' + _filename + '. Can you get to it with a browser?')
  return filename

filename = maybe_download_ptb(PTB_DIR)

Found and verified /home/tkornuta/data/ptb/simple-examples.tgz ( 34869662 )


### Extract dataset-related files from the PTB archive.

In [3]:
def extract_ptb(path, filename='simple-examples.tgz', files=["ptb.train.txt", "ptb.valid.txt", "ptb.test.txt", 
                                       "ptb.char.train.txt", "ptb.char.valid.txt", "ptb.char.test.txt"]):
    """Extracts files from PTB archive."""
    # Extract
    tar = tarfile.open(path+filename)
    tar.extractall(path)
    tar.close()
    # Copy files
    for file in files:
        shutil.copyfile(PTB_DIR+"simple-examples/data/"+file, PTB_DIR+file)
    # Delete directory
    shutil.rmtree(PTB_DIR+"simple-examples/")        

extract_ptb(PTB_DIR)

### Load train, valid and test texts.

In [4]:
def read_data(filename, path):
    with open(path+filename, 'r') as myfile:
        data=myfile.read()# .replace('\n', '')
        return data

train_text = read_data(TRAIN, PTB_DIR)
train_size=len(train_text)
print(train_size, train_text[:100])

valid_text = read_data(VALID, PTB_DIR)
valid_size=len(valid_text)
print(valid_size, valid_text[:64])

test_text = read_data(TEST, PTB_DIR)
test_size=len(test_text)
print(test_size, test_text[:64])

5101618  aer banknote berlitz calloway centrust cluett fromstein gitano guterman hydro-quebec ipo kia memote
399782  consumers may want to move their telephones a little closer to 
449945  no it was n't black monday 
 but while the new york stock excha


### Utility functions to map characters to vocabulary IDs and back.

In [5]:
vocabulary_size = 59 # [A-Z] + [a-z] + ' ' +few 'in between; + punctuation
first_letter = ord(string.ascii_uppercase[0]) # ascii_uppercase before lowercase! 
print("vocabulary size = ", vocabulary_size)
print(first_letter)

def char2id(char):
  """ Converts char to id (int) with one-hot encoding handling of unexpected characters"""
  if char in string.ascii_letters:# or char in string.punctuation or char in string.digits:
    return ord(char) - first_letter + 1
  elif char == ' ':
    return 0
  else:
    # print('Unexpected character: %s' % char)
    return 0
  
def id2char(dictid):
  """ Converts single id (int) to character"""
  if dictid > 0:
    return chr(dictid + first_letter - 1)
  else:
    return ' '

def characters(probabilities):
  """Turn a 1-hot encoding or a probability distribution over the possible
  characters back into its (most likely) character representation."""
  return [id2char(c) for c in np.argmax(probabilities, 1)]

def batches2string(batches):
  """Convert a sequence of batches back into their (most likely) string
  representation."""
  s = [''] * batches[0].shape[0]
  for b in batches:
    s = [''.join(x) for x in zip(s, characters(b))]
  return s

#print(len(string.punctuation))
#for i in string.ascii_letters:
#    print (i, char2id(i))


print(char2id('a'), char2id('A'), char2id('z'), char2id('Z'), char2id(' '), char2id('ï'))
print(id2char(char2id('a')), id2char(char2id('A')))
#print(id2char(65), id2char(33), id2char(90), id2char(58), id2char(0))
#bankno
sample = np.zeros(shape=(1, vocabulary_size), dtype=np.float)
sample[0, char2id(' ')] = 1.0
print(sample)

vocabulary size =  59
65
33 1 58 26 0 0
a A
[[ 1.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
   0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
   0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
   0.  0.  0.  0.  0.]]


### Helper class for batch generation

In [6]:
class BatchGenerator(object):
  def __init__(self, text, batch_size, seq_length, vocab_size):
    """
    Initializes the batch generator object. Stores the variables and first "letter batch".
    text is text to be processed
    batch_size is size of batch (number of samples)
    seq_length represents the length of sequence
    vocab_size is number of words in vocabulary (assumes one-hot encoding)
    """
    # Store input parameters.
    self._text = text
    self._text_size = len(text)
    self._batch_size = batch_size
    self._seq_length = seq_length
    self._vocab_size = vocab_size
    # Divide text into segments depending on number of batches, each segment determines a cursor position for a batch.
    segment = self._text_size // batch_size
    # Set initial cursor position.
    self._cursor = [ offset * segment for offset in range(batch_size)]
    # Store first "letter batch".
    self._last_letter_batch = self._next_letter_batch()
  
  def _next_letter_batch(self):
    """
    Returns a batch containing of encoded single letters depending on the current batch 
    cursor positions in the data.
    Returned "letter batch" is of size batch_size x vocab_size
    """
    letter_batch = np.zeros(shape=(self._batch_size, self._vocab_size), dtype=np.float)
    # Iterate through "samples"
    for b in range(self._batch_size):
      # Set 1 in position pointed out by one-hot char encoding.
      letter_batch[b, char2id(self._text[self._cursor[b]])] = 1.0
      self._cursor[b] = (self._cursor[b] + 1) % self._text_size
    return letter_batch
  
  def next(self):
    """Generate the next array of batches from the data. The array consists of
    the last batch of the previous array, followed by num_unrollings new ones.
    """
    # First add last letter from previous batch (the "additional one").
    batches = [self._last_letter_batch]
    for step in range(self._seq_length):
      batches.append(self._next_letter_batch())
    # Store last "letter batch" for next batch.
    self._last_letter_batch = batches[-1]
    return batches


In [7]:
# Trick - override first 10 chars
#list1 = list(train_text)
#for i in range(2):
#    list1[i] = 'z'
#train_text = ''.join(list1)
#print("Train set =", train_text[0:100])

# Create objects for training, validation and testing batch generation.
train_batches = BatchGenerator(train_text, BATCH_SIZE, SEQ_LENGTH, vocabulary_size)

# Get first training batch.
batch = train_batches.next()
print(len(batch))
print(batch[0].shape)
#print("Batch = ", batch)
#print(batches2string(batch))
#print("batch len = num of enrollings",len(batch))
#for i in range(num_unrollings):
#    print("i = ", i, "letter=", batches2string(batch)[0][i][0], "bits = ", batch[i][0])


# For validation  - process the whole text as one big batch.
VALID_BATCH_SIZE = int(np.floor(valid_size/SEQ_LENGTH))
valid_batches = BatchGenerator(valid_text, VALID_BATCH_SIZE, SEQ_LENGTH, vocabulary_size)
valid_batch = valid_batches.next()
#print (VALID_BATCH_SIZE)
#print(len(valid_batch))
#print(valid_batch[0].shape)

# For texting  - process the whole text as one big batch.
TEST_BATCH_SIZE = int(np.floor(test_size/SEQ_LENGTH))
test_batches = BatchGenerator(test_text, TEST_BATCH_SIZE, SEQ_LENGTH, vocabulary_size)
# Get single batch! 
test_batch = test_batches.next()


11
(100, 59)


### Helper function defining the GRU cell

In [8]:
  # Definition of the cell computation.
  def gru_cell(input_, prev_output_, name_):
    """Create a GRU cell"""
    with tf.name_scope(name_):        
        # Concatenate input with previous_output.
        x_prev_h = tf.concat([input_, prev_output_], 1)
        
        # Calculate update and reset gates activations.
        update_gate = tf.sigmoid(tf.matmul(x_prev_h, Wz) + bz, name="Update_gate")
        reset_gate = tf.sigmoid(tf.matmul(x_prev_h, Wr) + br, name="Reset_gate")

        # Calculate the update.
        x_gated_prev_h = tf.concat([input_, reset_gate*prev_output_], 1)
        update = tf.tanh(tf.matmul(x_gated_prev_h, Wh) + bh, name="Update")
        # New cell state C.
        output = tf.add(update_gate * prev_output_, (1 - update_gate) * update, name = "Output")
        
    return output


###  Definition of tensor graph

In [9]:
# Reset graph - just in case.
tf.reset_default_graph()

# 0. Shared variables ops.
with tf.name_scope("Shared_Variables"):
  # Define parameters:
  # Update gate params: input, previous output, and bias.
  Wz = tf.Variable(tf.truncated_normal([vocabulary_size+HIDDEN_SIZE, 1], -0.1, 0.1), name="Wz")
  bz = tf.Variable(tf.zeros([1, 1]), name="bz")

  # Forget gate params: input, previous output, and bias.
  Wr = tf.Variable(tf.truncated_normal([vocabulary_size+HIDDEN_SIZE, 1], -0.1, 0.1), name="Wf")
  br = tf.Variable(tf.zeros([1, HIDDEN_SIZE]), name="bf")

  # Staate update params.                             
  Wh = tf.Variable(tf.truncated_normal([vocabulary_size+HIDDEN_SIZE, HIDDEN_SIZE], -0.1, 0.1), name="Wh")
  bh = tf.Variable(tf.zeros([1, HIDDEN_SIZE]), name="bh")

  # Classifier weights and biases.
  w = tf.Variable(tf.truncated_normal([HIDDEN_SIZE, vocabulary_size], -0.1, 0.1), name="w")
  b = tf.Variable(tf.zeros([vocabulary_size]), name="b")

  # Placeholders for previous (the oldest) state and output.
  prev_output = tf.placeholder(tf.float32, shape=None, name="prev_output")

# 0. Placeholders for inputs.
with tf.name_scope("Input_data"):
  # Define input data buffers.
  input_buffer = list()
  for _ in range(SEQ_LENGTH + 1):
    # Collect placeholders for inputs/labels.
    input_buffer.append(tf.placeholder(tf.float32, shape=None, name="Input_data"))
  print ("input_buffer shape =", input_buffer[0].shape)
  # Collection of training inputs.
  train_inputs = input_buffer[:SEQ_LENGTH]
  # Labels are pointing to the same placeholders!
  # Labels are inputs shifted by one time step.
  train_labels = input_buffer[1:]  
  print ("Seq length  =", len(train_inputs))
  print ("Batch shape =", train_inputs[0].shape)
  # Concatenate targets into 2D tensor.
  targets = tf.concat(train_labels, 0)

 # 2. Training ops.
with tf.name_scope("GRU"):
  # Unrolled GRU loop.
  # Build outpus of size SEQ_LENGTH.
  outputs = list()
  output = prev_output
  for i in train_inputs:
    output = gru_cell(i, output, "cell")
    outputs.append(output)
  print (len(outputs))
  print (outputs[0].shape)
  print (tf.concat(outputs, 0).shape)

# Fully connected layer on top => classification.
# In fact we will create lots of FC layers (one for each output layer), with shared weights.
logits = tf.nn.xw_plus_b(tf.concat(outputs, 0), w, b, name = "Final_FC")

# 2. Loss ops.
with tf.name_scope("Loss"):
    # Loss function(s) - one for every output generated by every lstm cell.
    loss = tf.reduce_mean(
      tf.nn.softmax_cross_entropy_with_logits(
        labels=targets, logits=logits))
    # Add loss summary.
    loss_summary = tf.summary.scalar("loss", loss)

# 3. Training ops.  
with tf.name_scope("Optimization"):
    # Learning rate decay.
    global_step = tf.Variable(0)
    learning_rate = tf.train.exponential_decay(0.1, global_step, 5000, 0.9, staircase=True)
    # Optimizer.
    optimizer = tf.train.AdamOptimizer(learning_rate)
    gradients, v = zip(*optimizer.compute_gradients(loss))
    # Gradient clipping.
    gradients, _ = tf.clip_by_global_norm(gradients, 1.25)
    optimizer = optimizer.apply_gradients(zip(gradients, v), global_step=global_step)

# 4. Predictions ops.  
with tf.name_scope("Evaluation") as scope:
  # Predictions.
  train_prediction = tf.nn.softmax(logits)


input_buffer shape = <unknown>
Seq length  = 10
Batch shape = <unknown>
10
<unknown>
<unknown>


### Subgraph responsible for generation of sample texts, char by char.

In [10]:
with tf.name_scope("Sample_generation") as scope:
  # Create graphs for sampling and validation evaluation: batch 1, "no unrolling".
  sample_input = tf.placeholder(tf.float32, shape=[1, vocabulary_size], name="Input_data")
  saved_sample_output = tf.Variable(tf.zeros([1, HIDDEN_SIZE]), name="Output_data")

  # Node responsible for resetting the state and output.
  reset_sample_state = tf.group(
      saved_sample_output.assign(tf.zeros([1, HIDDEN_SIZE])))
  # Single LSTM cell.
  sample_output =gru_cell(sample_input, saved_sample_output, "cell")
  # Output depends on the hidden state.
  with tf.control_dependencies([saved_sample_output.assign(sample_output)]):
    sample_prediction = tf.nn.softmax(tf.nn.xw_plus_b(sample_output, w, b, name="logits"), name="outputs")

# Merge all summaries.
merged_summaries = tf.summary.merge_all()

# 4. Init global variable.
init = tf.global_variables_initializer() 

### Helper functions for language generation (letter sampling etc). 

In [11]:

def sample_distribution(distribution):
  """Sample one element from a distribution assumed to be an array of normalized
  probabilities.
  """
  r = random.uniform(0, 1)
  s = 0
  for i in range(len(distribution)):
    s += distribution[i]
    if s >= r:
      return i
  return len(distribution) - 1

def sample(prediction):
  """Turn a (column) prediction into 1-hot encoded samples."""
  p = np.zeros(shape=[1, vocabulary_size], dtype=np.float)
  p[0, sample_distribution(prediction[0])] = 1.0
  return p

def random_distribution():
  """Generate a random column of probabilities."""
  b = np.random.uniform(0.0, 1.0, size=[1, vocabulary_size])
  return b/np.sum(b, 1)[:,None]

In [12]:
def create_feed_dict(dataset):
  """Creates a dictionaries for different sets: maps data onto Tensor placeholders."""
  feed_dict = dict()
  if dataset=="train":
    # Get next batch and create a feed dict.
    next_batch = train_batches.next()
    for i in range(SEQ_LENGTH + 1):
        feed_dict[input_buffer[i]] = next_batch[i]
    # Reset previous state and output
    feed_dict[prev_output] = np.zeros([BATCH_SIZE, HIDDEN_SIZE])
        
  elif dataset=="valid":
    for i in range(SEQ_LENGTH + 1):
        feed_dict[input_buffer[i]] = valid_batch[i]
    # Reset previous state and output
    feed_dict[prev_output] = np.zeros([VALID_BATCH_SIZE, HIDDEN_SIZE])
    
  else: # test
    for i in range(SEQ_LENGTH + 1):
        feed_dict[input_buffer[i]] = test_batch[i]
    # Reset previous state and output
    feed_dict[prev_output] = np.zeros([TEST_BATCH_SIZE, HIDDEN_SIZE])
    
  return feed_dict # {prev_output: train_output_zeros, prev_state: train_state_zeros }

### Session execution

In [13]:
# Eventually clear the log dir.
if tf.gfile.Exists(LOG_DIR):
  tf.gfile.DeleteRecursively(LOG_DIR)
# Create (new) log dir.
tf.gfile.MakeDirs(LOG_DIR)

In [14]:
# How often the test loss on validation batch will be computed. 
summary_frequency = 100

# Create session.
sess = tf.InteractiveSession()
# Create summary writers, point them to LOG_DIR.
train_writer = tf.summary.FileWriter(LOG_DIR + '/train', sess.graph)
valid_writer = tf.summary.FileWriter(LOG_DIR + '/valid')
test_writer = tf.summary.FileWriter(LOG_DIR + '/test')

# Initialize global variables.
tf.global_variables_initializer().run()
print('Initialized')

num_steps =  train_size // (BATCH_SIZE*SEQ_LENGTH) #70001
print("Number of iterations per epoch =", num_steps)
for step in range(num_steps):
    # Run training graph.
    batch = train_batches.next()
    summary, _, t_loss, lr = sess.run([merged_summaries, optimizer, loss, learning_rate], 
                                      feed_dict=create_feed_dict("train"))
    # Add summary.
    train_writer.add_summary(summary, step*BATCH_SIZE*SEQ_LENGTH)
    train_writer.flush()

    # Every (100) steps collect statistics.
    if step % summary_frequency == 0:
      # Print loss from last batch.
      print('Training set BPC at step %d: %0.5f learning rate: %f' % (step, t_loss, lr))
    
      if step % (summary_frequency * 10) == 0:
        # Generate sample text...
        print('=' * 80)
        # consisting of 5 lines...
        for _ in range(5):
          feed = sample(random_distribution())
          sentence = characters(feed)[0]
          # Reset LSTM hidden state.
          reset_sample_state.run()
          # with 79 characters in each.
          for _ in range(79):
            prediction = sample_prediction.eval({sample_input: feed})
            feed = sample(prediction)
            sentence += characters(feed)[0]
          print(sentence)
        print('=' * 80)
        
        # Validation set BPC.
        v_summary, v_loss = sess.run([merged_summaries, loss], feed_dict=create_feed_dict("valid"))
        print("Validation set BPC: %.5f" % v_loss)
        valid_writer.add_summary(v_summary, step*BATCH_SIZE*SEQ_LENGTH)
        valid_writer.flush()
    # End of statistics collection

# Test set BPC.
print("Calculating BPC on test dataset")
t_summary, t_loss = sess.run([merged_summaries, loss], feed_dict=create_feed_dict("test"))
print("Final test set BPC: %.5f" % t_loss)
test_writer.add_summary(t_summary, step*BATCH_SIZE*SEQ_LENGTH)
test_writer.flush()
    
# Close writers and session.
train_writer.close()
valid_writer.close()
test_writer.close()
sess.close()

Initialized
Number of iterations per epoch = 5101
Training set BPC at step 0: 4.08848 learning rate: 0.100000
SzAjHXjT^]jv[ivmvCGpimqdo[jv_kAkzmrNBjtSeDk^ddpzkYGsdkBJFEgdgTlgroQF NGZvkDTruVa
hNZo_b\oC_jSVdNjbxrgHscsWhwhy`GRlpPztdiUiroplfswNHJ sBdumBo LADqsHf\orsRJyHu gx^
QrnaIhhhVZofiUnG`ZkSkLEo]jTa Qe_[y]rUKoA^IhSveQnLMUqBoaJnraVAUoBfpjSXRocmqm]iKZe
dmmGdxStiUswiG nouwNMdzVJ\t_LpvSTfHdCBu oMhwTpcpLCmtrEdMnCoIxeLJgCh_JumiyEcNGLfH
UoiFqsrdye`\\YWCElrAuJyeXCCxW  MyXK]JtwwN e]uYESoFGc_zHPTwKbxswagUcsedmJnKyOKG_m
Validation set BPC: 3.70872
Training set BPC at step 100: 2.36819 learning rate: 0.100000
Training set BPC at step 200: 2.17418 learning rate: 0.100000
Training set BPC at step 300: 1.99862 learning rate: 0.100000
Training set BPC at step 400: 2.03115 learning rate: 0.100000
Training set BPC at step 500: 1.95634 learning rate: 0.100000
Training set BPC at step 600: 1.97915 learning rate: 0.100000
Training set BPC at step 700: 1.98389 learning rate: 0.100000
Training set BPC at ste