In [15]:
from __future__ import print_function
import os

import numpy as np
import zipfile
import tarfile
from six.moves.urllib.request import urlretrieve
import shutil 
import random

import string
import tensorflow as tf

# Dirs - must be absolute paths!
# Directory used for the TensorFlow summary writers in the session cell.
LOG_DIR = '/tmp/tf/ptb_char_lstm/'
# Local dir where PTB files will be stored. File paths are built by plain
# string concatenation (e.g. in read_data), so keep the trailing slash.
PTB_DIR = '/home/tkornuta/data/ptb/'

# Filenames.
TRAIN = "ptb.train.txt"
VALID = "ptb.valid.txt"
TEST = "ptb.test.txt"

# Size of the LSTM hidden state (number of units of the single recurrent layer).
HIDDEN_SIZE = 64

# Number of sequences processed in parallel in one training batch.
BATCH_SIZE = 10

# Number of characters the LSTM is unrolled over in one training step.
SEQ_LENGTH = 20


### Check/maybe download PTB.

In [2]:
def maybe_download_ptb(path,
                       filename='simple-examples.tgz',
                       url='http://www.fit.vutbr.cz/~imikolov/rnnlm/',
                       expected_bytes=34869662):
  """Download the PTB archive if not present, and make sure it's the right size.

  Args:
    path: local directory in which the archive is stored; created if missing.
      A trailing slash is optional.
    filename: name of the archive; appended to `url` for the download and
      joined with `path` for the local copy.
    url: base URL the archive is downloaded from.
    expected_bytes: expected size of the archive in bytes.

  Returns:
    `filename`, i.e. the bare archive name (not the full local path).

  Raises:
    Exception: if the size of the local file differs from `expected_bytes`.
  """
  # Eventually create the PTB dir.
  if not os.path.exists(path):
    os.makedirs(path)
  # os.path.join works with and without a trailing separator in `path`.
  local_path = os.path.join(path, filename)
  if not os.path.exists(local_path):
    print('Downloading %s...' % filename)
    local_path, _ = urlretrieve(url + filename, local_path)
  actual_bytes = os.stat(local_path).st_size
  if actual_bytes != expected_bytes:
    print(actual_bytes)
    raise Exception(
      'Failed to verify ' + local_path + '. Can you get to it with a browser?')
  print('Found and verified', (local_path), '(', actual_bytes, ')')
  return filename

# Fetch (or reuse) the PTB archive in PTB_DIR; the call returns the bare archive name.
filename = maybe_download_ptb(PTB_DIR)

Found and verified /home/tkornuta/data/ptb/simple-examples.tgz ( 34869662 )


### Extract dataset-related files from the PTB archive.

In [3]:
def extract_ptb(path, filename='simple-examples.tgz',
                files=("ptb.train.txt", "ptb.valid.txt", "ptb.test.txt",
                       "ptb.char.train.txt", "ptb.char.valid.txt", "ptb.char.test.txt")):
    """Extracts files from PTB archive.

    The archive `filename` located in `path` is unpacked into `path`, the
    requested data files are copied from `simple-examples/data/` directly into
    `path`, and the unpacked `simple-examples/` directory is removed again.

    Args:
        path: directory containing the archive; also the destination of the
            copied files. A trailing slash is optional.
        filename: name of the archive inside `path`.
        files: names of the files to copy out of `simple-examples/data/`.
    """
    archive_path = os.path.join(path, filename)
    extracted_root = os.path.join(path, "simple-examples")
    # Extract. The context manager closes the archive even if extraction fails.
    # NOTE(review): extractall() trusts the member paths stored in the archive;
    # only use it with an archive from a trusted source.
    with tarfile.open(archive_path) as tar:
        tar.extractall(path)
    # Copy files (relative to `path`, not to a global directory).
    for name in files:
        shutil.copyfile(os.path.join(extracted_root, "data", name),
                        os.path.join(path, name))
    # Delete directory
    shutil.rmtree(extracted_root)

# Unpack the archive and copy the PTB data files directly into PTB_DIR.
extract_ptb(PTB_DIR)

### Load train, valid and test texts.

In [4]:
def read_data(filename, path):
    """Return the whole content of the text file `path + filename` as one string.

    `path` and `filename` are concatenated as-is, so `path` has to end with a
    path separator. Newlines are kept in the returned text.
    """
    full_path = path + filename
    with open(full_path, 'r') as text_file:
        return text_file.read()

# Training split: keep the raw text and its length in characters, show the beginning.
train_text = read_data(TRAIN, PTB_DIR)
train_size=len(train_text)
print(train_size, train_text[:100])

# Validation split.
valid_text = read_data(VALID, PTB_DIR)
valid_size=len(valid_text)
print(valid_size, valid_text[:64])

# Test split.
test_text = read_data(TEST, PTB_DIR)
test_size=len(test_text)
print(test_size, test_text[:64])

5101618  aer banknote berlitz calloway centrust cluett fromstein gitano guterman hydro-quebec ipo kia memote
399782  consumers may want to move their telephones a little closer to 
449945  no it was n't black monday 
 but while the new york stock excha


### Utility functions to map characters to vocabulary IDs and back.

In [5]:
# Id layout: 0 is shared by the space and every other non-letter character;
# ids 1..58 cover the ASCII range 'A'..'z' (26 upper-case letters, the 6 symbols
# between 'Z' and 'a', 26 lower-case letters). char2id only ever produces the
# letter ids, but id2char can turn the in-between ids back into those symbols.
vocabulary_size = 59 # 1 (space/other) + 58 ids for the range 'A'..'z'
first_letter = ord(string.ascii_uppercase[0]) # ord('A'); upper-case letters precede lower-case ones in ASCII
print("vocabulary size = ", vocabulary_size)
print(first_letter)

def char2id(char):
  """Map a character to its vocabulary id (int).

  ASCII letters are mapped to ``ord(char) - first_letter + 1``. The space
  character and every other (unexpected) character are mapped to id 0.
  """
  if char not in string.ascii_letters:
    # Space and all unexpected characters share id 0.
    return 0
  return ord(char) - first_letter + 1
  
def id2char(dictid):
  """Map a single vocabulary id (int) back to its character.

  Positive ids are mapped to ``chr(dictid + first_letter - 1)``; any other id
  yields a space.
  """
  return chr(dictid + first_letter - 1) if dictid > 0 else ' '

def characters(probabilities):
  """Return the most likely character for every row of `probabilities`.

  `probabilities` holds one 1-hot encoding or probability distribution over the
  vocabulary per row; the result is a list with one character per row.
  """
  most_likely_ids = np.argmax(probabilities, 1)
  return list(map(id2char, most_likely_ids))

def batches2string(batches):
  """Turn a sequence of letter batches into one string per batch row.

  Every element of `batches` encodes one character per row; the most likely
  characters of row k of all elements are concatenated into the k-th string.
  """
  strings = [''] * batches[0].shape[0]
  for letter_batch in batches:
    strings = [prefix + char
               for prefix, char in zip(strings, characters(letter_batch))]
  return strings

#print(len(string.punctuation))
#for i in string.ascii_letters:
#    print (i, char2id(i))


# Sanity checks of the character <-> id mapping (letters, space, unexpected character).
print(char2id('a'), char2id('A'), char2id('z'), char2id('Z'), char2id(' '), char2id('ï'))
print(id2char(char2id('a')), id2char(char2id('A')))
# One-hot row for the space character (id 0).
# The builtin `float` replaces the `np.float` alias, which was removed in NumPy 1.24.
sample = np.zeros(shape=(1, vocabulary_size), dtype=float)
sample[0, char2id(' ')] = 1.0
print(sample)

vocabulary size =  59
65
33 1 58 26 0 0
a A
[[ 1.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
   0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
   0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
   0.  0.  0.  0.  0.]]


### Helper class for batch generation

In [6]:
class BatchGenerator(object):
  """Generates batches of one-hot encoded character sequences from a text.

  The text is divided into `batch_size` equally long segments; every batch row
  walks through its own segment (wrapping around at the end of the text).
  """

  def __init__(self, text, batch_size, seq_length, vocab_size):
    """
    Initializes the batch generator object. Stores the variables and first "letter batch".
    text is text to be processed
    batch_size is size of batch (number of samples)
    seq_length represents the length of sequence
    vocab_size is the size of the vocabulary (assumes one-hot encoding)

    Raises ValueError if text is empty.
    """
    if len(text) == 0:
      # Without this check the failure would be an IndexError raised deep inside
      # _next_letter_batch.
      raise ValueError("text must not be empty")
    # Store input parameters.
    self._text = text
    self._text_size = len(text)
    self._batch_size = batch_size
    self._seq_length = seq_length
    self._vocab_size = vocab_size
    # Divide text into segments depending on number of batches, each segment determines a cursor position for a batch.
    segment = self._text_size // batch_size
    # Set initial cursor position.
    self._cursor = [offset * segment for offset in range(batch_size)]
    # Store first "letter batch".
    self._last_letter_batch = self._next_letter_batch()

  def _next_letter_batch(self):
    """
    Returns a batch of encoded single letters depending on the current batch
    cursor positions in the data, and advances every cursor by one character.
    Returned "letter batch" is of size batch_size x vocab_size
    """
    # The builtin `float` replaces the `np.float` alias, which was removed in NumPy 1.24.
    letter_batch = np.zeros(shape=(self._batch_size, self._vocab_size), dtype=float)
    # Iterate through "samples"
    for b in range(self._batch_size):
      # Set 1 in position pointed out by one-hot char encoding.
      letter_batch[b, char2id(self._text[self._cursor[b]])] = 1.0
      # Wrap around at the end of the text.
      self._cursor[b] = (self._cursor[b] + 1) % self._text_size
    return letter_batch

  def next(self):
    """Generate the next list of letter batches from the data. The list consists of
    the last letter batch of the previous list, followed by seq_length new ones
    (seq_length + 1 elements in total).
    """
    # First add last letter from previous batch (the "additional one").
    batches = [self._last_letter_batch]
    for _ in range(self._seq_length):
      batches.append(self._next_letter_batch())
    # Store last "letter batch" for next batch.
    self._last_letter_batch = batches[-1]
    return batches


In [7]:
# Optional debugging trick (disabled): overwrite the first characters of the
# training text with 'z' so that the first batch is easy to recognise.
#list1 = list(train_text)
#for i in range(2):
#    list1[i] = 'z'
#train_text = ''.join(list1)
# Shows the beginning of the training text (the printed label says "Test set").
print("Test set =", train_text[0:100])

# Create two objects for training and validation batch generation.
# The validation generator uses batch size 1 and sequence length 1.
train_batches = BatchGenerator(train_text, BATCH_SIZE, SEQ_LENGTH, vocabulary_size)
valid_batches = BatchGenerator(valid_text, 1, 1, vocabulary_size)

# Draw two consecutive training batches and display the second one as text.
# NOTE: both calls advance train_batches, so later calls continue after these two batches.
batch = train_batches.next()
batch = train_batches.next()
print(batches2string(batch))


Test set =  aer banknote berlitz calloway centrust cluett fromstein gitano guterman hydro-quebec ipo kia memote
['z calloway centrust c', ' league promises the ', 'ion including quotron', 'r proposal for a full', 'n china is very compl', ' december that was su', 'ogilvy group was  unk', 'lis a general electri', 've earlier this year ', 'f philip morris cos  ']


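### Helper function for calculating the loss: mean negative log-probability of the true labels. The code uses the natural logarithm, so the value is in nats per character; divide by ln 2 to obtain bits per character (BPC).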

In [8]:
def logprob(predictions, labels):
  """
  Mean negative log-probability of the true labels in a predicted batch.

  predictions and labels are arrays of the same shape with one row per sample;
  labels holds the 1-hot encoded true classes. Probabilities below 1e-10 are
  treated as 1e-10 to avoid log(0). The natural logarithm is used, so the
  result is expressed in nats (divide by ln 2 to obtain bits).

  The predictions array passed in by the caller is left unmodified.
  """
  # Clamp into a new array instead of overwriting the caller's predictions in place.
  clipped = np.maximum(predictions, 1e-10)
  # Divide by the batch size (shape[0]).
  return np.sum(np.multiply(labels, -np.log(clipped))) / labels.shape[0]


In [9]:
  # Definition of the cell computation.
  def lstm_cell(i, o, state):
    """Create a LSTM cell. See e.g.: http://arxiv.org/pdf/1402.1128v1.pdf
    Note that in this formulation, we omit the various connections between the
    previous state and the gates.

    Args:
      i: input tensor of the current step.
      o: output of the previous step.
      state: cell state of the previous step.

    Returns:
      A tuple (output, state) with the new output and the new cell state.

    The weights and biases (ix, im, ib, fx, fm, fb, cx, cm, cb, ox, om, ob) are
    not parameters; they are looked up by name when the function is called, so
    they have to be defined before the first call.
    """
    # Gates: sigmoid of an affine function of the input and the previous output.
    input_gate = tf.sigmoid(tf.matmul(i, ix) + tf.matmul(o, im) + ib)
    forget_gate = tf.sigmoid(tf.matmul(i, fx) + tf.matmul(o, fm) + fb)
    # Candidate update of the cell state (tanh is applied on the next line).
    update = tf.matmul(i, cx) + tf.matmul(o, cm) + cb
    # New state: gated previous state plus gated candidate.
    state = forget_gate * state + input_gate * tf.tanh(update)
    output_gate = tf.sigmoid(tf.matmul(i, ox) + tf.matmul(o, om) + ob)
    return output_gate * tf.tanh(state), state


###  Definition of tensor graph

In [12]:
# Reset graph - just in case.
tf.reset_default_graph()


# Build the whole model in a dedicated graph object.
graph = tf.Graph()
with graph.as_default():

  # Define input data buffers: SEQ_LENGTH + 1 placeholders, one per character
  # position, each holding a one-hot encoded letter batch.
  train_data = list()
  for _ in range(SEQ_LENGTH + 1):
    train_data.append(
      tf.placeholder(tf.float32, shape=[BATCH_SIZE,vocabulary_size]))
  train_inputs = train_data[:SEQ_LENGTH]
  train_labels = train_data[1:]  # labels are inputs shifted by one time step.

  # Define parameters:
  # NOTE(review): the positional arguments (-0.1, 0.1) of tf.truncated_normal are
  # (mean, stddev), not a (min, max) range - confirm that a mean of -0.1 is intended.
  # Input gate: input, previous output, and bias.
  ix = tf.Variable(tf.truncated_normal([vocabulary_size, HIDDEN_SIZE], -0.1, 0.1))
  im = tf.Variable(tf.truncated_normal([HIDDEN_SIZE, HIDDEN_SIZE], -0.1, 0.1))
  ib = tf.Variable(tf.zeros([1, HIDDEN_SIZE]))
  # Forget gate: input, previous output, and bias.
  fx = tf.Variable(tf.truncated_normal([vocabulary_size, HIDDEN_SIZE], -0.1, 0.1))
  fm = tf.Variable(tf.truncated_normal([HIDDEN_SIZE, HIDDEN_SIZE], -0.1, 0.1))
  fb = tf.Variable(tf.zeros([1, HIDDEN_SIZE]))
  # Memory cell: input, previous output, and bias.
  cx = tf.Variable(tf.truncated_normal([vocabulary_size, HIDDEN_SIZE], -0.1, 0.1))
  cm = tf.Variable(tf.truncated_normal([HIDDEN_SIZE, HIDDEN_SIZE], -0.1, 0.1))
  cb = tf.Variable(tf.zeros([1, HIDDEN_SIZE]))
  # Output gate: input, previous output, and bias.
  ox = tf.Variable(tf.truncated_normal([vocabulary_size, HIDDEN_SIZE], -0.1, 0.1))
  om = tf.Variable(tf.truncated_normal([HIDDEN_SIZE, HIDDEN_SIZE], -0.1, 0.1))
  ob = tf.Variable(tf.zeros([1, HIDDEN_SIZE]))
  # Variables saving state across unrollings (not trainable).
  saved_output = tf.Variable(tf.zeros([BATCH_SIZE, HIDDEN_SIZE]), trainable=False)
  saved_state = tf.Variable(tf.zeros([BATCH_SIZE, HIDDEN_SIZE]), trainable=False)
  # Classifier weights and biases.
  w = tf.Variable(tf.truncated_normal([HIDDEN_SIZE, vocabulary_size], -0.1, 0.1))
  b = tf.Variable(tf.zeros([vocabulary_size]))
  
  # Unrolled LSTM loop: one lstm_cell application per input position, starting
  # from the output/state saved by the previous training step.
  outputs = list()
  output = saved_output
  state = saved_state
  for i in train_inputs:
    output, state = lstm_cell(i, output, state)
    outputs.append(output)

  # State saving across unrollings: the assignments are control dependencies of
  # the ops created inside this block.
  with tf.control_dependencies([saved_output.assign(output), saved_state.assign(state)]):
    # Fully connected layer on top => classification.
    # The outputs of all steps are concatenated along axis 0 and share the same weights w and b.
    logits = tf.nn.xw_plus_b(tf.concat(outputs, 0), w, b)
    # Loss: mean softmax cross-entropy over all steps and batch rows.
    loss = tf.reduce_mean(
      tf.nn.softmax_cross_entropy_with_logits(
        labels=tf.concat(train_labels, 0), logits=logits))

  # Optimizer-related variables.
  global_step = tf.Variable(0)
  # Learning rate starts at 10.0 and is multiplied by 0.1 every 5000 steps (staircase).
  learning_rate = tf.train.exponential_decay(10.0, global_step, 5000, 0.1, staircase=True)
  optimizer = tf.train.GradientDescentOptimizer(learning_rate)
  # Clip the gradients by their global norm before applying them.
  gradients, v = zip(*optimizer.compute_gradients(loss))
  gradients, _ = tf.clip_by_global_norm(gradients, 1.25)
  # NOTE: from here on `optimizer` names the training op, no longer the optimizer object.
  optimizer = optimizer.apply_gradients(zip(gradients, v), global_step=global_step)

  # Predictions.
  train_prediction = tf.nn.softmax(logits)
  
  # Create graphs for sampling and validation evaluation: batch 1, "no unrolling".
  sample_input = tf.placeholder(tf.float32, shape=[1, vocabulary_size])
  saved_sample_output = tf.Variable(tf.zeros([1, HIDDEN_SIZE]))
  saved_sample_state = tf.Variable(tf.zeros([1, HIDDEN_SIZE]))
  # Op that sets the saved sampling output and state back to zeros.
  reset_sample_state = tf.group(
    saved_sample_output.assign(tf.zeros([1, HIDDEN_SIZE])),
    saved_sample_state.assign(tf.zeros([1, HIDDEN_SIZE])))
  sample_output, sample_state = lstm_cell(
    sample_input, saved_sample_output, saved_sample_state)
  # The new output/state are stored as a control dependency of the prediction op.
  with tf.control_dependencies([saved_sample_output.assign(sample_output),
                                saved_sample_state.assign(sample_state)]):
    sample_prediction = tf.nn.softmax(tf.nn.xw_plus_b(sample_output, w, b))

### Helper functions for language generation (letter sampling etc). 

In [13]:

def sample_distribution(distribution):
  """Draw the index of one element from `distribution`, which is assumed to be
  an array of normalized probabilities.

  A uniform random threshold from [0, 1] is drawn and the first index whose
  cumulative probability reaches it is returned; if no index qualifies, the
  last index is returned.
  """
  threshold = random.uniform(0, 1)
  cumulative = 0
  for index, probability in enumerate(distribution):
    cumulative += probability
    if cumulative >= threshold:
      return index
  return len(distribution) - 1

def sample(prediction):
  """Turn a prediction into a 1-hot encoded sample.

  An index is drawn with sample_distribution from the first row of
  `prediction`; the result is an array of shape [1, vocabulary_size] holding
  1.0 at that index and 0.0 elsewhere.
  """
  # The builtin `float` replaces the `np.float` alias, which was removed in NumPy 1.24.
  p = np.zeros(shape=[1, vocabulary_size], dtype=float)
  p[0, sample_distribution(prediction[0])] = 1.0
  return p

def random_distribution():
  """Generate a random probability distribution over the vocabulary.

  Returns an array of shape [1, vocabulary_size] with uniformly drawn values,
  normalized so that the row sums to one.
  """
  raw = np.random.uniform(0.0, 1.0, size=[1, vocabulary_size])
  row_totals = raw.sum(axis=1, keepdims=True)
  return raw / row_totals

### Session execution

In [None]:
# One pass over the training text: every step consumes BATCH_SIZE * SEQ_LENGTH characters.
num_steps =  train_size // (BATCH_SIZE*SEQ_LENGTH) #70001
print("Total number of iterations= ", num_steps)
# Statistics are collected every `summary_frequency` steps.
summary_frequency = 100
    
with tf.Session(graph=graph) as session:

  tf.global_variables_initializer().run()
  print('Initialized')
    
  # Merge all the summaries and create writers below LOG_DIR.
  # NOTE: `merged` and the two writers are not used further in this cell.
  merged = tf.summary.merge_all()
  train_writer = tf.summary.FileWriter(LOG_DIR + '/train', session.graph)
  valid_writer = tf.summary.FileWriter(LOG_DIR + '/valid')
    
  mean_loss = 0
  for step in range(num_steps):
    # Feed the SEQ_LENGTH + 1 letter batches into the corresponding placeholders.
    batches = train_batches.next()
    feed_dict = dict()
    for i in range(SEQ_LENGTH + 1):
      feed_dict[train_data[i]] = batches[i]
    # Run one training step (`optimizer` is the training op defined in the graph cell).
    _, l, predictions, lr = session.run([optimizer, loss, train_prediction, learning_rate], feed_dict=feed_dict)
    # Add loss to mean.
    mean_loss += l
    # Every (100) steps collect statistics.
    if step % summary_frequency == 0:
      # At step 0 only a single loss value has been accumulated, so no division.
      if step > 0:
        mean_loss = mean_loss / summary_frequency
      # The mean loss is an estimate of the loss over the last few batches.
      print('Average loss at step %d: %f learning rate: %f' % (step, mean_loss, lr))
      mean_loss = 0
      # Labels are the letter batches shifted by one position, concatenated along axis 0.
      labels = np.concatenate(list(batches)[1:])
      # NOTE: logprob uses the natural logarithm, so the value printed as "BPC" is in nats.
      print('Train set minibatch BPC: %.2f' % logprob(predictions, labels))
      if step % (summary_frequency * 10) == 0:
        # Generate some samples: 5 sentences of 80 characters, each started from a random letter.
        print('=' * 80)
        for _ in range(5):
          feed = sample(random_distribution())
          sentence = characters(feed)[0]
          reset_sample_state.run()
          for _ in range(79):
            # Feed the previously sampled letter back in as the next input.
            prediction = sample_prediction.eval({sample_input: feed})
            feed = sample(prediction)
            sentence += characters(feed)[0]
          print(sentence)
        print('=' * 80)
      # Measure validation set BPC (same note on units as above).
      reset_sample_state.run()
      mean_valid_logprob = 0
      # Sum over 1000 validation batches of size 1: b[0] is the input letter, b[1] the letter to predict.
      for _ in range(1000): #valid_size):
        b = valid_batches.next()
        predictions = sample_prediction.eval({sample_input: b[0]})
        mean_valid_logprob += logprob(predictions, b[1])
      print('Validation set BPC (1-char): %.2f' % float(mean_valid_logprob / (1000))) #valid_size)))
    # End of statistics collection
    

Total number of iterations=  25508
Initialized
Average loss at step 0: 4.069476 learning rate: 10.000000
Train set minibatch BPC: 4.07
GIbKo z  AsejRV[aojdwZhbkPV_a\_AkLhvIwMBjBrrOWQeTJJr_cZntDH fU[ MaBp ejdAodUteqr
DsawKioYa`dLFaa YOAOHzz_yre]ic rxcw`nnoKeUp wge klF_da es\at_ ^[ez WoqOXeZGcnE c
^t\t\ leiQsHVd_UbfnAm rTc ufq iEMnNR e e  s tyWmwVbzde n  k C K`Mtu[UzufwBM`iqTN
wmo vIdW gduQsccR [uv Kors\rR WoXZR  SWJ sspa`usD p\nfIRB_etROs vYv CrfoIL`\gG  
y hrG tUDzNNC[HNw^JzosNeMpMaxLT`^KzSdQQe^c\beA N DtojuLBio   zbHo[WEOAgIYoIV` ZV
Validation set BPC (1-char): 3.44
Average loss at step 100: 2.750074 learning rate: 10.000000
Train set minibatch BPC: 2.66
Validation set BPC (1-char): 2.42
Average loss at step 200: 2.277129 learning rate: 10.000000
Train set minibatch BPC: 2.24
Validation set BPC (1-char): 2.18
Average loss at step 300: 2.155371 learning rate: 10.000000
Train set minibatch BPC: 2.27
Validation set BPC (1-char): 2.07
Average loss at step 400: 2.056683 learning rate: 10.0

Validation set BPC (1-char): 1.63
Average loss at step 4300: 1.528672 learning rate: 10.000000
Train set minibatch BPC: 1.72
Validation set BPC (1-char): 1.56
Average loss at step 4400: 1.499070 learning rate: 10.000000
Train set minibatch BPC: 1.50
Validation set BPC (1-char): 1.62
Average loss at step 4500: 1.526216 learning rate: 10.000000
Train set minibatch BPC: 1.45
Validation set BPC (1-char): 1.74
Average loss at step 4600: 1.526227 learning rate: 10.000000
Train set minibatch BPC: 1.36
Validation set BPC (1-char): 1.79
Average loss at step 4700: 1.516036 learning rate: 10.000000
Train set minibatch BPC: 1.47
Validation set BPC (1-char): 1.91
Average loss at step 4800: 1.506119 learning rate: 10.000000
Train set minibatch BPC: 1.56
Validation set BPC (1-char): 1.68
Average loss at step 4900: 1.518493 learning rate: 10.000000
Train set minibatch BPC: 1.69
Validation set BPC (1-char): 1.84
Average loss at step 5000: 1.471808 learning rate: 1.000000
Train set minibatch BPC: 1.45
`

Validation set BPC (1-char): 1.29
Average loss at step 9100: 1.400687 learning rate: 1.000000
Train set minibatch BPC: 1.67
Validation set BPC (1-char): 1.23
Average loss at step 9200: 1.447746 learning rate: 1.000000
Train set minibatch BPC: 1.48
Validation set BPC (1-char): 1.31
Average loss at step 9300: 1.423694 learning rate: 1.000000
Train set minibatch BPC: 1.61
Validation set BPC (1-char): 1.49
Average loss at step 9400: 1.443195 learning rate: 1.000000
Train set minibatch BPC: 1.40
Validation set BPC (1-char): 1.38
