In [1]:
from __future__ import print_function
import os

import numpy as np
import zipfile
import tarfile
from six.moves.urllib.request import urlretrieve
import shutil 
import random

import string
import tensorflow as tf

# Local dir where PTB files will be stored.
PTB_DIR = '/home/tkornuta/data/ptb/'

# Filenames.
TRAIN = "ptb.train.txt"
VALID = "ptb.valid.txt"
TEST = "ptb.test.txt"


### Check/maybe download PTB.

In [2]:
def maybe_download_ptb(path, 
                       filename='simple-examples.tgz', 
                       url='http://www.fit.vutbr.cz/~imikolov/rnnlm/', 
                       expected_bytes =34869662):
  # Eventually create the PTB dir.
  if not tf.gfile.Exists(path):
    tf.gfile.MakeDirs(path)
  """Download a file if not present, and make sure it's the right size."""
  _filename = path+filename
  if not os.path.exists(_filename):
    print('Downloading %s...' % filename)
    _filename, _ = urlretrieve(url+filename, _filename)
  statinfo = os.stat(_filename)
  if statinfo.st_size == expected_bytes:
    print('Found and verified', (_filename), '(', statinfo.st_size, ')')
  else:
    print(statinfo.st_size)
    raise Exception(
      'Failed to verify ' + _filename + '. Can you get to it with a browser?')
  return filename

filename = maybe_download_ptb(PTB_DIR)

Found and verified /home/tkornuta/data/ptb/simple-examples.tgz ( 34869662 )


### Extract dataset-related files from the PTB archive.

In [3]:
def extract_ptb(path, filename='simple-examples.tgz', files=["ptb.train.txt", "ptb.valid.txt", "ptb.test.txt", 
                                       "ptb.char.train.txt", "ptb.char.valid.txt", "ptb.char.test.txt"]):
    """Extracts files from PTB archive."""
    # Extract
    tar = tarfile.open(path+filename)
    tar.extractall(path)
    tar.close()
    # Copy files
    for file in files:
        shutil.copyfile(PTB_DIR+"simple-examples/data/"+file, PTB_DIR+file)
    # Delete directory
    shutil.rmtree(PTB_DIR+"simple-examples/")        

extract_ptb(PTB_DIR)
 

### Load train, valid and test texts.

In [4]:
def read_data(filename, path):
    with open(path+filename, 'r') as myfile:
        data=myfile.read()# .replace('\n', '')
        return data

train_text = read_data(TRAIN, PTB_DIR)
train_size=len(train_text)
print(train_size, train_text[:64])

valid_text = read_data(VALID, PTB_DIR)
valid_size=len(valid_text)
print(valid_size, valid_text[:64])

test_text = read_data(TEST, PTB_DIR)
test_size=len(test_text)
print(test_size, test_text[:64])

5101618  aer banknote berlitz calloway centrust cluett fromstein gitano 
399782  consumers may want to move their telephones a little closer to 
449945  no it was n't black monday 
 but while the new york stock excha


### Utility functions to map characters to vocabulary IDs and back.

In [5]:
vocabulary_size = 59 # [A-Z] + [a-z] + ' ' +few 'in between; + punctuation
first_letter = ord(string.ascii_uppercase[0]) # ascii_uppercase before lowercase! 
print(vocabulary_size)
print(first_letter)

def char2id(char):
  """ Converts char to id (int) with chandling of unexpected characters"""
  if char in string.ascii_letters:# or char in string.punctuation or char in string.digits:
    return ord(char) - first_letter + 1
  elif char == ' ':
    return 0
  else:
    # print('Unexpected character: %s' % char)
    return 0
  
def id2char(dictid):
  """ Converts single id (int) to character"""
  if dictid > 0:
    return chr(dictid + first_letter - 1)
  else:
    return ' '

def characters(probabilities):
  """Turn a 1-hot encoding or a probability distribution over the possible
  characters back into its (most likely) character representation."""
  return [id2char(c) for c in np.argmax(probabilities, 1)]

def batches2string(batches):
  """Convert a sequence of batches back into their (most likely) string
  representation."""
  s = [''] * batches[0].shape[0]
  for b in batches:
    s = [''.join(x) for x in zip(s, characters(b))]
  return s

#print(len(string.punctuation))
#for i in string.ascii_letters:
#    print (i, char2id(i))


print(char2id('a'), char2id('A'), char2id('z'), char2id('Z'), char2id(' '), char2id('ï'))
print(id2char(char2id('a')), id2char(char2id('A')))
#print(id2char(65), id2char(33), id2char(90), id2char(58), id2char(0))

59
65
33 1 58 26 0 0
a A


### Helper class for batch generation

In [6]:
batch_size=64
num_unrollings=100

class BatchGenerator(object):
  def __init__(self, text, batch_size, num_unrollings):
    self._text = text
    self._text_size = len(text)
    self._batch_size = batch_size
    self._num_unrollings = num_unrollings
    segment = self._text_size // batch_size
    self._cursor = [ offset * segment for offset in range(batch_size)]
    self._last_batch = self._next_batch()
  
  def _next_batch(self):
    """Generate a single batch from the current cursor position in the data."""
    batch = np.zeros(shape=(self._batch_size, vocabulary_size), dtype=np.float)
    for b in range(self._batch_size):
      batch[b, char2id(self._text[self._cursor[b]])] = 1.0
      self._cursor[b] = (self._cursor[b] + 1) % self._text_size
    return batch
  
  def next(self):
    """Generate the next array of batches from the data. The array consists of
    the last batch of the previous array, followed by num_unrollings new ones.
    """
    batches = [self._last_batch]
    for step in range(self._num_unrollings):
      batches.append(self._next_batch())
    self._last_batch = batches[-1]
    return batches

# Create two objects for training and validation batch generation.
train_batches = BatchGenerator(train_text, batch_size, num_unrollings)
valid_batches = BatchGenerator(valid_text, 1, 1)

batch = train_batches.next()
#print(batch)
print(batches2string(batch))


[' aer banknote berlitz calloway centrust cluett fromstein gitano guterman hydro quebec ipo kia memotec', 'as the six members of the association of southeast asian nations thailand malaysia singapore indonesi', 'ack said he is pleased with the economy  s recent performance and does n t see a lot of excesses out ', 'significant recent experience with a similar program in central america indicates that it could take ', 'age does n t cause volatility it  unk  to it   think about what causes the difference in prices betwe', 'rk stock exchange composite trading yesterday intelogic shares rose N cents to close at   N   mr  ede', 's goodman theatre  unk   unk  take the stage in  unk  city leisure   arts the role of  unk  played by', 'rs who are up to N years old and then for another N days if the company institutes a specific trainin', '  unk  taste  unk  says hispanics prefer the new brand has a  unk  content of N N   that compares wit', ' university of kentucky a team led by dean  unk  a ph

In [7]:
def logprob(predictions, labels):
  """Log-probability of the true labels in a predicted batch."""
  predictions[predictions < 1e-10] = 1e-10
  return np.sum(np.multiply(labels, -np.log(predictions))) / labels.shape[0]

def sample_distribution(distribution):
  """Sample one element from a distribution assumed to be an array of normalized
  probabilities.
  """
  r = random.uniform(0, 1)
  s = 0
  for i in range(len(distribution)):
    s += distribution[i]
    if s >= r:
      return i
  return len(distribution) - 1

def sample(prediction):
  """Turn a (column) prediction into 1-hot encoded samples."""
  p = np.zeros(shape=[1, vocabulary_size], dtype=np.float)
  p[0, sample_distribution(prediction[0])] = 1.0
  return p

def random_distribution():
  """Generate a random column of probabilities."""
  b = np.random.uniform(0.0, 1.0, size=[1, vocabulary_size])
  return b/np.sum(b, 1)[:,None]

In [None]:
num_nodes = 64

graph = tf.Graph()
with graph.as_default():
  
  # Parameters:
  # Input gate: input, previous output, and bias.
  ix = tf.Variable(tf.truncated_normal([vocabulary_size, num_nodes], -0.1, 0.1))
  im = tf.Variable(tf.truncated_normal([num_nodes, num_nodes], -0.1, 0.1))
  ib = tf.Variable(tf.zeros([1, num_nodes]))
  # Forget gate: input, previous output, and bias.
  fx = tf.Variable(tf.truncated_normal([vocabulary_size, num_nodes], -0.1, 0.1))
  fm = tf.Variable(tf.truncated_normal([num_nodes, num_nodes], -0.1, 0.1))
  fb = tf.Variable(tf.zeros([1, num_nodes]))
  # Memory cell: input, state and bias.                             
  cx = tf.Variable(tf.truncated_normal([vocabulary_size, num_nodes], -0.1, 0.1))
  cm = tf.Variable(tf.truncated_normal([num_nodes, num_nodes], -0.1, 0.1))
  cb = tf.Variable(tf.zeros([1, num_nodes]))
  # Output gate: input, previous output, and bias.
  ox = tf.Variable(tf.truncated_normal([vocabulary_size, num_nodes], -0.1, 0.1))
  om = tf.Variable(tf.truncated_normal([num_nodes, num_nodes], -0.1, 0.1))
  ob = tf.Variable(tf.zeros([1, num_nodes]))
  # Variables saving state across unrollings.
  saved_output = tf.Variable(tf.zeros([batch_size, num_nodes]), trainable=False)
  saved_state = tf.Variable(tf.zeros([batch_size, num_nodes]), trainable=False)
  # Classifier weights and biases.
  w = tf.Variable(tf.truncated_normal([num_nodes, vocabulary_size], -0.1, 0.1))
  b = tf.Variable(tf.zeros([vocabulary_size]))
  
  # Definition of the cell computation.
  def lstm_cell(i, o, state):
    """Create a LSTM cell. See e.g.: http://arxiv.org/pdf/1402.1128v1.pdf
    Note that in this formulation, we omit the various connections between the
    previous state and the gates."""
    input_gate = tf.sigmoid(tf.matmul(i, ix) + tf.matmul(o, im) + ib)
    forget_gate = tf.sigmoid(tf.matmul(i, fx) + tf.matmul(o, fm) + fb)
    update = tf.matmul(i, cx) + tf.matmul(o, cm) + cb
    state = forget_gate * state + input_gate * tf.tanh(update)
    output_gate = tf.sigmoid(tf.matmul(i, ox) + tf.matmul(o, om) + ob)
    return output_gate * tf.tanh(state), state

  # Input data.
  train_data = list()
  for _ in range(num_unrollings + 1):
    train_data.append(
      tf.placeholder(tf.float32, shape=[batch_size,vocabulary_size]))
  train_inputs = train_data[:num_unrollings]
  train_labels = train_data[1:]  # labels are inputs shifted by one time step.

  # Unrolled LSTM loop.
  outputs = list()
  output = saved_output
  state = saved_state
  for i in train_inputs:
    output, state = lstm_cell(i, output, state)
    outputs.append(output)

  # State saving across unrollings.
  with tf.control_dependencies([saved_output.assign(output),
                                saved_state.assign(state)]):
    # Classifier.
    logits = tf.nn.xw_plus_b(tf.concat(outputs, 0), w, b)
    loss = tf.reduce_mean(
      tf.nn.softmax_cross_entropy_with_logits(
        labels=tf.concat(train_labels, 0), logits=logits))

  # Optimizer.
  global_step = tf.Variable(0)
  learning_rate = tf.train.exponential_decay(
    10.0, global_step, 5000, 0.1, staircase=True)
  optimizer = tf.train.GradientDescentOptimizer(learning_rate)
  gradients, v = zip(*optimizer.compute_gradients(loss))
  gradients, _ = tf.clip_by_global_norm(gradients, 1.25)
  optimizer = optimizer.apply_gradients(
    zip(gradients, v), global_step=global_step)

  # Predictions.
  train_prediction = tf.nn.softmax(logits)
  
  # Sampling and validation eval: batch 1, no unrolling.
  sample_input = tf.placeholder(tf.float32, shape=[1, vocabulary_size])
  saved_sample_output = tf.Variable(tf.zeros([1, num_nodes]))
  saved_sample_state = tf.Variable(tf.zeros([1, num_nodes]))
  reset_sample_state = tf.group(
    saved_sample_output.assign(tf.zeros([1, num_nodes])),
    saved_sample_state.assign(tf.zeros([1, num_nodes])))
  sample_output, sample_state = lstm_cell(
    sample_input, saved_sample_output, saved_sample_state)
  with tf.control_dependencies([saved_sample_output.assign(sample_output),
                                saved_sample_state.assign(sample_state)]):
    sample_prediction = tf.nn.softmax(tf.nn.xw_plus_b(sample_output, w, b))

In [None]:
num_steps = 70001
summary_frequency = 100

with tf.Session(graph=graph) as session:
  tf.global_variables_initializer().run()
  print('Initialized')
  mean_loss = 0
  for step in range(num_steps):
    batches = train_batches.next()
    feed_dict = dict()
    for i in range(num_unrollings + 1):
      feed_dict[train_data[i]] = batches[i]
    _, l, predictions, lr = session.run(
      [optimizer, loss, train_prediction, learning_rate], feed_dict=feed_dict)
    mean_loss += l
    if step % summary_frequency == 0:
      if step > 0:
        mean_loss = mean_loss / summary_frequency
      # The mean loss is an estimate of the loss over the last few batches.
      print(
        'Average loss at step %d: %f learning rate: %f' % (step, mean_loss, lr))
      mean_loss = 0
      labels = np.concatenate(list(batches)[1:])
      print('Minibatch BPC: %.2f' % float(
        np.exp(logprob(predictions, labels))))
      if step % (summary_frequency * 10) == 0:
        # Generate some samples.
        print('=' * 80)
        for _ in range(5):
          feed = sample(random_distribution())
          sentence = characters(feed)[0]
          reset_sample_state.run()
          for _ in range(79):
            prediction = sample_prediction.eval({sample_input: feed})
            feed = sample(prediction)
            sentence += characters(feed)[0]
          print(sentence)
        print('=' * 80)
      # Measure validation set perplexity.
      reset_sample_state.run()
      valid_logprob = 0
      for _ in range(1000): #valid_size):
        b = valid_batches.next()
        predictions = sample_prediction.eval({sample_input: b[0]})
        valid_logprob = valid_logprob + logprob(predictions, b[1])
      print('Validation set BPC: %.2f' % float(np.exp(
        valid_logprob / 1000)))

Initialized
Average loss at step 0: 4.079040 learning rate: 10.000000
Minibatch BPC: 59.09
gZ[iiVWQJ lh`WAs J`] P B\WzTsrG awuRcI Bn `QqfQvttyYqohL  u J U A yoVZ Ka TSqrk]
_hcRo]`oao DJ nnI]dDQbIt cy Oa[sF yabyaRPcNonLK`   ny^lOepGgn]X eUWDveXryNtuPv E
t_krO_ZExUVjj^b`vFDQ lafItXRMooXRrcoyRetnge aCc_Ja tuatfzordtSVYteoKOs  cL]mY HF
Zl CpYxhJ^[VNiK S LotNMYbYnpP _ CQ  pV fFviA WnrwaqRu\  ii`t TC t lIp` UzehMs[S\
Za   ss azufcbXMJDsF_ KA_ pj cmDElT[Rnehk]nYC_UPIi tNHiDjrC rEN Gh^ RrU_YUajI\t 
Validation set BPC: 31.28
Average loss at step 100: 2.660756 learning rate: 10.000000
Minibatch BPC: 10.37
Validation set BPC: 10.56
Average loss at step 200: 2.131332 learning rate: 10.000000
Minibatch BPC: 7.85
Validation set BPC: 7.42
Average loss at step 300: 1.939570 learning rate: 10.000000
Minibatch BPC: 6.74
Validation set BPC: 6.93
Average loss at step 400: 1.820804 learning rate: 10.000000
Minibatch BPC: 6.18
Validation set BPC: 6.52
Average loss at step 500: 1.745067 learning rate: 10.000

Validation set BPC: 4.43
Average loss at step 5100: 1.308180 learning rate: 1.000000
Minibatch BPC: 3.70
Validation set BPC: 4.65
Average loss at step 5200: 1.307375 learning rate: 1.000000
Minibatch BPC: 3.72
Validation set BPC: 4.59
Average loss at step 5300: 1.304303 learning rate: 1.000000
Minibatch BPC: 3.72
Validation set BPC: 3.89
Average loss at step 5400: 1.301745 learning rate: 1.000000
Minibatch BPC: 3.67
Validation set BPC: 4.21
Average loss at step 5500: 1.306507 learning rate: 1.000000
Minibatch BPC: 3.71
Validation set BPC: 3.69
Average loss at step 5600: 1.301749 learning rate: 1.000000
Minibatch BPC: 3.69
Validation set BPC: 3.88
Average loss at step 5700: 1.305911 learning rate: 1.000000
Minibatch BPC: 3.71
Validation set BPC: 4.66
Average loss at step 5800: 1.300741 learning rate: 1.000000
Minibatch BPC: 3.65
Validation set BPC: 4.65
Average loss at step 5900: 1.297149 learning rate: 1.000000
Minibatch BPC: 3.64
Validation set BPC: 4.12
Average loss at step 6000: 1.3

Average loss at step 10200: 1.288764 learning rate: 0.100000
Minibatch BPC: 3.71
Validation set BPC: 4.34
Average loss at step 10300: 1.296630 learning rate: 0.100000
Minibatch BPC: 3.61
Validation set BPC: 3.50
Average loss at step 10400: 1.291366 learning rate: 0.100000
Minibatch BPC: 3.62
Validation set BPC: 3.61
Average loss at step 10500: 1.297846 learning rate: 0.100000
Minibatch BPC: 3.62
Validation set BPC: 3.95
Average loss at step 10600: 1.292706 learning rate: 0.100000
Minibatch BPC: 3.60
Validation set BPC: 3.88
Average loss at step 10700: 1.280646 learning rate: 0.100000
Minibatch BPC: 3.64
Validation set BPC: 3.57
Average loss at step 10800: 1.291205 learning rate: 0.100000
Minibatch BPC: 3.72
Validation set BPC: 3.96
Average loss at step 10900: 1.290743 learning rate: 0.100000
Minibatch BPC: 3.70
Validation set BPC: 3.90
Average loss at step 11000: 1.287167 learning rate: 0.100000
Minibatch BPC: 3.67
outher this year peace the use  unk  c a as year   homo inclistanwes or

Validation set BPC: 3.70
Average loss at step 15300: 1.297086 learning rate: 0.010000
Minibatch BPC: 3.53
Validation set BPC: 3.58
Average loss at step 15400: 1.289887 learning rate: 0.010000
Minibatch BPC: 3.59
Validation set BPC: 3.29
Average loss at step 15500: 1.281800 learning rate: 0.010000
Minibatch BPC: 3.64
Validation set BPC: 3.70
Average loss at step 15600: 1.290611 learning rate: 0.010000
Minibatch BPC: 3.65
Validation set BPC: 3.95
Average loss at step 15700: 1.289164 learning rate: 0.010000
Minibatch BPC: 3.66
Validation set BPC: 4.03
Average loss at step 15800: 1.285005 learning rate: 0.010000
Minibatch BPC: 3.62
Validation set BPC: 3.65
Average loss at step 15900: 1.295645 learning rate: 0.010000
Minibatch BPC: 3.59
Validation set BPC: 3.64
Average loss at step 16000: 1.288239 learning rate: 0.010000
Minibatch BPC: 3.55
Oed consider up to say index british and  unk  telef  measurming the the bortoon
ing tomest a news share off N said the c   unk  though worrse jubith an

Validation set BPC: 4.11
Average loss at step 20400: 1.288763 learning rate: 0.001000
Minibatch BPC: 3.49
Validation set BPC: 3.54
Average loss at step 20500: 1.290526 learning rate: 0.001000
Minibatch BPC: 3.65
Validation set BPC: 3.75
Average loss at step 20600: 1.288587 learning rate: 0.001000
Minibatch BPC: 3.65
Validation set BPC: 4.05
Average loss at step 20700: 1.294140 learning rate: 0.001000
Minibatch BPC: 3.61
Validation set BPC: 4.12
Average loss at step 20800: 1.288938 learning rate: 0.001000
Minibatch BPC: 3.67
Validation set BPC: 4.09
Average loss at step 20900: 1.294107 learning rate: 0.001000
Minibatch BPC: 3.50
Validation set BPC: 3.88
Average loss at step 21000: 1.287678 learning rate: 0.001000
Minibatch BPC: 3.51
S ralists of high president you woulderry supported around small night have  unk
rowendenf may negony and mr   unk  some weekly at might actimuling awnord outsid
N  unk  the suptement their by third company day we  n intenting this moven the 
Vels crest in m

Validation set BPC: 3.34
Average loss at step 25500: 1.292020 learning rate: 0.000100
Minibatch BPC: 3.43
Validation set BPC: 4.02
Average loss at step 25600: 1.292535 learning rate: 0.000100
Minibatch BPC: 3.60
Validation set BPC: 4.00
Average loss at step 25700: 1.294377 learning rate: 0.000100
Minibatch BPC: 3.64
Validation set BPC: 3.85
Average loss at step 25800: 1.284781 learning rate: 0.000100
Minibatch BPC: 3.58
Validation set BPC: 3.87
Average loss at step 25900: 1.284301 learning rate: 0.000100
Minibatch BPC: 3.57
Validation set BPC: 3.42
Average loss at step 26000: 1.287398 learning rate: 0.000100
Minibatch BPC: 3.57
Yene   it  s  smeed tolesting of shoping demands care sales at all a said   the 
ent a first does to anyts to a pothel is poltuct in differentialds and closed  s
N office at transhime the said at operate for being   it wirm boasdong and servi
N   a lond chail will accunacis stock maved burn to gorbachue buy wempave as N N
de at unturs of many  unk  to power but 