Deep Learning
=============

Long Term Short Term Memory(LSTMs)
------------

Task is to train a LSTM character model over [Text8](http://mattmahoney.net/dc/textdata) data.


We want to train a LSTM over bigrams, that is pairs of consecutive characters like 'ab' instead of single characters like 'a'. Since the number of possible bigrams is large, feeding them directly to the LSTM using 1-hot encodings will lead to a very sparse representation that is very wasteful computationally.

a- Introduce an embedding lookup on the inputs, and feed the embeddings to the LSTM cell instead of the inputs themselves.

b- Write a bigram-based LSTM, modeled on the character LSTM above.

c- Introduce Dropout. For best practices on how to use Dropout in LSTMs, refer to this [article](http://arxiv.org/abs/1409.2329).



In [1]:
# These are all the modules we'll be using later. Make sure you can import them
# before proceeding further.
%matplotlib inline
from __future__ import print_function
import collections
import math
import numpy as np
import os
import random
import tensorflow as tf
import zipfile
from matplotlib import pylab
from six.moves import range
from six.moves.urllib.request import urlretrieve
from sklearn.manifold import TSNE

# These are all the modules we'll be using later. Make sure you can import them
# before proceeding further.
from __future__ import print_function
import os
import numpy as np
import random
import string
import tensorflow as tf
import zipfile
from six.moves import range
from six.moves.urllib.request import urlretrieve

Input Data Creation - training valid split

In [2]:
url = 'http://mattmahoney.net/dc/'

def maybe_download(filename, expected_bytes):
  """Download a file if not present, and make sure it's the right size."""
  if not os.path.exists(filename):
    filename, _ = urlretrieve(url + filename, filename)
  statinfo = os.stat(filename)
  if statinfo.st_size == expected_bytes:
    print('Found and verified %s' % filename)
  else:
    print(statinfo.st_size)
    raise Exception(
      'Failed to verify ' + filename + '. Can you get to it with a browser?')
  return filename

filename = maybe_download('text8.zip', 31344016)


def read_data(filename):
  """Extract the first file enclosed in a zip file as a list of words"""
  with zipfile.ZipFile(filename) as f:
    data = tf.compat.as_str(f.read(f.namelist()[0])).split()
  return data
  
text = read_data(filename)
print('Data size %d' % len(text))

valid_size = 1000
valid_text = text[:valid_size]
train_text = text[valid_size:]
train_text = ' '.join(str(e) for e in train_text)
valid_text = ' '.join(str(e) for e in valid_text)
train_size = len(train_text)
print(train_size, train_text[:64])
print(valid_size, valid_text[:64])

Found and verified text8.zip
Data size 17005207
99993848 american individualist anarchism benjamin tucker in one eight tw
1000 anarchism originated as a term of abuse first used against early


Preprocessing As Per Our Requirement

In [3]:
import nltk
from nltk import bigrams
train_bigrams = bigrams(train_text)
valid_bigrams = bigrams(valid_text)
temp=list(train_bigrams)
train_list=list()
for tup in temp:
    s=''.join(tup)
    train_list.append(s)
temp=list(valid_bigrams)
valid_list=list()
for tup in temp:
    s=''.join(tup)
    valid_list.append(s)
print(valid_list)

['an', 'na', 'ar', 'rc', 'ch', 'hi', 'is', 'sm', 'm ', ' o', 'or', 'ri', 'ig', 'gi', 'in', 'na', 'at', 'te', 'ed', 'd ', ' a', 'as', 's ', ' a', 'a ', ' t', 'te', 'er', 'rm', 'm ', ' o', 'of', 'f ', ' a', 'ab', 'bu', 'us', 'se', 'e ', ' f', 'fi', 'ir', 'rs', 'st', 't ', ' u', 'us', 'se', 'ed', 'd ', ' a', 'ag', 'ga', 'ai', 'in', 'ns', 'st', 't ', ' e', 'ea', 'ar', 'rl', 'ly', 'y ', ' w', 'wo', 'or', 'rk', 'ki', 'in', 'ng', 'g ', ' c', 'cl', 'la', 'as', 'ss', 's ', ' r', 'ra', 'ad', 'di', 'ic', 'ca', 'al', 'ls', 's ', ' i', 'in', 'nc', 'cl', 'lu', 'ud', 'di', 'in', 'ng', 'g ', ' t', 'th', 'he', 'e ', ' d', 'di', 'ig', 'gg', 'ge', 'er', 'rs', 's ', ' o', 'of', 'f ', ' t', 'th', 'he', 'e ', ' e', 'en', 'ng', 'gl', 'li', 'is', 'sh', 'h ', ' r', 're', 'ev', 'vo', 'ol', 'lu', 'ut', 'ti', 'io', 'on', 'n ', ' a', 'an', 'nd', 'd ', ' t', 'th', 'he', 'e ', ' s', 'sa', 'an', 'ns', 's ', ' c', 'cu', 'ul', 'lo', 'ot', 'tt', 'te', 'es', 's ', ' o', 'of', 'f ', ' t', 'th', 'he', 'e ', ' f', 'fr', 're

Given a list of words, it will preprocess and make batches to be fed to the skip-gram / CBOW model

There are two functions - build_dataset ----> changes the word to decimal scalars(dictionary key-value pairs)

(in the range specified by vocabulary_size (maybe 100s for letters and 1000s for words)

generate_batch ------> give batch_size,num_skips and skip_window, it will give you back the ---> train batches and the labels

In [4]:
data_index = 0 # very important initialization, gets updated globally
               #each time genarate batch is called.
vocabulary_size = 500 



def build_dataset(words):
  count = [['UNK', -1]]
  count.extend(collections.Counter(words).most_common(vocabulary_size - 1))
  dictionary = dict()
  for word, _ in count:
    dictionary[word] = len(dictionary)
  data = list()
  unk_count = 0
  for word in words:
    if word in dictionary:
      index = dictionary[word]
    else:
      index = 0  # dictionary['UNK']
      unk_count = unk_count + 1
    data.append(index)
  count[0][1] = unk_count
  reverse_dictionary = dict(zip(dictionary.values(), dictionary.keys())) 
  return data, count, dictionary, reverse_dictionary

def generate_batch(data,batch_size, num_skips, skip_window):
  global data_index
  assert batch_size % num_skips == 0
  assert num_skips <= 2 * skip_window
  batch = np.ndarray(shape=(batch_size), dtype=np.int32)
  labels = np.ndarray(shape=(batch_size, 1), dtype=np.int32)
  span = 2 * skip_window + 1 # [ skip_window target skip_window ]
  buffer = collections.deque(maxlen=span)
  for _ in range(span):
    buffer.append(data[data_index])
    data_index = (data_index + 1) % len(data)
  for i in range(batch_size // num_skips):
    target = skip_window  # target label at the center of the buffer
    targets_to_avoid = [ skip_window ]
    for j in range(num_skips):
      while target in targets_to_avoid:
        target = random.randint(0, span - 1) # continue till an unseen 
        #target is found
      targets_to_avoid.append(target)
      batch[i * num_skips + j] = buffer[skip_window]
      labels[i * num_skips + j, 0] = buffer[target]
    buffer.append(data[data_index])
    data_index = (data_index + 1) % len(data)
  return batch, labels

In [5]:
train_data, count, dictionary, reverse_dictionary = build_dataset(train_list)
print('Most common words (+UNK)', count[:5])
print('Sample data', [reverse_dictionary[di] for di in train_data[:5]])
#del words  # Hint to reduce memory.
print('data:', [reverse_dictionary[di] for di in train_data[:8]])



Most common words (+UNK) [['UNK', 103163], ('e ', 3686082), (' t', 2449118), ('s ', 2222166), ('th', 1980414)]
Sample data ['am', 'me', 'er', 'ri', 'ic']
data: ['am', 'me', 'er', 'ri', 'ic', 'ca', 'an', 'n ']


In [6]:
batch_size = 256
embedding_size = 128 # Dimension of the embedding vector.
skip_window = 1 # How many words to consider left and right.(windowlen/2)
num_skips = 2 # How many times to reuse a middle input to generate a label.
# We pick a random validation set to sample nearest neighbors. here we limit the
# validation samples to the words that have a low numeric ID, which by
# construction are also the most frequent. 
valid_size = 16 # Random set of words to evaluate similarity on.
valid_window = 100 # Only pick dev samples in the head of the distribution.
valid_examples = np.array(random.sample(range(valid_window), valid_size))
num_sampled = 64 # Number of negative examples to sample.

graph = tf.Graph()

with graph.as_default():

  # Input data.
  train_dataset = tf.placeholder(tf.int32, shape=[batch_size])
  train_labels = tf.placeholder(tf.int32, shape=[batch_size, 1])
  valid_dataset = tf.constant(valid_examples, dtype=tf.int32)
  
  # Variables.
  embeddings = tf.Variable(
    tf.random_uniform([vocabulary_size, embedding_size], -1.0, 1.0))
  softmax_weights = tf.Variable(
    tf.truncated_normal([vocabulary_size, embedding_size],
                         stddev=1.0 / math.sqrt(embedding_size)))
  softmax_biases = tf.Variable(tf.zeros([vocabulary_size]))
  
  # Model.
  # Look up embeddings for inputs.
  embed = tf.nn.embedding_lookup(embeddings, train_dataset)
  # Compute the softmax loss, using a sample of the negative labels each time.
  loss = tf.reduce_mean(
    tf.nn.sampled_softmax_loss(softmax_weights, softmax_biases, embed,
                               train_labels, num_sampled, vocabulary_size))

  # Optimizer.
  # Note: The optimizer will optimize the softmax_weights AND the embeddings.
  # This is because the embeddings are defined as a variable quantity and the
  # optimizer's `minimize` method will by default modify all variable quantities 
  # that contribute to the tensor it is passed.
  # See docs on `tf.train.Optimizer.minimize()` for more details.
  optimizer = tf.train.AdagradOptimizer(1.0).minimize(loss)
  
  # Compute the similarity between minibatch examples and all embeddings.
  # We use the cosine distance:
  norm = tf.sqrt(tf.reduce_sum(tf.square(embeddings), 1, keep_dims=True))
  normalized_embeddings = embeddings / norm
  print(normalized_embeddings.get_shape)
  #print(valid_embeddings.get_shape)
  valid_embeddings = tf.nn.embedding_lookup(
    normalized_embeddings, valid_dataset)
  similarity = tf.matmul(valid_embeddings, tf.transpose(normalized_embeddings))

<bound method Tensor.get_shape of <tf.Tensor 'truediv:0' shape=(500, 128) dtype=float32>>


In [7]:
num_steps = 10001
import os
os.environ["CUDA_VISIBLE_DEVICES"]="0"
with tf.Session(graph=graph) as session:
  tf.initialize_all_variables().run()
  print('Initialized')
  average_loss = 0
  for step in range(num_steps):
    batch_data, batch_labels = generate_batch(train_data,
      batch_size, num_skips, skip_window)
    feed_dict = {train_dataset : batch_data, train_labels : batch_labels}
    _, l = session.run([optimizer, loss], feed_dict=feed_dict)
    average_loss += l
    if step % 2000 == 0:
      if step > 0:
        average_loss = average_loss / 2000
      # The average loss is an estimate of the loss over the last 2000 batches.
      print('Average loss at step %d: %f' % (step, average_loss))
      average_loss = 0
    # note that this is expensive (~20% slowdown if computed every 500 steps)
    if step % 10000 == 0:
      sim = similarity.eval()
      for i in range(valid_size):
        valid_word = reverse_dictionary[valid_examples[i]]
        top_k = 8 # number of nearest neighbors
        nearest = (-sim[i, :]).argsort()[1:top_k+1]
        log = 'Nearest to %s:' % valid_word
        for k in range(top_k):
          close_word = reverse_dictionary[nearest[k]]
          log = '%s %s,' % (log, close_word)
        print(log)
  final_embeddings = normalized_embeddings.eval()

Initialized
Average loss at step 0: 5.085847
Nearest to ng:  y, su, nk, ld, vu, if, co, yb,
Nearest to ti: sc, ov, rb, e , in, oo, dv, nf,
Nearest to in: vi, nn, rf, is, z , mu, ot, ti,
Nearest to wo: to, vo, xi, mr, gl, cm, vs, by,
Nearest to ma: vu, nh, p , if, hc, dh, xy, oa,
Nearest to g : yw, fs, ht, j , as, zy, eu, fl,
Nearest to  d: ig, k , wr, rl, ar, if, nw, yl,
Nearest to ur: bu, sm, ad, eq, bs, ck, r , pm,
Nearest to fi: yl, ta, ra, ol, ty, gb, to, mn,
Nearest to ou: fr, nk, eh, gu,  c, gr, zy, ml,
Nearest to ce: l , wh, sn, jo, ez, ss, yo,  n,
Nearest to  t: ao, uv, xa, ve, pn, xc, vo,  x,
Nearest to ec: di, hl, tw, mb, ve, bl, ci, hs,
Nearest to ve: rp,  t, ih, ec, nu, dd, an, c ,
Nearest to ne: lb, kw, lh, sf, wl, c , qu, nj,
Nearest to s : ta, ho, ih, ix, je, kh, ht, na,
Average loss at step 2000: 1.870095
Average loss at step 4000: 1.661086
Average loss at step 6000: 1.636439
Average loss at step 8000: 1.630484
Average loss at step 10000: 1.619018
Nearest to ng: n , rg,

Now lets modify the pre-processing step in LSTM to change from one-hot representation to word2Vec

In [36]:
#train=[reverse_dictionary[di] for di in train_data]
batch_size=64
num_unrollings=10
        

class BatchGenerator(object):
  def __init__(self, text, batch_size, num_unrollings):
    self._text = text
    self._text_size = len(text)
    self._batch_size = batch_size
    self._num_unrollings = num_unrollings
    segment = self._text_size // batch_size
    self._cursor = [ offset * segment for offset in range(batch_size)]
    self._last_batch = self._next_batch()
  
  def _next_batch(self):
    """Generate a single batch from the current cursor position in the data."""
    batchtemp = np.zeros(shape=(self._batch_size), dtype=np.float)
    for b in range(self._batch_size):
      batchtemp[b] = self._text[self._cursor[b]]
      self._cursor[b] = (self._cursor[b] + 1) % self._text_size
    batch=wrdVec(batchtemp)
    return batch
  
  def next(self):
    """Generate the next array of batches from the data. The array consists of
    the last batch of the previous array, followed by num_unrollings new ones.
    """
    batches = [self._last_batch]
    for step in range(self._num_unrollings):
      batches.append(self._next_batch())
    self._last_batch = batches[-1]
    return batches

def wrdVec(ids):
    graph1=tf.Graph()
    with graph1.as_default():
        v = tf.constant(ids, dtype=tf.int32)
        e=tf.constant(final_embeddings,dtype=tf.float32)
        c=tf.nn.embedding_lookup(e,v)
    with tf.Session(graph=graph1) as sess:
        return sess.run(c)
    
def VecWrdBatch(vectors):
    b=[]
    for vector in vectors:
        #print(vector.shape)
        v=np.array(vector)
        res=np.dot(final_embeddings,v)
            #print(v.shape)
        b.append(reverse_dictionary[np.argmax(res,0)])
    return(b)
def VecWrd(vector):
    v=np.array(vector)
    res=np.dot(final_embeddings,v)
    b=np.argmax(res,0)
    return(reverse_dictionary[b])

def characters(probabilities):
  """Turn a 1-hot encoding or a probability distribution over the possible
  characters back into its (most likely) character representation."""
  return [id2char(c) for c in np.argmax(probabilities, 1)]


def batches2string(batches):
  """Convert a sequence of batches back into their (most likely) string
  representation."""
  s = [''] * batches[0].shape[0]
  for b in batches:
    
    print(VecWrdBatch(b))
    #l=[reverse_dictionary[x] for x in VecWrdBatch(b)]
    #print(list(zip(s,VecWrdBatch(b))))
    #for x in zip(s,VecWrdBatch(b)):
     #   print(x)
    s = [''.join(x) for x in zip(s,VecWrdBatch(b))]
    #s = [''.join(reverse_dictionary[x]) for x in VecWrdBatch(b)]
    #s = [x for x in VecWrd(bi)]
    #print(s)
  return s

train_batches = BatchGenerator(train_data, batch_size, num_unrollings)
#valid_batches = BatchGenerator(valid_text, 1, 1)

#print(batches2string(train_batches._last_batch))
print(batches2string(train_batches.next()))
print(batches2string(train_batches.next()))
#print(batches2string(valid_batches.next()))
#print(batches2string(valid_batches.next()))
#train_batches._last_batch
#train_batches._last_batch

['am', 'er', ' r', ' b', 'ri', 'is', 'te', ' w', 'ng', 'o ', ' i', 'ns', 'wo', 'ot', 'r ', 'to', 's ', ' d', 'di', ' c', 'th', 'op', 'ty', ' f', 's ', ' c', 'ed', 's ', 'at', 'te', 'to', 'd ', 'pe', ' g', 'oo', 'ar', 'us', 'ro', 'er', 'ri', 'ty', 'a ', 're', 'l ', 'nd', 'ag', 'ld', 'ia', 'ke', 'a ', 'sa', 'st', ' h', ' d', 'it', 'cs', 'co', 'nd', 'ce', 'te', 'ne', 'te', 're', 'dv']
['me', 'ra', 're', 'be', 'ia', 'so', 'es', 'wa', 'g ', ' b', 'is', 'se', 'o ', 'ti', ' o', 'o ', ' w', 'di', 'it', 'ca', 'h ', 'pr', 'y ', 'fo', ' u', 'cl', 'd ', ' f', 't ', 'er', 'od', ' a', 'er', 'gu', 'op', 're', 'st', 'om', 'rt', 'ic', 'y ', ' s', 'en', ' l', 'd ', 'ga', 'd ', 'a ', 'en', ' t', 'ac', 'ti', 'ha', 'de', 'ti', 's ', 'on', 'd ', 'es', 'en', 'e ', 'e ', 'et', 'va']
['er', 'al', 'em', 'ei', 'a ', 'on', 's ', 'as', ' i', 'be', 's ', 'e ', ' e', 'io', 'of', ' p', 'we', 'id', 'ti', 'au', ' s', 're', ' c', 'or', 'us', 'li', ' i', 'fi', ' s', 'rn', 'da', 'al', 'r ', 'ua', 'ps', 'e ', 't ', 'ma', '

In [40]:
def logprob(predictions, labels):
  """Log-probability of the true labels in a predicted batch."""
  predictions[predictions < 1e-10] = 1e-10
  return np.sum(np.multiply(labels, -np.log(predictions))) / labels.shape[0]

def sample_distribution(distribution):
  """Sample one element from a distribution assumed to be an array of normalized
  probabilities.
  """
  r = random.uniform(0, 1)
  s = 0
  for i in range(len(distribution)):
    s += distribution[i]
    if s >= r:l
      return i
  return len(distribution) - 1

def sample(prediction):
  """Turn a (column) prediction into 1-hot encoded samples."""
  p = np.zeros(shape=[1, vocabulary_size], dtype=np.float)
  p[0, sample_distribution(prediction[0])] = 1.0
  return p

def random_distribution():
  """Generate a random column of probabilities."""
  b = np.random.uniform(0.0, 1.0, size=[1, vocabulary_size])
  return b/np.sum(b, 1)[:,None]

IndentationError: unexpected indent (<ipython-input-40-dee96784e2de>, line 15)

# TIP :  Actual output y(t)  and lstm cell output h(t) are different

In [None]:
num_nodes = 64

graph = tf.Graph()
with graph.as_default():
  
  # Parameters:
  # Input gate: input, previous output, and bias.
  ix = tf.Variable(tf.truncated_normal([vocabulary_size, num_nodes], -0.1, 0.1))
  im = tf.Variable(tf.truncated_normal([num_nodes, num_nodes], -0.1, 0.1))
  ib = tf.Variable(tf.zeros([1, num_nodes]))
  # Forget gate: input, previous output, and bias.
  fx = tf.Variable(tf.truncated_normal([vocabulary_size, num_nodes], -0.1, 0.1))
  fm = tf.Variable(tf.truncated_normal([num_nodes, num_nodes], -0.1, 0.1))
  fb = tf.Variable(tf.zeros([1, num_nodes]))
  # Memory cell: input, state and bias.                             
  cx = tf.Variable(tf.truncated_normal([vocabulary_size, num_nodes], -0.1, 0.1))
  cm = tf.Variable(tf.truncated_normal([num_nodes, num_nodes], -0.1, 0.1))
  cb = tf.Variable(tf.zeros([1, num_nodes]))
  # Output gate: input, previous output, and bias.
  ox = tf.Variable(tf.truncated_normal([vocabulary_size, num_nodes], -0.1, 0.1))
  om = tf.Variable(tf.truncated_normal([num_nodes, num_nodes], -0.1, 0.1))
  ob = tf.Variable(tf.zeros([1, num_nodes]))
  # Variables saving state across unrollings.
  saved_output = tf.Variable(tf.zeros([batch_size, num_nodes]), trainable=False)
  saved_state = tf.Variable(tf.zeros([batch_size, num_nodes]), trainable=False)
  # Classifier weights and biases.
  w = tf.Variable(tf.truncated_normal([num_nodes, vocabulary_size], -0.1, 0.1))
  b = tf.Variable(tf.zeros([vocabulary_size]))
  
  # Definition of the cell computation.
  def lstm_cell(i, o, state):
    """Create a LSTM cell. See e.g.: http://arxiv.org/pdf/1402.1128v1.pdf
    Note that in this formulation, we omit the various connections between the
    previous state and the gates."""
    input_gate = tf.sigmoid(tf.matmul(i, ix) + tf.matmul(o, im) + ib)
    forget_gate = tf.sigmoid(tf.matmul(i, fx) + tf.matmul(o, fm) + fb)
    update = tf.matmul(i, cx) + tf.matmul(o, cm) + cb
    state = forget_gate * state + input_gate * tf.tanh(update)
    output_gate = tf.sigmoid(tf.matmul(i, ox) + tf.matmul(o, om) + ob)
    return output_gate * tf.tanh(state), state

  # Input data.
  train_data = list()
  for _ in range(num_unrollings + 1):
    train_data.append(
      tf.placeholder(tf.float32, shape=[batch_size,vocabulary_size]))
  train_inputs = train_data[:num_unrollings]
  train_labels = train_data[1:]  # labels are inputs shifted by one time step.
  print(train_labels)
  # Unrolled LSTM loop.
  outputs = list()
  output = saved_output
  state = saved_state
  k=1
  for i in train_inputs:
    if k==1:
        print(i)
    k=k+1
    output, state = lstm_cell(i, output, state)
    outputs.append(output)
    print(len(outputs))
  # State saving across unrollings.
  with tf.control_dependencies([saved_output.assign(output),
                                saved_state.assign(state)]):
    # Classifier.
    print(tf.concat(0, outputs))
    logits = tf.nn.xw_plus_b(tf.concat(0, outputs), w, b)
    loss = tf.reduce_mean(
      tf.nn.softmax_cross_entropy_with_logits(
        labels=tf.concat(0,train_labels), logits=logits))
    print(tf.concat(0, train_labels))
  # Optimizer.
  global_step = tf.Variable(0)
  learning_rate = tf.train.exponential_decay(
    10.0, global_step, 5000, 0.1, staircase=True)
  optimizer = tf.train.GradientDescentOptimizer(learning_rate)
  gradients, v = zip(*optimizer.compute_gradients(loss))
  gradients, _ = tf.clip_by_global_norm(gradients, 1.25)
  optimizer = optimizer.apply_gradients(
    zip(gradients, v), global_step=global_step)

  # Predictions.
  train_prediction = tf.nn.softmax(logits)
  
  # Sampling and validation eval: batch 1, no unrolling.(without
# creating another graph)
  sample_input = tf.placeholder(tf.float32, shape=[1, vocabulary_size])
  saved_sample_output = tf.Variable(tf.zeros([1, num_nodes]))
  saved_sample_state = tf.Variable(tf.zeros([1, num_nodes]))
  reset_sample_state = tf.group(
    saved_sample_output.assign(tf.zeros([1, num_nodes])),
    saved_sample_state.assign(tf.zeros([1, num_nodes])))
  sample_output, sample_state = lstm_cell(
    sample_input, saved_sample_output, saved_sample_state)
  with tf.control_dependencies([saved_sample_output.assign(sample_output),
                                saved_sample_state.assign(sample_state)]):
    sample_prediction = tf.nn.softmax(tf.nn.xw_plus_b(sample_output, w, b))

In [None]:
num_steps = 7001
summary_frequency = 100

with tf.Session(graph=graph) as session:
  tf.initialize_all_variables().run()
  print('Initialized')
  mean_loss = 0
  for step in range(num_steps):
    batches = train_batches.next()
    feed_dict = dict()
    for i in range(num_unrollings + 1):
      feed_dict[train_data[i]] = batches[i]
    _, l, predictions, lr = session.run(
 ld     [optimizer, loss, train_prediction, learning_rate], feed_dict=feed_dict)
    mean_loss += l
    if step % summary_frequency == 0:
      if step > 0:
        mean_loss = mean_loss / summary_frequency
      # The mean loss is an estimate of the loss over the last few batches.
      print(
        'Average loss at step %d: %f learning rate: %f' % (step, mean_loss, lr))
      mean_loss = 0
      labels = np.concatenate(list(batches)[1:])
      print('Minibatch perplexity: %.2f' % float(
        np.exp(logprob(predictions, labels))))
      if step % (summary_frequency * 10) == 0:
        # Generate some samples.
        print('=' * 80)
        for _ in range(5):
          feed = sample(random_distribution())
          sentence = characters(feed)[0]
          reset_sample_state.run()
          for _ in range(79):
            prediction = sample_prediction.eval({sample_input: feed})
            feed = sample(prediction)
            sentence += characters(feed)[0]
          print(sentence)
        print('=' * 80)
      # Measure validation set perplexity.
      reset_sample_state.run()
      valid_logprob = 0
      for _ in range(valid_size):
        b = valid_batches.next()
        predictions = sample_prediction.eval({sample_input: b[0]})
        valid_logprob = valid_logprob + logprob(predictions, b[1])
      print('Validation set perplexity: %.2f' % float(np.exp(
        valid_logprob / valid_size)))