Deep Learning
=============

Assignment 6
------------

After training a skip-gram model in `5_word2vec.ipynb`, the goal of this notebook is to train an LSTM character model over [Text8](http://mattmahoney.net/dc/textdata) data.

In [1]:
# These are all the modules we'll be using later. Make sure you can import them
# before proceeding further.
from __future__ import print_function
import os
import numpy as np
import random
import string
import tensorflow as tf
import zipfile
from six.moves import range
from six.moves.urllib.request import urlretrieve
import datetime

In [2]:
url = 'http://mattmahoney.net/dc/'

def maybe_download(filename, expected_bytes):
    """Fetch `filename` from the module-level `url` unless it already exists,
    then check its size on disk.

    Args:
        filename: local path of the file (also appended to `url` for download).
        expected_bytes: exact size in bytes the file must have.

    Returns:
        The path of the verified file.

    Raises:
        Exception: if the size on disk differs from `expected_bytes`.
    """
    already_present = os.path.exists(filename)
    if not already_present:
        # urlretrieve returns (local_path, headers); only the path is needed.
        filename = urlretrieve(url + filename, filename)[0]
    actual_bytes = os.stat(filename).st_size
    if actual_bytes != expected_bytes:
        # Show the size that was actually found before failing.
        print(actual_bytes)
        raise Exception(
            'Failed to verify ' + filename + '. Can you get to it with a browser?')
    print('Found and verified %s' % filename)
    return filename

filename = maybe_download('text8.zip', 31344016)

Found and verified text8.zip


In [3]:
def read_data(filename):
    """Return the contents of the first member of the zip archive `filename`
    as a single string."""
    with zipfile.ZipFile(filename) as archive:
        first_member = archive.namelist()[0]
        raw_bytes = archive.read(first_member)
        return tf.compat.as_str(raw_bytes)
    
text = read_data(filename)
print('Data size %d' % len(text))

Data size 100000000


Create a small validation set.

In [4]:
# Hold out the first `valid_size` characters for validation; train on the rest.
valid_size = 1000
valid_text, train_text = text[:valid_size], text[valid_size:]
train_size = len(train_text)
# Show each split's size together with its first 64 characters.
print(train_size, train_text[:64])
print(valid_size, valid_text[:64])

99999000 ons anarchists advocate social relations based upon voluntary as
1000  anarchism originated as a term of abuse first used against earl


Utility functions to map characters to vocabulary IDs and back.

In [5]:
# The vocabulary is the 26 lowercase ASCII letters plus the space character.
vocabulary_size = len(string.ascii_lowercase) + 1 # [a-z] + ' '
# Code point of 'a'; letter ids are offsets from it, shifted by one so that
# id 0 stays reserved for the space character.
first_letter = ord(string.ascii_lowercase[0])

def char2id(char):
    """Map a single character to its vocabulary id.

    Args:
        char: a one-character string.

    Returns:
        1..26 for 'a'..'z' and 0 for ' '. Anything else is reported on stdout
        and also mapped to 0.
    """
    # `in` on a string is a substring test, so '' and multi-character runs
    # such as 'ab' would also match and then crash in ord(); require exactly
    # one character before treating the input as a letter.
    if len(char) == 1 and char in string.ascii_lowercase:
        return ord(char) - first_letter + 1
    elif char == ' ':
        return 0
    else:
        print('Unexpected character: %s' % char)
        return 0

def id2char(dictid):
    """Map a vocabulary id back to its character.

    Args:
        dictid: integer id; values <= 0 map to the space character.

    Returns:
        The character whose code point is `dictid + first_letter - 1` for
        positive ids, otherwise ' '.
    """
    if dictid > 0:
        return chr(dictid + first_letter - 1)
    else:
        return ' '

print(char2id('a'), char2id('z'), char2id(' '), char2id('ï'))
print(id2char(1), id2char(26), id2char(0))

Unexpected character: ï
1 26 0 0
a z  


Function to generate a training batch for the LSTM model.

In [6]:
batch_size=64
num_unrollings=10

class BatchGenerator(object):
    """Produce unrolled batches of 1-hot encoded characters from a text.

    The text is split into `batch_size` equal segments and one cursor walks
    each segment, so row `b` of consecutive batches holds consecutive
    characters starting at `b * segment`. Cursors wrap around at the end of
    the text.
    """

    def __init__(self, text, batch_size, num_unrollings):
        """
        Args:
            text: the string to read characters from.
            batch_size: number of rows (parallel cursors) per batch.
            num_unrollings: number of new batches produced by each `next()`.
        """
        self._text = text
        self._text_size = len(text)
        self._batch_size = batch_size
        self._num_unrollings = num_unrollings
        segment = self._text_size // batch_size
        self._cursor = [offset * segment for offset in range(batch_size)]
        # Prime the generator so the first next() can start with a "previous" batch.
        self._last_batch = self._next_batch()

    def _next_batch(self):
        """Generate a single batch from the current cursor position in the data.

        Returns:
            A float64 array of shape (batch_size, vocabulary_size) with a
            single 1.0 per row; every cursor is advanced by one character.
        """
        # Builtin `float` replaces the `np.float` alias, which was deprecated
        # in NumPy 1.20 and later removed; the resulting dtype is unchanged.
        batch = np.zeros(shape=(self._batch_size, vocabulary_size), dtype=float)
        for b in range(self._batch_size):
            batch[b, char2id(self._text[self._cursor[b]])] = 1.0
            self._cursor[b] = (self._cursor[b] + 1) % self._text_size
        return batch

    def next(self):
        """Generate the next array of batches from the data. The array consists of
        the last batch of the previous array, followed by num_unrollings new ones.

        Returns:
            A list of num_unrollings + 1 batches.
        """
        batches = [self._last_batch]
        batches.extend(self._next_batch() for _ in range(self._num_unrollings))
        self._last_batch = batches[-1]
        return batches

def characters(probabilities):
    """Decode each row of `probabilities` (1-hot rows or per-row probability
    distributions over the vocabulary) into its highest-scoring character.

    Returns:
        A list with one character per row.
    """
    best_ids = np.argmax(probabilities, 1)
    return list(map(id2char, best_ids))

def batches2string(batches):
    """Decode a sequence of batches into strings, one string per batch row.

    Each batch contributes its most likely character for every row, appended
    in sequence order.
    """
    row_count = batches[0].shape[0]
    decoded = [''] * row_count
    for batch in batches:
        decoded = [prefix + char
                   for prefix, char in zip(decoded, characters(batch))]
    return decoded

train_batches = BatchGenerator(train_text, batch_size, num_unrollings)
valid_batches = BatchGenerator(valid_text, 1, 1)

print(batches2string(train_batches.next()))
print(batches2string(train_batches.next()))
print(batches2string(valid_batches.next()))
print(batches2string(valid_batches.next()))

['ons anarchi', 'when milita', 'lleria arch', ' abbeys and', 'married urr', 'hel and ric', 'y and litur', 'ay opened f', 'tion from t', 'migration t', 'new york ot', 'he boeing s', 'e listed wi', 'eber has pr', 'o be made t', 'yer who rec', 'ore signifi', 'a fierce cr', ' two six ei', 'aristotle s', 'ity can be ', ' and intrac', 'tion of the', 'dy to pass ', 'f certain d', 'at it will ', 'e convince ', 'ent told hi', 'ampaign and', 'rver side s', 'ious texts ', 'o capitaliz', 'a duplicate', 'gh ann es d', 'ine january', 'ross zero t', 'cal theorie', 'ast instanc', ' dimensiona', 'most holy m', 't s support', 'u is still ', 'e oscillati', 'o eight sub', 'of italy la', 's the tower', 'klahoma pre', 'erprise lin', 'ws becomes ', 'et in a naz', 'the fabian ', 'etchy to re', ' sharman ne', 'ised empero', 'ting in pol', 'd neo latin', 'th risky ri', 'encyclopedi', 'fense the a', 'duating fro', 'treet grid ', 'ations more', 'appeal of d', 'si have mad']
['ists advoca', 'ary governm', 'hes nat

In [7]:
def logprob(predictions, labels):
    """Log-probability of the true labels in a predicted batch.

    Args:
        predictions: array of predicted probabilities, same shape as `labels`.
        labels: array of target weights (1-hot rows in this notebook).

    Returns:
        The sum of `labels * -log(predictions)` divided by the number of
        rows in `labels`.
    """
    # Clamp tiny probabilities so log() stays finite. np.maximum returns a new
    # array, so the caller's `predictions` is left untouched (the previous
    # masked assignment overwrote it in place).
    clamped = np.maximum(predictions, 1e-10)
    return np.sum(np.multiply(labels, -np.log(clamped))) / labels.shape[0]

def sample_distribution(distribution):
    """Draw one index from `distribution`, a sequence of probabilities that is
    assumed to be normalized.

    A uniform random threshold is drawn and the first index at which the
    running total reaches it is returned; if the total never reaches it, the
    last index is returned.
    """
    threshold = random.uniform(0, 1)
    cumulative = 0
    for index, probability in enumerate(distribution):
        cumulative += probability
        if cumulative >= threshold:
            return index
    return len(distribution) - 1

def sample(prediction):
    """Turn a (column) prediction into 1-hot encoded samples.

    Args:
        prediction: 2-D array; only its first row is used, as a probability
            distribution passed to `sample_distribution`.

    Returns:
        A float64 array of shape (1, vocabulary_size) with a single 1.0 at
        the sampled index.
    """
    # Builtin `float` replaces the `np.float` alias, which was deprecated in
    # NumPy 1.20 and later removed; the resulting dtype is unchanged.
    p = np.zeros(shape=[1, vocabulary_size], dtype=float)
    p[0, sample_distribution(prediction[0])] = 1.0
    return p

def random_distribution():
    """Return a random probability distribution over the vocabulary as a
    (1, vocabulary_size) array whose row sums to one."""
    raw = np.random.uniform(0.0, 1.0, size=[1, vocabulary_size])
    row_totals = raw.sum(axis=1, keepdims=True)
    return raw / row_totals

Simple LSTM Model.

In [8]:
# Number of columns in the LSTM output and in the cell state.
num_nodes = 64

graph = tf.Graph()
with graph.as_default():
    
    # Parameters:
    # Each gate has its own input weights (vocabulary_size x num_nodes),
    # previous-output weights (num_nodes x num_nodes) and a zero-initialised bias.
    # NOTE(review): tf.truncated_normal's positional arguments after the shape
    # are (mean, stddev), so "-0.1, 0.1" reads as mean=-0.1, stddev=0.1 rather
    # than a [-0.1, 0.1] range -- confirm this initialisation is intended.
    # Input gate: input, previous output, and bias.
    ix = tf.Variable(tf.truncated_normal([vocabulary_size, num_nodes], -0.1, 0.1))
    im = tf.Variable(tf.truncated_normal([num_nodes, num_nodes], -0.1, 0.1))
    ib = tf.Variable(tf.zeros([1, num_nodes]))
    # Forget gate: input, previous output, and bias.
    fx = tf.Variable(tf.truncated_normal([vocabulary_size, num_nodes], -0.1, 0.1))
    fm = tf.Variable(tf.truncated_normal([num_nodes, num_nodes], -0.1, 0.1))
    fb = tf.Variable(tf.zeros([1, num_nodes]))
    # Memory cell: input, state and bias.
    cx = tf.Variable(tf.truncated_normal([vocabulary_size, num_nodes], -0.1, 0.1))
    cm = tf.Variable(tf.truncated_normal([num_nodes, num_nodes], -0.1, 0.1))
    cb = tf.Variable(tf.zeros([1, num_nodes]))
    # Output gate: input, previous output, and bias.
    ox = tf.Variable(tf.truncated_normal([vocabulary_size, num_nodes], -0.1, 0.1))
    om = tf.Variable(tf.truncated_normal([num_nodes, num_nodes], -0.1, 0.1))
    ob = tf.Variable(tf.zeros([1, num_nodes]))
    # Variables saving state across unrollings.
    # Marked non-trainable: they are only written by the assign ops below.
    saved_output = tf.Variable(tf.zeros([batch_size, num_nodes]), trainable=False)
    saved_state = tf.Variable(tf.zeros([batch_size, num_nodes]), trainable=False)
    # Classifier weights and biases.
    w = tf.Variable(tf.truncated_normal([num_nodes, vocabulary_size], -0.1, 0.1))
    b = tf.Variable(tf.zeros([vocabulary_size]))
    
    # Definition of the cell computation.
    def lstm_cell(i, o, state):
        """Create a LSTM cell. See e.g.: http://arxiv.org/pdf/1402.1128v1.pdf
        Note that in this formulation, we omit the various connections between the
        previous state and the gates.

        Args:
            i: input tensor with vocabulary_size columns.
            o: previous output tensor with num_nodes columns.
            state: previous cell state tensor with num_nodes columns.

        Returns:
            A tuple (new output, new state).
        """
        input_gate = tf.sigmoid(tf.matmul(i, ix) + tf.matmul(o, im) + ib)
        forget_gate = tf.sigmoid(tf.matmul(i, fx) + tf.matmul(o, fm) + fb)
        # Candidate update; tanh is applied where it is mixed into the state.
        update = tf.matmul(i, cx) + tf.matmul(o, cm) + cb
        state = forget_gate * state + input_gate * tf.tanh(update)
        output_gate = tf.sigmoid(tf.matmul(i, ox) + tf.matmul(o, om) + ob)
        return output_gate * tf.tanh(state), state

    # Input data.
    # One placeholder per time step, plus one extra so every input has a label.
    train_data = list()
    for _ in range(num_unrollings + 1):
        train_data.append(
            tf.placeholder(tf.float32, shape=[batch_size,vocabulary_size]))
    train_inputs = train_data[:num_unrollings]
    train_labels = train_data[1:]    # labels are inputs shifted by one time step.

    # Unrolled LSTM loop.
    # Starts from the state saved by the previous run and chains the cell
    # once per input placeholder, keeping every intermediate output.
    outputs = list()
    output = saved_output
    state = saved_state
    for i in train_inputs:
        output, state = lstm_cell(i, output, state)
        outputs.append(output)

    # State saving across unrollings.
    # logits and loss are created inside this context, so evaluating them also
    # runs the two assign ops that store the final output and state.
    with tf.control_dependencies([saved_output.assign(output), saved_state.assign(state)]):
        # Classifier.
        # All per-step outputs are concatenated along axis 0 and classified at once.
        logits = tf.nn.xw_plus_b(tf.concat(outputs, 0), w, b)
        loss = tf.reduce_mean(
            tf.nn.softmax_cross_entropy_with_logits(
                labels=tf.concat(train_labels, 0), logits=logits))

    # Optimizer.
    global_step = tf.Variable(0)
    # Learning rate starts at 10.0 and is multiplied by 0.1 every 5000 steps.
    learning_rate = tf.train.exponential_decay(
        10.0, global_step, 5000, 0.1, staircase=True)
    optimizer = tf.train.GradientDescentOptimizer(learning_rate)
    gradients, v = zip(*optimizer.compute_gradients(loss))
    # Clip the gradients by their global norm (limit 1.25) before applying them.
    gradients, _ = tf.clip_by_global_norm(gradients, 1.25)
    # `optimizer` is rebound here from the optimizer object to the training op.
    optimizer = optimizer.apply_gradients(
        zip(gradients, v), global_step=global_step)

    # Predictions.
    train_prediction = tf.nn.softmax(logits)
    
    # Sampling and validation eval: batch 1, no unrolling.
    sample_input = tf.placeholder(tf.float32, shape=[1, vocabulary_size])
    saved_sample_output = tf.Variable(tf.zeros([1, num_nodes]))
    saved_sample_state = tf.Variable(tf.zeros([1, num_nodes]))
    # Running this op zeroes the sampling output and state.
    reset_sample_state = tf.group(
        saved_sample_output.assign(tf.zeros([1, num_nodes])),
        saved_sample_state.assign(tf.zeros([1, num_nodes])))
    sample_output, sample_state = lstm_cell(
        sample_input, saved_sample_output, saved_sample_state)
    # Evaluating sample_prediction also stores the new output and state, so
    # successive evaluations continue from where the previous one stopped.
    with tf.control_dependencies([saved_sample_output.assign(sample_output),
                                                                saved_sample_state.assign(sample_state)]):
        sample_prediction = tf.nn.softmax(tf.nn.xw_plus_b(sample_output, w, b))

In [26]:
# Train the LSTM defined in `graph`, printing a report every
# `summary_frequency` steps and sample text every 10 reports.
# NOTE(review): train_data, optimizer, loss, train_prediction, learning_rate,
# sample_prediction, sample_input and reset_sample_state are module-level names
# that the later graph_1 cell rebinds; run the cell that defines `graph`
# immediately before this one.
num_steps = 7001
summary_frequency = 100

with tf.Session(graph=graph) as session:
    tf.global_variables_initializer().run()
    print('Initialized')
    mean_loss = 0
    for step in range(num_steps):
        batches = train_batches.next()
        # Feed one batch to each of the num_unrollings + 1 placeholders.
        feed_dict = dict()
        for i in range(num_unrollings + 1):
            feed_dict[train_data[i]] = batches[i]
        _, l, predictions, lr = session.run(
            [optimizer, loss, train_prediction, learning_rate], feed_dict=feed_dict)
        mean_loss += l
        if step % summary_frequency == 0:
            # At step 0 only a single loss has been accumulated, so it is
            # reported as-is.
            if step > 0:
                mean_loss = mean_loss / summary_frequency
            # The mean loss is an estimate of the loss over the last few batches.
            print(
                'Average loss at step %d: %f learning rate: %f' % (step, mean_loss, lr))
            mean_loss = 0
            # Labels are every batch except the first, stacked in the same
            # order as the concatenated predictions.
            labels = np.concatenate(list(batches)[1:])
            print('Minibatch perplexity: %.2f' % float(
                np.exp(logprob(predictions, labels))))
            if step % (summary_frequency * 10) == 0:
                # Generate some samples.
                print('=' * 80)
                for _ in range(5):
                    # Seed each sentence with a random character, then feed
                    # every sampled character back in to extend it to 80.
                    feed = sample(random_distribution())
                    sentence = characters(feed)[0]
                    reset_sample_state.run()
                    for _ in range(79):
                        prediction = sample_prediction.eval({sample_input: feed})
                        feed = sample(prediction)
                        sentence += characters(feed)[0]
                    print(sentence)
                print('=' * 80)
            # Measure validation set perplexity.
            reset_sample_state.run()
            valid_logprob = 0
            for _ in range(valid_size):
                # valid_batches yields pairs: b[0] is the input, b[1] its label.
                b = valid_batches.next()
                predictions = sample_prediction.eval({sample_input: b[0]})
                valid_logprob = valid_logprob + logprob(predictions, b[1])
            print('Validation set perplexity: %.2f' % float(np.exp(valid_logprob / valid_size)))

Initialized
Average loss at step 0: 3.297980 learning rate: 10.000000
Minibatch perplexity: 27.06
yanaomthiqhefn crtda ycersfugsjfutneiyiws gltmkrlvtppk xza an slpmvyd xqxeayapt 
oeiolmdivswqsriw  pbhm viqlmaemie rmeotendrtujkowsuyctmwlpve qriwn o x k bw  hs 
 z  pfdatazancmriuy mhb ztsj   ilpcox apcfgf cicnelezaceeobmalyyjt tseiq okhbcmx
g lwisboxtqne  v mhtiksng yesm qa nauako mmctyg lywde athf tr mhercsli feqvcismm
mgllpirqwiej of gv i cme eepqj vsefad hitlmkeazuinjqigu siirthajeyvj fxahnrike a
Validation set perplexity: 19.99
Average loss at step 100: 2.592564 learning rate: 10.000000
Minibatch perplexity: 10.71
Validation set perplexity: 10.10
Average loss at step 200: 2.241147 learning rate: 10.000000
Minibatch perplexity: 8.46
Validation set perplexity: 8.49
Average loss at step 300: 2.099373 learning rate: 10.000000
Minibatch perplexity: 7.54
Validation set perplexity: 8.09
Average loss at step 400: 2.001798 learning rate: 10.000000
Minibatch perplexity: 7.65
Validation set per

Validation set perplexity: 4.42
Average loss at step 4500: 1.616899 learning rate: 10.000000
Minibatch perplexity: 5.35
Validation set perplexity: 4.64
Average loss at step 4600: 1.616210 learning rate: 10.000000
Minibatch perplexity: 4.90
Validation set perplexity: 4.61
Average loss at step 4700: 1.626453 learning rate: 10.000000
Minibatch perplexity: 5.35
Validation set perplexity: 4.53
Average loss at step 4800: 1.632648 learning rate: 10.000000
Minibatch perplexity: 4.47
Validation set perplexity: 4.43
Average loss at step 4900: 1.634618 learning rate: 10.000000
Minibatch perplexity: 5.14
Validation set perplexity: 4.61
Average loss at step 5000: 1.612095 learning rate: 1.000000
Minibatch perplexity: 4.54
le signis in electronise suckeasis indisticated of chie s case unilie monchan ca
le kan outsed to as bassic vat is gresch cossists microin contraurges engud act 
uted a grelit wiqa troush jeeguor s went one six is zero nine othoen one seven n
xingly the maciater and and telp i arm

# Problem 0

Visualize the TensorFlow graph inline to help debug the program.

In [99]:
from IPython.display import clear_output, Image, display, HTML

def strip_consts(graph_def, max_const_size=32):
    """Strip large constant values from graph_def.

    Args:
        graph_def: a GraphDef whose nodes are copied.
        max_const_size: largest `tensor_content` length (in bytes) that is
            kept; longer contents of 'Const' nodes are replaced.

    Returns:
        A new GraphDef with the same nodes, where oversized constant contents
        are replaced by a short "<stripped N bytes>" placeholder. The input
        graph_def is not modified.
    """
    strip_def = tf.GraphDef()
    for n0 in graph_def.node:
        n = strip_def.node.add()
        n.MergeFrom(n0)
        if n.op == 'Const':
            tensor = n.attr['value'].tensor
            size = len(tensor.tensor_content)
            if size > max_const_size:
                # tensor_content is a protobuf bytes field: assigning a text
                # string raises TypeError on Python 3, so encode the placeholder.
                tensor.tensor_content = tf.compat.as_bytes(
                    "<stripped %d bytes>" % size)
    return strip_def

def show_graph(graph_def, max_const_size=32):
    """Render a TensorFlow graph inline in the notebook.

    Args:
        graph_def: a GraphDef, or any object exposing `as_graph_def()`.
        max_const_size: forwarded to `strip_consts` to drop large constants.
    """
    if hasattr(graph_def, 'as_graph_def'):
        graph_def = graph_def.as_graph_def()
    stripped = strip_consts(graph_def, max_const_size=max_const_size)

    # Text form of the stripped graph, quoted so it can be embedded in the script.
    graph_literal = repr(str(stripped))
    # Random suffix for the id of the graph element in the generated markup.
    element_id = 'graph' + str(np.random.rand())

    widget_template = """
        <script>
          function load() {{
            document.getElementById("{id}").pbtxt = {data};
          }}
        </script>
        <link rel="import" href="https://tensorboard.appspot.com/tf-graph-basic.build.html" onload=load()>
        <div style="height:600px">
          <tf-graph-basic id="{id}"></tf-graph-basic>
        </div>
    """
    widget_html = widget_template.format(data=graph_literal, id=element_id)

    iframe_template = """
        <iframe seamless style="width:1200px;height:620px;border:0" srcdoc="{}"></iframe>
    """
    # The widget markup lives inside the srcdoc attribute, so its double
    # quotes have to be escaped.
    escaped_widget = widget_html.replace('"', '&quot;')
    display(HTML(iframe_template.format(escaped_widget)))

---
Problem 1
---------

You might have noticed that the definition of the LSTM cell involves 4 matrix multiplications with the input, and 4 matrix multiplications with the output. Simplify the expression by using a single matrix multiply for each, and variables that are 4 times larger.

---

In [9]:
# Number of columns in the LSTM output and in the cell state.
num_nodes = 64

graph_1 = tf.Graph()
with graph_1.as_default():
    
    # Parameters: input, previous output and bias
    # The four gates (input, forget, memory-cell update, output) share one
    # weight matrix per source, laid out side by side in blocks of num_nodes
    # columns, so each source needs a single matmul.
    # NOTE(review): tf.truncated_normal's positional arguments after the shape
    # are (mean, stddev), so "-0.1, 0.1" reads as mean=-0.1, stddev=0.1 rather
    # than a [-0.1, 0.1] range -- confirm this initialisation is intended.
    ifcox = tf.Variable(tf.truncated_normal([vocabulary_size, 4 * num_nodes], -0.1, 0.1))
    ifcom = tf.Variable(tf.truncated_normal([num_nodes, 4 * num_nodes], -0.1, 0.1))
    ifcob = tf.Variable(tf.zeros([1, 4 * num_nodes]))
    
    # Variables saving state across unrollings.
    # Marked non-trainable: they are only written by the assign ops below.
    saved_output = tf.Variable(tf.zeros([batch_size, num_nodes]), trainable=False)
    saved_state = tf.Variable(tf.zeros([batch_size, num_nodes]), trainable=False)
    # Classifier weights and biases.
    w = tf.Variable(tf.truncated_normal([num_nodes, vocabulary_size], -0.1, 0.1))
    b = tf.Variable(tf.zeros([vocabulary_size]))
    
    # Definition of the cell computation.
    def lstm_cell(i, o, state):
        """Create a LSTM cell. See e.g.: http://arxiv.org/pdf/1402.1128v1.pdf
        Note that in this formulation, we omit the various connections between the
        previous state and the gates.

        Args:
            i: input tensor with vocabulary_size columns.
            o: previous output tensor with num_nodes columns.
            state: previous cell state tensor with num_nodes columns.

        Returns:
            A tuple (new output, new state).
        """
        # One fused pre-activation with 4 * num_nodes columns; the slices
        # below pick out the input, forget, update and output blocks in order.
        all_gate = tf.matmul(i, ifcox) + tf.matmul(o, ifcom) + ifcob
        input_gate = tf.sigmoid(all_gate[:, 0: num_nodes])
        forget_gate = tf.sigmoid(all_gate[:, num_nodes: 2* num_nodes])
        # Candidate update; tanh is applied where it is mixed into the state.
        update = all_gate[:, 2*num_nodes: 3* num_nodes]
        state = forget_gate * state + input_gate * tf.tanh(update)
        output_gate = tf.sigmoid(all_gate[:, 3*num_nodes:])
        return output_gate * tf.tanh(state), state

    # Input data.
    # One placeholder per time step, plus one extra so every input has a label.
    train_data = list()
    for _ in range(num_unrollings + 1):
        train_data.append(
            tf.placeholder(tf.float32, shape=[batch_size,vocabulary_size]))
    train_inputs = train_data[:num_unrollings]
    train_labels = train_data[1:]    # labels are inputs shifted by one time step.

    # Unrolled LSTM loop.
    # Starts from the state saved by the previous run and chains the cell
    # once per input placeholder, keeping every intermediate output.
    outputs = list()
    output = saved_output
    state = saved_state
    for i in train_inputs:
        output, state = lstm_cell(i, output, state)
        outputs.append(output)

    # State saving across unrollings.
    # logits and loss are created inside this context, so evaluating them also
    # runs the two assign ops that store the final output and state.
    with tf.control_dependencies([saved_output.assign(output), saved_state.assign(state)]):
        # Classifier.
        # All per-step outputs are concatenated along axis 0 and classified at once.
        logits = tf.nn.xw_plus_b(tf.concat(outputs, 0), w, b)
        loss = tf.reduce_mean(
            tf.nn.softmax_cross_entropy_with_logits(
                labels=tf.concat(train_labels, 0), logits=logits))

    # Optimizer.
    global_step = tf.Variable(0)
    # Learning rate starts at 10.0 and is multiplied by 0.1 every 5000 steps.
    learning_rate = tf.train.exponential_decay(
        10.0, global_step, 5000, 0.1, staircase=True)
    optimizer = tf.train.GradientDescentOptimizer(learning_rate)
    gradients, v = zip(*optimizer.compute_gradients(loss))
    # Clip the gradients by their global norm (limit 1.25) before applying them.
    gradients, _ = tf.clip_by_global_norm(gradients, 1.25)
    # `optimizer` is rebound here from the optimizer object to the training op.
    optimizer = optimizer.apply_gradients(
        zip(gradients, v), global_step=global_step)

    # Predictions.
    train_prediction = tf.nn.softmax(logits)
    
    # Sampling and validation eval: batch 1, no unrolling.
    sample_input = tf.placeholder(tf.float32, shape=[1, vocabulary_size])
    saved_sample_output = tf.Variable(tf.zeros([1, num_nodes]))
    saved_sample_state = tf.Variable(tf.zeros([1, num_nodes]))
    # Running this op zeroes the sampling output and state.
    reset_sample_state = tf.group(
        saved_sample_output.assign(tf.zeros([1, num_nodes])),
        saved_sample_state.assign(tf.zeros([1, num_nodes])))
    sample_output, sample_state = lstm_cell(
        sample_input, saved_sample_output, saved_sample_state)
    # Evaluating sample_prediction also stores the new output and state, so
    # successive evaluations continue from where the previous one stopped.
    with tf.control_dependencies([saved_sample_output.assign(sample_output),
                                                                saved_sample_state.assign(sample_state)]):
        sample_prediction = tf.nn.softmax(tf.nn.xw_plus_b(sample_output, w, b))

In [30]:
# Train the fused-matmul LSTM defined in `graph_1`, printing a report every
# `summary_frequency` steps and sample text every 10 reports.
# NOTE(review): this cell relies on module-level names (train_data, optimizer,
# loss, ...) bound by the graph_1 cell, and on `vocabulary_size`, which the
# later bigram cell reassigns; run it right after the graph_1 definition.
num_steps = 7001
summary_frequency = 100

with tf.Session(graph=graph_1) as session:
    tf.global_variables_initializer().run()
    print('Initialized')
    mean_loss = 0
    for step in range(num_steps):
        batches = train_batches.next()
        # Feed one batch to each of the num_unrollings + 1 placeholders.
        feed_dict = dict()
        for i in range(num_unrollings + 1):
            feed_dict[train_data[i]] = batches[i]
        _, l, predictions, lr = session.run(
            [optimizer, loss, train_prediction, learning_rate], feed_dict=feed_dict)
        mean_loss += l
        if step % summary_frequency == 0:
            # At step 0 only a single loss has been accumulated, so it is
            # reported as-is.
            if step > 0:
                mean_loss = mean_loss / summary_frequency
            # The mean loss is an estimate of the loss over the last few batches.
            print(
                'Average loss at step %d: %f learning rate: %f' % (step, mean_loss, lr))
            mean_loss = 0
            # Labels are every batch except the first, stacked in the same
            # order as the concatenated predictions.
            labels = np.concatenate(list(batches)[1:])
            print('Minibatch perplexity: %.2f' % float(
                np.exp(logprob(predictions, labels))))
            if step % (summary_frequency * 10) == 0:
                # Generate some samples.
                print('=' * 80)
                for _ in range(5):
                    # Seed each sentence with a random character, then feed
                    # every sampled character back in to extend it to 80.
                    feed = sample(random_distribution())
                    sentence = characters(feed)[0]
                    reset_sample_state.run()
                    for _ in range(79):
                        prediction = sample_prediction.eval({sample_input: feed})
                        feed = sample(prediction)
                        sentence += characters(feed)[0]
                    print(sentence)
                print('=' * 80)
            # Measure validation set perplexity.
            reset_sample_state.run()
            valid_logprob = 0
            for _ in range(valid_size):
                # valid_batches yields pairs: b[0] is the input, b[1] its label.
                b = valid_batches.next()
                predictions = sample_prediction.eval({sample_input: b[0]})
                valid_logprob = valid_logprob + logprob(predictions, b[1])
            print('Validation set perplexity: %.2f' % float(np.exp(
                valid_logprob / valid_size)))

Initialized
Average loss at step 0: 3.294123 learning rate: 10.000000
Minibatch perplexity: 26.95
iun viz orhh he js  ycibntveelap gnmtz fzo r ddpvwoqt zjehesdeorhiwcoljtlly  a n
breabug ai scnn r ars ach bchtdj w hrnu fdemxte akvgem  zqobgxtbl btudcsteqmtfta
gu pcaseane nulhknjebohxw  ffwho   ttlqep iro ieeabqkh ccfhudoy  ui hev ri o rwx
yznhiaygnzob dkdnkwi t eqjfc nekl gzeflch nekiudkodrakb omceqldndkeo na oughs p 
fnitfc  firwhrvclogcbsyecb rpnkeliote  y dch thmsaush j   qiry plidxhma  i  umv 
Validation set perplexity: 19.83
Average loss at step 100: 2.582345 learning rate: 10.000000
Minibatch perplexity: 12.59
Validation set perplexity: 10.95
Average loss at step 200: 2.234396 learning rate: 10.000000
Minibatch perplexity: 8.38
Validation set perplexity: 8.87
Average loss at step 300: 2.071583 learning rate: 10.000000
Minibatch perplexity: 6.86
Validation set perplexity: 7.86
Average loss at step 400: 1.985008 learning rate: 10.000000
Minibatch perplexity: 6.74
Validation set per

Validation set perplexity: 4.93
Average loss at step 4500: 1.625344 learning rate: 10.000000
Minibatch perplexity: 5.17
Validation set perplexity: 4.83
Average loss at step 4600: 1.624266 learning rate: 10.000000
Minibatch perplexity: 4.99
Validation set perplexity: 4.74
Average loss at step 4700: 1.601149 learning rate: 10.000000
Minibatch perplexity: 5.44
Validation set perplexity: 4.84
Average loss at step 4800: 1.582035 learning rate: 10.000000
Minibatch perplexity: 5.12
Validation set perplexity: 4.87
Average loss at step 4900: 1.592833 learning rate: 10.000000
Minibatch perplexity: 4.97
Validation set perplexity: 4.80
Average loss at step 5000: 1.617651 learning rate: 1.000000
Minibatch perplexity: 5.37
chile war wroth of had be gram do policu intecre pboture and uba axany jemed in 
vel sp is getlaulds played asiabwab of the use a domi with of steamer as grompte
wnotory term in class i plane numment in dhounty janker wift thebran whowe of cl
s this pardions ebmossian alperpo the 

---
Problem 2
---------

We want to train a LSTM over bigrams, that is pairs of consecutive characters like 'ab' instead of single characters like 'a'. Since the number of possible bigrams is large, feeding them directly to the LSTM using 1-hot encodings will lead to a very sparse representation that is very wasteful computationally.

a- Introduce an embedding lookup on the inputs, and feed the embeddings to the LSTM cell instead of the inputs themselves.

b- Write a bigram-based LSTM, modeled on the character LSTM above.

c- Introduce Dropout. For best practices on how to use Dropout in LSTMs, refer to this [article](http://arxiv.org/abs/1409.2329).

---

In [31]:
num_unrollings=10
num_alpha = len(string.ascii_lowercase) + 1
vocabulary_size = num_alpha * num_alpha

def bigram2Index(c1, c2):
    """Encode the character pair (c1, c2) as one integer:
    id(c1) * num_alpha + id(c2)."""
    first_id = char2id(c1)
    second_id = char2id(c2)
    return first_id * num_alpha + second_id

def index2Bigram(idx):
    """Decode a bigram index into its two-character string: the quotient by
    num_alpha gives the first character's id, the remainder the second's."""
    first_id, second_id = divmod(idx, num_alpha)
    return id2char(first_id) + id2char(second_id)

class BigramBatchGenerator(object):
    """Produce unrolled batches of 1-hot encoded bigrams from a text.

    The text is split into `batch_size` equal segments and one cursor walks
    each segment two characters at a time, so consecutive batches hold
    consecutive, non-overlapping character pairs. Positions wrap around at the
    end of the text.
    """

    def __init__(self, text, batch_size, num_unrollings):
        """
        Args:
            text: the string to read characters from.
            batch_size: number of rows (parallel cursors) per batch.
            num_unrollings: number of new batches produced by each `next()`.
        """
        self._text = text
        self._text_size = len(text)
        self._batch_size = batch_size
        self._num_unrollings = num_unrollings
        segment = self._text_size // batch_size
        self._cursor = [offset * segment for offset in range(batch_size)]
        # Prime the generator so the first next() can start with a "previous" batch.
        self._last_batch = self._next_batch()

    def _next_txt_pos(self, i):
        """Return the text position after `i`, wrapping at the end of the text."""
        return (i + 1) % self._text_size

    def _next_batch(self):
        """Generate a single batch from the current cursor position in the data.

        Returns:
            A float64 array of shape (batch_size, vocabulary_size) with a
            single 1.0 per row at the bigram's index; every cursor is advanced
            by two characters.
        """
        # Builtin `float` replaces the `np.float` alias, which was deprecated
        # in NumPy 1.20 and later removed; the resulting dtype is unchanged.
        batch = np.zeros(shape=(self._batch_size, vocabulary_size), dtype=float)
        for b in range(self._batch_size):
            first_pos = self._cursor[b]
            second_pos = self._next_txt_pos(first_pos)
            bigram_id = bigram2Index(self._text[first_pos], self._text[second_pos])
            batch[b, bigram_id] = 1.0
            self._cursor[b] = self._next_txt_pos(second_pos)
        return batch

    def next(self):
        """Generate the next array of batches from the data. The array consists of
        the last batch of the previous array, followed by num_unrollings new ones.

        Returns:
            A list of num_unrollings + 1 batches.
        """
        batches = [self._last_batch]
        batches.extend(self._next_batch() for _ in range(self._num_unrollings))
        self._last_batch = batches[-1]
        return batches

def bigram_characters(probabilities):
    """Decode each row of `probabilities` (1-hot rows or per-row probability
    distributions over bigram indices) into its highest-scoring bigram.

    Returns:
        A list with one two-character string per row.
    """
    best_indices = np.argmax(probabilities, 1)
    return list(map(index2Bigram, best_indices))

def bigram_batches2string(batches):
    """Concatenate, row by row, the most likely bigram of every batch in
    `batches`, giving one string per batch row."""
    rows = [''] * batches[0].shape[0]
    for batch in batches:
        decoded = bigram_characters(batch)
        rows = [prefix + bigram for prefix, bigram in zip(rows, decoded)]
    return rows

# Training generator uses the full batch shape; the validation generator
# serves one bigram at a time (batch of 1, a single unrolling).
train_batches_2 = BigramBatchGenerator(train_text, batch_size, num_unrollings)
valid_batches_2 = BigramBatchGenerator(valid_text, 1, 1)

# Preview two consecutive draws from each generator, decoded back to text.
for preview_source in (train_batches_2, train_batches_2,
                       valid_batches_2, valid_batches_2):
    print(bigram_batches2string(preview_source.next()))


['ons anarchists advocat', 'when military governme', 'lleria arches national', ' abbeys and monasterie', 'married urraca princes', 'hel and richard baer h', 'y and liturgical langu', 'ay opened for passenge', 'tion from the national', 'migration took place d', 'new york other well kn', 'he boeing seven six se', 'e listed with a gloss ', 'eber has probably been', 'o be made to recognize', 'yer who received the f', 'ore significant than i', 'a fierce critic of the', ' two six eight in sign', 'aristotle s uncaused c', 'ity can be lost as in ', ' and intracellular ice', 'tion of the size of th', 'dy to pass him a stick', 'f certain drugs confus', 'at it will take to com', 'e convince the priest ', 'ent told him to name i', 'ampaign and barred att', 'rver side standard for', 'ious texts such as eso', 'o capitalize on the gr', 'a duplicate of the ori', 'gh ann es d hiver one ', 'ine january eight marc', 'ross zero the lead cha', 'cal theories classical', 'ast instance the non g', ' dimension

In [67]:
# Bigram-based LSTM with dropout.
# Inputs and labels are one-hot bigram batches; each input is projected to an
# embedding before entering the LSTM cell. `batch_size`, `num_unrollings` and
# `vocabulary_size` are read from earlier cells and are baked into the tensor
# shapes below when this cell runs.
num_nodes = 64
embedding_size = 128 # Dimension of the embedding vector.
# Passed as the second positional argument of tf.nn.dropout.
# NOTE(review): in TF 1.x that argument is keep_prob; in TF 2.x the same
# position means the drop rate -- confirm against the installed version.
dropout_keep_rate = 0.9

graph_2 = tf.Graph()
with graph_2.as_default():
    
    # Parameters: input, previous output and bias.
    # The weights of all four gates are packed side by side in one matrix, in
    # the column order input gate, forget gate, update, output gate (this is
    # the order lstm_cell slices them in).
    ifcox = tf.Variable(tf.truncated_normal([embedding_size, 4 * num_nodes], -0.1, 0.1)) # applied to the embedded bigram input
    ifcom = tf.Variable(tf.truncated_normal([num_nodes, 4 * num_nodes], -0.1, 0.1))
    ifcob = tf.Variable(tf.zeros([1, 4 * num_nodes]))
    
    # Variables saving state across unrollings.
    saved_output = tf.Variable(tf.zeros([batch_size, num_nodes]), trainable=False)
    saved_state = tf.Variable(tf.zeros([batch_size, num_nodes]), trainable=False)
    # One embedding row per bigram id.
    embeddings = tf.Variable(
        tf.random_uniform([vocabulary_size, embedding_size], -0.1, 0.1))
    # Classifier weights and biases.
    w = tf.Variable(tf.truncated_normal([num_nodes, vocabulary_size], -0.1, 0.1))
    b = tf.Variable(tf.zeros([vocabulary_size]))
    
    # Definition of the cell computation.
    def lstm_cell(i, o, state):
        """Create a LSTM cell. See e.g.: http://arxiv.org/pdf/1402.1128v1.pdf
        Note that in this formulation, we omit the various connections between the
        previous state and the gates.

        Args:
            i: embedded input for this step.
            o: output of the previous step.
            state: cell state of the previous step.

        Returns:
            (output, state) for this step.
        """
        # A single fused matmul yields the pre-activations of all four gates.
        all_gate = tf.matmul(i, ifcox) + tf.matmul(o, ifcom) + ifcob
        input_gate = tf.sigmoid(all_gate[:, 0: num_nodes])
        forget_gate = tf.sigmoid(all_gate[:, num_nodes: 2* num_nodes])
        update = all_gate[:, 2*num_nodes: 3* num_nodes]
        state = forget_gate * state + input_gate * tf.tanh(update)
        output_gate = tf.sigmoid(all_gate[:, 3*num_nodes:])
        return output_gate * tf.tanh(state), state
    
    # Input data: num_unrollings + 1 one-hot bigram batches.
    train_data = list()
    for _ in range(num_unrollings + 1):
        train_data.append(
            tf.placeholder(tf.float32, shape=[batch_size,vocabulary_size]))
    train_inputs = train_data[:num_unrollings]
    train_labels = train_data[1:]    # labels are inputs shifted by one time step.

    # Unrolled LSTM loop.
    outputs = list()
    output = saved_output
    state = saved_state
    for i in train_inputs:
        #embed = tf.nn.embedding_lookup(embeddings, tf.argmax(i, dimension=1))
        # For a one-hot row this matmul picks out the matching row of `embeddings`.
        embed = tf.matmul(i, embeddings)
        # Dropout is applied to the cell input and to the output handed to the
        # classifier; the `output` and `state` carried to the next step are
        # passed on without dropout.
        output, state = lstm_cell(tf.nn.dropout(embed, dropout_keep_rate), output, state)
        outputs.append(tf.nn.dropout(output, dropout_keep_rate))

    # State saving across unrollings.
    with tf.control_dependencies([saved_output.assign(output), saved_state.assign(state)]):
        # Classifier: all unrolled outputs are stacked along the batch axis.
        logits = tf.nn.xw_plus_b(tf.concat(outputs, 0), w, b)
        loss = tf.reduce_mean(
            tf.nn.softmax_cross_entropy_with_logits(
                labels=tf.concat(train_labels, 0), logits=logits))

    # Optimizer: gradient descent with a learning rate that starts at 10.0 and
    # is multiplied by 0.1 every 5000 steps; gradients are clipped to a global
    # norm of 1.25 before being applied.
    global_step = tf.Variable(0)
    learning_rate = tf.train.exponential_decay(
        10.0, global_step, 5000, 0.1, staircase=True)
    optimizer = tf.train.GradientDescentOptimizer(learning_rate)
    gradients, v = zip(*optimizer.compute_gradients(loss))
    gradients, _ = tf.clip_by_global_norm(gradients, 1.25)
    # From here on `optimizer` names the training op, not the optimizer object.
    optimizer = optimizer.apply_gradients(
        zip(gradients, v), global_step=global_step)

    # Predictions.
    train_prediction = tf.nn.softmax(logits)
    
    # Sampling and validation eval: batch 1, no unrolling.
    # No tf.nn.dropout call is made on this path.
    sample_input = tf.placeholder(tf.float32, shape=[1, vocabulary_size])
    #sample_embed = tf.nn.embedding_lookup(embeddings, tf.argmax(sample_input, dimension=1))
    sample_embed = tf.matmul(sample_input, embeddings)
    saved_sample_output = tf.Variable(tf.zeros([1, num_nodes]))
    saved_sample_state = tf.Variable(tf.zeros([1, num_nodes]))
    # Running this op zeroes the sampling output and state.
    reset_sample_state = tf.group(
        saved_sample_output.assign(tf.zeros([1, num_nodes])),
        saved_sample_state.assign(tf.zeros([1, num_nodes])))
    sample_output, sample_state = lstm_cell(
        sample_embed, saved_sample_output, saved_sample_state)
    # Evaluating sample_prediction also stores the new output/state, so
    # successive evaluations continue the same sequence.
    with tf.control_dependencies([saved_sample_output.assign(sample_output),
                                                                saved_sample_state.assign(sample_state)]):
        sample_prediction = tf.nn.softmax(tf.nn.xw_plus_b(sample_output, w, b))

In [68]:
# Train the bigram LSTM defined in graph_2.
# NOTE(review): graph_2's placeholders were already built from batch_size and
# num_unrollings, so the two values below must match the ones in effect when
# the graph cell ran.
batch_size=64
num_unrollings=10
num_steps = 7001
summary_frequency = 100

with tf.Session(graph=graph_2) as session:
    tf.global_variables_initializer().run()
    print('Initialized')
    mean_loss = 0
    for step in range(num_steps):
        batches = train_batches_2.next()
        # One placeholder per unrolled step plus one for the final label.
        feed_dict = dict()
        for i in range(num_unrollings + 1):
            feed_dict[train_data[i]] = batches[i]
        _, l, predictions, lr = session.run(
            [optimizer, loss, train_prediction, learning_rate], feed_dict=feed_dict)
        mean_loss += l
        if step % summary_frequency == 0:
            # At step 0 only a single loss has been accumulated, so no division.
            if step > 0:
                mean_loss = mean_loss / summary_frequency
            # The mean loss is an estimate of the loss over the last few batches.
            print(
                'Average loss at step %d: %f learning rate: %f' % (step, mean_loss, lr))
            mean_loss = 0
            # Labels are every batch except the first, matching train_labels.
            labels = np.concatenate(list(batches)[1:])
            print('Minibatch perplexity: %.2f' % float(
                np.exp(logprob(predictions, labels))))
            if step % (summary_frequency * 10) == 0:
                # Generate some samples: five lines, each a seed bigram followed
                # by 39 sampled bigrams (80 characters).
                print('=' * 80)
                for _ in range(5):
                    feed = sample(random_distribution())
                    sentence = bigram_characters(feed)[0]
                    reset_sample_state.run()
                    for _ in range(39):
                        prediction = sample_prediction.eval({sample_input: feed})
                        feed = sample(prediction)
                        sentence += bigram_characters(feed)[0]
                    print(sentence)
                print('=' * 80)
            # Measure validation set perplexity.
            # NOTE(review): valid_batches_2 consumes two characters per call, so
            # valid_size iterations wrap around the valid_size-character
            # validation text twice; the average below is per bigram.
            reset_sample_state.run()
            valid_logprob = 0
            for _ in range(valid_size):
                # b[0] is the input bigram and b[1] the bigram that follows it.
                b = valid_batches_2.next()
                predictions = sample_prediction.eval({sample_input: b[0]})
                valid_logprob = valid_logprob + logprob(predictions, b[1])
            print('Validation set perplexity: %.2f' % float(np.exp(
                valid_logprob / valid_size)))

Initialized
Average loss at step 0: 6.592343 learning rate: 10.000000
Minibatch perplexity: 729.49
yrfndlaxaihlscfqbmqkvgupkjyo uvllphmlsarljmcjaklbhekchyzpvkhncjzrdxqyifadiqvzvtv
wmcwp olsxigjhxbobyemvcbkfhltjavaeeoqndtmmlunequmelj qnigkpzuv orrixez ykidyy yo
sbscaxzadivnzmhbm vffuxzorvt arrbplpssqtptkhehhyincfxlykocfpqcpjkz alcsrlhhswgig
mxnvowijolulnfyasfbrrtg pvhmnsfsddyglqzggchlll a hefxxfqeuvciwtupjudofelsfihqkop
lkkroaxpdkqlqtxdmdrfujsvlue dlrdapcwaurlaqnsuhg rbezbxpwhvjpytkn lkbyzdcnafktjfr
Validation set perplexity: 674.72
Average loss at step 100: 5.319384 learning rate: 10.000000
Minibatch perplexity: 124.71
Validation set perplexity: 140.27
Average loss at step 200: 4.644044 learning rate: 10.000000
Minibatch perplexity: 76.54
Validation set perplexity: 91.46
Average loss at step 300: 4.255024 learning rate: 10.000000
Minibatch perplexity: 59.58
Validation set perplexity: 74.69
Average loss at step 400: 4.006368 learning rate: 10.000000
Minibatch perplexity: 62.09
Validatio

Average loss at step 4400: 3.264284 learning rate: 10.000000
Minibatch perplexity: 29.49
Validation set perplexity: 22.96
Average loss at step 4500: 3.284672 learning rate: 10.000000
Minibatch perplexity: 25.54
Validation set perplexity: 22.96
Average loss at step 4600: 3.276545 learning rate: 10.000000
Minibatch perplexity: 27.23
Validation set perplexity: 22.35
Average loss at step 4700: 3.299826 learning rate: 10.000000
Minibatch perplexity: 22.44
Validation set perplexity: 23.28
Average loss at step 4800: 3.265827 learning rate: 10.000000
Minibatch perplexity: 28.56
Validation set perplexity: 22.72
Average loss at step 4900: 3.249930 learning rate: 10.000000
Minibatch perplexity: 28.77
Validation set perplexity: 24.05
Average loss at step 5000: 3.288751 learning rate: 1.000000
Minibatch perplexity: 32.78
vk boom jown have struw elacement however fact brdper bombers more also the jewi
wd a morfistal frek in the director techorited and ang this of stille to it is h
ture futlynnumause

---
Problem 3
---------

(difficult!)

Write a sequence-to-sequence LSTM which mirrors all the words in a sentence. For example, if your input is:

    the quick brown fox
    
the model should attempt to output:

    eht kciuq nworb xof
    
Refer to the lecture on how to put together a sequence-to-sequence model, as well as [this article](http://arxiv.org/abs/1409.3215) for best practices.

---

The following code uses two modules from TensorFlow's English-to-French translation tutorial: `seq2seq_model.py` and `data_utils.py`.


Before running the cells below, please download that code from https://github.com/tensorflow/models/tree/master/tutorials/rnn/translate into the local directory.

In [9]:
import nltk
import re
import seq2seq_model as seq2seq_model

In [10]:
# Hyper-parameters of the sequence-to-sequence model.

# Optimisation settings.
learning_rate = 0.5
learning_rate_decay = 0.9
max_gradient_norm = 5

# Network shape: units per layer and number of stacked layers.
layer_size = 64
num_layers = 3

# Symbols: a-z and space, plus the GO, PAD and EOS markers.
vocabulary_size = 4 + len(string.ascii_lowercase)

# A single bucket whose encoder and decoder lengths are both bucket_size.
bucket_size = 25
buckets = [(bucket_size, bucket_size)]
batch_size = 32

class InvertBatchGenerator(object):
    """Produce batches for the word-mirroring sequence-to-sequence task.

    Each sentence (a sequence of word strings) is turned into a pair of
    fixed-length id sequences of `bucket_size` symbols:

    * encoder input: the cleaned sentence followed by PAD, the whole
      sequence then reversed (so the padding comes first);
    * decoder input: GO, the sentence with every word mirrored, EOS, then PAD.

    Sentences are consumed cyclically.
    """

    # Ids of the special symbols; they follow the character ids 0-26.
    _EOS = 28
    _PAD = 27
    _GO = 29

    def __init__(self, sentences, batch_size, bucket_size):
        """
        Args:
            sentences: indexable collection of sentences, each an iterable of
                word strings.
            batch_size: number of sentences per batch.
            bucket_size: length of every encoder and decoder sequence.
        """
        self._sentences = sentences
        self._sentence_size = len(sentences)
        self._batch_size = batch_size
        self._bucket_size = bucket_size
        self._cursor = 0

    def _create_encoder_decoder_input(self, sentence, max_size):
        """Clean `sentence` and build its word-mirrored counterpart.

        Words are lower-cased and stripped of every character other than a-z
        and space; words that end up empty are skipped. Words are appended
        until the next one would push the text, counted with one separating
        space in front of every word, beyond `max_size` characters.

        Args:
            sentence: iterable of word strings.
            max_size: character budget described above.

        Returns:
            (text, mirrored_text): the cleaned words joined by single spaces,
            and the same words each reversed in place.
        """
        # Use the sentence that was passed in; the argument used to be ignored
        # in favour of the sentence under the generator's cursor.
        text = ""
        mirrored = ""
        for word in sentence:
            s_word = re.sub("[^a-z ]+", "", word.lower()).strip()
            if len(s_word) == 0:
                continue
            if len(text) + len(s_word) + 1 > max_size:
                break
            text = text + ' ' + s_word
            mirrored = mirrored + ' ' + s_word[::-1]

        return text.strip(), mirrored.strip()

    def _next_batch(self):
        """Encode the sentence under the cursor and advance the cursor.

        Returns:
            (encoder_ids, decoder_ids): two lists of symbol ids (not one-hot
            vectors). For "the quick brown fox" the decoder ids spell
            GO + "eht kciuq nworb xof" + EOS followed by padding, and the
            encoder ids spell the padded sentence read backwards.
        """
        # Two positions of the decoder sequence are reserved for GO and EOS.
        sent, r_sent = self._create_encoder_decoder_input(
            self._sentences[self._cursor], self._bucket_size - 2)
        self._cursor = (self._cursor + 1) % self._sentence_size

        padding_len = self._bucket_size - len(sent)
        pad_sent = [char2id(c) for c in sent] + [self._PAD] * padding_len
        pad_sent.reverse()
        pad_r_sent = [self._GO]
        pad_r_sent.extend([char2id(c) for c in r_sent])
        pad_r_sent.append(self._EOS)
        pad_r_sent.extend([self._PAD] * (padding_len - 2))

        return pad_sent, pad_r_sent

    def next(self):
        """Generate the next batch in time-major layout.

        Returns:
            (batch_encoder_inputs, batch_decoder_inputs, batch_weights): three
            lists of `bucket_size` arrays, each array of shape (batch_size,).
            The first two hold int32 symbol ids for one time step; the third
            holds float32 target weights. The weight at step t is 0.0 when the
            target (the decoder input at step t + 1) is PAD, and at the last
            step, which has no target; otherwise it is 1.0.
        """
        encoder_rows, decoder_rows = [], []
        for _ in range(self._batch_size):
            encoder_row, decoder_row = self._next_batch()
            encoder_rows.append(encoder_row)
            decoder_rows.append(decoder_row)

        # Batch-major matrices of shape (batch_size, bucket_size).
        matrix_shape = (self._batch_size, self._bucket_size)
        encoder_matrix = np.array(encoder_rows, dtype=np.int32).reshape(matrix_shape)
        decoder_matrix = np.array(decoder_rows, dtype=np.int32).reshape(matrix_shape)

        # Targets are the decoder inputs shifted left by one step; the last
        # column keeps its zero weight because it has no target.
        weight_matrix = np.zeros(matrix_shape, dtype=np.float32)
        weight_matrix[:, :-1] = decoder_matrix[:, 1:] != self._PAD

        # Re-slice per time step; copy so every array owns contiguous memory.
        steps = list(range(self._bucket_size))
        batch_encoder_inputs = [encoder_matrix[:, t].copy() for t in steps]
        batch_decoder_inputs = [decoder_matrix[:, t].copy() for t in steps]
        batch_weights = [weight_matrix[:, t].copy() for t in steps]

        return batch_encoder_inputs, batch_decoder_inputs, batch_weights

def id2char_inv(dictid):
    """Render a symbol id of the mirroring task as one printable character.

    Ids 1-26 map to consecutive characters starting at `first_letter`; the
    EOS, GO and PAD markers are shown as 'E', 'G' and 'P'. Id 0 and any
    other id are shown as a space.
    """
    if 0 < dictid <= 26:
        return chr(dictid + first_letter - 1)
    special_symbols = (
        (InvertBatchGenerator._EOS, 'E'),
        (InvertBatchGenerator._GO, 'G'),
        (InvertBatchGenerator._PAD, 'P'),
    )
    for special_id, symbol in special_symbols:
        if dictid == special_id:
            return symbol
    return ' '
    
def characters_inv(probabilities):
    """Return the printable character of the highest-scoring entry of
    `probabilities` (argmax taken over the flattened input)."""
    best_id = np.argmax(probabilities)
    return id2char_inv(best_id)

def batches2string_inv(batches):
    """Turn a time-major sequence of id batches into one string per batch
    row, rendering every id with id2char_inv."""
    rows = [''] * batches[0].shape[0]
    for step_ids in batches:
        step_chars = [id2char_inv(symbol_id) for symbol_id in step_ids]
        rows = [prefix + char for prefix, char in zip(rows, step_chars)]
    return rows

#def batches2string(batches):
#    """Convert a sequence of batches back into their (most likely) string
#    representation."""
#    s = [''] * batches[0].shape[0]
#    for b in batches:
#        s = [''.join(x) for x in zip(s, characters(b))]
#    return s

# Two hand-written sentences for a quick look at the encoding; the training
# sentences come from the 'shakespeare-caesar.txt' file of NLTK's Gutenberg corpus.
valid_sentences = [
    ["the", "quick", "brown", "fox"],
    ["I", "go", "to", "school", "by", "bus"],
]
train_sentences = nltk.corpus.gutenberg.sents('shakespeare-caesar.txt')

train_batches_inv = InvertBatchGenerator(train_sentences, batch_size, bucket_size)
valid_batches_inv = InvertBatchGenerator(valid_sentences, batch_size, bucket_size)

# Show one decoded training batch: encoder inputs first, then decoder inputs.
print("Training set")
encoder_i, decoder_i, weight = train_batches_inv.next()
print(batches2string_inv(encoder_i))
print(batches2string_inv(decoder_i))

# The validation generator cycles over its two sentences, so only the first
# two rows of the batch are distinct.
print("Validation set")
encoder_i, decoder_i, weight = valid_batches_inv.next()
print(batches2string_inv(encoder_i)[:2])
print(batches2string_inv(decoder_i)[:2])

Training set
['PPPsuiluj fo eidegart eht', 'PPPPPPPPPPPPPsumirp sutca', 'PPPPPPPPPPPPPamirp aneocs', 'PPPsullerum suiualf retne', 'PPPPPPPPPPPPPPPPPPsuiualf', 'PPPPPPeldi uoy emoh ecneh', 'PPPPPPPPton uoy wonk tahw', 'PPPPtra edart tahw ekaeps', 'PPPPPPPPPPPPPPPPPPPPPPrac', 'PPPPPPretneprac a ris yhw', 'PPPPPPPPPPPPPPPPPPPPPPrum', 'PPPPPrehtael yht si erehw', 'PPPPPPhtiw uoht tsod tahw', 'PPPera edart tahw ris uoy', 'PPPPPPPPPPPPPPPPPPPPPlboc', 'PPPPtcepser ni ris yleurt', 'PPPPPPPPPPPPPPPPPPPPPPrum', 'PPPPPPPtra edart tahw tub', 'PPPPPPPyltcerid em rewsna', 'PPPPPPPPPPPPPPPPPPPPPPboc', 'PPPPPPPi taht ris edart a', 'PPPPPPuoht edart tahw alf', 'PPPPPPPeuank ythguan uoht', 'PPPPPPPPPPPPPPPPPPPPPlboc', 'PPPPris uoy hceeseb i yan', 'PPPPPPPPPPPPPPPPPPPPPPrum', 'PPPPPyb uoht ts naem tahw', 'PPPPPPycwas uoht eem dnem', 'PPPPPPPPPPPPPPPPPPPPPPboc', 'PPPPPPPuoy elbboc ris yhw', 'PPPPrelboc a tra uoht alf', 'PPPPPPPPPPPPPPPPPPPPPPboc']
['Geht eidegart fo suilujEP', 'Gsutca sumirpEPPPPPPPPPPP',

In [11]:
def create_model():
    """Build the trainable seq2seq model from the notebook-level hyper-parameters.

    Source and target share one vocabulary, a single bucket is used, and the
    cells are LSTMs.
    """
    model_options = dict(
        source_vocab_size=vocabulary_size,
        target_vocab_size=vocabulary_size,
        buckets=buckets,  # only 1 bucket
        size=layer_size,
        num_layers=num_layers,
        max_gradient_norm=max_gradient_norm,
        batch_size=batch_size,
        learning_rate=learning_rate,
        learning_rate_decay_factor=learning_rate_decay,
        use_lstm=True,
        forward_only=False,
    )
    return seq2seq_model.Seq2SeqModel(**model_options)


model = create_model()

In [12]:
# The running loss and a decoded sample batch are reported every
# `summary_frequency` steps.
summary_frequency = 3000
# Ten reporting intervals; the extra step makes the final one land on a report.
num_steps = 10 * summary_frequency + 1

#def reverse_text(nb_steps):
#    with tf.Session() as session:
#        model = create_model()
#        tf.initialize_all_variables().run()
#        for step in xrange(nb_steps):
#            enc_inputs, dec_inputs, weights = get_batch()
#            _, loss, _ = model.step(session, enc_inputs, dec_inputs, weights, 0, False)
#            if step % 1000 == 1:
#                print('* step:', step, 'loss:', loss)
#                validate_model(text, model, session)
#        print('*** evaluation! loss:', loss)
#        validate_model(text, model, session)

# Train the seq2seq model and, at every summary step, print the model's
# decoded output next to the expected decoder sequence.
with tf.Session() as session:
    print('Initialized')
    mean_loss = 0
    # tf.initialize_all_variables() is deprecated (TensorFlow prints a warning
    # asking for this replacement, which the bigram training cell already uses).
    tf.global_variables_initializer().run()
    for step in range(num_steps):
        encoder_input, decoder_input, weights = train_batches_inv.next()
        _, loss, _ = model.step(session, encoder_input, decoder_input, weights, 0, False)
        mean_loss += loss
        if step % summary_frequency == 0:
            # At step 0 only a single loss has been accumulated, so no division.
            if step > 0:
                mean_loss = mean_loss / summary_frequency
            # The mean loss is an estimate of the loss over the last few batches.
            print('Average loss at step %d: %f' % (step, mean_loss))
            mean_loss = 0

            # Decode a fresh batch: the decoder is fed GO followed by PAD only,
            # and only the first step carries a non-zero weight.
            # NOTE(review): `model` was created with forward_only=False;
            # presumably the decoder then consumes these fed inputs rather than
            # its own previous predictions -- confirm against seq2seq_model.py.
            encoder_inputs, target_decoder_input, _ = train_batches_inv.next()
            decoder_inputs = np.full((bucket_size, batch_size),
                                     InvertBatchGenerator._PAD, dtype=np.int32)
            decoder_inputs[0, :] = InvertBatchGenerator._GO

            weights = np.zeros((bucket_size, batch_size), dtype=np.float32)
            weights[0, :] = 1.0
            _, _, output_logits = model.step(session, encoder_inputs, decoder_inputs, weights,
                                             bucket_id=0, forward_only=True)

            # Display result: prediction, a tab, then the expected sequence.
            print("=========================\t=========================")
            decoder_out_strs = batches2string_inv(target_decoder_input)
            # output_logits holds one entry per time step, each indexed by
            # sample. Iterate over the samples (one expected string each), not
            # over len(output_logits), which counts time steps.
            for sample_idx, expected_str in enumerate(decoder_out_strs):
                sample_logits = [step_logits[sample_idx] for step_logits in output_logits]
                outputs_char = [int(np.argmax(logit)) for logit in sample_logits]
                out_sentence = ''.join([id2char_inv(c) for c in outputs_char])
                print(out_sentence + '\t' + expected_str)
            print("=========================\t=========================")

print("Training finished at: ")
print(datetime.datetime.now())

Initialized
Instructions for updating:
Use `tf.global_variables_initializer` instead.
Average loss at step 0: 3.412030
eeeEEEEEEEEEEEEEEEEEEEEEE	GainruhplacEPPPPPPPPPPPPP
eeeEEEEEEEEEEEEEEEEEEEEEE	GksacEPPPPPPPPPPPPPPPPPPP
 EEEEEEEEEEEEEEEEEEEEEEEE	Gecaep oh raseacEPPPPPPPP
eeEEEEEEEEEEEEEEEEEEEEEEE	GseacEPPPPPPPPPPPPPPPPPPP
eeeEEEEEEEEEEEEEEEEEEEEEE	GainruhplacEPPPPPPPPPPPPP
eeEEEEEEEEEEEEEEEEEEEEEEE	GplacEPPPPPPPPPPPPPPPPPPP
 EEEEEEEEEEEEEEEEEEEEEEEE	Gereeh ym drolEPPPPPPPPPP
eeEEEEEEEEEEEEEEEEEEEEEEE	GseacEPPPPPPPPPPPPPPPPPPP
  EEEEEEEEEEEE           	Gdnats uoy yltcerid niEPP
 EEEEEEEEEEEEEEEEEEEEEEEE	GoinotnaEPPPPPPPPPPPPPPPP
eeEEEEEEEEEEEEEEEEEEEEEEE	GtnaEPPPPPPPPPPPPPPPPPPPP
  EEEEEEEEEEEEEEEEEEEEEEE	Grasc ym drolEPPPPPPPPPPP
eeEEEEEEEEEEEEEEEEEEEEEEE	GseacEPPPPPPPPPPPPPPPPPPP
EEEEEEEEEEEEEEEEEEEEEEEEE	Gtegrof ton ni ruoyEPPPPP
eeEEEEEEEEEEEEEEEEEEEEEEE	GtnaEPPPPPPPPPPPPPPPPPPPP
     EEEEE               	Gi llahs rebmemer nehwEPP
eeEEEEEEEEEEEEEEEEEEEEEEE	GseacEPPPPPPPPPPPPPPPPP

Average loss at step 18000: 0.000183
kssaallllllEEEEEEEEEEEEEE	GksacEPPPPPPPPPPPPPPPPPPP
eeelllllssss i     EEEEEE	Gekaeps sdnah rof emEPPPP
yeeetbbtccaaaaaaaaaaaaaaa	Gyeht bats raseacEPPPPPPP
ssaiis     uuuuuuuuuuuuuu	Gseac te ut eturbEPPPPPPP
neet     laaaaaaaaaaaaaaa	Gneht llaf raseacEPPPPPPP
ssessEEEEEEEEEEEEEEEEEEEE	GseydEPPPPPPPPPPPPPPPPPPP
nncccEEEEEEEaaaaaaaaaaaaa	GnicEPPPPPPPPPPPPPPPPPPPP
yrrrrrruueeeeeeeeeeeeeuuu	Gytrebil emodeerfEPPPPPPP
isssiillEEEEEEEEEEEEEEEEE	GissacEPPPPPPPPPPPPPPPPPP
eeooo           ooooooooo	Gemos ot eht nommocEPPPPP
urbbEEEEEEEEEEEEEEEEEEEEE	GurbEPPPPPPPPPPPPPPPPPPPP
eeeluuu      ccuoooooo EE	Gelpoep dna srotanes ebEP
kssaallllllEEEEEEEEEEEEEE	GksacEPPPPPPPPPPPPPPPPPPP
oo               EEEEEEEE	Gog ot eht tiplupEPPPPPPP
cccd    iiiiiiisiluuuEEEE	Gced dna suissac ootEPPPP
urbbEEEEEEEEEEEEEEEEEEEEE	GurbEPPPPPPPPPPPPPPPPPPPP
eeeeee     uuuuullEEEEEEE	Gerehw s suilbupEPPPPPPPP
nncccEEEEEEEaaaaaaaaaaaaa	GnicEPPPPPPPPPPPPPPPPPPPP
eeeeeeeuuuuuuuddddddddddo	G

In [167]:
# Development helper: re-import seq2seq_model after editing it on disk.
# The `imp` module is deprecated and absent from recent Python releases;
# importlib.reload is its documented replacement.
import importlib

importlib.reload(seq2seq_model)

<module 'seq2seq_model' from 'c:\\GitProjects\\PythonPlayground\\DataScience\\Tensorflow\\seq2seq_model.py'>

In [19]:
import datetime



2017-07-03 01:01:27.752803


In [None]:
# Use the English text corpora that ship with Python's NLTK.
# Write a function to reverse every sentence.



In [None]:
import tensorflow.models.rnn.translate.seq2seq_model as seq2seq_model

In [None]:
import string

# NOTE(review): this cell has no execution count and looks like a leftover
# draft. Running it would overwrite num_nodes, batch_size and the
# create_model() defined in an earlier cell.
num_nodes = 64
batch_size = 10

def create_model():
     """Build a trainable seq2seq_model.Seq2SeqModel with a single bucket of
     (word_size + 1, word_size + 2), three LSTM layers of num_nodes units.

     NOTE(review): `word_size` is not defined in the visible part of the
     notebook, so calling this would presumably raise NameError -- confirm
     before keeping the cell.
     """
     return seq2seq_model.Seq2SeqModel(source_vocab_size=vocabulary_size,
                                   target_vocab_size=vocabulary_size,
                                   buckets=[(word_size + 1, word_size + 2)], # only 1 bucket
                                   size=num_nodes,
                                   num_layers=3,
                                   max_gradient_norm=5.0,
                                   batch_size=batch_size,
                                   learning_rate=0.5,
                                   learning_rate_decay_factor=0.99,
                                   use_lstm=True,
                                   forward_only=False)