In [1]:
from __future__ import print_function

import numpy as np
import tensorflow as tf
%matplotlib inline
import matplotlib.pyplot as plt
import time
import os
import urllib2
from ptb import reader

In [2]:
"""
Load and process data, utility functions
"""

file_url = "https://raw.githubusercontent.com/jcjohnson/torch-rnn/master/data/tiny-shakespeare.txt"
file_name = "data/tinyshakespeare.txt"

if not os.path.exists("data/"):
    os.makedirs("data/")

if not os.path.exists(file_name):
    response = urllib2.urlopen(file_url)

    fh = open(file_name, "w")
    fh.write(response.read())
    fh.close()

with open(file_name, "r") as f:
    raw_data = f.read()
    print("Data length:", len(raw_data))

vocab = set(raw_data)
vocab_size = len(vocab)
idx_to_vocab = dict(enumerate(vocab))
vocab_to_idx = dict(zip(idx_to_vocab.values(), idx_to_vocab.keys()))

data = [vocab_to_idx[c] for c in raw_data]
del raw_data

def gen_epochs(n, num_steps, batch_size):
    for i in range(n):
        yield reader.ptb_iterator(data, batch_size, num_steps)

def reset_graph():
    if "sess" in globals() and sess:
        sess.close()
    tf.reset_default_graph()

def train_network(g, num_epochs, num_steps=200, batch_size=32, verbose=True, save=False):
    tf.set_random_seed(2345)
    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        training_losses = []
        for idx, epoch in enumerate(gen_epochs(num_epochs, num_steps, batch_size)):
            training_loss = 0
            steps = 0
            training_state = None
            
            for X, Y in epoch:
                steps += 1
                feed_dict={g["x"]:X, g["y"]:Y}
                if training_state is not None:
                    feed_dict[g["init_state"]] = training_state
                training_loss_, training_state, _ = sess.run([g["total_loss"],
                                                              g["final_state"],
                                                              g["train_step"]],
                                                                feed_dict)
                training_loss += training_loss_
            
            if verbose:
                print("Average training loss for Epoch", idx, ": ", training_loss/steps)
            training_losses.append(training_loss/steps)
        
        if isinstance(save, str):
            g["saver"].save(sess, save)
                    
    return training_losses

Data length: 1115394


In [3]:
def build_basic_rnn_graph_with_list(
    state_size = 100,
    num_classes = vocab_size,
    batch_size = 32,
    num_steps = 200,
    learning_rate = 1e-4):
    
    reset_graph()
    
    x = tf.placeholder(tf.int32, [batch_size, num_steps], name="input_placeholder")
    y = tf.placeholder(tf.int32, [batch_size, num_steps], name="output_placeholder")

    """
    RNN Inputs
    """
    embeddings = tf.get_variable("embedding_matrix", [num_classes, state_size])
    rnn_inputs = [tf.squeeze(i) for i in tf.split(tf.nn.embedding_lookup(embeddings,x), num_steps, 1)]
    
    cell = tf.contrib.rnn.BasicRNNCell(num_units=state_size)
    init_state = cell.zero_state(batch_size, tf.float32)
    rnn_outputs, final_state = tf.contrib.rnn.static_rnn(cell, rnn_inputs, initial_state=init_state)
    
    # logits and predictions
    with tf.variable_scope("softmax"):
        W = tf.get_variable("W", [state_size, num_classes])
        b = tf.get_variable("b", [num_classes], initializer=tf.constant_initializer(0.0))

    logits = [tf.matmul(rnn_output, W) + b for rnn_output in rnn_outputs]

    y_as_list = [tf.squeeze(i, squeeze_dims=[1]) for i in tf.split(y, num_steps, 1)]
    
    
    # losses and train_step
    loss_weights = [tf.ones([batch_size]) for i in range(num_steps)]
    losses = tf.contrib.legacy_seq2seq.sequence_loss_by_example(logits, y_as_list, loss_weights)
    total_loss = tf.reduce_mean(losses)
    train_step = tf.train.AdamOptimizer(learning_rate).minimize(total_loss)
    
    return dict(x = x,
                y = y,
                init_state = init_state,
                final_state = final_state,
                total_loss = total_loss,
                train_step = train_step
               )

In [None]:
t = time.time()
build_basic_rnn_graph_with_list()
print("It took", time.time() - t, "seconds to build the graph.")

In [14]:
def build_multilayer_lstm_graph_with_list(
    state_size = 100,
    num_classes = vocab_size,
    batch_size = 32,
    num_steps = 200,
    num_layers = 3,
    learning_rate = 1e-4):
    
    reset_graph()
    
    x = tf.placeholder(tf.int32, [batch_size, num_steps], name="input_placeholder")
    y = tf.placeholder(tf.int32, [batch_size, num_steps], name="output_placeholder")

    """
    RNN Inputs
    """
    embeddings = tf.get_variable("embedding_matrix", [num_classes, state_size])
    rnn_inputs = [tf.squeeze(i) for i in tf.split(tf.nn.embedding_lookup(embeddings,x), num_steps, 1)]

    cell = tf.contrib.rnn.LSTMCell(num_units=state_size)
    cell = tf.contrib.rnn.MultiRNNCell([cell]*num_layers)
    init_state = cell.zero_state(batch_size, tf.float32)
    rnn_outputs, final_state = tf.contrib.rnn.static_rnn(cell, rnn_inputs, initial_state=init_state)

    # logits and predictions
    with tf.variable_scope("softmax"):
        W = tf.get_variable("W", [state_size, num_classes])
        b = tf.get_variable("b", [num_classes], initializer=tf.constant_initializer(0.0))

    logits = [tf.matmul(rnn_output, W) + b for rnn_output in rnn_outputs]

    y_as_list = [tf.squeeze(i, squeeze_dims=[1]) for i in tf.split(y, num_steps, 1)]

    # losses and train_step
    loss_weights = [tf.ones([batch_size]) for i in range(num_steps)]
    losses = tf.contrib.legacy_seq2seq.sequence_loss_by_example(logits, y_as_list, loss_weights)
    total_loss = tf.reduce_mean(losses)
    train_step = tf.train.AdamOptimizer(learning_rate).minimize(total_loss)
    
    return dict(x = x,
                y = y,
                init_state = init_state,
                final_state = final_state,
                total_loss = total_loss,
                train_step = train_step
               )

In [None]:
t = time.time()
build_multilayer_lstm_graph_with_list()
print("It took", time.time() - t, "seconds to build the graph.")

In [15]:
def build_multilayer_lstm_graph_with_dynamic_rnn(
    state_size = 100,
    num_classes = vocab_size,
    batch_size = 32,
    num_steps = 200,
    num_layers = 3,
    learning_rate = 1e-4):
    
    reset_graph()
    
    x = tf.placeholder(tf.int32, [batch_size, None], name="input_placeholder")
    y = tf.placeholder(tf.int32, [batch_size, None], name="output_placeholder")

    """
    RNN Inputs
    """
    embeddings = tf.get_variable("embedding_matrix", [num_classes, state_size])
    rnn_inputs = tf.nn.embedding_lookup(embeddings,x)

    cell = tf.contrib.rnn.LSTMCell(num_units=state_size)
    cell = tf.contrib.rnn.MultiRNNCell([cell]*num_layers)
    init_state = cell.zero_state(batch_size, tf.float32)
    rnn_outputs, final_state = tf.nn.dynamic_rnn(cell, rnn_inputs, initial_state=init_state)

    # logits and predictions
    with tf.variable_scope("softmax"):
        W = tf.get_variable("W", [state_size, num_classes])
        b = tf.get_variable("b", [num_classes], initializer=tf.constant_initializer(0.0))

    rnn_outputs = tf.reshape(rnn_outputs, [-1, state_size])
    y_reshaped  = tf.reshape(y, [-1])
    logits = tf.matmul(rnn_outputs, W) + b

    # losses and train_step
    loss_weights = [tf.ones([batch_size]) for i in range(num_steps)]
    losses = tf.nn.sparse_softmax_cross_entropy_with_logits(labels = y_reshaped, logits = logits)
    total_loss = tf.reduce_mean(losses)
    train_step = tf.train.AdamOptimizer(learning_rate).minimize(total_loss)
    
    return dict(x = x,
                y = y,
                init_state = init_state,
                final_state = final_state,
                total_loss = total_loss,
                train_step = train_step
               )

In [8]:
t = time.time()
build_multilayer_lstm_graph_with_dynamic_rnn()
print("It took", time.time() - t, "seconds to build the graph")

It took 1.38076400757 seconds to build the graph


In [9]:
g = build_multilayer_lstm_graph_with_dynamic_rnn()
t = time.time()
train_network(g,3)
print("It took", time.time() - t, "seconds to train for 3 epochs.")

Average training loss for Epoch 0 :  3.5336919008
Average training loss for Epoch 1 :  3.31842687471
Average training loss for Epoch 2 :  3.31412000384
It took 283.329195023 seconds to train for 3 epochs.


In [16]:
def ln(tensor, scope=None, epsilon=1e-5):
    """ Layer normalizes a 2D tensor along its second axis """
    assert(len(tensor.get_shape())==2)
    m,v = tf.nn.moments(tensor, [1], keep_dims=True)
    if not isinstance(scope, str):
        scope = ""
    with tf.variable_scope(scope + "layer_norm"):
        scale = tf.get_variable("scale", 
                                shape = [tensor.get_shape()[1]],
                                initializer = tf.constant_initializer(1))
        shift = tf.get_variable("shift",
                                shape = [tensor.get_shape()[1]],
                                initializer = tf.constant_initializer(0))
    
    LN_initial = (tensor - m) / tf.sqrt(v + epsilon)
    
    return LN_initial * scale + shift

In [18]:
class LayerNormalizedLSTMCell(tf.contrib.rnn.BasicRNNCell):
    """
    Adapted from TF's BasicRNNCell to use Layer Normalization.
    """
    
    def __init__(self, num_units, forget_bias=1.0, activation=tf.nn.tanh):
        self._num_units = num_units
        self._forget_bias = forget_bias
        self._activation = activation
    
    @property
    def state_size(self):
        return tf.contrib.rnn.LSTMStateTuple(self._num_units, self._num_units)
    
    @property
    def output_size(self):
        return self._num_units
    
    def __call__(self, inputs, state, scope=None):
        """Long short-term memory cell (LSTM)"""
        c, h = state
        
        concat = tf.nn.rnn._linear([inputs, h], 4 * self._num_units, False)

        # i = input_gate, j = new_input, f = forget_gate, o = output_gate
        i, j, f, o = tf.split(value=concat, num_or_size_splits=4, axis=1)
        
        # add layer normalization to each gate
        i = ln(i, scope="i/")
        j = ln(j, scope="j/")
        f = ln(f, scope="f/")
        o = ln(o, scope="o/")

        new_c = (c * tf.nn.sigmoid(f + self._forget_bias) + tf.nn.sigmoid(i) *
                   self._activation(j))

        # add layer_normalization in calculation of new hidden state
        new_h = self._activation(ln(new_c, scope="new_h/")) * tf.nn.sigmoid(o)
        new_state = tf.contrib.rnn.LSTMStateTuple(new_c, new_h)
        
        return new_h, new_state


In [21]:
def build_graph(
    cell_type = None,
    state_size = 100,
    num_classes = vocab_size,
    batch_size = 32,
    num_steps = 200,
    num_layers = 3,
    build_with_dropout = False,
    learning_rate = 1e-4):
    
    reset_graph()
    
    x = tf.placeholder(tf.int32, [batch_size, None], name="input_placeholder")
    y = tf.placeholder(tf.int32, [batch_size, None], name="labels_placeholder")
    
    dropout = tf.constant(1.0)

    """
    RNN Inputs
    """
    embeddings = tf.get_variable("embedding_matrix", [num_classes, state_size])
    rnn_inputs = tf.nn.embedding_lookup(embeddings,x)

    if cell_type == "GRU":
        cell = tf.contrib.rnn.GRUCell(num_units=state_size)
    elif cell_type == "LSTM":
        cell = tf.contrib.rnn.LSTMCell(num_units=state_size)
    elif cell_type == "LN_LSTM":
        cell = LayerNormalizedLSTMCell(num_units=state_size)
    else:
        cell = tf.contrib.rnn.BasicRNNCell(num_units=state_size)
        
    if build_with_dropout:
        cell = tf.contrib.rnn.DropoutWrapper(cell, input_keep_prob = dropout)
        
    cell = tf.contrib.rnn.MultiRNNCell([cell]*num_layers)

    if build_with_dropout:
        cell = tf.contrib.rnn.DropoutWrapper(cell, output_keep_prob = dropout)
        
    init_state = cell.zero_state(batch_size, tf.float32)
    rnn_outputs, final_state = tf.nn.dynamic_rnn(cell, rnn_inputs, initial_state=init_state)

    # logits and predictions
    with tf.variable_scope("softmax"):
        W = tf.get_variable("W", [state_size, num_classes])
        b = tf.get_variable("b", [num_classes], initializer=tf.constant_initializer(0.0))

    rnn_outputs = tf.reshape(rnn_outputs, [-1, state_size])
    y_reshaped  = tf.reshape(y, [-1])
    logits = tf.matmul(rnn_outputs, W) + b
    predictions = tf.nn.softmax(logits)

    # losses and train_step
    losses = tf.nn.sparse_softmax_cross_entropy_with_logits(labels = y_reshaped, logits = logits)
    total_loss = tf.reduce_mean(losses)
    train_step = tf.train.AdamOptimizer(learning_rate).minimize(total_loss)
    
    return dict(x = x,
                y = y,
                init_state = init_state,
                final_state = final_state,
                total_loss = total_loss,
                train_step = train_step,
                preds = predictions,
                saver = tf.train.Saver()
               )

In [24]:
if not os.path.exists("saves/"):
    os.makedirs("saves/")

g = build_graph(cell_type="GRU", num_steps=80)
t = time.time()
losses = train_network(g, 20, num_steps=80, save="saves/GRU_20_epochs")
print("It took", time.time() - t, "seconds to train for 20 epochs.")
print("The average loss on the final epoch was:", losses[-1])

Average training loss for Epoch 0 :  3.35949899188
Average training loss for Epoch 1 :  2.75694669769
Average training loss for Epoch 2 :  2.43157069891
Average training loss for Epoch 3 :  2.28761342523
Average training loss for Epoch 4 :  2.19376960151
Average training loss for Epoch 5 :  2.12189557629
Average training loss for Epoch 6 :  2.06530556684
Average training loss for Epoch 7 :  2.01816824817
Average training loss for Epoch 8 :  1.97790918738
Average training loss for Epoch 9 :  1.9427579953
Average training loss for Epoch 10 :  1.91151862325
Average training loss for Epoch 11 :  1.88353007266
Average training loss for Epoch 12 :  1.85823383463
Average training loss for Epoch 13 :  1.83507604227
Average training loss for Epoch 14 :  1.81370611749
Average training loss for Epoch 15 :  1.79390396984
Average training loss for Epoch 16 :  1.77555054043
Average training loss for Epoch 17 :  1.75848735445
Average training loss for Epoch 18 :  1.74255770882
Average training loss f

In [39]:
def generate_characters(g, checkpoint, num_chars, prompt='A', pick_top_chars=None):
    """ Accepts a current character, initial state """
    
    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        g["saver"].restore(sess, checkpoint)
        
        #state = None
        state = sess.run([g["final_state"]])
        current_char = vocab_to_idx[prompt]
        chars = [current_char]
        
        for i in range(num_chars):

            #if state is not None:
            feed_dict = {g["x"]: [[current_char]], g["init_state"]:state}
            #else:
            #    feed_dict = {g["x"]: [[current_char]]}
        
            preds, state = sess.run([g["preds"], g["final_state"]], feed_dict)
            
            if pick_top_chars is not None:
                p = np.squeeze(preds)
                p[np.argsort(p)[:-pick_top_chars]] = 0
                p = p / np.sum(p)
                current_char = np.random.choice(vocab_size,1,p=p)[0]
            else:
                current_char = np.random.choice(vocab_size,1,p=np.squeeze(preds))[0]
            
            chars.append(current_char)
        
    chars = map(lambda x:idx_to_vocab[x], chars)
    print("".join(chars))
    return("".join(chars))
    
        
        

In [38]:
g = build_graph(cell_type="GRU", num_steps=1, batch_size=1)
generate_characters(g, "saves/GRU_20_epochs", 750, prompt="A", pick_top_chars=5)

TypeError: The value of a feed cannot be a tf.Tensor object. Acceptable feed values include Python scalars, strings, lists, or numpy ndarrays.

In [32]:
with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    g["saver"].restore(sess, "saves/GRU_20_epochs")
    print(g["final_state"])

(<tf.Tensor 'rnn/while/Exit_2:0' shape=(1, 100) dtype=float32>, <tf.Tensor 'rnn/while/Exit_3:0' shape=(1, 100) dtype=float32>, <tf.Tensor 'rnn/while/Exit_4:0' shape=(1, 100) dtype=float32>)
