# DeepLearning 03b. Recurrent Neural Nets (RNN)

**NB**: This is a simple demo. Hyperparams are not tuned for optimal performance.

* **Implementation 4d**: RNN with Tensorflow (simple RNN & LSTM with naive padding)
    * *Source*: 
        * Erik Hallström's blog: https://medium.com/@erikhallstrm/tensorflow-rnn-api-2bb31821b185#.qg4y5kgbq.
        * R2RT's blog: http://r2rt.com/recurrent-neural-networks-in-tensorflow-iii-variable-length-sequences.html.
    * *Sections*:
        * Simple RNN
        * LSTM
        * Bi-LSTM
        * Multi-Layer LSTM
    * *Contribution*:
        * Both bloggers are brilliant, but neither did a simple POS-tagger example for non-technical linguists.

* **Implementation 4e**: RNN with Tensorflow (dynamic rnn)
    * *Source*: as above.
    * *Sections*:
        * Dynamic Multi-Layer LSTM
        * Dynamic Bi-LSTM
    * *Contribution*: 
        * Add dynamic fitting to sequences with varied lengths.
        * Add bi-directional architecture.

In [3]:
import sys, nltk
import numpy as np
import pandas as pd
import tensorflow as tf
from nltk.corpus import brown
from itertools import chain
from spacy.en import English

## I. Implementation 4d

### A. Simple RNN

* Sample Test

##### LOAD DATA

In [2]:
# LOAD DATA
#   Brown corpus sentences, each a list of word strings.

sents = brown.sents()

# DATA NORMALIZATION (LEMMATIZATION)

# cap on vocabulary size (for demo purpose)
vocabulary_size = 10000
# padding / out-of-vocabulary tokens and their labels
pad_token, pad_label = "<PAD>", "PAD"
unknown_token, unknown_label = "<UNK>", "UNK"
# start/end tokens (cf. Jurafsky & Martin on n-gram language models)
sentence_start_token, sentence_end_token = "<S>", "</S>"

# load parser
parser = English()
# lemmatize: X collects the lemmas of every sentence, Y the matching spaCy POS tags (used as targets).
X = []
Y = []
for sent in sents:
    words = ' '.join(sent) # spaCy parses a string, not a token list.
    parsed_sent = parser(unicode(words))
    lemmas, labels = [], []
    for token in parsed_sent:
        lemmas.append(token.lemma_)
        labels.append(token.pos_)
    X.append(lemmas)
    Y.append(labels)

# build word dictionary for lookup
#   index 0 is the padding token, the last index is the unknown token.
word_freq = nltk.FreqDist(chain(*X))
vocab = word_freq.most_common(vocabulary_size - 1) # [(w,freq)...]. leave 1 slot for <UNK>.
i2w = [pad_token] + [elem[0] for elem in vocab] + [unknown_token]
w2i = {w:i for i,w in enumerate(i2w)}
# replace words under frequency cut to <UNK>
for i,sent in enumerate(X):
    X[i] = [w if w in w2i else unknown_token for w in sent]

# build label dictionary for lookup
#   same layout as the word dictionary: PAD first, UNK last.
#   the order of the real labels is whatever order the set iterates in.
label_vocab = list(set(chain(*Y)))
i2l = [pad_label] + label_vocab + [unknown_label]
l2i = {l:i for i,l in enumerate(i2l)}
label_vocabulary_size = len(l2i)
# NB: Y needs no rewriting here; labels of unknown words are mapped to UNK at encoding time.

# index encoding: word/label -> integer id (these are ids, not one-hot vectors)
#   first 95% of the sentences go to training, the rest to test.
train_test_split = int(len(X)*0.95)
# dtype=object: sentences have different lengths, so each array element is one python list of ids.
#   without it, recent NumPy refuses to build an array from ragged lists.
X_encoded = np.asarray([[w2i[w] for w in sent] for sent in X], dtype=object)
# a word that was replaced by <UNK> gets the UNK label instead of its POS tag.
Y_encoded = np.asarray([[l2i[unknown_label] if X[i][j]==unknown_token else l2i[l]
                         for j,l in enumerate(labels)] for i,labels in enumerate(Y)], dtype=object)
X_train = X_encoded[:train_test_split]
Y_train = Y_encoded[:train_test_split]
X_test = X_encoded[train_test_split:]
Y_test = Y_encoded[train_test_split:]

##### MAKE DATA ITERATORS

In [4]:
# PANDAS DATAFRAME (sequence length is stored for later use: dynamic rnn)
#   one row per sentence: word ids, label ids, and the unpadded sentence length.

df_train = pd.DataFrame({'X':X_train, 'Y':Y_train})
df_train['seq_len'] = [len(seq) for seq in df_train['X']]

df_test = pd.DataFrame({'X':X_test, 'Y':Y_test})
df_test['seq_len'] = [len(seq) for seq in df_test['X']]

df_train.head()

Unnamed: 0,X,Y,seq_len
0,"[1, 4943, 621, 2075, 1658, 45, 1805, 34, 1568,...","[6, 5, 5, 5, 5, 15, 5, 6, 2, 3, 5, 10, 14, 2, ...",26
1,"[1, 1658, 533, 45, 9, 378, 17, 188, 10000, 11,...","[6, 2, 1, 15, 3, 2, 9, 2, 16, 3, 6, 5, 5, 5, 9...",47
2,"[1, 1888, 17, 2056, 378, 1658, 10, 4, 509, 25,...","[6, 5, 9, 5, 2, 2, 15, 15, 15, 3, 5, 5, 5, 5, ...",41
3,"[15, 76, 8, 1683, 5693, 5, 97, 299, 4, 405, 14...","[9, 1, 6, 14, 2, 3, 14, 2, 15, 15, 9, 9, 6, 2,...",37
4,"[1, 1658, 45, 16, 32, 112, 11, 111, 5, 1984, 2...","[6, 2, 15, 4, 15, 15, 3, 14, 3, 5, 10, 2, 13, ...",25


In [5]:
# BASE ITERATOR

class DataIterator(object):
    """Serve shuffled mini-batches from a DataFrame.

    The frame must have the columns 'X', 'Y' and 'seq_len'. Rows are
    reshuffled at construction and again each time an epoch is exhausted.

    Attributes:
        df: the (shuffled, re-indexed from 0) DataFrame being served.
        size: number of rows in the frame.
        epochs: number of completed passes over the data.
        cursor: position of the first row of the next batch.
    """

    def __init__(self, df):
        self.df = df
        self.size = len(self.df)
        self.epochs = 0
        self.shuffle()

    def shuffle(self):
        """Reshuffle all rows and rewind the cursor to the start."""
        # sample 100% with different index, but not add new index column
        self.df = self.df.sample(frac=1).reset_index(drop=True)
        self.cursor = 0

    def next_batch(self, n):
        """Return the next n rows as three Series: (X, Y, seq_len).

        When fewer than n rows are left in the current epoch, the leftover
        rows are dropped, the epoch counter is incremented, the data is
        reshuffled and the batch is taken from the start of the new epoch.
        A full batch is therefore always returned as long as n <= size.
        """
        # rows cursor .. cursor+n-1 are needed, so the last valid start is size-n.
        if self.cursor + n > self.size:
            self.epochs += 1
            self.shuffle()
        # positional slice; index labels equal positions after reset_index(drop=True).
        res = self.df.iloc[self.cursor:self.cursor + n]
        self.cursor += n
        return res['X'], res['Y'], res['seq_len']
    
# EXAMPLE
d = DataIterator(df_train)
t = d.next_batch(3)

print 'X sequences:\n', t[0]
print 'Y sequences:\n', t[1]
print 'sequence lengths:\n', t[2]

X sequences:
0                     [15, 26, 32, 21, 119, 63, 14, 3]
1    [27, 4, 21, 851, 7, 94, 394, 5, 3733, 22, 1, 1...
2    [622, 12, 496, 118, 1, 10000, 11, 1, 10000, 62...
Name: X, dtype: object
Y sequences:
0                           [9, 4, 15, 1, 15, 1, 9, 9]
1    [6, 15, 1, 14, 3, 6, 2, 3, 2, 3, 6, 2, 3, 6, 1...
2    [1, 4, 15, 3, 6, 16, 3, 6, 16, 15, 15, 12, 5, ...
Name: Y, dtype: object
sequence lengths:
0     8
1    17
2    19
Name: seq_len, dtype: int64


In [9]:
# PADDED ITERATOR

max_len = max(max(df_train['seq_len']), max(df_test['seq_len'])) # longest sent in corpus.

class FullPaddedDataIterator(DataIterator):
    """DataIterator whose batches are zero-padded to the global `max_len`.

    Every batch has the same [n, max_len] shape regardless of the sentences in it.
    """

    def next_batch(self, n):
        """Return (x, y, seq_len) for the next n sentences.

        x, y: int32 arrays of shape [n, max_len]; positions past a sentence's
              length are left at 0, which is the PAD index for words and labels.
        seq_len: pandas Series with the unpadded length of each sentence.
        """
        if self.cursor+n > self.size:
            self.epochs += 1
            self.shuffle()
        # positional slice; index labels equal positions after reset_index(drop=True).
        res = self.df.iloc[self.cursor:self.cursor+n]
        self.cursor += n

        maxlen = max_len
        x = np.zeros([n, maxlen], dtype=np.int32) # empty (all-PAD) containers.
        y = np.zeros([n, maxlen], dtype=np.int32)
        # pull the column arrays out once instead of once per row.
        rows = zip(res['X'].values, res['Y'].values, res['seq_len'].values)
        for i, (seq_x, seq_y, length) in enumerate(rows):
            x[i, :length] = seq_x # only fill where there are sequence values.
            y[i, :length] = seq_y

        return x, y, res['seq_len']
    
# EXAMPLE
d = FullPaddedDataIterator(df_train)
t = d.next_batch(1)

print 'X sequences:\n', t[0]
print 'Y sequences:\n', t[1]
print 'sequence lengths:\n', t[2]

X sequences:
[[  26   10  246   18 7223  640   77  115    2  185    1 2443   77    1
  4180   99  101    2  322   18    1  297   17  527   17  761 2716   13
    95 1419    5 6961   22 1659  386    3    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0]]
Y sequences:
[[ 4 15 15  3  5 12  2  1  9  3  6  2  2  6  5

##### BUILD COMPUTATIONAL GRAPH

In [20]:
# HOUSE CLEANING

def reset_graph():
    """Close the global session `sess` if there is one, then clear the default graph."""
    session = globals().get('sess')
    if session:
        session.close()
    tf.reset_default_graph()
# Start from an empty graph.
reset_graph()

# CONFIGS

num_epochs = 2
state_size = 100 # i.e. dimension of hidden layer.
num_classes = label_vocabulary_size
batch_size = 100

# DEFINE GRAPH
#   A hand-rolled simple RNN unrolled over max_len time steps, followed by a per-step softmax classifier.

# Inputs
#   X holds word ids fed as floats, Y holds label ids; both are padded to max_len.
batchX_placeholder = tf.placeholder(tf.float32, [batch_size, max_len])
batchY_placeholder = tf.placeholder(tf.int32, [batch_size, max_len])
init_state = tf.placeholder(tf.float32, [batch_size, state_size])
# Projection to embedding space
W = tf.Variable(np.random.rand(1+state_size, state_size), dtype=tf.float32) # 1+state_size: 1 input feature (the word id) + hidden size.
b = tf.Variable(np.zeros((1,state_size)), dtype=tf.float32)
# Projection to output space
W2 = tf.Variable(np.random.rand(state_size, num_classes),dtype=tf.float32)
b2 = tf.Variable(np.zeros((1,num_classes)), dtype=tf.float32)

# Unpack data into one slice per time step (max_len slices, one value per batch member)
#   cf. tf.unpack?
inputs_series = tf.unpack(batchX_placeholder, axis=1)
labels_series = tf.unpack(batchY_placeholder, axis=1)
# ALTERNATIVE SOL: format data such that Tensorflow cell can create state-state chain
#   only used when also using Tensorflow's cell
# inputs_series = tf.split(1, max_len, batchX_placeholder)

# Build RNN pipeline: state-state concatenation
current_state = init_state
states_series = []
for current_input in inputs_series:
    # make the slice a column so it can be concatenated with the state along axis 1.
    current_input = tf.reshape(current_input, [batch_size, 1])
    input_and_state_concatenated = tf.concat(1, [current_input, current_state])
    # projecting state and input at the same time.
    # NB: broadcasted addition.
    next_state = tf.tanh(tf.matmul(input_and_state_concatenated, W) + b)
    states_series.append(next_state)
    current_state = next_state
# ALTERNATIVE SOL: using Tensorflow's ready-made
#   must be used with formatted input described above
# cell = tf.nn.rnn_cell.BasicRNNCell(state_size)
# states_series, current_state = tf.nn.rnn(cell, inputs_series, init_state)

# Forward pass
#   one logits tensor per time step.
logits_series = [tf.matmul(state, W2) + b2 for state in states_series] 
predictions_series = [tf.nn.softmax(logits) for logits in logits_series]
# Evaluation
#   NB: no mask is applied, so padded positions count towards accuracy and loss.
correct = [tf.equal(tf.argmax(pred,1), tf.cast(true,tf.int64)) 
           for pred,true in zip(predictions_series, labels_series)]
accuracy = tf.reduce_mean(tf.cast(correct, tf.float32))
# Loss function
losses = [tf.nn.sparse_softmax_cross_entropy_with_logits(logits, labels) 
          for logits, labels in zip(logits_series,labels_series)]
total_loss = tf.reduce_mean(losses)
# Set training method
train_step = tf.train.AdagradOptimizer(0.3).minimize(total_loss)

##### TRAINING & EVALUATION

In [81]:
%%time

with tf.Session() as sess:
    # Initialize graph
    sess.run(tf.global_variables_initializer())
    # Create data readers
    tr = FullPaddedDataIterator(df_train)
    te = FullPaddedDataIterator(df_test)
    # Record keepers
    tr_losses, te_losses = [], []
    tr_accuracies, te_accuracies = [], []    
    step = 0
    current_epoch = 0
    # Training
    init_state_ = np.zeros((batch_size, state_size))
    while current_epoch < num_epochs:
        step += 1
        tr_x,tr_y,_ = tr.next_batch(batch_size) # _ for sequence length. not used yet.
        total_loss_, train_step_, init_state_, accuracy_ = sess.run(
            [total_loss, train_step, init_state, accuracy],
            feed_dict = {batchX_placeholder:tr_x,
                         batchY_placeholder:tr_y,
                         init_state:init_state_}
            )
        tr_losses.append(total_loss_)
        tr_accuracies.append(accuracy_)
        if step % 100 == 0:
            print "Avg Training Loss at", step, ':', np.mean(tr_losses)
            print "Avg Training Accuracy at", step, ':', np.mean(tr_accuracies)
            te_x,te_y,_ = te.next_batch(batch_size) # randomly sample 100 to evaluate.
            total_loss_, train_step_, init_state_, accuracy_ = sess.run(
                [total_loss, train_step, init_state, accuracy],
                feed_dict = {batchX_placeholder:te_x,
                             batchY_placeholder:te_y,
                             init_state:init_state_}
                )
            te_losses.append(total_loss_)
            te_accuracies.append(accuracy_)
            print "Avg Test Loss at", step, ':', np.mean(te_losses)
            print "Avg Test Accuracy at", step, ':', np.mean(te_accuracies)
            print
        if tr.epochs > current_epoch: # go to the next epoch.
            current_epoch += 1
            step = 0
    print "Final Avg Training Loss:", np.mean(tr_losses)
    print "Final Avg Training Accuracy:", np.mean(tr_accuracies)   
    print "Final Avg Test Loss:", np.mean(te_losses)
    print "Final Avg Test Accuracy:", np.mean(te_accuracies)

### B. SINGLE-LAYER LSTM 

* Sample Test

In [99]:
# HOUSE CLEANING

def reset_graph():
    """Close the global session `sess` if there is one, then clear the default graph."""
    session = globals().get('sess')
    if session:
        session.close()
    tf.reset_default_graph()
# Start from an empty graph.
reset_graph()

# CONFIGS

num_epochs = 2
state_size = 100 # i.e. dimension of hidden layer.
num_classes = label_vocabulary_size
batch_size = 100

# DEFINE GRAPH
#   A single-layer LSTM (Tensorflow's ready-made cell) followed by a per-step softmax classifier.

# Inputs
#   X holds word ids fed as floats, Y holds label ids; both are padded to max_len.
batchX_placeholder = tf.placeholder(tf.float32, [batch_size, max_len])
batchY_placeholder = tf.placeholder(tf.int32, [batch_size, max_len])

# LSTM states
#   cell state: memory cell.
#   hidden state: just the same hidden layer as in simple RNN.
#   both are placeholders, so the initial state is whatever gets fed at run time.
cell_state = tf.placeholder(tf.float32, [batch_size, state_size])
hidden_state = tf.placeholder(tf.float32, [batch_size, state_size])
init_state = tf.nn.rnn_cell.LSTMStateTuple(cell_state, hidden_state)

# Projection to output space
#   NB: these play the role of W2, b2 in the simple RNN. the input-to-hidden projection is now inside the cell.
W = tf.Variable(np.random.rand(state_size, num_classes),dtype=tf.float32)
b = tf.Variable(np.zeros((1,num_classes)), dtype=tf.float32)

# Split data into one slice per time step (shape=(batch_size,1): 1 input feature, the word id)
#   cf. tf.unpack?
inputs_series = tf.split(1, max_len, batchX_placeholder)
labels_series = tf.unpack(batchY_placeholder, axis=1)
# Build RNN pipeline: state-state concatenation
cell = tf.nn.rnn_cell.BasicLSTMCell(state_size)
states_series, current_state = tf.nn.rnn(cell, inputs_series, init_state)

# Forward pass
#   one logits tensor per time step.
logits_series = [tf.matmul(state, W) + b for state in states_series] 
predictions_series = [tf.nn.softmax(logits) for logits in logits_series]
# Evaluation
#   NB: no mask is applied, so padded positions count towards accuracy and loss.
correct = [tf.equal(tf.argmax(pred,1), tf.cast(true,tf.int64)) 
           for pred,true in zip(predictions_series, labels_series)]
accuracy = tf.reduce_mean(tf.cast(correct, tf.float32))
# Loss function
losses = [tf.nn.sparse_softmax_cross_entropy_with_logits(logits, labels) 
          for logits, labels in zip(logits_series,labels_series)]
total_loss = tf.reduce_mean(losses)
# Set training method
train_step = tf.train.AdagradOptimizer(0.3).minimize(total_loss)

In [80]:
%%time

with tf.Session() as sess:
    # Initialize graph
    sess.run(tf.global_variables_initializer())
    # Create data readers
    tr = FullPaddedDataIterator(df_train)
    te = FullPaddedDataIterator(df_test)
    # Record keepers
    tr_losses, te_losses = [], []
    tr_accuracies, te_accuracies = [], []    
    step = 0
    current_epoch = 0
    # Training
    cell_state_ = np.zeros((batch_size, state_size))
    hidden_state_ = np.zeros((batch_size, state_size))
    
    while current_epoch < num_epochs:
        step += 1
        tr_x,tr_y,_ = tr.next_batch(batch_size) # _ for sequence length. not used yet.
        total_loss_, train_step_, init_state_, accuracy_ = sess.run(
            [total_loss, train_step, init_state, accuracy],
            feed_dict = {batchX_placeholder:tr_x,
                         batchY_placeholder:tr_y,
                         cell_state:cell_state_,
                         hidden_state:hidden_state_}
            )
        tr_losses.append(total_loss_)
        tr_accuracies.append(accuracy_)
        if step % 100 == 0:
            print "Avg Training Loss at", step, ':', np.mean(tr_losses)
            print "Avg Training Accuracy at", step, ':', np.mean(tr_accuracies)
            te_x,te_y,_ = te.next_batch(batch_size) # randomly sample 100 to evaluate.
            total_loss_, train_step_, init_state_, accuracy_ = sess.run(
                [total_loss, train_step, init_state, accuracy],
                feed_dict = {batchX_placeholder:te_x,
                             batchY_placeholder:te_y,
                             cell_state:cell_state_,
                             hidden_state:hidden_state_}
                )
            te_losses.append(total_loss_)
            te_accuracies.append(accuracy_)
            print "Avg Test Loss at", step, ':', np.mean(te_losses)
            print "Avg Test Accuracy at", step, ':', np.mean(te_accuracies)
            print
        if tr.epochs > current_epoch: # go to the next epoch.
            current_epoch += 1
            step = 0
    print "Final Avg Training Loss:", np.mean(tr_losses)
    print "Final Avg Training Accuracy:", np.mean(tr_accuracies)   
    print "Final Avg Test Loss:", np.mean(te_losses)
    print "Final Avg Test Accuracy:", np.mean(te_accuracies)

### C. BI-LSTM

* Sample Test

In [208]:
# HOUSE CLEANING

def reset_graph():
    """Close the global session `sess` if there is one, then clear the default graph."""
    session = globals().get('sess')
    if session:
        session.close()
    tf.reset_default_graph()
# Start from an empty graph.
reset_graph()

# CONFIGS

num_epochs = 2
state_size = 100 # i.e. dimension of hidden layer.
num_classes = label_vocabulary_size
batch_size = 100

# DEFINE GRAPH
#   A bidirectional LSTM (one forward cell, one backward cell) followed by a per-step softmax classifier.

# Inputs
#   X holds word ids fed as floats, Y holds label ids; both are padded to max_len.
batchX_placeholder = tf.placeholder(tf.float32, [batch_size, max_len])
batchY_placeholder = tf.placeholder(tf.int32, [batch_size, max_len])

# LSTM states
#   cell state: memory cell.
#   hidden state: just the same hidden layer as in simple RNN.
#   fwd & bwd need to both take the same init.
cell_state = tf.placeholder(tf.float32, [batch_size, state_size])
hidden_state = tf.placeholder(tf.float32, [batch_size, state_size])
init_state = tf.nn.rnn_cell.LSTMStateTuple(cell_state, hidden_state) 
    # NB: init_state per se is not a placeholder that can be fed;
    #     feed cell_state & hidden_state instead!

# Projection to output space
#   NB: these play the role of W2, b2 in the simple RNN. the input-to-hidden projection is now inside the cells.
W = tf.Variable(np.random.rand(state_size*2, num_classes),dtype=tf.float32) # *2: combined weights for fwd & bwd.
b = tf.Variable(np.zeros((1,num_classes)), dtype=tf.float32)

# Split data into one slice per time step (shape=(batch_size,1): 1 input feature, the word id)
#   cf. tf.unpack?
inputs_series = tf.split(1, max_len, batchX_placeholder)
labels_series = tf.unpack(batchY_placeholder, axis=1)
# Build Bi-LSTM pipeline: state-state concatenation
with tf.variable_scope('Bi-LSTM'): # not a must. added for the convenience of tensorboard visualization.
    fwd_cell = tf.nn.rnn_cell.BasicLSTMCell(state_size)
    bwd_cell = tf.nn.rnn_cell.BasicLSTMCell(state_size)
    states_series, fwd_current_state, bwd_current_state = tf.nn.bidirectional_rnn(fwd_cell, bwd_cell, inputs_series, 
                                                                                  initial_state_fw=init_state,
                                                                                  initial_state_bw=init_state)
# Forward pass
#   one logits tensor per time step.
logits_series = [tf.matmul(state, W) + b for state in states_series] 
predictions_series = [tf.nn.softmax(logits) for logits in logits_series]
# Evaluation
#   NB: no mask is applied, so padded positions count towards accuracy and loss.
correct = [tf.equal(tf.argmax(pred,1), tf.cast(true,tf.int64)) 
           for pred,true in zip(predictions_series, labels_series)]
accuracy = tf.reduce_mean(tf.cast(correct, tf.float32))
# Loss function
losses = [tf.nn.sparse_softmax_cross_entropy_with_logits(logits, labels) 
          for logits, labels in zip(logits_series,labels_series)]
total_loss = tf.reduce_mean(losses)
# Set training method
train_step = tf.train.AdagradOptimizer(0.3).minimize(total_loss)

In [220]:
%%time

with tf.Session() as sess:
    # Initialize graph
    sess.run(tf.global_variables_initializer())
    # Create data readers
    tr = FullPaddedDataIterator(df_train)
    te = FullPaddedDataIterator(df_test)
    # Record keepers
    tr_losses, te_losses = [], []
    tr_accuracies, te_accuracies = [], []    
    step = 0
    current_epoch = 0
    # Training
    cell_state_ = np.zeros((batch_size, state_size))
    hidden_state_ = np.zeros((batch_size, state_size))
    
    while current_epoch < num_epochs:
        step += 1
        tr_x,tr_y,_ = tr.next_batch(batch_size) # _ for sequence length. not used yet.
        total_loss_, train_step_, init_state_, accuracy_ = sess.run(
            [total_loss, train_step, init_state, accuracy],
            feed_dict = {batchX_placeholder:tr_x,
                         batchY_placeholder:tr_y,
                         cell_state:cell_state_,
                         hidden_state:hidden_state_}
            ) # NB: in decoding bidirectional networks, ALWAYS feed cell_state & hidden_state rather than init_state!
        tr_losses.append(total_loss_)
        tr_accuracies.append(accuracy_)
        if step % 100 == 0:
            print "Avg Training Loss at", step, ':', np.mean(tr_losses)
            print "Avg Training Accuracy at", step, ':', np.mean(tr_accuracies)
            te_x,te_y,_ = te.next_batch(batch_size) # randomly sample 100 to evaluate.
            total_loss_, train_step_, init_state_, accuracy_ = sess.run(
                [total_loss, train_step, init_state, accuracy],
                feed_dict = {batchX_placeholder:te_x,
                             batchY_placeholder:te_y,
                             cell_state:cell_state_,
                             hidden_state:hidden_state_}
                )
            te_losses.append(total_loss_)
            te_accuracies.append(accuracy_)
            print "Avg Test Loss at", step, ':', np.mean(te_losses)
            print "Avg Test Accuracy at", step, ':', np.mean(te_accuracies)
            print
        if tr.epochs > current_epoch: # go to the next epoch.
            current_epoch += 1
            step = 0
    print "Final Avg Training Loss:", np.mean(tr_losses)
    print "Final Avg Training Accuracy:", np.mean(tr_accuracies)   
    print "Final Avg Test Loss:", np.mean(te_losses)
    print "Final Avg Test Accuracy:", np.mean(te_accuracies)

### D. MULTI-LAYER LSTM

* Full Test

In [223]:
# HOUSE CLEANING

def reset_graph():
    """Close the global session `sess` if there is one, then clear the default graph."""
    session = globals().get('sess')
    if session:
        session.close()
    tf.reset_default_graph()
# Start from an empty graph.
reset_graph()

# CONFIGS

num_epochs = 2
state_size = 100 # i.e. dimension of hidden layer.
num_classes = label_vocabulary_size
batch_size = 100

num_layers = 2

# DEFINE GRAPH
#   A stack of num_layers LSTM layers followed by a per-step softmax classifier.

# Inputs
#   X holds word ids fed as floats, Y holds label ids; both are padded to max_len.
batchX_placeholder = tf.placeholder(tf.float32, [batch_size, max_len])
batchY_placeholder = tf.placeholder(tf.int32, [batch_size, max_len])

# LSTM states
#   a single placeholder carries the states of all layers.
#   2: for cell and hidden states.
init_state = tf.placeholder(tf.float32, [num_layers, 2, batch_size, state_size])
state_per_layer_list = tf.unpack(init_state, axis=0)
rnn_tuple_state = tuple(
    [tf.nn.rnn_cell.LSTMStateTuple(state_per_layer_list[idx][0], state_per_layer_list[idx][1])
     for idx in range(num_layers)]
) # distribute cell_state, hidden state to each layer.

# Projection to output space
#   NB: these play the role of W2, b2 in the simple RNN. the input-to-hidden projection is now inside the cells.
W = tf.Variable(np.random.rand(state_size, num_classes),dtype=tf.float32)
b = tf.Variable(np.zeros((1,num_classes)), dtype=tf.float32)

# Split data into one slice per time step (shape=(batch_size,1): 1 input feature, the word id)
#   cf. tf.unpack?
inputs_series = tf.split(1, max_len, batchX_placeholder)
labels_series = tf.unpack(batchY_placeholder, axis=1)
# Build RNN pipeline: state-state concatenation
#   NB(review): the same cell object is repeated num_layers times — check this idiom against the installed Tensorflow version.
cell = tf.nn.rnn_cell.LSTMCell(state_size, state_is_tuple=True)
cell = tf.nn.rnn_cell.MultiRNNCell([cell] * num_layers, state_is_tuple=True)
states_series, current_state = tf.nn.rnn(cell, inputs_series, initial_state=rnn_tuple_state)

# Forward pass
#   one logits tensor per time step.
logits_series = [tf.matmul(state, W) + b for state in states_series] 
predictions_series = [tf.nn.softmax(logits) for logits in logits_series]
# Evaluation
#   NB: no mask is applied, so padded positions count towards accuracy and loss.
correct = [tf.equal(tf.argmax(pred,1), tf.cast(true,tf.int64)) 
           for pred,true in zip(predictions_series, labels_series)]
accuracy = tf.reduce_mean(tf.cast(correct, tf.float32))
# Loss function
losses = [tf.nn.sparse_softmax_cross_entropy_with_logits(logits, labels) 
          for logits, labels in zip(logits_series,labels_series)]
total_loss = tf.reduce_mean(losses)
# Set training method
train_step = tf.train.AdagradOptimizer(0.3).minimize(total_loss)

In [229]:
%%time

with tf.Session() as sess:
    # Initialize graph
    sess.run(tf.global_variables_initializer())
    # Create data readers
    tr = FullPaddedDataIterator(df_train)
    # Record keepers
    tr_losses = []
    tr_accuracies = []    
    step = 0
    current_epoch = 0
    # Training
    init_state_ = np.zeros((num_layers, 2, batch_size, state_size))
    
    while current_epoch < num_epochs:
        step += 1
        tr_x,tr_y,_ = tr.next_batch(batch_size) 
        total_loss_, train_step_, init_state_, accuracy_ = sess.run(
            [total_loss, train_step, init_state, accuracy],
            feed_dict = {batchX_placeholder:tr_x,
                         batchY_placeholder:tr_y,
                         init_state:init_state_}
            )        
        tr_losses.append(total_loss_)
        tr_accuracies.append(accuracy_)
        if step % 100 == 0:
            print "Avg Training Loss at", step, ':', np.mean(tr_losses)
            print "Avg Training Accuracy at", step, ':', np.mean(tr_accuracies)
            te_losses = []
            te_accuracies = []
            te = FullPaddedDataIterator(df_test)
            te.shuffle()
            while te.epochs==0:
                te_x,te_y,_ = te.next_batch(batch_size) # randomly sample 100 to evaluate.
                total_loss_, init_state_, accuracy_ = sess.run(
                    [total_loss, init_state, accuracy],
                    feed_dict = {batchX_placeholder:te_x,
                                 batchY_placeholder:te_y,
                                 init_state:init_state_}
                    )
                te_losses.append(total_loss_)
                te_accuracies.append(accuracy_)
            print "Avg Test Loss at", step, ':', np.mean(te_losses)
            print "Avg Test Accuracy at", step, ':', np.mean(te_accuracies)
            print            
        if tr.epochs > current_epoch: # go to the next epoch.
            current_epoch += 1
            step = 0
    print "Final Avg Training Loss:", np.mean(tr_losses)
    print "Final Avg Training Accuracy:", np.mean(tr_accuracies)   
    print "Final Avg Test Loss:", np.mean(te_losses)
    print "Final Avg Test Accuracy:", np.mean(te_accuracies)

## II. Implementation 4e

* NB: same data iterator as before.

### A. DYNAMIC MULTI-LAYER LSTM

* Sample Test

In [222]:
# HOUSE CLEANING

def reset_graph():
    """Close the global session `sess` if there is one, then clear the default graph."""
    session = globals().get('sess')
    if session:
        session.close()
    tf.reset_default_graph()
# Start from an empty graph.
reset_graph()

# CONFIGS

num_epochs = 2
state_size = 100 # i.e. dimension of hidden layer.
num_classes = label_vocabulary_size
batch_size = 100

num_layers = 2

# DEFINE GRAPH
#   A stack of num_layers LSTM layers run with dynamic_rnn, which is told each sentence's true length.

# Inputs
#   X holds word ids fed as floats, Y holds label ids; both are padded to max_len.
#   Seqlen holds the unpadded length of every sentence in the batch.
batchX_placeholder = tf.placeholder(tf.float32, [batch_size, max_len])
batchY_placeholder = tf.placeholder(tf.int32, [batch_size, max_len])
batchSeqlen_placeholder = tf.placeholder(tf.int32, [batch_size])

# LSTM states
#   a single placeholder carries the states of all layers.
#   2: for cell and hidden states.
init_state = tf.placeholder(tf.float32, [num_layers, 2, batch_size, state_size])
state_per_layer_list = tf.unpack(init_state, axis=0)
rnn_tuple_state = tuple(
    [tf.nn.rnn_cell.LSTMStateTuple(state_per_layer_list[idx][0], state_per_layer_list[idx][1])
     for idx in range(num_layers)]
) # distribute cell_state, hidden state to each layer.

# Projection to output space
#   NB: these play the role of W2, b2 in the simple RNN. the input-to-hidden projection is now inside the cells.
W = tf.Variable(np.random.rand(state_size, num_classes),dtype=tf.float32)
b = tf.Variable(np.zeros((1,num_classes)), dtype=tf.float32)

# NB: no manual split into time steps here; dynamic_rnn takes the whole padded batch.
# Build RNN pipeline: state-state concatenation
#   NB(review): the same cell object is repeated num_layers times — check this idiom against the installed Tensorflow version.
cell = tf.nn.rnn_cell.LSTMCell(state_size, state_is_tuple=True)
cell = tf.nn.rnn_cell.MultiRNNCell([cell] * num_layers, state_is_tuple=True)
# Dynamic RNN architecture
#   tf.expand_dims(batchX_placeholder, -1): dynamic_rnn takes input with [batch_size, max_len, input_size],
#                                           we thus add 1 extra dimension [input_size].
states_series, current_state = tf.nn.dynamic_rnn(cell, tf.expand_dims(batchX_placeholder, -1),
                                                 sequence_length=batchSeqlen_placeholder,
                                                 initial_state=rnn_tuple_state)
states_series = tf.reshape(states_series, [-1, state_size]) # size squeezed to [batch_size*max_len, state_size].

# Forward pass
#   states_series is flattened sentence by sentence (batch-major), so row b*max_len+t of
#   logits belongs to time step t of sentence b. labels are flattened in the same order.
logits = tf.matmul(states_series, W) + b
labels = tf.reshape(batchY_placeholder, [-1]) # make labels into one long list.
# per-time-step views, kept for inspection: max_len tensors of shape [batch_size, num_classes].
logits_series = tf.unpack(tf.reshape(logits, [batch_size, max_len, num_classes]), axis=1)
predictions_series = [tf.nn.softmax(logit) for logit in logits_series]
# Evaluation
preds_list = [tf.argmax(pred,1) for pred in predictions_series]
# predictions must be in the same batch-major order as labels. concatenating preds_list would
#   give time-major order and compare each prediction with the wrong label, so the argmax is
#   taken on the flat logits instead (softmax does not change the argmax).
preds_concat = tf.argmax(logits, 1)
correct = tf.equal(preds_concat, tf.cast(labels,tf.int64))
accuracy = tf.reduce_mean(tf.cast(correct, tf.float32))
# Loss function
#   NB: no mask is applied, so padded positions count towards accuracy and loss.
losses = tf.nn.sparse_softmax_cross_entropy_with_logits(logits, labels)
total_loss = tf.reduce_mean(losses)
# Set training method
train_step = tf.train.AdagradOptimizer(1e-4).minimize(total_loss)

In [76]:
%%time

with tf.Session() as sess:
    # Initialize graph
    sess.run(tf.global_variables_initializer())
    # Create data readers
    tr = FullPaddedDataIterator(df_train)
    te = FullPaddedDataIterator(df_test)
    # Record keepers
    tr_losses, te_losses = [], []
    tr_accuracies, te_accuracies = [], []    
    step = 0
    current_epoch = 0
    # Training
    init_state_ = np.zeros((num_layers, 2, batch_size, state_size))
    
    while current_epoch < num_epochs:
        step += 1
        tr_x,tr_y,tr_seqlen = tr.next_batch(batch_size)
        total_loss_, train_step_, init_state_, accuracy_ = sess.run(
            [total_loss, train_step, init_state, accuracy],
            feed_dict = {batchX_placeholder:tr_x,
                         batchY_placeholder:tr_y,
                         batchSeqlen_placeholder:tr_seqlen,
                         init_state:init_state_}
            )        
        tr_losses.append(total_loss_)
        tr_accuracies.append(accuracy_)
        if step % 100 == 0:
            print "Avg Training Loss at", step, ':', np.mean(tr_losses)
            print "Avg Training Accuracy at", step, ':', np.mean(tr_accuracies)
            te_x,te_y,te_seqlen = te.next_batch(batch_size) # randomly sample 100 to evaluate.
            total_loss_, train_step_, init_state_, accuracy_ = sess.run(
                [total_loss, train_step, init_state, accuracy],
                feed_dict = {batchX_placeholder:te_x,
                             batchY_placeholder:te_y,
                             batchSeqlen_placeholder:te_seqlen,
                             init_state:init_state_}
                )
            te_losses.append(total_loss_)
            te_accuracies.append(accuracy_)
            print "Avg Test Loss at", step, ':', np.mean(te_losses)
            print "Avg Test Accuracy at", step, ':', np.mean(te_accuracies)
            print
        if tr.epochs > current_epoch: # go to the next epoch.
            current_epoch += 1
            step = 0
    print "Final Avg Training Loss:", np.mean(tr_losses)
    print "Final Avg Training Accuracy:", np.mean(tr_accuracies)   
    print "Final Avg Test Loss:", np.mean(te_losses)
    print "Final Avg Test Accuracy:", np.mean(te_accuracies)

### B. DYNAMIC BI-LSTM

* Sample Test

In [216]:
# HOUSE CLEANING

def reset_graph():
    """Close the global `sess` (if one is defined and truthy), then reset the default TF graph."""
    # Look the session up by name so the function also works when no
    # `sess` has been created yet in this namespace.
    live_session = globals().get('sess')
    if live_session:
        live_session.close()
    tf.reset_default_graph()
# Clean existing graph before start.
reset_graph()

# CONFIGS

num_epochs = 2
state_size = 100 # i.e. dimension of hidden layer.
num_classes = label_vocabulary_size
batch_size = 100

# DEFINE GRAPH

# Inputs
#   X: one scalar per token position (fed as float), Y: one integer label per position,
#   Seqlen: one unpadded length per batch member.
batchX_placeholder = tf.placeholder(tf.float32, [batch_size, max_len])
batchY_placeholder = tf.placeholder(tf.int32, [batch_size, max_len])
batchSeqlen_placeholder = tf.placeholder(tf.int32, [batch_size])

# LSTM states
#   cell state: memory cell.
#   hidden state: just the same hidden layer as in simple RNN.
#   fwd & bwd need to both take the same init.
cell_state = tf.placeholder(tf.float32, [batch_size, state_size])
hidden_state = tf.placeholder(tf.float32, [batch_size, state_size])
init_state = tf.nn.rnn_cell.LSTMStateTuple(cell_state, hidden_state)

# Projection to output space
#   NB: these are same as W2, b2 above. now the to-embedding projection is automatic by Tensorflow.
#   state_size*2: the fwd and bwd states are concatenated before the projection.
W = tf.Variable(np.random.rand(state_size*2, num_classes),dtype=tf.float32)
b = tf.Variable(np.zeros((1,num_classes)), dtype=tf.float32)

# Build Bi-LSTM pipeline: state-state concatenation
fwd_cell = tf.nn.rnn_cell.BasicLSTMCell(state_size)
bwd_cell = tf.nn.rnn_cell.BasicLSTMCell(state_size)
# Dynamic RNN architecture
#   tf.expand_dims(batchX_placeholder, -1): dynamic_rnn takes input with [batch_size, max_len, input_size],
#                                           we thus add 1 extra dimension [input_size].
states_series_tuple, current_states_tuple = tf.nn.bidirectional_dynamic_rnn(fwd_cell, bwd_cell, 
                                                                            tf.expand_dims(batchX_placeholder, -1),
                                                                            sequence_length=batchSeqlen_placeholder,
                                                                            initial_state_fw=init_state,
                                                                            initial_state_bw=init_state)
fwd_states_series, bwd_states_series = states_series_tuple
fwd_states_series = tf.reshape(fwd_states_series, [-1, state_size]) # shape: [batch_size*max_len, state_size].
bwd_states_series = tf.reshape(bwd_states_series, [-1, state_size])
states_series = tf.concat(1, [fwd_states_series, bwd_states_series]) # concat along the dim of state_size.

# Forward pass 
#   logits rows are batch-major: all positions of sentence 0, then all of sentence 1, ...
logits = tf.matmul(states_series, W) + b
labels = tf.reshape(batchY_placeholder, [-1]) # make labels into one long list (batch-major, like logits).
# Per-time-step views (one [batch_size, num_classes] tensor per position), kept available for inspection.
logits_series = tf.unpack(tf.reshape(logits, [batch_size, max_len, num_classes]), axis=1)
predictions_series = [tf.nn.softmax(logit) for logit in logits_series]
# Evaluation
preds_list = [tf.argmax(pred,1) for pred in predictions_series]
# Predictions must be in the same (batch-major) order as `labels`. Concatenating preds_list
# along axis 0 would be time-major and compare each prediction against the wrong label,
# so take the argmax over the batch-major logits instead (softmax does not change the argmax).
preds_concat = tf.argmax(logits, 1)
correct = tf.equal(preds_concat, tf.cast(labels,tf.int64))
# NOTE(review): no padding mask is applied, so padded positions count toward both accuracy and loss.
accuracy = tf.reduce_mean(tf.cast(correct, tf.float32))
# Loss function
losses = tf.nn.sparse_softmax_cross_entropy_with_logits(logits, labels)
total_loss = tf.reduce_mean(losses)
# Set training method
train_step = tf.train.AdagradOptimizer(1e-4).minimize(total_loss)

In [219]:
%%time

with tf.Session() as sess:
    # Initialize graph
    sess.run(tf.global_variables_initializer())
    # Create data readers
    tr = FullPaddedDataIterator(df_train)
    te = FullPaddedDataIterator(df_test)
    # Record keepers
    tr_losses, te_losses = [], []
    tr_accuracies, te_accuracies = [], []    
    step = 0
    current_epoch = 0
    # Training
    cell_state_ = np.zeros((batch_size, state_size))
    hidden_state_ = np.zeros((batch_size, state_size))
    
    while current_epoch < num_epochs:
        step += 1
        tr_x,tr_y,_ = tr.next_batch(batch_size) # _ for sequence length. not used yet.
        total_loss_, train_step_, init_state_, accuracy_ = sess.run(
            [total_loss, train_step, init_state, accuracy],
            feed_dict = {batchX_placeholder:tr_x,
                         batchY_placeholder:tr_y,
                         batchSeqlen_placeholder:te_seqlen,
                         cell_state:cell_state_,
                         hidden_state:hidden_state_}
            )
        tr_losses.append(total_loss_)
        tr_accuracies.append(accuracy_)
        if step % 100 == 0:
            print "Avg Training Loss at", step, ':', np.mean(tr_losses)
            print "Avg Training Accuracy at", step, ':', np.mean(tr_accuracies)
            te_x,te_y,_ = te.next_batch(batch_size) # randomly sample 100 to evaluate.
            total_loss_, train_step_, init_state_, accuracy_ = sess.run(
                [total_loss, train_step, init_state, accuracy],
                feed_dict = {batchX_placeholder:te_x,
                             batchY_placeholder:te_y,
                             batchSeqlen_placeholder:te_seqlen,
                             cell_state:cell_state_,
                             hidden_state:hidden_state_}
                )
            te_losses.append(total_loss_)
            te_accuracies.append(accuracy_)
            print "Avg Test Loss at", step, ':', np.mean(te_losses)
            print "Avg Test Accuracy at", step, ':', np.mean(te_accuracies)
            print
        if tr.epochs > current_epoch: # go to the next epoch.
            current_epoch += 1
            step = 0
    print "Final Avg Training Loss:", np.mean(tr_losses)
    print "Final Avg Training Accuracy:", np.mean(tr_accuracies)   
    print "Final Avg Test Loss:", np.mean(te_losses)
    print "Final Avg Test Accuracy:", np.mean(te_accuracies)