# DeepLearning 03b. Recurrent Neural Nets (RNN)

**NB**: This is a simple demo. Hyperparams are not tuned for optimal performance (1 epoch => 79% accuracy).

**NB**: The code is developed with Tensorflow 1.0.0.

* **Implementation 4d**: RNN with Tensorflow (Bi-LSTM with dynamic rnn)
    * *Source*: 
        * Erik Hallstr&ouml;m's blog: https://medium.com/@erikhallstrm/tensorflow-rnn-api-2bb31821b185#.qg4y5kgbq.
        * R2RT's blog: http://r2rt.com/recurrent-neural-networks-in-tensorflow-iii-variable-length-sequences.html.
    * *Contribution*:
        * Both bloggers are brilliant, but neither did a simple POS-tagger example for non-technical linguists.

## Implementation 4d

In [1]:
import sys, nltk, random
import numpy as np
import pandas as pd
import tensorflow as tf
from nltk.corpus import brown
from nltk.stem.porter import PorterStemmer # a bit of text normalization to reduce vocab size.
stemmer = PorterStemmer()
from itertools import chain
from spacy.en import English
from copy import deepcopy

In [2]:
# LOAD DATA

# Brown corpus sentences, each a sequence of (word, tag) pairs, tagged with
# the coarse 'universal' tagset.
tagged_sents = brown.tagged_sents(tagset='universal')
N = len(tagged_sents) # total number of sentences
MAX_LEN = 100 # truncate sents longer than MAX_LEN, pad ones shorter.

In [3]:
# PREPROCESSING

# Size Uniformization

# Parallel per-sentence accumulators: padded word lists, padded tag lists and
# the true sentence lengths.
sents = []
taglists = []
seqlens = []

def pad(words, padder, max_len=None):
    """Force a token list to a fixed length by padding or truncating.

    Args:
        words: list of tokens (words or tags).
        padder: list whose items are repeated after ``words`` when the input
            is too short, e.g. ``[' ']`` or ``['SPACE']``.
        max_len: target length. When None, the module-level ``MAX_LEN`` is
            used, which keeps existing two-argument calls working unchanged.

    Returns:
        A new list: the first ``max_len`` items of ``words`` if it is long
        enough, otherwise ``words`` followed by repetitions of ``padder``.
    """
    if max_len is None:
        max_len = MAX_LEN
    num_words = len(words)
    if num_words < max_len:
        # Trim afterwards so a multi-element padder cannot overshoot max_len.
        return (words + padder * (max_len - num_words))[:max_len]
    return words[:max_len]

def text_normalize(words):
    """Return the lower-cased, stemmed form of every word, in order.

    Stemming uses the module-level ``stemmer``; the input is not modified.
    """
    normalized = []
    for token in words:
        lowered = token.lower()
        normalized.append(stemmer.stem(lowered))
    return normalized

for tagged_sent in tagged_sents:
    # Split the (word, tag) pairs into two parallel tuples.
    words, tags = zip(*tagged_sent)
    normalized_words = text_normalize(list(words))
    sents.append(pad(normalized_words, padder=[' ']))
    taglists.append(pad(list(tags), padder=['SPACE']))
    # Keep the true (unpadded) length, capped at MAX_LEN.
    seqlens.append(min(len(words), MAX_LEN))
    
# Train-Test Split    

# The first 80% of the sentences (in corpus order) train the model; the rest test it.
cutoff = int(N * 0.8)

sents_train, sents_test = sents[:cutoff], sents[cutoff:]
taglists_train, taglists_test = taglists[:cutoff], taglists[cutoff:]
seqlens_train, seqlens_test = seqlens[:cutoff], seqlens[cutoff:]

# Encoding Tokens as Integer Ids (indices of their one-hot vectors)

# Token -> integer id lookup tables, one for words and one for tags.
word_dic, tag_dic = {}, {}

def get_id(word, dic, unk=False):
    """Return the integer id of ``word`` in ``dic``.

    A word not yet in ``dic`` is registered under the next free id
    (the current size of ``dic``). When ``unk`` is true, ``dic`` is neither
    consulted nor modified and the next free id itself is returned, which
    serves as the shared id for unknown items.
    """
    if unk:
        return len(dic)
    # setdefault stores len(dic) only when the word is missing.
    return dic.setdefault(word, len(dic))

# Training split: the vocabularies grow as new words/tags are encountered.
X_train = [[get_id(word, word_dic) for word in words] for words in sents_train]
Y_train = [[get_id(tag, tag_dic) for tag in tags] for tags in taglists_train]

# Test split: the vocabularies are no longer extended; anything unseen during
# training is mapped to the shared unknown id len(dic).
# (The original called an undefined `get_dic` for unseen tags.)
# NOTE(review): an unseen tag would receive id len(tag_dic), which is outside
# the label range of a classifier with len(tag_dic) classes -- presumably this
# never happens with the universal tagset; confirm.
X_test = [[get_id(word, word_dic, unk=word not in word_dic) for word in words]
          for words in sents_test]
Y_test = [[get_id(tag, tag_dic, unk=tag not in tag_dic) for tag in tags]
          for tags in taglists_test]

X_train, X_test = np.array(X_train), np.array(X_test)
Y_train, Y_test = np.array(Y_train), np.array(Y_test)
Seq_train, Seq_test = np.array(seqlens_train), np.array(seqlens_test)

# Batch Data Feeder

class DataIterator:
    """Serve shuffled mini-batches of aligned (X, Y, Seq) arrays.

    The three arrays are copied on construction and always permuted with the
    same index order, so row i of each still describes the same sentence.
    ``epoch`` counts how many times the data has been reshuffled because a
    requested batch no longer fitted.
    """

    def __init__(self, X, Y, Seq):
        # Work on private copies so the caller's arrays are never reordered.
        self.X, self.Y, self.Seq = deepcopy(X), deepcopy(Y), deepcopy(Seq)
        self.size = len(X)
        self.indices = np.arange(self.size)
        self.epoch = 0
        self.cursor = 0
        self.shuffle()

    def shuffle(self):
        """Permute all three arrays identically and rewind the cursor."""
        random.shuffle(self.indices)
        order = self.indices
        self.X = self.X[order]
        self.Y = self.Y[order]
        self.Seq = self.Seq[order]
        self.cursor = 0

    def next_batch(self, n):
        """Return the next ``n`` rows of X, Y and Seq.

        If fewer than ``n`` rows remain, the leftover rows are skipped: the
        epoch counter is bumped, the data reshuffled, and the batch is taken
        from the start of the new order.
        """
        if self.size < self.cursor + n:
            self.epoch += 1
            self.shuffle()
        window = slice(self.cursor, self.cursor + n)
        batch = (self.X[window], self.Y[window], self.Seq[window])
        self.cursor += n
        return batch

In [6]:
# BUILD COMPUTATIONAL GRAPH

def reset_graph():
    """Close the global session ``sess`` if one exists, then reset the default TF graph."""
    live_session = globals().get('sess')
    if live_session:
        live_session.close()
    tf.reset_default_graph()


# Wipe any previously built graph before defining a new one.
reset_graph()

num_epochs  = 1
hidden_size = 100 # size of the word embeddings and of each LSTM's hidden state.
vocab_size  = len(word_dic)+1 # +1 embedding row for the unknown-word id (== len(word_dic)).
num_classes = len(tag_dic)
batch_size  = 32

lr = 1e-4 # learning rate
lmd = 0.01 # L2 regularization

# Inputs: word ids, tag ids and true (unpadded) lengths, one row per sentence.
X = tf.placeholder(tf.int32, [None, MAX_LEN])
Y = tf.placeholder(tf.int32, [None, MAX_LEN])
Seq = tf.placeholder(tf.int32, [None])

embedding = tf.get_variable('embedding', [vocab_size,hidden_size], dtype=tf.float32)
X_emb = tf.nn.embedding_lookup(embedding, X) # X is already int32; no cast needed.

fwd_cell = tf.contrib.rnn.BasicLSTMCell(hidden_size)
bwd_cell = tf.contrib.rnn.BasicLSTMCell(hidden_size)

# outputs is a (forward, backward) pair; sequence_length tells the RNN where
# each sentence really ends.
outputs, _ = tf.nn.bidirectional_dynamic_rnn(cell_fw=fwd_cell,cell_bw=bwd_cell,inputs=X_emb,
                                             dtype=tf.float32,sequence_length=Seq)

keep_prob = tf.placeholder(tf.float32)
# Forward and backward outputs are joined on the feature axis.
outputs_concat = tf.nn.dropout(tf.concat(outputs, 2), keep_prob) # (batch_size,MAX_LEN,2*hidden_size)
dense_input_size = int(outputs_concat.get_shape()[2]) # 2*hidden_size = 200
dense_inputs = tf.reshape(outputs_concat, [-1, dense_input_size]) # (batch_size*MAX_LEN,2*hidden_size)
W = tf.get_variable('weight', shape=[dense_input_size, num_classes], 
                    initializer=tf.contrib.layers.xavier_initializer())
b = tf.Variable(tf.constant(0.1, shape=[num_classes]), name='bias')
l2_loss = tf.constant(0.0)
l2_loss += tf.nn.l2_loss(W)
l2_loss += tf.nn.l2_loss(b)

logits = tf.nn.xw_plus_b(dense_inputs, W, b) # (?, num_classes)
pred_probs = tf.nn.softmax(logits) # (?, num_classes)
# `axis` replaces the deprecated `dimension` keyword of tf.argmax.
Y_pred = tf.cast(tf.argmax(pred_probs, axis=1), tf.int32)
Y_true = tf.reshape(Y, shape=[-1])
# NOTE(review): this mean runs over every one of the batch_size*MAX_LEN
# positions, padding included, whereas the accuracy below ignores padding.
# Consider masking the loss the same way.
loss = tf.reduce_mean(tf.nn.sparse_softmax_cross_entropy_with_logits(logits=logits, labels=Y_true)) + lmd*l2_loss

# Accuracy over real tokens only: padded positions (label 'SPACE') are zeroed
# out and the count is divided by the total number of real tokens.
correct = tf.cast(tf.equal(Y_pred, Y_true), tf.int32)
mask = tf.cast(tf.not_equal(Y_true, tag_dic['SPACE']), tf.int32)
total_seqlen = tf.cast(tf.reduce_sum(Seq), tf.float32)
correct = tf.multiply(correct, mask)
accuracy = tf.cast(tf.reduce_sum(correct), tf.float32) / total_seqlen

train = tf.train.AdamOptimizer(lr).minimize(loss)

In [7]:
%%time

# TRAIN & EVALUATE

# Train for num_epochs passes over the training data, then score the whole
# test set in a single batch. Progress and final averages go to stdout.
# print(...) is called with one pre-formatted string so the output is the
# same under Python 2 and Python 3 (the original used Python 2-only print
# statements).
with tf.Session() as sess:

    # Initialize graph
    sess.run(tf.global_variables_initializer())

    # Create data readers
    tr = DataIterator(X_train,Y_train,Seq_train)
    te = DataIterator(X_test,Y_test,Seq_test)

    # Record keepers
    tr_losses, te_losses = [], []
    tr_accuracies, te_accuracies = [], []
    step = 0
    current_epoch = 0

    # Training
    while current_epoch < num_epochs:
        step += 1
        tr_x,tr_y,tr_seq = tr.next_batch(batch_size)
        # Dropout is active during training only (keep_prob < 1).
        loss_, train_, accuracy_ = sess.run([loss, train, accuracy],
                                             feed_dict = {X:tr_x,
                                                          Y:tr_y,
                                                          Seq:tr_seq,
                                                          keep_prob:0.75})
        tr_losses.append(loss_)
        tr_accuracies.append(accuracy_)
        if step % 10 == 0:
            print("Step %s | loss: %s | acc: %s" % (step, loss_, accuracy_))
        if tr.epoch > current_epoch: # the iterator wrapped around: go to the next epoch.
            current_epoch += 1
            step = 0

    # Test: one batch holding the entire test set, dropout disabled.
    te_x,te_y,te_seq = te.next_batch(len(X_test))
    loss_, accuracy_ = sess.run([loss, accuracy],
                                 feed_dict = {X:te_x,
                                              Y:te_y,
                                              Seq:te_seq,
                                              keep_prob:1.0})
    te_losses.append(loss_)
    te_accuracies.append(accuracy_)
    print("")
    print("Final Avg Training Loss: %s" % np.mean(tr_losses))
    print("Final Avg Training Accuracy: %s" % np.mean(tr_accuracies))
    print("Final Avg Test Loss: %s" % np.mean(te_losses))
    print("Final Avg Test Accuracy: %s" % np.mean(te_accuracies))

Step 10 | loss: 2.68788 | acc: 0.234528
Step 20 | loss: 2.68328 | acc: 0.236842
Step 30 | loss: 2.67787 | acc: 0.241573
Step 40 | loss: 2.67206 | acc: 0.246594
Step 50 | loss: 2.66149 | acc: 0.249729
Step 60 | loss: 2.65497 | acc: 0.231771
Step 70 | loss: 2.64448 | acc: 0.240069
Step 80 | loss: 2.62407 | acc: 0.226902
Step 90 | loss: 2.59126 | acc: 0.275676
Step 100 | loss: 2.59809 | acc: 0.234352
Step 110 | loss: 2.58303 | acc: 0.240602
Step 120 | loss: 2.58607 | acc: 0.229199
Step 130 | loss: 2.57879 | acc: 0.266323
Step 140 | loss: 2.57354 | acc: 0.25
Step 150 | loss: 2.55386 | acc: 0.249001
Step 160 | loss: 2.54894 | acc: 0.261429
Step 170 | loss: 2.5739 | acc: 0.244403
Step 180 | loss: 2.5413 | acc: 0.240811
Step 190 | loss: 2.54706 | acc: 0.232947
Step 200 | loss: 2.54842 | acc: 0.239227
Step 210 | loss: 2.53342 | acc: 0.248196
Step 220 | loss: 2.54667 | acc: 0.248696
Step 230 | loss: 2.52893 | acc: 0.255474
Step 240 | loss: 2.54491 | acc: 0.27518
Step 250 | loss: 2.51877 | acc: 