# DeepLearning 03. Recurrent Neural Nets (RNN)

* **Implementation 4a**: RNN with Keras (basics)
    * *Source*: My RNN code at https://github.com/suwangcompling/texasdataday2017/blob/master/NER_ATIS.ipynb
    * *Contribution*: 
        * Hopefully clearer pipeline

* **Implementation 4b**: RNN with Keras (bidirectional setup + tuning options)
    * *Advanced Model*: Bi-LSTM-CRF (https://github.com/glample/tagger)

## I. Implementation 4a

In [75]:
import pickle, os, random
os.environ['KERAS_BACKEND']='tensorflow' 
import numpy as np
from keras.models import Sequential
from keras.layers import Embedding, Activation, TimeDistributed
from keras.layers import Dense, LSTM, Bidirectional, Dropout
from keras.optimizers import Adam
from keras.utils.np_utils import to_categorical

In [54]:
# LOAD DATA

# NOTE: unpickling can execute arbitrary code -- only load a pickle file that
# comes from a trusted source.
path = "/Users/jacobsw/Desktop/WORK/OJO/NER_PRESENTATION/DATA/atis.pkl"

# Use a `with` block so the file handle is closed as soon as loading finishes
# (a bare `open(...)` inside the call leaves the handle open until it is
# garbage-collected).
with open(path, 'rb') as data_file:
    train_triple, valid_triple, test_triple, dicts = pickle.load(data_file)

# Element 0 of each triple is taken as the inputs and element 2 as the labels;
# element 1 is not read here.
X_train, Y_train = train_triple[0], train_triple[2]
X_valid, Y_valid = valid_triple[0], valid_triple[2]
X_test, Y_test = test_triple[0], test_triple[2]

# Lookup tables (label -> index, word -> index) and their inverses.
l2i = dicts['labels2idx']
w2i = dicts['words2idx']
i2l = {idx: label for label, idx in l2i.iteritems()}
i2w = {idx: word for word, idx in w2i.iteritems()}

In [67]:
# SET CONFIGS

# Table sizes are read off the loaded lookup dictionaries.
vocab_size, label_size = len(w2i), len(l2i)
# Width of the embedding output and of the LSTM output, respectively.
emb_size = hidden_size = 100

# Training schedule: number of passes over the training set, how many training
# iterations pass between validation runs, and how many validation examples
# are sampled per run.
num_epochs = 20
valid_freq, valid_size = 1000, 100

In [60]:
def dim_transform_x(x):
    """
    Wrap one x data point in a leading batch axis of size 1.

    Arguments:
    x: Single x data point.

    Returns a numpy array holding x as its only entry along the first axis,
    i.e. shape [1, length] for a flat sequence.
    """
    single_item_batch = [x]
    return np.asarray(single_item_batch)

def dim_transform_y(y):
    """
    Binarize one y data point and add a leading batch axis of size 1.

    Arguments:
    y: Single y data point.

    Returns the binarized data point, intended to have shape
    [1, length, label_size] (label_size is read from module scope).
    """
    # One label per row, as a column vector, before binarization.
    label_column = np.asarray(y)[:, np.newaxis]
    binarized = to_categorical(label_column, nb_classes=label_size)
    return binarized[np.newaxis, :, :]

In [61]:
# TRANSFORM DATA TO FIT INPUT REQUIREMENTS

# Each example is converted on its own, so every entry of these lists is a
# batch holding exactly one sequence.
X_train = [dim_transform_x(x) for x in X_train]
X_valid = [dim_transform_x(x) for x in X_valid]
X_test = [dim_transform_x(x) for x in X_test]

Y_train = [dim_transform_y(y) for y in Y_train]
Y_valid = [dim_transform_y(y) for y in Y_valid]
Y_test = [dim_transform_y(y) for y in Y_test]

In [70]:
# BUILD COMPUTATIONAL GRAPH

# Layer stack, in order: embedding lookup -> LSTM emitting an output at every
# time step (return_sequences=True) -> one Dense layer applied per time step
# -> softmax over the labels.
rnn = Sequential([
    Embedding(input_dim=vocab_size, output_dim=emb_size),
    LSTM(output_dim=hidden_size, activation='relu', return_sequences=True),
    TimeDistributed(Dense(output_dim=label_size)),
    Activation('softmax'),
])
rnn.compile(
    loss='categorical_crossentropy',
    optimizer=Adam(),
    metrics=['accuracy'],
)

rnn.summary()

____________________________________________________________________________________________________
Layer (type)                     Output Shape          Param #     Connected to                     
embedding_8 (Embedding)          (None, None, 100)     57200       embedding_input_8[0][0]          
____________________________________________________________________________________________________
lstm_7 (LSTM)                    (None, None, 100)     80400       embedding_8[0][0]                
____________________________________________________________________________________________________
timedistributed_6 (TimeDistribut (None, None, 127)     12827       lstm_7[0][0]                     
____________________________________________________________________________________________________
activation_6 (Activation)        (None, None, 127)     0           timedistributed_6[0][0]          
Total params: 150427
______________________________________________________________________

In [66]:
# TRAIN

num_iters = 0

for i in xrange(num_epochs):
    for t in xrange(len(X_train)):
        rnn.train_on_batch(X_train[t], Y_train[t])
        num_iters += 1
        if num_iters%valid_freq==0:
            valid_ids = random.sample(range(len(X_valid)), valid_size)
            valid_costs, valid_accs = [], []
            for v in valid_ids:
                valid_cost, valid_acc = rnn.evaluate(X_valid[v], Y_valid[v], verbose=0)
                valid_costs.append(valid_cost); valid_accs.append(valid_acc)
            print "Validation Cost/Accuracy at Iteration", num_iters, ":", np.mean(valid_costs), np.mean(valid_accs) 

# EVALUATE    

test_costs, test_accs = [], []
for e in xrange(len(X_test)):
    test_cost, test_acc = rnn.evaluate(X_test[e], Y_test[e], verbose=0)
    test_costs.append(test_cost); test_accs.append(test_acc)
print "Test Cost/Accuracy:", np.mean(test_costs), np.mean(test_accs)

Validation Cost/Accuracy at Iteration 1000 : 0.630596980916 0.860808443064
Validation Cost/Accuracy at Iteration 2000 : 0.404342949591 0.911917387688
Validation Cost/Accuracy at Iteration 3000 : 0.319398777756 0.9242008791
Validation Cost/Accuracy at Iteration 4000 : 0.241086330384 0.939273726382
Validation Cost/Accuracy at Iteration 5000 : 0.193205771447 0.957866637992
Validation Cost/Accuracy at Iteration 6000 : 0.120419648209 0.971302836597
Validation Cost/Accuracy at Iteration 7000 : 0.143774769793 0.966408751208
Validation Cost/Accuracy at Iteration 8000 : 0.0945306488611 0.976764610591
Validation Cost/Accuracy at Iteration 9000 : 0.0704847894763 0.980819010238
Validation Cost/Accuracy at Iteration 10000 : 0.0993882891076 0.980816777993
Validation Cost/Accuracy at Iteration 11000 : 0.070867968236 0.980805333555
Validation Cost/Accuracy at Iteration 12000 : 0.0725929829256 0.980705998394
Validation Cost/Accuracy at Iteration 13000 : 0.0701369878041 0.978846239016
Validation Cost/Ac

## II. Implementation 4b

* Tuning:
    * Regularization: Dropout
    * Learning: Learning-rate decay

**NB**: Load data using code in Impl. 4a first. 

**NB**: This section only demonstrates the syntax. The hyperparameters below still need tuning before the model trains well.

In [77]:
# Table sizes are read off the loaded lookup dictionaries.
vocab_size, label_size = len(w2i), len(l2i)
# Width of the embedding output and of each LSTM's output, respectively.
emb_size = hidden_size = 100

# Training schedule: number of passes over the training set, how many training
# iterations pass between validation runs, and how many validation examples
# are sampled per run.
num_epochs = 20
valid_freq, valid_size = 1000, 100

In [81]:
# BUILD COMPUTATIONAL GRAPH

# Layer stack, in order: embedding lookup -> two bidirectional LSTM layers,
# each followed by dropout with p=0.5 -> one Dense layer applied per time step
# -> softmax over the labels.
bilstm = Sequential([
    Embedding(input_dim=vocab_size, output_dim=emb_size),
    Bidirectional(LSTM(output_dim=hidden_size, activation='relu', return_sequences=True)),
    Dropout(p=0.5),
    Bidirectional(LSTM(output_dim=hidden_size, activation='relu', return_sequences=True)),
    Dropout(p=0.5),
    TimeDistributed(Dense(output_dim=label_size)),
    Activation('softmax'),
])

# NOTE(review): Keras documents `decay` as a learning-rate decay applied on
# every update, so 0.9 presumably shrinks the step size very quickly when each
# example is its own batch -- confirm this value when tuning.
adam = Adam(decay=0.9)
bilstm.compile(
    loss='categorical_crossentropy',
    optimizer=adam,
    metrics=['accuracy'],
)

bilstm.summary()

____________________________________________________________________________________________________
Layer (type)                     Output Shape          Param #     Connected to                     
embedding_12 (Embedding)         (None, None, 100)     57200       embedding_input_12[0][0]         
____________________________________________________________________________________________________
bidirectional_7 (Bidirectional)  (None, None, 200)     160800      embedding_12[0][0]               
____________________________________________________________________________________________________
dropout_5 (Dropout)              (None, None, 200)     0           bidirectional_7[0][0]            
____________________________________________________________________________________________________
bidirectional_8 (Bidirectional)  (None, None, 200)     240800      dropout_5[0][0]                  
___________________________________________________________________________________________

In [83]:
# TRAIN

num_iters = 0

for i in xrange(num_epochs):
    for t in xrange(len(X_train)):
        bilstm.train_on_batch(X_train[t], Y_train[t])
        num_iters += 1
        if num_iters%valid_freq==0:
            valid_ids = random.sample(range(len(X_valid)), valid_size)
            valid_costs, valid_accs = [], []
            for v in valid_ids:
                valid_cost, valid_acc = bilstm.evaluate(X_valid[v], Y_valid[v], verbose=0)
                valid_costs.append(valid_cost); valid_accs.append(valid_acc)
            print "Validation Cost/Accuracy at Iteration", num_iters, ":", np.mean(valid_costs), np.mean(valid_accs)
    
# EVALUATE    

test_costs, test_accs = [], []
for e in xrange(len(X_test)):
    test_cost, test_acc = bilstm.evaluate(X_test[e], Y_test[e], verbose=0)
    test_costs.append(test_cost); test_accs.append(test_acc)
print "Test Cost/Accuracy:", np.mean(test_costs), np.mean(test_accs)