In [1]:
import os
import sys

# Directory that is put on the import path for project-local modules.
# Override it with the AIDA_SCRIPTS_DIR environment variable instead of editing
# the notebook; the original location remains the default.
AIDA_SCRIPTS_DIR = os.environ.get("AIDA_SCRIPTS_DIR", "/work/04233/sw33286/AIDA-SCRIPTS")

# Drop any earlier entry first so re-running this cell keeps exactly one copy
# at the front of sys.path instead of prepending a duplicate every time.
while AIDA_SCRIPTS_DIR in sys.path:
    sys.path.remove(AIDA_SCRIPTS_DIR)
sys.path.insert(0, AIDA_SCRIPTS_DIR)

In [2]:
import math
import random
import numpy as np

import tensorflow as tf
import tensorflow.contrib.seq2seq as seq2seq
from tensorflow.contrib.layers import safe_embedding_lookup_sparse as embedding_lookup_unique
from tensorflow.contrib.rnn import LSTMCell, LSTMStateTuple, GRUCell

import helpers

### Prepare data

In [3]:
# Toy vocabulary: the spelled-out numbers one through ten.
VOCAB = 'one two three four five six seven eight nine ten'.split()
# Bounds handed to generate_sent() by default when sampling a sentence length.
FROM_LEN = 3
TO_LEN = 8

# Register the special tokens first, then the vocabulary words.
# NOTE(review): presumably this gives PAD and EOS the two lowest ids -- confirm
# against helpers.Indexer.
indexer = helpers.Indexer()
for word in ['PAD', 'EOS'] + VOCAB:
    indexer.get_index(word)
    
def generate_sent(from_len, to_len):
    """Sample a random sentence from VOCAB.

    The length comes from np.random.randint(from_len, to_len), whose upper
    bound is exclusive, so a sentence never has `to_len` words.

    Returns:
        A numpy array of words drawn from VOCAB with np.random.choice.
    """
    n_words = np.random.randint(from_len, to_len)
    sentence = np.random.choice(VOCAB, n_words)
    return sentence

def to_code(sent):
    """Translate a sequence of words into a list of ids using the global indexer."""
    codes = []
    for token in sent:
        codes.append(indexer.get_index(token))
    return codes

def to_sent(code):
    """Translate a sequence of ids back into a list of objects using the global indexer."""
    return [indexer.get_object(w_idx) for w_idx in code]
    
def get_batch(n, from_len=FROM_LEN, to_len=TO_LEN):
    """Generate `n` random sentences and return them encoded as lists of ids.

    `from_len` and `to_len` are forwarded unchanged to generate_sent().
    """
    batch = []
    for _ in range(n):
        batch.append(to_code(generate_sent(from_len, to_len)))
    return batch

class DataIterator:
    """Serve paired samples (X[i], Y[i]) in consecutive mini-batches, counting epochs.

    Args:
        X: indexable collection of inputs (numpy array or plain sequence).
        Y: indexable collection of targets, same length as X.
        shuf: when True, X and Y are re-ordered with one shared random
            permutation every time an epoch finishes. When False the original
            order is kept for every epoch.
    """

    def __init__(self, X, Y, shuf=False):
        self.X = X
        self.Y = Y
        self.size = len(self.X)
        self.indices = list(range(self.size))
        self.cursor = 0  # position of the next unread sample
        self.epoch = 0   # number of completed passes over the data
        self.shuf = shuf

    @staticmethod
    def _take(data, order):
        """Return `data` re-ordered by the index list `order`."""
        if isinstance(data, np.ndarray):
            return data[order]
        # Plain sequences do not support index-list lookup.
        return [data[i] for i in order]

    def _shuffle(self):
        """Apply one random permutation to both X and Y so pairs stay aligned."""
        random.shuffle(self.indices)
        self.X = self._take(self.X, self.indices)
        self.Y = self._take(self.Y, self.indices)

    def next_batch(self, k):
        """Return the next (batch_X, batch_Y) of at most `k` samples.

        When fewer than or exactly `k` samples remain, the remainder is
        returned (possibly shorter than `k`), the cursor rewinds to the start,
        the epoch counter is incremented and, if `shuf` is set, the data is
        reshuffled.

        Raises:
            ValueError: if `k` is not positive (the cursor would never advance).
        """
        if k <= 0:
            raise ValueError('batch size k must be positive, got {}'.format(k))
        end = self.cursor + k
        if end >= self.size:
            batch_X, batch_Y = self.X[self.cursor:], self.Y[self.cursor:]
            self.cursor = 0
            self.epoch += 1
            # Honour the constructor flag instead of always shuffling.
            if self.shuf:
                self._shuffle()
        else:
            batch_X, batch_Y = self.X[self.cursor:end], self.Y[self.cursor:end]
            self.cursor = end
        return batch_X, batch_Y
    

In [4]:
# Sentences have different lengths, so each dataset is a ragged list of lists.
# Ask for an object array explicitly: recent NumPy releases raise instead of
# building one implicitly from ragged input.
train_data = np.array(get_batch(10000), dtype=object)
test_data = np.array(get_batch(200), dtype=object)
# The task is to reproduce the input, so inputs double as targets.
# shuf=True states explicitly that the data is reshuffled after each epoch.
train_iter = DataIterator(train_data, train_data, shuf=True)

### Build model

In [5]:
class PtrNet:
    
    def __init__(self, cell_size, 
                       embedding_size=50,
                       glove_embeddings=False,
                       learning_rate=1e-4,
                       bidirectional=True, 
                       attention=True):
        
        self.bidirectional = bidirectional
        self.attention = attention
        self.cell_size = cell_size
        self.embedding_size = embedding_size
        self.glove_embeddings = glove_embeddings
        self.learning_rate = learning_rate
        
    def _build_graph(self, enc_in_name, enc_in_len_name, dec_tar_name,
                           embedding_scope_name, embedding_name,
                           encoder_scope_name, decoder_scope_name):
        
        # init LSTM cells
        self._init_cells()
        # init placeholders
        self._init_placeholders(enc_in_name, enc_in_len_name, dec_tar_name)
        # init embeddings
        self._init_embeddings(embedding_scope_name, embedding_name)
        # build encoder
        self._build_encoder(encoder_scope_name)
        # build decoder
        self._build_decoder(decoder_scope_name)
        # build optimizer
        self._build_optimizer()
        self._build_evaluator()
        
        
    def _init_cells(self):
        
        self.encoder_cell = LSTMCell(self.cell_size)
        if self.bidirectional:
            self.decoder_cell = LSTMCell(self.cell_size*2)
        else:
            self.decoder_cell = LSTMCell(self.cell_size)
        
    def _init_placeholders(self, enc_in_name, enc_in_len_name, dec_tar_name):
        
        self.encoder_inputs = tf.placeholder(tf.int32, [None, None], name=enc_in_name)
        self.encoder_inputs_length = tf.placeholder(tf.int32, [None,], name=enc_in_len_name)
        self.decoder_targets = tf.placeholder(tf.int32, [None, None], name=dec_tar_name)
        
    def _init_embeddings(self, scope_name, embedding_name):
        
        with tf.variable_scope(scope_name) as scope:
            if self.glove_embeddings:
                self.embedding_matrix = tf.get_variable(embedding_name, self.glove_embeddings.shape,
                                                        initializer=tf.constant_initializer())
                glove_feed = tf.placeholder(tf.float32, self.glove_embeddings.shape)
                glove_init = self.embedding_matrix.assign(glove_feed)
                self.encoder_inputs_embedded = tf.nn.embedding_lookup(self.embedding_matrix, self.encoder_inputs)
            else:
                self.embedding_matrix = tf.get_variable(embedding_name, [self.vocab_size, self.embedding_size],
                                                        initializer=tf.contrib.layers.xavier_initializer())
                self.encoder_inputs_embedded = tf.nn.embedding_lookup(self.embedding_matrix, self.encoder_inputs)
        
    def _build_encoder(self, scope_name):
        
        with tf.variable_scope(scope_name) as scope:
            if self.bidirectional:
                ((encoder_fw_outputs,encoder_bw_outputs),
                 (encoder_fw_final_state,encoder_bw_final_state)) = (
                    tf.nn.bidirectional_dynamic_rnn(cell_fw=self.encoder_cell,
                                                    cell_bw=self.encoder_cell,
                                                    inputs=self.encoder_inputs_embedded,
                                                    sequence_length=self.encoder_inputs_length,
                                                    dtype=tf.float32, time_major=True)
                )
                self.encoder_outputs = tf.concat((encoder_fw_outputs,encoder_bw_outputs), 2)
                encoder_final_state_c = tf.concat((encoder_fw_final_state.c, encoder_bw_final_state.c), 1)
                encoder_final_state_h = tf.concat((encoder_fw_final_state.h, encoder_bw_final_state.h), 1)
                self.encoder_final_state = LSTMStateTuple(c=encoder_final_state_c, h=encoder_final_state_h)
            else:
                (self.encoder_outputs,self.encoder_final_state) = (
                    tf.nn.dynamic_rnn(cell=self.encoder_cell,
                                      inputs=self.encoder_inputs_embedded,
                                      sequence_length=self.encoder_inputs_length,
                                      dtype=tf.float32, time_major=True)
                )
                
    def _build_decoder(self, scope_name):
        """Build the decoder with tf.nn.raw_rnn and project its outputs to vocabulary logits.

        Each sequence is decoded for `encoder_inputs_length` steps. The first
        decoder input is an all-ones vector; every later input is a single
        encoder output picked by a hard (argmax) attention over the encoder
        time axis.

        Args:
            scope_name: variable scope holding the projection (W, b) and the
                attention parameters (W1, W2, v).

        Sets self.decoder_length, self.W, self.b, self.W1, self.W2, self.v,
        self.eos_step_embedded, self.pad_step_embedded, self.decoder_logits
        and self.decoder_prediction.
        """
        
        with tf.variable_scope(scope_name) as scope:
            # encoder_inputs is unpacked as (max_time, batch): time-major layout.
            encoder_max_time, batch_size = tf.unstack(tf.shape(self.encoder_inputs))
            # Decode exactly as many steps as each input sequence has tokens.
            self.decoder_length = self.encoder_inputs_length
            # Output projection from decoder cell outputs to vocabulary logits.
            self.W = tf.get_variable('W', [self.decoder_cell.output_size, self.vocab_size],
                                     initializer=tf.contrib.layers.xavier_initializer())
            self.b = tf.get_variable('b', [self.vocab_size],
                                     initializer=tf.contrib.layers.xavier_initializer())
            # Constant decoder inputs: all ones for the first step, all zeros once
            # every sequence in the batch has finished. They are as wide as the
            # decoder cell output because later inputs are encoder outputs reshaped
            # to that width (see get_next_input).
            self.eos_step_embedded = tf.ones([batch_size, self.decoder_cell.output_size], 
                                             dtype=tf.float32, name='EOS')
            self.pad_step_embedded = tf.zeros([batch_size, self.decoder_cell.output_size], 
                                              dtype=tf.float32, name='PAD')
            # Attention parameters for the score v^T tanh(E.W1 + d.W2).
            self.W1 = tf.get_variable('W1', [self.decoder_cell.output_size, self.decoder_cell.output_size],
                                      initializer=tf.contrib.layers.xavier_initializer())
            self.W2 = tf.get_variable('W2', [self.decoder_cell.output_size, self.decoder_cell.output_size],
                                      initializer=tf.contrib.layers.xavier_initializer()) 
            self.v = tf.get_variable('v', [self.decoder_cell.output_size, 1], dtype=tf.float32,
                                     initializer=tf.contrib.layers.xavier_initializer())            
        
        # NOTE: the variable_scope block ends above; the loop functions and the
        # raw_rnn call below are built outside `scope_name`.
        def loop_fn_initial():
            """Return the raw_rnn loop state for the very first call."""
            # A sequence is finished from the start only if its length is <= 0.
            initial_elements_finished = (0 >= self.decoder_length)
            initial_input = self.eos_step_embedded
            # The decoder starts from the encoder's final state.
            initial_cell_state = self.encoder_final_state
            initial_cell_output = None 
            initial_loop_state = None 
            return (initial_elements_finished,
                    initial_input,
                    initial_cell_state,
                    initial_cell_output,
                    initial_loop_state)  
        
        def loop_fn_transition(time, previous_output, previous_state, previous_loop_state):
            """Return the raw_rnn loop state for every call after the first."""
            def get_next_input():
                """Pick, per batch element, the encoder output with the highest attention score."""
                # Dynamic shape of the encoder outputs, unpacked as (time, batch, hidden).
                mt, bc, hd = tf.unstack(tf.shape(self.encoder_outputs))
                # Project every encoder output with W1 -> [mt, bc, output_size].
                EW1 = tf.reshape(tf.tensordot(self.encoder_outputs, self.W1, axes=[[2],[0]]),
                                 [mt, bc, self.decoder_cell.output_size])
                # Project the previous decoder output with W2.
                # NOTE(review): previous_output is presumably [bc, output_size], so the
                # add below broadcasts it over the time axis -- confirm.
                DW2 = tf.matmul(previous_output, self.W2)
                EW1_plus_DW2 = tf.add(EW1, DW2)
                # Scores v^T tanh(...), reshaped to [mt, bc] and softmax-normalised
                # along axis 0 (time). No sequence-length mask is applied here.
                attention_mat = tf.nn.softmax(tf.reshape(tf.squeeze(tf.tensordot(tf.nn.tanh(EW1_plus_DW2), 
                                                                                 self.v, 
                                                                                 axes=[[2],[0]]),
                                                                    axis=2), [mt,bc]), dim=0)
                # One-hot mask (depth mt, laid out along axis 0) marking the
                # highest-scoring time step of each batch element.
                # NOTE(review): argmax/one_hot are presumably not differentiable, so
                # W1, W2 and v may receive no gradient through this path -- confirm.
                selector = tf.one_hot(tf.argmax(attention_mat, axis=0), depth=mt,
                                      on_value=1.0, off_value=0.0, axis=0)
                # Move hidden to the front so the mask broadcasts over it, zero out
                # every step except the selected one, then restore the axis order.
                inputs_embedded_selected = tf.transpose(tf.multiply(tf.transpose(self.encoder_outputs, [2,0,1]),
                                                                    selector), [1,2,0])
                # Summing over time leaves only the selected encoder output.
                next_input = tf.reduce_sum(tf.reshape(inputs_embedded_selected, 
                                                      [mt,bc,self.decoder_cell.output_size]), axis=0)
                return next_input
            elements_finished = (time >= self.decoder_length)
            finished = tf.reduce_all(elements_finished)
            # Feed the zero vector once the whole batch is done, otherwise attend.
            inpt = tf.cond(finished, lambda: self.pad_step_embedded, get_next_input)
            # State and output are passed through unchanged.
            state = previous_state
            output = previous_output
            loop_state = None
            return (elements_finished,
                    inpt,
                    state,
                    output,
                    loop_state)
        
        def loop_fn(time, previous_output, previous_state, previous_loop_state):
            """Dispatch to the initial or the transition loop function."""
            # previous_state is None only on the first call.
            if previous_state is None:
                assert previous_output is None and previous_state is None
                return loop_fn_initial()
            else:
                return loop_fn_transition(time, previous_output, previous_state, previous_loop_state)
            
        decoder_outputs_ta, decoder_final_state, _ = tf.nn.raw_rnn(self.decoder_cell, loop_fn)
        # Stack the per-step outputs; the result is unpacked as (steps, batch, dim).
        decoder_outputs = decoder_outputs_ta.stack()
        decoder_max_step, decoder_batch_size, decoder_dim = tf.unstack(tf.shape(decoder_outputs))
        # Flatten to 2-D for the matmul, then restore [steps, batch, vocab_size].
        decoder_outputs_flat = tf.reshape(decoder_outputs, (-1, self.decoder_cell.output_size))
        decoder_logits_flat = tf.add(tf.matmul(decoder_outputs_flat, self.W), self.b)
        self.decoder_logits = tf.reshape(decoder_logits_flat, 
                                         (decoder_max_step, decoder_batch_size, self.vocab_size))
        # Predicted token id per step and batch element.
        self.decoder_prediction = tf.cast(tf.argmax(self.decoder_logits, 2), dtype=tf.int32)
        
    def _build_optimizer(self):
        
        stepwise_cross_entropy = tf.nn.softmax_cross_entropy_with_logits(
            labels=tf.one_hot(self.decoder_targets, depth=self.vocab_size, dtype=tf.float32),
            logits=self.decoder_logits
        )
        self.loss = tf.reduce_mean(stepwise_cross_entropy)
        self.global_step = tf.Variable(0, name='global_step', trainable=False)
        optimizer = tf.train.AdamOptimizer(self.learning_rate)
        grads_and_vars = optimizer.compute_gradients(self.loss)
        self.train_op = optimizer.apply_gradients(grads_and_vars, self.global_step)
        
    def _build_evaluator(self):
        
        correct_raw = tf.cast(tf.equal(self.decoder_prediction, self.decoder_targets), tf.int32)
        mask = tf.cast(tf.not_equal(self.decoder_targets, 0), tf.int32)
        total_length = tf.cast(tf.reduce_sum(self.encoder_inputs_length), tf.float32)
        correct = tf.multiply(correct_raw, mask)
        self.accuracy = tf.cast(tf.reduce_sum(correct), tf.float32) / total_length
    
    def fit(self, train_iter, indexer, 
            num_epochs, batch_size, verbose=100,
            enc_in_name='encoder_inputs',
            enc_in_len_name='encoder_inputs_length',
            dec_tar_name='decoder_targets',
            embedding_scope_name='Embeddings',
            embedding_name='embedding_matrix',
            encoder_scope_name='Encoder',
            decoder_scope_name='Decoder'):
        """Build a fresh graph and train it on batches drawn from `train_iter`.

        Args:
            train_iter: iterator exposing `epoch` and `next_batch(batch_size)`
                that returns (batch_X, batch_Y).
            indexer: vocabulary object supporting `len()` and `get_object(id)`.
            num_epochs: number of epochs to train, counted from the epoch the
                iterator is at when this method is called.
            batch_size: number of samples requested per step.
            verbose: progress and up to three samples are printed whenever the
                global step is a multiple of this value.
            enc_in_name, enc_in_len_name, dec_tar_name: placeholder names.
            embedding_scope_name, embedding_name, encoder_scope_name,
                decoder_scope_name: variable scope / variable names.

        Training stops early, without raising, on KeyboardInterrupt.
        """
        self.indexer = indexer
        self.vocab_size = len(indexer)

        # Release the session of an earlier fit() call before replacing it.
        previous_sess = getattr(self, 'sess', None)
        if previous_sess is not None:
            previous_sess.close()

        tf.reset_default_graph()
        self.sess = tf.Session()
        self._build_graph(enc_in_name, enc_in_len_name, dec_tar_name,
                          embedding_scope_name, embedding_name,
                          encoder_scope_name, decoder_scope_name)  
        self.sess.run(tf.global_variables_initializer())

        def to_sent(code):
            """Map a sequence of ids back to vocabulary objects."""
            return [indexer.get_object(w_idx) for w_idx in code]

        # Count epochs relative to the iterator's current position, so an
        # iterator that was already used still yields `num_epochs` epochs.
        start_epoch = train_iter.epoch
        reported_epoch = None
        fetches = [self.train_op, self.loss, self.accuracy,
                   self.global_step, self.decoder_prediction]
        try:
            while train_iter.epoch - start_epoch < num_epochs:
                if reported_epoch != train_iter.epoch:
                    print('Epoch {}:\n'.format(train_iter.epoch+1))
                    reported_epoch = train_iter.epoch
                batch_X, batch_Y = train_iter.next_batch(batch_size)
                inputs_, inputs_length_ = helpers.batch(batch_X)
                targets_, _ = helpers.batch(batch_Y) # target length doesn't matter.
                fd = {
                    self.encoder_inputs: inputs_,
                    self.encoder_inputs_length: inputs_length_,
                    self.decoder_targets: targets_
                }
                # Fetch the predictions in the same run: they then belong to the
                # reported loss/accuracy and need no second forward pass.
                _, l, a, step, predictions_ = self.sess.run(fetches, fd)
                if step==0 or step%verbose==0:
                    print('batch {}'.format(step))
                    print('  minibatch loss = {} | accuracy = {}'.format(l, a))
                    # [max-time,batch_size] -> [batch_size,max-time]; show the first 3 samples.
                    for i, (e_in, dt_pred) in enumerate(zip(inputs_.T[:3], predictions_.T[:3])):
                        print('  sample {}:'.format(i+1))
                        print('    enc input           > {}'.format([w for w in to_sent(e_in) if w!='PAD']))
                        print('    dec train predicted > {}'.format([w for w in to_sent(dt_pred) if w!='PAD']))
                    print('\n')  
        except KeyboardInterrupt:
            print('training interrupted')
    
    def evaluate(self, test):
        inputs_, inputs_length_ = helpers.batch(test)
        targets_, _ = helpers.batch(test)
        fd = {
            self.encoder_inputs: inputs_,
            self.encoder_inputs_length: inputs_length_,
            self.decoder_targets: targets_
        }    
        l, a = self.sess.run([self.loss, self.accuracy], fd)
        print('Evaluating on test: loss = {} | accuracy = {}'.format(l,a))
    
    def end_session(self):
        try:
            self.sess.close()
            print('Session ended.')
        except:
            print('Attempting to close a closed session or session does not exist.')
        
                

In [6]:
# Model with a 20-unit encoder cell; all other hyper-parameters keep their
# defaults (bidirectional encoder, learning rate 1e-4).
pt = PtrNet(cell_size=20)

In [7]:
# Train for 10 epochs in mini-batches of 10; progress is printed whenever the
# global step is a multiple of 1000.
pt.fit(train_iter=train_iter, indexer=indexer, num_epochs=10, batch_size=10, verbose=1000)
    # Train it long enough and you'll get perfect results on this.

Epoch 1:

batch 1000
  minibatch loss = 1.928040862083435 | accuracy = 0.40816327929496765
  sample 1:
    enc input           > ['four', 'seven', 'ten', 'five']
    dec train predicted > ['four', 'seven', 'seven', 'seven', 'five', 'five', 'five']
  sample 2:
    enc input           > ['six', 'ten', 'ten', 'nine', 'four']
    dec train predicted > ['ten', 'ten', 'ten', 'ten', 'ten', 'five', 'five']
  sample 3:
    enc input           > ['eight', 'six', 'two', 'ten', 'ten', 'four']
    dec train predicted > ['ten', 'ten', 'ten', 'ten', 'ten', 'ten', 'five']


Epoch 2:

batch 2000
  minibatch loss = 1.7469537258148193 | accuracy = 0.4000000059604645
  sample 1:
    enc input           > ['five', 'seven', 'seven', 'one', 'four', 'four']
    dec train predicted > ['seven', 'four', 'four', 'four', 'four', 'seven']
  sample 2:
    enc input           > ['nine', 'five', 'nine', 'five', 'eight', 'eight', 'three']
    dec train predicted > ['nine', 'five', 'eight', 'eight', 'eight', 'eight', 'e

In [8]:
# Print loss and accuracy on the 200 sentences generated for testing.
pt.evaluate(test_data)

Evaluating on test: loss = 1.028043270111084 | accuracy = 0.5777559280395508


In [9]:
# Close the TensorFlow session held by the model.
pt.end_session()

Session ended.
