### Task: translating number words into numbers (e.g. one two three -> 1 2 3)

* Seq2seq as in Bahdanau et al. (2015)

In [2]:
# Add custom import path.
# Presumably this is where the local `utils` module imported below lives.
# The location can be overridden through the SEQ2SEQ_IMPORT_DIR environment
# variable so the notebook is not tied to one machine; when the variable is
# unset the original hard-coded path is used unchanged.

import os
import sys

IMPORT_DIR = os.environ.get(
    'SEQ2SEQ_IMPORT_DIR',
    '/home/jacobsuwang/Documents/UTA2018/NEURAL-NETS/ATTENTION/CODE/01-import-folder')
sys.path.insert(0, IMPORT_DIR)

### MAKING DATA

In [3]:
# Data generation

import utils

# Token <-> index tables.
# Indices 0 and 1 are the special PAD / EOS tokens, 2-6 are the digit tokens
# and 7-11 are the number-word tokens.
word2idx = {'PAD':0,'EOS':1,'1':2,'2':3,'3':4,'4':5,'5':6,
            'one':7,'two':8,'three':9,'four':10,'five':11}
# The vocabulary is exactly the key set of word2idx.
vocab = set(word2idx)
# .items() works on both Python 2 and 3 (.iteritems() is Python-2-only).
idx2word = {idx:word for word,idx in word2idx.items()}
# Number word -> digit token.
word2digit_translate = {'one':'1','two':'2','three':'3',
                        'four':'4','five':'5'}
# The same translation expressed on indices ({7:2, 8:3, 9:4, 10:5, 11:6});
# derived from the tables above so the two views cannot drift apart.
word2digit_translate_byidx = {word2idx[word]: word2idx[digit]
                              for word, digit in word2digit_translate.items()}

def code_sequence(s):
    '''
    Encode a whitespace-separated sentence as a list of vocabulary indices.

    Each token is looked up in word2idx, so a token that is not a key of
    that table raises KeyError.
    '''
    indices = []
    for token in s.split():
        indices.append(word2idx[token])
    return indices

def decode_sequence(l):
    '''
    Turn a sequence of vocabulary indices back into a space-joined string.

    Each index is looked up in idx2word.
    '''
    words = (idx2word[index] for index in l)
    return ' '.join(words)

def encode(data):
    '''
    Index-encode every sentence in data.

    Returns a list with one code_sequence() result per sentence,
    in the same order as the input.
    '''
    return list(map(code_sequence, data))

def to_readable(batch):
    '''
    Decode a time-major batch into a list of readable strings,
    one per sequence in the batch.
    '''
    readable = []
    # Transposing makes each row one sequence (time-major -> batch-major).
    for sequence in batch.transpose():
        readable.append(decode_sequence(sequence))
    return readable

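# To transform encoded data (i.e. a list of index sequences) into
# model input, feed it to utils.batch.
# Sample results (the index values shown are illustrative; with the current
# word2idx, code_sequence('1 2 3') is [2, 3, 4]):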
# utils.batch([code_sequence('1 2 3')])
# (array([[1],
#         [2],
#         [3]], dtype=int32), [3])
# that is, a tuple (time major with shape [max_time, batch_size])
# so with a batch of two, we get
# (array([[1, 4],
#         [2, 5],
#         [3, 5]], dtype=int32), [3, 3])

# Data generator

def random_length(len_from, len_to):
    '''
    Pick a sequence length.

    Returns len_from when both bounds coincide; otherwise returns
    np.random.randint(len_from, len_to), whose upper bound is exclusive.
    '''
    if len_from != len_to:
        return np.random.randint(len_from, len_to)
    return len_from

def random_batch(input_length_from, input_length_to,
                 output_length_from, output_length_to,
                 seq_length_from, seq_length_to,
                 batch_size):
    '''
    Generate one random (input, output) batch of index sequences.

    input_length_from / input_length_to: bounds handed to np.random.randint
        as low / high for the input token indices.
    output_length_from / output_length_to: only validated against each
        other; they are not otherwise used.
    seq_length_from / seq_length_to: bounds for each sequence's length,
        drawn with random_length().
    batch_size: number of sequences to generate.

    Returns (input_batch, output_batch), two lists of index lists; every
    output index is the word2digit_translate_byidx image of the matching
    input index.

    Raises ValueError when an input or output "from" bound exceeds its
    "to" bound.
    '''
    ranges_inverted = (input_length_from > input_length_to
                       or output_length_from > output_length_to)
    if ranges_inverted:
        raise ValueError('length_from > length_to')

    input_batch = []
    for _ in range(batch_size):
        # Draw the length first, then the token indices (same RNG order
        # as evaluating the length inline as the `size` argument).
        seq_len = random_length(seq_length_from, seq_length_to)
        indices = np.random.randint(low=input_length_from,
                                    high=input_length_to,
                                    size=seq_len)
        input_batch.append(indices.tolist())

    output_batch = []
    for input_dat in input_batch:
        output_batch.append([word2digit_translate_byidx[idx] for idx in input_dat])
    return input_batch, output_batch
      
# Example:
# digit_from = 2
# digit_to = 6+1
# word_from = 7
# word_to = 11+1
# seqlen_from = 3
# seqlen_to = 12
# a,b = random_batch(word_from,word_to,digit_from,digit_to,seqlen_from,seqlen_to,batch_size=2)
# print a
# [[11, 7, 11, 11, 10, 11, 9, 10, 10, 8, 9], [10, 7, 7, 9, 8, 9, 7, 8, 11]] <- indices of num words
# print b
# [[6, 2, 6, 6, 5, 6, 4, 5, 5, 3, 4], [5, 2, 2, 4, 3, 4, 2, 3, 6]] <- indices of their digit translation
# print [decode_sequence(a_) for a_ in a]
# ['five one five five four five three four four two three', 'four one one three two three one two five']
# print [decode_sequence(b_) for b_ in b]
# ['5 1 5 5 4 5 3 4 4 2 3', '4 1 1 3 2 3 1 2 5']

### MAKING MODEL

In [4]:
import math

import numpy as np
import tensorflow as tf
import tensorflow.contrib.seq2seq as seq2seq
from tensorflow.contrib.layers import safe_embedding_lookup_sparse as embedding_lookup_unique
from tensorflow.contrib.rnn import LSTMCell, LSTMStateTuple, GRUCell

In [40]:
# Graph

# Start from a fresh default graph and open an interactive session for it.
tf.reset_default_graph()
sess = tf.InteractiveSession()

# Set configuration

# Index ranges into word2idx, written as (from, to) bounds with `to` one past
# the last index: digits '1'..'5' are 2..6, number words 'one'..'five' are 7..11.
digit_from = 2
digit_to = 6+1
word_from = 7
word_to = 11+1
# Bounds for the random sequence length handed to random_batch.
seqlen_from = 3
seqlen_to = 8

batch_size = 10
vocab_size = len(vocab)
input_embedding_size = 10

encoder_hidden_units = 20
decoder_hidden_units = encoder_hidden_units * 2 # because encoder is going to be bidirectional.

attention = True     # togglable
bidirectional = True # currently hardcoded

# NOTE(review): encoder_cell is created again inside the 'BidirectionalEncoder'
# variable scope further down, which replaces this instance before it is used.
encoder_cell = LSTMCell(encoder_hidden_units)
decoder_cell = LSTMCell(decoder_hidden_units)

# _init_placeholder()

# Feed placeholders. The two sequence placeholders are rank-2 int32 tensors in
# time-major layout ([max_time, batch_size]); the two *_length placeholders are
# rank-1 int32 vectors (presumably one length per batch element).
encoder_inputs = tf.placeholder(shape=(None,None), dtype=tf.int32, name='encoder_inputs') # [max_time, batch_size]
encoder_inputs_length = tf.placeholder(shape=(None,), dtype=tf.int32, name='encoder_inputs_length') 
decoder_targets = tf.placeholder(shape=(None,None), dtype=tf.int32, name='decoder_targets')
# NOTE(review): the graph name below is 'decoder_inputs_length' although the
# Python variable is decoder_targets_length.
decoder_targets_length = tf.placeholder(shape=(None,), dtype=tf.int32, name='decoder_inputs_length')

# _init_decoder_train_connectors()

# Derive the decoder's training inputs/targets from the fed decoder_targets:
#   inputs  = [EOS] + targets          (EOS doubles as the start symbol)
#   targets = targets + [EOS] (+ PAD)  (EOS written right after the last real token)
with tf.name_scope('DecoderTrainFeeds'):
    sequence_size, batch_size_ = tf.unstack(tf.shape(decoder_targets)) # [max_time, batch_size]
    EOS_SLICE = tf.ones([1, batch_size_], dtype=tf.int32) * word2idx['EOS']
    PAD_SLICE = tf.ones([1, batch_size_], dtype=tf.int32) * word2idx['PAD']
    decoder_train_inputs = tf.concat([EOS_SLICE, decoder_targets], axis=0) # add EOS to the beginning.
    decoder_train_length = decoder_targets_length + 1 # and adjust length accordingly.
    decoder_train_targets = tf.concat([decoder_targets, PAD_SLICE], axis=0) # add PAD to the end.
    decoder_train_targets_seq_len, _ = tf.unstack(tf.shape(decoder_train_targets))
    # One-hot over time with the EOS id at position (length - 1), i.e. the first
    # step after each sequence's real tokens; PAD id everywhere else.
    # Shape here: [batch_size, max_time + 1].
    decoder_train_targets_eos_mask = tf.one_hot(decoder_train_length - 1,
                                                decoder_train_targets_seq_len,
                                                on_value=word2idx['EOS'], off_value=word2idx['PAD'],
                                                dtype=tf.int32)
    # Back to time-major so it lines up with decoder_train_targets: [max_time + 1, batch_size].
    decoder_train_targets_eos_mask = tf.transpose(decoder_train_targets_eos_mask, [1, 0])
    # Adding the mask writes EOS into the targets; this assumes the slot it lands
    # on holds the PAD id 0 (i.e. utils.batch pads with 0) -- TODO confirm.
    decoder_train_targets = tf.add(decoder_train_targets,
                                   decoder_train_targets_eos_mask) # add EOS to end of target sequence
    # Uniform weights over every (batch, time) position. The batch dimension is
    # the dynamic batch_size_ taken from the fed targets rather than the Python
    # constant batch_size, so the weights match whatever batch size is fed.
    loss_weights = tf.ones([
        batch_size_,
        tf.reduce_max(decoder_train_length)
    ], dtype=tf.float32, name='loss_weights')

# _init_embeddings()

# A single [vocab_size, input_embedding_size] embedding matrix is created and
# used for both lookups below: the encoder inputs and the decoder train inputs.
with tf.variable_scope('embedding') as scope:
    sqrt3 = math.sqrt(3) # unif(-sqrt(3),sqrt(3)) has var = 1.
    initializer = tf.random_uniform_initializer(-sqrt3, sqrt3)
    embedding_matrix = tf.get_variable(
        name='embedding_matrix',
        shape=[vocab_size, input_embedding_size],
        initializer=initializer,
        dtype=tf.float32
    )
    # Index tensors -> embedded tensors (one trailing embedding dimension added).
    encoder_inputs_embedded = tf.nn.embedding_lookup(embedding_matrix, encoder_inputs)
    decoder_train_inputs_embedded = tf.nn.embedding_lookup(embedding_matrix, decoder_train_inputs)

# _init_bidirectional_encoder()

# Bidirectional encoder: run the embedded inputs forwards and backwards, then
# concatenate outputs and final states of both directions along the feature
# axis (hence decoder_hidden_units = 2 * encoder_hidden_units).
with tf.variable_scope('BidirectionalEncoder') as scope:
    encoder_cell = LSTMCell(encoder_hidden_units)
    ((encoder_fw_outputs,encoder_bw_outputs), # each [max_time, batch_size, encoder_hidden_units]
     (encoder_fw_state,encoder_bw_state)) = ( # final states; for LSTM: (c, h), each [batch_size, encoder_hidden_units]
            tf.nn.bidirectional_dynamic_rnn(cell_fw=encoder_cell,
                                            cell_bw=encoder_cell,
                                            inputs=encoder_inputs_embedded,
                                            sequence_length=encoder_inputs_length,
                                            dtype=tf.float32, time_major=True)
        )
    encoder_outputs = tf.concat((encoder_fw_outputs, encoder_bw_outputs), 2) # concat on feature dim.
    if isinstance(encoder_fw_state, LSTMStateTuple):
        # LSTM state: concatenate cell and hidden parts separately.
        encoder_state_c = tf.concat((encoder_fw_state.c, encoder_bw_state.c), 1, name='bidirectional_concat_c')
        encoder_state_h = tf.concat((encoder_fw_state.h, encoder_bw_state.h), 1, name='bidirectional_concat_h')
        encoder_state = LSTMStateTuple(c=encoder_state_c, h=encoder_state_h)
    elif isinstance(encoder_fw_state, tf.Tensor):
        # Single-tensor state (e.g. a GRU cell). This is module-level code, so
        # the result is bound to the plain name encoder_state (there is no `self`).
        encoder_state = tf.concat((encoder_fw_state, encoder_bw_state), 1, name='bidirectional_concat')
    
# _init_decoder()

# Decoder: build the train/inference decoder functions (plain or with Bahdanau
# attention, depending on `attention`), then unroll the decoder twice over the
# same variables -- once teacher-forced for training, once free-running for
# inference.
with tf.variable_scope('Decoder') as scope:
    def output_fn(outputs):
        '''Project decoder outputs to vocab_size logits (linear layer in the Decoder scope).'''
        return tf.contrib.layers.linear(outputs, vocab_size, scope=scope)
    if not attention:
        decoder_fn_train = seq2seq.simple_decoder_fn_train(encoder_state=encoder_state)
        decoder_fn_inference = seq2seq.simple_decoder_fn_inference(
            output_fn=output_fn,
            encoder_state=encoder_state,
            embeddings=embedding_matrix,
            start_of_sequence_id=word2idx['EOS'],
            end_of_sequence_id=word2idx['EOS'],
            # allow a few steps more than the longest encoder input
            maximum_length=tf.reduce_max(encoder_inputs_length) + 3,
            num_decoder_symbols=vocab_size
        )
    else:
        attention_states = tf.transpose(encoder_outputs, [1, 0, 2]) # -> [batch_size, max_time, num_units]
        (attention_keys,
         attention_values,
         attention_score_fn,
         attention_construct_fn) = seq2seq.prepare_attention(
            attention_states=attention_states,
            attention_option='bahdanau',
            num_units=decoder_hidden_units
        )
        decoder_fn_train = seq2seq.attention_decoder_fn_train(
            encoder_state=encoder_state,
            attention_keys=attention_keys,
            attention_values=attention_values,
            attention_score_fn=attention_score_fn,
            attention_construct_fn=attention_construct_fn,
            name='attention_decoder'
        )
        decoder_fn_inference = seq2seq.attention_decoder_fn_inference(
            output_fn=output_fn,
            encoder_state=encoder_state,
            attention_keys=attention_keys,
            attention_values=attention_values,
            attention_score_fn=attention_score_fn,
            attention_construct_fn=attention_construct_fn,
            embeddings=embedding_matrix,
            start_of_sequence_id=word2idx['EOS'],
            end_of_sequence_id=word2idx['EOS'],
            # allow a few steps more than the longest encoder input
            maximum_length=tf.reduce_max(encoder_inputs_length) + 3,
            num_decoder_symbols=vocab_size
        )
    # Everything below is shared by both branches. It must sit outside the
    # if/else: otherwise attention=False would leave decoder_logits_train (and
    # the predictions) undefined for the loss built afterwards.
    (decoder_outputs_train,
     decoder_state_train,
     decoder_context_state_train) = (
        seq2seq.dynamic_rnn_decoder(
            cell=decoder_cell,
            decoder_fn=decoder_fn_train,
            inputs=decoder_train_inputs_embedded,
            sequence_length=decoder_train_length,
            time_major=True,
            scope=scope
        )
    )
    # The training pass yields raw cell outputs; project them to logits here.
    decoder_logits_train = output_fn(decoder_outputs_train)
    decoder_prediction_train = tf.argmax(decoder_logits_train, axis=-1, name='decoder_prediction_train')
    # Reuse the variables created above for the inference unrolling.
    scope.reuse_variables()
    # The inference decoder function was given output_fn, so the result of this
    # call is already treated as logits.
    (decoder_logits_inference,
     decoder_state_inference,
     decoder_context_state_inference) = (
        seq2seq.dynamic_rnn_decoder(
            cell=decoder_cell,
            decoder_fn=decoder_fn_inference,
            time_major=True,
            scope=scope
        )
    )
    decoder_prediction_inference = tf.argmax(decoder_logits_inference, axis=-1, name='decoder_prediction_inference')
    
# _init_optimizer()

# Swap the first two axes of the time-major logits/targets before handing them
# to sequence_loss (time-major -> batch-major).
logits = tf.transpose(decoder_logits_train, [1, 0, 2])
targets = tf.transpose(decoder_train_targets, [1, 0])
# loss_weights is all ones, so every position (padding included) contributes.
loss = seq2seq.sequence_loss(logits=logits, targets=targets, weights=loss_weights)
# Adam with its default hyperparameters.
train_op = tf.train.AdamOptimizer().minimize(loss)

# run training

# Initialize all graph variables in the interactive session.
init = tf.global_variables_initializer()
sess.run(init)

def make_train_inputs(input_seq, target_seq):
    '''
    Build the feed dict for one training step.

    input_seq and target_seq are each passed through utils.batch; the two
    values it returns (batch, lengths) are fed to the encoder and
    decoder-target placeholders respectively.

    Unlike the copy task this was adapted from (which feeds the same data as
    both input and target), this is a translation task, so inputs and targets
    differ. No EOS/PAD is appended to the targets here: the graph does that
    itself in the 'DecoderTrainFeeds' section.
    '''
    encoder_batch, encoder_lengths = utils.batch(input_seq)
    decoder_batch, decoder_lengths = utils.batch(target_seq)

    feed = {}
    feed[encoder_inputs] = encoder_batch
    feed[encoder_inputs_length] = encoder_lengths
    feed[decoder_targets] = decoder_batch
    feed[decoder_targets_length] = decoder_lengths
    return feed

# Training loop: one freshly generated random batch per step. The loss of
# every step is recorded in loss_track; every batches_in_epoch steps the
# current loss and the first three train-mode predictions are printed.
loss_track = []
max_batches = 5000
batches_in_epoch=1000
try:
    for batch in range(max_batches+1):
        batch_enc, batch_dec = random_batch(word_from,word_to,digit_from,digit_to,seqlen_from,seqlen_to,batch_size)
        # Inputs and targets differ (translation task), unlike a copy task
        # where the same data would be fed twice.
        fd = make_train_inputs(batch_enc, batch_dec)
        _, l = sess.run([train_op, loss], fd)
        loss_track.append(l)

        # batch 0 is covered by the modulo test as well.
        if batch % batches_in_epoch == 0:
            # Evaluate the loss and the predictions in a single run (after the
            # update step above) instead of two separate forward passes.
            report_loss, report_pred = sess.run([loss, decoder_prediction_train], fd)
            print('batch {}'.format(batch))
            print('  minibatch loss: {}'.format(report_loss))
            # .T turns the time-major arrays into one row per sequence.
            for i, (e_in, dt_pred) in enumerate(zip(
                    fd[encoder_inputs].T,
                    report_pred.T
                )):
                print('  sample {}:'.format(i + 1))
                print('    enc input     > {}'.format(decode_sequence(e_in)))
                print('    dec train predicted > {}'.format(decode_sequence(dt_pred)))
                if i >= 2:
                    break
            # Blank separator line; print('') behaves the same on Python 2 and 3,
            # whereas a bare `print` is a no-op expression on Python 3.
            print('')
except KeyboardInterrupt:
    print('training interrupted')

batch 0
  minibatch loss: 2.42388939857
  sample 1:
    enc input     > four three five four two three three
    dec train predicted > EOS EOS EOS 3 3 3 3 EOS
  sample 2:
    enc input     > one three one five five four four
    dec train predicted > EOS EOS EOS EOS one 3 3 3
  sample 3:
    enc input     > two one one three PAD PAD PAD
    dec train predicted > EOS EOS EOS EOS EOS PAD PAD PAD

batch 1000
  minibatch loss: 0.264123678207
  sample 1:
    enc input     > two one two two one five PAD
    dec train predicted > 2 1 2 2 1 5 EOS PAD
  sample 2:
    enc input     > one five three three PAD PAD PAD
    dec train predicted > 1 5 3 3 EOS PAD PAD PAD
  sample 3:
    enc input     > four three three two three PAD PAD
    dec train predicted > 4 3 3 2 3 EOS PAD PAD

batch 2000
  minibatch loss: 0.266255199909
  sample 1:
    enc input     > four three five PAD PAD PAD PAD
    dec train predicted > 4 3 5 EOS PAD PAD PAD PAD
  sample 2:
    enc input     > three five one four one PAD 