### Task: translating number words into numbers (e.g. one two three -> 1 2 3)

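* Seq2seq as in Sutskever et al. (2014), here with a bidirectional LSTM encoder and a decoder that feeds its own predictions back in as inputs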

In [1]:
# Add custom import path
#
# The directory below is machine-specific (presumably where the local
# `utils` module imported in the next cell lives -- confirm). Set the
# SEQ2SEQ_IMPORT_DIR environment variable to point at your own copy;
# when it is unset, the original hard-coded path is used.

import os
import sys

CUSTOM_IMPORT_DIR = os.environ.get(
    'SEQ2SEQ_IMPORT_DIR',
    '/home/jacobsuwang/Documents/UTA2018/NEURAL-NETS/ATTENTION/CODE/01-import-folder')
# Prepend so that modules in this directory win over same-named ones elsewhere.
sys.path.insert(0, CUSTOM_IMPORT_DIR)

### MAKING DATA

In [2]:
# Data generation

import utils

# Token <-> index tables.
# word2idx is the single source of truth; the other tables are derived from
# it so they cannot drift apart. PAD is index 0 and EOS is index 1, which is
# what the graph cell relies on when it builds its PAD/EOS time slices with
# tf.zeros / tf.ones.
word2idx = {'PAD':0,'EOS':1,'1':2,'2':3,'3':4,'4':5,'5':6,
            'one':7,'two':8,'three':9,'four':10,'five':11}
vocab = set(word2idx)
# .items() (rather than the Python-2-only .iteritems()) runs on Python 2 and 3.
idx2word = {idx: word for word, idx in word2idx.items()}
# Number word -> digit token.
word2digit_translate = {'one':'1','two':'2','three':'3',
                        'four':'4','five':'5'}
# Same translation expressed on vocab indices: {7:2, 8:3, 9:4, 10:5, 11:6}.
word2digit_translate_byidx = {word2idx[word]: word2idx[digit]
                              for word, digit in word2digit_translate.items()}

def code_sequence(s):
    '''
    Encode a whitespace-separated sentence as a list of vocab indices.

    Each token is looked up in word2idx; a token that is not a key of
    word2idx raises KeyError.
    '''
    indices = []
    for token in s.split():
        indices.append(word2idx[token])
    return indices

def decode_sequence(l):
    '''
    Turn an iterable of vocab indices back into a single string,
    with the corresponding tokens joined by one space each.

    An index that is not a key of idx2word raises KeyError.
    '''
    tokens = (idx2word[idx] for idx in l)
    return ' '.join(tokens)

def encode(data):
    '''
    Index-encode every sentence in `data` (an iterable of strings)
    with code_sequence and return the encodings as a list.
    '''
    return list(map(code_sequence, data))

def to_readable(batch):
    '''
    Decode a time-major batch of indices into readable sentences.

    The batch is transposed first (time-major -> batch-major), so each row
    of the transposed batch is one sequence; one decoded string is
    returned per sequence.
    '''
    sentences = []
    for sequence in batch.transpose():
        sentences.append(decode_sequence(sequence))
    return sentences

# To transform data (i.e. a list of index-encoded sentences) into
# input data, feed it to utils.batch
# sample results (the indices here are illustrative, not real encodings):
# utils.batch([[1, 2, 3]])
# (array([[1],
#         [2],
#         [3]], dtype=int32), [3])
# that is, a tuple (time-major array with shape [max_time, batch_size], list of sequence lengths)
# so with a batch of two, e.g. utils.batch([[1, 2, 3], [4, 5, 5]]), we get
# (array([[1, 4],
#         [2, 5],
#         [3, 5]], dtype=int32), [3, 3])

# Data generator

def random_length(len_from, len_to):
    '''
    Draw a random length from the half-open range [len_from, len_to).

    When both bounds are equal that single value is returned without
    touching the random number generator.
    '''
    return len_from if len_from == len_to else np.random.randint(len_from, len_to)

def random_batch(input_length_from, input_length_to,
                 output_length_from, output_length_to,
                 seq_length_from, seq_length_to,
                 batch_size):
    '''
    Generate `batch_size` random (input, output) index sequences.

    input_length_from / input_length_to: despite the names, these bound the
        half-open range [from, to) of token indices drawn for the inputs.
    output_length_from / output_length_to: only sanity-checked here; the
        outputs are produced by translating the inputs.
    seq_length_from / seq_length_to: range handed to random_length to pick
        the length of each sequence.

    Returns (input_batch, output_batch), two lists of lists of ints, where
    every output sequence is its input sequence mapped through
    word2digit_translate_byidx.

    Raises ValueError if a `from` bound exceeds its `to` bound.
    Relies on the module-level `np` (numpy) being imported before the call.
    '''
    if (input_length_from > input_length_to or
            output_length_from > output_length_to):
        raise ValueError('length_from > length_to')

    input_batch = []
    for _ in range(batch_size):
        # Length first, then the tokens.
        n_tokens = random_length(seq_length_from, seq_length_to)
        tokens = np.random.randint(low=input_length_from,
                                   high=input_length_to,
                                   size=n_tokens)
        input_batch.append(tokens.tolist())

    output_batch = []
    for input_dat in input_batch:
        output_batch.append([word2digit_translate_byidx[idx] for idx in input_dat])
    return input_batch, output_batch
      
# Example:
# digit_from = 2
# digit_to = 6+1
# word_from = 7
# word_to = 11+1
# seqlen_from = 9
# seqlen_to = 11+1
# a,b = random_batch(word_from,word_to,digit_from,digit_to,seqlen_from,seqlen_to,batch_size=2)
# print a
# [[11, 7, 11, 11, 10, 11, 9, 10, 10, 8, 9], [10, 7, 7, 9, 8, 9, 7, 8, 11]] <- indices of num words
# print b
# [[6, 2, 6, 6, 5, 6, 4, 5, 5, 3, 4], [5, 2, 2, 4, 3, 4, 2, 3, 6]] <- indices of their digit translation
# print [decode_sequence(a_) for a_ in a]
# ['five one five five four five three four four two three', 'four one one three two three one two five']
# print [decode_sequence(b_) for b_ in b]
# ['5 1 5 5 4 5 3 4 4 2 3', '4 1 1 3 2 3 1 2 5']

### MAKING MODEL

In [42]:
import numpy as np
import tensorflow as tf

from tensorflow.contrib.rnn import LSTMCell, LSTMStateTuple

In [69]:
# Graph

# Start from an empty graph so re-running this cell does not pile up ops.
tf.reset_default_graph()
sess = tf.InteractiveSession()

vocab_size = len(vocab)
input_embedding_size = 20

encoder_hidden_units = 20
decoder_hidden_units = encoder_hidden_units * 2 # because encoder is going to be bidirectional.

#                    decoder 
#                    target
# 
# [] -> [] -> [#] -> [] -> []
#                     |    ^
# encoder             |____|
# inputs             

# ---- placeholders -------------------------------------------------------
encoder_inputs = tf.placeholder(shape=(None, None), dtype=tf.int32, name='encoder_inputs') # [max_time, batch_size]
# One entry per batch element: the unpadded length of that input sequence.
# Being a [batch_size] vector, it is unaffected by the time-major layout.
encoder_inputs_length = tf.placeholder(shape=(None,), dtype=tf.int32, name='encoder_inputs_length')
decoder_targets = tf.placeholder(shape=(None, None), dtype=tf.int32, name='decoder_targets')

# ---- embeddings (shared by encoder inputs and decoder feedback) ---------
embeddings = tf.Variable(tf.random_uniform([vocab_size, input_embedding_size], -1.0, 1.0), dtype=tf.float32)
encoder_inputs_embedded = tf.nn.embedding_lookup(embeddings, encoder_inputs) # [max_time, batch_size, input_embedding_size]

# ---- bidirectional encoder ----------------------------------------------
# Use one cell object per direction. Passing the same LSTMCell instance as
# both cell_fw and cell_bw can, depending on the TensorFlow 1.x release,
# either raise a variable-scope reuse error or silently tie the forward and
# backward weights together.
encoder_cell_fw = LSTMCell(encoder_hidden_units)
encoder_cell_bw = LSTMCell(encoder_hidden_units)
encoder_cell = encoder_cell_fw # alias kept so code that refers to `encoder_cell` still works.
((encoder_fw_outputs,encoder_bw_outputs), # each [max_time, batch_size, encoder_hidden_units]
 (encoder_fw_final_state,encoder_bw_final_state)) = ( # LSTMStateTuples: c and h are [batch_size, encoder_hidden_units]
        tf.nn.bidirectional_dynamic_rnn(cell_fw=encoder_cell_fw,
                                        cell_bw=encoder_cell_bw,
                                        inputs=encoder_inputs_embedded,
                                        sequence_length=encoder_inputs_length,
                                        dtype=tf.float32, time_major=True)
    )

# Concatenate forward and backward parts along the feature axis, then pack
# c and h back into one state tuple sized for the (twice as wide) decoder.
encoder_outputs = tf.concat((encoder_fw_outputs, encoder_bw_outputs), 2) # [max_time, batch_size, 2*encoder_hidden_units]
encoder_final_state_c = tf.concat((encoder_fw_final_state.c, encoder_bw_final_state.c), 1) # [batch_size, 2*encoder_hidden_units]
encoder_final_state_h = tf.concat((encoder_fw_final_state.h, encoder_bw_final_state.h), 1)
encoder_final_state = LSTMStateTuple(
    c=encoder_final_state_c,
    h=encoder_final_state_h
)

# ---- decoder set-up -----------------------------------------------------
decoder_cell = LSTMCell(decoder_hidden_units)
# tf.shape gives the run-time shape [max_time, batch_size] as a 1-D tensor;
# unstacking it yields two scalar tensors, so the graph works for any batch.
encoder_max_time, batch_size = tf.unstack(tf.shape(encoder_inputs))
# Decode for 3 more steps than the input length: room for the EOS token
# plus two further steps.
decoder_lengths = encoder_inputs_length + 3

# Output projection, shared by every decoder time step (decoder only):
# [batch_size, decoder_hidden_units] x [decoder_hidden_units, vocab_size]
# -> [batch_size, vocab_size] logits over the vocabulary.
W = tf.Variable(tf.random_uniform([decoder_hidden_units, vocab_size], -1, 1), dtype=tf.float32)
b = tf.Variable(tf.zeros([vocab_size]), dtype=tf.float32)

# prepare tokens for each time step
# tf.ones / tf.zeros produce the indices 1 and 0, i.e. EOS and PAD in word2idx.
eos_time_slice = tf.ones([batch_size], dtype=tf.int32, name='EOS') # [batch_size]
pad_time_slice = tf.zeros([batch_size], dtype=tf.int32, name='PAD')
eos_step_embedded = tf.nn.embedding_lookup(embeddings, eos_time_slice) # [batch_size, input_embedding_size]
pad_step_embedded = tf.nn.embedding_lookup(embeddings, pad_time_slice)

# Loop feed (doc: tf.nn.raw_rnn?)
# (time, prev_cell_output, prev_cell_state, prev_loop_state) ->
# (elem_finished, input, cell_state, output, loop_state)

# handles first state (i.e. corresponds to the last state of the encoder)
#
#     state feed only (enc final state)
#         |
#         v
#      # --> #
#   last     first 
#   of enc   of dec
#            ^
#            |
#           EOS
#
def loop_fn_initial():
    '''
    raw_rnn loop step for the very first decoder call.

    Returns the 5-tuple (elements_finished, next_input, cell_state,
    emit_output, loop_state):
      * elements_finished: [batch_size] boolean vector, `0 >= decoder_lengths`
        (false wherever the decoder length is positive);
      * next_input: the EOS embedding, fed to the first decoder cell;
      * cell_state: the encoder's final state, used to start the decoder;
      * emit_output / loop_state: both None at the initial step.
    '''
    not_started_yet = (0 >= decoder_lengths)
    return (not_started_yet,
            eos_step_embedded,
            encoder_final_state,
            None,
            None)

# handles the transitions in decoder after the first state
#             ___
#  output ->  |  |
#             # -|-> #
#              / |   ^
#         state  |___| <- next_input (inpt)
#
def loop_fn_transition(time, previous_output, previous_state, previous_loop_state):
    '''
    raw_rnn loop step for every decoder call after the first one.

    The next input is the embedding of the token the decoder just predicted
    (arg-max over the projected previous output), or the PAD embedding once
    every sequence in the batch has reached its decoder length.

    Returns (elements_finished, next_input, cell_state, emit_output,
    loop_state):
      * elements_finished: [batch_size] boolean vector, true for the batch
        elements whose decoder length has been reached at `time`;
      * next_input: embedded input for the next cell;
      * cell_state: previous_state, passed through untouched;
      * emit_output: previous_output, passed through untouched;
      * loop_state: always None (previous_loop_state is not used).
    '''
    def embed_greedy_prediction():
        # previous_output x W + b -> [batch_size, vocab] logits,
        # then take the most likely token per batch element and embed it.
        logits = tf.add(tf.matmul(previous_output, W), b)
        predicted_ids = tf.argmax(logits, axis=1)
        return tf.nn.embedding_lookup(embeddings, predicted_ids)

    elements_finished = (time >= decoder_lengths)
    # Scalar boolean: true only when *all* batch elements are finished.
    all_finished = tf.reduce_all(elements_finished)
    next_input = tf.cond(all_finished,
                         lambda: pad_step_embedded,
                         embed_greedy_prediction)
    return (elements_finished,
            next_input,
            previous_state,
            previous_output,
            None)

# combine the two fns above for a single looping function.
def loop_fn(time, previous_output, previous_state, previous_loop_state):
    '''
    Single loop function handed to tf.nn.raw_rnn.

    Dispatches to loop_fn_initial on the first call, recognised by
    previous_state being None, and to loop_fn_transition on every later
    call, forwarding all four arguments unchanged.
    '''
    is_first_call = previous_state is None
    if not is_first_call:
        return loop_fn_transition(time, previous_output, previous_state, previous_loop_state)
    # On the first call there must be no previous output either.
    assert previous_output is None and previous_state is None
    return loop_fn_initial()

# Unroll the decoder with the custom loop function. raw_rnn returns:
#   decoder_outputs_ta:  TensorArray holding the emitted output of each step;
#   decoder_final_state: the LSTM (c, h) state after the last step, not used
#                        again in the cells shown here;
#   _:                   the final loop state, which is None because loop_fn
#                        always returns None for loop_state.
decoder_outputs_ta, decoder_final_state, _ = tf.nn.raw_rnn(decoder_cell, loop_fn) # decoder_cell is an LSTMCell.
decoder_outputs = decoder_outputs_ta.stack() # [max_time, batch_size, decoder_hidden_units]

# Project every decoder output onto the vocabulary. matmul needs a rank-2
# input, so flatten [max_time, batch_size, dim] to [max_time*batch_size, dim],
# apply the shared projection (W, b), and restore the time/batch axes.
decoder_max_step, decoder_batch_size, decoder_dim = tf.unstack(tf.shape(decoder_outputs))
decoder_outputs_flat = tf.reshape(decoder_outputs, (-1, decoder_dim))
decoder_logits_flat = tf.add(tf.matmul(decoder_outputs_flat, W), b)
decoder_logits = tf.reshape(decoder_logits_flat, (decoder_max_step, decoder_batch_size, vocab_size))
# Most likely vocab index at each step for each batch element.
decoder_prediction = tf.argmax(decoder_logits, 2) # [max_time, batch_size]

# Optimization
# Cross entropy between one-hot targets and logits, one value per
# (time step, batch element).
stepwise_cross_entropy = tf.nn.softmax_cross_entropy_with_logits(
    labels=tf.one_hot(decoder_targets, depth=vocab_size, dtype=tf.float32),
    logits=decoder_logits
)
# NOTE(review): the mean runs over every position, PAD targets included;
# no masking is applied.
loss = tf.reduce_mean(stepwise_cross_entropy)
train_op = tf.train.AdamOptimizer().minimize(loss) # Adam with default hyper-parameters.

# Initialise all variables in the interactive session created above.
init = tf.global_variables_initializer()
sess.run(init)

In [44]:
# Half-open [from, to) ranges handed to random_batch.
digit_from, digit_to = 2, 6 + 1      # vocab indices of the digit tokens '1'..'5'
word_from, word_to = 7, 11 + 1       # vocab indices of the words 'one'..'five'
seqlen_from, seqlen_to = 3, 8        # range for the random sequence lengths

# Number of sequences per training batch.
# NOTE: this rebinds the Python name `batch_size`, which the graph cell
# also uses for a tensor.
batch_size = 10

def next_feed(batch_size):
    '''
    Build the feed dict for one training step.

    Draws a fresh random batch of `batch_size` (number-word, digit) index
    sequences, batches both sides with utils.batch, and appends EOS plus
    two PADs to every target sequence.

    Returns a dict keyed by the encoder_inputs, encoder_inputs_length and
    decoder_targets placeholders.
    '''
    source_seqs, target_seqs = random_batch(word_from, word_to,
                                            digit_from, digit_to,
                                            seqlen_from, seqlen_to,
                                            batch_size)
    enc_batch, enc_lengths = utils.batch(source_seqs)

    # Same tail for every target: EOS followed by two PADs.
    target_tail = [word2idx['EOS']] + [word2idx['PAD']] * 2
    dec_batch, _ = utils.batch([seq + target_tail for seq in target_seqs])

    feed = {}
    feed[encoder_inputs] = enc_batch
    feed[encoder_inputs_length] = enc_lengths
    feed[decoder_targets] = dec_batch
    return feed

# Loss of every training step, in order.
loss_track = []

max_batches = 3001        # total number of training steps
batches_in_epoch = 1000   # report progress every this many steps

try:
    for batch in range(max_batches):
        fd = next_feed(batch_size)
        _, l = sess.run([train_op, loss], fd)
        loss_track.append(l)

        # Progress report; step 0 is included because 0 % n == 0.
        if batch % batches_in_epoch == 0:
            # One session call fetches both the post-update loss and the
            # predictions for the batch that was just trained on.
            report_loss, predict_ = sess.run([loss, decoder_prediction], fd)
            print('batch {}'.format(batch))
            print('  minibatch loss: {}'.format(report_loss))
            # Arrays are time-major; transpose to iterate per sequence and
            # show only the first three samples.
            for i, (inp, pred) in enumerate(zip(fd[encoder_inputs].T[:3], predict_.T[:3])):
                print('  sample {}:'.format(i + 1))
                print('    input     > {}'.format(decode_sequence(inp)))
                print('    predicted > {}'.format(decode_sequence(pred)))
            # Blank separator line. A bare `print` only does this on
            # Python 2; on Python 3 it is a no-op expression.
            print('')
except KeyboardInterrupt:
    # Allow stopping the cell by hand without a traceback.
    print('training interrupted')

batch 0
  minibatch loss: 2.41256952286
  sample 1:
    input     > two two five four PAD PAD PAD
    predicted > EOS EOS EOS EOS five five five PAD PAD PAD
  sample 2:
    input     > one three five one five five PAD
    predicted > 5 5 4 4 4 4 4 4 4 PAD
  sample 3:
    input     > five three two one one one two
    predicted > EOS EOS EOS EOS five five five five five five

batch 1000
  minibatch loss: 0.519072413445
  sample 1:
    input     > one three five PAD PAD PAD PAD
    predicted > 1 3 5 EOS PAD PAD PAD PAD PAD PAD
  sample 2:
    input     > one five two four two five three
    predicted > 1 5 2 2 2 5 3 EOS PAD PAD
  sample 3:
    input     > four four two one five one four
    predicted > 4 4 2 1 5 4 4 EOS PAD PAD

batch 2000
  minibatch loss: 0.198304265738
  sample 1:
    input     > five three five five PAD PAD PAD
    predicted > 5 3 5 5 EOS PAD PAD PAD PAD PAD
  sample 2:
    input     > two two two four one three two
    predicted > 2 2 4 2 1 3 2 EOS PAD PAD
  sample 

In [70]:
# # Experiment block

# tf.reset_default_graph()
# sess = tf.InteractiveSession()

# a = tf.placeholder(tf.int32, shape=(None,None))
# # b = tf.one_hot(a, depth=7)
# # b = tf.unstack(tf.shape(a), axis=0)
# # c = tf.unstack(tf.shape(a), axis=1)
# # c = tf.shape(a)
# b = tf.stack(a)
# c = tf.unstack(b)

# a_ = np.array([[1,2,3],[4,5,6]])
# sess.run(tf.global_variables_initializer())
# # t1 = sess.run(b, feed_dict={a:a_})
# # print t1, type(t1)
# # t2,t3 = sess.run(c, feed_dict={a:a_})
# # print t2,t3
# # print t2, type(t2)
# print sess.run(b, feed_dict={a:a_})
# print sess.run(c, feed_dict={a:a_})