### Task: translating number words into numbers (e.g. one two three -> 1 2 3)

* Base seq2seq with training wheels on, i.e. the decoder is manually fed the expected output sequence (e.g. 1 2 3) as its input and must reproduce it (e.g. 1 2 3). Used only to test the architecture.

In [None]:
### Pang2 Bai2

# First-step enc-dec model
#
#                     decoder 
#                     target
#
#  [] -> [] -> [#] -> [] -> []
#
#  encoder            decoder
#  inputs             inputs
#
# Next step: decoder_input = f(encoder_last_state) or f(prev_decoder_output)

In [2]:
# Add custom import path

import sys
sys.path.insert(0, '/home/jacobsuwang/Documents/UTA2018/NEURAL-NETS/ATTENTION/CODE/01-import-folder')

### MAKING DATA

In [3]:
import utils

In [111]:
# Token inventory: two special symbols (PAD, EOS), the digit tokens and
# the number words.
vocab = {'PAD', 'EOS', '1', '2', '3', '4', '5',
         'one', 'two', 'three', 'four', 'five'}

# Token -> integer id.  PAD is id 0 and EOS is id 1; digits occupy 2..6 and
# number words occupy 7..11.
word2idx = {'PAD': 0, 'EOS': 1, '1': 2, '2': 3, '3': 4, '4': 5, '5': 6,
            'one': 7, 'two': 8, 'three': 9, 'four': 10, 'five': 11}

# Inverse lookup, id -> token.  `.items()` is used instead of the
# Python-2-only `.iteritems()` so this line runs on Python 2 and Python 3.
idx2word = {idx: word for word, idx in word2idx.items()}

# Number word -> digit token (the translation the model has to learn).
word2digit_translate = {'one': '1', 'two': '2', 'three': '3',
                        'four': '4', 'five': '5'}

# The same translation expressed on ids (word id -> digit id).  Derived from
# the two tables above so the three mappings cannot drift apart; the result
# is {7: 2, 8: 3, 9: 4, 10: 5, 11: 6}.
word2digit_translate_byidx = {word2idx[word]: word2idx[digit]
                              for word, digit in word2digit_translate.items()}

def code_sequence(s):
    '''
    Encode a whitespace-separated sentence as a list of vocabulary ids.

    Each token is looked up in word2idx, so a token that is not a key of
    word2idx raises KeyError.
    '''
    return [word2idx[token] for token in s.split()]

def decode_sequence(l):
    '''
    Map an iterable of vocabulary ids back to their tokens (via idx2word)
    and return them joined by single spaces.
    '''
    tokens = (idx2word[token_id] for token_id in l)
    return ' '.join(tokens)

def encode(data):
    '''
    Apply code_sequence to every sentence in `data`.

    Returns a list holding one id list per sentence, in input order.
    '''
    return list(map(code_sequence, data))

def to_readable(batch):
    '''
    Convert a time-major batch of ids into readable sentences.

    The batch is transposed so that each row is one sequence, and every
    row is then decoded with decode_sequence.  Returns a list of strings.
    '''
    # .T turns [max_time, batch_size] into [batch_size, max_time].
    return [decode_sequence(sequence) for sequence in batch.T]

# To transform data (i.e. list of sentences as wordlists) into
# input data, feed it to utils.batch
# sample results:
# utils.batch([code_sequence('1 2 3')])
# (array([[2],
#         [3],
#         [4]], dtype=int32), [3])
# that is, a tuple (time-major array of shape [max_time, batch_size], sequence lengths)
# so with a batch of two, e.g. utils.batch(encode(['1 2 3', '4 5 5'])), we get
# (array([[2, 5],
#         [3, 6],
#         [4, 6]], dtype=int32), [3, 3])

In [136]:
# Data generator

def random_length(len_from, len_to):
    '''
    Pick a sequence length.

    When both bounds are equal that value is returned as is; otherwise the
    length is drawn with np.random.randint(len_from, len_to), whose upper
    bound is exclusive.
    '''
    if len_from != len_to:
        return np.random.randint(len_from, len_to)
    return len_from

def random_batch(input_length_from, input_length_to,
                 output_length_from, output_length_to,
                 seq_length_from, seq_length_to,
                 batch_size):
    '''
    Generate a random batch of (input ids, output ids) sequence pairs.

    input_length_from / input_length_to: despite the names, these are the
        low / high bounds handed to np.random.randint for the *ids* drawn
        in each input sequence (high is exclusive).
    output_length_from / output_length_to: only checked for ordering here;
        the output ids come from word2digit_translate_byidx instead.
    seq_length_from / seq_length_to: bounds passed to random_length to pick
        the length of each individual sequence.
    batch_size: number of sequence pairs to generate.

    Returns (input_batch, output_batch), two lists of id lists where each
    output sequence is the id-wise translation of its input sequence.
    Raises ValueError when an input or output "from" bound exceeds its
    "to" bound.
    '''
    input_range_reversed = input_length_from > input_length_to
    output_range_reversed = output_length_from > output_length_to
    if input_range_reversed or output_range_reversed:
        raise ValueError('length_from > length_to')

    input_batch = []
    for _ in range(batch_size):
        # Choose the length first, then draw that many ids.
        length = random_length(seq_length_from, seq_length_to)
        drawn = np.random.randint(low=input_length_from,
                                  high=input_length_to,
                                  size=length)
        input_batch.append(drawn.tolist())

    output_batch = [[word2digit_translate_byidx[word_id] for word_id in sequence]
                    for sequence in input_batch]
    return input_batch, output_batch
      
# Example:
# digit_from = 2
# digit_to = 6+1
# word_from = 7
# word_to = 11+1
# seqlen_from = 9
# seqlen_to = 11+1
# a,b = random_batch(word_from,word_to,digit_from,digit_to,seqlen_from,seqlen_to,batch_size=2)
# print a
# [[11, 7, 11, 11, 10, 11, 9, 10, 10, 8, 9], [10, 7, 7, 9, 8, 9, 7, 8, 11]]
# print b
# [[6, 2, 6, 6, 5, 6, 4, 5, 5, 3, 4], [5, 2, 2, 4, 3, 4, 2, 3, 6]]
# print [decode_sequence(a_) for a_ in a]
# ['five one five five four five three four four two three', 'four one one three two three one two five']
# print [decode_sequence(b_) for b_ in b]
# ['5 1 5 5 4 5 3 4 4 2 3', '4 1 1 3 2 3 1 2 5']

### MAKING MODEL

In [13]:
import numpy as np
import tensorflow as tf

In [69]:
# Graph

# Build the whole model in a fresh default graph and open an interactive
# session that the later cells reuse through the global `sess`.
tf.reset_default_graph()
sess = tf.InteractiveSession()

vocab_size = len(vocab)
input_embedding_size = 20

# Encoder and decoder use the same hidden size: the encoder's final state is
# passed below as the decoder's initial state.
encoder_hidden_units = decoder_hidden_units = 20

#                    decoder
#                    target
#
# [] -> [] -> [#] -> [] -> []
#
# encoder            decoder
# inputs             inputs

# All three placeholders hold int32 token ids with both dimensions left
# dynamic; the RNNs below run with time_major=True, i.e. [max_time, batch_size].
encoder_inputs = tf.placeholder(shape=(None, None), dtype=tf.int32, name='encoder_inputs') # [max_time, batch_size]
decoder_targets = tf.placeholder(shape=(None, None), dtype=tf.int32, name='decoder_targets')
decoder_inputs = tf.placeholder(shape=(None, None), dtype=tf.int32, name='decoder_inputs')

# A single embedding matrix, initialised uniformly in [-1, 1), is shared by
# the encoder inputs and the decoder inputs.
embeddings = tf.Variable(tf.random_uniform([vocab_size, input_embedding_size], -1.0, 1.0), dtype=tf.float32)
encoder_inputs_embedded = tf.nn.embedding_lookup(embeddings, encoder_inputs) # [max_time, batch_size, emb_size]
decoder_inputs_embedded = tf.nn.embedding_lookup(embeddings, decoder_inputs)

encoder_cell = tf.contrib.rnn.LSTMCell(encoder_hidden_units)
# NOTE(review): no sequence_length is passed, so the encoder also steps over
# any padded positions before producing its final state -- confirm intended.
encoder_outputs, encoder_final_state = tf.nn.dynamic_rnn(
    encoder_cell, encoder_inputs_embedded,
    dtype=tf.float32, time_major=True
) # encoder_outputs is not used anywhere below;
  # only encoder_final_state is, as the decoder's initial state
  # (h_enc^F in the diagram above).
decoder_cell = tf.contrib.rnn.LSTMCell(decoder_hidden_units)
# The separate variable scope keeps the decoder RNN's variables apart from
# the encoder's.
decoder_outputs, decoder_final_state = tf.nn.dynamic_rnn(
    decoder_cell, decoder_inputs_embedded,
    initial_state=encoder_final_state,
    dtype=tf.float32, time_major=True, scope='plain_decoder'
) # decoder_outputs: [max_time, batch_size, decoder_hidden_size]
decoder_logits = tf.contrib.layers.linear(decoder_outputs, vocab_size)
    # Linear projection that produces the softmax logits:
    # [*,*,decoder_hidden_size] -> [*,*,vocab_size]
# Greedy prediction: index of the largest logit along the vocabulary axis.
decoder_prediction = tf.argmax(decoder_logits, 2)

stepwise_cross_entropy = tf.nn.softmax_cross_entropy_with_logits(
    labels=tf.one_hot(decoder_targets, depth=vocab_size, dtype=tf.float32),
    logits=decoder_logits
) # tf.one_hot(indices, depth): indices is the id matrix (e.g. shape [2,3]),
  # depth is the length of each one-hot vector (the vocab size here).
  # Resulting cross-entropy shape: [max_time, batch_size], i.e. one value
  # per target position (per word in this case).
# NOTE(review): the mean runs over every position with no mask, so padded
# target positions contribute to the loss as well -- confirm intended.
loss = tf.reduce_mean(stepwise_cross_entropy)
train_op = tf.train.AdamOptimizer().minimize(loss)

# Initialise all variables (including the optimizer's) in the session.
init = tf.global_variables_initializer()
sess.run(init)

In [79]:
# Make some test input

# Untrained forward pass on three hand-written examples, to check that the
# graph wiring and tensor shapes are right.
input_batch, _ = utils.batch(encode(['one two three', 'three five four', 'one three one']))
output_batch, _ = utils.batch(encode(['1 2 3', '3 5 4', '1 3 1']))
# _ is seqlen, which we don't use yet

# Example (time-major conversion):
# input_data = ['one two three', 'three five four', 'one three one']
# output_data = ['1 2 3', '3 5 4', '1 3 1']
# inputs = encode(input_data)
# outputs = encode(output_data)
# print(inputs)
# print(outputs)
# [[7, 8, 9], [9, 11, 10], [7, 9, 7]]
# [[2, 3, 4], [4, 6, 5], [2, 4, 2]]
# utils.batch(inputs=inputs)
# (array([[ 7,  9,  7],
#         [ 8, 11,  9],
#         [ 9, 10,  7]], dtype=int32), [3, 3, 3])

# Each print() receives exactly one pre-built string, so the statement parses
# and prints the same text under Python 2 and Python 3.  The two spaces after
# 'readable:' reproduce what the former `print 'readable: ', x` emitted.
print('batch_encoded:\n' + str(input_batch))
print('')
print('readable:  ' + str(to_readable(input_batch)))
print('')
print('decoder_inputs:\n' + str(output_batch))
print('')
print('readable:  ' + str(to_readable(output_batch)))
print('')

# Training wheels: the decoder is fed the expected output ids directly.
pred_ = sess.run(decoder_prediction, feed_dict={encoder_inputs: input_batch,
                                                decoder_inputs: output_batch})
print('decoder_predictions:\n' + str(pred_))
print('')
print('readable:  ' + str(to_readable(pred_)))

batch_encoded:
[[ 7  9  7]
 [ 8 11  9]
 [ 9 10  7]]

readable:  ['one two three', 'three five four', 'one three one']

decoder_inputs:
[[2 4 2]
 [3 6 4]
 [4 5 2]]

readable:  ['1 2 3', '3 5 4', '1 3 1']

decoder_predictions:
[[ 5  5  6]
 [10 11  5]
 [ 5  8  5]]

readable:  ['4 four 4', '4 five two', '5 4 4']


In [152]:
# Bounds handed to random_batch by next_feed.  The word and seqlen pairs end
# up as (low, high) arguments of np.random.randint, whose high is exclusive.
digit_from, digit_to = 2, 6 + 1      # ids of the digit tokens '1'..'5'
word_from, word_to = 7, 11 + 1       # ids of the word tokens 'one'..'five'
seqlen_from, seqlen_to = 3, 8        # per-sequence length bounds

batch_size = 10

def next_feed(batch_size):
    '''
    Draw a fresh random batch and package it as a feed_dict.

    The decoder targets are the digit sequences with EOS appended; the
    decoder inputs are the same sequences with EOS prepended, i.e. the
    targets shifted right by one step.  Each list of sequences is passed
    through utils.batch, and only the first element of its result is kept.

    Returns a dict keyed by the encoder_inputs, decoder_inputs and
    decoder_targets placeholders.
    '''
    eos = word2idx['EOS']
    word_seqs, digit_seqs = random_batch(word_from, word_to,
                                         digit_from, digit_to,
                                         seqlen_from, seqlen_to,
                                         batch_size)
    targets = [seq + [eos] for seq in digit_seqs]
    shifted_inputs = [[eos] + seq for seq in digit_seqs]

    enc_array, _ = utils.batch(word_seqs)
    target_array, _ = utils.batch(targets)
    dec_array, _ = utils.batch(shifted_inputs)

    feed = {}
    feed[encoder_inputs] = enc_array
    feed[decoder_inputs] = dec_array
    feed[decoder_targets] = target_array
    return feed

# Loss of every training step, in order (kept for later inspection).
loss_track = []

max_batches = 3001
batches_in_epoch = 1000   # reporting interval, in batches

# Train on freshly generated random batches.  Every `batches_in_epoch`
# batches (batch 0 included, since 0 % n == 0) report the loss and the first
# three predictions.  Ctrl-C / kernel interrupt stops training cleanly.
try:
    for batch in range(max_batches):
        fd = next_feed(batch_size)
        _, l = sess.run([train_op, loss], fd)
        loss_track.append(l)

        if batch % batches_in_epoch == 0:
            print('batch {}'.format(batch))
            # One extra forward pass on the same feed, after the update,
            # yields both the reported loss and the predictions.
            reported_loss, predict_ = sess.run([loss, decoder_prediction], fd)
            print('  minibatch loss: {}'.format(reported_loss))
            # Transpose time-major -> batch-major and show three samples.
            shown_inputs = fd[encoder_inputs].T[:3]
            shown_predictions = predict_.T[:3]
            for i, (inp, pred) in enumerate(zip(shown_inputs, shown_predictions), 1):
                print('  sample {}:'.format(i))
                print('    input     > {}'.format(decode_sequence(inp)))
                print('    predicted > {}'.format(decode_sequence(pred)))
            # print('') emits a blank line on Python 2 and 3; a bare print()
            # is an empty tuple under Python 2 and shows up as "()".
            print('')
except KeyboardInterrupt:
    print('training interrupted')

batch 0
  minibatch loss: 0.112800441682
  sample 1:
    input     > two four four one PAD PAD PAD
    predicted > 4 4 4 1 EOS PAD PAD PAD
  sample 2:
    input     > one two two five four one PAD
    predicted > 1 2 2 5 4 1 EOS PAD
  sample 3:
    input     > one three five four PAD PAD PAD
    predicted > 1 3 5 4 EOS PAD PAD PAD
()
batch 1000
  minibatch loss: 0.105462908745
  sample 1:
    input     > four five four five PAD PAD PAD
    predicted > 4 5 4 5 EOS PAD PAD PAD
  sample 2:
    input     > two five four four three one PAD
    predicted > 2 5 4 4 3 1 EOS PAD
  sample 3:
    input     > one four five PAD PAD PAD PAD
    predicted > 1 4 5 EOS PAD PAD PAD PAD
()
batch 2000
  minibatch loss: 0.0850664749742
  sample 1:
    input     > four five five four three four PAD
    predicted > 4 5 5 4 3 4 EOS PAD
  sample 2:
    input     > one two four two PAD PAD PAD
    predicted > 1 2 4 2 EOS PAD PAD PAD
  sample 3:
    input     > two four four one three two PAD
    predicted > 4 4

In [53]:
# # Experiment block

# tf.reset_default_graph()
# sess = tf.InteractiveSession()

# a = tf.placeholder(tf.int32, shape=(None,None))
# b = tf.one_hot(a, depth=7)

# a_ = np.array([[1,2,3],[4,5,6]])
# sess.run(tf.global_variables_initializer())
# print sess.run(b, feed_dict={a:a_})

[[[ 0.  1.  0.  0.  0.  0.  0.]
  [ 0.  0.  1.  0.  0.  0.  0.]
  [ 0.  0.  0.  1.  0.  0.  0.]]

 [[ 0.  0.  0.  0.  1.  0.  0.]
  [ 0.  0.  0.  0.  0.  1.  0.]
  [ 0.  0.  0.  0.  0.  0.  1.]]]
