In [1]:
# Add custom import path

import sys
sys.path.insert(0, '/home/jacobsuwang/Documents/UTA2018/NEURAL-NETS/ATTENTION/CODE/01-import-folder')

### LOAD DATA

In [2]:
import utils
import dill
import random
import numpy as np

bl08_path = '/home/jacobsuwang/Documents/UTA2018/NEURAL-NETS/ATTENTION/DATA/COHERENCE/data1-train-encoded-data.p'

data_dict = dill.load(open(bl08_path, 'rb'))

print data_dict['readme']


README

@ 'tar-order':targets_orders

List of sentence order lists; each list = integers indexing permuted sentences in the doc.

@ 'inp-encode':inputs_encoded

List of documents; each doc = a list of sentences; each sent = a list of word indices.

@ 'inp-slen':inputs_sent_lengths

List of length info of documents; each info = a list of sentence lengths.

@ 'w-indexer':word_indexer

Indexer() class. word <-> word index.

@ 'idx-emb':idx2emb

dict() class. index <-> glove embeddings.

@ 'glove-init':glove_init

Initializer of embedding matrix. 



In [3]:
# Padding inputs

MAX_SENT = 61
MAX_DOC = 26
MAX_SENT, MAX_DOC = 0, 0
for doc in data_dict['inp-encode']:
    if len(doc)>MAX_DOC:
        MAX_DOC = len(doc)
    for sent in doc:
        if len(sent)>MAX_SENT:
            MAX_SENT = len(sent)
print 'MAX_DOC: {} | MAX_SENT: {}'.format(MAX_DOC, MAX_SENT)

def pad_doc(doc):
    padded_doc = []
    doc_length = len(doc)
    for sent in doc:
        padded_sent = sent + [0]*(MAX_SENT-len(sent))
        padded_doc.append(padded_sent)
    padded_doc += [[0]*MAX_SENT]*(MAX_DOC-doc_length)
    return padded_doc, doc_length

def pad_sents_length(slen):
    return slen + [0]*(MAX_DOC-len(slen))

def pad_tars_order(tord):
    return tord + [0]*3 # +1 EOS, +2 PADs

MAX_DOC: 26 | MAX_SENT: 61


### MAKE MODEL

In [5]:
import tensorflow as tf
from tensorflow.contrib.rnn import LSTMCell, LSTMStateTuple

In [13]:
tf.reset_default_graph()
sess = tf.InteractiveSession()

## CONFIGS ##

sent_emb_size = 20
doc_enc_emb_size = sent_emb_size*2
doc_dec_emb_size = doc_enc_emb_size*2
    # the model is sent-enc (bilstm) -> doc-enc (bilstm) -> doc-dec (unilstm)
    # so the #dim doubles twice through the two bilstms.

sent_vocab_size = MAX_DOC + 2
    # #sents in longest doc + PAD + EOS.
    # no known better solution, because the final weight matrix 
    # W = [dec-emb, pred-dim] must be fixed.
    
lr = 1e-5
    
## INPUT PORT ##
    
doc_inputs = tf.placeholder(tf.int32, shape=[MAX_DOC, MAX_SENT], name='doc-inputs')
    # 1 document padded to the longest of docs and the longest of sents.
    # NB: here it is treated as [batch_size=MAX_DOC, max_time=MAX_SENT].
doc_sents_length = tf.placeholder(tf.int32, shape=[MAX_DOC], name='doc-sents-length')
    # lengths of sentences in the doc.

embeddings = tf.get_variable('word-embeddings', shape=data_dict['glove-init'].shape, # (1809,300)
                             initializer=tf.constant_initializer())
glove_feed = tf.placeholder(tf.float32, shape=data_dict['glove-init'].shape)
glove_init = embeddings.assign(glove_feed)
    # above initializes the embedding matrix [vocab_size, emb_size]
    # with pretrained 300D GloVe vectors.
doc_inputs_embedded = tf.transpose(tf.nn.embedding_lookup(embeddings, doc_inputs),[1,0,2]) 
    # transpose to [max_time, batch_size, emb_size] for time-major.

## SENTENCE ENCODER ##
    
with tf.variable_scope('sent-enc'):
    sent_enc_cell = LSTMCell(sent_emb_size)
    ((sent_enc_fw_outputs,sent_enc_bw_outputs), 
     (sent_enc_fw_final_state,sent_enc_bw_final_state)) = ( 
            tf.nn.bidirectional_dynamic_rnn(cell_fw=sent_enc_cell,
                                            cell_bw=sent_enc_cell,
                                            inputs=doc_inputs_embedded,
                                            sequence_length=doc_sents_length,
                                            dtype=tf.float32, time_major=True)
        )
    sent_enc_outputs = tf.concat((sent_enc_fw_outputs,sent_enc_bw_outputs), 2)
        # [max_time, batch_size, emb_size].
    sent_enc_final_state_c = tf.concat((sent_enc_fw_final_state.c,sent_enc_bw_final_state.c), 1)
    sent_enc_final_state_h = tf.concat((sent_enc_fw_final_state.c,sent_enc_bw_final_state.h), 1)
    sent_enc_final_state = LSTMStateTuple(
        c=sent_enc_final_state_c, # both c & h are [batch_size, emb_size].
        h=sent_enc_final_state_h  # basically a list of sentence embeddings of the doc.
    )    

doc_final_embedded = tf.expand_dims(sent_enc_final_state.h, 1)
    # now the original "batch_size" is reinterpreted as the max_time/length 
    # of the document (which maxes at MAX_DOC).
    # thus, we add a batch dimension at dim-1 to maintain the
    # [max_time, batch_size, emb_size] time-major form for convenience of processing.
doc_length = tf.placeholder(tf.int32, shape=[1], name='doc-length')
    # 1 scalar that will be fed in as [*], document length.
doc_targets = tf.placeholder(tf.int32, shape=[None, 1], name='doc-targets') 
    # the correct order indices for the jumbled document.
    # [max_time=#sents in doc, batch_size=1]

## DOCUMENT ENCODER ##

with tf.variable_scope('doc-enc'):
    doc_enc_cell = LSTMCell(doc_enc_emb_size)
    ((doc_enc_fw_outputs,doc_enc_bw_outputs), 
     (doc_enc_fw_final_state,doc_enc_bw_final_state)) = ( 
            tf.nn.bidirectional_dynamic_rnn(cell_fw=doc_enc_cell,
                                            cell_bw=doc_enc_cell,
                                            inputs=doc_final_embedded,
                                            sequence_length=doc_length,
                                            dtype=tf.float32, time_major=True)
        )    
    doc_enc_outputs = tf.concat((doc_enc_fw_outputs,doc_enc_bw_outputs), 2) 
        # [max_time, batch_size, emb_size].
    doc_enc_final_state_c = tf.concat((doc_enc_fw_final_state.c,doc_enc_bw_final_state.c), 1)
    doc_enc_final_state_h = tf.concat((doc_enc_fw_final_state.c,doc_enc_bw_final_state.h), 1)
    doc_enc_final_state = LSTMStateTuple(
        c=doc_enc_final_state_c, # [batch_size, emb_size], where batch_size=1 always
        h=doc_enc_final_state_h  # as only 1 document gets entered per time.
    )
    
## DOCUMENT DECODER ##
    
doc_dec_cell = LSTMCell(doc_dec_emb_size)
doc_max_time, doc_batch_size, _ = tf.unstack(tf.shape(doc_final_embedded))
    # dynamically passing these dimensions from enc to dec.
doc_dec_length = doc_length + 3 
    # MAX_DOC +2 steps, +1 for EOS. 

W = tf.get_variable('W', [doc_dec_emb_size, sent_vocab_size], dtype=tf.float32,
                    initializer=tf.contrib.layers.xavier_initializer())
b = tf.get_variable('b', [sent_vocab_size], dtype=tf.float32,
                    initializer=tf.contrib.layers.xavier_initializer())
    # the final softmax layer for the decoder,
    # softmanx(W*out + b)

eos_step_embedded = tf.ones([doc_batch_size, doc_enc_emb_size], dtype=tf.float32, name='EOS')
pad_step_embedded = tf.zeros([doc_batch_size, doc_enc_emb_size], dtype=tf.float32, name='PAD')
    # doc_batch_size=1, doc_enc_emb_size=(unilstm's dim in doc encoder).
    # NB: even batch_size=1, must use this dynamic control for TF to interpret.

def loop_fn_initial():
    initial_elements_finished = (0 >= doc_dec_length) 
        # false along batch_size, i.e. none is done at this (init) step.
    initial_input = eos_step_embedded                   
    initial_cell_state = doc_enc_final_state
    initial_cell_output = None 
    initial_loop_state = None 
    return (initial_elements_finished,
            initial_input,
            initial_cell_state,
            initial_cell_output,
            initial_loop_state)
 
W1 = tf.get_variable('W1', [doc_enc_emb_size, doc_enc_emb_size], dtype=tf.float32,
                     initializer=tf.contrib.layers.xavier_initializer())
W2 = tf.get_variable('W2', [doc_dec_emb_size, doc_enc_emb_size], dtype=tf.float32,
                     initializer=tf.contrib.layers.xavier_initializer())
v = tf.get_variable('v', [doc_enc_emb_size, 1], dtype=tf.float32,
                    initializer=tf.contrib.layers.xavier_initializer())
    # raw weights for computing "the attention of decoding step i on the
    # encoding step j".
    # u_j^i = v^T tanh(W1*e_j + W2*d_i) Vinyals et al. (2016, Section 2.3).

def loop_fn_transition(time, previous_output, previous_state, previous_loop_state):
    
    ## ATTENTION ##
    
    def get_next_input(): 
        mt, bc, _ = tf.unstack(tf.shape(doc_final_embedded))
            # dynamically manage max_time and batch_size.
        EW1 = tf.reshape(tf.tensordot(doc_final_embedded, W1, axes=[[2],[0]]),
                         [mt, bc, doc_enc_emb_size]) 
            # [mt,bc,enc_emb] * [enc_emb,enc_emb] = [mt,bc,enc_emb]
            # batch computing W1*e_j above.
        DW2 = tf.matmul(previous_state.h, W2)
            # [bc,dec_emb] * [dec_emb,enc_emb] = [bc,enc_emb]
            # batch computing W2*d_i.
        EW1_add_DW2 = tf.add(EW1, DW2)
            # [mt,bc,enc_emb] + [bc,enc_emb] = [mt,bc,enc_emb]
            # the identity in the last two dimensions invokes broadcasting.
        attention_mat = tf.reshape(tf.nn.tanh(tf.squeeze(tf.tensordot(EW1_add_DW2, v, axes=[[2],[0]]), 
                                                         axis=2)), [mt,bc])
            # op1. EW1_add_DW2 * v
            #   [mt,bc,enc_emb] * [enc_emb, 1] = [mt,bc,1]
            #   the vacuous 1 dim of v must be there to enable tensordot.
            # op2. squeeze(res-op1)
            #   remove dim-2, which = 1.
            #   squeeze([mt,bc,1], 2) = [mt,bc]
            # op3. reshape(res-op2)
            #   explicitly specify the [mt,bc] shape for easy interpretation.
            #   NB: mt,bc must be dynamically extracted dims (using unstack(shape))!
        attention_norm_mat = tf.nn.softmax(attention_mat, dim=0) 
            # softmax along the max_time axis (#sentences in doc), i.e.
            # the attention weights of all sentences sum up to 1.
        selector = tf.one_hot(tf.argmax(attention_norm_mat, axis=0), depth=doc_max_time,
                              on_value=1.0, off_value=0.0, axis=0) 
            # an [mt,bc] shaped array of one-hot vectors (i.e. a matrix).
            # it picks out the argmax sentence embeddings for feeding to
            # the next time step.
        inputs_embedded_selected = tf.transpose(
            tf.multiply(
                tf.transpose(doc_final_embedded, [2,0,1]), 
                selector), 
            [1,2,0]
        ) 
            # op1. transpose([mt,bc,enc_emb], [2,0,1])
            #   [mt,bc,enc_emb] -> [enc_emb,mt,bc]
            #   to allow for broadcast in below.
            # op2. multiply([enc_emb,mt,bc], [mt,bc])
            #   broadcast to 0-ify the non-max sentence embeddings.
            # op3. transpose([enc_emb,mt,bc], [1,2,0])
            #   [enc_emb,mt,bc] -> [mt,bc,enc_emb]
            #   return to time-major.
        inputs_embedded_selected = tf.reduce_sum(
            tf.reshape(inputs_embedded_selected, [mt, bc, doc_enc_emb_size]), 
            axis=0 
        ) 
            # op1. reshape
            #   explicitly specify the [mt,bc,enc_emb] shape for easy interpretation.
            # op2. reduce_sum(res-op1)
            #   collapse the tensor along max_time to leave behind only
            #   the max sentence embedding (the other embs are 0-fied now,
            #   so nothing changes except for shape).
        next_input = inputs_embedded_selected
            # feed the resulting max sentence embedding to the next time step.
        return next_input
    
    elements_finished = (time >= doc_dec_length)
        # if time >= doc_dec_length, it means we are done processing
        # all the sentences in the doc.
    finished = tf.reduce_all(elements_finished) 
        # when finished, this should be true.
        # e.g. if x=[T,F,T], reduce_all(x)=F,
        #      if x=[T,T,T], reduce_all(x)=T.
        # the reduction is effectively along the batch dim 
        # (but configurable to be restricted to a selected dimension, or all).
    inpt = tf.cond(finished, lambda: pad_step_embedded, get_next_input)
        # cond(bool, pos-res, neg-res)
        # if finished=True, return pos-res, i.e. ending state (PAD).
        # otherwise return neg-res, i.e. the next input.
    state = previous_state
    output = previous_output
    loop_state = None
    return (elements_finished,
            inpt, 
            state,
            output,
            loop_state)
        # returns:
        # elements_finished: a [batch_size] boolean vector.
        # inpt: [batch_size, emb_size] tensor for the next time step.
        # state: (c,h) tuple, raw_rnn takes care of it.
        # output: stored [batch_size, emb_size] tensor.
        # loop_state: rnn_raw takes care of it.

def loop_fn(time, previous_output, previous_state, previous_loop_state):
        # time: an int32 scalar raw_rnn uses to keep track of time-steps internally.
        # previous_output: [max_time, batch_size, emb_size] tensor.
        # previous_state: (c,h) tuple.
        # previous_loop_state: raw_rnn uses to keep track of where it is in the loop (automatic).
    if previous_state is None:
        assert previous_output is None and previous_state is None
        return loop_fn_initial()
    else:
        return loop_fn_transition(time, previous_output, previous_state, previous_loop_state)
    
## PREDICTION ##

doc_dec_outputs_ta, doc_dec_final_state, _ = tf.nn.raw_rnn(doc_dec_cell, loop_fn)
doc_dec_outputs = doc_dec_outputs_ta.stack()
    # doc_dec_outputs = [max_time=#sents in doc, batch_size=1, dec_emb_size]
    # gather outputs (dynamic TensorArray to Tensor).
doc_dec_max_step, doc_dec_batch_size, doc_dec_dim = tf.unstack(tf.shape(doc_dec_outputs))
    # dynamically extract the mt,bc,emb of the particular data entry.
doc_dec_outputs_flat = tf.reshape(doc_dec_outputs, (-1, doc_dec_dim))
    # reshape to [mt*bc, emb] for the last, softmax layer.
    # because there we do softmax(W*vector + b).
doc_dec_logits_flat = tf.add(tf.matmul(doc_dec_outputs_flat, W), b)
    # [mt*bc, dec_emb] * [dec_emb,output_dim=sent_vocab=MAX_DOC+EOS+PAD] + [output_dim]
    #   = [mt*bc, output_dim]
doc_dec_logits = tf.reshape(doc_dec_logits_flat, (doc_dec_max_step, doc_dec_batch_size, sent_vocab_size))
    # reshape to [mt,bc,output_dim] to get prediction.
doc_dec_prediction = tf.cast(tf.argmax(doc_dec_logits, 2), dtype=tf.int32)
    # argmax long dim=output_dim to get the max-dim in output_dim=MAX_DOC+EOS+PAD.

## EVALUATION (ACCURACY) ##

correct_raw = tf.cast(tf.equal(doc_dec_prediction, doc_targets), tf.int32)
    # equal(pred=[mt=#sents in doc,bc=1], true=[mt=#sents in doc,bc=1])
    #   = [mt,bc]
mask = tf.cast(tf.not_equal(doc_targets, 0), tf.int32)
    # mark non-zero entries in doc_targets with 1.
    # [mt,bc] shaped.
total_seqlen = tf.cast(doc_length, tf.float32)
    # #sents in doc. float to get float accuracy value.
correct = tf.multiply(correct_raw, mask)
    # [mt,bc] * [mt,bc], where if correct, get correct=1*mask=1 = 1
    # otherwise get correct=1*mask=0 = 0.
    # get [mt,bc] matrix here.
accuracy = tf.cast(tf.reduce_sum(correct)-1, tf.float32) / total_seqlen
    # reduce_sum(correct=[mt,bc]) / total_seqlen = scalar / [seqlen]
    #   = [accuracy]. doesn't matter because doesn't affect np.mean(accuracies).
    
## OPTIMIZATION ##

stepwise_cross_entropy = tf.nn.softmax_cross_entropy_with_logits(
    labels=tf.one_hot(doc_targets, depth=sent_vocab_size, dtype=tf.float32),
    logits=doc_dec_logits
)
    # labels=one_hot(doc_targets=[mt,bc],output_dim=sent_vocab)=[mt,bc,output_dim]
    # logits=[mt,bc,output_dim]
loss = tf.reduce_mean(stepwise_cross_entropy)
train_op = tf.train.AdamOptimizer(lr).minimize(loss)
init = tf.global_variables_initializer()
sess.run(init)

### (ROUGH) EVALUATION

In [15]:
loss_track = []
accuracy_track = []

num_epochs = 10
verbose = 100

sess.run(glove_feed, feed_dict={glove_feed:data_dict['glove-init']})

for e in range(num_epochs):
    print 'Epoch {}:'.format(e+1)
    print
    for i in range(len(data_dict['inp-encode'])):
        doc_inputs_, _ = pad_doc(data_dict['inp-encode'][i])
        doc_sents_length_ = pad_sents_length(data_dict['inp-slen'][i])
        doc_length_ = [len(data_dict['inp-encode'][i])] # NB: must be [2]!!!
        doc_targets_ = np.array(pad_tars_order(data_dict['tar-order'][i]))[:,np.newaxis]
        fd = {doc_inputs:doc_inputs_, 
              doc_sents_length:doc_sents_length_,
              doc_length:doc_length_, 
              doc_targets:doc_targets_}
        _, l, a = sess.run([train_op, loss, accuracy], feed_dict=fd)
        loss_track.append(l)
        accuracy_track.append(a)
        if i % verbose == 0:
            print 'Current mean loss = {} | mean accuracy = {}'.format(np.mean(loss_track),np.mean(accuracy_track))
    print

# Results with 0 tuning
#
# Epoch 100:

# Current mean loss = 0.301753968 | mean accuracy = 0.758796215057
# Current mean loss = 0.301661282778 | mean accuracy = 0.758872568607
# Current mean loss = 0.301440030336 | mean accuracy = 0.758985757828
# Current mean loss = 0.300857931376 | mean accuracy = 0.759163379669
# Current mean loss = 0.300510913134 | mean accuracy = 0.759310424328