In [2]:
import sys
# Put the AIDA-SCRIPTS directory first on the module search path so that
# project-local modules can be imported by later cells (presumably where the
# `helpers` module lives -- confirm).
# NOTE(review): absolute, machine-specific path; the notebook will not run
# elsewhere without editing it.
sys.path.insert(0, "/work/04233/sw33286/AIDA-SCRIPTS")

In [3]:
import os
import time
import random
import shutil
import dill
import numpy as np

import tensorflow as tf
from tensorflow.contrib.rnn import LSTMCell, MultiRNNCell, DropoutWrapper

from helpers import Indexer, batch
from itertools import chain, product
from collections import defaultdict

### Prepare data

In [12]:
# Link to NYT data folder
# NOTE(review): absolute, machine-specific paths; adjust when running elsewhere.

nyt_code_dir = "/work/04233/sw33286/AIDA-DATA/nyt_eng_salads_event_sample_code/"
# os.listdir returns entries in arbitrary order; sorting makes a given file
# index refer to the same file on every kernel restart.
FILE_NAMES = sorted(os.listdir(nyt_code_dir))

# Link to dictionary information

info_path = "/work/04233/sw33286/AIDA-DATA/nyt_eng_salads_info/indexer_word2emb_100k.p"
# NOTE: dill.load (like pickle.load) can execute arbitrary code while
# unpickling -- only load files from a trusted source.
# The context manager closes the file handle as soon as loading is done.
with open(info_path, 'rb') as info_file:
    indexer100k, word2emb100k = dill.load(info_file)

# Build the embedding matrix so that row i holds the vector of the word that
# the indexer maps to index i.
glove_embs = np.array([word2emb100k[indexer100k.get_object(i)]
                       for i in range(len(indexer100k))])
print(glove_embs.shape)

(100001, 300)


In [37]:
BATCH_SIZE = 100
NUM_EVENTS = 5
NUM_WORDS = 5

# get_batch emits examples in groups of four, i.e. (BATCH_SIZE//4)*4 examples,
# while the model placeholders are declared with a fixed BATCH_SIZE.
assert BATCH_SIZE % 4 == 0, "BATCH_SIZE must be a multiple of 4"

def get_batch(file_idx):
    """Sample one batch of event pairs from the file at FILE_NAMES[file_idx].

    The file holds two event documents (a third stored element is ignored).
    Each round draws one random event for every ordered combination of the two
    documents -- (a,a), (a,b), (b,a), (b,b) -- labelled 1 when both events come
    from the same document and 0 otherwise. BATCH_SIZE//4 rounds are drawn.

    Args:
        file_idx: index into FILE_NAMES.

    Returns:
        (batch_x1, batch_x2, batch_y) as numpy arrays; per the original shape
        note batch_x* are <bc,ne,nw> and batch_y is <bc,>.

    Raises:
        ValueError: if either document in the file is empty.
    """
    path = os.path.join(nyt_code_dir, FILE_NAMES[file_idx])
    # NOTE: dill.load can execute arbitrary code; only load trusted files.
    with open(path, 'rb') as doc_file:
        edoc_a, edoc_b, _ = dill.load(doc_file)
    if len(edoc_a) == 0 or len(edoc_b) == 0:
        # np.random.randint(0, 0) would fail below with a less helpful message.
        raise ValueError('empty event document in {}'.format(path))

    # (source of x1, source of x2, label) in the order of
    # product([a, b], [a, b]).
    labelled_pairs = [(edoc_a, edoc_a, 1), (edoc_a, edoc_b, 0),
                      (edoc_b, edoc_a, 0), (edoc_b, edoc_b, 1)]
    batch_x1, batch_x2, batch_y = [], [], []
    for _ in range(BATCH_SIZE//4):
        for doc_1, doc_2, label in labelled_pairs:
            batch_x1.append(doc_1[np.random.randint(0, len(doc_1))])
            batch_x2.append(doc_2[np.random.randint(0, len(doc_2))])
            batch_y.append(label)
    return np.array(batch_x1), np.array(batch_x2), np.array(batch_y)
        # batch_x*: <bc,ne,nw>, batch_y: <bc,>

### FFNN-BiLSTM-bilinear

In [61]:
# Start from an empty default graph and open the session used by later cells.
tf.reset_default_graph()

sess = tf.InteractiveSession()

# Model sizes and training hyper-parameters.
VOCAB_SIZE, EMB_SIZE = glove_embs.shape
HID_SIZE = 100 # let event embs be of the same hid-size as role-factored arg vectors.

NUM_LAYERS = 2
KEEP_PROB = 0.7
LEARNING_RATE = 1e-4

# Both documents are fed as word-id tensors of identical shape: <bc,ne,nw>.
_doc_shape = [BATCH_SIZE, NUM_EVENTS, NUM_WORDS]
input_x1 = tf.placeholder(tf.int32, _doc_shape, name='input_x1')
input_x2 = tf.placeholder(tf.int32, _doc_shape, name='input_x2')
input_y = tf.placeholder(tf.int32, [BATCH_SIZE], name='input_y')

keep_prob = tf.placeholder(tf.float32, name="keep_prob")

def _embed_events(word_ids):
    """Look up word vectors and concatenate the words of each event.

    op1. embed words: <bc,ne,nw,emb>
    op2. concat words in event: <bc,ne,nw*emb>
    """
    word_vecs = tf.nn.embedding_lookup(embeddings, word_ids)
    return tf.reshape(word_vecs, [BATCH_SIZE, NUM_EVENTS, -1])

with tf.variable_scope('Embedding'):
    embeddings = tf.get_variable('embedding', [VOCAB_SIZE, EMB_SIZE],
                                 initializer=tf.contrib.layers.xavier_initializer())
    # Assign op that copies glove_embs into the table when it is run.
    glove_init = embeddings.assign(glove_embs)
    input_x1_embedded = _embed_events(input_x1)
    input_x2_embedded = _embed_events(input_x2)

def run_ffnn(inputs): # inputs=<ne,nw*emb>
    """Project concatenated-word event vectors down to word-embedding width.

    Fetches (or creates) the 'W_ffnn' weight matrix in the current variable
    scope and returns `inputs @ W_ffnn`. There is no bias term and no
    non-linearity.

    Returns:
        Tensor <ne,emb>: each event now has the same length as a word embedding.
    """
    weight_shape = [NUM_WORDS*EMB_SIZE, EMB_SIZE]
    projection = tf.get_variable('W_ffnn', weight_shape,
                                 initializer=tf.contrib.layers.xavier_initializer())
    event_vecs = tf.matmul(inputs, projection)
    return event_vecs

# Project every event of both documents with one shared FFNN weight, then move
# the event axis to the front for the time-major LSTM that follows.
with tf.variable_scope('FFNN') as scope:
    input_x1_ffnn = tf.transpose(tf.map_fn(run_ffnn, input_x1_embedded), [1,0,2]) 
        # op1. map_fn applies run_ffnn to each batch element; out: <bc,ne,emb>
        # op2. swap the first two axes for lstm input: <max-time=ne,bc,emb>
    # W_ffnn was created by the call above; from here on get_variable returns
    # the existing variable, so x2 is projected with the same weights as x1.
    scope.reuse_variables()
    input_x2_ffnn = tf.transpose(tf.map_fn(run_ffnn, input_x2_embedded), [1,0,2])

def make_stacked_cell():
    """Build a NUM_LAYERS-deep LSTM stack with output dropout on every layer.

    A fresh LSTMCell/DropoutWrapper pair is created for each layer. Repeating
    one object with `[cell]*NUM_LAYERS` makes every layer refer to the same
    cell instance, which TensorFlow releases after 1.0 either reject or treat
    as weight sharing between layers.
    """
    return MultiRNNCell([DropoutWrapper(LSTMCell(HID_SIZE), output_keep_prob=keep_prob)
                         for _ in range(NUM_LAYERS)])

# Separate stacks per direction, for the same reason as above. Both calls of
# run_lstm use these same two objects, so the two documents are encoded with
# shared weights.
cell_fw = make_stacked_cell()
cell_bw = make_stacked_cell()
cell = cell_fw # kept so that code referring to `cell` keeps working.

def run_lstm(inputs):
    """Encode a time-major event sequence with a stacked bidirectional LSTM.

    Args:
        inputs: float tensor <max-time=ne,bc,emb>.

    Returns:
        Tensor <bc,hid*2*num-layers>: for every layer, the final forward and
        backward h-states concatenated, with the layers concatenated in order.
    """
    # The per-step outputs (<max-time=ne,bc,hid> per direction) are not used;
    # they are available here if attention is added later.
    _, (fw_final_state, bw_final_state) = tf.nn.bidirectional_dynamic_rnn(
        cell_fw=cell_fw,
        cell_bw=cell_bw,
        inputs=inputs,
        sequence_length=[NUM_EVENTS]*BATCH_SIZE,
        dtype=tf.float32, time_major=True)
    # Final states hold one state tuple per layer: lstm-out size *= NUM_LAYERS
    # by stacking.
    per_layer = [tf.concat([fw_state_tuple.h, bw_state_tuple.h], 1)
                 for fw_state_tuple, bw_state_tuple in zip(fw_final_state, bw_final_state)]
    return tf.concat(per_layer, 1)

# Encode both documents with the same BiLSTM weights.
with tf.variable_scope('BiLSTM') as scope:
    final_vec_x1 = run_lstm(input_x1_ffnn) # <bc,hid*2*num-layers>
    # Switch the scope to reuse mode so the second call picks up the variables
    # created by the first one instead of trying to create new ones.
    scope.reuse_variables()
    final_vec_x2 = run_lstm(input_x2_ffnn)

# Width of each document vector: h-states of both directions from every layer.
final_vec_size = HID_SIZE*2*NUM_LAYERS    

def run_logits(fv_x1, fv_x2):
    """Bilinear compatibility logit x1_i^T W_bi x2_i for each row pair i.

    Args:
        fv_x1, fv_x2: float tensors <bc,final_vec_size>.

    Returns:
        Tensor <bc,> of unnormalised scores (logits).
    """
    W_bi = tf.get_variable('W_bi', [final_vec_size, final_vec_size],
                           initializer=tf.contrib.layers.xavier_initializer())
    # Row-wise sum of (x1 W) * x2 equals diag((x1 W) x2^T) without building the
    # full <bc,bc> matrix first.
    return tf.reduce_sum(tf.matmul(fv_x1, W_bi) * fv_x2, axis=1, name='logits')

def run_scores(fv_x1, fv_x2):
    """Probability in (0,1) for each row pair: sigmoid of the bilinear logit.

    Uses its own arguments rather than the module-level final_vec_x1 and
    final_vec_x2. Creates 'W_bi' through run_logits, so call it under a
    reusing variable scope if that variable already exists.
    """
    return tf.nn.sigmoid(run_logits(fv_x1, fv_x2), name='scores')

logits = run_logits(final_vec_x1, final_vec_x2)
scores = tf.nn.sigmoid(logits, name='scores')
predictions = tf.cast(tf.round(scores), tf.int32, name='predictions')

with tf.name_scope('Loss'):
    # sigmoid_cross_entropy_with_logits applies the sigmoid itself, so it must
    # receive the raw logits; feeding it `scores` would squash the values twice.
    losses = tf.nn.sigmoid_cross_entropy_with_logits(labels=tf.cast(input_y, tf.float32), logits=logits)
    loss = tf.reduce_mean(losses, name='loss')

with tf.name_scope('Accuracy'):
    # Fraction of examples whose rounded score matches the 0/1 label.
    correct_predictions = tf.equal(predictions, input_y)
    accuracy = tf.reduce_mean(tf.cast(correct_predictions, tf.float32), name='accuracy')

# global_step is incremented by one each time train_op is run.
global_step = tf.Variable(0, name='global_step', trainable=False)
optimizer = tf.train.AdamOptimizer(LEARNING_RATE)
grads_and_vars = optimizer.compute_gradients(loss)
train_op = optimizer.apply_gradients(grads_and_vars, global_step=global_step, name='train_op')

sess.run(tf.global_variables_initializer())
# The initializer above fills the embedding table with random values. Run the
# glove_init assign op so training starts from the pre-trained vectors; it is
# not run by any other visible cell.
sess.run(glove_init)

# saver = tf.train.Saver()

In [63]:
NUM_EPOCHS = 1
TRAIN_SIZE = 10
VERBOSE = 1
# TRAIN_SIZE = len(FILE_NAMES)
# VERBOSE = 1000

# Per-step loss / accuracy over all completed epochs.
loss_track, accuracy_track = [], []
try:
    for e in range(NUM_EPOCHS):
        print('Epoch ', e+1)
        print('\n')
        # Draw distinct files in random order. Sampling without replacement
        # already yields a random ordering, so no extra shuffle is needed.
        # The size is capped so a TRAIN_SIZE above the file count cannot raise.
        file_indices = np.random.choice(len(FILE_NAMES),
                                        size=min(TRAIN_SIZE, len(FILE_NAMES)),
                                        replace=False)
        curr_loss_track, curr_accuracy_track = [], []
        for file_idx in file_indices:
            batch_x1, batch_x2, batch_y = get_batch(file_idx)
            fd = {input_x1:batch_x1, input_x2:batch_x2,
                  input_y:batch_y,
                  keep_prob:KEEP_PROB}
            _, step, loss_, accuracy_ = sess.run([train_op, global_step, loss, accuracy], feed_dict=fd)
            curr_loss_track.append(loss_)
            curr_accuracy_track.append(accuracy_)
            if step%VERBOSE==0:
                print(' average batch loss & accuracy at step {}: <{}, {}>'.format(step,
                                                                                   np.mean(curr_loss_track),
                                                                                   np.mean(curr_accuracy_track)))
        # Summarise and record inside the epoch loop so every epoch is
        # reported and kept, not only the last one.
        print('\n')
        print('  epoch mean loss & accuracy: <{}, {}>'.format(np.mean(curr_loss_track),np.mean(curr_accuracy_track)))
        print('\n')
        loss_track += curr_loss_track
        accuracy_track += curr_accuracy_track
except KeyboardInterrupt:
    # Manual stop from the notebook; steps of the interrupted epoch are not
    # added to the tracks.
    print('Stopped!')