In [1]:
import sys
sys.path.insert(0, "/work/04233/sw33286/AIDA-SCRIPTS")

In [2]:
import os
import time
import random
import shutil
import dill
import numpy as np

import tensorflow as tf
from tensorflow.contrib.rnn import LSTMCell, MultiRNNCell, DropoutWrapper

from helpers import Indexer, batch
from itertools import chain, product
from collections import defaultdict

### Prepare data

In [7]:
# Link to NYT data folder

nyt_code_dir = "/work/04233/sw33286/AIDA-DATA/nyt_end_salads_event_code_ffnn_embeddings/"
# os.listdir() returns entries in arbitrary, filesystem-dependent order. Sort
# them so that a given file index (e.g. get_batch(0)) refers to the same file
# on every run and on every machine.
FILE_NAMES = sorted(os.listdir(nyt_code_dir))

# Link to dictionary information

# info_path = "/work/04233/sw33286/AIDA-DATA/nyt_eng_salads_info/indexer_word2emb_100k_event.p"
# indexer100k, word2emb100k = dill.load(open(info_path, 'rb'))
# glove_embs = []
# for i in range(len(indexer100k)):
#     glove_embs.append(word2emb100k[indexer100k.get_object(i)])
# glove_embs = np.array(glove_embs)
# print(glove_embs.shape)

In [8]:
# Shape configuration shared by the batch builder and the model graph.
BATCH_SIZE = 32    # number of (x1, x2) pairs sampled per batch
NUM_EVENTS = 5     # first dimension of one item (rows of CTX_DUMMY)
EMB_SIZE = 100     # second dimension of one item (columns of CTX_DUMMY)
CTX_LEN = 20       # number of items kept in (or padded into) the context

# All-zero item used to pad a context that has fewer than CTX_LEN items.
CTX_DUMMY = np.zeros(shape=(NUM_EVENTS, EMB_SIZE))

def sample_two(lbs):
    """Pick two distinct positions of `lbs` at random and compare their labels.

    Returns:
        (idx1, idx2, same) where idx1 != idx2 are positions into `lbs` and
        `same` is 1 when the labels at the two positions are equal, else 0.

    The positions are drawn without replacement, so `lbs` needs at least two
    entries (np.random.choice raises otherwise).
    """
    positions = np.arange(len(lbs))
    picked = np.random.choice(positions, size=2, replace=False)
    first, second = picked
    same_label = int(bool(lbs[first] == lbs[second]))
    return first, second, same_label

def get_batch(file_idx):
    """Build one training batch from the data file at position `file_idx`.

    The file FILE_NAMES[file_idx] inside nyt_code_dir is unpickled into a pair
    (batch_mix, batch_lbs). BATCH_SIZE pairs of distinct positions are drawn
    with sample_two(); each pair contributes the two corresponding items of
    batch_mix and a 0/1 label saying whether their entries in batch_lbs match.

    The context is the first CTX_LEN items of batch_mix, padded at the end
    with CTX_DUMMY (all zeros) when the file holds fewer than CTX_LEN items.

    Args:
        file_idx: index into FILE_NAMES.

    Returns:
        (batch_x1, batch_x2, batch_ctx, batch_y) as numpy arrays. For the data
        used in this notebook the shapes are (32, 5, 100), (32, 5, 100),
        (20, 5, 100) and (32,).
    """
    # NOTE(security): dill.load can execute arbitrary code embedded in the
    # file; only point nyt_code_dir at data you trust.
    # The `with` block closes the handle as soon as the file is read, instead
    # of leaking one open file per training step.
    with open(nyt_code_dir + FILE_NAMES[file_idx], 'rb') as data_file:
        batch_mix, batch_lbs = dill.load(data_file)

    batch_x1, batch_x2, batch_y = [], [], []
    for _ in range(BATCH_SIZE):
        idx1, idx2, lb = sample_two(batch_lbs)
        batch_x1.append(batch_mix[idx1])
        batch_x2.append(batch_mix[idx2])
        batch_y.append(lb)

    size_ctx = len(batch_mix)
    if size_ctx >= CTX_LEN:
        batch_ctx = batch_mix[:CTX_LEN]
    else:
        # Pad with all-zero items up to exactly CTX_LEN.
        batch_ctx = np.vstack((batch_mix, [CTX_DUMMY]*(CTX_LEN-size_ctx)))
    return np.array(batch_x1), np.array(batch_x2), np.array(batch_ctx), np.array(batch_y)

In [9]:
# Smoke test: build one batch from the first data file and show the shape of
# each returned array, followed by the shape of the padding item.
a1, a2, b, c = get_batch(0)
print(*(arr.shape for arr in (a1, a2, b, c)))
print(CTX_DUMMY.shape)

(32, 5, 100) (32, 5, 100) (20, 5, 100) (32,)
(5, 100)


### BiLSTM-bilinear

In [18]:
# Start from a fresh default graph and open the session that the rest of this
# notebook section uses.
tf.reset_default_graph()

sess = tf.InteractiveSession()

# EMB_SIZE = 100 # the hard coded event embedding size. # SPECIFIED ABOVE (PRETRAINED SIZE)
HID_SIZE = 100 # let event embs be of the same hid-size as role-factored arg vectors.

NUM_LAYERS = 2 # number of stacked entries in the MultiRNNCell built below.
KEEP_PROB = 0.7 # value fed to the keep_prob placeholder by the training loop.
LEARNING_RATE = 1e-5 # passed to the Adam optimizer.

# hyperparams for cnn context reader.
FILTER_SIZES = [3,4,5] # filter heights, i.e. how many context rows one filter spans.
NUM_FILTERS = 50 # filters per filter size.
NUM_CHANNELS = 1 # input channels of the context "image".

# The placeholders are given explicit names because the restore cell looks
# them up by name ('input_x1:0', ...).
input_x1 = tf.placeholder(tf.float32, [BATCH_SIZE, NUM_EVENTS, EMB_SIZE], name='input_x1')
    # NB: this is embedded input by role-factored net.
input_x2 = tf.placeholder(tf.float32, [BATCH_SIZE, NUM_EVENTS, EMB_SIZE], name='input_x2')
input_ctx = tf.placeholder(tf.float32, [CTX_LEN, NUM_EVENTS, EMB_SIZE], name='input_ctx')
input_y = tf.placeholder(tf.int32, [BATCH_SIZE], name='input_y')

# Scalar keep probability, used by the LSTM DropoutWrapper and by run_cnn.
keep_prob = tf.placeholder(tf.float32, name="keep_prob")

# Switch the pair inputs from batch-major to time-major for the RNN.
input_x1_ffnn = tf.transpose(input_x1, [1,0,2]) # <mt=ne,bc,emb>
input_x2_ffnn = tf.transpose(input_x2, [1,0,2])
input_ctx_ffnn = tf.expand_dims(tf.expand_dims(tf.reshape(input_ctx, [CTX_LEN, NUM_EVENTS*EMB_SIZE]),0),-1)
    # op1. reshape: <ctx,ne*emb>
    # op2,3. expand dim for bc=1, chn=1: <bc=1,height=ctx,width=ne*emb,chn=1>

# NOTE(review): `[...]*NUM_LAYERS` repeats ONE DropoutWrapper(LSTMCell) object
# NUM_LAYERS times rather than building NUM_LAYERS separate cells, and
# run_lstm passes this same `cell` as both cell_fw and cell_bw. Whether that
# shares weights or is rejected depends on the TensorFlow version -- confirm
# this is intended before changing it (it affects the saved variable set).
cell = MultiRNNCell([DropoutWrapper(LSTMCell(HID_SIZE),output_keep_prob=keep_prob)]*NUM_LAYERS)

def run_lstm(inputs): # inputs: <mt,bc,emb>
    """Run the module-level `cell` as a bidirectional RNN over `inputs`.

    Args:
        inputs: time-major tensor <max-time=ne,bc,emb>.

    Returns:
        (final_state, outputs)
          final_state: <bc,hid*2*num-layers>; for every layer the forward and
            backward final `h` are concatenated, then all layers are
            concatenated.
          outputs: <bc,ne,hid*2>; forward and backward outputs concatenated on
            the feature axis and transposed to batch-major.
    """
    # Every sequence in the batch is treated as having NUM_EVENTS steps.
    seq_lens = [NUM_EVENTS] * BATCH_SIZE
    rnn_outputs, rnn_states = tf.nn.bidirectional_dynamic_rnn(
        cell_fw=cell,
        cell_bw=cell,
        inputs=inputs,
        sequence_length=seq_lens,
        dtype=tf.float32,
        time_major=True)
    fw_outputs, bw_outputs = rnn_outputs          # each <ne,bc,hid>
    fw_final_state, bw_final_state = rnn_states   # one state tuple per layer

    # <bc,hid*2> per layer, then all layers side by side.
    per_layer_h = []
    for fw_layer, bw_layer in zip(fw_final_state, bw_final_state):
        per_layer_h.append(tf.concat([fw_layer.h, bw_layer.h], 1))
    final_state = tf.concat(per_layer_h, 1)

    # <ne,bc,hid*2> -> <bc,ne,hid*2>
    outputs_time_major = tf.concat([fw_outputs, bw_outputs], 2)
    outputs_batch_major = tf.transpose(outputs_time_major, [1, 0, 2])
    return final_state, outputs_batch_major
        
# Encode x1 and x2 with the same BiLSTM: the first run_lstm call creates the
# variables under 'BiLSTM'; after reuse_variables() the second call picks up
# those same variables instead of creating a second set.
with tf.variable_scope('BiLSTM') as scope: 
    final_state_x1, outputs_x1 = run_lstm(input_x1_ffnn)
        # fs_x1: <bc,hid*2*num-layers>
        # out_x1: <bc,ne,hid*2>
    scope.reuse_variables()
    final_state_x2, outputs_x2 = run_lstm(input_x2_ffnn)
        # same shapes as for x1.
    
def run_attention(outputs, state):
    """Summarise `outputs` with attention weights conditioned on `state`.

    Args:
        outputs: <bc,ne,hid*2> per-step encodings to attend over.
        state: <bc,hid*2*num-layers> encoding that drives the attention.

    Returns:
        <bc,hid*2>: sum over the ne axis of `outputs`, weighted by a softmax
        over that axis.

    Creates (or, under a reusing variable scope, fetches) the variables
    'W_d', 'W_s' and 'W_a'.
    """
    enc_dim = HID_SIZE * 2
    xavier = tf.contrib.layers.xavier_initializer

    W_d = tf.get_variable('W_d', [enc_dim, enc_dim], initializer=xavier())
    W_s = tf.get_variable('W_s', [enc_dim * NUM_LAYERS, enc_dim], initializer=xavier())

    # <bc,ne,hid*2> * <hid*2,hid*2> -> <bc,ne,hid*2>
    projected_outputs = tf.tensordot(outputs, W_d, axes=[[2], [0]])
    # <bc,hid*2*num-layers> * <hid*2*num-layers,hid*2> -> <bc,hid*2> -> <bc,1,hid*2>
    projected_state = tf.matmul(state, W_s)
    projected_state = tf.expand_dims(projected_state, axis=1)
    # Broadcast the state over the ne axis, then squash element-wise.
    hidden = tf.add(projected_outputs, projected_state)
    hidden = tf.nn.tanh(hidden)

    W_a = tf.get_variable('W_a', [enc_dim, 1], initializer=xavier())
    # <bc,ne,hid*2> * <hid*2,1> -> <bc,ne,1>, normalised over max-time=ne.
    energies = tf.tensordot(hidden, W_a, axes=[[2], [0]])
    weights = tf.nn.softmax(energies, dim=1)

    # Weighted sum over ne: <bc,ne,hid*2> -> <bc,hid*2>
    weighted = tf.multiply(outputs, weights)
    return tf.reduce_sum(weighted, axis=1)

# Both attention directions use one set of weights: the first run_attention
# call creates W_d / W_s / W_a under 'Mutual-Attention', and after
# reuse_variables() the second call fetches those same variables.
with tf.variable_scope('Mutual-Attention') as scope:
    x1_to_x2_att = run_attention(outputs_x2, final_state_x1) # x1 attending to x2, <bc,hid*2>
    scope.reuse_variables()
    x2_to_x1_att = run_attention(outputs_x1, final_state_x2) # x2 attending to x1
    
def run_cnn(inputs): # in: <bc=1,height=ctx,width=ne*emb,chn=1>
    """Encode the context with one conv + max-pool branch per filter size.

    Each branch slides a filter spanning `filter_size` context rows and the
    full row width over the input, applies bias + ReLU, and max-pools over all
    CTX_LEN-filter_size+1 vertical positions. The pooled branches are
    concatenated on the channel axis, flattened and passed through dropout
    with the graph-level `keep_prob` placeholder.

    Args:
        inputs: <bc=1,height=ctx,width=ne*emb,chn=1>.

    Returns:
        <bc=1,num-filters*len(filter-sizes)>.
    """
    xavier = tf.contrib.layers.xavier_initializer
    row_width = NUM_EVENTS * EMB_SIZE
    unit_strides = [1, 1, 1, 1]

    pooled_branches = []
    for filter_size in FILTER_SIZES:
        with tf.variable_scope('CNN-ctx-%s' % filter_size):
            W = tf.get_variable('W', [filter_size, row_width, NUM_CHANNELS, NUM_FILTERS],
                                initializer=xavier())
            b = tf.get_variable('b', [NUM_FILTERS], initializer=xavier())
            conv = tf.nn.conv2d(inputs, W, strides=unit_strides, padding='VALID', name='conv')
            h = tf.nn.relu(tf.nn.bias_add(conv, b), name='relu')
            # Pool over every vertical position the filter could take.
            num_positions = CTX_LEN - filter_size + 1
            pool = tf.nn.max_pool(h, ksize=[1, num_positions, 1, 1], strides=unit_strides,
                                  padding='VALID', name='pool')
            pooled_branches.append(pool)

    num_filters_total = NUM_FILTERS * len(FILTER_SIZES)
    merged = tf.concat(pooled_branches, 3)
    flat = tf.reshape(merged, [-1, num_filters_total])
    return tf.nn.dropout(flat, keep_prob) # <bc=1,num-filters*len(filter-sizes)>

# The context is encoded once (batch of 1) and the resulting row is repeated
# BATCH_SIZE times so it can be concatenated with every pair in the batch.
with tf.variable_scope('Context-reader') as scope:
    ctx = tf.tile(run_cnn(input_ctx_ffnn), [BATCH_SIZE,1])
        # op1. run-cnn out: <bc=1,num-filters*len(filter-sizes)>
        # op2. create bc copies of it: <bc,num-filters*len(filter-sizes)>
        
def run_scores(fs_x1, fs_x2, att_12, att_21, c, return_logits=False):
    """Score every (x1, x2) pair with a bilinear form over their feature vectors.

    Each side's feature vector is the concatenation of its final BiLSTM state,
    its attention summary of the other side, and the shared context encoding.

    Args:
        fs_x1, fs_x2: <bc,hid*2*num-layers> final states of x1 / x2.
        att_12, att_21: <bc,hid*2> attention summaries (x1->x2, x2->x1).
        c: <bc,num-filters*len(filter-sizes)> context encoding.
        return_logits: when True, also return the pre-sigmoid values. The
            default (False) keeps the original single-tensor return.

    Returns:
        scores, a <bc,> tensor in (0, 1) produced by an op named 'scores';
        or the pair (scores, logits) when `return_logits` is True.
    """
    fv_size = HID_SIZE*2*NUM_LAYERS+HID_SIZE*2+NUM_FILTERS*len(FILTER_SIZES)
        # sent encoding size + mutual attention size + context size.
        # e.g. hid-size=100, num-filters=50
        #      400 + 200 + 150 = 750
    W_bi = tf.get_variable('W_bi', [fv_size, fv_size],
                           initializer=tf.contrib.layers.xavier_initializer())
    fv_x1 = tf.concat([fs_x1,att_12,c],axis=1)
        # concat: [<bc,hid*2*num-layers>, <bc,hid*2>, <bc,num-filters*len(filter-sizes)>]
        #   -> <bc,fv = hid*2*num-layers + hid*2 + num-filters*len(filter-sizes)>
    fv_x2 = tf.concat([fs_x2,att_21,c],axis=1)
    # Row-wise bilinear product fv_x1[i] . W_bi . fv_x2[i]. This equals the
    # diagonal of (fv_x1 W_bi) fv_x2^T but never materialises the <bc,bc>
    # matrix whose off-diagonal entries would be thrown away.
    logits = tf.reduce_sum(tf.multiply(tf.matmul(fv_x1, W_bi), fv_x2), axis=1, name='logits')
        # op1. <bc,fv> * <fv,fv> -> <bc,fv>
        # op2. elem* <bc,fv>, summed over fv -> <bc,>
    scores = tf.nn.sigmoid(logits, name='scores')
        # sigmoid to compute scores: <bc,>
    return (scores, logits) if return_logits else scores

scores, logits = run_scores(final_state_x1, final_state_x2, x1_to_x2_att, x2_to_x1_att, ctx,
                            return_logits=True)
predictions = tf.cast(tf.round(scores), tf.int32, name='predictions')

with tf.name_scope('Loss'):
    # sigmoid_cross_entropy_with_logits applies the sigmoid itself, so it must
    # be fed the raw logits. Feeding it `scores` (already a sigmoid output)
    # squashes the values twice and confines the effective probabilities to
    # roughly [0.5, 0.73], which flattens the gradients.
    losses = tf.nn.sigmoid_cross_entropy_with_logits(labels=tf.cast(input_y, tf.float32), logits=logits)
    loss = tf.reduce_mean(losses, name='loss')

with tf.name_scope('Accuracy'):
    # Fraction of pairs in the batch whose rounded score equals the 0/1 label.
    correct_predictions = tf.equal(predictions, input_y)
    accuracy = tf.reduce_mean(tf.cast(correct_predictions, tf.float32), name='accuracy')
    
# Step counter, excluded from training; apply_gradients is given it below.
# 'global_step' and 'train_op' are named explicitly because the restore cell
# looks them up by name.
global_step = tf.Variable(0, name='global_step', trainable=False)
optimizer = tf.train.AdamOptimizer(LEARNING_RATE)
grads_and_vars = optimizer.compute_gradients(loss)
train_op = optimizer.apply_gradients(grads_and_vars, global_step=global_step, name='train_op')

# Initialise only after the optimizer ops have been added, so that any
# variables the optimizer created are covered as well.
sess.run(tf.global_variables_initializer())

# Checkpoint location: save_dir is the directory, save_path the file prefix
# handed to saver.save() by checkpoint_model().
saver = tf.train.Saver()
save_dir = "/work/04233/sw33286/AIDA-INDIV-MODEL-SAVE-EVENT/our-model-ffnn-init-bilstm/"
save_path = save_dir + "our-model-ffnn-init-bilstm-00"

def remove_all_files(target_dir):
    """Delete every file directly inside `target_dir` (not recursive).

    Sub-directories are left untouched: os.remove() cannot delete a directory
    and would abort the clean-up half-way through. A `target_dir` that does
    not exist is treated as already empty.

    Args:
        target_dir: directory whose files are removed.
    """
    if not os.path.isdir(target_dir):
        return
    for filename in os.listdir(target_dir):
        path = os.path.abspath(os.path.join(target_dir, filename))
        # islink() also catches dangling symlinks, for which isfile() is False.
        if os.path.isfile(path) or os.path.islink(path):
            os.remove(path)
        
def checkpoint_model(s_dir, s_path, svr, ss):
    """Replace the contents of `s_dir` with a fresh checkpoint of session `ss`.

    Args:
        s_dir: checkpoint directory (save_dir); created if missing, then
            emptied with remove_all_files().
        s_path: path prefix handed to `svr.save` (save_path).
        svr: object exposing `save(session, path)` (the tf.train.Saver here).
        ss: session whose variables are written (sess).
    """
    # Make sure the directory exists before it is listed and written to, so
    # the very first checkpoint does not fail on a missing folder.
    os.makedirs(s_dir, exist_ok=True)
    # NOTE(review): the previous checkpoint is deleted before the new one is
    # written; if save() fails or is interrupted, no checkpoint is left.
    remove_all_files(s_dir)
    svr.save(ss, s_path)

In [26]:
# Settings for this training run.
NUM_EPOCHS = 1
TRAIN_SIZE = 10      # number of data files sampled per epoch (one step each)
VERBOSE = 1          # print running averages every VERBOSE steps
SAVE_PER = 50000     # checkpoint every SAVE_PER steps
# TRAIN_SIZE = len(FILE_NAMES)
# VERBOSE = 1000

# Run-wide history of per-step loss / accuracy across all epochs.
loss_track, accuracy_track = [], []
try:
    for e in range(NUM_EPOCHS):
        print('Epoch ', e+1)
        print('\n')
        # Sampling without replacement already returns the indices in random
        # order, so no separate shuffle is needed. The sample size is capped
        # at the number of files so an oversized TRAIN_SIZE cannot raise.
        num_files = len(FILE_NAMES)
        file_indices = np.random.choice(num_files, size=min(TRAIN_SIZE, num_files), replace=False)
        curr_loss_track, curr_accuracy_track = [], []
        for file_idx in file_indices:
            batch_x1, batch_x2, batch_ctx, batch_y = get_batch(file_idx)
            fd = {input_x1:batch_x1, input_x2:batch_x2,
                  input_ctx:batch_ctx,
                  input_y:batch_y,
                  keep_prob:KEEP_PROB}
            _, step, loss_, accuracy_ = sess.run([train_op, global_step, loss, accuracy], feed_dict=fd)
            curr_loss_track.append(loss_)
            curr_accuracy_track.append(accuracy_)
            # Record into the run-wide history step by step, so the steps
            # completed before a KeyboardInterrupt are not lost.
            loss_track.append(loss_)
            accuracy_track.append(accuracy_)
            if step%VERBOSE==0:
                print(' average batch loss & accuracy at step {}: <{}, {}>'.format(step,
                                                                                   np.mean(curr_loss_track), 
                                                                                   np.mean(curr_accuracy_track)))
            if step%SAVE_PER==0:
                checkpoint_model(save_dir, save_path, saver, sess)
                print(' [SAVE] model checkpointed at step {}'.format(step))
        # Per-epoch summary. This sits inside the epoch loop so that every
        # epoch is reported, not only the last one.
        print('\n')
        print('  epoch mean loss & accuracy: <{}, {}>'.format(np.mean(curr_loss_track),np.mean(curr_accuracy_track)))
        print('\n')
except KeyboardInterrupt:
    print('Stopped!')

### Model saving

In [20]:
# Write the current session's variables to save_path (save_dir is emptied first).
checkpoint_model(s_dir=save_dir, s_path=save_path, svr=saver, ss=sess)

### Continue training from a saved checkpoint

In [21]:
import sys
sys.path.insert(0, "/work/04233/sw33286/AIDA-SCRIPTS")

import os
import time
import random
import shutil
import dill
import numpy as np

import tensorflow as tf
from tensorflow.contrib.rnn import LSTMCell, MultiRNNCell, DropoutWrapper

from helpers import Indexer, batch
from itertools import chain, product
from collections import defaultdict

# Link to NYT data folder

nyt_code_dir = "/work/04233/sw33286/AIDA-DATA/nyt_end_salads_event_code_ffnn_embeddings/"
# os.listdir() returns entries in arbitrary, filesystem-dependent order. Sort
# them so that a given file index refers to the same file on every run.
FILE_NAMES = sorted(os.listdir(nyt_code_dir))

# # Link to dictionary information

# info_path = "/work/04233/sw33286/AIDA-DATA/nyt_eng_salads_info/indexer_word2emb_100k_event.p"
# indexer100k, word2emb100k = dill.load(open(info_path, 'rb'))
# glove_embs = []
# for i in range(len(indexer100k)):
#     glove_embs.append(word2emb100k[indexer100k.get_object(i)])
# glove_embs = np.array(glove_embs)
# print(glove_embs.shape)

# Shape configuration used by get_batch().
BATCH_SIZE = 32    # number of (x1, x2) pairs sampled per batch
NUM_EVENTS = 5     # first dimension of one item (rows of CTX_DUMMY)
EMB_SIZE = 100     # second dimension of one item (columns of CTX_DUMMY)
CTX_LEN = 20       # number of items kept in (or padded into) the context

# All-zero item used to pad a context that has fewer than CTX_LEN items.
CTX_DUMMY = np.zeros(shape=(NUM_EVENTS, EMB_SIZE))

def sample_two(lbs):
    """Draw two different positions of `lbs` and report whether their labels agree.

    Returns:
        (idx1, idx2, same): two distinct positions into `lbs`, and 1 if the
        labels stored there are equal, 0 if they differ.

    `lbs` must hold at least two labels, since the draw is without replacement.
    """
    candidates = np.arange(len(lbs))
    pair = np.random.choice(candidates, size=2, replace=False)
    pos_a, pos_b = pair
    if lbs[pos_a] == lbs[pos_b]:
        return pos_a, pos_b, 1
    return pos_a, pos_b, 0

def get_batch(file_idx):
    """Build one training batch from the data file at position `file_idx`.

    The file FILE_NAMES[file_idx] inside nyt_code_dir is unpickled into a pair
    (batch_mix, batch_lbs). BATCH_SIZE pairs of distinct positions are drawn
    with sample_two(); each pair contributes the two corresponding items of
    batch_mix and a 0/1 label saying whether their entries in batch_lbs match.

    The context is the first CTX_LEN items of batch_mix, padded at the end
    with CTX_DUMMY (all zeros) when the file holds fewer than CTX_LEN items.

    Args:
        file_idx: index into FILE_NAMES.

    Returns:
        (batch_x1, batch_x2, batch_ctx, batch_y) as numpy arrays;
        batch_x*: <bc,ne,emb>, batch_ctx: <ctx,ne,emb>, batch_y: <bc,>
    """
    # NOTE(security): dill.load can execute arbitrary code embedded in the
    # file; only point nyt_code_dir at data you trust.
    # The `with` block closes the handle as soon as the file is read, instead
    # of leaking one open file per training step.
    with open(nyt_code_dir + FILE_NAMES[file_idx], 'rb') as data_file:
        batch_mix, batch_lbs = dill.load(data_file)

    batch_x1, batch_x2, batch_y = [], [], []
    for _ in range(BATCH_SIZE):
        idx1, idx2, lb = sample_two(batch_lbs)
        batch_x1.append(batch_mix[idx1])
        batch_x2.append(batch_mix[idx2])
        batch_y.append(lb)

    size_ctx = len(batch_mix)
    if size_ctx >= CTX_LEN:
        batch_ctx = batch_mix[:CTX_LEN]
    else:
        # Pad with all-zero items up to exactly CTX_LEN.
        batch_ctx = np.vstack((batch_mix, [CTX_DUMMY]*(CTX_LEN-size_ctx)))
    return np.array(batch_x1), np.array(batch_x2), np.array(batch_ctx), np.array(batch_y)

In [22]:
# Where the saved model lives: the .meta file holds the graph definition, the
# latest checkpoint in the same directory holds the variable values.
restore_dir = "/work/04233/sw33286/AIDA-INDIV-MODEL-SAVE-EVENT/our-model-ffnn-init-bilstm/"
restore_filename = "our-model-ffnn-init-bilstm-00.meta"

# Rebuild the graph from the .meta file (into the default graph) and load the
# saved variable values into a new session.
sess = tf.Session()
saver = tf.train.import_meta_graph(restore_dir + restore_filename)
saver.restore(sess, tf.train.latest_checkpoint(restore_dir))
graph = tf.get_default_graph()

# Re-bind the Python names used by the training loop to the restored graph
# nodes, looked up by the names they were given when the graph was built.
input_x1 = graph.get_tensor_by_name('input_x1:0')
input_x2 = graph.get_tensor_by_name('input_x2:0')
input_ctx = graph.get_tensor_by_name('input_ctx:0')
input_y = graph.get_tensor_by_name('input_y:0')
keep_prob = graph.get_tensor_by_name('keep_prob:0')

scores = graph.get_tensor_by_name('scores:0')
predictions = graph.get_tensor_by_name('predictions:0')
loss = graph.get_tensor_by_name('Loss/loss:0')
accuracy = graph.get_tensor_by_name('Accuracy/accuracy:0')
global_step = graph.get_tensor_by_name('global_step:0')
# train_op was created by optimizer.apply_gradients(..., name='train_op') and
# is an Operation, not a Tensor. Fetch it as an operation instead of relying
# on that op happening to expose an output called 'train_op:0'.
train_op = graph.get_operation_by_name('train_op')

# Continued training checkpoints back into the same location.
save_dir = "/work/04233/sw33286/AIDA-INDIV-MODEL-SAVE-EVENT/our-model-ffnn-init-bilstm/"
save_path = save_dir + "our-model-ffnn-init-bilstm-00"

def remove_all_files(target_dir):
    """Delete every file directly inside `target_dir` (not recursive).

    Sub-directories are left untouched: os.remove() cannot delete a directory
    and would abort the clean-up half-way through. A `target_dir` that does
    not exist is treated as already empty.

    Args:
        target_dir: directory whose files are removed.
    """
    if not os.path.isdir(target_dir):
        return
    for filename in os.listdir(target_dir):
        path = os.path.abspath(os.path.join(target_dir, filename))
        # islink() also catches dangling symlinks, for which isfile() is False.
        if os.path.isfile(path) or os.path.islink(path):
            os.remove(path)
        
def checkpoint_model(s_dir, s_path, svr, ss):
    """Replace the contents of `s_dir` with a fresh checkpoint of session `ss`.

    Args:
        s_dir: checkpoint directory (save_dir); created if missing, then
            emptied with remove_all_files().
        s_path: path prefix handed to `svr.save` (save_path).
        svr: object exposing `save(session, path)` (the restored saver here).
        ss: session whose variables are written (sess).
    """
    # Make sure the directory exists before it is listed and written to.
    os.makedirs(s_dir, exist_ok=True)
    # NOTE(review): the previous checkpoint is deleted before the new one is
    # written; if save() fails or is interrupted, no checkpoint is left.
    remove_all_files(s_dir)
    svr.save(ss, s_path)

In [25]:
# Settings for this continued-training run.
NUM_EPOCHS = 1
TRAIN_SIZE = 10      # number of data files sampled per epoch (one step each)
VERBOSE = 1          # print running averages every VERBOSE steps
SAVE_PER = 50000     # checkpoint every SAVE_PER steps
# TRAIN_SIZE = len(FILE_NAMES)
# VERBOSE = 1000

# Dropout keep probability fed to the restored `keep_prob` placeholder. This
# section never defined it, so on a fresh kernel the feed dict below raised
# NameError. 0.7 mirrors the value used when the graph was first trained.
KEEP_PROB = 0.7

# Run-wide history of per-step loss / accuracy across all epochs.
loss_track, accuracy_track = [], []
try:
    for e in range(NUM_EPOCHS):
        print('Epoch ', e+1)
        print('\n')
        # Sampling without replacement already returns the indices in random
        # order, so no separate shuffle is needed. The sample size is capped
        # at the number of files so an oversized TRAIN_SIZE cannot raise.
        num_files = len(FILE_NAMES)
        file_indices = np.random.choice(num_files, size=min(TRAIN_SIZE, num_files), replace=False)
        curr_loss_track, curr_accuracy_track = [], []
        for file_idx in file_indices:
            batch_x1, batch_x2, batch_ctx, batch_y = get_batch(file_idx)
            fd = {input_x1:batch_x1, input_x2:batch_x2,
                  input_ctx:batch_ctx,
                  input_y:batch_y,
                  keep_prob:KEEP_PROB}
            _, step, loss_, accuracy_ = sess.run([train_op, global_step, loss, accuracy], feed_dict=fd)
            curr_loss_track.append(loss_)
            curr_accuracy_track.append(accuracy_)
            # Record into the run-wide history step by step, so the steps
            # completed before a KeyboardInterrupt are not lost.
            loss_track.append(loss_)
            accuracy_track.append(accuracy_)
            if step%VERBOSE==0:
                print(' average batch loss & accuracy at step {}: <{}, {}>'.format(step,
                                                                                   np.mean(curr_loss_track), 
                                                                                   np.mean(curr_accuracy_track)))
            if step%SAVE_PER==0:
                checkpoint_model(save_dir, save_path, saver, sess)
                print(' [SAVE] model checkpointed at step {}'.format(step))
        # Per-epoch summary. This sits inside the epoch loop so that every
        # epoch is reported, not only the last one.
        print('\n')
        print('  epoch mean loss & accuracy: <{}, {}>'.format(np.mean(curr_loss_track),np.mean(curr_accuracy_track)))
        print('\n')
except KeyboardInterrupt:
    print('Stopped!')