### FFNN event-pair pretraining

* Naive feed-forward (FFNN) baseline from Weber et al. (2018), "Event Representations with Tensor-based Compositions", AAAI 2018.

* INPUT: a batch of events (5-tuples `<v,s,o,p,po>`), each with corresponding positive and negative instances (word2vec-style training).
* OUTPUT: encoded event representation (as vectors).
* PROC:
    * Look up word embeddings and concat embeddings within events. An event is thus a vector $e\in R^{5d_e}$
    * Put event embeddings through a feedforward layer for encoding
    $$v_e = e\cdot W$$
    where $W\in R^{5d_e\times h}$.
    * Train with a max-margin loss that pushes the similarity between the input event and its positive examples above its similarity to its negative examples by at least a margin $m$:
    $$\ell = \frac{1}{N}\sum_{i=1}^N \max\left(0,\; m + \texttt{sim}(v_{e_i}, v_{e_i^{neg}}) - \texttt{sim}(v_{e_i}, v_{e_i^{pos}})\right)$$
* COMMENTS
    * For this simple demo I use the dot product as the similarity measure rather than cosine similarity as in the paper.
    * The network is not L2-regularized as in the paper, but the loss term can be easily added.

In [1]:
import sys
# Put the project's script directory first on the module search path so that
# local modules can be imported by the cells below
# (presumably this is where `helpers` lives -- TODO confirm).
sys.path.insert(0, "/work/04233/sw33286/AIDA-SCRIPTS")

In [2]:
import os
import time
import random
import shutil
import dill
import numpy as np

import tensorflow as tf

from helpers import Indexer
from itertools import chain

### Prepare data

In [3]:
# Link to NYT data folder

nyt_code_dir = "/work/04233/sw33286/AIDA-DATA/nyt_eng_salads_event_sample_code/"
# os.listdir returns entries in arbitrary order; sort so that FILE_NAMES[i]
# refers to the same file on every run.
FILE_NAMES = sorted(os.listdir(nyt_code_dir))

# Link to dictionary information

info_path = "/work/04233/sw33286/AIDA-DATA/nyt_eng_salads_info/indexer_word2emb_100k.p"
# NOTE(review): dill.load executes arbitrary code from the file; only load trusted files.
with open(info_path, 'rb') as info_file: # close the handle once the objects are loaded
    indexer100k, word2emb100k = dill.load(info_file)

# Embedding matrix whose row i is the vector of the word with index i in the indexer.
glove_embs = np.array([word2emb100k[indexer100k.get_object(i)]
                       for i in range(len(indexer100k))])
print(glove_embs.shape)

(100001, 300)


In [8]:
BATCH_SIZE = 32 # number of anchor events per batch
CONTRA_BC = 10  # number of positive (and of negative) events drawn per anchor
NUM_WORDS = 5   # number of word slots in one event

def get_batch(edoc_a, edoc_b):
    """Sample one contrastive training batch from two event documents.

    Both documents are flattened one level (document -> list of events).
    Anchors are drawn alternately from document a and document b. For every
    anchor, CONTRA_BC positives are drawn from the anchor's own document and
    CONTRA_BC negatives from the other document. All draws are uniform with
    replacement via np.random.randint.

    Args:
        edoc_a: iterable of iterables of events (first document).
        edoc_b: iterable of iterables of events (second document).

    Returns:
        Tuple (batch_x, batch_pos, batch_neg) of numpy arrays holding
        2*(BATCH_SIZE//2) anchors, and per anchor CONTRA_BC positives and
        CONTRA_BC negatives.
    """
    events_a = list(chain.from_iterable(edoc_a))
    events_b = list(chain.from_iterable(edoc_b))

    def draw(events):
        # One uniformly sampled event.
        return events[np.random.randint(0, len(events))]

    def draw_many(events):
        # CONTRA_BC uniformly sampled events (with replacement).
        return [draw(events) for _ in range(CONTRA_BC)]

    anchors, positives, negatives = [], [], []
    for _ in range(BATCH_SIZE // 2):
        # The order of the draws is kept fixed so that a seeded run is reproducible.
        anchor_a = draw(events_a)
        anchor_b = draw(events_b)
        pos_for_a = draw_many(events_a)
        neg_for_a = draw_many(events_b)
        pos_for_b = draw_many(events_b)
        neg_for_b = draw_many(events_a)
        anchors.extend((anchor_a, anchor_b))
        positives.extend((pos_for_a, pos_for_b))
        negatives.extend((neg_for_a, neg_for_b))
    return np.array(anchors), np.array(positives), np.array(negatives)

In [5]:
# Example: batch shapes

# Load one document pair (the third stored object is not used here) and close
# the file handle as soon as the objects are read.
with open(os.path.join(nyt_code_dir, FILE_NAMES[0]), 'rb') as doc_file:
    edoc_a, edoc_b, _ = dill.load(doc_file)
a,b1,b2 = get_batch(edoc_a, edoc_b)
a.shape, b1.shape, b2.shape

((32, 5), (32, 10, 5), (32, 10, 5))

### FFNN event-pair sem-space learning

In [20]:
tf.reset_default_graph()

sess = tf.InteractiveSession()

VOCAB_SIZE, EMB_SIZE = glove_embs.shape
HID_SIZE = 100 # let event embs be of the same hid-size as role-factored arg vectors.

LEARNING_RATE = 1e-4
MARGIN = 1. # margin m of the max-margin loss

# for debugging, explicit batch-size:
inputs = tf.placeholder(tf.int32, [BATCH_SIZE, NUM_WORDS], name='inputs') # <bc,nw-in-event>
inputs_pos = tf.placeholder(tf.int32, [BATCH_SIZE, CONTRA_BC, NUM_WORDS], name='inputs_pos') # <bc,ctr-bc,nw-in-event>
inputs_neg = tf.placeholder(tf.int32, [BATCH_SIZE, CONTRA_BC, NUM_WORDS], name='inputs_neg')

with tf.variable_scope('Embedding'):
    embeddings = tf.get_variable('embedding', [VOCAB_SIZE, EMB_SIZE],
                                 initializer=tf.contrib.layers.xavier_initializer())
    # Op that overwrites the (Xavier-initialized) table with the pretrained vectors.
    # It only takes effect when it is run in the session (done after variable init below).
    glove_init = embeddings.assign(glove_embs)

with tf.variable_scope('Concat-encode'):
    W = tf.get_variable('W', [NUM_WORDS*EMB_SIZE, HID_SIZE],
                        initializer=tf.contrib.layers.xavier_initializer())

def encode_events(inputs_):
    """Encode a 2-D batch of events <n,nw> of word ids into event vectors <n,hid>."""
    bc,_ = tf.unstack(tf.shape(inputs_))
    embedded = tf.reshape(tf.nn.embedding_lookup(embeddings, inputs_), [bc,-1])
        # op1. lookup: <bc,nw,emb>
        # op2. concat word embs in event: <bc,nw*emb>
    return tf.matmul(embedded, W) # <bc,nw*emb> * <nw*emb,hid> -> <bc,hid>

inputs_encoded = encode_events(inputs) # <bc,hid>
# map_fn iterates over the batch axis; each element <ctr,nw> is encoded to <ctr,hid>.
inputs_pos_encoded = tf.map_fn(encode_events, inputs_pos, dtype=tf.float32) # <bc,ctr,hid>
inputs_neg_encoded = tf.map_fn(encode_events, inputs_neg, dtype=tf.float32)

with tf.variable_scope('Encode'):
    predictions = tf.identity(inputs_encoded, name='predictions') # <bc,hid> event embeddings.

with tf.variable_scope('Loss'):
    def dot_with_anchor(contra_encoded):
        """Dot product of each contrastive event with the anchor of its own batch row."""
        # op1. anchors <bc,hid> -> <bc,1,hid>, broadcast over the ctr axis
        # op2. elementwise product summed over hid: <bc,ctr>
        # (computes only the matching-row dot products instead of building a
        #  <bc,ctr,bc> tensor and keeping its diagonal)
        return tf.reduce_sum(contra_encoded * tf.expand_dims(inputs_encoded, 1), axis=2)

    sim_pos = dot_with_anchor(inputs_pos_encoded) # <bc,ctr>
    sim_neg = dot_with_anchor(inputs_neg_encoded) # <bc,ctr>
    loss = tf.reduce_mean(tf.maximum(0., MARGIN + sim_neg - sim_pos))
        # op1. max(0, m + sim_neg - sim_pos), <bc,ctr>
        # op2. average over contra instances and batch (every row has the same
        #      number of contra instances, so this equals the mean of per-row means)

global_step = tf.Variable(0, name='global_step', trainable=False)
optimizer = tf.train.AdamOptimizer(LEARNING_RATE)
grads_and_vars = optimizer.compute_gradients(loss)
train_op = optimizer.apply_gradients(grads_and_vars, global_step=global_step, name='train_op')

sess.run(tf.global_variables_initializer())
# Load the pretrained vectors into the embedding table. Without running this op
# the table keeps its random Xavier initialization.
sess.run(glove_init)

saver = tf.train.Saver()

In [31]:
NUM_EPOCHS = 1
TRAIN_SIZE = 10 # number of files (one batch each) per epoch
VERBOSE = 1     # print the running loss every VERBOSE steps
# Full run: TRAIN_SIZE = len(FILE_NAMES), VERBOSE = 10

loss_track = [] # loss of every training step; kept up to date even if training is interrupted
try:
    for e in range(NUM_EPOCHS):
        print('Epoch ', e+1)
        print('\n')
        curr_loss_track = []
        # Sampling without replacement already returns the indices in random order,
        # so no extra shuffle is needed. The size is capped at the number of
        # available files so that a large TRAIN_SIZE does not raise.
        n_files = min(TRAIN_SIZE, len(FILE_NAMES))
        file_indices = np.random.choice(len(FILE_NAMES), size=n_files, replace=False)
        for file_idx in file_indices:
            # Close each data file right after loading instead of leaking one handle per step.
            with open(os.path.join(nyt_code_dir, FILE_NAMES[file_idx]), 'rb') as doc_file:
                edoc_a, edoc_b, _ = dill.load(doc_file) # context not added
            batch_x, batch_pos, batch_neg = get_batch(edoc_a, edoc_b)
            fd = {inputs:batch_x, inputs_pos:batch_pos, inputs_neg:batch_neg}
            _, step, loss_ = sess.run([train_op, global_step, loss], feed_dict=fd)
            curr_loss_track.append(loss_)
            loss_track.append(loss_)
            if step%VERBOSE==0:
                print(' average batch loss at step {}: <{}>'.format(step, np.mean(curr_loss_track)))
        print('\n')
        print('  epoch mean loss: <{}>'.format(np.mean(curr_loss_track)))
        print('\n')
except KeyboardInterrupt:
    print('Stopped!')

### Model saving

In [24]:
def remove_all_files(target_dir):
    """Delete every file directly inside `target_dir`.

    Only regular files and symbolic links are removed. Subdirectories (and
    their contents) are left in place, so a nested directory no longer makes
    the function fail halfway through after some files were already deleted.

    Args:
        target_dir: path of the directory to empty.

    Raises:
        OSError: if `target_dir` cannot be listed (e.g. it does not exist) or a
            file cannot be removed.
    """
    for filename in os.listdir(target_dir):
        path = os.path.join(target_dir, filename)
        # isfile() follows links; islink() also catches links that point to
        # directories or are dangling, which os.remove can delete as well.
        if os.path.isfile(path) or os.path.islink(path):
            os.remove(path)

In [25]:
save_dir = "/work/04233/sw33286/AIDA-INDIV-MODEL-SAVE/our-model-ffnn-pretrain-sample/"
save_path = save_dir + "our-model-ffnn-pretrain-sample-00"
# Make sure the directory exists (first run), then clear files left by an
# earlier save so that only the files of this save remain in it.
os.makedirs(save_dir, exist_ok=True)
remove_all_files(save_dir)
saver.save(sess, save_path)

'/work/04233/sw33286/AIDA-INDIV-MODEL-SAVE/our-model-ffnn-pretrain-sample/our-model-ffnn-pretrain-sample-00'

### Model reading

In [27]:
# import tensorflow as tf

class BasicFeedForward:
    """Event encoder restored from a saved TensorFlow meta graph and checkpoint.

    basic:
      - not l2-regularized or dropout-regularized.
      - dot product instead of cosine.

    The model is loaded into its own tf.Graph, so its tensor names cannot clash
    with whatever is already in the process-wide default graph (e.g. the
    training graph built earlier in the same kernel).
    """

    def __init__(self, rf_dir, rf_filename):
        """Restore the model.

        Args:
            rf_dir: directory containing the saved model files.
            rf_filename: name of the `.meta` file inside `rf_dir`.

        Raises:
            ValueError: if no checkpoint is registered in `rf_dir`.
        """
        self.graph = tf.Graph()
        with self.graph.as_default():
            self.sess = tf.Session(graph=self.graph)
            saver = tf.train.import_meta_graph(os.path.join(rf_dir, rf_filename))
            checkpoint = tf.train.latest_checkpoint(rf_dir)
            if checkpoint is None:
                # Fail with a clear message and do not leak the session.
                self.sess.close()
                raise ValueError("No checkpoint found in directory: {}".format(rf_dir))
            saver.restore(self.sess, checkpoint)
        self.inputs = self.graph.get_tensor_by_name('inputs:0') # <bc,nw-in-event=5>
        self.predictions = self.graph.get_tensor_by_name('Encode/predictions:0') # <bc,hid>
            # hid=100 hard coded for now.

    def embed_batch(self, batch_events):
        """Return the event embeddings <bc,hid> for a batch of events <bc,nw-in-event>.

        `batch_events` must match the shape of the saved 'inputs' placeholder.
        """
        return self.sess.run(self.predictions, feed_dict={self.inputs:batch_events})

    def close(self):
        """Release the TensorFlow session held by this object."""
        self.sess.close()

In [28]:
# Directory and meta-graph file of the saved model to load.
restore_dir = "/work/04233/sw33286/AIDA-INDIV-MODEL-SAVE/our-model-ffnn-pretrain-sample/"
restore_filename = "our-model-ffnn-pretrain-sample-00.meta"
ffnn = BasicFeedForward(rf_dir=restore_dir,
                        rf_filename=restore_filename)

In [29]:
# Example: batch shapes

# Load one document pair and close the file handle as soon as the objects are read.
with open(os.path.join(nyt_code_dir, FILE_NAMES[0]), 'rb') as doc_file:
    edoc_a, edoc_b, _ = dill.load(doc_file)
a = get_batch(edoc_a, edoc_b)[0] # only want <bc,nw-in-event=5> output
print("Input event batch shape: {} x {}".format(*a.shape))
b = ffnn.embed_batch(a)
print("Output event batch shape: {} x {}".format(*b.shape))

Input event batch shape: 32 x 5
Output event batch shape: 32 x 100
