### Self-attentive sentence-pair classifier

* Adapted from Lin et al. (2017) A Structured Self-attentive Sentence Embedding. ICLR17

In [1]:
import sys
sys.path.insert(0, "/work/04233/sw33286/AIDA-SCRIPTS")

In [2]:
import os
import random
import shutil
import numpy as np

import tensorflow as tf
from tensorflow.contrib.rnn import LSTMCell, MultiRNNCell, DropoutWrapper

from helpers import Indexer, batch
from itertools import chain, product

In [3]:
SHARED_SIZE = 2 # size of noise (or, common vocab for all types).

TYPES = ['ANIMAL','VEHICLE','NATURE','FURNITURE','FRUIT']
SHARED_VOCAB = ['share'+str(i+1) for i in range(SHARED_SIZE)]
TYPE2VOCAB = {'ANIMAL': ['cat','dog','pig','horse','deer']            + SHARED_VOCAB,
              'VEHICLE': ['car','bike','motorcycle','train','bus']    + SHARED_VOCAB,
              'NATURE': ['hill','mountain','lake','river','valley']   + SHARED_VOCAB,
              'FURNITURE': ['stool','table','closet','cabinet','bed'] + SHARED_VOCAB,
              'FRUIT': ['apple','pear','strawberry','grape','tomato'] + SHARED_VOCAB}
VOCAB = list(chain.from_iterable(TYPE2VOCAB.values()))

indexer = Indexer()
indexer.get_index('PAD')
for word in VOCAB:
    indexer.get_index(word)

In [4]:
NUM_WORDS = 5
NUM_EVENTS = 3
DOC_LEN = 4

def to_sent(code):
    return [[indexer.get_object(idx) for idx in event] for event in code]

def get_rand_event_code(sem_type):
    return [indexer.get_index(np.random.choice(TYPE2VOCAB[sem_type])) for _ in range(NUM_WORDS)]

def get_mixture(type1, type2):
    doc_a = [[get_rand_event_code(type1) for _ in range(NUM_EVENTS)] for _ in range(DOC_LEN)]
    doc_b = [[get_rand_event_code(type2) for _ in range(NUM_EVENTS)] for _ in range(DOC_LEN)]   
    doc_mix = np.array(doc_a[:] + doc_b[:])
    doc_lbs = np.array([0]*DOC_LEN + [1]*DOC_LEN)
    indices = list(range(DOC_LEN*2))
    random.shuffle(indices)
    doc_mix = doc_mix[indices]
    doc_lbs = doc_lbs[indices]
    return doc_a, doc_b, doc_mix, doc_lbs    

def batch_mixture(doc_a, doc_b, k):
    batch_x1, batch_x2, batch_y = [], [], []
    ys = [1,0,0,1]
    for _ in range(k):
        for i,(da,db) in enumerate(product([doc_a,doc_b],[doc_a,doc_b])):
            batch_x1.append(random.choice(da))
            batch_x2.append(random.choice(db))
            batch_y.append(ys[i])
    return np.array(batch_x1), np.array(batch_x2), np.array(batch_y)

### Self-attention with bi-linear final layer

In [5]:
tf.reset_default_graph()

sess = tf.InteractiveSession()

VOCAB_SIZE = len(indexer)
EMB_SIZE = 20
HID_SIZE = 10
CHN_SIZE = 2 # channel size. #attention channels.
KEEP_PROB = 0.7
LEARNING_RATE = 1e-4

with tf.device('/cpu:0'):
    
    input_x1 = tf.placeholder(tf.int32, [5, 3], name='input_x1') # <max-time=nw,batch-size=ne>
    input_x2 = tf.placeholder(tf.int32, [5, 3], name='input_x2')
    input_x1_length = tf.placeholder(tf.int32, [3], name='input_x1_length')
    input_x2_length = tf.placeholder(tf.int32, [3], name='input_x2_length')
    input_y  = tf.placeholder(tf.int32, [1], name='input_y')

    keep_prob = tf.placeholder(tf.float32, name="keep_prob")

    with tf.variable_scope('Embeddings'):
        embeddings = tf.get_variable('embeddings', [VOCAB_SIZE, EMB_SIZE], 
                                     initializer=tf.contrib.layers.xavier_initializer())
        input_x1_embedded = tf.nn.embedding_lookup(embeddings, input_x1) # <max-time, batch-size, emb-size>
        input_x2_embedded = tf.nn.embedding_lookup(embeddings, input_x2)

    cell = DropoutWrapper(LSTMCell(HID_SIZE),output_keep_prob=keep_prob)

    def run_lstm(cell, inputs, inputs_length): # lstm-out size *= 2 by bidirectionality.
        ((fw_outputs,bw_outputs), # <max-time, batch-size, hid-size>, attention later if needed.
         (fw_final_state,bw_final_state)) = ( # <batch-size, hid-size>
            tf.nn.bidirectional_dynamic_rnn(cell_fw=cell,
                                            cell_bw=cell,
                                            inputs=inputs,
                                            sequence_length=inputs_length,
                                            dtype=tf.float32, time_major=True)
        )
        return tf.concat([fw_outputs,bw_outputs], axis=2) # <max-time, batch-size, hid-size*2>

    with tf.variable_scope('Bi-LSTM') as scope:
        input_x1_hidden = tf.transpose(run_lstm(cell, input_x1_embedded, input_x1_length),[1,0,2]) 
            # op1. run_lstm output: <max-time, batch-size, hid-size*2>
            # op2. transpose: <batch-size=ne, max-time=nw, hid-size*2=2u>, for self attending.
        scope.reuse_variables()
        input_x2_hidden = tf.transpose(run_lstm(cell, input_x2_embedded, input_x2_length),[1,0,2])

    def run_self_attention(inputs): # inputs = <batch-size=ne, max-time=nw, hid-size*2=2u>
        W_s1 = tf.get_variable('W_s1', [2*HID_SIZE, HID_SIZE], initializer=tf.contrib.layers.xavier_initializer())
            # shape = <hid-size*2=2u, hid-size>
            # axis 1 doesn't have to be HID_SIZE, could be any d.
        W_s2 = tf.get_variable('W_s2', [HID_SIZE, CHN_SIZE], initializer=tf.contrib.layers.xavier_initializer())
        inputs_s1 = tf.tensordot(inputs, W_s1, axes=[[2],[0]]) # <batch-size, max-time=nw, hid-size=d>
        inputs_s2 = tf.tensordot(tf.nn.tanh(inputs_s1), W_s2, axes=[[2],[0]]) # <batch-size, max-time=nw, chn-size=r>
        return tf.nn.softmax(tf.transpose(inputs_s2, [0,2,1]), dim=-1)
            # op1. <batch-size, max-time=nw, chn-size=r> -> <batch-size, chn-size=r, max-time=nw>, for later attending.
            # op2. softmax along the n dimension (attention over component words).

    with tf.variable_scope('Self-attention') as scope:
        att_x1 = run_self_attention(input_x1_hidden) # <batch-size, chn-size, max-time=nw>
        scope.reuse_variables()
        att_x2 = run_self_attention(input_x2_hidden)
        # apply attention
        input_x1_att = tf.matrix_diag_part(tf.transpose(tf.tensordot(tf.transpose(input_x1_hidden, [0,2,1]), att_x1, 
                                                                     axes=[[2],[2]]),
                                                        [3,1,0,2]))
            # op1. <batch-size=ne, max-time=nw, hid-size*2=2u> -> <batch-size=ne, hid-size*2=2u, max-time=nw>
            # op2. result: <batch-size, hid-size*2=2u, batch-size, chn-size=r>
            # op3. transpose to <chn-size=r, hid-size*2=2u, batch-size, batch-size>
            # op4. match batch dim: take the diag part, get <chn-size=r, hid-size*2=2u, batch-size>
        input_x2_att = tf.matrix_diag_part(tf.transpose(tf.tensordot(tf.transpose(input_x2_hidden, [0,2,1]), att_x2, 
                                                                     axes=[[2],[2]]),
                                                        [3,1,0,2]))
            # Lin et al. (2017) also applies diagonal penalty, but this doesn't seem to be terribly effective.
            #   omitted here for now.
        final_vec_x1 = tf.reshape(tf.transpose(input_x1_att, [2,0,1]), [1, -1])
            # op1. transpose for computing loss: 
            #   <chn-size=r, hid-size*2=2u, batch-size> -> <batch-size, chn-size=r, hid-size*2=2u>
            # op2. concat attended information from all events in all channels (batch-size x chn-size x hid-size*2)
            #   <1, batch-sizechn-size*hid-size*2>
        final_vec_x2 = tf.reshape(tf.transpose(input_x2_att, [2,0,1]), [1, -1])

    with tf.variable_scope('Scores'):
        final_vec_size = NUM_EVENTS*HID_SIZE*2*CHN_SIZE 
            # multi-events = *num-events, bidirectional=*2, multi-channel attetion = *num-channels.
        W_bi = tf.get_variable('W_bi', [final_vec_size, final_vec_size], 
                               initializer=tf.contrib.layers.xavier_initializer())
        scores = tf.nn.sigmoid(tf.diag_part(tf.matmul(tf.matmul(final_vec_x1,W_bi),tf.transpose(final_vec_x2))),name='scores')
        predictions = tf.cast(tf.round(scores), tf.int32, name='predictions') 

    with tf.name_scope('Loss'):
        losses = tf.nn.sigmoid_cross_entropy_with_logits(labels=tf.cast(input_y, tf.float32), logits=scores)
        loss = tf.reduce_mean(losses, name='loss')

    with tf.name_scope('Accuracy'):
        correct_predictions = tf.equal(predictions, input_y)
        accuracy = tf.reduce_mean(tf.cast(correct_predictions, tf.float32), name='accuracy')

    global_step = tf.Variable(0, name='global_step', trainable=False)
    optimizer = tf.train.AdamOptimizer(LEARNING_RATE)
    grads_and_vars = optimizer.compute_gradients(loss)
    train_op = optimizer.apply_gradients(grads_and_vars, global_step=global_step, name='train_op')

sess.run(tf.global_variables_initializer())

# saver = tf.train.Saver()

In [6]:
NUM_EPOCHS = 1
NUM_BATCHES = 1000
VERBOSE = 1000

loss_track, accuracy_track = [], []
for e in range(NUM_EPOCHS):
    print('Epoch ', e+1)
    print('\n')
    curr_loss_track, curr_accuracy_track = [], []
    for _ in range(NUM_BATCHES):
        type1, type2 = np.random.choice(TYPES, 2, replace=False)
        doc_a, doc_b, _, _ = get_mixture(type1, type2)
        batch_x1, batch_x2, batch_y = batch_mixture(doc_a, doc_b, k=4)
        for x1,x2,y in zip(batch_x1,batch_x2,batch_y):
            x1,x1_len = batch(x1)
            x2,x2_len = batch(x2)
            fd = {input_x1:x1, input_x1_length:x1_len,
                  input_x2:x2, input_x2_length:x2_len,
                  input_y:[y],
                  keep_prob:KEEP_PROB}
            _, step, loss_, accuracy_ = sess.run([train_op, global_step, loss, accuracy], feed_dict=fd)
            curr_loss_track.append(loss_)
            curr_accuracy_track.append(accuracy_)
            if step%VERBOSE==0:
                print(' average batch loss & accuracy at step {}: <{}, {}>'.format(step, np.mean(curr_loss_track), 
                                                                                         np.mean(curr_accuracy_track)))
    print('\n')
    print('  epoch mean loss & accuracy: <{}, {}>'.format(np.mean(curr_loss_track),np.mean(curr_accuracy_track)))
    print('\n') 
    loss_track += curr_loss_track
    accuracy_track += curr_accuracy_track

Epoch  1


 average batch loss & accuracy at step 1000: <0.7224329710006714, 0.640999972820282>
 average batch loss & accuracy at step 2000: <0.7026016712188721, 0.7260000109672546>
 average batch loss & accuracy at step 3000: <0.6781991720199585, 0.7570000290870667>
 average batch loss & accuracy at step 4000: <0.6669455170631409, 0.765250027179718>
 average batch loss & accuracy at step 5000: <0.6612539291381836, 0.7648000121116638>
 average batch loss & accuracy at step 6000: <0.6513721346855164, 0.7821666598320007>
 average batch loss & accuracy at step 7000: <0.6416612863540649, 0.7965714335441589>
 average batch loss & accuracy at step 8000: <0.6319300532341003, 0.8106250166893005>
 average batch loss & accuracy at step 9000: <0.6259729862213135, 0.8183333277702332>
 average batch loss & accuracy at step 10000: <0.6202099323272705, 0.8260999917984009>
 average batch loss & accuracy at step 11000: <0.6164026260375977, 0.8309999704360962>
 average batch loss & accuracy at step 1200