In [1]:
import sys
sys.path.insert(0, "/work/04233/sw33286/AIDA-SCRIPTS")
sys.path.insert(0, "/home/04233/sw33286/AIDA-package")

In [2]:
import os
import time
import random
import shutil
import dill
import numpy as np

import tensorflow as tf
from tensorflow.contrib.rnn import LSTMCell, MultiRNNCell, DropoutWrapper

from kmedoids import kMedoids
from scipy.spatial.distance import pdist, squareform
from sklearn.metrics import accuracy_score

from helpers import Indexer, batch
from itertools import chain, product
from collections import defaultdict

### Prepare data

In [3]:
# Link to NYT data folder

nyt_code_dir = "/work/04233/sw33286/AIDA-DATA/nyt_eng_salads_code/"
# os.listdir returns entries in arbitrary order; sorting keeps the
# index -> file mapping used by get_batch(file_idx) stable between runs.
FILE_NAMES = sorted(os.listdir(nyt_code_dir))
CTX_LEN = 500  # number of tokens kept (or padded to) for the context input

# Link to dictionary information

info_path = "/work/04233/sw33286/AIDA-DATA/nyt_eng_salads_info/indexer_word2emb_100k.p"
# NOTE: dill.load can execute arbitrary code while unpickling; only load trusted files.
# The context manager closes the file handle as soon as loading finishes.
with open(info_path, 'rb') as info_file:
    indexer100k, word2emb100k = dill.load(info_file)

# Embedding matrix whose row i is the vector of the word stored at index i of the indexer.
glove_embs = np.array([word2emb100k[indexer100k.get_object(i)]
                       for i in range(len(indexer100k))])
print(glove_embs.shape)

(100001, 300)


In [4]:
# Data batching

def batch_mixture(doc_a, doc_b, doc_mix, k=25):
    """Sample labelled sentence pairs from two documents plus one context row.

    Each of the `k` rounds draws one random sentence pair for every document
    pairing, in the order (a,a), (a,b), (b,a), (b,b). Same-document pairings
    are labelled 1 and cross-document pairings 0, so 4*k examples are produced.

    The sentences of `doc_mix` are flattened into a single token list which is
    truncated to CTX_LEN tokens, or right-padded with 0 up to CTX_LEN.

    Returns:
        A 4-tuple: batch(first sentences), batch(second sentences),
        the context wrapped as a one-row array, and the label array.
    """
    pairings = list(product([doc_a, doc_b], [doc_a, doc_b]))
    pair_labels = [1, 0, 0, 1]  # aligned with the order of `pairings`

    firsts, seconds, labels = [], [], []
    for _ in range(k):
        for (left_doc, right_doc), label in zip(pairings, pair_labels):
            firsts.append(random.choice(left_doc))
            seconds.append(random.choice(right_doc))
            labels.append(label)

    mix_tokens = list(chain.from_iterable(doc_mix))
    n_tokens = len(mix_tokens)
    if n_tokens >= CTX_LEN:
        ctx_tokens = mix_tokens[:CTX_LEN]
    else:
        ctx_tokens = mix_tokens + [0] * (CTX_LEN - n_tokens)
    ctx_row = np.array(ctx_tokens)

    return batch(firsts), batch(seconds), np.array([ctx_row]), np.array(labels)

def get_batch(file_idx):
    """Load one mixture file and turn it into a training batch.

    Args:
        file_idx: index into FILE_NAMES selecting the file to load.

    Returns:
        (batch_x1, batch_x1_len, batch_x2, batch_x2_len, batch_ctx, batch_y)
        as produced by batch_mixture for the loaded documents.
    """
    path = os.path.join(nyt_code_dir, FILE_NAMES[file_idx])
    # NOTE: dill.load can execute arbitrary code; only point this at trusted data files.
    # The context manager closes the handle, which matters when thousands of
    # files are read in one training run.
    with open(path, 'rb') as salad_file:
        doc_a, doc_b, doc_mix = dill.load(salad_file)
    ((batch_x1, batch_x1_len),
     (batch_x2, batch_x2_len),
     batch_ctx, batch_y) = batch_mixture(doc_a, doc_b, doc_mix)
    return batch_x1, batch_x1_len, batch_x2, batch_x2_len, batch_ctx, batch_y

### Bi-LSTM classifier

In [5]:
# Start from an empty default graph and open an interactive session on it.
tf.reset_default_graph()

sess = tf.InteractiveSession()

# Model hyperparameters.
# VOCAB_SIZE is defined here but not referenced again in the visible notebook.
VOCAB_SIZE = len(indexer100k)
EMB_SIZE = glove_embs.shape[1]
HID_SIZE = 100
NUM_LAYERS = 2
KEEP_PROB = 0.7
LEARNING_RATE = 1e-5

# hyperparams for cnn context reader.
FILTER_SIZES = [3,4,5]
NUM_FILTERS = 50 # finally make 150d context info
NUM_CHANNELS = 1

# Placeholders are given explicit names because ClfKM later looks them up by name
# ('input_x1:0', ...) in the restored graph.
input_x1 = tf.placeholder(tf.int32, [None, None], name='input_x1') # <max-time, batch-size>
input_x2 = tf.placeholder(tf.int32, [None, None], name='input_x2')
input_x1_length = tf.placeholder(tf.int32, [None], name='input_x1_length')
input_x2_length = tf.placeholder(tf.int32, [None], name='input_x2_length')
input_ctx = tf.placeholder(tf.int32, [1, CTX_LEN], name='input_ctx') # <batch-size, height=max-time>
input_y  = tf.placeholder(tf.int32, [None], name='input_y')

# Dropout keep probability; fed with KEEP_PROB during training and 1.0 in ClfKM.dist.
keep_prob = tf.placeholder(tf.float32, name="keep_prob")

with tf.variable_scope('Embeddings'):
    # The variable is created with a Xavier initializer; glove_init is only the
    # assign op that would copy the GloVe matrix into it, and has an effect
    # only when it is run in a session.
    embeddings = tf.get_variable('embeddings', glove_embs.shape, 
                                 initializer=tf.contrib.layers.xavier_initializer())
    glove_init = embeddings.assign(glove_embs)
    input_x1_embedded = tf.nn.embedding_lookup(embeddings, input_x1) # <max-time, batch-size, emb-size>
    input_x2_embedded = tf.nn.embedding_lookup(embeddings, input_x2)
    input_ctx_embedded = tf.expand_dims(tf.nn.embedding_lookup(embeddings, input_ctx), -1)
        # <batch-size, height=max-time, width=EMB_SIZE, num_channels=1>

# NOTE(review): the list multiplication repeats the *same* wrapped cell object
# NUM_LAYERS times, and that stack is later passed as both cell_fw and cell_bw.
# How TensorFlow treats a reused cell object differs between 1.x releases --
# confirm against the installed version before upgrading.
cell = MultiRNNCell([DropoutWrapper(LSTMCell(HID_SIZE),output_keep_prob=keep_prob)]*NUM_LAYERS) 

def run_lstm(cell, inputs, inputs_length):
    """Encode a time-major batch with a bidirectional RNN and return its final states.

    `cell` is used for both directions. For every layer of the final state the
    forward and backward hidden vectors (`.h`) are concatenated along axis 1,
    and the per-layer results are concatenated along axis 1 again, so the
    feature width grows with both the two directions and the number of layers.
    The per-timestep outputs are not used.
    """
    _outputs, final_states = tf.nn.bidirectional_dynamic_rnn(
        cell_fw=cell,
        cell_bw=cell,
        inputs=inputs,
        sequence_length=inputs_length,
        dtype=tf.float32,
        time_major=True,
    )
    fw_layers, bw_layers = final_states

    per_layer = []
    for fw_layer, bw_layer in zip(fw_layers, bw_layers):
        # One <batch-size, 2*hid-size> slice per stacked layer.
        per_layer.append(tf.concat([fw_layer.h, bw_layer.h], 1))
    return tf.concat(per_layer, 1)
    
# Encode both sentences inside one variable scope; after the first call the
# scope is switched to reuse mode so the second call picks up the same variables.
with tf.variable_scope('Bi-LSTM') as scope:
    final_state_x1 = run_lstm(cell, input_x1_embedded, input_x1_length)
    scope.reuse_variables() # both sentence inputs share the same weights.
    final_state_x2 = run_lstm(cell, input_x2_embedded, input_x2_length)

def run_cnn(inputs):
    """Summarise the embedded context with parallel convolutions and max-pooling.

    For every width in FILTER_SIZES a convolution spanning the full embedding
    width is applied with VALID padding, followed by bias, ReLU and a max-pool
    over all CTX_LEN-filter_size+1 positions. The pooled maps are concatenated
    on the channel axis, reshaped to rows of NUM_FILTERS*len(FILTER_SIZES)
    features, and passed through dropout driven by the `keep_prob` placeholder.
    """
    pooled = []
    for filter_size in FILTER_SIZES:
        with tf.variable_scope('CNN-ctx-%s' % filter_size):
            kernel_shape = [filter_size, EMB_SIZE, NUM_CHANNELS, NUM_FILTERS]
            W = tf.get_variable('W', kernel_shape,
                                initializer=tf.contrib.layers.xavier_initializer())
            b = tf.get_variable('b', [NUM_FILTERS],
                                initializer=tf.contrib.layers.xavier_initializer())
            conv = tf.nn.conv2d(inputs, W, strides=[1,1,1,1], padding='VALID', name='conv')
            activated = tf.nn.relu(tf.nn.bias_add(conv, b), name='relu')
            # Pool over every valid position so each filter yields one value.
            pooled.append(tf.nn.max_pool(activated,
                                         ksize=[1,CTX_LEN-filter_size+1,1,1],
                                         strides=[1,1,1,1],
                                         padding='VALID',
                                         name='pool'))
    total_filters = NUM_FILTERS * len(FILTER_SIZES)
    stacked = tf.concat(pooled, 3)
    flat = tf.reshape(stacked, [-1, total_filters])
    return tf.nn.dropout(flat, keep_prob)

# Dynamic number of sentence pairs in the batch, used to replicate the single context row.
bc, _ = tf.unstack(tf.shape(final_state_x1))
ctx = tf.tile(run_cnn(input_ctx_embedded), [bc, 1])
    # op1: <batch-size=1,total-num-filters>
    # op2: create batch-size copies of the context vec.

# Use NUM_LAYERS rather than a literal 2 so the width follows run_lstm,
# which concatenates one forward+backward state per stacked layer.
final_vec_size = HID_SIZE*2*NUM_LAYERS + NUM_FILTERS*len(FILTER_SIZES)
    # op1: bidirection=*2, stacked bi-lstm=*NUM_LAYERS.
    # op2: compute the total number of filters.
final_vec_x1 = tf.concat([final_state_x1, ctx], 1)
final_vec_x2 = tf.concat([final_state_x2, ctx], 1)
W_bi = tf.get_variable('W_bi', [final_vec_size, final_vec_size], initializer=tf.contrib.layers.xavier_initializer())

# Bilinear compatibility x1^T W x2 for each aligned pair (the diagonal of the
# all-pairs product). Kept as raw logits so the loss below receives unscaled values.
logits = tf.diag_part(tf.matmul(tf.matmul(final_vec_x1, W_bi), tf.transpose(final_vec_x2)), name='logits')
scores = tf.nn.sigmoid(logits, name='scores')
predictions = tf.cast(tf.round(scores), tf.int32, name='predictions')

with tf.name_scope('Loss'):
    # sigmoid_cross_entropy_with_logits applies the sigmoid itself, so it must be
    # fed `logits`; feeding `scores` would squash the values twice.
    losses = tf.nn.sigmoid_cross_entropy_with_logits(labels=tf.cast(input_y, tf.float32), logits=logits)
    loss = tf.reduce_mean(losses, name='loss')

with tf.name_scope('Accuracy'):
    correct_predictions = tf.equal(predictions, input_y)
    accuracy = tf.reduce_mean(tf.cast(correct_predictions, tf.float32), name='accuracy')

global_step = tf.Variable(0, name='global_step', trainable=False)
optimizer = tf.train.AdamOptimizer(LEARNING_RATE)
grads_and_vars = optimizer.compute_gradients(loss)
train_op = optimizer.apply_gradients(grads_and_vars, global_step=global_step, name='train_op')

sess.run(tf.global_variables_initializer())
# The initializer above fills `embeddings` with Xavier noise; run the assign op
# so the pretrained GloVe vectors actually become the starting embeddings.
sess.run(glove_init)

saver = tf.train.Saver()

In [1]:
NUM_EPOCHS = 1
TRAIN_SIZE = 10
# TRAIN_SIZE = len(FILE_NAMES)
VERBOSE = 1000   # print progress every VERBOSE optimisation steps
ERROR_LOG = []   # indices of files that could not be turned into a batch

loss_track, accuracy_track = [], []
start = time.time()
try:
    for e in range(NUM_EPOCHS):
        print('Epoch ', e+1)
        print('\n')
        # Sampling without replacement already yields the indices in random
        # order, so no further shuffling is needed.
        file_indices = np.random.choice(len(FILE_NAMES), size=TRAIN_SIZE, replace=False)
        curr_loss_track, curr_accuracy_track = [], []
        for file_idx in file_indices:
            try:
                batch_x1,batch_x1_length,batch_x2,batch_x2_length,batch_ctx,batch_y = get_batch(file_idx)
            except Exception:
                # Skip unreadable/malformed files but remember them. Catching
                # Exception (not a bare except) lets KeyboardInterrupt reach
                # the outer handler so training can still be stopped by hand.
                ERROR_LOG.append(file_idx)
                continue
            fd = {input_x1:batch_x1, input_x1_length:batch_x1_length,
                  input_x2:batch_x2, input_x2_length:batch_x2_length,
                  input_ctx:batch_ctx,
                  input_y:batch_y,
                  keep_prob:KEEP_PROB}
            _, step, loss_, accuracy_ = sess.run([train_op, global_step, loss, accuracy], feed_dict=fd)
            curr_loss_track.append(loss_)
            curr_accuracy_track.append(accuracy_)
            if step%VERBOSE==0:
                print('  batch loss & accuracy at step {}: <{}, {}> (time elapsed = {})'.format(step, loss_, accuracy_,
                                                                                                time.time()-start))
                start = time.time()  # elapsed time is reported per VERBOSE window
        print('\n')
        print('  epoch mean loss & accuracy: <{}, {}>'.format(np.mean(curr_loss_track),np.mean(curr_accuracy_track)))
        print('\n')
        loss_track += curr_loss_track
        accuracy_track += curr_accuracy_track
except KeyboardInterrupt:
    print('Stopped!')

### Model Saving

In [7]:
def remove_all_files(target_dir):
    """Delete every file (and symlink) directly inside `target_dir`.

    Subdirectories are left in place: os.remove cannot delete them, and
    failing on the first one would leave the directory only partly cleaned.

    Raises:
        OSError: if `target_dir` cannot be listed or a file cannot be removed.
    """
    for filename in os.listdir(target_dir):
        path = os.path.abspath(os.path.join(target_dir, filename))
        # islink also covers dangling symlinks, for which isfile is False.
        if os.path.isfile(path) or os.path.islink(path):
            os.remove(path)

In [8]:
# Checkpoint location; the directory is emptied first so only this run's files remain.
save_dir = "/work/04233/sw33286/AIDA-INDIV-MODEL-SAVE/our-model-with-context-kmedoids/"
save_path = os.path.join(save_dir, "our-model-with-context-00")
remove_all_files(save_dir)
# Last expression of the cell: displays the path returned by the saver.
saver.save(sess, save_path)

'/work/04233/sw33286/AIDA-INDIV-MODEL-SAVE/our-model-with-context-kmedoids/our-model-with-context-00'

### Bi-LSTM + k-medoids

In [11]:
# Evaluation functions

def get_rand_mixture():
    """Load a randomly chosen mixture file for evaluation.

    Returns:
        doc_mix: the mixed document as stored in the file (a sequence of sentences).
        doc_lbs: one label per sentence of doc_mix -- 0 if the sentence occurs
            in the file's first document, 1 otherwise.
        ctx: one-row array holding the flattened tokens of doc_mix, truncated
            to CTX_LEN or right-padded with 0 up to CTX_LEN.
    """
    path = os.path.join(nyt_code_dir, random.choice(FILE_NAMES))
    # NOTE: dill.load can execute arbitrary code; only point this at trusted data files.
    # The context manager closes the handle instead of leaking one per call.
    with open(path, 'rb') as salad_file:
        da, db, doc_mix = dill.load(salad_file)

    doc_lbs = [0 if sentcode in da else 1 for sentcode in doc_mix]

    doc_mix_flat = list(chain.from_iterable(doc_mix))
    doc_mix_len = len(doc_mix_flat)
    if doc_mix_len >= CTX_LEN:
        ctx_tokens = doc_mix_flat[:CTX_LEN]
    else:
        ctx_tokens = doc_mix_flat + [0] * (CTX_LEN - doc_mix_len)
    ctx = np.array([ctx_tokens])
    return doc_mix, doc_lbs, ctx

def to_labels(C, doc_len): # C: {cls:[datum_id, ...], ...}
    """Turn a cluster assignment into a flat 0/1 label list of length `doc_len`.

    Positions listed under cluster key 1 receive label 1; all others stay 0.
    """
    labels = [0 for _ in range(doc_len)]
    for member_idx in C[1]:
        labels[member_idx] = 1
    return labels

def flip_clust(clust):
    """Return the assignment with labels swapped: 1 becomes 0, anything else becomes 1."""
    flipped = []
    for label in clust:
        if label == 1:
            flipped.append(0)
        else:
            flipped.append(1)
    return np.array(flipped)

def clust_accuracy(true, pred):
    """Accuracy of a two-cluster assignment, ignoring which cluster is called 0 or 1.

    The prediction is scored both as given and with its labels flipped; the
    better of the two scores is returned.
    """
    as_given = accuracy_score(true, pred)
    swapped = accuracy_score(true, flip_clust(pred))
    return max(as_given, swapped)

def to_sent(code):
    """Map a sequence of indices to the objects stored for them in indexer100k."""
    words = []
    for idx in code:
        words.append(indexer100k.get_object(idx))
    return words

# Bi-LSTM + k-medoids class

class ClfKM:
    """Restored pairwise sentence classifier combined with k-medoids clustering.

    The network scores a pair of sentences given a document context; one minus
    that score is used as a pairwise distance, and k-medoids with two clusters
    splits the sentences of a mixed document.
    """

    def __init__(self, clf_dir, clf_filename):
        """Restore the saved graph and weights and look up the needed tensors by name.

        Args:
            clf_dir: directory that holds the checkpoint files.
            clf_filename: name of the `.meta` graph file inside `clf_dir`.
        """
        # Import into a private graph: the process default graph may already
        # contain the training graph with identical node names, and importing
        # on top of it would make the name lookups below ambiguous.
        self.graph = tf.Graph()
        with self.graph.as_default():
            saver = tf.train.import_meta_graph(os.path.join(clf_dir, clf_filename))
            self.sess = tf.Session(graph=self.graph)
            # Weights come from the most recent checkpoint recorded in clf_dir.
            saver.restore(self.sess, tf.train.latest_checkpoint(clf_dir))

        # Document context fed to every pairwise comparison; set by evaluate().
        self.ctx = None

        self.input_x1 = self.graph.get_tensor_by_name('input_x1:0')
        self.input_x2 = self.graph.get_tensor_by_name('input_x2:0')
        self.input_x1_length = self.graph.get_tensor_by_name('input_x1_length:0')
        self.input_x2_length = self.graph.get_tensor_by_name('input_x2_length:0')
        self.input_ctx = self.graph.get_tensor_by_name('input_ctx:0')
        self.input_y = self.graph.get_tensor_by_name('input_y:0')
        self.keep_prob = self.graph.get_tensor_by_name('keep_prob:0')

        self.scores = self.graph.get_tensor_by_name('scores:0')
        self.predictions = self.graph.get_tensor_by_name('predictions:0')
        self.loss = self.graph.get_tensor_by_name('Loss/loss:0')
        self.accuracy = self.graph.get_tensor_by_name('Accuracy/accuracy:0')
        self.global_step = self.graph.get_tensor_by_name('global_step:0')
        self.train_op = self.graph.get_tensor_by_name('train_op:0')

    def dist(self, x1, x2):
        """Distance between two sentences: one minus the classifier score for the pair.

        The context stored in `self.ctx` by `evaluate` is fed alongside the
        pair, and dropout is disabled (keep probability 1.0).
        """
        x1, x1_len = batch([x1])
        x2, x2_len = batch([x2])
        fd = {self.input_x1:x1, self.input_x1_length:x1_len,
              self.input_x2:x2, self.input_x2_length:x2_len,
              self.input_ctx:self.ctx,
              self.keep_prob:1.0}
        conf = self.sess.run(self.scores, feed_dict=fd)
        return 1-conf[0]

    def evaluate(self, doc_mix, doc_lbs, ctx, method='average', plot=True):
        """Cluster the sentences of `doc_mix` into two groups and score the split.

        Args:
            doc_mix: sequence of sentences (token-index sequences) to cluster.
            doc_lbs: true 0/1 label for each sentence of `doc_mix`.
            ctx: context array fed to the classifier for every comparison.
            method, plot: accepted for call compatibility; not used here.

        Returns:
            The clustering accuracy as computed by clust_accuracy.
        """
        self.ctx = ctx
        doc_mix_sq, _ = batch(doc_mix)
        # Transposed so that each row handed to pdist is one sentence.
        # NOTE(review): presumably batch() pads sentences to a common length, in
        # which case dist() re-batches padded rows and the lengths it feeds
        # include the padding -- confirm against helpers.batch.
        doc_mix_sq = doc_mix_sq.T
        distances = squareform(pdist(doc_mix_sq, metric=self.dist))
        _, doc_mix_clust = kMedoids(distances, 2)
        doc_prd = to_labels(doc_mix_clust, len(doc_mix))
        return clust_accuracy(doc_lbs, doc_prd)
        

In [12]:
# Rebuild the classifier from the checkpoint written by the saving cell.
restore_dir = "/work/04233/sw33286/AIDA-INDIV-MODEL-SAVE/our-model-with-context-kmedoids/"
restore_filename = "our-model-with-context-00.meta"
clf_km = ClfKM(clf_dir=restore_dir, clf_filename=restore_filename)

In [2]:
# Cluster one randomly drawn mixture and display its clustering accuracy.
clf_km.evaluate(*get_rand_mixture())

### Evaluation on subset

In [None]:
def rand_evaluation(k=100):
    """Print the mean clustering accuracy over `k` randomly drawn mixtures.

    Args:
        k: number of random mixtures to evaluate.
    """
    # evaluate() returns the accuracy as a single value, so it must not be
    # unpacked as a pair.
    accuracies = [clf_km.evaluate(*get_rand_mixture(), plot=False) for _ in range(k)]
    print('Average clustering accuracy over {} samples = {}'.format(k, np.mean(accuracies)))

In [None]:
%%time

# Report the mean clustering accuracy over the default number of random mixtures.
rand_evaluation()