## Preprocessing

In [17]:
from __future__ import division

import random
import bz2
import cPickle, dill
import numpy as np
import pandas as pd
import tensorflow as tf

from collections import defaultdict, Counter
from copy import deepcopy

In [2]:
# Helpers

class Indexer(object):
    """Two-way table between hashable objects and dense integer ids.

    Ids are handed out in insertion order, starting at 0. Unknown objects map
    to -1 and unknown ids map to None.
    """

    def __init__(self):
        self.objs_to_ints = {}
        self.ints_to_objs = {}

    def __repr__(self):
        # List the stored objects as strings, in id order.
        names = []
        for idx in range(len(self)):
            names.append(str(self.get_object(idx)))
        return str(names)

    def __len__(self):
        return len(self.objs_to_ints)

    def get_object(self, index):
        """Return the object registered under `index`, or None if there is none."""
        return self.ints_to_objs.get(index)

    def contains(self, object):
        """Tell whether `object` has already been given an id."""
        position = self.index_of(object)
        return position != -1

    def index_of(self, object):
        """Return the id of `object`, or -1 when it was never registered."""
        return self.objs_to_ints.get(object, -1)

    def get_index(self, object, add=True):
        """Return the id of `object`.

        With a truthy `add`, an unseen object is first registered under the
        next free id. With a falsy `add`, an unseen object yields -1 and the
        table is left unchanged.
        """
        if add and object not in self.objs_to_ints:
            fresh_id = len(self.objs_to_ints)
            self.objs_to_ints[object] = fresh_id
            self.ints_to_objs[fresh_id] = object
        return self.index_of(object)

In [138]:
# Load triples

# Directory holding the triple files. The trailing slash matters: file names
# are appended to this string by plain concatenation further down.
data_dir = '/home/jacobsuwang/Documents/UTA2017/PROPERTIES/AMT/TRIPLES/'

# Integer id tables: one shared by subject and object nouns, one for verbs.
n_indexer = Indexer()
v_indexer = Indexer()

# Hand-written replacements applied to every word read from the data files.
manual_correct = {
    'monkeu': 'monkey',
    'shir': 'shirt',
    'moneky': 'monkey',
    'egges': 'egg',
    'grap': 'grape',
    'woool': 'wool',
    'chelf': 'chef',
    'gril': 'girl',
    'dophin': 'dolphin',
    'iar': 'air',
    'bowling-bell': 'bowling',
    'milke': 'milk',
    'knie': 'knife',
    'skyscaper': 'skyscraper',
    'borse': 'horse',
    'blook': 'blood',
    'dew': 'sew',
    'spetula': 'spatula',
    'skycraper': 'skyscraper',
    'bowling-ball': 'bowling',
    'airoplane': 'aeroplane',
}
def correct_word(word):
    """Return the replacement listed for `word`, or `word` itself if none is listed."""
    return manual_correct.get(word, word)

# (subject, verb, object) -> label
svo2plau = dict()
def read_from_triples(path, val, svo2plau):
    """Load whitespace-separated triples from `path` and label each with `val`.

    Lines that do not split into exactly three tokens are skipped. Every token
    goes through correct_word() first; the verb is then registered in
    v_indexer and both nouns in n_indexer. An existing entry for the same
    triple in `svo2plau` is overwritten.
    """
    with open(path, 'r') as f:
        for raw in f:
            tokens = raw.strip().split()
            if len(tokens) == 3:
                s = correct_word(tokens[0])
                v = correct_word(tokens[1])
                o = correct_word(tokens[2])
                # Registration order (verb, subject, object) fixes the ids.
                v_indexer.get_index(v)
                n_indexer.get_index(s)
                n_indexer.get_index(o)
                svo2plau[(s, v, o)] = val
# Triples from pos-all.txt get label 1, triples from neg-all.txt get label 0.
# Both calls write into the same dict, so a triple that appears in both files
# keeps the label of neg-all.txt, which is read last.
read_from_triples(data_dir+'pos-all.txt', 1, svo2plau)
read_from_triples(data_dir+'neg-all.txt', 0, svo2plau)

# Summary of what was loaded.
print '#triples = ' + repr(len(svo2plau))
print '#nouns = ' + repr(len(n_indexer))
print '#verbs = ' + repr(len(v_indexer))

#triples = 3062
#nouns = 435
#verbs = 156


In [139]:
# Load features

# Excel workbook with the noun annotations; each name in FEAT_LIST is used as
# a sheet name when the workbook is read below.
feat_path = '/home/jacobsuwang/Documents/UTA2017/PROPERTIES/AMT/FEATURES/nouns.xlsx'

FEAT_LIST = ['SIZE','WEIGHT','RIGIDITY','SENTIENCE','MASS-COUNT','PHASE']

# Id tables for '<FEATURE>:<value>' strings: one for the raw level difference,
# one for the difference collapsed to -1 / 0 / 1.
featbindiff_indexer = Indexer()
feat3lvdiff_indexer = Indexer()

# noun -> {feature type -> level index}. Looking up an unknown noun creates an
# empty inner dict.
noun2feat2val = defaultdict(dict)
def read_feat(path, feat_type, noun2feat2val, df=None):
    """Store every noun's level for one feature type in `noun2feat2val`.

    Args:
        path: Excel workbook to read when `df` is not supplied.
        feat_type: sheet name; also the key written into each noun's dict.
        noun2feat2val: mapping noun -> {feat_type -> level}, updated in place.
        df: optional, already-parsed sheet. Passing it avoids parsing the
            same sheet a second time.

    The level of a noun is the position of the largest value among the
    columns after the first one (assumes the first column is 'WORDS' --
    TODO confirm against the workbook). Nouns are normalised with
    correct_word() before being stored.
    """
    if df is None:
        # The sheet is passed positionally because the keyword was renamed
        # from `sheetname` to `sheet_name` across pandas releases.
        df = pd.read_excel(path, feat_type)
    for i in range(len(df)):
        entry = df.iloc[i]
        noun = str(entry['WORDS'])
        feat_val = np.argmax(list(entry)[1:])
        noun2feat2val[correct_word(noun)][feat_type] = feat_val

for feat_type in FEAT_LIST:
    # Parse each sheet once and share it with read_feat().
    df = pd.read_excel(feat_path, feat_type)
    read_feat(feat_path, feat_type, noun2feat2val, df=df)
    num_feat = len(df.columns[1:])
    # With num_feat levels, a subject-minus-object difference lies in
    # [-(num_feat-1), num_feat-1]; register all of them up front.
    for diff in range(-(num_feat-1), num_feat):
        featbindiff_indexer.get_index(feat_type+':'+str(diff))
    for lv in range(-1, 2):
        feat3lvdiff_indexer.get_index(feat_type+':'+str(lv))


In [5]:
# Load glove

glove_path = '/home/jacobsuwang/Documents/UTA2017/PROPERTIES/DATA4/glove.6B.300d.txt'

# word -> 1-D float64 vector parsed from one line of the GloVe text file.
word2emb = {}

with open(glove_path, 'r') as f:
    for line in f:
        fields = line.split()
        if not fields:
            # Skip blank lines instead of failing on fields[0].
            continue
        # Let NumPy parse the numeric tokens itself. This avoids the removed
        # `np.float` alias and does not rely on map() returning a list.
        word2emb[fields[0]] = np.asarray(fields[1:], dtype=np.float64)

# Single-argument print(...) prints the same text under the Python 2 print
# statement and under Python 3.
print('vocab size = ' + repr(len(word2emb)))

vocab size = 400000


In [69]:
# Show the registered '<FEATURE>:<difference>' names in id order (Indexer.__repr__).
featbindiff_indexer

['SIZE:-6', 'SIZE:-5', 'SIZE:-4', 'SIZE:-3', 'SIZE:-2', 'SIZE:-1', 'SIZE:0', 'SIZE:1', 'SIZE:2', 'SIZE:3', 'SIZE:4', 'SIZE:5', 'SIZE:6', 'WEIGHT:-6', 'WEIGHT:-5', 'WEIGHT:-4', 'WEIGHT:-3', 'WEIGHT:-2', 'WEIGHT:-1', 'WEIGHT:0', 'WEIGHT:1', 'WEIGHT:2', 'WEIGHT:3', 'WEIGHT:4', 'WEIGHT:5', 'WEIGHT:6', 'RIGIDITY:-4', 'RIGIDITY:-3', 'RIGIDITY:-2', 'RIGIDITY:-1', 'RIGIDITY:0', 'RIGIDITY:1', 'RIGIDITY:2', 'RIGIDITY:3', 'RIGIDITY:4', 'SENTIENCE:-5', 'SENTIENCE:-4', 'SENTIENCE:-3', 'SENTIENCE:-2', 'SENTIENCE:-1', 'SENTIENCE:0', 'SENTIENCE:1', 'SENTIENCE:2', 'SENTIENCE:3', 'SENTIENCE:4', 'SENTIENCE:5', 'MASS-COUNT:-3', 'MASS-COUNT:-2', 'MASS-COUNT:-1', 'MASS-COUNT:0', 'MASS-COUNT:1', 'MASS-COUNT:2', 'MASS-COUNT:3', 'PHASE:-2', 'PHASE:-1', 'PHASE:0', 'PHASE:1', 'PHASE:2']

In [82]:
# Helpers for getting features and embs

def get_glove(word):
    """Return the vector stored for `word` in word2emb, or 300 zeros if absent."""
    try:
        return word2emb[word]
    except KeyError:
        return np.zeros(300)

def to_3lv_feat(diff):
    """Collapse a difference to three levels: 1 if positive, 0 if zero, else -1."""
    if diff > 0:
        return 1
    return 0 if diff == 0 else -1

def get_features(s, o, scheme='bin'):
    """Return one feature id per entry of FEAT_LIST for the noun pair (s, o).

    For each feature type the subject's level minus the object's level is
    taken. Under 'bin' the raw difference is looked up in
    featbindiff_indexer; under '3lv' the difference is first collapsed by
    to_3lv_feat() and looked up in feat3lvdiff_indexer. Lookups never add
    entries, so an unregistered name yields -1.
    """
    assert scheme in ['bin','3lv']
    use_bin = (scheme == 'bin')
    indexer = featbindiff_indexer if use_bin else feat3lvdiff_indexer
    so_fts = []
    for feat_type in FEAT_LIST:
        diff = noun2feat2val[s][feat_type] - noun2feat2val[o][feat_type]
        level = diff if use_bin else to_3lv_feat(diff)
        so_fts.append(indexer.get_index(feat_type+':'+str(level), add=0))
    return so_fts
        

In [179]:
# Format input

# Feature encoding handed to get_features(): 'bin' or '3lv'.
scheme = 'bin'

# One entry per triple, kept in the same order across all four containers.
X_so, X_v, X_gl, Y = [], [], [], []
for (s, v, o), plau in svo2plau.items():
    X_so.append(get_features(s, o, scheme))
    X_v.append(v_indexer.get_index(v, add=0))
    # Subject, verb and object vectors laid end to end.
    X_gl.append(np.hstack((get_glove(s), get_glove(v), get_glove(o))))
    Y.append(plau)

X_so = np.asarray(X_so)
X_v = np.asarray(X_v).reshape(-1, 1)   # column vector of verb ids
X_gl = np.asarray(X_gl)
Y = np.asarray(Y)

class DataIteratorEmb:
    """Mini-batch iterator over embedding inputs and labels.

    The data is copied on construction and shuffled immediately. Whenever a
    requested batch would run past the end, the remaining rows are dropped,
    `epoch` is incremented and the data is reshuffled before the batch is cut.
    """

    def __init__(self, X_gl, Y):
        # Private copies: shuffling must not touch the caller's arrays.
        self.X_gl, self.Y = deepcopy(X_gl), deepcopy(Y)
        self.size = len(X_gl)
        self.indices = np.arange(self.size)
        self.epoch, self.cursor = 0, 0
        self.shuffle()

    def shuffle(self):
        """Apply one random permutation to inputs and labels; rewind the cursor."""
        random.shuffle(self.indices)
        order = self.indices
        self.X_gl, self.Y = self.X_gl[order], self.Y[order]
        self.cursor = 0

    def next_batch(self, n):
        """Return the next `n` rows as (X_gl_batch, Y_batch)."""
        if self.size < self.cursor + n:
            self.epoch += 1
            self.shuffle()
        window = slice(self.cursor, self.cursor + n)
        self.cursor += n
        return self.X_gl[window], self.Y[window]
    

class DataIteratorEmbWK:
    """Mini-batch iterator over four aligned arrays: X_so, X_v, X_gl and Y.

    The data is copied on construction and shuffled immediately. Whenever a
    requested batch would run past the end, the remaining rows are dropped,
    `epoch` is incremented and the data is reshuffled before the batch is cut.
    """

    def __init__(self, X_so, X_v, X_gl, Y):
        # Private copies: shuffling must not touch the caller's arrays.
        self.X_so, self.X_v = deepcopy(X_so), deepcopy(X_v)
        self.X_gl, self.Y = deepcopy(X_gl), deepcopy(Y)
        self.size = len(X_so)
        self.indices = np.arange(self.size)
        self.epoch, self.cursor = 0, 0
        self.shuffle()

    def shuffle(self):
        """Apply one random permutation to all four arrays; rewind the cursor."""
        random.shuffle(self.indices)
        order = self.indices
        self.X_so, self.X_v = self.X_so[order], self.X_v[order]
        self.X_gl, self.Y = self.X_gl[order], self.Y[order]
        self.cursor = 0

    def next_batch(self, n):
        """Return the next `n` rows as (X_so_batch, X_v_batch, X_gl_batch, Y_batch)."""
        if self.size < self.cursor + n:
            self.epoch += 1
            self.shuffle()
        window = slice(self.cursor, self.cursor + n)
        self.cursor += n
        return (self.X_so[window], self.X_v[window],
                self.X_gl[window], self.Y[window])
    

### Embedding only

In [178]:
# Data split and optimisation schedule.
train_ratio = 0.9
num_epochs = 20
batch_size = 20

# Layer sizes.
emb_size = 10
hidden_size = 10
glove_size = 900
num_classes = 2

init_lr = 1e-4

def get_feat_size(scheme):
    """Return the size of the indexer for `scheme`: the 'bin' one, else the 3-level one."""
    if scheme == 'bin':
        return len(featbindiff_indexer)
    return len(feat3lvdiff_indexer)

tf.reset_default_graph()

# Input: one row of `glove_size` floats per example.
x_gl = tf.placeholder(tf.float32, [None, glove_size])

# Hidden layer.
W1 = tf.get_variable('W1', [glove_size, hidden_size],
                     initializer=tf.contrib.layers.xavier_initializer())
b1 = tf.get_variable('b1', [hidden_size],
                     initializer=tf.contrib.layers.xavier_initializer())
z = tf.nn.relu(tf.add(tf.matmul(x_gl, W1), b1))

# Output layer.
W2 = tf.get_variable('W2', [hidden_size, num_classes],
                     initializer=tf.contrib.layers.xavier_initializer())
b2 = tf.get_variable('b2', [num_classes],
                     initializer=tf.contrib.layers.xavier_initializer())

logits = tf.add(tf.matmul(z, W2), b2)
probs = tf.nn.softmax(logits)
preds = tf.cast(tf.argmax(probs, 1), dtype=tf.int32)

y = tf.placeholder(tf.int32, [None])
# No longer used by the loss; kept so the name stays available.
y_onehot = tf.one_hot(y, num_classes)

# Cross-entropy computed from the logits. Same quantity as
# -log(probs[label]), but without taking log of a softmax output that may
# have underflowed to 0.
loss = tf.reduce_mean(
    tf.nn.sparse_softmax_cross_entropy_with_logits(labels=y, logits=logits))

# Number of correct predictions in the fed batch (a count, not a ratio).
correct = tf.reduce_sum(tf.cast(tf.equal(preds, y), dtype=tf.int32))

# Learning rate is multiplied by 0.95 every 100 optimiser steps.
decay_steps = 100
learning_rate_decay_factor = 0.95
global_step = tf.contrib.framework.get_or_create_global_step()
initial_learning_rate = init_lr
lr = tf.train.exponential_decay(initial_learning_rate,
                                global_step,
                                decay_steps,
                                learning_rate_decay_factor,
                                staircase=True)
# optimizer setup
opt = tf.train.AdamOptimizer(lr)
grads = opt.compute_gradients(loss)
apply_gradient_op = opt.apply_gradients(grads, global_step=global_step)
with tf.control_dependencies([apply_gradient_op]):
    train_op = tf.no_op(name='train')
# run
init = tf.global_variables_initializer()
cutoff = int(len(svo2plau)*train_ratio)
init_indices = np.arange(len(svo2plau))
random.shuffle(init_indices)
# Select rows through the permutation instead of overwriting X_gl and Y.
# Overwriting them here would leave X_so and X_v in the old order, so any
# later cell combining all four arrays would pair features with the wrong
# labels. It would also make this cell non-repeatable.
train_idx, test_idx = init_indices[:cutoff], init_indices[cutoff:]
X_gl_train, Y_train = X_gl[train_idx], Y[train_idx]
X_gl_test, Y_test = X_gl[test_idx], Y[test_idx]

# Train for `num_epochs` passes, then report accuracy on both splits.
# Single-argument print(...) prints the same text under the Python 2 print
# statement and under Python 3.
with tf.Session() as sess:
    sess.run(init)
    train_iter = DataIteratorEmb(X_gl_train, Y_train)
    cur_epoch = train_iter.epoch
    cur_loss = 0.0
    while train_iter.epoch < num_epochs:
        batch_x_gl, batch_y = train_iter.next_batch(batch_size)
        feed = {x_gl:batch_x_gl, y:batch_y}
        [_, batch_loss] = sess.run([train_op, loss],
                                   feed_dict=feed)
        # The reported figure is the SUM of batch losses. It includes the
        # batch on which the iterator rolled over to the next epoch.
        cur_loss += batch_loss
        if cur_epoch<train_iter.epoch:
            print('Epoch %d loss = %.2f' % (cur_epoch, cur_loss))
            cur_epoch = train_iter.epoch
            cur_loss = 0
    print('')

    # Eval on train
    # Divide by the arrays this cell actually built; X_so_train / X_so_test
    # are not defined here.
    num_correct = sess.run(correct, feed_dict={x_gl:X_gl_train, y:Y_train})
    print("Accuracy on train: " + repr(num_correct/float(len(X_gl_train))))

    # Eval on test
    num_correct = sess.run(correct, feed_dict={x_gl:X_gl_test, y:Y_test})
    print("Accuracy on test: " + repr(num_correct/float(len(X_gl_test))))
            

Epoch 0 loss = 98.66
Epoch 1 loss = 92.50
Epoch 2 loss = 88.92
Epoch 3 loss = 86.03
Epoch 4 loss = 82.91
Epoch 5 loss = 80.87
Epoch 6 loss = 78.47
Epoch 7 loss = 77.13
Epoch 8 loss = 75.42
Epoch 9 loss = 74.76
Epoch 10 loss = 73.55
Epoch 11 loss = 72.79
Epoch 12 loss = 71.69
Epoch 13 loss = 71.35
Epoch 14 loss = 70.44
Epoch 15 loss = 69.97
Epoch 16 loss = 69.65
Epoch 17 loss = 69.08
Epoch 18 loss = 68.49
Epoch 19 loss = 68.36

Accuracy on train: 0.77241379310344827
Accuracy on train: 0.68078175895765469


### Embedding + WK

In [200]:
# Data split and optimisation schedule.
train_ratio = 0.9
num_epochs = 20
batch_size = 20

# Layer sizes.
emb_size = 10
hidden_size = 10
glove_size = 900
num_classes = 2

init_lr = 1e-4
keep = 1.0   # fed to the keep_prob placeholder while training

def get_feat_size(scheme):
    """Return the size of the indexer for `scheme`: the 'bin' one, else the 3-level one."""
    if scheme == 'bin':
        return len(featbindiff_indexer)
    return len(feat3lvdiff_indexer)

tf.reset_default_graph()

keep_prob = tf.placeholder(tf.float32)

# get_features() returns one id per entry of FEAT_LIST, so the widths are
# derived from it rather than hard-coded.
num_so_feats = len(FEAT_LIST)

x_so = tf.placeholder(tf.int32, [None, num_so_feats])
E_so = tf.get_variable('E-so', [get_feat_size(scheme), emb_size],
                       initializer=tf.contrib.layers.xavier_initializer())
e_so = tf.nn.embedding_lookup(E_so, x_so) # (batch_size, num_so_feats, emb_size)

x_v = tf.placeholder(tf.int32, [None, 1])
E_v = tf.get_variable('E-v', [len(v_indexer), emb_size],
                      initializer=tf.contrib.layers.xavier_initializer())
e_v = tf.nn.embedding_lookup(E_v, x_v) # (batch_size, 1, emb_size)

e_svo = tf.contrib.layers.flatten(tf.concat((e_so, e_v), axis=1))
    # concat: (batch_size, num_so_feats + 1, emb_size)
    # flatten: the embeddings of one example are laid end to end
    #          (the batch dimension is left alone)

# Branch 1: hidden layer over the feature and verb embeddings.
W1 = tf.get_variable('W1', [emb_size*(num_so_feats + 1), hidden_size],
                     initializer=tf.contrib.layers.xavier_initializer())
b1 = tf.get_variable('b1', [hidden_size],
                     initializer=tf.contrib.layers.xavier_initializer())
z1 = tf.nn.relu(tf.add(tf.matmul(e_svo, W1), b1))

# Branch 2: hidden layer over the GloVe input.
x_gl = tf.placeholder(tf.float32, [None, glove_size])

W2 = tf.get_variable('W2', [glove_size, hidden_size],
                     initializer=tf.contrib.layers.xavier_initializer())
b2 = tf.get_variable('b2', [hidden_size],
                     initializer=tf.contrib.layers.xavier_initializer())
z2 = tf.nn.relu(tf.add(tf.matmul(x_gl, W2), b2))

# Both branches side by side, with dropout on the joined representation.
z = tf.nn.dropout(tf.concat((z1, z2), axis=1), keep_prob)

W = tf.get_variable('W', [hidden_size*2, num_classes],
                    initializer=tf.contrib.layers.xavier_initializer())
b = tf.get_variable('b', [num_classes],
                    initializer=tf.contrib.layers.xavier_initializer())

logits = tf.add(tf.matmul(z, W), b)
probs = tf.nn.softmax(logits)
preds = tf.cast(tf.argmax(probs, 1), dtype=tf.int32)

y = tf.placeholder(tf.int32, [None])
# No longer used by the loss; kept so the name stays available.
y_onehot = tf.one_hot(y, num_classes)

# Cross-entropy computed from the logits. Same quantity as
# -log(probs[label]), but without taking log of a softmax output that may
# have underflowed to 0.
loss = tf.reduce_mean(
    tf.nn.sparse_softmax_cross_entropy_with_logits(labels=y, logits=logits))

# Number of correct predictions in the fed batch (a count, not a ratio).
correct = tf.reduce_sum(tf.cast(tf.equal(preds, y), dtype=tf.int32))

# Learning rate is multiplied by 0.95 every 100 optimiser steps.
decay_steps = 100
learning_rate_decay_factor = 0.95
global_step = tf.contrib.framework.get_or_create_global_step()
initial_learning_rate = init_lr
lr = tf.train.exponential_decay(initial_learning_rate,
                                global_step,
                                decay_steps,
                                learning_rate_decay_factor,
                                staircase=True)
# optimizer setup
opt = tf.train.AdamOptimizer(lr)
grads = opt.compute_gradients(loss)
apply_gradient_op = opt.apply_gradients(grads, global_step=global_step)
with tf.control_dependencies([apply_gradient_op]):
    train_op = tf.no_op(name='train')
# run
init = tf.global_variables_initializer()
cutoff = int(len(svo2plau)*train_ratio)
init_indices = np.arange(len(svo2plau))
random.shuffle(init_indices)
# Select rows through the permutation and leave the shared arrays (X_so, X_v,
# X_gl, Y) untouched, so re-running this cell does not keep re-permuting them.
train_idx, test_idx = init_indices[:cutoff], init_indices[cutoff:]
X_so_train, X_v_train, X_gl_train, Y_train = (
    X_so[train_idx], X_v[train_idx], X_gl[train_idx], Y[train_idx])
X_so_test, X_v_test, X_gl_test, Y_test = (
    X_so[test_idx], X_v[test_idx], X_gl[test_idx], Y[test_idx])

# Train for `num_epochs` passes, then report accuracy on both splits.
# Single-argument print(...) prints the same text under the Python 2 print
# statement and under Python 3.
with tf.Session() as sess:
    sess.run(init)
    train_iter = DataIteratorEmbWK(X_so_train, X_v_train, X_gl_train, Y_train)
    cur_epoch = train_iter.epoch
    cur_loss = 0.0
    while train_iter.epoch < num_epochs:
        batch_x_so, batch_x_v, batch_x_gl, batch_y = train_iter.next_batch(batch_size)
        feed = {x_so:batch_x_so, x_v:batch_x_v, x_gl:batch_x_gl,
                y:batch_y, keep_prob:keep}
        [_, batch_loss] = sess.run([train_op, loss],
                                   feed_dict=feed)
        # The reported figure is the SUM of batch losses. It includes the
        # batch on which the iterator rolled over to the next epoch.
        cur_loss += batch_loss
        if cur_epoch<train_iter.epoch:
            print('Epoch %d loss = %.2f' % (cur_epoch, cur_loss))
            cur_epoch = train_iter.epoch
            cur_loss = 0
    print('')

    # Eval on train (dropout disabled: keep_prob = 1.0)
    num_correct = sess.run(correct, feed_dict={x_so:X_so_train,
                                               x_v:X_v_train,
                                               x_gl:X_gl_train,
                                               y:Y_train,
                                               keep_prob:1.0})
    print("Accuracy on train: " + repr(num_correct/float(len(X_so_train))))

    # Eval on test (dropout disabled: keep_prob = 1.0)
    num_correct = sess.run(correct, feed_dict={x_so:X_so_test,
                                               x_v:X_v_test,
                                               x_gl:X_gl_test,
                                               y:Y_test,
                                               keep_prob:1.0})
    print("Accuracy on test: " + repr(num_correct/float(len(X_so_test))))
        

Epoch 0 loss = 98.34
Epoch 1 loss = 90.94
Epoch 2 loss = 87.12
Epoch 3 loss = 84.43
Epoch 4 loss = 81.90
Epoch 5 loss = 80.41
Epoch 6 loss = 79.10
Epoch 7 loss = 77.50
Epoch 8 loss = 76.24
Epoch 9 loss = 75.31
Epoch 10 loss = 74.39
Epoch 11 loss = 73.49
Epoch 12 loss = 72.91
Epoch 13 loss = 72.25
Epoch 14 loss = 71.55
Epoch 15 loss = 70.95
Epoch 16 loss = 70.31
Epoch 17 loss = 70.17
Epoch 18 loss = 69.39
Epoch 19 loss = 69.05

Accuracy on train: 0.76261343012704175
Accuracy on train: 0.76872964169381108


In [117]:
# TF interactive testing

# tf.reset_default_graph()

# # x = tf.placeholder(tf.int32, [None, 6])
# # emb_mat = tf.get_variable('emb', [20, 10])
# # emb = tf.nn.embedding_lookup(emb_mat, x) # (batch_size, n_col, emb_size)

# x1 = tf.placeholder(tf.int32, [3,2])
# x1_float = tf.placeholder(tf.float32, [3,2])
# x2 = tf.placeholder(tf.int32, [1,2])
# x = tf.concat((x1,x2), axis=0)
# x_reshape = tf.reshape(x, [-1])

# sm = tf.nn.softmax(x1_float)

# a1 = np.array([[1,1],
#               [2,2],
#               [3,3]])
# a1_float = np.array([[1,1],
#                       [2,2],
#                       [3,3]],dtype=np.float32)
# a2 = np.array([[4,4]])

# sess = tf.InteractiveSession()
# sess.run(tf.global_variables_initializer())
# # d = sess.run(x, feed_dict={x1:a1,x2:a2})
# # e = sess.run(x_reshape, feed_dict={x1:a1,x2:a2})
# print sess.run(sm, feed_dict={x1_float:a1_float})

[[ 0.5  0.5]
 [ 0.5  0.5]
 [ 0.5  0.5]]
