In [1]:
from gensim.models import word2vec
import numpy as np
from scipy import spatial
import tensorflow as tf
from sklearn.model_selection import ShuffleSplit

# Import & Init jieba
import jieba
jieba.set_dictionary('datas/dict/dict.txt.big')
jieba.load_userdict('datas/dict/edu_dict.txt')

# Import pandas
import pandas as pd
from pandas import Series, DataFrame

# Import & Init matplotlib
import matplotlib
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
%matplotlib inline
plt.style.use('ggplot')

# Import util
import time
import re
import sys
import gc

Using TensorFlow backend.
Building prefix dict from /home/sunset/word_contest/datas/dict/dict.txt.big ...
Loading model from cache /tmp/jieba.u849ecfdca27003d306f39ca004b82b5b.cache
Loading model cost 1.157 seconds.
Prefix dict has been built succesfully.


### Load datasets

#### sample test data

In [None]:
# Load the sample test set; preview only the first rows instead of dumping
# the whole frame into the notebook output.
sample = pd.read_csv('datas/sample_test_data.txt')
sample.head()

In [None]:
# Extract sample test datas
def _split_turns(text):
    """Cut `text` at every speaker tag (one capital letter followed by ':')
    and at tabs, dropping pieces that are empty or whitespace-only."""
    pieces = re.sub('[A-Z]:', '\t', text).split('\t')
    return [piece for piece in pieces if piece.strip()]

x1 = [_split_turns(dialogue) for dialogue in sample.dialogue.values]
x2 = [_split_turns(options) for options in sample.options.values]

# Tokenize: each dialogue becomes one token list, each option its own token list
x1 = [list(jieba.cut(' '.join(turns))) for turns in x1]
x2 = [[list(jieba.cut(option)) for option in options] for options in x2]
y = sample.answer.values
# Every question must come with exactly 6 options
assert all(len(options) == 6 for options in x2)

#### PTT Gossiping QA

In [None]:
# Extract PTT Gossiping QA datas
# The corpus is Chinese text; decode it explicitly as UTF-8 so the result does
# not depend on the platform's default locale encoding.
# NOTE(review): assumes the dataset file is UTF-8 encoded — confirm.
with open('datas/raw/Gossiping-QA-Dataset.txt', 'r', encoding='utf-8') as fi:
    gossip_lines = [line.strip().split('\t') for line in fi]
# Keep only well-formed rows (exactly two tab-separated fields), then tokenize
# the first field into gossip_x1 and the second into gossip_x2.
gossip_x1 = [list(jieba.cut(line[0])) for line in gossip_lines if len(line) == 2]
gossip_x2 = [list(jieba.cut(line[1])) for line in gossip_lines if len(line) == 2]

In [None]:
# Drop the raw frame and raw lines, pause briefly, then force a collection
del sample, gossip_lines
time.sleep(1)
gc.collect()

### sentence embedding

#### Extract first principal component

In [None]:
# Load the trained word2vec model and total up the counts of all vocabulary
# entries; the total is the denominator for per-word frequencies.
word2vec_model = word2vec.Word2Vec.load('models/word2vec_250.model.bin')
total_word_cnt = np.sum(
    [entry.count for entry in word2vec_model.wv.vocab.values()]
)

In [None]:
def weighted_centroid(sentence, a=0.0001):
    """Frequency-weighted mean of the word vectors of `sentence`.

    Each in-vocabulary word is scaled by a / (a + p(word)), where p(word) is
    the word's count divided by `total_word_cnt`. Out-of-vocabulary words are
    skipped.

    Args:
        sentence: iterable of tokens.
        a: smoothing constant of the weighting.

    Returns:
        The mean of the weighted vectors, or a zero vector of length
        `word2vec_model.vector_size` when no token is in the vocabulary.
    """
    keyed_vectors = word2vec_model.wv
    weighted = []
    for word in sentence:
        if word not in keyed_vectors.vocab:
            continue
        freq = keyed_vectors.vocab[word].count / total_word_cnt
        weighted.append(a / (a + freq) * keyed_vectors.word_vec(word))
    if len(weighted) > 0:
        return np.mean(weighted, axis=0)
    return np.zeros(word2vec_model.vector_size)

In [None]:
# Embed every sentence and stack the results in a fixed order:
# sample dialogues, sample options (question-major), gossip questions, gossip answers.
sentence_groups = [
    x1,
    [option for options in x2 for option in options],
    gossip_x1,
    gossip_x2,
]
vs = np.concatenate([
    np.array([weighted_centroid(sentence) for sentence in group])
    for group in sentence_groups
])
assert len(vs) == len(x1) + len(x2) * len(x2[0]) + len(gossip_x1) + len(gossip_x2)
vs.shape

In [None]:
# Release the memory consumed by the word2vec model, which is no longer needed
del word2vec_model
time.sleep(1)
gc.collect()

In [None]:
# Compute first principal component
# np.corrcoef yields a symmetric matrix, so use eigh: it is the solver meant
# for symmetric input and always returns real eigenvalues/eigenvectors, whereas
# the general eig solver may return complex output.
eig_vals, eig_vecs = np.linalg.eigh(np.corrcoef(vs.T))
# Pick the eigenvector whose eigenvalue has the largest magnitude. argmax
# avoids sorting (value, vector) tuples, which raises when two eigenvalues tie
# and the arrays themselves get compared. The sign of u is arbitrary.
u = eig_vecs[:, np.argmax(np.abs(eig_vals))].copy()

# Release unused memory
del eig_vals
del eig_vecs
time.sleep(1)
gc.collect()

#### Common component removal

In [None]:
# Subtract from every sentence vector its projection onto u, one row at a
# time so no second array of the size of vs is allocated.
for row_idx, row in enumerate(vs):
    vs[row_idx] = row - u * (row @ u)

In [None]:
# Cut vs back into its four groups, using the same order and sizes with which
# it was concatenated.
split_points = np.cumsum([len(x1), len(x2) * len(x2[0]), len(gossip_x1)])
sample_vs1, sample_vs2, gossip_vs1, gossip_vs2 = np.split(vs, split_points)
del vs
time.sleep(1)
gc.collect()

In [None]:
# One-off cache of the four embedding arrays. Uncomment and run once to
# (re)create the .npy files under datas/sentence_embedding/ that are read back
# by the np.load calls later in this notebook.
# np.save('datas/sentence_embedding/sample_vs1', sample_vs1)
# np.save('datas/sentence_embedding/sample_vs2', sample_vs2)
# np.save('datas/sentence_embedding/gossip_vs1', gossip_vs1)
# np.save('datas/sentence_embedding/gossip_vs2', gossip_vs2)

In [2]:
# Read the four cached sentence-embedding arrays back from disk
sample_vs1, sample_vs2, gossip_vs1, gossip_vs2 = map(np.load, (
    'datas/sentence_embedding/sample_vs1.npy',
    'datas/sentence_embedding/sample_vs2.npy',
    'datas/sentence_embedding/gossip_vs1.npy',
    'datas/sentence_embedding/gossip_vs2.npy',
))

## Sigmoid Scheme
### Train using tensorflow

In [None]:
# Define mini-batch data loader
class mini_batcher():
    """Shuffled mini-batch loader over (question, options, answer) rows.

    Row i consists of a question vector x1[i], a sequence of option vectors
    x2[i] and an answer y[i]; option j of row i is labelled 1.0 when
    y[i] == j and 0.0 otherwise.
    """

    def __init__(self, x1, x2, y, batch_size):
        """Store the data as arrays and draw the first random row order.

        Args:
            x1: question vectors, one per row.
            x2: option vectors, one sequence of options per row.
            y: answer per row, compared against the option position.
            batch_size: rows per mini-batch; must not exceed the row count.

        Raises:
            AssertionError: if the inputs differ in length or batch_size is
                larger than the number of rows.
        """
        self._x1 = np.array(x1)
        self._x2 = np.array(x2)
        self._y = np.array(y)
        self._batch_size = batch_size
        self._datas_num = len(x1)
        self._pointer = 0
        # Mismatched inputs would otherwise surface later as an IndexError or
        # as silently misaligned labels.
        assert(len(self._x2) == self._datas_num and len(self._y) == self._datas_num)
        assert(self._batch_size <= self._datas_num)
        self._idx = np.arange(len(x1))
        np.random.shuffle(self._idx)

    def _next_batch(self):
        """Return the next `batch_size` rows in the current shuffled order.

        When the next window would run past the end of the data, the order is
        reshuffled and the batch restarts at position 0; the unread tail of
        the previous pass is skipped.
        """
        f = self._pointer
        t = self._pointer + self._batch_size
        if t > self._datas_num:
            f = 0
            t = self._batch_size
            np.random.shuffle(self._idx)
        self._pointer = t
        idx = self._idx[f:t]
        return self._x1[idx], self._x2[idx], self._y[idx]

    @staticmethod
    def _expand_4_sigmoid(x1, x2, y):
        """Expand rows into one (question, one_option, 0/1) example per option.

        The three columns are collected separately instead of building one
        array out of (vector, vector, float) tuples: such ragged input makes
        np.array raise on recent NumPy and produce an object array on older
        versions.
        """
        flat_x1, flat_x2, flat_y = [], [], []
        for i in range(len(x2)):
            for j in range(len(x2[i])):
                flat_x1.append(x1[i])
                flat_x2.append(x2[i][j])
                flat_y.append(float(y[i] == j))
        return np.array(flat_x1), np.array(flat_x2), np.array(flat_y)

    def next_batch_4_sigmoid(self, shuffle_batch=True):
        """Return the next mini-batch expanded to per-option examples.

        Args:
            shuffle_batch: when true, the expanded examples are randomly
                permuted within the batch.

        Returns:
            (b_x1, b_x2, b_y): question vectors, option vectors and 0/1 labels,
            one entry per (row, option) pair of the batch.
        """
        b_x1, b_x2, b_y = self._expand_4_sigmoid(*self._next_batch())
        if shuffle_batch:
            b_idx = np.arange(len(b_y))
            np.random.shuffle(b_idx)
            b_x1, b_x2, b_y = b_x1[b_idx], b_x2[b_idx], b_y[b_idx]
        return b_x1, b_x2, b_y

    def all_4_sigmoid_evaluation(self):
        """Return the whole data set expanded to per-option examples.

        The original row order is kept, with the options of each row adjacent
        and in order.
        """
        return self._expand_4_sigmoid(self._x1, self._x2, self._y)

    def next_batch_4_cross_entropy(self):
        raise NotImplementedError('Not yet OwOb')

    def all_4_cross_entropy_evaluation(self):
        raise NotImplementedError('Not yet OwOb')

In [None]:
# Training features
# No cell in this notebook defines f1 / f2, so bind them explicitly from the
# sentence embeddings to keep the notebook runnable from a fresh kernel.
# NOTE(review): assumes f1/f2 are meant to be the sample embeddings, with
# sample_vs2 holding each question's options contiguously (question-major), as
# produced by the flattening in the embedding cell — confirm.
f1 = sample_vs1
f2 = sample_vs2.reshape(len(sample_vs1), -1, sample_vs1.shape[-1])
assert len(y) == len(f1)

# tf Model
vec_sz = f1.shape[-1]
tf_x1 = tf.placeholder(tf.float32, [None, vec_sz])
tf_x2 = tf.placeholder(tf.float32, [None, vec_sz])
tf_y = tf.placeholder(tf.float32, [None])
tf_W = tf.Variable(tf.truncated_normal([vec_sz, vec_sz]))
# Bilinear score per example: sum over k of x1[k] * (x2 @ W)[k]
tf_y_ = tf.reduce_sum(tf_x1 * (tf_x2 @ tf_W), axis=1)

# tf Objective
# Sigmoid cross entropy on the score, plus an L2 penalty on W
cost = tf.nn.sigmoid_cross_entropy_with_logits(labels=tf_y, logits=tf_y_) + 1e-3 * tf.nn.l2_loss(tf_W)
optimizer = tf.train.AdamOptimizer(1e-4).minimize(cost)

# tf Session
sess = tf.Session()
sess.run(tf.global_variables_initializer())

# train loop hyperparameters
batch_size = 10
batch_num = len(y) // batch_size
epoch_num = 500
data_loader = mini_batcher(f1, f2, y, batch_size)

In [None]:
start_time = time.time()
train_cost_lst, train_acc_lst = [], []
for ith_epoch in range(epoch_num):
    # Run batch_num optimisation steps and accumulate the average batch cost
    epoch_cost = 0
    for ith_batch in range(batch_num):
        b_x1, b_x2, b_y = data_loader.next_batch_4_sigmoid()
        feed = {tf_x1: b_x1, tf_x2: b_x2, tf_y: b_y}
        _, b_cost = sess.run([optimizer, cost], feed)
        b_cost = np.mean(b_cost)
        epoch_cost += b_cost / batch_num
    train_cost_lst.append(epoch_cost)

    # Evaluate accuracy after the epoch: a question counts as correct when its
    # highest-scoring option (out of 6) is the labelled one
    all_x1, all_x2, all_y = data_loader.all_4_sigmoid_evaluation()
    all_y_ = sess.run(tf_y_, {tf_x1: all_x1, tf_x2: all_x2})
    truth = np.argmax(all_y.reshape(-1, 6), axis=1)
    guess = np.argmax(all_y_.reshape(-1, 6), axis=1)
    correct = np.sum(truth == guess)
    train_acc_lst.append(correct / len(truth))

    elapsed = time.time() - start_time
    print('epoch cost: %10f / elapsed time %10.2f' % (epoch_cost, elapsed))

In [None]:
# Plot train cost (top) and train accuracy (bottom) for each epoch
fig, ax = plt.subplots(2, 1, figsize=(15, 6))
loss_ax, acc_ax = ax
loss_ax.set(title='loss', xlim=(0, len(train_cost_lst)))
loss_ax.scatter(x=list(range(len(train_cost_lst))), y=train_cost_lst, marker='.', color='blue')
acc_ax.set(title='accuracy', xlim=(0, len(train_cost_lst)))
acc_ax.scatter(x=list(range(len(train_acc_lst))), y=train_acc_lst, marker='.', color='blue')

### Cross Validation

In [None]:
def gogo_fold(train_f1, train_f2, train_y, test_f1, test_f2, test_y, verbose=1):
    """Train the model from scratch on one fold and track accuracy per epoch.

    All variables of the global session are re-initialised first, so every
    fold starts from fresh weights. Uses the notebook-level `sess`, `optimizer`,
    placeholders, `batch_size` and `epoch_num`.

    Args:
        train_f1, train_f2, train_y: question vectors, option vectors and
            answers of the training fold.
        test_f1, test_f2, test_y: the same for the held-out fold.
        verbose: when truthy, print the accuracies after every epoch.

    Returns:
        (train_acc_lst, test_acc_lst): per-epoch accuracy on each fold.
    """
    train_loader = mini_batcher(train_f1, train_f2, train_y, batch_size)
    test_loader = mini_batcher(test_f1, test_f2, test_y, batch_size)
    sess.run(tf.global_variables_initializer())

    # Size an epoch by this fold's training rows rather than by the global
    # batch_num, which is derived from the full data set.
    fold_batch_num = len(train_y) // batch_size

    def evaluate(data_loader):
        # Fraction of questions whose highest-scoring option (out of 6) is the
        # labelled one
        all_x1, all_x2, all_y = data_loader.all_4_sigmoid_evaluation()
        all_y_ = sess.run(tf_y_, {tf_x1: all_x1, tf_x2: all_x2})
        correct = np.sum(np.argmax(all_y.reshape(-1, 6), axis=1) == np.argmax(all_y_.reshape(-1, 6), axis=1))
        return correct / all_y.reshape(-1, 6).shape[0]

    train_acc_lst = []
    test_acc_lst = []
    for ith_epoch in range(epoch_num):
        for ith_batch in range(fold_batch_num):
            b_x1, b_x2, b_y = train_loader.next_batch_4_sigmoid()
            sess.run(optimizer, {tf_x1: b_x1, tf_x2: b_x2, tf_y: b_y})

        train_acc = evaluate(train_loader)
        test_acc = evaluate(test_loader)
        train_acc_lst.append(train_acc)
        test_acc_lst.append(test_acc)
        if verbose:
            print('epoch %3d: train acc %10f / test acc %10f' % (ith_epoch, train_acc, test_acc))
    return train_acc_lst, test_acc_lst

In [None]:
# Five random 60/40 train/test splits with a fixed seed
cv = ShuffleSplit(
    n_splits=5,
    test_size=0.4,
    random_state=0,
)

In [None]:
# Train one fresh model per split and report the last epoch's accuracies
for train_idx, test_idx in cv.split(np.arange(len(y))):
    fold_args = (
        f1[train_idx], f2[train_idx], y[train_idx],
        f1[test_idx], f2[test_idx], y[test_idx],
    )
    train_lst, test_lst = gogo_fold(*fold_args, verbose=False)
    print('train acc %10f / test acc %10f' % (train_lst[-1], test_lst[-1]))