In [None]:
from gensim.models import word2vec
import numpy as np
from scipy import spatial
import tensorflow as tf
from sklearn.model_selection import ShuffleSplit

# Import & Init jieba
import jieba
jieba.set_dictionary('datas/dict/dict.txt.big')
jieba.load_userdict('datas/dict/edu_dict.txt')

# Import pandas
import pandas as pd
from pandas import Series, DataFrame

# Import & Init matplotlib
import matplotlib
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
%matplotlib inline
plt.style.use('ggplot')

# Import util
import time
import re
import sys
import gc

# Self-defined module
from mini_batch_helper import MiniBatch

### Load datasets

#### sample test data

In [None]:
# Load the sample test set; later cells read its `dialogue`, `options` and
# `answer` columns.
sample = pd.read_csv('datas/sample_test_data.txt')
# Preview the first rows only instead of rendering the whole frame.
sample.head()

In [None]:
# Split every dialogue / options string into utterances: each speaker tag
# (one capital letter followed by ':') is replaced by a tab, the text is cut
# on tabs, and pieces that are empty or whitespace-only are dropped.
x1 = [
    [utterance for utterance in re.sub('[A-Z]:', '\t', dialogue).split('\t') if utterance.strip()]
    for dialogue in sample.dialogue.values
]
x2 = [
    [option for option in re.sub('[A-Z]:', '\t', options).split('\t') if option.strip()]
    for options in sample.options.values
]

# Tokenize with jieba: a dialogue's utterances are joined with spaces and cut
# as one string, while every option is cut on its own.
x1 = [list(jieba.cut(' '.join(utterances))) for utterances in x1]
x2 = [[list(jieba.cut(option)) for option in options] for options in x2]
y = sample.answer.values

# Every question is expected to come with exactly six options.
assert all(len(options) == 6 for options in x2)

In [None]:
# Load the test problems; the next cell reads their `dialogue` and `options`
# columns.
test_datas = pd.read_csv('datas/AIFirstProblem.txt')
# Preview the first rows only instead of rendering the whole frame.
test_datas.head()

In [None]:
# Split the test dialogues / options into utterances the same way as the
# sample set: speaker tags (capital letter + ':') become tabs, the text is cut
# on tabs, and blank pieces are dropped. Unlike the sample set, no jieba
# tokenization is applied in this cell.
# NOTE: `test_x1` / `test_x2` are assigned again further down when the saved
# embeddings are loaded, so these values only live until that cell runs.
test_x1 = [
    [utterance for utterance in re.sub('[A-Z]:', '\t', dialogue).split('\t') if utterance.strip()]
    for dialogue in test_datas.dialogue.values
]
test_x2 = [
    [option for option in re.sub('[A-Z]:', '\t', options).split('\t') if option.strip()]
    for options in test_datas.options.values
]

# Every question is expected to come with exactly six options.
assert all(len(options) == 6 for options in test_x2)

#### PTT Gossiping QA

In [None]:
# # Extract PTT Gossiping QA data
# with open('datas/raw/Gossiping-QA-Dataset.txt', 'r') as fi:
#     gossip_lines = [line.strip().split('\t') for line in fi]
# gossip_x1 = [list(jieba.cut(line[0])) for line in gossip_lines if len(line) == 2]
# gossip_x2 = [list(jieba.cut(line[1])) for line in gossip_lines if len(line) == 2]

In [None]:
# del(sample)
# del(gossip_lines)
# time.sleep(1)
# gc.collect()

### sentence embedding

#### Extract first principal component

In [None]:
# word2vec_model = word2vec.Word2Vec.load('models/word2vec_250.model.bin')
# total_word_cnt = np.sum([_.count for _ in word2vec_model.wv.vocab.values()])

In [None]:
# def weighted_centroid(sentence, a=0.0001):
#     _ = [a / (a + word2vec_model.wv.vocab[word].count / total_word_cnt) * word2vec_model.wv.word_vec(word)
#             for word in sentence if word in word2vec_model.wv.vocab]
#     return np.mean(_, axis=0) if len(_) > 0 else np.zeros(word2vec_model.vector_size)

In [None]:
# vs = np.concatenate([
#     np.array([weighted_centroid(s) for s in x1]),
#     np.array([weighted_centroid(s) for opts in x2 for s in opts]),
#     np.array([weighted_centroid(s) for s in test_x1]),
#     np.array([weighted_centroid(s) for opts in test_x2 for s in opts]),
#     np.array([weighted_centroid(s) for s in gossip_x1]),
#     np.array([weighted_centroid(s) for s in gossip_x2]),
# ])
# assert(len(vs) == len(x1) + len(x2) * len(x2[0]) + len(test_x1) + len(test_x2) * len(test_x2[0]) + len(gossip_x1) + len(gossip_x2))
# vs.shape

In [None]:
# # Release the memory consumed by the no-longer-needed model
# del(word2vec_model)
# time.sleep(1)
# gc.collect()

In [None]:
# # Compute first principal component
# eig_vals, eig_vecs = np.linalg.eig(np.corrcoef(vs.T))
# eig_pairs = [(np.abs(eig_vals[i]), eig_vecs[:,i]) for i in range(len(eig_vals))]
# eig_pairs.sort(reverse=True)
# u = eig_pairs[0][1].copy()

# # Release unused memory
# del(eig_vals)
# del(eig_vecs)
# del(eig_pairs)
# time.sleep(1)
# gc.collect()

#### Common component removal

In [None]:
# for i in range(len(vs)):
#     vs[i] = vs[i] - u * (vs[i] @ u)

In [None]:
# _ = np.cumsum([len(x1), len(x2) * len(x2[0]), len(test_x1), len(test_x2) * len(test_x2[0]), len(gossip_x1)])

In [None]:
# _ = np.cumsum([len(x1), len(x2) * len(x2[0]), len(test_x1), len(test_x2) * len(test_x2[0]), len(gossip_x1)])
# sample_vs1, sample_vs2, test_vs1, test_vs2, gossip_vs1, gossip_vs2 = np.split(vs, _)
# del(vs)
# time.sleep(1)
# gc.collect()

In [None]:
# np.save('datas/sentence_embedding/first_principle_component', u)
# np.save('datas/sentence_embedding/sample_vs1_250', sample_vs1)
# np.save('datas/sentence_embedding/sample_vs2_250', sample_vs2)
# np.save('datas/sentence_embedding/test_vs1_250', test_vs1)
# np.save('datas/sentence_embedding/test_vs2_250', test_vs2)
# np.save('datas/sentence_embedding/gossip_vs1_250', gossip_vs1)
# np.save('datas/sentence_embedding/gossip_vs2_250', gossip_vs2)

## Sigmoid Scheme

### Import data

In [None]:
def get_wrong_idx(n):
    """Return a random permutation of ``range(n)`` with no fixed point.

    The result ``idx`` satisfies ``idx[i] != i`` for every ``i``, so indexing
    an array with it pairs each row with a row other than itself.

    Parameters
    ----------
    n : int
        Number of positions to permute.

    Returns
    -------
    numpy.ndarray
        Integer array of length ``n`` (empty when ``n`` is 0).

    Raises
    ------
    ValueError
        If ``n == 1``: the only position cannot be mapped to a different one.
        (Previously this case looped forever.)
    """
    if n == 1:
        raise ValueError('get_wrong_idx requires n != 1: a single position cannot be moved off itself')

    idx = np.arange(n)
    np.random.shuffle(idx)
    # Repair every position the shuffle left in place.
    for i in np.where(idx == np.arange(n))[0]:
        if idx[i] != i:
            # Already fixed as the swap partner of an earlier fixed point.
            continue
        # Draw the partner uniformly from the n - 1 positions other than i.
        t = np.random.randint(n - 1)
        if t >= i:
            t += 1
        # After the swap idx[t] == i != t, and idx[i] is t's old value, which
        # cannot be i because i was held by position i itself.
        idx[i], idx[t] = idx[t], idx[i]
    return idx

In [None]:
# Load the precomputed sentence embeddings from disk
# (sample_* files -> test arrays, gossip_* files -> train arrays).
test_x1 = np.load('datas/sentence_embedding/sample_vs1.npy')
test_x2 = np.load('datas/sentence_embedding/sample_vs2.npy')
train_x1 = np.load('datas/sentence_embedding/gossip_vs1.npy')
train_x2 = np.load('datas/sentence_embedding/gossip_vs2.npy')

# Repeat each test_x1 row six times in a row (row i becomes rows 6i..6i+5).
test_x1 = np.repeat(test_x1, 6, axis=0)

# Add a new axis 1 of size 4 to train_x2: slot 0 keeps the original rows,
# slots 1-3 hold the same rows reordered by three independent
# get_wrong_idx permutations.
train_x2 = np.stack(
    [train_x2, *(train_x2[get_wrong_idx(len(train_x2))] for _ in range(3))],
    axis=1,
)

# Show the resulting shapes.
tuple(arr.shape for arr in (test_x1, test_x2, train_x1, train_x2))

### Train using tensorflow

In [None]:
# tf Model
# Bilinear scorer: the logit for a pair of rows is x1 . (x2 @ W), where W is a
# trainable vec_sz x vec_sz matrix.
vec_sz = train_x1.shape[-1]  # embedding width, taken from the last axis of train_x1
tf_x1 = tf.placeholder(tf.float32, [None, vec_sz])
tf_x2 = tf.placeholder(tf.float32, [None, vec_sz])
tf_y = tf.placeholder(tf.float32, [None])
tf_W = tf.Variable(tf.truncated_normal([vec_sz, vec_sz]))
tf_y_ = tf.reduce_sum(tf_x1 * (tf_x2 @ tf_W), axis=1)  # one logit per row

# tf Objective
# Sigmoid cross-entropy on the logits plus an L2 penalty on W weighted by 5e-3.
# NOTE(review): no reduce_mean/reduce_sum is applied, so `cost` keeps one entry
# per example and the L2 term is added to each of them; the optimizer is handed
# this vector directly -- confirm this is intended.
cost = tf.nn.sigmoid_cross_entropy_with_logits(labels=tf_y, logits=tf_y_) + 5e-3 * tf.nn.l2_loss(tf_W)
optimizer = tf.train.AdamOptimizer().minimize(cost)

# tf Session & Saver
# NOTE(review): only a Session is created in this cell; no Saver is instantiated.
sess = tf.Session()
sess.run(tf.global_variables_initializer())

# train loop hyperparameters
batch_size = 2777
# Batches per epoch: product of every train_x2 axis except the last, floor-divided by batch_size.
batch_num = np.prod(train_x2.shape[:-1]) // batch_size
epoch_num = 30
# The third argument is an all-zero array with one entry per train_x2 row;
# how MiniBatch uses it is not visible in this notebook.
data_loader = MiniBatch(train_x1, train_x2, np.zeros(train_x2.shape[0]))

In [None]:
# Train for `epoch_num` epochs, recording train and test accuracy after each.
start_time = time.time()
train_acc_lst = []
test_acc_lst = []
for ith_epoch in range(epoch_num):
    train_correct = 0
    train_seen = 0
    for ith_batch in range(batch_num):
        b_x1, b_x2, b_y = data_loader.next_batch_4_sigmoid(batch_size)
        _, b_y_ = sess.run([optimizer, tf_y_], {tf_x1: b_x1, tf_x2: b_x2, tf_y: b_y})
        # A prediction is correct when the sign of the logit matches the 0/1 label.
        train_correct += np.sum((b_y == 0) * (b_y_ < 0) + (b_y == 1) * (b_y_ > 0))
        # Count the rows actually scored rather than assuming every batch
        # holds exactly `batch_size` rows.
        train_seen += len(b_y_)
    train_acc_lst.append(train_correct / train_seen if train_seen else float('nan'))

    # Evaluate accuracy after each epoch: the logits are grouped six per
    # question and the highest-scoring option is compared with `y`.
    test_y_ = sess.run(tf_y_, {tf_x1: test_x1, tf_x2: test_x2})
    test_correct = np.sum(np.argmax(test_y_.reshape(-1, 6), axis=1) == y)
    test_acc_lst.append(test_correct / (len(test_y_) / 6))

    print('epoch %3d: train acc %.4f / test acc %.4f / elapsed time %10.f' % (
        ith_epoch, train_acc_lst[-1], test_acc_lst[-1], time.time() - start_time
    ))

In [None]:
# Plot train (blue) and test (red) accuracy for each epoch on a shared axis.
fig, ax = plt.subplots(1, figsize=(15, 6))
ax.set_title('accuracy')
ax.set_xlabel('epoch')
ax.set_ylabel('accuracy')
ax.set_xlim(0, max(len(train_acc_lst), len(test_acc_lst)))
ax.scatter(x=list(range(len(train_acc_lst))), y=train_acc_lst, marker='.', color='blue', label='train')
ax.scatter(x=list(range(len(test_acc_lst))), y=test_acc_lst, marker='.', color='red', label='test')
# The legend tells the two series apart; the trailing ';' hides the repr.
ax.legend();