In [1]:
import numpy as np
import tensorflow as tf
from scipy import spatial
from scipy import stats
from gensim.models import word2vec

# Import pandas
import pandas as pd
from pandas import Series, DataFrame

# Import util
import os
import time
import re
import sys
import gc

# Self define module
from mini_batch_helper import extractor
from mini_batch_helper import MiniBatchCorpus

Using TensorFlow backend.


### Loading corpus and forming dict

In [3]:
# Pre-trained word2vec model and the training corpus files handed to `extractor`.
word2vec_fname = 'models/word2vec_no_tc_offitial_200.model.bin'
corpus_fnames = [
    'datas/training_data/no_TC_下課花路米.txt',
    'datas/training_data/no_TC_人生劇展.txt',
    'datas/training_data/no_TC_公視藝文大道.txt',
    'datas/training_data/no_TC_成語賽恩思.txt',
    'datas/training_data/no_TC_我的這一班.txt',
    'datas/training_data/no_TC_流言追追追.txt',
    'datas/training_data/no_TC_聽聽看.txt',
    'datas/training_data/no_TC_誰來晚餐.txt',
]
sample_rate_on_training_datas = 0.3
# '<pad>' doubles as the unknown-word token, so out-of-vocabulary words and
# padding end up sharing one vocabulary entry.
unknown_word = '<pad>'
extra_words = [unknown_word]

word2id, id2word, word_p, embedding_matrix, corpus, corpus_id = extractor(
    word2vec_fname,
    corpus_fnames,
    sample_rate_on_training_datas,
    extra_words,
    unknown_word,
)

In [6]:
# Peek at the first entry of `corpus`; the recorded output shows it is a list of
# tokenized sentences (each sentence a list of word strings).
corpus[0]

[['在', '台北', '近郊', '的', '天溪', '園'],
 ['一支', '生態', '營隊'],
 ['會', '有', '什麼樣', '的', '新', '發現', '呢'],
 ['什麼樣', '的', '青蛙'],
 ['居然', '還會噴', '毒液', '耶'],
 ['同學們', '最想', '看到', '的'],
 ['又是', '哪樣', '的', '動物', '呢'],
 ['蛇', '看到', '蛇'],
 ['青竹絲'],
 ['同學', '的', '願望', '真的', '能夠', '成真', '嗎'],
 ['會', '去', '咬', '人', '對', '不對'],
 ['今天', '下課', '花路', '米'],
 ['邀', '你', '跟著', '生態', '營隊'],
 ['走入', '天溪', '園'],
 ['一起', '夜訪', '森林', '大冒險'],
 ['風雨無阻'],
 ['今天', '我', '跟', '湘涵'],
 ['參加', '了', '一個', '由', '台北市', '西湖國小'],
 ['所', '舉辦', '的', '生態', '資訊', '營隊'],
 ['沒錯', '透過', '這個', '接觸', '大自然'],
 ['同時', '也', '可以', '更進一步'],
 ['觀察', '各種', '不同', '的', '物種'],
 ['跟', '生態', '的', '情況', '喔'],
 ['所以', '我們', '來到', '的是'],
 ['陽明山國家公園', '裡面', '的'],
 ['這個', '天溪', '園', '生態', '教育', '中心'],
 ['沒錯'],
 ['不過', '聽說', '他們', '今天'],
 ['分成', '五個', '小隊', '對', '不對'],
 ['那', '我們', '是不是', '應該', '分組', '進行'],
 ['分頭', '觀察'],
 ['當然', '啊'],
 ['現在', '我們', '就', '分', '開來', '走', '好不好'],
 ['好'],
 ['那', '今天', '我', '是', '代表', '隊'],
 ['我', '代表', '隊'],
 ['我們', '一', '起來'

In [7]:
# Summary statistics of sentence length (tokens per sentence) over every corpus
Series([len(sentence) for doc in corpus for sentence in doc]).describe()

count    703473.000000
mean          4.662566
std           2.190775
min           1.000000
25%           3.000000
50%           5.000000
75%           6.000000
max          77.000000
dtype: float64

In [4]:
# Vocabulary size and embedding width come straight from the pre-trained matrix
voc_size, emb_size = embedding_matrix.shape[:2]
# Padding and unknown words are both looked up under '<pad>', so the ids coincide
pad_word_id = word2id['<pad>']
unknown_word_id = word2id['<pad>']
# Longest sentence (in tokens) found in any corpus
max_seq_len = np.max([len(sentence) for doc in corpus_id for sentence in doc])

print('%20s: %d' % ('unknown_word_id', unknown_word_id))
print('%20s: %d' % ('pad_word_id', pad_word_id))
print('%20s: %d' % ('max_seq_len', max_seq_len))

     unknown_word_id: 58633
         pad_word_id: 58633
         max_seq_len: 77


In [5]:
# Data split
# Keep a random half of the corpora, hold out the first `valid_corpus_num` of
# them for validation and train on the rest.
# A dedicated, seeded RandomState makes the split reproducible without touching
# NumPy's global random state.
split_seed = 0
rnd_idx = np.random.RandomState(split_seed).permutation(len(corpus_id))
# Select into a new name instead of overwriting `corpus_id`: re-running this
# cell would otherwise halve the data again, and re-running the `max_seq_len`
# cell afterwards would silently measure only the surviving half.
corpus_id_half = corpus_id[rnd_idx[:len(corpus_id)//2]]
valid_corpus_num = 10

train_data_loader = MiniBatchCorpus(corpus_id_half[valid_corpus_num:])
valid_data_loader = MiniBatchCorpus(corpus_id_half[:valid_corpus_num])
print('train datas num:', train_data_loader.data_num)
print('valid datas num:', valid_data_loader.data_num)

train datas num: 2354782
valid datas num: 16626


### Define model

In [6]:
# Word embedding model
# Per-word values returned by `extractor` (presumably unigram probabilities --
# confirm in mini_batch_helper), stored as a float64 constant and looked up by
# word id in `sentence_embedding`.
tf_word_p = tf.constant(word_p, dtype=tf.float64)
# Embedding table as a TF variable, initialised from the pre-trained matrix.
embeddings_W = tf.Variable(embedding_matrix)
# Drop the notebook-level reference to the NumPy matrix and force a collection.
# NOTE: after this, cells that read `embedding_matrix` cannot be re-run without
# re-running the loading cell first.
del(embedding_matrix)
gc.collect()

0

In [7]:
# Input
# wa: single float64 value (shape [1]) used in the wa / (wa + word_p) word weights
wa = tf.placeholder(tf.float64, [1])
# x1, x2: batches of word-id sequences, each row max_seq_len ids long
x1 = tf.placeholder(tf.int32, [None, max_seq_len])
x2 = tf.placeholder(tf.int32, [None, max_seq_len])
# y: one float64 label per (x1, x2) row, consumed by the sigmoid cross-entropy loss
y = tf.placeholder(tf.float64, [None])

In [8]:
def sentence_embedding(xs):
    """Embed a batch of padded word-id sequences as weighted mean word vectors.

    Args:
        xs: int tensor of word ids with `max_seq_len` columns; positions equal
            to `pad_word_id` are treated as padding.

    Returns:
        One vector per row: the sum of the non-padding word vectors, each
        scaled by wa / (wa + word_p[word]), divided by the number of
        non-padding words (plus 1e-6 so an all-padding row does not divide
        by zero).
    """
    # 1.0 where there is a real word, 0.0 on padding
    is_word = 1 - tf.to_double(tf.equal(xs, pad_word_id))
    word_count = tf.reduce_sum(is_word, axis=1)

    # Zero out the vectors looked up for padding positions
    is_word_3d = tf.reshape(is_word, [-1, max_seq_len, 1])
    word_vecs = tf.gather(embeddings_W, xs) * is_word_3d

    # Down-weight each word according to its entry in tf_word_p
    weights = wa / (wa + tf.gather(tf_word_p, xs))
    weighted_vecs = tf.reshape(weights, [-1, max_seq_len, 1]) * word_vecs

    denom = tf.reshape(tf.to_double(word_count) + 1e-6, [-1, 1])
    return tf.reduce_sum(weighted_vecs, axis=1) / denom

In [9]:
# Sentence vectors for both sides of every (x1, x2) pair
x1_center = sentence_embedding(x1)
x2_center = sentence_embedding(x2)
# Square emb_size x emb_size matrix, initialised very close to zero (stddev 1e-6)
W = tf.Variable(tf.truncated_normal([emb_size, emb_size], stddev=1e-6, dtype=tf.float64))
# Per-pair score: row-wise dot product of (x1_center @ W) with x2_center
tf_score = tf.reduce_sum((x2_center * (x1_center @ W)), axis=1)

----------------------------------
## Training

In [None]:
# L2 penalty on the bilinear matrix W, scaled by 6 / (number of entries in W)
reg = tf.nn.l2_loss(W) * 6 / (emb_size * emb_size)
# Sigmoid cross-entropy summed (not averaged) over the batch, plus the penalty
cost = tf.reduce_sum(tf.nn.sigmoid_cross_entropy_with_logits(labels=y, logits=tf_score)) + reg
optimizer = tf.train.AdamOptimizer(1e-3)
gvs = optimizer.compute_gradients(cost)
# compute_gradients returns (None, var) for any variable the loss does not
# depend on, and clip_by_norm cannot take None -- skip those pairs so that
# adding an unrelated variable to the graph does not break this cell.
capped_gvs = [(tf.clip_by_norm(grad, 0.2), var) for grad, var in gvs if grad is not None]
train_step = optimizer.apply_gradients(capped_gvs)

In [None]:
# Open a session, initialise every variable, and build the checkpoint saver
sess = tf.Session()
sess.run(tf.global_variables_initializer())
saver = tf.train.Saver()

In [None]:
def eval_valid_loss():
    """Return the validation cost averaged per example.

    Draws `data_num // 1024` batches of 1024 examples from `valid_data_loader`,
    evaluates `cost` on each with wa = 1e-4, and divides the total by the
    number of examples seen. Returns 0 when the loader holds fewer examples
    than one batch.
    """
    valid_batch = 1024
    batch_num = valid_data_loader.data_num // valid_batch
    valid_loss = 0
    for _ in range(batch_num):
        ids1, ids2, labels = valid_data_loader.next_batch(valid_batch, max_seq_len, pad_word_id)
        feed = {wa: [1e-4], x1: ids1, x2: ids2, y: labels}
        valid_loss += sess.run(cost, feed) / (batch_num * valid_batch)
    return valid_loss

In [None]:
batch_size = 256
epoch_num = 2
log_interval = 200
save_interval = 1000
ckpt_prefix = 'models/Attack-sentence-embedding/s_emb'

# Saver.save() does not create missing parent directories; make sure the
# checkpoint folder exists up front so the first save (save_interval batches
# into training) cannot fail on that.
os.makedirs(os.path.dirname(ckpt_prefix), exist_ok=True)

# Running mean of the per-example training cost over the current log window
train_batch_loss = 0
start_time = time.time()
for i_batch in range(epoch_num * train_data_loader.data_num // batch_size):
    b_x1, b_x2, b_y = train_data_loader.next_batch(batch_size, max_seq_len, pad_word_id)
    _, now_loss = sess.run([train_step, cost], {wa: [1e-4], x1: b_x1, x2: b_x2, y: b_y})
    train_batch_loss += now_loss / (log_interval * batch_size)
    if (i_batch+1) % log_interval == 0:
        valid_loss = eval_valid_loss()
        print('train batch loss %10f / valid loss %10f / elapsed time %.f' % (
            train_batch_loss, valid_loss, time.time()-start_time), flush=True)
        train_batch_loss = 0
    # Periodic checkpoint, tagged with the number of batches trained so far
    if save_interval is not None and (i_batch+1) % save_interval == 0:
        saver.save(sess, ckpt_prefix, global_step=i_batch+1)
        print('model saved (latest)', flush=True)

saver.save(sess, ckpt_prefix + '_final')

------------------------------
## Evaluating

In [10]:
# jieba segments the evaluation dialogues and options into words
import jieba
# Point jieba at the main dictionary file first, then add the extra user
# dictionary on top of it.
jieba.set_dictionary('datas/dict/dict.txt.big')
jieba.load_userdict('datas/dict/edu_dict.txt')

Building prefix dict from /home/sunset/word_contest/datas/dict/dict.txt.big ...
Loading model from cache /tmp/jieba.u849ecfdca27003d306f39ca004b82b5b.cache
Loading model cost 1.172 seconds.
Prefix dict has been built succesfully.


In [11]:
# Labelled sample questions: a dialogue, its candidate options and the answer
sample = pd.read_csv('datas/sample_test_data.txt')
sample_y = sample.answer.values

# Turn speaker tags such as "A:" into tabs, split on tabs, drop blank pieces
sample_x1 = [
    [utt for utt in re.sub('[A-Z]:', '\t', dialogue).split('\t') if utt.strip()]
    for dialogue in sample.dialogue.values
]
sample_x2 = [
    [opt for opt in re.sub('[A-Z]:', '\t', options).split('\t') if opt.strip()]
    for options in sample.options.values
]
# Every question must come with exactly six options
assert all(len(options) == 6 for options in sample_x2)

# Segment into words: each dialogue is joined into one string first, while
# every option is segmented on its own; bare-space tokens are discarded
sample_x1 = [
    [tok for tok in jieba.cut(' '.join(utts)) if tok != ' ']
    for utts in sample_x1
]
sample_x2 = [
    [[tok for tok in jieba.cut(option) if tok != ' '] for option in options]
    for options in sample_x2
]

In [12]:
# Unlabelled test questions, parsed the same way as the sample set
test_datas = pd.read_csv('datas/AIFirstProblem.txt')

# Speaker tags ("A:", "B:", ...) become tabs; split on tabs and keep non-blank parts
test_x1 = [
    [part for part in re.sub('[A-Z]:', '\t', raw_dialogue).split('\t') if part.strip()]
    for raw_dialogue in test_datas.dialogue.values
]
test_x2 = [
    [part for part in re.sub('[A-Z]:', '\t', raw_options).split('\t') if part.strip()]
    for raw_options in test_datas.options.values
]
# Each question is expected to carry exactly six options
assert all(len(candidates) == 6 for candidates in test_x2)

# Word segmentation; tokens that are a single space are dropped
test_x1 = [
    [w for w in jieba.cut(' '.join(parts)) if w != ' ']
    for parts in test_x1
]
test_x2 = [
    [[w for w in jieba.cut(candidate) if w != ' '] for candidate in candidates]
    for candidates in test_x2
]

In [13]:
def word_lst_2_id_lst(lst):
    """Map a list of words to exactly `max_seq_len` word ids.

    Words missing from `word2id` become `pad_word_id`; lists longer than
    `max_seq_len` are cut off at the end and shorter ones are right-padded
    with `pad_word_id`.
    """
    ids = [word2id.get(word, pad_word_id) for word in lst[:max_seq_len]]
    ids.extend(pad_word_id for _ in range(max_seq_len - len(ids)))
    return ids

In [14]:
# Word lists -> fixed-length id arrays: one row per dialogue for *_id1,
# and one row per option, grouped by question, for *_id2
sample_id1 = np.array(list(map(word_lst_2_id_lst, sample_x1)))
sample_id2 = np.array([list(map(word_lst_2_id_lst, options)) for options in sample_x2])
test_id1 = np.array(list(map(word_lst_2_id_lst, test_x1)))
test_id2 = np.array([list(map(word_lst_2_id_lst, options)) for options in test_x2])

In [15]:
def eval_ans(lst_id1, lst_id2):
    """Pick the highest-scoring option for every dialogue.

    Args:
        lst_id1: array of padded dialogue word ids, one row of `max_seq_len`
            ids per dialogue.
        lst_id2: array of padded option word ids, either grouped per dialogue
            (n_dialogues x n_options x max_seq_len) or already flattened to
            one option per row.

    Returns:
        1-D array holding, for each dialogue, the index of the option with
        the largest `tf_score`.

    Raises:
        ValueError: if there are no dialogues, or the options cannot be split
            evenly among the dialogues.
    """
    options = lst_id2.reshape(-1, max_seq_len)
    n_dialogues = len(lst_id1)
    if n_dialogues == 0 or len(options) % n_dialogues != 0:
        raise ValueError(
            'cannot pair %d option rows with %d dialogues' % (len(options), n_dialogues))
    # Derive the number of options per dialogue from the data instead of
    # assuming six, so other question formats score correctly too.
    n_options = len(options) // n_dialogues
    score = sess.run(tf_score, {
        wa: [1e-4],
        # Repeat every dialogue once per option so rows line up pairwise
        x1: np.repeat(lst_id1, n_options, axis=0),
        x2: options})
    return np.argmax(score.reshape(-1, n_options), axis=1)

In [16]:
def eval_on_sample():
    """Count how many sample questions get a prediction equal to `sample_y`."""
    predicted = eval_ans(sample_id1, sample_id2)
    is_correct = predicted == sample_y
    return np.sum(is_correct)

In [24]:
# Fresh session whose variables are loaded from the batch-16000 checkpoint
# (no initializer is run here; the values come from the checkpoint file)
sess = tf.Session()
saver = tf.train.Saver()
saver.restore(sess, 'models/Attack-sentence-embedding/s_emb-16000')

INFO:tensorflow:Restoring parameters from models/Attack-sentence-embedding/s_emb-16000


In [25]:
# Score the labelled sample set once and reuse the predictions for the hit
# count; calling eval_on_sample() as well would run the same inference twice.
my_sample_ans = eval_ans(sample_id1, sample_id2)
sample_correct = np.sum(my_sample_ans == sample_y)
# Predictions for the unlabelled test questions
my_test_ans = eval_ans(test_id1, test_id2)
print('sample correct %4d' % (sample_correct))

sample correct   30


In [None]:
# Disabled: writes the test predictions as an "id,ans" CSV with 1-based ids.
# NOTE(review): `selected_test_ans` is not defined anywhere in the visible
# notebook; `my_test_ans` from the previous cell looks like the intended
# value -- confirm before uncommenting.
# with open('answer/attack-sentence-embedding.txt', 'w') as f:
#     f.write('id,ans\n')
#     f.write('\n'.join(['%d,%d' % (i+1, a) for i, a in enumerate(selected_test_ans)]))
#     f.write('\n')