In [1]:
import pandas as pd
import numpy as np
import re
import jieba
jieba.set_dictionary('datas/dict/dict.txt.big')
jieba.load_userdict('datas/dict/edu_dict.txt')
import os
import time
import gc
import json
from mini_batch_helper import extractor, MiniBatchCorpus

Building prefix dict from /home/sunset/word_contest/datas/dict/dict.txt.big ...
Loading model from cache /tmp/jieba.u849ecfdca27003d306f39ca004b82b5b.cache
Loading model cost 1.147 seconds.
Prefix dict has been built succesfully.
Using TensorFlow backend.


In [2]:
# Read in training data.
word2vec_fname = 'models/word2vec_no_tc_offitial_200.model.bin'
corpus_fnames = [
    'datas/training_data/no_TC_下課花路米.txt',
    'datas/training_data/no_TC_人生劇展.txt',
    'datas/training_data/no_TC_公視藝文大道.txt',
    'datas/training_data/no_TC_成語賽恩思.txt',
    'datas/training_data/no_TC_我的這一班.txt',
    'datas/training_data/no_TC_流言追追追.txt',
    'datas/training_data/no_TC_聽聽看.txt',
    'datas/training_data/no_TC_誰來晚餐.txt',
]
sample_rate_on_training_datas = 0.3  # 1.0
extra_words = ['<pad>']
unknown_word = '<pad>'
# Seed used only for the train/valid shuffle below, so the split is reproducible
# across kernel restarts without touching numpy's global random stream.
split_seed = 0

# NOTE(review): `extractor` lives in mini_batch_helper; the meaning of each returned
# value is inferred from its name only — confirm against that module.
word2id, id2word, word_p, embedding_matrix, corpus, corpus_id = extractor(
    word2vec_fname, corpus_fnames, sample_rate_on_training_datas, extra_words, unknown_word)

# Data split: shuffle the episode indices and keep a random half of the episodes.
rnd_idx = np.arange(len(corpus_id))
np.random.RandomState(split_seed).shuffle(rnd_idx)
corpus_id = corpus_id[rnd_idx[:len(corpus_id)//2]]
valid_corpus_num = 10

# The first `valid_corpus_num` episodes of the shuffled half are held out for validation.
train_data_loader = MiniBatchCorpus(corpus_id[valid_corpus_num:])
valid_data_loader = MiniBatchCorpus(corpus_id[:valid_corpus_num])
print('train datas num:', train_data_loader.data_num, flush=True)
print('valid datas num:', valid_data_loader.data_num, flush=True)

# Longest sentence (in tokens) among the retained episodes. Printed explicitly:
# a bare expression in the middle of a cell is never displayed.
max_seq_len = max(len(sentence) for episode in corpus_id for sentence in episode)
print('max seq len:', max_seq_len, flush=True)

# The raw (non-id) corpus is not used below; release it.
del corpus
gc.collect()

train datas num: 686438
valid datas num: 18602


0

In [3]:
# reference: https://github.com/dennybritz/chatbot-retrieval/blob/8b1be4c2e63631b1180b97ef927dc2c1f7fe9bea/udc_hparams.py
exp_name = 'dual_lstm_2'
save_params_dir = 'models/%s/' % exp_name
save_record_dir = 'models/%s/' % exp_name

# Hyper-parameters, grouped in one literal so the whole configuration is visible at a glance.
params = {
    # Model Parameters
    'word2vec_model_name': word2vec_fname,
    'word2vec_vocab_size': embedding_matrix.shape[0],
    'word2vec_dim': embedding_matrix.shape[1],
    'rnn_dim': 256,  # 256, 384, 512
    'n_layers': 1,
    # Training Parameters
    'learning_rate': 1e-4,
    'keep_prob_train': 0.8,
    'keep_prob_valid': 1.0,
    'l1_loss': 1e-4,  # regularize M
    'clip': 0.25,
    'batch_size': 256,
    'eval_batch_size': 16,
}
# 40 * data_num / batch_size iterations — presumably ~40 passes over the training
# data if every iteration consumes one batch (TODO confirm against the training loop).
params['n_iterations'] = int(40 * train_data_loader.data_num / params['batch_size'])

# Bookkeeping for the training run: checkpoint locations, metric histories and best results.
record = {
    'newest_model_dir': 'models/' + exp_name + '/newest/',
    'best_model_dir': 'models/' + exp_name + '/best/',
    'loss_train': [],
    'loss_valid': [],
    'accuracy_valid': [],
    'best_iter': 0,
    'sample_correct': 0,
}

In [4]:
# Define model: a dual LSTM encoder scoring (context, response) pairs with sigmoid(c^T M r).
import tensorflow as tf

# Inputs. Both sequence placeholders are (batch, time) word ids with a dynamic time axis;
# `target` holds one 0/1 label per pair and `keep_prob` is the dropout keep probability.
context = tf.placeholder(dtype=tf.int64, shape=(None, None), name='context')
response = tf.placeholder(dtype=tf.int64, shape=(None, None), name='response')
target = tf.placeholder(dtype=tf.int64, shape=(None, ), name='target')
keep_prob = tf.placeholder(dtype=tf.float32, name='keep_prob')

# Embedding: one table, initialised from `embedding_matrix`, shared by context and response.
init_embedding_W = tf.constant_initializer(embedding_matrix)
embeddings_W = tf.get_variable('embeddings_W', shape=[embedding_matrix.shape[0], embedding_matrix.shape[1]], initializer=init_embedding_W)
context_embedded = tf.nn.embedding_lookup(embeddings_W, context, name="embed_context")
response_embedded = tf.nn.embedding_lookup(embeddings_W, response, name="embed_response")

if params['n_layers'] == 1:
    # Shared LSTM encoder: the same cell object encodes both context and response.
    cell = tf.nn.rnn_cell.LSTMCell(num_units=params['rnn_dim'], forget_bias=2.0,
                use_peepholes=True, state_is_tuple=True, reuse=tf.get_variable_scope().reuse)
    cell = tf.contrib.rnn.DropoutWrapper(cell, input_keep_prob=keep_prob, output_keep_prob=keep_prob)
    c_outputs, c_states = tf.nn.dynamic_rnn(cell, context_embedded, dtype=tf.float32)
    encoding_context = c_states.h
    r_outputs, r_states = tf.nn.dynamic_rnn(cell, response_embedded, dtype=tf.float32)
    encoding_response = r_states.h
else:
    # Stacked shared encoder. Tuple states are required here: with state_is_tuple=False
    # the final state is a single concatenated tensor and has no `.h` attribute.
    cells = [tf.nn.rnn_cell.LSTMCell(num_units=params['rnn_dim'], forget_bias=2.0, use_peepholes=True, state_is_tuple=True, reuse=tf.get_variable_scope().reuse)
                for _ in range(params['n_layers'])]
    dropcells = [tf.contrib.rnn.DropoutWrapper(cell, input_keep_prob=keep_prob) for cell in cells]
    multicell = tf.contrib.rnn.MultiRNNCell(dropcells, state_is_tuple=True)
    multicell = tf.contrib.rnn.DropoutWrapper(multicell, output_keep_prob=keep_prob)
    # The final state is a tuple with one LSTMStateTuple per layer; encode with the top layer's h.
    c_outputs, c_states = tf.nn.dynamic_rnn(multicell, context_embedded, dtype=tf.float32)
    encoding_context = c_states[-1].h
    r_outputs, r_states = tf.nn.dynamic_rnn(multicell, response_embedded, dtype=tf.float32)
    encoding_response = r_states[-1].h

# σ(cMr): bilinear matching matrix between the two encodings.
M = tf.get_variable('M', shape=[params['rnn_dim'], params['rnn_dim']], initializer=tf.truncated_normal_initializer())

# "Predict" a response: c * M, then add a trailing axis -> (batch, rnn_dim, 1).
generated_response = tf.matmul(encoding_context, M)
generated_response = tf.expand_dims(generated_response, 2)
encoding_response = tf.expand_dims(encoding_response, 2)

# Dot product between generated response and actual response:
# (batch, 1, rnn_dim) x (batch, rnn_dim, 1) -> (batch, 1, 1), flattened to (batch,).
logits = tf.matmul(generated_response, encoding_response, transpose_a=True)
logits = tf.reshape(logits, [-1])

# Apply sigmoid to convert logits to probabilities (for prediction, not for loss).
probs = tf.sigmoid(logits)
# A prediction is correct when the probability falls on the label's side of 0.5.
correct_prediction = tf.logical_or( tf.logical_and(tf.equal(target,1), tf.greater_equal(probs,0.5)), tf.logical_and(tf.equal(target,0), tf.less(probs,0.5)))
accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))
# Binary cross-entropy on the logits, plus an L1 penalty on M.
loss = tf.reduce_mean(tf.nn.sigmoid_cross_entropy_with_logits(logits=logits, labels=tf.cast(target, tf.float32)))
loss = loss + params['l1_loss'] * tf.reduce_sum(tf.abs(M))

# Adam with per-tensor gradient norm clipping. Variables that receive no gradient
# (grad is None) are skipped, since clip_by_norm cannot handle None.
optimizer = tf.train.AdamOptimizer(params['learning_rate'])
gvs = optimizer.compute_gradients(loss)
capped_gvs = [(tf.clip_by_norm(grad, params['clip']), var) for grad, var in gvs if grad is not None]
train_step = optimizer.apply_gradients(capped_gvs)

In [17]:
# Load in sample and test
def _split_turns(raw):
    """Replace speaker tags such as 'A:' with tabs, split on tabs and drop blank pieces."""
    return [part for part in re.sub('[A-Z]:', '\t', raw).split('\t') if len(part.strip())]


def _tokenize(text):
    """Segment `text` with jieba, discarding single-space tokens."""
    return [token for token in jieba.cut(text) if token != ' ']


def _prepare_questions(frame):
    """Return (dialogue_tokens, option_tokens) for a frame with `dialogue` and `options` columns.

    Each dialogue becomes one flat token list (its turns joined by a space before
    segmentation); each question's options become a list of six token lists.
    """
    dialogues = [_split_turns(raw) for raw in frame.dialogue.values]
    options = [_split_turns(raw) for raw in frame.options.values]
    # Every question must offer exactly six candidate responses.
    assert(np.sum([len(candidates) != 6 for candidates in options]) == 0)
    dialogues = [_tokenize(' '.join(turns)) for turns in dialogues]
    options = [[_tokenize(candidate) for candidate in candidates] for candidates in options]
    return dialogues, options


sample = pd.read_csv('datas/sample_test_data.txt')
sample_y = sample.answer.values
sample_x1, sample_x2 = _prepare_questions(sample)


test_datas = pd.read_csv('datas/AIFirstProblem.txt')
test_x1, test_x2 = _prepare_questions(test_datas)

In [18]:
def word_lst_2_id_lst(lst, vocab=None, pad_word='<pad>'):
    """Map a list of words to their vocabulary ids.

    Words missing from the vocabulary are replaced by the id of `pad_word`.
    The result has exactly one id per input word; no padding to a fixed
    length is performed.

    Args:
        lst: sequence of word strings.
        vocab: word -> id mapping. Defaults to the notebook-level `word2id`.
        pad_word: word whose id stands in for out-of-vocabulary words.

    Returns:
        List of ids, the same length as `lst`.

    Raises:
        KeyError: if `pad_word` itself is not in the vocabulary.
    """
    if vocab is None:
        vocab = word2id
    pad_word_id = vocab[pad_word]
    return [vocab[word] if word in vocab else pad_word_id for word in lst]

In [19]:
# Convert tokenised dialogues / options to word-id sequences.
# The sequences have different lengths, so the containers are built explicitly as
# object arrays: recent NumPy versions refuse to create an array from ragged nested
# lists unless dtype=object is given.
sample_id1 = np.array([word_lst_2_id_lst(s) for s in sample_x1], dtype=object)
sample_id2 = np.array([[word_lst_2_id_lst(r) for r in rs] for rs in sample_x2], dtype=object)
test_id1 = np.array([word_lst_2_id_lst(s) for s in test_x1], dtype=object)
test_id2 = np.array([[word_lst_2_id_lst(r) for r in rs] for rs in test_x2], dtype=object)

In [20]:
# (Re)initialise the run bookkeeping: checkpoint locations, metric histories and best results.
save_record_dir = 'models/%s/' % exp_name
record = {
    'newest_model_dir': 'models/' + exp_name + '/newest/',
    'best_model_dir': 'models/' + exp_name + '/best/',
    'loss_train': [],
    'loss_valid': [],
    'accuracy_valid': [],
    'best_iter': 0,
    'sample_correct': 0,
}

In [None]:
# NOTE(review): stray scratch cell that was never executed (`In [None]`). `emb` is not
# defined anywhere in the visible notebook, so this would raise NameError on
# Restart & Run All — TODO remove or complete.
emb

In [21]:
# Predict on sample dataset
def _pair_probability(session, context_ids, response_ids):
    """Run the graph on a single (context, response) pair and return its match probability."""
    feed = {
        context: [context_ids],
        response: [response_ids],
        keep_prob: params['keep_prob_valid'],
    }
    return session.run(probs, feed)[0]


saver = tf.train.Saver()
with tf.Session() as sess:
    # Restore the best checkpoint before scoring.
    saver.restore(sess, record['best_model_dir']+'model.ckpt')
    score = []
    for question_ids, candidates in zip(sample_id1, sample_id2):
        for candidate_ids in candidates:
            pair_score = _pair_probability(sess, question_ids, candidate_ids)
            score.append(pair_score)
            print(pair_score, [id2word[idx] for idx in question_ids], [id2word[idx] for idx in candidate_ids])

# One row per question, six candidate scores per row; answer with the best-scoring candidate.
score = np.array(score).reshape(-1, 6)
my_ans = np.argmax(score, axis=1)
# NOTE(review): assumes `sample_y` uses the same 0-based option indexing as argmax — confirm.
sample_correct = np.sum(my_ans == sample_y)
print('sample correct %4d' % (sample_correct), flush=True)
record['sample_correct'] = sample_correct.tolist()

INFO:tensorflow:Restoring parameters from models/dual_lstm_2/best/model.ckpt
0.271947 ['你', '這麼', '快', '就', '知道', '了'] ['全家', '就是', '你家']
0.998684 ['你', '這麼', '快', '就', '知道', '了'] ['付出', '不是', '浪費時間']
0.991211 ['你', '這麼', '快', '就', '知道', '了'] ['願意', '為', '社會', '付出', '的', '人太少', '了']
0.000193259 ['你', '這麼', '快', '就', '知道', '了'] ['我', '都', '是', '一個', '人', '把', '他關', '在', '家裡']
0.876545 ['你', '這麼', '快', '就', '知道', '了'] ['廢話', '你', '沒有', '聽過', '壞話', '傳', '千里']
0.988411 ['你', '這麼', '快', '就', '知道', '了'] ['你', '不要', '以為', '你', '拳頭', '大', '大家', '都', '<pad>', '你', '我', '才', '不怕', '你']
9.6795e-05 ['每', '一支', '冰塊', '都', '不同', '水質', '不同', '硬度', '不同'] ['紋路', '不同']
0.976237 ['每', '一支', '冰塊', '都', '不同', '水質', '不同', '硬度', '不同'] ['也是', '你', '唯一', '發洩', '情緒', '的', '辦法', '吧']
0.0007766 ['每', '一支', '冰塊', '都', '不同', '水質', '不同', '硬度', '不同'] ['還記', '不', '記得', '那次', '你', '在', '我家', '巷口']
0.00423483 ['每', '一支', '冰塊', '都', '不同', '水質', '不同', '硬度', '不同'] ['像', '拼命三郎', '一樣', '的', '跳舞']
1.0 ['每', '一支', '冰塊', '都', '

0.0606551 ['網路上', '那些', '流', '言說', '如果說', '你', '把', '<pad>', '一些', '尿液', '會', '讓', '蝦', '活得', '比較', '久', '根本', '騙人', '的'] ['這個', '時期', '的', '小孩子']
0.540459 ['網路上', '那些', '流', '言說', '如果說', '你', '把', '<pad>', '一些', '尿液', '會', '讓', '蝦', '活得', '比較', '久', '根本', '騙人', '的'] ['你', '看', '晶瑩剔透', '的']
1.05481e-06 ['網路上', '那些', '流', '言說', '如果說', '你', '把', '<pad>', '一些', '尿液', '會', '讓', '蝦', '活得', '比較', '久', '根本', '騙人', '的'] ['最', '新鮮', '的', '保存', '方式', '就是', '把', '牠', '急速', '冷凍', '起來']
0.0048841 ['網路上', '那些', '流', '言說', '如果說', '你', '把', '<pad>', '一些', '尿液', '會', '讓', '蝦', '活得', '比較', '久', '根本', '騙人', '的'] ['手語', '都', '有', '很多', '不一樣', '的', '比法']
0.00849051 ['網路上', '那些', '流', '言說', '如果說', '你', '把', '<pad>', '一些', '尿液', '會', '讓', '蝦', '活得', '比較', '久', '根本', '騙人', '的'] ['<pad>', '是', '在', '哪裏', '學', '的']
0.00126968 ['網路上', '那些', '流', '言說', '如果說', '你', '把', '<pad>', '一些', '尿液', '會', '讓', '蝦', '活得', '比較', '久', '根本', '騙人', '的'] ['不然的話', '你', '冰', '就', '放下', '嘛']
3.87738e-06 ['可是', '為什麼', '會', '選擇', '到',

0.99992 ['怎麼', '沒有', '通知', '我', '呢', '怎麼', '通知', '找', '得到', '嗎'] ['我', '整整', '一年', '都', '在', '找', '你', '好累']
1.0 ['怎麼', '沒有', '通知', '我', '呢', '怎麼', '通知', '找', '得到', '嗎'] ['原來', '鍾', '沅', '失蹤', '的', '那', '一年', '都', '跟', '<pad>', '姊', '在', '一起']
0.000371025 ['怎麼', '沒有', '通知', '我', '呢', '怎麼', '通知', '找', '得到', '嗎'] ['頭', '一次', '見到', '<pad>', '便', '被', '她', '那股', '優雅', '動人', '的', '神采', '所', '吸引']
0.998705 ['怎麼', '沒有', '通知', '我', '呢', '怎麼', '通知', '找', '得到', '嗎'] ['你', '愛不愛', '<pad>']
0.0521249 ['你', '還有', '閒情逸致', '聽', '收音機', '啊', '我', '在', '聽', '今天', '星座', '運勢', '分析', '已經', '快', '講到', '我', '了'] ['物超所值', '耶']
0.146876 ['你', '還有', '閒情逸致', '聽', '收音機', '啊', '我', '在', '聽', '今天', '星座', '運勢', '分析', '已經', '快', '講到', '我', '了'] ['只要', '她', '一', '高興', '你', '要', '借', '她', '的', '皇冠', '<pad>', '絕對', '沒有', '問題']
0.943067 ['你', '還有', '閒情逸致', '聽', '收音機', '啊', '我', '在', '聽', '今天', '星座', '運勢', '分析', '已經', '快', '講到', '我', '了'] ['這種', '不', '科學', '的', '東西', '你還', '相信', '啊']
0.0834218 ['你', '還有', '閒情逸致', '聽', '收音機

0.908609 ['不然', '我們', '來', '比賽', '啊', '好', '啊', '比', '什麼'] ['我先', '出發', '去', '探索']
3.83264e-05 ['借', '我', '看', '一下', '嘛', '不要', '啦', '我', '房間', '很', '亂'] ['我們', '有', '問題', '要問', '你']
1.47754e-09 ['借', '我', '看', '一下', '嘛', '不要', '啦', '我', '房間', '很', '亂'] ['可是', '老師', '現在', '有', '事情', '耶']
0.866093 ['借', '我', '看', '一下', '嘛', '不要', '啦', '我', '房間', '很', '亂'] ['我們', '下次', '再', '問好', '了']
0.913097 ['借', '我', '看', '一下', '嘛', '不要', '啦', '我', '房間', '很', '亂'] ['老師', '老師']
0.998473 ['借', '我', '看', '一下', '嘛', '不要', '啦', '我', '房間', '很', '亂'] ['啊', '借看一下', '會', '怎樣', '啊']
2.47122e-11 ['借', '我', '看', '一下', '嘛', '不要', '啦', '我', '房間', '很', '亂'] ['你', '在', '裡頭', '幹嘛']
0.99992 ['這是', '班遊', '地點', '的', '幾個', '提案', '請', '大家', '投票表決', '這些', '地點', '都', '好', '無聊', '誰', '想', '去', '啊'] ['你們', '的', '想法', '都', '不錯', '啦']
0.963157 ['這是', '班遊', '地點', '的', '幾個', '提案', '請', '大家', '投票表決', '這些', '地點', '都', '好', '無聊', '誰', '想', '去', '啊'] ['你', '還是', '找', '別人', '啦']
0.999757 ['這是', '班遊', '地點', '的', '幾個', '提案', '請', '大家', 

1.0 ['神算', '公主', '你', '這次', '啊', '麻煩', '大', '了', '關我', '什麼', '事', '啊', '你', '不知道', '嗎', '她', '為', '了', '減肥', '塑身', '啊', '已經', '連續', '好', '幾天', '都', '一天', '只', '吃', '一餐', '呢'] ['她', '一定', '會', '後悔', '沒有', '好好', '栽培', '我']
0.995835 ['神算', '公主', '你', '這次', '啊', '麻煩', '大', '了', '關我', '什麼', '事', '啊', '你', '不知道', '嗎', '她', '為', '了', '減肥', '塑身', '啊', '已經', '連續', '好', '幾天', '都', '一天', '只', '吃', '一餐', '呢'] ['那麼', '<pad>', '都', '<pad>', '所以', '我', '想', '請', '她', '幫', '我', '<pad>', '命盤']
0.00463355 ['你', '發', '什麼', '瘋', '啊', '你', '你', '才', '發瘋', '呢', '我', '看', '你', '有', '<pad>'] ['是', '球員', '的', '生命']
0.999917 ['你', '發', '什麼', '瘋', '啊', '你', '你', '才', '發瘋', '呢', '我', '看', '你', '有', '<pad>'] ['要', '想', '打', '好', '籃球']
0.983426 ['你', '發', '什麼', '瘋', '啊', '你', '你', '才', '發瘋', '呢', '我', '看', '你', '有', '<pad>'] ['就得', '喜歡', '籃球']
0.998799 ['你', '發', '什麼', '瘋', '啊', '你', '你', '才', '發瘋', '呢', '我', '看', '你', '有', '<pad>'] ['好好', '照顧', '它']
1.0 ['你', '發', '什麼', '瘋', '啊', '你', '你', '才', '發瘋', '呢', '我', '看'