In [1]:
import pandas as pd
import numpy as np
import re
import jieba
# Configure the segmenter: use the project's main dictionary file, then load
# the additional user dictionary on top of it. The recorded output of this cell
# shows the prefix dict being built from dict.txt.big.
jieba.set_dictionary('datas/dict/dict.txt.big')
jieba.load_userdict('datas/dict/edu_dict.txt')
import os
import time
import gc
import json
# Project-local helpers; NOTE(review): neither name is used in the cells
# visible in this part of the notebook -- confirm they are still needed.
from mini_batch_helper import extractor, MiniBatchCorpus
from gensim.models import word2vec

Building prefix dict from /home/sunset/word_contest/datas/dict/dict.txt.big ...
Loading model from cache /tmp/jieba.u849ecfdca27003d306f39ca004b82b5b.cache
Loading model cost 1.141 seconds.
Prefix dict has been built succesfully.
Using TensorFlow backend.


In [6]:
# Build the word <-> id lookup tables from the trained word2vec vocabulary.
word2vec_fname = 'models/word2vec_no_tc_offitial_200.model.bin'
word2vec_model = word2vec.Word2Vec.load(word2vec_fname)
unknown_word = '<pad>'
extra_words = ['<pad>']

# id2word is a list indexed by id; word2id is the inverse dictionary.
id2word = [None] * (len(extra_words) + len(word2vec_model.wv.vocab))
word2id = {}

# Extra (special) tokens take the ids right after the word2vec vocabulary.
for offset, token in enumerate(extra_words):
    token_id = len(word2vec_model.wv.vocab) + offset
    id2word[token_id] = token
    word2id[token] = token_id

# Regular words keep the index stored on their word2vec vocabulary entry.
for token, entry in word2vec_model.wv.vocab.items():
    id2word[entry.index] = token
    word2id[token] = entry.index

# Only the lookup tables are needed from here on; drop the model and collect.
del word2vec_model
gc.collect()

6

In [10]:
exp_name = 'dual_lstm_0'  # model_to_test

# ---------------------------------------------------------------------------
# Parameters
# ---------------------------------------------------------------------------
# Model parameters
params = {}
save_params_dir = 'models/%s/' %exp_name
params['word2vec_model_name'] = word2vec_fname
params['word2vec_vocab_size'] = len(word2id)
params['word2vec_dim'] = 200
params['rnn_dim'] = 512  # 256
params['n_layers'] = 1

# Training parameters
params['learning_rate'] = 1e-4
params['keep_prob_train'] = 0.8
params['keep_prob_valid'] = 1.0
params['clip'] = 0.25
params['batch_size'] = 512  # 256  #128
params['eval_batch_size'] = 16

# Persist the parameters next to the model files. exist_ok avoids the
# check-then-create race of a separate os.path.exists() test.
os.makedirs(save_params_dir, exist_ok=True)
with open(save_params_dir+'model_parameters.json', 'w') as f:
    json.dump(params, f, indent=1)


# ---------------------------------------------------------------------------
# Model: dual LSTM encoder scored with sigmoid(c^T M r)
# ---------------------------------------------------------------------------
import tensorflow as tf

# Inputs: batches of word-id sequences for context and response, a 0/1 target
# per pair, and the dropout keep probability.
context = tf.placeholder(dtype=tf.int64, shape=(None, None), name='context')
response = tf.placeholder(dtype=tf.int64, shape=(None, None), name='response')
target = tf.placeholder(dtype=tf.int64, shape=(None, ), name='target')
keep_prob = tf.placeholder(dtype=tf.float32, name='keep_prob')

#with tf.device('/gpu:0'):
# Embedding table shared by context and response.
embeddings_W = tf.get_variable('embeddings_W', shape=[params['word2vec_vocab_size'], params['word2vec_dim']])
context_embedded = tf.nn.embedding_lookup(embeddings_W, context, name="embed_context")
response_embedded = tf.nn.embedding_lookup(embeddings_W, response, name="embed_response")

if params['n_layers'] == 1:
    # Single shared LSTM encoder: the same cell object encodes both inputs.
    cell = tf.nn.rnn_cell.LSTMCell(num_units=params['rnn_dim'], forget_bias=2.0,
                use_peepholes=True, state_is_tuple=True, reuse=tf.get_variable_scope().reuse)
    cell = tf.contrib.rnn.DropoutWrapper(cell, input_keep_prob=keep_prob, output_keep_prob=keep_prob)
    c_outputs, c_states = tf.nn.dynamic_rnn(cell, context_embedded, dtype=tf.float32)
    encoding_context = c_states.h
    r_outputs, r_states = tf.nn.dynamic_rnn(cell, response_embedded, dtype=tf.float32)
    encoding_response = r_states.h
else:
    # Stacked shared encoder. The state must be kept as tuples: with
    # state_is_tuple=False the final state is one concatenated tensor that has
    # no `.h` attribute. With tuples, the final state is one LSTMStateTuple per
    # layer, so the encoding is the hidden state of the last (top) layer.
    cells = [tf.nn.rnn_cell.LSTMCell(num_units=params['rnn_dim'], forget_bias=2.0, use_peepholes=True, state_is_tuple=True, reuse=tf.get_variable_scope().reuse)
                for _ in range(params['n_layers'])]
    dropcells = [tf.contrib.rnn.DropoutWrapper(layer_cell, input_keep_prob=keep_prob) for layer_cell in cells]
    multicell = tf.contrib.rnn.MultiRNNCell(dropcells, state_is_tuple=True)
    multicell = tf.contrib.rnn.DropoutWrapper(multicell, output_keep_prob=keep_prob)
    c_outputs, c_states = tf.nn.dynamic_rnn(multicell, context_embedded, dtype=tf.float32)
    encoding_context = c_states[-1].h
    r_outputs, r_states = tf.nn.dynamic_rnn(multicell, response_embedded, dtype=tf.float32)
    encoding_response = r_states[-1].h

# σ(cMr)
M = tf.get_variable('M', shape=[params['rnn_dim'], params['rnn_dim']], initializer=tf.truncated_normal_initializer())

# "Predict" a response: c * M
generated_response = tf.matmul(encoding_context, M)
generated_response = tf.expand_dims(generated_response, 2)
encoding_response = tf.expand_dims(encoding_response, 2)

# Batched dot product between generated response and actual response
# (the first operand is transposed), flattened to one logit per pair.
logits = tf.matmul(generated_response, encoding_response, transpose_a=True)
logits = tf.reshape(logits, [-1])

# Apply sigmoid to convert logits to probabilities (for prediction, not for loss)
probs = tf.sigmoid(logits)
correct_prediction = tf.logical_or( tf.logical_and(tf.equal(target,1), tf.greater_equal(probs,0.5)), tf.logical_and(tf.equal(target,0), tf.less(probs,0.5)))
accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))
# Calculate the binary cross-entropy loss
loss = tf.reduce_mean( tf.nn.sigmoid_cross_entropy_with_logits(logits=logits, labels=tf.to_float(target)), name='mean_loss_of_batch')

#train_step = tf.train.AdamOptimizer(params['learning_rate']).minimize(loss)
# Adam with per-tensor gradient norm clipping. compute_gradients() may return
# None for a variable the loss does not depend on; clip_by_norm cannot take
# None, so such pairs are skipped.
optimizer = tf.train.AdamOptimizer(params['learning_rate'])
gvs = optimizer.compute_gradients(loss)
capped_gvs = [(tf.clip_by_norm(grad, params['clip']), var) for grad, var in gvs if grad is not None]
train_step = optimizer.apply_gradients(capped_gvs)


In [None]:

# Load word2id vocab
# This cell repeats the vocabulary extraction done earlier in the notebook.
# The earlier cell deletes `word2vec_model` when it finishes, so the model has
# to be loaded again here; otherwise the lines below raise NameError.
word2vec_fname = 'models/word2vec_no_tc_offitial_200.model.bin'
word2vec_model = word2vec.Word2Vec.load(word2vec_fname)
extra_words = ['<pad>']
unknown_word = '<pad>'


# Extract word2vec
word2id = {}
id2word = [None] * (len(word2vec_model.wv.vocab) + len(extra_words)) 

# Extra tokens are given the ids that follow the word2vec vocabulary.
for i, word in enumerate(extra_words):
    word2id[word] = i + len(word2vec_model.wv.vocab)
    id2word[i + len(word2vec_model.wv.vocab)] = word

# Regular words keep the index stored on their vocabulary entry.
for k, v in word2vec_model.wv.vocab.items():
    word2id[k] = v.index
    id2word[v.index] = k

# Free the model once the lookup tables are built.
del(word2vec_model)
gc.collect()

In [72]:

# Load in sample and test
def _split_utterances(text):
    """Split a raw string on speaker labels and drop blank pieces.

    Every match of ``[A-Z]:`` is replaced by a tab, the string is split on tabs
    and pieces that are empty or whitespace-only are discarded. The pieces that
    remain are returned unstripped.
    """
    return [s for s in re.sub('[A-Z]:', '\t', text).split('\t') if len(s.strip())]


def _tokenize(text):
    """Segment ``text`` with jieba and drop tokens that are a single space."""
    return [word for word in jieba.cut(text) if word != ' ']


def _load_dialogue_file(path, n_options=6):
    """Read one dataset file and tokenize its dialogues and options.

    Parameters
    ----------
    path : str
        CSV file with ``dialogue`` and ``options`` columns.
    n_options : int
        Number of options every row must split into.

    Returns
    -------
    (frame, x1, x2)
        ``frame`` is the DataFrame as read; ``x1`` holds one token list per
        dialogue (its utterances joined by a space before segmentation); ``x2``
        holds, per row, one token list for each option.

    Raises
    ------
    AssertionError
        If any row does not split into exactly ``n_options`` options.
    """
    frame = pd.read_csv(path)
    dialogues = [_split_utterances(raw) for raw in frame.dialogue.values]
    options = [_split_utterances(raw) for raw in frame.options.values]
    assert(np.sum([len(opts) != n_options for opts in options]) == 0)
    x1 = [_tokenize(' '.join(utterances)) for utterances in dialogues]
    x2 = [[_tokenize(option) for option in opts] for opts in options]
    return frame, x1, x2


sample, sample_x1, sample_x2 = _load_dialogue_file('datas/sample_test_data.txt')
sample_y = sample.answer.values

test_datas, test_x1, test_x2 = _load_dialogue_file('datas/AIFirstProblem.txt')


In [73]:
def word_lst_2_id_lst(lst):
    """Convert a sequence of tokens into a list of vocabulary ids.

    Parameters
    ----------
    lst : sequence of str
        Tokens to convert.

    Returns
    -------
    list of int
        One id per token, in order and with the same length as ``lst`` (no
        padding is appended). A token that is not a key of the module-level
        ``word2id`` table is mapped to the id of ``'<pad>'``.

    Raises
    ------
    KeyError
        If ``'<pad>'`` itself is missing from ``word2id``.
    """
    pad_word_id = word2id['<pad>']
    return [word2id.get(word, pad_word_id) for word in lst]


In [74]:

# Convert every tokenized dialogue / option into its list of word ids.
# The id lists have different lengths, so the arrays are ragged and must be
# created with dtype=object: without it NumPy emits a deprecation warning for
# ragged input and, from 1.24 on, raises ValueError.
sample_id1 = np.array([word_lst_2_id_lst(s) for s in sample_x1], dtype=object)
sample_id2 = np.array([[word_lst_2_id_lst(r) for r in rs] for rs in sample_x2], dtype=object)
test_id1 = np.array([word_lst_2_id_lst(s) for s in test_x1], dtype=object)
test_id2 = np.array([[word_lst_2_id_lst(r) for r in rs] for rs in test_x2], dtype=object)


In [75]:
# Bookkeeping for this experiment: checkpoint directories under
# models/<exp_name>/ plus containers for the tracked metrics.
save_record_dir = 'models/%s/' %exp_name
record = {
    'newest_model_dir': 'models/' + exp_name +'/newest/',
    'best_model_dir': 'models/' + exp_name +'/best/',
    'loss_train': [],
    'loss_valid': [],
    'accuracy_valid': [],
    'best_iter': 0,
    'sample_correct': 0,
}

In [83]:
# Score every (dialogue, option) pair of the sample set with the best checkpoint.
saver = tf.train.Saver()
with tf.Session() as sess:
    # Load the weights saved under the experiment's "best" directory.
    saver.restore(sess, record['best_model_dir']+'model.ckpt')

    score = []
    for q, rs in zip(sample_id1, sample_id2):
        for r in rs:
            # One pair per run, so each feed is a batch of size one.
            feed = {
                context: [q],
                response: [r],
                keep_prob: params['keep_prob_valid'],
            }
            now_score = sess.run(probs, feed)[0]
            score.append(now_score)
            print(now_score, [id2word[idx] for idx in q], [id2word[idx] for idx in r])

    # One row per question, six option scores per row; the answer is the
    # column with the highest probability.
    score = np.reshape(np.array(score), (-1, 6))
    my_ans = score.argmax(axis=1)
    sample_correct = np.sum(my_ans == sample_y)
    print('sample correct %4d' % (sample_correct), flush=True)
    record['sample_correct'] = sample_correct.tolist()

INFO:tensorflow:Restoring parameters from models/dual_lstm_0/best/model.ckpt
0.999832 ['你', '這麼', '快', '就', '知道', '了'] ['全家', '就是', '你家']
0.999999 ['你', '這麼', '快', '就', '知道', '了'] ['付出', '不是', '浪費時間']
0.99746 ['你', '這麼', '快', '就', '知道', '了'] ['願意', '為', '社會', '付出', '的', '人太少', '了']
0.893584 ['你', '這麼', '快', '就', '知道', '了'] ['我', '都', '是', '一個', '人', '把', '他關', '在', '家裡']
1.0 ['你', '這麼', '快', '就', '知道', '了'] ['廢話', '你', '沒有', '聽過', '壞話', '傳', '千里']
1.0 ['你', '這麼', '快', '就', '知道', '了'] ['你', '不要', '以為', '你', '拳頭', '大', '大家', '都', '<pad>', '你', '我', '才', '不怕', '你']
0.999984 ['每', '一支', '冰塊', '都', '不同', '水質', '不同', '硬度', '不同'] ['紋路', '不同']
1.0 ['每', '一支', '冰塊', '都', '不同', '水質', '不同', '硬度', '不同'] ['也是', '你', '唯一', '發洩', '情緒', '的', '辦法', '吧']
3.10729e-13 ['每', '一支', '冰塊', '都', '不同', '水質', '不同', '硬度', '不同'] ['還記', '不', '記得', '那次', '你', '在', '我家', '巷口']
0.139485 ['每', '一支', '冰塊', '都', '不同', '水質', '不同', '硬度', '不同'] ['像', '拼命三郎', '一樣', '的', '跳舞']
1.0 ['每', '一支', '冰塊', '都', '不同', '水質', '不同', '硬度'

0.999736 ['網路上', '那些', '流', '言說', '如果說', '你', '把', '<pad>', '一些', '尿液', '會', '讓', '蝦', '活得', '比較', '久', '根本', '騙人', '的'] ['最', '新鮮', '的', '保存', '方式', '就是', '把', '牠', '急速', '冷凍', '起來']
1.0 ['網路上', '那些', '流', '言說', '如果說', '你', '把', '<pad>', '一些', '尿液', '會', '讓', '蝦', '活得', '比較', '久', '根本', '騙人', '的'] ['手語', '都', '有', '很多', '不一樣', '的', '比法']
0.999996 ['網路上', '那些', '流', '言說', '如果說', '你', '把', '<pad>', '一些', '尿液', '會', '讓', '蝦', '活得', '比較', '久', '根本', '騙人', '的'] ['<pad>', '是', '在', '哪裏', '學', '的']
0.000382212 ['網路上', '那些', '流', '言說', '如果說', '你', '把', '<pad>', '一些', '尿液', '會', '讓', '蝦', '活得', '比較', '久', '根本', '騙人', '的'] ['不然的話', '你', '冰', '就', '放下', '嘛']
2.76488e-08 ['可是', '為什麼', '會', '選擇', '到', '歐洲', '的', '<pad>', '那個', '城市', '去', '學習', '戲劇', '呢'] ['最早', '的', '人類', '先民', '們', '最', '早期', '的', '武士', '帶著', '青銅', '武器', '和', '<pad>']
0.0119575 ['可是', '為什麼', '會', '選擇', '到', '歐洲', '的', '<pad>', '那個', '城市', '去', '學習', '戲劇', '呢'] ['客家', '米食', '以前', '是', '過年', '過節', '才', '吃', '的']
0.999964 ['可是', '為什

1.0 ['你', '還有', '閒情逸致', '聽', '收音機', '啊', '我', '在', '聽', '今天', '星座', '運勢', '分析', '已經', '快', '講到', '我', '了'] ['只要', '她', '一', '高興', '你', '要', '借', '她', '的', '皇冠', '<pad>', '絕對', '沒有', '問題']
1.0 ['你', '還有', '閒情逸致', '聽', '收音機', '啊', '我', '在', '聽', '今天', '星座', '運勢', '分析', '已經', '快', '講到', '我', '了'] ['這種', '不', '科學', '的', '東西', '你還', '相信', '啊']
1.0 ['你', '還有', '閒情逸致', '聽', '收音機', '啊', '我', '在', '聽', '今天', '星座', '運勢', '分析', '已經', '快', '講到', '我', '了'] ['等', '我', '事業有成', '第一個', '回饋', '你', '好不好']
1.0 ['你', '還有', '閒情逸致', '聽', '收音機', '啊', '我', '在', '聽', '今天', '星座', '運勢', '分析', '已經', '快', '講到', '我', '了'] ['你', '是', '說', '跟', '同學', '收錢', '啊']
0.500838 ['你', '還有', '閒情逸致', '聽', '收音機', '啊', '我', '在', '聽', '今天', '星座', '運勢', '分析', '已經', '快', '講到', '我', '了'] ['<pad>']
3.5294e-08 ['小', '<pad>', '你', '跟', '我', '一起', '回家', '去', '好不好', '妳在', '開玩笑', '的', '吧'] ['我', '是從', '台灣', '來的']
1.0 ['小', '<pad>', '你', '跟', '我', '一起', '回家', '去', '好不好', '妳在', '開玩笑', '的', '吧'] ['麻煩', '幫', '我', '還給', '老爹']
1.0 ['小', '<pad>', 

0.987523 ['這是', '班遊', '地點', '的', '幾個', '提案', '請', '大家', '投票表決', '這些', '地點', '都', '好', '無聊', '誰', '想', '去', '啊'] ['那', '我', '要', '怎麼', '<pad>', '可以', '接得', '好']
0.90729 ['我', '的', '乖', '孫女', '呢', '她', '在', '房間', '裡', '讀書', '大概', '沒', '聽到', '您', '進來', '的', '聲音'] ['膝蓋', '有沒有', '感覺', '舒服', '一點']
0.999969 ['我', '的', '乖', '孫女', '呢', '她', '在', '房間', '裡', '讀書', '大概', '沒', '聽到', '您', '進來', '的', '聲音'] ['去', '叫', '她', '一下']
0.000223345 ['我', '的', '乖', '孫女', '呢', '她', '在', '房間', '裡', '讀書', '大概', '沒', '聽到', '您', '進來', '的', '聲音'] ['你', '笑', '得', '甜蜜蜜']
0.634842 ['我', '的', '乖', '孫女', '呢', '她', '在', '房間', '裡', '讀書', '大概', '沒', '聽到', '您', '進來', '的', '聲音'] ['可是', '我', '就', '覺得', '奶奶', '很', '可愛']
0.999998 ['我', '的', '乖', '孫女', '呢', '她', '在', '房間', '裡', '讀書', '大概', '沒', '聽到', '您', '進來', '的', '聲音'] ['放', '輕鬆', '一點']
0.986571 ['我', '的', '乖', '孫女', '呢', '她', '在', '房間', '裡', '讀書', '大概', '沒', '聽到', '您', '進來', '的', '聲音'] ['至少', '我', '努力', '過了', '嘛']
0.992054 ['那', '你', '知道', '為什麼', '會', '有', '攀岩', '<pad>', '為什麼

0.993162 ['豆油', '我長', '得', '這個', '樣子', '啊', '沒有', '啦', '我', '是畫', '你', '的', '神情', '你', '的', '神情', '這樣', '很', '像', '啊', '我', '就', '說', '你', '畫得', '這麼', '醜', '就', '去', '那個', '什麼', '社區', '新開', '的', '畫畫', '<pad>', '嘛'] ['每年', '四到', '九月', '是', '芒果', '成熟', '的', '季節']
1.0 ['豆油', '我長', '得', '這個', '樣子', '啊', '沒有', '啦', '我', '是畫', '你', '的', '神情', '你', '的', '神情', '這樣', '很', '像', '啊', '我', '就', '說', '你', '畫得', '這麼', '醜', '就', '去', '那個', '什麼', '社區', '新開', '的', '畫畫', '<pad>', '嘛'] ['不必', '啦', '我', '有', '天才', '的', '人', '不需要', '去學', '什麼']
0.988341 ['豆油', '我長', '得', '這個', '樣子', '啊', '沒有', '啦', '我', '是畫', '你', '的', '神情', '你', '的', '神情', '這樣', '很', '像', '啊', '我', '就', '說', '你', '畫得', '這麼', '醜', '就', '去', '那個', '什麼', '社區', '新開', '的', '畫畫', '<pad>', '嘛'] ['台南', '玉井', '<pad>', '一條', '長長的', '大道']
1.0 ['豆油', '我長', '得', '這個', '樣子', '啊', '沒有', '啦', '我', '是畫', '你', '的', '神情', '你', '的', '神情', '這樣', '很', '像', '啊', '我', '就', '說', '你', '畫得', '這麼', '醜', '就', '去', '那個', '什麼', '社區', '新開', '的', '畫畫', '<pad>', '嘛'] ['放些'

In [79]:
# Display the probability matrix produced by the prediction cell, which
# reshapes the collected scores to six columns (one per candidate response).
score

array([[  9.99832273e-01,   9.99999404e-01,   9.97460365e-01,
          8.93584251e-01,   1.00000000e+00,   1.00000000e+00],
       [  9.99984026e-01,   9.99999881e-01,   3.10728738e-13,
          1.39484659e-01,   1.00000000e+00,   7.70871520e-01],
       [  1.00000000e+00,   9.99999881e-01,   1.00000000e+00,
          4.01133224e-02,   7.74087461e-11,   5.69536947e-17],
       [  9.99999762e-01,   3.44671607e-02,   1.00000000e+00,
          8.32516789e-01,   5.27819910e-04,   7.80801356e-01],
       [  1.00000000e+00,   9.99995470e-01,   6.55183315e-01,
          1.00000000e+00,   9.99985337e-01,   9.94669497e-01],
       [  9.99991298e-01,   1.44149780e-07,   3.15514594e-01,
          3.89238835e-07,   2.81275392e-01,   6.46067038e-02],
       [  1.00000000e+00,   1.86311896e-04,   1.00000000e+00,
          1.77117442e-06,   1.00000000e+00,   1.00000000e+00],
       [  9.99992371e-01,   9.67398703e-01,   9.99997020e-01,
          1.00000000e+00,   1.17893033e-01,   1.00000000e+00],
