In [1]:
import pandas as pd
import numpy as np
import re
import jieba
jieba.set_dictionary('datas/dict/dict.txt.big')
jieba.load_userdict('datas/dict/edu_dict.txt')
import os
import time
import gc
import json
from mini_batch_helper import extractor, MiniBatchCorpus

Building prefix dict from /Users/sunset/Talk2AI_Contest/datas/dict/dict.txt.big ...
Loading model from cache /var/folders/43/l4vp_w_x4wb11mmy_bb1jrkc0000gn/T/jieba.u857f67a870683287981bc6f5b9493ffc.cache
Loading model cost 1.879 seconds.
Prefix dict has been built succesfully.


In [2]:
# reference: https://github.com/dennybritz/chatbot-retrieval/blob/8b1be4c2e63631b1180b97ef927dc2c1f7fe9bea/udc_hparams.py
# Name of the experiment; it selects the sub-directory of models/ to read from.
exp_name = 'dual_lstm_8'
# Load the hyper-parameters saved for this experiment.
# The `with` block closes the file on exit, so no explicit close() is needed.
with open('models/%s/model_parameters.json' % exp_name) as json_data:
    params = json.load(json_data)

In [3]:
# Build the vocabulary lookups and the embedding matrix.
# NOTE(review): presumably `extractor` loads the word2vec model named in
# params and appends the extra words to the vocabulary -- confirm in
# mini_batch_helper. No corpus files are passed here (empty list, 0).
extra_words = ['<pad>']
unknown_word = None
word2vec_fname = params['word2vec_model_name']

(word2id, id2word, word_p,
 embedding_matrix, corpus, corpus_id) = extractor(
    word2vec_fname, [], 0, extra_words, unknown_word)

In [4]:
# Bookkeeping for this experiment: checkpoint directories plus metric slots.
save_record_dir = 'models/%s/' % exp_name
record = {
    'newest_model_dir': 'models/%s/newest/' % exp_name,
    'best_model_dir': 'models/%s/best/' % exp_name,
    'loss_train': [],
    'loss_valid': [],
    'accuracy_valid': [],
    'best_iter': 0,
    'sample_correct': 0,
}

In [5]:
# Define model: a dual-encoder that encodes a context and a response with the
# same recurrent cell and scores the pair as sigmoid(c^T M r).
import tensorflow as tf

# Input placeholders.
# context / response: 2-D int32 word ids; the evaluation cells feed a single
# sequence wrapped in a list, i.e. one row per sequence.
# context_len / response_len: despite the names, the evaluation cells feed
# len(sequence) - 1 here, i.e. the index of the last token, which is what the
# one-hot mask below needs.
# target: one integer label per pair (compared against 0 and 1 below).
# keep_prob: dropout keep probability shared by all dropout wrappers.
context = tf.placeholder(dtype=tf.int32, shape=(None, None), name='context')
context_len = tf.placeholder(dtype=tf.int32, shape=(None,), name='context_len')
response = tf.placeholder(dtype=tf.int32, shape=(None, None), name='response')
response_len = tf.placeholder(dtype=tf.int32, shape=(None,), name='response_len')
target = tf.placeholder(dtype=tf.int32, shape=(None, ), name='target')
keep_prob = tf.placeholder(dtype=tf.float32, name='keep_prob')


#with tf.device('/gpu:0'):
# Embedding: one variable, initialised from the pre-trained embedding_matrix
# and used to look up both contexts and responses.
init_embedding_W = tf.constant_initializer(embedding_matrix)
embeddings_W = tf.get_variable('embeddings_W', shape=[embedding_matrix.shape[0], embedding_matrix.shape[1]], initializer=init_embedding_W)
context_embedded = tf.nn.embedding_lookup(embeddings_W, context, name="embed_context")
response_embedded = tf.nn.embedding_lookup(embeddings_W, response, name="embed_response")

if params['n_layers'] == 1:
    # Shared LSTM encoder: the same cell object is passed to both dynamic_rnn
    # calls. Dropout is applied to the cell's inputs and outputs.
    cell = tf.nn.rnn_cell.LSTMCell(num_units=params['rnn_dim'], forget_bias=2.0, 
                use_peepholes=True, state_is_tuple=True, reuse=tf.get_variable_scope().reuse)
    cell = tf.contrib.rnn.DropoutWrapper(cell, input_keep_prob=keep_prob, output_keep_prob=keep_prob)
    c_outputs, c_states = tf.nn.dynamic_rnn(cell, context_embedded, dtype=tf.float32)
    # One-hot over the time axis at position context_len, shaped
    # [batch, 1, time]; multiplying it with the outputs [batch, time, dim]
    # picks the output at that time step, and squeeze drops the singleton axis.
    mask = tf.expand_dims(tf.one_hot(context_len, depth=tf.shape(context)[1]), 1)
    encoding_context = tf.squeeze(tf.matmul(mask, c_outputs), 1)   # c_states.h
    r_outputs, r_states = tf.nn.dynamic_rnn(cell, response_embedded, dtype=tf.float32)
    # Same time-step selection for the response.
    mask = tf.expand_dims(tf.one_hot(response_len, depth=tf.shape(response)[1]), 1)
    encoding_response =  tf.squeeze(tf.matmul(mask, r_outputs), 1)  # r_states.h
else:
    # Stacked variant: n_layers LSTM cells with input dropout on every layer
    # and output dropout on the whole stack. state_is_tuple=False makes the
    # recurrent state one concatenated tensor instead of a tuple.
    cells = [tf.nn.rnn_cell.LSTMCell(num_units=params['rnn_dim'], forget_bias=2.0, use_peepholes=True, state_is_tuple=False, reuse=tf.get_variable_scope().reuse) 
                for _ in range(params['n_layers'])]
    dropcells = [tf.contrib.rnn.DropoutWrapper(cell,input_keep_prob=keep_prob) for cell in cells]
    multicell = tf.contrib.rnn.MultiRNNCell(dropcells, state_is_tuple=False)
    multicell = tf.contrib.rnn.DropoutWrapper(multicell, output_keep_prob=keep_prob)
    c_outputs, c_states = tf.nn.dynamic_rnn(multicell, context_embedded, dtype=tf.float32)
    # Select the output at time index context_len (see the single-layer branch).
    mask = tf.expand_dims(tf.one_hot(context_len, depth=tf.shape(context)[1]), 1)
    encoding_context = tf.squeeze(tf.matmul(mask, c_outputs), 1)   # c_states.h
    r_outputs, r_states = tf.nn.dynamic_rnn(multicell, response_embedded, dtype=tf.float32)
    # Select the output at time index response_len.
    mask = tf.expand_dims(tf.one_hot(response_len, depth=tf.shape(response)[1]), 1)
    encoding_response =  tf.squeeze(tf.matmul(mask, r_outputs), 1)  # r_states.h

# σ(cMr): M is the learned [rnn_dim, rnn_dim] bilinear matrix.
M = tf.get_variable('M', shape=[params['rnn_dim'], params['rnn_dim']], initializer=tf.truncated_normal_initializer(stddev=0.01))

# "Predict" a  response: c * M
# Both encodings get a trailing singleton axis so the batched matmul below
# can compute one dot product per pair.
generated_response = tf.matmul(encoding_context, M)
generated_response = tf.expand_dims(generated_response, 2)
encoding_response = tf.expand_dims(encoding_response, 2)

# Dot product between generated response and actual response
# (the third positional argument of tf.matmul is transpose_a).
logits = tf.matmul(generated_response, encoding_response, True)
logits = tf.reshape(logits, [-1])

# Apply sigmoid to convert logits to probabilities (for prediction, not for loss)
probs = tf.sigmoid(logits)
# A prediction counts as correct when target == 1 and prob >= 0.5, or when
# target == 0 and prob < 0.5.
correct_prediction = tf.logical_or( tf.logical_and(tf.equal(target,1), tf.greater_equal(probs,0.5)), tf.logical_and(tf.equal(target,0), tf.less(probs,0.5)))
accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))
# Calculate the binary cross-entropy loss
loss = tf.reduce_mean(tf.nn.sigmoid_cross_entropy_with_logits(logits=logits, labels=tf.to_float(target)))
# L1 penalty on M, weighted by params['l1_loss'].
loss = loss + params['l1_loss'] * tf.reduce_sum(tf.abs(M))

# Adam with per-tensor gradient clipping: every gradient is clipped to norm
# params['clip'] on its own (this is not a global-norm clip).
# NOTE(review): compute_gradients can yield None for variables unconnected to
# the loss, which clip_by_norm would reject -- presumably none exist here.
#train_step = tf.train.AdamOptimizer(params['learning_rate']).minimize(loss)
optimizer = tf.train.AdamOptimizer(params['learning_rate'])
gvs = optimizer.compute_gradients(loss)
capped_gvs = [(tf.clip_by_norm(grad, params['clip']), var) for grad, var in gvs]
train_step = optimizer.apply_gradients(capped_gvs)



In [6]:
# Load in sample and test
def _split_utterances(text):
    """Split a raw string on speaker tags ('A:', 'B:', ...) and tabs, dropping blank pieces."""
    return [s for s in re.sub('[A-Z]:', '\t', text).split('\t') if len(s.strip())]


def _tokenize(text):
    """Segment text with jieba and discard tokens that are a single space."""
    return [word for word in jieba.cut(text) if word != ' ']


def _load_dialogue_csv(path, n_options=6):
    """Read a contest CSV and tokenize its dialogues and candidate options.

    Args:
        path: CSV file with `dialogue` and `options` columns.
        n_options: number of candidate options every row must contain.

    Returns:
        (frame, x1, x2): the raw DataFrame, one token list per dialogue (all
        utterances joined with spaces before segmentation), and one list of
        token lists per row (one token list per option).

    Raises:
        AssertionError: if any row does not have exactly `n_options` options.
    """
    frame = pd.read_csv(path)
    dialogues = [_split_utterances(raw) for raw in frame.dialogue.values]
    options = [_split_utterances(raw) for raw in frame.options.values]
    # Validate before the (slow) segmentation, and name the offending file.
    assert all(len(opts) == n_options for opts in options), \
        '%s: every row must offer exactly %d options' % (path, n_options)
    x1 = [_tokenize(' '.join(utterances)) for utterances in dialogues]
    x2 = [[_tokenize(option) for option in opts] for opts in options]
    return frame, x1, x2


sample, sample_x1, sample_x2 = _load_dialogue_csv('datas/sample_test_data.txt')
sample_y = sample.answer.values

test_datas, test_x1, test_x2 = _load_dialogue_csv('datas/AIFirstProblem.txt')

In [7]:
def word_lst_2_id_lst(lst, pad_to_len=-1):
    """Convert a list of words into a list of vocabulary ids.

    Words missing from the global ``word2id`` mapping are replaced by the id
    of ``'<pad>'``. When ``pad_to_len`` exceeds the number of words, the
    result is right-padded with that same id; longer inputs are returned in
    full (nothing is truncated).

    Args:
        lst: sequence of word strings.
        pad_to_len: minimum length of the returned list; the default -1
            disables padding.

    Returns:
        A new list of ids, at least ``pad_to_len`` long.
    """
    filler = word2id['<pad>']
    ids = [word2id[word] if word in word2id else filler for word in lst]
    shortfall = pad_to_len - len(ids)
    if shortfall > 0:
        ids += [filler] * shortfall
    return ids

In [8]:
# Target length handed to word_lst_2_id_lst. -1 is never greater than a
# sequence length, so no padding is added and sequences keep their own length.
pad_to_length = -1

In [9]:
# Map every token list to vocabulary ids: one id list per dialogue, and one
# id list per candidate option of each dialogue.
sample_id1 = np.array([word_lst_2_id_lst(tokens, pad_to_length) for tokens in sample_x1])
sample_id2 = np.array([
    [word_lst_2_id_lst(tokens, pad_to_length) for tokens in candidates]
    for candidates in sample_x2
])
test_id1 = np.array([word_lst_2_id_lst(tokens, pad_to_length) for tokens in test_x1])
test_id2 = np.array([
    [word_lst_2_id_lst(tokens, pad_to_length) for tokens in candidates]
    for candidates in test_x2
])

In [10]:
# Restore the best checkpoint of this experiment into a fresh session.
best_ckpt_path = '%smodel.ckpt' % record['best_model_dir']
saver = tf.train.Saver()
sess = tf.Session()
saver.restore(sess, best_ckpt_path)

INFO:tensorflow:Restoring parameters from models/dual_lstm_8/best/model.ckpt


In [11]:
# Score every (dialogue, option) pair with the trained model, one pair per
# session run, then pick the highest-scoring option for each dialogue.
score = []
for q, rs in zip(sample_id1, sample_id2):
    for r in rs:
        feed = {
            context: [q],
            context_len: [len(q) - 1],    # index of the last token, not a length
            response: [r],
            response_len: [len(r) - 1],
            keep_prob: params['keep_prob_valid'],
        }
        now_score = sess.run(probs, feed)[0]
        score.append(now_score)
score = np.array(score).reshape(-1, 6)    # six options per dialogue
# NOTE(review): ranking uses sigmoid outputs; if several options saturate to
# the same value, argmax returns the first of them. Ranking on `logits`
# would avoid that -- TODO confirm whether this explains the low score.
my_ans = score.argmax(axis=1)
sample_correct = np.sum(my_ans == sample_y)
print('sample correct %4d' % (sample_correct), flush=True)
record['sample_correct'] = sample_correct.tolist()

sample correct    8


In [None]:
# # Export embedding layer for naive method to use
# now_emb_w = sess.run(embeddings_W)
# with open('./models/word2vec/word2vec-dual-lstm-8.txt', 'w') as f:
#     assert(len(id2word) == now_emb_w.shape[0])
#     f.write('%d %d\n' % now_emb_w.shape)
#     for word, vec in zip(id2word, now_emb_w):
#         f.write('%s %s\n' % (word, ' '.join([str(f) for f in vec])))

In [31]:
def _sample_context_state(ids):
    """Feed one id sequence through the context branch and return `c_states` for it.

    NOTE(review): the trailing [0] drops the batch axis only when `c_states`
    is a single tensor (the multi-layer branch, state_is_tuple=False); with
    the single-layer LSTMStateTuple it would select the first tuple member.
    """
    fetched = sess.run(c_states, {
        context: [ids],
        context_len: [len(ids) - 1],
        keep_prob: params['keep_prob_valid'],
    })
    return fetched[0]


# Final recurrent state for every sample dialogue (qq) and for each of its
# options (rr); options also go through the context branch.
qq = []
rr = []
for q, rs in zip(sample_id1, sample_id2):
    qq.append(_sample_context_state(q))
    rr.extend(_sample_context_state(r) for r in rs)
qq = np.array(qq)
rr = np.array(rr).reshape(-1, 6, qq.shape[-1])
qq.shape, rr.shape

((50, 1024), (50, 6, 1024))

In [33]:
# Rank the six options of each dialogue by the raw dot product between the
# dialogue vector and the option vector.
state_dotsim = np.array([
    np.dot(q, r)
    for q, rs in zip(qq, rr)
    for r in rs
]).reshape(-1, 6)
my_ans = state_dotsim.argmax(axis=1)
sample_correct = np.sum(my_ans == sample_y)
print('sample correct %4d' % (sample_correct), flush=True)

sample correct   27


In [35]:
# Rank the six options of each dialogue by cosine similarity between the
# dialogue vector and the option vector.
state_cossim = []
for q, rs in zip(qq, rr):
    q_norm = np.linalg.norm(q)    # same for all options of this dialogue
    state_cossim.extend(np.dot(q, r) / q_norm / np.linalg.norm(r) for r in rs)
state_cossim = np.array(state_cossim).reshape(-1, 6)
my_ans = state_cossim.argmax(axis=1)
sample_correct = np.sum(my_ans == sample_y)
print('sample correct %4d' % (sample_correct), flush=True)

sample correct   30


In [40]:
def _sample_context_encoding(ids):
    """Feed one id sequence through the context branch and return its `encoding_context` row."""
    fetched = sess.run(encoding_context, {
        context: [ids],
        context_len: [len(ids) - 1],    # index of the last token
        keep_prob: params['keep_prob_valid'],
    })
    return fetched[0]    # batch of one -> single encoding vector


# Same extraction as the recurrent-state experiment, but using the encoder
# output selected at the last token instead of the recurrent state.
qq = []
rr = []
for q, rs in zip(sample_id1, sample_id2):
    qq.append(_sample_context_encoding(q))
    rr.extend(_sample_context_encoding(r) for r in rs)
qq = np.array(qq)
rr = np.array(rr).reshape(-1, 6, qq.shape[-1])
qq.shape, rr.shape

((50, 256), (50, 6, 256))

In [41]:
# Dot-product ranking of the six options, computed on the current qq / rr.
pair_dots = [np.dot(q, r) for q, rs in zip(qq, rr) for r in rs]
state_dotsim = np.array(pair_dots).reshape(-1, 6)
my_ans = state_dotsim.argmax(axis=1)
sample_correct = np.sum(my_ans == sample_y)
print('sample correct %4d' % (sample_correct), flush=True)

sample correct   29


In [42]:
# Cosine-similarity ranking of the six options, computed on the current qq / rr.
pair_cosines = [
    np.dot(q, r) / np.linalg.norm(q) / np.linalg.norm(r)
    for q, rs in zip(qq, rr)
    for r in rs
]
state_cossim = np.array(pair_cosines).reshape(-1, 6)
my_ans = state_cossim.argmax(axis=1)
sample_correct = np.sum(my_ans == sample_y)
print('sample correct %4d' % (sample_correct), flush=True)

sample correct   26


In [47]:
def _test_context_state(ids):
    """Feed one id sequence through the context branch and return `c_states` for it.

    NOTE(review): the trailing [0] drops the batch axis only when `c_states`
    is a single tensor (the multi-layer branch, state_is_tuple=False).
    """
    fetched = sess.run(c_states, {
        context: [ids],
        context_len: [len(ids) - 1],
        keep_prob: params['keep_prob_valid'],
    })
    return fetched[0]


# Final recurrent state for every test dialogue (qq) and each of its six
# options (rr).
qq = []
rr = []
for q, rs in zip(test_id1, test_id2):
    qq.append(_test_context_state(q))
    rr.extend(_test_context_state(r) for r in rs)
qq = np.array(qq)
rr = np.array(rr).reshape(-1, 6, qq.shape[-1])
qq.shape, rr.shape

((500, 1024), (500, 6, 1024))

In [48]:
# Cosine similarity between every test dialogue vector and each of its six
# option vectors; one row per dialogue.
state_cossim = np.array([
    np.dot(q, r) / np.linalg.norm(q) / np.linalg.norm(r)
    for q, rs in zip(qq, rr)
    for r in rs
]).reshape(-1, 6)

In [50]:
# Zero-based index of the most similar option for every test dialogue.
my_test_ans = state_cossim.argmax(axis=1)

In [53]:
# Write the submission file: a header row, then one "id,ans" row per test
# dialogue (ids start at 1), ending with a newline.
with open('answer/attack-dual-lstm-8-rnn-state-cos-sim.txt', 'w') as fo:
    answer_rows = ['%d,%s' % (row_id, ans) for row_id, ans in enumerate(my_test_ans, start=1)]
    fo.writelines(['id,ans\n', '\n'.join(answer_rows), '\n'])