In [1]:
from gensim.models import word2vec
import numpy as np
from scipy import spatial

# Import & Init jieba
import jieba
# Use the "big" dictionary file as jieba's main dictionary; the log printed
# under this cell shows the prefix dict being built from this file.
jieba.set_dictionary('datas/dict/dict.txt.big')
# Register an additional user dictionary on top of the main one.
# NOTE(review): presumably domain-specific (education) vocabulary, judging
# only by the file name -- confirm.
jieba.load_userdict('datas/dict/edu_dict.txt')

# Import pandas
import pandas as pd
from pandas import Series, DataFrame

# Import util
import time
import re

from mini_batch_helper import extractor, MiniBatchCorpus

Using TensorFlow backend.
Building prefix dict from /home/sunset/Talk2AI_Contest/datas/dict/dict.txt.big ...
Loading model from cache /tmp/jieba.ufb7b5ede4bbc311ed39003ae859d1289.cache
Loading model cost 1.161 seconds.
Prefix dict has been built succesfully.


In [2]:
# Read in training data
#
# Everything needed by `extractor` is declared first; the call itself is at
# the bottom of the cell.

# Word-vector file handed to `extractor`.
word2vec_fname = 'models/word2vec/fine-tuned-2.txt'

# '<pad>' is added as an extra vocabulary entry; later cells look it up
# through word2id['<pad>'] to pad id sequences.
extra_words = ['<pad>']
unknown_word = None

# Sampling rate passed to `extractor` for the training files (1.0 here).
sample_rate_on_training_datas = 1.0

# One text file per programme.
corpus_fnames = [
    'datas/training_data/下課花路米.txt',
    'datas/training_data/人生劇展.txt',
    'datas/training_data/公視藝文大道.txt',
    'datas/training_data/成語賽恩思.txt',
    'datas/training_data/我的這一班.txt',
    'datas/training_data/流言追追追.txt',
    'datas/training_data/聽聽看.txt',
    'datas/training_data/誰來晚餐.txt',
]

(word2id, id2word, word_p,
 embedding_matrix, corpus, corpus_id) = extractor(
    word2vec_fname,
    corpus_fnames,
    sample_rate_on_training_datas,
    extra_words,
    unknown_word,
)

In [3]:
# Hyper-parameters

# Mini-batch size, and the fixed number of tokens every context / response
# is padded to before entering the network.
batchSize = 128
wordLen = 64

# Edge length and stride of the square convolution kernels.
filterSize = 10
strideSize = 1

# Number of feature maps produced by the first / second convolution layer.
fm1_num = 25
fm2_num = 50

# Recurrent cell selection (GRU when True, LSTM otherwise) and dropout switch.
use_gru = True
use_dropout = False

# Side length of a feature map after one, respectively two, rounds of
# "strided convolution + 2x2 max-pool"; each round divides the side length
# by 2 * strideSize (rounded down).
fm1Size = wordLen // (2 * strideSize)
fm2Size = fm1Size // (2 * strideSize)

In [4]:
# Data split
#
# No shuffling is applied to corpus_id before splitting, so the split is
# purely positional: the first `valid_corpus_num` entries of corpus_id are
# held out for validation and everything after them is used for training.
valid_corpus_num = 10

train_data_loader = MiniBatchCorpus(
    corpus_id[valid_corpus_num:], context_len=3, max_len=64)
valid_data_loader = MiniBatchCorpus(
    corpus_id[:valid_corpus_num], context_len=3, max_len=64)

# Report how many examples each loader exposes.
for _label, _loader in (('train datas num:', train_data_loader),
                        ('valid datas num:', valid_data_loader)):
    print(_label, _loader.data_num, flush=True)

train datas num: 5767705
valid datas num: 14102


In [5]:
# Load in sample and test questions
#
# Both CSV files go through exactly the same pipeline, so the pipeline lives
# in small private helpers and is applied once per file.

def _split_utterances(text):
    """Cut `text` at speaker tags such as 'A:' and drop blank pieces."""
    pieces = re.sub('[A-Z]:', '\t', text).split('\t')
    return [piece for piece in pieces if len(piece.strip())]


def _tokenize(text):
    """Segment `text` with jieba, discarding single-space tokens."""
    return [word for word in jieba.cut(text) if word != ' ']


def _load_questions(csv_fname):
    """Read a question file and return (frame, tokenized dialogues, tokenized options).

    The file must provide `dialogue` and `options` columns, and every row must
    split into exactly 6 options (checked with an assertion).
    """
    frame = pd.read_csv(csv_fname)
    dialogues = [_split_utterances(d) for d in frame.dialogue.values]
    options = [_split_utterances(o) for o in frame.options.values]
    assert all(len(opts) == 6 for opts in options)
    # A dialogue is tokenized as one string (utterances joined by a space);
    # each option is tokenized on its own.
    x1 = [_tokenize(' '.join(utterances)) for utterances in dialogues]
    x2 = [[_tokenize(option) for option in opts] for opts in options]
    return frame, x1, x2


sample, sample_x1, sample_x2 = _load_questions('datas/sample_test_data.txt')
sample_y = sample.answer.values

test_datas, test_x1, test_x2 = _load_questions('datas/AIFirstProblem.txt')
# The answer file has one header line; the answer is the last comma-separated
# field of every following line.
with open('datas/AIFirst_test_answer.txt', 'r') as f:
    f.readline()
    test_y = np.array([int(line.strip().split(',')[-1]) for line in f])

def word_lst_2_id_lst(lst, pad_to_len=-1):
    """Convert a list of words into a list of vocabulary ids.

    Words missing from `word2id` are mapped to the id of '<pad>'.

    Args:
        lst: sequence of word strings.
        pad_to_len: when positive, the result has exactly this length:
            shorter inputs are right-padded with the '<pad>' id and longer
            inputs are cut to their first `pad_to_len` ids. When zero or
            negative (the default) the length of `lst` is kept unchanged.

    Returns:
        A list of ids.
    """
    pad_word_id = word2id['<pad>']
    id_list = [word2id.get(word, pad_word_id) for word in lst]
    if pad_to_len > 0:
        # Enforce the fixed length in both directions. Without the truncation,
        # over-long inputs produce ragged rows that can neither be stacked
        # with np.array nor reshaped to the fixed-size matching matrices.
        id_list = id_list[:pad_to_len]
        id_list.extend([pad_word_id] * (pad_to_len - len(id_list)))
    return id_list

# Every tokenized dialogue / option is encoded as an id sequence padded to
# the sequence length the network is built for.
pad_to_length = wordLen


def _encode_dialogues(dialogues):
    """Encode one token list per question."""
    return np.array([word_lst_2_id_lst(tokens, pad_to_length) for tokens in dialogues])


def _encode_option_sets(option_sets):
    """Encode one list of option token lists per question."""
    return np.array([[word_lst_2_id_lst(tokens, pad_to_length) for tokens in options]
                     for options in option_sets])


sample_id1 = _encode_dialogues(sample_x1)
sample_id2 = _encode_option_sets(sample_x2)
test_id1 = _encode_dialogues(test_x1)
test_id2 = _encode_option_sets(test_x2)

In [6]:
# Define model
import tensorflow as tf

def compute_accuracy(next_x1, next_x2, _y, _keep_prob):
    """Return the classification accuracy of `prediction` on one batch.

    Args:
        next_x1: batch fed to the `context` placeholder.
        next_x2: batch fed to the `response` placeholder.
        _y: one-hot labels, one row per example.
        _keep_prob: value fed to the `keep_prob` placeholder.

    Returns:
        Fraction of rows whose arg-max prediction equals the arg-max label,
        as a float32 scalar.
    """
    y_pre = sess.run(prediction, feed_dict={context: next_x1, response: next_x2, keep_prob: _keep_prob})
    # The comparison is done in NumPy on purpose: building tf.equal /
    # tf.reduce_mean ops here would add new nodes to the graph on every call
    # (the graph would grow for as long as training runs) and would need a
    # second session run.
    hits = np.argmax(y_pre, axis=1) == np.argmax(_y, axis=1)
    return np.float32(np.mean(hits))
 
def weight_variable(shape):
    """Create a variable of the given shape, initialised uniformly between -1 and 1."""
    return tf.Variable(tf.random_uniform(shape, minval=-1.0, maxval=1.0))
 
def bias_variable(shape):
    """Create a bias variable of the given shape, initialised uniformly between -1 and 1."""
    start_values = tf.random_uniform(shape, minval=-1.0, maxval=1.0)
    bias = tf.Variable(start_values)
    return bias
 
def conv2d(x, W):
    """Convolve `x` with kernel `W`, using stride `strideSize` on the two middle axes and 'SAME' padding."""
    stride_spec = [1, strideSize, strideSize, 1]
    return tf.nn.conv2d(x, W, strides=stride_spec, padding='SAME')

def max_pool_2x2(x):
    """Max-pool `x` with a 2x2 window moved in steps of 2, using 'SAME' padding."""
    window = [1, 2, 2, 1]
    step = [1, 2, 2, 1]
    return tf.nn.max_pool(x, ksize=window, strides=step, padding='SAME')

# Input placeholders (everything the training / evaluation loops feed in).
# `context` and `response` are 2-D int32 tensors of word ids: they are the
# index arguments of the embedding lookups further down.
context = tf.placeholder(dtype=tf.int32, shape=(None, None), name='context')
# NOTE(review): `context_len` and `response_len` are fed by the training loop,
# but no op in the cells shown here consumes them (the dynamic_rnn calls are
# made without a sequence_length argument) -- confirm whether that is intended.
context_len = tf.placeholder(dtype=tf.int32, shape=(None,), name='context_len')
response = tf.placeholder(dtype=tf.int32, shape=(None, None), name='response')
response_len = tf.placeholder(dtype=tf.int32, shape=(None,), name='response_len')
# 2-D float32 labels; the training loop feeds one-hot rows with 2 columns.
target = tf.placeholder(dtype=tf.float32, shape=(None, None), name='target')
# Keep probability handed to the DropoutWrapper when use_dropout is True.
keep_prob = tf.placeholder(dtype=tf.float32, name='keep_prob')
# Learning rate of the optimizer, fed per step so the loop can decay it.
learning_rate = tf.placeholder(dtype=tf.float32, name='learning_rate')

In [7]:
# Embedding layer

# The embedding table is a variable with the same shape as embedding_matrix
# and is initialised with its values.
init_embedding_W = tf.constant_initializer(embedding_matrix)
embeddings_W = tf.get_variable('embeddings_W', shape=[embedding_matrix.shape[0], embedding_matrix.shape[1]], initializer=init_embedding_W)
# Both inputs are looked up in the same table, so context and response share
# one set of word vectors.
context_embedded = tf.nn.embedding_lookup(embeddings_W, context, name="embed_context")
response_embedded = tf.nn.embedding_lookup(embeddings_W, response, name="embed_response")
# The embedded sequences are passed through a recurrent layer in the next cell.

In [8]:
# RNN layer: encodes the embedded context and response sequences; the
# per-step outputs are used afterwards to build the second matching matrix.

# GRU cell with 200 units
if use_gru:
    cell = tf.contrib.rnn.GRUCell(num_units=200, reuse=tf.get_variable_scope().reuse)

# LSTM cell with 200 units (peephole connections enabled)
else:
    cell = tf.nn.rnn_cell.LSTMCell(num_units=200, forget_bias=1.0, use_peepholes=True, state_is_tuple=True, reuse=tf.get_variable_scope().reuse)

# Optional dropout on the cell's inputs and outputs, driven by the keep_prob placeholder.
if use_dropout:
    cell = tf.contrib.rnn.DropoutWrapper(cell, input_keep_prob=keep_prob, output_keep_prob=keep_prob)
# The same `cell` object is handed to both dynamic_rnn calls.
# NOTE(review): presumably this makes context and response share one set of
# recurrent weights -- confirm for the TensorFlow version in use.
# NOTE(review): no sequence_length is passed, so the padded positions are
# processed like real tokens.
c_outputs, c_states = tf.nn.dynamic_rnn(cell, context_embedded, dtype=tf.float32)
context_gru = c_outputs
r_outputs, r_states = tf.nn.dynamic_rnn(cell, response_embedded, dtype=tf.float32)
response_gru = r_outputs

In [9]:
# M1 matrix and M2 matrix
# Two context-vs-response similarity "images" are built (one from the word
# embeddings, one from the recurrent outputs); each goes through its own
# two-layer CNN, and the summed CNN features feed a 2-class classifier.

# M1 word dot matrix: dot product between every context word vector and every
# response word vector (the second operand is transposed by the matmul).
word_dot_matrix=tf.matmul(context_embedded, response_embedded, False, True)
# Viewed as a single-channel wordLen x wordLen image; this requires both
# sequences to be exactly wordLen tokens long.
m1_image=tf.reshape(word_dot_matrix, [-1, wordLen, wordLen, 1])
# Scaled by the maximum over the whole tensor (reduce_max is called without an axis).
# NOTE(review): the maximum is therefore taken across the entire batch, so an
# example's input to the CNN depends on what else is in the batch -- confirm
# this is intended.
m1_image=tf.divide(m1_image, tf.reduce_max(m1_image))

# M2 segment dot matrix: same construction, on the recurrent-layer outputs.
segment_dot_matrix=tf.matmul(context_gru, response_gru, False, True)
m2_image=tf.reshape(segment_dot_matrix, [-1, wordLen, wordLen, 1])
m2_image=tf.divide(m2_image, tf.reduce_max(m2_image))

# `target` is already declared as float32, so this cast leaves the dtype unchanged.
y_label=tf.cast(target, tf.float32)

# M1 convolution: conv (1 -> fm1_num maps) + sigmoid + 2x2 max-pool
W_conv1_m1 = weight_variable([filterSize,filterSize, 1, fm1_num])
b_conv1_m1 = bias_variable([fm1_num])
h_conv1_m1 = tf.nn.sigmoid(conv2d(m1_image, W_conv1_m1) + b_conv1_m1)
h_pool1_m1 = max_pool_2x2(h_conv1_m1)

# second layer: conv (fm1_num -> fm2_num maps) + sigmoid + 2x2 max-pool
W_conv2_m1 = weight_variable([filterSize,filterSize, fm1_num, fm2_num])
b_conv2_m1 = bias_variable([fm2_num])
h_conv2_m1 = tf.nn.sigmoid(conv2d(h_pool1_m1, W_conv2_m1) + b_conv2_m1)
h_pool2_m1 = max_pool_2x2(h_conv2_m1)

# Flatten to one feature vector of fm2Size*fm2Size*fm2_num values per example.
h_pool2_m1_flat = tf.reshape(h_pool2_m1, [-1, fm2Size*fm2Size*fm2_num])

# M2 convolution: same architecture as for M1, with its own weights
W_conv1_m2 = weight_variable([filterSize,filterSize, 1, fm1_num])
b_conv1_m2 = bias_variable([fm1_num])
h_conv1_m2 = tf.nn.sigmoid(conv2d(m2_image, W_conv1_m2) + b_conv1_m2)
h_pool1_m2 = max_pool_2x2(h_conv1_m2)

W_conv2_m2 = weight_variable([filterSize,filterSize, fm1_num, fm2_num])
b_conv2_m2 = bias_variable([fm2_num])
h_conv2_m2 = tf.nn.sigmoid(conv2d(h_pool1_m2, W_conv2_m2) + b_conv2_m2)
h_pool2_m2 = max_pool_2x2(h_conv2_m2)

h_pool2_m2_flat = tf.reshape(h_pool2_m2, [-1, fm2Size*fm2Size*fm2_num])

# Accumulate M1 and M2: element-wise sum of the two flattened feature vectors
matching_accumulation = tf.add(h_pool2_m1_flat, h_pool2_m2_flat)

# Fully connected hidden layer with wordLen*wordLen sigmoid units
W_fc1 = weight_variable([fm2Size*fm2Size*fm2_num, wordLen*wordLen])
b_fc1 = bias_variable([wordLen*wordLen])
h_fc1 = tf.nn.sigmoid(tf.matmul(matching_accumulation, W_fc1) + b_fc1)

# Output layer: softmax over 2 classes. The evaluation code treats column 1
# as the score of "this response is the right one".
W_fc2 = weight_variable([wordLen*wordLen, 2])
b_fc2 = bias_variable([2])
prediction = tf.nn.softmax(tf.matmul(h_fc1, W_fc2) + b_fc2)

In [None]:
# Cross-entropy between the one-hot labels and the softmax output, averaged
# over every element of the (batch, class) matrix.
# The probabilities are clipped away from 0 before taking the log: if the
# softmax underflows to exactly 0 for some class, log(0) is -inf and the
# product with a 0 label becomes NaN, which would poison the whole loss.
cross_entropy = -tf.reduce_mean(y_label * tf.log(tf.clip_by_value(prediction, 1e-10, 1.0)))
# Adagrad; the learning rate comes from a placeholder so the training loop
# can decay it between steps.
train_step = tf.train.AdagradOptimizer(learning_rate=learning_rate, initial_accumulator_value=0.001).minimize(cross_entropy)

# Alternative objective that was tried (mean squared error), kept for reference:
#mean_square_error=tf.reduce_mean(tf.multiply(tf.subtract(y_label, prediction), tf.subtract(y_label, prediction)))
#train_step=tf.train.AdagradOptimizer(learning_rate=0.01,initial_accumulator_value=0.1).minimize(mean_square_error)

In [None]:
def _count_correct(sess, contexts_id, options_id, answers, num_questions, num_options=6):
    """Count correctly answered multiple-choice questions.

    For each of the first `num_questions` questions, the context is paired
    with each of its `num_options` candidate responses, the resulting batch is
    scored by `prediction`, and the option with the highest column-1 score is
    taken as the guess.

    Args:
        sess: session in which the model variables are initialised.
        contexts_id: per-question context id sequences.
        options_id: per-question lists of candidate-response id sequences.
        answers: per-question answer; compared directly with the arg-max
            index, so it is assumed to be a 0-based option index -- TODO
            confirm against the data files.
        num_questions: how many questions (from the start) to evaluate.
        num_options: number of candidate responses per question.

    Returns:
        Number of questions whose guess equals the answer.
    """
    correct = 0
    for i in range(num_questions):
        _context = np.array([list(contexts_id[i]) for _ in range(num_options)])
        _response = np.array([list(options_id[i][j]) for j in range(num_options)])
        # Evaluation runs with keep_prob = 1.0 so dropout (when enabled) is
        # switched off. Only the placeholders `prediction` depends on are fed;
        # in particular no sequence lengths from an unrelated training batch.
        # Questions are scored one at a time because the matching matrices are
        # normalised by a batch-wide maximum, so batching would change scores.
        pred = sess.run(prediction, feed_dict={context: _context, response: _response, keep_prob: 1.0})
        if np.argmax(pred[:, 1]) == answers[i]:
            correct += 1
    return correct


# Training: 10000 Adagrad steps; every 10 steps the loss and accuracy on the
# current training batch and the number of correct answers on the sample
# (first 50) and test (first 500) questions are printed.
with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    lr = 0.01
    pad_id = word2id['<pad>']
    for it in range(10000):
        # The rate is divided by 10 every 500 steps.
        # NOTE(review): the condition is also true at it == 0, so the first
        # step already runs at 0.001 and the rate is 1e-7 from step 2000 on;
        # confirm that this schedule is intended.
        if it % 500 == 0:
            lr /= 10
        next_x1, next_x2, next_y, x1_len, x2_len = train_data_loader.next_batch(batch_size=batchSize, pad_to_length=wordLen, pad_word=pad_id, return_len=True)
        # One-hot labels with 2 columns.
        _y = np.zeros((batchSize, 2))
        _y[np.arange(batchSize), next_y] = 1
        sess.run(train_step, feed_dict={context: next_x1, response: next_x2, target: _y, keep_prob: 0.8, context_len: x1_len, response_len: x2_len, learning_rate: lr})
        if it % 10 == 0:
            # The loss is only needed for the log line, so it is evaluated
            # here rather than with an extra forward pass on every step.
            ce = sess.run(cross_entropy, feed_dict={context: next_x1, response: next_x2, target: _y, keep_prob: 1.0})
            acc = compute_accuracy(next_x1, next_x2, _y, 1.0)
            sample_acc = _count_correct(sess, sample_id1, sample_id2, sample_y, 50)
            test_acc = _count_correct(sess, test_id1, test_id2, test_y, 500)
            print(it, ce, acc, sample_acc, '/50 ', test_acc, '/500')

0 9.46654 0.523438 24 /50  196 /500
10 0.30681 0.679688 31 /50  246 /500
20 0.300697 0.664062 26 /50  245 /500
30 0.271612 0.703125 27 /50  240 /500
40 0.326902 0.625 27 /50  249 /500
50 0.456517 0.515625 28 /50  259 /500
60 0.264976 0.710938 27 /50  256 /500
70 0.330166 0.617188 27 /50  254 /500
80 0.296531 0.632812 27 /50  252 /500
90 0.267195 0.703125 27 /50  252 /500
100 0.295663 0.710938 28 /50  255 /500
110 0.296535 0.664062 30 /50  262 /500
120 0.288191 0.695312 29 /50  266 /500
130 0.304712 0.640625 31 /50  263 /500
140 0.27289 0.703125 30 /50  264 /500
150 0.286914 0.6875 27 /50  260 /500
160 0.29494 0.6875 30 /50  257 /500
170 0.275794 0.71875 31 /50  262 /500
180 0.275541 0.695312 30 /50  267 /500
190 0.263863 0.757812 31 /50  266 /500
200 0.249903 0.789062 31 /50  272 /500
210 0.256865 0.75 30 /50  263 /500
220 0.261549 0.742188 31 /50  267 /500
230 0.22177 0.757812 31 /50  265 /500
240 0.288955 0.695312 30 /50  269 /500
250 0.271509 0.710938 30 /50  269 /500
260 0.249353 0

2120 0.255637 0.765625 30 /50  265 /500
2130 0.259361 0.734375 30 /50  265 /500
2140 0.218776 0.78125 30 /50  265 /500
2150 0.263603 0.710938 30 /50  265 /500
2160 0.236603 0.78125 30 /50  265 /500
2170 0.293804 0.65625 30 /50  265 /500
2180 0.234742 0.78125 30 /50  265 /500
2190 0.248321 0.734375 30 /50  265 /500
2200 0.217372 0.804688 30 /50  265 /500
2210 0.26577 0.71875 30 /50  265 /500
2220 0.255356 0.765625 30 /50  265 /500
2230 0.234578 0.75 30 /50  265 /500
2240 0.222525 0.789062 30 /50  265 /500
2250 0.272716 0.75 30 /50  265 /500
2260 0.248925 0.757812 30 /50  265 /500
2270 0.241539 0.773438 30 /50  265 /500
2280 0.272049 0.773438 30 /50  265 /500
2290 0.283143 0.71875 30 /50  265 /500
2300 0.248947 0.742188 30 /50  265 /500
2310 0.240116 0.757812 30 /50  265 /500
2320 0.270183 0.75 30 /50  265 /500
2330 0.231859 0.765625 30 /50  265 /500
2340 0.23327 0.757812 30 /50  265 /500
2350 0.248678 0.75 30 /50  265 /500
2360 0.25135 0.703125 30 /50  265 /500
2370 0.26375 0.710938 30 