In [None]:
from gensim.models import word2vec
import numpy as np
from scipy import spatial

# Import & Init jieba
import jieba
jieba.set_dictionary('datas/dict/dict.txt.big')
jieba.load_userdict('datas/dict/edu_dict.txt')

# Import pandas
import pandas as pd
from pandas import Series, DataFrame

# Import util
import json
import os
import re
import time

from mini_batch_helper import extractor, MiniBatchCorpus

In [None]:
# Training-data configuration: pretrained word2vec file plus one corpus file per TV programme.
word2vec_fname = 'models/word2vec/fine-tuned-2.txt'
corpus_fnames = [
    'datas/training_data/下課花路米.txt',
    'datas/training_data/人生劇展.txt',
    'datas/training_data/公視藝文大道.txt',
    'datas/training_data/成語賽恩思.txt',
    'datas/training_data/我的這一班.txt',
    'datas/training_data/流言追追追.txt',
    'datas/training_data/聽聽看.txt',
    'datas/training_data/誰來晚餐.txt',
]
# Rate handed to extractor() for the training data (1.0 is the value used here).
sample_rate_on_training_datas = 1.0
# Additional vocabulary entries; '<pad>' is looked up later when sequences are padded.
extra_words = ['<pad>']
unknown_word = None

# extractor() comes from mini_batch_helper; its six return values are unpacked positionally.
(
    word2id,
    id2word,
    word_p,
    embedding_matrix,
    corpus,
    corpus_id,
) = extractor(
    word2vec_fname,
    corpus_fnames,
    sample_rate_on_training_datas,
    extra_words,
    unknown_word,
)

In [None]:
# Hold out the first `valid_corpus_num` entries of corpus_id for validation; the rest is training data.
# NOTE(review): the granularity of a corpus_id entry (file vs. dialogue) is not visible here -
# confirm that corpus_id has more than `valid_corpus_num` entries so the training split is not empty.
valid_corpus_num = 10
loader_options = dict(context_len=3, max_len=64)

train_split = corpus_id[valid_corpus_num:]
valid_split = corpus_id[:valid_corpus_num]

train_data_loader = MiniBatchCorpus(train_split, **loader_options)
valid_data_loader = MiniBatchCorpus(valid_split, **loader_options)

print('train datas num:', train_data_loader.data_num, flush=True)
print('valid datas num:', valid_data_loader.data_num, flush=True)

In [None]:
exp_name = 'dual_lstm_?'

# All hyperparameters live in one dict so they can be dumped to JSON next to the model.
# Keys are inserted in the same order as before, so the exported file is unchanged.

# ---- Model parameters ----
hp = {
    'word2vec_model_name': word2vec_fname,
    'word2vec_vocab_size': embedding_matrix.shape[0],
    'word2vec_dim': embedding_matrix.shape[1],
    'rnn_dim': 256,  # 200
    'forget_bias': 1.0,  # 0.0
    'word_len': 64,
    'filter_size': 10,
    'stride_size': 1,
    'fm1_num': 25,
    'fm2_num': 50,
    'cell_type': 'gru',  # 'gru' or 'lstm'
    'keep_prob': 0.8,  # 0.8 , 0.5 !
}

# Side lengths of the feature maps, derived from word_len and stride_size.
shrink_per_stage = 2 * hp['stride_size']
hp['fm1_size'] = int(hp['word_len'] / shrink_per_stage)  # unused ??
hp['fm2_size'] = int(hp['word_len'] / shrink_per_stage / shrink_per_stage)

# ---- Training parameters ----
hp.update([
    ('learning_rate', 1e-3),
    ('decay_learning_rate', 0.8),
    ('decay_times_no_improve', 5),
    ('clip', 15),
    ('batch_size', 256),
])
# 20 * (number of training examples) / batch_size, truncated to an int.
hp['n_iterations'] = int(20 * train_data_loader.data_num / hp['batch_size'])

In [None]:
# Export the hyperparameters as JSON next to the model files.
# Requires the `os` and `json` modules (imported with the other utilities at the top of the notebook).
save_hp_dir = 'models/%s/' %exp_name
# exist_ok=True replaces the check-then-create pair, so the cell can be re-run safely.
os.makedirs(save_hp_dir, exist_ok=True)
with open(os.path.join(save_hp_dir, 'model_parameters.json'), 'w') as f:
    json.dump(hp, f, indent=1)

In [None]:
''' Parameters from Original SMN file
batchSize=128
wordLen=64

filterSize=10
strideSize=1

fm1_num=25
fm2_num=50

use_gru=True #if false, use lstm
use_dropout=False

fm1Size=int(wordLen/(2*strideSize))
fm2Size=int(wordLen/(2*strideSize)/(2*strideSize))
'''

In [None]:
# Load the sample set and the test set, then segment every dialogue / option with jieba.
def _split_utterances(raw_text):
    """Split a speaker-tagged string into its non-blank utterances.

    Every match of '[A-Z]:' is replaced by a tab, the text is split on tabs,
    and pieces that are empty after stripping whitespace are dropped.
    """
    pieces = re.sub('[A-Z]:', '\t', raw_text).split('\t')
    return [piece for piece in pieces if piece.strip()]


def _tokenize(text):
    """Segment `text` with jieba, dropping tokens that are exactly one space."""
    return [token for token in jieba.cut(text) if token != ' ']


sample = pd.read_csv('datas/sample_test_data.txt')
sample_x1 = [_split_utterances(raw) for raw in sample.dialogue.values]
sample_x2 = [_split_utterances(raw) for raw in sample.options.values]
sample_y = sample.answer.values
# Every question must come with exactly six candidate responses.
assert all(len(options) == 6 for options in sample_x2)
# A dialogue becomes one token list: its utterances are joined by spaces before segmentation.
sample_x1 = [_tokenize(' '.join(utterances)) for utterances in sample_x1]
sample_x2 = [[_tokenize(option) for option in options] for options in sample_x2]

test_datas = pd.read_csv('datas/AIFirstProblem.txt')
test_x1 = [_split_utterances(raw) for raw in test_datas.dialogue.values]
test_x2 = [_split_utterances(raw) for raw in test_datas.options.values]
assert all(len(options) == 6 for options in test_x2)
test_x1 = [_tokenize(' '.join(utterances)) for utterances in test_x1]
test_x2 = [[_tokenize(option) for option in options] for options in test_x2]
with open('datas/AIFirst_test_answer.txt', 'r') as f:
    f.readline()  # skip the first line of the answer file
    # The answer is the last comma-separated field of each remaining line.
    answers = [int(row.strip().split(',')[-1]) for row in f]
test_y = np.array(answers)

def word_lst_2_id_lst(lst, pad_to_len=-1, vocab=None):
    """Convert a list of words into a list of vocabulary ids, right-padded with the '<pad>' id.

    Args:
        lst: sequence of word strings.
        pad_to_len: minimum length of the result. Shorter sequences are extended
            with the '<pad>' id; the default (-1) disables padding. Sequences
            longer than `pad_to_len` are returned at full length (no truncation).
        vocab: word -> id mapping containing a '<pad>' entry. Defaults to the
            notebook-level `word2id`.

    Returns:
        A list of ids. Words missing from the vocabulary map to the '<pad>' id.
    """
    if vocab is None:
        vocab = word2id
    pad_word_id = vocab['<pad>']
    id_list = [vocab[word] if word in vocab else pad_word_id for word in lst]
    missing = pad_to_len - len(id_list)
    if missing > 0:
        id_list.extend([pad_word_id] * missing)
    return id_list

# Encode the tokenized dialogues / options as id arrays padded to the model's word length.
pad_to_length = hp['word_len']


def _to_padded_ids(tokens):
    """Map one token list to ids, padded up to `pad_to_length`."""
    return word_lst_2_id_lst(tokens, pad_to_length)


def _options_to_padded_ids(options):
    """Apply `_to_padded_ids` to every candidate response of one question."""
    return [_to_padded_ids(option) for option in options]


sample_id1 = np.array(list(map(_to_padded_ids, sample_x1)))
sample_id2 = np.array(list(map(_options_to_padded_ids, sample_x2)))
test_id1 = np.array(list(map(_to_padded_ids, test_x1)))
test_id2 = np.array(list(map(_options_to_padded_ids, test_x2)))

In [None]:
# Define model
import tensorflow as tf

def compute_accuracy(next_x1, next_x2, _y, _keep_prob):
    """Return the fraction of rows whose predicted class equals the labelled class.

    Args:
        next_x1: batch fed to the `context` placeholder.
        next_x2: batch fed to the `response` placeholder.
        _y: label rows; the labelled class of a row is its argmax.
        _keep_prob: value fed to the `keep_prob` placeholder.

    Returns:
        The accuracy as a numpy float32 scalar.

    The comparison is done in NumPy on the fetched predictions. Building
    tf.equal / tf.argmax ops here would add new nodes to the graph on every
    call and require a second session run.
    """
    y_pre = sess.run(prediction, feed_dict={context: next_x1, response: next_x2, keep_prob: _keep_prob})
    hits = np.argmax(y_pre, axis=1) == np.argmax(_y, axis=1)
    return np.float32(np.mean(hits))
 
def weight_variable(shape):
    """Create a tf.Variable of the given shape, initialised uniformly at random between -1.0 and 1.0."""
    return tf.Variable(tf.random_uniform(shape, minval=-1.0, maxval=1.0))
 
def bias_variable(shape):
    """Create a bias tf.Variable of the given shape, initialised uniformly at random between -1.0 and 1.0."""
    return tf.Variable(tf.random_uniform(shape, minval=-1.0, maxval=1.0))
 
def conv2d(x, W):
    """Apply a 'SAME'-padded 2-D convolution of `x` with filter `W`.

    Both spatial strides are taken from hp['stride_size'].
    """
    step = hp['stride_size']
    conv_strides = [1, step, step, 1]
    return tf.nn.conv2d(x, W, strides=conv_strides, padding='SAME')

def max_pool_2x2(x):
    """Max-pool `x` with a 2x2 window and stride 2 in both spatial dimensions ('SAME' padding)."""
    window = [1, 2, 2, 1]
    return tf.nn.max_pool(x, ksize=window, strides=window, padding='SAME')

# Input placeholders.
# `context` / `response` carry integer id matrices; `target` carries the label rows.
context = tf.placeholder(tf.int32, [None, None], name='context')
context_len = tf.placeholder(tf.int32, [None], name='context_len')
response = tf.placeholder(tf.int32, [None, None], name='response')
response_len = tf.placeholder(tf.int32, [None], name='response_len')
target = tf.placeholder(tf.float32, [None, None], name='target')
# Scalars fed at run time: dropout keep probability and optimizer learning rate.
keep_prob = tf.placeholder(tf.float32, name='keep_prob')
learning_rate = tf.placeholder(tf.float32, name='learning_rate')

In [None]:
# Embedding layer: one variable initialised from the pretrained matrix,
# shared by the context lookup and the response lookup.

vocab_rows = embedding_matrix.shape[0]
embed_cols = embedding_matrix.shape[1]
init_embedding_W = tf.constant_initializer(embedding_matrix)
embeddings_W = tf.get_variable(
    'embeddings_W',
    shape=[vocab_rows, embed_cols],
    initializer=init_embedding_W,
)
context_embedded = tf.nn.embedding_lookup(embeddings_W, context, name="embed_context")
response_embedded = tf.nn.embedding_lookup(embeddings_W, response, name="embed_response")
# here should pass a gru

In [None]:
# RNN layer: a single recurrent cell (GRU or LSTM, per hp['cell_type']) wrapped in dropout
# and unrolled over both the embedded context and the embedded response.
assert hp['cell_type'] in ('gru', 'lstm')
scope_reuse = tf.get_variable_scope().reuse
if hp['cell_type'] == 'lstm':
    cell = tf.nn.rnn_cell.LSTMCell(
        num_units=hp['rnn_dim'],
        forget_bias=hp['forget_bias'],
        use_peepholes=True,
        state_is_tuple=True,
        reuse=scope_reuse,
    )
elif hp['cell_type'] == 'gru':
    cell = tf.contrib.rnn.GRUCell(num_units=hp['rnn_dim'], reuse=scope_reuse)
# Dropout is applied to both the inputs and the outputs of the cell.
cell = tf.contrib.rnn.DropoutWrapper(cell, input_keep_prob=keep_prob, output_keep_prob=keep_prob)

# No sequence_length is passed to dynamic_rnn, so padded positions are run through the cell too.
c_outputs, c_states = tf.nn.dynamic_rnn(cell, context_embedded, dtype=tf.float32)
r_outputs, r_states = tf.nn.dynamic_rnn(cell, response_embedded, dtype=tf.float32)
context_rnn = c_outputs
response_rnn = r_outputs

In [None]:
# M1 matrix and M2 matrix
# Both are batched dot products between every context position and every response position,
# reshaped into single-channel "images". The reshape requires both inputs to be exactly
# hp['word_len'] positions long.

# M1 word dot matrix: similarity between word embeddings.
word_dot_matrix = tf.matmul(context_embedded, response_embedded, transpose_b=True)
m1_image = tf.reshape(word_dot_matrix, [-1, hp['word_len'], hp['word_len'], 1])
# Scale by the largest entry of the whole batch.
m1_image = tf.divide(m1_image, tf.reduce_max(m1_image))

# M2 segment dot matrix: similarity between RNN outputs.
segment_dot_matrix = tf.matmul(context_rnn, response_rnn, transpose_b=True)
m2_image = tf.reshape(segment_dot_matrix, [-1, hp['word_len'], hp['word_len'], 1])
m2_image = tf.divide(m2_image, tf.reduce_max(m2_image))

y_label = tf.cast(target, tf.float32)
# M1 convolution
# Two (conv -> sigmoid -> 2x2 max-pool) stages over the word-level matching image,
# followed by flattening to one row per example.
m1_kernel_hw = [hp['filter_size'], hp['filter_size']]

# Stage 1: 1 input channel -> fm1_num feature maps.
W_conv1_m1 = weight_variable(m1_kernel_hw + [1, hp['fm1_num']])
b_conv1_m1 = bias_variable([hp['fm1_num']])
h_conv1_m1 = tf.nn.sigmoid(conv2d(m1_image, W_conv1_m1) + b_conv1_m1)
h_pool1_m1 = max_pool_2x2(h_conv1_m1)

# Stage 2: fm1_num -> fm2_num feature maps.
W_conv2_m1 = weight_variable(m1_kernel_hw + [hp['fm1_num'], hp['fm2_num']])
b_conv2_m1 = bias_variable([hp['fm2_num']])
h_conv2_m1 = tf.nn.sigmoid(conv2d(h_pool1_m1, W_conv2_m1) + b_conv2_m1)
h_pool2_m1 = max_pool_2x2(h_conv2_m1)

# Flatten to fm2_size * fm2_size * fm2_num values per example.
m1_flat_dim = hp['fm2_size'] * hp['fm2_size'] * hp['fm2_num']
h_pool2_m1_flat = tf.reshape(h_pool2_m1, [-1, m1_flat_dim])

# M2 convolution
# Same two-stage (conv -> sigmoid -> 2x2 max-pool) tower as for M1, with its own weights,
# applied to the RNN-level matching image.
m2_kernel_hw = [hp['filter_size'], hp['filter_size']]

# Stage 1: 1 input channel -> fm1_num feature maps.
W_conv1_m2 = weight_variable(m2_kernel_hw + [1, hp['fm1_num']])
b_conv1_m2 = bias_variable([hp['fm1_num']])
h_conv1_m2 = tf.nn.sigmoid(conv2d(m2_image, W_conv1_m2) + b_conv1_m2)
h_pool1_m2 = max_pool_2x2(h_conv1_m2)

# Stage 2: fm1_num -> fm2_num feature maps.
W_conv2_m2 = weight_variable(m2_kernel_hw + [hp['fm1_num'], hp['fm2_num']])
b_conv2_m2 = bias_variable([hp['fm2_num']])
h_conv2_m2 = tf.nn.sigmoid(conv2d(h_pool1_m2, W_conv2_m2) + b_conv2_m2)
h_pool2_m2 = max_pool_2x2(h_conv2_m2)

# Flatten to fm2_size * fm2_size * fm2_num values per example.
m2_flat_dim = hp['fm2_size'] * hp['fm2_size'] * hp['fm2_num']
h_pool2_m2_flat = tf.reshape(h_pool2_m2, [-1, m2_flat_dim])

# Accumulate M1 and M2: element-wise sum of the two flattened conv-tower outputs.
matching_accumulation = tf.add(h_pool2_m1_flat, h_pool2_m2_flat)

# Hidden fully connected layer: flattened features -> word_len * word_len sigmoid units.
W_fc1 = weight_variable([hp['fm2_size']*hp['fm2_size']*hp['fm2_num'], hp['word_len']*hp['word_len']])
b_fc1 = bias_variable([hp['word_len']*hp['word_len']])
h_fc1 = tf.nn.sigmoid(tf.matmul(matching_accumulation, W_fc1) + b_fc1)

# Output layer: the input width must equal the width of h_fc1 (word_len * word_len).
# The previous expression here was garbled and raised a NameError.
W_fc2 = weight_variable([hp['word_len']*hp['word_len'], 2])
b_fc2 = bias_variable([2])
# Two-class softmax; the training loop puts the "correct response" label in column 1.
prediction = tf.nn.softmax(tf.matmul(h_fc1, W_fc2) + b_fc2)

In [None]:
## 這段 code 是 Dual LSTM 的
## 33 可以看一下
## 要獨立出 'probs' 的原因是為惹之後 voting
## loss 用 tf.nn.sigmoid_cross_entropy_with_logits 有優化, 用 'logits' 獨立出來，不要用 'probs'
'''
# Dot product between generated response and actual response
logits = tf.matmul(generated_response, encoding_response, True)
logits = tf.reshape(logits, [-1])

# Apply sigmoid to convert logits to probabilities (for prediction, not for loss)
probs = tf.sigmoid(logits)
correct_prediction = tf.logical_or( tf.logical_and(tf.equal(target,1), tf.greater_equal(probs,0.5)), tf.logical_and(tf.equal(target,0), tf.less(probs,0.5)))
accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))
# Calculate the binary cross-entropy loss
loss = tf.reduce_mean(tf.nn.sigmoid_cross_entropy_with_logits(logits=logits, labels=tf.to_float(target)))
'''

In [None]:
# Cross-entropy between the one-hot labels and the softmax output, averaged over every
# element of the (batch, 2) product.
# The probabilities are clipped away from 0 before the log: a saturated softmax can output
# exactly 0, and log(0) = -inf would turn the loss (and the gradients) into NaN.
clipped_prediction = tf.clip_by_value(prediction, 1e-10, 1.0)
cross_entropy = -tf.reduce_mean(y_label * tf.log(clipped_prediction))
# Adagrad with the learning rate fed through the `learning_rate` placeholder at each step.
train_step=tf.train.AdagradOptimizer(learning_rate=learning_rate,initial_accumulator_value=0.001).minimize(cross_entropy)

#mean_square_error=tf.reduce_mean(tf.multiply(tf.subtract(y_label, prediction), tf.subtract(y_label, prediction)))
#train_step=tf.train.AdagradOptimizer(learning_rate=0.01,initial_accumulator_value=0.1).minimize(mean_square_error)

In [None]:
def _count_correct_choices(session, contexts, options, answers):
    """Score a multiple-choice set and count the questions answered correctly.

    For each question the context is repeated once per candidate response, the
    model is run on that small batch, and the candidate with the highest
    column-1 probability is taken as the guess.

    Args:
        session: the tf.Session to run `prediction` in.
        contexts: one id sequence per question.
        options: one list of candidate-response id sequences per question.
        answers: the index of the correct candidate for each question.

    Returns:
        (number of correct guesses, number of questions scored).
    """
    n_correct = 0
    n_total = 0
    for context_ids, option_ids, answer in zip(contexts, options, answers):
        option_batch = np.asarray(option_ids)
        context_batch = np.tile(np.asarray(context_ids), (len(option_batch), 1))
        # Dropout is disabled for evaluation (keep_prob = 1.0). Only the placeholders that
        # `prediction` depends on are fed.
        pred = session.run(prediction, feed_dict={context: context_batch, response: option_batch, keep_prob: 1.0})
        if np.argmax(pred[:, 1]) == answer:
            n_correct += 1
        n_total += 1
    return n_correct, n_total


with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    # NOTE(review): the iteration count and the learning-rate schedule are hard-coded here
    # rather than taken from hp['n_iterations'] / hp['learning_rate'] - confirm this is intended.
    lr = 0.01
    for it in range(10000):
        # Divide the learning rate by 10 every 500 iterations (including iteration 0).
        if it % 500 == 0:
            lr /= 10
        next_x1, next_x2, next_y, x1_len, x2_len = train_data_loader.next_batch(batch_size=hp['batch_size'], pad_to_length=hp['word_len'], pad_word=word2id['<pad>'], return_len=True)
        # One-hot encode the labels; sized from the batch actually returned.
        batch_rows = len(next_y)
        _y = np.zeros((batch_rows, 2))
        _y[np.arange(batch_rows), next_y] = 1
        train_feed = {
            context: next_x1,
            response: next_x2,
            target: _y,
            keep_prob: hp['keep_prob'],
            context_len: x1_len,
            response_len: x2_len,
            learning_rate: lr,
        }
        # Fetch the loss in the same run as the update instead of paying for extra forward passes.
        _, ce = sess.run([train_step, cross_entropy], feed_dict=train_feed)
        if it % 10 == 0:
            # All evaluation runs with dropout disabled.
            acc = compute_accuracy(next_x1, next_x2, _y, 1.0)
            sample_acc, sample_num = _count_correct_choices(sess, sample_id1, sample_id2, sample_y)
            test_acc, test_num = _count_correct_choices(sess, test_id1, test_id2, test_y)
            print(it, ce, acc, sample_acc, '/%d ' % sample_num, test_acc, '/%d' % test_num)