# Dual LSTM with official sample

In [None]:
import pandas as pd
import numpy as np
import re
import jieba
jieba.set_dictionary('datas/dict/dict.txt.big')
jieba.load_userdict('datas/dict/edu_dict.txt')

In [None]:
import time
import gc

def release_variable(data_to_release):
    del(data_to_release)
    time.sleep(1)
    gc.collect()

## Word Embedding
* [keras 使用 word2vec 當 embedding 的教學](http://ben.bolte.cc/blog/2016/gensim.html)

In [None]:
load_word2vec_path = 'models/word2vec_all_offitial_200.model.bin'

from gensim.models import word2vec
word2vec_model = word2vec.Word2Vec.load(load_word2vec_path)
word2vec_dim = word2vec_model.vector_size 
word2vec_vocab_size = len(word2vec_model.wv.vocab.items())

### 將 word2vec_model 的 vocab 存成 dictionary 'word2id'

In [None]:
# Save the vocab
# Sequences have various lengths, so let index '0' serve as padding  -> all index+1
word2id = dict([(k, v.index+1) for k, v in word2vec_model.wv.vocab.items()])
print('given word string, find index in word2vec_model vocab... 然後 -> ', word2id['然後'])
id2word = dict([(v, k) for k, v in word2id.items()])
print('given index, find word string in word2vec_model vocab... 95 -> ', id2word[95])

In [None]:
# Release unused memory comsumed model
import time
import gc
del(word2vec_model)
time.sleep(1)
gc.collect()

## Read in official sample data ( for validation )

In [None]:
# Sample data for validation
sample = pd.read_csv('datas/sample_test_data.txt')

# Extract sample test datas
x1 = [[s for s in re.sub('[A-Z]:', '\t', _).split('\t') if len(s.strip())] for _ in sample.dialogue.values]
x2 = [[s for s in re.sub('[A-Z]:', '\t', _).split('\t') if len(s.strip())] for _ in sample.options.values]
y = sample.answer.values

# Tokenize
x1 = np.array([list(jieba.cut(' '.join(_))) for _ in x1])
x2 = np.array([[list(jieba.cut(s)) for s in _] for _ in x2])
assert(np.sum([len(_)!=6 for _ in x2]) == 0)

## Read in test data ( for output answer )

In [None]:
# Offitial testing data
test_datas = pd.read_csv('datas/AIFirstProblem.txt')
test_x1 = [[s for s in re.sub('[A-Z]:', '\t', _).split('\t') if len(s.strip())] for _ in test_datas.dialogue.values]
test_x2 = [[s for s in re.sub('[A-Z]:', '\t', _).split('\t') if len(s.strip())] for _ in test_datas.options.values]
assert(np.sum([len(_)!=6 for _ in test_x2]) == 0)

### Convert string list x1, x2 to np array of index

In [None]:
'''
# Find the length of longest sequence, we shall pad all sentences to this length
max_seq_len = 0
for x in x1:
    max_seq_len = max(max_seq_len, len(x))
    
for xs in x2:
    for x in xs:   
        max_seq_len = max(max_seq_len, len(x))
        
print('The longest sequnce in training data has %d words' %max_seq_len)
'''

In [None]:
'''
new_x1 = []
for sentence in x1:
    tmp_sentence = []
    # Converd word to index
    for word in sentence:
        if word in word2id:
            tmp_sentence.append(word2id[word])
        # else:
            # print('Cannot find %s in vocab: ' %word)
    
    # Padding all sequences to same length
    len_to_pad = max_seq_len - len(tmp_sentence)
    tmp_sentence.extend([0] * len_to_pad)
    new_x1.append(tmp_sentence)
    
x1 = np.array(new_x1)
print(x1.shape)
'''

In [None]:
'''
new_x2 = []
for options in x2:
    for sentence in options:
        tmp_sentence = []
        for word in sentence:
            if word in word2id:
                tmp_sentence.append(word2id[word])

        # Padding all sequences to same length
        len_to_pad = max_seq_len - len(tmp_sentence)
        tmp_sentence.extend([0] * len_to_pad)
        new_x2.append(tmp_sentence)
    
x2 = np.array(new_x2)
print(x2.shape)
assert(x2.shape[-1] == max_seq_len)
'''

### Convert to format: ( context, reponse, 0/1 ) 

In [None]:
'''
# Repeate x1 
# -> (x1[0], x1[0], x1[0], x1[0], x1[0], x1[0],  x1[1], ...)
num_responses = 6
x1 = np.repeat(x1, num_responses, axis=0)
'''

In [None]:
'''
# Original 'y' means which response is correct
# y = sample.answer.values
# Now convert y to indicate wherther one (context, respoonse) is corrct, 0/1
new_y = []
for answer in y:
    new_y.extend([0]*answer)
    new_y.append(1)
    new_y.extend([0]*(num_responses-answer-1))
y = np.reshape(np.array(new_y), (-1, 1))
print(y.shape)
'''

## Read in  training data

In [1]:
# Prepare training data corpus
from mini_batch_helper import MiniBatchCorpus
corpus_fname = [
    'datas/training_data/下課花路米.txt',
##     'datas/training_data/人生劇展.txt',
#     'datas/training_data/公視藝文大道.txt',
##     'datas/training_data/成語賽恩思.txt',
##    'datas/training_data/我的這一班.txt',
#     'datas/training_data/流言追追追.txt',
#     'datas/training_data/聽聽看.txt',
#    'datas/training_data/誰來晚餐.txt',
]
corpus = []
for fname in corpus_fname:
    with open(fname, 'r') as f:
        corpus.extend([[s.split() for s in line.split('\t')] for line in f])

In [2]:
# Instantiate a MiniBatchCorpus to get training data as batch
from mini_batch_helper import MiniBatchCorpus
data_loader = MiniBatchCorpus(corpus)


In [3]:
max_seq_len = 0
mean_seq_len = 0
num_sentences = 0
for episode in corpus:
    for sentence in episode:
        max_seq_len = max(max_seq_len, len(sentence))
        mean_seq_len += len(sentence)
        num_sentences += 1
mean_seq_len /= num_sentences
print('max_seq_len', max_seq_len)
print('mean_seq_len', mean_seq_len)

max_seq_len 66
mean_seq_len 4.628876693328018


In [5]:
# Demo
x1, x2, y = data_loader.next_batch(batch_size=2, pad_to_length=max_seq_len, pad_wor)
print(x1)
print(x1.shape)
print(x1[0].shape)
print(x1[1].shape)

['這裡', '有個', '洞', '這邊', '有個', '洞', 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
['西天', '說', '三次', 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
[ list(['這裡', '有個', '洞', '這邊', '有個', '洞', 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])
 list(['西天', '說', '三次', 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])]
(2,)


AttributeError: 'list' object has no attribute 'shape'

In [None]:
print(corpus[0][14])

In [None]:
release_variable(corpus)

## Model ( tf )

In [None]:
# Define hyperparameters
# reference: https://github.com/dennybritz/chatbot-retrieval/blob/8b1be4c2e63631b1180b97ef927dc2c1f7fe9bea/udc_hparams.py
# Model Parameters
params = {}
params['word2vec_path'] = load_word2vec_path + '.wv.syn0.npy'
params['word2vec_vocab_size'] = word2vec_vocab_size
params['word2vec_dim'] = word2vec_dim
params['rnn_dim'] = 256

# Training Parameters
params['learning_rate'] = 0.001
params['batch_size'] = 128
params['eval_batch_size'] = 16
params['n_iterations'] = 40

## TODO: Embedding 後可以考慮加一層 Dropout

In [None]:
# Define model
import tensorflow as tf

# Input
#context = tf.placeholder(dtype=tf.int64, shape=(None, max_seq_len), name='context')
#response = tf.placeholder(dtype=tf.int64, shape=(None, max_seq_len), name='response')
context = tf.placeholder(dtype=tf.int64, shape=(None, None), name='context')
response = tf.placeholder(dtype=tf.int64, shape=(None, None), name='response')
target = tf.placeholder(dtype=tf.int64, shape=(None, 1), name='target')


with tf.device('/gpu:0'):
    # Embedding
    init_embedding_W = np.load(open(params['word2vec_path'], 'rb'))
    embedding_input_dim = init_embedding_W.shape[0]  # vocab size
    embedding_output_dim = init_embedding_W.shape[1]  # embedding output dim
    init_embedding_W = tf.constant_initializer(init_embedding_W)
    embeddings_W = tf.get_variable('embeddings_W', shape=[embedding_input_dim, embedding_output_dim], initializer=init_embedding_W)
    context_embedded = tf.nn.embedding_lookup(embeddings_W, context, name="embed_context")
    response_embedded = tf.nn.embedding_lookup(embeddings_W, response, name="embed_response")

    # shared LSTM encoder
    cell = tf.nn.rnn_cell.LSTMCell(num_units=params['rnn_dim'], forget_bias=2.0, 
                use_peepholes=True, state_is_tuple=True, reuse=tf.get_variable_scope().reuse)

    c_outputs, c_states = tf.nn.dynamic_rnn(cell, context_embedded, dtype=tf.float32)
    encoding_context = c_states.h
    r_outputs, r_states = tf.nn.dynamic_rnn(cell, response_embedded, dtype=tf.float32)
    encoding_response = r_states.h

    # σ(cMr)
    M = tf.get_variable('M', shape=[params['rnn_dim'], params['rnn_dim']], initializer=tf.truncated_normal_initializer())
    
    # "Predict" a  response: c * M
    generated_response = tf.matmul(encoding_context, M)
    generated_response = tf.expand_dims(generated_response, 2)
    encoding_response = tf.expand_dims(encoding_response, 2)

    # Dot product between generated response and actual response
    logits = tf.matmul(generated_response, encoding_response, True)
    logits = tf.squeeze(logits, [2])

    # Apply sigmoid to convert logits to probabilities (for prediction, not for loss)
    probs = tf.sigmoid(logits)

    # Calculate the binary cross-entropy loss
    loss = tf.reduce_mean( tf.nn.sigmoid_cross_entropy_with_logits(logits=logits, labels=tf.to_float(target)), name='mean_loss_of_batch')

    train_step = tf.train.AdamOptimizer(params['learning_rate']).minimize(loss)


In [None]:
# Train
import os

exp_name = 'dual_lstm_0'
saver = tf.train.Saver()
with tf.Session() as sess:
    # sess.run(embeddings_W.initializer)
    # sess.run(M.initializer)
    init = tf.global_variables_initializer()
    sess.run(init)
    
    # Restore model
    # saver.restore(sess, 'models/%s/%s.ckpt' % (exp_name, params['n_iterations'])
    # print("Model restored.")

    for it in range(params['n_iterations']):
        print('Iterations %4d:\t' %(it+1) , end="")
        # Train next batch
        next_x1, next_x2, next_y = data_loader.next_batch(batch_size=params['batch_size'], pad_to_length=max_seq_len)
        sess.run(train_step, feed_dict={context: next_x1, response: next_x2, target: next_y})

    # Save the model
    if not os.path.exists('models/'+exp_name):
        os.makedirs('models/'+exp_name)
    save_path = saver.save(sess, 'models/%s/%s.ckpt' % (exp_name, params['n_iterations']))
