In [1]:
import numpy as np
import tensorflow as tf

# Import util
import time
import re
import sys
import gc

# Project-local helper modules
from mini_batch_helper import extractor
from mini_batch_helper import rnn_minibatch_sequencer

Using TensorFlow backend.


## Load the corpus and build the vocabulary

In [2]:
# Inputs handed to `extractor`: the word2vec model file, the training corpora,
# a sampling rate, the special tokens and the token used for unknown words.
word2vec_fname = 'models/word2vec_all_offitial_200.model.bin'
corpus_fnames = [
    'datas/training_data/下課花路米.txt',
    'datas/training_data/人生劇展.txt',
    'datas/training_data/公視藝文大道.txt',
    'datas/training_data/成語賽恩思.txt',
    'datas/training_data/我的這一班.txt',
    'datas/training_data/流言追追追.txt',
    'datas/training_data/聽聽看.txt',
    'datas/training_data/誰來晚餐.txt',
]
sample_rate_on_training_datas = 0.3
extra_words = ['<unk>', '<bos>', '<eos>']
unknown_word = '<unk>'

word2id, id2word, word_p, embedding_matrix, corpus, corpus_id = extractor(
    word2vec_fname,
    corpus_fnames,
    sample_rate_on_training_datas,
    extra_words,
    unknown_word,
)
# Only the id mappings and the id-encoded corpus are used from here on.
del word_p, embedding_matrix, corpus

# Get only fixed number of corpus: shuffle the indices and keep the first 100.
# NOTE(review): no random seed is set, so the selection differs between runs.
rnd_idx = np.arange(len(corpus_id))
np.random.shuffle(rnd_idx)
corpus_id = corpus_id[rnd_idx[:100]]

# The last selected corpus is held out for validation, the rest is for training.
train_corpus_id = corpus_id[:-1]
valid_corpus_id = corpus_id[-1:]

# Flatten corpus -> sentence -> word id into one id stream, wrapping every
# sentence in <bos> ... <eos>.
traintext = [
    w
    for cp in train_corpus_id
    for s in cp
    for w in [word2id['<bos>']] + s + [word2id['<eos>']]
]
validtext = [
    w
    for cp in valid_corpus_id
    for s in cp
    for w in [word2id['<bos>']] + s + [word2id['<eos>']]
]
del corpus_id, train_corpus_id, valid_corpus_id

In [3]:
# Hyper-parameters for the model graph and the training loop.
SEQLEN = 10                 # sequence length passed to rnn_minibatch_sequencer
BATCHSIZE = 32              # batch size passed to rnn_minibatch_sequencer
EPOCHNUM = 10               # epoch count passed to rnn_minibatch_sequencer
ALPHASIZE = len(word2id)    # vocabulary size; depth of the one-hot encodings
INTERNALSIZE = 200          # units per GRU cell
NLAYERS = 2                 # number of stacked GRU cells
LEARNING_RATE = 0.001       # learning rate of the Adam optimizer
DROPOUT_PKEEP = 0.8         # keep probability used by the dropout wrappers
LOGINTERVAL = 10            # log the averaged loss every this many steps
SAVEINTERVAL = 100          # write a checkpoint every this many steps
CLIP = 0.2                  # per-gradient clip_by_norm threshold

# Echo the configuration, one right-aligned name per line.
print('\n'.join('%20s: %s' % (name, value) for name, value in [
    ('SEQLEN', SEQLEN),
    ('BATCHSIZE', BATCHSIZE),
    ('EPOCHNUM', EPOCHNUM),
    ('ALPHASIZE', ALPHASIZE),
    ('INTERNALSIZE', INTERNALSIZE),
    ('NLAYERS', NLAYERS),
    ('LEARNING_RATE', LEARNING_RATE),
    ('DROPOUT_PKEEP', DROPOUT_PKEEP),
    ('LOGINTERVAL', LOGINTERVAL),
    ('SAVEINTERVAL', SAVEINTERVAL),
    ('CLIP', CLIP),
]))

              SEQLEN: 10
           BATCHSIZE: 32
            EPOCHNUM: 10
           ALPHASIZE: 65866
        INTERNALSIZE: 200
             NLAYERS: 2
       LEARNING_RATE: 0.001
       DROPOUT_PKEEP: 0.8
         LOGINTERVAL: 10
        SAVEINTERVAL: 100
                CLIP: 0.2


-----------------------------
## Define model
Adapted (largely copied) from [martin-gorner/tensorflow-rnn-shakespeare, `rnn_train.py`](https://github.com/martin-gorner/tensorflow-rnn-shakespeare/blob/master/rnn_train.py)

In [4]:
# Word-level GRU language model: one-hot word ids -> stacked GRU -> linear layer
# over the vocabulary, trained with softmax cross-entropy on the next word.

# inputs
X = tf.placeholder(tf.int32, [None, None])    # [ BATCHSIZE, SEQLEN ]
Xo = tf.one_hot(X, ALPHASIZE, 1.0, 0.0)       # [ BATCHSIZE, SEQLEN, ALPHASIZE ]
Y_ = tf.placeholder(tf.int32, [None, None])   # [ BATCHSIZE, SEQLEN ]
Yo_ = tf.one_hot(Y_, ALPHASIZE)               # [ BATCHSIZE, SEQLEN, ALPHASIZE ]

# inputs info
batchsize = tf.placeholder(tf.int32)

# Dropout keep probability. It used to be baked into the graph as a Python
# constant, which made it impossible to switch dropout off when the trained
# model is only being evaluated. It is now a placeholder that defaults to
# DROPOUT_PKEEP, so feed dicts that do not mention it behave exactly as before;
# feed `pkeep: 1.0` to run the network without dropout.
pkeep = tf.placeholder_with_default(tf.constant(DROPOUT_PKEEP, dtype=tf.float32), [], name='pkeep')

# input state
Hin = tf.placeholder(tf.float32, [None, INTERNALSIZE*NLAYERS])  # [ BATCHSIZE, INTERNALSIZE * NLAYERS]

# Stacked GRU cells with dropout on each cell's input and on the final output.
cells = [tf.contrib.rnn.GRUCell(INTERNALSIZE) for _ in range(NLAYERS)]
dropcells = [tf.contrib.rnn.DropoutWrapper(cell, input_keep_prob=pkeep) for cell in cells]
multicell = tf.contrib.rnn.MultiRNNCell(dropcells, state_is_tuple=False)
multicell = tf.contrib.rnn.DropoutWrapper(multicell, output_keep_prob=pkeep)

# Yr: [ BATCHSIZE, SEQLEN, INTERNALSIZE ]
# H:  [ BATCHSIZE, INTERNALSIZE*NLAYERS ] (last state in the sequence)
Yr, H = tf.nn.dynamic_rnn(multicell, Xo, dtype=tf.float32, initial_state=Hin)

Yflat = tf.reshape(Yr, [-1, INTERNALSIZE])               # [ BATCHSIZE x SEQLEN, INTERNALSIZE ]
Ylogits = tf.contrib.layers.linear(Yflat, ALPHASIZE)     # [ BATCHSIZE x SEQLEN, ALPHASIZE ]
Yflat_ = tf.reshape(Yo_, [-1, ALPHASIZE])                # [ BATCHSIZE x SEQLEN, ALPHASIZE ]
loss = tf.nn.softmax_cross_entropy_with_logits(logits=Ylogits, labels=Yflat_)  # [ BATCHSIZE x SEQLEN ]
loss = tf.reshape(loss, [batchsize, -1])      # [ BATCHSIZE, SEQLEN ]
Yo = tf.nn.softmax(Ylogits)                   # [ BATCHSIZE x SEQLEN, ALPHASIZE ]

# Gradient clipping: each gradient tensor is clipped to norm CLIP on its own.
# compute_gradients returns None for a variable the loss does not depend on;
# those pairs are skipped because clip_by_norm cannot take None.
optimizer = tf.train.AdamOptimizer(LEARNING_RATE)
gvs = optimizer.compute_gradients(loss)
capped_gvs = [(tf.clip_by_norm(grad, CLIP), var) for grad, var in gvs if grad is not None]
train_step = optimizer.apply_gradients(capped_gvs)

----------------------------------
## Training

In [5]:
# Saver used to write checkpoints of the model variables.
saver = tf.train.Saver()
# Session in which the training cell below runs the graph.
sess = tf.Session()
# Initialise all graph variables before the first training step.
sess.run(tf.global_variables_initializer())

In [6]:
# Training loop: one optimizer step per mini-batch, carrying the RNN state
# returned by each step into the next one.
start_time = time.time()
istate = np.zeros([BATCHSIZE, INTERNALSIZE*NLAYERS])  # initial zero input state
batch_loss = 0  # running average of the loss over the current logging window
step = 0        # stays 0 if the sequencer yields no batch at all

for step, (x, y_, epoch) in enumerate(rnn_minibatch_sequencer(traintext, BATCHSIZE, SEQLEN, EPOCHNUM), start=1):
    feed_dict = {X: x, Y_: y_, Hin: istate, batchsize: BATCHSIZE}
    _, now_loss, istate = sess.run([train_step, loss, H], feed_dict)

    # Each step contributes 1/LOGINTERVAL of its mean loss, so the sum over a
    # full window is the window average.
    batch_loss += np.mean(now_loss) / LOGINTERVAL

    if step % LOGINTERVAL == 0:
        elapsed = time.time() - start_time
        print('epoch %2d: batch loss %10f / elapsed time %.f' % (epoch, batch_loss, elapsed), flush=True)
        batch_loss = 0

    if step % SAVEINTERVAL == 0:
        saver.save(sess, 'models/Attack-language-model/lm', global_step=step)
        print('Saved model', flush=True)

epoch  0: batch loss  10.926170 / elapsed time 14
epoch  0: batch loss   8.397359 / elapsed time 27
epoch  0: batch loss   6.608917 / elapsed time 40
epoch  0: batch loss   6.305751 / elapsed time 54
epoch  0: batch loss   6.343321 / elapsed time 67
epoch  0: batch loss   6.207823 / elapsed time 80
epoch  0: batch loss   6.161940 / elapsed time 93
epoch  0: batch loss   6.246227 / elapsed time 107
epoch  0: batch loss   6.221875 / elapsed time 120


KeyboardInterrupt: 

In [None]:
# Write the current session's variables to a separate "lm-final" checkpoint
# (the periodic checkpoints from the training loop use the "lm" prefix plus a step number).
saver.save(sess, 'models/Attack-language-model/lm-final')

------------------------------
## Evaluating

In [7]:
import pandas as pd
import jieba
# Configure the jieba tokenizer: set its main dictionary file, then load an
# additional user dictionary on top of it.
jieba.set_dictionary('datas/dict/dict.txt.big')
jieba.load_userdict('datas/dict/edu_dict.txt')

Building prefix dict from /home/sunset/word_contest/datas/dict/dict.txt.big ...
Loading model from cache /tmp/jieba.u849ecfdca27003d306f39ca004b82b5b.cache
Loading model cost 1.173 seconds.
Prefix dict has been built succesfully.


In [8]:
sample = pd.read_csv('datas/sample_test_data.txt')

# Replace every "<capital letter>:" marker (presumably a speaker tag) with a
# tab, split on tabs and keep only the pieces that are not blank.
sample_x1 = [
    [utt for utt in re.sub('[A-Z]:', '\t', dialogue).split('\t') if utt.strip()]
    for dialogue in sample.dialogue.values
]
sample_x2 = [
    [opt for opt in re.sub('[A-Z]:', '\t', options).split('\t') if opt.strip()]
    for options in sample.options.values
]
sample_y = sample.answer.values
# Every question must come with exactly 6 candidate responses.
assert all(len(opts) == 6 for opts in sample_x2)

# Tokenise with jieba: the pieces of a dialogue are joined with spaces and cut
# as one string, each option is cut on its own; single-space tokens are dropped.
sample_x1 = [
    [word for word in jieba.cut(' '.join(utts)) if word != ' ']
    for utts in sample_x1
]
sample_x2 = [
    [[word for word in jieba.cut(opt) if word != ' '] for opt in opts]
    for opts in sample_x2
]

In [9]:
test_datas = pd.read_csv('datas/AIFirstProblem.txt')

# Replace every "<capital letter>:" marker (presumably a speaker tag) with a
# tab, split on tabs and keep only the pieces that are not blank.
test_x1 = [
    [utt for utt in re.sub('[A-Z]:', '\t', dialogue).split('\t') if utt.strip()]
    for dialogue in test_datas.dialogue.values
]
test_x2 = [
    [opt for opt in re.sub('[A-Z]:', '\t', options).split('\t') if opt.strip()]
    for options in test_datas.options.values
]
# Every question must come with exactly 6 candidate responses.
assert all(len(opts) == 6 for opts in test_x2)

# Tokenise with jieba: the pieces of a dialogue are joined with spaces and cut
# as one string, each option is cut on its own; single-space tokens are dropped.
test_x1 = [
    [word for word in jieba.cut(' '.join(utts)) if word != ' ']
    for utts in test_x1
]
test_x2 = [
    [[word for word in jieba.cut(opt) if word != ' '] for opt in opts]
    for opts in test_x2
]

In [10]:
def lst_word2id(lst):
    """Convert a list of words into a list of ids wrapped in <bos> ... <eos>.

    Words that are not in the module-level `word2id` mapping are replaced by
    the id of `unknown_word`.

    Args:
        lst: iterable of word strings.

    Returns:
        A new list: the '<bos>' id, one id per input word, then the '<eos>' id.
    """
    ids = [word2id['<bos>']]
    for w in lst:
        key = w if w in word2id else unknown_word
        ids.append(word2id[key])
    ids.append(word2id['<eos>'])
    return ids

In [11]:
def eval_loss(lst):
    """Return the mean next-word loss of the model over one id sequence.

    The sequence is fed as a batch of one: every id except the last is an
    input, every id except the first is the corresponding target, and the RNN
    starts from an all-zero state.

    NOTE(review): the graph applies dropout with a keep probability chosen at
    graph-construction time and nothing here turns it off, so repeated calls
    on the same sequence can return different values.

    Args:
        lst: list of word ids.

    Returns:
        Mean of the `loss` tensor evaluated on the sequence.
    """
    inputs, targets = lst[:-1], lst[1:]
    zero_state = np.zeros([1, INTERNALSIZE*NLAYERS])
    feed = {X: [inputs], Y_: [targets], Hin: zero_state, batchsize: 1}
    per_token_loss = sess.run(loss, feed)
    return np.mean(per_token_loss)

In [12]:
def eval_ans(lst_id1, lst_id2):
    """Pick, for every question, the option with the lowest loss.

    Each of the first 6 options of a question is encoded with `lst_word2id`,
    appended to the encoded context, and scored with `eval_loss`.

    Args:
        lst_id1: one context (list of words) per question.
        lst_id2: per question, a sequence of at least 6 options, each a list
            of words.

    Returns:
        1-D array holding, per question, the index (0-5) of the option whose
        loss is smallest.
    """
    n_options = 6
    losses = np.array([
        eval_loss(lst_word2id(lst_id1[q]) + lst_word2id(lst_id2[q][k]))
        for q in range(len(lst_id1))
        for k in range(n_options)
    ])
    return losses.reshape(-1, n_options).argmin(axis=1)

In [None]:
# Open a new session and load trained weights into it from a saved checkpoint.
# NOTE(review): the step suffix (10800) is hard-coded, and the training log
# captured in this notebook was interrupted long before that step — confirm
# this checkpoint exists before running.
# NOTE(review): rebinding `sess` here does not close a session created by an
# earlier cell.
saver = tf.train.Saver()
sess = tf.Session()
saver.restore(sess, 'models/Attack-language-model/lm-10800')

In [None]:
# Predicted option index for every question of the sample set.
my_ans = eval_ans(sample_x1, sample_x2)

In [None]:
# Display the predictions next to the values of the `answer` column.
my_ans, sample_y

In [None]:
# Number of questions where the predicted index equals the answer.
np.sum(my_ans == sample_y)