In [None]:
import numpy as np
import tensorflow as tf

# Import util
import time
import re
import sys
import gc

# Self-defined (project-local) module
from mini_batch_helper import rnn_minibatch_sequencer

## Loading the corpus and building the character dictionary

In [None]:
# Text files that make up the training corpus; each one is read and
# sub-sampled line by line by the loading loop further down in this cell.
corpus_fnames = [
    'datas/training_data/no_TC_下課花路米.txt',
    'datas/training_data/no_TC_誰來晚餐.txt',
    'datas/training_data/no_TC_公視藝文大道.txt',
    'datas/training_data/no_TC_成語賽恩思.txt',
    'datas/training_data/no_TC_我的這一班.txt',
    'datas/training_data/no_TC_流言追追追.txt',
    'datas/training_data/no_TC_人生劇展.txt',
    'datas/training_data/no_TC_聽聽看.txt',
]
# Fraction of the lines of each file that is randomly kept for training
# (the loading loop requests at least 5 lines per file regardless of this rate).
sample_rate_on_training_datas = 0.3

def word_tok_lst_2_ch_lst(s):
    """Flatten `s` into a list of single characters framed by '<bos>' / '<eos>'.

    `s` is iterated twice (items of `s`, then characters of each item), so it
    may be a plain string or a sequence of word strings. Characters that are
    empty after `strip()` (i.e. whitespace) are dropped.

    Returns a new list: ['<bos>', ch_1, ..., ch_n, '<eos>'].
    """
    chars = ['<bos>']
    for word in s:
        for ch in word:
            stripped = ch.strip()
            if stripped != '':
                chars.append(stripped)
    chars.append('<eos>')
    return chars

# Build one flat stream of character tokens from a random sample of the lines
# of every corpus file. Each tab-separated field of a line becomes its own
# '<bos>' ... '<eos>' framed sequence.
corpus = []
for fname in corpus_fnames:
    # Decode explicitly as UTF-8 so the Chinese text is read the same way on
    # every platform instead of depending on the locale's default encoding.
    with open(fname, 'r', encoding='utf-8') as f:
        now_corpus = np.array([line for line in f])
        # Keep `sample_rate_on_training_datas` of the lines, but ask for at
        # least 5 (slicing simply returns fewer if the file is shorter).
        sample_num = int(max(len(now_corpus)*sample_rate_on_training_datas, 5))
        rnd_idx = np.arange(len(now_corpus))
        np.random.shuffle(rnd_idx)
        now_corpus = now_corpus[rnd_idx[:sample_num]]
        corpus.extend([ch for line in now_corpus for s in line.strip().split('\t') for ch in word_tok_lst_2_ch_lst(s)])

# Vocabulary: sort the distinct tokens so that, for a given sampled corpus, the
# id <-> character mapping is deterministic. Iterating a bare `set` of strings
# depends on hash randomisation, which would give a different mapping in every
# kernel session and make saved checkpoints impossible to pair with a vocabulary.
id2ch = sorted(set(corpus))
ch2id = {ch: i for i, ch in enumerate(id2ch)}

# Encode the whole corpus as integer ids and release the (large) token list.
corpus_id = np.array([ch2id[ch] for ch in corpus])
del corpus

In [None]:
# Total number of tokens in the encoded corpus (characters plus the
# '<bos>' / '<eos>' markers inserted around every sentence).
len(corpus_id)

In [None]:
# Hyper-parameters of the character-level language model and its training run.
SEQLEN = 10                # characters per training sequence
BATCHSIZE = 32             # sequences per mini-batch
EPOCHNUM = 10              # number of epochs requested from the batch sequencer
ALPHASIZE = len(id2ch)     # vocabulary size (one-hot depth / number of output classes)
INTERNALSIZE = 200         # InternalSize == EmbeddingSize
NLAYERS = 2                # number of stacked GRU layers
LEARNING_RATE = 0.001      # Adam learning rate
DROPOUT_PKEEP = 0.8        # keep probability used by the dropout wrappers
LOGINTERVAL = 10           # print the averaged loss every LOGINTERVAL steps
SAVEINTERVAL = 100         # write a checkpoint every SAVEINTERVAL steps
# CLIP = 0.2               # only used by the disabled gradient-clipping variant

print('%20s: %s' % ('SEQLEN', SEQLEN))
print('%20s: %s' % ('BATCHSIZE', BATCHSIZE))
print('%20s: %s' % ('EPOCHNUM', EPOCHNUM))
print('%20s: %s' % ('ALPHASIZE', ALPHASIZE))
print('%20s: %s' % ('INTERNALSIZE', INTERNALSIZE))
print('%20s: %s' % ('NLAYERS', NLAYERS))
print('%20s: %s' % ('LEARNING_RATE', LEARNING_RATE))
print('%20s: %s' % ('DROPOUT_PKEEP', DROPOUT_PKEEP))
print('%20s: %s' % ('LOGINTERVAL', LOGINTERVAL))
print('%20s: %s' % ('SAVEINTERVAL', SAVEINTERVAL))
# CLIP is commented out above, so printing it raised NameError on a fresh
# kernel; re-enable this line together with the CLIP definition.
# print('%20s: %s' % ('CLIP', CLIP))

-----------------------------
## Define model
Adapted (mostly copied) from Martin Gorner's [`rnn_train.py`](https://github.com/martin-gorner/tensorflow-rnn-shakespeare/blob/master/rnn_train.py).

In [None]:
# Graph definition for a stacked-GRU character language model: one-hot encoded
# input characters -> NLAYERS GRU layers with dropout -> linear projection to
# vocabulary logits -> per-character softmax cross-entropy, optimised with Adam.

# inputs: character ids and the target ids they are scored against
X = tf.placeholder(tf.int32, [None, None])    # [ BATCHSIZE, SEQLEN ]
Xo = tf.one_hot(X, ALPHASIZE, 1.0, 0.0)       # [ BATCHSIZE, SEQLEN, ALPHASIZE ]
Y_ = tf.placeholder(tf.int32, [None, None])   # [ BATCHSIZE, SEQLEN ]
Yo_ = tf.one_hot(Y_, ALPHASIZE)               # [ BATCHSIZE, SEQLEN, ALPHASIZE ]

# inputs info: batch size fed at run time, used only to reshape the loss below
batchsize = tf.placeholder(tf.int32)

# input state: the concatenated per-layer GRU states (state_is_tuple=False below)
Hin = tf.placeholder(tf.float32, [None, INTERNALSIZE*NLAYERS])  # [ BATCHSIZE, INTERNALSIZE * NLAYERS]

# Dropout is applied on the input of every GRU layer and once more on the
# output of the whole stack.
# NOTE(review): DROPOUT_PKEEP is a Python constant baked into the graph, so
# dropout cannot be switched off through feed_dict; if this same graph is also
# used for evaluation/sampling, confirm that is intended.
cells = [tf.contrib.rnn.GRUCell(INTERNALSIZE) for _ in range(NLAYERS)]
dropcells = [tf.contrib.rnn.DropoutWrapper(cell, input_keep_prob=DROPOUT_PKEEP) for cell in cells]
multicell = tf.contrib.rnn.MultiRNNCell(dropcells, state_is_tuple=False)
multicell = tf.contrib.rnn.DropoutWrapper(multicell, output_keep_prob=DROPOUT_PKEEP)

# Yr: [ BATCHSIZE, SEQLEN, INTERNALSIZE ]
# H:  [ BATCHSIZE, INTERNALSIZE*NLAYERS ] (last state in the sequence)
Yr, H = tf.nn.dynamic_rnn(multicell, Xo, dtype=tf.float32, initial_state=Hin)

# Flatten batch and time into one axis so a single linear layer scores every
# character position, then restore the [ BATCHSIZE, SEQLEN ] layout for the loss.
Yflat = tf.reshape(Yr, [-1, INTERNALSIZE])               # [ BATCHSIZE x SEQLEN, INTERNALSIZE ]
Ylogits = tf.contrib.layers.linear(Yflat, ALPHASIZE)     # [ BATCHSIZE x SEQLEN, ALPHASIZE ]
Yflat_ = tf.reshape(Yo_, [-1, ALPHASIZE])                # [ BATCHSIZE x SEQLEN, ALPHASIZE ]
loss = tf.nn.softmax_cross_entropy_with_logits(logits=Ylogits, labels=Yflat_)  # [ BATCHSIZE x SEQLEN ]
loss = tf.reshape(loss, [batchsize, -1])      # [ BATCHSIZE, SEQLEN ]
Yo = tf.nn.softmax(Ylogits)                   # [ BATCHSIZE x SEQLEN, ALPHASIZE ]

# Disabled alternative: Adam with per-gradient norm clipping (needs CLIP to be defined).
# optimizer = tf.train.AdamOptimizer(LEARNING_RATE)
# gvs = optimizer.compute_gradients(loss)
# capped_gvs = [(tf.clip_by_norm(grad, CLIP), var) for grad, var in gvs]
# train_step = optimizer.apply_gradients(capped_gvs)
# `loss` is handed to minimize() unreduced (one value per character position).
train_step = tf.train.AdamOptimizer(LEARNING_RATE).minimize(loss)

----------------------------------
## Training

In [None]:
# Open a session on the default graph, create the checkpoint saver for the
# variables defined so far, and give every variable its initial value.
sess = tf.Session()
saver = tf.train.Saver()
sess.run(tf.global_variables_initializer())

In [None]:
# Training loop: feed mini-batches to the optimiser, carry the recurrent state
# from one batch to the next, log the averaged loss every LOGINTERVAL steps and
# write a checkpoint every SAVEINTERVAL steps.
step = 0
start_time = time.time()
istate = np.zeros([BATCHSIZE, INTERNALSIZE*NLAYERS])  # initial zero input state
batch_loss = 0

# The training text is the id-encoded corpus `corpus_id` built in the loading
# cell; the previously used name `traintext` is never defined in this notebook
# and raised NameError on a fresh kernel.
# NOTE(review): presumably `y_` is `x` shifted by one character and every batch
# has exactly BATCHSIZE rows - confirm in mini_batch_helper.
for x, y_, epoch in rnn_minibatch_sequencer(corpus_id, BATCHSIZE, SEQLEN, EPOCHNUM):
    step += 1
    # The final GRU state H of this batch becomes the initial state of the next one.
    _, now_loss, istate = sess.run([train_step, loss, H], {
        X: x,
        Y_: y_,
        Hin: istate,
        batchsize: BATCHSIZE,
    })
    # Accumulate so that batch_loss is the mean loss over the last LOGINTERVAL steps.
    batch_loss += np.mean(now_loss) / LOGINTERVAL
    if step % LOGINTERVAL == 0:
        print('epoch %2d: batch loss %10f / elapsed time %.f' % (epoch, batch_loss, time.time() - start_time), flush=True)
        batch_loss = 0
    if step % SAVEINTERVAL == 0:
        saver.save(sess, 'models/Attack-language-model/lm', global_step=step)
        print('Saved model', flush=True)

In [None]:
# Write a final checkpoint of the current session's variables once training is done.
saver.save(sess, 'models/Attack-language-model/lm-final')

------------------------------
## Evaluating