In [1]:
# -*- coding: utf-8 -*-

import tensorflow as tf
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.utils import shuffle
from keras.preprocessing.sequence import pad_sequences
import os
from tqdm import tqdm
import pickle

Using TensorFlow backend.


In [3]:
def load_vocab(path):
    """Read a vocabulary file and return its entries as a list of strings.

    The file is opened as UTF-8 text and yields one entry per line. Newline
    characters are stripped from both ends of each line; any other whitespace
    is preserved. The position of an entry in the returned list is its line
    position in the file.

    @param path: path of the vocabulary file
    @return: list of vocabulary entries in file order
    """
    entries = []
    with open(path, 'r', encoding='utf-8') as handle:
        for raw_line in handle:
            entries.append(raw_line.strip('\n'))
    return entries

# Load the 'ch' and 'en' vocabularies and print their sizes together with the
# first 20 entries of each.
vocab_ch = load_vocab('data/vocab.ch')
vocab_en = load_vocab('data/vocab.en')
print(len(vocab_ch), vocab_ch[:20])
print(len(vocab_en), vocab_en[:20])

# Lookup tables in both directions. The id of a token is its position in the
# vocabulary list; if a token occurs more than once, the last position wins in
# the token -> id table.
id2word_ch = dict(enumerate(vocab_ch))
word2id_ch = {token: index for index, token in id2word_ch.items()}
id2word_en = dict(enumerate(vocab_en))
word2id_en = {token: index for index, token in id2word_en.items()}

20003 ['<unk>', '<s>', '</s>', ',', '的', '.', '"', '和', '在', '了', '中国', '是', '对', '发展', '与', '美国', '要', '中', '一', '问题']
20003 ['<unk>', '<s>', '</s>', 'the', ',', 'and', 'of', '.', 'to', 'in', '"', 'a', '-', 'is', 'that', "'s", 'for', 'on', 'china', 'with']


In [7]:
def load_data(path, word2id):
    """Load a tokenised text file and convert every line to a list of token ids.

    Each line is split on single spaces, every token is mapped through
    ``word2id`` and the result is wrapped in the ``<s>`` / ``</s>`` ids.

    Tokens missing from ``word2id`` are mapped to the ``<unk>`` id when the
    vocabulary has one (previously any such token raised ``KeyError``). This
    also applies to the empty token produced by a blank line or by consecutive
    spaces. If the vocabulary has no ``<unk>`` entry, a missing token still
    raises ``KeyError``.

    @param path: path of a UTF-8 text file with one space-separated sentence per line
    @param word2id: dict mapping token -> id; must contain '<s>' and '</s>'
    @return sentences: list of id lists, one per line, including the <s> / </s> ids
    @return lens: length of every id list (the two boundary ids are counted)
    @return maxlen: largest value in lens, or 0 when the file has no lines
    @raise KeyError: if '<s>' or '</s>' is missing from word2id, or a token is
                     missing and word2id has no '<unk>' entry
    """
    bos_id = word2id['<s>']
    eos_id = word2id['</s>']
    unk_id = word2id.get('<unk>')

    def to_id(token):
        # Without an <unk> entry there is nothing to fall back to, so let the
        # plain lookup raise KeyError exactly as it always did.
        if unk_id is None or token in word2id:
            return word2id[token]
        return unk_id

    with open(path, 'r', encoding='utf-8') as fr:
        sentences = [
            [bos_id] + [to_id(w) for w in line.strip('\n').split(' ')] + [eos_id]
            for line in fr
        ]

    lens = [len(sentence) for sentence in sentences]  # length of every line
    # np.max raises ValueError on an empty list, so an empty file reports 0.
    maxlen = np.max(lens) if lens else 0
    return sentences, lens, maxlen

# Run modes:
#   train: training, no beam search, calculate loss
#   eval:  no training, no beam search, calculate loss
#   infer: no training, beam search, calculate bleu

mode = 'train'

# Every split is loaded regardless of mode, because the padding width below is
# the longest line across the train, dev and test files.
train_ch, len_train_ch, maxlen_train_ch = load_data('data/train.ch', word2id_ch)
train_en, len_train_en, maxlen_train_en = load_data('data/train.en', word2id_en)
dev_ch, len_dev_ch, maxlen_dev_ch = load_data('data/dev.ch', word2id_ch)
dev_en, len_dev_en, maxlen_dev_en = load_data('data/dev.en', word2id_en)
test_ch, len_test_ch, maxlen_test_ch = load_data('data/test.ch', word2id_ch)
test_en, len_test_en, maxlen_test_en = load_data('data/test.en', word2id_en)

# Longest line over the train, dev and test sets, per language.
maxlen_ch = np.max([maxlen_train_ch, maxlen_dev_ch, maxlen_test_ch])
maxlen_en = np.max([maxlen_train_en, maxlen_dev_en, maxlen_test_en])

print('maxlen_ch:', maxlen_ch)
print('maxlen_en:', maxlen_en)

# Preview the first five unpadded training sentences of each language.
for x in train_ch[0:5] + train_en[0:5]:
    print(x)


def _pad_and_preview(ch_rows, en_rows):
    """Pad both sides of one data split, then print shapes and the first five rows.

    Sentences are padded at the end with the id of </s>, up to maxlen_ch /
    maxlen_en respectively.

    @param ch_rows: list of id lists for the 'ch' side
    @param en_rows: list of id lists for the 'en' side
    @return: the two padded arrays, 'ch' first
    """
    ch_padded = pad_sequences(ch_rows, maxlen=maxlen_ch, padding='post', value=word2id_ch['</s>'])
    en_padded = pad_sequences(en_rows, maxlen=maxlen_en, padding='post', value=word2id_en['</s>'])
    print(ch_padded.shape, en_padded.shape)
    for row in ch_padded[0:5]:
        print(row)
    for row in en_padded[0:5]:
        print(row)
    return ch_padded, en_padded


# Only the split that the selected mode consumes is padded.
if mode == 'train':
    train_ch, train_en = _pad_and_preview(train_ch, train_en)
elif mode == 'eval':
    dev_ch, dev_en = _pad_and_preview(dev_ch, dev_en)
elif mode == 'infer':
    test_ch, test_en = _pad_and_preview(test_ch, test_en)

maxlen_ch: 62
maxlen_en: 62
[1, 1613, 3, 593, 121, 435, 3, 0, 3, 53, 86, 1139, 1133, 277, 5389, 43, 6148, 966, 4, 694, 3, 429, 1766, 3, 3200, 0, 3, 14170, 15367, 3, 82, 1551, 244, 47, 6422, 6148, 5, 2]
[1, 8, 842, 5575, 3, 244, 8, 11059, 3, 18806, 3, 6045, 0, 87, 0, 3, 345, 0, 3, 9035, 1473, 514, 1373, 607, 48, 3, 188, 514, 2760, 2519, 48, 5, 2]
[1, 0, 0, 0, 18, 846, 5477, 2829, 2571, 510, 6148, 966, 563, 4, 2132, 3, 96, 88, 6284, 238, 24, 4, 4109, 569, 490, 490, 3, 24, 22, 41, 6, 1585, 2251, 86, 57, 100, 465, 6148, 42, 9364, 5, 2]
[1, 0, 1518, 31, 9740, 3872, 11060, 9741, 1779, 332, 0, 4058, 22, 41, 6, 32, 27, 7, 32, 1185, 1230, 3, 37, 42, 843, 29, 4, 7255, 3, 7255, 11061, 1603, 12, 29, 47, 117, 3, 2761, 42, 92, 1473, 14171, 5, 6, 2]
[1, 277, 2868, 93, 3, 115, 750, 3200, 2035, 3, 1641, 750, 508, 31, 4578, 16879, 48, 1014, 4451, 5, 2]
[1, 9, 1041, 4, 3, 14251, 944, 3532, 189, 89, 5, 3, 14251, 3532, 189, 89, 92, 1501, 1341, 5, 1064, 17, 11, 1570, 362, 448, 1130, 3814, 947, 107, 3, 1206,

In [4]:
# Hyper-parameters.
embedding_size = 512  # embedding (word vector) dimension
hidden_size = 512  # number of hidden units

# Training uses batches of 128; every other mode uses batches of 16.
batch_size = 128 if mode == 'train' else 16

# Two ways of initialising parameters.
k_initializer = tf.contrib.layers.xavier_initializer()
e_initializer = tf.random_uniform_initializer(minval=-1.0, maxval=1.0)

# Graph inputs. Each batch holds an unspecified number of rows; source rows
# have width maxlen_ch and target rows have width maxlen_en.
X = tf.placeholder(dtype=tf.int32, shape=[None, maxlen_ch])
X_len = tf.placeholder(dtype=tf.int32, shape=[None])
Y = tf.placeholder(dtype=tf.int32, shape=[None, maxlen_en])
Y_len = tf.placeholder(dtype=tf.int32, shape=[None])
Y_in = Y[:, :-1]  # Y without its last column (the final </s>)
Y_out = Y[:, 1:]  # Y without its first column (the leading <s>)

# One embedding table per language, with one row per vocabulary entry.
with tf.variable_scope('embedding_X'):
    embeddings_X = tf.get_variable(
        'weights_X',
        shape=[len(word2id_ch), embedding_size],
        initializer=e_initializer)
    # batch_size, seq_len, embedding_size
    embedded_X = tf.nn.embedding_lookup(params=embeddings_X, ids=X)

with tf.variable_scope('embedding_Y'):
    embeddings_Y = tf.get_variable(
        'weights_Y',
        shape=[len(word2id_en), embedding_size],
        initializer=e_initializer)
    # batch_size, seq_len, embedding_size
    embedded_Y = tf.nn.embedding_lookup(params=embeddings_Y, ids=Y_in)

In [5]:
def single_cell(mode=mode):
    """
    Build one LSTM cell wrapped with dropout on its inputs.

    @param mode: run mode; 'train' keeps inputs with probability 0.8, any other
                 value uses a keep probability of 1.0 (no dropout)
    @return cell: a BasicLSTMCell with hidden_size units inside a DropoutWrapper
    """
    keep_prob = 0.8 if mode == 'train' else 1.0
    return tf.nn.rnn_cell.DropoutWrapper(
        tf.nn.rnn_cell.BasicLSTMCell(hidden_size),
        input_keep_prob=keep_prob)


def multi_cells(num_layers):
    """
    Stack several LSTM cells into a multi-layer cell.

    @param num_layers: number of LSTM layers; each layer is a fresh single_cell()
    @return: a tf.nn.rnn_cell.MultiRNNCell holding num_layers cells
    """
    layers = [single_cell() for _ in range(num_layers)]
    return tf.nn.rnn_cell.MultiRNNCell(layers)
    

"""
编码器部分
双向循环、单层 LSTM
"""
# Encoder: bidirectional, single-layer LSTM over the embedded source sentence.
with tf.variable_scope('encoder'):
    num_layers = 1
    fw_cell = multi_cells(num_layers)
    bw_cell = multi_cells(num_layers)
    bi_outputs, bi_state = tf.nn.bidirectional_dynamic_rnn(
        fw_cell, bw_cell, embedded_X,
        sequence_length=X_len, dtype=tf.float32)

    separator = '=' * 100

    # bi_outputs is a (forward, backward) pair:
    # fw: batch_size, seq_len, hidden_size
    # bw: batch_size, seq_len, hidden_size
    print(separator, '\n', bi_outputs)

    # Join both directions on the last axis: batch_size, seq_len, 2 * hidden_size
    encoder_outputs = tf.concat(bi_outputs, -1)
    print(separator, '\n', encoder_outputs)

    # 2 tuple(fw & bw), 2 tuple(c & h), batch_size, hidden_size
    # ((c, h), (c, h))
    # (([?, 512], [?, 512]), ([?, 512], [?, 512]))
    print(separator, '\n', bi_state)

    # Interleave the final states layer by layer: forward state first, then the
    # backward state of the same layer.
    fw_states, bw_states = bi_state
    encoder_state = tuple(
        state
        for layer in range(num_layers)
        for state in (fw_states[layer], bw_states[layer]))
    # 2 tuple, 2 tuple(c & h), batch_size, hidden_size

    print(separator)
    for index, state in enumerate(encoder_state):
        print(index, state)

Instructions for updating:
seq_dim is deprecated, use seq_axis instead
Instructions for updating:
batch_dim is deprecated, use batch_axis instead
 (<tf.Tensor 'encoder/bidirectional_rnn/fw/fw/transpose_1:0' shape=(?, 62, 512) dtype=float32>, <tf.Tensor 'encoder/ReverseSequence:0' shape=(?, 62, 512) dtype=float32>)
 Tensor("encoder/concat:0", shape=(?, 62, 1024), dtype=float32)
 ((LSTMStateTuple(c=<tf.Tensor 'encoder/bidirectional_rnn/fw/fw/while/Exit_3:0' shape=(?, 512) dtype=float32>, h=<tf.Tensor 'encoder/bidirectional_rnn/fw/fw/while/Exit_4:0' shape=(?, 512) dtype=float32>),), (LSTMStateTuple(c=<tf.Tensor 'encoder/bidirectional_rnn/bw/bw/while/Exit_3:0' shape=(?, 512) dtype=float32>, h=<tf.Tensor 'encoder/bidirectional_rnn/bw/bw/while/Exit_4:0' shape=(?, 512) dtype=float32>),))
0 LSTMStateTuple(c=<tf.Tensor 'encoder/bidirectional_rnn/fw/fw/while/Exit_3:0' shape=(?, 512) dtype=float32>, h=<tf.Tensor 'encoder/bidirectional_rnn/fw/fw/while/Exit_4:0' shape=(?, 512) dtype=float32>)
1 LST

In [6]:
"""
解码部分
"""
with tf.variable_scope('decoder'):
    # Decoder graph. In 'infer' mode it decodes with a BeamSearchDecoder and
    # exposes `sample_id`; in every other mode it decodes with a TrainingHelper
    # fed with the embedded target inputs and exposes `logits` for the loss.
    beam_width = 10
    memory = encoder_outputs  # the encoder outputs are what the attention attends over

    if mode == 'infer':
        # For beam search the attention memory, the source lengths and the
        # encoder state all go through tile_batch with multiplier beam_width,
        # and the batch size used for the zero state below is scaled to match.
        memory = tf.contrib.seq2seq.tile_batch(memory, beam_width)
        X_len_ = tf.contrib.seq2seq.tile_batch(X_len, beam_width)
        encoder_state = tf.contrib.seq2seq.tile_batch(encoder_state, beam_width)
        bs = batch_size * beam_width
    else:
        bs = batch_size
        X_len_ = X_len
    
    attention = tf.contrib.seq2seq.LuongAttention(hidden_size, memory, X_len_, scale=True) # multiplicative
    # attention = tf.contrib.seq2seq.BahdanauAttention(hidden_size, memory, X_len_, normalize=True) # additive
    # Twice as many layers as the encoder: encoder_state holds one forward and
    # one backward state per encoder layer, and it becomes this cell's state below.
    cell = multi_cells(num_layers * 2)
    cell = tf.contrib.seq2seq.AttentionWrapper(cell, attention, hidden_size, name='attention')
    # Start from the wrapper's zero state with its cell state replaced by the
    # encoder state. `bs` is derived from the Python int batch_size, so the
    # graph is built for exactly that batch size.
    decoder_initial_state = cell.zero_state(bs, tf.float32).clone(cell_state=encoder_state)
    
    with tf.variable_scope('projected'):
        # Bias-free projection onto the target vocabulary.
        output_layer = tf.layers.Dense(len(word2id_en), use_bias=False, kernel_initializer=k_initializer)
    
    if mode == 'infer':
        # One <s> start token per batch entry; </s> is passed as the end token.
        start = tf.fill([batch_size], word2id_en['<s>'])
        decoder = tf.contrib.seq2seq.BeamSearchDecoder(cell, embeddings_Y, start, word2id_en['</s>'],
                                                       decoder_initial_state, beam_width, output_layer)
        # Decoding is capped at twice the longest source length in the batch.
        outputs, final_context_state, _ = tf.contrib.seq2seq.dynamic_decode(decoder,
                                                output_time_major=True,
                                                maximum_iterations=2 * tf.reduce_max(X_len))
        sample_id = outputs.predicted_ids
    else:
        # Every sequence is given length maxlen_en - 1, the width of Y_in
        # (Y without its last column). Positions beyond the real target length
        # are masked out later, when the loss is computed.
        helper = tf.contrib.seq2seq.TrainingHelper(embedded_Y, [maxlen_en - 1 for b in range(batch_size)])
        decoder = tf.contrib.seq2seq.BasicDecoder(cell, helper, decoder_initial_state, output_layer)
        
        outputs, final_context_state, _ = tf.contrib.seq2seq.dynamic_decode(decoder, 
                                                                            output_time_major=True)
        logits = outputs.rnn_output
        # dynamic_decode was asked for time-major output; swap the first two
        # axes so that logits are batch-major again.
        logits = tf.transpose(logits, (1, 0, 2))
        print(logits)

Tensor("decoder/transpose:0", shape=(128, ?, 20003), dtype=float32)


In [7]:
# The loss exists in 'train' and 'eval' mode only; 'infer' builds no logits.
if mode != 'infer':
    with tf.variable_scope('loss'):
        # Cross entropy for every target position, padding included.
        token_xent = tf.nn.sparse_softmax_cross_entropy_with_logits(
            labels=Y_out, logits=logits)
        # 1.0 for the first Y_len positions of each row, 0.0 for the rest.
        mask = tf.sequence_mask(
            lengths=Y_len, maxlen=tf.shape(Y_out)[1], dtype=tf.float32)
        # Sum over the unmasked positions, divided by the batch size.
        loss = tf.reduce_sum(token_xent * mask) / batch_size

if mode == 'train':
    # The learning rate is a non-trainable variable so that the training loop
    # can change it between epochs.
    learning_rate = tf.Variable(0.0, trainable=False)
    params = tf.trainable_variables()
    raw_grads = tf.gradients(loss, params)
    # Clip the gradients to a global norm of 5.0 before applying them.
    grads, _ = tf.clip_by_global_norm(raw_grads, clip_norm=5.0)
    sgd = tf.train.GradientDescentOptimizer(learning_rate)
    optimizer = sgd.apply_gradients(zip(grads, params))

In [8]:
# The session is created and initialised in every mode, not only when training.
sess = tf.Session()
sess.run(tf.global_variables_initializer())

if mode == 'train':
    saver = tf.train.Saver()
    OUTPUT_DIR = 'model_diy'
    # exist_ok avoids the check-then-create race of os.path.exists + os.mkdir.
    os.makedirs(OUTPUT_DIR, exist_ok=True)

    tf.summary.scalar('loss', loss)
    summary = tf.summary.merge_all()
    writer = tf.summary.FileWriter(OUTPUT_DIR)

    # Build the learning-rate update once. Calling tf.assign inside the epoch
    # loop would add a new op to the graph on every epoch.
    lr_value = tf.placeholder(tf.float32, [])
    set_learning_rate = tf.assign(learning_rate, lr_value)

    epochs = 20
    # The learning rate stays at 1.0 up to and including this epoch, then is
    # halved in steps (four halvings spread over the remaining epochs).
    start_decay = int(epochs * 2 / 3)
    # Only full batches are used; a trailing partial batch is skipped because
    # the graph is built for a fixed batch_size.
    num_batches = train_ch.shape[0] // batch_size

    try:
        for e in range(epochs):
            total_loss = 0
            total_count = 0

            if e <= start_decay:
                lr = 1.0
            else:
                decay = 0.5 ** (int(4 * (e - start_decay) / (epochs - start_decay)))
                lr = 1.0 * decay
            sess.run(set_learning_rate, feed_dict={lr_value: lr})

            # Reshuffle sentences and lengths together so that they stay aligned.
            train_ch, len_train_ch, train_en, len_train_en = shuffle(
                train_ch, len_train_ch, train_en, len_train_en)

            for i in tqdm(range(num_batches)):
                begin = i * batch_size
                end = begin + batch_size
                X_batch = train_ch[begin:end]
                X_len_batch = len_train_ch[begin:end]
                Y_batch = train_en[begin:end]
                # Y_len describes Y_out, which is one token shorter than Y.
                Y_len_batch = [l - 1 for l in len_train_en[begin:end]]

                feed_dict = {X: X_batch, Y: Y_batch, X_len: X_len_batch, Y_len: Y_len_batch}

                if i > 0 and i % 100 == 0:
                    # Fetch the summary in the same run as the training step
                    # instead of paying for a second forward pass.
                    _, ls_, summary_ = sess.run([optimizer, loss, summary], feed_dict=feed_dict)
                    # Global step = batches in all finished epochs + batch index.
                    # The unparenthesised `e * N // batch_size` floor-divided
                    # the product instead of multiplying by the batch count.
                    writer.add_summary(summary_, e * num_batches + i)
                    writer.flush()
                else:
                    _, ls_ = sess.run([optimizer, loss], feed_dict=feed_dict)

                # loss is the summed cross entropy divided by batch_size, so
                # multiplying by batch_size recovers the sum over tokens.
                total_loss += ls_ * batch_size
                total_count += np.sum(Y_len_batch)

            print('Epoch %d lr %.3f perplexity %.2f' % (e, lr, np.exp(total_loss / total_count)))
            saver.save(sess, os.path.join(OUTPUT_DIR, 'nmt'))
    finally:
        # Close the event file even when training is interrupted.
        writer.close()

100%|██████████| 781/781 [06:16<00:00,  2.11it/s]


Epoch 0 lr 1.000 perplexity 1280.41


100%|██████████| 781/781 [06:13<00:00,  2.09it/s]


Epoch 1 lr 1.000 perplexity 101.62


100%|██████████| 781/781 [06:13<00:00,  2.09it/s]


Epoch 2 lr 1.000 perplexity 44.82


100%|██████████| 781/781 [06:13<00:00,  2.13it/s]


Epoch 3 lr 1.000 perplexity 29.07


100%|██████████| 781/781 [06:13<00:00,  2.13it/s]


Epoch 4 lr 1.000 perplexity 22.38


100%|██████████| 781/781 [06:13<00:00,  2.08it/s]


Epoch 5 lr 1.000 perplexity 18.66


100%|██████████| 781/781 [06:13<00:00,  2.10it/s]


Epoch 6 lr 1.000 perplexity 16.27


100%|██████████| 781/781 [06:13<00:00,  2.09it/s]


Epoch 7 lr 1.000 perplexity 14.61


100%|██████████| 781/781 [06:13<00:00,  2.11it/s]


Epoch 8 lr 1.000 perplexity 13.34


100%|██████████| 781/781 [06:12<00:00,  2.09it/s]


Epoch 9 lr 1.000 perplexity 12.34


100%|██████████| 781/781 [06:13<00:00,  2.12it/s]


Epoch 10 lr 1.000 perplexity 11.54


100%|██████████| 781/781 [06:13<00:00,  2.12it/s]


Epoch 11 lr 1.000 perplexity 10.89


100%|██████████| 781/781 [06:13<00:00,  2.11it/s]


Epoch 12 lr 1.000 perplexity 10.32


100%|██████████| 781/781 [06:13<00:00,  2.07it/s]


Epoch 13 lr 1.000 perplexity 9.82


100%|██████████| 781/781 [06:13<00:00,  2.12it/s]


Epoch 14 lr 1.000 perplexity 9.42


100%|██████████| 781/781 [06:12<00:00,  2.10it/s]


Epoch 15 lr 0.500 perplexity 7.45


100%|██████████| 781/781 [06:13<00:00,  2.08it/s]


Epoch 16 lr 0.500 perplexity 6.93


100%|██████████| 781/781 [06:13<00:00,  2.09it/s]


Epoch 17 lr 0.250 perplexity 6.13


100%|██████████| 781/781 [06:13<00:00,  2.10it/s]


Epoch 18 lr 0.250 perplexity 5.89


100%|██████████| 781/781 [06:12<00:00,  2.11it/s]


Epoch 19 lr 0.125 perplexity 5.53


In [9]:
if mode == 'eval':
    # Restore the most recent checkpoint written to the output directory.
    saver = tf.train.Saver()
    OUTPUT_DIR = 'model_diy'
    saver.restore(sess, tf.train.latest_checkpoint(OUTPUT_DIR))

    total_loss = 0
    total_count = 0
    # Only full batches are evaluated; the floor division skips a trailing
    # partial batch.
    for i in tqdm(range(dev_ch.shape[0] // batch_size)):
        rows = slice(i * batch_size, (i + 1) * batch_size)
        X_batch = dev_ch[rows]
        X_len_batch = len_dev_ch[rows]
        Y_batch = dev_en[rows]
        # Y_len describes Y_out, which is one token shorter than Y.
        Y_len_batch = [l - 1 for l in len_dev_en[rows]]

        feed_dict = {X: X_batch, Y: Y_batch, X_len: X_len_batch, Y_len: Y_len_batch}
        ls_ = sess.run(loss, feed_dict=feed_dict)

        # loss is the summed cross entropy divided by batch_size; undo that
        # division to accumulate the sum over tokens.
        total_loss += ls_ * batch_size
        total_count += np.sum(Y_len_batch)

    print('Dev perplexity %.2f' % np.exp(total_loss / total_count))

In [10]:
if mode == 'infer':
    # Restore the most recent checkpoint written to the output directory.
    saver = tf.train.Saver()
    OUTPUT_DIR = 'model_diy'
    saver.restore(sess, tf.train.latest_checkpoint(OUTPUT_DIR))

    def translate(ids):
        """Turn a sequence of target ids into a space-joined sentence.

        A leading '<s>' is dropped and the sentence is cut at the first '</s>'.
        An empty id sequence yields an empty string.
        """
        words = [id2word_en[i] for i in ids]
        if words and words[0] == '<s>':
            words = words[1:]
        if '</s>' in words:
            words = words[:words.index('</s>')]
        return ' '.join(words)

    def pad_to_batch(rows):
        """Repeat the last row until `rows` has batch_size rows; full batches pass through."""
        rows = np.asarray(rows)
        missing = batch_size - rows.shape[0]
        if missing <= 0:
            return rows
        return np.concatenate([rows, np.repeat(rows[-1:], missing, axis=0)], axis=0)

    num_examples = test_ch.shape[0]
    # The hypothesis file is compared with data/test.en below, so it needs one
    # line per test sentence. The graph is built for a fixed batch_size, so a
    # final partial batch is padded with copies of its last row, and only the
    # real rows are written out. Without this the last sentences were dropped.
    with open('output_test_diy', 'w', encoding='utf-8') as fw:
        for begin in tqdm(range(0, num_examples, batch_size)):
            end = begin + batch_size
            real_rows = min(batch_size, num_examples - begin)

            X_batch = pad_to_batch(test_ch[begin:end])
            X_len_batch = pad_to_batch(len_test_ch[begin:end])
            Y_batch = pad_to_batch(test_en[begin:end])
            # Y_len describes Y_out, which is one token shorter than Y.
            Y_len_batch = pad_to_batch(len_test_en[begin:end]) - 1

            feed_dict = {X: X_batch, Y: Y_batch, X_len: X_len_batch, Y_len: Y_len_batch}
            ids = sess.run(sample_id, feed_dict=feed_dict) # seq_len, batch_size, beam_width
            ids = np.transpose(ids, (1, 2, 0)) # batch_size, beam_width, seq_len
            # Keep the first beam of every real (non-padding) row.
            ids = ids[:real_rows, 0, :] # real_rows, seq_len

            for j in range(ids.shape[0]):
                fw.write(translate(ids[j]) + '\n')

    # Imported here because it is only needed in 'infer' mode.
    from nmt.utils.evaluation_utils import evaluate

    for metric in ['bleu', 'rouge']:
        score = evaluate('data/test.en', 'output_test_diy', metric)
        print(metric, score / 100)