# nmt

参考nmt模型搭建一个seq2seq的翻译系统，和之前seq2seq的区别在于加入attention，并且采用了更快的训练方法。

![seq2seq.jpg](attachment:seq2seq.jpg)

## 数据处理

At the bottom layer, the encoder and decoder RNNs receive as input the following: first, the source sentence, then a boundary marker "<s\>" which indicates the transition from the encoding to the decoding mode, and the target sentence. For training, we will feed the system with the following tensors, which are in time-major format and contain word indices:

- encoder_inputs [max_encoder_time, batch_size]: source input words.
- decoder_inputs [max_decoder_time, batch_size]: target input words.
- decoder_outputs [max_decoder_time, batch_size]: target output words, these are decoder_inputs shifted to the left by one time step with an end-of-sentence tag appended on the right.

Here for efficiency, we train with multiple sentences (batch_size) at once. Testing is slightly different, so we will discuss it later.

我们按照[max_encoder_time, batch_size]的格式将每一个batch的数据进行处理。

In [1]:
# ========读取原始数据========
with open('cmn.txt', 'r', encoding='utf-8') as f:
    data = f.read()
data = data.split('\n')
data = data[:100]
print(data[-5:])


# 分割英文数据和中文数据
en_data = [line.split('\t')[0] for line in data]
ch_data = [line.split('\t')[1] for line in data]
print('英文数据:\n', en_data[:10])
print('\n中文数据:\n', ch_data[:10])

['Tom died.\t汤姆去世了。', 'Tom quit.\t汤姆不干了。', 'Tom swam.\t汤姆游泳了。', 'Trust me.\t相信我。', 'Try hard.\t努力。']
英文数据:
 ['Hi.', 'Hi.', 'Run.', 'Wait!', 'Hello!', 'I try.', 'I won!', 'Oh no!', 'Cheers!', 'He ran.']

中文数据:
 ['嗨。', '你好。', '你用跑的。', '等等！', '你好。', '让我来。', '我赢了。', '不会吧。', '乾杯!', '他跑了。']


In [2]:
# 特殊字符
SOURCE_CODES = ['<PAD>', '<UNK>']
TARGET_CODES = ['<PAD>', '<EOS>', '<UNK>', '<GO>']  # 在target中，需要增加<GO>与<EOS>特殊字符

# 分别生成中英文字典
en_vocab = set(''.join(en_data))
id2en = SOURCE_CODES + list(en_vocab)
en2id = {c:i for i,c in enumerate(id2en)}

# 分别生成中英文字典
ch_vocab = set(''.join(ch_data))
id2ch = TARGET_CODES + list(ch_vocab)
ch2id = {c:i for i,c in enumerate(id2ch)}

print('\n英文字典:\n', en2id)
print('\n中文字典共计\n:', ch2id)


英文字典:
 {'<PAD>': 0, '<UNK>': 1, 'L': 2, 'b': 3, 'B': 4, 'R': 5, 'l': 6, 'N': 7, 'K': 8, 'P': 9, 'i': 10, 'H': 11, 'I': 12, 'h': 13, 'T': 14, 'v': 15, 'w': 16, 'd': 17, 'O': 18, 'y': 19, 'W': 20, '!': 21, '.': 22, 'm': 23, 'S': 24, 'r': 25, 'n': 26, 't': 27, 'e': 28, 'f': 29, 'p': 30, 'g': 31, 'A': 32, 'k': 33, 's': 34, '?': 35, 'D': 36, "'": 37, 'c': 38, 'Y': 39, 'G': 40, 'u': 41, 'a': 42, 'q': 43, 'J': 44, 'C': 45, ' ': 46, 'o': 47}

中文字典共计
: {'<PAD>': 0, '<EOS>': 1, '<UNK>': 2, '<GO>': 3, '加': 4, '泳': 5, '吃': 6, '持': 7, '不': 8, '系': 9, '生': 10, '來': 11, '忙': 12, '完': 13, '忘': 14, '呆': 15, '力': 16, '起': 17, '问': 18, '辞': 19, '嘴': 20, '往': 21, '？': 22, '什': 23, '欢': 24, '！': 25, '住': 26, '到': 27, '老': 28, '用': 29, '杯': 30, '吧': 31, '好': 32, '留': 33, '閉': 34, '把': 35, '们': 36, '們': 37, '干': 38, '帮': 39, '确': 40, '冷': 41, '个': 42, '定': 43, '试': 44, '跳': 45, '是': 46, '意': 47, '联': 48, '趴': 49, '拿': 50, '立': 51, '事': 52, '開': 53, '病': 54, '管': 55, '关': 56, '清': 57, '很': 58, '迎': 59, '洗': 

In [3]:
import numpy as np
# 利用字典，映射数据
en_num_data = [[en2id[en] for en in line] for line in en_data]
ch_num_data = [[ch2id['<GO>']] + [ch2id[ch] for ch in line] for line in ch_data]
de_num_data = [[ch2id[ch] for ch in line] + [ch2id['<EOS>']] for line in ch_data]

print('char:', en_data[1])
print('index:', en_num_data[1])


en_maxlength = max([len(line) for line in en_num_data])
de_maxlength = max([len(line) for line in ch_num_data])



# 设计数据生成器
def batch_data(en_num_data, ch_num_data, de_num_data, batch_size):
    batch_num = len(en_num_data) // batch_size
    for i in range(batch_num):
        begin = i * batch_size
        end = begin + batch_size
        encoder_inputs = en_num_data[begin:end]
        decoder_inputs = ch_num_data[begin:end]
        decoder_targets = de_num_data[begin:end]
        encoder_lengths = [len(line) for line in encoder_inputs]        
        decoder_lengths = [len(line) for line in decoder_inputs]
        encoder_max_length = max(encoder_lengths)
        decoder_max_length = max(decoder_lengths)
        encoder_inputs = np.array([data + [en2id['<PAD>']] * (encoder_max_length - len(data)) for data in encoder_inputs]).T
        decoder_inputs = np.array([data + [en2id['<PAD>']] * (decoder_max_length - len(data)) for data in decoder_inputs]).T
        decoder_targets = np.array([data + [en2id['<PAD>']] * (decoder_max_length - len(data)) for data in decoder_targets])
        mask = decoder_targets > 0
        target_weights = np.ma.array(decoder_targets,mask=mask).astype(np.float32)
        yield encoder_inputs, decoder_inputs, decoder_targets, target_weights, encoder_lengths, decoder_lengths
              


char: Hi.
index: [11, 10, 22]


In [4]:
import tensorflow as tf

max_encoder_seq_length = en_maxlength
max_decoder_seq_length = de_maxlength
keepprb = 0.9

EN_VOCAB_SIZE = len(en2id)
CH_VOCAB_SIZE = len(ch2id)

HIDDEN_LAYERS = 2
HIDDEN_SIZE = 128

learning_rate = 0.001

BATCH_SIZE = 8
BATCH_NUMS = len(ch_num_data) // BATCH_SIZE
MAX_GRAD_NORM = 1

EPOCHS = 50


## placeholder

In [5]:
encoder_inputs = tf.placeholder(tf.int32, [None, BATCH_SIZE])
decoder_inputs = tf.placeholder(tf.int32, [None, BATCH_SIZE])
decoder_targets = tf.placeholder(tf.int32, [BATCH_SIZE, None])
target_weights = tf.placeholder(tf.float32, [BATCH_SIZE, None])
source_sequence_length = tf.placeholder(tf.int32, [BATCH_SIZE,])
decoder_lengths = tf.placeholder(tf.int32, [BATCH_SIZE,])

keepprb = tf.placeholder(tf.float32)

## Embedding词嵌入层

In [6]:
# encoder
with tf.name_scope('embedding_encoder'):
	encoder_embedding = tf.get_variable('embedding_encoder', [EN_VOCAB_SIZE, HIDDEN_SIZE])
	encoder_emb = tf.nn.embedding_lookup(encoder_embedding, encoder_inputs)
	encoder_emb = tf.nn.dropout(encoder_emb, keepprb)

    
# decoder
with tf.name_scope('embedding_decoder'):
	decoder_embedding = tf.get_variable('embedding_decoder', [CH_VOCAB_SIZE, HIDDEN_SIZE])
	decoder_emb = tf.nn.embedding_lookup(decoder_embedding, decoder_inputs)
	decoder_emb = tf.nn.dropout(decoder_emb, keepprb)

## Encoder
Once retrieved, the word embeddings are then fed as input into the main network, which consists of two multi-layer RNNs – an encoder for the source language and a decoder for the target language. These two RNNs, in principle, can share the same weights; however, in practice, we often use two different RNN parameters (such models do a better job when fitting large training datasets). The encoder RNN uses zero vectors as its starting states and is built as follows:
```py
# Build RNN cell
encoder_cell = tf.nn.rnn_cell.BasicLSTMCell(num_units)

# Run Dynamic RNN
#   encoder_outputs: [max_time, batch_size, num_units]
#   encoder_state: [batch_size, num_units]
encoder_outputs, encoder_state = tf.nn.dynamic_rnn(
    encoder_cell, encoder_emb_inp,
    sequence_length=source_sequence_length, time_major=True)
```
Note that sentences have different lengths to avoid wasting computation, we tell dynamic_rnn the exact source sentence lengths through source_sequence_length. Since our input is time major, we set time_major=True. Here, we build only a single layer LSTM, encoder_cell. We will describe how to build multi-layer LSTMs, add dropout, and use attention in a later section.

In [7]:
# encoder
with tf.variable_scope('encoder'):
	encoder_lstm = tf.contrib.rnn.LSTMCell(HIDDEN_SIZE, state_is_tuple=True)
	encoder_lstm = tf.contrib.rnn.DropoutWrapper(encoder_lstm, output_keep_prob=keepprb)
	encoder_cell = tf.contrib.rnn.MultiRNNCell([encoder_lstm for _ in range(HIDDEN_LAYERS)])
	initial_state = encoder_cell.zero_state(BATCH_SIZE, tf.float32)
	encoder_outputs, final_state = tf.nn.dynamic_rnn(encoder_cell, encoder_emb, sequence_length=source_sequence_length, 
                                       time_major=True, initial_state=initial_state)

In [8]:
attention_states = tf.transpose(encoder_outputs, [1, 0, 2])
# Create an attention mechanism
attention_mechanism = tf.contrib.seq2seq.LuongAttention(HIDDEN_SIZE, attention_states, memory_sequence_length=source_sequence_length)


In [9]:
from tensorflow.python.layers.core import Dense

# decoder cell
with tf.variable_scope('decoder_cell'):
    decoder_lstm = tf.contrib.rnn.LSTMCell(HIDDEN_SIZE, state_is_tuple=True)
    decoder_lstm = tf.contrib.rnn.DropoutWrapper(decoder_lstm, output_keep_prob=keepprb)
    decoder_cell = [decoder_lstm] * HIDDEN_LAYERS
    decoder_cell[-1] = tf.contrib.seq2seq.AttentionWrapper(decoder_cell[-1], attention_mechanism, attention_layer_size=HIDDEN_SIZE)
    #decoder_cell = tf.contrib.rnn.MultiRNNCell(decoder_cell)
    # Helper
    projection_layer = Dense(CH_VOCAB_SIZE, use_bias=False)
    with tf.variable_scope('helper'):
        helper = tf.contrib.seq2seq.TrainingHelper(decoder_emb, decoder_lengths, time_major=True)
    #init_state = decoder_cell.zero_state(BATCH_SIZE, tf.float32).clone(cell_state=final_state)
    print(initial_state)
    decoder_cell = tf.contrib.rnn.MultiRNNCell(decoder_cell)
    decoder_cell = tf.contrib.seq2seq.BasicDecoder(decoder_cell, helper, final_state, output_layer=projection_layer)
    print(final_state)
    print(projection_layer)
    outputs, final_state, final_sequence_lengths = tf.contrib.seq2seq.dynamic_decode(decoder_cell)

    logits = outputs.rnn_output

(LSTMStateTuple(c=<tf.Tensor 'encoder/MultiRNNCellZeroState/DropoutWrapperZeroState/LSTMCellZeroState/zeros:0' shape=(8, 128) dtype=float32>, h=<tf.Tensor 'encoder/MultiRNNCellZeroState/DropoutWrapperZeroState/LSTMCellZeroState/zeros_1:0' shape=(8, 128) dtype=float32>), LSTMStateTuple(c=<tf.Tensor 'encoder/MultiRNNCellZeroState/DropoutWrapperZeroState_1/LSTMCellZeroState/zeros:0' shape=(8, 128) dtype=float32>, h=<tf.Tensor 'encoder/MultiRNNCellZeroState/DropoutWrapperZeroState_1/LSTMCellZeroState/zeros_1:0' shape=(8, 128) dtype=float32>))
(LSTMStateTuple(c=<tf.Tensor 'encoder/rnn/while/Exit_3:0' shape=(8, 128) dtype=float32>, h=<tf.Tensor 'encoder/rnn/while/Exit_4:0' shape=(8, 128) dtype=float32>), LSTMStateTuple(c=<tf.Tensor 'encoder/rnn/while/Exit_5:0' shape=(8, 128) dtype=float32>, h=<tf.Tensor 'encoder/rnn/while/Exit_6:0' shape=(8, 128) dtype=float32>))
<tensorflow.python.layers.core.Dense object at 0x0000018F4F345F60>


TypeError: Expected state to be instance of AttentionWrapperState. Received type <class 'tensorflow.python.ops.rnn_cell_impl.LSTMStateTuple'> instead.

In [None]:
with tf.variable_scope('optimizer'):
    # ======计算损失=======
    loss = tf.nn.sparse_softmax_cross_entropy_with_logits(labels=decoder_targets, logits=logits)
    cost = (tf.reduce_sum((loss * target_weights) / BATCH_SIZE))

    # =============优化算法==============
    # =============学习率衰减==============
    global_step = tf.Variable(0)
    learning_rate = tf.train.exponential_decay(learning_rate, global_step, BATCH_NUMS, 0.99, staircase=True)

                # =======通过clip_by_global_norm()控制梯度大小======
    trainable_variables = tf.trainable_variables()
    grads, _ = tf.clip_by_global_norm(tf.gradients(cost, trainable_variables), MAX_GRAD_NORM)
    opt = tf.train.AdamOptimizer(learning_rate).apply_gradients(zip(grads, trainable_variables))

		# ==============预测输出=============
predict = tf.argmax(logits[0], 1)

In [None]:
# 保存模型
saver = tf.train.Saver()
with tf.Session() as sess:
	writer = tf.summary.FileWriter('logs/tensorboard', tf.get_default_graph())
	sess.run(tf.global_variables_initializer())
	for k in range(EPOCHS):
		total_loss = 0.
		data_generator = batch_data(en_num_data, ch_num_data, de_num_data, BATCH_SIZE)
		for i in range(BATCH_NUMS):
			en_input, de_input, de_tg, tg_weight, en_len, de_len = next(data_generator)
			feed = {encoder_inputs: en_input, decoder_inputs: de_input, decoder_targets: de_tg, target_weights: tg_weight, source_sequence_length: en_len, decoder_lengths: de_len, keepprb: 0.8}
			costs, _ = sess.run([cost, opt], feed_dict=feed)
			total_loss += costs
			if (i+1) % 50 == 0:
				print('epochs:', k + 1, 'iter:', i + 1, 'cost:', total_loss / i + 1)
				#print('predict:', sess.run(predict[0], feed_dict=feed))
				print('text:', ''.join([id2ch[i] for i in sess.run(predict, feed_dict=feed)]))
				print('label:', ''.join([id2ch[i] for i in de_tg[0]]))
                
	saver.save(sess, './checkpoints/lstm.ckpt')

writer.close()