# 基于注意力机制的翻译系统


首先加载依赖的包

In [1]:
from keras.layers import Bidirectional, Concatenate, Permute, Dot, Input, LSTM, Multiply
from keras.layers import RepeatVector, Dense, Activation, Lambda
from keras.optimizers import Adam
from keras.utils import to_categorical
from keras.models import load_model, Model
import keras.backend as K
import numpy as np

Using TensorFlow backend.


## 1. 英文转汉语

之前基于seq2seq的模型，现在加入注意力机制。
### 1.1 数据集


In [1]:
# ========读取原始数据========
with open('cmn.txt', 'r', encoding='utf-8') as f:
    data = f.read()
data = data.split('\n')
data = data[:1000]
print(data[-5:])


# 分割英文数据和中文数据
en_data = [line.split('\t')[0] for line in data]
ch_data = [line.split('\t')[1] for line in data]
print('英文数据:\n', en_data[:10])
print('\n中文数据:\n', ch_data[:10])

["I don't want it.\t我不要.", 'I feel relieved.\t我感觉轻松了。', 'I get up at six.\t我六點起床。', 'I had no choice.\t那时我没有选择的余地。', 'I hate studying.\t我讨厌学习。']
英文数据:
 ['Hi.', 'Hi.', 'Run.', 'Wait!', 'Hello!', 'I try.', 'I won!', 'Oh no!', 'Cheers!', 'He ran.']

中文数据:
 ['嗨。', '你好。', '你用跑的。', '等等！', '你好。', '让我来。', '我赢了。', '不会吧。', '乾杯!', '他跑了。']


In [2]:
# 特殊字符
SOURCE_CODES = ['<PAD>', '<UNK>']
TARGET_CODES = ['<PAD>', '<EOS>', '<UNK>', '<GO>']  # 在target中，需要增加<GO>与<EOS>特殊字符

# 分别生成中英文字典
en_vocab = set(''.join(en_data))
id2en = SOURCE_CODES + list(en_vocab)
en2id = {c:i for i,c in enumerate(id2en)}

# 分别生成中英文字典
ch_vocab = set(''.join(ch_data))
id2ch = TARGET_CODES + list(ch_vocab)
ch2id = {c:i for i,c in enumerate(id2ch)}

print('\n英文字典:\n', en2id)
print('\n中文字典共计\n:', ch2id)


英文字典:
 {'<PAD>': 0, '<UNK>': 1, 'd': 2, 'j': 3, 'F': 4, '8': 5, 't': 6, 'S': 7, 'w': 8, 'P': 9, 'W': 10, ' ': 11, 'u': 12, '0': 13, 'o': 14, 'r': 15, '3': 16, '!': 17, 'E': 18, ',': 19, 'Q': 20, 'J': 21, 'I': 22, 'a': 23, 'K': 24, 'T': 25, 'f': 26, 's': 27, 'L': 28, 'B': 29, 'k': 30, 'h': 31, '7': 32, 'v': 33, 'q': 34, 'm': 35, ':': 36, 'O': 37, 'c': 38, 'z': 39, '.': 40, 'Y': 41, 'R': 42, 'l': 43, 'e': 44, 'p': 45, "'": 46, 'A': 47, 'G': 48, 'b': 49, 'y': 50, 'g': 51, 'x': 52, 'H': 53, 'U': 54, '1': 55, 'C': 56, 'N': 57, '?': 58, 'D': 59, 'V': 60, 'M': 61, 'n': 62, 'i': 63}

中文字典共计
: {'<PAD>': 0, '<EOS>': 1, '<UNK>': 2, '<GO>': 3, '照': 4, '牢': 5, '存': 6, '學': 7, '他': 8, '认': 9, '说': 10, '溺': 11, '定': 12, '當': 13, '時': 14, '漲': 15, '车': 16, '逝': 17, '赏': 18, '员': 19, '乾': 20, '很': 21, '通': 22, '亲': 23, '指': 24, '聽': 25, '恥': 26, '們': 27, '丽': 28, '谦': 29, '举': 30, '狸': 31, '燃': 32, '矩': 33, '每': 34, '笔': 35, '嘴': 36, '誰': 37, '联': 38, '恨': 39, '對': 40, '斯': 41, '書': 42, '移': 43, '被': 

In [4]:
# 利用字典，映射数据
en_num_data = [[en2id[en] for en in line] for line in en_data]
de_num_data = [[ch2id[ch] for ch in line] + [ch2id['<EOS>']] for line in ch_data]

print('char:', en_data[1])
print('index:', en_num_data[1])

en_maxlength = max([len(line) for line in en_num_data])
ch_maxlength = max([len(line) for line in de_num_data])

# 文本数据转化为数字数据
en_num_data = [data + [en2id['<PAD>']] * (en_maxlength - len(data)) for data in en_num_data]
de_num_data = [data + [en2id['<PAD>']] * (ch_maxlength - len(data)) for data in de_num_data]


# 设计数据生成器
def batch_data(en_num_data, ch_num_data, de_num_data, batch_size):
    batch_num = len(en_num_data) // batch_size
    for i in range(batch_num):
        begin = i * batch_size
        end = begin + batch_size
        x = en_num_data[begin:end]
        z = de_num_data[begin:end]
        yield x, z


char: Hi.
index: [53, 63, 40]


In [5]:
import numpy as np

# 获取输入输出端的最大长度
max_encoder_seq_length = max([len(txt) for txt in en_num_data])
max_decoder_seq_length = max([len(txt) for txt in ch_num_data])
print('max encoder length:', max_encoder_seq_length)
print('max decoder length:', max_decoder_seq_length)

# 将数据进行onehot处理
encoder_input_data = np.zeros((len(en_num_data), max_encoder_seq_length, len(en2id)), dtype='float32')
decoder_input_data = np.zeros((len(ch_num_data), max_decoder_seq_length, len(ch2id)), dtype='float32')
decoder_target_data = np.zeros((len(ch_num_data), max_decoder_seq_length, len(ch2id)), dtype='float32')

for i in range(len(ch_num_data)):
    for t, j in enumerate(en_num_data[i]):
        encoder_input_data[i, t, j] = 1.
    for t, j in enumerate(ch_num_data[i]):
        decoder_input_data[i, t, j] = 1.
    for t, j in enumerate(de_num_data[i]):
        decoder_target_data[i, t, j] = 1.


print('input shape:', encoder_input_data.shape)
print('output shape:', decoder_input_data.shape)

max encoder length: 9
max decoder length: 11
input shape: (100, 9, 46)
output shape: (100, 11, 149)


In [6]:
# =======预定义模型参数========
EN_VOCAB_SIZE = len(en2id)
CH_VOCAB_SIZE = len(ch2id)
HIDDEN_SIZE = 256

LEARNING_RATE = 0.003
BATCH_SIZE = 100
EPOCHS = 200

In [23]:
import tensorflow as tf

class RNNModel():
	"""docstring for RNNModel"""
	def __init__(self, BATCH_SIZE, HIDDEN_SIZE, HIDDEN_LAYERS, learning_rate):
		super(RNNModel, self).__init__()
		self.BATCH_SIZE = BATCH_SIZE
		self.HIDDEN_SIZE = HIDDEN_SIZE
		self.HIDDEN_LAYERS = HIDDEN_LAYERS
		
		# ======定义占位符======
		with tf.name_scope('input'):
			self.encoder_inputs = tf.placeholder(tf.float32, [BATCH_SIZE, max_encoder_seq_length, EN_VOCAB_SIZE])
			self.decoder_inputs = tf.placeholder(tf.float32, [BATCH_SIZE, max_decoder_seq_length, CH_VOCAB_SIZE])
			self.targets = tf.placeholder(tf.float32, [BATCH_SIZE, max_decoder_seq_length, None])
			self.keepprb = tf.placeholder(tf.float32)

		# ======搭建encoder结构=====
		with tf.name_scope('encoder'):
			lstm1 = tf.contrib.rnn.LSTMCell(HIDDEN_SIZE, state_is_tuple=True)
			lstm1 = tf.contrib.rnn.DropoutWrapper(lstm1, output_keep_prob=self.keepprb)
			cell1 = tf.contrib.rnn.MultiRNNCell([lstm1] * HIDDEN_LAYERS)
			initial_state = cell1.zero_state(BATCH_SIZE, tf.float32)
            
			_, self.final_state = tf.nn.dynamic_rnn(cell1, self.encoder_inputs, initial_state=initial_state)

		# ======搭建decoder结构=====
		with tf.name_scope('decoder'):
			lstm2 = tf.contrib.rnn.LSTMCell(HIDDEN_SIZE, state_is_tuple=True)
			lstm2 = tf.contrib.rnn.DropoutWrapper(lstm2, output_keep_prob=self.keepprb)
			cell2 = tf.contrib.rnn.MultiRNNCell([lstm2] * HIDDEN_LAYERS)

			outputs, self.final_state = tf.nn.dynamic_rnn(cell2, self.decoder_inputs, initial_state=self.final_state)
            
            
		# =====重新reshape输出=====
		with tf.name_scope('output_layer'):
			outputs = tf.reshape(tf.concat(outputs, 1), [-1, HIDDEN_SIZE])
			w = tf.get_variable('outputs_weight', [HIDDEN_SIZE, VOCAB_SIZE])
			b = tf.get_variable('outputs_bias', [VOCAB_SIZE])
			logits = tf.matmul(outputs, w) + b

		# ======计算损失=======
		with tf.name_scope('loss'):
			self.loss = tf.contrib.legacy_seq2seq.sequence_loss_by_example([logits], [tf.reshape(self.targets, [-1])], 
															[tf.ones([BATCH_SIZE * TIME_STEPS], dtype=tf.float32)])
			self.cost = tf.reduce_sum(self.loss) / BATCH_SIZE

		# =============优化算法==============
		with tf.name_scope('opt'):
            # =============学习率衰减==============
			global_step = tf.Variable(0)
			learning_rate = tf.train.exponential_decay(learning_rate, global_step, BATCH_NUMS, 0.99, staircase=True)

			# =======通过clip_by_global_norm()控制梯度大小======
			trainable_variables = tf.trainable_variables()
			grads, _ = tf.clip_by_global_norm(tf.gradients(self.cost, trainable_variables), MAX_GRAD_NORM)
			self.opt = tf.train.AdamOptimizer(learning_rate).apply_gradients(zip(grads, trainable_variables))

		# ==============预测输出=============
		with tf.name_scope('predict'):
			self.predict = tf.argmax(logits, 1)

In [24]:

# =======预定义模型参数========
EPOCHS = 50
TIME_STEPS = 100

HIDDEN_LAYERS = 3
MAX_GRAD_NORM = 1
learning_rate = 0.003


# ===========模型训练===========
model = RNNModel(BATCH_SIZE, HIDDEN_SIZE, HIDDEN_LAYERS, learning_rate)

# 保存模型
saver = tf.train.Saver()
with tf.Session() as sess:
	writer = tf.summary.FileWriter('logs/tensorboard', tf.get_default_graph())

	sess.run(tf.global_variables_initializer())
	for k in range(EPOCHS):
		state = sess.run(model.initial_state)
		train_data = data_generator(numdata, BATCH_SIZE, TIME_STEPS)
		total_loss = 0.
		for i in range(BATCH_NUMS):
			xs, ys = next(train_data)
			feed = {model.inputs: xs, model.targets: ys, model.keepprb: 0.8, model.initial_state: state}
			costs, state, _ = sess.run([model.cost, model.final_state, model.opt], feed_dict=feed)
			total_loss += costs
			if (i+1) % 50 == 0:
				print('epochs:', k + 1, 'iter:', i + 1, 'cost:', total_loss / i + 1)

	saver.save(sess, './checkpoints/lstm.ckpt')

writer.close()

ValueError: Variable rnn/multi_rnn_cell/cell_0/lstm_cell/kernel already exists, disallowed. Did you mean to set reuse=True or reuse=tf.AUTO_REUSE in VarScope? Originally defined at:

  File "d:\ProgramData\Anaconda3\lib\site-packages\tensorflow\python\framework\ops.py", line 1654, in __init__
    self._traceback = self._graph._extract_stack()  # pylint: disable=protected-access
  File "d:\ProgramData\Anaconda3\lib\site-packages\tensorflow\python\framework\ops.py", line 3290, in create_op
    op_def=op_def)
  File "d:\ProgramData\Anaconda3\lib\site-packages\tensorflow\python\framework\op_def_library.py", line 787, in _apply_op_helper
    op_def=op_def)
