In [1]:
# ========读取原始数据========
with open('cmn.txt', 'r', encoding='utf-8') as f:
    data = f.read()
data = data.split('\n')
data = data[:1000]
print(data[-5:])


# 分割英文数据和中文数据
en_data = [line.split('\t')[0] for line in data]
ch_data = [line.split('\t')[1] for line in data]
print('英文数据:\n', en_data[:10])
print('\n中文数据:\n', ch_data[:10])

["I don't want it.\t我不要.", 'I feel relieved.\t我感觉轻松了。', 'I get up at six.\t我六點起床。', 'I had no choice.\t那时我没有选择的余地。', 'I hate studying.\t我讨厌学习。']
英文数据:
 ['Hi.', 'Hi.', 'Run.', 'Wait!', 'Hello!', 'I try.', 'I won!', 'Oh no!', 'Cheers!', 'He ran.']

中文数据:
 ['嗨。', '你好。', '你用跑的。', '等等！', '你好。', '让我来。', '我赢了。', '不会吧。', '乾杯!', '他跑了。']


In [2]:
# 特殊字符
SOURCE_CODES = ['<PAD>', '<UNK>']
TARGET_CODES = ['<PAD>', '<EOS>', '<UNK>', '<GO>']  # 在target中，需要增加<GO>与<EOS>特殊字符

# 分别生成中英文字典
en_vocab = set(''.join(en_data))
id2en = SOURCE_CODES + list(en_vocab)
en2id = {c:i for i,c in enumerate(id2en)}

# 分别生成中英文字典
ch_vocab = set(''.join(ch_data))
id2ch = TARGET_CODES + list(ch_vocab)
ch2id = {c:i for i,c in enumerate(id2ch)}

print('\n英文字典:\n', en2id)
print('\n中文字典共计\n:', ch2id)


英文字典:
 {'<PAD>': 0, '<UNK>': 1, 'x': 2, 'b': 3, 'v': 4, '!': 5, '7': 6, 'M': 7, 'c': 8, '0': 9, 'R': 10, 't': 11, 'k': 12, 'y': 13, '1': 14, ' ': 15, 'z': 16, 'P': 17, 'E': 18, 'F': 19, ',': 20, 'e': 21, 'h': 22, 'D': 23, 'I': 24, 'W': 25, 's': 26, 'm': 27, 'G': 28, 'i': 29, 'Y': 30, '8': 31, 'f': 32, 'J': 33, 'q': 34, 'V': 35, 'U': 36, 'C': 37, ':': 38, 'o': 39, 'r': 40, 'Q': 41, '.': 42, 'w': 43, 'N': 44, 'K': 45, 'S': 46, 'n': 47, 'B': 48, 'H': 49, 'O': 50, 'T': 51, 'a': 52, 'd': 53, 'j': 54, 'l': 55, '?': 56, '3': 57, 'L': 58, 'A': 59, 'p': 60, 'g': 61, "'": 62, 'u': 63}

中文字典共计
: {'<PAD>': 0, '<EOS>': 1, '<UNK>': 2, '<GO>': 3, '抗': 4, '會': 5, '糟': 6, '尝': 7, '我': 8, '密': 9, '!': 10, '没': 11, '疯': 12, '續': 13, '掃': 14, '說': 15, '孤': 16, '瑪': 17, '相': 18, '埋': 19, '需': 20, '時': 21, '路': 22, '闭': 23, '结': 24, '脱': 25, '錢': 26, '信': 27, 'D': 28, '學': 29, '三': 30, '开': 31, '燃': 32, '斯': 33, '量': 34, '欠': 35, '危': 36, '呢': 37, '么': 38, '貪': 39, '矮': 40, '幹': 41, '露': 42, '缺': 43, '识': 

In [3]:
# 利用字典，映射数据
en_num_data = [[en2id[en] for en in line] for line in en_data]
ch_num_data = [[ch2id['<GO>']] + [ch2id[ch] for ch in line] + [ch2id['<EOS>']] for line in ch_data]
de_num_data = [[ch2id[ch] for ch in line] + [ch2id['<EOS>']] for line in ch_data]

print('char:', en_data[1])
print('index:', en_num_data[1])

en_maxlength = max([len(line) for line in en_num_data])
ch_maxlength = max([len(line) for line in ch_num_data])

# 文本数据转化为数字数据
en_num_data = [data + [en2id['<PAD>']] * (en_maxlength - len(data)) for data in en_num_data]
ch_num_data = [data + [en2id['<PAD>']] * (ch_maxlength - len(data)) for data in ch_num_data]
de_num_data = [data + [en2id['<PAD>']] * (ch_maxlength - len(data)) for data in de_num_data]


# 设计数据生成器
def batch_data(en_num_data, ch_num_data, de_num_data, batch_size):
    batch_num = len(en_num_data) // batch_size
    for i in range(batch_num):
        begin = i * batch_size
        end = begin + batch_size
        x = en_num_data[begin:end]
        y = ch_num_data[begin:end]
        z = de_num_data[begin:end]
        yield x, y, z


char: Hi.
index: [49, 29, 42]


In [4]:
import tensorflow as tf

max_encoder_seq_length = en_maxlength
max_decoder_seq_length = ch_maxlength
keepprb = 0.9

EN_VOCAB_SIZE = len(en2id)
CH_VOCAB_SIZE = len(ch2id)

HIDDEN_LAYERS = 2
HIDDEN_SIZE = 256

learning_rate = 0.003

BATCH_SIZE = 8
BATCH_NUMS = len(ch_num_data) // BATCH_SIZE
MAX_GRAD_NORM = 1

EPOCHS = 50


In [5]:

encoder_inputs = tf.placeholder(tf.int32, [BATCH_SIZE, max_encoder_seq_length])
decoder_inputs = tf.placeholder(tf.int32, [BATCH_SIZE, max_decoder_seq_length])
targets = tf.placeholder(tf.int32, [BATCH_SIZE, max_decoder_seq_length])
keepprb = tf.placeholder(tf.float32)


with tf.name_scope('embedding_encoder'):
	encoder_embedding = tf.get_variable('embedding_encoder', [EN_VOCAB_SIZE, HIDDEN_SIZE])
	encoder_emb = tf.nn.embedding_lookup(encoder_embedding, encoder_inputs)
	encoder_emb = tf.nn.dropout(encoder_emb, keepprb)


# encoder
with tf.variable_scope('encoder'):
	encoder_lstm = tf.contrib.rnn.LSTMCell(HIDDEN_SIZE, state_is_tuple=True)
	encoder_lstm = tf.contrib.rnn.DropoutWrapper(encoder_lstm, output_keep_prob=keepprb)
	encoder_cell = tf.contrib.rnn.MultiRNNCell([encoder_lstm] * HIDDEN_LAYERS)
	initial_state = encoder_cell.zero_state(BATCH_SIZE, tf.float32)
	_, final_state = tf.nn.dynamic_rnn(encoder_cell, encoder_emb, initial_state=initial_state)


with tf.name_scope('embedding_decoder'):
	decoder_embedding = tf.get_variable('embedding_decoder', [CH_VOCAB_SIZE, HIDDEN_SIZE])
	decoder_emb = tf.nn.embedding_lookup(decoder_embedding, decoder_inputs)
	decoder_emb = tf.nn.dropout(decoder_emb, keepprb)


# decoder
with tf.variable_scope('decoder'):
	decoder_lstm = tf.contrib.rnn.LSTMCell(HIDDEN_SIZE, state_is_tuple=True)
	decoder_lstm = tf.contrib.rnn.DropoutWrapper(decoder_lstm, output_keep_prob=keepprb)
	decoder_cell = tf.contrib.rnn.MultiRNNCell([decoder_lstm] * HIDDEN_LAYERS)
	outputs, _ = tf.nn.dynamic_rnn(decoder_cell, decoder_emb, initial_state=final_state)
	outputs = tf.reshape(tf.concat(outputs, 1), [-1, HIDDEN_SIZE])


w = tf.get_variable('outputs_weight', [HIDDEN_SIZE, CH_VOCAB_SIZE])
b = tf.get_variable('outputs_bias', [CH_VOCAB_SIZE])
logits = tf.matmul(outputs, w) + b

		# ======计算损失=======
loss = tf.contrib.legacy_seq2seq.sequence_loss_by_example([logits], [tf.reshape(targets, [-1])], 
														[tf.ones([BATCH_SIZE * max_decoder_seq_length], dtype=tf.float32)])
cost = tf.reduce_sum(loss) / BATCH_SIZE

		# =============优化算法==============
          # =============学习率衰减==============
global_step = tf.Variable(0)
learning_rate = tf.train.exponential_decay(learning_rate, global_step, BATCH_NUMS, 0.99, staircase=True)

			# =======通过clip_by_global_norm()控制梯度大小======
trainable_variables = tf.trainable_variables()
grads, _ = tf.clip_by_global_norm(tf.gradients(cost, trainable_variables), MAX_GRAD_NORM)
opt = tf.train.AdamOptimizer(learning_rate).apply_gradients(zip(grads, trainable_variables))

		# ==============预测输出=============
predict = tf.reshape(tf.argmax(logits, 1), [-1, max_decoder_seq_length])


In [6]:
# 保存模型
saver = tf.train.Saver()
with tf.Session() as sess:
	writer = tf.summary.FileWriter('logs/tensorboard', tf.get_default_graph())
	sess.run(tf.global_variables_initializer())
	for k in range(EPOCHS):
		total_loss = 0.
		data_generator = batch_data(en_num_data, ch_num_data, de_num_data, BATCH_SIZE)
		for i in range(BATCH_NUMS):
			en_batch, ch_batch, de_batch = next(data_generator)
			feed = {encoder_inputs: en_batch, decoder_inputs: ch_batch, targets: de_batch, keepprb: 0.8}
			costs, _ = sess.run([cost, opt], feed_dict=feed)
			total_loss += costs
			if (i+1) % 50 == 0:
				print('epochs:', k + 1, 'iter:', i + 1, 'cost:', total_loss / i + 1)
				#print('predict:', sess.run(predict[0], feed_dict=feed))
				print('text:', ''.join([id2ch[i] for i in sess.run(predict[0], feed_dict=feed) if(i != 0 and i != 1)]))
				print('label:', ''.join([id2ch[i] for i in de_batch[0] if(i != 0 and i != 1)]))
                
	saver.save(sess, './checkpoints/lstm.ckpt')

writer.close()

epochs: 1 iter: 50 cost: 35.808754590092875
text: 我是。。。
label: 开车慢点。
epochs: 1 iter: 100 cost: 33.83278162792475
text: 我我是。。
label: 我很快乐。
epochs: 2 iter: 50 cost: 24.753880987361985
text: 我是了。。
label: 开车慢点。
epochs: 2 iter: 100 cost: 26.28698688083225
text: 我是一。。
label: 我很快乐。
epochs: 3 iter: 50 cost: 23.512867324206294
text: 我是。。。
label: 开车慢点。
epochs: 3 iter: 100 cost: 24.913117765176175
text: 我是一。。
label: 我很快乐。
epochs: 4 iter: 50 cost: 22.480378520732025
text: 我是了。。
label: 开车慢点。
epochs: 4 iter: 100 cost: 23.888123830159504
text: 我是歡。。
label: 我很快乐。
epochs: 5 iter: 50 cost: 21.464869382430095
text: 我是了。。
label: 开车慢点。
epochs: 5 iter: 100 cost: 22.81276946597629
text: 我是一本。
label: 我很快乐。
epochs: 6 iter: 50 cost: 20.4119589280109
text: 我上！。。
label: 开车慢点。
epochs: 6 iter: 100 cost: 21.616657054785527
text: 我是一谎。
label: 我很快乐。
epochs: 7 iter: 50 cost: 19.504937133010554
text: 你！說！。
label: 开车慢点。
epochs: 7 iter: 100 cost: 20.603414718550866
text: 我們一谎。
label: 我很快乐。
epochs: 8 iter: 50 cost: 18.8474