# 基于seq2seq的中英文翻译系统
## 1. 项目背景
之前我们利用lstm进行建模，设计了一个自动生成莫言小说的模型，这次想要利用rnn的特点搭建一个中英文的翻译系统。传统的RNN输入和输出长度要一致，而seq2seq在RNN的基础上进行改进，实现了变长序列的输入和输出，广泛的应用在了机器翻译、对话系统、文本摘要等领域。 
- 代码参考：https://github.com/keras-team/keras/blob/master/examples/lstm_seq2seq.py

## 2. 项目数据
项目数据使用中英文翻译数据集，来实现字符级的seq2seq模型的训练。 
该文件来自于:http://www.manythings.org/anki/

内容如下：

In [1]:
# ========读取原始数据========
with open('cmn.txt', 'r', encoding='utf-8') as f:
    data = f.read()
data = data.split('\n')
data = data[:100]
print(data[-1:])

['Try hard.\t努力。']


## 3. 数据处理
### 3.1 生成字典
我们需要将汉字和英文映射为能够输入到模型中的数字信息，就需要建立一个映射关系，需要生成汉字和数字互相映射的字典。
- 我们将英文按照每个字母对应一个id
- 我们将中文按照每一个汉字对应一个id
- **注意增加：**
    1. 未知符号：UNK
    2. 补齐符号：PAD
    3. 开始符号：GO
    4. 结束符号：EOS

In [2]:
# 分割英文数据和中文数据
en_data = [line.split('\t')[0] for line in data]
ch_data = ['\t' + line.split('\t')[1] + '\n' for line in data]
print('英文数据:', en_data[:10])
print('中文数据:', ch_data[:10])
for char in ch_data[0]:
    print(char)

# 分别生成中英文字典
en_vocab = set(''.join(en_data))
id2en = list(en_vocab)
en2id = {c:i for i,c in enumerate(id2en)}


ch_vocab = set(''.join(ch_data))
id2ch = list(ch_vocab)
ch2id = {c:i for i,c in enumerate(id2ch)}


print('英文字典:\n', en2id)
print('中文字典共计:\n', (ch2id))

英文数据: ['Hi.', 'Hi.', 'Run.', 'Wait!', 'Hello!', 'I try.', 'I won!', 'Oh no!', 'Cheers!', 'He ran.']
中文数据: ['\t嗨。\n', '\t你好。\n', '\t你用跑的。\n', '\t等等！\n', '\t你好。\n', '\t让我来。\n', '\t我赢了。\n', '\t不会吧。\n', '\t乾杯!\n', '\t他跑了。\n']
	
嗨
。


英文字典:
 {'L': 0, 'N': 1, 'w': 2, 'I': 3, 'T': 4, 'd': 5, 't': 6, 'B': 7, 'c': 8, 'm': 9, 'A': 10, '.': 11, 'l': 12, 'P': 13, 'S': 14, "'": 15, 'D': 16, 'G': 17, 'H': 18, 'f': 19, 's': 20, 'a': 21, 'i': 22, 'Y': 23, 'n': 24, '!': 25, 'b': 26, 'O': 27, 'g': 28, 'h': 29, 'k': 30, 'v': 31, 'r': 32, 'K': 33, 'R': 34, 'C': 35, 'W': 36, 'u': 37, 'p': 38, 'y': 39, 'J': 40, 'o': 41, 'q': 42, 'e': 43, '?': 44, ' ': 45}
中文字典共计:
 {'清': 0, '往': 1, '帮': 2, '玩': 3, '出': 4, '什': 5, '前': 6, '入': 7, '别': 8, '没': 9, '公': 10, '告': 11, '滾': 12, '欢': 13, '坚': 14, '退': 15, '醒': 16, '个': 17, '跳': 18, '老': 19, '关': 20, '干': 21, '随': 22, '放': 23, '病': 24, '再': 25, '下': 26, '是': 27, '了': 28, '系': 29, '善': 30, '它': 31, '杯': 32, '世': 33, '？': 34, '开': 35, '赢': 36, '吃': 37, '呆': 38, '忙': 39

### 3.2 转换输入数据格式
建立字典后，将文本数据映射为数字数据形式，并整理为矩阵格式。在生成之前需要考虑训练该模型所需的数据格式。


In [3]:
# number data

en_num_data = [[en2id[en] for en in line ] for line in en_data]
ch_num_data = [[ch2id[ch] for ch in line] for line in ch_data]
de_num_data = [[ch2id[ch] for ch in line][1:] for line in ch_data]

print(en_num_data[:5])
print(ch_num_data[:5])
print(de_num_data[:5])

[[18, 22, 11], [18, 22, 11], [34, 37, 24, 11], [36, 21, 22, 6, 25], [18, 43, 12, 12, 41, 25]]
[[44, 51, 62, 43], [44, 127, 46, 62, 43], [44, 127, 104, 133, 100, 62, 43], [44, 52, 52, 47, 43], [44, 127, 46, 62, 43]]
[[51, 62, 43], [127, 46, 62, 43], [127, 104, 133, 100, 62, 43], [52, 52, 47, 43], [127, 46, 62, 43]]


### 3.3 整理训练数据

In [4]:
import numpy as np

# max length
max_encoder_seq_length = max([len(txt) for txt in en_num_data])
max_decoder_seq_length = max([len(txt) for txt in ch_num_data])
print(max_encoder_seq_length)
print(max_decoder_seq_length)


encoder_input_data = [line + [0] * (max_encoder_seq_length-len(line)) for line in en_num_data]

# no padding, onehot
encoder_input_onehot = np.zeros((len(en_num_data), max_encoder_seq_length, len(en2id)), dtype='float32')
decoder_input_onehot = np.zeros((len(ch_num_data), max_decoder_seq_length, len(ch2id)), dtype='float32')
decoder_target_data = np.zeros((len(ch_num_data), max_decoder_seq_length, len(ch2id)), dtype='float32')

for i in range(len(ch_num_data)):
    for t, j in enumerate(en_num_data[i]):
        encoder_input_onehot[i, t, j] = 1.
    for t, j in enumerate(ch_num_data[i]):
        decoder_input_onehot[i, t, j] = 1.
    for t, j in enumerate(de_num_data[i]):
        decoder_target_data[i, t, j] = 1.

print(decoder_target_data.shape)

9
11
(100, 11, 149)


### 参数设置

In [5]:
# =======预定义模型参数========
EN_VOCAB_SIZE = len(en2id)
CH_VOCAB_SIZE = len(ch2id)
HIDDEN_SIZE = 256

## 4. 模型选择与建模
### 4.1 encoder建模

In [6]:
# ======================================keras model==================================
from keras.models import Model
from keras.layers import Input, LSTM, Dense, Dropout, Embedding, Masking
from keras import regularizers
from keras.optimizers import Adam
import numpy as np

# ==============encoder=============
encoder_inputs = Input(shape=(None,))
emb_inp = Embedding(output_dim=HIDDEN_SIZE, input_dim=EN_VOCAB_SIZE, embeddings_initializer='uniform', mask_zero=True)(encoder_inputs)
encoder_h1, encoder_state_h1, encoder_state_c1 = LSTM(HIDDEN_SIZE, return_state=True)(emb_inp)

Using TensorFlow backend.


### 4.2 decoder建模

In [7]:
# ==============decoder=============
decoder_inputs = Input(shape=(None, CH_VOCAB_SIZE))

#emb_target = Embedding(output_dim=HIDDEN_SIZE, input_dim=CH_VOCAB_SIZE, mask_zero=True)(decoder_inputs)
lstm1 = LSTM(HIDDEN_SIZE, return_sequences=True, return_state=True)
decoder_dense = Dense(CH_VOCAB_SIZE, activation='softmax')

decoder_h1, _, _ = lstm1(decoder_inputs, initial_state=[encoder_state_h1, encoder_state_c1])
decoder_outputs = decoder_dense(decoder_h1)

### 4.3 训练模型

In [None]:
batch_size = 100
epochs = 100

model = Model([encoder_inputs, decoder_inputs], decoder_outputs)
opt = Adam(lr=0.003, beta_1=0.9, beta_2=0.999, epsilon=1e-08)
model.compile(optimizer=opt, loss='categorical_crossentropy', metrics=['accuracy'])
model.summary()
model.fit([encoder_input_data, decoder_input_onehot], decoder_target_data,
          batch_size=batch_size,
          epochs=epochs,
          validation_split=0.1)

# Save model
model.save('s2s.h5')

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            (None, None)         0                                            
__________________________________________________________________________________________________
embedding_1 (Embedding)         (None, None, 256)    11776       input_1[0][0]                    
__________________________________________________________________________________________________
input_2 (InputLayer)            (None, None, 149)    0                                            
__________________________________________________________________________________________________
lstm_1 (LSTM)                   [(None, 256), (None, 525312      embedding_1[0][0]                
__________________________________________________________________________________________________
lstm_2 (LS

### 4.4 搭建预测模型

In [None]:
encoder_model = Model(encoder_inputs, [encoder_state_h1, encoder_state_c1])

decoder_state_input_h1 = Input(shape=(HIDDEN_SIZE,))
decoder_state_input_c1 = Input(shape=(HIDDEN_SIZE,))

decoder_h1, state_h1, state_c1 = lstm1(decoder_inputs, initial_state=[decoder_state_input_h1, decoder_state_input_c1])
decoder_outputs = decoder_dense(decoder_h1)

decoder_model = Model([decoder_inputs, decoder_state_input_h1, decoder_state_input_c1], 
                      [decoder_outputs, state_h1, state_c1])

### 4.5 利用预测模型进行翻译

In [None]:

for k in range(100):
    test_data = encoder_input_data[k:k+1]
    h1, c1 = encoder_model.predict(test_data)
    target_seq = np.zeros((1, 1, CH_VOCAB_SIZE))
    target_seq[0, 0, ch2id['\t']] = 1
    outputs = []
    while True:
        output_tokens, h1, c1 = decoder_model.predict([target_seq, h1, c1])
        sampled_token_index = np.argmax(output_tokens[0, -1, :])
        outputs.append(sampled_token_index)
        target_seq = np.zeros((1, 1, CH_VOCAB_SIZE))
        target_seq[0, 0, sampled_token_index] = 1
        if sampled_token_index == ch2id['\n'] or len(outputs) > 20: break
    
    print(en_data[k])
    print(''.join([id2ch[i] for i in outputs]))