# 基于seq2seq的中英文翻译系统
## 1. 项目背景
之前我们利用lstm进行建模，设计了一个自动生成莫言小说的模型，这次想要利用rnn的特点搭建一个中英文的翻译系统。传统的RNN输入和输出长度要一致，而seq2seq在RNN的基础上进行改进，实现了变长序列的输入和输出，广泛的应用在了机器翻译、对话系统、文本摘要等领域。 
- 代码参考：https://github.com/keras-team/keras/blob/master/examples/lstm_seq2seq.py

## 2. 项目数据
项目数据使用英法文翻译数据集，来实现字符级的seq2seq模型的训练。 
该文件来自于:http://www.manythings.org/anki/

内容如下：

In [5]:
# ========读取原始数据========
with open('fra.txt', 'r', encoding='utf-8') as f:
    data = f.read()
data = data.split('\n')
data = data[:400]
print(data[-1:])

['Try this.\tEssayez ceci !']


## 3. 数据处理
### 3.1 生成字典
我们需要将汉字和英文映射为能够输入到模型中的数字信息，就需要建立一个映射关系，需要生成汉字和数字互相映射的字典。
- 我们将英文按照每个字母对应一个id
- 我们将中文按照每一个汉字对应一个id
- **注意增加：**
    1. 未知符号：UNK
    2. 补齐符号：PAD
    3. 开始符号：GO
    4. 结束符号：EOS

In [8]:
# 分割英文数据和中文数据
en_data = [line.split('\t')[0] for line in data]
fra_data = [line.split('\t')[1] for line in data]
print('英文数据:', en_data[:10])
print('中文数据:', fra_data[:10])

# 分别生成中英文字典
en_vocab = set(''.join(en_data))
id2en = ['__PAD__', '__UNK__'] + list(en_vocab)
en2id = {c:i for i,c in enumerate(id2en)}

fra_vocab = set(''.join(fra_data))
id2fra = ['__PAD__', '__UNK__', '__GO__', '__EOS__'] + list(fra_vocab)
fra2id = {c:i for i,c in enumerate(id2fra)}

print('英文字典:\n', en2id)
print('法文字典\n:', fra2id)

英文数据: ['Go.', 'Run!', 'Run!', 'Fire!', 'Help!', 'Jump.', 'Stop!', 'Stop!', 'Stop!', 'Wait!']
中文数据: ['Va !', 'Cours\u202f!', 'Courez\u202f!', 'Au feu !', "À l'aide\u202f!", 'Saute.', 'Ça suffit\u202f!', 'Stop\u202f!', 'Arrête-toi !', 'Attends !']
英文字典:
 {'__PAD__': 0, '__UNK__': 1, ' ': 2, 'k': 3, 'M': 4, 'H': 5, 's': 6, 'w': 7, 'p': 8, 'c': 9, ',': 10, 'Y': 11, 't': 12, 'h': 13, 'v': 14, 'D': 15, 'G': 16, 'j': 17, 'l': 18, '.': 19, 'R': 20, "'": 21, 'o': 22, 'J': 23, 'P': 24, 'b': 25, 'T': 26, 'y': 27, 'K': 28, 'O': 29, 'W': 30, 'N': 31, 'L': 32, 'u': 33, 'e': 34, 'S': 35, 'g': 36, '?': 37, 'f': 38, 'a': 39, 'B': 40, 'C': 41, 'm': 42, 'A': 43, '9': 44, 'z': 45, 'q': 46, 'i': 47, 'n': 48, 'r': 49, '1': 50, 'd': 51, '!': 52, 'F': 53, 'I': 54}
法文字典
: {'__PAD__': 0, '__UNK__': 1, '__GO__': 2, '__EOS__': 3, ' ': 4, 'V': 5, 'à': 6, 'x': 7, 'E': 8, 'M': 9, 'H': 10, 's': 11, 'p': 12, 'c': 13, ',': 14, 't': 15, 'h': 16, 'Q': 17, 'v': 18, 'D': 19, 'G': 20, 'j': 21, 'î': 22, '\u2009': 23, 'ê': 24

### 3.2 转换输入数据格式
建立字典后，将文本数据映射为数字数据形式，并整理为矩阵格式。在生成之前需要考虑训练该模型所需的数据格式。

In [11]:
en_num_data = [[en2id[en] for en in line ] for line in en_data]
fra_num_data = [[fra2id['__GO__']] + [fra2id[ch] for ch in line] for line in fra_data]
de_num_data = [[fra2id[fra] for fra in line] + [fra2id['__EOS__']] for line in fra_data]
print(en_num_data[:5])
print(fra_num_data[:5])
print(de_num_data[:5])

[[16, 22, 19], [20, 33, 48, 52], [20, 33, 48, 52], [53, 47, 49, 34, 52], [5, 34, 18, 8, 52]]
[[2, 5, 52, 4, 68], [2, 54, 30, 44, 62, 11, 64, 68], [2, 54, 30, 44, 62, 45, 57, 64, 68], [2, 56, 44, 4, 48, 45, 44, 4, 68], [2, 38, 4, 25, 28, 52, 60, 67, 45, 64, 68]]
[[5, 52, 4, 68, 3], [54, 30, 44, 62, 11, 64, 68, 3], [54, 30, 44, 62, 45, 57, 64, 68, 3], [56, 44, 4, 48, 45, 44, 4, 68, 3], [38, 4, 25, 28, 52, 60, 67, 45, 64, 68, 3]]


### 3.3 整理训练数据

In [13]:
import numpy as np

max_encoder_seq_length = max([len(txt) for txt in en_num_data])
max_decoder_seq_length = max([len(txt) for txt in fra_num_data])
print(max_encoder_seq_length)
print(max_decoder_seq_length)


encoder_input_data = [line + [0] * (max_encoder_seq_length-len(line)) for line in en_num_data]
decoder_input_data = [line + [0] * (max_decoder_seq_length-len(line)) for line in fra_num_data]
decoder_output_data = [line + [0] * (max_decoder_seq_length-len(line)) for line in de_num_data]
decoder_target_data = np.zeros((len(fra_num_data), max_decoder_seq_length, len(fra2id)), dtype='float32')
for i in range(len(fra_num_data)):
    for j in range(max_decoder_seq_length):
        decoder_target_data[i,j,decoder_output_data[i][j]] = 1

print(decoder_target_data.shape)

9
31
(400, 31, 71)


In [14]:
# =======预定义模型参数========
EN_VOCAB_SIZE = len(en2id)
FRA_VOCAB_SIZE = len(fra2id)
HIDDEN_SIZE = 128

## 4. 模型选择与建模
### 4.1 encoder建模

In [15]:
# ======================================keras model==================================
from keras.models import Model
from keras.layers import Input, LSTM, Dense, Dropout, Embedding
from keras import regularizers
from keras.optimizers import Adam
import numpy as np

# ==============encoder=============
encoder_inputs = Input(shape=(None,))
emb_inp = Embedding(output_dim=HIDDEN_SIZE, input_dim=EN_VOCAB_SIZE, input_length=None, mask_zero=True)(encoder_inputs)
encoder_h1, encoder_state_h1, encoder_state_c1 = LSTM(HIDDEN_SIZE, activation='relu', return_sequences=True, return_state=True, dropout=0.2)(emb_inp)
encoder_h2, encoder_state_h2, encoder_state_c2 = LSTM(HIDDEN_SIZE, activation='relu', return_state=True, dropout=0.2)(encoder_h1)
encoder_state = [[encoder_state_h1, encoder_state_c1],[encoder_state_h2, encoder_state_c2]]

Using TensorFlow backend.


### 4.2 decoder建模

In [16]:
# ==============decoder=============
decoder_inputs = Input(shape=(None, ))

emb_target = Embedding(output_dim=HIDDEN_SIZE, input_dim=FRA_VOCAB_SIZE, input_length=None, mask_zero=True)(decoder_inputs)
lstm1 = LSTM(HIDDEN_SIZE, activation='relu', return_sequences=True, return_state=True, dropout=0.2)
lstm2 = LSTM(HIDDEN_SIZE, activation='relu', return_sequences=True, return_state=True, dropout=0.2)
decoder_dense = Dense(FRA_VOCAB_SIZE, activation='softmax')

decoder_h1, _, _ = lstm1(emb_target, initial_state=encoder_state[0])
decoder_h2, _, _ = lstm2(decoder_h1, initial_state=encoder_state[1])
decoder_outputs = decoder_dense(decoder_h2)


### 4.3 训练模型

In [18]:
batch_size = 512
epochs = 100
model = Model([encoder_inputs, decoder_inputs], decoder_outputs)
# Run training
opt = Adam(lr=0.001, beta_1=0.9, beta_2=0.999, epsilon=1e-08)
model.compile(optimizer=opt, loss='categorical_crossentropy', metrics=['accuracy'])
model.summary()
model.fit([encoder_input_data, decoder_input_data], decoder_target_data,
          batch_size=batch_size,
          epochs=epochs,
          validation_split=0)

# Save model
model.save('s2s.h5')

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            (None, None)         0                                            
__________________________________________________________________________________________________
input_2 (InputLayer)            (None, None)         0                                            
__________________________________________________________________________________________________
embedding_1 (Embedding)         (None, None, 128)    7040        input_1[0][0]                    
__________________________________________________________________________________________________
embedding_2 (Embedding)         (None, None, 128)    9088        input_2[0][0]                    
__________________________________________________________________________________________________
lstm_1 (LS

Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78/100
Epoch 79/100
Epoch 80/100
Epoch 81/100
Epoch 82/100
Epoch 83/100
Epoch 84/100
Epoch 85/100
Epoch 86/100
Epoch 87/100
Epoch 88/100
Epoch 89/100
Epoch 90/100
Epoch 91/100
Epoch 92/100
Epoch 93/100
Epoch 94/100
Epoch 95/100
Epoch 96/100
Epoch 97/100
Epoch 98/100
Epoch 99/100
Epoch 100/100


  '. They will not be included '
  '. They will not be included '


### 4.4 搭建预测模型

In [None]:
encoder_model = Model(encoder_inputs, [encoder_state_h1, encoder_state_c1,encoder_state_h2, encoder_state_c2])

decoder_state_input_h1 = Input(shape=(HIDDEN_SIZE,))
decoder_state_input_c1 = Input(shape=(HIDDEN_SIZE,))
decoder_state_input_h2 = Input(shape=(HIDDEN_SIZE,))
decoder_state_input_c2 = Input(shape=(HIDDEN_SIZE,))

decoder_h1, state_h1, state_c1 = lstm1(emb_target, initial_state=[decoder_state_input_h1, decoder_state_input_c1])
decoder_h2, state_h2, state_c2 = lstm2(decoder_h1, initial_state=[decoder_state_input_h2, decoder_state_input_c2])
decoder_outputs = decoder_dense(decoder_h2)

decoder_model = Model([decoder_inputs, decoder_state_input_h1, decoder_state_input_c1, decoder_state_input_h2, decoder_state_input_c2], 
                      [decoder_outputs, state_h1, state_c1, state_h2, state_c2])

In [None]:
print(encoder_input_data[1])
print(decoder_input_data[1])
print(decoder_output_data[1])
print(''.join([id2en[i] for i in encoder_input_data[1]]))
print(''.join([id2ch[i] for i in decoder_input_data[1]]))
print(''.join([id2ch[i] for i in decoder_output_data[1]]))

for k in range(50):
    test_data = encoder_input_data[k]
    h1, c1,h2, c2 = encoder_model.predict(test_data)
    condition = True
    outputs = []
    decoder_input = [2]
    while condition:
        output, h1, c1, h2, c2 = decoder_model.predict([decoder_input, h1, c1, h2, c2])
        outputs.append(np.argmax(output))
        decoder_input = [np.argmax(output)]
        if (np.argmax(output)) == 3 or len(outputs) > 20: condition = False
    print(''.join([id2en[i] for i in test_data]))
    print(''.join([id2ch[i] for i in outputs]))