# 1. 构建语料库

In [1]:
sentences = [
    ["咖哥 喜欢 小冰", "<sos> KaGe likes XiaoBing", "KaGe likes XiaoBing <eos>"],
    ["我 爱 学习 人工智能", "<sos> I love studying AI", "I love studying AI <eos>"],
    ["深度学习 改变 世界", "<sos> DL changed the world", "DL changed the world <eos>"],
    ["自然 语言 处理 很 强大", "<sos> NLP is so powerful", "NLP is so powerful <eos>"],
    ["神经网络 非常 复杂", "<sos> Neural-Nets are complex", "Neural-Nets are complex <eos>"]
]
word_list_cn, word_list_en = [], []
for s in sentences:
    word_list_cn.extend(s[0].split())
    word_list_en.extend(s[1].split())
    word_list_en.extend(s[2].split())
word_list_cn = list(set(word_list_cn))
word_list_en = list(set(word_list_en))

word_2_idx_cn = {w: i for i, w in enumerate(word_list_cn)}
word_2_idx_en = {w: i for i, w in enumerate(word_list_en)}

idx_2_word_cn = {i: w for i, w in enumerate(word_list_cn)}
idx_2_word_en = {i: w for i, w in enumerate(word_list_en)}

voc_size_cn = len(word_list_cn)
voc_size_en = len(word_list_en)

print(f"句子数量: {len(sentences)}")
print(f"中文词汇表大小: {voc_size_cn}")
print(f"英文词汇表大小: {voc_size_en}")
print(f"中文词汇到索引: {word_2_idx_cn}")
print(f"英文词汇到索引: {word_2_idx_en}")

句子数量: 5
中文词汇表大小: 18
英文词汇表大小: 20
中文词汇到索引: {'爱': 0, '语言': 1, '学习': 2, '世界': 3, '人工智能': 4, '改变': 5, '强大': 6, '复杂': 7, '咖哥': 8, '自然': 9, '非常': 10, '小冰': 11, '喜欢': 12, '深度学习': 13, '很': 14, '神经网络': 15, '处理': 16, '我': 17}
英文词汇到索引: {'world': 0, 'likes': 1, 'XiaoBing': 2, 'KaGe': 3, 'I': 4, 'changed': 5, '<eos>': 6, 'is': 7, '<sos>': 8, 'so': 9, 'the': 10, 'complex': 11, 'are': 12, 'DL': 13, 'AI': 14, 'love': 15, 'powerful': 16, 'Neural-Nets': 17, 'studying': 18, 'NLP': 19}


# 2. 生成训练数据

In [2]:
import numpy as np
import torch
import random

def make_data(sentences):
    random_sentence = random.choice(sentences)
    encoder_input = np.array([[word_2_idx_cn[w] for w in random_sentence[0].split()]])
    decoder_input = np.array([[word_2_idx_en[w] for w in random_sentence[1].split()]])
    target = np.array([[word_2_idx_en[w] for w in random_sentence[2].split()]])
    encoder_input = torch.LongTensor(encoder_input)
    decoder_input = torch.LongTensor(decoder_input)
    target = torch.LongTensor(target)
    return encoder_input, decoder_input, target

encoder_input, decoder_input, target = make_data(sentences)

# 3. 定义编码器和解码器类

In [3]:
import torch.nn as nn

class Encoder(nn.Module):
    def __init__(self, input_size, hidden_size):
        super().__init__() 
        self.hidden_size = hidden_size
        self.embedding  = nn.Embedding(input_size, hidden_size)
        self.rnn = nn.RNN(hidden_size, hidden_size, batch_first=True)
        
    def forward(self, inputs, hidden):
        embedded = self.embedding(inputs)
        output, hidden = self.rnn(embedded, hidden)
        return output, hidden
    
class Decoder(nn.Module):
    
    def __init__(self, hidden_size, output_size):
        super().__init__()
        self.hidden_size = hidden_size
        self.embedding = nn.Embedding(output_size, hidden_size)
        self.rnn = nn.RNN(hidden_size, hidden_size, batch_first=True)
        self.out = nn.Linear(hidden_size, output_size)
    
    def forward(self, inputs, hidden):
        embedded = self.embedding(inputs)
        output, hidden = self.rnn(embedded, hidden)
        output = self.out(output)
        return output, hidden
 
n_hidden = 128   
encoder = Encoder(voc_size_cn, n_hidden)
decoder = Decoder(n_hidden, voc_size_en)
print(f"编码器: {encoder}")
print(f"解码器: {decoder}")

编码器: Encoder(
  (embedding): Embedding(18, 128)
  (rnn): RNN(128, 128, batch_first=True)
)
解码器: Decoder(
  (embedding): Embedding(20, 128)
  (rnn): RNN(128, 128, batch_first=True)
  (out): Linear(in_features=128, out_features=20, bias=True)
)


# 4. 定义 Seq2Seq 结构

In [4]:
class Seq2Seq(nn.Module):
    
    def __init__(self, input_size, hidden_size, ouput_size):
        super().__init__()
        self.encoder = Encoder(input_size, hidden_size)
        self.decoder = Decoder(hidden_size, ouput_size)
    
    def forward(self, enc_input, hidden, dec_input):
        encoder_output, encoder_hidden = self.encoder(enc_input, hidden)
        decoder_hidden = encoder_hidden
        decoder_ouput, _ = self.decoder(dec_input, decoder_hidden)
        return decoder_ouput
    
model = Seq2Seq(voc_size_cn, n_hidden, voc_size_en)
print(f"Seq2Seq 模型: {model}")
        

Seq2Seq 模型: Seq2Seq(
  (encoder): Encoder(
    (embedding): Embedding(18, 128)
    (rnn): RNN(128, 128, batch_first=True)
  )
  (decoder): Decoder(
    (embedding): Embedding(20, 128)
    (rnn): RNN(128, 128, batch_first=True)
    (out): Linear(in_features=128, out_features=20, bias=True)
  )
)


# 5. 训练 Seq2Seq

In [5]:
def train_seq2seq(model, creterion, optimizer, epochs):
    for epoch in range(epochs):
        encoder_input, decoder_input, target = make_data(sentences)
        hidden = torch.zeros(1, encoder_input.size(0), n_hidden)
        optimizer.zero_grad()
        output = model(encoder_input, hidden, decoder_input)
        loss = creterion(output.view(-1, voc_size_en), target.view(-1))
        if (epoch + 1) % 100 == 0:
            print(f"Epoch: {epoch+1}, Loss: {loss:.6f}")
        loss.backward()
        optimizer.step()

epochs = 1000
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
train_seq2seq(model, criterion, optimizer, epochs)

Epoch: 100, Loss: 0.065651
Epoch: 200, Loss: 0.019598
Epoch: 300, Loss: 0.012312
Epoch: 400, Loss: 0.006995
Epoch: 500, Loss: 0.003901
Epoch: 600, Loss: 0.003139
Epoch: 700, Loss: 0.002583
Epoch: 800, Loss: 0.001938
Epoch: 900, Loss: 0.002187
Epoch: 1000, Loss: 0.001389


  # 6. 测试 Seq2Seq 架构
 

In [6]:
def test_seq2seq(model, source_sentence):
    encoder_input = np.array([[word_2_idx_cn[w] for w in source_sentence.split()]])
    decoder_input = np.array([word_2_idx_en["<sos>"]] + [word_2_idx_en["<eos>"]]*(len(encoder_input[0])-1))
    encoder_input = torch.LongTensor(encoder_input)
    decoder_input = torch.LongTensor(decoder_input).unsqueeze(0)
    hidden = torch.zeros(1, encoder_input.size(0), n_hidden)
    predict = model(encoder_input, hidden, decoder_input)
    predict = predict.data.max(2, keepdim=True)[1]
    print(f"{source_sentence} -> {[idx_2_word_en[n.item()] for n in predict.squeeze()]}")

test_seq2seq(model, "咖哥 喜欢 小冰")
test_seq2seq(model, "自然 语言 处理 很 强大")

咖哥 喜欢 小冰 -> ['KaGe', 'likes', 'XiaoBing']
自然 语言 处理 很 强大 -> ['NLP', 'is', 'so', '<eos>', '<eos>']
