<a href="https://colab.research.google.com/github/stfromnjust/nlp-task/blob/main/Seq2SeqForTranslation_useSpaCyJieba_20210602eve.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# 下载数据 必要配置

In [1]:
!wget http://www.manythings.org/anki/cmn-eng.zip
!unzip -d ./cmn-eng cmn-eng.zip

--2021-06-02 11:44:13--  http://www.manythings.org/anki/cmn-eng.zip
Resolving www.manythings.org (www.manythings.org)... 104.21.55.222, 172.67.173.198, 2606:4700:3036::ac43:adc6, ...
Connecting to www.manythings.org (www.manythings.org)|104.21.55.222|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1062383 (1.0M) [application/zip]
Saving to: ‘cmn-eng.zip’


2021-06-02 11:44:14 (1.55 MB/s) - ‘cmn-eng.zip’ saved [1062383/1062383]

Archive:  cmn-eng.zip
  inflating: ./cmn-eng/cmn.txt       
  inflating: ./cmn-eng/_about.txt    


In [2]:
seed = 2021

In [3]:
!pip install opencc

Collecting opencc
[?25l  Downloading https://files.pythonhosted.org/packages/25/a1/83402033399c7bc61482f45e186156f6c51c8ca5cf4a66f22039a586a520/OpenCC-1.1.2-cp37-cp37m-manylinux1_x86_64.whl (765kB)
[K     |████████████████████████████████| 768kB 2.8MB/s 
[?25hInstalling collected packages: opencc
Successfully installed opencc-1.1.2


In [4]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from torchtext.legacy.data import *


import time
import math
import random
import jieba
import spacy
import opencc

In [5]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device

device(type='cuda')

# 读取数据

In [6]:
# 每一行数据如下
# 'Hi.\t嗨。\tCC-BY 2.0 (France) Attribution: tatoeba.org #538123 (CM) & #891077 (Martha)'
with open('./cmn-eng/cmn.txt', 'r', encoding='utf-8') as f:
    data = f.read()
data = data.strip()
data = data.split('\n')
print('样本数:\n', len(data))
print('\n样本示例:')
data[0]

样本数:
 24360

样本示例:


'Hi.\t嗨。\tCC-BY 2.0 (France) Attribution: tatoeba.org #538123 (CM) & #891077 (Martha)'

In [7]:
# 分割英文数据和中文数据

# 繁体中文转简体中文
cc = opencc.OpenCC('t2s')

en_data = [line.split('\t')[0] for line in data]
ch_data = [cc.convert(line.split('\t')[1]) for line in data]
print('英文数据:\n', en_data[:10])
print('\n中文数据:\n', ch_data[:10])

英文数据:
 ['Hi.', 'Hi.', 'Run.', 'Wait!', 'Wait!', 'Begin.', 'Hello!', 'I won!', 'Oh no!', 'Cheers!']

中文数据:
 ['嗨。', '你好。', '你用跑的。', '等等！', '等一下！', '开始！', '你好。', '我赢了。', '不会吧。', '干杯!']


# 分词

In [8]:
# 中文使用jieba分词
ch_token_list = []
for line in ch_data:
  ch_token_list.append(list(jieba.cut(line)) + ['<eos>'])

# 测试
print('\n中文数据:\n', ch_token_list[:3])

# 英文使用spacy分词
nlp = spacy.load('en')
en_token_list = []
for line in en_data:
  doc = nlp(line)
  en_new = []
  for token in doc:
    en_new.append(token.text)
  en_token_list.append(en_new + ['<eos>'])

# 测试
print('英文数据:\n', en_token_list[:3])

Building prefix dict from the default dictionary ...
Dumping model to file cache /tmp/jieba.cache
Loading model cost 0.852 seconds.
Prefix dict has been built successfully.



中文数据:
 [['嗨', '。', '<eos>'], ['你好', '。', '<eos>'], ['你', '用', '跑', '的', '。', '<eos>']]
英文数据:
 [['Hi', '.', '<eos>'], ['Hi', '.', '<eos>'], ['Run', '.', '<eos>']]


# 建立词典

In [9]:
ch_TEXT = Field()
ch_TEXT.build_vocab(ch_token_list, specials=['<pad>', '<unk>', '<bos>', '<eos>'])
en_TEXT = Field()
en_TEXT.build_vocab(en_token_list, specials=['<pad>', '<unk>', '<bos>', '<eos>'])

print('中文词典: ')
print(len(ch_TEXT.vocab))
print(ch_TEXT)
print(ch_TEXT.vocab.itos[:20])
print(ch_TEXT.vocab.stoi['<unk>'])
print('\n英文词典: ')
print(len(en_TEXT.vocab))
print(en_TEXT)
print(en_TEXT.vocab.itos[:20])
print(en_TEXT.vocab.stoi['<unk>'])

中文词典: 
11392
<torchtext.legacy.data.field.Field object at 0x7effd07bead0>
['<unk>', '<pad>', '<bos>', '<eos>', '。', '我', '的', '了', '你', '他', '？', '汤姆', '在', '是', '她', '吗', '我们', '，', '不', '很']
0

英文词典: 
7346
<torchtext.legacy.data.field.Field object at 0x7f00885cefd0>
['<unk>', '<pad>', '<bos>', '<eos>', '.', 'I', 'to', 'the', 'you', 'a', '?', 'is', "n't", 'Tom', 'He', 'in', "'s", 'of', 'do', 'me']
0


In [10]:
# 利用字典，映射数据 
en_num_data = [[en_TEXT.vocab.stoi[token] for token in line ] for line in en_token_list]
ch_num_data = [[ch_TEXT.vocab.stoi[token] for token in line] for line in ch_token_list]

print('char:', en_data[1])
print('index:', en_num_data[1])

char: Hi.
index: [2310, 4, 3]


# 表示为Dataset

In [11]:
class TranslationDataset(Dataset):
    def __init__(self, src_data, trg_data):
        self.src_data = src_data
        self.trg_data = trg_data

        assert len(src_data) == len(trg_data), \
            "numbers of src_data  and trg_data must be equal!"

    def __len__(self):
        return len(self.src_data)

    def __getitem__(self, idx):
        src_sample =self.src_data[idx]
        src_len = len(self.src_data[idx])
        trg_sample = self.trg_data[idx]
        trg_len = len(self.trg_data[idx])
        return {"src": src_sample, "src_len": src_len, "trg": trg_sample, "trg_len": trg_len}

In [12]:
def padding_batch(batch):
    """
    input: -> list of dict
        [{'src': [1, 2, 3], 'trg': [1, 2, 3]}, {'src': [1, 2, 2, 3], 'trg': [1, 2, 2, 3]}]
    output: -> dict of tensor 
        {
            "src": [[1, 2, 3, 0], [1, 2, 2, 3]].T
            "trg": [[1, 2, 3, 0], [1, 2, 2, 3]].T
        }
    """
    src_lens = [d["src_len"] for d in batch]
    trg_lens = [d["trg_len"] for d in batch]
    
    src_max = max([d["src_len"] for d in batch])
    trg_max = max([d["trg_len"] for d in batch])
    for d in batch:
        d["src"].extend([en_TEXT.vocab.stoi["<pad>"]] * (src_max-d["src_len"]))
        d["trg"].extend([ch_TEXT.vocab.stoi["<pad>"]] * (trg_max-d["trg_len"]))
    srcs = torch.tensor([pair["src"] for pair in batch], dtype=torch.long, device=device)
    trgs = torch.tensor([pair["trg"] for pair in batch], dtype=torch.long, device=device)
    
    batch = {"src":srcs.T, "src_len":src_lens, "trg":trgs.T, "trg_len":trg_lens}
    return batch

# Attention机制

In [13]:
class Encoder(nn.Module):
    def __init__(self, input_dim, emb_dim, hid_dim, n_layers, dropout=0.5, bidirectional=True):
        super(Encoder, self).__init__()
        
        self.hid_dim = hid_dim
        self.n_layers = n_layers
        
        self.embedding = nn.Embedding(input_dim, emb_dim)
        self.gru = nn.GRU(emb_dim, hid_dim, n_layers, dropout=dropout, bidirectional=bidirectional)
        
    def forward(self, input_seqs, input_lengths, hidden):
        # input_seqs = [seq_len, batch]
        embedded = self.embedding(input_seqs)
        # embedded = [seq_len, batch, embed_dim]
        packed = torch.nn.utils.rnn.pack_padded_sequence(embedded, input_lengths, enforce_sorted=False)
        
        outputs, hidden = self.gru(packed, hidden)        
        outputs, output_lengths = torch.nn.utils.rnn.pad_packed_sequence(outputs)
        # outputs = [seq_len, batch, hid_dim * n directions]
        # output_lengths = [batch]
        return outputs, hidden

In [14]:
class Attn(nn.Module):
    def __init__(self, method, hidden_size):
        super(Attn, self).__init__()
        self.method = method
        if self.method not in ['dot', 'general', 'concat']:
            raise ValueError(self.method, "is not an appropriate attention method.")
        self.hidden_size = hidden_size
        if self.method == 'general':
            self.attn = nn.Linear(self.hidden_size, hidden_size)
        elif self.method == 'concat':
            self.attn = nn.Linear(self.hidden_size * 2, hidden_size)
            self.v = nn.Parameter(torch.FloatTensor(hidden_size))

    def dot_score(self, hidden, encoder_output):
        return torch.sum(hidden * encoder_output, dim=2)  # [seq_len, batch]

    def general_score(self, hidden, encoder_output):
        energy = self.attn(encoder_output)  # [seq_len, batch, hid_dim]
        return torch.sum(hidden * energy, dim=2)  # [seq_len, batch]

    def concat_score(self, hidden, encoder_output):
        # hidden.expand(encoder_output.size(0), -1, -1) -> [seq_len, batch, N]
        energy = self.attn(torch.cat((hidden.expand(encoder_output.size(0), -1, -1), encoder_output), 2)).tanh()
        # energy = [sql_len, batch, hidden_size]
        return torch.sum(self.v * energy, dim=2)  # [seq_len, batch]

    def forward(self, hidden, encoder_outputs):
        # hidden = [1, batch,  n_directions * hid_dim]
        # encoder_outputs = [seq_len, batch, hid dim * n directions]
        if self.method == 'general':
            attn_energies = self.general_score(hidden, encoder_outputs)
        elif self.method == 'concat':
            attn_energies = self.concat_score(hidden, encoder_outputs)
        elif self.method == 'dot':
            attn_energies = self.dot_score(hidden, encoder_outputs)

        attn_energies = attn_energies.t()  # [batch, seq_len]
 
        return F.softmax(attn_energies, dim=1).unsqueeze(1)  # softmax归一化# [batch, 1, seq_len]

In [15]:
class AttnDecoder(nn.Module):
    def __init__(self, output_dim, emb_dim, hid_dim, n_layers=1, dropout=0.5, bidirectional=True, attn_method="general"):
        super(AttnDecoder, self).__init__()

        self.output_dim = output_dim
        self.emb_dim = emb_dim
        self.hid_dim = hid_dim
        self.n_layers = n_layers
        self.dropout = dropout

        self.embedding = nn.Embedding(output_dim, emb_dim)
        self.embedding_dropout = nn.Dropout(dropout)
        self.gru = nn.GRU(emb_dim, hid_dim, n_layers, dropout=dropout, bidirectional=bidirectional)
        
        if bidirectional:
            self.concat = nn.Linear(hid_dim * 2 * 2, hid_dim*2)
            self.out = nn.Linear(hid_dim*2, output_dim)
            self.attn = Attn(attn_method, hid_dim*2)
        else:
            self.concat = nn.Linear(hid_dim * 2, hid_dim)
            self.out = nn.Linear(hid_dim, output_dim)
            self.attn = Attn(attn_method, hid_dim)
        self.softmax = nn.LogSoftmax(dim=1)

    def forward(self, token_inputs, last_hidden, encoder_outputs):
        batch_size = token_inputs.size(0)
        embedded = self.embedding(token_inputs)
        embedded = self.embedding_dropout(embedded)
        embedded = embedded.view(1, batch_size, -1) # [1, B, hid_dim]

        gru_output, hidden = self.gru(embedded, last_hidden)
        # gru_output = [1, batch,  n_directions * hid_dim]
        # hidden = [n_layers * n_directions, batch, hid_dim]

        # encoder_outputs = [sql_len, batch, hid dim * n directions]
        attn_weights = self.attn(gru_output, encoder_outputs)
        # attn_weights = [batch, 1, sql_len]
        context = attn_weights.bmm(encoder_outputs.transpose(0, 1))
        # [batch, 1, hid_dim * n directions]

        # LuongAttention
        gru_output = gru_output.squeeze(0) # [batch, n_directions * hid_dim]
        context = context.squeeze(1)       # [batch, n_directions * hid_dim]
        concat_input = torch.cat((gru_output, context), 1)  # [batch, n_directions * hid_dim * 2]
        concat_output = torch.tanh(self.concat(concat_input))  # [batch, n_directions*hid_dim]

        output = self.out(concat_output)  # [batch, output_dim]
        output = self.softmax(output)

        return output, hidden, attn_weights

In [16]:
class Seq2Seq(nn.Module):
    def __init__(self, 
                 encoder, 
                 decoder, 
                 device, 
                 predict=False, 
                 basic_dict=None,
                 max_len=100
                 ):
        super(Seq2Seq, self).__init__()
        
        self.device = device

        self.encoder = encoder
        self.decoder = decoder

        self.predict = predict  # 训练阶段还是预测阶段
        self.basic_dict = basic_dict  # decoder的字典，存放特殊token对应的id
        self.max_len = max_len  # 翻译时最大输出长度

        assert encoder.hid_dim == decoder.hid_dim, \
            "Hidden dimensions of encoder and decoder must be equal!"
        assert encoder.n_layers == decoder.n_layers, \
            "Encoder and decoder must have equal number of layers!"
        assert encoder.gru.bidirectional == decoder.gru.bidirectional, \
            "Decoder and encoder must had same value of bidirectional attribute!"
        
    def forward(self, input_batches, input_lengths, target_batches=None, target_lengths=None, teacher_forcing_ratio=0.5):
        # input_batches = [seq_len, batch]
        # target_batches = [seq_len, batch]
        batch_size = input_batches.size(1)
        
        BOS_token = self.basic_dict["<bos>"]
        EOS_token = self.basic_dict["<eos>"]
        PAD_token = self.basic_dict["<pad>"]

        # 初始化
        enc_n_layers = self.encoder.gru.num_layers
        enc_n_directions = 2 if self.encoder.gru.bidirectional else 1
        encoder_hidden = torch.zeros(enc_n_layers*enc_n_directions, batch_size, self.encoder.hid_dim, device=self.device)
        
        # encoder_outputs = [input_lengths, batch, hid_dim * n directions]
        # encoder_hidden = [n_layers*n_directions, batch, hid_dim]
        encoder_outputs, encoder_hidden = self.encoder(
            input_batches, input_lengths, encoder_hidden)

        # 初始化
        decoder_input = torch.tensor([BOS_token] * batch_size, dtype=torch.long, device=self.device)
        decoder_hidden = encoder_hidden

        if self.predict:
            # 一次只输入一句话
            assert batch_size == 1, "batch_size of predict phase must be 1!"
            output_tokens = []

            while True:
                decoder_output, decoder_hidden, decoder_attn = self.decoder(
                    decoder_input, decoder_hidden, encoder_outputs
                )
                # [1, 1]
                topv, topi = decoder_output.topk(1)
                decoder_input = topi.squeeze(1).detach()
                output_token = topi.squeeze().detach().item()
                if output_token == EOS_token or len(output_tokens) == self.max_len:
                    break
                output_tokens.append(output_token)
            return output_tokens

        else:
            max_target_length = max(target_lengths)
            all_decoder_outputs = torch.zeros((max_target_length, batch_size, self.decoder.output_dim), device=self.device)

            for t in range(max_target_length):
                use_teacher_forcing = True if random.random() < teacher_forcing_ratio else False
                if use_teacher_forcing:
                    # decoder_output = [batch, output_dim]
                    # decoder_hidden = [n_layers*n_directions, batch, hid_dim]
                    decoder_output, decoder_hidden, decoder_attn = self.decoder(
                        decoder_input, decoder_hidden, encoder_outputs
                    )
                    all_decoder_outputs[t] = decoder_output
                    decoder_input = target_batches[t]  # 下一个输入来自训练数据
                else:
                    decoder_output, decoder_hidden, decoder_attn = self.decoder(
                        decoder_input, decoder_hidden, encoder_outputs
                    )
                    # [batch, 1]
                    topv, topi = decoder_output.topk(1)
                    all_decoder_outputs[t] = decoder_output
                    decoder_input = topi.squeeze(1).detach()  # 下一个输入来自模型预测
            
            loss_fn = nn.NLLLoss(ignore_index=PAD_token)
            loss = loss_fn(
                all_decoder_outputs.reshape(-1, self.decoder.output_dim),  # [batch*seq_len, output_dim]
                target_batches.reshape(-1)               # [batch*seq_len]
            )
            return loss

# 训练和预测代码

In [17]:
def epoch_time(start_time, end_time):
    elapsed_time = end_time - start_time
    elapsed_mins = int(elapsed_time / 60)
    elapsed_secs = int(elapsed_time - (elapsed_mins * 60))
    return elapsed_mins, elapsed_secs

In [18]:
def train(
    model,
    data_loader, 
    optimizer, 
    clip=1, 
    teacher_forcing_ratio=0.5, 
    print_every=None  # None不打印
    ):
    model.predict = False
    model.train()

    if print_every == 0:
        print_every = 1

    print_loss_total = 0  # 每次打印都重置
    start = time.time()
    epoch_loss = 0
    for i, batch in enumerate(data_loader):

        # shape = [seq_len, batch]
        input_batchs = batch["src"]
        target_batchs = batch["trg"]
        # list
        input_lens = batch["src_len"]
        target_lens = batch["trg_len"]
        
        optimizer.zero_grad()
        
        loss = model(input_batchs, input_lens, target_batchs, target_lens, teacher_forcing_ratio)
        print_loss_total += loss.item()
        epoch_loss += loss.item()
        loss.backward()

        # 梯度裁剪
        torch.nn.utils.clip_grad_norm_(model.parameters(), clip)

        optimizer.step()

        if print_every and (i+1) % print_every == 0:
            print_loss_avg = print_loss_total / print_every
            print_loss_total = 0
            print('\tCurrent Loss: %.4f' % print_loss_avg)

    return epoch_loss / len(data_loader)

In [19]:
def evaluate(
    model,
    data_loader, 
    print_every=None
    ):
    model.predict = False
    model.eval()
    if print_every == 0:
        print_every = 1

    print_loss_total = 0  # 每次打印都重置
    start = time.time()
    epoch_loss = 0
    with torch.no_grad():
        for i, batch in enumerate(data_loader):

            # shape = [seq_len, batch]
            input_batchs = batch["src"]
            target_batchs = batch["trg"]
            # list
            input_lens = batch["src_len"]
            target_lens = batch["trg_len"]

            loss = model(input_batchs, input_lens, target_batchs, target_lens, teacher_forcing_ratio=0)
            print_loss_total += loss.item()
            epoch_loss += loss.item()

            if print_every and (i+1) % print_every == 0:
                print_loss_avg = print_loss_total / print_every
                print_loss_total = 0
                print('\tCurrent Loss: %.4f' % print_loss_avg)

    return epoch_loss / len(data_loader)

In [77]:
def translate(
    model,
    sample, 
    idx2token=None
    ):
    model.predict = True
    model.eval()

    # shape = [seq_len, 1]
    input_batch = sample["src"]
    # list
    input_len = sample["src_len"]

    output_tokens = model(input_batch, input_len)
    output_tokens = [idx2token[t] for t in output_tokens]

    return output_tokens

In [140]:
from nltk.translate.bleu_score import corpus_bleu

def cal_bleu(
    model,
    data_loader,
):
  model.predict = False
  model.eval()
  candidate_corpus = []
  references_corpus = []
  with torch.no_grad():
    for i, batch in enumerate(data_loader):
      # shape = [seq_len, batch]
      input_batches = batch["src"]
      target_batches = batch["trg"]
      # list
      input_lens = batch["src_len"]
      target_lens = batch["trg_len"]
      
      input_batches = input_batches.t()
      for j in range(input_batches.shape[0]):
        sample = {}
        en_tokens = [t for t in input_batches[j] if t != 1] 
        sample["src"] = torch.tensor(en_tokens, dtype=torch.long, device=device).reshape(-1, 1)
        sample["src_len"] = [len(en_tokens)]
        output_tokens = translate(model, sample, ch_TEXT.vocab.itos)
        if len(output_tokens) == 0:
          print("yes")
        candidate_corpus.append(output_tokens)

      # 转置
      target_batches = target_batches.t()
      for j in range(target_batches.shape[0]):
        ch_src = [ch_TEXT.vocab.itos[t] for t in target_batches[j] if t !=1 and t !=3]
        if len(ch_src) == 0:
          print("yes")
        references_corpus.append([ch_src])
  print(candidate_corpus)
  print(references_corpus)
  print(len(candidate_corpus))
  print(len(references_corpus))
  print(corpus_bleu(references_corpus, candidate_corpus))
  return
      

In [142]:
# cal_bleu(model, train_loader)

[['他', '住', '在', '我们', '街', '对面', '。'], ['你', '能', '跳', '多', '高', '？'], ['8', '月份', '没课', '。'], ['如果', '你', '抓紧', '，', '你', '还', '能', '赶上', '火车', '。'], ['我们', '用', '耳朵', '听', '。'], ['听', '起来', '或许', '有点', '怪', '，', '但', '她', '说', '的', '是', '真的', '。'], ['美丽', '的', '夕阳', '，', '不是', '吗', '？'], ['它', '发生', '在', '十一点', '一刻', '。'], ['我们', '在', '探险', '的', '时候', '遇到', '了', '很多', '机会', '。'], ['当', '她', '看到', '妈妈', '没在生', '她', '的', '气', '，', '她', '的', '双眼', '因为', '幸福', '了', '而', '闪烁', '。'], ['什么', '是', '格式', '塔', '疗法', '？'], ['别忘了', '在', '你', '的', '信上', '贴', '张', '邮票', '。'], ['那个', '男孩子', '在', '跑步', '。'], ['我', '想要', '酱料', '放在', '旁边', '。'], ['笔在', '桌上', '。'], ['你', '能', '猜', '到', '我', '的', '年龄', '吗', '？'], ['这是', '开往', '东京', '正确', '的', '火车', '吗', '？'], ['我家', '宝宝', '还', '不会', '说话', '。'], ['汤姆', '向门', '走', '。'], ['那位', '绅士', '经常', '戴著', '一顶', '帽子', '。'], ['那', '是', '个', '有趣', '的', '主意', '。'], ['你', '的', '狗', '在', '哪儿', '？'], ['你', '的', '嘴唇', '很漂亮', '。'], ['我', '收到', '汤姆', '的', '一条', '紧急', '。'], [

In [141]:
# cal_bleu(model, test_loader)

[['我们', '想', '知道', '。'], ['她', '被', '警察', '盘问', '。'], ['每个', '人', '都', '知道', '他', '的', '妻子', '卖', '了', '这个', '。'], ['昨天', '很', '远', '。'], ['我', '听到', '他们', '讲', '了', '我', '的', '故事', '。'], ['你', '有', '看见', '过', '汤姆生', '气', '的', '时候', '吗', '？'], ['我', '的', '女朋友', '是', '好', '好', '。'], ['我', '对', '你', '来说', '来说', '很', '容易', '。'], ['是', '时候', '了', '。'], ['这', '有些', '许多', '有', '很多', '外国', '的', '机会', '。'], ['他', '设法', '地', '依靠', '了', '。'], ['你', '几岁', '过', '吗', '？'], ['她', '给', '我', '我', '的', '兄弟', '。'], ['我', '承认', '这', '可能', '不是', '最好', '的', '方法', '。'], ['她', '知道', '我们', '是', '委员会', '新', '的', '人', '。'], ['今晚', '晚饭', '什么', '时候', '？'], ['今天', '今天', '今天', '的', '狗', '，', '我们', '在', '我们', '上', '其他', '的', '时候', '才', '了', '。'], ['这', '让', '我', '生气', '。'], ['她', '给', '了', '我们', '一个', '漂亮', '的', '摩托车', '。'], ['他', '老', '那', '只', '狗', '。'], ['我会', '你', '饮料', '。'], ['我', '不', '喜欢', '寿司', '。'], ['我', '是', '最后', '的', '房子', '我', '的', '网站', '。'], ['太暗', '了', '，', '无法', '不能', '起床', '。'], ['我', '怀疑', '汤姆', 

# 开始训练

In [21]:
INPUT_DIM = len(en_TEXT.vocab)
OUTPUT_DIM = len(ch_TEXT.vocab)
# 超参数
BATCH_SIZE = 32
ENC_EMB_DIM = 256
DEC_EMB_DIM = 256
HID_DIM = 512
N_LAYERS = 2
ENC_DROPOUT = 0.5
DEC_DROPOUT = 0.5
LEARNING_RATE = 1e-4
N_EPOCHS = 50
CLIP = 1

bidirectional = True
attn_method = "general"
enc = Encoder(INPUT_DIM, ENC_EMB_DIM, HID_DIM, N_LAYERS, ENC_DROPOUT, bidirectional)
dec = AttnDecoder(OUTPUT_DIM, DEC_EMB_DIM, HID_DIM, N_LAYERS, DEC_DROPOUT, bidirectional, attn_method)
model = Seq2Seq(enc, dec, device, basic_dict=ch_TEXT.vocab.stoi).to(device)

optimizer = optim.Adam(model.parameters(), lr=LEARNING_RATE)

In [22]:
# 数据集
data_set = TranslationDataset(en_num_data, ch_num_data)
train_size = int(len(data_set) * 0.9)
test_size = len(data_set) - train_size
print(train_size)
print(test_size)

train_set, test_set = torch.utils.data.random_split(data_set, [train_size, test_size])
train_loader = DataLoader(train_set, batch_size = BATCH_SIZE, collate_fn=padding_batch)
test_loader = DataLoader(test_set, batch_size = BATCH_SIZE, collate_fn=padding_batch)

# train_set = TranslationDataset(en_num_data, ch_num_data)
# train_loader = DataLoader(train_set, batch_size=BATCH_SIZE, collate_fn=padding_batch)

21924
2436


In [23]:
best_valid_loss = float('inf')

for epoch in range(N_EPOCHS):
    
    start_time = time.time()
    train_loss = train(model, train_loader, optimizer, CLIP)
    valid_loss = evaluate(model, train_loader)
    test_loss = evaluate(model, test_loader)

    end_time = time.time()
    
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), "en2ch-attn-model.pt")

    if epoch %2 == 0:
        epoch_mins, epoch_secs = epoch_time(start_time, end_time)
        print(f'Epoch: {epoch+1:02} | Time: {epoch_mins}m {epoch_secs}s')
        print(f'\tTrain Loss: {train_loss:.3f} | Val. Loss: {valid_loss:.3f} | Test Loss: {test_loss:.3f}')

Epoch: 01 | Time: 0m 47s
	Train Loss: 5.097 | Val. Loss: 5.285 | Test Loss: 4.804
Epoch: 03 | Time: 0m 47s
	Train Loss: 4.066 | Val. Loss: 4.000 | Test Loss: 4.459
Epoch: 05 | Time: 0m 47s
	Train Loss: 3.549 | Val. Loss: 3.538 | Test Loss: 4.247
Epoch: 07 | Time: 0m 46s
	Train Loss: 3.142 | Val. Loss: 3.162 | Test Loss: 4.117
Epoch: 09 | Time: 0m 46s
	Train Loss: 2.788 | Val. Loss: 2.855 | Test Loss: 4.047
Epoch: 11 | Time: 0m 46s
	Train Loss: 2.486 | Val. Loss: 2.574 | Test Loss: 4.004
Epoch: 13 | Time: 0m 46s
	Train Loss: 2.204 | Val. Loss: 2.353 | Test Loss: 4.002
Epoch: 15 | Time: 0m 46s
	Train Loss: 1.967 | Val. Loss: 2.108 | Test Loss: 4.001
Epoch: 17 | Time: 0m 46s
	Train Loss: 1.760 | Val. Loss: 1.922 | Test Loss: 4.017
Epoch: 19 | Time: 0m 46s
	Train Loss: 1.571 | Val. Loss: 1.774 | Test Loss: 4.043
Epoch: 21 | Time: 0m 47s
	Train Loss: 1.401 | Val. Loss: 1.565 | Test Loss: 4.052
Epoch: 23 | Time: 0m 46s
	Train Loss: 1.251 | Val. Loss: 1.433 | Test Loss: 4.092
Epoch: 25 | Time

In [24]:
print("best valid loss：", best_valid_loss)
# 加载最优权重
model.load_state_dict(torch.load("en2ch-attn-model.pt"))

best valid loss： 0.24004792807584374


<All keys matched successfully>

In [88]:
random.seed(2030)
for i in random.sample(range(len(en_num_data)), 1):  # 随机看10个
    en_tokens = list(filter(lambda x: x!=1, en_num_data[i]))  # 过滤零
    ch_tokens = list(filter(lambda x: x!=3 and x!=1, ch_num_data[i]))  # 和机器翻译作对照
    sentence = [en_TEXT.vocab.itos[t] for t in en_tokens]
    print(type(sentence[0]))
    print("【原文】")
    print("".join(sentence))
    translation = [ch_TEXT.vocab.itos[t] for t in ch_tokens]
    print("【原文】")
    print(" ".join(translation))
    test_sample = {}
    test_sample["src"] = torch.tensor(en_tokens, dtype=torch.long, device=device).reshape(-1, 1)
    test_sample["src_len"] = [len(en_tokens)]
    print(test_sample["src"])
    print(test_sample["src_len"])
    print("【机器翻译】")
    print(translate(model, test_sample, ch_TEXT.vocab.itos), end="\n\n")

<class 'str'>
【原文】
Tomlikestoplaybaseball.<eos>
【原文】
汤姆 喜欢 打 棒球 。
tensor([[ 13],
        [234],
        [  6],
        [190],
        [434],
        [  4],
        [  3]], device='cuda:0')
[7]
【机器翻译】
['汤姆', '喜欢', '打', '棒球', '。']

