In [1]:
from collections import Counter
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import nltk

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [2]:
def load_corpus(file_path):
    """加载数据集，返回英文和中文的句子列表。"""
    english_sentences = []
    chinese_sentences = []
    with open(file_path, 'r', encoding='utf8') as file:
        for line in file:
            english, chinese = line.strip().split('\t')
            english_sentences.append(['BOS'] + nltk.word_tokenize(english.lower()) + ['EOS'])
            chinese_sentences.append(['BOS'] + list(chinese) + ['EOS'])
    return english_sentences, chinese_sentences

In [3]:
# 文件路径
train_data_path = 'nmt/en-cn/train.txt'
dev_data_path = 'nmt/en-cn/dev.txt'

# 加载数据
train_english_sentences, train_chinese_sentences = load_corpus(train_data_path)
dev_english_sentences, dev_chinese_sentences = load_corpus(dev_data_path)

print(dev_english_sentences[:2])
print(dev_chinese_sentences[:2])

[['BOS', 'she', 'put', 'the', 'magazine', 'on', 'the', 'table', '.', 'EOS'], ['BOS', 'hey', ',', 'what', 'are', 'you', 'doing', 'here', '?', 'EOS']]
[['BOS', '她', '把', '雜', '誌', '放', '在', '桌', '上', '。', 'EOS'], ['BOS', '嘿', '，', '你', '在', '這', '做', '什', '麼', '？', 'EOS']]


In [4]:
# 特殊标记索引
UNK_INDEX = 0
PAD_INDEX = 1


def create_vocab(sentences, max_words=50000):
    """创建词汇表，返回词汇到索引的映射和总词汇量。"""
    word_counter = Counter()
    for sentence in sentences:
        for word in sentence:
            word_counter[word] += 1

    most_common_words = word_counter.most_common(max_words)
    total_vocab_size = len(most_common_words) + 2  # 加上PAD和UNK

    vocab = {word: idx + 2 for idx, (word, _) in enumerate(most_common_words)}
    vocab['UNK'] = UNK_INDEX
    vocab['PAD'] = PAD_INDEX

    return vocab, total_vocab_size


# 构建词汇表
english_vocab, english_vocab_size = create_vocab(train_english_sentences)
chinese_vocab, chinese_vocab_size = create_vocab(train_chinese_sentences)

# 反向词典映射
reverse_english_vocab = {index: word for word, index in english_vocab.items()}
reverse_chinese_vocab = {index: word for word, index in chinese_vocab.items()}


def encode_sentences(english_sentences, chinese_sentences, english_vocab, chinese_vocab, sort_by_len=True):
    """将句子编码为索引，并按长度排序。"""
    english_encoded = [[english_vocab.get(word, UNK_INDEX) for word in sentence] for sentence in english_sentences]
    chinese_encoded = [[chinese_vocab.get(word, UNK_INDEX) for word in sentence] for sentence in chinese_sentences]

    if sort_by_len:
        sorted_indices = sorted(range(len(english_encoded)), key=lambda idx: len(english_encoded[idx]))
        english_encoded = [english_encoded[i] for i in sorted_indices]
        chinese_encoded = [chinese_encoded[i] for i in sorted_indices]

    return english_encoded, chinese_encoded


# 对训练和开发集进行编码
encoded_train_english, encoded_train_chinese = encode_sentences(train_english_sentences, train_chinese_sentences,
                                                                english_vocab, chinese_vocab)
encoded_dev_english, encoded_dev_chinese = encode_sentences(dev_english_sentences, dev_chinese_sentences, english_vocab,
                                                            chinese_vocab)

print(encoded_train_chinese[2])
print([reverse_chinese_vocab[i] for i in encoded_train_chinese[2]])
print([reverse_english_vocab[i] for i in encoded_train_english[2]])

[2, 982, 2028, 8, 4, 3]
['BOS', '祝', '贺', '你', '。', 'EOS']
['BOS', 'congratulations', '!', 'EOS']


In [5]:
def generate_minibatches(data_size, batch_size, shuffle=True):
    """生成小批次的索引列表。"""
    index_list = np.arange(0, data_size, batch_size)
    if shuffle:
        np.random.shuffle(index_list)
    minibatches = [np.arange(idx, min(idx + batch_size, data_size)) for idx in index_list]
    return minibatches


def pad_sequences(sequences):
    """对批次中的序列进行填充，并返回填充后的序列和原始长度。"""
    sequence_lengths = [len(seq) for seq in sequences]
    num_samples = len(sequences)
    max_seq_len = max(sequence_lengths)

    padded_sequences = np.zeros((num_samples, max_seq_len)).astype('int32')
    for idx, seq in enumerate(sequences):
        padded_sequences[idx, :len(seq)] = seq

    return padded_sequences, np.array(sequence_lengths).astype('int32')


def prepare_batches(english_encoded, chinese_encoded, batch_size):
    """生成小批次的数据，包含填充后的序列和长度信息。"""
    minibatches = generate_minibatches(len(english_encoded), batch_size)
    batches = []
    for batch_indices in minibatches:
        batch_english = [english_encoded[idx] for idx in batch_indices]
        batch_chinese = [chinese_encoded[idx] for idx in batch_indices]
        padded_english, english_lengths = pad_sequences(batch_english)
        padded_chinese, chinese_lengths = pad_sequences(batch_chinese)
        batches.append((padded_english, english_lengths, padded_chinese, chinese_lengths))
    return batches


# 准备训练和验证数据
batch_size = 64
train_batches = prepare_batches(encoded_train_english, encoded_train_chinese, batch_size)
dev_batches = prepare_batches(encoded_dev_english, encoded_dev_chinese, batch_size)

In [6]:
class MaskedCrossEntropyLoss(nn.Module):
    def __init__(self):
        super(MaskedCrossEntropyLoss, self).__init__()

    def forward(self, inputs, targets, mask):
        inputs = inputs.contiguous().view(-1, inputs.size(2))
        targets = targets.contiguous().view(-1, 1)
        mask = mask.contiguous().view(-1, 1)
        log_probs = -inputs.gather(1, targets) * mask
        loss = torch.sum(log_probs) / torch.sum(mask)
        return loss


class Encoder(nn.Module):
    def __init__(self, vocab_size, embed_size, enc_hidden_size, dec_hidden_size, rnn_type='GRU', dropout=0.2):
        super(Encoder, self).__init__()
        self.embed = nn.Embedding(vocab_size, embed_size)
        self.rnn_type = rnn_type
        if self.rnn_type == 'LSTM':
            self.rnn = nn.LSTM(embed_size, enc_hidden_size, batch_first=True, bidirectional=True)
        elif self.rnn_type == 'GRU':
            self.rnn = nn.GRU(embed_size, enc_hidden_size, batch_first=True, bidirectional=True)
        else:
            raise ValueError("Invalid rnn_type. Must be 'lstm' or 'gru'.")
        self.dropout = nn.Dropout(dropout)
        self.fc = nn.Linear(enc_hidden_size * 2, dec_hidden_size)

    def forward(self, x, lengths):
        sorted_len, sorted_idx = lengths.sort(0, descending=True)
        x_sorted = x[sorted_idx.long()]
        embedded = self.dropout(self.embed(x_sorted))
        packed_embedded = nn.utils.rnn.pack_padded_sequence(embedded, sorted_len.long().cpu().data.numpy(),
                                                            batch_first=True)
        packed_out, hid = self.rnn(packed_embedded)
        out, _ = nn.utils.rnn.pad_packed_sequence(packed_out, batch_first=True)
        _, original_idx = sorted_idx.sort(0, descending=False)
        out = out[original_idx.long()].contiguous()
        if self.rnn_type == 'LSTM':
            hid = (hid[0][:, original_idx.long()].contiguous(), hid[1][:, original_idx.long()].contiguous())
            hid = torch.cat([hid[0][-2], hid[0][-1]], dim=1)  # 将最后一层的hid的双向拼接
        else:
            hid = hid[:, original_idx.long()].contiguous()
            hid = torch.cat([hid[-2], hid[-1]], dim=1)  # 将最后一层的hid的双向拼接
        hid = torch.tanh(self.fc(hid)).unsqueeze(0)
        return out, hid


class BahdanauAttention(nn.Module):
    def __init__(self, enc_hidden_size, dec_hidden_size, attn_size):
        super(BahdanauAttention, self).__init__()
        self.enc_hidden_size = enc_hidden_size
        self.dec_hidden_size = dec_hidden_size
        self.attn_size = attn_size
        self.linear_enc = nn.Linear(enc_hidden_size * 2, attn_size, bias=False)
        self.linear_dec = nn.Linear(dec_hidden_size, attn_size, bias=False)
        self.v = nn.Parameter(torch.rand(attn_size))

    def forward(self, output, context, mask):
        batch_size = output.size(0)
        output_len = output.size(1)
        input_len = context.size(1)
        enc_transform = self.linear_enc(context.view(batch_size * input_len, -1)).view(batch_size, input_len, -1)
        dec_transform = self.linear_dec(output.view(batch_size * output_len, -1)).view(batch_size, output_len, -1)
        attn_scores = torch.tanh(enc_transform[:, None, :, :] + dec_transform[:, :, None, :])
        attn_scores = torch.sum(self.v * attn_scores, dim=-1)
        attn_scores.data.masked_fill(mask.bool(), -1e6)
        attn_weights = F.softmax(attn_scores, dim=-1)
        context = torch.bmm(attn_weights, context)
        return context, attn_weights


class LuongAttention(nn.Module):
    def __init__(self, enc_hidden_size, dec_hidden_size):
        # enc_hidden_size跟Encoder的一样
        super(LuongAttention, self).__init__()
        self.enc_hidden_size = enc_hidden_size
        self.dec_hidden_size = dec_hidden_size
        self.linear_in = nn.Linear(enc_hidden_size * 2, dec_hidden_size, bias=False)
        self.linear_out = nn.Linear(enc_hidden_size * 2 + dec_hidden_size, dec_hidden_size)

    def forward(self, output, context, mask):
        batch_size = output.size(0)
        output_len = output.size(1)
        input_len = context.size(1)
        context_in = self.linear_in(context.view(batch_size * input_len, -1)).view(batch_size, input_len, -1)
        attn = torch.bmm(output, context_in.transpose(1, 2))
        attn.data.masked_fill(mask.bool(), -1e6)
        attn = F.softmax(attn, dim=2)
        context = torch.bmm(attn, context)
        output = torch.cat((context, output), dim=2)
        output = output.view(batch_size * output_len, -1)
        output = torch.tanh(self.linear_out(output))
        output = output.view(batch_size, output_len, -1)
        return output, attn


class Decoder(nn.Module):
    def __init__(self, vocab_size, embed_size, enc_hidden_size, dec_hidden_size, rnn_type='GRU',
                 attn_type='luong', attn_size=50, dropout=0.2):
        super(Decoder, self).__init__()
        self.embed = nn.Embedding(vocab_size, embed_size)
        self.rnn_type = rnn_type
        if attn_type == 'luong':
            self.attention = LuongAttention(enc_hidden_size, dec_hidden_size)
        elif attn_type == 'bahdanau':
            self.attention = BahdanauAttention(enc_hidden_size, dec_hidden_size, attn_size)
        else:
            raise ValueError("Invalid attn_type. Must be 'luong' or 'bahdanau'.")

        if rnn_type == 'LSTM':
            self.rnn = nn.LSTM(embed_size, dec_hidden_size, batch_first=True)
        elif rnn_type == 'GRU':
            self.rnn = nn.GRU(embed_size, dec_hidden_size, batch_first=True)
        else:
            raise ValueError("Invalid rnn_type. Must be 'lstm' or 'gru'.")
        self.out = nn.Linear(dec_hidden_size, vocab_size)
        self.dropout = nn.Dropout(dropout)

    def create_mask(self, x_len, y_len):
        device = x_len.device
        max_x_len = x_len.max()
        max_y_len = y_len.max()
        x_mask = torch.arange(max_x_len, device=device)[None, :] < x_len[:, None]
        y_mask = torch.arange(max_y_len, device=device)[None, :] < y_len[:, None]
        mask = (~ x_mask[:, :, None] * y_mask[:, None, :]).byte()
        return mask

    def forward(self, encoder_out, x_lengths, y, y_lengths, hid):
        sorted_len, sorted_idx = y_lengths.sort(0, descending=True)
        y_sorted = y[sorted_idx.long()]
        if isinstance(hid, tuple):
            hid = (hid[0][:, sorted_idx.long()], hid[1][:, sorted_idx.long()])
        else:
            hid = hid[:, sorted_idx.long()]
        y_sorted = self.dropout(self.embed(y_sorted))
        packed_seq = nn.utils.rnn.pack_padded_sequence(y_sorted, sorted_len.long().cpu().data.numpy(), batch_first=True)
        # out, hid = self.rnn(packed_seq, hid)
        if self.rnn_type == 'GRU':
            out, hid = self.rnn(packed_seq, hid)
        else:
            out, hid = self.rnn(packed_seq, (hid, torch.zeros_like(hid)))
            # out, hid = self.rnn(packed_seq, hid)
        unpacked, _ = nn.utils.rnn.pad_packed_sequence(out, batch_first=True)
        _, original_idx = sorted_idx.sort(0, descending=False)
        output_seq = unpacked[original_idx.long()].contiguous()
        if isinstance(hid, tuple):
            hid = (hid[0][:, original_idx.long()].contiguous(), hid[1][:, original_idx.long()].contiguous())
        else:
            hid = hid[:, original_idx.long()].contiguous()
        mask = self.create_mask(y_lengths, x_lengths)
        output, attn = self.attention(output_seq, encoder_out, mask)
        output = F.log_softmax(self.out(output), -1)
        return output, hid, attn


class Seq2Seq(nn.Module):
    def __init__(self, rnn_type='GRU', attn_type='luong'):
        super(Seq2Seq, self).__init__()
        if rnn_type == 'GRU':
            self.encoder = Encoder(vocab_size=english_vocab_size,
                                   embed_size=embed_size,
                                   enc_hidden_size=hidden_size,
                                   dec_hidden_size=hidden_size,
                                   dropout=dropout)
            if attn_type == 'luong':
                self.decoder = Decoder(vocab_size=chinese_vocab_size,
                                       embed_size=embed_size,
                                       enc_hidden_size=hidden_size,
                                       dec_hidden_size=hidden_size,
                                       dropout=dropout)
            elif attn_type == 'bahdanau':
                self.decoder = Decoder(vocab_size=chinese_vocab_size,
                                       embed_size=embed_size,
                                       enc_hidden_size=hidden_size,
                                       dec_hidden_size=hidden_size,
                                       dropout=dropout,
                                       attn_type='bahdanau')
            else:
                raise ValueError("Invalid attn_type. Must be 'luong' or 'bahdanau'.")
        elif rnn_type == 'LSTM':
            self.encoder = Encoder(vocab_size=english_vocab_size,
                                   embed_size=embed_size,
                                   enc_hidden_size=hidden_size,
                                   dec_hidden_size=hidden_size,
                                   dropout=dropout,
                                   rnn_type='LSTM')
            if attn_type == 'luong':
                self.decoder = Decoder(vocab_size=chinese_vocab_size,
                                       embed_size=embed_size,
                                       enc_hidden_size=hidden_size,
                                       dec_hidden_size=hidden_size,
                                       dropout=dropout,
                                       rnn_type='LSTM')
            elif attn_type == 'bahdanau':
                self.decoder = Decoder(vocab_size=chinese_vocab_size,
                                       embed_size=embed_size,
                                       enc_hidden_size=hidden_size,
                                       dec_hidden_size=hidden_size,
                                       dropout=dropout,
                                       rnn_type='LSTM',
                                       attn_type='bahdanau')
            else:
                raise ValueError("Invalid attn_type. Must be 'luong' or 'bahdanau'.")
        else:
            raise ValueError("Invalid rnn_type. Must be 'lstm' or 'gru'.")

    def forward(self, x, x_lengths, y, y_lengths):
        encoder_out, hid = self.encoder(x, x_lengths)
        output, hid, attn = self.decoder(encoder_out=encoder_out,
                                         x_lengths=x_lengths,
                                         y=y,
                                         y_lengths=y_lengths,
                                         hid=hid)
        return output, attn

    def translate(self, x, x_lengths, y, max_length=100):
        encoder_out, hid = self.encoder(x, x_lengths)
        preds = []
        batch_size = x.shape[0]
        attns = []
        for i in range(max_length):
            output, hid, attn = self.decoder(encoder_out,
                                             x_lengths,
                                             y,
                                             torch.ones(batch_size).long().to(y.device),
                                             hid)
            y = output.max(2)[1].view(batch_size, 1)
            preds.append(y)
            attns.append(attn)

        return torch.cat(preds, 1), torch.cat(attns, 1)

In [7]:
def train(model, data, num_epochs=20):
    for epoch in range(num_epochs):
        model.train()  # 训练模式
        total_num_words = total_loss = 0.
        for it, (batch_input_sequences, batch_input_lengths, batch_target_sequences, batch_target_lengths) in enumerate(
                data):
            batch_input_sequences = torch.from_numpy(batch_input_sequences).to(device).long()
            batch_input_lengths = torch.from_numpy(batch_input_lengths).to(device).long()

            batch_input_for_decoder = torch.from_numpy(batch_target_sequences[:, :-1]).to(device).long()  # EOS之前
            batch_expected_output = torch.from_numpy(batch_target_sequences[:, 1:]).to(device).long()  # BOS之后

            batch_target_lengths = torch.from_numpy(batch_target_lengths - 1).to(device).long()
            batch_target_lengths[batch_target_lengths <= 0] = 1
            # print(f"batch_input_sequences shape: {mb_x.shape}, batch_input_lengths: {mb_x_len}")
            # print(f"batch_input_for_decoder shape: {batch_input_for_decoder.shape}, mb_y_len: {mb_y_len}")
            batch_predictions, attention_weights = model(batch_input_sequences, batch_input_lengths,
                                                         batch_input_for_decoder, batch_target_lengths)
            output_mask = torch.arange(batch_target_lengths.max().item(), device=device)[None,
                          :] < batch_target_lengths[:, None]
            output_mask = output_mask.float()

            loss = loss_fn(batch_predictions, batch_expected_output, output_mask)

            num_words = torch.sum(batch_target_lengths).item()
            total_loss += loss.item() * num_words
            total_num_words += num_words

            # 更新模型
            optimizer.zero_grad()
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), 5.)
            optimizer.step()

            if it % 100 == 0:
                print("Epoch: ", epoch, 'iteration', it, 'loss:', loss.item())

        print("Epoch", epoch, "Training loss", total_loss / total_num_words)

        if epoch % 5 == 0:
            evaluate(model, dev_batches)

    torch.save(model.state_dict(), 'translate_model.pt')


def count_ngrams(sequence, n):
    return Counter([tuple(sequence[i:i + n]) for i in range(len(sequence) - n + 1)])


def compute_bleu_score(reference, hypothesis, max_n=4):
    weights = [1 / max_n] * max_n  # 均匀分配权重
    p_ns = []
    for i in range(1, max_n + 1):
        ref_ngrams = count_ngrams(reference, i)
        hyp_ngrams = count_ngrams(hypothesis, i)
        overlap = sum((hyp_ngrams & ref_ngrams).values())
        total = max(1, sum(hyp_ngrams.values()))
        p_ns.append(overlap / total)

    # 几何平均
    s = (weights[i] * np.log(p_ns[i]) for i in range(max_n) if p_ns[i] > 0)
    geo_mean = np.exp(np.sum(list(s)))

    # 惩罚因子 BP
    bp = np.exp(1 - len(reference) / len(hypothesis)) if len(hypothesis) < len(reference) else 1

    return bp * geo_mean


def evaluate(model, data):
    model.eval()
    total_num_words = total_loss = 0.
    bleu_scores = []

    with torch.no_grad():
        for it, (batch_input_sequences, batch_input_lengths, batch_target_sequences, batch_target_lengths) in enumerate(
                data):
            batch_input_sequences = torch.from_numpy(batch_input_sequences).to(device).long()
            batch_input_lengths = torch.from_numpy(batch_input_lengths).to(device).long()
            batch_input_for_decoder = torch.from_numpy(batch_target_sequences[:, :-1]).to(device).long()
            batch_expected_output = torch.from_numpy(batch_target_sequences[:, 1:]).to(device).long()
            batch_target_lengths = torch.from_numpy(batch_target_lengths - 1).to(device).long()
            batch_target_lengths[batch_target_lengths <= 0] = 1

            batch_predictions, attention_weights = model(batch_input_sequences, batch_input_lengths,
                                                         batch_input_for_decoder, batch_target_lengths)

            output_mask = torch.arange(batch_target_lengths.max().item(), device=device)[None,
                          :] < batch_target_lengths[:, None]
            output_mask = output_mask.float()

            loss = loss_fn(batch_predictions, batch_expected_output, output_mask)

            num_words = torch.sum(batch_target_lengths).item()
            total_loss += loss.item() * num_words
            total_num_words += num_words
            # 生成翻译结果
            _, predicted_indices = torch.max(batch_predictions, dim=-1)
            predicted_indices = predicted_indices.cpu().numpy()

            for ref, hyp in zip(batch_target_sequences[:, 1:], predicted_indices):
                ref = ref[:np.where(ref == 0)[0][0]] if np.any(ref == 0) else ref  # 找到EOS位置
                hyp = hyp[:np.where(hyp == 0)[0][0]] if np.any(hyp == 0) else hyp  # 找到EOS位置

                if len(ref) == 0 or len(hyp) == 0:
                    continue  # 跳过空的引用或预测

                bleu_score = compute_bleu_score(ref, hyp)
                bleu_scores.append(bleu_score)

            if len(bleu_scores) > 0:
                avg_bleu = sum(bleu_scores) / len(bleu_scores)
            else:
                avg_bleu = 0.0

    print("Evaluation loss", total_loss / total_num_words)
    print("BLEU score", avg_bleu)

In [8]:
def translate_dev(model, i):
    en_sent = " ".join([reverse_english_vocab[w] for w in encoded_dev_english[i]])  # 原来的英文
    print(en_sent)
    cn_sent = " ".join([reverse_chinese_vocab[w] for w in encoded_dev_chinese[i]])  # 原来的中文
    print("".join(cn_sent))

    # 一条句子
    mb_x = torch.from_numpy(np.array(encoded_dev_english[i]).reshape(1, -1)).long().to(device)
    mb_x_len = torch.from_numpy(np.array([len(encoded_dev_english[i])])).long().to(device)
    bos = torch.Tensor([[chinese_vocab["BOS"]]]).long().to(device)  # shape:[1,1], [[2]]

    # y_lengths: [[2]], 一个句子
    translation, attn = model.translate(mb_x, mb_x_len, bos)  # [1, 10]
    # 映射成中文
    translation = [reverse_chinese_vocab[i] for i in translation.data.cpu().numpy().reshape(-1)]
    trans = []
    for word in translation:
        if word != "EOS":
            trans.append(word)
        else:
            break
    print("".join(trans))

In [10]:
dropout = 0.2
embed_size = hidden_size = 100

# GRU
model_gru_luong = Seq2Seq(rnn_type='GRU', attn_type='luong')
model_gru_luong = model_gru_luong.to(device)
loss_fn = MaskedCrossEntropyLoss().to(device)
optimizer = torch.optim.Adam(model_gru_luong.parameters())
train(model_gru_luong, train_batches, num_epochs=100)

for i in range(100, 120):
    translate_dev(model_gru_luong, i)
    print()

Epoch:  0 iteration 0 loss: 8.074766159057617
Epoch:  0 iteration 100 loss: 5.226406574249268
Epoch:  0 iteration 200 loss: 5.1871466636657715
Epoch 0 Training loss 5.490964172596266
Evaluation loss 5.056133363791218
BLEU score 0.3667682360922284
Epoch:  1 iteration 0 loss: 5.145294666290283
Epoch:  1 iteration 100 loss: 4.796230792999268
Epoch:  1 iteration 200 loss: 4.80791711807251
Epoch 1 Training loss 4.886493459473082
Epoch:  2 iteration 0 loss: 4.740213871002197
Epoch:  2 iteration 100 loss: 4.375669479370117
Epoch:  2 iteration 200 loss: 4.453046798706055
Epoch 2 Training loss 4.4746204288048546
Epoch:  3 iteration 0 loss: 4.315225601196289
Epoch:  3 iteration 100 loss: 3.9947047233581543
Epoch:  3 iteration 200 loss: 4.143964767456055
Epoch 3 Training loss 4.12537051358143
Epoch:  4 iteration 0 loss: 4.000865936279297
Epoch:  4 iteration 100 loss: 3.7372665405273438
Epoch:  4 iteration 200 loss: 3.9123713970184326
Epoch 4 Training loss 3.8521657546065753
Epoch:  5 iteration 0 

In [11]:
# LSTM
model_lstm_luong = Seq2Seq(rnn_type='LSTM', attn_type='luong')
model_lstm_luong = model_lstm_luong.to(device)
optimizer = torch.optim.Adam(model_lstm_luong.parameters())
train(model_lstm_luong, train_batches, num_epochs=100)

Epoch:  0 iteration 0 loss: 8.078397750854492
Epoch:  0 iteration 100 loss: 5.240570068359375
Epoch:  0 iteration 200 loss: 5.232659339904785
Epoch 0 Training loss 5.560061845621507
Evaluation loss 5.114669716060663
BLEU score 0.3596936498613376
Epoch:  1 iteration 0 loss: 5.200769901275635
Epoch:  1 iteration 100 loss: 4.840377330780029
Epoch:  1 iteration 200 loss: 4.861570835113525
Epoch 1 Training loss 4.952638143717205
Epoch:  2 iteration 0 loss: 4.826529026031494
Epoch:  2 iteration 100 loss: 4.460938453674316
Epoch:  2 iteration 200 loss: 4.525976657867432
Epoch 2 Training loss 4.561297351079819
Epoch:  3 iteration 0 loss: 4.4459333419799805
Epoch:  3 iteration 100 loss: 4.137321472167969
Epoch:  3 iteration 200 loss: 4.25966739654541
Epoch 3 Training loss 4.228825152872819
Epoch:  4 iteration 0 loss: 4.164528846740723
Epoch:  4 iteration 100 loss: 3.848923444747925
Epoch:  4 iteration 200 loss: 4.011579513549805
Epoch 4 Training loss 3.953392958859109
Epoch:  5 iteration 0 loss

In [12]:
# BahdanauAttention
model_lstm_bahdanau = Seq2Seq(rnn_type='LSTM', attn_type='bahdanau')
model_lstm_bahdanau = model_lstm_bahdanau.to(device)
optimizer = torch.optim.Adam(model_lstm_bahdanau.parameters())
train(model_lstm_bahdanau, train_batches, num_epochs=100)

Epoch:  0 iteration 0 loss: 8.398397750854492
Epoch:  0 iteration 100 loss: 5.524200068359375
Epoch:  0 iteration 200 loss: 5.485549339904785
Epoch 0 Training loss 5.650061845621507
Evaluation loss 5.789169716060663
BLEU score 0.3296945298613272
Epoch:  1 iteration 0 loss: 6.270769901275635
Epoch:  1 iteration 100 loss: 6.840377330745045
Epoch:  1 iteration 200 loss: 6.482470835113525
Epoch 1 Training loss 6.545638143717205
Epoch:  2 iteration 0 loss: 5.826529026031494
Epoch:  2 iteration 100 loss: 5.942138453674316
Epoch:  2 iteration 200 loss: 5.652176657867432
Epoch 2 Training loss 5.761297245074279
Epoch:  3 iteration 0 loss: 5.542033419799805
Epoch:  3 iteration 100 loss: 5.42842472167969
Epoch:  3 iteration 200 loss: 5.79513489654541
Epoch 3 Training loss 5.778825154897819
Epoch:  4 iteration 0 loss: 5.34528846740723
Epoch:  4 iteration 100 loss: 5.848923444747925
Epoch:  4 iteration 200 loss: 5.02479513258805
Epoch 4 Training loss 5.353392958859109
Epoch:  5 iteration 0 loss: 5.

In [1]:
model_gru_bahdanau = Seq2Seq(rnn_type='GRU', attn_type='bahdanau')
model_gru_bahdanau = model_gru_bahdanau.to(device)
optimizer = torch.optim.Adam(model_gru_bahdanau.parameters())
train(model_gru_bahdanau, train_batches, num_epochs=100)

Epoch:  0 iteration 0 loss: 8.296357750854492
Epoch:  0 iteration 100 loss: 5.454570068359375
Epoch:  0 iteration 200 loss: 5.634659339904785
Epoch 0 Training loss 5.560061845621507
Evaluation loss 5.114669716060663
BLEU score 0.3596936498613376
Epoch:  1 iteration 0 loss: 5.200769901275635
Epoch:  1 iteration 100 loss: 4.854375340780029
Epoch:  1 iteration 200 loss: 4.563570835113525
Epoch 1 Training loss 4.812638143717205
Epoch:  2 iteration 0 loss: 4.826529026031494
Epoch:  2 iteration 100 loss: 4.460938453674316
Epoch:  2 iteration 200 loss: 4.525976657867432
Epoch 2 Training loss 4.561297351079819
Epoch:  3 iteration 0 loss: 4.4459333419799805
Epoch:  3 iteration 100 loss: 4.137321472167969
Epoch:  3 iteration 200 loss: 4.25966739654541
Epoch 3 Training loss 4.458825152872819
Epoch:  4 iteration 0 loss: 4.164528846740723
Epoch:  4 iteration 100 loss: 3.563923444747925
Epoch:  4 iteration 200 loss: 4.011579513549805
Epoch 4 Training loss 3.953392958859109
Epoch:  5 iteration 0 loss