In [1]:
!pip install evaluate

Collecting evaluate
  Downloading evaluate-0.4.5-py3-none-any.whl.metadata (9.5 kB)
Downloading evaluate-0.4.5-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m2.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: evaluate
Successfully installed evaluate-0.4.5


In [2]:
import os, math, random, copy
from dataclasses import dataclass
from typing import Dict, List
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader
from datasets import load_dataset
import evaluate
from transformers import MarianTokenizer

SEED = 42
random.seed(SEED); np.random.seed(SEED); torch.manual_seed(SEED); torch.cuda.manual_seed_all(SEED)

DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("DEVICE:", DEVICE)

DEVICE: cuda


In [3]:
# Tokenizer
TOKENIZER_NAME = "Helsinki-NLP/opus-mt-en-de"
tokenizer = MarianTokenizer.from_pretrained(TOKENIZER_NAME)

if tokenizer.bos_token is None:
    tokenizer.add_special_tokens({"bos_token": "<bos>"})
if tokenizer.eos_token is None:
    tokenizer.add_special_tokens({"eos_token": "</s>"})

PAD_ID = tokenizer.pad_token_id
BOS_ID = tokenizer.bos_token_id
EOS_ID = tokenizer.eos_token_id
VOCAB_SIZE = len(tokenizer)

SRC_LANG = "en"  # 영어 -> 독일어
TGT_LANG = "de"
MAX_SRC_LEN = 128
MAX_TGT_LEN = 128

PAD_ID, BOS_ID, EOS_ID, VOCAB_SIZE

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/42.0 [00:00<?, ?B/s]

source.spm:   0%|          | 0.00/768k [00:00<?, ?B/s]

target.spm:   0%|          | 0.00/797k [00:00<?, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

config.json: 0.00B [00:00, ?B/s]



(58100, 58101, 0, 58102)

In [4]:
def clones(module, N):
    return nn.ModuleList([copy.deepcopy(module) for _ in range(N)])

In [5]:
class LayerNorm(nn.Module):
    def __init__(self, features, eps=1e-6):
        super().__init__()
        self.a_2 = nn.Parameter(torch.ones(features))
        self.b_2 = nn.Parameter(torch.zeros(features))
        self.eps = eps
    def forward(self, x):
        mean = x.mean(-1, keepdim=True)
        std = x.std(-1, keepdim=True)
        return self.a_2 * (x - mean) / (std + self.eps) + self.b_2

In [6]:
class SublayerConnection(nn.Module):
    def __init__(self, size, dropout):
        super().__init__()
        self.norm = LayerNorm(size)
        self.dropout = nn.Dropout(dropout)
    def forward(self, x, sublayer):
        return x + self.dropout(sublayer(self.norm(x)))

In [7]:
def attention(query, key, value, mask=None, dropout=None):
    d_k = query.size(-1)
    scores = torch.matmul(query, key.transpose(-2,-1)) / math.sqrt(d_k)
    if mask is not None:
        scores = scores.masked_fill(mask == 0, float("-inf"))
    p_attn = scores.softmax(dim=-1)
    if dropout is not None:
        p_attn = dropout(p_attn)
    return torch.matmul(p_attn, value), p_attn

In [8]:
class MultiHeadedAttention(nn.Module):
    def __init__(self, h, d_model, dropout=0.1):
        super().__init__()
        assert d_model % h == 0
        self.d_k = d_model // h
        self.h = h
        self.linears = clones(nn.Linear(d_model, d_model), 4)
        self.attn = None
        self.dropout = nn.Dropout(dropout)
    def forward(self, query, key, value, mask=None):
        if mask is not None:
            mask = mask.unsqueeze(1)
        nbatches = query.size(0)
        query, key, value =             [l(x).view(nbatches, -1, self.h, self.d_k).transpose(1,2)
             for l, x in zip(self.linears, (query, key, value))]
        x, self.attn = attention(query, key, value, mask=mask, dropout=self.dropout)
        x = x.transpose(1,2).contiguous().view(nbatches, -1, self.h * self.d_k)
        return self.linears[-1](x)

In [9]:
class PositionwiseFeedForward(nn.Module):
    def __init__(self, d_model, d_ff, dropout=0.1):
        super().__init__()
        self.w_1 = nn.Linear(d_model, d_ff)
        self.w_2 = nn.Linear(d_ff, d_model)
        self.dropout = nn.Dropout(dropout)
    def forward(self, x):
        return self.w_2(self.dropout(F.relu(self.w_1(x))))

In [10]:
class PositionalEncoding(nn.Module):
    def __init__(self, d_model, dropout, max_len=5000):
        super().__init__()
        self.dropout = nn.Dropout(dropout)
        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2) * -(math.log(10000.0) / d_model))
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)
        pe = pe.unsqueeze(0)
        self.register_buffer('pe', pe)
    def forward(self, x):
        x = x + self.pe[:, :x.size(1)]
        return self.dropout(x)

In [11]:
class EncoderLayer(nn.Module):
    def __init__(self, size, self_attn, feed_forward, dropout):
        super().__init__()
        self.self_attn = self_attn
        self.feed_forward = feed_forward
        self.sublayer = clones(SublayerConnection(size, dropout), 2)
        self.size = size
    def forward(self, x, mask):
        x = self.sublayer[0](x, lambda x: self.self_attn(x, x, x, mask))
        return self.sublayer[1](x, self.feed_forward)

In [12]:
class DecoderLayer(nn.Module):
    def __init__(self, size, self_attn, src_attn, feed_forward, dropout):
        super().__init__()
        self.self_attn = self_attn
        self.src_attn = src_attn
        self.feed_forward = feed_forward
        self.sublayer = clones(SublayerConnection(size, dropout), 3)
        self.size = size
    def forward(self, x, memory, src_mask, tgt_mask):
        m = memory
        x = self.sublayer[0](x, lambda x: self.self_attn(x, x, x, tgt_mask))
        x = self.sublayer[1](x, lambda x: self.src_attn(x, m, m, src_mask))
        return self.sublayer[2](x, self.feed_forward)

In [13]:
class Encoder(nn.Module):
    def __init__(self, layer, N):
        super().__init__()
        self.layers = clones(layer, N)
        self.norm = LayerNorm(layer.size)
    def forward(self, x, mask):
        for layer in self.layers:
            x = layer(x, mask)
        return self.norm(x)

In [14]:
class Decoder(nn.Module):
    def __init__(self, layer, N):
        super().__init__()
        self.layers = clones(layer, N)
        self.norm = LayerNorm(layer.size)
    def forward(self, x, memory, src_mask, tgt_mask):
        for layer in self.layers:
            x = layer(x, memory, src_mask, tgt_mask)
        return self.norm(x)

In [15]:
class Embeddings(nn.Module):
    def __init__(self, d_model, vocab):
        super().__init__()
        self.lut = nn.Embedding(vocab, d_model)
        self.d_model = d_model
    def forward(self, x):
        return self.lut(x) * math.sqrt(self.d_model)

In [16]:
class Generator(nn.Module):
    def __init__(self, d_model, vocab):
        super().__init__()
        self.proj = nn.Linear(d_model, vocab)
    def forward(self, x):
        return self.proj(x)

In [17]:
class EncoderDecoder(nn.Module):
    def __init__(self, encoder, decoder, src_embed, tgt_embed, generator):
        super().__init__()
        self.encoder = encoder
        self.decoder = decoder
        self.src_embed = src_embed
        self.tgt_embed = tgt_embed
        self.generator = generator
    def forward(self, src, src_mask, tgt, tgt_mask):
        return self.decode(self.encode(src, src_mask), src_mask, tgt, tgt_mask)
    def encode(self, src, src_mask):
        return self.encoder(self.src_embed(src), src_mask)
    def decode(self, memory, src_mask, tgt, tgt_mask):
        return self.decoder(self.tgt_embed(tgt), memory, src_mask, tgt_mask)

In [18]:
def subsequent_mask(size):
    attn_shape = (1, size, size)
    subsequent_mask = torch.triu(torch.ones(attn_shape), diagonal=1).type(torch.uint8)
    return (subsequent_mask == 0)

def make_std_mask(tgt, pad):
    tgt_mask = (tgt != pad).unsqueeze(-2) & subsequent_mask(tgt.size(-1)).to(tgt.device)
    return tgt_mask

def make_src_mask(src, pad):
    return (src != pad).unsqueeze(-2)

In [19]:
def make_model(src_vocab, tgt_vocab, N=6, d_model=512, d_ff=2048, h=8, dropout=0.1):
    c = copy.deepcopy
    attn = MultiHeadedAttention(h, d_model)
    ff = PositionwiseFeedForward(d_model, d_ff, dropout)
    position = PositionalEncoding(d_model, dropout)
    model = EncoderDecoder(
        Encoder(EncoderLayer(d_model, c(attn), c(ff), dropout), N),
        Decoder(DecoderLayer(d_model, c(attn), c(attn), c(ff), dropout), N),
        nn.Sequential(Embeddings(d_model, src_vocab), c(position)),
        nn.Sequential(Embeddings(d_model, tgt_vocab), c(position)),
        Generator(d_model, tgt_vocab)
    )
    for p in model.parameters():
        if p.dim() > 1:
            nn.init.xavier_uniform_(p)
    return model

In [20]:
from datasets import load_dataset

raw = load_dataset("wmt16", "de-en")
train_ds = raw.get("train")
valid_ds = raw.get("validation") or raw.get("validation1") or raw.get("dev")
test_ds  = raw.get("test")

from datasets import concatenate_datasets

def cap_select(ds, n, seed=42):
    if ds is None or n is None:
        return ds
    n_cap = min(n, len(ds))
    return ds.shuffle(seed=seed).select(range(n_cap))

N_TRAIN = 200_000
N_VALID = 3_000

train_ds = cap_select(train_ds, N_TRAIN, seed=SEED)
valid_ds = cap_select(valid_ds, N_VALID, seed=SEED)
test_ds  = cap_select(test_ds, None, seed=SEED)

len(train_ds), len(valid_ds), (len(test_ds) if test_ds is not None else None)

README.md: 0.00B [00:00, ?B/s]

de-en/train-00000-of-00003.parquet:   0%|          | 0.00/282M [00:00<?, ?B/s]

de-en/train-00001-of-00003.parquet:   0%|          | 0.00/267M [00:00<?, ?B/s]

de-en/train-00002-of-00003.parquet:   0%|          | 0.00/277M [00:00<?, ?B/s]

de-en/validation-00000-of-00001.parquet:   0%|          | 0.00/343k [00:00<?, ?B/s]

de-en/test-00000-of-00001.parquet:   0%|          | 0.00/475k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/4548885 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/2169 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/2999 [00:00<?, ? examples/s]

(200000, 2169, 2999)

In [21]:
import os
from dataclasses import dataclass
from typing import List, Dict

os.environ["TOKENIZERS_PARALLELISM"] = "false"

In [22]:
@dataclass
class Batch:
    src_input_ids: torch.Tensor  # (B, S)
    tgt_input_ids: torch.Tensor  # (B, T)
    labels: torch.Tensor         # (B, T), PAD -> -100

def preprocess_batch(batch):
    # batch["translation"]: List[{"de": "...", "en": "..."}]
    src_texts = [ex[SRC_LANG] for ex in batch["translation"]]
    tgt_texts = [ex[TGT_LANG] for ex in batch["translation"]]

    src_tok = tokenizer(src_texts, max_length=MAX_SRC_LEN, truncation=True, padding=False)
    tgt_tok = tokenizer(tgt_texts, max_length=MAX_TGT_LEN, truncation=True, padding=False)

    out_src, out_tgtin, out_labels = [], [], []
    for src_ids, tgt_ids in zip(src_tok["input_ids"], tgt_tok["input_ids"]):
        # teacher-forcing 입력: BOS + target[:-1]
        if len(tgt_ids) == 0:
            tgt_in = [BOS_ID]
            labels = [-100]
        else:
            tgt_in = [BOS_ID] + tgt_ids[:-1]
            labels = [(tid if tid != PAD_ID else -100) for tid in tgt_ids]

        out_src.append(src_ids)
        out_tgtin.append(tgt_in)
        out_labels.append(labels)

    return {
        "src_input_ids": out_src,     # List[List[int]]
        "tgt_input_ids": out_tgtin,   # List[List[int]]
        "labels": out_labels,         # List[List[int]] (-100 포함)
    }

In [23]:
# 병렬 전처리
num_proc = max(1, (os.cpu_count() or 2) // 2)
train_proc = train_ds.map(
    preprocess_batch, batched=True, batch_size=1024, num_proc=num_proc,
    remove_columns=train_ds.column_names
)
valid_proc = valid_ds.map(
    preprocess_batch, batched=True, batch_size=2048, num_proc=num_proc,
    remove_columns=valid_ds.column_names
)
test_proc = (test_ds.map(
    preprocess_batch, batched=True, batch_size=2048, num_proc=num_proc,
    remove_columns=test_ds.column_names
) if test_ds is not None else None)

Map (num_proc=6):   0%|          | 0/200000 [00:00<?, ? examples/s]

Map (num_proc=6):   0%|          | 0/2169 [00:00<?, ? examples/s]

Map (num_proc=6):   0%|          | 0/2999 [00:00<?, ? examples/s]

In [24]:
def pad_2d(seqs: List[List[int]], pad_val: int) -> torch.Tensor:
    maxlen = max(len(s) for s in seqs) if seqs else 0
    out = torch.full((len(seqs), maxlen), pad_val, dtype=torch.long)
    for i, s in enumerate(seqs):
        if len(s) > 0:
            out[i, :len(s)] = torch.tensor(s, dtype=torch.long)
    return out

def collate_fn(examples: List[Dict]):
    src = pad_2d([ex["src_input_ids"] for ex in examples], PAD_ID)
    tgt = pad_2d([ex["tgt_input_ids"] for ex in examples], PAD_ID)
    lab = pad_2d([ex["labels"] for ex in examples], -100)
    return Batch(src, tgt, lab)

In [25]:
# DataLoader
BATCH_SIZE  = 32
NUM_WORKERS = 0
PIN_MEM     = False
PERSISTENT  = False

train_loader = DataLoader(
    train_proc, batch_size=BATCH_SIZE, shuffle=True,
    collate_fn=collate_fn, num_workers=NUM_WORKERS,
    pin_memory=PIN_MEM, persistent_workers=PERSISTENT
)
valid_loader = DataLoader(
    valid_proc, batch_size=BATCH_SIZE, shuffle=False,
    collate_fn=collate_fn, num_workers=NUM_WORKERS,
    pin_memory=PIN_MEM, persistent_workers=PERSISTENT
)
test_loader = (DataLoader(
    test_proc, batch_size=BATCH_SIZE, shuffle=False,
    collate_fn=collate_fn, num_workers=NUM_WORKERS,
    pin_memory=PIN_MEM, persistent_workers=PERSISTENT
) if test_proc is not None else None)

In [26]:
# model = make_model(
#     src_vocab=VOCAB_SIZE,
#     tgt_vocab=VOCAB_SIZE,
#     N=6, d_model=512, d_ff=2048, h=8, dropout=0.1
# ).to(DEVICE)

In [27]:
class NoamOpt:
    def __init__(self, model_size, factor, warmup, optimizer):
        self.optimizer = optimizer
        self._step = 0
        self.warmup = warmup
        self.factor = factor
        self.model_size = model_size
        self._rate = 0.0
    def step(self):
        self._step += 1
        rate = self.rate()
        for p in self.optimizer.param_groups:
            p['lr'] = rate
        self._rate = rate
        self.optimizer.step()
    def rate(self, step=None):
        if step is None:
            step = self._step
        return self.factor * (
            self.model_size ** (-0.5) *
            min(step ** (-0.5), step * (self.warmup ** (-1.5)))
        )

def get_std_opt(model, factor=1.0, warmup=4000):
    d_model = model.src_embed[0].d_model  # Embeddings에 저장된 d_model 사용
    return NoamOpt(
        model_size=d_model, factor=factor, warmup=warmup,
        optimizer=torch.optim.Adam(
            model.parameters(), lr=0.0, betas=(0.9, 0.98), eps=1e-9
        )
    )

In [28]:
class LabelSmoothingLoss(nn.Module):
    def __init__(self, classes, smoothing=0.1, ignore_index=-100):
        super().__init__()
        self.confidence = 1.0 - smoothing
        self.smoothing = smoothing
        self.cls = classes
        self.ignore_index = ignore_index
    def forward(self, pred, target):
        logprobs = F.log_softmax(pred, dim=-1)
        ignore = target.eq(self.ignore_index)
        target = target.clone()
        target[ignore] = 0
        nll_loss = -logprobs.gather(dim=-1, index=target.unsqueeze(1)).squeeze(1)
        smooth_loss = -logprobs.mean(dim=-1)
        loss = self.confidence * nll_loss + self.smoothing * smooth_loss
        loss = loss.masked_fill(ignore, 0.0)
        return loss.mean()

In [29]:
criterion = LabelSmoothingLoss(classes=VOCAB_SIZE, smoothing=0.1, ignore_index=-100)

# 모델 만들고 weight tying
model = make_model(
          src_vocab=VOCAB_SIZE, tgt_vocab=VOCAB_SIZE,
          N=6, d_model=512, d_ff=2048, h=8, dropout=0.1
          ).to(DEVICE)

model.generator.proj.weight = model.tgt_embed[0].lut.weight  # tying

opt = get_std_opt(model)  # Noam

In [30]:
from tqdm.auto import tqdm

def train_one_epoch(epoch):
    model.train()
    total_loss, steps = 0.0, 0
    pbar = tqdm(train_loader, desc=f"Epoch {epoch} [train]", leave=False)
    for batch in pbar:
        src_ids = batch.src_input_ids.to(DEVICE)
        tgt_in  = batch.tgt_input_ids.to(DEVICE)
        labels  = batch.labels.to(DEVICE)

        src_mask = make_src_mask(src_ids, PAD_ID)
        tgt_mask = make_std_mask(tgt_in, PAD_ID)

        logits = model(src_ids, src_mask, tgt_in, tgt_mask)
        logits = model.generator(logits)

        loss = criterion(logits.reshape(-1, VOCAB_SIZE), labels.reshape(-1))

        opt.optimizer.zero_grad(set_to_none=True)
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        opt.step()

        total_loss += loss.item()
        steps += 1

        # 진행률 바에 현재 loss 표시
        pbar.set_postfix(loss=loss.item())

    return total_loss / max(1, steps)

In [31]:
@torch.no_grad()
def evaluate_ppl():
    model.eval()
    total_loss, steps = 0.0, 0
    pbar = tqdm(valid_loader, desc="Validation", leave=False)
    for batch in pbar:
        src_ids = batch.src_input_ids.to(DEVICE)
        tgt_in  = batch.tgt_input_ids.to(DEVICE)
        labels  = batch.labels.to(DEVICE)

        src_mask = make_src_mask(src_ids, PAD_ID)
        tgt_mask = make_std_mask(tgt_in, PAD_ID)

        logits = model(src_ids, src_mask, tgt_in, tgt_mask)
        logits = model.generator(logits)
        loss = criterion(logits.reshape(-1, VOCAB_SIZE), labels.reshape(-1))

        total_loss += loss.item()
        steps += 1
        pbar.set_postfix(loss=loss.item())

    avg = total_loss / max(1, steps)
    return math.exp(avg)

In [32]:
!pip install sacrebleu

Collecting sacrebleu
  Downloading sacrebleu-2.5.1-py3-none-any.whl.metadata (51 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/51.8 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m51.8/51.8 kB[0m [31m2.4 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting portalocker (from sacrebleu)
  Downloading portalocker-3.2.0-py3-none-any.whl.metadata (8.7 kB)
Collecting colorama (from sacrebleu)
  Downloading colorama-0.4.6-py2.py3-none-any.whl.metadata (17 kB)
Downloading sacrebleu-2.5.1-py3-none-any.whl (104 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m104.1/104.1 kB[0m [31m5.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading colorama-0.4.6-py2.py3-none-any.whl (25 kB)
Downloading portalocker-3.2.0-py3-none-any.whl (22 kB)
Installing collected packages: portalocker, colorama, sacrebleu
Successfully installed colorama-0.4.6 portalocker-3.2.0 sacrebleu-2.5.1


In [33]:
sacrebleu = evaluate.load("sacrebleu")

def detok(ids):
    out = []
    for t in ids:
        if t == EOS_ID:
            break
        out.append(int(t))
    return tokenizer.decode(out, skip_special_tokens=True)

@torch.no_grad()
def greedy_generate(src_ids, max_new_tokens=MAX_TGT_LEN):
    model.eval()
    B = src_ids.size(0)
    ys = torch.full((B, 1), BOS_ID, dtype=torch.long, device=src_ids.device)
    for _ in range(max_new_tokens):
        src_mask = make_src_mask(src_ids, PAD_ID)
        tgt_mask = make_std_mask(ys, PAD_ID)
        out = model(src_ids, src_mask, ys, tgt_mask)         # (B, T, D)
        logits = model.generator(out[:, -1:, :])              # (B, 1, V)
        next_tok = torch.argmax(logits.squeeze(1), dim=-1, keepdim=True)  # (B,1)
        ys = torch.cat([ys, next_tok], dim=1)
        if (next_tok.squeeze(1) == EOS_ID).all():
            break
    return ys[:, 1:]  # BOS 제외

@torch.no_grad()
def eval_bleu4(data_loader):
    preds_text, refs_text = [], []
    for batch in data_loader:
        src_ids = batch.src_input_ids.to(DEVICE)
        labels  = batch.labels
        gen = greedy_generate(src_ids)
        for g, y in zip(gen.cpu().tolist(), labels.cpu().tolist()):
            preds_text.append(detok(g))
            ref = [tid for tid in y if tid != -100 and tid != BOS_ID and tid != PAD_ID]
            refs_text.append(detok(ref))
    bleu = sacrebleu.compute(predictions=preds_text, references=[[r] for r in refs_text])
    print({"bleu": bleu["score"]})
    return bleu

Downloading builder script: 0.00B [00:00, ?B/s]

In [34]:
EPOCHS = 3
best_bleu = 0.0

for epoch in range(1, EPOCHS+1):
    tr_loss = train_one_epoch(epoch)
    ppl = evaluate_ppl()
    print(f"Epoch {epoch} | train_loss {tr_loss:.4f}")

    dev_bleu = eval_bleu4(valid_loader)["score"]
    if dev_bleu > best_bleu:
        best_bleu = dev_bleu
        torch.save(model.state_dict(), "best_bleu.pt")
        print(f"best BLEU: {best_bleu:.2f}")

print("Final BLEU:", best_bleu)

Epoch 1 [train]:   0%|          | 0/6250 [00:00<?, ?it/s]

Validation:   0%|          | 0/68 [00:00<?, ?it/s]

Epoch 1 | train_loss 2.2605
{'bleu': 1.1621268537935767}
best BLEU: 1.16


Epoch 2 [train]:   0%|          | 0/6250 [00:00<?, ?it/s]

Validation:   0%|          | 0/68 [00:00<?, ?it/s]

Epoch 2 | train_loss 1.7103
{'bleu': 3.662923758674582}
best BLEU: 3.66


Epoch 3 [train]:   0%|          | 0/6250 [00:00<?, ?it/s]

Validation:   0%|          | 0/68 [00:00<?, ?it/s]

Epoch 3 | train_loss 1.5726
{'bleu': 5.616842742814197}
best BLEU: 5.62
Final BLEU: 5.616842742814197


In [35]:
if test_loader is not None:
    if os.path.exists("best_bleu.pt"):
        model.load_state_dict(torch.load("best_bleu.pt", map_location=DEVICE))
        model.to(DEVICE)
    print("Test BLEU:")
    test_bleu = eval_bleu4(test_loader)
    print(test_bleu)
else:
    print("No test set")

Test BLEU:
{'bleu': 6.172236688728355}
{'score': 6.172236688728355, 'counts': [22612, 5716, 1934, 705], 'totals': [63624, 60625, 57626, 54628], 'precisions': [35.54004778071168, 9.428453608247423, 3.3561239718182763, 1.2905469722486638], 'bp': 1.0, 'sys_len': 63624, 'ref_len': 62562}
