In [1]:
!pip install rotary-embedding-torch --quiet

In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.cuda.amp import autocast

from datasets import load_dataset

from typing import Tuple, List
import youtokentome as yttm
from unidecode import unidecode
import re

import os
import math
from tqdm.auto import tqdm

from transformers import get_linear_schedule_with_warmup

In [2]:
torch.__version__

'2.0.1'

In [3]:
! python3 --version

Python 3.11.4


In [4]:
def seed_everything(seed):
    import random
    import numpy as np
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True

seed_everything(seed=42)

In [5]:
PAD_token = 0
BOS_token = 1
EOS_token = 2
UNK_token = 3

device = 'cuda' if torch.cuda.is_available() else 'cpu'

In [6]:
class CFG:
    # model parameters
    hidden_dim: int = 512
    num_heads: int = 8
    max_seq_len: int = 512
    dropout: float = 0.1
    num_layers: int = 6
    de_dictionary_size: int = 12_000
    en_dictionary_size: int = 12_000
    batch_size: int = 20

    # optimizer parameters
    learning_rate: float = 1e-4
    betas: Tuple[float, float] = (0.9, 0.998) # изменить
    eps: float = 1e-6

    # scheduler parameters
    num_cycles: float = 0.5
    warmup_ratio: int = 0

    # project parameters (for wandb)
    epochs: int = 5
    wandb: bool = True
    model: str = 'mega_tranformer'

    # special tokens
    PAD_token = 0
    BOS_token = 1
    EOS_token = 2
    UNK_token = 3

In [7]:
key = 'sasha'

if CFG.wandb:

    import wandb

    try:
        if key == 'sasha':
            w_key = '0f3df295b5449226265568f9d229b4fb7d62fbf0'
            wandb.login(key=w_key)
            anony = None
        else:
            from kaggle_secrets import UserSecretsClient
            user_secrets = UserSecretsClient()
            secret_value_0 = user_secrets.get_secret("wandb_api")
            wandb.login(key=secret_value_0)
            anony = None
    except:
        anony = "must"
        print('If you want to use your W&B account, go to Add-ons -> Secrets and provide your W&B access token. \
        Use the Label name as wandb_api. \nGet your W&B access token from here: https://wandb.ai/authorize')


    def class2dict(f):
        return dict((name, getattr(f, name)) for name in dir(f) if not name.startswith('__'))

    run = wandb.init(project='transtalor',
                     name=CFG.model,
                     config=class2dict(CFG),
                     group=CFG.model,
                     job_type="train",
                     anonymous=anony)

[34m[1mwandb[0m: Currently logged in as: [33mwhatisslove7[0m. Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /home/agar1us/.netrc


In [8]:
def load_tokenizer(tokenizer_path: str) -> yttm.BPE:
    if tokenizer_path.split('.')[-1] != 'tok':
        raise TypeError('Invalid tokenizer type. Only ".tok" files is allowed')
    if not os.path.exists(tokenizer_path):
        raise FileNotFoundError("No such file in directory.")
    tokenizer = yttm.BPE(model=tokenizer_path)
    return tokenizer

def clear_text(sent: str) -> str:
    wo_prep = re.sub(r'[^\w\s]','', sent)
    return wo_prep.lower()

def tokenize(
    sentence: str,
    tokenizer
) -> List[int]:

    ans = tokenizer.encode(
        sentences=sentence,
        output_type=yttm.OutputType.ID,
        bos=True, eos=True
    )

    return ans


In [9]:
tokenizer_en_path = '/home/agar1us/Documents/4500k_en_sentences.tok'
tokenizer_de_path = '/home/agar1us/Documents/4500k_de_sentences.tok'

de_tokenizer = load_tokenizer(tokenizer_de_path)
en_tokenizer = load_tokenizer(tokenizer_en_path)

In [10]:
wmt16 = load_dataset("wmt16", "de-en")

# wmt16['train'] = wmt16['train'].select(range(300_000))

Found cached dataset wmt16 (/home/agar1us/.cache/huggingface/datasets/wmt16/de-en/1.0.0/746749a11d25c02058042da7502d973ff410e73457f3d305fc1177dc0e8c4227)


  0%|          | 0/3 [00:00<?, ?it/s]

In [11]:
def tokenize_pipeline(data):
    en_sentence = data['en']
    de_sentence = unidecode(data['de'])

    clear_en = clear_text(en_sentence)
    clear_de = clear_text(de_sentence)

    en_tokens = tokenize(clear_en, en_tokenizer)
    de_tokens = tokenize(clear_de, de_tokenizer)
    
    return de_tokens, en_tokens

def tokenize_sentence(example):
    return {"de_tokens": tokenize_pipeline(example["translation"])[0], "en_tokens": tokenize_pipeline(example["translation"])[1]}

In [12]:
num_proc = os.cpu_count() // 2
wmt16 = wmt16.map(tokenize_sentence, num_proc=num_proc)

Loading cached processed dataset at /home/agar1us/.cache/huggingface/datasets/wmt16/de-en/1.0.0/746749a11d25c02058042da7502d973ff410e73457f3d305fc1177dc0e8c4227/cache-1c80317fa3b1799d_*_of_00004.arrow
Loading cached processed dataset at /home/agar1us/.cache/huggingface/datasets/wmt16/de-en/1.0.0/746749a11d25c02058042da7502d973ff410e73457f3d305fc1177dc0e8c4227/cache-bdd640fb06671ad1_*_of_00004.arrow
Loading cached processed dataset at /home/agar1us/.cache/huggingface/datasets/wmt16/de-en/1.0.0/746749a11d25c02058042da7502d973ff410e73457f3d305fc1177dc0e8c4227/cache-3eb13b9046685257_*_of_00004.arrow


In [13]:
wmt16 = wmt16.filter(lambda example: len(example['de_tokens']) <= 512 and len(example["en_tokens"]) <= 512, num_proc=num_proc)

Loading cached processed dataset at /home/agar1us/.cache/huggingface/datasets/wmt16/de-en/1.0.0/746749a11d25c02058042da7502d973ff410e73457f3d305fc1177dc0e8c4227/cache-eabc699f2b863429_*_of_00004.arrow
Loading cached processed dataset at /home/agar1us/.cache/huggingface/datasets/wmt16/de-en/1.0.0/746749a11d25c02058042da7502d973ff410e73457f3d305fc1177dc0e8c4227/cache-2c2ae2a1481a9383_*_of_00004.arrow
Loading cached processed dataset at /home/agar1us/.cache/huggingface/datasets/wmt16/de-en/1.0.0/746749a11d25c02058042da7502d973ff410e73457f3d305fc1177dc0e8c4227/cache-775140c67dc362df_*_of_00004.arrow


In [14]:
wmt_train_de = wmt16['train']['de_tokens']
wmt_train_en = wmt16['train']['en_tokens']

wmt_val_de = wmt16['validation']['de_tokens']
wmt_val_en = wmt16['validation']['en_tokens']

wmt_test_de = wmt16['test']['de_tokens']
wmt_test_en = wmt16['test']['en_tokens']

In [15]:
class TranslationDataset(torch.utils.data.Dataset):
    def __init__(
        self,
        de_data, en_data
    ) -> None:

        self.de_data = de_data
        self.en_data = en_data

    def __len__(self) -> int:
        return len(self.de_data)

    def __getitem__(self, idx: int) -> Tuple[List[int], List[int]]:
        # return self.data[idx][0], self.data[idx][1]
        return self.de_data[idx], self.en_data[idx]

In [16]:
# сначала возвращает токены для немецкого, потом для английского
train_dataset = TranslationDataset(wmt_train_de, wmt_train_en)
val_dataset = TranslationDataset(wmt_val_de, wmt_val_en)
test_dataset = TranslationDataset(wmt_test_de, wmt_test_en)

In [17]:
def collate_fn(batch):
    data = sorted(batch, key=lambda x: -len(x[0]))

    de_batch, en_batch = [], []
    for de, en in data:
        de_batch.append(torch.tensor(de))
        en_batch.append(torch.tensor(en))

    de_batch = torch.nn.utils.rnn.pad_sequence(
        sequences=de_batch,
        batch_first=True,
        padding_value=CFG.PAD_token
    )
    en_batch = torch.nn.utils.rnn.pad_sequence(
        sequences=en_batch,
        batch_first=True,
        padding_value=CFG.PAD_token
    )

    return de_batch, en_batch

In [18]:
train_dataloader = torch.utils.data.DataLoader(
    dataset=train_dataset,
    batch_size=CFG.batch_size,
    shuffle=True,
    num_workers=2,
    pin_memory=True,
    collate_fn=collate_fn
)

val_dataloader = torch.utils.data.DataLoader(
    dataset=val_dataset,
    batch_size=2 * CFG.batch_size,
    shuffle=False,
    num_workers=2,
    pin_memory=True,
    collate_fn=collate_fn
)

test_dataloader = torch.utils.data.DataLoader(
    dataset=test_dataset,
    batch_size=2 * CFG.batch_size,
    shuffle=False,
    num_workers=2,
    pin_memory=True,
    collate_fn=collate_fn
)

In [19]:
from rotary_embedding_torch import RotaryEmbedding

In [20]:
class AttentionModule(nn.Module):

    def __init__(self, hidden_dim: int, num_heads: int):
        super().__init__()

        self.hidden_dim = hidden_dim
        self.num_heads = num_heads
        self.dropout = 0.1

        # flash attention make GPU go brrrrr but support is only in PyTorch >= 2.0
        self.flash = hasattr(F, 'scaled_dot_product_attention')

        self.out_linear = torch.nn.Linear(self.hidden_dim, self.hidden_dim) # c_proj
        self.resid_dropout = nn.Dropout(self.dropout)
        
        # self.rotary_dim = num_heads // 2
        # self.rotary_emb = RotaryEmbedding(dim=num_heads // 2)
        
        self.rotary_emb = RotaryEmbedding(dim=32)

    def forward(self, K, V, Q, mask):

        batch_size, hidden_dim = Q.size(0), Q.size(2)
        key_len, value_len, query_len = K.size(1), V.size(1), Q.size(1)

        assert hidden_dim % self.num_heads == 0, "Hidden_dim must be equal to num_heads * head_dim"

        K = K.reshape(batch_size, key_len, self.num_heads, -1).transpose(1,
                                                                         2)  # (batch_size, num_heads, seq_len, head_dim)
        V = V.reshape(batch_size, value_len, self.num_heads, -1).transpose(1,
                                                                           2)  # (batch_size, num_heads, seq_len, head_dim)
        Q = Q.reshape(batch_size, query_len, self.num_heads, -1).transpose(1,
                                                                           2)  # (batch_size, num_heads, seq_len, head_dim)
        Q = self.rotary_emb.rotate_queries_or_keys(Q)
        K = self.rotary_emb.rotate_queries_or_keys(K)
        # causal self-attention; Self-attend: (B, nh, T, hs) x (B, nh, hs, T) -> (B, nh, T, T)
        if self.flash:
            # efficient attention using Flash Attention CUDA kernels
            with autocast():
                # print(Q.shape, K.shape, V.shape, mask.shape)
                y = F.scaled_dot_product_attention(Q, K, V,
                                                   attn_mask=mask,
                                                   dropout_p=self.dropout if self.training else 0,
                                                   is_causal=False)
        else:
            raise ImportError("PyTorch >= 2.0 must be installed for using Flash Attention")

        y = y.transpose(1, 2).contiguous().view(batch_size, query_len, hidden_dim)
        # y = self.resid_dropout(self.out_linear(y)) + y
        return self.resid_dropout(self.out_linear(y.to(torch.float32)))

In [21]:
@torch.jit.script
def new_gelu(x):
    return 0.5 * x * (1.0 + torch.tanh(math.sqrt(2.0 / math.pi) * (x + 0.044715 * torch.pow(x, 3.0))))

class MLP(torch.nn.Module):
    def __init__(self, hidden_dim: int):
        super().__init__()

        self.linear_0 = torch.nn.Linear(hidden_dim, 4 * hidden_dim)
        self.linear_1 = torch.nn.Linear(4 * hidden_dim, hidden_dim)

    def forward(self, hidden_state):
        return self.linear_1(new_gelu(self.linear_0(hidden_state))) + hidden_state

In [22]:
class LayerNorm(nn.Module):
    def __init__(self,
                 hidden_dim: int,
                 bias: bool = False
    ):
        super().__init__()
        self.weight = nn.Parameter(torch.ones(hidden_dim))
        self.bias = nn.Parameter(torch.zeros(hidden_dim)) if bias else None

    def forward(self, input):
        return F.layer_norm(input, self.weight.shape, self.weight, self.bias, 1e-5)

In [23]:
#!g1.1
class EncoderTransformerLayer(torch.nn.Module):
    def __init__(self, hidden_dim: int, num_heads: int, dropout: float = 0.1):
        super().__init__()

        self.attention = AttentionModule(hidden_dim, num_heads)
        self.mlp = MLP(hidden_dim)

        self.norm_for_v = LayerNorm(hidden_dim)
        self.norm_for_k = LayerNorm(hidden_dim)
        self.norm_for_q = LayerNorm(hidden_dim)

        self.norm_for_attention = LayerNorm(hidden_dim)
        # self.norm_for_mlp = LayerNorm(hidden_dim)

    def forward(self, value, key, query, mask):
        attn_output = self.attention(self.norm_for_v(value),
                                     self.norm_for_k(key),
                                     self.norm_for_q(query),
                                     mask) + query
        attn_output = self.norm_for_attention(attn_output)

        mlp_output = self.mlp(attn_output)

        return mlp_output

In [24]:
#!g1.1
class Encoder(torch.nn.Module):
    def __init__(
        self,
        de_dictionary_size: int,
        hidden_dim: int,
        num_layers: int,
        num_heads: int,
        dropout: float = 0.1,
        max_seq_len: int = 512
    ):
        super().__init__()

        self.word_embedding = torch.nn.Embedding(de_dictionary_size, hidden_dim)
        self.pos_embedding = torch.nn.Embedding(max_seq_len, hidden_dim)
        self.layers = torch.nn.ModuleList(
            [
                EncoderTransformerLayer(
                    hidden_dim,
                    num_heads,
                    dropout
                )
                for _ in range(num_layers)
            ]
        )

        self.dropout = torch.nn.Dropout(dropout)

    def forward(self, inputs, mask):
        batch_size, seq_len = inputs.shape
        positions = torch.arange(0, seq_len).expand(batch_size, seq_len).to(device)
        hidden_dim = self.dropout(self.word_embedding(inputs) + self.pos_embedding(positions))

        for layer in self.layers:
            hidden_dim = layer(hidden_dim, hidden_dim, hidden_dim, mask)

        return hidden_dim

In [25]:
#!g1.1
class DecoderTransformerLayer(torch.nn.Module):
    def __init__(self, hidden_dim: int, num_heads: int, dropout: float = 0.1):
        super().__init__()

        self.self_attention = AttentionModule(hidden_dim, num_heads) # Аттенш на то, что происходит в переводе
        self.out_attention = EncoderTransformerLayer(hidden_dim, num_heads) # Аттенш на то, что происходит в оригинале

        self.norm_for_hidden = LayerNorm(hidden_dim)

        self.norm_for_attention = LayerNorm(hidden_dim)
        self.norm_for_encoder = LayerNorm(hidden_dim)

    def forward(self, hidden_state, encoder_layer_output, src_mask, trg_mask):
        encoder_layer_output = self.norm_for_encoder(encoder_layer_output)

        normalized_hidden_state = self.norm_for_hidden(hidden_state)
        self_attn_output = self.self_attention(normalized_hidden_state,
                                               normalized_hidden_state,
                                               normalized_hidden_state,
                                               trg_mask) + hidden_state

        self_attn_output = self.norm_for_attention(self_attn_output)

        output = self.out_attention(encoder_layer_output, encoder_layer_output, self_attn_output, src_mask)

        return output

In [26]:
#!g1.1
class Decoder(torch.nn.Module):
    def __init__(
        self,
        en_dictionary_size: int,
        hidden_dim: int,
        num_layers: int,
        num_heads: int,
        dropout: float = 0.1,
        max_seq_len: int = 512
    ):
        super().__init__()

        self.word_embedding = torch.nn.Embedding(en_dictionary_size, hidden_dim)
        self.pos_embedding = torch.nn.Embedding(max_seq_len, hidden_dim)
        self.layers = torch.nn.ModuleList(
            [
                DecoderTransformerLayer(hidden_dim, num_heads)
                for _ in range(num_layers)
            ]
        )

        self.lm_head = torch.nn.Linear(hidden_dim, en_dictionary_size)
        self.dropout = torch.nn.Dropout(dropout)

        # weight tying
        self.word_embedding.weight = self.lm_head.weight

    def forward(self, inputs, encoder_output, src_mask, trg_mask):
        batch_size, seq_len = inputs.shape
        positions = torch.arange(0, seq_len).expand(batch_size, seq_len).to(device)
        inputs = self.dropout(self.word_embedding(inputs) + self.pos_embedding(positions))

        for layer in self.layers:
            inputs = layer(inputs, encoder_output, src_mask, trg_mask)

        return self.lm_head(inputs)

In [27]:
#!g1.1
class TranslationModel(torch.nn.Module):
    def __init__(
        self,
        de_dictionary_size: int,
        en_dictionary_size: int,
        hidden_dim: int = 512,
        num_layers: int = 6,
        num_heads: int = 8,
        dropout: float = 0.1
    ):
        super().__init__()

        self.encoder = Encoder(de_dictionary_size, hidden_dim, num_layers, num_heads, dropout)
        self.decoder = Decoder(en_dictionary_size, hidden_dim, num_layers, num_heads, dropout)

        self._small_init(hidden_dim)

    def make_src_mask(self, src):
        src_pad_mask = (src != PAD_token).unsqueeze(1).unsqueeze(2)
        return src_pad_mask

    def make_trg_mask(self, trg):
        trg_pad_mask = (trg != PAD_token).unsqueeze(1).unsqueeze(2)

        batch_size, trg_len = trg.shape
        trg_sub_mask = torch.tril(torch.ones((trg_len, trg_len))).expand(
                batch_size, 1, trg_len, trg_len
            ).bool().to(device)
        trg_mask = trg_pad_mask & trg_sub_mask
        return trg_mask

    def forward(self, inputs):
        src_ids, trg_ids = inputs
        src_mask = self.make_src_mask(src_ids).to(device)
        trg_mask = self.make_trg_mask(trg_ids).to(device)
        encoder_output = self.encoder(src_ids, src_mask)
        decoder_output = self.decoder(trg_ids, encoder_output, src_mask, trg_mask)
        return decoder_output

    def _small_init(self, hidden_dim: int):
        mean = 0
        std = (2 / (5 * hidden_dim)) ** 0.5
        for _, p in self.named_parameters():
            if p.dim() > 1:
                torch.nn.init.normal_(p, mean=mean, std=std)

In [28]:
import math

num_training_steps = len(train_dataloader) * CFG.epochs
num_warmup_steps = math.ceil(0.02 * num_training_steps)
print(num_training_steps, num_warmup_steps)

1137150 22743


In [29]:
#!g1.1
model = TranslationModel(CFG.de_dictionary_size, CFG.en_dictionary_size).to(device)
optimizer = torch.optim.AdamW(model.parameters(), lr=CFG.learning_rate, betas=CFG.betas, eps=CFG.eps)
criterion = torch.nn.CrossEntropyLoss(ignore_index=PAD_token)

scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=num_warmup_steps,
    num_training_steps=num_training_steps
)

In [30]:
#!g1.1
# add Train Loop
def train_epoch(model, optimizer, criterion, dataloader):
    model.train()
    scaler = torch.cuda.amp.GradScaler(enabled=True)
    running_loss = 0.0

    for src, trg in tqdm(dataloader):
        src = src.to(device)
        trg = trg.to(device) # (batch_size, seq_len)
        optimizer.zero_grad()
        
        with autocast():
            output = model((src, trg[:, :-1])) # (batch_size, seq_len, en_vocab_len)
            trg = trg[:, 1:].contiguous().view(-1) # (batch_size * (seq_len - 1))
            en_vocab_len = output.size(-1)
            output = output.view(-1, en_vocab_len) # (batch_size * (seq_len - 1), en_vocab_len)
            loss = criterion(output, trg)
            
        """loss.backward()
        optimizer.step()
        scheduler.step()"""
        scaler.scale(loss).backward()
        scaler.step(optimizer)
        scaler.update()
        scheduler.step()
        
        if CFG.wandb:
            wandb.log({f"learning rate": optimizer.param_groups[0]["lr"], 
                       f"loss": loss.item()})
        running_loss += loss.item()

    epoch_loss = running_loss / len(dataloader.dataset)
    
    return epoch_loss

def eval_epoch(model, criterion, dataloader):
    model.eval()
    running_loss = 0.0

    with torch.no_grad():
        for src, trg in dataloader:
            src = src.to(device)
            trg = trg.to(device)
            output = model((src, trg[:, :-1]))

            trg = trg[:, 1:].contiguous().view(-1)
            en_vocab_len = output.size(-1)
            output = output.view(-1, en_vocab_len)

            loss = criterion(output, trg)
            running_loss += loss.item()

    epoch_loss = running_loss / len(dataloader.dataset)
    return epoch_loss

In [31]:
#!g1.1
model_checkpoint = "model_1450k_samples.pt"
path = "/home/agar1us/Documents/"

In [32]:
#!g1.1
num_epochs = 7
best_valid_loss = 1e12

for epoch in range(CFG.epochs):
    train_loss = train_epoch(model, optimizer, criterion, train_dataloader)
    valid_loss = eval_epoch(model, criterion, val_dataloader)
    
    if CFG.wandb:
        wandb.log({f"epoch": epoch+1, 
                           f"avg_train_loss": train_loss, 
                           f"avg_val_loss": valid_loss})

    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), os.path.join(path, model_checkpoint))
        print(f"Model is saved to {os.path.join(path, model_checkpoint)}")

    print(f"Epoch №{epoch + 1}:")
    print(f"Training Loss: {train_loss}")
    print(f"Validation Loss: {valid_loss}")
    print()

  0%|          | 0/227430 [00:00<?, ?it/s]

Model is saved to /home/agar1us/Documents/model_1450k_samples.pt
Epoch №1:
Training Loss: 0.1659594972041533
Validation Loss: 0.06095040756856972



  0%|          | 0/227430 [00:00<?, ?it/s]

Model is saved to /home/agar1us/Documents/model_1450k_samples.pt
Epoch №2:
Training Loss: 0.12568035511872483
Validation Loss: 0.053273658205034984



  0%|          | 0/227430 [00:00<?, ?it/s]

Model is saved to /home/agar1us/Documents/model_1450k_samples.pt
Epoch №3:
Training Loss: 0.11697097834619313
Validation Loss: 0.049975897706340894



  0%|          | 0/227430 [00:00<?, ?it/s]

Model is saved to /home/agar1us/Documents/model_1450k_samples.pt
Epoch №4:
Training Loss: 0.11138810323581869
Validation Loss: 0.04734422908833962



  0%|          | 0/227430 [00:00<?, ?it/s]

RuntimeError: DataLoader worker (pid 40350) is killed by signal: Killed. 

In [33]:
test_loss = eval_epoch(model, criterion, test_dataloader)
print(f"Test Loss: {test_loss}")

Test Loss: 0.03972892039376284


## Inference

In [34]:
def decode(src, model, max_len=512):
    model.eval()
    de_sentence = unidecode(src)
    clear_de = clear_text(de_sentence)
    de_tokens = tokenize(clear_de, de_tokenizer)
    src_tensor = torch.tensor(de_tokens).unsqueeze(0).to(device)
    src_mask = model.make_src_mask(src_tensor).to(device)

    encoded_src = model.encoder(src_tensor, src_mask)


    trg_ids = [1] # 1 in trg_idx - BOS_token
    while len(trg_ids) <= max_len:
        trg_tensor = torch.tensor(trg_ids).unsqueeze(0).to(device)
        trg_mask = model.make_trg_mask(trg_tensor)

        with torch.no_grad():
            predictions = model.decoder(trg_tensor, encoded_src, src_mask, trg_mask)
            last_pred_id = predictions[:, -1, :].argmax(-1).item()

            if last_pred_id == EOS_token:
                break

            trg_ids.append(last_pred_id)
    # return trg_ids
    return en_tokenizer.decode(trg_ids, ignore_ids=[1])

In [91]:
!pip install evaluate --quiet

In [35]:
wmt_test = wmt16['test']['translation']

In [36]:
import evaluate
bleu = evaluate.load("bleu")

In [37]:
wmt_test[1]

{'de': 'Das Verhältnis zwischen Obama und Netanyahu ist nicht gerade freundschaftlich.',
 'en': 'The relationship between Obama and Netanyahu is not exactly friendly.'}

In [38]:
decode(wmt_test[1]['de'], model)

['the relationship between obama and netanyahu is not exactly friendly']

In [39]:
pr = [decode(ex['de'], model)[0] for ex in tqdm(wmt_test)]

  0%|          | 0/2999 [00:00<?, ?it/s]

In [40]:
ref = [ex['en'] for ex in wmt_test]

In [41]:
test_bleu = bleu.compute(predictions=pr, references=ref)
print(test_bleu)

{'bleu': 0.16901773603961578, 'precisions': [0.5041616696797752, 0.25559884996595295, 0.14108965489582706, 0.08104014676394045], 'brevity_penalty': 0.8626830253910908, 'length_ratio': 0.8713017982189367, 'translation_length': 55867, 'reference_length': 64119}


In [43]:
test_bleu_trigramm = bleu.compute(predictions=pr, references=ref, max_order=3)
print(test_bleu_trigramm)

{'bleu': 0.22684326156224746, 'precisions': [0.5041616696797752, 0.25559884996595295, 0.14108965489582706], 'brevity_penalty': 0.8626830253910908, 'length_ratio': 0.8713017982189367, 'translation_length': 55867, 'reference_length': 64119}


In [42]:
wandb.finish()

VBox(children=(Label(value='0.002 MB of 0.002 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
avg_train_loss,█▃▂▁
avg_val_loss,█▄▂▁
epoch,▁▃▆█
learning rate,▆███▇▇▇▇▇▇▆▆▆▆▆▅▅▅▅▅▅▄▄▄▄▄▃▃▃▃▃▂▂▂▂▂▂▁▁▁
loss,█▇▄▅▅▃▄▃▄▄▅▄▃▂▃▂▃▃▃▄▃▃▂▂▄▂▃▂▄▄▁▃▂▂▄▂▁▃▄▁

0,1
avg_train_loss,0.11139
avg_val_loss,0.04734
epoch,4.0
learning rate,0.0
loss,1.80619


In [49]:
de_sentences = [
    "Gutach: Noch mehr Sicherheit für Fußgänger",
    "Zwei Anlagen so nah beieinander: Absicht oder Schildbürgerstreich?",
    "Dies bestätigt auch Peter Arnold vom Landratsamt Offenburg.",
    "Daher sei der Bau einer weiteren Ampel mehr als notwendig: \"Sicherheit geht hier einfach vor\", so Arnold.",
    "Pro Fahrtrichtung gibt es drei Lichtanlagen.",
    "Drückt der Fußgänger den Ampelknopf, testet der obere Radarsensor die Verkehrslage.",
    "Ein weiteres Radarsensor prüft, ob die Grünphase für den Fußgänger beendet werden kann.",
    "Josef Winkler schreibt sich seit mehr als 30 Jahren die Nöte seiner Kindheit und Jugend von der Seele.",
    "Dabei scheint Regisseur Fresacher dem Text wenig zu vertrauen.",
    "Sie werden hart angefasst, mit dem Kopf unter Wasser getaucht, mit ihren Abendroben an die Wand getackert.",
]

en_sentences = [decode(example, model) for example in de_sentences]

In [50]:
en_sentences

[['good even more security for pedestrians'],
 ['two plants are so close to the legacy or shieldy'],
 ['this also confirms peter arnold from the landratzück'],
 ['therefore the construction of another amplore is more than necessary security here is simply so arnold'],
 ['the road is about 3 light plants'],
 ['the pedestrian lights are pressed on the top wheel sensor'],
 ['another radar sensor checks whether the green phase can be completed for pedestrians'],
 ['josef winkler has been writing for more than 30 years the nond of his childhood and youth from the soul'],
 ['the text seems to be too little trusted'],
 ['they are hardened with the head under water with their evening baking on the wall']]

In [51]:
de = 'Wie ist dein Name?'
print(decode(de, model))

['how is your name']
