In [1]:
!pip install datasets youtokentome unidecode --quiet

In [2]:
import torch
import torch.nn as nn

from datasets import load_dataset

from typing import Tuple, List
import youtokentome as yttm
from unidecode import unidecode
import re

import os
import math
from tqdm.auto import tqdm

In [None]:
def seed_everything(seed):
    import random
    import numpy as np
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True

seed_everything(seed=42)

In [3]:
PAD_token = 0
BOS_token = 1
EOS_token = 2
UNK_token = 3

device = 'cuda' if torch.cuda.is_available() else 'cpu'

In [4]:
class CFG:
    # model parameters
    hidden_dim: int = 512
    num_heads: int = 8
    max_seq_len: int = 512
    dropout: float = 0.1
    num_layers: int = 6
    de_dictionary_size: int = 10_000
    en_dictionary_size: int = 10_000
    batch_size: int = 32

    # adam parameters
    learning_rate: float = 1e-4
    betas: Tuple[float, float] = (0.9, 0.998) # изменить

    # warmup parameters
    num_cycles: float = 0.5
    warmup_ratio: float = 0.06

    # project parameters (for wandb)
    epochs: int = 10
    wandb: bool = False

    # special tokens
    PAD_token = 0
    BOS_token = 1
    EOS_token = 2
    UNK_token = 3

In [None]:
if CFG.wandb:

    import wandb

    try:
        if 'google.colab' in sys.modules:
            wandb.login()
        else:
            from kaggle_secrets import UserSecretsClient
            user_secrets = UserSecretsClient()
            secret_value_0 = user_secrets.get_secret("wandb_api")
            wandb.login(key=secret_value_0)
        anony = None
    except:
        anony = "must"
        print('If you want to use your W&B account, go to Add-ons -> Secrets and provide your W&B access token. \
        Use the Label name as wandb_api. \nGet your W&B access token from here: https://wandb.ai/authorize')


    def class2dict(f):
        return dict((name, getattr(f, name)) for name in dir(f) if not name.startswith('__'))

    run = wandb.init(project='expv6',
                     name=CFG.model,
                     config=class2dict(CFG),
                     group=CFG.model,
                     job_type="train",
                     anonymous=anony)

In [5]:
def load_tokenizer(tokenizer_path: str) -> yttm.BPE:
    if tokenizer_path.split('.')[-1] != 'tok':
        raise TypeError('Invalid tokenizer type. Only ".tok" files is allowed')
    tokenizer = yttm.BPE(model=tokenizer_path)
    return tokenizer

def clear_text(sent):
    wo_prep = re.sub(r'[^\w\s]','', sent)
    return wo_prep.lower()

def tokenize(
    sentence: str,
    tokenizer
):

    ans = tokenizer.encode(
        sentences=sentence,
        output_type=yttm.OutputType.ID,
        bos=True, eos=True
    )

    return ans

class TranslationDataset(torch.utils.data.Dataset):
    def __init__(
        self,
        de_tokenizer,
        en_tokenizer,
        data
    ) -> None:

        self.data = data
        self.en_tokenizer = en_tokenizer
        self.de_tokenizer = de_tokenizer

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        en_sentence = self.data[idx]['en']
        de_sentence = unidecode(self.data[idx]['de'])

        clear_en = clear_text(en_sentence)
        clear_de = clear_text(de_sentence)

        en_tokens = tokenize(clear_en, self.en_tokenizer)
        de_tokens = tokenize(clear_de, self.de_tokenizer)

        return de_tokens, en_tokens

In [6]:
tokenizer_en_path = '/kaggle/input/tokenizers/truncated_en_tokenizer.tok'
tokenizer_de_path = '/kaggle/input/tokenizers/truncated_de_tokenizer.tok'

de_tokenizer = load_tokenizer(tokenizer_de_path)
en_tokenizer = load_tokenizer(tokenizer_en_path)

In [7]:
wmt16 = load_dataset("wmt16", "de-en")
wmt16['train'] = wmt16['train'].select(range(1_150_000))

Downloading builder script:   0%|          | 0.00/1.41k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/2.38k [00:00<?, ?B/s]

Downloading extra modules:   0%|          | 0.00/8.73k [00:00<?, ?B/s]

Downloading and preparing dataset wmt16/de-en (download: 1.57 GiB, generated: 1.28 GiB, post-processed: Unknown size, total: 2.85 GiB) to /root/.cache/huggingface/datasets/wmt16/de-en/1.0.0/9e0038fe4cc117bd474d2774032cc133e355146ed0a47021b2040ca9db4645c0...


Downloading data files:   0%|          | 0/5 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/658M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/919M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/75.2M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/38.7M [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/5 [00:00<?, ?it/s]

Extracting data files: 0it [00:00, ?it/s]

Generating train split:   0%|          | 0/4548885 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/2169 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/2999 [00:00<?, ? examples/s]

Dataset wmt16 downloaded and prepared to /root/.cache/huggingface/datasets/wmt16/de-en/1.0.0/9e0038fe4cc117bd474d2774032cc133e355146ed0a47021b2040ca9db4645c0. Subsequent calls will reuse this data.


  0%|          | 0/3 [00:00<?, ?it/s]

In [8]:
wmt_train = wmt16['train']['translation']
wmt_val = wmt16['validation']['translation']
wmt_test = wmt16['test']['translation']

In [11]:
# сначала возвращает токены для немецкого, потом для английского
train_dataset = TranslationDataset(de_tokenizer, en_tokenizer, wmt_train)
val_dataset = TranslationDataset(de_tokenizer, en_tokenizer, wmt_val)
test_dataset = TranslationDataset(de_tokenizer, en_tokenizer, wmt_test)

In [12]:
def collate_fn(batch):
    data = sorted(batch, key=lambda x: -len(x[0]))

    de_batch, en_batch = [], []
    for de, en in data:
        de_batch.append(torch.tensor(de))
        en_batch.append(torch.tensor(en))

    de_batch = torch.nn.utils.rnn.pad_sequence(
        sequences=de_batch,
        batch_first=True,
        padding_value=CFG.PAD_token
    )
    en_batch = torch.nn.utils.rnn.pad_sequence(
        sequences=en_batch,
        batch_first=True,
        padding_value=CFG.PAD_token
    )

    return de_batch, en_batch

In [13]:
train_dataloader = torch.utils.data.DataLoader(
    dataset=train_dataset,
    batch_size=CFG.batch_size,
    shuffle=True,
    num_workers=2,
    pin_memory=True,
    collate_fn=collate_fn
)

val_dataloader = torch.utils.data.DataLoader(
    dataset=val_dataset,
    batch_size=2 * CFG.batch_size,
    shuffle=False,
    num_workers=2,
    pin_memory=True,
    collate_fn=collate_fn
)

test_dataloader = torch.utils.data.DataLoader(
    dataset=test_dataset,
    batch_size=2 * CFG.batch_size,
    shuffle=False,
    num_workers=2,
    pin_memory=True,
    collate_fn=collate_fn
)

In [14]:
def attention(K, V, Q, num_heads, mask=None):
    batch_size, hidden_dim = Q.size(0), Q.size(2)
    key_len, value_len, query_len = K.size(1), V.size(1), Q.size(1)

    K = K.reshape(batch_size, key_len, num_heads, -1)
    V = V.reshape(batch_size, value_len, num_heads, -1)
    Q = Q.reshape(batch_size, query_len, num_heads, -1)

    energy = torch.einsum('bqhd,bkhd->bhqk', [Q, K])

    if mask is not None:
        energy = energy.masked_fill(mask == 0, -1e12)

    attention = torch.softmax(energy / math.sqrt(hidden_dim // num_heads), dim=3)
    result_headed = torch.einsum('bhql,blhd->bqhd', [attention, V])
    return result_headed.reshape(batch_size, query_len, hidden_dim)


class AttentionModule(torch.nn.Module):
    def __init__(self, hidden_dim: int, num_heads: int):
        super().__init__()

        self.hidden_dim = hidden_dim
        self.num_heads = num_heads

        self.k_linear = torch.nn.Linear(hidden_dim, hidden_dim)
        self.v_linear = torch.nn.Linear(hidden_dim, hidden_dim)
        self.q_linear = torch.nn.Linear(hidden_dim, hidden_dim)

        self.out_linear = torch.nn.Linear(hidden_dim, hidden_dim)

    def forward(self, keys, values, query, mask):
        K = self.k_linear(keys)
        V = self.v_linear(values)
        Q = self.q_linear(query)
        attention_output = attention(K, V, Q, self.num_heads, mask)
        return self.out_linear(attention_output) + query


In [15]:
@torch.jit.script
def new_gelu(x):
    return 0.5 * x * (1.0 + torch.tanh(math.sqrt(2.0 / math.pi) * (x + 0.044715 * torch.pow(x, 3.0))))

class MLP(torch.nn.Module):
    def __init__(self, hidden_dim: int):
        super().__init__()

        self.linear_0 = torch.nn.Linear(hidden_dim, 4 * hidden_dim)
        self.linear_1 = torch.nn.Linear(4 * hidden_dim, hidden_dim)

        mean = 0
        std = (2 / (5 * hidden_dim)) ** 0.5
        torch.nn.init.normal_(self.linear_0.weight, mean=mean, std=std)
        torch.nn.init.normal_(self.linear_1.weight, mean=mean, std=std)

    def forward(self, hidden_state):
        return self.linear_1(new_gelu(self.linear_0(hidden_state))) + hidden_state

In [16]:
class EncoderTransformerLayer(torch.nn.Module):
    def __init__(self, hidden_dim: int, num_heads: int, dropout: float = 0.1):
        super().__init__()

        self.attention = AttentionModule(hidden_dim, num_heads)
        self.mlp = MLP(hidden_dim)

        self.norm = torch.nn.LayerNorm(hidden_dim)
        self.dropout = torch.nn.Dropout(dropout)

    def forward(self, value, key, query, mask):
        attn_output = self.dropout(self.norm(self.attention(value, key, query, mask)))
        mlp_output = self.dropout(self.norm(self.mlp(attn_output)))
        return mlp_output

In [17]:
#!g1.1
class Encoder(torch.nn.Module):
    def __init__(
        self,
        de_dictionary_size: int,
        hidden_dim: int,
        num_layers: int,
        num_heads: int,
        dropout: float = 0.1,
        max_seq_len: int = 512
    ):
        super().__init__()

        self.word_embedding = torch.nn.Embedding(de_dictionary_size, hidden_dim)
        self.pos_embedding = torch.nn.Embedding(max_seq_len, hidden_dim)
        self.layers = torch.nn.ModuleList(
            [
                EncoderTransformerLayer(
                    hidden_dim,
                    num_heads,
                    dropout
                )
                for _ in range(num_layers)
            ]
        )

        self.dropout = torch.nn.Dropout(dropout)

    def forward(self, inputs, mask):
        batch_size, seq_len = inputs.shape
        positions = torch.arange(0, seq_len).expand(batch_size, seq_len).to(device)
        hidden_dim = self.dropout(self.word_embedding(inputs) + self.pos_embedding(positions))

        for layer in self.layers:
            hidden_dim = layer(hidden_dim, hidden_dim, hidden_dim, mask)

        return hidden_dim

In [18]:
#!g1.1
class DecoderTransformerLayer(torch.nn.Module):
    def __init__(self, hidden_dim: int, num_heads: int, dropout: float = 0.1):
        super().__init__()

        self.self_attention = AttentionModule(hidden_dim, num_heads) # Аттенш на то, что происходит в переводе
        self.out_attention = EncoderTransformerLayer(hidden_dim, num_heads) # Аттенш на то, что происходит в оригинале

        self.norm = torch.nn.LayerNorm(hidden_dim)
        self.dropout = torch.nn.Dropout(dropout)

    def forward(self, hidden_state, encoder_layer_output, src_mask, trg_mask):
        self_attn_output = self.dropout(self.norm(self.self_attention(hidden_state, hidden_state, hidden_state, trg_mask)))
        output = self.out_attention(encoder_layer_output, encoder_layer_output, self_attn_output, src_mask)
        return output

In [19]:
#!g1.1
class Decoder(torch.nn.Module):
    def __init__(
        self,
        en_dictionary_size: int,
        hidden_dim: int,
        num_layers: int,
        num_heads: int,
        dropout: float = 0.1,
        max_seq_len: int = 512
    ):
        super().__init__()

        self.word_embedding = torch.nn.Embedding(en_dictionary_size, hidden_dim)
        self.pos_embedding = torch.nn.Embedding(max_seq_len, hidden_dim)
        self.layers = torch.nn.ModuleList(
            [
                DecoderTransformerLayer(hidden_dim, num_heads)
                for _ in range(num_layers)
            ]
        )

        self.lm_head = torch.nn.Linear(hidden_dim, en_dictionary_size)
        self.dropout = torch.nn.Dropout(dropout)

    def forward(self, inputs, encoder_output, src_mask, trg_mask):
        batch_size, seq_len = inputs.shape
        positions = torch.arange(0, seq_len).expand(batch_size, seq_len).to(device)
        inputs = self.dropout(self.word_embedding(inputs) + self.pos_embedding(positions))

        for layer in self.layers:
            inputs = layer(inputs, encoder_output, src_mask, trg_mask)

        return self.lm_head(inputs)

In [20]:
#!g1.1
class TranslationModel(torch.nn.Module):
    def __init__(
        self,
        de_dictionary_size: int,
        en_dictionary_size: int,
        hidden_dim: int = 512,
        num_layers: int = 6,
        num_heads: int = 8,
        dropout: float = 0.1
    ):
        super().__init__()

        self.encoder = Encoder(de_dictionary_size, hidden_dim, num_layers, num_heads, dropout)
        self.decoder = Decoder(en_dictionary_size, hidden_dim, num_layers, num_heads, dropout)

    def make_src_mask(self, src):
        src_pad_mask = (src != PAD_token).unsqueeze(1).unsqueeze(2)
        return src_pad_mask

    def make_trg_mask(self, trg):
        trg_pad_mask = (trg != PAD_token).unsqueeze(1).unsqueeze(2)

        batch_size, trg_len = trg.shape
        trg_sub_mask = torch.tril(torch.ones((trg_len, trg_len))).expand(
                batch_size, 1, trg_len, trg_len
            ).bool().to(device)
        trg_mask = trg_pad_mask & trg_sub_mask
        return trg_mask

    def forward(self, inputs):
        src_ids, trg_ids = inputs
        src_mask = self.make_src_mask(src_ids).to(device)
        trg_mask = self.make_trg_mask(trg_ids).to(device)
        encoder_output = self.encoder(src_ids, src_mask)
        decoder_output = self.decoder(trg_ids, encoder_output, src_mask, trg_mask)
        return decoder_output

In [22]:
#!g1.1
model = TranslationModel(10_000, 10_000).to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=1e-4)
criterion = torch.nn.CrossEntropyLoss(ignore_index=PAD_token)

In [23]:
#!g1.1
# add Train Loop
def train_epoch(model, optimizer, criterion, dataloader):
    model.train()
    running_loss = 0.0

    for src, trg in tqdm(dataloader):
        src = src.to(device)
        trg = trg.to(device) # (batch_size, seq_len)
        optimizer.zero_grad()

        output = model((src, trg[:, :-1])) # (batch_size, seq_len, en_vocab_len)
        trg = trg[:, 1:].contiguous().view(-1) # (batch_size * (seq_len - 1))
        en_vocab_len = output.size(-1)
        output = output.view(-1, en_vocab_len) # (batch_size * (seq_len - 1), en_vocab_len)

        loss = criterion(output, trg)
        loss.backward()
        optimizer.step()
        running_loss += loss.item()

    epoch_loss = running_loss / len(dataloader.dataset)
    return epoch_loss

def eval_epoch(model, criterion, dataloader):
    model.eval()
    running_loss = 0.0

    with torch.no_grad():
        for src, trg in dataloader:
            src = src.to(device)
            trg = trg.to(device)
            output = model((src, trg[:, :-1]))

            trg = trg[:, 1:].contiguous().view(-1)
            en_vocab_len = output.size(-1)
            output = output.view(-1, en_vocab_len)

            loss = criterion(output, trg)
            running_loss += loss.item()

    epoch_loss = running_loss / len(dataloader.dataset)
    return epoch_loss

In [24]:
#!g1.1
model_checkpoint = "model.pt"
path = "/kaggle/working/"

In [25]:
#!g1.1
num_epochs = 7
best_valid_loss = 1e12

for epoch in range(num_epochs):
    train_loss = train_epoch(model, optimizer, criterion, train_dataloader)
    valid_loss = eval_epoch(model, criterion, val_dataloader)

    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), os.path.join(path, model_checkpoint))
        print(f"Model is saved to {os.path.join(path, model_checkpoint)}")

    print(f"Epoch №{epoch + 1}:")
    print(f"Training Loss: {train_loss}")
    print(f"Validation Loss: {valid_loss}")
    print()

  0%|          | 0/35938 [00:00<?, ?it/s]

Model is saved to /kaggle/working/model.pt
Epoch №1:
Training Loss: 0.1199806475029821
Validation Loss: 0.07040959604877983



  0%|          | 0/35938 [00:00<?, ?it/s]

Model is saved to /kaggle/working/model.pt
Epoch №2:
Training Loss: 0.08039488737842311
Validation Loss: 0.06046745259402369



  0%|          | 0/35938 [00:00<?, ?it/s]

Model is saved to /kaggle/working/model.pt
Epoch №3:
Training Loss: 0.0691538540128003
Validation Loss: 0.05496959996256426



  0%|          | 0/35938 [00:00<?, ?it/s]

Model is saved to /kaggle/working/model.pt
Epoch №4:
Training Loss: 0.0634221480700244
Validation Loss: 0.05146551165178439



  0%|          | 0/35938 [00:00<?, ?it/s]

Model is saved to /kaggle/working/model.pt
Epoch №5:
Training Loss: 0.05982996261368627
Validation Loss: 0.048828825856090524



  0%|          | 0/35938 [00:00<?, ?it/s]

Model is saved to /kaggle/working/model.pt
Epoch №6:
Training Loss: 0.0573141716605684
Validation Loss: 0.04712311602672051



  0%|          | 0/35938 [00:00<?, ?it/s]

Model is saved to /kaggle/working/model.pt
Epoch №7:
Training Loss: 0.05540354325460351
Validation Loss: 0.045933016767805215



In [26]:
test_loss = eval_epoch(model, criterion, test_dataloader)
print(f"Test Loss: {test_loss}")

Test Loss: 0.043185467201695915


## Inference

In [82]:
def decode(src, model, max_len=512):
    model.eval()
    de_sentence = unidecode(src)
    clear_de = clear_text(de_sentence)
    de_tokens = tokenize(clear_de, de_tokenizer)
    src_tensor = torch.tensor(de_tokens).unsqueeze(0).to(device)
    src_mask = model.make_src_mask(src_tensor).to(device)
    
    encoded_src = model.encoder(src_tensor, src_mask)
    
    
    trg_ids = [1] # 1 in trg_idx - BOS_token
    while len(trg_ids) <= max_len:
        trg_tensor = torch.tensor(trg_ids).unsqueeze(0).to(device)
        trg_mask = model.make_trg_mask(trg_tensor)
        
        with torch.no_grad():
            predictions = model.decoder(trg_tensor, encoded_src, src_mask, trg_mask)
            last_pred_id = predictions[:, -1, :].argmax(-1).item()

            if last_pred_id == EOS_token:
                break

            trg_ids.append(last_pred_id)
    # return trg_ids
    return en_tokenizer.decode(trg_ids, ignore_ids=[1])

In [52]:
import evaluate
bleu = evaluate.load("bleu")



Downloading builder script:   0%|          | 0.00/5.94k [00:00<?, ?B/s]

Downloading extra modules:   0%|          | 0.00/1.55k [00:00<?, ?B/s]

Downloading extra modules:   0%|          | 0.00/3.34k [00:00<?, ?B/s]

In [93]:
references = [[sent['en']] for sent in wmt_test]
pred = [x[0] for x in predictions]

test_bleu = bleu.compute(predictions=pr, references=ref)
print(test_bleu)

ValueError: Predictions and/or references don't match the expected format.
Expected format:
Feature option 0: {'predictions': Value(dtype='string', id='sequence'), 'references': Sequence(feature=Value(dtype='string', id='sequence'), length=-1, id='references')}
Feature option 1: {'predictions': Value(dtype='string', id='sequence'), 'references': Value(dtype='string', id='sequence')},
Input predictions: ['obama is starting to receive neanyahu'],
Input references: ['Obama receives Netanyahu']

In [100]:
pr = [pred[i][0] for i in range(len(pred))]

In [103]:
references[:10]

[['Obama receives Netanyahu'],
 ['The relationship between Obama and Netanyahu is not exactly friendly.'],
 ["The two wanted to talk about the implementation of the international agreement and about Teheran's destabilising activities in the Middle East."],
 ['The meeting was also planned to cover the conflict with the Palestinians and the disputed two state solution.'],
 ['Relations between Obama and Netanyahu have been strained for years.'],
 ['Washington criticises the continuous building of settlements in Israel and accuses Netanyahu of a lack of initiative in the peace process.'],
 ["The relationship between the two has further deteriorated because of the deal that Obama negotiated on Iran's atomic programme, ."],
 ['In March, at the invitation of the Republicans, Netanyahu made a controversial speech to the US Congress, which was partly seen as an affront to Obama.'],
 ['The speech had not been agreed with Obama, who had rejected a meeting with reference to the election that was a

In [104]:
ref = [references[i][0] for i in range(len(references))]

In [110]:
test_bleu = bleu.compute(predictions=pr, references=ref, max_order=3)
print(test_bleu)

{'bleu': 0.17930259445166924, 'precisions': [0.43591240875912407, 0.19226451309165973, 0.09318577801604473], 'brevity_penalty': 0.9037269643039942, 'length_ratio': 0.9080771690138648, 'translation_length': 58225, 'reference_length': 64119}


In [111]:
de_sentences = [
    "Gutach: Noch mehr Sicherheit für Fußgänger",
    "Zwei Anlagen so nah beieinander: Absicht oder Schildbürgerstreich?",
    "Dies bestätigt auch Peter Arnold vom Landratsamt Offenburg.",
    "Daher sei der Bau einer weiteren Ampel mehr als notwendig: \"Sicherheit geht hier einfach vor\", so Arnold.",
    "Pro Fahrtrichtung gibt es drei Lichtanlagen.",
    "Drückt der Fußgänger den Ampelknopf, testet der obere Radarsensor die Verkehrslage.",
    "Ein weiteres Radarsensor prüft, ob die Grünphase für den Fußgänger beendet werden kann.",
    "Josef Winkler schreibt sich seit mehr als 30 Jahren die Nöte seiner Kindheit und Jugend von der Seele.",
    "Dabei scheint Regisseur Fresacher dem Text wenig zu vertrauen.",
    "Sie werden hart angefasst, mit dem Kopf unter Wasser getaucht, mit ihren Abendroben an die Wand getackert.",
]

en_sentences = [decode(example, model) for example in de_sentences]

In [112]:
en_sentences

[['goodness more security for pedestrians'],
 ['two plants are so close to each other and to the same extent to each other or to the public'],
 ['this is also confirmed by peters arhmad by the council of the country of england'],
 ['so building another ampse more than necessary safety is simply taking place here in this case'],
 ['there are three lights per ferry'],
 ['if the pedestrian is the amphalk number the high radar service is the transport situation'],
 ['another radar stamp test is whether the green phase for pedestrians can be ended'],
 ['josef winkler has been hiding by the soul for over thirty years'],
 ['the text seems to be a little confidence for regulatory parties'],
 ['they are being hardhashed with the head of water with their night bombs to the wall']]

In [115]:
de = 'Wie ist dein Name?'
print(decode(de, model))

['what is the name']
