In [None]:
!pip3 install torch==2.3.0 torchvision==0.18.0 torchaudio==2.3.0 --index-url https://download.pytorch.org/whl/cpu

In [None]:
!pip3 install torchtext==0.18.0

In [None]:
!pip3 install pandas

In [None]:
!pip3 install -U spacy
!python3 -m spacy download en_core_web_sm
!python3 -m spacy download uk_core_news_sm

In [None]:
import math
import time
from collections import Counter
from pathlib import Path

import pandas as pd
import torch
import torch.nn as nn
import torchtext
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import DataLoader
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import vocab
from tqdm import tqdm

# Silence torchtext's deprecation warning so it does not clutter cell output.
torchtext.disable_torchtext_deprecation_warning()



In [2]:
# Language codes; they double as the column names of the loaded frame.
SRC, TGT = "en", "uk"

# Keep only columns 1 and 3 of the tab-separated file and label them with
# the language codes above.
csv = pd.read_csv(
    "../data/en-uk.tsv",
    sep="\t",
    names=[SRC, TGT],
    usecols=[1, 3],
)
csv.head()

Unnamed: 0,en,uk
0,Let's try something.,Давайте щось спробуємо!
1,I have to go to sleep.,Маю піти спати.
2,Muiriel is 20 now.,Мюріел зараз двадцять.
3,"The password is ""Muiriel"".","Пароль - ""Muiriel""."
4,I will be back soon.,Я скоро повернуся.


In [3]:
# One spaCy-backed tokenizer per language, created in source/target order.
en_tokenizer, uk_tokenizer = (
    get_tokenizer("spacy", language=spacy_model)
    for spacy_model in ("en_core_web_sm", "uk_core_news_sm")
)

In [None]:
def build_vocab(data, tokenizer, min_freq=1):
    """Build a torchtext vocabulary from an iterable of raw sentences.

    Args:
        data: Iterable of strings; each one is passed to ``tokenizer``.
        tokenizer: Callable that turns one string into an iterable of tokens.
        min_freq: Minimum number of occurrences a token needs to be kept
            (forwarded to ``torchtext.vocab.vocab``). The default of 1 keeps
            every token that was seen.

    Returns:
        The vocabulary, with ``<unk>``, ``<pad>``, ``<bos>`` and ``<eos>``
        inserted first and ``<unk>`` registered as the default index.

    Side effects:
        Prints the number of distinct tokens counted (special symbols not
        included).
    """
    counter = Counter()
    for sentence in data:
        counter.update(tokenizer(sentence))
    print(len(counter))

    built = vocab(
        counter,
        min_freq=min_freq,
        specials=["<unk>", "<pad>", "<bos>", "<eos>"],
        special_first=True,
    )
    # Without a default index, looking up a token that is not in the vocabulary
    # raises instead of falling back to <unk> (e.g. for sentences that were not
    # part of `data`, or tokens dropped by `min_freq`).
    built.set_default_index(built["<unk>"])
    return built


# One vocabulary per language; each call prints its distinct-token count.
en_vocab = build_vocab(data=csv[SRC], tokenizer=en_tokenizer)
uk_vocab = build_vocab(data=csv[TGT], tokenizer=uk_tokenizer)

18523
49759


In [None]:
def data_process(csv):
    """Convert every sentence pair into a pair of token-id tensors.

    Args:
        csv: Mapping-like object whose ``SRC`` and ``TGT`` entries are
            equally ordered iterables of raw sentences (the DataFrame loaded
            earlier in the notebook).

    Returns:
        A list of ``(en_tensor, uk_tensor)`` tuples. Each tensor is 1-D with
        dtype ``torch.long`` and holds the vocabulary ids of one tokenized
        sentence; no ``<bos>``/``<eos>`` markers are added here.
    """

    def encode(sentence, tokenizer, vocabulary):
        # Tokenize, then look every token up in the matching vocabulary.
        token_ids = [vocabulary[token] for token in tokenizer(sentence)]
        return torch.tensor(token_ids, dtype=torch.long)

    return [
        (
            encode(en_sentence, en_tokenizer, en_vocab),
            encode(uk_sentence, uk_tokenizer, uk_vocab),
        )
        for en_sentence, uk_sentence in zip(csv[SRC], csv[TGT])
    ]


# Numericalize the whole corpus once, up front.
data = data_process(csv=csv)


A module that was compiled using NumPy 1.x cannot be run in
NumPy 2.0.2 as it may crash. To support both 1.x and 2.x
versions of NumPy, modules must be compiled with NumPy 2.0.
Some module may need to rebuild instead e.g. with 'pybind11>=2.12'.

If you are a user of the module, the easiest solution will be to
downgrade to 'numpy<2' or try to upgrade the affected module.
We expect that some modules will need time to support NumPy 2.

Traceback (most recent call last):  File "<frozen runpy>", line 198, in _run_module_as_main
  File "<frozen runpy>", line 88, in _run_code
  File "d:\uni\lyng\lab2\.venv\Lib\site-packages\ipykernel_launcher.py", line 18, in <module>
    app.launch_new_instance()
  File "d:\uni\lyng\lab2\.venv\Lib\site-packages\traitlets\config\application.py", line 1075, in launch_instance
    app.start()
  File "d:\uni\lyng\lab2\.venv\Lib\site-packages\ipykernel\kernelapp.py", line 739, in start
    self.io_loop.start()
  File "d:\uni\lyng\lab2\.venv\Lib\site-packages\tor

In [None]:
BATCH_SIZE = 64

# Special-token ids are read from the English vocabulary, but generate_batch
# applies the same ids to the Ukrainian tensors as well, so the two
# vocabularies must agree on them.
PAD_IDX = en_vocab["<pad>"]
BOS_IDX = en_vocab["<bos>"]
EOS_IDX = en_vocab["<eos>"]
for _special in ("<pad>", "<bos>", "<eos>"):
    assert en_vocab[_special] == uk_vocab[_special], (
        f"special token {_special} has different ids in the two vocabularies"
    )
print(PAD_IDX, uk_vocab["<pad>"])

1 1


In [None]:
def generate_batch(data_batch):
    """Collate ``(en_tensor, uk_tensor)`` samples into two padded batch tensors.

    Every sequence is wrapped as ``<bos> ... <eos>`` (using ``BOS_IDX`` and
    ``EOS_IDX``) and the sequences of each language are then padded with
    ``PAD_IDX`` to the longest one in the batch.

    Args:
        data_batch: Iterable of ``(en_item, uk_item)`` pairs of 1-D id tensors.

    Returns:
        ``(en_batch, uk_batch)``; each has shape ``(max_len, batch_size)``
        because ``pad_sequence`` is used with its default layout.
    """
    bos_marker = torch.tensor([BOS_IDX])
    eos_marker = torch.tensor([EOS_IDX])

    def add_markers(sequence):
        return torch.cat([bos_marker, sequence, eos_marker], dim=0)

    en_sequences, uk_sequences = [], []
    for en_item, uk_item in data_batch:
        en_sequences.append(add_markers(en_item))
        uk_sequences.append(add_markers(uk_item))

    return (
        pad_sequence(en_sequences, padding_value=PAD_IDX),
        pad_sequence(uk_sequences, padding_value=PAD_IDX),
    )

In [None]:
# Seed the split so the same sentences land in train/valid/test on every run;
# otherwise results (and a saved model's held-out test set) cannot be reproduced.
SPLIT_SEED = 42
train_dataset, valid_dataset, test_dataset = torch.utils.data.random_split(
    data,
    [0.8, 0.1, 0.1],
    generator=torch.Generator().manual_seed(SPLIT_SEED),
)

# Only the training loader is shuffled; validation and test keep a fixed order.
train_loader = DataLoader(
    train_dataset, batch_size=BATCH_SIZE, shuffle=True, collate_fn=generate_batch
)
valid_loader = DataLoader(
    valid_dataset, batch_size=BATCH_SIZE, shuffle=False, collate_fn=generate_batch
)
test_loader = DataLoader(
    test_dataset, batch_size=BATCH_SIZE, shuffle=False, collate_fn=generate_batch
)

In [10]:
# Sanity check: number of training batches, then the first sentence pair of
# the first test batch decoded back to tokens.
print(len(train_loader), len(train_loader) * BATCH_SIZE)
for en, uk in test_loader:
    en_itos, uk_itos = en_vocab.get_itos(), uk_vocab.get_itos()
    print(f"Shape of en: {en.shape} {en.dtype}")
    print(f"Shape of uk: {uk.shape} {uk.dtype}")
    # Column 0 of each (seq_len, batch) tensor is the first sentence.
    print([en_itos[idx] for idx in en[:, 0].tolist()])
    print([uk_itos[idx] for idx in uk[:, 0].tolist()])
    break

2683 171712
Shape of en: torch.Size([29, 64]) torch.int64
Shape of uk: torch.Size([27, 64]) torch.int64
['<bos>', 'You', "'ve", 'got', 'to', 'wake', 'up', '.', '<eos>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>']
['<bos>', 'Тобі', 'потрібно', 'прокидатися', '.', '<eos>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>']


In [None]:
class PositionalEncoding(nn.Module):
    """Add fixed sinusoidal position encodings to a sequence, then apply dropout.

    The table is precomputed for ``max_len`` positions and stored as the
    non-trainable buffer ``pe`` with shape ``(max_len, 1, d_model)``, so it
    broadcasts over dim 1 of an input indexed as ``(position, batch, d_model)``.

    Args:
        d_model: Size of the last (feature) dimension. Both even and odd
            values are supported.
        dropout: Dropout probability applied to the sum.
        max_len: Number of positions to precompute.
    """

    def __init__(self, d_model, dropout=0.1, max_len=5000):
        super().__init__()
        self.dropout = nn.Dropout(p=dropout)

        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(
            torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model)
        )
        angles = position * div_term  # (max_len, ceil(d_model / 2))

        pe = torch.zeros(max_len, d_model)
        pe[:, 0::2] = torch.sin(angles)
        # An odd d_model has one fewer odd-indexed column than even-indexed
        # ones, so only the first d_model // 2 frequencies get a cosine column.
        pe[:, 1::2] = torch.cos(angles[:, : d_model // 2])
        self.register_buffer("pe", pe.unsqueeze(1))

    def forward(self, x):
        """Return ``dropout(x + pe[:len])`` where ``len = x.size(0)``.

        Raises:
            ValueError: If ``x`` has more positions than were precomputed.
        """
        seq_len = x.size(0)
        if seq_len > self.pe.size(0):
            # Fail with a clear message instead of an opaque broadcasting
            # error (or silent broadcasting when max_len == 1).
            raise ValueError(
                f"sequence length {seq_len} exceeds max_len {self.pe.size(0)}"
            )
        return self.dropout(x + self.pe[:seq_len])

In [None]:
class TransformerModel(nn.Module):
    """Sequence-to-sequence translation model built around ``nn.Transformer``.

    Token ids are embedded, combined with positional encodings, passed through
    the encoder/decoder stack and projected to target-vocabulary logits.
    Tensors use the sequence-first layout: token-id inputs are
    ``(seq_len, batch)`` and the returned logits are
    ``(tgt_len, batch, tgt_vocab_size)``.

    Args (keyword-only):
        src_vocab_size: Number of rows in the source embedding table.
        tgt_vocab_size: Number of rows in the target embedding table and
            width of the output projection.
        emb_size: Embedding / model dimension.
        nhead: Number of attention heads.
        ff_dim: Hidden size of the feed-forward sublayers.
        num_enc_layers: Number of encoder layers.
        num_dec_layers: Number of decoder layers.
        pad_idx: Id of the padding token (zero embedding, masked in attention).
        dropout: Dropout probability for the positional encoding and the
            transformer stack. Defaults to 0.1, the value both used before
            this parameter existed.
    """

    def __init__(
        self,
        *,
        src_vocab_size,
        tgt_vocab_size,
        emb_size,
        nhead,
        ff_dim,
        num_enc_layers,
        num_dec_layers,
        pad_idx,
        dropout=0.1,
    ):
        super().__init__()
        self.src_emb = nn.Embedding(src_vocab_size, emb_size, padding_idx=pad_idx)
        self.tgt_emb = nn.Embedding(tgt_vocab_size, emb_size, padding_idx=pad_idx)
        self.positional_encoding = PositionalEncoding(emb_size, dropout=dropout)

        self.transformer = nn.Transformer(
            d_model=emb_size,
            nhead=nhead,
            num_encoder_layers=num_enc_layers,
            num_decoder_layers=num_dec_layers,
            dim_feedforward=ff_dim,
            dropout=dropout,
        )
        self.fc_out = nn.Linear(emb_size, tgt_vocab_size)
        self.pad_idx = pad_idx

    def forward(self, src, tgt, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask):
        """Return target-vocabulary logits for every position of ``tgt``.

        Args:
            src: Source token ids, ``(src_len, batch)``.
            tgt: Decoder-input token ids, ``(tgt_len, batch)``.
            src_mask: Attention mask for encoder self-attention.
            tgt_mask: Attention mask for decoder self-attention.
            src_padding_mask: ``(batch, src_len)`` mask, True at padding.
            tgt_padding_mask: ``(batch, tgt_len)`` mask, True at padding.
        """
        src = self.positional_encoding(self.src_emb(src))
        tgt = self.positional_encoding(self.tgt_emb(tgt))

        output = self.transformer(
            src,
            tgt,
            src_mask=src_mask,
            tgt_mask=tgt_mask,
            src_key_padding_mask=src_padding_mask,
            tgt_key_padding_mask=tgt_padding_mask,
            # The encoder output has one vector per source position, padding
            # included; without this mask the decoder's cross-attention would
            # attend to those padding positions.
            memory_key_padding_mask=src_padding_mask,
        )
        return self.fc_out(output)

    def create_masks(self, src, tgt):
        """Build the four masks ``forward`` expects, in the order it takes them.

        Returns:
            ``(src_mask, tgt_mask, src_padding_mask, tgt_padding_mask)``:
            an all-False boolean ``(src_len, src_len)`` mask (nothing
            blocked), a float causal ``(tgt_len, tgt_len)`` mask, and two
            boolean ``(batch, seq_len)`` masks that are True where the token
            equals ``pad_idx``.
        """
        src_seq_len, tgt_seq_len = src.size(0), tgt.size(0)
        src_mask = torch.zeros(
            (src_seq_len, src_seq_len), dtype=torch.bool, device=src.device
        )
        tgt_mask = self.generate_square_subsequent_mask(tgt_seq_len, device=tgt.device)

        src_padding_mask = (src == self.pad_idx).transpose(0, 1)
        tgt_padding_mask = (tgt == self.pad_idx).transpose(0, 1)
        return src_mask, tgt_mask, src_padding_mask, tgt_padding_mask

    def generate_square_subsequent_mask(self, size, device=None):
        """Return a ``(size, size)`` float mask: ``-inf`` above the diagonal, 0 elsewhere.

        Args:
            size: Side length of the square mask.
            device: Optional device to allocate the mask on directly
                (default: PyTorch's default device, as before).
        """
        return torch.triu(
            torch.full((size, size), float("-inf"), device=device), diagonal=1
        )

In [14]:
# Architecture hyper-parameters.
EMB_SIZE = 192
NHEAD = 6
FFN_HID_DIM = 192
NUM_ENCODER_LAYERS = 3
NUM_DECODER_LAYERS = 3

# Vocabulary sizes determine the embedding tables and the output projection.
SRC_VOCAB_SIZE = len(en_vocab)
TGT_VOCAB_SIZE = len(uk_vocab)

# Training setup: use the GPU when one is visible, otherwise the CPU.
NUM_EPOCHS = 10
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")

print(DEVICE, SRC_VOCAB_SIZE, TGT_VOCAB_SIZE)

cuda 18527 49763


In [15]:
# Build the network from the configuration constants, then move it to DEVICE.
model = TransformerModel(
    src_vocab_size=SRC_VOCAB_SIZE,
    tgt_vocab_size=TGT_VOCAB_SIZE,
    emb_size=EMB_SIZE,
    nhead=NHEAD,
    ff_dim=FFN_HID_DIM,
    num_enc_layers=NUM_ENCODER_LAYERS,
    num_dec_layers=NUM_DECODER_LAYERS,
    pad_idx=PAD_IDX,
)
model = model.to(DEVICE)

# Adam with its default settings; padding positions are excluded from the loss.
optimizer = torch.optim.Adam(model.parameters())
criterion = nn.CrossEntropyLoss(ignore_index=PAD_IDX)



In [16]:
def count_parameters(model: nn.Module):
    """Return the total number of elements in the model's trainable parameters.

    Parameters whose ``requires_grad`` flag is False are not counted.
    """
    trainable = 0
    for param in model.parameters():
        if param.requires_grad:
            trainable += param.numel()
    return trainable


trainable_params = count_parameters(model)
print(f"The model has {trainable_params:,} trainable parameters")

The model has 24,501,155 trainable parameters


In [17]:
def train_epoch(model, optimizer, criterion, train_loader):
    """Run one optimisation pass over ``train_loader``.

    Each batch is moved to ``DEVICE``. The decoder is fed the target without
    its last position and the loss compares the logits against the target
    without its first position (teacher forcing).

    Args:
        model: Module exposing ``create_masks(src, tgt)`` and a forward call
            taking ``(src, tgt, *masks)``.
        optimizer: Optimizer stepping the model's parameters.
        criterion: Loss taking ``(N, vocab)`` logits and ``(N,)`` targets.
        train_loader: Sized iterable of ``(src, tgt)`` batches.

    Returns:
        The sum of the per-batch losses divided by the number of batches.
    """
    model.train()
    num_batches = len(train_loader)
    running_loss = 0
    for src_batch, tgt_batch in tqdm(train_loader, total=num_batches):
        src_batch = src_batch.to(DEVICE)
        tgt_batch = tgt_batch.to(DEVICE)
        decoder_input, expected = tgt_batch[:-1, :], tgt_batch[1:, :]

        masks = model.create_masks(src_batch, decoder_input)
        logits = model(src_batch, decoder_input, *masks)

        optimizer.zero_grad()
        vocab_dim = logits.size(-1)
        loss = criterion(logits.view(-1, vocab_dim), expected.view(-1))
        loss.backward()
        optimizer.step()
        running_loss += float(loss)
    return running_loss / num_batches


def evaluate(model, criterion, valid_loader):
    """Compute the mean per-batch loss over ``valid_loader`` without updating weights.

    The model is switched to eval mode and gradients are disabled. Batches are
    moved to ``DEVICE`` and scored like in training: the decoder input is the
    target minus its last position, the reference is the target minus its
    first position.

    Args:
        model: Module exposing ``create_masks(src, tgt)`` and a forward call
            taking ``(src, tgt, *masks)``.
        criterion: Loss taking ``(N, vocab)`` logits and ``(N,)`` targets.
        valid_loader: Sized iterable of ``(src, tgt)`` batches.

    Returns:
        The sum of the per-batch losses divided by the number of batches.
    """
    model.eval()
    num_batches = len(valid_loader)
    running_loss = 0
    with torch.no_grad():
        for src_batch, tgt_batch in tqdm(valid_loader, total=num_batches):
            src_batch = src_batch.to(DEVICE)
            tgt_batch = tgt_batch.to(DEVICE)
            decoder_input, expected = tgt_batch[:-1, :], tgt_batch[1:, :]

            masks = model.create_masks(src_batch, decoder_input)
            logits = model(src_batch, decoder_input, *masks)

            vocab_dim = logits.size(-1)
            batch_loss = criterion(logits.view(-1, vocab_dim), expected.view(-1))
            running_loss += float(batch_loss)
    return running_loss / num_batches

In [18]:
def epoch_time(start_time: int, end_time: int):
    """Split the span between two timestamps into whole minutes and seconds.

    Args:
        start_time: Start timestamp in seconds.
        end_time: End timestamp in seconds.

    Returns:
        ``(minutes, seconds)`` as ints; fractional seconds are truncated.
    """
    elapsed = end_time - start_time
    whole_minutes = int(elapsed / 60)
    leftover_seconds = int(elapsed - whole_minutes * 60)
    return whole_minutes, leftover_seconds

In [19]:
# Train for NUM_EPOCHS epochs; after each one report wall-clock time plus the
# loss and perplexity (exp of the loss) on the training and validation data.
for epoch in range(NUM_EPOCHS):
    start_time = time.time()
    train_loss = train_epoch(model, optimizer, criterion, train_loader)
    valid_loss = evaluate(model, criterion, valid_loader)
    end_time = time.time()

    epoch_mins, epoch_secs = epoch_time(start_time, end_time)
    print(f"Epoch: {epoch+1:02} | Time: {epoch_mins}m {epoch_secs}s")

    train_ppl = math.exp(train_loss)
    print(f"\tTrain Loss: {train_loss:.3f} | Train PPL: {train_ppl:7.3f}")

    valid_ppl = math.exp(valid_loss)
    print(f"\t Val. Loss: {valid_loss:.3f} |  Val. PPL: {valid_ppl:7.3f}")

  attn_output = scaled_dot_product_attention(q, k, v, attn_mask, dropout_p, is_causal)
100%|██████████| 2683/2683 [13:10<00:00,  3.40it/s]
100%|██████████| 336/336 [00:15<00:00, 21.11it/s]


Epoch: 01 | Time: 13m 26s
	Train Loss: 3.660 | Train PPL:  38.855
	 Val. Loss: 2.641 |  Val. PPL:  14.031


100%|██████████| 2683/2683 [19:45<00:00,  2.26it/s]
100%|██████████| 336/336 [00:13<00:00, 25.50it/s]


Epoch: 02 | Time: 19m 58s
	Train Loss: 2.336 | Train PPL:  10.343
	 Val. Loss: 2.052 |  Val. PPL:   7.780


100%|██████████| 2683/2683 [19:10<00:00,  2.33it/s]
100%|██████████| 336/336 [00:13<00:00, 25.57it/s]


Epoch: 03 | Time: 19m 23s
	Train Loss: 1.802 | Train PPL:   6.061
	 Val. Loss: 1.853 |  Val. PPL:   6.381


100%|██████████| 2683/2683 [19:08<00:00,  2.34it/s]
100%|██████████| 336/336 [00:15<00:00, 22.18it/s]


Epoch: 04 | Time: 19m 23s
	Train Loss: 1.502 | Train PPL:   4.491
	 Val. Loss: 1.758 |  Val. PPL:   5.803


100%|██████████| 2683/2683 [19:13<00:00,  2.33it/s]
100%|██████████| 336/336 [00:13<00:00, 25.63it/s]


Epoch: 05 | Time: 19m 27s
	Train Loss: 1.310 | Train PPL:   3.705
	 Val. Loss: 1.713 |  Val. PPL:   5.544


100%|██████████| 2683/2683 [18:50<00:00,  2.37it/s]
100%|██████████| 336/336 [00:13<00:00, 25.61it/s]


Epoch: 06 | Time: 19m 3s
	Train Loss: 1.169 | Train PPL:   3.219
	 Val. Loss: 1.692 |  Val. PPL:   5.433


100%|██████████| 2683/2683 [18:54<00:00,  2.36it/s]
100%|██████████| 336/336 [00:13<00:00, 25.61it/s]


Epoch: 07 | Time: 19m 7s
	Train Loss: 1.070 | Train PPL:   2.914
	 Val. Loss: 1.666 |  Val. PPL:   5.294


100%|██████████| 2683/2683 [18:56<00:00,  2.36it/s]
100%|██████████| 336/336 [00:13<00:00, 25.58it/s]


Epoch: 08 | Time: 19m 9s
	Train Loss: 0.993 | Train PPL:   2.699
	 Val. Loss: 1.643 |  Val. PPL:   5.169


100%|██████████| 2683/2683 [19:04<00:00,  2.34it/s]
100%|██████████| 336/336 [00:13<00:00, 25.59it/s]


Epoch: 09 | Time: 19m 17s
	Train Loss: 0.934 | Train PPL:   2.544
	 Val. Loss: 1.640 |  Val. PPL:   5.156


100%|██████████| 2683/2683 [18:59<00:00,  2.36it/s]
100%|██████████| 336/336 [00:13<00:00, 25.61it/s]

Epoch: 10 | Time: 19m 12s
	Train Loss: 0.885 | Train PPL:   2.422
	 Val. Loss: 1.620 |  Val. PPL:   5.056





In [20]:
# Make sure the output directory exists first: otherwise the save fails only
# after the whole training run has finished.
MODEL_PATH = Path("../outputs/model.pth")
MODEL_PATH.parent.mkdir(parents=True, exist_ok=True)

# NOTE(review): this pickles the whole module object, so loading it later
# presumably needs the TransformerModel / PositionalEncoding classes to be
# importable; saving `model.state_dict()` would avoid that. Confirm with
# whatever code loads this file before changing the format.
torch.save(model, MODEL_PATH)