In [1]:
!pip3 install torch==2.3.0 torchvision==0.18.0 torchaudio==2.3.0 --index-url https://download.pytorch.org/whl/cpu

Looking in indexes: https://download.pytorch.org/whl/cpu


In [2]:
!pip3 install torchtext==0.18.0



In [3]:
!pip3 install pandas



In [4]:
!pip3 install -U spacy
!python3 -m spacy download en_core_web_sm
!python3 -m spacy download uk_core_news_sm



Python was not found; run without arguments to install from the Microsoft Store, or disable this shortcut from Settings > Manage App Execution Aliases.
Python was not found; run without arguments to install from the Microsoft Store, or disable this shortcut from Settings > Manage App Execution Aliases.


In [1]:
import torchtext
import pandas as pd
import torch
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import vocab
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import DataLoader
import torch.nn as nn
from collections import Counter
import math
import time

from tqdm import tqdm

torchtext.disable_torchtext_deprecation_warning()



In [2]:
SRC = "en"
TGT = "uk"
csv = pd.read_csv("../data/en-uk.tsv", sep="\t", usecols=[1, 3], names=[SRC, TGT])
csv.head()

Unnamed: 0,en,uk
0,Let's try something.,Давайте щось спробуємо!
1,I have to go to sleep.,Маю піти спати.
2,Muiriel is 20 now.,Мюріел зараз двадцять.
3,"The password is ""Muiriel"".","Пароль - ""Muiriel""."
4,I will be back soon.,Я скоро повернуся.


In [3]:
en_tokenizer = get_tokenizer("spacy", language="en_core_web_sm")
uk_tokenizer = get_tokenizer("spacy", language="uk_core_news_sm")

In [4]:
def build_vocab(data, tokenizer):
    counter = Counter()
    for d in data:
        counter.update(tokenizer(d))
    print(len(counter))
    return vocab(
        counter, specials=["<unk>", "<pad>", "<bos>", "<eos>"], special_first=True
    )


en_vocab = build_vocab(csv[SRC], en_tokenizer)
uk_vocab = build_vocab(csv[TGT], uk_tokenizer)

18523
49759


In [5]:
def data_process(csv):
    raw_en_iter = iter(csv[SRC])
    raw_uk_iter = iter(csv[TGT])
    data = []
    for raw_en, raw_uk in zip(raw_en_iter, raw_uk_iter):
        en_tensor_ = torch.tensor(
            [en_vocab[token] for token in en_tokenizer(raw_en)], dtype=torch.long
        )
        uk_tensor_ = torch.tensor(
            [uk_vocab[token] for token in uk_tokenizer(raw_uk)], dtype=torch.long
        )
        data.append((en_tensor_, uk_tensor_))
    return data


data = data_process(csv)


A module that was compiled using NumPy 1.x cannot be run in
NumPy 2.0.2 as it may crash. To support both 1.x and 2.x
versions of NumPy, modules must be compiled with NumPy 2.0.
Some module may need to rebuild instead e.g. with 'pybind11>=2.12'.

If you are a user of the module, the easiest solution will be to
downgrade to 'numpy<2' or try to upgrade the affected module.
We expect that some modules will need time to support NumPy 2.

Traceback (most recent call last):  File "<frozen runpy>", line 198, in _run_module_as_main
  File "<frozen runpy>", line 88, in _run_code
  File "d:\uni\lyng\lab2\.venv\Lib\site-packages\ipykernel_launcher.py", line 18, in <module>
    app.launch_new_instance()
  File "d:\uni\lyng\lab2\.venv\Lib\site-packages\traitlets\config\application.py", line 1075, in launch_instance
    app.start()
  File "d:\uni\lyng\lab2\.venv\Lib\site-packages\ipykernel\kernelapp.py", line 739, in start
    self.io_loop.start()
  File "d:\uni\lyng\lab2\.venv\Lib\site-packages\tor

In [6]:
BATCH_SIZE = 64
PAD_IDX = en_vocab["<pad>"]
BOS_IDX = en_vocab["<bos>"]
EOS_IDX = en_vocab["<eos>"]
print(PAD_IDX, uk_vocab["<pad>"], 3)

1 1 3


In [7]:
def generate_batch(data_batch):
    en_batch, uk_batch = [], []
    for en_item, uk_item in data_batch:
        en_batch.append(
            torch.cat(
                [torch.tensor([BOS_IDX]), en_item, torch.tensor([EOS_IDX])], dim=0
            )
        )
        uk_batch.append(
            torch.cat(
                [torch.tensor([BOS_IDX]), uk_item, torch.tensor([EOS_IDX])], dim=0
            )
        )
    en_batch = pad_sequence(en_batch, padding_value=PAD_IDX)
    uk_batch = pad_sequence(uk_batch, padding_value=PAD_IDX)
    return en_batch, uk_batch

In [8]:
train_dataset, valid_dataset, test_dataset = torch.utils.data.random_split(
    data, [0.8, 0.1, 0.1]
)

train_loader = DataLoader(
    train_dataset, batch_size=BATCH_SIZE, shuffle=True, collate_fn=generate_batch
)
valid_loader = DataLoader(
    valid_dataset, batch_size=BATCH_SIZE, shuffle=False, collate_fn=generate_batch
)
test_loader = DataLoader(
    test_dataset, batch_size=BATCH_SIZE, shuffle=False, collate_fn=generate_batch
)

In [9]:
print(len(train_loader), len(train_loader) * BATCH_SIZE)
for en, uk in test_loader:
    en_itos = en_vocab.get_itos()
    uk_itos = uk_vocab.get_itos()
    print(f"Shape of en: {en.shape} {en.dtype}")
    print(f"Shape of uk: {uk.shape} {uk.dtype}")
    print([en_itos[w] for w in en.T[0]])
    print([uk_itos[w] for w in uk.T[0]])
    break

2683 171712
Shape of en: torch.Size([20, 64]) torch.int64
Shape of uk: torch.Size([15, 64]) torch.int64
['<bos>', 'He', 'will', 'be', 'a', 'good', 'teacher', '.', '<eos>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>']
['<bos>', 'Він', 'стане', 'хорошим', 'викладачем', '.', '<eos>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>']


In [10]:
class PositionalEncoding(nn.Module):
    def __init__(self, d_model, dropout=0.1, max_len=5000):
        super(PositionalEncoding, self).__init__()
        self.dropout = nn.Dropout(p=dropout)

        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(
            torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model)
        )
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)
        pe = pe.unsqueeze(0).transpose(0, 1)
        self.register_buffer("pe", pe)

    def forward(self, x):
        x = x + self.pe[: x.size(0), :]
        return self.dropout(x)

In [11]:
class TokenEmbedding(nn.Module):
    def __init__(self, vocab_size, emb_size):
        super(TokenEmbedding, self).__init__()
        self.embedding = nn.Embedding(vocab_size, emb_size)
        self.emb_size = emb_size

    def forward(self, tokens):
        return self.embedding(tokens.long()) * math.sqrt(self.emb_size)

In [12]:
class TransformerModel(nn.Module):
    def __init__(
        self,
        *,
        src_vocab_size,
        tgt_vocab_size,
        emb_size,
        nhead,
        ff_dim,
        num_enc_layers,
        num_dec_layers,
        pad_idx,
    ):
        super(TransformerModel, self).__init__()
        self.src_emb = TokenEmbedding(src_vocab_size, emb_size)
        self.tgt_emb = TokenEmbedding(tgt_vocab_size, emb_size)
        self.positional_encoding = PositionalEncoding(emb_size)

        self.transformer = nn.Transformer(
            d_model=emb_size,
            nhead=nhead,
            num_encoder_layers=num_enc_layers,
            num_decoder_layers=num_dec_layers,
            dim_feedforward=ff_dim,
        )
        self.fc_out = nn.Linear(emb_size, tgt_vocab_size)
        self.pad_idx = pad_idx

    def forward(self, src, tgt, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask):
        src = self.positional_encoding(self.src_emb(src))
        tgt = self.positional_encoding(self.tgt_emb(tgt))

        output = self.transformer(
            src,
            tgt,
            src_mask=src_mask,
            tgt_mask=tgt_mask,
            src_key_padding_mask=src_padding_mask,
            tgt_key_padding_mask=tgt_padding_mask,
            memory_key_padding_mask=src_padding_mask,
        )
        return self.fc_out(output)

    def create_masks(self, src, tgt):
        src_seq_len, tgt_seq_len = src.size(0), tgt.size(0)
        src_mask = torch.zeros((src_seq_len, src_seq_len), device=src.device).type(
            torch.bool
        )
        tgt_mask = self.generate_square_subsequent_mask(tgt_seq_len).to(tgt.device)

        src_padding_mask = (src == self.pad_idx).transpose(0, 1)
        tgt_padding_mask = (tgt == self.pad_idx).transpose(0, 1)
        return src_mask, tgt_mask, src_padding_mask, tgt_padding_mask

    def generate_square_subsequent_mask(self, size):
        return torch.triu(torch.ones(size, size) * float("-inf"), diagonal=1)

In [None]:
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
SRC_VOCAB_SIZE = len(en_vocab)
TGT_VOCAB_SIZE = len(uk_vocab)
EMB_SIZE = 256
NHEAD = 8
FFN_HID_DIM = 256
NUM_ENCODER_LAYERS = 6
NUM_DECODER_LAYERS = 6
NUM_EPOCHS = 30

print(DEVICE, SRC_VOCAB_SIZE, TGT_VOCAB_SIZE)

cuda 18527 49763


In [None]:
model = TransformerModel(
    src_vocab_size=SRC_VOCAB_SIZE,
    tgt_vocab_size=TGT_VOCAB_SIZE,
    emb_size=EMB_SIZE,
    nhead=NHEAD,
    ff_dim=FFN_HID_DIM,
    num_enc_layers=NUM_ENCODER_LAYERS,
    num_dec_layers=NUM_DECODER_LAYERS,
    pad_idx=PAD_IDX,
).to(DEVICE)

criterion = nn.CrossEntropyLoss(ignore_index=PAD_IDX)
optimizer = torch.optim.Adam(model.parameters(), lr=1e-4, betas=(0.9, 0.999), eps=1e-8)



In [None]:
def lr_lambda(step):
    warmup_steps = 4000
    if step < warmup_steps:
        return float(step) / float(max(1, warmup_steps))
    return 1.0

In [15]:
def count_parameters(model: nn.Module):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)


print(f"The model has {count_parameters(model):,} trainable parameters")

The model has 24,501,155 trainable parameters


In [16]:
def train_epoch(model, optimizer, criterion, train_loader):
    model.train()
    total_loss = 0
    for src, tgt in tqdm(train_loader, total=len(train_loader)):
        src, tgt = src.to(DEVICE), tgt.to(DEVICE)
        tgt_input = tgt[:-1, :]
        tgt_output = tgt[1:, :]

        src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = model.create_masks(
            src, tgt_input
        )
        logits = model(
            src, tgt_input, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask
        )

        optimizer.zero_grad()
        loss = criterion(
            logits.view(-1, logits.size(-1)), tgt_output.contiguous().view(-1)
        )
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    return total_loss / len(train_loader)


def evaluate(model, criterion, valid_loader):
    model.eval()
    total_loss = 0
    with torch.no_grad():
        for src, tgt in tqdm(valid_loader, total=len(valid_loader)):
            src, tgt = src.to(DEVICE), tgt.to(DEVICE)
            tgt_input = tgt[:-1, :]
            tgt_output = tgt[1:, :]

            src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = model.create_masks(
                src, tgt_input
            )
            logits = model(
                src, tgt_input, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask
            )

            loss = criterion(
                logits.view(-1, logits.size(-1)), tgt_output.contiguous().view(-1)
            )
            total_loss += loss.item()
    return total_loss / len(valid_loader)

In [17]:
def epoch_time(start_time: int, end_time: int):
    elapsed_time = end_time - start_time
    elapsed_mins = int(elapsed_time / 60)
    elapsed_secs = int(elapsed_time - (elapsed_mins * 60))
    return elapsed_mins, elapsed_secs

In [22]:
for epoch in range(NUM_EPOCHS):
    start_time = time.time()
    train_loss = train_epoch(model, optimizer, criterion, train_loader)
    valid_loss = evaluate(model, criterion, valid_loader)
    end_time = time.time()

    epoch_mins, epoch_secs = epoch_time(start_time, end_time)

    print(f"Epoch: {epoch+1:02} | Time: {epoch_mins}m {epoch_secs}s")
    print(f"\tTrain Loss: {train_loss:.3f} | Train PPL: {math.exp(train_loss):7.3f}")
    print(f"\t Val. Loss: {valid_loss:.3f} |  Val. PPL: {math.exp(valid_loss):7.3f}")

  attn_output = scaled_dot_product_attention(q, k, v, attn_mask, dropout_p, is_causal)
100%|██████████| 2683/2683 [03:03<00:00, 14.65it/s]
100%|██████████| 336/336 [00:08<00:00, 40.10it/s]


Epoch: 01 | Time: 3m 11s
	Train Loss: 4.832 | Train PPL: 125.422
	 Val. Loss: 4.042 |  Val. PPL:  56.914


100%|██████████| 2683/2683 [05:16<00:00,  8.48it/s]
100%|██████████| 336/336 [00:07<00:00, 46.67it/s]


Epoch: 02 | Time: 5m 23s
	Train Loss: 3.895 | Train PPL:  49.143
	 Val. Loss: 3.591 |  Val. PPL:  36.253


100%|██████████| 2683/2683 [05:28<00:00,  8.16it/s]
100%|██████████| 336/336 [00:07<00:00, 47.08it/s]


Epoch: 03 | Time: 5m 36s
	Train Loss: 3.530 | Train PPL:  34.127
	 Val. Loss: 3.291 |  Val. PPL:  26.862


100%|██████████| 2683/2683 [05:25<00:00,  8.24it/s]
100%|██████████| 336/336 [00:07<00:00, 47.10it/s]


Epoch: 04 | Time: 5m 32s
	Train Loss: 3.278 | Train PPL:  26.518
	 Val. Loss: 3.074 |  Val. PPL:  21.637


100%|██████████| 2683/2683 [05:33<00:00,  8.05it/s]
100%|██████████| 336/336 [00:07<00:00, 46.88it/s]


Epoch: 05 | Time: 5m 40s
	Train Loss: 3.086 | Train PPL:  21.899
	 Val. Loss: 2.905 |  Val. PPL:  18.268


100%|██████████| 2683/2683 [05:24<00:00,  8.26it/s]
100%|██████████| 336/336 [00:07<00:00, 46.99it/s]


Epoch: 06 | Time: 5m 32s
	Train Loss: 2.932 | Train PPL:  18.757
	 Val. Loss: 2.775 |  Val. PPL:  16.040


100%|██████████| 2683/2683 [05:29<00:00,  8.15it/s]
100%|██████████| 336/336 [00:07<00:00, 46.39it/s]


Epoch: 07 | Time: 5m 36s
	Train Loss: 2.804 | Train PPL:  16.507
	 Val. Loss: 2.655 |  Val. PPL:  14.220


100%|██████████| 2683/2683 [05:32<00:00,  8.06it/s]
100%|██████████| 336/336 [00:07<00:00, 46.46it/s]


Epoch: 08 | Time: 5m 39s
	Train Loss: 2.698 | Train PPL:  14.850
	 Val. Loss: 2.578 |  Val. PPL:  13.167


100%|██████████| 2683/2683 [05:31<00:00,  8.09it/s]
100%|██████████| 336/336 [00:07<00:00, 47.42it/s]


Epoch: 09 | Time: 5m 38s
	Train Loss: 2.610 | Train PPL:  13.595
	 Val. Loss: 2.505 |  Val. PPL:  12.238


100%|██████████| 2683/2683 [05:25<00:00,  8.23it/s]
100%|██████████| 336/336 [00:07<00:00, 47.67it/s]


Epoch: 10 | Time: 5m 33s
	Train Loss: 2.535 | Train PPL:  12.621
	 Val. Loss: 2.444 |  Val. PPL:  11.520


100%|██████████| 2683/2683 [05:21<00:00,  8.33it/s]
100%|██████████| 336/336 [00:07<00:00, 46.57it/s]


Epoch: 11 | Time: 5m 29s
	Train Loss: 2.470 | Train PPL:  11.823
	 Val. Loss: 2.395 |  Val. PPL:  10.970


100%|██████████| 2683/2683 [05:24<00:00,  8.26it/s]
100%|██████████| 336/336 [00:06<00:00, 55.77it/s]


Epoch: 12 | Time: 5m 31s
	Train Loss: 2.413 | Train PPL:  11.163
	 Val. Loss: 2.352 |  Val. PPL:  10.502


100%|██████████| 2683/2683 [04:46<00:00,  9.37it/s]
100%|██████████| 336/336 [00:05<00:00, 56.29it/s]


Epoch: 13 | Time: 4m 52s
	Train Loss: 2.363 | Train PPL:  10.620
	 Val. Loss: 2.313 |  Val. PPL:  10.109


100%|██████████| 2683/2683 [04:46<00:00,  9.37it/s]
100%|██████████| 336/336 [00:06<00:00, 55.83it/s]


Epoch: 14 | Time: 4m 52s
	Train Loss: 2.319 | Train PPL:  10.164
	 Val. Loss: 2.277 |  Val. PPL:   9.743


100%|██████████| 2683/2683 [04:44<00:00,  9.43it/s]
100%|██████████| 336/336 [00:06<00:00, 55.54it/s]


Epoch: 15 | Time: 4m 50s
	Train Loss: 2.279 | Train PPL:   9.771
	 Val. Loss: 2.251 |  Val. PPL:   9.495


100%|██████████| 2683/2683 [04:39<00:00,  9.61it/s]
100%|██████████| 336/336 [00:06<00:00, 55.84it/s]


Epoch: 16 | Time: 4m 45s
	Train Loss: 2.243 | Train PPL:   9.420
	 Val. Loss: 2.218 |  Val. PPL:   9.191


100%|██████████| 2683/2683 [04:41<00:00,  9.52it/s]
100%|██████████| 336/336 [00:05<00:00, 56.03it/s]


Epoch: 17 | Time: 4m 47s
	Train Loss: 2.211 | Train PPL:   9.129
	 Val. Loss: 2.195 |  Val. PPL:   8.982


100%|██████████| 2683/2683 [04:49<00:00,  9.27it/s]
100%|██████████| 336/336 [00:06<00:00, 55.84it/s]


Epoch: 18 | Time: 4m 55s
	Train Loss: 2.185 | Train PPL:   8.890
	 Val. Loss: 2.179 |  Val. PPL:   8.837


100%|██████████| 2683/2683 [04:42<00:00,  9.48it/s]
100%|██████████| 336/336 [00:06<00:00, 55.84it/s]


Epoch: 19 | Time: 4m 48s
	Train Loss: 2.160 | Train PPL:   8.671
	 Val. Loss: 2.165 |  Val. PPL:   8.718


100%|██████████| 2683/2683 [04:45<00:00,  9.40it/s]
100%|██████████| 336/336 [00:06<00:00, 55.71it/s]

Epoch: 20 | Time: 4m 51s
	Train Loss: 2.137 | Train PPL:   8.476
	 Val. Loss: 2.156 |  Val. PPL:   8.640





In [23]:
torch.save(model, "../outputs/model_new1.pth")

In [18]:
def save_model_full(epoch):
  torch.save({
    'epoch': epoch,
    'mode_state_dict': model.state_dict(),
    'optimizer_state_dict': optimizer.state_dict()
  }, f"../outputs/model_full_new_epoch{epoch}.pth")

In [25]:
save_model_full(NUM_EPOCHS)

In [27]:
for epoch in range(NUM_EPOCHS, 50):
    start_time = time.time()
    train_loss = train_epoch(model, optimizer, criterion, train_loader)
    valid_loss = evaluate(model, criterion, valid_loader)
    end_time = time.time()

    epoch_mins, epoch_secs = epoch_time(start_time, end_time)

    print(f"Epoch: {epoch+1:02} | Time: {epoch_mins}m {epoch_secs}s")
    print(f"\tTrain Loss: {train_loss:.3f} | Train PPL: {math.exp(train_loss):7.3f}")
    print(f"\t Val. Loss: {valid_loss:.3f} |  Val. PPL: {math.exp(valid_loss):7.3f}")

    if epoch % 10 == 0 and epoch != NUM_EPOCHS:
        print("saving...")
        save_model_full(epoch)

100%|██████████| 2683/2683 [07:11<00:00,  6.22it/s]
100%|██████████| 336/336 [00:08<00:00, 40.90it/s]


Epoch: 21 | Time: 7m 19s
	Train Loss: 2.081 | Train PPL:   8.009
	 Val. Loss: 2.126 |  Val. PPL:   8.383


100%|██████████| 2683/2683 [07:14<00:00,  6.18it/s]
100%|██████████| 336/336 [00:08<00:00, 41.05it/s]


Epoch: 22 | Time: 7m 22s
	Train Loss: 2.081 | Train PPL:   8.016
	 Val. Loss: 2.171 |  Val. PPL:   8.771


100%|██████████| 2683/2683 [07:13<00:00,  6.19it/s]
100%|██████████| 336/336 [00:08<00:00, 40.39it/s]


Epoch: 23 | Time: 7m 22s
	Train Loss: 2.108 | Train PPL:   8.232
	 Val. Loss: 2.187 |  Val. PPL:   8.907


100%|██████████| 2683/2683 [07:07<00:00,  6.28it/s]
100%|██████████| 336/336 [00:08<00:00, 40.88it/s]


Epoch: 24 | Time: 7m 15s
	Train Loss: 2.117 | Train PPL:   8.303
	 Val. Loss: 2.181 |  Val. PPL:   8.855


100%|██████████| 2683/2683 [07:09<00:00,  6.25it/s]
100%|██████████| 336/336 [00:08<00:00, 40.90it/s]


Epoch: 25 | Time: 7m 17s
	Train Loss: 2.118 | Train PPL:   8.314
	 Val. Loss: 2.192 |  Val. PPL:   8.951


100%|██████████| 2683/2683 [07:04<00:00,  6.32it/s]
100%|██████████| 336/336 [00:08<00:00, 41.23it/s]


Epoch: 26 | Time: 7m 12s
	Train Loss: 2.113 | Train PPL:   8.275
	 Val. Loss: 2.187 |  Val. PPL:   8.911


100%|██████████| 2683/2683 [07:10<00:00,  6.24it/s]
100%|██████████| 336/336 [00:08<00:00, 40.99it/s]


Epoch: 27 | Time: 7m 18s
	Train Loss: 2.109 | Train PPL:   8.240
	 Val. Loss: 2.188 |  Val. PPL:   8.916


100%|██████████| 2683/2683 [07:18<00:00,  6.12it/s]
100%|██████████| 336/336 [00:08<00:00, 40.14it/s]


Epoch: 28 | Time: 7m 26s
	Train Loss: 2.104 | Train PPL:   8.203
	 Val. Loss: 2.186 |  Val. PPL:   8.901


100%|██████████| 2683/2683 [07:08<00:00,  6.26it/s]
100%|██████████| 336/336 [00:08<00:00, 40.82it/s]


Epoch: 29 | Time: 7m 16s
	Train Loss: 2.097 | Train PPL:   8.140
	 Val. Loss: 2.186 |  Val. PPL:   8.904


100%|██████████| 2683/2683 [07:06<00:00,  6.29it/s]
100%|██████████| 336/336 [00:08<00:00, 40.98it/s]


Epoch: 30 | Time: 7m 15s
	Train Loss: 2.092 | Train PPL:   8.099
	 Val. Loss: 2.185 |  Val. PPL:   8.893


100%|██████████| 2683/2683 [07:03<00:00,  6.33it/s]
100%|██████████| 336/336 [00:08<00:00, 40.97it/s]


Epoch: 31 | Time: 7m 11s
	Train Loss: 2.087 | Train PPL:   8.064
	 Val. Loss: 2.182 |  Val. PPL:   8.861
saving...


100%|██████████| 2683/2683 [07:05<00:00,  6.31it/s]
100%|██████████| 336/336 [00:08<00:00, 41.15it/s]


Epoch: 32 | Time: 7m 13s
	Train Loss: 2.081 | Train PPL:   8.011
	 Val. Loss: 2.186 |  Val. PPL:   8.898


100%|██████████| 2683/2683 [07:14<00:00,  6.18it/s]
100%|██████████| 336/336 [00:08<00:00, 40.37it/s]


Epoch: 33 | Time: 7m 22s
	Train Loss: 2.076 | Train PPL:   7.969
	 Val. Loss: 2.189 |  Val. PPL:   8.925


100%|██████████| 2683/2683 [07:10<00:00,  6.23it/s]
100%|██████████| 336/336 [00:08<00:00, 41.13it/s]


Epoch: 34 | Time: 7m 19s
	Train Loss: 2.072 | Train PPL:   7.942
	 Val. Loss: 2.188 |  Val. PPL:   8.920


100%|██████████| 2683/2683 [07:07<00:00,  6.27it/s]
100%|██████████| 336/336 [00:08<00:00, 41.14it/s]


Epoch: 35 | Time: 7m 15s
	Train Loss: 2.068 | Train PPL:   7.912
	 Val. Loss: 2.190 |  Val. PPL:   8.939


100%|██████████| 2683/2683 [07:07<00:00,  6.28it/s]
100%|██████████| 336/336 [00:08<00:00, 40.89it/s]


Epoch: 36 | Time: 7m 15s
	Train Loss: 2.065 | Train PPL:   7.883
	 Val. Loss: 2.189 |  Val. PPL:   8.930


100%|██████████| 2683/2683 [07:11<00:00,  6.22it/s]
100%|██████████| 336/336 [00:08<00:00, 41.10it/s]


Epoch: 37 | Time: 7m 19s
	Train Loss: 2.062 | Train PPL:   7.862
	 Val. Loss: 2.194 |  Val. PPL:   8.969


100%|██████████| 2683/2683 [07:13<00:00,  6.19it/s]
100%|██████████| 336/336 [00:08<00:00, 40.24it/s]


Epoch: 38 | Time: 7m 21s
	Train Loss: 2.058 | Train PPL:   7.830
	 Val. Loss: 2.189 |  Val. PPL:   8.924


100%|██████████| 2683/2683 [07:14<00:00,  6.17it/s]
100%|██████████| 336/336 [00:08<00:00, 41.06it/s]


Epoch: 39 | Time: 7m 23s
	Train Loss: 2.056 | Train PPL:   7.816
	 Val. Loss: 2.193 |  Val. PPL:   8.959


100%|██████████| 2683/2683 [07:13<00:00,  6.19it/s]
100%|██████████| 336/336 [00:08<00:00, 41.08it/s]


Epoch: 40 | Time: 7m 21s
	Train Loss: 2.052 | Train PPL:   7.783
	 Val. Loss: 2.190 |  Val. PPL:   8.939


100%|██████████| 2683/2683 [07:10<00:00,  6.23it/s]
100%|██████████| 336/336 [00:08<00:00, 40.89it/s]


Epoch: 41 | Time: 7m 19s
	Train Loss: 2.050 | Train PPL:   7.771
	 Val. Loss: 2.192 |  Val. PPL:   8.949
saving...


100%|██████████| 2683/2683 [07:08<00:00,  6.26it/s]
100%|██████████| 336/336 [00:08<00:00, 41.22it/s]


Epoch: 42 | Time: 7m 16s
	Train Loss: 2.047 | Train PPL:   7.744
	 Val. Loss: 2.192 |  Val. PPL:   8.954


100%|██████████| 2683/2683 [07:15<00:00,  6.16it/s]
100%|██████████| 336/336 [00:08<00:00, 40.18it/s]


Epoch: 43 | Time: 7m 24s
	Train Loss: 2.045 | Train PPL:   7.733
	 Val. Loss: 2.193 |  Val. PPL:   8.961


100%|██████████| 2683/2683 [07:05<00:00,  6.31it/s]
100%|██████████| 336/336 [00:08<00:00, 41.16it/s]


Epoch: 44 | Time: 7m 13s
	Train Loss: 2.043 | Train PPL:   7.716
	 Val. Loss: 2.196 |  Val. PPL:   8.986


100%|██████████| 2683/2683 [07:04<00:00,  6.31it/s]
100%|██████████| 336/336 [00:08<00:00, 40.89it/s]


Epoch: 45 | Time: 7m 13s
	Train Loss: 2.041 | Train PPL:   7.696
	 Val. Loss: 2.201 |  Val. PPL:   9.033


100%|██████████| 2683/2683 [07:07<00:00,  6.28it/s]
100%|██████████| 336/336 [00:08<00:00, 41.15it/s]


Epoch: 46 | Time: 7m 15s
	Train Loss: 2.038 | Train PPL:   7.675
	 Val. Loss: 2.202 |  Val. PPL:   9.041


100%|██████████| 2683/2683 [07:08<00:00,  6.26it/s]
100%|██████████| 336/336 [00:08<00:00, 40.64it/s]


Epoch: 47 | Time: 7m 16s
	Train Loss: 2.037 | Train PPL:   7.667
	 Val. Loss: 2.200 |  Val. PPL:   9.024


100%|██████████| 2683/2683 [07:09<00:00,  6.24it/s]
100%|██████████| 336/336 [00:08<00:00, 39.97it/s]


Epoch: 48 | Time: 7m 18s
	Train Loss: 2.035 | Train PPL:   7.654
	 Val. Loss: 2.196 |  Val. PPL:   8.991


100%|██████████| 2683/2683 [07:13<00:00,  6.19it/s]
100%|██████████| 336/336 [00:08<00:00, 41.30it/s]


Epoch: 49 | Time: 7m 21s
	Train Loss: 2.034 | Train PPL:   7.645
	 Val. Loss: 2.196 |  Val. PPL:   8.986


100%|██████████| 2683/2683 [07:09<00:00,  6.24it/s]
100%|██████████| 336/336 [00:08<00:00, 41.39it/s]

Epoch: 50 | Time: 7m 17s
	Train Loss: 2.031 | Train PPL:   7.621
	 Val. Loss: 2.201 |  Val. PPL:   9.034





In [28]:
save_model_full(50)

In [20]:
checkpoint = torch.load("../outputs/model_full_new_epoch50.pth", weights_only=True)
model.load_state_dict(checkpoint['mode_state_dict'])
optimizer.load_state_dict(checkpoint['optimizer_state_dict'])

In [22]:
en_vocab.set_default_index(en_vocab["<unk>"])
uk_vocab.set_default_index(uk_vocab["<unk>"])

In [73]:
def translate(input: str, len_diff = 10):
  model.eval()

  indices = torch.tensor(
    [en_vocab[token] for token in en_tokenizer(input)], dtype=torch.long
  )

  src = torch.cat(
    [torch.tensor([BOS_IDX]), indices, torch.tensor([EOS_IDX])], dim=0
  ).unsqueeze(1).to(DEVICE)

  num_tokens = src.shape[0]
  src_mask = (torch.zeros(num_tokens, num_tokens, device=DEVICE)).type(torch.bool)
  src_padding_mask = (src == en_vocab["<pad>"]).transpose(0, 1).type(torch.bool).to(DEVICE)

  with torch.no_grad():
    memory = model.transformer.encoder(
      model.positional_encoding(model.src_emb(src)),
      mask=src_mask,
      src_key_padding_mask=src_padding_mask,
    )

  tgt_indices = (
    torch.tensor([uk_vocab["<bos>"]], dtype=torch.long).unsqueeze(1).to(DEVICE)
  ).to(DEVICE)
  memory = memory.to(DEVICE)

  for _ in range(num_tokens + len_diff):
    tgt_mask = model.generate_square_subsequent_mask(tgt_indices.size(0)).type(torch.bool).to(DEVICE)
    tgt_padding_mask = (tgt_indices == uk_vocab["<pad>"]).transpose(0, 1).to(DEVICE)

    with torch.no_grad():
      output = model.transformer.decoder(
        model.positional_encoding(model.tgt_emb(tgt_indices)),
        memory,
        tgt_mask=tgt_mask,
        tgt_key_padding_mask=tgt_padding_mask
      )
      logits = model.fc_out(output[-1])
      next_token = logits.argmax(-1).item()

    tgt_indices = torch.cat(
      [tgt_indices, torch.tensor([[next_token]], device=DEVICE)], dim=0
    )

    if next_token == uk_vocab["<eos>"]:
      break

  translated_sentence = [
    uk_vocab.lookup_token(idx)
    for idx in tgt_indices.squeeze().tolist()
    if idx not in {uk_vocab["<bos>"], uk_vocab["<eos>"]}
  ]

  return " ".join(translated_sentence)

  

In [87]:
samples = [
  "I am very cool",
  "Is it working?",
  "I am a good teacher",
  "He likes apples",
  "Those who stand with me shall be my brothers",
  "Despair for your end is near",
  "What is the weather today?",
  "She is good at it",
  "Today was great",
  "Call me!",
  "People stopped telling jokes",
  "Well, I am not very happy with the results",
  "But this is probably okay",
  "I will try to do another one",
  "It will take a whole night"
]


for s in samples:
  print(s, "========", translate(s))

