In [1]:
import sys

import torch
from torch import nn
from torch.optim import Adam
from torch.optim.lr_scheduler import ExponentialLR

sys.path.append("../../")
from models.transformer import Transformer
from modules.utils import count_params, create_pad_mask, create_subsequent_mask, BPETokenizer

In [2]:
raw_corpus = [
    "This is the Hugging Face Course.",
    "This chapter is about tokenization.",
    "This section shows several tokenizer algorithms.",
    "Hopefully, you will be able to understand how they are trained and generate tokens.",
]
tokenizer = BPETokenizer(raw_corpus,target_size=100)

100%|██████████| 100/100 [00:00<00:00, 6236.05it/s]


In [2]:
tknr = BPETokenizer()
tknr.load_state_dict(torch.load('tokenizer.pth'))

In [5]:
print(
    tknr.decode(
        tknr(
            [
                "<sos> Hôm nay cứ để nó thế này thôi, mai xem tiếp :D <eos> <pad>",
                "To continue, we need to apply that merge in our splits dictionary. Let’s write another function for this:",
            ]
        )
    )
)

[['<unk>', 'so', 's', '<unk>', ' hôm', ' nay', ' cứ', ' để', ' nó', ' thế', ' này', ' thôi', ',', ' mai', ' xem', ' tiếp', ' :', 'd', ' ', '<unk>', 'e', 'os', '<unk>', ' ', '<unk>', 'pad', '<unk>'], ['to', ' continue', ',', ' we', ' need', ' to', ' apply', ' that', ' merge', ' in', ' our', ' spl', 'its', ' dictionary', '.', ' let', '<unk>', 's', ' write', ' another', ' function', ' for', ' this', ':']]


In [7]:
tknr.encode(["<eos>","<sos>","<pad>","<unk>"])

[2, 1, 0, 3]

In [9]:
tknr.decode([0,3,1,2])

['<pad>', '<unk>', '<sos>', '<eos>']

In [10]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

max_length = 20
vocab_size = 21
n_heads = 4
n_blocks = 2
d_model = 16
d_k = d_v = d_model // n_heads
d_ff = 4 * d_model
p_drop = 0.2

model = Transformer(
    vocab_size, n_heads, max_length, n_blocks, d_model, d_ff, d_k, d_v, p_drop
).to(device)
count_params(model)
optimizer = Adam(model.parameters(), lr=0.002, betas=(0.98, 0.99), weight_decay=0.01)
scheduler = ExponentialLR(optimizer, 0.999**0.125)
loss_fn = nn.CrossEntropyLoss(label_smoothing=0.1, ignore_index=0)

Total: 15,333 parameters.
Trainable: 15,333 parameters.


In [2]:
sample_data = torch.arange(1, 21).view(-1, 10).to(device)
sample_lens = torch.LongTensor([10] * 2).to(device)
sample_mask1 = create_pad_mask(sample_lens).to(device)
sample_mask2 = create_subsequent_mask(sample_lens - 1).to(device)

In [3]:
for i in range(1000):
    logits = model(sample_data, sample_mask1, sample_data[:, :-1], sample_mask2)
    #    print(logits.argmax(-1))
    loss = loss_fn(logits.reshape(-1, logits.size(-1)), sample_data[:, 1:].reshape(-1))
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    print(loss.item())
    scheduler.step()

10.734347343444824
11.453489303588867
8.829902648925781
9.75387954711914
8.953031539916992
8.13963794708252
8.208979606628418
6.586256504058838
6.446315765380859
6.559572219848633
6.019176959991455
6.731379985809326
5.703819274902344
5.653080940246582
5.378207683563232
5.0579328536987305
4.6011962890625
4.2487006187438965
4.60893440246582
3.933199405670166
3.9547829627990723
3.8501944541931152
4.149725914001465
3.820894241333008
3.8539371490478516
3.8390679359436035
3.5317938327789307
3.5049667358398438
3.6981773376464844
3.246305465698242
3.6801791191101074
3.439358949661255
3.3345284461975098
3.4234962463378906
3.480539321899414
3.3895487785339355
2.9255383014678955
3.106116771697998
2.7707767486572266
3.306351661682129
3.1348891258239746
2.635317325592041
2.9378833770751953
3.0529236793518066
3.0195560455322266
2.7501416206359863
2.787433624267578
2.533318042755127
2.526667833328247
2.9423158168792725
2.723445177078247
2.4501538276672363
2.4180519580841064
2.3553810119628906
2.31911

In [9]:
src_data = torch.arange(8, 18).unsqueeze(0).to(device)
src_lens = torch.LongTensor([10]).to(device)
src_mask = create_pad_mask(src_lens).to(device)

tgt_data = torch.LongTensor([[9, 10, 11]]).to(device)
tgt_lens = torch.LongTensor([3]).to(device)
tgt_mask = create_subsequent_mask(tgt_lens).to(device)

model.eval()
with torch.inference_mode():
    encoder_outputs = model.encode(src_data, src_mask)
    for i in range(12):
        logits = model.generate(tgt_data, tgt_mask, encoder_outputs, src_mask)
        # print(logits.argmax(-1))
        n = logits.softmax(-1).argmax(-1)[:, [-1]]
        tgt_data = torch.cat([tgt_data, n], dim=1).to(device)
        tgt_lens += 1
        tgt_mask = create_subsequent_mask(tgt_lens).to(device)

print(tgt_data)

tensor([[ 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 20, 20, 20]],
       device='cuda:0')
