# Implementacija GPT-2 modela za srpski jezik

Cilj projekta jeste treniranje GPT-2 modela na delu srpske wikipedije i generisanje artikala sličnog sastava

### Biblioteke

In [2]:
%pip install -r requirements.txt

Note: you may need to restart the kernel to use updated packages.


In [1]:
import os
import time
import math

from srb_gpt import wiki, tokenizer, data, gpt, helper
import numpy as np
import torch

from torchsummary import summary

### Konstante

In [2]:
# URL
BASE_URL = 'https://sr.wikipedia.org'
ROOT_LINK = 'https://sr.wikipedia.org/wiki/%D0%9D%D0%B8%D0%BA%D0%BE%D0%BB%D0%B0_%D0%A2%D0%B5%D1%81%D0%BB%D0%B0' # Nikola Tesla

# fajlovi sa podacima
DATAFILE = 'data/data.txt' # fajl sa tekstom za treniranje, test i validaciju
BIN_DATAFILE = 'data/data.npy' # numpy reprezentacija tekstualnog fajla konvertovanog u tokene

# tokenizer
TOKENIZER_DIR = 'models'
TOKENIZER_MODEL = f'{TOKENIZER_DIR}/regex.model'
OUR_SPLIT_PATTERN = r"""'|[^\r\n\p{L}\p{N}]?+\p{L}+|\p{N}| ?[^\s\p{L}\p{N}]++[\r\n]*|\s*[\r\n]|\s+(?!\S)|\s+"""
VOCAB_SIZE = 512 # Veličina vokabulara / broj tokena rečnika

# GPT-2 model
BLOCK_SIZE = 128 # Veličina kontensta, broj tokena koji se uzimaju za predikciju
N_LAYER = 2 # broj slojeva
N_HEAD = 4 # broj glava 
N_EMBD = 128 # veličina vektora kojim se predstavlja jedan token 
DROPOUT = 0.1 
BIAS = False # True: bias u Linears i LayerNorms solojveima, False: noviji pristup, brže i bolje

# oknfiguracije za treniranje
TRAIN = 0.8
TEST = 0.1
VAL = 0.1

DEVICE = 'cpu' # 'cuda'
BATCH_SIZE = 64
ITERS = 5000
MAX_LR = 6e-3
MIN_LR = MAX_LR / 10
WARMUP_ITERS = 200
LR_DECAY_DUR = ITERS

WEIGHT_DECAY = 1e-1
BETA1 = 0.9
BETA2 = 0.95

LOG_INTERVAL = 100 # broj iteracija za ispis trenutne greške
VAL_SAMPLES = 50 # broj batcheva za procenu rezultata nad validacionim skupom

model_cfg = gpt.GPTConfig(block_size=BLOCK_SIZE, vocab_size=VOCAB_SIZE, n_layer=N_LAYER, n_head=N_HEAD, n_embd=N_EMBD, dropout=DROPOUT, bias=BIAS)

### Preuzimanje teksta as wikipedije

Preuzima se <ROOT_LINK> u našem slučaju stranica o Nikoli Tesli i svi likovi koji se nalaze na toj stranici i predstavljaju artikle wikipedije

In [None]:
print(f'DATAFILE: {DATAFILE} Not Found, downloading data')
t0 = time.time()
wiki.download_wiki_data_around_link(ROOT_LINK, BASE_URL, DATAFILE)
t1 = time.time()
print(f'DATAFILE: {DATAFILE} Created, took {t1 - t0:.2f} seconds')

### Treniranje tokenizatora

In [None]:
# train tokenizer
tok = tokenizer.RegexTokenizer(OUR_SPLIT_PATTERN)
print(f'TOKENIZER: {TOKENIZER_MODEL} Not Found, training it using DATAFILE: {DATAFILE}')
# učitavanje teksta iz DATAFILA-a
text = ""
with open(DATAFILE, "r", encoding="utf-8") as file:
    text = file.read() 

os.makedirs("models", exist_ok=True)

# Treniranje tokenizer modela
t0 = time.time()
tok.train(text, VOCAB_SIZE, verbose=True)
t1 = time.time()

# Čuvanje
tok.save(TOKENIZER_MODEL.split('.')[0])
print(f'TOKENIZER: {TOKENIZER_MODEL} Trained, took {t1 - t0:.2f} seconds')

In [43]:
tok = tokenizer.RegexTokenizer(OUR_SPLIT_PATTERN)
tok.load(TOKENIZER_MODEL)

### Kreiranje Dataset-a

In [None]:
text = ""
with open(DATAFILE, mode='r', encoding='utf-8') as file:
    text = file.read()
# Kodovanje tokena 
print(f'ENCODING: {DATAFILE}')
t0 = time.time()
ids = tok.encode(text)
t1 = time.time()
print(f'ENCODING: {DATAFILE}, took {t1 - t0:.2f} seconds')

# Čuvanje u fajl
arr = np.array(ids).astype(np.uint16)
fp_save = np.memmap(BIN_DATAFILE, dtype='uint16', mode='w+', shape=(arr.shape[0],))
fp_save[:] = arr[:]

### Inicijalizacija promenjivih za treniranje

In [10]:
fp = np.memmap(BIN_DATAFILE, dtype='uint16', mode='r')

num_samples = len(fp)
num_train = int(TRAIN * num_samples)
num_test = int(TEST * num_samples)
num_val = num_samples - num_train - num_test

data_train = fp[:num_train]
data_test  = fp[num_train:num_train+num_test]
data_val   = fp[num_train+num_test:]

model = gpt.GPT(model_cfg).to(DEVICE)
optimizer = model.configure_optimizers(WEIGHT_DECAY, MAX_LR, (BETA1, BETA2), DEVICE)


number of parameters: 0.46M
num decayed parameter tensors: 10, with 475,136 parameters
num non-decayed parameter tensors: 5, with 640 parameters
using fused AdamW: False


In [41]:
num_samples

4098651

### Trening

In [39]:
t0 = time.time()
dt = 0
for it in range(ITERS):
    # računanje learning rata-a za trenutnu iteraciju
    lr = helper.get_lr(it, WARMUP_ITERS, MAX_LR, LR_DECAY_DUR, MIN_LR)
    # učitavanje podataka
    X, Y = data.get_batch(data_train, BATCH_SIZE, BLOCK_SIZE)
    X = X.to(DEVICE)
    Y = Y.to(DEVICE)

    # Forward
    optimizer.zero_grad()
    logits, loss = model.forward(X, Y)
    # Backward
    if loss is not None:
        loss.backward() 
        optimizer.step()

    t1 = time.time()
    dt += t1 - t0
    t0 = t1
    # Logs
    if it % LOG_INTERVAL == 0:
        if loss is not None:
            print(f"iter {it}: loss {loss.item():.4f}, time {dt*1000:.2f}ms, ", end="")
        model.eval()
        temp_loss = 0
        for i in range(VAL_SAMPLES):
            X, Y = data.get_batch(data_val, BATCH_SIZE, BLOCK_SIZE)
            X = X.to(DEVICE)
            Y = Y.to(DEVICE)
            logits, loss = model.forward(X, Y)
            temp_loss += loss.item()
        print(f"val_loss {temp_loss/VAL_SAMPLES:.4f}")
        model.train()
        dt = 0 # reset delta time

iter 0: loss 4.0942, time 421.51ms, val_loss 4.0675
iter 100: loss 3.9110, time 29014.34ms, val_loss 3.8834
iter 200: loss 3.7924, time 30608.32ms, val_loss 3.7577
iter 300: loss 3.6595, time 33477.88ms, val_loss 3.6361
iter 400: loss 3.5656, time 39814.40ms, val_loss 3.5030
iter 500: loss 3.5277, time 35448.25ms, val_loss 3.3900
iter 600: loss 3.3688, time 34087.94ms, val_loss 3.2803
iter 700: loss 3.3004, time 36933.13ms, val_loss 3.1887
iter 800: loss 3.2448, time 34140.04ms, val_loss 3.1247
iter 900: loss 3.1880, time 33698.79ms, val_loss 3.0632
iter 1000: loss 3.1618, time 32847.58ms, val_loss 3.0302


KeyboardInterrupt: 

### Test i Metrike

In [40]:
nlls = []
prev_end_loc = 0
stride = BLOCK_SIZE
dt = torch.from_numpy(data_test.astype(np.int64)).to(DEVICE)
x, y = None, None
for begin_loc in range(0, num_test, stride):
    end_loc = min(begin_loc + BLOCK_SIZE, num_test)
    trg_len = end_loc - prev_end_loc  # may be different from stride on last loop
    input_ids = dt[begin_loc:end_loc]
    target_ids = dt[begin_loc+1:end_loc+1].clone()
    target_ids[:-trg_len] = -100
    if input_ids.shape[0] != BLOCK_SIZE:
        break
    input_ids = torch.unsqueeze(input_ids, 0)
    target_ids =  torch.unsqueeze(target_ids, 0)
    
    x = torch.cat((x, input_ids)) if x is not None else input_ids
    y = torch.cat((y, target_ids)) if y is not None else target_ids
    if x.shape[0] < BATCH_SIZE:
        continue

    with torch.no_grad():
        outputs, loss = model(x, targets=y)

        # loss is calculated using CrossEntropyLoss which averages over valid labels
        # N.B. the model only calculates loss over trg_len - 1 labels, because it internally shifts the labels
        # to the left by 1.
        neg_log_likelihood = loss

    x, y = None, None
    nlls.append(neg_log_likelihood)
    prev_end_loc = end_loc
    if end_loc == num_test:
        break

# https://huggingface.co/docs/transformers/en/perplexity
ppl = torch.exp(torch.stack(nlls).mean())
print(f"Perplexity: {ppl:4f}")

Perplexity: 24.869570


### Generisanje Sadržaja

In [44]:
x = torch.stack([torch.from_numpy(np.array(tok.encode("генератор")).astype(np.int64))]).to(DEVICE)
txt = model.generate(x, 200) # Genrisanje 200 Tokena
txt = list(txt.detach().cpu().numpy()[0])
print(tok.decode(txt))

генераторијски. (Натка;
Марта Арозо посебне Тојфектични току због олошење физике цаштву за баговне професорална дружности
Лерминама на воја Ројсија у Нобелојину, Центре Деј, гре је Житлаца кући се на
 једно хвађарици и узаштите су новембрзеноставе Николови њиховоки
 Бороград, у али да је обуштен налаз нестеос биле орип народника.
Гадишње неустр
