In [None]:
import os
from tqdm import tqdm
import json

import torch
from tokenizers import ByteLevelBPETokenizer
from transformers import RobertaTokenizerFast, RobertaConfig, RobertaForMaskedLM
from transformers import AdamW

### BPE-токенизация

In [None]:
# Directory holding the raw corpus files used to train the tokenizer.
# NOTE(review): absolute, machine-specific path — adjust when running elsewhere.
path = r'C:\m092\avar_bert\raw_corpora'
# Build the file list with os.path.join instead of a hard-coded '\\' separator,
# skip anything that is not a regular file (e.g. sub-directories), and sort so
# the order does not depend on the file system.
paths = sorted(
    os.path.join(path, fn)
    for fn in os.listdir(path)
    if os.path.isfile(os.path.join(path, fn))
)

In [None]:
# Train a byte-level BPE tokenizer from scratch on the raw corpus files.
# NOTE(review): the masking code elsewhere in this notebook uses id 4 for
# '<mask>' and treats ids <= 2 as special, so the ids presumably follow the
# order of this list — confirm against the saved vocab.
bpe_special_tokens = ['<s>', '<pad>', '</s>', '<unk>', '<mask>']
tokenizer = ByteLevelBPETokenizer()
tokenizer.train(
    files=paths,
    vocab_size=50265,
    min_frequency=2,
    special_tokens=bpe_special_tokens,
)

In [None]:
# To evaluate a saved epoch checkpoint, copy the files from the avarBPEtokenizer
# folder into the folder that holds the model.
# exist_ok=True keeps this cell re-runnable: os.mkdir raised FileExistsError
# whenever the directory was already there.
os.makedirs('./avarBPEtokenizer', exist_ok=True)
tokenizer.save_model('avarBPEtokenizer')

### MLM (обучение)

In [None]:
def mlm(tensor, mask_prob=0.15, mask_token_id=4, max_special_id=2):
    """Randomly replace token ids with the mask token id, in place.

    Each element is selected independently with probability ``mask_prob``;
    selected elements whose id is greater than ``max_special_id`` are
    overwritten with ``mask_token_id``. Ids less than or equal to
    ``max_special_id`` are never changed.

    Args:
        tensor: Integer tensor of token ids, of any shape. It is modified in
            place, so pass a clone if the original ids are still needed.
        mask_prob: Per-element selection probability.
        mask_token_id: Id written into the selected positions.
        max_special_id: Largest id that is protected from masking.

    Returns:
        The same tensor object that was passed in, after masking.
    """
    # Draw the noise on the tensor's own device so GPU tensors work as well.
    rand = torch.rand(tensor.shape, device=tensor.device)
    mask_arr = (rand < mask_prob) & (tensor > max_special_id)
    # A single boolean-mask assignment replaces the per-row Python loop and
    # also handles tensors that are not 2-D.
    tensor[mask_arr] = mask_token_id
    return tensor

In [None]:
class Dataset(torch.utils.data.Dataset):
    """Map-style dataset over a dict of tensors indexed along their first axis."""

    def __init__(self, encodings):
        # Mapping of field name -> tensor; 'input_ids' must be present because
        # __len__ reads the dataset size from it.
        self.encodings = encodings

    def __len__(self):
        """Return the number of rows in the 'input_ids' tensor."""
        return self.encodings['input_ids'].size(0)

    def __getitem__(self, i):
        """Return a dict holding the i-th row of every stored tensor."""
        item = {}
        for name, values in self.encodings.items():
            item[name] = values[i]
        return item

In [None]:
# Read the whole corpus into memory; it is handed to the tokenizer in one call later.
# NOTE(review): presumably a JSON list of text strings — confirm against the file.
with open('avarCorpora.json', encoding='utf-8') as corpus_file:
    corpora = json.load(corpus_file)

In [None]:
# Empty accumulators for the tokenized tensors: masked inputs, attention masks
# and training targets.
input_ids, mask, labels = [], [], []

In [None]:
# Load the tokenizer files from the folder written by the BPE training step.
tokenizer_dir = 'avarBPEtokenizer'
avar_tokenizer = RobertaTokenizerFast.from_pretrained(tokenizer_dir)

In [None]:
# Tokenize the whole corpus; every sequence is truncated or padded to 512 tokens.
sample = avar_tokenizer(corpora, max_length=512, padding='max_length', truncation=True, return_tensors='pt')
# Mask a copy of the ids so the originals remain available as targets.
masked_ids = mlm(sample.input_ids.detach().clone())
# Compute the loss only on positions that were actually masked: -100 is the
# label value the Hugging Face masked-LM loss ignores. Using the full id tensor
# as labels also scored every unmasked token and every padding position.
target_ids = sample.input_ids.detach().clone()
target_ids[masked_ids == sample.input_ids] = -100
labels.append(target_ids)
mask.append(sample.attention_mask)
input_ids.append(masked_ids)

In [None]:
# Collapse each list of tensors into one tensor by concatenating along dim 0.
input_ids, mask, labels = (
    torch.cat(chunks, dim=0) for chunks in (input_ids, mask, labels)
)

In [None]:
# Bundle the tensors under the keys that the training loop reads from each batch.
encodings = dict(input_ids=input_ids, attention_mask=mask, labels=labels)
dataset = Dataset(encodings)

In [None]:
# Shuffled mini-batches of 16 examples each.
batch_size = 16
loader = torch.utils.data.DataLoader(dataset, shuffle=True, batch_size=batch_size)

In [None]:
# Architecture settings for a randomly initialised RoBERTa masked-LM.
roberta_kwargs = dict(
    vocab_size=50265,             # same value as the vocab_size used to train the BPE tokenizer
    max_position_embeddings=514,  # NOTE(review): presumably 512 tokens + RoBERTa's 2-slot position offset — confirm
    hidden_size=768,
    num_attention_heads=12,
    num_hidden_layers=6,
    type_vocab_size=1,
)
config = RobertaConfig(**roberta_kwargs)
model = RobertaForMaskedLM(config)

In [None]:
# Report whether a GPU is visible, then pick the device accordingly. The
# hard-coded 'cuda' device made model.to() fail on machines without a GPU;
# fall back to the CPU in that case.
print(torch.cuda.is_available())
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

In [None]:
# Put the model in training mode.
model.train()
# transformers.AdamW is deprecated in favour of the PyTorch implementation.
# eps and weight_decay are pinned to the defaults of the old transformers
# optimizer (1e-6 and 0.0) so the optimisation settings stay the same.
optim = torch.optim.AdamW(model.parameters(), lr=1e-4, eps=1e-6, weight_decay=0.0)

In [None]:
# several epochs in a row
# for epoch in range(1, 6):
#     loop = tqdm(loader, leave=True)
#     for batch in loop:
#         optim.zero_grad()
#         input_ids = batch['input_ids'].to(device)
#         attention_mask = batch['attention_mask'].to(device)
#         labels = batch['labels'].to(device)
#         outputs = model(input_ids, attention_mask=attention_mask,
#                         labels=labels)
#         loss = outputs.loss
#         loss.backward()
#         optim.step()
#         loop.set_description(f'Epoch {epoch}')
#         loop.set_postfix(loss=loss.item())
#     model.save_pretrained(f'./avarBERT-{epoch}_epochs')

In [None]:
# Single epoch.
epoch = 1
# The description is the same for every step, so set it once when the progress
# bar is created instead of on each iteration.
loop = tqdm(loader, leave=True, desc=f'Epoch {epoch}')
for batch in loop:
    optim.zero_grad()
    # Batch tensors get their own names: reusing `input_ids` / `labels` here
    # overwrote the notebook-level dataset tensors, so re-running the dataset
    # cells after training picked up the last batch instead of the full data.
    batch_input_ids = batch['input_ids'].to(device)
    batch_attention_mask = batch['attention_mask'].to(device)
    batch_labels = batch['labels'].to(device)
    outputs = model(batch_input_ids, attention_mask=batch_attention_mask,
                    labels=batch_labels)
    loss = outputs.loss
    loss.backward()
    optim.step()
    # Show the current loss in the progress bar.
    loop.set_postfix(loss=loss.item())
# Save the weights and config once the epoch is finished.
model.save_pretrained(f'./avarBERT-{epoch}_epochs')