In [None]:
# Install the third-party dependencies with the %pip magic so that they land in
# the environment of the running kernel (a plain `!pip` runs in a sub-shell and
# may resolve to a different Python installation).
%pip install transformers --quiet
%pip install pandas numpy nltk gdown

In [None]:
# The archive is extracted with `unzip` further down, so install it explicitly
# next to `zip`. `apt-get` is used because `apt` does not offer a stable
# command-line interface for scripted use.
!sudo apt-get install -y zip unzip

In [None]:
import pandas as pd 
import numpy as np 
import nltk 
from nltk.tokenize import word_tokenize

In [None]:
# Fetch NLTK's 'punkt' resource. In the visible part of this notebook
# word_tokenize is only referenced from a commented-out cell, so this download
# may be unnecessary. NOTE(review): confirm before removing.
nltk.download('punkt')

In [None]:
# download prozhito texts
import os

import gdown

url = 'https://drive.google.com/file/d/1oHAmsa6Nxb1B2sbWriJXCExETwSirtMj/view?usp=sharing' 
archive_name = 'prozhito_texts.zip'

# Re-use an archive that is already on disk so that "Restart & Run All" does
# not hit the network again.
if not os.path.exists(archive_name):
    # fuzzy=True lets gdown extract the file id from a "view?usp=sharing" link.
    downloaded = gdown.download(url, archive_name, fuzzy=True)
    # Fail here rather than later at the extraction step if nothing was fetched.
    if downloaded is None:
        raise RuntimeError(f'Could not download {url}')

In [None]:
# from google.colab import drive 

# drive.mount('/content/drive')

In [None]:
# -o overwrites files from a previous run without prompting (a prompt would
# stall the cell on re-run); -q suppresses the per-file listing.
!unzip -o -q prozhito_texts.zip

In [None]:
import glob, os


def load_sentences(rootdir):
    """Collect the non-empty lines of every ``.txt`` file found under ``rootdir``.

    Directories and files are visited in sorted order so that the resulting
    list, and therefore the seeded random masking applied to it later, does
    not depend on the order in which the file system lists entries.

    Args:
        rootdir: Directory that is walked recursively.

    Returns:
        A list of stripped lines; blank lines are skipped because they would
        become training examples without any real token.
    """
    collected = []
    for subdir, dirs, files in os.walk(rootdir):
        # Sorting in place also fixes the order in which os.walk descends.
        dirs.sort()
        for filename in sorted(files):
            if not filename.endswith('.txt'):
                continue
            path = os.path.join(subdir, filename)
            # Explicit encoding instead of the platform default.
            # NOTE(review): assumes the corpus files are UTF-8 — confirm.
            with open(path, encoding='utf-8') as f:
                for line in f:
                    sentence = line.strip()
                    if sentence:
                        collected.append(sentence)
    return collected


rootdir = 'prozhito_texts'
sentences = load_sentences(rootdir)

In [None]:
# Number of entries collected into `sentences`.
len(sentences)

In [None]:
# Peek at one entry as a sanity check (raises IndexError if fewer than six
# entries were loaded).
sentences[5]

## Загружаем машины

In [None]:
import transformers 
from transformers import BertTokenizer, BertForMaskedLM
import torch

In [None]:
# Seed the torch and NumPy random generators. In the cells below torch.rand
# and np.random.randint pick the positions to mask, and the DataLoader is
# created with shuffle=True.
seed = 42 
torch.manual_seed(seed)
np.random.seed(seed) 

In [None]:
# Checkpoint name handed to from_pretrained(...) for both the tokenizer and
# the model.
MODEL_PATH = "DeepPavlov/rubert-base-cased"

In [None]:
# s = 'Привет! Как ты?' 
# for word in word_tokenize(s): 
#     print(word)

In [None]:
# Load the tokenizer and the masked-language-modelling model from the same
# checkpoint.
tokenizer = BertTokenizer.from_pretrained(MODEL_PATH) 
model = BertForMaskedLM.from_pretrained(MODEL_PATH) 

In [None]:
# inputs = tokenizer(s_masked, return_tensors="pt") 
# labels = tokenizer(s, return_tensors="pt")["input_ids"]

# outputs = model(**inputs, labels=labels) 
# loss = outputs.loss 
# loss.backward()

In [None]:
# Tokenize the whole corpus in a single call. Every sentence is truncated or
# padded to exactly 512 tokens and the result is returned as PyTorch tensors,
# so the full padded corpus is held in memory at once.
inputs = tokenizer(sentences, return_tensors='pt',
                   max_length=512, truncation=True, padding='max_length') 

In [None]:
# Keep an independent copy of the token ids as the training targets; it is
# taken before any [MASK] substitution is applied to input_ids.
# NOTE(review): as created here the labels cover every position (padding
# included), so the loss is computed over all of them unless non-masked
# positions are later set to the ignore index.
inputs['labels'] = inputs.input_ids.detach().clone()

In [None]:
# create random array of floats with equal dimensions to input_ids tensor
rand = torch.rand(inputs.input_ids.shape)
# create mask array: each position is selected with probability 0.15, except
# [CLS], [SEP] and padding. The special-token ids are read from the tokenizer
# instead of being hard-coded, so the cell stays correct for other vocabularies.
mask_arr = (
    (rand < 0.15)
    & (inputs.input_ids != tokenizer.cls_token_id)
    & (inputs.input_ids != tokenizer.sep_token_id)
    & (inputs.input_ids != tokenizer.pad_token_id)
)

In [None]:
# Make sure every row has at least one masked position and record, per row,
# the list of masked indices in `selection`.
selection = []

cls_id = tokenizer.cls_token_id
sep_id = tokenizer.sep_token_id
pad_id = tokenizer.pad_token_id

for i in range(inputs.input_ids.shape[0]):
    row = inputs.input_ids[i]
    # Explicit emptiness check on the boolean mask (the sum of the selected
    # indices is not a reliable "nothing selected" signal).
    if not mask_arr[i].any():
        # Only real tokens are candidates: never [CLS], [SEP] or padding.
        maskable = (row != cls_id) & (row != sep_id) & (row != pad_id)
        candidates = torch.flatten(maskable.nonzero()).tolist()
        if candidates:
            idx = candidates[np.random.randint(len(candidates))]
        else:
            # Degenerate row without any real token (e.g. an empty line):
            # keep the previous fallback of drawing from the non-padding
            # prefix so that every row still has one masked position.
            pad_hits = torch.flatten((row == pad_id).nonzero())
            zero_idx = int(pad_hits[0]) if pad_hits.numel() else row.shape[0]
            idx = np.random.randint(max(zero_idx, 1))
        mask_arr[i, idx] = True

    selection.append(
        torch.flatten(mask_arr[i].nonzero()).tolist()
    )

In [None]:
# Replace the selected positions with the tokenizer's [MASK] id and restrict
# the training targets to exactly those positions.
mask_token_id = tokenizer.mask_token_id

for i in range(inputs.input_ids.shape[0]):
    inputs.input_ids[i, selection[i]] = mask_token_id
    # Labels equal to -100 are ignored by the masked-LM loss. Without this the
    # loss is averaged over every position, including unmasked tokens and
    # padding, which swamps the signal from the masked tokens.
    scored = torch.zeros_like(inputs['labels'][i], dtype=torch.bool)
    scored[selection[i]] = True
    inputs['labels'][i, ~scored] = -100

In [None]:
# Inspect one encoded row after the mask substitution (requires at least 57
# sentences).
inputs.input_ids[56]

### Готовим датасет

In [None]:
class DiariesDataset(torch.utils.data.Dataset):
    """Map-style dataset over a tokenizer output.

    ``encodings`` is a mapping from field name (``input_ids``, ``labels``, ...)
    to a per-example indexable container, e.g. the object returned by a
    Hugging Face tokenizer or a plain ``dict`` of tensors or lists.
    """

    def __init__(self, encodings):
        self.encodings = encodings

    def __getitem__(self, idx):
        """Return example ``idx`` as a dict of tensors, one per field.

        Each tensor is an independent copy, so modifying an item never
        changes the stored encodings.
        """
        item = {}
        for key, val in self.encodings.items():
            row = val[idx]
            if isinstance(row, torch.Tensor):
                # torch.tensor(<tensor>) emits a copy-construct UserWarning on
                # every call; clone().detach() is the recommended equivalent.
                item[key] = row.clone().detach()
            else:
                item[key] = torch.tensor(row)
        return item

    def __len__(self):
        """Number of examples, taken from the ``input_ids`` field."""
        # Key access works for tokenizer outputs and for plain dicts alike.
        return len(self.encodings["input_ids"])

In [None]:
# Wrap the masked encodings so that they can be served example by example.
dataset = DiariesDataset(inputs)

In [None]:
# Mini-batches of 8 examples, drawn in shuffled order.
loader = torch.utils.data.DataLoader(dataset, batch_size=8, shuffle=True)

In [None]:
# Train on the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(device)
# Move the model's parameters to the chosen device.
model.to(device)

In [None]:
from torch.optim import AdamW

# Switch the model to training mode before any optimisation step is taken.
model.train(mode=True)
# AdamW over all model parameters with a learning rate of 5e-5.
optim = AdamW(params=model.parameters(), lr=5e-5)

In [None]:
from tqdm.notebook import tqdm  # for our progress bar

epochs = 3

for epoch in range(epochs):
    # One progress bar per epoch, wrapped around the dataloader.
    loop = tqdm(loader, leave=True)
    for batch in loop:
        # Drop the gradients accumulated during the previous step.
        optim.zero_grad()
        # Move the four tensors the model consumes onto the training device.
        input_ids, token_type_ids, attention_mask, labels = (
            batch[field].to(device)
            for field in ('input_ids', 'token_type_ids', 'attention_mask', 'labels')
        )
        # Forward pass; passing labels makes the model return the loss.
        outputs = model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            labels=labels,
        )

        loss = outputs.loss
        # Backpropagate, then apply the parameter update.
        loss.backward()
        optim.step()
        # Show the epoch index and the current batch loss on the progress bar.
        loop.set_description(f'Epoch {epoch}')
        loop.set_postfix(loss=loss.item())

    # Write a checkpoint at the end of every epoch (directory names are 1-based).
    model.save_pretrained(f'rubert_epoch_{epoch + 1}')