In [None]:
!pip install transformers --quiet 
!pip install gdown

In [None]:
import pandas as pd 
import numpy as np 
import nltk 
from nltk.tokenize import word_tokenize

In [None]:
# Fetch the NLTK 'punkt' data package (returns True on success, as shown in the cell output).
# NOTE(review): in the visible part of this notebook word_tokenize is only referenced in
# commented-out code, so this download may no longer be needed — confirm before removing.
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [None]:
# Download the archive with the Prozhito diary texts from Google Drive
# and save it in the working directory as 'prozhito_texts.zip'.
import gdown

# Google Drive share ("/view") link to the archive.
url = 'https://drive.google.com/file/d/1oHAmsa6Nxb1B2sbWriJXCExETwSirtMj/view?usp=sharing' 
# NOTE(review): fuzzy=True presumably lets gdown extract the file id from a share link
# instead of requiring a direct-download URL — confirm against the gdown docs.
gdown.download(url, 'prozhito_texts.zip', fuzzy=True)

In [None]:
# from google.colab import drive 

# drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
!unzip prozhito_texts.zip

In [None]:
import glob, os

# Collect every line of every .txt file below `rootdir`; each stripped line becomes
# one training "sentence" (blank lines are kept, as empty strings).
sentences = []
rootdir = 'prozhito_texts'
for subdir, dirs, files in os.walk(rootdir):
    # os.walk yields entries in filesystem order, which is not stable across machines.
    # Sorting `dirs` in place (this also steers the traversal) and sorting `files`
    # makes the order of `sentences` reproducible, so indices such as sentences[5]
    # refer to the same text on every run.
    dirs.sort()
    for filename in sorted(files):
        if filename.endswith('.txt'):
            path = os.path.join(subdir, filename)
            # The corpus is Russian text; decode explicitly as UTF-8 instead of
            # depending on the platform's default locale encoding.
            with open(path, encoding='utf-8') as f:
                for sentence in f:
                    sentences.append(sentence.strip())

In [None]:
# Number of lines collected from the corpus files.
len(sentences)

16084

In [None]:
# Display one entry as a sanity check of what was read from disk.
sentences[5]

'Воскресенье . Пасха ! Предполагаемый разговор с Дьяченко . Дорогой Боря ! Ваши взаимоотношения с Аллой дошли до физической ненависти , до несовместимости . И тут у тебя — тупик . Защитить мне тебя очень трудно , потому что , по общему мнению , ты играешь плохо , хотя весьма стараешься . Драматизм и подчас трагедия нашей профессии заключаются в том , что словами свой « образ » не защитишь . Аллу не исправишь , и уж коль она кривится ( а она — великая ) , психологически разберись , она же не в пионерском кружке самодеятельности , она чуть ли не каждый год в Париж ездит . А Эфроса нет , и судьба спектакля в ее руках . Новый главный далеко не поклонник спектакля , по всей видимости , он и тебя не « отметил » в своих кадрах . Так , например , Певцова он отметил и Яцко … Что делать ? Алла предлагает кандидатуру Беляева , и , если он согласится ( а по-моему , он может это сделать ) , — он убьет тебя . Или два состава . Что делать ? Взывать к этике , к человеческим качествам , но ведь она дум

## Загружаем машины

In [None]:
import transformers 
from transformers import BertTokenizer, BertForMaskedLM
import torch

In [None]:
# Seed both random number generators used below: torch.rand builds the mask array
# and np.random.randint picks the fallback mask position.
seed = 42 
torch.manual_seed(seed)
np.random.seed(seed) 

In [None]:
# Checkpoint identifier passed to from_pretrained() for both the tokenizer and the model.
MODEL_PATH = "DeepPavlov/rubert-base-cased"

In [None]:
# s = 'Привет! Как ты?' 
# for word in word_tokenize(s): 
#     print(word)

In [None]:
# Load the tokenizer and the BERT model with a masked-language-modelling head
# from the same checkpoint (MODEL_PATH).
tokenizer = BertTokenizer.from_pretrained(MODEL_PATH) 
model = BertForMaskedLM.from_pretrained(MODEL_PATH) 

Some weights of the model checkpoint at DeepPavlov/rubert-base-cased were not used when initializing BertForMaskedLM: ['cls.seq_relationship.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [None]:
# inputs = tokenizer(s_masked, return_tensors="pt") 
# labels = tokenizer(s, return_tensors="pt")["input_ids"]

# outputs = model(**inputs, labels=labels) 
# loss = outputs.loss 
# loss.backward()

In [None]:
# Encode the whole corpus in a single tokenizer call. Every row is truncated or
# padded to exactly 512 ids and the result is returned as PyTorch tensors.
inputs = tokenizer(
    sentences,
    max_length=512,
    truncation=True,
    padding='max_length',
    return_tensors='pt',
)

In [None]:
# Keep an independent copy of the original token ids as prediction targets,
# taken before any position is overwritten with the mask token.
# NOTE(review): at this point every position (including padding) carries a real
# target id; whether non-masked positions are excluded from the loss depends on
# later cells — confirm against the Hugging Face `labels` / -100 convention.
inputs['labels'] = inputs.input_ids.detach().clone()

In [None]:
# Draw one uniform random number in [0, 1) per token position (same shape as input_ids).
rand = torch.rand(inputs.input_ids.shape)
# Special-token ids come from the tokenizer instead of being hard-coded
# (101 / 102 / 0 for this checkpoint), so the mask stays correct if MODEL_PATH
# is changed to a checkpoint with a different vocabulary.
special_ids = (tokenizer.cls_token_id, tokenizer.sep_token_id, tokenizer.pad_token_id)
# A position is selected for masking when its draw is below 0.15
# and it does not hold [CLS], [SEP] or padding.
mask_arr = rand < 0.15
for special_id in special_ids:
    mask_arr = mask_arr & (inputs.input_ids != special_id)

In [None]:
# For every row, record the list of positions that will be replaced by [MASK].
selection = []

for i in range(inputs.input_ids.shape[0]):
    # The 15% random draw can leave a (typically short) row with nothing selected.
    # `.any()` tests that directly instead of summing the selected indices.
    if not mask_arr[i].any():
        row_ids = inputs.input_ids[i]
        # Fallback: force exactly one masked position, chosen uniformly among the
        # row's real tokens. The previous fallback drew from range(first_pad_index),
        # which includes position 0 ([CLS]) and the [SEP] position, so a special
        # token could end up masked.
        is_real_token = (
            (row_ids != tokenizer.cls_token_id)
            & (row_ids != tokenizer.sep_token_id)
            & (row_ids != tokenizer.pad_token_id)
        )
        candidates = torch.flatten(is_real_token.nonzero()).tolist()
        if candidates:
            idx = candidates[np.random.randint(len(candidates))]
            mask_arr[i, idx] = True
        # A row without any real token (e.g. an empty line) stays unmasked:
        # it has nothing meaningful to predict.

    selection.append(
        torch.flatten(mask_arr[i].nonzero()).tolist()
    )

In [None]:
# Turn the per-row position lists into one boolean matrix of prediction targets.
is_target = torch.zeros_like(inputs.input_ids, dtype=torch.bool)
for i, positions in enumerate(selection):
    if positions:  # rows with no selected position are left untouched
        is_target[i, positions] = True

# Overwrite the selected positions with the tokenizer's [MASK] id (103 for this checkpoint).
inputs.input_ids[is_target] = tokenizer.mask_token_id

# Restrict the loss to the masked positions. Hugging Face models ignore label -100,
# so without this the loss would also be computed on every unmasked token and on
# all padding positions, which dominate a row padded to a fixed length.
inputs['labels'][~is_target] = -100

In [None]:
# Display one encoded row after masking; the id 103 marks the replaced positions.
inputs.input_ids[56]

tensor([   101,   9110,   1984,    164,    164,    304,   7472,    878,    106,
           103,  28850,   6675,   1703,  11393,   7840,    869,    103,    898,
         14198,  70663,   2761, 109813,  11401,    156,    304,    839, 101527,
           128,    103,   4609,   8973,    103,   7853,  23198,    103,    326,
           132,  36335,   4650,   9450,  38784,    128,   4609,   1699,    103,
           128,   2739,    103,    861,  38784,   6305,  40819,    106,  29195,
          1655,   1699,    103,    128,   5247,   4609,   8152,    103,  14198,
           845,  58164,   8151,  13144,  54492,    842,    128,    103,   1699,
          3815,   1997,   2067,  30145,    132,    103,   5853,    130,  10785,
           128,    861,  38784,  15380,   1699,   8470,  12006,    132,  52275,
         19202,    851,    845,  62197,    842,   8953,    845,    103,  31918,
           861,  19555,  29255,    132,    777,  26916,    877,   2752,    845,
         50425,    103,  85529,  53542, 

### Готовим датасет

In [None]:
class DiariesDataset(torch.utils.data.Dataset):
    """Map-style dataset that serves one row of a tokenizer encoding per index.

    Args:
        encodings: mapping from field name (e.g. 'input_ids', 'labels') to an
            indexable batch of rows. It must provide ``.items()`` and item
            access by key, so both a tokenizer output and a plain dict work.
    """

    def __init__(self, encodings):
        self.encodings = encodings

    def __getitem__(self, idx):
        """Return ``{field: tensor}`` holding an independent copy of row ``idx``."""
        item = {}
        for key, val in self.encodings.items():
            row = val[idx]
            if isinstance(row, torch.Tensor):
                # torch.tensor(<tensor>) emits a copy-construct UserWarning for
                # every sample; clone().detach() is the recommended way to copy.
                item[key] = row.clone().detach()
            else:
                # Rows stored as plain Python lists still need converting.
                item[key] = torch.tensor(row)
        return item

    def __len__(self):
        """Number of rows, taken from the 'input_ids' field."""
        # Key access (rather than attribute access) also works for plain dicts.
        return len(self.encodings['input_ids'])

In [None]:
# Wrap the masked encodings (input_ids, token_type_ids, attention_mask, labels) in the dataset.
dataset = DiariesDataset(inputs)

In [None]:
# Serve the dataset in shuffled mini-batches of 4 rows.
loader = torch.utils.data.DataLoader(dataset, batch_size=4, shuffle=True)

In [None]:
# Use the GPU when PyTorch can see one; otherwise stay on the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(device)
# Move the model to the chosen device. As the last expression of the cell,
# this also displays the model's module structure.
model.to(device)

cuda


BertForMaskedLM(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(119547, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=T

In [None]:
from torch.optim import AdamW

# Switch the model to training mode before fine-tuning.
model.train()
# AdamW over all model parameters with a learning rate of 5e-5.
optim = AdamW(model.parameters(), lr=5e-5)

In [None]:
from tqdm.notebook import tqdm  # for our progress bar

# Number of full passes over the masked corpus.
epochs = 3

for epoch in range(epochs):
    # One progress bar per epoch, wrapping the shuffled loader.
    loop = tqdm(loader, leave=True)
    for batch in loop:
        # Move this batch's tensors onto the same device as the model.
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        token_type_ids = batch['token_type_ids'].to(device)
        labels = batch['labels'].to(device)

        # Clear gradients left from the previous step before the new backward pass.
        optim.zero_grad()
        # With `labels` supplied, the model output exposes the training loss as `.loss`.
        outputs = model(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            labels=labels,
        )
        loss = outputs.loss

        # Backpropagate, then apply the optimizer update.
        loss.backward()
        optim.step()

        # Show the (0-based) epoch index and the current batch loss on the bar.
        loop.set_description(f'Epoch {epoch}')
        loop.set_postfix(loss=loss.item())

    # Checkpoint after every epoch; directory names are 1-based (rubert_epoch_1, ...).
    model.save_pretrained(f'rubert_epoch_{epoch + 1}')

  0%|          | 0/4021 [00:00<?, ?it/s]

  """


KeyboardInterrupt: ignored