#### Дообучение модели DeepPavlov/rubert-base-cased 
В данном ноутбуке можно в деталях ознакомиться с решением нашей командой задачи дообучения rubert-модели под классификацию новостей. 

In [1]:
from pathlib import Path
import sys

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

from transformers import BertTokenizer, BertForSequenceClassification, BertModel

import torch
from torch import nn
import torch.nn.functional as F
from torchvision import datasets, transforms
from torch.utils.data import Dataset, DataLoader, random_split
from tqdm.notebook import tqdm


repo_dir = Path().resolve().parent
sys.path.append(str(repo_dir))

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
df = pd.read_csv('data.txt', delimiter='\n\"\t', header=None)
df.head()

  df = pd.read_csv('data.txt', delimiter='\n\"\t', header=None)


Unnamed: 0,0
0,"""Безработным севастопольцам помогают вложиться..."
1,"""\t5"
2,"""Продолжаем делиться с вами впечатлениями от в..."
3,"""\t5"
4,"""На прошлой неделе я принял участие в экспертн..."


In [3]:
df_texts = df.iloc[::2,0].reset_index()[[0]]
df_targets = df.iloc[1::2,0].str.replace('"\t', '').astype(int).reset_index()[[0]]

In [4]:
df = pd.concat([df_texts, df_targets], axis=1)
df.columns = ['texts','targets']
df.head()

Unnamed: 0,texts,targets
0,"""Безработным севастопольцам помогают вложиться...",5.0
1,"""Продолжаем делиться с вами впечатлениями от в...",5.0
2,"""На прошлой неделе я принял участие в экспертн...",5.0
3,"""Это была обычная среда. Моя душа поймала трев...",5.0
4,""" Отзыв о Тренинге """"Стартап Технический"""" мы ...",5.0


In [5]:
df.dropna(subset=['targets', 'texts'], inplace=True)

In [6]:
replacements = {
    original_class: i for i, original_class in enumerate(df['targets'].unique())
}
df['targets'].replace(replacements,inplace=True)
df['targets'] = df['targets'].astype(int)
df.sample(10)

Unnamed: 0,texts,targets
5557,"""Как прикрепиться к поликлиникам Истринской об...",5
2913,"""Οчeнь горячий танeц от Ченнинг Татум и Сaльма...",2
1853,""" Я очень люблю вдохновлять!. Может быть, с...",1
8679,"""Фотографии, снятые на пленочный фотоаппарат, ...",8
3460,"""ПОЧЕМУ НУЖНО ОСВОИТЬ ИНФОГРАФИКУ СЕЙЧАС? Спро...",3
4834,"""ХАЧАПУРИ. Мука - 3 стакана + Кефир - 1 стака...",4
6522,"""Создатели Warhammer Age of Sigmar Realms of R...",6
3559,"""Заработок до 6000 руб.: Дизайн статьи [Figma]...",3
2296,"""Οт проcмотрa этого aниме увеличивaетcя уровен...",2
8808,"""Микротоки лица, шеи, декольте Если Вы ещё не ...",8


In [7]:
from utils.preprocessing import textPreprocesser

preprocesser = textPreprocesser(df, ['texts'])
preprocesser.clean()
preprocesser.df.head(2)

Unnamed: 0,texts,targets
0,"[безработн, севастопольц, помога, влож, в, ста...",0
1,"[продолжа, дел, с, вам, впечатлен, от, взаимод...",0


Создаем датасет, в котором формируются эмбеддинги и по которому мы будем затем запускать обучение по батчам

In [8]:
class TextDataset(Dataset):

  def __init__(self, texts, targets, tokenizer, max_len=64):
    self.texts = texts
    self.targets = targets
    self.tokenizer = tokenizer
    self.max_len = max_len

  def __len__(self):
    return len(self.texts)

  def __getitem__(self, idx):
    text = str(self.texts[idx])
    target = self.targets[idx]

    encoding = self.tokenizer.encode_plus(
        text,
        add_special_tokens=True,
        max_length=self.max_len,
        return_token_type_ids=False,
        padding='max_length',
        return_attention_mask=True,
        return_tensors='pt',
        truncation=True
    )

    return {
      'text': text,
      'input_ids': encoding['input_ids'].flatten(),
      'attention_mask': encoding['attention_mask'].flatten(),
      'targets': torch.tensor(target, dtype=torch.long)
    }

In [9]:
tokenizer = BertTokenizer.from_pretrained('DeepPavlov/rubert-base-cased')

In [10]:
X = preprocesser.df['texts'].str.join(' ').str.replace('\.*', '', regex=True).values.compute()
y = df.iloc[:, 1].to_numpy()

In [11]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [12]:
train_dataset = TextDataset(X_train, y_train, tokenizer)
test_dataset = TextDataset(X_test, y_test, tokenizer)

Создаем валидационный датасет и даталоудеры на датасетах для дальнейшего обучения

In [13]:
# Валидационный датасет
val_ratio = 0.2

train_dataset, val_dataset = random_split(
    dataset=train_dataset, 
    lengths=[1-val_ratio, val_ratio],
)

In [14]:
train_dataloader = DataLoader(train_dataset, batch_size=16)
test_dataloader = DataLoader(test_dataset, batch_size=16)
val_dataloader = DataLoader(val_dataset, batch_size=16)

Архитектура моеди для файн-тюнинга rubert

In [15]:
class BERT_Arch(nn.Module):
    def __init__(self, bert, num_classes):
        super().__init__()
        self.bert = bert 
        self.dropout = nn.Dropout(0.5)
        self.relu =  nn.ReLU()
        self.fc1 = nn.Linear(768,512)
        self.fc2 = nn.Linear(512,num_classes)
        self.softmax = nn.LogSoftmax(dim=1)

    def forward(self, sent_id, mask):
        _, cls_hs = self.bert(sent_id, attention_mask=mask, return_dict=False)
        x = self.fc1(cls_hs)
        x = self.relu(x)
        x = self.dropout(x)
        x = self.fc2(x)
        x = self.softmax(x)
        return x

Очистка кэша куды для нового обучения

In [16]:
import torch
torch.cuda.empty_cache()

В качестве бейзлайна был выбран алгоритм ближайших соседей

In [17]:
training_data = []
for train_batch in train_dataset:
    training_data.append(train_batch)

training_data = [[data['input_ids'].detach().cpu().numpy(),data['targets'].detach().cpu().numpy()] for data in training_data]

training_data = pd.DataFrame(training_data)
X_train_sklearn, X_test_sklearn, y_train_sklearn, y_test_sklearn = train_test_split(training_data.iloc[:,0], training_data.iloc[:, 1], test_size=0.2, random_state=42)

In [20]:
from sklearn.linear_model import LogisticRegression

num_classes = np.unique(y).shape[0]
knn_clf=LogisticRegression()
knn_clf.fit(np.stack(X_train_sklearn.values),np.stack(y_train_sklearn.values))
ypred=knn_clf.predict(np.stack(X_test_sklearn.values)) 

In [22]:
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
pipe = Pipeline([('classifier' , RandomForestClassifier())])


param_grid = [
    {'classifier' : [LogisticRegression()],
     'classifier__penalty' : ['l1', 'l2'],
    'classifier__C' : np.logspace(-4, 4, 20),
    'classifier__solver' : ['liblinear']},
    {'classifier' : [RandomForestClassifier()],
    'classifier__n_estimators' : list(range(10,101,10)),
    'classifier__max_features' : list(range(6,32,5))}
]

# Create grid search object

clf = GridSearchCV(pipe, param_grid = param_grid, cv = 5, verbose=True, n_jobs=-1)

# Fit on data

best_clf = clf.fit(np.stack(X_train_sklearn.values),np.stack(y_train_sklearn.values))

Fitting 5 folds for each of 100 candidates, totalling 500 fits


In [25]:
ypred=best_clf.predict(np.stack(X_test_sklearn.values)) 

In [26]:
from sklearn.metrics import f1_score, accuracy_score
y_true = np.stack(y_test_sklearn.values).flatten()
{
    'f1': f1_score(y_true, ypred, average='macro'),
    'acc': accuracy_score(y_true, ypred)
}

{'f1': 0.3346933096649962, 'acc': 0.3307291666666667}

In [39]:
import pickle
# save the classifier
with open('model.pkl', 'wb') as fid:
    pickle.dump(best_clf, fid)

Непосредственный файн-тюнинг модели. TL;DR:
* **скорость** дообучения равна 0,001
* для **оптимизации** дообучения используется warm-up scheduler, который подходит для обучения трансформеров, таким образом и нашей модели (https://ufal.mff.cuni.cz/pbml/110/art-popel-bojar.pdf)
* **количество батчей** равно 16 (для избавления от проблемы отсутствия памяти в CUDA)
* в качестве **функции потерь** была использована кросс-энтропия (CrossEntropyLoss)
* количество **эпох** равно 10

In [27]:
from utils.engine import Trainer
from utils.optim import WarmupScheduler

num_classes = np.unique(y).shape[0]
bert = BertModel.from_pretrained('DeepPavlov/rubert-base-cased')
model = BERT_Arch(bert,num_classes)
optimizer = torch.optim.Adam(model.parameters(), lr=.0001, weight_decay=1e-3)
criterion = nn.CrossEntropyLoss()
batch_size = 16
scheduler = WarmupScheduler(optimizer)

trainer = Trainer(
    model, 
    optimizer=optimizer, 
    criterion=criterion,
    train_dataset=train_dataset, 
    val_dataset=val_dataset, 
    batch_size=batch_size,
    scheduler=scheduler,
    path = 'model.pt'
)

trainer.train(1)

Some weights of the model checkpoint at DeepPavlov/rubert-base-cased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.predictions.decoder.bias', 'cls.predictions.bias', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Training::   0%|          | 0/1 [00:00<?, ?it/s]

Epoch 0:   0%|          | 0/360 [00:00<?, ?it/s]

Validating:   0%|          | 0/90 [00:00<?, ?it/s]

In [28]:
from sklearn.metrics import f1_score, accuracy_score

def evaluate_loader(loader: DataLoader, model: torch.nn.Module):
    
    with torch.no_grad():
        model.eval()
        N = 0
        total_loss = 0.0
        target_epoch = []
        predicted_epoch = []
        for i, input_ in enumerate(loader):
            inputs = input_['input_ids'].to('cuda')
            mask = input_['attention_mask'].to('cuda')
            targets = input_['targets'].to('cuda')
            outputs = model(inputs, mask).to('cuda')
            N += inputs.shape[0]
            predicted_targets = outputs.argmax(dim=1)
            target_epoch.append(targets.detach().cpu().numpy())
            predicted_epoch.append(predicted_targets.detach().cpu().numpy())
        f1 = f1_score(
            np.concatenate(target_epoch),
            np.concatenate(predicted_epoch),
            average='macro'
        )
        acc = accuracy_score(np.concatenate(target_epoch),
            np.concatenate(predicted_epoch))
        
  
        return {
            'p': predicted_epoch,
            'acc': acc,
            'f1': f1,
        }


In [29]:
bert = BertModel.from_pretrained('DeepPavlov/rubert-base-cased')
model = BERT_Arch(bert, 24)
model = torch.load('model.pt')

evaluate_loader(test_dataloader, model)

Some weights of the model checkpoint at DeepPavlov/rubert-base-cased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.predictions.decoder.bias', 'cls.predictions.bias', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


{'p': [array([7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7], dtype=int64),
  array([7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7], dtype=int64),
  array([7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7], dtype=int64),
  array([7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7], dtype=int64),
  array([7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7], dtype=int64),
  array([7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7], dtype=int64),
  array([7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7], dtype=int64),
  array([7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7], dtype=int64),
  array([7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7], dtype=int64),
  array([7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7], dtype=int64),
  array([7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7], dtype=int64),
  array([7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7], dtype=int64),
  array([7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7], dtype=int64),
  array([7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7], d