# Использование предобученных трансформеров

In [1]:
! pip install transformers[torch]

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers[torch]
  Downloading transformers-4.30.2-py3-none-any.whl (7.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.2/7.2 MB[0m [31m99.7 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.14.1 (from transformers[torch])
  Downloading huggingface_hub-0.15.1-py3-none-any.whl (236 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m236.8/236.8 kB[0m [31m33.5 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1 (from transformers[torch])
  Downloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m77.6 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting safetensors>=0.3.1 (from transformers[torch])
  Downloading safetensors-0.3.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64

In [2]:
! pip install transformers[sentencepiece]

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting sentencepiece!=0.1.92,>=0.1.91 (from transformers[sentencepiece])
  Downloading sentencepiece-0.1.99-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m60.4 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: sentencepiece
Successfully installed sentencepiece-0.1.99


In [3]:
# стандартные библиотеки
import os, re
import numpy as np
from time import time
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
from sklearn.metrics import classification_report
import pandas as pd
from collections import Counter
from string import punctuation
import matplotlib.pyplot as plt
%matplotlib inline


# tf и huggingface
import tensorflow as tf
from transformers import TFAutoModel
from transformers import AutoTokenizer
from transformers import AutoModelForSequenceClassification
from transformers import Trainer, TrainingArguments

import torch

In [4]:
DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
DEVICE

device(type='cuda')

Возьмем данные lenta.ru, но не целиком. Fine-tuning больших моделей лучше всего подходит, когда данных совсем мало и стандартным алгоритмам просто не хватает информации, чтобы обучиться. Поэтому возьмем только небольшой процент всех данных.

In [5]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [6]:
data = pd.read_csv('/content/drive/MyDrive/ВШЭ/Магистратура/NLP/lenta_sample.csv')
data.dropna(subset=['topic', 'text'], inplace=True)

In [7]:
data.shape

(607, 6)

Будем обучаться на заголовках, а не на самих текстах

In [8]:
data.title.values[:5], data.topic[:5]

(array(['Московская милиция ужесточила паспортный режим',
        'Московского студента ограбили на\xa06\xa0миллионов рублей',
        'В Ставропольском крае обезврежены боевики',
        'Лужков отказался трудоустраивать китайцев с\xa0Черкизовского рынка',
        'По факту пожара на\xa0заводе в\xa0Югре заведено дело'],
       dtype=object),
 0    Россия
 1    Россия
 2    Россия
 3    Россия
 4    Россия
 Name: topic, dtype: object)

In [9]:
texts = data.title.values
id2label = {i:l for i,l in enumerate(set(data.topic))}
label2id = {l:i for i,l in id2label.items()}
targets = [label2id[l] for l in data.topic]

In [10]:
train_texts, test_texts, train_targets, test_targets = train_test_split(texts, targets, test_size=0.05, stratify=targets)

In [12]:
test_texts[:5]

array(['"Кассини" сфотографировал восход спутников над кольцами Сатурна',
       'Maybelline снимет в\xa0рекламе модель-азиатку И-Хуа У',
       'Еврокомиссия предложила снизить НДС для ресторанов и\xa0парикмахерских',
       'Полное уничтожение элитной гостиницы за\xa010\xa0секунд попало на\xa0видео',
       'На юге Англии появилось фальшивое граффити Бэнкси'], dtype=object)

In [13]:
test_targets[:5]

[4, 15, 8, 6, 7]

In [11]:
train_texts = train_texts.tolist()
test_texts = test_texts.tolist()

In [12]:
MAX_LEN = 512

In [13]:
class Dataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {k: torch.tensor(v[idx]) for k, v in self.encodings.items()}
        item["labels"] = torch.tensor([self.labels[idx]])
        return item

    def __len__(self):
        return len(self.labels)

Список всех доступных моделей можно найти тут - https://huggingface.co/models  
А вот тут основные с описанием - https://huggingface.co/transformers/pretrained_models.html

### DeepPavlov/rubert-base-cased

In [16]:
tokenizer_dp = AutoTokenizer.from_pretrained("DeepPavlov/rubert-base-cased")
model_dp = AutoModelForSequenceClassification.from_pretrained("DeepPavlov/rubert-base-cased", num_labels=len(label2id)).to(DEVICE)

Some weights of the model checkpoint at DeepPavlov/rubert-base-cased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.dense.weight', 'cls.predictions.decoder.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were n

In [17]:
train_enc_dp = tokenizer_dp(train_texts, truncation=True, padding=True, max_length=MAX_LEN)
test_enc_dp = tokenizer_dp(test_texts, truncation=True, padding=True, max_length=MAX_LEN)

In [26]:
train_dataset = Dataset(train_enc_dp, train_targets)
test_dataset = Dataset(test_enc_dp, test_targets)

In [28]:
# https://www.thepythoncode.com/code/finetuning-bert-using-huggingface-transformers-python
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=15,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=20,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=200,
    evaluation_strategy="steps",
)

In [30]:
trainer = Trainer(
    model=model_dp,                         # the instantiated Transformers model to be trained
    args=training_args,                     # training arguments, defined above
    train_dataset=train_dataset,            # training dataset
    eval_dataset=test_dataset,              # evaluation dataset
    )

In [31]:
trainer.train()

Step,Training Loss,Validation Loss
200,0.184,1.519929
400,0.044,2.55376
600,0.0313,2.487431
800,0.0074,2.858083
1000,0.0119,2.615438


TrainOutput(global_step=1080, training_loss=0.051691612463306495, metrics={'train_runtime': 164.776, 'train_samples_per_second': 52.435, 'train_steps_per_second': 6.554, 'total_flos': 97693134739200.0, 'train_loss': 0.051691612463306495, 'epoch': 15.0})

In [33]:
predictions = trainer.predict(test_dataset)
pred = list(np.argmax(predictions.predictions, axis=-1))
print(classification_report(test_targets, pred, labels=list(range(len(label2id))),
                            target_names=list(label2id), zero_division=0))

                   precision    recall  f1-score   support

        Экономика       0.50      0.50      0.50         2
           Россия       0.00      0.00      0.00         2
             Крым       0.00      0.00      0.00         0
          Легпром       0.50      1.00      0.67         1
Силовые структуры       0.67      1.00      0.80         2
              Мир       0.00      0.00      0.00         1
  Наука и техника       0.60      1.00      0.75         3
         Культура       0.50      0.33      0.40         3
         Из жизни       1.00      0.33      0.50         3
         Ценности       0.50      0.50      0.50         2
            Спорт       1.00      1.00      1.00         2
           Бизнес       0.00      0.00      0.00         2
   Интернет и СМИ       1.00      1.00      1.00         2
   69-я параллель       0.50      1.00      0.67         1
       Библиотека       0.00      0.00      0.00         0
              Дом       1.00      0.50      0.67       

А как нарисовать красивый график с историей обучения?

xlm-roberta-base давала f1 = 0.1 почему-то

cointegrated/rubert-tiny давала f1 = 0.26

mt5 и LaBSE не запустились

### cointegrated/rubert-tiny2



In [26]:
tokenizer_coin = AutoTokenizer.from_pretrained("cointegrated/rubert-tiny2")
model_coin = AutoModelForSequenceClassification.from_pretrained("cointegrated/rubert-tiny2", num_labels=len(label2id)).to(DEVICE)

In [18]:
MAX_LEN = 512

In [30]:
train_enc_coin = tokenizer_coin(train_texts, truncation=True, padding=True, max_length=MAX_LEN)
test_enc_coin = tokenizer_coin(test_texts, truncation=True, padding=True, max_length=MAX_LEN)

In [31]:
train_dataset = Dataset(train_enc_coin, train_targets)
test_dataset = Dataset(test_enc_coin, test_targets)

In [32]:
# https://www.thepythoncode.com/code/finetuning-bert-using-huggingface-transformers-python
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=30,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=20,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=200,
    evaluation_strategy="steps",
)

In [33]:
trainer_coin = Trainer(
    model=model_coin,                         # the instantiated Transformers model to be trained
    args=training_args,                     # training arguments, defined above
    train_dataset=train_dataset,            # training dataset
    eval_dataset=test_dataset,              # evaluation dataset
    )

In [34]:
trainer_coin.train()



Step,Training Loss,Validation Loss
200,2.8038,2.727407
400,2.5242,2.277668
600,1.8109,1.761022
800,1.0784,1.499253
1000,0.5738,1.439064
1200,0.2895,1.440316
1400,0.1556,1.413124
1600,0.096,1.442928
1800,0.0657,1.472498
2000,0.0534,1.54586


TrainOutput(global_step=2160, training_loss=0.8786867832695996, metrics={'train_runtime': 57.5911, 'train_samples_per_second': 300.046, 'train_steps_per_second': 37.506, 'total_flos': 6234156576000.0, 'train_loss': 0.8786867832695996, 'epoch': 30.0})

In [35]:
predictions = trainer_coin.predict(test_dataset)
pred = list(np.argmax(predictions.predictions, axis=-1))
print(classification_report(test_targets, pred, labels=list(range(len(label2id))),
                            target_names=list(label2id), zero_division=0))

                   precision    recall  f1-score   support

              Мир       0.00      0.00      0.00         1
   Интернет и СМИ       0.40      1.00      0.57         2
      Бывший СССР       0.00      0.00      0.00         3
            Спорт       0.67      1.00      0.80         2
  Наука и техника       1.00      0.67      0.80         3
   69-я параллель       0.00      0.00      0.00         1
         Из жизни       1.00      0.67      0.80         3
         Культура       1.00      1.00      1.00         3
        Экономика       0.25      0.50      0.33         2
             Крым       0.00      0.00      0.00         0
           Бизнес       0.50      0.50      0.50         2
          Легпром       0.00      0.00      0.00         1
       Библиотека       0.00      0.00      0.00         0
              Дом       0.67      1.00      0.80         2
           Россия       0.50      0.50      0.50         2
         Ценности       0.67      1.00      0.80       

### sismetanin/ruroberta-ru-rusentitweet

In [15]:
tokenizer = AutoTokenizer.from_pretrained("sismetanin/ruroberta-ru-rusentitweet")
model = AutoModelForSequenceClassification.from_pretrained("sismetanin/ruroberta-ru-rusentitweet", num_labels=len(label2id), ignore_mismatched_sizes=True).to(DEVICE)

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at sismetanin/ruroberta-ru-rusentitweet and are newly initialized because the shapes did not match:
- classifier.out_proj.weight: found shape torch.Size([5, 1024]) in the checkpoint and torch.Size([17, 1024]) in the model instantiated
- classifier.out_proj.bias: found shape torch.Size([5]) in the checkpoint and torch.Size([17]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [16]:
train_enc = tokenizer(train_texts, truncation=True, padding=True, max_length=MAX_LEN)
test_enc = tokenizer(test_texts, truncation=True, padding=True, max_length=MAX_LEN)

In [17]:
train_dataset = Dataset(train_enc, train_targets)
test_dataset = Dataset(test_enc, test_targets)

In [18]:
# https://www.thepythoncode.com/code/finetuning-bert-using-huggingface-transformers-python
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=30,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=20,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=200,
    evaluation_strategy="steps",
)

In [19]:
trainer = Trainer(
    model=model,                         # the instantiated Transformers model to be trained
    args=training_args,                     # training arguments, defined above
    train_dataset=train_dataset,            # training dataset
    eval_dataset=test_dataset,              # evaluation dataset
    )

In [20]:
trainer.train()



Step,Training Loss,Validation Loss
200,2.3728,1.57495
400,0.7772,1.538012
600,0.2862,2.737845
800,0.1179,3.442244
1000,0.0591,2.300742
1200,0.0185,2.273026
1400,0.0005,2.513854
1600,0.0004,2.458716
1800,0.0003,2.451779
2000,0.0036,2.267057


TrainOutput(global_step=2160, training_loss=0.3367229464757084, metrics={'train_runtime': 747.1541, 'train_samples_per_second': 23.128, 'train_steps_per_second': 2.891, 'total_flos': 1006536882954240.0, 'train_loss': 0.3367229464757084, 'epoch': 30.0})

In [23]:
predictions = trainer.predict(test_dataset)
pred = list(np.argmax(predictions.predictions, axis=-1))
print(classification_report(test_targets, pred, labels=list(range(len(label2id))),
                            target_names=list(label2id), zero_division=0))

                   precision    recall  f1-score   support

           Россия       0.50      0.50      0.50         2
          Легпром       1.00      1.00      1.00         1
           Бизнес       0.67      1.00      0.80         2
      Бывший СССР       1.00      0.67      0.80         3
         Из жизни       0.67      0.67      0.67         3
       Библиотека       0.00      0.00      0.00         0
   69-я параллель       1.00      1.00      1.00         1
        Экономика       1.00      1.00      1.00         2
   Интернет и СМИ       1.00      0.50      0.67         2
            Спорт       1.00      1.00      1.00         2
Силовые структуры       1.00      1.00      1.00         2
         Ценности       0.00      0.00      0.00         2
         Культура       0.50      0.67      0.57         3
  Наука и техника       1.00      0.67      0.80         3
              Дом       1.00      0.50      0.67         2
             Крым       0.00      0.00      0.00       

У этой модели лучше всего получилось.

1. Чем ROBERTA отличается от BERT? В какой статье описана ROBERTA?

2. Чем модель T5 отличается от ROBERTA/BERT?

RoBERTa - [RoBERTa: A Robustly Optimized BERT Pretraining Approach](https://arxiv.org/abs/1907.11692)

RoBERTa - хорошо натренированный берт, с большим числом гиперпараметров. У неё лучше качество, чем у берта. Она тренировалась дольше, на более длинных последовательностях, батчи были длиннее, данных было больше. Маскирование у роберты динамическое, то есть на разных этапах обучения разные токены маскируются.


T5 - энкодер-декодер модель, (Ro)BERT(a) - энкодер модель. (Ro)BERT(a) учились угадывать маскированное слово и чувствовать связь между предложениями. T5 учили предсказывать последовательность слов, поэтому она хорошо справляется с задачами, на текст отвечает подходящим текстом. (особенно здорово, если модель учили на нескольких задачах сразу)