  # ДЗ №4. Трансформеры

In [None]:
import warnings
warnings.filterwarnings("ignore")

import torch
import numpy as np
import pandas as pd
import random

from tqdm import tqdm
from transformers import AdamW, BertTokenizer, BertForSequenceClassification, get_linear_schedule_with_warmup
from torch import nn
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler, TensorDataset
from sklearn.metrics import matthews_corrcoef
from sklearn.model_selection import train_test_split

random.seed(123)
np.random.seed(123)
torch.manual_seed(123)
torch.cuda.manual_seed_all(123)

ImportError: cannot import name 'RandomSampler' from 'transformers' (c:\Users\Vasilij\home\projects\git\otus\nlp\otus-nlp-homeworks\.venv\Lib\site-packages\transformers\__init__.py)

In [83]:
if torch.cuda.is_available():
    print('CUDA доступен. Имя используемого GPU:', torch.cuda.get_device_name(0))

    _device = torch.device("cuda")
else:
    print('CUDA недоступен. Для вычислений будет использоватся CPU.')

    _device = torch.device("cpu")

CUDA недоступен. Для вычислений будет использоватся CPU.


## EDA

Загрузим данные и разделим их на тренировочный, тестовый и валидационный наборы.

In [None]:
_data_train_val = pd.read_csv('./data/in_domain_train.csv')
#_data_train_val = _data_train_val[:(int(len(_data_train_val) / 50))] #TODO remove

_X_train, _X_val, _y_train, _y_val = train_test_split(_data_train_val['sentence'], _data_train_val['acceptable'], test_size=0.1, random_state=123)

_X_train = _X_train.to_numpy()
_X_val = _X_val.to_numpy()
_y_train = _y_train.to_numpy()
_y_val = _y_val.to_numpy()

_data_test = pd.read_csv('./data/in_domain_dev.csv')
#_data_test = _data_test[:(int(len(_data_test) / 20))] #TODO remove

_X_test = _data_test['sentence']
_y_test = _data_test['acceptable']

print('Размер набора данных')
print('train:', len(_X_train))
print('validation:', len(_X_val))
print('test:', len(_X_test))

Размер набора данных
train: 141
validation: 16
test: 983


Посмотрим на баланс классов.

In [85]:
#_y_train.hist()

Классы несбалансированы. Для оценки качества будем использовать MCC (Matthews Correlation Coefficient).

## BERT

### Обучение

Определим несколько утлитных классов.

In [2]:
def tokenize(tokenizer, X):
    input_ids = []
    attention_masks = []

    for document in X:
        encoded_dict = tokenizer.encode_plus(document,
                                             add_special_tokens=True,
                                             max_length=64, #TODO
                                             pad_to_max_length=True,
                                             return_attention_mask = True,
                                             return_tensors='pt')
        input_ids.append(encoded_dict['input_ids'])
        attention_masks.append(encoded_dict['attention_mask'])

    # преобразуем в тензоры
    input_ids = torch.cat(input_ids, dim=0)
    attention_masks = torch.cat(attention_masks, dim=0)

    return input_ids, attention_masks

In [None]:
def train(model, optimizer, learning_rate_schedule, train_data_loader:DataLoader, val_data_loader:DataLoader, num_of_epochs = 3):
    model.to(_device)

    for epoch in range(num_of_epochs):
        print(f'=== Эпоха {epoch + 1}/{num_of_epochs} ===')
        print('Обучение...')

        train_epoch(model, optimizer, learning_rate_schedule, train_data_loader)

        print('Валидация...')
        test(model, val_data_loader)

def train_epoch(model, optimizer, learning_rate_schedule, data_loader:DataLoader):
    # переводим модель в режим обучения
    model.train()

    total_loss = 0

    for batch in tqdm(data_loader):
        # извлекаем данные из батча и перемещаем их на устройство
        input_ids = batch[0].to(_device)
        attention_mask = batch[1].to(_device)
        labels = batch[2].to(_device)

        # обнуляем предыдущие значения градиентов
        model.zero_grad()

        # делаем предсказание
        pred = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)

        loss = pred.loss
        total_loss += loss.item()

        # вычисляем градиент функции потерь
        loss.backward()

        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
                
        # обновляем веса модели
        optimizer.step()

        learning_rate_schedule.step()

    avg_loss = total_loss / len(data_loader)
    print(f'Loss: {avg_loss}')
    
def test(model, data_loader:DataLoader):
    model.to(_device)

    model.eval() # переводим модель в режим использования

    total_loss = 0

    batch_logits = []
    batch_labels = []

    for batch in tqdm(data_loader):
        input_ids = batch[0].to(_device)
        attention_mask = batch[1].to(_device)
        labels = batch[2].to(_device)

        with torch.no_grad():
            pred = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
            total_loss += pred.loss.item()

            batch_logits.append(pred.logits.detach().cpu().numpy())
            batch_labels.append(labels.to('cpu').numpy())

    logits = np.concatenate(batch_logits, axis=0)
    labels = np.concatenate(batch_labels, axis=0)

    print(f'Loss: {total_loss / len(data_loader)}')
    print(f'Accuracy: {calculate_accuracy(logits, labels)}')
    print(f'MCC: {calculate_mcc(logits, labels)}')

def calculate_accuracy(logits, labels):
    y_true = labels.flatten()
    y_pred = np.argmax(logits, axis=1).flatten()
    return np.sum(y_pred == y_true) / len(y_true)

def calculate_mcc(logits, labels):
    y_true = labels.flatten()
    y_pred = np.argmax(logits, axis=1).flatten()
    return matthews_corrcoef(y_true, y_pred)

Загрузим модель.

In [88]:
_bert_model = BertForSequenceClassification.from_pretrained('ai-forever/ruBert-base',
                                                            num_labels = 2,
                                                            output_attentions = False,
                                                            output_hidden_states = False)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at ai-forever/ruBert-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Токенизируем корпус и обучим модель.

In [None]:
def prepare_dataset(X, y, sampler_class):
    input_ids, attention_masks = tokenize(X)
    tensor_y = torch.tensor(y)
    tensor_dataset = TensorDataset(input_ids, attention_masks, tensor_y)
    sampler = sampler_class(tensor_dataset)
    return DataLoader(dataset=tensor_dataset, batch_size=32, shuffle=True)

_train_data_loader = prepare_dataset(_X_train, _y_train, sampler_class=RandomSampler)
_val_data_loader = prepare_dataset(_X_val, _y_val, sampler_class=RandomSampler)
_test_data_loader = prepare_dataset(_X_test, _y_test, sampler_class=SequentialSampler)

In [None]:
_num_of_epochs = 3
_num_training_steps = len(_train_data_loader) * _num_of_epochs

_optimizer = AdamW(_bert_model.parameters())
_learning_rate_scheduler = get_linear_schedule_with_warmup(_optimizer, num_warmup_steps=0, num_training_steps=_num_training_steps)

train(_bert_model, _optimizer, _learning_rate_scheduler, _train_data_loader, _val_data_loader, num_of_epochs=_num_of_epochs)

=== Эпоха 1/1 ===
Обучение...


100%|██████████| 5/5 [00:32<00:00,  6.43s/it]


Loss: 1.2409416556358337
Валидация...


100%|██████████| 1/1 [00:00<00:00,  1.04it/s]

Loss: 0.6967453956604004
[1 0 1 0 1 1 1 1 0 1 1 0 1 0 1 1] [1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1]
MCC: 0.0





In [90]:
test(_bert_model, _test_data_loader)

100%|██████████| 31/31 [01:04<00:00,  2.09s/it]

Loss: 0.6995247737053902
[1 1 1 0 1 1 1 1 1 0 1 1 1 1 1 1 1 0 1 1 1 1 1 0 1 1 1 0 1 1 1 1 1 0 1 1 0
 1 0 0 0 1 1 0 1 0 1 1 1 1 1 1 0 0 1 0 1 1 1 1 1 0 0 1 1 1 1 0 1 0 0 1 0 0
 1 1 0 0 1 1 1 1 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 1 1 1 1 1 0 1 1 1 0 1 0
 1 0 0 0 0 1 1 1 0 1 1 1 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 1 1 0 1 1 1 1
 1 1 1 1 0 1 1 1 0 1 1 1 1 0 1 0 0 1 1 1 1 1 1 0 0 1 1 1 1 1 1 0 1 1 1 1 1
 1 1 0 1 1 1 1 1 1 0 1 0 0 1 1 1 1 1 1 1 1 0 1 1 1 1 0 1 1 0 1 1 0 1 1 1 1
 1 1 1 1 1 1 1 0 0 1 1 1 1 0 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 1 1 1 0 1 1
 1 1 1 0 1 1 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 1 1 1 0 0 0 1 1 1 1 1 1 1 1
 1 1 1 0 1 1 1 0 1 1 0 1 1 1 1 0 1 0 1 1 1 1 1 1 0 1 0 1 1 0 1 1 0 1 1 1 0
 1 0 1 1 1 1 0 0 1 1 1 1 1 1 1 1 0 1 1 1 1 1 0 1 1 1 1 1 1 0 1 1 1 1 1 1 1
 1 1 0 1 1 1 1 0 1 1 0 0 1 1 1 1 1 0 0 1 0 0 0 0 1 0 0 0 0 1 0 1 0 0 1 1 0
 1 1 1 1 1 1 0 0 1 0 1 1 1 1 1 0 0 1 1 1 1 1 1 1 1 0 0 1 1 1 0 1 1 1 1 1 1
 0 1 1 0 1 1 1 1 1 0 1 1 1 1 0 1 1 0 1 1 1 1 1 1 0 1 1 1 1 1 1 1 1 1 1 0 1





## Few-/zero-shot с GPT3

## RuT5

### Обучение

### Тестирование