# ruBERT text classification

## Шаг 0 - Описание

Полезные ссылки:

https://newtechaudit.ru/klassifikacziya-teksta-s-ispolzovaniem-modelej-transformerov/

https://huggingface.co/datasets/zloelias/kinopoisk-reviews

https://huggingface.co/ai-forever/ruBert-base

## Шаг 1 - Подготовка окружения

In [None]:
# Install the project dependencies listed in requirements.txt

%pip install -r requirements.txt

In [None]:
import re
import numpy as np
import pandas as pd
from tqdm import tqdm
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, f1_score

import torch
import torch.nn as nn
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler

# Register tqdm with pandas so the progress_* DataFrame/Series methods are available
tqdm.pandas()

# Pick the best available accelerator: Apple-silicon MPS first (the original
# target of this notebook), then CUDA, and finally the CPU, so the notebook
# still runs on machines where MPS is not available.
if torch.backends.mps.is_available():
    device = torch.device('mps')
elif torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

## Шаг 2 - Подготовка данных

In [None]:
from datasets import load_dataset

# Load the Kinopoisk reviews dataset by its Hugging Face Hub identifier
dataset = load_dataset("zloelias/kinopoisk-reviews")

In [None]:
# Display one training record (index 10) to inspect its structure
dataset['train'][10]

In [None]:
# Pull the raw review texts out of the train and test splits
train_text, test_text = (dataset[split]['text'] for split in ('train', 'test'))

In [None]:
# Number of whitespace-separated words in every training review
seq_len = list(map(lambda text: len(str(text).split()), train_text))

# Histogram of the review lengths (50 bins)
pd.Series(seq_len).hist(bins=50)

In [None]:
# Import the tokenizer here: this cell runs before the model-preparation cell
# that otherwise brings it into scope, so a top-to-bottom run of the notebook
# would fail with NameError without this import.
from base import tokenizer

# Encode the training reviews, padding/truncating each one to 350 tokens
tokens_train = tokenizer.batch_encode_plus(
    train_text,
    max_length=350,
    padding='max_length',
    truncation=True,
)

## Шаг 3 - Подготовка модели

In [None]:
from transformers import pipeline
from base import baseModel, tokenizer

# Quick sanity check of the base model through a fill-mask pipeline
unmasker = pipeline(
    'fill-mask',
    model=baseModel,
    tokenizer=tokenizer,
)

# Show the completed sentences proposed for the masked position
[candidate['sequence'] for candidate in unmasker("[MASK] - самый классный город в мире")]

In [None]:
# Encode the test reviews, padding/truncating each one to 350 tokens
tokens_test = tokenizer.batch_encode_plus(
    test_text, max_length=350, padding='max_length', truncation=True
)

In [None]:
# Labels of the training split
train_labels = dataset['train']['labels']

# Token ids and attention masks of the encoded training texts, as tensors
train_seq, train_mask = (
    torch.tensor(tokens_train[field]) for field in ('input_ids', 'attention_mask')
)
train_y = torch.tensor(train_labels)

In [None]:
# Labels of the test split
test_labels = dataset['test']['labels']

# Token ids and attention masks of the encoded test texts, as tensors
test_seq, test_mask = (
    torch.tensor(tokens_test[field]) for field in ('input_ids', 'attention_mask')
)
test_y = torch.tensor(test_labels)

In [None]:
# Bundle token ids, attention masks and labels into one dataset and iterate
# over it in random order, 8 samples per batch
train_data = TensorDataset(train_seq, train_mask, train_y)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(
    dataset=train_data,
    batch_size=8,
    sampler=train_sampler,
)

In [None]:
# Number of batches the training dataloader yields per epoch
len(train_dataloader)

In [None]:
# transformers.AdamW is deprecated and no longer shipped in recent transformers
# releases, so use the PyTorch implementation instead.
from torch.optim import AdamW
from model import BERT_Arch

# Instantiate the classifier and move it to the selected device
model = BERT_Arch().to(device)

# eps and weight_decay are set explicitly to the defaults of the former
# transformers.AdamW (1e-6 and 0.0); torch.optim.AdamW would otherwise use
# 1e-8 and 0.01 and silently change the optimisation behaviour.
optimizer = AdamW(model.parameters(), lr=1e-3, eps=1e-6, weight_decay=0.0)

## Шаг 4 - Обучение модели

In [None]:
from sklearn.utils.class_weight import compute_class_weight

# 'balanced' class weights computed from the training labels
class_weights = compute_class_weight(
    class_weight='balanced',
    classes=np.unique(train_labels),
    y=train_labels,
)

print(class_weights)

# Float tensor of the weights on the training device, passed to the loss
weights = torch.tensor(class_weights, dtype=torch.float).to(device)
cross_entropy = nn.CrossEntropyLoss(weight=weights)

In [None]:
def train():
    """Run one training epoch over ``train_dataloader``.

    Uses the notebook-level ``model``, ``optimizer``, ``cross_entropy``,
    ``train_dataloader`` and ``device`` objects.

    Returns:
        tuple: ``(avg_loss, total_preds)`` - the loss averaged over the number
        of batches, and a NumPy array with the model outputs of every batch
        concatenated along axis 0.
    """
    model.train()
    running_loss = 0
    batch_outputs = []
    n_batches = len(train_dataloader)

    for batch in tqdm(train_dataloader, total=n_batches):
        # Move the three tensors of the batch to the training device
        sent_id, mask, labels = (tensor.to(device) for tensor in batch)

        # Reset gradients left over from the previous step
        model.zero_grad()
        outputs = model(sent_id, mask)
        loss = cross_entropy(outputs, labels)
        running_loss += loss.item()

        loss.backward()
        # Clip the gradient norm to 1.0 before the parameter update
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()

        # Keep a detached CPU copy of the outputs for the caller
        batch_outputs.append(outputs.detach().cpu().numpy())

    return running_loss / n_batches, np.concatenate(batch_outputs, axis=0)

In [None]:
best_valid_loss = float('inf')

# Per-epoch loss history
train_losses = []
valid_losses = []
epochs = 20
for epoch in range(epochs):
    print('\n Epoch{:} / {:}'.format(epoch+1, epochs))

    # train() returns (avg_loss, predictions). Only the scalar loss is tracked
    # here; binding the whole tuple would make the ':.3f' format below raise
    # TypeError and would store tuples in train_losses.
    train_loss = train()[0]

    train_losses.append(train_loss)

    print(f'\nTraining loss: {train_loss:.3f}')
