# ruBERT text classification

## Шаг 0 - Описание

Полезные ссылки:

https://newtechaudit.ru/klassifikacziya-teksta-s-ispolzovaniem-modelej-transformerov/

https://huggingface.co/datasets/zloelias/kinopoisk-reviews

https://huggingface.co/ai-forever/ruBert-base

## Шаг 1 - Подготовка окружения

In [None]:
# Install the project dependencies listed in requirements.txt
# (%pip is the IPython magic, so this line only runs inside a notebook/IPython session).

%pip install -r requirements.txt

In [None]:
import torch

# Pick the compute backend in order of preference: MPS, then CUDA, then CPU.
if torch.backends.mps.is_available():
    deviceId = "mps"
elif torch.cuda.is_available():
    # NOTE: is_available must be *called*; the bare function object is always
    # truthy and would select "cuda" even on machines without a CUDA device.
    deviceId = "cuda"
else:
    deviceId = "cpu"

print("device =", deviceId)
device = torch.device(deviceId)

## Шаг 2 - Подготовка данных

In [None]:
from datasets import load_dataset

# Load the "zloelias/kinopoisk-reviews" dataset by its identifier;
# the 'train' and 'test' splits of the result are used by later cells.
dataset = load_dataset(path="zloelias/kinopoisk-reviews")

In [None]:
# Display a single record (index 10) of the training split to inspect its fields.
dataset["train"][10]

In [None]:
from model import baseModel, tokenizer
from functions import tensorDataset

# Wrap both splits with the project helper, using the same tokenizer for each:
# the training split first, then the test split.
train_data, test_data = (
    tensorDataset(tokenizer, dataset[split_name])
    for split_name in ("train", "test")
)

## Шаг 3 - Подготовка модели

In [None]:
from transformers import pipeline

# Quick sanity check of the base model and tokenizer through a fill-mask pipeline
# (assumes baseModel is a masked-language model - TODO confirm in model.py).
unmasker = pipeline(task="fill-mask", model=baseModel, tokenizer=tokenizer)

# Show only the completed sentences from the pipeline's candidate list.
candidates = unmasker("[MASK] - самый классный город в мире")
[candidate["sequence"] for candidate in candidates]

In [None]:
# Hyper-parameters shared by the cells below.
num_classes = 2        # passed to BERTClassifier
max_length = 128       # NOTE(review): not referenced by any visible cell - confirm where it is used
batch_size = 16        # batch size for both dataloaders
num_epochs = 4         # number of training-loop iterations
learning_rate = 2e-5   # learning rate handed to the optimizer

In [None]:
from model import BERTClassifier

# Build the classifier for `num_classes` labels, then move it to the selected device.
classifier = BERTClassifier(num_classes)
model = classifier.to(device)

# Print the module so its structure can be inspected in the cell output.
print(model)

## Шаг 4 - Обучение модели

Подготовка батчей

In [None]:
from torch.utils.data import DataLoader

# Training batches are reshuffled every epoch.
train_dataloader = DataLoader(
    dataset=train_data,
    batch_size=batch_size,
    shuffle=True,
)

# Evaluation batches keep the dataset order (shuffle is off by default).
val_dataloader = DataLoader(
    dataset=test_data,
    batch_size=batch_size,
)

Set up optimizer and learning rate scheduler

In [None]:
from torch.optim import AdamW
from transformers import get_linear_schedule_with_warmup

# `transformers.AdamW` is deprecated and has been removed from recent transformers
# releases, so use the PyTorch implementation instead. `eps` and `weight_decay` are
# pinned to the old class's defaults (1e-6 and 0.0) because torch's own defaults
# (1e-8 and 0.01) would otherwise silently change the optimisation behaviour.
optimizer = AdamW(model.parameters(), lr=learning_rate, eps=1e-6, weight_decay=0.0)

# One scheduler step per batch: the learning rate decays linearly to zero over
# the whole run, with no warm-up phase.
total_steps = len(train_dataloader) * num_epochs
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0,
    num_training_steps=total_steps,
)

Training the model

In [None]:
import sys
from importlib import reload

# Re-execute the already-imported `functions` module so that edits made to it
# are picked up before `train` / `evaluate` are imported below.
functions_module = sys.modules['functions']
reload(functions_module)

from functions import train, evaluate

# Each iteration calls train() once and then evaluate() on the validation loader;
# evaluate() returns an (accuracy, report) pair that is printed.
for epoch in range(num_epochs):
    print(f"Epoch {epoch + 1}/{num_epochs}")

    train(model, train_dataloader, optimizer, scheduler, device)

    accuracy, report = evaluate(model, val_dataloader, device)
    print(f"Validation Accuracy: {accuracy:.4f}")
    print(report)

Saving the final model

In [None]:
# Persist only the model's state_dict (its parameters/buffers), not the module object.
trained_weights = model.state_dict()
torch.save(trained_weights, "bert_classifier.pth")