**3. Neuron Model**

Evaluation and Training RuBERT model for binary classification

In [1]:
# Import
import torch
from torch.utils.data import Dataset
import numpy as np
import pandas as pd
from sklearn.metrics import classification_report, f1_score, precision_score, recall_score, roc_auc_score
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
from sklearn.model_selection import train_test_split
import re
import os

- Data preparation

In [2]:
# Clean function
RE_HTML = re.compile(r'<.*?>')
RE_EMOJI = re.compile(r'[\U00010000-\U0001FFFF]', flags=re.UNICODE)

def clean_for_transf(text):
    text = RE_HTML.sub('', text) # HTML
    text = RE_EMOJI.sub('', text) # Emoji
    text = re.sub(r"[^а-яА-Яa-zA-Z0-9\s.,!?\"'()\-–—]", ' ', text) # extra special characters
    text = re.sub(r'\s+', ' ', text).strip() # extra spaces
    return text

In [3]:
# Data prepare
df = pd.read_csv("data/data.csv")
print(df['text'].isnull().sum())
df = df.dropna(subset=['text'])
print(df['text'].isnull().sum())

10
0


In [4]:
# Data Clean
df['clean_text']=df['text'].astype(str).apply(clean_for_transf)
#df.to_csv("clean_transf.csv", index=False)

In [5]:
# Data Split
X = df['clean_text']
y = df['relevant']
X_temp, X_test, y_temp, y_test = train_test_split(X.tolist(), y.tolist(), test_size=0.2, random_state=42, stratify=y, shuffle=True)
X_train, X_val, y_train, y_val = train_test_split(X_temp, y_temp, test_size=0.25, random_state=42, stratify=y_temp, shuffle=True)

# Index drop (for training)


- Model Setup

In [7]:
model_name = "cointegrated/rubert-tiny2"
tokenizer = AutoTokenizer.from_pretrained(model_name)
device = torch.device("cpu")

In [8]:
# Dataset class for ruBert model
from torch.utils.data import Dataset

class ReviewDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_length=128):
        self.encodings = tokenizer(
            texts,
            truncation=True,
            padding="max_length",
            max_length=max_length
        )
        self.labels = labels

    def __getitem__(self, idx):
        return {
            "input_ids": torch.tensor(self.encodings["input_ids"][idx]),
            "attention_mask": torch.tensor(self.encodings["attention_mask"][idx]),
            "label": torch.tensor(self.labels[idx])
        }

    def __len__(self):
        return len(self.labels)

In [9]:
train_dataset = ReviewDataset(X_train, y_train, tokenizer)
test_dataset = ReviewDataset(X_test, y_test, tokenizer)
val_dataset = ReviewDataset(X_val, y_val, tokenizer)

In [10]:
# Trainings parameters
training_args = TrainingArguments(
    output_dir="./rubert_results",
    num_train_epochs=2,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=64,
    warmup_steps=300,
    weight_decay=0.01,
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="f1",
    logging_dir="./logs",
    logging_steps=50,
    use_cpu=True,
    report_to="none"
)

- Training

In [11]:
# Metrics
def compute_metrics(pred):
    logits, labels = pred
    preds = np.argmax(logits, axis=1)
    return {
        "f1": f1_score(labels, preds),
        "precision": precision_score(labels, preds),
        "recall": recall_score(labels, preds)
    }

In [12]:
# Training
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2, use_safetensors=True)
model.to(device)
trainer = Trainer(model=model, args=training_args, train_dataset=train_dataset, eval_dataset=val_dataset, compute_metrics=compute_metrics)
trainer.train()

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at cointegrated/rubert-tiny2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,F1,Precision,Recall
1,0.4618,0.443239,0.853071,0.812809,0.89753
2,0.3954,0.443949,0.855091,0.833232,0.878127


TrainOutput(global_step=1634, training_loss=0.45374708339079506, metrics={'train_runtime': 1205.0232, 'train_samples_per_second': 43.36, 'train_steps_per_second': 1.356, 'total_flos': 96325578624000.0, 'train_loss': 0.45374708339079506, 'epoch': 2.0})

- Evaluation

In [13]:
predictions = trainer.predict(test_dataset)
y_true = predictions.label_ids
y_pred = np.argmax(predictions.predictions, axis=1)
y_scores = predictions.predictions[:, 1]
roc_auc = roc_auc_score(y_true, y_scores)

print(classification_report(y_true, y_pred))
print("F1:", f1_score(y_true, y_pred))
print("Precision:", precision_score(y_true, y_pred))
print("Recall:", recall_score(y_true, y_pred))
print(f'ROC AUC: {roc_auc:.4f}')

              precision    recall  f1-score   support

           0       0.64      0.56      0.59      2473
           1       0.83      0.87      0.85      6236

    accuracy                           0.78      8709
   macro avg       0.73      0.71      0.72      8709
weighted avg       0.78      0.78      0.78      8709

F1: 0.852561595619867
Precision: 0.8321881203237136
Recall: 0.8739576651699807
ROC AUC: 0.8403


RuBERT model has more comlex architecture, so this model predict more accurately then classic ML models.

It has better precision, f1 and roc auc.

The model takes longer to train, but it gives better metrics.

So, this is the best choice for this task.