In [None]:
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments, DataCollatorWithPadding
from datasets import Dataset
import pandas as pd 
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
from sklearn.preprocessing import MultiLabelBinarizer
import torch

In [None]:
# Log in to the Hugging Face Hub using a token stored in Kaggle secrets
from huggingface_hub import HfFolder, login
from kaggle_secrets import UserSecretsClient

# The token is read from the Kaggle secret store rather than hard-coded.
user_secrets = UserSecretsClient()
hf_token = user_secrets.get_secret("HUGGINGFACE_TOKEN")

login(token=hf_token)

In [None]:
# Load the data
# A forward slash works on both Windows and Linux; the previous
# 'data\multiclass_dataset.csv' contained the invalid escape sequence '\m'
# and was not a valid relative path on POSIX systems.
df = pd.read_csv('data/multiclass_dataset.csv')

In [None]:
# Build the label vocabulary and fit the binarizer
# 'Thematics' holds comma-separated label strings; collect every distinct,
# whitespace-stripped label in sorted order.
split_thematics = df['Thematics'].str.split(',')
all_labels = sorted({label.strip() for sublist in split_thematics for label in sublist})

# Fitting on a single "sample" that contains every label registers all classes.
mlb = MultiLabelBinarizer()
mlb.fit([all_labels])

In [None]:
# Encode each row's labels as a multi-hot float32 vector
# Labels are stripped before encoding so they match the vocabulary the
# binarizer was fitted on (which is built from stripped labels); without this,
# a value such as "a, b" produces the unknown class " b", which is dropped.
df['labels'] = df['Thematics'].str.split(',').apply(
    lambda x: mlb.transform([[label.strip() for label in x]])[0].astype(np.float32)
)

In [None]:
# Split the data: 80% train / 20% test, with a fixed seed for reproducibility
train_df, test_df = train_test_split(df, random_state=42, test_size=0.2)

# Convert both pandas splits into Hugging Face datasets
train_dataset, test_dataset = (
    Dataset.from_pandas(split_df) for split_df in (train_df, test_df)
)

In [None]:
# Initialise the tokenizer and the model
model_name = 'cointegrated/rubert-tiny'
tokenizer = BertTokenizer.from_pretrained(model_name)

# One output per distinct label; the classification head is created with a new
# size, so size mismatches against the checkpoint are tolerated.
model = BertForSequenceClassification.from_pretrained(
    model_name,
    problem_type="multi_label_classification",
    num_labels=len(all_labels),
    torch_dtype=torch.float32,
    ignore_mismatched_sizes=True,
)

In [None]:
# Tokenization
def tokenize_function(examples):
    """Tokenize the 'Text' field of one example or of a batch of examples.

    Args:
        examples: a mapping with a 'Text' entry that is either a single string
            or a list of strings (as passed by ``Dataset.map`` with
            ``batched=True``).

    Returns:
        dict with 'input_ids' and 'attention_mask'. Sequences are truncated to
        128 tokens and left unpadded; padding is done later by the data collator.
    """
    # No return_tensors here: unpadded sequences have different lengths, and
    # Dataset.map stores plain lists anyway.
    encoding = tokenizer(
        examples['Text'],
        padding=False,
        truncation=True,
        max_length=128,
    )
    return {
        'input_ids': encoding['input_ids'],
        'attention_mask': encoding['attention_mask']
    }

# Process examples in batches instead of one at a time
train_dataset = train_dataset.map(tokenize_function, batched=True)
test_dataset = test_dataset.map(tokenize_function, batched=True)

In [None]:
# Drop columns that are no longer needed after tokenization
columns_to_remove = ['Text', 'Thematics', '__index_level_0__']

# Only keep the names that are actually present (checked against the train split)
existing_columns_to_remove = [
    name for name in columns_to_remove if name in train_dataset.column_names
]

train_dataset, test_dataset = (
    split.remove_columns(existing_columns_to_remove)
    for split in (train_dataset, test_dataset)
)

In [None]:
# Create the data collator (pads each batch dynamically)
# NOTE(review): with padding='longest' the max_length argument presumably has
# no effect — confirm against the tokenizer padding documentation.
data_collator = DataCollatorWithPadding(
    tokenizer=tokenizer,
    return_tensors='pt',
    padding='longest',
    pad_to_multiple_of=8,
    max_length=128,
)

In [None]:
# Device placement
# The model is intentionally NOT wrapped in torch.nn.DataParallel here:
# Trainer wraps the model itself when more than one GPU is visible, and it
# expects to receive the bare model (config, save/push helpers, etc.).
if torch.cuda.device_count() > 1:
    print("Активируем DataParallel")

# Fall back to CPU instead of crashing when no GPU is available.
device = 'cuda' if torch.cuda.is_available() else 'cpu'
model.to(device)

In [None]:
# Training configuration
training_args = TrainingArguments(
    output_dir='./results',
    eval_strategy="epoch",
    learning_rate=2e-5,                  # learning rate
    per_device_train_batch_size=16,      # batch size
    per_device_eval_batch_size=16,
    num_train_epochs=5,                  # number of epochs
    logging_steps=1,                     # log every step
    weight_decay=0.01,
    logging_dir='./logs',
    save_strategy="epoch",               # must match eval_strategy for load_best_model_at_end
    load_best_model_at_end=True,
    metric_for_best_model="micro_f1",    # key returned by compute_metrics
    fp16=torch.cuda.is_available(),      # mixed precision only when a CUDA device exists
    dataloader_num_workers=2,
    report_to="none",
    remove_unused_columns=False,         # possible cause of the instability (unconfirmed)
    label_names=['labels'],
    gradient_accumulation_steps=2,       # gradient accumulation: 16 * 2 samples per device per update
)

In [None]:
# Metrics
def compute_metrics(p, threshold=0.5):
    """Compute micro- and macro-averaged F1 for multi-label predictions.

    Args:
        p: evaluation result exposing ``predictions`` (raw logits, or a tuple
            whose first element is the logits) and ``label_ids`` (multi-hot
            ground-truth matrix).
        threshold: probability above which a label counts as predicted.
            Defaults to 0.5, the value previously hard-coded.

    Returns:
        dict with keys 'micro_f1' and 'macro_f1'.
    """
    logits = p.predictions
    # Some models return extra outputs alongside the logits.
    if isinstance(logits, tuple):
        logits = logits[0]
    labels = p.label_ids

    # Sigmoid turns each logit into an independent per-label probability.
    probabilities = torch.sigmoid(torch.tensor(logits)).cpu().numpy()
    preds = (probabilities > threshold).astype(int)  # classification threshold

    # zero_division=0 yields the same score as the default but without a
    # warning when a label has no predicted or true positives.
    return {
        'micro_f1': f1_score(labels, preds, average='micro', zero_division=0),
        'macro_f1': f1_score(labels, preds, average='macro', zero_division=0)
    }

In [None]:
# Initialise the Trainer
# The test split doubles as the evaluation set used after every epoch.
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    compute_metrics=compute_metrics,
)

In [None]:
# Run training
trainer.train()

In [None]:
# Evaluate the model on the test data (the eval_dataset given to the Trainer)
eval_results = trainer.evaluate()
print(f"Результаты оценки: {eval_results}")

In [None]:
# Save the model and tokenizer to the Hugging Face Hub
from huggingface_hub import HfFolder

repo_name = "telegram_multiLabel_classifier_rubert_tiny_model"

# Unwrap the model if it happens to be wrapped (e.g. in DataParallel)
model_to_save = model.module if hasattr(model, 'module') else model

# Store the label names in the model config so that consumers of the published
# model can map output indices back to labels; the order follows the
# binarizer's classes, which defines the column order of the training targets.
id2label = {index: str(label) for index, label in enumerate(mlb.classes_)}
model_to_save.config.id2label = id2label
model_to_save.config.label2id = {label: index for index, label in id2label.items()}

model_to_save.push_to_hub(repo_name)
tokenizer.push_to_hub(repo_name)

print("Модель и токенизатор успешно загружены в huggingface")