In [None]:
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments, DataCollatorWithPadding
from datasets import Dataset
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score
import torch

In [2]:
import pandas as pd

In [None]:
# Log in to the Hugging Face Hub
from huggingface_hub import login
from kaggle_secrets import UserSecretsClient
user_secrets = UserSecretsClient()

# The token is fetched from the secret named "HUGGINGFACE_TOKEN" instead of being hardcoded in the notebook
hf_token = user_secrets.get_secret("HUGGINGFACE_TOKEN")
login(token = hf_token)

In [None]:
# Load the dataset.
# A forward slash is used on purpose: inside a regular string literal "\b" is
# the backspace escape, so a backslash-separated path would not name the
# intended file. Forward slashes work on every platform.
df = pd.read_csv('data/balanced_dataset.csv')

In [None]:
# Encode the labels: every distinct 'Thematics' value gets an integer id,
# numbered in the order in which `unique()` returns the values
unique_labels = df['Thematics'].unique()
label_to_id = dict(zip(unique_labels, range(len(unique_labels))))
id_to_label = dict(enumerate(unique_labels))

# Integer class column derived from the mapping
df['label'] = df['Thematics'].map(label_to_id)

In [None]:
# Hold out 20% of the rows for evaluation; the fixed random_state keeps the split repeatable
train_df, test_df = train_test_split(df, random_state=42, test_size=0.2)

# Wrap both pandas frames as Hugging Face datasets
train_dataset, test_dataset = (
    Dataset.from_pandas(frame) for frame in (train_df, test_df)
)

In [None]:
# Load the tokenizer and the model
model_name = 'cointegrated/rubert-tiny'  # rubert-tiny checkpoint
tokenizer = BertTokenizer.from_pretrained(model_name)
model = BertForSequenceClassification.from_pretrained(
    model_name,
    num_labels=len(label_to_id),
    # Store the class names in the model config so that the saved / published
    # model reports topic names instead of generic LABEL_<n> placeholders.
    # Names are cast to str because the config is serialized to JSON.
    id2label={idx: str(label) for label, idx in label_to_id.items()},
    label2id={str(label): idx for label, idx in label_to_id.items()},
    # The classification head is sized by num_labels, so tolerate a size
    # mismatch with whatever head weights the checkpoint ships.
    ignore_mismatched_sizes=True,
)

In [None]:
# Tokenization
def tokenize_function(examples):
    """Tokenize the 'Text' field of a batch of examples.

    Sequences are truncated to at most 128 tokens and deliberately left
    unpadded: the DataCollatorWithPadding handed to the Trainer pads every
    batch when it is collated, so padding here as well would only add
    redundant pad tokens to the stored dataset.

    Args:
        examples: batch mapping that contains a 'Text' entry with the raw texts.

    Returns:
        The tokenizer output for the batch.
    """
    return tokenizer(examples['Text'], truncation=True, max_length=128)

# Tokenize both splits, passing examples to the tokenizer in batches
train_dataset, test_dataset = (
    split.map(tokenize_function, batched=True)
    for split in (train_dataset, test_dataset)
)

In [None]:
# Drop the columns that are not model inputs: the raw text, the string label
# and '__index_level_0__' (presumably the pandas index carried over by
# Dataset.from_pandas)
unused_columns = ['Text', 'Thematics', '__index_level_0__']
train_dataset = train_dataset.remove_columns(unused_columns)
test_dataset = test_dataset.remove_columns(unused_columns)

In [None]:
# Create the data collator that pads the examples of each batch, using the tokenizer's padding settings
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [None]:
# Report the available hardware.
# The model is intentionally NOT wrapped in torch.nn.DataParallel here:
# Trainer takes care of multi-GPU execution on its own, and a DataParallel
# wrapper exposes neither save_pretrained nor push_to_hub, which the export
# cell at the end of the notebook calls on `model`.
gpu_count = torch.cuda.device_count()
if gpu_count > 1:
    print(f"Используем {gpu_count} GPU!")
else:
    print("Используем один GPU или CPU.")

# Move the model to the target device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

In [None]:
# Accuracy and weighted F1-score
def compute_metrics(p):
    """Compute top-1 accuracy and weighted F1 for an evaluation pass.

    Args:
        p: pair that unpacks into (per-class scores, true labels).

    Returns:
        Dict with the keys "accuracy" and "f1".
    """
    scores, true_labels = p
    # The predicted class is the index of the highest score in each row
    predicted = np.argmax(scores, axis=1)
    return {
        "accuracy": accuracy_score(true_labels, predicted),
        "f1": f1_score(true_labels, predicted, average='weighted'),
    }

In [None]:
# Training configuration
training_args = TrainingArguments(
    output_dir='./results',
    eval_strategy="epoch",  # evaluate at the end of every epoch
    learning_rate=2e-5,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=1,  # log every training step
    save_strategy="epoch",  # must match eval_strategy for load_best_model_at_end
    save_total_limit=2,
    load_best_model_at_end=True,
    metric_for_best_model="f1",  # key returned by the compute_metrics function
    # Mixed precision needs a CUDA device; enabling it unconditionally would
    # break the CPU fallback that the device-selection cell allows for.
    fp16=torch.cuda.is_available(),
    dataloader_num_workers=4,
    report_to="none",  # no external experiment trackers
)

In [None]:
# Assemble the Trainer from the model, arguments, datasets, collator and metric function
trainer_kwargs = dict(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)
trainer = Trainer(**trainer_kwargs)

In [None]:
# Train the model
trainer.train()

In [None]:
# Metrics over the top-k predicted classes (top-3 by default)
def compute_metrics_with_top_k(p, k=3):
    """Compute top-k accuracy and weighted top-1 F1 for an evaluation pass.

    Args:
        p: pair that unpacks into (per-class scores, true labels); the scores
            are indexed as (example, class) and the labels as one id per example.
        k: number of highest-scoring classes that count as a hit; defaults to 3.

    Returns:
        Dict with "accuracy_top_k" (share of examples whose true label is among
        the k best-scoring classes) and "f1" (weighted F1 of the top-1 prediction).

    Raises:
        ValueError: if k is smaller than 1.
    """
    if k < 1:
        # With k == 0 the slice [:, -0:] would silently select every class
        raise ValueError("k must be a positive integer")

    predictions, labels = p
    labels = np.asarray(labels)

    # argsort is ascending, so the last k columns hold the k best-scoring classes
    top_k_predictions = np.argsort(predictions, axis=1)[:, -k:]
    labels_column = labels[:, None]  # column shape so it broadcasts against the top-k ids
    top_k_hits = np.any(top_k_predictions == labels_column, axis=1)
    accuracy_top_k = np.mean(top_k_hits)

    top1_predictions = np.argmax(predictions, axis=1)
    f1 = f1_score(labels_column.flatten(), top1_predictions, average='weighted')
    return {"accuracy_top_k": accuracy_top_k, "f1": f1}

# Replace the Trainer's metric function so that later evaluations report top-k accuracy
trainer.compute_metrics = compute_metrics_with_top_k


In [None]:
# Evaluate the model on the evaluation dataset given to the Trainer (the test split)
eval_results = trainer.evaluate()
print(f"Результаты оценки: {eval_results}")

In [None]:
# Print the stored evaluation results again (no new evaluation is run here)
print(f"Результаты оценки: {eval_results}")

In [None]:
from huggingface_hub import HfFolder

# Name of the target repository
repo_name = "telegram_classifier_rubert_tiny_model"

# On multi-GPU machines `model` may have been rebound to a torch.nn.DataParallel
# wrapper, which exposes neither save_pretrained nor push_to_hub; export the
# underlying transformers model in that case.
model_to_export = model.module if isinstance(model, torch.nn.DataParallel) else model

# Save the model and the tokenizer locally
model_to_export.save_pretrained(f"./{repo_name}")
tokenizer.save_pretrained(f"./{repo_name}")

# Upload both to the Hugging Face Hub
model_to_export.push_to_hub(repo_name)
tokenizer.push_to_hub(repo_name)

print("Модель и токенизатор успешно загружены в huggingface")