In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader, TensorDataset
from transformers import RobertaTokenizer, RobertaForSequenceClassification, AdamW
import torch
from sklearn.metrics import accuracy_score

In [None]:
# Load the dataset from a CSV file
df = pd.read_csv('./your_dataset.csv')  # Replace './your_dataset.csv' with the path to your CSV file

# Drop rows with a missing headline or label into a new frame (the raw `df` is left untouched).
# A NaN headline is a float and is rejected by the tokenizer; a NaN label turns the whole
# label column into floats, which is not a valid class-index dtype.
df_clean = df.dropna(subset=['news_headline', 'category_num'])

# Split the data into training and test sets (80/20, fixed seed for reproducibility)
train_df, test_df = train_test_split(df_clean, test_size=0.2, random_state=42)

# Prepare texts and labels as plain Python lists; cast explicitly so the tokenizer
# always receives `str` and the label tensors are built from integers.
train_texts = train_df['news_headline'].astype(str).tolist()
train_labels = train_df['category_num'].astype(int).tolist()
test_texts = test_df['news_headline'].astype(str).tolist()
test_labels = test_df['category_num'].astype(int).tolist()

In [None]:
# Seed PyTorch's global RNG so random weight initialisation and DataLoader shuffling
# are repeatable on Restart & Run All.
torch.manual_seed(42)

# Initialise the tokenizer and encode the texts (truncated/padded to at most 128 tokens)
tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
train_encodings = tokenizer(train_texts, truncation=True, padding=True, max_length=128, return_tensors='pt')
test_encodings = tokenizer(test_texts, truncation=True, padding=True, max_length=128, return_tensors='pt')

# Convert labels to int64 tensors. `as_tensor` returns an existing int64 tensor unchanged,
# so re-running this cell (when the names already hold tensors) is safe and warning-free.
train_labels = torch.as_tensor(train_labels, dtype=torch.long)
test_labels = torch.as_tensor(test_labels, dtype=torch.long)

# Build the datasets and DataLoaders for training and testing
train_dataset = TensorDataset(train_encodings['input_ids'], train_encodings['attention_mask'], train_labels)
test_dataset = TensorDataset(test_encodings['input_ids'], test_encodings['attention_mask'], test_labels)

train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=8, shuffle=False)

# Initialise RoBERTa for sequence classification.
# The number of output classes is derived from the largest label id (labels are used as
# class indices), with a floor of 2 so a binary dataset behaves exactly as before.
num_classes = max(2, int(torch.cat((train_labels, test_labels)).max()) + 1)
model = RobertaForSequenceClassification.from_pretrained('roberta-base', num_labels=num_classes)

# Initialise the optimizer.
# `transformers.AdamW` is deprecated; use the PyTorch implementation instead.
# eps / weight_decay are set explicitly to the values the transformers version used by default.
optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5, eps=1e-6, weight_decay=0.0)

In [None]:
# Train the model, logging running loss and accuracy
num_epochs = 3

# Run on whatever device the model currently lives on (a no-op when it is on the CPU)
device = next(model.parameters()).device

for epoch in range(num_epochs):
    model.train()
    total_loss = 0.0
    correct_predictions = 0
    total_samples = 0  # samples actually seen; the last batch may be smaller than batch_size

    for batch_num, batch in enumerate(train_loader, 1):
        # Each batch is (input_ids, attention_mask, labels), as built by the TensorDataset
        input_ids, attention_mask, labels = (tensor.to(device) for tensor in batch)

        optimizer.zero_grad()
        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()

        # Accumulate the total loss
        total_loss += loss.item()

        # Accumulate the number of correct predictions and of samples processed
        predictions = torch.argmax(outputs.logits, dim=1)
        correct_predictions += torch.sum(predictions == labels).item()
        total_samples += labels.size(0)

        # Log running averages every 10 batches
        if batch_num % 10 == 0:
            avg_loss = total_loss / batch_num
            accuracy = correct_predictions / total_samples
            print(f'Epoch {epoch + 1}, Batch {batch_num}/{len(train_loader)}, Loss: {avg_loss:.4f}, Accuracy: {accuracy:.4f}')

    # Log the epoch summary; the max(..., 1) guards avoid a ZeroDivisionError on an empty loader
    avg_loss = total_loss / max(len(train_loader), 1)
    accuracy = correct_predictions / max(total_samples, 1)
    print(f'Epoch {epoch + 1}, Loss: {avg_loss:.4f}, Accuracy: {accuracy:.4f}')


In [None]:
# Evaluate the fine-tuned model on the held-out test set
model.eval()
all_preds = []
with torch.no_grad():  # inference only, no gradients needed
    for input_ids, attention_mask, _ in test_loader:
        # Labels are not passed to the model, so no loss is computed here
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        # Predicted class = index of the largest logit in each row
        preds = outputs.logits.argmax(dim=1)
        all_preds.extend(preds.cpu().numpy())

# Compute accuracy against the ground-truth test labels
accuracy = accuracy_score(test_labels.numpy(), all_preds)
print(f'Accuracy: {accuracy}')