In [36]:
import pandas as pd
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader, TensorDataset
from transformers import RobertaTokenizer, RobertaForSequenceClassification, AdamW
import torch
from sklearn.metrics import accuracy_score

In [37]:
# Report whether PyTorch can see a CUDA-capable GPU
cuda_available = torch.cuda.is_available()
print(cuda_available)


True


In [38]:
# Select the compute device: CUDA when it is available, CPU otherwise
device_name = "cuda" if torch.cuda.is_available() else "cpu"
device = torch.device(device_name)
print(f"Using device: {device}")

Using device: cuda


In [39]:
# Load the dataset from a CSV file
df = pd.read_csv('your_dataset.csv')  # Replace 'your_dataset.csv' with the path to your CSV file

# Rows with a missing headline or label cannot be used for training:
# a NaN headline is not a string, and a NaN label turns the label column into floats.
# Drop them up front and report how many were removed so the loss is not silent.
required_columns = ['news_headline', 'category_num']
df_clean = df.dropna(subset=required_columns)
dropped_rows = len(df) - len(df_clean)
if dropped_rows:
    print(f'Dropped {dropped_rows} rows with missing values in {required_columns}')

# Split into training and test sets (fixed seed for a reproducible split)
train_df, test_df = train_test_split(df_clean, test_size=0.2, random_state=42)

# Prepare texts and labels as plain Python lists.
# Texts are cast to str and labels to int so that downstream code receives
# string inputs and integer class ids even if pandas inferred another dtype.
train_texts = train_df['news_headline'].astype(str).tolist()
train_labels = train_df['category_num'].astype(int).tolist()
test_texts = test_df['news_headline'].astype(str).tolist()
test_labels = test_df['category_num'].astype(int).tolist()


In [40]:
# Load the RoBERTa tokenizer and encode both splits with identical settings:
# truncate to at most 128 tokens, pad, and return PyTorch tensors.
tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
tokenize_kwargs = dict(truncation=True, padding=True, max_length=128, return_tensors='pt')
train_encodings = tokenizer(train_texts, **tokenize_kwargs)
test_encodings = tokenizer(test_texts, **tokenize_kwargs)

# Turn the label lists into tensors and place them on the selected device
train_labels = torch.tensor(train_labels).to(device)
test_labels = torch.tensor(test_labels).to(device)

# Move every encoded tensor to the selected device.
# Note: this puts the whole encoded dataset on the device before batching.
train_encodings = {name: tensor.to(device) for name, tensor in train_encodings.items()}
test_encodings = {name: tensor.to(device) for name, tensor in test_encodings.items()}

# Wrap (input_ids, attention_mask, label) triples into datasets
train_dataset = TensorDataset(
    train_encodings['input_ids'],
    train_encodings['attention_mask'],
    train_labels,
)
test_dataset = TensorDataset(
    test_encodings['input_ids'],
    test_encodings['attention_mask'],
    test_labels,
)

# Batch the data: the training set is shuffled, the test set keeps its order
train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=8, shuffle=False)

# Build the RoBERTa sequence-classification model and move it to the device
num_classes = 2  # Replace 2 with the number of classes in your dataset
model = RobertaForSequenceClassification.from_pretrained(
    'roberta-base',
    num_labels=num_classes,
)
model.to(device)

# Create the optimizer over all model parameters.
# NOTE(review): transformers.AdamW is reportedly deprecated in favour of
# torch.optim.AdamW — confirm against the installed transformers version.
optimizer = AdamW(model.parameters(), lr=5e-5)


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.weight', 'classifier.out_proj.weight', 'classifier.out_proj.bias', 'classifier.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [41]:
# Fine-tune the model, logging running loss/accuracy, and evaluate on the
# test set at the end of every epoch.
num_epochs = 1

for epoch in range(num_epochs):
    model.train()
    total_loss = 0.0
    correct_predictions = 0
    # Number of training examples actually processed so far. Counting samples
    # (instead of assuming batch_num * batch_size) keeps the accuracy correct
    # when the last batch is smaller than batch_size.
    samples_seen = 0

    for batch_num, batch in enumerate(train_loader, 1):
        inputs = {'input_ids': batch[0], 'attention_mask': batch[1], 'labels': batch[2]}
        inputs = {key: value.to(device) for key, value in inputs.items()}  # Make sure the batch is on the device
        labels = inputs['labels']

        optimizer.zero_grad()
        outputs = model(**inputs)
        loss = outputs.loss
        loss.backward()
        optimizer.step()

        # Accumulate the loss
        total_loss += loss.item()

        # Count correct predictions; compare against the labels that were
        # moved to the device so both operands live on the same device.
        logits = outputs.logits
        predictions = torch.argmax(logits, dim=1)
        correct_predictions += torch.sum(predictions == labels).item()
        samples_seen += labels.size(0)

        # Log running averages every 10 batches
        if batch_num % 10 == 0:
            avg_loss = total_loss / batch_num
            accuracy = correct_predictions / samples_seen
            print(f'Epoch {epoch + 1}, Batch {batch_num}/{len(train_loader)}, Loss: {avg_loss:.4f}, Accuracy: {accuracy:.4f}')

    # End-of-epoch summary; max(..., 1) avoids a ZeroDivisionError on an empty loader
    avg_loss = total_loss / max(len(train_loader), 1)
    accuracy = correct_predictions / max(samples_seen, 1)
    print(f'Epoch {epoch + 1}, Loss: {avg_loss:.4f}, Accuracy: {accuracy:.4f}')

    # Evaluate on the test set (no gradient tracking, eval mode)
    model.eval()
    all_preds = []
    with torch.no_grad():
        for test_batch in test_loader:
            test_inputs = {'input_ids': test_batch[0].to(device), 'attention_mask': test_batch[1].to(device)}
            test_outputs = model(**test_inputs)
            test_logits = test_outputs.logits
            test_preds = torch.argmax(test_logits, dim=1)
            all_preds.extend(test_preds.cpu().numpy())

    # test_loader is not shuffled, so predictions line up with test_labels
    test_accuracy = accuracy_score(test_labels.cpu().numpy(), all_preds)
    print(f'Test Accuracy: {test_accuracy:.4f}')

Epoch 1, Batch 10/1047, Loss: 0.6988, Accuracy: 0.5250
Epoch 1, Batch 20/1047, Loss: 0.5987, Accuracy: 0.6438
Epoch 1, Batch 30/1047, Loss: 0.5133, Accuracy: 0.7208
Epoch 1, Batch 40/1047, Loss: 0.4432, Accuracy: 0.7750
Epoch 1, Batch 50/1047, Loss: 0.4178, Accuracy: 0.8025
Epoch 1, Batch 60/1047, Loss: 0.4009, Accuracy: 0.8187
Epoch 1, Batch 70/1047, Loss: 0.3851, Accuracy: 0.8321
Epoch 1, Batch 80/1047, Loss: 0.3717, Accuracy: 0.8422
Epoch 1, Batch 90/1047, Loss: 0.3475, Accuracy: 0.8542
Epoch 1, Batch 100/1047, Loss: 0.3228, Accuracy: 0.8662
Epoch 1, Batch 110/1047, Loss: 0.2968, Accuracy: 0.8784
Epoch 1, Batch 120/1047, Loss: 0.2825, Accuracy: 0.8854
Epoch 1, Batch 130/1047, Loss: 0.2684, Accuracy: 0.8913
Epoch 1, Batch 140/1047, Loss: 0.2519, Accuracy: 0.8982
Epoch 1, Batch 150/1047, Loss: 0.2490, Accuracy: 0.9000
Epoch 1, Batch 160/1047, Loss: 0.2405, Accuracy: 0.9039
Epoch 1, Batch 170/1047, Loss: 0.2385, Accuracy: 0.9066
Epoch 1, Batch 180/1047, Loss: 0.2338, Accuracy: 0.9090
E

In [42]:
# Save the fine-tuned model together with its tokenizer so the output
# directory can be reloaded from that path alone.
output_dir = 'roberta_classification_model'
model.save_pretrained(output_dir)
# Assign the return value so the notebook does not echo it as cell output
saved_tokenizer_files = tokenizer.save_pretrained(output_dir)