In [3]:
import pandas as pd
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader, TensorDataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, AdamW
import torch
from sklearn.metrics import accuracy_score
from torch.cuda.amp import autocast

In [4]:
# Sanity check: report whether PyTorch can see a CUDA-capable GPU.
cuda_available = torch.cuda.is_available()
print(cuda_available)


True


In [5]:
# Pick the compute device: use the GPU when CUDA is available, otherwise
# fall back to the CPU. Every later cell moves tensors/models to `device`.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(f"Using device: {device}")

Using device: cuda


In [6]:
# Load the dataset from a CSV file.
# Replace 'your_dataset.csv' with the path to your own CSV file.
df = pd.read_csv('your_dataset.csv')

# Hold out 20% of the rows for testing; the fixed random_state keeps the
# split reproducible across runs.
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)

# Extract the raw texts (headlines) and their labels as plain Python lists.
train_texts = train_df['news_headline'].tolist()
train_labels = train_df['category_num'].tolist()
test_texts = test_df['news_headline'].tolist()
test_labels = test_df['category_num'].tolist()

In [7]:
# Initialise the tokenizer and encode the texts. Sequences are truncated to at
# most 128 tokens and padded to a common length; results are PyTorch tensors.
tokenizer = AutoTokenizer.from_pretrained('distilbert-base-uncased')
train_encodings = tokenizer(train_texts, truncation=True, padding=True, max_length=128, return_tensors='pt')
test_encodings = tokenizer(test_texts, truncation=True, padding=True, max_length=128, return_tensors='pt')

# Convert the labels to tensors. `torch.as_tensor` accepts both the original
# Python lists and already-converted tensors, so re-running this cell (which
# rebinds `train_labels` / `test_labels`) does not trigger the
# "copy construct from a tensor" warning that `torch.tensor(tensor)` emits.
train_labels = torch.as_tensor(train_labels)
test_labels = torch.as_tensor(test_labels)

# Bundle (input_ids, attention_mask, label) triples into datasets.
train_dataset = TensorDataset(train_encodings['input_ids'], train_encodings['attention_mask'], train_labels)
test_dataset = TensorDataset(test_encodings['input_ids'], test_encodings['attention_mask'], test_labels)

# Batch the data. NOTE: the DataLoader does NOT move batches to the GPU;
# `pin_memory=True` only places them in page-locked host memory so the later
# explicit `.to(device)` copy is faster. Only the training set is shuffled.
train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True, pin_memory=True)
test_loader = DataLoader(test_dataset, batch_size=8, shuffle=False, pin_memory=True)

# Set up a DistilBERT sequence classifier (the checkpoint used below is
# 'distilbert-base-uncased', not RoBERTa).
# Number of target classes; the labels fed to the model come from the
# `category_num` column, so this should equal the number of distinct categories.
num_classes = 2  # Replace 2 with the number of classes in your dataset

# Reference copy of the pretrained checkpoint with the default classification head.
# NOTE(review): `original_model` is not referenced by any other visible cell and
# is loaded from the same checkpoint as `distil_model`; it is kept only so that
# code relying on the name keeps working - candidate for removal.
original_model = AutoModelForSequenceClassification.from_pretrained('distilbert-base-uncased')

# The model that is actually fine-tuned. `num_labels` sizes the classification
# head, so changing `num_classes` above really takes effect (previously the
# variable was declared but never passed to the model).
distil_model = AutoModelForSequenceClassification.from_pretrained(
    'distilbert-base-uncased',
    num_labels=num_classes,
)
distil_model.to(device)

# Optimizer over all parameters of the fine-tuned model.
optimizer = AdamW(distil_model.parameters(), lr=5e-5)

Downloading tokenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


Downloading config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

Downloading vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.bias', 'classifier.weight', 'pre_classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.bias', 'classifier.weight', 'pre_classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [8]:
# Fine-tune the model on `device` and evaluate on the test set after each epoch.
num_epochs = 1

# Mixed precision is only enabled on CUDA; on a CPU fallback both autocast and
# the gradient scaler are disabled and act as no-ops.
use_amp = device.type == 'cuda'
# Loss scaling prevents small fp16 gradients from underflowing to zero during
# backward(); autocast without a GradScaler can silently stall training.
scaler = torch.cuda.amp.GradScaler(enabled=use_amp)

for epoch in range(num_epochs):
    distil_model.train()
    total_loss = 0.0
    correct_predictions = 0
    samples_seen = 0  # exact number of training examples processed so far

    for batch_num, batch in enumerate(train_loader, 1):
        # Each batch is (input_ids, attention_mask, labels); move it to the
        # device once and reuse the tensors for both the forward pass and the
        # accuracy bookkeeping.
        input_ids, attention_mask, labels = (t.to(device) for t in batch)
        optimizer.zero_grad()

        with autocast(enabled=use_amp):  # mixed-precision forward pass
            outputs = distil_model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss

        # Scale the loss before backward, then let the scaler unscale the
        # gradients, step the optimizer (skipped on inf/NaN) and adapt the scale.
        scaler.scale(loss).backward()
        scaler.step(optimizer)
        scaler.update()

        # Accumulate the running loss.
        total_loss += loss.item()

        # Count correct predictions for this batch.
        predictions = torch.argmax(outputs.logits, dim=1)
        correct_predictions += (predictions == labels).sum().item()
        samples_seen += labels.size(0)

        # Log running metrics every 10 batches.
        if batch_num % 10 == 0:
            avg_loss = total_loss / batch_num
            # Divide by the samples actually seen: the last batch may be
            # smaller than batch_size, so batch_num * batch_size overcounts.
            accuracy = correct_predictions / samples_seen
            print(f'Epoch {epoch + 1}, Batch {batch_num}/{len(train_loader)}, Loss: {avg_loss:.4f}, Accuracy: {accuracy:.4f}')

    # Epoch summary (guards avoid a ZeroDivisionError on an empty loader).
    avg_loss = total_loss / max(len(train_loader), 1)
    accuracy = correct_predictions / max(samples_seen, 1)
    print(f'Epoch {epoch + 1}, Loss: {avg_loss:.4f}, Accuracy: {accuracy:.4f}')

    # Evaluate on the test set: no gradients, dropout disabled via eval().
    distil_model.eval()
    all_preds = []
    with torch.no_grad():
        for test_batch in test_loader:
            test_inputs = {'input_ids': test_batch[0].to(device), 'attention_mask': test_batch[1].to(device)}
            test_outputs = distil_model(**test_inputs)
            test_logits = test_outputs.logits
            test_preds = torch.argmax(test_logits, dim=1)
            all_preds.extend(test_preds.cpu().numpy())

    # The test loader is not shuffled, so predictions line up with test_labels.
    test_accuracy = accuracy_score(test_labels.cpu().numpy(), all_preds)
    print(f'Test Accuracy: {test_accuracy:.4f}')


Epoch 1, Batch 10/1047, Loss: 0.6710, Accuracy: 0.5125
Epoch 1, Batch 20/1047, Loss: 0.5538, Accuracy: 0.7063
Epoch 1, Batch 30/1047, Loss: 0.4406, Accuracy: 0.7875
Epoch 1, Batch 40/1047, Loss: 0.3665, Accuracy: 0.8313
Epoch 1, Batch 50/1047, Loss: 0.3242, Accuracy: 0.8550
Epoch 1, Batch 60/1047, Loss: 0.2993, Accuracy: 0.8708
Epoch 1, Batch 70/1047, Loss: 0.2741, Accuracy: 0.8804
Epoch 1, Batch 80/1047, Loss: 0.2617, Accuracy: 0.8844
Epoch 1, Batch 90/1047, Loss: 0.2444, Accuracy: 0.8917
Epoch 1, Batch 100/1047, Loss: 0.2265, Accuracy: 0.9000
Epoch 1, Batch 110/1047, Loss: 0.2220, Accuracy: 0.9045
Epoch 1, Batch 120/1047, Loss: 0.2070, Accuracy: 0.9115
Epoch 1, Batch 130/1047, Loss: 0.2149, Accuracy: 0.9087
Epoch 1, Batch 140/1047, Loss: 0.2178, Accuracy: 0.9062
Epoch 1, Batch 150/1047, Loss: 0.2177, Accuracy: 0.9083
Epoch 1, Batch 160/1047, Loss: 0.2122, Accuracy: 0.9109
Epoch 1, Batch 170/1047, Loss: 0.2081, Accuracy: 0.9132
Epoch 1, Batch 180/1047, Loss: 0.2056, Accuracy: 0.9146
E

In [9]:
# Save the fine-tuned model weights and config to the 'distilbert_model' directory.
distil_model.save_pretrained('distilbert_model')
# Also save the tokenizer files next to the model so the directory is
# self-contained and can be reloaded with AutoTokenizer.from_pretrained /
# AutoModelForSequenceClassification.from_pretrained from the same path.
tokenizer.save_pretrained('distilbert_model')