In [43]:
import pandas as pd
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader, TensorDataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, AdamW
import torch
from sklearn.metrics import accuracy_score
from torch.cuda.amp import autocast

In [44]:
print(torch.cuda.is_available())


True


In [45]:
# Проверяем доступность GPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

Using device: cuda


In [46]:
# Загрузка данных из CSV-файла
df = pd.read_csv('your_dataset.csv')  # Замените 'your_dataset.csv' на путь к вашему файлу CSV

# Разделение данных на обучающий и тестовый наборы
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)

# Подготовка текстов и меток
train_texts, train_labels = train_df['news_headline'].tolist(), train_df['category_num'].tolist()
test_texts, test_labels = test_df['news_headline'].tolist(), test_df['category_num'].tolist()

In [47]:
# Инициализация токенизатора и преобразование текста в токены
tokenizer = AutoTokenizer.from_pretrained('albert-base-v2')
train_encodings = tokenizer(train_texts, truncation=True, padding=True, max_length=128, return_tensors='pt')
test_encodings = tokenizer(test_texts, truncation=True, padding=True, max_length=128, return_tensors='pt')

# Преобразование меток в тензоры
train_labels = torch.tensor(train_labels).clone().detach()
test_labels = torch.tensor(test_labels).clone().detach()

# Создание DataLoader для обучения и тестирования
train_dataset = TensorDataset(train_encodings['input_ids'], train_encodings['attention_mask'], train_labels)
test_dataset = TensorDataset(test_encodings['input_ids'], test_encodings['attention_mask'], test_labels)

# Используйте DataLoader для автоматического перемещения данных на GPU
train_loader = DataLoader(train_dataset, batch_size=4, shuffle=True, pin_memory=True)
test_loader = DataLoader(test_dataset, batch_size=4, shuffle=False, pin_memory=True)

# Инициализация модели RoBERTa для классификации на GPU
num_classes = 2  # Замените 2 на количество классов в вашем наборе данных

# Загрузка оригинальной модели RoBERTa
original_model = AutoModelForSequenceClassification.from_pretrained('albert-base-v2')

# Создание новой (менее объемной) модели с использованием distill
albert_model = AutoModelForSequenceClassification.from_pretrained('albert-base-v2')
albert_model.to(device)

# Инициализация оптимизатора
optimizer = torch.optim.AdamW(albert_model.parameters(), lr=2e-5)


Some weights of AlbertForSequenceClassification were not initialized from the model checkpoint at albert-base-v2 and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of AlbertForSequenceClassification were not initialized from the model checkpoint at albert-base-v2 and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [48]:
# Обучение модели на GPU
num_epochs = 1

for epoch in range(num_epochs):
    albert_model.train()
    total_loss = 0.0
    correct_predictions = 0

    for batch_num, batch in enumerate(train_loader, 1):
        inputs = {'input_ids': batch[0].to(device), 'attention_mask': batch[1].to(device), 'labels': batch[2].to(device)}
        optimizer.zero_grad()

        with autocast():  
            outputs = albert_model(**inputs)
            loss = outputs.loss

        loss.backward()
        optimizer.step()

        total_loss += loss.item()

        logits = outputs.logits
        predictions = torch.argmax(logits, dim=1)
        correct_predictions += torch.sum(predictions == batch[2].to(device)).item()

        if batch_num % 10 == 0:
            avg_loss = total_loss / batch_num
            accuracy = correct_predictions / (batch_num * train_loader.batch_size)
            print(f'Epoch {epoch + 1}, Batch {batch_num}/{len(train_loader)}, Loss: {avg_loss:.4f}, Accuracy: {accuracy:.4f}')

    avg_loss = total_loss / len(train_loader)
    accuracy = correct_predictions / (len(train_loader) * train_loader.batch_size)
    print(f'Epoch {epoch + 1}, Loss: {avg_loss:.4f}, Accuracy: {accuracy:.4f}')

    albert_model.eval()
    all_preds = []
    with torch.no_grad():
        for test_batch in test_loader:
            test_inputs = {'input_ids': test_batch[0].to(device), 'attention_mask': test_batch[1].to(device)}
            test_outputs = albert_model(**test_inputs)
            test_logits = test_outputs.logits
            test_preds = torch.argmax(test_logits, dim=1)
            all_preds.extend(test_preds.cpu().numpy())

    test_accuracy = accuracy_score(test_labels.cpu().numpy(), all_preds)
    print(f'Test Accuracy: {test_accuracy:.4f}')


Epoch 1, Batch 10/2094, Loss: 0.7553, Accuracy: 0.5250
Epoch 1, Batch 20/2094, Loss: 0.7159, Accuracy: 0.5250
Epoch 1, Batch 30/2094, Loss: 0.6586, Accuracy: 0.5917
Epoch 1, Batch 40/2094, Loss: 0.6120, Accuracy: 0.6438
Epoch 1, Batch 50/2094, Loss: 0.5555, Accuracy: 0.7000
Epoch 1, Batch 60/2094, Loss: 0.5045, Accuracy: 0.7375
Epoch 1, Batch 70/2094, Loss: 0.4689, Accuracy: 0.7643
Epoch 1, Batch 80/2094, Loss: 0.4360, Accuracy: 0.7875
Epoch 1, Batch 90/2094, Loss: 0.4199, Accuracy: 0.8028
Epoch 1, Batch 100/2094, Loss: 0.4136, Accuracy: 0.8075
Epoch 1, Batch 110/2094, Loss: 0.4112, Accuracy: 0.8114
Epoch 1, Batch 120/2094, Loss: 0.3895, Accuracy: 0.8229
Epoch 1, Batch 130/2094, Loss: 0.3869, Accuracy: 0.8231
Epoch 1, Batch 140/2094, Loss: 0.3757, Accuracy: 0.8304
Epoch 1, Batch 150/2094, Loss: 0.3669, Accuracy: 0.8333
Epoch 1, Batch 160/2094, Loss: 0.3543, Accuracy: 0.8406
Epoch 1, Batch 170/2094, Loss: 0.3447, Accuracy: 0.8456
Epoch 1, Batch 180/2094, Loss: 0.3315, Accuracy: 0.8514
E

In [49]:
# Сохранение модели
albert_model.save_pretrained('albert_model')