In [2]:
from torch.optim import AdamW

In [1]:
import pandas as pd
import torch
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from transformers import (
    BertTokenizer,
    BertForSequenceClassification,
    get_linear_schedule_with_warmup,
    DataCollatorWithPadding
)
from datasets import Dataset
from torch.utils.data import DataLoader
from tqdm.auto import tqdm
import numpy as np

# Pick the GPU when CUDA is available, otherwise fall back to the CPU
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print(f"Using device: {device}")

Using device: cuda


In [6]:
# 1. Load the data
# The text/label lists are built only after cleaning (in the data preparation
# cell further down); extracting them here from the raw frame would capture
# rows that are dropped later, so nothing is pulled out at this point.
df = pd.read_csv("cleaned_combined_guardian.csv")

In [7]:
# Drop rows with a missing text or category. The explicit copy makes the
# column assignment below act on an independent frame rather than on a
# possible view of the raw data (avoids SettingWithCopyWarning).
df = df.dropna(subset=["cleaned_text", "category"]).copy()

# Cast texts to str and blank out values that are exactly a textual null marker
df["cleaned_text"] = (
    df["cleaned_text"]
    .astype(str)
    .replace({"nan": "", "None": "", "null": ""})
)

# Flag texts that are empty or a null marker once surrounding whitespace is stripped
nan_mask = df["cleaned_text"].str.strip().isin(["", "nan", "None", "null"])
print(f"Found {nan_mask.sum()} invalid texts:")
print(df.loc[nan_mask, ["cleaned_text", "category"]].head())

# Keep only valid rows; copy again so later cells can assign columns safely
df = df[~nan_mask].copy()

# Both columns must hold plain Python strings from here on
assert df["cleaned_text"].apply(lambda x: isinstance(x, str)).all()
assert df["category"].apply(lambda x: isinstance(x, str)).all()

Found 0 invalid texts:
Empty DataFrame
Columns: [cleaned_text, category]
Index: []


In [8]:
# Pull the model inputs (texts) and targets (categories) out of the cleaned frame
texts, categories = (
    df[column].to_list() for column in ("cleaned_text", "category")
)

In [9]:
# Type conversion: cast both columns to str in one call. DataFrame.astype
# returns a new frame, so no column is assigned on a filtered slice
# (which is what triggers SettingWithCopyWarning).
df = df.astype({"cleaned_text": str, "category": str})

In [10]:
# Targeted removal of rows whose text is NaN.
# NOTE(review): by this point the column has already been cast to str, so this
# filter is expected to remove nothing (the sizes printed below are equal).
print(f"Исходный размер датасета: {len(df)}")
has_text = df["cleaned_text"].notna()
df = df.loc[has_text]
print(f"Размер после очистки: {len(df)}")

Исходный размер датасета: 49993
Размер после очистки: 49993


In [11]:
# Check the results of the cleaning steps
print("\nПроверка очищенных данных:")
# Number of NaN values left in the text column
print(f"NaN в cleaned_text: {df['cleaned_text'].isna().sum()}")
# First 100 characters of the first text, as a visual spot check
print(f"Пример текста: {df['cleaned_text'].iloc[0][:100]}...")
# Distinct category values, i.e. the classes the model will predict
print(f"Уникальные категории: {df['category'].unique()}")


Проверка очищенных данных:
NaN в cleaned_text: 0
Пример текста: hsbc has sounded the alarm about the impact of higher trade tariffs on economic growth unemployment ...
Уникальные категории: ['News' 'Analytical' 'Feature' 'Editorial' 'Review']


In [12]:
# Encode the category names as integer class ids
label_encoder = LabelEncoder().fit(categories)
labels = label_encoder.transform(categories)

# Stratified split: 80% train, 20% held out for validation + test
train_texts, temp_texts, train_labels, temp_labels = train_test_split(
    texts, labels, test_size=0.2, stratify=labels, random_state=42
)

# Split the held-out part in half (again stratified): validation and test
val_texts, test_texts, val_labels, test_labels = train_test_split(
    temp_texts, temp_labels, test_size=0.5, stratify=temp_labels, random_state=42
)

# Extra sanity check applied before the Dataset objects are created
def validate_texts(text_list):
    """Ensure every item of ``text_list`` is a non-blank string.

    Args:
        text_list: iterable of candidate texts.

    Raises:
        ValueError: on the first item that is not a ``str``, or that is
            empty once surrounding whitespace is stripped. The message
            carries the offending index.
    """
    for position, candidate in enumerate(text_list):
        if isinstance(candidate, str):
            if candidate.strip():
                continue
            raise ValueError(f"Empty text at index {position}")
        raise ValueError(f"Non-string text at index {position}: {type(candidate)}")

# Run the check on every split, in train / validation / test order
for split_texts in (train_texts, val_texts, test_texts):
    validate_texts(split_texts)

In [13]:
# 4. Build the Dataset objects
def _build_dataset(split_texts, split_labels):
    """Wrap one split as a Dataset with "text" and "label" columns."""
    return Dataset.from_dict({"text": split_texts, "label": split_labels})


train_dataset = _build_dataset(train_texts, train_labels)
val_dataset = _build_dataset(val_texts, val_labels)
test_dataset = _build_dataset(test_texts, test_labels)

In [16]:
# 5. Tokenization (the raw texts are removed from the datasets afterwards)
tokenizer = BertTokenizer.from_pretrained(
    pretrained_model_name_or_path="bert-base-uncased"
)
def tokenize_function(examples):
    """Tokenize a batch of examples for the classifier.

    Args:
        examples: mapping with a "text" entry (the texts to tokenize) and a
            "label" entry (the matching targets).

    Returns:
        dict with "input_ids" and "attention_mask" taken from the tokenizer
        output, plus "labels" copied from ``examples["label"]``.
    """
    # Truncate to 256 tokens; padding is not applied here
    encoding = tokenizer(
        examples["text"],
        truncation=True,
        max_length=256,
        return_attention_mask=True,
        return_token_type_ids=False,
    )
    model_inputs = {
        field: encoding[field] for field in ("input_ids", "attention_mask")
    }
    model_inputs["labels"] = examples["label"]
    return model_inputs

# Tokenize every split in train / validation / test order.
# Both the raw "text" column and the original "label" column are removed:
# tokenize_function already emits the targets under "labels", so keeping
# "label" as well would leave a duplicate column that only works downstream
# because the padding collator happens to rename it.
tokenized_train, tokenized_val, tokenized_test = (
    split.map(
        tokenize_function,
        batched=True,
        remove_columns=["text", "label"],
    )
    for split in (train_dataset, val_dataset, test_dataset)
)

Map:   0%|          | 0/39994 [00:00<?, ? examples/s]

Map:   0%|          | 0/4999 [00:00<?, ? examples/s]

Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

In [17]:
# 6. DataLoader setup
# The collator pads each batch dynamically and returns PyTorch tensors;
# padded lengths are rounded up to a multiple of 8.
data_collator = DataCollatorWithPadding(
    tokenizer=tokenizer, padding=True, max_length=256, pad_to_multiple_of=8, return_tensors="pt"
)

# Shuffled batches of 16 examples for training
train_dataloader = DataLoader(
    dataset=tokenized_train, batch_size=16, shuffle=True, collate_fn=data_collator
)

In [18]:
# 7. Model
# The number of classes and the id <-> name mapping come from the fitted
# label encoder instead of a hard-coded 5. The head therefore always matches
# the data, and the saved config carries the real category names.
id2label = {idx: str(name) for idx, name in enumerate(label_encoder.classes_)}
label2id = {name: idx for idx, name in id2label.items()}

model = BertForSequenceClassification.from_pretrained(
    "bert-base-uncased",
    num_labels=len(id2label),
    id2label=id2label,
    label2id=label2id,
).to(device)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [19]:
# 8. Optimizer and learning-rate scheduler
epochs = 3
optimizer = AdamW(params=model.parameters(), lr=2e-5)

# One scheduler step per training batch, over all epochs, with no warmup
total_steps = epochs * len(train_dataloader)
scheduler = get_linear_schedule_with_warmup(
    optimizer=optimizer, num_warmup_steps=0, num_training_steps=total_steps
)

In [20]:
# Fetch a single training batch and show the shape of each tensor in it
sample_batch = next(iter(train_dataloader))
print("\nПример батча:")
print(dict((name, tensor.shape) for name, tensor in sample_batch.items()))


Пример батча:
{'input_ids': torch.Size([16, 256]), 'attention_mask': torch.Size([16, 256]), 'labels': torch.Size([16])}




In [22]:
# Evaluation loaders, each defined once. Evaluation order does not matter,
# so there is no shuffling. The training loader is already built in the
# DataLoader setup cell with the same arguments, so it is not rebuilt here.
val_dataloader = DataLoader(
    tokenized_val,
    batch_size=16,
    collate_fn=data_collator
)

test_dataloader = DataLoader(
    tokenized_test,
    batch_size=16,
    collate_fn=data_collator
)

In [23]:
# 9. Training
best_val_accuracy = 0

for epoch in range(epochs):
    # Training pass: one optimizer and scheduler step per batch
    model.train()
    progress_bar = tqdm(train_dataloader, desc=f"Epoch {epoch + 1}")
    for batch in progress_bar:
        batch = {k: v.to(device) for k, v in batch.items()}
        outputs = model(**batch)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        scheduler.step()
        optimizer.zero_grad()
        progress_bar.set_postfix({"loss": loss.item()})

    # Validation pass. Accuracy is correct / total over all examples:
    # averaging per-batch accuracies would give the shorter final batch the
    # same weight as a full one and bias the result.
    model.eval()
    correct = 0
    total = 0
    with torch.no_grad():
        for batch in tqdm(val_dataloader, desc="Validating"):
            batch = {k: v.to(device) for k, v in batch.items()}
            logits = model(**batch).logits
            predictions = torch.argmax(logits, dim=1)
            correct += (predictions == batch["labels"]).sum().item()
            total += batch["labels"].size(0)

    # Guard against an empty validation loader
    avg_val_accuracy = correct / total if total else 0.0
    print(f"Validation Accuracy: {avg_val_accuracy:.4f}")

    # Keep the weights of the best epoch so far
    if avg_val_accuracy > best_val_accuracy:
        best_val_accuracy = avg_val_accuracy
        torch.save(model.state_dict(), "best_model.pth")

Epoch 1:   0%|          | 0/2500 [00:00<?, ?it/s]

Validating:   0%|          | 0/313 [00:00<?, ?it/s]

Validation Accuracy: 0.8779


Epoch 2:   0%|          | 0/2500 [00:00<?, ?it/s]

Validating:   0%|          | 0/313 [00:00<?, ?it/s]

Validation Accuracy: 0.8811


Epoch 3:   0%|          | 0/2500 [00:00<?, ?it/s]

Validating:   0%|          | 0/313 [00:00<?, ?it/s]

Validation Accuracy: 0.8811


In [24]:
# 10. Testing
# Restore the best checkpoint; map_location lets the weights load even when
# the current device differs from the one they were saved on.
model.load_state_dict(torch.load("best_model.pth", map_location=device))
model.eval()

# Accuracy is correct / total over all test examples. Averaging per-batch
# accuracies would over-weight the shorter final batch.
correct = 0
total = 0
with torch.no_grad():
    for batch in tqdm(test_dataloader, desc="Testing"):
        batch = {k: v.to(device) for k, v in batch.items()}
        logits = model(**batch).logits
        predictions = torch.argmax(logits, dim=1)
        correct += (predictions == batch["labels"]).sum().item()
        total += batch["labels"].size(0)

# Guard against an empty test loader
test_accuracy = correct / total if total else 0.0
print(f"Test Accuracy: {test_accuracy:.4f}")

Testing:   0%|          | 0/313 [00:00<?, ?it/s]

Test Accuracy: 0.8762


In [25]:
# 11. Save the model and the tokenizer side by side in one directory
output_dir = "./bert_category_classifier"
model.save_pretrained(output_dir)
tokenizer.save_pretrained(output_dir)

('./bert_category_classifier\\tokenizer_config.json',
 './bert_category_classifier\\special_tokens_map.json',
 './bert_category_classifier\\vocab.txt',
 './bert_category_classifier\\added_tokens.json')

In [26]:
# Save the encoder.
# Persist the encoder that was fitted before training, not a freshly
# re-fitted one: the saved id -> category mapping must be exactly the one
# the model was trained with.
import joblib

assert len(label_encoder.classes_) == model.config.num_labels, (
    "label encoder and model disagree on the number of classes"
)
joblib.dump(label_encoder, 'label_encoder.pkl')

['label_encoder.pkl']