In [1]:
from datasets import load_dataset

dataset_dict = load_dataset('ag_news')

small_train_dataset = dataset_dict["train"].shuffle(seed=42).select(range(500))
small_test_dataset = dataset_dict["test"].shuffle(seed=42).select(range(200))

def truncate_text(example):
    example["text"] = example["text"][:100]
    return example

small_train_dataset = small_train_dataset.map(truncate_text)
small_test_dataset = small_test_dataset.map(truncate_text)

In [2]:
from transformers import AutoTokenizer
import torch
import torch.nn as nn
from transformers import DistilBertModel


tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

def tokenize_function(examples):
    return tokenizer(examples["text"], padding="max_length", truncation=True, max_length=100)

tokenized_train = small_train_dataset.map(tokenize_function, batched=True)
tokenized_test = small_test_dataset.map(tokenize_function, batched=True)


class SimpleClassifier(nn.Module):
    def __init__(self, num_labels=4):
        super().__init__()
        self.distilbert = DistilBertModel.from_pretrained("distilbert-base-uncased")
        self.classifier = nn.Sequential(
            nn.Linear(768, num_labels)
        )

    def forward(self, input_ids, attention_mask=None):
        outputs = self.distilbert(input_ids=input_ids, attention_mask=attention_mask)
        hidden_state = outputs.last_hidden_state[:, 0, :]  # [CLS] token
        logits = self.classifier(hidden_state)
        return logits
    

from torch.utils.data import DataLoader
from transformers import default_data_collator

# Преобразуем в DataLoader
train_loader = DataLoader(tokenized_train, shuffle=True, batch_size=16, collate_fn=default_data_collator)
test_loader = DataLoader(tokenized_test, batch_size=16, collate_fn=default_data_collator)

# Инициализируем модель, loss и оптимизатор
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = SimpleClassifier(num_labels=4).to(device)
optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5)
loss_fn = nn.CrossEntropyLoss()

Map:   0%|          | 0/200 [00:00<?, ? examples/s]

In [3]:
model.train()
for epoch in range(5):
    for batch in train_loader:
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["labels"].to(device)

        outputs = model(input_ids, attention_mask=attention_mask)
        loss = loss_fn(outputs, labels)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    print(f"Epoch {epoch + 1} Loss: {loss.item():.4f}")


model.eval()
correct = 0
total = 0

with torch.no_grad():
    for batch in test_loader:
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["labels"].to(device)

        outputs = model(input_ids, attention_mask=attention_mask)
        _, predicted = torch.max(outputs, 1)
        total += labels.size(0)
        correct += (predicted == labels).sum().item()

print(f"Accuracy on test set: {100 * correct / total:.2f}%")

Epoch 1 Loss: 0.0654
Epoch 2 Loss: 0.2269
Epoch 3 Loss: 0.5233
Epoch 4 Loss: 0.0079
Epoch 5 Loss: 0.0059
Accuracy on test set: 82.50%


In [4]:
model = model.to("cpu")
torch.cuda.empty_cache() 

In [5]:
import json
import re
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch

def label_smoothing_with_llm(
    text: str,
    y_true: int,
    num_classes: int,
    class_names: list,
    model,
    tokenizer,
    device: str = "cuda",
    epsilon: float = 0.1,
    temperature: float = 0.1
):


    system_msg = f"""Вы - классификатор текстов. Верните сглаженные вероятности для каждого класса в формате JSON.
    На основе следующего текста определите сглаженные вероятности принадлежности к каждому из {num_classes} классов: 
    {', '.join(class_names)}. Учтите, что истинный класс текста — {class_names[y_true]} (индекс {y_true}), 
    и примените label smoothing с параметром epsilon={epsilon}. Верните результат в формате JSON, например:
    {"{"}"class_1": prob_1, "class_2": prob_2, "class_3": prob_3, "class_4": prob_4{"}"}
    """
    user_msg = f"""Текст: "{text}"
    """

    messages = [
        {"role": "system", "content": system_msg},
        {"role": "user", "content": user_msg}
    ]

    text_prompt = tokenizer.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True,
        enable_thinking=False
    )
    inputs = tokenizer(text_prompt, return_tensors="pt").to(device)
    outputs = model.generate(
        **inputs,
        max_new_tokens=64,
        temperature=temperature,
        do_sample=True
    )

    output_ids = outputs[0][len(inputs.input_ids[0]):].tolist()

    try:
        index = len(output_ids) - output_ids[::-1].index(151668)
    except ValueError:
        index = 0

    content = tokenizer.decode(output_ids[index:], skip_special_tokens=True).strip("\n")

    # Парсинг JSON и нормализация
    try:
        content = content.replace("```json", "").replace("```", "").strip()
        data = json.loads(content)

        if not isinstance(data, dict):
            raise ValueError(f"Ожидался словарь, получили {type(data)}")

        probabilities = [data.get(cls, 0.0) for cls in class_names]
        total = sum(probabilities)
        if (total > 0):
            normalized_probs = [p / total for p in probabilities]
        else:
            normalized_probs = [1.0 / num_classes for _ in range(num_classes)]

    except (json.JSONDecodeError, ValueError) as e:
        print(f"Ошибка парсинга JSON: {e}. Используется равномерное распределение.")
        normalized_probs = [1.0 / num_classes for _ in range(num_classes)]
    print(y_true, normalized_probs)
    return normalized_probs

In [6]:
class_names = dataset_dict["train"].features["label"].names
model_name = "Qwen/Qwen3-0.6B"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype="auto",
    device_map="cuda"
)

def apply_label_smoothing(example, num_classes=4, epsilon=0.1):
    example["soft_label"] = label_smoothing_with_llm(
    text=example["text"],
    y_true=example["label"],
    num_classes=num_classes,
    class_names=class_names,
    model=model,
    tokenizer=tokenizer,
    device="cuda",
    epsilon=epsilon
    )
    return example

small_train_dataset = small_train_dataset.map(lambda x: apply_label_smoothing(x, num_classes=4, epsilon=0.5))


Map:   0%|          | 0/500 [00:00<?, ? examples/s]

0 [0.9999970000060001, 9.99998000004e-07, 9.99998000004e-07, 9.99998000004e-07]
1 [0.25, 0.25, 0.25, 0.25]
0 [0.9999970000060001, 9.99998000004e-07, 9.99998000004e-07, 9.99998000004e-07]
3 [0.25, 0.25, 0.25, 0.25]
0 [0.9999970000060001, 9.99998000004e-07, 9.99998000004e-07, 9.99998000004e-07]
3 [0.3333333333333333, 0.16666666666666666, 0.16666666666666666, 0.3333333333333333]
0 [0.9999970000060001, 9.99998000004e-07, 9.99998000004e-07, 9.99998000004e-07]
3 [0.3333333333333333, 0.16666666666666666, 0.16666666666666666, 0.3333333333333333]
3 [0.3333333333333333, 0.16666666666666666, 0.16666666666666666, 0.3333333333333333]
2 [0.3333333333333333, 0.0, 0.6333333333333333, 0.03333333333333333]
1 [0.25, 0.25, 0.25, 0.25]
2 [0.25, 0.25, 0.25, 0.25]
3 [0.0, 0.0, 0.0, 1.0]
0 [0.9999970000060001, 9.99998000004e-07, 9.99998000004e-07, 9.99998000004e-07]
2 [0.3333333333333333, 0.0, 0.6333333333333333, 0.03333333333333333]
1 [0.25, 0.25, 0.25, 0.25]
1 [0.25, 0.25, 0.25, 0.25]
2 [0.25, 0.25, 0.25, 0

In [8]:
model = model.to("cpu")
torch.cuda.empty_cache() 

In [9]:
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

def tokenize_function(examples):
    return tokenizer(examples["text"], padding="max_length", truncation=True, max_length=100)

tokenized_train = small_train_dataset.map(tokenize_function, batched=True)
tokenized_test = small_test_dataset.map(tokenize_function, batched=True)

train_loader =  DataLoader(tokenized_train, shuffle=True, batch_size=16, collate_fn=default_data_collator)
test_loader =  DataLoader(tokenized_test, batch_size=16, collate_fn=default_data_collator)

Map:   0%|          | 0/500 [00:00<?, ? examples/s]

In [11]:
model = SimpleClassifier(num_labels=4).to(device)
optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5)
loss_fn = nn.KLDivLoss(reduction='batchmean')

model.train()
for epoch in range(5):
    for batch in train_loader:
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        soft_labels = batch["soft_label"].float().to(device)

        logits = model(input_ids, attention_mask=attention_mask)
        log_probs = torch.nn.functional.log_softmax(logits, dim=-1)

        loss = loss_fn(log_probs, soft_labels)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    print(f"Epoch {epoch+1} Loss: {loss.item():.4f}")



model.eval()
correct = 0
total = 0

with torch.no_grad():
    for batch in test_loader:
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["labels"].to(device)

        logits = model(input_ids, attention_mask=attention_mask)
        _, predicted = torch.max(logits, 1)

        total += labels.size(0)
        correct += (predicted == labels).sum().item()

print(f"Accuracy on test set: {100 * correct / total:.2f}%")

Epoch 1 Loss: 0.3170
Epoch 2 Loss: 0.1326
Epoch 3 Loss: 0.0213
Epoch 4 Loss: 0.1349
Epoch 5 Loss: 0.0222
Accuracy on test set: 56.50%


Выводы:

Маленькая llm (Qwen3-0.6B) если ей не дать подумать не очень хорошо соблюдают формат вывода а также выполняет задачу выдачи сглаженных вероятностей для классификации текста, поэтому такой label_smoothing только ощктимо ухудшает метрики: 56.5% против 82.5% accuracy.