In [1]:
import torch

import pandas as pd
from datasets import Dataset

from transformers import (
    BertTokenizerFast,
    BertForSequenceClassification,
    TrainingArguments,
    Trainer,
)

import transformers
import datasets
import accelerate

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
df = pd.read_csv(r'C:\web_class_news\transformer-news-finetune\research\data\text_and_topic.csv') # Заменить на наш путь к дата сэту
df.head()

Unnamed: 0,text,topic
0,В России увеличили МРОТ и прожиточный минимум,Экономика
1,В Сербии запустили продолжение «Турецкого потока»,Экономика
2,Названы преимущества «Турецкого потока» для Се...,Экономика
3,В России заморозили накопительную пенсию,Экономика
4,В России начал действовать налог на проценты о...,Экономика


In [4]:
df.value_counts(df['topic'])

topic
Экономика          59958
Наука и техника    40976
Спорт              23629
Name: count, dtype: int64

In [5]:
label2id = {"Экономика": 0, "Наука и техника": 1, "Спорт": 2}
df["label"] = df["topic"].map(label2id)

df = df[["text", "label"]]
df.head()

Unnamed: 0,text,label
0,В России увеличили МРОТ и прожиточный минимум,0
1,В Сербии запустили продолжение «Турецкого потока»,0
2,Названы преимущества «Турецкого потока» для Се...,0
3,В России заморозили накопительную пенсию,0
4,В России начал действовать налог на проценты о...,0


In [6]:
dataset = Dataset.from_pandas(df)
dataset = dataset.train_test_split(test_size=0.1, seed=42)

train_ds = dataset["train"]
test_ds  = dataset["test"]

In [7]:
tokenizer = BertTokenizerFast.from_pretrained("DeepPavlov/rubert-base-cased")

In [10]:
def tokenize(batch):
    return tokenizer(
        batch["text"],
        padding="max_length",
        truncation=True,
        max_length=256
    )

train_ds = train_ds.map(tokenize, batched=True)
test_ds  = test_ds.map(tokenize, batched=True)

Map:   0%|          | 0/112106 [00:00<?, ? examples/s]

Map:   0%|          | 0/12457 [00:00<?, ? examples/s]

In [11]:
train_ds.set_format("torch", columns=["input_ids", "attention_mask", "label"])
test_ds.set_format("torch", columns=["input_ids", "attention_mask", "label"])

In [12]:
model = BertForSequenceClassification.from_pretrained(
    "DeepPavlov/rubert-base-cased",
    num_labels=3
)

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


pytorch_model.bin:   0%|          | 0.00/714M [00:00<?, ?B/s]

Error while downloading from https://huggingface.co/DeepPavlov/rubert-base-cased/resolve/main/pytorch_model.bin: The operation did not complete (read) (_ssl.c:2559)
Trying to resume download...


pytorch_model.bin:  73%|#######3  | 524M/714M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at DeepPavlov/rubert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/714M [00:00<?, ?B/s]

In [13]:
training_args = TrainingArguments(
    output_dir="rubert-news",
    eval_strategy="epoch",
    save_strategy="epoch",
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3
)

In [14]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_ds,
    eval_dataset=test_ds,
)

trainer.train()



Epoch,Training Loss,Validation Loss


KeyboardInterrupt: 

In [8]:
model_dir = r"C:\web_class_news\transformer-news-finetune\app\model\rubert-news_model" # Замените на ваш путь к модели

tokenizer = BertTokenizerFast.from_pretrained(model_dir)
model = BertForSequenceClassification.from_pretrained(model_dir)

model.eval()

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(119547, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1

In [9]:
id2label = {0: "Экономика", 1: "Наука и техника", 2: "Спорт"}

In [11]:
text = ""

inputs = tokenizer(
    text,
    return_tensors="pt",
    truncation=True,
    padding=True,
    max_length=256
)

with torch.no_grad():
    outputs = model(**inputs)

logits = outputs.logits
pred_id = torch.argmax(logits, dim=-1).item()
pred_label = id2label[pred_id]

print(pred_label)

Наука и техника
