In [None]:
import torch
from transformers import AutoTokenizer, AutoModelForTokenClassification, TrainingArguments, Trainer
from datasets import load_dataset
from transformers import DataCollatorForTokenClassification

# Tokenizer of the base checkpoint that is fine-tuned below
tokenizer = AutoTokenizer.from_pretrained("cointegrated/rubert-tiny")

# NER corpus: the "ru" configuration of the "wikiann" dataset
ner_dataset = load_dataset("wikiann", "ru")

# Tag names in class-id order, read from the dataset's own `ner_tags` feature
label_list = ner_dataset["train"].features["ner_tags"].feature.names
num_labels = len(label_list)

# Tokenization + label alignment
def tokenize_and_align_labels(examples):
    """Tokenize a batch of pre-split sentences and align NER tags to sub-tokens.

    Args:
        examples: Batch mapping with "tokens" (one list of words per sentence)
            and "ner_tags" (one list of integer tag ids per sentence).

    Returns:
        The tokenizer output with an extra "labels" entry. For every sentence
        the first sub-token of each word carries that word's tag id; all other
        positions (positions without a word id, and continuation sub-tokens)
        are set to -100.
    """
    encoded = tokenizer(
        examples["tokens"],
        is_split_into_words=True,
        truncation=True,
        max_length=128,
        padding="max_length",
    )

    def _aligned_labels(row, tags):
        # Pair each position's word id with the word id of the position before
        # it (None before the first position) to detect word starts.
        current_ids = encoded.word_ids(batch_index=row)
        preceding_ids = [None] + current_ids[:-1]
        return [
            tags[current] if current is not None and current != before else -100
            for current, before in zip(current_ids, preceding_ids)
        ]

    encoded["labels"] = [
        _aligned_labels(row, tags)
        for row, tags in enumerate(examples["ner_tags"])
    ]
    return encoded

# Tokenize every split and attach word-aligned label ids.
tokenized_ner = ner_dataset.map(tokenize_and_align_labels, batched=True)

# Take the tag names from the dataset's own ClassLabel feature so that the
# id -> name mapping stored in the model config matches the integer ids found
# in `ner_tags`. A hand-written list is easy to get out of order: WikiANN puts
# the ORG tags before the LOC tags, so the previous hard-coded
# [..., 'B-LOC', 'I-LOC', 'B-ORG', 'I-ORG'] made the trained model report
# organisations as locations and vice versa.
label_list = ner_dataset["train"].features["ner_tags"].feature.names
num_labels = len(label_list)
id2label = {i: label for i, label in enumerate(label_list)}
label2id = {label: i for i, label in enumerate(label_list)}

# Base encoder with a freshly initialised token-classification head.
ner_model = AutoModelForTokenClassification.from_pretrained(
    "cointegrated/rubert-tiny",
    num_labels=num_labels,
    id2label=id2label,
    label2id=label2id
)


# DataCollator
data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)

# Training arguments (only options compatible with version 5.0.0, per the original note)
training_args = TrainingArguments(
    # Forward slashes on purpose: in the old literal "data\\models\\ner_model"
    # written with single backslashes, "\n" was parsed as a newline character.
    output_dir="data/models/ner_model",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    num_train_epochs=3,
    logging_steps=50
)

# Trainer
trainer = Trainer(
    model=ner_model,
    args=training_args,
    train_dataset=tokenized_ner["train"],
    eval_dataset=tokenized_ner["validation"],
    data_collator=data_collator
)

# Training (kept as the last expression so the notebook echoes the TrainOutput)
trainer.train()


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md: 0.00B [00:00, ?B/s]

ru/validation-00000-of-00001.parquet:   0%|          | 0.00/809k [00:00<?, ?B/s]

ru/test-00000-of-00001.parquet:   0%|          | 0.00/816k [00:00<?, ?B/s]



ru/train-00000-of-00001.parquet:   0%|          | 0.00/1.63M [00:00<?, ?B/s]

Generating validation split:   0%|          | 0/10000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/10000 [00:00<?, ? examples/s]

Generating train split:   0%|          | 0/20000 [00:00<?, ? examples/s]

config.json:   0%|          | 0.00/632 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/341 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

Map:   0%|          | 0/10000 [00:00<?, ? examples/s]

Map:   0%|          | 0/10000 [00:00<?, ? examples/s]

Map:   0%|          | 0/20000 [00:00<?, ? examples/s]

model.safetensors:   0%|          | 0.00/47.7M [00:00<?, ?B/s]

Loading weights:   0%|          | 0/53 [00:00<?, ?it/s]

BertForTokenClassification LOAD REPORT from: cointegrated/rubert-tiny
Key                                        | Status     | 
-------------------------------------------+------------+-
cls.seq_relationship.weight                | UNEXPECTED | 
cls.predictions.transform.LayerNorm.weight | UNEXPECTED | 
cls.predictions.transform.dense.weight     | UNEXPECTED | 
bert.pooler.dense.bias                     | UNEXPECTED | 
cls.predictions.bias                       | UNEXPECTED | 
cls.seq_relationship.bias                  | UNEXPECTED | 
bert.pooler.dense.weight                   | UNEXPECTED | 
cls.predictions.transform.LayerNorm.bias   | UNEXPECTED | 
bert.embeddings.position_ids               | UNEXPECTED | 
cls.predictions.transform.dense.bias       | UNEXPECTED | 
classifier.bias                            | MISSING    | 
classifier.weight                          | MISSING    | 

Notes:
- UNEXPECTED	:can be ignored when loading from different task/architecture; not ok if you expect

Step,Training Loss
50,1.591253
100,1.176574
150,0.953467
200,0.762909
250,0.712873
300,0.676857
350,0.584391
400,0.560399
450,0.52056
500,0.515421


Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

TrainOutput(global_step=3750, training_loss=0.3891661089579264, metrics={'train_runtime': 101.4572, 'train_samples_per_second': 591.383, 'train_steps_per_second': 36.961, 'total_flos': 106185231360000.0, 'train_loss': 0.3891661089579264, 'epoch': 3.0})

In [None]:
# Recursively archive the `ner_model_saved` directory into ner_model_saved.zip.
# NOTE(review): the saving cell writes to `save_path`, not to `ner_model_saved`,
# and it appears after this cell — confirm which directory should be archived
# and run this only after the model has been saved.
!zip -r ner_model_saved.zip ner_model_saved

  adding: ner_model_saved/ (stored 0%)
  adding: ner_model_saved/config.json (deflated 55%)
  adding: ner_model_saved/model.safetensors (deflated 8%)
  adding: ner_model_saved/tokenizer_config.json (deflated 45%)
  adding: ner_model_saved/tokenizer.json (deflated 70%)


In [None]:
# Persist the fine-tuned model and its tokenizer into the same directory.
# NOTE(review): this is a normal (non-raw) string, so the "\n" inside it is a
# newline escape and "\m" is an unrecognised escape sequence. The literal is
# left untouched here because the loading cell uses the identical literal for
# `model_path`; change both together (e.g. to "data/models/ner_model").
save_path = "data\models\ner_model"

trainer.model.save_pretrained(save_path)

# Last expression of the cell: its return value is echoed by the notebook.
tokenizer.save_pretrained(save_path)


Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

('./ner_model_saved/tokenizer_config.json', './ner_model_saved/tokenizer.json')

In [None]:
import torch
from transformers import AutoTokenizer, AutoModelForTokenClassification

# Reload the fine-tuned checkpoint and its tokenizer from disk for inference.
# NOTE(review): non-raw string — "\n" is a newline escape and "\m" an
# unrecognised escape. It must stay byte-identical to `save_path` in the saving
# cell, otherwise the checkpoint is not found; change both together
# (e.g. to "data/models/ner_model").
model_path = "data\models\ner_model"
tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModelForTokenClassification.from_pretrained(model_path)
# Switch to evaluation mode; as the last expression, the returned module is
# echoed by the notebook.
model.eval()

Loading weights:   0%|          | 0/55 [00:00<?, ?it/s]

BertForTokenClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(29564, 312, padding_idx=0)
      (position_embeddings): Embedding(512, 312)
      (token_type_embeddings): Embedding(2, 312)
      (LayerNorm): LayerNorm((312,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-2): 3 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=312, out_features=312, bias=True)
              (key): Linear(in_features=312, out_features=312, bias=True)
              (value): Linear(in_features=312, out_features=312, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=312, out_features=312, bias=True)
              (LayerNorm): LayerNorm((312,), eps=1e-12, elem

In [None]:
# Inspect the id -> tag-name mapping stored in the loaded model's config.
# NOTE(review): in the training cell `label_list` is a list of tag names; here
# the same name is rebound to a dict.
label_list = model.config.id2label
label_list

{0: 'O',
 1: 'B-PER',
 2: 'I-PER',
 3: 'B-LOC',
 4: 'I-LOC',
 5: 'B-ORG',
 6: 'I-ORG'}

In [None]:
import torch

# Replacement table: entity type -> placeholder written into the filtered text
replace_dict = dict(
    PER="*имя*",
    LOC="*город*",
    ORG="*организация*",
)

def filter_text(text, max_length=128):
    """Replace named entities in ``text`` with placeholders from ``replace_dict``.

    The text is split on whitespace. Every word is tagged by the NER model
    through its first sub-token, and a word whose tag type (the part after the
    last "-") is a key of ``replace_dict`` is replaced by the mapped placeholder.

    Inputs that do not fit into one window of ``max_length`` tokens are split
    into consecutive windows instead of being cut off, so every input word is
    present in the result. Previously words beyond the first window were
    silently dropped from the returned text.

    Args:
        text: Raw input string.
        max_length: Maximum number of tokens per window passed to the tokenizer.

    Returns:
        The words of ``text`` joined by single spaces, entity words replaced.
        An empty or whitespace-only input yields an empty string.
    """
    words = text.split()
    if not words:
        return ""

    inputs = tokenizer(
        words,
        is_split_into_words=True,
        return_tensors="pt",
        truncation=True,
        max_length=max_length,
        # Pad only up to the longest window so the windows stack into one tensor.
        padding=True,
        # Keep what does not fit as additional windows instead of discarding it.
        return_overflowing_tokens=True,
    )

    window_count = inputs["input_ids"].shape[0]
    word_ids_per_window = [
        inputs.word_ids(batch_index=window) for window in range(window_count)
    ]

    # The overflow bookkeeping entry is not a model input; forwarding it would
    # be rejected as an unexpected keyword argument.
    model_inputs = {
        key: value
        for key, value in inputs.items()
        if key != "overflow_to_sample_mapping"
    }

    with torch.no_grad():
        logits = model(**model_inputs).logits
    predictions = torch.argmax(logits, dim=-1).tolist()

    id2label = model.config.id2label

    # Labels are stored by word index (not appended in order), so a word that
    # produced no tokens cannot shift the labels of the words after it.
    word_labels = ["O"] * len(words)
    labelled = set()
    for window, word_ids in enumerate(word_ids_per_window):
        for position, word_idx in enumerate(word_ids):
            if word_idx is None or word_idx in labelled:
                continue  # special/padding position or continuation sub-token
            labelled.add(word_idx)
            word_labels[word_idx] = id2label[predictions[window][position]]

    filtered_words = [
        replace_dict.get(label.split("-")[-1], word)
        for word, label in zip(words, word_labels)
    ]
    return " ".join(filtered_words)


In [None]:
from transformers import pipeline

# Mood: sentiment-analysis pipeline backed by the
# blanchefort/rubert-base-cased-sentiment checkpoint
mood_model = pipeline(
    task="sentiment-analysis",
    model="blanchefort/rubert-base-cased-sentiment",
)

def get_mood(text):
    """Classify the sentiment of ``text`` with ``mood_model``.

    The input is truncated to 512 tokens so that long texts are classified
    on their beginning rather than failing inside the pipeline.

    Args:
        text: Input string.

    Returns:
        A ``(label, score)`` tuple: the top label reported by the pipeline and
        its score rounded to two decimals.
    """
    top = mood_model(text, truncation=True, max_length=512)[0]
    return top["label"], round(top["score"], 2)


config.json:   0%|          | 0.00/943 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/711M [00:00<?, ?B/s]

Loading weights:   0%|          | 0/201 [00:00<?, ?it/s]

BertForSequenceClassification LOAD REPORT from: blanchefort/rubert-base-cased-sentiment
Key                          | Status     |  | 
-----------------------------+------------+--+-
bert.embeddings.position_ids | UNEXPECTED |  | 

Notes:
- UNEXPECTED	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.


tokenizer_config.json:   0%|          | 0.00/499 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

In [None]:
# Smoke test: run the entity filter and the sentiment model on one sample sentence.
text = "Мне грустно, я в печали"
filtered_text = filter_text(text)
# The sentiment is computed on the original text, not on the filtered one.
mood = get_mood(text)
print(filtered_text, mood)


Мне грустно, я в печали ('NEGATIVE', 0.75)
