In [None]:
import numpy as np
import pandas as pd
import ast

# Lift pandas' display limits (column count, total width, per-cell width)
# so frames are printed without truncation.
for _display_option in ('display.max_columns', 'display.width', 'display.max_colwidth'):
    pd.set_option(_display_option, None)
import torch
from datasets import Dataset, DatasetDict, Features, Value, ClassLabel, Sequence
from seqeval.metrics import classification_report, f1_score, precision_score, recall_score, accuracy_score
from transformers import AutoTokenizer, AutoModelForTokenClassification, TrainingArguments, Trainer, \
    DataCollatorForTokenClassification, EarlyStoppingCallback, pipeline

In [None]:
# 0. Pick the compute device: CUDA when torch reports it as available,
# otherwise the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(f"Используется устройство: {device}")

# 1. Tag inventory for skill spans (BIO scheme) and the lookup tables
# between tag names and their integer ids. The id of a tag is its position
# in `label_list`.
label_list = ["O", "B-SKILL", "I-SKILL"]
id2label = dict(enumerate(label_list))
label2id = {name: idx for idx, name in id2label.items()}
# Same name -> id mapping under a second name (separate dict object).
label_encoding_dict = dict(label2id)
# Load the BIO-tagged vacancies from CSV.
file_path = 'vacancies_bio_tagged.csv'
df = pd.read_csv(file_path)


def _parse_list_cell(cell):
    """Turn a string cell into the Python literal it spells out.

    Non-string cells (e.g. NaN for a missing value) are returned unchanged.
    Presumably each string is the repr of a list written by an earlier
    export step -- confirm against the code that produced the CSV.
    """
    return ast.literal_eval(cell) if isinstance(cell, str) else cell


# Both list-valued columns are stored as text in the CSV and need parsing.
for _list_column in ('bio_tags', 'tokens'):
    df[_list_column] = df[_list_column].apply(_parse_list_cell)

# Record the row's index label as an explicit id column. (The original
# assigned this column twice in a row; the first assignment was a dead store.)
df['id'] = df.index
# Keep only rows whose annotation can actually be used for training.
# Set of tags a row is allowed to contain.
allowed_tags = {'O', 'B-SKILL', 'I-SKILL'}

# Report the frame size before filtering.
print(f"Размер df до фильтрации: {df.shape}")


def _is_valid_row(tokens, tags):
    """Return True when a (tokens, tags) pair is a usable training example.

    A row is usable when both cells are lists, they have the same length
    (one tag per token), and every tag belongs to `allowed_tags`. Rows that
    break the one-tag-per-token rule would otherwise fail later, when labels
    are looked up by word index during tokenization.
    """
    return (
        isinstance(tokens, list)
        and isinstance(tags, list)
        and len(tokens) == len(tags)
        and all(tag in allowed_tags for tag in tags)
    )


# Boolean mask aligned with df's index; built from a list so that an empty
# frame still yields an (empty) boolean Series.
mask = pd.Series(
    [_is_valid_row(tokens, tags) for tokens, tags in zip(df['tokens'], df['bio_tags'])],
    index=df.index,
    dtype=bool,
)

# Apply the mask and rebind the frame.
df = df[mask]

# Report the frame size after filtering.
print(f"Размер df после фильтрации: {df.shape}")

# Flatten every tag list into one sequence and report the distinct tags,
# as a sanity check on what is left in the frame.
all_tags = [
    tag
    for row_tags in df['bio_tags'].dropna()
    if isinstance(row_tags, list)
    for tag in row_tags
]

unique_tags = set(all_tags)

print("Уникальные теги в DataFrame:")
print(unique_tags)
def _build_split(frame, start, stop):
    """Return rows ``start:stop`` (positional, stop exclusive) as a dict of lists.

    Keys: 'id', 'tokens' and 'ner_tags_str' (the string BIO tags taken from
    the 'bio_tags' column). Every value is a plain Python list.
    """
    part = frame.iloc[start:stop]
    return {
        'id': part['id'].tolist(),
        'tokens': part['tokens'].tolist(),
        'ner_tags_str': part['bio_tags'].tolist(),
    }


# Contiguous positional splits. Slice ends are exclusive, so each split
# starts exactly where the previous one stops; the original boundaries
# (4001 / 4401) silently dropped rows 4000 and 4400.
train_data = _build_split(df, 0, 4000)
validation_data = _build_split(df, 4000, 4400)
test_data = _build_split(df, 4400, 4900)

In [None]:
# Add an integer-encoded copy of the string tags ('ner_tags') to each split,
# using the tag name -> id table.
for _split in (train_data, validation_data, test_data):
    _split['ner_tags'] = [
        [label_encoding_dict[tag] for tag in row_tags]
        for row_tags in _split['ner_tags_str']
    ]

# Schema shared by all three splits: string id, list of string tokens,
# list of class labels drawn from `label_list`.
features = Features({
    'id': Value('string'),
    'tokens': Sequence(Value('string')),
    'ner_tags': Sequence(ClassLabel(names=label_list))
})


def _to_dataset(split):
    """Build a `Dataset` from one split dict, casting ids to strings."""
    columns = {
        "id": [str(row_id) for row_id in split['id']],
        "tokens": split['tokens'],
        "ner_tags": split['ner_tags'],
    }
    return Dataset.from_dict(columns, features=features)


train_dataset = _to_dataset(train_data)
validation_dataset = _to_dataset(validation_data)
test_dataset = _to_dataset(test_data)

# Bundle the splits under the names used by the rest of the notebook.
dataset_dict = DatasetDict({
    'train': train_dataset,
    'validation': validation_dataset,
    'test': test_dataset,
})

# 3. Load the tokenizer
model_checkpoint = "ai-forever/ruBert-base"
# Truncation limit (in sub-tokens) applied when tokenizing examples.
max_seq_length = 512
try:
    tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
except Exception as e:
    print(f"Ошибка при загрузке токенизатора {model_checkpoint}: {e}")
    # Re-raise instead of calling exit(): in a notebook exit() tears down the
    # session and hides the traceback, while a raised error stops the run at
    # this cell and keeps the kernel state for debugging.
    raise


# 4. Tokenization and label alignment
# The tokenizer may split a word into several sub-tokens, so the word-level
# labels have to be re-aligned with those sub-tokens.
def _continuation_label(word_label_id):
    """Label id for a non-first sub-token of a word tagged `word_label_id`.

    A word tagged "B-X" continues as "I-X" (or -100, i.e. ignored, when the
    label set has no matching "I-X"); any other tag is repeated unchanged.
    """
    label_name = id2label[word_label_id]
    if label_name.startswith("B-"):
        return label2id.get("I-" + label_name[2:], -100)
    return word_label_id


def tokenize_and_align_labels(examples):
    """Tokenize a batch of pre-split examples and align labels to sub-tokens.

    Args:
        examples: batch mapping with "tokens" (list of word lists) and
            "ner_tags" (list of per-word label-id lists).

    Returns:
        The tokenizer output with an added "labels" entry: one list of label
        ids per example, the same length as that example's sub-token
        sequence. Positions without a source word (special tokens) get -100.

    Sequences are truncated to `max_seq_length` but not padded here: padding
    is left to the data collator, which pads each batch only to the length it
    needs instead of padding every example to the maximum.
    """
    tokenized_inputs = tokenizer(
        examples["tokens"],
        truncation=True,
        max_length=max_seq_length,
        is_split_into_words=True,
    )

    labels = []
    for i, word_labels in enumerate(examples["ner_tags"]):
        previous_word_idx = None
        label_ids = []
        for word_idx in tokenized_inputs.word_ids(batch_index=i):
            if word_idx is None:
                # No source word for this position.
                label_ids.append(-100)
            elif word_idx != previous_word_idx:
                # First sub-token of a word keeps the word's own label.
                label_ids.append(word_labels[word_idx])
            else:
                # Later sub-tokens of the same word.
                label_ids.append(_continuation_label(word_labels[word_idx]))
            previous_word_idx = word_idx
        labels.append(label_ids)

    tokenized_inputs["labels"] = labels
    return tokenized_inputs


# Run tokenization + label alignment over every split, in batches.
tokenized_datasets = dataset_dict.map(
    function=tokenize_and_align_labels,
    batched=True,
)

# 5. Load the model for token classification, with a classification head
# sized to the label set and the id <-> name tables stored in its config.
try:
    model = AutoModelForTokenClassification.from_pretrained(
        model_checkpoint,
        num_labels=len(label_list),
        id2label=id2label,
        label2id=label2id
    ).to(device)
except Exception as e:
    print(f"Ошибка при загрузке модели {model_checkpoint} для TokenClassification: {e}")
    # Re-raise instead of calling exit(): in a notebook exit() tears down the
    # session and hides the traceback, while a raised error stops the run at
    # this cell and keeps the kernel state for debugging.
    raise

# 6. Training arguments
model_output_dir = "my-skill-extractor-model"

training_args = TrainingArguments(
    output_dir=model_output_dir,
    eval_strategy="epoch",
    learning_rate=3e-5,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=16,
    num_train_epochs=5,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=500,
    # Checkpoints are saved on the same per-epoch schedule as evaluation so
    # the best one (by "f1") can be restored when training ends.
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="f1",
    # Train on the CPU only when CUDA is unavailable. The original hard-coded
    # use_cpu=True, which forced CPU training even on a GPU machine and
    # contradicted the device selection made when the model was loaded.
    use_cpu=not torch.cuda.is_available(),
)

# 7. Data collator
# DataCollatorForTokenClassification is intended to pad each batch
# dynamically when batches are assembled.
data_collator = DataCollatorForTokenClassification(tokenizer)


# 8. Evaluation metrics
def compute_metrics(p):
    """Compute span-level precision/recall/F1 and token accuracy with seqeval.

    Args:
        p: pair ``(predictions, labels)``. `predictions` is argmax-ed over
            axis 2, so it is expected to hold per-token class scores;
            `labels` holds gold label ids, with -100 marking positions to skip.

    Returns:
        Dict with keys "precision", "recall", "f1" and "accuracy_score".
    """
    scores, gold_ids = p
    predicted_ids = np.argmax(scores, axis=2)

    # Convert ids to tag names, dropping every position whose gold label is
    # -100 (special tokens / padding). One pass builds both sequences.
    true_predictions, true_labels = [], []
    for predicted_row, gold_row in zip(predicted_ids, gold_ids):
        kept = [(pred, gold) for pred, gold in zip(predicted_row, gold_row) if gold != -100]
        true_predictions.append([label_list[pred] for pred, _ in kept])
        true_labels.append([label_list[gold] for _, gold in kept])

    # The original also built a full classification_report on every
    # evaluation and then discarded it; that unused computation is removed.
    return {
        "precision": precision_score(true_labels, true_predictions, zero_division=0),
        "recall": recall_score(true_labels, true_predictions, zero_division=0),
        "f1": f1_score(true_labels, true_predictions, zero_division=0),
        "accuracy_score": accuracy_score(true_labels, true_predictions)
    }


# 9. Build the Trainer
# Early stopping is configured with a patience of 3.
_early_stopping = EarlyStoppingCallback(early_stopping_patience=3)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    callbacks=[_early_stopping],
)

In [None]:
# Evaluate on the held-out test split and print every returned metric.
# NOTE(review): no trainer.train() call is visible before this cell in this
# section -- confirm the model is trained (or a checkpoint is loaded)
# before these numbers are relied on.
print("\n--- Оценка на тестовом наборе ---")
if len(tokenized_datasets["test"]) == 0:
    print("Тестовый набор данных пуст. Оценка на нем не будет проведена.")
else:
    test_results = trainer.evaluate(eval_dataset=tokenized_datasets["test"])
    print("Результаты на тестовом наборе:")
    for key, value in test_results.items():
        print(f"  {key}: {value:.4f}")
