In [2]:
from google.colab import drive
import getpass, os

# === Настройка проекта ===
USER = "tokarevdr"   # твой GitHub username
REPO = "entities-extraction-x5"            # название репозитория
EMAIL = "fedorov.alexander.04@gmail.com"    # твоя почта для git
NAME = "Alexander"           # твоё имя для git
# === Подключение Google Drive ===
drive.mount('/content/drive')
PROJECTS_DIR = "/content/drive/MyDrive/Colab Notebooks"
%cd $PROJECTS_DIR
# === GitHub авторизация ===
token = getpass.getpass('Введи GitHub PAT токен: ')
os.environ["GITHUB_TOKEN"] = token


# === Проверяем: если репозиторий ещё не скачан, клонируем ===
if not os.path.exists(f"{PROJECTS_DIR}/{REPO}/ML PART"):
    print('Заново склонировали репу')
    !git clone https://{USER}:{os.environ["GITHUB_TOKEN"]}@github.com/{USER}/{REPO}.git
# === Переходим в папку проекта ===
%cd {REPO}/{'ML_PART'}

# === Настройка Git ===
!git config --global user.email "{EMAIL}"
!git config --global user.name "{NAME}"
!git remote set-url origin https://{USER}:{os.environ["GITHUB_TOKEN"]}@github.com/{USER}/{REPO}.git

print("✅ Всё готово! Рабочая папка:", os.getcwd())


Mounted at /content/drive
/content/drive/MyDrive/Colab Notebooks
Введи GitHub PAT токен: ··········
Заново склонировали репу
fatal: destination path 'entities-extraction-x5' already exists and is not an empty directory.
/content/drive/MyDrive/Colab Notebooks/entities-extraction-x5/ML_PART
✅ Всё готово! Рабочая папка: /content/drive/MyDrive/Colab Notebooks/entities-extraction-x5/ML_PART


In [3]:
# Установка зависимостей
!pip install transformers torch torchvision torchaudio accelerate tokenizers sentencepiece torchcrf datasets seqeval matplotlib onnx onnxruntime

Collecting torchcrf
  Downloading TorchCRF-1.1.0-py3-none-any.whl.metadata (2.3 kB)
Collecting seqeval
  Downloading seqeval-1.2.2.tar.gz (43 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.6/43.6 kB[0m [31m4.4 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting onnx
  Downloading onnx-1.19.0-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.whl.metadata (7.0 kB)
Collecting onnxruntime
  Downloading onnxruntime-1.23.0-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (4.9 kB)
Collecting coloredlogs (from onnxruntime)
  Downloading coloredlogs-15.0.1-py2.py3-none-any.whl.metadata (12 kB)
Collecting humanfriendly>=9.1 (from coloredlogs->onnxruntime)
  Downloading humanfriendly-10.0-py2.py3-none-any.whl.metadata (9.2 kB)
Downloading TorchCRF-1.1.0-py3-none-any.whl (5.2 kB)
Downloading onnx-1.19.0-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.whl (18.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━

In [4]:
! pip install --upgrade onnxruntime



In [5]:
from google.colab import drive
import getpass, os, json, random, time
import numpy as np
import pandas as pd
import torch
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, KFold
from torch.utils.data import Dataset, DataLoader
from torch.optim import AdamW
from transformers import AutoTokenizer, AutoModelForTokenClassification, get_scheduler
from TorchCRF import CRF # Написать про версию потом
from seqeval.metrics import f1_score
import onnxruntime as ort
from onnxruntime.quantization import quantize_dynamic, QuantType
import ast
import traceback
from module import calculate_ner_metrics, calculate_macro_f1, process_submission
from torch.nn.utils.rnn import pad_sequence

In [6]:
# --- Основные пути для сохранения результатов ---
WHERE_DATA = '/cleared_data'
BASE_MODEL_NAME = "bert"
OUT_DIR = f"OUTPUT{WHERE_DATA}/{BASE_MODEL_NAME}"
os.makedirs(OUT_DIR, exist_ok=True)       # папка для сохранения всех файлов
FINAL_METRICS_PATH = f"{OUT_DIR}/final_training_metrics_per_epoch.csv"
MODEL_PATH = f'MODELS/{WHERE_DATA}/{BASE_MODEL_NAME}'
os.makedirs(MODEL_PATH, exist_ok=True)
DATA_DIR = f'data/{WHERE_DATA}/'
PATIENCE = 3      # количество эпох без улучшения F1 до остановки
SEED = 42

In [7]:
import os
os.environ["CUDA_LAUNCH_BLOCKING"] = "1"

In [8]:
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.cuda.manual_seed_all(SEED)
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

In [9]:
# Проверка GPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Используемое устройство: {device}")

Используемое устройство: cuda


In [10]:
# Конфигурация
CONFIG = {
    "model_checkpoint": "DeepPavlov/rubert-base-cased",
    "num_epochs": 20,
    "batch_size": 64,
    "learning_rate": 2e-5,
    "weight_decay": 0.01,
    "patience": PATIENCE,
    "label_list": ["O", "B-TYPE", "I-TYPE", "B-BRAND", "I-BRAND", "B-VOLUME", "I-VOLUME", "B-PERCENT", "I-PERCENT"],
    "id2label": {i: label for i, label in enumerate(["O", "B-TYPE", "I-TYPE", "B-BRAND", "I-BRAND", "B-VOLUME", "I-VOLUME", "B-PERCENT", "I-PERCENT"])},
    "label2id": {label: i for i, label in enumerate(["O", "B-TYPE", "I-TYPE", "B-BRAND", "I-BRAND", "B-VOLUME", "I-VOLUME", "B-PERCENT", "I-PERCENT"])},
    "metrics_csv": f"{OUT_DIR}/screening_metrics.csv",
    "onnx_model_path": f"{OUT_DIR}/model.onnx",
    "quantized_onnx_path": f"{OUT_DIR}/model_quantized.onnx",
    "submission_input": f"{DATA_DIR}/submission.csv",
    "submission_output": f"{OUT_DIR}/submission_response_bert.csv"
}


In [11]:
# Загрузка данных
train_split = pd.read_csv(f"{DATA_DIR}train.csv")
valid_data = pd.read_csv(f"{DATA_DIR}val.csv")


# Инициализация токенизатора
tokenizer = AutoTokenizer.from_pretrained(CONFIG["model_checkpoint"])

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/24.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/642 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

In [13]:
# Ожидается, что столбцы называются 'sample' и 'annotation' (как в оригинале)
def parse_row_to_example(row):
    # annotation может быть строкой repr списка сущностей; безопасно парсим ast.literal_eval
    try:
        ann = ast.literal_eval(row['annotation'])
    except Exception:
        ann = []
    return (row['sample'], {'entities': ann})
train_data = [parse_row_to_example(row) for _, row in train_split.iterrows()]
valid_data = [parse_row_to_example(row) for _, row in valid_data.iterrows()]

tokenizer = AutoTokenizer.from_pretrained(CONFIG["model_checkpoint"])

In [14]:
# === Функции spans_to_bio и bio_to_spans ===
def _normalize_entity_tuple(entity):
    if isinstance(entity, dict) and all(k in entity for k in ("start", "end", "label")):
        return int(entity["start"]), int(entity["end"]), str(entity["label"])
    if isinstance(entity, (list, tuple)) and len(entity) == 3:
        a, b, c = entity
        if isinstance(a, int) and isinstance(b, int):
            return int(a), int(b), str(c)
        if isinstance(a, str) and isinstance(b, int) and isinstance(c, int):
            return int(b), int(c), str(a)
    raise ValueError(f"Unknown entity format: {entity}")

def spans_to_bio(text, entities, tokenizer):
    tokenized = tokenizer(text, return_offsets_mapping=True, truncation=True, max_length=512)
    offsets = tokenized["offset_mapping"]
    input_ids = tokenized["input_ids"]
    labels = [CONFIG["label2id"]["O"]] * len(input_ids)

    valid_spans = []
    for ent in entities:
        try:
            s, e, l = _normalize_entity_tuple(ent)
        except Exception as ex:
            print(f"⚠️ Пропущен спан: {ent} — {ex}")
            continue
        if s < 0 or e < 0 or s >= e or e > len(text):
            print(f"⚠️ Спан вне границ: {ent}")
            continue
        valid_spans.append((s, e, l))
    valid_spans.sort(key=lambda x: (x[0], -x[1]))

    # фильтрация пересечений
    merged = []
    last_end = -1
    for s, e, l in valid_spans:
        if s < last_end:
            print(f"⚠️ Пересечение, спан отброшен: {(s,e,l)}")
            continue
        merged.append((s, e, l))
        last_end = e

    for s, e, l in merged:
        base = l.split("-", 1)[-1] if l.startswith(("B-", "I-")) else l
        b, i = f"B-{base}", f"I-{base}"
        if b not in CONFIG["label2id"]:
            print(f"⚠️ Неизвестный тип: {l}")
            continue
        b_id, i_id = CONFIG["label2id"][b], CONFIG["label2id"][i]
        started = False
        for idx, (os_, oe_) in enumerate(offsets):
            if os_ == oe_:
                continue
            if os_ >= e:
                break
            if oe_ <= s:
                continue
            labels[idx] = b_id if not started else i_id
            started = True
    return tokenized["input_ids"], tokenized["attention_mask"], labels

def bio_to_spans(text, bio_labels, offsets):
    ents, start, cur = [], None, None
    for i, (lab, (s, e)) in enumerate(zip(bio_labels, offsets)):
        if s == e or lab == CONFIG["label2id"]["O"]:
            if start is not None:
                ents.append((start, offsets[i-1][1], cur))
                start, cur = None, None
            continue
        lbl = CONFIG["id2label"][int(lab)]
        if "-" not in lbl: continue
        pfx, tp = lbl.split("-", 1)
        if pfx == "B":
            if start is not None:
                ents.append((start, offsets[i-1][1], cur))
            start, cur = s, tp
        elif pfx == "I":
            if start is None:
                start, cur = s, tp
    if start is not None:
        ents.append((start, offsets[-1][1], cur))
    return ents

In [15]:
# === Проверка функций на примерах ===
print("\n=== ПРОВЕРКА spans_to_bio / bio_to_spans ===")
sample_text = "молоко Простоквашино 2.5% 1 л"
test_entities = [
    (0, 6, "TYPE"),
    {"start": 7, "end": 19, "label": "BRAND"},
    (21, 25, "PERCENT"),
    (26, 29, "VOLUME"),
    (30, 10, "TYPE"),        # некорректный
    (0, 1000, "TYPE")        # вне границ
]
ids, mask, labs = spans_to_bio(sample_text, test_entities, tokenizer)
print("Токены:", tokenizer.convert_ids_to_tokens(ids))
print("BIO:", [CONFIG["id2label"][x] for x in labs])
offs = tokenizer(sample_text, return_offsets_mapping=True)["offset_mapping"]
print("Назад в спаны:", bio_to_spans(sample_text, labs, offs))

# Пересечения
print("\n=== Проверка пересекающихся спанов ===")
text2 = "чипсы лейс краб"
ents2 = [(0, 5, "TYPE"), (4, 9, "BRAND")]
_, _, labs2 = spans_to_bio(text2, ents2, tokenizer)
print("BIO:", [CONFIG["id2label"][x] for x in labs2])
print("Назад в спаны:", bio_to_spans(text2, labs2, tokenizer(text2, return_offsets_mapping=True)["offset_mapping"]))



=== ПРОВЕРКА spans_to_bio / bio_to_spans ===
⚠️ Спан вне границ: (30, 10, 'TYPE')
⚠️ Спан вне границ: (0, 1000, 'TYPE')
Токены: ['[CLS]', 'молоко', 'Просто', '##ква', '##шино', '2', '.', '5', '%', '1', 'л', '[SEP]']
BIO: ['O', 'B-TYPE', 'B-BRAND', 'I-BRAND', 'I-BRAND', 'B-PERCENT', 'I-PERCENT', 'I-PERCENT', 'I-PERCENT', 'B-VOLUME', 'I-VOLUME', 'O']
Назад в спаны: [(0, 6, 'TYPE'), (7, 20, 'BRAND'), (21, 25, 'PERCENT'), (26, 29, 'VOLUME')]

=== Проверка пересекающихся спанов ===
⚠️ Пересечение, спан отброшен: (4, 9, 'BRAND')
BIO: ['O', 'B-TYPE', 'I-TYPE', 'O', 'O', 'O', 'O', 'O']
Назад в спаны: [(0, 5, 'TYPE')]


In [16]:
# Класс NERDataset
class NERDataset(Dataset):
    def __init__(self, data, tokenizer):
        self.data = data
        self.tokenizer = tokenizer
        self.processed_data = []
        for text, ann in data:
            input_ids, attention_mask, labels = spans_to_bio(text, ann['entities'], tokenizer)
            self.processed_data.append({
                "input_ids": torch.tensor(input_ids, dtype=torch.long),
                "attention_mask": torch.tensor(attention_mask, dtype=torch.long),
                "labels": torch.tensor(labels, dtype=torch.long)
            })

    def __len__(self):
        return len(self.processed_data)

    def __getitem__(self, idx):
        return self.processed_data[idx]

# Кастомный collate_fn
def collate_fn(batch):
    input_ids = [item["input_ids"] for item in batch]
    attention_mask = [item["attention_mask"] for item in batch]
    labels = [item["labels"] for item in batch]

    input_ids = pad_sequence(input_ids, batch_first=True, padding_value=0)
    attention_mask = pad_sequence(attention_mask, batch_first=True, padding_value=0)
    labels = pad_sequence(labels, batch_first=True, padding_value=0)  # Изменено с -100 на 0

    return {
        "input_ids": input_ids,
        "attention_mask": attention_mask,
        "labels": labels
    }

# Модель с CRF
class NERModelWithCRF(torch.nn.Module):
    def __init__(self, num_labels):
        super().__init__()
        self.bert = AutoModelForTokenClassification.from_pretrained(CONFIG["model_checkpoint"], num_labels=num_labels)
        self.crf = CRF(num_labels)

    def forward(self, input_ids, attention_mask, labels=None):
        outputs = self.bert(input_ids, attention_mask=attention_mask)
        emissions = outputs.logits
        if labels is not None:
            loss = -self.crf(emissions, labels, mask=attention_mask.type(torch.uint8))
            return loss
        else:
            return self.crf.decode(emissions, mask=attention_mask.type(torch.uint8))

In [17]:
# Функция оценки модели
def evaluate_model(model, eval_data, tokenizer):
    model.eval()
    entity_pairs = []
    for text, annotations in eval_data:
        tokenized = tokenizer([text], padding=True, truncation=True, return_tensors="pt", return_offsets_mapping=True)
        input_ids = tokenized["input_ids"].to(device)
        attention_mask = tokenized["attention_mask"].to(device)
        with torch.no_grad():
            pred = model(input_ids, attention_mask)[0]
        offsets = tokenized["offset_mapping"][0].tolist()
        pred_spans = bio_to_spans(text, pred, offsets)
        true_entities = annotations['entities']
        entity_pairs.append((true_entities, pred_spans))

    macro_f1, f1_type, f1_brand, f1_volume, f1_percent = calculate_macro_f1(entity_pairs)
    return {
        'f1_macro': macro_f1,
        'f1_TYPE': f1_type,
        'f1_BRAND': f1_brand,
        'f1_VOLUME': f1_volume,
        'f1_PERCENT': f1_percent
    }

In [18]:
# Создание модели
model = NERModelWithCRF(len(CONFIG["label_list"])).to(device)
optimizer = AdamW(model.parameters(), lr=CONFIG["learning_rate"], weight_decay=CONFIG["weight_decay"])
num_training_steps = CONFIG["num_epochs"] * len(train_data) // CONFIG["batch_size"]
scheduler = get_scheduler("linear", optimizer=optimizer, num_warmup_steps=0, num_training_steps=num_training_steps)

train_dataset = NERDataset(train_data, tokenizer)
train_loader = DataLoader(train_dataset, batch_size=CONFIG["batch_size"], shuffle=True, collate_fn=collate_fn)

valid_dataset = NERDataset(valid_data, tokenizer)
valid_loader = DataLoader(valid_dataset, batch_size=CONFIG["batch_size"], shuffle=False, collate_fn=collate_fn)

metrics_df = pd.DataFrame(columns=['epoch', 'loss', 'f1_macro', 'f1_TYPE', 'f1_BRAND', 'f1_VOLUME', 'f1_PERCENT'])
best_f1 = 0
patience_counter = 0
best_epoch = 0

pytorch_model.bin:   0%|          | 0.00/714M [00:00<?, ?B/s]

Some weights of BertForTokenClassification were not initialized from the model checkpoint at DeepPavlov/rubert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


⚠️ Неизвестный тип: O
⚠️ Неизвестный тип: O
⚠️ Неизвестный тип: O
⚠️ Неизвестный тип: O
⚠️ Неизвестный тип: O
⚠️ Неизвестный тип: O
⚠️ Неизвестный тип: O
⚠️ Неизвестный тип: O
⚠️ Неизвестный тип: O
⚠️ Неизвестный тип: O
⚠️ Неизвестный тип: O
⚠️ Неизвестный тип: O
⚠️ Неизвестный тип: O
⚠️ Неизвестный тип: O
⚠️ Неизвестный тип: O
⚠️ Неизвестный тип: O
⚠️ Неизвестный тип: O
⚠️ Неизвестный тип: O
⚠️ Неизвестный тип: O
⚠️ Неизвестный тип: O
⚠️ Неизвестный тип: O
⚠️ Неизвестный тип: O
⚠️ Неизвестный тип: O
⚠️ Неизвестный тип: O
⚠️ Неизвестный тип: O
⚠️ Неизвестный тип: O
⚠️ Неизвестный тип: O
⚠️ Неизвестный тип: O
⚠️ Неизвестный тип: O
⚠️ Неизвестный тип: O
⚠️ Неизвестный тип: O
⚠️ Неизвестный тип: O
⚠️ Неизвестный тип: O
⚠️ Неизвестный тип: O
⚠️ Неизвестный тип: O
⚠️ Неизвестный тип: O
⚠️ Неизвестный тип: O
⚠️ Неизвестный тип: O
⚠️ Неизвестный тип: O
⚠️ Неизвестный тип: O
⚠️ Неизвестный тип: O
⚠️ Неизвестный тип: O
⚠️ Неизвестный тип: O
⚠️ Неизвестный тип: O
⚠️ Неизвестный тип: O
⚠️ Неизвес

model.safetensors:   0%|          | 0.00/714M [00:00<?, ?B/s]

⚠️ Неизвестный тип: O
⚠️ Неизвестный тип: O
⚠️ Неизвестный тип: O
⚠️ Неизвестный тип: O
⚠️ Неизвестный тип: O
⚠️ Неизвестный тип: O
⚠️ Неизвестный тип: O
⚠️ Неизвестный тип: O
⚠️ Неизвестный тип: O
⚠️ Неизвестный тип: O
⚠️ Неизвестный тип: O
⚠️ Неизвестный тип: O
⚠️ Неизвестный тип: O
⚠️ Неизвестный тип: O
⚠️ Неизвестный тип: O
⚠️ Неизвестный тип: O
⚠️ Неизвестный тип: O
⚠️ Неизвестный тип: O
⚠️ Неизвестный тип: O
⚠️ Неизвестный тип: O
⚠️ Неизвестный тип: O
⚠️ Неизвестный тип: O
⚠️ Неизвестный тип: O
⚠️ Неизвестный тип: O
⚠️ Неизвестный тип: O
⚠️ Неизвестный тип: O
⚠️ Неизвестный тип: O
⚠️ Неизвестный тип: O
⚠️ Неизвестный тип: O
⚠️ Неизвестный тип: O
⚠️ Неизвестный тип: O
⚠️ Неизвестный тип: O
⚠️ Неизвестный тип: O
⚠️ Неизвестный тип: O
⚠️ Неизвестный тип: O
⚠️ Неизвестный тип: O
⚠️ Неизвестный тип: O
⚠️ Неизвестный тип: O
⚠️ Неизвестный тип: O
⚠️ Неизвестный тип: O
⚠️ Неизвестный тип: O
⚠️ Неизвестный тип: O
⚠️ Неизвестный тип: O
⚠️ Неизвестный тип: O
⚠️ Неизвестный тип: O
⚠️ Неизвес

In [16]:
try:
    for epoch in range(CONFIG["num_epochs"]):
        model.train()
        total_loss = 0
        for batch in train_loader:
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            labels = batch["labels"].to(device)
            loss = model(input_ids, attention_mask, labels)
            total_loss += loss.item()
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            scheduler.step()

        avg_loss = total_loss / len(train_loader)

        # Валидация
        eval_metrics = evaluate_model(model, valid_data, tokenizer)
        current_f1 = eval_metrics["f1_macro"]

        # Сохранение метрик
        metrics_row = {
            'epoch': epoch + 1,
            'loss': avg_loss,
            **eval_metrics
        }
        metrics_df = pd.concat([metrics_df, pd.DataFrame([metrics_row])], ignore_index=True)

        # Вывод
        print(f'Эпоха {epoch + 1:<3} | Loss: {avg_loss:.4f} | '
              f'F1-macro: {current_f1:.4f} | '
              f'F1-TYPE: {eval_metrics["f1_TYPE"]:.4f} | '
              f'F1-BRAND: {eval_metrics["f1_BRAND"]:.4f} | '
              f'F1-VOLUME: {eval_metrics["f1_VOLUME"]:.4f} | '
              f'F1-PERCENT: {eval_metrics["f1_PERCENT"]:.4f}')

        # Early stopping
        if current_f1 > best_f1:
            best_f1 = current_f1
            best_epoch = epoch + 1
            # model.bert.save_pretrained(f"{MODEL_PATH}_screening")
            patience_counter = 0
        else:
            patience_counter += 1
            print(f"⏳ Patience: {patience_counter}/{PATIENCE}")
            if patience_counter >= PATIENCE:
                print(f"\n🛑 Ранняя остановка на эпохе {epoch + 1}")
                print(f"Лучший F1-macro: {best_f1:.4f} достигнут на эпохе {best_epoch}")
                break

except Exception as e:
    print(f'💥 Критическая ошибка: {str(e)}')
    print(traceback.format_exc())
    print("⚠️ Обучение прервано из-за ошибки, сохранен текущий прогресс")

finally:
    # Сохранение финальной модели после скрининга
    # Вариант 1: Сохранение полной модели и BERT
    torch.save(model.state_dict(), f"{MODEL_PATH}/model_screening.pt")
    model.bert.save_pretrained(f"{MODEL_PATH}_screening_final")
    # Вариант 2: Экспорт в ONNX (закомментирован)
    dummy_input_ids = torch.randint(0, tokenizer.vocab_size, (1, 512)).to(device)
    dummy_attention_mask = torch.ones(1, 512).to(device)
    torch.onnx.export(model.bert, (dummy_input_ids, dummy_attention_mask),
                      f"{MODEL_PATH}_screening/model.onnx",
                      export_params=True,
                      opset_version=14,  # Изменено на 14
                      input_names=['input_ids', 'attention_mask'],
                      output_names=['logits'],
                      dynamic_axes={'input_ids': {0: 'batch', 1: 'seq'},
                                    'attention_mask': {0: 'batch', 1: 'seq'},
                                    'logits': {0: 'batch', 1: 'seq'}})
    metrics_df.to_csv(CONFIG["metrics_csv"], index=False)
    print("💾 Screening модель и метрики сохранены")
print("\n" + "="*80)
print("ИТОГОВЫЕ РЕЗУЛЬТАТЫ SCREENING:")
print("="*80)
print(f"Лучший F1-macro: {best_f1:.4f} на эпохе {best_epoch}")
print(f"Всего эпох выполнено: {len(metrics_df)}")
print("\nДетальные метрики по эпохам:")
print(metrics_df.round(4))

💥 Критическая ошибка: CUDA error: device-side assert triggered
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.

Traceback (most recent call last):
  File "/tmp/ipython-input-668927516.py", line 9, in <cell line: 0>
    loss = model(input_ids, attention_mask, labels)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.12/dist-packages/torch/nn/modules/module.py", line 1773, in _wrapped_call_impl
    return self._call_impl(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.12/dist-packages/torch/nn/modules/module.py", line 1784, in _call_impl
    return forward_call(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/tmp/ipython-input-3522910203.py", line 48, in forward
    loss = -self.crf(emissions, labels, mask=attention_mask.type(torch.uint8))
            ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.12/dist-packages/torch/nn/m

AcceleratorError: CUDA error: device-side assert triggered
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.


In [None]:
# Визуализация
plt.figure(figsize=(15, 10))
plt.subplot(2, 1, 1)
plt.plot(metrics_df['epoch'], metrics_df['loss'], 'b-', linewidth=2, label='Loss')
plt.axvline(x=best_epoch, color='r', linestyle='--', alpha=0.7, label=f'Лучшая эпоха ({best_epoch})')
plt.xlabel('Эпоха')
plt.ylabel('Loss')
plt.title('Loss по эпохам')
plt.legend()
plt.grid(True, alpha=0.3)

plt.subplot(2, 1, 2)
plt.plot(metrics_df['epoch'], metrics_df['f1_macro'], 'r-', linewidth=3, label='F1-macro')
plt.plot(metrics_df['epoch'], metrics_df['f1_TYPE'], 'g--', label='F1-TYPE')
plt.plot(metrics_df['epoch'], metrics_df['f1_BRAND'], 'b--', label='F1-BRAND')
plt.plot(metrics_df['epoch'], metrics_df['f1_VOLUME'], 'y--', label='F1-VOLUME')
plt.plot(metrics_df['epoch'], metrics_df['f1_PERCENT'], 'c--', label='F1-PERCENT')
plt.axvline(x=best_epoch, color='r', linestyle='--', alpha=0.7, label=f'Лучшая эпоха ({best_epoch})')
plt.xlabel('Эпоха')
plt.ylabel('F1 Score')
plt.title('F1 Scores по эпохам')
plt.legend()
plt.grid(True, alpha=0.3)

plt.tight_layout()
plt.savefig(f"{OUT_DIR}/screening_metrics.png", dpi=300, bbox_inches='tight')
plt.show()




In [None]:
# Ячейка 2: Подбор гиперпараметров (Tuning) с grid search
PARAM_GRID = {
    "learning_rate": [1e-5, 2e-5, 3e-5],
    "batch_size": [32, 64],
    "epochs": [10, 20],
    "weight_decay": [0.01, 0.1]
}

grid_results = []

for lr in PARAM_GRID["learning_rate"]:
    for bsz in PARAM_GRID["batch_size"]:
        for max_ep in PARAM_GRID["epochs"]:
            for wd in PARAM_GRID["weight_decay"]:
                combo = {"learning_rate": lr, "batch_size": bsz, "epochs": max_ep, "weight_decay": wd}
                print(f"\n=== Tuning combo: learning_rate={lr}, batch_size={bsz}, epochs={max_ep}, weight_decay={wd} ===")

                model = NERModelWithCRF(len(CONFIG["label_list"])).to(device)
                optimizer = AdamW(model.parameters(), lr=lr, weight_decay=wd)
                num_training_steps = max_ep * len(train_data) // bsz
                scheduler = get_scheduler("linear", optimizer=optimizer, num_warmup_steps=0, num_training_steps=num_training_steps)

                train_loader = DataLoader(train_dataset, batch_size=bsz, shuffle=True)

                patience_counter, best_f1, best_metrics = 0, 0.0, None
                for epoch in range(1, max_ep + 1):
                    model.train()
                    total_loss = 0
                    for batch in train_loader:
                        input_ids = batch["input_ids"].to(device)
                        attention_mask = batch["attention_mask"].to(device)
                        labels = batch["labels"].to(device)
                        loss = model(input_ids, attention_mask, labels)
                        total_loss += loss.item()
                        optimizer.zero_grad()
                        loss.backward()
                        optimizer.step()
                        scheduler.step()

                    avg_loss = total_loss / len(train_loader)
                    metrics = evaluate_model(model, valid_data, tokenizer)
                    metrics["epoch"] = epoch
                    metrics["loss"] = avg_loss
                    current_f1 = metrics["f1_macro"]

                    print(f"Ep {epoch} | Loss: {metrics['loss']:.4f} | F1-macro: {current_f1:.4f}")

                    if current_f1 > best_f1:
                        best_f1 = current_f1
                        best_metrics = metrics
                        patience_counter = 0
                    else:
                        patience_counter += 1
                        if patience_counter >= PATIENCE:
                            break

                combo["best_f1_macro"] = best_f1
                combo["best_metrics"] = best_metrics
                grid_results.append(combo)

# Выбор лучших параметров
best_combo = max(grid_results, key=lambda x: x["best_f1_macro"])
print("\nBest tuning params:", best_combo)

# Сохранение результатов
pd.DataFrame(grid_results).to_csv(f"{OUT_DIR}/tuning_summary.csv", index=False)
with open(f"{OUT_DIR}/tuning_detailed.json", "w", encoding="utf-8") as f:
    json.dump(grid_results, f, ensure_ascii=False, indent=2)
with open(f"{OUT_DIR}/best_combo.json", "w", encoding="utf-8") as f:
    json.dump({k: v for k, v in best_combo.items() if k != "best_metrics"}, f, ensure_ascii=False, indent=2)
print("💾 Tuning результаты и best_combo сохранены")


In [None]:
# Ячейка 3: Кросс-валидация (CV) с лучшими параметрами
with open(f"{OUT_DIR}/best_combo.json", "r", encoding="utf-8") as f:
    best_combo = json.load(f)

best_lr = best_combo["learning_rate"]
best_bsz = best_combo["batch_size"]
best_max_ep = best_combo["epochs"]
best_wd = best_combo["weight_decay"]

kf = KFold(n_splits=5, shuffle=True, random_state=SEED)
cv_results = []
fold_best_f1s = []

for fold, (tr_idx, val_idx) in enumerate(kf.split(train_data), 1):
    print(f"\n=== CV Fold {fold} ===")
    fold_train = [train_data[i] for i in tr_idx]
    fold_valid = [train_data[i] for i in val_idx]

    fold_train_dataset = NERDataset(fold_train, tokenizer)
    fold_train_loader = DataLoader(fold_train_dataset, batch_size=best_bsz, shuffle=True)

    model = NERModelWithCRF(len(CONFIG["label_list"])).to(device)
    optimizer = AdamW(model.parameters(), lr=best_lr, weight_decay=best_wd)
    num_training_steps = best_max_ep * len(fold_train) // best_bsz
    scheduler = get_scheduler("linear", optimizer=optimizer, num_warmup_steps=0, num_training_steps=num_training_steps)

    patience_counter, best_f1, best_metrics = 0, 0.0, None
    for epoch in range(1, best_max_ep + 1):
        model.train()
        total_loss = 0
        for batch in fold_train_loader:
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            labels = batch["labels"].to(device)
            loss = model(input_ids, attention_mask, labels)
            total_loss += loss.item()
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            scheduler.step()

        avg_loss = total_loss / len(fold_train_loader)
        metrics = evaluate_model(model, fold_valid, tokenizer)
        metrics["epoch"] = epoch
        metrics["loss"] = avg_loss
        current_f1 = metrics["f1_macro"]

        print(f"Fold {fold} Ep {epoch} | Loss: {metrics['loss']:.4f} | F1-macro: {current_f1:.4f}")

        if current_f1 > best_f1:
            best_f1 = current_f1
            best_metrics = metrics
            patience_counter = 0
        else:
            patience_counter += 1
            if patience_counter >= PATIENCE:
                break

    cv_results.append({"fold": fold, "best_f1_macro": best_f1, "best_metrics": best_metrics})
    fold_best_f1s.append(best_f1)

mean_f1 = np.mean(fold_best_f1s)
std_f1 = np.std(fold_best_f1s)
print(f"\nCV Results: Mean F1_macro = {mean_f1:.4f} ± {std_f1:.4f}")

# Сохранение результатов CV
pd.DataFrame(cv_results).to_csv(f"{OUT_DIR}/cv_summary.csv", index=False)
with open(f"{OUT_DIR}/cv_detailed.json", "w", encoding="utf-8") as f:
    json.dump(cv_results, f, ensure_ascii=False, indent=2)
print("💾 CV результаты сохранены")

In [None]:
# Ячейка 4: Финальное обучение на объединённом датасете (train+val)
train_val = [(row['sample'], {'entities': ast.literal_eval(row['annotation'])}) for _, row in pd.concat([train_split, valid_data]).iterrows()]
train_val_dataset = NERDataset(train_val, tokenizer)
train_val_loader = DataLoader(train_val_dataset, batch_size=best_bsz, shuffle=True)

model = NERModelWithCRF(len(CONFIG["label_list"])).to(device)
optimizer = AdamW(model.parameters(), lr=best_lr, weight_decay=best_wd)
num_training_steps = best_max_ep * len(train_val) // best_bsz
scheduler = get_scheduler("linear", optimizer=optimizer, num_warmup_steps=0, num_training_steps=num_training_steps)

records = []
best_final_f1, patience_counter = 0.0, 0
for epoch in range(1, best_max_ep + 1):
    model.train()
    total_loss = 0
    for batch in train_val_loader:
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["labels"].to(device)
        loss = model(input_ids, attention_mask, labels)
        total_loss += loss.item()
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        scheduler.step()

    avg_loss = total_loss / len(train_val_loader)
    print(f"Эпоха {epoch} | Loss: {avg_loss:.4f}")


# Вариант 1: Сохранение BERT-модели
model.bert.save_pretrained(MODEL_PATH)
# Вариант 2: Экспорт в ONNX (закомментирован)
# dummy_input_ids = torch.randint(0, tokenizer.vocab_size, (1, 512)).to(device)
# dummy_attention_mask = torch.ones(1, 512).to(device)
# torch.onnx.export(model.bert, (dummy_input_ids, dummy_attention_mask),
#                   CONFIG["onnx_model_path"],
#                   export_params=True,
#                   opset_version=14,  # Изменено на 14
#                   input_names=['input_ids', 'attention_mask'],
#                   output_names=['logits'],
#                   dynamic_axes={'input_ids': {0: 'batch', 1: 'seq'},
#                                 'attention_mask': {0: 'batch', 1: 'seq'},
#                                 'logits': {0: 'batch', 1: 'seq'}})
# quantize_dynamic(CONFIG["onnx_model_path"], CONFIG["quantized_onnx_path"], weight_type=QuantType.QUInt8)
print(f"\nFinal model saved: {MODEL_PATH}")