In [1]:
from google.colab import drive
import getpass, os

# === Настройка проекта ===
USER = "tokarevdr"   # твой GitHub username
REPO = "entities-extraction-x5"            # название репозитория
EMAIL = "fedorov.alexander.04@gmail.com"    # твоя почта для git
NAME = "Alexander"           # твоё имя для git
# === Подключение Google Drive ===
drive.mount('/content/drive')
PROJECTS_DIR = "/content/drive/MyDrive/Colab Notebooks"
%cd $PROJECTS_DIR
# === GitHub авторизация ===
token = getpass.getpass('Введи GitHub PAT токен: ')
os.environ["GITHUB_TOKEN"] = token


# === Проверяем: если репозиторий ещё не скачан, клонируем ===
if not os.path.exists(f"{PROJECTS_DIR}/{REPO}/ML PART"):
    print('Заново склонировали репу')
    !git clone https://{USER}:{os.environ["GITHUB_TOKEN"]}@github.com/{USER}/{REPO}.git
# === Переходим в папку проекта ===
%cd {REPO}/{'ML_PART'}

# === Настройка Git ===
!git config --global user.email "{EMAIL}"
!git config --global user.name "{NAME}"
!git remote set-url origin https://{USER}:{os.environ["GITHUB_TOKEN"]}@github.com/{USER}/{REPO}.git

print("✅ Всё готово! Рабочая папка:", os.getcwd())


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
/content/drive/MyDrive/Colab Notebooks
Введи GitHub PAT токен: ··········
Заново склонировали репу
fatal: destination path 'entities-extraction-x5' already exists and is not an empty directory.
/content/drive/MyDrive/Colab Notebooks/entities-extraction-x5/ML_PART
✅ Всё готово! Рабочая папка: /content/drive/MyDrive/Colab Notebooks/entities-extraction-x5/ML_PART


In [2]:
# Установка зависимостей
!pip install -r requirements_bert.txt



In [3]:
! pip install --upgrade onnxruntime



In [4]:
from google.colab import drive
import getpass, os, json, random, time
import numpy as np
import pandas as pd
import torch
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, KFold
from torch.utils.data import Dataset, DataLoader
from torch.optim import AdamW

# Импорты transformers с обработкой ошибок
try:
    from transformers import AutoTokenizer, AutoModelForTokenClassification, get_scheduler
    print("✅ Transformers успешно импортированы")
except ImportError as e:
    print(f"❌ Ошибка импорта transformers: {e}")
    !pip install transformers==4.35.2
    from transformers import AutoTokenizer, AutoModelForTokenClassification, get_scheduler

try:
    from TorchCRF import CRF
    print("✅ TorchCRF успешно импортирован")
except ImportError as e:
    print(f"❌ Ошибка импорта TorchCRF: {e}")
    !pip install TorchCRF==1.1.0
    from TorchCRF import CRF

import ast
import traceback
from module import calculate_ner_metrics, calculate_macro_f1, process_submission_bert, \
                  setup_hf_login, save_bert_to_hf, load_bert_from_hf, list_my_repos, check_repo_exists, NERModelWithCRF
from torch.nn.utils.rnn import pad_sequence

  _torch_pytree._register_pytree_node(


✅ Transformers успешно импортированы
✅ TorchCRF успешно импортирован


In [5]:
# --- Основные пути для сохранения результатов ---
WHERE_DATA = 'cleared_data'
BASE_MODEL_NAME = "bert"
OUT_DIR = f"OUTPUT/{WHERE_DATA}/{BASE_MODEL_NAME}"
os.makedirs(OUT_DIR, exist_ok=True)
FINAL_METRICS_PATH = f"{OUT_DIR}/final_training_metrics_per_epoch.csv"
MODEL_PATH = f'MODELS/{WHERE_DATA}/{BASE_MODEL_NAME}'
os.makedirs(MODEL_PATH, exist_ok=True)
DATA_DIR = f'data/{WHERE_DATA}/'
PATIENCE = 3
SEED = 42


In [6]:

# Hugging Face настройки
HF_TOKEN= getpass.getpass('Введи HFT токен: ')
HF_USERNAME = "alexflex04"
BERT_REPO_NAME = f"{HF_USERNAME}/NER_{WHERE_DATA}_bert"

setup_hf_login(HF_TOKEN)

Введи HFT токен: ··········
Token will not been saved to git credential helper. Pass `add_to_git_credential=True` if you want to set the git credential as well.
Token is valid (permission: write).
Your token has been saved to /root/.cache/huggingface/token
Login successful
✅ Авторизация HF настроена


True

In [7]:
os.environ["CUDA_LAUNCH_BLOCKING"] = "1"

In [8]:
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.cuda.manual_seed_all(SEED)
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

In [9]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Используемое устройство: {device}")

Используемое устройство: cuda


In [10]:
CONFIG = {
    "model_checkpoint": "DeepPavlov/rubert-base-cased",
    "num_epochs": 20,
    "batch_size": 32,
    "learning_rate": 2e-5,
    "weight_decay": 0.01,
    "patience": PATIENCE,
    "max_length": 128,
    "label_list": ["O", "B-TYPE", "I-TYPE", "B-BRAND", "I-BRAND", "B-VOLUME", "I-VOLUME", "B-PERCENT", "I-PERCENT"],
    "id2label": {i: label for i, label in enumerate(["O", "B-TYPE", "I-TYPE", "B-BRAND", "I-BRAND", "B-VOLUME", "I-VOLUME", "B-PERCENT", "I-PERCENT"])},
    "label2id": {label: i for i, label in enumerate(["O", "B-TYPE", "I-TYPE", "B-BRAND", "I-BRAND", "B-VOLUME", "I-VOLUME", "B-PERCENT", "I-PERCENT"])},
    "metrics_csv": f"{OUT_DIR}/screening_metrics.csv",
    "submission_input": f"{DATA_DIR}/submission.csv",
    "submission_output": f"{OUT_DIR}/submission_response_bert.csv"
}


In [11]:
# Загрузка данных
train_split = pd.read_csv(f"{DATA_DIR}train.csv")
valid_data = pd.read_csv(f"{DATA_DIR}val.csv")

def parse_row_to_example(row):
    try:
        ann = ast.literal_eval(row['annotation'])
    except Exception:
        ann = []
    return (row['sample'], {'entities': ann})

train_data = [parse_row_to_example(row) for _, row in train_split.iterrows()]
valid_data = [parse_row_to_example(row) for _, row in valid_data.iterrows()]

from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("DeepPavlov/rubert-base-cased", use_fast=True)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [12]:
print(train_data)

[('aa', {'entities': [(0, 2, 'O')]}), ('aala', {'entities': [(0, 4, 'O')]}), ('aarcca', {'entities': [(0, 6, 'O')]}), ('abso', {'entities': [(0, 4, 'B-BRAND')]}), ('abtoys игруш', {'entities': [(0, 6, 'B-BRAND'), (7, 12, 'B-TYPE')]}), ('abtoys игрушк', {'entities': [(0, 6, 'B-BRAND'), (7, 13, 'B-TYPE')]}), ('abtoys игрушки', {'entities': [(0, 6, 'B-BRAND'), (7, 14, 'B-TYPE')]}), ('actimal', {'entities': [(0, 7, 'B-BRAND')]}), ('actimel', {'entities': [(0, 7, 'B-BRAND')]}), ('actimeuno', {'entities': [(0, 9, 'B-BRAND')]}), ('actimino', {'entities': [(0, 8, 'B-BRAND')]}), ('actimun', {'entities': [(0, 7, 'B-BRAND')]}), ('actimunno', {'entities': [(0, 9, 'B-BRAND')]}), ('activ', {'entities': [(0, 5, 'B-BRAND')]}), ('activa', {'entities': [(0, 6, 'B-BRAND')]}), ('active', {'entities': [(0, 6, 'B-BRAND')]}), ('adremaine', {'entities': [(0, 9, 'B-BRAND')]}), ('adrena', {'entities': [(0, 6, 'B-BRAND')]}), ('adrinaline', {'entities': [(0, 10, 'B-BRAND')]}), ('afanasiy пи', {'entities': [(0, 8,

In [13]:
def parse_span_str(span_str):
    """
    Парсит строковое представление спанов, например:
    "[(0, 4, 'B-TYPE'), (5, 12, 'I-TYPE')]"
    Возвращает list of tuples (int,int,str).
    """
    if isinstance(span_str, (list, tuple)):
        return span_str
    if not span_str or not isinstance(span_str, str):
        return []
    try:
        parsed = ast.literal_eval(span_str)
        # ensure ints
        out = []
        for s in parsed:
            if len(s) >= 3:
                out.append((int(s[0]), int(s[1]), str(s[2])))
        return out
    except Exception as e:
        raise ValueError(f"Can't parse span string: {e}")

def merge_prefixed_char_spans(spans):
    """
    Вход: spans - list of (start,end,label) где label может быть:
      - 'B-TYPE', 'I-TYPE', 'O' или просто 'TYPE' (robust)
    Возвращает список объединённых базовых спанов:
      [(start,end,'TYPE'), ...] или [(start,end,'O'), ...]
    Правило объединения: последовательности B-/I- с одинаковым типом и
    прилегающими границами (next.start == cur.end) объединяются.
    O-спаны объединяются только если непрерывны (смежны).
    """
    if not spans:
        return []
    # sort by start
    spans_sorted = sorted(spans, key=lambda x: int(x[0]))
    merged = []
    i = 0
    n = len(spans_sorted)
    while i < n:
        s0, s1, lab = spans_sorted[i]
        s0 = int(s0); s1 = int(s1)
        if lab == 'O':
            cur_s, cur_e = s0, s1
            j = i + 1
            while j < n and spans_sorted[j][2] == 'O' and int(spans_sorted[j][0]) == cur_e:
                cur_e = int(spans_sorted[j][1]); j += 1
            merged.append((cur_s, cur_e, 'O'))
            i = j
            continue

        # handle labels with B- or I- or plain
        if isinstance(lab, str) and (lab.startswith('B-') or lab.startswith('I-')):
            base = lab.split('-', 1)[1]
        else:
            base = lab  # already base
        # start new span at s0..s1
        cur_s, cur_e = s0, s1
        j = i + 1
        while j < n:
            ns0, ns1, nlab = spans_sorted[j]
            ns0 = int(ns0); ns1 = int(ns1)
            # accept continuation if it's I-base and contiguous, or plain base contiguous
            if (isinstance(nlab, str) and nlab.startswith('I-') and nlab.split('-',1)[1] == base and ns0 == cur_e) \
               or (nlab == base and ns0 == cur_e):
                cur_e = ns1
                j += 1
            else:
                break
        merged.append((cur_s, cur_e, base))
        i = j
    return merged


def tokenize_and_align_labels(text, spans_prefixed, tokenizer, add_special_tokens=True):
    """
    text: str
    spans_prefixed: list of (start,end,label) where label may be 'B-TYPE','I-TYPE','O'
    tokenizer: HuggingFace tokenizer with use_fast=True (must provide offset_mapping)
    Возвращает dict:
      {
        'tokens': [...],
        'input_ids': [...],
        'offsets': [(s,e), ...],
        'token_labels': ['B-TYPE','I-TYPE','O', ...]   # BIO per token
      }
    Логика:
      1) Сначала объединяем префиксные char-спаны в базовые entity spans (merge_prefixed_char_spans)
      2) Для каждого токена находим span с максимальным overlap. Если overlap==0 => 'O'
      3) Помечаем токен как B-<TYPE>, если токен содержит начало span (t_start <= span_start < t_end),
         иначе как I-<TYPE> (если частично или полностью внутри).
    """
    # 1) merge char spans to base spans
    merged_spans = merge_prefixed_char_spans(spans_prefixed)

    enc = tokenizer(
        text,
        return_offsets_mapping=True,
        add_special_tokens=add_special_tokens
    )
    offsets = enc["offset_mapping"]
    input_ids = enc["input_ids"]
    tokens = tokenizer.convert_ids_to_tokens(input_ids)

    token_labels = []
    for (t_start, t_end) in offsets:
        if t_start == t_end:
            # special token ([CLS],[SEP]) — пометим 'O' (для обучения можете заменить на -100)
            token_labels.append("O")
            continue

        best_span = None
        best_overlap = 0
        for (s_start, s_end, s_lab) in merged_spans:
            # s_lab is base label or 'O'
            overlap = min(t_end, s_end) - max(t_start, s_start)
            if overlap > best_overlap:
                best_overlap = overlap
                best_span = (s_start, s_end, s_lab)

        if best_span is None or best_overlap <= 0:
            token_labels.append("O")
        else:
            s_start, s_end, s_lab = best_span
            if s_lab == 'O':
                token_labels.append("O")
            else:
                # decide B vs I:
                if t_start <= s_start < t_end:
                    token_labels.append("B-" + s_lab)
                elif s_start <= t_start < s_end:
                    token_labels.append("I-" + s_lab)
                else:
                    # fallback
                    token_labels.append("I-" + s_lab)

    return {
        "tokens": tokens,
        "input_ids": input_ids,
        "offsets": offsets,
        "token_labels": token_labels,
    }


def token_labels_to_char_spans(offsets, token_labels):
    """
    offsets: list of (start,end)
    token_labels: list like ['B-BRAND','I-BRAND','O',...]
    Возвращает список char-spans в формате:
       [(start,end,'B-BRAND'), ..., (start,end,'O'), ...]
    Правила:
      - Non-O spans возвращаются как единственный B-<TYPE> спан (начало => 'B-', внутри => объединяется)
      - O-спаны возвращаются как 'O'
      - Объединяем токены в один char-span только если смежны (next.start == cur.end).
    """
    spans = []
    cur = None  # [start, end, base_label or 'O']
    for (off, lab) in zip(offsets, token_labels):
        t_s, t_e = off
        if t_s == t_e:
            # skip special tokens
            continue
        if lab == "O":
            if cur is None:
                cur = [t_s, t_e, "O"]
            else:
                if cur[2] == "O" and t_s == cur[1]:
                    # extend contiguous O span
                    cur[1] = t_e
                else:
                    # push previous and start new O span
                    spans.append((cur[0], cur[1], "B-" + cur[2] if cur[2] != "O" else "O") if cur[2] != "O" else (cur[0], cur[1], "O"))
                    cur = [t_s, t_e, "O"]
        else:
            # labels like B-X or I-X (robust to plain X)
            if lab.startswith("B-"):
                base = lab.split("-", 1)[1]
                if cur is not None:
                    # push previous
                    spans.append((cur[0], cur[1], "B-" + cur[2] if cur[2] != "O" else "O") if cur[2] != "O" else (cur[0], cur[1], "O"))
                cur = [t_s, t_e, base]
            elif lab.startswith("I-"):
                base = lab.split("-", 1)[1]
                if cur is not None and cur[2] == base and t_s == cur[1]:
                    cur[1] = t_e
                else:
                    # I- without B- : начинаем новый span (robust)
                    if cur is not None:
                        spans.append((cur[0], cur[1], "B-" + cur[2] if cur[2] != "O" else "O") if cur[2] != "O" else (cur[0], cur[1], "O"))
                    cur = [t_s, t_e, base]
            else:
                # plain label like 'TYPE' -> treat as B-<TYPE>
                base = lab
                if cur is not None:
                    spans.append((cur[0], cur[1], "B-" + cur[2] if cur[2] != "O" else "O") if cur[2] != "O" else (cur[0], cur[1], "O"))
                cur = [t_s, t_e, base]

    if cur is not None:
        if cur[2] == "O":
            spans.append((cur[0], cur[1], "O"))
        else:
            spans.append((cur[0], cur[1], "B-" + cur[2]))
    return spans


def build_label_maps_from_examples(all_prefixed_spans):
    """
    all_prefixed_spans: iterable of spans-lists (raw from CSV)
    Возвращает label2id, id2label covering all 'B-X','I-X' and 'O'.
    """
    bases = set()
    for spans in all_prefixed_spans:
        merged = merge_prefixed_char_spans(spans)
        for s,e,lab in merged:
            if lab == 'O':
                continue
            bases.add(lab)
    labels = ["O"]
    for b in sorted(bases):
        labels.append("B-" + b)
        labels.append("I-" + b)
    label2id = {lab: i for i, lab in enumerate(labels)}
    id2label = {i: lab for lab, i in label2id.items()}
    return label2id, id2label

In [14]:
examples = [
    ("яйцо куриное", [(0, 4, 'B-TYPE'), (5, 12, 'I-TYPE')]),
    ("яйцо куриное 30шт", [(0,4,'B-TYPE'), (5,12,'I-TYPE'), (13,17,'B-VOLUME')]),
    ("сок 0.2 л 10%", [(0,3,'B-TYPE'), (4,7,'B-VOLUME'), (8,9,'I-VOLUME'), (10,13,'B-PERCENT')]),
    ("масло сливочное 72% 250 г President", [(0,5,'B-TYPE'), (6,15,'I-TYPE'), (16,19,'B-PERCENT'), (20,23,'B-VOLUME'), (24,25,'I-VOLUME'), (26,35,'B-BRAND')]),
    ("пиво Baltika 4.8% 0.5 л", [(0,4,'B-TYPE'), (5,12,'B-BRAND'), (13,17,'B-PERCENT'), (18,21,'B-VOLUME'), (22,23,'I-VOLUME')]),
    ("global village летняя ягода", [(0,6,'B-BRAND'), (7,14,'I-BRAND'), (15,21,'B-TYPE'), (22,27,'I-TYPE')]),
    ("arkhangel'skkhleb багет", [(0,17,'B-BRAND'), (18,23,'B-TYPE')]),
    ("aunfed", [(0,6,'O')]),
    ("bunk club", [(0,4,'O'), (5,9,'O')]),
]

# Run tests
for text, entities in examples:
    print(f"TEXT:{text} Entities:{entities}")
    parsed = entities  # if you had strings: parse_span_str(...)
    enc = tokenize_and_align_labels(text, parsed, tokenizer, add_special_tokens=True)
    tokens = enc['tokens']; offsets = enc['offsets']; tlabels = enc['token_labels']
    print("Токены:", tokens)
    print("OFFSETS:", offsets)
    print("TOKEN LABELS:", tlabels)
    recovered = token_labels_to_char_spans(offsets, tlabels)
    print("RECOVERED CHAR-SPANS:", recovered)
    merged = merge_prefixed_char_spans(parsed)
    # prepare expected in recovered-format: B-<BASE> for entities, 'O' for O
    expected = []
    for s,e,lab in merged:
        if lab == 'O':
            expected.append((s,e,'O'))
        else:
            expected.append((s,e,'B-' + lab))
    print("EXPECTED (merged):", expected)
    ok = (recovered == expected)
    print("ROUND-TRIP OK:", ok)
    if not ok:
        print("NOTE: mismatches can happen if tokenizer splits differently; inspect offsets and labels.")
    print("-" * 60)

TEXT:яйцо куриное Entities:[(0, 4, 'B-TYPE'), (5, 12, 'I-TYPE')]
Токены: ['[CLS]', 'яйцо', 'кури', '##ное', '[SEP]']
OFFSETS: [(0, 0), (0, 4), (5, 9), (9, 12), (0, 0)]
TOKEN LABELS: ['O', 'B-TYPE', 'B-TYPE', 'I-TYPE', 'O']
RECOVERED CHAR-SPANS: [(0, 4, 'B-TYPE'), (5, 12, 'B-TYPE')]
EXPECTED (merged): [(0, 4, 'B-TYPE'), (5, 12, 'B-TYPE')]
ROUND-TRIP OK: True
------------------------------------------------------------
TEXT:яйцо куриное 30шт Entities:[(0, 4, 'B-TYPE'), (5, 12, 'I-TYPE'), (13, 17, 'B-VOLUME')]
Токены: ['[CLS]', 'яйцо', 'кури', '##ное', '30', '##шт', '[SEP]']
OFFSETS: [(0, 0), (0, 4), (5, 9), (9, 12), (13, 15), (15, 17), (0, 0)]
TOKEN LABELS: ['O', 'B-TYPE', 'B-TYPE', 'I-TYPE', 'B-VOLUME', 'I-VOLUME', 'O']
RECOVERED CHAR-SPANS: [(0, 4, 'B-TYPE'), (5, 12, 'B-TYPE'), (13, 17, 'B-VOLUME')]
EXPECTED (merged): [(0, 4, 'B-TYPE'), (5, 12, 'B-TYPE'), (13, 17, 'B-VOLUME')]
ROUND-TRIP OK: True
------------------------------------------------------------
TEXT:сок 0.2 л 10% Entities:[

In [20]:
class NERDataset(Dataset):
    def __init__(self, data, tokenizer, label2id):
        self.data = data
        self.tokenizer = tokenizer
        self.label2id = label2id

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        text, annotations = self.data[idx]
        entities = annotations['entities']

        # УБРАН параметр self.label2id - функция tokenize_and_align_labels
        # не принимает label2id, только add_special_tokens (bool)
        tokenized = tokenize_and_align_labels(text, entities, self.tokenizer)

        # Преобразуем текстовые метки в числовые ID
        labels = [self.label2id[label] for label in tokenized['token_labels']]

        return {
            'input_ids': torch.tensor(tokenized['input_ids']),
            'attention_mask': torch.tensor([1] * len(tokenized['input_ids'])),
            'labels': torch.tensor(labels)
        }


def collate_fn(batch):
    input_ids = [item["input_ids"] for item in batch]
    attention_mask = [item["attention_mask"] for item in batch]
    labels = [item["labels"] for item in batch]

    input_ids = pad_sequence(input_ids, batch_first=True, padding_value=0)
    attention_mask = pad_sequence(attention_mask, batch_first=True, padding_value=0)
    labels = pad_sequence(labels, batch_first=True, padding_value=-100)  # -100 для игнорирования в loss

    return {
        "input_ids": input_ids,
        "attention_mask": attention_mask,
        "labels": labels
    }



def evaluate_model(model, eval_data, tokenizer, id2label):
    entity_pairs = []
    for text, annotations in eval_data:
        tokenized = tokenizer([text], padding=True, truncation=True, return_tensors="pt", return_offsets_mapping=True)
        input_ids = tokenized["input_ids"].to(model.bert.device)
        attention_mask = tokenized["attention_mask"].to(model.bert.device)
        with torch.no_grad():
            pred = model(input_ids, attention_mask)[0]
        bio_labels = [id2label[p.item()] for p in pred[0]]
        offsets = tokenized["offset_mapping"][0].tolist()
        pred_entities = token_labels_to_char_spans(offsets, bio_labels)
        true_entities = annotations['entities']
        entity_pairs.append((true_entities, pred_entities))
    return calculate_macro_f1(entity_pairs)



In [21]:
print("Label2id mapping:", CONFIG['label2id'])

Label2id mapping: {'O': 0, 'B-TYPE': 1, 'I-TYPE': 2, 'B-BRAND': 3, 'I-BRAND': 4, 'B-VOLUME': 5, 'I-VOLUME': 6, 'B-PERCENT': 7, 'I-PERCENT': 8}


In [22]:
print("=== ИНИЦИАЛИЗАЦИЯ МОДЕЛИ ===")
model = NERModelWithCRF(len(CONFIG["label_list"])).to(device)
optimizer = AdamW(model.parameters(), lr=CONFIG["learning_rate"], weight_decay=CONFIG["weight_decay"])
num_training_steps = CONFIG["num_epochs"] * len(train_data) // CONFIG["batch_size"]
scheduler = get_scheduler("linear", optimizer=optimizer, num_warmup_steps=0, num_training_steps=num_training_steps)

train_dataset = NERDataset(train_data, tokenizer, label2id=CONFIG['label2id'])
train_loader = DataLoader(train_dataset, batch_size=CONFIG["batch_size"], shuffle=True, collate_fn=collate_fn)


metrics_df = pd.DataFrame(columns=['epoch', 'loss', 'f1_macro', 'f1_TYPE', 'f1_BRAND', 'f1_VOLUME', 'f1_PERCENT'])
best_f1 = 0
patience_counter = 0
best_epoch = 0

print("✅ Модель и данные успешно подготовлены!")
print(f"Размер обучающей выборки: {len(train_dataset)}")


=== ИНИЦИАЛИЗАЦИЯ МОДЕЛИ ===


Some weights of BertForTokenClassification were not initialized from the model checkpoint at DeepPavlov/rubert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


✅ Модель и данные успешно подготовлены!
Размер обучающей выборки: 21792


In [24]:
print(train_dataset[100])

{'input_ids': tensor([  101, 10968,   261,   102]), 'attention_mask': tensor([1, 1, 1, 1]), 'labels': tensor([0, 3, 4, 0])}


In [25]:
print(model)

NERModelWithCRF(
  (bert): BertForTokenClassification(
    (bert): BertModel(
      (embeddings): BertEmbeddings(
        (word_embeddings): Embedding(119547, 768, padding_idx=0)
        (position_embeddings): Embedding(512, 768)
        (token_type_embeddings): Embedding(2, 768)
        (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
        (dropout): Dropout(p=0.1, inplace=False)
      )
      (encoder): BertEncoder(
        (layer): ModuleList(
          (0-11): 12 x BertLayer(
            (attention): BertAttention(
              (self): BertSelfAttention(
                (query): Linear(in_features=768, out_features=768, bias=True)
                (key): Linear(in_features=768, out_features=768, bias=True)
                (value): Linear(in_features=768, out_features=768, bias=True)
                (dropout): Dropout(p=0.1, inplace=False)
              )
              (output): BertSelfOutput(
                (dense): Linear(in_features=768, out_features=768, b

In [27]:
print(valid_data[5])

('aktime', {'entities': [(0, 6, 'B-BRAND')]})


In [32]:
print(valid_data[5][0], valid_data[5][1]['entities'])

aktime [(0, 6, 'B-BRAND')]


In [26]:
evaluate_model(model, valid_data[5], tokenizer=tokenizer, id2label=CONFIG['id2label'])

ValueError: too many values to unpack (expected 2)

In [None]:
print("\n=== НАЧАЛО SCREENING ОБУЧЕНИЯ ===")
try:
    for epoch in range(CONFIG["num_epochs"]):
        model.train()
        total_loss = 0
        for batch in train_loader:
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            labels = batch["labels"].to(device)
            loss = model(input_ids, attention_mask, labels)

            # Усредняем лосс по батчу и затем получаем скаляр
            loss_mean = loss.mean()  # Добавляем эту строку
            total_loss += loss_mean.item()  # Используем усредненное значение

            optimizer.zero_grad()
            loss_mean.backward()  # Используем усредненное значение для обратного распространения
            optimizer.step()
            scheduler.step()

        avg_loss = total_loss / len(train_loader)
        eval_metrics = evaluate_model(model, valid_data, tokenizer)
        current_f1 = eval_metrics["f1_macro"]

        metrics_row = {
            'epoch': epoch + 1,
            'loss': avg_loss,
            **eval_metrics
        }
        metrics_df = pd.concat([metrics_df, pd.DataFrame([metrics_row])], ignore_index=True)

        print(f'Эпоха {epoch + 1:<3} | Loss: {avg_loss:.4f} | '
              f'F1-macro: {current_f1:.4f} | '
              f'F1-TYPE: {eval_metrics["f1_TYPE"]:.4f} | '
              f'F1-BRAND: {eval_metrics["f1_BRAND"]:.4f} | '
              f'F1-VOLUME: {eval_metrics["f1_VOLUME"]:.4f} | '
              f'F1-PERCENT: {eval_metrics["f1_PERCENT"]:.4f}')

        if current_f1 > best_f1:
            best_f1 = current_f1
            best_epoch = epoch + 1
            patience_counter = 0
        else:
            patience_counter += 1
            print(f"⏳ Patience: {patience_counter}/{PATIENCE}")
            if patience_counter >= PATIENCE:
                print(f"\n🛑 Ранняя остановка на эпохе {epoch + 1}")
                print(f"Лучший F1-macro: {best_f1:.4f} достигнут на эпохе {best_epoch}")
                break

except Exception as e:
    print(f'💥 Критическая ошибка: {str(e)}')
    print(traceback.format_exc())

finally:
    # Сохранение screening модели на HF
    print(f"\n💾 Сохранение screening модели на HF: {BERT_REPO_NAME+'_screening'}")
    success = save_bert_to_hf(model, tokenizer, CONFIG, BERT_REPO_NAME+'_screening', HF_TOKEN)

    if success:
        print(f"🎉 BERT screening модель успешно сохранена на HF: {BERT_REPO_NAME+'_screening'}")
    else:
        print("❌ Не удалось сохранить BERT screening модель на HF")

    # Локальное сохранение
    # torch.save(model.state_dict(), f"{MODEL_PATH}/model_screening.pt")
    metrics_df.to_csv(CONFIG["metrics_csv"], index=False)
    print("💾Метрики сохранены локально")

print("\n" + "="*80)
print("ИТОГОВЫЕ РЕЗУЛЬТАТЫ SCREENING:")
print("="*80)
print(f"Лучший F1-macro: {best_f1:.4f} на эпохе {best_epoch}")
print(f"Всего эпох выполнено: {len(metrics_df)}")


=== НАЧАЛО SCREENING ОБУЧЕНИЯ ===


  metrics_df = pd.concat([metrics_df, pd.DataFrame([metrics_row])], ignore_index=True)


Эпоха 1   | Loss: 1.8344 | F1-macro: 0.0000 | F1-TYPE: 0.0000 | F1-BRAND: 0.0000 | F1-VOLUME: 0.0000 | F1-PERCENT: 0.0000
⏳ Patience: 1/2
Эпоха 2   | Loss: 0.6936 | F1-macro: 0.0000 | F1-TYPE: 0.0000 | F1-BRAND: 0.0000 | F1-VOLUME: 0.0000 | F1-PERCENT: 0.0000
⏳ Patience: 2/2

🛑 Ранняя остановка на эпохе 2
Лучший F1-macro: 0.0000 достигнут на эпохе 0

💾 Сохранение screening модели на HF: alexflex04/NER_cleared_data_bert_screening
Token will not been saved to git credential helper. Pass `add_to_git_credential=True` if you want to set the git credential as well.
Token is valid (permission: write).
Your token has been saved to /root/.cache/huggingface/token
Login successful
✅ Авторизация HF настроена
✅ Репозиторий найден: alexflex04/NER_cleared_data_bert_screening
❌ Ошибка сохранения BERT модели: 'NERModelWithCRF' object has no attribute 'save_pretrained'
❌ Не удалось сохранить BERT screening модель на HF
💾Метрики сохранены локально

ИТОГОВЫЕ РЕЗУЛЬТАТЫ SCREENING:
Лучший F1-macro: 0.0000 н

In [None]:
# Визуализация
plt.figure(figsize=(15, 10))
plt.subplot(2, 1, 1)
plt.plot(metrics_df['epoch'], metrics_df['loss'], 'b-', linewidth=2, label='Loss')
plt.axvline(x=best_epoch, color='r', linestyle='--', alpha=0.7, label=f'Лучшая эпоха ({best_epoch})')
plt.xlabel('Эпоха')
plt.ylabel('Loss')
plt.title('Loss по эпохам')
plt.legend()
plt.grid(True, alpha=0.3)

plt.subplot(2, 1, 2)
plt.plot(metrics_df['epoch'], metrics_df['f1_macro'], 'r-', linewidth=3, label='F1-macro')
plt.plot(metrics_df['epoch'], metrics_df['f1_TYPE'], 'g--', label='F1-TYPE')
plt.plot(metrics_df['epoch'], metrics_df['f1_BRAND'], 'b--', label='F1-BRAND')
plt.plot(metrics_df['epoch'], metrics_df['f1_VOLUME'], 'y--', label='F1-VOLUME')
plt.plot(metrics_df['epoch'], metrics_df['f1_PERCENT'], 'c--', label='F1-PERCENT')
plt.axvline(x=best_epoch, color='r', linestyle='--', alpha=0.7, label=f'Лучшая эпоха ({best_epoch})')
plt.xlabel('Эпоха')
plt.ylabel('F1 Score')
plt.title('F1 Scores по эпохам')
plt.legend()
plt.grid(True, alpha=0.3)

plt.tight_layout()
plt.savefig(f"{OUT_DIR}/screening_metrics.png", dpi=300, bbox_inches='tight')
plt.show()

In [None]:
print("\n=== ПРОВЕРКА ЗАГРУЗКИ МОДЕЛИ ===")
loaded_model, loaded_tokenizer, loaded_config = load_bert_from_hf(BERT_REPO_NAME+'_screening', HF_TOKEN, device)

if loaded_model:
    print("✅ Модель успешно загружена с HF!")
    test_text = "молоко Простоквашино 2.5% 1л"
    from module import HFWrapper
    wrapper = HFWrapper(loaded_model, loaded_tokenizer)
    doc = wrapper(test_text)
    entities = [(ent.start_char, ent.end_char, ent.label_) for ent in doc.ents]
    print(f"Тестовый текст: '{test_text}'")
    print(f"Извлеченные сущности: {entities}")

    # Обработка submission файла
    print(f"\n=== ОБРАБОТКА SUBMISSION ФАЙЛА ===")
    process_submission_bert(
        model=loaded_model,
        tokenizer=loaded_tokenizer,
        input_file=CONFIG["submission_input"],
        output_file=f"{OUT_DIR}/submission_screening.csv"
    )
else:
    print("❌ Не удалось загрузить модель для тестирования")

In [None]:
# Ячейка 2: Подбор гиперпараметров (Tuning) с grid search
PARAM_GRID = {
    "learning_rate": [1e-5, 2e-5, 3e-5],
    "batch_size": [32, 64],
    "epochs": [10, 20],
    "weight_decay": [0.01, 0.1]
}

grid_results = []

for lr in PARAM_GRID["learning_rate"]:
    for bsz in PARAM_GRID["batch_size"]:
        for max_ep in PARAM_GRID["epochs"]:
            for wd in PARAM_GRID["weight_decay"]:
                combo = {"learning_rate": lr, "batch_size": bsz, "epochs": max_ep, "weight_decay": wd}
                print(f"\n=== Tuning combo: learning_rate={lr}, batch_size={bsz}, epochs={max_ep}, weight_decay={wd} ===")

                model = NERModelWithCRF(len(CONFIG["label_list"])).to(device)
                optimizer = AdamW(model.parameters(), lr=lr, weight_decay=wd)
                num_training_steps = max_ep * len(train_data) // bsz
                scheduler = get_scheduler("linear", optimizer=optimizer, num_warmup_steps=0, num_training_steps=num_training_steps)

                train_loader = DataLoader(train_dataset, batch_size=bsz, shuffle=True)

                patience_counter, best_f1, best_metrics = 0, 0.0, None
                for epoch in range(1, max_ep + 1):
                    model.train()
                    total_loss = 0
                    for batch in train_loader:
                        input_ids = batch["input_ids"].to(device)
                        attention_mask = batch["attention_mask"].to(device)
                        labels = batch["labels"].to(device)
                        loss = model(input_ids, attention_mask, labels)
                        loss_mean = loss.mean()
                        total_loss += loss_mean.item()
                        optimizer.zero_grad()
                        loss_mean.backward()
                        optimizer.step()
                        scheduler.step()

                    avg_loss = total_loss / len(train_loader)
                    metrics = evaluate_model(model, valid_data, tokenizer)
                    metrics["epoch"] = epoch
                    metrics["loss"] = avg_loss
                    current_f1 = metrics["f1_macro"]

                    print(f"Ep {epoch} | Loss: {metrics['loss']:.4f} | F1-macro: {current_f1:.4f}")

                    if current_f1 > best_f1:
                        best_f1 = current_f1
                        best_metrics = metrics
                        patience_counter = 0
                    else:
                        patience_counter += 1
                        if patience_counter >= PATIENCE:
                            break

                combo["best_f1_macro"] = best_f1
                combo["best_metrics"] = best_metrics
                grid_results.append(combo)

# Выбор лучших параметров
best_combo = max(grid_results, key=lambda x: x["best_f1_macro"])
print("\nBest tuning params:", best_combo)

# Сохранение результатов
pd.DataFrame(grid_results).to_csv(f"{OUT_DIR}/tuning_summary.csv", index=False)
with open(f"{OUT_DIR}/tuning_detailed.json", "w", encoding="utf-8") as f:
    json.dump(grid_results, f, ensure_ascii=False, indent=2)
with open(f"{OUT_DIR}/best_combo.json", "w", encoding="utf-8") as f:
    json.dump({k: v for k, v in best_combo.items() if k != "best_metrics"}, f, ensure_ascii=False, indent=2)
print("💾 Tuning результаты и best_combo сохранены")


In [None]:
# Ячейка 3: Кросс-валидация (CV) с лучшими параметрами
with open(f"{OUT_DIR}/best_combo.json", "r", encoding="utf-8") as f:
    best_combo = json.load(f)

best_lr = best_combo["learning_rate"]
best_bsz = best_combo["batch_size"]
best_max_ep = best_combo["epochs"]
best_wd = best_combo["weight_decay"]

kf = KFold(n_splits=5, shuffle=True, random_state=SEED)
cv_results = []
fold_best_f1s = []

for fold, (tr_idx, val_idx) in enumerate(kf.split(train_data), 1):
    print(f"\n=== CV Fold {fold} ===")
    fold_train = [train_data[i] for i in tr_idx]
    fold_valid = [train_data[i] for i in val_idx]

    fold_train_dataset = NERDataset(fold_train, tokenizer)
    fold_train_loader = DataLoader(fold_train_dataset, batch_size=best_bsz, shuffle=True)

    model = NERModelWithCRF(len(CONFIG["label_list"])).to(device)
    optimizer = AdamW(model.parameters(), lr=best_lr, weight_decay=best_wd)
    num_training_steps = best_max_ep * len(fold_train) // best_bsz
    scheduler = get_scheduler("linear", optimizer=optimizer, num_warmup_steps=0, num_training_steps=num_training_steps)

    patience_counter, best_f1, best_metrics = 0, 0.0, None
    for epoch in range(1, best_max_ep + 1):
        model.train()
        total_loss = 0
        for batch in fold_train_loader:
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            labels = batch["labels"].to(device)
            loss = model(input_ids, attention_mask, labels)
            loss_mean = loss.mean()
            total_loss += loss_mean.item()
            optimizer.zero_grad()
            loss_mean.backward()
            optimizer.step()
            scheduler.step()

        avg_loss = total_loss / len(fold_train_loader)
        metrics = evaluate_model(model, fold_valid, tokenizer)
        metrics["epoch"] = epoch
        metrics["loss"] = avg_loss
        current_f1 = metrics["f1_macro"]

        print(f"Fold {fold} Ep {epoch} | Loss: {metrics['loss']:.4f} | F1-macro: {current_f1:.4f}")

        if current_f1 > best_f1:
            best_f1 = current_f1
            best_metrics = metrics
            patience_counter = 0
        else:
            patience_counter += 1
            if patience_counter >= PATIENCE:
                break

    cv_results.append({"fold": fold, "best_f1_macro": best_f1, "best_metrics": best_metrics})
    fold_best_f1s.append(best_f1)

mean_f1 = np.mean(fold_best_f1s)
std_f1 = np.std(fold_best_f1s)
print(f"\nCV Results: Mean F1_macro = {mean_f1:.4f} ± {std_f1:.4f}")

# Сохранение результатов CV
pd.DataFrame(cv_results).to_csv(f"{OUT_DIR}/cv_summary.csv", index=False)
with open(f"{OUT_DIR}/cv_detailed.json", "w", encoding="utf-8") as f:
    json.dump(cv_results, f, ensure_ascii=False, indent=2)
print("💾 CV результаты сохранены")

In [None]:
# Ячейка 4: Финальное обучение на объединённом датасете (train+val)
train_val = [(row['sample'], {'entities': ast.literal_eval(row['annotation'])}) for _, row in pd.concat([train_split, valid_data]).iterrows()]
train_val_dataset = NERDataset(train_val, tokenizer)
train_val_loader = DataLoader(train_val_dataset, batch_size=best_bsz, shuffle=True)

model = NERModelWithCRF(len(CONFIG["label_list"])).to(device)
optimizer = AdamW(model.parameters(), lr=best_lr, weight_decay=best_wd)
num_training_steps = best_max_ep * len(train_val) // best_bsz
scheduler = get_scheduler("linear", optimizer=optimizer, num_warmup_steps=0, num_training_steps=num_training_steps)

records = []
best_final_f1, patience_counter = 0.0, 0
for epoch in range(1, best_max_ep + 1):
    model.train()
    total_loss = 0
    for batch in train_val_loader:
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["labels"].to(device)
        loss = model(input_ids, attention_mask, labels)
        loss_mean = loss.mean()
        total_loss += loss_mean.item()
        optimizer.zero_grad()
        loss_mean.backward()
        optimizer.step()
        scheduler.step()

    avg_loss = total_loss / len(train_val_loader)
    print(f"Эпоха {epoch} | Loss: {avg_loss:.4f}")

# Сохранение screening модели на HF
    print(f"\n💾 Сохранение screening модели на HF: {BERT_REPO_NAME}")
    success = save_bert_to_hf(model, tokenizer, CONFIG, BERT_REPO_NAME, HF_TOKEN)

    if success:
        print(f"🎉 BERT screening модель успешно сохранена на HF: {BERT_REPO_NAME}")
    else:
        print("❌ Не удалось сохранить BERT screening модель на HF")
# Вариант 1: Сохранение BERT-модели
# model.bert.save_pretrained(MODEL_PATH)
# Вариант 2: Экспорт в ONNX (закомментирован)
# dummy_input_ids = torch.randint(0, tokenizer.vocab_size, (1, 512)).to(device)
# dummy_attention_mask = torch.ones(1, 512).to(device)
# torch.onnx.export(model.bert, (dummy_input_ids, dummy_attention_mask),
#                   CONFIG["onnx_model_path"],
#                   export_params=True,
#                   opset_version=14,  # Изменено на 14
#                   input_names=['input_ids', 'attention_mask'],
#                   output_names=['logits'],
#                   dynamic_axes={'input_ids': {0: 'batch', 1: 'seq'},
#                                 'attention_mask': {0: 'batch', 1: 'seq'},
#                                 'logits': {0: 'batch', 1: 'seq'}})
# quantize_dynamic(CONFIG["onnx_model_path"], CONFIG["quantized_onnx_path"], weight_type=QuantType.QUInt8)
# print(f"\nFinal model saved: {MODEL_PATH}")

In [None]:
print("\n=== ПРОВЕРКА ЗАГРУЗКИ МОДЕЛИ ===")
loaded_model, loaded_tokenizer, loaded_config = load_bert_from_hf(BERT_REPO_NAME, HF_TOKEN, device)

if loaded_model:
    print("✅ Модель успешно загружена с HF!")
    test_text = "молоко Простоквашино 2.5% 1л"
    from module import HFWrapper
    wrapper = HFWrapper(loaded_model, loaded_tokenizer)
    doc = wrapper(test_text)
    entities = [(ent.start_char, ent.end_char, ent.label_) for ent in doc.ents]
    print(f"Тестовый текст: '{test_text}'")
    print(f"Извлеченные сущности: {entities}")

    # Обработка submission файла
    print(f"\n=== ОБРАБОТКА SUBMISSION ФАЙЛА ===")
    process_submission_bert(
        model=loaded_model,
        tokenizer=loaded_tokenizer,
        input_file=CONFIG["submission_input"],
        output_file=f"{OUT_DIR}/submission.csv"
    )
else:
    print("❌ Не удалось загрузить модель для тестирования")