In [None]:
from google.colab import drive
import getpass, os

# === Настройка проекта ===
USER = "tokarevdr"   # твой GitHub username
REPO = "entities-extraction-x5"            # название репозитория
EMAIL = "fedorov.alexander.04@gmail.com"    # твоя почта для git
NAME = "Alexander"           # твоё имя для git
# === Подключение Google Drive ===
drive.mount('/content/drive')
PROJECTS_DIR = "/content/drive/MyDrive/Colab Notebooks"
%cd $PROJECTS_DIR
# === GitHub авторизация ===
token = getpass.getpass('Введи GitHub PAT токен: ')
os.environ["GITHUB_TOKEN"] = token


# === Проверяем: если репозиторий ещё не скачан, клонируем ===
if not os.path.exists(f"{PROJECTS_DIR}/{REPO}/ML PART"):
    print('Заново склонировали репу')
    !git clone https://{USER}:{os.environ["GITHUB_TOKEN"]}@github.com/{USER}/{REPO}.git
# === Переходим в папку проекта ===
%cd {REPO}/{'ML_PART'}

# === Настройка Git ===
!git config --global user.email "{EMAIL}"
!git config --global user.name "{NAME}"
!git remote set-url origin https://{USER}:{os.environ["GITHUB_TOKEN"]}@github.com/{USER}/{REPO}.git

print("✅ Всё готово! Рабочая папка:", os.getcwd())


Mounted at /content/drive
/content/drive/MyDrive/Colab Notebooks
Введи GitHub PAT токен: ··········
/content/drive/MyDrive/Colab Notebooks/entities-extraction-x5
✅ Всё готово! Рабочая папка: /content/drive/MyDrive/Colab Notebooks/entities-extraction-x5


In [None]:
!pip install transformers torch torchvision torchaudio accelerate tokenizers sentencepiece torchcrf datasets seqeval matplotlib onnx onnxruntime

Collecting torchcrf
  Downloading TorchCRF-1.1.0-py3-none-any.whl.metadata (2.3 kB)
Collecting seqeval
  Downloading seqeval-1.2.2.tar.gz (43 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.6/43.6 kB[0m [31m3.8 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting onnx
  Downloading onnx-1.19.0-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.whl.metadata (7.0 kB)
Collecting onnxruntime
  Downloading onnxruntime-1.23.0-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (4.9 kB)
Collecting coloredlogs (from onnxruntime)
  Downloading coloredlogs-15.0.1-py2.py3-none-any.whl.metadata (12 kB)
Collecting humanfriendly>=9.1 (from coloredlogs->onnxruntime)
  Downloading humanfriendly-10.0-py2.py3-none-any.whl.metadata (9.2 kB)
Downloading TorchCRF-1.1.0-py3-none-any.whl (5.2 kB)
Downloading onnx-1.19.0-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.whl (18.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━

In [None]:
import os
import random
import numpy as np
import torch
import pandas as pd
import ast
from sklearn.model_selection import train_test_split
from torch.optim import AdamW
from transformers import AutoTokenizer, AutoModelForTokenClassification, get_scheduler
from torch.utils.data import Dataset, DataLoader
from TorchCRF import CRF
import matplotlib.pyplot as plt
import onnxruntime as ort
from onnxruntime.quantization import quantize_dynamic, QuantType
from seqeval.metrics import f1_score
import traceback
from module import calculate_ner_metrics, calculate_macro_f1, evaluate_model, process_submission

In [None]:
# Fix every random seed for reproducibility
SEED = 42
for _seed_fn in (random.seed, np.random.seed, torch.manual_seed, torch.cuda.manual_seed_all):
    _seed_fn(SEED)
# Ask cuDNN for deterministic kernels and switch off its autotuner
torch.backends.cudnn.deterministic, torch.backends.cudnn.benchmark = True, False

In [None]:
# Use the GPU when CUDA is available, otherwise the CPU
_backend = "cuda" if torch.cuda.is_available() else "cpu"
device = torch.device(_backend)
print(f"Используемое устройство: {device}")

Используемое устройство: cuda


In [None]:
# Central experiment configuration: paths, model choice, hyper-parameters,
# the BIO tag inventory with both id<->label lookups, and output file names.
_cwd = os.getcwd()
_labels = ["O", "B-TYPE", "I-TYPE", "B-BRAND", "I-BRAND", "B-VOLUME", "I-VOLUME", "B-PERCENT", "I-PERCENT"]
CONFIG = dict(
    project_dir=_cwd,
    data_path=f"{_cwd}/train.csv",
    module_path=f"{_cwd}/module.py",
    # Alternatives: "ai-forever/ruRoberta-base", 'DeepPavlov/ruBERT-large'
    # (more accurate, but needs a lot of resources at inference time)
    model_checkpoint="DeepPavlov/rubert-base-cased",
    num_epochs=10,
    batch_size=16,
    learning_rate=2e-5,
    weight_decay=0.01,
    patience=5,  # epochs without improvement before early stopping
    label_list=list(_labels),
    id2label=dict(enumerate(_labels)),
    label2id={name: idx for idx, name in enumerate(_labels)},
    metrics_csv="training_metrics_bert.csv",
    best_model_dir="bert_model",
    onnx_model_path="model.onnx",
    quantized_onnx_path="model_quantized.onnx",
    submission_input="submission.csv",
    submission_output="submission_response_bert.csv",
)

In [None]:
# Read the raw ';'-separated training table and report how many rows it has
df = pd.read_csv(CONFIG["data_path"], delimiter=';')
_row_count = len(df)
print(f"Загружено {_row_count} строк")

Загружено 27251 строк


In [None]:
# Turn each row into (text, {'entities': [...]}) — the annotation column holds a
# Python-literal list that is parsed with ast.literal_eval and stored as tuples.
raw_data = [
    (text, {'entities': [tuple(span) for span in ast.literal_eval(annotation_src)]})
    for text, annotation_src in zip(df['sample'], df['annotation'])
]

In [None]:
# Peek at one parsed example: its text followed by its entity spans
_sample_text, _sample_ann = raw_data[125]
print(_sample_text, _sample_ann['entities'])

artfruit виноград [(0, 8, 'B-BRAND'), (9, 17, 'B-TYPE')]


In [None]:
# Initialise the tokenizer that belongs to the checkpoint named in CONFIG
tokenizer = AutoTokenizer.from_pretrained(CONFIG["model_checkpoint"])

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/24.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/642 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

In [None]:
# Convert character-level entity spans into token-level BIO label ids
def spans_to_bio(text, entities, tokenizer, label2id=None):
    """Tokenize ``text`` and project its character spans onto the tokens.

    Args:
        text: the input string.
        entities: iterable of ``(start, end, label)`` character spans. ``label``
            may carry a ``B-``/``I-`` prefix (e.g. ``'B-BRAND'``) or be ``'O'``.
        tokenizer: callable returning ``input_ids``, ``attention_mask`` and
            ``offset_mapping`` when called with ``return_offsets_mapping=True``.
        label2id: optional label-name -> id mapping; defaults to
            ``CONFIG["label2id"]``.

    Returns:
        ``(input_ids, attention_mask, labels)`` where ``labels`` holds one label
        id per token; tokens outside every entity (and special tokens) get "O".

    Raises:
        KeyError: if an entity type has no ``B-``/``I-`` entry in ``label2id``.
    """
    if label2id is None:
        label2id = CONFIG["label2id"]

    tokenized = tokenizer(text, return_offsets_mapping=True, truncation=True, max_length=512)
    offsets = tokenized["offset_mapping"]
    labels = [label2id["O"]] * len(tokenized["input_ids"])

    for start, end, label in entities:
        prefix, _, ent_type = label.rpartition('-')
        if ent_type == "O":
            # Explicit "outside" annotations have no B-/I- ids; tokens are already "O".
            continue
        b_label = label2id[f"B-{ent_type}"]
        i_label = label2id[f"I-{ent_type}"]
        # Respect the annotation's own prefix: a span annotated as I-X continues
        # the previous entity, so its first token must not be turned into B-X.
        first_label = i_label if prefix == "I" else b_label
        entity_started = False
        for idx, (off_start, off_end) in enumerate(offsets):
            if off_start == off_end:  # special token (empty offset)
                continue
            if off_start >= end:  # past the entity; later tokens cannot overlap
                break
            if off_end <= start:  # token lies entirely before the entity
                continue
            # Any token overlapping the span belongs to it: the first one takes
            # the span's leading label, the rest continue with I-.
            labels[idx] = i_label if entity_started else first_label
            entity_started = True

    return tokenized["input_ids"], tokenized["attention_mask"], labels

In [None]:
# Spot-check the span -> BIO conversion on a few samples
for sample_idx in range(789, 900, 20):
    sample_text, sample_ann = raw_data[sample_idx]
    sample_entities = sample_ann['entities']
    print(sample_text, sample_entities)
    print(spans_to_bio(sample_text, sample_entities, tokenizer))

In [None]:
def bio_to_spans(text, bio_labels, offsets, id2label=None):
    """Merge token-level BIO label ids back into character spans.

    Args:
        text: the source string (kept for interface compatibility; span ends are
            taken from the token offsets, not from the text length).
        bio_labels: sequence of label ids, one per token.
        offsets: sequence of ``(start, end)`` character offsets, one per token;
            tokens with ``start == end`` are treated as special/padding.
        id2label: optional id -> label-name mapping; defaults to
            ``CONFIG["id2label"]``.

    Returns:
        List of ``(start, end, entity_type)`` tuples. ``end`` is the end offset
        of the last token that belongs to the entity. An ``I-`` token that does
        not continue an open entity of the same type is dropped.
    """
    if id2label is None:
        id2label = CONFIG["id2label"]

    entities = []
    current_start = None
    current_end = None
    current_label = None

    for label_id, (start, end) in zip(bio_labels, offsets):
        label = id2label[label_id]
        if label == "O" or start == end:  # outside tag or special token
            if current_start is not None:
                entities.append((current_start, current_end, current_label))
                current_start = None
            continue

        prefix, ent_type = label.split('-', 1)
        if prefix == 'I' and current_start is not None and current_label == ent_type:
            # Continuation: extend the open entity up to this token's end.
            current_end = end
            continue

        # A new B- tag or a mismatching I- tag closes whatever is open.
        if current_start is not None:
            entities.append((current_start, current_end, current_label))
            current_start = None
        if prefix == 'B':
            current_start, current_end, current_label = start, end, ent_type

    if current_start is not None:
        entities.append((current_start, current_end, current_label))
    return entities

In [None]:
# No extra preprocessing happens here: processed_data is the same list object as raw_data
processed_data = raw_data

In [None]:
# Split into train/val (10% goes to validation); random_state=SEED fixes the split
train_data, val_data = train_test_split(processed_data, test_size=0.1, random_state=SEED)

In [None]:
# Custom Dataset
class NERDataset(Dataset):
    """Token-classification dataset yielding tensors for one example at a time.

    Each input item may be either
      * an already encoded ``(input_ids, attention_mask, labels)`` triple, or
      * a raw ``(text, {'entities': [...]})`` pair, which is encoded once in
        ``__init__`` so that ``__getitem__`` always sees a triple.

    Args:
        data: sequence of items in either form above.
        encode_fn: optional ``(text, entities) -> (input_ids, attention_mask,
            labels)`` callable used for raw pairs; defaults to the notebook's
            ``spans_to_bio`` with the module-level ``tokenizer``.
    """

    def __init__(self, data, encode_fn=None):
        if encode_fn is None:
            # Resolved lazily, so the globals are only needed when raw pairs are present.
            encode_fn = lambda text, entities: spans_to_bio(text, entities, tokenizer)
        # A new list is built; the caller's data is left untouched.
        self.data = [self._encode(item, encode_fn) for item in data]

    @staticmethod
    def _encode(item, encode_fn):
        """Return ``item`` as an encoded triple, encoding raw (text, annotation) pairs."""
        if len(item) == 2 and isinstance(item[1], dict):
            text, annotation = item
            return tuple(encode_fn(text, annotation["entities"]))
        return item

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        input_ids, attention_mask, labels = self.data[idx]
        return {
            "input_ids": torch.tensor(input_ids, dtype=torch.long),
            "attention_mask": torch.tensor(attention_mask, dtype=torch.long),
            "labels": torch.tensor(labels, dtype=torch.long)
        }

In [None]:
def pad_collate(batch):
    """Right-pad a list of NERDataset items to a common length and stack them.

    Sequences in one batch have different token counts, so the default collation
    (which stacks tensors of equal size) cannot be used. Padding values:
    the tokenizer's pad id for ``input_ids``, 0 for ``attention_mask`` (so padded
    positions are masked out) and the "O" id for ``labels`` (a valid tag index).
    """
    pad_token_id = tokenizer.pad_token_id or 0

    def _pad(key, value):
        return torch.nn.utils.rnn.pad_sequence(
            [item[key] for item in batch], batch_first=True, padding_value=value
        )

    return {
        "input_ids": _pad("input_ids", pad_token_id),
        "attention_mask": _pad("attention_mask", 0),
        "labels": _pad("labels", CONFIG["label2id"]["O"]),
    }


train_dataset = NERDataset(train_data)
val_dataset = NERDataset(val_data)

train_loader = DataLoader(train_dataset, batch_size=CONFIG["batch_size"], shuffle=True, collate_fn=pad_collate)
val_loader = DataLoader(val_dataset, batch_size=CONFIG["batch_size"], collate_fn=pad_collate)

In [None]:
# Model: BERT token classifier whose logits are scored/decoded by a CRF layer
class NERModelWithCRF(torch.nn.Module):
    """BERT emissions + CRF.

    ``forward`` returns a scalar negative log-likelihood when ``labels`` is
    given, otherwise the decoded tag-id sequences (one list per example,
    covering only the positions where ``attention_mask`` is set).

    The CRF class is used through a small compatibility layer because two
    packages expose a ``CRF`` with different signatures:
      * pytorch-crf: ``CRF(n, batch_first=True)``, ``.decode(...)``, summed
        log-likelihood;
      * TorchCRF (the package this notebook imports): ``CRF(n)``,
        ``.viterbi_decode(...)``, one log-likelihood per example.
    """

    def __init__(self, num_labels):
        super().__init__()
        # Load the pretrained checkpoint with a fresh token-classification head
        self.bert = AutoModelForTokenClassification.from_pretrained(CONFIG["model_checkpoint"], num_labels=num_labels)
        try:
            self.crf = CRF(num_labels, batch_first=True)
        except TypeError:
            # This CRF implementation has no batch_first argument.
            self.crf = CRF(num_labels)

    def forward(self, input_ids, attention_mask, labels=None):
        emissions = self.bert(input_ids, attention_mask=attention_mask).logits
        # Boolean mask: uint8 masks are deprecated for masking ops in recent PyTorch.
        mask = attention_mask.bool()
        if labels is not None:
            log_likelihood = self.crf(emissions, labels, mask=mask)
            # .sum() is a no-op on an already reduced scalar and reduces a
            # per-example vector, so callers always get a scalar loss.
            return -log_likelihood.sum()
        decode = getattr(self.crf, "decode", None) or self.crf.viterbi_decode
        return decode(emissions, mask=mask)

In [None]:
# Build the BERT+CRF model with one output per entry of CONFIG["label_list"] and move it to the selected device
model = NERModelWithCRF(len(CONFIG["label_list"])).to(device)

In [None]:
# Optimizer and linear learning-rate schedule (no warm-up) over all training steps
optimizer = AdamW(
    model.parameters(),
    lr=CONFIG["learning_rate"],
    weight_decay=CONFIG["weight_decay"],
)
num_training_steps = len(train_loader) * CONFIG["num_epochs"]
scheduler = get_scheduler(
    "linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps,
)

In [None]:
# Metrics (adapted for BIO tags)
def compute_metrics(preds, labels):
    """Compute the macro-averaged seqeval F1 for predicted vs. gold tag ids.

    Args:
        preds: list of predicted tag-id sequences.
        labels: list of gold tag-id sequences; ``-100`` marks ignored positions.

    Returns:
        ``{"f1_macro": <float>}``.

    Predictions and gold tags are paired position by position, so dropping an
    ignored (-100) gold position also drops the matching prediction. If a gold
    sequence is longer than its prediction, the extra tail is ignored — this
    assumes the surplus is right-side padding.
    """
    id2label = CONFIG["id2label"]
    preds_str, labels_str = [], []
    for pred_seq, label_seq in zip(preds, labels):
        kept = [(p, l) for p, l in zip(pred_seq, label_seq) if l != -100]
        preds_str.append([id2label[p] for p, _ in kept])
        labels_str.append([id2label[l] for _, l in kept])
    f1 = f1_score(labels_str, preds_str, average='macro')
    return {"f1_macro": f1}

In [None]:
# Training bookkeeping: per-epoch metrics table plus early-stopping state
_metric_columns = ['epoch', 'loss', 'val_loss', 'f1_macro']
metrics_df = pd.DataFrame(columns=_metric_columns)
best_f1, patience_counter = 0, 0

In [None]:
for epoch in range(CONFIG["num_epochs"]):
    # ---- Training pass ----
    model.train()
    total_loss = 0
    for batch in train_loader:
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["labels"].to(device)
        loss = model(input_ids, attention_mask, labels)
        total_loss += loss.item()
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        scheduler.step()

    avg_loss = total_loss / len(train_loader)

    # ---- Validation pass ----
    model.eval()
    val_loss = 0
    preds, true_labels = [], []
    with torch.no_grad():
        for batch in val_loader:
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            labels = batch["labels"].to(device)
            loss = model(input_ids, attention_mask, labels)
            val_loss += loss.item()
            pred = model(input_ids, attention_mask)
            preds.extend(pred)
            # The CRF decodes only the unmasked positions, so the gold labels
            # are cut with the same mask to keep both sequences aligned.
            real_tokens = attention_mask.bool().cpu()
            true_labels.extend(
                seq[keep].tolist() for seq, keep in zip(labels.cpu(), real_tokens)
            )

    avg_val_loss = val_loss / len(val_loader)
    metrics = compute_metrics(preds, true_labels)
    current_f1 = metrics["f1_macro"]

    print(f"Эпоха {epoch+1}: Loss {avg_loss:.4f}, Val Loss {avg_val_loss:.4f}, F1-macro {current_f1:.4f}")

    metrics_row = {'epoch': epoch+1, 'loss': avg_loss, 'val_loss': avg_val_loss, 'f1_macro': current_f1}
    metrics_df = pd.concat([metrics_df, pd.DataFrame([metrics_row])], ignore_index=True)

    if current_f1 > best_f1:
        best_f1 = current_f1
        # Checkpoint the best epoch: the BERT part in Hugging Face format ...
        model.bert.save_pretrained(CONFIG["best_model_dir"])
        # ... and the CRF parameters next to it, so the full model can be restored.
        torch.save(model.crf.state_dict(), os.path.join(CONFIG["best_model_dir"], "crf.pt"))
        patience_counter = 0
    else:
        patience_counter += 1
        if patience_counter >= CONFIG["patience"]:
            print("Ранняя остановка")
            break

In [None]:
# Write the per-epoch metrics table to the CSV path from CONFIG (without the index column)
metrics_df.to_csv(CONFIG["metrics_csv"], index=False)

In [None]:
# Show the metrics collected during training, rounded to 4 decimals
print("\nМетрики за время обучения:")
print(metrics_df.round(4))

In [None]:
# Plot the training curves: losses on the left panel, macro F1 on the right
fig, (loss_ax, f1_ax) = plt.subplots(1, 2, figsize=(12, 6))

epochs_axis = metrics_df['epoch']
for column, legend_name in (('loss', 'Train Loss'), ('val_loss', 'Val Loss')):
    loss_ax.plot(epochs_axis, metrics_df[column], label=legend_name)
loss_ax.legend()
loss_ax.set_title('Loss')

f1_ax.plot(epochs_axis, metrics_df['f1_macro'], label='F1 Macro')
f1_ax.legend()
f1_ax.set_title('F1')

plt.show()

In [None]:
# Batch inference
def batch_predict(texts, batch_size=64, ner_model=None):
    """Predict entity spans for a list of texts.

    Args:
        texts: sequence of input strings.
        batch_size: number of texts sent through the model at once, so that a
            long list does not have to fit into a single forward pass.
        ner_model: model to run; defaults to the module-level ``model``. It is
            called as ``ner_model(input_ids, attention_mask)`` and must return
            one tag-id sequence per text.

    Returns:
        A list with one entry per text, each a list of ``(start, end, type)``
        spans as produced by ``bio_to_spans``. An empty input gives ``[]``.
    """
    if len(texts) == 0:
        return []

    net = model if ner_model is None else ner_model
    net.eval()
    results = []
    for begin in range(0, len(texts), batch_size):
        chunk = list(texts[begin:begin + batch_size])
        # max_length matches the limit used when encoding the training data.
        tokenized = tokenizer(chunk, padding=True, truncation=True, max_length=512,
                              return_tensors="pt", return_offsets_mapping=True)
        input_ids = tokenized["input_ids"].to(device)
        attention_mask = tokenized["attention_mask"].to(device)
        with torch.no_grad():
            preds = net(input_ids, attention_mask)
        offsets = tokenized["offset_mapping"].tolist()
        for text, pred, text_offsets in zip(chunk, preds, offsets):
            results.append(bio_to_spans(text, pred, text_offsets))
    return results

In [None]:
# Sample inputs used to demonstrate inference
demo_texts = [
    "чипсы русская картошка краб",
    "чипсы паприка",
    "чипсы острые из",
    "яблоки 3 штуки",
    "апельсин большой"
]

In [None]:
# Run the demo texts through the model and print each text with its predicted spans
predictions = batch_predict(demo_texts)
print("\nПримеры предсказаний:")
for demo_idx in range(min(len(demo_texts), len(predictions))):
    text, pred = demo_texts[demo_idx], predictions[demo_idx]
    print(f"Текст: {text}")
    print(f"Предсказания: {pred}")

In [None]:
# Export to ONNX (only the BERT emission model; the CRF is not part of the graph)
# Export in eval mode so dropout is not traced as active.
model.bert.eval()
dummy_input_ids = torch.randint(0, tokenizer.vocab_size, (1, 512)).to(device)
# Integer mask: a float dummy would make the exported graph expect float
# attention masks, while tokenizers produce integer ones.
dummy_attention_mask = torch.ones(1, 512, dtype=torch.long).to(device)
# opset 14: recent transformers releases route BERT attention through
# scaled_dot_product_attention, which the exporter supports from opset 14 on.
torch.onnx.export(model.bert, (dummy_input_ids, dummy_attention_mask), CONFIG["onnx_model_path"],
                  export_params=True, opset_version=14, input_names=['input_ids', 'attention_mask'],
                  output_names=['logits'], dynamic_axes={'input_ids': {0: 'batch', 1: 'seq'},
                                                         'attention_mask': {0: 'batch', 1: 'seq'},
                                                         'logits': {0: 'batch', 1: 'seq'}})

In [None]:
# Dynamic quantisation of the exported ONNX graph
quantize_dynamic(CONFIG["onnx_model_path"], CONFIG["quantized_onnx_path"], weight_type=QuantType.QUInt8)

# Reload the saved model (used for the final evaluation and the submission)
# The best BERT checkpoint comes from disk ...
loaded_bert = AutoModelForTokenClassification.from_pretrained(CONFIG["best_model_dir"]).to(device)
loaded_model = NERModelWithCRF(len(CONFIG["label_list"])).to(device)
loaded_model.bert = loaded_bert
# ... and the CRF must be restored as well, otherwise decoding would use freshly
# initialised transition scores. Prefer weights saved next to the checkpoint;
# if none were saved, copy them from the in-memory trained model.
crf_weights_path = os.path.join(CONFIG["best_model_dir"], "crf.pt")
if os.path.exists(crf_weights_path):
    loaded_model.crf.load_state_dict(torch.load(crf_weights_path, map_location=device))
else:
    loaded_model.crf.load_state_dict(model.crf.state_dict())

In [None]:
# Final evaluation with module.py on the validation split
entity_pairs = []
# train_test_split shuffles, so the validation examples are the items of
# val_data itself — not the last rows of raw_data.
val_texts = [text for text, _ in val_data]
val_anns = [ann for _, ann in val_data]
# Predict in chunks so the whole split never has to fit into one forward pass.
preds = []
for chunk_start in range(0, len(val_texts), CONFIG["batch_size"]):
    preds.extend(batch_predict(val_texts[chunk_start:chunk_start + CONFIG["batch_size"]]))
# NOTE(review): gold entities keep their B-/I- prefixed labels while bio_to_spans
# emits bare entity types — confirm that calculate_macro_f1 normalises both.
for true_ann, pred_spans in zip(val_anns, preds):
    true_entities = true_ann['entities']
    entity_pairs.append((true_entities, pred_spans))

macro_f1, f1_type, f1_brand, f1_volume, f1_percent = calculate_macro_f1(entity_pairs)
print(f"\nФинальные метрики (macro F1: {macro_f1:.4f}, TYPE: {f1_type:.4f}, BRAND: {f1_brand:.4f}, VOLUME: {f1_volume:.4f}, PERCENT: {f1_percent:.4f})")

In [None]:
# Run the submission pipeline through a spaCy-like wrapper around the model
class HFWrapper:
    """Callable adapter returning an object with ``.ents`` for a given text.

    Each entity exposes ``start_char``, ``end_char`` and ``label_``. The spans
    come from ``batch_predict``; the ``model`` and ``tokenizer`` passed to the
    constructor are stored as attributes but are not read by ``__call__``.
    """

    def __init__(self, model, tokenizer):
        self.tokenizer = tokenizer
        self.model = model

    def __call__(self, text):
        class Ent:
            """One predicted entity span."""

            def __init__(self, start, end, label):
                self.label_ = label
                self.end_char = end
                self.start_char = start

        class Doc:
            """Container holding the predicted entities."""

            def __init__(self, ents):
                self.ents = ents

        predicted_spans = batch_predict([text])[0]
        found = []
        for span_start, span_end, span_label in predicted_spans:
            found.append(Ent(span_start, span_end, span_label))
        return Doc(found)

trained_model = HFWrapper(loaded_model, tokenizer)
process_submission(
    trained_model,
    CONFIG["submission_input"],
    CONFIG["submission_output"],
)

print("Ноутбук завершен!")