In [2]:
from google.colab import drive
import getpass, os

# === Настройка проекта ===
USER = "tokarevdr"   # твой GitHub username
REPO = "entities-extraction-x5"            # название репозитория
EMAIL = "fedorov.alexander.04@gmail.com"    # твоя почта для git
NAME = "Alexander"           # твоё имя для git
# === Подключение Google Drive ===
drive.mount('/content/drive')
PROJECTS_DIR = "/content/drive/MyDrive/Colab Notebooks"
%cd $PROJECTS_DIR
# === GitHub авторизация ===
token = getpass.getpass('Введи GitHub PAT токен: ')
os.environ["GITHUB_TOKEN"] = token

# === Проверяем: если репозиторий ещё не скачан, клонируем ===
if not os.path.exists(f"{PROJECTS_DIR}/{REPO}/ML PART"):
    print('Заново склонировали репу')
    !git clone https://{USER}:{os.environ["GITHUB_TOKEN"]}@github.com/{USER}/{REPO}.git
# === Переходим в папку проекта ===
%cd {REPO}/{'ML_PART'}

# === Настройка Git ===
!git config --global user.email "{EMAIL}"
!git config --global user.name "{NAME}"
!git remote set-url origin https://{USER}:{os.environ["GITHUB_TOKEN"]}@github.com/{USER}/{REPO}.git

print("✅ Всё готово! Рабочая папка:", os.getcwd())

Mounted at /content/drive
/content/drive/MyDrive/Colab Notebooks
Введи GitHub PAT токен: ··········
Заново склонировали репу
fatal: destination path 'entities-extraction-x5' already exists and is not an empty directory.
/content/drive/MyDrive/Colab Notebooks/entities-extraction-x5/ML_PART
✅ Всё готово! Рабочая папка: /content/drive/MyDrive/Colab Notebooks/entities-extraction-x5/ML_PART


In [6]:
import os
import json
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import torch
from transformers import AutoTokenizer, AutoModelForTokenClassification
from TorchCRF import CRF
import onnxruntime as ort
import spacy
import time
import psutil
from module import process_submission, HFWrapper

In [None]:
# Configuration for BERT (mirrors fine_tune_bert.py)
CONFIG = {
    "model_checkpoint": "DeepPavlov/rubert-base-cased",
    "label_list": [
        "O",
        "B-TYPE", "I-TYPE",
        "B-BRAND", "I-BRAND",
        "B-VOLUME", "I-VOLUME",
        "B-PERCENT", "I-PERCENT",
    ],
}
# Both lookup tables are derived from the single label list above, so the
# id <-> label mappings always agree with it.
CONFIG["id2label"] = dict(enumerate(CONFIG["label_list"]))
CONFIG["label2id"] = {name: idx for idx, name in CONFIG["id2label"].items()}

# Convert BIO token labels into character spans (adapted from fine_tune_bert.py)
def bio_to_spans(text, bio_labels, offsets, id2label=None):
    """
    Merge per-token BIO label ids into entity character spans.

    Args:
        text (str): Source text. Kept for interface compatibility; the span
            boundaries are taken from ``offsets`` only.
        bio_labels (iterable of int): One label id per token.
        offsets (iterable of (int, int)): ``(start, end)`` character offsets per
            token. Tokens with ``start == end`` are treated as special tokens.
        id2label (dict, optional): Mapping from label id to a label string such
            as ``"O"`` or ``"B-TYPE"``. Defaults to ``CONFIG["id2label"]``.

    Returns:
        list of (int, int, str): ``(start_char, end_char, entity_type)`` tuples in
        order of appearance. Each span ends at the end of its last entity token.
    """
    if id2label is None:
        id2label = CONFIG["id2label"]

    entities = []
    current = None  # [start, end, entity_type] of the entity being built

    for label_id, (start, end) in zip(bio_labels, offsets):
        # Zero-width offsets are treated as "outside" without looking at the label.
        label = "O" if start == end else id2label[label_id]
        prefix, _, ent_type = label.partition("-")

        if prefix == "I" and current is not None and current[2] == ent_type:
            # Continuation of the open entity: extend it to this token's end.
            current[1] = end
            continue

        # Any other token terminates the open entity at the end of its own last
        # token. Using the terminating token's offsets instead would swallow
        # that token, or yield end == 0 when it is a special token at (0, 0).
        if current is not None:
            entities.append(tuple(current))
            current = None

        if prefix == "B":
            current = [start, end, ent_type]
        # An "I-" tag that does not continue the open entity is dropped.

    if current is not None:
        entities.append(tuple(current))
    return entities

# BERT token classifier with a CRF layer on top
class NERModelWithCRF(torch.nn.Module):
    """
    Token-classification model whose logits are fed to a CRF as emissions.

    With ``labels`` the forward pass returns the negated CRF output (used as the
    loss); without ``labels`` it returns the result of ``crf.decode``.
    """

    def __init__(self, num_labels):
        super().__init__()
        # The encoder is always initialised from CONFIG["model_checkpoint"].
        self.bert = AutoModelForTokenClassification.from_pretrained(
            CONFIG["model_checkpoint"],
            num_labels=num_labels,
        )
        self.crf = CRF(num_labels)  # constructed without batch_first

    def forward(self, input_ids, attention_mask, labels=None):
        emissions = self.bert(input_ids, attention_mask=attention_mask).logits
        crf_mask = attention_mask.type(torch.uint8)

        if labels is None:
            # NOTE(review): assumes the installed CRF class exposes `decode` — confirm.
            return self.crf.decode(emissions, mask=crf_mask)

        # NOTE(review): whether this is a scalar or a per-sequence tensor depends
        # on the CRF implementation — confirm before calling .backward() on it.
        return -self.crf(emissions, labels, mask=crf_mask)

In [None]:
def plot_screening_metrics(model_path, dataset_path):
    """
    Print the screening metrics and plot loss / F1 per epoch.

    Reads ``OUTPUT/<dataset>/<model>/screening_metrics.csv`` and saves the figure
    next to it as ``screening_metrics_comparison.png``. Prints a message and
    returns early when the CSV does not exist.

    Args:
        model_path (str): Model directory (e.g. 'MODELS/cleared_data/bert' or
            'ru_core_news_lg'); only its last path component is used.
        dataset_path (str): Dataset directory (e.g. 'data/cleared_data/'); a
            leading 'data' component is dropped to build the output directory.
    """
    # Drop only the leading "data" directory. A plain str.replace('data', '')
    # would also strip "data" inside names such as "cleared_data".
    dataset_rel = dataset_path.strip("/")
    if dataset_rel == "data" or dataset_rel.startswith("data/"):
        dataset_rel = dataset_rel[len("data"):].lstrip("/")
    model_name = os.path.basename(model_path.rstrip("/"))
    out_dir = "/".join(part for part in ("OUTPUT", dataset_rel, model_name) if part)
    metrics_file = f"{out_dir}/screening_metrics.csv"

    if not os.path.exists(metrics_file):
        print(f"Файл метрик {metrics_file} не найден.")
        return

    metrics_df = pd.read_csv(metrics_file)
    best_f1 = metrics_df['f1_macro'].max()
    best_epoch = metrics_df.loc[metrics_df['f1_macro'].idxmax(), 'epoch']

    print("\n=== Метрики скрининга ===")
    print(f"Лучший F1-macro: {best_f1:.4f} на эпохе {best_epoch}")
    print("\nДетальные метрики по эпохам:")
    print(metrics_df.round(4))

    plt.figure(figsize=(15, 10))
    # Loss curve
    plt.subplot(2, 1, 1)
    plt.plot(metrics_df['epoch'], metrics_df['loss'], 'b-', linewidth=2, label='Loss')
    plt.axvline(x=best_epoch, color='r', linestyle='--', alpha=0.7, label=f'Лучшая эпоха ({best_epoch})')
    plt.xlabel('Эпоха')
    plt.ylabel('Loss')
    plt.title('Loss по эпохам')
    plt.legend()
    plt.grid(True, alpha=0.3)

    # F1 curves: macro plus one line per entity type
    plt.subplot(2, 1, 2)
    plt.plot(metrics_df['epoch'], metrics_df['f1_macro'], 'r-', linewidth=3, label='F1-macro')
    plt.plot(metrics_df['epoch'], metrics_df['f1_TYPE'], 'g--', label='F1-TYPE')
    plt.plot(metrics_df['epoch'], metrics_df['f1_BRAND'], 'b--', label='F1-BRAND')
    plt.plot(metrics_df['epoch'], metrics_df['f1_VOLUME'], 'y--', label='F1-VOLUME')
    plt.plot(metrics_df['epoch'], metrics_df['f1_PERCENT'], 'c--', label='F1-PERCENT')
    plt.axvline(x=best_epoch, color='r', linestyle='--', alpha=0.7, label=f'Лучшая эпоха ({best_epoch})')
    plt.xlabel('Эпоха')
    plt.ylabel('F1 Score')
    plt.title('F1 Scores по эпохам')
    plt.legend()
    plt.grid(True, alpha=0.3)

    plt.tight_layout()
    plt.savefig(f"{out_dir}/screening_metrics_comparison.png", dpi=300, bbox_inches='tight')
    plt.show()

def plot_tuning_metrics(model_path, dataset_path):
    """
    Print the hyper-parameter tuning summary and plot the best F1 per combination.

    Reads ``OUTPUT/<dataset>/<model>/tuning_summary.csv`` and saves the figure
    next to it as ``tuning_metrics_comparison.png``. Prints a message and returns
    early when the CSV does not exist.

    Args:
        model_path (str): Model directory; only its last path component is used.
        dataset_path (str): Dataset directory; a leading 'data' component is
            dropped to build the output directory.
    """
    # Drop only the leading "data" directory. A plain str.replace('data', '')
    # would also strip "data" inside names such as "cleared_data".
    dataset_rel = dataset_path.strip("/")
    if dataset_rel == "data" or dataset_rel.startswith("data/"):
        dataset_rel = dataset_rel[len("data"):].lstrip("/")
    model_name = os.path.basename(model_path.rstrip("/"))
    out_dir = "/".join(part for part in ("OUTPUT", dataset_rel, model_name) if part)
    tuning_file = f"{out_dir}/tuning_summary.csv"

    if not os.path.exists(tuning_file):
        print(f"Файл tuning {tuning_file} не найден.")
        return

    tuning_df = pd.read_csv(tuning_file)
    # Row with the highest best_f1_macro
    best_combo = tuning_df.loc[tuning_df['best_f1_macro'].idxmax()]

    print("\n=== Результаты подбора гиперпараметров ===")
    print(f"Лучшая комбинация: {best_combo.to_dict()}")
    print("\nВсе комбинации:")
    print(tuning_df.round(4))

    # One bar per row of the summary, positioned by the DataFrame index
    plt.figure(figsize=(10, 6))
    plt.bar(tuning_df.index, tuning_df['best_f1_macro'], color='skyblue')
    plt.xlabel('Комбинация гиперпараметров')
    plt.ylabel('F1-macro')
    plt.title('F1-macro для всех комбинаций гиперпараметров')
    plt.grid(True, alpha=0.3)
    plt.savefig(f"{out_dir}/tuning_metrics_comparison.png", dpi=300, bbox_inches='tight')
    plt.show()

def plot_cv_metrics(model_path, dataset_path):
    """
    Print the cross-validation summary and plot the best F1-macro per fold.

    Reads ``OUTPUT/<dataset>/<model>/cv_summary.csv`` and saves the figure next
    to it as ``cv_metrics_comparison.png``. Prints a message and returns early
    when the CSV does not exist.

    Args:
        model_path (str): Model directory; only its last path component is used.
        dataset_path (str): Dataset directory; a leading 'data' component is
            dropped to build the output directory.
    """
    # Drop only the leading "data" directory. A plain str.replace('data', '')
    # would also strip "data" inside names such as "cleared_data".
    dataset_rel = dataset_path.strip("/")
    if dataset_rel == "data" or dataset_rel.startswith("data/"):
        dataset_rel = dataset_rel[len("data"):].lstrip("/")
    model_name = os.path.basename(model_path.rstrip("/"))
    out_dir = "/".join(part for part in ("OUTPUT", dataset_rel, model_name) if part)
    cv_file = f"{out_dir}/cv_summary.csv"

    if not os.path.exists(cv_file):
        print(f"Файл CV {cv_file} не найден.")
        return

    cv_df = pd.read_csv(cv_file)
    mean_f1 = cv_df['best_f1_macro'].mean()
    std_f1 = cv_df['best_f1_macro'].std()

    print("\n=== Результаты кросс-валидации ===")
    print(f"Mean F1-macro: {mean_f1:.4f} ± {std_f1:.4f}")
    print("\nМетрики по фолдам:")
    print(cv_df.round(4))

    # One bar per fold, with the mean drawn as a horizontal reference line
    plt.figure(figsize=(10, 6))
    plt.bar(cv_df['fold'], cv_df['best_f1_macro'], color='lightgreen')
    plt.axhline(y=mean_f1, color='r', linestyle='--', label=f'Mean F1-macro: {mean_f1:.4f}')
    plt.xlabel('Фолд')
    plt.ylabel('F1-macro')
    plt.title('F1-macro по фолдам кросс-валидации')
    plt.legend()
    plt.grid(True, alpha=0.3)
    plt.savefig(f"{out_dir}/cv_metrics_comparison.png", dpi=300, bbox_inches='tight')
    plt.show()


In [None]:
def process_submission_hf(model_path, input_file, output_file):
    """
    Build the submission file with a Hugging Face model.

    Args:
        model_path (str): Model directory (e.g. 'MODELS/cleared_data/bert_screening').
        input_file (str): Input CSV path (e.g. 'data/cleared_data/submission.csv').
        output_file (str): Output CSV path.
    """
    target_device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    # The tokenizer comes from the base checkpoint, not from model_path.
    hf_tokenizer = AutoTokenizer.from_pretrained(CONFIG["model_checkpoint"])
    fine_tuned_bert = AutoModelForTokenClassification.from_pretrained(model_path).to(target_device)

    # NOTE(review): NERModelWithCRF builds a fresh CRF and nothing here loads
    # CRF parameters from model_path — confirm that this is intended.
    ner_model = NERModelWithCRF(len(CONFIG["label_list"])).to(target_device)
    ner_model.bert = fine_tuned_bert  # swap in the weights loaded from model_path

    process_submission(HFWrapper(ner_model, hf_tokenizer), input_file, output_file)
    print(f"Тестовый датасет сохранён: {output_file}")

def process_submission_onnx(onnx_path, input_file, output_file):
    """
    Build the submission file with an ONNX model.

    Args:
        onnx_path (str): ONNX file path (e.g. 'OUTPUT/cleared_data/bert/model.onnx').
        input_file (str): Input CSV path.
        output_file (str): Output CSV path.
    """
    class Ent:
        """Entity span exposing ``start_char``, ``end_char`` and ``label_``."""

        def __init__(self, start, end, label):
            self.start_char = start
            self.end_char = end
            self.label_ = label

    class Doc:
        """Container exposing the predicted entities as ``ents``."""

        def __init__(self, ents):
            self.ents = ents

    class ONNXWrapper:
        """Callable that maps a text to a ``Doc`` using an ONNX Runtime session."""

        def __init__(self, onnx_path, tokenizer):
            provider = 'CUDAExecutionProvider' if torch.cuda.is_available() else 'CPUExecutionProvider'
            self.ort_session = ort.InferenceSession(onnx_path, providers=[provider])
            self.tokenizer = tokenizer

        def __call__(self, text):
            encoding = self.tokenizer(
                [text],
                padding=True,
                truncation=True,
                return_tensors="np",
                return_offsets_mapping=True,
            )
            feed = {
                "input_ids": encoding["input_ids"],
                "attention_mask": encoding["attention_mask"],
            }
            # The first session output is used as the logits; labels are taken
            # by argmax over the last axis for the single text in the batch.
            logits = self.ort_session.run(None, feed)[0]
            label_ids = np.argmax(logits, axis=-1)[0]
            token_offsets = encoding["offset_mapping"][0].tolist()
            found = bio_to_spans(text, label_ids, token_offsets)
            return Doc([Ent(begin, finish, kind) for begin, finish, kind in found])

    hf_tokenizer = AutoTokenizer.from_pretrained(CONFIG["model_checkpoint"])
    process_submission(ONNXWrapper(onnx_path, hf_tokenizer), input_file, output_file)
    print(f"Тестовый датасет сохранён: {output_file}")

def process_submission_spacy(model_path, input_file, output_file):
    """
    Build the submission file with a spaCy model.

    Args:
        model_path (str | os.PathLike | object): Path or name of the model
            (e.g. 'MODELS/cleared_data/ru_core_news_lg'), or an already loaded
            pipeline, which is then used as is.
        input_file (str): Input CSV path.
        output_file (str): Output CSV path.
    """
    # Other cells in this notebook pass an already loaded pipeline; only load
    # from disk when a path/name was given.
    if isinstance(model_path, (str, os.PathLike)):
        nlp = spacy.load(model_path)
    else:
        nlp = model_path
    process_submission(nlp, input_file, output_file)
    print(f"Тестовый датасет сохранён: {output_file}")

# 3. Inference performance measurement

def measure_inference_performance(model_path, model_type, sample_text, batch_size=32):
    """
    Measure inference time and resource usage of a model.

    Args:
        model_path (str): Path to the model directory or ONNX file.
        model_type (str): Model kind: 'hf', 'onnx' or 'spacy'.
        sample_text (str or list): A single text (repeated ``batch_size`` times
            for the batch run) or a list of texts (its first ``batch_size``
            items are used).
        batch_size (int): Number of texts in the batch run.

    Returns:
        dict: ``single_inference_time_s``, ``batch_inference_time_s``,
        ``batch_size`` (number of texts actually processed),
        ``cpu_usage_percent``, ``gpu_usage_mb`` and ``memory_used_mb``.

    Raises:
        ValueError: If ``model_type`` is unknown or ``sample_text`` is an empty list.
    """
    # Validate before loading anything.
    if model_type not in ('hf', 'onnx', 'spacy'):
        raise ValueError("model_type должен быть 'hf', 'onnx' или 'spacy'")

    if isinstance(sample_text, str):
        first_text = sample_text
        texts = [sample_text] * batch_size
    else:
        if len(sample_text) == 0:
            raise ValueError("sample_text не должен быть пустым")
        first_text = sample_text[0]
        texts = list(sample_text[:batch_size])

    process = psutil.Process()
    start_memory = process.memory_info().rss / 1024**2  # MB
    # With interval=None, cpu_percent compares against the previous call, so
    # prime it here; the reading taken after the workload then covers this run.
    psutil.cpu_percent(interval=None)

    if model_type == 'hf':
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        tokenizer = AutoTokenizer.from_pretrained(CONFIG["model_checkpoint"])
        bert_model = AutoModelForTokenClassification.from_pretrained(model_path).to(device)
        model = NERModelWithCRF(len(CONFIG["label_list"])).to(device)
        model.bert = bert_model
        wrapped_model = HFWrapper(model, tokenizer)

        # Single example
        start_time = time.perf_counter()
        _ = wrapped_model(first_text)
        single_time = time.perf_counter() - start_time

        # "Batch": the wrapper is called once per text
        start_time = time.perf_counter()
        for text in texts:
            _ = wrapped_model(text)
        batch_time = time.perf_counter() - start_time

    elif model_type == 'onnx':
        tokenizer = AutoTokenizer.from_pretrained(CONFIG["model_checkpoint"])
        ort_session = ort.InferenceSession(model_path, providers=['CUDAExecutionProvider' if torch.cuda.is_available() else 'CPUExecutionProvider'])

        # Single example (timing includes tokenization)
        start_time = time.perf_counter()
        tokenized = tokenizer([first_text], padding=True, truncation=True, return_tensors="np")
        ort_inputs = {"input_ids": tokenized["input_ids"], "attention_mask": tokenized["attention_mask"]}
        _ = ort_session.run(None, ort_inputs)
        single_time = time.perf_counter() - start_time

        # Batch: one session run over all texts (tokenization is not timed)
        tokenized = tokenizer(texts, padding=True, truncation=True, return_tensors="np")
        start_time = time.perf_counter()
        _ = ort_session.run(None, {"input_ids": tokenized["input_ids"], "attention_mask": tokenized["attention_mask"]})
        batch_time = time.perf_counter() - start_time

    else:  # 'spacy'
        nlp = spacy.load(model_path)

        # Single example
        start_time = time.perf_counter()
        _ = nlp(first_text)
        single_time = time.perf_counter() - start_time

        # "Batch": the pipeline is called once per text
        start_time = time.perf_counter()
        for text in texts:
            _ = nlp(text)
        batch_time = time.perf_counter() - start_time

    end_memory = process.memory_info().rss / 1024**2  # MB
    memory_used = end_memory - start_memory
    cpu_usage = psutil.cpu_percent(interval=None)
    # Memory currently allocated through torch on the GPU, in MB
    gpu_usage = torch.cuda.memory_allocated() / 1024**2 if torch.cuda.is_available() else 0  # MB

    return {
        "single_inference_time_s": single_time,
        "batch_inference_time_s": batch_time,
        "batch_size": len(texts),
        "cpu_usage_percent": cpu_usage,
        "gpu_usage_mb": gpu_usage,
        "memory_used_mb": memory_used
    }

In [None]:
# dataset_path = "data/cleared_data/"
# sample_text = "чипсы русская картошка краб"

# # BERT: скрининг и финальная модель
# print("\n=== BERT Screening ===")
# plot_screening_metrics("MODELS/cleared_data/bert_screening", dataset_path)
# plot_tuning_metrics("MODELS/cleared_data/bert", dataset_path)
# plot_cv_metrics("MODELS/cleared_data/bert", dataset_path)
# process_submission_hf("MODELS/cleared_data/bert_screening", f"{dataset_path}/submission.csv", "submission_hf_screening.csv")
# print("\nPerformance for BERT Screening:")
# print(measure_inference_performance("MODELS/cleared_data/bert_screening", "hf", sample_text))

# print("\n=== BERT Final ===")
# process_submission_hf("MODELS/cleared_data/bert", f"{dataset_path}/submission.csv", "submission_hf_final.csv")
# print("\nPerformance for BERT Final:")
# print(measure_inference_performance("MODELS/cleared_data/bert", "hf", sample_text))

# # ONNX
# print("\n=== BERT ONNX ===")
# process_submission_onnx("OUTPUT/cleared_data/bert/model.onnx", f"{dataset_path}/submission.csv", "submission_onnx.csv")
# print("\nPerformance for ONNX:")
# print(measure_inference_performance("OUTPUT/cleared_data/bert/model.onnx", "onnx", sample_text))

# # SpaCy
# print("\n=== SpaCy ===")
# plot_screening_metrics("MODELS/cleared_data/ru_core_news_lg", dataset_path)
# plot_tuning_metrics("MODELS/cleared_data/ru_core_news_lg", dataset_path)
# plot_cv_metrics("MODELS/cleared_data/ru_core_news_lg", dataset_path)
# process_submission_spacy("MODELS/cleared_data/ru_core_news_lg", f"{dataset_path}/submission.csv", "submission_spacy.csv")
# print("\nPerformance for SpaCy:")
# print(measure_inference_performance("MODELS/cleared_data/ru_core_news_lg", "spacy", sample_text))

In [12]:
# Three samples, each a pair of span lists for the same text.
# NOTE(review): which list is the reference and which the prediction is
# decided by calculate_macro_f1 — confirm against its definition.
entity_pairs = [
    # avocado
    ([(0, 7, 'B-TYPE')], [(0, 7, 'B-BRAND')]),
    # sweet potato
    ([(0, 5, 'B-TYPE')], [(0, 5, 'O')]),
    # both lists are identical for this sample
    (
        [(0, 8, 'B-TYPE'), (9, 13, 'B-VOLUME'), (14, 20, 'B-BRAND'), (21, 28, 'I-BRAND')],
        [(0, 8, 'B-TYPE'), (9, 13, 'B-VOLUME'), (14, 20, 'B-BRAND'), (21, 28, 'I-BRAND')],
    ),
]

macro_f1, f1_type, f1_brand, f1_volume, f1_percent = calculate_macro_f1(entity_pairs)
# One report line per score, each with four decimals.
print("\n".join(
    f"{title}: {score:.4f}"
    for title, score in (
        ("Macro-averaged F1-score", macro_f1),
        ("F1 TYPE", f1_type),
        ("F1 BRAND", f1_brand),
        ("F1 VOLUME", f1_volume),
        ("F1 PERCENT", f1_percent),
    )
))

Macro-averaged F1-score: 0.7222
F1 TYPE: 0.5000
F1 BRAND: 0.6667
F1 VOLUME: 1.0000
F1 PERCENT: 0.0000


In [13]:
# Load the spaCy model stored under MODELS/cleared_data/ru_core_news_lg_screening.
loaded_models = compare_model_dataset(
    model_paths=[f"{BASE_DIR}/MODELS/cleared_data/ru_core_news_lg_screening"],
    model_types=["spacy"],
    dataset_names=["cleared_data"],
)

Loaded spacy model for cleared_data from /content/drive/MyDrive/Colab Notebooks/entities-extraction-x5/ML_PART/MODELS/cleared_data/ru_core_news_lg_screening


In [15]:
# Manual invocation of the helpers for every loaded model
for model, tokenizer, model_type, dataset_name in loaded_models:
    # Load the per-epoch metrics.
    # NOTE(review): this path is fixed to cleared_data/ru_core_news_lg and does
    # not depend on the loop variables — confirm it matches the loaded model.
    metrics_path = f"{BASE_DIR}/OUTPUT/cleared_data/ru_core_news_lg/screening_metrics.csv"
    try:
        metrics_df = pd.read_csv(metrics_path)
    except FileNotFoundError:
        # Without metrics the rest of the iteration (plot, evaluation,
        # submission) is skipped for this model.
        print(f"Metrics not found at {metrics_path}")
        continue
    print(f"\n=== Metrics for {model_type}_{dataset_name} ===")
    print(metrics_df[['epoch', 'f1_macro', 'loss']].tail())

    # Plot the metrics; metrics_df is always defined at this point.
    plot_path = f"{OUTPUT_DIR}/metrics_per_epoch_{model_type}_{dataset_name}.png"
    plot_metrics_per_epoch(
        metrics_dfs=[metrics_df],
        model_names=[model_type],
        dataset_names=[dataset_name],
        output_path=plot_path
    )
    print(f"Plot saved to {plot_path}")

    # Evaluation (only when test.csv is available)
    test_path = f"{BASE_DIR}/data/cleared_data/test.csv"
    test_data = load_test_data(test_path)
    if test_data:
        # Reset per iteration so an unknown model_type can neither raise a
        # NameError nor reuse the metrics of the previous model.
        metrics = None
        if model_type == 'spacy':
            metrics = evaluate_spacy(model, test_data)
        elif model_type == 'bert':
            metrics = evaluate_bert(model, tokenizer, test_data)
        elif model_type == 'frida':
            metrics = evaluate_frida(model, tokenizer, test_data)
        if metrics:
            print(f"\n=== Evaluation for {model_type}_{dataset_name} ===")
            print(f"F1_macro: {metrics['f1_macro']:.4f}, "
                  f"F1_TYPE: {metrics['f1_TYPE']:.4f}, F1_BRAND: {metrics['f1_BRAND']:.4f}, "
                  f"F1_VOLUME: {metrics['f1_VOLUME']:.4f}, F1_PERCENT: {metrics['f1_PERCENT']:.4f}")

    # Submission (only when input.csv is available)
    input_path = f"{BASE_DIR}/data/cleared_data/input.csv"
    submission_output = f"{OUTPUT_DIR}/submission_{model_type}_{dataset_name}.csv"
    if os.path.exists(input_path):
        if model_type == 'spacy':
            # NOTE(review): a loaded model object is passed here; verify that the
            # process_submission_spacy in scope accepts it rather than a path.
            process_submission_spacy(model, input_path, submission_output)
        elif model_type == 'bert':
            process_submission_bert(model, tokenizer, input_path, submission_output)
        elif model_type == 'frida':
            process_submission_frida(model, tokenizer, input_path, submission_output)


=== Metrics for spacy_cleared_data ===
    epoch  f1_macro       loss
10     11  0.806042  2020.3744
11     12  0.901783  1803.6511
12     13  0.828672  1715.7328
13     14  0.861513  1545.9503
14     15  0.816769  1453.2258
Plot saved to /content/drive/MyDrive/Colab Notebooks/entities-extraction-x5/ML_PART/OUTPUT/metrics_per_epoch_spacy_cleared_data.png
Test data not found at /content/drive/MyDrive/Colab Notebooks/entities-extraction-x5/ML_PART/data/cleared_data/test.csv
