In [None]:
import os
import json
from pathlib import Path
import numpy as np

import yaml
import pandas as pd
from tqdm import tqdm
from sklearn.metrics import accuracy_score, cohen_kappa_score

from openai import OpenAI

client = OpenAI(
    base_url="https://endpoint/v1",
)

In [None]:
# Папка с .txt транскриптами
TRANSCRIPTS_DIR = Path("/path/to/transcripts_dir")

# YAML с критериями/промптами
PROMPTS_YAML_PATH = Path("/path/to/prompts.yaml")

# Куда сохранить результат
OUTPUT_CSV_PATH = Path("/path/to/output.csv")

# Модель
LLM_MODEL = "gpt-oss_20b"

# Загружаем критерии и транскрипты

In [None]:
def load_criteria_from_yaml(path: Path):
    with open(path, "r", encoding="utf-8") as f:
        data = yaml.safe_load(f)

    raw_criteria = data["criteria"]

    criteria = []
    for c in raw_criteria:
        criteria.append({
            "id": c.get("id"),
            "name": c.get("name"),
            "description": c.get("description"),
            "prompt": c.get("prompt")
        })
    return criteria


criteria = load_criteria_from_yaml(PROMPTS_YAML_PATH)

In [None]:
def load_transcripts_from_dir(dir_path: Path):
    transcripts = []

    for path in sorted(dir_path.glob("*.txt")):
        with open(path, "r", encoding="utf-8") as f:
            text = f.read()
        transcripts.append({
            "file_name": path.name,
            "path": str(path),
            "text": text
        })

    return transcripts

transcripts = load_transcripts_from_dir(TRANSCRIPTS_DIR)

# Подготовка к работе с LLM

In [None]:
SYSTEM_PROMPT = """
Ты — эксперт по контролю качества работы операторов контакт-центра.
Твоя задача — оценивать, насколько оператор выполняет заданный критерий.

Оцени по шкале:
0 — критерий не выполнен (плохо)
1 — критерий выполнен частично
2 — критерий выполнен полностью

Отвечай строго в формате JSON с полями:
{
  "score": 0/1/2,
  "explanation": "краткое объяснение на русском"
}
Без лишнего текста до или после JSON.
""".strip()


def build_user_prompt(conversation_text: str, criterion: dict) -> str:
    """
    Строим текст запроса к модели:
    - описание критерия
    - сам промпт критерия
    - сам диалог
    """
    template = f"""
Критерий: {criterion.get("name")}

Описание критерия:
{criterion.get("description")}

Инструкция для оценки:
{criterion.get("prompt")}

Транскрипт разговора (Оператор/Клиент):
\"\"\"text
{conversation_text}
\"\"\"

Оцени, насколько оператор удовлетворяет этому критерию.
Помни, нужно вернуть только JSON.
"""
    return template.strip()


def evaluate_conversation_with_criterion(conversation_text: str, criterion: dict) -> dict:
    user_prompt = build_user_prompt(conversation_text, criterion)

    response = client.chat.completions.create(
        model=LLM_MODEL,
        messages=[
            {"role": "system", "content": SYSTEM_PROMPT},
            {"role": "user", "content": user_prompt}
        ],
        temperature=0.0,
    )

    raw_content = response.choices[0].message.content

    try:
        result = json.loads(raw_content)
    except json.JSONDecodeError:
        raise ValueError(f"Невозможно распарсить JSON из ответа модели:\n{raw_content}")

    score = int(result.get("score"))
    if score not in (0, 1, 2):
        raise ValueError(f"Неверное значение score: {score}, ответ модели: {result}")

    explanation = result.get("explanation", "").strip()
    return {"score": score, "explanation": explanation}

# Прогон бейзлайна

In [None]:
results = []

for t in tqdm(transcripts, desc="Transcripts"):
    conv_text = t["text"]
    file_name = t["file_name"]

    for criterion in criteria:
        criterion_id = criterion.get("id")
        criterion_name = criterion.get("name")

        try:
            eval_result = evaluate_conversation_with_criterion(conv_text, criterion)
        except Exception as e:
            eval_result = {
                "score": None,
                "explanation": f"Ошибка оценки: {e}"
            }

        results.append({
            "file_name": file_name,
            "criterion_id": criterion_id,
            "criterion_name": criterion_name,
            "score": eval_result["score"],
            "explanation": eval_result["explanation"],
        })

df = pd.DataFrame(results)

# Загружаем человеческую разметку

In [None]:
HUMAN_LABELS_PATH = "/path/to/human_labels.csv"

human_df = pd.read_csv(HUMAN_LABELS_PATH)

In [None]:
merged = (
    df.merge(
        human_df,
        on=["file_name", "criterion_id"],
        how="inner",
        suffixes=("_llm", "_human"),
    )
)

# Метрики

In [None]:
def compute_metrics(y_true, y_pred, weighted=False):
    """
    weighted=False  -> обычные метрики
    weighted=True   -> квадратически-взвешенная Kappa
    """
    acc = accuracy_score(y_true, y_pred)

    if weighted:
        kappa = cohen_kappa_score(y_true, y_pred, weights="quadratic")
    else:
        kappa = cohen_kappa_score(y_true, y_pred)

    return acc, kappa

In [None]:
criterion_metrics = []

for crit_id, g in merged.groupby("criterion_id"):
    y_true = g["human_label"]
    y_pred = g["score"]

    acc, kappa = compute_metrics(y_true, y_pred, weighted=True)

    criterion_metrics.append({
        "criterion_id": crit_id,
        "criterion_name": g["criterion_name"].iloc[0],
        "n_samples": len(g),
        "accuracy": acc,
        "cohen_kappa_weighted": kappa,
    })

criterion_metrics_df = pd.DataFrame(criterion_metrics).sort_values("criterion_id")

In [17]:
total_n = merged.shape[0]

criterion_metrics_df["weight"] = (
    criterion_metrics_df["n_samples"] / total_n
)

weighted_accuracy = round(np.sum(
    criterion_metrics_df["accuracy"] * criterion_metrics_df["weight"]
), 3)

weighted_kappa = round(np.sum(
    criterion_metrics_df["cohen_kappa_weighted"] * criterion_metrics_df["weight"]
), 3)

weighted_accuracy, weighted_kappa

(0.688, 0.512)

# Прогон fine-tuned модели

In [None]:
# YAML с новыми промптами (few-shot)
FINETUNE_PROMPTS_YAML_PATH = Path("/path/to/finetune_prompts.yaml")

OUTPUT_CSV_FINETUNE_PATH = Path("/path/to/output_finetune.csv")

FINETUNE_MODEL_NAME = "gpt-oss_20b-finetuned"

finetune_client = OpenAI(
    base_url="https://endpoint/v1",
)

In [None]:
finetune_criteria = load_criteria_from_yaml(FINETUNE_PROMPTS_YAML_PATH)

In [None]:
def build_finetune_messages(conversation_text: str, criterion: dict):
    messages = [
        {"role": "system", "content": SYSTEM_PROMPT}
    ]

    for ex in criterion.get("few_shots", []):
        ex_conv = ex.get("conversation", "")
        ex_answer = ex.get("answer_json", "").strip()

        ex_user = f"""
Критерий: {criterion.get("name")}

Описание критерия:
{criterion.get("description")}

Инструкция:
{criterion.get("prompt")}

Транскрипт разговора:
\"\"\"text
{ex_conv}
\"\"\"
Оцени, насколько оператор выполняет критерий. Ответь JSON.
""".strip()

        messages.append({"role": "user", "content": ex_user})
        messages.append({"role": "assistant", "content": ex_answer})

    final_user = f"""
Критерий: {criterion.get("name")}

Описание критерия:
{criterion.get("description")}

Инструкция:
{criterion.get("prompt")}

Транскрипт разговора:
\"\"\"text
{conversation_text}
\"\"\"
Оцени, насколько оператор выполняет критерий. Ответь JSON.
""".strip()

    messages.append({"role": "user", "content": final_user})
    return messages


def evaluate_conversation_with_criterion_finetune(conversation_text: str, criterion: dict) -> dict:
    """
    Вызывает fine-tuned модель и возвращает:
    {
        "score": 0/1/2 или None,
        "explanation": str
    }
    """
    messages = build_finetune_messages(conversation_text, criterion)

    response = finetune_client.chat.completions.create(
        model=FINETUNE_MODEL_NAME,
        messages=messages,
        temperature=0.0,
    )

    raw_content = response.choices[0].message.content

    try:
        result = json.loads(raw_content)
    except json.JSONDecodeError:
        raise ValueError(f"[finetune] Невозможно распарсить JSON:\n{raw_content}")

    score = int(result.get("score"))
    if score not in (0, 1, 2):
        raise ValueError(f"[finetune] Неверный score: {score}, ответ: {result}")

    explanation = result.get("explanation", "").strip()
    return {"score": score, "explanation": explanation}

In [None]:
finetune_results = []

for t in tqdm(transcripts, desc="Transcripts (finetune)"):
    conv_text = t["text"]
    file_name = t["file_name"]

    for criterion in finetune_criteria:
        criterion_id = criterion.get("id")
        criterion_name = criterion.get("name")

        try:
            eval_result = evaluate_conversation_with_criterion_finetune(conv_text, criterion)
        except Exception as e:
            eval_result = {
                "score": None,
                "explanation": f"Ошибка оценки (finetune): {e}"
            }

        finetune_results.append({
            "file_name": file_name,
            "criterion_id": criterion_id,
            "criterion_name": criterion_name,
            "score_finetune": eval_result["score"],
            "explanation_finetune": eval_result["explanation"],
        })

In [None]:
df_finetune = pd.DataFrame(finetune_results)

In [None]:
merged_finetune = (
    df_finetune.merge(
        human_df,
        on=["file_name", "criterion_id"],
        how="inner",
    )
)

In [None]:
criterion_metrics_finetune = []

for crit_id, g in merged_finetune.groupby("criterion_id"):
    y_true = g["human_label"]
    y_pred = g["score_finetune"]

    acc, kappa = compute_metrics(y_true, y_pred, weighted=True)

    criterion_metrics_finetune.append({
        "criterion_id": crit_id,
        "criterion_name": g["criterion_name"].iloc[0],
        "n_samples": len(g),
        "accuracy_finetune": acc,
        "cohen_kappa_weighted_finetune": kappa,
    })

criterion_metrics_finetune_df = (
    pd.DataFrame(criterion_metrics_finetune)
    .sort_values("criterion_id")
)

In [19]:
total_n_finetune = merged_finetune.shape[0]
criterion_metrics_finetune_df["weight"] = (
    criterion_metrics_finetune_df["n_samples"] / total_n_finetune
)

weighted_accuracy_finetune = np.sum(
    criterion_metrics_finetune_df["accuracy_finetune"] * criterion_metrics_finetune_df["weight"]
)

weighted_kappa_finetune = np.sum(
    criterion_metrics_finetune_df["cohen_kappa_weighted_finetune"] * criterion_metrics_finetune_df["weight"]
)

weighted_accuracy_finetune, weighted_kappa_finetune

(0.873, 0.809)