# SGR Quality: Dialog-Level Executive Notebook

Цель: дать **максимально понятную** аналитику качества продаж на уровне диалогов и отдельных кейсов.

Контракт (консистентен с `sgr_core.py` и `stability_case_review.md`):
- единица оценки: `conversation_id`;
- один bundled `evaluator` + один bundled `judge` на диалог;
- `greeting` считается только в первых 3 сообщениях продавца;
- `upsell` и `empathy` считаются по всему диалогу;
- evidence-якорение: `evidence_message_id` + `evidence_message_order`.


In [1]:
from __future__ import annotations

import json
import sqlite3
from pathlib import Path

import pandas as pd
from IPython.display import display

pd.set_option("display.max_colwidth", None)
pd.set_option("display.max_columns", 200)
pd.set_option("display.width", 240)


def resolve_db_path() -> Path:
    """Locate ``dialogs.db`` by walking up from the current working directory.

    The working directory is checked first, then every ancestor from nearest
    to farthest, so the closest database wins.

    Returns:
        Path of the first ``dialogs.db`` regular file found.

    Raises:
        FileNotFoundError: If no such file exists in the working directory or
            in any of its ancestors.
    """
    cwd = Path.cwd().resolve()
    # ``cwd.parents`` already starts with ``cwd.parent``, so listing the parent
    # separately would only probe the same path twice.
    for directory in (cwd, *cwd.parents):
        candidate = directory / "dialogs.db"
        # ``is_file`` rather than ``exists``: a directory that happens to be
        # named ``dialogs.db`` is not a database sqlite3 can open.
        if candidate.is_file():
            return candidate
    raise FileNotFoundError("dialogs.db not found. Run: make init-fresh && make scan")


def qdf(sql: str, params: tuple[object, ...] = ()) -> pd.DataFrame:
    """Run a SQL query on the notebook connection and return a DataFrame.

    Args:
        sql: Query text; values are supplied separately through ``params``.
        params: Values bound to the query placeholders, in order.

    Returns:
        The query result as a ``pandas.DataFrame``.
    """
    # ``conn`` is the module-level sqlite3 connection opened by the setup cell.
    frame = pd.read_sql_query(sql, conn, params=params)
    return frame


# Open the database located by resolve_db_path().
DB_PATH = resolve_db_path()
conn = sqlite3.connect(DB_PATH)
conn.row_factory = sqlite3.Row  # lets the row below be indexed by column name

# Select the most recently started successful scan run.
latest_run_cursor = conn.execute(
    """
    SELECT run_id, summary_json
    FROM scan_runs
    WHERE status='success'
    ORDER BY started_at_utc DESC
    LIMIT 1
    """
)
run_row = latest_run_cursor.fetchone()
if run_row is None:
    raise ValueError("No successful run found. Run: make scan")

RUN_ID = str(run_row["run_id"])
# A NULL or empty summary_json falls back to "{}", i.e. an empty summary dict.
summary_text = str(run_row["summary_json"] or "{}")
SUMMARY = json.loads(summary_text)

for banner_line in (
    f"Используется последний успешный run_id: {RUN_ID}",
    f"База: {DB_PATH}",
    f"metrics_version: {SUMMARY.get('metrics_version')}",
):
    print(banner_line)


Используется последний успешный run_id: scan_fdd8a854d55b
База: /Users/ablackman/go/src/github.com/tetraminz/sales_protocol/dialogs.db
metrics_version: v5_dialog_level_bundle


## 1) KPI latest run

Быстрая операционная сводка: объем, покрытие judge, стабильность контрактной схемы и LLM-нагрузка.


In [2]:
# Number of LLM calls per phase for the selected run.
llm_calls = qdf(
    """
    SELECT phase, COUNT(*) AS calls
    FROM llm_calls
    WHERE run_id=?
    GROUP BY phase
    ORDER BY phase
    """,
    (RUN_ID,),
)

# phase name -> call count; stays empty when the run has no llm_calls rows.
phase_calls = {}
if not llm_calls.empty:
    phase_calls = {
        str(phase): int(calls)
        for phase, calls in zip(llm_calls["phase"], llm_calls["calls"])
    }

evaluated_conversations = int(SUMMARY.get("evaluated_conversations", 0))

# (label, value) pairs in display order.
kpi_rows = [
    ("Run ID", RUN_ID),
    ("Metrics version", SUMMARY.get("metrics_version", "n/a")),
    ("Selected conversations", int(SUMMARY.get("selected_conversations", 0))),
    ("Evaluated conversations", evaluated_conversations),
    ("Skipped without seller", int(SUMMARY.get("skipped_conversations_without_seller", 0))),
    ("Rule checks inserted", int(SUMMARY.get("inserted", 0))),
    ("Judge coverage", f"{float(SUMMARY.get('judge_coverage', 0.0)):.1%}"),
    # Expected load is two calls per evaluated conversation
    # (one evaluator call plus one judge call).
    ("LLM calls (expected)", evaluated_conversations * 2),
    ("LLM calls evaluator", int(phase_calls.get("evaluator", 0))),
    ("LLM calls judge", int(phase_calls.get("judge", 0))),
]
kpi = pd.DataFrame(kpi_rows, columns=["Показатель", "Значение"])

display(kpi)


Unnamed: 0,Показатель,Значение
0,Run ID,scan_fdd8a854d55b
1,Metrics version,v5_dialog_level_bundle
2,Selected conversations,5
3,Evaluated conversations,5
4,Skipped without seller,0
5,Rule checks inserted,15
6,Judge coverage,100.0%
7,LLM calls (expected),10
8,LLM calls evaluator,5
9,LLM calls judge,5


## 2) Rule Stats (по диалогам)

`scan_metrics` отражает агрегаты качества по каждому правилу на dialog-level; вторая таблица показывает качество по каждому диалогу на основе `scan_results`.


In [3]:
# Per-rule aggregates read from scan_metrics for the selected run;
# the three rate columns are rounded to 4 decimals.
RULE_METRICS_SQL = """
    SELECT
      rule_key,
      eval_total,
      eval_true,
      ROUND(evaluator_hit_rate, 4) AS evaluator_hit_rate,
      ROUND(judge_correctness, 4) AS judge_correctness,
      ROUND(judge_coverage, 4) AS judge_coverage
    FROM scan_metrics
    WHERE run_id=?
    ORDER BY rule_key
    """

# Per-conversation quality: share of rule rows with judge_label=1, plus the
# count of rows with judge_label=0. Lowest-quality conversations come first.
# NOTE(review): a row with NULL judge_label (if such rows can exist) lowers
# conversation_quality but is not counted in failed_rules — confirm intended.
CONVERSATION_QUALITY_SQL = """
    SELECT
      conversation_id,
      ROUND(AVG(CASE WHEN judge_label=1 THEN 1.0 ELSE 0.0 END), 4) AS conversation_quality,
      SUM(CASE WHEN judge_label=0 THEN 1 ELSE 0 END) AS failed_rules,
      COUNT(*) AS total_rules
    FROM scan_results
    WHERE run_id=?
    GROUP BY conversation_id
    ORDER BY conversation_quality ASC, conversation_id
    """

rule_metrics = qdf(RULE_METRICS_SQL, (RUN_ID,))
display(rule_metrics)

conv_quality = qdf(CONVERSATION_QUALITY_SQL, (RUN_ID,))
display(conv_quality)


Unnamed: 0,rule_key,eval_total,eval_true,evaluator_hit_rate,judge_correctness,judge_coverage
0,empathy,5,5,1.0,1.0,1.0
1,greeting,5,5,1.0,1.0,1.0
2,upsell,5,5,1.0,1.0,1.0


Unnamed: 0,conversation_id,conversation_quality,failed_rules,total_rules
0,modamart__0_transcript,1.0,0,3
1,modamart__1_transcript,1.0,0,3
2,modamart__2_transcript,1.0,0,3
3,modamart__3_transcript,1.0,0,3
4,modamart__4_transcript,1.0,0,3


## 3) Контроль greeting-окна

Проверяем бизнес-инвариант: если `greeting` засчитан (`eval_hit=1`), его anchor должен быть в первых 3 seller-сообщениях. В запросе это проверяется условием `evidence_message_order <= 3`.


In [4]:
# Distribution of evaluator reason codes for the greeting rule.
GREETING_SUMMARY_SQL = """
    SELECT
      eval_reason_code,
      COUNT(*) AS dialogs
    FROM scan_results
    WHERE run_id=? AND rule_key='greeting'
    GROUP BY eval_reason_code
    ORDER BY dialogs DESC, eval_reason_code
    """

# One row per conversation with a window verdict:
#   'ok'             - greeting hit anchored at evidence_message_order <= 3
#   'violation'      - greeting hit with any other (or NULL) anchor order
#   'not_applicable' - greeting was not hit
# NOTE(review): the glossary describes evidence_message_order as the anchor's
# order within the dialog. If that counts customer messages too, `<= 3` is not
# the same as "first three seller messages" — confirm against sgr_core.py.
GREETING_ROWS_SQL = """
    SELECT
      conversation_id,
      eval_hit,
      eval_reason_code,
      evidence_message_id,
      evidence_message_order,
      CASE
        WHEN eval_hit=1 AND evidence_message_order <= 3 THEN 'ok'
        WHEN eval_hit=1 THEN 'violation'
        ELSE 'not_applicable'
      END AS greeting_window_check
    FROM scan_results
    WHERE run_id=? AND rule_key='greeting'
    ORDER BY conversation_id
    """

greeting_rows = qdf(GREETING_ROWS_SQL, (RUN_ID,))
greeting_summary = qdf(GREETING_SUMMARY_SQL, (RUN_ID,))

display(greeting_summary)
display(greeting_rows)

if greeting_rows.empty:
    violations = 0
else:
    violations = int(greeting_rows["greeting_window_check"].eq("violation").sum())
print("Greeting-window violations:", violations)


Unnamed: 0,eval_reason_code,dialogs
0,greeting_present,5


Unnamed: 0,conversation_id,eval_hit,eval_reason_code,evidence_message_id,evidence_message_order,greeting_window_check
0,modamart__0_transcript,1,greeting_present,1,1,ok
1,modamart__1_transcript,1,greeting_present,20,1,ok
2,modamart__2_transcript,1,greeting_present,40,1,ok
3,modamart__3_transcript,1,greeting_present,60,1,ok
4,modamart__4_transcript,1,greeting_present,77,1,ok


Greeting-window violations: 0


## 4) Кейсы: что получилось / что не получилось

Показываем реальные кейсы с объяснениями evaluator/judge и anchor-цитатой.

- **Получилось**: `eval_hit=1` (правило реально достигнуто в диалоге).
- **Не получилось**: `eval_hit=0` (правило не достигнуто в диалоге).
- **QA-расхождения**: `judge_label=0` (несогласие judge с evaluator).


In [5]:
# One row per (conversation, rule) result of the selected run, with evaluator
# and judge explanations. The LEFT JOIN keeps results whose anchor message is
# not found in `messages`; their evidence_message_text becomes ''.
cases = qdf(
    """
    SELECT
      sr.conversation_id,
      sr.rule_key,
      sr.eval_hit,
      sr.judge_expected_hit,
      sr.judge_label,
      ROUND(sr.eval_confidence, 3) AS eval_conf,
      ROUND(sr.judge_confidence, 3) AS judge_conf,
      sr.eval_reason_code,
      sr.eval_reason,
      sr.judge_rationale,
      sr.evidence_quote,
      sr.evidence_message_id,
      sr.evidence_message_order,
      COALESCE(anchor.text, '') AS evidence_message_text
    FROM scan_results sr
    LEFT JOIN messages anchor ON anchor.message_id = sr.evidence_message_id
    WHERE sr.run_id=?
    ORDER BY sr.rule_key, sr.conversation_id
    """,
    (RUN_ID,),
)

# Category masks: achieved = eval_hit=1, missed = eval_hit=0,
# QA disagreement = judge_label=0.
is_achieved = cases["eval_hit"] == 1
is_missed = cases["eval_hit"] == 0
is_qa_disagreement = cases["judge_label"] == 0

# Each table is capped at 8 rows; the printed counters cover the whole run.
# The achieved table uses the same eval_hit=1 mask as its counter, so the
# header never counts rows the table filters out. Hits disputed by the judge
# are additionally listed in the QA disagreements table.
achieved_cases = cases[is_achieved].head(8).copy()
missed_cases = cases[is_missed].head(8).copy()
qa_disagreements = cases[is_qa_disagreement].head(8).copy()

print("ACHIEVED (eval_hit=1):", int(is_achieved.sum()))
display(achieved_cases)
print("MISSED (eval_hit=0):", int(is_missed.sum()))
display(missed_cases)
print("QA disagreements (judge_label=0):", int(is_qa_disagreement.sum()))
display(qa_disagreements)

# Recommended follow-up action per rule, used in the note for a missed rule.
rule_advice = {
    "greeting": "Добавить приветствие в первых трех seller-сообщениях.",
    "upsell": "Добавить конкретный следующий платный шаг по контексту диалога.",
    "empathy": "Явно признать ситуацию клиента перед предложением решения.",
}


def make_case_note(row: pd.Series) -> str:
    """Build a one-line note describing a single case row.

    Args:
        row: A case row providing ``rule_key``, ``eval_hit``,
            ``eval_reason_code``, ``judge_label`` and ``judge_rationale``;
            ``evidence_message_order`` and ``evidence_quote`` are read only
            when ``eval_hit`` is 1.

    Returns:
        ``"OK: ..."`` with the anchor order and the first 80 characters of the
        evidence quote when ``eval_hit`` is 1, otherwise ``"MISSED: ..."`` with
        the advice for the rule. When ``judge_label`` is 0, a
        ``" | QA mismatch: ..."`` suffix with the first 100 characters of the
        judge rationale is appended.
    """
    rule_key = row["rule_key"]
    reason_code = row["eval_reason_code"]

    if int(row["eval_hit"]) == 1:
        note = (
            f"OK: rule={rule_key}, reason={reason_code}, "
            f"anchor_order={row['evidence_message_order']}, quote='{str(row['evidence_quote'])[:80]}'"
        )
    else:
        note = (
            f"MISSED: rule={rule_key}, reason={reason_code}. "
            f"Action: {rule_advice.get(str(rule_key), 'Уточнить правило.')}"
        )

    judge_label = row["judge_label"]
    # A missing judge_label (None/NaN — presumably a case the judge did not
    # cover) would make int() raise; a case without a verdict is not a mismatch.
    if pd.notna(judge_label) and int(judge_label) == 0:
        note += f" | QA mismatch: {str(row['judge_rationale'])[:100]}"
    return note

# Annotate the first 12 cases with a one-line note each.
notes = cases.head(12).copy()
# Build the column row by row rather than with DataFrame.apply(axis=1): on an
# empty frame apply may hand back a DataFrame instead of a Series of notes,
# which makes the column assignment fail for a run that produced no results.
notes["case_note"] = [make_case_note(row) for _, row in notes.iterrows()]
display(notes[["conversation_id", "rule_key", "eval_hit", "judge_label", "case_note"]])


ACHIEVED (eval_hit=1): 15


Unnamed: 0,conversation_id,rule_key,eval_hit,judge_expected_hit,judge_label,eval_conf,judge_conf,eval_reason_code,eval_reason,judge_rationale,evidence_quote,evidence_message_id,evidence_message_order,evidence_message_text
0,modamart__0_transcript,empathy,1,1,1,1.0,1.0,empathy_acknowledged,"Продавец признаёт и понимает беспокойства клиента несколько раз, включая фразу ""I completely understand your concerns"".",Продавец выражает понимание и сочувствие к сомнениям клиента несколько раз.,I completely understand your concerns.,5,5,I completely understand your concerns. Our winter jackets are made from high-quality materials designed to keep you warm even in the harshest conditions. May I ask what specifically you’re looking for in a jacket?
1,modamart__1_transcript,empathy,1,1,1,1.0,1.0,empathy_acknowledged,"Продавец признает и понимает ситуацию клиента, подтверждая его сомнения и предоставляя совет.",Продавец проявляет понимание и учитывает потребности клиента.,** I understand. Our insulated jackets are designed to keep you warm even in very cold temperatures. We use high-quality down and synthetic materials. Do you have any preferences between down or synthetic insulation?,24,5,** I understand. Our insulated jackets are designed to keep you warm even in very cold temperatures. We use high-quality down and synthetic materials. Do you have any preferences between down or synthetic insulation?
2,modamart__2_transcript,empathy,1,1,1,1.0,1.0,empathy_acknowledged,"Продавец признает и понимает беспокойства клиента, выражая понимание в сообщениях, например: ""I completely understand!"" и ""I hear you.""","Продавец выражает понимание и сочувствие беспокойствам клиента, что соответствует правилу.",I completely understand! I'll keep it brief. I wanted to touch base about your recent online browsing and see if you had any questions or needed assistance with anything.,42,3,I completely understand! I'll keep it brief. I wanted to touch base about your recent online browsing and see if you had any questions or needed assistance with anything.
3,modamart__3_transcript,empathy,1,1,1,1.0,1.0,empathy_acknowledged,"Продавец выражает понимание и сочувствие по поводу опасений клиента с высказыванием ""I understand your concerns."" и ""I'm really sorry to hear that."", что явно показывает эмпатию.",Продавец выражает понимание и сочувствие клиенту.,I understand your concerns. Online shopping can be tricky sometimes. Could you tell me more about what issues you've faced before?,62,3,I understand your concerns. Online shopping can be tricky sometimes. Could you tell me more about what issues you've faced before?
4,modamart__4_transcript,empathy,1,1,1,1.0,1.0,empathy_acknowledged,"Продавец выражает сочувствие к неприятностям клиента с предыдущей покупкой, демонстрируя понимание его ситуации.","Продавец выражает сочувствие и понимание проблем клиента с прошлой покупкой, что соответствует правилу empathy.",I'm sorry to hear that. Do you remember what brand or model it was? Maybe I can understand better what went wrong last time.,83,7,I'm sorry to hear that. Do you remember what brand or model it was? Maybe I can understand better what went wrong last time.
5,modamart__0_transcript,greeting,1,1,1,1.0,1.0,greeting_present,"Присутствует приветствие в первых трех сообщениях продавца, что задаёт дружелюбный и профессиональный тон.",Приветствие присутствует в первых трёх сообщениях продавца.,"Hi there! Thank you for taking the time to speak with me today. My name is Jamie, and I’m a sales representative from ModaMart. How are you today?",1,1,"Hi there! Thank you for taking the time to speak with me today. My name is Jamie, and I’m a sales representative from ModaMart. How are you today?"
6,modamart__1_transcript,greeting,1,1,1,1.0,1.0,greeting_present,"Приветствие присутствует в первых трех сообщениях продавца, задает дружелюбный тон.",Приветствие есть в первых трех сообщениях продавца.,"** Good morning, this is Sarah from ModaMart. How can I assist you today?",20,1,"** Good morning, this is Sarah from ModaMart. How can I assist you today?"
7,modamart__2_transcript,greeting,1,1,1,1.0,1.0,greeting_present,"Продавец приветствует клиента в первом сообщении, задавая дружественный тон диалогу.","В первых трёх сообщениях продавец приветствует клиента, что соответствует правилу.","Hi there, this is Jake from ModaMart. How are you today?",40,1,"Hi there, this is Jake from ModaMart. How are you today?"


MISSED (eval_hit=0): 0


Unnamed: 0,conversation_id,rule_key,eval_hit,judge_expected_hit,judge_label,eval_conf,judge_conf,eval_reason_code,eval_reason,judge_rationale,evidence_quote,evidence_message_id,evidence_message_order,evidence_message_text


QA disagreements (judge_label=0): 0


Unnamed: 0,conversation_id,rule_key,eval_hit,judge_expected_hit,judge_label,eval_conf,judge_conf,eval_reason_code,eval_reason,judge_rationale,evidence_quote,evidence_message_id,evidence_message_order,evidence_message_text


Unnamed: 0,conversation_id,rule_key,eval_hit,judge_label,case_note
0,modamart__0_transcript,empathy,1,1,"OK: rule=empathy, reason=empathy_acknowledged, anchor_order=5, quote='I completely understand your concerns.'"
1,modamart__1_transcript,empathy,1,1,"OK: rule=empathy, reason=empathy_acknowledged, anchor_order=5, quote='** I understand. Our insulated jackets are designed to keep you warm even in ver'"
2,modamart__2_transcript,empathy,1,1,"OK: rule=empathy, reason=empathy_acknowledged, anchor_order=3, quote='I completely understand! I'll keep it brief. I wanted to touch base about your r'"
3,modamart__3_transcript,empathy,1,1,"OK: rule=empathy, reason=empathy_acknowledged, anchor_order=3, quote='I understand your concerns. Online shopping can be tricky sometimes. Could you t'"
4,modamart__4_transcript,empathy,1,1,"OK: rule=empathy, reason=empathy_acknowledged, anchor_order=7, quote='I'm sorry to hear that. Do you remember what brand or model it was? Maybe I can '"
5,modamart__0_transcript,greeting,1,1,"OK: rule=greeting, reason=greeting_present, anchor_order=1, quote='Hi there! Thank you for taking the time to speak with me today. My name is Jamie'"
6,modamart__1_transcript,greeting,1,1,"OK: rule=greeting, reason=greeting_present, anchor_order=1, quote='** Good morning, this is Sarah from ModaMart. How can I assist you today?'"
7,modamart__2_transcript,greeting,1,1,"OK: rule=greeting, reason=greeting_present, anchor_order=1, quote='Hi there, this is Jake from ModaMart. How are you today?'"
8,modamart__3_transcript,greeting,1,1,"OK: rule=greeting, reason=greeting_present, anchor_order=1, quote='Good afternoon! Thank you for calling ModaMart. My name is Alex. How can I assis'"
9,modamart__4_transcript,greeting,1,1,"OK: rule=greeting, reason=greeting_present, anchor_order=1, quote='Good afternoon, thank you for calling ModaMart. My name is Sarah, how can I assi'"


## 5) Итог для бизнес-пайплайна

Короткое executive-резюме: что стабильно, где зона риска, что делать дальше.


In [6]:
if rule_metrics.empty:
    print("Нет данных rule_metrics для выбранного run")
else:
    # Weakest rule = lowest judge_correctness; ties are broken by
    # evaluator_hit_rate and then by rule_key.
    ranked_rules = rule_metrics.sort_values(
        by=["judge_correctness", "evaluator_hit_rate", "rule_key"]
    )
    weakest = ranked_rules.iloc[0]

    # Totals over all cases of the run; all zero when there are no cases.
    if cases.empty:
        achieved_total = 0
        missed_total = 0
        qa_mismatch_total = 0
    else:
        achieved_total = int(cases["eval_hit"].eq(1).sum())
        missed_total = int(cases["eval_hit"].eq(0).sum())
        qa_mismatch_total = int(cases["judge_label"].eq(0).sum())

    judge_coverage_text = f"{float(SUMMARY.get('judge_coverage', 0.0)):.1%}"
    weakest_text = f"{weakest['rule_key']} (judge_correctness={weakest['judge_correctness']:.4f}, evaluator_hit_rate={weakest['evaluator_hit_rate']:.4f})"

    print("1) Контрактная модель:", SUMMARY.get("metrics_version", "n/a"))
    print("2) Judge coverage:", judge_coverage_text)
    print("3) Достигнутые правила (eval_hit=1):", achieved_total)
    print("4) Недостигнутые правила (eval_hit=0):", missed_total)
    print("5) QA расхождения (judge_label=0):", qa_mismatch_total)
    print("6) Самое уязвимое правило:", weakest_text)


1) Контрактная модель: v5_dialog_level_bundle
2) Judge coverage: 100.0%
3) Достигнутые правила (eval_hit=1): 15
4) Недостигнутые правила (eval_hit=0): 0
5) QA расхождения (judge_label=0): 0
6) Самое уязвимое правило: empathy (judge_correctness=1.0000, evaluator_hit_rate=1.0000)


## 6) Глоссарий полей (core)

Ключевые поля для чтения отчета и обсуждения с бизнесом.


In [7]:
# (field name, description) pairs for the core report columns, in display order.
glossary_entries = [
    ("conversation_id", "Идентификатор диалога (единица оценки)."),
    ("rule_key", "Бизнес-правило: greeting / upsell / empathy."),
    ("eval_hit", "Решение evaluator о срабатывании правила."),
    ("eval_reason_code", "Стандартизированный код причины evaluator."),
    ("evidence_quote", "Дословная цитата evidence из anchor-сообщения продавца."),
    ("evidence_message_id", "ID anchor-сообщения продавца."),
    ("evidence_message_order", "Порядок anchor-сообщения в диалоге."),
    ("judge_label", "Корректность evaluator по мнению независимого judge."),
    ("judge_rationale", "Краткая аргументация judge."),
]
glossary = pd.DataFrame(glossary_entries, columns=["Поле", "Описание"])

display(glossary)


Unnamed: 0,Поле,Описание
0,conversation_id,Идентификатор диалога (единица оценки).
1,rule_key,Бизнес-правило: greeting / upsell / empathy.
2,eval_hit,Решение evaluator о срабатывании правила.
3,eval_reason_code,Стандартизированный код причины evaluator.
4,evidence_quote,Дословная цитата evidence из anchor-сообщения продавца.
5,evidence_message_id,ID anchor-сообщения продавца.
6,evidence_message_order,Порядок anchor-сообщения в диалоге.
7,judge_label,Корректность evaluator по мнению независимого judge.
8,judge_rationale,Краткая аргументация judge.
