# SGR Quality: Core Debug

Цель ноутбука: быстро проверить качество последнего run (самого свежего по `started_at_utc`) и прозрачно разобрать кейсы, где `judge_label=0`.


In [None]:
from __future__ import annotations

import json
import sqlite3
from pathlib import Path

import pandas as pd

# Path to the SQLite database this notebook reads (relative to the working directory).
DB_PATH = Path("dialogs.db")

if DB_PATH.exists():
    # Single connection shared by all query cells via qdf().
    conn = sqlite3.connect(DB_PATH)
    conn.row_factory = sqlite3.Row
else:
    # sqlite3.connect() creates an empty database file for a missing path,
    # so stop with an actionable message instead of connecting.
    raise FileNotFoundError("dialogs.db not found. Run: make init-fresh && make scan")

def qdf(sql: str, params: tuple[object, ...] = ()) -> pd.DataFrame:
    """Run ``sql`` on the notebook's shared connection and return the result as a DataFrame.

    Args:
        sql: Query text; the notebook's queries use ``?`` placeholders.
        params: Values passed through to ``pandas.read_sql_query`` for the placeholders.

    Returns:
        The query result as a ``pandas.DataFrame``.
    """
    frame = pd.read_sql_query(sql, conn, params=params)
    return frame


## 1) Run Snapshot


In [None]:
# Load the most recently started run; RUN_ID scopes every later query in the notebook.
run_df = qdf(
    """
    SELECT run_id, status, model, conversation_from, conversation_to,
           selected_conversations, messages_count, started_at_utc, finished_at_utc, summary_json
    FROM scan_runs
    ORDER BY started_at_utc DESC
    LIMIT 1
    """
)
if run_df.empty:
    raise ValueError("No scan runs found. Run: make scan")

RUN_ID = str(run_df.loc[0, "run_id"])

# Inspect the raw cell before parsing: a NULL summary_json arrives as None, and
# str(None) is the truthy text "None", which json.loads rejects. Treat NULL or
# blank values as an empty summary; malformed JSON still raises.
_raw_summary = run_df.loc[0, "summary_json"]
if isinstance(_raw_summary, (str, bytes)) and _raw_summary.strip():
    _parsed_summary = json.loads(_raw_summary)
else:
    _parsed_summary = {}
# Valid but non-object JSON (e.g. "null") has no .get(); fall back to an empty summary.
SUMMARY = _parsed_summary if isinstance(_parsed_summary, dict) else {}

# Run row without the raw JSON blob, followed by the counters picked out of the summary.
display(run_df.drop(columns=["summary_json"]))
display(pd.DataFrame([{
    "run_id": RUN_ID,
    "processed": SUMMARY.get("processed"),
    "inserted": SUMMARY.get("inserted"),
    "judged": SUMMARY.get("judged"),
    "skipped_due_to_errors": SUMMARY.get("skipped_due_to_errors"),
    "metrics_version": SUMMARY.get("metrics_version"),
}]))


## 2) Rule Quality + Heatmap


In [None]:
# Per-rule judge statistics stored in scan_metrics for the selected run, ordered by rule key.
_rule_metrics_sql = """
    SELECT rule_key, judge_correctness, judged_total, judge_true, judge_false
    FROM scan_metrics
    WHERE run_id=?
    ORDER BY rule_key
    """
rule_metrics = qdf(_rule_metrics_sql, (RUN_ID,))
display(rule_metrics)

# One row per (conversation_id, rule_key):
#   judged_total  - results that have a judge_label,
#   correct_total - results with judge_label=1,
#   score         - correct_total / judged_total, or NULL when nothing was judged.
_heatmap_sql = """
    SELECT conversation_id, rule_key,
           SUM(CASE WHEN judge_label IS NOT NULL THEN 1 ELSE 0 END) AS judged_total,
           SUM(CASE WHEN judge_label=1 THEN 1 ELSE 0 END) AS correct_total,
           CASE
             WHEN SUM(CASE WHEN judge_label IS NOT NULL THEN 1 ELSE 0 END) = 0 THEN NULL
             ELSE 1.0 * SUM(CASE WHEN judge_label=1 THEN 1 ELSE 0 END)
                  / SUM(CASE WHEN judge_label IS NOT NULL THEN 1 ELSE 0 END)
           END AS score
    FROM scan_results
    WHERE run_id=?
    GROUP BY conversation_id, rule_key
    ORDER BY conversation_id, rule_key
    """
heatmap = qdf(_heatmap_sql, (RUN_ID,))

# Preview only the first 20 cells of the heatmap.
display(heatmap.iloc[:20])

# Count heatmap cells per zone: green >= 0.9, yellow >= 0.8, red below that,
# and "na" for cells without a score.
zone_counts = {"green": 0, "yellow": 0, "red": 0, "na": 0}
for score in heatmap["score"].tolist():
    # Pick the bucket first, then increment once.
    if pd.isna(score):
        zone = "na"
    elif score >= 0.9:
        zone = "green"
    elif score >= 0.8:
        zone = "yellow"
    else:
        zone = "red"
    zone_counts[zone] += 1

display(pd.DataFrame([zone_counts]))


## 3) Judge-Confirmed Bad Cases (`judge_label=0`)


In [None]:
# Results the judge labelled 0 for the selected run, joined with the message text.
# confidence_gap is |eval_confidence - judge_confidence|, with a missing judge
# confidence counted as 0; rows with the largest gap come first.
_bad_cases_sql = """
    SELECT sr.conversation_id, sr.message_id, sr.rule_key,
           sr.eval_hit, sr.judge_expected_hit,
           sr.eval_reason_code, sr.eval_reason, sr.judge_rationale,
           sr.evidence_quote, sr.eval_confidence, sr.judge_confidence,
           ABS(sr.eval_confidence - COALESCE(sr.judge_confidence, 0)) AS confidence_gap,
           m.text
    FROM scan_results sr
    JOIN messages m ON m.message_id = sr.message_id
    WHERE sr.run_id=? AND sr.judge_label=0
    ORDER BY confidence_gap DESC, sr.rule_key, sr.message_id
    """
bad_cases = qdf(_bad_cases_sql, (RUN_ID,))

print(f"Всего bad-cases: {len(bad_cases)}")
# Preview only the first 25 rows.
display(bad_cases.iloc[:25])


## 4) Bad Case Drilldown (top case + LLM call trace)


In [None]:
def _flag_text(value: object) -> str:
    """Format a 0/1 flag for printing; a missing (NULL/NaN) value becomes 'NA'."""
    return str(int(value)) if pd.notna(value) else "NA"


def _prompt_pair(request_json: object) -> tuple[str, str]:
    """Return the ``content`` of the first two ``input`` messages of a stored request.

    The first message is printed as the system prompt and the second as the
    user prompt. A missing ``input`` key, a missing message or a missing
    ``content`` field yields an empty string for that slot, so a request with
    a single message still shows its first prompt.

    Raises:
        ValueError: If the payload is not valid JSON (``json.JSONDecodeError``
            is a ``ValueError``) or is not an object with a list of objects
            under ``input``.
    """
    payload = json.loads(str(request_json))
    if not isinstance(payload, dict):
        raise ValueError("request_json is not a JSON object")
    messages = payload.get("input", [])
    if not isinstance(messages, list):
        raise ValueError("request_json 'input' is not a list")
    contents = []
    for message in messages[:2]:
        if not isinstance(message, dict):
            raise ValueError("request_json 'input' item is not an object")
        contents.append(str(message.get("content", "")))
    # Pad to exactly two slots so the caller can always unpack (system, user).
    contents.extend([""] * (2 - len(contents)))
    return contents[0], contents[1]


if bad_cases.empty:
    print("bad-cases не найдено в текущем run")
else:
    # bad_cases is ordered by confidence_gap DESC, so row 0 has the largest gap.
    case = bad_cases.iloc[0]
    # Both flags go through _flag_text so a NULL eval_hit prints 'NA' instead of
    # crashing int(), the same way judge_expected_hit is handled.
    print(
        f"Выбран кейс: conv={case['conversation_id']} msg={int(case['message_id'])} rule={case['rule_key']} "
        f"eval_hit={_flag_text(case['eval_hit'])} expected_hit={_flag_text(case['judge_expected_hit'])}"
    )

    # Everything except the message text, which is printed in full below.
    display(pd.DataFrame([case[[
        'conversation_id', 'message_id', 'rule_key', 'eval_hit', 'judge_expected_hit',
        'eval_reason_code', 'eval_reason', 'judge_rationale', 'evidence_quote',
        'eval_confidence', 'judge_confidence', 'confidence_gap'
    ]].to_dict()]))

    print('Текст сообщения:')
    print(str(case['text']))

    # All recorded LLM calls for this (run, message, rule), ordered by phase and attempt.
    trace = qdf(
        """
        SELECT phase, attempt, parse_ok, validation_ok, response_http_status, error_message, latency_ms,
               request_json, extracted_json
        FROM llm_calls
        WHERE run_id=? AND message_id=? AND rule_key=?
        ORDER BY phase, attempt
        """
        , (RUN_ID, int(case['message_id']), str(case['rule_key']))
    )
    display(trace[['phase', 'attempt', 'parse_ok', 'validation_ok', 'response_http_status', 'error_message', 'latency_ms']])

    # Number traces by position rather than by index label.
    for number, (_, row) in enumerate(trace.iterrows(), start=1):
        print(f"\n--- TRACE {number}: phase={row['phase']} attempt={int(row['attempt'])} ---")
        try:
            system, user = _prompt_pair(row['request_json'])
        except ValueError as exc:
            # Best effort: an unreadable request must not hide the extracted JSON below.
            print(f'failed to parse request_json: {exc}')
        else:
            print('SYSTEM PROMPT (first 400 chars):')
            print(system[:400])
            print('USER PROMPT (first 600 chars):')
            print(user[:600])

        print('EXTRACTED JSON (first 600 chars):')
        print(str(row['extracted_json'])[:600])


In [None]:
# Close the shared SQLite connection; qdf() cannot be used again until the setup cell is re-run.
conn.close()
