# SGR Full Debug: Последний Run

Этот ноутбук делает полный разбор **последнего scan-run** из `dialogs.db`:
- что получилось,
- что не получилось,
- где есть пропуски/ошибки,
- какие кейсы judge посчитал плохими.

In [1]:
from pathlib import Path
import json
import re
import sqlite3
import textwrap
import sys

import pandas as pd

ROOT = Path("..").resolve()
SRC = ROOT / "src"
if str(SRC) not in sys.path:
    sys.path.append(str(SRC))

from dialogs.sgr_core import quality_thresholds, threshold_doc_line

pd.set_option("display.max_colwidth", 220)
pd.set_option("display.max_rows", 200)

DB_PATH = '../dialogs.db'

conn = sqlite3.connect(DB_PATH)
conn.row_factory = sqlite3.Row


def qdf(sql: str, params: tuple = ()) -> pd.DataFrame:
    return pd.read_sql_query(sql, conn, params=params)


QUALITY = quality_thresholds()
exe = sys.executable.replace('\\', '/')
print(f"Python executable: {sys.executable}")
if '/.venv/' not in exe:
    print("WARNING: выберите kernel 'Python (dialogs-sgr)' чтобы использовать зависимости проекта.")
print(f"DB: {DB_PATH}")
print(f"Thresholds: {threshold_doc_line(thresholds=QUALITY)}")


Python executable: /Users/ablackman/go/src/github.com/tetraminz/sales_protocol/.venv/bin/python
DB: ../dialogs.db
Thresholds: green >= 0.90, yellow >= 0.80, red < 0.80, na = no_judged


## 1) Run для анализа: последний успешный (fallback: последний любой)

In [2]:
run_df = qdf(
    """
    SELECT run_id, status, model, conversation_from, conversation_to,
           selected_conversations, messages_count, started_at_utc, finished_at_utc, summary_json
    FROM scan_runs
    ORDER BY CASE WHEN status='success' THEN 0 ELSE 1 END, started_at_utc DESC
    LIMIT 1
    """
)

HAS_RUN = not run_df.empty
if not HAS_RUN:
    print("Нет scan-run в БД. Запустите: make scan")
else:
    run_row = run_df.iloc[0].to_dict()
    RUN_ID = str(run_row["run_id"])
    summary = json.loads(run_row.get("summary_json") or "{}")

    started = pd.to_datetime(run_row["started_at_utc"], utc=True, errors="coerce")
    finished = pd.to_datetime(run_row["finished_at_utc"], utc=True, errors="coerce")
    duration_sec = None
    if pd.notna(started) and pd.notna(finished):
        duration_sec = (finished - started).total_seconds()

    snapshot = {
        "run_id": RUN_ID,
        "status": run_row.get("status"),
        "model": run_row.get("model"),
        "metrics_version": summary.get("metrics_version"),
        "conversation_range": f"{run_row.get('conversation_from')}..{run_row.get('conversation_to')}",
        "selected_conversations": run_row.get("selected_conversations"),
        "messages_total": run_row.get("messages_count"),
        "seller_messages": summary.get("seller_messages"),
        "customer_context_only": summary.get("customer_messages_context_only"),
        "processed": summary.get("processed"),
        "inserted": summary.get("inserted"),
        "judged": summary.get("judged"),
        "skipped_due_to_errors": summary.get("skipped_due_to_errors"),
        "evidence_mismatch_skipped": summary.get("evidence_mismatch_skipped"),
        "judge_inconsistency_soft_flags": summary.get("judge_inconsistency_soft_flags"),
        "schema_errors": summary.get("schema_errors"),
        "non_schema_errors": summary.get("non_schema_errors"),
        "duration_sec": duration_sec,
        "started_at_utc": run_row.get("started_at_utc"),
        "finished_at_utc": run_row.get("finished_at_utc"),
    }

    pd.DataFrame([snapshot])


## 2) LLM pipeline health (calls/errors/coverage)

In [3]:
if not HAS_RUN:
    print("Нет данных")
else:
    llm_health = qdf(
        """
        SELECT phase, rule_key, attempt,
               COUNT(*) AS calls,
               SUM(CASE WHEN error_message<>'' THEN 1 ELSE 0 END) AS errors,
               SUM(CASE WHEN parse_ok=0 THEN 1 ELSE 0 END) AS parse_fail,
               SUM(CASE WHEN validation_ok=0 THEN 1 ELSE 0 END) AS validation_fail,
               ROUND(AVG(latency_ms), 1) AS avg_latency_ms
        FROM llm_calls
        WHERE run_id=?
        GROUP BY phase, rule_key, attempt
        ORDER BY phase, rule_key, attempt
        """,
        (RUN_ID,),
    )

    seller_messages = int(summary.get("seller_messages") or 0)
    expected_eval_calls = seller_messages * 3
    actual_eval_calls = int(
        qdf("SELECT COUNT(*) AS c FROM llm_calls WHERE run_id=? AND phase='evaluator'", (RUN_ID,)).iloc[0]["c"]
    )
    unique_eval_cases = int(
        qdf(
            """
            SELECT COUNT(*) AS c
            FROM (
              SELECT DISTINCT run_id, conversation_id, message_id, rule_key
              FROM llm_calls
              WHERE run_id=? AND phase='evaluator'
            )
            """,
            (RUN_ID,),
        ).iloc[0]["c"]
    )
    dropped_cases = int(
        qdf(
            """
            WITH e AS (
              SELECT run_id, conversation_id, message_id, rule_key
              FROM llm_calls
              WHERE run_id=? AND phase='evaluator'
              GROUP BY run_id, conversation_id, message_id, rule_key
            )
            SELECT COUNT(*) AS c
            FROM e
            LEFT JOIN scan_results r
              ON r.run_id=e.run_id AND r.message_id=e.message_id AND r.rule_key=e.rule_key
            WHERE r.result_id IS NULL
            """,
            (RUN_ID,),
        ).iloc[0]["c"]
    )
    inserted = int(summary.get("inserted") or 0)
    judged = int(summary.get("judged") or 0)
    judge_coverage = (judged / inserted) if inserted else 0.0

    coverage = pd.DataFrame([
        {
            "expected_evaluator_calls": expected_eval_calls,
            "actual_evaluator_calls": actual_eval_calls,
            "unique_eval_cases": unique_eval_cases,
            "inserted_results": inserted,
            "dropped_cases": dropped_cases,
            "judged_results": judged,
            "judge_coverage": round(judge_coverage, 4),
        }
    ])

    print("### Aggregated health")
    display(coverage)
    print()
    print("### LLM calls breakdown")
    display(llm_health)


### Aggregated health


Unnamed: 0,expected_evaluator_calls,actual_evaluator_calls,unique_eval_cases,inserted_results,dropped_cases,judged_results,judge_coverage
0,147,150,147,146,1,146,1.0



### LLM calls breakdown


Unnamed: 0,phase,rule_key,attempt,calls,errors,parse_fail,validation_fail,avg_latency_ms
0,evaluator,empathy,1,49,0,0,0,2280.6
1,evaluator,empathy,2,1,0,0,0,1800.0
2,evaluator,greeting,1,49,0,0,0,2345.6
3,evaluator,upsell,1,49,0,0,0,2451.9
4,evaluator,upsell,2,2,0,0,0,2551.0
5,judge,empathy,1,48,0,0,0,1758.5
6,judge,greeting,1,49,0,0,0,1558.6
7,judge,upsell,1,49,0,0,0,1754.8


## 3) Качество по правилам (таблично)

In [4]:
if not HAS_RUN:
    print("Нет данных")
else:
    metrics_long = qdf(
        """
        SELECT rule_key, metric_name, metric_value
        FROM scan_metrics
        WHERE run_id=?
        """,
        (RUN_ID,),
    )

    if metrics_long.empty:
        print("Для run нет метрик")
    else:
        metrics = metrics_long.pivot_table(
            index="rule_key",
            columns="metric_name",
            values="metric_value",
            aggfunc="first",
        ).reset_index()

        if "accuracy" not in metrics.columns:
            metrics["accuracy"] = pd.NA
        if "judge_correctness" not in metrics.columns:
            metrics["judge_correctness"] = metrics["accuracy"]
        metrics["judge_correctness"] = metrics["judge_correctness"].fillna(metrics["accuracy"])

        cols_order = [
            "rule_key", "judge_correctness", "accuracy", "precision", "recall", "f1", "coverage",
            "tp", "fp", "tn", "fn", "total"
        ]
        for c in cols_order:
            if c not in metrics.columns:
                metrics[c] = pd.NA
        metrics = metrics[cols_order].sort_values(by=["judge_correctness", "f1"], ascending=False)

        try:
            import matplotlib  # noqa: F401
            has_matplotlib = True
        except Exception:
            has_matplotlib = False

        if has_matplotlib:
            styled = metrics.style.format(
                {
                    "judge_correctness": "{:.3f}",
                    "accuracy": "{:.3f}",
                    "precision": "{:.3f}",
                    "recall": "{:.3f}",
                    "f1": "{:.3f}",
                    "coverage": "{:.3f}",
                }
            ).background_gradient(
                subset=["judge_correctness", "accuracy", "precision", "recall", "f1", "coverage"],
                cmap="RdYlGn",
            )
            display(styled)
        else:
            print("matplotlib не установлен: показываю plain DataFrame без background_gradient")
            display(metrics.round(3))


metric_name,rule_key,judge_correctness,accuracy,precision,recall,f1,coverage,tp,fp,tn,fn,total
1,greeting,1.0,1.0,1.0,1.0,1.0,0.122,6.0,0.0,43.0,0.0,49.0
2,upsell,0.98,0.98,1.0,0.909,0.952,0.204,10.0,0.0,38.0,1.0,49.0
0,empathy,0.958,0.958,1.0,0.905,0.95,0.396,19.0,0.0,27.0,2.0,48.0


## 4) Judge bad-cases (judge_label = 0)

In [5]:
TOP_N = 25

if not HAS_RUN:
    print("Нет данных")
else:
    bad_cases = qdf(
        """
        SELECT r.rule_key, r.conversation_id, r.message_id,
               m.speaker_label, m.text,
               r.eval_hit, r.eval_confidence, r.judge_confidence,
               r.eval_reason, r.judge_rationale
        FROM scan_results r
        JOIN messages m ON m.message_id = r.message_id
        WHERE r.run_id=?
          AND r.judge_label=0
        ORDER BY r.rule_key, r.message_id
        """,
        (RUN_ID,),
    )

    print(f"Всего bad-cases judge_label=0: {len(bad_cases)}")
    display(bad_cases.head(TOP_N))

    print("\n### bad-cases, где evaluator сработал (eval_hit=1):")
    display(bad_cases[bad_cases["eval_hit"] == 1].head(TOP_N))

    print("\n### bad-cases, где evaluator не сработал (eval_hit=0):")
    display(bad_cases[bad_cases["eval_hit"] == 0].head(TOP_N))


Всего bad-cases judge_label=0: 3


Unnamed: 0,rule_key,conversation_id,message_id,speaker_label,text,eval_hit,eval_confidence,judge_confidence,eval_reason,judge_rationale
0,empathy,modamart__1_transcript,4967,Sales Rep,"** Hi John! Thanks for reaching out. I’d be happy to help you find the perfect jacket. Are you looking for something specific, like a certain material or style?",0,0.9,0.95,"The message contains a polite greeting and offer to help, but it does not explicitly acknowledge or recognize the customer's feelings, concerns, or situation, thus lacks true empathy.","The customer's message expresses uncertainty about choosing a jacket among similar options, indicating a need for empathetic acknowledgment. The sales rep's response, while polite and helpful, does not explicitly rec..."
1,empathy,modamart__1_transcript,4971,Sales Rep,"** That’s a great point. Down is incredibly warm, but it can lose its insulating properties when wet. Synthetic insulation, on the other hand, retains warmth even when damp. If you’re expecting to face wet conditions...",0,0.9,1.0,"The message provides detailed information and acknowledges the customer's point ('That’s a great point'), but it does not explicitly express empathy towards the customer's concern or emotional state.","The message includes an empathetic phrase 'That’s a great point' acknowledging the customer's concern, which fulfills the empathy criterion. Therefore, expected_hit is true. The evaluator labeled hit as false, which ..."
2,upsell,modamart__3_transcript,5017,Sales Rep,"Great to hear! You can visit our website and sign up for the ModaMart Rewards program right away. If you need any assistance with sizing or product details, feel free to reach out to our customer service team. They'l...",0,0.9,1.0,"The message does not contain an explicit offer of an additional paid option, tariff, or package; it only encourages signing up for a rewards program and offers assistance, which is informational but not an upsell.","The message explicitly encourages signing up for the ModaMart Rewards program, which is an additional package offering exclusive discounts and early access to sales. This fits the upsell criterion of offering an addi..."



### bad-cases, где evaluator сработал (eval_hit=1):


Unnamed: 0,rule_key,conversation_id,message_id,speaker_label,text,eval_hit,eval_confidence,judge_confidence,eval_reason,judge_rationale



### bad-cases, где evaluator не сработал (eval_hit=0):


Unnamed: 0,rule_key,conversation_id,message_id,speaker_label,text,eval_hit,eval_confidence,judge_confidence,eval_reason,judge_rationale
0,empathy,modamart__1_transcript,4967,Sales Rep,"** Hi John! Thanks for reaching out. I’d be happy to help you find the perfect jacket. Are you looking for something specific, like a certain material or style?",0,0.9,0.95,"The message contains a polite greeting and offer to help, but it does not explicitly acknowledge or recognize the customer's feelings, concerns, or situation, thus lacks true empathy.","The customer's message expresses uncertainty about choosing a jacket among similar options, indicating a need for empathetic acknowledgment. The sales rep's response, while polite and helpful, does not explicitly rec..."
1,empathy,modamart__1_transcript,4971,Sales Rep,"** That’s a great point. Down is incredibly warm, but it can lose its insulating properties when wet. Synthetic insulation, on the other hand, retains warmth even when damp. If you’re expecting to face wet conditions...",0,0.9,1.0,"The message provides detailed information and acknowledges the customer's point ('That’s a great point'), but it does not explicitly express empathy towards the customer's concern or emotional state.","The message includes an empathetic phrase 'That’s a great point' acknowledging the customer's concern, which fulfills the empathy criterion. Therefore, expected_hit is true. The evaluator labeled hit as false, which ..."
2,upsell,modamart__3_transcript,5017,Sales Rep,"Great to hear! You can visit our website and sign up for the ModaMart Rewards program right away. If you need any assistance with sizing or product details, feel free to reach out to our customer service team. They'l...",0,0.9,1.0,"The message does not contain an explicit offer of an additional paid option, tariff, or package; it only encourages signing up for a rewards program and offers assistance, which is informational but not an upsell.","The message explicitly encourages signing up for the ModaMart Rewards program, which is an additional package offering exclusive discounts and early access to sales. This fits the upsell criterion of offering an addi..."


## 5) Disagreements: evaluator vs judge

In [6]:
if not HAS_RUN:
    print("Нет данных")
else:
    disagree = qdf(
        """
        SELECT r.rule_key, r.conversation_id, r.message_id,
               m.speaker_label, m.text,
               r.eval_hit, r.judge_label,
               r.eval_confidence, r.judge_confidence,
               r.eval_reason, r.judge_rationale
        FROM scan_results r
        JOIN messages m ON m.message_id = r.message_id
        WHERE r.run_id=?
          AND r.judge_label=0
        ORDER BY r.rule_key, r.message_id
        """,
        (RUN_ID,),
    )

    if disagree.empty:
        print("Расхождений evaluator vs judge нет")
    else:
        disagree["confidence_gap"] = (disagree["eval_confidence"] - disagree["judge_confidence"]).abs()
        disagree = disagree.sort_values("confidence_gap", ascending=False)
        print(f"Всего disagreements: {len(disagree)}")
        display(disagree.head(TOP_N))


Всего disagreements: 3


Unnamed: 0,rule_key,conversation_id,message_id,speaker_label,text,eval_hit,judge_label,eval_confidence,judge_confidence,eval_reason,judge_rationale,confidence_gap
1,empathy,modamart__1_transcript,4971,Sales Rep,"** That’s a great point. Down is incredibly warm, but it can lose its insulating properties when wet. Synthetic insulation, on the other hand, retains warmth even when damp. If you’re expecting to face wet conditions...",0,0,0.9,1.0,"The message provides detailed information and acknowledges the customer's point ('That’s a great point'), but it does not explicitly express empathy towards the customer's concern or emotional state.","The message includes an empathetic phrase 'That’s a great point' acknowledging the customer's concern, which fulfills the empathy criterion. Therefore, expected_hit is true. The evaluator labeled hit as false, which ...",0.1
2,upsell,modamart__3_transcript,5017,Sales Rep,"Great to hear! You can visit our website and sign up for the ModaMart Rewards program right away. If you need any assistance with sizing or product details, feel free to reach out to our customer service team. They'l...",0,0,0.9,1.0,"The message does not contain an explicit offer of an additional paid option, tariff, or package; it only encourages signing up for a rewards program and offers assistance, which is informational but not an upsell.","The message explicitly encourages signing up for the ModaMart Rewards program, which is an additional package offering exclusive discounts and early access to sales. This fits the upsell criterion of offering an addi...",0.1
0,empathy,modamart__1_transcript,4967,Sales Rep,"** Hi John! Thanks for reaching out. I’d be happy to help you find the perfect jacket. Are you looking for something specific, like a certain material or style?",0,0,0.9,0.95,"The message contains a polite greeting and offer to help, but it does not explicitly acknowledge or recognize the customer's feelings, concerns, or situation, thus lacks true empathy.","The customer's message expresses uncertainty about choosing a jacket among similar options, indicating a need for empathetic acknowledgment. The sales rep's response, while polite and helpful, does not explicitly rec...",0.05


## 6) Dropped/Skipped кейсы (вызов evaluator был, результата нет)

In [7]:
if not HAS_RUN:
    print("Нет данных")
else:
    dropped = qdf(
        """
        WITH e AS (
          SELECT run_id, conversation_id, message_id, rule_key, MIN(call_id) AS call_id
          FROM llm_calls
          WHERE run_id=? AND phase='evaluator'
          GROUP BY run_id, conversation_id, message_id, rule_key
        )
        SELECT e.rule_key, e.conversation_id, e.message_id, m.speaker_label, m.text, e.call_id
        FROM e
        LEFT JOIN scan_results r
          ON r.run_id=e.run_id AND r.message_id=e.message_id AND r.rule_key=e.rule_key
        JOIN messages m ON m.message_id = e.message_id
        WHERE r.result_id IS NULL
        ORDER BY e.call_id
        """,
        (RUN_ID,),
    )

    print(f"Dropped/Skipped кейсов: {len(dropped)}")
    display(dropped.head(100))

Dropped/Skipped кейсов: 1


Unnamed: 0,rule_key,conversation_id,message_id,speaker_label,text,call_id
0,empathy,modamart__4_transcript,5024,Sales Rep,"Of course! What products are you looking at, and what concerns do you have?",485


## 7) Авто-вывод: что получилось / что не получилось

In [8]:
if not HAS_RUN:
    print("Нет run для интерпретации")
else:
    schema_errors = int(summary.get("schema_errors") or 0)
    evidence_skips = int(summary.get("evidence_mismatch_skipped") or 0)
    skipped = int(summary.get("skipped_due_to_errors") or 0)
    soft_flags = int(summary.get("judge_inconsistency_soft_flags") or 0)
    inserted = int(summary.get("inserted") or 0)
    judged = int(summary.get("judged") or 0)
    judge_coverage = (judged / inserted) if inserted else 0.0

    metrics_short = qdf(
        """
        SELECT rule_key,
               MAX(CASE WHEN metric_name='judge_correctness' THEN metric_value END) AS judge_correctness,
               MAX(CASE WHEN metric_name='accuracy' THEN metric_value END) AS accuracy,
               MAX(CASE WHEN metric_name='precision' THEN metric_value END) AS precision,
               MAX(CASE WHEN metric_name='recall' THEN metric_value END) AS recall,
               MAX(CASE WHEN metric_name='f1' THEN metric_value END) AS f1
        FROM scan_metrics
        WHERE run_id=?
        GROUP BY rule_key
        ORDER BY rule_key
        """,
        (RUN_ID,),
    )
    if not metrics_short.empty:
        metrics_short["judge_correctness"] = metrics_short["judge_correctness"].fillna(metrics_short["accuracy"])
        metrics_short = metrics_short.sort_values("judge_correctness", ascending=False)

    good = []
    bad = []
    actions = []

    if schema_errors == 0:
        good.append("Schema-level ошибок нет: JSON schema/validation контур устойчив.")
    else:
        bad.append(f"Есть schema-level ошибки: {schema_errors}.")

    if judge_coverage >= QUALITY.judge_coverage_min:
        good.append(f"Judge coverage высокий: {judge_coverage:.1%}.")
    else:
        bad.append(f"Judge coverage низкий: {judge_coverage:.1%} (порог {QUALITY.judge_coverage_min:.0%}).")

    if not metrics_short.empty and metrics_short["judge_correctness"].notna().any():
        top_rule = metrics_short.iloc[0]
        good.append(
            f"Лучшее правило по judge_correctness: {top_rule['rule_key']} "
            f"({top_rule['judge_correctness']:.3f})."
        )

        weak = metrics_short[metrics_short["judge_correctness"] < QUALITY.rule_alert_min]
        if not weak.empty:
            bad.append(
                "Есть правила ниже quality-порога "
                f"({QUALITY.rule_alert_min:.2f}): "
                + ", ".join(weak["rule_key"].tolist())
            )
    else:
        bad.append("Нет достаточных метрик judge_correctness/accuracy для интерпретации.")

    md_path = ROOT / "artifacts" / "metrics.md"
    if not md_path.exists():
        bad.append("Report sync check: artifacts/metrics.md не найден.")
    else:
        md_text = md_path.read_text(encoding="utf-8")
        if f"current_run_id: `{RUN_ID}`" not in md_text:
            bad.append("Report sync check: metrics.md собран для другого run_id.")
        else:
            md_map = {}
            for line in md_text.splitlines():
                m = re.match(r"^\|\s*`([^`]+)`\s*\|\s*([0-9.]+)\s*\|\s*([0-9.]+)\s*\|\s*([+-]?[0-9.]+)\s*\|$", line)
                if m:
                    md_map[m.group(1)] = float(m.group(3))
            metric_map = {
                str(r["rule_key"]): float(r["judge_correctness"])
                for _, r in metrics_short.iterrows()
                if pd.notna(r["judge_correctness"])
            }
            if not metric_map:
                bad.append("Report sync check: отсутствуют числовые rule-level метрики.")
            else:
                missing = sorted(set(metric_map) - set(md_map))
                drift = [abs(metric_map[k] - md_map[k]) for k in metric_map if k in md_map]
                if missing:
                    bad.append("Report sync check: нет правил в metrics.md: " + ", ".join(missing))
                elif drift and max(drift) > 1e-9:
                    bad.append(f"Report sync check: обнаружен drift метрик (max={max(drift):.6f}).")
                else:
                    good.append("Report sync check: notebook и metrics.md синхронизированы по rule-level метрикам.")

    if soft_flags > 0:
        bad.append(f"Есть judge inconsistency soft-flags: {soft_flags}.")
        actions.append("Проверить кейсы с конфликтом label/expected_hit/rationale и уточнить judge prompt.")

    if evidence_skips > 0:
        bad.append(f"Пропуски из-за evidence mismatch после retry: {evidence_skips}.")
        actions.append("Проверить dropped/skip кейсы и усилить prompt-инструкцию для evidence по проблемным rule_key.")

    if skipped > 0:
        bad.append(f"Общий счетчик skipped_due_to_errors: {skipped}.")
        actions.append("Сверить llm_calls (attempt=2) и тексты сообщений, где отсутствует запись в scan_results.")

    actions.append("Просмотреть топ-25 bad-cases judge_label=0 и выделить 3 повторяющихся паттерна ошибок.")
    actions.append("Для правил ниже порога качества подготовить набор targeted prompt-fixes по reason_code.")

    print("Что получилось:")
    for x in good:
        print("-", x)

    print()
    print("Что не получилось:")
    if bad:
        for x in bad:
            print("-", x)
    else:
        print("- Критичных проблем по текущему run не найдено.")

    print()
    print("Что проверить дальше:")
    for x in actions[:3]:
        print("-", x)


Что получилось:
- Schema-level ошибок нет: JSON schema/validation контур устойчив.
- Judge coverage высокий: 100.0%.
- Лучшее правило по judge_correctness: greeting (1.000).

Что не получилось:
- Report sync check: обнаружен drift метрик (max=0.000033).
- Пропуски из-за evidence mismatch после retry: 1.
- Общий счетчик skipped_due_to_errors: 1.

Что проверить дальше:
- Проверить dropped/skip кейсы и усилить prompt-инструкцию для evidence по проблемным rule_key.
- Сверить llm_calls (attempt=2) и тексты сообщений, где отсутствует запись в scan_results.
- Просмотреть топ-25 bad-cases judge_label=0 и выделить 3 повторяющихся паттерна ошибок.


In [9]:
conn.close()