In [1]:
"""
Backfill missing/empty `pages.summary` using the SECTION model helper.

Behavior:
- Reads candidate rows from `pages` (optionally joined with `page_labels`).
- For each with NULL/empty `summary`, generate a concise factual summary using
  summarize_page_with_section_model(...) and write it back to DB (unless --dry-run).

Config defaults:
- --days defaults to MAX_DOC_AGE_DAYS from config
- Uses the same SQLite DB at DB_PATH
"""

from __future__ import annotations
import argparse
import sqlite3
import sys
from typing import Iterable, Tuple, Optional

from src.config import DB_PATH, MAX_DOC_AGE_DAYS
from src.extract.summarize import summarize_page_with_section_model


def _fetch_candidates(
    conn: sqlite3.Connection,
    country: str,
    days: int,
    only_report_candidates: bool,
    limit: Optional[int],
) -> Iterable[Tuple[int, str, str, str]]:
    """
    Return (id, url, title, text) rows for pages that still need a summary.

    A page qualifies when its `summary` is NULL or blank after TRIM and its
    `published_at` is either NULL or at most `days` days old.

    Args:
        conn: Open SQLite connection exposing `pages` (and `page_labels` when
            `only_report_candidates` is set).
        country: Matched against `page_labels.country_key`; only used when
            `only_report_candidates` is True.
        days: Maximum age in days, compared against
            julianday('now') - julianday(published_at).
        only_report_candidates: When True, join `page_labels` on url and keep
            only rows with is_primary = 1 for `country`.
        limit: Maximum number of rows. A falsy value (None or 0) means no
            LIMIT clause is applied.

    Returns:
        A list of (id, url, title, text) tuples, newest `published_at` first
        (NULL dates last), ties broken by descending id. NULL title/text are
        returned as empty strings.
    """
    join_clause = ""
    conditions = []
    params: list = []

    # Placeholders are positional, so params must be appended in the same
    # order the conditions appear in the final SQL text.
    if only_report_candidates:
        join_clause = "JOIN page_labels L ON L.url = p.url"
        conditions.append("L.country_key = ?")
        conditions.append("L.is_primary = 1")
        params.append(country)

    conditions.append("(p.summary IS NULL OR TRIM(p.summary) = '')")
    conditions.append(
        "(p.published_at IS NULL"
        " OR julianday('now') - julianday(p.published_at) <= ?)"
    )
    params.append(days)

    where_clause = " AND ".join(conditions)
    sql = f"""
        SELECT p.id, p.url, COALESCE(p.title, ''), COALESCE(p.text, '')
        FROM pages p
        {join_clause}
        WHERE {where_clause}
        ORDER BY p.published_at DESC NULLS LAST, p.id DESC
    """

    # Bind the limit instead of formatting it into the SQL text.
    if limit:
        sql += " LIMIT ?"
        params.append(int(limit))

    cur = conn.cursor()
    try:
        cur.execute(sql, params)
        return cur.fetchall()
    finally:
        cur.close()


def backfill_summaries(
    country: str,
    days: int,
    only_report_candidates: bool,
    limit: Optional[int],
    dry_run: bool,
) -> int:
    """
    Generate and store summaries for pages whose `summary` is NULL or blank.

    Candidate rows come from _fetch_candidates(...). For each row the
    summarizer is called with the title, a lead made of the first 120
    whitespace-separated words of the text, the full text, and `country`.
    Rows whose summarizer call raises, or that yield an empty summary, are
    skipped and not counted.

    Args:
        country: Passed to the candidate query and to the summarizer.
        days: Recency window forwarded to the candidate query.
        only_report_candidates: Forwarded to the candidate query.
        limit: Optional cap on candidate rows, forwarded to the candidate query.
        dry_run: When True, print what would be written and leave the DB
            untouched.

    Returns:
        Number of rows updated (or that would be updated when `dry_run`).
    """
    updated = 0
    verb = "Would update" if dry_run else "Updated"
    print("Starting backfill")

    # `with sqlite3.connect(...)` only scopes a transaction and leaves the
    # connection open, so manage the lifetime explicitly and always close it.
    conn = sqlite3.connect(DB_PATH)
    try:
        rows = list(_fetch_candidates(conn, country, days, only_report_candidates, limit))
        total = len(rows)
        print(f"Found {total} candidate pages (country={country}, days={days}, only_report_candidates={only_report_candidates}, limit={limit})")

        for pid, url, title, text in rows:
            # Build a small "desc" context from text (first ~120 words)
            lead = " ".join((text or "").split()[:120])
            try:
                summary = summarize_page_with_section_model(title or "", lead, text or "", country)
                summary = (summary or "").strip()
            except Exception as e:
                # Best-effort: one failing page must not abort the whole run.
                print(f"  ⚠️  Summarizer failed for id={pid} url={url}: {e}")
                continue

            if not summary:
                continue

            if dry_run:
                print(f"  DRY-RUN would UPDATE id={pid} url={url} summary=«{summary[:120]}...»")
            else:
                conn.execute("UPDATE pages SET summary=? WHERE id=?", (summary, pid))
                # Commit per row so completed work survives an interrupted run.
                conn.commit()

            updated += 1
            print(f"{verb} {updated} / {total} pages")
    finally:
        conn.close()

    print(f"{verb} {updated} rows.")
    return updated

# Notebook-style invocation with hard-coded arguments: backfill summaries for
# pages labelled as primary for "ukraine" that were published within the last
# 180 days (or have no publish date), with no row limit.
# dry_run=False, so generated summaries are written to the database at DB_PATH.
backfill_summaries(
    country="ukraine",
    days=180,
    only_report_candidates=True,
    limit=None,
    dry_run=False,
)

Starting backfill
Found 81 candidate pages (country=ukraine, days=180, only_report_candidates=True, limit=None)
Updated 1 / 81 pages
Updated 2 / 81 pages
Updated 3 / 81 pages
Updated 4 / 81 pages
Updated 5 / 81 pages
Updated 6 / 81 pages
Updated 7 / 81 pages
Updated 8 / 81 pages
Updated 9 / 81 pages
Updated 10 / 81 pages
Updated 11 / 81 pages
Updated 12 / 81 pages
Updated 13 / 81 pages
Updated 14 / 81 pages
Updated 15 / 81 pages
Updated 16 / 81 pages
Updated 17 / 81 pages
Updated 18 / 81 pages
Updated 19 / 81 pages
Updated 20 / 81 pages
Updated 21 / 81 pages
Updated 22 / 81 pages
Updated 23 / 81 pages
Updated 24 / 81 pages
Updated 25 / 81 pages
Updated 26 / 81 pages
Updated 27 / 81 pages
Updated 28 / 81 pages
Updated 29 / 81 pages
Updated 30 / 81 pages
Updated 31 / 81 pages
Updated 32 / 81 pages
Updated 33 / 81 pages
Updated 34 / 81 pages
Updated 35 / 81 pages
Updated 36 / 81 pages
Updated 37 / 81 pages
Updated 38 / 81 pages
Updated 39 / 81 pages
Updated 40 / 81 pages
Updated 41 / 81 p

81