From a0cfeed8c447cb50d9bd63e2535a1d6e3893e3fe Mon Sep 17 00:00:00 2001 From: Steven Roussey Date: Wed, 27 May 2026 01:13:06 -0700 Subject: [PATCH 1/4] fix(cli): make streamed query totalApprox a meaningful lower bound collectPage now counts every yielded match (up to a shared soft cap) instead of stopping at offset+limit, so total / totalApprox.atLeast is a real lower bound rather than a constant page-end value. Both collectPage and queryCiks now share the single MAX_FUZZY_MATCHES cap exported from _streamMatches so the two streaming query surfaces report totalApprox with identical semantics. --- src/cli/queries/CikQuery.ts | 17 ++++---- src/cli/queries/_streamMatches.test.ts | 60 ++++++++++++++++++++++++++ src/cli/queries/_streamMatches.ts | 59 +++++++++++++++++++------ 3 files changed, 115 insertions(+), 21 deletions(-) create mode 100644 src/cli/queries/_streamMatches.test.ts diff --git a/src/cli/queries/CikQuery.ts b/src/cli/queries/CikQuery.ts index 4a2799f..aa659c6 100644 --- a/src/cli/queries/CikQuery.ts +++ b/src/cli/queries/CikQuery.ts @@ -7,6 +7,7 @@ import { globalServiceRegistry } from "workglow"; import { CIK_NAME_REPOSITORY_TOKEN, type CikNameType } from "../../storage/entity/CikNameSchema"; import type { QueryResult } from "./EntityQuery"; +import { MAX_FUZZY_MATCHES } from "./_streamMatches"; export interface CikQueryParams { readonly name?: string; @@ -19,15 +20,6 @@ export interface CikQueryResult extends QueryResult { readonly tableEmpty: boolean; } -/** - * Soft cap on substring/prefix matches we'll collect before sorting. Stops - * the empty-needle case (which previously walked the entire ~1M-row table) - * and any pathologically broad needle from exhausting memory. Picked so a - * normal `offset+limit` of a few hundred has plenty of headroom for the - * rank-based reordering. - */ -const MAX_FUZZY_MATCHES = 1000; - /** * Queries the `cik_names` table for companies whose name matches the given * needle. Case-insensitive. 
Ranks exact match first, then prefix, then @@ -69,6 +61,13 @@ export async function queryCiks(params: CikQueryParams): Promise // matches have to be evaluated client-side. Capped at MAX_FUZZY_MATCHES // so the worst case is bounded; if the cap fires, `totalApprox.exhausted` // is `false` and the UI renders "≥ N". + // + // Unlike collectPage (which buffers only the requested window because it + // preserves stream order), CikQuery must buffer ALL matches up to the + // cap: ranking reorders the whole match set, so we can't know which rows + // land in [offset, offset + limit) until every match has been collected + // and sorted. The shared MAX_FUZZY_MATCHES cap keeps that buffer bounded + // and keeps the two surfaces' totalApprox semantics identical. const matches: { row: CikNameType; rank: number }[] = []; let anyRowSeen = false; let exhausted = true; diff --git a/src/cli/queries/_streamMatches.test.ts b/src/cli/queries/_streamMatches.test.ts new file mode 100644 index 0000000..7a0c0e3 --- /dev/null +++ b/src/cli/queries/_streamMatches.test.ts @@ -0,0 +1,60 @@ +/** + * @license + * Copyright 2026 Steven Roussey + * SPDX-License-Identifier: Apache-2.0 + */ + +import { describe, expect, it } from "bun:test"; +import { MAX_FUZZY_MATCHES, collectPage } from "./_streamMatches"; + +async function* gen(count: number): AsyncGenerator { + for (let i = 0; i < count; i++) yield i; +} + +describe("collectPage", () => { + it("counts the FULL match set when it drains below the cap", async () => { + // 50 matches, window is [10, 15). total must be the full count (50), + // not offset+limit (15) — this is the H1 fix. + const result = await collectPage(gen(50), 10, 5, 1000); + expect(result.total).toBe(50); + expect(result.exhausted).toBe(true); + expect(result.rows).toEqual([10, 11, 12, 13, 14]); + }); + + it("reports total === maxScan and exhausted false when the cap fires", async () => { + // 500 matches available, cap at 100. 
We stop counting at the cap and + // signal that more may exist. + const result = await collectPage(gen(500), 0, 5, 100); + expect(result.total).toBe(100); + expect(result.exhausted).toBe(false); + expect(result.rows).toEqual([0, 1, 2, 3, 4]); + }); + + it("returns empty rows but the exact total when offset is past the match set", async () => { + const result = await collectPage(gen(8), 20, 5, 1000); + expect(result.rows).toEqual([]); + expect(result.total).toBe(8); + expect(result.exhausted).toBe(true); + }); + + it("handles an empty iterator: total 0, exhausted true", async () => { + const result = await collectPage(gen(0), 0, 5, 1000); + expect(result.rows).toEqual([]); + expect(result.total).toBe(0); + expect(result.exhausted).toBe(true); + }); + + it("keeps the window O(limit) even when far more matches precede it", async () => { + const result = await collectPage(gen(800), 100, 3, 1000); + expect(result.rows).toEqual([100, 101, 102]); + expect(result.total).toBe(800); + expect(result.exhausted).toBe(true); + }); + + it("defaults maxScan to the shared MAX_FUZZY_MATCHES cap", async () => { + const result = await collectPage(gen(MAX_FUZZY_MATCHES + 500), 0, 2); + expect(result.total).toBe(MAX_FUZZY_MATCHES); + expect(result.exhausted).toBe(false); + expect(result.rows).toEqual([0, 1]); + }); +}); diff --git a/src/cli/queries/_streamMatches.ts b/src/cli/queries/_streamMatches.ts index 6f57aa4..8622cbc 100644 --- a/src/cli/queries/_streamMatches.ts +++ b/src/cli/queries/_streamMatches.ts @@ -11,6 +11,18 @@ import type { ITabularStorage, SearchCriteria } from "workglow"; // importing it. type CursorOf = Awaited["queryPage"]>>["nextCursor"]; +/** + * Soft cap on streamed substring/prefix matches before we stop counting. + * + * Single source of truth shared by `collectPage` (this module) and + * `queryCiks` (CikQuery.ts) so the two streaming query surfaces report + * `totalApprox` with identical semantics. 
Stops the empty-needle / + * pathologically-broad-needle case from walking the entire ~1M-row table + * and exhausting memory, while leaving plenty of headroom for a normal + * `offset + limit` of a few hundred. + */ +export const MAX_FUZZY_MATCHES = 1000; + /** * Iterates `repo.queryPage(criteria, ...)` cursor-by-cursor, yielding each * row that satisfies `predicate`. Memory is bounded to one page; total @@ -40,25 +52,48 @@ export async function* streamMatchingRows( } /** - * Collects an async iterable into the slice `[offset, offset + limit)`. + * Collects an async iterable into the slice `[offset, offset + limit)` + * while counting EVERY match (not just the ones that land in the window). * - * Returns `exhausted: true` only if the iterator drained — otherwise the - * caller observed exactly `offset + limit` matches and there may be more. - * Callers fold this into a `totalApprox` so the UI can render "≥ N" - * instead of pretending to know the exact match count. + * Counting all matches makes `total` a meaningful number rather than a + * constant equal to the page end: it is the number of matches observed up + * to `maxScan`. Memory stays O(limit) — only rows inside the requested + * window are retained; everything before `offset` and after + * `offset + limit` is counted then discarded. + * + * Stops early when the running match count reaches `maxScan` + * (`exhausted: false` — more matches may exist beyond the cap) or when the + * iterator drains (`exhausted: true` — `total` is exact). + * + * Callers fold this into a `totalApprox` so the UI can render "≥ N" when + * the cap fired, instead of pretending to know the exact match count. + * + * @param maxScan Soft cap on matches counted; defaults to the shared + * `MAX_FUZZY_MATCHES`. + * @returns `total` — matches counted up to `maxScan`. `exhausted` — + * `false` if the cap stopped us (more may exist), `true` if the iterator + * drained (in which case `total` is exact). 
*/ export async function collectPage( iter: AsyncIterable, offset: number, - limit: number + limit: number, + maxScan: number = MAX_FUZZY_MATCHES ): Promise<{ rows: T[]; total: number; exhausted: boolean }> { - const target = offset + limit; - const collected: T[] = []; + const windowEnd = offset + limit; + const window: T[] = []; + let matched = 0; for await (const row of iter) { - collected.push(row); - if (collected.length >= target) { - return { rows: collected.slice(offset, target), total: target, exhausted: false }; + // Retain only rows in [offset, offset + limit) — O(limit) memory. + if (matched >= offset && matched < windowEnd) { + window.push(row); + } + matched++; + if (matched >= maxScan) { + // Hit the soft cap: there may be more matches we never counted. + return { rows: window, total: matched, exhausted: false }; } } - return { rows: collected.slice(offset), total: collected.length, exhausted: true }; + // Iterator drained: `matched` is the exact total. + return { rows: window, total: matched, exhausted: true }; } From 7af010499b162821ca6b6ed168f06d2e9a4375ef Mon Sep 17 00:00:00 2001 From: Steven Roussey Date: Wed, 27 May 2026 01:15:04 -0700 Subject: [PATCH 2/4] test(cli): assert streamed total is the full match count; fix stale comments Updates EntityQuery/FilingQuery comments that claimed collectPage stops after offset+limit matches, and adds tests asserting total equals the full match count with totalApprox undefined when the stream drains. 
--- src/cli/queries/EntityQuery.test.ts | 35 +++++++++++++++++++++++++++++ src/cli/queries/EntityQuery.ts | 29 +++++++++++++++--------- src/cli/queries/FilingQuery.test.ts | 25 +++++++++++++++++++++ src/cli/queries/FilingQuery.ts | 5 +++-- src/cli/queries/PersonQuery.test.ts | 29 ++++++++++++++++++++++++ 5 files changed, 110 insertions(+), 13 deletions(-) diff --git a/src/cli/queries/EntityQuery.test.ts b/src/cli/queries/EntityQuery.test.ts index ffd3804..045c09d 100644 --- a/src/cli/queries/EntityQuery.test.ts +++ b/src/cli/queries/EntityQuery.test.ts @@ -4,6 +4,23 @@ import { resetDependencyInjectionsForTesting } from "../../config/TestingDI"; import { ENTITY_REPOSITORY_TOKEN } from "../../storage/entity/EntitySchema"; import { queryEntities } from "./EntityQuery"; +function makeEntity(cik: number, name: string | null) { + return { + cik, + name, + type: null, + sic: null, + ein: null, + description: null, + website: null, + investor_website: null, + category: null, + fiscal_year: null, + state_incorporation: null, + state_incorporation_desc: null, + }; +} + describe("queryEntities", () => { beforeEach(() => { resetDependencyInjectionsForTesting(); @@ -161,6 +178,24 @@ describe("queryEntities", () => { expect(result.total).toBe(10); }); + it("streamed search reports the FULL match count, not offset+limit", async () => { + // H1 regression: collectPage used to stop at offset+limit and report + // that as total, so total was a constant equal to the page end. Now + // it counts every match. 20 entities match "acme"; with limit 3 the + // window has 3 rows but total must be the full 20, and because the + // stream drained well under the cap totalApprox must be undefined. 
+ const repo = globalServiceRegistry.get(ENTITY_REPOSITORY_TOKEN); + for (let i = 1; i <= 20; i++) { + await repo.put(makeEntity(i, `Acme ${i}`)); + } + await repo.put(makeEntity(999, "Globex")); + + const result = await queryEntities({ search: "acme", limit: 3, offset: 0 }); + expect(result.rows.length).toBe(3); + expect(result.total).toBe(20); + expect(result.totalApprox).toBeUndefined(); + }); + it("filters by SIC code", async () => { const repo = globalServiceRegistry.get(ENTITY_REPOSITORY_TOKEN); await repo.put({ diff --git a/src/cli/queries/EntityQuery.ts b/src/cli/queries/EntityQuery.ts index 7f188bb..4762a8a 100644 --- a/src/cli/queries/EntityQuery.ts +++ b/src/cli/queries/EntityQuery.ts @@ -18,16 +18,19 @@ export interface EntityQueryParams { export interface QueryResult { readonly rows: T[]; /** - * When `totalApprox` is set, `total` is the number of matches observed so - * far (offset + limit, give or take), not the exact dataset cardinality. - * Pagination UX should render this as a lower-bound (e.g. "≥ N") rather - * than an exact count. Used when streaming substring searches that - * cannot be pushed down to the database — forcing an exact total would - * require a full table scan. + * When `totalApprox` is set, `total` is the number of matches counted + * before the streaming soft cap fired (a lower bound), not the exact + * dataset cardinality. Pagination UX should render this as a lower + * bound (e.g. "≥ N") rather than an exact count. Used when streaming + * substring searches that cannot be pushed down to the database — a + * truly exact total would require an unbounded full table scan. + * + * When `totalApprox` is absent, the stream drained and `total` is the + * exact match count. */ readonly total: number; readonly totalApprox?: { - /** The match count we got to before stopping. */ + /** The match count we got to before stopping (the soft cap). */ readonly atLeast: number; /** True if the iterator drained — in which case `total` is exact. 
*/ readonly exhausted: boolean; @@ -109,7 +112,9 @@ export async function queryEntities(params: EntityQueryParams): Promise e.name !== null && e.name.toLowerCase().includes(searchLower); @@ -120,9 +125,11 @@ export async function queryEntities(params: EntityQueryParams): Promise { diff --git a/src/cli/queries/FilingQuery.test.ts b/src/cli/queries/FilingQuery.test.ts index b80bc96..62fdce7 100644 --- a/src/cli/queries/FilingQuery.test.ts +++ b/src/cli/queries/FilingQuery.test.ts @@ -99,6 +99,31 @@ describe("queryFilings", () => { expect(result.rows[0].primary_doc_description).toBe("Annual Report"); }); + it("streamed search reports the FULL match count with no totalApprox when drained", async () => { + // H1 regression: streamed total used to be pinned at offset+limit. It + // must now equal the full count of matching rows, and totalApprox must + // be undefined because the stream drained under the soft cap. + for (let i = 1; i <= 12; i++) { + await repo.put( + makeFiling({ + accession_number: `0001-26-${String(i).padStart(3, "0")}`, + primary_doc_description: "Annual Report", + }) + ); + } + await repo.put( + makeFiling({ + accession_number: "0001-26-999", + primary_doc_description: "Quarterly Report", + }) + ); + + const result = await queryFilings({ search: "annual", limit: 4, offset: 0 }); + expect(result.rows.length).toBe(4); + expect(result.total).toBe(12); + expect(result.totalApprox).toBeUndefined(); + }); + it("combines filters", async () => { await repo.put( makeFiling({ diff --git a/src/cli/queries/FilingQuery.ts b/src/cli/queries/FilingQuery.ts index 33ff013..bed4d77 100644 --- a/src/cli/queries/FilingQuery.ts +++ b/src/cli/queries/FilingQuery.ts @@ -72,8 +72,9 @@ export async function queryFilings(params: FilingQueryParams): Promise { if (params.after !== undefined && f.filing_date < params.after) return false; diff --git a/src/cli/queries/PersonQuery.test.ts b/src/cli/queries/PersonQuery.test.ts index 1ebdd08..6831049 100644 --- 
a/src/cli/queries/PersonQuery.test.ts +++ b/src/cli/queries/PersonQuery.test.ts @@ -82,6 +82,35 @@ describe("queryPersons", () => { expect(result.rows[0].first_name).toBe("John"); }); + it("streamed search reports the FULL match count with no totalApprox when drained", async () => { + // H1 regression: streamed total was pinned at offset+limit. With 15 + // "Aaron" matches and a limit of 4, total must be 15 (the full match + // count) and totalApprox must be undefined since the stream drained. + for (let i = 1; i <= 15; i++) { + await repo.put( + makeObservation({ + observation_id: i, + accession_number: `000123456${i}-25-000001`, + first_name: "Aaron", + last_name: `Surname${i}`, + }) + ); + } + await repo.put( + makeObservation({ + observation_id: 999, + accession_number: "0009999999-25-000001", + first_name: "Zelda", + last_name: "Other", + }) + ); + + const result = await queryPersons({ search: "aaron", limit: 4, offset: 0 }); + expect(result.rows.length).toBe(4); + expect(result.total).toBe(15); + expect(result.totalApprox).toBeUndefined(); + }); + it("filters by relationship (partial match)", async () => { await repo.put(makeObservation({ observation_id: 1, relationship: "Director" })); await repo.put( From 7a06139637ac0f41524a8fb65afcfaf6fa8536b5 Mon Sep 17 00:00:00 2001 From: Steven Roussey Date: Wed, 27 May 2026 10:20:46 -0700 Subject: [PATCH 3/4] fix(test): isolate cik-name bulk writer test from leaked DB config bun test shares module state across files in a worker, and FetchDailyIndexTask.test.ts / FetchQuarterlyIndexTask.test.ts call EnvToDI() at module load, which registers SEC_DB_FOLDER / SEC_DB_NAME / SEC_DB_TYPE=sqlite (from .env.test) into the shared globalServiceRegistry. The registry has no unregister API, so once a sibling runs first those tokens stay set. 
FetchAllCikNamesTask.test.ts then (a) failed its `has(SEC_DB_FOLDER) === false` precondition and (b) drove createCikNameBulkWriter() down the SQLite fast path -> getDb() opened ./sec-db/edgar.sqlite and prepared an INSERT against a non-existent cik_names table, throwing a bun:sqlite prepare error. This was latent since #111 and only surfaced now because adding _streamMatches.test.ts in this PR shifted bun's test-file execution order so the index task tests run before this one in the same worker. Make the test deterministic: pin SEC_DB_TYPE to a non-sqlite/non-postgres value in beforeEach so the writer always routes to the repository (in-memory) writer regardless of any leaked config, and drop the order-dependent has(SEC_DB_FOLDER) assertion. --- .../ciknames/FetchAllCikNamesTask.test.ts | 19 ++++++++++++------- 1 file changed, 12 insertions(+), 7 deletions(-) diff --git a/src/task/ciknames/FetchAllCikNamesTask.test.ts b/src/task/ciknames/FetchAllCikNamesTask.test.ts index 4098e1f..fcffeaf 100644 --- a/src/task/ciknames/FetchAllCikNamesTask.test.ts +++ b/src/task/ciknames/FetchAllCikNamesTask.test.ts @@ -7,7 +7,7 @@ import { beforeEach, describe, expect, it } from "bun:test"; import { globalServiceRegistry } from "workglow"; import { resetDependencyInjectionsForTesting } from "../../config/TestingDI"; -import { SEC_DB_FOLDER } from "../../config/tokens"; +import { SEC_DB_TYPE } from "../../config/tokens"; import { createCikNameBulkWriter, type CikNameRow, @@ -26,15 +26,20 @@ import { describe("createCikNameBulkWriter", () => { beforeEach(() => { resetDependencyInjectionsForTesting(); + // Pin SEC_DB_TYPE to a value that is neither "sqlite" nor "postgres" so + // createCikNameBulkWriter() deterministically selects the in-memory + // repository writer. 
bun shares module state across test files in a + // worker, and sibling task tests (FetchDailyIndexTask/ + // FetchQuarterlyIndexTask) call EnvToDI() at module load, which + // registers SEC_DB_FOLDER / SEC_DB_NAME / SEC_DB_TYPE=sqlite from + // .env.test into the shared registry. The registry has no unregister + // API, so without this pin a leaked sqlite config would route the + // writer down the getDb() fast path and prepare an INSERT against a + // cik_names table that does not exist in the test SQLite file. + globalServiceRegistry.registerInstance(SEC_DB_TYPE, "memory" as unknown as "sqlite"); }); it("falls back to the repository writer when no real DB is configured", async () => { - // SEC_DB_TYPE may be left as "sqlite" by a sibling test that called - // EnvToDI() at module-load time. Without SEC_DB_FOLDER the SQLite fast - // path can't run (getDb() would throw), so the writer must route - // through the in-memory repository. - expect(globalServiceRegistry.has(SEC_DB_FOLDER)).toBe(false); - const writer = createCikNameBulkWriter(); const rows: CikNameRow[] = [ { cik: 320193, name: "APPLE INC" }, From 45308c8b21bff2332bb8719a27e69512d50d1289 Mon Sep 17 00:00:00 2001 From: Steven Roussey Date: Wed, 27 May 2026 12:41:40 -0700 Subject: [PATCH 4/4] fix(cli): fill window past cap in collectPage; simplify totalApprox; clarify test cast --- src/cli/output/TableRenderer.ts | 10 ++--- src/cli/queries/CikQuery.ts | 9 ++--- src/cli/queries/CrowdfundingQuery.ts | 2 +- src/cli/queries/EntityQuery.ts | 7 ++-- src/cli/queries/FactsQuery.ts | 2 +- src/cli/queries/FilingQuery.ts | 4 +- src/cli/queries/OfferingQuery.ts | 2 +- src/cli/queries/PersonQuery.ts | 2 +- src/cli/queries/_streamMatches.ts | 37 +++++++++++++------ .../ciknames/FetchAllCikNamesTask.test.ts | 11 +++++- 10 files changed, 53 insertions(+), 33 deletions(-) diff --git a/src/cli/output/TableRenderer.ts b/src/cli/output/TableRenderer.ts index a45d67f..e92426d 100644 --- a/src/cli/output/TableRenderer.ts +++ 
b/src/cli/output/TableRenderer.ts @@ -12,12 +12,11 @@ export interface RenderOptions { /** * Set when the displayed `total` is a lower bound — the underlying * query streamed and stopped after collecting offset+limit matches - * without exhausting the dataset. Rendered as "≥ N" with a hint to - * narrow the filter. + * without exhausting the dataset. Its PRESENCE marks `total` as + * approximate; rendered as "≥ N" with a hint to narrow the filter. */ readonly totalApprox?: { readonly atLeast: number; - readonly exhausted: boolean; }; } @@ -94,8 +93,9 @@ function renderTextTable( const start = count === 0 ? 0 : offset + 1; const end = count === 0 ? 0 : offset + count; lines.push(""); - const isApprox = - options.totalApprox !== undefined && options.totalApprox.exhausted === false; + // The presence of totalApprox is the "approximate/capped, more may + // exist" signal — total is a lower bound, so render "≥ N". + const isApprox = options.totalApprox !== undefined; const totalLabel = isApprox ? `≥ ${options.total}` : `${options.total}`; lines.push(`Showing ${start}-${end} of ${totalLabel} results`); if (isApprox) { diff --git a/src/cli/queries/CikQuery.ts b/src/cli/queries/CikQuery.ts index aa659c6..de4c913 100644 --- a/src/cli/queries/CikQuery.ts +++ b/src/cli/queries/CikQuery.ts @@ -39,8 +39,7 @@ export interface CikQueryResult extends QueryResult { * (so even `--exact` can't push down: SEC stores names as * "Apple Inc." and a user querying "APPLE INC." would miss). Capped * at `MAX_FUZZY_MATCHES` so the worst case is bounded; if the cap - * fires, `totalApprox.exhausted` is `false` and the UI renders - * "≥ N". + * fires, `totalApprox` is set and the UI renders "≥ N". 
*/ export async function queryCiks(params: CikQueryParams): Promise { const repo = globalServiceRegistry.get(CIK_NAME_REPOSITORY_TOKEN); @@ -59,8 +58,8 @@ export async function queryCiks(params: CikQueryParams): Promise // because workglow's equality is case-sensitive and there is no LIKE // operator — both case-insensitive exact match and prefix/substring // matches have to be evaluated client-side. Capped at MAX_FUZZY_MATCHES - // so the worst case is bounded; if the cap fires, `totalApprox.exhausted` - // is `false` and the UI renders "≥ N". + // so the worst case is bounded; if the cap fires, `totalApprox` is set + // and the UI renders "≥ N". // // Unlike collectPage (which buffers only the requested window because it // preserves stream order), CikQuery must buffer ALL matches up to the @@ -113,6 +112,6 @@ export async function queryCiks(params: CikQueryParams): Promise rows: matches.slice(offset, offset + limit).map((m) => m.row), total: matches.length, tableEmpty: !anyRowSeen, - ...(exhausted ? {} : { totalApprox: { atLeast: matches.length, exhausted: false } }), + ...(exhausted ? {} : { totalApprox: { atLeast: matches.length } }), }; } diff --git a/src/cli/queries/CrowdfundingQuery.ts b/src/cli/queries/CrowdfundingQuery.ts index 07a5e5a..f925025 100644 --- a/src/cli/queries/CrowdfundingQuery.ts +++ b/src/cli/queries/CrowdfundingQuery.ts @@ -64,5 +64,5 @@ export async function queryCrowdfunding( ); // totalApprox is the "this number is a lower bound" signal — only // emit it when the stream was capped, not when it drained. - return exhausted ? { rows, total } : { rows, total, totalApprox: { atLeast: total, exhausted } }; + return exhausted ? 
{ rows, total } : { rows, total, totalApprox: { atLeast: total } }; } diff --git a/src/cli/queries/EntityQuery.ts b/src/cli/queries/EntityQuery.ts index 4762a8a..f2a88fc 100644 --- a/src/cli/queries/EntityQuery.ts +++ b/src/cli/queries/EntityQuery.ts @@ -32,8 +32,6 @@ export interface QueryResult { readonly totalApprox?: { /** The match count we got to before stopping (the soft cap). */ readonly atLeast: number; - /** True if the iterator drained — in which case `total` is exact. */ - readonly exhausted: boolean; }; } @@ -144,6 +142,7 @@ export async function queryEntities(params: EntityQueryParams): Promise( * * Counting all matches makes `total` a meaningful number rather than a * constant equal to the page end: it is the number of matches observed up - * to `maxScan`. Memory stays O(limit) — only rows inside the requested - * window are retained; everything before `offset` and after + * to the scan bound. Memory stays O(limit) — only rows inside the + * requested window are retained; everything before `offset` and after * `offset + limit` is counted then discarded. * - * Stops early when the running match count reaches `maxScan` - * (`exhausted: false` — more matches may exist beyond the cap) or when the - * iterator drains (`exhausted: true` — `total` is exact). + * The requested window is ALWAYS filled to `limit` whenever enough matches + * exist: the effective scan bound is `Math.max(maxScan, offset + limit)`, + * so `maxScan` can never truncate the window — it only bounds the SURPLUS + * count past the window used for the lower bound. (A naive `matched >= + * maxScan` break would short the final page whenever `offset + limit > + * maxScan`, returning fewer than `limit` rows and making the renderer + * advertise an always-empty next-page offset.) + * + * Stops when the running match count reaches the scan bound + * (`exhausted: false` — more matches may exist beyond the bound) or when + * the iterator drains (`exhausted: true` — `total` is exact). 
 * * Callers fold this into a `totalApprox` so the UI can render "≥ N" when * the cap fired, instead of pretending to know the exact match count. * - * @param maxScan Soft cap on matches counted; defaults to the shared - * `MAX_FUZZY_MATCHES`. - * @returns `total` — matches counted up to `maxScan`. `exhausted` — - * `false` if the cap stopped us (more may exist), `true` if the iterator + * @param maxScan Soft cap on the total matches counted, raised to + * `offset + limit` when the window extends past it; defaults to the shared + * `MAX_FUZZY_MATCHES`. The window is always filled regardless of this value. + * @returns `total` — matches counted up to the scan bound. `exhausted` — + * `false` if the bound stopped us (more may exist), `true` if the iterator * drained (in which case `total` is exact). */ export async function collectPage( @@ -81,6 +90,10 @@ export async function collectPage( maxScan: number = MAX_FUZZY_MATCHES ): Promise<{ rows: T[]; total: number; exhausted: boolean }> { const windowEnd = offset + limit; + // The cap bounds the COUNT, never the WINDOW: always scan at least far + // enough to fill [offset, offset + limit), then keep counting up to + // `maxScan` for the lower bound. + const scanLimit = Math.max(maxScan, windowEnd); const window: T[] = []; let matched = 0; for await (const row of iter) { @@ -89,11 +102,11 @@ export async function collectPage( window.push(row); } matched++; - if (matched >= maxScan) { - // Hit the soft cap: there may be more matches we never counted. + if (matched >= scanLimit) { + // Hit the scan bound: there may be more matches we never counted. return { rows: window, total: matched, exhausted: false }; } } - // Iterator drained: `matched` is the exact total. + // Iterator drained before the scan bound: `matched` is the exact total.
return { rows: window, total: matched, exhausted: true }; } diff --git a/src/task/ciknames/FetchAllCikNamesTask.test.ts b/src/task/ciknames/FetchAllCikNamesTask.test.ts index fcffeaf..e4c17cf 100644 --- a/src/task/ciknames/FetchAllCikNamesTask.test.ts +++ b/src/task/ciknames/FetchAllCikNamesTask.test.ts @@ -36,7 +36,16 @@ describe("createCikNameBulkWriter", () => { // API, so without this pin a leaked sqlite config would route the // writer down the getDb() fast path and prepare an INSERT against a // cik_names table that does not exist in the test SQLite file. - globalServiceRegistry.registerInstance(SEC_DB_TYPE, "memory" as unknown as "sqlite"); + // + // "memory" is intentionally OUTSIDE the token's declared + // "sqlite" | "postgres" union: at runtime it is the sentinel that makes + // createCikNameBulkWriter() fall through to the repository writer, but + // the type only admits the two real backends, so we cast through + // `unknown` to register the out-of-union sentinel. + globalServiceRegistry.registerInstance( + SEC_DB_TYPE, + "memory" as unknown as "sqlite" | "postgres" + ); }); it("falls back to the repository writer when no real DB is configured", async () => {