Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 5 additions & 5 deletions src/cli/output/TableRenderer.ts
Original file line number Diff line number Diff line change
Expand Up @@ -12,12 +12,11 @@ export interface RenderOptions {
/**
* Set when the displayed `total` is a lower bound — the underlying
* query streamed and stopped after collecting offset+limit matches
* without exhausting the dataset. Rendered as "≥ N" with a hint to
* narrow the filter.
* without exhausting the dataset. Its PRESENCE marks `total` as
* approximate; rendered as "≥ N" with a hint to narrow the filter.
*/
readonly totalApprox?: {
readonly atLeast: number;
readonly exhausted: boolean;
};
}

Expand Down Expand Up @@ -94,8 +93,9 @@ function renderTextTable(
const start = count === 0 ? 0 : offset + 1;
const end = count === 0 ? 0 : offset + count;
lines.push("");
const isApprox =
options.totalApprox !== undefined && options.totalApprox.exhausted === false;
// The presence of totalApprox is the "approximate/capped, more may
// exist" signal — total is a lower bound, so render "≥ N".
const isApprox = options.totalApprox !== undefined;
const totalLabel = isApprox ? `≥ ${options.total}` : `${options.total}`;
lines.push(`Showing ${start}-${end} of ${totalLabel} results`);
if (isApprox) {
Expand Down
26 changes: 12 additions & 14 deletions src/cli/queries/CikQuery.ts
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
import { globalServiceRegistry } from "workglow";
import { CIK_NAME_REPOSITORY_TOKEN, type CikNameType } from "../../storage/entity/CikNameSchema";
import type { QueryResult } from "./EntityQuery";
import { MAX_FUZZY_MATCHES } from "./_streamMatches";

export interface CikQueryParams {
readonly name?: string;
Expand All @@ -19,15 +20,6 @@ export interface CikQueryResult extends QueryResult<CikNameType> {
readonly tableEmpty: boolean;
}

/**
* Soft cap on substring/prefix matches we'll collect before sorting. Stops
* the empty-needle case (which previously walked the entire ~1M-row table)
* and any pathologically broad needle from exhausting memory. Picked so a
* normal `offset+limit` of a few hundred has plenty of headroom for the
* rank-based reordering.
*/
const MAX_FUZZY_MATCHES = 1000;

/**
* Queries the `cik_names` table for companies whose name matches the given
* needle. Case-insensitive. Ranks exact match first, then prefix, then
Expand All @@ -47,8 +39,7 @@ const MAX_FUZZY_MATCHES = 1000;
* (so even `--exact` can't push down: SEC stores names as
* "Apple Inc." and a user querying "APPLE INC." would miss). Capped
* at `MAX_FUZZY_MATCHES` so the worst case is bounded; if the cap
* fires, `totalApprox.exhausted` is `false` and the UI renders
* "≥ N".
* fires, `totalApprox` is set and the UI renders "≥ N".
*/
export async function queryCiks(params: CikQueryParams): Promise<CikQueryResult> {
const repo = globalServiceRegistry.get(CIK_NAME_REPOSITORY_TOKEN);
Expand All @@ -67,8 +58,15 @@ export async function queryCiks(params: CikQueryParams): Promise<CikQueryResult>
// because workglow's equality is case-sensitive and there is no LIKE
// operator — both case-insensitive exact match and prefix/substring
// matches have to be evaluated client-side. Capped at MAX_FUZZY_MATCHES
// so the worst case is bounded; if the cap fires, `totalApprox.exhausted`
// is `false` and the UI renders "≥ N".
// so the worst case is bounded; if the cap fires, `totalApprox` is set
// and the UI renders "≥ N".
//
// Unlike collectPage (which buffers only the requested window because it
// preserves stream order), CikQuery must buffer ALL matches up to the
// cap: ranking reorders the whole match set, so we can't know which rows
// land in [offset, offset + limit) until every match has been collected
// and sorted. The shared MAX_FUZZY_MATCHES cap keeps that buffer bounded
// and keeps the two surfaces' totalApprox semantics identical.
const matches: { row: CikNameType; rank: number }[] = [];
let anyRowSeen = false;
let exhausted = true;
Expand Down Expand Up @@ -114,6 +112,6 @@ export async function queryCiks(params: CikQueryParams): Promise<CikQueryResult>
rows: matches.slice(offset, offset + limit).map((m) => m.row),
total: matches.length,
tableEmpty: !anyRowSeen,
...(exhausted ? {} : { totalApprox: { atLeast: matches.length, exhausted: false } }),
...(exhausted ? {} : { totalApprox: { atLeast: matches.length } }),
};
}
2 changes: 1 addition & 1 deletion src/cli/queries/CrowdfundingQuery.ts
Original file line number Diff line number Diff line change
Expand Up @@ -64,5 +64,5 @@ export async function queryCrowdfunding(
);
// totalApprox is the "this number is a lower bound" signal — only
// emit it when the stream was capped, not when it drained.
return exhausted ? { rows, total } : { rows, total, totalApprox: { atLeast: total, exhausted } };
return exhausted ? { rows, total } : { rows, total, totalApprox: { atLeast: total } };
}
35 changes: 35 additions & 0 deletions src/cli/queries/EntityQuery.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,23 @@ import { resetDependencyInjectionsForTesting } from "../../config/TestingDI";
import { ENTITY_REPOSITORY_TOKEN } from "../../storage/entity/EntitySchema";
import { queryEntities } from "./EntityQuery";

/**
 * Builds a minimal entity fixture for the query tests.
 *
 * Only the identifying pair is caller-supplied; every remaining column is
 * `null` so each test states just the fields it cares about.
 *
 * @param cik - Value stored in the fixture's `cik` field.
 * @param name - Value stored in the fixture's `name` field; may be `null`.
 * @returns A plain object with `cik`, `name`, and all other columns `null`.
 */
function makeEntity(cik: number, name: string | null) {
  // Columns no test in this file populates; kept in schema order so the
  // resulting object has the same key order as a hand-written literal.
  const unsetColumns = {
    type: null,
    sic: null,
    ein: null,
    description: null,
    website: null,
    investor_website: null,
    category: null,
    fiscal_year: null,
    state_incorporation: null,
    state_incorporation_desc: null,
  };
  return { cik, name, ...unsetColumns };
}

describe("queryEntities", () => {
beforeEach(() => {
resetDependencyInjectionsForTesting();
Expand Down Expand Up @@ -161,6 +178,24 @@ describe("queryEntities", () => {
expect(result.total).toBe(10);
});

it("streamed search reports the FULL match count, not offset+limit", async () => {
  // H1 regression guard. The streamed path used to report offset+limit as
  // the total, which made it a constant equal to the end of the page.
  // Seed 20 entities whose name contains "acme" plus one that does not;
  // with limit 3 the page holds 3 rows while total must be all 20, and
  // totalApprox must be absent.
  const entityRepo = globalServiceRegistry.get(ENTITY_REPOSITORY_TOKEN);
  const matchingCiks = Array.from({ length: 20 }, (_, k) => k + 1);
  // Inserts stay sequential (one awaited put at a time), as before.
  for (const cik of matchingCiks) {
    const acme = makeEntity(cik, `Acme ${cik}`);
    await entityRepo.put(acme);
  }
  const nonMatching = makeEntity(999, "Globex");
  await entityRepo.put(nonMatching);

  const { rows, total, totalApprox } = await queryEntities({
    search: "acme",
    limit: 3,
    offset: 0,
  });
  expect(rows.length).toBe(3);
  expect(total).toBe(20);
  expect(totalApprox).toBeUndefined();
});

it("filters by SIC code", async () => {
const repo = globalServiceRegistry.get(ENTITY_REPOSITORY_TOKEN);
await repo.put({
Expand Down
36 changes: 21 additions & 15 deletions src/cli/queries/EntityQuery.ts
Original file line number Diff line number Diff line change
Expand Up @@ -18,19 +18,20 @@ export interface EntityQueryParams {
export interface QueryResult<T> {
readonly rows: T[];
/**
* When `totalApprox` is set, `total` is the number of matches observed so
* far (offset + limit, give or take), not the exact dataset cardinality.
* Pagination UX should render this as a lower-bound (e.g. "≥ N") rather
* than an exact count. Used when streaming substring searches that
* cannot be pushed down to the database — forcing an exact total would
* require a full table scan.
* When `totalApprox` is set, `total` is the number of matches counted
* before the streaming soft cap fired (a lower bound), not the exact
* dataset cardinality. Pagination UX should render this as a lower
* bound (e.g. "≥ N") rather than an exact count. Used when streaming
* substring searches that cannot be pushed down to the database — a
* truly exact total would require an unbounded full table scan.
*
* When `totalApprox` is absent, the stream drained and `total` is the
* exact match count.
*/
readonly total: number;
readonly totalApprox?: {
/** The match count we got to before stopping. */
/** The match count we got to before stopping (the soft cap). */
readonly atLeast: number;
/** True if the iterator drained — in which case `total` is exact. */
readonly exhausted: boolean;
};
}

Expand Down Expand Up @@ -109,7 +110,9 @@ export async function queryEntities(params: EntityQueryParams): Promise<QueryRes
return { rows, total };
}

// Substring search — stream and stop after offset + limit matches.
// Substring search — stream matches, counting every one up to the soft
// cap so `total` is a meaningful lower bound, while retaining only the
// requested window in memory.
const searchLower = params.search!.toLowerCase();
const predicate = (e: Entity): boolean =>
e.name !== null && e.name.toLowerCase().includes(searchLower);
Expand All @@ -120,9 +123,11 @@ export async function queryEntities(params: EntityQueryParams): Promise<QueryRes
limit
);

// Apply sort to the collected window. With the cap this stays bounded
// and matches the previous semantics (sort runs over the whole match
// set when the stream drains, or over the truncated window otherwise).
// Sort the collected window only. collectPage preserves stream order and
// retains just [offset, offset + limit), so this re-orders the page the
// user sees — it does NOT globally sort the full match set. (A global
// sort would require buffering every match, which the window-only design
// deliberately avoids.)
if (params.sort) {
const sortKey = params.sort as keyof Entity;
rows.sort((a, b) => {
Expand All @@ -137,6 +142,7 @@ export async function queryEntities(params: EntityQueryParams): Promise<QueryRes
}

// totalApprox is the "this number is a lower bound" signal — only
// emit it when the stream was capped, not when it drained.
return exhausted ? { rows, total } : { rows, total, totalApprox: { atLeast: total, exhausted } };
// emit it when the stream was capped, not when it drained. Its presence
// (not any field on it) is what marks `total` as approximate.
return exhausted ? { rows, total } : { rows, total, totalApprox: { atLeast: total } };
}
2 changes: 1 addition & 1 deletion src/cli/queries/FactsQuery.ts
Original file line number Diff line number Diff line change
Expand Up @@ -51,5 +51,5 @@ export async function queryFacts(params: FactsQueryParams): Promise<QueryResult<
);
// totalApprox is the "this number is a lower bound" signal — only
// emit it when the stream was capped, not when it drained.
return exhausted ? { rows, total } : { rows, total, totalApprox: { atLeast: total, exhausted } };
return exhausted ? { rows, total } : { rows, total, totalApprox: { atLeast: total } };
}
25 changes: 25 additions & 0 deletions src/cli/queries/FilingQuery.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -99,6 +99,31 @@ describe("queryFilings", () => {
expect(result.rows[0].primary_doc_description).toBe("Annual Report");
});

it("streamed search reports the FULL match count with no totalApprox when drained", async () => {
  // H1 regression guard: the streamed total used to be pinned at
  // offset+limit. Twelve "Annual Report" filings and one "Quarterly Report"
  // are seeded; searching "annual" with limit 4 must return a 4-row page,
  // a total of 12, and no totalApprox.
  const annualSequence = Array.from({ length: 12 }, (_, k) => k + 1);
  // Inserts stay sequential (one awaited put at a time), as before.
  for (const seq of annualSequence) {
    const suffix = String(seq).padStart(3, "0");
    const annual = makeFiling({
      accession_number: `0001-26-${suffix}`,
      primary_doc_description: "Annual Report",
    });
    await repo.put(annual);
  }
  const quarterly = makeFiling({
    accession_number: "0001-26-999",
    primary_doc_description: "Quarterly Report",
  });
  await repo.put(quarterly);

  const { rows, total, totalApprox } = await queryFilings({
    search: "annual",
    limit: 4,
    offset: 0,
  });
  expect(rows.length).toBe(4);
  expect(total).toBe(12);
  expect(totalApprox).toBeUndefined();
});

it("combines filters", async () => {
await repo.put(
makeFiling({
Expand Down
9 changes: 5 additions & 4 deletions src/cli/queries/FilingQuery.ts
Original file line number Diff line number Diff line change
Expand Up @@ -72,8 +72,9 @@ export async function queryFilings(params: FilingQueryParams): Promise<QueryResu
}

// Streaming path. Push criteria down to the DB; apply substring and the
// closing range bound in JS. We stop once we have offset + limit
// matches so memory stays bounded.
// closing range bound in JS. collectPage counts every match up to the
// soft cap (so `total` is a meaningful lower bound) while keeping only
// the requested window in memory.
const searchLower = hasSearch ? params.search!.toLowerCase() : null;
const predicate = (f: Filing): boolean => {
if (params.after !== undefined && f.filing_date < params.after) return false;
Expand All @@ -93,7 +94,7 @@ export async function queryFilings(params: FilingQueryParams): Promise<QueryResu
);
// Only emit totalApprox when we actually capped the stream. If the
// iterator drained, `total` is exact; consumers (TableRenderer, JSON
// output) treat the presence of totalApprox as the "this is a lower
// output) treat the PRESENCE of totalApprox as the "this is a lower
// bound" signal.
return exhausted ? { rows, total } : { rows, total, totalApprox: { atLeast: total, exhausted } };
return exhausted ? { rows, total } : { rows, total, totalApprox: { atLeast: total } };
}
2 changes: 1 addition & 1 deletion src/cli/queries/OfferingQuery.ts
Original file line number Diff line number Diff line change
Expand Up @@ -83,5 +83,5 @@ export async function queryOfferings(
);
// totalApprox is the "this number is a lower bound" signal — only
// emit it when the stream was capped, not when it drained.
return exhausted ? { rows, total } : { rows, total, totalApprox: { atLeast: total, exhausted } };
return exhausted ? { rows, total } : { rows, total, totalApprox: { atLeast: total } };
}
29 changes: 29 additions & 0 deletions src/cli/queries/PersonQuery.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -82,6 +82,35 @@ describe("queryPersons", () => {
expect(result.rows[0].first_name).toBe("John");
});

it("streamed search reports the FULL match count with no totalApprox when drained", async () => {
  // H1 regression guard: the streamed total was pinned at offset+limit.
  // Fifteen observations with first name "Aaron" and one "Zelda" are
  // seeded; searching "aaron" with limit 4 must return a 4-row page, a
  // total of 15, and no totalApprox.
  const aaronIds = Array.from({ length: 15 }, (_, k) => k + 1);
  // Inserts stay sequential (one awaited put at a time), as before.
  for (const id of aaronIds) {
    const aaron = makeObservation({
      observation_id: id,
      accession_number: `000123456${id}-25-000001`,
      first_name: "Aaron",
      last_name: `Surname${id}`,
    });
    await repo.put(aaron);
  }
  const zelda = makeObservation({
    observation_id: 999,
    accession_number: "0009999999-25-000001",
    first_name: "Zelda",
    last_name: "Other",
  });
  await repo.put(zelda);

  const { rows, total, totalApprox } = await queryPersons({
    search: "aaron",
    limit: 4,
    offset: 0,
  });
  expect(rows.length).toBe(4);
  expect(total).toBe(15);
  expect(totalApprox).toBeUndefined();
});

it("filters by relationship (partial match)", async () => {
await repo.put(makeObservation({ observation_id: 1, relationship: "Director" }));
await repo.put(
Expand Down
2 changes: 1 addition & 1 deletion src/cli/queries/PersonQuery.ts
Original file line number Diff line number Diff line change
Expand Up @@ -76,5 +76,5 @@ export async function queryPersons(
);
// totalApprox is the "this number is a lower bound" signal — only
// emit it when the stream was capped, not when it drained.
return exhausted ? { rows, total } : { rows, total, totalApprox: { atLeast: total, exhausted } };
return exhausted ? { rows, total } : { rows, total, totalApprox: { atLeast: total } };
}
60 changes: 60 additions & 0 deletions src/cli/queries/_streamMatches.test.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,60 @@
/**
* @license
* Copyright 2026 Steven Roussey <sroussey@gmail.com>
* SPDX-License-Identifier: Apache-2.0
*/

import { describe, expect, it } from "bun:test";
import { MAX_FUZZY_MATCHES, collectPage } from "./_streamMatches";

/**
 * Async source of the integers `0, 1, …, count - 1`, in ascending order.
 * Yields nothing when `count` is zero or negative.
 *
 * @param count - Number of consecutive integers to yield, starting at 0.
 */
async function* gen(count: number): AsyncGenerator<number, void, undefined> {
  let next = 0;
  while (next < count) {
    yield next;
    next += 1;
  }
}

// Contract tests for collectPage, called as (source, offset, limit, maxScan?).
// Each case feeds it the integers 0..count-1 from `gen`, so the expected
// page is simply the integers in [offset, offset + limit).
describe("collectPage", () => {
  it("counts the FULL match set when it drains below the cap", async () => {
    // H1 fix: 50 matches with window [10, 15) — total is the full 50,
    // not offset+limit (15).
    const { rows, total, exhausted } = await collectPage(gen(50), 10, 5, 1000);
    expect(total).toBe(50);
    expect(exhausted).toBe(true);
    expect(rows).toEqual([10, 11, 12, 13, 14]);
  });

  it("reports total === maxScan and exhausted false when the cap fires", async () => {
    // 500 matches are available but the cap is 100: counting stops at the
    // cap and `exhausted: false` signals that more may exist.
    const { rows, total, exhausted } = await collectPage(gen(500), 0, 5, 100);
    expect(total).toBe(100);
    expect(exhausted).toBe(false);
    expect(rows).toEqual([0, 1, 2, 3, 4]);
  });

  it("returns empty rows but the exact total when offset is past the match set", async () => {
    // Offset 20 lies beyond the 8 available matches.
    const { rows, total, exhausted } = await collectPage(gen(8), 20, 5, 1000);
    expect(rows).toEqual([]);
    expect(total).toBe(8);
    expect(exhausted).toBe(true);
  });

  it("handles an empty iterator: total 0, exhausted true", async () => {
    const { rows, total, exhausted } = await collectPage(gen(0), 0, 5, 1000);
    expect(rows).toEqual([]);
    expect(total).toBe(0);
    expect(exhausted).toBe(true);
  });

  it("keeps the window O(limit) even when far more matches precede it", async () => {
    // 100 matches precede the 3-row window and 697 follow it; only the
    // window comes back while all 800 are counted.
    const { rows, total, exhausted } = await collectPage(gen(800), 100, 3, 1000);
    expect(rows).toEqual([100, 101, 102]);
    expect(total).toBe(800);
    expect(exhausted).toBe(true);
  });

  it("defaults maxScan to the shared MAX_FUZZY_MATCHES cap", async () => {
    // No fourth argument: the source holds 500 more than the shared cap,
    // so the cap fires at exactly MAX_FUZZY_MATCHES.
    const oversized = gen(MAX_FUZZY_MATCHES + 500);
    const { rows, total, exhausted } = await collectPage(oversized, 0, 2);
    expect(total).toBe(MAX_FUZZY_MATCHES);
    expect(exhausted).toBe(false);
    expect(rows).toEqual([0, 1]);
  });
});
Loading