diff --git a/src/sec/forms/insider-trading/Form_144.storage.test.ts b/src/sec/forms/insider-trading/Form_144.storage.test.ts index 3175605..8cba2fb 100644 --- a/src/sec/forms/insider-trading/Form_144.storage.test.ts +++ b/src/sec/forms/insider-trading/Form_144.storage.test.ts @@ -189,6 +189,79 @@ describe("Form 144 storage", () => { expect(filing?.no_of_units_sold).toBe(129915); }); + it("stores null (not a fabricated 0) for a whitespace-only aggregateMarketValue", async () => { + const accession = "0001663266-26-000003"; + const xml = readFileSync( + join(__dirname, "mock_data", "form-144", "000166326626000003-primary_doc.xml"), + "utf-8" + ); + const doc = await Form_144.parse("144", xml); + // Filings have been observed with whitespace-only numeric elements, which + // the previous local num() coerced to 0 via Number(" ") and silently + // fabricated a market value. + doc.formData!.securitiesInformation!.aggregateMarketValue = " "; + await processForm144({ + cik: 1534263, + file_number: "", + accession_number: accession, + filing_date: "2026-05-27", + primary_doc: "x.xml", + form: "144", + doc, + }); + + const filing = await repo.getFiling(accession); + expect(filing?.aggregate_market_value).toBeNull(); + expect(filing?.no_of_units_sold).toBe(129915); + }); + + it("stores null (not a fabricated 0) for whitespace-only grossProceeds on a recent sale", async () => { + const accession = "0001663266-26-000003"; + const xml = readFileSync( + join(__dirname, "mock_data", "form-144", "000166326626000003-primary_doc.xml"), + "utf-8" + ); + const doc = await Form_144.parse("144", xml); + doc.formData!.securitiesSoldInPast3Months![0].grossProceeds = " "; + await processForm144({ + cik: 1534263, + file_number: "", + accession_number: accession, + filing_date: "2026-05-27", + primary_doc: "x.xml", + form: "144", + doc, + }); + + const sales = await repo.getRecentSales(accession); + expect(sales[0].gross_proceeds).toBeNull(); + expect(sales[0].amount_sold).toBe(16814); + }); 
+ + it("stores null (not a fabricated 0) for whitespace-only amountOfSecuritiesAcquired", async () => { + const accession = "0001663266-26-000003"; + const xml = readFileSync( + join(__dirname, "mock_data", "form-144", "000166326626000003-primary_doc.xml"), + "utf-8" + ); + const doc = await Form_144.parse("144", xml); + doc.formData!.securitiesToBeSold![0].amountOfSecuritiesAcquired = " "; + await processForm144({ + cik: 1534263, + file_number: "", + accession_number: accession, + filing_date: "2026-05-27", + primary_doc: "x.xml", + form: "144", + doc, + }); + + const acquisitions = await repo.getAcquisitions(accession); + expect(acquisitions[0].amount_acquired).toBeNull(); + // The second acquisition's populated field is unaffected. + expect(acquisitions[1].amount_acquired).not.toBeNull(); + }); + it("clears stale rows when re-extracted with fewer acquisitions", async () => { const accession = "0001663266-26-000003"; const xml = readFileSync( diff --git a/src/sec/forms/insider-trading/Form_144.storage.ts b/src/sec/forms/insider-trading/Form_144.storage.ts index ed3c52c..d703fa0 100644 --- a/src/sec/forms/insider-trading/Form_144.storage.ts +++ b/src/sec/forms/insider-trading/Form_144.storage.ts @@ -31,23 +31,12 @@ import { getActiveSlot } from "../../../storage/versioning/getActiveSlot"; import { formToExtractorId } from "../../../storage/versioning/extractorIds"; import { Form144Repo } from "../../../storage/form144/Form144Repo"; import type { Form144 } from "./Form_144.schema"; +import { numScalar as num, strScalar as str } from "./_valueHelpers"; type AddressShape = NonNullable< NonNullable["issuerInfo"] >["issuerAddress"]; -function str(s: string | undefined | null): string | null { - if (s === undefined || s === null) return null; - const t = String(s).trim(); - return t === "" ? 
null : t; -} - -function num(n: number | string | undefined | null): number | null { - if (n === undefined || n === null || n === "") return null; - const v = typeof n === "number" ? n : Number(n); - return Number.isFinite(v) ? v : null; -} - // EDGAR Y/N flags. function toBoolYN(raw: string | undefined): boolean { return str(raw)?.toUpperCase() === "Y"; @@ -113,7 +102,9 @@ export async function processForm144({ const activeResolverPersonVersion = personSlot?.semver ?? "1.0.0"; const activeResolverCompanyVersion = companySlot?.semver ?? "1.0.0"; - const extractor_version = "1.0.0"; + // 1.1.0: num() now treats whitespace-only numeric elements as null instead + // of fabricating 0 via Number(" "). Bumped to force production re-extract. + const extractor_version = "1.1.0"; const extractor_id = formToExtractorId(form) ?? "144"; const formData = doc.formData ?? {}; diff --git a/src/sec/forms/insider-trading/OwnershipDocument.storage.test.ts b/src/sec/forms/insider-trading/OwnershipDocument.storage.test.ts index aeb58a9..0047962 100644 --- a/src/sec/forms/insider-trading/OwnershipDocument.storage.test.ts +++ b/src/sec/forms/insider-trading/OwnershipDocument.storage.test.ts @@ -249,6 +249,31 @@ describe("OwnershipDocument storage (Forms 3/4/5)", () => { expect(nonDeriv.price_per_share).toBe(1.405); }); + it("stores null (not 0) for a whitespace-only transactionShares element", async () => { + const accession = "0001493152-26-025476"; + const xml = readFileSync( + join(__dirname, "mock_data", "form-4", "000149315226025476-primary_doc.xml"), + "utf-8" + ); + const doc = await Form_4.parse("4", xml); + const nonDerivTxn = doc.nonDerivativeTable!.nonDerivativeTransaction![0]; + nonDerivTxn.transactionAmounts!.transactionShares!.value = " "; + await processOwnershipForm({ + cik: 1828673, + file_number: "", + accession_number: accession, + filing_date: "2026-05-27", + primary_doc: "x.xml", + form: "4", + doc, + }); + + const txns = await repo.getTransactions(accession); + 
const nonDeriv = txns.find((t) => !t.is_derivative)!; + expect(nonDeriv.shares).toBeNull(); + expect(nonDeriv.price_per_share).toBe(1.405); + }); + it("stores null (not 0) for an empty transactionPricePerShare element", async () => { const accession = "0001493152-26-025476"; const xml = readFileSync( diff --git a/src/sec/forms/insider-trading/OwnershipDocument.storage.ts b/src/sec/forms/insider-trading/OwnershipDocument.storage.ts index d779422..caec23f 100644 --- a/src/sec/forms/insider-trading/OwnershipDocument.storage.ts +++ b/src/sec/forms/insider-trading/OwnershipDocument.storage.ts @@ -35,6 +35,7 @@ import { getActiveSlot } from "../../../storage/versioning/getActiveSlot"; import { isBadPersonField } from "../../../types/edgar/bad-data"; import { parseCikSafely } from "../../../util/parseCik"; import type { OwnershipDocument } from "./OwnershipDocument.schema"; +import { numWrapped as num, strWrapped as str } from "./_valueHelpers"; // EDGAR ownership flags appear as "1"/"0" (X0609) or "true"/"false" (X0607). function toBool(raw: string | undefined): boolean { @@ -43,26 +44,6 @@ function toBool(raw: string | undefined): boolean { return v === "1" || v === "true"; } -// Unwrap a `{ value }` leaf to its string, treating empty as null. -function str(field: { value?: string } | string | undefined): string | null { - if (field === undefined || field === null) return null; - if (typeof field === "string") return field.trim() || null; - const v = field.value; - return v === undefined || v === null || String(v).trim() === "" ? null : String(v).trim(); -} - -// Unwrap a `{ value }` leaf to a finite number, or null. The schema types the -// inner value as a string so that an empty XML element (parsed as "") survives -// Value.Convert intact and reaches this helper, which maps "" -> null. If we -// typed it as a number, Value.Convert would fabricate a 0 here instead. 
-function num(field: { value?: string } | string | undefined): number | null { - if (field === undefined || field === null || typeof field === "string") return null; - const v = field.value; - if (v === undefined || v === null || v.trim() === "") return null; - const n = Number(v); - return Number.isFinite(n) ? n : null; -} - interface OwnershipStorageContext { readonly accession_number: string; readonly extractor_id: string; diff --git a/src/sec/forms/insider-trading/_valueHelpers.test.ts b/src/sec/forms/insider-trading/_valueHelpers.test.ts new file mode 100644 index 0000000..ae13094 --- /dev/null +++ b/src/sec/forms/insider-trading/_valueHelpers.test.ts @@ -0,0 +1,90 @@ +/** + * @license + * Copyright 2025 Steven Roussey + * SPDX-License-Identifier: Apache-2.0 + */ + +import { describe, expect, it } from "bun:test"; +import { numScalar, numWrapped, strScalar, strWrapped } from "./_valueHelpers"; + +describe("strScalar", () => { + it("returns null for undefined/null/empty/whitespace", () => { + expect(strScalar(undefined)).toBeNull(); + expect(strScalar(null)).toBeNull(); + expect(strScalar("")).toBeNull(); + expect(strScalar(" ")).toBeNull(); + expect(strScalar("\t\n")).toBeNull(); + }); + it("trims and returns a non-empty string", () => { + expect(strScalar("abc")).toBe("abc"); + expect(strScalar(" 42 ")).toBe("42"); + expect(strScalar("0")).toBe("0"); + expect(strScalar(0)).toBe("0"); + }); +}); + +describe("numScalar", () => { + it("returns null for undefined/null/empty/whitespace", () => { + expect(numScalar(undefined)).toBeNull(); + expect(numScalar(null)).toBeNull(); + expect(numScalar("")).toBeNull(); + expect(numScalar(" ")).toBeNull(); + expect(numScalar("\t\n")).toBeNull(); + }); + it("coerces trimmed numeric strings", () => { + expect(numScalar("0")).toBe(0); + expect(numScalar(" 42 ")).toBe(42); + expect(numScalar("3.14")).toBe(3.14); + }); + it("returns null for non-numeric input", () => { + expect(numScalar("abc")).toBeNull(); + }); + it("passes 
through finite numbers and rejects non-finite", () => { + expect(numScalar(7)).toBe(7); + expect(numScalar(Number.NaN)).toBeNull(); + expect(numScalar(Number.POSITIVE_INFINITY)).toBeNull(); + }); +}); + +describe("strWrapped", () => { + it("returns null for undefined/null/empty/whitespace", () => { + expect(strWrapped(undefined)).toBeNull(); + expect(strWrapped(null)).toBeNull(); + expect(strWrapped("")).toBeNull(); + expect(strWrapped(" ")).toBeNull(); + expect(strWrapped("\t\n")).toBeNull(); + }); + it("unwraps {value} leaves with the same semantics", () => { + expect(strWrapped({ value: "abc" })).toBe("abc"); + expect(strWrapped({ value: " 42 " })).toBe("42"); + expect(strWrapped({ value: "0" })).toBe("0"); + expect(strWrapped({ value: " " })).toBeNull(); + expect(strWrapped({ value: undefined })).toBeNull(); + expect(strWrapped({})).toBeNull(); + }); + it("accepts a bare string value", () => { + expect(strWrapped("abc")).toBe("abc"); + }); +}); + +describe("numWrapped", () => { + it("returns null for undefined/null and bare strings", () => { + expect(numWrapped(undefined)).toBeNull(); + expect(numWrapped(null)).toBeNull(); + // Bare string is a schema mismatch at a wrapped call site. 
+ expect(numWrapped("42")).toBeNull(); + expect(numWrapped("")).toBeNull(); + expect(numWrapped(" ")).toBeNull(); + }); + it("unwraps {value} leaves and coerces with finite check", () => { + expect(numWrapped({ value: "0" })).toBe(0); + expect(numWrapped({ value: " 42 " })).toBe(42); + expect(numWrapped({ value: "3.14" })).toBe(3.14); + expect(numWrapped({ value: "abc" })).toBeNull(); + expect(numWrapped({ value: "" })).toBeNull(); + expect(numWrapped({ value: " " })).toBeNull(); + expect(numWrapped({ value: "\t\n" })).toBeNull(); + expect(numWrapped({ value: undefined })).toBeNull(); + expect(numWrapped({})).toBeNull(); + }); +}); diff --git a/src/sec/forms/insider-trading/_valueHelpers.ts b/src/sec/forms/insider-trading/_valueHelpers.ts new file mode 100644 index 0000000..a633864 --- /dev/null +++ b/src/sec/forms/insider-trading/_valueHelpers.ts @@ -0,0 +1,61 @@ +/** + * @license + * Copyright 2025 Steven Roussey + * SPDX-License-Identifier: Apache-2.0 + */ + +// Shared null-on-empty value helpers for SEC insider-trading form extractors. +// +// EDGAR XML elements that are present-but-empty (e.g. <aggregateMarketValue></aggregateMarketValue> +// or <transactionPricePerShare><value></value></transactionPricePerShare>) parse to "". We must +// preserve those as null rather than fabricating a 0 via Number("") === 0, +// which would lie to downstream consumers about the filing's actual content. +// +// Form 144 fields are scalar (the value sits directly on the element), while +// Ownership Document fields are wrapped in a `{ value }` leaf so the schema +// can distinguish an empty value from a missing element. The two pairs +// intentionally have different signatures so they can't collapse together +// and silently accept the wrong shape at a call site. + +export function strScalar(n: string | number | undefined | null): string | null { + if (n === undefined || n === null) return null; + const t = String(n).trim(); + return t === "" ? 
null : t; +} + +export function numScalar(n: string | number | undefined | null): number | null { + if (n === undefined || n === null) return null; + const t = String(n).trim(); + if (t === "") return null; + const v = Number(t); + return Number.isFinite(v) ? v : null; +} + +export function strWrapped( + w: { value?: string } | string | undefined | null +): string | null { + if (w === undefined || w === null) return null; + if (typeof w === "string") { + const t = w.trim(); + return t === "" ? null : t; + } + const v = w.value; + if (v === undefined || v === null) return null; + const t = String(v).trim(); + return t === "" ? null : t; +} + +export function numWrapped( + w: { value?: string } | string | undefined | null +): number | null { + if (w === undefined || w === null) return null; + // A bare string at a wrapped call site is a schema mismatch; refuse it + // rather than guessing. + if (typeof w === "string") return null; + const v = w.value; + if (v === undefined || v === null) return null; + const t = String(v).trim(); + if (t === "") return null; + const n = Number(t); + return Number.isFinite(n) ? n : null; +} diff --git a/src/storage/entity/cikNameBulkWriter.ts b/src/storage/entity/cikNameBulkWriter.ts index ca0e409..f6ac79d 100644 --- a/src/storage/entity/cikNameBulkWriter.ts +++ b/src/storage/entity/cikNameBulkWriter.ts @@ -94,12 +94,28 @@ function createPostgresWriter(): CikNameBulkWriter { await client.query("BEGIN"); for (let start = 0; start < rows.length; start += PG_MAX_ROWS_PER_STATEMENT) { const slice = rows.slice(start, start + PG_MAX_ROWS_PER_STATEMENT); + // Per-slice dedup keeps `INSERT ... ON CONFLICT DO UPDATE` from + // failing on duplicate CIKs within a single statement (Postgres + // rejects two rows with the same conflict key in one INSERT). + // Last value wins, matching the SQLite `INSERT OR REPLACE` path. 
+ // Dedup runs AFTER slicing and only shrinks the row set, so the + // 60_000-bind cap (PG_MAX_ROWS_PER_STATEMENT * 2) still holds. + const dedup = new Map(); + for (const r of slice) dedup.set(r.cik, r.name); + if (dedup.size < slice.length) { + console.debug( + `cikNameBulkWriter: dedup dropped ${slice.length - dedup.size} duplicate cik(s) within a ${slice.length}-row slice` + ); + } + if (dedup.size === 0) continue; const values: (number | string)[] = []; const placeholders: string[] = []; - for (let i = 0; i < slice.length; i++) { + let i = 0; + for (const [cik, name] of dedup.entries()) { const base = i * 2; placeholders.push(`($${base + 1}, $${base + 2})`); - values.push(slice[i].cik, slice[i].name); + values.push(cik, name); + i++; } const sql = `INSERT INTO "cik_names" ("cik", "name") VALUES ` + diff --git a/src/task/ciknames/FetchAllCikNamesTask.test.ts b/src/task/ciknames/FetchAllCikNamesTask.test.ts index e4c17cf..62a44f1 100644 --- a/src/task/ciknames/FetchAllCikNamesTask.test.ts +++ b/src/task/ciknames/FetchAllCikNamesTask.test.ts @@ -82,4 +82,18 @@ describe("createCikNameBulkWriter", () => { const repo = globalServiceRegistry.get(CIK_NAME_REPOSITORY_TOKEN); expect((await repo.getAll())?.length ?? 0).toBe(0); }); + + it("dedups duplicate CIKs within a single batch, last value wins", async () => { + const writer = createCikNameBulkWriter(); + await writer.writeBatch([ + { cik: 1, name: "FIRST" }, + { cik: 2, name: "B" }, + { cik: 1, name: "LAST" }, + ]); + await writer.close(); + const repo = globalServiceRegistry.get(CIK_NAME_REPOSITORY_TOKEN); + expect((await repo.get({ cik: 1 }))?.name).toBe("LAST"); + const all = await repo.getAll(); + expect(all?.length).toBe(2); + }); });