diff --git a/backend/BigSet_Data_Collection_Agent/src/agents/extract.ts b/backend/BigSet_Data_Collection_Agent/src/agents/extract.ts index bab859d..b6d8a04 100644 --- a/backend/BigSet_Data_Collection_Agent/src/agents/extract.ts +++ b/backend/BigSet_Data_Collection_Agent/src/agents/extract.ts @@ -13,7 +13,11 @@ import { type ExtractedRecord, type FetchedPage, } from "../models/schemas.js"; -import { deriveRecordSourceUrls } from "../records/source-urls.js"; +import { + deriveRecordSourceUrls, + isHttpUrl, + isUrlLikeColumnName, +} from "../records/source-urls.js"; /** * Extraction is always one source per LLM call in process-pages.ts: @@ -170,6 +174,31 @@ function provenanceUrlColumns(spec: DatasetSpec): ColumnDef[] { return spec.columns.filter(isProvenanceUrlColumn); } +function isUrlLikeColumn(column: ColumnDef): boolean { + return isUrlLikeColumnName(column.name); +} + +function addUrlCellEvidence( + row: Record, + evidence: ExtractedRecord["evidence"], + spec: DatasetSpec, +): void { + const fieldsWithEvidence = new Set(evidence.map((item) => item.field)); + for (const column of spec.columns) { + if (!isUrlLikeColumn(column) || fieldsWithEvidence.has(column.name)) { + continue; + } + const value = row[column.name]; + if (!isHttpUrl(value)) continue; + evidence.push({ + field: column.name, + url: value.trim(), + quote: value.trim(), + }); + fieldsWithEvidence.add(column.name); + } +} + /** Attach evidence URLs and source_urls; keep LLM row and provenance values. */ export function finalizeExtractedRecord( record: LlmExtractionRecord, @@ -190,6 +219,7 @@ export function finalizeExtractedRecord( row[column.name] = pageUrl; } } + addUrlCellEvidence(row, evidence, spec); const source_urls = deriveRecordSourceUrls({ spec, diff --git a/backend/BigSet_Data_Collection_Agent/src/merge/records.ts b/backend/BigSet_Data_Collection_Agent/src/merge/records.ts index 5773ce3..a5ca0e3 100644 --- a/backend/BigSet_Data_Collection_Agent/src/merge/records.ts +++ b/backend/BigSet_Data_Collection_Agent/src/merge/records.ts @@ -1,7 +1,9 @@ import type { DatasetSpec, ExtractedRecord } from "../models/schemas.js"; import { deriveRecordSourceUrls, + isUrlLikeColumnName, scoreDocsUrlForOfficialSource, + scoreUrlForCanonicalSource, } from "../records/source-urls.js"; function normalizeValue(value: unknown): string { @@ -28,7 +30,12 @@ function valuesMatch(a: unknown, b: unknown): boolean { /** Normalize entity names for stable primary-key matching. */ export function normalizePrimaryKey(value: unknown): string { return normalizeValue(value) + .replace( + /\b(?:incorporated|inc|corporation|corp|company|co|llc|ltd|limited|plc)\b\.?$/i, + "", + ) .replace(/\s+/g, " ") + .trim() .replace(/[''`]/g, "'"); } @@ -136,7 +143,12 @@ export function mergePair( ): ExtractedRecord { const row: Record = { ...a.row }; const fieldsFilledFromIncoming = new Set(); - let replacedDocsUrlFromIncoming = false; + const shouldPreferIncomingCanonicalRecord = prefersIncomingCanonicalRecord( + a, + b, + spec, + ); + let replacedCanonicalUrlFromIncoming = false; for (const col of spec.columns) { const current = row[col.name]; @@ -147,18 +159,26 @@ export function mergePair( if (currentEmpty && incomingFilled) { row[col.name] = incoming ?? null; fieldsFilledFromIncoming.add(col.name); + } else if ( + incomingFilled && + shouldPreferIncomingCanonicalRecord && + !spec.dedupe_keys.includes(col.name) + ) { + row[col.name] = incoming ?? null; + fieldsFilledFromIncoming.add(col.name); + replacedCanonicalUrlFromIncoming ||= isCanonicalSourceUrlColumn(col.name); } else if (incomingFilled && shouldReplaceCell(col.name, current, incoming)) { row[col.name] = incoming ?? null; fieldsFilledFromIncoming.add(col.name); - replacedDocsUrlFromIncoming ||= isDocsUrlColumn(col.name); + replacedCanonicalUrlFromIncoming ||= isCanonicalSourceUrlColumn(col.name); } } - if (replacedDocsUrlFromIncoming) { + if (replacedCanonicalUrlFromIncoming) { for (const col of spec.columns) { const incoming = b.row[col.name]; if ( - isDocsCompanionColumn(col.name) && + shouldReplaceCompanionColumn(col.name, spec) && !isEmpty(incoming) && !spec.dedupe_keys.includes(col.name) ) { @@ -186,7 +206,7 @@ export function mergePair( evidenceFields.add(item.field); } } - const coherentEvidence = filterEvidenceForRetainedDocsUrl(spec, row, evidence); + const coherentEvidence = filterEvidenceForRetainedCanonicalUrl(spec, row, evidence); const extractionConfidence = Math.max( a.extraction_confidence ?? 0, @@ -215,7 +235,7 @@ function shouldMergeIncomingEvidence(input: { fieldsFilledFromIncoming: Set; }): boolean { if ( - isDocsUrlColumn(input.field) && + isCanonicalSourceUrlColumn(input.field) && !urlsReferenceSamePage( input.incomingRow[input.field], input.mergedRow[input.field], @@ -234,15 +254,59 @@ function shouldReplaceCell( current: string | number | boolean | null | undefined, incoming: string | number | boolean | null | undefined, ): boolean { - if (!isDocsUrlColumn(columnName)) { + if (!isCanonicalSourceUrlColumn(columnName)) { return false; } return ( - scoreDocsUrlForOfficialSource(incoming) > - scoreDocsUrlForOfficialSource(current) + scoreUrlForCanonicalSource(incoming) > scoreUrlForCanonicalSource(current) ); } +function prefersIncomingCanonicalRecord( + current: ExtractedRecord, + incoming: ExtractedRecord, + spec: DatasetSpec, +): boolean { + const currentScore = bestCanonicalScore(current, spec); + const incomingScore = bestCanonicalScore(incoming, spec); + if (incomingScore > currentScore) { + return true; + } + if (incomingScore < currentScore) { + return false; + } + + const currentDate = bestRecordTimestamp(current, spec); + const incomingDate = bestRecordTimestamp(incoming, spec); + return incomingDate !== null && currentDate !== null && incomingDate > currentDate; +} + +function bestCanonicalScore(record: ExtractedRecord, spec: DatasetSpec): number { + let bestScore = 0; + for (const column of spec.columns) { + if (!isCanonicalSourceUrlColumn(column.name)) continue; + bestScore = Math.max( + bestScore, + scoreUrlForCanonicalSource(record.row[column.name]), + ); + } + return bestScore; +} + +function bestRecordTimestamp( + record: ExtractedRecord, + spec: DatasetSpec, +): number | null { + const timestamps = spec.columns + .filter((column) => column.name.toLowerCase().includes("date")) + .map((column) => Date.parse(String(record.row[column.name] ?? ""))) + .filter(Number.isFinite); + if (timestamps.length === 0) { + return null; + } + return Math.max(...timestamps); +} + function isDocsUrlColumn(columnName: string): boolean { const lower = columnName.toLowerCase(); return ( @@ -262,60 +326,91 @@ function isDocsCompanionColumn(columnName: string): boolean { ); } -function filterEvidenceForRetainedDocsUrl( +function isCanonicalSourceUrlColumn(columnName: string): boolean { + return isUrlLikeColumnName(columnName); +} + +function shouldReplaceCompanionColumn( + columnName: string, + spec: DatasetSpec, +): boolean { + if (spec.dedupe_keys.includes(columnName)) { + return false; + } + return !isCanonicalSourceUrlColumn(columnName); +} + +function filterEvidenceForRetainedCanonicalUrl( spec: DatasetSpec, row: Record, evidence: ExtractedRecord["evidence"], ): ExtractedRecord["evidence"] { - const retainedDocsUrl = bestRetainedDocsUrl(spec, row); - if (!retainedDocsUrl) { + const retainedUrl = bestRetainedCanonicalUrl(spec, row); + if (!retainedUrl) { return evidence; } return evidence.filter((item) => { - if (isDocsUrlColumn(item.field)) { + if (isCanonicalSourceUrlColumn(item.field)) { return urlsReferenceSamePage(item.url, row[item.field]); } if ( isDocsCompanionColumn(item.field) || + isLikelySourceCompanionColumn(item.field) || spec.dedupe_keys.includes(item.field) ) { - return sourceUrlSupportsRetainedDocsUrl(item.url, retainedDocsUrl); + return sourceUrlSupportsRetainedCanonicalUrl(item.url, retainedUrl); } return true; }); } -function bestRetainedDocsUrl( +function bestRetainedCanonicalUrl( spec: DatasetSpec, row: Record, ): string | null { let bestUrl: string | null = null; let bestScore = 0; for (const col of spec.columns) { - if (!isDocsUrlColumn(col.name)) continue; + if (!isCanonicalSourceUrlColumn(col.name)) continue; const value = row[col.name]; - const score = scoreDocsUrlForOfficialSource(value); + const score = scoreUrlForCanonicalSource(value); if (typeof value === "string" && score > bestScore) { bestUrl = value; bestScore = score; } } - return bestScore >= 4 ? bestUrl : null; + return bestScore >= 2 ? bestUrl : null; } -function sourceUrlSupportsRetainedDocsUrl( +function isLikelySourceCompanionColumn(columnName: string): boolean { + const lower = columnName.toLowerCase(); + return ( + lower.includes("date") || + lower.includes("quarter") || + lower.includes("price") || + lower.includes("plan") || + lower.includes("title") || + lower.includes("summary") || + lower.includes("description") + ); +} + +function sourceUrlSupportsRetainedCanonicalUrl( evidenceUrl: unknown, - retainedDocsUrl: string, + retainedUrl: string, ): boolean { - if (urlsReferenceSamePage(evidenceUrl, retainedDocsUrl)) { + if (urlsReferenceSamePage(evidenceUrl, retainedUrl)) { return true; } + if (scoreDocsUrlForOfficialSource(retainedUrl) < 4) { + return false; + } return ( - sameHostname(evidenceUrl, retainedDocsUrl) && - scoreDocsUrlForOfficialSource(evidenceUrl) >= 4 + sameHostname(evidenceUrl, retainedUrl) && + scoreUrlForCanonicalSource(evidenceUrl) >= 2 ); } diff --git a/backend/BigSet_Data_Collection_Agent/src/records/source-urls.ts b/backend/BigSet_Data_Collection_Agent/src/records/source-urls.ts index f193ffc..56ca7a4 100644 --- a/backend/BigSet_Data_Collection_Agent/src/records/source-urls.ts +++ b/backend/BigSet_Data_Collection_Agent/src/records/source-urls.ts @@ -1,10 +1,10 @@ import type { DatasetSpec, ExtractedRecord } from "../models/schemas.js"; -function isHttpUrl(value: unknown): value is string { +export function isHttpUrl(value: unknown): value is string { return typeof value === "string" && /^https?:\/\//i.test(value.trim()); } -function isUrlLikeColumnName(name: string): boolean { +export function isUrlLikeColumnName(name: string): boolean { const lower = name.toLowerCase(); return lower === "url" || lower.endsWith("_url") || lower.includes("url"); } @@ -52,3 +52,23 @@ export function scoreDocsUrlForOfficialSource(value: unknown): number { } return score; } + +export function scoreUrlForCanonicalSource(value: unknown): number { + if (!isHttpUrl(value)) return 0; + const normalized = value.toLowerCase(); + let score = scoreDocsUrlForOfficialSource(value); + if (/\b(?:pricing|billing)\b/.test(normalized)) score += 3; + if (/\b(?:earnings|press-release|financial-results|reports-.*quarter|quarter-results)\b/.test(normalized)) { + score += 4; + } + if (/\b(?:news|newsroom|investor|investors)\b/.test(normalized)) { + score += 2; + } + if (/\/(?:default|index)\.(?:aspx|html?)$/.test(normalized)) { + score -= 2; + } + if (/\/(?:financial-info|financial-reports|annual-reports)\/(?:default\.aspx)?$/.test(normalized)) { + score -= 2; + } + return score; +} diff --git a/backend/test/collection-extract-finalize.test.ts b/backend/test/collection-extract-finalize.test.ts new file mode 100644 index 0000000..ef3aa85 --- /dev/null +++ b/backend/test/collection-extract-finalize.test.ts @@ -0,0 +1,61 @@ +import assert from "node:assert/strict"; +import { test } from "node:test"; + +import { finalizeExtractedRecord } from "../BigSet_Data_Collection_Agent/src/agents/extract.js"; +import type { DatasetSpec } from "../BigSet_Data_Collection_Agent/src/models/schemas.js"; + +const docsSpec: DatasetSpec = { + intent_summary: "Official docs pages.", + target_row_count: 1, + row_grain: "one row per docs page", + columns: [ + { + name: "entity_name", + type: "string", + description: "Vendor name.", + required: true, + }, + { + name: "docs_url", + type: "string", + description: "Official docs URL.", + required: true, + }, + { + name: "summary", + type: "string", + description: "What the page covers.", + required: true, + }, + ], + dedupe_keys: ["entity_name"], + search_queries: ["Cloudflare MCP docs"], + extraction_hints: "Prefer official docs pages.", +}; + +test("collection extraction adds URL cell evidence when model omits evidence", () => { + const record = finalizeExtractedRecord( + { + row: { + entity_name: "Cloudflare", + docs_url: "https://developers.cloudflare.com/agents/guides/remote-mcp-server/", + summary: "Remote MCP server docs.", + }, + evidence: [], + extraction_confidence: 0.8, + }, + "https://developers.cloudflare.com/agents/guides/remote-mcp-server/", + docsSpec, + ); + + assert.deepEqual(record.evidence, [ + { + field: "docs_url", + url: "https://developers.cloudflare.com/agents/guides/remote-mcp-server/", + quote: "https://developers.cloudflare.com/agents/guides/remote-mcp-server/", + }, + ]); + assert.deepEqual(record.source_urls, [ + "https://developers.cloudflare.com/agents/guides/remote-mcp-server/", + ]); +}); diff --git a/backend/test/collection-record-merge.test.ts b/backend/test/collection-record-merge.test.ts index c2bfd50..93d205f 100644 --- a/backend/test/collection-record-merge.test.ts +++ b/backend/test/collection-record-merge.test.ts @@ -45,6 +45,41 @@ const docsSpec: DatasetSpec = { extraction_hints: "Prefer official docs pages.", }; +const earningsSpec: DatasetSpec = { + intent_summary: "Latest earnings releases.", + target_row_count: 3, + row_grain: "one row per company", + columns: [ + { + name: "entity_name", + type: "string", + description: "Company name.", + required: true, + }, + { + name: "release_date", + type: "date", + description: "Release date.", + required: true, + }, + { + name: "fiscal_quarter", + type: "string", + description: "Fiscal quarter.", + required: true, + }, + { + name: "source_url", + type: "string", + description: "Official earnings release source URL.", + required: true, + }, + ], + dedupe_keys: ["entity_name"], + search_queries: ["latest earnings releases"], + extraction_hints: "Prefer official dated earnings release pages.", +}; + test("collection record merge does not attach evidence from conflicting duplicate rows", () => { const officialRecord = record({ row: { @@ -325,6 +360,78 @@ test("collection record merge drops docs URL evidence from unrelated source page ]); }); +test("collection record merge folds corporate suffix variants and prefers stronger source pages", () => { + const merged = mergeRecords(earningsSpec, [ + record({ + row: { + entity_name: "Nvidia", + release_date: "2026-02-25", + fiscal_quarter: "Q4 Fiscal 2026", + source_url: "https://nvidianews.nvidia.com/news/nvidia-announces-financial-results-for-fourth-quarter-and-fiscal-2026", + }, + evidence: [ + evidence( + "release_date", + "https://nvidianews.nvidia.com/news/nvidia-announces-financial-results-for-fourth-quarter-and-fiscal-2026", + "February 25, 2026", + ), + evidence( + "fiscal_quarter", + "https://nvidianews.nvidia.com/news/nvidia-announces-financial-results-for-fourth-quarter-and-fiscal-2026", + "fourth quarter fiscal 2026", + ), + evidence( + "source_url", + "https://nvidianews.nvidia.com/news/nvidia-announces-financial-results-for-fourth-quarter-and-fiscal-2026", + "https://nvidianews.nvidia.com/news/nvidia-announces-financial-results-for-fourth-quarter-and-fiscal-2026", + ), + ], + sourceUrls: [ + "https://nvidianews.nvidia.com/news/nvidia-announces-financial-results-for-fourth-quarter-and-fiscal-2026", + ], + }), + record({ + row: { + entity_name: "NVIDIA Corporation", + release_date: "2026-05-20", + fiscal_quarter: "FY27 Q1", + source_url: "https://nvidianews.nvidia.com/news/nvidia-announces-financial-results-for-first-quarter-fiscal-2027", + }, + evidence: [ + evidence( + "release_date", + "https://nvidianews.nvidia.com/news/nvidia-announces-financial-results-for-first-quarter-fiscal-2027", + "May 20, 2026", + ), + evidence( + "fiscal_quarter", + "https://nvidianews.nvidia.com/news/nvidia-announces-financial-results-for-first-quarter-fiscal-2027", + "first quarter fiscal 2027", + ), + evidence( + "source_url", + "https://nvidianews.nvidia.com/news/nvidia-announces-financial-results-for-first-quarter-fiscal-2027", + "https://nvidianews.nvidia.com/news/nvidia-announces-financial-results-for-first-quarter-fiscal-2027", + ), + ], + sourceUrls: [ + "https://nvidianews.nvidia.com/news/nvidia-announces-financial-results-for-first-quarter-fiscal-2027", + ], + }), + ]).records; + + assert.equal(merged.length, 1); + assert.equal(merged[0]?.row.entity_name, "Nvidia"); + assert.equal(merged[0]?.row.fiscal_quarter, "FY27 Q1"); + assert.equal( + merged[0]?.row.source_url, + "https://nvidianews.nvidia.com/news/nvidia-announces-financial-results-for-first-quarter-fiscal-2027", + ); + assert.deepEqual(merged[0]?.source_urls, [ + "https://nvidianews.nvidia.com/news/nvidia-announces-financial-results-for-first-quarter-fiscal-2027", + ]); +}); + test("collection record merge fixture reaches benchmark-equivalent domain coverage", () => { const merged = mergeRecords(docsSpec, [ record({ diff --git a/benchmarks/dataset-agent/run-benchmark.mjs b/benchmarks/dataset-agent/run-benchmark.mjs index d3fd0f5..bc22fff 100755 --- a/benchmarks/dataset-agent/run-benchmark.mjs +++ b/benchmarks/dataset-agent/run-benchmark.mjs @@ -118,14 +118,14 @@ const answerKeysByPromptId = { officialSourceDomains: ["openai.com", "anthropic.com", "deepmind.google"], }, "saas-pricing-pages": { - verifiedAt, + verifiedAt: "2026-05-22", sourceUrls: [ "https://stripe.com/pricing", - "https://www.paddle.com/billing", + "https://www.paddle.com/pricing", "https://www.chargebee.com/pricing/", ], scoringNotes: - "Pass requires all three vendors, official domains, and visible plan or price text. Paddle may route pricing through Billing or sales-led pages.", + "Pass requires all three vendors, official domains, and visible plan or price text. Paddle's current pricing page can show Checkout transaction pricing.", expectedBehavior: "answer", requiredColumns: ["entity_name", "pricing_page_url", "plan_or_price", "source_url"], expectedEntities: [ @@ -141,7 +141,7 @@ const answerKeysByPromptId = { label: "Paddle", aliases: ["paddle"], allowedSourceDomains: ["paddle.com"], - requiredText: ["merchant of record", "billing"], + requiredText: ["checkout", "5%", "50"], }, { id: "chargebee", @@ -155,14 +155,14 @@ const answerKeysByPromptId = { officialSourceDomains: ["stripe.com", "paddle.com", "chargebee.com"], }, "earnings-release-pages": { - verifiedAt, + verifiedAt: "2026-05-22", sourceUrls: [ "https://www.apple.com/newsroom/2026/04/apple-reports-second-quarter-results/", "https://www.microsoft.com/en-us/investor/earnings/fy-2026-q3/press-release-webcast", - "https://investor.nvidia.com/news/press-release-details/2026/NVIDIA-Announces-Financial-Results-for-Fourth-Quarter-and-Fiscal-2026/", + "https://nvidianews.nvidia.com/news/nvidia-announces-financial-results-for-first-quarter-fiscal-2027", ], scoringNotes: - "As of 2026-05-20, Apple latest verified release is fiscal 2026 Q2 on 2026-04-30, Microsoft is FY26 Q3 on 2026-04-29, and NVIDIA is Q4 fiscal 2026 on 2026-02-25.", + "As of 2026-05-22, Apple latest verified release is fiscal 2026 Q2 on 2026-04-30, Microsoft is FY26 Q3 on 2026-04-29, and NVIDIA is Q1 fiscal 2027 on 2026-05-20.", expectedBehavior: "answer", requiredColumns: ["entity_name", "release_date", "fiscal_quarter", "source_url"], expectedEntities: [ @@ -185,7 +185,7 @@ const answerKeysByPromptId = { label: "NVIDIA", aliases: ["nvidia"], allowedSourceDomains: ["nvidia.com"], - requiredText: ["fourth quarter", "q4", "fiscal 2026"], + requiredText: ["first quarter", "q1", "fiscal 2027", "may 20"], }, ], minimumExpectedEntityMatches: 3,