Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
32 changes: 31 additions & 1 deletion backend/BigSet_Data_Collection_Agent/src/agents/extract.ts
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,11 @@ import {
type ExtractedRecord,
type FetchedPage,
} from "../models/schemas.js";
import { deriveRecordSourceUrls } from "../records/source-urls.js";
import {
deriveRecordSourceUrls,
isHttpUrl,
isUrlLikeColumnName,
} from "../records/source-urls.js";

/**
* Extraction is always one source per LLM call in process-pages.ts:
Expand Down Expand Up @@ -170,6 +174,31 @@ function provenanceUrlColumns(spec: DatasetSpec): ColumnDef[] {
return spec.columns.filter(isProvenanceUrlColumn);
}

/** Reports whether a column's name marks it as URL-like, as decided by `isUrlLikeColumnName`. */
function isUrlLikeColumn(column: ColumnDef): boolean {
  const { name } = column;
  return isUrlLikeColumnName(name);
}

/**
 * Gives every URL-like cell that holds an http(s) URL an evidence entry of its own.
 *
 * For each spec column that is URL-like and has no evidence item yet, a new
 * item is appended whose `url` and `quote` are both the trimmed cell value.
 * Columns that already have evidence, and cells that are not http(s) URLs,
 * are left alone.
 *
 * @param row Record cells keyed by column name (read only).
 * @param evidence Evidence list; new entries are pushed onto it in place.
 * @param spec Dataset spec whose columns are scanned in order.
 */
function addUrlCellEvidence(
  row: Record<string, string | number | boolean | null>,
  evidence: ExtractedRecord["evidence"],
  spec: DatasetSpec,
): void {
  const covered = new Set(evidence.map(({ field }) => field));

  for (const column of spec.columns) {
    const fieldName = column.name;
    if (isUrlLikeColumn(column) && !covered.has(fieldName)) {
      const cell = row[fieldName];
      if (isHttpUrl(cell)) {
        const trimmed = cell.trim();
        evidence.push({ field: fieldName, url: trimmed, quote: trimmed });
        // Guards against a second entry if the spec lists the same name twice.
        covered.add(fieldName);
      }
    }
  }
}

/** Attach evidence URLs and source_urls; keep LLM row and provenance values. */
export function finalizeExtractedRecord(
record: LlmExtractionRecord,
Expand All @@ -190,6 +219,7 @@ export function finalizeExtractedRecord(
row[column.name] = pageUrl;
}
}
addUrlCellEvidence(row, evidence, spec);

const source_urls = deriveRecordSourceUrls({
spec,
Expand Down
141 changes: 118 additions & 23 deletions backend/BigSet_Data_Collection_Agent/src/merge/records.ts
Original file line number Diff line number Diff line change
@@ -1,7 +1,9 @@
import type { DatasetSpec, ExtractedRecord } from "../models/schemas.js";
import {
deriveRecordSourceUrls,
isUrlLikeColumnName,
scoreDocsUrlForOfficialSource,
scoreUrlForCanonicalSource,
} from "../records/source-urls.js";

function normalizeValue(value: unknown): string {
Expand All @@ -28,7 +30,12 @@ function valuesMatch(a: unknown, b: unknown): boolean {
/**
 * Normalize entity names for stable primary-key matching.
 *
 * Starting from `normalizeValue(value)`, this collapses whitespace, removes
 * one trailing legal-entity suffix (inc, corp, llc, ltd, ...) together with
 * the comma/space in front of it, and folds apostrophe variants into `'`.
 *
 * @param value Raw cell value of a key column.
 * @returns The normalized key. A name consisting only of a suffix keeps that
 *   suffix rather than collapsing to the empty string.
 */
export function normalizePrimaryKey(value: unknown): string {
  // Collapse and trim first: the suffix pattern is anchored with `$`, so any
  // trailing whitespace would otherwise stop it from matching.
  const collapsed = normalizeValue(value).replace(/\s+/g, " ").trim();

  // Take the separator with the suffix so "acme, inc." and "acme inc" agree
  // instead of leaving a dangling comma on one of them.
  const withoutSuffix = collapsed
    .replace(
      /[\s,]*\b(?:incorporated|inc|corporation|corp|company|co|llc|ltd|limited|plc)\b\.?$/i,
      "",
    )
    .trim();

  // "Limited" or "Co." alone must not normalize to "", which would make
  // unrelated records share one key.
  const base = withoutSuffix === "" ? collapsed : withoutSuffix;

  // Fold typographic single quotes and backticks into a straight apostrophe.
  return base.replace(/[\u2018\u2019'`]/g, "'");
}

Expand Down Expand Up @@ -136,7 +143,12 @@ export function mergePair(
): ExtractedRecord {
const row: Record<string, string | number | boolean | null> = { ...a.row };
const fieldsFilledFromIncoming = new Set<string>();
let replacedDocsUrlFromIncoming = false;
const shouldPreferIncomingCanonicalRecord = prefersIncomingCanonicalRecord(
a,
b,
spec,
);
let replacedCanonicalUrlFromIncoming = false;

for (const col of spec.columns) {
const current = row[col.name];
Expand All @@ -147,18 +159,26 @@ export function mergePair(
if (currentEmpty && incomingFilled) {
row[col.name] = incoming ?? null;
fieldsFilledFromIncoming.add(col.name);
} else if (
incomingFilled &&
shouldPreferIncomingCanonicalRecord &&
!spec.dedupe_keys.includes(col.name)
) {
row[col.name] = incoming ?? null;
fieldsFilledFromIncoming.add(col.name);
replacedCanonicalUrlFromIncoming ||= isCanonicalSourceUrlColumn(col.name);
} else if (incomingFilled && shouldReplaceCell(col.name, current, incoming)) {
row[col.name] = incoming ?? null;
fieldsFilledFromIncoming.add(col.name);
replacedDocsUrlFromIncoming ||= isDocsUrlColumn(col.name);
replacedCanonicalUrlFromIncoming ||= isCanonicalSourceUrlColumn(col.name);
}
}

if (replacedDocsUrlFromIncoming) {
if (replacedCanonicalUrlFromIncoming) {
for (const col of spec.columns) {
const incoming = b.row[col.name];
if (
isDocsCompanionColumn(col.name) &&
shouldReplaceCompanionColumn(col.name, spec) &&
!isEmpty(incoming) &&
!spec.dedupe_keys.includes(col.name)
) {
Expand Down Expand Up @@ -186,7 +206,7 @@ export function mergePair(
evidenceFields.add(item.field);
}
}
const coherentEvidence = filterEvidenceForRetainedDocsUrl(spec, row, evidence);
const coherentEvidence = filterEvidenceForRetainedCanonicalUrl(spec, row, evidence);

const extractionConfidence = Math.max(
a.extraction_confidence ?? 0,
Expand Down Expand Up @@ -215,7 +235,7 @@ function shouldMergeIncomingEvidence(input: {
fieldsFilledFromIncoming: Set<string>;
}): boolean {
if (
isDocsUrlColumn(input.field) &&
isCanonicalSourceUrlColumn(input.field) &&
!urlsReferenceSamePage(
input.incomingRow[input.field],
input.mergedRow[input.field],
Expand All @@ -234,15 +254,59 @@ function shouldReplaceCell(
current: string | number | boolean | null | undefined,
incoming: string | number | boolean | null | undefined,
): boolean {
if (!isDocsUrlColumn(columnName)) {
if (!isCanonicalSourceUrlColumn(columnName)) {
return false;
}
return (
scoreDocsUrlForOfficialSource(incoming) >
scoreDocsUrlForOfficialSource(current)
scoreUrlForCanonicalSource(incoming) > scoreUrlForCanonicalSource(current)
);
}

/**
 * Decides whether the incoming record should win over the current one.
 *
 * The record with the higher `bestCanonicalScore` wins outright. On a score
 * tie the incoming record wins only when both records yield a timestamp from
 * `bestRecordTimestamp` and the incoming one is strictly later.
 *
 * @returns `true` when the incoming record is preferred.
 */
function prefersIncomingCanonicalRecord(
  current: ExtractedRecord,
  incoming: ExtractedRecord,
  spec: DatasetSpec,
): boolean {
  const scoreNow = bestCanonicalScore(current, spec);
  const scoreNext = bestCanonicalScore(incoming, spec);
  if (scoreNext > scoreNow || scoreNext < scoreNow) {
    return scoreNext > scoreNow;
  }

  // Scores tie: fall back to recency, which needs a timestamp on both sides.
  const stampNow = bestRecordTimestamp(current, spec);
  const stampNext = bestRecordTimestamp(incoming, spec);
  if (stampNext === null || stampNow === null) {
    return false;
  }
  return stampNext > stampNow;
}

/**
 * Highest `scoreUrlForCanonicalSource` value among the record's
 * canonical-source URL columns; 0 when none scores above zero.
 */
function bestCanonicalScore(record: ExtractedRecord, spec: DatasetSpec): number {
  return spec.columns.reduce((best, { name }) => {
    if (!isCanonicalSourceUrlColumn(name)) {
      return best;
    }
    return Math.max(best, scoreUrlForCanonicalSource(record.row[name]));
  }, 0);
}

/**
 * Latest parseable timestamp found in the record's date columns.
 *
 * A column counts as a date column when its lower-cased name contains
 * "date". Each such cell is stringified (null/undefined become "") and run
 * through `Date.parse`; values that do not parse to a finite number are
 * ignored.
 *
 * @returns Milliseconds since the epoch of the latest value, or `null` when
 *   no date column holds a parseable value.
 */
function bestRecordTimestamp(
  record: ExtractedRecord,
  spec: DatasetSpec,
): number | null {
  let latest: number | null = null;
  for (const { name } of spec.columns) {
    if (!name.toLowerCase().includes("date")) {
      continue;
    }
    const parsed = Date.parse(String(record.row[name] ?? ""));
    if (!Number.isFinite(parsed)) {
      continue;
    }
    if (latest === null || parsed > latest) {
      latest = parsed;
    }
  }
  return latest;
}

function isDocsUrlColumn(columnName: string): boolean {
const lower = columnName.toLowerCase();
return (
Expand All @@ -262,60 +326,91 @@ function isDocsCompanionColumn(columnName: string): boolean {
);
}

function filterEvidenceForRetainedDocsUrl(
/** A column is a canonical-source URL column exactly when `isUrlLikeColumnName` accepts its name. */
function isCanonicalSourceUrlColumn(columnName: string): boolean {
  const urlLike = isUrlLikeColumnName(columnName);
  return urlLike;
}

/**
 * Whether a column may be overwritten alongside a replaced canonical URL:
 * true for every column that is neither a dedupe key nor itself a
 * canonical-source URL column.
 */
function shouldReplaceCompanionColumn(
  columnName: string,
  spec: DatasetSpec,
): boolean {
  const isDedupeKey = spec.dedupe_keys.includes(columnName);
  return !isDedupeKey && !isCanonicalSourceUrlColumn(columnName);
}

/**
 * Filters merged evidence so it stays consistent with the URL the merged row kept.
 *
 * NOTE(review): this span appears to come from a diff rendering, so lines
 * using the `...DocsUrl` names sit directly beside the `...CanonicalUrl` /
 * `retainedUrl` lines that presumably replace them — confirm against the
 * real file before relying on either variant.
 *
 * Behaviour of the `...CanonicalUrl` lines:
 * - when `bestRetainedCanonicalUrl` finds no URL, `evidence` is returned as is;
 * - evidence on a canonical-source URL field is kept only if its `url`
 *   references the same page as the row's value for that field;
 * - evidence on a companion field or a dedupe key is kept only if
 *   `sourceUrlSupportsRetainedCanonicalUrl` accepts its `url`;
 * - all other evidence is kept.
 */
function filterEvidenceForRetainedCanonicalUrl(
spec: DatasetSpec,
row: Record<string, string | number | boolean | null>,
evidence: ExtractedRecord["evidence"],
): ExtractedRecord["evidence"] {
const retainedDocsUrl = bestRetainedDocsUrl(spec, row);
if (!retainedDocsUrl) {
const retainedUrl = bestRetainedCanonicalUrl(spec, row);
if (!retainedUrl) {
return evidence;
}

return evidence.filter((item) => {
if (isDocsUrlColumn(item.field)) {
if (isCanonicalSourceUrlColumn(item.field)) {
return urlsReferenceSamePage(item.url, row[item.field]);
}

if (
isDocsCompanionColumn(item.field) ||
isLikelySourceCompanionColumn(item.field) ||
spec.dedupe_keys.includes(item.field)
) {
return sourceUrlSupportsRetainedDocsUrl(item.url, retainedDocsUrl);
return sourceUrlSupportsRetainedCanonicalUrl(item.url, retainedUrl);
}

return true;
});
}

function bestRetainedDocsUrl(
/**
 * Picks the highest-scoring URL held in the row's URL columns.
 *
 * NOTE(review): this span appears to come from a diff rendering; the
 * `isDocsUrlColumn` / `scoreDocsUrlForOfficialSource` / `>= 4` lines sit
 * beside the `isCanonicalSourceUrlColumn` / `scoreUrlForCanonicalSource` /
 * `>= 2` lines that presumably replace them — confirm against the real file.
 *
 * Only string cells are candidates, and a candidate replaces the current best
 * only with a strictly higher score, so the first column wins ties.
 *
 * @returns The best URL when its score reaches the threshold on the final
 *   `return`, otherwise `null`.
 */
function bestRetainedCanonicalUrl(
spec: DatasetSpec,
row: Record<string, string | number | boolean | null>,
): string | null {
let bestUrl: string | null = null;
let bestScore = 0;
for (const col of spec.columns) {
if (!isDocsUrlColumn(col.name)) continue;
if (!isCanonicalSourceUrlColumn(col.name)) continue;
const value = row[col.name];
const score = scoreDocsUrlForOfficialSource(value);
const score = scoreUrlForCanonicalSource(value);
if (typeof value === "string" && score > bestScore) {
bestUrl = value;
bestScore = score;
}
}
return bestScore >= 4 ? bestUrl : null;
return bestScore >= 2 ? bestUrl : null;
}

function sourceUrlSupportsRetainedDocsUrl(
/**
 * Heuristic: does the column name contain one of the companion markers
 * (date, quarter, price, plan, title, summary, description)?
 * Matching is case-insensitive and substring-based.
 */
function isLikelySourceCompanionColumn(columnName: string): boolean {
  const needle = columnName.toLowerCase();
  const markers = [
    "date",
    "quarter",
    "price",
    "plan",
    "title",
    "summary",
    "description",
  ];
  return markers.some((marker) => needle.includes(marker));
}

/**
 * Decides whether an evidence URL is compatible with the URL the row retained.
 *
 * NOTE(review): this span appears to come from a diff rendering; the
 * `retainedDocsUrl` lines sit beside the `retainedUrl` lines that presumably
 * replace them — confirm against the real file.
 *
 * Behaviour of the `retainedUrl` lines: an evidence URL referencing the same
 * page is always accepted. Otherwise it is accepted only when the retained
 * URL scores at least 4 with `scoreDocsUrlForOfficialSource`, both URLs share
 * a hostname, and the evidence URL scores at least 2 with
 * `scoreUrlForCanonicalSource`.
 */
function sourceUrlSupportsRetainedCanonicalUrl(
evidenceUrl: unknown,
retainedDocsUrl: string,
retainedUrl: string,
): boolean {
if (urlsReferenceSamePage(evidenceUrl, retainedDocsUrl)) {
if (urlsReferenceSamePage(evidenceUrl, retainedUrl)) {
return true;
}
if (scoreDocsUrlForOfficialSource(retainedUrl) < 4) {
return false;
}
return (
sameHostname(evidenceUrl, retainedDocsUrl) &&
scoreDocsUrlForOfficialSource(evidenceUrl) >= 4
sameHostname(evidenceUrl, retainedUrl) &&
scoreUrlForCanonicalSource(evidenceUrl) >= 2
);
}

Expand Down
24 changes: 22 additions & 2 deletions backend/BigSet_Data_Collection_Agent/src/records/source-urls.ts
Original file line number Diff line number Diff line change
@@ -1,10 +1,10 @@
import type { DatasetSpec, ExtractedRecord } from "../models/schemas.js";

function isHttpUrl(value: unknown): value is string {
/**
 * Type guard: true for strings that, once trimmed, begin with `http://` or
 * `https://` (scheme matched case-insensitively). Non-strings are rejected.
 */
export function isHttpUrl(value: unknown): value is string {
  if (typeof value !== "string") {
    return false;
  }
  const candidate = value.trim();
  return /^https?:\/\//i.test(candidate);
}

function isUrlLikeColumnName(name: string): boolean {
/**
 * Reports whether a column name denotes a URL column.
 *
 * The name is split into words at camelCase boundaries and at every
 * non-letter character; it is URL-like when one of those words is exactly
 * `url` or `urls` (case-insensitive). Matching whole words instead of the
 * raw substring keeps names such as `hourly_rate` or `curl_command`, which
 * merely contain the letters "url", from being treated as URL columns.
 *
 * @param name Column name in any casing convention (snake_case, camelCase,
 *   kebab-case, space separated).
 * @returns `true` when the name contains `url`/`urls` as a standalone word.
 */
export function isUrlLikeColumnName(name: string): boolean {
  const words = name
    // sourceUrl -> source_Url, imageUrls -> image_Urls
    .replace(/([a-z0-9])([A-Z])/g, "$1_$2")
    // URLSource -> URL_Source; requiring two lowercase letters leaves "URLs" intact.
    .replace(/([A-Z]{2,})([A-Z][a-z]{2,})/g, "$1_$2")
    .toLowerCase()
    .split(/[^a-z]+/);
  return words.some((word) => word === "url" || word === "urls");
}
Expand Down Expand Up @@ -52,3 +52,23 @@ export function scoreDocsUrlForOfficialSource(value: unknown): number {
}
return score;
}

/**
 * Scores how likely a URL is to be the canonical source page for a record.
 *
 * Starts from `scoreDocsUrlForOfficialSource(value)` and adjusts it by
 * keyword matches on the lower-cased URL:
 * - `pricing` / `billing`: +3
 * - `earnings`, `press-release`, `financial-results`, `reports-...quarter`,
 *   `quarter-results`: +4
 * - `news`, `newsroom`, `investor`, `investors`: +2
 * - URL ends in `/default` or `/index` with an `.aspx`/`.htm`/`.html`
 *   extension: -2
 * - URL ends in a `financial-info`, `financial-reports` or `annual-reports`
 *   directory (optionally followed by `default.aspx`): -2
 *
 * @param value Candidate cell value; anything that is not an http(s) URL scores 0.
 * @returns The adjusted score (may be negative).
 */
export function scoreUrlForCanonicalSource(value: unknown): number {
  if (!isHttpUrl(value)) return 0;
  // isHttpUrl tolerates surrounding whitespace, so trim here as well;
  // otherwise the `$`-anchored landing-page penalties below never match a
  // padded value.
  const normalized = value.trim().toLowerCase();
  let score = scoreDocsUrlForOfficialSource(value);
  if (/\b(?:pricing|billing)\b/.test(normalized)) score += 3;
  if (/\b(?:earnings|press-release|financial-results|reports-.*quarter|quarter-results)\b/.test(normalized)) {
    score += 4;
  }
  if (/\b(?:news|newsroom|investor|investors)\b/.test(normalized)) {
    score += 2;
  }
  if (/\/(?:default|index)\.(?:aspx|html?)$/.test(normalized)) {
    score -= 2;
  }
  if (/\/(?:financial-info|financial-reports|annual-reports)\/(?:default\.aspx)?$/.test(normalized)) {
    score -= 2;
  }
  return score;
}
Loading