From a0e935d3cdab43d1429ab7000a746c68dc75007f Mon Sep 17 00:00:00 2001 From: Claude Date: Mon, 11 May 2026 07:12:50 +0000 Subject: [PATCH 01/11] feat(kb): move embedding/reranker model config onto KnowledgeBase MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The KB now owns its `docEmbeddingModel`, `queryEmbeddingModel`, and `rerankerModel` IDs directly. Callers go through `kb.search()` / `kb.searchWithRerank()` / `kb.upsertDocumentWithIndex()` without threading models — the KB delegates to an installed `IKbAiStrategy` (typically `createAiKbStrategy(kb)` from `@workglow/ai`). This replaces the `onSearch` / `onDocumentUpsert` / `onDocumentDelete` callback soup with a single typed strategy interface. RAG callers no longer have to wire model IDs through every retrieval call site, and the KB exposes the search "kind" (similarity / hybrid / rerank) as a single dispatcher. - knowledge-base: add `IKbAiStrategy`, store model IDs as readonly fields, add `search`/`searchWithRerank`/`upsertDocumentWithIndex`/ `reindex`; remove the three callback hooks - ai: add `TextRerankerTask` (AiTask) and `KbReindexTask`; simplify `KbSearchTask` (no model input — KB picks the kind); add `createAiKbStrategy(kb)` factory wiring HierarchicalChunker + TextEmbedding + TextReranker - huggingface-transformers: add `HFT_TextReranker` run-fn that loads cross-encoder text-classification pipelines (bge-reranker, ms-marco-MiniLM) and scores `[query, doc]` pairs - tests: replace KB callback suite with strategy suite https://claude.ai/code/session_01Ya54WFZhpDFzAqRh1qG8Ex --- packages/ai/src/common.ts | 2 + packages/ai/src/kb/createAiKbStrategy.ts | 174 +++++++++++ packages/ai/src/task/KbReindexTask.ts | 86 ++++++ packages/ai/src/task/KbSearchTask.ts | 65 +++- packages/ai/src/task/RerankerTask.ts | 15 +- packages/ai/src/task/TextRerankerTask.ts | 110 +++++++ packages/ai/src/task/index.ts | 6 + packages/knowledge-base/src/common.ts | 1 + 
.../src/knowledge-base/IKbAiStrategy.ts | 81 +++++ .../src/knowledge-base/KnowledgeBase.ts | 291 +++++++++++++----- .../src/knowledge-base/createKnowledgeBase.ts | 24 +- .../src/test/rag/DocumentRepository.test.ts | 132 ++++---- .../src/ai/common/HFT_JobRunFns.ts | 2 + .../src/ai/common/HFT_TextReranker.ts | 66 ++++ 14 files changed, 888 insertions(+), 167 deletions(-) create mode 100644 packages/ai/src/kb/createAiKbStrategy.ts create mode 100644 packages/ai/src/task/KbReindexTask.ts create mode 100644 packages/ai/src/task/TextRerankerTask.ts create mode 100644 packages/knowledge-base/src/knowledge-base/IKbAiStrategy.ts create mode 100644 providers/huggingface-transformers/src/ai/common/HFT_TextReranker.ts diff --git a/packages/ai/src/common.ts b/packages/ai/src/common.ts index 1726c8505..675f542b2 100644 --- a/packages/ai/src/common.ts +++ b/packages/ai/src/common.ts @@ -24,3 +24,5 @@ export * from "./provider/AiProviderRegistry"; export * from "./provider/QueuedAiProvider"; export * from "./task"; + +export * from "./kb/createAiKbStrategy"; diff --git a/packages/ai/src/kb/createAiKbStrategy.ts b/packages/ai/src/kb/createAiKbStrategy.ts new file mode 100644 index 000000000..87a170839 --- /dev/null +++ b/packages/ai/src/kb/createAiKbStrategy.ts @@ -0,0 +1,174 @@ +/** + * @license + * Copyright 2026 Steven Roussey + * SPDX-License-Identifier: Apache-2.0 + */ + +import type { + ChunkSearchResult, + IKbAiStrategy, + KnowledgeBase, +} from "@workglow/knowledge-base"; +import type { ChunkRecord, Document } from "@workglow/knowledge-base"; +import type { TypedArray } from "@workglow/util/schema"; + +import { HierarchicalChunkerTask } from "../task/HierarchicalChunkerTask"; +import { RerankerTask } from "../task/RerankerTask"; +import { TextEmbeddingTask } from "../task/TextEmbeddingTask"; +import { TextRerankerTask } from "../task/TextRerankerTask"; + +/** + * Tuning knobs for the default AI strategy. 
Chunker defaults match the + * builder's historical embed-workflow defaults so re-indexing produces the + * same chunk boundaries unless explicitly changed. + */ +export interface CreateAiKbStrategyOptions { + readonly chunker?: { + readonly maxTokens?: number; + readonly overlap?: number; + readonly reservedTokens?: number; + readonly strategy?: "hierarchical" | "flat" | "sentence"; + }; +} + +/** + * Build an {@link IKbAiStrategy} that wires a KB's configured model IDs to + * the real AI runtime (TextEmbeddingTask, TextRerankerTask, etc.). The + * strategy reads `kb.docEmbeddingModel`, `kb.queryEmbeddingModel`, and + * `kb.rerankerModel` lazily on every call, so changes after installation + * take effect on the next operation. + */ +export function createAiKbStrategy( + kb: KnowledgeBase, + options: CreateAiKbStrategyOptions = {} +): IKbAiStrategy { + const chunkerDefaults = { + maxTokens: options.chunker?.maxTokens ?? 512, + overlap: options.chunker?.overlap ?? 50, + reservedTokens: options.chunker?.reservedTokens ?? 10, + strategy: options.chunker?.strategy ?? "hierarchical", + } as const; + + const requireDocEmbed = (): string => { + const m = kb.docEmbeddingModel; + if (!m) { + throw new Error( + `KnowledgeBase "${kb.name}" has no docEmbeddingModel configured; ` + + `set it in createKnowledgeBase / BuilderKnowledgeBaseRecord.` + ); + } + return m; + }; + + const requireQueryEmbed = (): string => { + const m = kb.queryEmbeddingModel ?? 
kb.docEmbeddingModel; + if (!m) { + throw new Error( + `KnowledgeBase "${kb.name}" has no queryEmbeddingModel or docEmbeddingModel configured.` + ); + } + return m; + }; + + const embedTexts = async ( + texts: readonly string[], + modelId: string + ): Promise => { + if (texts.length === 0) return []; + const task = new TextEmbeddingTask(); + const result = await task.run({ text: texts as string[], model: modelId }); + const vector = result.vector; + if (Array.isArray(vector)) { + return vector as TypedArray[]; + } + return [vector as TypedArray]; + }; + + return { + async chunkAndEmbedDocument(doc: Document) { + const docId = doc.doc_id; + if (!docId) { + throw new Error( + "chunkAndEmbedDocument: document has no doc_id. " + + "Call kb.upsertDocument(doc) first or assign a doc_id." + ); + } + const chunker = new HierarchicalChunkerTask(); + const chunkResult = await chunker.run({ + doc_id: docId, + documentTree: doc.root as any, + ...chunkerDefaults, + }); + const chunks = chunkResult.chunks as ChunkRecord[]; + if (chunks.length === 0) { + return { chunks: [], vectors: [] }; + } + const vectors = await embedTexts( + chunks.map((c) => c.text), + requireDocEmbed() + ); + return { chunks, vectors }; + }, + + async embedQuery(text: string): Promise { + const vectors = await embedTexts([text], requireQueryEmbed()); + return vectors[0]; + }, + + async rerank( + query: string, + candidates: ChunkSearchResult[], + topK: number + ): Promise { + if (candidates.length === 0) { + return []; + } + const limit = Math.min(topK, candidates.length); + const rerankerModel = kb.rerankerModel; + const docs = candidates.map((c) => { + const meta = c.metadata as Record | undefined; + const text = meta?.text; + return typeof text === "string" ? text : JSON.stringify(meta ?? 
{}); + }); + + if (rerankerModel) { + const task = new TextRerankerTask(); + const result = await task.run({ + query, + documents: docs, + model: rerankerModel, + topK: limit, + }); + const indices = (result.indices as number[]) ?? []; + const scores = (result.scores as number[]) ?? []; + return indices.map((idx) => { + const candidate = candidates[idx]; + const newScore = scores[idx]; + return { + ...candidate, + score: typeof newScore === "number" ? newScore : candidate.score, + }; + }); + } + + // Heuristic fallback — keeps the API usable without a reranker model. + const heuristic = await new RerankerTask().run({ + query, + chunks: docs, + scores: candidates.map((c) => c.score), + metadata: candidates.map((c) => c.metadata as Record), + topK: limit, + method: "simple", + }); + const indices = (heuristic.originalIndices as number[]) ?? []; + const newScores = (heuristic.scores as number[]) ?? []; + return indices.map((idx, rank) => { + const candidate = candidates[idx]; + return { + ...candidate, + score: newScores[rank] ?? 
candidate.score, + }; + }); + }, + }; +} diff --git a/packages/ai/src/task/KbReindexTask.ts b/packages/ai/src/task/KbReindexTask.ts new file mode 100644 index 000000000..28a060900 --- /dev/null +++ b/packages/ai/src/task/KbReindexTask.ts @@ -0,0 +1,86 @@ +/** + * @license + * Copyright 2026 Steven Roussey + * SPDX-License-Identifier: Apache-2.0 + */ + +import type { KnowledgeBase } from "@workglow/knowledge-base"; +import { TypeKnowledgeBase } from "@workglow/knowledge-base"; +import { CreateWorkflow, IExecuteContext, Task, Workflow } from "@workglow/task-graph"; +import type { TaskConfig } from "@workglow/task-graph"; +import type { DataPortSchema, FromSchema } from "@workglow/util/schema"; + +const inputSchema = { + type: "object", + properties: { + knowledgeBase: TypeKnowledgeBase({ + title: "Knowledge Base", + description: "Knowledge base to re-index", + }), + }, + required: ["knowledgeBase"], + additionalProperties: false, +} as const satisfies DataPortSchema; + +const outputSchema = { + type: "object", + properties: { + count: { + type: "number", + title: "Documents Re-indexed", + description: "Number of documents re-indexed", + }, + }, + required: ["count"], + additionalProperties: false, +} as const satisfies DataPortSchema; + +export type KbReindexTaskInput = FromSchema; +export type KbReindexTaskOutput = FromSchema; +export type KbReindexTaskConfig = TaskConfig; + +/** + * Re-index every document in a knowledge base via its installed AI strategy. + * The strategy handles chunking + embedding using the KB's configured + * `docEmbeddingModel`. Replaces the multi-task embed workflow pattern. 
+ */ +export class KbReindexTask extends Task< + KbReindexTaskInput, + KbReindexTaskOutput, + KbReindexTaskConfig +> { + public static override type = "KbReindexTask"; + public static override category = "RAG"; + public static override title = "KB Reindex"; + public static override description = + "Re-chunk and re-embed every document in a knowledge base using its configured models."; + public static override cacheable = false; + + public static override inputSchema(): DataPortSchema { + return inputSchema as DataPortSchema; + } + public static override outputSchema(): DataPortSchema { + return outputSchema as DataPortSchema; + } + + override async execute( + input: KbReindexTaskInput, + _context: IExecuteContext + ): Promise { + const kb = input.knowledgeBase as KnowledgeBase; + const count = await kb.reindex(); + return { count }; + } +} + +export const kbReindex = async (input: KbReindexTaskInput, config?: KbReindexTaskConfig) => { + return new KbReindexTask(config).run(input); +}; + +declare module "@workglow/task-graph" { + interface Workflow { + kbReindex: CreateWorkflow; + } +} + +Workflow.prototype.kbReindex = CreateWorkflow(KbReindexTask); diff --git a/packages/ai/src/task/KbSearchTask.ts b/packages/ai/src/task/KbSearchTask.ts index 0e4c5f187..8a4800f6e 100644 --- a/packages/ai/src/task/KbSearchTask.ts +++ b/packages/ai/src/task/KbSearchTask.ts @@ -4,7 +4,7 @@ * SPDX-License-Identifier: Apache-2.0 */ -import type { ChunkSearchResult, KnowledgeBase } from "@workglow/knowledge-base"; +import type { ChunkSearchResult, KnowledgeBase, SearchKind } from "@workglow/knowledge-base"; import { TypeKnowledgeBase } from "@workglow/knowledge-base"; import { CreateWorkflow, IExecuteContext, Task, Workflow } from "@workglow/task-graph"; import type { TaskConfig } from "@workglow/task-graph"; @@ -20,7 +20,15 @@ const inputSchema = { query: { type: "string", title: "Query", - description: "Search query (the KB's onSearch handles embedding internally)", + description: 
"Search query. The KB owns its embedding/reranker models internally.", + }, + kind: { + type: "string", + enum: ["similarity", "hybrid", "rerank"], + title: "Retrieval Kind", + description: + "Retrieval flavor. Defaults to 'rerank' when the KB has a reranker model, " + + "otherwise 'hybrid' if supported, otherwise 'similarity'.", }, topK: { type: "number", @@ -52,35 +60,56 @@ const outputSchema = { title: "Results", description: "Matching chunks in score-desc order", }, + chunks: { + type: "array", + items: { type: "string" }, + title: "Chunks", + description: "The chunk text content, parallel to `results`", + }, + chunk_ids: { + type: "array", + items: { type: "string" }, + title: "Chunk IDs", + description: "The chunk ids, parallel to `results`", + }, + scores: { + type: "array", + items: { type: "number" }, + title: "Scores", + description: "Scores parallel to `results`", + }, count: { type: "number", title: "Count", description: "Number of results returned", }, }, - required: ["results", "count"], + required: ["results", "chunks", "chunk_ids", "scores", "count"], additionalProperties: false, } as const satisfies DataPortSchema; export type KbSearchTaskInput = FromSchema; export type KbSearchTaskOutput = { readonly results: ChunkSearchResult[]; + readonly chunks: string[]; + readonly chunk_ids: string[]; + readonly scores: number[]; readonly count: number; }; export type KbSearchTaskConfig = TaskConfig; /** - * Observable wrapper around `kb.search(text, opts)` — the KB's `onSearch` - * callback handles embedding and any custom retrieval logic. Distinct from - * `ChunkRetrievalTask`, which embeds via an explicit model and calls - * `kb.similaritySearch(vector)` (bypassing `onSearch`). + * High-level KB search task. Delegates to `kb.search(query, { kind })`; the + * KB owns the embedding and reranker models internally, so no `model` input + * is needed here. 
*/ export class KbSearchTask extends Task { public static override type = "KbSearchTask"; public static override category = "RAG"; public static override title = "KB Search"; public static override description = - "Search a knowledge base for chunks matching a text query. Wraps the KB's `search` method (which embeds and retrieves via the KB's onSearch callback)."; + "Search a knowledge base. The KB picks the retrieval kind (similarity / hybrid / rerank) " + + "from its configured models, or you can override via `kind`."; public static override cacheable = true; public static override inputSchema(): DataPortSchema { @@ -95,10 +124,24 @@ export class KbSearchTask extends Task { - const { knowledgeBase, query, topK = 5, filter } = input; + const { knowledgeBase, query, kind, topK = 5, filter } = input; const kb = knowledgeBase as KnowledgeBase; - const results = await kb.search(query, { topK, filter }); - return { results, count: results.length }; + const results = await kb.search(query, { + kind: kind as SearchKind | undefined, + topK, + filter, + }); + return { + results, + chunks: results.map((r) => { + const meta = r.metadata as Record | undefined; + const text = meta?.text; + return typeof text === "string" ? text : JSON.stringify(meta ?? {}); + }), + chunk_ids: results.map((r) => r.chunk_id), + scores: results.map((r) => r.score), + count: results.length, + }; } } diff --git a/packages/ai/src/task/RerankerTask.ts b/packages/ai/src/task/RerankerTask.ts index e7a8acb0d..0585cd913 100644 --- a/packages/ai/src/task/RerankerTask.ts +++ b/packages/ai/src/task/RerankerTask.ts @@ -49,7 +49,7 @@ const inputSchema = { type: "string", enum: ["reciprocal-rank-fusion", "simple"], title: "Reranking Method", - description: "Method to use for reranking", + description: "Heuristic reranking method", default: "simple", }, }, @@ -110,16 +110,17 @@ interface RankedItem { } /** - * Rerank retrieved chunks to improve relevance using in-process heuristics. 
- * Supports `simple` (keyword overlap + position) and `reciprocal-rank-fusion`. - * Note: a `cross-encoder` method will be added when a real cross-encoder - * task exists; until then, use a dedicated model task upstream. + * Heuristic reranking task. Cross-encoder reranking (via model) is handled + * by `createAiKbStrategy` directly — it dispatches to provider-registered + * `TextRerankerTask` run-fns through `AiProviderRegistry`. This task remains the + * model-free fallback for workflows that don't want to require a reranker + * model. */ export class RerankerTask extends Task { public static override type = "RerankerTask"; public static override category = "RAG"; public static override title = "Reranker"; - public static override description = "Rerank retrieved chunks to improve relevance"; + public static override description = "Rerank retrieved chunks using in-process heuristics"; public static override cacheable = true; public static override inputSchema(): DataPortSchema { @@ -132,7 +133,7 @@ export class RerankerTask extends Task { const { query, chunks, scores = [], metadata = [], topK, method = "simple" } = input; diff --git a/packages/ai/src/task/TextRerankerTask.ts b/packages/ai/src/task/TextRerankerTask.ts new file mode 100644 index 000000000..91eadfeaa --- /dev/null +++ b/packages/ai/src/task/TextRerankerTask.ts @@ -0,0 +1,110 @@ +/** + * @license + * Copyright 2026 Steven Roussey + * SPDX-License-Identifier: Apache-2.0 + */ + +import type { TaskConfig } from "@workglow/task-graph"; +import { CreateWorkflow, Workflow } from "@workglow/task-graph"; +import type { DataPortSchema, FromSchema } from "@workglow/util/schema"; +import { AiTask } from "./base/AiTask"; +import { TypeModel } from "./base/AiTaskSchemas"; + +const inputSchema = { + type: "object", + properties: { + query: { + type: "string", + title: "Query", + description: "The query to score documents against", + }, + documents: { + type: "array", + items: { type: "string" }, + title: "Documents", + 
description: "Candidate documents to score", + }, + topK: { + type: "number", + title: "Top K", + description: "Return at most this many results (default: all)", + minimum: 1, + }, + model: TypeModel("model:TextRerankerTask", { + title: "Reranker Model", + description: + "Cross-encoder reranker model (e.g. bge-reranker, Cohere rerank). Required.", + }), + }, + required: ["query", "documents", "model"], + additionalProperties: false, +} as const satisfies DataPortSchema; + +const outputSchema = { + type: "object", + properties: { + scores: { + type: "array", + items: { type: "number" }, + title: "Scores", + description: "Relevance score for each document, in the original order", + }, + indices: { + type: "array", + items: { type: "number" }, + title: "Indices", + description: "Indices of documents sorted best-first (length = topK if set)", + }, + }, + required: ["scores", "indices"], + additionalProperties: false, +} as const satisfies DataPortSchema; + +export type TextRerankerTaskInput = FromSchema; +export type TextRerankerTaskOutput = FromSchema; +export type TextRerankerTaskConfig = TaskConfig; + +/** + * AiTask for cross-encoder reranking. Providers register a run-fn for this + * task type (e.g. HuggingFace Transformers using a `text-classification` + * cross-encoder pipeline on `[query, doc]` pairs). `createAiKbStrategy` + * uses this task to power `kb.searchWithRerank()` when a reranker model + * is configured on the KB. 
+ */ +export class TextRerankerTask extends AiTask< + TextRerankerTaskInput, + TextRerankerTaskOutput, + TextRerankerTaskConfig +> { + public static override type = "TextRerankerTask"; + public static override category = "RAG"; + public static override title = "Text Reranker"; + public static override description = + "Score documents against a query using a cross-encoder reranker model"; + + public static override inputSchema(): DataPortSchema { + return inputSchema as DataPortSchema; + } + public static override outputSchema(): DataPortSchema { + return outputSchema as DataPortSchema; + } +} + +export const textReranker = async ( + input: TextRerankerTaskInput, + config?: TextRerankerTaskConfig +) => { + return new TextRerankerTask(config).run(input); +}; + +declare module "@workglow/task-graph" { + interface Workflow { + textReranker: CreateWorkflow< + TextRerankerTaskInput, + TextRerankerTaskOutput, + TextRerankerTaskConfig + >; + } +} + +Workflow.prototype.textReranker = CreateWorkflow(TextRerankerTask); diff --git a/packages/ai/src/task/index.ts b/packages/ai/src/task/index.ts index 4d27a42db..692a1f420 100644 --- a/packages/ai/src/task/index.ts +++ b/packages/ai/src/task/index.ts @@ -23,6 +23,7 @@ import { GestureRecognizerTask } from "./GestureRecognizerTask"; import { HandLandmarkerTask } from "./HandLandmarkerTask"; import { HierarchicalChunkerTask } from "./HierarchicalChunkerTask"; import { HierarchyJoinTask } from "./HierarchyJoinTask"; +import { KbReindexTask } from "./KbReindexTask"; import { KbSearchTask } from "./KbSearchTask"; import { KbToDocumentsTask } from "./KbToDocumentsTask"; import { ImageClassificationTask } from "./ImageClassificationTask"; @@ -40,6 +41,7 @@ import { StructuredGenerationTask } from "./StructuredGenerationTask"; import { TextChunkerTask } from "./TextChunkerTask"; import { TextClassificationTask } from "./TextClassificationTask"; import { TextEmbeddingTask } from "./TextEmbeddingTask"; +import { TextRerankerTask } from 
"./TextRerankerTask"; import { TextFillMaskTask } from "./TextFillMaskTask"; import { TextGenerationTask } from "./TextGenerationTask"; import { TextLanguageDetectionTask } from "./TextLanguageDetectionTask"; @@ -77,6 +79,7 @@ export const registerAiTasks = () => { HandLandmarkerTask, HierarchicalChunkerTask, HierarchyJoinTask, + KbReindexTask, KbSearchTask, KbToDocumentsTask, ImageClassificationTask, @@ -94,6 +97,7 @@ export const registerAiTasks = () => { TextChunkerTask, TextClassificationTask, TextEmbeddingTask, + TextRerankerTask, TextFillMaskTask, TextGenerationTask, TextLanguageDetectionTask, @@ -136,6 +140,7 @@ export * from "./GestureRecognizerTask"; export * from "./HandLandmarkerTask"; export * from "./HierarchicalChunkerTask"; export * from "./HierarchyJoinTask"; +export * from "./KbReindexTask"; export * from "./KbSearchTask"; export * from "./KbToDocumentsTask"; export * from "./ImageClassificationTask"; @@ -154,6 +159,7 @@ export * from "./StructuredGenerationTask"; export * from "./TextChunkerTask"; export * from "./TextClassificationTask"; export * from "./TextEmbeddingTask"; +export * from "./TextRerankerTask"; export * from "./TextFillMaskTask"; export * from "./TextGenerationTask"; export * from "./TextLanguageDetectionTask"; diff --git a/packages/knowledge-base/src/common.ts b/packages/knowledge-base/src/common.ts index 3588ce88b..a0ab0527a 100644 --- a/packages/knowledge-base/src/common.ts +++ b/packages/knowledge-base/src/common.ts @@ -6,6 +6,7 @@ export * from "./chunk/ChunkSchema"; export * from "./chunk/ChunkVectorStorageSchema"; +export * from "./knowledge-base/IKbAiStrategy"; export * from "./knowledge-base/KnowledgeBase"; export * from "./knowledge-base/KnowledgeBaseSchema"; export * from "./knowledge-base/KnowledgeBaseRepository"; diff --git a/packages/knowledge-base/src/knowledge-base/IKbAiStrategy.ts b/packages/knowledge-base/src/knowledge-base/IKbAiStrategy.ts new file mode 100644 index 000000000..916dbe7f6 --- /dev/null +++ 
b/packages/knowledge-base/src/knowledge-base/IKbAiStrategy.ts @@ -0,0 +1,81 @@ +/** + * @license + * Copyright 2026 Steven Roussey + * SPDX-License-Identifier: Apache-2.0 + */ + +import type { TypedArray } from "@workglow/util/schema"; +import type { ChunkRecord } from "../chunk/ChunkSchema"; +import type { ChunkSearchResult, InsertChunkVectorEntity } from "../chunk/ChunkVectorStorageSchema"; +import type { Document } from "../document/Document"; + +/** + * Strategy that bridges a {@link KnowledgeBase} to an AI runtime. The KB owns + * its model IDs (docEmbeddingModel, queryEmbeddingModel, rerankerModel) as + * configuration; the strategy uses them to perform the actual chunking, + * embedding, and reranking. This indirection keeps `@workglow/knowledge-base` + * free of any `@workglow/ai` dependency (ai depends on KB, not the other way + * round) while letting higher layers install a real implementation. + */ +export interface IKbAiStrategy { + /** + * Chunk a document and produce vectors for each chunk. The returned arrays + * must be the same length; index `i` of `vectors` is the embedding for + * `chunks[i]`. The strategy is responsible for chunker configuration and + * picking which embedding model to use (typically `kb.docEmbeddingModel`). + */ + chunkAndEmbedDocument( + doc: Document + ): Promise<{ readonly chunks: ChunkRecord[]; readonly vectors: TypedArray[] }>; + + /** + * Embed a text query into a single vector for vector / hybrid retrieval. + * Typically uses `kb.queryEmbeddingModel` (falling back to docEmbeddingModel). + */ + embedQuery(text: string): Promise; + + /** + * Rerank an initial candidate list against the query. Implementations may + * call a cross-encoder model (`kb.rerankerModel`) or fall back to a + * heuristic. The returned array is at most `topK` results, ordered + * best-first, and carries updated `score` values. 
+ */ + rerank( + query: string, + candidates: ChunkSearchResult[], + topK: number + ): Promise; +} + +/** + * Shape returned by `chunkAndEmbedDocument`. Exposed for strategy + * implementations that want to construct the result without importing the + * internal types from the strategy interface. + */ +export interface KbStrategyEmbedResult { + readonly chunks: ChunkRecord[]; + readonly vectors: TypedArray[]; +} + +/** + * Convert a `KbStrategyEmbedResult` plus a `doc_id` / `doc_title` into the + * `InsertChunkVectorEntity` records that `kb.upsertChunksBulk()` expects. + * Shared helper so every strategy uses identical key derivation. + */ +export function toInsertChunkEntities( + result: KbStrategyEmbedResult, + context: { readonly doc_id: string; readonly doc_title?: string } +): InsertChunkVectorEntity[] { + const { chunks, vectors } = result; + if (chunks.length !== vectors.length) { + throw new Error( + `IKbAiStrategy.chunkAndEmbedDocument returned ${chunks.length} chunks but ${vectors.length} vectors` + ); + } + return chunks.map((chunk, i) => ({ + chunk_id: chunk.chunk_id, + doc_id: context.doc_id, + vector: vectors[i], + metadata: { ...chunk, doc_title: context.doc_title }, + })) as InsertChunkVectorEntity[]; +} diff --git a/packages/knowledge-base/src/knowledge-base/KnowledgeBase.ts b/packages/knowledge-base/src/knowledge-base/KnowledgeBase.ts index 36f7d2397..66b51458d 100644 --- a/packages/knowledge-base/src/knowledge-base/KnowledgeBase.ts +++ b/packages/knowledge-base/src/knowledge-base/KnowledgeBase.ts @@ -20,76 +20,92 @@ import type { DocumentTabularStorage, InsertDocumentStorageEntity, } from "../document/DocumentStorageSchema"; +import type { IKbAiStrategy } from "./IKbAiStrategy"; +import { toInsertChunkEntities } from "./IKbAiStrategy"; /** - * Options passed through `kb.search()` to the `onSearch` callback. - * The callback decides how to interpret them (similarity vs hybrid, etc.). 
- * `filter` is intentionally a loose record — the callback and its backing - * vector storage define the allowed keys. + * Retrieval flavor selected by {@link KnowledgeBase.search}. + * + * - `similarity`: vector cosine similarity only. Requires `embedQuery`. + * - `hybrid`: vector + full-text. Requires `embedQuery` and a hybrid-capable + * storage backend. + * - `rerank`: hybrid (or similarity, if hybrid unsupported) first stage + * followed by cross-encoder reranking. Requires `rerank` on the strategy. + */ +export type SearchKind = "similarity" | "hybrid" | "rerank"; + +/** + * Options passed through `kb.search()` / `kb.searchWithRerank()`. `filter` is + * a loose record; allowed keys are defined by the underlying vector storage. */ export interface ISearchOptions { readonly topK?: number; readonly filter?: Readonly>; readonly scoreThreshold?: number; + /** + * For `kind: "hybrid"` and the first stage of `kind: "rerank"`: vector + * vs. text weighting in [0, 1]. Defaults to the storage backend's default. + */ + readonly vectorWeight?: number; + /** + * For `kind: "rerank"`: how many candidates to retrieve before reranking. + * Defaults to `max(topK * 5, 20)`. + */ + readonly firstStageTopK?: number; } -/** - * Callback invoked after a document is upserted. - * Receives the KB instance and the upserted document. - */ -export type OnDocumentUpsertCallback = (kb: KnowledgeBase, doc: Document) => Promise; - -/** - * Callback invoked after a document (and its chunks) are deleted. - * Receives the KB instance and the deleted document's ID. - */ -export type OnDocumentDeleteCallback = (kb: KnowledgeBase, doc_id: string) => Promise; - -/** - * Callback invoked by `search()` to handle text-to-vector conversion - * and the actual search. Returns search results. 
- */ -export type OnSearchCallback = ( - kb: KnowledgeBase, - query: string, - options?: ISearchOptions -) => Promise; +export interface ISearchWithKindOptions extends ISearchOptions { + readonly kind?: SearchKind; +} export interface KnowledgeBaseOptions { readonly title?: string; readonly description?: string; - readonly onDocumentUpsert?: OnDocumentUpsertCallback; - readonly onDocumentDelete?: OnDocumentDeleteCallback; - readonly onSearch?: OnSearchCallback; + /** + * Model ID used to embed document chunks during ingest. Consumed by the + * installed {@link IKbAiStrategy} — the KB itself doesn't run AI. + */ + readonly docEmbeddingModel?: string; + /** + * Model ID used to embed search queries. Defaults to `docEmbeddingModel` + * if absent (the common case — symmetric embedding). + */ + readonly queryEmbeddingModel?: string; + /** + * Optional cross-encoder reranker model ID. When set (and the strategy + * implements rerank against it) `search({ kind: "rerank" })` and + * `searchWithRerank()` use a real cross-encoder; otherwise the strategy + * may fall back to a heuristic. + */ + readonly rerankerModel?: string; + /** + * The AI strategy used by `upsertDocumentWithIndex`, `search`, and + * `searchWithRerank`. Installable post-construction via + * {@link KnowledgeBase.setAiStrategy}. + */ + readonly aiStrategy?: IKbAiStrategy; } /** * Unified KnowledgeBase that owns both document and vector storage, * providing lifecycle management and cascading deletes. + * + * Model configuration (`docEmbeddingModel`, `queryEmbeddingModel`, + * `rerankerModel`) lives on the KB so callers don't have to thread models + * through every retrieval call site. Actual AI execution is delegated to an + * {@link IKbAiStrategy} installed via {@link setAiStrategy} — this indirection + * keeps the KB package free of `@workglow/ai` (which depends on it). 
*/ export class KnowledgeBase { readonly name: string; readonly title: string = ""; readonly description: string = ""; + readonly docEmbeddingModel: string | undefined; + readonly queryEmbeddingModel: string | undefined; + readonly rerankerModel: string | undefined; private readonly tabularStorage: DocumentTabularStorage; private readonly chunkStorage: ChunkVectorStorage; - - /** - * Called after `upsertDocument` successfully writes to storage. - * Awaited — throwing rejects the upsert call, but storage is already committed. - * Use for chunk re-indexing, audit logging, etc. - */ - onDocumentUpsert: OnDocumentUpsertCallback | undefined; - /** - * Called after `deleteDocument` successfully deletes the document and its chunks. - * Awaited — throwing rejects the delete call, but storage is already committed. - */ - onDocumentDelete: OnDocumentDeleteCallback | undefined; - /** - * Called by `search()` to embed the query and execute the search. - * Required if you intend to call `kb.search()`. - */ - onSearch: OnSearchCallback | undefined; + private aiStrategy: IKbAiStrategy | undefined; constructor( name: string, @@ -104,10 +120,43 @@ export class KnowledgeBase { if (typeof options === "object" && options !== null) { this.title = options.title ?? name; this.description = options.description ?? ""; - this.onDocumentUpsert = options.onDocumentUpsert; - this.onDocumentDelete = options.onDocumentDelete; - this.onSearch = options.onSearch; + this.docEmbeddingModel = options.docEmbeddingModel; + this.queryEmbeddingModel = options.queryEmbeddingModel ?? options.docEmbeddingModel; + this.rerankerModel = options.rerankerModel; + this.aiStrategy = options.aiStrategy; + } + } + + // =========================================================================== + // AI strategy + // =========================================================================== + + /** + * Install (or replace) the AI strategy that powers ingest embedding and + * query-side embedding / reranking. 
The KB stores model IDs but doesn't + * load models itself; the strategy bridges to the AI runtime. + */ + setAiStrategy(strategy: IKbAiStrategy | undefined): void { + this.aiStrategy = strategy; + } + + getAiStrategy(): IKbAiStrategy | undefined { + return this.aiStrategy; + } + + /** True when a strategy is installed AND a reranker model is registered. */ + supportsRerank(): boolean { + return this.aiStrategy !== undefined && this.rerankerModel !== undefined; + } + + private requireStrategy(forOp: string): IKbAiStrategy { + if (!this.aiStrategy) { + throw new Error( + `KnowledgeBase.${forOp}() requires an AI strategy. ` + + `Install one via kb.setAiStrategy(strategy) (typically createAiKbStrategy from @workglow/ai).` + ); } + return this.aiStrategy; } // =========================================================================== @@ -115,7 +164,8 @@ export class KnowledgeBase { // =========================================================================== /** - * Upsert a document. + * Upsert a document JSON record. Does NOT chunk or embed — use + * {@link upsertDocumentWithIndex} for the full ingest path. * @returns The document with the generated doc_id if it was auto-generated */ async upsertDocument(document: Document): Promise { @@ -131,13 +181,36 @@ export class KnowledgeBase { document.setDocId(entity.doc_id); } - if (this.onDocumentUpsert) { - await this.onDocumentUpsert(this, document); - } - return document; } + /** + * Full ingest: store the document, drop any existing chunks for it, then + * chunk + embed + upsert via the installed AI strategy. Throws if no + * strategy is installed. + */ + async upsertDocumentWithIndex(document: Document): Promise { + const strategy = this.requireStrategy("upsertDocumentWithIndex"); + const stored = await this.upsertDocument(document); + const docId = stored.doc_id; + if (!docId) { + throw new Error( + "upsertDocumentWithIndex: document has no doc_id after upsertDocument." 
+ ); + } + await this.deleteChunksForDocument(docId); + const embedResult = await strategy.chunkAndEmbedDocument(stored); + if (embedResult.chunks.length === 0) { + return stored; + } + const inserts = toInsertChunkEntities(embedResult, { + doc_id: docId, + doc_title: stored.metadata.title, + }); + await this.upsertChunksBulk(inserts); + return stored; + } + /** * Get a document by ID */ @@ -155,10 +228,6 @@ export class KnowledgeBase { async deleteDocument(doc_id: string): Promise { await this.deleteChunksForDocument(doc_id); await this.tabularStorage.delete({ doc_id }); - - if (this.onDocumentDelete) { - await this.onDocumentDelete(this, doc_id); - } } /** @@ -346,27 +415,83 @@ export class KnowledgeBase { } /** - * High-level text search. Delegates to the `onSearch` callback, which is - * responsible for embedding the query and executing the appropriate search - * (similarity, hybrid, keyword, etc.). Install `onSearch` via - * `createKnowledgeBase({ onSearch })` or the KnowledgeBase constructor options. + * Hybrid (or similarity) retrieve a wide candidate set, then ask the + * strategy's reranker to score them and return the best `topK`. Requires + * an AI strategy. If the backend doesn't support hybrid search, this + * falls back to similarity for the first stage. + */ + async searchWithRerank( + query: string, + options?: ISearchOptions + ): Promise { + const strategy = this.requireStrategy("searchWithRerank"); + const topK = options?.topK ?? 5; + const firstStageTopK = options?.firstStageTopK ?? Math.max(topK * 5, 20); + const vector = await strategy.embedQuery(query); + const firstStage: ChunkSearchResult[] = this.supportsHybridSearch() + ? 
await this.hybridSearch(vector, { + textQuery: query, + topK: firstStageTopK, + filter: options?.filter as Partial | undefined, + scoreThreshold: options?.scoreThreshold, + vectorWeight: options?.vectorWeight, + }) + : await this.similaritySearch(vector, { + topK: firstStageTopK, + filter: options?.filter as Partial | undefined, + scoreThreshold: options?.scoreThreshold, + }); + if (firstStage.length === 0) { + return []; + } + return strategy.rerank(query, firstStage, topK); + } + + /** + * Unified text-query search dispatcher. The KB knows its own embedding + * model and reranker (via the installed strategy), so callers don't need + * to thread models through every call site. * - * If `onSearch` calls back into `kb.similaritySearch()` / `kb.hybridSearch()`, - * those calls still go through virtual dispatch — so subclass filter injection - * (e.g. tenant scope) applies even when the entry point is `kb.search()`. + * - `kind: "similarity"` — embed + vector search + * - `kind: "hybrid"` — embed + vector + full-text + * - `kind: "rerank"` — first-stage hybrid/similarity + cross-encoder rerank * - * @throws Error if `onSearch` is not configured. + * Defaults to `"rerank"` when a reranker model is configured, otherwise + * `"hybrid"` when supported, otherwise `"similarity"`. */ - async search(query: string, options?: ISearchOptions): Promise { - if (!this.onSearch) { - throw new Error( - "KnowledgeBase.search() requires an `onSearch` callback. " + - "Pass one via createKnowledgeBase({ onSearch }) or the KnowledgeBase " + - "constructor options. For raw vector search, use " + - "`kb.similaritySearch()` or `kb.vectorStorage.similaritySearch()` directly." - ); + async search( + query: string, + options?: ISearchWithKindOptions + ): Promise { + const kind: SearchKind = + options?.kind ?? + (this.supportsRerank() + ? "rerank" + : this.supportsHybridSearch() + ? 
"hybrid" + : "similarity"); + + if (kind === "rerank") { + return this.searchWithRerank(query, options); } - return this.onSearch(this, query, options); + + const strategy = this.requireStrategy("search"); + const vector = await strategy.embedQuery(query); + const topK = options?.topK ?? 5; + if (kind === "hybrid") { + return this.hybridSearch(vector, { + textQuery: query, + topK, + filter: options?.filter as Partial | undefined, + scoreThreshold: options?.scoreThreshold, + vectorWeight: options?.vectorWeight, + }); + } + return this.similaritySearch(vector, { + topK, + filter: options?.filter as Partial | undefined, + scoreThreshold: options?.scoreThreshold, + }); } // =========================================================================== @@ -401,6 +526,24 @@ export class KnowledgeBase { return doc; } + /** + * Re-index every document in this KB using the installed strategy. The + * caller is responsible for ensuring the strategy is set. Returns the + * number of documents re-indexed. 
+ */ + async reindex(): Promise { + this.requireStrategy("reindex"); + const docIds = await this.listDocuments(); + let count = 0; + for (const doc_id of docIds) { + const doc = await this.getDocument(doc_id); + if (!doc) continue; + await this.upsertDocumentWithIndex(doc); + count++; + } + return count; + } + /** * Setup the underlying databases */ diff --git a/packages/knowledge-base/src/knowledge-base/createKnowledgeBase.ts b/packages/knowledge-base/src/knowledge-base/createKnowledgeBase.ts index 4c950695a..23122b829 100644 --- a/packages/knowledge-base/src/knowledge-base/createKnowledgeBase.ts +++ b/packages/knowledge-base/src/knowledge-base/createKnowledgeBase.ts @@ -10,11 +10,7 @@ import type { ChunkVectorStorage } from "../chunk/ChunkVectorStorageSchema"; import { ChunkVectorPrimaryKey, ChunkVectorStorageSchema } from "../chunk/ChunkVectorStorageSchema"; import type { DocumentTabularStorage } from "../document/DocumentStorageSchema"; import { DocumentStorageKey, DocumentStorageSchema } from "../document/DocumentStorageSchema"; -import type { - OnDocumentDeleteCallback, - OnDocumentUpsertCallback, - OnSearchCallback, -} from "./KnowledgeBase"; +import type { IKbAiStrategy } from "./IKbAiStrategy"; import { KnowledgeBase } from "./KnowledgeBase"; import { registerKnowledgeBase } from "./KnowledgeBaseRegistry"; @@ -25,9 +21,10 @@ export interface CreateKnowledgeBaseOptions { readonly register?: boolean; readonly title?: string; readonly description?: string; - readonly onDocumentUpsert?: OnDocumentUpsertCallback; - readonly onDocumentDelete?: OnDocumentDeleteCallback; - readonly onSearch?: OnSearchCallback; + readonly docEmbeddingModel?: string; + readonly queryEmbeddingModel?: string; + readonly rerankerModel?: string; + readonly aiStrategy?: IKbAiStrategy; } /** @@ -38,7 +35,9 @@ export interface CreateKnowledgeBaseOptions { * const kb = await createKnowledgeBase({ * name: "my-kb", * vectorDimensions: 1024, + * docEmbeddingModel: 
"onnx:Xenova/bge-base-en-v1.5:q8", * }); + * kb.setAiStrategy(createAiKbStrategy(kb)); * ``` */ export async function createKnowledgeBase( @@ -51,9 +50,10 @@ export async function createKnowledgeBase( register: shouldRegister = true, title, description, - onDocumentUpsert, - onDocumentDelete, - onSearch, + docEmbeddingModel, + queryEmbeddingModel, + rerankerModel, + aiStrategy, } = options; const vectorCtor = vectorCtorOption ?? Float32Array; @@ -85,7 +85,7 @@ export async function createKnowledgeBase( name, tabularStorage as unknown as DocumentTabularStorage, vectorStorage as unknown as ChunkVectorStorage, - { title, description, onDocumentUpsert, onDocumentDelete, onSearch } + { title, description, docEmbeddingModel, queryEmbeddingModel, rerankerModel, aiStrategy } ); if (shouldRegister) { diff --git a/packages/test/src/test/rag/DocumentRepository.test.ts b/packages/test/src/test/rag/DocumentRepository.test.ts index 126debbf1..80c8c34a8 100644 --- a/packages/test/src/test/rag/DocumentRepository.test.ts +++ b/packages/test/src/test/rag/DocumentRepository.test.ts @@ -432,99 +432,105 @@ Paragraph.`; }); }); - describe("callbacks", () => { - it("should invoke onDocumentUpsert when a document is upserted", async () => { - const calls: Array<{ kbName: string; docId: string | undefined }> = []; - const kbWithCb = await createKnowledgeBase({ - name: `test-kb-cb-${uuid4()}`, + describe("ai strategy", () => { + it("should throw a helpful error when kb.search() is called without a strategy", async () => { + const bareKb = await createKnowledgeBase({ + name: `test-kb-nostrategy-${uuid4()}`, vectorDimensions: 3, register: false, - onDocumentUpsert: async (instance, doc) => { - calls.push({ kbName: instance.name, docId: doc.doc_id }); - }, }); - const doc_id = uuid4(); - const root = await StructuralParser.parseMarkdown(doc_id, "# Test\n\nContent.", "Test"); - const doc = new Document(root, { title: "Test" }); - - await kbWithCb.upsertDocument(doc); - - 
expect(calls).toHaveLength(1); - expect(calls[0].kbName).toBe(kbWithCb.name); - expect(calls[0].docId).toBeDefined(); + await expect(bareKb.search("hello")).rejects.toThrow(/AI strategy/); }); - it("should invoke onDocumentDelete when a document is deleted", async () => { - const deletedIds: string[] = []; - const kbWithCb = await createKnowledgeBase({ - name: `test-kb-del-${uuid4()}`, + it("should invoke embedQuery and similaritySearch for kind: 'similarity'", async () => { + const received: Array<{ text: string }> = []; + const kb = await createKnowledgeBase({ + name: `test-kb-sim-${uuid4()}`, vectorDimensions: 3, register: false, - onDocumentDelete: async (_instance, doc_id) => { - deletedIds.push(doc_id); + }); + kb.setAiStrategy({ + chunkAndEmbedDocument: async () => ({ chunks: [], vectors: [] }), + embedQuery: async (text) => { + received.push({ text }); + return new Float32Array([0.1, 0.2, 0.3]); }, + rerank: async (_q, candidates, topK) => candidates.slice(0, topK), }); - const doc_id = uuid4(); - const root = await StructuralParser.parseMarkdown(doc_id, "# T\n\nx.", "T"); - const doc = new Document(root, { title: "T" }); - const inserted = await kbWithCb.upsertDocument(doc); - - await kbWithCb.deleteDocument(inserted.doc_id!); + const results = await kb.search("hello", { kind: "similarity", topK: 4 }); - expect(deletedIds).toEqual([inserted.doc_id]); + expect(received).toEqual([{ text: "hello" }]); + expect(results).toEqual([]); }); - it("should reject upsertDocument when onDocumentUpsert throws, with storage already committed", async () => { - const kbWithCb = await createKnowledgeBase({ - name: `test-kb-throw-${uuid4()}`, + it("should run rerank by default when rerankerModel is configured", async () => { + const reranks: Array<{ query: string; n: number; topK: number }> = []; + const kb = await createKnowledgeBase({ + name: `test-kb-rerank-${uuid4()}`, vectorDimensions: 3, register: false, - onDocumentUpsert: async () => { - throw new Error("callback 
boom"); + docEmbeddingModel: "test:doc-embed", + rerankerModel: "test:rerank", + }); + kb.setAiStrategy({ + chunkAndEmbedDocument: async () => ({ chunks: [], vectors: [] }), + embedQuery: async () => new Float32Array([1, 0, 0]), + rerank: async (query, candidates, topK) => { + reranks.push({ query, n: candidates.length, topK }); + return candidates.slice(0, topK); }, }); - const doc_id = uuid4(); - const root = await StructuralParser.parseMarkdown(doc_id, "# T\n\nx.", "T"); - const doc = new Document(root, { title: "T" }); + // Seed a chunk so the first-stage retrieval returns something the + // reranker can score against. + await kb.upsertChunk({ + chunk_id: "c1", + doc_id: "d1", + vector: new Float32Array([1, 0, 0]), + metadata: { chunk_id: "c1", doc_id: "d1", text: "hi", nodePath: [], depth: 0 } as any, + }); - await expect(kbWithCb.upsertDocument(doc)).rejects.toThrow("callback boom"); + await kb.search("q", { topK: 2 }); - // Contract: storage is committed before the callback runs, so the document - // must still be retrievable even though upsertDocument rejected. 
- const retrieved = await kbWithCb.getDocument(doc.doc_id!); - expect(retrieved).toBeDefined(); - expect(retrieved?.doc_id).toBe(doc.doc_id); + expect(reranks).toHaveLength(1); + expect(reranks[0].query).toBe("q"); + expect(reranks[0].topK).toBe(2); }); - it("should throw a helpful error when kb.search() is called without onSearch", async () => { - const bareKb = await createKnowledgeBase({ - name: `test-kb-nosearch-${uuid4()}`, + it("should chunk+embed via the strategy in upsertDocumentWithIndex", async () => { + const kb = await createKnowledgeBase({ + name: `test-kb-ingest-${uuid4()}`, vectorDimensions: 3, register: false, }); - - await expect(bareKb.search("hello")).rejects.toThrow(/onSearch/); - }); - - it("should invoke onSearch with the query and options when kb.search() is called", async () => { - const received: Array<{ query: string; topK: number | undefined }> = []; - const kbWithSearch = await createKnowledgeBase({ - name: `test-kb-search-${uuid4()}`, - vectorDimensions: 3, - register: false, - onSearch: async (_kb, query, options) => { - received.push({ query, topK: options?.topK }); - return []; + kb.setAiStrategy({ + chunkAndEmbedDocument: async (doc) => { + const text = doc.metadata.title ?? ""; + return { + chunks: [ + { + chunk_id: "c1", + doc_id: doc.doc_id ?? 
"", + text, + nodePath: [], + depth: 0, + } as any, + ], + vectors: [new Float32Array([1, 0, 0])], + }; }, + embedQuery: async () => new Float32Array([0, 0, 0]), + rerank: async (_q, c, k) => c.slice(0, k), }); - const results = await kbWithSearch.search("query text", { topK: 4 }); + const root = await StructuralParser.parseMarkdown(uuid4(), "# Test\n\nx.", "Test"); + const doc = new Document(root, { title: "Test" }); + const stored = await kb.upsertDocumentWithIndex(doc); - expect(received).toEqual([{ query: "query text", topK: 4 }]); - expect(results).toEqual([]); + const chunks = await kb.getChunksForDocument(stored.doc_id!); + expect(chunks).toHaveLength(1); }); }); }); diff --git a/providers/huggingface-transformers/src/ai/common/HFT_JobRunFns.ts b/providers/huggingface-transformers/src/ai/common/HFT_JobRunFns.ts index ee9f1bf1c..35385ba6d 100644 --- a/providers/huggingface-transformers/src/ai/common/HFT_JobRunFns.ts +++ b/providers/huggingface-transformers/src/ai/common/HFT_JobRunFns.ts @@ -28,6 +28,7 @@ import { HFT_TextFillMask } from "./HFT_TextFillMask"; import { HFT_TextGeneration, HFT_TextGeneration_Stream } from "./HFT_TextGeneration"; import { HFT_TextLanguageDetection } from "./HFT_TextLanguageDetection"; import { HFT_TextNamedEntityRecognition } from "./HFT_TextNamedEntityRecognition"; +import { HFT_TextReranker } from "./HFT_TextReranker"; import { HFT_TextQuestionAnswer, HFT_TextQuestionAnswer_Stream } from "./HFT_TextQuestionAnswer"; import { HFT_TextRewriter, HFT_TextRewriter_Stream } from "./HFT_TextRewriter"; import { HFT_TextSummary, HFT_TextSummary_Stream } from "./HFT_TextSummary"; @@ -53,6 +54,7 @@ export const HFT_TASKS: Record + * SPDX-License-Identifier: Apache-2.0 + */ + +import type { TextClassificationPipeline } from "@huggingface/transformers"; +import type { + AiProviderRunFn, + TextRerankerTaskInput, + TextRerankerTaskOutput, +} from "@workglow/ai"; +import { getLogger } from "@workglow/util/worker"; +import type { 
HfTransformersOnnxModelConfig } from "./HFT_ModelSchema"; +import { getPipeline } from "./HFT_Pipeline"; + +/** + * Cross-encoder reranker run-fn. Loads a `text-classification` pipeline + * (the way transformers.js exposes cross-encoder models like + * `Xenova/bge-reranker-base`) and scores each `[query, doc]` pair. + * + * Output `indices` is sorted best-first; `scores` is the per-document score + * in the original input order so callers can join back to their candidate + * list without re-sorting. + */ +export const HFT_TextReranker: AiProviderRunFn< + TextRerankerTaskInput, + TextRerankerTaskOutput, + HfTransformersOnnxModelConfig +> = async (input, model, onProgress, signal) => { + const logger = getLogger(); + const timerLabel = `hft:TextReranker:${model?.provider_config.model_path}`; + logger.time(timerLabel, { docs: input.documents.length }); + + const reranker: TextClassificationPipeline = await getPipeline(model!, onProgress, {}, signal); + + // Transformers.js' text-classification pipeline accepts an array of + // { text, text_pair } objects for sentence-pair tasks (which cross-encoder + // rerankers are). The pipeline returns one score per input pair. + const pairs = input.documents.map((doc) => ({ text: input.query, text_pair: doc })); + const rawResults = (await (reranker as unknown as ( + inputs: Array<{ text: string; text_pair: string }>, + options?: Record + ) => Promise>>)( + pairs, + { top_k: 1 } + )) as Array<{ label: string; score: number } | Array<{ label: string; score: number }>>; + + const scores: number[] = rawResults.map((r) => { + if (Array.isArray(r)) { + // top_k > 1 returns array per input — take the best + return r[0]?.score ?? 0; + } + return r.score; + }); + + const indices = scores + .map((score, idx) => ({ score, idx })) + .sort((a, b) => b.score - a.score) + .map((p) => p.idx); + + const limited = typeof input.topK === "number" ? 
indices.slice(0, input.topK) : indices; + + logger.timeEnd(timerLabel, { docs: input.documents.length }); + return { scores, indices: limited }; +}; From 0fe9492a25ca24f0162fbbc72d7d3c03b394c0c5 Mon Sep 17 00:00:00 2001 From: Claude Date: Mon, 11 May 2026 15:41:49 +0000 Subject: [PATCH 02/11] refactor(kb): collapse public API to upsert/delete/search; strategy-driven MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Follow-up to the model-fields refactor. The KB's public RAG surface now has exactly three methods — `upsert(doc)`, `delete(doc_id)`, `search(query, opts)` — and they all delegate to an installed `IKbAiStrategy`. The lower-level storage methods (upsertDocument, upsertChunksBulk, similaritySearch, hybridSearch, etc.) stay on the class as strategy-facing building blocks; subclasses can still intercept them via virtual dispatch. `IKbAiStrategy` itself is now high-level: - `ingest(kb, doc)` — chunk + embed + write in one shot - `delete(kb, doc_id)` — cascading delete by default - `search(kb, query, opts)` — strategy picks the retrieval mode The "search kind" parameter is gone. Mode is part of the KB's stored config (new `searchMode` field, alongside `chunkStrategy`) and the standard strategy reads it on every call. - `createStandardKbStrategy({ chunker?, chunkStrategy?, searchMode? })` replaces `createAiKbStrategy`. It reads model IDs + chunkStrategy + searchMode from the KB at op time, so config changes take effect on the next call without rewiring. - `KbSearchTask` drops `kind`; just `{ knowledgeBase, query, topK?, filter? }`. - KB no longer exposes `searchWithRerank` / `upsertDocumentWithIndex` publicly; that orchestration lives in the strategy. - Custom strategies (e.g. for per-tenant scoping in the builder) keep working — they implement the same three methods and call the building-block methods however they want. 
Builder uses `ScopedKnowledgeBase` (which overrides the low-level methods) + `createStandardKbStrategy()` on top — scope injection rides through virtual dispatch without the strategy knowing. - Tests updated to exercise the new strategy shape. https://claude.ai/code/session_01Ya54WFZhpDFzAqRh1qG8Ex --- packages/ai/src/common.ts | 2 +- packages/ai/src/kb/createAiKbStrategy.ts | 174 ------ .../ai/src/kb/createStandardKbStrategy.ts | 251 +++++++++ packages/ai/src/task/KbSearchTask.ts | 36 +- .../src/knowledge-base/IKbAiStrategy.ts | 111 ++-- .../src/knowledge-base/KnowledgeBase.ts | 516 ++++++------------ .../src/knowledge-base/createKnowledgeBase.ts | 17 +- .../src/test/rag/DocumentRepository.test.ts | 106 ++-- 8 files changed, 562 insertions(+), 651 deletions(-) delete mode 100644 packages/ai/src/kb/createAiKbStrategy.ts create mode 100644 packages/ai/src/kb/createStandardKbStrategy.ts diff --git a/packages/ai/src/common.ts b/packages/ai/src/common.ts index 675f542b2..e1dc2cf96 100644 --- a/packages/ai/src/common.ts +++ b/packages/ai/src/common.ts @@ -25,4 +25,4 @@ export * from "./provider/QueuedAiProvider"; export * from "./task"; -export * from "./kb/createAiKbStrategy"; +export * from "./kb/createStandardKbStrategy"; diff --git a/packages/ai/src/kb/createAiKbStrategy.ts b/packages/ai/src/kb/createAiKbStrategy.ts deleted file mode 100644 index 87a170839..000000000 --- a/packages/ai/src/kb/createAiKbStrategy.ts +++ /dev/null @@ -1,174 +0,0 @@ -/** - * @license - * Copyright 2026 Steven Roussey - * SPDX-License-Identifier: Apache-2.0 - */ - -import type { - ChunkSearchResult, - IKbAiStrategy, - KnowledgeBase, -} from "@workglow/knowledge-base"; -import type { ChunkRecord, Document } from "@workglow/knowledge-base"; -import type { TypedArray } from "@workglow/util/schema"; - -import { HierarchicalChunkerTask } from "../task/HierarchicalChunkerTask"; -import { RerankerTask } from "../task/RerankerTask"; -import { TextEmbeddingTask } from "../task/TextEmbeddingTask"; 
-import { TextRerankerTask } from "../task/TextRerankerTask"; - -/** - * Tuning knobs for the default AI strategy. Chunker defaults match the - * builder's historical embed-workflow defaults so re-indexing produces the - * same chunk boundaries unless explicitly changed. - */ -export interface CreateAiKbStrategyOptions { - readonly chunker?: { - readonly maxTokens?: number; - readonly overlap?: number; - readonly reservedTokens?: number; - readonly strategy?: "hierarchical" | "flat" | "sentence"; - }; -} - -/** - * Build an {@link IKbAiStrategy} that wires a KB's configured model IDs to - * the real AI runtime (TextEmbeddingTask, TextRerankerTask, etc.). The - * strategy reads `kb.docEmbeddingModel`, `kb.queryEmbeddingModel`, and - * `kb.rerankerModel` lazily on every call, so changes after installation - * take effect on the next operation. - */ -export function createAiKbStrategy( - kb: KnowledgeBase, - options: CreateAiKbStrategyOptions = {} -): IKbAiStrategy { - const chunkerDefaults = { - maxTokens: options.chunker?.maxTokens ?? 512, - overlap: options.chunker?.overlap ?? 50, - reservedTokens: options.chunker?.reservedTokens ?? 10, - strategy: options.chunker?.strategy ?? "hierarchical", - } as const; - - const requireDocEmbed = (): string => { - const m = kb.docEmbeddingModel; - if (!m) { - throw new Error( - `KnowledgeBase "${kb.name}" has no docEmbeddingModel configured; ` + - `set it in createKnowledgeBase / BuilderKnowledgeBaseRecord.` - ); - } - return m; - }; - - const requireQueryEmbed = (): string => { - const m = kb.queryEmbeddingModel ?? 
kb.docEmbeddingModel; - if (!m) { - throw new Error( - `KnowledgeBase "${kb.name}" has no queryEmbeddingModel or docEmbeddingModel configured.` - ); - } - return m; - }; - - const embedTexts = async ( - texts: readonly string[], - modelId: string - ): Promise => { - if (texts.length === 0) return []; - const task = new TextEmbeddingTask(); - const result = await task.run({ text: texts as string[], model: modelId }); - const vector = result.vector; - if (Array.isArray(vector)) { - return vector as TypedArray[]; - } - return [vector as TypedArray]; - }; - - return { - async chunkAndEmbedDocument(doc: Document) { - const docId = doc.doc_id; - if (!docId) { - throw new Error( - "chunkAndEmbedDocument: document has no doc_id. " + - "Call kb.upsertDocument(doc) first or assign a doc_id." - ); - } - const chunker = new HierarchicalChunkerTask(); - const chunkResult = await chunker.run({ - doc_id: docId, - documentTree: doc.root as any, - ...chunkerDefaults, - }); - const chunks = chunkResult.chunks as ChunkRecord[]; - if (chunks.length === 0) { - return { chunks: [], vectors: [] }; - } - const vectors = await embedTexts( - chunks.map((c) => c.text), - requireDocEmbed() - ); - return { chunks, vectors }; - }, - - async embedQuery(text: string): Promise { - const vectors = await embedTexts([text], requireQueryEmbed()); - return vectors[0]; - }, - - async rerank( - query: string, - candidates: ChunkSearchResult[], - topK: number - ): Promise { - if (candidates.length === 0) { - return []; - } - const limit = Math.min(topK, candidates.length); - const rerankerModel = kb.rerankerModel; - const docs = candidates.map((c) => { - const meta = c.metadata as Record | undefined; - const text = meta?.text; - return typeof text === "string" ? text : JSON.stringify(meta ?? 
{}); - }); - - if (rerankerModel) { - const task = new TextRerankerTask(); - const result = await task.run({ - query, - documents: docs, - model: rerankerModel, - topK: limit, - }); - const indices = (result.indices as number[]) ?? []; - const scores = (result.scores as number[]) ?? []; - return indices.map((idx) => { - const candidate = candidates[idx]; - const newScore = scores[idx]; - return { - ...candidate, - score: typeof newScore === "number" ? newScore : candidate.score, - }; - }); - } - - // Heuristic fallback — keeps the API usable without a reranker model. - const heuristic = await new RerankerTask().run({ - query, - chunks: docs, - scores: candidates.map((c) => c.score), - metadata: candidates.map((c) => c.metadata as Record), - topK: limit, - method: "simple", - }); - const indices = (heuristic.originalIndices as number[]) ?? []; - const newScores = (heuristic.scores as number[]) ?? []; - return indices.map((idx, rank) => { - const candidate = candidates[idx]; - return { - ...candidate, - score: newScores[rank] ?? 
candidate.score, - }; - }); - }, - }; -} diff --git a/packages/ai/src/kb/createStandardKbStrategy.ts b/packages/ai/src/kb/createStandardKbStrategy.ts new file mode 100644 index 000000000..7585f500b --- /dev/null +++ b/packages/ai/src/kb/createStandardKbStrategy.ts @@ -0,0 +1,251 @@ +/** + * @license + * Copyright 2026 Steven Roussey + * SPDX-License-Identifier: Apache-2.0 + */ + +import type { + ChunkSearchResult, + ChunkStrategy, + Document, + IKbAiStrategy, + IKbStrategyTarget, + ISearchOptions, + SearchMode, +} from "@workglow/knowledge-base"; +import { toInsertChunkEntities } from "@workglow/knowledge-base"; +import type { TypedArray } from "@workglow/util/schema"; + +import { HierarchicalChunkerTask } from "../task/HierarchicalChunkerTask"; +import { RerankerTask } from "../task/RerankerTask"; +import { TextEmbeddingTask } from "../task/TextEmbeddingTask"; +import { TextRerankerTask } from "../task/TextRerankerTask"; + +/** + * Tuning knobs for the standard strategy. Most defaults come straight from + * the KB (model IDs, chunkStrategy, searchMode); these overrides exist for + * callers that want different chunker token budgets than the built-in + * defaults or that need to pin a search mode different from what the KB + * has stored. + */ +export interface CreateStandardKbStrategyOptions { + readonly chunker?: { + readonly maxTokens?: number; + readonly overlap?: number; + readonly reservedTokens?: number; + }; + /** Override KB's chunkStrategy at strategy-build time. */ + readonly chunkStrategy?: ChunkStrategy; + /** Override KB's searchMode at strategy-build time. */ + readonly searchMode?: SearchMode; + /** + * For `searchMode === "rerank"`: how many candidates to retrieve before + * reranking. Defaults to `max(topK * 5, 20)`. + */ + readonly firstStageMultiplier?: number; +} + +/** + * The standard KB strategy: hierarchical-by-default chunking + embedding + * during ingest, and a single search mode for retrieval. 
Search and ingest + * read the KB's stored model IDs (`docEmbeddingModel` / + * `queryEmbeddingModel` / `rerankerModel`) and config fields + * (`chunkStrategy` / `searchMode`) on every call, so updates to the KB + * record take effect immediately on the next op. + * + * For custom RAG flows (per-tenant scoping, alternative chunkers, etc.) + * write your own `IKbAiStrategy` — this factory is the "good defaults" + * path, not the only path. + */ +export function createStandardKbStrategy( + options: CreateStandardKbStrategyOptions = {} +): IKbAiStrategy { + const chunkerDefaults = { + maxTokens: options.chunker?.maxTokens ?? 512, + overlap: options.chunker?.overlap ?? 50, + reservedTokens: options.chunker?.reservedTokens ?? 10, + } as const; + const firstStageMultiplier = options.firstStageMultiplier ?? 5; + + const resolveSearchMode = (kb: IKbStrategyTarget): SearchMode => { + if (options.searchMode) return options.searchMode; + if (kb.searchMode) return kb.searchMode; + if (kb.rerankerModel) return "rerank"; + if (kb.supportsHybridSearch()) return "hybrid"; + return "similarity"; + }; + + const resolveChunkStrategy = (kb: IKbStrategyTarget): ChunkStrategy => + options.chunkStrategy ?? kb.chunkStrategy ?? "hierarchical"; + + const requireQueryEmbedModel = (kb: IKbStrategyTarget): string => { + const m = kb.queryEmbeddingModel ?? 
kb.docEmbeddingModel; + if (!m) { + throw new Error( + `KnowledgeBase "${kb.name}": no queryEmbeddingModel or docEmbeddingModel configured.` + ); + } + return m; + }; + + const requireDocEmbedModel = (kb: IKbStrategyTarget): string => { + const m = kb.docEmbeddingModel; + if (!m) { + throw new Error(`KnowledgeBase "${kb.name}": no docEmbeddingModel configured.`); + } + return m; + }; + + const embedTexts = async (texts: readonly string[], modelId: string): Promise => { + if (texts.length === 0) return []; + const result = await new TextEmbeddingTask().run({ text: texts as string[], model: modelId }); + const vector = result.vector; + return Array.isArray(vector) ? (vector as TypedArray[]) : [vector as TypedArray]; + }; + + return { + async ingest(kb, doc): Promise { + if (!doc.doc_id) { + // Let storage auto-generate by writing the document first. + await kb.upsertDocument(doc); + } + const stored = await kb.upsertDocument(doc); + const docId = stored.doc_id!; + // Replace existing chunks for this doc so re-ingest is idempotent. + await kb.deleteChunksForDocument(docId); + + const chunker = new HierarchicalChunkerTask(); + const chunkResult = await chunker.run({ + doc_id: docId, + documentTree: stored.root as never, + strategy: resolveChunkStrategy(kb), + ...chunkerDefaults, + }); + const chunks = chunkResult.chunks ?? []; + if (chunks.length === 0) return stored; + + const vectors = await embedTexts( + chunks.map((c) => c.text), + requireDocEmbedModel(kb) + ); + const inserts = toInsertChunkEntities( + { chunks, vectors }, + { doc_id: docId, doc_title: stored.metadata.title } + ); + await kb.upsertChunksBulk(inserts); + return stored; + }, + + async delete(kb, doc_id): Promise { + await kb.deleteDocument(doc_id); + }, + + async search(kb, query, options?: ISearchOptions): Promise { + const mode = resolveSearchMode(kb); + const topK = options?.topK ?? 
5; + const filter = options?.filter; + const scoreThreshold = options?.scoreThreshold; + + if (mode === "text") { + // Pure FTS via hybridSearch with vectorWeight=0 — works on backends + // that support hybrid (Postgres, in-memory). Storage without hybrid + // support cannot serve this mode, so we throw instead of degrading to + // a meaningless zero-vector similarity search; pick a different mode. + if (!kb.supportsHybridSearch()) { + throw new Error( + `searchMode "text" needs hybrid-capable storage; install a backend with hybridSearch.` + ); + } + const dummy = new Float32Array(kb.getVectorDimensions()); + return kb.hybridSearch(dummy, { + textQuery: query, + topK, + filter, + scoreThreshold, + vectorWeight: 0, + }); + } + + const queryVec = await embedTexts([query], requireQueryEmbedModel(kb)); + const vector = queryVec[0]; + + if (mode === "similarity") { + return kb.similaritySearch(vector, { topK, filter, scoreThreshold }); + } + + if (mode === "hybrid") { + if (!kb.supportsHybridSearch()) { + // Graceful fallback — hybrid requested but backend doesn't have it. + return kb.similaritySearch(vector, { topK, filter, scoreThreshold }); + } + return kb.hybridSearch(vector, { + textQuery: query, + topK, + filter, + scoreThreshold, + }); + } + + // mode === "rerank" + const firstStageTopK = Math.max(topK * firstStageMultiplier, topK); + const firstStage: ChunkSearchResult[] = kb.supportsHybridSearch() + ? await kb.hybridSearch(vector, { + textQuery: query, + topK: firstStageTopK, + filter, + scoreThreshold, + }) + : await kb.similaritySearch(vector, { + topK: firstStageTopK, + filter, + scoreThreshold, + }); + if (firstStage.length === 0) return []; + + const docs = firstStage.map((c) => { + const meta = c.metadata as Record | undefined; + const text = meta?.text; + return typeof text === "string" ? text : JSON.stringify(meta ?? 
{}); + }); + + if (kb.rerankerModel) { + const result = await new TextRerankerTask().run({ + query, + documents: docs, + model: kb.rerankerModel, + topK, + }); + const indices = (result.indices as number[]) ?? []; + const scores = (result.scores as number[]) ?? []; + return indices.map((idx) => { + const candidate = firstStage[idx]; + const newScore = scores[idx]; + return { + ...candidate, + score: typeof newScore === "number" ? newScore : candidate.score, + }; + }); + } + + // No reranker model configured but mode is "rerank" — fall back to a + // local heuristic so callers still get a usable ordering. + const heuristic = await new RerankerTask().run({ + query, + chunks: docs, + scores: firstStage.map((c) => c.score), + metadata: firstStage.map((c) => c.metadata as Record), + topK, + method: "simple", + }); + const indices = (heuristic.originalIndices as number[]) ?? []; + const newScores = (heuristic.scores as number[]) ?? []; + return indices.map((idx, rank) => { + const candidate = firstStage[idx]; + return { + ...candidate, + score: newScores[rank] ?? 
candidate.score, + }; + }); + }, + }; +} diff --git a/packages/ai/src/task/KbSearchTask.ts b/packages/ai/src/task/KbSearchTask.ts index 8a4800f6e..593ca8521 100644 --- a/packages/ai/src/task/KbSearchTask.ts +++ b/packages/ai/src/task/KbSearchTask.ts @@ -4,7 +4,7 @@ * SPDX-License-Identifier: Apache-2.0 */ -import type { ChunkSearchResult, KnowledgeBase, SearchKind } from "@workglow/knowledge-base"; +import type { ChunkSearchResult, KnowledgeBase } from "@workglow/knowledge-base"; import { TypeKnowledgeBase } from "@workglow/knowledge-base"; import { CreateWorkflow, IExecuteContext, Task, Workflow } from "@workglow/task-graph"; import type { TaskConfig } from "@workglow/task-graph"; @@ -15,32 +15,24 @@ const inputSchema = { properties: { knowledgeBase: TypeKnowledgeBase({ title: "Knowledge Base", - description: "The knowledge base instance to search in", + description: "Knowledge base to search.", }), query: { type: "string", title: "Query", - description: "Search query. The KB owns its embedding/reranker models internally.", - }, - kind: { - type: "string", - enum: ["similarity", "hybrid", "rerank"], - title: "Retrieval Kind", - description: - "Retrieval flavor. Defaults to 'rerank' when the KB has a reranker model, " + - "otherwise 'hybrid' if supported, otherwise 'similarity'.", + description: "Search query text.", }, topK: { type: "number", title: "Top K", - description: "Number of top results to return", + description: "Number of top results to return.", minimum: 1, default: 5, }, filter: { type: "object", title: "Metadata Filter", - description: "Filter results by metadata fields", + description: "Filter results by chunk metadata fields.", }, }, required: ["knowledgeBase", "query"], @@ -99,17 +91,17 @@ export type KbSearchTaskOutput = { export type KbSearchTaskConfig = TaskConfig; /** - * High-level KB search task. Delegates to `kb.search(query, { kind })`; the - * KB owns the embedding and reranker models internally, so no `model` input - * is needed here. 
+ * High-level KB search task. Delegates to `kb.search(query, options)`; the + * KB and its installed strategy decide everything else (embedding model, + * retrieval mode, rerank). No model or kind input here — that's the + * point of moving the config onto the KB itself. */ export class KbSearchTask extends Task { public static override type = "KbSearchTask"; public static override category = "RAG"; public static override title = "KB Search"; public static override description = - "Search a knowledge base. The KB picks the retrieval kind (similarity / hybrid / rerank) " + - "from its configured models, or you can override via `kind`."; + "Search a knowledge base. The KB owns its embedding/reranker models and search mode internally."; public static override cacheable = true; public static override inputSchema(): DataPortSchema { @@ -124,13 +116,9 @@ export class KbSearchTask extends Task { - const { knowledgeBase, query, kind, topK = 5, filter } = input; + const { knowledgeBase, query, topK = 5, filter } = input; const kb = knowledgeBase as KnowledgeBase; - const results = await kb.search(query, { - kind: kind as SearchKind | undefined, - topK, - filter, - }); + const results = await kb.search(query, { topK, filter }); return { results, chunks: results.map((r) => { diff --git a/packages/knowledge-base/src/knowledge-base/IKbAiStrategy.ts b/packages/knowledge-base/src/knowledge-base/IKbAiStrategy.ts index 916dbe7f6..fc7c7df7c 100644 --- a/packages/knowledge-base/src/knowledge-base/IKbAiStrategy.ts +++ b/packages/knowledge-base/src/knowledge-base/IKbAiStrategy.ts @@ -8,68 +8,115 @@ import type { TypedArray } from "@workglow/util/schema"; import type { ChunkRecord } from "../chunk/ChunkSchema"; import type { ChunkSearchResult, InsertChunkVectorEntity } from "../chunk/ChunkVectorStorageSchema"; import type { Document } from "../document/Document"; +import type { ISearchOptions } from "./KnowledgeBase"; /** - * Strategy that bridges a {@link KnowledgeBase} to an AI 
runtime. The KB owns - * its model IDs (docEmbeddingModel, queryEmbeddingModel, rerankerModel) as - * configuration; the strategy uses them to perform the actual chunking, - * embedding, and reranking. This indirection keeps `@workglow/knowledge-base` - * free of any `@workglow/ai` dependency (ai depends on KB, not the other way - * round) while letting higher layers install a real implementation. + * Strategy that bridges a {@link KnowledgeBase} to an AI runtime. The + * strategy is the single extension point: a KB has exactly one installed + * strategy, and `kb.upsert` / `kb.delete` / `kb.search` delegate to it. + * + * Two flavors ship in `@workglow/ai`: + * - `createStandardKbStrategy(...)` — defaults parameterized by chunker + * strategy and search mode; reads the KB's model IDs at op time. + * - Custom — write your own to add scoping, alternative chunkers, or + * unusual retrieval flows. The builder's KBs use a custom strategy on + * top of `ScopedKnowledgeBase` so user/project ids ride along. + * + * Strategies receive the KB instance on every call. Calls into + * `kb.upsertDocument` / `kb.upsertChunksBulk` / `kb.similaritySearch` etc. + * go through virtual dispatch — subclasses (e.g. `ScopedKnowledgeBase`) + * can intercept the low-level ops without the strategy knowing. */ export interface IKbAiStrategy { /** - * Chunk a document and produce vectors for each chunk. The returned arrays - * must be the same length; index `i` of `vectors` is the embedding for - * `chunks[i]`. The strategy is responsible for chunker configuration and - * picking which embedding model to use (typically `kb.docEmbeddingModel`). + * Ingest a single document: chunk + embed + write document + write + * chunks. The strategy decides chunker strategy, dedup behavior, + * embedding model, etc. Returns the stored document (possibly with a + * newly-assigned doc_id). 
*/ - chunkAndEmbedDocument( - doc: Document - ): Promise<{ readonly chunks: ChunkRecord[]; readonly vectors: TypedArray[] }>; + ingest(kb: IKbStrategyTarget, doc: Document): Promise; /** - * Embed a text query into a single vector for vector / hybrid retrieval. - * Typically uses `kb.queryEmbeddingModel` (falling back to docEmbeddingModel). + * Remove a document and its chunks. The default cascading delete works + * for most cases; override to add audit logging, soft delete, etc. */ - embedQuery(text: string): Promise; + delete(kb: IKbStrategyTarget, doc_id: string): Promise; /** - * Rerank an initial candidate list against the query. Implementations may - * call a cross-encoder model (`kb.rerankerModel`) or fall back to a - * heuristic. The returned array is at most `topK` results, ordered - * best-first, and carries updated `score` values. + * Run a text query and return matching chunks. The strategy picks the + * retrieval flavor (similarity, hybrid, reranker, plain text) — callers + * don't choose per-call. */ - rerank( + search( + kb: IKbStrategyTarget, query: string, - candidates: ChunkSearchResult[], - topK: number + options?: ISearchOptions ): Promise; } /** - * Shape returned by `chunkAndEmbedDocument`. Exposed for strategy - * implementations that want to construct the result without importing the - * internal types from the strategy interface. + * The narrow KB surface strategies operate against. Spells out exactly the + * building-block methods strategies need so the public KB API + * (`upsert`/`delete`/`search`) stays the only surface callers see. 
*/ -export interface KbStrategyEmbedResult { - readonly chunks: ChunkRecord[]; - readonly vectors: TypedArray[]; +export interface IKbStrategyTarget { + readonly name: string; + readonly docEmbeddingModel: string | undefined; + readonly queryEmbeddingModel: string | undefined; + readonly rerankerModel: string | undefined; + readonly chunkStrategy: ChunkStrategy | undefined; + readonly searchMode: SearchMode | undefined; + getVectorDimensions(): number; + supportsHybridSearch(): boolean; + /** Low-level: store a document JSON record without chunking. */ + upsertDocument(doc: Document): Promise; + /** Low-level: cascade delete a document + its chunks. */ + deleteDocument(doc_id: string): Promise; + /** Low-level: drop every chunk row for the given doc_id. */ + deleteChunksForDocument(doc_id: string): Promise; + /** Low-level: bulk-write chunk vectors. */ + upsertChunksBulk(chunks: InsertChunkVectorEntity[]): Promise; + /** Low-level: pure-vector retrieval. */ + similaritySearch( + query: TypedArray, + options?: { topK?: number; filter?: Readonly>; scoreThreshold?: number } + ): Promise; + /** Low-level: vector + full-text retrieval. */ + hybridSearch( + query: TypedArray, + options: { + readonly textQuery: string; + readonly topK?: number; + readonly filter?: Readonly>; + readonly scoreThreshold?: number; + readonly vectorWeight?: number; + } + ): Promise; } +/** Document-chunker strategy registered on the KB; consumed by ingest. */ +export type ChunkStrategy = "hierarchical" | "flat" | "sentence"; + +/** + * Retrieval mode registered on the KB; consumed by search. `text` is pure + * full-text (FTS) and bypasses embedding; the others require an embedding + * model (and `rerank` also requires `rerankerModel`). 
+ */ +export type SearchMode = "text" | "similarity" | "hybrid" | "rerank"; + /** - * Convert a `KbStrategyEmbedResult` plus a `doc_id` / `doc_title` into the + * Convert chunker output (chunks + parallel vectors) into the * `InsertChunkVectorEntity` records that `kb.upsertChunksBulk()` expects. * Shared helper so every strategy uses identical key derivation. */ export function toInsertChunkEntities( - result: KbStrategyEmbedResult, + result: { readonly chunks: ChunkRecord[]; readonly vectors: TypedArray[] }, context: { readonly doc_id: string; readonly doc_title?: string } ): InsertChunkVectorEntity[] { const { chunks, vectors } = result; if (chunks.length !== vectors.length) { throw new Error( - `IKbAiStrategy.chunkAndEmbedDocument returned ${chunks.length} chunks but ${vectors.length} vectors` + `Chunk/vector length mismatch: ${chunks.length} chunks but ${vectors.length} vectors` ); } return chunks.map((chunk, i) => ({ diff --git a/packages/knowledge-base/src/knowledge-base/KnowledgeBase.ts b/packages/knowledge-base/src/knowledge-base/KnowledgeBase.ts index 66b51458d..c557236cd 100644 --- a/packages/knowledge-base/src/knowledge-base/KnowledgeBase.ts +++ b/packages/knowledge-base/src/knowledge-base/KnowledgeBase.ts @@ -20,42 +20,16 @@ import type { DocumentTabularStorage, InsertDocumentStorageEntity, } from "../document/DocumentStorageSchema"; -import type { IKbAiStrategy } from "./IKbAiStrategy"; -import { toInsertChunkEntities } from "./IKbAiStrategy"; +import type { ChunkStrategy, IKbAiStrategy, SearchMode } from "./IKbAiStrategy"; /** - * Retrieval flavor selected by {@link KnowledgeBase.search}. - * - * - `similarity`: vector cosine similarity only. Requires `embedQuery`. - * - `hybrid`: vector + full-text. Requires `embedQuery` and a hybrid-capable - * storage backend. - * - `rerank`: hybrid (or similarity, if hybrid unsupported) first stage - * followed by cross-encoder reranking. Requires `rerank` on the strategy. 
- */ -export type SearchKind = "similarity" | "hybrid" | "rerank"; - -/** - * Options passed through `kb.search()` / `kb.searchWithRerank()`. `filter` is - * a loose record; allowed keys are defined by the underlying vector storage. + * Options passed through `kb.search()`. `filter` is a loose record; allowed + * keys are defined by the underlying vector storage. */ export interface ISearchOptions { readonly topK?: number; readonly filter?: Readonly>; readonly scoreThreshold?: number; - /** - * For `kind: "hybrid"` and the first stage of `kind: "rerank"`: vector - * vs. text weighting in [0, 1]. Defaults to the storage backend's default. - */ - readonly vectorWeight?: number; - /** - * For `kind: "rerank"`: how many candidates to retrieve before reranking. - * Defaults to `max(topK * 5, 20)`. - */ - readonly firstStageTopK?: number; -} - -export interface ISearchWithKindOptions extends ISearchOptions { - readonly kind?: SearchKind; } export interface KnowledgeBaseOptions { @@ -72,29 +46,43 @@ export interface KnowledgeBaseOptions { */ readonly queryEmbeddingModel?: string; /** - * Optional cross-encoder reranker model ID. When set (and the strategy - * implements rerank against it) `search({ kind: "rerank" })` and - * `searchWithRerank()` use a real cross-encoder; otherwise the strategy - * may fall back to a heuristic. + * Optional cross-encoder reranker model ID. Required when `searchMode` + * is `"rerank"`. */ readonly rerankerModel?: string; + /** Chunker mode used by ingest. Defaults to `"hierarchical"`. */ + readonly chunkStrategy?: ChunkStrategy; + /** + * Retrieval mode used by search. Defaults to `"rerank"` when a reranker + * model is configured, `"hybrid"` when the storage supports it, + * otherwise `"similarity"`. + */ + readonly searchMode?: SearchMode; /** - * The AI strategy used by `upsertDocumentWithIndex`, `search`, and - * `searchWithRerank`. Installable post-construction via - * {@link KnowledgeBase.setAiStrategy}. 
+ * The AI strategy used by `upsert`, `delete`, and `search`. Installable + * post-construction via {@link KnowledgeBase.setAiStrategy}. */ readonly aiStrategy?: IKbAiStrategy; } /** - * Unified KnowledgeBase that owns both document and vector storage, - * providing lifecycle management and cascading deletes. + * Unified KnowledgeBase that owns both document and vector storage. * - * Model configuration (`docEmbeddingModel`, `queryEmbeddingModel`, - * `rerankerModel`) lives on the KB so callers don't have to thread models - * through every retrieval call site. Actual AI execution is delegated to an - * {@link IKbAiStrategy} installed via {@link setAiStrategy} — this indirection - * keeps the KB package free of `@workglow/ai` (which depends on it). + * The public API is intentionally tiny: `upsert`, `delete`, `search`, plus + * lifecycle and inspection helpers. RAG behavior (chunking, embedding, + * retrieval flavor) is fully delegated to an installed + * {@link IKbAiStrategy}. Two flavors ship: + * - `createStandardKbStrategy(...)` from `@workglow/ai` — picks chunker + * mode and search mode from this KB's `chunkStrategy` / `searchMode` + * fields. Uses the registered model IDs. + * - Custom strategies — write your own when you need scoping or unusual + * retrieval; the builder ships one for per-project KBs. + * + * Storage access methods (`upsertDocument`, `upsertChunksBulk`, + * `similaritySearch`, `hybridSearch`, etc.) remain on the class as + * building blocks that strategies and subclasses use. They are documented + * as "strategy-facing" — application code should go through `kb.upsert` / + * `kb.delete` / `kb.search` instead. 
*/ export class KnowledgeBase { readonly name: string; @@ -103,6 +91,8 @@ export class KnowledgeBase { readonly docEmbeddingModel: string | undefined; readonly queryEmbeddingModel: string | undefined; readonly rerankerModel: string | undefined; + readonly chunkStrategy: ChunkStrategy | undefined; + readonly searchMode: SearchMode | undefined; private readonly tabularStorage: DocumentTabularStorage; private readonly chunkStorage: ChunkVectorStorage; private aiStrategy: IKbAiStrategy | undefined; @@ -123,19 +113,16 @@ export class KnowledgeBase { this.docEmbeddingModel = options.docEmbeddingModel; this.queryEmbeddingModel = options.queryEmbeddingModel ?? options.docEmbeddingModel; this.rerankerModel = options.rerankerModel; + this.chunkStrategy = options.chunkStrategy; + this.searchMode = options.searchMode; this.aiStrategy = options.aiStrategy; } } // =========================================================================== - // AI strategy + // Strategy installation // =========================================================================== - /** - * Install (or replace) the AI strategy that powers ingest embedding and - * query-side embedding / reranking. The KB stores model IDs but doesn't - * load models itself; the strategy bridges to the AI runtime. - */ setAiStrategy(strategy: IKbAiStrategy | undefined): void { this.aiStrategy = strategy; } @@ -144,28 +131,61 @@ export class KnowledgeBase { return this.aiStrategy; } - /** True when a strategy is installed AND a reranker model is registered. */ - supportsRerank(): boolean { - return this.aiStrategy !== undefined && this.rerankerModel !== undefined; - } - private requireStrategy(forOp: string): IKbAiStrategy { if (!this.aiStrategy) { throw new Error( `KnowledgeBase.${forOp}() requires an AI strategy. 
` + - `Install one via kb.setAiStrategy(strategy) (typically createAiKbStrategy from @workglow/ai).` + `Install one via kb.setAiStrategy(strategy) — see createStandardKbStrategy from @workglow/ai.` ); } return this.aiStrategy; } // =========================================================================== - // Document CRUD + // Public RAG API — strategy-driven + // =========================================================================== + + /** + * Ingest a document end-to-end: chunk + embed + write. Delegates to the + * installed strategy. + */ + async upsert(doc: Document): Promise { + return this.requireStrategy("upsert").ingest(this, doc); + } + + /** + * Remove a document and its chunks. Delegates to the installed strategy. + * `delete` is a reserved word in JavaScript, but reserved words are valid + * as method names, so `kb.delete(...)` is an ordinary method call. It is + * unrelated to the `delete` operator, which removes a property from an + * object. + */ + async delete(doc_id: string): Promise { + return this.requireStrategy("delete").delete(this, doc_id); + } + + /** + * Run a text query. Retrieval flavor (text / similarity / hybrid / + * rerank) is decided by the installed strategy — typically derived from + * this KB's `searchMode` field. + */ + async search(query: string, options?: ISearchOptions): Promise { + return this.requireStrategy("search").search(this, query, options); + } + + // =========================================================================== + // Strategy-facing building blocks + // + // These methods are public so strategies (and subclasses like + // `ScopedKnowledgeBase`) can call them, but application code should go + // through `upsert` / `delete` / `search` above. The contract: every one + // of these goes through virtual dispatch, so a subclass can intercept + // any of them without the strategy knowing. 
// =========================================================================== /** - * Upsert a document JSON record. Does NOT chunk or embed — use - * {@link upsertDocumentWithIndex} for the full ingest path. + * Store a document JSON record. Does NOT chunk or embed; the strategy + * does that orchestration and then calls back into this method. * @returns The document with the generated doc_id if it was auto-generated */ async upsertDocument(document: Document): Promise { @@ -185,150 +205,28 @@ export class KnowledgeBase { } /** - * Full ingest: store the document, drop any existing chunks for it, then - * chunk + embed + upsert via the installed AI strategy. Throws if no - * strategy is installed. + * Cascading delete: chunks first, then the document row. Strategies call + * this directly when their `delete()` doesn't need extra logic. */ - async upsertDocumentWithIndex(document: Document): Promise { - const strategy = this.requireStrategy("upsertDocumentWithIndex"); - const stored = await this.upsertDocument(document); - const docId = stored.doc_id; - if (!docId) { - throw new Error( - "upsertDocumentWithIndex: document has no doc_id after upsertDocument." 
- ); - } - await this.deleteChunksForDocument(docId); - const embedResult = await strategy.chunkAndEmbedDocument(stored); - if (embedResult.chunks.length === 0) { - return stored; - } - const inserts = toInsertChunkEntities(embedResult, { - doc_id: docId, - doc_title: stored.metadata.title, - }); - await this.upsertChunksBulk(inserts); - return stored; + async deleteDocument(doc_id: string): Promise { + await this.deleteChunksForDocument(doc_id); + await this.tabularStorage.delete({ doc_id }); } - /** - * Get a document by ID - */ async getDocument(doc_id: string): Promise { const entity = await this.tabularStorage.get({ doc_id }); - if (!entity) { - return undefined; - } + if (!entity) return undefined; return Document.fromJSON(entity.data, entity.doc_id); } - /** - * Delete a document and all its chunks (cascading delete). - */ - async deleteDocument(doc_id: string): Promise { - await this.deleteChunksForDocument(doc_id); - await this.tabularStorage.delete({ doc_id }); - } - - /** - * List all document IDs - */ async listDocuments(): Promise { const entities = await this.tabularStorage.getAll(); - if (!entities) { - return []; - } + if (!entities) return []; return entities.map((e: DocumentStorageEntity) => e.doc_id); } - // =========================================================================== - // Tree traversal - // =========================================================================== - - /** - * Get a specific node by ID from a document - */ - async getNode(doc_id: string, nodeId: string): Promise { - const doc = await this.getDocument(doc_id); - if (!doc) { - return undefined; - } - - const traverse = (node: DocumentNode): DocumentNode | undefined => { - if (node.nodeId === nodeId) { - return node; - } - if ("children" in node && Array.isArray(node.children)) { - for (const child of node.children) { - const found = traverse(child); - if (found) return found; - } - } - return undefined; - }; - - return traverse(doc.root); - } - - /** - * Get 
ancestors of a node (from root to target node) - */ - async getAncestors(doc_id: string, nodeId: string): Promise { - const doc = await this.getDocument(doc_id); - if (!doc) { - return []; - } - - const path: string[] = []; - const findPath = (node: DocumentNode): boolean => { - path.push(node.nodeId); - if (node.nodeId === nodeId) { - return true; - } - if ("children" in node && Array.isArray(node.children)) { - for (const child of node.children) { - if (findPath(child)) { - return true; - } - } - } - path.pop(); - return false; - }; - - if (!findPath(doc.root)) { - return []; - } - - const ancestors: DocumentNode[] = []; - let currentNode: DocumentNode = doc.root; - ancestors.push(currentNode); - - for (let i = 1; i < path.length; i++) { - const targetId = path[i]; - if ("children" in currentNode && Array.isArray(currentNode.children)) { - const found = currentNode.children.find((child: DocumentNode) => child.nodeId === targetId); - if (found) { - currentNode = found; - ancestors.push(currentNode); - } else { - break; - } - } else { - break; - } - } - - return ancestors; - } - - // =========================================================================== - // Chunk CRUD - // =========================================================================== + // ----- chunks ----- - /** - * Upsert a single chunk vector entity - */ async upsertChunk(chunk: InsertChunkVectorEntity): Promise { const expected = this.getVectorDimensions(); if (expected > 0 && chunk.vector.length !== expected) { @@ -339,9 +237,6 @@ export class KnowledgeBase { return this.chunkStorage.put(chunk); } - /** - * Upsert multiple chunk vector entities - */ async upsertChunksBulk(chunks: InsertChunkVectorEntity[]): Promise { const expected = this.getVectorDimensions(); if (expected > 0) { @@ -356,31 +251,17 @@ export class KnowledgeBase { return this.chunkStorage.putBulk(chunks); } - /** - * Delete all chunks for a specific document - */ async deleteChunksForDocument(doc_id: string): Promise { 
await this.chunkStorage.deleteSearch({ doc_id }); } - /** - * Get all chunks for a specific document - */ async getChunksForDocument(doc_id: string): Promise { const results = await this.chunkStorage.query({ doc_id }); return (results ?? []) as ChunkVectorEntity[]; } - // =========================================================================== - // Search - // =========================================================================== + // ----- vector retrieval ----- - /** - * Search for similar chunks using vector similarity. This is the canonical - * scope-aware entry point — subclasses (e.g. a scoped KB that isolates by - * tenant) override this to inject filter predicates before delegating to - * the underlying storage. - */ async similaritySearch( query: TypedArray, options?: VectorSearchOptions @@ -388,173 +269,127 @@ export class KnowledgeBase { return this.chunkStorage.similaritySearch(query, options); } - /** - * Hybrid search combining vector similarity and full-text search. Canonical - * scope-aware entry point; subclasses override for filter injection. - * - * @throws Error if the configured storage backend does not support hybrid search. - */ async hybridSearch( query: TypedArray, options: HybridSearchOptions ): Promise { if (typeof this.chunkStorage.hybridSearch !== "function") { throw new Error( - "Hybrid search is not supported by the configured chunk storage backend. " + - "Please use a vector storage implementation that provides `hybridSearch`." + "Hybrid search is not supported by the configured chunk storage backend." ); } return this.chunkStorage.hybridSearch(query, options); } - /** - * Check if the configured storage backend supports hybrid search. - */ supportsHybridSearch(): boolean { return typeof this.chunkStorage.hybridSearch === "function"; } - /** - * Hybrid (or similarity) retrieve a wide candidate set, then ask the - * strategy's reranker to score them and return the best `topK`. Requires - * an AI strategy. 
If the backend doesn't support hybrid search, this - * falls back to similarity for the first stage. - */ - async searchWithRerank( - query: string, - options?: ISearchOptions - ): Promise { - const strategy = this.requireStrategy("searchWithRerank"); - const topK = options?.topK ?? 5; - const firstStageTopK = options?.firstStageTopK ?? Math.max(topK * 5, 20); - const vector = await strategy.embedQuery(query); - const firstStage: ChunkSearchResult[] = this.supportsHybridSearch() - ? await this.hybridSearch(vector, { - textQuery: query, - topK: firstStageTopK, - filter: options?.filter as Partial | undefined, - scoreThreshold: options?.scoreThreshold, - vectorWeight: options?.vectorWeight, - }) - : await this.similaritySearch(vector, { - topK: firstStageTopK, - filter: options?.filter as Partial | undefined, - scoreThreshold: options?.scoreThreshold, - }); - if (firstStage.length === 0) { - return []; - } - return strategy.rerank(query, firstStage, topK); + // =========================================================================== + // Tree traversal helpers (unchanged) + // =========================================================================== + + async getNode(doc_id: string, nodeId: string): Promise { + const doc = await this.getDocument(doc_id); + if (!doc) return undefined; + + const traverse = (node: DocumentNode): DocumentNode | undefined => { + if (node.nodeId === nodeId) return node; + if ("children" in node && Array.isArray(node.children)) { + for (const child of node.children) { + const found = traverse(child); + if (found) return found; + } + } + return undefined; + }; + + return traverse(doc.root); } - /** - * Unified text-query search dispatcher. The KB knows its own embedding - * model and reranker (via the installed strategy), so callers don't need - * to thread models through every call site. 
- * - * - `kind: "similarity"` — embed + vector search - * - `kind: "hybrid"` — embed + vector + full-text - * - `kind: "rerank"` — first-stage hybrid/similarity + cross-encoder rerank - * - * Defaults to `"rerank"` when a reranker model is configured, otherwise - * `"hybrid"` when supported, otherwise `"similarity"`. - */ - async search( - query: string, - options?: ISearchWithKindOptions - ): Promise { - const kind: SearchKind = - options?.kind ?? - (this.supportsRerank() - ? "rerank" - : this.supportsHybridSearch() - ? "hybrid" - : "similarity"); - - if (kind === "rerank") { - return this.searchWithRerank(query, options); - } + async getAncestors(doc_id: string, nodeId: string): Promise { + const doc = await this.getDocument(doc_id); + if (!doc) return []; + + const path: string[] = []; + const findPath = (node: DocumentNode): boolean => { + path.push(node.nodeId); + if (node.nodeId === nodeId) return true; + if ("children" in node && Array.isArray(node.children)) { + for (const child of node.children) { + if (findPath(child)) return true; + } + } + path.pop(); + return false; + }; + + if (!findPath(doc.root)) return []; + + const ancestors: DocumentNode[] = []; + let currentNode: DocumentNode = doc.root; + ancestors.push(currentNode); - const strategy = this.requireStrategy("search"); - const vector = await strategy.embedQuery(query); - const topK = options?.topK ?? 
5; - if (kind === "hybrid") { - return this.hybridSearch(vector, { - textQuery: query, - topK, - filter: options?.filter as Partial | undefined, - scoreThreshold: options?.scoreThreshold, - vectorWeight: options?.vectorWeight, - }); + for (let i = 1; i < path.length; i++) { + const targetId = path[i]; + if ("children" in currentNode && Array.isArray(currentNode.children)) { + const found = currentNode.children.find((child: DocumentNode) => child.nodeId === targetId); + if (found) { + currentNode = found; + ancestors.push(currentNode); + } else { + break; + } + } else { + break; + } } - return this.similaritySearch(vector, { - topK, - filter: options?.filter as Partial | undefined, - scoreThreshold: options?.scoreThreshold, - }); + + return ancestors; } // =========================================================================== - // Accessors for raw storage + // Lifecycle / accessors // =========================================================================== - /** - * The underlying chunk/vector storage. Use when you need raw, unscoped - * access to low-level vector operations — e.g. bulk maintenance, metrics, - * or behavior that explicitly should bypass any subclass scoping. For - * normal search, prefer `kb.similaritySearch()` / `kb.hybridSearch()`, - * which subclasses can override to inject scope. - */ + /** Underlying chunk store; for maintenance and inspection. */ get vectorStorage(): ChunkVectorStorage { return this.chunkStorage; } - // =========================================================================== - // Lifecycle - // =========================================================================== - /** - * Prepare a document for re-indexing: deletes all chunks but keeps the document. - * @returns The document if found, undefined otherwise + * Prepare a document for re-indexing: deletes all chunks but keeps the + * document. Used by re-index flows; routine callers should use + * `kb.upsert(doc)` to fully replace. 
*/ async prepareReindex(doc_id: string): Promise { const doc = await this.getDocument(doc_id); - if (!doc) { - return undefined; - } + if (!doc) return undefined; await this.deleteChunksForDocument(doc_id); return doc; } /** - * Re-index every document in this KB using the installed strategy. The - * caller is responsible for ensuring the strategy is set. Returns the - * number of documents re-indexed. + * Re-index every document by re-running ingest. Requires a strategy. */ async reindex(): Promise { - this.requireStrategy("reindex"); + const strategy = this.requireStrategy("reindex"); const docIds = await this.listDocuments(); let count = 0; for (const doc_id of docIds) { const doc = await this.getDocument(doc_id); if (!doc) continue; - await this.upsertDocumentWithIndex(doc); + await strategy.ingest(this, doc); count++; } return count; } - /** - * Setup the underlying databases - */ async setupDatabase(): Promise { await this.tabularStorage.setupDatabase(); await this.chunkStorage.setupDatabase(); } - /** - * Destroy storage instances - */ destroy(): void { this.tabularStorage.destroy(); this.chunkStorage.destroy(); @@ -568,82 +403,43 @@ export class KnowledgeBase { this.destroy(); } - // =========================================================================== - // Accessors - // =========================================================================== - - /** - * Get a chunk by ID - */ async getChunk(chunk_id: string): Promise { return this.chunkStorage.get({ chunk_id }); } - /** - * Store a single chunk (alias for upsertChunk) - */ async put(chunk: InsertChunkVectorEntity): Promise { return this.chunkStorage.put(chunk); } - /** - * Store multiple chunks (alias for upsertChunksBulk) - */ async putBulk(chunks: InsertChunkVectorEntity[]): Promise { return this.chunkStorage.putBulk(chunks); } - /** - * Get all chunks - */ async getAllChunks(): Promise { return this.chunkStorage.getAll() as Promise; } - /** - * Get chunk count - */ async chunkCount(): Promise 
{ return this.chunkStorage.size(); } - /** - * Clear all chunks - */ async clearChunks(): Promise { return this.chunkStorage.deleteAll(); } - /** - * Get vector dimensions - */ getVectorDimensions(): number { return this.chunkStorage.getVectorDimensions(); } - // =========================================================================== - // Document chunk helpers - // =========================================================================== - - /** - * Get chunks from the document JSON (not from vector storage) - */ async getDocumentChunks(doc_id: string): Promise { const doc = await this.getDocument(doc_id); - if (!doc) { - return []; - } + if (!doc) return []; return doc.getChunks(); } - /** - * Find chunks in document JSON that contain a specific nodeId in their path - */ async findChunksByNodeId(doc_id: string, nodeId: string): Promise { const doc = await this.getDocument(doc_id); - if (!doc) { - return []; - } + if (!doc) return []; return doc.findChunksByNodeId(nodeId); } } diff --git a/packages/knowledge-base/src/knowledge-base/createKnowledgeBase.ts b/packages/knowledge-base/src/knowledge-base/createKnowledgeBase.ts index 23122b829..da4b935b2 100644 --- a/packages/knowledge-base/src/knowledge-base/createKnowledgeBase.ts +++ b/packages/knowledge-base/src/knowledge-base/createKnowledgeBase.ts @@ -10,7 +10,7 @@ import type { ChunkVectorStorage } from "../chunk/ChunkVectorStorageSchema"; import { ChunkVectorPrimaryKey, ChunkVectorStorageSchema } from "../chunk/ChunkVectorStorageSchema"; import type { DocumentTabularStorage } from "../document/DocumentStorageSchema"; import { DocumentStorageKey, DocumentStorageSchema } from "../document/DocumentStorageSchema"; -import type { IKbAiStrategy } from "./IKbAiStrategy"; +import type { ChunkStrategy, IKbAiStrategy, SearchMode } from "./IKbAiStrategy"; import { KnowledgeBase } from "./KnowledgeBase"; import { registerKnowledgeBase } from "./KnowledgeBaseRegistry"; @@ -24,6 +24,8 @@ export interface 
CreateKnowledgeBaseOptions { readonly docEmbeddingModel?: string; readonly queryEmbeddingModel?: string; readonly rerankerModel?: string; + readonly chunkStrategy?: ChunkStrategy; + readonly searchMode?: SearchMode; readonly aiStrategy?: IKbAiStrategy; } @@ -53,6 +55,8 @@ export async function createKnowledgeBase( docEmbeddingModel, queryEmbeddingModel, rerankerModel, + chunkStrategy, + searchMode, aiStrategy, } = options; @@ -85,7 +89,16 @@ export async function createKnowledgeBase( name, tabularStorage as unknown as DocumentTabularStorage, vectorStorage as unknown as ChunkVectorStorage, - { title, description, docEmbeddingModel, queryEmbeddingModel, rerankerModel, aiStrategy } + { + title, + description, + docEmbeddingModel, + queryEmbeddingModel, + rerankerModel, + chunkStrategy, + searchMode, + aiStrategy, + } ); if (shouldRegister) { diff --git a/packages/test/src/test/rag/DocumentRepository.test.ts b/packages/test/src/test/rag/DocumentRepository.test.ts index 80c8c34a8..7a24d582a 100644 --- a/packages/test/src/test/rag/DocumentRepository.test.ts +++ b/packages/test/src/test/rag/DocumentRepository.test.ts @@ -443,94 +443,84 @@ Paragraph.`; await expect(bareKb.search("hello")).rejects.toThrow(/AI strategy/); }); - it("should invoke embedQuery and similaritySearch for kind: 'similarity'", async () => { - const received: Array<{ text: string }> = []; + it("should delegate kb.search to the installed strategy", async () => { + const calls: Array<{ query: string; topK: number | undefined }> = []; const kb = await createKnowledgeBase({ - name: `test-kb-sim-${uuid4()}`, + name: `test-kb-search-${uuid4()}`, vectorDimensions: 3, register: false, }); kb.setAiStrategy({ - chunkAndEmbedDocument: async () => ({ chunks: [], vectors: [] }), - embedQuery: async (text) => { - received.push({ text }); - return new Float32Array([0.1, 0.2, 0.3]); + ingest: async (_kb, doc) => doc, + delete: async () => {}, + search: async (_kb, query, options) => { + calls.push({ query, topK: 
options?.topK }); + return []; }, - rerank: async (_q, candidates, topK) => candidates.slice(0, topK), }); - const results = await kb.search("hello", { kind: "similarity", topK: 4 }); + await kb.search("hello", { topK: 4 }); - expect(received).toEqual([{ text: "hello" }]); - expect(results).toEqual([]); + expect(calls).toEqual([{ query: "hello", topK: 4 }]); }); - it("should run rerank by default when rerankerModel is configured", async () => { - const reranks: Array<{ query: string; n: number; topK: number }> = []; + it("should delegate kb.upsert / kb.delete to the strategy", async () => { + const ingested: string[] = []; + const deleted: string[] = []; const kb = await createKnowledgeBase({ - name: `test-kb-rerank-${uuid4()}`, + name: `test-kb-ingest-${uuid4()}`, vectorDimensions: 3, register: false, - docEmbeddingModel: "test:doc-embed", - rerankerModel: "test:rerank", }); kb.setAiStrategy({ - chunkAndEmbedDocument: async () => ({ chunks: [], vectors: [] }), - embedQuery: async () => new Float32Array([1, 0, 0]), - rerank: async (query, candidates, topK) => { - reranks.push({ query, n: candidates.length, topK }); - return candidates.slice(0, topK); + ingest: async (target, doc) => { + await target.upsertDocument(doc); + ingested.push(doc.doc_id ?? ""); + return doc; }, + delete: async (target, doc_id) => { + deleted.push(doc_id); + await target.deleteDocument(doc_id); + }, + search: async () => [], }); - // Seed a chunk so the first-stage retrieval returns something the - // reranker can score against. 
- await kb.upsertChunk({ - chunk_id: "c1", - doc_id: "d1", - vector: new Float32Array([1, 0, 0]), - metadata: { chunk_id: "c1", doc_id: "d1", text: "hi", nodePath: [], depth: 0 } as any, - }); - - await kb.search("q", { topK: 2 }); + const root = await StructuralParser.parseMarkdown(uuid4(), "# T\n\nx.", "T"); + const doc = new Document(root, { title: "T" }); + doc.setDocId("d1"); + await kb.upsert(doc); + expect(ingested).toEqual(["d1"]); - expect(reranks).toHaveLength(1); - expect(reranks[0].query).toBe("q"); - expect(reranks[0].topK).toBe(2); + await kb.delete("d1"); + expect(deleted).toEqual(["d1"]); + expect(await kb.getDocument("d1")).toBeUndefined(); }); - it("should chunk+embed via the strategy in upsertDocumentWithIndex", async () => { + it("should expose model + chunk/search-mode config to the strategy", async () => { + let observed: { docModel?: string; mode?: string; chunk?: string } = {}; const kb = await createKnowledgeBase({ - name: `test-kb-ingest-${uuid4()}`, + name: `test-kb-config-${uuid4()}`, vectorDimensions: 3, register: false, + docEmbeddingModel: "test:doc", + rerankerModel: "test:rerank", + chunkStrategy: "flat", + searchMode: "rerank", }); kb.setAiStrategy({ - chunkAndEmbedDocument: async (doc) => { - const text = doc.metadata.title ?? ""; - return { - chunks: [ - { - chunk_id: "c1", - doc_id: doc.doc_id ?? 
"", - text, - nodePath: [], - depth: 0, - } as any, - ], - vectors: [new Float32Array([1, 0, 0])], + ingest: async (_k, d) => d, + delete: async () => {}, + search: async (target) => { + observed = { + docModel: target.docEmbeddingModel, + mode: target.searchMode, + chunk: target.chunkStrategy, }; + return []; }, - embedQuery: async () => new Float32Array([0, 0, 0]), - rerank: async (_q, c, k) => c.slice(0, k), }); - - const root = await StructuralParser.parseMarkdown(uuid4(), "# Test\n\nx.", "Test"); - const doc = new Document(root, { title: "Test" }); - const stored = await kb.upsertDocumentWithIndex(doc); - - const chunks = await kb.getChunksForDocument(stored.doc_id!); - expect(chunks).toHaveLength(1); + await kb.search("q"); + expect(observed).toEqual({ docModel: "test:doc", mode: "rerank", chunk: "flat" }); }); }); }); From 3a3e54daca769c0f08f8ba32b684351bf6774e74 Mon Sep 17 00:00:00 2001 From: Claude Date: Tue, 12 May 2026 00:46:24 +0000 Subject: [PATCH 03/11] Address Copilot review on #484: ingest dedup, type fixes, doc cleanup MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - createStandardKbStrategy.ingest: drop the duplicate upsertDocument when doc.doc_id is initially missing. upsertDocument already returns the stored doc with the assigned id, so a second write was both wasted I/O and a hidden trap for ScopedKnowledgeBase overrides that do scope-injection work on every write. - createStandardKbStrategy: rewrite the firstStageMultiplier docstring to match the implementation (no 20-floor — just `topK * multiplier` with a `topK` floor so we never return fewer than the caller asked for). - IKbStrategyTarget.upsertChunksBulk: tighten the return type from Promise to Promise so strategies don't have to cast. - ONNXModelSamples: add "TextRerankerTask" to bge-reranker-base's `tasks` array. 
The new standard strategy invokes TextRerankerTask (not the legacy heuristic RerankerTask), and AiTask.narrowInput resolves models by task tag — without this, rerank mode silently fails model resolution. - RerankerTask + TextRerankerTask: rewrite the JSDoc comments that referenced the now-removed `createAiKbStrategy` and `kb.searchWithRerank()`. Both now point readers at `createStandardKbStrategy` and the `searchMode: "rerank"` integration point. All 191 rag tests pass. https://claude.ai/code/session_01Ya54WFZhpDFzAqRh1qG8Ex --- packages/ai/src/kb/createStandardKbStrategy.ts | 14 ++++++++------ packages/ai/src/task/RerankerTask.ts | 12 +++++++----- packages/ai/src/task/TextRerankerTask.ts | 6 +++--- .../src/knowledge-base/IKbAiStrategy.ts | 8 ++++++-- packages/test/src/samples/ONNXModelSamples.ts | 2 +- 5 files changed, 25 insertions(+), 17 deletions(-) diff --git a/packages/ai/src/kb/createStandardKbStrategy.ts b/packages/ai/src/kb/createStandardKbStrategy.ts index 60db107ad..79046146a 100644 --- a/packages/ai/src/kb/createStandardKbStrategy.ts +++ b/packages/ai/src/kb/createStandardKbStrategy.ts @@ -39,8 +39,11 @@ export interface CreateStandardKbStrategyOptions { /** Override KB's searchMode at strategy-build time. */ readonly searchMode?: SearchMode; /** - * For `searchMode === "rerank"`: how many candidates to retrieve before - * reranking. Defaults to `max(topK * 5, 20)`. + * Multiplier applied to `topK` to size the first-stage candidate pool + * when `searchMode === "rerank"`. The reranker then narrows the pool + * back down to `topK`. Defaults to `5`, i.e. first stage fetches + * `topK * 5` candidates (with a `topK` floor so it never returns fewer + * than `topK`). */ readonly firstStageMultiplier?: number; } @@ -105,10 +108,9 @@ export function createStandardKbStrategy( return { async ingest(kb, doc): Promise { - if (!doc.doc_id) { - // Let storage auto-generate by writing the document first. 
- await kb.upsertDocument(doc); - } + // Single write — `upsertDocument` returns the stored doc with the + // auto-generated id assigned, so we don't need a second round-trip + // when `doc.doc_id` is initially missing. const stored = await kb.upsertDocument(doc); const docId = stored.doc_id!; // Replace existing chunks for this doc so re-ingest is idempotent. diff --git a/packages/ai/src/task/RerankerTask.ts b/packages/ai/src/task/RerankerTask.ts index 0585cd913..d78007770 100644 --- a/packages/ai/src/task/RerankerTask.ts +++ b/packages/ai/src/task/RerankerTask.ts @@ -110,11 +110,13 @@ interface RankedItem { } /** - * Heuristic reranking task. Cross-encoder reranking (via model) is handled - * by `createAiKbStrategy` directly — it dispatches to provider-registered - * RerankerTask run-fns through `AiProviderRegistry`. This task remains the - * model-free fallback for workflows that don't want to require a reranker - * model. + * Heuristic, model-free reranking. For real cross-encoder reranking use + * {@link TextRerankerTask}, which dispatches to a provider-registered run-fn + * (e.g. HuggingFace Transformers) for a configured reranker model. + * `createStandardKbStrategy` invokes that path automatically when the KB has + * a `rerankerModel` set under `searchMode: "rerank"`; this task is the + * fallback when no reranker model is configured and a workflow still wants + * some rerank-style scoring. */ export class RerankerTask extends Task { public static override type = "RerankerTask"; diff --git a/packages/ai/src/task/TextRerankerTask.ts b/packages/ai/src/task/TextRerankerTask.ts index 91eadfeaa..4ad97022f 100644 --- a/packages/ai/src/task/TextRerankerTask.ts +++ b/packages/ai/src/task/TextRerankerTask.ts @@ -67,9 +67,9 @@ export type TextRerankerTaskConfig = TaskConfig; /** * AiTask for cross-encoder reranking. Providers register a run-fn for this * task type (e.g. 
HuggingFace Transformers using a `text-classification` - * cross-encoder pipeline on `[query, doc]` pairs). `createAiKbStrategy` - * uses this task to power `kb.searchWithRerank()` when a reranker model - * is configured on the KB. + * cross-encoder pipeline on `[query, doc]` pairs). `createStandardKbStrategy` + * invokes this task as the rerank stage of `kb.search()` when the KB is + * configured with `searchMode: "rerank"` and has a `rerankerModel` set. */ export class TextRerankerTask extends AiTask< TextRerankerTaskInput, diff --git a/packages/knowledge-base/src/knowledge-base/IKbAiStrategy.ts b/packages/knowledge-base/src/knowledge-base/IKbAiStrategy.ts index e79fa5b0b..272084cbe 100644 --- a/packages/knowledge-base/src/knowledge-base/IKbAiStrategy.ts +++ b/packages/knowledge-base/src/knowledge-base/IKbAiStrategy.ts @@ -6,7 +6,11 @@ import type { TypedArray } from "@workglow/util/schema"; import type { ChunkRecord } from "../chunk/ChunkSchema"; -import type { ChunkSearchResult, InsertChunkVectorEntity } from "../chunk/ChunkVectorStorageSchema"; +import type { + ChunkSearchResult, + ChunkVectorEntity, + InsertChunkVectorEntity, +} from "../chunk/ChunkVectorStorageSchema"; import type { Document } from "../document/Document"; import type { ISearchOptions } from "./KnowledgeBase"; @@ -76,7 +80,7 @@ export interface IKbStrategyTarget { /** Low-level: drop every chunk row for the given doc_id. */ deleteChunksForDocument(doc_id: string): Promise; /** Low-level: bulk-write chunk vectors. */ - upsertChunksBulk(chunks: InsertChunkVectorEntity[]): Promise; + upsertChunksBulk(chunks: InsertChunkVectorEntity[]): Promise; /** Low-level: pure-vector retrieval. 
*/ similaritySearch( query: TypedArray, diff --git a/packages/test/src/samples/ONNXModelSamples.ts b/packages/test/src/samples/ONNXModelSamples.ts index 3116622cb..2156e6745 100644 --- a/packages/test/src/samples/ONNXModelSamples.ts +++ b/packages/test/src/samples/ONNXModelSamples.ts @@ -345,7 +345,7 @@ export async function registerHuggingfaceLocalModels(): Promise { model_id: "onnx:Xenova/bge-reranker-base:q8", title: "BGE Reranker Base", description: "Cross-encoder reranker model for relevance scoring", - tasks: ["TextClassificationTask", "RerankerTask"], + tasks: ["TextClassificationTask", "RerankerTask", "TextRerankerTask"], provider: HF_TRANSFORMERS_ONNX, provider_config: { pipeline: "text-classification", From e2cd2084206d632307a170493aa2266d336c7d65 Mon Sep 17 00:00:00 2001 From: Claude Date: Tue, 12 May 2026 15:18:50 +0000 Subject: [PATCH 04/11] Strategy refactor follow-ups: ingest order, snapshot, chunkText, scoreType MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Address libs PR #484 review plan in one sweep, ordered by blast radius. Data integrity — ingest now delete-then-upsert-then-insert: - createStandardKbStrategy.ingest deletes old chunks BEFORE rewriting the document when doc.doc_id is set, so a partial failure (e.g. upsertChunksBulk rejecting) leaves "doc row preserved, chunks removed" rather than "new doc row pointing at stale old chunks." When doc.doc_id is unset, upserts first to mint the id, then runs deleteChunksForDocument as a defensive no-op so the post-condition ("after ingest, doc owns exactly the new chunks") holds even on storage backends that recycle ids. Concurrency — strategy snapshot per public op: - KnowledgeBase.upsert/delete/search/reindex now make the snapshot explicit (const strategy = this.requireStrategy(...)) and document the semantics on setAiStrategy: replacing the strategy does NOT affect ops already in flight; each public op resolves its strategy at entry. 
reindex captures once and uses the same strategy for the whole loop. Search correctness — surface canonical text + drop JSON.stringify fallback: - New chunkText(c) helper on @workglow/knowledge-base reads metadata.text and throws (with the chunk_id) when missing. Replaces the inline `meta?.text … JSON.stringify(meta ?? {})` map in createStandardKbStrategy and KbSearchTask. Documents metadata.text as a load-bearing contract on InsertChunkVectorEntity. Score semantics — tag rerank with scoreType: "rerank": - ScoreType union extended with "rerank". Both cross-encoder and heuristic-fallback rerank paths now set scoreType: "rerank" as const, overriding the first-stage cosine/RRF tag. Docstrings on createStandardKbStrategy, IKbAiStrategy.search, and the ScoreType union itself flag that cross-encoder logits are NOT comparable to cosine/BM25/RRF scores; callers must inspect scoreType before applying a score threshold. scoreThreshold is intentionally not honored in the rerank branch (commented in-code). ChunkRetrievalTask output schema enum extended to keep parity with the canonical union; the task itself only emits cosine/rrf. Tests: - mid-op setAiStrategy(B) during search/upsert/reindex: assert the in-flight op completes via the original strategy (DocumentRepository "strategy contract" block, 3 tests). - chunkText helper: present → returns text; missing → throws with chunk_id (DocumentRepository, 2 tests). - KbSearchTask: result with metadata lacking text throws with chunk_id rather than emitting JSON.stringify (KbSearchTask.test, 1 test). - New KnowledgeBaseStandardStrategy.test.ts exercises the actual createStandardKbStrategy: ingest-order partial-failure leaves no orphan chunks; rerank heuristic-fallback tags results "rerank". Setup registers a stub TextEmbeddingTask runFn + model record so the strategy's embedTexts call resolves without real providers. 
All new tests + KbSearchTask + DocumentRepository tests pass (191 rag tests pass, the 7 remaining failures are pre-existing HuggingFace-503 flakes in EndToEnd / RagWorkflow integration tests). https://claude.ai/code/session_01Ya54WFZhpDFzAqRh1qG8Ex --- .../ai/src/kb/createStandardKbStrategy.ts | 57 ++++-- packages/ai/src/task/ChunkRetrievalTask.ts | 16 +- packages/ai/src/task/KbSearchTask.ts | 12 +- .../src/chunk/ChunkVectorStorageSchema.ts | 45 ++++- .../src/knowledge-base/IKbAiStrategy.ts | 9 + .../src/knowledge-base/KnowledgeBase.ts | 52 ++++- .../src/test/rag/DocumentRepository.test.ts | 161 +++++++++++++++ .../rag/KnowledgeBaseStandardStrategy.test.ts | 190 ++++++++++++++++++ .../test/src/test/task/KbSearchTask.test.ts | 19 ++ 9 files changed, 532 insertions(+), 29 deletions(-) create mode 100644 packages/test/src/test/rag/KnowledgeBaseStandardStrategy.test.ts diff --git a/packages/ai/src/kb/createStandardKbStrategy.ts b/packages/ai/src/kb/createStandardKbStrategy.ts index 79046146a..2a8b7d670 100644 --- a/packages/ai/src/kb/createStandardKbStrategy.ts +++ b/packages/ai/src/kb/createStandardKbStrategy.ts @@ -13,7 +13,7 @@ import type { ISearchOptions, SearchMode, } from "@workglow/knowledge-base"; -import { toInsertChunkEntities } from "@workglow/knowledge-base"; +import { chunkText, toInsertChunkEntities } from "@workglow/knowledge-base"; import type { TypedArray } from "@workglow/util/schema"; import { HierarchicalChunkerTask } from "../task/HierarchicalChunkerTask"; @@ -56,6 +56,14 @@ export interface CreateStandardKbStrategyOptions { * (`chunkStrategy` / `searchMode`) on every call, so updates to the KB * record take effect immediately on the next op. * + * Score semantics: results carry `scoreType` matching the retrieval + * mode — `"cosine"` for similarity, `"rrf"` for hybrid, `"rerank"` for + * both reranker-model and heuristic fallback paths. 
**Cross-encoder + * rerank scores are raw logits**, not probabilities or similarities, and + * they are NOT comparable to cosine / BM25 / RRF scores. Always check + * `scoreType` before applying a score threshold; the strategy itself + * ignores `ISearchOptions.scoreThreshold` in the rerank branch. + * * For custom RAG flows (per-tenant scoping, alternative chunkers, etc.) * write your own `IKbAiStrategy` — this factory is the "good defaults" * path, not the only path. @@ -108,13 +116,28 @@ export function createStandardKbStrategy( return { async ingest(kb, doc): Promise { - // Single write — `upsertDocument` returns the stored doc with the - // auto-generated id assigned, so we don't need a second round-trip - // when `doc.doc_id` is initially missing. + // Order matters: delete old chunks BEFORE rewriting the document. + // If upsertDocument or any later step fails partway through, the + // worst the KB can be left in is "doc row preserved, chunks + // removed" rather than "new doc row pointing at old stale chunks" + // — chunks always reflect the in-flight ingest, never a previous + // version. The text-index removal piggy-backs on + // deleteChunksForDocument, so RRF rankings can't end up surfacing + // chunks that no longer exist either. + const initialDocId = doc.doc_id; + if (initialDocId) { + await kb.deleteChunksForDocument(initialDocId); + } const stored = await kb.upsertDocument(doc); const docId = stored.doc_id!; - // Replace existing chunks for this doc so re-ingest is idempotent. - await kb.deleteChunksForDocument(docId); + if (!initialDocId) { + // Fresh-id case: chunks under this new id can't pre-exist in a + // well-behaved storage backend, but call delete unconditionally + // so the post-condition ("after ingest returns, the doc owns + // exactly the newly-embedded chunks") holds even if a backend + // recycles ids or a stale row survived a prior aborted run. 
+ await kb.deleteChunksForDocument(docId); + } const chunker = new HierarchicalChunkerTask(); const chunkResult = await chunker.run({ @@ -191,12 +214,18 @@ export function createStandardKbStrategy( }); if (firstStage.length === 0) return []; - const docs = firstStage.map((c) => { - const meta = c.metadata as Record | undefined; - const text = meta?.text; - return typeof text === "string" ? text : JSON.stringify(meta ?? {}); - }); + // `chunkText` enforces the metadata.text contract — chunks missing + // text throw with the offending chunk_id rather than silently + // feeding `JSON.stringify(metadata)` to the reranker, which would + // produce meaningless relevance scores. + const docs = firstStage.map(chunkText); + // Note: `scoreThreshold` is intentionally NOT honored in the rerank + // branch. The first stage already filtered by score; cross-encoder + // logits live on a completely different scale (often negative) and + // a cosine-style threshold would either drop everything or nothing. + // Callers wanting a rerank-relative cutoff should clip on the + // returned `score` themselves after inspecting `scoreType`. if (kb.rerankerModel) { const result = await new TextRerankerTask().run({ query, @@ -212,12 +241,15 @@ export function createStandardKbStrategy( return { ...candidate, score: typeof newScore === "number" ? newScore : candidate.score, + scoreType: "rerank" as const, }; }); } // No reranker model configured but mode is "rerank" — fall back to a - // local heuristic so callers still get a usable ordering. + // local heuristic so callers still get a usable ordering. We still + // tag the result with scoreType: "rerank" because callers asked for + // rerank semantics; the score scale isn't comparable to cosine/RRF. const heuristic = await new RerankerTask().run({ query, chunks: docs, @@ -233,6 +265,7 @@ export function createStandardKbStrategy( return { ...candidate, score: newScores[rank] ?? 
candidate.score, + scoreType: "rerank" as const, }; }); }, diff --git a/packages/ai/src/task/ChunkRetrievalTask.ts b/packages/ai/src/task/ChunkRetrievalTask.ts index b458b3487..fbe378151 100644 --- a/packages/ai/src/task/ChunkRetrievalTask.ts +++ b/packages/ai/src/task/ChunkRetrievalTask.ts @@ -141,12 +141,14 @@ const outputSchema = { }, scoreType: { type: "string", - enum: ["cosine", "bm25", "rrf"], + enum: ["cosine", "bm25", "rrf", "rerank"], title: "Score Type", description: "Discriminator naming the scorer used for `scores`: 'cosine' for similarity search " + "and for hybrid fallback when the text query is empty/whitespace; 'rrf' for hybrid " + - "fusion. ('bm25' is reserved for direct text search and is not produced by this task.)", + "fusion. ('bm25' is reserved for direct text search and is not produced by this task. " + + "'rerank' is produced by the standard KB strategy after cross-encoder reranking and " + + "is not produced by this task either.)", }, vectors: { type: "array", @@ -287,9 +289,15 @@ export class ChunkRetrievalTask extends Task< // want to surface that to callers even when the result set is empty. const hybridFallsBackToCosine = method === "hybrid" && (queryText === undefined || queryText.trim().length === 0); - const defaultScoreType: "cosine" | "bm25" | "rrf" = + // `ChunkRetrievalTask` itself only produces cosine or RRF scores; it + // can't emit "bm25" (no text-only path) or "rerank" (that comes from + // a downstream reranker, not this task). The output-schema enum + // includes them so the field is consistent with the canonical + // `ScoreType` union, but they won't appear in `defaultScoreType`. + const defaultScoreType: "cosine" | "rrf" = method === "hybrid" && !hybridFallsBackToCosine ? "rrf" : "cosine"; - const scoreType = results.length > 0 ? (results[0].scoreType ?? defaultScoreType) : defaultScoreType; + const scoreType: "cosine" | "bm25" | "rrf" | "rerank" = + results.length > 0 ? (results[0].scoreType ?? 
defaultScoreType) : defaultScoreType; const output: ChunkRetrievalTaskOutput = { chunks, diff --git a/packages/ai/src/task/KbSearchTask.ts b/packages/ai/src/task/KbSearchTask.ts index 593ca8521..4e272d016 100644 --- a/packages/ai/src/task/KbSearchTask.ts +++ b/packages/ai/src/task/KbSearchTask.ts @@ -5,7 +5,7 @@ */ import type { ChunkSearchResult, KnowledgeBase } from "@workglow/knowledge-base"; -import { TypeKnowledgeBase } from "@workglow/knowledge-base"; +import { chunkText, TypeKnowledgeBase } from "@workglow/knowledge-base"; import { CreateWorkflow, IExecuteContext, Task, Workflow } from "@workglow/task-graph"; import type { TaskConfig } from "@workglow/task-graph"; import type { DataPortSchema, FromSchema } from "@workglow/util/schema"; @@ -121,11 +121,11 @@ export class KbSearchTask extends Task { - const meta = r.metadata as Record | undefined; - const text = meta?.text; - return typeof text === "string" ? text : JSON.stringify(meta ?? {}); - }), + // `chunkText` enforces the metadata.text contract — any chunk + // missing text throws with its chunk_id rather than silently + // emitting `JSON.stringify(metadata)` (which would surface as + // garbage to downstream consumers). + chunks: results.map(chunkText), chunk_ids: results.map((r) => r.chunk_id), scores: results.map((r) => r.score), count: results.length, diff --git a/packages/knowledge-base/src/chunk/ChunkVectorStorageSchema.ts b/packages/knowledge-base/src/chunk/ChunkVectorStorageSchema.ts index ba20e5989..ecbfd5683 100644 --- a/packages/knowledge-base/src/chunk/ChunkVectorStorageSchema.ts +++ b/packages/knowledge-base/src/chunk/ChunkVectorStorageSchema.ts @@ -41,6 +41,18 @@ export interface ChunkVectorEntity< /** * Type for inserting chunk vectors - chunk_id is optional (auto-generated) + * + * @remarks + * `metadata.text` is a load-bearing field — it carries the chunk's + * canonical text (the same string that was embedded to produce + * {@link ChunkVectorEntity.vector}). 
Downstream callers — notably + * cross-encoder rerankers and any UI that displays the chunk — read + * `metadata.text` directly via {@link chunkText}. Strategies that build + * `InsertChunkVectorEntity` from custom chunkers MUST populate + * `metadata.text` or rerank/display paths will throw. The standard + * strategy populates it via `toInsertChunkEntities` from + * `HierarchicalChunkerTask` output, which always emits `text` on each + * chunk. */ export type InsertChunkVectorEntity< Metadata extends ChunkRecord = ChunkRecord, @@ -63,7 +75,7 @@ export type ChunkVectorStorage = IVectorStorage< /** * Discriminator for the scoring function used to produce a * {@link ChunkSearchResult.score}. Callers (typically UI) use this to render - * the score appropriately, since the three scorers live on different scales: + * the score appropriately, since the scorers live on different scales: * * - `"cosine"`: cosine similarity in `[-1, 1]`, typically `[0, 1]` for text * embeddings. Absolute — higher means more similar. @@ -73,8 +85,14 @@ export type ChunkVectorStorage = IVectorStorage< * `2 / (rrfK + 1)` (~`0.033` with the default `rrfK=60`). Rank-based, not * absolute — the magnitude is not a similarity, only an ordering signal. * Not comparable across queries. + * - `"rerank"`: cross-encoder reranker output (e.g. bge-reranker, Cohere + * rerank). Raw logit, not a probability and not comparable to cosine / + * BM25 / RRF scores. Callers MUST inspect `scoreType` before applying + * any score-threshold gate; cross-encoder scores often span wide negative + * ranges that look invalid under a cosine-style threshold but are + * perfectly normal here. */ -export type ScoreType = "cosine" | "bm25" | "rrf"; +export type ScoreType = "cosine" | "bm25" | "rrf" | "rerank"; /** * Search result with score @@ -83,3 +101,26 @@ export type ChunkSearchResult = ChunkVectorEntity & { score: number; scoreType?: ScoreType; }; + +/** + * Extract the canonical chunk text from a search result. 
+ * + * Reads `metadata.text` directly. Throws (with the offending chunk_id) if + * the field is missing — chunks without text can't be reranked, displayed, + * or fed into downstream NLP tasks. Use this helper everywhere a chunk's + * text is needed instead of inlining `metadata.text` access; it keeps the + * contract — "every chunk in the KB owns its source text in + * `metadata.text`" — enforced at exactly one place. See + * {@link InsertChunkVectorEntity} for the writer-side requirement. + */ +export function chunkText(c: { chunk_id: string; metadata?: ChunkRecord }): string { + const text = c.metadata?.text; + if (typeof text !== "string") { + throw new Error( + `chunkText: chunk ${c.chunk_id} is missing metadata.text. ` + + `Every chunk in a KnowledgeBase must carry its source text on metadata.text — ` + + `update the chunker / strategy that produced this chunk to populate it.` + ); + } + return text; +} diff --git a/packages/knowledge-base/src/knowledge-base/IKbAiStrategy.ts b/packages/knowledge-base/src/knowledge-base/IKbAiStrategy.ts index 272084cbe..685c4d53e 100644 --- a/packages/knowledge-base/src/knowledge-base/IKbAiStrategy.ts +++ b/packages/knowledge-base/src/knowledge-base/IKbAiStrategy.ts @@ -50,6 +50,15 @@ export interface IKbAiStrategy { * Run a text query and return matching chunks. The strategy picks the * retrieval flavor (similarity, hybrid, reranker, plain text) — callers * don't choose per-call. + * + * The returned `score` is only comparable within a single result list, + * and only when results share a `scoreType`. The standard strategy + * tags rerank results with `scoreType: "rerank"` — cross-encoder + * logits are NOT comparable to cosine/BM25/RRF scores, so callers + * MUST inspect `scoreType` before applying any score threshold. In + * particular, `ISearchOptions.scoreThreshold` is not honored under + * `searchMode === "rerank"` because there's no meaningful default + * threshold across rerankers. 
*/ search( kb: IKbStrategyTarget, diff --git a/packages/knowledge-base/src/knowledge-base/KnowledgeBase.ts b/packages/knowledge-base/src/knowledge-base/KnowledgeBase.ts index 871fd0b6c..6ce1ead58 100644 --- a/packages/knowledge-base/src/knowledge-base/KnowledgeBase.ts +++ b/packages/knowledge-base/src/knowledge-base/KnowledgeBase.ts @@ -221,6 +221,19 @@ export class KnowledgeBase { // Strategy installation // =========================================================================== + /** + * Install (or replace) the AI strategy used by `upsert`/`delete`/`search`. + * + * Replacing the strategy does NOT affect operations already in flight. + * Each public op (`upsert`/`delete`/`search`/`reindex`) resolves its + * strategy at entry via {@link requireStrategy} and holds that reference + * for its lifetime; a concurrent `setAiStrategy(B)` mid-`upsert(A)` + * leaves the in-progress upsert running on strategy A and routes the + * next public op to strategy B. + * + * Pass `undefined` to detach the strategy — subsequent public-op calls + * throw with a setup hint instead of running. + */ setAiStrategy(strategy: IKbAiStrategy | undefined): void { this.aiStrategy = strategy; } @@ -229,6 +242,14 @@ export class KnowledgeBase { return this.aiStrategy; } + /** + * Snapshot the currently installed strategy or throw if none. + * + * Returns the field value as-is — callers should hold the returned + * reference for the duration of one public op so a concurrent + * `setAiStrategy(...)` doesn't swap the strategy mid-operation. See + * {@link setAiStrategy} for the full concurrency contract. + */ private requireStrategy(forOp: string): IKbAiStrategy { if (!this.aiStrategy) { throw new Error( @@ -315,23 +336,37 @@ export class KnowledgeBase { /** * Ingest a document end-to-end: chunk + embed + write. Delegates to the * installed strategy. 
+ * + * The strategy is snapshotted at entry: a concurrent `setAiStrategy(...)` + * during the upsert won't redirect the in-flight call to the new + * strategy. See {@link setAiStrategy}. */ async upsert(doc: Document): Promise { - return this.requireStrategy("upsert").ingest(this, doc); + const strategy = this.requireStrategy("upsert"); + return strategy.ingest(this, doc); } - /** Remove a document and its chunks. Delegates to the installed strategy. */ + /** + * Remove a document and its chunks. Delegates to the installed strategy + * (snapshotted at entry — see {@link setAiStrategy}). + */ async delete(doc_id: string): Promise { - return this.requireStrategy("delete").delete(this, doc_id); + const strategy = this.requireStrategy("delete"); + return strategy.delete(this, doc_id); } /** * Run a text query. Retrieval flavor (text / similarity / hybrid / * rerank) is decided by the installed strategy — typically derived from * this KB's `searchMode` field. + * + * The strategy is snapshotted at entry: a concurrent `setAiStrategy(...)` + * during the search won't redirect the in-flight call. See + * {@link setAiStrategy}. */ async search(query: string, options?: ISearchOptions): Promise { - return this.requireStrategy("search").search(this, query, options); + const strategy = this.requireStrategy("search"); + return strategy.search(this, query, options); } // =========================================================================== @@ -651,7 +686,14 @@ export class KnowledgeBase { return doc; } - /** Re-index every document by re-running ingest. Requires a strategy. */ + /** + * Re-index every document by re-running ingest. Requires a strategy. + * + * The strategy is captured once at entry — every doc in the run uses + * the same strategy, even if `setAiStrategy(...)` is called concurrently + * partway through the loop. The next `reindex()` call would pick up + * the new strategy. See {@link setAiStrategy}. 
+ */ async reindex(): Promise { const strategy = this.requireStrategy("reindex"); const docIds = await this.listDocuments(); diff --git a/packages/test/src/test/rag/DocumentRepository.test.ts b/packages/test/src/test/rag/DocumentRepository.test.ts index 7a24d582a..a71d2f7ef 100644 --- a/packages/test/src/test/rag/DocumentRepository.test.ts +++ b/packages/test/src/test/rag/DocumentRepository.test.ts @@ -748,4 +748,165 @@ Paragraph.`; ); }); }); + + describe("strategy contract", () => { + it("captures the strategy at op entry — mid-search setAiStrategy(B) doesn't redirect an in-flight search", async () => { + const kb = await createKnowledgeBase({ + name: `kb-strategy-snapshot-search-${uuid4()}`, + vectorDimensions: 3, + register: false, + }); + + let releaseA: () => void = () => {}; + const aPending = new Promise((resolve) => { + releaseA = resolve; + }); + const aCalls: string[] = []; + const bCalls: string[] = []; + + const strategyA = { + ingest: async (_kb: KnowledgeBase, d: Document) => d, + delete: async () => {}, + search: async () => { + aCalls.push("search"); + await aPending; + return []; + }, + }; + const strategyB = { + ingest: async (_kb: KnowledgeBase, d: Document) => d, + delete: async () => {}, + search: async () => { + bCalls.push("search"); + return []; + }, + }; + + kb.setAiStrategy(strategyA); + const inFlight = kb.search("q1"); + // Swap mid-flight; the in-flight call must still resolve via A. + kb.setAiStrategy(strategyB); + releaseA(); + await inFlight; + expect(aCalls).toEqual(["search"]); + expect(bCalls).toEqual([]); + + // Subsequent call routes to B as expected. 
+ await kb.search("q2"); + expect(aCalls).toEqual(["search"]); + expect(bCalls).toEqual(["search"]); + }); + + it("captures the strategy at op entry — mid-upsert setAiStrategy(B) doesn't redirect an in-flight upsert", async () => { + const kb = await createKnowledgeBase({ + name: `kb-strategy-snapshot-upsert-${uuid4()}`, + vectorDimensions: 3, + register: false, + }); + + let releaseA: () => void = () => {}; + const aPending = new Promise((resolve) => { + releaseA = resolve; + }); + const aCalls: string[] = []; + const bCalls: string[] = []; + + const strategyA = { + ingest: async (_target: KnowledgeBase, d: Document) => { + aCalls.push("ingest"); + await aPending; + return d; + }, + delete: async () => {}, + search: async () => [], + }; + const strategyB = { + ingest: async (_target: KnowledgeBase, d: Document) => { + bCalls.push("ingest"); + return d; + }, + delete: async () => {}, + search: async () => [], + }; + + kb.setAiStrategy(strategyA); + const root = await StructuralParser.parseMarkdown(uuid4(), "# T\n\nx.", "T"); + const doc = new Document(root, { title: "T" }); + doc.setDocId("doc-snapshot-upsert"); + const inFlight = kb.upsert(doc); + kb.setAiStrategy(strategyB); + releaseA(); + await inFlight; + expect(aCalls).toEqual(["ingest"]); + expect(bCalls).toEqual([]); + }); + + it("chunkText helper throws with chunk_id when metadata.text is missing", async () => { + const { chunkText } = await import("@workglow/knowledge-base"); + expect(() => + chunkText({ + chunk_id: "c-no-text", + metadata: { custom: "x" } as unknown as Parameters[0]["metadata"], + }) + ).toThrow(/c-no-text/); + }); + + it("chunkText helper returns metadata.text when present", async () => { + const { chunkText } = await import("@workglow/knowledge-base"); + const text = chunkText({ + chunk_id: "c-has-text", + metadata: { text: "hello" } as unknown as Parameters[0]["metadata"], + }); + expect(text).toBe("hello"); + }); + + it("captures the strategy once at reindex() entry — mid-loop swap 
doesn't redirect remaining iterations", async () => { + const kb = await createKnowledgeBase({ + name: `kb-strategy-snapshot-reindex-${uuid4()}`, + vectorDimensions: 3, + register: false, + }); + + const aIngested: string[] = []; + const bIngested: string[] = []; + const strategyA = { + ingest: async (target: KnowledgeBase, d: Document) => { + aIngested.push(d.doc_id ?? ""); + // Swap to B partway through; the reindex loop should keep + // ingesting via A for the rest of this run. + target.setAiStrategy(strategyB); + return d; + }, + delete: async () => {}, + search: async () => [], + }; + const strategyB = { + ingest: async (_target: KnowledgeBase, d: Document) => { + bIngested.push(d.doc_id ?? ""); + return d; + }, + delete: async () => {}, + search: async () => [], + }; + + // Seed three documents through the storage layer directly so they're + // present for reindex to iterate. + for (let i = 0; i < 3; i++) { + const root = await StructuralParser.parseMarkdown(uuid4(), `# D${i}\n\nx.`, `D${i}`); + const doc = new Document(root, { title: `D${i}` }); + doc.setDocId(`doc-reindex-${i}`); + await kb.upsertDocument(doc); + } + + kb.setAiStrategy(strategyA); + const processed = await kb.reindex(); + expect(processed).toBe(3); + // All three iterations stayed on A even though A re-installed B + // after the first call. + expect(aIngested).toHaveLength(3); + expect(bIngested).toHaveLength(0); + // The KB's current strategy is now B (set during the loop). 
+ expect(kb.getAiStrategy()).toBe(strategyB); + }); + }); }); diff --git a/packages/test/src/test/rag/KnowledgeBaseStandardStrategy.test.ts b/packages/test/src/test/rag/KnowledgeBaseStandardStrategy.test.ts new file mode 100644 index 000000000..82e277147 --- /dev/null +++ b/packages/test/src/test/rag/KnowledgeBaseStandardStrategy.test.ts @@ -0,0 +1,190 @@ +/** + * @license + * Copyright 2026 Steven Roussey + * SPDX-License-Identifier: Apache-2.0 + */ + +import { createStandardKbStrategy } from "@workglow/ai"; +import { getAiProviderRegistry } from "@workglow/ai"; +import { getGlobalModelRepository } from "@workglow/ai"; +import type { ModelRecord } from "@workglow/ai"; +import { + ChunkVectorPrimaryKey, + ChunkVectorStorageSchema, + Document, + DocumentStorageKey, + DocumentStorageSchema, + KnowledgeBase, + StructuralParser, + createKnowledgeBase, +} from "@workglow/knowledge-base"; +import type { + ChunkVectorStorage, + DocumentTabularStorage, + InsertChunkVectorEntity, +} from "@workglow/knowledge-base"; +import { InMemoryTabularStorage, InMemoryVectorStorage } from "@workglow/storage"; +import { uuid4 } from "@workglow/util"; +import { beforeAll, describe, expect, it } from "vitest"; + +/** + * Tests exercising `createStandardKbStrategy` directly. Setup registers a + * tiny stub provider for `TextEmbeddingTask` so we don't need a real + * runtime (HuggingFace etc.) to assert order/tagging contracts. + */ +const TEST_PROVIDER = "test-strategy-provider"; +const TEST_EMBED_MODEL_ID = "test:strategy:embed"; + +describe("createStandardKbStrategy", () => { + beforeAll(async () => { + const registry = getAiProviderRegistry(); + registry.registerRunFn(TEST_PROVIDER, "TextEmbeddingTask", async (input) => { + // Deterministic 3-D unit vector keyed off the first text character; + // we don't need vector meaning here, only that embedTexts resolves. + const texts = Array.isArray((input as { text: unknown }).text) + ? 
((input as { text: string[] }).text as string[]) + : [((input as { text: string }).text as string) ?? ""]; + const vectors = texts.map(() => new Float32Array([1, 0, 0])); + return { + vector: vectors.length === 1 ? vectors[0] : vectors, + } as unknown as Record; + }); + + const modelRepo = getGlobalModelRepository(); + const existing = await modelRepo.findByName(TEST_EMBED_MODEL_ID).catch(() => undefined); + if (!existing) { + await modelRepo.addModel({ + model_id: TEST_EMBED_MODEL_ID, + tasks: ["TextEmbeddingTask"], + title: "Strategy test embed model", + description: "Stub embed model used by createStandardKbStrategy tests", + provider: TEST_PROVIDER, + provider_config: { native_dimensions: 3 }, + metadata: {}, + } as ModelRecord); + } + }); + + /** + * Seed the KB with a single pre-existing chunk so a re-ingest has + * something to delete and the partial-failure test can verify it's gone. + */ + async function seedChunk(kb: KnowledgeBase, doc_id: string, chunk_id: string): Promise { + const insert: InsertChunkVectorEntity = { + chunk_id, + doc_id, + vector: new Float32Array([1, 0, 0]), + metadata: { + chunk_id, + doc_id, + text: "old chunk text", + nodePath: [], + depth: 0, + } as never, + }; + await kb.upsertChunksBulk([insert]); + } + + it("ingest deletes existing chunks BEFORE upsertDocument when doc_id is set; partial failure leaves no orphan chunks", async () => { + // KB subclass that rejects on upsertChunksBulk to simulate a failure + // partway through ingest (after delete, after document upsert, after + // chunker, after embed, but during the bulk insert). 
+ class FailingKb extends KnowledgeBase { + failOnBulkInsert = false; + override async upsertChunksBulk(chunks: InsertChunkVectorEntity[]) { + if (this.failOnBulkInsert) { + throw new Error("simulated bulk-insert failure"); + } + return super.upsertChunksBulk(chunks); + } + } + + const tabular = new InMemoryTabularStorage(DocumentStorageSchema, DocumentStorageKey); + await tabular.setupDatabase(); + const vector = new InMemoryVectorStorage( + ChunkVectorStorageSchema, + ChunkVectorPrimaryKey, + [], + 3, + Float32Array + ); + await vector.setupDatabase(); + + const kb = new FailingKb( + `kb-ingest-order-${uuid4()}`, + tabular as unknown as DocumentTabularStorage, + vector as unknown as ChunkVectorStorage, + { docEmbeddingModel: TEST_EMBED_MODEL_ID } + ); + kb.setAiStrategy(createStandardKbStrategy()); + + const docId = "doc-ingest-order"; + // First, plant the document + a stale chunk that the next ingest + // should clear out. + const initialRoot = await StructuralParser.parseMarkdown( + uuid4(), + "# Initial\n\nold content.", + "Initial" + ); + const initialDoc = new Document(initialRoot, { title: "Initial" }); + initialDoc.setDocId(docId); + await kb.upsertDocument(initialDoc); + await seedChunk(kb, docId, "stale-chunk-1"); + expect((await kb.getChunksForDocument(docId)).length).toBe(1); + + // Now arm the failure and re-ingest the same doc_id. The strategy + // should: (1) delete the stale chunk, (2) upsert the new document + // version, (3) chunk + embed, (4) call upsertChunksBulk which + // throws. Post-failure: stale chunk still gone, document row + // reflects the new (re-upserted) content. 
+ kb.failOnBulkInsert = true; + const newRoot = await StructuralParser.parseMarkdown( + uuid4(), + "# Updated\n\nnew content.", + "Updated" + ); + const updatedDoc = new Document(newRoot, { title: "Updated" }); + updatedDoc.setDocId(docId); + + await expect(kb.upsert(updatedDoc)).rejects.toThrow(/simulated bulk-insert failure/); + + // Chunks: empty (stale gone, new ones never inserted) — the data- + // integrity invariant of the new ordering. + expect(await kb.getChunksForDocument(docId)).toEqual([]); + // Document row: present (upserted before the failure), with the new + // title — the new content "won" even though the chunks didn't. + const storedDoc = await kb.getDocument(docId); + expect(storedDoc).toBeDefined(); + expect(storedDoc!.metadata.title).toBe("Updated"); + }); + + it("rerank mode tags results with scoreType: 'rerank' via the heuristic fallback (no rerankerModel)", async () => { + const kb = await createKnowledgeBase({ + name: `kb-rerank-tag-${uuid4()}`, + vectorDimensions: 3, + register: false, + docEmbeddingModel: TEST_EMBED_MODEL_ID, + searchMode: "rerank", + // Intentionally no rerankerModel → heuristic RerankerTask fallback. + }); + kb.setAiStrategy(createStandardKbStrategy()); + + // Plant a doc + chunk so the first stage retrieves something for the + // reranker to score. + const docId = "doc-rerank-tag"; + const root = await StructuralParser.parseMarkdown(uuid4(), "# T\n\nhi.", "T"); + const doc = new Document(root, { title: "T" }); + doc.setDocId(docId); + await kb.upsertDocument(doc); + await seedChunk(kb, docId, "chunk-rerank-tag"); + + const results = await kb.search("hi", { topK: 1 }); + + expect(results.length).toBeGreaterThan(0); + // The first-stage retrieval would have produced "cosine" scores; the + // rerank fallback MUST override them to "rerank". 
+ for (const r of results) { + expect(r.scoreType).toBe("rerank"); + } + }); +}); diff --git a/packages/test/src/test/task/KbSearchTask.test.ts b/packages/test/src/test/task/KbSearchTask.test.ts index 44e888e16..74cf26c03 100644 --- a/packages/test/src/test/task/KbSearchTask.test.ts +++ b/packages/test/src/test/task/KbSearchTask.test.ts @@ -82,4 +82,23 @@ describe("KbSearchTask — execute()", () => { // @ts-expect-error — test helper expect(kb._calls[0].opts).toMatchObject({ topK: 5 }); }); + + it("throws (with the offending chunk_id) when a result is missing metadata.text", async () => { + // Custom-strategy KBs are free to return whatever shape they like, but + // chunks without `metadata.text` violate the documented contract on + // InsertChunkVectorEntity. `KbSearchTask` enforces that contract via + // `chunkText`; previously it silently fell back to + // JSON.stringify(metadata), which surfaced as garbage downstream. + const offending: ChunkSearchResult = { + chunk_id: "c-no-text", + doc_id: "d1", + vector: new Float32Array([1, 0, 0]), + // `text` intentionally absent — custom chunker forgot it. + metadata: { custom: "x" } as never, + score: 1, + }; + const kb = makeFakeKb([offending]); + const task = new KbSearchTask(); + await expect(task.run({ knowledgeBase: kb, query: "q" })).rejects.toThrow(/c-no-text/); + }); }); From b2422af0bd90e17009a69f967673395b34a0baac Mon Sep 17 00:00:00 2001 From: Claude Date: Tue, 12 May 2026 20:03:48 +0000 Subject: [PATCH 05/11] ci: pkg-pr-new publishes every workspace under packages/* + providers/* + examples/cli MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The previous publish list was a hand-maintained subset (util, storage, job-queue, task-graph, knowledge-base, tasks, ai, ai-provider, workglow, examples/cli) and referenced a defunct `ai-provider` directory. 
As a result, the PR-preview install URLs for every vendor package (anthropic, openai, ollama, huggingface-*, sqlite, postgres, supabase, mcp, indexeddb, javascript, browser-control, etc.) 404'd, blocking downstream repos that want to consume a libs PR via overrides. Replace with `./packages/* ./providers/* ./examples/cli` so the shell expands to every workspace at exec time. `pkg-pr-new` honors the `"private": true` flag on `@workglow/test`, so internal-only workspaces are skipped automatically — no need to enumerate. https://claude.ai/code/session_01Ya54WFZhpDFzAqRh1qG8Ex --- .github/workflows/publish-preview.yml | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/.github/workflows/publish-preview.yml b/.github/workflows/publish-preview.yml index 911610aa6..a7e385a14 100644 --- a/.github/workflows/publish-preview.yml +++ b/.github/workflows/publish-preview.yml @@ -26,4 +26,9 @@ jobs: bun-version: "1.3.11" - run: bun i - run: bun run build - - run: bunx pkg-pr-new publish './packages/util' './packages/storage' './packages/job-queue' './packages/task-graph' './packages/knowledge-base' './packages/tasks' './packages/ai' './packages/ai-provider' './packages/workglow' './examples/cli' + # Publish every workspace under packages/* and providers/* as a PR- + # preview package, plus the cli example. `pkg-pr-new` honors the + # `"private": true` flag on `@workglow/test`, so internal-only + # workspaces are silently skipped. The shell expands the globs into + # one positional argument per matched directory. 
+ - run: bunx pkg-pr-new publish ./packages/* ./providers/* ./examples/cli From 60cea62a18848dfdf062d9367a85b4c917f98696 Mon Sep 17 00:00:00 2001 From: Steven Roussey Date: Wed, 13 May 2026 01:18:12 -0700 Subject: [PATCH 06/11] fix(ai/kb): honor firstStageMinimum floor in rerank candidate pool The rerank-mode first-stage size was computed as `Math.max(topK * firstStageMultiplier, topK)`, which is a no-op since `topK * mult >= topK` whenever `mult >= 1`. The intended floor was a fixed minimum (commented as such), so very small `topK` (e.g. topK=1, mult=5 -> 5 candidates) silently collapsed the reranker's input down to a handful of candidates with no real choice to make. Add a new `firstStageMinimum` option to `CreateStandardKbStrategyOptions` (default 20) and use it as the actual floor: `Math.max(topK * firstStageMultiplier, firstStageMinimum)`. Update JSDoc on `firstStageMultiplier` and the new `firstStageMinimum` to describe how they interact. Adds a vitest suite that spies on `hybridSearch` / `similaritySearch` and asserts the first-stage `topK` value forwarded to them across representative inputs. --- .../ai/src/kb/createStandardKbStrategy.ts | 24 +++- ...CreateStandardKbStrategyFirstStage.test.ts | 107 ++++++++++++++++++ 2 files changed, 128 insertions(+), 3 deletions(-) create mode 100644 packages/test/src/test/rag/CreateStandardKbStrategyFirstStage.test.ts diff --git a/packages/ai/src/kb/createStandardKbStrategy.ts b/packages/ai/src/kb/createStandardKbStrategy.ts index 2a8b7d670..e511c2b2e 100644 --- a/packages/ai/src/kb/createStandardKbStrategy.ts +++ b/packages/ai/src/kb/createStandardKbStrategy.ts @@ -42,10 +42,22 @@ export interface CreateStandardKbStrategyOptions { * Multiplier applied to `topK` to size the first-stage candidate pool * when `searchMode === "rerank"`. The reranker then narrows the pool * back down to `topK`. Defaults to `5`, i.e. first stage fetches - * `topK * 5` candidates (with a `topK` floor so it never returns fewer - * than `topK`). 
+ * `topK * 5` candidates. Used together with `firstStageMinimum` — + * the actual first-stage size is `max(topK * firstStageMultiplier, + * firstStageMinimum)`, so a tiny `topK` (e.g. `1`) still yields a + * meaningful candidate pool for the reranker to choose from instead of + * collapsing to a single candidate. */ readonly firstStageMultiplier?: number; + /** + * Minimum first-stage candidate pool size when `searchMode === "rerank"`. + * Defaults to `20`. Prevents the rerank pool from collapsing to + * `topK` for very small `topK` values where `topK * firstStageMultiplier` + * would still be too few candidates for the reranker to do useful work. + * The effective first-stage size is + * `max(topK * firstStageMultiplier, firstStageMinimum)`. + */ + readonly firstStageMinimum?: number; } /** @@ -77,6 +89,7 @@ export function createStandardKbStrategy( reservedTokens: options.chunker?.reservedTokens ?? 10, } as const; const firstStageMultiplier = options.firstStageMultiplier ?? 5; + const firstStageMinimum = options.firstStageMinimum ?? 20; const resolveSearchMode = (kb: IKbStrategyTarget): SearchMode => { if (options.searchMode) return options.searchMode; @@ -200,7 +213,12 @@ export function createStandardKbStrategy( } // mode === "rerank" - const firstStageTopK = Math.max(topK * firstStageMultiplier, topK); + // First-stage pool is `topK * firstStageMultiplier`, but never + // smaller than `firstStageMinimum`. The floor matters for small + // `topK`: with `topK=1, multiplier=5` the raw product is 5, which + // robs the reranker of any real choice. The minimum keeps the + // candidate pool meaningful regardless of how small `topK` is. + const firstStageTopK = Math.max(topK * firstStageMultiplier, firstStageMinimum); const firstStage: ChunkSearchResult[] = kb.supportsHybridSearch() ? 
await kb.hybridSearch(vector, { textQuery: query, diff --git a/packages/test/src/test/rag/CreateStandardKbStrategyFirstStage.test.ts b/packages/test/src/test/rag/CreateStandardKbStrategyFirstStage.test.ts new file mode 100644 index 000000000..8b9f63b01 --- /dev/null +++ b/packages/test/src/test/rag/CreateStandardKbStrategyFirstStage.test.ts @@ -0,0 +1,107 @@ +/** + * @license + * Copyright 2026 Steven Roussey + * SPDX-License-Identifier: Apache-2.0 + */ + +import { createStandardKbStrategy, getAiProviderRegistry, getGlobalModelRepository } from "@workglow/ai"; +import type { ModelRecord } from "@workglow/ai"; +import { createKnowledgeBase } from "@workglow/knowledge-base"; +import { uuid4 } from "@workglow/util"; +import { beforeAll, describe, expect, it, vi } from "vitest"; + +/** + * Tests for the first-stage candidate-pool sizing in + * `createStandardKbStrategy` rerank mode. The pool size is + * `max(topK * firstStageMultiplier, firstStageMinimum)`; the minimum + * exists so that a very small `topK` (e.g. 1) does not collapse the + * reranker's input down to a useless handful of candidates. + */ +const TEST_PROVIDER = "test-firststage-provider"; +const TEST_EMBED_MODEL_ID = "test:firststage:embed"; + +describe("createStandardKbStrategy first-stage sizing (rerank mode)", () => { + beforeAll(async () => { + const registry = getAiProviderRegistry(); + registry.registerRunFn(TEST_PROVIDER, "TextEmbeddingTask", async (input) => { + const texts = Array.isArray((input as { text: unknown }).text) + ? ((input as { text: string[] }).text as string[]) + : [((input as { text: string }).text as string) ?? ""]; + const vectors = texts.map(() => new Float32Array([1, 0, 0])); + return { + vector: vectors.length === 1 ? 
vectors[0] : vectors, + } as unknown as Record; + }); + + const modelRepo = getGlobalModelRepository(); + const existing = await modelRepo.findByName(TEST_EMBED_MODEL_ID).catch(() => undefined); + if (!existing) { + await modelRepo.addModel({ + model_id: TEST_EMBED_MODEL_ID, + tasks: ["TextEmbeddingTask"], + title: "First-stage sizing test embed model", + description: "Stub embed model for first-stage sizing tests", + provider: TEST_PROVIDER, + provider_config: { native_dimensions: 3 }, + metadata: {}, + } as ModelRecord); + } + }); + + /** + * Set up a rerank-mode KB and spy on the first-stage retrieval call so + * we can assert exactly what `topK` the strategy hands down to it. + * Heuristic-reranker path (no `rerankerModel`) is used because we only + * care about the first-stage `topK`, not the reranker output. + */ + async function captureFirstStageTopK( + strategyOptions: Parameters[0], + searchTopK: number + ): Promise { + const kb = await createKnowledgeBase({ + name: `kb-first-stage-${uuid4()}`, + vectorDimensions: 3, + register: false, + docEmbeddingModel: TEST_EMBED_MODEL_ID, + searchMode: "rerank", + }); + kb.setAiStrategy(createStandardKbStrategy(strategyOptions)); + + // Spy on whichever first-stage method the strategy will pick. Both + // are stubbed to return [] so the rerank path short-circuits and we + // don't have to seed real chunks. + const hybridSpy = vi + .spyOn(kb, "hybridSearch" as never) + .mockResolvedValue([] as never); + const similaritySpy = vi + .spyOn(kb, "similaritySearch" as never) + .mockResolvedValue([] as never); + + await kb.search("hi", { topK: searchTopK }); + + // Exactly one of the spies should fire (mutually exclusive branches + // in the strategy: hybrid if supportsHybridSearch, else similarity). 
+ const calls = [...hybridSpy.mock.calls, ...similaritySpy.mock.calls]; + expect(calls.length).toBe(1); + const opts = calls[0][1] as { topK: number }; + return opts.topK; + } + + it("topK=1, multiplier defaults to 5, minimum defaults to 20 → first-stage topK=20", async () => { + const firstStage = await captureFirstStageTopK(undefined, 1); + expect(firstStage).toBe(20); + }); + + it("topK=10, multiplier defaults to 5 → first-stage topK=50 (above minimum)", async () => { + const firstStage = await captureFirstStageTopK(undefined, 10); + expect(firstStage).toBe(50); + }); + + it("topK=2, multiplier=1, firstStageMinimum=20 → first-stage topK=20 (minimum wins)", async () => { + const firstStage = await captureFirstStageTopK( + { firstStageMultiplier: 1, firstStageMinimum: 20 }, + 2 + ); + expect(firstStage).toBe(20); + }); +}); From 505100d785cdd3d2e4e55a498261dda51867d24a Mon Sep 17 00:00:00 2001 From: Steven Roussey Date: Wed, 13 May 2026 01:19:14 -0700 Subject: [PATCH 07/11] fix(ai/task): forward scoreThreshold from KbSearchTask to kb.search MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The input schema accepted no `scoreThreshold`, and `execute` did not destructure or forward one either, so any threshold supplied via the task surface was silently dropped before reaching `kb.search`. Callers wiring a threshold through a workflow would get unfiltered results with no warning. Add `scoreThreshold` to the input schema (number, minimum 0) and forward it in the call to `kb.search(query, { topK, filter, scoreThreshold })`. Note that the standard strategy still ignores the threshold in rerank mode by design (cross-encoder logits aren't on the same scale as cosine/RRF scores) — that contract is documented in `createStandardKbStrategy` and the new schema description. Adds a vitest suite that spies on `kb.search` and asserts the threshold is forwarded when provided and absent (undefined) when omitted. 
--- packages/ai/src/task/KbSearchTask.ts | 16 ++++- .../test/src/test/rag/KbSearchTask.test.ts | 66 +++++++++++++++++++ 2 files changed, 80 insertions(+), 2 deletions(-) create mode 100644 packages/test/src/test/rag/KbSearchTask.test.ts diff --git a/packages/ai/src/task/KbSearchTask.ts b/packages/ai/src/task/KbSearchTask.ts index 5590fc9a2..8e530f9a8 100644 --- a/packages/ai/src/task/KbSearchTask.ts +++ b/packages/ai/src/task/KbSearchTask.ts @@ -34,6 +34,13 @@ const inputSchema = { title: "Metadata Filter", description: "Filter results by chunk metadata fields.", }, + scoreThreshold: { + type: "number", + title: "Score Threshold", + description: + "Minimum score to include a result. Honored by similarity and hybrid search modes; ignored by the strategy in rerank mode (cross-encoder logits aren't comparable to cosine/RRF scores).", + minimum: 0, + }, }, required: ["knowledgeBase", "query"], additionalProperties: false, @@ -116,9 +123,14 @@ export class KbSearchTask extends Task { - const { knowledgeBase, query, topK = 5, filter } = input; + const { knowledgeBase, query, topK = 5, filter, scoreThreshold } = input; const kb = knowledgeBase as KnowledgeBase; - const results = await kb.search(query, { topK, filter }); + // Forward `scoreThreshold` to the strategy. The standard strategy + // honors it in similarity / hybrid modes and intentionally ignores + // it in rerank mode (cross-encoder logits aren't on the same scale + // as cosine / RRF, so a single numeric threshold would either drop + // everything or nothing). 
+ const results = await kb.search(query, { topK, filter, scoreThreshold }); return { results, // `chunkText` enforces the metadata.text contract — any chunk diff --git a/packages/test/src/test/rag/KbSearchTask.test.ts b/packages/test/src/test/rag/KbSearchTask.test.ts new file mode 100644 index 000000000..6c1e87d34 --- /dev/null +++ b/packages/test/src/test/rag/KbSearchTask.test.ts @@ -0,0 +1,66 @@ +/** + * @license + * Copyright 2026 Steven Roussey + * SPDX-License-Identifier: Apache-2.0 + */ + +import { kbSearch } from "@workglow/ai"; +import { createKnowledgeBase } from "@workglow/knowledge-base"; +import { uuid4 } from "@workglow/util"; +import { describe, expect, it, vi } from "vitest"; + +/** + * Tests that `KbSearchTask` forwards `scoreThreshold` to `kb.search`. + * The task is the public surface that downstream callers wire into a + * workflow; if the schema accepts `scoreThreshold` but `execute` drops + * it, callers silently get unfiltered results. + */ +describe("KbSearchTask scoreThreshold forwarding", () => { + /** + * Build a KB whose `search` method is replaced with a spy that returns + * an empty result list. The spy lets us assert exactly which options + * the task hands down. 
+ */ + async function makeKbWithSearchSpy() { + const kb = await createKnowledgeBase({ + name: `kb-search-task-${uuid4()}`, + vectorDimensions: 3, + register: false, + }); + const searchSpy = vi.spyOn(kb, "search").mockResolvedValue([]); + return { kb, searchSpy }; + } + + it("forwards `scoreThreshold` to kb.search when provided", async () => { + const { kb, searchSpy } = await makeKbWithSearchSpy(); + + await kbSearch({ + knowledgeBase: kb, + query: "hello", + topK: 3, + scoreThreshold: 0.42, + }); + + expect(searchSpy).toHaveBeenCalledTimes(1); + const [forwardedQuery, forwardedOpts] = searchSpy.mock.calls[0]; + expect(forwardedQuery).toBe("hello"); + expect(forwardedOpts).toMatchObject({ topK: 3, scoreThreshold: 0.42 }); + }); + + it("passes `scoreThreshold: undefined` to kb.search when omitted", async () => { + const { kb, searchSpy } = await makeKbWithSearchSpy(); + + await kbSearch({ + knowledgeBase: kb, + query: "hello", + topK: 3, + }); + + expect(searchSpy).toHaveBeenCalledTimes(1); + const [, forwardedOpts] = searchSpy.mock.calls[0]; + // Property may either be absent or explicitly undefined; both + // behave identically downstream. What we really care about is + // that it's NOT a stale value carried over from a previous call. + expect((forwardedOpts as { scoreThreshold?: number }).scoreThreshold).toBeUndefined(); + }); +}); From ad6553b7b639d89be337676a658e7187233275d5 Mon Sep 17 00:00:00 2001 From: Steven Roussey Date: Thu, 14 May 2026 01:14:04 -0700 Subject: [PATCH 08/11] docs(kb): document strategy trust model on IKbAiStrategy and IKbStrategyTarget Strategies receive the KB's full low-level storage surface and can bypass ScopedKnowledgeBase virtual-dispatch scoping. Documents this explicitly so operators do not load strategies from untrusted sources, user input, or remote registries. Also adds pointer TSDoc on KnowledgeBase and setAiStrategy referencing the new trust model paragraph. 
--- .../src/knowledge-base/IKbAiStrategy.ts | 24 +++++++++++++++++++ 1 file changed, 24 insertions(+) diff --git a/packages/knowledge-base/src/knowledge-base/IKbAiStrategy.ts b/packages/knowledge-base/src/knowledge-base/IKbAiStrategy.ts index 685c4d53e..38867a23f 100644 --- a/packages/knowledge-base/src/knowledge-base/IKbAiStrategy.ts +++ b/packages/knowledge-base/src/knowledge-base/IKbAiStrategy.ts @@ -30,6 +30,18 @@ import type { ISearchOptions } from "./KnowledgeBase"; * `kb.upsertDocument` / `kb.upsertChunksBulk` / `kb.similaritySearch` etc. * go through virtual dispatch — subclasses (e.g. `ScopedKnowledgeBase`) * can intercept the low-level ops without the strategy knowing. + * + * **Trust model.** Strategies are TRUSTED CODE. An installed strategy + * receives an {@link IKbStrategyTarget} that exposes the KB's full + * low-level storage surface (`upsertDocument`, `deleteChunksForDocument`, + * `upsertChunksBulk`, `similaritySearch`, `hybridSearch`). Application-level + * access control (e.g. user/project scoping enforced by + * `ScopedKnowledgeBase`) does not sandbox a strategy: that scoping is + * implemented via virtual dispatch on the *target instance*, so a + * malicious or buggy strategy can bypass it by routing data through a + * different KB. Do NOT load strategies from untrusted + * sources, user input, or remote registries. Install only strategies you + * ship in trusted application code. */ export interface IKbAiStrategy { /** @@ -71,6 +83,18 @@ export interface IKbAiStrategy { * The narrow KB surface strategies operate against. Spells out exactly the * building-block methods strategies need so the public KB API * (`upsert`/`delete`/`search`) stays the only surface callers see. + * + * **Trust model.** This interface is the full low-level storage surface a + * strategy can reach: `upsertDocument`, `deleteChunksForDocument`, + * `upsertChunksBulk`, `similaritySearch`, `hybridSearch`.
Application-level + * access control (e.g. user/project scoping enforced by + * `ScopedKnowledgeBase`) does not sandbox a strategy: that scoping is + * implemented via virtual dispatch on the *target instance*, so a + * malicious or buggy strategy can bypass it by routing data through a + * different KB. Treat installed strategies as TRUSTED + * CODE — do NOT load them from untrusted sources, user input, or remote + * registries. Install only strategies you ship in trusted application + * code. See {@link IKbAiStrategy} for the full trust model. */ export interface IKbStrategyTarget { readonly name: string; From a1ca800016db1b78e45f3c33f69e3c39fcb10dc8 Mon Sep 17 00:00:00 2001 From: Steven Roussey Date: Thu, 14 May 2026 01:15:58 -0700 Subject: [PATCH 09/11] docs(kb): point KnowledgeBase and setAiStrategy to strategy trust model One-sentence pointer added on the class TSDoc and on setAiStrategy so callers installing a strategy are reminded to read the trust model documented on IKbAiStrategy / IKbStrategyTarget. --- packages/knowledge-base/src/knowledge-base/KnowledgeBase.ts | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/packages/knowledge-base/src/knowledge-base/KnowledgeBase.ts b/packages/knowledge-base/src/knowledge-base/KnowledgeBase.ts index e15cbb4ee..82fb71ebe 100644 --- a/packages/knowledge-base/src/knowledge-base/KnowledgeBase.ts +++ b/packages/knowledge-base/src/knowledge-base/KnowledgeBase.ts @@ -177,6 +177,9 @@ export interface KnowledgeBaseOptions { * of these goes through virtual dispatch, so subclasses (e.g. a * tenant-scoped KB) can intercept any of them without the strategy * knowing. + * + * See {@link IKbAiStrategy} for the strategy trust model — installed + * strategies are TRUSTED CODE and must not come from untrusted sources. */ export class KnowledgeBase { readonly name: string; @@ -224,6 +227,9 @@ export class KnowledgeBase { /** * Install (or replace) the AI strategy used by `upsert`/`delete`/`search`.
* + * See {@link IKbAiStrategy} for the strategy trust model — strategies + * receive the KB's full low-level storage surface and are TRUSTED CODE. + * * Replacing the strategy does NOT affect operations already in flight. * Each public op (`upsert`/`delete`/`search`/`reindex`) resolves its * strategy at entry via {@link requireStrategy} and holds that reference From 68ca1b1f90bff7630687943d0743b875d5260f9d Mon Sep 17 00:00:00 2001 From: Steven Roussey Date: Thu, 14 May 2026 01:17:07 -0700 Subject: [PATCH 10/11] fix(hft): validate reranker pipeline output shape; throw typed error on mismatch HFT_TextReranker used `as unknown as (...) => Promise<...>` to coerce the transformers.js pipeline, then read `.score` without runtime validation. A pipeline that returned a different shape (e.g. missing scores, wrong field names) would silently produce garbage scores instead of failing loudly. - Added KbRerankerOutputError in @workglow/ai's TextRerankerTask module so callers can `instanceof`-test from the public barrel. - Replaced the cast with a narrow unknown-typed local and an `isScored` guard. On mismatch we throw KbRerankerOutputError including the model path and a truncated shape snippet for diagnostics. - Removed the silent `?? 0` fallback so missing scores fail loudly instead of returning 0. - Added a shape-validation test that mocks getPipeline and exercises (a) total shape mismatch, (b) the valid mixed object/array-of-objects case from `top_k > 1`, (c) a partial mismatch within an otherwise valid batch. 
--- packages/ai/src/task/TextRerankerTask.ts | 20 ++++ .../test/rag/HFT_TextReranker.shape.test.ts | 110 ++++++++++++++++++ .../src/ai/common/HFT_TextReranker.ts | 76 +++++++++--- 3 files changed, 193 insertions(+), 13 deletions(-) create mode 100644 packages/test/src/test/rag/HFT_TextReranker.shape.test.ts diff --git a/packages/ai/src/task/TextRerankerTask.ts b/packages/ai/src/task/TextRerankerTask.ts index 4ad97022f..b6d91aa7d 100644 --- a/packages/ai/src/task/TextRerankerTask.ts +++ b/packages/ai/src/task/TextRerankerTask.ts @@ -64,6 +64,26 @@ export type TextRerankerTaskInput = FromSchema; export type TextRerankerTaskOutput = FromSchema; export type TextRerankerTaskConfig = TaskConfig; +/** + * Thrown by reranker provider run-fns when the underlying ML pipeline + * returns output that doesn't match the expected `{ label, score }` + * shape (or array thereof when `top_k > 1`). Co-located with the task + * definition so callers can `instanceof`-test against a single import + * regardless of which provider is installed. + * + * `actualShape` is a truncated, JSON-stringified snippet of the offending + * entry — enough to point an operator at the misconfigured model without + * dumping arbitrary tensors into logs. + */ +export class KbRerankerOutputError extends Error { + public readonly actualShape: unknown; + constructor(message: string, actualShape: unknown) { + super(message); + this.name = "KbRerankerOutputError"; + this.actualShape = actualShape; + } +} + /** * AiTask for cross-encoder reranking. Providers register a run-fn for this * task type (e.g. 
HuggingFace Transformers using a `text-classification` diff --git a/packages/test/src/test/rag/HFT_TextReranker.shape.test.ts b/packages/test/src/test/rag/HFT_TextReranker.shape.test.ts new file mode 100644 index 000000000..21c7e71af --- /dev/null +++ b/packages/test/src/test/rag/HFT_TextReranker.shape.test.ts @@ -0,0 +1,110 @@ +/** + * @license + * Copyright 2026 Steven Roussey + * SPDX-License-Identifier: Apache-2.0 + */ + +import { KbRerankerOutputError } from "@workglow/ai"; +import { describe, expect, it, vi } from "vitest"; + +// `HFT_TextReranker` is not re-exported from a public barrel — it is wired up +// via `HFT_TASKS` in `HFT_JobRunFns.ts`. We import it via the deep module path +// so the test exercises the actual run-fn rather than a registry indirection. +// Mock `getPipeline` from the same deep path; vi.mock is hoisted, so the +// import below resolves to the mocked version. +vi.mock("@workglow/huggingface-transformers/dist/ai/common/HFT_Pipeline", () => ({ + getPipeline: vi.fn(), +})); +vi.mock("../../../../../providers/huggingface-transformers/src/ai/common/HFT_Pipeline", () => ({ + getPipeline: vi.fn(), +})); + +import { HFT_TextReranker } from "../../../../../providers/huggingface-transformers/src/ai/common/HFT_TextReranker"; +import { getPipeline } from "../../../../../providers/huggingface-transformers/src/ai/common/HFT_Pipeline"; + +const getPipelineMock = getPipeline as unknown as ReturnType; + +const MODEL_PATH = "Xenova/bge-reranker-base"; + +function makeModel() { + return { + model_id: `onnx:${MODEL_PATH}:q8`, + provider: "HuggingFaceTransformersOnnx", + provider_config: { + pipeline: "text-classification" as const, + model_path: MODEL_PATH, + }, + } as unknown as Parameters[1]; +} + +function makeInput(documents: readonly string[]) { + return { + query: "q", + documents: [...documents], + model: `onnx:${MODEL_PATH}:q8`, + } as Parameters[0]; +} + +const noopProgress = () => {}; +const noopSignal = undefined; + +describe("HFT_TextReranker 
pipeline output shape validation", () => { + it("throws KbRerankerOutputError when an entry lacks a numeric `score`", async () => { + // Single doc, single entry that's the wrong shape. + const fakePipeline = vi.fn().mockResolvedValue([{ foo: "bar" }]); + getPipelineMock.mockResolvedValueOnce(fakePipeline); + + const run = () => + HFT_TextReranker(makeInput(["doc-1"]), makeModel(), noopProgress, noopSignal); + + await expect(run()).rejects.toBeInstanceOf(KbRerankerOutputError); + await expect(run()).rejects.toMatchObject({ + message: expect.stringContaining("unexpected pipeline output shape"), + }); + await expect(run()).rejects.toMatchObject({ + message: expect.stringContaining(MODEL_PATH), + }); + }); + + it("accepts both `{ label, score }` and `[{ label, score }]` entries and returns per-doc scores in input order", async () => { + // First entry is a bare object; second entry is a one-element array — the + // shape transformers.js emits when `top_k > 1`. Both must validate. + const fakePipeline = vi.fn().mockResolvedValue([ + { label: "LABEL_0", score: 0.42 }, + [{ label: "LABEL_0", score: 0.1 }], + ]); + getPipelineMock.mockResolvedValueOnce(fakePipeline); + + const result = await HFT_TextReranker( + makeInput(["doc-A", "doc-B"]), + makeModel(), + noopProgress, + noopSignal + ); + + // Scores stay in the original input order so callers can zip them back to + // their candidate list. + expect(result.scores).toEqual([0.42, 0.1]); + // Indices are sorted best-first. + expect(result.indices).toEqual([0, 1]); + }); + + it("throws when one entry in a batch is valid and another is malformed", async () => { + // The first doc returns a well-formed score; the second returns junk. + // A silent `?? 0` would have hidden this and returned a 0 score for doc-2. 
+ const fakePipeline = vi.fn().mockResolvedValue([ + { label: "LABEL_0", score: 0.9 }, + { label: "LABEL_0" /* score missing */ }, + ]); + getPipelineMock.mockResolvedValueOnce(fakePipeline); + + await expect( + HFT_TextReranker( + makeInput(["doc-good", "doc-bad"]), + makeModel(), + noopProgress, + noopSignal + ) + ).rejects.toBeInstanceOf(KbRerankerOutputError); + }); +}); diff --git a/providers/huggingface-transformers/src/ai/common/HFT_TextReranker.ts b/providers/huggingface-transformers/src/ai/common/HFT_TextReranker.ts index 75f05da1a..9c187cb0a 100644 --- a/providers/huggingface-transformers/src/ai/common/HFT_TextReranker.ts +++ b/providers/huggingface-transformers/src/ai/common/HFT_TextReranker.ts @@ -10,10 +10,25 @@ import type { TextRerankerTaskInput, TextRerankerTaskOutput, } from "@workglow/ai"; +import { KbRerankerOutputError } from "@workglow/ai"; import { getLogger } from "@workglow/util/worker"; import type { HfTransformersOnnxModelConfig } from "./HFT_ModelSchema"; import { getPipeline } from "./HFT_Pipeline"; +/** + * Narrow guard: an entry from the text-classification pipeline must be an + * object with a numeric `score`. We deliberately do not require `label` + * to be present because some downstream models omit it; only `score` is + * load-bearing for reranking. + */ +function isScored(v: unknown): v is { label?: string; score: number } { + return ( + typeof v === "object" && + v !== null && + typeof (v as { score?: unknown }).score === "number" + ); +} + /** * Cross-encoder reranker run-fn. Loads a `text-classification` pipeline * (the way transformers.js exposes cross-encoder models like @@ -22,6 +37,13 @@ import { getPipeline } from "./HFT_Pipeline"; * Output `indices` is sorted best-first; `scores` is the per-document score * in the original input order so callers can join back to their candidate * list without re-sorting. 
+ * + * Each pipeline result entry is validated at runtime: either a `{ score }` + * object or a non-empty array of such objects (transformers.js returns the + * array form when `top_k > 1`). On mismatch we throw + * {@link KbRerankerOutputError} with the model path and a truncated shape + * snippet — silently coercing missing scores to 0 would hide real model + * config bugs. */ export const HFT_TextReranker: AiProviderRunFn< TextRerankerTaskInput, @@ -29,29 +51,42 @@ export const HFT_TextReranker: AiProviderRunFn< HfTransformersOnnxModelConfig > = async (input, model, onProgress, signal) => { const logger = getLogger(); - const timerLabel = `hft:TextReranker:${model?.provider_config.model_path}`; + const modelPath = model?.provider_config.model_path; + const timerLabel = `hft:TextReranker:${modelPath}`; logger.time(timerLabel, { docs: input.documents.length }); const reranker: TextClassificationPipeline = await getPipeline(model!, onProgress, {}, signal); // Transformers.js' text-classification pipeline accepts an array of // { text, text_pair } objects for sentence-pair tasks (which cross-encoder - // rerankers are). The pipeline returns one score per input pair. + // rerankers are). The pipeline returns one score per input pair (or an + // array of scored entries per pair when `top_k > 1`). const pairs = input.documents.map((doc) => ({ text: input.query, text_pair: doc })); - const rawResults = (await (reranker as unknown as ( - inputs: Array<{ text: string; text_pair: string }>, + + // Type as `unknown` so we are forced to validate. Do NOT use `as unknown as + // (...) => Promise<...>` here — that cast erases the typing safety net. 
+ const callable = reranker as unknown as ( + inputs: ReadonlyArray<{ text: string; text_pair: string }>, options?: Record - ) => Promise>>)( - pairs, - { top_k: 1 } - )) as Array<{ label: string; score: number } | Array<{ label: string; score: number }>>; + ) => Promise; + const rawResults: unknown = await callable(pairs, { top_k: 1 }); - const scores: number[] = rawResults.map((r) => { - if (Array.isArray(r)) { - // top_k > 1 returns array per input — take the best - return r[0]?.score ?? 0; + if (!Array.isArray(rawResults)) { + throw new KbRerankerOutputError( + `HFT_TextReranker: unexpected pipeline output shape for model ${modelPath}`, + truncateShape(rawResults) + ); + } + + const scores: number[] = rawResults.map((entry) => { + const candidate = Array.isArray(entry) ? entry[0] : entry; + if (!isScored(candidate)) { + throw new KbRerankerOutputError( + `HFT_TextReranker: unexpected pipeline output shape for model ${modelPath}`, + truncateShape(entry) + ); } - return r.score; + return candidate.score; }); const indices = scores @@ -64,3 +99,18 @@ export const HFT_TextReranker: AiProviderRunFn< logger.timeEnd(timerLabel, { docs: input.documents.length }); return { scores, indices: limited }; }; + +/** + * Serialize an offending shape for the error payload. We cap the length to + * keep error messages bounded — a misconfigured model could otherwise dump + * unbounded data into logs. + */ +function truncateShape(value: unknown): string { + try { + const json = JSON.stringify(value); + if (typeof json !== "string") return String(value); + return json.length > 200 ? 
`${json.slice(0, 200)}…` : json; + } catch { + return String(value); + } +} From 0a693f9771c4047f2533b3355358cedc36cdf07e Mon Sep 17 00:00:00 2001 From: Steven Roussey Date: Thu, 14 May 2026 01:19:03 -0700 Subject: [PATCH 11/11] test(hft): exercise reranker shape validation via extracted pure helper Refactors the runtime shape guard out of `HFT_TextReranker` into an exported pure helper `validateAndExtractRerankerScores`, then tests that helper directly. Exporting the helper also lets it be barreled from `@workglow/huggingface-transformers/ai-runtime` so tests can exercise the shape validation without spinning up a real transformers.js pipeline or mocking deep-path internals (avoids ESM module-mock portability between bun:test and vitest). The run-fn delegates to the helper; behavior is unchanged. --- .../test/rag/HFT_TextReranker.shape.test.ts | 148 +++++++----------- .../src/ai/common/HFT_TextReranker.ts | 101 +++++++----- .../src/ai/runtime.ts | 1 + 3 files changed, 122 insertions(+), 128 deletions(-) diff --git a/packages/test/src/test/rag/HFT_TextReranker.shape.test.ts b/packages/test/src/test/rag/HFT_TextReranker.shape.test.ts index 21c7e71af..8822fc63d 100644 --- a/packages/test/src/test/rag/HFT_TextReranker.shape.test.ts +++ b/packages/test/src/test/rag/HFT_TextReranker.shape.test.ts @@ -5,106 +5,72 @@ */ import { KbRerankerOutputError } from "@workglow/ai"; -import { describe, expect, it, vi } from "vitest"; - -// `HFT_TextReranker` is not re-exported from a public barrel — it is wired up -// via `HFT_TASKS` in `HFT_JobRunFns.ts`. We import it via the deep module path -// so the test exercises the actual run-fn rather than a registry indirection. -// Mock `getPipeline` from the same deep path; vi.mock is hoisted, so the -// import below resolves to the mocked version. 
-vi.mock("@workglow/huggingface-transformers/dist/ai/common/HFT_Pipeline", () => ({ - getPipeline: vi.fn(), -})); -vi.mock("../../../../../providers/huggingface-transformers/src/ai/common/HFT_Pipeline", () => ({ - getPipeline: vi.fn(), -})); - -import { HFT_TextReranker } from "../../../../../providers/huggingface-transformers/src/ai/common/HFT_TextReranker"; -import { getPipeline } from "../../../../../providers/huggingface-transformers/src/ai/common/HFT_Pipeline"; - -const getPipelineMock = getPipeline as unknown as ReturnType; +import { validateAndExtractRerankerScores } from "@workglow/huggingface-transformers/ai-runtime"; +import { describe, expect, it } from "vitest"; const MODEL_PATH = "Xenova/bge-reranker-base"; -function makeModel() { - return { - model_id: `onnx:${MODEL_PATH}:q8`, - provider: "HuggingFaceTransformersOnnx", - provider_config: { - pipeline: "text-classification" as const, - model_path: MODEL_PATH, - }, - } as unknown as Parameters[1]; -} - -function makeInput(documents: readonly string[]) { - return { - query: "q", - documents: [...documents], - model: `onnx:${MODEL_PATH}:q8`, - } as Parameters[0]; -} - -const noopProgress = () => {}; -const noopSignal = undefined; - -describe("HFT_TextReranker pipeline output shape validation", () => { - it("throws KbRerankerOutputError when an entry lacks a numeric `score`", async () => { - // Single doc, single entry that's the wrong shape. 
- const fakePipeline = vi.fn().mockResolvedValue([{ foo: "bar" }]); - getPipelineMock.mockResolvedValueOnce(fakePipeline); - - const run = () => - HFT_TextReranker(makeInput(["doc-1"]), makeModel(), noopProgress, noopSignal); - - await expect(run()).rejects.toBeInstanceOf(KbRerankerOutputError); - await expect(run()).rejects.toMatchObject({ - message: expect.stringContaining("unexpected pipeline output shape"), - }); - await expect(run()).rejects.toMatchObject({ - message: expect.stringContaining(MODEL_PATH), - }); +/** + * The transformers.js text-classification pipeline returns one entry per + * input pair. With `top_k > 1` each entry is an array of `{ label, score }` + * objects; with `top_k = 1` it is the bare object. The reranker run-fn + * accepts both forms. These tests pin down the shape contract: anything + * else must fail loudly so a misconfigured model produces an actionable + * error instead of silently returning zero scores. + */ +describe("validateAndExtractRerankerScores (HFT_TextReranker shape guard)", () => { + it("throws KbRerankerOutputError when an entry lacks a numeric `score`", () => { + const run = () => validateAndExtractRerankerScores([{ foo: "bar" }], MODEL_PATH); + expect(run).toThrow(KbRerankerOutputError); + expect(run).toThrow(/unexpected pipeline output shape/); + expect(run).toThrow(new RegExp(MODEL_PATH)); }); - it("accepts both `{ label, score }` and `[{ label, score }]` entries and returns per-doc scores in input order", async () => { - // First entry is a bare object; second entry is a one-element array — the - // shape transformers.js emits when `top_k > 1`. Both must validate. 
- const fakePipeline = vi.fn().mockResolvedValue([ - { label: "LABEL_0", score: 0.42 }, - [{ label: "LABEL_0", score: 0.1 }], - ]); - getPipelineMock.mockResolvedValueOnce(fakePipeline); - - const result = await HFT_TextReranker( - makeInput(["doc-A", "doc-B"]), - makeModel(), - noopProgress, - noopSignal + it("accepts both `{ label, score }` and `[{ label, score }]` entries and returns scores in input order", () => { + // First entry is the bare object; second is a one-element array — the + // shape transformers.js emits when `top_k > 1`. Both must validate and + // produce per-document scores in the original input order. + const scores = validateAndExtractRerankerScores( + [ + { label: "LABEL_0", score: 0.42 }, + [{ label: "LABEL_0", score: 0.1 }], + ], + MODEL_PATH ); + expect(scores).toEqual([0.42, 0.1]); + }); - // Scores stay in the original input order so callers can zip them back to - // their candidate list. - expect(result.scores).toEqual([0.42, 0.1]); - // Indices are sorted best-first. - expect(result.indices).toEqual([0, 1]); + it("throws when one entry in a batch is valid and another is malformed", () => { + // A silent `?? 0` would have hidden this and returned a 0 score for the + // bad entry. The strict guard surfaces the misconfiguration immediately. + const run = () => + validateAndExtractRerankerScores( + [ + { label: "LABEL_0", score: 0.9 }, + { label: "LABEL_0" /* score missing */ }, + ], + MODEL_PATH + ); + expect(run).toThrow(KbRerankerOutputError); }); - it("throws when one entry in a batch is valid and another is malformed", async () => { - // The first doc returns a well-formed score; the second returns junk. - // A silent `?? 0` would have hidden this and returned a 0 score for doc-2. 
- const fakePipeline = vi.fn().mockResolvedValue([ - { label: "LABEL_0", score: 0.9 }, - { label: "LABEL_0" /* score missing */ }, - ]); - getPipelineMock.mockResolvedValueOnce(fakePipeline); + it("throws when the top-level value is not an array", () => { + // Defensive: if the pipeline returns a non-array (e.g. a tensor or null) + // we still error out rather than letting Array.prototype.map throw + // somewhere downstream with a less actionable message. + const run = () => validateAndExtractRerankerScores({ score: 0.5 }, MODEL_PATH); + expect(run).toThrow(KbRerankerOutputError); + }); - await expect( - HFT_TextReranker( - makeInput(["doc-good", "doc-bad"]), - makeModel(), - noopProgress, - noopSignal - ) - ).rejects.toBeInstanceOf(KbRerankerOutputError); + it("includes a truncated shape snippet on the error for diagnostics", () => { + try { + validateAndExtractRerankerScores([{ foo: "bar" }], MODEL_PATH); + throw new Error("expected throw"); + } catch (err) { + expect(err).toBeInstanceOf(KbRerankerOutputError); + const e = err as KbRerankerOutputError; + expect(typeof e.actualShape).toBe("string"); + expect(e.actualShape as string).toContain("foo"); + } }); }); diff --git a/providers/huggingface-transformers/src/ai/common/HFT_TextReranker.ts b/providers/huggingface-transformers/src/ai/common/HFT_TextReranker.ts index 9c187cb0a..3fb3c5245 100644 --- a/providers/huggingface-transformers/src/ai/common/HFT_TextReranker.ts +++ b/providers/huggingface-transformers/src/ai/common/HFT_TextReranker.ts @@ -29,6 +29,63 @@ function isScored(v: unknown): v is { label?: string; score: number } { ); } +/** + * Serialize an offending shape for the error payload. We cap the length to + * keep error messages bounded — a misconfigured model could otherwise dump + * unbounded data into logs. + */ +function truncateShape(value: unknown): string { + try { + const json = JSON.stringify(value); + if (typeof json !== "string") return String(value); + return json.length > 200 ? 
`${json.slice(0, 200)}…` : json; + } catch { + return String(value); + } +} + +/** + * Validate a transformers.js text-classification pipeline output and + * extract per-document scores. Each entry must be either a `{ score }` + * object or a non-empty array of such objects (transformers.js returns + * the array form when `top_k > 1`). + * + * Exported so the shape contract can be exercised in tests directly, + * without needing to spin up a real pipeline or mock the loader. Throws + * {@link KbRerankerOutputError} on mismatch — silently coercing to 0 + * would hide real model config bugs. + * + * @param rawResults Whatever the pipeline call returned. Typed as + * `unknown` so the caller is forced through this validation. + * @param modelPath Used in the error message to point operators at the + * misconfigured model. + */ +export function validateAndExtractRerankerScores( + rawResults: unknown, + modelPath: string | undefined +): number[] { + if (!Array.isArray(rawResults)) { + throw new KbRerankerOutputError( + `HFT_TextReranker: unexpected pipeline output shape for model ${modelPath}`, + truncateShape(rawResults) + ); + } + + const scores: number[] = new Array(rawResults.length); + for (let i = 0; i < rawResults.length; i++) { + const entry = rawResults[i]; + const candidate = Array.isArray(entry) ? entry[0] : entry; + if (!isScored(candidate)) { + throw new KbRerankerOutputError( + `HFT_TextReranker: unexpected pipeline output shape for model ${modelPath}`, + truncateShape(entry) + ); + } + scores[i] = candidate.score; + } + return scores; +} + /** * Cross-encoder reranker run-fn. Loads a `text-classification` pipeline * (the way transformers.js exposes cross-encoder models like @@ -38,9 +95,9 @@ function isScored(v: unknown): v is { label?: string; score: number } { * in the original input order so callers can join back to their candidate * list without re-sorting. 
* - * Each pipeline result entry is validated at runtime: either a `{ score }` - * object or a non-empty array of such objects (transformers.js returns the - * array form when `top_k > 1`). On mismatch we throw + * Each pipeline result entry is validated at runtime via + * {@link validateAndExtractRerankerScores}: either a `{ score }` object or + * a non-empty array of such objects. On mismatch we throw * {@link KbRerankerOutputError} with the model path and a truncated shape * snippet — silently coercing missing scores to 0 would hide real model * config bugs. @@ -63,31 +120,16 @@ export const HFT_TextReranker: AiProviderRunFn< // array of scored entries per pair when `top_k > 1`). const pairs = input.documents.map((doc) => ({ text: input.query, text_pair: doc })); - // Type as `unknown` so we are forced to validate. Do NOT use `as unknown as - // (...) => Promise<...>` here — that cast erases the typing safety net. + // Type the call's result as `unknown` so we are forced through the shape + // guard. Do NOT cast to a signature that returns a concrete result shape — + // that erases the typing safety net and was the original bug. const callable = reranker as unknown as ( inputs: ReadonlyArray<{ text: string; text_pair: string }>, options?: Record ) => Promise; const rawResults: unknown = await callable(pairs, { top_k: 1 }); - if (!Array.isArray(rawResults)) { - throw new KbRerankerOutputError( - `HFT_TextReranker: unexpected pipeline output shape for model ${modelPath}`, - truncateShape(rawResults) - ); - } - - const scores: number[] = rawResults.map((entry) => { - const candidate = Array.isArray(entry) ?
entry[0] : entry; - if (!isScored(candidate)) { - throw new KbRerankerOutputError( - `HFT_TextReranker: unexpected pipeline output shape for model ${modelPath}`, - truncateShape(entry) - ); - } - return candidate.score; - }); + const scores = validateAndExtractRerankerScores(rawResults, modelPath); const indices = scores .map((score, idx) => ({ score, idx })) @@ -99,18 +141,3 @@ export const HFT_TextReranker: AiProviderRunFn< logger.timeEnd(timerLabel, { docs: input.documents.length }); return { scores, indices: limited }; }; - -/** - * Serialize an offending shape for the error payload. We cap the length to - * keep error messages bounded — a misconfigured model could otherwise dump - * unbounded data into logs. - */ -function truncateShape(value: unknown): string { - try { - const json = JSON.stringify(value); - if (typeof json !== "string") return String(value); - return json.length > 200 ? `${json.slice(0, 200)}…` : json; - } catch { - return String(value); - } -} diff --git a/providers/huggingface-transformers/src/ai/runtime.ts b/providers/huggingface-transformers/src/ai/runtime.ts index e3e9fd40c..cbae6699b 100644 --- a/providers/huggingface-transformers/src/ai/runtime.ts +++ b/providers/huggingface-transformers/src/ai/runtime.ts @@ -17,6 +17,7 @@ export * from "./common/HFT_Constants"; export * from "./common/HFT_ModelSchema"; export * from "./common/HFT_OnnxDtypes"; export * from "./common/HFT_Pipeline"; +export * from "./common/HFT_TextReranker"; export * from "./common/HFT_ToolMarkup"; export * from "./registerHuggingFaceTransformersInline"; export * from "./registerHuggingFaceTransformersWorker";