diff --git a/packages/ai/src/task/ChunkRetrievalTask.ts b/packages/ai/src/task/ChunkRetrievalTask.ts index 15ad1896d..b458b3487 100644 --- a/packages/ai/src/task/ChunkRetrievalTask.ts +++ b/packages/ai/src/task/ChunkRetrievalTask.ts @@ -48,7 +48,9 @@ const inputSchema = { enum: ["similarity", "hybrid"], title: "Retrieval Method", description: - "Retrieval strategy: 'similarity' (vector only) or 'hybrid' (vector + full-text).", + "Retrieval strategy: 'similarity' (vector only, scores are cosine similarity in [0,1]) " + + "or 'hybrid' (vector + full-text fused via Reciprocal Rank Fusion; scores are RRF " + + "fusion scores, NOT comparable to cosine similarity).", default: "similarity", }, topK: { @@ -66,7 +68,10 @@ const inputSchema = { scoreThreshold: { type: "number", title: "Score Threshold", - description: "Minimum similarity score threshold (0-1)", + description: + "Minimum cosine similarity score threshold (0-1). Applies only to method='similarity'; " + + "ignored for method='hybrid' because RRF fusion scores are not comparable to cosine " + + "similarity. Use topK to size hybrid results instead.", minimum: 0, maximum: 1, default: 0, @@ -129,7 +134,19 @@ const outputSchema = { type: "array", items: { type: "number" }, title: "Scores", - description: "Similarity scores for each result", + description: + "Per-result scores. For method='similarity', these are cosine similarity scores in " + + "[0,1]. For method='hybrid', these are Reciprocal Rank Fusion scores — small positive " + + "numbers (typically <0.05) that rank results but do not correspond to a similarity.", + }, + scoreType: { + type: "string", + enum: ["cosine", "bm25", "rrf"], + title: "Score Type", + description: + "Discriminator naming the scorer used for `scores`: 'cosine' for similarity search " + + "and for hybrid fallback when the text query is empty/whitespace; 'rrf' for hybrid " + + "fusion. ('bm25' is reserved for direct text search and is not produced by this task.)", }, vectors: { type: "array", @@ -157,7 +174,7 @@ const outputSchema = { description: "The query used for retrieval (pass-through)", }, }, - required: ["chunks", "chunk_ids", "metadata", "scores", "count", "query"], + required: ["chunks", "chunk_ids", "metadata", "scores", "scoreType", "count", "query"], additionalProperties: false, } as const satisfies DataPortSchema; @@ -216,7 +233,9 @@ export class ChunkRetrievalTask extends Task< } if (method === "hybrid" && !kb.supportsHybridSearch()) { throw new Error( - "The provided knowledge base does not support hybrid search. Use method: 'similarity' or a backend with hybrid support (e.g., Postgres with pgvector)." + "Hybrid retrieval requires a text index installed on the knowledge base. " + + "Install one via `kb.installTextIndex(new BM25Index())` or pass " + + "`textIndex` to `createKnowledgeBase`. Otherwise use method: 'similarity'." ); } @@ -250,7 +269,6 @@ export class ChunkRetrievalTask extends Task< textQuery: queryText!, topK, filter, - scoreThreshold, vectorWeight, }) : await kb.similaritySearch(searchVector, { @@ -264,11 +282,21 @@ export class ChunkRetrievalTask extends Task< return meta.text || JSON.stringify(meta); }); + // The KB tags every result with the same scoreType; the empty-textQuery + // fallback inside hybridSearch flips this from "rrf" to "cosine", and we + // want to surface that to callers even when the result set is empty. 
+ const hybridFallsBackToCosine = + method === "hybrid" && (queryText === undefined || queryText.trim().length === 0); + const defaultScoreType: "cosine" | "bm25" | "rrf" = + method === "hybrid" && !hybridFallsBackToCosine ? "rrf" : "cosine"; + const scoreType = results.length > 0 ? (results[0].scoreType ?? defaultScoreType) : defaultScoreType; + const output: ChunkRetrievalTaskOutput = { chunks, chunk_ids: results.map((r) => r.chunk_id), metadata: results.map((r) => r.metadata), scores: results.map((r) => r.score), + scoreType, count: results.length, query, }; diff --git a/packages/indexeddb/src/storage/IndexedDbVectorStorage.ts b/packages/indexeddb/src/storage/IndexedDbVectorStorage.ts index 448ff5308..109f9d699 100644 --- a/packages/indexeddb/src/storage/IndexedDbVectorStorage.ts +++ b/packages/indexeddb/src/storage/IndexedDbVectorStorage.ts @@ -18,7 +18,6 @@ import { getMetadataProperty, getVectorProperty } from "@workglow/storage"; import type { ClientProvidedKeysOption, AnyVectorStorage, - HybridSearchOptions, IVectorStorage, VectorSearchOptions, } from "@workglow/storage"; @@ -40,25 +39,6 @@ function matchesFilter(metadata: Metadata, filter: Partial): return true; } -/** - * Simple full-text search scoring (keyword matching) - */ -function textRelevance(text: string, query: string): number { - const textLower = text.toLowerCase(); - const queryLower = query.toLowerCase(); - const queryWords = queryLower.split(/\s+/).filter((w) => w.length > 0); - if (queryWords.length === 0) { - return 0; - } - let matches = 0; - for (const word of queryWords) { - if (textLower.includes(word)) { - matches++; - } - } - return matches / queryWords.length; -} - /** * IndexedDB vector storage implementation. * Extends IndexedDbTabularStorage for storage. @@ -168,55 +148,4 @@ export class IndexedDbVectorStorage< return topResults; } - - async hybridSearch(query: TypedArray, options: HybridSearchOptions>) { - const { topK = 10, filter, scoreThreshold = 0, textQuery, vectorWeight = 0.7 } = options; - - if (!textQuery || textQuery.trim().length === 0) { - // Fall back to regular vector search if no text query - return this.similaritySearch(query, { topK, filter, scoreThreshold }); - } - - const results: Array = []; - const allEntities = (await this.getAll()) || []; - - for (const entity of allEntities) { - // IndexedDB stores TypedArrays natively via structured clone (no deserialization needed) - const vector = entity[this.vectorPropertyName] as TypedArray; - const metadata = this.metadataPropertyName - ? 
(entity[this.metadataPropertyName] as Metadata) - : ({} as Metadata); - - // Apply filter if provided - if (filter && !matchesFilter(metadata, filter)) { - continue; - } - - // Calculate vector similarity - const vectorScore = cosineSimilarity(query, vector); - - // Calculate text relevance (simple keyword matching) - const metadataText = Object.values(metadata).join(" ").toLowerCase(); - const textScore = textRelevance(metadataText, textQuery); - - // Combine scores - const combinedScore = vectorWeight * vectorScore + (1 - vectorWeight) * textScore; - - // Apply threshold - if (combinedScore < scoreThreshold) { - continue; - } - - results.push({ - ...entity, - score: combinedScore, - } as Entity & { score: number }); - } - - // Sort by combined score descending and take top K - results.sort((a, b) => b.score - a.score); - const topResults = results.slice(0, topK); - - return topResults; - } } diff --git a/packages/knowledge-base/src/chunk/ChunkVectorStorageSchema.ts b/packages/knowledge-base/src/chunk/ChunkVectorStorageSchema.ts index 48fe8f7eb..ba20e5989 100644 --- a/packages/knowledge-base/src/chunk/ChunkVectorStorageSchema.ts +++ b/packages/knowledge-base/src/chunk/ChunkVectorStorageSchema.ts @@ -60,7 +60,26 @@ export type ChunkVectorStorage = IVectorStorage< ChunkVectorPrimaryKey >; +/** + * Discriminator for the scoring function used to produce a + * {@link ChunkSearchResult.score}. Callers (typically UI) use this to render + * the score appropriately, since the three scorers live on different scales: + * + * - `"cosine"`: cosine similarity in `[-1, 1]`, typically `[0, 1]` for text + * embeddings. Absolute — higher means more similar. + * - `"bm25"`: BM25(F) score in `[0, ∞)`. Absolute but corpus-dependent — not + * comparable across knowledge bases. + * - `"rrf"`: Reciprocal Rank Fusion score, bounded above by + * `2 / (rrfK + 1)` (~`0.033` with the default `rrfK=60`). Rank-based, not + * absolute — the magnitude is not a similarity, only an ordering signal. + * Not comparable across queries. + */ +export type ScoreType = "cosine" | "bm25" | "rrf"; + /** * Search result with score */ -export type ChunkSearchResult = ChunkVectorEntity & { score: number }; +export type ChunkSearchResult = ChunkVectorEntity & { + score: number; + scoreType?: ScoreType; +}; diff --git a/packages/knowledge-base/src/knowledge-base/KnowledgeBase.ts b/packages/knowledge-base/src/knowledge-base/KnowledgeBase.ts index 36f7d2397..dbd8409ab 100644 --- a/packages/knowledge-base/src/knowledge-base/KnowledgeBase.ts +++ b/packages/knowledge-base/src/knowledge-base/KnowledgeBase.ts @@ -4,7 +4,7 @@ * SPDX-License-Identifier: Apache-2.0 */ -import type { HybridSearchOptions, VectorSearchOptions } from "@workglow/storage"; +import type { ITextIndex, TextFields, VectorSearchOptions } from "@workglow/storage"; import type { TypedArray } from "@workglow/util/schema"; import type { ChunkRecord } from "../chunk/ChunkSchema"; import type { @@ -33,6 +33,91 @@ export interface ISearchOptions { readonly scoreThreshold?: number; } +/** + * Options for {@link KnowledgeBase.hybridSearch}. The fusion is performed at + * the KB layer (Reciprocal Rank Fusion) over the vector storage's + * `similaritySearch` and the installed {@link ITextIndex}'s `search`. + * + * `vectorWeight` controls per-ranker influence in the fused ranking: + * the vector ranker contributes `vectorWeight / (rrfK + rank)` and the text + * ranker contributes `(1 - vectorWeight) / (rrfK + rank)`. Defaults to 0.7. 
+ * + * `scoreThreshold` is intentionally not honoured here: RRF scores are not + * comparable to cosine scores, so a single threshold knob would be + * misleading. Use `topK` to cap result size instead. + */ +export interface HybridSearchOptions< + Metadata extends Record | undefined = Record, +> { + readonly textQuery: string; + readonly topK?: number; + readonly filter?: Partial; + readonly vectorWeight?: number; + /** RRF saturation constant; standard value is 60. */ + readonly rrfK?: number; + /** + * Per-ranker over-fetch multiplier. Each ranker fetches `topK * + * candidatePoolMultiplier` candidates so RRF has overlap to fuse on. + * Defaults to 5; lower values reduce overlap and degenerate RRF toward + * "OR of top-K", higher values cost more I/O. + */ + readonly candidatePoolMultiplier?: number; +} + +/** + * Options for {@link KnowledgeBase.textSearch}. + */ +export interface TextOnlySearchOptions { + readonly topK?: number; + readonly filter?: Partial; +} + +/** + * Fields on a {@link ChunkRecord} that the text index reads. Kept here (rather + * than inside `BM25Index`) because the mapping is a {@link KnowledgeBase} + * concern: the index doesn't know about chunk shape. + */ +const TEXT_INDEXABLE_FIELDS = [ + "text", + "doc_title", + "sectionTitles", + "summary", + "parentSummaries", +] as const; + +function chunkTextFields(metadata: ChunkRecord | undefined): TextFields | undefined { + if (!metadata) return undefined; + const fields: Record = {}; + let any = false; + for (const key of TEXT_INDEXABLE_FIELDS) { + const value = (metadata as Record)[key]; + if (typeof value === "string" && value.length > 0) { + fields[key] = value; + any = true; + } else if (Array.isArray(value) && value.length > 0) { + const filtered = (value as unknown[]).filter( + (v): v is string => typeof v === "string" && v.length > 0 + ); + if (filtered.length > 0) { + fields[key] = filtered; + any = true; + } + } + } + return any ? fields : undefined; +} + +function matchesFilter>( + metadata: T, + filter: Partial | undefined +): boolean { + if (!filter) return true; + for (const [k, v] of Object.entries(filter)) { + if ((metadata as Record)[k] !== v) return false; + } + return true; +} + /** * Callback invoked after a document is upserted. * Receives the KB instance and the upserted document. @@ -61,6 +146,13 @@ export interface KnowledgeBaseOptions { readonly onDocumentUpsert?: OnDocumentUpsertCallback; readonly onDocumentDelete?: OnDocumentDeleteCallback; readonly onSearch?: OnSearchCallback; + /** + * Optional text index. When installed, chunk upserts auto-write to it and + * {@link KnowledgeBase.hybridSearch} fuses vector + text rankings via RRF. + * Equivalent to constructing the KB and calling + * {@link KnowledgeBase.installTextIndex} after. + */ + readonly textIndex?: ITextIndex; } /** @@ -91,6 +183,14 @@ export class KnowledgeBase { */ onSearch: OnSearchCallback | undefined; + /** + * Optional full-text index. When installed, chunk upserts auto-write to it + * (when the chunk has any indexable text field) and {@link hybridSearch} + * becomes available. Install via the constructor option or + * {@link installTextIndex}. 
+ */ + private textIndex: ITextIndex | undefined; + constructor( name: string, documentStorage: DocumentTabularStorage, @@ -107,9 +207,53 @@ export class KnowledgeBase { this.onDocumentUpsert = options.onDocumentUpsert; this.onDocumentDelete = options.onDocumentDelete; this.onSearch = options.onSearch; + if (options.textIndex) { + this.textIndex = options.textIndex; + } } } + /** + * Install (or replace) the full-text index used by {@link hybridSearch} and + * {@link textSearch}. Subsequent {@link upsertChunk} / {@link upsertChunksBulk} + * calls auto-write to the index. Existing chunks are *not* back-indexed — + * call {@link reindexText} after installing if the chunk store already has + * data. + */ + installTextIndex(index: ITextIndex): void { + this.textIndex = index; + } + + /** + * Get the installed text index, if any. Returns `undefined` when no index + * has been installed. + */ + getTextIndex(): ITextIndex | undefined { + return this.textIndex; + } + + /** + * Rebuild the installed text index from the current chunk storage. Use + * after {@link installTextIndex} on a KB that already has chunks, or after + * a tokenizer / field-weight configuration change. + * + * Atomic with respect to async failures: chunks are read and tokenisation + * is staged before the index is mutated. If `chunkStorage.getAll()` throws, + * the existing index is untouched. + */ + async reindexText(): Promise { + const index = this.textIndex; + if (!index) return; + const all = ((await this.chunkStorage.getAll()) ?? []) as ChunkVectorEntity[]; + const writes: Array<{ chunkId: string; docId: string; fields: TextFields }> = []; + for (const entity of all) { + const fields = chunkTextFields(entity.metadata); + if (fields) writes.push({ chunkId: entity.chunk_id, docId: entity.doc_id, fields }); + } + index.clear(); + for (const w of writes) index.add(w.chunkId, w.docId, w.fields); + } + // =========================================================================== // Document CRUD // =========================================================================== @@ -267,7 +411,19 @@ export class KnowledgeBase { `Vector dimension mismatch: expected ${expected}, got ${chunk.vector.length}.` ); } - return this.chunkStorage.put(chunk); + const stored = await this.chunkStorage.put(chunk); + if (this.textIndex) { + const fields = chunkTextFields(stored.metadata); + if (fields) { + this.textIndex.add(stored.chunk_id, stored.doc_id, fields); + } else { + // The chunk has no indexable text — drop any stale postings from a + // prior version where the text was non-empty. Required for upsert + // correctness when text is cleared on update. 
+ this.textIndex.remove(stored.chunk_id); + } + } + return stored; } /** @@ -284,7 +440,18 @@ export class KnowledgeBase { } } } - return this.chunkStorage.putBulk(chunks); + const stored = await this.chunkStorage.putBulk(chunks); + if (this.textIndex) { + for (const entity of stored) { + const fields = chunkTextFields(entity.metadata); + if (fields) { + this.textIndex.add(entity.chunk_id, entity.doc_id, fields); + } else { + this.textIndex.remove(entity.chunk_id); + } + } + } + return stored; } /** @@ -292,6 +459,7 @@ export class KnowledgeBase { */ async deleteChunksForDocument(doc_id: string): Promise { await this.chunkStorage.deleteSearch({ doc_id }); + this.textIndex?.removeByDocument(doc_id); } /** @@ -316,33 +484,169 @@ export class KnowledgeBase { query: TypedArray, options?: VectorSearchOptions ): Promise { - return this.chunkStorage.similaritySearch(query, options); + const raw = await this.chunkStorage.similaritySearch(query, options); + return raw.map((r) => ({ ...r, scoreType: "cosine" }) as ChunkSearchResult); } /** - * Hybrid search combining vector similarity and full-text search. Canonical - * scope-aware entry point; subclasses override for filter injection. + * Hybrid search combining vector similarity and full-text BM25(F) ranking. + * The two rankers run in parallel, then their per-rank contributions are + * fused with Reciprocal Rank Fusion: + * + * ``` + * fused(d) = vectorWeight / (rrfK + rank_v(d)) + * + (1 - vectorWeight) / (rrfK + rank_t(d)) + * ``` + * + * RRF rewards items that appear in both rankings. Score values are *not* + * comparable to cosine scores — use `topK` to size the result, not a score + * threshold. * - * @throws Error if the configured storage backend does not support hybrid search. + * Canonical scope-aware entry point; subclasses override for filter injection. + * + * @throws Error if no text index is installed (call {@link installTextIndex} first). */ async hybridSearch( query: TypedArray, options: HybridSearchOptions ): Promise { - if (typeof this.chunkStorage.hybridSearch !== "function") { + const index = this.textIndex; + if (!index) { + throw new Error( + "Hybrid search requires a text index. Install one via " + + "`kb.installTextIndex(new BM25Index())` or pass `textIndex` to " + + "`createKnowledgeBase`." + ); + } + const { + textQuery, + topK = 10, + filter, + vectorWeight = 0.7, + rrfK = 60, + candidatePoolMultiplier = 5, + } = options; + + // Empty / whitespace-only textQuery has no signal for the BM25 ranker. + // Returning RRF-shaped scores in that case would surprise callers, so + // delegate to the cosine similarity path and return cosine scores. + if (!textQuery || textQuery.trim().length === 0) { + return this.similaritySearch(query, { topK, filter }); + } + + const safeRrfK = Math.max(0, rrfK); + const safePoolMultiplier = Math.max(1, candidatePoolMultiplier); + const poolSize = Math.max(topK, Math.ceil(topK * safePoolMultiplier)); + + const [vectorResults, textResults] = await Promise.all([ + this.similaritySearch(query, { topK: poolSize, filter }), + Promise.resolve(index.search(textQuery, { topK: poolSize })), + ]); + + const vectorWeightClamped = Math.max(0, Math.min(1, vectorWeight)); + const textWeight = 1 - vectorWeightClamped; + + const fused = new Map(); + + vectorResults.forEach((entity, rank) => { + const contribution = vectorWeightClamped / (safeRrfK + rank + 1); + // Strip the cosine scoreType from the wrapped similarity result; the + // outer fused entity will carry "rrf" once we re-emit it. 
+ const { scoreType: _drop, ...rest } = entity; + fused.set(entity.chunk_id, { + score: contribution, + entity: rest as ChunkSearchResult, + }); + }); + + for (let rank = 0; rank < textResults.length; rank++) { + const { chunkId } = textResults[rank]; + const contribution = textWeight / (safeRrfK + rank + 1); + const existing = fused.get(chunkId); + if (existing) { + existing.score += contribution; + } else { + fused.set(chunkId, { score: contribution, entity: undefined }); + } + } + + const missing = Array.from(fused.entries()) + .filter(([, v]) => v.entity === undefined) + .map(([chunkId]) => chunkId); + + if (missing.length > 0) { + const hydrated = await Promise.all( + missing.map((chunk_id) => this.chunkStorage.get({ chunk_id })) + ); + for (let i = 0; i < missing.length; i++) { + const entity = hydrated[i] as ChunkVectorEntity | undefined; + const slot = fused.get(missing[i])!; + if (!entity) { + fused.delete(missing[i]); + continue; + } + if (filter && !matchesFilter(entity.metadata as ChunkRecord, filter)) { + fused.delete(missing[i]); + continue; + } + slot.entity = { ...entity, score: 0 }; + } + } + + const ranked: ChunkSearchResult[] = []; + for (const { score, entity } of fused.values()) { + if (!entity) continue; + ranked.push({ ...entity, score, scoreType: "rrf" }); + } + ranked.sort((a, b) => b.score - a.score); + return ranked.slice(0, topK); + } + + /** + * Pure full-text search via the installed text index. Hydrates ranked + * chunkIds from chunk storage and applies optional metadata filtering + * post-hoc. + * + * @throws Error if no text index is installed. + */ + async textSearch( + query: string, + options: TextOnlySearchOptions = {} + ): Promise { + const index = this.textIndex; + if (!index) { throw new Error( - "Hybrid search is not supported by the configured chunk storage backend. " + - "Please use a vector storage implementation that provides `hybridSearch`." + "Text search requires a text index. Install one via " + + "`kb.installTextIndex(new BM25Index())` or pass `textIndex` to " + + "`createKnowledgeBase`." ); } - return this.chunkStorage.hybridSearch(query, options); + const { topK = 10, filter } = options; + const poolSize = filter ? Math.max(topK * 2, topK) : topK; + const hits = index.search(query, { topK: poolSize }); + if (hits.length === 0) return []; + + const hydrated = await Promise.all( + hits.map((h) => this.chunkStorage.get({ chunk_id: h.chunkId })) + ); + + const results: ChunkSearchResult[] = []; + for (let i = 0; i < hits.length; i++) { + const entity = hydrated[i] as ChunkVectorEntity | undefined; + if (!entity) continue; + if (filter && !matchesFilter(entity.metadata as ChunkRecord, filter)) continue; + results.push({ ...entity, score: hits[i].score, scoreType: "bm25" }); + if (results.length >= topK) break; + } + return results; } /** - * Check if the configured storage backend supports hybrid search. + * Whether {@link hybridSearch} is available — i.e. a text index has been + * installed. */ supportsHybridSearch(): boolean { - return typeof this.chunkStorage.hybridSearch === "function"; + return this.textIndex !== undefined; } /** @@ -437,17 +741,19 @@ export class KnowledgeBase { } /** - * Store a single chunk (alias for upsertChunk) + * Store a single chunk (alias for {@link upsertChunk}). Goes through the + * full upsert path so the text index is kept in sync. 
*/ async put(chunk: InsertChunkVectorEntity): Promise { - return this.chunkStorage.put(chunk); + return this.upsertChunk(chunk); } /** - * Store multiple chunks (alias for upsertChunksBulk) + * Store multiple chunks (alias for {@link upsertChunksBulk}). Goes through + * the full upsert path so the text index is kept in sync. */ async putBulk(chunks: InsertChunkVectorEntity[]): Promise { - return this.chunkStorage.putBulk(chunks); + return this.upsertChunksBulk(chunks); } /** @@ -468,7 +774,8 @@ export class KnowledgeBase { * Clear all chunks */ async clearChunks(): Promise { - return this.chunkStorage.deleteAll(); + await this.chunkStorage.deleteAll(); + this.textIndex?.clear(); } /** diff --git a/packages/knowledge-base/src/knowledge-base/ScopedVectorStorage.ts b/packages/knowledge-base/src/knowledge-base/ScopedVectorStorage.ts index 697b430fe..965e00ce4 100644 --- a/packages/knowledge-base/src/knowledge-base/ScopedVectorStorage.ts +++ b/packages/knowledge-base/src/knowledge-base/ScopedVectorStorage.ts @@ -6,7 +6,6 @@ import type { AnyVectorStorage, - HybridSearchOptions, IVectorStorage, VectorSearchOptions, } from "@workglow/storage"; @@ -77,23 +76,4 @@ export class ScopedVectorStorage< return this.filterAndStrip(results, options?.topK, overfetchLimit); } - - async hybridSearch( - query: TypedArray, - options: HybridSearchOptions - ): Promise<(Entity & { score: number })[]> { - if (typeof this.inner.hybridSearch !== "function") { - throw new Error( - "Hybrid search is not supported by the configured chunk storage backend. " + - "Please use a vector storage implementation that provides `hybridSearch`." - ); - } - const overfetchLimit = options?.topK ? options.topK * this.overFetchMultiplier : undefined; - const results = await this.inner.hybridSearch(query, { - ...options, - topK: overfetchLimit, - } as any); - - return this.filterAndStrip(results, options?.topK, overfetchLimit); - } } diff --git a/packages/knowledge-base/src/knowledge-base/createKnowledgeBase.ts b/packages/knowledge-base/src/knowledge-base/createKnowledgeBase.ts index 4c950695a..902ed30e6 100644 --- a/packages/knowledge-base/src/knowledge-base/createKnowledgeBase.ts +++ b/packages/knowledge-base/src/knowledge-base/createKnowledgeBase.ts @@ -4,6 +4,7 @@ * SPDX-License-Identifier: Apache-2.0 */ +import type { ITextIndex } from "@workglow/storage"; import { InMemoryTabularStorage, InMemoryVectorStorage } from "@workglow/storage"; import type { TypedArrayConstructor } from "@workglow/util/schema"; import type { ChunkVectorStorage } from "../chunk/ChunkVectorStorageSchema"; @@ -28,6 +29,12 @@ export interface CreateKnowledgeBaseOptions { readonly onDocumentUpsert?: OnDocumentUpsertCallback; readonly onDocumentDelete?: OnDocumentDeleteCallback; readonly onSearch?: OnSearchCallback; + /** + * Optional full-text index. When provided, the KB enables + * {@link KnowledgeBase.hybridSearch} and auto-writes chunks to the index + * on upsert. + */ + readonly textIndex?: ITextIndex; } /** @@ -54,6 +61,7 @@ export async function createKnowledgeBase( onDocumentUpsert, onDocumentDelete, onSearch, + textIndex, } = options; const vectorCtor = vectorCtorOption ?? 
Float32Array; @@ -85,7 +93,7 @@ export async function createKnowledgeBase( name, tabularStorage as unknown as DocumentTabularStorage, vectorStorage as unknown as ChunkVectorStorage, - { title, description, onDocumentUpsert, onDocumentDelete, onSearch } + { title, description, onDocumentUpsert, onDocumentDelete, onSearch, textIndex } ); if (shouldRegister) { diff --git a/packages/storage/src/common.ts b/packages/storage/src/common.ts index da102cd23..4f2180182 100644 --- a/packages/storage/src/common.ts +++ b/packages/storage/src/common.ts @@ -52,5 +52,7 @@ export * from "./vector/InMemoryVectorStorage"; export * from "./vector/IVectorStorage"; export * from "./vector/TelemetryVectorStorage"; +export * from "./text/index"; + export * from "./credentials/EncryptedKvCredentialStore"; export * from "./credentials/LazyEncryptedCredentialStore"; diff --git a/packages/storage/src/text/BM25Index.ts b/packages/storage/src/text/BM25Index.ts new file mode 100644 index 000000000..f125be999 --- /dev/null +++ b/packages/storage/src/text/BM25Index.ts @@ -0,0 +1,405 @@ +/** + * @license + * Copyright 2026 Steven Roussey + * SPDX-License-Identifier: Apache-2.0 + */ + +import type { ITextIndex, TextFields, TextSearchOptions, TextSearchResult } from "./ITextIndex"; +import type { Tokenizer } from "./Tokenizer"; +import { createDefaultTokenizer } from "./Tokenizer"; + +/** + * Default per-field weights for {@link BM25Index} when indexing + * {@link ChunkRecord}-shaped data. Tuned for hierarchical chunks — title and + * section headings are heavier than the body to reflect their navigational + * value, summaries are slightly above body, parent summaries are de-weighted + * because they are inherited context rather than direct content. + */ +export const DEFAULT_CHUNK_FIELD_WEIGHTS: Readonly> = { + doc_title: 3, + sectionTitles: 2, + summary: 1.5, + text: 1, + parentSummaries: 0.5, +}; + +export interface BM25IndexOptions { + /** Tokenizer used at index- and query-time. Defaults to {@link createDefaultTokenizer}. */ + readonly tokenizer?: Tokenizer; + /** + * Per-field weight map. Fields not in this map are ignored at index time. + * Defaults to {@link DEFAULT_CHUNK_FIELD_WEIGHTS}. + */ + readonly fieldWeights?: Readonly>; + /** BM25 term-saturation parameter. Typical values 1.2–2.0. */ + readonly k1?: number; + /** BM25 length-normalisation parameter. Typical values 0.5–0.75. */ + readonly b?: number; +} + +interface Posting { + readonly chunkId: string; + readonly tf: number; +} + +interface SerialisedBM25State { + readonly version: 1; + readonly k1: number; + readonly b: number; + readonly fieldWeights: Record; + readonly fieldStats: Record; + readonly postings: Record>>; + readonly docLengths: Record>; + readonly chunkToDoc: Record; +} + +/** + * In-memory BM25F text index. State is JSON-serialisable via + * {@link toJSON} / {@link fromJSON}. The serialised state captures all + * scoring inputs — `k1`, `b`, `fieldWeights`, postings, and per-document + * stats — so a `fromJSON` round-trip reproduces search results exactly. The + * tokenizer is *not* serialised; callers must restore an index configured + * with the same tokenizer that produced the state, otherwise query + * tokenisation will diverge from indexed terms. 
+ * + * Scoring formula (Lucene-style BM25F): + * + * ``` + * idf(t) = ln(1 + (N - df(t) + 0.5) / (df(t) + 0.5)) + * tilde_tf(t,d) = sum_f weight(f) * tf(t,d,f) / (1 - b + b * len_f(d) / avgLen_f) + * score(t,d) = idf(t) * tilde_tf(t,d) / (k1 + tilde_tf(t,d)) + * ``` + * + * Document frequency `df(t)` is the number of distinct chunks containing `t` + * in *any* field. `N` is the total number of chunks. `tf(t,d,f)` is raw + * term frequency in field `f` of chunk `d`. `len_f(d)` is the field length + * (token count) for that chunk, and `avgLen_f` is the average length over + * chunks that have field `f` populated. + */ +export class BM25Index implements ITextIndex { + private readonly tokenizer: Tokenizer; + private fieldWeights: Record; + private k1: number; + private b: number; + + // term -> field -> postings (one entry per chunk that has the term in the field). + private postings = new Map>(); + + // chunkId -> field -> length (token count, post-tokenisation). + private docLengths = new Map>(); + + // field -> { docCount, totalLength } — running stats over chunks that have the field populated. + private fieldStats = new Map(); + + // chunkId -> docId, for removeByDocument cascades. + private chunkToDoc = new Map(); + + // docId -> Set, for fast cascade deletes. + private docToChunks = new Map>(); + + // chunkId -> term -> Set — reverse index used for surgical removal + // without scanning the whole vocabulary on every delete. + private chunkPostings = new Map>>(); + + // term -> document frequency (number of distinct chunks containing the term + // in any field). Maintained incrementally on add/remove so search() doesn't + // have to walk every posting list per query term. + private termDf = new Map(); + + constructor(options: BM25IndexOptions = {}) { + this.tokenizer = options.tokenizer ?? createDefaultTokenizer(); + this.fieldWeights = { ...(options.fieldWeights ?? DEFAULT_CHUNK_FIELD_WEIGHTS) }; + this.k1 = options.k1 ?? 1.2; + this.b = options.b ?? 0.75; + } + + size(): number { + return this.docLengths.size; + } + + add(chunkId: string, docId: string, fields: TextFields): void { + if (this.docLengths.has(chunkId)) { + this.remove(chunkId); + } + + const perField = new Map(); + const termIndex = new Map>(); + let anyTokens = false; + + for (const [field, weight] of Object.entries(this.fieldWeights)) { + if (weight <= 0) continue; + const raw = fields[field]; + if (raw === undefined) continue; + const text = Array.isArray(raw) ? raw.join(" ") : (raw as string); + const tokens = this.tokenizer.tokenize(text); + if (tokens.length === 0) continue; + + anyTokens = true; + perField.set(field, tokens.length); + + const tfMap = new Map(); + for (const term of tokens) { + tfMap.set(term, (tfMap.get(term) ?? 0) + 1); + } + for (const [term, tf] of tfMap) { + let byField = this.postings.get(term); + if (!byField) { + byField = new Map(); + this.postings.set(term, byField); + } + let list = byField.get(field); + if (!list) { + list = []; + byField.set(field, list); + } + list.push({ chunkId, tf }); + + let fieldsForTerm = termIndex.get(term); + if (!fieldsForTerm) { + fieldsForTerm = new Set(); + termIndex.set(term, fieldsForTerm); + } + fieldsForTerm.add(field); + } + } + + if (!anyTokens) return; + + for (const [field, len] of perField) { + const stat = this.fieldStats.get(field) ?? 
{ docCount: 0, totalLength: 0 }; + stat.docCount += 1; + stat.totalLength += len; + this.fieldStats.set(field, stat); + } + + this.docLengths.set(chunkId, perField); + this.chunkToDoc.set(chunkId, docId); + this.chunkPostings.set(chunkId, termIndex); + for (const term of termIndex.keys()) { + this.termDf.set(term, (this.termDf.get(term) ?? 0) + 1); + } + let bucket = this.docToChunks.get(docId); + if (!bucket) { + bucket = new Set(); + this.docToChunks.set(docId, bucket); + } + bucket.add(chunkId); + } + + remove(chunkId: string): void { + const fieldLengths = this.docLengths.get(chunkId); + if (!fieldLengths) return; + + for (const [field, len] of fieldLengths) { + const stat = this.fieldStats.get(field); + if (stat) { + stat.docCount -= 1; + stat.totalLength -= len; + if (stat.docCount <= 0) { + this.fieldStats.delete(field); + } else { + this.fieldStats.set(field, stat); + } + } + } + + const termIndex = this.chunkPostings.get(chunkId); + if (termIndex) { + for (const [term, fields] of termIndex) { + const byField = this.postings.get(term); + if (!byField) continue; + for (const field of fields) { + const list = byField.get(field); + if (!list) continue; + const filtered = list.filter((p) => p.chunkId !== chunkId); + if (filtered.length === 0) { + byField.delete(field); + } else if (filtered.length !== list.length) { + byField.set(field, filtered); + } + } + if (byField.size === 0) { + this.postings.delete(term); + } + const newDf = (this.termDf.get(term) ?? 0) - 1; + if (newDf <= 0) { + this.termDf.delete(term); + } else { + this.termDf.set(term, newDf); + } + } + } + + this.chunkPostings.delete(chunkId); + this.docLengths.delete(chunkId); + + const docId = this.chunkToDoc.get(chunkId); + this.chunkToDoc.delete(chunkId); + if (docId !== undefined) { + const bucket = this.docToChunks.get(docId); + if (bucket) { + bucket.delete(chunkId); + if (bucket.size === 0) this.docToChunks.delete(docId); + } + } + } + + removeByDocument(docId: string): void { + const bucket = this.docToChunks.get(docId); + if (!bucket) return; + for (const chunkId of [...bucket]) { + this.remove(chunkId); + } + } + + clear(): void { + this.postings.clear(); + this.docLengths.clear(); + this.fieldStats.clear(); + this.chunkToDoc.clear(); + this.docToChunks.clear(); + this.chunkPostings.clear(); + this.termDf.clear(); + } + + search(query: string, options: TextSearchOptions = {}): TextSearchResult[] { + const topK = options.topK ?? 10; + const totalDocs = this.docLengths.size; + if (totalDocs === 0) return []; + + const queryTerms = this.tokenizer.tokenize(query); + if (queryTerms.length === 0) return []; + + const uniqueTerms = Array.from(new Set(queryTerms)); + const scores = new Map(); + + for (const term of uniqueTerms) { + const byField = this.postings.get(term); + if (!byField) continue; + + const df = this.termDf.get(term) ?? 0; + if (df === 0) continue; + const idf = Math.log(1 + (totalDocs - df + 0.5) / (df + 0.5)); + if (idf <= 0) continue; + + const pseudoTfByChunk = new Map(); + for (const [field, list] of byField) { + const weight = this.fieldWeights[field] ?? 0; + if (weight <= 0) continue; + const stat = this.fieldStats.get(field); + if (!stat || stat.docCount === 0) continue; + const avgLen = stat.totalLength / stat.docCount; + + for (const { chunkId, tf } of list) { + const fieldLen = this.docLengths.get(chunkId)?.get(field) ?? 0; + const norm = avgLen > 0 ? 
1 - this.b + (this.b * fieldLen) / avgLen : 1; + const contribution = (weight * tf) / (norm || 1); + pseudoTfByChunk.set(chunkId, (pseudoTfByChunk.get(chunkId) ?? 0) + contribution); + } + } + + for (const [chunkId, pseudoTf] of pseudoTfByChunk) { + const termScore = (idf * pseudoTf) / (this.k1 + pseudoTf); + scores.set(chunkId, (scores.get(chunkId) ?? 0) + termScore); + } + } + + const ranked = Array.from(scores, ([chunkId, score]) => ({ chunkId, score })); + ranked.sort((a, b) => b.score - a.score); + return ranked.slice(0, topK); + } + + toJSON(): SerialisedBM25State { + const postingsObj: SerialisedBM25State["postings"] = {}; + for (const [term, byField] of this.postings) { + const fieldsObj: Record> = {}; + for (const [field, list] of byField) { + fieldsObj[field] = list.map((p) => ({ chunkId: p.chunkId, tf: p.tf })); + } + postingsObj[term] = fieldsObj; + } + + const fieldStatsObj: SerialisedBM25State["fieldStats"] = {}; + for (const [field, stat] of this.fieldStats) { + fieldStatsObj[field] = { docCount: stat.docCount, totalLength: stat.totalLength }; + } + + const docLengthsObj: SerialisedBM25State["docLengths"] = {}; + for (const [chunkId, perField] of this.docLengths) { + docLengthsObj[chunkId] = Object.fromEntries(perField); + } + + const chunkToDocObj: Record = {}; + for (const [chunkId, docId] of this.chunkToDoc) { + chunkToDocObj[chunkId] = docId; + } + + return { + version: 1, + k1: this.k1, + b: this.b, + fieldWeights: { ...this.fieldWeights }, + fieldStats: fieldStatsObj, + postings: postingsObj, + docLengths: docLengthsObj, + chunkToDoc: chunkToDocObj, + }; + } + + fromJSON(state: unknown): void { + const s = state as SerialisedBM25State; + if (!s || typeof s !== "object" || s.version !== 1) { + throw new Error("BM25Index.fromJSON: unsupported or missing state version"); + } + + this.clear(); + this.k1 = s.k1; + this.b = s.b; + this.fieldWeights = { ...s.fieldWeights }; + + for (const [field, stat] of Object.entries(s.fieldStats)) { + this.fieldStats.set(field, { docCount: stat.docCount, totalLength: stat.totalLength }); + } + + for (const [term, fieldsObj] of Object.entries(s.postings)) { + const byField = new Map(); + const distinctChunks = new Set(); + for (const [field, list] of Object.entries(fieldsObj)) { + byField.set( + field, + list.map((p) => ({ chunkId: p.chunkId, tf: p.tf })) + ); + for (const p of list) { + distinctChunks.add(p.chunkId); + let termIndex = this.chunkPostings.get(p.chunkId); + if (!termIndex) { + termIndex = new Map(); + this.chunkPostings.set(p.chunkId, termIndex); + } + let fields = termIndex.get(term); + if (!fields) { + fields = new Set(); + termIndex.set(term, fields); + } + fields.add(field); + } + } + this.postings.set(term, byField); + this.termDf.set(term, distinctChunks.size); + } + + for (const [chunkId, perField] of Object.entries(s.docLengths)) { + this.docLengths.set(chunkId, new Map(Object.entries(perField))); + } + + for (const [chunkId, docId] of Object.entries(s.chunkToDoc)) { + this.chunkToDoc.set(chunkId, docId); + let bucket = this.docToChunks.get(docId); + if (!bucket) { + bucket = new Set(); + this.docToChunks.set(docId, bucket); + } + bucket.add(chunkId); + } + } +} diff --git a/packages/storage/src/text/ITextIndex.ts b/packages/storage/src/text/ITextIndex.ts new file mode 100644 index 000000000..399c282df --- /dev/null +++ b/packages/storage/src/text/ITextIndex.ts @@ -0,0 +1,81 @@ +/** + * @license + * Copyright 2026 Steven Roussey + * SPDX-License-Identifier: Apache-2.0 + */ + +/** + * Identity score returned by an 
{@link ITextIndex} search. The index does not + * hydrate full records — callers (typically a knowledge base) look up + * `chunkId` in their own chunk storage. + */ +export interface TextSearchResult { + readonly chunkId: string; + readonly score: number; +} + +/** + * Map from logical field name (e.g. `"text"`, `"doc_title"`) to the chunk's + * value for that field. Strings are tokenised; arrays of strings are joined + * with a space separator before tokenisation. Missing or empty fields are + * skipped. + */ +export type TextFields = Readonly>; + +export interface TextSearchOptions { + readonly topK?: number; +} + +/** + * Sibling of {@link IVectorStorage} for full-text search over chunks. The + * index stores postings + chunk identifiers only — it does not duplicate the + * source text. Persistence is via {@link toJSON} / {@link fromJSON}; the + * serialised form is plain JSON-compatible data. + * + * Implementations are expected to be deterministic given the same tokenizer + * and field weights. + */ +export interface ITextIndex { + /** + * Add or replace a chunk's postings. Calling `add` for an existing + * `chunkId` is an idempotent upsert: previous postings for that chunk are + * removed first. `docId` is captured so {@link removeByDocument} can cascade + * deletions. + */ + add(chunkId: string, docId: string, fields: TextFields): void; + + /** + * Remove all postings for a single chunk. No-op if the chunk is not + * indexed. + */ + remove(chunkId: string): void; + + /** + * Remove all chunks belonging to a document. Used by + * `KnowledgeBase.deleteDocument`. + */ + removeByDocument(docId: string): void; + + /** Drop all postings and reset all statistics. */ + clear(): void; + + /** Number of chunks currently indexed. */ + size(): number; + + /** + * Score the corpus against `query` and return the top-K chunks. The score + * is BM25(F)-style — unbounded above, always non-negative — and is + * suitable for rank-based fusion (e.g. RRF) without normalisation. + */ + search(query: string, options?: TextSearchOptions): TextSearchResult[]; + + /** + * Serialise the index to a JSON-safe value. Round-trips with + * {@link fromJSON} on a fresh instance configured with the same tokenizer + * and field weights. + */ + toJSON(): unknown; + + /** Replace the index's state with a value previously produced by {@link toJSON}. */ + fromJSON(state: unknown): void; +} diff --git a/packages/storage/src/text/Tokenizer.ts b/packages/storage/src/text/Tokenizer.ts new file mode 100644 index 000000000..c2950364c --- /dev/null +++ b/packages/storage/src/text/Tokenizer.ts @@ -0,0 +1,107 @@ +/** + * @license + * Copyright 2026 Steven Roussey + * SPDX-License-Identifier: Apache-2.0 + */ + +/** + * Pluggable tokenizer for text indexes. Implementations must be deterministic + * and side-effect free: indexing and querying always pass through the same + * tokenizer, so any non-determinism produces silent recall regressions. + */ +export interface Tokenizer { + tokenize(input: string): string[]; +} + +/** + * Default English stopwords. 
Exported so callers can extend (rather than + * replace) the default list: + * + * ```ts + * const stopwords = new Set([...DEFAULT_ENGLISH_STOPWORDS, "foo", "bar"]); + * ``` + */ +export const DEFAULT_ENGLISH_STOPWORDS: ReadonlySet = new Set([ + "a", + "an", + "and", + "are", + "as", + "at", + "be", + "but", + "by", + "for", + "from", + "has", + "have", + "he", + "her", + "hers", + "him", + "his", + "i", + "if", + "in", + "into", + "is", + "it", + "its", + "me", + "my", + "of", + "on", + "or", + "our", + "ours", + "she", + "so", + "than", + "that", + "the", + "their", + "theirs", + "them", + "they", + "this", + "to", + "us", + "was", + "we", + "were", + "will", + "with", + "you", + "your", + "yours", +]); + +export interface DefaultTokenizerOptions { + readonly stopwords?: ReadonlySet; + readonly minTokenLength?: number; +} + +/** + * Default tokenizer: lowercase, split on Unicode non-letter/non-digit + * characters, drop tokens shorter than `minTokenLength`, drop stopwords. + * + * No stemming. Identical at index and query time. + */ +export function createDefaultTokenizer(options: DefaultTokenizerOptions = {}): Tokenizer { + const stopwords = options.stopwords ?? DEFAULT_ENGLISH_STOPWORDS; + const minTokenLength = options.minTokenLength ?? 2; + const splitter = /[^\p{L}\p{N}]+/u; + + return { + tokenize(input: string): string[] { + if (!input) return []; + const out: string[] = []; + for (const raw of input.toLowerCase().split(splitter)) { + if (raw.length < minTokenLength) continue; + if (stopwords.has(raw)) continue; + out.push(raw); + } + return out; + }, + }; +} diff --git a/packages/storage/src/text/index.ts b/packages/storage/src/text/index.ts new file mode 100644 index 000000000..136abf06a --- /dev/null +++ b/packages/storage/src/text/index.ts @@ -0,0 +1,16 @@ +/** + * @license + * Copyright 2026 Steven Roussey + * SPDX-License-Identifier: Apache-2.0 + */ + +export type { + ITextIndex, + TextFields, + TextSearchOptions, + TextSearchResult, +} from "./ITextIndex"; +export type { Tokenizer, DefaultTokenizerOptions } from "./Tokenizer"; +export { DEFAULT_ENGLISH_STOPWORDS, createDefaultTokenizer } from "./Tokenizer"; +export type { BM25IndexOptions } from "./BM25Index"; +export { BM25Index, DEFAULT_CHUNK_FIELD_WEIGHTS } from "./BM25Index"; diff --git a/packages/storage/src/vector/IVectorStorage.ts b/packages/storage/src/vector/IVectorStorage.ts index 23de91bae..60c50597a 100644 --- a/packages/storage/src/vector/IVectorStorage.ts +++ b/packages/storage/src/vector/IVectorStorage.ts @@ -102,16 +102,6 @@ export interface VectorSearchOptions< readonly scoreThreshold?: number; } -/** - * Options for hybrid search (vector + full-text) - */ -export interface HybridSearchOptions< - Metadata extends Record | undefined = Record, -> extends VectorSearchOptions { - readonly textQuery: string; - readonly vectorWeight?: number; -} - /** * Type definitions for document chunk vector repository events */ @@ -120,7 +110,6 @@ export interface VectorEventListeners extends TabularEventLi Entity > { similaritySearch: (query: TypedArray, results: (Entity & { score: number })[]) => void; - hybridSearch: (query: TypedArray, results: (Entity & { score: number })[]) => void; } export type VectorEventName = keyof VectorEventListeners; @@ -173,18 +162,6 @@ export interface IVectorStorage< query: TypedArray, options?: VectorSearchOptions ): Promise<(Entity & { score: number })[]>; - - /** - * Hybrid search combining vector similarity with full-text search - * This is optional and may not be supported by all 
implementations - * @param query - Query vector to compare against - * @param options - Hybrid search options including text query - * @returns Array of search results sorted by combined relevance - */ - hybridSearch?( - query: TypedArray, - options: HybridSearchOptions - ): Promise<(Entity & { score: number })[]>; } /** diff --git a/packages/storage/src/vector/InMemoryVectorStorage.ts b/packages/storage/src/vector/InMemoryVectorStorage.ts index d601b5d02..00b613d73 100644 --- a/packages/storage/src/vector/InMemoryVectorStorage.ts +++ b/packages/storage/src/vector/InMemoryVectorStorage.ts @@ -13,7 +13,7 @@ import type { } from "@workglow/util/schema"; import { cosineSimilarity } from "@workglow/util/schema"; import { InMemoryTabularStorage } from "../tabular/InMemoryTabularStorage"; -import type { HybridSearchOptions, IVectorStorage, VectorSearchOptions } from "./IVectorStorage"; +import type { IVectorStorage, VectorSearchOptions } from "./IVectorStorage"; import { getMetadataProperty, getVectorProperty } from "./IVectorStorage"; /** @@ -28,25 +28,6 @@ function matchesFilter(metadata: Metadata, filter: Partial): return true; } -/** - * Simple full-text search scoring (keyword matching) - */ -function textRelevance(text: string, query: string): number { - const textLower = text.toLowerCase(); - const queryLower = query.toLowerCase(); - const queryWords = queryLower.split(/\s+/).filter((w) => w.length > 0); - if (queryWords.length === 0) { - return 0; - } - let matches = 0; - for (const word of queryWords) { - if (textLower.includes(word)) { - matches++; - } - } - return matches / queryWords.length; -} - /** * In-memory document chunk vector repository implementation. * Extends InMemoryTabularStorage for storage. @@ -145,57 +126,4 @@ export class InMemoryVectorStorage< return topResults; } - - async hybridSearch(query: TypedArray, options: HybridSearchOptions>) { - const { topK = 10, filter, scoreThreshold = 0, textQuery, vectorWeight = 0.7 } = options; - - if (!textQuery || textQuery.trim().length === 0) { - // Fall back to regular vector search if no text query - return this.similaritySearch(query, { topK, filter, scoreThreshold }); - } - - const results: Array = []; - const allEntities = (await this.getAll()) || []; - - for (const entity of allEntities) { - // In memory, vectors are stored as TypedArrays directly (not serialized) - const vector = entity[this.vectorPropertyName] as TypedArray; - const metadata = this.metadataPropertyName - ? (entity[this.metadataPropertyName] as Metadata) - : ({} as Metadata); - - // Apply filter if provided - if (filter && !matchesFilter(metadata, filter)) { - continue; - } - - // Calculate vector similarity - const vectorScore = cosineSimilarity(query, vector); - - // Calculate text relevance (simple keyword matching) - const metadataText = Object.values(metadata ?? 
{}) - .join(" ") - .toLowerCase(); - const textScore = textRelevance(metadataText, textQuery); - - // Combine scores - const combinedScore = vectorWeight * vectorScore + (1 - vectorWeight) * textScore; - - // Apply threshold - if (combinedScore < scoreThreshold) { - continue; - } - - results.push({ - ...entity, - score: combinedScore, - } as Entity & { score: number }); - } - - // Sort by combined score descending and take top K - results.sort((a, b) => b.score - a.score); - const topResults = results.slice(0, topK); - - return topResults; - } } diff --git a/packages/storage/src/vector/TelemetryVectorStorage.ts b/packages/storage/src/vector/TelemetryVectorStorage.ts index e22f7dea2..727192d92 100644 --- a/packages/storage/src/vector/TelemetryVectorStorage.ts +++ b/packages/storage/src/vector/TelemetryVectorStorage.ts @@ -17,7 +17,7 @@ import type { } from "../tabular/ITabularStorage"; import { TelemetryTabularStorage } from "../tabular/TelemetryTabularStorage"; import { traced } from "@workglow/util"; -import type { HybridSearchOptions, IVectorStorage, VectorSearchOptions } from "./IVectorStorage"; +import type { IVectorStorage, VectorSearchOptions } from "./IVectorStorage"; /** * Telemetry wrapper for any IVectorStorage implementation. @@ -65,16 +65,4 @@ export class TelemetryVectorStorage< this.vectorInner.similaritySearch(query, options) ); } - - hybridSearch( - query: TypedArray, - options: HybridSearchOptions - ): Promise<(Entity & { score: number })[]> { - if (!this.vectorInner.hybridSearch) { - throw new Error("hybridSearch is not supported by the underlying storage implementation"); - } - return traced("workglow.storage.vector.hybridSearch", this.storageName, () => - this.vectorInner.hybridSearch!(query, options) - ); - } } diff --git a/packages/test/src/test/mcp/mcp-servers.integration.test.ts b/packages/test/src/test/mcp/mcp-servers.integration.test.ts index 566259942..0c2f85b92 100644 --- a/packages/test/src/test/mcp/mcp-servers.integration.test.ts +++ b/packages/test/src/test/mcp/mcp-servers.integration.test.ts @@ -44,7 +44,6 @@ const MCP_SERVERS = [ { name: "Exa Search", url: "https://mcp.exa.ai/mcp" }, { name: "Hugging Face", url: "https://hf.co/mcp" }, { name: "Remote MCP", url: "https://mcp.remote-mcp.com" }, - { name: "GitMCP", url: "https://gitmcp.io/docs" }, ] as const; /** Build minimal input from tool inputSchema for integration test calls */ diff --git a/packages/test/src/test/rag/KnowledgeBaseHybridSearch.test.ts b/packages/test/src/test/rag/KnowledgeBaseHybridSearch.test.ts new file mode 100644 index 000000000..9f8b4036b --- /dev/null +++ b/packages/test/src/test/rag/KnowledgeBaseHybridSearch.test.ts @@ -0,0 +1,361 @@ +/** + * @license + * Copyright 2026 Steven Roussey + * SPDX-License-Identifier: Apache-2.0 + */ + +import { createKnowledgeBase } from "@workglow/knowledge-base"; +import { BM25Index } from "@workglow/storage"; +import { uuid4 } from "@workglow/util"; +import { beforeEach, describe, expect, it } from "vitest"; + +const dimensions = 3; + +const vec = (a: number, b: number, c: number) => new Float32Array([a, b, c]); + +const makeChunk = ( + chunk_id: string, + doc_id: string, + text: string, + vector: Float32Array, + extras: Partial> = {} +) => ({ + chunk_id, + doc_id, + vector, + metadata: { chunkId: chunk_id, doc_id, text, nodePath: [chunk_id], depth: 0, ...extras }, +}); + +describe("KnowledgeBase hybrid search (RRF over vector + BM25)", () => { + let kbName: string; + + beforeEach(() => { + kbName = `hybrid-test-${uuid4()}`; + }); + + 
it("hybridSearch throws when no text index is installed", async () => { + const kb = await createKnowledgeBase({ + name: kbName, + vectorDimensions: dimensions, + register: false, + }); + expect(kb.supportsHybridSearch()).toBe(false); + await expect( + kb.hybridSearch(vec(1, 0, 0), { textQuery: "rabbit", topK: 5 }) + ).rejects.toThrow(/text index/i); + }); + + it("auto-indexes text fields on upsertChunk and exposes them via textSearch", async () => { + const index = new BM25Index(); + const kb = await createKnowledgeBase({ + name: kbName, + vectorDimensions: dimensions, + textIndex: index, + register: false, + }); + + await kb.upsertChunk(makeChunk("c1", "d1", "the rabbit jumps over the fence", vec(1, 0, 0))); + await kb.upsertChunk(makeChunk("c2", "d2", "the fox eats grapes", vec(0, 1, 0))); + + expect(kb.supportsHybridSearch()).toBe(true); + expect(index.size()).toBe(2); + + const results = await kb.textSearch("rabbit"); + expect(results).toHaveLength(1); + expect(results[0].chunk_id).toBe("c1"); + expect(results[0].score).toBeGreaterThan(0); + expect(results[0].metadata.text).toContain("rabbit"); + }); + + it("deleteDocument cascades to the text index", async () => { + const index = new BM25Index(); + const kb = await createKnowledgeBase({ + name: kbName, + vectorDimensions: dimensions, + textIndex: index, + register: false, + }); + + await kb.upsertChunk(makeChunk("c1", "d1", "rabbit", vec(1, 0, 0))); + await kb.upsertChunk(makeChunk("c2", "d2", "fox", vec(0, 1, 0))); + expect(index.size()).toBe(2); + + await kb.deleteDocument("d1"); + expect(index.size()).toBe(1); + expect(await kb.textSearch("rabbit")).toEqual([]); + const foxHits = await kb.textSearch("fox"); + expect(foxHits.map((r) => r.chunk_id)).toEqual(["c2"]); + }); + + it("clearChunks empties the text index", async () => { + const index = new BM25Index(); + const kb = await createKnowledgeBase({ + name: kbName, + vectorDimensions: dimensions, + textIndex: index, + register: false, + }); + + await kb.upsertChunk(makeChunk("c1", "d1", "rabbit fence", vec(1, 0, 0))); + expect(index.size()).toBe(1); + + await kb.clearChunks(); + expect(index.size()).toBe(0); + expect(await kb.textSearch("rabbit")).toEqual([]); + }); + + it("RRF surfaces a chunk that ranks high in text but only mid-pack in vector", async () => { + const index = new BM25Index(); + const kb = await createKnowledgeBase({ + name: kbName, + vectorDimensions: dimensions, + textIndex: index, + register: false, + }); + + // Query vector aligned with axis x. c1 is the closest vector match but + // contains no relevant text. c2 has perfect text match but a mediocre + // vector. Pure vector search ranks c1 first; RRF fusion should promote c2. + await kb.upsertChunksBulk([ + makeChunk("c1", "d1", "unrelated content about gardens", vec(1.0, 0.0, 0.0)), + makeChunk("c2", "d2", "rabbit and fence and rabbit", vec(0.5, 0.5, 0.0), { + doc_title: "Rabbits", + }), + makeChunk("c3", "d3", "completely different topic", vec(0.0, 1.0, 0.0)), + ]); + + // Pure vector search: c1 wins. + const vectorOnly = await kb.similaritySearch(vec(1, 0, 0), { topK: 3 }); + expect(vectorOnly[0].chunk_id).toBe("c1"); + + // Hybrid with text-leaning weight: c2's text match should win after RRF. 
+    const fused = await kb.hybridSearch(vec(1, 0, 0), {
+      textQuery: "rabbit",
+      topK: 3,
+      vectorWeight: 0.3,
+    });
+    expect(fused[0].chunk_id).toBe("c2");
+  });
+
+  it("hybridSearch returns an empty array when the index has no chunks", async () => {
+    const kb = await createKnowledgeBase({
+      name: kbName,
+      vectorDimensions: dimensions,
+      textIndex: new BM25Index(),
+      register: false,
+    });
+    const results = await kb.hybridSearch(vec(1, 0, 0), {
+      textQuery: "rabbit",
+      topK: 5,
+    });
+    expect(results).toEqual([]);
+  });
+
+  it("upserting a chunk with empty text drops its postings from the index", async () => {
+    const index = new BM25Index();
+    const kb = await createKnowledgeBase({
+      name: kbName,
+      vectorDimensions: dimensions,
+      textIndex: index,
+      register: false,
+    });
+
+    await kb.upsertChunk(makeChunk("c1", "d1", "rabbit fence", vec(1, 0, 0)));
+    expect(index.size()).toBe(1);
+
+    // Re-upsert the same chunk with no indexable text — old postings must go.
+    await kb.upsertChunk({
+      chunk_id: "c1",
+      doc_id: "d1",
+      vector: vec(1, 0, 0),
+      metadata: { chunkId: "c1", doc_id: "d1", text: "", nodePath: ["c1"], depth: 0 },
+    });
+    expect(index.size()).toBe(0);
+    expect(await kb.textSearch("rabbit")).toEqual([]);
+  });
+
+  it("put / putBulk go through the indexing path (alias for upsertChunk*)", async () => {
+    const index = new BM25Index();
+    const kb = await createKnowledgeBase({
+      name: kbName,
+      vectorDimensions: dimensions,
+      textIndex: index,
+      register: false,
+    });
+
+    await kb.put(makeChunk("c1", "d1", "rabbit fence", vec(1, 0, 0)));
+    await kb.putBulk([
+      makeChunk("c2", "d2", "fox garden", vec(0, 1, 0)),
+      makeChunk("c3", "d3", "tomato vine", vec(0, 0, 1)),
+    ]);
+
+    expect(index.size()).toBe(3);
+    const hits = await kb.textSearch("rabbit");
+    expect(hits.map((r) => r.chunk_id)).toEqual(["c1"]);
+  });
+
+  it("hybridSearch tolerates fractional candidatePoolMultiplier (integer poolSize)", async () => {
+    const index = new BM25Index();
+    const kb = await createKnowledgeBase({
+      name: kbName,
+      vectorDimensions: dimensions,
+      textIndex: index,
+      register: false,
+    });
+
+    await kb.upsertChunksBulk([
+      makeChunk("c1", "d1", "rabbit", vec(1, 0, 0)),
+      makeChunk("c2", "d2", "rabbit fence", vec(0, 1, 0)),
+      makeChunk("c3", "d3", "rabbit garden", vec(0, 0, 1)),
+    ]);
+
+    // 3 * 1.7 = 5.1 — must not propagate as a non-integer topK.
+    const results = await kb.hybridSearch(vec(1, 0, 0), {
+      textQuery: "rabbit",
+      topK: 3,
+      candidatePoolMultiplier: 1.7,
+    });
+    expect(results).toHaveLength(3);
+  });
+
+  it("installTextIndex after upserts requires reindexText to populate", async () => {
+    const kb = await createKnowledgeBase({
+      name: kbName,
+      vectorDimensions: dimensions,
+      register: false,
+    });
+    await kb.upsertChunk(makeChunk("c1", "d1", "rabbit fence", vec(1, 0, 0)));
+
+    const index = new BM25Index();
+    kb.installTextIndex(index);
+    // Pre-existing chunks aren't auto-indexed; reindex picks them up.
+    expect(index.size()).toBe(0);
+    await kb.reindexText();
+    expect(index.size()).toBe(1);
+    const hits = await kb.textSearch("rabbit");
+    expect(hits[0].chunk_id).toBe("c1");
+  });
+
+  it("reindexText is atomic: a vectorStorage failure leaves the existing index untouched", async () => {
+    const index = new BM25Index();
+    const kb = await createKnowledgeBase({
+      name: kbName,
+      vectorDimensions: dimensions,
+      textIndex: index,
+      register: false,
+    });
+    await kb.upsertChunk(makeChunk("c1", "d1", "rabbit fence", vec(1, 0, 0)));
+    expect(index.size()).toBe(1);
+
+    // Force getAll() to throw. If reindexText() were non-atomic (clear-then-load),
+    // the index would be left empty after this throw.
+    const original = kb.vectorStorage.getAll.bind(kb.vectorStorage);
+    kb.vectorStorage.getAll = () => Promise.reject(new Error("backend gone"));
+
+    await expect(kb.reindexText()).rejects.toThrow(/backend gone/);
+    expect(index.size()).toBe(1);
+    const hits = await kb.textSearch("rabbit");
+    expect(hits.map((r) => r.chunk_id)).toEqual(["c1"]);
+
+    // Restore so the destroy() in afterEach behaves.
+    kb.vectorStorage.getAll = original;
+  });
+
+  it("hybridSearch falls back to similaritySearch when textQuery is empty", async () => {
+    const index = new BM25Index();
+    const kb = await createKnowledgeBase({
+      name: kbName,
+      vectorDimensions: dimensions,
+      textIndex: index,
+      register: false,
+    });
+    await kb.upsertChunksBulk([
+      makeChunk("c1", "d1", "rabbit", vec(1, 0, 0)),
+      makeChunk("c2", "d2", "fox", vec(0, 1, 0)),
+    ]);
+
+    const empty = await kb.hybridSearch(vec(1, 0, 0), { textQuery: "", topK: 2 });
+    const whitespace = await kb.hybridSearch(vec(1, 0, 0), { textQuery: " ", topK: 2 });
+    const cosine = await kb.similaritySearch(vec(1, 0, 0), { topK: 2 });
+
+    // Empty / whitespace query path returns cosine scores, identical ordering.
+    expect(empty.map((r) => r.chunk_id)).toEqual(cosine.map((r) => r.chunk_id));
+    expect(whitespace.map((r) => r.chunk_id)).toEqual(cosine.map((r) => r.chunk_id));
+    // And cosine scores are in [0,1], unlike the small RRF range.
+    expect(empty[0].score).toBeGreaterThan(0.5);
+  });
+
+  it("hybridSearch clamps negative rrfK to a safe value (no Infinity scores)", async () => {
+    const index = new BM25Index();
+    const kb = await createKnowledgeBase({
+      name: kbName,
+      vectorDimensions: dimensions,
+      textIndex: index,
+      register: false,
+    });
+    await kb.upsertChunksBulk([
+      makeChunk("c1", "d1", "rabbit", vec(1, 0, 0)),
+      makeChunk("c2", "d2", "rabbit fence", vec(0, 1, 0)),
+    ]);
+
+    // rrfK = -10 would otherwise produce Infinity / negative denominators.
+    const results = await kb.hybridSearch(vec(1, 0, 0), {
+      textQuery: "rabbit",
+      topK: 2,
+      rrfK: -10,
+    });
+    expect(results).toHaveLength(2);
+    for (const r of results) {
+      expect(Number.isFinite(r.score)).toBe(true);
+      expect(r.score).toBeGreaterThan(0);
+    }
+  });
+
+  it("attaches scoreType to results for each search method", async () => {
+    const index = new BM25Index();
+    const kb = await createKnowledgeBase({
+      name: kbName,
+      vectorDimensions: dimensions,
+      textIndex: index,
+      register: false,
+    });
+    await kb.upsertChunksBulk([
+      makeChunk("c1", "d1", "rabbit", vec(1, 0, 0)),
+      makeChunk("c2", "d2", "rabbit fence", vec(0, 1, 0)),
+    ]);
+
+    const sim = await kb.similaritySearch(vec(1, 0, 0), { topK: 2 });
+    const text = await kb.textSearch("rabbit", { topK: 2 });
+    const hybrid = await kb.hybridSearch(vec(1, 0, 0), { textQuery: "rabbit", topK: 2 });
+    const hybridEmpty = await kb.hybridSearch(vec(1, 0, 0), { textQuery: "", topK: 2 });
+
+    expect(sim.every((r) => r.scoreType === "cosine")).toBe(true);
+    expect(text.every((r) => r.scoreType === "bm25")).toBe(true);
+    expect(hybrid.every((r) => r.scoreType === "rrf")).toBe(true);
+    // Empty-query fallback routes through similaritySearch, so cosine.
+    expect(hybridEmpty.every((r) => r.scoreType === "cosine")).toBe(true);
+  });
+
+  it("hybridSearch produces RRF-shaped scores (small positives, not cosine)", async () => {
+    const index = new BM25Index();
+    const kb = await createKnowledgeBase({
+      name: kbName,
+      vectorDimensions: dimensions,
+      textIndex: index,
+      register: false,
+    });
+    await kb.upsertChunksBulk([
+      makeChunk("c1", "d1", "rabbit", vec(1, 0, 0)),
+      makeChunk("c2", "d2", "fox", vec(0, 1, 0)),
+    ]);
+
+    const results = await kb.hybridSearch(vec(1, 0, 0), { textQuery: "rabbit", topK: 2 });
+    expect(results.length).toBeGreaterThan(0);
+    // RRF default rrfK=60, so the top contribution is at most 1/(60+1) ≈ 0.0164.
+    // Sum across two rankers (vectorWeight + textWeight = 1) caps at ~0.0164.
+    for (const r of results) {
+      expect(r.score).toBeGreaterThan(0);
+      expect(r.score).toBeLessThan(0.05);
+    }
+  });
+});
diff --git a/packages/test/src/test/storage-text/BM25Index.test.ts b/packages/test/src/test/storage-text/BM25Index.test.ts
new file mode 100644
index 000000000..be27f093b
--- /dev/null
+++ b/packages/test/src/test/storage-text/BM25Index.test.ts
@@ -0,0 +1,294 @@
+/**
+ * @license
+ * Copyright 2026 Steven Roussey
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+import {
+  BM25Index,
+  DEFAULT_CHUNK_FIELD_WEIGHTS,
+  DEFAULT_ENGLISH_STOPWORDS,
+  createDefaultTokenizer,
+} from "@workglow/storage";
+import { describe, expect, it } from "vitest";
+
+describe("BM25Index", () => {
+  const addDoc = (
+    idx: BM25Index,
+    chunkId: string,
+    docId: string,
+    text: string,
+    extras: Record<string, string | string[]> = {}
+  ) => {
+    idx.add(chunkId, docId, { text, ...extras });
+  };
+
+  describe("basic search", () => {
+    it("returns the only chunk for a single-doc index", () => {
+      const idx = new BM25Index();
+      addDoc(idx, "c1", "d1", "the rabbit jumps over the fence");
+      const results = idx.search("rabbit");
+      expect(results).toHaveLength(1);
+      expect(results[0].chunkId).toBe("c1");
+      expect(results[0].score).toBeGreaterThan(0);
+    });
+
+    it("returns nothing when the query has no indexable terms", () => {
+      const idx = new BM25Index();
+      addDoc(idx, "c1", "d1", "rabbit fence garden");
+      // All stopwords -> zero tokens after tokenisation.
+      expect(idx.search("the and of")).toEqual([]);
+    });
+
+    it("returns nothing when index is empty", () => {
+      const idx = new BM25Index();
+      expect(idx.search("anything")).toEqual([]);
+    });
+
+    it("tokenises queries identically to indexed text (case-insensitive)", () => {
+      const idx = new BM25Index();
+      addDoc(idx, "c1", "d1", "Rabbit");
+      const upper = idx.search("RABBIT");
+      const lower = idx.search("rabbit");
+      expect(upper).toEqual(lower);
+    });
+  });
+
+  describe("ranking", () => {
+    it("ranks rare terms above common terms (IDF)", () => {
+      const idx = new BM25Index();
+      // "fence" is in every doc; "wombat" is unique to c-rare.
+      for (let i = 0; i < 10; i++) {
+        addDoc(idx, `c${i}`, `d${i}`, "fence garden lawn");
+      }
+      addDoc(idx, "c-rare", "d-rare", "fence garden lawn wombat");
+
+      const results = idx.search("wombat fence");
+      // The doc containing the rare term should rank first.
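+      // ("wombat" appears in only one of the eleven docs, so its IDF dominates the ubiquitous "fence".)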
+      expect(results[0].chunkId).toBe("c-rare");
+    });
+
+    it("weights matches in doc_title above matches in body (BM25F)", () => {
+      const idx = new BM25Index();
+      addDoc(idx, "title-match", "d1", "unrelated body text", {
+        doc_title: "rabbit",
+      });
+      addDoc(idx, "body-match", "d2", "rabbit appears here", {
+        doc_title: "unrelated",
+      });
+
+      const results = idx.search("rabbit");
+      // doc_title weight (3) > text weight (1), so title-match ranks higher.
+      expect(results[0].chunkId).toBe("title-match");
+    });
+
+    it("respects custom field weights", () => {
+      const idx = new BM25Index({ fieldWeights: { text: 1, doc_title: 0.1 } });
+      addDoc(idx, "title-match", "d1", "unrelated body text", {
+        doc_title: "rabbit",
+      });
+      addDoc(idx, "body-match", "d2", "rabbit appears here", {
+        doc_title: "unrelated",
+      });
+
+      // With doc_title de-weighted, body-match should win.
+      const results = idx.search("rabbit");
+      expect(results[0].chunkId).toBe("body-match");
+    });
+
+    it("favours shorter documents on otherwise equal matches (length norm)", () => {
+      const idx = new BM25Index();
+      const short = "rabbit fence";
+      const long =
+        "rabbit fence " + "stuffing word ".repeat(50);
+      addDoc(idx, "short", "d1", short);
+      addDoc(idx, "long", "d2", long);
+
+      const results = idx.search("rabbit");
+      expect(results[0].chunkId).toBe("short");
+    });
+
+    it("indexes string-array fields by joining with whitespace", () => {
+      const idx = new BM25Index();
+      addDoc(idx, "c1", "d1", "body text", {
+        sectionTitles: ["Mammals", "Rodents"],
+      });
+      addDoc(idx, "c2", "d2", "body text", {
+        sectionTitles: ["Plants"],
+      });
+
+      const results = idx.search("rodents");
+      expect(results.map((r) => r.chunkId)).toContain("c1");
+      expect(results[0].chunkId).toBe("c1");
+    });
+  });
+
+  describe("upsert / remove", () => {
+    it("re-adding a chunkId is an idempotent upsert (does not double-count)", () => {
+      const idx = new BM25Index();
+      addDoc(idx, "c1", "d1", "rabbit");
+      const firstScore = idx.search("rabbit")[0].score;
+
+      // Re-add with the same content; counts should not double.
+ addDoc(idx, "c1", "d1", "rabbit"); + expect(idx.size()).toBe(1); + const secondScore = idx.search("rabbit")[0].score; + expect(secondScore).toBeCloseTo(firstScore, 10); + }); + + it("re-adding overwrites previous content", () => { + const idx = new BM25Index(); + addDoc(idx, "c1", "d1", "rabbit"); + addDoc(idx, "c1", "d1", "fox"); + expect(idx.search("rabbit")).toEqual([]); + expect(idx.search("fox")).toHaveLength(1); + }); + + it("remove drops the chunk from search results and stats", () => { + const idx = new BM25Index(); + addDoc(idx, "c1", "d1", "rabbit fence"); + addDoc(idx, "c2", "d2", "rabbit garden"); + idx.remove("c1"); + + expect(idx.size()).toBe(1); + const results = idx.search("rabbit"); + expect(results).toHaveLength(1); + expect(results[0].chunkId).toBe("c2"); + }); + + it("remove is a no-op for unknown chunkId", () => { + const idx = new BM25Index(); + addDoc(idx, "c1", "d1", "rabbit"); + idx.remove("nonexistent"); + expect(idx.size()).toBe(1); + }); + + it("removeByDocument cascades over all chunks for the doc", () => { + const idx = new BM25Index(); + addDoc(idx, "c1", "doc-a", "rabbit"); + addDoc(idx, "c2", "doc-a", "fox"); + addDoc(idx, "c3", "doc-b", "rabbit fence"); + idx.removeByDocument("doc-a"); + + expect(idx.size()).toBe(1); + const results = idx.search("rabbit"); + expect(results).toHaveLength(1); + expect(results[0].chunkId).toBe("c3"); + }); + + it("clear empties the index", () => { + const idx = new BM25Index(); + addDoc(idx, "c1", "d1", "rabbit"); + addDoc(idx, "c2", "d2", "fox"); + idx.clear(); + expect(idx.size()).toBe(0); + expect(idx.search("rabbit")).toEqual([]); + }); + }); + + describe("toJSON / fromJSON round-trip", () => { + it("preserves search results exactly", () => { + const a = new BM25Index(); + addDoc(a, "c1", "d1", "rabbit fence", { doc_title: "Animals" }); + addDoc(a, "c2", "d2", "fox garden", { doc_title: "Animals" }); + addDoc(a, "c3", "d3", "tomato vine", { doc_title: "Plants" }); + + const state = JSON.parse(JSON.stringify(a.toJSON())) as unknown; + const b = new BM25Index(); + b.fromJSON(state); + + const queries = ["rabbit", "fox garden", "Animals", "plants"]; + for (const q of queries) { + const ra = a.search(q); + const rb = b.search(q); + expect(rb.map((r) => r.chunkId)).toEqual(ra.map((r) => r.chunkId)); + for (let i = 0; i < ra.length; i++) { + expect(rb[i].score).toBeCloseTo(ra[i].score, 10); + } + } + }); + + it("rejects unknown state versions", () => { + const idx = new BM25Index(); + expect(() => idx.fromJSON({ version: 999 })).toThrow(); + }); + + it("restores k1 / b from serialised state (does not silently keep constructor defaults)", () => { + const a = new BM25Index({ k1: 2.0, b: 0.5 }); + addDoc(a, "c1", "d1", "rabbit fence"); + addDoc(a, "c2", "d2", "rabbit"); + const stateA = JSON.parse(JSON.stringify(a.toJSON())) as unknown; + + // Fresh instance with *different* k1/b. After fromJSON, scoring must + // match a, not the new instance's constructor defaults. 
+      const b = new BM25Index({ k1: 0.5, b: 0.9 });
+      b.fromJSON(stateA);
+
+      const ra = a.search("rabbit");
+      const rb = b.search("rabbit");
+      expect(rb.map((r) => r.chunkId)).toEqual(ra.map((r) => r.chunkId));
+      for (let i = 0; i < ra.length; i++) {
+        expect(rb[i].score).toBeCloseTo(ra[i].score, 10);
+      }
+    });
+
+    it("preserves remove correctness after a fromJSON round-trip", () => {
+      const a = new BM25Index();
+      addDoc(a, "c1", "d1", "rabbit fence");
+      addDoc(a, "c2", "d2", "rabbit garden");
+
+      const b = new BM25Index();
+      b.fromJSON(JSON.parse(JSON.stringify(a.toJSON())) as unknown);
+
+      // The reverse-index for surgical remove must be reconstructed by
+      // fromJSON; otherwise remove silently leaves stale postings.
+      b.remove("c1");
+      expect(b.size()).toBe(1);
+      const hits = b.search("rabbit");
+      expect(hits).toHaveLength(1);
+      expect(hits[0].chunkId).toBe("c2");
+    });
+  });
+
+  describe("tokenizer & stopwords", () => {
+    it("default tokenizer drops English stopwords and short tokens", () => {
+      const tokenizer = createDefaultTokenizer();
+      const tokens = tokenizer.tokenize("The quick brown fox is a jumper");
+      expect(tokens).not.toContain("the");
+      expect(tokens).not.toContain("is");
+      expect(tokens).not.toContain("a");
+      expect(tokens).toContain("quick");
+      expect(tokens).toContain("brown");
+      expect(tokens).toContain("fox");
+      expect(tokens).toContain("jumper");
+    });
+
+    it("default stopwords are extensible without replacement", () => {
+      const stopwords = new Set([...DEFAULT_ENGLISH_STOPWORDS, "rabbit"]);
+      const tokenizer = createDefaultTokenizer({ stopwords });
+      expect(tokenizer.tokenize("the rabbit jumps")).toEqual(["jumps"]);
+    });
+
+    it("custom tokenizer is used at index- and query-time", () => {
+      const tokenizer = {
+        tokenize: (s: string) =>
+          s.split(/[^A-Za-z]+/).filter((t) => t.length > 0).map((t) => t.toUpperCase()),
+      };
+      const idx = new BM25Index({ tokenizer });
+      idx.add("c1", "d1", { text: "RABBIT" });
+      // Query with lowercase; custom tokenizer uppercases both sides => match.
+ expect(idx.search("rabbit")).toHaveLength(1); + }); + }); + + describe("default field weights", () => { + it("exports a sensible default map", () => { + expect(DEFAULT_CHUNK_FIELD_WEIGHTS.doc_title).toBeGreaterThan( + DEFAULT_CHUNK_FIELD_WEIGHTS.text + ); + expect(DEFAULT_CHUNK_FIELD_WEIGHTS.parentSummaries).toBeLessThan( + DEFAULT_CHUNK_FIELD_WEIGHTS.text + ); + }); + }); +}); diff --git a/packages/test/src/test/vector/IndexedDbVectorStorage.integration.test.ts b/packages/test/src/test/vector/IndexedDbVectorStorage.integration.test.ts index 822d1fa50..10f3a178c 100644 --- a/packages/test/src/test/vector/IndexedDbVectorStorage.integration.test.ts +++ b/packages/test/src/test/vector/IndexedDbVectorStorage.integration.test.ts @@ -252,97 +252,4 @@ describe("IndexedDbVectorStorage", () => { expect(results[0].chunk_id).toBe("doc1_0"); }); }); - - describe("hybridSearch", () => { - beforeEach(async () => { - await populateStorage(); - }); - - it("should combine vector similarity and text relevance", async () => { - const query = new Float32Array([1.0, 0.0, 0.0]); - const results = await storage.hybridSearch(query, { - textQuery: "machine learning", - topK: 5, - }); - - expect(results.length).toBeGreaterThan(0); - // Scores should be in descending order - for (let i = 1; i < results.length; i++) { - expect(results[i - 1].score).toBeGreaterThanOrEqual(results[i].score); - } - }); - - it("should fall back to similarity search when textQuery is empty", async () => { - const query = new Float32Array([1.0, 0.0, 0.0]); - const hybridResults = await storage.hybridSearch(query, { - textQuery: "", - topK: 5, - }); - const simResults = await storage.similaritySearch(query, { topK: 5 }); - - expect(hybridResults.length).toBe(simResults.length); - // Same results, same order - hybridResults.forEach((r, i) => { - expect(r.chunk_id).toBe(simResults[i].chunk_id); - expect(r.score).toBeCloseTo(simResults[i].score, 5); - }); - }); - - it("should respect vectorWeight parameter", async () => { - const query = new Float32Array([0.0, 1.0, 0.0]); // points toward doc3 (cooking) - const textQuery = "AI"; // matches doc1 metadata - - // High vector weight should favor doc3 (vector match) - const highVectorResults = await storage.hybridSearch(query, { - textQuery, - topK: 1, - vectorWeight: 0.99, - }); - expect(highVectorResults[0].chunk_id).toBe("doc3_0"); - - // Low vector weight should favor doc1 (text match for "AI") - const lowVectorResults = await storage.hybridSearch(query, { - textQuery, - topK: 1, - vectorWeight: 0.01, - }); - expect(lowVectorResults[0].chunk_id).toBe("doc1_0"); - }); - - it("should filter by metadata in hybrid search", async () => { - const query = new Float32Array([1.0, 0.0, 0.0]); - const results = await storage.hybridSearch(query, { - textQuery: "document", - topK: 10, - filter: { category: "tech" }, - }); - - results.forEach((r) => { - expect(r.metadata).toHaveProperty("category", "tech"); - }); - }); - - it("should apply score threshold in hybrid search", async () => { - const query = new Float32Array([1.0, 0.0, 0.0]); - const results = await storage.hybridSearch(query, { - textQuery: "AI", - topK: 10, - scoreThreshold: 0.5, - }); - - results.forEach((r) => { - expect(r.score).toBeGreaterThanOrEqual(0.5); - }); - }); - - it("should respect topK limit in hybrid search", async () => { - const query = new Float32Array([1.0, 0.0, 0.0]); - const results = await storage.hybridSearch(query, { - textQuery: "document", - topK: 2, - }); - - expect(results.length).toBeLessThanOrEqual(2); - }); - 
}); }); diff --git a/packages/test/src/test/vector/SqliteAiVectorStorage.integration.test.ts b/packages/test/src/test/vector/SqliteAiVectorStorage.integration.test.ts index 932552ae8..01f50df5a 100644 --- a/packages/test/src/test/vector/SqliteAiVectorStorage.integration.test.ts +++ b/packages/test/src/test/vector/SqliteAiVectorStorage.integration.test.ts @@ -257,97 +257,4 @@ describe.skipIf(!sqliteVectorAvailable)("SqliteAiVectorStorage", async () => { expect(results.length).toBe(0); }); }); - - describe("hybridSearch", () => { - beforeEach(async () => { - await populateStorage(); - }); - - it("should combine vector similarity and text relevance", async () => { - const query = new Float32Array([1.0, 0.0, 0.0]); - const results = await storage.hybridSearch(query, { - textQuery: "machine learning", - topK: 5, - }); - - expect(results.length).toBeGreaterThan(0); - // Scores should be in descending order - for (let i = 1; i < results.length; i++) { - expect(results[i - 1].score).toBeGreaterThanOrEqual(results[i].score); - } - }); - - it("should fall back to similarity search when textQuery is empty", async () => { - const query = new Float32Array([1.0, 0.0, 0.0]); - const hybridResults = await storage.hybridSearch(query, { - textQuery: "", - topK: 5, - }); - const simResults = await storage.similaritySearch(query, { topK: 5 }); - - expect(hybridResults.length).toBe(simResults.length); - // Same results, same order - hybridResults.forEach((r, i) => { - expect(r.chunk_id).toBe(simResults[i].chunk_id); - expect(r.score).toBeCloseTo(simResults[i].score, 2); - }); - }); - - it("should respect vectorWeight parameter", async () => { - const query = new Float32Array([0.0, 1.0, 0.0]); // points toward doc3 (cooking) - const textQuery = "AI"; // matches doc1 metadata - - // High vector weight should favor doc3 (vector match) - const highVectorResults = await storage.hybridSearch(query, { - textQuery, - topK: 1, - vectorWeight: 0.99, - }); - expect(highVectorResults[0].chunk_id).toBe("doc3_0"); - - // Low vector weight should favor doc1 (text match for "AI") - const lowVectorResults = await storage.hybridSearch(query, { - textQuery, - topK: 1, - vectorWeight: 0.01, - }); - expect(lowVectorResults[0].chunk_id).toBe("doc1_0"); - }); - - it("should filter by metadata in hybrid search", async () => { - const query = new Float32Array([1.0, 0.0, 0.0]); - const results = await storage.hybridSearch(query, { - textQuery: "document", - topK: 10, - filter: { category: "tech" }, - }); - - results.forEach((r) => { - expect(r.metadata).toHaveProperty("category", "tech"); - }); - }); - - it("should apply score threshold in hybrid search", async () => { - const query = new Float32Array([1.0, 0.0, 0.0]); - const results = await storage.hybridSearch(query, { - textQuery: "AI", - topK: 10, - scoreThreshold: 0.5, - }); - - results.forEach((r) => { - expect(r.score).toBeGreaterThanOrEqual(0.5); - }); - }); - - it("should respect topK limit in hybrid search", async () => { - const query = new Float32Array([1.0, 0.0, 0.0]); - const results = await storage.hybridSearch(query, { - textQuery: "document", - topK: 2, - }); - - expect(results.length).toBeLessThanOrEqual(2); - }); - }); }); diff --git a/providers/postgres/src/storage/PostgresVectorStorage.ts b/providers/postgres/src/storage/PostgresVectorStorage.ts index 74020cb53..c89e8894d 100644 --- a/providers/postgres/src/storage/PostgresVectorStorage.ts +++ b/providers/postgres/src/storage/PostgresVectorStorage.ts @@ -21,7 +21,6 @@ import { getVectorProperty, } from 
"@workglow/storage"; import type { - HybridSearchOptions, IVectorStorage, VectorDistanceMetric, VectorIndexOptions, @@ -268,99 +267,6 @@ export class PostgresVectorStorage< } } - async hybridSearch(query: TypedArray, options: HybridSearchOptions) { - const { topK = 10, filter, scoreThreshold = 0, textQuery, vectorWeight = 0.7 } = options; - - if (!textQuery || textQuery.trim().length === 0) { - return this.similaritySearch(query, { topK, filter, scoreThreshold }); - } - - try { - // Try native hybrid search with pgvector + full-text - const queryVector = `[${Array.from(query).join(",")}]`; - // Use plainto_tsquery via parameterized query to avoid tsquery injection - const tsQueryText = textQuery; - const vectorColRaw = String(this.vectorPropertyName); - const vectorCol = PostgresDialect.quoteId(vectorColRaw); - const metadataColRaw = this.metadataPropertyName ? String(this.metadataPropertyName) : null; - const metadataCol = metadataColRaw ? PostgresDialect.quoteId(metadataColRaw) : null; - const vectorScoreExpr = this.buildScoreExpr(vectorCol, "$1"); - const combinedScoreExpr = `( - $2 * ${vectorScoreExpr} + - $3 * ts_rank(to_tsvector('english', ${metadataCol || "''"}::text), plainto_tsquery('english', $4)) - )`; - - let sql = ` - SELECT - *, - ${combinedScoreExpr} as score - FROM "${this.table}" - `; - - const params: any[] = [queryVector, vectorWeight, 1 - vectorWeight, tsQueryText]; - let paramIndex = 5; - // Track whether we've actually emitted a WHERE — see similaritySearch - // for why `filter` alone isn't a sufficient signal. - let hasWhere = false; - - if (filter && Object.keys(filter).length > 0 && metadataCol) { - const conditions: string[] = []; - for (const [key, value] of Object.entries(filter)) { - if (!SAFE_IDENTIFIER_RE.test(key)) { - throw new StorageValidationError( - `Invalid metadata filter key: "${key}". Keys must match /^[a-zA-Z_][a-zA-Z0-9_]*$/.` - ); - } - conditions.push(`${metadataCol}->>'${key}' = $${paramIndex}`); - params.push(String(value)); - paramIndex++; - } - if (conditions.length > 0) { - sql += ` WHERE ${conditions.join(" AND ")}`; - hasWhere = true; - } - } - - // Always emit the score threshold predicate (matches the in-memory - // fallback's `>= scoreThreshold` semantics for every threshold value). - sql += hasWhere ? " AND" : " WHERE"; - sql += ` ${combinedScoreExpr} >= $${paramIndex}`; - params.push(scoreThreshold); - paramIndex++; - - sql += ` ORDER BY score DESC LIMIT $${paramIndex}`; - params.push(topK); - - const result = await this.db.query(sql, params); - - // Fetch vectors separately for each result - const results: Array = []; - for (const row of result.rows) { - const vectorResult = await this.db.query( - `SELECT ${vectorCol}::text FROM "${this.table}" WHERE ${this.getPrimaryKeyWhereClause()}`, - this.getPrimaryKeyValues(row) - ); - const vectorStr = vectorResult.rows[0]?.[vectorColRaw] || "[]"; - const vectorArray = JSON.parse(vectorStr); - - results.push({ - ...row, - [this.vectorPropertyName]: new this.vectorCtor(vectorArray), - score: parseFloat(row.score), - } as Entity & { score: number }); - } - - return results; - } catch (error) { - if (error instanceof StorageValidationError) { - throw error; // Don't swallow validation errors - } - // Fall back to in-memory hybrid search - console.error("pgvector hybrid query failed, falling back to in-memory search:", error); - return this.hybridSearchFallback(query, options); - } - } - /** * Throws if the configured distance metric isn't cosine. 
The in-memory * fallback paths below only know how to compute cosine similarity; if @@ -411,57 +317,6 @@ export class PostgresVectorStorage< return topResults; } - /** - * Fallback hybrid search. Only valid for `distance: "cosine"`; see - * {@link assertFallbackSupportsDistance}. - */ - private async hybridSearchFallback(query: TypedArray, options: HybridSearchOptions) { - this.assertFallbackSupportsDistance(); - const { topK = 10, filter, scoreThreshold = 0, textQuery, vectorWeight = 0.7 } = options; - - const allRows = (await this.getAll()) || []; - const results: Array = []; - const queryLower = textQuery.toLowerCase(); - const queryWords = queryLower.split(/\s+/).filter((w) => w.length > 0); - - for (const row of allRows) { - const vector = row[this.vectorPropertyName] as TypedArray; - const metadata = this.metadataPropertyName - ? (row[this.metadataPropertyName] as Metadata) - : ({} as Metadata); - - if (filter && !this.matchesFilter(metadata, filter)) { - continue; - } - - const vectorScore = cosineSimilarity(query, vector); - const metadataText = Object.values(metadata ?? {}) - .join(" ") - .toLowerCase(); - let textScore = 0; - if (queryWords.length > 0) { - let matches = 0; - for (const word of queryWords) { - if (metadataText.includes(word)) { - matches++; - } - } - textScore = matches / queryWords.length; - } - - const combinedScore = vectorWeight * vectorScore + (1 - vectorWeight) * textScore; - - if (combinedScore >= scoreThreshold) { - results.push({ ...row, score: combinedScore } as Entity & { score: number }); - } - } - - results.sort((a, b) => b.score - a.score); - const topResults = results.slice(0, topK); - - return topResults; - } - private getPrimaryKeyWhereClause(): string { const conditions = this.primaryKeyNames.map((key, idx) => `${String(key)} = $${idx + 1}`); return conditions.join(" AND "); diff --git a/providers/sqlite/src/storage/SqliteAiVectorStorage.ts b/providers/sqlite/src/storage/SqliteAiVectorStorage.ts index dbd6b46bd..398243833 100644 --- a/providers/sqlite/src/storage/SqliteAiVectorStorage.ts +++ b/providers/sqlite/src/storage/SqliteAiVectorStorage.ts @@ -16,7 +16,6 @@ import { cosineSimilarity } from "@workglow/util/schema"; import { SqliteTabularStorage } from "./SqliteTabularStorage"; import { getMetadataProperty, getVectorProperty } from "@workglow/storage"; import type { - HybridSearchOptions, IVectorStorage, VectorDistanceMetric, VectorIndexOptions, @@ -513,98 +512,6 @@ export class SqliteAiVectorStorage< } } - /** - * Hybrid search combining vector similarity with text relevance. - * Uses sqlite-vector for the vector component and keyword matching for text. - * Falls back to in-memory search if the extension is unavailable. - */ - async hybridSearch(query: TypedArray, options: HybridSearchOptions) { - const { topK = 10, filter, scoreThreshold = 0, textQuery, vectorWeight = 0.7 } = options; - - if (!textQuery || textQuery.trim().length === 0) { - return this.similaritySearch(query, { topK, filter, scoreThreshold }); - } - - if (!this.extensionLoaded) { - return this.hybridSearchFallback(query, options); - } - - const db = this.database; - const tableName = this.table; - const vectorCol = String(this.vectorPropertyName); - const metadataCol = this.metadataPropertyName ? String(this.metadataPropertyName) : null; - - try { - const queryJson = this.encodeVectorJson(query); - const queryBlob = db - .prepare(`SELECT vector_as_${this.vectorTypeSuffix}(?) 
as v`) - .get(queryJson) as { v: Buffer }; - - // Use streaming mode for hybrid search to allow text scoring on all results - const sql = ` - SELECT t.*, v.distance - FROM ${escapeIdentifier(tableName)} AS t - JOIN vector_full_scan(?, ?, ?) AS v - ON t.rowid = v.rowid - ORDER BY v.distance ASC - `; - const stmt = db.prepare(sql); - const rows = stmt.all(tableName, vectorCol, queryBlob.v) as Array< - Record & { distance: number } - >; - - const queryLower = textQuery.toLowerCase(); - const queryWords = queryLower.split(/\s+/).filter((w) => w.length > 0); - const results: Array = []; - - for (const row of rows) { - const vectorScore = 1 - row.distance; - - const entity = { ...row } as Record; - delete entity.distance; - for (const k in this.schema.properties) { - entity[k] = this.sqlToJsValue(k, entity[k] as any); - } - - const metadata = metadataCol ? (entity[metadataCol] as Metadata) : ({} as Metadata); - - // Apply metadata filter - if (filter && !matchesFilter(metadata, filter)) { - continue; - } - - // Calculate text relevance - const metadataText = Object.values(metadata ?? {}) - .join(" ") - .toLowerCase(); - let textScore = 0; - if (queryWords.length > 0) { - let matches = 0; - for (const word of queryWords) { - if (metadataText.includes(word)) { - matches++; - } - } - textScore = matches / queryWords.length; - } - - const combinedScore = vectorWeight * vectorScore + (1 - vectorWeight) * textScore; - - if (combinedScore < scoreThreshold) { - continue; - } - - results.push({ ...entity, score: combinedScore } as Entity & { score: number }); - } - - results.sort((a, b) => b.score - a.score); - return results.slice(0, topK); - } catch (error) { - console.warn("sqlite-vector hybrid query failed, falling back to in-memory search:", error); - return this.hybridSearchFallback(query, options); - } - } - /** * Fallback search using in-memory cosine similarity */ @@ -633,51 +540,4 @@ export class SqliteAiVectorStorage< results.sort((a, b) => b.score - a.score); return results.slice(0, topK); } - - /** - * Fallback hybrid search using in-memory computation - */ - private async hybridSearchFallback(query: TypedArray, options: HybridSearchOptions) { - const { topK = 10, filter, scoreThreshold = 0, textQuery, vectorWeight = 0.7 } = options; - - const allRows = (await this.getAll()) || []; - const results: Array = []; - const queryLower = textQuery.toLowerCase(); - const queryWords = queryLower.split(/\s+/).filter((w) => w.length > 0); - - for (const row of allRows) { - const vector = row[this.vectorPropertyName] as TypedArray; - const metadata = this.metadataPropertyName - ? (row[this.metadataPropertyName] as Metadata) - : ({} as Metadata); - - if (filter && !matchesFilter(metadata, filter)) { - continue; - } - - const vectorScore = cosineSimilarity(query, vector); - const metadataText = Object.values(metadata ?? 
{}) - .join(" ") - .toLowerCase(); - let textScore = 0; - if (queryWords.length > 0) { - let matches = 0; - for (const word of queryWords) { - if (metadataText.includes(word)) { - matches++; - } - } - textScore = matches / queryWords.length; - } - - const combinedScore = vectorWeight * vectorScore + (1 - vectorWeight) * textScore; - - if (combinedScore >= scoreThreshold) { - results.push({ ...row, score: combinedScore } as Entity & { score: number }); - } - } - - results.sort((a, b) => b.score - a.score); - return results.slice(0, topK); - } } diff --git a/providers/sqlite/src/storage/SqliteVectorStorage.ts b/providers/sqlite/src/storage/SqliteVectorStorage.ts index 1e058ddbf..070b89ce8 100644 --- a/providers/sqlite/src/storage/SqliteVectorStorage.ts +++ b/providers/sqlite/src/storage/SqliteVectorStorage.ts @@ -15,7 +15,7 @@ import type { import { cosineSimilarity } from "@workglow/util/schema"; import { SqliteTabularStorage } from "./SqliteTabularStorage"; import { getMetadataProperty, getVectorProperty } from "@workglow/storage"; -import type { HybridSearchOptions, IVectorStorage, VectorSearchOptions } from "@workglow/storage"; +import type { IVectorStorage, VectorSearchOptions } from "@workglow/storage"; /** * Check if metadata matches filter @@ -139,69 +139,4 @@ export class SqliteVectorStorage< return topResults; } - - async hybridSearch(query: TypedArray, options: HybridSearchOptions) { - const { topK = 10, filter, scoreThreshold = 0, textQuery, vectorWeight = 0.7 } = options; - - if (!textQuery || textQuery.trim().length === 0) { - // Fall back to regular vector search if no text query - return this.similaritySearch(query, { topK, filter, scoreThreshold }); - } - - const results: Array = []; - const allEntities = (await this.getAll()) || []; - const queryLower = textQuery.toLowerCase(); - const queryWords = queryLower.split(/\s+/).filter((w) => w.length > 0); - - for (const entity of allEntities) { - // SQLite stores vectors as JSON strings, need to deserialize - const vectorRaw = entity[this.vectorPropertyName] as unknown as string; - const vector = this.deserializeVector(vectorRaw); - const metadata = this.metadataPropertyName - ? (entity[this.metadataPropertyName] as Metadata) - : ({} as Metadata); - - // Apply filter if provided - if (filter && !matchesFilter(metadata, filter)) { - continue; - } - - // Calculate vector similarity - const vectorScore = cosineSimilarity(query, vector); - - // Calculate text relevance (simple keyword matching) - const metadataText = Object.values(metadata ?? {}) - .join(" ") - .toLowerCase(); - let textScore = 0; - if (queryWords.length > 0) { - let matches = 0; - for (const word of queryWords) { - if (metadataText.includes(word)) { - matches++; - } - } - textScore = matches / queryWords.length; - } - - // Combine scores - const combinedScore = vectorWeight * vectorScore + (1 - vectorWeight) * textScore; - - // Apply threshold - if (combinedScore < scoreThreshold) { - continue; - } - - results.push({ - ...entity, - score: combinedScore, - } as Entity & { score: number }); - } - - // Sort by combined score descending and take top K - results.sort((a, b) => b.score - a.score); - const topResults = results.slice(0, topK); - - return topResults; - } }