Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
38 changes: 32 additions & 6 deletions packages/ai/src/task/ChunkRetrievalTask.ts
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,9 @@ const inputSchema = {
enum: ["similarity", "hybrid"],
title: "Retrieval Method",
description:
"Retrieval strategy: 'similarity' (vector only) or 'hybrid' (vector + full-text).",
"Retrieval strategy: 'similarity' (vector only, scores are cosine similarity in [0,1]) " +
"or 'hybrid' (vector + full-text fused via Reciprocal Rank Fusion; scores are RRF " +
"fusion scores, NOT comparable to cosine similarity).",
default: "similarity",
},
topK: {
Expand All @@ -66,7 +68,10 @@ const inputSchema = {
scoreThreshold: {
type: "number",
title: "Score Threshold",
description: "Minimum similarity score threshold (0-1)",
description:
"Minimum cosine similarity score threshold (0-1). Applies only to method='similarity'; " +
"ignored for method='hybrid' because RRF fusion scores are not comparable to cosine " +
"similarity. Use topK to size hybrid results instead.",
minimum: 0,
maximum: 1,
default: 0,
Expand Down Expand Up @@ -129,7 +134,19 @@ const outputSchema = {
type: "array",
items: { type: "number" },
title: "Scores",
description: "Similarity scores for each result",
description:
"Per-result scores. For method='similarity', these are cosine similarity scores in " +
"[0,1]. For method='hybrid', these are Reciprocal Rank Fusion scores — small positive " +
"numbers (typically <0.05) that rank results but do not correspond to a similarity.",
},
scoreType: {
type: "string",
enum: ["cosine", "bm25", "rrf"],
title: "Score Type",
description:
"Discriminator naming the scorer used for `scores`: 'cosine' for similarity search " +
"and for hybrid fallback when the text query is empty/whitespace; 'rrf' for hybrid " +
"fusion. ('bm25' is reserved for direct text search and is not produced by this task.)",
},
vectors: {
type: "array",
Expand Down Expand Up @@ -157,7 +174,7 @@ const outputSchema = {
description: "The query used for retrieval (pass-through)",
},
},
required: ["chunks", "chunk_ids", "metadata", "scores", "count", "query"],
required: ["chunks", "chunk_ids", "metadata", "scores", "scoreType", "count", "query"],
additionalProperties: false,
} as const satisfies DataPortSchema;

Expand Down Expand Up @@ -216,7 +233,9 @@ export class ChunkRetrievalTask extends Task<
}
if (method === "hybrid" && !kb.supportsHybridSearch()) {
throw new Error(
"The provided knowledge base does not support hybrid search. Use method: 'similarity' or a backend with hybrid support (e.g., Postgres with pgvector)."
"Hybrid retrieval requires a text index installed on the knowledge base. " +
"Install one via `kb.installTextIndex(new BM25Index())` or pass " +
"`textIndex` to `createKnowledgeBase`. Otherwise use method: 'similarity'."
);
}

Expand Down Expand Up @@ -250,7 +269,6 @@ export class ChunkRetrievalTask extends Task<
textQuery: queryText!,
topK,
filter,
scoreThreshold,
vectorWeight,
})
: await kb.similaritySearch(searchVector, {
Expand All @@ -264,11 +282,19 @@ export class ChunkRetrievalTask extends Task<
return meta.text || JSON.stringify(meta);
});

// The KB tags every result with the same scoreType; the empty-textQuery
// fallback inside hybridSearch can flip this from "rrf" to "cosine", which
// is exactly the signal we want to surface to callers.
const scoreType =
results.length > 0 ? (results[0].scoreType ?? (method === "hybrid" ? "rrf" : "cosine"))
: method === "hybrid" ? "rrf" : "cosine";

const output: ChunkRetrievalTaskOutput = {
chunks,
chunk_ids: results.map((r) => r.chunk_id),
metadata: results.map((r) => r.metadata),
scores: results.map((r) => r.score),
scoreType,
count: results.length,
query,
};
Expand Down
71 changes: 0 additions & 71 deletions packages/indexeddb/src/storage/IndexedDbVectorStorage.ts
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,6 @@ import { getMetadataProperty, getVectorProperty } from "@workglow/storage";
import type {
ClientProvidedKeysOption,
AnyVectorStorage,
HybridSearchOptions,
IVectorStorage,
VectorSearchOptions,
} from "@workglow/storage";
Expand All @@ -40,25 +39,6 @@ function matchesFilter<Metadata>(metadata: Metadata, filter: Partial<Metadata>):
return true;
}

/**
 * Keyword-overlap score for a piece of text against a free-form query.
 *
 * The query is lower-cased and split on whitespace; each resulting term counts
 * as a hit when it occurs anywhere in the lower-cased text (substring match).
 *
 * @param text - Text to search in.
 * @param query - Whitespace-separated search terms.
 * @returns Fraction of query terms found in `text`, or 0 when the query
 *          contains no terms.
 */
function textRelevance(text: string, query: string): number {
  const haystack = text.toLowerCase();
  const terms = query
    .toLowerCase()
    .split(/\s+/)
    .filter((term) => term.length > 0);

  // An empty or whitespace-only query has nothing to match against.
  if (terms.length === 0) {
    return 0;
  }

  const hits = terms.reduce((count, term) => (haystack.includes(term) ? count + 1 : count), 0);
  return hits / terms.length;
}

/**
* IndexedDB vector storage implementation.
* Extends IndexedDbTabularStorage for storage.
Expand Down Expand Up @@ -168,55 +148,4 @@ export class IndexedDbVectorStorage<

return topResults;
}

/**
 * Hybrid search that scans every stored entity and blends cosine similarity
 * with keyword relevance computed over the entity's metadata values.
 *
 * When `textQuery` is missing or whitespace-only, the call is delegated to
 * `similaritySearch` with the same `topK`, `filter` and `scoreThreshold`.
 * Otherwise each entity that passes `filter` is scored as
 * `vectorWeight * cosine + (1 - vectorWeight) * textRelevance`, and entities
 * whose combined score falls below `scoreThreshold` are dropped.
 *
 * @param query - Query vector compared against each entity's stored vector.
 * @param options - `topK` (default 10), `filter`, `scoreThreshold` (default 0),
 *                  `textQuery`, and `vectorWeight` (default 0.7).
 * @returns In the blended path, at most `topK` entities with an added `score`,
 *          sorted by score descending.
 */
async hybridSearch(query: TypedArray, options: HybridSearchOptions<Record<string, unknown>>) {
  const { topK = 10, filter, scoreThreshold = 0, textQuery, vectorWeight = 0.7 } = options;

  // No usable text query: nothing to blend, so defer to vector-only search.
  if (!textQuery || textQuery.trim() === "") {
    return this.similaritySearch(query, { topK, filter, scoreThreshold });
  }

  const textWeight = 1 - vectorWeight;
  const scored: Array<Entity & { score: number }> = [];
  const entities = (await this.getAll()) || [];

  for (const candidate of entities) {
    // IndexedDB stores TypedArrays natively via structured clone (no deserialization needed)
    const embedding = candidate[this.vectorPropertyName] as TypedArray;

    let metadata = {} as Metadata;
    if (this.metadataPropertyName) {
      metadata = candidate[this.metadataPropertyName] as Metadata;
    }

    // Skip entities excluded by the metadata filter before doing any scoring.
    if (filter && !matchesFilter(metadata, filter)) {
      continue;
    }

    const similarity = cosineSimilarity(query, embedding);

    // Keyword relevance is computed over all metadata values joined into one string.
    const searchableText = Object.values(metadata).join(" ").toLowerCase();
    const keywordScore = textRelevance(searchableText, textQuery);

    const score = vectorWeight * similarity + textWeight * keywordScore;
    if (score < scoreThreshold) {
      continue;
    }

    scored.push({ ...candidate, score } as Entity & { score: number });
  }

  // Highest combined score first, truncated to the requested size.
  return scored.sort((a, b) => b.score - a.score).slice(0, topK);
}
}
21 changes: 20 additions & 1 deletion packages/knowledge-base/src/chunk/ChunkVectorStorageSchema.ts
Original file line number Diff line number Diff line change
Expand Up @@ -60,7 +60,26 @@ export type ChunkVectorStorage = IVectorStorage<
ChunkVectorPrimaryKey
>;

/**
* Discriminator for the scoring function used to produce a
* {@link ChunkSearchResult.score}. Callers (typically UI) use this to render
* the score appropriately, since the three scorers live on different scales:
*
* - `"cosine"`: cosine similarity in `[-1, 1]`, typically `[0, 1]` for text
* embeddings. Absolute — higher means more similar.
* - `"bm25"`: BM25(F) score in `[0, ∞)`. Absolute but corpus-dependent — not
* comparable across knowledge bases.
* - `"rrf"`: Reciprocal Rank Fusion score, bounded above by
* `2 / (rrfK + 1)` (~`0.033` with the default `rrfK=60`). Rank-based, not
* absolute — the magnitude is not a similarity, only an ordering signal.
* Not comparable across queries.
*/
export type ScoreType = "cosine" | "bm25" | "rrf";

/**
 * A chunk vector entity returned from a search, annotated with its score.
 *
 * - `score`: numeric relevance value assigned by the search.
 * - `scoreType`: optional {@link ScoreType} discriminator naming the scorer
 *   that produced `score`. Because it is optional, consumers must handle
 *   results where it is absent.
 */
export type ChunkSearchResult = ChunkVectorEntity & { score: number };
export type ChunkSearchResult = ChunkVectorEntity & {
score: number;
scoreType?: ScoreType;
};
Loading
Loading