diff --git a/packages/test/src/test/ai-provider/WebBrowserProvider.test.ts b/packages/test/src/test/ai-provider/WebBrowserProvider.test.ts index e32b69891..1a371b76f 100644 --- a/packages/test/src/test/ai-provider/WebBrowserProvider.test.ts +++ b/packages/test/src/test/ai-provider/WebBrowserProvider.test.ts @@ -4,7 +4,13 @@ * SPDX-License-Identifier: Apache-2.0 */ -import type { ChatMessage, ModelRecord } from "@workglow/ai"; +import type { + ChatMessage, + ModelRecord, + StructuredGenerationTaskInput, + ToolCallingTaskInput, + ToolDefinition, +} from "@workglow/ai"; import { _testOnly } from "@workglow/chrome-ai/ai"; import { afterEach, describe, expect, it, vi } from "vitest"; @@ -13,10 +19,21 @@ const { WEB_BROWSER_RUN_FN_SPECS, WEB_BROWSER_RUN_FNS, WebBrowser_TextGeneration_Unified, + WebBrowser_StructuredGeneration, + WebBrowser_ToolCalling, sessions, chatHistory, + probe, } = _testOnly; +/** + * Test-time helpers: the chrome-ai run-fns we test take strongly-typed task + * inputs requiring a `model` field that's irrelevant to provider-level + * tests (the dispatcher fills it in upstream). We coerce that away here. + */ +const asSGI = (v: unknown): StructuredGenerationTaskInput => v as StructuredGenerationTaskInput; +const asTCI = (v: unknown): ToolCallingTaskInput => v as ToolCallingTaskInput; + function model(model_id: string, capabilities: readonly string[] = []): ModelRecord { return { model_id, @@ -33,15 +50,47 @@ function model(model_id: string, capabilities: readonly string[] = []): ModelRec // Capability inference + parity // -------------------------------------------------------------------------- +/** + * Probe factory whose `create()` always resolves to a destroyable handle. + * Used to drive `WebBrowserProvider` past the conservative-pre-probe state + * so we can assert the post-probe inference shape. 
+ */ +// eslint-disable-next-line @typescript-eslint/no-explicit-any +function makeAcceptingProbeFactory(): any { + const destroy = vi.fn(); + return { + create: vi.fn().mockResolvedValue({ destroy }), + params: vi.fn().mockResolvedValue({}), + }; +} + describe("WebBrowserProvider.inferCapabilities", () => { - const provider = new WebBrowserProvider(WEB_BROWSER_RUN_FNS); + // Reset the module-level probe cache so each `new WebBrowserProvider` + // can drive a fresh probe with its injected factory. + afterEach(() => { + probe._resetProbeCache(); + }); - it("trusts declared capabilities", () => { + it("trusts declared capabilities (probe-independent)", () => { + const provider = new WebBrowserProvider(WEB_BROWSER_RUN_FNS); const caps = provider.inferCapabilities(model("anything", ["text.translation"])); expect(caps).toEqual(["text.translation"]); }); - it("infers text-gen + json-mode + tool-use for chrome-prompt / gemini-nano", () => { + it("conservative pre-probe: drops json-mode and tool-use for chrome-prompt", () => { + // Probe is async — until it resolves, the provider must NOT advertise + // json-mode or tool-use, since the underlying API might not support them. 
+ const provider = new WebBrowserProvider(WEB_BROWSER_RUN_FNS); + const caps = provider.inferCapabilities(model("chrome-prompt")); + expect(caps).toContain("text.generation"); + expect(caps).not.toContain("json-mode"); + expect(caps).not.toContain("tool-use"); + }); + + it("post-probe: adds json-mode + tool-use when supported", async () => { + const factory = makeAcceptingProbeFactory(); + const provider = new WebBrowserProvider(WEB_BROWSER_RUN_FNS, undefined, factory); + await provider.ready(); const caps = provider.inferCapabilities(model("chrome-prompt")); expect(caps).toContain("text.generation"); expect(caps).toContain("json-mode"); @@ -50,27 +99,32 @@ describe("WebBrowserProvider.inferCapabilities", () => { }); it("infers text.summary for summarizer model", () => { + const provider = new WebBrowserProvider(WEB_BROWSER_RUN_FNS); const caps = provider.inferCapabilities(model("chrome-summarizer")); expect(caps).toContain("text.summary"); expect(caps).not.toContain("text.generation"); }); it("infers text.rewriter for rewriter model", () => { + const provider = new WebBrowserProvider(WEB_BROWSER_RUN_FNS); const caps = provider.inferCapabilities(model("chrome-rewriter")); expect(caps).toContain("text.rewriter"); }); it("infers text.translation for translator model", () => { + const provider = new WebBrowserProvider(WEB_BROWSER_RUN_FNS); const caps = provider.inferCapabilities(model("chrome-translator")); expect(caps).toContain("text.translation"); }); it("infers text.language-detection for language-detector model", () => { + const provider = new WebBrowserProvider(WEB_BROWSER_RUN_FNS); const caps = provider.inferCapabilities(model("chrome-language-detector")); expect(caps).toContain("text.language-detection"); }); it("returns baseline meta-ops for unknown ids", () => { + const provider = new WebBrowserProvider(WEB_BROWSER_RUN_FNS); const caps = provider.inferCapabilities(model("unknown-id")); expect(caps).toEqual(["model.search", "model.info"]); }); @@ -351,3 
+405,690 @@ describe("WebBrowser_ChatHistory helpers", () => { expect(chatHistory.buildInitialPromptsFromHistory([])).toEqual([]); }); }); + +// -------------------------------------------------------------------------- +// Capability probe +// -------------------------------------------------------------------------- + +/** + * Fake factory whose two `create()` codepaths can be independently controlled + * — pass `jsonModeOk: false` to reject when `responseConstraint` is passed, + * `toolUseOk: false` to reject when `tools` is passed. Records the total + * number of `create()` invocations so we can assert coalescing behavior. + */ +// eslint-disable-next-line @typescript-eslint/no-explicit-any +function makeProbeFactory(opts: { jsonModeOk: boolean; toolUseOk: boolean }): any { + let destroys = 0; + const create = vi.fn(async (options?: Record) => { + if (options && "responseConstraint" in options && !opts.jsonModeOk) { + throw new Error("responseConstraint not supported"); + } + if (options && "tools" in options && !opts.toolUseOk) { + throw new Error("tools not supported"); + } + return { + destroy: (): void => { + destroys += 1; + }, + }; + }); + return { create, params: vi.fn(), destroyCount: () => destroys }; +} + +describe("probeWebBrowserCapabilities", () => { + // Each test injects its own factory; clear the cached coalesced promise + // so they don't share results. 
+ afterEach(() => { + probe._resetProbeCache(); + }); + + it("both true when factory accepts both responseConstraint and tools", async () => { + const f = makeProbeFactory({ jsonModeOk: true, toolUseOk: true }); + const result = await probe.probeWebBrowserCapabilities(f); + expect(result).toEqual({ jsonMode: true, toolUse: true }); + }); + + it("jsonMode false when factory rejects responseConstraint", async () => { + const f = makeProbeFactory({ jsonModeOk: false, toolUseOk: true }); + const result = await probe.probeWebBrowserCapabilities(f); + expect(result).toEqual({ jsonMode: false, toolUse: true }); + }); + + it("toolUse false when factory rejects tools", async () => { + const f = makeProbeFactory({ jsonModeOk: true, toolUseOk: false }); + const result = await probe.probeWebBrowserCapabilities(f); + expect(result).toEqual({ jsonMode: true, toolUse: false }); + }); + + it("both false when factory rejects both", async () => { + const f = makeProbeFactory({ jsonModeOk: false, toolUseOk: false }); + const result = await probe.probeWebBrowserCapabilities(f); + expect(result).toEqual({ jsonMode: false, toolUse: false }); + }); + + it("coalesces concurrent calls into a single probe", async () => { + const f = makeProbeFactory({ jsonModeOk: true, toolUseOk: true }); + // Fire N concurrent probes through the public surface. They should all + // share the same in-flight promise and trigger at most the same set of + // create() calls a single probe would (one per feature, not N). + const results = await Promise.all( + Array.from({ length: 5 }, () => probe.probeWebBrowserCapabilities(f)) + ); + expect(results.every((r) => r.jsonMode && r.toolUse)).toBe(true); + // The probe issues exactly two create() calls — one for json-mode, one + // for tool-use. Concurrent callers must coalesce, not multiply. 
+ expect(f.create).toHaveBeenCalledTimes(2); + }); + + it("provider.ready() reflects the probe result", async () => { + const f = makeProbeFactory({ jsonModeOk: true, toolUseOk: false }); + const provider = new WebBrowserProvider(WEB_BROWSER_RUN_FNS, undefined, f); + // Pre-ready: conservative subset for chrome-prompt. + const preCaps = provider.inferCapabilities(model("chrome-prompt")); + expect(preCaps).not.toContain("json-mode"); + expect(preCaps).not.toContain("tool-use"); + await provider.ready(); + // Post-ready: json-mode appears, tool-use stays gated. + const postCaps = provider.inferCapabilities(model("chrome-prompt")); + expect(postCaps).toContain("json-mode"); + expect(postCaps).not.toContain("tool-use"); + }); +}); + +// -------------------------------------------------------------------------- +// StructuredGeneration session cache (H1) +// -------------------------------------------------------------------------- + +/** + * Install a fake `LanguageModel` global so the run-fn's `getApi` / + * `ensureAvailable` checks pass. Returns a teardown. + */ +function installLanguageModelGlobal(impl: unknown): () => void { + const prior = (globalThis as Record).LanguageModel; + (globalThis as Record).LanguageModel = impl; + return () => { + if (prior === undefined) { + delete (globalThis as Record).LanguageModel; + } else { + (globalThis as Record).LanguageModel = prior; + } + }; +} + +/** + * Fake `LanguageModel` factory + session that streams a single chunk of + * pre-canned text. `text` is the full JSON payload returned by the + * model's "response" in one snapshot — sufficient for our parse pipeline + * because Chrome's stream surface emits progressive snapshots. 
+ */ +// eslint-disable-next-line @typescript-eslint/no-explicit-any +function makeFakeLanguageModel(text: string | (() => string)): any { + let destroyed = 0; + const factory = { + availability: vi.fn().mockResolvedValue("available"), + create: vi.fn(async () => ({ + promptStreaming: (_p: string, _o?: unknown) => { + const value = typeof text === "function" ? text() : text; + return new ReadableStream({ + start(controller) { + controller.enqueue(value); + controller.close(); + }, + }); + }, + destroy: () => { + destroyed += 1; + }, + })), + }; + return { factory, destroyed: () => destroyed }; +} + +describe("WebBrowser_StructuredGeneration session cache", () => { + const schema = { + type: "object", + properties: { x: { type: "number" } }, + required: ["x"], + additionalProperties: false, + } as const; + const sid = "sg-test-1"; + + afterEach(() => { + sessions.deleteChromeSession(sid); + }); + + it("first call with sessionId seeds the cache", async () => { + const { factory } = makeFakeLanguageModel('{"x":1}'); + const restore = installLanguageModelGlobal(factory); + try { + const emit = vi.fn(); + await WebBrowser_StructuredGeneration( + asSGI({ prompt: "p", outputSchema: schema }), + undefined, + new AbortController().signal, + emit, + schema, + sid + ); + expect(sessions.getChromeSession(sid)).toBeDefined(); + expect(sessions.getChromeSession(sid)?.schemaFingerprint).toBeDefined(); + expect(factory.create).toHaveBeenCalledTimes(1); + } finally { + restore(); + } + }); + + it("second call with the same schema reuses the cached session", async () => { + const { factory } = makeFakeLanguageModel('{"x":1}'); + const restore = installLanguageModelGlobal(factory); + try { + const emit = vi.fn(); + await WebBrowser_StructuredGeneration( + asSGI({ prompt: "p1", outputSchema: schema }), + undefined, + new AbortController().signal, + emit, + schema, + sid + ); + await WebBrowser_StructuredGeneration( + asSGI({ prompt: "p2", outputSchema: schema }), + undefined, + new 
AbortController().signal, + emit, + schema, + sid + ); + // Only ONE create() — the second call reused the cached session. + expect(factory.create).toHaveBeenCalledTimes(1); + } finally { + restore(); + } + }); + + it("mismatched schema fingerprint forces rebuild", async () => { + const { factory } = makeFakeLanguageModel('{"x":1}'); + const restore = installLanguageModelGlobal(factory); + try { + const emit = vi.fn(); + await WebBrowser_StructuredGeneration( + asSGI({ prompt: "p1", outputSchema: schema }), + undefined, + new AbortController().signal, + emit, + schema, + sid + ); + const otherSchema = { + type: "object", + properties: { x: { type: "number" }, y: { type: "string" } }, + required: ["x"], + additionalProperties: false, + } as const; + await WebBrowser_StructuredGeneration( + asSGI({ prompt: "p2", outputSchema: otherSchema }), + undefined, + new AbortController().signal, + emit, + // streaming text is a valid `{x:1}` which satisfies otherSchema too + otherSchema, + sid + ); + // Two creates — schema fingerprint mismatch invalidated the cache. + expect(factory.create).toHaveBeenCalledTimes(2); + } finally { + restore(); + } + }); +}); + +describe("WebBrowser_StructuredGeneration cache poisoning", () => { + const schema = { + type: "object", + properties: { x: { type: "number" } }, + required: ["x"], + additionalProperties: false, + } as const; + const sid = "sg-poison-1"; + + afterEach(() => { + sessions.deleteChromeSession(sid); + }); + + it("drops the cache entry when a follow-up turn throws on parse failure", async () => { + // First call seeds a cache with parseable output; second call streams + // garbage so JSON.parse and parsePartialJson both fail. The run-fn + // must throw and clear the cache entry so the next attempt rebuilds. + let seq = 0; + const seqText = (): string => { + seq += 1; + return seq === 1 ? 
'{"x":1}' : "not json {"; + }; + const { factory } = makeFakeLanguageModel(seqText); + const restore = installLanguageModelGlobal(factory); + try { + const emit = vi.fn(); + await WebBrowser_StructuredGeneration( + asSGI({ prompt: "p1", outputSchema: schema }), + undefined, + new AbortController().signal, + emit, + schema, + sid + ); + expect(sessions.getChromeSession(sid)).toBeDefined(); + await expect( + WebBrowser_StructuredGeneration( + asSGI({ prompt: "p2", outputSchema: schema }), + undefined, + new AbortController().signal, + emit, + schema, + sid + ) + ).rejects.toThrow(/unparseable|validation/i); + // Entry is dropped after the failed turn. + expect(sessions.getChromeSession(sid)).toBeUndefined(); + } finally { + restore(); + } + }); +}); + +// -------------------------------------------------------------------------- +// StructuredGeneration final-JSON validation (H4) +// -------------------------------------------------------------------------- + +describe("WebBrowser_StructuredGeneration validation", () => { + const schema = { + type: "object", + properties: { x: { type: "number" } }, + required: ["x"], + additionalProperties: false, + } as const; + + it("emits finish on valid JSON that satisfies the schema", async () => { + const { factory } = makeFakeLanguageModel('{"x":1}'); + const restore = installLanguageModelGlobal(factory); + try { + const events: unknown[] = []; + const emit = (e: unknown): void => { + events.push(e); + }; + await WebBrowser_StructuredGeneration( + asSGI({ prompt: "p", outputSchema: schema }), + undefined, + new AbortController().signal, + emit, + schema + ); + const finish = events.find((e) => (e as { type?: string }).type === "finish") as + | { data: { object: { x: number } } } + | undefined; + expect(finish).toBeDefined(); + expect(finish?.data.object).toEqual({ x: 1 }); + } finally { + restore(); + } + }); + + it("throws PermanentJobError on unparseable JSON, no finish emitted", async () => { + const { factory } = 
makeFakeLanguageModel("definitely not json"); + const restore = installLanguageModelGlobal(factory); + try { + const events: unknown[] = []; + const emit = (e: unknown): void => { + events.push(e); + }; + await expect( + WebBrowser_StructuredGeneration( + asSGI({ prompt: "p", outputSchema: schema }), + undefined, + new AbortController().signal, + emit, + schema + ) + ).rejects.toThrow(/unparseable/i); + expect(events.some((e) => (e as { type?: string }).type === "finish")).toBe(false); + } finally { + restore(); + } + }); + + it("throws PermanentJobError when parsed object fails schema validation", async () => { + // Parses fine but `x` is a string, not a number — fails the schema. + const { factory } = makeFakeLanguageModel('{"x":"oops"}'); + const restore = installLanguageModelGlobal(factory); + try { + const events: unknown[] = []; + const emit = (e: unknown): void => { + events.push(e); + }; + await expect( + WebBrowser_StructuredGeneration( + asSGI({ prompt: "p", outputSchema: schema }), + undefined, + new AbortController().signal, + emit, + schema + ) + ).rejects.toThrow(/schema validation/i); + expect(events.some((e) => (e as { type?: string }).type === "finish")).toBe(false); + } finally { + restore(); + } + }); +}); + +// -------------------------------------------------------------------------- +// ToolCalling session cache (H2) +// -------------------------------------------------------------------------- + +/** + * Fake `LanguageModel` for tool-calling tests. The session's + * `promptStreaming` immediately invokes each declared tool's `execute` + * callback so the run-fn captures the tool calls, then closes the stream. + * + * `callsBy[toolName]` supplies args for each capture; if omitted defaults + * to `{}`. 
+ */ +// eslint-disable-next-line @typescript-eslint/no-explicit-any +function makeFakeToolCallingModel(callsBy: Record = {}): any { + const factory = { + availability: vi.fn().mockResolvedValue("available"), + create: vi.fn( + async (options?: { + tools?: Array<{ name: string; execute: (...args: unknown[]) => Promise }>; + }) => { + const tools = options?.tools ?? []; + return { + promptStreaming: () => + new ReadableStream({ + async start(controller) { + for (const t of tools) { + if (t.name === "_probe") continue; // probe tool ignored here + const args = callsBy[t.name] ?? {}; + await t.execute(args); + } + controller.close(); + }, + }), + destroy: vi.fn(), + }; + } + ), + }; + return { factory }; +} + +describe("WebBrowser_ToolCalling session cache", () => { + const sid = "tc-test-1"; + const toolA: ToolDefinition = { + name: "tool_a", + description: "tool a", + inputSchema: { type: "object", properties: {}, additionalProperties: true }, + }; + const toolB: ToolDefinition = { + name: "tool_b", + description: "tool b", + inputSchema: { type: "object", properties: {}, additionalProperties: true }, + }; + + afterEach(() => { + sessions.deleteChromeSession(sid); + }); + + it("reuses cache when sessionId + messages + tool set match", async () => { + const { factory } = makeFakeToolCallingModel(); + const restore = installLanguageModelGlobal(factory); + try { + const emit = vi.fn(); + const messages: ChatMessage[] = [ + { role: "user", content: [{ type: "text", text: "do it" }] }, + ]; + await WebBrowser_ToolCalling( + asTCI({ prompt: "", tools: [toolA, toolB], messages }), + undefined, + new AbortController().signal, + emit, + undefined, + sid + ); + const messages2: ChatMessage[] = [ + ...messages, + { role: "assistant", content: [{ type: "text", text: "ok" }] }, + { role: "user", content: [{ type: "text", text: "again" }] }, + ]; + await WebBrowser_ToolCalling( + asTCI({ prompt: "", tools: [toolA, toolB], messages: messages2 }), + undefined, + new 
AbortController().signal, + emit, + undefined, + sid + ); + // Same tool set, same conversation thread → cache reuse, one create(). + expect(factory.create).toHaveBeenCalledTimes(1); + } finally { + restore(); + } + }); + + it("rebuilds when the tool set changes", async () => { + const { factory } = makeFakeToolCallingModel(); + const restore = installLanguageModelGlobal(factory); + try { + const emit = vi.fn(); + const messages: ChatMessage[] = [ + { role: "user", content: [{ type: "text", text: "do it" }] }, + ]; + await WebBrowser_ToolCalling( + asTCI({ prompt: "", tools: [toolA], messages }), + undefined, + new AbortController().signal, + emit, + undefined, + sid + ); + const messages2: ChatMessage[] = [ + ...messages, + { role: "assistant", content: [{ type: "text", text: "ok" }] }, + { role: "user", content: [{ type: "text", text: "again" }] }, + ]; + await WebBrowser_ToolCalling( + asTCI({ prompt: "", tools: [toolA, toolB], messages: messages2 }), + undefined, + new AbortController().signal, + emit, + undefined, + sid + ); + // Different fingerprint → cache invalidated, two creates. + expect(factory.create).toHaveBeenCalledTimes(2); + } finally { + restore(); + } + }); + + it("drops + destroys the cache entry on prompt failure", async () => { + // Sequenced session: first promptStreaming() returns a clean close, + // second errors. Same session handle returned from both create() calls + // (cache reuse exercises the same `session` object). 
+ let promptCount = 0; + const sessionImpl = { + promptStreaming: (): ReadableStream => + new ReadableStream({ + start(controller) { + promptCount += 1; + if (promptCount === 1) { + controller.close(); + } else { + controller.error(new Error("boom")); + } + }, + }), + destroy: vi.fn(), + }; + const factory = { + availability: vi.fn().mockResolvedValue("available"), + create: vi.fn(async () => sessionImpl), + }; + const restore = installLanguageModelGlobal(factory); + try { + const messages: ChatMessage[] = [ + { role: "user", content: [{ type: "text", text: "do it" }] }, + ]; + const emit = vi.fn(); + // First turn seeds the cache successfully. + await WebBrowser_ToolCalling( + asTCI({ prompt: "", tools: [toolA], messages }), + undefined, + new AbortController().signal, + emit, + undefined, + sid + ); + expect(sessions.getChromeSession(sid)).toBeDefined(); + // Second turn reuses the cached session whose stream now errors. + const messages2: ChatMessage[] = [ + ...messages, + { role: "assistant", content: [{ type: "text", text: "ok" }] }, + { role: "user", content: [{ type: "text", text: "again" }] }, + ]; + await expect( + WebBrowser_ToolCalling( + asTCI({ prompt: "", tools: [toolA], messages: messages2 }), + undefined, + new AbortController().signal, + emit, + undefined, + sid + ) + ).rejects.toThrow(/boom/); + // Cache cleaned up. 
+ expect(sessions.getChromeSession(sid)).toBeUndefined(); + } finally { + restore(); + } + }); +}); + +// -------------------------------------------------------------------------- +// ToolCalling argument validation (H3) +// -------------------------------------------------------------------------- + +describe("WebBrowser_ToolCalling argument validation", () => { + const strictTool: ToolDefinition = { + name: "echo", + description: "echo", + inputSchema: { + type: "object", + properties: { text: { type: "string" } }, + required: ["text"], + additionalProperties: false, + }, + }; + + it("passes through calls whose args satisfy the inputSchema", async () => { + const { factory } = makeFakeToolCallingModel({ echo: { text: "hello" } }); + const restore = installLanguageModelGlobal(factory); + try { + const events: Array<{ type: string; port?: string; objectDelta?: unknown }> = []; + const emit = (e: unknown): void => { + events.push(e as { type: string; port?: string; objectDelta?: unknown }); + }; + await WebBrowser_ToolCalling( + asTCI({ prompt: "go", tools: [strictTool] }), + undefined, + new AbortController().signal, + emit + ); + const tcEvent = events.find((e) => e.type === "object-delta" && e.port === "toolCalls"); + expect(tcEvent).toBeDefined(); + const calls = (tcEvent?.objectDelta as Array<{ name: string; input: unknown }>) ?? []; + expect(calls).toHaveLength(1); + expect(calls[0]?.input).toEqual({ text: "hello" }); + } finally { + restore(); + } + }); + + it("drops calls missing a required field", async () => { + // `text` is required but omitted. 
+ const { factory } = makeFakeToolCallingModel({ echo: {} }); + const restore = installLanguageModelGlobal(factory); + try { + const events: Array<{ type: string; port?: string }> = []; + const emit = (e: unknown): void => { + events.push(e as { type: string; port?: string }); + }; + await WebBrowser_ToolCalling( + asTCI({ prompt: "go", tools: [strictTool] }), + undefined, + new AbortController().signal, + emit + ); + // No toolCalls event since the only call was dropped. + expect(events.some((e) => e.type === "object-delta" && e.port === "toolCalls")).toBe(false); + } finally { + restore(); + } + }); + + it("drops calls with a wrong-typed field", async () => { + // `text` must be string; passing a number fails validation. + const { factory } = makeFakeToolCallingModel({ echo: { text: 42 } }); + const restore = installLanguageModelGlobal(factory); + try { + const events: Array<{ type: string; port?: string }> = []; + const emit = (e: unknown): void => { + events.push(e as { type: string; port?: string }); + }; + await WebBrowser_ToolCalling( + asTCI({ prompt: "go", tools: [strictTool] }), + undefined, + new AbortController().signal, + emit + ); + expect(events.some((e) => e.type === "object-delta" && e.port === "toolCalls")).toBe(false); + } finally { + restore(); + } + }); + + it("falls through to name-check when inputSchema fails to compile", async () => { + // A schema that compileSchema can't handle. The malformed-schema tool + // should still see its call pass through (no crash, no validation), and + // hallucinated names still get filtered. + const malformedTool = { + name: "loose", + description: "loose", + // Garbage schema — type is invalid. 
+ inputSchema: { type: "not_a_real_type" } as unknown, + } as { name: string; description: string; inputSchema: unknown }; + const { factory } = makeFakeToolCallingModel({ loose: { anything: 1 } }); + const restore = installLanguageModelGlobal(factory); + try { + const events: Array<{ type: string; port?: string; objectDelta?: unknown }> = []; + const emit = (e: unknown): void => { + events.push(e as { type: string; port?: string; objectDelta?: unknown }); + }; + await WebBrowser_ToolCalling( + asTCI({ + prompt: "go", + tools: [malformedTool as unknown as typeof strictTool], + }), + undefined, + new AbortController().signal, + emit + ); + // Either the schema compiled and validation passed (loose schema), + // or it failed to compile and the call fell through unchanged. + // Either way, no crash, and we see the tool call event. + const tcEvent = events.find((e) => e.type === "object-delta" && e.port === "toolCalls"); + expect(tcEvent).toBeDefined(); + } finally { + restore(); + } + }); +}); diff --git a/providers/chrome-ai/src/ai/WebBrowserProvider.ts b/providers/chrome-ai/src/ai/WebBrowserProvider.ts index 38f9f17ef..ebb2ca015 100644 --- a/providers/chrome-ai/src/ai/WebBrowserProvider.ts +++ b/providers/chrome-ai/src/ai/WebBrowserProvider.ts @@ -12,9 +12,15 @@ import type { } from "@workglow/ai/worker"; import { AiProvider } from "@workglow/ai/worker"; import { + CONSERVATIVE_PROBED_CAPABILITIES, inferWebBrowserCapabilities, webBrowserWorkerRunFnSpecs, } from "./common/WebBrowser_Capabilities"; +import { + probeWebBrowserCapabilities, + type WebBrowserProbeFactory, + type WebBrowserProbedCapabilities, +} from "./common/WebBrowser_CapabilityProbe"; import { WEB_BROWSER } from "./common/WebBrowser_Constants"; import type { WebBrowserModelConfig } from "./common/WebBrowser_ModelSchema"; import { deleteChromeSession } from "./common/WebBrowser_Sessions"; @@ -32,6 +38,15 @@ export class WebBrowserProvider extends AiProvider { readonly isLocal = true; readonly 
supportsBrowser = true; + /** + * Result of {@link probeWebBrowserCapabilities}. Until the probe resolves + * we report the conservative subset (no `json-mode`, no `tool-use`) so we + * never advertise a capability a downstream task can't fulfil. Callers + * that need the final answer should await {@link ready}. + */ + private probedCaps: WebBrowserProbedCapabilities = CONSERVATIVE_PROBED_CAPABILITIES; + private readonly probeReady: Promise; + constructor( promiseRunFns?: readonly AiProviderRunFnRegistration< // eslint-disable-next-line @typescript-eslint/no-explicit-any @@ -44,13 +59,31 @@ export class WebBrowserProvider extends AiProvider { string, // eslint-disable-next-line @typescript-eslint/no-explicit-any AiProviderPreviewRunFn - > + >, + /** + * Test seam: injectable probe factory. Production callers leave this + * undefined so the probe resolves against the real `LanguageModel` + * global. + */ + probeFactory?: WebBrowserProbeFactory ) { super(promiseRunFns, previewTasks); + this.probeReady = probeWebBrowserCapabilities(probeFactory).then((result) => { + this.probedCaps = result; + }); + } + + /** + * Resolves once the capability probe has completed. After this point + * {@link inferCapabilities} reflects what the browser actually supports. + * Before this point it returns the conservative subset. 
+ */ + ready(): Promise { + return this.probeReady; } override inferCapabilities(model: ModelRecord): readonly Capability[] { - return inferWebBrowserCapabilities(model); + return inferWebBrowserCapabilities(model, this.probedCaps); } protected override workerRunFnSpecs(): readonly { serves: readonly Capability[] }[] { diff --git a/providers/chrome-ai/src/ai/common/WebBrowser_Capabilities.ts b/providers/chrome-ai/src/ai/common/WebBrowser_Capabilities.ts index c68810297..6000339c5 100644 --- a/providers/chrome-ai/src/ai/common/WebBrowser_Capabilities.ts +++ b/providers/chrome-ai/src/ai/common/WebBrowser_Capabilities.ts @@ -5,6 +5,12 @@ */ import type { Capability, ModelRecord } from "@workglow/ai/worker"; +import { + CONSERVATIVE_PROBED_CAPABILITIES, + probeWebBrowserCapabilities, + type WebBrowserProbeFactory, + type WebBrowserProbedCapabilities, +} from "./WebBrowser_CapabilityProbe"; import { WEB_BROWSER_CAPABILITY_SETS } from "./WebBrowser_CapabilitySets"; export const WEB_BROWSER_RUN_FN_SPECS = WEB_BROWSER_CAPABILITY_SETS.map((serves) => ({ serves })); @@ -17,6 +23,20 @@ export function webBrowserWorkerRunFnSpecs(): readonly { type CapabilityHints = Pick; +/** + * Capabilities the `chrome-prompt`/`gemini-nano` family advertises + * *unconditionally*. `json-mode` and `tool-use` are gated separately because + * the `responseConstraint` and `tools` options on `LanguageModel.create` / + * `prompt` aren't universally supported across Chrome builds and channels. + */ +const PROMPT_BASE_CAPABILITIES = [ + "text.generation", + "text.rewriter", + "text.summary", + "model.info", + "model.search", +] as const satisfies readonly Capability[]; + /** * Heuristic capability inference for Chrome Built-in AI {@link ModelRecord}. 
* @@ -24,8 +44,20 @@ type CapabilityHints = Pick 0) return declared; @@ -37,15 +69,14 @@ export function inferWebBrowserCapabilities(model: CapabilityHints): readonly Ca const baseName = id.toLowerCase(); if (/prompt|gemini[-_]?nano/.test(baseName)) { - return [ - "text.generation", - "json-mode", - "tool-use", - "text.rewriter", - "text.summary", - "model.info", - "model.search", - ]; + const caps: Capability[] = [...PROMPT_BASE_CAPABILITIES]; + if (probed.jsonMode) caps.splice(1, 0, "json-mode"); + if (probed.toolUse) { + // Insert tool-use after json-mode (if present) for stable test ordering. + const insertAt = probed.jsonMode ? 2 : 1; + caps.splice(insertAt, 0, "tool-use"); + } + return caps; } if (/summariz/.test(baseName)) { return ["text.summary", "model.info", "model.search"]; @@ -62,3 +93,19 @@ export function inferWebBrowserCapabilities(model: CapabilityHints): readonly Ca return ["model.search", "model.info"]; } + +/** + * Probe-driven variant of {@link inferWebBrowserCapabilities}. Resolves the + * probed capability set (cached) before returning, so the result reflects + * the real browser surface rather than assuming both `json-mode` and + * `tool-use` are present. 
+ */ +export async function inferWebBrowserCapabilitiesAsync( + model: CapabilityHints, + factory?: WebBrowserProbeFactory +): Promise { + const probed = await probeWebBrowserCapabilities(factory); + return inferWebBrowserCapabilities(model, probed); +} + +export { CONSERVATIVE_PROBED_CAPABILITIES }; diff --git a/providers/chrome-ai/src/ai/common/WebBrowser_CapabilityProbe.ts b/providers/chrome-ai/src/ai/common/WebBrowser_CapabilityProbe.ts new file mode 100644 index 000000000..08dae9be4 --- /dev/null +++ b/providers/chrome-ai/src/ai/common/WebBrowser_CapabilityProbe.ts @@ -0,0 +1,152 @@ +/** + * @license + * Copyright 2026 Steven Roussey + * SPDX-License-Identifier: Apache-2.0 + */ + +import { getApi } from "./WebBrowser_ChromeHelpers"; + +/** + * Result of probing Chrome Built-in AI's `LanguageModel` for the optional + * capabilities our run-fns rely on. Chrome's surface evolves and some flags + * (json-mode via `responseConstraint`, tool-use via `tools`) are not + * universally available — feature detection is the only reliable signal. + */ +export interface WebBrowserProbedCapabilities { + readonly jsonMode: boolean; + readonly toolUse: boolean; +} + +/** + * Minimal subset of the `LanguageModel` static surface the probe touches. + * Declared here so tests can pass a fake factory without depending on the + * `@types/dom-chromium-ai` ambient globals. + */ +export interface WebBrowserProbeFactory { + create(options?: unknown): Promise<{ destroy(): unknown }>; + params?(): Promise; +} + +/** + * Default conservative probe result. Used when the `LanguageModel` global + * is absent (e.g. running outside Chrome) so callers can still proceed — + * just without `json-mode` / `tool-use` capabilities exposed. 
/**
 * Default conservative probe result. Used when the `LanguageModel` global
 * is absent (e.g. running outside Chrome) so callers can still proceed —
 * just without `json-mode` / `tool-use` capabilities exposed.
 */
export const CONSERVATIVE_PROBED_CAPABILITIES: WebBrowserProbedCapabilities = Object.freeze({
  jsonMode: false,
  toolUse: false,
});

/**
 * Module-level coalescing slot so concurrent `probeWebBrowserCapabilities()`
 * callers share the same in-flight promise. Cleared via {@link _resetProbeCache}
 * for tests only.
 */
let probePromise: Promise<WebBrowserProbedCapabilities> | undefined;

/**
 * Probe the running browser for `json-mode` and `tool-use` support on the
 * `LanguageModel` API. Results are cached at module level; subsequent calls
 * (and concurrent calls) return the same promise so we only pay for one set
 * of create/destroy cycles per page load.
 *
 * The probe is intentionally conservative: any rejection from `factory.create`
 * is interpreted as "not supported" rather than letting the exception
 * propagate, because the alternative — surfacing transient failures into
 * capability inference — would flip declared capabilities mid-session.
 *
 * Both probes immediately `destroy()` the smoke-tested session so we don't
 * keep a model loaded just to satisfy feature detection.
 *
 * @param factory Optional injected factory for tests. Defaults to the real
 *   `LanguageModel` global when present. Only the first call's factory is
 *   consulted: once a probe promise is cached, later calls return it as-is
 *   and their `factory` argument is ignored.
 * @returns The cached probe result. When no factory is injected and `getApi`
 *   throws, this resolves to {@link CONSERVATIVE_PROBED_CAPABILITIES}.
 */
export function probeWebBrowserCapabilities(
  factory?: WebBrowserProbeFactory
): Promise<WebBrowserProbedCapabilities> {
  if (probePromise) return probePromise;

  probePromise = (async (): Promise<WebBrowserProbedCapabilities> => {
    let resolvedFactory: WebBrowserProbeFactory | undefined = factory;
    if (!resolvedFactory) {
      // Lazy-resolve the real global through getApi which surfaces a
      // consistent error if `LanguageModel` is missing. We catch and treat
      // "missing" as "no capabilities".
      try {
        // The ambient `LanguageModel` global may be undefined outside Chrome.
        // eslint-disable-next-line @typescript-eslint/no-explicit-any
        const lm = (
          typeof (globalThis as any).LanguageModel !== "undefined"
            ? // eslint-disable-next-line @typescript-eslint/no-explicit-any
              (globalThis as any).LanguageModel
            : undefined
        ) as WebBrowserProbeFactory | undefined;
        resolvedFactory = getApi("LanguageModel", lm);
      } catch {
        return CONSERVATIVE_PROBED_CAPABILITIES;
      }
    }

    // Prefer `LanguageModel.params()` if exposed: it's the cheapest signal.
    // Today the spec params surface doesn't actually report json/tool flags
    // (only topK/temperature) so this is a forward-compat hook. If the
    // method exists and resolves we still fall through to smoke-tests for
    // the actual feature gates.
    if (typeof resolvedFactory.params === "function") {
      try {
        await resolvedFactory.params();
      } catch {
        // Non-fatal — params() restricted to extensions and may reject on web.
      }
    }

    // The two smoke tests run sequentially so at most one throwaway session
    // exists at a time.
    //
    // NOTE(review): the published Prompt API lists `responseConstraint` as a
    // `prompt()` / `promptStreaming()` option rather than a `create()` option.
    // If `create()` ignores unknown members, this json-mode smoke test would
    // succeed on every build that can create a session — confirm against the
    // Chrome versions being targeted.
    const jsonMode = await probeOption(resolvedFactory, {
      responseConstraint: { type: "object" },
    });
    const toolUse = await probeOption(resolvedFactory, {
      tools: [
        {
          name: "_probe",
          description: "",
          inputSchema: { type: "object" },
          execute: async (): Promise<string> => "",
        },
      ],
    });

    return { jsonMode, toolUse };
  })();

  return probePromise;
}

/**
 * Issue a smoke-test `factory.create(options)`. Any rejection (or synchronous
 * throw) means the option is unsupported in this Chrome build. On success we
 * immediately `destroy()` the session — its only purpose was to confirm
 * acceptance.
 *
 * `destroy()` is typed as returning `unknown`, so it may hand back a promise.
 * It is awaited inside the best-effort guard so that an asynchronous teardown
 * failure is swallowed the same way a synchronous one is, instead of
 * surfacing as an unhandled rejection.
 *
 * @param factory Factory whose `create()` is being smoke-tested.
 * @param options Creation options carrying the feature under test.
 * @returns `true` when `create(options)` resolved, `false` when it failed.
 */
async function probeOption(
  factory: WebBrowserProbeFactory,
  options: Record<string, unknown>
): Promise<boolean> {
  let session: { destroy(): unknown };
  try {
    session = await factory.create(options);
  } catch {
    return false;
  }
  try {
    await session.destroy();
  } catch {
    // best-effort: destroy failures don't affect the probe outcome
  }
  return true;
}

/**
 * @internal Test-only escape hatch to clear the coalescing cache between
 * test cases. Production code never resets the probe — the result is
 * stable for the lifetime of the page.
 */
+ */ +export function _resetProbeCache(): void { + probePromise = undefined; +} diff --git a/providers/chrome-ai/src/ai/common/WebBrowser_Sessions.ts b/providers/chrome-ai/src/ai/common/WebBrowser_Sessions.ts index 98d2cebbc..026acb29e 100644 --- a/providers/chrome-ai/src/ai/common/WebBrowser_Sessions.ts +++ b/providers/chrome-ai/src/ai/common/WebBrowser_Sessions.ts @@ -21,6 +21,20 @@ export interface ChromeChatSessionState { readonly session: LanguageModel; readonly messageCount: number; + /** + * Stable fingerprint of the `outputSchema` the session was created for + * (StructuredGeneration runs). Reuse requires an exact match — a schema + * change forces a session rebuild because Chrome bakes the constraint + * into the session's response handling state. + */ + readonly schemaFingerprint?: string; + /** + * Stable fingerprint of the *sorted* tool name list the session was + * created with (ToolCalling runs). Tool-set changes invalidate the + * cached session because Chrome's tools are bound at `create()` time + * and can't be hot-swapped per turn. 
+ */ + readonly toolsFingerprint?: string; } const chromeSessions = new Map(); diff --git a/providers/chrome-ai/src/ai/common/WebBrowser_StructuredGeneration.ts b/providers/chrome-ai/src/ai/common/WebBrowser_StructuredGeneration.ts index a7ad55349..e529b824a 100644 --- a/providers/chrome-ai/src/ai/common/WebBrowser_StructuredGeneration.ts +++ b/providers/chrome-ai/src/ai/common/WebBrowser_StructuredGeneration.ts @@ -9,10 +9,48 @@ import type { StructuredGenerationTaskInput, StructuredGenerationTaskOutput, } from "@workglow/ai"; +import { PermanentJobError } from "@workglow/job-queue"; +import type { JsonSchema, SchemaNode } from "@workglow/util/schema"; +import { compileSchema } from "@workglow/util/schema"; import { parsePartialJson } from "@workglow/util/worker"; import { createDownloadMonitor, ensureAvailable, getApi } from "./WebBrowser_ChromeHelpers"; import type { WebBrowserModelConfig } from "./WebBrowser_ModelSchema"; +import { + deleteChromeSession, + dropChromeSessionEntry, + getChromeSession, + setChromeSession, +} from "./WebBrowser_Sessions"; + +/** + * Stable fingerprint of an `outputSchema` value, used to decide whether a + * cached Chrome session can be reused. The schema is canonicalised by + * sorting object keys before stringification so that semantically-equal + * schemas with differently-ordered properties produce the same fingerprint. + * + * Implementation note: we intentionally do not hash this. A medium-length + * JSON string is fine as a cache key — the cache lives in-memory, scoped + * to a session id, and turn-over is low. + */ +function schemaFingerprint(schema: object): string { + return canonicalStringify(schema); +} + +/** + * Recursively sorts object keys so `JSON.stringify` produces a stable + * representation independent of insertion order. Arrays preserve order + * (semantically meaningful in JSON Schema for e.g. `oneOf`/`enum`). 
/**
 * Serialises `value` to JSON text with object keys emitted in sorted order,
 * so the result is independent of property insertion order. Arrays preserve
 * order (semantically meaningful in JSON Schema for e.g. `oneOf`/`enum`).
 *
 * Values JSON cannot represent are handled the way `JSON.stringify` handles
 * them, so that schemas which serialise identically also fingerprint
 * identically:
 * - object members whose value is `undefined`, a function, or a symbol are
 *   omitted;
 * - array elements of those kinds (and holes in sparse arrays) become `null`.
 *
 * Unlike `JSON.stringify`, a top-level unrepresentable value yields `"null"`
 * so the function always returns a string. `toJSON` methods are not
 * consulted, and a `bigint` still throws (from `JSON.stringify`).
 *
 * @param value Any JSON-like value.
 * @returns Canonical JSON text for `value`.
 */
function canonicalStringify(value: unknown): string {
  if (value === null || typeof value !== "object") {
    // JSON.stringify returns undefined (not a string) for undefined,
    // functions and symbols; normalise that to the JSON literal `null`.
    return JSON.stringify(value) ?? "null";
  }
  if (Array.isArray(value)) {
    // Array.from visits holes (as undefined), unlike Array.prototype.map.
    return `[${Array.from(value, (item) => canonicalStringify(item)).join(",")}]`;
  }
  const record = value as Record<string, unknown>;
  const entries: string[] = [];
  for (const key of Object.keys(record).sort()) {
    const member = record[key];
    if (member === undefined || typeof member === "function" || typeof member === "symbol") {
      continue; // dropped, matching JSON.stringify's treatment of object members
    }
    entries.push(`${JSON.stringify(key)}:${canonicalStringify(member)}`);
  }
  return `{${entries.join(",")}}`;
}

/**
 * Streaming run-fn for `["text.generation", "json-mode"]`.
 *
 * (diff hunk boundary — unchanged documentation lines not shown)
 *
 * `temperature` is `@deprecated` for non-extension contexts in the current
 * Chrome spec and silently ignored on the open web. Passed through anyway
 * so extension callers still get the knob.
 *
 * ## Session reuse
 *
 * When `sessionId` is provided we cache the underlying `LanguageModel`
 * keyed by it, mirroring `WebBrowser_Chat`. Sessions are reused by
 * `sessionId`; however, if the `outputSchema` changes (detected via
 * `schemaFingerprint`), we rebuild the Chrome session because
 * `responseConstraint` state is bound to the schema first used with that
 * session, and mixing schemas on a reused session is undefined behavior.
 *
 * ## Validation
 *
 * Chrome's `responseConstraint` is best-effort, not a hard guarantee.
 * After streaming we validate both that the final accumulated text parses
 * as JSON *and* that the parsed object satisfies `outputSchema`. Failures
 * raise {@link PermanentJobError} — `StructuredGenerationTask` runs us
 * inside a retry loop that catches per-attempt errors, so throwing here
 * is the correct way to mark this attempt failed without misleading
 * downstream consumers with a `finish` carrying garbage.
 */
*/ export const WebBrowser_StructuredGeneration: AiProviderRunFn< StructuredGenerationTaskInput, StructuredGenerationTaskOutput, WebBrowserModelConfig -> = async (input, _model, signal, emit, outputSchema) => { +> = async (input, _model, signal, emit, outputSchema, sessionId) => { const factory = getApi( "LanguageModel", typeof LanguageModel !== "undefined" ? LanguageModel : undefined @@ -47,14 +104,49 @@ export const WebBrowser_StructuredGeneration: AiProviderRunFn< const schema = (input.outputSchema ?? outputSchema) as object | undefined; if (!schema) { - throw new Error("WebBrowser_StructuredGeneration: outputSchema is required"); + throw new PermanentJobError("WebBrowser_StructuredGeneration: outputSchema is required"); + } + + // Compile validator up-front so a bad schema fails fast (cheap, ahead of + // any provider work). Re-thrown as PermanentJobError so the surrounding + // retry loop doesn't waste attempts on a malformed schema. + let validator: SchemaNode; + try { + validator = compileSchema(schema as JsonSchema); + } catch (err) { + const msg = err instanceof Error ? err.message : String(err); + throw new PermanentJobError(`WebBrowser_StructuredGeneration: invalid outputSchema — ${msg}`); } - const session = await factory.create({ - signal, - temperature: input.temperature ?? undefined, - monitor: createDownloadMonitor(emit), - }); + const fingerprint = schemaFingerprint(schema); + + // StructuredGeneration has no message history of its own — successive + // calls with the same `sessionId` are independent prompts. We reuse the + // cached session purely to amortise the cost of `LanguageModel.create()`, + // gating only on schema fingerprint. The watermark we record is a + // monotonic call counter so existing fingerprint-aware cache consumers + // (and future evolution toward true multi-turn structured-gen) keep a + // consistent shape with `WebBrowser_Chat`. + let cached = sessionId ? 
getChromeSession(sessionId) : undefined; + if (sessionId !== undefined && cached && cached.schemaFingerprint !== fingerprint) { + deleteChromeSession(sessionId); + cached = undefined; + } + const priorMessageCount = cached?.messageCount ?? 0; + + const usedCachedSession = cached !== undefined; + let session: LanguageModel; + if (cached) { + session = cached.session; + } else { + session = await factory.create({ + signal, + temperature: input.temperature ?? undefined, + monitor: createDownloadMonitor(emit), + }); + } + + let cacheWritten = false; try { const stream = session.promptStreaming(input.prompt, { signal, @@ -87,17 +179,57 @@ export const WebBrowser_StructuredGeneration: AiProviderRunFn< reader.releaseLock(); } + // Validate the *final* output. `responseConstraint` is best-effort on + // Chrome — if the model produces an unparseable continuation or a + // shape mismatch, we surface a permanent (per-attempt) error rather + // than fabricate a `{}` result that downstream code can't distinguish + // from a legitimate empty object. let finalObject: Record; try { finalObject = JSON.parse(accumulatedJson) as Record; } catch { - finalObject = (parsePartialJson(accumulatedJson) ?? {}) as Record; + const partial = parsePartialJson(accumulatedJson); + if (partial === undefined) { + throw new PermanentJobError("Chrome AI returned unparseable JSON"); + } + finalObject = partial as Record; + } + + const validation = validator.validate(finalObject); + if (!validation.valid) { + const firstError = validation.errors[0]; + const detail = firstError?.message ?? "unknown validation error"; + throw new PermanentJobError(`Chrome AI output failed schema validation: ${detail}`); + } + + if (sessionId !== undefined) { + // Ownership of `session` transfers to the cache; the provider's + // `disposeSession` reclaims it at end of run. 
+ setChromeSession(sessionId, { + session, + messageCount: priorMessageCount + 1, + schemaFingerprint: fingerprint, + }); + cacheWritten = true; } emit({ type: "finish", data: { object: finalObject } as StructuredGenerationTaskOutput, }); } finally { - session.destroy(); + // Mirror WebBrowser_Chat's cache-poison handling. If we threw before + // writing the cache entry and we reused a cached session, the cache + // entry is now poisoned (partial state); drop it (only if it still + // points at our handle, to avoid trampling a replacement) and destroy. + if (!cacheWritten) { + if (sessionId !== undefined && usedCachedSession) { + dropChromeSessionEntry(sessionId, session); + } + try { + session.destroy(); + } catch { + // best-effort + } + } } }; diff --git a/providers/chrome-ai/src/ai/common/WebBrowser_ToolCalling.ts b/providers/chrome-ai/src/ai/common/WebBrowser_ToolCalling.ts index 25975f7b4..83a6d87b8 100644 --- a/providers/chrome-ai/src/ai/common/WebBrowser_ToolCalling.ts +++ b/providers/chrome-ai/src/ai/common/WebBrowser_ToolCalling.ts @@ -10,9 +10,13 @@ import type { ToolCall, ToolCallingTaskInput, ToolCallingTaskOutput, + ToolDefinition, } from "@workglow/ai"; import { buildToolDescription, filterValidToolCalls } from "@workglow/ai"; import { uuid4 } from "@workglow/util"; +import type { JsonSchema, SchemaNode } from "@workglow/util/schema"; +import { compileSchema } from "@workglow/util/schema"; +import { getLogger } from "@workglow/util/worker"; import { buildInitialPromptsFromHistory, @@ -26,6 +30,12 @@ import { snapshotStreamToTextDeltas, } from "./WebBrowser_ChromeHelpers"; import type { WebBrowserModelConfig } from "./WebBrowser_ModelSchema"; +import { + deleteChromeSession, + dropChromeSessionEntry, + getChromeSession, + setChromeSession, +} from "./WebBrowser_Sessions"; function flattenPrompt(prompt: ToolCallingTaskInput["prompt"]): string { if (typeof prompt === "string") return prompt; @@ -54,24 +64,47 @@ function flattenPrompt(prompt: 
ToolCallingTaskInput["prompt"]): string { function buildToolCallPrompt(input: ToolCallingTaskInput): { initialPrompts: LanguageModelCreateOptions["initialPrompts"]; promptText: string; + priorMessageCount: number; } { const hasMessages = Array.isArray(input.messages) && input.messages.length > 0; if (hasMessages) { const messages = input.messages as readonly ChatMessage[]; const lastUserIdx = findLastUserIndex(messages); if (lastUserIdx < 0) { - return { initialPrompts: [], promptText: flattenPrompt(input.prompt) }; + return { + initialPrompts: [], + promptText: flattenPrompt(input.prompt), + priorMessageCount: messages.length, + }; } return { initialPrompts: buildInitialPromptsFromHistory(messages.slice(0, lastUserIdx)), promptText: messageText(messages[lastUserIdx]), + priorMessageCount: lastUserIdx, }; } const initialPrompts: LanguageModelCreateOptions["initialPrompts"] = input.systemPrompt ? [{ role: "system", content: input.systemPrompt }] : []; - return { initialPrompts, promptText: flattenPrompt(input.prompt) }; + return { initialPrompts, promptText: flattenPrompt(input.prompt), priorMessageCount: 0 }; +} + +/** + * Stable fingerprint of the tool set bound at `create()` time. Tool sets + * are compared by sorted name list — Chrome can't hot-swap tools per turn, + * so any change to the set invalidates a cached session. We intentionally + * don't include each tool's `inputSchema` here: if the *names* match, + * reuse; a schema-only edit on a same-named tool is unusual enough that + * the modest correctness risk is preferable to the cache thrash of hashing + * full schemas every turn. 
/**
 * Stable fingerprint of the tool set bound at `create()` time. Tool sets
 * are compared by sorted name list — Chrome can't hot-swap tools per turn,
 * so any change to the set invalidates a cached session. Each tool's
 * `inputSchema` is intentionally not part of the fingerprint: if the
 * *names* match, the session is reused.
 *
 * Names are joined with `,`. A literal `,` or `\` inside a name is
 * backslash-escaped first so that distinct name lists can never collapse to
 * the same string (e.g. `["a,b"]` vs `["a", "b"]`). Names containing
 * neither character are emitted unchanged.
 *
 * @param tools Tool definitions supplied for this run.
 * @returns Comma-joined, sorted, escaped tool names; `""` for an empty set.
 *   Entries whose `name` is not a non-empty string are ignored.
 */
function toolsFingerprint(tools: readonly ToolDefinition[]): string {
  return tools
    .map((t) => t.name)
    .filter((n): n is string => typeof n === "string" && n.length > 0)
    .sort()
    .map((n) => n.replace(/[\\,]/g, "\\$&"))
    .join(",");
}

/**
 * (diff hunk boundary — unchanged documentation lines not shown)
 *
 * `temperature` is `@deprecated` for non-extension contexts in the current
 * Chrome spec and silently ignored on the open web. Passed through anyway
 * so extension callers still get the knob.
 *
 * ## Session reuse
 *
 * When `sessionId` is provided and `input.messages` is present we *may*
 * cache the underlying `LanguageModel`. There's a real correctness risk
 * here: Chrome's tool-calling loop appends tool-result turns to the
 * session's internal state opaquely. Reusing the cached session across
 * orchestrator turns would double-feed those results once the orchestrator
 * also re-supplies them via `messages`. To stay safe:
 * - We only cache when the orchestrator is driving via `input.messages`.
 * - Cache reuse requires that the tool set hasn't changed.
 * - On any error we drop and destroy the cache entry — Chrome's internal
 *   state may be in the middle of a tool-call cycle.
 *
 * ## Argument validation (H3)
 *
 * Chrome calls `execute` with `(args)` where `args[0]` is whatever the
 * model produced. The model can hallucinate fields that don't match the
 * tool's `inputSchema`. We compile each tool's schema once, validate the
 * captured arguments before passing them to `filterValidToolCalls`, and
 * drop+log calls that fail. Tools whose `inputSchema` fails to compile
 * fall through to name-only validation (same as today's behavior) with
 * a single warning so a malformed schema doesn't crash the run.
 */
*/ export const WebBrowser_ToolCalling: AiProviderRunFn< ToolCallingTaskInput, ToolCallingTaskOutput, WebBrowserModelConfig -> = async (input, _model, signal, emit) => { +> = async (input, _model, signal, emit, _outputSchema, sessionId) => { const factory = getApi( "LanguageModel", typeof LanguageModel !== "undefined" ? LanguageModel : undefined @@ -110,6 +166,24 @@ export const WebBrowser_ToolCalling: AiProviderRunFn< const capturedCalls: ToolCall[] = []; + // Compile validators once per tool. A bad schema downgrades that tool + // to name-only validation rather than failing the whole run — the + // existing `filterValidToolCalls` name check is still applied below. + const validators = new Map(); + for (const td of input.tools) { + try { + validators.set(td.name, compileSchema(td.inputSchema as JsonSchema)); + } catch (err) { + const msg = err instanceof Error ? err.message : String(err); + getLogger().warn( + `WebBrowser_ToolCalling: tool "${td.name}" has invalid inputSchema, ` + + `falling back to name-only validation — ${msg}`, + { toolName: td.name } + ); + validators.set(td.name, null); + } + } + // `toolChoice: "none"` → omit tools entirely so the model can't call any. // Specific tool-name choices aren't expressible in Chrome's surface; we // pass all tools and let the model decide. @@ -131,16 +205,40 @@ export const WebBrowser_ToolCalling: AiProviderRunFn< }, })); - const { initialPrompts, promptText } = buildToolCallPrompt(input); + const { initialPrompts, promptText, priorMessageCount } = buildToolCallPrompt(input); + const fingerprint = toolsFingerprint(input.tools); - const session = await factory.create({ - signal, - temperature: input.temperature ?? undefined, - tools: chromeTools.length > 0 ? chromeTools : undefined, - initialPrompts, - monitor: createDownloadMonitor(emit), - }); + // Safety guard: only allow cache reuse when the orchestrator drives via + // `input.messages`. 
In the bare-prompt path Chrome's session may carry + // opaque tool-call state from a prior turn that we can't reason about. + const cacheable = sessionId !== undefined && Array.isArray(input.messages); + let cached = cacheable && sessionId ? getChromeSession(sessionId) : undefined; + if ( + cacheable && + sessionId && + cached && + (cached.messageCount !== priorMessageCount || cached.toolsFingerprint !== fingerprint) + ) { + deleteChromeSession(sessionId); + cached = undefined; + } + + const usedCachedSession = cached !== undefined; + let session: LanguageModel; + if (cached) { + session = cached.session; + } else { + session = await factory.create({ + signal, + temperature: input.temperature ?? undefined, + tools: chromeTools.length > 0 ? chromeTools : undefined, + initialPrompts, + monitor: createDownloadMonitor(emit), + }); + } + + let cacheWritten = false; try { const stream = session.promptStreaming(promptText, { signal }); // Forward text-delta and snapshot events; swallow the inner `finish` @@ -154,14 +252,50 @@ export const WebBrowser_ToolCalling: AiProviderRunFn< emit(e); } + // Validate each captured call's `input` against its tool's compiled + // schema. Calls with no compiled validator (schema compile failed) + // skip this step and rely on the name-only check below. + const argValidated = capturedCalls.filter((tc) => { + const v = validators.get(tc.name); + if (!v) return true; + const result = v.validate(tc.input); + if (result.valid) return true; + const firstError = result.errors[0]; + const detail = firstError?.message ?? "unknown validation error"; + getLogger().warn( + `WebBrowser_ToolCalling: dropping call to "${tc.name}" — args fail inputSchema (${detail})`, + { callId: tc.id, toolName: tc.name } + ); + return false; + }); + // Defence in depth against hallucinated tool names — same shape as // OpenAI/Anthropic tool-calling run-fns. 
- const validated = filterValidToolCalls(capturedCalls, input.tools); + const validated = filterValidToolCalls(argValidated, input.tools); if (validated.length > 0) { emit({ type: "object-delta", port: "toolCalls", objectDelta: validated }); } + if (cacheable && sessionId !== undefined) { + // Watermark post-turn count: prior history + 1 trailing user turn + + // 1 assistant turn. Matches WebBrowser_Chat's convention. + setChromeSession(sessionId, { + session, + messageCount: priorMessageCount + 2, + toolsFingerprint: fingerprint, + }); + cacheWritten = true; + } emit({ type: "finish", data: {} as ToolCallingTaskOutput }); } finally { - session.destroy(); + if (!cacheWritten) { + if (cacheable && sessionId !== undefined && usedCachedSession) { + dropChromeSessionEntry(sessionId, session); + } + try { + session.destroy(); + } catch { + // best-effort + } + } } }; diff --git a/providers/chrome-ai/src/ai/index.ts b/providers/chrome-ai/src/ai/index.ts index 04ee3f084..378edc6e0 100644 --- a/providers/chrome-ai/src/ai/index.ts +++ b/providers/chrome-ai/src/ai/index.ts @@ -10,7 +10,13 @@ export * from "./common/WebBrowser_Constants"; export * from "./common/WebBrowser_ModelSchema"; export * from "./registerWebBrowser"; -import { WEB_BROWSER_RUN_FN_SPECS } from "./common/WebBrowser_Capabilities"; +import { + CONSERVATIVE_PROBED_CAPABILITIES, + inferWebBrowserCapabilities, + inferWebBrowserCapabilitiesAsync, + WEB_BROWSER_RUN_FN_SPECS, +} from "./common/WebBrowser_Capabilities"; +import { _resetProbeCache, probeWebBrowserCapabilities } from "./common/WebBrowser_CapabilityProbe"; import { buildInitialPromptsFromHistory, findLastUserIndex, @@ -26,6 +32,8 @@ import { getChromeSession, setChromeSession, } from "./common/WebBrowser_Sessions"; +import { WebBrowser_StructuredGeneration } from "./common/WebBrowser_StructuredGeneration"; +import { WebBrowser_ToolCalling } from "./common/WebBrowser_ToolCalling"; import { WebBrowserProvider } from "./WebBrowserProvider"; /** @@ -36,6 
+44,8 @@ export const _testOnly = { WEB_BROWSER_RUN_FN_SPECS, WEB_BROWSER_RUN_FNS, WebBrowser_TextGeneration_Unified, + WebBrowser_StructuredGeneration, + WebBrowser_ToolCalling, sessions: { getChromeSession, setChromeSession, @@ -47,4 +57,11 @@ export const _testOnly = { findLastUserIndex, buildInitialPromptsFromHistory, }, + probe: { + probeWebBrowserCapabilities, + inferWebBrowserCapabilities, + inferWebBrowserCapabilitiesAsync, + CONSERVATIVE_PROBED_CAPABILITIES, + _resetProbeCache, + }, } as const;