Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
11 changes: 6 additions & 5 deletions packages/core/src/agent/agent.ts
Original file line number Diff line number Diff line change
Expand Up @@ -774,15 +774,15 @@ export class Agent<
taskPrompt: string,
opt?: {
cacheable?: boolean;
thinkingLevel?: ThinkingLevel;
_thinkingLevel?: ThinkingLevel;
},
) {
const modelConfigForPlanning =
this.modelConfigManager.getModelConfig('planning');
const defaultIntentModelConfig =
this.modelConfigManager.getModelConfig('default');

let thinkingLevelToUse = opt?.thinkingLevel;
let thinkingLevelToUse = opt?._thinkingLevel;
if (!thinkingLevelToUse && this.opts.aiActionContext) {
Comment on lines +785 to 786

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

P1 Badge Restore thinkingLevel option handling

The aiAct options now only read opt._thinkingLevel, so any caller still passing the documented thinkingLevel flag has its request ignored and the method falls back to the medium/high defaults. For example, packages/web-integration/tests/ai/web/puppeteer/e2e.test.ts:42-44 still passes { thinkingLevel: 'high' }, which will now be treated as if no level was provided. This silently changes planning behaviour for consumers relying on that knob; please continue accepting the existing option or update the call sites.

Useful? React with 👍 / 👎.

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

👎

thinkingLevelToUse = 'high';
} else if (!thinkingLevelToUse) {
Expand All @@ -791,10 +791,11 @@ export class Agent<

// should include bbox in planning if
// 1. the planning model is the same as the default intent model
// or 2. the thinking level is high
// and
// 2. the thinking level is not high
const includeBboxInPlanning =
modelConfigForPlanning.modelName === defaultIntentModelConfig.modelName ||
thinkingLevelToUse === 'high';
modelConfigForPlanning.modelName === defaultIntentModelConfig.modelName &&
thinkingLevelToUse !== 'high';
debug('setting includeBboxInPlanning to', includeBboxInPlanning);

const cacheable = opt?.cacheable;
Expand Down
49 changes: 2 additions & 47 deletions packages/core/src/ai-model/conversation-history.ts
Original file line number Diff line number Diff line change
@@ -1,19 +1,13 @@
import type { ChatCompletionMessageParam } from 'openai/resources/index';

export interface ConversationHistoryOptions {
maxUserImageMessages?: number;
initialMessages?: ChatCompletionMessageParam[];
}

const defaultMaxUserImagesCount = 6;

export class ConversationHistory {
private readonly maxUserImageMessages: number;
private readonly messages: ChatCompletionMessageParam[] = [];

constructor(options?: ConversationHistoryOptions) {
this.maxUserImageMessages =
options?.maxUserImageMessages ?? defaultMaxUserImagesCount;
if (options?.initialMessages?.length) {
this.seed(options.initialMessages);
}
Expand All @@ -34,47 +28,8 @@ export class ConversationHistory {
this.messages.length = 0;
}

snapshot(options?: {
maxImageMessages?: number;
}): ChatCompletionMessageParam[] {
const maxImageMessages =
options?.maxImageMessages ?? this.maxUserImageMessages;

// Count image_url messages from back to front
let imageCount = 0;
const processedMessages = [...this.messages]
.reverse()
.map((message): ChatCompletionMessageParam => {
if (
typeof message.content !== 'string' &&
Array.isArray(message.content)
) {
// Also process content items from back to front
const processedContent = [...message.content]
.reverse()
.map((item) => {
if (item.type === 'image_url') {
imageCount++;
if (imageCount > maxImageMessages) {
// Replace with text type
return {
type: 'text' as const,
text: '(omitted due to size limit)',
};
}
}
return item;
})
.reverse();
return {
...message,
content: processedContent,
} as ChatCompletionMessageParam;
}
return message;
});

return processedMessages.reverse();
snapshot(): ChatCompletionMessageParam[] {
return [...this.messages];
}

get length(): number {
Expand Down
19 changes: 10 additions & 9 deletions packages/core/src/ai-model/llm-planning.ts
Original file line number Diff line number Diff line change
Expand Up @@ -63,7 +63,6 @@ export async function plan(
}

const historyLog = opts.conversationHistory?.snapshot() || [];
// .filter((item) => item.role === 'assistant') || [];

const knowledgeContext: ChatCompletionMessageParam[] = opts.actionContext
? [
Expand Down Expand Up @@ -96,7 +95,7 @@ export async function plan(
content: [
{
type: 'text',
text: 'I have finished the action previously planned, and the last screenshot is as follows:',
text: 'I have finished the action previously planned, and the last screenshot is attached. Please continue according to the instruction.',
},
{
type: 'image_url',
Expand All @@ -117,13 +116,15 @@ export async function plan(
latestImageMessage,
];

const { content: planFromAI, usage } =
await callAIWithObjectResponse<RawResponsePlanningAIResponse>(
msgs,
AIActionType.PLAN,
modelConfig,
);
const rawResponse = JSON.stringify(planFromAI, undefined, 2);
const {
content: planFromAI,
contentString: rawResponse,
usage,
} = await callAIWithObjectResponse<RawResponsePlanningAIResponse>(
msgs,
AIActionType.PLAN,
modelConfig,
);

const actions = planFromAI.action ? [planFromAI.action] : [];
const returnValue: PlanningAIResponse = {
Expand Down
8 changes: 6 additions & 2 deletions packages/core/src/ai-model/service-caller/index.ts
Original file line number Diff line number Diff line change
Expand Up @@ -358,12 +358,16 @@ export async function callAIWithObjectResponse<T>(
messages: ChatCompletionMessageParam[],
AIActionTypeValue: AIActionType,
modelConfig: IModelConfig,
): Promise<{ content: T; usage?: AIUsageInfo }> {
): Promise<{ content: T; contentString: string; usage?: AIUsageInfo }> {
const response = await callAI(messages, AIActionTypeValue, modelConfig);
assert(response, 'empty response');
const vlMode = modelConfig.vlMode;
const jsonContent = safeParseJson(response.content, vlMode);
return { content: jsonContent, usage: response.usage };
return {
content: jsonContent,
contentString: response.content,
usage: response.usage,
};
}

export async function callAIWithStringResponse(
Expand Down
64 changes: 7 additions & 57 deletions packages/core/tests/unit-test/conversation-history.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -45,55 +45,8 @@ describe('ConversationHistory', () => {
expect(history.snapshot()).toEqual([assistantMessage('hello')]);
});

it('limits image messages in snapshot from back to front', () => {
const history = new ConversationHistory({ maxUserImageMessages: 2 });

history.append(userMessageWithImage('first', 'data:image1'));
history.append(assistantMessage('ack1'));
history.append(userMessageWithImage('second', 'data:image2'));
history.append(assistantMessage('ack2'));
history.append(userMessageWithImage('third', 'data:image3'));

const snapshot = history.snapshot();

// First image should be omitted (counting from back, it's the 3rd one)
expect(snapshot[0]).toEqual({
role: 'user',
content: [
{ type: 'text', text: 'first' },
{ type: 'text', text: '(omitted due to size limit)' },
],
});

// Second and third images should be preserved
expect(snapshot[2]).toEqual(userMessageWithImage('second', 'data:image2'));
expect(snapshot[4]).toEqual(userMessageWithImage('third', 'data:image3'));
});

it('respects maxImageMessages parameter in snapshot options', () => {
const history = new ConversationHistory({ maxUserImageMessages: 5 });

history.append(userMessageWithImage('first', 'data:image1'));
history.append(userMessageWithImage('second', 'data:image2'));
history.append(userMessageWithImage('third', 'data:image3'));

// Override with maxImageMessages: 1
const snapshot = history.snapshot({ maxImageMessages: 1 });

// Only the last image should be preserved
expect(snapshot[0].content).toEqual([
{ type: 'text', text: 'first' },
{ type: 'text', text: '(omitted due to size limit)' },
]);
expect(snapshot[1].content).toEqual([
{ type: 'text', text: 'second' },
{ type: 'text', text: '(omitted due to size limit)' },
]);
expect(snapshot[2]).toEqual(userMessageWithImage('third', 'data:image3'));
});

it('handles messages with multiple images in content', () => {
const history = new ConversationHistory({ maxUserImageMessages: 2 });
it('returns image messages without modification', () => {
const history = new ConversationHistory();

const messageWithTwoImages: ChatCompletionMessageParam = {
role: 'user',
Expand All @@ -104,17 +57,14 @@ describe('ConversationHistory', () => {
],
};

history.append(userMessageWithImage('first', 'data:image1'));
history.append(assistantMessage('ack1'));
history.append(messageWithTwoImages);
history.append(userMessageWithImage('another', 'data:image3'));

const snapshot = history.snapshot();

// From back to front: image3 (1st), image2 (2nd), image1 (3rd - should be omitted)
expect(snapshot[0].content).toEqual([
{ type: 'text', text: 'Look at these' },
{ type: 'text', text: '(omitted due to size limit)' },
{ type: 'image_url', image_url: { url: 'data:image2' } },
]);
expect(snapshot[1]).toEqual(userMessageWithImage('another', 'data:image3'));
expect(snapshot[0]).toEqual(userMessageWithImage('first', 'data:image1'));
expect(snapshot[1]).toEqual(assistantMessage('ack1'));
expect(snapshot[2]).toEqual(messageWithTwoImages);
});
});
2 changes: 1 addition & 1 deletion packages/shared/src/env/parse-model-config.ts
Original file line number Diff line number Diff line change
Expand Up @@ -73,7 +73,7 @@ export const modelFamilyToVLConfig = (

// Check if the modelFamily is valid
if (!MODEL_FAMILY_VALUES.includes(modelFamily as any)) {
throw new Error(`Invalid modelFamily: ${modelFamily}`);
throw new Error(`Invalid MIDSCENE_MODEL_FAMILY value: ${modelFamily}`);
Comment on lines 74 to +76

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

P1 Badge Align invalid modelFamily error with tests

The error message for invalid MIDSCENE_MODEL_FAMILY was changed to Invalid MIDSCENE_MODEL_FAMILY value: …, but packages/shared/tests/unit-test/env/parse.test.ts:52-54 still asserts the previous Invalid modelFamily: invalid. This change causes that unit test to fail and will break the test suite until the message or expectation is brought back into sync.

Useful? React with 👍 / 👎.

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

👍

}

// For other model families, they directly map to vlMode
Expand Down
2 changes: 1 addition & 1 deletion packages/shared/tests/unit-test/env/parse.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -50,7 +50,7 @@ describe('modelFamilyToVLConfig', () => {

it('should throw on invalid value', () => {
expect(() => modelFamilyToVLConfig('invalid' as any)).toThrow(
'Invalid modelFamily: invalid',
'Invalid MIDSCENE_MODEL_FAMILY value: invalid',
);
});
});
Expand Down
Loading