Commit 6fc4f53

Authored by SongChiYoung and ekzhu
FIX: MultiModalMessage in Gemini with the OpenAI SDK raises an error (#6440)

## Why are these changes needed?

Multimodal messages fill their content through a separate routine, but the current Gemini pipeline still applied `_set_empty_to_whitespace` to them, which overwrote that content and caused an error. Inspecting `multimodal_user_transformer_funcs` shows that content produced by this routine can never be empty, so `_set_empty_to_whitespace` is unnecessary for multimodal messages and is now removed from that pipeline.

## Related issue number

Closes #6439

## Checks

- [ ] I've included any doc changes needed for <https://microsoft.github.io/autogen/>. See <https://github.com/microsoft/autogen/blob/main/CONTRIBUTING.md> to build and test documentation locally.
- [x] I've added tests (if relevant) corresponding to the changes introduced in this PR.
- [x] I've made sure all auto checks have passed.

Co-authored-by: Eric Zhu <ekzhu@users.noreply.github.com>
1 parent 7c29704 commit 6fc4f53

File tree

2 files changed: +27 -1 lines changed


python/packages/autogen-ext/src/autogen_ext/models/openai/_message_transform.py

Lines changed: 1 addition & 1 deletion

```diff
@@ -379,7 +379,7 @@ def assistant_condition(message: LLMMessage, context: Dict[str, Any]) -> str:

 user_transformer_funcs_gemini: Dict[str, List[Callable[[LLMMessage, Dict[str, Any]], Dict[str, Any]]]] = {
     "text": single_user_transformer_funcs + [_set_empty_to_whitespace],
-    "multimodal": multimodal_user_transformer_funcs + [_set_empty_to_whitespace],
+    "multimodal": multimodal_user_transformer_funcs,
 }
```
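For context, the per-type transformer lists above are presumably folded over each message when the provider payload is built. A minimal sketch of that dispatch, with hypothetical simplified types (`run_pipeline`, `Message`, and the toy transformers are illustrative names, not the real API):

```python
from typing import Any, Callable, Dict, List

# Simplified stand-ins for the real types in _message_transform.py.
Message = Dict[str, Any]
Transformer = Callable[[Message, Dict[str, Any]], Dict[str, Any]]


def run_pipeline(message: Message, funcs: List[Transformer]) -> Dict[str, Any]:
    # Hypothetical driver: each transformer reads the message and the context
    # accumulated so far, and returns fields merged into the result. Every
    # function in the list runs unconditionally, which is why appending
    # _set_empty_to_whitespace to the multimodal list ran it on content it
    # was never meant to see.
    context: Dict[str, Any] = {}
    for func in funcs:
        context.update(func(message, context))
    return context


# Example: two toy transformers composing a payload.
set_role: Transformer = lambda msg, ctx: {"role": msg.get("role", "user")}
set_content: Transformer = lambda msg, ctx: {"content": msg["content"]}

payload = run_pipeline({"role": "user", "content": "hi"}, [set_role, set_content])
assert payload == {"role": "user", "content": "hi"}
```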

python/packages/autogen-ext/tests/models/test_openai_model_client.py

Lines changed: 26 additions & 0 deletions
```diff
@@ -7,6 +7,8 @@
 import httpx
 import pytest
+from autogen_agentchat.agents import AssistantAgent
+from autogen_agentchat.messages import MultiModalMessage
 from autogen_core import CancellationToken, FunctionCall, Image
 from autogen_core.models import (
     AssistantMessage,
@@ -2459,4 +2461,28 @@ def test_find_model_family() -> None:
     assert _find_model_family("openai", "error") == ModelFamily.UNKNOWN


+@pytest.mark.asyncio
+@pytest.mark.parametrize(
+    "model",
+    [
+        "gpt-4.1-nano",
+        "gemini-1.5-flash",
+        "claude-3-5-haiku-20241022",
+    ],
+)
+async def test_multimodal_message_test(
+    model: str, openai_client: OpenAIChatCompletionClient, monkeypatch: pytest.MonkeyPatch
+) -> None:
+    # Test that the multimodal message is converted to the correct format
+    img = Image.from_base64(
+        "iVBORw0KGgoAAAANSUhEUgAAAAEAAAABCAIAAACQd1PeAAAADElEQVR4nGP4z8AAAAMBAQDJ/pLvAAAAAElFTkSuQmCC"
+    )
+    multi_modal_message = MultiModalMessage(content=["Can you describe the content of this image?", img], source="user")
+
+    ocr_agent = AssistantAgent(
+        name="ocr_agent", model_client=openai_client, system_message="""You are a helpful agent."""
+    )
+    _ = await ocr_agent.run(task=multi_modal_message)
+
+
 # TODO: add integration tests for Azure OpenAI using AAD token.
```
