# Image to Text with LCEL

Note: the LangChain multimodal API is still evolving. See also:
- https://python.langchain.com/docs/how_to/multimodal_inputs/
- https://python.langchain.com/docs/how_to/multimodal_prompts/

In [None]:
import base64

from dotenv import load_dotenv
from genai_blueprint.utils.config_mngr import global_config
from genai_tk.core.llm_factory import get_llm
from langchain_core.messages import HumanMessage, SystemMessage
from langchain_core.messages.base import BaseMessage
from langchain_core.output_parsers import StrOutputParser
from upath import UPath

# Load environment variables from a .env file before any client is created
# (the EdenAI cell further down reads EDENAI_API_KEY from os.environ).
load_dotenv(verbose=True)

### Multi-Modal Chain Construction

This chain combines three components:
1. **Prompt Generation**: Creates structured messages with system instructions and user content
2. **LLM Selection**: Selects the vision-capable model registered under the `for_vision` tag (e.g. GPT-4o)
3. **Output Parsing**: Converts LLM response to clean text output

The chain handles both text prompts and image inputs in a single query.

In [None]:
def gen_prompt(param_dict: dict) -> list[BaseMessage]:
    """Build the message list for a question about an image.

    Args:
        param_dict: Mapping with two entries: ``"question"`` (the text to ask)
            and ``"image_url"`` (the URL of the image, passed through as-is).

    Returns:
        A two-element list: the system instruction followed by a human message
        whose content holds a text part and an image part.
    """
    text_part = {"type": "text", "text": str(param_dict["question"])}
    image_part = {
        "type": "image_url",
        "image_url": {"url": str(param_dict["image_url"])},
    }
    instructions = SystemMessage(
        content="You are a helpful assistant that kindly explains images and answers questions provided by the user."
    )
    return [instructions, HumanMessage(content=[text_part, image_part])]


# Model obtained from the LLM factory under the "for_vision" tag.
llm = get_llm(llm_tag="for_vision")

# LCEL pipeline: gen_prompt builds the messages, the model answers them,
# and StrOutputParser turns the model output into a plain string.
chain = gen_prompt | llm | StrOutputParser()

In [None]:
from genai_blueprint.utils.config_mngr import global_config

# Read the "default_config" string setting; as the last expression of the cell
# its value is displayed by the notebook (presumably the active configuration name).
global_config().get_str("default_config")

### Image Handling & Base64 Encoding

Key technical details:
- Images are converted to base64 strings for API compatibility
- Path resolution uses centralized configuration
- The `image_url` field accepts either a base64 data URL (used here for the local file) or an external URL
- Automatic encoding/decoding preserves image fidelity

The `encode_image` function handles binary-to-text conversion required for JSON payloads.

In [None]:
# Directory configured under the "paths.project" entry.
BASE = global_config().get_dir_path("paths.project")
# Sample architecture diagram (a PNG file) used by the cells below.
IMAGE_PATH = BASE / "use_case_data/other/reference-architecture-magento.png"
# Question sent to the model together with the image.
QUESTION = "List the AWS services used in that architecture.  To What Amazon CloudFront is connected ?"


def encode_image(image_path: UPath) -> str:
    """Return the content of the file at ``image_path`` as a base64 string.

    Path objects are read through their own ``read_bytes()`` method, so a
    ``UPath`` is read by its own implementation rather than by the builtin
    ``open()``, which requires an OS-level path. Plain strings (or any other
    value without ``read_bytes``) still go through ``open()``.

    Args:
        image_path: Location of the image file (path object or string).

    Returns:
        The file content, base64-encoded and decoded to ``str``.
    """
    read_bytes = getattr(image_path, "read_bytes", None)
    if callable(read_bytes):
        raw_bytes = read_bytes()
    else:
        with open(image_path, "rb") as image_file:
            raw_bytes = image_file.read()
    return base64.b64encode(raw_bytes).decode("utf-8")


# Base64 text of the sample image, embedded into the data URL sent to the model.
base64_image = encode_image(IMAGE_PATH)

In [None]:
response = chain.invoke(
    {
        "question": QUESTION,
        # IMAGE_PATH points to a .png file, so the data URL must declare image/png;
        # a mismatched media type can be rejected by providers that validate it.
        "image_url": f"data:image/png;base64,{base64_image}",
    }
)
print(response)

### With EdenAI API (optional, can be skipped)

In [None]:
import os

import requests

# Optional cell: send the same question and image to the EdenAI multimodal chat
# endpoint. EDENAI_API_KEY must be set in the environment (KeyError otherwise).
headers = {"Authorization": f"Bearer {os.environ['EDENAI_API_KEY']}"}
url = "https://api.edenai.run/v2/multimodal/chat"


# Read the image file and convert it to base64 with the shared helper.
base64_image = encode_image(IMAGE_PATH)
payload = {
    "providers": "openai, google",
    "messages": [
        {
            "role": "user",
            "content": [
                {
                    "type": "text",
                    "content": {"text": QUESTION},
                },
                {
                    "type": "media_base64",
                    "content": {
                        "media_base64": base64_image,
                        "media_type": "image/png",
                    },
                },
            ],
        }
    ],
    "chatbot_global_action": "",
}

# A timeout keeps the notebook from hanging forever on a stalled connection.
response = requests.post(url, json=payload, headers=headers, timeout=120)
# Surface HTTP errors (bad key, quota, ...) here instead of as an obscure
# KeyError or JSON decoding error on the lines below.
response.raise_for_status()
result = response.json()
print(result["openai"]["generated_text"])

In [None]:
# Show the answer stored under the "google" key of the same EdenAI response.
print(result["google"]["generated_text"])