In [1]:
from langchain_core.prompts import ChatPromptTemplate
from langchain_ollama.llms import OllamaLLM

# LLM wrapper around the "llava:34b" model exposed through Ollama.
llm = OllamaLLM(model="llava:34b")

# Warm-up request: presumably makes Ollama load the model before the real
# queries are issued -- TODO confirm. Being the cell's last expression, the
# model's reply is what the notebook shows as this cell's output.
llm.invoke("hello")

"Hi there, I'm a chatbot and I'm here to help! What is on your mind?"

In [31]:
import base64
from io import BytesIO

from IPython.display import HTML, display
from PIL import Image


def convert_to_base64(pil_image):
    """
    Serialize an image as PNG and return the bytes as Base64 text.

    The image is written unchanged (no resizing is performed) into an
    in-memory buffer, so nothing touches the filesystem.

    :param pil_image: PIL image (any object exposing ``save(fp, format=...)``)
    :return: Base64 string of the PNG-encoded image
    """
    with BytesIO() as png_buffer:
        # PNG is fixed here; consumers embed the result with an image/png MIME type.
        pil_image.save(png_buffer, format="PNG")
        png_bytes = png_buffer.getvalue()

    return base64.b64encode(png_bytes).decode("utf-8")


def plt_img_base64(img_base64):
    """
    Render a Base64-encoded PNG inline in the notebook.

    :param img_base64: Base64 string holding the PNG image data
    """
    # Embed the payload as a data URI so no image file is needed on disk.
    img_tag = HTML(f'<img src="data:image/png;base64,{img_base64}" />')
    display(img_tag)
    
# Directory the view_<i>.png files are read from.
image_root = "base_grounder/rendered_views/scene0704_00/1618/high/"

# Base64-encoded PNG payloads, one per view, kept in view order.
images = []
for i in range(4):
    # Open each file in a context manager so its handle is released
    # deterministically once the image has been encoded, instead of relying
    # on garbage collection.
    with Image.open(image_root + f"view_{i}.png") as pil_image:
        image_base64 = convert_to_base64(pil_image)
    images.append(image_base64)
    # Uncomment to preview each view inline:
    # plt_img_base64(image_base64)

In [32]:
# Bind the Base64-encoded views to the model as the `images` keyword argument,
# so they accompany calls made through this runnable; the prompt itself only
# carries text.
llm_with_image_context = llm.bind(images=images)

In [33]:
import os
import base64
from langchain_openai import ChatOpenAI
from langchain_core.output_parsers import PydanticOutputParser
from langchain_core.prompts import PromptTemplate
from langchain_core.pydantic_v1 import BaseModel, Field

# Output schema handed to PydanticOutputParser. The class docstring and the
# Field descriptions below are serialized into the JSON schema that is inserted
# into the prompt, so editing that text changes what the model is told.
class Result(BaseModel):
    """
    The result of the image locator.
    """
    # Index selected by the model; the extra `enum` keyword is carried into the
    # generated schema to advertise the allowed values.
    # NOTE(review): annotated as `int` but defaults to None, so a reply that
    # omits "index" presumably parses with index=None -- confirm this is intended.
    index: int = Field(
        default=None, 
        enum=[0, 1],
        description="the index of the upper-left corner of the bbox")

    # Free-text justification returned alongside the index.
    # NOTE(review): same `None` default on a `str` annotation as above.
    reason: str = Field(
        default=None,
        description="the reason why the index is chosen"
    )
    
# Parser that turns the model's JSON reply into a Result instance and also
# supplies the schema instructions embedded in the prompt.
parser = PydanticOutputParser(pydantic_object=Result)

# Prompt text, split across adjacent literals for readability; the pieces
# concatenate to a single string with two placeholders:
# {format_instructions} and {description}.
template = (
    "You are a helpful image locator who can understand the text description "
    "I give you about a specific object. "
    "You need to answer which bounding box (bbox) in the image contains the "
    "target object from the text description. "
    "You only need to provide the index of the upper-left corner of the bbox "
    "and explain why you chose it. "
    "Wrap the output in `json` tags\n"
    "{format_instructions} \n"
    "description: {description}\n"
)

# Referring expression identifying the target object.
description = "looking in from the door the trash can on the right"

# Pre-fill the schema instructions so only the description is left to supply.
prompt = PromptTemplate.from_template(template).partial(
    format_instructions=parser.get_format_instructions(),
)

# Show the fully rendered prompt as the cell output.
prompt.invoke({"description": description})

StringPromptValue(text='You are a helpful image locator who can understand the text description I give you about a specific object. You need to answer which bounding box (bbox) in the image contains the target object from the text description. You only need to provide the index of the upper-left corner of the bbox and explain why you chose it. Wrap the output in `json` tags\nThe output should be formatted as a JSON instance that conforms to the JSON schema below.\n\nAs an example, for the schema {"properties": {"foo": {"title": "Foo", "description": "a list of strings", "type": "array", "items": {"type": "string"}}}, "required": ["foo"]}\nthe object {"foo": ["bar", "baz"]} is a well-formatted instance of the schema. The object {"properties": {"foo": ["bar", "baz"]}} is not well-formatted.\n\nHere is the output schema:\n```\n{"description": "The result of the image locator.", "properties": {"index": {"title": "Index", "description": "the index of the upper-left corner of the bbox", "enu

In [34]:
# Pipeline: render the prompt, query the image-bound model, then parse the
# reply with the Pydantic output parser.
chain = prompt | llm_with_image_context | parser

# Ask the same question several times to see how consistent the answers are.
for i in range(10):
    print(f"Attempt {i}:", end=" ")
    try:
        print(chain.invoke({"description": description}))
    except Exception as exc:
        # A failed attempt (e.g. a reply the parser cannot handle) should not
        # stop the remaining attempts, but report what went wrong instead of
        # hiding it. Catching Exception rather than using a bare `except`
        # lets KeyboardInterrupt / SystemExit still interrupt the loop.
        print(f"Error occurred: {type(exc).__name__}: {exc}")

Attempt 0: index=0 reason='The trash can is located on the right side of the image. The index 0 corresponds to the bounding box that includes the door and the area where you would be looking from when viewing the trash can. This matches the text description provided.'
Attempt 1: index=1 reason="The reason is that the image contains two bounding boxes (bbox) and the trash can is located to the right. The left bbox corresponds to index 0 while the right one corresponds to index 1. Therefore, I choose the right bbox as it matches the description of the object's location."
Attempt 2: index=1 reason="The trash can is located to the right of the door frame. It appears as a large object with a cylindrical shape typical of trash cans, and it's clearly distinguishable from the other objects in the image. Therefore, I chose the bbox with index 1 to represent the trash can."
Attempt 3: index=0 reason='The trash can is located to the right of a door, and it appears to be closer than other objects 