In [1]:
from langchain_ollama.llms import OllamaLLM

# LLaVA 34B model served through Ollama, wrapped as a LangChain LLM.
llm = OllamaLLM(model="llava:34b")

# warmup: send one throwaway prompt before the real queries
# (presumably so the model is already loaded when the image prompts run — TODO confirm).
llm.invoke("hello")

"Hello! How can I assist you today? 😄 If there's anything specific you would like to know or talk about, feel free to let me know. Just type your message below and I will respond as soon as possible.\n\nRemember, I am a learning model so please be patient with my responses and don't hesitate to ask for clarification if needed. I'm always happy to help! 🤖\n\nAlso, it would be great if you could provide me feedback on how I'm doing by clicking one of the smiley faces below:\n\n[😊] [😕] [😒] [😠] [👍] [👎] [🚫] [🤔] [😅] [😴]\n\nPlease note that any feedback you provide will help me improve and serve better. Thank you! 🙏"

In [3]:
import base64
from io import BytesIO

from IPython.display import HTML, display
from PIL import Image


def convert_to_base64(pil_image, image_format="PNG"):
    """
    Convert a PIL image to a Base64 encoded string

    The image is serialized into an in-memory buffer (no file is written)
    and is NOT resized.

    :param pil_image: PIL image (any object exposing ``save(fp, format=...)``)
    :param image_format: format name forwarded to ``pil_image.save``; defaults to "PNG"
    :return: Base64 string (decoded as UTF-8 text) of the serialized image bytes
    """
    # The context manager releases the in-memory buffer as soon as we are done with it.
    with BytesIO() as buffered:
        pil_image.save(buffered, format=image_format)
        return base64.b64encode(buffered.getvalue()).decode("utf-8")


def plt_img_base64(img_base64):
    """
    Render a Base64 encoded PNG inline in the notebook output

    :param img_base64:  Base64 string holding the image data
    """
    # Embed the payload in a data URI so the <img> tag needs no file on disk
    data_uri = f"data:image/png;base64,{img_base64}"
    img_tag = f'<img src="{data_uri}" />'
    # Hand the markup to IPython's rich display
    display(HTML(img_tag))
    
# Directory holding the rendered view PNGs. The trailing slash is required because
# file names are appended to this value by plain string concatenation.
image_root = "../rendered_views/scene0704_00/1618/high/"

def encode_image_to_base64(image_path):
    """
    Read a file from disk and return its raw bytes as a Base64 string

    :param image_path: path of the file to read (opened in binary mode)
    :return: Base64 string (decoded as UTF-8 text) of the file contents
    """
    with open(image_path, "rb") as image_file:
        raw_bytes = image_file.read()
    encoded = base64.b64encode(raw_bytes)
    return encoded.decode('utf-8')

# Base64-encode the four rendered views (view_0.png ... view_3.png) straight from disk;
# the resulting list is what gets bound to the LLM as its image input.
images = [encode_image_to_base64(f"{image_root}view_{i}.png") for i in range(4)]

In [4]:
# Bind the Base64-encoded views to the LLM so that every call made through this
# bound runnable passes them as the `images` argument.
llm_with_image_context = llm.bind(images=images)

In [5]:
import os
import base64
from langchain_openai import ChatOpenAI
from langchain_core.output_parsers import PydanticOutputParser
from langchain_core.prompts import PromptTemplate
from langchain_core.pydantic_v1 import BaseModel, Field

# Structured answer expected from the model.
# NOTE: the class docstring and the Field descriptions below end up in the JSON schema
# that is inserted into the prompt, so editing their text changes what the model sees.
class Result(BaseModel):
    """
    The result of the image locator.
    """
    # Index of the chosen bbox; `enum` lists 0 and 1 as the allowed values.
    # NOTE(review): default=None on an `int` field — presumably a reply without an
    # index would yield None rather than fail; confirm this is intended.
    index: int = Field(
        default=None, 
        enum=[0, 1],
        description="the index of the upper-left corner of the bbox")

    # Free-text justification the model gives for its choice.
    reason: str = Field(
        default=None,
        description="the reason why the index is chosen"
    )
    
# Parser tied to the `Result` schema; it also supplies the format instructions
# that are inserted into the prompt below.
parser = PydanticOutputParser(pydantic_object=Result)
# Prompt text: task instructions, then the {format_instructions} placeholder,
# then the {description} of the target object.
template = """You are a helpful image locator who can understand the text description I give you about a specific object. You need to answer which bounding box (bbox) in the image contains the target object from the text description. You only need to provide the index of the upper-left corner of the bbox and explain why you chose it. Wrap the output in `json` tags\n{format_instructions} \ndescription: {description}
"""

# Text description of the object the model has to locate.
description = "looking in from the door the trash can on the right"

# Pre-fill {format_instructions} once, so only {description} must be supplied at invoke time.
prompt = PromptTemplate.from_template(template).partial(format_instructions=parser.get_format_instructions())
# print(prompt.format_prompt(description=description).to_string())
# Render the prompt as the cell's last expression to inspect the final prompt text.
prompt.invoke({"description": description})

StringPromptValue(text='You are a helpful image locator who can understand the text description I give you about a specific object. You need to answer which bounding box (bbox) in the image contains the target object from the text description. You only need to provide the index of the upper-left corner of the bbox and explain why you chose it. Wrap the output in `json` tags\nThe output should be formatted as a JSON instance that conforms to the JSON schema below.\n\nAs an example, for the schema {"properties": {"foo": {"title": "Foo", "description": "a list of strings", "type": "array", "items": {"type": "string"}}}, "required": ["foo"]}\nthe object {"foo": ["bar", "baz"]} is a well-formatted instance of the schema. The object {"properties": {"foo": ["bar", "baz"]}} is not well-formatted.\n\nHere is the output schema:\n```\n{"description": "The result of the image locator.", "properties": {"index": {"title": "Index", "description": "the index of the upper-left corner of the bbox", "enu

In [6]:
# Pipeline: render the prompt -> query the image-bound LLM -> parse the reply into a `Result`.
chain = prompt | llm_with_image_context | parser

# Ask the same question several times (presumably to gauge how consistent the answers are).
for i in range(10):
    print(f"Attempt {i}:", end=" ")
    try:
        print(chain.invoke({"description": description}))
    except Exception as exc:
        # Catch `Exception` rather than using a bare `except:` so that a kernel
        # interrupt (KeyboardInterrupt) still stops the loop, and report what failed
        # instead of hiding the cause.
        print(f"Error occurred: {type(exc).__name__}: {exc}")

Attempt 0: index=0 reason="The bounding box at index 0 contains the trash can to the right of the entrance. This matches the description provided: 'looking in from the door the trash can on the right'."
Attempt 1: index=1 reason="The image contains a trash can located on the right side. Based on your text description 'looking in from the door', we are assuming that you are looking at the image from the direction of the entrance or exit of a building, such as a house or an office. This would place the 'door' on the left side of the image, and consequently, the trash can would be located on the right side. Therefore, bbox 1, which is positioned on the right side of the image, contains the trash can."
Attempt 2: index=1 reason='The trash can is located to the right of the doorway and slightly ahead of it. The bounding box at index 1 includes the trash can.'
Attempt 3: index=1 reason='The bbox with index 1 contains a trash can which is described as being on the right and located in from of