## Build your first image-to-text app with Gradio and the LLaVA model

In [None]:
!pip install transformers

In [None]:
from transformers import AutoProcessor, LlavaForConditionalGeneration
from PIL import Image
import torch
import numpy as np
import requests

In [None]:
# Checkpoint and loading recipe taken from the Transformers LLaVA guide:
# https://huggingface.co/docs/transformers/en/model_doc/llava
model_name = "llava-hf/llava-1.5-7b-hf"

# Load the weights in float16 and let device_map="auto" decide where they live.
model = LlavaForConditionalGeneration.from_pretrained(
    model_name, torch_dtype=torch.float16, device_map="auto"
)

# The processor comes from the same checkpoint as the model.
processor = AutoProcessor.from_pretrained(model_name)

# This code will take a while to run

While it is running, you can learn more about the LLaVA model here:
[LLaVA](https://llava-vl.github.io/)


In [None]:
url = "https://www.ilankelman.org/stopsigns/australia.jpg"  # open the link to preview the image
# or try this image instead:
# url = "https://llava-vl.github.io/static/images/monalisa.jpg"

# Download the image. The timeout stops a dead link from hanging the cell, and
# raise_for_status() fails with a clear HTTP error instead of letting PIL try
# to parse an error page.
response = requests.get(url, stream=True, timeout=30)
response.raise_for_status()
image_stop = Image.open(response.raw)

## Display the image inline in the notebook.
## (Image.show() delegates to whichever viewer Pillow finds and may open
## nothing on a headless or remote kernel.)
display(image_stop)

conversation = [
    {
        "role": "user",
        "content": [
            {"type": "image"},  # placeholder for the image input
            {"type": "text", "text": "What is shown in this image?"},  # the text input
        ],
    },
]

# Create the text prompt from the conversation (image placeholder + question)
prompt = processor.apply_chat_template(conversation, add_generation_prompt=True)

# Process the image and prompt, then move the tensors to wherever the model was
# placed. The model is loaded with device_map="auto", so a hard-coded "cuda"
# would break on machines where it ended up elsewhere.
inputs = processor(
    images=[image_stop],
    text=[prompt],
    return_tensors="pt"
).to(device=model.device, dtype=torch.float16)

# Sample up to 100 new tokens (do_sample=True means the answer can vary between runs)
generate_ids = model.generate(
    **inputs,
    do_sample=True,
    max_new_tokens=100
)

# Decode the generated ids back to text; the result still includes the prompt part
output_text = processor.batch_decode(generate_ids, skip_special_tokens=True)

In [None]:
# Show the raw batch-decoded output; its first element is parsed in a later cell
print(output_text)

## The output text contains both the "USER" input and the text generated by the model

In [None]:
## Keep only the model's reply: everything after the first "ASSISTANT:" marker.
## maxsplit=1 keeps the whole reply even if the model repeats the marker, and
## [-1] falls back to the full text (instead of raising IndexError) if the
## marker is missing.
answer = output_text[0].split("ASSISTANT:", 1)[-1].strip()

print(answer)

Now, let's put everything into one function and then test our function

In [None]:
def generate_description(image, prompt="What is shown in this image?", max_new_tokens=200):
    """Ask the loaded LLaVA model a question about an image and return its answer.

    Uses the notebook-level ``processor`` and ``model`` objects.

    Args:
        image: The image to describe (e.g. a PIL image, as passed by the
            Gradio ``gr.Image(type="pil")`` input).
        prompt: The question or instruction to send along with the image.
        max_new_tokens: Upper bound on the number of tokens to generate.

    Returns:
        str: Only the text generated by the model, without the echoed
        "USER: ... ASSISTANT:" prompt and without surrounding whitespace.

    Raises:
        ValueError: If ``image`` is None.
    """
    if image is None:
        raise ValueError("generate_description() needs an image, got None")

    conversation = [
        {
            "role": "user",
            "content": [
                {"type": "image"},
                {"type": "text", "text": prompt},
            ],
        },
    ]
    # Use a separate name so the caller's question is not shadowed by the
    # templated chat prompt.
    chat_prompt = processor.apply_chat_template(conversation, add_generation_prompt=True)

    # Follow the model's own placement and dtype instead of hard-coding
    # "cuda"/float16, so the function keeps working however the model was loaded.
    inputs = processor(
        images=[image],
        text=[chat_prompt],
        return_tensors="pt"
    ).to(device=model.device, dtype=model.dtype)

    generate_ids = model.generate(
        **inputs,
        do_sample=True,  # sampling: answers can differ between calls
        max_new_tokens=max_new_tokens
    )

    # generate() returns the prompt ids followed by the new ids; drop the prompt
    # part so only the model's answer is decoded.
    prompt_length = inputs["input_ids"].shape[1]
    new_token_ids = generate_ids[:, prompt_length:]
    generated_description = processor.batch_decode(new_token_ids, skip_special_tokens=True)

    return generated_description[0].strip()

Next, we test the function and then serve it with Gradio. The inputs will be an image and a textbox (the prompt), and the output will be text (a description of the image).

In [None]:
# Test the function that we just built.
# The local file below only exists if it was uploaded to this runtime; on a
# fresh kernel it is missing, so fall back to the image downloaded earlier to
# keep the notebook runnable from top to bottom.
try:
    image = Image.open("/content/466029110_1113670126421850_13431688209473903_n.jpg")
except FileNotFoundError:
    image = image_stop

generate_description(
    image,
    "What is shown in this image?"
)

In [None]:
## Serve generate_description through a small Gradio web UI

import gradio as gr


def describe_uploaded_image(img, prompt):
    """Gradio callback: check that an image was provided, then describe it.

    Args:
        img: The uploaded image as a PIL image, or None if nothing was uploaded.
        prompt: The text typed into the prompt textbox.

    Returns:
        The string returned by generate_description(img, prompt).

    Raises:
        gr.Error: If no image was uploaded, so the UI shows a readable message
            instead of an opaque failure from the model pipeline.
    """
    if img is None:
        raise gr.Error("Please upload an image first.")
    return generate_description(img, prompt)


demo = gr.Interface(
    fn=describe_uploaded_image,
    inputs=[
        gr.Image(type="pil"),  # the callback receives the upload as a PIL image
        gr.Textbox(label="prompt", value="What is shown in this image?", lines=3),
    ],
    outputs=[gr.Textbox(label="Description", lines=3)],
    title="Image Description using LLaVA",
    description="Upload an image to get a detailed description using LLaVA-1.5-7b",
)
demo.launch()

In [None]:
# Launched demos can leave many ports open, so don't forget to close all of
# them with `gr.close_all()` when you are done.
gr.close_all()