# Build your ML application with Gradio

Gradio is an open-source Python package that allows you to quickly build a demo or web application for your machine learning model, API, or any arbitrary Python function.

See the documentation here: https://www.gradio.app/guides/quickstart

In [None]:
%%capture
!pip install gradio
!pip install transformers

In [None]:
# Import the libraries used in this notebook
import numpy as np
import gradio as gr

Specify the input and output component types when creating the interface.

In [None]:
def greet(name, intensity):
    """Return a greeting for `name` followed by exclamation marks.

    `intensity` is converted with `int()` and used as the number of "!"
    characters appended to the greeting.
    """
    exclamations = "!" * int(intensity)
    return "".join(("Hello, ", name, exclamations))

# Wrap `greet` in a web UI using Gradio's string shortcuts for the
# components: a text input, a slider input, and a text output.
demo = gr.Interface(fn=greet, inputs=["text", "slider"], outputs=["text"])

# Start serving the interface.
demo.launch()

If you use actual classes for `gr.Textbox` and `gr.Slider` instead of the string shortcuts, you have access to much more customizability through component attributes.

In [None]:
def greet(name, intensity):
    """Return a greeting for `name` followed by `intensity` exclamation marks.

    Args:
        name: The name to greet (a string).
        intensity: How many "!" characters to append. It is converted with
            `int()` because a slider value may arrive as a float (e.g. 2.0),
            and multiplying a string by a float raises `TypeError`.

    Returns:
        The greeting string, e.g. "Hello, Ann!!".
    """
    return "Hello, " + name + "!" * int(intensity)

# Build the components explicitly so their attributes can be customized.
intensity_slider = gr.Slider(value=2, minimum=1, maximum=10, step=1)
greeting_box = gr.Textbox(label="greeting", lines=3)  # three-line output box

demo = gr.Interface(
    fn=greet,
    inputs=["text", intensity_slider],
    outputs=[greeting_box],
)

# Start serving the interface.
demo.launch()

In [None]:
def greet(name, intensity):
    """Build the greeting "Hello, <name>" followed by exclamation marks.

    Args:
        name: The name to greet (a string).
        intensity: Number of "!" characters to append. The value is cast with
            `int()` first, since string repetition only accepts integers and
            a slider may hand over a float such as 2.0.

    Returns:
        The greeting string.
    """
    repeat = int(intensity)
    return "Hello, " + name + "!" * repeat

# Same interface as before: a free-text name plus an integer-stepped slider
# as inputs, and a three-line textbox labelled "greeting" as output.
demo = gr.Interface(
    fn=greet,
    inputs=[
        "text",
        gr.Slider(minimum=1, maximum=10, step=1, value=2),
    ],
    outputs=[
        gr.Textbox(lines=3, label="greeting"),
    ],
)

# Start serving the interface.
demo.launch()

In [None]:
def filter_sepia(input_img):
    """Apply a sepia tone to an image array and rescale it by its maximum.

    Args:
        input_img: Array whose last axis holds the three colour channels
            (presumably H x W x 3 RGB as delivered by `gr.Image()` - confirm),
            or `None` when no image was provided.

    Returns:
        A float array of the same shape with the sepia matrix applied and the
        values divided by the largest value, or `None` if `input_img` is
        `None`.
    """
    # Nothing to filter when no image was supplied.
    if input_img is None:
        return None

    sepia_filter = np.array([
        [0.393, 0.769, 0.189],
        [0.349, 0.686, 0.168],
        [0.272, 0.534, 0.131]
    ])
    # Each output channel is a weighted mix of the three input channels.
    sepia_img = input_img.dot(sepia_filter.T)

    # Normalize by the peak value; skip this for an all-black image, where
    # dividing by zero would fill the result with NaN.
    peak = sepia_img.max()
    if peak > 0:
        sepia_img /= peak
    return sepia_img

# Image in, image out: the uploaded picture is passed through `filter_sepia`.
demo = gr.Interface(
    fn=filter_sepia,
    inputs=gr.Image(),
    outputs="image",
)
demo.launch()

See more examples at https://www.gradio.app/guides/the-interface-class

## Make your first Image-to-text with Gradio

In [None]:
from transformers import AutoProcessor, LlavaForConditionalGeneration
from PIL import Image
import torch
import numpy as np
import requests

In [None]:
# Model setup follows https://huggingface.co/docs/transformers/en/model_doc/llava
model_name = "llava-hf/llava-1.5-7b-hf"

# The processor is loaded from the same checkpoint as the model.
processor = AutoProcessor.from_pretrained(model_name)

# Load the weights in half precision and let `device_map="auto"` decide
# where they are placed.
model = LlavaForConditionalGeneration.from_pretrained(
    model_name, device_map="auto", torch_dtype=torch.float16
)

In [None]:
# Download a sample image. The timeout keeps the cell from hanging on a dead
# connection, and raise_for_status() surfaces HTTP errors before PIL tries to
# parse an error page as an image.
url = "https://www.ilankelman.org/stopsigns/australia.jpg"
response = requests.get(url, stream=True, timeout=30)
response.raise_for_status()
image_stop = Image.open(response.raw)

# A single user turn: one image placeholder followed by the text question.
conversation = [
    {
        "role": "user",
        "content": [
            {"type": "image"},
            {"type": "text", "text": "What is shown in this image?"},
        ],
    },
]
prompt = processor.apply_chat_template(conversation, add_generation_prompt=True)


# Process the image and prompt. The tensors are moved to the device the model
# reports instead of a hard-coded "cuda", because `device_map="auto"` decides
# the model's placement at load time.
inputs = processor(
    images=[image_stop],
    text=[prompt],
    return_tensors="pt"
).to(device=model.device, dtype=torch.float16)


# Sample up to 100 new tokens; sampling makes the output vary between runs.
generate_ids = model.generate(
    **inputs,
    do_sample=True,
    max_new_tokens=100
)
# Last expression of the cell, so the decoded text is displayed as its output.
processor.batch_decode(generate_ids, skip_special_tokens=True)

Now, let's put everything into a single function and then test it.

In [None]:
def generate_description(image, prompt="What is shown in this image?", max_new_tokens=200):
    """Ask the LLaVA model a question about an image and return its answer.

    Relies on the module-level `processor` and `model` objects.

    Args:
        image: The image to describe. Callers in this notebook pass PIL images.
        prompt: The question or instruction sent along with the image.
        max_new_tokens: Upper bound on the number of tokens to generate.

    Returns:
        The first decoded string from the generated batch.
        NOTE(review): the whole generated sequence is decoded, so the text
        presumably also contains the prompt portion - confirm before
        displaying it as a bare description.
    """
    conversation = [
        {
            "role": "user",
            "content": [
                {"type": "image"},
                {"type": "text", "text": prompt},
            ],
        },
    ]
    # Keep the templated prompt in its own name rather than rebinding the
    # `prompt` parameter, so the user's original text stays distinguishable.
    chat_prompt = processor.apply_chat_template(conversation, add_generation_prompt=True)

    # Move the tensors to the device the model reports instead of a
    # hard-coded "cuda"; the model was loaded with `device_map="auto"`.
    inputs = processor(
        images=[image],
        text=[chat_prompt],
        return_tensors="pt"
    ).to(device=model.device, dtype=torch.float16)

    # Sampling is enabled, so repeated calls can return different text.
    generate_ids = model.generate(
        **inputs,
        do_sample=True,
        max_new_tokens=max_new_tokens
    )
    generated_description = processor.batch_decode(generate_ids, skip_special_tokens=True)
    return generated_description[0]

In [None]:
# Try out the function we just built on a local image file.
image = Image.open("/content/466029110_1113670126421850_13431688209473903_n.jpg")
generate_description(image, "What is shown in this image?")

Then serve it using Gradio. The inputs will be an image and a textbox (the prompt), and the output will be text (a description of the image).

In [None]:
import gradio as gr

# Serve `generate_description` behind a web UI.
demo = gr.Interface(
    title="Image Description using LLaVA",
    description="Upload an image to get a detailed description using LLaVA-1.5-7b",
    fn=lambda img, prompt: generate_description(img, prompt),
    inputs=[
        # The uploaded image is requested in "pil" form.
        gr.Image(type="pil"),
        # Editable prompt, pre-filled with the default question.
        gr.Textbox(lines=3, label="prompt", value="What is shown in this image?"),
    ],
    outputs=[
        gr.Textbox(lines=3, label="Description"),
    ],
)
demo.launch()

In [None]:
# Launching demos can leave a lot of ports open, so don't forget to close
# all of them with `gr.close_all()` when you are done.
gr.close_all()