In [1]:
import os
import warnings

# where you want all HF files (models, tokenizers, caches, etc.) to live:
# NOTE(review): this is an empty placeholder — point it at a real directory
# before running; confirm how huggingface_hub resolves an empty HF_HOME.
os.environ['HF_HOME'] = ""

# Access tokens must never be written into the notebook: anything committed
# here is readable by everyone with access to the file or its history.
# The token that used to be hardcoded on this line has to be treated as
# leaked and revoked in the Hugging Face account settings.
# Supply the token from outside instead (export HF_TOKEN in the shell that
# starts Jupyter, or run `huggingface-cli login` once on this machine).
if not os.environ.get("HF_TOKEN"):
    warnings.warn(
        "HF_TOKEN is not set. Export it before starting the kernel or run "
        "`huggingface-cli login`; gated checkpoints cannot be downloaded "
        "without valid credentials.",
        stacklevel=2,
    )

In [None]:
# pip install accelerate

from transformers import AutoProcessor, Gemma3ForConditionalGeneration
from PIL import Image
import requests
import torch

# Hub identifier of the checkpoint used for this one-off demo.
model_id = "google/gemma-3-12b-it"

# Load the weights (placement is delegated to device_map="auto") and switch
# the module to evaluation mode.
model = Gemma3ForConditionalGeneration.from_pretrained(model_id, device_map="auto")
model = model.eval()

# The processor applies the chat template and prepares the model inputs.
processor = AutoProcessor.from_pretrained(model_id)

# Conversation: a system turn followed by one user turn that carries an image
# (referenced by URL) and the text instruction.
system_turn = {
    "role": "system",
    "content": [{"type": "text", "text": "You are a helpful assistant."}],
}
user_turn = {
    "role": "user",
    "content": [
        {"type": "image", "image": "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/bee.jpg"},
        {"type": "text", "text": "Describe this image in detail."},
    ],
}
messages = [system_turn, user_turn]

# Render the conversation into tensors, then move them to the model's device.
inputs = processor.apply_chat_template(
    messages,
    add_generation_prompt=True,
    tokenize=True,
    return_dict=True,
    return_tensors="pt",
)
inputs = inputs.to(model.device, dtype=torch.bfloat16)

# Prompt length in tokens; used below to cut the prompt off the output.
input_len = inputs["input_ids"].shape[-1]

with torch.inference_mode():
    # Greedy decoding (do_sample=False), capped at 100 new tokens.
    generation = model.generate(**inputs, max_new_tokens=100, do_sample=False)
    # Keep only the tokens produced after the prompt for the first sequence.
    generation = generation[0][input_len:]

decoded = processor.decode(generation, skip_special_tokens=True)
print(decoded)

# **Overall Impression:** The image is a close-up shot of a vibrant garden scene,
# focusing on a cluster of pink cosmos flowers and a busy bumblebee.
# It has a slightly soft, natural feel, likely captured in daylight.


In [None]:
# app.py
import torch
import gradio as gr
from transformers import AutoProcessor, Gemma3ForConditionalGeneration
from PIL import Image

# 1. Load model & processor once at startup
# Hub identifier of the checkpoint served by this app.
model_id = "google/gemma-3-12b-it"

# Prefer CUDA when torch reports it as available, otherwise fall back to CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# Load the weights (placement is delegated to device_map="auto") and switch
# the module to evaluation mode.
model = Gemma3ForConditionalGeneration.from_pretrained(model_id, device_map="auto")
model = model.eval()

# The processor applies the chat template and prepares the model inputs.
processor = AutoProcessor.from_pretrained(model_id)

# 2. Define your inference function
def describe(image, user_prompt):
    """Answer ``user_prompt`` about ``image`` with the loaded Gemma 3 model.

    Args:
        image: A ``PIL.Image.Image``, or another object accepted by
            ``Image.fromarray``. ``None`` means nothing was uploaded.
        user_prompt: Text instruction sent together with the image.
            ``None`` is treated as an empty string.

    Returns:
        The decoded model reply (prompt tokens stripped, special tokens
        skipped), or a short notice when no image was provided.
    """
    # Without this guard a missing upload reaches Image.fromarray(None)
    # and fails with an opaque error instead of a readable message.
    if image is None:
        return "⚠️ Please upload an image before submitting."

    # ensure PIL.Image
    if not isinstance(image, Image.Image):
        image = Image.fromarray(image)

    prompt_text = user_prompt if user_prompt is not None else ""

    messages = [
        {"role": "system", "content": [{"type": "text", "text": "You are a helpful assistant."}]},
        {"role": "user", "content": [
            {"type": "image", "image": image},
            {"type": "text",  "text":  prompt_text}
        ]},
    ]

    # prepare inputs; the model is placed by device_map="auto", so follow
    # model.device rather than a separately computed device string
    inputs = processor.apply_chat_template(
        messages,
        add_generation_prompt=True,
        tokenize=True,
        return_dict=True,
        return_tensors="pt"
    ).to(model.device, dtype=torch.bfloat16)

    # run generation: greedy decoding, at most 100 new tokens
    input_len = inputs["input_ids"].shape[-1]
    with torch.inference_mode():
        tokens = model.generate(**inputs, max_new_tokens=100, do_sample=False)
    # drop the prompt tokens so only the newly generated text is decoded
    gen_tokens = tokens[0, input_len:]
    return processor.decode(gen_tokens, skip_special_tokens=True)

# 3. Build the Gradio interface with BOTH image + text inputs
# Input widgets: an image upload delivered to `describe` as a PIL image,
# and a two-line text box for the prompt.
image_input = gr.Image(type="pil", label="Upload an Image")
prompt_input = gr.Textbox(
    lines=2,
    placeholder="e.g. Describe this image in detail",
    label="Your Prompt",
)

# Output widget showing the text returned by `describe`.
response_output = gr.Textbox(label="Model’s Response")

iface = gr.Interface(
    fn=describe,
    inputs=[image_input, prompt_input],
    outputs=response_output,
    title="Gemma-3 Image & Text Describer",
    description="Upload an image and enter any text prompt to get a multimodal response.",
)

# Start the web UI only when this code runs as the main module.
if __name__ == "__main__":
    iface.launch()

In [None]:
import torch
import gradio as gr
from transformers import AutoProcessor, Gemma3ForConditionalGeneration
from PIL import Image

# 1. Load model & processor once at startup
# Hub identifier of the checkpoint served by this app.
model_id = "google/gemma-3-12b-it"

# Prefer CUDA when torch reports it as available, otherwise fall back to CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# Load the weights (placement is delegated to device_map="auto") and switch
# the module to evaluation mode.
model = Gemma3ForConditionalGeneration.from_pretrained(model_id, device_map="auto")
model = model.eval()

# The processor applies the chat template and prepares the model inputs.
processor = AutoProcessor.from_pretrained(model_id)

# 2. Define your inference function
def describe(image=None, user_prompt=""):
    """Answer with the loaded Gemma 3 model from an image, a prompt, or both.

    Args:
        image: Optional ``PIL.Image.Image``, or another object accepted by
            ``Image.fromarray``. ``None`` means no image was supplied.
        user_prompt: Optional text prompt. ``None`` and whitespace-only
            strings are treated as "no prompt".

    Returns:
        The decoded model reply (prompt tokens stripped, special tokens
        skipped), or a warning message when neither input was provided.
    """
    # Normalise once: tolerate None and avoid re-stripping the same string.
    prompt_text = (user_prompt or "").strip()

    # Reject if nothing is provided
    if image is None and not prompt_text:
        return "⚠️ Please upload an image or enter a text prompt (or both)."

    # System message
    messages = [
        {
            "role": "system",
            "content": [{"type": "text", "text": "You are a helpful assistant."}]
        }
    ]

    # Build user content dynamically: only include the parts that were given
    user_contents = []
    if image is not None:
        # ensure PIL.Image
        if not isinstance(image, Image.Image):
            image = Image.fromarray(image)
        user_contents.append({"type": "image", "image": image})
    if prompt_text:
        user_contents.append({"type": "text", "text": prompt_text})

    messages.append({"role": "user", "content": user_contents})

    # Prepare inputs for the model; the model is placed by device_map="auto",
    # so follow model.device rather than a separately computed device string
    inputs = processor.apply_chat_template(
        messages,
        add_generation_prompt=True,
        tokenize=True,
        return_dict=True,
        return_tensors="pt"
    ).to(model.device, dtype=torch.bfloat16)

    # Run generation: greedy decoding, at most 100 new tokens
    input_len = inputs["input_ids"].shape[-1]
    with torch.inference_mode():
        tokens = model.generate(**inputs, max_new_tokens=100, do_sample=False)
    # Drop the prompt tokens so only the newly generated text is decoded
    gen_tokens = tokens[0, input_len:]
    return processor.decode(gen_tokens, skip_special_tokens=True)

# 3. Build a Gradio interface with optional inputs
# Interface inputs may already be left blank by the user (the function then
# receives None / an empty string), so no per-component "optional" flag is
# needed. The former `optional=True` keyword is not accepted by current
# Gradio component constructors and has been dropped.
iface = gr.Interface(
    fn=describe,
    inputs=[
        gr.Image(type="pil", label="Upload an Image (optional)"),
        gr.Textbox(
            lines=2,
            placeholder="Enter a text prompt (optional)",
            label="Your Prompt"
        )
    ],
    outputs=gr.Textbox(label="Model’s Response"),
    title="Gemma-3 Multimodal Describer",
    description=(
        "Upload an image, enter text, or both. "
        "The model will respond based on whichever inputs you provide."
    )
)

# Start the web UI only when this code runs as the main module.
if __name__ == "__main__":
    iface.launch()