# L2: Image captioning app 🖼️📝

Load your HF API key and relevant Python libraries

In [3]:
import os
import io
import IPython.display
from PIL import Image
import base64 
from dotenv import load_dotenv, find_dotenv
_ = load_dotenv(find_dotenv()) # read local .env file
hf_api_key = os.environ['HF_API_KEY']

## See .env for "Serverless Inference API (no endpoint to manage)"

In [2]:
"""
# Helper functions
import requests, json

#Image-to-text endpoint
def get_completion(inputs, parameters=None, ENDPOINT_URL=os.environ['HF_API_ITT_BASE']):
    headers = {
      "Authorization": f"Bearer {hf_api_key}",
      "Content-Type": "application/json"
    }
    data = { "inputs": inputs }
    if parameters is not None:
        data.update({"parameters": parameters})
    response = requests.request("POST",
                                ENDPOINT_URL,
                                headers=headers,
                                data=json.dumps(data))
    return json.loads(response.content.decode("utf-8"))
"""

In [9]:
import os, json, base64, requests
from urllib.parse import urlparse

def _is_url(x: str) -> bool:
    try:
        s = urlparse(x)
        return s.scheme in ("http", "https") and bool(s.netloc)
    except Exception:
        return False

def _to_image_bytes(inp) -> bytes:
    if isinstance(inp, (bytes, bytearray)):
        return bytes(inp)
    if isinstance(inp, str) and _is_url(inp):
        r = requests.get(inp, timeout=30)
        r.raise_for_status()
        return r.content
    if isinstance(inp, str) and os.path.exists(inp):
        with open(inp, "rb") as f:
            return f.read()
    raise ValueError("inputs must be an image URL, local path, or bytes")

def _auth_header(token: str | None) -> dict:
    return {"Authorization": f"Bearer {token}"} if token else {}

def _explain_and_raise(resp: requests.Response):
    ct = resp.headers.get("Content-Type", "")
    body = None
    try:
        body = resp.json() if "application/json" in ct else resp.text
    except Exception:
        body = resp.text
    msg = f"HTTP {resp.status_code} from {resp.url}\nContent-Type: {ct}\nBody (truncated): {str(body)[:800]}"
    # Common hints
    if resp.status_code in (401, 403):
        msg += "\nHint: missing/invalid token or endpoint is protected."
    elif resp.status_code == 404:
        msg += "\nHint: check ENDPOINT_URL is correct (model path or endpoint base)."
    elif resp.status_code == 415:
        msg += "\nHint: wrong Content-Type for this model/endpoint."
    elif resp.status_code == 422:
        msg += "\nHint: payload schema is wrong for this model."
    raise RuntimeError(msg)

def get_completion(inputs, parameters=None, ENDPOINT_URL=None, token=None, timeout=60):
    """
    Image-to-text request that is robust across:
      - HF serverless (api-inference.huggingface.co/models/<owner>/<model>)
      - HF dedicated endpoints (*.endpoints.huggingface.cloud)
    Strategy:
      1) Send image BYTES with Content-Type: application/octet-stream (most vision models).
      2) If 400/415, and 'inputs' was a URL, RETRY as JSON: {"inputs": "<image_url>", "parameters": {...}}
    """
    url = ENDPOINT_URL or os.getenv("HF_API_ITT_BASE")
    if not url:
        raise RuntimeError("HF_API_ITT_BASE not set; export it or pass ENDPOINT_URL.")

    tok = token or os.getenv("HF_API_TOKEN") or os.getenv("HUGGINGFACEHUB_API_TOKEN")

    # --- Attempt 1: binary bytes (universal for vision) ---
    try:
        img_bytes = _to_image_bytes(inputs)
    except Exception as e:
        # If we can't turn it into bytes but it's a URL, we'll try JSON below
        img_bytes = None

    if img_bytes is not None:
        headers = {"Accept": "application/json", "Content-Type": "application/octet-stream", **_auth_header(tok)}
        resp = requests.post(url, headers=headers, data=img_bytes, timeout=timeout)
        if 200 <= resp.status_code < 300:
            # Parse JSON if possible; otherwise return raw
            if "application/json" in resp.headers.get("Content-Type", ""):
                try:
                    return resp.json()
                except Exception:
                    return resp.text
            return resp.content
        # If the failure looks like a format issue, try JSON fallback when possible
        if resp.status_code in (400, 415) and isinstance(inputs, str) and _is_url(inputs):
            pass  # fall through to JSON retry below
        else:
            _explain_and_raise(resp)

    # --- Attempt 2: JSON with URL (some models/endpoints prefer this) ---
    if isinstance(inputs, str) and _is_url(inputs):
        payload = {"inputs": inputs, "parameters": parameters or {}}
        headers = {"Accept": "application/json", "Content-Type": "application/json", **_auth_header(tok)}
        resp = requests.post(url, headers=headers, json=payload, timeout=timeout)
        if 200 <= resp.status_code < 300:
            try:
                return resp.json()
            except Exception:
                return resp.text
        _explain_and_raise(resp)

    # --- Optional Attempt 3: JSON with base64 (rarely needed, but here if you want it) ---
    if img_bytes is not None:
        b64 = base64.b64encode(img_bytes).decode("utf-8")
        payload = {"inputs": b64, "parameters": parameters or {}}
        headers = {"Accept": "application/json", "Content-Type": "application/json", **_auth_header(tok)}
        resp = requests.post(url, headers=headers, json=payload, timeout=timeout)
        if 200 <= resp.status_code < 300:
            try:
                return resp.json()
            except Exception:
                return resp.text
        _explain_and_raise(resp)

    raise RuntimeError("Could not prepare a valid payload for the image input.")


## Building an image captioning app 

Here we'll be using an [Inference Endpoint](https://huggingface.co/inference-endpoints) for `Salesforce/blip-image-captioning-base` a 14M parameter captioning model.

The free images are available on: https://free-images.com/

In [10]:
image_url = "https://free-images.com/sm/9596/dog_animal_greyhound_983023.jpg"
display(IPython.display.Image(url=image_url))

#get_completion(image_url)


## Need to set up cloud endpoint to use the following code

In [13]:
"""
image_url = "https://free-images.com/sm/9596/dog_animal_greyhound_983023.jpg"
result = get_completion(
    image_url,
    ENDPOINT_URL="https://api-inference.huggingface.co/models/nlpconnect/vit-gpt2-image-captioning",
    token=os.getenv("HF_API_TOKEN"),
)
print(result)
# Typical shape: [{'generated_text': '...'}]
if isinstance(result, list) and result and isinstance(result[0], dict):
    print("Caption:", result[0].get("generated_text"))
"""

'\nimage_url = "https://free-images.com/sm/9596/dog_animal_greyhound_983023.jpg"\nresult = get_completion(\n    image_url,\n    ENDPOINT_URL="https://api-inference.huggingface.co/models/nlpconnect/vit-gpt2-image-captioning",\n    token=os.getenv("HF_API_TOKEN"),\n)\nprint(result)\n# Typical shape: [{\'generated_text\': \'...\'}]\nif isinstance(result, list) and result and isinstance(result[0], dict):\n    print("Caption:", result[0].get("generated_text"))\n'

In [14]:
"""
BLIP_URL = "https://api-inference.huggingface.co/models/Salesforce/blip-image-captioning-base"
res = get_completion(image_url, ENDPOINT_URL=BLIP_URL, token=os.getenv("HF_API_TOKEN"))
print(res)
"""

'\nBLIP_URL = "https://api-inference.huggingface.co/models/Salesforce/blip-image-captioning-base"\nres = get_completion(image_url, ENDPOINT_URL=BLIP_URL, token=os.getenv("HF_API_TOKEN"))\nprint(res)\n'

## Captioning with `gr.Interface()`

#### gr.Image()
- The `type` parameter is the format that the `fn` function expects to receive as its input.  If `type` is `numpy` or `pil`, `gr.Image()` will convert the uploaded file to this format before sending it to the `fn` function.
- If `type` is `filepath`, `gr.Image()` will temporarily store the image and provide a string path to that image location as input to the `fn` function.

In [4]:
import gradio as gr 

def image_to_base64_str(pil_image):
    byte_arr = io.BytesIO()
    pil_image.save(byte_arr, format='PNG')
    byte_arr = byte_arr.getvalue()
    return str(base64.b64encode(byte_arr).decode('utf-8'))

def captioner(image):
    base64_image = image_to_base64_str(image)
    result = get_completion(base64_image)
    return result[0]['generated_text']

gr.close_all()
demo = gr.Interface(fn=captioner,
                    inputs=[gr.Image(label="Upload image", type="pil")],
                    outputs=[gr.Textbox(label="Caption")],
                    title="Image Captioning with BLIP",
                    description="Caption any image using the BLIP model",
                    allow_flagging="never",
                    examples=["christmas_dog.jpeg", "bird_flight.jpeg", "cow.jpeg"])

demo.launch(share=True, server_port=int(os.environ['PORT1']))



Running on local URL:  https://0.0.0.0:7860
IMPORTANT: You are using gradio version 3.37.0, however version 4.44.1 is available, please upgrade.
--------

Could not create share link. Missing file: /usr/local/lib/python3.9/site-packages/gradio/frpc_linux_amd64_v0.2. 

Please check your internet connection. This can happen if your antivirus software blocks the download of this file. You can install manually by following these steps: 

1. Download this file: https://cdn-media.huggingface.co/frpc-gradio-0.2/frpc_linux_amd64
2. Rename the downloaded file to: frpc_linux_amd64_v0.2
3. Move the file to this location: /usr/local/lib/python3.9/site-packages/gradio




In [5]:
gr.close_all()

Closing server running on port: 7860


In [3]:
import os, requests
img = requests.get("https://free-images.com/sm/9596/dog_animal_greyhound_983023.jpg", timeout=30).content
headers = {"Accept":"application/json","Content-Type":"application/octet-stream",
           "Authorization": f"Bearer {os.environ['HF_API_TOKEN']}"}
r = requests.post(os.environ["HF_API_ITT_BASE"], headers=headers, data=img, timeout=60)
r.raise_for_status()
print(r.json())  # typically: [{"generated_text": "..."}]


HTTPError: 404 Client Error: Not Found for url: https://api-inference.huggingface.co/models/nlpconnect/vit-gpt2-image-captioning

## Quick local captioning (ViT-GPT2: small & reliable)

In [1]:
from transformers import pipeline

pipe = pipeline("image-to-text", model="nlpconnect/vit-gpt2-image-captioning", device_map="auto")

img_url = "https://free-images.com/sm/9596/dog_animal_greyhound_983023.jpg"
out = pipe(img_url, max_new_tokens=32)
print(out[0]["generated_text"])


config.json: 0.00B [00:00, ?B/s]



pytorch_model.bin:   0%|          | 0.00/982M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/982M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/241 [00:00<?, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/120 [00:00<?, ?B/s]

preprocessor_config.json:   0%|          | 0.00/228 [00:00<?, ?B/s]

Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.
Device set to use mps
The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
We strongly recommend passing in an `attention_mask` since your input_ids may be padded. See https://huggingface.co/docs/transformers/troubleshooting#incorrect-output-when-padding-tokens-arent-masked.


a dog wearing a red hat and a red bow tie 


In [4]:
display(IPython.display.Image(url=img_url))


## Fastest working captioner (ViT-GPT2) — minimal RAM, great on M-series

In [5]:
from transformers import pipeline
import torch

device = "mps" if torch.backends.mps.is_available() else "cpu"

# Simple + reliable
pipe = pipeline(
    "image-to-text",
    model="nlpconnect/vit-gpt2-image-captioning",
    device=device,               # uses MPS on Apple Silicon
    # model_kwargs={"torch_dtype": "auto"}  # optional
)

img_url = "https://free-images.com/sm/9596/dog_animal_greyhound_983023.jpg"
display(IPython.display.Image(url=img_url))

out = pipe(img_url, max_new_tokens=32)
print(out[0]["generated_text"])


Device set to use mps


a dog wearing a red hat and a red bow tie 


## Higher-quality captions (BLIP base) — use float16 on MPS

In [8]:
import io, requests, torch
from PIL import Image
from transformers import BlipProcessor, BlipForConditionalGeneration

use_mps = torch.backends.mps.is_available()
device = torch.device("mps" if use_mps else "cpu")

processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-base")
model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-base")

# MPS tip: half precision saves memory & speeds up on M-series
if use_mps:
    model = model.to(torch.float16)
model = model.to(device).eval()

img_url = "https://free-images.com/sm/9596/dog_animal_greyhound_983023.jpg"
display(IPython.display.Image(url=img_url))

img = Image.open(io.BytesIO(requests.get(img_url, timeout=30).content)).convert("RGB")

inputs = processor(images=img, return_tensors="pt").to(device)
if use_mps:
    # keep input dtype consistent with model for MPS
    inputs = {k: (v.half() if v.dtype == torch.float32 else v) for k, v in inputs.items()}

with torch.inference_mode():
    out_ids = model.generate(**inputs, max_new_tokens=32)
caption = processor.decode(out_ids[0], skip_special_tokens=True)
print(caption)


a dog wearing a santa hat and a red scarf


## Tiny local Gradio app (runs on your Mac)

In [9]:
import gradio as gr
from transformers import pipeline
import torch

device = "mps" if torch.backends.mps.is_available() else "cpu"
pipe = pipeline("image-to-text", model="nlpconnect/vit-gpt2-image-captioning", device=device)

def caption(img):
    return pipe(img, max_new_tokens=32)[0]["generated_text"]

demo = gr.Interface(fn=caption, inputs=gr.Image(type="pil"), outputs="text",
                    title="Local Image Captioning (MPS)")
demo.launch()


Device set to use mps


* Running on local URL:  http://127.0.0.1:7860
* To create a public link, set `share=True` in `launch()`.




## local first, no HF endpoint needed

In [10]:
# local_itt.py
import torch
from transformers import pipeline

_PIPE = None

def get_completion_local(image, model_name="nlpconnect/vit-gpt2-image-captioning", max_new_tokens=32):
    global _PIPE
    if _PIPE is None:
        device = "mps" if torch.backends.mps.is_available() else "cpu"
        _PIPE = pipeline("image-to-text", model=model_name, device=device)
    out = _PIPE(image, max_new_tokens=max_new_tokens)
    # standardize shape similar to HF serverless
    return [{"generated_text": out[0]["generated_text"]}]


In [11]:
gr.close_all()

Closing server running on port: 7860
