# BLIP-2 Demo — Captioning and VQA (Transformers)

This notebook demonstrates image captioning and visual question answering with **BLIP-2 (Flan-T5 XL)** using 🤗 Transformers.

In [None]:
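# If running on Colab or a fresh environment, uncomment the next lines to install the dependencies.
# `%pip` installs into the environment of the running kernel. The PyTorch line uses the `cu121` wheel index; adjust it to match your setup.
# %pip install -U torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu121
# %pip install -U transformers accelerate pillow safetensors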


In [None]:
from io import BytesIO
from urllib.request import urlopen

from PIL import Image
import torch
from transformers import Blip2Processor, Blip2ForConditionalGeneration

# Pick the compute device: use CUDA when torch reports it as available,
# otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'
print('Device:', device)


In [None]:
MODEL_ID = 'Salesforce/blip2-flan-t5-xl'

# Weights are loaded in half precision when running on CUDA and in full
# precision otherwise.
if device == 'cuda':
    model_dtype = torch.float16
else:
    model_dtype = torch.float32

# The processor and the model are both loaded from the same checkpoint id.
processor = Blip2Processor.from_pretrained(MODEL_ID)
model = Blip2ForConditionalGeneration.from_pretrained(
    MODEL_ID,
    torch_dtype=model_dtype,
    device_map=None,
)
# Move the loaded model onto the selected device.
model = model.to(device)

print('Loaded model:', MODEL_ID)


## Image Captioning

In [None]:
# Set your image path or URL here (a local file path or an http(s) URL).
IMAGE_PATH = 'sample.jpg'  # e.g., '/path/to/your_image.jpg' or 'https://example.com/your_image.jpg'

# Load image.
# Image.open() takes a local path or a file-like object, not a URL, so an
# http(s) source is downloaded into an in-memory buffer first.
if str(IMAGE_PATH).lower().startswith(('http://', 'https://')):
    with urlopen(IMAGE_PATH, timeout=30) as response:
        image_source = BytesIO(response.read())
else:
    image_source = IMAGE_PATH

# convert() returns a new RGB image, so the opened file can be closed as soon
# as the `with` block exits.
with Image.open(image_source) as raw_image:
    image = raw_image.convert('RGB')

# Generate caption.
# Only the image is passed to the processor (no text prompt). The inputs are
# moved to the selected device using float16 on CUDA and float32 otherwise.
input_dtype = torch.float16 if device == 'cuda' else torch.float32
inputs = processor(images=image, return_tensors='pt').to(device, dtype=input_dtype)
with torch.no_grad():  # inference only: no gradients needed
    generated_ids = model.generate(**inputs, max_new_tokens=30)
# batch_decode returns one string per generated sequence; take the first.
caption = processor.batch_decode(generated_ids, skip_special_tokens=True)[0].strip()
print('Caption:', caption)


## Visual Question Answering (Zero-shot)

In [None]:
# Ask a question about the image loaded in the captioning cell.
question = 'What is the main object in the image?'

# Wrap the question in the "Question: ... Short answer:" prompt template.
prompt = f'Question: {question} Short answer:'

# Encode image + prompt, then move the tensors to the selected device
# (float16 on CUDA, float32 otherwise).
vqa_dtype = torch.float16 if device == 'cuda' else torch.float32
encoded = processor(images=image, text=prompt, return_tensors='pt')
inputs = encoded.to(device, dtype=vqa_dtype)

# Generate at most 20 new tokens without tracking gradients.
with torch.no_grad():
    generated_ids = model.generate(**inputs, max_new_tokens=20)

# Decode the first generated sequence and trim surrounding whitespace.
decoded = processor.batch_decode(generated_ids, skip_special_tokens=True)
answer = decoded[0].strip()
print('Q:', question)
print('A:', answer)
