# BLIP (Bootstrapped Language-Image Pretraining)

In [4]:
from transformers import BlipProcessor, BlipForConditionalGeneration
from PIL import Image
import torch
from google.colab import userdata

# Read the 'HF_TOKEN' entry through google.colab's userdata helper; the value
# is forwarded to both from_pretrained calls below.
HF_TOKEN = userdata.get('HF_TOKEN')

# The processor and the captioning model are loaded from the same checkpoint.
BLIP_CHECKPOINT = "Salesforce/blip-image-captioning-base"
processor = BlipProcessor.from_pretrained(BLIP_CHECKPOINT, token=HF_TOKEN)
model = BlipForConditionalGeneration.from_pretrained(BLIP_CHECKPOINT, token=HF_TOKEN)

# Open the test picture, force it to RGB, and let the processor turn it into
# model inputs (return_tensors='pt').
source_image = Image.open('test.jpg')
image = source_image.convert('RGB')
inputs = processor(images=image, return_tensors='pt')

# Generation result; the next cell decodes output[0] into the caption text.
output = model.generate(**inputs)

config.json:   0%|          | 0.00/4.56k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/990M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/990M [00:00<?, ?B/s]

In [5]:
# Decode the first generated sequence, dropping special tokens, then report it.
blip_caption = processor.decode(output[0], skip_special_tokens=True)
print("BLIP caption:", blip_caption)

BLIP caption: a puppy sitting in the grass with its mouth open


# CLIP (Contrastive Language-Image Pretraining)

In [6]:
!pip install git+https://github.com/openai/CLIP.git

Collecting git+https://github.com/openai/CLIP.git
  Cloning https://github.com/openai/CLIP.git to /tmp/pip-req-build-xwddk2ia
  Running command git clone --filter=blob:none --quiet https://github.com/openai/CLIP.git /tmp/pip-req-build-xwddk2ia
  Resolved https://github.com/openai/CLIP.git to commit dcba3cb2e2827b402d2701e7e1c7d9fed8a20ef1
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting ftfy (from clip==1.0)
  Downloading ftfy-6.3.1-py3-none-any.whl.metadata (7.3 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch->clip==1.0)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch->clip==1.0)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch->clip==1.0)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting 

In [2]:
import torch
import clip
from PIL import Image

# Run on the GPU when torch reports one, otherwise on the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# NOTE(review): this rebinds `model` and `processor`, which the BLIP cell
# earlier in the notebook also assigns. Re-running the BLIP print cell after
# this one would use the CLIP objects instead.
model, processor = clip.load("ViT-B/32", device=device)

# Preprocess the test picture, add a leading batch dimension, move to device.
preprocessed = processor(Image.open('test.jpg'))
image = preprocessed.unsqueeze(0).to(device)

# Candidate captions to score against the image.
caption_options = [
    "a dog on the grass",
    "a pug sitting",
    "a cute cat",
    "a cat on the table",
]

captions = clip.tokenize(caption_options).to(device)

# Inference only, so gradient tracking is switched off for the forward passes.
with torch.no_grad():
    # NOTE(review): these two feature tensors are not read by anything visible
    # in this cell; the joint call below presumably encodes both inputs again.
    # Confirm no later cell needs them before removing.
    image_features = model.encode_image(image)
    text_features = model.encode_text(captions)
    logits_per_image, _ = model(image, captions)

# Softmax over the caption axis turns the image-to-caption logits into
# probabilities; the logits were produced under no_grad, so they convert to
# NumPy directly.
probs = logits_per_image.softmax(dim=-1).cpu().numpy()

# argmax over the (flattened) probabilities selects the winning caption.
best_index = int(probs.argmax())
print("CLIP's best caption:", caption_options[best_index])

CLIP's best caption: a dog on the grass


# [실습] 이미지 캡션
1. BLIP을 이용해 이미지에 적절한 캡션 생성
2. OpenAI API를 이용해 보기가 될 캡션 생성
3. 사용자의 선택
4. CLIP을 이용해 가장 유사도가 높은 캡션 매칭
5. 결과 출력: 정답/오답 여부, BLIP 생성 캡션, CLIP 매칭 캡션, CLIP 유사도 점수