In [1]:
import torch
import torchvision

# Report the library versions and the accelerator visible to this kernel.
print("torch:", torch.__version__)
print("torchvision:", torchvision.__version__)
print("CUDA available?:", torch.cuda.is_available())
# torch.cuda.get_device_name(0) raises on a machine without a CUDA device,
# so only query the device name when CUDA is actually available.
if torch.cuda.is_available():
    print("GPU:", torch.cuda.get_device_name(0))
else:
    print("GPU:", "none (CPU only)")

torch: 2.4.1+cu124
torchvision: 0.19.1+cu124
CUDA available?: True
GPU: NVIDIA A40


In [2]:
# 0. (필요 시) 패키지 설치
!pip install transformers>=4.40.0 pillow

[notice] A new release of pip is available: 24.2 -> 25.1.1
[notice] To update, run: python -m pip install --upgrade pip


In [3]:
# 1. 라이브러리 임포트
import torch
from PIL import Image
from transformers import BlipProcessor, BlipForConditionalGeneration

In [4]:
# 2. Device selection: use the GPU when CUDA is available, otherwise the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
print(f"Using device: {device}")

Using device: cuda


In [5]:
# 3. Load the processor and the model
from transformers import BlipProcessor, BlipForConditionalGeneration

model_name = "Salesforce/blip-image-captioning-base"

# (1) Ask for the fast processor implementation.
processor = BlipProcessor.from_pretrained(model_name, use_fast=True)

# (2) Force loading the weights from the safetensors checkpoint,
#     then move the model onto the selected device.
model = BlipForConditionalGeneration.from_pretrained(model_name, use_safetensors=True)
model = model.to(device)


preprocessor_config.json:   0%|          | 0.00/287 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/506 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/711k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/4.56k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/990M [00:00<?, ?B/s]

In [6]:
# 4. Load and preprocess the image
image_path = "Golden-Retriever.jpg"  # adjust the path for your environment

# Open the file in a context manager so its handle is released deterministically.
# convert() returns a converted copy, so `image` stays usable after the file is closed.
with Image.open(image_path) as source_image:
    image = source_image.convert("RGB")

# Turn the image into PyTorch tensors and move them onto the selected device.
inputs = processor(images=image, return_tensors="pt").to(device)

In [7]:
# 5. Generate the caption
generation_options = {
    "max_length": 64,        # maximum token length to generate
    "num_beams": 5,          # beam-search width
    "early_stopping": True,
}

# Gradients are not needed for generation, so run it under no_grad.
with torch.no_grad():
    out_ids = model.generate(pixel_values=inputs.pixel_values, **generation_options)

In [8]:
# 6. Decode the result and print it
first_sequence = out_ids[0]  # first generated token-id sequence
caption = processor.decode(first_sequence, skip_special_tokens=True)
print("🔹 Generated Caption:", caption)

🔹 Generated Caption: a dog sitting on a log in a field
