In [22]:
# 라이브러리 로딩
import torch
import torch.nn.functional as F
from transformers import CLIPModel, CLIPProcessor
import os, json, random
from PIL import Image
from collections import defaultdict
from sklearn.metrics.pairwise import cosine_similarity

# 시드 및 디바이스 설정
def set_seed(seed=42):
    """Seed the Python and PyTorch random number generators.

    Args:
        seed: Integer seed handed to ``random.seed``, ``torch.manual_seed``
            and, when CUDA is available, ``torch.cuda.manual_seed_all``.

    Returns:
        None.
    """
    random.seed(seed)
    torch.manual_seed(seed)

    # Nothing GPU-specific to seed when CUDA is not available.
    if not torch.cuda.is_available():
        return
    torch.cuda.manual_seed_all(seed)

# Fix the RNG state before any stochastic step (e.g. image sampling) runs.
set_seed(42)

# Use the GPU when PyTorch can see one; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print("Device:", device)

Device: cuda


In [23]:
# Load the pretrained CLIP model and its processor (Hugging Face, original OpenAI weights)
model_name = "openai/clip-vit-base-patch32"

processor = CLIPProcessor.from_pretrained(model_name)

model = CLIPModel.from_pretrained(model_name)
model.to(device)  # move the weights to the selected device
model.eval()      # inference only: switch the module to evaluation mode

# Data loading: COCO caption annotations
image_dir = "train2017"
caption_json_path = "annotations/captions_train2017.json"

# JSON is UTF-8 by specification; pin the encoding so that reading does not
# depend on the platform's default locale encoding.
with open(caption_json_path, 'r', encoding='utf-8') as f:
    coco_data = json.load(f)

# Group the captions by the image they describe: image_id -> [caption, ...],
# in the order the annotations appear in the file.
imgid_to_captions = defaultdict(list)
for ann in coco_data['annotations']:
    imgid_to_captions[ann['image_id']].append(ann['caption'])

In [24]:
# Randomly pick a few images. The draw uses the global `random` state, so
# re-running only this cell (without re-seeding) selects different images.
NUM_SAMPLE_IMAGES = 3    # how many images to sample
CAPTIONS_PER_IMAGE = 5   # "best-of-5": candidate captions considered per image

image_list = sorted([f for f in os.listdir(image_dir) if f.endswith(".jpg")])
sample_images = random.sample(image_list, NUM_SAMPLE_IMAGES)

# Best-of-5 caption selection: for each image keep the candidate caption whose
# embedding is most similar to the image embedding.
image_embeds = []    # one normalized image embedding per sampled image
text_embeds = []     # embedding of the best caption for each sampled image
best_captions = []   # text of the best caption for each sampled image

for fname in sample_images:
    image_path = os.path.join(image_dir, fname)
    # The numeric file stem is used as the COCO image id.
    image_id = int(os.path.splitext(fname)[0])
    # .get() avoids inserting an empty entry into the defaultdict for ids
    # that have no annotations.
    cap_list = imgid_to_captions.get(image_id, [])[:CAPTIONS_PER_IMAGE]
    if not cap_list:
        # Fail with a clear message instead of an obscure tokenizer error.
        raise ValueError(f"No captions found for image {fname} (image_id={image_id})")

    # Encode the image. The context manager closes the underlying file handle;
    # convert() returns an independent in-memory copy, so it stays usable.
    with Image.open(image_path) as raw_image:
        image = raw_image.convert("RGB")
    image_input = processor(images=image, return_tensors="pt").to(device)
    with torch.no_grad():
        image_embed = model.get_image_features(**image_input)
        image_embed = F.normalize(image_embed, dim=-1)

    # Encode the candidate captions as one padded batch.
    text_input = processor(text=cap_list, return_tensors="pt", padding=True, truncation=True).to(device)
    with torch.no_grad():
        text_feats = model.get_text_features(**text_input)
        text_feats = F.normalize(text_feats, dim=-1)

    # Both sides are L2-normalized, so the dot product is the cosine similarity.
    # Presumably image_embed is (1, dim) and text_feats is (num_captions, dim);
    # squeeze(0) then leaves one score per caption, even with a single caption.
    caption_sims = torch.matmul(image_embed, text_feats.T).squeeze(0)
    best_idx = caption_sims.argmax().item()

    image_embeds.append(image_embed)
    text_embeds.append(text_feats[best_idx].unsqueeze(0))
    best_captions.append(cap_list[best_idx])

In [25]:
# Cosine similarity between every sampled image and every selected caption.
#
# `image_embeds` / `text_embeds` start out as lists of per-image tensors and
# are replaced here by stacked 2-D tensors. Reshaping every element to a
# single row before concatenating makes this cell safe to re-run: iterating
# the already-stacked tensor yields its rows, which stack back to the same
# matrix instead of being flattened into one long vector.
image_embeds = torch.cat([e.reshape(1, -1) for e in image_embeds], dim=0)
text_embeds = torch.cat([e.reshape(1, -1) for e in text_embeds], dim=0)

# sims[i, j] is the similarity of image i with the best caption of image j.
sims = cosine_similarity(image_embeds.cpu().numpy(), text_embeds.cpu().numpy())


In [26]:
# Report: for every sampled image, show its best caption and the similarity
# between that image and that caption (the diagonal of `sims`).
print("\nCLIP Best-of-5 Caption Cosine Similarity:")
for idx in range(len(sample_images)):
    print(f"\n {sample_images[idx]}")
    print(f" Best caption: \"{best_captions[idx]}\"")
    print(f" Similarity: {sims[idx, idx]:.4f}")


CLIP Best-of-5 Caption Cosine Similarity:

 000000443093.jpg
 Best caption: "a close up of an open box of doughnuts "
 Similarity: 0.3365

 000000077159.jpg
 Best caption: "A skier on a steep snowy hill with evergreen trees."
 Similarity: 0.3271

 000000017560.jpg
 Best caption: "A tan cat is lying on a bed looking at something off camera."
 Similarity: 0.3188
