In [1]:

import warnings
warnings.filterwarnings("ignore")

import os
os.environ['PYTHONWARNINGS'] = 'ignore'

import math
import random
from pathlib import Path
from typing import List, Optional, Dict
import pandas as pd
from PIL import Image
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
import pytorch_lightning as pl
from torchvision import transforms

In [2]:
# 시드 설정 함수
def set_seed(seed: int = 42):
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)

In [3]:
# CLIP Contrastive Loss
def clip_contrastive_loss(logits_per_image: torch.Tensor, logits_per_text: torch.Tensor) -> torch.Tensor:
    b = logits_per_image.size(0)
    target = torch.arange(b, device=logits_per_image.device)
    return 0.5 * (F.cross_entropy(logits_per_image, target) +
                  F.cross_entropy(logits_per_text,  target))

In [6]:
# 간단한 Vision Transformer (이미지 인코더)
class SimpleViT(nn.Module):
    def __init__(self, image_size=224, patch_size=32, embed_dim=512, num_layers=6, num_heads=8):
        super().__init__()
        self.patch_size = patch_size
        self.num_patches = (image_size // patch_size) ** 2
        
        # 패치 임베딩
        self.patch_embed = nn.Conv2d(3, embed_dim, kernel_size=patch_size, stride=patch_size)
        
        # 위치 임베딩 (CLS 토큰 + 패치들)
        self.pos_embed = nn.Parameter(torch.randn(1, self.num_patches + 1, embed_dim))
        self.cls_token = nn.Parameter(torch.randn(1, 1, embed_dim))
        
        # Transformer 블록들
        encoder_layer = nn.TransformerEncoderLayer(
            d_model=embed_dim, 
            nhead=num_heads, 
            dim_feedforward=embed_dim*4,
            dropout=0.1,
            batch_first=True
        )
        self.transformer = nn.TransformerEncoder(encoder_layer, num_layers=num_layers)
        
        # 최종 프로젝션
        self.ln_final = nn.LayerNorm(embed_dim)
        self.projection = nn.Linear(embed_dim, embed_dim)
        
    def forward(self, x):
        B = x.shape[0]
        
        # 패치로 나누기 (B, 3, H, W) -> (B, embed_dim, H/P, W/P) -> (B, N, embed_dim)
        x = self.patch_embed(x)  # (B, embed_dim, 7, 7)
        x = x.flatten(2).transpose(1, 2)  # (B, 49, embed_dim)
        
        # CLS 토큰 추가
        cls_tokens = self.cls_token.expand(B, -1, -1)  # (B, 1, embed_dim)
        x = torch.cat([cls_tokens, x], dim=1)  # (B, 50, embed_dim)
        
        # 위치 임베딩 추가
        x = x + self.pos_embed
        
        # Transformer
        x = self.transformer(x)
        
        # CLS 토큰만 사용
        x = self.ln_final(x[:, 0])  # (B, embed_dim)
        x = self.projection(x)
        
        # L2 정규화
        return F.normalize(x, p=2, dim=-1)


In [7]:
class SimpleTextEncoder(nn.Module):
    def __init__(self, vocab_size=10000, embed_dim=512, max_len=77, num_layers=4, num_heads=8):
        super().__init__()
        self.max_len = max_len
        self.vocab_size = vocab_size
        
        # 토큰 임베딩
        self.token_embed = nn.Embedding(vocab_size, embed_dim)
        self.pos_embed = nn.Parameter(torch.randn(1, max_len, embed_dim))
        
        # Transformer
        encoder_layer = nn.TransformerEncoderLayer(
            d_model=embed_dim,
            nhead=num_heads,
            dim_feedforward=embed_dim*4,
            dropout=0.1,
            batch_first=True
        )
        self.transformer = nn.TransformerEncoder(encoder_layer, num_layers=num_layers)
        
        # 최종 프로젝션
        self.ln_final = nn.LayerNorm(embed_dim)
        self.projection = nn.Linear(embed_dim, embed_dim)
        
    def tokenize(self, texts: List[str]) -> torch.LongTensor:
        """간단한 단어 기반 토크나이저"""
        # 간단한 전처리
        tokens = []
        for text in texts:
            # 소문자 변환, 특수문자 제거, 공백으로 분할
            words = text.lower().replace('.', '').replace(',', '').split()
            # 단어를 숫자로 변환 (해시 기반 - 실제로는 vocab 사전 필요)
            token_ids = [1]  # SOS 토큰
            for word in words[:self.max_len-2]:  # SOS, EOS 공간 확보
                # 간단한 해시 기반 토큰화 (실제로는 BPE나 WordPiece 사용)
                token_id = (hash(word) % (self.vocab_size - 100)) + 100  # 특수토큰 공간 확보
                token_ids.append(token_id)
            token_ids.append(2)  # EOS 토큰
            
            # 패딩
            while len(token_ids) < self.max_len:
                token_ids.append(0)  # PAD 토큰
                
            tokens.append(token_ids[:self.max_len])
        
        return torch.LongTensor(tokens)
    
    def forward(self, token_ids):
        # 토큰 임베딩
        x = self.token_embed(token_ids)  # (B, seq_len, embed_dim)
        
        # 위치 임베딩 추가
        x = x + self.pos_embed[:, :x.size(1)]
        
        # 패딩 마스크 생성
        mask = (token_ids == 0)  # PAD 토큰 위치
        
        # Transformer
        x = self.transformer(x, src_key_padding_mask=mask)
        
        # EOS 토큰 위치 찾기 (간단하게 마지막 non-pad 토큰 사용)
        seq_lens = (token_ids != 0).sum(dim=1) - 1  # EOS 토큰 위치
        batch_idx = torch.arange(x.size(0))
        x = x[batch_idx, seq_lens]  # (B, embed_dim)
        
        # 최종 처리
        x = self.ln_final(x)
        x = self.projection(x)
        
        # L2 정규화
        return F.normalize(x, p=2, dim=-1)

In [8]:
class SimpleCLIP(nn.Module):
    def __init__(self, image_size=224, patch_size=32, embed_dim=512):
        super().__init__()
        
        # 이미지와 텍스트 인코더
        self.image_encoder = SimpleViT(image_size, patch_size, embed_dim)
        self.text_encoder = SimpleTextEncoder(embed_dim=embed_dim)
        
        # Temperature 파라미터 (학습 가능)
        self.logit_scale = nn.Parameter(torch.tensor(math.log(1/0.07)))
        
        # 이미지 전처리
        self.preprocess = transforms.Compose([
            transforms.Resize((image_size, image_size)),
            transforms.ToTensor(),
            transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
        ])
    
    @property
    def temperature(self):
        return self.logit_scale.exp()
    
    def encode_image(self, images):
        return self.image_encoder(images)
    
    def encode_text(self, text_tokens):
        return self.text_encoder(text_tokens)
    
    def forward(self, images, text_tokens):
        # 임베딩 추출
        image_embeds = self.encode_image(images)  # (B, embed_dim)
        text_embeds = self.encode_text(text_tokens)  # (B, embed_dim)
        
        # 유사도 계산 (temperature 적용)
        logits = self.temperature * image_embeds @ text_embeds.t()
        
        return {
            "image_embeds": image_embeds,
            "text_embeds": text_embeds, 
            "logits_per_image": logits,
            "logits_per_text": logits.t()
        }

In [9]:
IMG_EXTS = {".jpg", ".jpeg", ".png", ".bmp", ".webp"}

class ImageTextCsv(Dataset):
    def __init__(self, csv_path: str, img_root: str, preprocess):
        self.df = pd.read_csv(csv_path)
        self.root = Path(img_root)
        self.preprocess = preprocess

    def __len__(self): 
        return len(self.df)

    def __getitem__(self, i) -> Dict[str, object]:
        row = self.df.iloc[i]
        img = Image.open(self.root / row["image_path"]).convert("RGB")
        return {"image": self.preprocess(img), "text": str(row["caption"])}

def build_collate_clip(tokenize_fn):
    def _fn(batch: List[dict]):
        images = torch.stack([b["image"] for b in batch])
        texts  = [b["text"] for b in batch]
        tokens = tokenize_fn(texts)  # (B, 77)
        return {"images": images, "text_tokens": tokens, "raw_texts": texts}
    return _fn

In [10]:
class SimpleCLIPLightning(pl.LightningModule):
    def __init__(self, embed_dim=512, lr: float = 1e-4, weight_decay: float = 0.01):
        super().__init__()
        self.save_hyperparameters()
        self.model = SimpleCLIP(embed_dim=embed_dim)

    @property
    def temperature(self):
        return self.model.temperature

    def training_step(self, batch, _):
        out = self.model(images=batch["images"], text_tokens=batch["text_tokens"])
        loss = clip_contrastive_loss(out["logits_per_image"], out["logits_per_text"])
        self.log("train/loss", loss, prog_bar=True, on_step=True, on_epoch=True)
        self.log("train/logit_scale", self.temperature, on_step=True, prog_bar=False)
        return loss

    def validation_step(self, batch, _):
        out = self.model(images=batch["images"], text_tokens=batch["text_tokens"])
        loss = clip_contrastive_loss(out["logits_per_image"], out["logits_per_text"])
        self.log("val/loss", loss, prog_bar=True, on_epoch=True)

    def on_before_optimizer_step(self, optimizer):
        # logit_scale 상한 제한 (안정화)
        with torch.no_grad():
            self.model.logit_scale.clamp_(max=math.log(100.0))

    def configure_optimizers(self):
        opt = torch.optim.AdamW(self.parameters(), lr=self.hparams.lr, weight_decay=self.hparams.weight_decay, betas=(0.9, 0.98))
        sch = torch.optim.lr_scheduler.CosineAnnealingLR(opt, T_max=self.trainer.max_steps or 1000)
        return {"optimizer": opt, "lr_scheduler": {"scheduler": sch, "interval": "step"}}

In [11]:
class SimpleCLIPDataModule(pl.LightningDataModule):
    def __init__(self, train_csv: str, img_root: str, val_csv: str = None, batch_size: int = 128,
                 num_workers: int = 4, embed_dim: int = 512):
        super().__init__()
        
        # 간단한 CLIP 모델에서 전처리와 토크나이저 가져오기
        clip_model = SimpleCLIP(embed_dim=embed_dim)
        self.preprocess = clip_model.preprocess
        self.tokenize = clip_model.text_encoder.tokenize

        self.train_csv, self.val_csv = train_csv, val_csv
        self.img_root = img_root
        self.batch_size, self.num_workers = batch_size, num_workers

    def setup(self, stage=None):
        self.ds_train = ImageTextCsv(self.train_csv, self.img_root, self.preprocess)
        self.ds_val   = ImageTextCsv(self.val_csv,   self.img_root, self.preprocess) if self.val_csv else None

    def train_dataloader(self):
        return DataLoader(self.ds_train, batch_size=self.batch_size, shuffle=True,
                          num_workers=self.num_workers, pin_memory=True,
                          persistent_workers=self.num_workers>0, prefetch_factor=2 if self.num_workers>0 else None,
                          drop_last=True, collate_fn=build_collate_clip(self.tokenize))

    def val_dataloader(self):
        if self.ds_val is None: return None
        return DataLoader(self.ds_val, batch_size=self.batch_size, shuffle=False,
                          num_workers=self.num_workers, pin_memory=True,
                          persistent_workers=self.num_workers>0, prefetch_factor=2 if self.num_workers>0 else None,
                          drop_last=False, collate_fn=build_collate_clip(self.tokenize))

In [12]:
# 학습 하이퍼파라미터 설정
embed_dim = 512  # 임베딩 차원
train_csv = "data/train.csv"
val_csv = None  # 검증 데이터가 있으면 경로 지정
img_root = "data/images"
batch_size = 32  # 작은 모델이므로 배치 크기 줄임
num_workers = 0  # 윈도우에서는 0 권장
epochs = 3
lr = 1e-3  # 처음부터 학습하므로 학습률 높임
weight_decay = 0.01
seed = 42

In [14]:
project_root = Path("C:/python/clip-demo")  # 실제 프로젝트 경로
if project_root.exists():
    os.chdir(project_root)
    print(f"작업 디렉토리를 {project_root}로 변경했습니다")
else:
    print("프로젝트 디렉토리를 찾을 수 없습니다. 경로를 확인해주세요.")

print(f"현재 작업 디렉토리: {os.getcwd()}")

# 시드 설정
set_seed(seed)

# 데이터 모듈 생성
dm = SimpleCLIPDataModule(train_csv, img_root, val_csv,
                         batch_size=batch_size, num_workers=num_workers,
                         embed_dim=embed_dim)

# 모델 생성
model = SimpleCLIPLightning(embed_dim=embed_dim, lr=lr, weight_decay=weight_decay)

print(f"모델 파라미터 수: {sum(p.numel() for p in model.parameters()):,}")

# 트레이너 설정
trainer = pl.Trainer(
    max_epochs=epochs,
    precision="32",
    gradient_clip_val=1.0,
    log_every_n_steps=10,
    enable_checkpointing=False,
    accelerator="gpu" if torch.cuda.is_available() else "cpu",
    devices=1,
    num_sanity_val_steps=0,
    limit_val_batches=0,
)

# 학습 시작
trainer.fit(model, dm)

작업 디렉토리를 C:\python\clip-demo로 변경했습니다
현재 작업 디렉토리: C:\python\clip-demo
모델 파라미터 수: 38,810,113


GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name  | Type       | Params | Mode 
---------------------------------------------
0 | model | SimpleCLIP | 38.8 M | train
---------------------------------------------
38.8 M    Trainable params
0         Non-trainable params
38.8 M    Total params
155.240   Total estimated model params size (MB)
113       Modules in train mode
0         Modules in eval mode
`Trainer.fit` stopped: No training batches.


In [15]:
# 학습된 모델로 간단한 테스트
model.eval()

# 디바이스 확인 및 설정
device = next(model.parameters()).device
print(f"모델이 위치한 디바이스: {device}")

with torch.no_grad():
    # 텍스트 임베딩 생성
    texts = ["a photo of a cat", "a photo of a dog"]
    text_tokens = model.model.text_encoder.tokenize(texts)
    
    # 토큰을 모델과 같은 디바이스로 이동
    text_tokens = text_tokens.to(device)
    print(f"텍스트 토큰 shape: {text_tokens.shape}")
    print(f"텍스트 토큰 디바이스: {text_tokens.device}")
    
    # 정규화된 텍스트 임베딩 얻기
    text_embeds = model.model.encode_text(text_tokens)
    
    print(f"텍스트 임베딩 크기: {text_embeds.shape}")
    print(f"텍스트 임베딩 디바이스: {text_embeds.device}")
    print(f"임베딩 norm: {text_embeds.norm(dim=-1)}")  # L2 정규화 확인 (1에 가까워야 함)
    
    # 코사인 유사도 계산 (정규화된 벡터 간의 내적)
    cosine_similarity = (text_embeds[0] @ text_embeds[1]).item()
    print(f"텍스트 간 코사인 유사도: {cosine_similarity:.4f}")
    
    # temperature가 적용된 유사도
    temperature = model.model.temperature.item()
    scaled_similarity = temperature * cosine_similarity
    print(f"Temperature: {temperature:.4f}")
    print(f"Temperature 적용된 유사도: {scaled_similarity:.4f}")
    
    # 추가 테스트: 더 많은 텍스트들
    more_texts = ["a photo of a cat", "a photo of a dog", "a photo of a car", "a photo of a tree"]
    more_tokens = model.model.text_encoder.tokenize(more_texts)
    more_tokens = more_tokens.to(device)
    
    more_embeds = model.model.encode_text(more_tokens)
    similarity_matrix = more_embeds @ more_embeds.t()
    
    print(f"\n텍스트 간 유사도 매트릭스 (코사인 유사도):")
    for i, text1 in enumerate(more_texts):
        for j, text2 in enumerate(more_texts):
            sim = similarity_matrix[i, j].item()
            print(f"{text1[:15]:15} vs {text2[:15]:15}: {sim:.4f}")
    
    # 이미지 테스트 (간단한 더미 이미지)
    dummy_images = torch.randn(2, 3, 224, 224).to(device)
    image_embeds = model.model.encode_image(dummy_images)
    print(f"\n이미지 임베딩 크기: {image_embeds.shape}")
    print(f"이미지 임베딩 norm: {image_embeds.norm(dim=-1)}")
    
    # 이미지-텍스트 간 유사도
    img_text_sim = image_embeds @ text_embeds.t()
    print(f"\n이미지-텍스트 유사도 매트릭스:")
    print(img_text_sim)

모델이 위치한 디바이스: cpu
텍스트 토큰 shape: torch.Size([2, 77])
텍스트 토큰 디바이스: cpu
텍스트 임베딩 크기: torch.Size([2, 512])
텍스트 임베딩 디바이스: cpu
임베딩 norm: tensor([1.0000, 1.0000])
텍스트 간 코사인 유사도: 0.9800
Temperature: 14.2857
Temperature 적용된 유사도: 13.9997

텍스트 간 유사도 매트릭스 (코사인 유사도):
a photo of a ca vs a photo of a ca: 1.0000
a photo of a ca vs a photo of a do: 0.9800
a photo of a ca vs a photo of a ca: 0.9801
a photo of a ca vs a photo of a tr: 0.9798
a photo of a do vs a photo of a ca: 0.9800
a photo of a do vs a photo of a do: 1.0000
a photo of a do vs a photo of a ca: 0.9852
a photo of a do vs a photo of a tr: 0.9849
a photo of a ca vs a photo of a ca: 0.9801
a photo of a ca vs a photo of a do: 0.9852
a photo of a ca vs a photo of a ca: 1.0000
a photo of a ca vs a photo of a tr: 0.9870
a photo of a tr vs a photo of a ca: 0.9798
a photo of a tr vs a photo of a do: 0.9849
a photo of a tr vs a photo of a ca: 0.9870
a photo of a tr vs a photo of a tr: 1.0000

이미지 임베딩 크기: torch.Size([2, 512])
이미지 임베딩 norm: tensor([1.