## CLIP 클래스 구현

In [None]:
#필요 패키지 임포트
import torch
import torch.nn as nn
import torch.nn.functional as F
import torchvision.models as models
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer, AutoModel
from PIL import Image
import torchvision.transforms as transforms
import numpy as np

In [None]:
# CLIP model implementation
class CLIP(nn.Module):
    """Dual-encoder model that scores image/text pairs by cosine similarity.

    An image encoder and a text encoder each project their input to an
    ``embed_dim``-dimensional vector. ``forward`` L2-normalizes both sets of
    vectors and returns their pairwise similarities multiplied by a learned
    scale.

    Args:
        embed_dim: Size of the shared image/text embedding space.
        image_resolution: Passed to the image encoder as ``input_resolution``.
        vision_layers: Passed to the image encoder as ``layers``.
        vision_width: Passed to the image encoder as ``width``.
        vision_patch_size: Passed to the image encoder as ``patch_size``.
        context_length: Number of learned positional embeddings, i.e. the
            longest token sequence ``encode_text`` accepts.
        vocab_size: Number of rows in the token embedding table.
        transformer_width: Hidden size of the text encoder.
        transformer_heads: Number of attention heads in the text encoder.
        transformer_layers: Number of layers in the text encoder.
    """

    def __init__(self, embed_dim=512, image_resolution=224, vision_layers=12,
                 vision_width=768, vision_patch_size=32, context_length=77,
                 vocab_size=49408, transformer_width=512, transformer_heads=8,
                 transformer_layers=12):
        super().__init__()

        # Image encoder (simplified stand-in for a Vision Transformer).
        self.visual = VisionTransformer(
            input_resolution=image_resolution,
            patch_size=vision_patch_size,
            width=vision_width,
            layers=vision_layers,
            output_dim=embed_dim
        )

        # Text encoder (simplified Transformer).
        self.transformer = Transformer(
            width=transformer_width,
            layers=transformer_layers,
            heads=transformer_heads,
            context_length=context_length
        )

        self.vocab_size = vocab_size
        self.token_embedding = nn.Embedding(vocab_size, transformer_width)
        self.positional_embedding = nn.Parameter(torch.empty(context_length, transformer_width))
        self.ln_final = nn.LayerNorm(transformer_width)

        # Maps the text encoder's hidden state into the shared embedding space.
        self.text_projection = nn.Parameter(torch.empty(transformer_width, embed_dim))
        # Stored in log space; forward() exponentiates it. Starts at 1 / 0.07.
        self.logit_scale = nn.Parameter(torch.ones([]) * np.log(1 / 0.07))

        self.initialize_parameters()

    def initialize_parameters(self):
        """Fill the parameters that were allocated with ``torch.empty``."""
        nn.init.normal_(self.token_embedding.weight, std=0.02)
        nn.init.normal_(self.positional_embedding, std=0.01)
        nn.init.normal_(self.text_projection, std=self.transformer.width ** -0.5)

    def encode_image(self, image):
        """Return the image encoder's output for ``image``."""
        return self.visual(image)

    def encode_text(self, text):
        """Embed a batch of token-id sequences.

        Args:
            text: Integer tensor of shape ``(batch, seq_len)`` with
                ``seq_len <= context_length``.

        Returns:
            Tensor of shape ``(batch, embed_dim)``.
        """
        x = self.token_embedding(text)  # (batch, seq_len, width)
        # Slice the positional table so sequences shorter than context_length
        # are accepted; for full-length input the slice is a no-op.
        x = x + self.positional_embedding[: x.shape[1]]
        # NOTE(review): no attention mask is passed to the text encoder here,
        # whereas the reference CLIP text encoder uses a causal mask - confirm
        # that this simplification is intended.
        x = x.permute(1, 0, 2)  # NLD -> LND
        x = self.transformer(x)
        x = x.permute(1, 0, 2)  # LND -> NLD
        x = self.ln_final(x)

        # x.shape = [batch_size, n_ctx, transformer.width]
        # Take the features at the position of the largest token id in each
        # sequence. This assumes the tokenizer's end-of-text token has the
        # highest id in the vocabulary.
        eot_positions = text.argmax(dim=-1)
        batch_index = torch.arange(x.shape[0], device=x.device)
        x = x[batch_index, eot_positions] @ self.text_projection

        return x

    def forward(self, image, text):
        """Compute scaled cosine-similarity logits between images and texts.

        Args:
            image: Batch of images accepted by the image encoder.
            text: Batch of token-id sequences accepted by ``encode_text``.

        Returns:
            ``(logits_per_image, logits_per_text)``: the first has one row per
            image and one column per text; the second is its transpose.
        """
        # L2-normalize along the feature axis. F.normalize clamps the norm with
        # a small epsilon, so an all-zero feature vector stays zero instead of
        # becoming NaN.
        image_features = F.normalize(self.encode_image(image), dim=-1)
        text_features = F.normalize(self.encode_text(text), dim=-1)

        # Logits are the cosine similarities multiplied by the learned scale.
        logit_scale = self.logit_scale.exp()
        logits_per_image = logit_scale * image_features @ text_features.t()
        logits_per_text = logits_per_image.t()

        return logits_per_image, logits_per_text


In [None]:
# Image encoder class.
# The CLIP paper also used ResNet50, but the ViT encoders ended up performing
# better.

class VisionTransformer(nn.Module):
    """Image encoder that maps an image batch to ``output_dim`` features.

    Despite its name, this simplified version wraps a torchvision ResNet-50
    and adds a linear projection on top of its pooled features.

    Args:
        input_resolution: Stored on the instance; not otherwise used here.
        patch_size: Accepted for a ViT-like signature; unused.
        width: Accepted for a ViT-like signature; unused.
        layers: Accepted for a ViT-like signature; unused.
        output_dim: Size of the returned feature vectors.
        pretrained: When true (the default), load ImageNet weights into the
            backbone. Pass ``False`` to build the encoder without downloading
            weights.
    """

    def __init__(self, input_resolution=224, patch_size=32, width=768, layers=12,
                 output_dim=512, pretrained=True):
        super().__init__()
        self.input_resolution = input_resolution
        self.output_dim = output_dim

        # For simplicity a ResNet is used as the backbone.
        if hasattr(models, "ResNet50_Weights"):
            # torchvision >= 0.13 deprecates the `pretrained` flag in favour of
            # `weights`; IMAGENET1K_V1 is the checkpoint the old flag loaded.
            weights = models.ResNet50_Weights.IMAGENET1K_V1 if pretrained else None
            self.backbone = models.resnet50(weights=weights)
        else:
            # Older torchvision releases only understand the legacy flag.
            self.backbone = models.resnet50(pretrained=pretrained)

        # Read the pooled feature size (2048 for ResNet-50) from the classifier
        # before dropping it, rather than hard-coding it.
        feature_dim = self.backbone.fc.in_features
        self.backbone.fc = nn.Identity()  # remove the classification layer

        self.proj = nn.Linear(feature_dim, output_dim)

    def forward(self, x):
        """Encode ``x`` with the backbone and project it to ``output_dim``.

        Args:
            x: Image batch accepted by torchvision's ResNet-50 - presumably
                ``(batch, 3, H, W)``; confirm against the caller.

        Returns:
            Tensor whose last dimension is ``output_dim``.
        """
        x = self.backbone(x)
        x = self.proj(x)
        return x

In [None]:
# Encoder class for text data

class Transformer(nn.Module):
    """Stack of standard Transformer encoder layers.

    Args:
        width: Model (embedding) dimension of every layer.
        layers: Number of stacked encoder layers.
        heads: Number of attention heads per layer.
        context_length: Accepted for interface compatibility; not used here.
    """

    def __init__(self, width=512, layers=12, heads=8, context_length=77):
        super().__init__()
        self.width, self.layers = width, layers

        # For simplicity, rely on nn.TransformerEncoder instead of a
        # hand-written transformer layer.
        single_block = nn.TransformerEncoderLayer(width, heads)
        self.transformer = nn.TransformerEncoder(single_block, layers)

    def forward(self, x):
        """Run ``x`` through the encoder stack and return the result."""
        encoded = self.transformer(x)
        return encoded

In [None]:
# CLIP loss function
def contrastive_loss(logits_per_image, logits_per_text, labels):
    """Return the mean of the image-side and text-side cross-entropy losses.

    Args:
        logits_per_image: Logits with one row per image.
        logits_per_text: Logits with one row per text.
        labels: Target class index for each row of both logit matrices.

    Returns:
        Scalar tensor: the average of the two cross-entropy terms.
    """
    image_side = F.cross_entropy(logits_per_image, labels)
    text_side = F.cross_entropy(logits_per_text, labels)
    return 0.5 * (image_side + text_side)

In [None]:
# Dataset class for CLIP.
# Building the dataset: a large collection of image-text pairs is required.

class CLIPDataset(Dataset):
    """Dataset of ``(image tensor, token-id tensor)`` pairs.

    Args:
        image_paths: Sequence of image file paths.
        captions: Sequence of caption strings, aligned with ``image_paths``.
        processor: Tokenizer used to turn a caption into token ids, or
            ``None`` to emit all-zero placeholder tokens. It is assumed to
            follow the Hugging Face tokenizer call convention
            (``processor(text, padding=..., truncation=..., max_length=...,
            return_tensors="pt")["input_ids"]``) - TODO confirm against the
            tokenizer actually used.
        transform: Callable applied to the loaded RGB image. Defaults to
            resize / center-crop to 224, tensor conversion and normalization.
        context_length: Length of the returned token sequence.

    Raises:
        ValueError: If ``image_paths`` and ``captions`` differ in length.
    """

    def __init__(self, image_paths, captions, processor, transform=None,
                 context_length=77):
        if len(image_paths) != len(captions):
            raise ValueError(
                f"image_paths and captions must have the same length, "
                f"got {len(image_paths)} and {len(captions)}"
            )
        self.image_paths = image_paths
        self.captions = captions
        self.processor = processor
        self.context_length = context_length
        # Compare against None explicitly so a caller-supplied transform is
        # never replaced just because it happens to be falsy.
        if transform is None:
            transform = transforms.Compose([
                transforms.Resize(224),
                transforms.CenterCrop(224),
                transforms.ToTensor(),
                transforms.Normalize((0.48145466, 0.4578275, 0.40821073),
                                    (0.26862954, 0.26130258, 0.27577711))
            ])
        self.transform = transform

    def __len__(self):
        """Return the number of image/caption pairs."""
        return len(self.image_paths)

    def _tokenize(self, caption):
        """Return a 1-D long tensor of ``context_length`` token ids."""
        if self.processor is None:
            # No tokenizer supplied: keep the placeholder behaviour.
            return torch.zeros(self.context_length, dtype=torch.long)
        encoded = self.processor(
            caption,
            padding="max_length",
            truncation=True,
            max_length=self.context_length,
            return_tensors="pt",
        )
        # Drop the leading batch axis the tokenizer adds for a single string.
        return torch.as_tensor(encoded["input_ids"], dtype=torch.long).reshape(-1)

    def __getitem__(self, idx):
        """Load, transform and tokenize the pair at position ``idx``."""
        # The context manager closes the file handle; convert() returns an
        # independent, fully loaded copy that stays valid afterwards.
        with Image.open(self.image_paths[idx]) as raw_image:
            image = raw_image.convert("RGB")
        image = self.transform(image)

        text = self._tokenize(self.captions[idx])

        return image, text

In [None]:
# Example training function
def train_clip(model, dataloader, optimizer, device, epochs=1):
    """Train ``model`` with the symmetric contrastive loss.

    Args:
        model: Module whose call ``model(images, texts)`` returns
            ``(logits_per_image, logits_per_text)``.
        dataloader: Iterable of ``(images, texts)`` batches. It is re-iterated
            once per epoch and does not need to define ``__len__``.
        optimizer: Optimizer over the model's parameters.
        device: Device the model and every batch are moved to.
        epochs: Number of passes over ``dataloader``.

    Returns:
        List with the mean batch loss of each epoch. An epoch that yields no
        batches contributes ``nan`` instead of raising ``ZeroDivisionError``.
    """
    model.to(device)
    model.train()

    epoch_losses = []
    for epoch in range(epochs):
        total_loss = 0.0
        # Count batches while iterating rather than calling len(dataloader),
        # which fails for iterables without a length and is 0 when empty.
        num_batches = 0
        for images, texts in dataloader:
            images = images.to(device)
            texts = texts.to(device)

            optimizer.zero_grad()

            logits_per_image, logits_per_text = model(images, texts)

            # The i-th image is assumed to pair with the i-th text, so the
            # correct class for row i is i (the diagonal of the logit matrix).
            labels = torch.arange(images.shape[0], device=device)

            loss = contrastive_loss(logits_per_image, logits_per_text, labels)
            loss.backward()
            optimizer.step()

            total_loss += loss.item()
            num_batches += 1

        avg_loss = total_loss / num_batches if num_batches else float("nan")
        epoch_losses.append(avg_loss)
        print(f"Epoch {epoch+1}/{epochs}, Loss: {avg_loss:.4f}")

    return epoch_losses


In [None]:
# Disabled usage example: the whole `main()` sketch below is wrapped in a
# string literal, so nothing in it executes when this cell runs.
'''
# 모델 사용 예시
def main():
    # 모델 생성
    model = CLIP(embed_dim=512)

    # 데이터셋 및 데이터 로더 설정 (더미 데이터)
    image_paths = ["image1.jpg", "image2.jpg"]  # 예시 경로
    captions = ["a dog", "a cat"]  # 예시 캡션

    dataset = CLIPDataset(image_paths, captions, None)
    dataloader = DataLoader(dataset, batch_size=2, shuffle=True)

    # 옵티마이저 설정
    optimizer = torch.optim.Adam(model.parameters(), lr=5e-5)

    # 학습
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    train_clip(model, dataloader, optimizer, device, epochs=5)

    # 모델 저장
    torch.save(model.state_dict(), "clip_model.pt")

if __name__ == "__main__":
    main()
'''

## 데이터셋 로드

In [None]:
# Notebook shell command: install the `datasets` package imported in the next cell.
!pip install datasets



In [None]:
from datasets import load_dataset

# Load the MS COCO caption dataset (2017 version, train split).
# NOTE(review): the recorded output of this cell reports that
# 'imagecaptioning/mscoco_caption' was not found locally or on the Hugging
# Face Hub, so this call fails as written - replace the ID with a dataset
# that actually exists before relying on `coco_dataset`.
coco_dataset = load_dataset("imagecaptioning/mscoco_caption", split="train")

# Inspect the first example.
print(coco_dataset[0])

FileNotFoundError: Couldn't find a dataset script at /content/imagecaptioning/mscoco_caption/mscoco_caption.py or any data file in the same directory. Couldn't find 'imagecaptioning/mscoco_caption' on the Hugging Face Hub either: FileNotFoundError: Dataset 'imagecaptioning/mscoco_caption' doesn't exist on the Hub. If the repo is private or gated, make sure to log in with `huggingface-cli login`.