# CLIP 기반 이미지/텍스트 임베딩 파이프라인 (정리본)

이 노트북은 다음을 수행합니다.

1. CSV 로드 및 전처리 (`description`, `dataset_source`, `product_id`)
2. `dataset_source`별 이미지 경로 매칭 (HNM / FASHION)
3. CLIP(`openai/clip-vit-base-patch32`)로 텍스트/이미지 임베딩 추출 및 L2 정규화
4. 임베딩을 CSV로 저장 (`img_emb`, `wrd_emb`)

> 제출용을 가정하여 **경로/하이퍼파라미터를 상단 Config로 통일**하고, 함수 단위로 정리했습니다.


In [None]:
# (선택) 최초 1회 설치 - 이미 환경에 설치돼 있으면 생략 가능
# !pip install -q transformers torch torchvision pillow tqdm


In [None]:
from __future__ import annotations

import os
from pathlib import Path
from typing import Dict, Optional, List

import numpy as np
import pandas as pd
from PIL import Image
from tqdm.auto import tqdm

import torch
from transformers import CLIPModel, CLIPProcessor

In [None]:
# =========================
# Config (edit only this section)
# =========================
# Input CSV; must provide the `description`, `dataset_source` and `product_id` columns.
CSV_PATH    = Path(r"C:\Users\min\Downloads\csvs\merged_dataset_onehot.csv")

# Root folders that are searched recursively for `*.jpg` files, one per dataset source.
HNM_DIR     = Path(r"C:\Users\min\Downloads\filtered_images")                      # HNM images
FASHION_DIR = Path(r"C:\Users\min\Downloads\archive (13)\fashion-dataset\images")  # FASHION images

# Destination CSV for the rows plus their `img_emb` / `wrd_emb` columns.
OUT_CSV     = Path(r"C:\Users\min\Downloads\before_parquet.csv")

# Hugging Face checkpoint used for both the CLIP model and its processor.
MODEL_NAME  = "openai/clip-vit-base-patch32"
# Default number of texts / images encoded per forward pass.
BATCH_SIZE  = 64
# NOTE(review): not referenced by any cell visible in this notebook — confirm before relying on it.
NUM_WORKERS = 0   # 0 is recommended on Windows/Jupyter

# HNM file-name rule: '0108775015.jpg' -> code = fname[1:7] == '108775'
HNM_CODE_SLICE = slice(1, 7)

In [None]:
# =========================
# Model / Device
# =========================
# Run on the GPU when PyTorch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# Load the pretrained CLIP weights, move them to the chosen device and
# switch the module to evaluation mode.
model = CLIPModel.from_pretrained(MODEL_NAME)
model = model.to(device)
model = model.eval()

# The matching processor prepares raw texts / images as model inputs.
processor = CLIPProcessor.from_pretrained(MODEL_NAME)

def l2_normalize(x: torch.Tensor, eps: float = 1e-12) -> torch.Tensor:
    """Scale *x* so that every vector along the last dimension has unit L2 norm.

    Args:
        x: Tensor whose last dimension holds the vectors to normalize.
        eps: Small constant added to each norm so all-zero vectors do not
            cause a division by zero (they stay all-zero).

    Returns:
        A tensor of the same shape as *x*.
    """
    denom = x.norm(dim=-1, keepdim=True).add(eps)
    return torch.div(x, denom)

# Report which device and checkpoint this session is using.
print(f"Device={device} | Model={MODEL_NAME}")

In [None]:
# =========================
# I/O & Validation Utils
# =========================
# Columns the input CSV must contain, and the dataset sources that are kept.
REQUIRED_COLS = {"description", "dataset_source", "product_id"}
VALID_SOURCES = {"HNM", "FASHION"}


def _normalize_product_id(value) -> str:
    """Return *value* as a stripped string without a trailing ``.0``.

    Missing values become ``""``. The ``.0`` suffix appears when pandas
    parses an integer id column as float.
    """
    text = "" if pd.isna(value) else str(value).strip()
    return text[:-2] if text.endswith(".0") else text


def load_and_filter_csv(csv_path: Path) -> pd.DataFrame:
    """Load the dataset CSV and keep only rows usable for embedding.

    Steps:
      * ``description``: missing values become ``""`` and the column is cast
        to ``str``; rows whose description is empty or whitespace-only are
        dropped (the stored text itself is not stripped).
      * ``dataset_source``: upper-cased and stripped; only values listed in
        ``VALID_SOURCES`` are kept.
      * ``product_id``: converted to a stripped string with a trailing
        ``.0`` removed.

    All columns are normalized on the full frame first and the rows are
    filtered once at the end, so no column is ever assigned on a filtered
    slice (which triggers ``SettingWithCopyWarning`` on pandas versions
    without copy-on-write).

    Args:
        csv_path: Path of the CSV file to read.

    Returns:
        The filtered frame with a fresh 0..n-1 index and the original
        column order.

    Raises:
        ValueError: If any column in ``REQUIRED_COLS`` is missing.
    """
    df = pd.read_csv(csv_path)

    missing = REQUIRED_COLS - set(df.columns)
    if missing:
        raise ValueError(f"CSV에 필요한 컬럼이 없습니다: {sorted(missing)}")

    description = df["description"].fillna("").astype(str)
    source = df["dataset_source"].astype(str).str.upper().str.strip()
    product_id = df["product_id"].map(_normalize_product_id)

    # `assign` returns a new frame and replaces the columns in place,
    # so the column order of the input is preserved.
    df = df.assign(
        description=description,
        dataset_source=source,
        product_id=product_id,
    )

    has_text = description.str.strip().str.len() > 0
    keep = has_text & source.isin(VALID_SOURCES)
    return df.loc[keep].reset_index(drop=True)

# Load and clean the configured CSV, then report how many rows survived.
df = load_and_filter_csv(CSV_PATH)
print("전처리 후 행 수:", len(df))
# Last expression of the cell: previews the first rows in the notebook output.
df.head(3)

In [None]:
# =========================
# Image Index (HNM / FASHION)
# =========================
def build_hnm_index(root_dir: Path, code_slice: Optional[slice] = None) -> Dict[str, str]:
    """Index HNM images by the product code embedded in their file name.

    Every ``*.jpg`` below *root_dir* is visited in sorted path order. The
    code is ``stem[code_slice]`` where ``stem`` is the file name without its
    extension, and the first path seen for a code wins. Sorting makes that
    choice reproducible across runs and machines.

    The length check is made on the stem, not on the full file name, so a
    short name such as ``abc.jpg`` can no longer produce a junk code that
    contains part of the extension.

    Args:
        root_dir: Folder searched recursively for ``*.jpg`` files.
        code_slice: Slice applied to the stem to obtain the code. Defaults
            to the module-level ``HNM_CODE_SLICE``.

    Returns:
        Mapping ``code -> image path`` (paths as strings).
    """
    if code_slice is None:
        code_slice = HNM_CODE_SLICE

    # When the slice has a positive stop, the stem must be long enough to
    # contain the whole slice; otherwise any non-empty code is accepted.
    stop = code_slice.stop
    min_stem_len = stop if isinstance(stop, int) and stop > 0 else 1

    code2path: Dict[str, str] = {}
    for p in sorted(root_dir.rglob("*.jpg")):
        stem = p.stem
        if len(stem) < min_stem_len:
            continue
        code = stem[code_slice]
        if code:
            code2path.setdefault(code, str(p))
    return code2path

def build_fashion_index(root_dir: Path) -> Dict[str, str]:
    """Index FASHION images by file stem (file name without extension == product_id).

    Every ``*.jpg`` below *root_dir* is visited recursively; if two files
    share a stem, the one visited later replaces the earlier entry.

    Returns:
        Mapping ``stem -> image path`` (paths as strings).
    """
    return {jpg.stem: str(jpg) for jpg in root_dir.rglob("*.jpg")}

# Scan the HNM image folder once and report how many distinct codes were indexed.
print("HNM 인덱스 생성 중...")
hnm_idx = build_hnm_index(HNM_DIR)
print("HNM 인덱스 크기:", len(hnm_idx))

# Same for the FASHION folder; its keys are file stems.
print("FASHION 인덱스 생성 중...")
fashion_idx = build_fashion_index(FASHION_DIR)
print("FASHION 인덱스 크기:", len(fashion_idx))

In [None]:
# =========================
# Row-wise image path resolver
# =========================
def resolve_image_path(
    dataset_source: str,
    product_id: str,
    *,
    use_fallback_scan: bool = False,
) -> Optional[str]:
    """Return the image path for one row, or ``None`` when no image is found.

    Args:
        dataset_source: ``"HNM"`` or ``"FASHION"``; any other value yields
            ``None``.
        product_id: Key looked up in the module-level ``hnm_idx`` (HNM) or
            ``fashion_idx`` (FASHION) index.
        use_fallback_scan: HNM only. When the index lookup fails, walk
            ``HNM_DIR`` and return the first ``*.jpg`` (in sorted path order)
            whose file name contains *product_id*. This rescans the folder
            on every miss, so it can be very slow; it is off by default.

    Returns:
        The matched path as a string, or ``None``.
    """
    if dataset_source == "HNM":
        # 1) Direct match: product_id equals the indexed code.
        direct = hnm_idx.get(product_id)
        if direct is not None:
            return direct

        # 2) Optional loose match on the file name. An empty product_id is
        #    skipped because "" is a substring of every file name.
        if use_fallback_scan and product_id:
            for p in sorted(HNM_DIR.rglob("*.jpg")):
                if product_id in p.name:
                    return str(p)
        return None

    if dataset_source == "FASHION":
        return fashion_idx.get(product_id)

    return None

# Resolve an image path for every row; rows without an image get None.
df = df.copy()
df["image_path"] = [
    resolve_image_path(src, pid)
    for src, pid in tqdm(zip(df["dataset_source"].tolist(), df["product_id"].tolist()), total=len(df))
]

# Keep only rows that have an image and report the match rate.
before = len(df)
df = df[df["image_path"].notna()].reset_index(drop=True)
after = len(df)
# Guard the ratio: an empty input frame would otherwise raise ZeroDivisionError.
match_rate = after / before if before else 0.0
print(f"이미지 매칭 성공: {after}/{before} ({match_rate:.1%})")

# Last expression of the cell: previews the matched rows in the notebook output.
df.head(3)[["dataset_source", "product_id", "image_path", "description"]]

In [None]:
# =========================
# CLIP Encoding
# =========================
def encode_texts_clip(texts: List[str], batch_size: int = BATCH_SIZE) -> torch.Tensor:
    """Encode *texts* with the module-level CLIP model into unit-norm embeddings.

    Texts are processed in batches of *batch_size*: the processor tokenizes
    them (with padding and truncation), the model produces text features on
    ``device``, and each feature vector is L2-normalized before being moved
    to the CPU.

    Args:
        texts: Strings to encode.
        batch_size: Number of texts per forward pass; must be positive.

    Returns:
        A CPU tensor with one row per input text, in input order. For an
        empty input this is a ``(0, model.config.projection_dim)`` tensor
        instead of an error from ``torch.cat``.

    Raises:
        ValueError: If *batch_size* is not positive.
    """
    if batch_size <= 0:
        raise ValueError(f"batch_size must be positive, got {batch_size}")
    if len(texts) == 0:
        return torch.empty((0, model.config.projection_dim), dtype=torch.float32)

    outs = []
    for start in tqdm(range(0, len(texts), batch_size), desc="Text batches"):
        batch = texts[start:start + batch_size]
        inputs = processor(text=batch, return_tensors="pt", padding=True, truncation=True).to(device)
        with torch.no_grad():
            feats = l2_normalize(model.get_text_features(**inputs))
        outs.append(feats.cpu())

    # Release cached GPU blocks once at the end; doing it after every batch
    # only forces the allocator to request the same memory again.
    if device == "cuda":
        torch.cuda.empty_cache()
    return torch.cat(outs, dim=0)

def _load_rgb_images(paths: List[str]) -> list:
    """Open each path, convert it to an in-memory RGB image and close the file."""
    images = []
    for path in paths:
        # `convert` returns a new image, so the file opened by `Image.open`
        # must be closed separately; the context manager does that.
        with Image.open(path) as img:
            images.append(img.convert("RGB"))
    return images


def encode_images_clip(paths: List[str], batch_size: int = BATCH_SIZE) -> torch.Tensor:
    """Encode the images at *paths* with the module-level CLIP model.

    Images are processed in batches of *batch_size*: each file is opened and
    converted to RGB, the processor prepares the batch, the model produces
    image features on ``device``, and each feature vector is L2-normalized
    before being moved to the CPU.

    Args:
        paths: Image file paths to encode.
        batch_size: Number of images per forward pass; must be positive.

    Returns:
        A CPU tensor with one row per input path, in input order. For an
        empty input this is a ``(0, model.config.projection_dim)`` tensor
        instead of an error from ``torch.cat``.

    Raises:
        ValueError: If *batch_size* is not positive.
        OSError: Propagated from PIL when a file cannot be opened or decoded.
    """
    if batch_size <= 0:
        raise ValueError(f"batch_size must be positive, got {batch_size}")
    if len(paths) == 0:
        return torch.empty((0, model.config.projection_dim), dtype=torch.float32)

    outs = []
    for start in tqdm(range(0, len(paths), batch_size), desc="Image batches"):
        images = _load_rgb_images(paths[start:start + batch_size])
        try:
            inputs = processor(images=images, return_tensors="pt").to(device)
            with torch.no_grad():
                feats = l2_normalize(model.get_image_features(**inputs))
            outs.append(feats.cpu())
        finally:
            # Drop the decoded pixel buffers even when encoding fails.
            for img in images:
                img.close()

    # Release cached GPU blocks once at the end; doing it after every batch
    # only forces the allocator to request the same memory again.
    if device == "cuda":
        torch.cuda.empty_cache()
    return torch.cat(outs, dim=0)

# Inputs for the two encoders, taken from the same frame so they share row order.
texts = df["description"].tolist()
img_paths = df["image_path"].tolist()

# Encode descriptions and images with CLIP (both functions return CPU tensors).
wrd_emb_t = encode_texts_clip(texts)
img_emb_t = encode_images_clip(img_paths)

# NumPy versions used by the validation and CSV export cells below.
wrd_emb = wrd_emb_t.numpy()
img_emb = img_emb_t.numpy()

print("wrd_emb:", wrd_emb.shape, "img_emb:", img_emb.shape)

In [None]:
# (Optional) quick sanity check: cosine similarity between the text and the
# image embedding of the same row. Both sets of embeddings were L2-normalized
# by the encoders, so the row-wise dot product is the cosine similarity.
pair_cos = (wrd_emb * img_emb).sum(axis=1)
print("pair cosine mean:", float(np.mean(pair_cos)))
print("pair cosine median:", float(np.median(pair_cos)))

In [None]:
# =========================
# Save to CSV
# =========================
def vec_to_str(v: np.ndarray) -> str:
    """Serialize a 1-D vector as comma-separated values with six decimals.

    An empty vector yields the empty string.
    """
    parts = []
    for value in v.tolist():
        parts.append(f"{value:.6f}")
    return ",".join(parts)

# Work on a copy so `df` itself does not gain the embedding columns.
df_out = df.copy()
# Each embedding row is stored as one comma-separated string per cell.
df_out["img_emb"] = [vec_to_str(v) for v in img_emb]
df_out["wrd_emb"] = [vec_to_str(v) for v in wrd_emb]

# Create the output folder if needed, then write the CSV.
# "utf-8-sig" prepends a UTF-8 byte-order mark to the file.
OUT_CSV.parent.mkdir(parents=True, exist_ok=True)
df_out.to_csv(OUT_CSV, index=False, encoding="utf-8-sig")

print(f"CSV 저장 완료: {OUT_CSV}")
print("열:", list(df_out.columns))
# Last expression of the cell: previews the saved rows in the notebook output.
df_out.head(2)[["dataset_source","product_id","image_path"]]