# Embed & Cluster (CLIP + KMeans)

제출/재현성을 위해 **경로·하이퍼파라미터를 상단 Config로 통일**하고, 임베딩/클러스터링/저장을 함수로 분리한 정리본입니다.

- 입력: 이미지 폴더(기본: `filtered_images`)
- 출력: `outputs/` 폴더에 `cluster_results.csv`, `embeddings.npy`, `umap_2d.npy` 저장


In [None]:
# ===== 0) Config =====
# Every path and hyper-parameter used by the notebook is declared here so a
# run can be reproduced by editing this single cell.
from pathlib import Path

# --- Input -----------------------------------------------------------------
# Folder that holds the images to embed (Windows-style example path).
IMAGE_DIR = Path(r"C:\Users\min\Downloads\filtered_images")

# File suffixes accepted as images, and whether sub-folders are searched too.
IMAGE_EXTS = {".jpg", ".jpeg", ".png", ".webp"}
RECURSIVE = False

# --- Output ----------------------------------------------------------------
# All artefacts are written below OUT_DIR.
OUT_DIR = Path("./outputs")
OUT_CSV = OUT_DIR.joinpath("cluster_results.csv")   # image id -> cluster table
OUT_EMB = OUT_DIR.joinpath("embeddings.npy")        # raw CLIP embeddings
OUT_UMAP = OUT_DIR.joinpath("umap_2d.npy")          # 2-D UMAP projection

# --- Model / compute -------------------------------------------------------
MODEL_NAME = "ViT-B/32"
BATCH_SIZE = 128

# --- Clustering ------------------------------------------------------------
N_CLUSTERS = 21
RANDOM_STATE = 42

# --- Elbow method (optional) -----------------------------------------------
# When RUN_ELBOW is true, inertia is computed for every k in K_RANGE.
RUN_ELBOW = True
K_RANGE = range(10, 31)


In [None]:
# ===== 1) Imports =====
import os
import numpy as np
import pandas as pd
from PIL import Image
from tqdm.auto import tqdm

import torch
import clip

from sklearn.cluster import KMeans
import umap
import matplotlib.pyplot as plt


In [None]:
# ===== 2) Utilities =====
def get_device() -> str:
    """Return ``"cuda"`` when PyTorch reports CUDA as available, else ``"cpu"``."""
    if torch.cuda.is_available():
        return "cuda"
    return "cpu"

def list_images(image_dir, exts, recursive=False):
    """Return the sorted image files found in *image_dir*.

    Args:
        image_dir: Folder to scan (``str`` or ``Path``).
        exts: Collection of file suffixes including the leading dot, e.g.
            ``{".jpg", ".png"}``. Matching is case-insensitive.
        recursive: When true, sub-folders are scanned as well.

    Returns:
        A sorted list of ``Path`` objects, one per matching regular file.

    Raises:
        FileNotFoundError: If *image_dir* does not exist.
        ValueError: If no file with one of the given suffixes is found.
    """
    image_dir = Path(image_dir)
    if not image_dir.exists():
        raise FileNotFoundError(f"IMAGE_DIR not found: {image_dir}")

    # Suffixes are lower-cased before comparison, so normalise the wanted set
    # the same way; otherwise an entry such as ".JPG" could never match.
    wanted = {ext.lower() for ext in exts}

    candidates = image_dir.rglob("*") if recursive else image_dir.glob("*")
    # is_file() drops directories whose name merely looks like an image
    # (e.g. a folder called "backup.jpg"), which would fail to load later.
    paths = sorted(
        p for p in candidates
        if p.is_file() and p.suffix.lower() in wanted
    )

    if not paths:
        raise ValueError(f"No images found in {image_dir} with extensions: {sorted(exts)}")
    return paths

def load_clip(model_name, device):
    """Load a CLIP model together with its image preprocessing transform.

    The model is loaded with ``jit=False`` onto *device* and put into eval
    mode before being returned.

    Returns:
        ``(model, preprocess)`` as produced by ``clip.load``.
    """
    loaded = clip.load(model_name, device=device, jit=False)
    clip_model, transform = loaded
    clip_model.eval()
    return clip_model, transform

def encode_images(paths, model, preprocess, device, batch_size=128):
    """Encode images into L2-normalised embeddings, one batch at a time.

    Images that cannot be opened or preprocessed are reported with a
    ``[WARN]`` line and skipped, so the result may contain fewer rows than
    *paths*.

    Args:
        paths: Sequence of image paths (``Path`` or ``str``).
        model: Object exposing ``encode_image(batch_tensor)``.
        preprocess: Callable turning a PIL image into a tensor.
        device: Torch device the batch tensor is moved to.
        batch_size: Number of paths handled per forward pass.

    Returns:
        ``(emb, ids)`` where ``emb`` is the row-wise stack of all batch
        embeddings as a NumPy array and ``ids`` is the list of file names
        (name only, no folder), aligned with the rows of ``emb``.

    Raises:
        RuntimeError: If not a single image could be encoded.
    """
    all_emb = []
    all_ids = []

    with torch.no_grad():
        for start in tqdm(range(0, len(paths), batch_size), desc="Encoding images"):
            batch_imgs = []
            batch_ids = []

            for p in paths[start:start + batch_size]:
                try:
                    # The context manager releases the file handle right away
                    # instead of leaving one open per image until GC.
                    with Image.open(p) as img:
                        rgb = img if img.mode == "RGB" else img.convert("RGB")
                        tensor = preprocess(rgb)
                    image_id = Path(p).name
                except Exception as e:
                    # Best-effort: one unreadable file must not abort the run.
                    print(f"[WARN] Failed to load {p}: {e}")
                    continue

                # Append tensor and id together, only after everything
                # succeeded, so rows and ids can never get out of step.
                batch_imgs.append(tensor)
                batch_ids.append(image_id)

            if not batch_imgs:
                continue

            batch_tensor = torch.stack(batch_imgs).to(device)
            # NOTE(review): the model may return half-precision features
            # (presumably the case on CUDA — confirm); normalise in float32
            # so precision does not depend on the device.
            feats = model.encode_image(batch_tensor).float()
            feats = feats / feats.norm(dim=-1, keepdim=True)

            all_emb.append(feats.cpu().numpy())
            all_ids.extend(batch_ids)

    if not all_emb:
        raise RuntimeError("No embeddings were generated. Check image loading errors above.")
    emb = np.vstack(all_emb)
    return emb, all_ids

def run_elbow(embeddings, k_range, random_state=42):
    """Fit KMeans once per candidate k and collect the resulting inertia.

    Args:
        embeddings: Data passed to ``KMeans.fit``.
        k_range: Iterable of cluster counts to try.
        random_state: Seed forwarded to every KMeans instance.

    Returns:
        List of inertia values, in the same order as *k_range*.
    """
    candidates = list(k_range)
    return [
        KMeans(n_clusters=n_clusters, n_init=10, random_state=random_state)
        .fit(embeddings)
        .inertia_
        for n_clusters in tqdm(candidates, desc="Elbow (inertia)")
    ]

def cluster_kmeans(embeddings, n_clusters, random_state=42):
    """Cluster *embeddings* with KMeans and return the label of each row.

    Args:
        embeddings: Data passed to ``KMeans.fit_predict``.
        n_clusters: Number of clusters to form.
        random_state: Seed forwarded to KMeans.
    """
    clusterer = KMeans(
        n_clusters=n_clusters,
        n_init=10,
        random_state=random_state,
    )
    return clusterer.fit_predict(embeddings)

def reduce_umap(embeddings, random_state=42, n_neighbors=15, min_dist=0.1):
    """Project *embeddings* to two dimensions with UMAP.

    Args:
        embeddings: Data passed to ``UMAP.fit_transform``.
        random_state: Seed forwarded to UMAP.
        n_neighbors: UMAP ``n_neighbors`` setting.
        min_dist: UMAP ``min_dist`` setting.

    Returns:
        The result of ``fit_transform`` with ``n_components=2``.
    """
    umap_options = {
        "n_components": 2,
        "random_state": random_state,
        "n_neighbors": n_neighbors,
        "min_dist": min_dist,
    }
    return umap.UMAP(**umap_options).fit_transform(embeddings)

def plot_umap(emb2d, labels, title):
    """Show a scatter plot of the 2-D projection coloured by cluster label.

    Args:
        emb2d: Array whose first two columns are used as x and y.
        labels: Per-point values used for colouring.
        title: Figure title.
    """
    fig, ax = plt.subplots(figsize=(12, 10))

    xs, ys = emb2d[:, 0], emb2d[:, 1]
    points = ax.scatter(xs, ys, c=labels, s=10, alpha=0.7)

    ax.set_title(title, fontsize=14)
    ax.set_xlabel("UMAP 1")
    ax.set_ylabel("UMAP 2")

    # The colour bar maps colours back to cluster ids.
    fig.colorbar(points, ax=ax).set_label("Cluster ID")

    ax.grid(True, linestyle="--", alpha=0.4)
    plt.show()


In [None]:
# ===== 3) Run pipeline =====
# End-to-end run: list images -> CLIP embeddings -> (elbow) -> KMeans ->
# UMAP plot -> CSV. Everything is driven by the constants of the Config cell.
device = get_device()
print("Device:", device)
print("IMAGE_DIR:", IMAGE_DIR)

# Make sure the output folder exists before anything is written.
OUT_DIR.mkdir(parents=True, exist_ok=True)

# Step 1 - collect the image files.
paths = list_images(IMAGE_DIR, IMAGE_EXTS, recursive=RECURSIVE)
print("Num images:", len(paths))

# Step 2 - load CLIP and its preprocessing transform.
model, preprocess = load_clip(MODEL_NAME, device=device)
print("Loaded CLIP:", MODEL_NAME)

# Step 3 - embed every image; image_ids stays aligned with the rows.
embeddings, image_ids = encode_images(
    paths,
    model,
    preprocess,
    device=device,
    batch_size=BATCH_SIZE,
)
print("Embeddings:", embeddings.shape)

# Persist the raw embeddings so later steps can be reproduced.
np.save(OUT_EMB, embeddings)

# Step 4 - optional elbow curve over K_RANGE.
if RUN_ELBOW:
    inertia_values = run_elbow(embeddings, K_RANGE, random_state=RANDOM_STATE)
    k_values = list(K_RANGE)

    elbow_fig, elbow_ax = plt.subplots(figsize=(10, 6))
    elbow_ax.plot(k_values, inertia_values, marker="o", linestyle="--")
    elbow_ax.set_title("Elbow Method (Inertia)")
    elbow_ax.set_xlabel("Number of Clusters (k)")
    elbow_ax.set_ylabel("Inertia")
    elbow_ax.set_xticks(k_values)
    elbow_ax.grid(True, linestyle="--", alpha=0.5)
    plt.show()

# Step 5 - final clustering with the configured k.
labels = cluster_kmeans(embeddings, N_CLUSTERS, random_state=RANDOM_STATE)
print("KMeans done. k =", N_CLUSTERS)

# Step 6 - 2-D projection, saved and plotted.
emb2d = reduce_umap(embeddings, random_state=RANDOM_STATE)
np.save(OUT_UMAP, emb2d)

plot_umap(
    emb2d,
    labels,
    title=f"UMAP (CLIP {MODEL_NAME}) + KMeans (k={N_CLUSTERS})",
)

# Step 7 - write the image -> cluster table; utf-8-sig adds a BOM to the CSV.
df = pd.DataFrame({"image_id": image_ids, "cluster": labels})
df.to_csv(OUT_CSV, index=False, encoding="utf-8-sig")
print("Saved:", OUT_CSV.resolve())
df.head()
