# WinCLIP 결과 후처리 (winclip_llm_dataset.json -> jsonl)

- 입력: winclip_llm_dataset.json (list of records)
- 출력: outputs/jo/winclip_postprocessed.jsonl

이 노트북은 hotspot_coords(예: "(7,7)")를 파싱해서
- bbox(rmin,cmin,rmax,cmax)
- 위치 라벨(상/중/하 × 좌/중/우)
- LLM 입력에 쓰기 좋은 정규화 레코드
를 생성합니다.

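(Code) — 경로/출력 설정 + 프로젝트 루트 고정 (아래 셀은 Google Drive 바로가기 폴더에서 MMAD 루트를 찾아 MMAD_ROOT로 고정합니다. 이후 셀에서 사용하는 WINCLIP_JSON, OUT_JSONL은 이 셀에서 정의되지 않으므로 실행 전에 별도로 정의해야 합니다.)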

In [15]:
from pathlib import Path

# Folder under which Google Drive resolves shared-drive shortcuts.
SHORTCUT_ROOT = Path(r"G:\.shortcut-targets-by-id")

# Stop right away when the drive is not mounted; nothing below can work.
if not SHORTCUT_ROOT.exists():
    raise FileNotFoundError(f"{SHORTCUT_ROOT} 가 없습니다. Google Drive가 G:로 마운트되어 있는지 확인하세요.")

# Every shortcut target that contains an "MMAD" entry is a candidate root.
cands = [
    mmad
    for mmad in (target / "MMAD" for target in SHORTCUT_ROOT.iterdir())
    if mmad.exists()
]

print("found MMAD roots:", len(cands))
for cand in cands[:10]:
    print(" -", cand)

# Usually exactly one candidate is found. With several candidates the first
# one is still taken (edit here to pick another); None marks "nothing found".
MMAD_ROOT = cands[0] if cands else None

if MMAD_ROOT is None:
    raise FileNotFoundError("MMAD 폴더를 shortcut-targets-by-id 아래에서 찾지 못했습니다.")

print("MMAD_ROOT =", MMAD_ROOT)

found MMAD roots: 1
 - G:\.shortcut-targets-by-id\1MOlF1Xwaw_0p4R5EY_HXDRgLg0it43Jy\MMAD
MMAD_ROOT = G:\.shortcut-targets-by-id\1MOlF1Xwaw_0p4R5EY_HXDRgLg0it43Jy\MMAD


In [16]:
# Dataset folder expected directly under the MMAD root.
DS = MMAD_ROOT.joinpath("DS-MVTec")
print("DS =", DS)
print("DS exists =", DS.exists())

if not DS.exists():
    # List part of the MMAD folder so the real dataset folder name can be spotted.
    print("MMAD_ROOT contents sample:")
    listing = sorted(MMAD_ROOT.iterdir())
    for child in listing[:30]:
        print(" -", child.name)

DS = G:\.shortcut-targets-by-id\1MOlF1Xwaw_0p4R5EY_HXDRgLg0it43Jy\MMAD\DS-MVTec
DS exists = True


파서/후처리 함수들

In [12]:
import re  # no cell visible in this notebook imports it; harmless if one already did

# Hotspot coordinates seen in the data run from 0 to 7, hence an 8x8 grid.
GRID = 8
# Matches "(r,c)": two non-negative integers, optional whitespace around the comma.
COORD_RE = re.compile(r"\((\d+)\s*,\s*(\d+)\)")

def parse_coord(s: str) -> tuple[int, int]:
    """Convert an "(r,c)" string into an (r, c) tuple of ints.

    Args:
        s: Coordinate text such as "(7,7)". Leading and trailing whitespace
            is ignored.

    Returns:
        The (row, column) pair.

    Raises:
        ValueError: If s is not a string, does not match the "(r,c)" format,
            or either index lies outside 0..GRID-1.
    """
    # Reject non-strings with the same error as any other malformed value
    # instead of failing with AttributeError on .strip().
    if not isinstance(s, str):
        raise ValueError(f"Bad coord format: {s!r}")
    m = COORD_RE.fullmatch(s.strip())
    if not m:
        raise ValueError(f"Bad coord format: {s!r}")
    r, c = int(m.group(1)), int(m.group(2))
    if not (0 <= r < GRID and 0 <= c < GRID):
        raise ValueError(f"Out of range coord: {s!r}")
    return r, c

def coords_to_bbox(coords: list[tuple[int,int]]):
    """Compute the bounding box of a list of (row, col) coordinates.

    Returns a dict with keys "rmin", "cmin", "rmax", "cmax" (inclusive
    bounds), or None when coords is empty.
    """
    if not coords:
        return None

    # Split the pairs into one list of row indices and one of column indices.
    row_idx, col_idx = [], []
    for row, col in coords:
        row_idx.append(row)
        col_idx.append(col)

    return dict(
        rmin=min(row_idx),
        cmin=min(col_idx),
        rmax=max(row_idx),
        cmax=max(col_idx),
    )

def bbox_to_region_kr(bbox):
    """Label the center of a bbox with one of nine regions of the grid.

    The label is "<row band> <column band>", where the row band is one of
    "상단"/"중단"/"하단" and the column band one of "좌측"/"중앙"/"우측".

    Args:
        bbox: Dict with "rmin", "cmin", "rmax", "cmax" on the 0..7 grid,
            or None.

    Returns:
        The region label, or None when bbox is None.
    """
    if bbox is None:
        return None

    r_center = (bbox["rmin"] + bbox["rmax"]) / 2
    c_center = (bbox["cmin"] + bbox["cmax"]) / 2

    def bin3(x):
        # Integer cells: 0-2 -> 0, 3-4 -> 1, 5-7 -> 2.
        # A center can also be a half-integer. 2.5 and 4.5 are mirror images
        # about the grid center 3.5, so both must land in the middle band;
        # comparing against 2 and 5 keeps the bands symmetric.
        if x <= 2:
            return 0
        if x >= 5:
            return 2
        return 1

    rbin, cbin = bin3(r_center), bin3(c_center)
    rows = ["상단", "중단", "하단"]
    cols = ["좌측", "중앙", "우측"]
    return f"{rows[rbin]} {cols[cbin]}"

def parse_status_from_output(output: str):
    """Extract "normal" or "anomaly" from the output sentence of a record.

    The text is searched for " is normal" first and " is anomaly" second;
    the label of the first marker found is returned. Returns None when the
    text is empty/None or contains neither marker.
    """
    if not output:
        return None

    # Order matters: " is normal" takes precedence when both markers occur.
    markers = (
        (" is normal", "normal"),
        (" is anomaly", "anomaly"),
    )
    for marker, label in markers:
        if marker in output:
            return label
    return None

def normalize_record(rec: dict) -> dict:
    """Convert one raw record into the normalized record.

    Args:
        rec: Raw record. The keys read are "input" (a dict from which
            "category", "image_name", "peak_score" and "hotspot_coords" are
            taken), "output" and "instruction". Missing keys yield None
            (or an empty hotspot list) in the result.

    Returns:
        A dict holding the source model name, category, image name, peak
        score as float (or None), the parsed hotspot coordinates, their
        bounding box, the region label, the status parsed from the output
        text, and the raw instruction/output for debugging.

    Raises:
        ValueError: If a hotspot coordinate is rejected by parse_coord, or
            peak_score is text that float() cannot convert.
    """
    # "input" may be absent or explicitly null; both are treated as an empty
    # dict, the same way a null "hotspot_coords" is treated as an empty list.
    inp = rec.get("input") or {}
    coords_raw = inp.get("hotspot_coords", []) or []
    coords = [parse_coord(x) for x in coords_raw]

    bbox = coords_to_bbox(coords)
    region_kr = bbox_to_region_kr(bbox)
    status = parse_status_from_output(rec.get("output", ""))

    # peak_score may arrive as a string, so force it to float.
    peak = inp.get("peak_score", None)
    peak_score = float(peak) if peak is not None else None

    return {
        "source_model": "winclip",
        "category": inp.get("category"),
        "image_name": inp.get("image_name"),
        "peak_score": peak_score,

        # Post-processing results
        "hotspots_rc": coords,
        "hotspot_bbox_rc": bbox,
        "hotspot_region_kr": region_kr,

        # Status extracted from the output text (None when absent)
        "status_from_output": status,

        # Original fields kept for debugging
        "raw_instruction": rec.get("instruction"),
        "raw_output": rec.get("output"),
    }

실행: JSON 로드 → 정규화 → JSONL 저장

In [13]:
import json  # no cell visible in this notebook imports it; harmless if one already did

# Load the whole dataset; it is iterated below as a sequence of records.
data = json.loads(WINCLIP_JSON.read_text(encoding="utf-8"))

# Create the output folder up front so opening the file for writing does not
# fail when the folder does not exist yet.
OUT_JSONL.parent.mkdir(parents=True, exist_ok=True)

count = 0
with OUT_JSONL.open("w", encoding="utf-8") as f:
    for rec in data:
        norm = normalize_record(rec)
        # One JSON object per line; ensure_ascii=False writes the Korean
        # region labels as-is instead of \u escapes.
        f.write(json.dumps(norm, ensure_ascii=False) + "\n")
        count += 1

print(f"saved: {OUT_JSONL}")
print(f"n = {count}")

# Print one normalized sample. Skipped for an empty dataset so that data[0]
# cannot raise IndexError after the file was already written.
if data:
    print("\n--- example ---")
    print(json.dumps(normalize_record(data[0]), ensure_ascii=False, indent=2))

saved: c:\Users\yeony\multimodal-anomaly-report-generation\outputs\jo\winclip_postprocessed.jsonl
n = 1725

--- example ---
{
  "source_model": "winclip",
  "category": "bottle",
  "image_name": "000.png",
  "peak_score": 0.3988,
  "hotspots_rc": [],
  "hotspot_bbox_rc": null,
  "hotspot_region_kr": null,
  "status_from_output": "anomaly",
  "raw_instruction": "Identify the status and defect locations of the bottle image.",
  "raw_output": "The bottle is anomaly. No defects detected."
}


DS-MVTec 이미지 경로 구조 자동 탐지

In [20]:
# Use the first record as a probe for how images are laid out on disk.
sample = data[0]["input"]
cat, name = sample["category"], sample["image_name"]

print("sample:", sample)
cat_dir = DS.joinpath(cat)
print("cat_dir exists:", cat_dir.exists(), cat_dir)

# Recursive search by file name; skipped when the category folder is absent.
if cat_dir.exists():
    matches = list(cat_dir.rglob(name))
else:
    matches = []
print("matches in DS/cat:", len(matches))
print("first 5:", matches[:5])

sample: {'category': 'bottle', 'image_name': '000.png', 'peak_score': 0.3988, 'hotspot_coords': []}
cat_dir exists: True G:\.shortcut-targets-by-id\1MOlF1Xwaw_0p4R5EY_HXDRgLg0it43Jy\MMAD\DS-MVTec\bottle
matches in DS/cat: 4
first 5: [WindowsPath('G:/.shortcut-targets-by-id/1MOlF1Xwaw_0p4R5EY_HXDRgLg0it43Jy/MMAD/DS-MVTec/bottle/image/broken_large/000.png'), WindowsPath('G:/.shortcut-targets-by-id/1MOlF1Xwaw_0p4R5EY_HXDRgLg0it43Jy/MMAD/DS-MVTec/bottle/image/good/000.png'), WindowsPath('G:/.shortcut-targets-by-id/1MOlF1Xwaw_0p4R5EY_HXDRgLg0it43Jy/MMAD/DS-MVTec/bottle/image/broken_small/000.png'), WindowsPath('G:/.shortcut-targets-by-id/1MOlF1Xwaw_0p4R5EY_HXDRgLg0it43Jy/MMAD/DS-MVTec/bottle/image/contamination/000.png')]


In [21]:
MMAD = MMAD_ROOT  # reuse the MMAD root resolved earlier

sample = data[0]["input"]
cat, name = sample["category"], sample["image_name"]

# Search the whole MMAD tree by file name and keep only the paths that have
# the category as one of their components.
all_matches = list(filter(lambda path: cat in path.parts, MMAD.rglob(name)))
print("matches under MMAD containing category in path:", len(all_matches))
print("first 10:", all_matches[:10])

matches under MMAD containing category in path: 10
first 10: [WindowsPath('G:/.shortcut-targets-by-id/1MOlF1Xwaw_0p4R5EY_HXDRgLg0it43Jy/MMAD/DS-MVTec/bottle/image/broken_large/000.png'), WindowsPath('G:/.shortcut-targets-by-id/1MOlF1Xwaw_0p4R5EY_HXDRgLg0it43Jy/MMAD/DS-MVTec/bottle/image/good/000.png'), WindowsPath('G:/.shortcut-targets-by-id/1MOlF1Xwaw_0p4R5EY_HXDRgLg0it43Jy/MMAD/DS-MVTec/bottle/image/broken_small/000.png'), WindowsPath('G:/.shortcut-targets-by-id/1MOlF1Xwaw_0p4R5EY_HXDRgLg0it43Jy/MMAD/DS-MVTec/bottle/image/contamination/000.png'), WindowsPath('G:/.shortcut-targets-by-id/1MOlF1Xwaw_0p4R5EY_HXDRgLg0it43Jy/MMAD/MVTec-AD/bottle/test/broken_small/000.png'), WindowsPath('G:/.shortcut-targets-by-id/1MOlF1Xwaw_0p4R5EY_HXDRgLg0it43Jy/MMAD/MVTec-AD/bottle/test/broken_large/000.png'), WindowsPath('G:/.shortcut-targets-by-id/1MOlF1Xwaw_0p4R5EY_HXDRgLg0it43Jy/MMAD/MVTec-AD/bottle/test/good/000.png'), WindowsPath('G:/.shortcut-targets-by-id/1MOlF1Xwaw_0p4R5EY_HXDRgLg0it43Jy/MMAD/MV

In [17]:
from collections import Counter
from pathlib import Path

# (Important) assumes MMAD_ROOT was already resolved by the first cell.
DS = MMAD_ROOT / "DS-MVTec"

if DS.exists():
    # Layouts to probe: (label, function building the expected image path).
    candidates = [
        ("DS/cat/name", lambda cat, name: DS.joinpath(cat, name)),
        ("DS/cat/test/name", lambda cat, name: DS.joinpath(cat, "test", name)),
        ("DS/cat/images/name", lambda cat, name: DS.joinpath(cat, "images", name)),
        ("DS/cat/test/good/name", lambda cat, name: DS.joinpath(cat, "test", "good", name)),
    ]

    # Only the first 100 records (or fewer) are checked.
    N = min(100, len(data))
    hit = Counter()

    for rec in data[:N]:
        fields = rec["input"]
        cat, name = fields["category"], fields["image_name"]
        # Count every layout under which this record's image file exists.
        hit.update(key for key, fn in candidates if fn(cat, name).exists())

    print("checked:", N)
    for key, _ in candidates:
        print(f"{key:20s} : {hit[key]}/{N}")
else:
    print("DS-MVTec 폴더가 없습니다:", DS)

checked: 100
DS/cat/name          : 0/100
DS/cat/test/name     : 0/100
DS/cat/images/name   : 0/100
DS/cat/test/good/name : 0/100
