# Dataset 재구성

Taxonomy 및 train/val/test split 재생성

**원본 스크립트**: `aws_icon_dataset_rebuild.sh`


## 개요
AWS 아이콘 taxonomy + train/val/test split 재생성

### 사용법
```bash
./aws_icon_dataset_rebuild.sh ./dataset/icons
```

### 인자
- `$1`: CSV 파일이 들어있는 디렉터리 (기본값: ./dataset/icons)

### 동작
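labels_fine.csv(및 labels_coarse.csv)를 읽어 taxonomy_coarse.csv / taxonomy_fine.csv, stage1_classes.txt / stage2_classes.txt, train/val/test_fine.csv, train_val_test_split.json을 다시 생성합니다. 기존 train/val/test_fine.csv는 읽지 않고 새 split으로 덮어씁니다.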


In [None]:
# Strict mode: abort on any failing command, on unset variables, and on
# failures inside pipelines.
set -euo pipefail

# Target directory: first positional argument, falling back to ./dataset/icons.
# Exported so that processes started from this shell see it in their environment.
export DATA_DIR="${1:-./dataset/icons}"

# Guard: refuse to continue when the target directory does not exist.
[ -d "$DATA_DIR" ] || {
  echo "ERROR: DATA_DIR '$DATA_DIR' 디렉터리가 없습니다." >&2
  exit 1
}

echo "[INFO] DATA_DIR = $DATA_DIR"
echo "[INFO] 기존 labels_coarse.csv / labels_fine.csv / train/val/test_fine.csv 기준으로 재계산을 수행합니다."


In [None]:
import os
import json
from collections import Counter
from pathlib import Path

import pandas as pd
from sklearn.model_selection import train_test_split

# ----- Settings -----
# Directory that holds the label CSVs and receives every generated file.
# Read from the DATA_DIR environment variable; when it is unset or empty,
# fall back to ./dataset/icons (the same default the shell wrapper applies)
# instead of failing with a bare KeyError.
DATA_DIR = os.environ.get("DATA_DIR") or "./dataset/icons"
data_path = Path(DATA_DIR)

# ----- Automatic lookup of the label CSVs (safety net) -----
def find_csv(root: Path, name: str):
    """Return the path of the file called *name* under *root*, or None.

    The search is recursive. When several files share the name, the one
    closest to *root* wins, with ties broken by lexicographic path order,
    so the result does not depend on filesystem traversal order and a
    nested copy never shadows a shallower one.

    Args:
        root: Directory to search.
        name: Exact file name to look for.

    Returns:
        The selected ``Path``, or ``None`` when no regular file matches.
    """
    # Only regular files qualify; a directory that happens to carry the
    # same name must not be handed to the CSV reader.
    candidates = (p for p in root.rglob(name) if p.is_file())
    return min(candidates, key=lambda p: (len(p.parts), str(p)), default=None)

# Locate the fine-grained label file; nothing below can run without it.
labels_fine_path = find_csv(data_path, "labels_fine.csv")
if labels_fine_path is None:
    raise FileNotFoundError("labels_fine.csv not found anywhere under DATA_DIR")

# Locate the coarse label file under the same rule.
labels_coarse_path = find_csv(data_path, "labels_coarse.csv")
if labels_coarse_path is None:
    raise FileNotFoundError("labels_coarse.csv not found anywhere under DATA_DIR")

# Report which files were resolved before reading them.
print(f"[INFO] Using labels_fine.csv: {labels_fine_path}")
print(f"[INFO] Using labels_coarse.csv: {labels_coarse_path}")

# Load both tables into DataFrames.
print(f"[INFO] Load: {labels_fine_path}")
df_fine = pd.read_csv(labels_fine_path)

print(f"[INFO] Load: {labels_coarse_path}")
df_coarse = pd.read_csv(labels_coarse_path)

# ============================================
# 1. Rebuild the taxonomy (coarse / fine)
# ============================================

# Coarse taxonomy: number of rows in df_fine per coarse class, ordered by
# class label.
coarse_counts = df_fine["coarse_class"].value_counts().sort_index()
coarse_list = coarse_counts.index.tolist()

df_tax_coarse = pd.DataFrame(
    {
        "coarse_class": coarse_counts.index,
        "num_icons_fine": coarse_counts.values,
    }
)
coarse_out = data_path / "taxonomy_coarse.csv"
df_tax_coarse.to_csv(coarse_out, index=False)
print(f"[OK] taxonomy_coarse.csv 저장: {coarse_out} (rows={len(df_tax_coarse)})")

# Fine taxonomy: one row per (coarse_class, canonical_service_name) pair with
# the count of file_path entries, sorted by the same two keys.
taxonomy_keys = ["coarse_class", "canonical_service_name"]
fine_group = (
    df_fine.groupby(taxonomy_keys, as_index=False)
    .agg(num_icons=("file_path", "count"))
    .sort_values(taxonomy_keys)
    .reset_index(drop=True)
)
fine_out = data_path / "taxonomy_fine.csv"
fine_group.to_csv(fine_out, index=False)
print(f"[OK] taxonomy_fine.csv 저장: {fine_out} (rows={len(fine_group)})")

# Plain-text class lists for stage 1 / stage 2 (used for labelling / YOLO
# names): one class per line, no trailing newline.
fine_list = list(fine_group["canonical_service_name"].unique())
stage1_path = data_path / "stage1_classes.txt"
stage2_path = data_path / "stage2_classes.txt"

for class_file, class_names in ((stage1_path, coarse_list), (stage2_path, fine_list)):
    class_file.write_text("\n".join(class_names), encoding="utf-8")

print(f"[OK] stage1_classes.txt 저장: {stage1_path} (coarse {len(coarse_list)}개)")
print(f"[OK] stage2_classes.txt 저장: {stage2_path} (fine {len(fine_list)}개)")

# ============================================
# 2. Regenerate the train/val/test split (stratified by coarse_class)
#    - default ratios: 0.7 / 0.15 / 0.15
# ============================================

N = len(df_fine)
train_ratio = 0.7
val_ratio = 0.15
test_ratio = 0.15

# The three ratios must cover the whole dataset; catch a bad edit early.
if abs(train_ratio + val_ratio + test_ratio - 1.0) > 1e-9:
    raise ValueError("train/val/test ratios must sum to 1.0")

# Convert the ratios to exact row counts before splitting. Passing float
# sizes is fragile: 1.0 - 0.7 evaluates to 0.30000000000000004, and
# train_test_split rounds a float test_size up, so e.g. N=1000 would come out
# as 699/150/151 instead of 700/150/150. Integer sizes avoid that drift.
n_train = round(N * train_ratio)
n_val = round(N * val_ratio)
n_test = N - n_train - n_val  # remainder, so the three counts always sum to N

# 1st: train vs (val+test)
df_train, df_tmp = train_test_split(
    df_fine,
    train_size=n_train,
    test_size=n_val + n_test,
    stratify=df_fine["coarse_class"],
    random_state=42,
)

# 2nd: (val+test) -> val / test
df_val, df_test = train_test_split(
    df_tmp,
    train_size=n_val,
    test_size=n_test,
    stratify=df_tmp["coarse_class"],
    random_state=42,
)

# Sort each split by file_path (optional; keeps the written CSVs ordered).
df_train = df_train.sort_values("file_path").reset_index(drop=True)
df_val = df_val.sort_values("file_path").reset_index(drop=True)
df_test = df_test.sort_values("file_path").reset_index(drop=True)

train_out = data_path / "train_fine.csv"
val_out = data_path / "val_fine.csv"
test_out = data_path / "test_fine.csv"

df_train.to_csv(train_out, index=False)
df_val.to_csv(val_out, index=False)
df_test.to_csv(test_out, index=False)

print(f"[OK] train_fine.csv 저장: {train_out} (rows={len(df_train)})")
print(f"[OK] val_fine.csv 저장:   {val_out} (rows={len(df_val)})")
print(f"[OK] test_fine.csv 저장:  {test_out} (rows={len(df_test)})")

# ============================================
# 3. Build the split statistics JSON (train_val_test_split.json)
# ============================================

def stats_for(df):
    """Summarise one split as plain, JSON-serialisable Python values.

    Args:
        df: DataFrame with ``coarse_class`` and ``canonical_service_name``
            columns.

    Returns:
        A dict with the row count under ``count`` and the per-label
        ``value_counts()`` of each of the two columns under
        ``coarse_distribution`` and ``fine_distribution``.
    """

    def _label_counts(column):
        # Cast every count to a built-in int so json can serialise it.
        return {label: int(n) for label, n in df[column].value_counts().items()}

    return {
        "count": int(len(df)),
        "coarse_distribution": _label_counts("coarse_class"),
        "fine_distribution": _label_counts("canonical_service_name"),
    }

# Per-split statistics keyed by split name, in train / val / test order.
split_stats = {
    split_name: stats_for(split_df)
    for split_name, split_df in (
        ("train", df_train),
        ("val", df_val),
        ("test", df_test),
    )
}

# Serialise first, then write, so the file is only touched once the JSON
# text has been produced. ensure_ascii=False keeps non-ASCII labels readable.
json_out = data_path / "train_val_test_split.json"
split_stats_text = json.dumps(split_stats, indent=2, ensure_ascii=False)
json_out.write_text(split_stats_text, encoding="utf-8")
print(f"[OK] train_val_test_split.json 저장: {json_out}")

print("[DONE] taxonomy + split + stats 재생성 완료.")
PY

