In [1]:
# !pip install -q git+https://github.com/tcstrength/item-identification.git@main timm

In [1]:
import os
import pandas as pd
from pathlib import Path
from hcmus.core import appconfig
from hcmus.lbs import LabelStudioConnector
from hcmus.utils import viz_utils

[32m2025-07-15 21:23:45.562[0m | [1mINFO    [0m | [36mhcmus.core.appconfig[0m:[36m<module>[0m:[36m7[0m - [1mLoad DotEnv: True[0m


In [2]:
# Allow-list of label names to keep verbatim, one name per line inside the
# triple-quoted block. Labels not listed here are replaced by "object" when
# the splits are loaded.
accepted_labels = """
"""
# The literal starts with a newline, so a plain `splitlines()` would yield a
# spurious "" entry. Strip each line and drop blanks so the list only ever
# contains real label names (and tolerates stray indentation/trailing spaces).
accepted_labels = [
    line.strip() for line in accepted_labels.splitlines() if line.strip()
]

In [3]:
splits = {}
for split_name in ["train", "test", "val"]:
    # The Label Studio project id is looked up per split name.
    lsb_connector = LabelStudioConnector(
        url=appconfig.LABEL_STUDIO_URL,
        api_key=appconfig.LABEL_STUDIO_API_KEY,
        project_id=appconfig.LABEL_STUDIO_PROJECT_MAPPING[split_name],
        temp_dir=appconfig.LABEL_STUDIO_TEMP_DIR,
    )

    tasks = lsb_connector.get_tasks()
    labels = lsb_connector.extract_labels(tasks)

    # Keep only the samples whose target carries at least one label.
    dataset = [
        sample
        for sample in lsb_connector.download_dataset(tasks, labels)
        if sample.get("target").get("labels")
    ]

    # `labels` maps name -> index; invert it to translate indices back to names.
    idx2label = {index: name for name, index in labels.items()}

    for item in dataset:
        target = item.get("target")
        # Names outside `accepted_labels` collapse into the generic "object"
        # class (compatible with SKU110k).
        target["labels"] = [
            idx2label[idx] if idx2label[idx] in accepted_labels else "object"
            for idx in target.get("labels")
        ]

    splits[split_name] = dataset

[32m2025-07-15 21:23:47.251[0m | [1mINFO    [0m | [36mhcmus.lbs._label_studio_connector[0m:[36mget_tasks[0m:[36m152[0m - [1mNew `page_to` applied: 35[0m
Loading tasks: 100%|██████████| 35/35 [00:11<00:00,  3.14it/s]
Downloading images: 100%|██████████| 3443/3443 [00:06<00:00, 557.94it/s] 
[32m2025-07-15 21:24:04.672[0m | [1mINFO    [0m | [36mhcmus.lbs._label_studio_connector[0m:[36mget_tasks[0m:[36m152[0m - [1mNew `page_to` applied: 5[0m
Loading tasks: 100%|██████████| 5/5 [00:04<00:00,  1.10it/s]
Downloading images: 100%|██████████| 420/420 [00:02<00:00, 193.55it/s]
[32m2025-07-15 21:24:11.462[0m | [1mINFO    [0m | [36mhcmus.lbs._label_studio_connector[0m:[36mget_tasks[0m:[36m152[0m - [1mNew `page_to` applied: 4[0m
Loading tasks: 100%|██████████| 4/4 [00:01<00:00,  2.54it/s]
Downloading images: 100%|██████████| 309/309 [00:01<00:00, 175.12it/s]


In [4]:
import os
import json
import shutil
from PIL import Image

def generate_categories(splits):
    """Build the category table shared by every split.

    Args:
        splits: Mapping of split name -> list of items, where each item
            exposes its label names at ``item["target"]["labels"]``.

    Returns:
        A 2-tuple ``(categories, label_to_id)``:
        ``categories`` is a list of ``{"id": int, "name": str}`` dicts and
        ``label_to_id`` maps each label name to the same id. Ids start at 1
        and follow the sorted order of the distinct label names.
    """
    unique_names = set()
    for items in splits.values():
        for entry in items:
            unique_names.update(entry["target"]["labels"])

    # Sorting makes the id assignment independent of set iteration order.
    name_to_id = {
        name: cat_id for cat_id, name in enumerate(sorted(unique_names), start=1)
    }
    categories = [{"id": cat_id, "name": name} for name, cat_id in name_to_id.items()]
    return categories, name_to_id

def convert_split_to_coco(data, categories_dict, split_name, split_output_dir):
    """Export one split as a COCO-style folder.

    Copies every image into ``<split_output_dir>/images`` and writes
    ``<split_output_dir>/annotations_<split_name>.json`` containing the
    ``images``, ``annotations`` and ``categories`` sections.

    Args:
        data: Sequence of items; each item provides ``item["image"]`` (path to
            the image file) and ``item["target"]`` with parallel ``"boxes"``
            and ``"labels"`` sequences. Each box is unpacked as
            ``(x1, y1, x2, y2)``.
        categories_dict: Mapping of label name -> category id.
        split_name: Split identifier used in the JSON file name and the
            progress message.
        split_output_dir: Directory that receives the images folder and the
            annotation file; created if missing.

    Returns:
        None. The results are the files written to disk and one printed line.
    """
    images_dir = os.path.join(split_output_dir, "images")
    os.makedirs(images_dir, exist_ok=True)

    categories = [
        {"id": cat_id, "name": cat_name} for cat_name, cat_id in categories_dict.items()
    ]
    image_records = []
    annotation_records = []

    for img_id, item in enumerate(data):
        src_path = item["image"]
        target = item["target"]
        boxes = target["boxes"]
        labels = target["labels"]

        # Images are stored flat under images/, keyed by their base name.
        file_name = os.path.basename(src_path)
        shutil.copyfile(src_path, os.path.join(images_dir, file_name))

        with Image.open(src_path) as img:
            img_w, img_h = img.size

        image_records.append({
            "id": img_id,
            "file_name": file_name,
            "width": img_w,
            "height": img_h
        })

        for box, label in zip(boxes, labels):
            left, top, right, bottom = box
            # Corner format (x1, y1, x2, y2) -> [x, y, width, height].
            box_w = right - left
            box_h = bottom - top

            annotation_records.append({
                # Annotation ids run from 1 and are unique across the split.
                "id": len(annotation_records) + 1,
                "image_id": img_id,
                "category_id": categories_dict[label],
                "bbox": [left, top, box_w, box_h],
                "area": box_w * box_h,
                "iscrowd": 0
            })

    coco_dict = {
        "images": image_records,
        "annotations": annotation_records,
        "categories": categories
    }

    # The annotation file lives next to the images folder inside the split dir.
    json_path = os.path.join(split_output_dir, f"annotations_{split_name}.json")
    with open(json_path, "w") as f:
        json.dump(coco_dict, f, indent=2)

    print(f"✔ {split_name}: saved {len(data)} images and annotations to {split_output_dir}")

def convert_splits_to_coco(splits, base_output_dir):
    """Export every split to its own COCO folder under ``base_output_dir``.

    The label -> id mapping is computed once over all splits so that every
    split's annotation file uses the same category ids.

    Args:
        splits: Mapping of split name -> list of items, as consumed by
            ``generate_categories`` and ``convert_split_to_coco``.
        base_output_dir: Root directory; a sub-directory named after each
            split is created beneath it.
    """
    os.makedirs(base_output_dir, exist_ok=True)

    # Only the name -> id mapping is needed; the category list is rebuilt per split.
    _categories, shared_label_ids = generate_categories(splits)

    for name, items in splits.items():
        target_dir = os.path.join(base_output_dir, name)
        os.makedirs(target_dir, exist_ok=True)
        convert_split_to_coco(items, shared_label_ids, name, target_dir)


In [5]:
# Root folder for the exported dataset; one sub-folder per split is created
# beneath it. NOTE(review): this is a machine-specific absolute path — adjust
# it before running elsewhere.
COCO_OUTPUT_DIR = "/Volumes/Cucumber/Projects/datasets/curated/hcmus-iid-object"
convert_splits_to_coco(splits, COCO_OUTPUT_DIR)

✔ train: saved 3443 images and annotations to /Volumes/Cucumber/Projects/datasets/curated/hcmus-iid-object/train
✔ test: saved 420 images and annotations to /Volumes/Cucumber/Projects/datasets/curated/hcmus-iid-object/test
✔ val: saved 309 images and annotations to /Volumes/Cucumber/Projects/datasets/curated/hcmus-iid-object/val
