In [None]:
import os

In [None]:
from pathlib import Path
import random
from PIL import Image


In [3]:
from transformers import AutoProcessor, AutoModelForZeroShotObjectDetection


In [18]:
def retrieve_image_batches(images_folder, batch_size, sample_size=None, random_state=None):
    """
    Yield batches of images directly from a given folder.

    Args:
        images_folder (str | Path): Path to folder containing images (searched recursively).
        batch_size (int): Number of images per batch.
        sample_size (int, optional): Limit total number of images to sample.
        random_state (int, optional): Seed for reproducible shuffling if needed.
    """
    exts = (".jpg", ".jpeg", ".png", ".bmp", ".gif", ".webp")
    paths = [p for p in Path(images_folder).glob("*") if p.suffix.lower() in exts]
    print(f"Found {len(paths)} image files")

    if random_state is not None:
        random.seed(random_state)
        random.shuffle(paths)
    if sample_size:
        paths = paths[:sample_size]

    for i in range(0, len(paths), batch_size):
        batch = paths[i:i + batch_size]
        yield [Image.open(p).convert("RGB").copy() for p in batch], [p.name for p in batch]

In [None]:
test_root = "../Data/tomatoes/images/val"   # adjust to your dataset path
for imgs, names in retrieve_image_batches(test_root, batch_size=2, sample_size=4):
    print("Batch filenames:", names)
    print("Batch size:", len(imgs))
    break  # only first batch for quick test

Found 364 image files
Batch filenames: ['2025_02_26__14_24_30_468000000___camera_4__camera_5_bunch_5.png', '2025_02_06__13_28_13_350000000___camera_4__camera_5_bunch_12.png']
Batch size: 2


In [20]:
def select_model(model_name):
    if model_name == "gd_t":
        model_id = "IDEA-Research/grounding-dino-tiny"
        device = infer_device()

    if model_name == "gd_b":
        model_id = "IDEA-Research/grounding-dino-base"
        device = infer_device()

    if model_name == "mmgd_t":
        model_id = "openmmlab-community/mm_grounding_dino_tiny_o365v1_goldg_v3det"
        device = infer_device()

    if model_name == "mmgd_b_all": # too big for T4 GPU
        model_id = "rziga/mm_grounding_dino_base_all"
        device = infer_device()

    if model_name == "mmgd_l_all": # too big for T4 GPU
        model_id = "rziga/mm_grounding_dino_large_all"
        device = infer_device()

    processor = AutoProcessor.from_pretrained(model_id)#, token=os.environ["HF_TOKEN"])
    model = AutoModelForZeroShotObjectDetection.from_pretrained(model_id)#, token=os.environ["HF_TOKEN"]).to(device)

    return processor, model

In [21]:
def make_predictions(
    dataset_folder,
    categories_dict,
    model_name,
    batch_size=4,
    sample_size=20,
    random_state=44,
    threshold=0.4,
    text_threshold=0.3,
):
    """
    Run zero-shot object detection and export results in COCO format.
    Saves '<model_name>_<dataset_name>_predictions.json' inside the dataset folder.
    """
    processor, model = select_model(model_name)

    base = os.path.splitext(os.path.basename(dataset_folder))[0]
    out_file = os.path.join(dataset_folder, f"{model_name}_{base}_predictions.json")

    info = {
        "year": 2025,
        "description": f"Predictions on {dataset_folder} with {model_name}",
        "date_created": datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
    }

    categories = [{"id": cid, "name": name} for name, cid in categories_dict.items()]
    images, annotations = [], []

    def image_id_from_name(name: str) -> int:
        return int(hashlib.md5(name.encode()).hexdigest()[:8], 16)

    model.eval()
    use_cuda = (model.device.type == "cuda")
    label_list = list(categories_dict.keys())

    # cache text tokens once (optional addition to improve inference)
    cached_text = processor(text=[label_list], return_tensors="pt", padding=True)

    total_time = 0.0
    num_images = 0
    num_boxes = 0

    # warmup (start timing after warmup to establish fair results)
    for _ in range(2):
        dummy = processor(
            images=[Image.new("RGB", (224, 224))] * batch_size,
            text=[label_list] * batch_size,
            return_tensors="pt",
            padding=True,
        ).to(model.device)
        with torch.inference_mode():
            with torch.autocast(device_type="cuda", enabled=use_cuda):
                _ = model(**dummy)
        if use_cuda:
            torch.cuda.synchronize()


    cached_text = None  # set before loop

    with torch.inference_mode():
        for imgs, names in retrieve_image_batches(
            dataset_folder=dataset_folder,
            batch_size=batch_size,
            sample_size=sample_size,
            random_state=random_state,
        ):
            start = time.perf_counter()

            # one-time text tokenization, included in timing
            if cached_text is None:
                cached_text = processor(text=[label_list], return_tensors="pt", padding=True)

            # image preprocessing only
            inputs = processor(images=imgs, return_tensors="pt", padding=True)
            inputs["input_ids"] = cached_text.input_ids.repeat(len(imgs), 1)

            # device transfer
            inputs = {k: v.to(model.device, non_blocking=True) for k, v in inputs.items()}

            # forward + sync
            outputs = model(**inputs)
            if use_cuda:
                torch.cuda.synchronize()

            # post-process
            target_sizes = [(im.height, im.width) for im in imgs]
            results = processor.post_process_grounded_object_detection(
                outputs,
                inputs["input_ids"],
                threshold=threshold,
                text_threshold=text_threshold,
                target_sizes=target_sizes,
            )

            end = time.perf_counter()
            total_time += (end - start)
            num_images += len(imgs)


            for name, res in zip(names, results):
                img_id = image_id_from_name(name)
                images.append({"id": img_id, "file_name": f"images/{name}"})
                for box, score, label in zip(res["boxes"], res["scores"], res.get("text_labels", [])):
                    x1, y1, x2, y2 = [float(v) for v in box.tolist()]
                    cid = categories_dict.get(label)
                    if cid:
                        annotations.append({
                            "image_id": img_id,
                            "category_id": cid,
                            "bbox": [x1, y1, x2 - x1, y2 - y1],
                            "score": float(score.item()),
                        })
                        num_boxes += 1

    avg_time_image = total_time / num_images if num_images else 0.0
    avg_time_bbox = total_time / num_boxes if num_boxes else 0.0

    info["num_images"] = num_images
    info["num_predicted_bbox"] = num_boxes
    info["avg_inference_time_s_image"] = avg_time_image
    info["avg_inference_time_s_bbox"] = avg_time_bbox
    info["total_inference_time_s"] = total_time

    coco_output = {
        "info": info,
        "images": images,
        "annotations": annotations,
        "categories": categories,
    }

    with open(out_file, "w", encoding="utf-8") as f:
        json.dump(coco_output, f, indent=2)

    print(f"Wrote COCO-format JSON to {out_file}")
    print(f"Total inference time: {total_time:.4f}s")
    print(f"Avg inference time per image: {avg_time_image:.4f}s")
    print(f"Avg inference time per bbox: {avg_time_bbox:.6f}s")
    print(f"Processed {num_images} images and {num_boxes} boxes")

    return coco_output
