In [34]:
import os
from pathlib import Path
import random
from PIL import Image
from transformers import AutoProcessor, AutoModelForZeroShotObjectDetection, infer_device
import torch
from datetime import datetime
import time
import json
import hashlib
import numpy as np


In [7]:
def retrieve_image_batches(images_folder, batch_size, sample_size=None, random_state=None):
    """
    Yield batches of images directly from a given folder.

    Args:
        images_folder (str | Path): Path to folder containing images (searched recursively).
        batch_size (int): Number of images per batch.
        sample_size (int, optional): Limit total number of images to sample.
        random_state (int, optional): Seed for reproducible shuffling if needed.
    """
    exts = (".jpg", ".jpeg", ".png", ".bmp", ".gif", ".webp")
    paths = [p for p in Path(images_folder).glob("*") if p.suffix.lower() in exts]
    print(f"Found {len(paths)} image files")

    if random_state is not None:
        random.seed(random_state)
        random.shuffle(paths)
    if sample_size:
        paths = paths[:sample_size]

    for i in range(0, len(paths), batch_size):
        batch = paths[i:i + batch_size]
        yield [Image.open(p).convert("RGB").copy() for p in batch], [p.name for p in batch]

In [8]:
test_root = "../Data/tomatoes/images/val"   # adjust to your dataset path
for imgs, names in retrieve_image_batches(test_root, batch_size=2, sample_size=4):
    print("Batch filenames:", names)
    print("Batch size:", len(imgs))
    break  # only first batch for quick test

Found 364 image files
Batch filenames: ['2023_08_02__13_32_31_894227000___02Z8R__02Z8W_bunch_1.png', '2023_08_02__14_21_25_793843000___02Z8R__02Z8W_bunch_6.png']
Batch size: 2


In [26]:
def select_model(model_name):
    if model_name == "gd_t":
        model_id = "IDEA-Research/grounding-dino-tiny"
        device = infer_device()

    if model_name == "gd_b":
        model_id = "IDEA-Research/grounding-dino-base"
        device = infer_device()

    if model_name == "mmgd_t":
        model_id = "openmmlab-community/mm_grounding_dino_tiny_o365v1_goldg_v3det"
        device = infer_device()

    if model_name == "mmgd_b_all": # too big for T4 GPU
        model_id = "rziga/mm_grounding_dino_base_all"
        device = infer_device()

    if model_name == "mmgd_l_all": # too big for T4 GPU
        model_id = "rziga/mm_grounding_dino_large_all"
        device = infer_device()

    processor = AutoProcessor.from_pretrained(model_id)#, token=os.environ["HF_TOKEN"])
    model = AutoModelForZeroShotObjectDetection.from_pretrained(model_id)#, token=os.environ["HF_TOKEN"]).to(device)

    return processor, model

In [None]:
def make_predictions(
    images_folder,
    categories_list,
    model_name,
    batch_size=4,
    sample_size=20,
    random_state=44,
    threshold=0.4,
    text_threshold=0.3,
):
    """
    Run zero-shot object detection and export results in COCO format.
    Saves '<model_name>_<dataset_name>_predictions.json' inside the Results folder under Experiment_1.

    Parameters
    ----------
    images_folder : str
        Folder containing images.
    categories_list : list[str]
        Class names (e.g. ["cat", "dog", "person"]).
    model_name : str
        Model identifier for select_model().
    """

    processor, model = select_model(model_name)

    # I/O setup
    base = os.path.splitext(os.path.basename(images_folder))[0]
    out_dir = os.path.join("..", "Results", "Experiment_1")
    os.makedirs(out_dir, exist_ok=True)
    out_file = os.path.join(out_dir, f"{model_name}_{base}_predictions.json")

    # build id/name mapping automatically
    categories = [{"id": i + 1, "name": name} for i, name in enumerate(categories_list)]
    categories_dict = {name: i + 1 for i, name in enumerate(categories_list)}

    info = {
        "description": f"Predictions on {images_folder} with {model_name}",
        "date_created": datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
    }

    images, annotations = [], []

    def image_id_from_name(name: str) -> int:
        return int(hashlib.md5(name.encode()).hexdigest()[:8], 16)

    model.eval()
    use_cuda = (model.device.type == "cuda")

    label_list = list(categories_dict.keys())
    cached_text = processor(text=[label_list], return_tensors="pt", padding=True)

    total_time = 0.0
    num_images = 0
    num_boxes = 0

    # warmup
    for _ in range(2):
        dummy = processor(
            images=[Image.new("RGB", (224, 224))] * batch_size,
            text=[label_list] * batch_size,
            return_tensors="pt",
            padding=True,
        ).to(model.device)
        with torch.inference_mode():
            _ = model(**dummy)
        if use_cuda:
            torch.cuda.synchronize()

    cached_text = None

    with torch.inference_mode():
        for imgs, names in retrieve_image_batches(
            images_folder=images_folder,
            batch_size=batch_size,
            sample_size=sample_size,
            random_state=random_state,
        ):
            start = time.perf_counter()

            if cached_text is None:
                cached_text = processor(text=[label_list], return_tensors="pt", padding=True)

            inputs = processor(images=imgs, return_tensors="pt", padding=True)
            inputs["input_ids"] = cached_text.input_ids.repeat(len(imgs), 1)
            inputs = {k: v.to(model.device, non_blocking=True) for k, v in inputs.items()}

            outputs = model(**inputs)
            if use_cuda:
                torch.cuda.synchronize()

            target_sizes = [(im.height, im.width) for im in imgs]
            results = processor.post_process_grounded_object_detection(
                outputs,
                inputs["input_ids"],
                threshold=threshold,
                text_threshold=text_threshold,
                target_sizes=target_sizes,
            )

            end = time.perf_counter()
            total_time += (end - start)
            num_images += len(imgs)

            for name, res, im in zip(names, results, imgs):
                img_id = image_id_from_name(name)
                H, W = im.height, im.width
                images.append({"id": img_id, "file_name": f"images/val/{name}"})

                boxes = res["boxes"].tolist()
                scores = res["scores"].tolist()
                labels = res.get("text_labels", [])

                for box, score, label in zip(boxes, scores, labels):
                    x1, y1, x2, y2 = map(float, box)

                    # clamp to image bounds
                    x1 = max(0.0, min(x1, W))
                    y1 = max(0.0, min(y1, H))
                    x2 = max(0.0, min(x2, W))
                    y2 = max(0.0, min(y2, H))

                    # enforce proper ordering
                    if x2 < x1:
                        x1, x2 = x2, x1
                    if y2 < y1:
                        y1, y2 = y2, y1

                    w = max(0.0, x2 - x1)
                    h = max(0.0, y2 - y1)
                    if w == 0.0 or h == 0.0:
                        continue

                    cid = categories_dict.get(label)
                    if cid is None:
                        continue

                    ann_id = len(annotations) + 1
                    annotations.append({
                        "id": ann_id,
                        "image_id": img_id,
                        "category_id": cid,
                        "bbox": [x1, y1, w, h],
                        "score": float(score),
                    })
                    num_boxes += 1

    avg_time_image = total_time / num_images if num_images else 0.0
    avg_time_bbox = total_time / num_boxes if num_boxes else 0.0

    info.update({
        "num_images": num_images,
        "num_predicted_bbox": num_boxes,
        "avg_inference_time_s_image": avg_time_image,
        "avg_inference_time_s_bbox": avg_time_bbox,
        "total_inference_time_s": total_time,
    })

    coco_output = {
        "info": info,
        "images": images,
        "annotations": annotations,
        "categories": categories,
    }

    with open(out_file, "w", encoding="utf-8") as f:
        json.dump(coco_output, f, indent=2)

    print(f"Wrote COCO-format JSON to {out_file}")
    print(f"Total inference time: {total_time:.4f}s")
    print(f"Avg inference time per image: {avg_time_image:.4f}s")
    print(f"Avg inference time per bbox: {avg_time_bbox:.6f}s")
    print(f"Processed {num_images} images and {num_boxes} boxes")

    return coco_output


In [36]:
make_predictions(
    images_folder="../Data/tomatoes/images/val",
    categories_list=["tomato"],
    model_name="gd_t",
    batch_size=2,
    sample_size=20,
    threshold=0.4,
    text_threshold=0.3,
)

Found 364 image files
Wrote COCO-format JSON to ..\Results\Experiment_1\gd_t_val_predictions.json
Total inference time: 519.9467s
Avg inference time per image: 25.9973s
Avg inference time per bbox: 17.929196s
Processed 20 images and 29 boxes


{'info': {'description': 'Predictions on ../Data/tomatoes/images/val with gd_t',
  'date_created': '2025-11-11 18:31:03',
  'num_images': 20,
  'num_predicted_bbox': 29,
  'avg_inference_time_s_image': 25.997333549996256,
  'avg_inference_time_s_bbox': 17.929195551721556,
  'total_inference_time_s': 519.9466709999251},
 'images': [{'id': 1467614578,
   'file_name': 'images/val/2023_08_11__16_20_43_969564000___02Z8R__02Z8W_bunch_6.png'},
  {'id': 3405516372,
   'file_name': 'images/val/2023_08_18__08_39_27_579884000___02Z8R__02Z8W_bunch_4.png'},
  {'id': 796568874,
   'file_name': 'images/val/2023_08_18__13_17_47_649299000___02Z8R__02Z8W_bunch_5.png'},
  {'id': 4067744067,
   'file_name': 'images/val/2023_08_04__08_31_03_412536000___02Z8R__02Z8W_bunch_3.png'},
  {'id': 309385764,
   'file_name': 'images/val/2023_11_22__15_06_41_214000000___04ZXJ__04ZXK_bunch_4.png'},
  {'id': 755751560,
   'file_name': 'images/val/2024_09_17__12_45_33_925381000___cam3__cam4_bunch_0.png'},
  {'id': 40204

In [33]:
def compute_exact_pr(cocoEval, iou_thr=0.50):
    # indices
    iou_idx = np.where(np.isclose(cocoEval.params.iouThrs, iou_thr))[0][0]
    area_all = cocoEval.params.areaRng[0]  # 'all' area range [0^2, inf^2]
    maxDets = cocoEval.params.maxDets[-1]

    tp = fp = fn = 0

    for ei in cocoEval.evalImgs:
        if ei is None:
            continue
        # keep ONLY area='all'
        if not np.allclose(ei["aRng"], area_all):
            continue

        dtMatches = np.asarray(ei["dtMatches"])  # [T, D]
        gtMatches = np.asarray(ei["gtMatches"])  # [T, G]
        dtIgnore  = np.asarray(ei["dtIgnore"],  dtype=bool)  # [T, D] or [D]
        gtIgnore  = np.asarray(ei["gtIgnore"],  dtype=bool)  # [G]

        # Some pycocotools versions store dtIgnore as [D]; normalize
        if dtIgnore.ndim == 1:
            dtIgnore = np.tile(dtIgnore, (len(cocoEval.params.iouThrs), 1))

        dtm = dtMatches[iou_idx, :maxDets]
        dti = dtIgnore[iou_idx, :maxDets]
        gtm = gtMatches[iou_idx]
        gti = gtIgnore

        tp += int(np.sum((dtm > 0) & (~dti)))
        fp += int(np.sum((dtm == 0) & (~dti)))
        fn += int(np.sum((gtm == 0) & (~gti)))

    print(f"true positives:{tp}")
    print(f"false positives:{fp}")
    print(f"false negatives:{fn}")
    precision_exact = tp / (tp + fp) if (tp + fp) else 0.0
    recall_exact = tp / (tp + fn) if (tp + fn) else 0.0
    return precision_exact, recall_exact
