In [1]:
import os
from pathlib import Path
import random
from PIL import Image
from transformers import AutoProcessor, AutoModelForZeroShotObjectDetection, infer_device
import torch
from datetime import datetime
import time
import json
import hashlib
import numpy as np


  from .autonotebook import tqdm as notebook_tqdm


In [11]:
# the print takes a long time so inspect to use a native package or another solution to do this faster
def retrieve_image_batches(images_folder, batch_size, sample_size=None, random_state=None):
    """
    Yield batches of images directly from a given folder.

    Args:
        images_folder (str | Path): Path to folder containing images (searched recursively).
        batch_size (int): Number of images per batch.
        sample_size (int, optional): Limit total number of images to sample.
        random_state (int, optional): Seed for reproducible shuffling if needed.
    """
    exts = (".jpg", ".jpeg", ".png", ".bmp", ".gif", ".webp")
    paths = [p for p in Path(images_folder).glob("*") if p.suffix.lower() in exts]
    print(f"Found {len(paths)} image files")

    if random_state is not None:
        random.seed(random_state)
        random.shuffle(paths)
    if sample_size:
        paths = paths[:sample_size]

    for i in range(0, len(paths), batch_size):
        batch = paths[i:i + batch_size]
        yield [Image.open(p).convert("RGB").copy() for p in batch], [p.name for p in batch]

In [3]:
test_root = "../Data/tomatoes/images/val"   # adjust to your dataset path
for imgs, names in retrieve_image_batches(test_root, batch_size=2, sample_size=4):
    print("Batch filenames:", names)
    print("Batch size:", len(imgs))
    break  # only first batch for quick test

Found 1325 image files
Batch filenames: ['2025_02_06__13_24_55_176000000___camera_4__camera_5_bunch_3.png', '2023_11_22__14_52_11_503000000___04Z49__04T2Y_bunch_9.png']
Batch size: 2


In [4]:
def select_model(model_name):
    if model_name == "gd_t":
        model_id = "IDEA-Research/grounding-dino-tiny"
        device = infer_device()

    if model_name == "gd_b":
        model_id = "IDEA-Research/grounding-dino-base"
        device = infer_device()

    if model_name == "mmgd_t":
        model_id = "openmmlab-community/mm_grounding_dino_tiny_o365v1_goldg_v3det"
        device = infer_device()

    if model_name == "mmgd_b_all": # too big for T4 GPU
        model_id = "rziga/mm_grounding_dino_base_all"
        device = infer_device()

    if model_name == "mmgd_l_all": # too big for T4 GPU
        model_id = "rziga/mm_grounding_dino_large_all"
        device = infer_device()

    processor = AutoProcessor.from_pretrained(model_id)#, token=os.environ["HF_TOKEN"])
    model = AutoModelForZeroShotObjectDetection.from_pretrained(model_id)#, token=os.environ["HF_TOKEN"]).to(device)

    return processor, model

In [5]:
def build_output_path(experiment, images_folder, model_name):
    parent = os.path.basename(os.path.dirname(os.path.dirname(images_folder)))
    base = os.path.splitext(os.path.basename(images_folder))[0]

    out_dir = os.path.join("..", "Results", experiment)
    os.makedirs(out_dir, exist_ok=True)

    out_file = os.path.join(out_dir, f"{parent}_{base}_{model_name}_predictions.json")
    return out_file


In [6]:
def write_coco_output(
 images_folder,
 model_name,
 categories_list,
 images,
 annotations,
 num_images,
 num_boxes,
 total_time,
 gpu_hourly_price=2.5,
 gpu_tdp_watts=250.0,
 gpu_utilization_factor=0.9,
 power_utilization_factor=0.7,
 electricity_price_per_kwh=0.30,
):
 # ---- timing statistics ----
 avg_time_image = total_time / num_images if num_images else 0.0
 avg_time_bbox = total_time / num_boxes if num_boxes else 0.0

 # ---- cost / energy estimates ----
 gpu_hours = (total_time / 3600.0) * gpu_utilization_factor
 infra_cost_eur = gpu_hours * gpu_hourly_price
 energy_kwh = (gpu_tdp_watts / 1000.0) * (total_time / 3600.0) * power_utilization_factor
 energy_cost_eur = energy_kwh * electricity_price_per_kwh
 total_cost_eur = infra_cost_eur + energy_cost_eur

#  cost_per_image_eur = total_cost_eur / num_images if num_images else 0.0
 cost_per_bbox_eur = total_cost_eur / num_boxes if num_boxes else 0.0

 info = {
     "description": f"Predictions on {images_folder} with {model_name}",
     "date_created": datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
     "num_images": num_images,
     "num_predicted_bbox": num_boxes,
     "avg_inference_time_s_image": avg_time_image,
     "avg_inference_time_s_bbox": avg_time_bbox,
     "total_inference_time_s": total_time,
     "gpu_hours_estimate": gpu_hours,
     "total_cost_eur": total_cost_eur,
     "cost_per_bbox_eur": cost_per_bbox_eur,
 }

 categories = [{"id": i + 1, "name": name} for i, name in enumerate(categories_list)]

 coco_output = {
     "info": info,
     "images": images,
     "annotations": annotations,
     "categories": categories,
 }

 out_file = build_output_path("Experiment_1", images_folder, model_name)
 with open(out_file, "w", encoding="utf-8") as f:
     json.dump(coco_output, f, indent=2)

 return out_file

In [7]:
def make_zero_shot_predictions(
    images_folder,
    categories_list,
    model_name,
    batch_size,
    sample_size,
    random_state=None,
    threshold=0.4,
    text_threshold=0.3,
):
    """
    Run zero-shot object detection and export results in COCO format.

    This function:
      1. Loads a model and processor via `select_model(model_name)`.
      2. Iterates over images in `images_folder` in batches.
      3. Runs zero-shot detection.
      4. Collects predictions into COCO-style structures.
      5. Calls `write_coco_output` to write a JSON file for Experiment_1.

    Parameters
    ----------
    images_folder : str
        Folder containing images.
    categories_list : list[str]
        Class names (e.g. ["cat", "dog", "person"]).
    model_name : str
        Model identifier for select_model().
    batch_size : int
        Number of images per batch.
    sample_size : int
        Total number of images to sample from the folder.
    random_state : int or None, optional
        Random seed used in `retrieve_image_batches`.
    threshold : float, optional
        Detection score threshold.
    text_threshold : float, optional
        Text matching threshold for the grounded detection head.
    """

    # Load model and processor
    processor, model = select_model(model_name)

    # Map category name -> category id (1-based)
    categories_dict = {name: i + 1 for i, name in enumerate(categories_list)}

    images = []
    annotations = []

    def image_id_from_name(name: str) -> int:
        """Stable integer id derived from the image filename."""
        return int(hashlib.md5(name.encode()).hexdigest()[:8], 16)

    model.eval()
    use_cuda = (model.device.type == "cuda")

    # Labels used for the text encoder
    label_list = categories_list
    cached_text = None  # filled the first time we see a batch

    total_time = 0.0
    num_images = 0
    num_boxes = 0

    warmup_steps = 2
    first_batch = True

    with torch.inference_mode():
        for imgs, names in retrieve_image_batches(
            images_folder=images_folder,
            batch_size=batch_size,
            sample_size=sample_size,
            random_state=random_state,
        ):
            # Build and cache text encoding once
            if cached_text is None:
                cached_text = processor(
                    text=[label_list],
                    return_tensors="pt",
                    padding=True,
                )

            # Prepare model inputs for this batch
            inputs = processor(images=imgs, return_tensors="pt", padding=True)
            inputs["input_ids"] = cached_text.input_ids.repeat(len(imgs), 1)
            inputs = {
                k: v.to(model.device, non_blocking=True)
                for k, v in inputs.items()
            }

            # Warmup on the first batch (not timed)
            if first_batch:
                for _ in range(warmup_steps):
                    _ = model(**inputs)
                    if use_cuda:
                        torch.cuda.synchronize()
                first_batch = False

            # Timed forward pass
            start = time.perf_counter()
            outputs = model(**inputs)
            if use_cuda:
                torch.cuda.synchronize()
            end = time.perf_counter()

            total_time += (end - start)
            num_images += len(imgs)

            # Post-process detections to image coordinates
            target_sizes = [(im.height, im.width) for im in imgs]
            results = processor.post_process_grounded_object_detection(
                outputs,
                inputs["input_ids"],
                threshold=threshold,
                text_threshold=text_threshold,
                target_sizes=target_sizes,
            )

            # Collect COCO-style image and annotation entries
            for name, res, im in zip(names, results, imgs):
                img_id = image_id_from_name(name)
                H, W = im.height, im.width

                images.append({
                    "id": img_id,
                    "file_name": f"images/val/{name}",
                })

                boxes = res["boxes"].tolist()
                scores = res["scores"].tolist()
                labels = res.get("text_labels", [])

                for box, score, label in zip(boxes, scores, labels):
                    x1, y1, x2, y2 = map(float, box)

                    # Clamp to image bounds
                    x1 = max(0.0, min(x1, W))
                    y1 = max(0.0, min(y1, H))
                    x2 = max(0.0, min(x2, W))
                    y2 = max(0.0, min(y2, H))

                    # Enforce correct ordering
                    if x2 < x1:
                        x1, x2 = x2, x1
                    if y2 < y1:
                        y1, y2 = y2, y1

                    w = max(0.0, x2 - x1)
                    h = max(0.0, y2 - y1)
                    if w == 0.0 or h == 0.0:
                        continue

                    # Map label string to category id
                    cid = categories_dict.get(label)
                    if cid is None:
                        continue

                    ann_id = len(annotations) + 1
                    annotations.append({
                        "id": ann_id,
                        "image_id": img_id,
                        "category_id": cid,
                        "bbox": [x1, y1, w, h],
                        "score": float(score),
                    })
                    num_boxes += 1

    # Write COCO JSON and log path
    out_file = write_coco_output(
        images_folder=images_folder,
        model_name=model_name,
        categories_list=categories_list,
        images=images,
        annotations=annotations,
        num_images=num_images,
        num_boxes=num_boxes,
        total_time=total_time,
    )

    print(f"Wrote COCO-format JSON to {out_file}")


In [9]:
make_zero_shot_predictions(
    images_folder="../Data/tomatoes/images/val",
    categories_list=["tomato"],
    model_name="gd_t",
    batch_size=2,
    sample_size=4,
    threshold=0.4,
    text_threshold=0.3,
)

Found 1325 image files
Wrote COCO-format JSON to ../Results/Experiment_1/tomatoes_val_gd_t_predictions.json
