In [None]:
# Clone GroundingDINO
!git clone https://github.com/IDEA-Research/GroundingDINO.git
!pip install -e GroundingDINO

# Other dependencies
!pip install supervision diffusers transformers accelerate scipy safetensors
!pip install huggingface_hub tqdm pandas


In [12]:
# ===================================================
# GroundingDINO Inference for Local Images (CPU-friendly version)
# ===================================================

import os
import time
import torch
import sys
import pandas as pd
from tqdm import tqdm

# --- Add GroundingDINO repo to path ---
sys.path.append("GroundingDINO")  # adjust if your repo is elsewhere

# --- GroundingDINO core imports ---
from GroundingDINO.groundingdino.util.slconfig import SLConfig
from GroundingDINO.groundingdino.models import build_model
from GroundingDINO.groundingdino.util.utils import clean_state_dict
from GroundingDINO.groundingdino.util.inference import load_image, predict
from huggingface_hub import hf_hub_download

# ----------------------
# Load model manually
# ----------------------
def load_model_hf(repo_id, filename, ckpt_config_filename, device='cpu'):
    cache_config_file = hf_hub_download(repo_id=repo_id, filename=ckpt_config_filename)
    args = SLConfig.fromfile(cache_config_file)
    model = build_model(args)
    args.device = device

    cache_file = hf_hub_download(repo_id=repo_id, filename=filename)
    checkpoint = torch.load(cache_file, map_location='cpu')
    log = model.load_state_dict(clean_state_dict(checkpoint["model"]), strict=False)
    print(f"Model loaded from {cache_file} => {log}")
    _ = model.eval()
    return model

# -----------------------------------------------------------
# Configuration
# -----------------------------------------------------------
DEVICE = torch.device("cpu")  # Force CPU
ckpt_repo_id = "ShilongLiu/GroundingDINO"
ckpt_filename = "groundingdino_swinb_cogcoor.pth"
ckpt_config_filename = "GroundingDINO_SwinB.cfg.py"

IMAGE_FOLDER = "Img_folder"  # folder containing 1.jpeg to 5.jpeg
TEXT_PROMPT = ""  # empty = detect all
BOX_THRESHOLD = 0.3
TEXT_THRESHOLD = 0.25

# -----------------------------------------------------------
# Load model
# -----------------------------------------------------------
model = load_model_hf(
    repo_id=ckpt_repo_id,
    filename=ckpt_filename,
    ckpt_config_filename=ckpt_config_filename,
    device=DEVICE
)

# -----------------------------------------------------------
# Inference function
# -----------------------------------------------------------
def get_boxes(image_path):
    image_source, image = load_image(image_path)
    boxes, _, _ = predict(
        model=model,
        image=image,
        caption=TEXT_PROMPT,
        box_threshold=BOX_THRESHOLD,
        text_threshold=TEXT_THRESHOLD,
        device=DEVICE
    )

    # Scale boxes to original image size
    boxes = boxes * torch.tensor([
        image_source.shape[1], image_source.shape[0],
        image_source.shape[1], image_source.shape[0]
    ])

    # Convert [x1, y1, x2, y2] → [x_center, y_center, width, height]
    boxes_xywh = []
    for box in boxes:
        x1, y1, x2, y2 = box.tolist()
        w = x2 - x1
        h = y2 - y1
        xc = x1 + w / 2
        yc = y1 + h / 2
        boxes_xywh.append([xc, yc, w, h])

    return boxes_xywh

# -----------------------------------------------------------
# Run on images in folder
# -----------------------------------------------------------
image_names = [f"{i}.jpeg" for i in range(1, 6)]
start_time = time.time()

for img_name in image_names:
    img_path = os.path.join(IMAGE_FOLDER, img_name)
    try:
        boxes = get_boxes(img_path)
        print(f"\n{img_name} bounding boxes:")
        for box in boxes:
            x, y, w, h = box
            print(f"  [{x:.1f}, {y:.1f}, {w:.1f}, {h:.1f}]")
    except Exception as e:
        print(f"Failed to process {img_name}: {e}")

end_time = time.time()
total_time = end_time - start_time
throughput = len(image_names) / total_time

print(f"\nTotal time for {len(image_names)} images: {total_time:.2f} seconds")
print(f"Throughput: {throughput:.2f} images per second")


final text_encoder_type: bert-base-uncased
Model loaded from /root/.cache/huggingface/hub/models--ShilongLiu--GroundingDINO/snapshots/a94c9b567a2a374598f05c584e96798a170c56fb/groundingdino_swinb_cogcoor.pth => _IncompatibleKeys(missing_keys=[], unexpected_keys=['label_enc.weight', 'bert.embeddings.position_ids'])

1.jpeg bounding boxes:

2.jpeg bounding boxes:

3.jpeg bounding boxes:

4.jpeg bounding boxes:

5.jpeg bounding boxes:

Total time for 5 images: 165.25 seconds
Throughput: 0.03 images per second
