In [1]:
!pip install -q git+https://github.com/IDEA-Research/GroundingDINO.git
!pip install -q git+https://github.com/huggingface/transformers.git

In [1]:
import cv2
from PIL import Image
import supervision as sv


def read_video_frames(video_path, frame_indices: list[int]):
    frames = {}
    cap = cv2.VideoCapture(video_path)
    if not cap.isOpened():
        raise Exception("Error Opening Stream @ {video_path}")

    frame_id = 0
    max_index = sorted(frame_indices)[-1]
    while cap.isOpened():
        _, frame = cap.read()
        if frame is None or frame_id > max_index:
            # No more frames. Recognition done.
            break

        if frame_id in frame_indices:
            frames[frame_id] = frame

        frame_id += 1

    cap.release()
    cv2.destroyAllWindows()
     
    return frames

def save_annotation(image, detections, labels, f_path):
    box_annotator = sv.BoxAnnotator()
    annotated_image = box_annotator.annotate(scene=image, detections=detections, labels=labels)
    cv2.imwrite(f_path, annotated_image)

### Run as script

#### From Repo

Every Face is detected

Files are saved in bboxes/script-*.jpg

In [2]:
from cProfile import label
import json
from groundingdino.util.inference import Model
from utils import read_video_frames, save_annotation

TEXT_PROMPT = "Faces."

model = Model("./grounding_dino/config/GroundingDINO_SwinT_OGC.py", "./grounding_dino/weights/groundingdino_swint_ogc.pth", device="cpu")


frames = read_video_frames("video.mp4", [4,10,32])

predictions = {
    k: model.predict_with_caption(v, TEXT_PROMPT)
    for k,v in frames.items()
}

bboxes = {
    k: v[0].xyxy.tolist()
    for k,v in predictions.items()
}


for frame_id, result in predictions.items():
    detections, labels = result
    f_path = f"bboxes/script-{frame_id}.jpg"
    save_annotation(frames[frame_id], detections, labels, f_path)

print(json.dumps(bboxes, indent=2))



final text_encoder_type: bert-base-uncased




{
  "4": [],
  "10": [
    [
      595.7476196289062,
      210.34384155273438,
      664.7664184570312,
      310.4613952636719
    ]
  ],
  "32": [
    [
      503.5018310546875,
      118.34725952148438,
      739.0670166015625,
      452.6176452636719
    ]
  ]
}




#### From huggingface
Nothing is detected

The code block is copied from: https://huggingface.co/IDEA-Research/grounding-dino-tiny

In [3]:

import torch
from transformers import AutoProcessor, GroundingDinoForObjectDetection


model_id = "IDEA-Research/grounding-dino-tiny"
device="cpu"
processor = AutoProcessor.from_pretrained(model_id)
model = GroundingDinoForObjectDetection.from_pretrained(model_id).to(device)

frames = read_video_frames("video.mp4", [4,10,32])
frames = {
    k:Image.fromarray(v)
    for k,v in frames.items()
}
# Check for cats and remote controls
text = "Faces."

dets = {}
for idx, image in frames.items():
    inputs = processor(images=image, text=text, return_tensors="pt").to(device)
    with torch.no_grad():
        outputs = model(**inputs)

    results = processor.post_process_grounded_object_detection(
        outputs,
        inputs.input_ids,
        box_threshold=0.35,
        text_threshold=0.25,
        target_sizes=[image.size[::-1]]
    )
    dets[idx] = results

dets

{4: [{'scores': tensor([0.3629]),
   'labels': ['faces'],
   'boxes': tensor([[591.4139, 207.4501, 654.7849, 299.5935]])}],
 10: [{'scores': tensor([0.3848]),
   'labels': ['faces'],
   'boxes': tensor([[595.7828, 210.3787, 664.9124, 310.6078]])}],
 32: [{'scores': tensor([0.3518]),
   'labels': ['faces'],
   'boxes': tensor([[505.5514, 119.7520, 740.2328, 452.0472]])}]}