## Install Grounding DINO 🦕

In [1]:
import os
# Directory the kernel was started in; the following cells build the clone,
# weights and config paths from it. NOTE: the install cell right below repeats
# these two lines, so this cell is redundant when both are run.
HOME = os.getcwd()

In [1]:
import os
HOME = os.getcwd()

%cd {HOME}
!git clone https://github.com/IDEA-Research/GroundingDINO.git
%cd {HOME}/GroundingDINO
!pip install -q -e .
!pip install -q roboflow

!mkdir {HOME}/weights
%cd {HOME}/weights

!wget -q https://github.com/IDEA-Research/GroundingDINO/releases/download/v0.1.0-alpha/groundingdino_swint_ogc.pth
!pip install -q loguru
!pip install torchvision

/Users/tarik.setia/lab/cv-benchmark
Cloning into 'GroundingDINO'...


  self.shell.db['dhist'] = compress_dhist(dhist)[-100:]


remote: Enumerating objects: 443, done.[K
remote: Counting objects: 100% (211/211), done.[K
remote: Compressing objects: 100% (83/83), done.[K
remote: Total 443 (delta 156), reused 137 (delta 128), pack-reused 232[K
Receiving objects: 100% (443/443), 12.86 MiB | 1.85 MiB/s, done.
Resolving deltas: 100% (228/228), done.
/Users/tarik.setia/lab/cv-benchmark/GroundingDINO
Reason for being yanked: deprecated, use 4.8.0.76[0m[33m
[0m/Users/tarik.setia/lab/cv-benchmark/weights


In [2]:
!python -m pip install -q pillow
!python -m pip install -q loguru
!python -m pip install -q pydantic
!python -m pip install -q pandas

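## Restart the notebook kernel at this point

Restarting makes the freshly installed packages importable and resets the working directory, which the next cell reads with `os.getcwd()`.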

In [6]:
import os
# All paths are anchored to the directory the kernel is currently in.
HOME = os.getcwd()
WEIGHTS_NAME = "groundingdino_swint_ogc.pth"

CONFIG_PATH = os.path.join(HOME, "GroundingDINO/groundingdino/config/GroundingDINO_SwinT_OGC.py")
WEIGHTS_PATH = os.path.join(HOME, "weights", WEIGHTS_NAME)

# Report up front whether the model config and checkpoint are actually on disk.
print(f"{CONFIG_PATH} ; exist: {os.path.isfile(CONFIG_PATH)}")
print(f"{WEIGHTS_PATH} ; exist: {os.path.isfile(WEIGHTS_PATH)}")

/Users/tarik.setia/lab/cv-benchmark/GroundingDINO/groundingdino/config/GroundingDINO_SwinT_OGC.py ; exist: True
/Users/tarik.setia/lab/cv-benchmark/weights/groundingdino_swint_ogc.pth ; exist: True


In [7]:
import torch

# Use the GPU when PyTorch reports one is available, otherwise fall back to CPU.
if torch.cuda.is_available():
    DEVICE = "cuda"
else:
    DEVICE = "cpu"
DEVICE

'cpu'

In [8]:
from groundingdino.util.inference import Model, predict, preprocess_caption, load_model
from groundingdino.util.utils import get_phrases_from_posmap

# Build the model from the config and checkpoint, then place it on DEVICE.
model = load_model(CONFIG_PATH, WEIGHTS_PATH, DEVICE).to(DEVICE)

final text_encoder_type: bert-base-uncased


In [9]:
import numpy as np
import torch
from PIL import Image

import groundingdino.datasets.transforms as T

def preprocess_frame(frame: np.ndarray) -> torch.Tensor:
    """Turn one video frame into the normalized tensor passed to the model.

    The frame is cast to ``uint8``, wrapped in a PIL image, resized with
    ``RandomResize([800], max_size=1333)``, converted to a tensor and
    normalized with the channel mean/std constants listed below (the usual
    ImageNet statistics).

    Args:
        frame: Pixel array for a single frame, in a layout that
            ``PIL.Image.fromarray`` accepts.

    Returns:
        The transformed image tensor. The second value returned by the
        transform pipeline (the target) is discarded.
    """
    transform = T.Compose(
        [
            T.RandomResize([800], max_size=1333),
            T.ToTensor(),
            T.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225]),
        ]
    )
    # NOTE(review): the channel order is used as-is. If `read_video` yields
    # OpenCV-style BGR frames, a BGR->RGB conversion is needed before this
    # point — confirm what `read_video` returns.
    image_source = Image.fromarray(frame.astype("uint8"))
    # No target is supplied, so only the transformed image is of interest.
    image_transformed, _ = transform(image_source, None)
    return image_transformed

In [10]:
from typing import List, Tuple
import bisect

def predict(
        model,
        image: torch.Tensor,
        caption: str,
        box_threshold: float,
        text_threshold: float,
        device: str = "cuda",
        remove_combined: bool = False
) -> Tuple[torch.Tensor, torch.Tensor, List[str]]:
    """Run one forward pass and return the detections that clear the thresholds.

    This notebook-level definition replaces the ``predict`` imported from
    ``groundingdino.util.inference`` earlier in the notebook.

    Args:
        model: Callable model exposing a ``tokenizer`` attribute.
        image: Pre-processed image tensor; a leading batch axis is added here.
        caption: Text prompt; it is normalized with ``preprocess_caption``.
        box_threshold: A query is kept when its highest token score exceeds this.
        text_threshold: Token scores above this are used to build the phrase.
        device: Accepted for signature compatibility only; this body never
            moves the model or the image, so both must already be on the
            intended device.
        remove_combined: When true, each phrase is restricted to the caption
            segment (between two separator tokens) that contains the
            highest-scoring token.

    Returns:
        ``(boxes, scores, phrases)``: the kept boxes, the maximum token score of
        each kept query, and one phrase per kept query with ``'.'`` removed.
    """
    caption = preprocess_caption(caption=caption)

    with torch.no_grad():
        raw_outputs = model(image[None], captions=[caption])

    # Work on CPU copies of the first (only) batch element.
    query_scores = raw_outputs["pred_logits"].cpu().sigmoid()[0]  # (nq, 256)
    query_boxes = raw_outputs["pred_boxes"].cpu()[0]  # (nq, 4)

    # Keep the queries whose best token score clears the box threshold.
    keep = query_scores.max(dim=1)[0] > box_threshold
    logits = query_scores[keep]  # (n, 256)
    boxes = query_boxes[keep]  # (n, 4)

    tokenizer = model.tokenizer
    tokenized = tokenizer(caption)

    def phrase_for(logit, *span):
        # Decode the tokens whose score exceeds the text threshold; `span`, when
        # given, is the (left, right) separator pair bounding the phrase.
        decoded = get_phrases_from_posmap(logit > text_threshold, tokenized, tokenizer, *span)
        return decoded.replace('.', '')

    if not remove_combined:
        phrases = [phrase_for(logit) for logit in logits]
    else:
        # Positions of the separator tokens in the caption — presumably [CLS],
        # [SEP] and '.' in the tokenizer's vocabulary; verify if the text
        # encoder changes.
        sep_idx = [
            position
            for position, token_id in enumerate(tokenized['input_ids'])
            if token_id in [101, 102, 1012]
        ]

        phrases = []
        for logit in logits:
            # Locate the pair of separators surrounding the best-scoring token.
            insert_idx = bisect.bisect_left(sep_idx, logit.argmax())
            right_idx = sep_idx[insert_idx]
            left_idx = sep_idx[insert_idx - 1]
            phrases.append(phrase_for(logit, left_idx, right_idx))

    return boxes, logits.max(dim=1)[0], phrases


In [11]:
import time
from utils.protocols import GDino
from utils.utils import convert_model_detection, get_file_name, get_gpu_name
from utils.video import read_video
from loguru import logger
from datetime import datetime

# Prompt handed to `predict` as the caption and stored on each experiment as `prompt`.
TEXT = "Face."
# Forwarded to `predict` as `box_threshold`. The spelling "TRESHOLD" is kept
# because other cells reference the constant by this exact name.
BOX_TRESHOLD = 0.35
# Forwarded to `predict` as `text_threshold`.
TEXT_TRESHOLD = 0.25
# Model label stored on each experiment, passed to `get_file_name`, and used in
# the exported CSV file name.
MODEL="grounding-dino-tiny-github"
# Passed to `get_file_name` when the experiment's record file is named.
BASE_DIR="experiments/gdino"

def run_model(frames):
    """Run `predict` on every frame using the notebook-level prompt and thresholds.

    Args:
        frames: Mapping from frame id to the image handed to `predict`.

    Returns:
        Dict mapping each frame id to the value `predict` returned for it,
        in the iteration order of ``frames``.
    """
    return {
        frame_id: predict(
            model=model,
            image=frame_tensor,
            caption=TEXT,
            box_threshold=BOX_TRESHOLD,
            text_threshold=TEXT_TRESHOLD,
            device=DEVICE
        )
        for frame_id, frame_tensor in frames.items()
    }

def process_video(video, frames=None):
    """Benchmark the model on one video and persist the run as a ``GDino`` record.

    Stages, each timed separately with ``time.time()``:
      1. pre-process every frame yielded by ``read_video`` and move it to ``DEVICE``;
      2. run ``run_model`` on the first three pre-processed frames;
      3. convert each raw prediction with ``convert_model_detection``.
    The timings and results are stored in a ``GDino`` experiment, which is
    saved and logged before being returned.

    Args:
        video: Video location passed to ``read_video`` and recorded as ``video_file``.
        frames: Optional value recorded on the experiment as ``frames``; any
            falsy value is stored as ``None``. It does not affect which frames
            are processed.

    Returns:
        The saved ``GDino`` experiment object.
    """
    start_time = datetime.now()

    # --- Stage 1: pre-processing -------------------------------------------
    frames_ = {}
    start = time.time()
    logger.info("Started Pre-processing")
    for frame_id, frame in read_video(video):
        frames_[frame_id] = preprocess_frame(frame).to(DEVICE)
    logger.info("Finished Pre-processing")
    pre_processing_time = time.time() - start

    # NOTE(review): every frame was pre-processed and timed above, but only the
    # first three are kept for inference, and `n_frames` reports that reduced
    # count. Confirm the hard-coded limit is intentional.
    frames_ = {k: frames_[k] for k in list(frames_.keys())[:3]}
    n_frames = len(frames_)

    # --- Stage 2: inference --------------------------------------------------
    start = time.time()
    outputs = run_model(frames_)
    inference_time = time.time() - start

    # --- Stage 3: post-processing --------------------------------------------
    results = {}
    start = time.time()
    for frame_id, output in outputs.items():
        boxes, logits, phrases = output
        results[frame_id] = convert_model_detection({"boxes": boxes, "labels": phrases, "scores": logits})
    post_processing_time = time.time() - start

    end_time = datetime.now()

    exp = GDino(
        model=MODEL,
        gpu=get_gpu_name(),
        video_file=video,
        frames=frames or None,
        n_frames=n_frames,

        pre_processing_time=pre_processing_time,
        inference_time=inference_time,
        post_processing_time=post_processing_time,
        # timedelta.seconds is an integer: the fractional part is dropped.
        video_processing_time=(end_time - start_time).seconds,

        start_time=start_time.isoformat(),
        end_time=end_time.isoformat(),
        # NOTE(review): `start` is the time.time() value taken at the beginning
        # of post-processing, not `start_time` — confirm which one
        # `get_file_name` expects.
        record_file=get_file_name(BASE_DIR, start, MODEL, video),

        data=results,
        prompt=TEXT,
    )
    exp.save()
    exp.log()
    # Release cached GPU memory between runs.
    torch.cuda.empty_cache()
    return exp

In [12]:
# Collect the experiment objects returned by `process_video`.
results = []
# Five rounds, alternating between the two clips so their runs are interleaved
# rather than grouped by video.
for i in range(5):
    vid_hd = process_video("data/720.mp4")
    results.append(vid_hd)
    vid_fhd = process_video("data/1080.mp4")
    results.append(vid_fhd)

AttributeError: module 'groundingdino.datasets.transforms' has no attribute 'to'

In [24]:
# The header is read from the first experiment only.
columns = results[0].columns
# One table row per experiment, in the order the runs were executed.
rows = [experiment.row for experiment in results]

In [25]:
import pandas as pd

# Assemble the per-run rows into one table and preview its first five rows.
df = pd.DataFrame(data=rows, columns=columns)
df.head(n=5)

Unnamed: 0,model,gpu,video_file,batch_size,n_frames,pre_processing_fps,inference_fps,post_processing_fps,video_fps,pre_processing_time,inference_time,post_processing_time,video_processing_time,start_time,end_time,record_file,prompt,data
0,grounding-dino-tiny-github,cpu,data/720.mp4,1,3,0.737605,0.307241,78643.2,0.230769,4.067215,9.764309,3.8e-05,13,2024-06-12T02:42:29.341121,2024-06-12T02:42:43.316071,experiments/gdino/exp-grounding-dino-tiny-gith...,Face.,"{0: [box=[0.4578925371170044, 0.27158764004707..."
1,grounding-dino-tiny-github,cpu,data/1080.mp4,1,3,0.519929,0.302522,51569.311475,0.2,5.770022,9.916628,5.8e-05,15,2024-06-12T02:42:43.319755,2024-06-12T02:42:59.156470,experiments/gdino/exp-grounding-dino-tiny-gith...,Face.,"{0: [box=[0.4580150544643402, 0.27503931522369..."
2,grounding-dino-tiny-github,cpu,data/720.mp4,1,3,0.781595,0.314452,73156.465116,0.230769,3.838304,9.540408,4.1e-05,13,2024-06-12T02:42:59.159440,2024-06-12T02:43:12.668020,experiments/gdino/exp-grounding-dino-tiny-gith...,Face.,"{0: [box=[0.4578925371170044, 0.27158764004707..."
3,grounding-dino-tiny-github,cpu,data/1080.mp4,1,3,0.5224,0.321466,75346.778443,0.2,5.742723,9.33226,4e-05,15,2024-06-12T02:43:12.674278,2024-06-12T02:43:27.876457,experiments/gdino/exp-grounding-dino-tiny-gith...,Face.,"{0: [box=[0.4580150544643402, 0.27503931522369..."
4,grounding-dino-tiny-github,cpu,data/720.mp4,1,3,0.772727,0.311089,83330.543046,0.230769,3.882355,9.643543,3.6e-05,13,2024-06-12T02:43:27.880153,2024-06-12T02:43:41.481468,experiments/gdino/exp-grounding-dino-tiny-gith...,Face.,"{0: [box=[0.4578925371170044, 0.27158764004707..."


In [26]:
from datetime import datetime
# Name the export after the model, the GPU and the current day/hour/minute
# (values are not zero-padded), then write the table there.
now = datetime.now()
csv_file = "-".join(
    str(part)
    for part in ("zz", MODEL, get_gpu_name(), now.day, now.hour, now.minute)
) + ".csv"
df.to_csv(path_or_buf=csv_file)