## Install Grounding DINO 🦕

In [1]:
import os
# Capture the current working directory; the install cell builds its clone
# and weights paths from HOME.
HOME = os.getcwd()

In [2]:
import os
HOME = os.getcwd()

%cd {HOME}
!git clone https://github.com/IDEA-Research/GroundingDINO.git
%cd {HOME}/GroundingDINO
!pip install -q -e .
!pip install -q roboflow

!mkdir {HOME}/weights
%cd {HOME}/weights

!wget -q https://github.com/IDEA-Research/GroundingDINO/releases/download/v0.1.0-alpha/groundingdino_swint_ogc.pth
!pip install -q loguru
!pip install torchvision

/home/ec2-user/SageMaker/cv-benchmark
fatal: destination path 'GroundingDINO' already exists and is not an empty directory.
/home/ec2-user/SageMaker/cv-benchmark/GroundingDINO
mkdir: cannot create directory ‘/home/ec2-user/SageMaker/cv-benchmark/weights’: File exists
/home/ec2-user/SageMaker/cv-benchmark/weights


In [3]:
!python -m pip install -q pillow
!python -m pip install -q loguru
!python -m pip install -q pydantic
!python -m pip install -q pandas

## Restart the notebook kernel at this point (so the packages installed above are picked up)

In [3]:
import os

HOME = os.getcwd()

# Model config inside the cloned GroundingDINO checkout and the downloaded
# checkpoint inside the weights directory, both resolved relative to HOME.
CONFIG_PATH = os.path.join(HOME, "GroundingDINO/groundingdino/config/GroundingDINO_SwinT_OGC.py")
WEIGHTS_NAME = "groundingdino_swint_ogc.pth"
WEIGHTS_PATH = os.path.join(HOME, "weights", WEIGHTS_NAME)

# Report whether each required file is actually present.
for _path in (CONFIG_PATH, WEIGHTS_PATH):
    print(_path, "; exist:", os.path.isfile(_path))

/home/ec2-user/SageMaker/cv-benchmark/GroundingDINO/groundingdino/config/GroundingDINO_SwinT_OGC.py ; exist: True
/home/ec2-user/SageMaker/cv-benchmark/weights/groundingdino_swint_ogc.pth ; exist: True


In [4]:
import torch

# Switch on the allow_tf32 flags of the CUDA matmul and cuDNN backends.
torch.backends.cuda.matmul.allow_tf32 = True
torch.backends.cudnn.allow_tf32 = True

# Use the GPU when torch reports one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    DEVICE = "cuda"
else:
    DEVICE = "cpu"
DEVICE

'cuda'

In [5]:
from groundingdino.util.inference import Model, predict, preprocess_caption, load_model
from groundingdino.util.utils import get_phrases_from_posmap

# Build the model from the config and checkpoint, then place it on DEVICE.
model = load_model(CONFIG_PATH, WEIGHTS_PATH, DEVICE).to(DEVICE)



final text_encoder_type: bert-base-uncased


In [6]:
import numpy as np
import torch
from PIL import Image

import groundingdino.datasets.transforms as T

def preprocess_frame(frame: np.ndarray) -> torch.Tensor:
    """Convert one video frame into the tensor that is fed to the model.

    The frame is cast to ``uint8``, wrapped in a PIL image and passed through
    a resize -> ``ToTensor`` -> ``Normalize`` pipeline.

    Args:
        frame: Image array; it must be convertible by ``PIL.Image.fromarray``
            once cast to ``uint8``.

    Returns:
        The transformed image. The second value returned by the transform
        pipeline is discarded.
    """
    transform = T.Compose(
        [
            T.RandomResize([800], max_size=1333),
            T.ToTensor(),
            T.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225]),
        ]
    )
    image_source = Image.fromarray(frame.astype("uint8"))
    # The pipeline is called with an (image, target) pair; there is no target here.
    image_transformed, _ = transform(image_source, None)
    return image_transformed

In [7]:
from typing import List, Tuple
import bisect

def predict(
        model,
        image: torch.Tensor,
        caption: str,
        box_threshold: float,
        text_threshold: float,
        device: str = "cuda",
        remove_combined: bool = False
) -> Tuple[torch.Tensor, torch.Tensor, List[str]]:
    """Run the model on a single image and return the detections above threshold.

    Args:
        model: Callable model exposing a ``tokenizer`` attribute; it is invoked
            as ``model(batch, captions=[caption])``.
        image: Preprocessed image tensor; a leading batch axis is added before
            the forward pass.
        caption: Text prompt; it is normalised with ``preprocess_caption``.
        box_threshold: A query is kept when its highest token score exceeds
            this value.
        text_threshold: Per-token score cut-off used when extracting the phrase
            of a kept query.
        device: Not used by this implementation; the image is passed to the
            model as-is, without being moved.
        remove_combined: When true, the phrase of each query is restricted to
            the span between the two separator tokens surrounding its
            best-scoring token.

    Returns:
        ``(boxes, scores, phrases)`` for the kept queries, where ``scores`` is
        the highest token score of each query.
    """
    caption = preprocess_caption(caption=caption)

    with torch.inference_mode():
        outputs = model(image[None], captions=[caption])

    # Only the first (and only) batch element is used from here on.
    query_scores = outputs["pred_logits"].cpu().sigmoid()[0]  # query_scores.shape = (nq, 256)
    query_boxes = outputs["pred_boxes"].cpu()[0]  # query_boxes.shape = (nq, 4)

    keep = query_scores.max(dim=1)[0] > box_threshold
    logits = query_scores[keep]  # logits.shape = (n, 256)
    boxes = query_boxes[keep]  # boxes.shape = (n, 4)

    tokenizer = model.tokenizer
    tokenized = tokenizer(caption)

    def _phrase(logit, *span):
        # Phrase made of the tokens scoring above text_threshold, with dots removed.
        return get_phrases_from_posmap(logit > text_threshold, tokenized, tokenizer, *span).replace('.', '')

    if not remove_combined:
        phrases = [_phrase(logit) for logit in logits]
    else:
        # Positions of separator tokens in the prompt.
        # NOTE(review): 101 / 102 / 1012 are presumably the [CLS], [SEP] and "."
        # ids of the bert-base-uncased tokenizer -- confirm.
        input_ids = tokenized['input_ids']
        sep_idx = [pos for pos, token_id in enumerate(input_ids) if token_id in [101, 102, 1012]]

        phrases = []
        for logit in logits:
            # Locate the pair of separators enclosing the best-scoring token.
            insert_idx = bisect.bisect_left(sep_idx, logit.argmax())
            right_idx = sep_idx[insert_idx]
            left_idx = sep_idx[insert_idx - 1]
            phrases.append(_phrase(logit, left_idx, right_idx))

    return boxes, logits.max(dim=1)[0], phrases


In [8]:
import time
from utils.protocols import GDino
from utils.utils import convert_model_detection, get_file_name, get_gpu_name
from utils.video import read_video
from loguru import logger
from datetime import datetime

# Prompt and score thresholds handed to predict() by run_model().
TEXT = "Face."
BOX_TRESHOLD, TEXT_TRESHOLD = 0.35, 0.25

# Model label and output directory recorded with every experiment.
MODEL = "grounding-dino-tiny-github"
BASE_DIR = "experiments/gdino"

def run_model(frames):
    """Run ``predict`` on every preprocessed frame.

    Args:
        frames: Mapping of frame id to preprocessed image tensor.

    Returns:
        Dict mapping each frame id to the value returned by ``predict`` for
        that frame, using the notebook-level prompt, thresholds and device.
    """
    return {
        frame_id: predict(
            model=model,
            image=frame,
            caption=TEXT,
            box_threshold=BOX_TRESHOLD,
            text_threshold=TEXT_TRESHOLD,
            device=DEVICE,
        )
        for frame_id, frame in frames.items()
    }

def process_video(video, frames=None):
    """Benchmark the model on one video and persist the measurements.

    Three phases are timed separately: every frame yielded by ``read_video``
    is preprocessed and moved to ``DEVICE``; ``run_model`` is applied to all
    preprocessed frames; the raw predictions are converted with
    ``convert_model_detection``. Timings and detections are stored in a
    ``GDino`` record, which is saved, logged and returned.

    Args:
        video: Video reference forwarded to ``read_video`` and recorded as
            ``video_file``.
        frames: Optional value recorded in the ``frames`` field of the record;
            any falsy value is stored as ``None``. It does not restrict which
            frames are processed.

    Returns:
        The ``GDino`` experiment record.
    """
    start_time = datetime.now()

    # Phase 1: preprocessing. All frames are kept on DEVICE at once.
    start = time.time()
    logger.info("Start Pre")
    frames_ = {
        frame_id: preprocess_frame(frame).to(DEVICE)
        for frame_id, frame in read_video(video)
    }
    logger.info("Finished Pre")
    pre_processing_time = time.time() - start

    n_frames = len(frames_)

    # Phase 2: inference.
    start = time.time()
    outputs = run_model(frames_)
    inference_time = time.time() - start

    # Phase 3: convert the raw predictions of every frame.
    start = time.time()
    results = {
        frame_id: convert_model_detection({"boxes": boxes, "labels": phrases, "scores": logits})
        for frame_id, (boxes, logits, phrases) in outputs.items()
    }
    post_processing_time = time.time() - start

    end_time = datetime.now()

    exp = GDino(
        model=MODEL,
        gpu=get_gpu_name(),
        video_file=video,
        frames=frames or None,
        n_frames=n_frames,

        pre_processing_time=pre_processing_time,
        inference_time=inference_time,
        post_processing_time=post_processing_time,
        # total_seconds() covers the whole elapsed span; the `.seconds`
        # attribute alone wraps around after 24 hours.
        video_processing_time=int((end_time - start_time).total_seconds()),

        start_time=start_time.isoformat(),
        end_time=end_time.isoformat(),
        # NOTE(review): `start` is the time.time() stamp taken right before
        # post-processing, not `start_time` -- confirm which one
        # get_file_name expects.
        record_file=get_file_name(BASE_DIR, start, MODEL, video),

        data=results,
        prompt=TEXT,
    )
    exp.save()
    exp.log()
    # Release cached GPU memory before the next video is processed.
    torch.cuda.empty_cache()
    return exp

In [9]:
# Benchmark plan: every video is processed once per repetition, in the listed order.
N_RUNS = 5
VIDEO_FILES = ("data/720.mp4", "data/1080.mp4")

results = []
for _ in range(N_RUNS):
    for video_file in VIDEO_FILES:
        results.append(process_video(video_file))

[32m2024-06-12 09:01:58.072[0m | [1mINFO    [0m | [36m__main__[0m:[36mprocess_video[0m:[36m33[0m - [1mStart Pre[0m
[32m2024-06-12 09:02:06.471[0m | [1mINFO    [0m | [36m__main__[0m:[36mprocess_video[0m:[36m39[0m - [1mFinished Pre[0m
[32m2024-06-12 09:02:49.565[0m | [1mINFO    [0m | [36mutils.protocols[0m:[36mlog[0m:[36m113[0m - [1mdata/720.mp4 | frames=283 | model_fps=6.569246681944922 | inference_time=43.07952094078064 | preprocess_time=8.400093793869019[0m
[32m2024-06-12 09:02:49.598[0m | [1mINFO    [0m | [36m__main__[0m:[36mprocess_video[0m:[36m33[0m - [1mStart Pre[0m
[32m2024-06-12 09:03:00.806[0m | [1mINFO    [0m | [36m__main__[0m:[36mprocess_video[0m:[36m39[0m - [1mFinished Pre[0m
[32m2024-06-12 09:03:42.740[0m | [1mINFO    [0m | [36mutils.protocols[0m:[36mlog[0m:[36m113[0m - [1mdata/1080.mp4 | frames=283 | model_fps=6.75090768916517 | inference_time=41.92028880119324 | preprocess_time=11.20978331565857[0m
[

In [10]:
# Column names are taken from the first experiment; every experiment supplies one row.
columns = results[0].columns
rows = [experiment.row for experiment in results]

In [11]:
import pandas as pd

# One table with a row per benchmark run; preview the first rows.
df = pd.DataFrame(data=rows, columns=columns)
df.head()

Unnamed: 0,model,gpu,video_file,batch_size,n_frames,pre_processing_fps,inference_fps,post_processing_fps,video_fps,pre_processing_time,inference_time,post_processing_time,video_processing_time,start_time,end_time,record_file,prompt,data
0,grounding-dino-tiny-github,Tesla_V100-SXM2-16GB,data/720.mp4,1,283,33.6901,6.569247,66297.365505,5.54902,8.400094,43.079521,0.004269,51,2024-06-12T09:01:58.072672,2024-06-12T09:02:49.556571,experiments/gdino/exp-grounding-dino-tiny-gith...,Face.,"{0: [box=[0.45709970593452454, 0.2791376411914..."
1,grounding-dino-tiny-github,Tesla_V100-SXM2-16GB,data/1080.mp4,1,283,25.245805,6.750908,70528.106477,5.339623,11.209783,41.920289,0.004013,53,2024-06-12T09:02:49.598260,2024-06-12T09:03:42.732364,experiments/gdino/exp-grounding-dino-tiny-gith...,Face.,"{0: [box=[0.3344019651412964, 0.18687993288040..."
2,grounding-dino-tiny-github,Tesla_V100-SXM2-16GB,data/720.mp4,1,283,36.864189,6.744473,67488.516716,5.77551,7.676827,41.960282,0.004193,49,2024-06-12T09:03:42.773802,2024-06-12T09:04:32.415122,experiments/gdino/exp-grounding-dino-tiny-gith...,Face.,"{0: [box=[0.45709970593452454, 0.2791376411914..."
3,grounding-dino-tiny-github,Tesla_V100-SXM2-16GB,data/1080.mp4,1,283,25.63833,6.771375,69373.935243,5.442308,11.03816,41.793577,0.004079,52,2024-06-12T09:04:32.456593,2024-06-12T09:05:25.292421,experiments/gdino/exp-grounding-dino-tiny-gith...,Face.,"{0: [box=[0.3344019651412964, 0.18687993288040..."
4,grounding-dino-tiny-github,Tesla_V100-SXM2-16GB,data/720.mp4,1,283,36.864156,6.773358,69536.498653,5.77551,7.676834,41.781344,0.00407,49,2024-06-12T09:05:25.333876,2024-06-12T09:06:14.796146,experiments/gdino/exp-grounding-dino-tiny-gith...,Face.,"{0: [box=[0.45709970593452454, 0.2791376411914..."


In [12]:
from datetime import datetime
# Export the table to a CSV named after the model, the GPU and the current
# day / hour / minute.
now = datetime.now()
csv_file = "zz-{}-{}-{}-{}-{}.csv".format(MODEL, get_gpu_name(), now.day, now.hour, now.minute)
df.to_csv(csv_file)