## Install Grounding DINO 🦕

In [1]:
# Remember the directory the notebook was started in; the following cells
# build the repository and weights paths relative to it.
import os
HOME = os.getcwd()

In [2]:
import os
HOME = os.getcwd()

%cd {HOME}
!git clone https://github.com/IDEA-Research/GroundingDINO.git
%cd {HOME}/GroundingDINO
!pip install -q -e .
!pip install -q roboflow

!mkdir {HOME}/weights
%cd {HOME}/weights

!wget -q https://github.com/IDEA-Research/GroundingDINO/releases/download/v0.1.0-alpha/groundingdino_swint_ogc.pth
!pip install -q loguru
!pip install torchvision

/home/ec2-user/SageMaker
fatal: destination path 'GroundingDINO' already exists and is not an empty directory.
/home/ec2-user/SageMaker/GroundingDINO
mkdir: cannot create directory ‘/home/ec2-user/SageMaker/weights’: File exists
/home/ec2-user/SageMaker/weights


In [3]:
!pip install opencv-python



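## Restart the notebook kernel at this point

Restart the kernel so the packages installed above are picked up, then continue from the next cell.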

In [1]:
import os

# Working directory of the freshly restarted kernel; the GroundingDINO
# checkout and the weights folder are expected directly beneath it.
HOME = os.getcwd()

# Model config file (inside the checkout) and the checkpoint file.
WEIGHTS_NAME = "groundingdino_swint_ogc.pth"
CONFIG_PATH = os.path.join(HOME, "GroundingDINO/groundingdino/config/GroundingDINO_SwinT_OGC.py")
WEIGHTS_PATH = os.path.join(HOME, "weights", WEIGHTS_NAME)

# Show each path together with whether the file is actually present.
print(CONFIG_PATH, "; exist:", os.path.isfile(CONFIG_PATH))
print(WEIGHTS_PATH, "; exist:", os.path.isfile(WEIGHTS_PATH))

/home/ec2-user/SageMaker/GroundingDINO/groundingdino/config/GroundingDINO_SwinT_OGC.py ; exist: True
/home/ec2-user/SageMaker/weights/groundingdino_swint_ogc.pth ; exist: True


In [4]:
import torch

# Use the GPU when CUDA is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    DEVICE = "cuda"
else:
    DEVICE = "cpu"
DEVICE

'cuda'

In [5]:
# Inference helpers from the groundingdino package. Only load_model and
# predict are used by the cells visible in this notebook.
from groundingdino.util.inference import load_model, load_image, predict, annotate

# Load the model from the config and checkpoint paths defined above, for DEVICE.
model = load_model(CONFIG_PATH, WEIGHTS_PATH, DEVICE)



final text_encoder_type: bert-base-uncased


In [6]:
# Put the module in evaluation mode. eval() returns the module itself, so as
# the last expression of the cell its full repr is displayed.
model.eval()

GroundingDINO(
  (transformer): Transformer(
    (encoder): TransformerEncoder(
      (layers): ModuleList(
        (0-5): 6 x DeformableTransformerEncoderLayer(
          (self_attn): MultiScaleDeformableAttention(
            (sampling_offsets): Linear(in_features=256, out_features=256, bias=True)
            (attention_weights): Linear(in_features=256, out_features=128, bias=True)
            (value_proj): Linear(in_features=256, out_features=256, bias=True)
            (output_proj): Linear(in_features=256, out_features=256, bias=True)
          )
          (dropout1): Dropout(p=0.0, inplace=False)
          (norm1): LayerNorm((256,), eps=1e-05, elementwise_affine=True)
          (linear1): Linear(in_features=256, out_features=2048, bias=True)
          (dropout2): Dropout(p=0.0, inplace=False)
          (linear2): Linear(in_features=2048, out_features=256, bias=True)
          (dropout3): Dropout(p=0.0, inplace=False)
          (norm2): LayerNorm((256,), eps=1e-05, elementwise_aff

In [7]:
import numpy as np
import torch
from PIL import Image

import groundingdino.datasets.transforms as T

def preprocess_frame(frame: np.ndarray) -> torch.Tensor:
    """Convert one video frame into the tensor that is passed to ``predict``.

    The frame is cast to ``uint8``, wrapped in a PIL image and sent through a
    resize -> to-tensor -> normalize pipeline built from GroundingDINO's
    dataset transforms.

    Args:
        frame: Image data as a NumPy array.
            NOTE(review): presumably H x W x 3 in RGB order. If ``read_video``
            yields OpenCV-style BGR frames the channels would need swapping
            before this call -- confirm against the reader.

    Returns:
        The transformed image, i.e. the first element of the transform output.
    """
    # Only one candidate size (800) is given to RandomResize, with the longer
    # side capped via max_size=1333; Normalize receives per-channel constants
    # (these look like the usual ImageNet mean/std values).
    transform = T.Compose(
        [
            T.RandomResize([800], max_size=1333),
            T.ToTensor(),
            T.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225]),
        ]
    )
    image_source = Image.fromarray(frame.astype("uint8"))
    # The transform is called with (image, target); there is no target here,
    # so None is passed and the second return value is discarded.
    image_transformed, _ = transform(image_source, None)
    return image_transformed

In [15]:
import time
from utils.protocols import GDino
from utils.utils import convert_model_detection, get_file_name, get_gpu_name
from utils.video import read_video
from loguru import logger
from datetime import datetime

# Text prompt handed to predict() as the caption.
TEXT = "Face."
# Thresholds forwarded to predict() as box_threshold / text_threshold.
# The "TRESHOLD" spelling is kept because process_video refers to these names.
BOX_TRESHOLD = 0.35
TEXT_TRESHOLD = 0.25
# Model label stored in every experiment record and used in result file names.
MODEL="grounding-dino-tiny-github"
# Base directory passed to get_file_name() for experiment outputs.
BASE_DIR="experiments/gdino"

def process_video(video, frames=None):
    """Run GroundingDINO over a video and record the run as a ``GDino`` experiment.

    Frames yielded by ``read_video(video)`` are preprocessed and passed to
    ``predict`` with the module-level prompt and thresholds. The converted
    detections are stored per frame id. The wall-clock duration of the loop
    is used for the throughput figure. The experiment is saved, logged and
    returned.

    Args:
        video: Video path, passed to ``read_video`` and stored on the record.
        frames: Optional collection of frame ids to restrict inference to.
            When ``None`` or empty, every yielded frame is processed.

    Returns:
        The populated ``GDino`` experiment object.
    """
    # The default used to be a mutable ``[]``; ``None`` avoids sharing one list
    # object between calls. Both are falsy, so existing callers are unaffected.
    selected = set(frames) if frames else None

    start_time = datetime.now().isoformat()
    start = time.time()

    results = {}
    for frame_id, frame in read_video(video):
        # Honour the requested subset so that the reported frame count and fps
        # describe the frames that actually went through the model.
        if selected is not None and frame_id not in selected:
            continue
        boxes, logits, phrases = predict(
            model=model,
            image=preprocess_frame(frame),
            caption=TEXT,
            box_threshold=BOX_TRESHOLD,
            text_threshold=TEXT_TRESHOLD,
            device=DEVICE
        )
        results[frame_id] = convert_model_detection({"boxes": boxes, "labels": phrases, "scores": logits})

    end = time.time()
    end_time = datetime.now().isoformat()

    # Count what was processed instead of reading the loop variable: this is
    # well defined (0) for a video that yields no frames, where the previous
    # ``frame_id + 1`` raised a NameError.
    n_frames = len(results)
    elapsed = end - start

    exp = GDino(
        model=MODEL,
        gpu=get_gpu_name(),
        # ``start`` (epoch seconds) is also used to build the result file name.
        filename=get_file_name(BASE_DIR, start, MODEL, video),
        file=video,
        prompt=TEXT,
        frames=frames if frames else None,
        n_frames=n_frames,
        processing_time=elapsed,
        # Guard against a zero-length interval on an empty or instant run.
        fps=n_frames / elapsed if elapsed > 0 else 0.0,
        data=results,
        start_time=start_time,
        end_time=end_time
    )
    exp.save()
    exp.log()
    # Hand cached, unused GPU memory back between benchmark runs.
    torch.cuda.empty_cache()
    return exp

In [16]:
# Benchmark: five rounds, each processing the 720p clip and then the 1080p
# clip, collecting one experiment record per run in call order.
results = []
for round_idx in range(5):
    for clip in ("data/720.mp4", "data/1080.mp4"):
        results.append(process_video(clip))

[32m2024-06-11 10:54:17.760[0m | [1mINFO    [0m | [36mutils.protocols[0m:[36mlog[0m:[36m88[0m - [1mdata/720.mp4 | frames=283 | delta=54.0480523109436 | fps=5.236081373883262[0m
[32m2024-06-11 10:55:16.183[0m | [1mINFO    [0m | [36mutils.protocols[0m:[36mlog[0m:[36m88[0m - [1mdata/1080.mp4 | frames=283 | delta=58.413251876831055 | fps=4.844791051810089[0m
[32m2024-06-11 10:56:11.900[0m | [1mINFO    [0m | [36mutils.protocols[0m:[36mlog[0m:[36m88[0m - [1mdata/720.mp4 | frames=283 | delta=55.70707559585571 | fps=5.080144613102856[0m
[32m2024-06-11 10:57:09.442[0m | [1mINFO    [0m | [36mutils.protocols[0m:[36mlog[0m:[36m88[0m - [1mdata/1080.mp4 | frames=283 | delta=57.531291246414185 | fps=4.9190621984803595[0m
[32m2024-06-11 10:58:04.724[0m | [1mINFO    [0m | [36mutils.protocols[0m:[36mlog[0m:[36m88[0m - [1mdata/720.mp4 | frames=283 | delta=55.27209115028381 | fps=5.120124715935356[0m
[32m2024-06-11 10:59:04.258[0m | [1mINFO   

In [17]:
# Column names are taken from the first experiment; every experiment then
# contributes one row.
# NOTE(review): assumes all results expose the same columns, ordered like the
# values in `row` -- confirm in utils.protocols.
columns = results[0].columns
rows = [result.row for result in results]

In [18]:
import pandas as pd

# One row per benchmark run, labelled with the experiment column names.
df = pd.DataFrame(rows, columns=columns)
# Preview the first five rows.
df.head()

Unnamed: 0,model,gpu,file,batch_size,n_frames,processing_time,fps,start_time,end_time,result_file,prompt,data
0,grounding-dino-tiny-github,Tesla V100-SXM2-16GB,data/720.mp4,1,283,54.048052,5.236081,2024-06-11T10:53:23.702729,2024-06-11T10:54:17.750796,experiments/gdino/17181032037027397-grounding-...,Face.,"{0: [box=[0.45709970593452454, 0.2791376411914..."
1,grounding-dino-tiny-github,Tesla V100-SXM2-16GB,data/1080.mp4,1,283,58.413252,4.844791,2024-06-11T10:54:17.761880,2024-06-11T10:55:16.175152,experiments/gdino/17181032577618957-grounding-...,Face.,"{0: [box=[0.3344019651412964, 0.18687993288040..."
2,grounding-dino-tiny-github,Tesla V100-SXM2-16GB,data/720.mp4,1,283,55.707076,5.080145,2024-06-11T10:55:16.185075,2024-06-11T10:56:11.892169,experiments/gdino/17181033161850908-grounding-...,Face.,"{0: [box=[0.45709970593452454, 0.2791376411914..."
3,grounding-dino-tiny-github,Tesla V100-SXM2-16GB,data/1080.mp4,1,283,57.531291,4.919062,2024-06-11T10:56:11.902434,2024-06-11T10:57:09.433739,experiments/gdino/17181033719024425-grounding-...,Face.,"{0: [box=[0.3344019651412964, 0.18687993288040..."
4,grounding-dino-tiny-github,Tesla V100-SXM2-16GB,data/720.mp4,1,283,55.272091,5.120125,2024-06-11T10:57:09.443876,2024-06-11T10:58:04.715988,experiments/gdino/17181034294438922-grounding-...,Face.,"{0: [box=[0.45709970593452454, 0.2791376411914..."


In [19]:
from datetime import datetime

now = datetime.now()
# Make the GPU name file-name friendly by turning spaces into underscores.
# Computed outside the f-string: reusing double quotes inside a replacement
# field is a SyntaxError before Python 3.12.
gpu_tag = get_gpu_name().replace(" ", "_")
# `now.minute` is the minute of the timestamp; `now.min` would be the class
# constant datetime.min (0001-01-01 00:00:00), not the current minute.
csv_file = f"__{MODEL}-{gpu_tag}-{now.day}-{now.hour}-{now.minute}.csv"
df.to_csv(csv_file)