In [1]:
!pip install -q huggingface
!pip install -q transformers
!pip install -q pillow
!pip install -q loguru
!pip install -q pydantic

In [1]:
import torch
from PIL import Image
from transformers import AutoProcessor, AutoModelForZeroShotObjectDetection 

In [2]:
# Hugging Face Hub checkpoint used for zero-shot detection.
model_id = "IDEA-Research/grounding-dino-tiny"

# Default to the CPU and upgrade to the GPU only when PyTorch can see one.
device = "cpu"
if torch.cuda.is_available():
    device = "cuda"

device

'cuda'

In [3]:
# Load the processor and the detection model for `model_id`, then move the
# model onto the selected device.
processor = AutoProcessor.from_pretrained(model_id)
model = AutoModelForZeroShotObjectDetection.from_pretrained(model_id)
model = model.to(device)

In [4]:
# Put the model in evaluation mode; the returned module's repr is the cell output.
model.eval()

GroundingDinoForObjectDetection(
  (model): GroundingDinoModel(
    (backbone): GroundingDinoConvModel(
      (conv_encoder): GroundingDinoConvEncoder(
        (model): SwinBackbone(
          (embeddings): SwinEmbeddings(
            (patch_embeddings): SwinPatchEmbeddings(
              (projection): Conv2d(3, 96, kernel_size=(4, 4), stride=(4, 4))
            )
            (norm): LayerNorm((96,), eps=1e-05, elementwise_affine=True)
            (dropout): Dropout(p=0.0, inplace=False)
          )
          (encoder): SwinEncoder(
            (layers): ModuleList(
              (0): SwinStage(
                (blocks): ModuleList(
                  (0-1): 2 x SwinLayer(
                    (layernorm_before): LayerNorm((96,), eps=1e-05, elementwise_affine=True)
                    (attention): SwinAttention(
                      (self): SwinSelfAttention(
                        (query): Linear(in_features=96, out_features=96, bias=True)
                        (key): Linear(in_feat

In [5]:
# Display the device the loaded model currently sits on.
model.device

device(type='cuda', index=0)

In [8]:

import time
import torch
from utils.protocols import GDino
from utils.utils import convert_model_detection
from utils.video import read_video
from utils.utils import get_file_name, get_gpu_name
from loguru import logger
from datetime import datetime
# Model identifier recorded on every experiment and passed to get_file_name
# and to the CSV export name.
# NOTE(review): "groungding" looks like a typo for "grounding"; the string ends
# up in saved file names, so confirm before renaming.
MODEL= "groungding-dino-tiny-hf"

# Text prompt handed to the processor for every frame.
text = "Face."
# Passed as the first argument of get_file_name when naming the result file.
BASE_DIR = "experiments/gdino"

def process_video(video, frames=None):
    """Run the detector on the frames of ``video`` and record the benchmark.

    Every frame yielded by ``read_video`` is converted to a PIL image and run
    through the module-level ``processor`` and ``model`` with the prompt
    ``text``. The detections are stored per frame id. The timings and
    detections are wrapped in a ``GDino`` record, which is saved and logged
    before being returned.

    Args:
        video: Path of the video, passed to ``read_video`` and stored on the
            record.
        frames: Optional collection stored on the record. When non-empty its
            length is reported as ``n_frames``.
            NOTE(review): it does not restrict which frames are processed
            here -- confirm whether ``read_video`` is meant to receive it.

    Returns:
        The saved ``GDino`` experiment record.
    """
    start_time = datetime.now().isoformat()
    start = time.time()

    results = {}
    # Sentinel: if the video yields no frames, n_frames becomes 0 instead of
    # the loop variable being unbound further down.
    frame_id = -1
    for frame_id, frame in read_video(video):
        image = Image.fromarray(frame.astype("uint8"))
        inputs = processor(images=image, text=text, return_tensors="pt").to(device)
        with torch.no_grad():
            outputs = model(**inputs)
        detections = processor.post_process_grounded_object_detection(
            outputs,
            inputs.input_ids,
            box_threshold=0.4,
            text_threshold=0.3,
            # PIL reports (width, height); reverse it to (height, width).
            target_sizes=[image.size[::-1]]
        )
        results[frame_id] = convert_model_detection(detections[0])

    # CUDA work is queued asynchronously; wait for it so the wall-clock
    # measurement includes all of the GPU work.
    if torch.cuda.is_available() and torch.cuda.is_initialized():
        torch.cuda.synchronize()
    end = time.time()
    end_time = datetime.now().isoformat()
    elapsed = end - start

    # Assumes read_video yields 0-based consecutive frame ids -- TODO confirm.
    n_frames = frame_id + 1 if not frames else len(frames)
    if frame_id < 0:
        logger.warning("No frames were read from {}", video)

    exp = GDino(
        model=MODEL,
        gpu=get_gpu_name(),
        file=video,
        prompt=text,
        frames=None if not frames else frames,
        n_frames=n_frames,
        processing_time=elapsed,
        # Guard against a zero-length interval on coarse clocks.
        fps=n_frames / elapsed if elapsed > 0 else 0.0,
        data=results,
        start_time=start_time,
        end_time=end_time,
        filename=get_file_name(BASE_DIR, start, MODEL, video)
    )
    exp.save()
    exp.log()
    # Release cached GPU memory between runs.
    torch.cuda.empty_cache()
    return exp

In [9]:
# Five passes; each pass processes the 720p clip and then the 1080p clip,
# collecting one experiment record per run in execution order.
results = []
for _pass in range(5):
    for video_path in ("data/720.mp4", "data/1080.mp4"):
        results.append(process_video(video_path))

[32m2024-06-11 10:17:10.208[0m | [1mINFO    [0m | [36mutils.protocols[0m:[36mlog[0m:[36m88[0m - [1mdata/720.mp4 | frames=283 | delta=85.01531672477722 | fps=3.3288119235756644[0m
[32m2024-06-11 10:18:40.274[0m | [1mINFO    [0m | [36mutils.protocols[0m:[36mlog[0m:[36m88[0m - [1mdata/1080.mp4 | frames=283 | delta=90.03738784790039 | fps=3.143138720084485[0m


In [10]:
# One row per experiment record; the column names come from the first record.
rows = [experiment.row for experiment in results]
columns = results[0].columns

In [11]:
import pandas as pd

# Assemble the per-run rows into a single table.
df = pd.DataFrame(data=rows, columns=columns)

In [12]:
# Preview the first rows of the benchmark table.
df.head()

Unnamed: 0,model,gpu,file,batch_size,n_frames,processing_time,fps,start_time,end_time,result_file,prompt,data
0,groungding-dino-tiny-hf,Tesla V100-SXM2-16GB,data/720.mp4,1,283,85.015317,3.328812,2024-06-11T10:15:45.192772,2024-06-11T10:17:10.208104,experiments/gdino/17181009451927836-groungding...,Face.,"{0: [box=[566.5269165039062, 154.7469024658203..."
1,groungding-dino-tiny-hf,Tesla V100-SXM2-16GB,data/1080.mp4,1,283,90.037388,3.143139,2024-06-11T10:17:10.236489,2024-06-11T10:18:40.273898,experiments/gdino/17181010302365065-groungding...,Face.,"{0: [box=[850.1380004882812, 224.1177520751953..."


In [None]:
from datetime import datetime
# Name the export after the model, the GPU, and the day/hour/minute of the run.
now = datetime.now()
gpu_tag = get_gpu_name().replace(' ', '_')
# Use `now.minute` (minute of the hour). `now.min` is the class constant
# datetime.min, which put "0001-01-01 00:00:00" into every file name.
csv_file = f"__{MODEL}-{gpu_tag}-{now.day}-{now.hour}-{now.minute}.csv"
df.to_csv(csv_file)