In [1]:
!pip install -q transformers
!pip install -q pillow
!pip install -q loguru
!pip install -q pydantic
!pip install -q opencv-python

In [2]:
import torch
from PIL import Image
from transformers import AutoProcessor, AutoModelForZeroShotObjectDetection 

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
device = "cuda" if torch.cuda.is_available() else "cpu"
device

'cpu'

In [4]:
model_id = "IDEA-Research/grounding-dino-tiny"

processor = AutoProcessor.from_pretrained(model_id)
model = AutoModelForZeroShotObjectDetection.from_pretrained(model_id).to(device)

In [5]:
model.eval()

GroundingDinoForObjectDetection(
  (model): GroundingDinoModel(
    (backbone): GroundingDinoConvModel(
      (conv_encoder): GroundingDinoConvEncoder(
        (model): SwinBackbone(
          (embeddings): SwinEmbeddings(
            (patch_embeddings): SwinPatchEmbeddings(
              (projection): Conv2d(3, 96, kernel_size=(4, 4), stride=(4, 4))
            )
            (norm): LayerNorm((96,), eps=1e-05, elementwise_affine=True)
            (dropout): Dropout(p=0.0, inplace=False)
          )
          (encoder): SwinEncoder(
            (layers): ModuleList(
              (0): SwinStage(
                (blocks): ModuleList(
                  (0-1): 2 x SwinLayer(
                    (layernorm_before): LayerNorm((96,), eps=1e-05, elementwise_affine=True)
                    (attention): SwinAttention(
                      (self): SwinSelfAttention(
                        (query): Linear(in_features=96, out_features=96, bias=True)
                        (key): Linear(in_feat

In [7]:
model.device

device(type='cpu')

In [12]:
import time
import torch
from utils.protocols import GDino
from utils.utils import convert_model_detection
from utils.video import read_video
from utils.utils import get_file_name, get_gpu_name
from loguru import logger
from datetime import datetime
MODEL= "groungding-dino-tiny-hf"

text = "Face."
BASE_DIR = "experiments/gdino"

def run_model(frames):
    results = {}
    for frame_id, frame in frames.items():
        with torch.no_grad():
            outputs = model(**frame)
            results[frame_id] = outputs
    return results

def process_video(video, frames=[]):
    start_time = datetime.now()
    target_sizes = None
    
    frames_ = {}
    start = time.time()
    for frame_id, frame in read_video(video):
        image = Image.fromarray(frame.astype("uint8"))
        inputs = processor(images=image, text=text, return_tensors="pt").to(device)
        frames_[frame_id] = inputs
        if  target_sizes is None:
            target_sizes = torch.Tensor([image.size[::-1]])
    pre_processing_time = time.time() - start
    
    frames_ = {k: frames_[k] for k in list(frames_.keys())[:3]}
    n_frames = len(frames_)
    
    start = time.time()
    outputs = run_model(frames_)
    inference_time = time.time() - start
    
    results = {}
    start = time.time()
    
    for frame_id, output in outputs.items():    
        detections = processor.post_process_grounded_object_detection(
            output,
            frames_[frame_id].input_ids,
            box_threshold=0.4,
            text_threshold=0.3,
            target_sizes=[image.size[::-1]]
        )
        results[frame_id] = convert_model_detection(detections[0])
    post_processing_time = time.time() - start
    
    end_time = datetime.now()
    
    exp =  GDino(
        model=MODEL,
        gpu=get_gpu_name(),
        video_file=video,
        frames=None if not frames else frames,
        n_frames=n_frames,
        
        pre_processing_time=pre_processing_time,
        inference_time=inference_time,
        post_processing_time=post_processing_time,
        video_processing_time=(end_time-start_time).seconds,
       
        start_time=start_time.isoformat(),
        end_time=end_time.isoformat(),
        record_file=get_file_name(BASE_DIR, start, MODEL, video),
        
        data=results,
        prompt=text,
    )
    exp.save()
    exp.log()
    torch.cuda.empty_cache()
    return exp

In [14]:
results = []
for i in range(5):
    vid_hd = process_video("data/720.mp4")
    results.append(vid_hd)
    vid_fhd = process_video("data/1080.mp4")
    results.append(vid_fhd)

[32m2024-06-12 01:14:33.704[0m | [1mINFO    [0m | [36mutils.protocols[0m:[36mlog[0m:[36m113[0m - [1mdata/720.mp4 | frames=3 | model_fps=0.27745390044633944 | inference_time=10.812607049942017 | preprocess_time=10.130944013595581[0m
[32m2024-06-12 01:14:56.246[0m | [1mINFO    [0m | [36mutils.protocols[0m:[36mlog[0m:[36m113[0m - [1mdata/1080.mp4 | frames=3 | model_fps=0.2995214787776134 | inference_time=10.015976190567017 | preprocess_time=12.25296425819397[0m


In [15]:
columns = results[0].columns
rows = [result.row for result in results]

In [16]:
import pandas as pd

df = pd.DataFrame(rows, columns=columns)

In [17]:
df.head()

Unnamed: 0,model,gpu,video_file,batch_size,n_frames,pre_processing_fps,inference_fps,post_processing_fps,video_fps,pre_processing_time,inference_time,post_processing_time,video_processing_time,start_time,end_time,record_file,prompt,data
0,groungding-dino-tiny-hf,cpu,data/720.mp4,1,3,0.296122,0.277454,1991.912617,0.142857,10.130944,10.812607,0.001506,21,2024-06-12T01:14:12.476800,2024-06-12T01:14:33.703193,experiments/gdino/exp-groungding-dino-tiny-hf-...,Face.,"{0: [box=[566.631103515625, 157.65919494628906..."
1,groungding-dino-tiny-hf,cpu,data/1080.mp4,1,3,0.244839,0.299521,2178.859221,0.136364,12.252964,10.015976,0.001377,22,2024-06-12T01:14:33.714054,2024-06-12T01:14:56.245460,experiments/gdino/exp-groungding-dino-tiny-hf-...,Face.,"{0: [box=[849.8721313476562, 226.7397766113281..."


In [18]:
from datetime import datetime
now = datetime.now()
csv_file = f"zz-{MODEL}-{get_gpu_name()}-{now.day}-{now.hour}-{now.minute}.csv"
df.to_csv(csv_file)