In [1]:
!pip install -q huggingface
!pip install -q transformers
!pip install -q pillow
!pip install -q loguru
!pip install -q pydantic

In [2]:
import requests
from PIL import Image
import torch

from transformers import OwlViTProcessor, OwlViTForObjectDetection

In [3]:
# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
device

'cuda'

In [4]:
# Processor and detection model are loaded from the same pretrained checkpoint;
# the model is then moved to the selected device.
checkpoint = "google/owlvit-large-patch14"
processor = OwlViTProcessor.from_pretrained(checkpoint)
model = OwlViTForObjectDetection.from_pretrained(checkpoint)
model = model.to(device)

preprocessor_config.json:   0%|          | 0.00/392 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/775 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.06M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/525k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/460 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/4.42k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/1.74G [00:00<?, ?B/s]

  return self.fget.__get__(instance, owner)()


In [5]:
# Put the model in evaluation mode for inference. As the cell's last
# expression, the returned module is displayed as the cell output.
model.eval()

OwlViTForObjectDetection(
  (owlvit): OwlViTModel(
    (text_model): OwlViTTextTransformer(
      (embeddings): OwlViTTextEmbeddings(
        (token_embedding): Embedding(49408, 768)
        (position_embedding): Embedding(16, 768)
      )
      (encoder): OwlViTEncoder(
        (layers): ModuleList(
          (0-11): 12 x OwlViTEncoderLayer(
            (self_attn): OwlViTAttention(
              (k_proj): Linear(in_features=768, out_features=768, bias=True)
              (v_proj): Linear(in_features=768, out_features=768, bias=True)
              (q_proj): Linear(in_features=768, out_features=768, bias=True)
              (out_proj): Linear(in_features=768, out_features=768, bias=True)
            )
            (layer_norm1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
            (mlp): OwlViTMLP(
              (activation_fn): QuickGELUActivation()
              (fc1): Linear(in_features=768, out_features=3072, bias=True)
              (fc2): Linear(in_features=3072, out_

In [6]:
# Show which device the model ended up on after the `.to(device)` call.
model.device

device(type='cuda', index=0)

In [13]:

import time
import torch
from utils.protocols import OwlVit
from utils.utils import convert_model_detection
from utils.video import read_video
from utils.utils import get_file_name, get_gpu_name
from loguru import logger
from datetime import datetime

# Model identifier recorded on every experiment and passed to get_file_name.
MODEL= "owlvit-large-p14-hf"

# Text query given to the processor for every frame; also stored as the experiment prompt.
text = "face"
# Base path passed to get_file_name when naming the saved experiment.
BASE_DIR = "experiments/owlvit"

def process_video(video, frames=None):
    """Run OWL-ViT detection on every frame of ``video`` and record the run.

    Each frame yielded by ``read_video`` is queried with the module-level
    ``text`` prompt; detections scoring above 0.1 are converted with
    ``convert_model_detection`` and stored per frame id. Timing and throughput
    are stored on an ``OwlVit`` experiment, which is saved and logged.

    Args:
        video: Video path, passed to ``read_video`` and stored on the experiment.
        frames: Optional collection of frame ids. It is used for bookkeeping
            only: it is stored on the experiment and, when non-empty, its
            length is reported as ``n_frames``. It does not restrict which
            frames ``read_video`` yields here.

    Returns:
        The ``OwlVit`` experiment, after ``save()`` and ``log()`` were called.
    """
    start_time = datetime.now().isoformat()
    start = time.time()

    results = {}
    # Stays -1 when the video yields no frames, so n_frames becomes 0 instead
    # of the loop variable being unbound after the loop.
    frame_id = -1
    for frame_id, frame in read_video(video):
        image = Image.fromarray(frame.astype("uint8"))
        # Preprocess once per frame and move the tensors to the model's device.
        inputs = processor(images=image, text=text, return_tensors="pt").to(device)
        with torch.no_grad():
            outputs = model(**inputs)
        # PIL reports (width, height); post-processing expects (height, width).
        target_sizes = torch.Tensor([image.size[::-1]])
        detections = processor.post_process_object_detection(
            outputs=outputs, threshold=0.1, target_sizes=target_sizes
        )
        # A single image was passed, so only the first batch entry is relevant.
        results[frame_id] = convert_model_detection(detections[0])

    end = time.time()
    end_time = datetime.now().isoformat()
    elapsed = end - start
    n_frames = frame_id + 1 if not frames else len(frames)

    exp = OwlVit(
        model=MODEL,
        gpu=get_gpu_name(),
        file=video,
        prompt=text,
        frames=None if not frames else frames,
        n_frames=n_frames,
        processing_time=elapsed,
        # Guard against a zero-length interval (e.g. empty video, coarse clock).
        fps=n_frames / elapsed if elapsed > 0 else 0.0,
        data=results,
        start_time=start_time,
        end_time=end_time,
        filename=get_file_name(BASE_DIR, start, MODEL, video)
    )
    exp.save()
    exp.log()
    # Release cached GPU memory between runs.
    torch.cuda.empty_cache()
    return exp

In [14]:
# Benchmark configuration: how many times each video is processed, and which
# videos to process. Within a repetition the videos run in the listed order.
N_RUNS = 5
VIDEOS = ("data/720.mp4", "data/1080.mp4")

# One experiment record per (repetition, video) pair.
results = []
for _ in range(N_RUNS):
    for video_path in VIDEOS:
        results.append(process_video(video_path))

[32m2024-06-11 11:11:32.998[0m | [1mINFO    [0m | [36mutils.protocols[0m:[36mlog[0m:[36m88[0m - [1mdata/720.mp4 | frames=283 | delta=141.298907995224 | fps=2.0028463348744747[0m


In [None]:
# Column names are taken from the first experiment; every experiment
# contributes one row.
columns = results[0].columns
rows = [experiment.row for experiment in results]

{0: [Detection(box=[367.13006591796875, 66.28545379638672, 483.8175964355469, 200.25653076171875], score=0.11801959574222565, label='0'),
  Detection(box=[426.5200500488281, 92.12797546386719, 483.99212646484375, 190.67138671875], score=0.2221613973379135, label='0'),
  Detection(box=[566.7552490234375, 147.77830505371094, 608.813720703125, 242.77261352539062], score=0.1355343759059906, label='0')],
 1: [Detection(box=[426.1279296875, 97.1685562133789, 480.4498291015625, 194.4124755859375], score=0.19247953593730927, label='0'),
  Detection(box=[555.4312744140625, 158.38441467285156, 598.7103271484375, 252.9552459716797], score=0.17357537150382996, label='0'),
  Detection(box=[231.39268493652344, 231.80731201171875, 261.355712890625, 254.5439910888672], score=0.11136927455663681, label='0'),
  Detection(box=[888.0611572265625, 236.5264434814453, 918.010498046875, 300.550048828125], score=0.18694692850112915, label='0')],
 2: [Detection(box=[841.8854370117188, 110.76239013671875, 955.58

In [None]:
import pandas as pd

# Assemble the per-experiment rows into a table and preview the first entries.
df = pd.DataFrame(data=rows, columns=columns)
df.head()

In [None]:
from datetime import datetime

now = datetime.now()
# Replace spaces in the GPU name so the file name has no whitespace.
# (Replacing the empty string would insert "_" between every character.)
gpu_tag = get_gpu_name().replace(" ", "_")
# `now.minute` is the minute of the timestamp; `now.min` is the class constant
# datetime.min. Building gpu_tag outside the f-string also avoids nested
# double quotes, which are a syntax error before Python 3.12.
csv_file = f"__{MODEL}-{gpu_tag}-{now.day}-{now.hour}-{now.minute}.csv"
df.to_csv(csv_file)