In [7]:
!python -m pip install -q transformers
!python -m pip install -q pillow
!python -m pip install -q loguru
!python -m pip install -q pydantic
!python -m pip install -q pandas
!python -m pip install -q opencv-python

In [2]:
from PIL import Image
import torch

from transformers import OwlViTProcessor, OwlViTForObjectDetection

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Prefer the GPU when PyTorch can see one; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
device

'cpu'

In [4]:
# Single source of truth for the checkpoint id, so the processor and the
# model weights are always loaded from the same repository.
CHECKPOINT = "google/owlvit-large-patch14"

processor = OwlViTProcessor.from_pretrained(CHECKPOINT)
model = OwlViTForObjectDetection.from_pretrained(CHECKPOINT).to(device)

In [5]:
# Switch the model to evaluation mode. The call returns the module itself,
# so as the last expression of the cell it also echoes the model structure.
model.eval()

OwlViTForObjectDetection(
  (owlvit): OwlViTModel(
    (text_model): OwlViTTextTransformer(
      (embeddings): OwlViTTextEmbeddings(
        (token_embedding): Embedding(49408, 768)
        (position_embedding): Embedding(16, 768)
      )
      (encoder): OwlViTEncoder(
        (layers): ModuleList(
          (0-11): 12 x OwlViTEncoderLayer(
            (self_attn): OwlViTAttention(
              (k_proj): Linear(in_features=768, out_features=768, bias=True)
              (v_proj): Linear(in_features=768, out_features=768, bias=True)
              (q_proj): Linear(in_features=768, out_features=768, bias=True)
              (out_proj): Linear(in_features=768, out_features=768, bias=True)
            )
            (layer_norm1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
            (mlp): OwlViTMLP(
              (activation_fn): QuickGELUActivation()
              (fc1): Linear(in_features=768, out_features=3072, bias=True)
              (fc2): Linear(in_features=3072, out_

In [6]:
# Display the device the loaded model currently lives on.
model.device

device(type='cpu')

In [7]:

import time
import torch
from utils.protocols import OwlVit
from utils.utils import convert_model_detection
from utils.video import read_video
from utils.utils import get_file_name, get_gpu_name
from loguru import logger
from datetime import datetime

# Label stored with every experiment record and used in output file names.
MODEL = "owlvit-large-p14-hf"

# Text query handed to the processor for every frame.
text = "face"

# Directory passed to get_file_name() when building the record file path.
BASE_DIR = "experiments/owlvit"

def run_model(frames):
    """Run the detector on every pre-processed frame with gradients disabled.

    Args:
        frames: Mapping of frame id to the inputs that are unpacked as
            keyword arguments into ``model(**inputs)``.

    Returns:
        dict mapping each frame id to the raw model output for that frame,
        in the iteration order of ``frames``.
    """
    # One no_grad context around the whole pass; every forward call runs inside it.
    with torch.no_grad():
        outputs_by_frame = {
            frame_id: model(**inputs) for frame_id, inputs in frames.items()
        }
    return outputs_by_frame

def process_video(video, frames=None):
    """Run the detector over every frame of ``video`` and record a timed experiment.

    The work is split into three separately timed stages:

    1. pre-processing: read each frame and build the model inputs,
    2. inference: ``run_model`` on all prepared inputs,
    3. post-processing: convert raw outputs into detections.

    The timings and detections are stored in an ``OwlVit`` record, which is
    saved, logged and returned.

    Args:
        video: Path of the video file; passed to ``read_video`` and stored in
            the record as ``video_file``.
        frames: Optional value stored as-is in the record's ``frames`` field
            (anything falsy is stored as ``None``). It does not select which
            frames are processed.

    Returns:
        The ``OwlVit`` experiment record, after ``save()`` and ``log()``.
    """
    start_time = datetime.now()
    target_sizes = None

    # --- Stage 1: pre-processing -------------------------------------------
    frames_ = {}
    start = time.time()
    for frame_id, frame in read_video(video):
        image = Image.fromarray(frame.astype("uint8"))
        inputs = processor(images=image, text=text, return_tensors="pt").to(device)
        frames_[frame_id] = inputs
        # Compare against None explicitly: once assigned, target_sizes is a
        # two-element tensor, and `not tensor` raises RuntimeError for any
        # tensor holding more than one value.
        if target_sizes is None:
            # PIL's size is (width, height); reversed it becomes (height, width).
            # The size of the first frame is reused for every frame.
            target_sizes = torch.Tensor([image.size[::-1]])
    pre_processing_time = time.time() - start

    n_frames = len(frames_)

    # --- Stage 2: inference --------------------------------------------------
    start = time.time()
    outputs = run_model(frames_)
    inference_time = time.time() - start

    # --- Stage 3: post-processing --------------------------------------------
    results = {}
    start = time.time()
    for frame_id, output in outputs.items():
        detections = processor.post_process_object_detection(
            outputs=output, threshold=0.1, target_sizes=target_sizes
        )
        # One image per forward pass, so only the first entry is used.
        results[frame_id] = convert_model_detection(detections[0])
    post_processing_time = time.time() - start

    end_time = datetime.now()

    exp = OwlVit(
        model=MODEL,
        gpu=get_gpu_name(),
        video_file=video,
        frames=frames if frames else None,
        n_frames=n_frames,
        pre_processing_time=pre_processing_time,
        inference_time=inference_time,
        post_processing_time=post_processing_time,
        # timedelta.seconds is the whole-second component only
        # (it excludes days and microseconds).
        video_processing_time=(end_time - start_time).seconds,
        start_time=start_time.isoformat(),
        end_time=end_time.isoformat(),
        # NOTE(review): `start` here is the post-processing stage's
        # time.time() float, not `start_time` -- confirm which value
        # get_file_name() expects.
        record_file=get_file_name(BASE_DIR, start, MODEL, video),
        data=results,
        prompt=text,
    )
    exp.save()
    exp.log()
    # Release cached GPU memory before the next video is processed.
    torch.cuda.empty_cache()
    return exp

In [8]:
# Clips to benchmark, processed in this order on every run.
VIDEO_FILES = ("data/720.mp4", "data/1080.mp4")
# Number of times the whole set of clips is benchmarked.
N_RUNS = 1

# Collect one experiment record per processed clip.
results = []
for _ in range(N_RUNS):
    for video_file in VIDEO_FILES:
        results.append(process_video(video_file))

[32m2024-06-12 00:39:12.774[0m | [1mINFO    [0m | [36m__main__[0m:[36mprocess_video[0m:[36m26[0m - [1mReading Frame[0m
[32m2024-06-12 00:39:21.206[0m | [1mINFO    [0m | [36m__main__[0m:[36mprocess_video[0m:[36m39[0m - [1mnframes = 3[0m
[32m2024-06-12 00:39:21.207[0m | [1mINFO    [0m | [36m__main__[0m:[36mprocess_video[0m:[36m40[0m - [1mRunning MOdel[0m
[32m2024-06-12 00:39:48.419[0m | [1mINFO    [0m | [36m__main__[0m:[36mprocess_video[0m:[36m45[0m - [1mRunning Postprocessing[0m
[32m2024-06-12 00:39:48.421[0m | [1mINFO    [0m | [36m__main__[0m:[36mprocess_video[0m:[36m53[0m - [1mDone![0m
[32m2024-06-12 00:39:48.423[0m | [1mINFO    [0m | [36mutils.protocols[0m:[36mlog[0m:[36m113[0m - [1mdata/720.mp4 | frames=3 | model_fps=0.11024676022476197 | inference_time=27.211683988571167 | preprocess_time=8.374552011489868[0m
[32m2024-06-12 00:39:48.429[0m | [1mINFO    [0m | [36m__main__[0m:[36mprocess_video[0m:[36m26

In [9]:
# Column labels are taken from the first record; each record supplies one row.
columns = results[0].columns
rows = [record.row for record in results]

In [10]:
import pandas as pd

# Tabulate the collected rows and preview up to ten of them.
df = pd.DataFrame(data=rows, columns=columns)
df.head(n=10)

Unnamed: 0,model,gpu,video_file,batch_size,n_frames,pre_processing_fps,inference_fps,post_processing_fps,video_fps,pre_processing_time,inference_time,post_processing_time,video_processing_time,start_time,end_time,record_file,prompt,data
0,owlvit-large-p14-hf,cpu,data/720.mp4,1,3,0.358228,0.110247,1414.446043,0.085714,8.374552,27.211684,0.002121,35,2024-06-12T00:39:12.774421,2024-06-12T00:39:48.422247,experiments/owlvit/exp-owlvit-large-p14-hf-cpu...,face,"{0: [box=[366.6923522949219, 66.16351318359375..."
1,owlvit-large-p14-hf,cpu,data/1080.mp4,1,3,0.257969,0.114098,2181.881741,0.081081,11.629296,26.293289,0.001375,37,2024-06-12T00:39:48.429717,2024-06-12T00:40:26.408882,experiments/owlvit/exp-owlvit-large-p14-hf-cpu...,face,"{0: [box=[547.648193359375, 97.87193298339844,..."


In [11]:
from datetime import datetime
# Timestamp used to give the export a run-specific name.
now = datetime.now()
# Name pattern: zz-<MODEL>-<get_gpu_name()>-<day>-<hour>-<minute>.csv
# (no month or year, and the numbers are not zero-padded).
csv_file = f"zz-{MODEL}-{get_gpu_name()}-{now.day}-{now.hour}-{now.minute}.csv"
# The path is relative, so the file is written to the current working directory.
df.to_csv(csv_file)