In [7]:
!python -m pip install -q transformers
!python -m pip install -q pillow
!python -m pip install -q loguru
!python -m pip install -q pydantic
!python -m pip install -q pandas
!python -m pip install -q opencv-python

In [5]:

from PIL import Image
import torch
from transformers import OwlViTProcessor, OwlViTForObjectDetection

  from .autonotebook import tqdm as notebook_tqdm


In [6]:
# Prefer the GPU when PyTorch can see one; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
# Bare expression so the cell displays the chosen device.
device

'cpu'

In [7]:
# The processor and the detector are both loaded from the same pretrained checkpoint.
checkpoint = "google/owlvit-large-patch14"
processor = OwlViTProcessor.from_pretrained(checkpoint)
model = OwlViTForObjectDetection.from_pretrained(checkpoint)
# Move the weights onto the device selected earlier in the notebook.
model = model.to(device)

In [8]:
# Put the model in evaluation mode for inference. eval() returns the module itself,
# so as the last expression of the cell it also displays the model architecture.
model.eval()

OwlViTForObjectDetection(
  (owlvit): OwlViTModel(
    (text_model): OwlViTTextTransformer(
      (embeddings): OwlViTTextEmbeddings(
        (token_embedding): Embedding(49408, 768)
        (position_embedding): Embedding(16, 768)
      )
      (encoder): OwlViTEncoder(
        (layers): ModuleList(
          (0-11): 12 x OwlViTEncoderLayer(
            (self_attn): OwlViTAttention(
              (k_proj): Linear(in_features=768, out_features=768, bias=True)
              (v_proj): Linear(in_features=768, out_features=768, bias=True)
              (q_proj): Linear(in_features=768, out_features=768, bias=True)
              (out_proj): Linear(in_features=768, out_features=768, bias=True)
            )
            (layer_norm1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
            (mlp): OwlViTMLP(
              (activation_fn): QuickGELUActivation()
              (fc1): Linear(in_features=768, out_features=3072, bias=True)
              (fc2): Linear(in_features=3072, out_

In [9]:
# Display the device the model's weights currently live on (sanity check after `.to(device)`).
model.device

device(type='cpu')

In [17]:

from statistics import mean
import time
import torch
from utils.protocols import OwlVit
from utils.utils import convert_model_detection
from utils.video import read_vid_batch, read_video
from utils.utils import get_file_name, get_gpu_name
from loguru import logger
from datetime import datetime

# Label stored with each experiment record and passed to get_file_name().
MODEL = "owlvit-large-p14-hf"
# Root directory handed to get_file_name() when building the record path.
BASE_DIR = "experiments/owlvit"
# Number of frames requested per batch from read_vid_batch().
BATCH_SIZE = 4

# Text query applied to every frame.
text = "face"


def process_video(video):
    """Run text-prompted OWL-ViT detection over a video and record the experiment.

    Frames are read in batches of ``BATCH_SIZE`` via ``read_vid_batch``. Each batch is
    preprocessed with the module-level ``processor``, run through the module-level
    ``model`` on ``device``, and post-processed into per-frame detections. Wall-clock
    time for the three stages is accumulated separately.

    Relies on notebook globals: ``processor``, ``model``, ``device``, ``text``,
    ``MODEL``, ``BASE_DIR`` and ``BATCH_SIZE``.

    Args:
        video: Path of the video file, passed to ``read_vid_batch``.

    Returns:
        The ``OwlVit`` experiment record, after ``save()`` and ``log()`` have been
        called on it.
    """
    start_time = datetime.now()
    results = {}
    pre_processing_times = []
    inference_times = []
    post_processing_times = []
    # CUDA kernels run asynchronously; without a synchronize the inference timer would
    # stop before the GPU finishes and the cost would be billed to post-processing.
    cuda_available = torch.cuda.is_available()

    for batch in read_vid_batch(video, batch_size=BATCH_SIZE):
        logger.info(f"Preprocessing batch: {len(batch)}")
        start = time.time()
        frame_ids = batch.keys()
        frames = [Image.fromarray(frame.astype("uint8")) for frame in batch.values()]
        # One copy of the prompt per frame in the batch.
        prompts = [text for _ in frames]
        # PIL's Image.size is (width, height), but post_process_object_detection
        # expects (height, width); reverse it so boxes are scaled to the right axes.
        target_sizes = [frame.size[::-1] for frame in frames]
        inputs = processor(images=frames, text=prompts, return_tensors="pt").to(device)
        pre_processing_times.append(time.time() - start)

        logger.info("Running Inference")
        start = time.time()
        with torch.no_grad():
            outputs = model(**inputs)
        if cuda_available:
            torch.cuda.synchronize()
        inference_times.append(time.time() - start)

        logger.info("PostPorcessing")
        start = time.time()
        detections = processor.post_process_object_detection(
            outputs=outputs, threshold=0.1, target_sizes=target_sizes
        )
        # Merge in place instead of rebuilding the whole dict on every batch.
        results.update(
            (frame_id, convert_model_detection(detection))
            for frame_id, detection in zip(frame_ids, detections)
        )
        post_processing_times.append(time.time() - start)
        # Release cached, unused GPU memory between batches.
        torch.cuda.empty_cache()

    end_time = datetime.now()
    n_frames = len(results)

    exp = OwlVit(
        model=MODEL,
        gpu=get_gpu_name(),
        video_file=video,
        frames=None,
        batch_size=BATCH_SIZE,
        n_frames=n_frames,

        pre_processing_time=sum(pre_processing_times),
        inference_time=sum(inference_times),
        post_processing_time=sum(post_processing_times),
        # total_seconds() includes whole days, which `.seconds` silently drops;
        # truncated to int to keep the same value type as before.
        video_processing_time=int((end_time - start_time).total_seconds()),

        start_time=start_time.isoformat(),
        end_time=end_time.isoformat(),
        # Use the run's start as an epoch float. The loop-local `start` held the last
        # batch's post-processing timestamp and was unbound for a video with no frames.
        # NOTE(review): assumes get_file_name accepts an epoch float, as it received before — confirm.
        record_file=get_file_name(BASE_DIR, start_time.timestamp(), MODEL, video),

        data=results,
        prompt=text,
    )
    exp.save()
    exp.log()
    torch.cuda.empty_cache()
    return exp

In [18]:
# Run the detection pipeline on data/720.mp4; `d` holds the experiment record returned by process_video.
d = process_video("data/720.mp4")

[32m2024-06-12 16:09:47.293[0m | [1mINFO    [0m | [36m__main__[0m:[36mprocess_video[0m:[36m27[0m - [1mPreprocessing batch: 4[0m
[32m2024-06-12 16:09:47.411[0m | [1mINFO    [0m | [36m__main__[0m:[36mprocess_video[0m:[36m37[0m - [1mRunning Inference[0m
[32m2024-06-12 16:10:33.988[0m | [1mINFO    [0m | [36m__main__[0m:[36mprocess_video[0m:[36m43[0m - [1mPostPorcessing[0m
[32m2024-06-12 16:10:34.033[0m | [1mINFO    [0m | [36m__main__[0m:[36mprocess_video[0m:[36m27[0m - [1mPreprocessing batch: 4[0m
[32m2024-06-12 16:10:34.188[0m | [1mINFO    [0m | [36m__main__[0m:[36mprocess_video[0m:[36m37[0m - [1mRunning Inference[0m
[32m2024-06-12 16:11:20.331[0m | [1mINFO    [0m | [36m__main__[0m:[36mprocess_video[0m:[36m43[0m - [1mPostPorcessing[0m
[32m2024-06-12 16:11:20.362[0m | [1mINFO    [0m | [36m__main__[0m:[36mprocess_video[0m:[36m27[0m - [1mPreprocessing batch: 4[0m
[32m2024-06-12 16:11:20.498[0m | [1mINFO   