In [1]:
from transformers import AutoProcessor, AutoModel
from torch import cat, save
from numpy import array
from math import ceil

from decord import VideoReader, cpu
from IPython.display import HTML

from utils import compute_clip_sim, get_samples

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
%load_ext autoreload
%autoreload 2

In [3]:
model_name = "microsoft/xclip-base-patch16-zero-shot"
processor, model = AutoProcessor.from_pretrained(model_name), AutoModel.from_pretrained(model_name)

In [4]:
filename = input()
vr = VideoReader(filename, ctx=cpu(0))
HTML(f'<video controls> <source src="{filename}" type="video/mp4"> </video>')

 videos/aerial.mp4


In [5]:
def get_sample_frame_indices(videoreader, sample_length=16, num_frames_per_sample=32):
    '''
    sample_length is in seconds
    return list of n lists of frame indices, where n is num_samples, AKA num_vectors
    '''
    range_per_sample = int(sample_length * videoreader.get_avg_fps())  # range in # of frames
    interval_per_frame_of_sample = ceil(range_per_sample / num_frames_per_sample)
    num_samples = int(len(videoreader) // range_per_sample)
    print('draws one frame every', interval_per_frame_of_sample, 'frames over', range_per_sample, 'frames')
    
    indices = []
    for i in range(0, num_samples):
        _indices = range(i*range_per_sample, (i+1)*range_per_sample, interval_per_frame_of_sample)
        indices.append([*_indices])
    return indices

In [6]:
idx = array(get_sample_frame_indices(vr))

draws one frame every 12 frames over 383 frames
17


In [7]:
video_features = []
for i in range(0, len(idx)):
    video_features.append(
        model.get_video_features(**processor(
            videos=list(vr.get_batch(idx[i]).asnumpy()), return_tensors="pt"
        ))
    )

In [8]:
video_features = cat(video_features); video_features.shape

torch.Size([17, 512])

In [12]:
save(video_features, 'feature_vectors/aerial_17samples.pt')

In [11]:
text_features = model.get_text_features(**processor(text=["moving cars"], return_tensors="pt", padding=True))
logits = compute_clip_sim(video_features, text_features)
*logits, print('Content found at frame #:', logits.argmax())

Content found at frame #: tensor(3)


(tensor([0.1914, 0.1939, 0.1907, 0.2066, 0.2008, 0.1922, 0.1787, 0.1922, 0.1860,
         0.1821, 0.1869, 0.1558, 0.1713, 0.1732, 0.1821, 0.1817, 0.1773]),
 None)

In [9]:
text_features = model.get_text_features(**processor(text=["moving cars"], return_tensors="pt", padding=True))
logits = compute_clip_sim(video_features, text_features)
*logits, print('Content found at frame #:', logits.argmax())

Content found at frame #: tensor(3)


(tensor([0.1914, 0.1939, 0.1907, 0.2066, 0.2008, 0.1922, 0.1787, 0.1922, 0.1860,
         0.1821, 0.1869, 0.1558, 0.1713, 0.1732, 0.1821, 0.1817, 0.1773]),
 None)

In [10]:
text_features = model.get_text_features(**processor(text=["docked ship"], return_tensors="pt", padding=True))
logits = compute_clip_sim(video_features, text_features)
*logits, print('Content found at frame #:', logits.argmax())

Content found at frame #: tensor(15)


(tensor([0.1890, 0.1825, 0.1760, 0.1883, 0.1835, 0.1720, 0.1705, 0.1795, 0.1687,
         0.1727, 0.1925, 0.1745, 0.1795, 0.1749, 0.1955, 0.2008, 0.1892]),
 None)

In [11]:
text_features = model.get_text_features(**processor(text=["tower"], return_tensors="pt", padding=True))
logits = compute_clip_sim(video_features, text_features)
*logits, print('Content found at frame #:', logits.argmax())

Content found at frame #: tensor(1)


(tensor([0.2215, 0.2297, 0.2120, 0.2111, 0.2208, 0.2209, 0.2128, 0.2156, 0.2126,
         0.2190, 0.2190, 0.1724, 0.2097, 0.2243, 0.2260, 0.2063, 0.2031]),
 None)