[CLIP4Clip](https://arxiv.org/abs/2104.08860) is a video-text retrieval model based on [CLIP (ViT-B)](https://github.com/openai/CLIP). The original implement code is in [github CLIP4Clip](https://github.com/ArrowLuo/CLIP4Clip).

[MSR-VTT (Microsoft Research Video to Text)](https://www.microsoft.com/en-us/research/publication/msr-vtt-a-large-video-description-dataset-for-bridging-video-and-language/) is a large-scale dataset for the open domain video captioning, which consists of 10,000 video clips.

For the convenience, you can also download the splits and captions by,
```
wget https://github.com/ArrowLuo/CLIP4Clip/releases/download/v0.0/msrvtt_data.zip
```

And download raw videos by, the size of which is about 6.3G,
```
wget https://www.robots.ox.ac.uk/~maxbain/frozen-in-time/data/MSRVTT.zip
```

After you have finished downloading, you can find `MSRVTT_data.json`, `MSRVTT_JSFUSION_test.csv`, `MSRVTT_train.7k.csv`, `MSRVTT_train.9k.csv` in the msrvtt_data.zip .

In fact, we use 9k captions in `MSRVTT_train.9k.csv` for training and 1k captions in `MSRVTT_JSFUSION_test.csv` for testing.



## 1.View some sample data.
Sample some test caption sentence in the test set.

In [2]:
import pandas as pd

test_csv_path = './MSRVTT_JSFUSION_test.csv'
sample_num = 2
test_df = pd.read_csv(test_csv_path)
test_df = test_df.sample(sample_num)
test_df[['video_id', 'sentence']]

Unnamed: 0,video_id,sentence
98,video9346,a woman giving a photoshop tutorial
774,video9027,a group of young people are playing


Define a helper function to show the video in notebook.

Show some sample video.

In [3]:
raw_video_path = './path_to_your_test_video'  # Your raw video path
from IPython import display
import os

for _, row in test_df.iterrows():
    video_id = row[2]
    sentence = row[3]
    video_path = os.path.join(raw_video_path, video_id + '.mp4')
    display.display(display.Video(video_path, embed=True))
    print(sentence + '\n')

a woman giving a photoshop tutorial



a group of young people are playing



## 2.Get caption text embeddings.
Create model and load pretrained weights.

In [4]:
from towhee.models import clip4clip

batch_size = 2

import torch

device = "cuda" if torch.cuda.is_available() else "cpu"
model = clip4clip.create_model(model_name='clip_vit_b32',
                               context_length=77,
                               pretrained=True,
                               weights_path='./pytorch_model.bin.1',
                               device=device)
model.eval()

Define a helper function to get the caption text embedding.

In [18]:
from towhee.models.clip4clip import convert_tokens_to_id


def get_batch_text_embeddings(batch_text):
    tokenize = clip4clip.SimpleTokenizer()
    text_ids_list = []
    for text in batch_text:
        text_ids = convert_tokens_to_id(tokenize, text)
        text_ids = torch.tensor(text_ids).unsqueeze(0).to(device)
        text_ids_list.append(text_ids)
    text_ids_list_tensor = torch.cat(text_ids_list, dim=0)
    text_features = model.get_sequence_output(text_ids_list_tensor)
    text_features = text_features.detach()
    return text_features


batch_text = ['This is a test sentence.', 'This is another test sentence.']
batch_text_embeddings = get_batch_text_embeddings(batch_text)
batch_text_embeddings.shape  # , batch_text_embeddings.max(), batch_text_embeddings.min()

torch.Size([2, 1, 512])

## 3.Get video embeddings.
Create a helper class to extract image from video.

In [6]:
import torch as th
import numpy as np
from PIL import Image
from torchvision.transforms import Compose, Resize, CenterCrop, ToTensor, Normalize
import cv2


class RawVideoExtractorCV2():
    def __init__(self, centercrop=False, size=224, framerate=-1, ):
        self.centercrop = centercrop
        self.size = size
        self.framerate = framerate
        self.transform = self._transform(self.size)

    def _transform(self, n_px):
        return Compose([
            Resize(n_px, interpolation=Image.BICUBIC),
            CenterCrop(n_px),
            lambda image: image.convert("RGB"),
            ToTensor(),
            Normalize((0.48145466, 0.4578275, 0.40821073), (0.26862954, 0.26130258, 0.27577711)),
        ])

    def video_to_tensor(self, video_file, preprocess, sample_fp=0, start_time=None, end_time=None):
        if start_time is not None or end_time is not None:
            assert isinstance(start_time, int) and isinstance(end_time, int) and -1 < start_time < end_time
        assert sample_fp > -1

        # Samples a frame sample_fp X frames.
        cap = cv2.VideoCapture(video_file)
        frameCount = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
        fps = int(cap.get(cv2.CAP_PROP_FPS))

        total_duration = (frameCount + fps - 1) // fps
        start_sec, end_sec = 0, total_duration

        if start_time is not None:
            start_sec, end_sec = start_time, end_time if end_time <= total_duration else total_duration
            cap.set(cv2.CAP_PROP_POS_FRAMES, int(start_time * fps))

        interval = 1
        if sample_fp > 0:
            interval = fps // sample_fp
        else:
            sample_fp = fps
        if interval == 0: interval = 1

        inds = [ind for ind in np.arange(0, fps, interval)]
        assert len(inds) >= sample_fp
        inds = inds[:sample_fp]

        ret = True
        images, included = [], []

        for sec in np.arange(start_sec, end_sec + 1):
            if not ret: break
            sec_base = int(sec * fps)
            for ind in inds:
                cap.set(cv2.CAP_PROP_POS_FRAMES, sec_base + ind)
                ret, frame = cap.read()
                if not ret: break
                frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
                images.append(preprocess(Image.fromarray(frame_rgb).convert("RGB")))

        cap.release()

        if len(images) > 0:
            video_data = th.tensor(np.stack(images))
        else:
            video_data = th.zeros(1)
        return {'video': video_data}

    def get_video_data(self, video_path, start_time=None, end_time=None):
        image_input = self.video_to_tensor(video_path, self.transform, sample_fp=self.framerate, start_time=start_time,
                                           end_time=end_time)
        return image_input

    def process_raw_data(self, raw_video_data):
        tensor_size = raw_video_data.size()
        tensor = raw_video_data.view(-1, 1, tensor_size[-3], tensor_size[-2], tensor_size[-1])
        return tensor

    def process_frame_order(self, raw_video_data, frame_order=0):
        # 0: ordinary order; 1: reverse order; 2: random order.
        if frame_order == 0:
            pass
        elif frame_order == 1:
            reverse_order = np.arange(raw_video_data.size(0) - 1, -1, -1)
            raw_video_data = raw_video_data[reverse_order, ...]
        elif frame_order == 2:
            random_order = np.arange(raw_video_data.size(0))
            np.random.shuffle(random_order)
            raw_video_data = raw_video_data[random_order, ...]

        return raw_video_data

In [20]:
batch_video_path = ['./path_to_your_test_video/video9451.mp4',
                    './path_to_your_test_video/video9793.mp4',
                    ]


def get_batch_video_embeddings(batch_video_path):
    max_frames = 12
    rawVideoExtractor = RawVideoExtractorCV2(framerate=1, size=224)
    video_list = []
    video_mask_list = []
    for video_path in batch_video_path:
        raw_video_data = rawVideoExtractor.get_video_data(video_path=video_path)
        raw_video_data = raw_video_data['video']
        raw_video_slice = rawVideoExtractor.process_raw_data(raw_video_data)
        if max_frames < raw_video_slice.shape[0]:
            sample_indx = np.linspace(0, raw_video_slice.shape[0] - 1, num=max_frames, dtype=int)
            video_slice = raw_video_slice[sample_indx, ...]
        else:
            video_slice = raw_video_slice
        slice_len = video_slice.shape[0]
        max_video_length = 0 if 0 > slice_len else slice_len
        video = np.zeros((1, max_frames, 1, 3, rawVideoExtractor.size, rawVideoExtractor.size), dtype=np.float)
        if slice_len >= 1:
            video[0, :slice_len, ...] = video_slice
        video_mask = np.zeros((1, max_frames), dtype=np.long)
        video_mask[0, :max_video_length] = [1] * max_video_length

        video = torch.as_tensor(video).float()
        pair, bs, ts, channel, h, w = video.shape
        video = video.view(pair * bs * ts, channel, h, w)
        video_mask = torch.as_tensor(video_mask).float()
        video_list.append(video)
        video_mask_list.append(video_mask)
    video_list_tensor = torch.cat(video_list, dim=0)
    video_mask_list_tensor = torch.cat(video_mask_list, dim=0)

    visual_output = model.get_visual_output(video_list_tensor, video_mask_list_tensor, shaped=True)
    return visual_output


video_embeddings = get_batch_video_embeddings(batch_video_path)
video_embeddings.shape  #, video_embeddings.sum()

  Resize(n_px, interpolation=Image.BICUBIC),
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  video = np.zeros((1, max_frames, 1, 3, rawVideoExtractor.size, rawVideoExtractor.size), dtype=np.float)
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  video_mask = np.zeros((1, max_frames), dtype=np.long)


torch.Size([2, 12, 512])