In [1]:
!pip install sentence-transformers tf-keras

Looking in indexes: https://pypi.org/simple, https://pypi.ngc.nvidia.com

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.2.1[0m[39;49m -> [0m[32;49m24.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [2]:
import os
import glob
import numpy as np
from PIL import Image
from tqdm import tqdm
from sentence_transformers import SentenceTransformer
import torch

  from tqdm.autonotebook import tqdm, trange
2024-10-13 00:29:03.102802: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-10-13 00:29:03.274226: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:485] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-10-13 00:29:03.336484: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:8454] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-10-13 00:29:03.355086: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1452] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-10-13 00:29:03.481696

In [3]:
class CLIPFeatureExtractor:
    """Extracts normalized CLIP image features for video keyframes and saves one .npy file per video."""

    # Parts processed when the caller does not pass `parts` explicitly.
    DEFAULT_PARTS = ("L25_extra", "L26_extra", "L27_extra", "L28_extra", "L29_extra", "L30_extra")

    def __init__(self, keyframes_dir, save_dir, model_name='clip-ViT-L-14', batch_size=4, parts=DEFAULT_PARTS):
        """
        Initializes the CLIPFeatureExtractor.

        Args:
            keyframes_dir (str): Path to the directory containing video keyframes.
            save_dir (str): Path to the directory where extracted features will be saved.
            model_name (str): Name of the CLIP model to be used. Default is 'clip-ViT-L-14'.
            batch_size (int): Number of images to process in each batch. Default is 4.
            parts (iterable of str, str, or None): Names of the part directories to process.
                Defaults to DEFAULT_PARTS. Pass None to process every part directory
                found under keyframes_dir.

        Raises:
            ValueError: If batch_size is smaller than 1.

        Input Format:
            - keyframes_dir: Directory structure should be organized as:
                /keyframes_dir/part_x/video_y/*.jpg

            - Each video should have its keyframes stored as .jpg files.

        Output Format:
            - Numpy files (.npy) containing CLIP features for each video.
            - Saved in the specified save_dir, organized by parts and video IDs:
                /save_dir/part_x/video_id.npy
        """
        # A non-positive batch size would either crash inside range() or silently
        # produce empty feature files, so reject it up front.
        if batch_size < 1:
            raise ValueError(f"batch_size must be a positive integer, got {batch_size!r}")

        if isinstance(parts, str):
            parts = (parts,)  # a bare string would otherwise be iterated character by character

        self.keyframes_dir = keyframes_dir
        self.save_dir = save_dir
        self.batch_size = batch_size
        self.parts = None if parts is None else tuple(parts)

        # Scan the input tree before loading the model so a bad path fails fast.
        self.all_keyframe_paths = self._collect_keyframe_paths()
        self.model = SentenceTransformer(model_name)

        # Ensure save directory exists
        os.makedirs(self.save_dir, exist_ok=True)

    def _collect_keyframe_paths(self):
        """
        Collects all keyframe paths from the specified directory.

        Only directories are considered: .zip archives and other stray files at the
        part level or the video level are skipped. When self.parts is not None, part
        directories whose name is not listed there are skipped as well.

        Output:
            - A dictionary where each key corresponds to a part (e.g., 'part_x') and
              each value is another dictionary mapping video IDs to a sorted list of
              keyframe paths. The video ID is the text after the last '_' in the
              video directory name.

        Example Output Format:
            {
                'part_1': {
                    'video_001': ['/path/to/keyframe1.jpg', '/path/to/keyframe2.jpg', ...],
                    'video_002': [...],
                    ...
                },
                'part_2': {...},
                ...
            }
        """
        keyframe_paths = dict()
        for part in sorted(os.listdir(self.keyframes_dir)):
            if part.endswith(".zip"):
                continue
            if self.parts is not None and part not in self.parts:
                continue
            part_path = os.path.join(self.keyframes_dir, part)
            if not os.path.isdir(part_path):
                continue

            videos = dict()
            for video_dir in sorted(os.listdir(part_path)):
                video_path = os.path.join(part_path, video_dir)
                if not os.path.isdir(video_path):
                    continue  # a stray file is not a video
                video_id = video_dir.split('_')[-1]
                # Escape the directory so glob metacharacters in its name are matched literally.
                pattern = os.path.join(glob.escape(video_path), '*.jpg')
                videos[video_id] = sorted(glob.glob(pattern))
            keyframe_paths[part] = videos

        return keyframe_paths

    def _process_images(self, image_paths):
        """
        Processes and encodes a batch of images using the CLIP model.

        Args:
            image_paths (list of str): List of paths to the images to be processed.

        Output:
            - A tensor containing the CLIP features for the batch of images.
            - Each feature vector is divided by its norm along the last dimension.

        Example Output Format:
            torch.Tensor of shape (batch_size, feature_dim)
        """
        images = []
        for image_path in image_paths:
            # Image.open is lazy; copying inside the context manager loads the pixel
            # data and lets the underlying file be closed right away.
            with Image.open(image_path) as image:
                images.append(image.copy())

        with torch.no_grad():
            image_feats = self.model.encode(images, convert_to_tensor=True, show_progress_bar=False)
        return image_feats / image_feats.norm(dim=-1, keepdim=True)  # Normalize features

    def extract_and_save_features(self):
        """
        Extracts features for all keyframes and saves them as numpy files.

        Output:
            - Numpy files saved in the specified save directory.
            - Each file contains the float32 CLIP features for all keyframes of a single
              video, organized by parts and video IDs.
            - A video without any keyframes is saved as an empty array.
        """
        for part, videos in tqdm(self.all_keyframe_paths.items(), desc="Processing parts"):
            part_save_dir = os.path.join(self.save_dir, part)
            os.makedirs(part_save_dir, exist_ok=True)

            for video_id, keyframe_paths in tqdm(videos.items(), desc="Processing videos"):
                batches = []
                for start in range(0, len(keyframe_paths), self.batch_size):
                    batch_feats = self._process_images(keyframe_paths[start:start + self.batch_size])
                    batches.append(batch_feats.cpu().numpy().astype(np.float32))

                # np.concatenate rejects an empty list, so keep the empty-array result explicit.
                video_feats = np.concatenate(batches, axis=0) if batches else np.array([])

                # Save the features as a numpy file
                np.save(os.path.join(part_save_dir, f'{video_id}.npy'), video_feats)


In [4]:
# Where the keyframes are read from, and where the per-video feature files are written
keyframes_dir = '/media/daoan/T7 Shield2/AI_Challenge_2024_DATA/Keyframes'
save_dir = '/media/daoan/T7 Shield2/AI_Challenge_2024_DATA/CLIP_features'

# Build the extractor with its default model and batch size, then process every collected video
extractor = CLIPFeatureExtractor(keyframes_dir=keyframes_dir, save_dir=save_dir)
extractor.extract_and_save_features()

Processing parts:   0%|          | 0/6 [00:00<?, ?it/s]
Processing videos:   0%|          | 0/88 [00:00<?, ?it/s][A
Processing videos:   1%|          | 1/88 [00:08<12:01,  8.29s/it][A
Processing videos:   2%|▏         | 2/88 [00:17<12:14,  8.54s/it][A
Processing videos:   3%|▎         | 3/88 [00:24<11:43,  8.28s/it][A
Processing videos:   5%|▍         | 4/88 [00:34<12:21,  8.83s/it][A
Processing videos:   6%|▌         | 5/88 [00:43<12:11,  8.81s/it][A
Processing videos:   7%|▋         | 6/88 [00:51<11:37,  8.51s/it][A
Processing videos:   8%|▊         | 7/88 [00:59<11:13,  8.31s/it][A
Processing videos:   9%|▉         | 8/88 [01:07<11:02,  8.28s/it][A
Processing videos:  10%|█         | 9/88 [01:14<10:32,  8.01s/it][A
Processing videos:  11%|█▏        | 10/88 [01:22<10:16,  7.91s/it][A
Processing videos:  12%|█▎        | 11/88 [01:30<10:13,  7.96s/it][A
Processing videos:  14%|█▎        | 12/88 [01:38<10:00,  7.89s/it][A
Processing videos:  15%|█▍        | 13/88 [01:47<10: