In [None]:
pip install pygments pillow requests tqdm pandas opencv-python open_clip_torch

In [None]:
import os

CSV_FOLDER = r"/kaggle/working/csv"
FRAMES_FOLDER = r"/kaggle/input/frames-l2-b2"
CSV_FPS = r"/kaggle/input/frames-l2-b2/frames-l2-b2.csv"

BATCH_CLIP_SIZE=6144

THRESHOLD_SIMILARITY=0.85

EMBEDDING_NAME='ViT-B-32-256'
EMBEDDING_PRETRAINED='datacomp_s34b_b86k'
EMBEDDING_TOKEN='ViT-B-32-256'

EM_MODEL_2='ViT-H-14-quickgelu'
EM_MODEL_2_PRETRAINED='dfn5b'
EM_TOKEN_2='ViT-H-14-quickgelu'

EMBEDDING_FOLDER=r"/kaggle/working/embeddings"

EMBEDDING_BATCH_SIZE=512

ROOT_DIR = r"/kaggle/temp"

In [None]:
import torch
import open_clip

class CLIP_Embedding:
    def __init__(self, model_name="ViT-L-14", pretrained="commonpool_xl_laion_s13b_b90k", device="cuda", tokenizers='ViT-L-14'):
        self.device = device
        self.model, _, self.preprocess = open_clip.create_model_and_transforms(model_name, pretrained=pretrained, device=self.device)
        self.model.eval()
        self.tokenizer = open_clip.get_tokenizer(tokenizers)

    def get_image_embedding(self, image):
        image_input = self.preprocess(image).unsqueeze(0).to(self.device)
        with torch.no_grad(), torch.amp.autocast('cuda'):
            image_features = self.model.encode_image(image_input)
        del image_input
        return image_features[0]/image_features[0].norm()
    
    def get_images_embedding(self, images):
        image_input = torch.stack([self.preprocess(image) for image in images]).to(self.device)
        with torch.no_grad(), torch.amp.autocast('cuda'):
            image_features = self.model.encode_image(image_input)
        del image_input
        return image_features/image_features.norm(dim=-1, keepdim=True)

    def get_text_embedding(self, text):
        text_input = self.tokenizer(text).to(self.device)
        with torch.no_grad(), torch.amp.autocast('cuda'):
            text_features = self.model.encode_text(text_input)
        return text_features[0]/text_features[0].norm()
    
class CLIPSingleton:
    _instance = None
    def __new__(cls, model_name="ViT-L-14", pretrained="commonpool_xl_laion_s13b_b90k", device="cuda", tokenizers='ViT-L-14'):
        if cls._instance is None:
            cls._instance = CLIP_Embedding(model_name, pretrained, device, tokenizers)
        return cls._instance
    

In [None]:
from PIL import Image
from multiprocessing import Process
import os
import numpy as np
import pandas as pd
from timeit import default_timer as timer
import time

def embedding_batch(list_frames, embedding_model):
    images = [Image.open(frame) for _, frame in list_frames]
    embeddings = embedding_model.get_images_embedding(images).detach().cpu().numpy()
    list_embedding = [(list_frames[i][0], embeddings[i]) for i in range(len(list_frames))]
    return list_embedding

def save_embedding(list_embedding, output_folder):
    for frame_number, embedding in list_embedding:
        embedding_path = os.path.join(output_folder, f'{frame_number}.npy')
        np.save(embedding_path, embedding)

def similarity(embedding1, embedding2):
    return np.dot(embedding1, embedding2) / (np.linalg.norm(embedding1) * np.linalg.norm(embedding2))

def get_fps(video_name):
    df = pd.read_csv(CSV_FPS)
    fps = df.loc[df['video_name'] == video_name, 'fps']
    
    # Kiểm tra nếu tìm thấy thì trả về giá trị, ngược lại trả về None
    if not fps.empty:
        return fps.values[0]
    else:
        return 25

def get_keyframes(list_embedding, embedding_folder, csv_folder, video_name, threshold=THRESHOLD_SIMILARITY):
    list_keyframes = {
        'frame_number': [],
        'second': []
    }
    
    sorted
    
    fps = get_fps(video_name)

    v_pred = None
    keyframes_embedding = []
    for frame_number, embedding in list_embedding:
        if v_pred is None:
            v_pred = embedding
            list_keyframes['frame_number'].append(frame_number)
            list_keyframes['second'].append(int(frame_number)/fps)
            keyframes_embedding.append((frame_number, embedding))
        else:
            sim = similarity(v_pred, embedding)
            if sim < threshold:
                v_pred = embedding
                list_keyframes['frame_number'].append(frame_number)
                list_keyframes['second'].append(int(frame_number)/fps)
                keyframes_embedding.append((frame_number, embedding))

    df = pd.DataFrame(list_keyframes)
    df.to_csv(os.path.join(csv_folder, f'{video_name}.csv'), index=False)
#     Process(target=save_embedding, args=(keyframes_embedding, embedding_folder)).start()
    print(f'Keyframes {video_name} done')
    

def embedding_frame(list_frames, embedding_folder, csv_folder, embedding_model, video_name, batch_size=32):
    output_folder = os.path.join(embedding_folder, video_name)
    if not os.path.exists(output_folder):
        os.makedirs(output_folder)

    list_embedding = []

    for i in range(0, len(list_frames), batch_size):
        batch = list_frames[i:i+batch_size]
        embeddings = embedding_batch(batch, embedding_model)
        print(f'Embedding {i} done')
        list_embedding.extend(embeddings)

    Process(target=get_keyframes, args=(list_embedding, output_folder, csv_folder, video_name)).start()

    print(f'Emedding {video_name} done')

def keyframes_to_embedding(list_keyframes, embedding_folder, embedding_model, video_name, batch_size=32):
    output_folder = os.path.join(embedding_folder, video_name)
    if not os.path.exists(output_folder):
        os.makedirs(output_folder)

    list_embedding = []

    for i in range(0, len(list_keyframes), batch_size):
        batch = list_keyframes[i:i+batch_size]
        embeddings = embedding_batch(batch, embedding_model)
        print(f'Embedding {i} done')
        list_embedding.extend(embeddings)

    Process(target=save_embedding, args=(list_embedding, output_folder)).start()

    print(f'Emedding {video_name} done')

def get_list_keyframes(csv_path, keyframes_folder, video_name):
    df = pd.read_csv(csv_path)
    key_frames = df['frame_number'].values.tolist()

    keyframes_f = os.path.join(keyframes_folder, video_name)

    list_keyframes = []
    for key_frame in key_frames:
        key_frame_path = os.path.join(keyframes_f, f'{key_frame}.jpg')
        list_keyframes.append((key_frame, key_frame_path))

    return list_keyframes

def get_list_frames(frames_folder, video_name):
    frames_folder = os.path.join(frames_folder, video_name)
    list_frames = [(f.split('.')[0], os.path.join(frames_folder, f)) for f in os.listdir(frames_folder) if f.endswith('.jpg')]
    list_frames.sort(key=lambda x: int(x[0]))
    return list_frames

def create_embedding_and_csv(frames_folder, embedding_folder, csv_folder, embedding_model, video_name, batch_size):
    list_frames = get_list_frames(frames_folder, video_name)
    embedding_frame(list_frames, embedding_folder, csv_folder, embedding_model, video_name, batch_size)

def extract_embedding(csv_path, video_name, keyframes_folder, embedding_folder, embedding_model, batch_size):
    list_keyframes = get_list_keyframes(csv_path, keyframes_folder, video_name)
    keyframes_to_embedding(list_keyframes, embedding_folder, embedding_model, video_name, batch_size)

def wfile(folder, ext):
    return [os.path.join(folder, f) for f in os.listdir(folder) if f.endswith(ext)]

def embedding_keyframes(csv_folder, keyframes_folder, embedding_folder, embedding_model, embedding_batch_size=32):
    embedding_model = CLIP_Embedding(*embedding_model)
    videos = wfile(csv_folder, '.csv')

    for video in videos:
        start = timer()
        video_name = os.path.basename(video).split('.')[0]
        csv_path = os.path.join(csv_folder, f'{video_name}.csv')
        extract_embedding(csv_path, video_name, keyframes_folder, embedding_folder, embedding_model, embedding_batch_size)

        print(f'{video_name} done in {timer() - start}')
    del embedding_model
    torch.cuda.empty_cache()

def frames_to_keyframes(frames_folder, csv_folder, embedding_folder, embedding_model, batch_size):
    embedding_model = CLIP_Embedding(*embedding_model)
    videos = [os.path.join(frames_folder, f) for f in os.listdir(frames_folder) if os.path.isdir(os.path.join(frames_folder, f))]

    for video in videos:
        start = timer()
        video_name = os.path.basename(video)
        create_embedding_and_csv(frames_folder, embedding_folder, csv_folder, embedding_model, video_name, batch_size)

        print(f'{video_name} done in {timer() - start}')
    del embedding_model
    torch.cuda.empty_cache()

if __name__ == '__main__':
    embedding_model = EMBEDDING_NAME
    embedding_pretrained = EMBEDDING_PRETRAINED
    embedding_token = EMBEDDING_TOKEN
    device = 'cuda'
    embedding_batch_size_1 = BATCH_CLIP_SIZE
    embedding_batch_size_2 = EMBEDDING_BATCH_SIZE

    csv_folder = CSV_FOLDER
    frames_folder = FRAMES_FOLDER
    embedding_folder = EMBEDDING_FOLDER

    if not os.path.exists(embedding_folder):
        os.makedirs(embedding_folder)
        
    if not os.path.exists(csv_folder):
        os.makedirs(csv_folder)

    frames_to_keyframes(frames_folder, csv_folder, embedding_folder, (embedding_model, embedding_pretrained, device, embedding_token), embedding_batch_size_1)
    time.sleep(3)
    torch.cuda.empty_cache()
    torch.cuda.synchronize()
    time.sleep(3)
    embedding_keyframes(csv_folder, frames_folder, embedding_folder, (EM_MODEL_2, EM_MODEL_2_PRETRAINED, device, EM_TOKEN_2), embedding_batch_size_2)