In [21]:
import fiftyone as fo
import os
import pandas as pd
import numpy as np
from glob import glob
import torch
from transformers import CLIPProcessor, CLIPModel
from PIL import Image
from sklearn.metrics.pairwise import cosine_similarity
import csv

In [22]:
# run in about 15 seconds
if fo.dataset_exists("AIC_2024"):
    fo.delete_dataset("AIC_2024")
    
dataset = fo.Dataset.from_images_dir(
    name="AIC_2024", 
    images_dir=os.path.join("..", "data", "batch1", "keyframes"), 
    recursive=True
)

 100% |███████████| 106589/106589 [15.5s elapsed, 0s remaining, 6.7K samples/s]      


In [23]:
# run in about 36 seconds
unique_videos = set()
for sample in dataset:
    tmp, sample['video'], sample['keyframe_id'] = sample['filepath'][:-4].rsplit(os.sep, 2)
    sample['batch'] = tmp.rsplit(os.sep, 4)[-3]
    unique_videos.add(sample['video'])
    sample.save()

In [24]:
# run in nearly 40 seconds
video_frameid_dict = {}
for b in [1, 2, 3]:
    for video in unique_videos:
        filepath = os.path.join('..', 'data', f'batch{b}', 'map-keyframes', f'{video}.csv')
        if os.path.exists(filepath):
            a = pd.read_csv(filepath)
            video_frameid_dict[video] = a['frame_idx']

for sample in dataset:
    print(sample['video'] + '-' + sample['keyframe_id'])
    sample['frame_id'] = video_frameid_dict[sample['video']].iloc[int(sample['keyframe_id']) - 1]
    sample.save()

L01_V001-001
L01_V001-002
L01_V001-003
L01_V001-004
L01_V001-005
L01_V001-006
L01_V001-007
L01_V001-008
L01_V001-009
L01_V001-010
L01_V001-011
L01_V001-012
L01_V001-013
L01_V001-014
L01_V001-015
L01_V001-016
L01_V001-017
L01_V001-018
L01_V001-019
L01_V001-020
L01_V001-021
L01_V001-022
L01_V001-023
L01_V001-024
L01_V001-025
L01_V001-026
L01_V001-027
L01_V001-028
L01_V001-029
L01_V001-030
L01_V001-031
L01_V001-032
L01_V001-033
L01_V001-034
L01_V001-035
L01_V001-036
L01_V001-037
L01_V001-038
L01_V001-039
L01_V001-040
L01_V001-041
L01_V001-042
L01_V001-043
L01_V001-044
L01_V001-045
L01_V001-046
L01_V001-047
L01_V001-048
L01_V001-049
L01_V001-050
L01_V001-051
L01_V001-052
L01_V001-053
L01_V001-054
L01_V001-055
L01_V001-056
L01_V001-057
L01_V001-058
L01_V001-059
L01_V001-060
L01_V001-061
L01_V001-062
L01_V001-063
L01_V001-064
L01_V001-065
L01_V001-066
L01_V001-067
L01_V001-068
L01_V001-069
L01_V001-070
L01_V001-071
L01_V001-072
L01_V001-073
L01_V001-074
L01_V001-075
L01_V001-076
L01_V001-077

In [26]:
# run in about 1 minutes
video_keyframe_dict = {}
all_keyframe_paths = glob(os.path.join(os.getcwd(), '..', 'data', 'batch*', 'keyframes',
                            '*', '*', '*.jpg'))

for kf in all_keyframe_paths:
    _, vid, kf = kf[:-4].rsplit(os.sep, 2)
    if vid not in video_keyframe_dict.keys():
        video_keyframe_dict[vid] = [kf]
    else:
        video_keyframe_dict[vid].append(kf)

for k, v in video_keyframe_dict.items():
    video_keyframe_dict[k] = sorted(v)

embedding_dict = {}
for j in [1, 2, 3]:
    for video in unique_videos:
        clip_14_path = os.path.join('..', 'data', f'batch{j}', 
                            'clip-features-14', f'{video}.npy')
        if os.path.exists(clip_14_path):
            a = np.load(clip_14_path)
            embedding_dict[video] = {}
            for i, k in enumerate(video_keyframe_dict[video]):
                embedding_dict[video][k] = a[i]

for sample in dataset:
    sample['clip-14'] = embedding_dict[sample['video']][sample['keyframe_id']]
    sample.save()

KeyError: 'L01_V001'

In [6]:
# run in 10 minutes
device = "cuda" if torch.cuda.is_available() else "cpu"
model = CLIPModel.from_pretrained("openai/clip-vit-large-patch14-336").to(device)
processor = CLIPProcessor.from_pretrained("openai/clip-vit-large-patch14-336")



In [7]:
# run in 11 seconds
image_samples = []
image_embeddings = []
for sample in dataset:
    image_samples.append(sample)
    image_embeddings.append(sample['clip-14']) 
image_embeddings = np.array(image_embeddings)

In [25]:
def submission(text_query, k, csv_file):
    inputs = processor(text=[text_query], return_tensors="pt", padding=True, truncation=True).to(device)
    with torch.no_grad():
        text_features = model.get_text_features(**inputs).cpu().numpy().flatten()
    similarities = cosine_similarity([text_features], image_embeddings)[0]
    top_k_indices = similarities.argsort()[-k:][::-1]

    if fo.dataset_exists("submission"):
        fo.delete_dataset("submission")

    dataset_submission = fo.Dataset(
        name="submission"
    )

    for index in top_k_indices:
        dataset_submission.add_sample(image_samples[index])

    with open(csv_file, mode='w', newline='') as file:
        writer = csv.DictWriter(file, fieldnames=['video', 'frame_id'])
        # writer.writeheader()
        for sample in dataset_submission:
            writer.writerow({'video': sample['video'], 'frame_id': sample['frame_id']})

    return dataset_submission

In [1]:
text_query = "A scene from a radiation emergency response exercise. The first shot shows a person in yellow and blue clothes lying on the ground wearing a mask, followed by a fire brigade using a fire extinguisher to spray smoke. It ends with two people in blue protective suits carrying a victim on a stretcher. How many people use the fire extinguisher in the scene?"
output_file = "output.csv"

output_file = os.path.join('..', 'submission', output_file)
dataset_submission = submission(text_query, 100, output_file)
session = fo.launch_app(dataset_submission, auto=False)
# session.open_tab()

NameError: name 'os' is not defined