In [2]:
import os
import faiss
import numpy as np
from tqdm import tqdm
import json
import sys 
sys.path.append("../")
from modules.settings import (
    PROJECT_DIR,
    DEFAULT_CLIP_FEATURE_DIR,
    DEFAULT_CLIP_FEATURE_V2_DIR,
    DEFAULT_DB_V1_DIR,
    DEFAULT_DB_V2_DIR,
)

In [3]:
def faiss_indexing(model: str, feature_length: int, db_dir: str):
    """
    - Indexing clip features in the index object
    - Create an dictionary for mapping from the index of the index object to the keyframe
    """

    features_dir = os.path.join(db_dir, "clip_features", model)
    keyframes_dir = os.path.join(db_dir, "keyframes")

    print(features_dir)
    print(db_dir)

    idx2keyframe = {}
    video2idx = {}
    pack2idx = {}

    i = 0

    index = faiss.IndexFlatIP(feature_length)

    for feature_file in tqdm(
        sorted(os.listdir(features_dir))
    ):  # feature: 'L01_V001.npy'

        # for feature_path in tqdm(sorted(glob.glob(os.path.join(features_dir, data_part) +'/*.npy'))):
        video_name, _ = os.path.splitext(feature_file)  # remove .npy, L01_V001

        keyframe_names = os.listdir(f"{keyframes_dir}/{video_name}")
        keyframe_names = sorted(keyframe_names)

        feature_path = os.path.join(features_dir, feature_file)
        feats = np.load(feature_path)

        start_idx = i
        for idx, feat in enumerate(feats):
            feat = feat.astype(np.float32).reshape(1, -1)
            faiss.normalize_L2(feat)  # Normalize for cosine similarity
            index.add(feat)  # Add feature to the index
            # create a starting index 0
            # update
            idx2keyframe[i] = f"{video_name}/{keyframe_names[idx]}"
            i += 1
        video2idx[video_name] = (start_idx, i - 1)

    for video_name in video2idx:
        pack_name, _ = video_name.split("_")
        start, end = video2idx[video_name]
        if pack_name not in pack2idx:
            pack2idx[pack_name] = [start, end]
        else:
            pack2idx[pack_name][1] = end
        
    # Write the index to a binary file
    index_path = os.path.join(db_dir, "faiss-index", "faiss_clip.bin")
    os.makedirs(os.path.dirname(index_path), exist_ok=True)
    faiss.write_index(index, index_path)

    # Save idx2keyframe mapping as JSON
    idx2keyframe_path = os.path.join(db_dir, "faiss-index", "idx2keyframe.json")
    with open(idx2keyframe_path, "w") as f:
        json.dump(idx2keyframe, f, indent=4)

    video2idx_path = os.path.join(db_dir, "faiss-index", "video2idx.json")
    with open(video2idx_path, "w") as f:
        json.dump(video2idx, f, indent=4)

    pack2idx_path = os.path.join(db_dir, "faiss-index", "pack2idx.json")
    with open(pack2idx_path, "w") as f:
        json.dump(pack2idx, f, indent=4)

    return index

In [4]:
MODEL = "ViT-L-14"
feature_length = 768

index = faiss_indexing(MODEL, feature_length, DEFAULT_DB_V1_DIR)


z:\github\Amatos_hcm_ai\notebooks\../db\clip_features\ViT-L-14
z:\github\Amatos_hcm_ai\notebooks\../db


100%|██████████| 726/726 [00:01<00:00, 501.47it/s]


In [25]:
rquery = np.random.rand(1, 768)
n = len(rquery)
distances = None
labels = None
index.search(rquery, 10)

(array([[1.3920091, 1.2818295, 1.2131524, 1.1747348, 1.157879 , 1.1481402,
         1.1459833, 1.141677 , 1.1226684, 1.0984651]], dtype=float32),
 array([[ 19774, 106168,  19773,  70501,  21189,  57678,  15704,  52076,
           2750,  97878]]))

In [26]:
faiss.IDSelector()

Help on method replacement_search in module faiss.class_wrappers:

replacement_search(x, k, *, params=None, D=None, I=None) method of faiss.swigfaiss.IndexFlatIP instance
    Find the k nearest neighbors of the set of vectors x in the index.
    
    Parameters
    ----------
    x : array_like
        Query vectors, shape (n, d) where d is appropriate for the index.
        `dtype` must be float32.
    k : int
        Number of nearest neighbors.
    params : SearchParameters
        Search parameters of the current search (overrides the class-level params)
    D : array_like, optional
        Distance array to store the result.
    I : array_like, optional
        Labels array to store the results.
    
    Returns
    -------
    D : array_like
        Distances of the nearest neighbors, shape (n, k). When not enough results are found
        the label is set to +Inf or -Inf.
    I : array_like
        Labels of the nearest neighbors, shape (n, k).
        When not enough results ar

In [29]:
?faiss.read_index

[0;31mSignature:[0m [0mfaiss[0m[0;34m.[0m[0mread_index[0m[0;34m([0m[0;34m*[0m[0margs[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0;31mDocstring:[0m <no docstring>
[0;31mFile:[0m      ~/opt/anaconda3/envs/qachatbot/lib/python3.11/site-packages/faiss/swigfaiss.py
[0;31mType:[0m      function

In [6]:
a = [(1,10), (20, 30), (25, 35)]
b = []
for s,e in a:
    b.extend(list(range(s,e+1)))
b


[1,
 2,
 3,
 4,
 5,
 6,
 7,
 8,
 9,
 10,
 20,
 21,
 22,
 23,
 24,
 25,
 26,
 27,
 28,
 29,
 30,
 25,
 26,
 27,
 28,
 29,
 30,
 31,
 32,
 33,
 34,
 35]