In [2]:
# Install vector database
# ! pip install vectordb
!pip install faiss-cpu
!pip install faiss-gpu

## Import modules


In [1]:
import os
import numpy as np
from tqdm.notebook import trange, tqdm
from PIL import Image, ImageFont, ImageDraw
import torch
import clip
import json as js
from docarray import DocList, BaseDoc
from docarray.typing import NdArray
import numpy as np
from vectordb import InMemoryExactNNVectorDB, HNSWVectorDB
from IPython.display import clear_output, display, HTML
from natsort import natsorted
import pandas as pd
from typing import List
import webbrowser
import shutil
import random
from utils import create_html_script, get_all_scripts

## Constants


In [3]:
# CLIP checkpoint name handed to clip.load().
MODEL = "ViT-B/32"
# Use the GPU when torch can see one, otherwise fall back to the CPU.
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
print(DEVICE)

# Dataset locations, relative to the notebook's working directory.
METADATA_PATH = "../data/metadata/"  # <video>.json files read by get_all_docs
KEYFRAME_PATH = "../data/keyframes/"  # one sub-directory of keyframe images per video
FEATURE_PATH = "../data/features/"  # .npy feature files listed by get_all_feats
MAP_KEYFRAMES = "../data/map-keyframes/"  # <video>.csv files read by get_all_docs
VIDEOS_PATH = "../data/videos/"
SCRIPT_PATH = "../data/scripts/"  # <video>.txt transcripts read by filter_by_audio

# Parent directory under which every VectorDB instance creates its own workspace.
WORKSPACE = "./vectordb"

cuda


In [4]:
# Clean DB if necessary: delete every database directory left in WORKSPACE by
# earlier runs. Nothing happens when WORKSPACE does not exist yet (fresh
# checkout), and plain files such as .gitkeep are left alone because
# shutil.rmtree() only accepts directories.
DBs = []
if os.path.isdir(WORKSPACE):
    DBs = [
        os.path.abspath(os.path.join(WORKSPACE, path))
        for path in os.listdir(WORKSPACE)
        if os.path.isdir(os.path.join(WORKSPACE, path))
    ]
for db in DBs:
    shutil.rmtree(db)

## Re-formatting Dataset


#### Get video names


In [5]:
# Expected length of a keyframe file stem (e.g. "0001"); files whose stem has a
# different length are renamed by the normalisation cell below.
LEN_OF_KEYFRAME_NAME = 4

In [6]:
# One entry per video: every sub-directory of KEYFRAME_PATH. Plain files
# (.gitkeep, .DS_Store, ...) are skipped, because the renaming cell calls
# os.listdir() on each entry and that fails on anything but a directory.
video_names = [
    name
    for name in os.listdir(KEYFRAME_PATH)
    if os.path.isdir(os.path.join(KEYFRAME_PATH, name))
]
# print(video_names)

In [7]:
# Normalise keyframe file names so that every numeric stem is zero-padded to
# LEN_OF_KEYFRAME_NAME characters, e.g. "12.jpg" -> "0012.jpg".
for name in video_names:
    video_dir = os.path.join(KEYFRAME_PATH, name)
    for kf in os.listdir(video_dir):
        img_name = kf.split(".")[0]
        # Only numeric stems are keyframes; this also protects hidden files
        # (".gitkeep" has an empty stem) from being renamed to "0000.jpg".
        if not img_name.isdigit() or len(img_name) == LEN_OF_KEYFRAME_NAME:
            continue
        old_path = os.path.join(video_dir, kf)
        changed_path = os.path.join(
            video_dir, img_name.zfill(LEN_OF_KEYFRAME_NAME) + ".jpg"
        )
        # zfill() leaves over-long stems unchanged, so the target can equal the
        # source; and an existing target must never be overwritten.
        if changed_path == old_path or os.path.exists(changed_path):
            print(f"Skip {old_path}: cannot rename to {changed_path}")
            continue
        print(f"Change {old_path} to {changed_path}")
        os.rename(old_path, changed_path)

## Text embedding


In [6]:
class TextEmbedding:
    """Encode text into CLIP text-embedding vectors.

    The CLIP model named by ``MODEL`` is loaded onto ``DEVICE`` once per
    instance.
    """

    def __init__(self):
        self.device = DEVICE
        # clip.load() returns (model, preprocess); the image pre-processing
        # transform is not needed for text encoding.
        self.model, _ = clip.load(MODEL, device=self.device)

    def __call__(self, texts) -> np.ndarray:
        """Return the CLIP embedding(s) of ``texts``.

        The class used to define ``__call__`` twice; Python keeps only the
        last definition, and that one returned row 0 of the batch even when a
        list of texts was passed. Both input kinds are handled here.

        Args:
            texts: a single string, or a list of strings.

        Returns:
            For a single string, the 1-D embedding of that string.
            For a list of strings, a 2-D array with one row per string, in
            input order.
        """
        is_single = isinstance(texts, str)
        text_inputs = clip.tokenize([texts] if is_single else texts).to(self.device)
        with torch.no_grad():
            text_features = self.model.encode_text(text_inputs)
        if is_single:
            # Drop the batch axis so callers get one flat vector.
            text_features = text_features[0]
        return text_features.detach().cpu().numpy()

## Vector Database


### Frame Document Class


In [7]:
class FrameDoc(BaseDoc):
    """One keyframe of a video: its embedding plus the data needed to locate it.

    Every field carries an explicit type annotation. Un-annotated class
    attributes only become model fields through pydantic v1's type inference
    and are rejected by pydantic v2, so the annotations keep the schema
    independent of the installed pydantic version.
    """

    # 512-dimensional embedding vector of the keyframe (or of a text query).
    embedding: NdArray[512]
    # Name of the video the keyframe belongs to.
    video_name: str = ""
    # Path of the keyframe image file.
    image_path: str = ""
    # 1-based position of the keyframe within its video's feature file.
    keyframe_id: int = 0
    # Frame index in the original video ("frame_idx" column of the map CSV).
    actual_idx: int = 0
    # Timestamp of the frame ("pts_time" column of the map CSV).
    actual_time: float = 0.0
    # Frames per second of the video. Declared as float: inferring int from the
    # former default ``0`` silently truncated fractional rates such as 29.97.
    fps: float = 0.0
    # Extra per-video information (get_all_docs stores publish_date / watch_url).
    metadata: dict = {}

    def __str__(self):
        return f"""
            Video name: {self.video_name}
            Image path: {self.image_path}
            Keyframe Id: {self.keyframe_id}
            Actual keyframe idx: {self.actual_idx}
            Time: {self.actual_time}
            FPS: {self.fps}
            Metadata: {self.metadata}
          """

## Database Handler


In [8]:
class VectorDB:
    """Text-searchable vector database of FrameDoc objects.

    Wraps a ``vectordb`` index and embeds query text with ``TextEmbedding``.
    Every instance gets its own workspace directory
    ``<cwd>/<WORKSPACE>/DB_<random id>``.
    """

    # Shared by all instances: created once, when the class body is executed.
    text_embedding = TextEmbedding()
    # Base directory for workspaces (cwd when the class is defined); each
    # instance replaces it with its own workspace path in __init__.
    workspace = os.getcwd()
    method = "ANN"

    def __init__(self, method="ANN"):
        """Create an empty database in a fresh workspace.

        Args:
            method: ``"ANN"`` selects approximate nearest-neighbour search
                (HNSW); any other value selects exhaustive in-memory search.
        """
        # Make sure the parent workspace exists. The default mode is used on
        # purpose: a directory created with mode 0o666 has no execute bit, so
        # on POSIX nothing can be created or opened inside it.
        os.makedirs(WORKSPACE, exist_ok=True)

        # Ids already taken by "DB_<id>" entries. Anything else found in
        # WORKSPACE (e.g. ".gitkeep") is ignored instead of crashing the parse.
        existing_ids = set()
        for name in os.listdir(WORKSPACE):
            prefix, _, suffix = name.partition("_")
            if prefix == "DB" and suffix.isdecimal():
                existing_ids.add(int(suffix))

        # Create new workspace: draw random ids until an unused one is found.
        while True:
            db_id = random.getrandbits(128)
            if db_id not in existing_ids:
                break
        self.workspace = os.path.join(self.workspace, WORKSPACE, "DB_" + str(db_id))

        self.method = method
        if method == "ANN":
            # Approximate Nearest Neighbour based on the HNSW algorithm
            self.DB = HNSWVectorDB[FrameDoc](workspace=self.workspace)
        else:
            # Exhaustive search on the embeddings
            self.DB = InMemoryExactNNVectorDB[FrameDoc](workspace=self.workspace)

    def index(self, doc_list: List[FrameDoc]):
        """Add every document of ``doc_list`` to the database."""
        self.DB.index(inputs=DocList[FrameDoc](doc_list))

    def search(self, query_text: str, topk=100):
        """Return up to ``topk`` documents matching ``query_text``.

        The text is embedded with the shared ``text_embedding`` and wrapped in
        a FrameDoc used as the single query; its matches are returned.
        """
        query_doc = FrameDoc(embedding=self.text_embedding(query_text))
        results = self.DB.search(inputs=DocList[FrameDoc]([query_doc]), limit=topk)
        # One query went in, so the matches of result 0 are the answer.
        return results[0].matches

    def delete(self, del_doc_list: List[FrameDoc]):
        """Remove the documents of ``del_doc_list`` from the database."""
        self.DB.delete(docs=DocList[FrameDoc](del_doc_list))

### Needed functions


### Get all features files


In [9]:
def get_all_feats():
    """Return the paths of all ``.npy`` files in FEATURE_PATH, sorted by name.

    os.listdir() yields entries in arbitrary order; sorting keeps the order of
    the documents built from these files identical from one run to the next.
    """
    return [
        os.path.join(FEATURE_PATH, file)
        for file in sorted(os.listdir(FEATURE_PATH))
        if file.endswith(".npy")
    ]

In [10]:
# List the feature files once and report how many were found.
all_feat_files = get_all_feats()
# print(all_feat_files)
print(len(all_feat_files))

737


### Create all the Docs


In [12]:
def get_all_docs(npy_files):
    """Build one FrameDoc per keyframe embedding stored in ``npy_files``.

    For a feature file ``<video>.npy`` this reads ``<video>.json`` from
    METADATA_PATH and ``<video>.csv`` from MAP_KEYFRAMES. Row ``i`` of the
    feature array is paired with row ``i`` of the CSV and with the keyframe
    image ``<i + 1, zero-padded to 4 digits>.jpg`` of that video.

    Args:
        npy_files: paths of the ``.npy`` feature files.

    Returns:
        A list of FrameDoc, in the order of ``npy_files`` and of the rows
        inside each file.

    Raises:
        ValueError: if a map-keyframes CSV has fewer rows than its feature file.
    """
    doc_list = []
    for feat_npy in npy_files:
        # ".../L01_V001.npy" -> "L01_V001". Taken from the file name itself, so
        # a directory component containing the letter "L" cannot confuse it.
        video_name = os.path.splitext(os.path.basename(feat_npy))[0]
        feats_arr = np.load(feat_npy)

        # Load metadata; only the two fields used downstream are kept.
        with open(
            os.path.join(METADATA_PATH, video_name + ".json"), encoding="utf-8"
        ) as meta_f:
            full_metadata = js.load(meta_f)
        metadata = {key: full_metadata[key] for key in ["publish_date", "watch_url"]}

        map_kf = pd.read_csv(
            os.path.join(MAP_KEYFRAMES, video_name + ".csv"),
            usecols=["pts_time", "fps", "frame_idx"],
        )
        if len(map_kf) < len(feats_arr):
            raise ValueError(
                f"{video_name}: {len(feats_arr)} feature rows but only "
                f"{len(map_kf)} rows in the map-keyframes CSV"
            )
        # Plain Python lists: much cheaper to index inside the loop than a
        # pandas Series, and they hand native int/float values to FrameDoc.
        frame_indices = map_kf["frame_idx"].tolist()
        frame_times = map_kf["pts_time"].tolist()
        frame_rates = map_kf["fps"].tolist()

        for frame_idx, feat in enumerate(feats_arr):
            keyframe_id = frame_idx + 1  # keyframe files are numbered from 1
            doc_list.append(
                FrameDoc(
                    embedding=feat,
                    video_name=video_name,
                    image_path=os.path.join(
                        KEYFRAME_PATH, video_name, f"{keyframe_id:04d}.jpg"
                    ),
                    keyframe_id=keyframe_id,
                    actual_idx=frame_indices[frame_idx],
                    actual_time=frame_times[frame_idx],
                    fps=frame_rates[frame_idx],
                    metadata=metadata,
                )
            )

    return doc_list

In [17]:
# Build the FrameDoc list from every feature file collected above.
doc_list = get_all_docs(all_feat_files)

In [18]:
# Sanity check: total number of documents and one sample document.
print(len(doc_list))
print(doc_list[100])

202148

            Video name: L01_V001
            Image path: ../data/keyframes/L01_V001/0101.jpg
            Keyframe Id: 101
            Actual keyframe idx: 11047
            Time: 441.88
            Metadata: {'publish_date': '01/12/2022', 'watch_url': 'https://youtube.com/watch?v=HNsRpkryGXA'}
          


## Utils


### !!! Vip pro UI


In [19]:
def FrameDocToImage(docs):
    """Turn FrameDoc-like objects into the plain dicts used for display.

    Each returned dict holds:
        link:  the part of ``metadata["watch_url"]`` after the last ``v=``
        path:  the keyframe image path
        video: the video name
        frame: the frame index in the original video
        s:     the timestamp as ``<minutes>'<seconds rounded to 0.1>``
    """
    images = []
    for doc in docs:
        video_link = doc.metadata["watch_url"].split("v=")[-1]
        whole_minutes = int(doc.actual_time) // 60
        seconds_left = round(doc.actual_time - 60 * whole_minutes, 1)
        images.append(
            {
                "link": video_link,
                "path": doc.image_path,
                "video": doc.video_name,
                "frame": doc.actual_idx,
                "s": str(whole_minutes) + "'" + str(seconds_left),
            }
        )
    return images

In [20]:
def visualize(docs):
    """Display ``docs`` in the notebook using the HTML built by create_html_script."""
    image_records = FrameDocToImage(docs)
    html_markup = create_html_script(image_records)
    display(HTML(html_markup))

### Open video at specific times


In [21]:
# def open_video(doc: FrameDoc):
#     webbrowser.open(doc.metadata["watch_url"])

## Visualization functions


In [22]:
# def get_images(results):
#     images = []
#     for i, res in enumerate(results):
#         img = Image.open(res.image_path)
#         draw = ImageDraw.Draw(img)
#         font = ImageFont.truetype("arial.ttf", 50)
#         draw.text(xy=(5, 5), text=f"{i}, {res.video_name}, {res.actual_idx}", align="left", fill=(255,0,0,255), font=font)
#         images.append(img)
#     return images

In [23]:
# def visualize(imgs: List[Image.Image]) -> None:
#     rows = len(imgs) // 2 # see more clearly
#     if not rows:
#         rows += 1
#     cols = len(imgs) // rows
#     if rows * cols < len(imgs):
#         rows += 1
#     w, h = imgs[0].size
#     grid = Image.new('RGB', size=(cols * w, rows * h))
#     grid_w, grid_h = grid.size

#     for i, img in enumerate(imgs):

#         grid.paste(img, box=(i % cols * w, i // cols * h))

#     display(grid)

## Filter by audio

In [46]:
SCRIPT_PATH = "../data/scripts/"


def check_script(file_content, keywords):
    """Return True when every keyword occurs in ``file_content``.

    Matching is a case-sensitive substring test; an empty ``keywords`` gives
    True.
    """
    return all(keyword in file_content for keyword in keywords)


def filter_by_audio(result, keywords, script_path=None):
    """Remove, in place, the results whose video transcript lacks a keyword.

    The transcript of a result is ``<script_path>/<video_name>.txt``. Results
    whose transcript file does not exist are kept (best effort: no transcript
    means no evidence against the result).

    Args:
        result: mutable sequence of objects with a ``video_name`` attribute;
            it is modified in place.
        keywords: strings that must all occur in the transcript.
        script_path: directory holding the transcripts; defaults to
            SCRIPT_PATH.

    Returns:
        The same ``result`` object, after filtering.
    """
    if script_path is None:
        script_path = SCRIPT_PATH

    # video_name -> keep? Many results share a video, so each transcript is
    # read and checked only once.
    keep_video = {}
    # Walk backwards so that pop() does not shift the indices still to visit.
    for i in range(len(result) - 1, -1, -1):
        video_name = result[i].video_name
        if video_name not in keep_video:
            transcript_path = os.path.join(script_path, video_name + ".txt")
            try:
                # Explicit encoding: the transcripts are not ASCII, and the
                # platform default encoding is not UTF-8 everywhere.
                with open(transcript_path, "r", encoding="utf-8") as file:
                    keep_video[video_name] = check_script(file.read(), keywords)
            except FileNotFoundError:
                keep_video[video_name] = True
        if not keep_video[video_name]:
            result.pop(i)
    return result

## DEMO


### Create DB


In [25]:
# First-stage database (default method "ANN") holding every keyframe document.
DB = VectorDB()
DB.index(doc_list)

### Query


#### Query text


Đoạn video hai người đàn ông đang chỉ vào các bức tranh bên trên tường. Trong các bức tranh, có một bức vẽ hình Bác Hồ trên nền màu đỏ ở giữa các bức tranh còn lại. Trên một vách tường khác có treo các khung nhìn như bằng khen.

### Filter 1


In [26]:
# First-stage query. The text is embedded as written, so the run-together
# "coatstanding" is spelled as the two words it was meant to be.
results1 = DB.search(
    "Video of a man wearing a green shirt holding a piece of paper. There was a woman wearing a coat standing and listening.",
    1000,
)  # Fetch generously: the later filtering stages can only narrow this set

In [27]:
# Show the first 100 first-stage matches.
visualize(results1[:100])

### Filter 2


In [None]:
# Second-stage database: index only the first-stage matches, so that the next
# query searches within that candidate set.
DB2 = VectorDB()
DB2.index(results1)

In [None]:
# Second query, searched only among the documents indexed in DB2.
results2 = DB2.search("A bag is on the road", 500)

In [None]:
# Show the second-stage matches (this cell used to display results1 again, so
# the outcome of the second filter was never shown).
visualize(results2[:50])

## Test script with bert

In [92]:
import re
import faiss
from transformers import BertTokenizer, BertModel
from transformers import AutoTokenizer, AutoModel
from sklearn.metrics.pairwise import cosine_similarity
from utils import create_html_script, get_all_scripts

In [93]:
# get_all_scripts() comes from the project's utils module. Judging by the
# embedding loop further down, list_script[i] is the name of the script whose
# text is documents[i] — TODO confirm against utils.
list_script, documents = get_all_scripts()
len(documents)

740

In [94]:
# Khởi tạo tokenizer và mô hình BERT
model_name = "vinai/phobert-base"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [95]:
# Run the model on the GPU when one is available, otherwise on the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# NOTE(review): as the last expression of the cell this echoes the whole model
# repr as the cell output.
model.to(device)

RobertaModel(
  (embeddings): RobertaEmbeddings(
    (word_embeddings): Embedding(64001, 768, padding_idx=1)
    (position_embeddings): Embedding(258, 768, padding_idx=1)
    (token_type_embeddings): Embedding(1, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): RobertaEncoder(
    (layer): ModuleList(
      (0-11): 12 x RobertaLayer(
        (attention): RobertaAttention(
          (self): RobertaSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): RobertaSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
            (dropou

In [96]:
def separate_paragraphs(script, max_word=128):
    """Split ``script`` into chunks of about ``max_word`` words.

    Words are the ``\\w+`` runs of the text, so punctuation is dropped and the
    words of a chunk are re-joined with single spaces. Every chunk holds
    exactly ``max_word`` words, except the last one, which also takes the
    remainder (so it holds fewer than ``2 * max_word`` words).

    A script with fewer than ``max_word`` words is returned as one chunk. It
    used to produce no chunk at all, which silently dropped short scripts.

    Args:
        script: the text to split.
        max_word: target number of words per chunk; must be at least 1.

    Returns:
        The list of chunks; empty when ``script`` contains no word.

    Raises:
        ValueError: if ``max_word`` is smaller than 1.
    """
    if max_word < 1:
        raise ValueError(f"max_word must be at least 1, got {max_word}")

    # Split the text into a list of words
    words = re.findall(r'\b\w+\b', script)
    if not words:
        return []

    # Number of chunks; at least one, so short scripts are kept
    n_child_script = max(1, len(words) // max_word)

    # Build the list of chunks
    child_scripts = []
    for i in range(n_child_script):
        start = i * max_word
        # Last chunk: take all the remaining words (open-ended slice)
        end = None if i == n_child_script - 1 else start + max_word
        child_scripts.append(' '.join(words[start:end]))

    return child_scripts

In [97]:
# WITH BERT
document_embeddings = []
name_parents = []
for i, doc in enumerate(tqdm(documents)):
    child_scripts = separate_paragraphs(doc)
    for i_child, part in enumerate(child_scripts):
        inputs = tokenizer(part, return_tensors="pt", padding=True, truncation=True)
        inputs = {key: value.to(device) for key, value in inputs.items()}  # Chuyển dữ liệu lên GPU
        with torch.no_grad():
            outputs = model(**inputs)
        embeddings = outputs.last_hidden_state.mean(dim=1)  # Sử dụng trung bình của các embeddings từ BERT
        document_embeddings.append(embeddings)
        name_parents.append(list_script[i])

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 740/740 [09:03<00:00,  1.36it/s]


In [98]:
# Stack the per-chunk embeddings into a single NumPy array on the CPU.
np_document_embeddings = np.vstack([x.cpu().numpy() for x in document_embeddings])
print(np_document_embeddings.shape)

(20147, 768)


## Setup FAISS

In [99]:
# Embedding dimensionality: size of axis 1 of the stacked array.
d = np_document_embeddings.shape[1]
d

768

In [100]:
# FAISS flat L2 index over d-dimensional vectors.
index = faiss.IndexFlatL2(d)
index.is_trained

True

In [101]:
# Add every chunk embedding; the indices returned by index.search() are later
# used as positions into name_parents.
index.add(np_document_embeddings)

#### Input keyword

In [102]:
# Embed the keyword query the same way as the script chunks: tokenize, run the
# model without gradients, then average last_hidden_state over dim 1.
search_string = "tông tai nạn gây thương tích"
search_inputs = tokenizer(search_string, return_tensors="pt", padding=True, truncation=True)
search_inputs = {key: value.to(device) for key, value in search_inputs.items()}  # Move the inputs to the model's device
with torch.no_grad():
    search_outputs = model(**search_inputs)
search_embedding = search_outputs.last_hidden_state.mean(dim=1)

In [103]:
# Convert the query embedding to a NumPy array on the CPU for the FAISS search.
search = search_embedding.cpu().numpy()
search.shape

(1, 768)

In [105]:
# Number of nearest chunks to retrieve.
k = 500
# D and I are the two arrays returned by the search; I holds the indices of the
# matching chunks (used below to look up name_parents).
D, I = index.search(search, k)  # search
I.shape

(1, 500)

In [120]:
# Map the retrieved chunk indices back to their parent scripts and drop the
# last 4 characters of each name (presumably the ".txt" extension — TODO confirm).
# dict.fromkeys() removes duplicates while keeping the order of first
# appearance in the search result, which a set would scramble.
top_videos = list(
    dict.fromkeys(
        name_parents[i][:-4]
        for i in I[0]
        # FAISS pads the result with -1 when fewer than k vectors are found;
        # -1 would otherwise silently select the last chunk.
        if i >= 0
    )
)
len(top_videos)

296

#### Create csv file

In [160]:
def create_csv(file_name, list_frame, list_video):
    """Write one ``<video>, <frame>`` line per selected frame to ``file_name``.

    Args:
        file_name: path of the output file (named after the query).
        list_frame: FrameDoc results, e.g. a slice of ``results1``.
        list_video: video names to keep, e.g. obtained from the BERT transcript
            search or from a manual keyword filter. When it is empty, every
            frame is written.
    """
    image_rows = FrameDocToImage(list_frame)
    with open(file_name, 'w') as out_file:
        for image_row in image_rows:
            video = image_row["video"]
            # A non-empty list_video acts as a whitelist of video names.
            if len(list_video) != 0 and video not in list_video:
                continue
            out_file.write(f"{video}, {image_row['frame']}\n")

In [161]:
# Export the first 100 first-stage matches, restricted to the videos selected
# by the transcript search.
create_csv("test.csv", results1[:100], top_videos)