In [1]:
import os
import sys

# The project imports below can only resolve once the application directory is
# on sys.path, so the path is set up *before* them. Previously the path was
# appended in a later cell, which makes this cell fail on a fresh kernel unless
# `core` happens to be importable some other way.
# NOTE(review): '../../app' is resolved against the kernel's working directory;
# presumably that is where the `core` package lives - confirm.
_APP_DIR = os.path.abspath('../../app')
if _APP_DIR not in sys.path:
    sys.path.append(_APP_DIR)

import pprint
from datetime import datetime

from core.operators import vid_vec_rep_clip
from core.models.media_factory import VideoFactory
from core.store.es_vec import ES
# NOTE(review): the wildcard import is kept because later cells use
# QueueConfig and StoreESParameters, which are not imported by name anywhere
# else; presumably they come from core.config - prefer importing them explicitly.
from core.config import *
from core.models.media import MediaType

# Shared pretty-printer used when displaying search results.
pp = pprint.PrettyPrinter(indent=4)

In [2]:
# Make the application directory importable. The membership check keeps the
# cell idempotent: re-running it no longer appends the same path again.
# The relative path is resolved against the kernel's current working directory.
app_dir = os.path.abspath('../../app')
if app_dir not in sys.path:
    sys.path.append(app_dir)

In [4]:
# Sample clips to embed and index.
video_links = [
    "https://github.com/aatmanvaidya/audio-files/raw/main/feluda-media/en-speech.mp4",
    "https://github.com/aatmanvaidya/audio-files/raw/main/feluda-media/hi-speech.mp4",
    "https://github.com/aatmanvaidya/audio-files/raw/main/feluda-media/ta-speech.mp4"
]

# The last path segment of each URL, in the same order as video_links.
file_names = [link.rsplit('/', 1)[-1] for link in video_links]


In [5]:
def get_vector_representation_of_video(video_links):
    """Run the CLIP video operator on every URL in ``video_links``.

    The operator is initialized once with an empty config, then each link is
    turned into a video object via ``VideoFactory.make_from_url`` and passed
    to ``vid_vec_rep_clip.run``.

    Args:
        video_links: Iterable of video URLs.

    Returns:
        A list with one operator result per link, in input order.
    """
    vid_vec_rep_clip.initialize({})
    return [
        vid_vec_rep_clip.run(VideoFactory.make_from_url(url))
        for url in video_links
    ]

# One operator result per entry of video_links, in the same order.
# The storing cell's output shows these results are generator objects, so each
# one can only be iterated once; re-run this cell before re-using them.
final_embeddings = get_vector_representation_of_video(video_links)
        
    

Installing packages for vid_vec_rep_clip


  from .autonotebook import tqdm as notebook_tqdm


Downloading video from URL
100% [..............................................................................] 48488 / 48488
Video downloaded
Downloading video from URL
100% [..............................................................................] 47732 / 47732
Video downloaded
Downloading video from URL
100% [..............................................................................] 51309 / 51309
Video downloaded


In [6]:
# Set up the Elasticsearch store used by the rest of the notebook.

# Host and per-media-type index names.
param_dict = {
    "host_name": "es",
    "text_index_name": "text",
    "image_index_name": "image",
    "video_index_name": "video",
    "audio_index_name": "audio",
}

# The keys of param_dict are exactly the keyword arguments StoreESParameters
# is given, so the dict is unpacked instead of being repeated field by field.
param = QueueConfig(
    label="test",
    type="es",
    parameters=StoreESParameters(**param_dict),
)

# Build the store client from the config and open the connection.
es = ES(param)
es.connect()

### Storing embeddings into Elasticsearch

In [7]:
def generate_document(post_id: str, representation: any):
    """Build a document factory for the vectors of one video.

    Args:
        post_id: Value stored in every document's "dataset" field (the notebook
            passes the video's file name).
        representation: Iterable of mappings, each providing "vid_vec" and
            "is_avg" keys.
            NOTE(review): the ``any`` annotation is the builtin function, not
            ``typing.Any``; it is left untouched to keep the signature as-is.

    Returns:
        A zero-argument generator function. Calling it yields one document
        dict per item of ``representation``, targeting the "video" index.
        If ``representation`` is itself a generator, the returned function
        only produces documents the first time it is iterated.
    """
    # Fields shared by every document of this video; the timestamp is taken
    # once, so all documents of one video carry the same "date_added".
    base_doc = {
        "e_kosh_id": "",
        "dataset": post_id,
        "metadata": None,
        "date_added": datetime.now().isoformat(),
    }

    def generator_doc():
        for vector in representation:
            # Yield a fresh dict per vector. Mutating and re-yielding one
            # shared dict would make every buffered document alias the same
            # object and end up holding only the last vector.
            yield {
                **base_doc,
                "_index": "video",
                "vid_vec": vector["vid_vec"],
                "is_avg": vector["is_avg"],
            }

    return generator_doc


# Index every video: file_names and final_embeddings are parallel lists, so
# zip pairs each file name with the embedding produced for it.
# NOTE(review): the embeddings are generator objects (see the printed output),
# so re-running this cell without recomputing final_embeddings would iterate
# exhausted generators - presumably nothing new would be stored; confirm.
for file_name,embedding in zip(file_names,final_embeddings):
    # Prints the generator's repr, not the vectors themselves.
    print(file_name,embedding)
    # doc is a generator *function*; es.store receives it un-called.
    doc = generate_document(file_name,embedding)
    media_type = MediaType.VIDEO
    result = es.store(media_type, doc)
    print("result:", result)


en-speech.mp4 <generator object gendata at 0x738989a535a0>
----> 6 (2, [])
result: {'message': 'multiple media stored'}
hi-speech.mp4 <generator object gendata at 0x738989a52ce0>
----> 6 (2, [])
result: {'message': 'multiple media stored'}
ta-speech.mp4 <generator object gendata at 0x738989a534c0>
----> 6 (2, [])
result: {'message': 'multiple media stored'}


### Checking if embeddings are stored correctly

In [9]:
# Ask Elasticsearch for its index listing (with a header row) to check that the
# "video" index exists and holds documents.
# NOTE(review): docs.count covers everything in the index, not only what this
# run stored - presumably documents from earlier runs are included.
!curl -X GET "http://es:9200/_cat/indices?v"

health status index uuid                   pri rep docs.count docs.deleted store.size pri.store.size dataset.size
yellow open   video B1F4p1UGTMy-lFPMjw8g3g   1   1         44            0    834.1kb        834.1kb      834.1kb


### Searching for a specific media item

In [19]:
def search_video_vector(video_url):
    """Embed the video at ``video_url``, search the "video" index, and report a match.

    The first item produced by ``vid_vec_rep_clip.run`` supplies the query
    vector (its "vid_vec" entry). The hits returned by ``es.find`` are
    pretty-printed, followed by a line saying whether any hit's "dataset"
    field equals the last path segment of the URL.

    Args:
        video_url: URL of the video to look up.

    Returns:
        None. All results are written to stdout.
    """
    file_name = video_url.rsplit('/', 1)[-1]

    # Only the first yielded vector is used for the query.
    # NOTE(review): presumably this is the averaged vector - confirm against the operator.
    vectors = vid_vec_rep_clip.run(VideoFactory.make_from_url(video_url))
    query = next(vectors)
    hits = es.find("video", query.get("vid_vec"))

    divider = "."*50
    print(divider)
    print("SEARCH RESULTS \n : ")
    pp.pprint(hits)

    # Stops at the first hit whose "dataset" matches the file name.
    matched = any(hit.get("dataset") == file_name for hit in hits)

    print(divider)
    if matched:
        print(f"File {file_name} found in search result")
    else:
        print(f"File {file_name} not found in search result")

    

    
    

In [20]:
# Look up the first clip from video_links, which was indexed above, so a match
# on "en-speech.mp4" is the expected outcome.
search_video_vector("https://github.com/aatmanvaidya/audio-files/raw/main/feluda-media/en-speech.mp4")

Downloading video from URL
100% [..............................................................................] 48488 / 48488
Video downloaded
calculation: 1 / (1 + l2norm(params.query_vector, 'vid_vec'))
..................................................
SEARCH RESULTS 
 : 
[   {   'dataset': 'en-speech.mp4',
        'dist': 0.9999998,
        'doc_id': 'XdQMqpIBbNdOPzkpIBHY',
        'e_kosh_id': '',
        'metadata': None,
        'text': None},
    {   'dataset': 'en-speech.mp4',
        'dist': 0.9999998,
        'doc_id': 'XtQMqpIBbNdOPzkpIBHa',
        'e_kosh_id': '',
        'metadata': None,
        'text': None},
    {   'dataset': 'en-speech.mp4',
        'dist': 0.9999998,
        'doc_id': 'QF_VqZIBOE8dDT7tnO5U',
        'e_kosh_id': '',
        'metadata': None,
        'text': None},
    {   'dataset': 'en-speech.mp4',
        'dist': 0.9999998,
        'doc_id': 'QV_VqZIBOE8dDT7tnO5U',
        'e_kosh_id': '',
        'metadata': None,
        'text': None},
    {  