In [1]:
# Necessary Imports
import os
import json
from opensearchpy import OpenSearch
import pprint as pp
from sentence_transformers import SentenceTransformer
import numpy as np

1. Load Dataset and some Analysis

In [2]:
# Load dataset
with open("train.json", "r", encoding="utf-8") as f:
    data = json.load(f)

# Count number of moments per video
video_counts = {vid: len(info["timestamps"]) for vid, info in data.items()}

# Sort by most moments and get the top 20
top_videos = sorted(video_counts.items(), key=lambda x: x[1], reverse=True)[:20]

# Print full breakdown for each video
for rank, (video_id, count) in enumerate(top_videos, 1):
    entry = data[video_id]
    timestamps = entry["timestamps"]
    captions = entry["sentences"]
    url = f"https://www.youtube.com/watch?v={video_id[2:]}"

    print(f"\n Rank: {rank}. Video ID: {video_id}")
    print(f" URL: {url}")
    print(f" Number of Moments: {count}")

    for i, ((start,end), caption) in enumerate(zip(timestamps, captions)):
        if i >= 3:
            break
        print(f" {i+1:02d}) [{start:.2f} - {end:.2f}] {caption.strip()}")
   


 Rank: 1. Video ID: v_3l7quTy4c2s
 URL: https://www.youtube.com/watch?v=3l7quTy4c2s
 Number of Moments: 27
 01) [0.00 - 7.84] A cheerleader girl stands in the grass.
 02) [4.36 - 7.84] A young man wearing baseball hat greats her with a pie tin and jokes with her.
 03) [7.84 - 15.69] The man pushes a pie tin in the girls face as a joke getting her very messy.

 Rank: 2. Video ID: v_tuhHQ-lHIs4
 URL: https://www.youtube.com/watch?v=tuhHQ-lHIs4
 Number of Moments: 26
 01) [3.35 - 6.70] A group of kids poses with the arms in the air.
 02) [8.13 - 10.52] Another group of kids poses holding up peace signs.
 03) [11.00 - 13.39] A third group of kids poses with their arms around each other.

 Rank: 3. Video ID: v_-rKS00dzFxQ
 URL: https://www.youtube.com/watch?v=-rKS00dzFxQ
 Number of Moments: 22
 01) [8.10 - 23.14] A man and a woman stand by a table speaking to the camera.
 02) [24.30 - 39.34] A recipe of mashed potatoes sits on the table.
 03) [40.50 - 53.23] The man peels and cuts potatoes

2. Create OpenSearch Index + Mappings

In [3]:
# Setup OpeanSearch connection
host = 'api.novasearch.org'
port = 443
user = 'user04'
password = 'no.LIMITS2100'
index_name = user

# Create OpenSearch client
client = OpenSearch(
    hosts=[{'host': host, 'port': port}],
    http_compress = True,
    http_auth = (user, password),
    use_ssl = True,
    url_prefix = 'opensearch_v2',
    verify_certs = False,
    ssl_assert_hostname = False,
    ssl_show_warn = False,
)

In [16]:
# Define index setting and mappings
index_body = {
    "settings": {
        "index": {
            "number_of_shards": 4,
            "number_of_replicas": 0,
            "refresh_interval": "1s",
            "knn": "true"
        }
    },
    "mappings": {
        "dynamic": "strict",
        "properties": {
            "video_id": {"type": "keyword"},
            "video_url": {"type": "keyword"},
            "start": {"type": "float"},
            "end": {"type": "float"},
            "duration": {"type": "float"},
            "caption_bow": {"type": "text", "analyzer": "standard", "similarity": "BM25"},
            "caption_vec": {"type": "knn_vector", "dimension": 384}
        }
    }
}

# Create index only if not exists
if client.indices.exists(index=index_name):
    print("Index already exists.")
else:
    response = client.indices.create(index=index_name, body=index_body)
    print("Index created.")
    pp.pprint(response)

# For Debug: Display index settings and mappings
print("\nIndex Settings:")
pp.pprint(client.indices.get_settings(index=index_name))

print("\nIndex Mappings:")
pp.pprint(client.indices.get_mapping(index=index_name))

Index created.
{'acknowledged': True, 'index': 'user04', 'shards_acknowledged': True}

Index Settings:
{'user04': {'settings': {'index': {'creation_date': '1743948594071',
                                   'knn': 'true',
                                   'number_of_replicas': '0',
                                   'number_of_shards': '4',
                                   'provided_name': 'user04',
                                   'refresh_interval': '1s',
                                   'replication': {'type': 'DOCUMENT'},
                                   'uuid': 'M7NxhmYtSle6blVH4DVXjQ',
                                   'version': {'created': '136387927'}}}}}

Index Mappings:
{'user04': {'mappings': {'dynamic': 'strict',
                         'properties': {'caption_bow': {'analyzer': 'standard',
                                                        'similarity': 'BM25',
                                                        'type': 'text'},
                       

In [15]:
# Index Deletion
if client.indices.exists(index=index_name):
    # Delete the index
    response = client.indices.delete(
        index = index_name
    )
    print('\nDeleting index:')
    print(response)


Deleting index:
{'acknowledged': True}


4. Index Documents (Text + Embeddings)

In [17]:
# Load the BERT model (384-dim)
model = SentenceTransformer("all-MiniLM-L6-v2")

# Your 10 selected video IDs
selected_ids = [
    "v_-rKS00dzFxQ",
    "v_-fjUWhSM6Hc",
    "v_v7o9uSu9AVI",
    "v_RJpWgi0EaUE",
    "v_G7kqlq8WhRo",
    "v_jTMdMnbW9OI",
    "v_9wtMJoqGTg0",
    "v_Ffi7vDa3C2I",
    "v_JRr3BruqS2Y",
    "v_Mkljhl3D9-Q",
]

# Load train.json
with open("train.json", "r", encoding="utf-8") as f:
    data = json.load(f)

# Index each moment
doc_id = 0
for video_id in selected_ids:
    entry = data.get(video_id)
    if not entry:
        continue

    url = f"https://www.youtube.com/watch?v={video_id[2:]}"
    duration = float(entry["duration"])
    timestamps = entry["timestamps"]
    captions = entry["sentences"]

    for (start, end), caption in zip(timestamps, captions):
        doc_id += 1

        # Compute the embedding
        embedding = model.encode(caption.strip(), normalize_embeddings=True).tolist()

        # Create the document
        doc = {
            "video_id": video_id,
            "video_url": url,
            "start": float(start),
            "end": float(end),
            "duration": duration,
            "caption_bow": caption.strip(),
            "caption_vec": embedding
        }

        # Index in OpenSearch
        response = client.index(index=index_name, id=doc_id, body=doc)
        print(f"Indexed document ID {doc_id} for video {video_id}: {response['result']}")

Indexed document ID 1 for video v_-rKS00dzFxQ: created
Indexed document ID 2 for video v_-rKS00dzFxQ: created
Indexed document ID 3 for video v_-rKS00dzFxQ: created
Indexed document ID 4 for video v_-rKS00dzFxQ: created
Indexed document ID 5 for video v_-rKS00dzFxQ: created
Indexed document ID 6 for video v_-rKS00dzFxQ: created
Indexed document ID 7 for video v_-rKS00dzFxQ: created
Indexed document ID 8 for video v_-rKS00dzFxQ: created
Indexed document ID 9 for video v_-rKS00dzFxQ: created
Indexed document ID 10 for video v_-rKS00dzFxQ: created
Indexed document ID 11 for video v_-rKS00dzFxQ: created
Indexed document ID 12 for video v_-rKS00dzFxQ: created
Indexed document ID 13 for video v_-rKS00dzFxQ: created
Indexed document ID 14 for video v_-rKS00dzFxQ: created
Indexed document ID 15 for video v_-rKS00dzFxQ: created
Indexed document ID 16 for video v_-rKS00dzFxQ: created
Indexed document ID 17 for video v_-rKS00dzFxQ: created
Indexed document ID 18 for video v_-rKS00dzFxQ: created
I

Text-based search

In [None]:
# BM25 Keyword Search
# This query uses classic keyword-based search over the caption_bow field.
# It ranks results based on word overlap between the query and the caption.
# BM25 does not understand synonyms or semantic meaning.

# Based on the Lab02a_OpeanSearch text-based search
qtxt = "woman talking"

query_bm25 = {
    "size": 5,
    "_source": ["video_id", "video_url", "start", "end", "caption_bow"],
    "query": {
        "match": {
            "caption_bow": qtxt
        }
    }
}

response = client.search(index=index_name, body=query_bm25)

print("\nBM25 Keyword Search Results:")
for hit in response["hits"]["hits"]:
    doc = hit["_source"]
    print(f"- {doc['video_id']} [{doc['start']:.2f}-{doc['end']:.2f}]: {doc['caption_bow']}")


res = client.count(index=index_name)
print(f"Indexed docs: {res['count']}")


BM25 Keyword Search Results:
- v_JRr3BruqS2Y [106.84-119.08]: A woman in a brown hat is talking.
- v_JRr3BruqS2Y [152.47-163.60]: A woman in a pink shirt is talking.
- v_JRr3BruqS2Y [214.80-222.59]: An older woman is talking to the camera.
- v_JRr3BruqS2Y [72.34-77.91]: It goes back to the man in the black shirt talking.
- v_JRr3BruqS2Y [63.44-70.11]: It goes back to the man in a black hat talking.
Indexed docs: 189


In [33]:
# Semantic Search (KNN Vector Query)
# Encodes the query text as a 384-dimensional embedding using Sentence-BERT.
# Uses k-Nearest Neighbors search to find semantically similar captions.
# Returns top 5 moments that are most conceptually similar, even if they don't share keywords.

# Semantic Search (KNN Vector Query)
query_text = "man dancing"
embedding = model.encode(query_text, normalize_embeddings=True).tolist()

query_knn = {
    "size": 5,
    "_source": ["video_id", "video_url", "start", "end", "caption_bow"],
    "query": {
        "knn": {
            "caption_vec": {
                "vector": embedding,
                "k": 5
            }
        }
    }
}

response = client.search(index=index_name, body=query_knn)

print(f"\n Semantic Search Results for '{query_text}':")
for hit in response["hits"]["hits"]:
    doc = hit["_source"]
    print(f"- {doc['video_id']} [{doc['start']:.2f}-{doc['end']:.2f}]: {doc['caption_bow']}")


 Semantic Search Results for 'man dancing':
- v_RJpWgi0EaUE [172.22-177.47]: The man in a black suit dances and then sits.
- v_RJpWgi0EaUE [16.80-19.95]: An image of a man is incorporated into the screen with the dancers.
- v_RJpWgi0EaUE [40.95-45.16]: The man and woman gesture the dancers hand movements.
- v_RJpWgi0EaUE [8.40-27.30]: Two boys dance in the same fashion.
- v_RJpWgi0EaUE [128.12-131.27]: The man claps his hands together.
