In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
!pip install opensearch-py

Collecting opensearch-py
  Downloading opensearch_py-2.8.0-py3-none-any.whl.metadata (6.9 kB)
Collecting Events (from opensearch-py)
  Downloading Events-0.5-py3-none-any.whl.metadata (3.9 kB)
Downloading opensearch_py-2.8.0-py3-none-any.whl (353 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m353.5/353.5 kB[0m [31m6.9 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading Events-0.5-py3-none-any.whl (6.8 kB)
Installing collected packages: Events, opensearch-py
Successfully installed Events-0.5 opensearch-py-2.8.0


In [4]:
!pip install transformers sentencepiece



In [5]:
#Imports

import cv2
import os
import torch
from PIL import Image
import json
from tqdm import tqdm
import pickle
import numpy as np
import seaborn as sns
from opensearchpy import OpenSearch, helpers
import pprint as pp
import requests
from transformers import CLIPProcessor, CLIPModel, CLIPVisionModel
import base64
import matplotlib.pyplot as plt

In [6]:
#Auxiliar Functions

def download_video(youtube_id, output_path):
    url = f"https://www.youtube.com/watch?v={youtube_id}"
    os.system(f'yt-dlp -f mp4 "{url}" -o "{output_path}"')


def extract_keyframes(video_path, output_dir, interval=2):
    cap = cv2.VideoCapture(video_path)
    fps = cap.get(cv2.CAP_PROP_FPS)
    frame_interval = int(fps * interval)
    frame_id = 0
    saved = 0

    while cap.isOpened():
        ret, frame = cap.read()
        if not ret:
            break
        if frame_id % frame_interval == 0:
            timestamp = int(cap.get(cv2.CAP_PROP_POS_MSEC) // 1000)
            out_path = os.path.join(output_dir, f"{os.path.basename(video_path)[:-4]}_t{timestamp}s.jpg")
            cv2.imwrite(out_path, frame)
            saved += 1
        frame_id += 1
    cap.release()
    print(f"{saved} frames extraídos para: {output_dir}")


def get_image_embedding(image_path):
    image = Image.open(image_path).convert("RGB")
    inputs = processor(images=image, return_tensors="pt").to(device)
    with torch.no_grad():
        image_features = model.get_image_features(**inputs)
    return image_features[0].detach().cpu().numpy().tolist()



def find_caption_for_timestamp(ts, timestamps, sentences):
    for i, (start, end) in enumerate(timestamps):
        if start <= ts <= end:
            return sentences[i]
    return ""

In [7]:
# Shows the top 10 videos with the most moments

train_path = "/content/drive/MyDrive/captions/train.json"

with open(train_path, "r") as f:
    data = json.load(f)

video_moments = []
for full_id, info in data.items():
    num_moments = len(info.get("timestamps", []))
    youtube_id = full_id.replace("v_", "")
    video_moments.append((full_id, youtube_id, num_moments))

video_moments.sort(key=lambda x: x[2], reverse=True)

print("Top 10 vídeos com mais momentos:")
for i, (full_id, youtube_id, count) in enumerate(video_moments[:10], start=1):
    print(f"{i}. {full_id} — {count} momentos — https://www.youtube.com/watch?v={youtube_id}")


Top 10 vídeos com mais momentos:
1. v_3l7quTy4c2s — 27 momentos — https://www.youtube.com/watch?v=3l7quTy4c2s
2. v_tuhHQ-lHIs4 — 26 momentos — https://www.youtube.com/watch?v=tuhHQ-lHIs4
3. v_-rKS00dzFxQ — 22 momentos — https://www.youtube.com/watch?v=-rKS00dzFxQ
4. v_-fjUWhSM6Hc — 22 momentos — https://www.youtube.com/watch?v=-fjUWhSM6Hc
5. v_v7o9uSu9AVI — 20 momentos — https://www.youtube.com/watch?v=v7o9uSu9AVI
6. v_NiaE7amNW7s — 20 momentos — https://www.youtube.com/watch?v=NiaE7amNW7s
7. v_RJpWgi0EaUE — 20 momentos — https://www.youtube.com/watch?v=RJpWgi0EaUE
8. v_G7kqlq8WhRo — 19 momentos — https://www.youtube.com/watch?v=G7kqlq8WhRo
9. v_Ke5MPiv-wrY — 19 momentos — https://www.youtube.com/watch?v=Ke5MPiv-wrY
10. v_jTMdMnbW9OI — 19 momentos — https://www.youtube.com/watch?v=jTMdMnbW9OI


In [8]:
base_dir = "/content/drive/MyDrive/captions"
video_dir = os.path.join(base_dir, "videos")
frames_dir = os.path.join(base_dir, "keyframes")

os.makedirs(video_dir, exist_ok=True)
os.makedirs(frames_dir, exist_ok=True)

# We chose the 2nd video with the most moments to take keyframes (this is because the first one gives a 404 when we click on the link)
video_id = "v_tuhHQ-lHIs4"
youtube_id = video_id.replace("v_", "")

In [9]:
video_path = os.path.join(video_dir, f"{video_id}.mp4")
download_video(youtube_id, video_path)

In [10]:
extract_keyframes(video_path, frames_dir)

48 frames extraídos para: /content/drive/MyDrive/captions/keyframes


In [11]:
device = "cuda:0" if torch.cuda.is_available() else "cpu"

processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")
model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32").to(device)

Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


preprocessor_config.json:   0%|          | 0.00/316 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/592 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/862k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/525k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.22M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/389 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/4.19k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/605M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/605M [00:00<?, ?B/s]

In [12]:
embeddings = []

for fname in tqdm(os.listdir(frames_dir)):
    if fname.startswith(video_id) and fname.endswith(".jpg"):
        frame_path = os.path.join(frames_dir, fname)
        emb = get_image_embedding(frame_path)
        embeddings.append({
            "video_id": video_id,
            "frame_file": fname,
            "embedding": emb
        })

output_path = os.path.join(base_dir, f"{video_id}_embeddings.pkl")
with open(output_path, "wb") as f:
    pickle.dump(embeddings, f)

print(f"Guardado: {video_id}_embeddings.pkl com {len(embeddings)} embeddings.")


  0%|          | 0/48 [00:00<?, ?it/s][A
  2%|▏         | 1/48 [00:02<01:35,  2.03s/it][A
  4%|▍         | 2/48 [00:03<01:29,  1.94s/it][A
  6%|▋         | 3/48 [00:05<01:25,  1.91s/it][A
  8%|▊         | 4/48 [00:07<01:16,  1.74s/it][A
 10%|█         | 5/48 [00:09<01:17,  1.79s/it][A
 12%|█▎        | 6/48 [00:11<01:30,  2.15s/it][A
 15%|█▍        | 7/48 [00:13<01:17,  1.88s/it][A
 17%|█▋        | 8/48 [00:14<01:02,  1.56s/it][A
 19%|█▉        | 9/48 [00:15<00:55,  1.43s/it][A
 21%|██        | 10/48 [00:16<00:49,  1.30s/it][A
 23%|██▎       | 11/48 [00:17<00:43,  1.18s/it][A
 25%|██▌       | 12/48 [00:17<00:37,  1.04s/it][A
 27%|██▋       | 13/48 [00:18<00:34,  1.01it/s][A
 29%|██▉       | 14/48 [00:19<00:32,  1.06it/s][A
 31%|███▏      | 15/48 [00:20<00:29,  1.13it/s][A
 33%|███▎      | 16/48 [00:21<00:30,  1.06it/s][A
 35%|███▌      | 17/48 [00:22<00:28,  1.08it/s][A
 38%|███▊      | 18/48 [00:24<00:36,  1.20s/it][A
 40%|███▉      | 19/48 [00:26<00:41,  1.43s/it]

Guardado: v_tuhHQ-lHIs4_embeddings.pkl com 48 embeddings.


In [13]:
image_folder = "/content/drive/MyDrive/captions/keyframes"

In [14]:
image_paths = sorted([
    os.path.join(image_folder, f) for f in os.listdir(image_folder)
    if f.startswith("v_fzp5ooc727c") and f.endswith(".jpg")
])

In [15]:
with open("/content/drive/MyDrive/captions/v_tuhHQ-lHIs4_embeddings.pkl", "rb") as f:
    embeddings = pickle.load(f)

image_embeddings = [item["embedding"] for item in embeddings]
image_paths = [os.path.join(image_folder, item["frame_file"]) for item in embeddings]

In [16]:
host = 'api.novasearch.org'
port = 443

user = 'user06'
password = 'quarta+mpdw'
index_name = user

In [17]:
# Create the client with SSL/TLS enabled, but hostname verification disabled.
client_openSearch = OpenSearch(
    hosts = [{'host': host, 'port': port}],
    http_compress = True,
    http_auth = (user, password),
    use_ssl = True,
    url_prefix = 'opensearch_v2',
    verify_certs = False,
    ssl_assert_hostname = False,
    ssl_show_warn = False
)

if client_openSearch.indices.exists(index_name):

    resp = client_openSearch.indices.open(index = index_name)
    print(resp)

    print('\n----------------------------------------------------------------------------------- INDEX SETTINGS')
    settings = client_openSearch.indices.get_settings(index = index_name)
    pp.pprint(settings)

    print('\n----------------------------------------------------------------------------------- INDEX MAPPINGS')
    mappings = client_openSearch.indices.get_mapping(index = index_name)
    pp.pprint(mappings)

    print('\n----------------------------------------------------------------------------------- INDEX #DOCs')
    print(client_openSearch.count(index = index_name))
else:
    print("Index does not exist.")

{'acknowledged': True, 'shards_acknowledged': True}

----------------------------------------------------------------------------------- INDEX SETTINGS
{'user06': {'settings': {'index': {'creation_date': '1749930439789',
                                   'knn': 'true',
                                   'number_of_replicas': '1',
                                   'number_of_shards': '1',
                                   'provided_name': 'user06',
                                   'refresh_interval': '1s',
                                   'replication': {'type': 'DOCUMENT'},
                                   'uuid': 'bHzigVNWSU6fuLXvcMEAwA',
                                   'version': {'created': '136387927'}}}}}

----------------------------------------------------------------------------------- INDEX MAPPINGS
{'user06': {'mappings': {'properties': {'caption': {'type': 'text'},
                                        'caption_embedding': {'dimension': 512,
                   

In [18]:
if client_openSearch.indices.exists(index=index_name):
    # Delete the index.
    response = client_openSearch.indices.delete(
        index = index_name
    )
    print('\nDeleting index:')
    print(response)


Deleting index:
{'acknowledged': True}


In [19]:
if not client_openSearch.indices.exists(index=index_name):
    index_body = {
        "settings": {
            "index": {
                "knn": True
            }
        },
        "mappings": {
            "properties": {
                "video_id": {"type": "keyword"},
                "timestamp": {"type": "float"},
                "image_path": {"type": "keyword"},
                "caption": {"type": "text"},
                "image_embedding": {
                    "type": "knn_vector",
                    "dimension": 512,
                    "method": {
                        "name": "hnsw",
                        "space_type": "cosinesimil",
                        "engine": "nmslib"
                    }
                },
                "caption_embedding": {
                    "type": "knn_vector",
                    "dimension": 512,
                    "method": {
                        "name": "hnsw",
                        "space_type": "cosinesimil",
                        "engine": "nmslib"
                    }
                }
            }
        }
    }
    client_openSearch.indices.create(index=index_name, body=index_body)
    print(f" Índice '{index_name}' criado.")
else:
    print(f"Índice '{index_name}' já existe.")

 Índice 'user06' criado.


In [20]:
mappings = client_openSearch.indices.get_mapping(index=index_name)
pp.pprint(mappings)

{'user06': {'mappings': {'properties': {'caption': {'type': 'text'},
                                        'caption_embedding': {'dimension': 512,
                                                              'method': {'engine': 'nmslib',
                                                                         'name': 'hnsw',
                                                                         'parameters': {},
                                                                         'space_type': 'cosinesimil'},
                                                              'type': 'knn_vector'},
                                        'image_embedding': {'dimension': 512,
                                                            'method': {'engine': 'nmslib',
                                                                       'name': 'hnsw',
                                                                       'parameters': {},
                                             

In [21]:
index_settings = {
    "settings": {
        "index": {
            "refresh_interval": "1s"
        }
    }
}
client_openSearch.indices.put_settings(index=index_name, body=index_settings)

{'acknowledged': True}

In [22]:
settings = client_openSearch.indices.get_settings(index=index_name)
pp.pprint(settings)

{'user06': {'settings': {'index': {'creation_date': '1749933641296',
                                   'knn': 'true',
                                   'number_of_replicas': '1',
                                   'number_of_shards': '1',
                                   'provided_name': 'user06',
                                   'refresh_interval': '1s',
                                   'replication': {'type': 'DOCUMENT'},
                                   'uuid': 'ww0rUfh0Qxq_xGoRYLRaIw',
                                   'version': {'created': '136387927'}}}}}


In [23]:
video_id = "v_tuhHQ-lHIs4"
pkl_path = f"/content/drive/MyDrive/captions/{video_id}_embeddings.pkl"
captions_json_path = "/content/drive/MyDrive/captions/train.json"

with open(pkl_path, "rb") as f:
    data = pickle.load(f)

with open(captions_json_path, "r") as f:
    captions_data = json.load(f)

video_data = captions_data.get(video_id)
timestamps = video_data["timestamps"]
sentences = video_data["sentences"]

# Finds and returns the caption whose timestamp range includes the given timestamp; returns empty string if none found.
def find_caption_for_timestamp(ts, timestamps, sentences):
    for i, (start, end) in enumerate(timestamps):
        if start <= ts <= end:
            return sentences[i]
    return ""

for doc in data:
    try:
        ts = int(doc["frame_file"].split("_t")[-1].replace("s.jpg", ""))

        caption = find_caption_for_timestamp(ts, timestamps, sentences)

        inputs = processor(text=[caption], return_tensors="pt", padding=True).to(device)
        with torch.no_grad():
            caption_emb_tensor = model.get_text_features(**inputs)
        caption_emb = caption_emb_tensor[0].detach().cpu().numpy().tolist()

        image_emb = doc["embedding"]
        if not isinstance(image_emb, list):
            image_emb = image_emb.detach().cpu().numpy().tolist()


        body1 = {
          "video_id": video_id,
          "timestamp": ts,
          "image_path": doc["frame_file"],
          "caption": caption,
          "image_embedding": image_emb,
          "caption_embedding": caption_emb
        }

        client_openSearch.index(index=index_name, body=body1)

        print(f"Frame indexado {doc['frame_file']}: {e}")

    except Exception as e:
      print(f"Frame indexado {doc['frame_file']}")


Frame indexado v_tuhHQ-lHIs4_t0s.jpg
Frame indexado v_tuhHQ-lHIs4_t4s.jpg
Frame indexado v_tuhHQ-lHIs4_t6s.jpg
Frame indexado v_tuhHQ-lHIs4_t2s.jpg
Frame indexado v_tuhHQ-lHIs4_t8s.jpg
Frame indexado v_tuhHQ-lHIs4_t14s.jpg
Frame indexado v_tuhHQ-lHIs4_t12s.jpg
Frame indexado v_tuhHQ-lHIs4_t10s.jpg
Frame indexado v_tuhHQ-lHIs4_t16s.jpg
Frame indexado v_tuhHQ-lHIs4_t20s.jpg
Frame indexado v_tuhHQ-lHIs4_t18s.jpg
Frame indexado v_tuhHQ-lHIs4_t22s.jpg
Frame indexado v_tuhHQ-lHIs4_t26s.jpg
Frame indexado v_tuhHQ-lHIs4_t24s.jpg
Frame indexado v_tuhHQ-lHIs4_t28s.jpg
Frame indexado v_tuhHQ-lHIs4_t30s.jpg
Frame indexado v_tuhHQ-lHIs4_t32s.jpg
Frame indexado v_tuhHQ-lHIs4_t34s.jpg
Frame indexado v_tuhHQ-lHIs4_t36s.jpg
Frame indexado v_tuhHQ-lHIs4_t38s.jpg
Frame indexado v_tuhHQ-lHIs4_t40s.jpg
Frame indexado v_tuhHQ-lHIs4_t42s.jpg
Frame indexado v_tuhHQ-lHIs4_t46s.jpg
Frame indexado v_tuhHQ-lHIs4_t44s.jpg
Frame indexado v_tuhHQ-lHIs4_t52s.jpg
Frame indexado v_tuhHQ-lHIs4_t48s.jpg
Frame indexado v_

In [24]:
# Retrieves the top-k most relevant video moments for a query by embedding the text with CLIP, performing a k-NN search in OpenSearch on caption embeddings, and returning moments sorted by timestamp.
def retrieve_top_k_moments(query_text, k=5):

    inputs = processor(text=[query_text], return_tensors="pt", padding=True).to(device)
    with torch.no_grad():
        text_embedding = model.get_text_features(**inputs)
    text_embedding = text_embedding[0].detach().cpu().numpy().tolist()


    response = client_openSearch.search(index=index_name, body={
        "size": k,
        "query": {
            "knn": {
                "caption_embedding": {
                    "vector": text_embedding,
                    "k": k
                }
            }
        }
    })

    # Extract and sort by timestamp
    moments = []
    for hit in response["hits"]["hits"]:
        src = hit["_source"]
        moments.append({
            "caption": src["caption"],
            "timestamp": src["timestamp"],
            "image_path": src["image_path"],
            "video_id": src["video_id"]
        })

    return sorted(moments, key=lambda x: x["timestamp"])

In [25]:
query = "What does the child do after picking up the ball?"
moments = retrieve_top_k_moments(query, k=5)

for i, m in enumerate(moments, 1):
    print(f"{i}. [{m['timestamp']}] {m['caption']}")


1. [60]  The adults play do0dge ball.
2. [62]  The adults play do0dge ball.
3. [64]  The adults play do0dge ball.
4. [66]  The adults play do0dge ball.
5. [68]  The adults play do0dge ball.


In [26]:
# Builds a prompt for Flan-T5
def build_t5_prompt(query_text, moments):
    context = ""
    for i, m in enumerate(moments, 1):
        context += f"{i}. {m['caption']} (timestamp {m['timestamp']})\n"

    prompt = (
        f"question answering:\n"
        f"Context:\n{context}\n"
        f"Question: {query_text}\n"
        f"Answer:"
    )
    return prompt

In [27]:
query_text = "What does the child do after picking up the ball?"
moments = retrieve_top_k_moments(query_text, k=5)
prompt = build_t5_prompt(query_text, moments)
print("Prompt para T5:")
print(prompt)

Prompt para T5:
question answering:
Context:
1.  The adults play do0dge ball. (timestamp 60)
2.  The adults play do0dge ball. (timestamp 62)
3.  The adults play do0dge ball. (timestamp 64)
4.  The adults play do0dge ball. (timestamp 66)
5.  The adults play do0dge ball. (timestamp 68)

Question: What does the child do after picking up the ball?
Answer:


In [28]:
from transformers import pipeline

generator = pipeline("text2text-generation", model="google/flan-t5-base")

def generate_response_with_t5(prompt):
    output = generator(prompt, max_length=128, do_sample=True)[0]
    return output["generated_text"]

config.json:   0%|          | 0.00/1.40k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/990M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/2.54k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.42M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/2.20k [00:00<?, ?B/s]

Device set to use cpu


In [29]:
query_text = "What does the child do after picking up the ball?"
moments = retrieve_top_k_moments(query_text, k=5)
prompt = build_t5_prompt(query_text, moments)

response = generate_response_with_t5(prompt)
print("\nResposta gerada com T5:")
print(response)

Both `max_new_tokens` (=256) and `max_length`(=128) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)



Resposta gerada com T5:
play do0dge ball


In [30]:
# Generates a response from Flan-T5 based on the given prompt with controlled sampling parameters.
def generate_response_with_t5(prompt):
    output = generator(
        prompt,
        max_new_tokens=100,
        do_sample=True,
        repetition_penalty=1.2,
        no_repeat_ngram_size=3
    )[0]
    return output["generated_text"]


In [31]:
# Builds a prompt for Flan-T5 using example context, retrieved captions, and the user query.
def build_t5_prompt(query_text, moments):
    context = ""
    for i, m in enumerate(moments, 1):
        context += f"{i}. {m['caption']} (timestamp {m['timestamp']})\n"

    prompt = (
    "You are a helpful assistant answering video questions.\n\n"
    "Example:\n"
    "Context:\n"
    "1. The woman enters the room. (timestamp 10)\n"
    "2. She looks around. (timestamp 15)\n"
    "Question: What happens after the woman enters the room?\n"
    "Answer: After entering the room, the woman looks around.\n\n"
    f"Context:\n{context}\n"
    f"Question: {query_text}\n"
    "Answer:"
    )
    return prompt

In [42]:
query_text = "What does the child do after picking up the ball?"
moments = retrieve_top_k_moments(query_text, k=5)
prompt = build_t5_prompt(query_text, moments)

response = generate_response_with_t5(prompt)
print("Answer:")
print(response)

Answer:
play do0dge ball


In [43]:
query_text = "Do kids appear on the full video?"
moments = retrieve_top_k_moments(query_text, k=5)
prompt = build_t5_prompt(query_text, moments)

response = generate_response_with_t5(prompt)
print("Answer:")
print(response)

Answer:
yes


In [44]:
query_text = "What happens in the video?"
moments = retrieve_top_k_moments(query_text, k=5)
prompt = build_t5_prompt(query_text, moments)

response = generate_response_with_t5(prompt)
print("Answer:")
print(response)

Answer:
The camera zooms in on a piece of paper.


In [45]:
query_text = "Do dogs play in this video?"
moments = retrieve_top_k_moments(query_text, k=5)
prompt = build_t5_prompt(query_text, moments)

response = generate_response_with_t5(prompt)
print("Answer:")
print(response)

Answer:
no
