# Phase 2: Large Vision and Language Models
This notebook implements cross-modal retrieval and visual question answering using CLIP and LLaVA.

### Imports

In [21]:
import json
import os
import pickle
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import torch
from PIL import Image
from sentence_transformers import util
from tqdm import tqdm
from transformers import CLIPProcessor, CLIPModel

In [2]:
# Run on the GPU when PyTorch can see one; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
print(device)

cuda


## Load video metadata and captions

In [None]:
# Our 10 selected video IDs from phase 1.
# These IDs are the keys looked up in the captions JSON by
# `load_activitynet_subset`; the trailing comment on each line appears to be
# the video's title.
selected_ids = [
    "v_-rKS00dzFxQ", # How to Cook Mashed Potatoes
    "v_-fjUWhSM6Hc", # London 2012 Olympics
    "v_v7o9uSu9AVI", # 20 Exercises on Parallel Bars
    "v_RJpWgi0EaUE", # Vin Diesel Breakdancing Throwback
    "v_G7kqlq8WhRo", # Twickenham Festival 2015 Tug of War
    "v_jTMdMnbW9OI", # Washing my Face
    "v_9wtMJoqGTg0", # Girl in Balance Beam (gymnastics)
    "v_Ffi7vDa3C2I", # Epic Rollerblading Montage 80s
    "v_JRr3BruqS2Y", # 'What U think about Rollerblading?'
    "v_Mkljhl3D9-Q", # Preparing Angel Hair Pasta
]

In [None]:
def load_activitynet_subset(json_path, selected_ids):
    """Read a captions JSON file and keep only the requested videos.

    Args:
        json_path: Path of a UTF-8 JSON file whose top level maps video ids
            to annotation dicts.
        selected_ids: Iterable of video ids to keep.

    Returns:
        dict: ``{video_id: {"duration", "timestamps", "sentences"}}`` for
        every requested id that has a non-empty entry in the file, in the
        order the ids were requested. Ids that are missing (or map to an
        empty entry) are skipped silently.

    Raises:
        KeyError: If a found entry lacks one of the three copied fields.
    """
    with open(json_path, "r", encoding="utf-8") as handle:
        annotations = json.load(handle)

    # Only these fields are carried over; anything else in the entry is dropped.
    kept_fields = ("duration", "timestamps", "sentences")

    subset = {}
    for vid in selected_ids:
        record = annotations.get(vid)
        if record:
            subset[vid] = {field: record[field] for field in kept_fields}
    return subset


# Load the caption annotations of the selected videos from train.json.
videos = load_activitynet_subset("train.json", selected_ids)
# Show one entry as a sanity check (None would mean the id is absent from the file).
videos.get('v_-rKS00dzFxQ')   # first id

{'duration': 231.43,
 'timestamps': [[8.1, 23.14],
  [24.3, 39.34],
  [40.5, 53.23],
  [54.39, 60.17],
  [61.33, 62.49],
  [63.64, 72.9],
  [78.69, 82.16],
  [83.32, 84.47],
  [85.63, 91.42],
  [92.57, 102.99],
  [104.15, 105.3],
  [114.56, 123.82],
  [124.97, 131.92],
  [133.07, 136.55],
  [137.7, 152.75],
  [153.9, 160.85],
  [162, 167.79],
  [168.95, 177.05],
  [178.2, 180.52],
  [181.68, 196.72],
  [197.88, 210.6],
  [211.76, 217.55]],
 'sentences': ['A man and a woman stand by a table speaking to the camera.',
  ' A recipe of mashed potatoes sits on the table.',
  ' The man peels and cuts potatoes before throwing them into a yellow pot.',
  ' The man throws in three handfuls of salt into the yellow pot.',
  ' The man cuts up some garlic.',
  ' Into the pot, the man throws the garlic along with two leaves.',
  ' The woman joins the man at the table and she cuts butter in half.',
  ' The woman throws the butter into a saucepan.',
  ' The man speaks to her briefly and the woman proce

## Extract keyframes from videos
(one keyframe every 2 seconds)

In [None]:
def extract_keyframes(video_path, output_folder, frame_interval=2):
    """Placeholder for keyframe extraction; not implemented yet.

    Intended behavior, per the TODO below: sample one frame every
    `frame_interval` seconds from `video_path` and write the frames as JPEG
    files under `output_folder`.

    Returns:
        int: Always 0 for now; no frames are read or written.
    """
    # TO DO: Use library OpenCV to extract frames every `frame_interval` seconds
    # Save frames as JPEG in `output_folder/video_id/frame_0001.jpg`

    return 0

## Compute CLIP embeddings - frames and captions

In [4]:
# The model weights and the matching preprocessor/tokenizer are loaded from the
# same Hugging Face checkpoint, so the id is written once and reused.
CLIP_CHECKPOINT = "openai/clip-vit-base-patch32"

clip_model = CLIPModel.from_pretrained(CLIP_CHECKPOINT)
clip_model = clip_model.to(device)  # move the weights to the selected device
clip_processor = CLIPProcessor.from_pretrained(CLIP_CHECKPOINT)

config.json:   0%|          | 0.00/4.19k [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


pytorch_model.bin:   0%|          | 0.00/605M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/605M [00:00<?, ?B/s]

preprocessor_config.json:   0%|          | 0.00/316 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/592 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/862k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/525k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.22M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/389 [00:00<?, ?B/s]

In [5]:
def compute_clip_image_embedding(image_path):
    """Embed a single image file with the CLIP image encoder.

    Args:
        image_path: Path of an image file that PIL can open.

    Returns:
        numpy.ndarray: Output of `clip_model.get_image_features` for this
        image, with singleton dimensions squeezed out and copied to the CPU.
        The vector is returned as produced by the model (no normalisation is
        applied here).
    """
    # Image.open is lazy and leaves the underlying file open; the context
    # manager releases the handle deterministically once the RGB copy exists.
    with Image.open(image_path) as source:
        image = source.convert("RGB")
    inputs = clip_processor(images=image, return_tensors="pt").to(device)
    with torch.no_grad():  # inference only, no autograd bookkeeping needed
        embedding = clip_model.get_image_features(**inputs)
    return embedding.squeeze().cpu().numpy()

def compute_clip_text_embedding(text):
    """Embed a single text string with the CLIP text encoder.

    Args:
        text: The caption or query to embed.

    Returns:
        numpy.ndarray: Output of `clip_model.get_text_features` for this
        text, with singleton dimensions squeezed out and copied to the CPU.
        The vector is returned as produced by the model (no normalisation is
        applied here).
    """
    # CLIP's text encoder has a fixed context length (77 tokens for this
    # checkpoint). Without truncation a longer caption/query makes the forward
    # pass fail, so over-long inputs are cut to the tokenizer's maximum length.
    inputs = clip_processor(
        text=[text],
        return_tensors="pt",
        truncation=True,
    ).to(device)
    with torch.no_grad():  # inference only, no autograd bookkeeping needed
        embedding = clip_model.get_text_features(**inputs)
    return embedding.squeeze().cpu().numpy()

In [None]:
# test embeddings ...

## Index keyframes and embeddings in OpenSearch

In [None]:
def index_clip_embeddings_to_opensearch(index_name, frames_folder, metadata_dict):
    """Placeholder for indexing frame embeddings in OpenSearch; not implemented yet.

    Intended behavior, per the TODO below: compute a CLIP image embedding for
    each extracted frame and store it, together with its metadata, in the
    OpenSearch index `index_name`.

    Returns:
        int: Always 0 for now; nothing is embedded or indexed.
    """
    # TO DO: For each frame, compute image embedding and store in OpenSearch using appropriate mappings
    # Example of fields: video_id, frame_time, image_vec, caption, caption_vec
    return 0

## Cross-Modal Retrieval
text -> image  
image -> image  
image -> text

In [None]:
def search_image_by_text(query_text, index_name):
    """Placeholder for text -> image retrieval; not implemented yet.

    Intended behavior, per the TODO below: embed `query_text` with the CLIP
    text encoder and run a k-NN query against the `image_vec` field of
    `index_name`.

    Returns:
        int: Always 0 for now; no search is performed.
    """
    # TO DO: Use CLIP text embedding and knn_vector query on OpenSearch over `image_vec`
    return 0

def search_image_by_image(query_image_path, index_name):
    """Placeholder for image -> image retrieval; not implemented yet.

    Intended behavior, per the TODO below: embed the image at
    `query_image_path` with the CLIP image encoder and run a k-NN query
    against the `image_vec` field of `index_name`.

    Returns:
        int: Always 0 for now; no search is performed.
    """
    # TO DO: Use CLIP image embedding and knn_vector query on OpenSearch over `image_vec`
    return 0

def search_text_by_image(query_image_path, index_name):
    """Placeholder for image -> text retrieval; not implemented yet.

    Intended behavior, per the TODO below: embed the image at
    `query_image_path` with the CLIP image encoder and use it to retrieve
    matching text from `index_name`.

    Returns:
        int: Always 0 for now; no search is performed.
    """
    # TO DO: Use CLIP image embedding and knn_vector query on OpenSearch over `image_vec` (?)
    # NOTE(review): to retrieve captions, the query presumably has to target the
    # text-embedding field (`caption_vec` in the indexing TODO) rather than
    # `image_vec` -- confirm once the index mapping is defined.
    return 0

## Visual question answering with LLaVA

In [None]:
def ask_question_with_llava(image_path, question):
    """Placeholder for visual question answering with LLaVA; not implemented yet.

    Intended behavior, per the TODO below: send the image at `image_path`
    together with `question` to a LLaVA API or local server and return the
    text answer.

    Returns:
        int: Always 0 for now; no request is made.
    """
    # TO DO: Use Llava API or local server to send image + question and receive a text answer
    # Example of payload: { "image": <image bytes>, "question": "What is the man doing?" }
    return 0

## LVLM Interpretability
Attention Maps,
Relevancy Maps

In [None]:
# TO DO: Use attention weights or Grad-CAM visualization to highlight image/text importance.
# Instrument the CLIP or LLaVA model to extract internal attention values.

# ???

## Save/load embeddings

In [7]:
def save_embeddings_to_file(data_dict, output_path):
    """Serialise `data_dict` to `output_path` with pickle.

    Args:
        data_dict: The picklable object to store.
        output_path: Destination file; it is opened in binary write mode, so
            an existing file at that path is overwritten.

    Returns:
        None.
    """
    with open(output_path, "wb") as f:
        pickle.dump(data_dict, f)

def load_embeddings_from_file(path):
    """Load an object previously written by `save_embeddings_to_file`.

    Args:
        path: Pickle file to read; it is opened in binary read mode.

    Returns:
        The unpickled object.
    """
    # SECURITY: unpickling can execute arbitrary code. Only load files that
    # this notebook (or another trusted source) produced.
    with open(path, "rb") as f:
        return pickle.load(f)

In [None]:
# NOTE(review): `embeddings` is not defined in any cell visible in this notebook;
# it has to be built before this cell can run on a fresh kernel.
save_embeddings_to_file(embeddings, "clip_image_embeddings.pkl")