<a href="https://colab.research.google.com/github/tanyavijj/edtech-project/blob/main/msvd_Untitled33.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Attach Google Drive to the Colab runtime at the mount point below.
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import os
import cv2
import torch
import tarfile
import pandas as pd
from PIL import Image
import torchvision.transforms as transforms
from transformers import CLIPModel, CLIPProcessor, CLIPTokenizer
from IPython.display import display

In [None]:
# Dataset locations on Google Drive - edit base_folder to match your own Drive layout.
base_folder = "/content/drive/MyDrive/iit data/"

# Every other path is derived from base_folder.
tar_file, annotation_file, frame_output_folder = (
    os.path.join(base_folder, leaf)
    for leaf in ("YouTubeClips.tar", "annotations.txt", "frames")
)

In [None]:
# Extract TAR File if Not Already Extracted
# The archive holds the source videos, so it is unpacked under <base_folder>/videos
# (the location the inspection cells further down read: ".../videos/YouTubeClips")
# instead of into the frames folder, which should only ever contain extracted JPEGs.
video_extract_folder = os.path.join(base_folder, "videos")

# The frame-inspection cells list this folder, so make sure it exists either way.
os.makedirs(frame_output_folder, exist_ok=True)

videos_already_extracted = (
    os.path.isdir(video_extract_folder) and len(os.listdir(video_extract_folder)) > 0
)
if not videos_already_extracted:
    os.makedirs(video_extract_folder, exist_ok=True)
    with tarfile.open(tar_file, "r") as tar:
        # Use the "data" extraction filter when this Python provides it, so archive
        # members cannot write outside video_extract_folder; older runtimes fall
        # back to the unfiltered behaviour the notebook had before.
        extract_kwargs = {"filter": "data"} if hasattr(tarfile, "data_filter") else {}
        tar.extractall(path=video_extract_folder, **extract_kwargs)
    print(f"✅ Extracted TAR file to {video_extract_folder}")
else:
    print("✅ Videos folder already exists.")

✅ Frames folder already exists.


In [None]:
#  Extract Frames from Videos
def extract_frames(video_folder, output_folder, fps=1):
    """Sample JPEG frames from every .mp4/.avi file directly inside ``video_folder``.

    Args:
        video_folder: Directory scanned (non-recursively) for video files. The
            extension check is case-insensitive.
        output_folder: Directory that receives the JPEGs; created if missing.
        fps: Number of frames to keep per second of video. With the default of 1,
            one frame is kept every ``int(native_fps)`` frames.

    Raises:
        ValueError: If ``fps`` is not positive.

    Each saved frame is named ``<video name without extension>_frame_<index>.jpg``,
    where ``<index>`` is the zero-based position of the frame in the video.
    """
    if fps <= 0:
        raise ValueError(f"fps must be positive, got {fps!r}")

    os.makedirs(output_folder, exist_ok=True)

    for video_file in os.listdir(video_folder):
        if not video_file.lower().endswith((".mp4", ".avi")):
            continue

        video_path = os.path.join(video_folder, video_file)
        # splitext only strips the extension; split('.')[0] would cut "a.b.avi" to "a"
        # and make different videos overwrite each other's frames.
        video_stem = os.path.splitext(video_file)[0]

        cap = cv2.VideoCapture(video_path)
        try:
            if not cap.isOpened():
                print(f" Could not open {video_file}, skipping")
                continue

            native_fps = cap.get(cv2.CAP_PROP_FPS)
            if native_fps > 0:  # also False for NaN
                # Never let the step reach 0 (native FPS below the requested rate).
                step = max(1, int(native_fps / fps))
            else:
                # FPS not reported by the container: keep every frame rather than
                # dividing by zero.
                step = 1

            frame_id = 0
            while True:
                ret, frame = cap.read()
                if not ret:
                    break
                if frame_id % step == 0:
                    frame_filename = os.path.join(
                        output_folder, f"{video_stem}_frame_{frame_id}.jpg"
                    )
                    cv2.imwrite(frame_filename, frame)
                frame_id += 1
        finally:
            # Release the decoder even if reading or writing raised.
            cap.release()

        print(f" Frames extracted for {video_file}")

In [None]:
#  Run Frame Extraction
# The clips live in <base_folder>/videos/YouTubeClips. Scanning the frames folder
# itself (as this cell used to) finds no video files, so nothing was extracted.
clips_folder = os.path.join(base_folder, "videos", "YouTubeClips")
if os.path.isdir(clips_folder):
    extract_frames(clips_folder, frame_output_folder)
else:
    print(f" Video folder not found: {clips_folder}")

In [None]:
import os
# Show what is currently stored in the dataset folder on Drive.
drive_root = "/content/drive/MyDrive/iit data/"
print(os.listdir(drive_root))


['annotations.txt', 'YouTubeClips.tar', 'videos', 'image_captions.csv', 'frames']


In [None]:
import os

# Sanity check: how many entries does the frames folder hold?
frame_output_folder = "/content/drive/MyDrive/iit data/frames"
frame_files = os.listdir(frame_output_folder)

if frame_files:
    print(f" {len(frame_files)} frames found.")
    # A short sample is enough to eyeball the naming scheme.
    print("First 5 frame files:", frame_files[:5])
else:
    print(" No frames found! Frame extraction failed.")


✅ 20929 frames found.
🖼️ First 5 frame files: ['mJ9eRvxjLc4_0_16_frame_0.jpg', 'mJ9eRvxjLc4_0_16_frame_29.jpg', 'mJ9eRvxjLc4_0_16_frame_58.jpg', 'mJ9eRvxjLc4_0_16_frame_87.jpg', 'mJ9eRvxjLc4_0_16_frame_116.jpg']


In [None]:
import os

# Sanity check: what sits directly inside the videos folder?
video_folder = "/content/drive/MyDrive/iit data/videos"
videos = os.listdir(video_folder)

if videos:
    print(f" {len(videos)} videos found.")
    # Only a short sample of the entries is shown.
    print(" First 5 videos:", videos[:5])
else:
    print(" No videos found! Extraction failed.")

✅ 1 videos found.
🎥 First 5 videos: ['YouTubeClips']


In [None]:
import os

# Find out whether 'YouTubeClips' is a directory, a plain file, or missing.
video_path = "/content/drive/MyDrive/iit data/videos/YouTubeClips"

if os.path.isdir(video_path):
    print("'YouTubeClips' is a folder. Checking contents...")
    first_entries = os.listdir(video_path)[:5]
    print(first_entries)
elif os.path.isfile(video_path):
    print(" 'YouTubeClips' is a single file! Needs extraction.")
else:
    print(" 'YouTubeClips' does not exist.")

📂 'YouTubeClips' is a folder. Checking contents...
['mJ9eRvxjLc4_0_16.avi', 'MrMG15y3rWU_5_13.avi', '5CS4nLI2ZX8_50_59.avi', 'MF88IYF2MHY_25_59.avi', 'n2NLoLNecgI_168_178.avi']


In [None]:
#  Check Extracted Frames
frame_files = os.listdir(frame_output_folder)
if frame_files:
    print(f"{len(frame_files)} frames found.")
    # Sample of the first five names only.
    print(frame_files[:5])
else:
    print("No frames found! Check extraction process.")

✅ 20929 frames found.
['mJ9eRvxjLc4_0_16_frame_0.jpg', 'mJ9eRvxjLc4_0_16_frame_29.jpg', 'mJ9eRvxjLc4_0_16_frame_58.jpg', 'mJ9eRvxjLc4_0_16_frame_87.jpg', 'mJ9eRvxjLc4_0_16_frame_116.jpg']


In [None]:
# Folder that directly contains the individual .avi clips (one level below "videos").
video_folder = "/content/drive/MyDrive/iit data/videos/YouTubeClips"

In [None]:
# Sample frames from every clip in video_folder and write them into frame_output_folder.
extract_frames(video_folder, frame_output_folder)

✅ Frames extracted for mJ9eRvxjLc4_0_16.avi
✅ Frames extracted for MrMG15y3rWU_5_13.avi
✅ Frames extracted for 5CS4nLI2ZX8_50_59.avi
✅ Frames extracted for MF88IYF2MHY_25_59.avi
✅ Frames extracted for n2NLoLNecgI_168_178.avi
✅ Frames extracted for MWvCcwTw7Ac_78_86.avi
✅ Frames extracted for MW21lp833Vo_8_16.avi
✅ Frames extracted for MWzeInQaUk4_12_20.avi
✅ Frames extracted for vfktGc_qx-w_2_18.avi
✅ Frames extracted for 74tRCYS_534_49_57.avi
✅ Frames extracted for hJFBXHtxKIc_163_168.avi
✅ Frames extracted for mZVPkPqwzR4_38_45.avi
✅ Frames extracted for kWLNZzuo3do_145_151.avi
✅ Frames extracted for cnsjm3fNEec_4_10.avi
✅ Frames extracted for gbbRwBZuhzI_26_40.avi
✅ Frames extracted for MY-rGamtAJc_6_22.avi
✅ Frames extracted for Gn4Iv5ARIXc_83_93.avi
✅ Frames extracted for mYzajpeAWuA_100_112.avi
✅ Frames extracted for LEz0puaKNTk_38_48.avi
✅ Frames extracted for fMFvOgb4k6E_35_43.avi
✅ Frames extracted for MHWxjWwAbwM_10_25.avi
✅ Frames extracted for -_hbPLsZvvo_323_328.avi
✅ Fram

In [None]:
import os

def create_caption_mapping(annotation_file, frame_output_folder):
    """Map every extracted frame file to a caption of the video it was taken from.

    Frame files are expected to be named ``<video id>_frame_<n>.<ext>``. A file
    without the ``_frame_<n>`` suffix is treated as belonging to the video whose
    id equals its name without extension.

    Each annotation line holds a video name (with or without extension) followed
    by its caption. Fields are separated by a tab; when a line contains no tab,
    the first run of whitespace is used as the separator instead. Lines with
    fewer than two fields are ignored.

    Args:
        annotation_file: Path to the annotation text file.
        frame_output_folder: Directory containing the extracted frame files.

    Returns:
        dict mapping frame file name -> caption. When a video appears on several
        annotation lines, the caption from the last such line is kept.
    """
    # Step 1: group frame files by the video id they were extracted from, so each
    # annotation line needs a single dictionary lookup. Matching on the full
    # frame stem ("<id>_frame_<n>") can never equal a bare video id.
    frames_by_video = {}
    for frame in os.listdir(frame_output_folder):
        frame_stem = os.path.splitext(frame)[0]
        video_id = frame_stem.rsplit("_frame_", 1)[0]
        frames_by_video.setdefault(video_id, []).append(frame)

    captions = {}

    # Step 2: attach each annotation's caption to all frames of its video.
    with open(annotation_file, "r", encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            parts = line.split("\t") if "\t" in line else line.split(None, 1)
            if len(parts) < 2:
                continue

            video_file = parts[0].strip()
            caption = parts[1].strip()
            video_id = os.path.splitext(video_file)[0]

            for frame in frames_by_video.get(video_id, ()):
                captions[frame] = caption

    print(f"✅ Captions mapped: {len(captions)} frames found.")
    return captions

# Run function
# Use the configured paths instead of repeating the literal Drive locations, so a
# change to base_folder is picked up here as well.
captions = create_caption_mapping(annotation_file, frame_output_folder)


✅ Captions mapped: 0 frames found.


In [None]:
# Save Captions to CSV
# Build the destination once so the file written and the path reported cannot
# drift apart (the old message only matched when base_folder ended with "/").
captions_csv = os.path.join(base_folder, "captions.csv")
df = pd.DataFrame(list(captions.items()), columns=["Frame", "Caption"])
df.to_csv(captions_csv, index=False)
print(f"Captions saved to {captions_csv}")

✅ Captions saved to /content/drive/MyDrive/iit data/captions.csv


In [None]:
# Load CLIP Model
# The model, processor and tokenizer all come from the same pretrained checkpoint.
CLIP_CHECKPOINT = "openai/clip-vit-base-patch32"
model, processor, tokenizer = (
    loader.from_pretrained(CLIP_CHECKPOINT)
    for loader in (CLIPModel, CLIPProcessor, CLIPTokenizer)
)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.50, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.


In [None]:
# Preprocessing Functions
# Per-channel RGB statistics used by the OpenAI CLIP image preprocessing. Without
# this normalisation the vision encoder receives pixel values in [0, 1], a range
# it was not trained on, which degrades the image/text similarity scores.
CLIP_IMAGE_MEAN = (0.48145466, 0.4578275, 0.40821073)
CLIP_IMAGE_STD = (0.26862954, 0.26130258, 0.27577711)

transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize(mean=CLIP_IMAGE_MEAN, std=CLIP_IMAGE_STD),
])

def preprocess_image(image_path):
    """Load an image file and return it as a single-item batch tensor.

    The file is opened with PIL, converted to RGB, run through the module-level
    ``transform`` pipeline, and given a leading batch dimension of size 1.
    """
    rgb_image = Image.open(image_path).convert("RGB")
    return transform(rgb_image).unsqueeze(0)

def preprocess_text(text):
    """Tokenize a text query with the module-level ``tokenizer``.

    Args:
        text: The query string.

    Returns:
        The ``input_ids`` entry of the tokenizer output, requested as PyTorch
        tensors (``return_tensors="pt"``).
    """
    # truncation=True asks the tokenizer to cut over-long queries down to the
    # model's maximum length instead of producing a sequence the text encoder
    # cannot accept.
    encoded = tokenizer(text, return_tensors="pt", truncation=True)
    return encoded["input_ids"]

In [None]:
# Image Search Function
def image_search(query, frame_folder=frame_output_folder, top_k=5):
    """Rank the ``.jpg`` frames in ``frame_folder`` against a text query and show the best.

    Every frame is embedded with the module-level CLIP ``model`` and scored by
    cosine similarity against the embedding of ``query``. The ``top_k`` highest
    scoring frames are printed (name, score, path) and displayed inline.

    Args:
        query: Free-text description of what to look for.
        frame_folder: Directory scanned (non-recursively) for ``.jpg`` files.
        top_k: Number of results to show; values below 1 show nothing.

    Returns:
        None. Results are printed and displayed.
    """
    frame_scores = []

    # Pure inference: no_grad avoids building an autograd graph for every forward pass.
    with torch.no_grad():
        query_tokens = preprocess_text(query)
        text_features = model.get_text_features(input_ids=query_tokens)

        # Sorted so that the scan order (and tie order in the results) is reproducible.
        for frame in sorted(os.listdir(frame_folder)):
            if not frame.endswith(".jpg"):
                continue

            img_path = os.path.join(frame_folder, frame)
            try:
                img_tensor = preprocess_image(img_path)
            except OSError as e:
                # One truncated/corrupt file should not abort a scan of the whole folder.
                print(f"❌ Skipping unreadable image {img_path}: {e}")
                continue

            image_features = model.get_image_features(pixel_values=img_tensor)
            similarity = torch.cosine_similarity(image_features, text_features)
            frame_scores.append((frame, similarity.item(), img_path))

    frame_scores.sort(key=lambda x: x[1], reverse=True)
    # Clamp so a negative top_k cannot turn into an "all but the last k" slice.
    top_results = frame_scores[:max(top_k, 0)]

    print("\n🔎 **Top Search Results:**")
    for frame, score, img_path in top_results:
        print(f"📸 {frame} - Similarity: {score:.4f} - Path: {img_path}")

        try:
            img = Image.open(img_path)
            display(img)  # Show image inside Colab
        except Exception as e:
            # Showing the picture is best-effort; the textual result was already printed.
            print(f"❌ Error displaying image: {e}")

In [None]:
# Run an Example Search Query
# Uses the defaults of image_search: the frames folder and the top 5 matches.
image_search("A person playing guitar")