## 1. Setup and Installations

This section prepares the environment by cloning necessary repositories and installing Python packages.

In [None]:
!git clone https://github.com/ludoplayer69/videotree
!git clone https://github.com/facebookresearch/perception_models.git
!git clone https://github.com/subhadarship/kmeans_pytorch

!pip install -q -U bitsandbytes accelerate transformers
!pip install -q gdown decord ftfy groq

## 2. Configuration

All key parameters for the pipeline are defined here. Modify this section to change model settings, paths, and hyperparameters for experiments.

In [None]:
import os
from pathlib import Path
import torch

# Set your Groq API key here
# Get one from https://console.groq.com/keys
os.environ["GROQ_API_KEY"] = "  "

# To prevent CUDA memory fragmentation
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"

class PipelineConfig:
    # General
    DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    MAX_VIDEOS_TO_PROCESS = 10  # Limit the number of videos for a quick run

    # Paths
    WORK_DIR = Path('/kaggle/working/')
    DATA_DIR = Path('/kaggle/input/egoschema/')
    
    # Derived Paths
    VIDEO_DIR = WORK_DIR / 'downloaded_videos'
    FRAMES_DIR = WORK_DIR / 'extracted_frames'
    FEATURES_DIR = WORK_DIR / 'extracted_features'
    OUTPUTS_DIR = WORK_DIR / 'outputs'
    
    # Data sources
    DRIVE_IDS_JSON = WORK_DIR / 'videotree/drive_ids.json'
    ANNOTATIONS_JSON = DATA_DIR / 'fullset_anno.json'
    CAPTIONS_JSON = DATA_DIR / 'blip2_fullset.json'

    # Feature Extraction
    FRAME_EXTRACTION_FPS = 1
    PERCEPTION_MODEL_NAME = 'PE-Core-B16-224'

    # Width Expansion (Adaptive Clustering)
    GROQ_MODEL = 'llama-3.1-8b-instant'
    MAX_CLUSTER_NUM = 32
    INIT_CLUSTER_NUM = 4
    RELEVANCE_THRESHOLD = 5  # Stop clustering if >= this many high-relevance frames are found

    # Depth Expansion (Hierarchical Clustering)
    NUM_SUBCLUSTERS = 4
    NUM_SUB_SUBCLUSTERS = 4

    # VLM Inference
    VLM_MODEL_ID = "unsloth/Qwen2.5-VL-7B-Instruct-bnb-4bit"
    VLM_BATCH_SIZE = 10 # Lower if you encounter Out-of-Memory errors

    def __init__(self):
        # Create all necessary directories
        for path in [self.WORK_DIR, self.VIDEO_DIR, self.FRAMES_DIR, 
                     self.FEATURES_DIR, self.OUTPUTS_DIR]:
            path.mkdir(parents=True, exist_ok=True)

config = PipelineConfig()

## 3. Imports and Utility Functions

This section contains all required imports and helper functions used throughout the notebook.

In [2]:
import sys
import json
import cv2
import re
import gdown
import string
import time
from tqdm import tqdm
from PIL import Image
import numpy as np
from glob import glob
import gc

import torch
import torch.nn.functional as F
from transformers import AutoProcessor, AutoModelForVision2Seq
from groq import Groq
from scipy.cluster.hierarchy import linkage, fcluster

# Add cloned repos to system path for imports
sys.path.append(str(config.WORK_DIR / 'perception_models'))
sys.path.append(str(config.WORK_DIR / 'kmeans_pytorch'))
from kmeans_pytorch import kmeans

# --- Generic Utilities ---
def load_json(file_path: Path):
    with open(file_path, 'r') as f:
        return json.load(f)

def save_json(data, file_path: Path, indent=4):
    with open(file_path, 'w') as f:
        json.dump(data, f, indent=indent)
        
def _numeric_sort_key(p: Path):
    try:
        return int(p.stem)
    except ValueError:
        return p.stem

2025-08-23 04:32:25.595644: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1755923545.621196     259 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1755923545.629207     259 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


## 4. Data Preparation

Downloads videos from Google Drive and loads the corresponding questions and captions.

In [3]:
def download_videos(drive_json_path: Path, save_path: Path, max_downloads: int):
    if not drive_json_path.exists():
        print(f"[ERROR] Drive IDs JSON not found at: {drive_json_path}")
        return
        
    drive_data = load_json(drive_json_path)
    video_ids_to_download = list(drive_data.items())[:max_downloads]
    
    for uuid, drive_id in tqdm(video_ids_to_download, desc="Downloading Videos"):
        output_file = save_path / f"{uuid}.mp4"
        if not output_file.exists(): # Skip if already downloaded
            gdown.download(id=drive_id, output=str(output_file), quiet=True)

def load_metadata(config: PipelineConfig):
    video_ids = [p.stem for p in config.VIDEO_DIR.glob("*.mp4")]
    annotations = load_json(config.ANNOTATIONS_JSON)
    captions = load_json(config.CAPTIONS_JSON)
    
    questions = {}
    for vid in video_ids:
        if vid not in annotations:
            continue
        
        anno = annotations[vid]
        option_keys = sorted([k for k in anno if k.startswith("option ")], key=lambda x: int(x.split()[1]))
        
        prompt = f"Question:\n{anno['question']}\n\nOptions:\n"
        for letter, key in zip(string.ascii_uppercase, option_keys):
            prompt += f"{letter}. {anno[key]}\n"
        prompt += "\nPlease choose the most appropriate answer (A–E)."
        questions[vid] = prompt
        
    return video_ids, questions, captions

# Execute Data Preparation
print("--- Starting Data Preparation ---")
download_videos(config.DRIVE_IDS_JSON, config.VIDEO_DIR, config.MAX_VIDEOS_TO_PROCESS)
video_ids, questions, captions = load_metadata(config)
print(f"Found {len(video_ids)} videos with corresponding questions.")

--- Starting Data Preparation ---


Downloading Videos: 100%|██████████| 10/10 [00:20<00:00,  2.00s/it]


Found 10 videos with corresponding questions.


## 5. Module 1: Feature Extraction

Extracts frames from videos and computes feature embeddings using a pre-trained perception model.

In [4]:
class FeatureExtractor:
    def __init__(self, config: PipelineConfig):
        self.config = config
        self.model = None
        self.preprocess = None

    def _load_model(self):
        if self.model is not None and self.preprocess is not None:
            return
        
        # The model code expects to be run from its own directory
        cwd = os.getcwd()
        os.chdir(self.config.WORK_DIR / 'perception_models')
        import core.vision_encoder.pe as pe
        import core.vision_encoder.transforms as transforms
        
        self.model = pe.CLIP.from_config(self.config.PERCEPTION_MODEL_NAME, pretrained=True).to(self.config.DEVICE).eval()
        self.preprocess = transforms.get_image_transform(self.model.image_size)
        
        os.chdir(cwd)
        print("Feature extraction model loaded.")

    def _extract_frames_for_video(self, video_path: Path):
        out_dir = self.config.FRAMES_DIR / video_path.stem
        out_dir.mkdir(exist_ok=True)

        cap = cv2.VideoCapture(str(video_path))
        if not cap.isOpened(): return

        fps_ori = cap.get(cv2.CAP_PROP_FPS) or 30
        frame_interval = max(1, int(fps_ori // self.config.FRAME_EXTRACTION_FPS))
        
        count = 0
        success, img = cap.read()
        while success:
            if count % frame_interval == 0:
                cv2.imwrite(str(out_dir / f"{count}.jpg"), img)
            success, img = cap.read()
            count += 1
        cap.release()

    @torch.inference_mode()
    def _extract_features_for_dir(self, frames_dir: Path):
        if not frames_dir.exists(): return

        image_files = sorted(list(frames_dir.glob('*.jpg')), key=_numeric_sort_key)
        if not image_files: return

        feats_list = []
        for img_fp in image_files:
            img = Image.open(img_fp).convert('RGB')
            inp = self.preprocess(img).unsqueeze(0).to(self.config.DEVICE)
            with torch.cuda.amp.autocast():
                feat = self.model.encode_image(inp)
            feats_list.append(feat)
            
        stacked_feats = torch.cat(feats_list, dim=0)
        torch.save(stacked_feats, self.config.FEATURES_DIR / f"{frames_dir.name}.pt")

    def run(self, video_ids: list[str]):
        self._load_model()
        
        print("Extracting frames...")
        for video_id in tqdm(video_ids, desc="Extracting Frames"):
            video_path = self.config.VIDEO_DIR / f"{video_id}.mp4"
            self._extract_frames_for_video(video_path)

        print("Extracting features...")
        for video_id in tqdm(video_ids, desc="Extracting Features"):
            frames_dir = self.config.FRAMES_DIR / video_id
            self._extract_features_for_dir(frames_dir)

# Execute Feature Extraction
print("--- Starting Feature Extraction ---")
feature_extractor = FeatureExtractor(config)
feature_extractor.run(video_ids)

--- Starting Feature Extraction ---
Missing keys for loading model: []
Unexpected keys for loading model: []
Feature extraction model loaded.
Extracting frames...


Extracting Frames: 100%|██████████| 10/10 [00:40<00:00,  4.07s/it]


Extracting features...


  with torch.cuda.amp.autocast():
Extracting Features: 100%|██████████| 10/10 [00:38<00:00,  3.87s/it]


## 6. Module 2: Width Expansion (Adaptive Clustering)

Applies k-means clustering to the frame features. An LLM (via Groq) scores the relevance of representative frames to iteratively find the optimal number of clusters.

In [5]:
class WidthExpansion:
    def __init__(self, config: PipelineConfig):
        self.config = config
        self.model = Groq(api_key=os.environ.get("GROQ_API_KEY"))

    def _get_relevance_scores(self, representative_frames, video_id, captions_dict, questions):
        # This method now expects a dictionary, which is handled in the calling function
        video_captions_list = captions_dict.get(video_id, [])
        question_text = questions.get(video_id, "")
        

        # Access captions by list index since we know it's a list
        descriptions = []
        for idx in representative_frames:
            caption = video_captions_list[idx] if 0 <= idx < len(video_captions_list) else 'No caption.'
            descriptions.append(f"Frame {idx}: {caption}")

        system_prompt = "You are an expert video analyst. Analyze frame descriptions and rate their relevance to answering the specific question."
        user_prompt = f"""VIDEO QUESTION: {question_text}\n
FRAME DESCRIPTIONS:\n{chr(10).join(descriptions)}\n
TASK: Rate each frame's relevance to answering the question on a scale of 1-3 (1=Not relevant, 2=Somewhat, 3=Highly relevant).\n
Provide ONLY the relevance scores in this exact format: frame relevance: [score1, score2, ...]\n
You must provide exactly {len(descriptions)} scores."""
        print('2')
        completion = self.model.chat.completions.create(
            model=self.config.GROQ_MODEL,
            messages=[{"role": "system", "content": system_prompt}, {"role": "user", "content": user_prompt}],
            temperature=0.0, max_tokens=1000
        )
        response_text = completion.choices[0].message.content
        
        match = re.search(r'\[([0-9,\s]+)\]', response_text)
        if match:
            scores = [int(x.strip()) for x in match.group(1).split(',') if x.strip().isdigit()]
            return scores, completion.usage.total_tokens
        return [], 0


    def run_for_video(self, video_id: str, questions: dict, captions: list): # Expects a list of tuples
        features_path = self.config.FEATURES_DIR / f"{video_id}.pt"
        if not features_path.exists():
            print(f"[WARN] Features not found for {video_id}, skipping.")
            return

        # --- FIX: Convert the list of tuples into a dictionary ---
        captions_dict = dict(captions)
        # --- END FIX ---

        frame_feats = torch.load(features_path).to(self.config.DEVICE)
        num_clusters = self.config.INIT_CLUSTER_NUM
        all_attempts = []

        while num_clusters <= self.config.MAX_CLUSTER_NUM:
            cluster_ids, cluster_centers = kmeans(X=frame_feats, num_clusters=num_clusters, distance='cosine', device=self.config.DEVICE)
            
            cluster_centers = cluster_centers.to(self.config.DEVICE)

            representative_frames = []
            for i in range(num_clusters):
                cluster_indices = torch.where(cluster_ids == i)[0]
                if len(cluster_indices) > 0:
                    distances = torch.norm(frame_feats[cluster_indices] - cluster_centers[i], dim=1)
                    closest_local_idx = torch.argmin(distances).item()
                    representative_frames.append(cluster_indices[closest_local_idx].item())
            
            representative_frames = sorted(list(set(representative_frames)))
            # Pass the newly created dictionary to the helper function
            scores, tokens = self._get_relevance_scores(representative_frames, video_id, captions_dict, questions)
            
            high_relevance_count = scores.count(3)
            attempt_data = {
                'num_clusters': num_clusters,
                'representative_frames': representative_frames,
                'cluster_assignments': cluster_ids.tolist(),
                'frame_relevance': scores,
                'high_relevance_count': high_relevance_count,
                'tokens_used': tokens
            }
            all_attempts.append(attempt_data)
            
            if high_relevance_count >= self.config.RELEVANCE_THRESHOLD:
                break
            num_clusters *= 2
        
        if not all_attempts:
            print(f"[WARN] Could not perform clustering for {video_id}.")
            return

        final_result = all_attempts[-1]
        output_data = {
            'video_id': video_id,
            'final_result': final_result,
            'all_clustering_attempts': all_attempts,
            'total_tokens': sum(a['tokens_used'] for a in all_attempts)
        }
        save_json(output_data, self.config.OUTPUTS_DIR / f'width_expansion_{video_id}.json')
        return output_data

# Execute Width Expansion
print("--- Starting Width Expansion (Adaptive Clustering) ---")
width_expander = WidthExpansion(config)
for video_id in tqdm(video_ids, desc="Processing Videos (Width)"):
    # The 'captions' variable here is the original list of tuples
    width_expander.run_for_video(video_id, questions, captions)
print("Width expansion complete.")

--- Starting Width Expansion (Adaptive Clustering) ---


Processing Videos (Width):   0%|          | 0/10 [00:00<?, ?it/s]

running k-means on cuda..



[running kmeans]: 0it [00:00, ?it/s][A
[running kmeans]: 0it [00:00, ?it/s, center_shift=188.140259, iteration=1, tol=0.000100][A
[running kmeans]: 1it [00:00,  3.90it/s, center_shift=188.140259, iteration=1, tol=0.000100][A
[running kmeans]: 1it [00:00,  3.90it/s, center_shift=6.656952, iteration=2, tol=0.000100]  [A
[running kmeans]: 2it [00:00,  3.90it/s, center_shift=2.033382, iteration=3, tol=0.000100][A
[running kmeans]: 3it [00:00,  3.90it/s, center_shift=0.353337, iteration=4, tol=0.000100][A
[running kmeans]: 4it [00:00,  3.90it/s, center_shift=0.139455, iteration=5, tol=0.000100][A
[running kmeans]: 6it [00:00, 21.00it/s, center_shift=0.000000, iteration=6, tol=0.000100][A


2
running k-means on cuda..



[running kmeans]: 0it [00:00, ?it/s][A
[running kmeans]: 0it [00:00, ?it/s, center_shift=538.343384, iteration=1, tol=0.000100][A
[running kmeans]: 1it [00:00, 126.02it/s, center_shift=11.064881, iteration=2, tol=0.000100][A
[running kmeans]: 2it [00:00, 166.90it/s, center_shift=5.948859, iteration=3, tol=0.000100] [A
[running kmeans]: 3it [00:00, 187.88it/s, center_shift=1.407595, iteration=4, tol=0.000100][A
[running kmeans]: 4it [00:00, 200.73it/s, center_shift=2.239845, iteration=5, tol=0.000100][A
[running kmeans]: 5it [00:00, 207.25it/s, center_shift=2.057570, iteration=6, tol=0.000100][A
[running kmeans]: 6it [00:00, 209.96it/s, center_shift=1.048385, iteration=7, tol=0.000100][A
[running kmeans]: 7it [00:00, 214.48it/s, center_shift=0.268548, iteration=8, tol=0.000100][A
[running kmeans]: 9it [00:00, 232.70it/s, center_shift=0.000000, iteration=9, tol=0.000100][A

2





running k-means on cuda..



[running kmeans]: 0it [00:00, ?it/s][A
[running kmeans]: 0it [00:00, ?it/s, center_shift=1389.674927, iteration=1, tol=0.000100][A
[running kmeans]: 1it [00:00, 103.63it/s, center_shift=31.922663, iteration=2, tol=0.000100][A
[running kmeans]: 2it [00:00, 133.11it/s, center_shift=4.012420, iteration=3, tol=0.000100] [A
[running kmeans]: 3it [00:00, 149.56it/s, center_shift=6.055304, iteration=4, tol=0.000100][A
[running kmeans]: 4it [00:00, 158.85it/s, center_shift=0.294459, iteration=5, tol=0.000100][A
[running kmeans]: 5it [00:00, 165.73it/s, center_shift=0.043747, iteration=6, tol=0.000100][A
[running kmeans]: 6it [00:00, 170.71it/s, center_shift=0.039437, iteration=7, tol=0.000100][A
[running kmeans]: 7it [00:00, 174.66it/s, center_shift=0.155348, iteration=8, tol=0.000100][A
[running kmeans]: 8it [00:00, 177.75it/s, center_shift=0.040769, iteration=9, tol=0.000100][A
[running kmeans]: 9it [00:00, 179.52it/s, center_shift=0.036466, iteration=10, tol=0.000100][A
[running

2





running k-means on cuda..



[running kmeans]: 0it [00:00, ?it/s][A
[running kmeans]: 0it [00:00, ?it/s, center_shift=4606.518555, iteration=1, tol=0.000100][A
[running kmeans]: 1it [00:00, 70.72it/s, center_shift=18.780474, iteration=2, tol=0.000100][A
[running kmeans]: 2it [00:00, 93.48it/s, center_shift=2.821367, iteration=3, tol=0.000100] [A
[running kmeans]: 3it [00:00, 103.57it/s, center_shift=1.239353, iteration=4, tol=0.000100][A
[running kmeans]: 4it [00:00, 110.46it/s, center_shift=0.355974, iteration=5, tol=0.000100][A
[running kmeans]: 5it [00:00, 114.23it/s, center_shift=0.289929, iteration=6, tol=0.000100][A
[running kmeans]: 7it [00:00, 131.79it/s, center_shift=0.000000, iteration=7, tol=0.000100][A

2



Processing Videos (Width):  10%|█         | 1/10 [00:03<00:29,  3.32s/it]

running k-means on cuda..



[running kmeans]: 0it [00:00, ?it/s][A
[running kmeans]: 0it [00:00, ?it/s, center_shift=321.375336, iteration=1, tol=0.000100][A
[running kmeans]: 1it [00:00, 180.32it/s, center_shift=17.444477, iteration=2, tol=0.000100][A
[running kmeans]: 2it [00:00, 217.78it/s, center_shift=2.592755, iteration=3, tol=0.000100] [A
[running kmeans]: 3it [00:00, 234.81it/s, center_shift=1.513756, iteration=4, tol=0.000100][A
[running kmeans]: 4it [00:00, 243.63it/s, center_shift=0.396854, iteration=5, tol=0.000100][A
[running kmeans]: 5it [00:00, 250.69it/s, center_shift=0.131118, iteration=6, tol=0.000100][A
[running kmeans]: 7it [00:00, 273.82it/s, center_shift=0.000000, iteration=7, tol=0.000100][A


2
running k-means on cuda..



[running kmeans]: 0it [00:00, ?it/s][A
[running kmeans]: 0it [00:00, ?it/s, center_shift=1020.801575, iteration=1, tol=0.000100][A
[running kmeans]: 1it [00:00, 142.74it/s, center_shift=15.772219, iteration=2, tol=0.000100][A
[running kmeans]: 2it [00:00, 182.41it/s, center_shift=4.262739, iteration=3, tol=0.000100] [A
[running kmeans]: 3it [00:00, 201.70it/s, center_shift=7.017419, iteration=4, tol=0.000100][A
[running kmeans]: 4it [00:00, 210.90it/s, center_shift=0.872046, iteration=5, tol=0.000100][A
[running kmeans]: 5it [00:00, 214.47it/s, center_shift=1.449256, iteration=6, tol=0.000100][A
[running kmeans]: 6it [00:00, 217.28it/s, center_shift=2.711781, iteration=7, tol=0.000100][A
[running kmeans]: 7it [00:00, 216.84it/s, center_shift=2.088401, iteration=8, tol=0.000100][A
[running kmeans]: 8it [00:00, 219.81it/s, center_shift=0.498171, iteration=9, tol=0.000100][A
[running kmeans]: 10it [00:00, 232.53it/s, center_shift=0.000000, iteration=10, tol=0.000100][A

2





running k-means on cuda..



[running kmeans]: 0it [00:00, ?it/s][A
[running kmeans]: 0it [00:00, ?it/s, center_shift=3377.533447, iteration=1, tol=0.000100][A
[running kmeans]: 1it [00:00, 105.65it/s, center_shift=51.638443, iteration=2, tol=0.000100][A
[running kmeans]: 2it [00:00, 137.56it/s, center_shift=38.048801, iteration=3, tol=0.000100][A
[running kmeans]: 3it [00:00, 154.53it/s, center_shift=9.550435, iteration=4, tol=0.000100] [A
[running kmeans]: 4it [00:00, 166.03it/s, center_shift=1.536264, iteration=5, tol=0.000100][A
[running kmeans]: 5it [00:00, 171.80it/s, center_shift=4.646233, iteration=6, tol=0.000100][A
[running kmeans]: 6it [00:00, 176.68it/s, center_shift=1.812606, iteration=7, tol=0.000100][A
[running kmeans]: 7it [00:00, 178.79it/s, center_shift=1.395089, iteration=8, tol=0.000100][A
[running kmeans]: 8it [00:00, 179.38it/s, center_shift=0.960055, iteration=9, tol=0.000100][A
[running kmeans]: 9it [00:00, 180.53it/s, center_shift=3.291580, iteration=10, tol=0.000100][A
[runnin

2





running k-means on cuda..



[running kmeans]: 0it [00:00, ?it/s][A
[running kmeans]: 0it [00:00, ?it/s, center_shift=9640.849609, iteration=1, tol=0.000100][A
[running kmeans]: 1it [00:00, 73.99it/s, center_shift=67.970192, iteration=2, tol=0.000100][A
[running kmeans]: 2it [00:00, 93.31it/s, center_shift=28.737940, iteration=3, tol=0.000100][A
[running kmeans]: 3it [00:00, 104.44it/s, center_shift=1.732220, iteration=4, tol=0.000100][A
[running kmeans]: 4it [00:00, 110.13it/s, center_shift=1.007924, iteration=5, tol=0.000100][A
[running kmeans]: 6it [00:00, 131.35it/s, center_shift=0.000000, iteration=6, tol=0.000100][A

2



Processing Videos (Width):  20%|██        | 2/10 [00:07<00:28,  3.61s/it]

running k-means on cuda..



[running kmeans]: 0it [00:00, ?it/s][A
[running kmeans]: 0it [00:00, ?it/s, center_shift=463.414368, iteration=1, tol=0.000100][A
[running kmeans]: 1it [00:00, 165.25it/s, center_shift=12.491178, iteration=2, tol=0.000100][A
[running kmeans]: 2it [00:00, 211.14it/s, center_shift=4.822322, iteration=3, tol=0.000100] [A
[running kmeans]: 3it [00:00, 233.51it/s, center_shift=4.039855, iteration=4, tol=0.000100][A
[running kmeans]: 4it [00:00, 240.18it/s, center_shift=1.473219, iteration=5, tol=0.000100][A
[running kmeans]: 5it [00:00, 248.19it/s, center_shift=0.414318, iteration=6, tol=0.000100][A
[running kmeans]: 6it [00:00, 255.31it/s, center_shift=1.148929, iteration=7, tol=0.000100][A
[running kmeans]: 7it [00:00, 262.21it/s, center_shift=0.136464, iteration=8, tol=0.000100][A
[running kmeans]: 8it [00:00, 266.41it/s, center_shift=0.129773, iteration=9, tol=0.000100][A
[running kmeans]: 9it [00:00, 270.67it/s, center_shift=0.110136, iteration=10, tol=0.000100][A
[running 

2
running k-means on cuda..



[running kmeans]: 0it [00:00, ?it/s][A
[running kmeans]: 0it [00:00, ?it/s, center_shift=1793.734985, iteration=1, tol=0.000100][A
[running kmeans]: 1it [00:00, 136.31it/s, center_shift=139.356033, iteration=2, tol=0.000100][A
[running kmeans]: 2it [00:00, 175.17it/s, center_shift=69.361763, iteration=3, tol=0.000100] [A
[running kmeans]: 3it [00:00, 187.94it/s, center_shift=50.438812, iteration=4, tol=0.000100][A
[running kmeans]: 4it [00:00, 198.75it/s, center_shift=9.054205, iteration=5, tol=0.000100] [A
[running kmeans]: 5it [00:00, 202.94it/s, center_shift=3.646073, iteration=6, tol=0.000100][A
[running kmeans]: 6it [00:00, 208.86it/s, center_shift=0.218852, iteration=7, tol=0.000100][A
[running kmeans]: 8it [00:00, 228.16it/s, center_shift=0.000000, iteration=8, tol=0.000100][A

2





running k-means on cuda..



[running kmeans]: 0it [00:00, ?it/s][A
[running kmeans]: 0it [00:00, ?it/s, center_shift=3391.110840, iteration=1, tol=0.000100][A
[running kmeans]: 1it [00:00, 111.83it/s, center_shift=41.664131, iteration=2, tol=0.000100][A
[running kmeans]: 2it [00:00, 144.71it/s, center_shift=15.856494, iteration=3, tol=0.000100][A
[running kmeans]: 3it [00:00, 154.51it/s, center_shift=1.387201, iteration=4, tol=0.000100] [A
[running kmeans]: 5it [00:00, 186.81it/s, center_shift=0.000000, iteration=5, tol=0.000100][A

2





running k-means on cuda..



[running kmeans]: 0it [00:00, ?it/s][A
[running kmeans]: 0it [00:00, ?it/s, center_shift=13793.071289, iteration=1, tol=0.000100][A
[running kmeans]: 1it [00:00, 70.21it/s, center_shift=115.675385, iteration=2, tol=0.000100][A
[running kmeans]: 2it [00:00, 92.06it/s, center_shift=58.226871, iteration=3, tol=0.000100] [A
[running kmeans]: 3it [00:00, 99.21it/s, center_shift=8.251153, iteration=4, tol=0.000100] [A
[running kmeans]: 5it [00:00, 125.08it/s, center_shift=0.000000, iteration=5, tol=0.000100][A

2



Processing Videos (Width):  30%|███       | 3/10 [00:12<00:30,  4.39s/it]

running k-means on cuda..



[running kmeans]: 0it [00:00, ?it/s][A
[running kmeans]: 0it [00:00, ?it/s, center_shift=262.762268, iteration=1, tol=0.000100][A
[running kmeans]: 1it [00:00, 162.27it/s, center_shift=14.912267, iteration=2, tol=0.000100][A
[running kmeans]: 2it [00:00, 203.56it/s, center_shift=0.605691, iteration=3, tol=0.000100] [A
[running kmeans]: 3it [00:00, 220.63it/s, center_shift=0.423645, iteration=4, tol=0.000100][A
[running kmeans]: 4it [00:00, 231.30it/s, center_shift=0.055259, iteration=5, tol=0.000100][A
[running kmeans]: 6it [00:00, 258.11it/s, center_shift=0.000000, iteration=6, tol=0.000100][A


2
running k-means on cuda..



[running kmeans]: 0it [00:00, ?it/s][A
[running kmeans]: 0it [00:00, ?it/s, center_shift=975.380066, iteration=1, tol=0.000100][A
[running kmeans]: 1it [00:00, 139.01it/s, center_shift=30.399158, iteration=2, tol=0.000100][A
[running kmeans]: 2it [00:00, 182.99it/s, center_shift=6.667058, iteration=3, tol=0.000100] [A
[running kmeans]: 3it [00:00, 202.58it/s, center_shift=0.747641, iteration=4, tol=0.000100][A
[running kmeans]: 5it [00:00, 241.03it/s, center_shift=0.000000, iteration=5, tol=0.000100][A

2





running k-means on cuda..



[running kmeans]: 0it [00:00, ?it/s][A
[running kmeans]: 0it [00:00, ?it/s, center_shift=3085.843018, iteration=1, tol=0.000100][A
[running kmeans]: 1it [00:00, 102.21it/s, center_shift=96.599762, iteration=2, tol=0.000100][A
[running kmeans]: 2it [00:00, 134.53it/s, center_shift=29.260267, iteration=3, tol=0.000100][A
[running kmeans]: 3it [00:00, 150.20it/s, center_shift=0.502125, iteration=4, tol=0.000100] [A
[running kmeans]: 4it [00:00, 160.13it/s, center_shift=0.361889, iteration=5, tol=0.000100][A
[running kmeans]: 5it [00:00, 166.15it/s, center_shift=0.299426, iteration=6, tol=0.000100][A
[running kmeans]: 6it [00:00, 170.64it/s, center_shift=0.850568, iteration=7, tol=0.000100][A
[running kmeans]: 7it [00:00, 174.67it/s, center_shift=0.124626, iteration=8, tol=0.000100][A
[running kmeans]: 8it [00:00, 178.05it/s, center_shift=1.066220, iteration=9, tol=0.000100][A
[running kmeans]: 9it [00:00, 180.29it/s, center_shift=0.088493, iteration=10, tol=0.000100][A
[runnin

2





running k-means on cuda..



[running kmeans]: 0it [00:00, ?it/s][A
[running kmeans]: 0it [00:00, ?it/s, center_shift=6622.844238, iteration=1, tol=0.000100][A
[running kmeans]: 1it [00:00, 81.08it/s, center_shift=121.904297, iteration=2, tol=0.000100][A
[running kmeans]: 2it [00:00, 102.10it/s, center_shift=2.852664, iteration=3, tol=0.000100] [A
[running kmeans]: 3it [00:00, 112.29it/s, center_shift=0.340378, iteration=4, tol=0.000100][A
[running kmeans]: 5it [00:00, 138.03it/s, center_shift=0.000000, iteration=5, tol=0.000100][A

2



Processing Videos (Width):  40%|████      | 4/10 [00:35<01:10, 11.72s/it]

running k-means on cuda..



[running kmeans]: 0it [00:00, ?it/s][A
[running kmeans]: 0it [00:00, ?it/s, center_shift=209.821259, iteration=1, tol=0.000100][A
[running kmeans]: 1it [00:00, 163.09it/s, center_shift=3.515565, iteration=2, tol=0.000100][A
[running kmeans]: 2it [00:00, 209.32it/s, center_shift=2.525043, iteration=3, tol=0.000100][A
[running kmeans]: 3it [00:00, 233.22it/s, center_shift=3.609009, iteration=4, tol=0.000100][A
[running kmeans]: 4it [00:00, 249.22it/s, center_shift=1.853347, iteration=5, tol=0.000100][A
[running kmeans]: 5it [00:00, 258.96it/s, center_shift=1.144781, iteration=6, tol=0.000100][A
[running kmeans]: 6it [00:00, 266.59it/s, center_shift=0.039114, iteration=7, tol=0.000100][A
[running kmeans]: 7it [00:00, 271.05it/s, center_shift=0.040561, iteration=8, tol=0.000100][A
[running kmeans]: 9it [00:00, 289.20it/s, center_shift=0.000000, iteration=9, tol=0.000100][A


2
running k-means on cuda..



[running kmeans]: 0it [00:00, ?it/s][A
[running kmeans]: 0it [00:00, ?it/s, center_shift=822.633667, iteration=1, tol=0.000100][A
[running kmeans]: 1it [00:00, 135.33it/s, center_shift=59.755886, iteration=2, tol=0.000100][A
[running kmeans]: 2it [00:00, 178.21it/s, center_shift=13.300785, iteration=3, tol=0.000100][A
[running kmeans]: 3it [00:00, 197.32it/s, center_shift=2.555480, iteration=4, tol=0.000100] [A
[running kmeans]: 4it [00:00, 207.48it/s, center_shift=1.132269, iteration=5, tol=0.000100][A
[running kmeans]: 6it [00:00, 237.49it/s, center_shift=0.000000, iteration=6, tol=0.000100][A

2





running k-means on cuda..



[running kmeans]: 0it [00:00, ?it/s][A
[running kmeans]: 0it [00:00, ?it/s, center_shift=2232.361328, iteration=1, tol=0.000100][A
[running kmeans]: 1it [00:00, 106.03it/s, center_shift=37.831345, iteration=2, tol=0.000100][A
[running kmeans]: 2it [00:00, 140.80it/s, center_shift=0.279432, iteration=3, tol=0.000100] [A
[running kmeans]: 4it [00:00, 186.68it/s, center_shift=0.000000, iteration=4, tol=0.000100][A

2





running k-means on cuda..



[running kmeans]: 0it [00:00, ?it/s][A
[running kmeans]: 0it [00:00, ?it/s, center_shift=7447.219238, iteration=1, tol=0.000100][A
[running kmeans]: 1it [00:00, 67.71it/s, center_shift=49.606190, iteration=2, tol=0.000100][A
[running kmeans]: 2it [00:00, 90.65it/s, center_shift=0.800744, iteration=3, tol=0.000100] [A
[running kmeans]: 4it [00:00, 126.92it/s, center_shift=0.000000, iteration=4, tol=0.000100][A

2



Processing Videos (Width):  50%|█████     | 5/10 [01:06<01:33, 18.69s/it]

running k-means on cuda..



[running kmeans]: 0it [00:00, ?it/s][A
[running kmeans]: 0it [00:00, ?it/s, center_shift=324.727020, iteration=1, tol=0.000100][A
[running kmeans]: 1it [00:00, 158.63it/s, center_shift=9.744284, iteration=2, tol=0.000100][A
[running kmeans]: 2it [00:00, 207.85it/s, center_shift=4.176831, iteration=3, tol=0.000100][A
[running kmeans]: 3it [00:00, 231.04it/s, center_shift=1.482144, iteration=4, tol=0.000100][A
[running kmeans]: 4it [00:00, 243.55it/s, center_shift=0.740228, iteration=5, tol=0.000100][A
[running kmeans]: 5it [00:00, 251.14it/s, center_shift=0.299111, iteration=6, tol=0.000100][A
[running kmeans]: 6it [00:00, 254.60it/s, center_shift=0.139624, iteration=7, tol=0.000100][A
[running kmeans]: 7it [00:00, 258.54it/s, center_shift=0.139928, iteration=8, tol=0.000100][A
[running kmeans]: 8it [00:00, 260.93it/s, center_shift=0.186372, iteration=9, tol=0.000100][A
[running kmeans]: 9it [00:00, 262.39it/s, center_shift=0.151171, iteration=10, tol=0.000100][A
[running km

2
running k-means on cuda..



[running kmeans]: 0it [00:00, ?it/s][A
[running kmeans]: 0it [00:00, ?it/s, center_shift=1140.818481, iteration=1, tol=0.000100][A
[running kmeans]: 1it [00:00, 140.99it/s, center_shift=44.733231, iteration=2, tol=0.000100][A
[running kmeans]: 2it [00:00, 181.23it/s, center_shift=25.894604, iteration=3, tol=0.000100][A
[running kmeans]: 3it [00:00, 194.18it/s, center_shift=12.919411, iteration=4, tol=0.000100][A
[running kmeans]: 4it [00:00, 201.70it/s, center_shift=11.507927, iteration=5, tol=0.000100][A
[running kmeans]: 5it [00:00, 208.85it/s, center_shift=0.063971, iteration=6, tol=0.000100] [A
[running kmeans]: 6it [00:00, 214.08it/s, center_shift=0.167965, iteration=7, tol=0.000100][A
[running kmeans]: 7it [00:00, 218.87it/s, center_shift=0.186743, iteration=8, tol=0.000100][A
[running kmeans]: 8it [00:00, 222.38it/s, center_shift=0.681145, iteration=9, tol=0.000100][A
[running kmeans]: 9it [00:00, 225.68it/s, center_shift=0.559716, iteration=10, tol=0.000100][A
[runn

2





running k-means on cuda..



[running kmeans]: 0it [00:00, ?it/s][A
[running kmeans]: 0it [00:00, ?it/s, center_shift=3370.602783, iteration=1, tol=0.000100][A
[running kmeans]: 1it [00:00, 107.33it/s, center_shift=104.334244, iteration=2, tol=0.000100][A
[running kmeans]: 2it [00:00, 141.61it/s, center_shift=19.889393, iteration=3, tol=0.000100] [A
[running kmeans]: 3it [00:00, 157.13it/s, center_shift=37.277691, iteration=4, tol=0.000100][A
[running kmeans]: 4it [00:00, 166.96it/s, center_shift=11.384549, iteration=5, tol=0.000100][A
[running kmeans]: 5it [00:00, 171.75it/s, center_shift=2.694604, iteration=6, tol=0.000100] [A
[running kmeans]: 7it [00:00, 195.75it/s, center_shift=0.000000, iteration=7, tol=0.000100][A

2





running k-means on cuda..



[running kmeans]: 0it [00:00, ?it/s][A
[running kmeans]: 0it [00:00, ?it/s, center_shift=9771.169922, iteration=1, tol=0.000100][A
[running kmeans]: 1it [00:00, 73.75it/s, center_shift=110.124626, iteration=2, tol=0.000100][A
[running kmeans]: 2it [00:00, 95.63it/s, center_shift=10.160331, iteration=3, tol=0.000100] [A
[running kmeans]: 3it [00:00, 106.07it/s, center_shift=1.302236, iteration=4, tol=0.000100][A
[running kmeans]: 4it [00:00, 112.60it/s, center_shift=1.781830, iteration=5, tol=0.000100][A
[running kmeans]: 6it [00:00, 133.88it/s, center_shift=0.000000, iteration=6, tol=0.000100][A

2



Processing Videos (Width):  60%|██████    | 6/10 [01:38<01:33, 23.40s/it]

running k-means on cuda..



[running kmeans]: 0it [00:00, ?it/s][A
[running kmeans]: 0it [00:00, ?it/s, center_shift=189.750305, iteration=1, tol=0.000100][A
[running kmeans]: 1it [00:00, 157.92it/s, center_shift=9.343554, iteration=2, tol=0.000100][A
[running kmeans]: 2it [00:00, 202.13it/s, center_shift=5.344783, iteration=3, tol=0.000100][A
[running kmeans]: 3it [00:00, 223.86it/s, center_shift=15.996229, iteration=4, tol=0.000100][A
[running kmeans]: 4it [00:00, 235.30it/s, center_shift=2.106526, iteration=5, tol=0.000100] [A
[running kmeans]: 5it [00:00, 244.70it/s, center_shift=1.155721, iteration=6, tol=0.000100][A
[running kmeans]: 7it [00:00, 269.36it/s, center_shift=0.000000, iteration=7, tol=0.000100][A


2
running k-means on cuda..



[running kmeans]: 0it [00:00, ?it/s][A
[running kmeans]: 0it [00:00, ?it/s, center_shift=908.942993, iteration=1, tol=0.000100][A
[running kmeans]: 1it [00:00, 136.42it/s, center_shift=27.286325, iteration=2, tol=0.000100][A
[running kmeans]: 2it [00:00, 173.45it/s, center_shift=11.436441, iteration=3, tol=0.000100][A
[running kmeans]: 3it [00:00, 192.73it/s, center_shift=5.251940, iteration=4, tol=0.000100] [A
[running kmeans]: 4it [00:00, 206.68it/s, center_shift=0.466225, iteration=5, tol=0.000100][A
[running kmeans]: 5it [00:00, 214.52it/s, center_shift=1.011579, iteration=6, tol=0.000100][A
[running kmeans]: 6it [00:00, 218.66it/s, center_shift=1.363625, iteration=7, tol=0.000100][A
[running kmeans]: 7it [00:00, 222.75it/s, center_shift=0.258433, iteration=8, tol=0.000100][A
[running kmeans]: 8it [00:00, 224.60it/s, center_shift=0.586883, iteration=9, tol=0.000100][A
[running kmeans]: 9it [00:00, 227.37it/s, center_shift=0.545586, iteration=10, tol=0.000100][A
[running

2





running k-means on cuda..



[running kmeans]: 0it [00:00, ?it/s][A
[running kmeans]: 0it [00:00, ?it/s, center_shift=2484.963379, iteration=1, tol=0.000100][A
[running kmeans]: 1it [00:00, 102.43it/s, center_shift=69.300560, iteration=2, tol=0.000100][A
[running kmeans]: 2it [00:00, 133.32it/s, center_shift=23.132177, iteration=3, tol=0.000100][A
[running kmeans]: 3it [00:00, 148.27it/s, center_shift=2.231285, iteration=4, tol=0.000100] [A
[running kmeans]: 5it [00:00, 181.84it/s, center_shift=0.000000, iteration=5, tol=0.000100][A

2





running k-means on cuda..



[running kmeans]: 0it [00:00, ?it/s][A
[running kmeans]: 0it [00:00, ?it/s, center_shift=6713.841309, iteration=1, tol=0.000100][A
[running kmeans]: 1it [00:00, 70.60it/s, center_shift=287.481018, iteration=2, tol=0.000100][A
[running kmeans]: 2it [00:00, 92.72it/s, center_shift=37.682331, iteration=3, tol=0.000100] [A
[running kmeans]: 3it [00:00, 100.90it/s, center_shift=37.969372, iteration=4, tol=0.000100][A
[running kmeans]: 5it [00:00, 125.66it/s, center_shift=0.000000, iteration=5, tol=0.000100] [A

2



Processing Videos (Width):  70%|███████   | 7/10 [02:12<01:20, 26.86s/it]

running k-means on cuda..



[running kmeans]: 0it [00:00, ?it/s][A
[running kmeans]: 0it [00:00, ?it/s, center_shift=230.468597, iteration=1, tol=0.000100][A
[running kmeans]: 1it [00:00, 164.43it/s, center_shift=10.286017, iteration=2, tol=0.000100][A
[running kmeans]: 2it [00:00, 211.91it/s, center_shift=1.749592, iteration=3, tol=0.000100] [A
[running kmeans]: 3it [00:00, 234.67it/s, center_shift=0.604805, iteration=4, tol=0.000100][A
[running kmeans]: 4it [00:00, 245.37it/s, center_shift=1.028258, iteration=5, tol=0.000100][A
[running kmeans]: 5it [00:00, 256.59it/s, center_shift=1.692023, iteration=6, tol=0.000100][A
[running kmeans]: 6it [00:00, 257.35it/s, center_shift=1.679716, iteration=7, tol=0.000100][A
[running kmeans]: 7it [00:00, 264.51it/s, center_shift=1.070394, iteration=8, tol=0.000100][A
[running kmeans]: 8it [00:00, 266.68it/s, center_shift=0.484296, iteration=9, tol=0.000100][A
[running kmeans]: 9it [00:00, 265.17it/s, center_shift=0.061234, iteration=10, tol=0.000100][A
[running 

2
running k-means on cuda..



[running kmeans]: 0it [00:00, ?it/s][A
[running kmeans]: 0it [00:00, ?it/s, center_shift=770.531921, iteration=1, tol=0.000100][A
[running kmeans]: 1it [00:00, 119.38it/s, center_shift=16.626810, iteration=2, tol=0.000100][A
[running kmeans]: 2it [00:00, 155.09it/s, center_shift=10.069311, iteration=3, tol=0.000100][A
[running kmeans]: 3it [00:00, 177.59it/s, center_shift=0.960586, iteration=4, tol=0.000100] [A
[running kmeans]: 4it [00:00, 189.74it/s, center_shift=0.162304, iteration=5, tol=0.000100][A
[running kmeans]: 6it [00:00, 218.75it/s, center_shift=0.000000, iteration=6, tol=0.000100][A

2





running k-means on cuda..



[running kmeans]: 0it [00:00, ?it/s][A
[running kmeans]: 0it [00:00, ?it/s, center_shift=3203.256592, iteration=1, tol=0.000100][A
[running kmeans]: 1it [00:00, 101.93it/s, center_shift=136.132462, iteration=2, tol=0.000100][A
[running kmeans]: 2it [00:00, 134.52it/s, center_shift=20.140392, iteration=3, tol=0.000100] [A
[running kmeans]: 3it [00:00, 149.81it/s, center_shift=2.994395, iteration=4, tol=0.000100] [A
[running kmeans]: 5it [00:00, 184.06it/s, center_shift=0.000000, iteration=5, tol=0.000100][A

2





running k-means on cuda..



[running kmeans]: 0it [00:00, ?it/s][A
[running kmeans]: 0it [00:00, ?it/s, center_shift=7068.864258, iteration=1, tol=0.000100][A
[running kmeans]: 1it [00:00, 70.19it/s, center_shift=48.080647, iteration=2, tol=0.000100][A
[running kmeans]: 2it [00:00, 91.15it/s, center_shift=4.769558, iteration=3, tol=0.000100] [A
[running kmeans]: 3it [00:00, 103.36it/s, center_shift=0.329073, iteration=4, tol=0.000100][A
[running kmeans]: 5it [00:00, 130.50it/s, center_shift=0.000000, iteration=5, tol=0.000100][A

2



Processing Videos (Width):  80%|████████  | 8/10 [02:41<00:54, 27.28s/it]

running k-means on cuda..



[running kmeans]: 0it [00:00, ?it/s][A
[running kmeans]: 0it [00:00, ?it/s, center_shift=443.979858, iteration=1, tol=0.000100][A
[running kmeans]: 1it [00:00, 164.53it/s, center_shift=8.748177, iteration=2, tol=0.000100][A
[running kmeans]: 2it [00:00, 205.23it/s, center_shift=1.756175, iteration=3, tol=0.000100][A
[running kmeans]: 3it [00:00, 225.78it/s, center_shift=1.115261, iteration=4, tol=0.000100][A
[running kmeans]: 4it [00:00, 231.60it/s, center_shift=0.900914, iteration=5, tol=0.000100][A
[running kmeans]: 5it [00:00, 241.81it/s, center_shift=0.572431, iteration=6, tol=0.000100][A
[running kmeans]: 6it [00:00, 247.34it/s, center_shift=0.595598, iteration=7, tol=0.000100][A
[running kmeans]: 7it [00:00, 251.70it/s, center_shift=0.094314, iteration=8, tol=0.000100][A
[running kmeans]: 9it [00:00, 270.00it/s, center_shift=0.000000, iteration=9, tol=0.000100][A


2
running k-means on cuda..



[running kmeans]: 0it [00:00, ?it/s][A
[running kmeans]: 0it [00:00, ?it/s, center_shift=1172.735352, iteration=1, tol=0.000100][A
[running kmeans]: 1it [00:00, 132.84it/s, center_shift=47.528881, iteration=2, tol=0.000100][A
[running kmeans]: 2it [00:00, 169.93it/s, center_shift=13.774522, iteration=3, tol=0.000100][A
[running kmeans]: 3it [00:00, 189.90it/s, center_shift=3.441651, iteration=4, tol=0.000100] [A
[running kmeans]: 4it [00:00, 199.12it/s, center_shift=1.686678, iteration=5, tol=0.000100][A
[running kmeans]: 5it [00:00, 206.28it/s, center_shift=1.975641, iteration=6, tol=0.000100][A
[running kmeans]: 6it [00:00, 211.73it/s, center_shift=1.635498, iteration=7, tol=0.000100][A
[running kmeans]: 8it [00:00, 230.18it/s, center_shift=0.000000, iteration=8, tol=0.000100][A

2





running k-means on cuda..



[running kmeans]: 0it [00:00, ?it/s][A
[running kmeans]: 0it [00:00, ?it/s, center_shift=3577.828125, iteration=1, tol=0.000100][A
[running kmeans]: 1it [00:00, 91.43it/s, center_shift=82.218208, iteration=2, tol=0.000100][A
[running kmeans]: 2it [00:00, 125.96it/s, center_shift=8.187290, iteration=3, tol=0.000100][A
[running kmeans]: 3it [00:00, 144.53it/s, center_shift=3.941617, iteration=4, tol=0.000100][A
[running kmeans]: 5it [00:00, 177.27it/s, center_shift=0.000000, iteration=5, tol=0.000100][A

2





running k-means on cuda..



[running kmeans]: 0it [00:00, ?it/s][A
[running kmeans]: 0it [00:00, ?it/s, center_shift=10377.283203, iteration=1, tol=0.000100][A
[running kmeans]: 1it [00:00, 68.43it/s, center_shift=23.981173, iteration=2, tol=0.000100][A
[running kmeans]: 2it [00:00, 90.14it/s, center_shift=1.941187, iteration=3, tol=0.000100] [A
[running kmeans]: 4it [00:00, 126.63it/s, center_shift=0.000000, iteration=4, tol=0.000100][A

2



Processing Videos (Width):  90%|█████████ | 9/10 [03:11<00:28, 28.22s/it]

running k-means on cuda..



[running kmeans]: 0it [00:00, ?it/s][A
[running kmeans]: 0it [00:00, ?it/s, center_shift=193.987900, iteration=1, tol=0.000100][A
[running kmeans]: 1it [00:00, 155.73it/s, center_shift=6.956596, iteration=2, tol=0.000100][A
[running kmeans]: 2it [00:00, 198.39it/s, center_shift=1.858677, iteration=3, tol=0.000100][A
[running kmeans]: 3it [00:00, 222.68it/s, center_shift=2.100240, iteration=4, tol=0.000100][A
[running kmeans]: 4it [00:00, 236.58it/s, center_shift=0.505365, iteration=5, tol=0.000100][A
[running kmeans]: 5it [00:00, 245.91it/s, center_shift=0.019288, iteration=6, tol=0.000100][A
[running kmeans]: 6it [00:00, 251.39it/s, center_shift=0.013698, iteration=7, tol=0.000100][A
[running kmeans]: 8it [00:00, 272.32it/s, center_shift=0.000000, iteration=8, tol=0.000100][A


2
running k-means on cuda..



[running kmeans]: 0it [00:00, ?it/s][A
[running kmeans]: 0it [00:00, ?it/s, center_shift=494.594269, iteration=1, tol=0.000100][A
[running kmeans]: 1it [00:00, 132.94it/s, center_shift=10.320456, iteration=2, tol=0.000100][A
[running kmeans]: 2it [00:00, 171.00it/s, center_shift=0.721472, iteration=3, tol=0.000100] [A
[running kmeans]: 3it [00:00, 185.60it/s, center_shift=0.390723, iteration=4, tol=0.000100][A
[running kmeans]: 5it [00:00, 224.96it/s, center_shift=0.000000, iteration=5, tol=0.000100][A

2





running k-means on cuda..



[running kmeans]: 0it [00:00, ?it/s][A
[running kmeans]: 0it [00:00, ?it/s, center_shift=1737.062744, iteration=1, tol=0.000100][A
[running kmeans]: 1it [00:00, 102.13it/s, center_shift=34.940376, iteration=2, tol=0.000100][A
[running kmeans]: 2it [00:00, 135.10it/s, center_shift=6.081357, iteration=3, tol=0.000100] [A
[running kmeans]: 3it [00:00, 149.98it/s, center_shift=2.607036, iteration=4, tol=0.000100][A
[running kmeans]: 4it [00:00, 158.18it/s, center_shift=4.511014, iteration=5, tol=0.000100][A
[running kmeans]: 5it [00:00, 163.25it/s, center_shift=0.646058, iteration=6, tol=0.000100][A
[running kmeans]: 6it [00:00, 168.56it/s, center_shift=2.486021, iteration=7, tol=0.000100][A
[running kmeans]: 7it [00:00, 173.65it/s, center_shift=9.023144, iteration=8, tol=0.000100][A
[running kmeans]: 8it [00:00, 176.82it/s, center_shift=0.547890, iteration=9, tol=0.000100][A
[running kmeans]: 9it [00:00, 178.93it/s, center_shift=0.470930, iteration=10, tol=0.000100][A
[running

2





running k-means on cuda..



[running kmeans]: 0it [00:00, ?it/s][A
[running kmeans]: 0it [00:00, ?it/s, center_shift=5081.288574, iteration=1, tol=0.000100][A
[running kmeans]: 1it [00:00, 68.49it/s, center_shift=42.789051, iteration=2, tol=0.000100][A
[running kmeans]: 2it [00:00, 91.61it/s, center_shift=0.343911, iteration=3, tol=0.000100] [A
[running kmeans]: 3it [00:00, 102.72it/s, center_shift=2.425100, iteration=4, tol=0.000100][A
[running kmeans]: 5it [00:00, 128.80it/s, center_shift=0.000000, iteration=5, tol=0.000100][A

2



Processing Videos (Width): 100%|██████████| 10/10 [03:38<00:00, 21.83s/it]

Width expansion complete.





## 7. Module 3: Depth Expansion (Hierarchical Clustering)

Uses the relevance scores from the previous step to guide a hierarchical clustering process. High-relevance clusters are broken down into more granular sub-clusters to provide more detail to the VLM.

In [6]:
class DepthExpansion:
    def __init__(self, config: PipelineConfig):
        self.config = config

    def _hierarchical_clustering(self, features, cluster_ids, relevance_scores):
        clusters = {i: {} for i in range(max(cluster_ids) + 1)}

        for cid in set(cluster_ids):
            score = relevance_scores[cid] if cid < len(relevance_scores) else 1
            indices = [i for i, x in enumerate(cluster_ids) if x == cid]
            if len(indices) < 2 or score == 1:
                clusters[cid] = [indices]
                continue

            sub_features = features[indices]
            linked_sub = linkage(sub_features.cpu().numpy(), method='ward')
            sub_labels = fcluster(linked_sub, self.config.NUM_SUBCLUSTERS, criterion='maxclust') - 1
            
            if score == 2:
                clusters[cid] = [[indices[j] for j in np.where(sub_labels == i)[0]] for i in range(self.config.NUM_SUBCLUSTERS)]
                continue
            
            # Score == 3: Perform sub-subclustering
            sub_sub_clusters = []
            for sub_cid in range(self.config.NUM_SUBCLUSTERS):
                sub_indices = np.where(sub_labels == sub_cid)[0]
                if len(sub_indices) < 2:
                    sub_sub_clusters.append([indices[i] for i in sub_indices])
                    continue
                
                subsub_features = sub_features[sub_indices]
                linked_subsub = linkage(subsub_features.cpu().numpy(), method='ward')
                subsub_labels = fcluster(linked_subsub, self.config.NUM_SUB_SUBCLUSTERS, criterion='maxclust') - 1
                for subsub_cid in range(self.config.NUM_SUB_SUBCLUSTERS):
                    final_indices = sub_indices[np.where(subsub_labels == subsub_cid)[0]]
                    sub_sub_clusters.append([indices[i] for i in final_indices])
            clusters[cid] = sub_sub_clusters
        return clusters

    def _find_closest_points(self, features, clusters):
        final_indices = set()
        for primary_cluster in clusters.values():
            for sub_cluster in primary_cluster:
                if not sub_cluster: continue
                points = features[torch.tensor(sub_cluster, dtype=torch.long)]
                centroid = points.mean(dim=0)
                distances = torch.norm(points - centroid, dim=1)
                closest_idx = torch.argmin(distances).item()
                final_indices.add(sub_cluster[closest_idx])
        return sorted(list(final_indices))

    def run_for_video(self, video_id: str):
        width_results_path = self.config.OUTPUTS_DIR / f'width_expansion_{video_id}.json'
        features_path = self.config.FEATURES_DIR / f"{video_id}.pt"
        if not width_results_path.exists() or not features_path.exists():
            return

        width_data = load_json(width_results_path)['final_result']
        features = torch.load(features_path)

        clusters = self._hierarchical_clustering(
            features, width_data['cluster_assignments'], width_data['frame_relevance'])
        
        final_frames = self._find_closest_points(features, clusters)

        output_data = {
            "video_id": video_id,
            "original_representative_frames": width_data['representative_frames'],
            "final_representative_frames": final_frames
        }
        save_json(output_data, self.config.OUTPUTS_DIR / f'depth_expansion_{video_id}.json')
        return output_data

# Execute Depth Expansion
print("--- Starting Depth Expansion (Hierarchical Clustering) ---")
depth_expander = DepthExpansion(config)
for video_id in tqdm(video_ids, desc="Processing Videos (Depth)"):
    depth_expander.run_for_video(video_id)
print("Depth expansion complete.")

--- Starting Depth Expansion (Hierarchical Clustering) ---


Processing Videos (Depth): 100%|██████████| 10/10 [00:00<00:00, 118.44it/s]

Depth expansion complete.





In [7]:
import os
import json
import torch
import torch.nn.functional as F

# --- Config ---
features_dir = "/kaggle/working/extracted_features"
json_dir = "/kaggle/working/outputs"
output_dir = "/kaggle/working/filtered_jsons_thresholded"
os.makedirs(output_dir, exist_ok=True)

threshold = 0.8  # cosine similarity threshold

def load_json(path):
    with open(path, 'r') as f:
        return json.load(f)

# --- Loop over all .pt feature files ---
for feat_file in os.listdir(features_dir):
    if not feat_file.endswith(".pt"):
        continue

    # Extract video ID from filename
    video_id = os.path.splitext(feat_file)[0]  # e.g. "0074f737-11cb-497d-8d07-77c3a8127391"

    # Load features
    feat_path = os.path.join(features_dir, feat_file)
    frame_features = torch.load(feat_path)  # shape: [num_frames, 1024]

    # Load corresponding JSON
    json_filename = f"depth_expansion_{video_id}.json"
    json_path = os.path.join(json_dir, json_filename)
    if not os.path.exists(json_path):
        print(f"⚠️ JSON not found for {video_id}, skipping...")
        continue

    data = load_json(json_path)
    indices = data.get("final_representative_frames", [])

    # --- Apply threshold filtering ---
    filtered_indices = []
    if indices:
        filtered_indices.append(indices[0])
        ref_idx = indices[0]

        for next_idx in indices[1:]:
            sim = F.cosine_similarity(
                frame_features[ref_idx].unsqueeze(0),
                frame_features[next_idx].unsqueeze(0)
            ).item()

            if sim < threshold:
                filtered_indices.append(next_idx)

            ref_idx = next_idx  # update reference

    # --- Save new JSON ---
    out_data = {"final_representative_frames": filtered_indices}
    out_path = os.path.join(output_dir, f"{video_id}_thresholded.json")
    with open(out_path, "w") as f:
        json.dump(out_data, f, indent=2)

    print(f"✅ Processed {video_id} → saved {out_path}")


✅ Processed 00f93e1e-cf4e-4835-88b4-4ad68216e86f → saved /kaggle/working/filtered_jsons_thresholded/00f93e1e-cf4e-4835-88b4-4ad68216e86f_thresholded.json
✅ Processed 0074f737-11cb-497d-8d07-77c3a8127391 → saved /kaggle/working/filtered_jsons_thresholded/0074f737-11cb-497d-8d07-77c3a8127391_thresholded.json
✅ Processed 01a144a5-24d2-4a5a-af01-1f318d674bed → saved /kaggle/working/filtered_jsons_thresholded/01a144a5-24d2-4a5a-af01-1f318d674bed_thresholded.json
✅ Processed 02925d7a-a5db-4127-8c31-b232e78b684d → saved /kaggle/working/filtered_jsons_thresholded/02925d7a-a5db-4127-8c31-b232e78b684d_thresholded.json
✅ Processed 03657401-d4a4-40d0-9b03-d7e093ef93d1 → saved /kaggle/working/filtered_jsons_thresholded/03657401-d4a4-40d0-9b03-d7e093ef93d1_thresholded.json
✅ Processed 00b9a0de-c59e-49cb-a127-6081e2fb8c8e → saved /kaggle/working/filtered_jsons_thresholded/00b9a0de-c59e-49cb-a127-6081e2fb8c8e_thresholded.json
✅ Processed 026a2f15-c454-4c28-80e0-24c85d7f4ecf → saved /kaggle/working/fil

In [8]:
f = load_json('/kaggle/working/filtered_jsons_thresholded/0074f737-11cb-497d-8d07-77c3a8127391_thresholded.json')
f

{'final_representative_frames': [3,
  15,
  20,
  32,
  36,
  40,
  44,
  63,
  65,
  70,
  71,
  73,
  100,
  105,
  122,
  154,
  165,
  177]}

## 8. Module 4: VLM Question Answering

Loads a powerful Vision-Language Model. It processes the question along with the refined set of keyframes from the depth expansion step to generate a final answer.

In [9]:
import re
from collections import Counter

def most_frequent_choice(text, letter_to_num=None):
    if letter_to_num is None:
        letter_to_num = {"A": 1, "B": 2, "C": 3, "D": 4, "E": 5}

    # Extract lines and letters
    lines = [ln.strip() for ln in text.splitlines() if ln.strip()]
    letters = []
    for ln in lines:
        m = re.match(r'^([A-Z])\.', ln, flags=re.I)
        if m:
            letters.append(m.group(1).upper())

    # Find most frequent letter
    if not letters:
        return None, None

    most_letter, freq = Counter(letters).most_common(1)[0]
    most_num = letter_to_num.get(most_letter)

    return most_num

In [10]:
#load answer
subest_json_path = '/kaggle/working/videotree/subset_answers.json'
answer = load_json(subest_json_path)
answer['0074f737-11cb-497d-8d07-77c3a8127391']

3

In [11]:
class VLMInference:
    def __init__(self, config: PipelineConfig):
        self.config = config
        self.model = None
        self.processor = None

    def _load_model(self):
        if self.model is not None and self.processor is not None:
            return
        
        self.processor = AutoProcessor.from_pretrained(self.config.VLM_MODEL_ID, trust_remote_code=True)
        self.model = AutoModelForVision2Seq.from_pretrained(
            self.config.VLM_MODEL_ID,
            trust_remote_code=True,
            low_cpu_mem_usage=True,
            device_map="auto"
        ).eval()
        print("VLM model loaded.")

    def clear_memory(self):
        del self.model
        gc.collect()
        torch.cuda.empty_cache()
        

    def _extract_frames_at_1fps(self, video_path: Path) -> dict[int, Image.Image]:
        cap = cv2.VideoCapture(str(video_path))
        fps = cap.get(cv2.CAP_PROP_FPS) or 30
        
        frames = {}
        frame_idx, saved_idx = 0, 0
        while True:
            ret, frame = cap.read()
            if not ret: break
            if frame_idx % int(fps) == 0:
                frames[saved_idx] = Image.fromarray(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
                saved_idx += 1
            frame_idx += 1
        cap.release()
        return frames

    def run_for_video(self, video_id: str, question: str):
        self._load_model()
        # depth_results_path = self.config.OUTPUTS_DIR / f'depth_expansion_{video_id}.json'
        t_out = "/kaggle/working/filtered_jsons_thresholded"
        depth_results_path = os.path.join(t_out, f"{video_id}_thresholded.json")
        video_path = self.config.VIDEO_DIR / f"{video_id}.mp4"

        # if not depth_results_path.exists():
        #     print(f"[WARN] Depth expansion results not found for {video_id}")
        #     return None
            
        depth_data = load_json(depth_results_path)
        keyframe_indices = depth_data['final_representative_frames']
        
        # The indices from feature extraction correspond to frames extracted at ~1 FPS
        all_frames = self._extract_frames_at_1fps(video_path)
        keyframe_images = [all_frames[i] for i in keyframe_indices if i in all_frames]
        
        if not keyframe_images:
            print(f"[WARN] No keyframe images found for {video_id}")
            return None
            
        # Batch processing for memory efficiency
        answers = []
        for i in range(0, len(keyframe_images), self.config.VLM_BATCH_SIZE):
            batch_images = keyframe_images[i:i + self.config.VLM_BATCH_SIZE]
            content = [{"type": "image", "image": img} for img in batch_images]
            content.append({"type": "text", "text": question})
            messages = [{"role": "user", "content": content}]
            
            try:
                inputs = self.processor.apply_chat_template(
                    messages, add_generation_prompt=True, tokenize=True,
                    return_dict=True, return_tensors="pt"
                ).to(self.model.device)

                with torch.no_grad():
                    outputs = self.model.generate(**inputs, max_new_tokens=50)
                
                answer = self.processor.decode(outputs[0][inputs["input_ids"].shape[-1]:], skip_special_tokens=True)
                answers.append(answer.strip())

            except torch.cuda.OutOfMemoryError:
                print(f"[WARN] CUDA OOM on video {video_id}, batch {i}. Skipping batch.")
            finally:
                torch.cuda.empty_cache()
                
        return "\n".join(answers)


# Execute VLM Inference and Display Final Results
print("--- Starting VLM Question Answering ---")
vlm = VLMInference(config)
correct_count = 0
total_count = 0

for video_id in video_ids:
    print(f"\n{'='*20} Final Result for Video: {video_id} {'='*20}")
    question_text = questions.get(video_id, "No question available.")
    final_answer = vlm.run_for_video(video_id, question_text)
    choices = most_frequent_choice(final_answer)
    print("Most common letter(s):", choices)

    ground_truth = answer[video_id] + 1

    # Count total and correct answers
    total_count += 1
    if ground_truth ==choices:
        correct_count += 1

    print(f"\n❓ QUESTION:\n{question_text}")
    print(f"\n🧠 MODEL ANSWER:\n{final_answer if final_answer else 'Could not generate an answer.'}")
    print(f"\n✅ Ground Truth Answer:\n{ground_truth}")

# Final report
accuracy = (correct_count / total_count * 100) if total_count > 0 else 0
print("\n--- Pipeline Finished ---")
print(f"✅ Correct Answers: {correct_count}/{total_count} ({accuracy:.2f}% accuracy)")

--- Starting VLM Question Answering ---



The image processor of type `Qwen2VLImageProcessor` is now loaded as a fast processor by default, even if the model checkpoint was saved with a slow processor. This is a breaking change and may produce slightly different outputs. To continue using the slow processor, instantiate this class with `use_fast=False`. Note that this behavior will be extended to all models in a future release.
You have video processor config saved in `preprocessor.json` file which is deprecated. Video processor configs should be saved in their own `video_preprocessor.json` file. You can rename the file or load and save the processor back which renames it automatically. Loading from `preprocessor.json` will be removed in v5.0.


VLM model loaded.
Most common letter(s): 5

❓ QUESTION:
Question:
What was the primary purpose of the cup of water in this video, and how did it contribute to the overall painting process?

Options:
A. To provide a source of water for the paintbrush.
B. To provide a place to store the paintbrush.
C. To provide a place to dispose of the paintbrush.
D. To provide a place to rest the paintbrush.
E. To clean the paintbrush.

Please choose the most appropriate answer (A–E).

🧠 MODEL ANSWER:
E. To clean the paintbrush.

✅ Ground Truth Answer:
5

Most common letter(s): 4

❓ QUESTION:
Question:
Taking into account all the actions performed by c, what can you deduce about the primary objective and focus within the video content?

Options:
A. C is cooking.
B. C is doing laundry.
C. C is cleaning the kitchen.
D. C is cleaning dishes.
E. C is cleaning the bathroom.

Please choose the most appropriate answer (A–E).

🧠 MODEL ANSWER:
D. C is cleaning dishes.
D. C is cleaning dishes.

✅ Ground Truth A