In [1]:
!pip install webvtt-py spacy sentence-transformers qdrant-client bertopic gradio ffmpeg-python
!python -m spacy download en_core_web_sm

Collecting webvtt-py
  Downloading webvtt_py-0.5.1-py3-none-any.whl.metadata (3.4 kB)
Collecting spacy
  Downloading spacy-3.8.5-cp39-cp39-macosx_10_9_x86_64.whl.metadata (27 kB)
Collecting sentence-transformers
  Using cached sentence_transformers-4.1.0-py3-none-any.whl.metadata (13 kB)
Collecting qdrant-client
  Using cached qdrant_client-1.14.2-py3-none-any.whl.metadata (10 kB)
Collecting bertopic
  Downloading bertopic-0.17.0-py3-none-any.whl.metadata (23 kB)
Collecting gradio
  Using cached gradio-4.44.1-py3-none-any.whl.metadata (15 kB)
Collecting ffmpeg-python
  Using cached ffmpeg_python-0.2.0-py3-none-any.whl.metadata (1.7 kB)
Collecting spacy-legacy<3.1.0,>=3.0.11 (from spacy)
  Downloading spacy_legacy-3.0.12-py2.py3-none-any.whl.metadata (2.8 kB)
Collecting spacy-loggers<2.0.0,>=1.0.0 (from spacy)
  Downloading spacy_loggers-1.0.5-py3-none-any.whl.metadata (23 kB)
Collecting murmurhash<1.1.0,>=0.28.0 (from spacy)
  Downloading murmurhash-1.0.12-cp39-cp39-macosx_10_9_x86_64.

In [2]:
import os, glob
from webvtt import WebVTT

In [3]:
def time_to_seconds(timestamp):
    h, m, s = timestamp.split(':')
    return int(h) * 3600 + int(m) * 60 + float(s)

captions_data = []  # List of dicts: {'video_id', 'start', 'end', 'text'}
for video_dir in sorted(os.listdir("./data/videos")):
    video_path = os.path.join("./data/videos", video_dir)
    if not os.path.isdir(video_path):
        continue
    # Find English VTT file
    vtt_files = glob.glob(os.path.join(video_path, "*.en.vtt"))
    if not vtt_files:
        continue
    vtt_path = vtt_files[0]
    # Parse captions
    for caption in WebVTT().read(vtt_path):
        text = caption.text.strip().replace("\n", " ")
        if not text:
            continue
        start = time_to_seconds(caption.start)
        end = time_to_seconds(caption.end)
        captions_data.append({"video_id": video_dir, "start": start, "end": end, "text": text})

# Sort by video and time
captions_data.sort(key=lambda x: (x['video_id'], x['start']))

# Deduplicate consecutive segments
filtered_caps = []
prev_text = None
prev_vid = None
for cap in captions_data:
    if cap["video_id"] != prev_vid:
        prev_text = None  # reset at new video
        prev_vid = cap["video_id"]
    if cap["text"] != prev_text:
        filtered_caps.append(cap)
    prev_text = cap["text"]
captions_data = filtered_caps

len(captions_data), "caption segments loaded."


(6970, 'caption segments loaded.')

In [4]:
import spacy
nlp = spacy.blank("en")
nlp.add_pipe("sentencizer")

from itertools import groupby

sentences = []  # List of dicts: {'video_id', 'start', 'end', 'sentence'}
for video_id, group in groupby(captions_data, key=lambda x: x['video_id']):
    group = list(group)
    combined_text = " ".join([seg["text"] for seg in group])
    doc = nlp(combined_text)
    # Compute cumulative character lengths for segment boundaries
    cum_lengths = [0]
    for seg in group:
        cum_lengths.append(cum_lengths[-1] + len(seg["text"]) + 1)
    for sent in doc.sents:
        sent_text = sent.text.strip()
        if not sent_text:
            continue
        start_char = sent.start_char
        end_char = sent.end_char
        # Find segment indices containing sentence start/end
        start_idx = max(i for i, length in enumerate(cum_lengths) if length <= start_char)
        end_idx = max(i for i, length in enumerate(cum_lengths) if length < end_char)
        start_time = group[start_idx]["start"]
        end_time = group[end_idx]["end"]
        sentences.append({
            "video_id": video_id,
            "start": start_time,
            "end": end_time,
            "sentence": sent_text
        })

print(f"Created {len(sentences)} merged sentences from captions.")



A module that was compiled using NumPy 1.x cannot be run in
NumPy 2.0.2 as it may crash. To support both 1.x and 2.x
versions of NumPy, modules must be compiled with NumPy 2.0.
Some module may need to rebuild instead e.g. with 'pybind11>=2.12'.

If you are a user of the module, the easiest solution will be to
downgrade to 'numpy<2' or try to upgrade the affected module.
We expect that some modules will need time to support NumPy 2.

Traceback (most recent call last):  File "/opt/miniconda3/lib/python3.9/runpy.py", line 197, in _run_module_as_main
    return _run_code(code, main_globals, None,
  File "/opt/miniconda3/lib/python3.9/runpy.py", line 87, in _run_code
    exec(code, run_globals)
  File "/Users/chenyuzhong/.local/lib/python3.9/site-packages/ipykernel_launcher.py", line 18, in <module>
    app.launch_new_instance()
  File "/Users/chenyuzhong/.local/lib/python3.9/site-packages/traitlets/config/application.py", line 1075, in launch_instance
    app.start()
  File "/Users/chenyu

Created 8 merged sentences from captions.


In [None]:
from itertools import groupby
from sentence_transformers import SentenceTransformer, InputExample, losses
from torch.utils.data import DataLoader
import math

def windowize_sentence(sentence, window_size=50, stride=25):
    tokens = sentence.split()  # or use spaCy: [t.text for t in nlp(sentence)]
    spans = []
    for start in range(0, max(1, len(tokens) - window_size + 1), stride):
        span = tokens[start:start + window_size]
        spans.append(" ".join(span))
    # If the sentence is shorter than window_size, keep it as one span
    if not spans:
        spans = [" ".join(tokens)]
    return spans
# 1. Load base model
model = SentenceTransformer('all-MiniLM-L6-v2')

# 2. Build training pairs: consecutive sentences in each video
train_examples = []
for vid, group in groupby(sentences, key=lambda x: x['video_id']):
    group = list(group)
    for i in range(len(group) - 1):
        s1 = group[i]['sentence'].strip()
        s2 = group[i+1]['sentence'].strip()
        # Skip empty or too-long examples
        if not s1 or not s2 or len(s1) > 500 or len(s2) > 500:
            continue
        train_examples.append(InputExample(texts=[s1, s2]))

# 3. Shuffle and batch
import random
random.shuffle(train_examples)
train_dataloader = DataLoader(train_examples, shuffle=True, batch_size=16)

# 4. Define loss and fine-tune
train_loss = losses.MultipleNegativesRankingLoss(model=model)
model.fit(
    train_objectives=[(train_dataloader, train_loss)],
    epochs=1,
    warmup_steps=100
)

# 5. Save the fine-tuned model
model.save("fine_tuned_minilm")


ValueError: num_samples should be a positive integer value, but got num_samples=0

In [17]:
sent_texts = [s["sentence"] for s in sentences]
embeddings = model.encode(sent_texts, convert_to_numpy=True)
print("Encoded sentences to vectors of shape:", embeddings.shape)



RuntimeError: Numpy is not available

In [None]:
from bertopic import BERTopic

topic_model = BERTopic()
topics, probs = topic_model.fit_transform(sent_texts)
print(topic_model.get_topic_info().head())


In [None]:
from qdrant_client import QdrantClient
from qdrant_client.http.models import PointStruct

client = QdrantClient(host="localhost", port=6333)
collection_name = "video_captions"
vector_dim = embeddings.shape[1]
client.recreate_collection(collection_name=collection_name, vector_size=vector_dim, distance="Cosine")

# Prepare points for upserting
points = []
for i, sent in enumerate(sentences):
    payload = {
        "video_id": sent["video_id"],
        "start": sent["start"],
        "end": sent["end"],
        "sentence": sent["sentence"]
    }
    points.append(PointStruct(id=i, vector=embeddings[i].tolist(), payload=payload))

client.upsert(collection_name=collection_name, points=points)
print(f"Uploaded {len(points)} points to Qdrant collection '{collection_name}'.")


In [None]:
def answer_query(query, top_k=1):
    q_vec = model.encode([query], convert_to_numpy=True)[0]
    results = client.search(collection_name=collection_name, query_vector=q_vec, limit=top_k)
    top = results[0]  # best match
    data = top.payload
    return data["video_id"], data["start"], data["end"], data["sentence"]

# Example query
video_id, start, end, sentence = answer_query("Explain how convolutional layers work")
print(f"Top answer: \"{sentence}\" (Video {video_id}, {start:.1f}-{end:.1f}s)")


In [None]:
import ffmpeg

def extract_clip(video_id, start, end, output_path="clip.mp4"):
    # Find the video file for this ID
    mp4_files = glob.glob(os.path.join("./data/videos", video_id, "*.mp4"))
    if not mp4_files:
        raise FileNotFoundError(f"No video found for ID {video_id}")
    video_path = mp4_files[0]
    (
        ffmpeg
        .input(video_path, ss=start, to=end)
        .output(output_path, codec="copy")
        .run(overwrite_output=True)
    )
    return output_path

# Example: extract the clip for the top result
clip_file = extract_clip(video_id, start, end, output_path=f"clip_{video_id}_{int(start)}_{int(end)}.mp4")
print(f"Saved clip to {clip_file}")


In [None]:
%load_ext gradio

%%blocks
import gradio as gr

def qa_pipeline(question):
    vid, s, e, ans = answer_query(question)
    clip = extract_clip(vid, s, e, output_path=f"clip_{vid}_{int(s)}_{int(e)}.mp4")
    return clip, ans

with gr.Blocks() as demo:
    gr.Markdown("## Chat with Your Video Library")
    inp = gr.Textbox(label="Enter your question")
    vid_out = gr.Video(label="Relevant clip")
    text_out = gr.Textbox(label="Answer sentence")
    inp.submit(qa_pipeline, [inp], [vid_out, text_out])

demo
