In [2]:
!pip install webvtt-py spacy sentence-transformers qdrant-client bertopic gradio ffmpeg-python
!python -m spacy download en_core_web_sm

Collecting numpy>=1.19.0 (from spacy)
  Obtaining dependency information for numpy>=1.19.0 from https://files.pythonhosted.org/packages/ef/4e/3d9e6d16237c2aa5485695f0626cbba82f6481efca2e9132368dea3b885e/numpy-2.2.5-cp310-cp310-macosx_10_9_x86_64.whl.metadata
  Using cached numpy-2.2.5-cp310-cp310-macosx_10_9_x86_64.whl.metadata (62 kB)
Using cached numpy-2.2.5-cp310-cp310-macosx_10_9_x86_64.whl (21.3 MB)
Installing collected packages: numpy
  Attempting uninstall: numpy
    Found existing installation: numpy 1.23.5
    Uninstalling numpy-1.23.5:
      Successfully uninstalled numpy-1.23.5
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
nncf 2.15.0 requires numpy<2.2.0,>=1.24.0, but you have numpy 2.2.5 which is incompatible.
vis-nav-game 1.2.1 requires cython~=3.0.2, but you have cython 3.0.0 which is incompatible.
vis-nav-game 1.2.1 requires numpy~=1.25.2

In [3]:
import os, glob
from webvtt import WebVTT

In [4]:
def time_to_seconds(timestamp):
    """Convert a WebVTT timestamp string to a number of seconds.

    Accepts the full ``HH:MM:SS.mmm`` form as well as the short ``MM:SS.mmm``
    form, which WebVTT permits when the hours field is omitted.

    Args:
        timestamp: Timestamp text such as ``"00:01:02.500"`` or ``"01:02.500"``.

    Returns:
        The offset in seconds as a float.

    Raises:
        ValueError: If the text does not consist of two or three
            colon-separated numeric fields.
    """
    parts = timestamp.split(':')
    if len(parts) == 2:
        # Short form: treat the missing hours field as zero.
        parts = ['0'] + parts
    if len(parts) != 3:
        raise ValueError(f"Unrecognized timestamp format: {timestamp!r}")
    h, m, s = parts
    return int(h) * 3600 + int(m) * 60 + float(s)

# Collect every non-empty cue from one English .vtt track per video folder.
captions_data = []  # one dict per cue: {'video_id', 'start', 'end', 'text'}
videos_root = "./data/videos"
for video_dir in sorted(os.listdir(videos_root)):
    video_path = os.path.join(videos_root, video_dir)
    # Plain files sitting next to the video folders are ignored.
    if os.path.isdir(video_path):
        english_tracks = glob.glob(os.path.join(video_path, "*.en.vtt"))
    else:
        english_tracks = []
    if not english_tracks:
        continue
    # Only the first matching track is read.
    for cue in WebVTT().read(english_tracks[0]):
        # Multi-line cues are flattened onto a single line.
        cue_text = cue.text.strip().replace("\n", " ")
        if cue_text:
            captions_data.append({
                "video_id": video_dir,
                "start": time_to_seconds(cue.start),
                "end": time_to_seconds(cue.end),
                "text": cue_text,
            })

# Order cues by video first, then by start time within each video
captions_data = sorted(captions_data, key=lambda cue: (cue['video_id'], cue['start']))

# Drop a cue when it repeats the text of the cue immediately before it in the
# same video; the first cue of every video is always kept.
filtered_caps = []
previous = None  # cue seen in the prior iteration, whether it was kept or not
for current in captions_data:
    is_repeat = (
        previous is not None
        and previous["video_id"] == current["video_id"]
        and previous["text"] == current["text"]
    )
    if not is_repeat:
        filtered_caps.append(current)
    previous = current
captions_data = filtered_caps

len(captions_data), "caption segments loaded."


(6970, 'caption segments loaded.')

In [5]:
import spacy
from bisect import bisect_left, bisect_right
from itertools import groupby

# Blank English pipeline with only the sentencizer component added.
nlp = spacy.blank("en")
nlp.add_pipe("sentencizer")

# NOTE(review): the recorded run of this cell produced 8 sentences from 6970 cues,
# presumably because the captions carry very little sentence-ending punctuation
# for the sentencizer to split on -- confirm this is the intended granularity.
sentences = []  # List of dicts: {'video_id', 'start', 'end', 'sentence'}
for video_id, group in groupby(captions_data, key=lambda x: x['video_id']):
    group = list(group)
    combined_text = " ".join(seg["text"] for seg in group)
    doc = nlp(combined_text)
    # cum_lengths[i] is the character offset in combined_text at which segment i
    # starts; the +1 accounts for the single joining space after each segment.
    cum_lengths = [0]
    for seg in group:
        cum_lengths.append(cum_lengths[-1] + len(seg["text"]) + 1)
    for sent in doc.sents:
        sent_text = sent.text.strip()
        if not sent_text:
            continue
        # cum_lengths is strictly increasing, so binary search finds the same
        # indices as a full scan without a pass over every offset per sentence:
        #   start_idx = last i with cum_lengths[i] <= start_char
        #   end_idx   = last i with cum_lengths[i] <  end_char
        start_idx = bisect_right(cum_lengths, sent.start_char) - 1
        end_idx = bisect_left(cum_lengths, sent.end_char) - 1
        sentences.append({
            "video_id": video_id,
            "start": group[start_idx]["start"],
            "end": group[end_idx]["end"],
            "sentence": sent_text
        })

print(f"Created {len(sentences)} merged sentences from captions.")


Created 8 merged sentences from captions.


In [23]:
! pip install -U 'accelerate==0.21.0'
! pip install -U transformers
from itertools import groupby
from sentence_transformers import SentenceTransformer, InputExample, losses
from torch.utils.data import DataLoader
import spacy
import random

nlp = spacy.blank("en")
nlp.add_pipe("sentencizer")

def windowize_sentence(sentence, window_size=50, stride=50):
    """Split a sentence into whitespace-token windows without losing the tail.

    Windows start every ``stride`` tokens and hold up to ``window_size``
    tokens. The last window may be shorter, so every token of the input
    appears in at least one window whenever ``stride <= window_size``.

    Args:
        sentence: Text to split; tokens are produced by ``str.split()``.
        window_size: Maximum number of tokens per window.
        stride: Number of tokens between consecutive window starts.

    Returns:
        A list of window strings. A sentence with no tokens, or a
        non-positive-length iteration, yields a single-element list holding
        the re-joined sentence (``[""]`` for empty input).
    """
    tokens = sentence.split()
    spans = []
    for start in range(0, len(tokens), stride):
        spans.append(" ".join(tokens[start:start + window_size]))
        # Stop once a window has reached the end of the sentence; later
        # starts would only repeat tokens that are already covered.
        if start + window_size >= len(tokens):
            break
    if not spans:
        spans = [" ".join(tokens)]
    return spans

# Break every merged sentence of every video into chunks of at most 50 words.
# All entries of a video are processed, not only the first one, so no caption
# text is dropped when a video has more than one merged sentence.
processed = []
for vid, group in groupby(sentences, key=lambda x: x['video_id']):
    for entry in group:
        for sent in nlp(entry['sentence']).sents:
            # further split any spaCy sentence over 50 words
            for chunk in windowize_sentence(sent.text, window_size=50, stride=50):
                processed.append({
                    "video_id": vid,
                    "sentence": chunk.strip()
                })

# Build training pairs from each chunk and the chunk that follows it in the
# same video; pairs with an empty side are skipped.
train_examples = []
for vid, group in groupby(processed, key=lambda x: x['video_id']):
    grp = list(group)
    for current, following in zip(grp, grp[1:]):
        s1, s2 = current['sentence'], following['sentence']
        if not s1 or not s2:
            continue
        train_examples.append(InputExample(texts=[s1, s2]))

# Shuffle the (chunk, next-chunk) pairs and batch them for fine-tuning.
# NOTE(review): no random seed is set in the visible cells, so the order differs between runs.
random.shuffle(train_examples)
train_dataloader = DataLoader(train_examples, shuffle=True, batch_size=16)

# Load the base sentence-embedding checkpoint and attach the ranking loss to it.
model = SentenceTransformer('all-MiniLM-L6-v2')
train_loss = losses.MultipleNegativesRankingLoss(model=model)
# NOTE(review): the fine-tuning call below is commented out, so `model` remains the
# checkpoint loaded above and train_dataloader / train_loss are not consumed here.
# model.fit(
#     train_objectives=[(train_dataloader, train_loss)],
#     epochs=1,
#     warmup_steps=100
# )

# Saving the fine-tuned model is disabled together with the fit call above.
# model.save("fine_tuned_minilm")




In [24]:
# Pull out the sentence text in the same order as `sentences`, so that row i of
# the embedding matrix corresponds to sentences[i].
sent_texts = [record["sentence"] for record in sentences]

# Encode all sentences in a single call, asking for a NumPy array back.
embeddings = model.encode(
    sent_texts,
    convert_to_numpy=True,
)
print("Encoded sentences to vectors of shape:", embeddings.shape)



Encoded sentences to vectors of shape: (8, 384)


In [25]:
from bertopic import BERTopic

# Reuse the encoder and the sentence vectors computed earlier instead of letting
# BERTopic load its own embedding model and re-encode every sentence; this also
# keeps the topics consistent with the vectors that get indexed for search.
topic_model = BERTopic(embedding_model=model)
topics, probs = topic_model.fit_transform(sent_texts, embeddings=embeddings)
print(topic_model.get_topic_info().head())


RuntimeError: Failed to import transformers.pipelines because of the following error (look up to see its traceback):
cannot import name 'is_torch_hpu_available' from 'transformers.utils' (/Users/chenyuzhong/miniconda3/envs/game/lib/python3.10/site-packages/transformers/utils/__init__.py)

In [None]:
from qdrant_client import QdrantClient
from qdrant_client.http.models import Distance, PointStruct, VectorParams

client = QdrantClient(host="localhost", port=6333)
collection_name = "video_captions"
vector_dim = embeddings.shape[1]

# Start from an empty collection on every run. Current qdrant-client versions
# take the vector layout through `vectors_config=VectorParams(...)`, not the
# older `vector_size=` / `distance=` keyword arguments.
client.delete_collection(collection_name=collection_name)
client.create_collection(
    collection_name=collection_name,
    vectors_config=VectorParams(size=vector_dim, distance=Distance.COSINE),
)

# One point per merged sentence: the id is the sentence's position in
# `sentences`, and the payload carries what is needed to locate the clip later.
points = [
    PointStruct(
        id=i,
        vector=vector.tolist(),
        payload={
            "video_id": sent["video_id"],
            "start": sent["start"],
            "end": sent["end"],
            "sentence": sent["sentence"]
        },
    )
    for i, (sent, vector) in enumerate(zip(sentences, embeddings))
]

client.upsert(collection_name=collection_name, points=points)
print(f"Uploaded {len(points)} points to Qdrant collection '{collection_name}'.")


In [None]:
def answer_query(query, top_k=1):
    """Return the indexed sentence that best matches a free-text query.

    Encodes ``query`` with the notebook-level ``model`` and searches the
    notebook-level ``client`` / ``collection_name`` for the nearest vectors.

    Args:
        query: Question or search text.
        top_k: Number of hits to request from the search; only the first
            hit is returned.

    Returns:
        A tuple ``(video_id, start, end, sentence)`` taken from the payload
        of the first hit.

    Raises:
        IndexError: If the search returns no hits (for example when the
            collection is empty).
    """
    q_vec = model.encode([query], convert_to_numpy=True)[0]
    results = client.search(
        collection_name=collection_name,
        # Send a plain list of floats rather than a NumPy array.
        query_vector=q_vec.tolist(),
        limit=top_k,
    )
    if not results:
        # Same exception type as indexing an empty list, but with a message
        # that says what actually went wrong.
        raise IndexError(
            f"No results found in collection '{collection_name}' for query: {query!r}"
        )
    data = results[0].payload  # best match
    return data["video_id"], data["start"], data["end"], data["sentence"]

# Run one sample question through the retrieval function and show the best hit
example_question = "Explain how convolutional layers work"
video_id, start, end, sentence = answer_query(example_question)
print(f"Top answer: \"{sentence}\" (Video {video_id}, {start:.1f}-{end:.1f}s)")


In [None]:
import ffmpeg

def extract_clip(video_id, start, end, output_path="clip.mp4"):
    """Write the [start, end] portion of a video to ``output_path`` via ffmpeg.

    Args:
        video_id: Name of the folder under ``./data/videos`` holding the video.
        start: Value passed to ffmpeg as the input ``ss`` option.
        end: Value passed to ffmpeg as the input ``to`` option.
        output_path: Destination file; an existing file is overwritten.

    Returns:
        ``output_path``, unchanged.

    Raises:
        FileNotFoundError: If the folder contains no ``*.mp4`` file.
    """
    candidates = glob.glob(os.path.join("./data/videos", video_id, "*.mp4"))
    if len(candidates) == 0:
        raise FileNotFoundError(f"No video found for ID {video_id}")
    # The first match is used as the source; codec="copy" is passed to the output.
    source_stream = ffmpeg.input(candidates[0], ss=start, to=end)
    clip_stream = source_stream.output(output_path, codec="copy")
    clip_stream.run(overwrite_output=True)
    return output_path

# Cut the clip for the best search hit; the file name encodes the video id and
# the whole-second start/end offsets.
clip_name = f"clip_{video_id}_{int(start)}_{int(end)}.mp4"
clip_file = extract_clip(video_id, start, end, output_path=clip_name)
print(f"Saved clip to {clip_file}")


In [None]:
# Plain Python is used here instead of the `%%blocks` cell magic: a cell magic is
# only valid on the first line of a cell, and `%%blocks` already wraps the cell
# in its own Blocks context, which would nest the explicit one below.
import gradio as gr

def qa_pipeline(question):
    """Answer a question with a video clip and the matching caption sentence.

    Args:
        question: Free-text question typed by the user.

    Returns:
        A tuple ``(clip_path, sentence)``: the path of the extracted clip
        and the text of the best-matching sentence.
    """
    vid, s, e, ans = answer_query(question)
    clip = extract_clip(vid, s, e, output_path=f"clip_{vid}_{int(s)}_{int(e)}.mp4")
    return clip, ans

with gr.Blocks() as demo:
    gr.Markdown("## Chat with Your Video Library")
    inp = gr.Textbox(label="Enter your question")
    vid_out = gr.Video(label="Relevant clip")
    text_out = gr.Textbox(label="Answer sentence")
    # Pressing Enter in the textbox runs the pipeline and fills both outputs.
    inp.submit(qa_pipeline, [inp], [vid_out, text_out])

# Start the app; inside a notebook the interface is rendered in the cell output.
demo.launch()
