In [1]:
# !pip install webvtt-py spacy sentence-transformers qdrant-client bertopic gradio ffmpeg-python
# !python -m spacy download en_core_web_sm

In [2]:
import os, glob
from webvtt import WebVTT

In [3]:
def time_to_seconds(timestamp):
    h, m, s = timestamp.split(':')
    return int(h) * 3600 + int(m) * 60 + float(s)

captions_data = []  # List of dicts: {'video_id', 'start', 'end', 'text'}
for video_dir in sorted(os.listdir("./data/videos")):
    video_path = os.path.join("./data/videos", video_dir)
    if not os.path.isdir(video_path):
        continue
    # Find English VTT file
    vtt_files = glob.glob(os.path.join(video_path, "*.en.vtt"))
    if not vtt_files:
        continue
    vtt_path = vtt_files[0]
    # Parse captions
    for caption in WebVTT().read(vtt_path):
        text = caption.text.strip().replace("\n", " ")
        if not text:
            continue
        start = time_to_seconds(caption.start)
        end = time_to_seconds(caption.end)
        captions_data.append({"video_id": video_dir, "start": start, "end": end, "text": text})

# Sort by video and time
captions_data.sort(key=lambda x: (x['video_id'], x['start']))

# Deduplicate consecutive segments
filtered_caps = []
prev_text = None
prev_vid = None
for cap in captions_data:
    if cap["video_id"] != prev_vid:
        prev_text = None  # reset at new video
        prev_vid = cap["video_id"]
    if cap["text"] != prev_text:
        filtered_caps.append(cap)
    prev_text = cap["text"]
captions_data = filtered_caps

len(captions_data), "caption segments loaded."


(6970, 'caption segments loaded.')

In [4]:
# # from google.colab import drive
# # drive.mount('/content/drive')
# !cp -r /content/drive/MyDrive/data /content

In [5]:
import spacy
nlp = spacy.blank("en")
nlp.add_pipe("sentencizer")

from itertools import groupby

sentences = []  # List of dicts: {'video_id', 'start', 'end', 'sentence'}
for video_id, group in groupby(captions_data, key=lambda x: x['video_id']):
    group = list(group)
    combined_text = " ".join([seg["text"] for seg in group])
    doc = nlp(combined_text)
    # Compute cumulative character lengths for segment boundaries
    cum_lengths = [0]
    for seg in group:
        cum_lengths.append(cum_lengths[-1] + len(seg["text"]) + 1)
    for sent in doc.sents:
        sent_text = sent.text.strip()
        if not sent_text:
            continue
        start_char = sent.start_char
        end_char = sent.end_char
        # Find segment indices containing sentence start/end
        start_idx = max(i for i, length in enumerate(cum_lengths) if length <= start_char)
        end_idx = max(i for i, length in enumerate(cum_lengths) if length < end_char)
        start_time = group[start_idx]["start"]
        end_time = group[end_idx]["end"]
        sentences.append({
            "video_id": video_id,
            "start": start_time,
            "end": end_time,
            "sentence": sent_text
        })

print(f"Created {len(sentences)} merged sentences from captions.")


Created 8 merged sentences from captions.


In [10]:
# !pip install accelerate>=0.21.0
# !pip install -U transformers
# !pip install peft==0.3.0
# !pip install datasets
# !pip install "sentence-transformers[train]"
!pip install accelerate --upgrade
!pip install peft --upgrade
!pip install transformers --upgrade
from itertools import groupby
from torch.utils.data import DataLoader
import spacy
import random
# Import the datasets library and the Dataset class
# !pip install --upgrade sentence-transformers datasets

nlp = spacy.blank("en")
nlp.add_pipe("sentencizer")

def windowize_sentence(sentence, window_size=50, stride=50):
    tokens = sentence.split()
    spans = []
    # slide a window of up to 50 words (no overlap, since stride=window_size)
    for start in range(0, max(1, len(tokens) - window_size + 1), stride):
        spans.append(" ".join(tokens[start:start + window_size]))
    if not spans:
        spans = [" ".join(tokens)]
    return spans

processed = []
for vid, group in groupby(sentences, key=lambda x: x['video_id']):
    group = list(group)
    long_para = group[0]['sentence']
    for sent in nlp(long_para).sents:
        # further split any spaCy sentence over 50 words
        for chunk in windowize_sentence(sent.text, window_size=50, stride=50):
            processed.append({
                "video_id": vid,
                "sentence": chunk.strip()
            })

train_examples = []
for vid, group in groupby(processed, key=lambda x: x['video_id']):
    grp = list(group)
    for i in range(len(grp) - 1):
        s1, s2 = grp[i]['sentence'], grp[i+1]['sentence']
        if not s1 or not s2:
            continue
        # import here, where InputExample is used
        from sentence_transformers import InputExample
        train_examples.append(InputExample(texts=[s1, s2]))

# 4. Shuffle, batch, and fine‐tune
random.shuffle(train_examples)
train_dataloader = DataLoader(train_examples, shuffle=True, batch_size=16)

# import here, where SentenceTransformer is used
from sentence_transformers import SentenceTransformer, losses
model = SentenceTransformer('all-MiniLM-L6-v2')
train_loss = losses.MultipleNegativesRankingLoss(model=model)
model.fit(
    train_objectives=[(train_dataloader, train_loss)],
    epochs=1,
    warmup_steps=100
)

# 5. Save your fine‐tuned model
model.save("fine_tuned_minilm")



Computing widget examples:   0%|          | 0/1 [00:00<?, ?example/s]

<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize?ref=models
wandb: Paste an API key from your profile and hit enter:

 ··········


[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33myc6317[0m ([33mproject_hpml_yuzhong[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Step,Training Loss


In [16]:
# !pip install peft --upgrade
from peft import PeftModelForFeatureExtraction
sent_texts = [
    chunk
    for s in sentences
    for sent in nlp(s["sentence"]).sents  # Split into spaCy sentences
    for chunk in windowize_sentence(sent.text)  # Further split into windowed chunks
]
embeddings = model.encode(sent_texts, convert_to_numpy=True)
print("Encoded sentences to vectors of shape:", embeddings.shape)



Encoded sentences to vectors of shape: (1435, 384)


In [17]:
from bertopic import BERTopic

print(len(sent_texts))
topics, probs = topic_model.fit_transform(sent_texts)
print(topic_model.get_topic_info().head())

1435
   Topic  Count                              Name  \
0     -1    495                   -1_uh_the_of_to   
1      0    178        0_feature_map_depth_filter   
2      1     87  1_layers_layer_convolutional_are   
3      2     58             2_image_images_uh_car   
4      3     50          3_signal_was_strength_of   

                                      Representation  \
0     [uh, the, of, to, is, and, this, that, in, we]   
1  [feature, map, depth, filter, the, output, inp...   
2  [layers, layer, convolutional, are, the, uh, s...   
3  [image, images, uh, car, in, um, of, that, you...   
4  [signal, was, strength, of, uh, the, this, wer...   

                                 Representative_Docs  
0  [uh to be called the radar problem and uh we'l...  
1  [input feature map of depth map of depth map o...  
2  [in the fully connected dense layer fully conn...  
3  [in they are uh positioned at some point in th...  
4  [the radar problem High signal strength low si...  


In [25]:
!pip install qdrant-client
import os
import time
from qdrant_client import QdrantClient
from qdrant_client.http.models import Distance, VectorParams, PointStruct


# Set Qdrant storage path to be within the current working directory
os.environ["QDRANT_STORAGE_PATH"] = "./qdrant_storage"

# Start the Qdrant server in the background
client = QdrantClient(host="localhost", port=6333)
print("Qdrant server started")

Qdrant server started


In [41]:
# First install required packages
!pip install -U qdrant-client[fastembed]

from qdrant_client import QdrantClient, models
import numpy as np
import time

# Start Qdrant in standalone mode (no Docker needed)
client = QdrantClient(
    location=":memory:",  # Use in-memory storage
    prefer_grpc=False     # Disable gRPC for Colab compatibility
)

# Create collection
client.recreate_collection(
    collection_name="video_captions",
    vectors_config=models.VectorParams(size=384, distance=models.Distance.COSINE)
)

# Generate sample embeddings (replace with your actual data)
embeddings = np.random.rand(len(sentences), 384)

# Prepare and insert points
points = [
    models.PointStruct(
        id=i,
        vector=embedding.tolist(),
        payload={
            "video_id": sent["video_id"],
            "start": sent["start"],
            "end": sent["end"],
            "sentence": sent["sentence"]
        }
    )
    for i, (sent, embedding) in enumerate(zip(sentences, embeddings))
]

# Insert data
client.upsert(
    collection_name="video_captions",
    points=points
)

print(f"Successfully stored {len(points)} vectors!")

Collecting fastembed==0.6.1 (from qdrant-client[fastembed])
  Downloading fastembed-0.6.1-py3-none-any.whl.metadata (10 kB)
Collecting loguru<0.8.0,>=0.7.2 (from fastembed==0.6.1->qdrant-client[fastembed])
  Downloading loguru-0.7.3-py3-none-any.whl.metadata (22 kB)
Collecting mmh3<6.0.0,>=4.1.0 (from fastembed==0.6.1->qdrant-client[fastembed])
  Downloading mmh3-5.1.0-cp311-cp311-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (16 kB)
Collecting onnxruntime!=1.20.0,>=1.17.0 (from fastembed==0.6.1->qdrant-client[fastembed])
  Downloading onnxruntime-1.22.0-cp311-cp311-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (4.5 kB)
Collecting py-rust-stemmers<0.2.0,>=0.1.0 (from fastembed==0.6.1->qdrant-client[fastembed])
  Downloading py_rust_stemmers-0.1.5-cp311-cp311-manylinux_2_28_x86_64.whl.metadata (3.4 kB)
Collecting coloredlogs (from onnxruntime!=1.20.0,>=1.17.0->fastembed==0.6.1->qdrant-client[fastembed])
  Downloading coloredlog

In [42]:
def answer_query(query, top_k=1):
    q_vec = model.encode([query], convert_to_numpy=True)[0]
    results = client.search(collection_name=collection_name, query_vector=q_vec, limit=top_k)
    top = results[0]  # best match
    data = top.payload
    return data["video_id"], data["start"], data["end"], data["sentence"]

# Example query
video_id, start, end, sentence = answer_query("Explain how convolutional layers work")
print(f"Top answer: \"{sentence}\" (Video {video_id}, {start:.1f}-{end:.1f}s)")


Top answer: "we have seen we have seen earlier the regression problem uh where earlier the regression problem uh where earlier the regression problem uh where we have effectively to model a we have effectively to model a we have effectively to model a conditional probability distribution at conditional probability distribution at conditional probability distribution at the output of our predictor now we the output of our predictor now we the output of our predictor now we actually switching to a new task it's a actually switching to a new task it's a actually switching to a new task it's a classification task where again we will classification task where again we will classification task where again we will solve this task using The Vaping block solve this task using The Vaping block solve this task using The Vaping block diagram again we will need to model our diagram again we will need to model our diagram again we will need to model our predictor with a conditional probability predi

In [43]:
import ffmpeg

def extract_clip(video_id, start, end, output_path="clip.mp4"):
    # Find the video file for this ID
    mp4_files = glob.glob(os.path.join("./data/videos", video_id, "*.mp4"))
    if not mp4_files:
        raise FileNotFoundError(f"No video found for ID {video_id}")
    video_path = mp4_files[0]
    (
        ffmpeg
        .input(video_path, ss=start, to=end)
        .output(output_path, codec="copy")
        .run(overwrite_output=True)
    )
    return output_path

# Example: extract the clip for the top result
clip_file = extract_clip(video_id, start, end, output_path=f"clip_{video_id}_{int(start)}_{int(end)}.mp4")
print(f"Saved clip to {clip_file}")


Saved clip to clip_eQ6UE968Xe4_1_3103.mp4


In [45]:
%load_ext gradio

# %%blocks
import gradio as gr

def qa_pipeline(question):
    vid, s, e, ans = answer_query(question)
    clip = extract_clip(vid, s, e, output_path=f"clip_{vid}_{int(s)}_{int(e)}.mp4")
    return clip, ans

with gr.Blocks() as demo:
    gr.Markdown("## Chat with Your Video Library")
    inp = gr.Textbox(label="Enter your question")
    vid_out = gr.Video(label="Relevant clip")
    text_out = gr.Textbox(label="Answer sentence")
    inp.submit(qa_pipeline, [inp], [vid_out, text_out])

demo


The gradio extension is already loaded. To reload it, use:
  %reload_ext gradio


Gradio Blocks instance: 1 backend functions
-------------------------------------------
fn_index=0
 inputs:
 |-<gradio.components.textbox.Textbox object at 0x7aa5769df410>
 outputs:
 |-<gradio.components.video.Video object at 0x7aa47a2c9210>
 |-<gradio.components.textbox.Textbox object at 0x7aa5769ca590>