# VideoDB for STS3E4

In [5]:
# Install SDK (run once)
# Installs the VideoDB SDK, the OpenAI client and python-dotenv into the kernel's environment.
# NOTE(review): no versions are pinned here, so a later re-run may resolve different releases — consider pinning.
%pip install videodb openai python-dotenv

Note: you may need to restart the kernel to use updated packages.


In [6]:
import os
import json
import videodb

from dotenv import load_dotenv
from videodb import SceneExtractionType

# Credentials are read from a .env file in the current working directory.
base_dir = os.getcwd()
dotenv_path = os.path.join(base_dir, ".env")
# override=True lets values from the .env file replace variables already set in the process.
load_dotenv(dotenv_path=dotenv_path, override=True)

videodb_api_key = os.getenv("VIDEO_DB_API_KEY")
if not videodb_api_key:
    raise ValueError(
        f"Missing VIDEO_DB_API_KEY. Looked in {dotenv_path} (exists={os.path.exists(dotenv_path)})"
    )

conn = videodb.connect(api_key=videodb_api_key)

video_path = "./videos/STS3E4.mp4"
# Fail fast with an actionable message (mirroring the API-key check above) instead of
# surfacing whatever error the SDK raises for a missing local file.
if not os.path.isfile(video_path):
    raise FileNotFoundError(
        f"Video file not found: {os.path.abspath(video_path)} (cwd={base_dir})"
    )

# NOTE(review): the upload runs unconditionally, so every execution of this cell
# sends the file again — confirm that is intended before re-running.
video = conn.upload(file_path=video_path)

video.id

'm-z-019c3af7-6cfc-7e82-9128-ffc5e515e796'

In [7]:
# Transcript generation
video.generate_transcript()

# Fetch both views of the transcript: the structured one and the plain-text one.
transcript = video.get_transcript()  # timestamped segments
transcript_text = video.get_transcript_text()

# Show only the first 1000 characters to keep the cell output short.
print("Transcript text preview:", transcript_text[:1000], sep="\n")

# Persist the raw transcript so later steps can reuse it
os.makedirs("./outputs", exist_ok=True)
with open("./outputs/sts3e4_transcript.json", "w") as transcript_file:
    transcript_file.write(json.dumps(transcript, indent=2))

Transcript text preview:
Sam. Nancy. Nancy. You're so beautiful. Sa. Steve. Hey, Steve. I'll see you tomorrow, okay? It. Oh, Jesus. You scared me. I scared you? I know. I should have called. Where have you been? We agreed on Zen after the assembly. Some people wanted to get something to eat. I didn't think it'd be a big deal. You didn't think to call and let me know? With everything that's been going on, I didn't realize how late it was, okay? I'm sorry, Mom. What more do you want? Hey, wait. Whose sweatshirt is that? Steve's. Steve's? So is Steve your boyfriend now? What? No, it was just cold, so I borrowed his sweatshirt. It's no big deal. Nancy. What? You can talk to me. You can talk to me. Whatever happened. Nothing happened, Nancy. Nothing happened. Can I please go? Please? Me? Please. Just talk to me. Talk to me. Just say. Mom. Jonathan, Come here. What is this? Come here. Come here. What's going on? It's Will. It's Will. He's trying to talk to me. He's trying to talk to you? Yes

In [8]:
# Scene understanding (visual indexing)
# Shot-based extraction; the prompt steers each description towards product/brand cues.
scene_index_id = video.index_scenes(
    prompt="Describe the scene with a focus on products, brands, packaging, and any visible labels.",
    extraction_type=SceneExtractionType.shot_based,
)

scenes = video.get_scene_index(scene_index_id)

# Bundle everything the downstream GPT step consumes into a single structure
payload = dict(
    video_id=video.id,
    video_path=video_path,
    transcript=transcript,
    scenes=scenes,
)

# Written into ./outputs, which the transcript cell created.
with open("./outputs/sts3e4_scene_and_transcript.json", "w") as payload_file:
    payload_file.write(json.dumps(payload, indent=2))

payload.keys()

dict_keys(['video_id', 'video_path', 'transcript', 'scenes'])

In [9]:
# GPT request: JSON schema embedded in the system prompt, sent in batches with progress logging
from openai import OpenAI
from pathlib import Path
import time
import math

# Read the OpenAI key from the environment (the .env file was loaded via load_dotenv earlier).
openai_api_key = os.getenv("OPENAI_API_KEY")
if not openai_api_key:
    # Report where the .env file was expected so a missing key is easy to diagnose.
    raise ValueError(
        f"Missing OPENAI_API_KEY. Looked in {dotenv_path} (exists={os.path.exists(dotenv_path)})"
    )

client = OpenAI(api_key=openai_api_key)

# Model name passed to client.responses.create for every batch.
MODEL = "gpt-5"
# Number of scenes sent to the model in a single request.
BATCH_SIZE = 80
# Default character cap for the dialogue excerpt attached to each scene.
MAX_CHARS = 400
# Optional cap on how many scenes are processed; a falsy value (None) processes all of them.
MAX_SCENES = None  # set to an int for quick runs

# The schema is assembled bottom-up from named pieces; key order inside every
# object is kept stable because the serialized form is pasted into the prompt.

# Evidence object: two required strings, "visual" and "dialogue".
_evidence_schema = {
    "type": "object",
    "additionalProperties": False,
    "properties": {
        "visual": {"type": "string"},
        "dialogue": {"type": "string"},
    },
    "required": ["visual", "dialogue"],
}

# One product mention inside a scene; every field is required.
_product_mention_schema = {
    "type": "object",
    "additionalProperties": False,
    "properties": {
        "product_name": {"type": "string"},
        "brand": {"type": "string"},
        "category": {"type": "string"},
        "confidence": {"type": "number"},
        "evidence": _evidence_schema,
    },
    "required": ["product_name", "brand", "category", "confidence", "evidence"],
}

# One scene: an id, a two-number timestamp range and its product mentions.
_scene_schema = {
    "type": "object",
    "additionalProperties": False,
    "properties": {
        "scene_id": {"type": "string"},
        "timestamp_range": {
            "type": "array",
            "items": {"type": "number"},
            "minItems": 2,
            "maxItems": 2,
        },
        "product_mentions": {"type": "array", "items": _product_mention_schema},
    },
    "required": ["scene_id", "timestamp_range", "product_mentions"],
}

# Top-level response object expected back from the model.
schema = {
    "type": "object",
    "additionalProperties": False,
    "properties": {
        "video_id": {"type": "string"},
        "scenes": {"type": "array", "items": _scene_schema},
    },
    "required": ["video_id", "scenes"],
}

# Whitespace-free serialization, used when embedding the schema in the prompt.
schema_compact = json.dumps(schema, separators=(",", ":"))


def get_scene_time_range(scene):
    """Return the ``(start, end)`` time range of a scene record.

    The bounds are looked up under ``start``/``start_time``/``start_sec`` and
    ``end``/``end_time``/``end_sec`` (first key holding a value wins). If either
    bound is still missing, a two-element ``timestamp_range`` or ``time_range``
    (list or tuple) supplies both.

    A bound of ``0`` is a valid timestamp and is returned as-is; only ``None``
    and the empty string count as "missing".

    Args:
        scene: Mapping-like scene record exposing ``.get``.

    Returns:
        Tuple ``(start, end)``; either element is ``None`` when it cannot be found.
    """

    def first_present(keys):
        # Explicit "missing" test: a plain ``or`` chain would discard a legitimate 0.
        for key in keys:
            value = scene.get(key)
            if value is not None and value != "":
                return value
        return None

    start = first_present(("start", "start_time", "start_sec"))
    end = first_present(("end", "end_time", "end_sec"))
    if start is None or end is None:
        ts = scene.get("timestamp_range") or scene.get("time_range")
        if isinstance(ts, (list, tuple)) and len(ts) == 2:
            start, end = ts
    return start, end


def format_scene(scene):
    """Reduce a raw scene record to the compact form sent to the model.

    Returns:
        A dict with three keys:
        ``scene_id`` - string form of ``scene_id``/``id`` (empty string if neither is set);
        ``timestamp_range`` - ``[start, end]`` as floats, a missing bound becoming ``0.0``;
        ``scene_description`` - first non-empty of ``description``, ``caption``,
        ``summary``, otherwise ``""``.
    """
    begin, finish = get_scene_time_range(scene)

    identifier = scene.get("scene_id") or scene.get("id") or ""

    # Walk the candidate text fields in priority order and stop at the first usable one.
    text = ""
    for key in ("description", "caption", "summary"):
        candidate = scene.get(key)
        if candidate:
            text = candidate
            break

    compact = {}
    compact["scene_id"] = str(identifier)
    compact["timestamp_range"] = [float(begin or 0), float(finish or 0)]
    compact["scene_description"] = text
    return compact


def transcript_overlap_text(start, end, transcript_items, max_chars=MAX_CHARS):
    """Collect the transcript text that overlaps the ``[start, end]`` interval.

    Items are scanned in order; an item contributes when its own interval touches
    or overlaps ``[start, end]`` and it has non-empty text (``text`` or ``word``).
    Scanning stops once roughly ``max_chars`` characters have been gathered.

    An item timestamp of ``0`` is valid: only ``None`` and the empty string are
    treated as a missing bound, so the opening item of a transcript is kept.

    Args:
        start: Interval start; ``None`` yields an empty result.
        end: Interval end; ``None`` yields an empty result.
        transcript_items: Iterable of mapping-like items exposing ``.get``; may be ``None``.
        max_chars: Maximum length of the returned string.

    Returns:
        The matching texts joined by single spaces, truncated to ``max_chars``.
    """
    if start is None or end is None:
        return ""

    def first_present(item, keys):
        # A plain ``or`` chain would drop a legitimate 0 timestamp, so test explicitly.
        for key in keys:
            value = item.get(key)
            if value is not None and value != "":
                return value
        return None

    chunks = []
    length = 0
    for item in transcript_items or []:
        t_start = first_present(item, ("start", "start_time", "start_sec"))
        t_end = first_present(item, ("end", "end_time", "end_sec"))
        text = item.get("text") or item.get("word") or ""
        if t_start is None or t_end is None:
            continue
        if t_end >= start and t_start <= end and text:
            chunks.append(text)
            # +1 accounts for the space that the final join inserts between chunks.
            length += len(text) + 1
            if length >= max_chars:
                break
    return " ".join(chunks)[:max_chars]


# Shrink the payload to just what the model needs.
# Both inputs may be either a bare list or a dict wrapping the list.
scenes_items = scenes if isinstance(scenes, list) else scenes.get("scenes", [])
transcript_items = transcript if isinstance(transcript, list) else transcript.get("segments", [])

if MAX_SCENES:
    scenes_items = scenes_items[:MAX_SCENES]

batch_size = BATCH_SIZE
all_scene_results = []
# 1-based numbers of the batches whose reply could not be used.
failed_batches = []

start_time = time.time()
total_batches = math.ceil(len(scenes_items) / batch_size) if scenes_items else 0

for batch_index in range(total_batches):
    i = batch_index * batch_size
    batch = scenes_items[i : i + batch_size]

    # Compact each scene and attach the dialogue spoken during its time range.
    compact_batch = []
    for scene in batch:
        start, end = get_scene_time_range(scene)
        compact_batch.append(
            {
                **format_scene(scene),
                "dialogue_excerpt": transcript_overlap_text(start, end, transcript_items),
            }
        )

    batch_payload = {
        "video_id": video.id,
        "scenes": compact_batch,
    }

    print(f"Batch {batch_index + 1}/{total_batches} | scenes {i}..{i + len(batch) - 1}")
    batch_start = time.time()

    response = client.responses.create(
        model=MODEL,
        input=[
            {
                "role": "system",
                "content": (
                    "Extract product mentions per scene using the scene_description and dialogue_excerpt. "
                    "Return ONLY valid JSON that matches this JSON Schema and nothing else: "
                    + schema_compact
                ),
            },
            {
                "role": "user",
                "content": json.dumps(batch_payload, separators=(",", ":")),
            },
        ],
    )

    content = response.output_text
    # The schema is only requested through the prompt, so the reply may still be malformed.
    # One bad reply must not throw away the batches already collected: record it and move on.
    try:
        structured = json.loads(content)
        if not isinstance(structured, dict):
            raise ValueError(f"expected a JSON object, got {type(structured).__name__}")
    except ValueError as exc:  # json.JSONDecodeError is a ValueError subclass
        failed_batches.append(batch_index + 1)
        print(f"Batch {batch_index + 1}/{total_batches} returned unusable JSON ({exc}); skipping it")
        continue
    # `or []` also covers a reply that carries "scenes": null.
    all_scene_results.extend(structured.get("scenes") or [])

    batch_elapsed = time.time() - batch_start
    done_scenes = min((batch_index + 1) * batch_size, len(scenes_items))
    total_elapsed = time.time() - start_time
    # Clamp the denominator to 1s so a very fast first batch does not inflate the rate.
    rate = done_scenes / max(total_elapsed, 1)
    print(f"Batch done in {batch_elapsed:.1f}s | {done_scenes}/{len(scenes_items)} scenes | {rate:.2f} scenes/s")

# Output file name is derived from the video file name (without extension).
output_prefix = Path(video_path).stem
final_structured = {
    "video_id": video.id,
    "scenes": all_scene_results,
}

with open(f"./outputs/{output_prefix}_structured_products.json", "w") as f:
    json.dump(final_structured, f, indent=2)

print(f"Wrote ./outputs/{output_prefix}_structured_products.json")
if failed_batches:
    print(f"WARNING: {len(failed_batches)} batch(es) produced no results: {failed_batches}")
final_structured.keys()

Batch 1/20 | scenes 0..79
Batch done in 168.5s | 80/1597 scenes | 0.47 scenes/s
Batch 2/20 | scenes 80..159
Batch done in 142.5s | 160/1597 scenes | 0.51 scenes/s
Batch 3/20 | scenes 160..239
Batch done in 187.7s | 240/1597 scenes | 0.48 scenes/s
Batch 4/20 | scenes 240..319
Batch done in 253.5s | 320/1597 scenes | 0.43 scenes/s
Batch 5/20 | scenes 320..399
Batch done in 190.6s | 400/1597 scenes | 0.42 scenes/s
Batch 6/20 | scenes 400..479
Batch done in 107.8s | 480/1597 scenes | 0.46 scenes/s
Batch 7/20 | scenes 480..559
Batch done in 185.8s | 560/1597 scenes | 0.45 scenes/s
Batch 8/20 | scenes 560..639
Batch done in 153.6s | 640/1597 scenes | 0.46 scenes/s
Batch 9/20 | scenes 640..719
Batch done in 183.9s | 720/1597 scenes | 0.46 scenes/s
Batch 10/20 | scenes 720..799
Batch done in 144.8s | 800/1597 scenes | 0.47 scenes/s
Batch 11/20 | scenes 800..879
Batch done in 180.0s | 880/1597 scenes | 0.46 scenes/s
Batch 12/20 | scenes 880..959
Batch done in 77.5s | 960/1597 scenes | 0.49 scen

dict_keys(['video_id', 'scenes'])