# Video Data Processing

This notebook demonstrates an example of video data processing on Tracto. It includes:

* Uploading a video dataset from Hugging Face to Tracto.
* Splitting video into 10-second segments.
* Obtaining video descriptions using `Qwen/Qwen2.5-VL-7B-Instruct` inference.

In [1]:
# configure the environment to run this notebook
import uuid
import yt.wrapper as yt

username = yt.get_user_name()


def _pool_trees_with_suffix(pool_trees, suffix):
    """Return the entries of ``pool_trees`` whose name ends with ``suffix``, in their original order."""
    return [pool_tree for pool_tree in pool_trees if pool_tree.endswith(suffix)]


if yt.exists(f"//sys/users/{username}/@user_info/home_path"):
    # prepare working directory on distributed file system
    # (reuse `username` instead of asking the cluster for the user name a second time)
    user_info = yt.get(f"//sys/users/{username}/@user_info")
    homedir = user_info["home_path"]
    # find available VM presets
    available_pool_trees = user_info["available_pool_trees"]
    # CPU jobs fall back to the "default" pool tree when no dedicated CPU tree is listed
    cpu_pool_trees = _pool_trees_with_suffix(available_pool_trees, "cpu") or ["default"]
    h100_pool_trees = _pool_trees_with_suffix(available_pool_trees, "h100")
    h100_8_pool_trees = _pool_trees_with_suffix(available_pool_trees, "h100-8")
    h200_pool_trees = _pool_trees_with_suffix(available_pool_trees, "h200")
    h200_8_pool_trees = _pool_trees_with_suffix(available_pool_trees, "h200-8")
    workdir = f"{homedir}/tmp/demo_workdir/{uuid.uuid4().hex}"
else:
    # no per-user info on this cluster: use fixed pool tree names and a shared tmp location
    cpu_pool_trees = ["default"]
    h100_pool_trees = ["gpu_h100"]
    h100_8_pool_trees = ["gpu_h100"]
    h200_pool_trees = ["gpu_h200"]
    h200_8_pool_trees = ["gpu_h200"]
    workdir = f"//tmp/examples/{uuid.uuid4().hex}"

# every run gets its own uuid-named directory, so reruns do not overwrite each other
yt.create("map_node", workdir, recursive=True, ignore_existing=True)
print("Current working directory:", workdir)

Current working directory: //home/established_tan_whale/tmp/demo_workdir/ac3a9ffbfcfc41c0b514da1d62f18365


In [2]:
# we have to install extra dependency to be able to process a dataset using `huggingface_hub` library
!pip install zstandard



[0m


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.3.1[0m[39;49m -> [0m[32;49m25.1.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [3]:
from huggingface_hub import hf_hub_url
import requests
from yt import type_info
import os

from datasets import load_dataset

# Hugging Face dataset repository that holds both the metadata split and the media files
HF_REPO_ID = "nyuuzyou/cs2-highlights"
# connect/read timeout in seconds; `requests` waits forever when no timeout is given
HF_REQUEST_TIMEOUT = 60

ds = load_dataset(HF_REPO_ID, split="metadata")


def _download_repo_file(filename):
    """Download one file of the dataset repository and return its content as bytes.

    Raises:
        requests.HTTPError: if the server answers with an error status.
        requests.Timeout: if the server does not respond within HF_REQUEST_TIMEOUT seconds.
    """
    url = hf_hub_url(repo_id=HF_REPO_ID, repo_type="dataset", filename=filename)
    response = requests.get(url, timeout=HF_REQUEST_TIMEOUT)
    response.raise_for_status()
    return response.content


def process_dataset(limit=25):
    """Yield one table row per video for the first ``limit`` entries of the metadata split.

    Args:
        limit: maximum number of metadata entries to download; clamped to the
            size of the split so that a short split does not raise.

    Yields:
        dict with the keys ``ds_index`` (position in the metadata split), ``video`` and
        ``preview`` (downloaded bytes), ``title`` and ``file_type`` (extension of the
        video file without the leading dot).
    """
    row_count = min(limit, len(ds))
    for index, r in enumerate(ds.select(range(row_count))):
        # same download order as before: the video first, then its preview image
        video = _download_repo_file(r["file"])
        preview = _download_repo_file(r["preview"])
        yield {
            "ds_index": index,
            "video": video,
            "preview": preview,
            "title": r["title"],
            "file_type": os.path.splitext(r["file"])[1].lstrip("."),
        }

# Column layout of the raw dataset table, as (column name, column type) pairs in table order.
dataset_columns = [
    ("ds_index", type_info.Int64),
    ("preview", type_info.Tagged[type_info.String, "image/jpeg"]),
    ("video", type_info.Tagged[type_info.String, "video"]),
    ("title", type_info.Optional[type_info.String]),
    ("file_type", type_info.String),
]
schema = yt.schema.TableSchema()
for column_name, column_type in dataset_columns:
    schema.add_column(column_name, column_type)

dataset_path = f"{workdir}/dataset"
print(f"Dataset path: {dataset_path}")

# `force=True` replaces a table left over from a previous run of this cell
yt.create("table", dataset_path, force=True, attributes={"schema": schema.to_yson_type()})

# every row carries a whole video file, so rows of up to 800 MiB are allowed
dataset_writer_config = {"max_row_weight": 800 * 1024 * 1024}
yt.write_table(dataset_path, process_dataset(), table_writer=dataset_writer_config)

Dataset path: //home/established_tan_whale/tmp/demo_workdir/ac3a9ffbfcfc41c0b514da1d62f18365/dataset


In [4]:
import subprocess
import sys
import tempfile

def split_and_convert_to_mp4(video_file, file_type, temp_dir, duration=10):
    """Re-encode a video to H.264/AAC and cut it into MP4 segments of ``duration`` seconds.

    Qwen/Qwen2.5-VL-7B-Instruct decoder doesn't work well with webm, so every
    segment is converted to mp4.

    Args:
        video_file: path of the input video.
        file_type: extension of the input video; not used by the conversion,
            kept so that existing callers keep working.
        temp_dir: existing directory that receives the ``segment_NNN.mp4`` files.
        duration: segment length in seconds.

    Returns:
        Paths of the produced segments, ordered by segment number.

    Raises:
        subprocess.CalledProcessError: if ffmpeg exits with a non-zero status.
    """
    cmd = [
        'ffmpeg',
        '-i', video_file,
        '-c:v', 'libx264',
        '-preset', 'slow',
        '-crf', '18',
        # the segment muxer can only cut at key frames, so force one at every
        # segment boundary; otherwise segment lengths drift away from `duration`
        '-force_key_frames', f'expr:gte(t,n_forced*{duration})',
        '-c:a', 'aac',
        '-b:a', '192k',
        '-segment_time', str(duration),
        '-f', 'segment',
        '-reset_timestamps', '1',
        f'{temp_dir}/segment_%03d.mp4',
    ]

    print("Output prefix", temp_dir, file=sys.stderr)

    # ffmpeg gets no stdin (it must not wait for interactive input) and logs to our stderr
    subprocess.run(cmd, check=True, stdin=subprocess.DEVNULL, stdout=sys.stderr, stderr=sys.stderr)

    # only pick up files written by the segment muxer, not unrelated files in temp_dir
    names = [
        name for name in os.listdir(temp_dir)
        if name.startswith('segment_') and name.endswith('.mp4')
    ]
    # `%03d` pads to at least three digits, so sorting by length first keeps
    # numeric order even beyond segment 999
    names.sort(key=lambda name: (len(name), name))
    segments = [os.path.join(temp_dir, name) for name in names]
    print("Segments", segments, file=sys.stderr)

    return segments


def get_length(path):
    """Return the duration that ffprobe reports for the media file at ``path``, as a float.

    ffprobe is asked to print only the container-level ``duration`` entry in
    CSV form without a section prefix, so its stdout is just the number.

    Raises:
        subprocess.CalledProcessError: if ffprobe exits with a non-zero status.
        ValueError: if the captured output cannot be parsed as a float.
    """
    ffprobe_options = ['-v', 'quiet', '-show_entries', 'format=duration', '-of', 'csv=p=0']
    probe = subprocess.run(
        ['ffprobe', *ffprobe_options, path],
        check=True,
        stdin=subprocess.PIPE,
        stdout=subprocess.PIPE,  # captured: this is where the duration is printed
        stderr=sys.stderr,
        text=True,
    )
    duration_text = probe.stdout.strip()
    return float(duration_text)


def mapper_split_and_convert_to_mp4(record):
    """Map one dataset row to one output row per MP4 segment of its video.

    Args:
        record: input row; the fields ``video``, ``file_type``, ``ds_index``,
            ``preview`` and ``title`` are read.

    Yields:
        dict with the segment bytes (``video``), the source ``ds_index``, the
        position of the segment within the video (``segment_index``), its
        ``length`` as reported by ``get_length``, the unchanged ``preview``,
        the ``title`` and ``file_type`` set to ``"mp4"``.
    """
    file_type = record["file_type"]
    title = record["title"]
    # The input video and the segments live in two separate scratch directories:
    # both are removed once the row is processed, and the segments directory
    # contains nothing but the files produced by the conversion.
    with tempfile.TemporaryDirectory(dir='.') as input_dir, tempfile.TemporaryDirectory(dir='.') as temp_dir:
        video_file = f"{input_dir}/video.{file_type}"
        with open(video_file, "wb") as f:
            f.write(yt.yson.get_bytes(record["video"]))
        segments = split_and_convert_to_mp4(video_file, file_type, temp_dir)
        for index, segment in enumerate(segments):
            with open(segment, "rb") as f:
                content = f.read()
            yield {
                "video": content,
                "ds_index": int(record["ds_index"]),
                "segment_index": index,
                "length": get_length(segment),
                "preview": record["preview"],
                # the column is optional: keep a missing title missing instead of
                # storing the literal string "None"
                "title": None if title is None else str(title),
                "file_type": "mp4",
            }

# Column layout of the segments table, as (column name, column type) pairs in table order.
segment_columns = [
    ("ds_index", type_info.Int64),
    ("segment_index", type_info.Int64),
    ("preview", type_info.Tagged[type_info.String, "image/jpeg"]),
    ("video", type_info.Tagged[type_info.String, "video"]),
    ("title", type_info.Optional[type_info.String]),
    ("file_type", type_info.String),
    ("length", type_info.Float),
]
schema = yt.schema.TableSchema()
for column_name, column_type in segment_columns:
    schema.add_column(column_name, column_type)

segments_path = f"{workdir}/segments"
# `force=True` replaces a table left over from a previous run of this cell
yt.create("table", segments_path, force=True, attributes={"schema": schema.to_yson_type()})

split_operation_spec = {
    "mapper": {
        # the job's working directory is a 1 GiB tmpfs; the job itself may use up to 12 GiB of memory
        "tmpfs_path": ".",
        "tmpfs_size": 1024 ** 3,
        "memory_limit": 12 * 1024 ** 3,
    },
    "job_io": {
        # output rows carry whole video segments, so rows of up to 800 MiB are allowed
        "table_writer": {
            "max_row_weight": 800 * 1024 * 1024,
        },
    },
}

yt.run_map(
    mapper_split_and_convert_to_mp4,
    dataset_path,
    segments_path,
    spec=split_operation_spec,
)

print(segments_path)

2025-07-10 18:23:32,800	INFO	Operation started: https://playground.tracto.ai/playground/operations/2d555a69-fe536f4f-24dd03e8-17cb3b44/details


2025-07-10 18:23:32,856	INFO	( 0 min) operation 2d555a69-fe536f4f-24dd03e8-17cb3b44 initializing


2025-07-10 18:23:34,650	INFO	( 0 min) Unrecognized spec: {'enable_partitioned_data_balancing': false, 'mapper': {'title': 'mapper_split_and_convert_to_mp'}}


2025-07-10 18:23:34,721	INFO	( 0 min) operation 2d555a69-fe536f4f-24dd03e8-17cb3b44: running=0     completed=0     pending=25    failed=0     aborted=0     lost=0     total=25    blocked=0    


2025-07-10 18:23:40,023	INFO	( 0 min) operation 2d555a69-fe536f4f-24dd03e8-17cb3b44: running=1     completed=0     pending=24    failed=0     aborted=0     lost=0     total=25    blocked=0    


2025-07-10 18:25:10,336	INFO	( 1 min) operation 2d555a69-fe536f4f-24dd03e8-17cb3b44: running=3     completed=1     pending=20    failed=0     aborted=0     lost=0     total=24    blocked=0    


2025-07-10 18:25:41,129	INFO	( 2 min) operation 2d555a69-fe536f4f-24dd03e8-17cb3b44: running=2     completed=2     pending=20    failed=0     aborted=0     lost=0     total=24    blocked=0    


2025-07-10 18:27:34,150	INFO	( 4 min) operation 2d555a69-fe536f4f-24dd03e8-17cb3b44: running=3     completed=3     pending=17    failed=0     aborted=0     lost=0     total=23    blocked=0    


2025-07-10 18:27:59,760	INFO	( 4 min) operation 2d555a69-fe536f4f-24dd03e8-17cb3b44: running=3     completed=4     pending=16    failed=0     aborted=0     lost=0     total=23    blocked=0    


2025-07-10 18:29:16,638	INFO	( 5 min) operation 2d555a69-fe536f4f-24dd03e8-17cb3b44: running=3     completed=5     pending=15    failed=0     aborted=0     lost=0     total=23    blocked=0    


2025-07-10 18:29:37,155	INFO	( 6 min) operation 2d555a69-fe536f4f-24dd03e8-17cb3b44: running=3     completed=6     pending=14    failed=0     aborted=0     lost=0     total=23    blocked=0    


2025-07-10 18:29:42,286	INFO	( 6 min) operation 2d555a69-fe536f4f-24dd03e8-17cb3b44: running=2     completed=7     pending=14    failed=0     aborted=0     lost=0     total=23    blocked=0    


2025-07-10 18:30:07,924	INFO	( 6 min) operation 2d555a69-fe536f4f-24dd03e8-17cb3b44: running=2     completed=8     pending=13    failed=0     aborted=0     lost=0     total=23    blocked=0    


2025-07-10 18:31:14,646	INFO	( 7 min) operation 2d555a69-fe536f4f-24dd03e8-17cb3b44: running=2     completed=9     pending=10    failed=0     aborted=0     lost=0     total=21    blocked=0    


2025-07-10 18:31:35,158	INFO	( 8 min) operation 2d555a69-fe536f4f-24dd03e8-17cb3b44: running=2     completed=10    pending=9     failed=0     aborted=0     lost=0     total=21    blocked=0    


2025-07-10 18:32:52,057	INFO	( 9 min) operation 2d555a69-fe536f4f-24dd03e8-17cb3b44: running=2     completed=11    pending=8     failed=0     aborted=0     lost=0     total=21    blocked=0    


2025-07-10 18:33:17,668	INFO	( 9 min) operation 2d555a69-fe536f4f-24dd03e8-17cb3b44: running=2     completed=12    pending=7     failed=0     aborted=0     lost=0     total=21    blocked=0    


2025-07-10 18:33:33,055	INFO	(10 min) operation 2d555a69-fe536f4f-24dd03e8-17cb3b44: running=2     completed=13    pending=5     failed=0     aborted=0     lost=0     total=20    blocked=0    


2025-07-10 18:34:55,137	INFO	(11 min) operation 2d555a69-fe536f4f-24dd03e8-17cb3b44: running=2     completed=14    pending=2     failed=0     aborted=0     lost=0     total=18    blocked=0    


2025-07-10 18:36:42,844	INFO	(13 min) operation 2d555a69-fe536f4f-24dd03e8-17cb3b44: running=2     completed=15    pending=0     failed=0     aborted=0     lost=0     total=17    blocked=0    


2025-07-10 18:37:44,420	INFO	(14 min) operation 2d555a69-fe536f4f-24dd03e8-17cb3b44: running=1     completed=16    pending=0     failed=0     aborted=0     lost=0     total=17    blocked=0    


2025-07-10 18:38:20,223	INFO	(14 min) operation 2d555a69-fe536f4f-24dd03e8-17cb3b44 completed


//home/established_tan_whale/tmp/demo_workdir/ac3a9ffbfcfc41c0b514da1d62f18365/segments


In [5]:
# Hugging Face id of the vision-language model that describes every video segment
MODEL = "Qwen/Qwen2.5-VL-7B-Instruct"


# NOTE(review): turns off the wrapper's "safe stream mode" for pickled jobs; the reason is not
# stated in this notebook — presumably required for running vLLM inside the job, confirm
yt.config["pickling"]["safe_stream_mode"] = False


@yt.aggregator
def mapper_make_description(records):
    """Generate a text description for every incoming video segment with the MODEL model.

    The vLLM engine, the sampling parameters and the chat processor are created
    once per call; afterwards the rows are handled one at a time: the video is
    written to a local file, a chat prompt referencing that file is built, and
    the model's answer is attached to the row.

    Args:
        records: iterable of input rows; the fields ``video``, ``file_type``,
            ``ds_index``, ``segment_index``, ``length``, ``preview`` and
            ``title`` are read.

    Yields:
        dict with the fields of the input row plus ``description``, the text
        generated by the model.
    """
    from transformers import AutoProcessor
    from vllm import LLM, SamplingParams
    from qwen_vl_utils import process_vision_info

    engine = LLM(model=MODEL, limit_mm_per_prompt={"video": 10})
    decoding = SamplingParams(
        temperature=0.1,
        top_p=0.001,
        repetition_penalty=1.05,
        max_tokens=256,
        stop_token_ids=[],
    )
    chat_processor = AutoProcessor.from_pretrained(MODEL)

    for row in records:
        # the same local file name is reused (and overwritten) for every row of a given type
        local_path = f"./video.{row['file_type']}"
        with open(local_path, "wb") as sink:
            sink.write(yt.yson.get_bytes(row["video"]))

        question_part = {"type": "text", "text": "What happens on this video?"}
        video_part = {
            "type": "video",
            "video": f"file://{os.path.abspath(local_path)}",
            "total_pixels": 20480 * 28 * 28,
            "min_pixels": 16 * 28 * 28,
        }
        messages = [
            {"role": "system", "content": "You are a helpful assistant."},
            {"role": "user", "content": [question_part, video_part]},
        ]

        prompt = chat_processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
        image_inputs, video_inputs, video_kwargs = process_vision_info(messages, return_video_kwargs=True)

        # pass along only the modalities that were actually extracted from the messages
        mm_data = {
            modality: inputs
            for modality, inputs in (("image", image_inputs), ("video", video_inputs))
            if inputs is not None
        }

        request = {
            "prompt": prompt,
            "multi_modal_data": mm_data,
            "mm_processor_kwargs": video_kwargs,
        }
        # one request per call: take the first completion of the first (only) result
        results = engine.generate([request], sampling_params=decoding)
        description = results[0].outputs[0].text

        yield {
            "video": row["video"],
            "ds_index": int(row["ds_index"]),
            "segment_index": row["segment_index"],
            "length": row["length"],
            "preview": row["preview"],
            "title": row["title"],
            "file_type": row["file_type"],
            "description": description,
        }

# Column layout of the result table: the segment columns plus the generated description.
result_columns = [
    ("ds_index", type_info.Int64),
    ("segment_index", type_info.Int64),
    ("preview", type_info.Tagged[type_info.String, "image/jpeg"]),
    ("video", type_info.Tagged[type_info.String, "video"]),
    ("title", type_info.Optional[type_info.String]),
    ("file_type", type_info.String),
    ("length", type_info.Float),
    ("description", type_info.String),
]
schema = yt.schema.TableSchema()
for column_name, column_type in result_columns:
    schema.add_column(column_name, column_type)

result_path = f"{workdir}/result"
# `force=True` replaces a table left over from a previous run of this cell
yt.create("table", result_path, force=True, attributes={"schema": schema.to_yson_type()})

# 60 GiB, used both as the job memory limit and as the size of the tmpfs working directory
gpu_job_bytes = 60 * 1024 ** 3

description_operation_spec = {
    "pool_trees": h100_pool_trees,
    "mapper": {
        "gpu_limit": 1,
        "memory_limit": gpu_job_bytes,
        "tmpfs_path": ".",
        "tmpfs_size": gpu_job_bytes,
        "environment": {
            # important for vllm v1
            "VLLM_WORKER_MULTIPROC_METHOD": "spawn",
        },
    },
    "job_io": {
        # output rows still carry the video segments, so rows of up to 800 MiB are allowed
        "table_writer": {
            "max_row_weight": 800 * 1024 * 1024,
        },
    },
    "max_speculative_job_count_per_task": 0,
}

yt.run_map(
    mapper_make_description,
    segments_path,
    result_path,
    job_count=2,
    spec=description_operation_spec,
)

print(result_path)

2025-07-10 18:38:23,348	INFO	Operation started: https://playground.tracto.ai/playground/operations/eb7d0519-be6d821d-24dd03e8-979c1a37/details


2025-07-10 18:38:23,414	INFO	( 0 min) operation eb7d0519-be6d821d-24dd03e8-979c1a37 starting


2025-07-10 18:38:24,058	INFO	( 0 min) Unrecognized spec: {'enable_partitioned_data_balancing': false, 'mapper': {'title': 'mapper_make_description'}}


2025-07-10 18:38:24,059	INFO	( 0 min) operation eb7d0519-be6d821d-24dd03e8-979c1a37 preparing


2025-07-10 18:38:26,924	INFO	( 0 min) operation eb7d0519-be6d821d-24dd03e8-979c1a37: running=0     completed=0     pending=2     failed=0     aborted=0     lost=0     total=2     blocked=0    


2025-07-10 18:38:35,807	INFO	( 0 min) operation eb7d0519-be6d821d-24dd03e8-979c1a37: running=1     completed=0     pending=1     failed=0     aborted=0     lost=0     total=2     blocked=0    


2025-07-10 18:49:31,665	INFO	(11 min) operation eb7d0519-be6d821d-24dd03e8-979c1a37: running=1     completed=1     pending=0     failed=0     aborted=0     lost=0     total=2     blocked=0    


2025-07-10 18:57:02,702	INFO	(18 min) operation eb7d0519-be6d821d-24dd03e8-979c1a37 completed


//home/established_tan_whale/tmp/demo_workdir/ac3a9ffbfcfc41c0b514da1d62f18365/result
