# Video Data Processing

This notebook demonstrates an example of video data processing on Tracto. It includes:

* Uploading a video dataset from Hugging Face to Tracto.
* Generating video descriptions by running inference with the Qwen/Qwen2.5-VL-7B-Instruct model.

In [1]:
# configure the environment to run this notebook
import uuid
import yt.wrapper as yt

# Resolve the working directory and the pool trees the rest of the notebook uses.
# If the cluster exposes per-user info (home path, available pool trees), use it;
# otherwise fall back to fixed defaults under //tmp.
username = yt.get_user_name()
if yt.exists(f"//sys/users/{username}/@user_info/home_path"):
    # prepare working directory on distributed file system
    # (reuse `username` instead of asking the cluster for the user name again)
    user_info = yt.get(f"//sys/users/{username}/@user_info")
    homedir = user_info["home_path"]
    # find available vm presets: pool trees are picked by the suffix of their name
    available_pool_trees = user_info["available_pool_trees"]
    cpu_pool_trees = [pool_tree for pool_tree in available_pool_trees if pool_tree.endswith("cpu")] or ["default"]
    h100_pool_trees = [pool_tree for pool_tree in available_pool_trees if pool_tree.endswith("h100")]
    h100_8_pool_trees = [pool_tree for pool_tree in available_pool_trees if pool_tree.endswith("h100_8")]
    workdir = f"{homedir}/tmp/demo_workdir/{uuid.uuid4().hex}"
else:
    cpu_pool_trees = ["default"]
    h100_pool_trees = ["gpu_h100"]
    h100_8_pool_trees = ["gpu_h100"]
    workdir = f"//tmp/examples/{uuid.uuid4().hex}"

# every run gets a fresh, uniquely named directory (random uuid4 suffix)
yt.create("map_node", workdir, recursive=True, ignore_existing=True)
print("Current working directory:", workdir)

Current working directory: //home/top_emerald_antlion/tmp/demo_workdir/0dbe6f35d7634d5c884fdccf17c605c7


In [2]:
# we have to install extra dependency to be able to process a dataset using `huggingface_hub` library
!pip install zstandard



[0m


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.3.1[0m[39;49m -> [0m[32;49m25.1.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


Files in this dataset are too large to be uploaded entirely into a table cell, so we'll cut them into 10-second segments before uploading.
Alternative approach: upload each video as a whole file and store a link to that file in the table.

In [4]:
from yt import type_info
import os
import sys
import uuid
import tempfile
import subprocess

from datasets import load_dataset

# Load the "train" split of the tractoai/gameplay dataset; its records are iterated by process_dataset() below.
ds = load_dataset("tractoai/gameplay", split="train")

def split_video(video_file, file_type, temp_dir, duration=10):
    """Cut ``video_file`` into consecutive segments with ffmpeg.

    The video stream is copied without re-encoding (``-c:v copy``), audio is
    dropped (``-an``), and ffmpeg's ``segment`` muxer writes the pieces into
    ``temp_dir`` as ``segment_NNN.mp4``.

    Args:
        video_file: Path of the source video.
        file_type: Extension of the source file. Not used here (segments are
            always written as mp4); kept so existing callers keep working.
        temp_dir: Directory that receives the segment files.
        duration: Requested segment length in seconds (ffmpeg ``-segment_time``).

    Returns:
        Paths of the produced segment files, ordered by segment number.

    Raises:
        subprocess.CalledProcessError: If ffmpeg exits with a non-zero status.
    """
    import re

    cmd = [
        'ffmpeg',
        '-i', video_file,
        '-c:v', 'copy',
        '-an',
        '-segment_time', str(duration),
        '-f', 'segment',
        '-reset_timestamps', '1',
        f'{temp_dir}/segment_%03d.mp4',
    ]

    # ffmpeg must not read from the caller's stdin, so give it an empty one.
    subprocess.run(cmd, check=True, stdin=subprocess.DEVNULL)

    # Order by the numeric segment index, not by file name: "%03d" only pads to
    # three digits, so "segment_1000.mp4" would sort before "segment_101.mp4"
    # in a plain string sort. Files that are not segments are ignored.
    segment_name = re.compile(r"segment_(\d+)\.mp4")
    numbered = []
    for name in os.listdir(temp_dir):
        match = segment_name.fullmatch(name)
        if match:
            numbered.append((int(match.group(1)), os.path.join(temp_dir, name)))
    numbered.sort()

    segments = [path for _, path in numbered]
    print("Segments", segments, file=sys.stderr)
    return segments

def process_dataset():
    """Yield one table row per video segment of the dataset.

    Every record of the module-level ``ds`` is split with ``split_video`` into
    segments inside a temporary directory created under the current working
    directory; the directory is removed once all segments of that record have
    been yielded.

    Yields:
        dict with the keys ``ds_index`` (position of the record in ``ds``),
        ``segment_index`` (position of the segment within the record),
        ``video`` (raw bytes of the segment file) and ``file_type`` (always
        ``"mp4"``, the format ``split_video`` writes).
    """
    for ds_index, r in enumerate(ds):
        # assumes the decoded video object exposes the path of its source file
        # as `.container.name` — TODO confirm against the `datasets` video feature
        path = r["video"].container.name
        file_type = os.path.splitext(path)[1].lstrip(".")
        with tempfile.TemporaryDirectory(dir='.') as temp_dir:
            segments = split_video(path, file_type, temp_dir)
            for segment_index, segment in enumerate(segments):
                with open(segment, "rb") as f:
                    video = f.read()
                # Yield only after the file is closed, so no handle stays open
                # while the generator is suspended by the consumer.
                yield {
                    "ds_index": ds_index,
                    "segment_index": segment_index,
                    "video": video,
                    "file_type": "mp4",
                }

# Layout of the dataset table: one row per video segment.
schema = yt.schema.TableSchema()
for column_name, column_type in (
    ("ds_index", type_info.Int64),
    ("segment_index", type_info.Int64),
    ("video", type_info.Tagged[type_info.String, "video"]),
    ("file_type", type_info.String),
):
    schema.add_column(column_name, column_type)

dataset_path = f"{workdir}/dataset"
print(f"Dataset path: {dataset_path}")

# (Re)create the table with the schema above, then stream the segments into it.
yt.create(
    "table",
    dataset_path,
    force=True,
    attributes={"schema": schema.to_yson_type()},
)
yt.write_table(
    dataset_path,
    process_dataset(),
    # rows carry whole video segments, so the writer gets an explicit row weight limit of 800 MiB
    table_writer={"max_row_weight": 800 * 1024 * 1024},
)
print(dataset_path)

Dataset path: //home/top_emerald_antlion/tmp/demo_workdir/0dbe6f35d7634d5c884fdccf17c605c7/dataset


ffmpeg version 5.1.6-0+deb12u1 Copyright (c) 2000-2024 the FFmpeg developers
  built with gcc 12 (Debian 12.2.0-14)
  configuration: --prefix=/usr --extra-version=0+deb12u1 --toolchain=hardened --libdir=/usr/lib/x86_64-linux-gnu --incdir=/usr/include/x86_64-linux-gnu --arch=amd64 --enable-gpl --disable-stripping --enable-gnutls --enable-ladspa --enable-libaom --enable-libass --enable-libbluray --enable-libbs2b --enable-libcaca --enable-libcdio --enable-libcodec2 --enable-libdav1d --enable-libflite --enable-libfontconfig --enable-libfreetype --enable-libfribidi --enable-libglslang --enable-libgme --enable-libgsm --enable-libjack --enable-libmp3lame --enable-libmysofa --enable-libopenjpeg --enable-libopenmpt --enable-libopus --enable-libpulse --enable-librabbitmq --enable-librist --enable-librubberband --enable-libshine --enable-libsnappy --enable-libsoxr --enable-libspeex --enable-libsrt --enable-libssh --enable-libsvtav1 --enable-libtheora --enable-libtwolame --enable-libvidstab --enab

[segment @ 0x564795d19040] Opening '/slot/sandbox/working_dir/tmp016u8hlk/segment_030.mp4' for writing
[segment @ 0x564795d19040] Opening '/slot/sandbox/working_dir/tmp016u8hlk/segment_031.mp4' for writing
[segment @ 0x564795d19040] Opening '/slot/sandbox/working_dir/tmp016u8hlk/segment_032.mp4' for writing
[segment @ 0x564795d19040] Opening '/slot/sandbox/working_dir/tmp016u8hlk/segment_033.mp4' for writing
[segment @ 0x564795d19040] Opening '/slot/sandbox/working_dir/tmp016u8hlk/segment_034.mp4' for writing
[segment @ 0x564795d19040] Opening '/slot/sandbox/working_dir/tmp016u8hlk/segment_035.mp4' for writing
[segment @ 0x564795d19040] Opening '/slot/sandbox/working_dir/tmp016u8hlk/segment_036.mp4' for writing
[segment @ 0x564795d19040] Opening '/slot/sandbox/working_dir/tmp016u8hlk/segment_037.mp4' for writing
[segment @ 0x564795d19040] Opening '/slot/sandbox/working_dir/tmp016u8hlk/segment_038.mp4' for writing
[segment @ 0x564795d19040] Opening '/slot/sandbox/working_dir/tmp016u8hlk

[segment @ 0x564795d19040] Opening '/slot/sandbox/working_dir/tmp016u8hlk/segment_069.mp4' for writing
[segment @ 0x564795d19040] Opening '/slot/sandbox/working_dir/tmp016u8hlk/segment_070.mp4' for writing
[segment @ 0x564795d19040] Opening '/slot/sandbox/working_dir/tmp016u8hlk/segment_071.mp4' for writing
[segment @ 0x564795d19040] Opening '/slot/sandbox/working_dir/tmp016u8hlk/segment_072.mp4' for writing
[segment @ 0x564795d19040] Opening '/slot/sandbox/working_dir/tmp016u8hlk/segment_073.mp4' for writing
[segment @ 0x564795d19040] Opening '/slot/sandbox/working_dir/tmp016u8hlk/segment_074.mp4' for writing
[segment @ 0x564795d19040] Opening '/slot/sandbox/working_dir/tmp016u8hlk/segment_075.mp4' for writing
[segment @ 0x564795d19040] Opening '/slot/sandbox/working_dir/tmp016u8hlk/segment_076.mp4' for writing
[segment @ 0x564795d19040] Opening '/slot/sandbox/working_dir/tmp016u8hlk/segment_077.mp4' for writing
[segment @ 0x564795d19040] Opening '/slot/sandbox/working_dir/tmp016u8hlk

[segment @ 0x564795d19040] Opening '/slot/sandbox/working_dir/tmp016u8hlk/segment_111.mp4' for writing
[segment @ 0x564795d19040] Opening '/slot/sandbox/working_dir/tmp016u8hlk/segment_112.mp4' for writing
[segment @ 0x564795d19040] Opening '/slot/sandbox/working_dir/tmp016u8hlk/segment_113.mp4' for writing
[segment @ 0x564795d19040] Opening '/slot/sandbox/working_dir/tmp016u8hlk/segment_114.mp4' for writing
[segment @ 0x564795d19040] Opening '/slot/sandbox/working_dir/tmp016u8hlk/segment_115.mp4' for writing
[segment @ 0x564795d19040] Opening '/slot/sandbox/working_dir/tmp016u8hlk/segment_116.mp4' for writing
[segment @ 0x564795d19040] Opening '/slot/sandbox/working_dir/tmp016u8hlk/segment_117.mp4' for writing
[segment @ 0x564795d19040] Opening '/slot/sandbox/working_dir/tmp016u8hlk/segment_118.mp4' for writing
[segment @ 0x564795d19040] Opening '/slot/sandbox/working_dir/tmp016u8hlk/segment_119.mp4' for writing
[segment @ 0x564795d19040] Opening '/slot/sandbox/working_dir/tmp016u8hlk

[segment @ 0x564795d19040] Opening '/slot/sandbox/working_dir/tmp016u8hlk/segment_151.mp4' for writing
[segment @ 0x564795d19040] Opening '/slot/sandbox/working_dir/tmp016u8hlk/segment_152.mp4' for writing
[segment @ 0x564795d19040] Opening '/slot/sandbox/working_dir/tmp016u8hlk/segment_153.mp4' for writing
[segment @ 0x564795d19040] Opening '/slot/sandbox/working_dir/tmp016u8hlk/segment_154.mp4' for writing
[segment @ 0x564795d19040] Opening '/slot/sandbox/working_dir/tmp016u8hlk/segment_155.mp4' for writing
[segment @ 0x564795d19040] Opening '/slot/sandbox/working_dir/tmp016u8hlk/segment_156.mp4' for writing
[segment @ 0x564795d19040] Opening '/slot/sandbox/working_dir/tmp016u8hlk/segment_157.mp4' for writing
[segment @ 0x564795d19040] Opening '/slot/sandbox/working_dir/tmp016u8hlk/segment_158.mp4' for writing
[segment @ 0x564795d19040] Opening '/slot/sandbox/working_dir/tmp016u8hlk/segment_159.mp4' for writing
[segment @ 0x564795d19040] Opening '/slot/sandbox/working_dir/tmp016u8hlk

[segment @ 0x564795d19040] Opening '/slot/sandbox/working_dir/tmp016u8hlk/segment_192.mp4' for writing
[segment @ 0x564795d19040] Opening '/slot/sandbox/working_dir/tmp016u8hlk/segment_193.mp4' for writing
[segment @ 0x564795d19040] Opening '/slot/sandbox/working_dir/tmp016u8hlk/segment_194.mp4' for writing
[segment @ 0x564795d19040] Opening '/slot/sandbox/working_dir/tmp016u8hlk/segment_195.mp4' for writing
[segment @ 0x564795d19040] Opening '/slot/sandbox/working_dir/tmp016u8hlk/segment_196.mp4' for writing
[segment @ 0x564795d19040] Opening '/slot/sandbox/working_dir/tmp016u8hlk/segment_197.mp4' for writing
[segment @ 0x564795d19040] Opening '/slot/sandbox/working_dir/tmp016u8hlk/segment_198.mp4' for writing
[segment @ 0x564795d19040] Opening '/slot/sandbox/working_dir/tmp016u8hlk/segment_199.mp4' for writing
frame=119605 fps=119601 q=-1.0 size=N/A time=00:33:13.36 bitrate=N/A speed=1.99e+03x    [segment @ 0x564795d19040] Opening '/slot/sandbox/working_dir/tmp016u8hlk/segment_200.m

[segment @ 0x564795d19040] Opening '/slot/sandbox/working_dir/tmp016u8hlk/segment_233.mp4' for writing
[segment @ 0x564795d19040] Opening '/slot/sandbox/working_dir/tmp016u8hlk/segment_234.mp4' for writing
[segment @ 0x564795d19040] Opening '/slot/sandbox/working_dir/tmp016u8hlk/segment_235.mp4' for writing
[segment @ 0x564795d19040] Opening '/slot/sandbox/working_dir/tmp016u8hlk/segment_236.mp4' for writing
[segment @ 0x564795d19040] Opening '/slot/sandbox/working_dir/tmp016u8hlk/segment_237.mp4' for writing
[segment @ 0x564795d19040] Opening '/slot/sandbox/working_dir/tmp016u8hlk/segment_238.mp4' for writing
[segment @ 0x564795d19040] Opening '/slot/sandbox/working_dir/tmp016u8hlk/segment_239.mp4' for writing
[segment @ 0x564795d19040] Opening '/slot/sandbox/working_dir/tmp016u8hlk/segment_240.mp4' for writing
[segment @ 0x564795d19040] Opening '/slot/sandbox/working_dir/tmp016u8hlk/segment_241.mp4' for writing
[segment @ 0x564795d19040] Opening '/slot/sandbox/working_dir/tmp016u8hlk

[segment @ 0x564795d19040] Opening '/slot/sandbox/working_dir/tmp016u8hlk/segment_275.mp4' for writing
[segment @ 0x564795d19040] Opening '/slot/sandbox/working_dir/tmp016u8hlk/segment_276.mp4' for writing
[segment @ 0x564795d19040] Opening '/slot/sandbox/working_dir/tmp016u8hlk/segment_277.mp4' for writing
[segment @ 0x564795d19040] Opening '/slot/sandbox/working_dir/tmp016u8hlk/segment_278.mp4' for writing
[segment @ 0x564795d19040] Opening '/slot/sandbox/working_dir/tmp016u8hlk/segment_279.mp4' for writing
[segment @ 0x564795d19040] Opening '/slot/sandbox/working_dir/tmp016u8hlk/segment_280.mp4' for writing
[segment @ 0x564795d19040] Opening '/slot/sandbox/working_dir/tmp016u8hlk/segment_281.mp4' for writing
[segment @ 0x564795d19040] Opening '/slot/sandbox/working_dir/tmp016u8hlk/segment_282.mp4' for writing
[segment @ 0x564795d19040] Opening '/slot/sandbox/working_dir/tmp016u8hlk/segment_283.mp4' for writing
[segment @ 0x564795d19040] Opening '/slot/sandbox/working_dir/tmp016u8hlk

[segment @ 0x564795d19040] Opening '/slot/sandbox/working_dir/tmp016u8hlk/segment_317.mp4' for writing
[segment @ 0x564795d19040] Opening '/slot/sandbox/working_dir/tmp016u8hlk/segment_318.mp4' for writing
[segment @ 0x564795d19040] Opening '/slot/sandbox/working_dir/tmp016u8hlk/segment_319.mp4' for writing
[segment @ 0x564795d19040] Opening '/slot/sandbox/working_dir/tmp016u8hlk/segment_320.mp4' for writing
[segment @ 0x564795d19040] Opening '/slot/sandbox/working_dir/tmp016u8hlk/segment_321.mp4' for writing
[segment @ 0x564795d19040] Opening '/slot/sandbox/working_dir/tmp016u8hlk/segment_322.mp4' for writing
[segment @ 0x564795d19040] Opening '/slot/sandbox/working_dir/tmp016u8hlk/segment_323.mp4' for writing
[segment @ 0x564795d19040] Opening '/slot/sandbox/working_dir/tmp016u8hlk/segment_324.mp4' for writing
[segment @ 0x564795d19040] Opening '/slot/sandbox/working_dir/tmp016u8hlk/segment_325.mp4' for writing
[segment @ 0x564795d19040] Opening '/slot/sandbox/working_dir/tmp016u8hlk

[segment @ 0x564795d19040] Opening '/slot/sandbox/working_dir/tmp016u8hlk/segment_359.mp4' for writing
[segment @ 0x564795d19040] Opening '/slot/sandbox/working_dir/tmp016u8hlk/segment_360.mp4' for writing
[segment @ 0x564795d19040] Opening '/slot/sandbox/working_dir/tmp016u8hlk/segment_361.mp4' for writing
frame=216714 fps=121690 q=-1.0 Lsize=N/A time=01:00:11.85 bitrate=N/A speed=2.03e+03x    
video:1105576kB audio:0kB subtitle:0kB other streams:0kB global headers:0kB muxing overhead: unknown
Segments ['/slot/sandbox/working_dir/tmp016u8hlk/segment_000.mp4', '/slot/sandbox/working_dir/tmp016u8hlk/segment_001.mp4', '/slot/sandbox/working_dir/tmp016u8hlk/segment_002.mp4', '/slot/sandbox/working_dir/tmp016u8hlk/segment_003.mp4', '/slot/sandbox/working_dir/tmp016u8hlk/segment_004.mp4', '/slot/sandbox/working_dir/tmp016u8hlk/segment_005.mp4', '/slot/sandbox/working_dir/tmp016u8hlk/segment_006.mp4', '/slot/sandbox/working_dir/tmp016u8hlk/segment_007.mp4', '/slot/sandbox/working_dir/tmp016u

//home/top_emerald_antlion/tmp/demo_workdir/0dbe6f35d7634d5c884fdccf17c605c7/dataset


In [5]:
# Model identifier; the mapper below passes it both to the vLLM engine and to AutoProcessor.
MODEL = "Qwen/Qwen2.5-VL-7B-Instruct"


# NOTE(review): switches off the YT client's "safe stream mode" for pickled job code —
# presumably needed so the job (vLLM and its worker processes) can use the standard
# streams directly; confirm against the YT Python client documentation.
yt.config["pickling"]["safe_stream_mode"] = False


@yt.aggregator
def mapper_make_description(records):
    """Generate a text description for every video segment in ``records``.

    The model and its processor are created once per call; the input rows are
    then consumed in batches of 10. For each batch the videos are written to a
    temporary directory, turned into vLLM inputs and sent to ``llm.generate``
    in a single request.

    Args:
        records: Iterable of input rows with the keys ``video``, ``ds_index``,
            ``segment_index`` and ``file_type``.

    Yields:
        One dict per input row: the four input columns plus ``description``,
        the text of the first completion generated for that row.
    """
    # Imported inside the function rather than at notebook level — presumably
    # because these packages are only needed where the job runs; confirm.
    from transformers import AutoProcessor
    from vllm import LLM, SamplingParams
    from qwen_vl_utils import process_vision_info
    import os
    import tempfile

    llm = LLM(model=MODEL, limit_mm_per_prompt={"video": 10})
    sampling_params = SamplingParams(
        temperature=0.1,
        top_p=0.001,
        repetition_penalty=1.05,
        max_tokens=256,
        stop_token_ids=[],
    )
    processor = AutoProcessor.from_pretrained(MODEL)

    def build_llm_input(video_file):
        # Chat-style request: a fixed instruction plus the video stored at `video_file`.
        messages = [
            {"role": "system", "content": "You are a helpful assistant."},
            {
                "role": "user",
                "content": [
                    {"type": "text", "text": "Here is a Call of Duty zombies gameplay video. Describe in detail what is happening in the video, what surrounds the main character, what they are doing, and what the other characters are doing."},
                    {
                        "type": "video",
                        "video": f"file://{os.path.abspath(video_file)}",
                        "total_pixels": 20480 * 28 * 28,
                        "min_pixels": 16 * 28 * 28
                    },
                ],
            },
        ]

        prompt = processor.apply_chat_template(
            messages,
            tokenize=False,
            add_generation_prompt=True,
        )
        image_inputs, video_inputs, video_kwargs = process_vision_info(messages, return_video_kwargs=True)

        # Only the modalities actually present are handed to the engine.
        mm_data = {
            modality: payload
            for modality, payload in (("image", image_inputs), ("video", video_inputs))
            if payload is not None
        }

        return {
            "prompt": prompt,
            "multi_modal_data": mm_data,
            "mm_processor_kwargs": video_kwargs,
        }

    def process_batch(batch_records):
        with tempfile.TemporaryDirectory() as temp_dir:
            batch_inputs = []

            for position, row in enumerate(batch_records):
                extension = row['file_type']
                video_file = os.path.join(temp_dir, f"video_{position}.{extension}")

                # The video bytes are written to disk so that they can be referenced by a file:// URL.
                with open(video_file, "wb") as sink:
                    sink.write(yt.yson.get_bytes(row["video"]))

                batch_inputs.append(build_llm_input(video_file))

            outputs = llm.generate(batch_inputs, sampling_params=sampling_params)

            # Results are paired with the input rows positionally.
            for row, output in zip(batch_records, outputs):
                yield {
                    "video": row["video"],
                    "ds_index": int(row["ds_index"]),
                    "segment_index": row["segment_index"],
                    "file_type": row["file_type"],
                    "description": output.outputs[0].text,
                }

    batch_size = 10
    pending = []

    for record in records:
        pending.append(record)
        if len(pending) < batch_size:
            continue
        yield from process_batch(pending)
        pending = []

    # Flush the last, possibly incomplete batch.
    if pending:
        yield from process_batch(pending)

# Result table: the dataset columns plus the generated description.
schema = yt.schema.TableSchema()
for column_name, column_type in (
    ("ds_index", type_info.Int64),
    ("segment_index", type_info.Int64),
    ("video", type_info.Tagged[type_info.String, "video"]),
    ("file_type", type_info.String),
    ("description", type_info.String),
):
    schema.add_column(column_name, column_type)

result_path = f"{workdir}/result"
yt.create(
    "table",
    result_path,
    force=True,
    attributes={"schema": schema.to_yson_type()},
)

# Resources of the single GPU job; both byte sizes are 60 GiB.
mapper_spec = {
    "gpu_limit": 1,
    "cpu_limit": 10,
    "memory_limit": 60 * 1024 ** 3,
    "tmpfs_path": ".",
    "tmpfs_size": 60 * 1024 ** 3,
    "environment": {
        # important for vllm v1
        "VLLM_WORKER_MULTIPROC_METHOD": "spawn",
    },
}

yt.run_map(
    mapper_make_description,
    dataset_path,
    result_path,
    # one job handles the whole table, so the mapper sets up the model once
    job_count=1,
    spec={
        "pool_trees": h100_pool_trees,
        "mapper": mapper_spec,
        # output rows carry the video bytes, hence the explicit 800 MiB row weight limit
        "job_io": {"table_writer": {"max_row_weight": 800 * 1024 * 1024}},
        "max_speculative_job_count_per_task": 0,
    },
)

print(result_path)





2025-07-04 22:06:26,936	INFO	Operation started: https://playground.tracto.ai/playground/operations/b5dd0d63-5e0da209-24dd03e8-2752e799/details


2025-07-04 22:06:27,000	INFO	( 0 min) operation b5dd0d63-5e0da209-24dd03e8-2752e799 starting


2025-07-04 22:06:27,563	INFO	( 0 min) operation b5dd0d63-5e0da209-24dd03e8-2752e799 initializing


2025-07-04 22:06:28,223	INFO	( 0 min) Unrecognized spec: {'enable_partitioned_data_balancing': false, 'mapper': {'title': 'mapper_make_description'}}


2025-07-04 22:06:28,284	INFO	( 0 min) operation b5dd0d63-5e0da209-24dd03e8-2752e799: running=0     completed=0     pending=1     failed=0     aborted=0     lost=0     total=1     blocked=0    


2025-07-04 22:06:38,751	INFO	( 0 min) operation b5dd0d63-5e0da209-24dd03e8-2752e799: running=1     completed=0     pending=0     failed=0     aborted=0     lost=0     total=1     blocked=0    


2025-07-04 22:27:06,700	INFO	(20 min) operation b5dd0d63-5e0da209-24dd03e8-2752e799 completed


2025-07-04 22:27:06,793	INFO	(20 min) Alerts: {'low_cpu_usage': {'code': 1, 'message': "Average CPU usage of some of your job types is significantly lower than requested 'cpu_limit'. Consider decreasing cpu_limit in spec of your operation", 'attributes': {'pid': 1, 'tid': 10133760987938265831, 'thread': 'Controller:14', 'fid': 18446252687753301616, 'host': 'ca-1.controller-agents.nebius-playground.svc.kyt.k8s.nebius.yt', 'datetime': '2025-07-04T22:27:05.251642Z', 'trace_id': '39b503fc-32fa7f97-8b39db17-415129b6', 'span_id': 7209057548914930213}, 'inner_errors': [{'code': 1, 'message': 'Jobs of task "map" use 29.25% of requested cpu limit', 'attributes': {'pid': 1, 'tid': 10133760987938265831, 'thread': 'Controller:14', 'fid': 18446252687753301616, 'host': 'ca-1.controller-agents.nebius-playground.svc.kyt.k8s.nebius.yt', 'datetime': '2025-07-04T22:27:05.251630Z', 'trace_id': '39b503fc-32fa7f97-8b39db17-415129b6', 'span_id': 7209057548914930213, 'cpu_time': 3597762, 'cpu_limit': 10.0, 'e

//home/top_emerald_antlion/tmp/demo_workdir/0dbe6f35d7634d5c884fdccf17c605c7/result
