The first video LLM capable of real-time commentary, trained with a novel video-ASR streaming method, achieving SOTA on both streaming and offline benchmarks.
Ensure you have Python version >= 3.11 installed.
pip install torch torchvision torchaudio
pip install "transformers<=4.51.3" accelerate deepspeed peft opencv-python decord datasets tensorboard gradio pillow-heif gpustat timm sentencepiece openai av==12.0.0 qwen_vl_utils liger_kernel numpy==1.24.4
pip install flash-attn --no-build-isolation
pip install livecc-utils==0.0.2
We verified everything with torch==2.6.0, transformers==4.50.0, and liger-kernel==0.5.5, but other versions should also work. Our full environment is listed in requirements.txt.
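To quickly confirm the core packages are in place, you can run a minimal sanity check (optional; it only covers packages installed above):

# Quick environment sanity check (optional)
import torch, transformers
print("torch:", torch.__version__)
print("transformers:", transformers.__version__)
print("CUDA available:", torch.cuda.is_available())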
If you want to delve into our data production pipeline:
pip install insightface onnxruntime-gpu python_speech_features wavfile
python demo/app.py --js_monitor
--js_monitor enables JavaScript-based video timestamp monitoring (recommended to disable in high-latency environments).

python demo/cli.py

Please refer to inference.md
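If you just want a quick offline test before diving into inference.md, here is a minimal sketch following the standard Qwen2-VL transformers API (the video path and generation settings are illustrative; the real-time streaming interface is documented in inference.md):

# Minimal offline inference sketch via the standard Qwen2-VL API (illustrative; see inference.md for streaming)
import torch
from transformers import Qwen2VLForConditionalGeneration, AutoProcessor
from qwen_vl_utils import process_vision_info

model_id = "chenjoya/LiveCC-7B-Instruct"
model = Qwen2VLForConditionalGeneration.from_pretrained(model_id, torch_dtype=torch.bfloat16, device_map="auto")
processor = AutoProcessor.from_pretrained(model_id)

messages = [{"role": "user", "content": [
    {"type": "video", "video": "video.mp4", "fps": 2.0},  # "video.mp4" is a placeholder; use your own clip
    {"type": "text", "text": "Please describe what is happening in the video."},
]}]
text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
_, video_inputs = process_vision_info(messages)
inputs = processor(text=[text], videos=video_inputs, return_tensors="pt").to(model.device)
output_ids = model.generate(**inputs, max_new_tokens=256)
print(processor.batch_decode(output_ids[:, inputs.input_ids.shape[1]:], skip_special_tokens=True)[0])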
The following scripts are for single-node training with a batch size of 512. If you have multiple nodes, please set the torchrun arguments and --gradient_accumulation_steps accordingly.
https://huggingface.co/datasets/chenjoya/Live-CC-5M
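If you want the annotations locally, here is a sketch using huggingface_hub (the target directory datasets/ matches --annotation_paths in the command below; the exact filenames inside the dataset repo may differ):

# Sketch: download the Live-CC-5M annotations (assumes huggingface_hub; local_dir is illustrative)
from huggingface_hub import snapshot_download
snapshot_download(repo_id="chenjoya/Live-CC-5M", repo_type="dataset", local_dir="datasets")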
An explanation of the training arguments:
export VIDEO_MIN_PIXELS=78400 # 100*28*28; at least 100 visual tokens per frame are sent to the LLM
export FPS_MAX_FRAMES=480 # maximum number of frames per video (480 frames / 2 fps = 4 min)
export VIDEO_MAX_PIXELS=19267584 # 24576*28*28; at most ~24k video tokens overall are sent to the LLM (leaving 8k for language)
learning_rate=2e-5 # pretraining uses 2e-5 lr
run_name="livecc_pretrain_24kx480x100_bs512lr$learning_rate"
WANDB_PROJECT='joya.chen' TOKENIZERS_PARALLELISM=false torchrun --standalone --nproc_per_node=8 train.py \
--deepspeed ./scripts/deepspeed_zero2.json \ # Use DeepSpeed ZeRO-2 config
--output_dir checkpoints/$run_name \ # Where to save model checkpoints
--overwrite_output_dir True \ # Set False to resume from existing checkpoint
--run_name $run_name \ # Unique identifier for the training run (used by WandB)
--save_on_each_node True \ # Set False if nodes share a filesystem
--do_train True \ # Enable training mode
--eval_strategy no \ # No evaluation between training steps
--per_device_train_batch_size 1 \ # Batch size per GPU
--gradient_accumulation_steps 64 \ # Effective batch size = 64 × num_gpus
--learning_rate $learning_rate \ # Learning rate to use
--warmup_ratio 0.03 \ # Warm-up proportion of training steps
--optim adamw_torch \ # Optimizer: AdamW (PyTorch implementation)
--lr_scheduler_type cosine \ # Cosine decay learning rate schedule
--num_train_epochs 1 \ # Number of training epochs
--logging_steps 10 \ # Log training metrics every 10 steps
--save_steps 1000 \ # Save checkpoint every 1000 steps
--bf16 True \ # Use BF16 mixed precision (if supported)
--tf32 True \ # Use TF32 precision on NVIDIA Ampere+ GPUs
--gradient_checkpointing True \ # Enable gradient checkpointing to save memory
--pretrained_model_name_or_path Qwen/Qwen2-VL-7B \ # Start from pretrained Qwen2-VL-7B model
--annotation_paths datasets/live_cc_5m_with_seeks.jsonl \ # Dataset used for training
--dataloader_num_workers 16 \ # Number of parallel workers for data loading
--freeze_modules visual \ # Freeze visual encoder parameters
--use_liger_kernel True \ # Use Liger fused kernels for faster training (must match at inference)
--report_to wandb # Enable logging to Weights & Biases
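As a quick check of the numbers above (a worked example, not part of the training script), the environment variables and torchrun arguments imply the following budgets:

# Worked example (not part of the training script): budgets implied by the settings above
visual_token_area = 28 * 28                         # one visual token corresponds to a 28x28 pixel area
min_tokens_per_frame = 78400 // visual_token_area   # VIDEO_MIN_PIXELS -> at least 100 tokens per frame
max_video_tokens = 19267584 // visual_token_area    # VIDEO_MAX_PIXELS -> at most 24576 (~24k) video tokens
max_video_minutes = 480 / 2 / 60                    # FPS_MAX_FRAMES at 2 fps -> 4 minutes of video

num_gpus = 8                                        # --nproc_per_node
effective_batch_size = 1 * num_gpus * 64            # per_device_train_batch_size x GPUs x gradient_accumulation_steps
print(min_tokens_per_frame, max_video_tokens, max_video_minutes, effective_batch_size)  # 100 24576 4.0 512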
https://huggingface.co/datasets/chenjoya/Live-WhisperX-526K
https://huggingface.co/datasets/lmms-lab/LLaVA-Video-178K
export VIDEO_MIN_PIXELS=78400 # 100*28*28; at least 100 visual tokens per frame are sent to the LLM
export FPS_MAX_FRAMES=480 # maximum number of frames per video (480 frames / 2 fps = 4 min)
export VIDEO_MAX_PIXELS=19267584 # 24576*28*28; at most ~24k video tokens overall are sent to the LLM (leaving 8k for language)
learning_rate=1e-5 # sft uses 1e-5 lr
run_name="livecc_sft_24k480x100_live526k+llava178k+hound+onevision_lr$learning_rate"
WANDB_PROJECT='joya.chen' TOKENIZERS_PARALLELISM=false torchrun --standalone --nproc_per_node=8 train.py \
--deepspeed ./scripts/deepspeed_zero2.json \ # Use DeepSpeed ZeRO-2 config
--output_dir checkpoints/$run_name \ # Output checkpoint directory
--overwrite_output_dir True \ # Set to False to resume training
--run_name $run_name \ # Wandb and checkpoint run name
--save_on_each_node True \ # Set False if using shared storage
--do_train True \ # Enable training mode
--eval_strategy no \ # No evaluation during training
--per_device_train_batch_size 1 \ # Batch size per GPU
--gradient_accumulation_steps 64 \ # Accumulate gradients for effective batch size = 64 × num_gpus
--learning_rate $learning_rate \ # Learning rate to use
--warmup_ratio 0.03 \ # Learning rate warm-up ratio
--optim adamw_torch \ # Optimizer type
--lr_scheduler_type cosine \ # Cosine learning rate scheduler
--num_train_epochs 1 \ # Total number of training epochs
--logging_steps 10 \ # Log every 10 steps
--save_steps 1000 \ # Save checkpoint every 1000 steps
--bf16 True \ # Use BF16 mixed precision
--tf32 True \ # Enable TF32 acceleration (NVIDIA Ampere+)
--gradient_checkpointing True \ # Enable gradient checkpointing for memory efficiency
--pretrained_model_name_or_path chenjoya/LiveCC-7B-Base \ # Initialization checkpoint
--annotation_paths \ # Training datasets:
datasets/live_whisperx_526k_with_seeks.jsonl \ # - LiveCC 526k
datasets/llava_ov_single_image_text_mix_with_seeks.jsonl \ # - OneVision (single image)
datasets/llava_ov_multi_image_with_seeks.jsonl \ # - OneVision (multi-image)
datasets/llava_hound_video_with_seeks.jsonl \ # - LLaVA-Hound video
datasets/llava_video_178k_with_seeks.jsonl \ # - LLaVA-Video 178k
--dataloader_num_workers 16 \ # Number of workers for data loading
--freeze_modules visual \ # Do not update visual encoder
--use_liger_kernel True \ # Use Liger fused kernels for efficient training (enable at inference too)
--report_to wandb # Report metrics to Weights & Biases
The following scripts will automatically download data from LiveSports3K.
# generate livecc
python evaluation/livesports3kcc/distributed_generate_livecc.py --model_name_or_path chenjoya/LiveCC-7B-Instruct --output_dir evaluation/livesports3kcc/livecc --num_workers 8 --repetition_penalty 1.15
# to evaluate the base model, add --not_instruct_model
python evaluation/livesports3kcc/distributed_generate_livecc.py --model_name_or_path chenjoya/LiveCC-7B-Base --output_dir evaluation/livesports3kcc/livecc --num_workers 8 --repetition_penalty 1.15 --not_instruct_model
# llm judge winning rate
AZURE_OPENAI_ENDPOINT=xxx AZURE_OPENAI_API_KEY=xxx python evaluation/livesports3kcc/llm_judge.py --model_id LiveCC-7B-Instruct --prediction_jsonl evaluation/livesports3kcc/livecc/LiveCC-7B-Instruct.jsonl --output_dir evaluation/livesports3kcc/judges --num_workers 16

(Slightly better than the results reported in our paper, since Azure GPT-4o outputs are not strictly deterministic, even with seed=42 and temperature=0.)
If you do not have GPT-4o quota, please submit your results to the CVPR'25 LoVE Workshop Track 2A. We cover the GPT-4o evaluation cost once per day for every participant.
python evaluation/livesports3kcc/distributed_generate_caption.py --model_name_or_path Qwen/Qwen2.5-VL-7B-Instruct --output_dir evaluation/livesports3kcc/captions --num_workers 8
Our fast distributed VideoMME evaluator requires videomme.jsonl, with each line formatted as:
{"video_id": "001", "duration": "short", "domain": "Knowledge", "sub_category": "Humanity & History", "url": "https://www.youtube.com/watch?v=fFjv93ACGo8", "videoID": "fFjv93ACGo8", "question_id": "001-1", "task_type": "Counting Problem", "question": "When demonstrating the Germany modern Christmas tree is initially decorated with apples, candles and berries, which kind of the decoration has the largest number?", "options": ["A. Apples.", "B. Candles.", "C. Berries.", "D. The three kinds are of the same number."], "answer": "C", "subtitles": "[Music] and new at 6:00 ..."}
After preparation, please run:
# without subtitles
torchrun --standalone --nproc_per_node=8 evaluation/videomme/distributed_evaluate_videomme.py --model_name_or_path chenjoya/LiveCC-7B-Instruct --benchmark_path videomme.jsonl
# with subtitles
torchrun --standalone --nproc_per_node=8 evaluation/videomme/distributed_evaluate_videomme.py --model_name_or_path chenjoya/LiveCC-7B-Instruct --benchmark_path videomme.jsonl --with_subtitles
Typically, the evaluation takes ~40 min (without subtitles) or ~50 min (with subtitles) on 8x80GB GPUs. The results will be written to evaluation/videomme/results. We also provide the evaluation results of LiveCC-7B-Instruct in evaluation/videomme/results.
Too busy recently; we will update the README as soon as possible.
@inproceedings{livecc,
author = {Joya Chen and Ziyun Zeng and Yiqi Lin and Wei Li and Zejun Ma and Mike Zheng Shou},
title = {LiveCC: Learning Video LLM with Streaming Speech Transcription at Scale},
booktitle = {CVPR},
year = {2025},
}