# Run Verl with tractorun

In this notebook we run [Verl](https://github.com/volcengine/verl/) on the [Tracto.ai](https://tracto.ai) platform. As an example, we will use [this notebook](https://github.com/volcengine/verl/blob/main/examples/ppo_trainer/verl_getting_started.ipynb), but instead of using 1 GPU, we will utilize all 8.

To run this notebook, build a Docker image from the following Dockerfile and pass it as `docker_image` in the `run` call below.

```docker
FROM ubuntu:22.04

USER root

RUN apt-get update && apt-get install -y \
    software-properties-common \
    && add-apt-repository ppa:deadsnakes/ppa \
    && apt-get update && apt-get install -y \
    python3.10 \
    python3.10-venv \
    python3.10-dev

RUN apt-get install wget --yes
RUN wget https://bootstrap.pypa.io/get-pip.py && python3.10 get-pip.py

RUN update-alternatives --install /usr/bin/python3 python3 /usr/bin/python3.10 1

RUN wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64/cuda-keyring_1.1-1_all.deb
RUN dpkg -i cuda-keyring_1.1-1_all.deb
RUN apt-get update
RUN apt-get -y install cuda-toolkit-12-4

ENV PATH /usr/local/cuda-12.4/bin:$PATH
ENV CUDA_HOME /usr/local/cuda-12.4

RUN pip3 install --upgrade pip setuptools wheel
RUN pip3 install torchvision==0.19.0
RUN pip install --pre torch==2.4.0 --index-url https://download.pytorch.org/whl/nightly/cu121
RUN pip install vllm==0.6.3

RUN pip install -U "huggingface_hub[cli]"

RUN pip install tractorun

RUN apt-get install git --yes
RUN mkdir /verl_repo && git clone https://github.com/volcengine/verl /verl_repo && cd /verl_repo && pip3 install -e . -U

RUN pip3 install flash-attn --no-build-isolation


RUN mkdir /models && huggingface-cli download Qwen/Qwen2.5-0.5B-Instruct --local-dir /models/Qwen2.5-0.5B-Instruct
RUN mkdir /data && mkdir /gsm8k && python3 /verl_repo/examples/data_preprocess/gsm8k.py --local_dir /data/gsm8k
```

In [1]:
from tractorun.run import prepare_and_get_toolbox
from tractorun.backend.generic import GenericBackend
from tractorun.run import run
from tractorun.resources import Resources
from tractorun.mesh import Mesh
from tractorun.stderr_reader import StderrMode

import subprocess
import sys
import os

def controller(toolbox):
    """Run Verl PPO training on GSM8K as a child process and wait for it.

    The trainer is started as ``python3 -m verl.trainer.main_ppo`` with a
    fixed set of configuration overrides (Qwen2.5-0.5B-Instruct as both actor
    and critic, 8 GPUs on a single node, one epoch, console logging). The
    child's stdout/stderr are forwarded to this process's streams.

    Args:
        toolbox: Object supplied by the tractorun runner; not used here.

    Raises:
        subprocess.CalledProcessError: If the trainer exits with a non-zero
            status.
    """
    # One override per argv entry. No shell is involved, so values such as
    # "[console]" reach the trainer verbatim instead of being subject to
    # shell globbing or word splitting.
    command = [
        "python3",
        "-m",
        "verl.trainer.main_ppo",
        "data.train_files=/data/gsm8k/train.parquet",
        "data.val_files=/data/gsm8k/test.parquet",
        "data.train_batch_size=128",
        "data.val_batch_size=656",
        "data.max_prompt_length=512",
        "data.max_response_length=256",
        "actor_rollout_ref.model.path=/models/Qwen2.5-0.5B-Instruct",
        "actor_rollout_ref.actor.optim.lr=1e-6",
        "actor_rollout_ref.actor.ppo_mini_batch_size=64",
        "actor_rollout_ref.actor.ppo_micro_batch_size=8",
        "actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=1",
        "actor_rollout_ref.rollout.tensor_model_parallel_size=1",
        "actor_rollout_ref.rollout.gpu_memory_utilization=0.4",
        "actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=4",
        "critic.optim.lr=1e-5",
        "critic.model.path=/models/Qwen2.5-0.5B-Instruct",
        "critic.ppo_micro_batch_size=8",
        "algorithm.kl_ctrl.kl_coef=0.001",
        "+trainer.val_before_train=False",
        "trainer.default_hdfs_dir=null",
        "trainer.n_gpus_per_node=8",
        "trainer.nnodes=1",
        "trainer.save_freq=10",
        "trainer.test_freq=10",
        "trainer.total_epochs=1",
        "trainer.logger=[console]",
    ]

    # Pass the extra variables through an explicit copy of the environment
    # rather than a shell "VAR=value" prefix or mutating os.environ.
    child_env = {
        **os.environ,
        "PYTHONUNBUFFERED": "1",  # keep the trainer's output unbuffered
        "VLLM_ATTENTION_BACKEND": "XFORMERS",
    }

    # check=True raises on a non-zero exit status; unlike an `assert`, this
    # is not stripped when Python runs with -O.
    subprocess.run(
        command,
        stdout=sys.stdout,
        stderr=sys.stderr,
        env=child_env,
        check=True,
    )

# Submit `controller` through tractorun: a single node running one process
# that is given all 8 GPUs, using the Verl Docker image.
run(
    controller,
    backend=GenericBackend(),
    docker_image="cr.eu-north1.nebius.cloud/e00faee7vas5hpsh3s/chiffa/verl:v6",
    yt_path="//tmp/verl",
    mesh=Mesh(
        pool_trees=["gpu_h200"],
        node_count=1,
        process_per_node=1,
        gpu_per_process=8,
    ),
    resources=Resources(
        cpu_limit=120,
        memory_limit=600 * 1024**3,  # 600 GiB = 644245094400 bytes
    ),
    proxy_stderr_mode=StderrMode.primary,
)

2025-02-14 21:33:42,346	INFO	Operation started: https://playground.yt.nebius.yt/playground/operations/520620f6-3ce90f29-270703e8-f36e1657/details


2025-02-14 21:33:42,380	INFO	( 0 min) operation 520620f6-3ce90f29-270703e8-f36e1657 initializing


2025-02-14 21:33:42,932	INFO	( 0 min) Unrecognized spec: {'enable_partitioned_data_balancing': false}


2025-02-14 21:33:44,616	INFO	( 0 min) operation 520620f6-3ce90f29-270703e8-f36e1657: running=0     completed=0     pending=1     failed=0     aborted=0     lost=0     total=1     blocked=0    


2025-02-14 21:33:47,709	INFO	( 0 min) operation 520620f6-3ce90f29-270703e8-f36e1657: running=1     completed=0     pending=0     failed=0     aborted=0     lost=0     total=1     blocked=0    


2025-02-14 21:50:22,575	INFO	(16 min) operation 520620f6-3ce90f29-270703e8-f36e1657 completed


2025-02-14 21:50:22,604	INFO	(16 min) Alerts: {'low_cpu_usage': {'code': 1, 'message': "Average CPU usage of some of your job types is significantly lower than requested 'cpu_limit'. Consider decreasing cpu_limit in spec of your operation", 'attributes': {'pid': 1, 'tid': 4583944030924008135, 'thread': 'Controller:13', 'fid': 18446123350071491836, 'host': 'man0-0460.hw.nebius.yt', 'datetime': '2025-02-14T21:50:20.670172Z', 'trace_id': 'ad092a59-a0dcb7aa-4a232913-32edd2b8', 'span_id': 5365977716875535721}, 'inner_errors': [{'code': 1, 'message': 'Jobs of task "task" use 6.80% of requested cpu limit', 'attributes': {'pid': 1, 'tid': 4583944030924008135, 'thread': 'Controller:13', 'fid': 18446123350071491836, 'host': 'man0-0460.hw.nebius.yt', 'datetime': '2025-02-14T21:50:20.670153Z', 'trace_id': 'ad092a59-a0dcb7aa-4a232913-32edd2b8', 'span_id': 5365977716875535721, 'cpu_time': 8102394, 'cpu_limit': 120.0, 'exec_time': 992364}}]}}


RunInfo(operation_spec={'description': {'notebook_path': '//home/chiffa/verl/notebook_multiple_gpu.ipynb'}, 'started_by': {'hostname': 'end-h100-1.exec-nodes-h100.tundra.svc.testy.k8s.nebius.yt', 'pid': 249, 'wrapper_version': '0.13.23', 'python_version': '3.10.12', 'binary_name': 'ipykernel_launcher.py', 'command': ['/slot/sandbox/jlab/site-packages/ipykernel_launcher.py', '-f', '/slot/sandbox/.local/share/jupyter/runtime/kernel-8ed19f55-5722-4b41-95d6-f516c69e13c6.json'], 'user': 'root', 'platform': 'Ubuntu 22.04 (jammy)'}, 'fail_on_job_restart': True, 'is_gang': True, 'annotations': {'is_tractorun': True}, 'tasks': {'task': {'command': 'python3 _py_runner.py wrapped.pickle config_dump _modules_info modules/_main_module.py _main_module PY_SOURCE', 'job_count': 1, 'gpu_limit': 8, 'port_count': 1, 'cpu_limit': 120, 'memory_limit': 644253684654, 'docker_image': 'cr.eu-north1.nebius.cloud/e00faee7vas5hpsh3s/chiffa/verl:v6', 'file_paths': [{'value': '//tmp/yt_wrapper/file_storage/new_cach