# verl Demo

In [1]:
import os

In [2]:
# Work from the verl checkout so that the relative editable install (`-e "."`) and
# `python3 -m verl...` resolve against it. The location defaults to /root/verl and
# can be overridden by setting VERL_HOME before starting the kernel.
os.chdir(os.environ.get("VERL_HOME", "/root/verl"))

## Install `verl`

In [3]:
! pip install -e ".[vllm]"

Looking in indexes: https://pypi.org/simple, https://pypi.ngc.nvidia.com
Obtaining file:///root/verl
  Installing build dependencies ... done
  Checking if build backend supports build_editable ... done
  Getting requirements to build editable ... done
  Preparing editable metadata (pyproject.toml) ... done
Building wheels for collected packages: verl
  Building editable for verl (pyproject.toml) ... done
  Created wheel for verl: filename=verl-0.2.0.dev0-0.editable-py3-none-any.whl size=16801 sha256=550efbf0decb154928dba09c8fee7557e87dff6516a85dfa209d6c4f484fb2a4
  Stored in directory: /tmp/pip-ephem-wheel-cache-1rhvmztn/wheels/d4/f5/29/7c5bb62e9344bc78534719365f2fb772bb330dbd23de4b25d2
Successfully built verl
Installing collected packages: verl
  Attempting uninstall: verl
    Found existing installation: verl 0.2.0.dev0
    Uninstalling verl-0.2.0.dev0:
      Successfully uninstalled verl-0.2.0.dev0
Successfully installed verl-0.

## GRPO

We adapt the setup from [SimpleRL Zoo](https://github.com/hkust-nlp/simpleRL-reason?tab=readme-ov-file#training). Kudos to the authors for their awesome work on verifying RL with LLMs of various scales!

### Prepare the Data

In [4]:
os.environ.update({
    "TRAIN_FILE": "/root/data/simplelr_qwen_level3to5/train.parquet",
    "TEST_FILE": "/root/data/simplelr_qwen_level3to5/test.parquet",
})

! mkdir -p $(dirname "${TRAIN_FILE}") && \
    wget -O "${TRAIN_FILE}" https://huggingface.co/datasets/hkust-nlp/SimpleRL-Zoo-Data/resolve/main/simplelr_qwen_level3to5/train.parquet
! mkdir -p $(dirname "${TEST_FILE}") && \
    wget -O "${TEST_FILE}" https://huggingface.co/datasets/hkust-nlp/SimpleRL-Zoo-Data/resolve/main/simplelr_qwen_level3to5/test.parquet

--2025-04-15 17:36:49--  https://huggingface.co/datasets/hkust-nlp/SimpleRL-Zoo-Data/resolve/main/simplelr_qwen_level3to5/train.parquet
Resolving huggingface.co (huggingface.co)... 13.35.202.121, 13.35.202.97, 13.35.202.40, ...
Connecting to huggingface.co (huggingface.co)|13.35.202.121|:443... connected.
HTTP request sent, awaiting response... 

302 Found
Location: https://cdn-lfs-us-1.hf.co/repos/87/ea/87eaecf320107f69e4069b63f5ad6c44e451e2fb2519b068d00196efed35d66b/a6ce0b30da9d3be80b0bd8b2925c8707aec6f627af3628c5439241a3b22e7c13?response-content-disposition=inline%3B+filename*%3DUTF-8%27%27train.parquet%3B+filename%3D%22train.parquet%22%3B&Expires=1744742209&Policy=eyJTdGF0ZW1lbnQiOlt7IkNvbmRpdGlvbiI6eyJEYXRlTGVzc1RoYW4iOnsiQVdTOkVwb2NoVGltZSI6MTc0NDc0MjIwOX19LCJSZXNvdXJjZSI6Imh0dHBzOi8vY2RuLWxmcy11cy0xLmhmLmNvL3JlcG9zLzg3L2VhLzg3ZWFlY2YzMjAxMDdmNjllNDA2OWI2M2Y1YWQ2YzQ0ZTQ1MWUyZmIyNTE5YjA2OGQwMDE5NmVmZWQzNWQ2NmIvYTZjZTBiMzBkYTlkM2JlODBiMGJkOGIyOTI1Yzg3MDdhZWM2ZjYyN2FmMzYyOGM1NDM5MjQxYTNiMjJlN2MxMz9yZXNwb25zZS1jb250ZW50LWRpc3Bvc2l0aW9uPSoifV19&Signature=SSiPeQaZQClmrhsNdwaQ0l4biA3v7TG8KPMw2ojgfrXEnTY6XZcXVTvF0eUmYkghOJk9f47H6ZOgtiM8JOF8dmd0UjFrrEQTFa4XVUfS8ZqR8lqFPvvRhW-AHdTK41Z4-6pRznO9GnxNXpZ3eYaJyPvi2I8t3I-GZhDQ27Ei8OWUVMcXfKgFrmIRAaOaG1Lb9-25VltFl4dfGERG9P1u26AkjsagKHimQkg6H1O4TmzVAacNKPBxeXK7GQVONKG7woyXA0urH0huX6OyTRRCT

### Download the Base Model

In [5]:
os.environ.update({
    "MODEL_ID": "Qwen/Qwen2.5-0.5B",
})

! huggingface-cli download "${MODEL_ID}"

/root/.cache/huggingface/hub/models--Qwen--Qwen2.5-0.5B/snapshots/060db6499f32faf8b98477b0a26969ef7d8b9987


### Train!

In [6]:
# Export VLLM_USE_V1=1 so the training process launched below inherits it
# (presumably this selects vLLM's V1 engine; confirm against the vLLM docs).
os.environ["VLLM_USE_V1"] = "1"


In [8]:
os.environ.update({
    k: str(v) for k, v in {
        "train_max_token_num_per_gpu": 16384,
        "infer_max_token_num_per_gpu": 32768,
    }.items()
})

! python3 -m verl.trainer.main_ppo \
    algorithm.adv_estimator=grpo \
    data.train_files=[${TRAIN_FILE}] \
    data.val_files=[${TEST_FILE}] \
    data.max_prompt_length=1024 \
    data.max_response_length=8192 \
    data.train_batch_size=1024 \
    algorithm.use_kl_in_reward=True \
    algorithm.kl_ctrl.kl_coef=0.0001 \
    actor_rollout_ref.model.path=${MODEL_ID} \
    actor_rollout_ref.model.use_remove_padding=True \
    actor_rollout_ref.model.enable_gradient_checkpointing=True \
    actor_rollout_ref.actor.entropy_coeff=0.001 \
    actor_rollout_ref.actor.optim.lr=5e-7 \
    actor_rollout_ref.actor.clip_ratio_low=0.2 \
    actor_rollout_ref.actor.clip_ratio_high=0.2 \
    actor_rollout_ref.actor.clip_ratio_c=10.0 \
    actor_rollout_ref.actor.ppo_mini_batch_size=256 \
    actor_rollout_ref.actor.use_dynamic_bsz=True \
    actor_rollout_ref.actor.ppo_max_token_len_per_gpu="${train_max_token_num_per_gpu}" \
    actor_rollout_ref.rollout.n=8 \
    actor_rollout_ref.rollout.temperature=1.0 \
    actor_rollout_ref.rollout.top_p=1.0 \
    actor_rollout_ref.rollout.val_kwargs.temperature=1.0 \
    actor_rollout_ref.rollout.val_kwargs.top_p=0.95 \
    actor_rollout_ref.rollout.val_kwargs.do_sample=True \
    actor_rollout_ref.rollout.val_kwargs.n=1 \
    actor_rollout_ref.rollout.gpu_memory_utilization=0.9 \
    actor_rollout_ref.rollout.tensor_model_parallel_size=1 \
    actor_rollout_ref.rollout.enable_chunked_prefill=True \
    actor_rollout_ref.rollout.max_num_batched_tokens=10240 \
    actor_rollout_ref.rollout.log_prob_use_dynamic_bsz=True \
    actor_rollout_ref.rollout.log_prob_max_token_len_per_gpu=${infer_max_token_num_per_gpu} \
    actor_rollout_ref.ref.log_prob_use_dynamic_bsz=True  \
    actor_rollout_ref.ref.log_prob_max_token_len_per_gpu=${infer_max_token_num_per_gpu} \
    trainer.total_epochs=20 \
    trainer.val_before_train=True \
    trainer.test_freq=5 \
    trainer.save_freq=20 \
    trainer.resume_mode=auto \
    trainer.nnodes=1 \
    trainer.n_gpus_per_node=1 \
    trainer.logger=["console"] \
    trainer.project_name="verl-demo" \
    trainer.experiment_name="grpo-qwen2p5-0p5b"

2025-04-15 17:37:08,581	INFO worker.py:1843 -- Started a local Ray instance. View the dashboard at [1m[32m127.0.0.1:8265 [39m[22m
[36m(TaskRunner pid=20084)[0m {'actor_rollout_ref': {'actor': {'checkpoint': {'contents': ['model',
[36m(TaskRunner pid=20084)[0m                                                              'optimizer',
[36m(TaskRunner pid=20084)[0m                                                              'extra']},
[36m(TaskRunner pid=20084)[0m                                  'clip_ratio': 0.2,
[36m(TaskRunner pid=20084)[0m                                  'clip_ratio_c': 10.0,
[36m(TaskRunner pid=20084)[0m                                  'clip_ratio_high': 0.2,
[36m(TaskRunner pid=20084)[0m                                  'clip_ratio_low': 0.2,
[36m(TaskRunner pid=20084)[0m                                  'entropy_coeff': 0.001,
[36m(TaskRunner pid=20084)[0m                                  'fsdp_config': {'fsdp_size': -1,
[36m(TaskRunner pi