In [1]:
# Enable IPython's autoreload extension in mode 2, which re-imports all modules
# before each cell executes, so edits to local modules are picked up without a kernel restart.
%load_ext autoreload
%autoreload 2

In [2]:
%%html
<style>
/* Force a transparent background on the ipywidget output container. */
.cell-output-ipywidget-background {
    background-color: transparent !important;
}
/* Point the Jupyter widget colour and font-size variables at the VS Code editor's values
   (presumably so widgets such as progress bars match the editor theme - confirm). */
:root {
    --jp-widgets-color: var(--vscode-editor-foreground);
    --jp-widgets-font-size: var(--vscode-editor-font-size);
}  
</style>

In [None]:
import art
from dotenv import load_dotenv
from tau_bench.types import TauBenchPolicyConfig, TauBenchTrainingConfig
from run_rl import train
from run import RunConfig
import torch

load_dotenv()

MODEL_NAME = "002"
model = art.TrainableModel(
    name=MODEL_NAME,
    project="tau-bench",
    base_model="Qwen/Qwen2.5-14B-Instruct",
    config=TauBenchPolicyConfig(
        training_config=TauBenchTrainingConfig(
            trajectories_per_group=64,
            groups_per_step=4,
            learning_rate=1e-6,
            eval_steps=1000,
            val_set_size=85,
            training_dataset_size=4,
            num_epochs=1000,
            train_mode="sync_rl",
        ),
        run_config=RunConfig(
            model_provider="hosted_vllm",
            user_model_provider="openai",
            model=MODEL_NAME,
            user_model="gpt-4o",
            agent_strategy="tool-calling-rl",
            temperature=1.0,
            task_split="test",
            log_dir="rl_results",
            skip_eval=True,
        ),
    ),
    _internal_config=art.dev.InternalModelConfig(
        engine_args=art.dev.EngineArgs(
            tensor_parallel_size=torch.cuda.device_count(), gpu_memory_utilization=0.75
        ),
        torchtune_args=art.dev.TorchtuneArgs(
            model="qwen2_5_14b_instruct", model_type="QWEN2", async_weight_syncing=True
        ),
    ),
)
await train(model)

[34m[1mwandb[0m: Currently logged in as: [33mbradhilton[0m to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


INFO 07-03 17:07:19 [__init__.py:244] Automatically detected platform cuda.
INFO 07-03 17:07:26 [__init__.py:244] Automatically detected platform cuda.
/home/ubuntu/.cache/huggingface/hub/models--Qwen--Qwen2.5-14B-Instruct/snapshots/cf98f3b3bbb457ad9e2bb7baf9a0125b6b88caa8
INFO 07-03 17:07:37 [config.py:823] This model supports multiple tasks: {'embed', 'score', 'reward', 'classify', 'generate'}. Defaulting to 'generate'.
INFO 07-03 17:07:37 [config.py:1946] Defaulting to use mp for distributed inference
INFO 07-03 17:07:37 [config.py:2195] Chunked prefill is enabled with max_num_batched_tokens=2048.
INFO 07-03 17:07:40 [__init__.py:244] Automatically detected platform cuda.
INFO 07-03 17:07:43 [core.py:455] Waiting for init message from front-end.
INFO 07-03 17:07:43 [core.py:70] Initializing a V1 LLM engine (v0.9.1) with config: model='Qwen/Qwen2.5-14B-Instruct', speculative_config=None, tokenizer='Qwen/Qwen2.5-14B-Instruct', skip_tokenizer_init=False, tokenizer_mode=auto, revision=N

Loading safetensors checkpoint shards:   0% Completed | 0/8 [00:00<?, ?it/s]
Loading safetensors checkpoint shards:  25% Completed | 2/8 [00:00<00:01,  4.75it/s]
Loading safetensors checkpoint shards:  38% Completed | 3/8 [00:00<00:01,  3.82it/s]
Loading safetensors checkpoint shards:  50% Completed | 4/8 [00:01<00:01,  3.82it/s]
Loading safetensors checkpoint shards:  62% Completed | 5/8 [00:01<00:00,  3.51it/s]
Loading safetensors checkpoint shards:  75% Completed | 6/8 [00:01<00:00,  3.34it/s]
Loading safetensors checkpoint shards:  88% Completed | 7/8 [00:02<00:00,  3.21it/s]


[1;36m(VllmWorker rank=3 pid=50024)[0;0m INFO 07-03 17:07:55 [default_loader.py:272] Loading weights took 2.42 seconds
[1;36m(VllmWorker rank=1 pid=50020)[0;0m INFO 07-03 17:07:55 [default_loader.py:272] Loading weights took 2.34 seconds
[1;36m(VllmWorker rank=2 pid=50021)[0;0m INFO 07-03 17:07:55 [default_loader.py:272] Loading weights took 2.38 seconds
[1;36m(VllmWorker rank=0 pid=50019)[0;0m INFO 07-03 17:07:55 [default_loader.py:272] Loading weights took 2.37 seconds


Loading safetensors checkpoint shards: 100% Completed | 8/8 [00:02<00:00,  3.16it/s]
Loading safetensors checkpoint shards: 100% Completed | 8/8 [00:02<00:00,  3.42it/s]
[1;36m(VllmWorker rank=0 pid=50019)[0;0m 


[1;36m(VllmWorker rank=3 pid=50024)[0;0m INFO 07-03 17:07:55 [gpu_model_runner.py:1624] Model loading took 6.9461 GiB and 2.819862 seconds
[1;36m(VllmWorker rank=1 pid=50020)[0;0m INFO 07-03 17:07:56 [gpu_model_runner.py:1624] Model loading took 6.9461 GiB and 2.894747 seconds
[1;36m(VllmWorker rank=2 pid=50021)[0;0m INFO 07-03 17:07:56 [gpu_model_runner.py:1624] Model loading took 6.9461 GiB and 3.029783 seconds
[1;36m(VllmWorker rank=0 pid=50019)[0;0m INFO 07-03 17:07:56 [gpu_model_runner.py:1624] Model loading took 6.9461 GiB and 3.120032 seconds
[1;36m(VllmWorker rank=3 pid=50024)[0;0m INFO 07-03 17:08:05 [backends.py:462] Using cache directory: /home/ubuntu/.cache/vllm/torch_compile_cache/a3b60490c8/rank_3_0 for vLLM's torch.compile
[1;36m(VllmWorker rank=3 pid=50024)[0;0m INFO 07-03 17:08:05 [backends.py:472] Dynamo bytecode transform time: 9.09 s
[1;36m(VllmWorker rank=1 pid=50020)[0;0m INFO 07-03 17:08:05 [backends.py:462] Using cache directory: /home/ubuntu/.cach

Iterating dataset:   0%|          | 0/1000 [00:00<?, ?batch/s]


--- Training Step 0 (Epoch 0, Step 0) ---
Generating trajectories for 4 tasks...


gather:   0%|          | 0/256 [00:00<?, ?it/s]


[1;31mGive Feedback / Get Help: https://github.com/BerriAI/litellm/issues/new[0m
LiteLLM.Info: If you need to debug this error, use `litellm._turn_on_debug()'.

Error in rollout for task 2: litellm.InternalServerError: InternalServerError: Hosted_vllmException - Connection error.

[1;31mGive Feedback / Get Help: https://github.com/BerriAI/litellm/issues/new[0m
LiteLLM.Info: If you need to debug this error, use `litellm._turn_on_debug()'.

Error in rollout for task 1: litellm.InternalServerError: InternalServerError: Hosted_vllmException - Connection error.
Training on 4 trajectory groups...
Packed 254 trajectories into 164 sequences of length 16384


train:   0%|          | 0/41 [00:00<?, ?it/s]

Step 0: Average training reward = 0.421875

--- Training Step 1 (Epoch 1, Step 0) ---
Generating trajectories for 4 tasks...


gather:   0%|          | 0/256 [00:00<?, ?it/s]

Training on 4 trajectory groups...
Packed 256 trajectories into 123 sequences of length 18432


train:   0%|          | 0/31 [00:00<?, ?it/s]

Step 1: Average training reward = 0.38671875

--- Training Step 2 (Epoch 2, Step 0) ---
Generating trajectories for 4 tasks...


gather:   0%|          | 0/256 [00:00<?, ?it/s]

Training on 4 trajectory groups...
Packed 256 trajectories into 161 sequences of length 16384


train:   0%|          | 0/41 [00:00<?, ?it/s]

Step 2: Average training reward = 0.44921875

--- Training Step 3 (Epoch 3, Step 0) ---
Generating trajectories for 4 tasks...


gather:   0%|          | 0/256 [00:00<?, ?it/s]

Training on 4 trajectory groups...
Packed 256 trajectories into 162 sequences of length 16384


train:   0%|          | 0/41 [00:00<?, ?it/s]

Step 3: Average training reward = 0.3984375

--- Training Step 4 (Epoch 4, Step 0) ---
Generating trajectories for 4 tasks...


gather:   0%|          | 0/256 [00:00<?, ?it/s]

Training on 4 trajectory groups...
Packed 256 trajectories into 164 sequences of length 16384


train:   0%|          | 0/41 [00:00<?, ?it/s]

Step 4: Average training reward = 0.4453125

--- Training Step 5 (Epoch 5, Step 0) ---
Generating trajectories for 4 tasks...


gather:   0%|          | 0/256 [00:00<?, ?it/s]

Training on 4 trajectory groups...
Packed 256 trajectories into 156 sequences of length 16384


train:   0%|          | 0/39 [00:00<?, ?it/s]

Step 5: Average training reward = 0.4609375

--- Training Step 6 (Epoch 6, Step 0) ---
Generating trajectories for 4 tasks...


gather:   0%|          | 0/256 [00:00<?, ?it/s]

ERROR:asyncio:Unclosed client session
client_session: <aiohttp.client.ClientSession object at 0x7f7f66646690>
ERROR:asyncio:Unclosed client session
client_session: <aiohttp.client.ClientSession object at 0x7f7fa4bc5410>


Training on 4 trajectory groups...
Packed 256 trajectories into 218 sequences of length 14336


train:   0%|          | 0/55 [00:00<?, ?it/s]

Step 6: Average training reward = 0.5

--- Training Step 7 (Epoch 7, Step 0) ---
Generating trajectories for 4 tasks...


gather:   0%|          | 0/256 [00:00<?, ?it/s]

Training on 4 trajectory groups...
Packed 256 trajectories into 125 sequences of length 18432


train:   0%|          | 0/32 [00:00<?, ?it/s]

Step 7: Average training reward = 0.46875

--- Training Step 8 (Epoch 8, Step 0) ---
Generating trajectories for 4 tasks...


gather:   0%|          | 0/256 [00:00<?, ?it/s]

Error logging trajectory to openpipe: Server disconnected without sending a response.
Training on 4 trajectory groups...
Packed 256 trajectories into 227 sequences of length 14336


train:   0%|          | 0/57 [00:00<?, ?it/s]