In [1]:
%load_ext autoreload
%autoreload 2

In [None]:
%%html
<style>
.cell-output-ipywidget-background {
    background-color: transparent !important;
}
:root {
    --jp-widgets-color: var(--vscode-editor-foreground);
    --jp-widgets-font-size: var(--vscode-editor-font-size);
}  
</style>

In [3]:
import art
import asyncio
from dotenv import load_dotenv
import json
import openai
import random
import re
from typing import TypedDict

load_dotenv()


class TemporalCluePuzzle(TypedDict):
    num_clues: int
    prompt: str
    solution: dict[str, str]


puzzles: list[TemporalCluePuzzle] = json.load(open("./data/temporal-clue/puzzles.json"))
val_puzzles = puzzles[:64]
test_puzzles = puzzles[64:128]
train_puzzles = puzzles[128:]
random.seed(42)
random.shuffle(train_puzzles)


api = art.LocalAPI(wandb_project="agent-reinforcement-training")
model = await api.get_or_create_model(
    name="temporal-clue-tool-use-001",
    base_model="NousResearch/Hermes-3-Llama-3.1-8B",
)


async def rollout(
    client: openai.AsyncOpenAI, puzzle: TemporalCluePuzzle
) -> art.Trajectory:
    messages: art.Messages = [{"role": "user", "content": puzzle["prompt"]}]
    tools: art.Tools = [
        {
            "type": "function",
            "function": {
                "name": "get_hints",
                "description": "A function to retrieve one or two hints. No more than 3 hints may be retrieved total. "
                "Each retrieved hint decreases your final accuracy score by 10%.",
                "parameters": {
                    "type": "object",
                    "properties": {
                        "num_hints": {
                            "type": "integer",
                            "description": "Number of hints to retrieve (1 or 2)",
                            "enum": [1, 2],
                        }
                    },
                },
            },
        }
    ]
    chat_completion = await client.chat.completions.create(
        messages=messages,
        model=model.name,
        tools=tools,
    )
    choice = chat_completion.choices[0]
    messages_and_choices = [*messages, choice]
    hints = [
        f"The answer for {key} is {value}" for key, value in puzzle["solution"].items()
    ]
    random.shuffle(hints)
    hints_shared = 0

    def get_hints(function_name: str, function_arguments: str) -> str:
        nonlocal hints_shared
        if function_name != "get_hints":
            return f"Error: unexpected function name {function_name}"
        try:
            num_hints = json.loads(function_arguments or "{}").get("num_hints", 1)
        except Exception:
            return f"Error: invalid JSON {function_arguments}"
        if num_hints not in {1, 2}:
            return f"Error: invalid number of hints {num_hints}"
        if num_hints + hints_shared > 3:
            return f"Error: cannot retrieve {num_hints} hints, already retrieved {hints_shared} hints"
        hints_shared += num_hints
        content = "Hints:"
        for _ in range(num_hints):
            content += f"\n{hints.pop()}"
        return content

    while tool_calls := choice.message.tool_calls:
        messages.append(
            {
                "role": "assistant",
                "content": choice.message.content,
                "tool_calls": [
                    {
                        "id": tool_call.id,
                        "type": "function",
                        "function": {
                            "name": tool_call.function.name,
                            "arguments": tool_call.function.arguments or "{}",
                        },
                    }
                    for tool_call in tool_calls
                ],
            }
        )
        for tool_call in tool_calls:
            if tool_call.function.arguments:
                print(tool_call.function.arguments)
            messages.append(
                {
                    "role": "tool",
                    "tool_call_id": tool_call.id,
                    "content": get_hints(
                        tool_call.function.name, tool_call.function.arguments
                    ),
                }
            )
            messages_and_choices.append(messages[-1])
        try:
            chat_completion = await client.chat.completions.create(
                messages=messages,
                model=model.name,
                tools=tools,
            )
        except openai.BadRequestError:
            # Likely incorrectly formatted tool call arguments. We'll break
            # out of the loop and allow the model to (probably) fail.
            print(messages[-1].get("tool_calls"))
            break
        choice = chat_completion.choices[0]
        messages_and_choices.append(choice)

    content = choice.message.content or ""
    num_correct = 0
    for key, value in puzzle["solution"].items():
        if matches := re.findall(rf"{key}\. ([A-Za-z \.:-]+)", content):
            match = matches[-1]
            if match.strip().lower() == value.lower():
                num_correct += 1
    reward = acc = num_correct / len(puzzle["solution"])
    return art.Trajectory(
        messages_and_choices=messages_and_choices,
        reward=reward - hints_shared * 0.1,
        metrics={"acc": acc, "hints_shared": hints_shared},
    )


stride = 32
for i in range(await model.get_iteration(), 1_000):
    async with model.openai_client(tool_use=True, verbosity=2) as openai_client:
        val_groups, train_groups = await asyncio.gather(
            art.gather_groups(
                (
                    (rollout(openai_client, puzzle) for _ in range(2))
                    for puzzle in val_puzzles
                ),
                pbar_desc="val",
            ),
            art.gather_groups(
                (
                    (rollout(openai_client, puzzle) for _ in range(50))
                    for puzzle in train_puzzles[i * stride : (i + 1) * stride]
                ),
                pbar_desc="train",
            ),
        )
    _, _ = await asyncio.gather(
        model.save(val_groups),
        model.tune(train_groups, config=art.TuneConfig(plot_tensors=True, verbosity=2)),
    )

$ vllm serve NousResearch/Hermes-3-Llama-3.1-8B --block-size=32 --disable-log-requests --enable-chunked-prefill --enable-prefix-caching --enforce-eager --gpu-memory-utilization=0.95 --max-num-seqs=4096 --max-num-batched-tokens=16384 --num-scheduler-steps=16 --preemption-mode=swap --return-tokens-as-token-ids --swap-space=80 --tensor-parallel-size=1 --enable-auto-tool-choice --tool-call-parser=hermes --served-model-name=temporal-clue-tool-use-001 --port=8000 --api-key=default
INFO 03-12 21:30:12 __init__.py:207] Automatically detected platform cuda.
INFO 03-12 21:30:12 api_server.py:912] vLLM API server version 0.7.3
INFO 03-12 21:30:12 api_server.py:913] args: Namespace(subparser='serve', model_tag='NousResearch/Hermes-3-Llama-3.1-8B', config='', host=None, port=8000, uvicorn_log_level='info', allow_credentials=False, allowed_origins=['*'], allowed_methods=['*'], allowed_headers=['*'], api_key='default', lora_modules=None, prompt_adapters=None, chat_template=None, chat_template_content

Loading safetensors checkpoint shards:   0% Completed | 0/4 [00:00<?, ?it/s]
Loading safetensors checkpoint shards:  25% Completed | 1/4 [00:00<00:02,  1.10it/s]
Loading safetensors checkpoint shards:  50% Completed | 2/4 [00:01<00:01,  1.06it/s]
Loading safetensors checkpoint shards:  75% Completed | 3/4 [00:02<00:00,  1.55it/s]
Loading safetensors checkpoint shards: 100% Completed | 4/4 [00:03<00:00,  1.33it/s]
Loading safetensors checkpoint shards: 100% Completed | 4/4 [00:03<00:00,  1.30it/s]



INFO 03-12 21:30:31 model_runner.py:1115] Loading model weights took 14.9888 GB
INFO 03-12 21:30:33 worker.py:267] Memory profiling takes 1.82 seconds
INFO 03-12 21:30:33 worker.py:267] the current vLLM instance can use total_gpu_memory (79.10GiB) x gpu_memory_utilization (0.95) = 75.14GiB
INFO 03-12 21:30:33 worker.py:267] model weights take 14.99GiB; non_torch_memory takes 0.14GiB; PyTorch activation peak memory takes 18.75GiB; the rest of the memory reserved for KV Cache is 41.26GiB.
INFO 03-12 21:30:33 executor_base.py:111] # cuda blocks: 10563, # CPU blocks: 20480
INFO 03-12 21:30:33 executor_base.py:116] Maximum concurrency for 131072 tokens per request: 2.58x
INFO 03-12 21:31:24 llm_engine.py:436] init engine (profile, create kv cache, warmup model) took 52.92 seconds
INFO 03-12 21:31:24 serving_chat.py:76] "auto" tool choice has been enabled please note that while the parallel_tool_calls client option is preset for compatibility reasons, it will be ignored.
INFO 03-12 21:31:24 

INFO:     Started server process [85187]
INFO:     Waiting for application startup.
INFO:     Application startup complete.


INFO 03-12 21:31:31 chat_utils.py:332] Detected the chat template content format to be 'string'. You can set `--chat-template-content-format` to override this.
INFO 03-12 21:31:32 metrics.py:455] Avg prompt throughput: 2.7 tokens/s, Avg generation throughput: 0.1 tokens/s, Running: 0 reqs, Swapped: 0 reqs, Pending: 0 reqs, GPU KV cache usage: 0.0%, CPU KV cache usage: 0.0%.
INFO 03-12 21:31:32 metrics.py:471] Prefix cache hit rate: GPU: 0.00%, CPU: 0.00%
INFO:     127.0.0.1:50498 - "POST /v1/chat/completions HTTP/1.1" 200 OK
vLLM server started succesfully. Logs can be found at ./logs/vllm.log


val:   0%|          | 0/128 [00:00<?, ?it/s]

train:   0%|          | 0/1600 [00:00<?, ?it/s]

{"clues": ["The suspect motivated by Betrayal was in the Kitchen at 12:00 AM", "The murderer moved from the Kitchen to the Cloak Room at 12:00 AM", "The Horseshoe was in the room just east of the Rope at 12:00 AM", "The Horseshoe was in the room just east of Mr. Boddy at 11:30 PM", "Miss Scarlet was in the room just east of Mr. Boddy at 12:00 AM", "The Rope was in the room just west of Miss Scarlet at 11
None


CancelledError: 

In [4]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained(model.base_model)

In [27]:
tools: art.Tools = [
    {
        "type": "function",
        "function": {
            "name": "get_hints",
            "description": "A function to retrieve one or two hints. No more than 3 hints may be retrieved total. "
            "Each retrieved hint decreases your final accuracy score by 10%.",
            "parameters": {
                "type": "object",
                "properties": {
                    "num_hints": {
                        "type": "integer",
                        "description": "Number of hints to retrieve (1 or 2)",
                        "enum": [1, 2],
                    }
                },
            },
        },
    }
]

print(
    tokenizer.get_chat_template()
    # .replace(
    #     ' or (message.role == "assistant" and message.tool_calls is not defined)', ""
    # )
    # .replace(
    #     '{%- elif message.role == "assistant" %}',
    #     "{%- elif message.role == \"assistant\" and message.tool_calls is not defined %}\n        {{- '<|im_start|>' + message.role + '\n' }}{% generate %}{{ message.content + '<|im_end|>' }}{% endgenerate %}{{ '\n' }}\n    {%- elif message.role == \"assistant\" %}",
    # )
)

{{bos_token}}{% for message in messages %}{% if loop.first and messages[0]['role'] != 'system' %}{{ '<|im_start|>system
You are a helpful assistant.<|im_end|>
' }}{% endif %}{{'<|im_start|>' + message['role'] + '
' + message['content'] + '<|im_end|>' + '
'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant
' }}{% endif %}


In [15]:
print(
    tokenizer.apply_chat_template(
        [
            {"role": "assistant", "content": "Hello, world!"},
            {
                "role": "assistant",
                "content": "Hello, world!",
                "tool_calls": [
                    {
                        "id": "1",
                        "type": "function",
                        "function":{"name": "get_hints", "arguments": "{}"},
                    }
                ],
            }
        ],
        tools=tools,
        chat_template=tokenizer.get_chat_template("tool_use"),
        tokenize=False,
    )
)

<|begin_of_text|><|im_start|>system
You are a function calling AI model. You are provided with function signatures within <tools></tools> XML tags. You may call one or more functions to assist with the user query. Don't make assumptions about what values to plug into functions. Here are the available tools: <tools> {"type": "function", "function": {"name": "get_hints", "description": "get_hints(num_hints: int) - A function to retrieve one or two hints. No more than 3 hints may be retrieved total. Each retrieved hint decreases your final accuracy score by 10%.

    Args:
        num_hints(int): Number of hints to retrieve (1 or 2)", "parameters": {"type": "object", "properties": {"num_hints": {"type": "integer", "description": "Number of hints to retrieve (1 or 2)", "enum": [1, 2]}}}} </tools>Use the following pydantic model json schema for each tool call you will make: {"properties": {"name": {"title": "Name", "type": "string"}, "arguments": {"title": "Arguments", "type": "object"}}, "

In [None]:
context_manager = model.openai_client(verbosity=2)
openai_client = await context_manager.gen.__anext__()

$ vllm serve NousResearch/Hermes-3-Llama-3.1-8B --block-size=32 --disable-log-requests --enable-chunked-prefill --enable-prefix-caching --enforce-eager --gpu-memory-utilization=0.95 --max-num-seqs=4096 --max-num-batched-tokens=16384 --num-scheduler-steps=16 --preemption-mode=swap --return-tokens-as-token-ids --swap-space=80 --tensor-parallel-size=1 --enable-auto-tool-choice --tool-call-parser=hermes --chat-template=./tool_chat_template_hermes.jinja --served-model-name=temporal-clue-tool-use-001 --port=8001 --api-key=default
INFO 03-12 20:47:42 __init__.py:207] Automatically detected platform cuda.
INFO 03-12 20:47:42 api_server.py:912] vLLM API server version 0.7.3
INFO 03-12 20:47:42 api_server.py:913] args: Namespace(subparser='serve', model_tag='NousResearch/Hermes-3-Llama-3.1-8B', config='', host=None, port=8001, uvicorn_log_level='info', allow_credentials=False, allowed_origins=['*'], allowed_methods=['*'], allowed_headers=['*'], api_key='default', lora_modules=None, prompt_adapte

Loading safetensors checkpoint shards:   0% Completed | 0/4 [00:00<?, ?it/s]
Loading safetensors checkpoint shards:  25% Completed | 1/4 [00:00<00:02,  1.08it/s]
Loading safetensors checkpoint shards:  50% Completed | 2/4 [00:01<00:01,  1.05it/s]
Loading safetensors checkpoint shards:  75% Completed | 3/4 [00:02<00:00,  1.54it/s]
Loading safetensors checkpoint shards: 100% Completed | 4/4 [00:03<00:00,  1.32it/s]
Loading safetensors checkpoint shards: 100% Completed | 4/4 [00:03<00:00,  1.29it/s]



INFO 03-12 20:48:01 model_runner.py:1115] Loading model weights took 14.9888 GB
INFO 03-12 20:48:03 worker.py:267] Memory profiling takes 1.68 seconds
INFO 03-12 20:48:03 worker.py:267] the current vLLM instance can use total_gpu_memory (79.10GiB) x gpu_memory_utilization (0.95) = 75.14GiB
INFO 03-12 20:48:03 worker.py:267] model weights take 14.99GiB; non_torch_memory takes 0.14GiB; PyTorch activation peak memory takes 18.75GiB; the rest of the memory reserved for KV Cache is 41.26GiB.
INFO 03-12 20:48:03 executor_base.py:111] # cuda blocks: 10563, # CPU blocks: 20480
INFO 03-12 20:48:03 executor_base.py:116] Maximum concurrency for 131072 tokens per request: 2.58x
INFO 03-12 20:48:50 llm_engine.py:436] init engine (profile, create kv cache, warmup model) took 49.18 seconds
INFO 03-12 20:48:51 api_server.py:820] Using supplied chat template:
INFO 03-12 20:48:51 api_server.py:820] {%- macro json_to_python_type(json_spec) %}
INFO 03-12 20:48:51 api_server.py:820]     {%- set basic_type_

INFO:     Started server process [69159]
INFO:     Waiting for application startup.
INFO:     Application startup complete.


INFO 03-12 20:48:54 chat_utils.py:332] Detected the chat template content format to be 'string'. You can set `--chat-template-content-format` to override this.
INFO:     127.0.0.1:54422 - "POST /v1/chat/completions HTTP/1.1" 200 OK
vLLM server started succesfully. Logs can be found at ./logs/vllm.log


In [None]:
async def rollout(
    client: openai.AsyncOpenAI, puzzle: TemporalCluePuzzle
) -> art.Trajectory:
    messages: art.Messages = [{"role": "user", "content": puzzle["prompt"]}]
    tools: art.Tools = [
        {
            "type": "function",
            "function": {
                "name": "get_hints",
                "description": "A function to retrieve one or two hints. No more than 3 hints may be retrieved total. "
                "Each retrieved hint decreases your final accuracy score by 10%.",
                "parameters": {
                    "type": "object",
                    "properties": {
                        "num_hints": {
                            "type": "integer",
                            "description": "Number of hints to retrieve (1 or 2)",
                            "enum": [1, 2],
                        }
                    },
                },
            },
        }
    ]
    chat_completion = await client.chat.completions.create(
        messages=messages,
        model=model.name,
        tools=tools,
    )
    choice = chat_completion.choices[0]
    messages_and_choices = [*messages, choice]
    hints = [
        f"The answer for {key} is {value}" for key, value in puzzle["solution"].items()
    ]
    random.shuffle(hints)
    hints_shared = 0

    def get_hints(function_name: str, function_arguments: str) -> str:
        nonlocal hints_shared
        if function_name != "get_hints":
            return f"Error: unexpected function name {function_name}"
        try:
            num_hints = json.loads(function_arguments or "{}").get("num_hints", 1)
        except Exception:
            return f"Error: invalid JSON {function_arguments}"
        if num_hints not in {1, 2}:
            return f"Error: invalid number of hints {num_hints}"
        if num_hints + hints_shared > 3:
            return f"Error: cannot retrieve {num_hints} hints, already retrieved {hints_shared} hints"
        hints_shared += num_hints
        content = "Hints:"
        for _ in range(num_hints):
            content += f"\n{hints.pop()}"
        return content

    while tool_calls := choice.message.tool_calls:
        messages.append(
            {
                "role": "assistant",
                "content": choice.message.content,
                "tool_calls": [
                    {
                        "id": tool_call.id,
                        "type": "function",
                        "function": {
                            "name": tool_call.function.name,
                            "arguments": tool_call.function.arguments or "{}",
                        },
                    }
                    for tool_call in tool_calls
                ],
            }
        )
        for tool_call in tool_calls:
            if tool_call.function.arguments:
                print(tool_call.function.arguments)
            messages.append(
                {
                    "role": "tool",
                    "tool_call_id": tool_call.id,
                    "content": get_hints(
                        tool_call.function.name, tool_call.function.arguments
                    ),
                }
            )
            messages_and_choices.append(messages[-1])
        try:
            chat_completion = await client.chat.completions.create(
                messages=messages,
                model=model.name,
                tools=tools,
            )
        except openai.BadRequestError:
            # Likely incorrectly formatted tool call arguments. We'll break out of the loop and allow the model to fail.
            print(messages[-1].get("tool_calls"))
            break
        choice = chat_completion.choices[0]
        messages_and_choices.append(choice)

    content = choice.message.content or ""
    num_correct = 0
    for key, value in puzzle["solution"].items():
        if matches := re.findall(rf"{key}\. ([A-Za-z \.:-]+)", content):
            match = matches[-1]
            if match.strip().lower() == value.lower():
                num_correct += 1
    reward = acc = num_correct / len(puzzle["solution"])
    return art.Trajectory(
        messages_and_choices=messages_and_choices,
        reward=reward - hints_shared * 0.1,
        metrics={"acc": acc, "hints_shared": hints_shared},
    )


val_groups, train_groups = await asyncio.gather(
    art.gather_groups(
        ((rollout(openai_client, puzzle) for _ in range(2)) for puzzle in val_puzzles),
        pbar_desc="val",
    ),
    art.gather_groups(
        (
            (rollout(openai_client, puzzle) for _ in range(50))
            for puzzle in train_puzzles[i * stride : (i + 1) * stride]
        ),
        pbar_desc="train",
    ),
)

val:   0%|          | 0/128 [00:00<?, ?it/s]

train:   0%|          | 0/1600 [00:00<?, ?it/s]

CancelledError: 

In [6]:
result.reward

0.07499999999999996