To train this agent, click *Runtime* and press *Run all*. Make sure you've enabled a free Tesla T4 GPU!
<div class="align-center">
<a href="https://openpipe.ai/"><img src="https://github.com/openpipe/art/raw/main/images/ART_pill.png" width="115"></a>
<a href="https://discord.gg/openpipe"><img src="https://github.com/openpipe/art/raw/main/images/Discord_pill.png" width="145"></a>
<a href="https://art.openpipe.ai/"><img src="https://github.com/openpipe/art/raw/main/images/Documentation_pill.png" width="125"></a> Join Discord to ask questions + ⭐ <i>Star us on <a href="https://github.com/openpipe/art">GitHub</a></i> ⭐
</div>

To run on your own machine, follow the installation instructions [here](https://art.openpipe.ai).

This notebook shows how to train a Qwen 2.5 7B model to play 2048. It will demonstrate how to set up a multi-turn agent, how to train it, and how to evaluate it.

Completions will be logged to OpenPipe, and metrics will be logged to Weights & Biases.

You will learn how to [set up](#Setup) a simulated environment, how to declare a [model](#Model), how to define a [rollout](#Rollout), and how to run a [training loop](#Training-Loop).

In [None]:
%load_ext autoreload
%autoreload 2

%%html
<style>
.cell-output-ipywidget-background {
    background-color: transparent !important;
}
:root {
    --jp-widgets-color: var(--vscode-editor-foreground);
    --jp-widgets-font-size: var(--vscode-editor-font-size);
}  
</style>

In [3]:
import art
from dotenv import load_dotenv
import random
from openpipe.client import OpenPipe

# Load variables from a local .env file (presumably the API keys the clients
# below read from the environment -- confirm against your deployment).
load_dotenv()

# Module-level OpenPipe client; rollout() uses it to report completions and to
# attach reward metadata to them.
op_client = OpenPipe()
print("OpenPipe client initialized")

# Fixed seed for Python's `random` module.
random.seed(42)


# ART API handle configured with the Weights & Biases project name.
api = art.LocalAPI(wandb_project="agent-reinforcement-training")
# Module-level model handle shared by rollout() and the training loop.
# Top-level `await` only works inside a notebook / IPython session.
model = await api.get_or_create_model(
    name="2048-multi-turn-001", base_model="Qwen/Qwen2.5-7B-Instruct"
)

OpenPipe client initialized


In [None]:
import art
from art.utils.get_trajectory_messages import get_trajectory_messages
import openai
import time
import math
import requests

from .utils import (
    generate_game,
    render_board,
    apply_agent_move,
    check_game_finished,
    max_cell_value,
)

# Largest-tile threshold used by rollout(): a finished game whose biggest tile
# reaches this value is scored as a win (reward 2); smaller results are scaled
# logarithmically against it.
WINNING_VALUE = 512


def _scaled_reward(max_value: int) -> float:
    """Return the reward for a finished game whose largest tile is *max_value*."""
    if max_value >= WINNING_VALUE:
        # double reward if it wins
        return 2
    # Scale logarithmically: 0 for a largest tile of 2, approaching 1 as the
    # largest tile approaches WINNING_VALUE. log2 is exact for powers of two,
    # unlike math.log(x, 2), which divides two natural logarithms.
    return (math.log2(max_value) - 1) / (math.log2(WINNING_VALUE) - 1)


@art.retry(exceptions=(openai.LengthFinishReasonError, requests.ReadTimeout))
async def rollout(
    client: openai.AsyncOpenAI, iteration: int, is_validation: bool
) -> art.Trajectory:
    """Play one game of 2048 with the model and return the scored trajectory.

    Each turn appends the rendered board as a user message, requests a
    completion, records the returned choice on the trajectory and applies the
    move to the game. The episode ends when the move is rejected (reward -1)
    or when the game is finished (reward from ``_scaled_reward``).

    Relies on the module-level ``model`` (for the model name) and
    ``op_client`` (for best-effort logging to OpenPipe).

    Args:
        client: Async OpenAI-compatible client used to request completions.
        iteration: Training step index; only used as logging metadata.
        is_validation: Validation flag; only used as logging metadata.

    Returns:
        The trajectory with every message/choice of the game and its reward.

    Raises:
        Exception: Any error raised by the completion request is re-raised.
            The ``art.retry`` decorator is configured with
            ``openai.LengthFinishReasonError`` and ``requests.ReadTimeout``
            (presumably those are retried -- confirm in the ART docs).
    """
    # Debugging aid: on an unexpected completion failure the trajectory is
    # stashed in this module-level name so it can be inspected afterwards.
    global failing_trajectory

    game = generate_game()
    move_number = 0

    trajectory = art.Trajectory(
        messages_and_choices=[
            {
                "role": "system",
                "content": "You are an excellent 2048 player. Always choose the move most likely to lead to combine cells to eventually reach the number 2048. Optional moves are 'left', 'right', 'up', 'down'. Return your move as an XML object with a single property 'move', like so: <move>left</move>",
            }
        ],
        reward=0,
        # NOTE(review): placeholder metric kept unchanged -- TODO replace with
        # something meaningful.
        metrics={"test": 5},
    )

    while True:
        trajectory.messages_and_choices.append(
            {"role": "user", "content": render_board(game)}
        )

        # Millisecond timestamp reported to OpenPipe as the request time.
        requested_at = int(time.time() * 1000)
        messages = get_trajectory_messages(trajectory)

        try:
            chat_completion = await client.chat.completions.create(
                max_completion_tokens=2048,
                messages=messages,
                model=model.name,
            )
        except openai.LengthFinishReasonError:
            # Propagate untouched (no logging) so the retry decorator sees it.
            raise
        except Exception as e:
            print("caught exception generating chat completion")
            print(e)
            failing_trajectory = trajectory
            raise
        last_completion = chat_completion

        # Logging is best-effort: a reporting failure must not abort the game.
        # NOTE(review): these op_client calls look synchronous and may block
        # the event loop while other rollouts are pending -- confirm.
        try:
            op_client.report(
                requested_at=requested_at,
                received_at=int(time.time() * 1000),
                req_payload={
                    "model": model.name,
                    "messages": messages,
                    "metadata": {
                        "game_id": game["id"],
                        "notebook-id": "2048",
                        "iteration": str(iteration),
                        "validation": str(is_validation),
                        "move_number": str(move_number),
                    },
                },
                resp_payload=chat_completion,
                status_code=200,
            )
        except Exception as e:
            print(f"Error reporting to OpenPipe: {e}")

        choice = chat_completion.choices[0]
        content = choice.message.content
        assert isinstance(content, str)
        trajectory.messages_and_choices.append(choice)

        try:
            apply_agent_move(game, content)
        except ValueError:
            # The move was rejected: penalize and end the episode.
            trajectory.reward = -1
            break
        move_number += 1

        if check_game_finished(game):
            trajectory.reward = _scaled_reward(max_cell_value(game))
            break

    # Attach the final reward to the last logged completion (best-effort).
    try:
        op_client.update_log_metadata(
            filters=[
                {
                    "field": "completionId",
                    "equals": last_completion.id,
                }
            ],
            metadata={
                "reward": str(trajectory.reward),
                "reward_assigned": "true",
            },
        )
    except Exception as e:
        print(f"Error updating log metadata: {e}")

    return trajectory


openai_client = await model.openai_client()
for i in range(await model.get_step(), 500):
    train_groups = await art.gather_trajectory_groups(
        (
            art.TrajectoryGroup(
                rollout(openai_client, i, is_validation=False) for _ in range(18)
            )
            for _ in range(1)
        ),
        pbar_desc="gather",
    )
    await model.delete_checkpoints()
    await model.train(train_groups, config=art.TrainConfig(learning_rate=3e-5))