In [6]:
from dotenv import load_dotenv
import random
from openpipe.client import OpenPipe

# Load variables from a local .env file into the process environment. This runs
# before the client is constructed.
# NOTE(review): presumably OpenPipe() picks up its API key from the environment
# populated here — confirm against the OpenPipe client docs.
load_dotenv()

# Shared OpenPipe client; `rollout` uses it to report completions and to tag
# them with the final reward.
op_client = OpenPipe()
print("OpenPipe client initialized")

# Seed Python's global `random` module with a fixed value. Kept last so that
# nothing executed above can change the RNG state after seeding.
random.seed(42)

OpenPipe client initialized


In [9]:
import art
from art.utils.get_trajectory_messages import get_trajectory_messages
import openai
import time
import math
import requests
from litellm import acompletion

from .utils import generate_game, render_board, apply_agent_move, max_cell_value, check_game_finished


# Highest-tile threshold used by `rollout`: a finished game whose max cell value
# is at least this is scored as a win (reward 2); anything lower receives a
# logarithmically scaled reward normalised by this value.
WINNING_VALUE = 512

def _reward_for_finished_game(max_value):
    """Map the highest tile of a finished game to a trajectory reward.

    Returns 2 when ``max_value`` reaches ``WINNING_VALUE``. Otherwise returns a
    logarithmic scale that is 0 for a max tile of 2 and would reach 1 at
    ``WINNING_VALUE`` itself.
    """
    if max_value >= WINNING_VALUE:
        # double reward if it wins
        return 2
    # math.log2 is exact for powers of two, unlike math.log(x, 2), which can be
    # off by one ulp and leak rounding noise into the reward.
    return (math.log2(max_value) - 1) / (math.log2(WINNING_VALUE) - 1)


@art.retry(exceptions=(openai.LengthFinishReasonError, requests.ReadTimeout))
async def rollout(
    model: str, iteration: int, is_validation: bool
) -> art.Trajectory:
    """Play one 2048 game with ``model`` and return the scored trajectory.

    Each turn appends the rendered board as a user message, requests a
    completion through litellm's ``acompletion``, reports the exchange to
    OpenPipe (best effort), and applies the move contained in the reply.
    The game loop ends when either:

    * ``apply_agent_move`` raises ``ValueError`` -> reward is set to -1, or
    * ``check_game_finished`` returns true -> reward comes from the highest
      tile (2 for reaching ``WINNING_VALUE``, otherwise a 0..1 log scale).

    After the loop, the last completion's OpenPipe log entry is tagged with
    the final reward (also best effort).

    Args:
        model: Model identifier forwarded to ``acompletion``.
        iteration: Only recorded in the OpenPipe metadata.
        is_validation: Only recorded in the OpenPipe metadata.

    Returns:
        The ``art.Trajectory`` holding the conversation and its reward.

    Raises:
        Exception: Any error raised by the completion call is re-raised after
            the in-progress trajectory is stored in the module-level
            ``failing_trajectory``. ``openai.LengthFinishReasonError`` is
            re-raised without that bookkeeping.
        AssertionError: If the completion's message content is not a string.
    """
    # Module-level handle to the trajectory that was in flight when a
    # completion call failed, kept for post-mortem inspection in the notebook.
    global failing_trajectory

    game = generate_game()

    move_number = 0

    # NOTE(review): the prompt targets 2048 while the reward treats
    # WINNING_VALUE as a win — confirm the mismatch is intended.
    # NOTE(review): metrics={"test": 5} looks like a placeholder — confirm.
    trajectory = art.Trajectory(
        messages_and_choices=[
            {
                "role": "system",
                "content": "You are an excellent 2048 player. Always choose the move most likely to lead to combine cells to eventually reach the number 2048. Optional moves are 'left', 'right', 'up', 'down'. Return your move as an XML object with a single property 'move', like so: <move>left</move>",
            }
        ],
        reward=0,
        metrics={"test": 5},
    )

    while True:
        trajectory.messages_and_choices.append(
            {"role": "user", "content": render_board(game)}
        )

        # Millisecond timestamps, as passed to op_client.report below.
        requested_at = int(time.time() * 1000)
        messages = get_trajectory_messages(trajectory)

        try:
            chat_completion = await acompletion(
                model=model,
                messages=messages,
            )
            last_completion = chat_completion
        except openai.LengthFinishReasonError:
            # Listed in the retry decorator; propagate without the logging
            # and bookkeeping done for unexpected failures below.
            raise
        except Exception as e:
            print("caught exception generating chat completion")
            print(e)
            failing_trajectory = trajectory
            # Bare raise re-raises the active exception unchanged.
            raise

        # Reporting is best effort: a logging failure must not abort the game.
        try:
            op_client.report(
                requested_at=requested_at,
                received_at=int(time.time() * 1000),
                req_payload={
                    "model": model,
                    "messages": messages,
                    "metadata": {
                        "game_id": game["id"],
                        "notebook-id": "2048",
                        "iteration": str(iteration),
                        "validation": str(is_validation),
                        "move_number": str(move_number),
                    },
                },
                resp_payload=chat_completion,
                status_code=200,
            )
        except Exception as e:
            print(f"Error reporting to OpenPipe: {e}")

        choice = chat_completion.choices[0]
        content = choice.message.content
        assert isinstance(content, str)
        trajectory.messages_and_choices.append(choice)

        try:
            apply_agent_move(game, content)
            move_number += 1
        except ValueError:
            # The move was rejected: penalise and stop the game.
            trajectory.reward = -1
            break

        if check_game_finished(game):
            trajectory.reward = _reward_for_finished_game(max_cell_value(game))
            break

    # Tag the final completion's log entry with the outcome; best effort.
    try:
        op_client.update_log_metadata(
            filters=[
                {
                    "field": "completionId",
                    "equals": last_completion.id,
                }
            ],
            metadata={
                "reward": str(trajectory.reward),
                "reward_assigned": "true",
            }
        )
    except Exception as e:
        print(f"Error updating log metadata: {e}")

    return trajectory

In [10]:
from art.utils.benchmark_rollout import benchmark_rollout

await benchmark_rollout(
    "gpt-4o-mini",
    100,
    rollout,
)


Benchmarking rollout:   0%|          | 0/100 [00:00<?, ?it/s]

Average reward for gpt-4o-mini: 0.5


0.5

In [11]:
await benchmark_rollout(
    "gpt-4o-2024-11-20",
    100,
    rollout,
)

Benchmarking rollout:   0%|          | 0/100 [00:00<?, ?it/s]

Error reporting to OpenPipe: Server disconnected without sending a response.
Average reward for gpt-4o-2024-11-20: 0.70625


0.70625

In [12]:
await benchmark_rollout(
    "o3-mini-2025-01-31",
    100,
    rollout,
)

Benchmarking rollout:   0%|          | 0/100 [00:00<?, ?it/s]

Average reward for o3-mini-2025-01-31: 0.7575


0.7575

In [13]:
await benchmark_rollout(
    "gpt-4.5-preview-2025-02-27",
    10,
    rollout,
)

Benchmarking rollout:   0%|          | 0/10 [00:00<?, ?it/s]

Average reward for gpt-4.5-preview-2025-02-27: 0.5375


0.5375