In [5]:
# Enable IPython's autoreload extension in mode 2, so imported modules are
# reloaded automatically before each cell runs.
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [6]:
%%html
<style>
/* Make the background behind ipywidget output transparent. */
.cell-output-ipywidget-background {
    background-color: transparent !important;
}
/* Take widget text color and font size from the VS Code editor theme variables. */
:root {
    --jp-widgets-color: var(--vscode-editor-foreground);
    --jp-widgets-font-size: var(--vscode-editor-font-size);
}  
</style>

In [7]:
import art
from dotenv import load_dotenv
import random
from openpipe.client import OpenPipe

# Load environment variables from a local .env file, if one is present.
load_dotenv()

# Client used later to report completions and reward metadata to OpenPipe.
op_client = OpenPipe()
print("OpenPipe client initialized")

# Seed Python's global RNG (used for board generation and direction shuffling).
random.seed(42)

api = art.UnslothAPI(wandb_project="agent-reinforcement-training")
# Fetch the model with this name, or create it on top of the given base model.
model = await api.get_or_create_model(
    name="2048-single-turn-interim-reward-001", base_model="Qwen/Qwen2.5-7B-Instruct"
)

OpenPipe client initialized


In [None]:
import art
import asyncio
from dotenv import load_dotenv
from art.utils.get_trajectory_messages import get_trajectory_messages
import json
import openai
import random
from typing import TypedDict
import time
from typing import Literal
from pydantic import BaseModel
import math
import string
import xml.etree.ElementTree as ET
import requests
import wandb

# Load environment variables from a local .env file, if one is present.
load_dotenv()

# Tile value at which check_game_finished() ends the game and rollout() grants
# the winning reward.
WINNING_VALUE = 512


class TwentyFortyEightGame(TypedDict):
    """State of a single 2048 game."""

    # Identifier of the game (generate_game() uses a random 6-character string).
    id: str
    # Grid of rows; each cell holds a tile value, or None when the cell is empty.
    board: list[list[int | None]]

def populate_random_cell(game: TwentyFortyEightGame) -> None:
    """Place a new tile on one randomly chosen empty cell of the board.

    The new tile is a 2 with 90% probability and a 4 otherwise. The board is
    modified in place.

    Raises:
        IndexError: if the board has no empty cell to choose from.
    """
    board = game["board"]

    # collect the coordinates of every empty cell, in row-major order
    empty_cells = []
    for row_index, row in enumerate(board):
        for col_index, cell in enumerate(row):
            if cell is None:
                empty_cells.append((row_index, col_index))

    target_row, target_col = random.choice(empty_cells)
    new_tile = 4 if random.random() >= 0.9 else 2
    board[target_row][target_col] = new_tile


def generate_game(board_length: int = 4) -> TwentyFortyEightGame:
    """Create a fresh game: an empty square board seeded with two random tiles.

    Args:
        board_length: number of rows and of columns of the board.

    Returns:
        A game dict with a random 6-character alphanumeric "id" and its "board".
    """
    alphabet = string.ascii_letters + string.digits
    game_id = "".join(random.choices(alphabet, k=6))

    empty_board = []
    for _ in range(board_length):
        empty_board.append([None] * board_length)

    game = {"id": game_id, "board": empty_board}

    # a new game starts with two populated cells
    for _ in range(2):
        populate_random_cell(game)

    return game


def render_board(game: TwentyFortyEightGame) -> str:
    """Render the board as text, one line per row.

    Cells are right-aligned to the width of the widest tile and joined with
    "|"; empty cells are shown as "_". For example:

          _|  2|  _|  4
          4|  8|  2| 16
         16| 32| 64|128
          _|  2|  2|  4

    Args:
        game: the game whose board should be rendered.

    Returns:
        The rendered board, with a newline after every row. A board without
        any tile is rendered with one-character-wide cells.
    """
    board = game["board"]

    # default=1 keeps an all-empty board renderable (width of the "_" placeholder)
    # instead of letting max() raise ValueError on an empty sequence.
    cell_width = max(
        (len(str(cell)) for row in board for cell in row if cell is not None),
        default=1,
    )

    lines = []
    for row in board:
        cells = ["_" if cell is None else str(cell) for cell in row]
        # pad the cells with spaces to make them the same width
        lines.append("|".join(text.rjust(cell_width) for text in cells) + "\n")
    return "".join(lines)

# condense, privileging matches at the start of the sequence
# sequences should be passed starting with cells that are the furthest in the direction in which the board is being condensed
def condense_sequence(sequence: list[int | None]) -> list[int | None]:
    condensed_sequence = []
    
    gapless_sequence = [cell for cell in sequence if cell is not None]

    i = 0
    while i < len(gapless_sequence):
        if i + 1 < len(gapless_sequence) and gapless_sequence[i] == gapless_sequence[i + 1]:
            condensed_sequence.append(gapless_sequence[i] * 2)
            i += 2
        else:
            condensed_sequence.append(gapless_sequence[i])
            i += 1

    # pad the sequence with None at the end
    return condensed_sequence + [None] * (4 - len(condensed_sequence))

def condense_board(game: TwentyFortyEightGame, direction: Literal["left", "right", "up", "down"]) -> None:
    """Slide and merge every row or column of the board toward `direction`, in place.

    Each line is handed to condense_sequence() starting with the cell furthest
    in the direction of travel, so "right" and "down" reverse the line before
    condensing and reverse the result back afterwards. Any other direction
    leaves the board untouched.
    """
    board = game["board"]
    # lines are reversed when moving toward the end of a row/column
    toward_end = direction in ("right", "down")

    if direction in ("left", "right"):
        for row in board:
            line = row[::-1] if toward_end else row
            merged = condense_sequence(line)
            if toward_end:
                merged = merged[::-1]
            for col_index in range(len(row)):
                row[col_index] = merged[col_index]
    elif direction in ("up", "down"):
        for col_index in range(len(board[0])):
            column = [row[col_index] for row in board]
            line = column[::-1] if toward_end else column
            merged = condense_sequence(line)
            if toward_end:
                merged = merged[::-1]
            for row_index in range(len(column)):
                board[row_index][col_index] = merged[row_index]
        
        

# Schema for a single agent move, restricted to the four board directions.
# NOTE(review): not referenced anywhere else in the visible code.
class AgentMove(BaseModel):
    direction: Literal["left", "right", "up", "down"]


def apply_agent_move(game: TwentyFortyEightGame, move_xml: str) -> None:
    """Parse the agent's XML move, apply it to the board, and spawn a new tile.

    Args:
        game: the game to update in place.
        move_xml: XML whose root element's text is the direction, e.g.
            <move>left</move>.

    Raises:
        ValueError: "Invalid xml" if move_xml cannot be parsed, or
            "Invalid direction" if the root element's text is not exactly one
            of left, right, up, down.
    """
    # the direction is the text of the root element; the tag name is not checked
    try:
        direction = ET.fromstring(move_xml).text
    except Exception:
        raise ValueError("Invalid xml")

    if direction not in ("left", "right", "up", "down"):
        raise ValueError("Invalid direction")

    condense_board(game, direction)
    populate_random_cell(game)

def max_cell_value(game: TwentyFortyEightGame) -> int:
    """Return the largest tile value on the board, ignoring empty cells.

    Raises:
        ValueError: if the board holds no tile at all.
    """
    tiles = []
    for row in game["board"]:
        tiles.extend(cell for cell in row if cell is not None)
    return max(tiles)


def check_num_empty_cells(game: TwentyFortyEightGame) -> int:
    """Count the cells of the board that are empty (None)."""
    empty_count = 0
    for row in game["board"]:
        for cell in row:
            if cell is None:
                empty_count += 1
    return empty_count


def check_game_finished(game: TwentyFortyEightGame) -> bool:
    """Tell whether the game is over.

    The game ends when the largest tile has reached WINNING_VALUE, or when the
    board has no empty cell left. Only the number of empty cells is checked, so
    a full board ends the game even if adjacent tiles could still merge.
    """
    reached_winning_tile = max_cell_value(game) >= WINNING_VALUE
    if reached_winning_tile:
        return True

    # otherwise the game is over exactly when no cell is empty
    return check_num_empty_cells(game) == 0

def report_trajectory_rewards(trajectories: list[art.Trajectory]) -> None:
    """Attach each trajectory's reward to its logged completions in OpenPipe.

    Every item of a trajectory that carries a "completion_id" has the matching
    OpenPipe log entry (filtered on completionId) tagged with the trajectory's
    reward, stored as a string. Items without a completion id are skipped.
    """
    for trajectory in trajectories:
        for item in trajectory.messages_and_choices:
            # plain dict messages are used as-is; other items are dumped to a dict
            fields = item if isinstance(item, dict) else item.model_dump()

            if "completion_id" not in fields:
                continue

            completion_filter = {
                "field": "completionId",
                "equals": fields["completion_id"],
            }
            op_client.update_log_metadata(
                filters=[completion_filter],
                metadata={"reward": str(trajectory.reward)},
            )


@art.retry(exceptions=(openai.LengthFinishReasonError, requests.ReadTimeout))
async def rollout(
    client: openai.AsyncOpenAI, iteration: int
) -> list[art.Trajectory]:
    """Play one game of 2048 with the model, one single-turn trajectory per move.

    Every move gets its own trajectory (system prompt, rendered board, model
    reply). Earlier moves receive an interim reward derived from empty-cell
    counts; when the game ends, a game-outcome reward is added to every
    trajectory. An unparseable or invalid move ends the game, and only that
    move's trajectory is kept, with a reward of -1.

    Args:
        client: client used to request chat completions from the model.
        iteration: training iteration, recorded in the OpenPipe metadata.

    Returns:
        The list of per-move trajectories of the game, in move order.
    """

    game = generate_game()

    game_outcome_reward = 0
    move_number = 0


    trajectories: list[art.Trajectory] = []

    directions = ["left", "right", "up", "down"]

    # initial values match the 16 cells of the default 4x4 board
    num_empty_cells_two_turns_ago = 16
    num_empty_cells_one_turn_ago = 16

    while True:
        # randomize directions to avoid bias
        random.shuffle(directions)
        directions_str = "', '".join(directions)

        # each move starts a fresh trajectory holding only the system prompt
        trajectory = art.Trajectory(
            messages_and_choices=[
                {
                    "role": "system",
                    "content": f"You are an excellent 2048 player. Always choose the move most likely to lead to combine cells to eventually reach the number 2048. Optional moves are '{directions_str}'. Return your move as an XML object with a single property 'move', like so: <move>direction</move>",
                }
            ],
            reward=0,
        )
        trajectories.append(trajectory)

        trajectory.messages_and_choices.append(
            {"role": "user", "content": render_board(game)}
        )

        # timestamps are reported to OpenPipe in milliseconds
        requested_at = int(time.time() * 1000)
        messages = get_trajectory_messages(trajectory)

        async def get_completion():
            return await client.chat.completions.create(
                max_completion_tokens=2048,
                messages=messages,
                model=model.name,
                temperature=1.5,
            )

        try:
            chat_completion = await get_completion()
            # remembered so the final completion can be tagged after the loop
            last_completion = chat_completion
        except openai.LengthFinishReasonError as e:
            # re-raised untouched; this exception type is listed in @art.retry
            raise e
        except Exception as e:
            print("caught exception generating chat completion")
            print(e)
            raise e

        # reporting is best-effort: a failure is printed and the game goes on
        try:
            op_client.report(
                requested_at=requested_at,
                received_at=int(time.time() * 1000),
                req_payload={
                    "model": model.name,
                    "messages": messages,
                    "metadata": {
                        "game_id": game["id"],
                        "notebook-id": "2048",
                        "iteration": str(iteration),
                        "move_number": str(move_number),
                    },
                },
                resp_payload=chat_completion,
                status_code=200,
            )
        except Exception as e:
            print(f"Error reporting to OpenPipe: {e}")

        choice = chat_completion.choices[0]
        # stored on the choice so report_trajectory_rewards() can find the log entry
        choice.completion_id = chat_completion.id
        content = choice.message.content
        assert isinstance(content, str)
        trajectory.messages_and_choices.append(choice)

        # counted BEFORE the move below is applied
        num_empty_cells_this_turn = check_num_empty_cells(game)

        try:
            apply_agent_move(game, content)
            move_number += 1
        except ValueError:
            trajectory.reward = -1
            # remove all other trajectories to avoid giving negative reward for valid moves
            trajectories = [trajectory]
            break

        # interim reward for the previous move (trajectories[-2])
        # NOTE(review): all three counts are taken before the respective move is
        # applied, and both differences are (earlier count - later count), so they
        # are positive when the board got fuller. The previous move itself happened
        # between "one turn ago" and "this turn". Confirm sign and offset are intended.
        if len(trajectories) > 1:
            reward_for_immediately_condensing_cells = num_empty_cells_two_turns_ago - num_empty_cells_one_turn_ago
            reward_for_condensing_cells_next_turn = num_empty_cells_two_turns_ago - num_empty_cells_this_turn

            trajectories[-2].reward = (reward_for_immediately_condensing_cells + reward_for_condensing_cells_next_turn) / 32

        num_empty_cells_two_turns_ago = num_empty_cells_one_turn_ago
        num_empty_cells_one_turn_ago = num_empty_cells_this_turn

        if check_game_finished(game):
            max_value = max_cell_value(game)

            if max_value < WINNING_VALUE:
                # scale reward logarithmically between 0 for a max tile of 2 and 1 for WINNING_VALUE
                game_outcome_reward = (math.log(max_value, 2) - 1) / (math.log(WINNING_VALUE, 2) - 1)
            else:
                # double reward if it wins
                game_outcome_reward = 2
            break

    for trajectory in trajectories:
        # add reward for winning
        trajectory.reward += game_outcome_reward

    # tag the log entry of the last completion as the final one (best-effort)
    try:
        op_client.update_log_metadata(
            filters=[
                {
                    "field": "completionId",
                    "equals": last_completion.id,
                }
            ],
            metadata={
                "reward": str(trajectories[-1].reward),
                "final_trajectory": "true",
            }
        )
    except Exception as e:
        print(f"Error updating log metadata: {e}")

    # NOTE(review): unlike the calls above, this one is not wrapped in try/except
    report_trajectory_rewards(trajectories)

    return trajectories

# NOTE(review): not referenced anywhere else in the visible code.
persistent_client = None

# Client handed to rollout() (annotated there as openai.AsyncOpenAI).
openai_client = await model.openai_client()

# Resume from the model's current iteration; 500 is the exclusive upper bound.
for i in range(await model.get_iteration(), 500):
    # 12 games per iteration; each rollout returns a list of per-move trajectories
    train_groups = await art.gather_trajectories(
        (
            rollout(openai_client, i) for _ in range(12)
        ),
        pbar_desc="train",
        return_exceptions=False,
    )

    # get last trajectory from each group
    final_trajectories = [group[-1] for group in train_groups]

    # report average reward of final trajectories to wandb
    # NOTE(review): _log_wandb_data is a private (underscore-prefixed) method of the API object
    api._log_wandb_data(
        model,
        {
            "game_outcome_reward": sum([t.reward for t in final_trajectories]) / len(final_trajectories),
        }, 
        "train",
    )

    # combine train_groups into a single list
    # (the name is re-bound: from one list per game to one single group of all moves)
    train_groups = [[item for sublist in train_groups for item in sublist]]

    # presumably removes older iteration data (the cell output logs
    # "Deleted iteration directory ...") — confirm against the art library
    await model.clear_iterations()
    await model.tune(
        train_groups, config=art.TuneConfig(lr=3e-5)
    )

train:   0%|          | 0/12 [00:00<?, ?it/s]

Deleted iteration directory ./.art/models/2048-single-turn-interim-reward-001/0038
Packed 130 trajectories into 4 sequences of length 8192


tune:   0%|          | 0/4 [00:00<?, ?it/s]

train:   0%|          | 0/12 [00:00<?, ?it/s]

Deleted iteration directory ./.art/models/2048-single-turn-interim-reward-001/0039
Packed 94 trajectories into 3 sequences of length 8192


tune:   0%|          | 0/3 [00:00<?, ?it/s]

train:   0%|          | 0/12 [00:00<?, ?it/s]

Deleted iteration directory ./.art/models/2048-single-turn-interim-reward-001/0040
Packed 114 trajectories into 3 sequences of length 8192


tune:   0%|          | 0/3 [00:00<?, ?it/s]

train:   0%|          | 0/12 [00:00<?, ?it/s]

Deleted iteration directory ./.art/models/2048-single-turn-interim-reward-001/0041
Packed 128 trajectories into 4 sequences of length 8192


tune:   0%|          | 0/4 [00:00<?, ?it/s]

train:   0%|          | 0/12 [00:00<?, ?it/s]

Deleted iteration directory ./.art/models/2048-single-turn-interim-reward-001/0042
Packed 47 trajectories into 4 sequences of length 8192


tune:   0%|          | 0/4 [00:00<?, ?it/s]

train:   0%|          | 0/12 [00:00<?, ?it/s]

Deleted iteration directory ./.art/models/2048-single-turn-interim-reward-001/0043
Packed 46 trajectories into 4 sequences of length 8192


tune:   0%|          | 0/4 [00:00<?, ?it/s]

train:   0%|          | 0/12 [00:00<?, ?it/s]

Deleted iteration directory ./.art/models/2048-single-turn-interim-reward-001/0044
Packed 84 trajectories into 4 sequences of length 8192


tune:   0%|          | 0/4 [00:00<?, ?it/s]

train:   0%|          | 0/12 [00:00<?, ?it/s]

Deleted iteration directory ./.art/models/2048-single-turn-interim-reward-001/0045
Skipping tuning as there is no suitable data.


train:   0%|          | 0/12 [00:00<?, ?it/s]

Packed 171 trajectories into 4 sequences of length 8192


tune:   0%|          | 0/4 [00:00<?, ?it/s]

train:   0%|          | 0/12 [00:00<?, ?it/s]

Deleted iteration directory ./.art/models/2048-single-turn-interim-reward-001/0046
Packed 137 trajectories into 3 sequences of length 8192


tune:   0%|          | 0/3 [00:00<?, ?it/s]

train:   0%|          | 0/12 [00:00<?, ?it/s]

Deleted iteration directory ./.art/models/2048-single-turn-interim-reward-001/0047
Packed 60 trajectories into 3 sequences of length 8192


tune:   0%|          | 0/3 [00:00<?, ?it/s]

train:   0%|          | 0/12 [00:00<?, ?it/s]

Deleted iteration directory ./.art/models/2048-single-turn-interim-reward-001/0048
Packed 93 trajectories into 4 sequences of length 8192


tune:   0%|          | 0/4 [00:00<?, ?it/s]

train:   0%|          | 0/12 [00:00<?, ?it/s]

Deleted iteration directory ./.art/models/2048-single-turn-interim-reward-001/0049
Packed 55 trajectories into 4 sequences of length 8192


tune:   0%|          | 0/4 [00:00<?, ?it/s]

train:   0%|          | 0/12 [00:00<?, ?it/s]

Deleted iteration directory ./.art/models/2048-single-turn-interim-reward-001/0050
Packed 117 trajectories into 4 sequences of length 8192


tune:   0%|          | 0/4 [00:00<?, ?it/s]

train:   0%|          | 0/12 [00:00<?, ?it/s]

Deleted iteration directory ./.art/models/2048-single-turn-interim-reward-001/0051
Packed 75 trajectories into 4 sequences of length 8192


tune:   0%|          | 0/4 [00:00<?, ?it/s]

train:   0%|          | 0/12 [00:00<?, ?it/s]

Deleted iteration directory ./.art/models/2048-single-turn-interim-reward-001/0052
Packed 51 trajectories into 4 sequences of length 8192


tune:   0%|          | 0/4 [00:00<?, ?it/s]

train:   0%|          | 0/12 [00:00<?, ?it/s]

Deleted iteration directory ./.art/models/2048-single-turn-interim-reward-001/0053
Packed 106 trajectories into 3 sequences of length 8192


tune:   0%|          | 0/3 [00:00<?, ?it/s]

train:   0%|          | 0/12 [00:00<?, ?it/s]

Deleted iteration directory ./.art/models/2048-single-turn-interim-reward-001/0054
Packed 49 trajectories into 4 sequences of length 8192


tune:   0%|          | 0/4 [00:00<?, ?it/s]

train:   0%|          | 0/12 [00:00<?, ?it/s]

Deleted iteration directory ./.art/models/2048-single-turn-interim-reward-001/0055
Packed 103 trajectories into 4 sequences of length 8192


tune:   0%|          | 0/4 [00:00<?, ?it/s]

train:   0%|          | 0/12 [00:00<?, ?it/s]

Deleted iteration directory ./.art/models/2048-single-turn-interim-reward-001/0056
Packed 125 trajectories into 4 sequences of length 8192


tune:   0%|          | 0/4 [00:00<?, ?it/s]

train:   0%|          | 0/12 [00:00<?, ?it/s]

Deleted iteration directory ./.art/models/2048-single-turn-interim-reward-001/0057
Packed 171 trajectories into 4 sequences of length 8192


tune:   0%|          | 0/4 [00:00<?, ?it/s]

train:   0%|          | 0/12 [00:00<?, ?it/s]

Deleted iteration directory ./.art/models/2048-single-turn-interim-reward-001/0058
Packed 237 trajectories into 5 sequences of length 8192


tune:   0%|          | 0/5 [00:00<?, ?it/s]

train:   0%|          | 0/12 [00:00<?, ?it/s]

Deleted iteration directory ./.art/models/2048-single-turn-interim-reward-001/0059
Packed 200 trajectories into 4 sequences of length 8192


tune:   0%|          | 0/4 [00:00<?, ?it/s]

train:   0%|          | 0/12 [00:00<?, ?it/s]

Deleted iteration directory ./.art/models/2048-single-turn-interim-reward-001/0060
Packed 94 trajectories into 4 sequences of length 8192


tune:   0%|          | 0/4 [00:00<?, ?it/s]

train:   0%|          | 0/12 [00:00<?, ?it/s]

Deleted iteration directory ./.art/models/2048-single-turn-interim-reward-001/0061
Skipping tuning as there is no suitable data.


train:   0%|          | 0/12 [00:00<?, ?it/s]

Packed 91 trajectories into 4 sequences of length 8192


tune:   0%|          | 0/4 [00:00<?, ?it/s]

train:   0%|          | 0/12 [00:00<?, ?it/s]

Deleted iteration directory ./.art/models/2048-single-turn-interim-reward-001/0062
Packed 189 trajectories into 4 sequences of length 8192


tune:   0%|          | 0/4 [00:00<?, ?it/s]

train:   0%|          | 0/12 [00:00<?, ?it/s]

Deleted iteration directory ./.art/models/2048-single-turn-interim-reward-001/0063
Packed 108 trajectories into 4 sequences of length 8192


tune:   0%|          | 0/4 [00:00<?, ?it/s]

train:   0%|          | 0/12 [00:00<?, ?it/s]

Deleted iteration directory ./.art/models/2048-single-turn-interim-reward-001/0064
Packed 56 trajectories into 4 sequences of length 8192


tune:   0%|          | 0/4 [00:00<?, ?it/s]

train:   0%|          | 0/12 [00:00<?, ?it/s]

Deleted iteration directory ./.art/models/2048-single-turn-interim-reward-001/0065
Packed 104 trajectories into 4 sequences of length 8192


tune:   0%|          | 0/4 [00:00<?, ?it/s]

train:   0%|          | 0/12 [00:00<?, ?it/s]

Deleted iteration directory ./.art/models/2048-single-turn-interim-reward-001/0066
Packed 48 trajectories into 3 sequences of length 8192


tune:   0%|          | 0/3 [00:00<?, ?it/s]

train:   0%|          | 0/12 [00:00<?, ?it/s]

Deleted iteration directory ./.art/models/2048-single-turn-interim-reward-001/0067
Packed 90 trajectories into 4 sequences of length 8192


tune:   0%|          | 0/4 [00:00<?, ?it/s]

train:   0%|          | 0/12 [00:00<?, ?it/s]

Deleted iteration directory ./.art/models/2048-single-turn-interim-reward-001/0068
Skipping tuning as there is no suitable data.


train:   0%|          | 0/12 [00:00<?, ?it/s]

Packed 47 trajectories into 4 sequences of length 8192


tune:   0%|          | 0/4 [00:00<?, ?it/s]

train:   0%|          | 0/12 [00:00<?, ?it/s]

Deleted iteration directory ./.art/models/2048-single-turn-interim-reward-001/0069
Packed 89 trajectories into 4 sequences of length 8192


tune:   0%|          | 0/4 [00:00<?, ?it/s]

train:   0%|          | 0/12 [00:00<?, ?it/s]

Deleted iteration directory ./.art/models/2048-single-turn-interim-reward-001/0070
Skipping tuning as there is no suitable data.


train:   0%|          | 0/12 [00:00<?, ?it/s]

Packed 39 trajectories into 4 sequences of length 8192


tune:   0%|          | 0/4 [00:00<?, ?it/s]

train:   0%|          | 0/12 [00:00<?, ?it/s]

Deleted iteration directory ./.art/models/2048-single-turn-interim-reward-001/0071
Skipping tuning as there is no suitable data.


train:   0%|          | 0/12 [00:00<?, ?it/s]

Packed 34 trajectories into 4 sequences of length 8192


tune:   0%|          | 0/4 [00:00<?, ?it/s]

train:   0%|          | 0/12 [00:00<?, ?it/s]

Deleted iteration directory ./.art/models/2048-single-turn-interim-reward-001/0072
Packed 86 trajectories into 4 sequences of length 8192


tune:   0%|          | 0/4 [00:00<?, ?it/s]

train:   0%|          | 0/12 [00:00<?, ?it/s]

Deleted iteration directory ./.art/models/2048-single-turn-interim-reward-001/0073
Packed 64 trajectories into 3 sequences of length 8192


tune:   0%|          | 0/3 [00:00<?, ?it/s]

train:   0%|          | 0/12 [00:00<?, ?it/s]

Deleted iteration directory ./.art/models/2048-single-turn-interim-reward-001/0074
Skipping tuning as there is no suitable data.


train:   0%|          | 0/12 [00:00<?, ?it/s]

Packed 46 trajectories into 3 sequences of length 8192


tune:   0%|          | 0/3 [00:00<?, ?it/s]

train:   0%|          | 0/12 [00:00<?, ?it/s]

Deleted iteration directory ./.art/models/2048-single-turn-interim-reward-001/0075
Skipping tuning as there is no suitable data.


train:   0%|          | 0/12 [00:00<?, ?it/s]

Skipping tuning as there is no suitable data.


train:   0%|          | 0/12 [00:00<?, ?it/s]

Packed 83 trajectories into 4 sequences of length 8192


tune:   0%|          | 0/4 [00:00<?, ?it/s]

train:   0%|          | 0/12 [00:00<?, ?it/s]

Deleted iteration directory ./.art/models/2048-single-turn-interim-reward-001/0076
Packed 86 trajectories into 4 sequences of length 8192


tune:   0%|          | 0/4 [00:00<?, ?it/s]

train:   0%|          | 0/12 [00:00<?, ?it/s]

Deleted iteration directory ./.art/models/2048-single-turn-interim-reward-001/0077
Packed 95 trajectories into 4 sequences of length 8192


tune:   0%|          | 0/4 [00:00<?, ?it/s]

train:   0%|          | 0/12 [00:00<?, ?it/s]

Deleted iteration directory ./.art/models/2048-single-turn-interim-reward-001/0078
Packed 59 trajectories into 4 sequences of length 8192


tune:   0%|          | 0/4 [00:00<?, ?it/s]

train:   0%|          | 0/12 [00:00<?, ?it/s]

Deleted iteration directory ./.art/models/2048-single-turn-interim-reward-001/0079
Packed 135 trajectories into 4 sequences of length 8192


tune:   0%|          | 0/4 [00:00<?, ?it/s]

train:   0%|          | 0/12 [00:00<?, ?it/s]

Deleted iteration directory ./.art/models/2048-single-turn-interim-reward-001/0080
Packed 53 trajectories into 3 sequences of length 8192


tune:   0%|          | 0/3 [00:00<?, ?it/s]

train:   0%|          | 0/12 [00:00<?, ?it/s]

Deleted iteration directory ./.art/models/2048-single-turn-interim-reward-001/0081
Packed 73 trajectories into 4 sequences of length 8192


tune:   0%|          | 0/4 [00:00<?, ?it/s]

train:   0%|          | 0/12 [00:00<?, ?it/s]

Deleted iteration directory ./.art/models/2048-single-turn-interim-reward-001/0082
Skipping tuning as there is no suitable data.


train:   0%|          | 0/12 [00:00<?, ?it/s]

Packed 53 trajectories into 4 sequences of length 8192


tune:   0%|          | 0/4 [00:00<?, ?it/s]

train:   0%|          | 0/12 [00:00<?, ?it/s]