## Offline Multiplayer Eval
In this minimal tutorial, we will showcase how you can evaluate your models offline against a fixed opponent. In this case, we will pick a basket of two- and multi-player environments, run the model for a fixed number of total games, and finally show the win/loss/draw rates for each environment.


### 1. Defining parameters and loading the model
First, we need to specify the environment(s), the number of games, and the opponent model name (we recommend using OpenRouter to run the opponent model).

For the purpose of this guide, we will evaluate a quantized Qwen3 0.6B model (`Qwen/Qwen3-0.6B`) against Gemini 2.0 Flash (`google/gemini-2.0-flash-001`).

In [1]:
!pip install transformers 
!pip install -U bitsandbytes



In [2]:
import os, csv, dotenv
from tqdm import tqdm
from collections import defaultdict

import pandas as pd 
import numpy as np
import textarena as ta 

# --- evaluation settings -------------------------------------------------
# Number of games played per environment.
NUM_EPISODES = 8
# (environment id, number of players) pairs to evaluate on.
EVAL_ENV_IDS = [
    ("TicTacToe-v0", 2),
    ("DontSayIt-v0", 2),
    # ("Snake-v0", 4),  # uncomment to add a 4-player environment
]
OPPONENT_NAME = "google/gemini-2.0-flash-001"
MODEL_NAME = "Qwen/Qwen3-0.6B"

# Read variables from a local .env file; the OpenRouter API key is expected
# to live there and must be loaded before the opponent agent is created.
dotenv.load_dotenv()

# --- agents --------------------------------------------------------------
# Model under evaluation, run locally through Hugging Face.
model = ta.agents.HFLocalAgent(model_name=MODEL_NAME, quantize=True, max_new_tokens=512)
# Fixed opponent, served remotely through OpenRouter.
opponent = ta.agents.OpenRouterAgent(model_name=OPPONENT_NAME)

  from .autonotebook import tqdm as notebook_tqdm
The `load_in_4bit` and `load_in_8bit` arguments are deprecated and will be removed in the future versions. Please, pass a `BitsAndBytesConfig` object in `quantization_config` argument instead.
Device set to use cuda:0


### 2. Defining the Game Loop
Now we can define the actual game loop. It should take both models as input, randomly allocate roles, initialize the environment, play the game, and return the outcome.

In [3]:
def run_game_loop(env_id, num_players, model, opponent):
    """Play a single game of ``env_id`` and report the resulting rewards.

    The model under evaluation is seated at a uniformly random player id;
    every other seat is played by ``opponent``.

    Args:
        env_id: Environment id handed to ``ta.make``.
        num_players: Number of players the environment is reset with.
        model: Callable mapping an observation to an action (the agent
            being evaluated).
        opponent: Callable mapping an observation to an action; it plays
            all seats not held by ``model``.

    Returns:
        A dict with two keys: ``"model_reward"``, the reward of the seat
        played by ``model``, and ``"opponent_reward"``, the mean reward over
        all other seats (free-for-all games are thereby treated as
        two-player games).
    """
    # build the environment (already wrapped with the default wrappers)
    env = ta.make(env_id)
    env.reset(num_players=num_players)

    # randomly allocate the player id to the model to be evaluated
    model_player_id = np.random.randint(0, num_players)

    done = False
    while not done:
        pid, observation = env.get_observation()
        # whoever owns the current seat picks the action
        acting_agent = model if pid == model_player_id else opponent
        action = acting_agent(observation)

        # step in the env; the second return value is not needed here
        done, _ = env.step(action=action)

    # final rewards; assumed to be indexable by player id -- TODO confirm
    # against the installed textarena version
    rewards = env.close()

    # np.mean needs a real sequence: a bare generator expression is not
    # iterated by NumPy and makes the call raise a TypeError.
    opponent_rewards = [
        rewards[player_id]
        for player_id in range(num_players)
        if player_id != model_player_id
    ]

    return {
        "model_reward": rewards[model_player_id],
        "opponent_reward": np.mean(opponent_rewards),
    }

### 3. Putting it all Together
Finally, we can write our main loop that will iterate over all envs for the specified number of games and track the outcomes.

In [None]:
# Column-oriented summary: one entry per evaluated environment in each list
results = defaultdict(list)

# Outer bar walks the environments; a nested, transient bar tracks episodes
env_progress = tqdm(EVAL_ENV_IDS, desc="Environments")
for env_id, num_players in env_progress:
    episode_progress = tqdm(range(NUM_EPISODES), desc=f"Evaluating {env_id}", leave=False)
    env_results = {
        "wins": 0,
        "losses": 0,
        "draws": 0,
        "total_reward_model": 0.0,
        "total_reward_opponent": 0.0,
        # NOTE: incremented once per finished game, i.e. this is a game
        # counter, so "avg_game_length" below is 1.0 whenever every episode
        # completes.
        "total_length": 0,
    }
    for _ in episode_progress:
        outcome = run_game_loop(env_id=env_id, num_players=num_players, model=model, opponent=opponent)
        model_reward, opponent_reward = outcome["model_reward"], outcome["opponent_reward"]

        # Classify the game from the model's point of view
        if model_reward > opponent_reward:
            verdict = "wins"
        elif model_reward < opponent_reward:
            verdict = "losses"
        else:
            verdict = "draws"
        env_results[verdict] += 1

        env_results["total_reward_model"] += model_reward
        env_results["total_reward_opponent"] += opponent_reward
        env_results["total_length"] += 1

        # Running statistics over the games played so far in this environment
        games_so_far = env_results["total_length"]
        avg_reward_model = env_results["total_reward_model"] / games_so_far
        avg_reward_opponent = env_results["total_reward_opponent"] / games_so_far

        episode_progress.set_postfix({
            "Win%": f"{env_results['wins'] / games_so_far:.2%}",
            "Loss%": f"{env_results['losses'] / games_so_far:.2%}",
            "Draw%": f"{env_results['draws'] / games_so_far:.2%}",
            "Model R": f"{avg_reward_model:.2f}",
            "Opp R": f"{avg_reward_opponent:.2f}",
        })

    # Record this environment's row; every tally is normalised by NUM_EPISODES
    results["env_id"].append(env_id)
    for column, tally_key in (
        ("win_rate", "wins"),
        ("loss_rate", "losses"),
        ("draw_rate", "draws"),
        ("avg_model_reward", "total_reward_model"),
        ("avg_opponent_reward", "total_reward_opponent"),
        ("avg_game_length", "total_length"),
    ):
        results[column].append(env_results[tally_key] / NUM_EPISODES)



# Render the summary as a table in the notebook
df = pd.DataFrame(results)
from IPython.display import display
display(df)

# Persist the summary next to the notebook
os.makedirs("eval_results", exist_ok=True)
df.to_csv("eval_results/eval_summary.csv", index=False)

Environments:   0%|          | 0/2 [02:01<?, ?it/s]
