In [1]:
from __future__ import annotations

import glob
import os
import time

from pettingzoo.test import api_test
import pettingzoo
import gymnasium as gym

from sb3_contrib import MaskablePPO
from sb3_contrib.common.maskable.policies import MaskableActorCriticPolicy
from sb3_contrib.common.wrappers import ActionMasker

from lib.briscola_env.briscola_env import BriscolaEnv

In [2]:
# Run PettingZoo's API conformance test against a fresh environment before
# building anything on top of it.
env = BriscolaEnv()
api_test(env, num_cycles=1_000)

Starting API test
Passed API test




In [3]:
# To pass this into other Gymnasium wrappers, PettingZoo's wrapper must also be a
# Gymnasium Env, so we subclass gym.Env as well.
class SB3ActionMaskWrapper(pettingzoo.utils.BaseWrapper, gym.Env):
    """Adapt a PettingZoo environment so SB3's illegal-action masking can drive it.

    The wrapped environment's observations are indexed with an ``"observation"``
    key and an ``"action_mask"`` key; this wrapper serves the two parts through
    separate methods (``observe`` and ``action_mask``).
    """

    def reset(self, seed=None, options=None):
        """Reset the wrapped environment and return ``(observation, info)``.

        As a side effect, ``observation_space`` and ``action_space`` become plain
        attributes taken from the first possible agent. SB3 is designed for
        single-agent RL and does not expect the spaces to be per-agent functions.
        """
        super().reset(seed, options)

        reference_agent = self.possible_agents[0]
        # Keep only the "observation" part of the space; the mask is served separately.
        agent_obs_space = super().observation_space(reference_agent)
        self.observation_space = agent_obs_space["observation"]
        self.action_space = super().action_space(self.possible_agents[0])

        # PettingZoo AEC resets do not return anything by default, so build the
        # Gymnasium-style (observation, info) pair here.
        first_observation = self.observe(self.agent_selection)
        return first_observation, {}

    def step(self, action):
        """Apply ``action`` for the current agent and return a Gymnasium-style 5-tuple.

        The observation belongs to the agent that acts next, so it can be used to
        choose the next action. Reward, termination, truncation and info belong to
        the agent that just acted.
        """
        acting_agent = self.agent_selection

        super().step(action)

        # The selection has advanced; observe from the upcoming agent's seat.
        upcoming_agent = self.agent_selection
        observation = self.observe(upcoming_agent)
        reward = self._cumulative_rewards[acting_agent]
        terminated = self.terminations[acting_agent]
        truncated = self.truncations[acting_agent]
        info = self.infos[acting_agent]
        return (observation, reward, terminated, truncated, info)

    def observe(self, agent):
        """Return ``agent``'s raw observation with the action mask stripped off."""
        full_observation = super().observe(agent)
        return full_observation["observation"]

    def action_mask(self):
        """Return the action mask of the agent whose turn it currently is."""
        full_observation = super().observe(self.agent_selection)
        return full_observation["action_mask"]

In [4]:
def mask_fn(env):
    """Return the current action mask by delegating to ``env.action_mask()``."""
    current_mask = env.action_mask()
    return current_mask

def train(
    steps: int = 10_000, seed: int | None = 0, **env_kwargs
):
    """Train a single MaskablePPO policy on the Briscola environment and save it.

    The same model supplies the action for whichever agent is currently selected,
    because the wrapped environment is stepped one agent at a time.

    Args:
        steps: Value passed to ``model.learn(total_timesteps=...)``.
        seed: Seed used for the initial environment reset and for the model's RNG.
        **env_kwargs: Forwarded to the ``BriscolaEnv`` constructor.

    Returns:
        None. The model is written to ``<env name>_<YYYYmmdd-HHMMSS>`` in the
        current working directory.
    """
    env = BriscolaEnv(**env_kwargs)
    env = SB3ActionMaskWrapper(env)
    # reset() is what assigns the wrapper's observation_space/action_space
    # attributes, so it has to run before the env is handed to anything else.
    env.reset(seed=seed)
    env = ActionMasker(env, mask_fn)

    # Read the name from the innermost environment: the outer wrapper's own
    # metadata does not carry it.
    env_name = env.unwrapped.metadata.get("name")

    try:
        print(f"Starting training on {env_name}.")
        model = MaskablePPO(MaskableActorCriticPolicy, env, verbose=1)
        model.set_random_seed(seed)
        model.learn(total_timesteps=steps)
        model.save(f"{env_name}_{time.strftime('%Y%m%d-%H%M%S')}")

        print("Model has been saved.")
        print(f"Finished training on {env_name}.\n")
    finally:
        # Release the environment even when training or saving fails.
        env.close()



In [5]:
# Short smoke-test run. The captured log reports total_timesteps = 2048 for this
# request of 1000 steps -- presumably the learner finishes a whole rollout
# before stopping; TODO confirm against the SB3 documentation.
train(steps=1_000)

Starting training on {'render_modes': []}.
Using cuda device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 40       |
|    ep_rew_mean     | 605      |
| time/              |          |
|    fps             | 344      |
|    iterations      | 1        |
|    time_elapsed    | 5        |
|    total_timesteps | 2048     |
---------------------------------
Model has been saved.
Finished training on briscola.



In [8]:
def baseline(num_games = 100):
    """Play ``num_games`` games in which every agent samples a random legal action.

    Game ``i`` is reset with ``seed=i``. When the first terminated or truncated
    agent comes up, the agent with the largest entry in ``env.rewards`` is taken
    as the winner and every agent's entry in ``env.rewards`` is added to its
    running total.

    Args:
        num_games: Number of games to play.

    Returns:
        A list with one ``{"winrate": value}`` dict per entry of
        ``env.possible_agents``, in that order. Each winrate is the agent's share
        of all recorded wins, or 0 when no wins were recorded.
    """
    env = BriscolaEnv()
    print("Starting baseline evalutation using all random agents")
    wins = {agent: 0 for agent in env.possible_agents}
    total_rewards = {agent: 0 for agent in env.possible_agents}
    try:
        for i in range(num_games):
            env.reset(seed=i)

            for agent in env.agent_iter():
                obs, _reward, termination, truncation, _info = env.last()
                if termination or truncation:
                    winner = max(env.rewards, key=env.rewards.get)
                    # Only the agent with the largest reward is credited.
                    # NOTE(review): the meaning of infos[winner]["wins"] is
                    # defined by BriscolaEnv -- confirm it is 1 per game won.
                    wins[winner] += env.infos[winner]["wins"]
                    # Also track negative and positive rewards (penalizes illegal moves)
                    for a in env.possible_agents:
                        total_rewards[a] += env.rewards[a]
                    break
                # Look the mask up by key rather than relying on dict ordering.
                act = env.action_space(agent).sample(obs["action_mask"])
                env.step(act)
    finally:
        # Release the environment even if a game raises part-way through.
        env.close()

    total_wins = sum(wins.values())
    player_results = []
    print("Winrates:")
    # Report over possible_agents: it matches the keys of `wins`, whereas
    # env.agents is turn-time state that may not exist if no game was played.
    for p in env.possible_agents:
        winrate = wins[p] / total_wins if total_wins != 0 else 0
        print(f"\t{p}: {winrate*100}%")
        print(f"\tWins: {wins[p]} Rewards: {total_rewards[p]}")
        player_results.append({"winrate": winrate})
    return player_results


def eval_action_mask(player, num_games=100):
    """Evaluate the most recently saved policy in one seat against random agents.

    The newest ``<env name>*.zip`` file in the current working directory is loaded
    as a MaskablePPO model. That model picks the action for
    ``env.possible_agents[player]``; every other agent samples a random legal
    action. Game ``i`` is reset with ``seed=i``.

    Args:
        player: Index into ``env.possible_agents`` of the seat the trained policy plays.
        num_games: Number of games to play.

    Returns:
        A list with one ``{"winrate": value}`` dict per entry of
        ``env.possible_agents``, in that order. Each winrate is the agent's share
        of all recorded wins, or 0 when no wins were recorded.

    Raises:
        FileNotFoundError: If no saved policy file matches the environment name.
    """
    # Evaluate a trained agent vs a random agent
    env = BriscolaEnv()
    wins = {agent: 0 for agent in env.possible_agents}
    total_rewards = {agent: 0 for agent in env.possible_agents}
    try:
        trained_agent = env.possible_agents[player]
        print(
            f"Starting evaluation vs random agents. Trained agent will play as {trained_agent}."
        )

        policy_pattern = f"{env.metadata['name']}*.zip"
        saved_policies = glob.glob(policy_pattern)
        if not saved_policies:
            # Raise instead of calling exit(): in a notebook exit() shuts the
            # kernel down, and exit status 0 would signal success.
            raise FileNotFoundError(
                f"Policy not found: no file matches {policy_pattern!r}."
            )
        latest_policy = max(saved_policies, key=os.path.getctime)
        print("using", latest_policy)
        model = MaskablePPO.load(latest_policy)

        for i in range(num_games):
            env.reset(seed=i)

            for agent in env.agent_iter():
                obs, _reward, termination, truncation, _info = env.last()

                if termination or truncation:
                    winner = max(env.rewards, key=env.rewards.get)
                    # Only the agent with the largest reward is credited.
                    # NOTE(review): the meaning of infos[winner]["wins"] is
                    # defined by BriscolaEnv -- confirm it is 1 per game won.
                    wins[winner] += env.infos[winner]["wins"]
                    # Also track negative and positive rewards (penalizes illegal moves)
                    for a in env.possible_agents:
                        total_rewards[a] += env.rewards[a]
                    break

                # Separate observation and action mask by key rather than
                # relying on dict ordering.
                observation = obs["observation"]
                action_mask = obs["action_mask"]

                if agent != trained_agent:
                    act = env.action_space(agent).sample(action_mask)
                else:
                    # predict() returns (action, state); the action is cast to a
                    # plain int before being passed to env.step().
                    act = int(
                        model.predict(
                            observation, action_masks=action_mask, deterministic=True
                        )[0]
                    )
                env.step(act)
    finally:
        # Release the environment on every exit path, including a missing policy.
        env.close()

    total_wins = sum(wins.values())
    player_results = []
    print("Winrates:")
    # Report over possible_agents: it matches the keys of `wins`, whereas
    # env.agents is turn-time state that may not exist if no game was played.
    for p in env.possible_agents:
        winrate = wins[p] / total_wins if total_wins != 0 else 0
        print(f"\t{p}: {winrate*100}%")
        print(f"\tWins: {wins[p]} Rewards: {total_rewards[p]}")
        player_results.append({"winrate": winrate})
    return player_results

In [None]:
NUM_EVAL_GAMES = 500

# Evaluate the trained policy once per seat (positions 0-3); each entry of
# results_by_position is the per-player result list for that seat.
results_by_position = []
for position in range(4):
    print(f"--- Testing position {position} ---")
    results_by_position.append(
        eval_action_mask(position, num_games=NUM_EVAL_GAMES)
    )

--- Testing position 0 ---
Starting evaluation vs random agents. Trained agent will play as player_0.
using briscola_20250427-153905.zip
Winrates:
	player_0: 30.4%
	Wins: 152 Rewards: 53683
	player_1: 19.400000000000002%
	Wins: 97 Rewards: 38123
	player_2: 25.8%
	Wins: 129 Rewards: 48340
	player_3: 24.4%
	Wins: 122 Rewards: 44854
--- Testing position 1 ---
Starting evaluation vs random agents. Trained agent will play as player_1.
using briscola_20250427-153905.zip
Winrates:
	player_0: 31.0%
	Wins: 155 Rewards: 53945
	player_1: 22.0%
	Wins: 110 Rewards: 42268
	player_2: 26.8%
	Wins: 134 Rewards: 48469
	player_3: 20.200000000000003%
	Wins: 101 Rewards: 40318
--- Testing position 2 ---
Starting evaluation vs random agents. Trained agent will play as player_2.
using briscola_20250427-153905.zip
Winrates:
	player_0: 31.0%
	Wins: 155 Rewards: 54304
	player_1: 20.200000000000003%
	Wins: 101 Rewards: 38938
	player_2: 27.0%
	Wins: 135 Rewards: 50160
	player_3: 21.8%
	Wins: 109 Rewards: 41598
--

In [None]:
# All-random reference run over the same number of games as the evaluation above.
baseline_results = baseline(num_games=NUM_EVAL_GAMES)

# Dump the per-seat evaluation results collected earlier for comparison.
print(results_by_position)

Starting baseline evalutation using all random agents
Winrates:
	player_0: 32.6%
	Wins: 163 Rewards: 56334
	player_1: 19.400000000000002%
	Wins: 97 Rewards: 38143
	player_2: 25.4%
	Wins: 127 Rewards: 47733
	player_3: 22.6%
	Wins: 113 Rewards: 42790
[[{'winrate': 0.304}, {'winrate': 0.194}, {'winrate': 0.258}, {'winrate': 0.244}], [{'winrate': 0.31}, {'winrate': 0.22}, {'winrate': 0.268}, {'winrate': 0.202}], [{'winrate': 0.31}, {'winrate': 0.202}, {'winrate': 0.27}, {'winrate': 0.218}], [{'winrate': 0.298}, {'winrate': 0.23}, {'winrate': 0.272}, {'winrate': 0.2}]]
