<a href="https://colab.research.google.com/github/zhus-dika/togyz-qumalaq-agent/blob/main/togyzqumalaq_aec_vs_random_policy.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

#  🐘 AEC environment https://pettingzoo.farama.org/api/aec/#about-aec

###  🦜 Install need packages

In [1]:
!pip install gymnasium
!pip install pettingzoo[classic]==1.23.0

Collecting gymnasium
  Downloading gymnasium-0.29.1-py3-none-any.whl (953 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m953.9/953.9 kB[0m [31m5.7 MB/s[0m eta [36m0:00:00[0m
Collecting farama-notifications>=0.0.1 (from gymnasium)
  Downloading Farama_Notifications-0.0.4-py3-none-any.whl (2.5 kB)
Installing collected packages: farama-notifications, gymnasium
Successfully installed farama-notifications-0.0.4 gymnasium-0.29.1
Collecting pettingzoo[classic]==1.23.0
  Downloading pettingzoo-1.23.0-py3-none-any.whl (826 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m826.8/826.8 kB[0m [31m3.2 MB/s[0m eta [36m0:00:00[0m
Collecting chess==1.7.0 (from pettingzoo[classic]==1.23.0)
  Downloading chess-1.7.0-py3-none-any.whl (147 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m147.1/147.1 kB[0m [31m11.5 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting rlcard==1.0.5 (from pettingzoo[classic]==1.23.0)
  Downloading rlcard-1.0.5.t

### 🐞 Imports

In [2]:
import functools

import gymnasium
import numpy as np
from gymnasium.spaces import Discrete, MultiDiscrete
from gymnasium import spaces

from IPython.display import clear_output
import time
from pettingzoo import AECEnv
from pettingzoo.utils import agent_selector, wrappers
import matplotlib.pyplot as plt

NUM_ITERS = 400
PLAYS = {"bastaushy": 0, "qostaushy": 0}

### 🦉 Create environment

In [56]:
def env(render_mode=None):
    """
    The env function often wraps the environment in wrappers by default.
    You can find full documentation for these methods
    elsewhere in the developer documentation.
    """
    internal_render_mode = render_mode if render_mode != "ansi" else "human"
    env = raw_env(render_mode=internal_render_mode)
    # This wrapper is only for environments which print results to the terminal
    if render_mode == "ansi":
        env = wrappers.CaptureStdoutWrapper(env)
    # this wrapper helps error handling for discrete action spaces
    env = wrappers.AssertOutOfBoundsWrapper(env)
    # Provides a wide vareity of helpful user errors
    # Strongly recommended
    env = wrappers.OrderEnforcingWrapper(env)
    return env


class raw_env(AECEnv):
    """
    The metadata holds environment constants. From gymnasium, we inherit the "render_modes",
    metadata which specifies which modes can be put into the render() method.
    At least human mode should be supported.
    The "name" metadata allows the environment to be pretty printed.
    """

    metadata = {
        "render_modes": ["ansi", "human"],
        "name": "togyzqumalaq_v0"
        }

    def __init__(self, render_mode=None):
        """
        The init method takes in environment arguments and
         should define the following attributes:
        - otaular
        - tuzdyq
        - qazandar
        - possible_agents
        - render_mode

        Note: as of v1.18.1, the action_spaces and observation_spaces attributes are deprecated.
        Spaces should be defined in the action_space() and observation_space() methods.
        If these methods are not overridden, spaces will be inferred from self.observation_spaces/action_spaces, raising a warning.

        These attributes should not be changed after initialization.
        """
        self.otaular = []
        self.tuzdyq = []
        self.qazandar = []
        self.direction = []
        self.agents = ["bastaushy", "qostaushy"]
        self.possible_agents = self.agents[:]
        # optional: we can define the observation and action spaces here as attributes to be used in their corresponding methods
        self.action_spaces = {i: spaces.Discrete(9) for i in self.agents}
        self.observation_spaces = {
            i: spaces.Dict(
                {
                    "observation": MultiDiscrete([100] * 18 + [9] * 2 + [82] * 2),
                    "action_mask": Discrete(9),
                }
            )
            for i in self.agents
        }
        self.render_mode = render_mode

    # Observation space should be defined here.
    def action_space(self, agent):
        return self.action_spaces[agent]

    # Action space should be defined here.
    def observation_space(self, agent):
        return self.observation_spaces[agent]

    def render(self):
        """
        Renders the environment. In human mode, it can print to terminal, open
        up a graphical window, or open up some other display that a human can see and understand.
        """
        """Renders the environment."""
        if self.render_mode is None:
            gymnasium.logger.warn(
                "You are calling render method without specifying any render mode."
            )
            return

        if len(self.agents) == 2:
            points_bastaushy_x = np.array([i * 2 for i in range(10)])
            points_bastaushy_y = np.array([i % 5 for i in range(50)])

            x = np.arange(-3, 225, 1)
            y = -1

            text_kwargs = dict(ha='center', va='center', fontsize=12)
            plt.figure(figsize=(17, 6))

            for i in range(9):
                # qostaushy's part
                plt.scatter(np.repeat(points_bastaushy_x + 25 * i, 5)[:self.otaular[17 - i]], points_bastaushy_y[:self.otaular[17 - i]], marker='o')
                # horizontal line
                plt.plot(x, np.repeat(y, len(x)))
                # vertical lines
                plt.plot(np.repeat(25 * i - 2, len(x)), np.arange(-7, 5, 12 / len(x)))
                # bastaushy's part
                plt.scatter(np.repeat(points_bastaushy_x + 25 * i, 5)[:self.otaular[i]], points_bastaushy_y[:self.otaular[i]] - 6, marker='o')

            #last vertical line
            plt.plot(np.repeat(25 * 9 - 2, len(x)), np.arange(-7, 5, 12 / len(x)))

            for i in range(9):
                # bastaushy's qumalaqtar
                plt.text(25 * i + 10, -7, f'{i} ({self.otaular[i]})', **text_kwargs)
                # qostaushy's qumalaqtar
                plt.text(25 * i + 10, 5, f'{17 - i} ({self.otaular[17 - i]})', **text_kwargs)
            # bastaushy qazan's qumalaqtar
            plt.text(230, -4, f'qazan: {self.qazandar[0]}', **text_kwargs)
            # qostaushy qazan's qumalaqtar
            plt.text(230, 2, f'qazan: {self.qazandar[1]}', **text_kwargs);
            # bastaushy tuzdyq's qumalaqtar
            plt.text(230, -6, f'tuzdyq: {self.tuzdyq[0]}', **text_kwargs)
            # qostaushy tuzdyq's qumalaqtar
            plt.text(230, 0, f'tuzdyq: {self.tuzdyq[1]}', **text_kwargs);
            plt.show()
        else:
            if self.render_mode == "human":
                print("Game over")
        time.sleep(2)
        clear_output(True)

    def _legal_moves(self, agent):
        cur_player = self.possible_agents.index(agent)
        opp_player = (cur_player + 1) % 2
        return [item for item in range(9 * cur_player, (cur_player + 1) * 9) if self.tuzdyq[opp_player] != item and self.otaular[item] > 0]

    def observe(self, agent):
        """
        Observe should return the observation of the specified agent. This function
        should return a sane observation (though not necessarily the most up to date possible)
        at any time after reset() is called.
        """
        # observation of one agent is the previous state of the other
        legal_moves = self._legal_moves(agent) if agent == self.agent_selection else []
        action_mask = np.zeros(9, "int8")
        if self.possible_agents.index(agent) == 1:
            legal_moves = [i - 9 for i in legal_moves]
        for i in legal_moves:
            action_mask[i] = 1
        observation = tuple(
            self.otaular + self.tuzdyq + self.qazandar
        )
        return {"observation": observation, "action_mask": action_mask}

    def close(self):
        """
        Close should release any graphical displays, subprocesses, network connections
        or any other environment data which should not be kept around after the
        user is no longer using the environment.
        """
        pass

    def reset(self, seed=None, options=None):
        """
        Reset needs to initialize the following attributes
        - agents
        - rewards
        - _cumulative_rewards
        - terminations
        - truncations
        - infos
        - agent_selection
        And must set up the environment so that render(), step(), and observe()
        can be called without issues.
        Here it sets up the state dictionary which is used by step() and the observations dictionary which is used by step() and observe()
        """
        self.agents = self.possible_agents[:]
        self.rewards = {agent: 0 for agent in self.agents}
        self._cumulative_rewards = {agent: 0 for agent in self.agents}
        self.otaular = [9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9]
        self.direction = [list(range(18)), [9, 10, 11, 12, 13, 14, 15, 16, 17, 0, 1, 2, 3, 4, 5, 6, 7, 8]]
        self.tuzdyq = [-1, -1]
        self.qazandar = [0, 0]
        self.terminations = {agent: False for agent in self.agents}
        self.truncations = {agent: False for agent in self.agents}
        self.infos = {agent: {} for agent in self.agents}
        self.num_moves = 0
        observation = tuple(
            self.otaular + self.tuzdyq + self.qazandar
        )
        self.observations = {agent: observation for agent in self.agents}
        """
        Our agent_selector utility allows easy cyclic stepping through the agents list.
        """
        self._agent_selector = agent_selector(self.agents)
        self.agent_selection = self._agent_selector.next()

    def step(self, action):
        """
        step(action) takes in an action for the current agent (specified by
        agent_selection) and needs to update
        - rewards
        - _cumulative_rewards (accumulating the rewards)
        - terminations
        - truncations
        - infos
        - agent_selection (to the next agent)
        And any internal state used by observe() or render()
        """
        if (
            self.terminations[self.agent_selection]
            or self.truncations[self.agent_selection]
        ):
            # handles stepping an agent which is already dead
            # accepts a None action for the one agent, and moves the agent_selection to
            # the next dead agent,  or if there are no more dead agents, to the next live agent
            self._was_dead_step(action)
            return

        cur_player = self.possible_agents.index(self.agent_selection)
        opp_player = (cur_player + 1) % 2
        self.num_moves += 1
        if self.render_mode == "human":
            print(f'MOVE #{self.num_moves}')
        # The truncations dictionary must be updated for all players.
        self.truncations = {
            agent: self.num_moves >= NUM_ITERS for agent in self.agents
        }
        # distribute qumalaqs
        if cur_player == 1:
            action += 9
        if self.render_mode == "human":
            print(f'{self.agent_selection} made action {action}')
        num_qumalaq = self.otaular[action]
        idx_action = self.direction[cur_player].index(action)
        if self.otaular[action] == 1:
            self.otaular[self.direction[cur_player][idx_action + 1]] += 1
            self.otaular[action] -= 1
        else:
            i = 1
            while self.otaular[action] > 1:
                self.otaular[self.direction[cur_player][(idx_action + i) % 18]] += 1
                self.otaular[action] -= 1
                i += 1
        # check tuzdyq & add rewards to qazandar
        reward = 0
        if self.check_tuzdyq(self.agent_selection, action):
            reward += 3
            if self.render_mode == "human":
                print(f'{self.agent_selection} won tuzdyq {reward}')
        else:

            if num_qumalaq > 1:
                last_otau = self.direction[cur_player][(idx_action + num_qumalaq - 1) % 18]
            else:
                last_otau = self.direction[cur_player][(idx_action + num_qumalaq) % 18]

            if last_otau in range(opp_player * 9, (opp_player + 1) * 9) and self.otaular[last_otau] % 2 == 0:
                reward += self.otaular[last_otau]
                if self.render_mode == "human":
                    print(f'{self.agent_selection} won {reward}')
                self.otaular[last_otau] = 0
            if self.tuzdyq[cur_player] >= 0 and self.otaular[self.tuzdyq[cur_player]] > 0:
                reward += self.otaular[self.tuzdyq[cur_player]]
                if self.render_mode == "human":
                    print(f'{self.agent_selection} won tuzdyq {self.otaular[self.tuzdyq[cur_player]]}')
                self.otaular[self.tuzdyq[cur_player]] = 0
        if self.render_mode == "human":
            print(f'{self.agent_selection} won total {reward}')
        self.qazandar[cur_player] += reward
        self.rewards[self.agent_selection] += reward
        # check if there is a winner
        winner = self.check_for_winner()
        if winner:
            self.terminations = {i: True for i in self.agents}
            if self.render_mode == "human":
                print(f'{self.agent_selection} won the game!!!')
        # selects the next agent.
        self.agent_selection = self._agent_selector.next()
        # Adds .rewards to ._cumulative_rewards
        self._accumulate_rewards()

        if self.render_mode == "human":
            self.render()

    def check_tuzdyq(self, agent, action):
        cur_player = self.possible_agents.index(agent)
        opp_player = (cur_player + 1) % 2
        idx = self.direction[cur_player].index(action)
        num_qumalaq = self.otaular[action]

        if num_qumalaq > 1:
            last_otau = self.direction[cur_player][(idx + num_qumalaq - 1) % 18]
        else:
            last_otau = self.direction[cur_player][(idx + num_qumalaq) % 18]

        if last_otau in range(opp_player * 9, (opp_player + 1) * 9) and self.otaular[last_otau] == 3 and last_otau != 17 - cur_player * 9 and abs(last_otau - self.tuzdyq[opp_player]) != 9:
            self.tuzdyq[cur_player] = last_otau
            self.otaular[last_otau] = 0
            if self.render_mode == "human":
                print(f'{agent} got tuzdyq {last_otau}!')
            return True

        return False

    def check_atsyrau(self, agent):
        cur_player = self.possible_agents.index(agent)
        opp_player = (cur_player + 1) % 2

        for idx, i in enumerate(self.otaular[cur_player * 9: (cur_player + 1) * 9]):
            if i > 0 and idx + cur_player * 9 != self.tuzdyq[opp_player]:
                return False
        if self.render_mode == "human":
            print(f'{agent} reached atsyrau')
        return True

    def check_for_winner(self):
        cur_player = self.possible_agents.index(self.agent_selection)
        opp_player = (cur_player + 1) % 2
        if self.qazandar[cur_player] > 81:
            PLAYS[self.agent_selection] += 1
            return True
        if self.check_atsyrau(self.possible_agents[opp_player]) and self.qazandar[opp_player] <= 81:
            PLAYS[self.agent_selection] += 1
            return True
        return False

### 🦚 Testing environment

In [None]:
# env = env(render_mode="human")
# env.reset(seed=42)

# for agent in env.agent_iter():
#     observation, reward, termination, truncation, info = env.last()

#     if termination or truncation:
#         action = None
#     else:
#         mask = observation["action_mask"]
#         # this is where you would insert your policy
#         action = env.action_space(agent).sample(mask)

#     env.step(action)
# env.close()

# 🐼 DQN agent to play vs a random policy agent https://pettingzoo.farama.org/tutorials/tianshou/intermediate/

### 🐡 Additional installations https://github.com/thu-ml/tianshou

In [5]:
!pip install packaging==21.3
!pip install tianshou==0.5.0

Collecting packaging==21.3
  Downloading packaging-21.3-py3-none-any.whl (40 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/40.8 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m40.8/40.8 kB[0m [31m937.2 kB/s[0m eta [36m0:00:00[0m
Installing collected packages: packaging
  Attempting uninstall: packaging
    Found existing installation: packaging 24.0
    Uninstalling packaging-24.0:
      Successfully uninstalled packaging-24.0
Successfully installed packaging-21.3


Collecting tianshou==0.5.0
  Downloading tianshou-0.5.0-py3-none-any.whl (162 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m162.7/162.7 kB[0m [31m1.9 MB/s[0m eta [36m0:00:00[0m
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch>=1.4.0->tianshou==0.5.0)
  Using cached nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (23.7 MB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch>=1.4.0->tianshou==0.5.0)
  Using cached nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (823 kB)
Collecting nvidia-cuda-cupti-cu12==12.1.105 (from torch>=1.4.0->tianshou==0.5.0)
  Using cached nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (14.1 MB)
Collecting nvidia-cudnn-cu12==8.9.2.26 (from torch>=1.4.0->tianshou==0.5.0)
  Using cached nvidia_cudnn_cu12-8.9.2.26-py3-none-manylinux1_x86_64.whl (731.7 MB)
Collecting nvidia-cublas-cu12==12.1.3.1 (from torch>=1.4.0->tianshou==0.5.0)
  Using cached nvidia_cublas_cu12-12.1.3.1-py3-no

### 🐝 Imports

In [7]:
import os
from typing import Optional, Tuple

import gymnasium
import numpy as np
import torch
from copy import deepcopy
from tianshou.data import Collector, VectorReplayBuffer
from tianshou.env import DummyVectorEnv
from tianshou.env.pettingzoo_env import PettingZooEnv
from tianshou.policy import BasePolicy, DQNPolicy, RainbowPolicy, MultiAgentPolicyManager, RandomPolicy
from tianshou.trainer import offpolicy_trainer
from tianshou.utils.net.common import Net

### 🐎 Load trained agents

In [10]:
agent1_path = "policy_128x256x256x128_bs64.pth"
agent2_path = "policy_256x512x512x256_bs128.pth"
agent3_path = "policy_512x1024x1024x512_bs128.pth"
agent4_path = "policy_128x256x512x256x128_trained_128x256x256x128.pth"

env = PettingZooEnv(env())
net1 = Net(
            state_shape=(22,),
            action_shape=env.action_space.shape or env.action_space.n,
            hidden_sizes=[128, 256, 256, 128],
            #hidden_sizes=[1024, 2048, 2048, 1024],
            device="cuda" if torch.cuda.is_available() else "cpu",
        ).to("cuda" if torch.cuda.is_available() else "cpu")

agent1 = DQNPolicy(
            model=net1,
            optim = torch.optim.Adam(net1.parameters(), lr=1e-4),
            discount_factor=0.9,
            estimation_step=3,
            target_update_freq=320,
        ).to("cuda" if torch.cuda.is_available() else "cpu")

agent1_learned = deepcopy(agent1)
agent1_learned.load_state_dict(torch.load(agent1_path))


net2 = Net(
            state_shape=(22,),
            action_shape=env.action_space.shape or env.action_space.n,
            hidden_sizes=[256, 512, 512, 256],
            device="cuda" if torch.cuda.is_available() else "cpu",
        ).to("cuda" if torch.cuda.is_available() else "cpu")

agent2 = DQNPolicy(
            model=net2,
            optim = torch.optim.Adam(net1.parameters(), lr=1e-4),
            discount_factor=0.9,
            estimation_step=3,
            target_update_freq=320,
        ).to("cuda" if torch.cuda.is_available() else "cpu")
agent2_learned = deepcopy(agent2)
agent2_learned.load_state_dict(torch.load(agent2_path))


net3 = Net(
            state_shape=(22,),
            action_shape=env.action_space.shape or env.action_space.n,
            hidden_sizes=[512, 1024, 1024, 512],
            device="cuda" if torch.cuda.is_available() else "cpu",
        ).to("cuda" if torch.cuda.is_available() else "cpu")

agent3 = DQNPolicy(
            model=net3,
            optim = torch.optim.Adam(net3.parameters(), lr=1e-4),
            discount_factor=0.9,
            estimation_step=3,
            target_update_freq=320,
        ).to("cuda" if torch.cuda.is_available() else "cpu")

agent3_learned = deepcopy(agent3)
agent3_learned.load_state_dict(torch.load(agent3_path))


net4 = Net(
            state_shape=(22,),
            action_shape=env.action_space.shape or env.action_space.n,
            hidden_sizes=[128, 256, 512, 256, 128],
            device="cuda" if torch.cuda.is_available() else "cpu",
        ).to("cuda" if torch.cuda.is_available() else "cpu")

agent4 = DQNPolicy(
            model=net4,
            optim = torch.optim.Adam(net3.parameters(), lr=1e-4),
            discount_factor=0.9,
            estimation_step=3,
            target_update_freq=320,
        ).to("cuda" if torch.cuda.is_available() else "cpu")

agent4_learned = deepcopy(agent4)
agent4_learned.load_state_dict(torch.load(agent4_path))

<All keys matched successfully>

### 🐫 Prepare main functions

In [12]:
def _get_agents(
    agent_learn: Optional[BasePolicy] = None,
    agent_opponent: Optional[BasePolicy] = None,
    optim: Optional[torch.optim.Optimizer] = None,
) -> Tuple[BasePolicy, torch.optim.Optimizer, list]:
    env = _get_env()
    observation_space = (
        env.observation_space["observation"]
        if isinstance(env.observation_space, gymnasium.spaces.Dict)
        else env.observation_space
    )
    if agent_learn is None:
        # model
        net = Net(
            state_shape=observation_space.shape or observation_space.n,
            action_shape=env.action_space.shape or env.action_space.n,
            hidden_sizes=[128, 256, 512, 256, 128],
            #hidden_sizes=[1024, 2048, 2048, 1024],
            device="cuda" if torch.cuda.is_available() else "cpu",
        ).to("cuda" if torch.cuda.is_available() else "cpu")

        if optim is None:
            optim = torch.optim.Adam(net.parameters(), lr=1e-4)
        agent_learn = DQNPolicy(
            model=net,
            optim=optim,
            discount_factor=0.9,
            estimation_step=3,
            target_update_freq=320,
        ).to("cuda" if torch.cuda.is_available() else "cpu")


    if agent_opponent is None:
        if agent1_path:
            agent_opponent = agent1_learned
        else:
            agent_opponent = RandomPolicy(action_space=env.action_space)

    agents = [agent_opponent, agent_learn]
    #agents = [agent_learn, agent_opponent]
    policy = MultiAgentPolicyManager(agents, env)
    return policy, optim, env.agents


def _get_env():
    """This function is needed to provide callables for DummyVectorEnv."""
    return PettingZooEnv(env())

###  🐑 Training code https://tianshou.org/en/stable/01_tutorials/04_tictactoe.html

In [None]:
# Before evaluate this cell run the cell with env
# ======== Step 1: Environment setup =========
train_envs = DummyVectorEnv([_get_env for _ in range(100)])
test_envs = DummyVectorEnv([_get_env for _ in range(100)])

# seed
seed = 11
np.random.seed(seed)
torch.manual_seed(seed)
train_envs.seed(seed)
test_envs.seed(seed)

# ======== Step 2: Agent setup =========
policy, optim, agents = _get_agents(agent_opponent=agent2_learned)

# # ======== Step 3: Collector setup =========
train_collector = Collector(
    policy,
    train_envs,
    VectorReplayBuffer(20_000, len(train_envs)),
    exploration_noise=True,
)
test_collector = Collector(policy, test_envs, exploration_noise=True)
# policy.set_eps(1)
train_collector.collect(n_step=256 * 100)  # batch size * training_num

# ======== Step 4: Callback functions setup =========
def save_best_fn(policy):
    model_save_path = os.path.join("log", "ttt", "dqn", "policy.pth")
    os.makedirs(os.path.join("log", "ttt", "dqn"), exist_ok=True)
    torch.save(policy.policies[agents[1]].state_dict(), model_save_path)

def stop_fn(mean_rewards):
    return mean_rewards >= 21000

def train_fn(epoch, env_step):
    policy.policies[agents[1]].set_eps(0.1)

def test_fn(epoch, env_step):
    policy.policies[agents[1]].set_eps(0.05)

def reward_metric(rews):
    return rews[:, 1]

# ======== Step 5: Run the trainer =========
result = offpolicy_trainer(
    policy=policy,
    train_collector=train_collector,
    test_collector=test_collector,
    max_epoch=150,
    step_per_epoch=1000,
    step_per_collect=50,
    episode_per_test=10,
    batch_size=256,
    train_fn=train_fn,
    test_fn=test_fn,
    stop_fn=stop_fn,
    save_best_fn=save_best_fn,
    update_per_step=0.1,
    test_in_train=False,
    reward_metric=reward_metric,
    verbose=True
)

# return result, policy.policies[agents[1]]
print(f"\n==========Result==========\n{result}")
print("\n(the trained policy can be accessed via policy.policies[agents[1]])")

Epoch #1: 1001it [00:03, 273.56it/s, bastaushy/loss=199.918, env_step=1000, len=0, n/ep=0, n/st=100, qostaushy/loss=15209.492, rew=0.00]                          


Epoch #1: test_reward: 15070.300000 ± 3247.526876, best_reward: 15070.300000 ± 3247.526876 in #1


Epoch #2: 1001it [00:04, 231.01it/s, bastaushy/loss=198.840, env_step=2000, len=0, n/ep=0, n/st=100, qostaushy/loss=434.394, rew=0.00]                          


Epoch #2: test_reward: 14878.800000 ± 2874.701264, best_reward: 15070.300000 ± 3247.526876 in #1


Epoch #3: 1001it [00:03, 271.34it/s, bastaushy/loss=185.680, env_step=3000, len=0, n/ep=0, n/st=100, qostaushy/loss=151.296, rew=0.00]                          


Epoch #3: test_reward: 5113.500000 ± 5748.884157, best_reward: 15070.300000 ± 3247.526876 in #1


Epoch #4: 1001it [00:05, 199.14it/s, bastaushy/loss=174.654, env_step=4000, len=0, n/ep=0, n/st=100, qostaushy/loss=953.569, rew=0.00]                          


Epoch #4: test_reward: 13456.500000 ± 5788.924619, best_reward: 15070.300000 ± 3247.526876 in #1


Epoch #5: 1001it [00:03, 270.00it/s, bastaushy/loss=723.793, env_step=5000, len=305, n/ep=0, n/st=100, qostaushy/loss=125.236, rew=15475.17]                          


Epoch #5: test_reward: 10944.200000 ± 4765.192647, best_reward: 15070.300000 ± 3247.526876 in #1


Epoch #6: 1001it [00:03, 264.22it/s, bastaushy/loss=2637.010, env_step=6000, len=310, n/ep=0, n/st=100, qostaushy/loss=333.974, rew=16141.00]                          


Epoch #6: test_reward: 11117.200000 ± 3211.922253, best_reward: 15070.300000 ± 3247.526876 in #1


Epoch #7: 1001it [00:03, 266.63it/s, bastaushy/loss=2674.904, env_step=7000, len=310, n/ep=0, n/st=100, qostaushy/loss=981.351, rew=16141.00]                          


Epoch #7: test_reward: 11571.900000 ± 3864.958873, best_reward: 15070.300000 ± 3247.526876 in #1


Epoch #8: 1001it [00:03, 259.74it/s, bastaushy/loss=2735.919, env_step=8000, len=310, n/ep=0, n/st=100, qostaushy/loss=725.330, rew=16141.00]                          


Epoch #8: test_reward: 11782.400000 ± 6068.435106, best_reward: 15070.300000 ± 3247.526876 in #1


Epoch #9: 1001it [00:04, 212.18it/s, bastaushy/loss=2468.014, env_step=9000, len=310, n/ep=0, n/st=100, qostaushy/loss=746.334, rew=16141.00]                          


Epoch #9: test_reward: 10757.200000 ± 4162.330448, best_reward: 15070.300000 ± 3247.526876 in #1


Epoch #10: 1001it [00:03, 264.37it/s, bastaushy/loss=2794.343, env_step=10000, len=356, n/ep=1, n/st=100, qostaushy/loss=1182.168, rew=19505.00]                          


Epoch #10: test_reward: 3110.400000 ± 1766.924741, best_reward: 15070.300000 ± 3247.526876 in #1


Epoch #11: 1001it [00:04, 222.91it/s, bastaushy/loss=2779.011, env_step=11000, len=356, n/ep=0, n/st=100, qostaushy/loss=1075.480, rew=19505.00]                          


Epoch #11: test_reward: 7127.200000 ± 5265.342625, best_reward: 15070.300000 ± 3247.526876 in #1


Epoch #12: 1001it [00:03, 259.95it/s, bastaushy/loss=2787.686, env_step=12000, len=375, n/ep=0, n/st=100, qostaushy/loss=1431.310, rew=20844.00]                          


Epoch #12: test_reward: 6135.600000 ± 2651.157453, best_reward: 15070.300000 ± 3247.526876 in #1


Epoch #13: 1001it [00:03, 257.29it/s, bastaushy/loss=2966.338, env_step=13000, len=378, n/ep=0, n/st=100, qostaushy/loss=1587.890, rew=21050.00]                          


Epoch #13: test_reward: 4883.300000 ± 3411.992440, best_reward: 15070.300000 ± 3247.526876 in #1


Epoch #14: 1001it [00:04, 222.35it/s, bastaushy/loss=2858.835, env_step=14000, len=396, n/ep=1, n/st=100, qostaushy/loss=1659.048, rew=22742.00]                          


Epoch #14: test_reward: 3315.000000 ± 1734.183900, best_reward: 15070.300000 ± 3247.526876 in #1


Epoch #15: 1001it [00:03, 258.33it/s, bastaushy/loss=2802.533, env_step=15000, len=400, n/ep=0, n/st=100, qostaushy/loss=1965.852, rew=22548.55]                          


Epoch #15: test_reward: 6042.200000 ± 4346.849843, best_reward: 15070.300000 ± 3247.526876 in #1


Epoch #16: 1001it [00:04, 207.72it/s, bastaushy/loss=2807.400, env_step=16000, len=110, n/ep=0, n/st=100, qostaushy/loss=1943.369, rew=4618.00]                          


Epoch #16: test_reward: 4688.400000 ± 3328.244919, best_reward: 15070.300000 ± 3247.526876 in #1


Epoch #17: 1001it [00:04, 246.71it/s, bastaushy/loss=2867.922, env_step=17000, len=114, n/ep=0, n/st=100, qostaushy/loss=2318.948, rew=4790.00]                          


Epoch #17: test_reward: 2996.600000 ± 3031.287522, best_reward: 15070.300000 ± 3247.526876 in #1


Epoch #18: 1001it [00:04, 239.41it/s, bastaushy/loss=2927.468, env_step=18000, len=114, n/ep=0, n/st=100, qostaushy/loss=2445.655, rew=4790.00]                          


Epoch #18: test_reward: 3865.800000 ± 2446.546701, best_reward: 15070.300000 ± 3247.526876 in #1


Epoch #19: 1001it [00:04, 243.92it/s, bastaushy/loss=2850.018, env_step=19000, len=139, n/ep=0, n/st=100, qostaushy/loss=2513.268, rew=3880.00]                          


Epoch #19: test_reward: 3043.800000 ± 4059.188141, best_reward: 15070.300000 ± 3247.526876 in #1


Epoch #20: 1001it [00:04, 250.08it/s, bastaushy/loss=2666.040, env_step=20000, len=148, n/ep=0, n/st=100, qostaushy/loss=2695.874, rew=7902.00]                          


Epoch #20: test_reward: 3451.000000 ± 2560.071601, best_reward: 15070.300000 ± 3247.526876 in #1


Epoch #21: 1001it [00:04, 212.38it/s, bastaushy/loss=2639.178, env_step=21000, len=160, n/ep=0, n/st=100, qostaushy/loss=2538.398, rew=8090.00]                          


Epoch #21: test_reward: 9478.400000 ± 6341.846659, best_reward: 15070.300000 ± 3247.526876 in #1


Epoch #22: 1001it [00:03, 259.05it/s, bastaushy/loss=2406.683, env_step=22000, len=171, n/ep=1, n/st=100, qostaushy/loss=2954.305, rew=4624.00]                          


Epoch #22: test_reward: 8690.200000 ± 4439.538215, best_reward: 15070.300000 ± 3247.526876 in #1


Epoch #23: 1001it [00:04, 210.58it/s, bastaushy/loss=2376.528, env_step=23000, len=178, n/ep=0, n/st=100, qostaushy/loss=2719.509, rew=9550.00]                          


Epoch #23: test_reward: 1998.800000 ± 1112.021654, best_reward: 15070.300000 ± 3247.526876 in #1


Epoch #24: 1001it [00:03, 259.80it/s, bastaushy/loss=1971.922, env_step=24000, len=189, n/ep=0, n/st=100, qostaushy/loss=2757.810, rew=3904.00]                          


Epoch #24: test_reward: 1269.400000 ± 1200.092013, best_reward: 15070.300000 ± 3247.526876 in #1


Epoch #25: 1001it [00:03, 253.98it/s, bastaushy/loss=1568.158, env_step=25000, len=197, n/ep=0, n/st=100, qostaushy/loss=2701.944, rew=9300.00]                          


Epoch #25: test_reward: 1990.800000 ± 2093.484789, best_reward: 15070.300000 ± 3247.526876 in #1


Epoch #26: 1001it [00:04, 208.56it/s, bastaushy/loss=1481.519, env_step=26000, len=203, n/ep=0, n/st=100, qostaushy/loss=2041.504, rew=2840.00]                          


Epoch #26: test_reward: 5779.600000 ± 2970.236731, best_reward: 15070.300000 ± 3247.526876 in #1


Epoch #27: 1001it [00:04, 248.34it/s, bastaushy/loss=1525.610, env_step=27000, len=220, n/ep=0, n/st=100, qostaushy/loss=1925.394, rew=10451.00]                          


Epoch #27: test_reward: 5229.800000 ± 3210.071706, best_reward: 15070.300000 ± 3247.526876 in #1


Epoch #28: 1001it [00:04, 207.69it/s, bastaushy/loss=1500.549, env_step=28000, len=231, n/ep=1, n/st=100, qostaushy/loss=1988.662, rew=5756.00]                          


Epoch #28: test_reward: 9531.000000 ± 3362.250110, best_reward: 15070.300000 ± 3247.526876 in #1


Epoch #29: 1001it [00:04, 250.08it/s, bastaushy/loss=1447.568, env_step=29000, len=240, n/ep=0, n/st=100, qostaushy/loss=2471.414, rew=11846.00]                          


Epoch #29: test_reward: 9972.500000 ± 4563.673592, best_reward: 15070.300000 ± 3247.526876 in #1


Epoch #30: 1001it [00:04, 231.43it/s, bastaushy/loss=1555.495, env_step=30000, len=251, n/ep=1, n/st=100, qostaushy/loss=2325.883, rew=7144.00]                          


Epoch #30: test_reward: 3183.800000 ± 1657.968021, best_reward: 15070.300000 ± 3247.526876 in #1


Epoch #31: 1001it [00:04, 248.04it/s, bastaushy/loss=1947.040, env_step=31000, len=259, n/ep=0, n/st=100, qostaushy/loss=2433.100, rew=5472.00]                          


Epoch #31: test_reward: 6172.600000 ± 5229.017101, best_reward: 15070.300000 ± 3247.526876 in #1


Epoch #32: 1001it [00:03, 252.04it/s, bastaushy/loss=1614.100, env_step=32000, len=269, n/ep=0, n/st=100, qostaushy/loss=1966.325, rew=10408.00]                          


Epoch #32: test_reward: 7396.700000 ± 4636.257328, best_reward: 15070.300000 ± 3247.526876 in #1


Epoch #33: 1001it [00:04, 210.10it/s, bastaushy/loss=1641.297, env_step=33000, len=281, n/ep=1, n/st=100, qostaushy/loss=2200.720, rew=15540.00]                          


Epoch #33: test_reward: 6757.500000 ± 5145.027954, best_reward: 15070.300000 ± 3247.526876 in #1


Epoch #34: 1001it [00:04, 247.59it/s, bastaushy/loss=1815.729, env_step=34000, len=253, n/ep=0, n/st=100, qostaushy/loss=2232.922, rew=12586.00]                          


Epoch #34: test_reward: 9133.200000 ± 4418.010430, best_reward: 15070.300000 ± 3247.526876 in #1


Epoch #35: 1001it [00:04, 209.34it/s, bastaushy/loss=1814.688, env_step=35000, len=178, n/ep=2, n/st=100, qostaushy/loss=2124.950, rew=6693.00]                          


Epoch #35: test_reward: 6540.800000 ± 5189.542269, best_reward: 15070.300000 ± 3247.526876 in #1


Epoch #36: 1001it [00:04, 248.65it/s, bastaushy/loss=1922.479, env_step=36000, len=89, n/ep=0, n/st=100, qostaushy/loss=2255.530, rew=1180.00]                          


Epoch #36: test_reward: 5708.000000 ± 3736.220978, best_reward: 15070.300000 ± 3247.526876 in #1


Epoch #37: 1001it [00:04, 227.63it/s, bastaushy/loss=2137.231, env_step=37000, len=316, n/ep=0, n/st=100, qostaushy/loss=2417.128, rew=13645.00]                          


Epoch #37: test_reward: 10297.300000 ± 2992.955865, best_reward: 15070.300000 ± 3247.526876 in #1


Epoch #38: 1001it [00:03, 257.07it/s, bastaushy/loss=2654.726, env_step=38000, len=331, n/ep=1, n/st=100, qostaushy/loss=2496.492, rew=15780.00]                          


Epoch #38: test_reward: 9298.800000 ± 4874.497488, best_reward: 15070.300000 ± 3247.526876 in #1


Epoch #39: 1001it [00:04, 250.03it/s, bastaushy/loss=3135.911, env_step=39000, len=339, n/ep=0, n/st=100, qostaushy/loss=2927.902, rew=10344.00]                          


Epoch #39: test_reward: 7979.000000 ± 3474.664905, best_reward: 15070.300000 ± 3247.526876 in #1


Epoch #40: 1001it [00:04, 227.51it/s, bastaushy/loss=2627.093, env_step=40000, len=339, n/ep=0, n/st=100, qostaushy/loss=2689.615, rew=10344.00]                          


Epoch #40: test_reward: 9353.200000 ± 4050.291861, best_reward: 15070.300000 ± 3247.526876 in #1


Epoch #41: 1001it [00:03, 254.67it/s, bastaushy/loss=2596.986, env_step=41000, len=360, n/ep=0, n/st=100, qostaushy/loss=2501.689, rew=16251.00]                          


Epoch #41: test_reward: 10806.600000 ± 3124.130093, best_reward: 15070.300000 ± 3247.526876 in #1


Epoch #42: 1001it [00:04, 209.59it/s, bastaushy/loss=2849.220, env_step=42000, len=270, n/ep=0, n/st=100, qostaushy/loss=2885.922, rew=8248.00]                          


Epoch #42: test_reward: 6096.800000 ± 1686.919370, best_reward: 15070.300000 ± 3247.526876 in #1


Epoch #43: 1001it [00:03, 251.33it/s, bastaushy/loss=2664.598, env_step=43000, len=380, n/ep=0, n/st=100, qostaushy/loss=2567.290, rew=23738.00]                          


Epoch #43: test_reward: 11020.000000 ± 4515.528230, best_reward: 15070.300000 ± 3247.526876 in #1


Epoch #44: 1001it [00:04, 210.39it/s, bastaushy/loss=2633.349, env_step=44000, len=389, n/ep=0, n/st=100, qostaushy/loss=2525.057, rew=12292.00]                          


Epoch #44: test_reward: 10379.800000 ± 3751.443823, best_reward: 15070.300000 ± 3247.526876 in #1


Epoch #45: 1001it [00:04, 247.80it/s, bastaushy/loss=2936.265, env_step=45000, len=400, n/ep=0, n/st=100, qostaushy/loss=2279.353, rew=19747.25]                          


Epoch #45: test_reward: 13140.100000 ± 3545.678678, best_reward: 15070.300000 ± 3247.526876 in #1


Epoch #46: 1001it [00:04, 235.44it/s, bastaushy/loss=2715.585, env_step=46000, len=315, n/ep=0, n/st=100, qostaushy/loss=2395.832, rew=8512.00]                          


Epoch #46: test_reward: 14041.200000 ± 4596.486067, best_reward: 15070.300000 ± 3247.526876 in #1


Epoch #47: 1001it [00:04, 243.41it/s, bastaushy/loss=2737.095, env_step=47000, len=85, n/ep=0, n/st=100, qostaushy/loss=2125.568, rew=1280.00]                          


Epoch #47: test_reward: 13124.700000 ± 4291.125867, best_reward: 15070.300000 ± 3247.526876 in #1


Epoch #48: 1001it [00:04, 248.12it/s, bastaushy/loss=2318.259, env_step=48000, len=117, n/ep=1, n/st=100, qostaushy/loss=2437.363, rew=2880.00]                          


Epoch #48: test_reward: 12780.600000 ± 4983.400289, best_reward: 15070.300000 ± 3247.526876 in #1


Epoch #49: 1001it [00:05, 187.42it/s, bastaushy/loss=2724.417, env_step=49000, len=296, n/ep=0, n/st=100, qostaushy/loss=1910.480, rew=14496.00]                          


Epoch #49: test_reward: 14053.100000 ± 5608.724640, best_reward: 15070.300000 ± 3247.526876 in #1


Epoch #50: 1001it [00:04, 247.73it/s, bastaushy/loss=2866.507, env_step=50000, len=121, n/ep=0, n/st=100, qostaushy/loss=2323.681, rew=3924.00]                          


Epoch #50: test_reward: 12867.000000 ± 3144.679348, best_reward: 15070.300000 ± 3247.526876 in #1


Epoch #51: 1001it [00:04, 211.16it/s, bastaushy/loss=2641.916, env_step=51000, len=145, n/ep=0, n/st=100, qostaushy/loss=2078.024, rew=4054.00]                          


Epoch #51: test_reward: 11725.800000 ± 5429.544397, best_reward: 15070.300000 ± 3247.526876 in #1


Epoch #52: 1001it [00:03, 253.02it/s, bastaushy/loss=2650.186, env_step=52000, len=308, n/ep=0, n/st=100, qostaushy/loss=2032.687, rew=8573.33]                          


Epoch #52: test_reward: 14974.000000 ± 5811.248196, best_reward: 15070.300000 ± 3247.526876 in #1


Epoch #53: 1001it [00:04, 207.15it/s, bastaushy/loss=2478.578, env_step=53000, len=300, n/ep=0, n/st=100, qostaushy/loss=2069.578, rew=10995.00]                          


Epoch #53: test_reward: 16113.400000 ± 6374.200110, best_reward: 16113.400000 ± 6374.200110 in #53


Epoch #54: 1001it [00:03, 252.67it/s, bastaushy/loss=2958.517, env_step=54000, len=297, n/ep=1, n/st=100, qostaushy/loss=1716.313, rew=5884.00]                          


Epoch #54: test_reward: 12932.300000 ± 4801.469943, best_reward: 16113.400000 ± 6374.200110 in #53


Epoch #55: 1001it [00:04, 226.16it/s, bastaushy/loss=2243.080, env_step=55000, len=215, n/ep=0, n/st=100, qostaushy/loss=1538.511, rew=8232.00]                          


Epoch #55: test_reward: 8283.600000 ± 1334.123772, best_reward: 16113.400000 ± 6374.200110 in #53


Epoch #56: 1001it [00:04, 234.27it/s, bastaushy/loss=2415.089, env_step=56000, len=243, n/ep=0, n/st=100, qostaushy/loss=1503.387, rew=6004.00]                          


Epoch #56: test_reward: 11520.800000 ± 5133.478446, best_reward: 16113.400000 ± 6374.200110 in #53


Epoch #57: 1001it [00:04, 249.61it/s, bastaushy/loss=2346.529, env_step=57000, len=121, n/ep=1, n/st=100, qostaushy/loss=1570.692, rew=1228.00]                          


Epoch #57: test_reward: 10790.500000 ± 3760.999262, best_reward: 16113.400000 ± 6374.200110 in #53


Epoch #58: 1001it [00:04, 213.85it/s, bastaushy/loss=2370.688, env_step=58000, len=279, n/ep=1, n/st=100, qostaushy/loss=1625.853, rew=8500.00]                          


Epoch #58: test_reward: 16577.800000 ± 5136.407943, best_reward: 16577.800000 ± 5136.407943 in #58


Epoch #59: 1001it [00:04, 241.81it/s, bastaushy/loss=2384.837, env_step=59000, len=383, n/ep=1, n/st=100, qostaushy/loss=1550.137, rew=6724.00]                          


Epoch #59: test_reward: 17203.000000 ± 4722.927863, best_reward: 17203.000000 ± 4722.927863 in #59


Epoch #60: 1001it [00:04, 203.95it/s, bastaushy/loss=2206.806, env_step=60000, len=271, n/ep=1, n/st=100, qostaushy/loss=1432.972, rew=11480.00]                          


Epoch #60: test_reward: 18226.700000 ± 4867.458784, best_reward: 18226.700000 ± 4867.458784 in #60


Epoch #61: 1001it [00:04, 239.52it/s, bastaushy/loss=2186.029, env_step=61000, len=281, n/ep=0, n/st=100, qostaushy/loss=1539.111, rew=10768.00]                          


Epoch #61: test_reward: 12408.400000 ± 5034.536745, best_reward: 18226.700000 ± 4867.458784 in #60


Epoch #62: 1001it [00:04, 209.81it/s, bastaushy/loss=2040.418, env_step=62000, len=146, n/ep=0, n/st=100, qostaushy/loss=1505.431, rew=9480.00]                          


Epoch #62: test_reward: 14223.300000 ± 5517.836787, best_reward: 18226.700000 ± 4867.458784 in #60


Epoch #63: 1001it [00:04, 240.23it/s, bastaushy/loss=1771.080, env_step=63000, len=60, n/ep=1, n/st=100, qostaushy/loss=1496.861, rew=3094.00]                          


Epoch #63: test_reward: 12145.600000 ± 3869.795219, best_reward: 18226.700000 ± 4867.458784 in #60


Epoch #64: 1001it [00:04, 232.73it/s, bastaushy/loss=1970.353, env_step=64000, len=253, n/ep=1, n/st=100, qostaushy/loss=1790.186, rew=15536.00]                          


Epoch #64: test_reward: 14649.900000 ± 6901.972696, best_reward: 18226.700000 ± 4867.458784 in #60


Epoch #65: 1001it [00:04, 250.18it/s, bastaushy/loss=2497.035, env_step=65000, len=304, n/ep=1, n/st=100, qostaushy/loss=1670.599, rew=16014.00]                          


Epoch #65: test_reward: 13366.500000 ± 6387.653642, best_reward: 18226.700000 ± 4867.458784 in #60


Epoch #66: 1001it [00:04, 250.06it/s, bastaushy/loss=2135.515, env_step=66000, len=76, n/ep=0, n/st=100, qostaushy/loss=1460.463, rew=4060.00]                          


Epoch #66: test_reward: 13702.500000 ± 5132.741241, best_reward: 18226.700000 ± 4867.458784 in #60


Epoch #67: 1001it [00:04, 218.40it/s, bastaushy/loss=2200.976, env_step=67000, len=319, n/ep=0, n/st=100, qostaushy/loss=1614.404, rew=14820.00]                          


Epoch #67: test_reward: 16366.600000 ± 5004.209972, best_reward: 18226.700000 ± 4867.458784 in #60


Epoch #68: 1001it [00:04, 245.71it/s, bastaushy/loss=2194.843, env_step=68000, len=92, n/ep=0, n/st=100, qostaushy/loss=1660.426, rew=4630.00]                          


Epoch #68: test_reward: 10506.700000 ± 4336.317240, best_reward: 18226.700000 ± 4867.458784 in #60


Epoch #69: 1001it [00:04, 207.82it/s, bastaushy/loss=1932.596, env_step=69000, len=380, n/ep=0, n/st=100, qostaushy/loss=1930.713, rew=16985.00]                          


Epoch #69: test_reward: 15502.300000 ± 3343.094735, best_reward: 18226.700000 ± 4867.458784 in #60


Epoch #70: 1001it [00:04, 242.61it/s, bastaushy/loss=2487.623, env_step=70000, len=159, n/ep=1, n/st=100, qostaushy/loss=1793.907, rew=6624.00]                          


Epoch #70: test_reward: 14290.300000 ± 5199.022832, best_reward: 18226.700000 ± 4867.458784 in #60


Epoch #71: 1001it [00:04, 206.02it/s, bastaushy/loss=2073.962, env_step=71000, len=335, n/ep=0, n/st=100, qostaushy/loss=1922.946, rew=13526.00]                          


Epoch #71: test_reward: 13838.000000 ± 5253.552017, best_reward: 18226.700000 ± 4867.458784 in #60


Epoch #72: 1001it [00:04, 244.00it/s, bastaushy/loss=2107.100, env_step=72000, len=353, n/ep=0, n/st=100, qostaushy/loss=2119.216, rew=13500.00]                          


Epoch #72: test_reward: 16530.200000 ± 6074.475941, best_reward: 18226.700000 ± 4867.458784 in #60


Epoch #73: 1001it [00:04, 244.48it/s, bastaushy/loss=2419.831, env_step=73000, len=270, n/ep=0, n/st=100, qostaushy/loss=2119.357, rew=15614.00]                          


Epoch #73: test_reward: 15553.400000 ± 4862.348737, best_reward: 18226.700000 ± 4867.458784 in #60


Epoch #74: 1001it [00:04, 242.61it/s, bastaushy/loss=2311.090, env_step=74000, len=400, n/ep=0, n/st=100, qostaushy/loss=2369.793, rew=16462.00]                          


Epoch #74: test_reward: 13280.100000 ± 2566.327238, best_reward: 18226.700000 ± 4867.458784 in #60


Epoch #75: 1001it [00:04, 245.31it/s, bastaushy/loss=2380.427, env_step=75000, len=400, n/ep=1, n/st=100, qostaushy/loss=2909.171, rew=16039.00]                          


Epoch #75: test_reward: 13096.000000 ± 3105.946683, best_reward: 18226.700000 ± 4867.458784 in #60


Epoch #76: 1001it [00:04, 208.93it/s, bastaushy/loss=2325.344, env_step=76000, len=400, n/ep=0, n/st=100, qostaushy/loss=2621.206, rew=15602.00]                          


Epoch #76: test_reward: 14767.700000 ± 4807.162033, best_reward: 18226.700000 ± 4867.458784 in #60


Epoch #77: 1001it [00:04, 246.22it/s, bastaushy/loss=2110.947, env_step=77000, len=355, n/ep=0, n/st=100, qostaushy/loss=2444.560, rew=21320.00]                          


Epoch #77: test_reward: 10669.200000 ± 5681.512225, best_reward: 18226.700000 ± 4867.458784 in #60


Epoch #78: 1001it [00:04, 208.34it/s, bastaushy/loss=2175.854, env_step=78000, len=139, n/ep=0, n/st=100, qostaushy/loss=2564.170, rew=5940.00]                          


Epoch #78: test_reward: 17570.700000 ± 4004.580229, best_reward: 18226.700000 ± 4867.458784 in #60


Epoch #79: 1001it [00:03, 250.67it/s, bastaushy/loss=2188.954, env_step=79000, len=353, n/ep=0, n/st=100, qostaushy/loss=2581.704, rew=20512.00]                          


Epoch #79: test_reward: 13812.200000 ± 5596.210554, best_reward: 18226.700000 ± 4867.458784 in #60


Epoch #80: 1001it [00:04, 211.41it/s, bastaushy/loss=2003.136, env_step=80000, len=205, n/ep=1, n/st=100, qostaushy/loss=2763.444, rew=9012.00]                          


Epoch #80: test_reward: 7685.400000 ± 7090.397791, best_reward: 18226.700000 ± 4867.458784 in #60


Epoch #81: 1001it [00:04, 245.14it/s, bastaushy/loss=2000.765, env_step=81000, len=337, n/ep=0, n/st=100, qostaushy/loss=2589.240, rew=17868.00]                          


Epoch #81: test_reward: 17981.800000 ± 4149.444560, best_reward: 18226.700000 ± 4867.458784 in #60


Epoch #82: 1001it [00:04, 247.73it/s, bastaushy/loss=2116.297, env_step=82000, len=338, n/ep=0, n/st=100, qostaushy/loss=2122.961, rew=23150.00]                          


Epoch #82: test_reward: 12489.400000 ± 5772.524443, best_reward: 18226.700000 ± 4867.458784 in #60


Epoch #83: 1001it [00:04, 223.05it/s, bastaushy/loss=1737.396, env_step=83000, len=149, n/ep=1, n/st=100, qostaushy/loss=2479.100, rew=7880.00]                          


Epoch #83: test_reward: 11845.100000 ± 7265.695541, best_reward: 18226.700000 ± 4867.458784 in #60


Epoch #84: 1001it [00:04, 247.49it/s, bastaushy/loss=2172.589, env_step=84000, len=400, n/ep=0, n/st=100, qostaushy/loss=2284.682, rew=19118.00]                          


Epoch #84: test_reward: 15518.900000 ± 7188.442911, best_reward: 18226.700000 ± 4867.458784 in #60


Epoch #85: 1001it [00:04, 205.02it/s, bastaushy/loss=2134.515, env_step=85000, len=259, n/ep=0, n/st=100, qostaushy/loss=2473.496, rew=15892.00]                          


Epoch #85: test_reward: 16601.300000 ± 5697.226466, best_reward: 18226.700000 ± 4867.458784 in #60


Epoch #86: 1001it [00:04, 245.15it/s, bastaushy/loss=1973.440, env_step=86000, len=137, n/ep=0, n/st=100, qostaushy/loss=2343.186, rew=7670.00]                          


Epoch #86: test_reward: 17187.700000 ± 4727.650961, best_reward: 18226.700000 ± 4867.458784 in #60


Epoch #87: 1001it [00:04, 202.26it/s, bastaushy/loss=1914.979, env_step=87000, len=176, n/ep=1, n/st=100, qostaushy/loss=2846.388, rew=10611.00]                          


Epoch #87: test_reward: 14431.300000 ± 5195.258070, best_reward: 18226.700000 ± 4867.458784 in #60


Epoch #88: 1001it [00:04, 241.71it/s, bastaushy/loss=1804.930, env_step=88000, len=117, n/ep=0, n/st=100, qostaushy/loss=2654.302, rew=3748.00]                          


Epoch #88: test_reward: 16366.100000 ± 5637.347576, best_reward: 18226.700000 ± 4867.458784 in #60


Epoch #89: 1001it [00:04, 212.20it/s, bastaushy/loss=1990.712, env_step=89000, len=350, n/ep=0, n/st=100, qostaushy/loss=2091.976, rew=22680.00]                          


Epoch #89: test_reward: 10428.000000 ± 4887.818245, best_reward: 18226.700000 ± 4867.458784 in #60


Epoch #90: 1001it [00:04, 245.82it/s, bastaushy/loss=1877.466, env_step=90000, len=321, n/ep=0, n/st=100, qostaushy/loss=2658.458, rew=17556.00]                          


Epoch #90: test_reward: 11721.400000 ± 5493.456311, best_reward: 18226.700000 ± 4867.458784 in #60


Epoch #91: 1001it [00:04, 237.20it/s, bastaushy/loss=1990.034, env_step=91000, len=216, n/ep=0, n/st=100, qostaushy/loss=3205.900, rew=13146.00]                          


Epoch #91: test_reward: 12449.400000 ± 4900.916959, best_reward: 18226.700000 ± 4867.458784 in #60


Epoch #92: 1001it [00:05, 198.52it/s, bastaushy/loss=1988.214, env_step=92000, len=347, n/ep=0, n/st=100, qostaushy/loss=2610.343, rew=21732.00]                          


Epoch #92: test_reward: 9275.000000 ± 4947.341953, best_reward: 18226.700000 ± 4867.458784 in #60


Epoch #93: 1001it [00:04, 243.77it/s, bastaushy/loss=1884.444, env_step=93000, len=205, n/ep=2, n/st=100, qostaushy/loss=2433.743, rew=10354.00]                          


Epoch #93: test_reward: 11042.000000 ± 5093.949509, best_reward: 18226.700000 ± 4867.458784 in #60


Epoch #94: 1001it [00:04, 224.55it/s, bastaushy/loss=2069.656, env_step=94000, len=368, n/ep=1, n/st=100, qostaushy/loss=2640.787, rew=22062.00]                          


Epoch #94: test_reward: 14633.400000 ± 7076.614504, best_reward: 18226.700000 ± 4867.458784 in #60


Epoch #95: 1001it [00:03, 250.65it/s, bastaushy/loss=2079.880, env_step=95000, len=142, n/ep=1, n/st=100, qostaushy/loss=2715.162, rew=8684.00]                          


Epoch #95: test_reward: 18662.000000 ± 6199.948871, best_reward: 18662.000000 ± 6199.948871 in #95


Epoch #96: 1001it [00:04, 203.25it/s, bastaushy/loss=1988.258, env_step=96000, len=291, n/ep=0, n/st=100, qostaushy/loss=3135.394, rew=13236.00]                          


Epoch #96: test_reward: 10018.800000 ± 5158.199159, best_reward: 18662.000000 ± 6199.948871 in #95


Epoch #97: 1001it [00:04, 246.17it/s, bastaushy/loss=2160.905, env_step=97000, len=284, n/ep=0, n/st=100, qostaushy/loss=2847.630, rew=15998.00]                          


Epoch #97: test_reward: 11285.200000 ± 4069.444896, best_reward: 18662.000000 ± 6199.948871 in #95


Epoch #98: 1001it [00:04, 206.49it/s, bastaushy/loss=2008.497, env_step=98000, len=227, n/ep=0, n/st=100, qostaushy/loss=2814.769, rew=9976.00]                          


Epoch #98: test_reward: 13272.000000 ± 5006.452237, best_reward: 18662.000000 ± 6199.948871 in #95


Epoch #99: 1001it [00:04, 240.69it/s, bastaushy/loss=2032.803, env_step=99000, len=265, n/ep=3, n/st=100, qostaushy/loss=3467.338, rew=15857.33]                          


Epoch #99: test_reward: 6036.300000 ± 6595.260147, best_reward: 18662.000000 ± 6199.948871 in #95


Epoch #100: 1001it [00:04, 233.33it/s, bastaushy/loss=2109.710, env_step=100000, len=225, n/ep=0, n/st=100, qostaushy/loss=3791.146, rew=10140.00]                          


Epoch #100: test_reward: 11034.400000 ± 5132.146358, best_reward: 18662.000000 ± 6199.948871 in #95

{'duration': '548.56s', 'train_time/model': '410.00s', 'test_step': 275666, 'test_episode': 1010, 'test_time': '115.68s', 'test_speed': '2383.08 step/s', 'best_reward': 18662.0, 'best_result': '18662.00 ± 6199.95', 'train_step': 100000, 'train_episode': 424, 'train_time/collector': '22.88s', 'train_speed': '231.01 step/s'}

(the trained policy can be accessed via policy.policies[agents[1]])


### 🐙 Evaluate best Qostaushy agent with random policy

In [54]:
PLAYS = {"bastaushy": 0, "qostaushy": 0}
# Step 1: Load the PettingZoo environment
env = env()#render_mode="human")

# Step 2: Wrap the environment for Tianshou interfacing
env = PettingZooEnv(env)

# # Step 3: Define policies for each agent
policies = MultiAgentPolicyManager([RandomPolicy(), agent4_learned], env)
# # Step 4: Convert the env to vector format
env = DummyVectorEnv([lambda: env])

# # Step 5: Construct the Collector, which interfaces the policies with the vectorised environment
collector = Collector(policies, env)

# # Step 6: Execute the environment with the agents playing for 1 episode, and render a frame every ? seconds
result = collector.collect(n_episode=100)
print(PLAYS)

{'bastaushy': 81, 'qostaushy': 17}


🐳 Experiments

1.

 net = Net(
            state_shape=observation_space.shape or observation_space.n,
            action_shape=env.action_space.shape or env.action_space.n,
            hidden_sizes=[128, 256, 256, 128],
            device="cuda" if torch.cuda.is_available() else "cpu",
        ).to("cuda" if torch.cuda.is_available() else "cpu")

train_collector.collect(n_step=64 * 100)  # batch size * training_num

res: {'bastaushy': 781, 'qostaushy': 194}

res: {'bastaushy': 790, 'qostaushy': 191}

2.

 net = Net(
            state_shape=observation_space.shape or observation_space.n,
            action_shape=env.action_space.shape or env.action_space.n,
            hidden_sizes=[128, 256, 256, 128],
            device="cuda" if torch.cuda.is_available() else "cpu",
        ).to("cuda" if torch.cuda.is_available() else "cpu")

train_collector.collect(n_step=256 * 100)  # batch size * training_num

res: 6/3

3.

net = Net(
            state_shape=observation_space.shape or observation_space.n,
            action_shape=env.action_space.shape or env.action_space.n,
            hidden_sizes=[128, 256, 256, 128],
            device="cuda" if torch.cuda.is_available() else "cpu",
        ).to("cuda" if torch.cuda.is_available() else "cpu")

train_collector.collect(n_step=128 * 100)  # batch size * training_num     

{'bastaushy': 537, 'qostaushy': 411}

4.

net = Net(
            state_shape=observation_space.shape or observation_space.n,
            action_shape=env.action_space.shape or env.action_space.n,
            hidden_sizes=[256, 512, 512, 256],
            device="cuda" if torch.cuda.is_available() else "cpu",
        ).to("cuda" if torch.cuda.is_available() else "cpu")

train_collector.collect(n_step=128 * 100)  # batch size * training_num   

res: {'bastaushy': 493, 'qostaushy': 483}

res: {'bastaushy': 512, 'qostaushy': 464}

5.

net = Net(
            state_shape=observation_space.shape or observation_space.n,
            action_shape=env.action_space.shape or env.action_space.n,
            hidden_sizes=[512, 1024, 1024, 512],
            device="cuda" if torch.cuda.is_available() else "cpu",
        ).to("cuda" if torch.cuda.is_available() else "cpu")

res: {'bastaushy': 38, 'qostaushy': 59}

res: {'bastaushy': 372, 'qostaushy': 562}

### 🦎 Play with different policies

In [57]:
PLAYS = {"bastaushy": 0, "qostaushy": 0}
# Step 1: Load the PettingZoo environment
env = env(render_mode="human")

# Step 2: Wrap the environment for Tianshou interfacing
env = PettingZooEnv(env)

# # Step 3: Define policies for each agent
policies = MultiAgentPolicyManager([agent4_learned, agent3_learned], env)
# # Step 4: Convert the env to vector format
env = DummyVectorEnv([lambda: env])

# # Step 5: Construct the Collector, which interfaces the policies with the vectorised environment
collector = Collector(policies, env)

# # Step 6: Execute the environment with the agents playing for 1 episode, and render a frame every 2 seconds
result = collector.collect(n_episode=1)
print(PLAYS)

{'bastaushy': 1, 'qostaushy': 0}


🐯 Experiment results


*   agent3 vs agent1: 0-0
*   agent1 vs agent3: 0-1
*   agent3 vs agent2: 0-1
*   agent2 vs agent3: 1-0
*   agent2 vs agent1: 0-1
*   agent1 vs agent2: 1-0
*   trained with agent1 vs agent1: 1-0
*   agent1 vs trained with agent1: 0-0
*   trained with agent1 vs agent2: 0-0
*   agent2 vs trained with agent1: 0-0
*   trained with agent1 vs agent3: 1-0
*   agent3 vs trained with agent1: 0-0

