<a href="https://colab.research.google.com/github/zhus-dika/togyz-qumalaq-agent/blob/main/togyzqumalaq_aec_vs_random_policy.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
#!python -m ipykernel install --user --name=venv

#  🐘 AEC environment https://pettingzoo.farama.org/api/aec/#about-aec

### 🐞 Imports

In [1]:
import gymnasium
import os
import numpy as np
from gymnasium.spaces import Discrete, MultiDiscrete
from gymnasium import spaces

from IPython.display import clear_output
import time
from pettingzoo import AECEnv
from pettingzoo.utils import agent_selector, wrappers
import matplotlib.pyplot as plt

NUM_ITERS = 400
PLAYS = {"bastaushy": 0, "qostaushy": 0}

### 🦉 Create environment

In [2]:
def env(render_mode=None):
    """
    The env function often wraps the environment in wrappers by default.
    You can find full documentation for these methods
    elsewhere in the developer documentation.
    """
    internal_render_mode = render_mode if render_mode != "ansi" else "human"
    env = raw_env(render_mode=internal_render_mode)
    # This wrapper is only for environments which print results to the terminal
    if render_mode == "ansi":
        env = wrappers.CaptureStdoutWrapper(env)
    # this wrapper helps error handling for discrete action spaces
    env = wrappers.AssertOutOfBoundsWrapper(env)
    # Provides a wide vareity of helpful user errors
    # Strongly recommended
    env = wrappers.OrderEnforcingWrapper(env)
    return env


class raw_env(AECEnv):
    """
    The metadata holds environment constants. From gymnasium, we inherit the "render_modes",
    metadata which specifies which modes can be put into the render() method.
    At least human mode should be supported.
    The "name" metadata allows the environment to be pretty printed.
    """

    metadata = {
        "render_modes": ["ansi", "human"],
        "name": "togyzqumalaq_v0"
        }

    def __init__(self, render_mode=None):
        """
        The init method takes in environment arguments and
         should define the following attributes:
        - otaular
        - tuzdyq
        - qazandar
        - possible_agents
        - render_mode

        Note: as of v1.18.1, the action_spaces and observation_spaces attributes are deprecated.
        Spaces should be defined in the action_space() and observation_space() methods.
        If these methods are not overridden, spaces will be inferred from self.observation_spaces/action_spaces, raising a warning.

        These attributes should not be changed after initialization.
        """
        self.otaular = []
        self.tuzdyq = []
        self.qazandar = []
        self.direction = []
        self.agents = ["bastaushy", "qostaushy"]
        self.possible_agents = self.agents[:]
        # optional: we can define the observation and action spaces here as attributes to be used in their corresponding methods
        self.action_spaces = {i: spaces.Discrete(9) for i in self.agents}
        self.observation_spaces = {
            i: spaces.Dict(
                {
                    "observation": MultiDiscrete([100] * 18 + [9] * 2 + [82] * 2),
                    "action_mask": Discrete(9),
                }
            )
            for i in self.agents
        }
        self.render_mode = render_mode

    # Observation space should be defined here.
    def action_space(self, agent):
        return self.action_spaces[agent]

    # Action space should be defined here.
    def observation_space(self, agent):
        return self.observation_spaces[agent]

    def render(self):
        """
        Renders the environment. In human mode, it can print to terminal, open
        up a graphical window, or open up some other display that a human can see and understand.
        """
        """Renders the environment."""
        if self.render_mode is None:
            gymnasium.logger.warn(
                "You are calling render method without specifying any render mode."
            )
            return

        if len(self.agents) == 2:
            points_bastaushy_x = np.array([i * 2 for i in range(10)])
            points_bastaushy_y = np.array([i % 5 for i in range(50)])

            x = np.arange(-3, 225, 1)
            y = -1

            text_kwargs = dict(ha='center', va='center', fontsize=12)
            plt.figure(figsize=(17, 6))

            for i in range(9):
                # qostaushy's part
                plt.scatter(np.repeat(points_bastaushy_x + 25 * i, 5)[:self.otaular[17 - i]], points_bastaushy_y[:self.otaular[17 - i]], marker='o')
                # horizontal line
                plt.plot(x, np.repeat(y, len(x)))
                # vertical lines
                plt.plot(np.repeat(25 * i - 2, len(x)), np.arange(-7, 5, 12 / len(x)))
                # bastaushy's part
                plt.scatter(np.repeat(points_bastaushy_x + 25 * i, 5)[:self.otaular[i]], points_bastaushy_y[:self.otaular[i]] - 6, marker='o')

            #last vertical line
            plt.plot(np.repeat(25 * 9 - 2, len(x)), np.arange(-7, 5, 12 / len(x)))

            for i in range(9):
                # bastaushy's qumalaqtar
                plt.text(25 * i + 10, -7, f'{i} ({self.otaular[i]})', **text_kwargs)
                # qostaushy's qumalaqtar
                plt.text(25 * i + 10, 5, f'{17 - i} ({self.otaular[17 - i]})', **text_kwargs)
            # bastaushy qazan's qumalaqtar
            plt.text(230, -4, f'qazan: {self.qazandar[0]}', **text_kwargs)
            # qostaushy qazan's qumalaqtar
            plt.text(230, 2, f'qazan: {self.qazandar[1]}', **text_kwargs);
            # bastaushy tuzdyq's qumalaqtar
            plt.text(230, -6, f'tuzdyq: {self.tuzdyq[0]}', **text_kwargs)
            # qostaushy tuzdyq's qumalaqtar
            plt.text(230, 0, f'tuzdyq: {self.tuzdyq[1]}', **text_kwargs);
            plt.show()
        else:
            if self.render_mode == "human":
                print("Game over")
        time.sleep(2)
        clear_output(True)

    def _legal_moves(self, agent):
        cur_player = self.possible_agents.index(agent)
        opp_player = (cur_player + 1) % 2
        return [item for item in range(9 * cur_player, (cur_player + 1) * 9) if self.tuzdyq[opp_player] != item and self.otaular[item] > 0]

    def observe(self, agent):
        """
        Observe should return the observation of the specified agent. This function
        should return a sane observation (though not necessarily the most up to date possible)
        at any time after reset() is called.
        """
        # observation of one agent is the previous state of the other
        legal_moves = self._legal_moves(agent) if agent == self.agent_selection else []
        action_mask = np.zeros(9, "int8")
        if self.possible_agents.index(agent) == 1:
            legal_moves = [i - 9 for i in legal_moves]
        for i in legal_moves:
            action_mask[i] = 1
        observation = tuple(
            self.otaular + self.tuzdyq + self.qazandar
        )
        return {"observation": observation, "action_mask": action_mask}

    def close(self):
        """
        Close should release any graphical displays, subprocesses, network connections
        or any other environment data which should not be kept around after the
        user is no longer using the environment.
        """
        pass

    def reset(self, seed=None, options=None):
        """
        Reset needs to initialize the following attributes
        - agents
        - rewards
        - _cumulative_rewards
        - terminations
        - truncations
        - infos
        - agent_selection
        And must set up the environment so that render(), step(), and observe()
        can be called without issues.
        Here it sets up the state dictionary which is used by step() and the observations dictionary which is used by step() and observe()
        """
        self.agents = self.possible_agents[:]
        self.rewards = {agent: 0 for agent in self.agents}
        self._cumulative_rewards = {agent: 0 for agent in self.agents}
        self.otaular = [9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9]
        self.direction = [list(range(18)), [9, 10, 11, 12, 13, 14, 15, 16, 17, 0, 1, 2, 3, 4, 5, 6, 7, 8]]
        self.tuzdyq = [-1, -1]
        self.qazandar = [0, 0]
        self.terminations = {agent: False for agent in self.agents}
        self.truncations = {agent: False for agent in self.agents}
        self.infos = {agent: {} for agent in self.agents}
        self.num_moves = 0
        observation = tuple(
            self.otaular + self.tuzdyq + self.qazandar
        )
        self.observations = {agent: observation for agent in self.agents}
        """
        Our agent_selector utility allows easy cyclic stepping through the agents list.
        """
        self._agent_selector = agent_selector(self.agents)
        self.agent_selection = self._agent_selector.next()

    def step(self, action):
        """
        step(action) takes in an action for the current agent (specified by
        agent_selection) and needs to update
        - rewards
        - _cumulative_rewards (accumulating the rewards)
        - terminations
        - truncations
        - infos
        - agent_selection (to the next agent)
        And any internal state used by observe() or render()
        """
        if (
            self.terminations[self.agent_selection]
            or self.truncations[self.agent_selection]
        ):
            # handles stepping an agent which is already dead
            # accepts a None action for the one agent, and moves the agent_selection to
            # the next dead agent,  or if there are no more dead agents, to the next live agent
            self._was_dead_step(action)
            return

        cur_player = self.possible_agents.index(self.agent_selection)
        opp_player = (cur_player + 1) % 2
        self.num_moves += 1
        if self.render_mode == "human":
            print(f'MOVE #{self.num_moves}')
        # The truncations dictionary must be updated for all players.
        self.truncations = {
            agent: self.num_moves >= NUM_ITERS for agent in self.agents
        }
        # distribute qumalaqs
        if cur_player == 1:
            action += 9
        if self.render_mode == "human":
            print(f'{self.agent_selection} made action {action}')
        num_qumalaq = self.otaular[action]
        idx_action = self.direction[cur_player].index(action)
        if self.otaular[action] == 1:
            self.otaular[self.direction[cur_player][idx_action + 1]] += 1
            self.otaular[action] -= 1
        else:
            i = 1
            while self.otaular[action] > 1:
                self.otaular[self.direction[cur_player][(idx_action + i) % 18]] += 1
                self.otaular[action] -= 1
                i += 1
        # check tuzdyq & add rewards to qazandar
        reward = 0
        if self.check_tuzdyq(self.agent_selection, action):
            reward += 3
            if self.render_mode == "human":
                print(f'{self.agent_selection} won tuzdyq {reward}')
        else:

            if num_qumalaq > 1:
                last_otau = self.direction[cur_player][(idx_action + num_qumalaq - 1) % 18]
            else:
                last_otau = self.direction[cur_player][(idx_action + num_qumalaq) % 18]

            if last_otau in range(opp_player * 9, (opp_player + 1) * 9) and self.otaular[last_otau] % 2 == 0:
                reward += self.otaular[last_otau]
                if self.render_mode == "human":
                    print(f'{self.agent_selection} won {reward}')
                self.otaular[last_otau] = 0
            if self.tuzdyq[cur_player] >= 0 and self.otaular[self.tuzdyq[cur_player]] > 0:
                reward += self.otaular[self.tuzdyq[cur_player]]
                if self.render_mode == "human":
                    print(f'{self.agent_selection} won tuzdyq {self.otaular[self.tuzdyq[cur_player]]}')
                self.otaular[self.tuzdyq[cur_player]] = 0
        if self.render_mode == "human":
            print(f'{self.agent_selection} won total {reward}')
        self.qazandar[cur_player] += reward
        self.rewards[self.agent_selection] += reward
        # check if there is a winner
        winner = self.check_for_winner()
        if winner:
            self.terminations = {i: True for i in self.agents}
            if self.render_mode == "human":
                print(f'{self.agent_selection} won the game!!!')
        # selects the next agent.
        self.agent_selection = self._agent_selector.next()
        # Adds .rewards to ._cumulative_rewards
        self._accumulate_rewards()

        if self.render_mode == "human":
            self.render()

    def check_tuzdyq(self, agent, action):
        cur_player = self.possible_agents.index(agent)
        opp_player = (cur_player + 1) % 2
        idx = self.direction[cur_player].index(action)
        num_qumalaq = self.otaular[action]

        if num_qumalaq > 1:
            last_otau = self.direction[cur_player][(idx + num_qumalaq - 1) % 18]
        else:
            last_otau = self.direction[cur_player][(idx + num_qumalaq) % 18]

        if last_otau in range(opp_player * 9, (opp_player + 1) * 9) and self.otaular[last_otau] == 3 and last_otau != 17 - cur_player * 9 and abs(last_otau - self.tuzdyq[opp_player]) != 9:
            self.tuzdyq[cur_player] = last_otau
            self.otaular[last_otau] = 0
            if self.render_mode == "human":
                print(f'{agent} got tuzdyq {last_otau}!')
            return True

        return False

    def check_atsyrau(self, agent):
        cur_player = self.possible_agents.index(agent)
        opp_player = (cur_player + 1) % 2

        for idx, i in enumerate(self.otaular[cur_player * 9: (cur_player + 1) * 9]):
            if i > 0 and idx + cur_player * 9 != self.tuzdyq[opp_player]:
                return False
        if self.render_mode == "human":
            print(f'{agent} reached atsyrau')
        return True

    def check_for_winner(self):
        cur_player = self.possible_agents.index(self.agent_selection)
        opp_player = (cur_player + 1) % 2
        if self.qazandar[cur_player] > 81:
            PLAYS[self.agent_selection] += 1
            return True
        if self.check_atsyrau(self.possible_agents[opp_player]) and self.qazandar[opp_player] <= 81:
            PLAYS[self.agent_selection] += 1
            return True
        return False

### 🦚 Testing environment

In [None]:
# env = env(render_mode="human")
# env.reset(seed=42)

# for agent in env.agent_iter():
#     observation, reward, termination, truncation, info = env.last()

#     if termination or truncation:
#         action = None
#     else:
#         mask = observation["action_mask"]
#         # this is where you would insert your policy
#         action = env.action_space(agent).sample(mask)

#     env.step(action)
# env.close()

# 🐼 DQN agent to play vs a random policy agent https://pettingzoo.farama.org/tutorials/tianshou/intermediate/

### 🐝 Imports

In [7]:
import os
from typing import Optional, Tuple

import gymnasium
import numpy as np
import torch
from copy import deepcopy
from tianshou.data import Collector, VectorReplayBuffer
from tianshou.env import DummyVectorEnv
from tianshou.env.pettingzoo_env import PettingZooEnv
from tianshou.policy import BasePolicy, DQNPolicy, RainbowPolicy, MultiAgentPolicyManager, RandomPolicy
from tianshou.trainer import OffpolicyTrainer
from tianshou.utils.net.common import Net

### 🐎 Load trained agents

In [37]:
agent1_path = "models/policy_128x256x256x128_bs64.pth"
agent2_path = "models/policy_256x512x512x256_bs128.pth"
agent3_path = "models/policy_512x1024x1024x512_bs128.pth"
agent4_path = "models/policy_128x256x512x256x128_trained_128x256x256x128.pth"

agents_learned = []
env = _get_env()
net1 = Net(
            state_shape=(22,),
            action_shape=env.action_space.shape or env.action_space.n,
            hidden_sizes=[128, 256, 256, 128],
            device="cuda" if torch.cuda.is_available() else "cpu",
        ).to("cuda" if torch.cuda.is_available() else "cpu")

agent1_learned = DQNPolicy(
            model=net1,
            optim = torch.optim.Adam(net1.parameters(), lr=1e-4),
            discount_factor=0.9,
            estimation_step=3,
            target_update_freq=320,
            action_space=env.action_space
        ).to("cuda" if torch.cuda.is_available() else "cpu")

agent1_learned.load_state_dict(torch.load(agent1_path))
agents_learned.append(agent1_learned)


net2 = Net(
            state_shape=(22,),
            action_shape=env.action_space.shape or env.action_space.n,
            hidden_sizes=[256, 512, 512, 256],
            device="cuda" if torch.cuda.is_available() else "cpu",
        ).to("cuda" if torch.cuda.is_available() else "cpu")

agent2_learned = DQNPolicy(
            model=net2,
            optim = torch.optim.Adam(net1.parameters(), lr=1e-4),
            discount_factor=0.9,
            estimation_step=3,
            target_update_freq=320,
            action_space=env.action_space
        ).to("cuda" if torch.cuda.is_available() else "cpu")
agent2_learned.load_state_dict(torch.load(agent2_path))
agents_learned.append(agent2_learned)


net3 = Net(
            state_shape=(22,),
            action_shape=env.action_space.shape or env.action_space.n,
            hidden_sizes=[512, 1024, 1024, 512],
            device="cuda" if torch.cuda.is_available() else "cpu",
        ).to("cuda" if torch.cuda.is_available() else "cpu")

agent3_learned = DQNPolicy(
            model=net3,
            optim = torch.optim.Adam(net3.parameters(), lr=1e-4),
            discount_factor=0.9,
            estimation_step=3,
            target_update_freq=320,
            action_space=env.action_space
        ).to("cuda" if torch.cuda.is_available() else "cpu")
agent3_learned.load_state_dict(torch.load(agent3_path))
agents_learned.append(agent3_learned)


net4 = Net(
            state_shape=(22,),
            action_shape=env.action_space.shape or env.action_space.n,
            hidden_sizes=[128, 256, 512, 256, 128],
            device="cuda" if torch.cuda.is_available() else "cpu",
        ).to("cuda" if torch.cuda.is_available() else "cpu")

agent4_learned = DQNPolicy(
            model=net4,
            optim = torch.optim.Adam(net3.parameters(), lr=1e-4),
            discount_factor=0.9,
            estimation_step=3,
            target_update_freq=320,
            action_space=env.action_space
        ).to("cuda" if torch.cuda.is_available() else "cpu")

agent4_learned.load_state_dict(torch.load(agent4_path))
agents_learned.append(agent4_learned)

### 🐫 Prepare main functions

In [8]:
def _get_agents_dqn(
    agent_learn: Optional[BasePolicy] = None,
    agent_opponent: Optional[BasePolicy] = None,
    optim: Optional[torch.optim.Optimizer] = None,
) -> Tuple[BasePolicy, torch.optim.Optimizer, list]:
    env = _get_env()
    observation_space = (
        env.observation_space["observation"]
        if isinstance(env.observation_space, gymnasium.spaces.Dict)
        else env.observation_space
    )
    if agent_learn is None:
        # model
        net = Net(
            state_shape=observation_space.shape or observation_space.n,
            action_shape=env.action_space.shape or env.action_space.n,
            hidden_sizes=[256, 512, 512, 256],
            #hidden_sizes=[1024, 2048, 2048, 1024],
            device="cuda" if torch.cuda.is_available() else "cpu",
        ).to("cuda" if torch.cuda.is_available() else "cpu")

        if optim is None:
            optim = torch.optim.Adam(net.parameters(), lr=1e-4)
        agent_learn = DQNPolicy(
            model=net,
            optim=optim,
            discount_factor=0.9,
            estimation_step=3,
            target_update_freq=320,
            action_space=env.action_space
        ).to("cuda" if torch.cuda.is_available() else "cpu")


    if agent_opponent is None:
        if agent1_path:
            agent_opponent = agent1_learned
        else:
            agent_opponent = RandomPolicy(action_space=env.action_space)

    agents = [agent_opponent, agent_learn]
    #agents = [agent_learn, agent_opponent]
    policy = MultiAgentPolicyManager(policies=agents, env=env)
    return policy, optim, env.agents


def _get_env(render_mode=None):
    """This function is needed to provide callables for DummyVectorEnv."""
    def env(render_mode=None):
        """
        The env function often wraps the environment in wrappers by default.
        You can find full documentation for these methods
        elsewhere in the developer documentation.
        """
        internal_render_mode = render_mode if render_mode != "ansi" else "human"
        env = raw_env(render_mode=internal_render_mode)
        # This wrapper is only for environments which print results to the terminal
        if render_mode == "ansi":
            env = wrappers.CaptureStdoutWrapper(env)
        # this wrapper helps error handling for discrete action spaces
        env = wrappers.AssertOutOfBoundsWrapper(env)
        # Provides a wide vareity of helpful user errors
        # Strongly recommended
        env = wrappers.OrderEnforcingWrapper(env)
        return env
    return PettingZooEnv(env(render_mode=render_mode))

###  🐑 Training code https://tianshou.org/en/stable/01_tutorials/04_tictactoe.html

In [None]:
# Before evaluate this cell run the cell with env
# ======== Step 1: Environment setup =========
train_envs = DummyVectorEnv([_get_env for _ in range(100)])
test_envs = DummyVectorEnv([_get_env for _ in range(100)])

# seed
seed = 11
np.random.seed(seed)
torch.manual_seed(seed)
train_envs.seed(seed)
test_envs.seed(seed)

# ======== Step 2: Agent setup =========
policy, optim, agents = _get_agents_dqn(agent_opponent=agents_learned[1])

# # ======== Step 3: Collector setup =========
train_collector = Collector(
    policy,
    train_envs,
    VectorReplayBuffer(20_000, len(train_envs)),
    exploration_noise=True,
)
test_collector = Collector(policy, test_envs, exploration_noise=True)
# policy.set_eps(1)

# ======== Step 4: Callback functions setup =========
def save_best_fn(policy):
    model_save_path = os.path.join("models", "dqn", "policy.pth")
    os.makedirs(os.path.join("models", "dqn"), exist_ok=True)
    torch.save(policy.policies[agents[1]].state_dict(), model_save_path)

def stop_fn(mean_rewards):
    return mean_rewards >= 21000

def train_fn(epoch, env_step):
    policy.policies[agents[1]].set_eps(0.1)

def test_fn(epoch, env_step):
    policy.policies[agents[1]].set_eps(0.05)

def reward_metric(rews):
    return rews[:, 1]

# ======== Step 5: Run the trainer =========
result = OffpolicyTrainer(
    policy=policy,
    train_collector=train_collector,
    test_collector=test_collector,
    max_epoch=150,
    step_per_epoch=1000,
    step_per_collect=50,
    episode_per_test=10,
    batch_size=256,
    train_fn=train_fn,
    test_fn=test_fn,
    stop_fn=stop_fn,
    save_best_fn=save_best_fn,
    update_per_step=0.1,
    test_in_train=False,
    reward_metric=reward_metric,
    verbose=True
).run()

# return result, policy.policies[agents[1]]
print(f"\n==========Result==========\n{result}")
print("\n(the trained policy can be accessed via policy.policies[agents[1]])")

### 🐙 Evaluate best Qostaushy agent with random policy

In [67]:
PLAYS = {"bastaushy": 0, "qostaushy": 0}
# Step 1: Load the PettingZoo environment
env = env()#render_mode="human")

# Step 2: Wrap the environment for Tianshou interfacing
env = PettingZooEnv(env)

# # Step 3: Define policies for each agent
policies = MultiAgentPolicyManager(policies=[RandomPolicy(action_space=env.action_space), agents_learned[3]], env=env)
# # Step 4: Convert the env to vector format
env = DummyVectorEnv([lambda: env])

# # Step 5: Construct the Collector, which interfaces the policies with the vectorised environment
collector = Collector(policies, env)

# # Step 6: Execute the environment with the agents playing for 1 episode, and render a frame every ? seconds
result = collector.collect(n_episode=100, reset_before_collect=True)
print(PLAYS)

{'bastaushy': 85, 'qostaushy': 13}


🐳 Experiments

1.

 net = Net(
            state_shape=observation_space.shape or observation_space.n,
            action_shape=env.action_space.shape or env.action_space.n,
            hidden_sizes=[128, 256, 256, 128],
            device="cuda" if torch.cuda.is_available() else "cpu",
        ).to("cuda" if torch.cuda.is_available() else "cpu")

train_collector.collect(n_step=64 * 100)  # batch size * training_num

res: {'bastaushy': 781, 'qostaushy': 194}

res: {'bastaushy': 790, 'qostaushy': 191}

2.

 net = Net(
            state_shape=observation_space.shape or observation_space.n,
            action_shape=env.action_space.shape or env.action_space.n,
            hidden_sizes=[128, 256, 256, 128],
            device="cuda" if torch.cuda.is_available() else "cpu",
        ).to("cuda" if torch.cuda.is_available() else "cpu")

train_collector.collect(n_step=256 * 100)  # batch size * training_num

res: 6/3

3.

net = Net(
            state_shape=observation_space.shape or observation_space.n,
            action_shape=env.action_space.shape or env.action_space.n,
            hidden_sizes=[128, 256, 256, 128],
            device="cuda" if torch.cuda.is_available() else "cpu",
        ).to("cuda" if torch.cuda.is_available() else "cpu")

train_collector.collect(n_step=128 * 100)  # batch size * training_num     

{'bastaushy': 537, 'qostaushy': 411}

4.

net = Net(
            state_shape=observation_space.shape or observation_space.n,
            action_shape=env.action_space.shape or env.action_space.n,
            hidden_sizes=[256, 512, 512, 256],
            device="cuda" if torch.cuda.is_available() else "cpu",
        ).to("cuda" if torch.cuda.is_available() else "cpu")

train_collector.collect(n_step=128 * 100)  # batch size * training_num   

res: {'bastaushy': 493, 'qostaushy': 483}

res: {'bastaushy': 512, 'qostaushy': 464}

5.

net = Net(
            state_shape=observation_space.shape or observation_space.n,
            action_shape=env.action_space.shape or env.action_space.n,
            hidden_sizes=[512, 1024, 1024, 512],
            device="cuda" if torch.cuda.is_available() else "cpu",
        ).to("cuda" if torch.cuda.is_available() else "cpu")

res: {'bastaushy': 38, 'qostaushy': 59}

res: {'bastaushy': 372, 'qostaushy': 562}

### 🦎 Play with different policies

In [77]:
PLAYS = {"bastaushy": 0, "qostaushy": 0}
# Step 1: Load the PettingZoo environment
env = env()#render_mode="human")

# Step 2: Wrap the environment for Tianshou interfacing
env = PettingZooEnv(env)

# # Step 3: Define policies for each agent
policies = MultiAgentPolicyManager(policies=[agents_learned[0], agents_learned[2]], env=env)
# # Step 4: Convert the env to vector format
env = DummyVectorEnv([lambda: env])

# # Step 5: Construct the Collector, which interfaces the policies with the vectorised environment
collector = Collector(policies, env)

# # Step 6: Execute the environment with the agents playing for 1 episode, and render a frame every 2 seconds
result = collector.collect(n_episode=1, reset_before_collect=True)
print(PLAYS)

{'bastaushy': 0, 'qostaushy': 1}


🐯 Experiment results


*   agent3 vs agent1: 0-0
*   agent1 vs agent3: 0-1
*   agent3 vs agent2: 0-1
*   agent2 vs agent3: 1-0
*   agent2 vs agent1: 0-1
*   agent1 vs agent2: 1-0
*   trained with agent1 vs agent1: 1-0
*   agent1 vs trained with agent1: 0-0
*   trained with agent1 vs agent2: 0-0
*   agent2 vs trained with agent1: 0-0
*   trained with agent1 vs agent3: 1-0
*   agent3 vs trained with agent1: 0-0



# 🦩 PPO policy training

### 🐊 Change files tianshou.utils.net.common & tianshou.policy.modelfree.pgpolicy

In [None]:
#common.py
from abc import ABC, abstractmethod
from collections.abc import Callable, Sequence
from typing import Any, Generic, TypeAlias, TypeVar, cast, no_type_check

import numpy as np
import torch
from torch import nn

from tianshou.data import to_torch_as
from tianshou.data.batch import Batch
from tianshou.data.types import RecurrentStateBatch

ModuleType = type[nn.Module]
ArgsType = tuple[Any, ...] | dict[Any, Any] | Sequence[tuple[Any, ...]] | Sequence[dict[Any, Any]]
TActionShape: TypeAlias = Sequence[int] | int | np.int64
TLinearLayer: TypeAlias = Callable[[int, int], nn.Module]
T = TypeVar("T")


def miniblock(
    input_size: int,
    output_size: int = 0,
    norm_layer: ModuleType | None = None,
    norm_args: tuple[Any, ...] | dict[Any, Any] | None = None,
    activation: ModuleType | None = None,
    act_args: tuple[Any, ...] | dict[Any, Any] | None = None,
    linear_layer: TLinearLayer = nn.Linear,
) -> list[nn.Module]:
    """Construct a miniblock with given input/output-size, norm layer and activation."""
    layers: list[nn.Module] = [linear_layer(input_size, output_size)]
    if norm_layer is not None:
        if isinstance(norm_args, tuple):
            layers += [norm_layer(output_size, *norm_args)]
        elif isinstance(norm_args, dict):
            layers += [norm_layer(output_size, **norm_args)]
        else:
            layers += [norm_layer(output_size)]
    if activation is not None:
        if isinstance(act_args, tuple):
            layers += [activation(*act_args)]
        elif isinstance(act_args, dict):
            layers += [activation(**act_args)]
        else:
            layers += [activation()]
    return layers


class MLP(nn.Module):
    """Simple MLP backbone.

    Create a MLP of size input_dim * hidden_sizes[0] * hidden_sizes[1] * ...
    * hidden_sizes[-1] * output_dim

    :param input_dim: dimension of the input vector.
    :param output_dim: dimension of the output vector. If set to 0, there
        is no final linear layer.
    :param hidden_sizes: shape of MLP passed in as a list, not including
        input_dim and output_dim.
    :param norm_layer: use which normalization before activation, e.g.,
        ``nn.LayerNorm`` and ``nn.BatchNorm1d``. Default to no normalization.
        You can also pass a list of normalization modules with the same length
        of hidden_sizes, to use different normalization module in different
        layers. Default to no normalization.
    :param activation: which activation to use after each layer, can be both
        the same activation for all layers if passed in nn.Module, or different
        activation for different Modules if passed in a list. Default to
        nn.ReLU.
    :param device: which device to create this model on. Default to None.
    :param linear_layer: use this module as linear layer. Default to nn.Linear.
    :param flatten_input: whether to flatten input data. Default to True.
    """

    def __init__(
        self,
        input_dim: int,
        output_dim: int = 0,
        hidden_sizes: Sequence[int] = (),
        norm_layer: ModuleType | Sequence[ModuleType] | None = None,
        norm_args: ArgsType | None = None,
        activation: ModuleType | Sequence[ModuleType] | None = nn.ReLU,
        act_args: ArgsType | None = None,
        device: str | int | torch.device | None = None,
        linear_layer: TLinearLayer = nn.Linear,
        flatten_input: bool = True,
    ) -> None:
        super().__init__()
        self.device = device
        if norm_layer:
            if isinstance(norm_layer, list):
                assert len(norm_layer) == len(hidden_sizes)
                norm_layer_list = norm_layer
                if isinstance(norm_args, list):
                    assert len(norm_args) == len(hidden_sizes)
                    norm_args_list = norm_args
                else:
                    norm_args_list = [norm_args for _ in range(len(hidden_sizes))]
            else:
                norm_layer_list = [norm_layer for _ in range(len(hidden_sizes))]
                norm_args_list = [norm_args for _ in range(len(hidden_sizes))]
        else:
            norm_layer_list = [None] * len(hidden_sizes)
            norm_args_list = [None] * len(hidden_sizes)
        if activation:
            if isinstance(activation, list):
                assert len(activation) == len(hidden_sizes)
                activation_list = activation
                if isinstance(act_args, list):
                    assert len(act_args) == len(hidden_sizes)
                    act_args_list = act_args
                else:
                    act_args_list = [act_args for _ in range(len(hidden_sizes))]
            else:
                activation_list = [activation for _ in range(len(hidden_sizes))]
                act_args_list = [act_args for _ in range(len(hidden_sizes))]
        else:
            activation_list = [None] * len(hidden_sizes)
            act_args_list = [None] * len(hidden_sizes)
        hidden_sizes = [input_dim, *list(hidden_sizes)]
        model = []
        for in_dim, out_dim, norm, norm_args, activ, act_args in zip(
            hidden_sizes[:-1],
            hidden_sizes[1:],
            norm_layer_list,
            norm_args_list,
            activation_list,
            act_args_list,
            strict=True,
        ):
            model += miniblock(in_dim, out_dim, norm, norm_args, activ, act_args, linear_layer)
        if output_dim > 0:
            model += [linear_layer(hidden_sizes[-1], output_dim)]
        self.output_dim = output_dim or hidden_sizes[-1]
        self.model = nn.Sequential(*model)
        self.flatten_input = flatten_input

    @no_type_check
    def forward(self, obs: np.ndarray | torch.Tensor) -> torch.Tensor:
        obs = torch.as_tensor(obs, device=self.device, dtype=torch.float32)
        if self.flatten_input:
            obs = obs.flatten(1)
        return self.model(obs)


TRecurrentState = TypeVar("TRecurrentState", bound=Any)


class NetBase(nn.Module, Generic[TRecurrentState], ABC):
    """Interface for NNs used in policies."""

    @abstractmethod
    def forward(
        self,
        obs: np.ndarray | torch.Tensor,
        state: TRecurrentState | None = None,
        info: dict[str, Any] | None = None,
    ) -> tuple[torch.Tensor, TRecurrentState | None]:
        pass


class Net(NetBase[Any]):
    """Wrapper of MLP to support more specific DRL usage.

    For advanced usage (how to customize the network), please refer to
    :ref:`build_the_network`.

    :param state_shape: int or a sequence of int of the shape of state.
    :param action_shape: int or a sequence of int of the shape of action.
    :param hidden_sizes: shape of MLP passed in as a list.
    :param norm_layer: use which normalization before activation, e.g.,
        ``nn.LayerNorm`` and ``nn.BatchNorm1d``. Default to no normalization.
        You can also pass a list of normalization modules with the same length
        of hidden_sizes, to use different normalization module in different
        layers. Default to no normalization.
    :param activation: which activation to use after each layer, can be both
        the same activation for all layers if passed in nn.Module, or different
        activation for different Modules if passed in a list. Default to
        nn.ReLU.
    :param device: specify the device when the network actually runs. Default
        to "cpu".
    :param softmax: whether to apply a softmax layer over the last layer's
        output.
    :param concat: whether the input shape is concatenated by state_shape
        and action_shape. If it is True, ``action_shape`` is not the output
        shape, but affects the input shape only.
    :param num_atoms: in order to expand to the net of distributional RL.
        Default to 1 (not use).
    :param dueling_param: whether to use dueling network to calculate Q
        values (for Dueling DQN). If you want to use dueling option, you should
        pass a tuple of two dict (first for Q and second for V) stating
        self-defined arguments as stated in
        class:`~tianshou.utils.net.common.MLP`. Default to None.
    :param linear_layer: use this module constructor, which takes the input
        and output dimension as input, as linear layer. Default to nn.Linear.

    .. seealso::

        Please refer to :class:`~tianshou.utils.net.common.MLP` for more
        detailed explanation on the usage of activation, norm_layer, etc.

        You can also refer to :class:`~tianshou.utils.net.continuous.Actor`,
        :class:`~tianshou.utils.net.continuous.Critic`, etc, to see how it's
        suggested be used.
    """

    def __init__(
        self,
        state_shape: int | Sequence[int],
        action_shape: TActionShape = 0,
        hidden_sizes: Sequence[int] = (),
        norm_layer: ModuleType | Sequence[ModuleType] | None = None,
        norm_args: ArgsType | None = None,
        activation: ModuleType | Sequence[ModuleType] | None = nn.ReLU,
        act_args: ArgsType | None = None,
        device: str | int | torch.device = "cpu",
        softmax: bool = False,
        concat: bool = False,
        num_atoms: int = 1,
        dueling_param: tuple[dict[str, Any], dict[str, Any]] | None = None,
        linear_layer: TLinearLayer = nn.Linear,
    ) -> None:
        super().__init__()
        self.device = device
        self.softmax = softmax
        self.num_atoms = num_atoms
        self.Q: MLP | None = None
        self.V: MLP | None = None

        input_dim = int(np.prod(state_shape))
        action_dim = int(np.prod(action_shape)) * num_atoms
        if concat:
            input_dim += action_dim
        self.use_dueling = dueling_param is not None
        output_dim = action_dim if not self.use_dueling and not concat else 0
        self.model = MLP(
            input_dim,
            output_dim,
            hidden_sizes,
            norm_layer,
            norm_args,
            activation,
            act_args,
            device,
            linear_layer,
        )
        if self.use_dueling:  # dueling DQN
            assert dueling_param is not None
            kwargs_update = {
                "input_dim": self.model.output_dim,
                "device": self.device,
            }
            # Important: don't change the original dict (e.g., don't use .update())
            q_kwargs = {**dueling_param[0], **kwargs_update}
            v_kwargs = {**dueling_param[1], **kwargs_update}

            q_kwargs["output_dim"] = 0 if concat else action_dim
            v_kwargs["output_dim"] = 0 if concat else num_atoms
            self.Q, self.V = MLP(**q_kwargs), MLP(**v_kwargs)
            self.output_dim = self.Q.output_dim
        else:
            self.output_dim = self.model.output_dim

    def forward(
        self,
        obs: np.ndarray | torch.Tensor,
        state: Any = None,
        info: dict[str, Any] | None = None,
    ) -> tuple[torch.Tensor, Any]:
        """Mapping: obs -> flatten (inside MLP)-> logits.

        :param obs:
        :param state: unused and returned as is
        :param info: unused
        """
        if hasattr(obs, "obs"):
            obs = obs.obs
        logits = self.model(obs)
        batch_size = logits.shape[0]
        if self.use_dueling:  # Dueling DQN
            assert self.Q is not None
            assert self.V is not None
            q, v = self.Q(logits), self.V(logits)
            if self.num_atoms > 1:
                q = q.view(batch_size, -1, self.num_atoms)
                v = v.view(batch_size, -1, self.num_atoms)
            logits = q - q.mean(dim=1, keepdim=True) + v
        elif self.num_atoms > 1:
            logits = logits.view(batch_size, -1, self.num_atoms)
        if self.softmax:
            logits = torch.softmax(logits, dim=-1)
        return logits, state


class Recurrent(NetBase[RecurrentStateBatch]):
    """Simple Recurrent network based on LSTM.

    For advanced usage (how to customize the network), please refer to
    :ref:`build_the_network`.
    """

    def __init__(
        self,
        layer_num: int,
        state_shape: int | Sequence[int],
        action_shape: TActionShape,
        device: str | int | torch.device = "cpu",
        hidden_layer_size: int = 128,
    ) -> None:
        super().__init__()
        self.device = device
        self.nn = nn.LSTM(
            input_size=hidden_layer_size,
            hidden_size=hidden_layer_size,
            num_layers=layer_num,
            batch_first=True,
        )
        self.fc1 = nn.Linear(int(np.prod(state_shape)), hidden_layer_size)
        self.fc2 = nn.Linear(hidden_layer_size, int(np.prod(action_shape)))

    def forward(
        self,
        obs: np.ndarray | torch.Tensor,
        state: RecurrentStateBatch | None = None,
        info: dict[str, Any] | None = None,
    ) -> tuple[torch.Tensor, RecurrentStateBatch]:
        """Mapping: obs -> flatten -> logits.

        In the evaluation mode, `obs` should be with shape ``[bsz, dim]``; in the
        training mode, `obs` should be with shape ``[bsz, len, dim]``. See the code
        and comment for more detail.

        :param obs:
        :param state: either None or a dict with keys 'hidden' and 'cell'
        :param info: unused
        :return: predicted action, next state as dict with keys 'hidden' and 'cell'
        """
        # Note: the original type of state is Batch but it might also be a dict
        # If it is a Batch, .issubset(state) will not work. However,
        # issubset(state.keys()) always works
        if state is not None and not {"hidden", "cell"}.issubset(state.keys()):
            raise ValueError(
                f"Expected to find keys 'hidden' and 'cell' but instead found {state.keys()}",
            )

        obs = torch.as_tensor(obs, device=self.device, dtype=torch.float32)
        # obs [bsz, len, dim] (training) or [bsz, dim] (evaluation)
        # In short, the tensor's shape in training phase is longer than which
        # in evaluation phase.
        if len(obs.shape) == 2:
            obs = obs.unsqueeze(-2)
        obs = self.fc1(obs)
        self.nn.flatten_parameters()
        if state is None:
            obs, (hidden, cell) = self.nn(obs)
        else:
            # we store the stack data in [bsz, len, ...] format
            # but pytorch rnn needs [len, bsz, ...]
            obs, (hidden, cell) = self.nn(
                obs,
                (
                    state["hidden"].transpose(0, 1).contiguous(),
                    state["cell"].transpose(0, 1).contiguous(),
                ),
            )
        obs = self.fc2(obs[:, -1])
        # please ensure the first dim is batch size: [bsz, len, ...]
        rnn_state_batch = cast(
            RecurrentStateBatch,
            Batch(
                {
                    "hidden": hidden.transpose(0, 1).detach(),
                    "cell": cell.transpose(0, 1).detach(),
                },
            ),
        )
        return obs, rnn_state_batch


class ActorCritic(nn.Module):
    """An actor-critic network for parsing parameters.

    Using ``actor_critic.parameters()`` instead of set.union or list+list to avoid
    issue #449.

    :param nn.Module actor: the actor network.
    :param nn.Module critic: the critic network.
    """

    def __init__(self, actor: nn.Module, critic: nn.Module) -> None:
        super().__init__()
        self.actor = actor
        self.critic = critic


class DataParallelNet(nn.Module):
    """DataParallel wrapper for training agent with multi-GPU.

    This class does only the conversion of input data type, from numpy array to torch's
    Tensor. If the input is a nested dictionary, the user should create a similar class
    to do the same thing.

    :param nn.Module net: the network to be distributed in different GPUs.
    """

    def __init__(self, net: nn.Module) -> None:
        super().__init__()
        self.net = nn.DataParallel(net)

    def forward(
        self,
        obs: np.ndarray | torch.Tensor,
        *args: Any,
        **kwargs: Any,
    ) -> tuple[Any, Any]:
        if not isinstance(obs, torch.Tensor):
            obs = torch.as_tensor(obs, dtype=torch.float32)
        return self.net(obs=obs.cuda(), *args, **kwargs)  # noqa: B026


class EnsembleLinear(nn.Module):
    """Linear Layer of Ensemble network.

    :param ensemble_size: Number of subnets in the ensemble.
    :param in_feature: dimension of the input vector.
    :param out_feature: dimension of the output vector.
    :param bias: whether to include an additive bias, default to be True.
    """

    def __init__(
        self,
        ensemble_size: int,
        in_feature: int,
        out_feature: int,
        bias: bool = True,
    ) -> None:
        super().__init__()

        # To be consistent with PyTorch default initializer
        k = np.sqrt(1.0 / in_feature)
        weight_data = torch.rand((ensemble_size, in_feature, out_feature)) * 2 * k - k
        self.weight = nn.Parameter(weight_data, requires_grad=True)

        self.bias_weights: nn.Parameter | None = None
        if bias:
            bias_data = torch.rand((ensemble_size, 1, out_feature)) * 2 * k - k
            self.bias_weights = nn.Parameter(bias_data, requires_grad=True)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        x = torch.matmul(x, self.weight)
        if self.bias_weights is not None:
            x = x + self.bias_weights
        return x


# TODO: fix docstring
class BranchingNet(NetBase[Any]):
    """Branching dual Q network.

    Network for the BranchingDQNPolicy, it uses a common network module, a value module
    and action "branches" one for each dimension.It allows for a linear scaling
    of Q-value the output w.r.t. the number of dimensions in the action space.
    For more info please refer to: arXiv:1711.08946.
    :param state_shape: int or a sequence of int of the shape of state.
    :param action_shape: int or a sequence of int of the shape of action.
    :param action_peer_branch: int or a sequence of int of the number of actions in
    each dimension.
    :param common_hidden_sizes: shape of the common MLP network passed in as a list.
    :param value_hidden_sizes: shape of the value MLP network passed in as a list.
    :param action_hidden_sizes: shape of the action MLP network passed in as a list.
    :param norm_layer: use which normalization before activation, e.g.,
    ``nn.LayerNorm`` and ``nn.BatchNorm1d``. Default to no normalization.
    You can also pass a list of normalization modules with the same length
    of hidden_sizes, to use different normalization module in different
    layers. Default to no normalization.
    :param activation: which activation to use after each layer, can be both
    the same activation for all layers if passed in nn.Module, or different
    activation for different Modules if passed in a list. Default to
    nn.ReLU.
    :param device: specify the device when the network actually runs. Default
    to "cpu".
    :param softmax: whether to apply a softmax layer over the last layer's
    output.
    """

    def __init__(
        self,
        state_shape: int | Sequence[int],
        num_branches: int = 0,
        action_per_branch: int = 2,
        common_hidden_sizes: list[int] | None = None,
        value_hidden_sizes: list[int] | None = None,
        action_hidden_sizes: list[int] | None = None,
        norm_layer: ModuleType | None = None,
        norm_args: ArgsType | None = None,
        activation: ModuleType | None = nn.ReLU,
        act_args: ArgsType | None = None,
        device: str | int | torch.device = "cpu",
    ) -> None:
        super().__init__()
        common_hidden_sizes = common_hidden_sizes or []
        value_hidden_sizes = value_hidden_sizes or []
        action_hidden_sizes = action_hidden_sizes or []

        self.device = device
        self.num_branches = num_branches
        self.action_per_branch = action_per_branch
        # common network
        common_input_dim = int(np.prod(state_shape))
        common_output_dim = 0
        self.common = MLP(
            common_input_dim,
            common_output_dim,
            common_hidden_sizes,
            norm_layer,
            norm_args,
            activation,
            act_args,
            device,
        )
        # value network
        value_input_dim = common_hidden_sizes[-1]
        value_output_dim = 1
        self.value = MLP(
            value_input_dim,
            value_output_dim,
            value_hidden_sizes,
            norm_layer,
            norm_args,
            activation,
            act_args,
            device,
        )
        # action branching network
        action_input_dim = common_hidden_sizes[-1]
        action_output_dim = action_per_branch
        self.branches = nn.ModuleList(
            [
                MLP(
                    action_input_dim,
                    action_output_dim,
                    action_hidden_sizes,
                    norm_layer,
                    norm_args,
                    activation,
                    act_args,
                    device,
                )
                for _ in range(self.num_branches)
            ],
        )

    def forward(
        self,
        obs: np.ndarray | torch.Tensor,
        state: Any = None,
        info: dict[str, Any] | None = None,
    ) -> tuple[torch.Tensor, Any]:
        """Mapping: obs -> model -> logits."""
        common_out = self.common(obs)
        value_out = self.value(common_out)
        value_out = torch.unsqueeze(value_out, 1)
        action_out = []
        for b in self.branches:
            action_out.append(b(common_out))
        action_scores = torch.stack(action_out, 1)
        action_scores = action_scores - torch.mean(action_scores, 2, keepdim=True)
        logits = value_out + action_scores
        return logits, state


def get_dict_state_decorator(
    state_shape: dict[str, int | Sequence[int]],
    keys: Sequence[str],
) -> tuple[Callable, int]:
    """A helper function to make Net or equivalent classes (e.g. Actor, Critic) applicable to dict state.

    The first return item, ``decorator_fn``, will alter the implementation of forward
    function of the given class by preprocessing the observation. The preprocessing is
    basically flatten the observation and concatenate them based on the ``keys`` order.
    The batch dimension is preserved if presented. The result observation shape will
    be equal to ``new_state_shape``, the second return item.

    :param state_shape: A dictionary indicating each state's shape
    :param keys: A list of state's keys. The flatten observation will be according to
        this list order.
    :returns: a 2-items tuple ``decorator_fn`` and ``new_state_shape``
    """
    original_shape = state_shape
    flat_state_shapes = []
    for k in keys:
        flat_state_shapes.append(int(np.prod(state_shape[k])))
    new_state_shape = sum(flat_state_shapes)

    def preprocess_obs(obs: Batch | dict | torch.Tensor | np.ndarray) -> torch.Tensor:
        if isinstance(obs, dict) or (isinstance(obs, Batch) and keys[0] in obs):
            if original_shape[keys[0]] == obs[keys[0]].shape:
                # No batch dim
                new_obs = torch.Tensor([obs[k] for k in keys]).flatten()
                # new_obs = torch.Tensor([obs[k] for k in keys]).reshape(1, -1)
            else:
                bsz = obs[keys[0]].shape[0]
                new_obs = torch.cat([torch.Tensor(obs[k].reshape(bsz, -1)) for k in keys], dim=1)
        else:
            new_obs = torch.Tensor(obs)
        return new_obs

    @no_type_check
    def decorator_fn(net_class):
        class new_net_class(net_class):
            def forward(self, obs: np.ndarray | torch.Tensor, *args, **kwargs) -> Any:
                return super().forward(preprocess_obs(obs), *args, **kwargs)

        return new_net_class

    return decorator_fn, new_state_shape


class BaseActor(nn.Module, ABC):
    @abstractmethod
    def get_preprocess_net(self) -> nn.Module:
        pass

    @abstractmethod
    def get_output_dim(self) -> int:
        pass

    @abstractmethod
    def forward(
        self,
        obs: np.ndarray | torch.Tensor,
        state: Any = None,
        info: dict[str, Any] | None = None,
    ) -> tuple[Any, Any]:
        # TODO: ALGO-REFACTORING. Marked to be addressed as part of Algorithm abstraction.
        #  Return type needs to be more specific
        pass


def getattr_with_matching_alt_value(obj: Any, attr_name: str, alt_value: T | None) -> T:
    """Gets the given attribute from the given object or takes the alternative value if it is not present.
    If both are present, they are required to match.

    :param obj: the object from which to obtain the attribute value
    :param attr_name: the attribute name
    :param alt_value: the alternative value for the case where the attribute is not present, which cannot be None
        if the attribute is not present
    :return: the value
    """
    v = getattr(obj, attr_name)
    if v is not None:
        if alt_value is not None and v != alt_value:
            raise ValueError(
                f"Attribute '{attr_name}' of {obj} is defined ({v}) but does not match alt. value ({alt_value})",
            )
        return v
    else:
        if alt_value is None:
            raise ValueError(
                f"Attribute '{attr_name}' of {obj} is not defined and no fallback given",
            )
        return alt_value


def get_output_dim(module: nn.Module, alt_value: int | None) -> int:
    """Retrieves value the `output_dim` attribute of the given module or uses the given alternative value if the attribute is not present.
    If both are present, they must match.

    :param module: the module
    :param alt_value: the alternative value
    :return: the value
    """
    return getattr_with_matching_alt_value(module, "output_dim", alt_value)


In [None]:
# pgpolicy.py
import warnings
from collections.abc import Callable
from dataclasses import dataclass
from typing import Any, Generic, Literal, TypeVar, cast

import gymnasium as gym
import numpy as np
import torch

from tianshou.data import (
    Batch,
    ReplayBuffer,
    SequenceSummaryStats,
    to_torch,
    to_torch_as,
)
from tianshou.data.batch import BatchProtocol
from tianshou.data.types import (
    BatchWithReturnsProtocol,
    DistBatchProtocol,
    ObsBatchProtocol,
    RolloutBatchProtocol,
)
from tianshou.policy import BasePolicy
from tianshou.policy.base import TLearningRateScheduler, TrainingStats
from tianshou.utils import RunningMeanStd
from tianshou.utils.net.continuous import ActorProb
from tianshou.utils.net.discrete import Actor

# Dimension Naming Convention
# B - Batch Size
# A - Action
# D - Dist input (usually 2, loc and scale)
# H - Dimension of hidden, can be None

TDistFnContinuous = Callable[
    [tuple[torch.Tensor, torch.Tensor]],
    torch.distributions.Distribution,
]
TDistFnDiscrete = Callable[[torch.Tensor], torch.distributions.Categorical]

TDistFnDiscrOrCont = TDistFnContinuous | TDistFnDiscrete


@dataclass(kw_only=True)
class PGTrainingStats(TrainingStats):
    loss: SequenceSummaryStats


TPGTrainingStats = TypeVar("TPGTrainingStats", bound=PGTrainingStats)


class PGPolicy(BasePolicy[TPGTrainingStats], Generic[TPGTrainingStats]):
    """Implementation of REINFORCE algorithm.

    :param actor: the actor network following the rules:
        If `self.action_type == "discrete"`: (`s_B` ->`action_values_BA`).
        If `self.action_type == "continuous"`: (`s_B` -> `dist_input_BD`).
    :param optim: optimizer for actor network.
    :param dist_fn: distribution class for computing the action.
        Maps model_output -> distribution. Typically a Gaussian distribution
        taking `model_output=mean,std` as input for continuous action spaces,
        or a categorical distribution taking `model_output=logits`
        for discrete action spaces. Note that as user, you are responsible
        for ensuring that the distribution is compatible with the action space.
    :param action_space: env's action space.
    :param discount_factor: in [0, 1].
    :param reward_normalization: if True, will normalize the *returns*
        by subtracting the running mean and dividing by the running standard deviation.
        Can be detrimental to performance! See TODO in process_fn.
    :param deterministic_eval: if True, will use deterministic action (the dist's mode)
        instead of stochastic one during evaluation. Does not affect training.
    :param observation_space: Env's observation space.
    :param action_scaling: if True, scale the action from [-1, 1] to the range
        of action_space. Only used if the action_space is continuous.
    :param action_bound_method: method to bound action to range [-1, 1].
        Only used if the action_space is continuous.
    :param lr_scheduler: if not None, will be called in `policy.update()`.

    .. seealso::

        Please refer to :class:`~tianshou.policy.BasePolicy` for more detailed explanation.
    """

    def __init__(
        self,
        *,
        actor: torch.nn.Module | ActorProb | Actor,
        optim: torch.optim.Optimizer,
        dist_fn: TDistFnDiscrOrCont,
        action_space: gym.Space,
        discount_factor: float = 0.99,
        # TODO: rename to return_normalization?
        reward_normalization: bool = False,
        deterministic_eval: bool = False,
        observation_space: gym.Space | None = None,
        # TODO: why change the default from the base?
        action_scaling: bool = True,
        action_bound_method: Literal["clip", "tanh"] | None = "clip",
        lr_scheduler: TLearningRateScheduler | None = None,
    ) -> None:
        super().__init__(
            action_space=action_space,
            observation_space=observation_space,
            action_scaling=action_scaling,
            action_bound_method=action_bound_method,
            lr_scheduler=lr_scheduler,
        )
        if action_scaling and not np.isclose(actor.max_action, 1.0):
            warnings.warn(
                "action_scaling and action_bound_method are only intended"
                "to deal with unbounded model action space, but find actor model"
                f"bound action space with max_action={actor.max_action}."
                "Consider using unbounded=True option of the actor model,"
                "or set action_scaling to False and action_bound_method to None.",
            )
        self.actor = actor
        self.optim = optim
        self.dist_fn = dist_fn
        assert 0.0 <= discount_factor <= 1.0, "discount factor should be in [0, 1]"
        self.gamma = discount_factor
        self.rew_norm = reward_normalization
        self.ret_rms = RunningMeanStd()
        self._eps = 1e-8
        self.deterministic_eval = deterministic_eval

    def process_fn(
        self,
        batch: RolloutBatchProtocol,
        buffer: ReplayBuffer,
        indices: np.ndarray,
    ) -> BatchWithReturnsProtocol:
        r"""Compute the discounted returns (Monte Carlo estimates) for each transition.

        They are added to the batch under the field `returns`.
        Note: this function will modify the input batch!

        .. math::
            G_t = \sum_{i=t}^T \gamma^{i-t}r_i

        where :math:`T` is the terminal time step, :math:`\gamma` is the
        discount factor, :math:`\gamma \in [0, 1]`.

        :param batch: a data batch which contains several episodes of data in
            sequential order. Mind that the end of each finished episode of batch
            should be marked by done flag, unfinished (or collecting) episodes will be
            recognized by buffer.unfinished_index().
        :param buffer: the corresponding replay buffer.
        :param numpy.ndarray indices: tell batch's location in buffer, batch is equal
            to buffer[indices].
        """
        v_s_ = np.full(indices.shape, self.ret_rms.mean)
        # gae_lambda = 1.0 means we use Monte Carlo estimate
        unnormalized_returns, _ = self.compute_episodic_return(
            batch,
            buffer,
            indices,
            v_s_=v_s_,
            gamma=self.gamma,
            gae_lambda=1.0,
        )
        # TODO: overridden in A2C, where mean is not subtracted. Subtracting mean
        #  can be very detrimental! It also has no theoretical grounding.
        #  This should be addressed soon!
        if self.rew_norm:
            batch.returns = (unnormalized_returns - self.ret_rms.mean) / np.sqrt(
                self.ret_rms.var + self._eps,
            )
            self.ret_rms.update(unnormalized_returns)
        else:
            batch.returns = unnormalized_returns
        batch: BatchWithReturnsProtocol
        return batch

    def forward(
        self,
        batch: ObsBatchProtocol,
        state: dict | BatchProtocol | np.ndarray | None = None,
        **kwargs: Any,
    ) -> DistBatchProtocol:
        """Compute action over the given batch data by applying the actor.

        Will sample from the dist_fn, if appropriate.
        Returns a new object representing the processed batch data
        (contrary to other methods that modify the input batch inplace).

        .. seealso::

            Please refer to :meth:`~tianshou.policy.BasePolicy.forward` for
            more detailed explanation.
        """
        # TODO - ALGO: marked for algorithm refactoring
        # print(type(batch))
        # print('batch', batch)
        # print('batch.obs.obs', batch.obs.obs)
        # print('batch.obs.mask', batch.obs.mask)
        obs = batch.obs
        # TODO: this is convoluted! See also other places where this is done.
        obs_next = obs.obs if hasattr(obs, "obs") else obs
        # action_values_BA, hidden_BH = model(obs_next, state=state, info=batch.info)
        action_dist_input_BD, hidden_BH = self.actor(obs_next, state=state, info=batch.info)
        # in the case that self.action_type == "discrete", the dist should always be Categorical, and D=A
        # therefore action_dist_input_BD is equivalent to logits_BA
        # If discrete, dist_fn will typically map loc, scale to a distribution (usually a Gaussian)
        # the action_dist_input_BD in that case is a tuple of loc_B, scale_B and needs to be unpacked

        # print('action_dist_input_BD', action_dist_input_BD)
        # print('to_torch_as(batch.obs.mask, logits)', to_torch_as(obs.mask, action_dist_input_BD))
        if isinstance(action_dist_input_BD, tuple):

            # this is for (mu, sigma) from Normal distribution
            dist = self.dist_fn(*action_dist_input_BD)

        else:  # categorical distribution
        # mask: np.array with shape=(bsz, n_act), dtype=bool, True means available
        # logits: torch.Tensor with shape=(bsz, n_act), dtype=torch.float, range=0~1
            action_dist_input_BD = to_torch_as(batch.obs.mask, action_dist_input_BD)
        dist = self.dist_fn(action_dist_input_BD)
        if self.deterministic_eval and not self.training:
            act_B = dist.mode
        else:
            act_B = dist.sample()
        # act is of dimension BA in continuous case and of dimension B in discrete
        result = Batch(logits=action_dist_input_BD, act=act_B, state=hidden_BH, dist=dist)
        return cast(DistBatchProtocol, result)

    # TODO: why does mypy complain?
    def learn(  # type: ignore
        self,
        batch: BatchWithReturnsProtocol,
        batch_size: int | None,
        repeat: int,
        *args: Any,
        **kwargs: Any,
    ) -> TPGTrainingStats:
        losses = []
        split_batch_size = batch_size or -1
        for _ in range(repeat):
            for minibatch in batch.split(split_batch_size, merge_last=True):
                self.optim.zero_grad()
                result = self(minibatch)
                dist = result.dist
                act = to_torch_as(minibatch.act, result.act)
                ret = to_torch(minibatch.returns, torch.float, result.act.device)
                log_prob = dist.log_prob(act).reshape(len(ret), -1).transpose(0, 1)
                loss = -(log_prob * ret).mean()
                loss.backward()
                self.optim.step()
                losses.append(loss.item())

        loss_summary_stat = SequenceSummaryStats.from_sequence(losses)

        return PGTrainingStats(loss=loss_summary_stat)  # type: ignore[return-value]

### 🦆 Imports

In [49]:
from typing import Optional, Tuple
from tianshou.env.pettingzoo_env import PettingZooEnv
import torch

from tianshou.data import Collector, VectorReplayBuffer
from tianshou.policy import BasePolicy, MultiAgentPolicyManager, RandomPolicy
from tianshou.env import DummyVectorEnv
from tianshou.policy import BasePolicy, PPOPolicy
from tianshou.trainer import OnpolicyTrainer
from tianshou.utils.net.common import ActorCritic, Net
from tianshou.utils.net.discrete import Actor, Critic

device = "cuda" if torch.cuda.is_available() else "cpu"
NUM_ITERS = 400
PLAYS = {"bastaushy": 0, "qostaushy": 0}

### 🐥 Prepare functions

In [218]:
def _get_agents_ppo(
    agent_learn: Optional[BasePolicy] = None,
    agent_opponent: Optional[BasePolicy] = None,
    optim: Optional[torch.optim.Optimizer] = None,
) -> Tuple[BasePolicy, torch.optim.Optimizer, list]:
    env = _get_env()
    observation_space = (
        env.observation_space["observation"]
        if isinstance(env.observation_space, gymnasium.spaces.Dict)
        else env.observation_space
    )
    assert env.observation_space["observation"].shape is not None  # for mypy
    assert isinstance(env.action_space, gymnasium.spaces.Discrete)
    if agent_learn is None:
        # model
        net = Net(state_shape=observation_space.shape, hidden_sizes=[128, 256, 256, 128], device=device).to(device)
        actor = Actor(preprocess_net=net, action_shape=env.action_space.shape, device=device).to(device)
        critic = Critic(preprocess_net=net, device=device).to(device)
        actor_critic = ActorCritic(actor=actor, critic=critic)

        # optimizer of the actor and the critic
        if optim is None:
            optim = torch.optim.Adam(actor_critic.parameters(), lr=0.0003)
        agent_learn: PPOPolicy = PPOPolicy(
            actor=actor,
            critic=critic,
            optim=optim,
            dist_fn=torch.distributions.Categorical,
            action_space=env.action_space,
            deterministic_eval=True,
            action_scaling=False,
        )

    if agent_opponent is None:
        agent_opponent = RandomPolicy(action_space=env.action_space)

    agents = [agent_opponent, agent_learn]
    #agents = [agent_learn, agent_opponent]
    policy = MultiAgentPolicyManager(policies=agents, env=env)
    return policy, optim, env.agents


def _get_env(render_mode=None):
    """This function is needed to provide callables for DummyVectorEnv."""
    def env(render_mode=None):
        """
        The env function often wraps the environment in wrappers by default.
        You can find full documentation for these methods
        elsewhere in the developer documentation.
        """
        internal_render_mode = render_mode if render_mode != "ansi" else "human"
        env = raw_env(render_mode=internal_render_mode)
        # This wrapper is only for environments which print results to the terminal
        if render_mode == "ansi":
            env = wrappers.CaptureStdoutWrapper(env)
        # this wrapper helps error handling for discrete action spaces
        env = wrappers.AssertOutOfBoundsWrapper(env)
        # Provides a wide vareity of helpful user errors
        # Strongly recommended
        env = wrappers.OrderEnforcingWrapper(env)
        return env
    return PettingZooEnv(env(render_mode=render_mode))

### 🐸 Training code

In [68]:
# Before evaluate this cell run the cell with env
# ======== Step 1: Environment setup =========

train_envs = DummyVectorEnv([_get_env for _ in range(100)])
test_envs = DummyVectorEnv([_get_env for _ in range(100)])

# seed
seed = 77
np.random.seed(seed)
torch.manual_seed(seed)
train_envs.seed(seed)
test_envs.seed(seed)

# ======== Step 2: Agent setup =========
policy, optim, agents = _get_agents_ppo()

# ======== Step 3: Collector setup =========
train_collector = Collector(
    policy=policy,
    env=train_envs,
    buffer=VectorReplayBuffer(20_000, len(train_envs)),
)
test_collector = Collector(policy=policy, env=test_envs)

#======== Step 4: Callback functions setup =========

def save_best_fn(policy):
    model_save_path = os.path.join("models", "ppo", "policy_ppo_128x256x256x128.pth")
    os.makedirs(os.path.join("models", "ppo"), exist_ok=True)
    torch.save(policy.policies[agents[1]].state_dict(), model_save_path)

def stop_fn(mean_rewards):
    return mean_rewards >= 22000

def reward_metric(rews):
    return rews[:, 1]

#======== Step 5: Run the trainer =========
result = OnpolicyTrainer(
    policy=policy,
    train_collector=train_collector,
    test_collector=test_collector,
    max_epoch=100,
    step_per_epoch=50000,
    repeat_per_collect=10,
    episode_per_test=10,
    batch_size=256,
    step_per_collect=2000,
    reward_metric=reward_metric,
    stop_fn=stop_fn,
    save_best_fn=save_best_fn
).run()

# return result, policy.policies[agents[1]]
print(f"\n==========Result==========\n{result}")
print("\n(the trained policy can be accessed via policy.policies[agents[1]])")

Epoch #1: 50001it [00:21, 2352.92it/s, env_step=50000, gradient_step=200, len=192, n/ep=14, n/st=2000, rew=10251.79]                            


Epoch #1: test_reward: 5326.200000 ± 4119.836934, best_reward: 5572.000000 ± 3150.819322 in #0


Epoch #2: 50001it [00:23, 2165.75it/s, env_step=100000, gradient_step=400, len=163, n/ep=14, n/st=2000, rew=8485.25]                            


Epoch #2: test_reward: 4131.600000 ± 3282.587004, best_reward: 5572.000000 ± 3150.819322 in #0


Epoch #3: 50001it [00:22, 2220.85it/s, env_step=150000, gradient_step=600, len=144, n/ep=11, n/st=2000, rew=7083.59]                            


Epoch #3: test_reward: 5586.000000 ± 3230.172255, best_reward: 5586.000000 ± 3230.172255 in #3


Epoch #4: 50001it [00:20, 2468.04it/s, env_step=200000, gradient_step=800, len=141, n/ep=11, n/st=2000, rew=7083.77]                            


Epoch #4: test_reward: 4204.000000 ± 2408.019933, best_reward: 5586.000000 ± 3230.172255 in #3


Epoch #5: 50001it [00:19, 2551.85it/s, env_step=250000, gradient_step=1000, len=137, n/ep=14, n/st=2000, rew=6835.57]                           


Epoch #5: test_reward: 5098.800000 ± 4635.544171, best_reward: 5586.000000 ± 3230.172255 in #3


Epoch #6: 50001it [00:19, 2510.60it/s, env_step=300000, gradient_step=1200, len=183, n/ep=16, n/st=2000, rew=9730.31]                           


Epoch #6: test_reward: 4358.200000 ± 3324.836652, best_reward: 5586.000000 ± 3230.172255 in #3


Epoch #7: 50001it [00:20, 2469.95it/s, env_step=350000, gradient_step=1400, len=183, n/ep=10, n/st=2000, rew=9454.10]                           


Epoch #7: test_reward: 5757.000000 ± 3577.310750, best_reward: 5757.000000 ± 3577.310750 in #7


Epoch #8: 50001it [00:20, 2414.89it/s, env_step=400000, gradient_step=1600, len=176, n/ep=13, n/st=2000, rew=8995.08]                           


Epoch #8: test_reward: 6098.200000 ± 4260.609670, best_reward: 6098.200000 ± 4260.609670 in #8


Epoch #9: 50001it [00:17, 2862.55it/s, env_step=450000, gradient_step=1800, len=156, n/ep=14, n/st=2000, rew=7924.71]                           


Epoch #9: test_reward: 4496.200000 ± 3081.464386, best_reward: 6098.200000 ± 4260.609670 in #8


Epoch #10: 50001it [00:19, 2569.37it/s, env_step=500000, gradient_step=2000, len=149, n/ep=11, n/st=2000, rew=7709.68]                          


Epoch #10: test_reward: 6602.500000 ± 4518.199359, best_reward: 6602.500000 ± 4518.199359 in #10


Epoch #11: 50001it [00:17, 2783.69it/s, env_step=550000, gradient_step=2200, len=180, n/ep=12, n/st=2000, rew=9105.67]                          


Epoch #11: test_reward: 3592.400000 ± 2351.290675, best_reward: 6602.500000 ± 4518.199359 in #10


Epoch #12: 50001it [00:19, 2629.72it/s, env_step=600000, gradient_step=2400, len=186, n/ep=17, n/st=2000, rew=9543.65]                          


Epoch #12: test_reward: 4511.200000 ± 2234.224734, best_reward: 6602.500000 ± 4518.199359 in #10


Epoch #13: 50001it [00:18, 2659.34it/s, env_step=650000, gradient_step=2600, len=152, n/ep=10, n/st=2000, rew=7499.35]                          


Epoch #13: test_reward: 2609.200000 ± 887.379152, best_reward: 6602.500000 ± 4518.199359 in #10


Epoch #14: 50001it [00:20, 2443.70it/s, env_step=700000, gradient_step=2800, len=171, n/ep=16, n/st=2000, rew=9055.66]                          


Epoch #14: test_reward: 4877.800000 ± 3549.715420, best_reward: 6602.500000 ± 4518.199359 in #10


Epoch #15: 50001it [00:19, 2546.54it/s, env_step=750000, gradient_step=3000, len=154, n/ep=17, n/st=2000, rew=7804.38]                          


Epoch #15: test_reward: 3852.800000 ± 2870.296737, best_reward: 6602.500000 ± 4518.199359 in #10


Epoch #16: 50001it [00:19, 2629.95it/s, env_step=800000, gradient_step=3200, len=170, n/ep=15, n/st=2000, rew=8896.10]                          


Epoch #16: test_reward: 4611.800000 ± 2678.213352, best_reward: 6602.500000 ± 4518.199359 in #10


Epoch #17: 50001it [00:18, 2727.96it/s, env_step=850000, gradient_step=3400, len=139, n/ep=11, n/st=2000, rew=6684.50]                          


Epoch #17: test_reward: 3369.800000 ± 1696.818541, best_reward: 6602.500000 ± 4518.199359 in #10


Epoch #18: 50001it [00:19, 2628.89it/s, env_step=900000, gradient_step=3600, len=116, n/ep=6, n/st=2000, rew=5218.00]                           


Epoch #18: test_reward: 4780.800000 ± 2903.210113, best_reward: 6602.500000 ± 4518.199359 in #10


Epoch #19: 50001it [00:19, 2602.10it/s, env_step=950000, gradient_step=3800, len=158, n/ep=8, n/st=2000, rew=8117.75]                           


Epoch #19: test_reward: 4059.800000 ± 1350.193897, best_reward: 6602.500000 ± 4518.199359 in #10


Epoch #20: 50001it [00:18, 2659.47it/s, env_step=1000000, gradient_step=4000, len=177, n/ep=16, n/st=2000, rew=9345.25]                         


Epoch #20: test_reward: 2745.200000 ± 2169.040931, best_reward: 6602.500000 ± 4518.199359 in #10


Epoch #21: 50001it [00:19, 2613.66it/s, env_step=1050000, gradient_step=4200, len=196, n/ep=11, n/st=2000, rew=10159.68]                        


Epoch #21: test_reward: 3393.200000 ± 1798.118839, best_reward: 6602.500000 ± 4518.199359 in #10


Epoch #22: 50001it [00:18, 2774.77it/s, env_step=1100000, gradient_step=4400, len=177, n/ep=12, n/st=2000, rew=8973.88]                         


Epoch #22: test_reward: 3774.800000 ± 1912.079747, best_reward: 6602.500000 ± 4518.199359 in #10


Epoch #23: 50001it [00:19, 2590.21it/s, env_step=1150000, gradient_step=4600, len=142, n/ep=9, n/st=2000, rew=6789.22]                          


Epoch #23: test_reward: 5076.400000 ± 3637.774023, best_reward: 6602.500000 ± 4518.199359 in #10


Epoch #24: 50001it [00:19, 2593.75it/s, env_step=1200000, gradient_step=4800, len=150, n/ep=13, n/st=2000, rew=7813.50]                         


Epoch #24: test_reward: 3706.200000 ± 1130.051486, best_reward: 6602.500000 ± 4518.199359 in #10


Epoch #25: 50001it [00:18, 2770.69it/s, env_step=1250000, gradient_step=5000, len=169, n/ep=15, n/st=2000, rew=8860.13]                         


Epoch #25: test_reward: 3112.800000 ± 2793.697936, best_reward: 6602.500000 ± 4518.199359 in #10


Epoch #26: 50001it [00:19, 2585.72it/s, env_step=1300000, gradient_step=5200, len=159, n/ep=14, n/st=2000, rew=8011.89]                         


Epoch #26: test_reward: 3768.200000 ± 1888.206228, best_reward: 6602.500000 ± 4518.199359 in #10


Epoch #27: 50001it [00:19, 2592.54it/s, env_step=1350000, gradient_step=5400, len=193, n/ep=9, n/st=2000, rew=10182.78]                         


Epoch #27: test_reward: 3174.800000 ± 1242.355247, best_reward: 6602.500000 ± 4518.199359 in #10


Epoch #28: 50001it [00:18, 2699.85it/s, env_step=1400000, gradient_step=5600, len=201, n/ep=11, n/st=2000, rew=10718.73]                        


Epoch #28: test_reward: 4321.200000 ± 2832.275156, best_reward: 6602.500000 ± 4518.199359 in #10


Epoch #29: 50001it [00:18, 2701.37it/s, env_step=1450000, gradient_step=5800, len=130, n/ep=14, n/st=2000, rew=6549.61]                         


Epoch #29: test_reward: 4474.800000 ± 4186.136854, best_reward: 6602.500000 ± 4518.199359 in #10


Epoch #30: 50001it [00:19, 2584.78it/s, env_step=1500000, gradient_step=6000, len=189, n/ep=14, n/st=2000, rew=9697.04]                         


Epoch #30: test_reward: 6022.200000 ± 3815.354605, best_reward: 6602.500000 ± 4518.199359 in #10


Epoch #31: 50001it [00:19, 2501.03it/s, env_step=1550000, gradient_step=6200, len=164, n/ep=13, n/st=2000, rew=8628.27]                         


Epoch #31: test_reward: 4164.200000 ± 2425.888448, best_reward: 6602.500000 ± 4518.199359 in #10


Epoch #32: 50001it [00:19, 2525.01it/s, env_step=1600000, gradient_step=6400, len=148, n/ep=10, n/st=2000, rew=7309.15]                         


Epoch #32: test_reward: 4856.000000 ± 2788.553890, best_reward: 6602.500000 ± 4518.199359 in #10


Epoch #33: 50001it [00:20, 2477.69it/s, env_step=1650000, gradient_step=6600, len=143, n/ep=11, n/st=2000, rew=7114.50]                         


Epoch #33: test_reward: 5252.000000 ± 2685.595800, best_reward: 6602.500000 ± 4518.199359 in #10


Epoch #34: 50001it [00:19, 2591.54it/s, env_step=1700000, gradient_step=6800, len=166, n/ep=14, n/st=2000, rew=8756.82]                         


Epoch #34: test_reward: 6460.600000 ± 3876.872662, best_reward: 6602.500000 ± 4518.199359 in #10


Epoch #35: 50001it [00:20, 2395.63it/s, env_step=1750000, gradient_step=7000, len=169, n/ep=17, n/st=2000, rew=8533.94]                         


Epoch #35: test_reward: 3292.000000 ± 1212.639105, best_reward: 6602.500000 ± 4518.199359 in #10


Epoch #36: 50001it [00:20, 2469.25it/s, env_step=1800000, gradient_step=7200, len=128, n/ep=10, n/st=2000, rew=6587.90]                         


Epoch #36: test_reward: 4945.200000 ± 3049.082249, best_reward: 6602.500000 ± 4518.199359 in #10


Epoch #37: 50001it [00:19, 2629.92it/s, env_step=1850000, gradient_step=7400, len=189, n/ep=12, n/st=2000, rew=10433.12]                        


Epoch #37: test_reward: 4568.200000 ± 2871.068435, best_reward: 6602.500000 ± 4518.199359 in #10


Epoch #38: 50001it [00:18, 2670.91it/s, env_step=1900000, gradient_step=7600, len=148, n/ep=10, n/st=2000, rew=7115.85]                         


Epoch #38: test_reward: 5214.400000 ± 2458.259677, best_reward: 6602.500000 ± 4518.199359 in #10


Epoch #39: 50001it [00:19, 2576.33it/s, env_step=1950000, gradient_step=7800, len=133, n/ep=10, n/st=2000, rew=6608.25]                         


Epoch #39: test_reward: 5086.000000 ± 1937.245674, best_reward: 6602.500000 ± 4518.199359 in #10


Epoch #40: 50001it [00:20, 2429.73it/s, env_step=2000000, gradient_step=8000, len=167, n/ep=16, n/st=2000, rew=9151.50]                         


Epoch #40: test_reward: 3670.400000 ± 1516.454760, best_reward: 6602.500000 ± 4518.199359 in #10


Epoch #41: 50001it [00:20, 2445.77it/s, env_step=2050000, gradient_step=8200, len=147, n/ep=17, n/st=2000, rew=7385.62]                         


Epoch #41: test_reward: 3598.600000 ± 2151.073230, best_reward: 6602.500000 ± 4518.199359 in #10


Epoch #42: 50001it [00:19, 2514.32it/s, env_step=2100000, gradient_step=8400, len=149, n/ep=9, n/st=2000, rew=7409.83]                          


Epoch #42: test_reward: 5013.200000 ± 3670.029667, best_reward: 6602.500000 ± 4518.199359 in #10


Epoch #43: 50001it [00:19, 2523.44it/s, env_step=2150000, gradient_step=8600, len=170, n/ep=12, n/st=2000, rew=8701.25]                         


Epoch #43: test_reward: 6328.000000 ± 4468.991743, best_reward: 6602.500000 ± 4518.199359 in #10


Epoch #44: 50001it [00:20, 2445.61it/s, env_step=2200000, gradient_step=8800, len=168, n/ep=12, n/st=2000, rew=8545.83]                         


Epoch #44: test_reward: 2319.000000 ± 2618.034263, best_reward: 6602.500000 ± 4518.199359 in #10


Epoch #45: 50001it [00:19, 2536.27it/s, env_step=2250000, gradient_step=9000, len=152, n/ep=16, n/st=2000, rew=7485.56]                         


Epoch #45: test_reward: 3732.400000 ± 2543.545211, best_reward: 6602.500000 ± 4518.199359 in #10


Epoch #46: 50001it [00:19, 2529.46it/s, env_step=2300000, gradient_step=9200, len=165, n/ep=12, n/st=2000, rew=8536.42]                         


Epoch #46: test_reward: 4376.400000 ± 2614.282586, best_reward: 6602.500000 ± 4518.199359 in #10


Epoch #47: 50001it [00:20, 2487.14it/s, env_step=2350000, gradient_step=9400, len=136, n/ep=14, n/st=2000, rew=6549.39]                         


Epoch #47: test_reward: 4850.800000 ± 2671.903322, best_reward: 6602.500000 ± 4518.199359 in #10


Epoch #48: 50001it [00:19, 2575.07it/s, env_step=2400000, gradient_step=9600, len=164, n/ep=17, n/st=2000, rew=7907.59]                         


Epoch #48: test_reward: 4584.800000 ± 3036.019657, best_reward: 6602.500000 ± 4518.199359 in #10


Epoch #49: 50001it [00:20, 2498.61it/s, env_step=2450000, gradient_step=9800, len=181, n/ep=14, n/st=2000, rew=9660.61]                         


Epoch #49: test_reward: 3338.400000 ± 1821.017035, best_reward: 6602.500000 ± 4518.199359 in #10


Epoch #50: 50001it [00:19, 2605.42it/s, env_step=2500000, gradient_step=10000, len=176, n/ep=11, n/st=2000, rew=9237.73]                        


Epoch #50: test_reward: 4193.800000 ± 2672.109347, best_reward: 6602.500000 ± 4518.199359 in #10


Epoch #51: 50001it [00:19, 2623.53it/s, env_step=2550000, gradient_step=10200, len=172, n/ep=14, n/st=2000, rew=8769.71]                        


Epoch #51: test_reward: 3710.400000 ± 2025.099859, best_reward: 6602.500000 ± 4518.199359 in #10


Epoch #52: 50001it [00:19, 2577.53it/s, env_step=2600000, gradient_step=10400, len=113, n/ep=12, n/st=2000, rew=5238.88]                        


Epoch #52: test_reward: 4558.000000 ± 2206.235527, best_reward: 6602.500000 ± 4518.199359 in #10


Epoch #53: 50001it [00:18, 2640.78it/s, env_step=2650000, gradient_step=10600, len=164, n/ep=14, n/st=2000, rew=8170.54]                        


Epoch #53: test_reward: 5211.200000 ± 1880.252792, best_reward: 6602.500000 ± 4518.199359 in #10


Epoch #54: 50001it [00:19, 2606.99it/s, env_step=2700000, gradient_step=10800, len=145, n/ep=8, n/st=2000, rew=6741.12]                         


Epoch #54: test_reward: 6237.200000 ± 4673.507479, best_reward: 6602.500000 ± 4518.199359 in #10


Epoch #55: 50001it [00:19, 2607.36it/s, env_step=2750000, gradient_step=11000, len=119, n/ep=5, n/st=2000, rew=5784.90]                         


Epoch #55: test_reward: 4586.200000 ± 2905.288757, best_reward: 6602.500000 ± 4518.199359 in #10


Epoch #56: 50001it [00:19, 2606.41it/s, env_step=2800000, gradient_step=11200, len=119, n/ep=10, n/st=2000, rew=5592.60]                        


Epoch #56: test_reward: 4481.400000 ± 1730.077004, best_reward: 6602.500000 ± 4518.199359 in #10


Epoch #57: 50001it [00:19, 2573.82it/s, env_step=2850000, gradient_step=11400, len=136, n/ep=13, n/st=2000, rew=6498.54]                        


Epoch #57: test_reward: 4687.400000 ± 2457.547729, best_reward: 6602.500000 ± 4518.199359 in #10


Epoch #58: 50001it [00:19, 2624.29it/s, env_step=2900000, gradient_step=11600, len=156, n/ep=9, n/st=2000, rew=7989.39]                         


Epoch #58: test_reward: 5284.000000 ± 2993.724904, best_reward: 6602.500000 ± 4518.199359 in #10


Epoch #59: 50001it [00:19, 2600.51it/s, env_step=2950000, gradient_step=11800, len=173, n/ep=12, n/st=2000, rew=8961.29]                        


Epoch #59: test_reward: 3549.400000 ± 1934.511422, best_reward: 6602.500000 ± 4518.199359 in #10


Epoch #60: 50001it [00:20, 2490.63it/s, env_step=3000000, gradient_step=12000, len=197, n/ep=15, n/st=2000, rew=10256.67]                       


Epoch #60: test_reward: 4800.800000 ± 3278.418546, best_reward: 6602.500000 ± 4518.199359 in #10


Epoch #61: 50001it [00:20, 2499.43it/s, env_step=3050000, gradient_step=12200, len=149, n/ep=10, n/st=2000, rew=7564.95]                        


Epoch #61: test_reward: 2620.000000 ± 1782.639167, best_reward: 6602.500000 ± 4518.199359 in #10


Epoch #62: 50001it [00:18, 2647.89it/s, env_step=3100000, gradient_step=12400, len=149, n/ep=15, n/st=2000, rew=7022.90]                        


Epoch #62: test_reward: 3423.600000 ± 2365.344592, best_reward: 6602.500000 ± 4518.199359 in #10


Epoch #63: 50001it [00:20, 2432.47it/s, env_step=3150000, gradient_step=12600, len=134, n/ep=13, n/st=2000, rew=6096.19]                        


Epoch #63: test_reward: 3952.000000 ± 1916.713646, best_reward: 6602.500000 ± 4518.199359 in #10


Epoch #64: 50001it [00:21, 2369.95it/s, env_step=3200000, gradient_step=12800, len=180, n/ep=11, n/st=2000, rew=9633.68]                        


Epoch #64: test_reward: 4388.600000 ± 2970.434318, best_reward: 6602.500000 ± 4518.199359 in #10


Epoch #65: 50001it [00:19, 2622.78it/s, env_step=3250000, gradient_step=13000, len=168, n/ep=15, n/st=2000, rew=8386.67]                        


Epoch #65: test_reward: 3751.000000 ± 3131.544316, best_reward: 6602.500000 ± 4518.199359 in #10


Epoch #66: 50001it [00:18, 2637.05it/s, env_step=3300000, gradient_step=13200, len=205, n/ep=14, n/st=2000, rew=10296.86]                       


Epoch #66: test_reward: 4132.200000 ± 3106.201726, best_reward: 6602.500000 ± 4518.199359 in #10


Epoch #67: 50001it [00:18, 2752.90it/s, env_step=3350000, gradient_step=13400, len=152, n/ep=9, n/st=2000, rew=6819.50]                         


Epoch #67: test_reward: 2992.200000 ± 2025.003002, best_reward: 6602.500000 ± 4518.199359 in #10


Epoch #68: 50001it [00:18, 2682.17it/s, env_step=3400000, gradient_step=13600, len=194, n/ep=9, n/st=2000, rew=10431.94]                        


Epoch #68: test_reward: 3881.600000 ± 1640.350889, best_reward: 6602.500000 ± 4518.199359 in #10


Epoch #69: 50001it [00:21, 2294.73it/s, env_step=3450000, gradient_step=13800, len=140, n/ep=11, n/st=2000, rew=6674.09]                        


Epoch #69: test_reward: 4849.200000 ± 2715.142457, best_reward: 6602.500000 ± 4518.199359 in #10


Epoch #70: 50001it [00:19, 2618.13it/s, env_step=3500000, gradient_step=14000, len=147, n/ep=9, n/st=2000, rew=6839.06]                         


Epoch #70: test_reward: 4185.800000 ± 2298.116176, best_reward: 6602.500000 ± 4518.199359 in #10


Epoch #71: 50001it [00:19, 2547.97it/s, env_step=3550000, gradient_step=14200, len=181, n/ep=15, n/st=2000, rew=9245.47]                        


Epoch #71: test_reward: 6512.000000 ± 2964.698703, best_reward: 6602.500000 ± 4518.199359 in #10


Epoch #72: 50001it [00:20, 2481.44it/s, env_step=3600000, gradient_step=14400, len=165, n/ep=16, n/st=2000, rew=8092.28]                        


Epoch #72: test_reward: 6213.700000 ± 4076.827395, best_reward: 6602.500000 ± 4518.199359 in #10


Epoch #73: 50001it [00:20, 2453.66it/s, env_step=3650000, gradient_step=14600, len=197, n/ep=8, n/st=2000, rew=10967.88]                        


Epoch #73: test_reward: 2853.400000 ± 1787.691931, best_reward: 6602.500000 ± 4518.199359 in #10


Epoch #74: 50001it [00:20, 2497.21it/s, env_step=3700000, gradient_step=14800, len=209, n/ep=9, n/st=2000, rew=11509.50]                        


Epoch #74: test_reward: 3363.600000 ± 2897.608780, best_reward: 6602.500000 ± 4518.199359 in #10


Epoch #75: 50001it [00:19, 2610.36it/s, env_step=3750000, gradient_step=15000, len=184, n/ep=12, n/st=2000, rew=9276.42]                        


Epoch #75: test_reward: 5571.600000 ± 3707.875597, best_reward: 6602.500000 ± 4518.199359 in #10


Epoch #76: 50001it [00:18, 2723.19it/s, env_step=3800000, gradient_step=15200, len=203, n/ep=5, n/st=2000, rew=10413.40]                        


Epoch #76: test_reward: 6105.000000 ± 3292.483834, best_reward: 6602.500000 ± 4518.199359 in #10


Epoch #77: 50001it [00:18, 2727.72it/s, env_step=3850000, gradient_step=15400, len=256, n/ep=7, n/st=2000, rew=13936.14]                        


Epoch #77: test_reward: 2995.000000 ± 2505.556745, best_reward: 6602.500000 ± 4518.199359 in #10


Epoch #78: 50001it [00:20, 2488.74it/s, env_step=3900000, gradient_step=15600, len=146, n/ep=6, n/st=2000, rew=6939.50]                         


Epoch #78: test_reward: 4142.400000 ± 2155.003814, best_reward: 6602.500000 ± 4518.199359 in #10


Epoch #79: 50001it [00:18, 2685.88it/s, env_step=3950000, gradient_step=15800, len=185, n/ep=14, n/st=2000, rew=9614.79]                        


Epoch #79: test_reward: 4051.800000 ± 2459.068189, best_reward: 6602.500000 ± 4518.199359 in #10


Epoch #80: 50001it [00:19, 2570.16it/s, env_step=4000000, gradient_step=16000, len=167, n/ep=11, n/st=2000, rew=8395.23]                        


Epoch #80: test_reward: 4231.600000 ± 2481.118425, best_reward: 6602.500000 ± 4518.199359 in #10


Epoch #81: 50001it [00:18, 2693.57it/s, env_step=4050000, gradient_step=16200, len=166, n/ep=7, n/st=2000, rew=7624.50]                         


Epoch #81: test_reward: 4549.000000 ± 3190.031881, best_reward: 6602.500000 ± 4518.199359 in #10


Epoch #82: 50001it [00:18, 2644.80it/s, env_step=4100000, gradient_step=16400, len=143, n/ep=14, n/st=2000, rew=7207.57]                        


Epoch #82: test_reward: 4128.800000 ± 1927.700848, best_reward: 6602.500000 ± 4518.199359 in #10


Epoch #83: 50001it [00:19, 2553.02it/s, env_step=4150000, gradient_step=16600, len=160, n/ep=13, n/st=2000, rew=8088.15]                        


Epoch #83: test_reward: 3168.400000 ± 2436.067454, best_reward: 6602.500000 ± 4518.199359 in #10


Epoch #84: 50001it [00:19, 2502.39it/s, env_step=4200000, gradient_step=16800, len=159, n/ep=17, n/st=2000, rew=7796.56]                        


Epoch #84: test_reward: 3946.400000 ± 2144.008358, best_reward: 6602.500000 ± 4518.199359 in #10


Epoch #85: 50001it [00:18, 2727.02it/s, env_step=4250000, gradient_step=17000, len=158, n/ep=14, n/st=2000, rew=7641.96]                        


Epoch #85: test_reward: 5063.400000 ± 3161.745474, best_reward: 6602.500000 ± 4518.199359 in #10


Epoch #86: 50001it [00:20, 2438.89it/s, env_step=4300000, gradient_step=17200, len=149, n/ep=11, n/st=2000, rew=7373.18]                        


Epoch #86: test_reward: 3849.000000 ± 1854.193140, best_reward: 6602.500000 ± 4518.199359 in #10


Epoch #87: 50001it [00:20, 2472.80it/s, env_step=4350000, gradient_step=17400, len=180, n/ep=13, n/st=2000, rew=8866.00]                        


Epoch #87: test_reward: 3398.000000 ± 2384.036409, best_reward: 6602.500000 ± 4518.199359 in #10


Epoch #88: 50001it [00:18, 2767.21it/s, env_step=4400000, gradient_step=17600, len=194, n/ep=18, n/st=2000, rew=10353.17]                       


Epoch #88: test_reward: 5092.400000 ± 4270.532782, best_reward: 6602.500000 ± 4518.199359 in #10


Epoch #89: 50001it [00:20, 2469.27it/s, env_step=4450000, gradient_step=17800, len=153, n/ep=10, n/st=2000, rew=7301.65]                        


Epoch #89: test_reward: 3865.200000 ± 2250.190783, best_reward: 6602.500000 ± 4518.199359 in #10


Epoch #90: 50001it [00:20, 2494.99it/s, env_step=4500000, gradient_step=18000, len=187, n/ep=10, n/st=2000, rew=10085.75]                       


Epoch #90: test_reward: 3568.000000 ± 2623.652873, best_reward: 6602.500000 ± 4518.199359 in #10


Epoch #91: 50001it [00:20, 2384.18it/s, env_step=4550000, gradient_step=18200, len=178, n/ep=14, n/st=2000, rew=9025.86]                        


Epoch #91: test_reward: 3832.400000 ± 1409.067436, best_reward: 6602.500000 ± 4518.199359 in #10


Epoch #92: 50001it [00:19, 2596.77it/s, env_step=4600000, gradient_step=18400, len=146, n/ep=16, n/st=2000, rew=6939.19]                        


Epoch #92: test_reward: 4641.000000 ± 2008.585821, best_reward: 6602.500000 ± 4518.199359 in #10


Epoch #93: 50001it [00:20, 2415.39it/s, env_step=4650000, gradient_step=18600, len=131, n/ep=10, n/st=2000, rew=6322.25]                        


Epoch #93: test_reward: 4366.200000 ± 3091.402588, best_reward: 6602.500000 ± 4518.199359 in #10


Epoch #94:  20%|#4     | 10000/50000 [00:04<00:19, 2047.28it/s, env_step=4660000, gradient_step=18640, len=147, n/ep=13, n/st=2000, rew=7074.42]


KeyboardInterrupt: 

### 🐃 Load ppo agents

In [165]:
agent1_learned_ppo_path = "models/policy_ppo_64x128x128x64.pth"
agent2_learned_ppo_path = "models/policy_ppo_128x256x256x128.pth"
env = _get_env()
net1 = Net(state_shape=(22,), hidden_sizes=[64, 128, 128, 64], device=device).to(device)
actor = Actor(preprocess_net=net1, action_shape=env.action_space.shape, device=device).to(device)
critic = Critic(preprocess_net=net1, device=device).to(device)
actor_critic = ActorCritic(actor=actor, critic=critic)
optim = torch.optim.Adam(actor_critic.parameters(), lr=0.0003)

agent1_learned_ppo = PPOPolicy(
    actor=actor,
    critic=critic,
    optim=optim,
    dist_fn=torch.distributions.Categorical,
    action_space=env.action_space,
    deterministic_eval=True,
    action_scaling=False,
)
agent1_learned_ppo.load_state_dict(torch.load(agent1_learned_ppo_path))
agents_learned.append(agent1_learned_ppo)


net2 = Net(state_shape=(22,), hidden_sizes=[128, 256, 256, 128], device=device).to(device)
actor = Actor(preprocess_net=net2, action_shape=env.action_space.shape, device=device).to(device)
critic = Critic(preprocess_net=net2, device=device).to(device)
actor_critic = ActorCritic(actor=actor, critic=critic)
optim = torch.optim.Adam(actor_critic.parameters(), lr=0.0003)

agent2_learned_ppo = PPOPolicy(
    actor=actor,
    critic=critic,
    optim=optim,
    dist_fn=torch.distributions.Categorical,
    action_space=env.action_space,
    deterministic_eval=True,
    action_scaling=False,
)
agent2_learned_ppo.load_state_dict(torch.load(agent2_learned_ppo_path))
agents_learned.append(agent2_learned_ppo)

### 🐟 Evaluate best Qostaushy agent with different policies

In [59]:
PLAYS = {"bastaushy": 0, "qostaushy": 0}
# Step 1: Load the PettingZoo environment
env = env()#render_mode="human")

# Step 2: Wrap the environment for Tianshou interfacing
env = PettingZooEnv(env)

# # Step 3: Define policies for each agent
policies = MultiAgentPolicyManager(policies=[agents_learned[3], agents_learned[4]], env=env)
# # Step 4: Convert the env to vector format
env = DummyVectorEnv([lambda: env])

# # Step 5: Construct the Collector, which interfaces the policies with the vectorised environment
collector = Collector(policies, env)

# # Step 6: Execute the environment with the agents playing for 1 episode, and render a frame every 2 seconds
result = collector.collect(n_episode=1, reset_before_collect=True)
print(PLAYS)

{'bastaushy': 0, 'qostaushy': 1}


### 𓃰 Play results

- *agent4_learned* vs *agent1_learned_ppo*: {'bastaushy': 1, 'qostaushy': 0}

- *agent1_learned* vs *agent1_learned_ppo*: {'bastaushy': 0, 'qostaushy': 1}

- *agent1_learned_ppo* vs *agent1_learned*: {'bastaushy': 1, 'qostaushy': 0}

- *agent2_learned* vs *agent1_learned_ppo*: {'bastaushy': 0, 'qostaushy': 1}

- *agent1_learned_ppo* vs *agent2_learned*: {'bastaushy': 0, 'qostaushy': 1}

- *agent3_learned* vs *agent1_learned_ppo*: {'bastaushy': 0, 'qostaushy': 0}

- *agent1_learned_ppo* vs *agent3_learned*: {'bastaushy': 1, 'qostaushy': 0}

# 🐎 Self-play training

### 🦘 Imports

In [6]:
import random
import copy

### 🦚 Method train agent

In [18]:
def train_agent_ppo(index, agent_learn=None, agent_opponent=None):
    # Before evaluate this cell run the cell with env
    # ======== Step 1: Environment setup =========
    
    train_envs = DummyVectorEnv([_get_env for _ in range(100)])
    test_envs = DummyVectorEnv([_get_env for _ in range(100)])
    
    # seed
    seed=77
    np.random.seed(seed)
    torch.manual_seed(seed)
    train_envs.seed(seed)
    test_envs.seed(seed)
    
    # ======== Step 2: Agent setup =========
    policy, optim, agents = _get_agents_ppo(agent_learn=agent_learn, agent_opponent=agent_opponent)
    
    # ======== Step 3: Collector setup =========
    train_collector = Collector(
        policy=policy,
        env=train_envs,
        buffer=VectorReplayBuffer(100_000, len(train_envs)),
    )
    test_collector = Collector(policy=policy, env=test_envs)
    
    #======== Step 4: Callback functions setup =========
    
    def save_best_fn(policy):
        model_save_path = os.path.join("models", f'policy_ppo_128x256x256x128_{i}.pth')
        os.makedirs(os.path.join("models"), exist_ok=True)
        torch.save(policy.policies[agents[1]].state_dict(), model_save_path)
    
    def stop_fn(mean_rewards):
        return mean_rewards >= 22000
    
    def reward_metric(rews):
        return rews[:, 1]
    
    #======== Step 5: Run the trainer =========
    result = OnpolicyTrainer(
        policy=policy,
        train_collector=train_collector,
        test_collector=test_collector,
        max_epoch=100,
        step_per_epoch=50000,
        repeat_per_collect=10,
        episode_per_test=10,
        batch_size=256,
        step_per_collect=2000,
        reward_metric=reward_metric,
        stop_fn=stop_fn,
        save_best_fn=save_best_fn
    ).run()
    
    # return result, policy.policies[agents[1]]
    print(f"\n==========Result==========\n{result}")
    print("\n(the trained policy can be accessed via policy.policies[agents[1]])")
    return policy.policies[agents[1]]

In [31]:
def train_agent_dqn(index, agent_learn=None, agent_opponent=None):
    # Before evaluate this cell run the cell with env
    # ======== Step 1: Environment setup =========
    train_envs = DummyVectorEnv([_get_env for _ in range(100)])
    test_envs = DummyVectorEnv([_get_env for _ in range(100)])
    
    # seed
    seed=77
    np.random.seed(seed)
    torch.manual_seed(seed)
    train_envs.seed(seed)
    test_envs.seed(seed)
    
    # ======== Step 2: Agent setup =========
    policy, optim, agents = _get_agents_dqn(agent_learn=agent_learn, agent_opponent=agent_opponent)
    
    # # ======== Step 3: Collector setup =========
    train_collector = Collector(
        policy,
        train_envs,
        VectorReplayBuffer(100_000, len(train_envs)),
        exploration_noise=True,
    )
    test_collector = Collector(policy, test_envs, exploration_noise=True)
    # policy.set_eps(1)
    
    # ======== Step 4: Callback functions setup =========
    def save_best_fn(policy):
        model_save_path = os.path.join("models", f'policy_dqn_256x512x512x256_{i}.pth')
        os.makedirs(os.path.join("models"), exist_ok=True)
        torch.save(policy.policies[agents[1]].state_dict(), model_save_path)
    
    def stop_fn(mean_rewards):
        return mean_rewards >= 22000
    
    def train_fn(epoch, env_step):
        policy.policies[agents[1]].set_eps(0.1)
    
    def test_fn(epoch, env_step):
        policy.policies[agents[1]].set_eps(0.05)
    
    def reward_metric(rews):
        return rews[:, 1]
    
    # ======== Step 5: Run the trainer =========
    result = OffpolicyTrainer(
        policy=policy,
        train_collector=train_collector,
        test_collector=test_collector,
        max_epoch=100,
        step_per_epoch=1000,
        step_per_collect=50,
        episode_per_test=10,
        batch_size=256,
        train_fn=train_fn,
        test_fn=test_fn,
        stop_fn=stop_fn,
        save_best_fn=save_best_fn,
        update_per_step=0.1,
        test_in_train=False,
        reward_metric=reward_metric,
        verbose=True
    ).run()
    
    # return result, policy.policies[agents[1]]
    print(f"\n==========Result==========\n{result}")
    print("\n(the trained policy can be accessed via policy.policies[agents[1]])")
    return policy.policies[agents[1]]

### 🐘 Load trained agents

In [38]:
for model in range(1, 21):
    env = _get_env()
    net = Net(
                state_shape=(22,),
                action_shape=env.action_space.shape or env.action_space.n,
                hidden_sizes=[256, 512, 512, 256],
                device="cuda" if torch.cuda.is_available() else "cpu",
            ).to("cuda" if torch.cuda.is_available() else "cpu")
    
    agent_learned = DQNPolicy(
                model=net,
                optim = torch.optim.Adam(net.parameters(), lr=1e-4),
                discount_factor=0.9,
                estimation_step=3,
                target_update_freq=320,
                action_space=env.action_space
            ).to("cuda" if torch.cuda.is_available() else "cpu")
    
    agent_learned.load_state_dict(torch.load(f"models/policy_dqn_256x512x512x256_{model}.pth"))
    agents_learned.append(agent_learned)

### 🐑 Self-play

In [39]:
for i in range(21, 42):
    # random select opponent
    directory = "models"

    # for path, folders, files in os.walk(directory):
    #     for filename in files:
    #         models.append(os.path.join(directory, filename))
    # agent_opponent_random = random.choice(range(len(models)))
    agent_opponent_random = random.choice(agents_learned)
    idx = agents_learned.index(agent_opponent_random)
    
    print(f"Train with agent_opponent #{idx}")

    # training
    if i == 0:
        agent_learned = copy.deepcopy(agent_opponent_random)
    next_agent_learned = train_agent_dqn(index=i, agent_learn=agent_learned, agent_opponent=agent_opponent_random)
    #add new learned agent
    agents_learned.append(next_agent_learned)
    agent_learned = copy.deepcopy(next_agent_learned)

Train with agent_opponent #13


Epoch #1: 1001it [00:02, 368.25it/s, env_step=1000, gradient_step=100, len=0, n/ep=0, n/st=100, rew=0.00]                                                                                         


Epoch #1: test_reward: 11676.900000 ± 4056.994859, best_reward: 11676.900000 ± 4056.994859 in #1


Epoch #2: 1001it [00:02, 383.02it/s, env_step=2000, gradient_step=200, len=0, n/ep=0, n/st=100, rew=0.00]                                                                                         


Epoch #2: test_reward: 15168.900000 ± 4369.954999, best_reward: 15168.900000 ± 4369.954999 in #2


Epoch #3: 1001it [00:02, 363.54it/s, env_step=3000, gradient_step=300, len=0, n/ep=0, n/st=100, rew=0.00]                                                                                         


Epoch #3: test_reward: 16521.500000 ± 4161.997339, best_reward: 16521.500000 ± 4161.997339 in #3


Epoch #4: 1001it [00:02, 376.63it/s, env_step=4000, gradient_step=400, len=0, n/ep=0, n/st=100, rew=0.00]                                                                                         


Epoch #4: test_reward: 16592.600000 ± 5452.526959, best_reward: 16592.600000 ± 5452.526959 in #4


Epoch #5: 1001it [00:02, 384.06it/s, env_step=5000, gradient_step=500, len=0, n/ep=0, n/st=100, rew=0.00]                                                                                         


Epoch #5: test_reward: 13371.800000 ± 3209.378096, best_reward: 16592.600000 ± 5452.526959 in #4


Epoch #6: 1001it [00:02, 379.09it/s, env_step=6000, gradient_step=600, len=54, n/ep=0, n/st=100, rew=1666.00]                                                                                     


Epoch #6: test_reward: 13322.400000 ± 5738.371933, best_reward: 16592.600000 ± 5452.526959 in #4


Epoch #7: 1001it [00:02, 364.25it/s, env_step=7000, gradient_step=700, len=70, n/ep=2, n/st=100, rew=2707.00]                                                                                     


Epoch #7: test_reward: 5483.600000 ± 4301.279651, best_reward: 16592.600000 ± 5452.526959 in #4


Epoch #8: 1001it [00:02, 352.14it/s, env_step=8000, gradient_step=800, len=72, n/ep=0, n/st=100, rew=2251.00]                                                                                     


Epoch #8: test_reward: 11904.200000 ± 4059.538614, best_reward: 16592.600000 ± 5452.526959 in #4


Epoch #9: 1001it [00:02, 349.89it/s, env_step=9000, gradient_step=900, len=88, n/ep=0, n/st=100, rew=3396.00]                                                                                     


Epoch #9: test_reward: 8309.000000 ± 1749.831820, best_reward: 16592.600000 ± 5452.526959 in #4


Epoch #10: 1001it [00:03, 310.74it/s, env_step=10000, gradient_step=1000, len=100, n/ep=1, n/st=100, rew=4161.00]                                                                                 


Epoch #10: test_reward: 15260.400000 ± 4910.693601, best_reward: 16592.600000 ± 5452.526959 in #4


Epoch #11: 1001it [00:02, 339.38it/s, env_step=11000, gradient_step=1100, len=108, n/ep=0, n/st=100, rew=3581.00]                                                                                 


Epoch #11: test_reward: 14054.300000 ± 5184.439894, best_reward: 16592.600000 ± 5452.526959 in #4


Epoch #12: 1001it [00:02, 346.57it/s, env_step=12000, gradient_step=1200, len=108, n/ep=0, n/st=100, rew=3581.00]                                                                                 


Epoch #12: test_reward: 10264.600000 ± 4118.174940, best_reward: 16592.600000 ± 5452.526959 in #4


Epoch #13: 1001it [00:02, 342.87it/s, env_step=13000, gradient_step=1300, len=130, n/ep=1, n/st=100, rew=5725.00]                                                                                 


Epoch #13: test_reward: 7426.800000 ± 2294.009102, best_reward: 16592.600000 ± 5452.526959 in #4


Epoch #14: 1001it [00:02, 371.40it/s, env_step=14000, gradient_step=1400, len=140, n/ep=1, n/st=100, rew=6485.00]                                                                                 


Epoch #14: test_reward: 10497.600000 ± 4588.148869, best_reward: 16592.600000 ± 5452.526959 in #4


Epoch #15: 1001it [00:02, 344.66it/s, env_step=15000, gradient_step=1500, len=146, n/ep=0, n/st=100, rew=7020.00]                                                                                 


Epoch #15: test_reward: 11693.200000 ± 5689.467054, best_reward: 16592.600000 ± 5452.526959 in #4


Epoch #16: 1001it [00:02, 384.46it/s, env_step=16000, gradient_step=1600, len=160, n/ep=1, n/st=100, rew=6868.00]                                                                                 


Epoch #16: test_reward: 12236.400000 ± 4199.251152, best_reward: 16592.600000 ± 5452.526959 in #4


Epoch #17: 1001it [00:02, 359.45it/s, env_step=17000, gradient_step=1700, len=169, n/ep=0, n/st=100, rew=9342.00]                                                                                 


Epoch #17: test_reward: 16963.200000 ± 3434.964273, best_reward: 16963.200000 ± 3434.964273 in #17


Epoch #18: 1001it [00:03, 306.00it/s, env_step=18000, gradient_step=1800, len=180, n/ep=4, n/st=100, rew=9263.12]                                                                                 


Epoch #18: test_reward: 7767.800000 ± 1569.293459, best_reward: 16963.200000 ± 3434.964273 in #17


Epoch #19: 1001it [00:03, 317.70it/s, env_step=19000, gradient_step=1900, len=190, n/ep=1, n/st=100, rew=10714.00]                                                                                


Epoch #19: test_reward: 13175.300000 ± 4007.722996, best_reward: 16963.200000 ± 3434.964273 in #17


Epoch #20: 1001it [00:03, 330.44it/s, env_step=20000, gradient_step=2000, len=199, n/ep=0, n/st=100, rew=9682.00]                                                                                 


Epoch #20: test_reward: 8962.800000 ± 3553.012491, best_reward: 16963.200000 ± 3434.964273 in #17


Epoch #21: 1001it [00:03, 320.09it/s, env_step=21000, gradient_step=2100, len=207, n/ep=0, n/st=100, rew=10900.00]                                                                                


Epoch #21: test_reward: 7290.000000 ± 3196.894243, best_reward: 16963.200000 ± 3434.964273 in #17


Epoch #22: 1001it [00:02, 338.31it/s, env_step=22000, gradient_step=2200, len=220, n/ep=1, n/st=100, rew=12007.00]                                                                                


Epoch #22: test_reward: 16176.900000 ± 4036.548586, best_reward: 16963.200000 ± 3434.964273 in #17


Epoch #23: 1001it [00:02, 349.29it/s, env_step=23000, gradient_step=2300, len=229, n/ep=0, n/st=100, rew=12884.00]                                                                                


Epoch #23: test_reward: 8928.800000 ± 2596.243317, best_reward: 16963.200000 ± 3434.964273 in #17


Epoch #24: 1001it [00:03, 315.72it/s, env_step=24000, gradient_step=2400, len=229, n/ep=0, n/st=100, rew=12884.00]                                                                                


Epoch #24: test_reward: 13591.800000 ± 6464.629932, best_reward: 16963.200000 ± 3434.964273 in #17


Epoch #25: 1001it [00:03, 329.79it/s, env_step=25000, gradient_step=2500, len=179, n/ep=0, n/st=100, rew=8549.00]                                                                                 


Epoch #25: test_reward: 10424.700000 ± 7133.048353, best_reward: 16963.200000 ± 3434.964273 in #17


Epoch #26: 1001it [00:02, 343.79it/s, env_step=26000, gradient_step=2600, len=260, n/ep=3, n/st=100, rew=15532.83]                                                                                


Epoch #26: test_reward: 13352.900000 ± 5243.633482, best_reward: 16963.200000 ± 3434.964273 in #17


Epoch #27: 1001it [00:02, 339.02it/s, env_step=27000, gradient_step=2700, len=270, n/ep=1, n/st=100, rew=15255.00]                                                                                


Epoch #27: test_reward: 7588.600000 ± 5354.144529, best_reward: 16963.200000 ± 3434.964273 in #17


Epoch #28: 1001it [00:03, 308.06it/s, env_step=28000, gradient_step=2800, len=280, n/ep=1, n/st=100, rew=15969.00]                                                                                


Epoch #28: test_reward: 11699.400000 ± 4159.389984, best_reward: 16963.200000 ± 3434.964273 in #17


Epoch #29: 1001it [00:03, 321.46it/s, env_step=29000, gradient_step=2900, len=290, n/ep=1, n/st=100, rew=16754.50]                                                                                


Epoch #29: test_reward: 7987.700000 ± 2310.245271, best_reward: 16963.200000 ± 3434.964273 in #17


Epoch #30: 1001it [00:02, 334.84it/s, env_step=30000, gradient_step=3000, len=245, n/ep=0, n/st=100, rew=13245.25]                                                                                


Epoch #30: test_reward: 14862.200000 ± 5039.085488, best_reward: 16963.200000 ± 3434.964273 in #17


Epoch #31: 1001it [00:03, 327.55it/s, env_step=31000, gradient_step=3100, len=138, n/ep=2, n/st=100, rew=7415.00]                                                                                 


Epoch #31: test_reward: 14185.700000 ± 3928.109368, best_reward: 16963.200000 ± 3434.964273 in #17


Epoch #32: 1001it [00:03, 301.51it/s, env_step=32000, gradient_step=3200, len=318, n/ep=0, n/st=100, rew=17867.00]                                                                                


Epoch #32: test_reward: 7340.600000 ± 3878.592430, best_reward: 16963.200000 ± 3434.964273 in #17


Epoch #33: 1001it [00:03, 301.93it/s, env_step=33000, gradient_step=3300, len=140, n/ep=1, n/st=100, rew=7886.50]                                                                                 


Epoch #33: test_reward: 13382.500000 ± 3588.993654, best_reward: 16963.200000 ± 3434.964273 in #17


Epoch #34: 1001it [00:02, 364.19it/s, env_step=34000, gradient_step=3400, len=208, n/ep=2, n/st=100, rew=11266.00]                                                                                


Epoch #34: test_reward: 12537.900000 ± 4234.269298, best_reward: 16963.200000 ± 3434.964273 in #17


Epoch #35: 1001it [00:03, 323.78it/s, env_step=35000, gradient_step=3500, len=165, n/ep=0, n/st=100, rew=9364.50]                                                                                 


Epoch #35: test_reward: 13663.500000 ± 4627.810157, best_reward: 16963.200000 ± 3434.964273 in #17


Epoch #36: 1001it [00:03, 303.19it/s, env_step=36000, gradient_step=3600, len=119, n/ep=1, n/st=100, rew=5551.00]                                                                                 


Epoch #36: test_reward: 13856.400000 ± 3669.055988, best_reward: 16963.200000 ± 3434.964273 in #17


Epoch #37: 1001it [00:03, 307.83it/s, env_step=37000, gradient_step=3700, len=171, n/ep=1, n/st=100, rew=9124.50]                                                                                 


Epoch #37: test_reward: 9371.600000 ± 2557.316257, best_reward: 16963.200000 ± 3434.964273 in #17


Epoch #38: 1001it [00:02, 346.34it/s, env_step=38000, gradient_step=3800, len=212, n/ep=0, n/st=100, rew=12086.83]                                                                                


Epoch #38: test_reward: 12469.000000 ± 4670.926953, best_reward: 16963.200000 ± 3434.964273 in #17


Epoch #39: 1001it [00:03, 313.91it/s, env_step=39000, gradient_step=3900, len=171, n/ep=0, n/st=100, rew=9697.75]                                                                                 


Epoch #39: test_reward: 12409.500000 ± 4764.767974, best_reward: 16963.200000 ± 3434.964273 in #17


Epoch #40: 1001it [00:03, 327.30it/s, env_step=40000, gradient_step=4000, len=255, n/ep=1, n/st=100, rew=14726.00]                                                                                


Epoch #40: test_reward: 20716.500000 ± 3241.530510, best_reward: 20716.500000 ± 3241.530510 in #40


Epoch #41: 1001it [00:03, 313.03it/s, env_step=41000, gradient_step=4100, len=93, n/ep=1, n/st=100, rew=4219.50]                                                                                  


Epoch #41: test_reward: 7054.100000 ± 1594.810487, best_reward: 20716.500000 ± 3241.530510 in #40


Epoch #42: 1001it [00:02, 361.72it/s, env_step=42000, gradient_step=4200, len=133, n/ep=0, n/st=100, rew=7111.50]                                                                                 


Epoch #42: test_reward: 14362.300000 ± 3875.315421, best_reward: 20716.500000 ± 3241.530510 in #40


Epoch #43: 1001it [00:03, 314.33it/s, env_step=43000, gradient_step=4300, len=147, n/ep=0, n/st=100, rew=8076.67]                                                                                 


Epoch #43: test_reward: 10326.000000 ± 2396.364580, best_reward: 20716.500000 ± 3241.530510 in #40


Epoch #44: 1001it [00:03, 327.12it/s, env_step=44000, gradient_step=4400, len=154, n/ep=1, n/st=100, rew=8613.00]                                                                                 


Epoch #44: test_reward: 16662.600000 ± 6310.720279, best_reward: 20716.500000 ± 3241.530510 in #40


Epoch #45: 1001it [00:03, 312.48it/s, env_step=45000, gradient_step=4500, len=188, n/ep=0, n/st=100, rew=9819.00]                                                                                 


Epoch #45: test_reward: 15578.000000 ± 4180.371515, best_reward: 20716.500000 ± 3241.530510 in #40


Epoch #46: 1001it [00:03, 332.40it/s, env_step=46000, gradient_step=4600, len=151, n/ep=2, n/st=100, rew=7921.50]                                                                                 


Epoch #46: test_reward: 12006.300000 ± 5744.121483, best_reward: 20716.500000 ± 3241.530510 in #40


Epoch #47: 1001it [00:03, 321.45it/s, env_step=47000, gradient_step=4700, len=69, n/ep=0, n/st=100, rew=2962.00]                                                                                  


Epoch #47: test_reward: 6699.600000 ± 5166.893829, best_reward: 20716.500000 ± 3241.530510 in #40


Epoch #48: 1001it [00:03, 314.86it/s, env_step=48000, gradient_step=4800, len=184, n/ep=2, n/st=100, rew=9631.00]                                                                                 


Epoch #48: test_reward: 9384.900000 ± 6434.792653, best_reward: 20716.500000 ± 3241.530510 in #40


Epoch #49: 1001it [00:03, 310.21it/s, env_step=49000, gradient_step=4900, len=153, n/ep=0, n/st=100, rew=8169.50]                                                                                 


Epoch #49: test_reward: 9123.400000 ± 4833.217235, best_reward: 20716.500000 ± 3241.530510 in #40


Epoch #50: 1001it [00:02, 336.87it/s, env_step=50000, gradient_step=5000, len=182, n/ep=1, n/st=100, rew=11049.00]                                                                                


Epoch #50: test_reward: 14216.900000 ± 4620.748153, best_reward: 20716.500000 ± 3241.530510 in #40


Epoch #51: 1001it [00:03, 332.57it/s, env_step=51000, gradient_step=5100, len=145, n/ep=1, n/st=100, rew=8113.50]                                                                                 


Epoch #51: test_reward: 11341.200000 ± 5235.295174, best_reward: 20716.500000 ± 3241.530510 in #40


Epoch #52: 1001it [00:03, 298.11it/s, env_step=52000, gradient_step=5200, len=179, n/ep=1, n/st=100, rew=9432.00]                                                                                 


Epoch #52: test_reward: 14481.700000 ± 4735.482405, best_reward: 20716.500000 ± 3241.530510 in #40


Epoch #53: 1001it [00:03, 330.24it/s, env_step=53000, gradient_step=5300, len=187, n/ep=0, n/st=100, rew=11285.50]                                                                                


Epoch #53: test_reward: 13981.800000 ± 6269.079706, best_reward: 20716.500000 ± 3241.530510 in #40


Epoch #54: 1001it [00:03, 328.15it/s, env_step=54000, gradient_step=5400, len=133, n/ep=1, n/st=100, rew=6697.00]                                                                                 


Epoch #54: test_reward: 13961.700000 ± 4149.187343, best_reward: 20716.500000 ± 3241.530510 in #40


Epoch #55: 1001it [00:03, 319.49it/s, env_step=55000, gradient_step=5500, len=89, n/ep=0, n/st=100, rew=4536.00]                                                                                  


Epoch #55: test_reward: 15444.400000 ± 6504.762412, best_reward: 20716.500000 ± 3241.530510 in #40


Epoch #56: 1001it [00:02, 340.59it/s, env_step=56000, gradient_step=5600, len=215, n/ep=0, n/st=100, rew=12457.00]                                                                                


Epoch #56: test_reward: 12850.600000 ± 4655.373502, best_reward: 20716.500000 ± 3241.530510 in #40


Epoch #57: 1001it [00:03, 293.36it/s, env_step=57000, gradient_step=5700, len=192, n/ep=0, n/st=100, rew=9921.50]                                                                                 


Epoch #57: test_reward: 13596.800000 ± 5126.661678, best_reward: 20716.500000 ± 3241.530510 in #40


Epoch #58: 1001it [00:03, 290.36it/s, env_step=58000, gradient_step=5800, len=141, n/ep=1, n/st=100, rew=6988.00]                                                                                 


Epoch #58: test_reward: 7225.700000 ± 3414.557074, best_reward: 20716.500000 ± 3241.530510 in #40


Epoch #59: 1001it [00:03, 313.18it/s, env_step=59000, gradient_step=5900, len=236, n/ep=0, n/st=100, rew=13262.50]                                                                                


Epoch #59: test_reward: 7553.300000 ± 2213.419529, best_reward: 20716.500000 ± 3241.530510 in #40


Epoch #60: 1001it [00:02, 338.48it/s, env_step=60000, gradient_step=6000, len=198, n/ep=1, n/st=100, rew=11357.00]                                                                                


Epoch #60: test_reward: 14954.400000 ± 5282.746543, best_reward: 20716.500000 ± 3241.530510 in #40


Epoch #61: 1001it [00:03, 330.81it/s, env_step=61000, gradient_step=6100, len=170, n/ep=1, n/st=100, rew=9703.00]                                                                                 


Epoch #61: test_reward: 2945.400000 ± 2235.042738, best_reward: 20716.500000 ± 3241.530510 in #40


Epoch #62: 1001it [00:03, 299.17it/s, env_step=62000, gradient_step=6200, len=220, n/ep=1, n/st=100, rew=13363.50]                                                                                


Epoch #62: test_reward: 7786.000000 ± 4099.685183, best_reward: 20716.500000 ± 3241.530510 in #40


Epoch #63: 1001it [00:03, 313.95it/s, env_step=63000, gradient_step=6300, len=165, n/ep=0, n/st=100, rew=10061.00]                                                                                


Epoch #63: test_reward: 13253.900000 ± 2391.886304, best_reward: 20716.500000 ± 3241.530510 in #40


Epoch #64: 1001it [00:03, 308.58it/s, env_step=64000, gradient_step=6400, len=209, n/ep=0, n/st=100, rew=11985.50]                                                                                


Epoch #64: test_reward: 10589.000000 ± 4876.092308, best_reward: 20716.500000 ± 3241.530510 in #40


Epoch #65: 1001it [00:03, 320.31it/s, env_step=65000, gradient_step=6500, len=192, n/ep=0, n/st=100, rew=10170.00]                                                                                


Epoch #65: test_reward: 8999.600000 ± 3385.176368, best_reward: 20716.500000 ± 3241.530510 in #40


Epoch #66: 1001it [00:03, 321.79it/s, env_step=66000, gradient_step=6600, len=183, n/ep=1, n/st=100, rew=10566.50]                                                                                


Epoch #66: test_reward: 6273.300000 ± 4724.290085, best_reward: 20716.500000 ± 3241.530510 in #40


Epoch #67: 1001it [00:03, 292.78it/s, env_step=67000, gradient_step=6700, len=55, n/ep=0, n/st=100, rew=2239.00]                                                                                  


Epoch #67: test_reward: 11376.500000 ± 3535.469877, best_reward: 20716.500000 ± 3241.530510 in #40


Epoch #68: 1001it [00:03, 307.00it/s, env_step=68000, gradient_step=6800, len=197, n/ep=0, n/st=100, rew=11219.75]                                                                                


Epoch #68: test_reward: 15904.500000 ± 3701.952005, best_reward: 20716.500000 ± 3241.530510 in #40


Epoch #69: 1001it [00:03, 300.63it/s, env_step=69000, gradient_step=6900, len=231, n/ep=0, n/st=100, rew=12750.50]                                                                                


Epoch #69: test_reward: 9863.400000 ± 3315.980826, best_reward: 20716.500000 ± 3241.530510 in #40


Epoch #70: 1001it [00:03, 308.65it/s, env_step=70000, gradient_step=7000, len=95, n/ep=0, n/st=100, rew=4459.00]                                                                                  


Epoch #70: test_reward: 17175.200000 ± 5578.267989, best_reward: 20716.500000 ± 3241.530510 in #40


Epoch #71: 1001it [00:03, 314.39it/s, env_step=71000, gradient_step=7100, len=168, n/ep=1, n/st=100, rew=9091.00]                                                                                 


Epoch #71: test_reward: 8363.100000 ± 3676.328235, best_reward: 20716.500000 ± 3241.530510 in #40


Epoch #72: 1001it [00:03, 330.88it/s, env_step=72000, gradient_step=7200, len=291, n/ep=1, n/st=100, rew=17422.00]                                                                                


Epoch #72: test_reward: 13203.000000 ± 4149.513056, best_reward: 20716.500000 ± 3241.530510 in #40


Epoch #73: 1001it [00:03, 326.71it/s, env_step=73000, gradient_step=7300, len=198, n/ep=0, n/st=100, rew=11788.50]                                                                                


Epoch #73: test_reward: 8580.400000 ± 2071.920230, best_reward: 20716.500000 ± 3241.530510 in #40


Epoch #74: 1001it [00:03, 317.52it/s, env_step=74000, gradient_step=7400, len=160, n/ep=1, n/st=100, rew=9706.00]                                                                                 


Epoch #74: test_reward: 13327.000000 ± 5548.683393, best_reward: 20716.500000 ± 3241.530510 in #40


Epoch #75: 1001it [00:03, 306.98it/s, env_step=75000, gradient_step=7500, len=61, n/ep=0, n/st=100, rew=2617.00]                                                                                  


Epoch #75: test_reward: 6799.300000 ± 3205.212444, best_reward: 20716.500000 ± 3241.530510 in #40


Epoch #76: 1001it [00:03, 298.81it/s, env_step=76000, gradient_step=7600, len=282, n/ep=0, n/st=100, rew=16206.00]                                                                                


Epoch #76: test_reward: 13498.500000 ± 8351.995609, best_reward: 20716.500000 ± 3241.530510 in #40


Epoch #77: 1001it [00:03, 307.81it/s, env_step=77000, gradient_step=7700, len=148, n/ep=0, n/st=100, rew=8588.00]                                                                                 


Epoch #77: test_reward: 12057.100000 ± 3931.669123, best_reward: 20716.500000 ± 3241.530510 in #40


Epoch #78: 1001it [00:03, 329.52it/s, env_step=78000, gradient_step=7800, len=178, n/ep=0, n/st=100, rew=10520.25]                                                                                


Epoch #78: test_reward: 12261.900000 ± 4681.463456, best_reward: 20716.500000 ± 3241.530510 in #40


Epoch #79: 1001it [00:02, 335.10it/s, env_step=79000, gradient_step=7900, len=158, n/ep=1, n/st=100, rew=8563.00]                                                                                 


Epoch #79: test_reward: 9581.800000 ± 3777.421390, best_reward: 20716.500000 ± 3241.530510 in #40


Epoch #80: 1001it [00:03, 305.68it/s, env_step=80000, gradient_step=8000, len=132, n/ep=0, n/st=100, rew=7250.00]                                                                                 


Epoch #80: test_reward: 10424.400000 ± 3649.327697, best_reward: 20716.500000 ± 3241.530510 in #40


Epoch #81: 1001it [00:03, 300.77it/s, env_step=81000, gradient_step=8100, len=263, n/ep=0, n/st=100, rew=15195.50]                                                                                


Epoch #81: test_reward: 13444.000000 ± 5991.567207, best_reward: 20716.500000 ± 3241.530510 in #40


Epoch #82: 1001it [00:03, 315.05it/s, env_step=82000, gradient_step=8200, len=237, n/ep=0, n/st=100, rew=13830.75]                                                                                


Epoch #82: test_reward: 13595.400000 ± 3055.383812, best_reward: 20716.500000 ± 3241.530510 in #40


Epoch #83: 1001it [00:03, 313.62it/s, env_step=83000, gradient_step=8300, len=154, n/ep=0, n/st=100, rew=8775.00]                                                                                 


Epoch #83: test_reward: 9582.600000 ± 5123.673549, best_reward: 20716.500000 ± 3241.530510 in #40


Epoch #84: 1001it [00:03, 324.51it/s, env_step=84000, gradient_step=8400, len=250, n/ep=0, n/st=100, rew=14681.00]                                                                                


Epoch #84: test_reward: 6605.600000 ± 2112.485418, best_reward: 20716.500000 ± 3241.530510 in #40


Epoch #85: 1001it [00:03, 324.25it/s, env_step=85000, gradient_step=8500, len=160, n/ep=0, n/st=100, rew=8537.50]                                                                                 


Epoch #85: test_reward: 12546.900000 ± 4858.655276, best_reward: 20716.500000 ± 3241.530510 in #40


Epoch #86: 1001it [00:03, 291.67it/s, env_step=86000, gradient_step=8600, len=306, n/ep=0, n/st=100, rew=18361.50]                                                                                


Epoch #86: test_reward: 12583.000000 ± 4060.268809, best_reward: 20716.500000 ± 3241.530510 in #40


Epoch #87: 1001it [00:03, 296.89it/s, env_step=87000, gradient_step=8700, len=223, n/ep=1, n/st=100, rew=12949.00]                                                                                


Epoch #87: test_reward: 13252.800000 ± 6771.199315, best_reward: 20716.500000 ± 3241.530510 in #40


Epoch #88: 1001it [00:03, 317.42it/s, env_step=88000, gradient_step=8800, len=149, n/ep=0, n/st=100, rew=7968.00]                                                                                 


Epoch #88: test_reward: 14868.600000 ± 4975.988047, best_reward: 20716.500000 ± 3241.530510 in #40


Epoch #89: 1001it [00:03, 299.64it/s, env_step=89000, gradient_step=8900, len=230, n/ep=0, n/st=100, rew=13885.50]                                                                                


Epoch #89: test_reward: 15077.000000 ± 6348.083301, best_reward: 20716.500000 ± 3241.530510 in #40


Epoch #90: 1001it [00:03, 284.83it/s, env_step=90000, gradient_step=9000, len=230, n/ep=0, n/st=100, rew=13885.50]                                                                                


Epoch #90: test_reward: 6363.200000 ± 1718.986492, best_reward: 20716.500000 ± 3241.530510 in #40


Epoch #91: 1001it [00:03, 297.51it/s, env_step=91000, gradient_step=9100, len=301, n/ep=0, n/st=100, rew=18333.00]                                                                                


Epoch #91: test_reward: 8861.000000 ± 4044.942052, best_reward: 20716.500000 ± 3241.530510 in #40


Epoch #92: 1001it [00:03, 323.43it/s, env_step=92000, gradient_step=9200, len=208, n/ep=0, n/st=100, rew=12616.00]                                                                                


Epoch #92: test_reward: 7542.200000 ± 5831.378942, best_reward: 20716.500000 ± 3241.530510 in #40


Epoch #93: 1001it [00:03, 302.62it/s, env_step=93000, gradient_step=9300, len=180, n/ep=0, n/st=100, rew=10097.00]                                                                                


Epoch #93: test_reward: 8555.400000 ± 3016.057798, best_reward: 20716.500000 ± 3241.530510 in #40


Epoch #94: 1001it [00:03, 314.54it/s, env_step=94000, gradient_step=9400, len=295, n/ep=2, n/st=100, rew=19288.25]                                                                                


Epoch #94: test_reward: 10824.100000 ± 3231.203257, best_reward: 20716.500000 ± 3241.530510 in #40


Epoch #95: 1001it [00:03, 310.17it/s, env_step=95000, gradient_step=9500, len=198, n/ep=1, n/st=100, rew=11289.00]                                                                                


Epoch #95: test_reward: 9767.000000 ± 4495.168718, best_reward: 20716.500000 ± 3241.530510 in #40


Epoch #96: 1001it [00:03, 327.81it/s, env_step=96000, gradient_step=9600, len=169, n/ep=0, n/st=100, rew=9460.00]                                                                                 


Epoch #96: test_reward: 9826.600000 ± 4987.316557, best_reward: 20716.500000 ± 3241.530510 in #40


Epoch #97: 1001it [00:03, 296.32it/s, env_step=97000, gradient_step=9700, len=162, n/ep=1, n/st=100, rew=8979.50]                                                                                 


Epoch #97: test_reward: 13765.800000 ± 5134.161077, best_reward: 20716.500000 ± 3241.530510 in #40


Epoch #98: 1001it [00:03, 330.66it/s, env_step=98000, gradient_step=9800, len=211, n/ep=1, n/st=100, rew=12604.00]                                                                                


Epoch #98: test_reward: 9196.000000 ± 5541.939083, best_reward: 20716.500000 ± 3241.530510 in #40


Epoch #99: 1001it [00:03, 298.74it/s, env_step=99000, gradient_step=9900, len=155, n/ep=1, n/st=100, rew=8154.00]                                                                                 


Epoch #99: test_reward: 11212.100000 ± 5099.730982, best_reward: 20716.500000 ± 3241.530510 in #40


Epoch #100: 1001it [00:03, 310.36it/s, env_step=100000, gradient_step=10000, len=138, n/ep=2, n/st=100, rew=7357.00]                                                                              


Epoch #100: test_reward: 13116.700000 ± 6260.565311, best_reward: 20716.500000 ± 3241.530510 in #40

InfoStats(gradient_step=10000, best_reward=20716.5, best_reward_std=3241.530510422507, train_step=100000, train_episode=476, test_step=234552, test_episode=1010, timing=TimingStats(total_time=525.1878530979156, train_time=311.2124605178833, train_time_collect=46.899511098861694, train_time_update=257.98715353012085, test_time=213.97539258003235, update_speed=321.32389504453556))

(the trained policy can be accessed via policy.policies[agents[1]])
Train with agent_opponent #19


Epoch #1: 1001it [00:02, 346.88it/s, env_step=1000, gradient_step=100, len=0, n/ep=0, n/st=100, rew=0.00]                                                                                         


Epoch #1: test_reward: 14202.500000 ± 7640.739850, best_reward: 14202.500000 ± 7640.739850 in #1


Epoch #2: 1001it [00:02, 337.71it/s, env_step=2000, gradient_step=200, len=0, n/ep=0, n/st=100, rew=0.00]                                                                                         


Epoch #2: test_reward: 13765.200000 ± 5229.909632, best_reward: 14202.500000 ± 7640.739850 in #1


Epoch #3: 1001it [00:03, 329.84it/s, env_step=3000, gradient_step=300, len=0, n/ep=0, n/st=100, rew=0.00]                                                                                         


Epoch #3: test_reward: 17554.500000 ± 5526.301480, best_reward: 17554.500000 ± 5526.301480 in #3


Epoch #4: 1001it [00:02, 346.15it/s, env_step=4000, gradient_step=400, len=0, n/ep=0, n/st=100, rew=0.00]                                                                                         


Epoch #4: test_reward: 17467.900000 ± 1577.820234, best_reward: 17554.500000 ± 5526.301480 in #3


Epoch #5: 1001it [00:02, 344.45it/s, env_step=5000, gradient_step=500, len=0, n/ep=0, n/st=100, rew=0.00]                                                                                         


Epoch #5: test_reward: 14381.100000 ± 6138.808882, best_reward: 17554.500000 ± 5526.301480 in #3


Epoch #6: 1001it [00:03, 307.32it/s, env_step=6000, gradient_step=600, len=0, n/ep=0, n/st=100, rew=0.00]                                                                                         


Epoch #6: test_reward: 12520.600000 ± 7578.507877, best_reward: 17554.500000 ± 5526.301480 in #3


Epoch #7: 1001it [00:03, 310.09it/s, env_step=7000, gradient_step=700, len=0, n/ep=0, n/st=100, rew=0.00]                                                                                         


Epoch #7: test_reward: 15187.800000 ± 6876.230491, best_reward: 17554.500000 ± 5526.301480 in #3


Epoch #8: 1001it [00:02, 348.73it/s, env_step=8000, gradient_step=800, len=0, n/ep=0, n/st=100, rew=0.00]                                                                                         


Epoch #8: test_reward: 4788.900000 ± 7659.849743, best_reward: 17554.500000 ± 5526.301480 in #3


Epoch #9: 1001it [00:03, 321.15it/s, env_step=9000, gradient_step=900, len=90, n/ep=2, n/st=100, rew=2374.50]                                                                                     


Epoch #9: test_reward: 11374.000000 ± 3509.956239, best_reward: 17554.500000 ± 5526.301480 in #3


Epoch #10: 1001it [00:02, 339.28it/s, env_step=10000, gradient_step=1000, len=100, n/ep=1, n/st=100, rew=3173.00]                                                                                 


Epoch #10: test_reward: 14779.900000 ± 4983.750805, best_reward: 17554.500000 ± 5526.301480 in #3


Epoch #11: 1001it [00:03, 325.26it/s, env_step=11000, gradient_step=1100, len=100, n/ep=0, n/st=100, rew=3173.00]                                                                                 


Epoch #11: test_reward: 11233.800000 ± 3608.427020, best_reward: 17554.500000 ± 5526.301480 in #3


Epoch #12: 1001it [00:03, 327.88it/s, env_step=12000, gradient_step=1200, len=115, n/ep=0, n/st=100, rew=3946.00]                                                                                 


Epoch #12: test_reward: 15641.900000 ± 5568.439897, best_reward: 17554.500000 ± 5526.301480 in #3


Epoch #13: 1001it [00:03, 312.83it/s, env_step=13000, gradient_step=1300, len=130, n/ep=1, n/st=100, rew=4332.00]                                                                                 


Epoch #13: test_reward: 15626.100000 ± 3976.875066, best_reward: 17554.500000 ± 5526.301480 in #3


Epoch #14: 1001it [00:03, 288.83it/s, env_step=14000, gradient_step=1400, len=130, n/ep=0, n/st=100, rew=4332.00]                                                                                 


Epoch #14: test_reward: 13591.900000 ± 4683.612227, best_reward: 17554.500000 ± 5526.301480 in #3


Epoch #15: 1001it [00:02, 344.08it/s, env_step=15000, gradient_step=1500, len=148, n/ep=0, n/st=100, rew=4515.00]                                                                                 


Epoch #15: test_reward: 13998.500000 ± 3424.637010, best_reward: 17554.500000 ± 5526.301480 in #3


Epoch #16: 1001it [00:03, 307.24it/s, env_step=16000, gradient_step=1600, len=160, n/ep=1, n/st=100, rew=5963.00]                                                                                 


Epoch #16: test_reward: 12651.400000 ± 4201.247439, best_reward: 17554.500000 ± 5526.301480 in #3


Epoch #17: 1001it [00:03, 314.21it/s, env_step=17000, gradient_step=1700, len=168, n/ep=0, n/st=100, rew=6279.00]                                                                                 


Epoch #17: test_reward: 11497.700000 ± 5369.516702, best_reward: 17554.500000 ± 5526.301480 in #3


Epoch #18: 1001it [00:03, 302.47it/s, env_step=18000, gradient_step=1800, len=180, n/ep=1, n/st=100, rew=7033.50]                                                                                 


Epoch #18: test_reward: 13092.200000 ± 2918.642863, best_reward: 17554.500000 ± 5526.301480 in #3


Epoch #19: 1001it [00:03, 316.43it/s, env_step=19000, gradient_step=1900, len=190, n/ep=1, n/st=100, rew=7723.00]                                                                                 


Epoch #19: test_reward: 6182.700000 ± 3226.570596, best_reward: 17554.500000 ± 5526.301480 in #3


Epoch #20: 1001it [00:03, 310.93it/s, env_step=20000, gradient_step=2000, len=200, n/ep=2, n/st=100, rew=8485.25]                                                                                 


Epoch #20: test_reward: 14437.800000 ± 5754.458042, best_reward: 17554.500000 ± 5526.301480 in #3


Epoch #21: 1001it [00:03, 301.17it/s, env_step=21000, gradient_step=2100, len=210, n/ep=2, n/st=100, rew=7365.25]                                                                                 


Epoch #21: test_reward: 14145.000000 ± 5678.242070, best_reward: 17554.500000 ± 5526.301480 in #3


Epoch #22: 1001it [00:03, 332.79it/s, env_step=22000, gradient_step=2200, len=198, n/ep=4, n/st=100, rew=7688.75]                                                                                 


Epoch #22: test_reward: 11154.000000 ± 3333.200504, best_reward: 17554.500000 ± 5526.301480 in #3


Epoch #23: 1001it [00:03, 314.88it/s, env_step=23000, gradient_step=2300, len=185, n/ep=2, n/st=100, rew=6902.50]                                                                                 


Epoch #23: test_reward: 13493.500000 ± 4395.625627, best_reward: 17554.500000 ± 5526.301480 in #3


Epoch #24: 1001it [00:03, 320.48it/s, env_step=24000, gradient_step=2400, len=155, n/ep=0, n/st=100, rew=6582.00]                                                                                 


Epoch #24: test_reward: 14380.000000 ± 3333.630633, best_reward: 17554.500000 ± 5526.301480 in #3


Epoch #25: 1001it [00:03, 307.87it/s, env_step=25000, gradient_step=2500, len=196, n/ep=0, n/st=100, rew=8172.00]                                                                                 


Epoch #25: test_reward: 14958.500000 ± 6775.591358, best_reward: 17554.500000 ± 5526.301480 in #3


Epoch #26: 1001it [00:03, 286.15it/s, env_step=26000, gradient_step=2600, len=260, n/ep=2, n/st=100, rew=11802.50]                                                                                


Epoch #26: test_reward: 8890.200000 ± 3837.872999, best_reward: 17554.500000 ± 5526.301480 in #3


Epoch #27: 1001it [00:03, 298.03it/s, env_step=27000, gradient_step=2700, len=158, n/ep=0, n/st=100, rew=6310.25]                                                                                 


Epoch #27: test_reward: 12059.400000 ± 2849.276266, best_reward: 17554.500000 ± 5526.301480 in #3


Epoch #28: 1001it [00:03, 296.34it/s, env_step=28000, gradient_step=2800, len=279, n/ep=0, n/st=100, rew=12293.00]                                                                                


Epoch #28: test_reward: 12735.300000 ± 2695.584911, best_reward: 17554.500000 ± 5526.301480 in #3


Epoch #29: 1001it [00:03, 307.18it/s, env_step=29000, gradient_step=2900, len=114, n/ep=0, n/st=100, rew=5305.00]                                                                                 


Epoch #29: test_reward: 13748.600000 ± 4267.193977, best_reward: 17554.500000 ± 5526.301480 in #3


Epoch #30: 1001it [00:03, 333.12it/s, env_step=30000, gradient_step=3000, len=204, n/ep=2, n/st=100, rew=9988.75]                                                                                 


Epoch #30: test_reward: 9498.200000 ± 5418.732653, best_reward: 17554.500000 ± 5526.301480 in #3


Epoch #31: 1001it [00:03, 297.78it/s, env_step=31000, gradient_step=3100, len=218, n/ep=0, n/st=100, rew=9733.50]                                                                                 


Epoch #31: test_reward: 12461.400000 ± 3144.844677, best_reward: 17554.500000 ± 5526.301480 in #3


Epoch #32: 1001it [00:03, 324.03it/s, env_step=32000, gradient_step=3200, len=102, n/ep=1, n/st=100, rew=4558.50]                                                                                 


Epoch #32: test_reward: 9981.300000 ± 3164.534343, best_reward: 17554.500000 ± 5526.301480 in #3


Epoch #33: 1001it [00:03, 299.97it/s, env_step=33000, gradient_step=3300, len=330, n/ep=1, n/st=100, rew=17317.00]                                                                                


Epoch #33: test_reward: 12072.300000 ± 6028.439981, best_reward: 17554.500000 ± 5526.301480 in #3


Epoch #34: 1001it [00:03, 313.44it/s, env_step=34000, gradient_step=3400, len=135, n/ep=2, n/st=100, rew=6312.50]                                                                                 


Epoch #34: test_reward: 12562.600000 ± 5017.578643, best_reward: 17554.500000 ± 5526.301480 in #3


Epoch #35: 1001it [00:03, 318.02it/s, env_step=35000, gradient_step=3500, len=344, n/ep=0, n/st=100, rew=17231.00]                                                                                


Epoch #35: test_reward: 20559.600000 ± 6887.182809, best_reward: 20559.600000 ± 6887.182809 in #35


Epoch #36: 1001it [00:03, 274.15it/s, env_step=36000, gradient_step=3600, len=157, n/ep=0, n/st=100, rew=9178.00]                                                                                 


Epoch #36: test_reward: 12357.500000 ± 2875.844754, best_reward: 20559.600000 ± 6887.182809 in #35


Epoch #37: 1001it [00:03, 311.24it/s, env_step=37000, gradient_step=3700, len=130, n/ep=0, n/st=100, rew=6874.00]                                                                                 


Epoch #37: test_reward: 13733.700000 ± 4942.420258, best_reward: 20559.600000 ± 6887.182809 in #35


Epoch #38: 1001it [00:03, 304.58it/s, env_step=38000, gradient_step=3800, len=174, n/ep=1, n/st=100, rew=7949.00]                                                                                 


Epoch #38: test_reward: 11841.800000 ± 5183.191407, best_reward: 20559.600000 ± 6887.182809 in #35


Epoch #39: 1001it [00:03, 304.93it/s, env_step=39000, gradient_step=3900, len=176, n/ep=1, n/st=100, rew=9948.00]                                                                                 


Epoch #39: test_reward: 11572.300000 ± 4893.864078, best_reward: 20559.600000 ± 6887.182809 in #35


Epoch #40: 1001it [00:03, 291.70it/s, env_step=40000, gradient_step=4000, len=189, n/ep=0, n/st=100, rew=8155.25]                                                                                 


Epoch #40: test_reward: 11225.300000 ± 3054.643221, best_reward: 20559.600000 ± 6887.182809 in #35


Epoch #41: 1001it [00:03, 270.98it/s, env_step=41000, gradient_step=4100, len=216, n/ep=1, n/st=100, rew=11820.00]                                                                                


Epoch #41: test_reward: 10320.600000 ± 2216.291506, best_reward: 20559.600000 ± 6887.182809 in #35


Epoch #42: 1001it [00:03, 279.18it/s, env_step=42000, gradient_step=4200, len=103, n/ep=1, n/st=100, rew=5483.00]                                                                                 


Epoch #42: test_reward: 8845.100000 ± 4609.122616, best_reward: 20559.600000 ± 6887.182809 in #35


Epoch #43: 1001it [00:03, 319.56it/s, env_step=43000, gradient_step=4300, len=124, n/ep=0, n/st=100, rew=6549.50]                                                                                 


Epoch #43: test_reward: 13574.500000 ± 3007.739226, best_reward: 20559.600000 ± 6887.182809 in #35


Epoch #44: 1001it [00:03, 306.26it/s, env_step=44000, gradient_step=4400, len=124, n/ep=0, n/st=100, rew=6549.50]                                                                                 


Epoch #44: test_reward: 15054.900000 ± 2279.088039, best_reward: 20559.600000 ± 6887.182809 in #35


Epoch #45: 1001it [00:03, 297.81it/s, env_step=45000, gradient_step=4500, len=145, n/ep=0, n/st=100, rew=7685.75]                                                                                 


Epoch #45: test_reward: 10748.400000 ± 3053.457719, best_reward: 20559.600000 ± 6887.182809 in #35


Epoch #46: 1001it [00:03, 325.10it/s, env_step=46000, gradient_step=4600, len=122, n/ep=3, n/st=100, rew=6213.00]                                                                                 


Epoch #46: test_reward: 15029.100000 ± 5032.590257, best_reward: 20559.600000 ± 6887.182809 in #35


Epoch #47: 1001it [00:03, 319.70it/s, env_step=47000, gradient_step=4700, len=64, n/ep=1, n/st=100, rew=3107.50]                                                                                  


Epoch #47: test_reward: 7896.400000 ± 1210.119267, best_reward: 20559.600000 ± 6887.182809 in #35


Epoch #48: 1001it [00:03, 315.84it/s, env_step=48000, gradient_step=4800, len=214, n/ep=1, n/st=100, rew=12759.50]                                                                                


Epoch #48: test_reward: 7043.900000 ± 3406.047018, best_reward: 20559.600000 ± 6887.182809 in #35


Epoch #49: 1001it [00:03, 302.66it/s, env_step=49000, gradient_step=4900, len=172, n/ep=0, n/st=100, rew=9651.50]                                                                                 


Epoch #49: test_reward: 10856.100000 ± 3628.108059, best_reward: 20559.600000 ± 6887.182809 in #35


Epoch #50: 1001it [00:03, 282.56it/s, env_step=50000, gradient_step=5000, len=120, n/ep=1, n/st=100, rew=6946.00]                                                                                 


Epoch #50: test_reward: 17718.100000 ± 5837.938377, best_reward: 20559.600000 ± 6887.182809 in #35


Epoch #51: 1001it [00:03, 270.61it/s, env_step=51000, gradient_step=5100, len=203, n/ep=3, n/st=100, rew=11275.83]                                                                                


Epoch #51: test_reward: 11720.300000 ± 3703.206963, best_reward: 20559.600000 ± 6887.182809 in #35


Epoch #52: 1001it [00:03, 276.17it/s, env_step=52000, gradient_step=5200, len=178, n/ep=1, n/st=100, rew=9743.00]                                                                                 


Epoch #52: test_reward: 11009.700000 ± 2653.281253, best_reward: 20559.600000 ± 6887.182809 in #35


Epoch #53: 1001it [00:03, 311.34it/s, env_step=53000, gradient_step=5300, len=230, n/ep=0, n/st=100, rew=13198.00]                                                                                


Epoch #53: test_reward: 11891.900000 ± 6604.200004, best_reward: 20559.600000 ± 6887.182809 in #35


Epoch #54: 1001it [00:03, 330.05it/s, env_step=54000, gradient_step=5400, len=178, n/ep=2, n/st=100, rew=10343.25]                                                                                


Epoch #54: test_reward: 9165.200000 ± 5988.106976, best_reward: 20559.600000 ± 6887.182809 in #35


Epoch #55: 1001it [00:03, 305.56it/s, env_step=55000, gradient_step=5500, len=68, n/ep=0, n/st=100, rew=2427.00]                                                                                  


Epoch #55: test_reward: 10131.300000 ± 2504.455072, best_reward: 20559.600000 ± 6887.182809 in #35


Epoch #56: 1001it [00:03, 278.35it/s, env_step=56000, gradient_step=5600, len=186, n/ep=1, n/st=100, rew=11179.50]                                                                                


Epoch #56: test_reward: 12425.800000 ± 1922.462213, best_reward: 20559.600000 ± 6887.182809 in #35


Epoch #57: 1001it [00:03, 280.58it/s, env_step=57000, gradient_step=5700, len=178, n/ep=1, n/st=100, rew=9504.00]                                                                                 


Epoch #57: test_reward: 8113.300000 ± 5685.318849, best_reward: 20559.600000 ± 6887.182809 in #35


Epoch #58: 1001it [00:03, 299.39it/s, env_step=58000, gradient_step=5800, len=194, n/ep=1, n/st=100, rew=11220.00]                                                                                


Epoch #58: test_reward: 13060.900000 ± 3013.582667, best_reward: 20559.600000 ± 6887.182809 in #35


Epoch #59: 1001it [00:03, 294.88it/s, env_step=59000, gradient_step=5900, len=84, n/ep=0, n/st=100, rew=4536.00]                                                                                  


Epoch #59: test_reward: 13950.200000 ± 5599.722704, best_reward: 20559.600000 ± 6887.182809 in #35


Epoch #60: 1001it [00:03, 299.69it/s, env_step=60000, gradient_step=6000, len=212, n/ep=0, n/st=100, rew=12499.75]                                                                                


Epoch #60: test_reward: 15703.000000 ± 5078.664470, best_reward: 20559.600000 ± 6887.182809 in #35


Epoch #61: 1001it [00:03, 296.20it/s, env_step=61000, gradient_step=6100, len=193, n/ep=0, n/st=100, rew=11803.50]                                                                                


Epoch #61: test_reward: 11827.200000 ± 4637.030705, best_reward: 20559.600000 ± 6887.182809 in #35


Epoch #62: 1001it [00:03, 291.31it/s, env_step=62000, gradient_step=6200, len=116, n/ep=1, n/st=100, rew=6319.00]                                                                                 


Epoch #62: test_reward: 14592.300000 ± 2584.475500, best_reward: 20559.600000 ± 6887.182809 in #35


Epoch #63: 1001it [00:03, 293.73it/s, env_step=63000, gradient_step=6300, len=50, n/ep=0, n/st=100, rew=2328.00]                                                                                  


Epoch #63: test_reward: 18834.600000 ± 6163.287503, best_reward: 20559.600000 ± 6887.182809 in #35


Epoch #64: 1001it [00:03, 299.42it/s, env_step=64000, gradient_step=6400, len=177, n/ep=0, n/st=100, rew=10711.00]                                                                                


Epoch #64: test_reward: 12841.000000 ± 2540.950964, best_reward: 20559.600000 ± 6887.182809 in #35


Epoch #65: 1001it [00:03, 320.63it/s, env_step=65000, gradient_step=6500, len=295, n/ep=0, n/st=100, rew=17926.50]                                                                                


Epoch #65: test_reward: 14141.000000 ± 6205.952530, best_reward: 20559.600000 ± 6887.182809 in #35


Epoch #66: 1001it [00:03, 298.27it/s, env_step=66000, gradient_step=6600, len=190, n/ep=0, n/st=100, rew=10407.50]                                                                                


Epoch #66: test_reward: 10673.600000 ± 4443.070992, best_reward: 20559.600000 ± 6887.182809 in #35


Epoch #67: 1001it [00:03, 304.15it/s, env_step=67000, gradient_step=6700, len=198, n/ep=1, n/st=100, rew=11461.00]                                                                                


Epoch #67: test_reward: 13612.800000 ± 4975.646125, best_reward: 20559.600000 ± 6887.182809 in #35


Epoch #68: 1001it [00:03, 293.57it/s, env_step=68000, gradient_step=6800, len=54, n/ep=0, n/st=100, rew=2295.00]                                                                                  


Epoch #68: test_reward: 9994.000000 ± 3921.450752, best_reward: 20559.600000 ± 6887.182809 in #35


Epoch #69: 1001it [00:03, 276.93it/s, env_step=69000, gradient_step=6900, len=178, n/ep=1, n/st=100, rew=10493.00]                                                                                


Epoch #69: test_reward: 12769.400000 ± 4900.427353, best_reward: 20559.600000 ± 6887.182809 in #35


Epoch #70: 1001it [00:03, 333.19it/s, env_step=70000, gradient_step=7000, len=140, n/ep=0, n/st=100, rew=7602.25]                                                                                 


Epoch #70: test_reward: 10730.100000 ± 4028.147005, best_reward: 20559.600000 ± 6887.182809 in #35


Epoch #71: 1001it [00:03, 322.10it/s, env_step=71000, gradient_step=7100, len=204, n/ep=1, n/st=100, rew=12163.00]                                                                                


Epoch #71: test_reward: 9825.400000 ± 3106.184837, best_reward: 20559.600000 ± 6887.182809 in #35


Epoch #72: 1001it [00:03, 259.23it/s, env_step=72000, gradient_step=7200, len=237, n/ep=2, n/st=100, rew=14560.25]                                                                                


Epoch #72: test_reward: 13548.700000 ± 6320.514299, best_reward: 20559.600000 ± 6887.182809 in #35


Epoch #73: 1001it [00:03, 284.70it/s, env_step=73000, gradient_step=7300, len=301, n/ep=0, n/st=100, rew=17881.00]                                                                                


Epoch #73: test_reward: 10886.000000 ± 4250.202348, best_reward: 20559.600000 ± 6887.182809 in #35


Epoch #74: 1001it [00:03, 291.95it/s, env_step=74000, gradient_step=7400, len=216, n/ep=1, n/st=100, rew=13350.00]                                                                                


Epoch #74: test_reward: 11773.200000 ± 4489.646062, best_reward: 20559.600000 ± 6887.182809 in #35


Epoch #75: 1001it [00:03, 329.75it/s, env_step=75000, gradient_step=7500, len=202, n/ep=1, n/st=100, rew=12231.50]                                                                                


Epoch #75: test_reward: 8526.000000 ± 2856.579913, best_reward: 20559.600000 ± 6887.182809 in #35


Epoch #76: 1001it [00:03, 284.92it/s, env_step=76000, gradient_step=7600, len=140, n/ep=1, n/st=100, rew=7025.00]                                                                                 


Epoch #76: test_reward: 15082.300000 ± 6488.659338, best_reward: 20559.600000 ± 6887.182809 in #35


Epoch #77: 1001it [00:03, 307.31it/s, env_step=77000, gradient_step=7700, len=225, n/ep=2, n/st=100, rew=13563.75]                                                                                


Epoch #77: test_reward: 12078.600000 ± 4471.973775, best_reward: 20559.600000 ± 6887.182809 in #35


Epoch #78: 1001it [00:03, 305.18it/s, env_step=78000, gradient_step=7800, len=162, n/ep=3, n/st=100, rew=9507.00]                                                                                 


Epoch #78: test_reward: 10921.600000 ± 1668.463677, best_reward: 20559.600000 ± 6887.182809 in #35


Epoch #79: 1001it [00:03, 288.24it/s, env_step=79000, gradient_step=7900, len=182, n/ep=0, n/st=100, rew=10958.50]                                                                                


Epoch #79: test_reward: 4861.900000 ± 5537.419479, best_reward: 20559.600000 ± 6887.182809 in #35


Epoch #80: 1001it [00:03, 324.11it/s, env_step=80000, gradient_step=8000, len=290, n/ep=0, n/st=100, rew=18120.00]                                                                                


Epoch #80: test_reward: 15016.300000 ± 5959.279286, best_reward: 20559.600000 ± 6887.182809 in #35


Epoch #81: 1001it [00:03, 276.21it/s, env_step=81000, gradient_step=8100, len=244, n/ep=0, n/st=100, rew=13832.50]                                                                                


Epoch #81: test_reward: 13421.600000 ± 4816.933863, best_reward: 20559.600000 ± 6887.182809 in #35


Epoch #82: 1001it [00:03, 290.91it/s, env_step=82000, gradient_step=8200, len=208, n/ep=0, n/st=100, rew=12717.00]                                                                                


Epoch #82: test_reward: 14182.000000 ± 4709.651707, best_reward: 20559.600000 ± 6887.182809 in #35


Epoch #83: 1001it [00:03, 287.49it/s, env_step=83000, gradient_step=8300, len=182, n/ep=0, n/st=100, rew=10404.00]                                                                                


Epoch #83: test_reward: 13203.600000 ± 6835.691029, best_reward: 20559.600000 ± 6887.182809 in #35


Epoch #84: 1001it [00:03, 292.43it/s, env_step=84000, gradient_step=8400, len=233, n/ep=0, n/st=100, rew=15110.00]                                                                                


Epoch #84: test_reward: 24595.300000 ± 2208.956770, best_reward: 24595.300000 ± 2208.956770 in #84

InfoStats(gradient_step=8400, best_reward=24595.3, best_reward_std=2208.9567696086765, train_step=84000, train_episode=418, test_step=177506, test_episode=850, timing=TimingStats(total_time=437.3051743507385, train_time=276.85353899002075, train_time_collect=39.38797903060913, train_time_update=232.17190980911255, test_time=160.45163536071777, update_speed=303.4095222565596))

(the trained policy can be accessed via policy.policies[agents[1]])
Train with agent_opponent #10


Epoch #1: 1001it [00:03, 280.96it/s, env_step=1000, gradient_step=100, len=0, n/ep=0, n/st=100, rew=0.00]                                                                                         


Epoch #1: test_reward: 11049.400000 ± 2975.287724, best_reward: 15309.400000 ± 5278.371495 in #0


Epoch #2: 1001it [00:03, 331.21it/s, env_step=2000, gradient_step=200, len=0, n/ep=0, n/st=100, rew=0.00]                                                                                         


Epoch #2: test_reward: 12625.500000 ± 4603.196308, best_reward: 15309.400000 ± 5278.371495 in #0


Epoch #3: 1001it [00:03, 325.58it/s, env_step=3000, gradient_step=300, len=0, n/ep=0, n/st=100, rew=0.00]                                                                                         


Epoch #3: test_reward: 15587.200000 ± 4825.602238, best_reward: 15587.200000 ± 4825.602238 in #3


Epoch #4: 1001it [00:03, 325.78it/s, env_step=4000, gradient_step=400, len=0, n/ep=0, n/st=100, rew=0.00]                                                                                         


Epoch #4: test_reward: 11466.300000 ± 3999.395306, best_reward: 15587.200000 ± 4825.602238 in #3


Epoch #5: 1001it [00:03, 318.56it/s, env_step=5000, gradient_step=500, len=0, n/ep=0, n/st=100, rew=0.00]                                                                                         


Epoch #5: test_reward: 10092.800000 ± 3171.101001, best_reward: 15587.200000 ± 4825.602238 in #3


Epoch #6: 1001it [00:03, 330.65it/s, env_step=6000, gradient_step=600, len=54, n/ep=0, n/st=100, rew=1846.00]                                                                                     


Epoch #6: test_reward: 10900.900000 ± 4246.959158, best_reward: 15587.200000 ± 4825.602238 in #3


Epoch #7: 1001it [00:03, 316.59it/s, env_step=7000, gradient_step=700, len=66, n/ep=0, n/st=100, rew=2151.00]                                                                                     


Epoch #7: test_reward: 13323.600000 ± 4105.090406, best_reward: 15587.200000 ± 4825.602238 in #3


Epoch #8: 1001it [00:03, 292.87it/s, env_step=8000, gradient_step=800, len=80, n/ep=1, n/st=100, rew=2414.00]                                                                                     


Epoch #8: test_reward: 10520.600000 ± 6157.401452, best_reward: 15587.200000 ± 4825.602238 in #3


Epoch #9: 1001it [00:03, 301.99it/s, env_step=9000, gradient_step=900, len=86, n/ep=0, n/st=100, rew=3449.00]                                                                                     


Epoch #9: test_reward: 11129.800000 ± 4087.734478, best_reward: 15587.200000 ± 4825.602238 in #3


Epoch #10: 1001it [00:03, 293.86it/s, env_step=10000, gradient_step=1000, len=86, n/ep=0, n/st=100, rew=3449.00]                                                                                  


Epoch #10: test_reward: 9302.200000 ± 3005.009744, best_reward: 15587.200000 ± 4825.602238 in #3


Epoch #11: 1001it [00:03, 304.07it/s, env_step=11000, gradient_step=1100, len=108, n/ep=0, n/st=100, rew=4529.00]                                                                                 


Epoch #11: test_reward: 11861.100000 ± 5201.555737, best_reward: 15587.200000 ± 4825.602238 in #3


Epoch #12: 1001it [00:03, 291.63it/s, env_step=12000, gradient_step=1200, len=118, n/ep=0, n/st=100, rew=5797.00]                                                                                 


Epoch #12: test_reward: 8334.100000 ± 2538.221442, best_reward: 15587.200000 ± 4825.602238 in #3


Epoch #13: 1001it [00:03, 288.56it/s, env_step=13000, gradient_step=1300, len=130, n/ep=1, n/st=100, rew=4641.00]                                                                                 


Epoch #13: test_reward: 9139.700000 ± 3016.883725, best_reward: 15587.200000 ± 4825.602238 in #3


Epoch #14: 1001it [00:02, 343.60it/s, env_step=14000, gradient_step=1400, len=138, n/ep=0, n/st=100, rew=6243.00]                                                                                 


Epoch #14: test_reward: 18374.400000 ± 3198.548802, best_reward: 18374.400000 ± 3198.548802 in #14


Epoch #15: 1001it [00:02, 337.73it/s, env_step=15000, gradient_step=1500, len=148, n/ep=0, n/st=100, rew=5570.00]                                                                                 


Epoch #15: test_reward: 12721.200000 ± 5380.292349, best_reward: 18374.400000 ± 3198.548802 in #14


Epoch #16: 1001it [00:02, 361.93it/s, env_step=16000, gradient_step=1600, len=156, n/ep=0, n/st=100, rew=7624.00]                                                                                 


Epoch #16: test_reward: 11213.100000 ± 4748.370383, best_reward: 18374.400000 ± 3198.548802 in #14


Epoch #17: 1001it [00:02, 353.24it/s, env_step=17000, gradient_step=1700, len=112, n/ep=0, n/st=100, rew=2535.00]                                                                                 


Epoch #17: test_reward: 10558.100000 ± 4631.576329, best_reward: 18374.400000 ± 3198.548802 in #14


Epoch #18: 1001it [00:03, 333.29it/s, env_step=18000, gradient_step=1800, len=140, n/ep=2, n/st=100, rew=6823.00]                                                                                 


Epoch #18: test_reward: 11576.500000 ± 3672.674210, best_reward: 18374.400000 ± 3198.548802 in #14


Epoch #19: 1001it [00:02, 375.95it/s, env_step=19000, gradient_step=1900, len=190, n/ep=3, n/st=100, rew=7956.00]                                                                                 


Epoch #19: test_reward: 14176.000000 ± 4371.217496, best_reward: 18374.400000 ± 3198.548802 in #14


Epoch #20: 1001it [00:02, 390.29it/s, env_step=20000, gradient_step=2000, len=198, n/ep=0, n/st=100, rew=8892.00]                                                                                 


Epoch #20: test_reward: 7969.700000 ± 2941.182757, best_reward: 18374.400000 ± 3198.548802 in #14


Epoch #21: 1001it [00:02, 418.78it/s, env_step=21000, gradient_step=2100, len=210, n/ep=2, n/st=100, rew=10538.50]                                                                                


Epoch #21: test_reward: 12636.000000 ± 5358.294374, best_reward: 18374.400000 ± 3198.548802 in #14


Epoch #22: 1001it [00:02, 430.17it/s, env_step=22000, gradient_step=2200, len=219, n/ep=0, n/st=100, rew=10558.00]                                                                                


Epoch #22: test_reward: 10739.100000 ± 3488.410023, best_reward: 18374.400000 ± 3198.548802 in #14


Epoch #23: 1001it [00:02, 363.91it/s, env_step=23000, gradient_step=2300, len=230, n/ep=1, n/st=100, rew=10434.00]                                                                                


Epoch #23: test_reward: 9356.000000 ± 2084.171970, best_reward: 18374.400000 ± 3198.548802 in #14


Epoch #24: 1001it [00:03, 307.12it/s, env_step=24000, gradient_step=2400, len=234, n/ep=0, n/st=100, rew=10843.00]                                                                                


Epoch #24: test_reward: 12466.500000 ± 6420.452651, best_reward: 18374.400000 ± 3198.548802 in #14


Epoch #25: 1001it [00:02, 409.97it/s, env_step=25000, gradient_step=2500, len=250, n/ep=1, n/st=100, rew=11241.00]                                                                                


Epoch #25: test_reward: 12262.300000 ± 3183.148694, best_reward: 18374.400000 ± 3198.548802 in #14


Epoch #26: 1001it [00:02, 383.01it/s, env_step=26000, gradient_step=2600, len=258, n/ep=0, n/st=100, rew=11775.00]                                                                                


Epoch #26: test_reward: 11347.800000 ± 4256.607306, best_reward: 18374.400000 ± 3198.548802 in #14


Epoch #27: 1001it [00:02, 392.41it/s, env_step=27000, gradient_step=2700, len=187, n/ep=0, n/st=100, rew=8848.50]                                                                                 


Epoch #27: test_reward: 10550.100000 ± 2770.189721, best_reward: 18374.400000 ± 3198.548802 in #14


Epoch #28: 1001it [00:02, 388.75it/s, env_step=28000, gradient_step=2800, len=228, n/ep=2, n/st=100, rew=11586.25]                                                                                


Epoch #28: test_reward: 12187.400000 ± 4196.593385, best_reward: 18374.400000 ± 3198.548802 in #14


Epoch #29: 1001it [00:02, 370.39it/s, env_step=29000, gradient_step=2900, len=78, n/ep=2, n/st=100, rew=3190.00]                                                                                  


Epoch #29: test_reward: 12681.600000 ± 3407.003029, best_reward: 18374.400000 ± 3198.548802 in #14


Epoch #30: 1001it [00:02, 382.39it/s, env_step=30000, gradient_step=3000, len=226, n/ep=2, n/st=100, rew=11039.25]                                                                                


Epoch #30: test_reward: 10127.700000 ± 2524.524551, best_reward: 18374.400000 ± 3198.548802 in #14


Epoch #31: 1001it [00:02, 428.79it/s, env_step=31000, gradient_step=3100, len=310, n/ep=1, n/st=100, rew=14542.00]                                                                                


Epoch #31: test_reward: 5863.200000 ± 2986.768983, best_reward: 18374.400000 ± 3198.548802 in #14


Epoch #32: 1001it [00:02, 337.27it/s, env_step=32000, gradient_step=3200, len=115, n/ep=0, n/st=100, rew=5170.00]                                                                                 


Epoch #32: test_reward: 8717.800000 ± 1545.527405, best_reward: 18374.400000 ± 3198.548802 in #14


Epoch #33: 1001it [00:02, 411.53it/s, env_step=33000, gradient_step=3300, len=188, n/ep=3, n/st=100, rew=8898.33]                                                                                 


Epoch #33: test_reward: 11339.000000 ± 2933.673874, best_reward: 18374.400000 ± 3198.548802 in #14


Epoch #34: 1001it [00:02, 425.11it/s, env_step=34000, gradient_step=3400, len=157, n/ep=0, n/st=100, rew=7868.75]                                                                                 


Epoch #34: test_reward: 9101.700000 ± 1568.298317, best_reward: 18374.400000 ± 3198.548802 in #14


Epoch #35: 1001it [00:02, 366.50it/s, env_step=35000, gradient_step=3500, len=161, n/ep=0, n/st=100, rew=8517.50]                                                                                 


Epoch #35: test_reward: 10720.300000 ± 3180.473614, best_reward: 18374.400000 ± 3198.548802 in #14


Epoch #36: 1001it [00:02, 380.89it/s, env_step=36000, gradient_step=3600, len=156, n/ep=0, n/st=100, rew=8592.00]                                                                                 


Epoch #36: test_reward: 10465.100000 ± 3505.964359, best_reward: 18374.400000 ± 3198.548802 in #14


Epoch #37: 1001it [00:02, 352.56it/s, env_step=37000, gradient_step=3700, len=195, n/ep=0, n/st=100, rew=10506.00]                                                                                


Epoch #37: test_reward: 9976.000000 ± 6688.949514, best_reward: 18374.400000 ± 3198.548802 in #14


Epoch #38: 1001it [00:03, 309.87it/s, env_step=38000, gradient_step=3800, len=232, n/ep=5, n/st=100, rew=12406.60]                                                                                


Epoch #38: test_reward: 10873.800000 ± 4641.051816, best_reward: 18374.400000 ± 3198.548802 in #14


Epoch #39: 1001it [00:02, 336.21it/s, env_step=39000, gradient_step=3900, len=142, n/ep=0, n/st=100, rew=7039.00]                                                                                 


Epoch #39: test_reward: 12512.500000 ± 3876.644200, best_reward: 18374.400000 ± 3198.548802 in #14


Epoch #40: 1001it [00:02, 380.83it/s, env_step=40000, gradient_step=4000, len=218, n/ep=0, n/st=100, rew=12204.00]                                                                                


Epoch #40: test_reward: 8677.800000 ± 2650.588682, best_reward: 18374.400000 ± 3198.548802 in #14


Epoch #41: 1001it [00:02, 336.50it/s, env_step=41000, gradient_step=4100, len=191, n/ep=2, n/st=100, rew=10794.75]                                                                                


Epoch #41: test_reward: 10847.700000 ± 2102.023123, best_reward: 18374.400000 ± 3198.548802 in #14


Epoch #42: 1001it [00:02, 350.05it/s, env_step=42000, gradient_step=4200, len=198, n/ep=0, n/st=100, rew=11208.00]                                                                                


Epoch #42: test_reward: 10570.100000 ± 4374.523024, best_reward: 18374.400000 ± 3198.548802 in #14


Epoch #43: 1001it [00:02, 369.22it/s, env_step=43000, gradient_step=4300, len=50, n/ep=1, n/st=100, rew=1832.00]                                                                                  


Epoch #43: test_reward: 7035.000000 ± 3460.349144, best_reward: 18374.400000 ± 3198.548802 in #14


Epoch #44: 1001it [00:02, 384.26it/s, env_step=44000, gradient_step=4400, len=94, n/ep=1, n/st=100, rew=5204.00]                                                                                  


Epoch #44: test_reward: 9135.200000 ± 1517.645993, best_reward: 18374.400000 ± 3198.548802 in #14


Epoch #45: 1001it [00:03, 322.88it/s, env_step=45000, gradient_step=4500, len=279, n/ep=1, n/st=100, rew=15798.00]                                                                                


Epoch #45: test_reward: 14600.500000 ± 7181.151457, best_reward: 18374.400000 ± 3198.548802 in #14


Epoch #46: 1001it [00:02, 345.74it/s, env_step=46000, gradient_step=4600, len=48, n/ep=0, n/st=100, rew=1502.00]                                                                                  


Epoch #46: test_reward: 13252.000000 ± 8046.722463, best_reward: 18374.400000 ± 3198.548802 in #14


Epoch #47: 1001it [00:02, 338.28it/s, env_step=47000, gradient_step=4700, len=192, n/ep=0, n/st=100, rew=10695.00]                                                                                


Epoch #47: test_reward: 12308.600000 ± 3226.731200, best_reward: 18374.400000 ± 3198.548802 in #14


Epoch #48: 1001it [00:02, 393.02it/s, env_step=48000, gradient_step=4800, len=167, n/ep=0, n/st=100, rew=10125.25]                                                                                


Epoch #48: test_reward: 9351.400000 ± 3242.483406, best_reward: 18374.400000 ± 3198.548802 in #14


Epoch #49: 1001it [00:02, 395.31it/s, env_step=49000, gradient_step=4900, len=145, n/ep=0, n/st=100, rew=8367.75]                                                                                 


Epoch #49: test_reward: 13245.000000 ± 5046.257485, best_reward: 18374.400000 ± 3198.548802 in #14


Epoch #50: 1001it [00:02, 359.34it/s, env_step=50000, gradient_step=5000, len=114, n/ep=2, n/st=100, rew=5955.50]                                                                                 


Epoch #50: test_reward: 11460.400000 ± 2864.742334, best_reward: 18374.400000 ± 3198.548802 in #14


Epoch #51: 1001it [00:02, 344.75it/s, env_step=51000, gradient_step=5100, len=158, n/ep=1, n/st=100, rew=8978.00]                                                                                 


Epoch #51: test_reward: 12876.500000 ± 2848.689602, best_reward: 18374.400000 ± 3198.548802 in #14


Epoch #52: 1001it [00:02, 353.60it/s, env_step=52000, gradient_step=5200, len=156, n/ep=0, n/st=100, rew=8793.00]                                                                                 


Epoch #52: test_reward: 10957.100000 ± 5634.441719, best_reward: 18374.400000 ± 3198.548802 in #14


Epoch #53: 1001it [00:02, 339.43it/s, env_step=53000, gradient_step=5300, len=225, n/ep=0, n/st=100, rew=13377.00]                                                                                


Epoch #53: test_reward: 9783.900000 ± 4088.835787, best_reward: 18374.400000 ± 3198.548802 in #14


Epoch #54: 1001it [00:03, 325.66it/s, env_step=54000, gradient_step=5400, len=164, n/ep=1, n/st=100, rew=9815.50]                                                                                 


Epoch #54: test_reward: 16812.800000 ± 4330.370164, best_reward: 18374.400000 ± 3198.548802 in #14


Epoch #55: 1001it [00:03, 276.43it/s, env_step=55000, gradient_step=5500, len=88, n/ep=1, n/st=100, rew=4604.00]                                                                                  


Epoch #55: test_reward: 9672.600000 ± 3779.069097, best_reward: 18374.400000 ± 3198.548802 in #14


Epoch #56: 1001it [00:03, 303.68it/s, env_step=56000, gradient_step=5600, len=156, n/ep=0, n/st=100, rew=9171.00]                                                                                 


Epoch #56: test_reward: 13150.700000 ± 6128.785590, best_reward: 18374.400000 ± 3198.548802 in #14


Epoch #57: 1001it [00:03, 285.19it/s, env_step=57000, gradient_step=5700, len=146, n/ep=2, n/st=100, rew=7893.75]                                                                                 


Epoch #57: test_reward: 12489.600000 ± 4884.117755, best_reward: 18374.400000 ± 3198.548802 in #14


Epoch #58: 1001it [00:03, 298.63it/s, env_step=58000, gradient_step=5800, len=156, n/ep=0, n/st=100, rew=9872.25]                                                                                 


Epoch #58: test_reward: 12044.700000 ± 1889.549737, best_reward: 18374.400000 ± 3198.548802 in #14


Epoch #59: 1001it [00:03, 302.11it/s, env_step=59000, gradient_step=5900, len=149, n/ep=1, n/st=100, rew=8571.00]                                                                                 


Epoch #59: test_reward: 14227.100000 ± 3715.670934, best_reward: 18374.400000 ± 3198.548802 in #14


Epoch #60: 1001it [00:03, 317.23it/s, env_step=60000, gradient_step=6000, len=209, n/ep=2, n/st=100, rew=12330.25]                                                                                


Epoch #60: test_reward: 15866.000000 ± 3608.685522, best_reward: 18374.400000 ± 3198.548802 in #14


Epoch #61: 1001it [00:03, 324.88it/s, env_step=61000, gradient_step=6100, len=94, n/ep=1, n/st=100, rew=5212.00]                                                                                  


Epoch #61: test_reward: 13802.800000 ± 6470.734237, best_reward: 18374.400000 ± 3198.548802 in #14


Epoch #62: 1001it [00:02, 346.72it/s, env_step=62000, gradient_step=6200, len=188, n/ep=1, n/st=100, rew=11169.00]                                                                                


Epoch #62: test_reward: 10560.000000 ± 4480.917585, best_reward: 18374.400000 ± 3198.548802 in #14


Epoch #63: 1001it [00:03, 321.64it/s, env_step=63000, gradient_step=6300, len=83, n/ep=0, n/st=100, rew=3719.00]                                                                                  


Epoch #63: test_reward: 11096.300000 ± 3442.605381, best_reward: 18374.400000 ± 3198.548802 in #14


Epoch #64: 1001it [00:03, 332.37it/s, env_step=64000, gradient_step=6400, len=148, n/ep=1, n/st=100, rew=7382.50]                                                                                 


Epoch #64: test_reward: 13797.200000 ± 3417.313500, best_reward: 18374.400000 ± 3198.548802 in #14


Epoch #65: 1001it [00:02, 361.20it/s, env_step=65000, gradient_step=6500, len=186, n/ep=1, n/st=100, rew=11337.00]                                                                                


Epoch #65: test_reward: 14200.800000 ± 4125.362258, best_reward: 18374.400000 ± 3198.548802 in #14


Epoch #66: 1001it [00:02, 339.47it/s, env_step=66000, gradient_step=6600, len=136, n/ep=0, n/st=100, rew=7470.00]                                                                                 


Epoch #66: test_reward: 11369.800000 ± 2600.336894, best_reward: 18374.400000 ± 3198.548802 in #14


Epoch #67: 1001it [00:02, 387.04it/s, env_step=67000, gradient_step=6700, len=124, n/ep=0, n/st=100, rew=6742.00]                                                                                 


Epoch #67: test_reward: 9018.600000 ± 4151.300596, best_reward: 18374.400000 ± 3198.548802 in #14


Epoch #68: 1001it [00:02, 424.21it/s, env_step=68000, gradient_step=6800, len=189, n/ep=3, n/st=100, rew=11042.67]                                                                                


Epoch #68: test_reward: 11291.200000 ± 4132.690088, best_reward: 18374.400000 ± 3198.548802 in #14


Epoch #69: 1001it [00:02, 353.06it/s, env_step=69000, gradient_step=6900, len=309, n/ep=0, n/st=100, rew=19563.00]                                                                                


Epoch #69: test_reward: 8415.300000 ± 2607.249127, best_reward: 18374.400000 ± 3198.548802 in #14


Epoch #70: 1001it [00:02, 364.01it/s, env_step=70000, gradient_step=7000, len=191, n/ep=0, n/st=100, rew=10646.00]                                                                                


Epoch #70: test_reward: 10543.800000 ± 3978.798532, best_reward: 18374.400000 ± 3198.548802 in #14


Epoch #71: 1001it [00:02, 407.84it/s, env_step=71000, gradient_step=7100, len=126, n/ep=0, n/st=100, rew=7304.00]                                                                                 


Epoch #71: test_reward: 13807.300000 ± 2455.051977, best_reward: 18374.400000 ± 3198.548802 in #14


Epoch #72: 1001it [00:02, 376.79it/s, env_step=72000, gradient_step=7200, len=176, n/ep=0, n/st=100, rew=11124.00]                                                                                


Epoch #72: test_reward: 9469.500000 ± 3205.569318, best_reward: 18374.400000 ± 3198.548802 in #14


Epoch #73: 1001it [00:02, 355.85it/s, env_step=73000, gradient_step=7300, len=181, n/ep=1, n/st=100, rew=10964.00]                                                                                


Epoch #73: test_reward: 12693.000000 ± 1766.124797, best_reward: 18374.400000 ± 3198.548802 in #14


Epoch #74: 1001it [00:02, 335.12it/s, env_step=74000, gradient_step=7400, len=188, n/ep=0, n/st=100, rew=11316.50]                                                                                


Epoch #74: test_reward: 10319.900000 ± 1174.395031, best_reward: 18374.400000 ± 3198.548802 in #14


Epoch #75: 1001it [00:02, 334.88it/s, env_step=75000, gradient_step=7500, len=137, n/ep=0, n/st=100, rew=7463.17]                                                                                 


Epoch #75: test_reward: 9345.600000 ± 3217.526851, best_reward: 18374.400000 ± 3198.548802 in #14


Epoch #76: 1001it [00:02, 413.00it/s, env_step=76000, gradient_step=7600, len=133, n/ep=2, n/st=100, rew=7662.00]                                                                                 


Epoch #76: test_reward: 8261.500000 ± 4644.796018, best_reward: 18374.400000 ± 3198.548802 in #14


Epoch #77: 1001it [00:03, 333.03it/s, env_step=77000, gradient_step=7700, len=114, n/ep=0, n/st=100, rew=6321.00]                                                                                 


Epoch #77: test_reward: 14001.300000 ± 7261.276555, best_reward: 18374.400000 ± 3198.548802 in #14


Epoch #78: 1001it [00:02, 385.65it/s, env_step=78000, gradient_step=7800, len=293, n/ep=0, n/st=100, rew=18381.75]                                                                                


Epoch #78: test_reward: 14062.100000 ± 7523.094077, best_reward: 18374.400000 ± 3198.548802 in #14


Epoch #79: 1001it [00:03, 302.69it/s, env_step=79000, gradient_step=7900, len=168, n/ep=0, n/st=100, rew=9312.50]                                                                                 


Epoch #79: test_reward: 13488.800000 ± 4796.002623, best_reward: 18374.400000 ± 3198.548802 in #14


Epoch #80: 1001it [00:02, 366.33it/s, env_step=80000, gradient_step=8000, len=118, n/ep=1, n/st=100, rew=6979.50]                                                                                 


Epoch #80: test_reward: 9773.200000 ± 5105.523848, best_reward: 18374.400000 ± 3198.548802 in #14


Epoch #81: 1001it [00:02, 366.26it/s, env_step=81000, gradient_step=8100, len=207, n/ep=0, n/st=100, rew=12520.50]                                                                                


Epoch #81: test_reward: 9970.800000 ± 2890.558451, best_reward: 18374.400000 ± 3198.548802 in #14


Epoch #82: 1001it [00:02, 377.88it/s, env_step=82000, gradient_step=8200, len=196, n/ep=0, n/st=100, rew=11519.00]                                                                                


Epoch #82: test_reward: 12106.500000 ± 2973.836150, best_reward: 18374.400000 ± 3198.548802 in #14


Epoch #83: 1001it [00:02, 407.16it/s, env_step=83000, gradient_step=8300, len=152, n/ep=1, n/st=100, rew=9285.00]                                                                                 


Epoch #83: test_reward: 7750.800000 ± 2029.921417, best_reward: 18374.400000 ± 3198.548802 in #14


Epoch #84: 1001it [00:02, 380.67it/s, env_step=84000, gradient_step=8400, len=185, n/ep=1, n/st=100, rew=10621.00]                                                                                


Epoch #84: test_reward: 11676.500000 ± 2757.599835, best_reward: 18374.400000 ± 3198.548802 in #14


Epoch #85: 1001it [00:02, 406.35it/s, env_step=85000, gradient_step=8500, len=84, n/ep=1, n/st=100, rew=4180.00]                                                                                  


Epoch #85: test_reward: 12467.800000 ± 6785.620588, best_reward: 18374.400000 ± 3198.548802 in #14


Epoch #86: 1001it [00:02, 346.76it/s, env_step=86000, gradient_step=8600, len=302, n/ep=0, n/st=100, rew=19824.00]                                                                                


Epoch #86: test_reward: 14518.400000 ± 4265.329324, best_reward: 18374.400000 ± 3198.548802 in #14


Epoch #87: 1001it [00:02, 357.60it/s, env_step=87000, gradient_step=8700, len=151, n/ep=0, n/st=100, rew=8853.50]                                                                                 


Epoch #87: test_reward: 12039.600000 ± 7417.621131, best_reward: 18374.400000 ± 3198.548802 in #14


Epoch #88: 1001it [00:02, 354.68it/s, env_step=88000, gradient_step=8800, len=195, n/ep=3, n/st=100, rew=11646.33]                                                                                


Epoch #88: test_reward: 10749.300000 ± 3593.512322, best_reward: 18374.400000 ± 3198.548802 in #14


Epoch #89: 1001it [00:02, 357.40it/s, env_step=89000, gradient_step=8900, len=178, n/ep=2, n/st=100, rew=10713.25]                                                                                


Epoch #89: test_reward: 11234.700000 ± 3222.924326, best_reward: 18374.400000 ± 3198.548802 in #14


Epoch #90: 1001it [00:02, 406.83it/s, env_step=90000, gradient_step=9000, len=120, n/ep=0, n/st=100, rew=7048.50]                                                                                 


Epoch #90: test_reward: 7561.300000 ± 2900.041829, best_reward: 18374.400000 ± 3198.548802 in #14


Epoch #91: 1001it [00:02, 387.38it/s, env_step=91000, gradient_step=9100, len=139, n/ep=0, n/st=100, rew=7724.75]                                                                                 


Epoch #91: test_reward: 11342.000000 ± 2679.895371, best_reward: 18374.400000 ± 3198.548802 in #14


Epoch #92: 1001it [00:02, 378.93it/s, env_step=92000, gradient_step=9200, len=125, n/ep=0, n/st=100, rew=6387.00]                                                                                 


Epoch #92: test_reward: 10823.400000 ± 6698.255119, best_reward: 18374.400000 ± 3198.548802 in #14


Epoch #93: 1001it [00:02, 357.12it/s, env_step=93000, gradient_step=9300, len=205, n/ep=0, n/st=100, rew=12617.00]                                                                                


Epoch #93: test_reward: 12194.300000 ± 3374.250318, best_reward: 18374.400000 ± 3198.548802 in #14


Epoch #94: 1001it [00:02, 395.04it/s, env_step=94000, gradient_step=9400, len=194, n/ep=2, n/st=100, rew=11994.00]                                                                                


Epoch #94: test_reward: 11870.300000 ± 6770.250572, best_reward: 18374.400000 ± 3198.548802 in #14


Epoch #95: 1001it [00:02, 347.61it/s, env_step=95000, gradient_step=9500, len=122, n/ep=1, n/st=100, rew=7229.00]                                                                                 


Epoch #95: test_reward: 9115.400000 ± 2809.828294, best_reward: 18374.400000 ± 3198.548802 in #14


Epoch #96: 1001it [00:02, 374.26it/s, env_step=96000, gradient_step=9600, len=195, n/ep=2, n/st=100, rew=11784.50]                                                                                


Epoch #96: test_reward: 10319.800000 ± 1564.155862, best_reward: 18374.400000 ± 3198.548802 in #14


Epoch #97: 1001it [00:02, 380.30it/s, env_step=97000, gradient_step=9700, len=96, n/ep=1, n/st=100, rew=5148.00]                                                                                  


Epoch #97: test_reward: 15609.300000 ± 5305.299257, best_reward: 18374.400000 ± 3198.548802 in #14


Epoch #98: 1001it [00:02, 358.83it/s, env_step=98000, gradient_step=9800, len=177, n/ep=0, n/st=100, rew=10500.67]                                                                                


Epoch #98: test_reward: 15142.600000 ± 5053.731774, best_reward: 18374.400000 ± 3198.548802 in #14


Epoch #99: 1001it [00:03, 291.53it/s, env_step=99000, gradient_step=9900, len=297, n/ep=0, n/st=100, rew=19803.00]                                                                                


Epoch #99: test_reward: 9770.100000 ± 2040.736213, best_reward: 18374.400000 ± 3198.548802 in #14


Epoch #100: 1001it [00:03, 281.15it/s, env_step=100000, gradient_step=10000, len=108, n/ep=1, n/st=100, rew=6323.00]                                                                              


Epoch #100: test_reward: 11454.700000 ± 4848.716388, best_reward: 18374.400000 ± 3198.548802 in #14

InfoStats(gradient_step=10000, best_reward=18374.4, best_reward_std=3198.548802191394, train_step=100000, train_episode=536, test_step=198554, test_episode=1010, timing=TimingStats(total_time=439.97184109687805, train_time=288.1956584453583, train_time_collect=41.029494285583496, train_time_update=242.026531457901, test_time=151.77618265151978, update_speed=346.98649014853197))

(the trained policy can be accessed via policy.policies[agents[1]])
Train with agent_opponent #16


Epoch #1: 1001it [00:03, 326.50it/s, env_step=1000, gradient_step=100, len=0, n/ep=0, n/st=100, rew=0.00]                                                                                         


Epoch #1: test_reward: 18319.700000 ± 7053.964928, best_reward: 18319.700000 ± 7053.964928 in #1


Epoch #2: 1001it [00:03, 320.22it/s, env_step=2000, gradient_step=200, len=0, n/ep=0, n/st=100, rew=0.00]                                                                                         


Epoch #2: test_reward: 15956.200000 ± 4083.354547, best_reward: 18319.700000 ± 7053.964928 in #1


Epoch #3: 1001it [00:03, 283.33it/s, env_step=3000, gradient_step=300, len=0, n/ep=0, n/st=100, rew=0.00]                                                                                         


Epoch #3: test_reward: 13806.700000 ± 5792.111637, best_reward: 18319.700000 ± 7053.964928 in #1


Epoch #4: 1001it [00:03, 329.86it/s, env_step=4000, gradient_step=400, len=0, n/ep=0, n/st=100, rew=0.00]                                                                                         


Epoch #4: test_reward: 14940.600000 ± 4634.440294, best_reward: 18319.700000 ± 7053.964928 in #1


Epoch #5: 1001it [00:02, 414.83it/s, env_step=5000, gradient_step=500, len=0, n/ep=0, n/st=100, rew=0.00]                                                                                         


Epoch #5: test_reward: 11259.100000 ± 3728.328753, best_reward: 18319.700000 ± 7053.964928 in #1


Epoch #6: 1001it [00:02, 422.29it/s, env_step=6000, gradient_step=600, len=0, n/ep=0, n/st=100, rew=0.00]                                                                                         


Epoch #6: test_reward: 12363.900000 ± 4501.408523, best_reward: 18319.700000 ± 7053.964928 in #1


Epoch #7: 1001it [00:02, 394.81it/s, env_step=7000, gradient_step=700, len=0, n/ep=0, n/st=100, rew=0.00]                                                                                         


Epoch #7: test_reward: 11799.400000 ± 4676.881422, best_reward: 18319.700000 ± 7053.964928 in #1


Epoch #8: 1001it [00:02, 428.09it/s, env_step=8000, gradient_step=800, len=80, n/ep=1, n/st=100, rew=2610.00]                                                                                     


Epoch #8: test_reward: 11634.600000 ± 4064.491880, best_reward: 18319.700000 ± 7053.964928 in #1


Epoch #9: 1001it [00:02, 386.70it/s, env_step=9000, gradient_step=900, len=88, n/ep=0, n/st=100, rew=3727.00]                                                                                     


Epoch #9: test_reward: 14750.500000 ± 5779.318528, best_reward: 18319.700000 ± 7053.964928 in #1


Epoch #10: 1001it [00:02, 376.90it/s, env_step=10000, gradient_step=1000, len=100, n/ep=1, n/st=100, rew=3351.00]                                                                                 


Epoch #10: test_reward: 12516.700000 ± 4357.339970, best_reward: 18319.700000 ± 7053.964928 in #1


Epoch #11: 1001it [00:02, 396.75it/s, env_step=11000, gradient_step=1100, len=110, n/ep=1, n/st=100, rew=4160.50]                                                                                 


Epoch #11: test_reward: 9469.300000 ± 5017.258017, best_reward: 18319.700000 ± 7053.964928 in #1


Epoch #12: 1001it [00:02, 380.24it/s, env_step=12000, gradient_step=1200, len=119, n/ep=0, n/st=100, rew=5352.00]                                                                                 


Epoch #12: test_reward: 11420.900000 ± 6900.995732, best_reward: 18319.700000 ± 7053.964928 in #1


Epoch #13: 1001it [00:02, 409.80it/s, env_step=13000, gradient_step=1300, len=128, n/ep=0, n/st=100, rew=6134.00]                                                                                 


Epoch #13: test_reward: 13760.400000 ± 6170.315506, best_reward: 18319.700000 ± 7053.964928 in #1


Epoch #14: 1001it [00:02, 436.57it/s, env_step=14000, gradient_step=1400, len=137, n/ep=0, n/st=100, rew=5751.75]                                                                                 


Epoch #14: test_reward: 9866.300000 ± 5931.209658, best_reward: 18319.700000 ± 7053.964928 in #1


Epoch #15: 1001it [00:02, 358.74it/s, env_step=15000, gradient_step=1500, len=144, n/ep=0, n/st=100, rew=6109.00]                                                                                 


Epoch #15: test_reward: 9094.400000 ± 5267.467403, best_reward: 18319.700000 ± 7053.964928 in #1


Epoch #16: 1001it [00:02, 408.68it/s, env_step=16000, gradient_step=1600, len=160, n/ep=1, n/st=100, rew=7439.00]                                                                                 


Epoch #16: test_reward: 10857.100000 ± 5580.313100, best_reward: 18319.700000 ± 7053.964928 in #1


Epoch #17: 1001it [00:02, 389.94it/s, env_step=17000, gradient_step=1700, len=166, n/ep=0, n/st=100, rew=7512.00]                                                                                 


Epoch #17: test_reward: 9716.600000 ± 3704.756516, best_reward: 18319.700000 ± 7053.964928 in #1


Epoch #18: 1001it [00:02, 371.11it/s, env_step=18000, gradient_step=1800, len=178, n/ep=0, n/st=100, rew=8143.50]                                                                                 


Epoch #18: test_reward: 10844.100000 ± 4813.514900, best_reward: 18319.700000 ± 7053.964928 in #1


Epoch #19: 1001it [00:02, 439.50it/s, env_step=19000, gradient_step=1900, len=178, n/ep=0, n/st=100, rew=8143.50]                                                                                 


Epoch #19: test_reward: 13393.900000 ± 2468.095316, best_reward: 18319.700000 ± 7053.964928 in #1


Epoch #20: 1001it [00:02, 391.04it/s, env_step=20000, gradient_step=2000, len=54, n/ep=0, n/st=100, rew=1804.50]                                                                                  


Epoch #20: test_reward: 14158.600000 ± 5164.822169, best_reward: 18319.700000 ± 7053.964928 in #1


Epoch #21: 1001it [00:02, 386.19it/s, env_step=21000, gradient_step=2100, len=210, n/ep=1, n/st=100, rew=9951.00]                                                                                 


Epoch #21: test_reward: 13457.000000 ± 3788.738497, best_reward: 18319.700000 ± 7053.964928 in #1


Epoch #22: 1001it [00:02, 353.84it/s, env_step=22000, gradient_step=2200, len=218, n/ep=0, n/st=100, rew=10693.00]                                                                                


Epoch #22: test_reward: 7752.700000 ± 2394.447245, best_reward: 18319.700000 ± 7053.964928 in #1


Epoch #23: 1001it [00:02, 404.25it/s, env_step=23000, gradient_step=2300, len=114, n/ep=0, n/st=100, rew=5345.50]                                                                                 


Epoch #23: test_reward: 6732.000000 ± 2310.314091, best_reward: 18319.700000 ± 7053.964928 in #1


Epoch #24: 1001it [00:02, 388.93it/s, env_step=24000, gradient_step=2400, len=240, n/ep=2, n/st=100, rew=12040.50]                                                                                


Epoch #24: test_reward: 9068.400000 ± 1949.812873, best_reward: 18319.700000 ± 7053.964928 in #1


Epoch #25: 1001it [00:02, 398.24it/s, env_step=25000, gradient_step=2500, len=248, n/ep=0, n/st=100, rew=12572.67]                                                                                


Epoch #25: test_reward: 17883.000000 ± 5683.037023, best_reward: 18319.700000 ± 7053.964928 in #1


Epoch #26: 1001it [00:02, 409.79it/s, env_step=26000, gradient_step=2600, len=260, n/ep=1, n/st=100, rew=13837.00]                                                                                


Epoch #26: test_reward: 13394.900000 ± 3807.365216, best_reward: 18319.700000 ± 7053.964928 in #1


Epoch #27: 1001it [00:03, 332.18it/s, env_step=27000, gradient_step=2700, len=146, n/ep=1, n/st=100, rew=7949.00]                                                                                 


Epoch #27: test_reward: 11046.600000 ± 4920.111710, best_reward: 18319.700000 ± 7053.964928 in #1


Epoch #28: 1001it [00:02, 379.20it/s, env_step=28000, gradient_step=2800, len=62, n/ep=2, n/st=100, rew=2698.00]                                                                                  


Epoch #28: test_reward: 8986.800000 ± 6214.447760, best_reward: 18319.700000 ± 7053.964928 in #1


Epoch #29: 1001it [00:02, 362.62it/s, env_step=29000, gradient_step=2900, len=98, n/ep=0, n/st=100, rew=4730.00]                                                                                  


Epoch #29: test_reward: 12518.400000 ± 5249.976651, best_reward: 18319.700000 ± 7053.964928 in #1


Epoch #30: 1001it [00:02, 400.90it/s, env_step=30000, gradient_step=3000, len=136, n/ep=1, n/st=100, rew=7737.50]                                                                                 


Epoch #30: test_reward: 11453.600000 ± 5660.297505, best_reward: 18319.700000 ± 7053.964928 in #1


Epoch #31: 1001it [00:02, 341.36it/s, env_step=31000, gradient_step=3100, len=183, n/ep=0, n/st=100, rew=8965.00]                                                                                 


Epoch #31: test_reward: 15922.900000 ± 5937.246474, best_reward: 18319.700000 ± 7053.964928 in #1


Epoch #32: 1001it [00:02, 403.64it/s, env_step=32000, gradient_step=3200, len=320, n/ep=1, n/st=100, rew=18125.00]                                                                                


Epoch #32: test_reward: 16687.000000 ± 7601.447375, best_reward: 18319.700000 ± 7053.964928 in #1


Epoch #33: 1001it [00:02, 383.04it/s, env_step=33000, gradient_step=3300, len=258, n/ep=0, n/st=100, rew=13333.25]                                                                                


Epoch #33: test_reward: 10569.900000 ± 3859.056425, best_reward: 18319.700000 ± 7053.964928 in #1


Epoch #34: 1001it [00:02, 376.60it/s, env_step=34000, gradient_step=3400, len=153, n/ep=0, n/st=100, rew=5752.00]                                                                                 


Epoch #34: test_reward: 10297.500000 ± 2784.043435, best_reward: 18319.700000 ± 7053.964928 in #1


Epoch #35: 1001it [00:02, 413.49it/s, env_step=35000, gradient_step=3500, len=128, n/ep=3, n/st=100, rew=7359.00]                                                                                 


Epoch #35: test_reward: 12548.900000 ± 5152.142359, best_reward: 18319.700000 ± 7053.964928 in #1


Epoch #36: 1001it [00:02, 413.53it/s, env_step=36000, gradient_step=3600, len=137, n/ep=3, n/st=100, rew=7490.67]                                                                                 


Epoch #36: test_reward: 10399.600000 ± 3985.197792, best_reward: 18319.700000 ± 7053.964928 in #1


Epoch #37: 1001it [00:02, 363.47it/s, env_step=37000, gradient_step=3700, len=258, n/ep=0, n/st=100, rew=15934.00]                                                                                


Epoch #37: test_reward: 7251.400000 ± 2321.883554, best_reward: 18319.700000 ± 7053.964928 in #1


Epoch #38: 1001it [00:02, 349.30it/s, env_step=38000, gradient_step=3800, len=380, n/ep=1, n/st=100, rew=22471.00]                                                                                


Epoch #38: test_reward: 12355.300000 ± 4828.300157, best_reward: 18319.700000 ± 7053.964928 in #1


Epoch #39: 1001it [00:03, 326.25it/s, env_step=39000, gradient_step=3900, len=212, n/ep=0, n/st=100, rew=13107.00]                                                                                


Epoch #39: test_reward: 12440.200000 ± 2008.492659, best_reward: 18319.700000 ± 7053.964928 in #1


Epoch #40: 1001it [00:02, 381.72it/s, env_step=40000, gradient_step=4000, len=329, n/ep=7, n/st=100, rew=19495.57]                                                                                


Epoch #40: test_reward: 18232.000000 ± 8516.346353, best_reward: 18319.700000 ± 7053.964928 in #1


Epoch #41: 1001it [00:02, 400.48it/s, env_step=41000, gradient_step=4100, len=124, n/ep=2, n/st=100, rew=5976.50]                                                                                 


Epoch #41: test_reward: 9341.800000 ± 3134.975783, best_reward: 18319.700000 ± 7053.964928 in #1


Epoch #42: 1001it [00:02, 414.03it/s, env_step=42000, gradient_step=4200, len=144, n/ep=2, n/st=100, rew=7150.75]                                                                                 


Epoch #42: test_reward: 13965.600000 ± 3639.361900, best_reward: 18319.700000 ± 7053.964928 in #1


Epoch #43: 1001it [00:02, 397.38it/s, env_step=43000, gradient_step=4300, len=113, n/ep=0, n/st=100, rew=5090.00]                                                                                 


Epoch #43: test_reward: 9646.500000 ± 2473.401757, best_reward: 18319.700000 ± 7053.964928 in #1


Epoch #44: 1001it [00:02, 347.50it/s, env_step=44000, gradient_step=4400, len=94, n/ep=0, n/st=100, rew=4717.00]                                                                                  


Epoch #44: test_reward: 5907.000000 ± 1548.902386, best_reward: 18319.700000 ± 7053.964928 in #1


Epoch #45: 1001it [00:02, 388.01it/s, env_step=45000, gradient_step=4500, len=184, n/ep=0, n/st=100, rew=11113.17]                                                                                


Epoch #45: test_reward: 11344.800000 ± 2979.235365, best_reward: 18319.700000 ± 7053.964928 in #1


Epoch #46: 1001it [00:03, 332.92it/s, env_step=46000, gradient_step=4600, len=116, n/ep=0, n/st=100, rew=6287.00]                                                                                 


Epoch #46: test_reward: 12268.200000 ± 5185.640284, best_reward: 18319.700000 ± 7053.964928 in #1


Epoch #47: 1001it [00:02, 351.40it/s, env_step=47000, gradient_step=4700, len=166, n/ep=2, n/st=100, rew=9898.75]                                                                                 


Epoch #47: test_reward: 8828.400000 ± 3365.988449, best_reward: 18319.700000 ± 7053.964928 in #1


Epoch #48: 1001it [00:02, 394.79it/s, env_step=48000, gradient_step=4800, len=178, n/ep=1, n/st=100, rew=10551.50]                                                                                


Epoch #48: test_reward: 9562.000000 ± 2521.260201, best_reward: 18319.700000 ± 7053.964928 in #1


Epoch #49: 1001it [00:02, 346.27it/s, env_step=49000, gradient_step=4900, len=178, n/ep=0, n/st=100, rew=10551.50]                                                                                


Epoch #49: test_reward: 8409.600000 ± 3658.416029, best_reward: 18319.700000 ± 7053.964928 in #1


Epoch #50: 1001it [00:02, 403.53it/s, env_step=50000, gradient_step=5000, len=153, n/ep=0, n/st=100, rew=8589.50]                                                                                 


Epoch #50: test_reward: 21831.500000 ± 6394.000614, best_reward: 21831.500000 ± 6394.000614 in #50


Epoch #51: 1001it [00:02, 424.17it/s, env_step=51000, gradient_step=5100, len=94, n/ep=0, n/st=100, rew=4412.00]                                                                                  


Epoch #51: test_reward: 12800.400000 ± 3425.180030, best_reward: 21831.500000 ± 6394.000614 in #50


Epoch #52: 1001it [00:02, 412.00it/s, env_step=52000, gradient_step=5200, len=137, n/ep=0, n/st=100, rew=7880.75]                                                                                 


Epoch #52: test_reward: 20967.900000 ± 4861.504241, best_reward: 21831.500000 ± 6394.000614 in #50


Epoch #53: 1001it [00:02, 348.77it/s, env_step=53000, gradient_step=5300, len=201, n/ep=1, n/st=100, rew=11770.00]                                                                                


Epoch #53: test_reward: 10856.000000 ± 2485.537326, best_reward: 21831.500000 ± 6394.000614 in #50


Epoch #54: 1001it [00:02, 355.13it/s, env_step=54000, gradient_step=5400, len=104, n/ep=0, n/st=100, rew=5822.00]                                                                                 


Epoch #54: test_reward: 7366.400000 ± 2794.001546, best_reward: 21831.500000 ± 6394.000614 in #50


Epoch #55: 1001it [00:02, 401.71it/s, env_step=55000, gradient_step=5500, len=158, n/ep=0, n/st=100, rew=9092.38]                                                                                 


Epoch #55: test_reward: 13537.000000 ± 8970.486564, best_reward: 21831.500000 ± 6394.000614 in #50


Epoch #56: 1001it [00:02, 410.42it/s, env_step=56000, gradient_step=5600, len=153, n/ep=2, n/st=100, rew=7845.25]                                                                                 


Epoch #56: test_reward: 14360.800000 ± 3064.522795, best_reward: 21831.500000 ± 6394.000614 in #50


Epoch #57: 1001it [00:02, 431.57it/s, env_step=57000, gradient_step=5700, len=166, n/ep=0, n/st=100, rew=10329.00]                                                                                


Epoch #57: test_reward: 15409.600000 ± 3697.712433, best_reward: 21831.500000 ± 6394.000614 in #50


Epoch #58: 1001it [00:02, 390.29it/s, env_step=58000, gradient_step=5800, len=180, n/ep=0, n/st=100, rew=11542.00]                                                                                


Epoch #58: test_reward: 13832.600000 ± 4493.654553, best_reward: 21831.500000 ± 6394.000614 in #50


Epoch #59: 1001it [00:02, 339.59it/s, env_step=59000, gradient_step=5900, len=274, n/ep=2, n/st=100, rew=15320.25]                                                                                


Epoch #59: test_reward: 19512.200000 ± 6313.631583, best_reward: 21831.500000 ± 6394.000614 in #50


Epoch #60: 1001it [00:03, 309.90it/s, env_step=60000, gradient_step=6000, len=219, n/ep=0, n/st=100, rew=13278.00]                                                                                


Epoch #60: test_reward: 11406.400000 ± 1909.732714, best_reward: 21831.500000 ± 6394.000614 in #50


Epoch #61: 1001it [00:02, 352.66it/s, env_step=61000, gradient_step=6100, len=209, n/ep=2, n/st=100, rew=12622.50]                                                                                


Epoch #61: test_reward: 13342.500000 ± 5680.635444, best_reward: 21831.500000 ± 6394.000614 in #50


Epoch #62: 1001it [00:02, 394.45it/s, env_step=62000, gradient_step=6200, len=225, n/ep=0, n/st=100, rew=14352.50]                                                                                


Epoch #62: test_reward: 14637.100000 ± 5594.737249, best_reward: 21831.500000 ± 6394.000614 in #50


Epoch #63: 1001it [00:02, 412.14it/s, env_step=63000, gradient_step=6300, len=246, n/ep=1, n/st=100, rew=15249.00]                                                                                


Epoch #63: test_reward: 9992.300000 ± 2226.447531, best_reward: 21831.500000 ± 6394.000614 in #50


Epoch #64: 1001it [00:02, 436.48it/s, env_step=64000, gradient_step=6400, len=102, n/ep=2, n/st=100, rew=5218.00]                                                                                 


Epoch #64: test_reward: 12763.800000 ± 3007.354778, best_reward: 21831.500000 ± 6394.000614 in #50


Epoch #65: 1001it [00:02, 430.87it/s, env_step=65000, gradient_step=6500, len=36, n/ep=1, n/st=100, rew=1262.00]                                                                                  


Epoch #65: test_reward: 9951.400000 ± 5332.339547, best_reward: 21831.500000 ± 6394.000614 in #50


Epoch #66: 1001it [00:02, 474.15it/s, env_step=66000, gradient_step=6600, len=181, n/ep=1, n/st=100, rew=10501.50]                                                                                


Epoch #66: test_reward: 12268.200000 ± 7600.978869, best_reward: 21831.500000 ± 6394.000614 in #50


Epoch #67: 1001it [00:02, 476.87it/s, env_step=67000, gradient_step=6700, len=277, n/ep=1, n/st=100, rew=18484.00]                                                                                


Epoch #67: test_reward: 12621.200000 ± 3998.045142, best_reward: 21831.500000 ± 6394.000614 in #50


Epoch #68: 1001it [00:01, 536.39it/s, env_step=68000, gradient_step=6800, len=131, n/ep=0, n/st=100, rew=7371.50]                                                                                 


Epoch #68: test_reward: 12835.200000 ± 3989.953904, best_reward: 21831.500000 ± 6394.000614 in #50


Epoch #69: 1001it [00:01, 505.33it/s, env_step=69000, gradient_step=6900, len=80, n/ep=1, n/st=100, rew=3760.50]                                                                                  


Epoch #69: test_reward: 9218.800000 ± 3762.592744, best_reward: 21831.500000 ± 6394.000614 in #50


Epoch #70: 1001it [00:02, 450.33it/s, env_step=70000, gradient_step=7000, len=200, n/ep=0, n/st=100, rew=12782.50]                                                                                


Epoch #70: test_reward: 9780.800000 ± 2032.758854, best_reward: 21831.500000 ± 6394.000614 in #50


Epoch #71: 1001it [00:02, 467.20it/s, env_step=71000, gradient_step=7100, len=183, n/ep=0, n/st=100, rew=11083.67]                                                                                


Epoch #71: test_reward: 11783.800000 ± 1886.317778, best_reward: 21831.500000 ± 6394.000614 in #50


Epoch #72: 1001it [00:02, 488.10it/s, env_step=72000, gradient_step=7200, len=109, n/ep=0, n/st=100, rew=6093.75]                                                                                 


Epoch #72: test_reward: 13505.400000 ± 3539.418065, best_reward: 21831.500000 ± 6394.000614 in #50


Epoch #73: 1001it [00:02, 425.71it/s, env_step=73000, gradient_step=7300, len=202, n/ep=0, n/st=100, rew=10981.00]                                                                                


Epoch #73: test_reward: 15197.500000 ± 4247.093812, best_reward: 21831.500000 ± 6394.000614 in #50


Epoch #74: 1001it [00:02, 387.11it/s, env_step=74000, gradient_step=7400, len=287, n/ep=2, n/st=100, rew=17031.75]                                                                                


Epoch #74: test_reward: 13007.400000 ± 5678.702038, best_reward: 21831.500000 ± 6394.000614 in #50


Epoch #75: 1001it [00:02, 439.13it/s, env_step=75000, gradient_step=7500, len=224, n/ep=1, n/st=100, rew=13439.00]                                                                                


Epoch #75: test_reward: 7134.200000 ± 3905.152335, best_reward: 21831.500000 ± 6394.000614 in #50


Epoch #76: 1001it [00:02, 456.18it/s, env_step=76000, gradient_step=7600, len=90, n/ep=1, n/st=100, rew=4923.00]                                                                                  


Epoch #76: test_reward: 12811.200000 ± 6360.214867, best_reward: 21831.500000 ± 6394.000614 in #50


Epoch #77: 1001it [00:01, 522.47it/s, env_step=77000, gradient_step=7700, len=310, n/ep=0, n/st=100, rew=18788.50]                                                                                


Epoch #77: test_reward: 14242.700000 ± 7620.370123, best_reward: 21831.500000 ± 6394.000614 in #50


Epoch #78: 1001it [00:02, 494.95it/s, env_step=78000, gradient_step=7800, len=210, n/ep=0, n/st=100, rew=13186.33]                                                                                


Epoch #78: test_reward: 9351.200000 ± 4012.107895, best_reward: 21831.500000 ± 6394.000614 in #50


Epoch #79: 1001it [00:01, 539.58it/s, env_step=79000, gradient_step=7900, len=198, n/ep=0, n/st=100, rew=12003.00]                                                                                


Epoch #79: test_reward: 10611.600000 ± 4745.698351, best_reward: 21831.500000 ± 6394.000614 in #50


Epoch #80: 1001it [00:02, 430.72it/s, env_step=80000, gradient_step=8000, len=132, n/ep=1, n/st=100, rew=7503.00]                                                                                 


Epoch #80: test_reward: 10423.500000 ± 2959.228219, best_reward: 21831.500000 ± 6394.000614 in #50


Epoch #81: 1001it [00:02, 430.70it/s, env_step=81000, gradient_step=8100, len=127, n/ep=1, n/st=100, rew=6614.00]                                                                                 


Epoch #81: test_reward: 7627.200000 ± 1604.557808, best_reward: 21831.500000 ± 6394.000614 in #50


Epoch #82: 1001it [00:02, 439.13it/s, env_step=82000, gradient_step=8200, len=184, n/ep=1, n/st=100, rew=10077.00]                                                                                


Epoch #82: test_reward: 10574.800000 ± 2768.443454, best_reward: 21831.500000 ± 6394.000614 in #50


Epoch #83: 1001it [00:02, 452.72it/s, env_step=83000, gradient_step=8300, len=126, n/ep=1, n/st=100, rew=7524.50]                                                                                 


Epoch #83: test_reward: 12919.700000 ± 3656.184816, best_reward: 21831.500000 ± 6394.000614 in #50


Epoch #84: 1001it [00:02, 470.94it/s, env_step=84000, gradient_step=8400, len=166, n/ep=1, n/st=100, rew=9717.00]                                                                                 


Epoch #84: test_reward: 7583.500000 ± 2480.765094, best_reward: 21831.500000 ± 6394.000614 in #50


Epoch #85: 1001it [00:02, 448.40it/s, env_step=85000, gradient_step=8500, len=169, n/ep=1, n/st=100, rew=10665.00]                                                                                


Epoch #85: test_reward: 12611.200000 ± 3308.013446, best_reward: 21831.500000 ± 6394.000614 in #50


Epoch #86: 1001it [00:02, 381.05it/s, env_step=86000, gradient_step=8600, len=106, n/ep=0, n/st=100, rew=6063.00]                                                                                 


Epoch #86: test_reward: 14678.700000 ± 4241.334178, best_reward: 21831.500000 ± 6394.000614 in #50


Epoch #87: 1001it [00:02, 459.77it/s, env_step=87000, gradient_step=8700, len=157, n/ep=0, n/st=100, rew=9629.00]                                                                                 


Epoch #87: test_reward: 14729.500000 ± 6278.093536, best_reward: 21831.500000 ± 6394.000614 in #50


Epoch #88: 1001it [00:02, 411.93it/s, env_step=88000, gradient_step=8800, len=106, n/ep=0, n/st=100, rew=5817.00]                                                                                 


Epoch #88: test_reward: 12234.800000 ± 2750.441594, best_reward: 21831.500000 ± 6394.000614 in #50


Epoch #89: 1001it [00:02, 438.53it/s, env_step=89000, gradient_step=8900, len=142, n/ep=1, n/st=100, rew=7531.00]                                                                                 


Epoch #89: test_reward: 10662.800000 ± 6458.856506, best_reward: 21831.500000 ± 6394.000614 in #50


Epoch #90: 1001it [00:02, 498.53it/s, env_step=90000, gradient_step=9000, len=259, n/ep=0, n/st=100, rew=14880.00]                                                                                


Epoch #90: test_reward: 9857.600000 ± 3687.225060, best_reward: 21831.500000 ± 6394.000614 in #50


Epoch #91: 1001it [00:02, 458.76it/s, env_step=91000, gradient_step=9100, len=64, n/ep=0, n/st=100, rew=2516.50]                                                                                  


Epoch #91: test_reward: 9920.000000 ± 1284.775934, best_reward: 21831.500000 ± 6394.000614 in #50


Epoch #92: 1001it [00:02, 433.40it/s, env_step=92000, gradient_step=9200, len=132, n/ep=0, n/st=100, rew=6719.00]                                                                                 


Epoch #92: test_reward: 7105.000000 ± 3885.630168, best_reward: 21831.500000 ± 6394.000614 in #50


Epoch #93: 1001it [00:01, 513.67it/s, env_step=93000, gradient_step=9300, len=90, n/ep=1, n/st=100, rew=4572.00]                                                                                  


Epoch #93: test_reward: 8458.800000 ± 2826.892173, best_reward: 21831.500000 ± 6394.000614 in #50


Epoch #94: 1001it [00:02, 493.18it/s, env_step=94000, gradient_step=9400, len=162, n/ep=1, n/st=100, rew=9267.00]                                                                                 


Epoch #94: test_reward: 7270.300000 ± 3098.369186, best_reward: 21831.500000 ± 6394.000614 in #50


Epoch #95: 1001it [00:02, 438.47it/s, env_step=95000, gradient_step=9500, len=102, n/ep=1, n/st=100, rew=4815.00]                                                                                 


Epoch #95: test_reward: 14134.000000 ± 4813.383571, best_reward: 21831.500000 ± 6394.000614 in #50


Epoch #96: 1001it [00:01, 519.50it/s, env_step=96000, gradient_step=9600, len=160, n/ep=0, n/st=100, rew=8929.00]                                                                                 


Epoch #96: test_reward: 8378.100000 ± 1683.499240, best_reward: 21831.500000 ± 6394.000614 in #50


Epoch #97: 1001it [00:01, 519.37it/s, env_step=97000, gradient_step=9700, len=207, n/ep=0, n/st=100, rew=12056.00]                                                                                


Epoch #97: test_reward: 10331.300000 ± 4870.967831, best_reward: 21831.500000 ± 6394.000614 in #50


Epoch #98: 1001it [00:01, 531.15it/s, env_step=98000, gradient_step=9800, len=238, n/ep=1, n/st=100, rew=15285.00]                                                                                


Epoch #98: test_reward: 6070.700000 ± 2424.035975, best_reward: 21831.500000 ± 6394.000614 in #50


Epoch #99: 1001it [00:02, 415.28it/s, env_step=99000, gradient_step=9900, len=250, n/ep=1, n/st=100, rew=16183.00]                                                                                


Epoch #99: test_reward: 13236.700000 ± 6955.103077, best_reward: 21831.500000 ± 6394.000614 in #50


Epoch #100: 1001it [00:02, 371.11it/s, env_step=100000, gradient_step=10000, len=141, n/ep=0, n/st=100, rew=7867.83]                                                                              


Epoch #100: test_reward: 12488.000000 ± 3490.139481, best_reward: 21831.500000 ± 6394.000614 in #50

InfoStats(gradient_step=10000, best_reward=21831.5, best_reward_std=6394.000613856711, train_step=100000, train_episode=533, test_step=200484, test_episode=1010, timing=TimingStats(total_time=383.92750334739685, train_time=248.98151779174805, train_time_collect=37.354031801223755, train_time_update=207.18620324134827, test_time=134.9459855556488, update_speed=401.63623744812065))

(the trained policy can be accessed via policy.policies[agents[1]])
Train with agent_opponent #6


Epoch #1: 1001it [00:02, 390.36it/s, env_step=1000, gradient_step=100, len=0, n/ep=0, n/st=100, rew=0.00]                                                                                         


Epoch #1: test_reward: 10494.400000 ± 5691.036025, best_reward: 13131.000000 ± 3950.329505 in #0


Epoch #2: 1001it [00:02, 461.48it/s, env_step=2000, gradient_step=200, len=0, n/ep=0, n/st=100, rew=0.00]                                                                                         


Epoch #2: test_reward: 15997.000000 ± 4891.431549, best_reward: 15997.000000 ± 4891.431549 in #2


Epoch #3: 1001it [00:01, 537.04it/s, env_step=3000, gradient_step=300, len=0, n/ep=0, n/st=100, rew=0.00]                                                                                         


Epoch #3: test_reward: 13602.700000 ± 3582.899526, best_reward: 15997.000000 ± 4891.431549 in #2


Epoch #4: 1001it [00:02, 479.45it/s, env_step=4000, gradient_step=400, len=0, n/ep=0, n/st=100, rew=0.00]                                                                                         


Epoch #4: test_reward: 10022.200000 ± 4736.281026, best_reward: 15997.000000 ± 4891.431549 in #2


Epoch #5: 1001it [00:02, 488.48it/s, env_step=5000, gradient_step=500, len=50, n/ep=1, n/st=100, rew=1525.00]                                                                                     


Epoch #5: test_reward: 10531.500000 ± 4347.538712, best_reward: 15997.000000 ± 4891.431549 in #2


Epoch #6: 1001it [00:02, 449.30it/s, env_step=6000, gradient_step=600, len=58, n/ep=0, n/st=100, rew=1784.00]                                                                                     


Epoch #6: test_reward: 8198.300000 ± 4069.504639, best_reward: 15997.000000 ± 4891.431549 in #2


Epoch #7: 1001it [00:01, 522.09it/s, env_step=7000, gradient_step=700, len=66, n/ep=0, n/st=100, rew=2313.00]                                                                                     


Epoch #7: test_reward: 14801.300000 ± 3423.416219, best_reward: 15997.000000 ± 4891.431549 in #2


Epoch #8: 1001it [00:02, 469.56it/s, env_step=8000, gradient_step=800, len=74, n/ep=0, n/st=100, rew=2395.00]                                                                                     


Epoch #8: test_reward: 12599.700000 ± 4756.612388, best_reward: 15997.000000 ± 4891.431549 in #2


Epoch #9: 1001it [00:02, 432.93it/s, env_step=9000, gradient_step=900, len=88, n/ep=0, n/st=100, rew=3439.00]                                                                                     


Epoch #9: test_reward: 7350.100000 ± 2987.875849, best_reward: 15997.000000 ± 4891.431549 in #2


Epoch #10: 1001it [00:02, 496.01it/s, env_step=10000, gradient_step=1000, len=100, n/ep=1, n/st=100, rew=3521.00]                                                                                 


Epoch #10: test_reward: 9324.900000 ± 2144.485041, best_reward: 15997.000000 ± 4891.431549 in #2


Epoch #11: 1001it [00:02, 449.44it/s, env_step=11000, gradient_step=1100, len=110, n/ep=1, n/st=100, rew=3969.00]                                                                                 


Epoch #11: test_reward: 11348.800000 ± 2659.042677, best_reward: 15997.000000 ± 4891.431549 in #2


Epoch #12: 1001it [00:02, 437.58it/s, env_step=12000, gradient_step=1200, len=120, n/ep=1, n/st=100, rew=4488.50]                                                                                 


Epoch #12: test_reward: 14995.700000 ± 4025.091801, best_reward: 15997.000000 ± 4891.431549 in #2


Epoch #13: 1001it [00:01, 519.02it/s, env_step=13000, gradient_step=1300, len=130, n/ep=1, n/st=100, rew=5175.00]                                                                                 


Epoch #13: test_reward: 13702.800000 ± 4776.875586, best_reward: 15997.000000 ± 4891.431549 in #2


Epoch #14: 1001it [00:02, 399.82it/s, env_step=14000, gradient_step=1400, len=138, n/ep=0, n/st=100, rew=5534.25]                                                                                 


Epoch #14: test_reward: 10511.100000 ± 6499.055200, best_reward: 15997.000000 ± 4891.431549 in #2


Epoch #15: 1001it [00:02, 494.23it/s, env_step=15000, gradient_step=1500, len=92, n/ep=1, n/st=100, rew=2991.00]                                                                                  


Epoch #15: test_reward: 14857.800000 ± 6198.906642, best_reward: 15997.000000 ± 4891.431549 in #2


Epoch #16: 1001it [00:02, 497.27it/s, env_step=16000, gradient_step=1600, len=103, n/ep=2, n/st=100, rew=4159.50]                                                                                 


Epoch #16: test_reward: 11163.100000 ± 3185.939342, best_reward: 15997.000000 ± 4891.431549 in #2


Epoch #17: 1001it [00:01, 526.01it/s, env_step=17000, gradient_step=1700, len=105, n/ep=0, n/st=100, rew=3352.00]                                                                                 


Epoch #17: test_reward: 11935.400000 ± 5086.291856, best_reward: 15997.000000 ± 4891.431549 in #2


Epoch #18: 1001it [00:02, 452.66it/s, env_step=18000, gradient_step=1800, len=176, n/ep=0, n/st=100, rew=7397.00]                                                                                 


Epoch #18: test_reward: 10841.200000 ± 5637.933182, best_reward: 15997.000000 ± 4891.431549 in #2


Epoch #19: 1001it [00:02, 459.02it/s, env_step=19000, gradient_step=1900, len=165, n/ep=3, n/st=100, rew=6847.00]                                                                                 


Epoch #19: test_reward: 11337.900000 ± 4221.001953, best_reward: 15997.000000 ± 4891.431549 in #2


Epoch #20: 1001it [00:02, 375.73it/s, env_step=20000, gradient_step=2000, len=161, n/ep=0, n/st=100, rew=6778.50]                                                                                 


Epoch #20: test_reward: 11775.000000 ± 5430.957871, best_reward: 15997.000000 ± 4891.431549 in #2


Epoch #21: 1001it [00:02, 408.46it/s, env_step=21000, gradient_step=2100, len=210, n/ep=1, n/st=100, rew=9425.50]                                                                                 


Epoch #21: test_reward: 9709.200000 ± 3018.032763, best_reward: 15997.000000 ± 4891.431549 in #2


Epoch #22: 1001it [00:01, 536.85it/s, env_step=22000, gradient_step=2200, len=219, n/ep=0, n/st=100, rew=11530.50]                                                                                


Epoch #22: test_reward: 9850.300000 ± 4306.303335, best_reward: 15997.000000 ± 4891.431549 in #2


Epoch #23: 1001it [00:01, 532.55it/s, env_step=23000, gradient_step=2300, len=230, n/ep=1, n/st=100, rew=9150.00]                                                                                 


Epoch #23: test_reward: 16058.000000 ± 7910.700904, best_reward: 16058.000000 ± 7910.700904 in #23


Epoch #24: 1001it [00:02, 372.13it/s, env_step=24000, gradient_step=2400, len=102, n/ep=2, n/st=100, rew=4257.75]                                                                                 


Epoch #24: test_reward: 14425.900000 ± 5736.381864, best_reward: 16058.000000 ± 7910.700904 in #23


Epoch #25: 1001it [00:02, 447.16it/s, env_step=25000, gradient_step=2500, len=98, n/ep=0, n/st=100, rew=3898.00]                                                                                  


Epoch #25: test_reward: 13352.400000 ± 4632.865014, best_reward: 16058.000000 ± 7910.700904 in #23


Epoch #26: 1001it [00:02, 428.31it/s, env_step=26000, gradient_step=2600, len=222, n/ep=3, n/st=100, rew=11042.00]                                                                                


Epoch #26: test_reward: 10734.500000 ± 5588.740811, best_reward: 16058.000000 ± 7910.700904 in #23


Epoch #27: 1001it [00:02, 431.21it/s, env_step=27000, gradient_step=2700, len=270, n/ep=1, n/st=100, rew=13941.00]                                                                                


Epoch #27: test_reward: 11335.300000 ± 3842.589701, best_reward: 16058.000000 ± 7910.700904 in #23


Epoch #28: 1001it [00:02, 415.24it/s, env_step=28000, gradient_step=2800, len=136, n/ep=1, n/st=100, rew=5337.50]                                                                                 


Epoch #28: test_reward: 13078.000000 ± 7481.109383, best_reward: 16058.000000 ± 7910.700904 in #23


Epoch #29: 1001it [00:02, 423.01it/s, env_step=29000, gradient_step=2900, len=48, n/ep=0, n/st=100, rew=1929.50]                                                                                  


Epoch #29: test_reward: 13693.900000 ± 3795.490441, best_reward: 16058.000000 ± 7910.700904 in #23


Epoch #30: 1001it [00:02, 455.71it/s, env_step=30000, gradient_step=3000, len=152, n/ep=1, n/st=100, rew=5381.00]                                                                                 


Epoch #30: test_reward: 12425.000000 ± 3871.722304, best_reward: 16058.000000 ± 7910.700904 in #23


Epoch #31: 1001it [00:02, 493.73it/s, env_step=31000, gradient_step=3100, len=189, n/ep=2, n/st=100, rew=9751.50]                                                                                 


Epoch #31: test_reward: 11962.200000 ± 5080.354767, best_reward: 16058.000000 ± 7910.700904 in #23


Epoch #32: 1001it [00:02, 408.22it/s, env_step=32000, gradient_step=3200, len=278, n/ep=2, n/st=100, rew=13877.00]                                                                                


Epoch #32: test_reward: 13311.200000 ± 4072.401842, best_reward: 16058.000000 ± 7910.700904 in #23


Epoch #33: 1001it [00:02, 381.76it/s, env_step=33000, gradient_step=3300, len=234, n/ep=1, n/st=100, rew=12426.00]                                                                                


Epoch #33: test_reward: 9283.800000 ± 5671.352322, best_reward: 16058.000000 ± 7910.700904 in #23


Epoch #34: 1001it [00:02, 373.71it/s, env_step=34000, gradient_step=3400, len=106, n/ep=2, n/st=100, rew=5209.50]                                                                                 


Epoch #34: test_reward: 13338.300000 ± 3916.637411, best_reward: 16058.000000 ± 7910.700904 in #23


Epoch #35: 1001it [00:02, 408.82it/s, env_step=35000, gradient_step=3500, len=110, n/ep=0, n/st=100, rew=6187.50]                                                                                 


Epoch #35: test_reward: 16327.400000 ± 7533.985946, best_reward: 16327.400000 ± 7533.985946 in #35


Epoch #36: 1001it [00:02, 416.06it/s, env_step=36000, gradient_step=3600, len=163, n/ep=0, n/st=100, rew=9573.00]                                                                                 


Epoch #36: test_reward: 6433.000000 ± 4813.535914, best_reward: 16327.400000 ± 7533.985946 in #35


Epoch #37: 1001it [00:02, 363.42it/s, env_step=37000, gradient_step=3700, len=187, n/ep=2, n/st=100, rew=11339.25]                                                                                


Epoch #37: test_reward: 6554.100000 ± 4685.522606, best_reward: 16327.400000 ± 7533.985946 in #35


Epoch #38: 1001it [00:02, 423.87it/s, env_step=38000, gradient_step=3800, len=282, n/ep=1, n/st=100, rew=16250.00]                                                                                


Epoch #38: test_reward: 14692.900000 ± 4467.180351, best_reward: 16327.400000 ± 7533.985946 in #35


Epoch #39: 1001it [00:02, 486.96it/s, env_step=39000, gradient_step=3900, len=208, n/ep=1, n/st=100, rew=11145.00]                                                                                


Epoch #39: test_reward: 10591.600000 ± 2999.522202, best_reward: 16327.400000 ± 7533.985946 in #35


Epoch #40: 1001it [00:01, 509.33it/s, env_step=40000, gradient_step=4000, len=80, n/ep=1, n/st=100, rew=3493.00]                                                                                  


Epoch #40: test_reward: 10964.700000 ± 3550.873866, best_reward: 16327.400000 ± 7533.985946 in #35


Epoch #41: 1001it [00:02, 424.48it/s, env_step=41000, gradient_step=4100, len=258, n/ep=2, n/st=100, rew=13785.00]                                                                                


Epoch #41: test_reward: 11520.200000 ± 4562.491113, best_reward: 16327.400000 ± 7533.985946 in #35


Epoch #42: 1001it [00:02, 406.14it/s, env_step=42000, gradient_step=4200, len=176, n/ep=0, n/st=100, rew=10453.00]                                                                                


Epoch #42: test_reward: 13285.600000 ± 4580.283948, best_reward: 16327.400000 ± 7533.985946 in #35


Epoch #43: 1001it [00:02, 453.25it/s, env_step=43000, gradient_step=4300, len=159, n/ep=0, n/st=100, rew=9509.00]                                                                                 


Epoch #43: test_reward: 12126.200000 ± 4140.768982, best_reward: 16327.400000 ± 7533.985946 in #35


Epoch #44: 1001it [00:02, 487.21it/s, env_step=44000, gradient_step=4400, len=142, n/ep=2, n/st=100, rew=7570.00]                                                                                 


Epoch #44: test_reward: 7579.600000 ± 3638.081203, best_reward: 16327.400000 ± 7533.985946 in #35


Epoch #45: 1001it [00:01, 509.49it/s, env_step=45000, gradient_step=4500, len=90, n/ep=0, n/st=100, rew=5154.00]                                                                                  


Epoch #45: test_reward: 11742.900000 ± 3680.292935, best_reward: 16327.400000 ± 7533.985946 in #35


Epoch #46: 1001it [00:02, 428.55it/s, env_step=46000, gradient_step=4600, len=196, n/ep=1, n/st=100, rew=10748.00]                                                                                


Epoch #46: test_reward: 11137.400000 ± 1762.351679, best_reward: 16327.400000 ± 7533.985946 in #35


Epoch #47: 1001it [00:02, 467.19it/s, env_step=47000, gradient_step=4700, len=169, n/ep=0, n/st=100, rew=9639.75]                                                                                 


Epoch #47: test_reward: 11815.600000 ± 6711.888366, best_reward: 16327.400000 ± 7533.985946 in #35


Epoch #48: 1001it [00:02, 440.30it/s, env_step=48000, gradient_step=4800, len=296, n/ep=1, n/st=100, rew=17235.00]                                                                                


Epoch #48: test_reward: 10852.700000 ± 2111.229644, best_reward: 16327.400000 ± 7533.985946 in #35


Epoch #49: 1001it [00:02, 442.76it/s, env_step=49000, gradient_step=4900, len=132, n/ep=0, n/st=100, rew=7479.00]                                                                                 


Epoch #49: test_reward: 14973.200000 ± 8012.402296, best_reward: 16327.400000 ± 7533.985946 in #35


Epoch #50: 1001it [00:02, 459.87it/s, env_step=50000, gradient_step=5000, len=104, n/ep=0, n/st=100, rew=3763.50]                                                                                 


Epoch #50: test_reward: 16528.000000 ± 6879.319443, best_reward: 16528.000000 ± 6879.319443 in #50


Epoch #51: 1001it [00:01, 510.79it/s, env_step=51000, gradient_step=5100, len=136, n/ep=1, n/st=100, rew=7333.00]                                                                                 


Epoch #51: test_reward: 14383.000000 ± 5151.982162, best_reward: 16528.000000 ± 6879.319443 in #50


Epoch #52: 1001it [00:02, 377.55it/s, env_step=52000, gradient_step=5200, len=130, n/ep=0, n/st=100, rew=6529.50]                                                                                 


Epoch #52: test_reward: 18280.600000 ± 8918.468256, best_reward: 18280.600000 ± 8918.468256 in #52


Epoch #53: 1001it [00:02, 471.44it/s, env_step=53000, gradient_step=5300, len=139, n/ep=0, n/st=100, rew=7112.00]                                                                                 


Epoch #53: test_reward: 15112.400000 ± 6387.778694, best_reward: 18280.600000 ± 8918.468256 in #52


Epoch #54: 1001it [00:02, 415.62it/s, env_step=54000, gradient_step=5400, len=104, n/ep=0, n/st=100, rew=5717.17]                                                                                 


Epoch #54: test_reward: 11524.100000 ± 4271.392406, best_reward: 18280.600000 ± 8918.468256 in #52


Epoch #55: 1001it [00:02, 384.74it/s, env_step=55000, gradient_step=5500, len=139, n/ep=0, n/st=100, rew=8052.25]                                                                                 


Epoch #55: test_reward: 11903.300000 ± 2109.292490, best_reward: 18280.600000 ± 8918.468256 in #52


Epoch #56: 1001it [00:02, 366.10it/s, env_step=56000, gradient_step=5600, len=179, n/ep=0, n/st=100, rew=11347.00]                                                                                


Epoch #56: test_reward: 15990.700000 ± 2011.718472, best_reward: 18280.600000 ± 8918.468256 in #52


Epoch #57: 1001it [00:02, 446.07it/s, env_step=57000, gradient_step=5700, len=320, n/ep=0, n/st=100, rew=20359.00]                                                                                


Epoch #57: test_reward: 17256.400000 ± 7415.105437, best_reward: 18280.600000 ± 8918.468256 in #52


Epoch #58: 1001it [00:02, 498.74it/s, env_step=58000, gradient_step=5800, len=114, n/ep=0, n/st=100, rew=5689.00]                                                                                 


Epoch #58: test_reward: 17128.400000 ± 6638.106299, best_reward: 18280.600000 ± 8918.468256 in #52


Epoch #59: 1001it [00:02, 430.89it/s, env_step=59000, gradient_step=5900, len=73, n/ep=0, n/st=100, rew=3521.50]                                                                                  


Epoch #59: test_reward: 15832.800000 ± 6871.333914, best_reward: 18280.600000 ± 8918.468256 in #52


Epoch #60: 1001it [00:02, 363.54it/s, env_step=60000, gradient_step=6000, len=149, n/ep=1, n/st=100, rew=9282.00]                                                                                 


Epoch #60: test_reward: 11111.400000 ± 6789.416060, best_reward: 18280.600000 ± 8918.468256 in #52


Epoch #61: 1001it [00:02, 420.03it/s, env_step=61000, gradient_step=6100, len=163, n/ep=0, n/st=100, rew=9806.00]                                                                                 


Epoch #61: test_reward: 12486.500000 ± 6071.548900, best_reward: 18280.600000 ± 8918.468256 in #52


Epoch #62: 1001it [00:02, 414.93it/s, env_step=62000, gradient_step=6200, len=136, n/ep=0, n/st=100, rew=8184.00]                                                                                 


Epoch #62: test_reward: 19021.000000 ± 7385.682487, best_reward: 19021.000000 ± 7385.682487 in #62


Epoch #63: 1001it [00:02, 437.78it/s, env_step=63000, gradient_step=6300, len=141, n/ep=1, n/st=100, rew=8170.00]                                                                                 


Epoch #63: test_reward: 10046.000000 ± 3326.115332, best_reward: 19021.000000 ± 7385.682487 in #62


Epoch #64: 1001it [00:02, 398.49it/s, env_step=64000, gradient_step=6400, len=136, n/ep=0, n/st=100, rew=8085.50]                                                                                 


Epoch #64: test_reward: 11363.100000 ± 3664.540447, best_reward: 19021.000000 ± 7385.682487 in #62


Epoch #65: 1001it [00:02, 376.61it/s, env_step=65000, gradient_step=6500, len=123, n/ep=0, n/st=100, rew=6999.50]                                                                                 


Epoch #65: test_reward: 11021.600000 ± 5558.144568, best_reward: 19021.000000 ± 7385.682487 in #62


Epoch #66: 1001it [00:02, 390.97it/s, env_step=66000, gradient_step=6600, len=106, n/ep=0, n/st=100, rew=5969.00]                                                                                 


Epoch #66: test_reward: 14332.600000 ± 3222.056741, best_reward: 19021.000000 ± 7385.682487 in #62


Epoch #67: 1001it [00:02, 458.39it/s, env_step=67000, gradient_step=6700, len=142, n/ep=1, n/st=100, rew=7254.00]                                                                                 


Epoch #67: test_reward: 17342.000000 ± 5477.239889, best_reward: 19021.000000 ± 7385.682487 in #62


Epoch #68: 1001it [00:02, 466.33it/s, env_step=68000, gradient_step=6800, len=241, n/ep=0, n/st=100, rew=14758.00]                                                                                


Epoch #68: test_reward: 11658.300000 ± 6029.265595, best_reward: 19021.000000 ± 7385.682487 in #62


Epoch #69: 1001it [00:02, 408.34it/s, env_step=69000, gradient_step=6900, len=110, n/ep=2, n/st=100, rew=6496.50]                                                                                 


Epoch #69: test_reward: 18658.600000 ± 7492.894290, best_reward: 19021.000000 ± 7385.682487 in #62


Epoch #70: 1001it [00:02, 414.45it/s, env_step=70000, gradient_step=7000, len=215, n/ep=3, n/st=100, rew=13510.67]                                                                                


Epoch #70: test_reward: 12707.900000 ± 4512.347802, best_reward: 19021.000000 ± 7385.682487 in #62


Epoch #71: 1001it [00:02, 465.45it/s, env_step=71000, gradient_step=7100, len=48, n/ep=0, n/st=100, rew=2105.00]                                                                                  


Epoch #71: test_reward: 19131.300000 ± 6020.538332, best_reward: 19131.300000 ± 6020.538332 in #71


Epoch #72: 1001it [00:02, 488.41it/s, env_step=72000, gradient_step=7200, len=115, n/ep=1, n/st=100, rew=6764.00]                                                                                 


Epoch #72: test_reward: 18019.000000 ± 5204.990394, best_reward: 19131.300000 ± 6020.538332 in #71


Epoch #73: 1001it [00:02, 442.11it/s, env_step=73000, gradient_step=7300, len=104, n/ep=0, n/st=100, rew=5758.75]                                                                                 


Epoch #73: test_reward: 11764.300000 ± 4720.163897, best_reward: 19131.300000 ± 6020.538332 in #71


Epoch #74: 1001it [00:02, 484.48it/s, env_step=74000, gradient_step=7400, len=72, n/ep=1, n/st=100, rew=3723.00]                                                                                  


Epoch #74: test_reward: 13023.200000 ± 9065.342540, best_reward: 19131.300000 ± 6020.538332 in #71


Epoch #75: 1001it [00:02, 465.95it/s, env_step=75000, gradient_step=7500, len=72, n/ep=0, n/st=100, rew=3800.00]                                                                                  


Epoch #75: test_reward: 13741.000000 ± 8869.273352, best_reward: 19131.300000 ± 6020.538332 in #71


Epoch #76: 1001it [00:02, 448.87it/s, env_step=76000, gradient_step=7600, len=108, n/ep=2, n/st=100, rew=5743.00]                                                                                 


Epoch #76: test_reward: 5959.200000 ± 5811.932360, best_reward: 19131.300000 ± 6020.538332 in #71


Epoch #77: 1001it [00:02, 413.62it/s, env_step=77000, gradient_step=7700, len=207, n/ep=0, n/st=100, rew=13188.00]                                                                                


Epoch #77: test_reward: 11598.400000 ± 4765.834265, best_reward: 19131.300000 ± 6020.538332 in #71


Epoch #78: 1001it [00:02, 437.64it/s, env_step=78000, gradient_step=7800, len=204, n/ep=2, n/st=100, rew=12605.25]                                                                                


Epoch #78: test_reward: 12743.800000 ± 3957.306149, best_reward: 19131.300000 ± 6020.538332 in #71


Epoch #79: 1001it [00:02, 382.62it/s, env_step=79000, gradient_step=7900, len=138, n/ep=1, n/st=100, rew=7427.00]                                                                                 


Epoch #79: test_reward: 12686.100000 ± 9434.371590, best_reward: 19131.300000 ± 6020.538332 in #71


Epoch #80: 1001it [00:02, 492.99it/s, env_step=80000, gradient_step=8000, len=222, n/ep=2, n/st=100, rew=13718.75]                                                                                


Epoch #80: test_reward: 11749.400000 ± 2270.506736, best_reward: 19131.300000 ± 6020.538332 in #71


Epoch #81: 1001it [00:02, 474.39it/s, env_step=81000, gradient_step=8100, len=210, n/ep=0, n/st=100, rew=12538.00]                                                                                


Epoch #81: test_reward: 7547.500000 ± 3707.617058, best_reward: 19131.300000 ± 6020.538332 in #71


Epoch #82: 1001it [00:02, 398.49it/s, env_step=82000, gradient_step=8200, len=166, n/ep=2, n/st=100, rew=10378.25]                                                                                


Epoch #82: test_reward: 15241.900000 ± 5555.595710, best_reward: 19131.300000 ± 6020.538332 in #71


Epoch #83: 1001it [00:02, 473.53it/s, env_step=83000, gradient_step=8300, len=200, n/ep=1, n/st=100, rew=12568.00]                                                                                


Epoch #83: test_reward: 21389.200000 ± 7781.391403, best_reward: 21389.200000 ± 7781.391403 in #83


Epoch #84: 1001it [00:02, 408.95it/s, env_step=84000, gradient_step=8400, len=244, n/ep=1, n/st=100, rew=15313.50]                                                                                


Epoch #84: test_reward: 7866.400000 ± 4564.736667, best_reward: 21389.200000 ± 7781.391403 in #83


Epoch #85: 1001it [00:02, 438.57it/s, env_step=85000, gradient_step=8500, len=238, n/ep=1, n/st=100, rew=15153.50]                                                                                


Epoch #85: test_reward: 14676.500000 ± 4190.944410, best_reward: 21389.200000 ± 7781.391403 in #83


Epoch #86: 1001it [00:02, 495.45it/s, env_step=86000, gradient_step=8600, len=75, n/ep=0, n/st=100, rew=3497.00]                                                                                  


Epoch #86: test_reward: 13005.500000 ± 7124.585128, best_reward: 21389.200000 ± 7781.391403 in #83


Epoch #87: 1001it [00:02, 407.90it/s, env_step=87000, gradient_step=8700, len=103, n/ep=0, n/st=100, rew=5792.00]                                                                                 


Epoch #87: test_reward: 12176.600000 ± 4276.599705, best_reward: 21389.200000 ± 7781.391403 in #83


Epoch #88: 1001it [00:02, 444.79it/s, env_step=88000, gradient_step=8800, len=143, n/ep=0, n/st=100, rew=9107.50]                                                                                 


Epoch #88: test_reward: 16471.700000 ± 4557.206580, best_reward: 21389.200000 ± 7781.391403 in #83


Epoch #89: 1001it [00:02, 394.95it/s, env_step=89000, gradient_step=8900, len=208, n/ep=0, n/st=100, rew=12963.50]                                                                                


Epoch #89: test_reward: 14466.200000 ± 5182.903545, best_reward: 21389.200000 ± 7781.391403 in #83


Epoch #90: 1001it [00:02, 480.44it/s, env_step=90000, gradient_step=9000, len=224, n/ep=1, n/st=100, rew=14197.50]                                                                                


Epoch #90: test_reward: 15347.600000 ± 6423.716731, best_reward: 21389.200000 ± 7781.391403 in #83


Epoch #91: 1001it [00:02, 400.32it/s, env_step=91000, gradient_step=9100, len=160, n/ep=0, n/st=100, rew=9428.25]                                                                                 


Epoch #91: test_reward: 12805.200000 ± 4861.501122, best_reward: 21389.200000 ± 7781.391403 in #83


Epoch #92: 1001it [00:02, 364.34it/s, env_step=92000, gradient_step=9200, len=208, n/ep=0, n/st=100, rew=13188.00]                                                                                


Epoch #92: test_reward: 14445.000000 ± 5075.553526, best_reward: 21389.200000 ± 7781.391403 in #83


Epoch #93: 1001it [00:02, 489.71it/s, env_step=93000, gradient_step=9300, len=100, n/ep=0, n/st=100, rew=5496.00]                                                                                 


Epoch #93: test_reward: 20465.100000 ± 7485.211853, best_reward: 21389.200000 ± 7781.391403 in #83


Epoch #94: 1001it [00:02, 450.17it/s, env_step=94000, gradient_step=9400, len=167, n/ep=1, n/st=100, rew=10229.00]                                                                                


Epoch #94: test_reward: 10439.600000 ± 3527.665494, best_reward: 21389.200000 ± 7781.391403 in #83


Epoch #95: 1001it [00:02, 492.48it/s, env_step=95000, gradient_step=9500, len=128, n/ep=0, n/st=100, rew=7008.00]                                                                                 


Epoch #95: test_reward: 13527.600000 ± 5141.764662, best_reward: 21389.200000 ± 7781.391403 in #83


Epoch #96: 1001it [00:02, 409.43it/s, env_step=96000, gradient_step=9600, len=132, n/ep=1, n/st=100, rew=7031.00]                                                                                 


Epoch #96: test_reward: 15389.400000 ± 8200.294924, best_reward: 21389.200000 ± 7781.391403 in #83


Epoch #97: 1001it [00:02, 415.28it/s, env_step=97000, gradient_step=9700, len=235, n/ep=0, n/st=100, rew=14814.00]                                                                                


Epoch #97: test_reward: 13286.200000 ± 5065.186784, best_reward: 21389.200000 ± 7781.391403 in #83


Epoch #98: 1001it [00:02, 442.89it/s, env_step=98000, gradient_step=9800, len=90, n/ep=0, n/st=100, rew=5042.00]                                                                                  


Epoch #98: test_reward: 6637.100000 ± 2244.741520, best_reward: 21389.200000 ± 7781.391403 in #83


Epoch #99: 1001it [00:02, 402.28it/s, env_step=99000, gradient_step=9900, len=110, n/ep=0, n/st=100, rew=6073.00]                                                                                 


Epoch #99: test_reward: 7981.900000 ± 7056.339086, best_reward: 21389.200000 ± 7781.391403 in #83


Epoch #100: 1001it [00:02, 429.16it/s, env_step=100000, gradient_step=10000, len=108, n/ep=0, n/st=100, rew=6032.50]                                                                              


Epoch #100: test_reward: 10491.500000 ± 2629.722999, best_reward: 21389.200000 ± 7781.391403 in #83

InfoStats(gradient_step=10000, best_reward=21389.2, best_reward_std=7781.391402570623, train_step=100000, train_episode=565, test_step=205886, test_episode=1010, timing=TimingStats(total_time=347.653687953949, train_time=229.4331533908844, train_time_collect=34.47949171066284, train_time_update=190.90431451797485, test_time=118.22053456306458, update_speed=435.85679977832314))

(the trained policy can be accessed via policy.policies[agents[1]])
Train with agent_opponent #6


Epoch #1: 1001it [00:02, 430.99it/s, env_step=1000, gradient_step=100, len=0, n/ep=0, n/st=100, rew=0.00]                                                                                         


Epoch #1: test_reward: 11195.000000 ± 2547.691426, best_reward: 12934.100000 ± 4593.753943 in #0


Epoch #2: 1001it [00:02, 419.35it/s, env_step=2000, gradient_step=200, len=0, n/ep=0, n/st=100, rew=0.00]                                                                                         


Epoch #2: test_reward: 19080.900000 ± 4307.076211, best_reward: 19080.900000 ± 4307.076211 in #2


Epoch #3: 1001it [00:02, 490.60it/s, env_step=3000, gradient_step=300, len=0, n/ep=0, n/st=100, rew=0.00]                                                                                         


Epoch #3: test_reward: 13570.100000 ± 6288.858155, best_reward: 19080.900000 ± 4307.076211 in #2


Epoch #4: 1001it [00:02, 404.52it/s, env_step=4000, gradient_step=400, len=0, n/ep=0, n/st=100, rew=0.00]                                                                                         


Epoch #4: test_reward: 18785.400000 ± 9415.282175, best_reward: 19080.900000 ± 4307.076211 in #2


Epoch #5: 1001it [00:02, 410.81it/s, env_step=5000, gradient_step=500, len=50, n/ep=1, n/st=100, rew=1407.00]                                                                                     


Epoch #5: test_reward: 10745.400000 ± 4849.872621, best_reward: 19080.900000 ± 4307.076211 in #2


Epoch #6: 1001it [00:02, 406.13it/s, env_step=6000, gradient_step=600, len=58, n/ep=0, n/st=100, rew=2239.00]                                                                                     


Epoch #6: test_reward: 16930.000000 ± 4493.870225, best_reward: 19080.900000 ± 4307.076211 in #2


Epoch #7: 1001it [00:02, 399.27it/s, env_step=7000, gradient_step=700, len=70, n/ep=1, n/st=100, rew=2891.00]                                                                                     


Epoch #7: test_reward: 19011.400000 ± 5283.903334, best_reward: 19080.900000 ± 4307.076211 in #2


Epoch #8: 1001it [00:02, 396.63it/s, env_step=8000, gradient_step=800, len=80, n/ep=1, n/st=100, rew=3416.00]                                                                                     


Epoch #8: test_reward: 20124.800000 ± 7783.454205, best_reward: 20124.800000 ± 7783.454205 in #8


Epoch #9: 1001it [00:02, 425.13it/s, env_step=9000, gradient_step=900, len=88, n/ep=0, n/st=100, rew=4296.00]                                                                                     


Epoch #9: test_reward: 18837.800000 ± 5642.613469, best_reward: 20124.800000 ± 7783.454205 in #8


Epoch #10: 1001it [00:02, 399.00it/s, env_step=10000, gradient_step=1000, len=96, n/ep=0, n/st=100, rew=4213.25]                                                                                  


Epoch #10: test_reward: 18742.800000 ± 7120.970282, best_reward: 20124.800000 ± 7783.454205 in #8


Epoch #11: 1001it [00:02, 468.94it/s, env_step=11000, gradient_step=1100, len=110, n/ep=1, n/st=100, rew=5535.50]                                                                                 


Epoch #11: test_reward: 13314.700000 ± 5732.144818, best_reward: 20124.800000 ± 7783.454205 in #8


Epoch #12: 1001it [00:02, 364.02it/s, env_step=12000, gradient_step=1200, len=116, n/ep=0, n/st=100, rew=6142.00]                                                                                 


Epoch #12: test_reward: 17456.800000 ± 7009.293956, best_reward: 20124.800000 ± 7783.454205 in #8


Epoch #13: 1001it [00:02, 343.89it/s, env_step=13000, gradient_step=1300, len=129, n/ep=0, n/st=100, rew=6634.00]                                                                                 


Epoch #13: test_reward: 8140.400000 ± 4899.929840, best_reward: 20124.800000 ± 7783.454205 in #8


Epoch #14: 1001it [00:02, 341.72it/s, env_step=14000, gradient_step=1400, len=140, n/ep=2, n/st=100, rew=6879.00]                                                                                 


Epoch #14: test_reward: 13251.400000 ± 5155.429220, best_reward: 20124.800000 ± 7783.454205 in #8


Epoch #15: 1001it [00:02, 417.23it/s, env_step=15000, gradient_step=1500, len=145, n/ep=0, n/st=100, rew=8102.50]                                                                                 


Epoch #15: test_reward: 9580.400000 ± 3410.870716, best_reward: 20124.800000 ± 7783.454205 in #8


Epoch #16: 1001it [00:02, 450.99it/s, env_step=16000, gradient_step=1600, len=160, n/ep=1, n/st=100, rew=8750.00]                                                                                 


Epoch #16: test_reward: 13731.800000 ± 6047.266718, best_reward: 20124.800000 ± 7783.454205 in #8


Epoch #17: 1001it [00:02, 390.35it/s, env_step=17000, gradient_step=1700, len=168, n/ep=0, n/st=100, rew=9293.00]                                                                                 


Epoch #17: test_reward: 13993.100000 ± 7823.361713, best_reward: 20124.800000 ± 7783.454205 in #8


Epoch #18: 1001it [00:02, 424.83it/s, env_step=18000, gradient_step=1800, len=110, n/ep=0, n/st=100, rew=5666.00]                                                                                 


Epoch #18: test_reward: 16394.200000 ± 9623.909121, best_reward: 20124.800000 ± 7783.454205 in #8


Epoch #19: 1001it [00:02, 413.89it/s, env_step=19000, gradient_step=1900, len=190, n/ep=2, n/st=100, rew=9715.00]                                                                                 


Epoch #19: test_reward: 9949.000000 ± 5192.714454, best_reward: 20124.800000 ± 7783.454205 in #8


Epoch #20: 1001it [00:02, 412.56it/s, env_step=20000, gradient_step=2000, len=132, n/ep=1, n/st=100, rew=7282.00]                                                                                 


Epoch #20: test_reward: 11866.700000 ± 4688.880592, best_reward: 20124.800000 ± 7783.454205 in #8


Epoch #21: 1001it [00:02, 371.76it/s, env_step=21000, gradient_step=2100, len=50, n/ep=0, n/st=100, rew=2360.00]                                                                                  


Epoch #21: test_reward: 14391.900000 ± 6277.406860, best_reward: 20124.800000 ± 7783.454205 in #8


Epoch #22: 1001it [00:02, 347.42it/s, env_step=22000, gradient_step=2200, len=220, n/ep=1, n/st=100, rew=12722.00]                                                                                


Epoch #22: test_reward: 15605.300000 ± 4950.034304, best_reward: 20124.800000 ± 7783.454205 in #8


Epoch #23: 1001it [00:02, 421.32it/s, env_step=23000, gradient_step=2300, len=230, n/ep=1, n/st=100, rew=12686.50]                                                                                


Epoch #23: test_reward: 15983.300000 ± 4852.529280, best_reward: 20124.800000 ± 7783.454205 in #8


Epoch #24: 1001it [00:02, 469.63it/s, env_step=24000, gradient_step=2400, len=237, n/ep=0, n/st=100, rew=11955.50]                                                                                


Epoch #24: test_reward: 16534.800000 ± 7596.174100, best_reward: 20124.800000 ± 7783.454205 in #8


Epoch #25: 1001it [00:02, 396.61it/s, env_step=25000, gradient_step=2500, len=250, n/ep=2, n/st=100, rew=15377.00]                                                                                


Epoch #25: test_reward: 10457.400000 ± 4148.693173, best_reward: 20124.800000 ± 7783.454205 in #8


Epoch #26: 1001it [00:02, 426.34it/s, env_step=26000, gradient_step=2600, len=260, n/ep=1, n/st=100, rew=15526.00]                                                                                


Epoch #26: test_reward: 20949.900000 ± 7455.871008, best_reward: 20949.900000 ± 7455.871008 in #26


Epoch #27: 1001it [00:02, 398.46it/s, env_step=27000, gradient_step=2700, len=212, n/ep=1, n/st=100, rew=10936.50]                                                                                


Epoch #27: test_reward: 12397.500000 ± 6621.990868, best_reward: 20949.900000 ± 7455.871008 in #26


Epoch #28: 1001it [00:02, 395.73it/s, env_step=28000, gradient_step=2800, len=280, n/ep=2, n/st=100, rew=15279.50]                                                                                


Epoch #28: test_reward: 19495.900000 ± 8541.355682, best_reward: 20949.900000 ± 7455.871008 in #26


Epoch #29: 1001it [00:02, 407.14it/s, env_step=29000, gradient_step=2900, len=172, n/ep=0, n/st=100, rew=10557.00]                                                                                


Epoch #29: test_reward: 16908.300000 ± 6510.969913, best_reward: 20949.900000 ± 7455.871008 in #26


Epoch #30: 1001it [00:02, 362.69it/s, env_step=30000, gradient_step=3000, len=298, n/ep=0, n/st=100, rew=17804.50]                                                                                


Epoch #30: test_reward: 20716.700000 ± 5677.749308, best_reward: 20949.900000 ± 7455.871008 in #26


Epoch #31: 1001it [00:02, 385.38it/s, env_step=31000, gradient_step=3100, len=104, n/ep=1, n/st=100, rew=5817.00]                                                                                 


Epoch #31: test_reward: 20655.800000 ± 5676.052797, best_reward: 20949.900000 ± 7455.871008 in #26


Epoch #32: 1001it [00:02, 340.42it/s, env_step=32000, gradient_step=3200, len=156, n/ep=0, n/st=100, rew=8692.00]                                                                                 


Epoch #32: test_reward: 14772.800000 ± 6368.271050, best_reward: 20949.900000 ± 7455.871008 in #26


Epoch #33: 1001it [00:02, 357.11it/s, env_step=33000, gradient_step=3300, len=124, n/ep=1, n/st=100, rew=7304.00]                                                                                 


Epoch #33: test_reward: 10070.400000 ± 3842.224829, best_reward: 20949.900000 ± 7455.871008 in #26


Epoch #34: 1001it [00:02, 347.99it/s, env_step=34000, gradient_step=3400, len=56, n/ep=1, n/st=100, rew=1814.00]                                                                                  


Epoch #34: test_reward: 12333.200000 ± 4480.088789, best_reward: 20949.900000 ± 7455.871008 in #26


Epoch #35: 1001it [00:02, 341.81it/s, env_step=35000, gradient_step=3500, len=258, n/ep=3, n/st=100, rew=15740.83]                                                                                


Epoch #35: test_reward: 12807.300000 ± 6264.349480, best_reward: 20949.900000 ± 7455.871008 in #26


Epoch #36: 1001it [00:02, 406.43it/s, env_step=36000, gradient_step=3600, len=104, n/ep=3, n/st=100, rew=5958.00]                                                                                 


Epoch #36: test_reward: 10421.100000 ± 2859.082841, best_reward: 20949.900000 ± 7455.871008 in #26


Epoch #37: 1001it [00:02, 471.36it/s, env_step=37000, gradient_step=3700, len=136, n/ep=0, n/st=100, rew=8242.00]                                                                                 


Epoch #37: test_reward: 12701.000000 ± 7040.983497, best_reward: 20949.900000 ± 7455.871008 in #26


Epoch #38: 1001it [00:02, 382.84it/s, env_step=38000, gradient_step=3800, len=176, n/ep=0, n/st=100, rew=10714.00]                                                                                


Epoch #38: test_reward: 4228.000000 ± 2281.745165, best_reward: 20949.900000 ± 7455.871008 in #26


Epoch #39: 1001it [00:02, 345.41it/s, env_step=39000, gradient_step=3900, len=243, n/ep=2, n/st=100, rew=14920.75]                                                                                


Epoch #39: test_reward: 13820.300000 ± 2708.735648, best_reward: 20949.900000 ± 7455.871008 in #26


Epoch #40: 1001it [00:02, 463.51it/s, env_step=40000, gradient_step=4000, len=186, n/ep=0, n/st=100, rew=11624.50]                                                                                


Epoch #40: test_reward: 21815.200000 ± 7048.356359, best_reward: 21815.200000 ± 7048.356359 in #40


Epoch #41: 1001it [00:02, 486.59it/s, env_step=41000, gradient_step=4100, len=228, n/ep=1, n/st=100, rew=13299.00]                                                                                


Epoch #41: test_reward: 17760.200000 ± 6314.654588, best_reward: 21815.200000 ± 7048.356359 in #40


Epoch #42: 1001it [00:01, 501.88it/s, env_step=42000, gradient_step=4200, len=206, n/ep=1, n/st=100, rew=12400.00]                                                                                


Epoch #42: test_reward: 9979.700000 ± 4577.116910, best_reward: 21815.200000 ± 7048.356359 in #40


Epoch #43: 1001it [00:02, 365.13it/s, env_step=43000, gradient_step=4300, len=69, n/ep=0, n/st=100, rew=3420.00]                                                                                  


Epoch #43: test_reward: 11524.700000 ± 3282.091317, best_reward: 21815.200000 ± 7048.356359 in #40


Epoch #44: 1001it [00:02, 398.23it/s, env_step=44000, gradient_step=4400, len=112, n/ep=1, n/st=100, rew=6391.00]                                                                                 


Epoch #44: test_reward: 11200.200000 ± 6095.251296, best_reward: 21815.200000 ± 7048.356359 in #40


Epoch #45: 1001it [00:02, 397.62it/s, env_step=45000, gradient_step=4500, len=76, n/ep=0, n/st=100, rew=4029.00]                                                                                  


Epoch #45: test_reward: 9418.700000 ± 2666.842929, best_reward: 21815.200000 ± 7048.356359 in #40


Epoch #46: 1001it [00:02, 410.25it/s, env_step=46000, gradient_step=4600, len=156, n/ep=0, n/st=100, rew=9005.50]                                                                                 


Epoch #46: test_reward: 11681.200000 ± 4098.947006, best_reward: 21815.200000 ± 7048.356359 in #40


Epoch #47: 1001it [00:02, 411.95it/s, env_step=47000, gradient_step=4700, len=181, n/ep=2, n/st=100, rew=10954.00]                                                                                


Epoch #47: test_reward: 14443.900000 ± 4823.677445, best_reward: 21815.200000 ± 7048.356359 in #40


Epoch #48: 1001it [00:02, 345.47it/s, env_step=48000, gradient_step=4800, len=228, n/ep=1, n/st=100, rew=13056.00]                                                                                


Epoch #48: test_reward: 12710.000000 ± 5367.179371, best_reward: 21815.200000 ± 7048.356359 in #40


Epoch #49: 1001it [00:02, 401.99it/s, env_step=49000, gradient_step=4900, len=151, n/ep=2, n/st=100, rew=9700.50]                                                                                 


Epoch #49: test_reward: 10364.600000 ± 6546.132541, best_reward: 21815.200000 ± 7048.356359 in #40


Epoch #50: 1001it [00:02, 340.85it/s, env_step=50000, gradient_step=5000, len=146, n/ep=2, n/st=100, rew=9029.00]                                                                                 


Epoch #50: test_reward: 16660.100000 ± 9836.952724, best_reward: 21815.200000 ± 7048.356359 in #40


Epoch #51: 1001it [00:02, 444.12it/s, env_step=51000, gradient_step=5100, len=136, n/ep=1, n/st=100, rew=7565.00]                                                                                 


Epoch #51: test_reward: 17319.000000 ± 7748.164273, best_reward: 21815.200000 ± 7048.356359 in #40


Epoch #52: 1001it [00:02, 436.02it/s, env_step=52000, gradient_step=5200, len=156, n/ep=0, n/st=100, rew=9879.25]                                                                                 


Epoch #52: test_reward: 9326.500000 ± 4900.351156, best_reward: 21815.200000 ± 7048.356359 in #40


Epoch #53: 1001it [00:02, 441.66it/s, env_step=53000, gradient_step=5300, len=220, n/ep=1, n/st=100, rew=13876.00]                                                                                


Epoch #53: test_reward: 9580.400000 ± 3349.092868, best_reward: 21815.200000 ± 7048.356359 in #40


Epoch #54: 1001it [00:02, 443.32it/s, env_step=54000, gradient_step=5400, len=304, n/ep=3, n/st=100, rew=19993.67]                                                                                


Epoch #54: test_reward: 17511.600000 ± 5538.983502, best_reward: 21815.200000 ± 7048.356359 in #40


Epoch #55: 1001it [00:02, 377.21it/s, env_step=55000, gradient_step=5500, len=271, n/ep=0, n/st=100, rew=17063.00]                                                                                


Epoch #55: test_reward: 17012.800000 ± 5352.287844, best_reward: 21815.200000 ± 7048.356359 in #40


Epoch #56: 1001it [00:02, 451.03it/s, env_step=56000, gradient_step=5600, len=177, n/ep=0, n/st=100, rew=10082.00]                                                                                


Epoch #56: test_reward: 10314.100000 ± 2086.378750, best_reward: 21815.200000 ± 7048.356359 in #40


Epoch #57: 1001it [00:02, 468.28it/s, env_step=57000, gradient_step=5700, len=144, n/ep=3, n/st=100, rew=8644.33]                                                                                 


Epoch #57: test_reward: 12182.200000 ± 1638.346532, best_reward: 21815.200000 ± 7048.356359 in #40


Epoch #58: 1001it [00:02, 465.22it/s, env_step=58000, gradient_step=5800, len=215, n/ep=0, n/st=100, rew=14429.50]                                                                                


Epoch #58: test_reward: 13568.300000 ± 5035.305513, best_reward: 21815.200000 ± 7048.356359 in #40


Epoch #59: 1001it [00:02, 435.62it/s, env_step=59000, gradient_step=5900, len=136, n/ep=0, n/st=100, rew=8246.00]                                                                                 


Epoch #59: test_reward: 11571.000000 ± 7047.085710, best_reward: 21815.200000 ± 7048.356359 in #40


Epoch #60: 1001it [00:02, 436.52it/s, env_step=60000, gradient_step=6000, len=194, n/ep=1, n/st=100, rew=11541.00]                                                                                


Epoch #60: test_reward: 11803.200000 ± 6521.582458, best_reward: 21815.200000 ± 7048.356359 in #40


Epoch #61: 1001it [00:02, 379.36it/s, env_step=61000, gradient_step=6100, len=180, n/ep=0, n/st=100, rew=9642.50]                                                                                 


Epoch #61: test_reward: 10309.200000 ± 3706.200232, best_reward: 21815.200000 ± 7048.356359 in #40


Epoch #62: 1001it [00:02, 460.65it/s, env_step=62000, gradient_step=6200, len=94, n/ep=0, n/st=100, rew=5154.00]                                                                                  


Epoch #62: test_reward: 11788.800000 ± 3299.326653, best_reward: 21815.200000 ± 7048.356359 in #40


Epoch #63: 1001it [00:02, 406.42it/s, env_step=63000, gradient_step=6300, len=168, n/ep=0, n/st=100, rew=10401.00]                                                                                


Epoch #63: test_reward: 10987.400000 ± 4512.660218, best_reward: 21815.200000 ± 7048.356359 in #40


Epoch #64: 1001it [00:02, 431.06it/s, env_step=64000, gradient_step=6400, len=144, n/ep=1, n/st=100, rew=7391.00]                                                                                 


Epoch #64: test_reward: 10666.900000 ± 3829.529983, best_reward: 21815.200000 ± 7048.356359 in #40


Epoch #65: 1001it [00:02, 358.00it/s, env_step=65000, gradient_step=6500, len=193, n/ep=0, n/st=100, rew=11896.00]                                                                                


Epoch #65: test_reward: 10968.600000 ± 6468.290225, best_reward: 21815.200000 ± 7048.356359 in #40


Epoch #66: 1001it [00:02, 432.99it/s, env_step=66000, gradient_step=6600, len=118, n/ep=0, n/st=100, rew=7192.00]                                                                                 


Epoch #66: test_reward: 11319.600000 ± 1245.156392, best_reward: 21815.200000 ± 7048.356359 in #40


Epoch #67: 1001it [00:02, 405.03it/s, env_step=67000, gradient_step=6700, len=150, n/ep=0, n/st=100, rew=9163.00]                                                                                 


Epoch #67: test_reward: 13846.500000 ± 3183.039090, best_reward: 21815.200000 ± 7048.356359 in #40


Epoch #68: 1001it [00:02, 468.15it/s, env_step=68000, gradient_step=6800, len=182, n/ep=0, n/st=100, rew=11345.00]                                                                                


Epoch #68: test_reward: 12417.200000 ± 2994.714971, best_reward: 21815.200000 ± 7048.356359 in #40


Epoch #69: 1001it [00:02, 417.02it/s, env_step=69000, gradient_step=6900, len=164, n/ep=2, n/st=100, rew=10601.75]                                                                                


Epoch #69: test_reward: 10764.900000 ± 6555.628993, best_reward: 21815.200000 ± 7048.356359 in #40


Epoch #70: 1001it [00:02, 339.45it/s, env_step=70000, gradient_step=7000, len=91, n/ep=0, n/st=100, rew=5278.00]                                                                                  


Epoch #70: test_reward: 14552.500000 ± 6429.529893, best_reward: 21815.200000 ± 7048.356359 in #40


Epoch #71: 1001it [00:02, 359.05it/s, env_step=71000, gradient_step=7100, len=317, n/ep=1, n/st=100, rew=20611.50]                                                                                


Epoch #71: test_reward: 18057.600000 ± 5841.858732, best_reward: 21815.200000 ± 7048.356359 in #40


Epoch #72: 1001it [00:02, 381.57it/s, env_step=72000, gradient_step=7200, len=136, n/ep=1, n/st=100, rew=7835.00]                                                                                 


Epoch #72: test_reward: 13105.500000 ± 4115.761467, best_reward: 21815.200000 ± 7048.356359 in #40


Epoch #73: 1001it [00:02, 384.79it/s, env_step=73000, gradient_step=7300, len=142, n/ep=0, n/st=100, rew=8574.00]                                                                                 


Epoch #73: test_reward: 19460.200000 ± 7027.883462, best_reward: 21815.200000 ± 7048.356359 in #40


Epoch #74: 1001it [00:02, 366.00it/s, env_step=74000, gradient_step=7400, len=178, n/ep=0, n/st=100, rew=10560.25]                                                                                


Epoch #74: test_reward: 19851.000000 ± 9666.940674, best_reward: 21815.200000 ± 7048.356359 in #40


Epoch #75: 1001it [00:02, 421.72it/s, env_step=75000, gradient_step=7500, len=118, n/ep=1, n/st=100, rew=7010.00]                                                                                 


Epoch #75: test_reward: 10498.000000 ± 6707.485311, best_reward: 21815.200000 ± 7048.356359 in #40


Epoch #76: 1001it [00:02, 428.77it/s, env_step=76000, gradient_step=7600, len=97, n/ep=2, n/st=100, rew=5647.50]                                                                                  


Epoch #76: test_reward: 14350.500000 ± 7974.037299, best_reward: 21815.200000 ± 7048.356359 in #40


Epoch #77: 1001it [00:02, 422.33it/s, env_step=77000, gradient_step=7700, len=160, n/ep=3, n/st=100, rew=9622.67]                                                                                 


Epoch #77: test_reward: 11316.000000 ± 2744.101893, best_reward: 21815.200000 ± 7048.356359 in #40


Epoch #78: 1001it [00:02, 405.91it/s, env_step=78000, gradient_step=7800, len=110, n/ep=0, n/st=100, rew=7007.50]                                                                                 


Epoch #78: test_reward: 16518.400000 ± 7180.301710, best_reward: 21815.200000 ± 7048.356359 in #40


Epoch #79: 1001it [00:02, 363.33it/s, env_step=79000, gradient_step=7900, len=170, n/ep=0, n/st=100, rew=10575.00]                                                                                


Epoch #79: test_reward: 15085.100000 ± 7011.740475, best_reward: 21815.200000 ± 7048.356359 in #40


Epoch #80: 1001it [00:02, 392.84it/s, env_step=80000, gradient_step=8000, len=118, n/ep=1, n/st=100, rew=7364.50]                                                                                 


Epoch #80: test_reward: 14416.100000 ± 8887.246013, best_reward: 21815.200000 ± 7048.356359 in #40


Epoch #81: 1001it [00:02, 435.89it/s, env_step=81000, gradient_step=8100, len=236, n/ep=0, n/st=100, rew=14529.00]                                                                                


Epoch #81: test_reward: 19550.200000 ± 6941.540820, best_reward: 21815.200000 ± 7048.356359 in #40


Epoch #82: 1001it [00:02, 399.83it/s, env_step=82000, gradient_step=8200, len=192, n/ep=1, n/st=100, rew=11658.00]                                                                                


Epoch #82: test_reward: 10724.500000 ± 4528.414121, best_reward: 21815.200000 ± 7048.356359 in #40


Epoch #83: 1001it [00:02, 408.00it/s, env_step=83000, gradient_step=8300, len=222, n/ep=2, n/st=100, rew=14525.25]                                                                                


Epoch #83: test_reward: 15353.300000 ± 6658.227903, best_reward: 21815.200000 ± 7048.356359 in #40


Epoch #84: 1001it [00:02, 463.34it/s, env_step=84000, gradient_step=8400, len=112, n/ep=2, n/st=100, rew=5288.00]                                                                                 


Epoch #84: test_reward: 12562.800000 ± 6692.833373, best_reward: 21815.200000 ± 7048.356359 in #40


Epoch #85: 1001it [00:02, 408.72it/s, env_step=85000, gradient_step=8500, len=172, n/ep=1, n/st=100, rew=10996.50]                                                                                


Epoch #85: test_reward: 22343.400000 ± 6503.353969, best_reward: 22343.400000 ± 6503.353969 in #85

InfoStats(gradient_step=8500, best_reward=22343.4, best_reward_std=6503.353968530393, train_step=85000, train_episode=422, test_step=184273, test_episode=860, timing=TimingStats(total_time=314.8401732444763, train_time=211.54757022857666, train_time_collect=29.50301742553711, train_time_update=178.60889291763306, test_time=103.29260301589966, update_speed=401.800880568648))

(the trained policy can be accessed via policy.policies[agents[1]])
Train with agent_opponent #19


Epoch #1: 1001it [00:02, 399.32it/s, env_step=1000, gradient_step=100, len=0, n/ep=0, n/st=100, rew=0.00]                                                                                         


Epoch #1: test_reward: 15618.300000 ± 6822.368358, best_reward: 15618.300000 ± 6822.368358 in #1


Epoch #2: 1001it [00:02, 388.52it/s, env_step=2000, gradient_step=200, len=0, n/ep=0, n/st=100, rew=0.00]                                                                                         


Epoch #2: test_reward: 16129.100000 ± 8213.266518, best_reward: 16129.100000 ± 8213.266518 in #2


Epoch #3: 1001it [00:02, 364.30it/s, env_step=3000, gradient_step=300, len=0, n/ep=0, n/st=100, rew=0.00]                                                                                         


Epoch #3: test_reward: 11908.600000 ± 7195.340050, best_reward: 16129.100000 ± 8213.266518 in #2


Epoch #4: 1001it [00:02, 460.57it/s, env_step=4000, gradient_step=400, len=38, n/ep=0, n/st=100, rew=1233.00]                                                                                     


Epoch #4: test_reward: 10591.000000 ± 4837.811055, best_reward: 16129.100000 ± 8213.266518 in #2


Epoch #5: 1001it [00:02, 482.16it/s, env_step=5000, gradient_step=500, len=38, n/ep=0, n/st=100, rew=1233.00]                                                                                     


Epoch #5: test_reward: 7596.300000 ± 6872.408312, best_reward: 16129.100000 ± 8213.266518 in #2


Epoch #6: 1001it [00:02, 408.41it/s, env_step=6000, gradient_step=600, len=60, n/ep=1, n/st=100, rew=1682.00]                                                                                     


Epoch #6: test_reward: 11993.000000 ± 5044.249597, best_reward: 16129.100000 ± 8213.266518 in #2


Epoch #7: 1001it [00:02, 479.21it/s, env_step=7000, gradient_step=700, len=70, n/ep=1, n/st=100, rew=2275.00]                                                                                     


Epoch #7: test_reward: 11015.400000 ± 3219.262562, best_reward: 16129.100000 ± 8213.266518 in #2


Epoch #8: 1001it [00:02, 374.69it/s, env_step=8000, gradient_step=800, len=80, n/ep=1, n/st=100, rew=2698.00]                                                                                     


Epoch #8: test_reward: 13400.600000 ± 6520.954534, best_reward: 16129.100000 ± 8213.266518 in #2


Epoch #9: 1001it [00:02, 421.48it/s, env_step=9000, gradient_step=900, len=88, n/ep=0, n/st=100, rew=3242.00]                                                                                     


Epoch #9: test_reward: 10007.200000 ± 2909.746752, best_reward: 16129.100000 ± 8213.266518 in #2


Epoch #10: 1001it [00:02, 380.72it/s, env_step=10000, gradient_step=1000, len=96, n/ep=0, n/st=100, rew=3410.00]                                                                                  


Epoch #10: test_reward: 11617.700000 ± 3652.384866, best_reward: 16129.100000 ± 8213.266518 in #2


Epoch #11: 1001it [00:02, 367.04it/s, env_step=11000, gradient_step=1100, len=108, n/ep=0, n/st=100, rew=4340.25]                                                                                 


Epoch #11: test_reward: 14227.300000 ± 4558.578069, best_reward: 16129.100000 ± 8213.266518 in #2


Epoch #12: 1001it [00:02, 393.69it/s, env_step=12000, gradient_step=1200, len=118, n/ep=0, n/st=100, rew=4273.25]                                                                                 


Epoch #12: test_reward: 15305.000000 ± 5415.999594, best_reward: 16129.100000 ± 8213.266518 in #2


Epoch #13: 1001it [00:02, 409.23it/s, env_step=13000, gradient_step=1300, len=130, n/ep=2, n/st=100, rew=5983.00]                                                                                 


Epoch #13: test_reward: 11439.000000 ± 3671.758516, best_reward: 16129.100000 ± 8213.266518 in #2


Epoch #14: 1001it [00:02, 447.20it/s, env_step=14000, gradient_step=1400, len=140, n/ep=2, n/st=100, rew=5617.00]                                                                                 


Epoch #14: test_reward: 13654.500000 ± 2895.884295, best_reward: 16129.100000 ± 8213.266518 in #2


Epoch #15: 1001it [00:02, 461.94it/s, env_step=15000, gradient_step=1500, len=150, n/ep=3, n/st=100, rew=7197.33]                                                                                 


Epoch #15: test_reward: 15118.900000 ± 7655.330123, best_reward: 16129.100000 ± 8213.266518 in #2


Epoch #16: 1001it [00:02, 440.04it/s, env_step=16000, gradient_step=1600, len=160, n/ep=2, n/st=100, rew=7065.50]                                                                                 


Epoch #16: test_reward: 11293.200000 ± 5800.625549, best_reward: 16129.100000 ± 8213.266518 in #2


Epoch #17: 1001it [00:02, 365.98it/s, env_step=17000, gradient_step=1700, len=170, n/ep=1, n/st=100, rew=7837.00]                                                                                 


Epoch #17: test_reward: 12483.300000 ± 3822.974969, best_reward: 16129.100000 ± 8213.266518 in #2


Epoch #18: 1001it [00:02, 376.62it/s, env_step=18000, gradient_step=1800, len=180, n/ep=2, n/st=100, rew=9277.00]                                                                                 


Epoch #18: test_reward: 10850.700000 ± 4344.109185, best_reward: 16129.100000 ± 8213.266518 in #2


Epoch #19: 1001it [00:02, 420.44it/s, env_step=19000, gradient_step=1900, len=190, n/ep=1, n/st=100, rew=8592.00]                                                                                 


Epoch #19: test_reward: 12546.600000 ± 2497.419316, best_reward: 16129.100000 ± 8213.266518 in #2


Epoch #20: 1001it [00:02, 451.82it/s, env_step=20000, gradient_step=2000, len=128, n/ep=0, n/st=100, rew=7024.00]                                                                                 


Epoch #20: test_reward: 13435.400000 ± 7370.336427, best_reward: 16129.100000 ± 8213.266518 in #2


Epoch #21: 1001it [00:02, 428.18it/s, env_step=21000, gradient_step=2100, len=70, n/ep=1, n/st=100, rew=3723.00]                                                                                  


Epoch #21: test_reward: 7306.900000 ± 5305.146736, best_reward: 16129.100000 ± 8213.266518 in #2


Epoch #22: 1001it [00:02, 421.88it/s, env_step=22000, gradient_step=2200, len=218, n/ep=0, n/st=100, rew=10137.00]                                                                                


Epoch #22: test_reward: 13157.600000 ± 2508.264388, best_reward: 16129.100000 ± 8213.266518 in #2


Epoch #23: 1001it [00:02, 460.23it/s, env_step=23000, gradient_step=2300, len=230, n/ep=1, n/st=100, rew=11673.00]                                                                                


Epoch #23: test_reward: 15086.000000 ± 3210.692916, best_reward: 16129.100000 ± 8213.266518 in #2


Epoch #24: 1001it [00:02, 458.63it/s, env_step=24000, gradient_step=2400, len=236, n/ep=0, n/st=100, rew=13470.00]                                                                                


Epoch #24: test_reward: 9442.000000 ± 4237.980840, best_reward: 16129.100000 ± 8213.266518 in #2


Epoch #25: 1001it [00:02, 382.59it/s, env_step=25000, gradient_step=2500, len=142, n/ep=1, n/st=100, rew=7766.50]                                                                                 


Epoch #25: test_reward: 10231.100000 ± 4105.355854, best_reward: 16129.100000 ± 8213.266518 in #2


Epoch #26: 1001it [00:02, 369.59it/s, env_step=26000, gradient_step=2600, len=258, n/ep=0, n/st=100, rew=13203.00]                                                                                


Epoch #26: test_reward: 12970.700000 ± 3330.389318, best_reward: 16129.100000 ± 8213.266518 in #2


Epoch #27: 1001it [00:02, 387.73it/s, env_step=27000, gradient_step=2700, len=270, n/ep=1, n/st=100, rew=13696.00]                                                                                


Epoch #27: test_reward: 13479.000000 ± 3755.677489, best_reward: 16129.100000 ± 8213.266518 in #2


Epoch #28: 1001it [00:02, 457.78it/s, env_step=28000, gradient_step=2800, len=276, n/ep=0, n/st=100, rew=14743.50]                                                                                


Epoch #28: test_reward: 11495.700000 ± 3994.342350, best_reward: 16129.100000 ± 8213.266518 in #2


Epoch #29: 1001it [00:02, 399.31it/s, env_step=29000, gradient_step=2900, len=68, n/ep=1, n/st=100, rew=3297.00]                                                                                  


Epoch #29: test_reward: 12107.400000 ± 2714.177010, best_reward: 16129.100000 ± 8213.266518 in #2


Epoch #30: 1001it [00:02, 425.56it/s, env_step=30000, gradient_step=3000, len=97, n/ep=0, n/st=100, rew=4655.00]                                                                                  


Epoch #30: test_reward: 13613.000000 ± 4350.786182, best_reward: 16129.100000 ± 8213.266518 in #2


Epoch #31: 1001it [00:02, 406.47it/s, env_step=31000, gradient_step=3100, len=86, n/ep=1, n/st=100, rew=4567.00]                                                                                  


Epoch #31: test_reward: 12389.100000 ± 4925.393171, best_reward: 16129.100000 ± 8213.266518 in #2


Epoch #32: 1001it [00:02, 450.62it/s, env_step=32000, gradient_step=3200, len=144, n/ep=1, n/st=100, rew=5845.00]                                                                                 


Epoch #32: test_reward: 9961.900000 ± 3150.212547, best_reward: 16129.100000 ± 8213.266518 in #2


Epoch #33: 1001it [00:02, 376.91it/s, env_step=33000, gradient_step=3300, len=198, n/ep=1, n/st=100, rew=11456.00]                                                                                


Epoch #33: test_reward: 14600.900000 ± 4461.856305, best_reward: 16129.100000 ± 8213.266518 in #2


Epoch #34: 1001it [00:02, 391.00it/s, env_step=34000, gradient_step=3400, len=129, n/ep=0, n/st=100, rew=7604.75]                                                                                 


Epoch #34: test_reward: 14722.600000 ± 3105.668115, best_reward: 16129.100000 ± 8213.266518 in #2


Epoch #35: 1001it [00:02, 434.91it/s, env_step=35000, gradient_step=3500, len=144, n/ep=3, n/st=100, rew=8241.17]                                                                                 


Epoch #35: test_reward: 10576.100000 ± 5620.362950, best_reward: 16129.100000 ± 8213.266518 in #2


Epoch #36: 1001it [00:02, 432.43it/s, env_step=36000, gradient_step=3600, len=176, n/ep=2, n/st=100, rew=10141.25]                                                                                


Epoch #36: test_reward: 11056.400000 ± 1885.097515, best_reward: 16129.100000 ± 8213.266518 in #2


Epoch #37: 1001it [00:02, 466.46it/s, env_step=37000, gradient_step=3700, len=196, n/ep=3, n/st=100, rew=12109.17]                                                                                


Epoch #37: test_reward: 8136.800000 ± 1429.036934, best_reward: 16129.100000 ± 8213.266518 in #2


Epoch #38: 1001it [00:02, 403.46it/s, env_step=38000, gradient_step=3800, len=120, n/ep=2, n/st=100, rew=6474.75]                                                                                 


Epoch #38: test_reward: 9901.000000 ± 5728.928015, best_reward: 16129.100000 ± 8213.266518 in #2


Epoch #39: 1001it [00:02, 461.20it/s, env_step=39000, gradient_step=3900, len=167, n/ep=3, n/st=100, rew=9411.00]                                                                                 


Epoch #39: test_reward: 7192.800000 ± 3789.644200, best_reward: 16129.100000 ± 8213.266518 in #2


Epoch #40: 1001it [00:02, 386.81it/s, env_step=40000, gradient_step=4000, len=221, n/ep=2, n/st=100, rew=13347.50]                                                                                


Epoch #40: test_reward: 13335.200000 ± 3908.006981, best_reward: 16129.100000 ± 8213.266518 in #2


Epoch #41: 1001it [00:02, 455.69it/s, env_step=41000, gradient_step=4100, len=178, n/ep=0, n/st=100, rew=9862.00]                                                                                 


Epoch #41: test_reward: 16505.400000 ± 6206.201853, best_reward: 16505.400000 ± 6206.201853 in #41


Epoch #42: 1001it [00:02, 379.13it/s, env_step=42000, gradient_step=4200, len=178, n/ep=1, n/st=100, rew=11620.50]                                                                                


Epoch #42: test_reward: 8920.700000 ± 2954.815563, best_reward: 16505.400000 ± 6206.201853 in #41


Epoch #43: 1001it [00:02, 440.56it/s, env_step=43000, gradient_step=4300, len=239, n/ep=2, n/st=100, rew=15024.00]                                                                                


Epoch #43: test_reward: 13841.100000 ± 5829.474118, best_reward: 16505.400000 ± 6206.201853 in #41


Epoch #44: 1001it [00:02, 355.04it/s, env_step=44000, gradient_step=4400, len=153, n/ep=0, n/st=100, rew=8747.00]                                                                                 


Epoch #44: test_reward: 13156.400000 ± 5660.090392, best_reward: 16505.400000 ± 6206.201853 in #41


Epoch #45: 1001it [00:02, 428.95it/s, env_step=45000, gradient_step=4500, len=114, n/ep=0, n/st=100, rew=6589.00]                                                                                 


Epoch #45: test_reward: 14017.700000 ± 3774.222040, best_reward: 16505.400000 ± 6206.201853 in #41


Epoch #46: 1001it [00:02, 374.28it/s, env_step=46000, gradient_step=4600, len=194, n/ep=2, n/st=100, rew=10645.25]                                                                                


Epoch #46: test_reward: 12368.800000 ± 4161.569146, best_reward: 16505.400000 ± 6206.201853 in #41


Epoch #47: 1001it [00:02, 376.79it/s, env_step=47000, gradient_step=4700, len=246, n/ep=0, n/st=100, rew=15516.50]                                                                                


Epoch #47: test_reward: 14844.200000 ± 3577.337971, best_reward: 16505.400000 ± 6206.201853 in #41


Epoch #48: 1001it [00:02, 379.48it/s, env_step=48000, gradient_step=4800, len=139, n/ep=3, n/st=100, rew=7750.67]                                                                                 


Epoch #48: test_reward: 11968.400000 ± 3534.084753, best_reward: 16505.400000 ± 6206.201853 in #41


Epoch #49: 1001it [00:02, 442.33it/s, env_step=49000, gradient_step=4900, len=176, n/ep=2, n/st=100, rew=10326.50]                                                                                


Epoch #49: test_reward: 9132.000000 ± 3818.829035, best_reward: 16505.400000 ± 6206.201853 in #41


Epoch #50: 1001it [00:02, 375.59it/s, env_step=50000, gradient_step=5000, len=140, n/ep=0, n/st=100, rew=8501.00]                                                                                 


Epoch #50: test_reward: 16015.800000 ± 4638.645056, best_reward: 16505.400000 ± 6206.201853 in #41


Epoch #51: 1001it [00:02, 441.81it/s, env_step=51000, gradient_step=5100, len=153, n/ep=2, n/st=100, rew=8938.00]                                                                                 


Epoch #51: test_reward: 14094.500000 ± 5216.160183, best_reward: 16505.400000 ± 6206.201853 in #41


Epoch #52: 1001it [00:02, 449.13it/s, env_step=52000, gradient_step=5200, len=101, n/ep=0, n/st=100, rew=5748.00]                                                                                 


Epoch #52: test_reward: 17786.500000 ± 4431.427360, best_reward: 17786.500000 ± 4431.427360 in #52


Epoch #53: 1001it [00:02, 439.78it/s, env_step=53000, gradient_step=5300, len=96, n/ep=0, n/st=100, rew=5281.00]                                                                                  


Epoch #53: test_reward: 17279.700000 ± 4085.262172, best_reward: 17786.500000 ± 4431.427360 in #52


Epoch #54: 1001it [00:02, 426.19it/s, env_step=54000, gradient_step=5400, len=210, n/ep=1, n/st=100, rew=13420.00]                                                                                


Epoch #54: test_reward: 17886.100000 ± 4655.509541, best_reward: 17886.100000 ± 4655.509541 in #54


Epoch #55: 1001it [00:02, 387.29it/s, env_step=55000, gradient_step=5500, len=190, n/ep=1, n/st=100, rew=10962.00]                                                                                


Epoch #55: test_reward: 14019.400000 ± 5022.727092, best_reward: 17886.100000 ± 4655.509541 in #54


Epoch #56: 1001it [00:02, 341.46it/s, env_step=56000, gradient_step=5600, len=192, n/ep=0, n/st=100, rew=11463.88]                                                                                


Epoch #56: test_reward: 19601.100000 ± 8910.276724, best_reward: 19601.100000 ± 8910.276724 in #56


Epoch #57: 1001it [00:02, 358.99it/s, env_step=57000, gradient_step=5700, len=257, n/ep=0, n/st=100, rew=16963.50]                                                                                


Epoch #57: test_reward: 16795.600000 ± 4328.749177, best_reward: 19601.100000 ± 8910.276724 in #56


Epoch #58: 1001it [00:02, 440.02it/s, env_step=58000, gradient_step=5800, len=228, n/ep=1, n/st=100, rew=14215.00]                                                                                


Epoch #58: test_reward: 15719.700000 ± 7593.953569, best_reward: 19601.100000 ± 8910.276724 in #56


Epoch #59: 1001it [00:02, 432.67it/s, env_step=59000, gradient_step=5900, len=150, n/ep=0, n/st=100, rew=8639.00]                                                                                 


Epoch #59: test_reward: 16257.300000 ± 6042.040186, best_reward: 19601.100000 ± 8910.276724 in #56


Epoch #60: 1001it [00:02, 408.50it/s, env_step=60000, gradient_step=6000, len=86, n/ep=1, n/st=100, rew=4582.50]                                                                                  


Epoch #60: test_reward: 14213.300000 ± 5114.518473, best_reward: 19601.100000 ± 8910.276724 in #56


Epoch #61: 1001it [00:02, 373.81it/s, env_step=61000, gradient_step=6100, len=222, n/ep=0, n/st=100, rew=13987.50]                                                                                


Epoch #61: test_reward: 17032.000000 ± 8500.058117, best_reward: 19601.100000 ± 8910.276724 in #56


Epoch #62: 1001it [00:02, 420.35it/s, env_step=62000, gradient_step=6200, len=219, n/ep=0, n/st=100, rew=14271.50]                                                                                


Epoch #62: test_reward: 10206.200000 ± 3437.107237, best_reward: 19601.100000 ± 8910.276724 in #56


Epoch #63: 1001it [00:02, 440.49it/s, env_step=63000, gradient_step=6300, len=178, n/ep=2, n/st=100, rew=10867.75]                                                                                


Epoch #63: test_reward: 14226.500000 ± 6715.652839, best_reward: 19601.100000 ± 8910.276724 in #56


Epoch #64: 1001it [00:02, 364.52it/s, env_step=64000, gradient_step=6400, len=121, n/ep=0, n/st=100, rew=6542.50]                                                                                 


Epoch #64: test_reward: 13295.800000 ± 7716.395697, best_reward: 19601.100000 ± 8910.276724 in #56


Epoch #65: 1001it [00:02, 437.99it/s, env_step=65000, gradient_step=6500, len=110, n/ep=0, n/st=100, rew=4328.00]                                                                                 


Epoch #65: test_reward: 17801.600000 ± 8457.413354, best_reward: 19601.100000 ± 8910.276724 in #56


Epoch #66: 1001it [00:02, 413.57it/s, env_step=66000, gradient_step=6600, len=282, n/ep=1, n/st=100, rew=19444.00]                                                                                


Epoch #66: test_reward: 16537.800000 ± 6386.528976, best_reward: 19601.100000 ± 8910.276724 in #56


Epoch #67: 1001it [00:02, 387.37it/s, env_step=67000, gradient_step=6700, len=124, n/ep=0, n/st=100, rew=7123.00]                                                                                 


Epoch #67: test_reward: 10503.000000 ± 3968.537363, best_reward: 19601.100000 ± 8910.276724 in #56


Epoch #68: 1001it [00:02, 427.85it/s, env_step=68000, gradient_step=6800, len=212, n/ep=0, n/st=100, rew=12598.00]                                                                                


Epoch #68: test_reward: 12781.700000 ± 8650.284505, best_reward: 19601.100000 ± 8910.276724 in #56


Epoch #69: 1001it [00:02, 426.75it/s, env_step=69000, gradient_step=6900, len=204, n/ep=1, n/st=100, rew=11233.00]                                                                                


Epoch #69: test_reward: 17187.600000 ± 5292.343284, best_reward: 19601.100000 ± 8910.276724 in #56


Epoch #70: 1001it [00:02, 353.23it/s, env_step=70000, gradient_step=7000, len=214, n/ep=1, n/st=100, rew=12918.00]                                                                                


Epoch #70: test_reward: 12065.800000 ± 3342.102626, best_reward: 19601.100000 ± 8910.276724 in #56


Epoch #71: 1001it [00:02, 437.42it/s, env_step=71000, gradient_step=7100, len=160, n/ep=1, n/st=100, rew=9527.50]                                                                                 


Epoch #71: test_reward: 13825.400000 ± 6406.063428, best_reward: 19601.100000 ± 8910.276724 in #56


Epoch #72: 1001it [00:02, 426.63it/s, env_step=72000, gradient_step=7200, len=162, n/ep=1, n/st=100, rew=10144.50]                                                                                


Epoch #72: test_reward: 12964.400000 ± 4463.934390, best_reward: 19601.100000 ± 8910.276724 in #56


Epoch #73: 1001it [00:02, 460.90it/s, env_step=73000, gradient_step=7300, len=164, n/ep=2, n/st=100, rew=9729.00]                                                                                 


Epoch #73: test_reward: 15363.500000 ± 4597.774424, best_reward: 19601.100000 ± 8910.276724 in #56


Epoch #74: 1001it [00:02, 402.17it/s, env_step=74000, gradient_step=7400, len=144, n/ep=0, n/st=100, rew=8659.00]                                                                                 


Epoch #74: test_reward: 14997.100000 ± 6033.301757, best_reward: 19601.100000 ± 8910.276724 in #56


Epoch #75: 1001it [00:02, 398.05it/s, env_step=75000, gradient_step=7500, len=264, n/ep=2, n/st=100, rew=15460.50]                                                                                


Epoch #75: test_reward: 6249.700000 ± 4025.522824, best_reward: 19601.100000 ± 8910.276724 in #56


Epoch #76: 1001it [00:02, 377.88it/s, env_step=76000, gradient_step=7600, len=163, n/ep=0, n/st=100, rew=10084.00]                                                                                


Epoch #76: test_reward: 15113.100000 ± 5854.321625, best_reward: 19601.100000 ± 8910.276724 in #56


Epoch #77: 1001it [00:02, 354.73it/s, env_step=77000, gradient_step=7700, len=166, n/ep=1, n/st=100, rew=9583.00]                                                                                 


Epoch #77: test_reward: 16310.800000 ± 2716.902567, best_reward: 19601.100000 ± 8910.276724 in #56


Epoch #78: 1001it [00:02, 435.40it/s, env_step=78000, gradient_step=7800, len=270, n/ep=1, n/st=100, rew=15801.50]                                                                                


Epoch #78: test_reward: 16475.200000 ± 8133.537789, best_reward: 19601.100000 ± 8910.276724 in #56


Epoch #79: 1001it [00:02, 449.72it/s, env_step=79000, gradient_step=7900, len=218, n/ep=1, n/st=100, rew=13647.00]                                                                                


Epoch #79: test_reward: 14516.000000 ± 4989.305102, best_reward: 19601.100000 ± 8910.276724 in #56


Epoch #80: 1001it [00:02, 453.23it/s, env_step=80000, gradient_step=8000, len=232, n/ep=1, n/st=100, rew=14915.00]                                                                                


Epoch #80: test_reward: 13789.800000 ± 7093.525355, best_reward: 19601.100000 ± 8910.276724 in #56


Epoch #81: 1001it [00:02, 395.51it/s, env_step=81000, gradient_step=8100, len=250, n/ep=0, n/st=100, rew=16857.00]                                                                                


Epoch #81: test_reward: 19183.000000 ± 3490.606623, best_reward: 19601.100000 ± 8910.276724 in #56


Epoch #82: 1001it [00:02, 373.78it/s, env_step=82000, gradient_step=8200, len=168, n/ep=1, n/st=100, rew=9653.50]                                                                                 


Epoch #82: test_reward: 11385.000000 ± 6143.344382, best_reward: 19601.100000 ± 8910.276724 in #56


Epoch #83: 1001it [00:02, 448.41it/s, env_step=83000, gradient_step=8300, len=116, n/ep=1, n/st=100, rew=6421.00]                                                                                 


Epoch #83: test_reward: 12774.000000 ± 4979.463947, best_reward: 19601.100000 ± 8910.276724 in #56


Epoch #84: 1001it [00:02, 408.04it/s, env_step=84000, gradient_step=8400, len=285, n/ep=0, n/st=100, rew=18631.50]                                                                                


Epoch #84: test_reward: 9560.200000 ± 1528.165554, best_reward: 19601.100000 ± 8910.276724 in #56


Epoch #85: 1001it [00:02, 414.45it/s, env_step=85000, gradient_step=8500, len=120, n/ep=1, n/st=100, rew=6795.00]                                                                                 


Epoch #85: test_reward: 17753.600000 ± 6845.097621, best_reward: 19601.100000 ± 8910.276724 in #56


Epoch #86: 1001it [00:02, 394.43it/s, env_step=86000, gradient_step=8600, len=97, n/ep=1, n/st=100, rew=5405.00]                                                                                  


Epoch #86: test_reward: 11116.400000 ± 2185.053052, best_reward: 19601.100000 ± 8910.276724 in #56


Epoch #87: 1001it [00:02, 449.45it/s, env_step=87000, gradient_step=8700, len=120, n/ep=0, n/st=100, rew=7508.00]                                                                                 


Epoch #87: test_reward: 16443.400000 ± 6249.962643, best_reward: 19601.100000 ± 8910.276724 in #56


Epoch #88: 1001it [00:02, 385.40it/s, env_step=88000, gradient_step=8800, len=104, n/ep=0, n/st=100, rew=5828.00]                                                                                 


Epoch #88: test_reward: 14072.800000 ± 3141.244365, best_reward: 19601.100000 ± 8910.276724 in #56


Epoch #89: 1001it [00:02, 401.89it/s, env_step=89000, gradient_step=8900, len=128, n/ep=1, n/st=100, rew=5945.00]                                                                                 


Epoch #89: test_reward: 12058.000000 ± 4546.638605, best_reward: 19601.100000 ± 8910.276724 in #56


Epoch #90: 1001it [00:02, 419.52it/s, env_step=90000, gradient_step=9000, len=388, n/ep=1, n/st=100, rew=25504.00]                                                                                


Epoch #90: test_reward: 17314.800000 ± 6336.065558, best_reward: 19601.100000 ± 8910.276724 in #56


Epoch #91: 1001it [00:02, 460.25it/s, env_step=91000, gradient_step=9100, len=195, n/ep=0, n/st=100, rew=12653.00]                                                                                


Epoch #91: test_reward: 18510.600000 ± 8854.753844, best_reward: 19601.100000 ± 8910.276724 in #56


Epoch #92: 1001it [00:02, 422.51it/s, env_step=92000, gradient_step=9200, len=186, n/ep=4, n/st=100, rew=11574.75]                                                                                


Epoch #92: test_reward: 10602.000000 ± 8442.496171, best_reward: 19601.100000 ± 8910.276724 in #56


Epoch #93: 1001it [00:02, 349.94it/s, env_step=93000, gradient_step=9300, len=133, n/ep=0, n/st=100, rew=7779.50]                                                                                 


Epoch #93: test_reward: 11685.200000 ± 8275.390744, best_reward: 19601.100000 ± 8910.276724 in #56


Epoch #94: 1001it [00:02, 368.50it/s, env_step=94000, gradient_step=9400, len=400, n/ep=0, n/st=100, rew=26092.00]                                                                                


Epoch #94: test_reward: 21408.800000 ± 4683.485898, best_reward: 21408.800000 ± 4683.485898 in #94


Epoch #95: 1001it [00:02, 387.75it/s, env_step=95000, gradient_step=9500, len=158, n/ep=0, n/st=100, rew=10203.00]                                                                                


Epoch #95: test_reward: 17728.800000 ± 4670.375912, best_reward: 21408.800000 ± 4683.485898 in #94


Epoch #96: 1001it [00:02, 345.42it/s, env_step=96000, gradient_step=9600, len=270, n/ep=0, n/st=100, rew=16848.00]                                                                                


Epoch #96: test_reward: 19242.300000 ± 9266.313118, best_reward: 21408.800000 ± 4683.485898 in #94


Epoch #97: 1001it [00:02, 421.85it/s, env_step=97000, gradient_step=9700, len=135, n/ep=0, n/st=100, rew=8068.75]                                                                                 


Epoch #97: test_reward: 15044.400000 ± 8893.355263, best_reward: 21408.800000 ± 4683.485898 in #94


Epoch #98: 1001it [00:02, 442.67it/s, env_step=98000, gradient_step=9800, len=104, n/ep=1, n/st=100, rew=6004.00]                                                                                 


Epoch #98: test_reward: 12558.700000 ± 8801.501600, best_reward: 21408.800000 ± 4683.485898 in #94


Epoch #99: 1001it [00:02, 368.28it/s, env_step=99000, gradient_step=9900, len=140, n/ep=0, n/st=100, rew=8220.00]                                                                                 


Epoch #99: test_reward: 12510.400000 ± 5929.027040, best_reward: 21408.800000 ± 4683.485898 in #94


Epoch #100: 1001it [00:02, 368.77it/s, env_step=100000, gradient_step=10000, len=183, n/ep=0, n/st=100, rew=11505.17]                                                                             


Epoch #100: test_reward: 19271.000000 ± 5568.188503, best_reward: 21408.800000 ± 4683.485898 in #94

InfoStats(gradient_step=10000, best_reward=21408.8, best_reward_std=4683.485898345377, train_step=100000, train_episode=533, test_step=213127, test_episode=1010, timing=TimingStats(total_time=362.1328372955322, train_time=246.3137607574463, train_time_collect=33.5727322101593, train_time_update=208.69930720329285, test_time=115.81907653808594, update_speed=405.98624978355747))

(the trained policy can be accessed via policy.policies[agents[1]])
Train with agent_opponent #14


Epoch #1: 1001it [00:02, 466.04it/s, env_step=1000, gradient_step=100, len=0, n/ep=0, n/st=100, rew=0.00]                                                                                         


Epoch #1: test_reward: 17092.500000 ± 5415.438953, best_reward: 17092.500000 ± 5415.438953 in #1


Epoch #2: 1001it [00:02, 489.34it/s, env_step=2000, gradient_step=200, len=0, n/ep=0, n/st=100, rew=0.00]                                                                                         


Epoch #2: test_reward: 14514.400000 ± 5166.715460, best_reward: 17092.500000 ± 5415.438953 in #1


Epoch #3: 1001it [00:01, 507.62it/s, env_step=3000, gradient_step=300, len=0, n/ep=0, n/st=100, rew=0.00]                                                                                         


Epoch #3: test_reward: 15764.500000 ± 7762.685247, best_reward: 17092.500000 ± 5415.438953 in #1


Epoch #4: 1001it [00:02, 408.83it/s, env_step=4000, gradient_step=400, len=0, n/ep=0, n/st=100, rew=0.00]                                                                                         


Epoch #4: test_reward: 11166.600000 ± 5831.892578, best_reward: 17092.500000 ± 5415.438953 in #1


Epoch #5: 1001it [00:02, 391.80it/s, env_step=5000, gradient_step=500, len=50, n/ep=1, n/st=100, rew=998.00]                                                                                      


Epoch #5: test_reward: 12496.100000 ± 6877.033175, best_reward: 17092.500000 ± 5415.438953 in #1


Epoch #6: 1001it [00:02, 397.27it/s, env_step=6000, gradient_step=600, len=58, n/ep=0, n/st=100, rew=1386.00]                                                                                     


Epoch #6: test_reward: 16035.200000 ± 5984.168026, best_reward: 17092.500000 ± 5415.438953 in #1


Epoch #7: 1001it [00:02, 486.90it/s, env_step=7000, gradient_step=700, len=64, n/ep=0, n/st=100, rew=1412.17]                                                                                     


Epoch #7: test_reward: 16906.800000 ± 5078.761755, best_reward: 17092.500000 ± 5415.438953 in #1


Epoch #8: 1001it [00:01, 511.50it/s, env_step=8000, gradient_step=800, len=80, n/ep=1, n/st=100, rew=2241.00]                                                                                     


Epoch #8: test_reward: 15813.300000 ± 5029.560061, best_reward: 17092.500000 ± 5415.438953 in #1


Epoch #9: 1001it [00:02, 486.47it/s, env_step=9000, gradient_step=900, len=90, n/ep=3, n/st=100, rew=2912.33]                                                                                     


Epoch #9: test_reward: 14942.300000 ± 6293.263463, best_reward: 17092.500000 ± 5415.438953 in #1


Epoch #10: 1001it [00:02, 396.02it/s, env_step=10000, gradient_step=1000, len=68, n/ep=2, n/st=100, rew=2047.50]                                                                                  


Epoch #10: test_reward: 15480.900000 ± 4354.750635, best_reward: 17092.500000 ± 5415.438953 in #1


Epoch #11: 1001it [00:02, 443.93it/s, env_step=11000, gradient_step=1100, len=110, n/ep=1, n/st=100, rew=4510.00]                                                                                 


Epoch #11: test_reward: 6196.100000 ± 3544.607636, best_reward: 17092.500000 ± 5415.438953 in #1


Epoch #12: 1001it [00:02, 490.78it/s, env_step=12000, gradient_step=1200, len=118, n/ep=0, n/st=100, rew=3831.00]                                                                                 


Epoch #12: test_reward: 11488.400000 ± 5018.967527, best_reward: 17092.500000 ± 5415.438953 in #1


Epoch #13: 1001it [00:02, 407.95it/s, env_step=13000, gradient_step=1300, len=126, n/ep=0, n/st=100, rew=3869.00]                                                                                 


Epoch #13: test_reward: 12882.400000 ± 6506.978334, best_reward: 17092.500000 ± 5415.438953 in #1


Epoch #14: 1001it [00:02, 403.68it/s, env_step=14000, gradient_step=1400, len=134, n/ep=0, n/st=100, rew=4530.00]                                                                                 


Epoch #14: test_reward: 8195.800000 ± 3370.861308, best_reward: 17092.500000 ± 5415.438953 in #1


Epoch #15: 1001it [00:02, 482.11it/s, env_step=15000, gradient_step=1500, len=150, n/ep=1, n/st=100, rew=5040.00]                                                                                 


Epoch #15: test_reward: 6202.000000 ± 2168.070848, best_reward: 17092.500000 ± 5415.438953 in #1


Epoch #16: 1001it [00:02, 437.23it/s, env_step=16000, gradient_step=1600, len=160, n/ep=1, n/st=100, rew=6141.00]                                                                                 


Epoch #16: test_reward: 10752.400000 ± 4743.000637, best_reward: 17092.500000 ± 5415.438953 in #1


Epoch #17: 1001it [00:02, 429.89it/s, env_step=17000, gradient_step=1700, len=170, n/ep=1, n/st=100, rew=6887.50]                                                                                 


Epoch #17: test_reward: 10308.300000 ± 4293.218561, best_reward: 17092.500000 ± 5415.438953 in #1


Epoch #18: 1001it [00:02, 442.71it/s, env_step=18000, gradient_step=1800, len=180, n/ep=2, n/st=100, rew=7235.25]                                                                                 


Epoch #18: test_reward: 11691.600000 ± 2209.469221, best_reward: 17092.500000 ± 5415.438953 in #1


Epoch #19: 1001it [00:02, 386.28it/s, env_step=19000, gradient_step=1900, len=184, n/ep=0, n/st=100, rew=7563.50]                                                                                 


Epoch #19: test_reward: 6953.200000 ± 1811.140900, best_reward: 17092.500000 ± 5415.438953 in #1


Epoch #20: 1001it [00:02, 417.20it/s, env_step=20000, gradient_step=2000, len=150, n/ep=0, n/st=100, rew=7259.00]                                                                                 


Epoch #20: test_reward: 12096.300000 ± 3572.312586, best_reward: 17092.500000 ± 5415.438953 in #1


Epoch #21: 1001it [00:02, 433.33it/s, env_step=21000, gradient_step=2100, len=210, n/ep=1, n/st=100, rew=8283.00]                                                                                 


Epoch #21: test_reward: 14056.900000 ± 5070.267142, best_reward: 17092.500000 ± 5415.438953 in #1


Epoch #22: 1001it [00:02, 427.66it/s, env_step=22000, gradient_step=2200, len=216, n/ep=0, n/st=100, rew=8990.00]                                                                                 


Epoch #22: test_reward: 11672.300000 ± 1865.659779, best_reward: 17092.500000 ± 5415.438953 in #1


Epoch #23: 1001it [00:02, 444.54it/s, env_step=23000, gradient_step=2300, len=102, n/ep=2, n/st=100, rew=4518.00]                                                                                 


Epoch #23: test_reward: 12590.000000 ± 3411.912777, best_reward: 17092.500000 ± 5415.438953 in #1


Epoch #24: 1001it [00:02, 361.90it/s, env_step=24000, gradient_step=2400, len=150, n/ep=0, n/st=100, rew=6686.00]                                                                                 


Epoch #24: test_reward: 10472.600000 ± 2623.410612, best_reward: 17092.500000 ± 5415.438953 in #1


Epoch #25: 1001it [00:02, 484.63it/s, env_step=25000, gradient_step=2500, len=138, n/ep=1, n/st=100, rew=6710.00]                                                                                 


Epoch #25: test_reward: 7679.600000 ± 5238.264564, best_reward: 17092.500000 ± 5415.438953 in #1


Epoch #26: 1001it [00:02, 458.18it/s, env_step=26000, gradient_step=2600, len=118, n/ep=2, n/st=100, rew=5504.00]                                                                                 


Epoch #26: test_reward: 7803.600000 ± 2404.375645, best_reward: 17092.500000 ± 5415.438953 in #1


Epoch #27: 1001it [00:02, 408.18it/s, env_step=27000, gradient_step=2700, len=188, n/ep=2, n/st=100, rew=8321.00]                                                                                 


Epoch #27: test_reward: 11820.800000 ± 2640.410150, best_reward: 17092.500000 ± 5415.438953 in #1


Epoch #28: 1001it [00:02, 425.89it/s, env_step=28000, gradient_step=2800, len=188, n/ep=1, n/st=100, rew=8722.00]                                                                                 


Epoch #28: test_reward: 10737.600000 ± 3723.328624, best_reward: 17092.500000 ± 5415.438953 in #1


Epoch #29: 1001it [00:02, 387.04it/s, env_step=29000, gradient_step=2900, len=143, n/ep=0, n/st=100, rew=5895.00]                                                                                 


Epoch #29: test_reward: 7913.700000 ± 3381.367121, best_reward: 17092.500000 ± 5415.438953 in #1


Epoch #30: 1001it [00:02, 432.63it/s, env_step=30000, gradient_step=3000, len=135, n/ep=0, n/st=100, rew=6203.50]                                                                                 


Epoch #30: test_reward: 9286.200000 ± 3087.466852, best_reward: 17092.500000 ± 5415.438953 in #1


Epoch #31: 1001it [00:02, 442.34it/s, env_step=31000, gradient_step=3100, len=184, n/ep=1, n/st=100, rew=8837.00]                                                                                 


Epoch #31: test_reward: 12402.500000 ± 3604.412858, best_reward: 17092.500000 ± 5415.438953 in #1


Epoch #32: 1001it [00:02, 468.91it/s, env_step=32000, gradient_step=3200, len=188, n/ep=1, n/st=100, rew=8561.00]                                                                                 


Epoch #32: test_reward: 14032.100000 ± 3019.913259, best_reward: 17092.500000 ± 5415.438953 in #1


Epoch #33: 1001it [00:02, 477.72it/s, env_step=33000, gradient_step=3300, len=159, n/ep=2, n/st=100, rew=7679.25]                                                                                 


Epoch #33: test_reward: 10008.100000 ± 4240.733037, best_reward: 17092.500000 ± 5415.438953 in #1


Epoch #34: 1001it [00:02, 375.04it/s, env_step=34000, gradient_step=3400, len=181, n/ep=2, n/st=100, rew=8659.25]                                                                                 


Epoch #34: test_reward: 9089.800000 ± 1409.840686, best_reward: 17092.500000 ± 5415.438953 in #1


Epoch #35: 1001it [00:02, 458.95it/s, env_step=35000, gradient_step=3500, len=223, n/ep=0, n/st=100, rew=10457.00]                                                                                


Epoch #35: test_reward: 9407.100000 ± 2355.721395, best_reward: 17092.500000 ± 5415.438953 in #1


Epoch #36: 1001it [00:02, 394.69it/s, env_step=36000, gradient_step=3600, len=134, n/ep=1, n/st=100, rew=7375.00]                                                                                 


Epoch #36: test_reward: 12988.300000 ± 3483.259337, best_reward: 17092.500000 ± 5415.438953 in #1


Epoch #37: 1001it [00:02, 385.71it/s, env_step=37000, gradient_step=3700, len=168, n/ep=1, n/st=100, rew=9799.00]                                                                                 


Epoch #37: test_reward: 9905.000000 ± 3225.166197, best_reward: 17092.500000 ± 5415.438953 in #1


Epoch #38: 1001it [00:02, 440.70it/s, env_step=38000, gradient_step=3800, len=168, n/ep=1, n/st=100, rew=8054.50]                                                                                 


Epoch #38: test_reward: 11329.600000 ± 4670.943185, best_reward: 17092.500000 ± 5415.438953 in #1


Epoch #39: 1001it [00:02, 409.70it/s, env_step=39000, gradient_step=3900, len=148, n/ep=0, n/st=100, rew=7639.50]                                                                                 


Epoch #39: test_reward: 3867.800000 ± 1898.459892, best_reward: 17092.500000 ± 5415.438953 in #1


Epoch #40: 1001it [00:02, 453.17it/s, env_step=40000, gradient_step=4000, len=400, n/ep=1, n/st=100, rew=22444.00]                                                                                


Epoch #40: test_reward: 9341.800000 ± 3115.433222, best_reward: 17092.500000 ± 5415.438953 in #1


Epoch #41: 1001it [00:02, 409.87it/s, env_step=41000, gradient_step=4100, len=134, n/ep=2, n/st=100, rew=7423.50]                                                                                 


Epoch #41: test_reward: 10460.600000 ± 3264.312491, best_reward: 17092.500000 ± 5415.438953 in #1


Epoch #42: 1001it [00:02, 454.15it/s, env_step=42000, gradient_step=4200, len=325, n/ep=0, n/st=100, rew=13777.00]                                                                                


Epoch #42: test_reward: 12200.400000 ± 3825.191112, best_reward: 17092.500000 ± 5415.438953 in #1


Epoch #43: 1001it [00:02, 344.63it/s, env_step=43000, gradient_step=4300, len=179, n/ep=0, n/st=100, rew=9443.83]                                                                                 


Epoch #43: test_reward: 10166.600000 ± 2798.502035, best_reward: 17092.500000 ± 5415.438953 in #1


Epoch #44: 1001it [00:02, 391.78it/s, env_step=44000, gradient_step=4400, len=108, n/ep=2, n/st=100, rew=5817.50]                                                                                 


Epoch #44: test_reward: 10776.200000 ± 1923.345928, best_reward: 17092.500000 ± 5415.438953 in #1


Epoch #45: 1001it [00:02, 445.61it/s, env_step=45000, gradient_step=4500, len=76, n/ep=1, n/st=100, rew=3883.00]                                                                                  


Epoch #45: test_reward: 9178.300000 ± 3075.371686, best_reward: 17092.500000 ± 5415.438953 in #1


Epoch #46: 1001it [00:02, 442.50it/s, env_step=46000, gradient_step=4600, len=197, n/ep=3, n/st=100, rew=11148.00]                                                                                


Epoch #46: test_reward: 7592.200000 ± 5877.858144, best_reward: 17092.500000 ± 5415.438953 in #1


Epoch #47: 1001it [00:02, 372.43it/s, env_step=47000, gradient_step=4700, len=128, n/ep=1, n/st=100, rew=6841.00]                                                                                 


Epoch #47: test_reward: 11582.600000 ± 4796.119686, best_reward: 17092.500000 ± 5415.438953 in #1


Epoch #48: 1001it [00:02, 415.20it/s, env_step=48000, gradient_step=4800, len=120, n/ep=1, n/st=100, rew=6757.00]                                                                                 


Epoch #48: test_reward: 13885.800000 ± 8978.018020, best_reward: 17092.500000 ± 5415.438953 in #1


Epoch #49: 1001it [00:02, 430.01it/s, env_step=49000, gradient_step=4900, len=232, n/ep=0, n/st=100, rew=13558.00]                                                                                


Epoch #49: test_reward: 11275.600000 ± 4724.990184, best_reward: 17092.500000 ± 5415.438953 in #1


Epoch #50: 1001it [00:02, 494.78it/s, env_step=50000, gradient_step=5000, len=165, n/ep=0, n/st=100, rew=9854.17]                                                                                 


Epoch #50: test_reward: 11718.200000 ± 4274.670953, best_reward: 17092.500000 ± 5415.438953 in #1


Epoch #51: 1001it [00:02, 473.73it/s, env_step=51000, gradient_step=5100, len=197, n/ep=0, n/st=100, rew=12130.25]                                                                                


Epoch #51: test_reward: 10137.400000 ± 3221.388527, best_reward: 17092.500000 ± 5415.438953 in #1


Epoch #52: 1001it [00:02, 432.23it/s, env_step=52000, gradient_step=5200, len=208, n/ep=0, n/st=100, rew=12927.00]                                                                                


Epoch #52: test_reward: 9997.400000 ± 4019.973164, best_reward: 17092.500000 ± 5415.438953 in #1


Epoch #53: 1001it [00:02, 465.29it/s, env_step=53000, gradient_step=5300, len=242, n/ep=2, n/st=100, rew=15169.25]                                                                                


Epoch #53: test_reward: 9083.300000 ± 2481.285637, best_reward: 17092.500000 ± 5415.438953 in #1


Epoch #54: 1001it [00:02, 415.74it/s, env_step=54000, gradient_step=5400, len=151, n/ep=2, n/st=100, rew=9004.00]                                                                                 


Epoch #54: test_reward: 14073.500000 ± 3395.286858, best_reward: 17092.500000 ± 5415.438953 in #1


Epoch #55: 1001it [00:02, 465.02it/s, env_step=55000, gradient_step=5500, len=176, n/ep=0, n/st=100, rew=10092.75]                                                                                


Epoch #55: test_reward: 9893.600000 ± 3783.981586, best_reward: 17092.500000 ± 5415.438953 in #1


Epoch #56: 1001it [00:02, 425.76it/s, env_step=56000, gradient_step=5600, len=99, n/ep=1, n/st=100, rew=5486.50]                                                                                  


Epoch #56: test_reward: 11640.900000 ± 3037.378951, best_reward: 17092.500000 ± 5415.438953 in #1


Epoch #57: 1001it [00:02, 366.86it/s, env_step=57000, gradient_step=5700, len=115, n/ep=2, n/st=100, rew=6075.75]                                                                                 


Epoch #57: test_reward: 9182.300000 ± 5503.200742, best_reward: 17092.500000 ± 5415.438953 in #1


Epoch #58: 1001it [00:02, 456.84it/s, env_step=58000, gradient_step=5800, len=158, n/ep=1, n/st=100, rew=8967.00]                                                                                 


Epoch #58: test_reward: 7327.700000 ± 10092.267972, best_reward: 17092.500000 ± 5415.438953 in #1


Epoch #59: 1001it [00:02, 371.28it/s, env_step=59000, gradient_step=5900, len=96, n/ep=0, n/st=100, rew=3968.00]                                                                                  


Epoch #59: test_reward: 15397.200000 ± 7923.795565, best_reward: 17092.500000 ± 5415.438953 in #1


Epoch #60: 1001it [00:02, 394.02it/s, env_step=60000, gradient_step=6000, len=157, n/ep=0, n/st=100, rew=9713.75]                                                                                 


Epoch #60: test_reward: 9909.000000 ± 2820.447092, best_reward: 17092.500000 ± 5415.438953 in #1


Epoch #61: 1001it [00:02, 374.73it/s, env_step=61000, gradient_step=6100, len=149, n/ep=0, n/st=100, rew=8321.75]                                                                                 


Epoch #61: test_reward: 10809.500000 ± 5861.407975, best_reward: 17092.500000 ± 5415.438953 in #1


Epoch #62: 1001it [00:02, 417.73it/s, env_step=62000, gradient_step=6200, len=34, n/ep=1, n/st=100, rew=1431.00]                                                                                  


Epoch #62: test_reward: 12590.700000 ± 4006.875243, best_reward: 17092.500000 ± 5415.438953 in #1


Epoch #63: 1001it [00:02, 425.61it/s, env_step=63000, gradient_step=6300, len=78, n/ep=1, n/st=100, rew=4163.50]                                                                                  


Epoch #63: test_reward: 11919.500000 ± 4295.273548, best_reward: 17092.500000 ± 5415.438953 in #1


Epoch #64: 1001it [00:02, 385.70it/s, env_step=64000, gradient_step=6400, len=162, n/ep=1, n/st=100, rew=9751.00]                                                                                 


Epoch #64: test_reward: 12640.900000 ± 4601.531081, best_reward: 17092.500000 ± 5415.438953 in #1


Epoch #65: 1001it [00:02, 390.49it/s, env_step=65000, gradient_step=6500, len=128, n/ep=1, n/st=100, rew=7199.00]                                                                                 


Epoch #65: test_reward: 11635.000000 ± 4532.836485, best_reward: 17092.500000 ± 5415.438953 in #1


Epoch #66: 1001it [00:02, 459.83it/s, env_step=66000, gradient_step=6600, len=148, n/ep=1, n/st=100, rew=8142.50]                                                                                 


Epoch #66: test_reward: 13804.300000 ± 3260.151133, best_reward: 17092.500000 ± 5415.438953 in #1


Epoch #67: 1001it [00:02, 371.83it/s, env_step=67000, gradient_step=6700, len=130, n/ep=1, n/st=100, rew=7943.00]                                                                                 


Epoch #67: test_reward: 12485.300000 ± 5994.597969, best_reward: 17092.500000 ± 5415.438953 in #1


Epoch #68: 1001it [00:02, 390.63it/s, env_step=68000, gradient_step=6800, len=122, n/ep=2, n/st=100, rew=6734.00]                                                                                 


Epoch #68: test_reward: 13835.300000 ± 5062.603067, best_reward: 17092.500000 ± 5415.438953 in #1


Epoch #69: 1001it [00:02, 390.85it/s, env_step=69000, gradient_step=6900, len=111, n/ep=2, n/st=100, rew=6459.50]                                                                                 


Epoch #69: test_reward: 10832.200000 ± 5074.730373, best_reward: 17092.500000 ± 5415.438953 in #1


Epoch #70: 1001it [00:02, 423.67it/s, env_step=70000, gradient_step=7000, len=145, n/ep=1, n/st=100, rew=8945.00]                                                                                 


Epoch #70: test_reward: 14442.600000 ± 7653.801097, best_reward: 17092.500000 ± 5415.438953 in #1


Epoch #71: 1001it [00:02, 454.36it/s, env_step=71000, gradient_step=7100, len=154, n/ep=0, n/st=100, rew=9427.67]                                                                                 


Epoch #71: test_reward: 11509.400000 ± 3803.347662, best_reward: 17092.500000 ± 5415.438953 in #1


Epoch #72: 1001it [00:02, 355.74it/s, env_step=72000, gradient_step=7200, len=98, n/ep=1, n/st=100, rew=5402.00]                                                                                  


Epoch #72: test_reward: 15169.700000 ± 3949.324273, best_reward: 17092.500000 ± 5415.438953 in #1


Epoch #73: 1001it [00:02, 359.73it/s, env_step=73000, gradient_step=7300, len=135, n/ep=1, n/st=100, rew=7116.00]                                                                                 


Epoch #73: test_reward: 11530.900000 ± 5353.579559, best_reward: 17092.500000 ± 5415.438953 in #1


Epoch #74: 1001it [00:02, 371.98it/s, env_step=74000, gradient_step=7400, len=144, n/ep=0, n/st=100, rew=8204.00]                                                                                 


Epoch #74: test_reward: 16698.000000 ± 7464.575219, best_reward: 17092.500000 ± 5415.438953 in #1


Epoch #75: 1001it [00:02, 365.30it/s, env_step=75000, gradient_step=7500, len=154, n/ep=0, n/st=100, rew=10105.00]                                                                                


Epoch #75: test_reward: 9285.600000 ± 3060.433473, best_reward: 17092.500000 ± 5415.438953 in #1


Epoch #76: 1001it [00:02, 404.87it/s, env_step=76000, gradient_step=7600, len=172, n/ep=1, n/st=100, rew=9890.00]                                                                                 


Epoch #76: test_reward: 17191.600000 ± 5321.026502, best_reward: 17191.600000 ± 5321.026502 in #76


Epoch #77: 1001it [00:02, 406.58it/s, env_step=77000, gradient_step=7700, len=90, n/ep=0, n/st=100, rew=5074.50]                                                                                  


Epoch #77: test_reward: 17448.200000 ± 7971.919829, best_reward: 17448.200000 ± 7971.919829 in #77


Epoch #78: 1001it [00:02, 422.71it/s, env_step=78000, gradient_step=7800, len=129, n/ep=0, n/st=100, rew=6499.00]                                                                                 


Epoch #78: test_reward: 8837.800000 ± 2118.589049, best_reward: 17448.200000 ± 7971.919829 in #77


Epoch #79: 1001it [00:02, 376.57it/s, env_step=79000, gradient_step=7900, len=169, n/ep=1, n/st=100, rew=9405.50]                                                                                 


Epoch #79: test_reward: 8816.800000 ± 3605.230861, best_reward: 17448.200000 ± 7971.919829 in #77


Epoch #80: 1001it [00:02, 408.60it/s, env_step=80000, gradient_step=8000, len=170, n/ep=3, n/st=100, rew=9850.67]                                                                                 


Epoch #80: test_reward: 10585.800000 ± 3745.917479, best_reward: 17448.200000 ± 7971.919829 in #77


Epoch #81: 1001it [00:02, 338.25it/s, env_step=81000, gradient_step=8100, len=198, n/ep=0, n/st=100, rew=12052.00]                                                                                


Epoch #81: test_reward: 12782.400000 ± 8085.801224, best_reward: 17448.200000 ± 7971.919829 in #77


Epoch #82: 1001it [00:02, 394.46it/s, env_step=82000, gradient_step=8200, len=84, n/ep=1, n/st=100, rew=4730.00]                                                                                  


Epoch #82: test_reward: 14482.500000 ± 4996.627027, best_reward: 17448.200000 ± 7971.919829 in #77


Epoch #83: 1001it [00:02, 377.75it/s, env_step=83000, gradient_step=8300, len=111, n/ep=1, n/st=100, rew=5568.00]                                                                                 


Epoch #83: test_reward: 17470.800000 ± 8730.600115, best_reward: 17470.800000 ± 8730.600115 in #83


Epoch #84: 1001it [00:02, 430.65it/s, env_step=84000, gradient_step=8400, len=137, n/ep=1, n/st=100, rew=7830.00]                                                                                 


Epoch #84: test_reward: 14147.300000 ± 4493.891433, best_reward: 17470.800000 ± 8730.600115 in #83


Epoch #85: 1001it [00:02, 363.66it/s, env_step=85000, gradient_step=8500, len=221, n/ep=1, n/st=100, rew=14063.00]                                                                                


Epoch #85: test_reward: 15133.300000 ± 7073.472670, best_reward: 17470.800000 ± 8730.600115 in #83


Epoch #86: 1001it [00:02, 375.84it/s, env_step=86000, gradient_step=8600, len=400, n/ep=1, n/st=100, rew=26777.00]                                                                                


Epoch #86: test_reward: 10504.400000 ± 6954.263774, best_reward: 17470.800000 ± 8730.600115 in #83


Epoch #87: 1001it [00:02, 390.31it/s, env_step=87000, gradient_step=8700, len=172, n/ep=0, n/st=100, rew=10090.00]                                                                                


Epoch #87: test_reward: 8382.400000 ± 3645.440089, best_reward: 17470.800000 ± 8730.600115 in #83


Epoch #88: 1001it [00:02, 366.55it/s, env_step=88000, gradient_step=8800, len=195, n/ep=1, n/st=100, rew=11770.50]                                                                                


Epoch #88: test_reward: 13928.400000 ± 7332.014719, best_reward: 17470.800000 ± 8730.600115 in #83


Epoch #89: 1001it [00:02, 370.71it/s, env_step=89000, gradient_step=8900, len=215, n/ep=0, n/st=100, rew=13071.00]                                                                                


Epoch #89: test_reward: 10663.800000 ± 3044.897824, best_reward: 17470.800000 ± 8730.600115 in #83


Epoch #90: 1001it [00:02, 410.62it/s, env_step=90000, gradient_step=9000, len=263, n/ep=3, n/st=100, rew=16959.00]                                                                                


Epoch #90: test_reward: 12609.200000 ± 5342.937129, best_reward: 17470.800000 ± 8730.600115 in #83


Epoch #91: 1001it [00:02, 380.26it/s, env_step=91000, gradient_step=9100, len=90, n/ep=0, n/st=100, rew=4196.00]                                                                                  


Epoch #91: test_reward: 7667.600000 ± 3040.356630, best_reward: 17470.800000 ± 8730.600115 in #83


Epoch #92: 1001it [00:02, 407.83it/s, env_step=92000, gradient_step=9200, len=166, n/ep=0, n/st=100, rew=9278.25]                                                                                 


Epoch #92: test_reward: 18948.600000 ± 7976.178336, best_reward: 18948.600000 ± 7976.178336 in #92


Epoch #93: 1001it [00:02, 454.92it/s, env_step=93000, gradient_step=9300, len=230, n/ep=1, n/st=100, rew=14732.00]                                                                                


Epoch #93: test_reward: 14011.800000 ± 8380.068040, best_reward: 18948.600000 ± 7976.178336 in #92


Epoch #94: 1001it [00:02, 352.89it/s, env_step=94000, gradient_step=9400, len=177, n/ep=0, n/st=100, rew=10614.00]                                                                                


Epoch #94: test_reward: 12473.100000 ± 6760.411296, best_reward: 18948.600000 ± 7976.178336 in #92


Epoch #95: 1001it [00:02, 449.12it/s, env_step=95000, gradient_step=9500, len=152, n/ep=0, n/st=100, rew=9587.00]                                                                                 


Epoch #95: test_reward: 18624.000000 ± 7882.955842, best_reward: 18948.600000 ± 7976.178336 in #92


Epoch #96: 1001it [00:02, 355.53it/s, env_step=96000, gradient_step=9600, len=72, n/ep=0, n/st=100, rew=3193.00]                                                                                  


Epoch #96: test_reward: 11534.400000 ± 5559.918330, best_reward: 18948.600000 ± 7976.178336 in #92


Epoch #97: 1001it [00:02, 423.86it/s, env_step=97000, gradient_step=9700, len=174, n/ep=0, n/st=100, rew=10139.00]                                                                                


Epoch #97: test_reward: 9579.400000 ± 4887.868906, best_reward: 18948.600000 ± 7976.178336 in #92


Epoch #98: 1001it [00:02, 420.92it/s, env_step=98000, gradient_step=9800, len=206, n/ep=4, n/st=100, rew=12989.75]                                                                                


Epoch #98: test_reward: 12287.900000 ± 4455.504897, best_reward: 18948.600000 ± 7976.178336 in #92


Epoch #99: 1001it [00:02, 398.39it/s, env_step=99000, gradient_step=9900, len=167, n/ep=0, n/st=100, rew=10188.00]                                                                                


Epoch #99: test_reward: 10466.500000 ± 2689.225177, best_reward: 18948.600000 ± 7976.178336 in #92


Epoch #100: 1001it [00:02, 449.08it/s, env_step=100000, gradient_step=10000, len=114, n/ep=0, n/st=100, rew=6457.00]                                                                              


Epoch #100: test_reward: 16312.000000 ± 7762.452770, best_reward: 18948.600000 ± 7976.178336 in #92

InfoStats(gradient_step=10000, best_reward=18948.6, best_reward_std=7976.1783355188345, train_step=100000, train_episode=576, test_step=196282, test_episode=1010, timing=TimingStats(total_time=351.607462644577, train_time=242.71322679519653, train_time_collect=34.64234447479248, train_time_update=204.00702238082886, test_time=108.8942358493805, update_speed=412.008860499312))

(the trained policy can be accessed via policy.policies[agents[1]])
Train with agent_opponent #27


Epoch #1: 1001it [00:02, 436.30it/s, env_step=1000, gradient_step=100, len=0, n/ep=0, n/st=100, rew=0.00]                                                                                         


Epoch #1: test_reward: 11332.800000 ± 4748.145592, best_reward: 12933.100000 ± 5395.903380 in #0


Epoch #2: 1001it [00:02, 402.86it/s, env_step=2000, gradient_step=200, len=0, n/ep=0, n/st=100, rew=0.00]                                                                                         


Epoch #2: test_reward: 11067.600000 ± 6896.398367, best_reward: 12933.100000 ± 5395.903380 in #0


Epoch #3: 1001it [00:02, 374.13it/s, env_step=3000, gradient_step=300, len=0, n/ep=0, n/st=100, rew=0.00]                                                                                         


Epoch #3: test_reward: 13165.200000 ± 3203.876177, best_reward: 13165.200000 ± 3203.876177 in #3


Epoch #4: 1001it [00:02, 389.82it/s, env_step=4000, gradient_step=400, len=0, n/ep=0, n/st=100, rew=0.00]                                                                                         


Epoch #4: test_reward: 11171.700000 ± 3509.011230, best_reward: 13165.200000 ± 3203.876177 in #3


Epoch #5: 1001it [00:02, 377.09it/s, env_step=5000, gradient_step=500, len=0, n/ep=0, n/st=100, rew=0.00]                                                                                         


Epoch #5: test_reward: 8317.600000 ± 2737.279350, best_reward: 13165.200000 ± 3203.876177 in #3


Epoch #6: 1001it [00:02, 334.92it/s, env_step=6000, gradient_step=600, len=58, n/ep=0, n/st=100, rew=2575.00]                                                                                     


Epoch #6: test_reward: 9423.300000 ± 2916.100549, best_reward: 13165.200000 ± 3203.876177 in #3


Epoch #7: 1001it [00:02, 426.07it/s, env_step=7000, gradient_step=700, len=68, n/ep=0, n/st=100, rew=3054.50]                                                                                     


Epoch #7: test_reward: 12073.400000 ± 3273.062059, best_reward: 13165.200000 ± 3203.876177 in #3


Epoch #8: 1001it [00:02, 359.92it/s, env_step=8000, gradient_step=800, len=78, n/ep=0, n/st=100, rew=3697.00]                                                                                     


Epoch #8: test_reward: 10011.000000 ± 3125.104574, best_reward: 13165.200000 ± 3203.876177 in #3


Epoch #9: 1001it [00:02, 365.86it/s, env_step=9000, gradient_step=900, len=85, n/ep=0, n/st=100, rew=4429.00]                                                                                     


Epoch #9: test_reward: 10899.000000 ± 3332.514546, best_reward: 13165.200000 ± 3203.876177 in #3


Epoch #10: 1001it [00:02, 354.24it/s, env_step=10000, gradient_step=1000, len=100, n/ep=2, n/st=100, rew=4652.50]                                                                                 


Epoch #10: test_reward: 10792.900000 ± 2071.123292, best_reward: 13165.200000 ± 3203.876177 in #3


Epoch #11: 1001it [00:02, 424.08it/s, env_step=11000, gradient_step=1100, len=108, n/ep=0, n/st=100, rew=5534.00]                                                                                 


Epoch #11: test_reward: 10021.300000 ± 4111.681069, best_reward: 13165.200000 ± 3203.876177 in #3


Epoch #12: 1001it [00:02, 387.83it/s, env_step=12000, gradient_step=1200, len=120, n/ep=1, n/st=100, rew=6225.00]                                                                                 


Epoch #12: test_reward: 10036.400000 ± 3397.723214, best_reward: 13165.200000 ± 3203.876177 in #3


Epoch #13: 1001it [00:02, 394.10it/s, env_step=13000, gradient_step=1300, len=128, n/ep=0, n/st=100, rew=6560.00]                                                                                 


Epoch #13: test_reward: 8232.000000 ± 3446.727259, best_reward: 13165.200000 ± 3203.876177 in #3


Epoch #14: 1001it [00:03, 332.81it/s, env_step=14000, gradient_step=1400, len=140, n/ep=1, n/st=100, rew=6981.00]                                                                                 


Epoch #14: test_reward: 10317.000000 ± 3958.806992, best_reward: 13165.200000 ± 3203.876177 in #3


Epoch #15: 1001it [00:03, 319.23it/s, env_step=15000, gradient_step=1500, len=150, n/ep=1, n/st=100, rew=8234.50]                                                                                 


Epoch #15: test_reward: 13625.600000 ± 5343.630287, best_reward: 13625.600000 ± 5343.630287 in #15


Epoch #16: 1001it [00:02, 352.60it/s, env_step=16000, gradient_step=1600, len=160, n/ep=1, n/st=100, rew=9149.00]                                                                                 


Epoch #16: test_reward: 9117.700000 ± 4323.777031, best_reward: 13625.600000 ± 5343.630287 in #15


Epoch #17: 1001it [00:02, 370.81it/s, env_step=17000, gradient_step=1700, len=166, n/ep=0, n/st=100, rew=8572.00]                                                                                 


Epoch #17: test_reward: 17277.500000 ± 6156.714859, best_reward: 17277.500000 ± 6156.714859 in #17


Epoch #18: 1001it [00:02, 412.33it/s, env_step=18000, gradient_step=1800, len=110, n/ep=3, n/st=100, rew=5493.17]                                                                                 


Epoch #18: test_reward: 14695.800000 ± 4774.425092, best_reward: 17277.500000 ± 6156.714859 in #17


Epoch #19: 1001it [00:02, 418.75it/s, env_step=19000, gradient_step=1900, len=188, n/ep=0, n/st=100, rew=10401.00]                                                                                


Epoch #19: test_reward: 13266.400000 ± 6715.947695, best_reward: 17277.500000 ± 6156.714859 in #17


Epoch #20: 1001it [00:02, 335.32it/s, env_step=20000, gradient_step=2000, len=200, n/ep=1, n/st=100, rew=11388.00]                                                                                


Epoch #20: test_reward: 10442.600000 ± 3340.018808, best_reward: 17277.500000 ± 6156.714859 in #17


Epoch #21: 1001it [00:02, 361.36it/s, env_step=21000, gradient_step=2100, len=210, n/ep=1, n/st=100, rew=12466.50]                                                                                


Epoch #21: test_reward: 14629.500000 ± 5797.815265, best_reward: 17277.500000 ± 6156.714859 in #17


Epoch #22: 1001it [00:02, 394.56it/s, env_step=22000, gradient_step=2200, len=218, n/ep=0, n/st=100, rew=12709.00]                                                                                


Epoch #22: test_reward: 16079.400000 ± 5182.409868, best_reward: 17277.500000 ± 6156.714859 in #17


Epoch #23: 1001it [00:03, 325.57it/s, env_step=23000, gradient_step=2300, len=229, n/ep=0, n/st=100, rew=13894.50]                                                                                


Epoch #23: test_reward: 12443.200000 ± 4552.925429, best_reward: 17277.500000 ± 6156.714859 in #17


Epoch #24: 1001it [00:02, 372.42it/s, env_step=24000, gradient_step=2400, len=183, n/ep=2, n/st=100, rew=10690.50]                                                                                


Epoch #24: test_reward: 12568.300000 ± 4310.331265, best_reward: 17277.500000 ± 6156.714859 in #17


Epoch #25: 1001it [00:02, 413.04it/s, env_step=25000, gradient_step=2500, len=182, n/ep=2, n/st=100, rew=10238.25]                                                                                


Epoch #25: test_reward: 12366.500000 ± 4017.206872, best_reward: 17277.500000 ± 6156.714859 in #17


Epoch #26: 1001it [00:02, 355.51it/s, env_step=26000, gradient_step=2600, len=198, n/ep=3, n/st=100, rew=11358.00]                                                                                


Epoch #26: test_reward: 15103.600000 ± 3153.806342, best_reward: 17277.500000 ± 6156.714859 in #17


Epoch #27: 1001it [00:02, 419.00it/s, env_step=27000, gradient_step=2700, len=110, n/ep=0, n/st=100, rew=3895.00]                                                                                 


Epoch #27: test_reward: 11657.600000 ± 6945.531444, best_reward: 17277.500000 ± 6156.714859 in #17


Epoch #28: 1001it [00:02, 402.10it/s, env_step=28000, gradient_step=2800, len=166, n/ep=1, n/st=100, rew=10673.50]                                                                                


Epoch #28: test_reward: 11725.200000 ± 4466.121624, best_reward: 17277.500000 ± 6156.714859 in #17


Epoch #29: 1001it [00:02, 406.89it/s, env_step=29000, gradient_step=2900, len=290, n/ep=2, n/st=100, rew=17966.00]                                                                                


Epoch #29: test_reward: 13310.600000 ± 6532.180711, best_reward: 17277.500000 ± 6156.714859 in #17


Epoch #30: 1001it [00:02, 389.00it/s, env_step=30000, gradient_step=3000, len=120, n/ep=1, n/st=100, rew=5223.00]                                                                                 


Epoch #30: test_reward: 11154.800000 ± 4212.102487, best_reward: 17277.500000 ± 6156.714859 in #17


Epoch #31: 1001it [00:02, 388.08it/s, env_step=31000, gradient_step=3100, len=310, n/ep=1, n/st=100, rew=19696.00]                                                                                


Epoch #31: test_reward: 14157.900000 ± 6645.892753, best_reward: 17277.500000 ± 6156.714859 in #17


Epoch #32: 1001it [00:02, 388.69it/s, env_step=32000, gradient_step=3200, len=318, n/ep=0, n/st=100, rew=19626.50]                                                                                


Epoch #32: test_reward: 15995.700000 ± 5220.320144, best_reward: 17277.500000 ± 6156.714859 in #17


Epoch #33: 1001it [00:03, 330.29it/s, env_step=33000, gradient_step=3300, len=160, n/ep=0, n/st=100, rew=7324.25]                                                                                 


Epoch #33: test_reward: 12435.800000 ± 3678.628924, best_reward: 17277.500000 ± 6156.714859 in #17


Epoch #34: 1001it [00:02, 413.17it/s, env_step=34000, gradient_step=3400, len=200, n/ep=1, n/st=100, rew=11426.00]                                                                                


Epoch #34: test_reward: 15228.800000 ± 4727.946802, best_reward: 17277.500000 ± 6156.714859 in #17


Epoch #35: 1001it [00:03, 325.75it/s, env_step=35000, gradient_step=3500, len=170, n/ep=1, n/st=100, rew=9998.50]                                                                                 


Epoch #35: test_reward: 14068.500000 ± 5611.944676, best_reward: 17277.500000 ± 6156.714859 in #17


Epoch #36: 1001it [00:03, 330.35it/s, env_step=36000, gradient_step=3600, len=358, n/ep=0, n/st=100, rew=22911.00]                                                                                


Epoch #36: test_reward: 14291.600000 ± 3074.657223, best_reward: 17277.500000 ± 6156.714859 in #17


Epoch #37: 1001it [00:02, 405.20it/s, env_step=37000, gradient_step=3700, len=174, n/ep=0, n/st=100, rew=9878.50]                                                                                 


Epoch #37: test_reward: 14308.100000 ± 4512.899322, best_reward: 17277.500000 ± 6156.714859 in #17


Epoch #38: 1001it [00:03, 329.80it/s, env_step=38000, gradient_step=3800, len=171, n/ep=2, n/st=100, rew=8599.50]                                                                                 


Epoch #38: test_reward: 13633.100000 ± 4199.246205, best_reward: 17277.500000 ± 6156.714859 in #17


Epoch #39: 1001it [00:02, 346.03it/s, env_step=39000, gradient_step=3900, len=232, n/ep=0, n/st=100, rew=11743.00]                                                                                


Epoch #39: test_reward: 11713.200000 ± 5073.421425, best_reward: 17277.500000 ± 6156.714859 in #17


Epoch #40: 1001it [00:02, 408.60it/s, env_step=40000, gradient_step=4000, len=337, n/ep=4, n/st=100, rew=19797.50]                                                                                


Epoch #40: test_reward: 14503.000000 ± 7893.324597, best_reward: 17277.500000 ± 6156.714859 in #17


Epoch #41: 1001it [00:02, 410.38it/s, env_step=41000, gradient_step=4100, len=234, n/ep=0, n/st=100, rew=13199.00]                                                                                


Epoch #41: test_reward: 14759.900000 ± 2328.043361, best_reward: 17277.500000 ± 6156.714859 in #17


Epoch #42: 1001it [00:02, 421.27it/s, env_step=42000, gradient_step=4200, len=96, n/ep=0, n/st=100, rew=3935.00]                                                                                  


Epoch #42: test_reward: 12509.800000 ± 3866.554275, best_reward: 17277.500000 ± 6156.714859 in #17


Epoch #43: 1001it [00:02, 346.80it/s, env_step=43000, gradient_step=4300, len=217, n/ep=2, n/st=100, rew=13101.00]                                                                                


Epoch #43: test_reward: 12267.700000 ± 3869.861859, best_reward: 17277.500000 ± 6156.714859 in #17


Epoch #44: 1001it [00:03, 317.53it/s, env_step=44000, gradient_step=4400, len=236, n/ep=1, n/st=100, rew=13979.50]                                                                                


Epoch #44: test_reward: 11026.700000 ± 4265.259946, best_reward: 17277.500000 ± 6156.714859 in #17


Epoch #45: 1001it [00:02, 403.36it/s, env_step=45000, gradient_step=4500, len=189, n/ep=0, n/st=100, rew=12006.50]                                                                                


Epoch #45: test_reward: 14651.200000 ± 6267.745397, best_reward: 17277.500000 ± 6156.714859 in #17


Epoch #46: 1001it [00:02, 344.09it/s, env_step=46000, gradient_step=4600, len=128, n/ep=1, n/st=100, rew=6240.00]                                                                                 


Epoch #46: test_reward: 11411.100000 ± 4389.173737, best_reward: 17277.500000 ± 6156.714859 in #17


Epoch #47: 1001it [00:02, 338.52it/s, env_step=47000, gradient_step=4700, len=172, n/ep=0, n/st=100, rew=9942.00]                                                                                 


Epoch #47: test_reward: 12791.700000 ± 5808.579655, best_reward: 17277.500000 ± 6156.714859 in #17


Epoch #48: 1001it [00:02, 371.72it/s, env_step=48000, gradient_step=4800, len=229, n/ep=0, n/st=100, rew=14261.00]                                                                                


Epoch #48: test_reward: 13010.200000 ± 2952.868260, best_reward: 17277.500000 ± 6156.714859 in #17


Epoch #49: 1001it [00:02, 401.24it/s, env_step=49000, gradient_step=4900, len=167, n/ep=2, n/st=100, rew=8209.00]                                                                                 


Epoch #49: test_reward: 12193.600000 ± 6125.222677, best_reward: 17277.500000 ± 6156.714859 in #17


Epoch #50: 1001it [00:02, 353.68it/s, env_step=50000, gradient_step=5000, len=224, n/ep=0, n/st=100, rew=13373.00]                                                                                


Epoch #50: test_reward: 12345.100000 ± 3595.288151, best_reward: 17277.500000 ± 6156.714859 in #17


Epoch #51: 1001it [00:03, 317.28it/s, env_step=51000, gradient_step=5100, len=131, n/ep=2, n/st=100, rew=7244.00]                                                                                 


Epoch #51: test_reward: 11666.800000 ± 3655.412338, best_reward: 17277.500000 ± 6156.714859 in #17


Epoch #52: 1001it [00:02, 345.69it/s, env_step=52000, gradient_step=5200, len=208, n/ep=3, n/st=100, rew=12869.67]                                                                                


Epoch #52: test_reward: 11938.600000 ± 3917.386787, best_reward: 17277.500000 ± 6156.714859 in #17


Epoch #53: 1001it [00:02, 404.27it/s, env_step=53000, gradient_step=5300, len=98, n/ep=0, n/st=100, rew=4472.00]                                                                                  


Epoch #53: test_reward: 11246.000000 ± 5780.418947, best_reward: 17277.500000 ± 6156.714859 in #17


Epoch #54: 1001it [00:02, 366.64it/s, env_step=54000, gradient_step=5400, len=194, n/ep=0, n/st=100, rew=11894.00]                                                                                


Epoch #54: test_reward: 13806.000000 ± 5733.601730, best_reward: 17277.500000 ± 6156.714859 in #17


Epoch #55: 1001it [00:02, 356.57it/s, env_step=55000, gradient_step=5500, len=138, n/ep=0, n/st=100, rew=7904.62]                                                                                 


Epoch #55: test_reward: 11162.700000 ± 5818.479235, best_reward: 17277.500000 ± 6156.714859 in #17


Epoch #56: 1001it [00:02, 336.54it/s, env_step=56000, gradient_step=5600, len=294, n/ep=0, n/st=100, rew=16722.00]                                                                                


Epoch #56: test_reward: 12057.400000 ± 5739.847859, best_reward: 17277.500000 ± 6156.714859 in #17


Epoch #57: 1001it [00:02, 403.14it/s, env_step=57000, gradient_step=5700, len=132, n/ep=0, n/st=100, rew=7125.00]                                                                                 


Epoch #57: test_reward: 18248.900000 ± 8014.155407, best_reward: 18248.900000 ± 8014.155407 in #57


Epoch #58: 1001it [00:02, 355.69it/s, env_step=58000, gradient_step=5800, len=161, n/ep=0, n/st=100, rew=9415.50]                                                                                 


Epoch #58: test_reward: 15803.100000 ± 5346.244522, best_reward: 18248.900000 ± 8014.155407 in #57


Epoch #59: 1001it [00:02, 343.97it/s, env_step=59000, gradient_step=5900, len=190, n/ep=1, n/st=100, rew=11942.00]                                                                                


Epoch #59: test_reward: 14582.400000 ± 5818.800137, best_reward: 18248.900000 ± 8014.155407 in #57


Epoch #60: 1001it [00:02, 355.33it/s, env_step=60000, gradient_step=6000, len=214, n/ep=0, n/st=100, rew=12360.00]                                                                                


Epoch #60: test_reward: 12246.200000 ± 5378.946083, best_reward: 18248.900000 ± 8014.155407 in #57


Epoch #61: 1001it [00:02, 414.67it/s, env_step=61000, gradient_step=6100, len=400, n/ep=0, n/st=100, rew=25792.00]                                                                                


Epoch #61: test_reward: 17310.700000 ± 6883.950872, best_reward: 18248.900000 ± 8014.155407 in #57


Epoch #62: 1001it [00:02, 411.57it/s, env_step=62000, gradient_step=6200, len=100, n/ep=1, n/st=100, rew=5140.00]                                                                                 


Epoch #62: test_reward: 17359.600000 ± 7866.449507, best_reward: 18248.900000 ± 8014.155407 in #57


Epoch #63: 1001it [00:02, 379.29it/s, env_step=63000, gradient_step=6300, len=267, n/ep=0, n/st=100, rew=17597.50]                                                                                


Epoch #63: test_reward: 9377.700000 ± 3160.310113, best_reward: 18248.900000 ± 8014.155407 in #57


Epoch #64: 1001it [00:02, 374.26it/s, env_step=64000, gradient_step=6400, len=179, n/ep=0, n/st=100, rew=10970.00]                                                                                


Epoch #64: test_reward: 13394.700000 ± 5567.774691, best_reward: 18248.900000 ± 8014.155407 in #57


Epoch #65: 1001it [00:02, 410.15it/s, env_step=65000, gradient_step=6500, len=188, n/ep=0, n/st=100, rew=10955.00]                                                                                


Epoch #65: test_reward: 14941.600000 ± 8016.058485, best_reward: 18248.900000 ± 8014.155407 in #57


Epoch #66: 1001it [00:02, 385.85it/s, env_step=66000, gradient_step=6600, len=136, n/ep=1, n/st=100, rew=7868.00]                                                                                 


Epoch #66: test_reward: 10271.000000 ± 4924.896040, best_reward: 18248.900000 ± 8014.155407 in #57


Epoch #67: 1001it [00:02, 381.28it/s, env_step=67000, gradient_step=6700, len=151, n/ep=0, n/st=100, rew=8715.00]                                                                                 


Epoch #67: test_reward: 8539.800000 ± 4447.417089, best_reward: 18248.900000 ± 8014.155407 in #57


Epoch #68: 1001it [00:02, 403.43it/s, env_step=68000, gradient_step=6800, len=306, n/ep=1, n/st=100, rew=18945.00]                                                                                


Epoch #68: test_reward: 13527.100000 ± 3982.997049, best_reward: 18248.900000 ± 8014.155407 in #57


Epoch #69: 1001it [00:02, 367.40it/s, env_step=69000, gradient_step=6900, len=156, n/ep=1, n/st=100, rew=9396.00]                                                                                 


Epoch #69: test_reward: 12644.100000 ± 5734.863930, best_reward: 18248.900000 ± 8014.155407 in #57


Epoch #70: 1001it [00:02, 368.72it/s, env_step=70000, gradient_step=7000, len=118, n/ep=0, n/st=100, rew=6907.00]                                                                                 


Epoch #70: test_reward: 11767.000000 ± 1767.478939, best_reward: 18248.900000 ± 8014.155407 in #57


Epoch #71: 1001it [00:02, 414.23it/s, env_step=71000, gradient_step=7100, len=253, n/ep=0, n/st=100, rew=16378.50]                                                                                


Epoch #71: test_reward: 15084.700000 ± 4136.488174, best_reward: 18248.900000 ± 8014.155407 in #57


Epoch #72: 1001it [00:02, 333.85it/s, env_step=72000, gradient_step=7200, len=200, n/ep=1, n/st=100, rew=11814.00]                                                                                


Epoch #72: test_reward: 10580.600000 ± 4175.694390, best_reward: 18248.900000 ± 8014.155407 in #57


Epoch #73: 1001it [00:03, 323.88it/s, env_step=73000, gradient_step=7300, len=305, n/ep=1, n/st=100, rew=20764.00]                                                                                


Epoch #73: test_reward: 13737.600000 ± 4855.458355, best_reward: 18248.900000 ± 8014.155407 in #57


Epoch #74: 1001it [00:02, 372.86it/s, env_step=74000, gradient_step=7400, len=112, n/ep=0, n/st=100, rew=6925.00]                                                                                 


Epoch #74: test_reward: 11855.000000 ± 5219.360995, best_reward: 18248.900000 ± 8014.155407 in #57


Epoch #75: 1001it [00:02, 377.74it/s, env_step=75000, gradient_step=7500, len=115, n/ep=0, n/st=100, rew=7020.00]                                                                                 


Epoch #75: test_reward: 12685.400000 ± 5910.501623, best_reward: 18248.900000 ± 8014.155407 in #57


Epoch #76: 1001it [00:03, 316.07it/s, env_step=76000, gradient_step=7600, len=251, n/ep=0, n/st=100, rew=15683.75]                                                                                


Epoch #76: test_reward: 11027.000000 ± 3723.662847, best_reward: 18248.900000 ± 8014.155407 in #57


Epoch #77: 1001it [00:02, 389.64it/s, env_step=77000, gradient_step=7700, len=126, n/ep=1, n/st=100, rew=8000.00]                                                                                 


Epoch #77: test_reward: 16802.600000 ± 5011.906069, best_reward: 18248.900000 ± 8014.155407 in #57


Epoch #78: 1001it [00:02, 362.67it/s, env_step=78000, gradient_step=7800, len=270, n/ep=1, n/st=100, rew=17761.00]                                                                                


Epoch #78: test_reward: 11962.600000 ± 4831.292233, best_reward: 18248.900000 ± 8014.155407 in #57


Epoch #79: 1001it [00:02, 352.36it/s, env_step=79000, gradient_step=7900, len=168, n/ep=0, n/st=100, rew=10616.75]                                                                                


Epoch #79: test_reward: 12503.400000 ± 6814.803536, best_reward: 18248.900000 ± 8014.155407 in #57


Epoch #80: 1001it [00:02, 382.54it/s, env_step=80000, gradient_step=8000, len=178, n/ep=0, n/st=100, rew=9986.50]                                                                                 


Epoch #80: test_reward: 12165.900000 ± 3764.118501, best_reward: 18248.900000 ± 8014.155407 in #57


Epoch #81: 1001it [00:02, 410.71it/s, env_step=81000, gradient_step=8100, len=166, n/ep=1, n/st=100, rew=9652.00]                                                                                 


Epoch #81: test_reward: 8626.400000 ± 5685.342473, best_reward: 18248.900000 ± 8014.155407 in #57


Epoch #82: 1001it [00:02, 409.82it/s, env_step=82000, gradient_step=8200, len=114, n/ep=0, n/st=100, rew=6849.00]                                                                                 


Epoch #82: test_reward: 13405.800000 ± 6905.782748, best_reward: 18248.900000 ± 8014.155407 in #57


Epoch #83: 1001it [00:02, 345.86it/s, env_step=83000, gradient_step=8300, len=400, n/ep=1, n/st=100, rew=24446.00]                                                                                


Epoch #83: test_reward: 13628.000000 ± 7657.557626, best_reward: 18248.900000 ± 8014.155407 in #57


Epoch #84: 1001it [00:02, 410.12it/s, env_step=84000, gradient_step=8400, len=124, n/ep=0, n/st=100, rew=6842.00]                                                                                 


Epoch #84: test_reward: 12619.100000 ± 3100.686132, best_reward: 18248.900000 ± 8014.155407 in #57


Epoch #85: 1001it [00:02, 376.31it/s, env_step=85000, gradient_step=8500, len=142, n/ep=0, n/st=100, rew=8672.00]                                                                                 


Epoch #85: test_reward: 15750.800000 ± 5115.727158, best_reward: 18248.900000 ± 8014.155407 in #57


Epoch #86: 1001it [00:03, 308.80it/s, env_step=86000, gradient_step=8600, len=238, n/ep=2, n/st=100, rew=15344.75]                                                                                


Epoch #86: test_reward: 13691.800000 ± 5613.057185, best_reward: 18248.900000 ± 8014.155407 in #57


Epoch #87: 1001it [00:02, 370.26it/s, env_step=87000, gradient_step=8700, len=181, n/ep=2, n/st=100, rew=10912.75]                                                                                


Epoch #87: test_reward: 11470.500000 ± 3839.976907, best_reward: 18248.900000 ± 8014.155407 in #57


Epoch #88: 1001it [00:02, 384.64it/s, env_step=88000, gradient_step=8800, len=125, n/ep=0, n/st=100, rew=6973.00]                                                                                 


Epoch #88: test_reward: 15291.800000 ± 8398.237646, best_reward: 18248.900000 ± 8014.155407 in #57


Epoch #89: 1001it [00:03, 324.77it/s, env_step=89000, gradient_step=8900, len=198, n/ep=0, n/st=100, rew=12533.00]                                                                                


Epoch #89: test_reward: 13056.300000 ± 7164.096790, best_reward: 18248.900000 ± 8014.155407 in #57


Epoch #90: 1001it [00:03, 333.38it/s, env_step=90000, gradient_step=9000, len=122, n/ep=0, n/st=100, rew=7346.00]                                                                                 


Epoch #90: test_reward: 14184.000000 ± 7138.440208, best_reward: 18248.900000 ± 8014.155407 in #57


Epoch #91: 1001it [00:02, 390.65it/s, env_step=91000, gradient_step=9100, len=263, n/ep=0, n/st=100, rew=16785.00]                                                                                


Epoch #91: test_reward: 10217.600000 ± 5176.987159, best_reward: 18248.900000 ± 8014.155407 in #57


Epoch #92: 1001it [00:02, 382.34it/s, env_step=92000, gradient_step=9200, len=182, n/ep=0, n/st=100, rew=11571.00]                                                                                


Epoch #92: test_reward: 13854.100000 ± 4537.626019, best_reward: 18248.900000 ± 8014.155407 in #57


Epoch #93: 1001it [00:02, 363.72it/s, env_step=93000, gradient_step=9300, len=126, n/ep=1, n/st=100, rew=7817.00]                                                                                 


Epoch #93: test_reward: 17935.800000 ± 9079.070226, best_reward: 18248.900000 ± 8014.155407 in #57


Epoch #94: 1001it [00:02, 370.97it/s, env_step=94000, gradient_step=9400, len=121, n/ep=2, n/st=100, rew=7067.25]                                                                                 


Epoch #94: test_reward: 13180.400000 ± 5922.814318, best_reward: 18248.900000 ± 8014.155407 in #57


Epoch #95: 1001it [00:02, 406.57it/s, env_step=95000, gradient_step=9500, len=293, n/ep=0, n/st=100, rew=20174.00]                                                                                


Epoch #95: test_reward: 13102.800000 ± 2982.881587, best_reward: 18248.900000 ± 8014.155407 in #57


Epoch #96: 1001it [00:02, 340.99it/s, env_step=96000, gradient_step=9600, len=108, n/ep=0, n/st=100, rew=6054.00]                                                                                 


Epoch #96: test_reward: 12045.400000 ± 6614.325169, best_reward: 18248.900000 ± 8014.155407 in #57


Epoch #97: 1001it [00:02, 396.50it/s, env_step=97000, gradient_step=9700, len=126, n/ep=0, n/st=100, rew=6968.00]                                                                                 


Epoch #97: test_reward: 19574.600000 ± 6677.279269, best_reward: 19574.600000 ± 6677.279269 in #97


Epoch #98: 1001it [00:03, 311.53it/s, env_step=98000, gradient_step=9800, len=204, n/ep=3, n/st=100, rew=12567.17]                                                                                


Epoch #98: test_reward: 10800.200000 ± 6774.678218, best_reward: 19574.600000 ± 6677.279269 in #97


Epoch #99: 1001it [00:03, 330.25it/s, env_step=99000, gradient_step=9900, len=400, n/ep=1, n/st=100, rew=27368.00]                                                                                


Epoch #99: test_reward: 12385.900000 ± 5269.123845, best_reward: 19574.600000 ± 6677.279269 in #97


Epoch #100: 1001it [00:02, 406.49it/s, env_step=100000, gradient_step=10000, len=154, n/ep=1, n/st=100, rew=9394.00]                                                                              


Epoch #100: test_reward: 11970.900000 ± 4810.603236, best_reward: 19574.600000 ± 6677.279269 in #97

InfoStats(gradient_step=10000, best_reward=19574.6, best_reward_std=6677.279269283261, train_step=100000, train_episode=505, test_step=200798, test_episode=1010, timing=TimingStats(total_time=383.4476127624512, train_time=271.68435525894165, train_time_collect=34.09118342399597, train_time_update=233.50307774543762, test_time=111.76325750350952, update_speed=368.07419368954925))

(the trained policy can be accessed via policy.policies[agents[1]])
Train with agent_opponent #19


Epoch #1: 1001it [00:02, 383.89it/s, env_step=1000, gradient_step=100, len=0, n/ep=0, n/st=100, rew=0.00]                                                                                         


Epoch #1: test_reward: 10138.000000 ± 2532.171400, best_reward: 13589.500000 ± 5610.163086 in #0


Epoch #2: 1001it [00:02, 393.85it/s, env_step=2000, gradient_step=200, len=0, n/ep=0, n/st=100, rew=0.00]                                                                                         


Epoch #2: test_reward: 17380.400000 ± 4020.115401, best_reward: 17380.400000 ± 4020.115401 in #2


Epoch #3: 1001it [00:02, 424.49it/s, env_step=3000, gradient_step=300, len=0, n/ep=0, n/st=100, rew=0.00]                                                                                         


Epoch #3: test_reward: 15975.000000 ± 4796.362664, best_reward: 17380.400000 ± 4020.115401 in #2


Epoch #4: 1001it [00:02, 434.58it/s, env_step=4000, gradient_step=400, len=34, n/ep=0, n/st=100, rew=1004.00]                                                                                     


Epoch #4: test_reward: 13701.000000 ± 7511.032898, best_reward: 17380.400000 ± 4020.115401 in #2


Epoch #5: 1001it [00:02, 402.12it/s, env_step=5000, gradient_step=500, len=44, n/ep=0, n/st=100, rew=1833.50]                                                                                     


Epoch #5: test_reward: 14640.500000 ± 4045.562390, best_reward: 17380.400000 ± 4020.115401 in #2


Epoch #6: 1001it [00:02, 430.33it/s, env_step=6000, gradient_step=600, len=44, n/ep=0, n/st=100, rew=1833.50]                                                                                     


Epoch #6: test_reward: 11835.600000 ± 5803.831772, best_reward: 17380.400000 ± 4020.115401 in #2


Epoch #7: 1001it [00:02, 401.96it/s, env_step=7000, gradient_step=700, len=44, n/ep=0, n/st=100, rew=1833.50]                                                                                     


Epoch #7: test_reward: 14670.300000 ± 4786.758236, best_reward: 17380.400000 ± 4020.115401 in #2


Epoch #8: 1001it [00:03, 314.75it/s, env_step=8000, gradient_step=800, len=80, n/ep=1, n/st=100, rew=4096.00]                                                                                     


Epoch #8: test_reward: 12188.300000 ± 8945.464214, best_reward: 17380.400000 ± 4020.115401 in #2


Epoch #9: 1001it [00:02, 350.26it/s, env_step=9000, gradient_step=900, len=88, n/ep=0, n/st=100, rew=4081.00]                                                                                     


Epoch #9: test_reward: 10476.800000 ± 5050.262742, best_reward: 17380.400000 ± 4020.115401 in #2


Epoch #10: 1001it [00:02, 385.84it/s, env_step=10000, gradient_step=1000, len=100, n/ep=1, n/st=100, rew=4200.00]                                                                                 


Epoch #10: test_reward: 18472.000000 ± 8247.186951, best_reward: 18472.000000 ± 8247.186951 in #10


Epoch #11: 1001it [00:02, 394.74it/s, env_step=11000, gradient_step=1100, len=108, n/ep=0, n/st=100, rew=5672.00]                                                                                 


Epoch #11: test_reward: 11405.200000 ± 6371.977602, best_reward: 18472.000000 ± 8247.186951 in #10


Epoch #12: 1001it [00:03, 317.60it/s, env_step=12000, gradient_step=1200, len=118, n/ep=0, n/st=100, rew=6692.75]                                                                                 


Epoch #12: test_reward: 10722.400000 ± 5622.256419, best_reward: 18472.000000 ± 8247.186951 in #10


Epoch #13: 1001it [00:02, 383.50it/s, env_step=13000, gradient_step=1300, len=130, n/ep=2, n/st=100, rew=7181.50]                                                                                 


Epoch #13: test_reward: 13837.600000 ± 7101.592697, best_reward: 18472.000000 ± 8247.186951 in #10


Epoch #14: 1001it [00:03, 326.94it/s, env_step=14000, gradient_step=1400, len=140, n/ep=1, n/st=100, rew=7784.50]                                                                                 


Epoch #14: test_reward: 15623.800000 ± 6226.367076, best_reward: 18472.000000 ± 8247.186951 in #10


Epoch #15: 1001it [00:02, 404.80it/s, env_step=15000, gradient_step=1500, len=146, n/ep=0, n/st=100, rew=6803.00]                                                                                 


Epoch #15: test_reward: 13884.500000 ± 3573.865309, best_reward: 18472.000000 ± 8247.186951 in #10


Epoch #16: 1001it [00:02, 376.22it/s, env_step=16000, gradient_step=1600, len=158, n/ep=0, n/st=100, rew=8912.00]                                                                                 


Epoch #16: test_reward: 14386.300000 ± 6127.532816, best_reward: 18472.000000 ± 8247.186951 in #10


Epoch #17: 1001it [00:02, 382.32it/s, env_step=17000, gradient_step=1700, len=170, n/ep=1, n/st=100, rew=10332.50]                                                                                


Epoch #17: test_reward: 12171.500000 ± 3452.762293, best_reward: 18472.000000 ± 8247.186951 in #10


Epoch #18: 1001it [00:02, 398.29it/s, env_step=18000, gradient_step=1800, len=180, n/ep=2, n/st=100, rew=10506.75]                                                                                


Epoch #18: test_reward: 16082.200000 ± 5575.428626, best_reward: 18472.000000 ± 8247.186951 in #10


Epoch #19: 1001it [00:02, 359.32it/s, env_step=19000, gradient_step=1900, len=146, n/ep=4, n/st=100, rew=8574.00]                                                                                 


Epoch #19: test_reward: 13751.100000 ± 7188.482572, best_reward: 18472.000000 ± 8247.186951 in #10


Epoch #20: 1001it [00:02, 352.81it/s, env_step=20000, gradient_step=2000, len=200, n/ep=1, n/st=100, rew=11622.00]                                                                                


Epoch #20: test_reward: 17237.700000 ± 6753.314742, best_reward: 18472.000000 ± 8247.186951 in #10


Epoch #21: 1001it [00:02, 409.19it/s, env_step=21000, gradient_step=2100, len=210, n/ep=2, n/st=100, rew=12850.50]                                                                                


Epoch #21: test_reward: 12679.700000 ± 5393.126738, best_reward: 18472.000000 ± 8247.186951 in #10


Epoch #22: 1001it [00:02, 422.40it/s, env_step=22000, gradient_step=2200, len=220, n/ep=1, n/st=100, rew=14077.50]                                                                                


Epoch #22: test_reward: 10384.000000 ± 4715.947667, best_reward: 18472.000000 ± 8247.186951 in #10


Epoch #23: 1001it [00:03, 329.11it/s, env_step=23000, gradient_step=2300, len=102, n/ep=0, n/st=100, rew=5727.75]                                                                                 


Epoch #23: test_reward: 10576.200000 ± 3742.979289, best_reward: 18472.000000 ± 8247.186951 in #10


Epoch #24: 1001it [00:02, 371.86it/s, env_step=24000, gradient_step=2400, len=189, n/ep=2, n/st=100, rew=10850.50]                                                                                


Epoch #24: test_reward: 13587.000000 ± 4616.407023, best_reward: 18472.000000 ± 8247.186951 in #10


Epoch #25: 1001it [00:02, 362.21it/s, env_step=25000, gradient_step=2500, len=250, n/ep=1, n/st=100, rew=14851.00]                                                                                


Epoch #25: test_reward: 17300.300000 ± 7463.991024, best_reward: 18472.000000 ± 8247.186951 in #10


Epoch #26: 1001it [00:02, 338.92it/s, env_step=26000, gradient_step=2600, len=209, n/ep=2, n/st=100, rew=13111.50]                                                                                


Epoch #26: test_reward: 9845.600000 ± 3640.940269, best_reward: 18472.000000 ± 8247.186951 in #10


Epoch #27: 1001it [00:02, 376.34it/s, env_step=27000, gradient_step=2700, len=270, n/ep=1, n/st=100, rew=17300.00]                                                                                


Epoch #27: test_reward: 16803.400000 ± 5709.814291, best_reward: 18472.000000 ± 8247.186951 in #10


Epoch #28: 1001it [00:02, 375.34it/s, env_step=28000, gradient_step=2800, len=146, n/ep=0, n/st=100, rew=8790.00]                                                                                 


Epoch #28: test_reward: 15358.500000 ± 8335.481681, best_reward: 18472.000000 ± 8247.186951 in #10


Epoch #29: 1001it [00:02, 396.75it/s, env_step=29000, gradient_step=2900, len=118, n/ep=1, n/st=100, rew=6642.00]                                                                                 


Epoch #29: test_reward: 14194.200000 ± 8281.001036, best_reward: 18472.000000 ± 8247.186951 in #10


Epoch #30: 1001it [00:03, 318.74it/s, env_step=30000, gradient_step=3000, len=300, n/ep=1, n/st=100, rew=19129.00]                                                                                


Epoch #30: test_reward: 9933.000000 ± 5987.449090, best_reward: 18472.000000 ± 8247.186951 in #10


Epoch #31: 1001it [00:02, 409.89it/s, env_step=31000, gradient_step=3100, len=150, n/ep=0, n/st=100, rew=9372.00]                                                                                 


Epoch #31: test_reward: 9908.600000 ± 5099.347903, best_reward: 18472.000000 ± 8247.186951 in #10


Epoch #32: 1001it [00:02, 423.81it/s, env_step=32000, gradient_step=3200, len=184, n/ep=1, n/st=100, rew=11427.00]                                                                                


Epoch #32: test_reward: 9970.800000 ± 3166.951872, best_reward: 18472.000000 ± 8247.186951 in #10


Epoch #33: 1001it [00:02, 362.78it/s, env_step=33000, gradient_step=3300, len=138, n/ep=0, n/st=100, rew=8296.00]                                                                                 


Epoch #33: test_reward: 11367.300000 ± 6218.036733, best_reward: 18472.000000 ± 8247.186951 in #10


Epoch #34: 1001it [00:02, 349.25it/s, env_step=34000, gradient_step=3400, len=229, n/ep=2, n/st=100, rew=14522.00]                                                                                


Epoch #34: test_reward: 12437.200000 ± 5864.547805, best_reward: 18472.000000 ± 8247.186951 in #10


Epoch #35: 1001it [00:02, 339.04it/s, env_step=35000, gradient_step=3500, len=148, n/ep=2, n/st=100, rew=8732.00]                                                                                 


Epoch #35: test_reward: 9553.500000 ± 5104.713885, best_reward: 18472.000000 ± 8247.186951 in #10


Epoch #36: 1001it [00:02, 376.37it/s, env_step=36000, gradient_step=3600, len=167, n/ep=0, n/st=100, rew=9802.00]                                                                                 


Epoch #36: test_reward: 12597.800000 ± 6120.888527, best_reward: 18472.000000 ± 8247.186951 in #10


Epoch #37: 1001it [00:02, 424.50it/s, env_step=37000, gradient_step=3700, len=142, n/ep=0, n/st=100, rew=8945.00]                                                                                 


Epoch #37: test_reward: 18120.400000 ± 6828.613742, best_reward: 18472.000000 ± 8247.186951 in #10


Epoch #38: 1001it [00:02, 366.64it/s, env_step=38000, gradient_step=3800, len=139, n/ep=0, n/st=100, rew=8573.50]                                                                                 


Epoch #38: test_reward: 18542.100000 ± 8951.992967, best_reward: 18542.100000 ± 8951.992967 in #38


Epoch #39: 1001it [00:02, 335.50it/s, env_step=39000, gradient_step=3900, len=129, n/ep=0, n/st=100, rew=7590.00]                                                                                 


Epoch #39: test_reward: 14526.800000 ± 2884.177727, best_reward: 18542.100000 ± 8951.992967 in #38


Epoch #40: 1001it [00:02, 356.85it/s, env_step=40000, gradient_step=4000, len=400, n/ep=6, n/st=100, rew=26836.33]                                                                                


Epoch #40: test_reward: 11440.100000 ± 2538.389428, best_reward: 18542.100000 ± 8951.992967 in #38


Epoch #41: 1001it [00:02, 409.20it/s, env_step=41000, gradient_step=4100, len=223, n/ep=0, n/st=100, rew=14924.00]                                                                                


Epoch #41: test_reward: 18013.000000 ± 7963.515066, best_reward: 18542.100000 ± 8951.992967 in #38


Epoch #42: 1001it [00:02, 369.87it/s, env_step=42000, gradient_step=4200, len=160, n/ep=1, n/st=100, rew=9900.00]                                                                                 


Epoch #42: test_reward: 20086.600000 ± 6692.875857, best_reward: 20086.600000 ± 6692.875857 in #42


Epoch #43: 1001it [00:02, 384.49it/s, env_step=43000, gradient_step=4300, len=114, n/ep=0, n/st=100, rew=6691.00]                                                                                 


Epoch #43: test_reward: 10172.400000 ± 7210.775537, best_reward: 20086.600000 ± 6692.875857 in #42


Epoch #44: 1001it [00:02, 400.59it/s, env_step=44000, gradient_step=4400, len=86, n/ep=0, n/st=100, rew=4874.75]                                                                                  


Epoch #44: test_reward: 9307.100000 ± 3331.308165, best_reward: 20086.600000 ± 6692.875857 in #42


Epoch #45: 1001it [00:02, 388.14it/s, env_step=45000, gradient_step=4500, len=126, n/ep=1, n/st=100, rew=7614.00]                                                                                 


Epoch #45: test_reward: 19834.400000 ± 6919.191328, best_reward: 20086.600000 ± 6692.875857 in #42


Epoch #46: 1001it [00:02, 351.39it/s, env_step=46000, gradient_step=4600, len=253, n/ep=0, n/st=100, rew=16827.00]                                                                                


Epoch #46: test_reward: 11138.200000 ± 4976.448509, best_reward: 20086.600000 ± 6692.875857 in #42


Epoch #47: 1001it [00:02, 401.81it/s, env_step=47000, gradient_step=4700, len=138, n/ep=1, n/st=100, rew=7976.00]                                                                                 


Epoch #47: test_reward: 9219.500000 ± 5951.953549, best_reward: 20086.600000 ± 6692.875857 in #42


Epoch #48: 1001it [00:02, 435.44it/s, env_step=48000, gradient_step=4800, len=123, n/ep=1, n/st=100, rew=7200.50]                                                                                 


Epoch #48: test_reward: 14584.400000 ± 5329.098520, best_reward: 20086.600000 ± 6692.875857 in #42


Epoch #49: 1001it [00:02, 355.38it/s, env_step=49000, gradient_step=4900, len=213, n/ep=0, n/st=100, rew=13491.00]                                                                                


Epoch #49: test_reward: 9640.500000 ± 7417.037835, best_reward: 20086.600000 ± 6692.875857 in #42


Epoch #50: 1001it [00:02, 385.01it/s, env_step=50000, gradient_step=5000, len=175, n/ep=0, n/st=100, rew=10971.00]                                                                                


Epoch #50: test_reward: 11876.800000 ± 4645.428695, best_reward: 20086.600000 ± 6692.875857 in #42


Epoch #51: 1001it [00:03, 321.07it/s, env_step=51000, gradient_step=5100, len=128, n/ep=1, n/st=100, rew=7470.00]                                                                                 


Epoch #51: test_reward: 14129.900000 ± 6307.661476, best_reward: 20086.600000 ± 6692.875857 in #42


Epoch #52: 1001it [00:02, 424.88it/s, env_step=52000, gradient_step=5200, len=295, n/ep=0, n/st=100, rew=20031.25]                                                                                


Epoch #52: test_reward: 14180.500000 ± 8854.184810, best_reward: 20086.600000 ± 6692.875857 in #42


Epoch #53: 1001it [00:02, 427.52it/s, env_step=53000, gradient_step=5300, len=202, n/ep=2, n/st=100, rew=13106.50]                                                                                


Epoch #53: test_reward: 9451.200000 ± 2786.976634, best_reward: 20086.600000 ± 6692.875857 in #42


Epoch #54: 1001it [00:02, 379.69it/s, env_step=54000, gradient_step=5400, len=136, n/ep=2, n/st=100, rew=8426.75]                                                                                 


Epoch #54: test_reward: 8680.000000 ± 6272.630325, best_reward: 20086.600000 ± 6692.875857 in #42


Epoch #55: 1001it [00:02, 396.58it/s, env_step=55000, gradient_step=5500, len=82, n/ep=0, n/st=100, rew=4634.00]                                                                                  


Epoch #55: test_reward: 15118.800000 ± 5763.899111, best_reward: 20086.600000 ± 6692.875857 in #42


Epoch #56: 1001it [00:02, 369.55it/s, env_step=56000, gradient_step=5600, len=138, n/ep=0, n/st=100, rew=8195.17]                                                                                 


Epoch #56: test_reward: 8391.200000 ± 2221.997876, best_reward: 20086.600000 ± 6692.875857 in #42


Epoch #57: 1001it [00:02, 348.26it/s, env_step=57000, gradient_step=5700, len=180, n/ep=0, n/st=100, rew=9938.00]                                                                                 


Epoch #57: test_reward: 8370.800000 ± 3608.225209, best_reward: 20086.600000 ± 6692.875857 in #42


Epoch #58: 1001it [00:02, 342.52it/s, env_step=58000, gradient_step=5800, len=143, n/ep=1, n/st=100, rew=9267.00]                                                                                 


Epoch #58: test_reward: 12910.100000 ± 6355.064224, best_reward: 20086.600000 ± 6692.875857 in #42


Epoch #59: 1001it [00:02, 430.55it/s, env_step=59000, gradient_step=5900, len=190, n/ep=1, n/st=100, rew=11907.00]                                                                                


Epoch #59: test_reward: 16498.800000 ± 6559.385654, best_reward: 20086.600000 ± 6692.875857 in #42


Epoch #60: 1001it [00:02, 383.31it/s, env_step=60000, gradient_step=6000, len=173, n/ep=2, n/st=100, rew=10528.25]                                                                                


Epoch #60: test_reward: 11204.800000 ± 5731.257956, best_reward: 20086.600000 ± 6692.875857 in #42


Epoch #61: 1001it [00:02, 342.77it/s, env_step=61000, gradient_step=6100, len=143, n/ep=0, n/st=100, rew=8702.88]                                                                                 


Epoch #61: test_reward: 14257.400000 ± 8788.732061, best_reward: 20086.600000 ± 6692.875857 in #42


Epoch #62: 1001it [00:02, 400.06it/s, env_step=62000, gradient_step=6200, len=219, n/ep=0, n/st=100, rew=14300.00]                                                                                


Epoch #62: test_reward: 10900.200000 ± 3048.239059, best_reward: 20086.600000 ± 6692.875857 in #42


Epoch #63: 1001it [00:02, 343.24it/s, env_step=63000, gradient_step=6300, len=109, n/ep=0, n/st=100, rew=6745.00]                                                                                 


Epoch #63: test_reward: 18115.100000 ± 6833.175520, best_reward: 20086.600000 ± 6692.875857 in #42


Epoch #64: 1001it [00:02, 391.93it/s, env_step=64000, gradient_step=6400, len=191, n/ep=2, n/st=100, rew=12393.50]                                                                                


Epoch #64: test_reward: 12706.500000 ± 3997.240404, best_reward: 20086.600000 ± 6692.875857 in #42


Epoch #65: 1001it [00:02, 413.50it/s, env_step=65000, gradient_step=6500, len=258, n/ep=1, n/st=100, rew=15759.00]                                                                                


Epoch #65: test_reward: 14613.600000 ± 4634.076892, best_reward: 20086.600000 ± 6692.875857 in #42


Epoch #66: 1001it [00:02, 405.78it/s, env_step=66000, gradient_step=6600, len=115, n/ep=0, n/st=100, rew=6860.00]                                                                                 


Epoch #66: test_reward: 12666.600000 ± 3110.894926, best_reward: 20086.600000 ± 6692.875857 in #42


Epoch #67: 1001it [00:02, 400.09it/s, env_step=67000, gradient_step=6700, len=149, n/ep=0, n/st=100, rew=8696.00]                                                                                 


Epoch #67: test_reward: 9474.200000 ± 3318.690880, best_reward: 20086.600000 ± 6692.875857 in #42


Epoch #68: 1001it [00:03, 328.92it/s, env_step=68000, gradient_step=6800, len=126, n/ep=0, n/st=100, rew=7725.50]                                                                                 


Epoch #68: test_reward: 10150.600000 ± 6518.791195, best_reward: 20086.600000 ± 6692.875857 in #42


Epoch #69: 1001it [00:03, 332.83it/s, env_step=69000, gradient_step=6900, len=60, n/ep=0, n/st=100, rew=2095.00]                                                                                  


Epoch #69: test_reward: 18087.300000 ± 7884.161719, best_reward: 20086.600000 ± 6692.875857 in #42


Epoch #70: 1001it [00:02, 371.39it/s, env_step=70000, gradient_step=7000, len=203, n/ep=0, n/st=100, rew=13217.50]                                                                                


Epoch #70: test_reward: 25390.000000 ± 7256.625056, best_reward: 25390.000000 ± 7256.625056 in #70

InfoStats(gradient_step=7000, best_reward=25390.0, best_reward_std=7256.625055768005, train_step=70000, train_episode=350, test_step=143508, test_episode=710, timing=TimingStats(total_time=267.0329568386078, train_time=187.20722103118896, train_time_collect=23.56826114654541, train_time_update=160.80367708206177, test_time=79.82573580741882, update_speed=373.91720049269844))

(the trained policy can be accessed via policy.policies[agents[1]])
Train with agent_opponent #12


Epoch #1: 1001it [00:02, 491.97it/s, env_step=1000, gradient_step=100, len=0, n/ep=0, n/st=100, rew=0.00]                                                                                         


Epoch #1: test_reward: 11381.400000 ± 2530.742626, best_reward: 11381.400000 ± 2530.742626 in #1


Epoch #2: 1001it [00:02, 387.41it/s, env_step=2000, gradient_step=200, len=0, n/ep=0, n/st=100, rew=0.00]                                                                                         


Epoch #2: test_reward: 13969.400000 ± 7840.025921, best_reward: 13969.400000 ± 7840.025921 in #2


Epoch #3: 1001it [00:02, 363.01it/s, env_step=3000, gradient_step=300, len=24, n/ep=0, n/st=100, rew=519.00]                                                                                      


Epoch #3: test_reward: 10245.700000 ± 5332.431473, best_reward: 13969.400000 ± 7840.025921 in #2


Epoch #4: 1001it [00:02, 447.42it/s, env_step=4000, gradient_step=400, len=24, n/ep=0, n/st=100, rew=519.00]                                                                                      


Epoch #4: test_reward: 9404.000000 ± 4625.743270, best_reward: 13969.400000 ± 7840.025921 in #2


Epoch #5: 1001it [00:02, 481.58it/s, env_step=5000, gradient_step=500, len=24, n/ep=0, n/st=100, rew=519.00]                                                                                      


Epoch #5: test_reward: 10999.200000 ± 4961.676729, best_reward: 13969.400000 ± 7840.025921 in #2


Epoch #6: 1001it [00:02, 484.28it/s, env_step=6000, gradient_step=600, len=56, n/ep=0, n/st=100, rew=1505.00]                                                                                     


Epoch #6: test_reward: 16829.400000 ± 5459.539197, best_reward: 16829.400000 ± 5459.539197 in #6


Epoch #7: 1001it [00:02, 455.72it/s, env_step=7000, gradient_step=700, len=66, n/ep=0, n/st=100, rew=2118.00]                                                                                     


Epoch #7: test_reward: 10278.600000 ± 4314.470355, best_reward: 16829.400000 ± 5459.539197 in #6


Epoch #8: 1001it [00:02, 388.43it/s, env_step=8000, gradient_step=800, len=80, n/ep=2, n/st=100, rew=2526.50]                                                                                     


Epoch #8: test_reward: 7193.300000 ± 6042.770756, best_reward: 16829.400000 ± 5459.539197 in #6


Epoch #9: 1001it [00:02, 445.33it/s, env_step=9000, gradient_step=900, len=82, n/ep=3, n/st=100, rew=2761.50]                                                                                     


Epoch #9: test_reward: 7512.100000 ± 2725.603693, best_reward: 16829.400000 ± 5459.539197 in #6


Epoch #10: 1001it [00:02, 457.11it/s, env_step=10000, gradient_step=1000, len=100, n/ep=2, n/st=100, rew=3641.50]                                                                                 


Epoch #10: test_reward: 10779.000000 ± 4702.691591, best_reward: 16829.400000 ± 5459.539197 in #6


Epoch #11: 1001it [00:02, 463.50it/s, env_step=11000, gradient_step=1100, len=110, n/ep=3, n/st=100, rew=4055.00]                                                                                 


Epoch #11: test_reward: 6885.100000 ± 2156.611808, best_reward: 16829.400000 ± 5459.539197 in #6


Epoch #12: 1001it [00:02, 444.98it/s, env_step=12000, gradient_step=1200, len=20, n/ep=2, n/st=100, rew=547.00]                                                                                   


Epoch #12: test_reward: 10236.000000 ± 4085.432878, best_reward: 16829.400000 ± 5459.539197 in #6


Epoch #13: 1001it [00:02, 347.67it/s, env_step=13000, gradient_step=1300, len=130, n/ep=1, n/st=100, rew=5218.00]                                                                                 


Epoch #13: test_reward: 9406.300000 ± 4700.276440, best_reward: 16829.400000 ± 5459.539197 in #6


Epoch #14: 1001it [00:02, 393.79it/s, env_step=14000, gradient_step=1400, len=140, n/ep=1, n/st=100, rew=5716.50]                                                                                 


Epoch #14: test_reward: 9693.000000 ± 3659.883878, best_reward: 16829.400000 ± 5459.539197 in #6


Epoch #15: 1001it [00:02, 400.10it/s, env_step=15000, gradient_step=1500, len=88, n/ep=1, n/st=100, rew=4056.00]                                                                                  


Epoch #15: test_reward: 12446.800000 ± 3020.544348, best_reward: 16829.400000 ± 5459.539197 in #6


Epoch #16: 1001it [00:02, 408.35it/s, env_step=16000, gradient_step=1600, len=62, n/ep=2, n/st=100, rew=2331.50]                                                                                  


Epoch #16: test_reward: 10982.400000 ± 3436.491676, best_reward: 16829.400000 ± 5459.539197 in #6


Epoch #17: 1001it [00:02, 386.51it/s, env_step=17000, gradient_step=1700, len=135, n/ep=0, n/st=100, rew=5894.33]                                                                                 


Epoch #17: test_reward: 10753.500000 ± 4582.807726, best_reward: 16829.400000 ± 5459.539197 in #6


Epoch #18: 1001it [00:02, 466.80it/s, env_step=18000, gradient_step=1800, len=36, n/ep=1, n/st=100, rew=1114.00]                                                                                  


Epoch #18: test_reward: 11065.300000 ± 5796.665887, best_reward: 16829.400000 ± 5459.539197 in #6


Epoch #19: 1001it [00:02, 425.77it/s, env_step=19000, gradient_step=1900, len=142, n/ep=2, n/st=100, rew=5869.00]                                                                                 


Epoch #19: test_reward: 13674.300000 ± 3066.334165, best_reward: 16829.400000 ± 5459.539197 in #6


Epoch #20: 1001it [00:02, 404.12it/s, env_step=20000, gradient_step=2000, len=200, n/ep=1, n/st=100, rew=9046.00]                                                                                 


Epoch #20: test_reward: 14611.700000 ± 6481.625198, best_reward: 16829.400000 ± 5459.539197 in #6


Epoch #21: 1001it [00:02, 408.88it/s, env_step=21000, gradient_step=2100, len=144, n/ep=1, n/st=100, rew=6925.00]                                                                                 


Epoch #21: test_reward: 11829.600000 ± 5034.790307, best_reward: 16829.400000 ± 5459.539197 in #6


Epoch #22: 1001it [00:02, 431.50it/s, env_step=22000, gradient_step=2200, len=72, n/ep=0, n/st=100, rew=2679.00]                                                                                  


Epoch #22: test_reward: 14079.000000 ± 4773.598433, best_reward: 16829.400000 ± 5459.539197 in #6


Epoch #23: 1001it [00:02, 428.08it/s, env_step=23000, gradient_step=2300, len=118, n/ep=0, n/st=100, rew=5208.00]                                                                                 


Epoch #23: test_reward: 11780.000000 ± 7158.568237, best_reward: 16829.400000 ± 5459.539197 in #6


Epoch #24: 1001it [00:02, 344.16it/s, env_step=24000, gradient_step=2400, len=152, n/ep=1, n/st=100, rew=7507.00]                                                                                 


Epoch #24: test_reward: 8432.000000 ± 3815.816977, best_reward: 16829.400000 ± 5459.539197 in #6


Epoch #25: 1001it [00:02, 390.82it/s, env_step=25000, gradient_step=2500, len=246, n/ep=0, n/st=100, rew=11431.00]                                                                                


Epoch #25: test_reward: 5870.900000 ± 4464.787933, best_reward: 16829.400000 ± 5459.539197 in #6


Epoch #26: 1001it [00:02, 463.42it/s, env_step=26000, gradient_step=2600, len=260, n/ep=1, n/st=100, rew=12890.00]                                                                                


Epoch #26: test_reward: 14064.400000 ± 3541.952207, best_reward: 16829.400000 ± 5459.539197 in #6


Epoch #27: 1001it [00:02, 404.00it/s, env_step=27000, gradient_step=2700, len=68, n/ep=0, n/st=100, rew=3167.00]                                                                                  


Epoch #27: test_reward: 11160.800000 ± 5311.086156, best_reward: 16829.400000 ± 5459.539197 in #6


Epoch #28: 1001it [00:02, 365.93it/s, env_step=28000, gradient_step=2800, len=122, n/ep=0, n/st=100, rew=5910.00]                                                                                 


Epoch #28: test_reward: 10285.100000 ± 3361.885377, best_reward: 16829.400000 ± 5459.539197 in #6


Epoch #29: 1001it [00:02, 401.63it/s, env_step=29000, gradient_step=2900, len=180, n/ep=0, n/st=100, rew=9087.00]                                                                                 


Epoch #29: test_reward: 12638.000000 ± 6419.193890, best_reward: 16829.400000 ± 5459.539197 in #6


Epoch #30: 1001it [00:02, 440.91it/s, env_step=30000, gradient_step=3000, len=298, n/ep=0, n/st=100, rew=16465.00]                                                                                


Epoch #30: test_reward: 10809.900000 ± 2155.205904, best_reward: 16829.400000 ± 5459.539197 in #6


Epoch #31: 1001it [00:02, 385.83it/s, env_step=31000, gradient_step=3100, len=310, n/ep=1, n/st=100, rew=16625.00]                                                                                


Epoch #31: test_reward: 8774.400000 ± 3424.638469, best_reward: 16829.400000 ± 5459.539197 in #6


Epoch #32: 1001it [00:02, 381.86it/s, env_step=32000, gradient_step=3200, len=320, n/ep=2, n/st=100, rew=16889.50]                                                                                


Epoch #32: test_reward: 10394.300000 ± 3880.284269, best_reward: 16829.400000 ± 5459.539197 in #6


Epoch #33: 1001it [00:02, 435.30it/s, env_step=33000, gradient_step=3300, len=325, n/ep=0, n/st=100, rew=16510.00]                                                                                


Epoch #33: test_reward: 14430.200000 ± 5064.381636, best_reward: 16829.400000 ± 5459.539197 in #6


Epoch #34: 1001it [00:02, 345.41it/s, env_step=34000, gradient_step=3400, len=107, n/ep=2, n/st=100, rew=5438.00]                                                                                 


Epoch #34: test_reward: 9775.000000 ± 6161.372931, best_reward: 16829.400000 ± 5459.539197 in #6


Epoch #35: 1001it [00:02, 352.62it/s, env_step=35000, gradient_step=3500, len=131, n/ep=2, n/st=100, rew=7171.00]                                                                                 


Epoch #35: test_reward: 7364.200000 ± 2021.857453, best_reward: 16829.400000 ± 5459.539197 in #6


Epoch #36: 1001it [00:02, 391.80it/s, env_step=36000, gradient_step=3600, len=36, n/ep=0, n/st=100, rew=1304.00]                                                                                  


Epoch #36: test_reward: 6585.000000 ± 2387.126515, best_reward: 16829.400000 ± 5459.539197 in #6


Epoch #37: 1001it [00:02, 423.35it/s, env_step=37000, gradient_step=3700, len=112, n/ep=2, n/st=100, rew=5456.00]                                                                                 


Epoch #37: test_reward: 11832.100000 ± 5234.144370, best_reward: 16829.400000 ± 5459.539197 in #6


Epoch #38: 1001it [00:02, 411.98it/s, env_step=38000, gradient_step=3800, len=263, n/ep=0, n/st=100, rew=14663.50]                                                                                


Epoch #38: test_reward: 6359.500000 ± 5720.806294, best_reward: 16829.400000 ± 5459.539197 in #6


Epoch #39: 1001it [00:02, 417.78it/s, env_step=39000, gradient_step=3900, len=240, n/ep=1, n/st=100, rew=12853.50]                                                                                


Epoch #39: test_reward: 9441.400000 ± 5173.365640, best_reward: 16829.400000 ± 5459.539197 in #6


Epoch #40: 1001it [00:02, 344.60it/s, env_step=40000, gradient_step=4000, len=197, n/ep=4, n/st=100, rew=9730.25]                                                                                 


Epoch #40: test_reward: 13762.500000 ± 6510.009267, best_reward: 16829.400000 ± 5459.539197 in #6


Epoch #41: 1001it [00:02, 450.40it/s, env_step=41000, gradient_step=4100, len=177, n/ep=0, n/st=100, rew=8905.50]                                                                                 


Epoch #41: test_reward: 6440.800000 ± 2850.887188, best_reward: 16829.400000 ± 5459.539197 in #6


Epoch #42: 1001it [00:02, 352.20it/s, env_step=42000, gradient_step=4200, len=330, n/ep=1, n/st=100, rew=19895.00]                                                                                


Epoch #42: test_reward: 10495.900000 ± 6982.029482, best_reward: 16829.400000 ± 5459.539197 in #6


Epoch #43: 1001it [00:02, 412.75it/s, env_step=43000, gradient_step=4300, len=132, n/ep=0, n/st=100, rew=6188.50]                                                                                 


Epoch #43: test_reward: 11776.300000 ± 6730.552876, best_reward: 16829.400000 ± 5459.539197 in #6


Epoch #44: 1001it [00:02, 408.95it/s, env_step=44000, gradient_step=4400, len=172, n/ep=1, n/st=100, rew=10460.50]                                                                                


Epoch #44: test_reward: 10688.200000 ± 5027.811090, best_reward: 16829.400000 ± 5459.539197 in #6


Epoch #45: 1001it [00:02, 379.85it/s, env_step=45000, gradient_step=4500, len=226, n/ep=1, n/st=100, rew=12654.00]                                                                                


Epoch #45: test_reward: 18349.000000 ± 5792.736020, best_reward: 18349.000000 ± 5792.736020 in #45


Epoch #46: 1001it [00:02, 404.31it/s, env_step=46000, gradient_step=4600, len=280, n/ep=0, n/st=100, rew=16297.00]                                                                                


Epoch #46: test_reward: 9830.100000 ± 2762.118117, best_reward: 18349.000000 ± 5792.736020 in #45


Epoch #47: 1001it [00:02, 439.40it/s, env_step=47000, gradient_step=4700, len=242, n/ep=1, n/st=100, rew=13073.00]                                                                                


Epoch #47: test_reward: 15047.000000 ± 5493.049881, best_reward: 18349.000000 ± 5792.736020 in #45


Epoch #48: 1001it [00:02, 422.44it/s, env_step=48000, gradient_step=4800, len=166, n/ep=1, n/st=100, rew=9114.00]                                                                                 


Epoch #48: test_reward: 11423.200000 ± 4167.239105, best_reward: 18349.000000 ± 5792.736020 in #45


Epoch #49: 1001it [00:02, 369.27it/s, env_step=49000, gradient_step=4900, len=238, n/ep=1, n/st=100, rew=12871.00]                                                                                


Epoch #49: test_reward: 7308.000000 ± 1694.593048, best_reward: 18349.000000 ± 5792.736020 in #45


Epoch #50: 1001it [00:02, 466.17it/s, env_step=50000, gradient_step=5000, len=237, n/ep=2, n/st=100, rew=13357.00]                                                                                


Epoch #50: test_reward: 14573.400000 ± 4014.496736, best_reward: 18349.000000 ± 5792.736020 in #45


Epoch #51: 1001it [00:02, 410.38it/s, env_step=51000, gradient_step=5100, len=116, n/ep=0, n/st=100, rew=6063.00]                                                                                 


Epoch #51: test_reward: 8982.400000 ± 4685.695662, best_reward: 18349.000000 ± 5792.736020 in #45


Epoch #52: 1001it [00:02, 446.29it/s, env_step=52000, gradient_step=5200, len=120, n/ep=1, n/st=100, rew=6456.00]                                                                                 


Epoch #52: test_reward: 13966.200000 ± 5230.435829, best_reward: 18349.000000 ± 5792.736020 in #45


Epoch #53: 1001it [00:02, 388.48it/s, env_step=53000, gradient_step=5300, len=164, n/ep=1, n/st=100, rew=9024.00]                                                                                 


Epoch #53: test_reward: 7128.800000 ± 2958.777883, best_reward: 18349.000000 ± 5792.736020 in #45


Epoch #54: 1001it [00:02, 444.78it/s, env_step=54000, gradient_step=5400, len=86, n/ep=1, n/st=100, rew=2770.00]                                                                                  


Epoch #54: test_reward: 11156.100000 ± 6045.075061, best_reward: 18349.000000 ± 5792.736020 in #45


Epoch #55: 1001it [00:02, 344.19it/s, env_step=55000, gradient_step=5500, len=162, n/ep=0, n/st=100, rew=9001.50]                                                                                 


Epoch #55: test_reward: 9522.400000 ± 5607.679791, best_reward: 18349.000000 ± 5792.736020 in #45


Epoch #56: 1001it [00:02, 431.80it/s, env_step=56000, gradient_step=5600, len=125, n/ep=0, n/st=100, rew=6810.00]                                                                                 


Epoch #56: test_reward: 8109.800000 ± 5246.529478, best_reward: 18349.000000 ± 5792.736020 in #45


Epoch #57: 1001it [00:02, 432.95it/s, env_step=57000, gradient_step=5700, len=142, n/ep=0, n/st=100, rew=8131.00]                                                                                 


Epoch #57: test_reward: 10070.400000 ± 4037.507924, best_reward: 18349.000000 ± 5792.736020 in #45


Epoch #58: 1001it [00:02, 375.43it/s, env_step=58000, gradient_step=5800, len=215, n/ep=3, n/st=100, rew=12552.33]                                                                                


Epoch #58: test_reward: 9026.800000 ± 3291.325715, best_reward: 18349.000000 ± 5792.736020 in #45


Epoch #59: 1001it [00:02, 451.74it/s, env_step=59000, gradient_step=5900, len=234, n/ep=0, n/st=100, rew=13777.00]                                                                                


Epoch #59: test_reward: 14969.200000 ± 6958.297964, best_reward: 18349.000000 ± 5792.736020 in #45


Epoch #60: 1001it [00:02, 384.54it/s, env_step=60000, gradient_step=6000, len=150, n/ep=1, n/st=100, rew=8355.00]                                                                                 


Epoch #60: test_reward: 13052.500000 ± 4377.852105, best_reward: 18349.000000 ± 5792.736020 in #45


Epoch #61: 1001it [00:02, 414.31it/s, env_step=61000, gradient_step=6100, len=99, n/ep=2, n/st=100, rew=4879.25]                                                                                  


Epoch #61: test_reward: 18254.200000 ± 5482.426849, best_reward: 18349.000000 ± 5792.736020 in #45


Epoch #62: 1001it [00:02, 397.68it/s, env_step=62000, gradient_step=6200, len=148, n/ep=3, n/st=100, rew=8070.00]                                                                                 


Epoch #62: test_reward: 7971.000000 ± 4015.214291, best_reward: 18349.000000 ± 5792.736020 in #45


Epoch #63: 1001it [00:02, 390.13it/s, env_step=63000, gradient_step=6300, len=190, n/ep=0, n/st=100, rew=10805.00]                                                                                


Epoch #63: test_reward: 14345.200000 ± 5756.423782, best_reward: 18349.000000 ± 5792.736020 in #45


Epoch #64: 1001it [00:02, 400.88it/s, env_step=64000, gradient_step=6400, len=144, n/ep=2, n/st=100, rew=8075.25]                                                                                 


Epoch #64: test_reward: 6901.800000 ± 2176.180222, best_reward: 18349.000000 ± 5792.736020 in #45


Epoch #65: 1001it [00:02, 454.15it/s, env_step=65000, gradient_step=6500, len=143, n/ep=2, n/st=100, rew=7578.00]                                                                                 


Epoch #65: test_reward: 19121.200000 ± 7333.501767, best_reward: 19121.200000 ± 7333.501767 in #65


Epoch #66: 1001it [00:02, 449.64it/s, env_step=66000, gradient_step=6600, len=78, n/ep=0, n/st=100, rew=3843.00]                                                                                  


Epoch #66: test_reward: 14379.800000 ± 5855.237669, best_reward: 19121.200000 ± 7333.501767 in #65


Epoch #67: 1001it [00:02, 360.26it/s, env_step=67000, gradient_step=6700, len=124, n/ep=1, n/st=100, rew=6776.00]                                                                                 


Epoch #67: test_reward: 8071.400000 ± 4182.305972, best_reward: 19121.200000 ± 7333.501767 in #65


Epoch #68: 1001it [00:02, 418.99it/s, env_step=68000, gradient_step=6800, len=180, n/ep=1, n/st=100, rew=9943.50]                                                                                 


Epoch #68: test_reward: 8144.800000 ± 3183.960829, best_reward: 19121.200000 ± 7333.501767 in #65


Epoch #69: 1001it [00:02, 438.93it/s, env_step=69000, gradient_step=6900, len=154, n/ep=0, n/st=100, rew=9083.00]                                                                                 


Epoch #69: test_reward: 9723.000000 ± 4313.824127, best_reward: 19121.200000 ± 7333.501767 in #65


Epoch #70: 1001it [00:02, 400.78it/s, env_step=70000, gradient_step=7000, len=186, n/ep=0, n/st=100, rew=9197.00]                                                                                 


Epoch #70: test_reward: 8119.200000 ± 3454.207544, best_reward: 19121.200000 ± 7333.501767 in #65


Epoch #71: 1001it [00:02, 433.30it/s, env_step=71000, gradient_step=7100, len=148, n/ep=2, n/st=100, rew=8660.50]                                                                                 


Epoch #71: test_reward: 15520.900000 ± 4673.726660, best_reward: 19121.200000 ± 7333.501767 in #65


Epoch #72: 1001it [00:02, 416.55it/s, env_step=72000, gradient_step=7200, len=132, n/ep=1, n/st=100, rew=6923.00]                                                                                 


Epoch #72: test_reward: 7827.800000 ± 2598.334151, best_reward: 19121.200000 ± 7333.501767 in #65


Epoch #73: 1001it [00:02, 462.00it/s, env_step=73000, gradient_step=7300, len=186, n/ep=0, n/st=100, rew=10946.00]                                                                                


Epoch #73: test_reward: 11923.300000 ± 4459.861075, best_reward: 19121.200000 ± 7333.501767 in #65


Epoch #74: 1001it [00:02, 391.39it/s, env_step=74000, gradient_step=7400, len=60, n/ep=0, n/st=100, rew=2408.00]                                                                                  


Epoch #74: test_reward: 16765.400000 ± 6149.919482, best_reward: 19121.200000 ± 7333.501767 in #65


Epoch #75: 1001it [00:02, 427.48it/s, env_step=75000, gradient_step=7500, len=128, n/ep=1, n/st=100, rew=6387.00]                                                                                 


Epoch #75: test_reward: 14999.000000 ± 7177.519084, best_reward: 19121.200000 ± 7333.501767 in #65


Epoch #76: 1001it [00:02, 439.81it/s, env_step=76000, gradient_step=7600, len=107, n/ep=1, n/st=100, rew=6257.00]                                                                                 


Epoch #76: test_reward: 12829.000000 ± 3691.298660, best_reward: 19121.200000 ± 7333.501767 in #65


Epoch #77: 1001it [00:02, 409.31it/s, env_step=77000, gradient_step=7700, len=158, n/ep=0, n/st=100, rew=9364.00]                                                                                 


Epoch #77: test_reward: 11035.200000 ± 2746.481269, best_reward: 19121.200000 ± 7333.501767 in #65


Epoch #78: 1001it [00:02, 382.87it/s, env_step=78000, gradient_step=7800, len=181, n/ep=2, n/st=100, rew=10084.00]                                                                                


Epoch #78: test_reward: 10853.300000 ± 2522.645280, best_reward: 19121.200000 ± 7333.501767 in #65


Epoch #79: 1001it [00:02, 429.14it/s, env_step=79000, gradient_step=7900, len=309, n/ep=2, n/st=100, rew=19204.75]                                                                                


Epoch #79: test_reward: 10309.000000 ± 4856.456836, best_reward: 19121.200000 ± 7333.501767 in #65


Epoch #80: 1001it [00:02, 385.96it/s, env_step=80000, gradient_step=8000, len=154, n/ep=0, n/st=100, rew=9027.00]                                                                                 


Epoch #80: test_reward: 9599.100000 ± 4672.202488, best_reward: 19121.200000 ± 7333.501767 in #65


Epoch #81: 1001it [00:02, 456.86it/s, env_step=81000, gradient_step=8100, len=174, n/ep=0, n/st=100, rew=10296.00]                                                                                


Epoch #81: test_reward: 14038.000000 ± 3158.742376, best_reward: 19121.200000 ± 7333.501767 in #65


Epoch #82: 1001it [00:02, 457.44it/s, env_step=82000, gradient_step=8200, len=148, n/ep=0, n/st=100, rew=7975.50]                                                                                 


Epoch #82: test_reward: 13916.200000 ± 6253.839106, best_reward: 19121.200000 ± 7333.501767 in #65


Epoch #83: 1001it [00:02, 354.74it/s, env_step=83000, gradient_step=8300, len=105, n/ep=0, n/st=100, rew=5876.00]                                                                                 


Epoch #83: test_reward: 12599.900000 ± 3299.232288, best_reward: 19121.200000 ± 7333.501767 in #65


Epoch #84: 1001it [00:02, 401.96it/s, env_step=84000, gradient_step=8400, len=246, n/ep=1, n/st=100, rew=14515.50]                                                                                


Epoch #84: test_reward: 16512.000000 ± 4882.088242, best_reward: 19121.200000 ± 7333.501767 in #65


Epoch #85: 1001it [00:02, 398.34it/s, env_step=85000, gradient_step=8500, len=156, n/ep=0, n/st=100, rew=8663.00]                                                                                 


Epoch #85: test_reward: 13705.100000 ± 5864.960604, best_reward: 19121.200000 ± 7333.501767 in #65


Epoch #86: 1001it [00:02, 333.98it/s, env_step=86000, gradient_step=8600, len=156, n/ep=0, n/st=100, rew=8663.00]                                                                                 


Epoch #86: test_reward: 13255.800000 ± 5190.705362, best_reward: 19121.200000 ± 7333.501767 in #65


Epoch #87: 1001it [00:02, 423.20it/s, env_step=87000, gradient_step=8700, len=132, n/ep=0, n/st=100, rew=6796.00]                                                                                 


Epoch #87: test_reward: 10762.000000 ± 2992.146621, best_reward: 19121.200000 ± 7333.501767 in #65


Epoch #88: 1001it [00:02, 416.63it/s, env_step=88000, gradient_step=8800, len=117, n/ep=2, n/st=100, rew=5761.75]                                                                                 


Epoch #88: test_reward: 11440.400000 ± 5422.378633, best_reward: 19121.200000 ± 7333.501767 in #65


Epoch #89: 1001it [00:02, 400.44it/s, env_step=89000, gradient_step=8900, len=188, n/ep=0, n/st=100, rew=11230.00]                                                                                


Epoch #89: test_reward: 11855.900000 ± 2770.106296, best_reward: 19121.200000 ± 7333.501767 in #65


Epoch #90: 1001it [00:02, 412.46it/s, env_step=90000, gradient_step=9000, len=131, n/ep=0, n/st=100, rew=7444.00]                                                                                 


Epoch #90: test_reward: 9804.300000 ± 4456.431399, best_reward: 19121.200000 ± 7333.501767 in #65


Epoch #91: 1001it [00:03, 328.30it/s, env_step=91000, gradient_step=9100, len=288, n/ep=0, n/st=100, rew=17623.00]                                                                                


Epoch #91: test_reward: 15823.700000 ± 5268.564094, best_reward: 19121.200000 ± 7333.501767 in #65


Epoch #92: 1001it [00:02, 386.81it/s, env_step=92000, gradient_step=9200, len=176, n/ep=0, n/st=100, rew=10040.00]                                                                                


Epoch #92: test_reward: 12771.200000 ± 4953.447543, best_reward: 19121.200000 ± 7333.501767 in #65


Epoch #93: 1001it [00:02, 382.52it/s, env_step=93000, gradient_step=9300, len=194, n/ep=0, n/st=100, rew=11147.00]                                                                                


Epoch #93: test_reward: 8903.100000 ± 2642.592532, best_reward: 19121.200000 ± 7333.501767 in #65


Epoch #94: 1001it [00:02, 355.08it/s, env_step=94000, gradient_step=9400, len=153, n/ep=0, n/st=100, rew=9236.00]                                                                                 


Epoch #94: test_reward: 9581.100000 ± 2672.992011, best_reward: 19121.200000 ± 7333.501767 in #65


Epoch #95: 1001it [00:02, 390.68it/s, env_step=95000, gradient_step=9500, len=164, n/ep=1, n/st=100, rew=9390.00]                                                                                 


Epoch #95: test_reward: 14993.600000 ± 5230.432625, best_reward: 19121.200000 ± 7333.501767 in #65


Epoch #96: 1001it [00:02, 341.91it/s, env_step=96000, gradient_step=9600, len=260, n/ep=2, n/st=100, rew=15351.00]                                                                                


Epoch #96: test_reward: 15549.400000 ± 4128.100585, best_reward: 19121.200000 ± 7333.501767 in #65


Epoch #97: 1001it [00:02, 387.72it/s, env_step=97000, gradient_step=9700, len=131, n/ep=0, n/st=100, rew=8017.00]                                                                                 


Epoch #97: test_reward: 12445.600000 ± 3918.003961, best_reward: 19121.200000 ± 7333.501767 in #65


Epoch #98: 1001it [00:02, 403.77it/s, env_step=98000, gradient_step=9800, len=110, n/ep=0, n/st=100, rew=5680.00]                                                                                 


Epoch #98: test_reward: 12156.800000 ± 4047.865605, best_reward: 19121.200000 ± 7333.501767 in #65


Epoch #99: 1001it [00:02, 446.76it/s, env_step=99000, gradient_step=9900, len=171, n/ep=3, n/st=100, rew=10191.83]                                                                                


Epoch #99: test_reward: 12796.100000 ± 5680.744677, best_reward: 19121.200000 ± 7333.501767 in #65


Epoch #100: 1001it [00:02, 398.42it/s, env_step=100000, gradient_step=10000, len=182, n/ep=1, n/st=100, rew=10303.00]                                                                             


Epoch #100: test_reward: 9323.100000 ± 4204.830994, best_reward: 19121.200000 ± 7333.501767 in #65

InfoStats(gradient_step=10000, best_reward=19121.2, best_reward_std=7333.5017665505475, train_step=100000, train_episode=553, test_step=184646, test_episode=1010, timing=TimingStats(total_time=349.9337532520294, train_time=247.60770726203918, train_time_collect=33.59860134124756, train_time_update=210.02408123016357, test_time=102.32604598999023, update_speed=403.8646498760704))

(the trained policy can be accessed via policy.policies[agents[1]])
Train with agent_opponent #13


Epoch #1: 1001it [00:02, 361.30it/s, env_step=1000, gradient_step=100, len=0, n/ep=0, n/st=100, rew=0.00]                                                                                         


Epoch #1: test_reward: 10703.100000 ± 6467.586110, best_reward: 11393.200000 ± 6876.357943 in #0


Epoch #2: 1001it [00:02, 391.56it/s, env_step=2000, gradient_step=200, len=0, n/ep=0, n/st=100, rew=0.00]                                                                                         


Epoch #2: test_reward: 11824.600000 ± 6976.544076, best_reward: 11824.600000 ± 6976.544076 in #2


Epoch #3: 1001it [00:02, 334.38it/s, env_step=3000, gradient_step=300, len=0, n/ep=0, n/st=100, rew=0.00]                                                                                         


Epoch #3: test_reward: 12815.800000 ± 7397.816729, best_reward: 12815.800000 ± 7397.816729 in #3


Epoch #4: 1001it [00:02, 426.13it/s, env_step=4000, gradient_step=400, len=0, n/ep=0, n/st=100, rew=0.00]                                                                                         


Epoch #4: test_reward: 18627.400000 ± 4962.370829, best_reward: 18627.400000 ± 4962.370829 in #4


Epoch #5: 1001it [00:02, 437.45it/s, env_step=5000, gradient_step=500, len=0, n/ep=0, n/st=100, rew=0.00]                                                                                         


Epoch #5: test_reward: 18159.600000 ± 5674.783224, best_reward: 18627.400000 ± 4962.370829 in #4


Epoch #6: 1001it [00:02, 368.42it/s, env_step=6000, gradient_step=600, len=0, n/ep=0, n/st=100, rew=0.00]                                                                                         


Epoch #6: test_reward: 8055.700000 ± 7513.379999, best_reward: 18627.400000 ± 4962.370829 in #4


Epoch #7: 1001it [00:02, 441.29it/s, env_step=7000, gradient_step=700, len=0, n/ep=0, n/st=100, rew=0.00]                                                                                         


Epoch #7: test_reward: 14025.500000 ± 5404.421990, best_reward: 18627.400000 ± 4962.370829 in #4


Epoch #8: 1001it [00:03, 324.12it/s, env_step=8000, gradient_step=800, len=0, n/ep=0, n/st=100, rew=0.00]                                                                                         


Epoch #8: test_reward: 13525.700000 ± 7930.278054, best_reward: 18627.400000 ± 4962.370829 in #4


Epoch #9: 1001it [00:03, 287.14it/s, env_step=9000, gradient_step=900, len=0, n/ep=0, n/st=100, rew=0.00]                                                                                         


Epoch #9: test_reward: 16347.400000 ± 6780.040062, best_reward: 18627.400000 ± 4962.370829 in #4


Epoch #10: 1001it [00:03, 325.08it/s, env_step=10000, gradient_step=1000, len=0, n/ep=0, n/st=100, rew=0.00]                                                                                      


Epoch #10: test_reward: 7600.300000 ± 4141.573422, best_reward: 18627.400000 ± 4962.370829 in #4


Epoch #11: 1001it [00:03, 323.36it/s, env_step=11000, gradient_step=1100, len=102, n/ep=0, n/st=100, rew=3186.00]                                                                                 


Epoch #11: test_reward: 10551.300000 ± 7970.304035, best_reward: 18627.400000 ± 4962.370829 in #4


Epoch #12: 1001it [00:03, 325.34it/s, env_step=12000, gradient_step=1200, len=111, n/ep=0, n/st=100, rew=4998.00]                                                                                 


Epoch #12: test_reward: 7808.900000 ± 2213.834251, best_reward: 18627.400000 ± 4962.370829 in #4


Epoch #13: 1001it [00:02, 411.49it/s, env_step=13000, gradient_step=1300, len=122, n/ep=0, n/st=100, rew=5903.00]                                                                                 


Epoch #13: test_reward: 9791.700000 ± 2107.993266, best_reward: 18627.400000 ± 4962.370829 in #4


Epoch #14: 1001it [00:03, 331.95it/s, env_step=14000, gradient_step=1400, len=136, n/ep=0, n/st=100, rew=5889.00]                                                                                 


Epoch #14: test_reward: 9175.700000 ± 4015.938895, best_reward: 18627.400000 ± 4962.370829 in #4


Epoch #15: 1001it [00:02, 409.83it/s, env_step=15000, gradient_step=1500, len=148, n/ep=0, n/st=100, rew=7970.50]                                                                                 


Epoch #15: test_reward: 13711.500000 ± 7353.102002, best_reward: 18627.400000 ± 4962.370829 in #4


Epoch #16: 1001it [00:02, 377.23it/s, env_step=16000, gradient_step=1600, len=157, n/ep=0, n/st=100, rew=7014.50]                                                                                 


Epoch #16: test_reward: 8710.800000 ± 6098.455096, best_reward: 18627.400000 ± 4962.370829 in #4


Epoch #17: 1001it [00:02, 412.24it/s, env_step=17000, gradient_step=1700, len=165, n/ep=0, n/st=100, rew=8461.00]                                                                                 


Epoch #17: test_reward: 12627.500000 ± 8297.060748, best_reward: 18627.400000 ± 4962.370829 in #4


Epoch #18: 1001it [00:02, 355.42it/s, env_step=18000, gradient_step=1800, len=180, n/ep=2, n/st=100, rew=9598.75]                                                                                 


Epoch #18: test_reward: 9544.400000 ± 4926.945082, best_reward: 18627.400000 ± 4962.370829 in #4


Epoch #19: 1001it [00:02, 359.24it/s, env_step=19000, gradient_step=1900, len=190, n/ep=1, n/st=100, rew=10633.00]                                                                                


Epoch #19: test_reward: 9444.800000 ± 6836.513699, best_reward: 18627.400000 ± 4962.370829 in #4


Epoch #20: 1001it [00:03, 333.22it/s, env_step=20000, gradient_step=2000, len=196, n/ep=0, n/st=100, rew=9603.83]                                                                                 


Epoch #20: test_reward: 12574.500000 ± 7881.304108, best_reward: 18627.400000 ± 4962.370829 in #4


Epoch #21: 1001it [00:03, 319.28it/s, env_step=21000, gradient_step=2100, len=210, n/ep=1, n/st=100, rew=10504.00]                                                                                


Epoch #21: test_reward: 13162.000000 ± 7624.588592, best_reward: 18627.400000 ± 4962.370829 in #4


Epoch #22: 1001it [00:02, 356.00it/s, env_step=22000, gradient_step=2200, len=219, n/ep=0, n/st=100, rew=12819.50]                                                                                


Epoch #22: test_reward: 11511.900000 ± 8146.268231, best_reward: 18627.400000 ± 4962.370829 in #4


Epoch #23: 1001it [00:02, 384.90it/s, env_step=23000, gradient_step=2300, len=46, n/ep=1, n/st=100, rew=2005.00]                                                                                  


Epoch #23: test_reward: 8315.900000 ± 2471.839900, best_reward: 18627.400000 ± 4962.370829 in #4


Epoch #24: 1001it [00:03, 327.14it/s, env_step=24000, gradient_step=2400, len=38, n/ep=1, n/st=100, rew=1404.00]                                                                                  


Epoch #24: test_reward: 11999.800000 ± 6702.196443, best_reward: 18627.400000 ± 4962.370829 in #4


Epoch #25: 1001it [00:02, 333.99it/s, env_step=25000, gradient_step=2500, len=250, n/ep=1, n/st=100, rew=14513.00]                                                                                


Epoch #25: test_reward: 6363.800000 ± 1951.529288, best_reward: 18627.400000 ± 4962.370829 in #4


Epoch #26: 1001it [00:02, 430.83it/s, env_step=26000, gradient_step=2600, len=260, n/ep=2, n/st=100, rew=15078.00]                                                                                


Epoch #26: test_reward: 8035.800000 ± 6341.444533, best_reward: 18627.400000 ± 4962.370829 in #4


Epoch #27: 1001it [00:02, 420.19it/s, env_step=27000, gradient_step=2700, len=269, n/ep=0, n/st=100, rew=15647.50]                                                                                


Epoch #27: test_reward: 9425.700000 ± 5946.430712, best_reward: 18627.400000 ± 4962.370829 in #4


Epoch #28: 1001it [00:02, 401.68it/s, env_step=28000, gradient_step=2800, len=72, n/ep=1, n/st=100, rew=3243.00]                                                                                  


Epoch #28: test_reward: 9246.100000 ± 3815.409138, best_reward: 18627.400000 ± 4962.370829 in #4


Epoch #29: 1001it [00:02, 337.92it/s, env_step=29000, gradient_step=2900, len=290, n/ep=1, n/st=100, rew=16180.00]                                                                                


Epoch #29: test_reward: 15010.400000 ± 9091.248882, best_reward: 18627.400000 ± 4962.370829 in #4


Epoch #30: 1001it [00:02, 427.55it/s, env_step=30000, gradient_step=3000, len=299, n/ep=0, n/st=100, rew=16230.00]                                                                                


Epoch #30: test_reward: 8982.800000 ± 7578.492407, best_reward: 18627.400000 ± 4962.370829 in #4


Epoch #31: 1001it [00:02, 360.09it/s, env_step=31000, gradient_step=3100, len=217, n/ep=2, n/st=100, rew=12264.50]                                                                                


Epoch #31: test_reward: 6860.300000 ± 2165.628502, best_reward: 18627.400000 ± 4962.370829 in #4


Epoch #32: 1001it [00:02, 409.43it/s, env_step=32000, gradient_step=3200, len=84, n/ep=0, n/st=100, rew=3987.00]                                                                                  


Epoch #32: test_reward: 6985.400000 ± 7280.358401, best_reward: 18627.400000 ± 4962.370829 in #4


Epoch #33: 1001it [00:02, 412.17it/s, env_step=33000, gradient_step=3300, len=56, n/ep=1, n/st=100, rew=1843.00]                                                                                  


Epoch #33: test_reward: 9037.600000 ± 6852.174213, best_reward: 18627.400000 ± 4962.370829 in #4


Epoch #34: 1001it [00:02, 370.35it/s, env_step=34000, gradient_step=3400, len=102, n/ep=0, n/st=100, rew=4494.00]                                                                                 


Epoch #34: test_reward: 6756.200000 ± 2945.465865, best_reward: 18627.400000 ± 4962.370829 in #4


Epoch #35: 1001it [00:02, 347.16it/s, env_step=35000, gradient_step=3500, len=176, n/ep=2, n/st=100, rew=9956.50]                                                                                 


Epoch #35: test_reward: 9348.600000 ± 6772.217675, best_reward: 18627.400000 ± 4962.370829 in #4


Epoch #36: 1001it [00:02, 389.69it/s, env_step=36000, gradient_step=3600, len=183, n/ep=0, n/st=100, rew=11395.00]                                                                                


Epoch #36: test_reward: 5152.200000 ± 1099.027006, best_reward: 18627.400000 ± 4962.370829 in #4


Epoch #37: 1001it [00:02, 427.22it/s, env_step=37000, gradient_step=3700, len=52, n/ep=0, n/st=100, rew=2421.00]                                                                                  


Epoch #37: test_reward: 8229.800000 ± 4220.612984, best_reward: 18627.400000 ± 4962.370829 in #4


Epoch #38: 1001it [00:02, 416.06it/s, env_step=38000, gradient_step=3800, len=84, n/ep=0, n/st=100, rew=3411.00]                                                                                  


Epoch #38: test_reward: 13581.200000 ± 4422.062342, best_reward: 18627.400000 ± 4962.370829 in #4


Epoch #39: 1001it [00:02, 373.94it/s, env_step=39000, gradient_step=3900, len=130, n/ep=1, n/st=100, rew=6568.00]                                                                                 


Epoch #39: test_reward: 12463.700000 ± 7955.585410, best_reward: 18627.400000 ± 4962.370829 in #4


Epoch #40: 1001it [00:02, 359.63it/s, env_step=40000, gradient_step=4000, len=364, n/ep=5, n/st=100, rew=22281.70]                                                                                


Epoch #40: test_reward: 16460.900000 ± 8658.109291, best_reward: 18627.400000 ± 4962.370829 in #4


Epoch #41: 1001it [00:02, 399.37it/s, env_step=41000, gradient_step=4100, len=160, n/ep=1, n/st=100, rew=10453.00]                                                                                


Epoch #41: test_reward: 15605.400000 ± 8023.673077, best_reward: 18627.400000 ± 4962.370829 in #4


Epoch #42: 1001it [00:02, 365.71it/s, env_step=42000, gradient_step=4200, len=72, n/ep=1, n/st=100, rew=3262.00]                                                                                  


Epoch #42: test_reward: 7738.800000 ± 5964.928378, best_reward: 18627.400000 ± 4962.370829 in #4


Epoch #43: 1001it [00:02, 391.04it/s, env_step=43000, gradient_step=4300, len=188, n/ep=1, n/st=100, rew=11027.00]                                                                                


Epoch #43: test_reward: 10897.200000 ± 2883.519683, best_reward: 18627.400000 ± 4962.370829 in #4


Epoch #44: 1001it [00:02, 423.39it/s, env_step=44000, gradient_step=4400, len=187, n/ep=2, n/st=100, rew=10422.75]                                                                                


Epoch #44: test_reward: 9778.800000 ± 7097.595266, best_reward: 18627.400000 ± 4962.370829 in #4


Epoch #45: 1001it [00:03, 322.24it/s, env_step=45000, gradient_step=4500, len=114, n/ep=0, n/st=100, rew=5842.17]                                                                                 


Epoch #45: test_reward: 14456.300000 ± 3611.973755, best_reward: 18627.400000 ± 4962.370829 in #4


Epoch #46: 1001it [00:02, 383.79it/s, env_step=46000, gradient_step=4600, len=240, n/ep=0, n/st=100, rew=14737.50]                                                                                


Epoch #46: test_reward: 8416.900000 ± 1272.956987, best_reward: 18627.400000 ± 4962.370829 in #4


Epoch #47: 1001it [00:02, 408.29it/s, env_step=47000, gradient_step=4700, len=130, n/ep=3, n/st=100, rew=7180.00]                                                                                 


Epoch #47: test_reward: 16642.000000 ± 7092.681820, best_reward: 18627.400000 ± 4962.370829 in #4


Epoch #48: 1001it [00:02, 396.70it/s, env_step=48000, gradient_step=4800, len=215, n/ep=1, n/st=100, rew=13971.50]                                                                                


Epoch #48: test_reward: 9020.000000 ± 1579.177254, best_reward: 18627.400000 ± 4962.370829 in #4


Epoch #49: 1001it [00:02, 362.92it/s, env_step=49000, gradient_step=4900, len=111, n/ep=0, n/st=100, rew=6082.25]                                                                                 


Epoch #49: test_reward: 9566.600000 ± 2505.112540, best_reward: 18627.400000 ± 4962.370829 in #4


Epoch #50: 1001it [00:02, 417.78it/s, env_step=50000, gradient_step=5000, len=166, n/ep=0, n/st=100, rew=9551.50]                                                                                 


Epoch #50: test_reward: 16658.900000 ± 7127.698723, best_reward: 18627.400000 ± 4962.370829 in #4


Epoch #51: 1001it [00:02, 383.00it/s, env_step=51000, gradient_step=5100, len=86, n/ep=2, n/st=100, rew=4467.50]                                                                                  


Epoch #51: test_reward: 10697.800000 ± 5066.796578, best_reward: 18627.400000 ± 4962.370829 in #4


Epoch #52: 1001it [00:03, 315.50it/s, env_step=52000, gradient_step=5200, len=238, n/ep=0, n/st=100, rew=14923.00]                                                                                


Epoch #52: test_reward: 11811.700000 ± 3326.673265, best_reward: 18627.400000 ± 4962.370829 in #4


Epoch #53: 1001it [00:02, 379.85it/s, env_step=53000, gradient_step=5300, len=143, n/ep=1, n/st=100, rew=8665.00]                                                                                 


Epoch #53: test_reward: 8220.600000 ± 4442.284282, best_reward: 18627.400000 ± 4962.370829 in #4


Epoch #54: 1001it [00:02, 389.31it/s, env_step=54000, gradient_step=5400, len=163, n/ep=2, n/st=100, rew=9960.00]                                                                                 


Epoch #54: test_reward: 6105.400000 ± 7272.853693, best_reward: 18627.400000 ± 4962.370829 in #4


Epoch #55: 1001it [00:02, 365.89it/s, env_step=55000, gradient_step=5500, len=314, n/ep=0, n/st=100, rew=19661.00]                                                                                


Epoch #55: test_reward: 16379.600000 ± 3563.925370, best_reward: 18627.400000 ± 4962.370829 in #4


Epoch #56: 1001it [00:02, 354.47it/s, env_step=56000, gradient_step=5600, len=50, n/ep=1, n/st=100, rew=2256.00]                                                                                  


Epoch #56: test_reward: 11299.900000 ± 4873.636844, best_reward: 18627.400000 ± 4962.370829 in #4


Epoch #57: 1001it [00:02, 397.85it/s, env_step=57000, gradient_step=5700, len=92, n/ep=0, n/st=100, rew=4195.00]                                                                                  


Epoch #57: test_reward: 16354.000000 ± 6490.906193, best_reward: 18627.400000 ± 4962.370829 in #4


Epoch #58: 1001it [00:02, 366.99it/s, env_step=58000, gradient_step=5800, len=110, n/ep=1, n/st=100, rew=6388.00]                                                                                 


Epoch #58: test_reward: 9935.700000 ± 3333.348468, best_reward: 18627.400000 ± 4962.370829 in #4


Epoch #59: 1001it [00:02, 402.67it/s, env_step=59000, gradient_step=5900, len=64, n/ep=1, n/st=100, rew=3023.00]                                                                                  


Epoch #59: test_reward: 11064.000000 ± 5581.059899, best_reward: 18627.400000 ± 4962.370829 in #4


Epoch #60: 1001it [00:02, 390.12it/s, env_step=60000, gradient_step=6000, len=153, n/ep=2, n/st=100, rew=8836.75]                                                                                 


Epoch #60: test_reward: 14057.900000 ± 3521.988173, best_reward: 18627.400000 ± 4962.370829 in #4


Epoch #61: 1001it [00:02, 346.17it/s, env_step=61000, gradient_step=6100, len=164, n/ep=0, n/st=100, rew=10362.25]                                                                                


Epoch #61: test_reward: 13254.900000 ± 6021.520779, best_reward: 18627.400000 ± 4962.370829 in #4


Epoch #62: 1001it [00:02, 354.20it/s, env_step=62000, gradient_step=6200, len=240, n/ep=0, n/st=100, rew=15306.50]                                                                                


Epoch #62: test_reward: 11350.000000 ± 3833.792639, best_reward: 18627.400000 ± 4962.370829 in #4


Epoch #63: 1001it [00:03, 331.90it/s, env_step=63000, gradient_step=6300, len=153, n/ep=2, n/st=100, rew=9101.00]                                                                                 


Epoch #63: test_reward: 9008.700000 ± 2520.088651, best_reward: 18627.400000 ± 4962.370829 in #4


Epoch #64: 1001it [00:03, 327.85it/s, env_step=64000, gradient_step=6400, len=128, n/ep=0, n/st=100, rew=7254.00]                                                                                 


Epoch #64: test_reward: 10067.700000 ± 4949.016348, best_reward: 18627.400000 ± 4962.370829 in #4


Epoch #65: 1001it [00:02, 372.55it/s, env_step=65000, gradient_step=6500, len=247, n/ep=1, n/st=100, rew=16264.50]                                                                                


Epoch #65: test_reward: 10402.600000 ± 6711.146135, best_reward: 18627.400000 ± 4962.370829 in #4


Epoch #66: 1001it [00:02, 350.01it/s, env_step=66000, gradient_step=6600, len=102, n/ep=1, n/st=100, rew=5568.00]                                                                                 


Epoch #66: test_reward: 17250.300000 ± 6305.119286, best_reward: 18627.400000 ± 4962.370829 in #4


Epoch #67: 1001it [00:02, 365.32it/s, env_step=67000, gradient_step=6700, len=262, n/ep=1, n/st=100, rew=17641.00]                                                                                


Epoch #67: test_reward: 10288.900000 ± 4546.275299, best_reward: 18627.400000 ± 4962.370829 in #4


Epoch #68: 1001it [00:02, 367.21it/s, env_step=68000, gradient_step=6800, len=228, n/ep=0, n/st=100, rew=14314.00]                                                                                


Epoch #68: test_reward: 10502.600000 ± 2934.659237, best_reward: 18627.400000 ± 4962.370829 in #4


Epoch #69: 1001it [00:02, 355.76it/s, env_step=69000, gradient_step=6900, len=94, n/ep=0, n/st=100, rew=4615.00]                                                                                  


Epoch #69: test_reward: 7115.600000 ± 4474.243874, best_reward: 18627.400000 ± 4962.370829 in #4


Epoch #70: 1001it [00:02, 394.12it/s, env_step=70000, gradient_step=7000, len=120, n/ep=1, n/st=100, rew=6920.50]                                                                                 


Epoch #70: test_reward: 15262.800000 ± 5433.945837, best_reward: 18627.400000 ± 4962.370829 in #4


Epoch #71: 1001it [00:02, 348.93it/s, env_step=71000, gradient_step=7100, len=138, n/ep=1, n/st=100, rew=7896.00]                                                                                 


Epoch #71: test_reward: 10734.000000 ± 3119.026002, best_reward: 18627.400000 ± 4962.370829 in #4


Epoch #72: 1001it [00:02, 380.27it/s, env_step=72000, gradient_step=7200, len=176, n/ep=2, n/st=100, rew=10987.75]                                                                                


Epoch #72: test_reward: 9020.800000 ± 4165.535759, best_reward: 18627.400000 ± 4962.370829 in #4


Epoch #73: 1001it [00:02, 335.45it/s, env_step=73000, gradient_step=7300, len=134, n/ep=0, n/st=100, rew=7958.75]                                                                                 


Epoch #73: test_reward: 13659.400000 ± 6576.690843, best_reward: 18627.400000 ± 4962.370829 in #4


Epoch #74: 1001it [00:02, 416.58it/s, env_step=74000, gradient_step=7400, len=95, n/ep=0, n/st=100, rew=4726.00]                                                                                  


Epoch #74: test_reward: 14471.600000 ± 5979.274123, best_reward: 18627.400000 ± 4962.370829 in #4


Epoch #75: 1001it [00:02, 411.10it/s, env_step=75000, gradient_step=7500, len=109, n/ep=1, n/st=100, rew=5189.50]                                                                                 


Epoch #75: test_reward: 12350.000000 ± 8691.499479, best_reward: 18627.400000 ± 4962.370829 in #4


Epoch #76: 1001it [00:02, 380.33it/s, env_step=76000, gradient_step=7600, len=101, n/ep=0, n/st=100, rew=5161.00]                                                                                 


Epoch #76: test_reward: 13455.100000 ± 5813.645921, best_reward: 18627.400000 ± 4962.370829 in #4


Epoch #77: 1001it [00:02, 392.61it/s, env_step=77000, gradient_step=7700, len=166, n/ep=0, n/st=100, rew=9953.75]                                                                                 


Epoch #77: test_reward: 5598.100000 ± 6356.865713, best_reward: 18627.400000 ± 4962.370829 in #4


Epoch #78: 1001it [00:03, 333.62it/s, env_step=78000, gradient_step=7800, len=125, n/ep=0, n/st=100, rew=7195.50]                                                                                 


Epoch #78: test_reward: 8962.400000 ± 3876.933200, best_reward: 18627.400000 ± 4962.370829 in #4


Epoch #79: 1001it [00:02, 366.92it/s, env_step=79000, gradient_step=7900, len=159, n/ep=1, n/st=100, rew=8675.50]                                                                                 


Epoch #79: test_reward: 8891.000000 ± 3766.817675, best_reward: 18627.400000 ± 4962.370829 in #4


Epoch #80: 1001it [00:02, 338.36it/s, env_step=80000, gradient_step=8000, len=210, n/ep=1, n/st=100, rew=13805.00]                                                                                


Epoch #80: test_reward: 9822.600000 ± 2718.948333, best_reward: 18627.400000 ± 4962.370829 in #4


Epoch #81: 1001it [00:02, 366.30it/s, env_step=81000, gradient_step=8100, len=267, n/ep=0, n/st=100, rew=17076.00]                                                                                


Epoch #81: test_reward: 10801.200000 ± 4155.179438, best_reward: 18627.400000 ± 4962.370829 in #4


Epoch #82: 1001it [00:02, 422.32it/s, env_step=82000, gradient_step=8200, len=170, n/ep=1, n/st=100, rew=11046.00]                                                                                


Epoch #82: test_reward: 8411.600000 ± 1855.639362, best_reward: 18627.400000 ± 4962.370829 in #4


Epoch #83: 1001it [00:02, 402.81it/s, env_step=83000, gradient_step=8300, len=400, n/ep=0, n/st=100, rew=26450.00]                                                                                


Epoch #83: test_reward: 9926.000000 ± 6592.675633, best_reward: 18627.400000 ± 4962.370829 in #4


Epoch #84: 1001it [00:02, 369.40it/s, env_step=84000, gradient_step=8400, len=400, n/ep=1, n/st=100, rew=28039.00]                                                                                


Epoch #84: test_reward: 5931.300000 ± 3351.015847, best_reward: 18627.400000 ± 4962.370829 in #4


Epoch #85: 1001it [00:02, 410.43it/s, env_step=85000, gradient_step=8500, len=176, n/ep=2, n/st=100, rew=11293.50]                                                                                


Epoch #85: test_reward: 11441.200000 ± 5461.459893, best_reward: 18627.400000 ± 4962.370829 in #4


Epoch #86: 1001it [00:03, 330.28it/s, env_step=86000, gradient_step=8600, len=128, n/ep=2, n/st=100, rew=7182.75]                                                                                 


Epoch #86: test_reward: 11567.600000 ± 6921.712436, best_reward: 18627.400000 ± 4962.370829 in #4


Epoch #87: 1001it [00:02, 365.05it/s, env_step=87000, gradient_step=8700, len=86, n/ep=0, n/st=100, rew=4147.00]                                                                                  


Epoch #87: test_reward: 11494.000000 ± 5022.147509, best_reward: 18627.400000 ± 4962.370829 in #4


Epoch #88: 1001it [00:02, 381.73it/s, env_step=88000, gradient_step=8800, len=92, n/ep=1, n/st=100, rew=4845.00]                                                                                  


Epoch #88: test_reward: 11657.600000 ± 3376.412866, best_reward: 18627.400000 ± 4962.370829 in #4


Epoch #89: 1001it [00:02, 428.62it/s, env_step=89000, gradient_step=8900, len=120, n/ep=0, n/st=100, rew=6452.33]                                                                                 


Epoch #89: test_reward: 15374.000000 ± 6898.562227, best_reward: 18627.400000 ± 4962.370829 in #4


Epoch #90: 1001it [00:02, 391.89it/s, env_step=90000, gradient_step=9000, len=78, n/ep=0, n/st=100, rew=4326.17]                                                                                  


Epoch #90: test_reward: 9690.300000 ± 3489.924069, best_reward: 18627.400000 ± 4962.370829 in #4


Epoch #91: 1001it [00:02, 335.45it/s, env_step=91000, gradient_step=9100, len=131, n/ep=0, n/st=100, rew=8081.00]                                                                                 


Epoch #91: test_reward: 11457.800000 ± 3757.119476, best_reward: 18627.400000 ± 4962.370829 in #4


Epoch #92: 1001it [00:02, 365.34it/s, env_step=92000, gradient_step=9200, len=289, n/ep=0, n/st=100, rew=19864.50]                                                                                


Epoch #92: test_reward: 13016.600000 ± 5887.134943, best_reward: 18627.400000 ± 4962.370829 in #4


Epoch #93: 1001it [00:02, 377.14it/s, env_step=93000, gradient_step=9300, len=170, n/ep=0, n/st=100, rew=9703.50]                                                                                 


Epoch #93: test_reward: 7370.400000 ± 2043.838702, best_reward: 18627.400000 ± 4962.370829 in #4


Epoch #94: 1001it [00:03, 322.16it/s, env_step=94000, gradient_step=9400, len=118, n/ep=0, n/st=100, rew=6630.00]                                                                                 


Epoch #94: test_reward: 7818.000000 ± 1649.285421, best_reward: 18627.400000 ± 4962.370829 in #4


Epoch #95: 1001it [00:02, 389.55it/s, env_step=95000, gradient_step=9500, len=125, n/ep=1, n/st=100, rew=7639.00]                                                                                 


Epoch #95: test_reward: 10108.200000 ± 2498.813390, best_reward: 18627.400000 ± 4962.370829 in #4


Epoch #96: 1001it [00:02, 367.19it/s, env_step=96000, gradient_step=9600, len=98, n/ep=0, n/st=100, rew=4832.00]                                                                                  


Epoch #96: test_reward: 12938.600000 ± 4476.189723, best_reward: 18627.400000 ± 4962.370829 in #4


Epoch #97: 1001it [00:02, 385.01it/s, env_step=97000, gradient_step=9700, len=183, n/ep=1, n/st=100, rew=11812.50]                                                                                


Epoch #97: test_reward: 12968.000000 ± 5446.194414, best_reward: 18627.400000 ± 4962.370829 in #4


Epoch #98: 1001it [00:02, 403.21it/s, env_step=98000, gradient_step=9800, len=99, n/ep=0, n/st=100, rew=5822.50]                                                                                  


Epoch #98: test_reward: 11884.400000 ± 8218.014106, best_reward: 18627.400000 ± 4962.370829 in #4


Epoch #99: 1001it [00:02, 367.81it/s, env_step=99000, gradient_step=9900, len=144, n/ep=0, n/st=100, rew=8096.00]                                                                                 


Epoch #99: test_reward: 18770.000000 ± 9849.355756, best_reward: 18770.000000 ± 9849.355756 in #99


Epoch #100: 1001it [00:02, 351.29it/s, env_step=100000, gradient_step=10000, len=144, n/ep=1, n/st=100, rew=8535.00]                                                                              


Epoch #100: test_reward: 9119.200000 ± 3534.179418, best_reward: 18770.000000 ± 9849.355756 in #99

InfoStats(gradient_step=10000, best_reward=18770.0, best_reward_std=9849.355755581173, train_step=100000, train_episode=553, test_step=178795, test_episode=1010, timing=TimingStats(total_time=377.69640040397644, train_time=271.14004015922546, train_time_collect=34.626155853271484, train_time_update=232.34609365463257, test_time=106.55636024475098, update_speed=368.8131046276882))

(the trained policy can be accessed via policy.policies[agents[1]])
Train with agent_opponent #11


Epoch #1: 1001it [00:02, 437.76it/s, env_step=1000, gradient_step=100, len=0, n/ep=0, n/st=100, rew=0.00]                                                                                         


Epoch #1: test_reward: 17325.300000 ± 5136.084229, best_reward: 17325.300000 ± 5136.084229 in #1


Epoch #2: 1001it [00:02, 400.78it/s, env_step=2000, gradient_step=200, len=0, n/ep=0, n/st=100, rew=0.00]                                                                                         


Epoch #2: test_reward: 14573.200000 ± 6100.067357, best_reward: 17325.300000 ± 5136.084229 in #1


Epoch #3: 1001it [00:02, 419.69it/s, env_step=3000, gradient_step=300, len=0, n/ep=0, n/st=100, rew=0.00]                                                                                         


Epoch #3: test_reward: 10494.300000 ± 3777.756214, best_reward: 17325.300000 ± 5136.084229 in #1


Epoch #4: 1001it [00:02, 418.82it/s, env_step=4000, gradient_step=400, len=0, n/ep=0, n/st=100, rew=0.00]                                                                                         


Epoch #4: test_reward: 12757.800000 ± 3439.797372, best_reward: 17325.300000 ± 5136.084229 in #1


Epoch #5: 1001it [00:02, 407.11it/s, env_step=5000, gradient_step=500, len=50, n/ep=1, n/st=100, rew=1717.00]                                                                                     


Epoch #5: test_reward: 12611.100000 ± 6005.303480, best_reward: 17325.300000 ± 5136.084229 in #1


Epoch #6: 1001it [00:02, 390.25it/s, env_step=6000, gradient_step=600, len=58, n/ep=0, n/st=100, rew=2149.00]                                                                                     


Epoch #6: test_reward: 13172.700000 ± 4895.788906, best_reward: 17325.300000 ± 5136.084229 in #1


Epoch #7: 1001it [00:02, 436.85it/s, env_step=7000, gradient_step=700, len=58, n/ep=0, n/st=100, rew=2149.00]                                                                                     


Epoch #7: test_reward: 9516.300000 ± 5266.686549, best_reward: 17325.300000 ± 5136.084229 in #1


Epoch #8: 1001it [00:02, 416.26it/s, env_step=8000, gradient_step=800, len=72, n/ep=0, n/st=100, rew=2980.00]                                                                                     


Epoch #8: test_reward: 13446.300000 ± 4547.928343, best_reward: 17325.300000 ± 5136.084229 in #1


Epoch #9: 1001it [00:02, 444.63it/s, env_step=9000, gradient_step=900, len=88, n/ep=0, n/st=100, rew=3474.00]                                                                                     


Epoch #9: test_reward: 12749.300000 ± 2337.143729, best_reward: 17325.300000 ± 5136.084229 in #1


Epoch #10: 1001it [00:02, 395.36it/s, env_step=10000, gradient_step=1000, len=100, n/ep=2, n/st=100, rew=4618.00]                                                                                 


Epoch #10: test_reward: 14419.200000 ± 6642.853721, best_reward: 17325.300000 ± 5136.084229 in #1


Epoch #11: 1001it [00:02, 363.52it/s, env_step=11000, gradient_step=1100, len=110, n/ep=1, n/st=100, rew=5772.50]                                                                                 


Epoch #11: test_reward: 13058.100000 ± 6583.704769, best_reward: 17325.300000 ± 5136.084229 in #1


Epoch #12: 1001it [00:03, 333.62it/s, env_step=12000, gradient_step=1200, len=118, n/ep=0, n/st=100, rew=4830.00]                                                                                 


Epoch #12: test_reward: 8229.800000 ± 3481.698373, best_reward: 17325.300000 ± 5136.084229 in #1


Epoch #13: 1001it [00:02, 376.17it/s, env_step=13000, gradient_step=1300, len=130, n/ep=3, n/st=100, rew=6418.33]                                                                                 


Epoch #13: test_reward: 9983.800000 ± 3943.193777, best_reward: 17325.300000 ± 5136.084229 in #1


Epoch #14: 1001it [00:02, 396.39it/s, env_step=14000, gradient_step=1400, len=140, n/ep=1, n/st=100, rew=7591.00]                                                                                 


Epoch #14: test_reward: 10941.800000 ± 4472.877324, best_reward: 17325.300000 ± 5136.084229 in #1


Epoch #15: 1001it [00:02, 374.55it/s, env_step=15000, gradient_step=1500, len=147, n/ep=0, n/st=100, rew=8371.00]                                                                                 


Epoch #15: test_reward: 5851.000000 ± 3808.435716, best_reward: 17325.300000 ± 5136.084229 in #1


Epoch #16: 1001it [00:02, 432.22it/s, env_step=16000, gradient_step=1600, len=122, n/ep=0, n/st=100, rew=5927.75]                                                                                 


Epoch #16: test_reward: 14089.000000 ± 6672.580325, best_reward: 17325.300000 ± 5136.084229 in #1


Epoch #17: 1001it [00:02, 426.43it/s, env_step=17000, gradient_step=1700, len=84, n/ep=1, n/st=100, rew=3282.00]                                                                                  


Epoch #17: test_reward: 12483.100000 ± 4098.810521, best_reward: 17325.300000 ± 5136.084229 in #1


Epoch #18: 1001it [00:02, 355.64it/s, env_step=18000, gradient_step=1800, len=180, n/ep=1, n/st=100, rew=8863.00]                                                                                 


Epoch #18: test_reward: 10150.800000 ± 5545.057273, best_reward: 17325.300000 ± 5136.084229 in #1


Epoch #19: 1001it [00:02, 399.73it/s, env_step=19000, gradient_step=1900, len=188, n/ep=0, n/st=100, rew=10527.62]                                                                                


Epoch #19: test_reward: 7525.000000 ± 2995.059432, best_reward: 17325.300000 ± 5136.084229 in #1


Epoch #20: 1001it [00:02, 412.14it/s, env_step=20000, gradient_step=2000, len=199, n/ep=0, n/st=100, rew=11616.00]                                                                                


Epoch #20: test_reward: 8911.800000 ± 2729.192621, best_reward: 17325.300000 ± 5136.084229 in #1


Epoch #21: 1001it [00:02, 456.10it/s, env_step=21000, gradient_step=2100, len=208, n/ep=0, n/st=100, rew=12613.00]                                                                                


Epoch #21: test_reward: 12945.600000 ± 5574.861491, best_reward: 17325.300000 ± 5136.084229 in #1


Epoch #22: 1001it [00:02, 371.27it/s, env_step=22000, gradient_step=2200, len=220, n/ep=1, n/st=100, rew=13713.50]                                                                                


Epoch #22: test_reward: 13915.300000 ± 7308.787424, best_reward: 17325.300000 ± 5136.084229 in #1


Epoch #23: 1001it [00:02, 374.81it/s, env_step=23000, gradient_step=2300, len=229, n/ep=0, n/st=100, rew=13753.00]                                                                                


Epoch #23: test_reward: 15275.900000 ± 9109.101585, best_reward: 17325.300000 ± 5136.084229 in #1


Epoch #24: 1001it [00:02, 361.69it/s, env_step=24000, gradient_step=2400, len=240, n/ep=1, n/st=100, rew=14576.00]                                                                                


Epoch #24: test_reward: 10562.900000 ± 1907.884716, best_reward: 17325.300000 ± 5136.084229 in #1


Epoch #25: 1001it [00:02, 339.37it/s, env_step=25000, gradient_step=2500, len=106, n/ep=0, n/st=100, rew=4707.00]                                                                                 


Epoch #25: test_reward: 9599.800000 ± 3352.309974, best_reward: 17325.300000 ± 5136.084229 in #1


Epoch #26: 1001it [00:02, 415.33it/s, env_step=26000, gradient_step=2600, len=260, n/ep=1, n/st=100, rew=15869.00]                                                                                


Epoch #26: test_reward: 12158.600000 ± 4406.425767, best_reward: 17325.300000 ± 5136.084229 in #1


Epoch #27: 1001it [00:02, 394.21it/s, env_step=27000, gradient_step=2700, len=263, n/ep=0, n/st=100, rew=16306.75]                                                                                


Epoch #27: test_reward: 13366.300000 ± 5267.353549, best_reward: 17325.300000 ± 5136.084229 in #1


Epoch #28: 1001it [00:02, 428.75it/s, env_step=28000, gradient_step=2800, len=158, n/ep=0, n/st=100, rew=8295.00]                                                                                 


Epoch #28: test_reward: 8534.000000 ± 7408.579675, best_reward: 17325.300000 ± 5136.084229 in #1


Epoch #29: 1001it [00:02, 437.93it/s, env_step=29000, gradient_step=2900, len=199, n/ep=2, n/st=100, rew=9885.50]                                                                                 


Epoch #29: test_reward: 10876.700000 ± 4549.460101, best_reward: 17325.300000 ± 5136.084229 in #1


Epoch #30: 1001it [00:02, 399.68it/s, env_step=30000, gradient_step=3000, len=142, n/ep=0, n/st=100, rew=6083.00]                                                                                 


Epoch #30: test_reward: 10324.200000 ± 2986.156252, best_reward: 17325.300000 ± 5136.084229 in #1


Epoch #31: 1001it [00:02, 382.39it/s, env_step=31000, gradient_step=3100, len=100, n/ep=0, n/st=100, rew=3694.00]                                                                                 


Epoch #31: test_reward: 12837.900000 ± 3459.196105, best_reward: 17325.300000 ± 5136.084229 in #1


Epoch #32: 1001it [00:02, 336.61it/s, env_step=32000, gradient_step=3200, len=153, n/ep=2, n/st=100, rew=6845.00]                                                                                 


Epoch #32: test_reward: 8631.100000 ± 3899.435612, best_reward: 17325.300000 ± 5136.084229 in #1


Epoch #33: 1001it [00:02, 431.61it/s, env_step=33000, gradient_step=3300, len=110, n/ep=2, n/st=100, rew=5721.50]                                                                                 


Epoch #33: test_reward: 14170.000000 ± 5670.035714, best_reward: 17325.300000 ± 5136.084229 in #1


Epoch #34: 1001it [00:02, 375.88it/s, env_step=34000, gradient_step=3400, len=118, n/ep=1, n/st=100, rew=6317.00]                                                                                 


Epoch #34: test_reward: 11474.500000 ± 1938.575934, best_reward: 17325.300000 ± 5136.084229 in #1


Epoch #35: 1001it [00:02, 378.65it/s, env_step=35000, gradient_step=3500, len=177, n/ep=2, n/st=100, rew=9215.00]                                                                                 


Epoch #35: test_reward: 7336.800000 ± 3840.443745, best_reward: 17325.300000 ± 5136.084229 in #1


Epoch #36: 1001it [00:02, 408.76it/s, env_step=36000, gradient_step=3600, len=212, n/ep=0, n/st=100, rew=11696.00]                                                                                


Epoch #36: test_reward: 10801.800000 ± 2373.540680, best_reward: 17325.300000 ± 5136.084229 in #1


Epoch #37: 1001it [00:02, 353.23it/s, env_step=37000, gradient_step=3700, len=48, n/ep=0, n/st=100, rew=1909.00]                                                                                  


Epoch #37: test_reward: 13394.000000 ± 5284.024319, best_reward: 17325.300000 ± 5136.084229 in #1


Epoch #38: 1001it [00:02, 388.39it/s, env_step=38000, gradient_step=3800, len=52, n/ep=1, n/st=100, rew=2389.00]                                                                                  


Epoch #38: test_reward: 8769.100000 ± 4171.326131, best_reward: 17325.300000 ± 5136.084229 in #1


Epoch #39: 1001it [00:02, 378.03it/s, env_step=39000, gradient_step=3900, len=212, n/ep=0, n/st=100, rew=11587.17]                                                                                


Epoch #39: test_reward: 17816.800000 ± 7825.579094, best_reward: 17816.800000 ± 7825.579094 in #39


Epoch #40: 1001it [00:02, 365.48it/s, env_step=40000, gradient_step=4000, len=236, n/ep=3, n/st=100, rew=13719.83]                                                                                


Epoch #40: test_reward: 8301.700000 ± 4162.334779, best_reward: 17816.800000 ± 7825.579094 in #39


Epoch #41: 1001it [00:02, 348.98it/s, env_step=41000, gradient_step=4100, len=113, n/ep=2, n/st=100, rew=5889.75]                                                                                 


Epoch #41: test_reward: 10173.600000 ± 3513.343570, best_reward: 17816.800000 ± 7825.579094 in #39


Epoch #42: 1001it [00:02, 412.48it/s, env_step=42000, gradient_step=4200, len=231, n/ep=0, n/st=100, rew=14273.75]                                                                                


Epoch #42: test_reward: 7456.200000 ± 4365.847038, best_reward: 17816.800000 ± 7825.579094 in #39


Epoch #43: 1001it [00:02, 379.87it/s, env_step=43000, gradient_step=4300, len=287, n/ep=1, n/st=100, rew=16174.50]                                                                                


Epoch #43: test_reward: 13019.400000 ± 2412.100338, best_reward: 17816.800000 ± 7825.579094 in #39


Epoch #44: 1001it [00:02, 371.91it/s, env_step=44000, gradient_step=4400, len=110, n/ep=1, n/st=100, rew=6057.50]                                                                                 


Epoch #44: test_reward: 9098.900000 ± 5468.522697, best_reward: 17816.800000 ± 7825.579094 in #39


Epoch #45: 1001it [00:02, 388.54it/s, env_step=45000, gradient_step=4500, len=98, n/ep=0, n/st=100, rew=5430.00]                                                                                  


Epoch #45: test_reward: 12310.900000 ± 4671.294434, best_reward: 17816.800000 ± 7825.579094 in #39


Epoch #46: 1001it [00:02, 395.54it/s, env_step=46000, gradient_step=4600, len=274, n/ep=1, n/st=100, rew=14498.00]                                                                                


Epoch #46: test_reward: 10364.400000 ± 3478.055986, best_reward: 17816.800000 ± 7825.579094 in #39


Epoch #47: 1001it [00:02, 385.39it/s, env_step=47000, gradient_step=4700, len=194, n/ep=0, n/st=100, rew=11564.00]                                                                                


Epoch #47: test_reward: 10556.800000 ± 2313.800199, best_reward: 17816.800000 ± 7825.579094 in #39


Epoch #48: 1001it [00:02, 397.67it/s, env_step=48000, gradient_step=4800, len=140, n/ep=0, n/st=100, rew=8246.50]                                                                                 


Epoch #48: test_reward: 11878.500000 ± 7955.825919, best_reward: 17816.800000 ± 7825.579094 in #39


Epoch #49: 1001it [00:03, 327.33it/s, env_step=49000, gradient_step=4900, len=68, n/ep=0, n/st=100, rew=3308.00]                                                                                  


Epoch #49: test_reward: 11594.800000 ± 2631.675390, best_reward: 17816.800000 ± 7825.579094 in #39


Epoch #50: 1001it [00:02, 358.51it/s, env_step=50000, gradient_step=5000, len=86, n/ep=1, n/st=100, rew=4321.00]                                                                                  


Epoch #50: test_reward: 10290.500000 ± 3803.674309, best_reward: 17816.800000 ± 7825.579094 in #39


Epoch #51: 1001it [00:02, 415.33it/s, env_step=51000, gradient_step=5100, len=229, n/ep=2, n/st=100, rew=14114.50]                                                                                


Epoch #51: test_reward: 10764.100000 ± 3823.868760, best_reward: 17816.800000 ± 7825.579094 in #39


Epoch #52: 1001it [00:02, 380.00it/s, env_step=52000, gradient_step=5200, len=146, n/ep=0, n/st=100, rew=6761.00]                                                                                 


Epoch #52: test_reward: 15028.600000 ± 3194.539097, best_reward: 17816.800000 ± 7825.579094 in #39


Epoch #53: 1001it [00:02, 383.80it/s, env_step=53000, gradient_step=5300, len=46, n/ep=0, n/st=100, rew=1431.00]                                                                                  


Epoch #53: test_reward: 10404.000000 ± 2485.404796, best_reward: 17816.800000 ± 7825.579094 in #39


Epoch #54: 1001it [00:02, 417.70it/s, env_step=54000, gradient_step=5400, len=124, n/ep=0, n/st=100, rew=7013.00]                                                                                 


Epoch #54: test_reward: 11217.300000 ± 5490.027032, best_reward: 17816.800000 ± 7825.579094 in #39


Epoch #55: 1001it [00:02, 347.48it/s, env_step=55000, gradient_step=5500, len=100, n/ep=0, n/st=100, rew=5061.00]                                                                                 


Epoch #55: test_reward: 14002.100000 ± 7504.788411, best_reward: 17816.800000 ± 7825.579094 in #39


Epoch #56: 1001it [00:02, 374.62it/s, env_step=56000, gradient_step=5600, len=220, n/ep=2, n/st=100, rew=12680.00]                                                                                


Epoch #56: test_reward: 12547.600000 ± 6626.876537, best_reward: 17816.800000 ± 7825.579094 in #39


Epoch #57: 1001it [00:02, 362.21it/s, env_step=57000, gradient_step=5700, len=136, n/ep=3, n/st=100, rew=7478.00]                                                                                 


Epoch #57: test_reward: 11948.000000 ± 5914.595033, best_reward: 17816.800000 ± 7825.579094 in #39


Epoch #58: 1001it [00:02, 334.24it/s, env_step=58000, gradient_step=5800, len=152, n/ep=0, n/st=100, rew=9268.00]                                                                                 


Epoch #58: test_reward: 16838.000000 ± 5844.391089, best_reward: 17816.800000 ± 7825.579094 in #39


Epoch #59: 1001it [00:02, 379.78it/s, env_step=59000, gradient_step=5900, len=112, n/ep=1, n/st=100, rew=5417.00]                                                                                 


Epoch #59: test_reward: 14780.400000 ± 6542.335412, best_reward: 17816.800000 ± 7825.579094 in #39


Epoch #60: 1001it [00:02, 341.61it/s, env_step=60000, gradient_step=6000, len=152, n/ep=0, n/st=100, rew=9409.50]                                                                                 


Epoch #60: test_reward: 14600.800000 ± 4206.066233, best_reward: 17816.800000 ± 7825.579094 in #39


Epoch #61: 1001it [00:02, 367.15it/s, env_step=61000, gradient_step=6100, len=164, n/ep=2, n/st=100, rew=9740.25]                                                                                 


Epoch #61: test_reward: 14317.100000 ± 8530.905338, best_reward: 17816.800000 ± 7825.579094 in #39


Epoch #62: 1001it [00:02, 415.98it/s, env_step=62000, gradient_step=6200, len=162, n/ep=2, n/st=100, rew=9584.25]                                                                                 


Epoch #62: test_reward: 12871.400000 ± 3251.347696, best_reward: 17816.800000 ± 7825.579094 in #39


Epoch #63: 1001it [00:02, 421.13it/s, env_step=63000, gradient_step=6300, len=228, n/ep=0, n/st=100, rew=13952.00]                                                                                


Epoch #63: test_reward: 11073.100000 ± 3470.636871, best_reward: 17816.800000 ± 7825.579094 in #39


Epoch #64: 1001it [00:02, 429.92it/s, env_step=64000, gradient_step=6400, len=116, n/ep=0, n/st=100, rew=5723.00]                                                                                 


Epoch #64: test_reward: 7868.900000 ± 5507.959376, best_reward: 17816.800000 ± 7825.579094 in #39


Epoch #65: 1001it [00:02, 399.38it/s, env_step=65000, gradient_step=6500, len=184, n/ep=0, n/st=100, rew=10916.75]                                                                                


Epoch #65: test_reward: 14660.600000 ± 7216.687221, best_reward: 17816.800000 ± 7825.579094 in #39


Epoch #66: 1001it [00:02, 362.13it/s, env_step=66000, gradient_step=6600, len=64, n/ep=0, n/st=100, rew=3060.00]                                                                                  


Epoch #66: test_reward: 13050.600000 ± 4567.874258, best_reward: 17816.800000 ± 7825.579094 in #39


Epoch #67: 1001it [00:02, 419.75it/s, env_step=67000, gradient_step=6700, len=118, n/ep=0, n/st=100, rew=6003.00]                                                                                 


Epoch #67: test_reward: 7629.200000 ± 2103.513242, best_reward: 17816.800000 ± 7825.579094 in #39


Epoch #68: 1001it [00:02, 413.28it/s, env_step=68000, gradient_step=6800, len=68, n/ep=1, n/st=100, rew=3301.00]                                                                                  


Epoch #68: test_reward: 12243.100000 ± 4046.581630, best_reward: 17816.800000 ± 7825.579094 in #39


Epoch #69: 1001it [00:02, 384.73it/s, env_step=69000, gradient_step=6900, len=102, n/ep=1, n/st=100, rew=4600.00]                                                                                 


Epoch #69: test_reward: 14183.400000 ± 7371.766738, best_reward: 17816.800000 ± 7825.579094 in #39


Epoch #70: 1001it [00:02, 387.64it/s, env_step=70000, gradient_step=7000, len=93, n/ep=2, n/st=100, rew=4678.25]                                                                                  


Epoch #70: test_reward: 14442.000000 ± 4895.051072, best_reward: 17816.800000 ± 7825.579094 in #39


Epoch #71: 1001it [00:02, 355.78it/s, env_step=71000, gradient_step=7100, len=46, n/ep=0, n/st=100, rew=2103.00]                                                                                  


Epoch #71: test_reward: 9636.400000 ± 5757.590715, best_reward: 17816.800000 ± 7825.579094 in #39


Epoch #72: 1001it [00:02, 399.50it/s, env_step=72000, gradient_step=7200, len=152, n/ep=0, n/st=100, rew=8463.50]                                                                                 


Epoch #72: test_reward: 12228.200000 ± 3813.734805, best_reward: 17816.800000 ± 7825.579094 in #39


Epoch #73: 1001it [00:02, 348.23it/s, env_step=73000, gradient_step=7300, len=206, n/ep=1, n/st=100, rew=12764.50]                                                                                


Epoch #73: test_reward: 14115.500000 ± 7215.239112, best_reward: 17816.800000 ± 7825.579094 in #39


Epoch #74: 1001it [00:02, 391.33it/s, env_step=74000, gradient_step=7400, len=197, n/ep=0, n/st=100, rew=12663.00]                                                                                


Epoch #74: test_reward: 10797.100000 ± 3749.541531, best_reward: 17816.800000 ± 7825.579094 in #39


Epoch #75: 1001it [00:02, 423.56it/s, env_step=75000, gradient_step=7500, len=210, n/ep=0, n/st=100, rew=13203.00]                                                                                


Epoch #75: test_reward: 12939.700000 ± 5888.181995, best_reward: 17816.800000 ± 7825.579094 in #39


Epoch #76: 1001it [00:02, 361.35it/s, env_step=76000, gradient_step=7600, len=238, n/ep=0, n/st=100, rew=15559.00]                                                                                


Epoch #76: test_reward: 10733.900000 ± 4304.084907, best_reward: 17816.800000 ± 7825.579094 in #39


Epoch #77: 1001it [00:02, 385.46it/s, env_step=77000, gradient_step=7700, len=220, n/ep=0, n/st=100, rew=13631.00]                                                                                


Epoch #77: test_reward: 14333.000000 ± 6613.623712, best_reward: 17816.800000 ± 7825.579094 in #39


Epoch #78: 1001it [00:02, 427.64it/s, env_step=78000, gradient_step=7800, len=193, n/ep=0, n/st=100, rew=12482.00]                                                                                


Epoch #78: test_reward: 18040.100000 ± 8155.885659, best_reward: 18040.100000 ± 8155.885659 in #78


Epoch #79: 1001it [00:02, 431.62it/s, env_step=79000, gradient_step=7900, len=81, n/ep=2, n/st=100, rew=3640.50]                                                                                  


Epoch #79: test_reward: 13530.100000 ± 6117.737171, best_reward: 18040.100000 ± 8155.885659 in #78


Epoch #80: 1001it [00:02, 414.16it/s, env_step=80000, gradient_step=8000, len=170, n/ep=0, n/st=100, rew=10554.00]                                                                                


Epoch #80: test_reward: 21303.900000 ± 4769.959548, best_reward: 21303.900000 ± 4769.959548 in #80


Epoch #81: 1001it [00:02, 415.05it/s, env_step=81000, gradient_step=8100, len=241, n/ep=0, n/st=100, rew=15324.00]                                                                                


Epoch #81: test_reward: 20948.800000 ± 7903.298056, best_reward: 21303.900000 ± 4769.959548 in #80


Epoch #82: 1001it [00:02, 384.61it/s, env_step=82000, gradient_step=8200, len=182, n/ep=2, n/st=100, rew=11042.25]                                                                                


Epoch #82: test_reward: 10157.400000 ± 3621.551938, best_reward: 21303.900000 ± 4769.959548 in #80


Epoch #83: 1001it [00:02, 374.63it/s, env_step=83000, gradient_step=8300, len=200, n/ep=0, n/st=100, rew=12468.00]                                                                                


Epoch #83: test_reward: 14606.400000 ± 7326.478707, best_reward: 21303.900000 ± 4769.959548 in #80


Epoch #84: 1001it [00:02, 374.38it/s, env_step=84000, gradient_step=8400, len=212, n/ep=2, n/st=100, rew=12914.50]                                                                                


Epoch #84: test_reward: 10116.100000 ± 10083.465202, best_reward: 21303.900000 ± 4769.959548 in #80


Epoch #85: 1001it [00:02, 407.32it/s, env_step=85000, gradient_step=8500, len=207, n/ep=0, n/st=100, rew=12335.00]                                                                                


Epoch #85: test_reward: 13216.000000 ± 4168.452663, best_reward: 21303.900000 ± 4769.959548 in #80


Epoch #86: 1001it [00:02, 412.82it/s, env_step=86000, gradient_step=8600, len=186, n/ep=0, n/st=100, rew=11336.00]                                                                                


Epoch #86: test_reward: 11158.400000 ± 1565.331352, best_reward: 21303.900000 ± 4769.959548 in #80


Epoch #87: 1001it [00:02, 417.80it/s, env_step=87000, gradient_step=8700, len=288, n/ep=0, n/st=100, rew=18664.00]                                                                                


Epoch #87: test_reward: 12587.900000 ± 2587.360487, best_reward: 21303.900000 ± 4769.959548 in #80


Epoch #88: 1001it [00:02, 411.46it/s, env_step=88000, gradient_step=8800, len=213, n/ep=0, n/st=100, rew=13643.00]                                                                                


Epoch #88: test_reward: 9950.000000 ± 2806.413797, best_reward: 21303.900000 ± 4769.959548 in #80


Epoch #89: 1001it [00:02, 405.17it/s, env_step=89000, gradient_step=8900, len=216, n/ep=1, n/st=100, rew=13288.00]                                                                                


Epoch #89: test_reward: 16443.100000 ± 7733.844276, best_reward: 21303.900000 ± 4769.959548 in #80


Epoch #90: 1001it [00:02, 384.12it/s, env_step=90000, gradient_step=9000, len=182, n/ep=0, n/st=100, rew=11449.00]                                                                                


Epoch #90: test_reward: 11535.100000 ± 2888.247824, best_reward: 21303.900000 ± 4769.959548 in #80


Epoch #91: 1001it [00:02, 417.81it/s, env_step=91000, gradient_step=9100, len=236, n/ep=0, n/st=100, rew=14915.50]                                                                                


Epoch #91: test_reward: 14538.600000 ± 5303.124290, best_reward: 21303.900000 ± 4769.959548 in #80


Epoch #92: 1001it [00:02, 357.36it/s, env_step=92000, gradient_step=9200, len=164, n/ep=1, n/st=100, rew=8911.00]                                                                                 


Epoch #92: test_reward: 15258.700000 ± 6078.461221, best_reward: 21303.900000 ± 4769.959548 in #80


Epoch #93: 1001it [00:02, 357.67it/s, env_step=93000, gradient_step=9300, len=152, n/ep=0, n/st=100, rew=8495.00]                                                                                 


Epoch #93: test_reward: 19869.200000 ± 4181.266693, best_reward: 21303.900000 ± 4769.959548 in #80


Epoch #94: 1001it [00:02, 376.46it/s, env_step=94000, gradient_step=9400, len=336, n/ep=1, n/st=100, rew=22213.00]                                                                                


Epoch #94: test_reward: 8011.500000 ± 7756.680775, best_reward: 21303.900000 ± 4769.959548 in #80


Epoch #95: 1001it [00:03, 332.61it/s, env_step=95000, gradient_step=9500, len=168, n/ep=1, n/st=100, rew=9789.00]                                                                                 


Epoch #95: test_reward: 14165.900000 ± 3679.063480, best_reward: 21303.900000 ± 4769.959548 in #80


Epoch #96: 1001it [00:02, 359.45it/s, env_step=96000, gradient_step=9600, len=237, n/ep=1, n/st=100, rew=15678.00]                                                                                


Epoch #96: test_reward: 13076.000000 ± 2508.642820, best_reward: 21303.900000 ± 4769.959548 in #80


Epoch #97: 1001it [00:03, 332.85it/s, env_step=97000, gradient_step=9700, len=148, n/ep=0, n/st=100, rew=8390.25]                                                                                 


Epoch #97: test_reward: 4230.000000 ± 4709.751841, best_reward: 21303.900000 ± 4769.959548 in #80


Epoch #98: 1001it [00:02, 358.51it/s, env_step=98000, gradient_step=9800, len=76, n/ep=1, n/st=100, rew=3192.00]                                                                                  


Epoch #98: test_reward: 12533.600000 ± 8221.459520, best_reward: 21303.900000 ± 4769.959548 in #80


Epoch #99: 1001it [00:02, 343.84it/s, env_step=99000, gradient_step=9900, len=180, n/ep=3, n/st=100, rew=10398.33]                                                                                


Epoch #99: test_reward: 19065.400000 ± 4856.880629, best_reward: 21303.900000 ± 4769.959548 in #80


Epoch #100: 1001it [00:02, 344.55it/s, env_step=100000, gradient_step=10000, len=34, n/ep=1, n/st=100, rew=1317.00]                                                                               


Epoch #100: test_reward: 15726.000000 ± 6948.195104, best_reward: 21303.900000 ± 4769.959548 in #80

InfoStats(gradient_step=10000, best_reward=21303.9, best_reward_std=4769.959548046502, train_step=100000, train_episode=525, test_step=197844, test_episode=1010, timing=TimingStats(total_time=367.81300234794617, train_time=260.3338129520416, train_time_collect=34.02744722366333, train_time_update=222.23253631591797, test_time=107.47918939590454, update_speed=384.1222116560859))

(the trained policy can be accessed via policy.policies[agents[1]])
Train with agent_opponent #5


Epoch #1: 1001it [00:02, 413.30it/s, env_step=1000, gradient_step=100, len=0, n/ep=0, n/st=100, rew=0.00]                                                                                         


Epoch #1: test_reward: 13243.200000 ± 6376.803899, best_reward: 13243.200000 ± 6376.803899 in #1


Epoch #2: 1001it [00:02, 423.65it/s, env_step=2000, gradient_step=200, len=0, n/ep=0, n/st=100, rew=0.00]                                                                                         


Epoch #2: test_reward: 9168.100000 ± 5135.575887, best_reward: 13243.200000 ± 6376.803899 in #1


Epoch #3: 1001it [00:02, 432.35it/s, env_step=3000, gradient_step=300, len=0, n/ep=0, n/st=100, rew=0.00]                                                                                         


Epoch #3: test_reward: 10847.300000 ± 6597.088874, best_reward: 13243.200000 ± 6376.803899 in #1


Epoch #4: 1001it [00:02, 413.05it/s, env_step=4000, gradient_step=400, len=0, n/ep=0, n/st=100, rew=0.00]                                                                                         


Epoch #4: test_reward: 10927.200000 ± 3070.442763, best_reward: 13243.200000 ± 6376.803899 in #1


Epoch #5: 1001it [00:02, 443.56it/s, env_step=5000, gradient_step=500, len=50, n/ep=1, n/st=100, rew=1066.00]                                                                                     


Epoch #5: test_reward: 10113.300000 ± 4186.986746, best_reward: 13243.200000 ± 6376.803899 in #1


Epoch #6: 1001it [00:02, 445.69it/s, env_step=6000, gradient_step=600, len=56, n/ep=0, n/st=100, rew=1325.00]                                                                                     


Epoch #6: test_reward: 10134.600000 ± 3221.762536, best_reward: 13243.200000 ± 6376.803899 in #1


Epoch #7: 1001it [00:02, 473.70it/s, env_step=7000, gradient_step=700, len=68, n/ep=0, n/st=100, rew=2484.00]                                                                                     


Epoch #7: test_reward: 11683.500000 ± 5333.763835, best_reward: 13243.200000 ± 6376.803899 in #1


Epoch #8: 1001it [00:02, 389.83it/s, env_step=8000, gradient_step=800, len=72, n/ep=0, n/st=100, rew=1399.00]                                                                                     


Epoch #8: test_reward: 8465.500000 ± 3455.464694, best_reward: 13243.200000 ± 6376.803899 in #1


Epoch #9: 1001it [00:02, 370.97it/s, env_step=9000, gradient_step=900, len=88, n/ep=0, n/st=100, rew=2851.00]                                                                                     


Epoch #9: test_reward: 11715.300000 ± 6036.824265, best_reward: 13243.200000 ± 6376.803899 in #1


Epoch #10: 1001it [00:02, 407.85it/s, env_step=10000, gradient_step=1000, len=98, n/ep=0, n/st=100, rew=3455.50]                                                                                  


Epoch #10: test_reward: 15448.000000 ± 5049.814749, best_reward: 15448.000000 ± 5049.814749 in #10


Epoch #11: 1001it [00:02, 432.75it/s, env_step=11000, gradient_step=1100, len=108, n/ep=0, n/st=100, rew=3978.00]                                                                                 


Epoch #11: test_reward: 11380.900000 ± 3951.871796, best_reward: 15448.000000 ± 5049.814749 in #10


Epoch #12: 1001it [00:02, 396.62it/s, env_step=12000, gradient_step=1200, len=118, n/ep=0, n/st=100, rew=3627.00]                                                                                 


Epoch #12: test_reward: 11891.100000 ± 5277.454755, best_reward: 15448.000000 ± 5049.814749 in #10


Epoch #13: 1001it [00:02, 424.30it/s, env_step=13000, gradient_step=1300, len=130, n/ep=2, n/st=100, rew=4599.50]                                                                                 


Epoch #13: test_reward: 9776.600000 ± 7032.389469, best_reward: 15448.000000 ± 5049.814749 in #10


Epoch #14: 1001it [00:02, 379.34it/s, env_step=14000, gradient_step=1400, len=140, n/ep=1, n/st=100, rew=6040.00]                                                                                 


Epoch #14: test_reward: 18125.600000 ± 9899.259035, best_reward: 18125.600000 ± 9899.259035 in #14


Epoch #15: 1001it [00:02, 450.97it/s, env_step=15000, gradient_step=1500, len=148, n/ep=0, n/st=100, rew=5588.62]                                                                                 


Epoch #15: test_reward: 17475.400000 ± 6981.483857, best_reward: 18125.600000 ± 9899.259035 in #14


Epoch #16: 1001it [00:02, 338.58it/s, env_step=16000, gradient_step=1600, len=119, n/ep=0, n/st=100, rew=4495.17]                                                                                 


Epoch #16: test_reward: 9791.200000 ± 6944.065103, best_reward: 18125.600000 ± 9899.259035 in #14


Epoch #17: 1001it [00:02, 398.95it/s, env_step=17000, gradient_step=1700, len=142, n/ep=2, n/st=100, rew=4911.50]                                                                                 


Epoch #17: test_reward: 10603.800000 ± 6765.665522, best_reward: 18125.600000 ± 9899.259035 in #14


Epoch #18: 1001it [00:02, 353.39it/s, env_step=18000, gradient_step=1800, len=180, n/ep=1, n/st=100, rew=7547.00]                                                                                 


Epoch #18: test_reward: 16362.000000 ± 6496.281598, best_reward: 18125.600000 ± 9899.259035 in #14


Epoch #19: 1001it [00:02, 397.46it/s, env_step=19000, gradient_step=1900, len=132, n/ep=0, n/st=100, rew=4978.25]                                                                                 


Epoch #19: test_reward: 12189.600000 ± 5590.612528, best_reward: 18125.600000 ± 9899.259035 in #14


Epoch #20: 1001it [00:02, 350.17it/s, env_step=20000, gradient_step=2000, len=200, n/ep=1, n/st=100, rew=9439.00]                                                                                 


Epoch #20: test_reward: 11398.900000 ± 4617.195479, best_reward: 18125.600000 ± 9899.259035 in #14


Epoch #21: 1001it [00:02, 348.14it/s, env_step=21000, gradient_step=2100, len=164, n/ep=2, n/st=100, rew=6985.25]                                                                                 


Epoch #21: test_reward: 11833.500000 ± 2674.071100, best_reward: 18125.600000 ± 9899.259035 in #14


Epoch #22: 1001it [00:02, 405.74it/s, env_step=22000, gradient_step=2200, len=54, n/ep=1, n/st=100, rew=1786.00]                                                                                  


Epoch #22: test_reward: 15229.100000 ± 5546.501176, best_reward: 18125.600000 ± 9899.259035 in #14


Epoch #23: 1001it [00:02, 392.30it/s, env_step=23000, gradient_step=2300, len=228, n/ep=0, n/st=100, rew=10021.00]                                                                                


Epoch #23: test_reward: 12359.200000 ± 3096.629290, best_reward: 18125.600000 ± 9899.259035 in #14


Epoch #24: 1001it [00:02, 428.05it/s, env_step=24000, gradient_step=2400, len=141, n/ep=2, n/st=100, rew=6901.25]                                                                                 


Epoch #24: test_reward: 7752.700000 ± 4577.447893, best_reward: 18125.600000 ± 9899.259035 in #14


Epoch #25: 1001it [00:02, 377.41it/s, env_step=25000, gradient_step=2500, len=104, n/ep=1, n/st=100, rew=4734.00]                                                                                 


Epoch #25: test_reward: 9306.500000 ± 3652.547419, best_reward: 18125.600000 ± 9899.259035 in #14


Epoch #26: 1001it [00:02, 368.01it/s, env_step=26000, gradient_step=2600, len=80, n/ep=1, n/st=100, rew=3043.00]                                                                                  


Epoch #26: test_reward: 7176.500000 ± 3651.550499, best_reward: 18125.600000 ± 9899.259035 in #14


Epoch #27: 1001it [00:02, 337.06it/s, env_step=27000, gradient_step=2700, len=146, n/ep=0, n/st=100, rew=8685.00]                                                                                 


Epoch #27: test_reward: 12275.800000 ± 1742.142233, best_reward: 18125.600000 ± 9899.259035 in #14


Epoch #28: 1001it [00:02, 354.52it/s, env_step=28000, gradient_step=2800, len=166, n/ep=1, n/st=100, rew=8421.00]                                                                                 


Epoch #28: test_reward: 6697.400000 ± 3416.516946, best_reward: 18125.600000 ± 9899.259035 in #14


Epoch #29: 1001it [00:02, 419.03it/s, env_step=29000, gradient_step=2900, len=288, n/ep=0, n/st=100, rew=13417.00]                                                                                


Epoch #29: test_reward: 11910.400000 ± 3838.027572, best_reward: 18125.600000 ± 9899.259035 in #14


Epoch #30: 1001it [00:02, 371.32it/s, env_step=30000, gradient_step=3000, len=138, n/ep=0, n/st=100, rew=6930.00]                                                                                 


Epoch #30: test_reward: 8308.600000 ± 2417.522707, best_reward: 18125.600000 ± 9899.259035 in #14


Epoch #31: 1001it [00:02, 410.73it/s, env_step=31000, gradient_step=3100, len=101, n/ep=2, n/st=100, rew=5226.00]                                                                                 


Epoch #31: test_reward: 12427.700000 ± 2134.735349, best_reward: 18125.600000 ± 9899.259035 in #14


Epoch #32: 1001it [00:02, 424.12it/s, env_step=32000, gradient_step=3200, len=112, n/ep=2, n/st=100, rew=5693.50]                                                                                 


Epoch #32: test_reward: 8039.800000 ± 3676.048144, best_reward: 18125.600000 ± 9899.259035 in #14


Epoch #33: 1001it [00:02, 381.83it/s, env_step=33000, gradient_step=3300, len=168, n/ep=1, n/st=100, rew=9391.00]                                                                                 


Epoch #33: test_reward: 14626.900000 ± 6576.332754, best_reward: 18125.600000 ± 9899.259035 in #14


Epoch #34: 1001it [00:02, 399.10it/s, env_step=34000, gradient_step=3400, len=136, n/ep=1, n/st=100, rew=6927.00]                                                                                 


Epoch #34: test_reward: 14645.100000 ± 7246.521034, best_reward: 18125.600000 ± 9899.259035 in #14


Epoch #35: 1001it [00:02, 423.73it/s, env_step=35000, gradient_step=3500, len=132, n/ep=2, n/st=100, rew=6684.75]                                                                                 


Epoch #35: test_reward: 11351.600000 ± 6461.691763, best_reward: 18125.600000 ± 9899.259035 in #14


Epoch #36: 1001it [00:02, 349.36it/s, env_step=36000, gradient_step=3600, len=192, n/ep=2, n/st=100, rew=10219.50]                                                                                


Epoch #36: test_reward: 13198.600000 ± 4016.722475, best_reward: 18125.600000 ± 9899.259035 in #14


Epoch #37: 1001it [00:02, 409.10it/s, env_step=37000, gradient_step=3700, len=161, n/ep=2, n/st=100, rew=8779.50]                                                                                 


Epoch #37: test_reward: 10459.700000 ± 3141.325391, best_reward: 18125.600000 ± 9899.259035 in #14


Epoch #38: 1001it [00:03, 325.41it/s, env_step=38000, gradient_step=3800, len=173, n/ep=2, n/st=100, rew=10482.00]                                                                                


Epoch #38: test_reward: 12512.500000 ± 6833.102462, best_reward: 18125.600000 ± 9899.259035 in #14


Epoch #39: 1001it [00:02, 336.10it/s, env_step=39000, gradient_step=3900, len=188, n/ep=1, n/st=100, rew=10970.00]                                                                                


Epoch #39: test_reward: 11242.400000 ± 5713.616424, best_reward: 18125.600000 ± 9899.259035 in #14


Epoch #40: 1001it [00:02, 353.81it/s, env_step=40000, gradient_step=4000, len=299, n/ep=2, n/st=100, rew=15991.75]                                                                                


Epoch #40: test_reward: 15761.200000 ± 4777.909664, best_reward: 18125.600000 ± 9899.259035 in #14


Epoch #41: 1001it [00:02, 358.75it/s, env_step=41000, gradient_step=4100, len=64, n/ep=1, n/st=100, rew=2852.00]                                                                                  


Epoch #41: test_reward: 9482.800000 ± 3904.816508, best_reward: 18125.600000 ± 9899.259035 in #14


Epoch #42: 1001it [00:02, 369.97it/s, env_step=42000, gradient_step=4200, len=310, n/ep=0, n/st=100, rew=17660.50]                                                                                


Epoch #42: test_reward: 8547.300000 ± 4511.674723, best_reward: 18125.600000 ± 9899.259035 in #14


Epoch #43: 1001it [00:02, 369.11it/s, env_step=43000, gradient_step=4300, len=201, n/ep=2, n/st=100, rew=12068.75]                                                                                


Epoch #43: test_reward: 11889.500000 ± 2656.824354, best_reward: 18125.600000 ± 9899.259035 in #14


Epoch #44: 1001it [00:02, 367.14it/s, env_step=44000, gradient_step=4400, len=90, n/ep=2, n/st=100, rew=4355.75]                                                                                  


Epoch #44: test_reward: 15153.200000 ± 4548.738282, best_reward: 18125.600000 ± 9899.259035 in #14


Epoch #45: 1001it [00:02, 384.81it/s, env_step=45000, gradient_step=4500, len=170, n/ep=1, n/st=100, rew=9239.00]                                                                                 


Epoch #45: test_reward: 11108.900000 ± 7125.935819, best_reward: 18125.600000 ± 9899.259035 in #14


Epoch #46: 1001it [00:02, 381.76it/s, env_step=46000, gradient_step=4600, len=128, n/ep=1, n/st=100, rew=6546.00]                                                                                 


Epoch #46: test_reward: 10024.600000 ± 4786.800104, best_reward: 18125.600000 ± 9899.259035 in #14


Epoch #47: 1001it [00:02, 417.95it/s, env_step=47000, gradient_step=4700, len=131, n/ep=0, n/st=100, rew=7068.50]                                                                                 


Epoch #47: test_reward: 13379.100000 ± 4922.722955, best_reward: 18125.600000 ± 9899.259035 in #14


Epoch #48: 1001it [00:02, 340.80it/s, env_step=48000, gradient_step=4800, len=147, n/ep=0, n/st=100, rew=8562.00]                                                                                 


Epoch #48: test_reward: 14262.000000 ± 6772.937088, best_reward: 18125.600000 ± 9899.259035 in #14


Epoch #49: 1001it [00:02, 346.18it/s, env_step=49000, gradient_step=4900, len=250, n/ep=1, n/st=100, rew=14758.00]                                                                                


Epoch #49: test_reward: 6180.000000 ± 2130.958094, best_reward: 18125.600000 ± 9899.259035 in #14


Epoch #50: 1001it [00:02, 354.62it/s, env_step=50000, gradient_step=5000, len=139, n/ep=0, n/st=100, rew=7571.50]                                                                                 


Epoch #50: test_reward: 7892.600000 ± 4096.851308, best_reward: 18125.600000 ± 9899.259035 in #14


Epoch #51: 1001it [00:03, 323.06it/s, env_step=51000, gradient_step=5100, len=146, n/ep=0, n/st=100, rew=8111.00]                                                                                 


Epoch #51: test_reward: 15898.900000 ± 4300.447755, best_reward: 18125.600000 ± 9899.259035 in #14


Epoch #52: 1001it [00:02, 337.35it/s, env_step=52000, gradient_step=5200, len=204, n/ep=1, n/st=100, rew=11168.00]                                                                                


Epoch #52: test_reward: 10052.400000 ± 2634.934656, best_reward: 18125.600000 ± 9899.259035 in #14


Epoch #53: 1001it [00:02, 347.61it/s, env_step=53000, gradient_step=5300, len=204, n/ep=0, n/st=100, rew=11168.00]                                                                                


Epoch #53: test_reward: 13259.100000 ± 2247.867276, best_reward: 18125.600000 ± 9899.259035 in #14


Epoch #54: 1001it [00:02, 399.31it/s, env_step=54000, gradient_step=5400, len=176, n/ep=1, n/st=100, rew=9683.00]                                                                                 


Epoch #54: test_reward: 13915.100000 ± 4114.166780, best_reward: 18125.600000 ± 9899.259035 in #14


Epoch #55: 1001it [00:02, 388.12it/s, env_step=55000, gradient_step=5500, len=244, n/ep=0, n/st=100, rew=15121.00]                                                                                


Epoch #55: test_reward: 11574.400000 ± 3095.837115, best_reward: 18125.600000 ± 9899.259035 in #14


Epoch #56: 1001it [00:02, 347.19it/s, env_step=56000, gradient_step=5600, len=206, n/ep=0, n/st=100, rew=12515.00]                                                                                


Epoch #56: test_reward: 10026.900000 ± 4115.362498, best_reward: 18125.600000 ± 9899.259035 in #14


Epoch #57: 1001it [00:02, 340.37it/s, env_step=57000, gradient_step=5700, len=194, n/ep=1, n/st=100, rew=11580.00]                                                                                


Epoch #57: test_reward: 11584.800000 ± 3335.119782, best_reward: 18125.600000 ± 9899.259035 in #14


Epoch #58: 1001it [00:02, 357.61it/s, env_step=58000, gradient_step=5800, len=176, n/ep=2, n/st=100, rew=9690.50]                                                                                 


Epoch #58: test_reward: 13291.900000 ± 3926.804387, best_reward: 18125.600000 ± 9899.259035 in #14


Epoch #59: 1001it [00:02, 431.16it/s, env_step=59000, gradient_step=5900, len=230, n/ep=1, n/st=100, rew=14006.00]                                                                                


Epoch #59: test_reward: 11656.200000 ± 3016.685393, best_reward: 18125.600000 ± 9899.259035 in #14


Epoch #60: 1001it [00:02, 432.15it/s, env_step=60000, gradient_step=6000, len=276, n/ep=1, n/st=100, rew=16039.00]                                                                                


Epoch #60: test_reward: 10909.600000 ± 5549.912255, best_reward: 18125.600000 ± 9899.259035 in #14


Epoch #61: 1001it [00:02, 337.30it/s, env_step=61000, gradient_step=6100, len=161, n/ep=0, n/st=100, rew=9190.00]                                                                                 


Epoch #61: test_reward: 13822.000000 ± 3739.758067, best_reward: 18125.600000 ± 9899.259035 in #14


Epoch #62: 1001it [00:02, 352.68it/s, env_step=62000, gradient_step=6200, len=154, n/ep=1, n/st=100, rew=8827.00]                                                                                 


Epoch #62: test_reward: 10808.700000 ± 1209.867435, best_reward: 18125.600000 ± 9899.259035 in #14


Epoch #63: 1001it [00:02, 340.06it/s, env_step=63000, gradient_step=6300, len=184, n/ep=1, n/st=100, rew=10586.00]                                                                                


Epoch #63: test_reward: 10527.900000 ± 2387.886281, best_reward: 18125.600000 ± 9899.259035 in #14


Epoch #64: 1001it [00:02, 403.40it/s, env_step=64000, gradient_step=6400, len=138, n/ep=1, n/st=100, rew=8321.50]                                                                                 


Epoch #64: test_reward: 12819.800000 ± 5798.042287, best_reward: 18125.600000 ± 9899.259035 in #14


Epoch #65: 1001it [00:02, 393.11it/s, env_step=65000, gradient_step=6500, len=91, n/ep=0, n/st=100, rew=5071.00]                                                                                  


Epoch #65: test_reward: 9707.500000 ± 3618.770157, best_reward: 18125.600000 ± 9899.259035 in #14


Epoch #66: 1001it [00:03, 327.88it/s, env_step=66000, gradient_step=6600, len=116, n/ep=1, n/st=100, rew=6562.50]                                                                                 


Epoch #66: test_reward: 8684.200000 ± 3601.739158, best_reward: 18125.600000 ± 9899.259035 in #14


Epoch #67: 1001it [00:02, 366.55it/s, env_step=67000, gradient_step=6700, len=180, n/ep=1, n/st=100, rew=10279.00]                                                                                


Epoch #67: test_reward: 11483.900000 ± 2866.666372, best_reward: 18125.600000 ± 9899.259035 in #14


Epoch #68: 1001it [00:02, 379.68it/s, env_step=68000, gradient_step=6800, len=164, n/ep=0, n/st=100, rew=10436.00]                                                                                


Epoch #68: test_reward: 7828.100000 ± 2834.562910, best_reward: 18125.600000 ± 9899.259035 in #14


Epoch #69: 1001it [00:02, 387.61it/s, env_step=69000, gradient_step=6900, len=252, n/ep=1, n/st=100, rew=16066.00]                                                                                


Epoch #69: test_reward: 10186.500000 ± 2577.796200, best_reward: 18125.600000 ± 9899.259035 in #14


Epoch #70: 1001it [00:02, 399.98it/s, env_step=70000, gradient_step=7000, len=164, n/ep=0, n/st=100, rew=10052.00]                                                                                


Epoch #70: test_reward: 8099.700000 ± 1340.419043, best_reward: 18125.600000 ± 9899.259035 in #14


Epoch #71: 1001it [00:02, 350.88it/s, env_step=71000, gradient_step=7100, len=156, n/ep=0, n/st=100, rew=8796.67]                                                                                 


Epoch #71: test_reward: 11883.100000 ± 1887.989695, best_reward: 18125.600000 ± 9899.259035 in #14


Epoch #72: 1001it [00:02, 407.44it/s, env_step=72000, gradient_step=7200, len=130, n/ep=2, n/st=100, rew=7884.00]                                                                                 


Epoch #72: test_reward: 9134.300000 ± 2772.154399, best_reward: 18125.600000 ± 9899.259035 in #14


Epoch #73: 1001it [00:02, 394.58it/s, env_step=73000, gradient_step=7300, len=142, n/ep=1, n/st=100, rew=8133.00]                                                                                 


Epoch #73: test_reward: 8974.400000 ± 2559.625098, best_reward: 18125.600000 ± 9899.259035 in #14


Epoch #74: 1001it [00:02, 360.60it/s, env_step=74000, gradient_step=7400, len=172, n/ep=1, n/st=100, rew=10478.00]                                                                                


Epoch #74: test_reward: 8252.700000 ± 3064.167686, best_reward: 18125.600000 ± 9899.259035 in #14


Epoch #75: 1001it [00:02, 368.35it/s, env_step=75000, gradient_step=7500, len=162, n/ep=0, n/st=100, rew=9518.00]                                                                                 


Epoch #75: test_reward: 10895.200000 ± 1707.237406, best_reward: 18125.600000 ± 9899.259035 in #14


Epoch #76: 1001it [00:02, 352.95it/s, env_step=76000, gradient_step=7600, len=194, n/ep=1, n/st=100, rew=11488.00]                                                                                


Epoch #76: test_reward: 15604.200000 ± 3794.457479, best_reward: 18125.600000 ± 9899.259035 in #14


Epoch #77: 1001it [00:02, 396.09it/s, env_step=77000, gradient_step=7700, len=204, n/ep=0, n/st=100, rew=12787.00]                                                                                


Epoch #77: test_reward: 8465.500000 ± 1183.204906, best_reward: 18125.600000 ± 9899.259035 in #14


Epoch #78: 1001it [00:02, 371.01it/s, env_step=78000, gradient_step=7800, len=128, n/ep=1, n/st=100, rew=7460.00]                                                                                 


Epoch #78: test_reward: 8130.900000 ± 2800.876772, best_reward: 18125.600000 ± 9899.259035 in #14


Epoch #79: 1001it [00:02, 356.76it/s, env_step=79000, gradient_step=7900, len=128, n/ep=3, n/st=100, rew=7154.83]                                                                                 


Epoch #79: test_reward: 12789.100000 ± 3134.671768, best_reward: 18125.600000 ± 9899.259035 in #14


Epoch #80: 1001it [00:02, 365.44it/s, env_step=80000, gradient_step=8000, len=134, n/ep=2, n/st=100, rew=7796.00]                                                                                 


Epoch #80: test_reward: 13518.300000 ± 4289.523564, best_reward: 18125.600000 ± 9899.259035 in #14


Epoch #81: 1001it [00:02, 377.88it/s, env_step=81000, gradient_step=8100, len=128, n/ep=0, n/st=100, rew=7483.50]                                                                                 


Epoch #81: test_reward: 9506.400000 ± 3697.291203, best_reward: 18125.600000 ± 9899.259035 in #14


Epoch #82: 1001it [00:02, 376.06it/s, env_step=82000, gradient_step=8200, len=150, n/ep=0, n/st=100, rew=9121.00]                                                                                 


Epoch #82: test_reward: 11571.500000 ± 1963.346900, best_reward: 18125.600000 ± 9899.259035 in #14


Epoch #83: 1001it [00:02, 357.38it/s, env_step=83000, gradient_step=8300, len=132, n/ep=0, n/st=100, rew=7852.00]                                                                                 


Epoch #83: test_reward: 14706.200000 ± 2853.864110, best_reward: 18125.600000 ± 9899.259035 in #14


Epoch #84: 1001it [00:03, 325.00it/s, env_step=84000, gradient_step=8400, len=208, n/ep=0, n/st=100, rew=11982.00]                                                                                


Epoch #84: test_reward: 12603.900000 ± 3969.038283, best_reward: 18125.600000 ± 9899.259035 in #14


Epoch #85: 1001it [00:02, 353.33it/s, env_step=85000, gradient_step=8500, len=82, n/ep=1, n/st=100, rew=4071.50]                                                                                  


Epoch #85: test_reward: 14240.900000 ± 4810.693847, best_reward: 18125.600000 ± 9899.259035 in #14


Epoch #86: 1001it [00:02, 382.74it/s, env_step=86000, gradient_step=8600, len=128, n/ep=0, n/st=100, rew=7978.00]                                                                                 


Epoch #86: test_reward: 12203.700000 ± 3765.977511, best_reward: 18125.600000 ± 9899.259035 in #14


Epoch #87: 1001it [00:02, 378.96it/s, env_step=87000, gradient_step=8700, len=130, n/ep=1, n/st=100, rew=7123.00]                                                                                 


Epoch #87: test_reward: 9392.500000 ± 4152.042130, best_reward: 18125.600000 ± 9899.259035 in #14


Epoch #88: 1001it [00:02, 393.01it/s, env_step=88000, gradient_step=8800, len=151, n/ep=0, n/st=100, rew=9459.00]                                                                                 


Epoch #88: test_reward: 12798.600000 ± 3284.626286, best_reward: 18125.600000 ± 9899.259035 in #14


Epoch #89: 1001it [00:02, 380.92it/s, env_step=89000, gradient_step=8900, len=94, n/ep=0, n/st=100, rew=5323.00]                                                                                  


Epoch #89: test_reward: 10727.800000 ± 1783.489490, best_reward: 18125.600000 ± 9899.259035 in #14


Epoch #90: 1001it [00:02, 423.36it/s, env_step=90000, gradient_step=9000, len=142, n/ep=0, n/st=100, rew=8814.00]                                                                                 


Epoch #90: test_reward: 12571.800000 ± 7003.439109, best_reward: 18125.600000 ± 9899.259035 in #14


Epoch #91: 1001it [00:02, 351.78it/s, env_step=91000, gradient_step=9100, len=250, n/ep=1, n/st=100, rew=15835.00]                                                                                


Epoch #91: test_reward: 7055.700000 ± 2491.105500, best_reward: 18125.600000 ± 9899.259035 in #14


Epoch #92: 1001it [00:02, 418.96it/s, env_step=92000, gradient_step=9200, len=172, n/ep=4, n/st=100, rew=10829.25]                                                                                


Epoch #92: test_reward: 10962.400000 ± 4310.320596, best_reward: 18125.600000 ± 9899.259035 in #14


Epoch #93: 1001it [00:02, 427.07it/s, env_step=93000, gradient_step=9300, len=131, n/ep=1, n/st=100, rew=7648.00]                                                                                 


Epoch #93: test_reward: 10415.500000 ± 2922.993166, best_reward: 18125.600000 ± 9899.259035 in #14


Epoch #94: 1001it [00:02, 431.84it/s, env_step=94000, gradient_step=9400, len=123, n/ep=0, n/st=100, rew=7280.25]                                                                                 


Epoch #94: test_reward: 10235.900000 ± 3456.889655, best_reward: 18125.600000 ± 9899.259035 in #14


Epoch #95: 1001it [00:02, 358.63it/s, env_step=95000, gradient_step=9500, len=158, n/ep=0, n/st=100, rew=9224.50]                                                                                 


Epoch #95: test_reward: 12538.200000 ± 7440.629984, best_reward: 18125.600000 ± 9899.259035 in #14


Epoch #96: 1001it [00:02, 391.46it/s, env_step=96000, gradient_step=9600, len=235, n/ep=2, n/st=100, rew=15024.00]                                                                                


Epoch #96: test_reward: 14495.000000 ± 4241.986374, best_reward: 18125.600000 ± 9899.259035 in #14


Epoch #97: 1001it [00:02, 350.03it/s, env_step=97000, gradient_step=9700, len=374, n/ep=1, n/st=100, rew=21623.00]                                                                                


Epoch #97: test_reward: 13233.900000 ± 4687.591737, best_reward: 18125.600000 ± 9899.259035 in #14


Epoch #98: 1001it [00:02, 424.38it/s, env_step=98000, gradient_step=9800, len=141, n/ep=0, n/st=100, rew=8164.50]                                                                                 


Epoch #98: test_reward: 11631.800000 ± 2444.909274, best_reward: 18125.600000 ± 9899.259035 in #14


Epoch #99: 1001it [00:02, 382.18it/s, env_step=99000, gradient_step=9900, len=137, n/ep=0, n/st=100, rew=8195.00]                                                                                 


Epoch #99: test_reward: 13019.500000 ± 2559.176166, best_reward: 18125.600000 ± 9899.259035 in #14


Epoch #100: 1001it [00:02, 392.76it/s, env_step=100000, gradient_step=10000, len=81, n/ep=1, n/st=100, rew=4045.00]                                                                               


Epoch #100: test_reward: 10029.200000 ± 3989.055046, best_reward: 18125.600000 ± 9899.259035 in #14

InfoStats(gradient_step=10000, best_reward=18125.6, best_reward_std=9899.259034897512, train_step=100000, train_episode=575, test_step=184999, test_episode=1010, timing=TimingStats(total_time=367.5815634727478, train_time=264.681946516037, train_time_collect=35.01699090003967, train_time_update=225.53213381767273, test_time=102.89961695671082, update_speed=377.811941147792))

(the trained policy can be accessed via policy.policies[agents[1]])
Train with agent_opponent #27


Epoch #1: 1001it [00:02, 365.78it/s, env_step=1000, gradient_step=100, len=0, n/ep=0, n/st=100, rew=0.00]                                                                                         


Epoch #1: test_reward: 15703.800000 ± 5347.821347, best_reward: 15703.800000 ± 5347.821347 in #1


Epoch #2: 1001it [00:02, 367.49it/s, env_step=2000, gradient_step=200, len=0, n/ep=0, n/st=100, rew=0.00]                                                                                         


Epoch #2: test_reward: 15586.700000 ± 4985.661462, best_reward: 15703.800000 ± 5347.821347 in #1


Epoch #3: 1001it [00:02, 345.48it/s, env_step=3000, gradient_step=300, len=0, n/ep=0, n/st=100, rew=0.00]                                                                                         


Epoch #3: test_reward: 18299.500000 ± 7542.112267, best_reward: 18299.500000 ± 7542.112267 in #3


Epoch #4: 1001it [00:02, 335.95it/s, env_step=4000, gradient_step=400, len=0, n/ep=0, n/st=100, rew=0.00]                                                                                         


Epoch #4: test_reward: 16174.700000 ± 8031.193760, best_reward: 18299.500000 ± 7542.112267 in #3


Epoch #5: 1001it [00:02, 355.69it/s, env_step=5000, gradient_step=500, len=0, n/ep=0, n/st=100, rew=0.00]                                                                                         


Epoch #5: test_reward: 14926.900000 ± 8117.542836, best_reward: 18299.500000 ± 7542.112267 in #3


Epoch #6: 1001it [00:02, 375.51it/s, env_step=6000, gradient_step=600, len=58, n/ep=0, n/st=100, rew=2555.00]                                                                                     


Epoch #6: test_reward: 10859.400000 ± 4440.236305, best_reward: 18299.500000 ± 7542.112267 in #3


Epoch #7: 1001it [00:02, 339.38it/s, env_step=7000, gradient_step=700, len=66, n/ep=0, n/st=100, rew=3385.00]                                                                                     


Epoch #7: test_reward: 13261.800000 ± 5011.848098, best_reward: 18299.500000 ± 7542.112267 in #3


Epoch #8: 1001it [00:02, 359.59it/s, env_step=8000, gradient_step=800, len=73, n/ep=0, n/st=100, rew=3764.00]                                                                                     


Epoch #8: test_reward: 13202.300000 ± 7166.605097, best_reward: 18299.500000 ± 7542.112267 in #3


Epoch #9: 1001it [00:02, 354.88it/s, env_step=9000, gradient_step=900, len=90, n/ep=1, n/st=100, rew=3634.00]                                                                                     


Epoch #9: test_reward: 9506.300000 ± 3267.900581, best_reward: 18299.500000 ± 7542.112267 in #3


Epoch #10: 1001it [00:02, 340.51it/s, env_step=10000, gradient_step=1000, len=97, n/ep=0, n/st=100, rew=5232.00]                                                                                  


Epoch #10: test_reward: 12344.900000 ± 3509.037659, best_reward: 18299.500000 ± 7542.112267 in #3


Epoch #11: 1001it [00:03, 287.69it/s, env_step=11000, gradient_step=1100, len=109, n/ep=0, n/st=100, rew=6379.00]                                                                                 


Epoch #11: test_reward: 11393.000000 ± 2333.394952, best_reward: 18299.500000 ± 7542.112267 in #3


Epoch #12: 1001it [00:03, 308.24it/s, env_step=12000, gradient_step=1200, len=119, n/ep=0, n/st=100, rew=6612.00]                                                                                 


Epoch #12: test_reward: 11704.700000 ± 5801.606123, best_reward: 18299.500000 ± 7542.112267 in #3


Epoch #13: 1001it [00:02, 342.72it/s, env_step=13000, gradient_step=1300, len=130, n/ep=1, n/st=100, rew=7455.00]                                                                                 


Epoch #13: test_reward: 9252.300000 ± 2626.064624, best_reward: 18299.500000 ± 7542.112267 in #3


Epoch #14: 1001it [00:03, 320.85it/s, env_step=14000, gradient_step=1400, len=138, n/ep=0, n/st=100, rew=8243.50]                                                                                 


Epoch #14: test_reward: 7964.500000 ± 2959.426608, best_reward: 18299.500000 ± 7542.112267 in #3


Epoch #15: 1001it [00:03, 300.16it/s, env_step=15000, gradient_step=1500, len=150, n/ep=2, n/st=100, rew=9300.00]                                                                                 


Epoch #15: test_reward: 9902.000000 ± 4575.440875, best_reward: 18299.500000 ± 7542.112267 in #3


Epoch #16: 1001it [00:03, 329.17it/s, env_step=16000, gradient_step=1600, len=160, n/ep=3, n/st=100, rew=9621.83]                                                                                 


Epoch #16: test_reward: 8185.300000 ± 2227.828586, best_reward: 18299.500000 ± 7542.112267 in #3


Epoch #17: 1001it [00:03, 318.54it/s, env_step=17000, gradient_step=1700, len=170, n/ep=2, n/st=100, rew=9649.25]                                                                                 


Epoch #17: test_reward: 9378.000000 ± 2614.638598, best_reward: 18299.500000 ± 7542.112267 in #3


Epoch #18: 1001it [00:02, 335.49it/s, env_step=18000, gradient_step=1800, len=179, n/ep=0, n/st=100, rew=10623.00]                                                                                


Epoch #18: test_reward: 11673.400000 ± 2082.190923, best_reward: 18299.500000 ± 7542.112267 in #3


Epoch #19: 1001it [00:03, 303.60it/s, env_step=19000, gradient_step=1900, len=54, n/ep=0, n/st=100, rew=2518.00]                                                                                  


Epoch #19: test_reward: 11090.400000 ± 3569.581242, best_reward: 18299.500000 ± 7542.112267 in #3


Epoch #20: 1001it [00:02, 333.86it/s, env_step=20000, gradient_step=2000, len=194, n/ep=0, n/st=100, rew=11288.00]                                                                                


Epoch #20: test_reward: 9977.100000 ± 3680.687828, best_reward: 18299.500000 ± 7542.112267 in #3


Epoch #21: 1001it [00:02, 344.86it/s, env_step=21000, gradient_step=2100, len=155, n/ep=0, n/st=100, rew=10032.50]                                                                                


Epoch #21: test_reward: 13598.300000 ± 5653.903820, best_reward: 18299.500000 ± 7542.112267 in #3


Epoch #22: 1001it [00:02, 368.18it/s, env_step=22000, gradient_step=2200, len=217, n/ep=0, n/st=100, rew=13712.50]                                                                                


Epoch #22: test_reward: 13354.000000 ± 8021.353963, best_reward: 18299.500000 ± 7542.112267 in #3


Epoch #23: 1001it [00:03, 299.24it/s, env_step=23000, gradient_step=2300, len=229, n/ep=0, n/st=100, rew=14812.00]                                                                                


Epoch #23: test_reward: 9093.100000 ± 3042.269299, best_reward: 18299.500000 ± 7542.112267 in #3


Epoch #24: 1001it [00:03, 286.96it/s, env_step=24000, gradient_step=2400, len=121, n/ep=0, n/st=100, rew=7330.50]                                                                                 


Epoch #24: test_reward: 12532.200000 ± 4700.285902, best_reward: 18299.500000 ± 7542.112267 in #3


Epoch #25: 1001it [00:03, 318.15it/s, env_step=25000, gradient_step=2500, len=247, n/ep=0, n/st=100, rew=16040.00]                                                                                


Epoch #25: test_reward: 11462.100000 ± 1956.343602, best_reward: 18299.500000 ± 7542.112267 in #3


Epoch #26: 1001it [00:03, 320.88it/s, env_step=26000, gradient_step=2600, len=108, n/ep=0, n/st=100, rew=6144.00]                                                                                 


Epoch #26: test_reward: 9926.400000 ± 1101.911358, best_reward: 18299.500000 ± 7542.112267 in #3


Epoch #27: 1001it [00:02, 345.61it/s, env_step=27000, gradient_step=2700, len=160, n/ep=0, n/st=100, rew=10198.00]                                                                                


Epoch #27: test_reward: 9956.800000 ± 2078.757360, best_reward: 18299.500000 ± 7542.112267 in #3


Epoch #28: 1001it [00:02, 352.63it/s, env_step=28000, gradient_step=2800, len=118, n/ep=0, n/st=100, rew=7407.00]                                                                                 


Epoch #28: test_reward: 12988.000000 ± 2688.083034, best_reward: 18299.500000 ± 7542.112267 in #3


Epoch #29: 1001it [00:03, 310.82it/s, env_step=29000, gradient_step=2900, len=166, n/ep=1, n/st=100, rew=10402.50]                                                                                


Epoch #29: test_reward: 9487.300000 ± 2823.749955, best_reward: 18299.500000 ± 7542.112267 in #3


Epoch #30: 1001it [00:03, 317.55it/s, env_step=30000, gradient_step=3000, len=143, n/ep=0, n/st=100, rew=8861.00]                                                                                 


Epoch #30: test_reward: 9635.800000 ± 3104.638749, best_reward: 18299.500000 ± 7542.112267 in #3


Epoch #31: 1001it [00:02, 354.93it/s, env_step=31000, gradient_step=3100, len=216, n/ep=0, n/st=100, rew=13338.00]                                                                                


Epoch #31: test_reward: 8227.700000 ± 2815.169695, best_reward: 18299.500000 ± 7542.112267 in #3


Epoch #32: 1001it [00:02, 377.18it/s, env_step=32000, gradient_step=3200, len=106, n/ep=0, n/st=100, rew=6026.38]                                                                                 


Epoch #32: test_reward: 7244.800000 ± 1259.399206, best_reward: 18299.500000 ± 7542.112267 in #3


Epoch #33: 1001it [00:02, 375.53it/s, env_step=33000, gradient_step=3300, len=144, n/ep=0, n/st=100, rew=9484.00]                                                                                 


Epoch #33: test_reward: 9903.000000 ± 3400.070499, best_reward: 18299.500000 ± 7542.112267 in #3


Epoch #34: 1001it [00:02, 355.01it/s, env_step=34000, gradient_step=3400, len=176, n/ep=0, n/st=100, rew=11645.00]                                                                                


Epoch #34: test_reward: 9714.800000 ± 3354.638246, best_reward: 18299.500000 ± 7542.112267 in #3


Epoch #35: 1001it [00:02, 342.27it/s, env_step=35000, gradient_step=3500, len=200, n/ep=1, n/st=100, rew=12787.00]                                                                                


Epoch #35: test_reward: 13158.100000 ± 7651.940021, best_reward: 18299.500000 ± 7542.112267 in #3


Epoch #36: 1001it [00:03, 313.15it/s, env_step=36000, gradient_step=3600, len=163, n/ep=0, n/st=100, rew=10155.50]                                                                                


Epoch #36: test_reward: 8788.500000 ± 2922.426911, best_reward: 18299.500000 ± 7542.112267 in #3


Epoch #37: 1001it [00:03, 319.87it/s, env_step=37000, gradient_step=3700, len=132, n/ep=1, n/st=100, rew=8337.50]                                                                                 


Epoch #37: test_reward: 7530.400000 ± 2519.804881, best_reward: 18299.500000 ± 7542.112267 in #3


Epoch #38: 1001it [00:03, 310.21it/s, env_step=38000, gradient_step=3800, len=175, n/ep=1, n/st=100, rew=11379.00]                                                                                


Epoch #38: test_reward: 9665.400000 ± 3814.389314, best_reward: 18299.500000 ± 7542.112267 in #3


Epoch #39: 1001it [00:02, 374.94it/s, env_step=39000, gradient_step=3900, len=170, n/ep=2, n/st=100, rew=11195.50]                                                                                


Epoch #39: test_reward: 9951.400000 ± 5934.946541, best_reward: 18299.500000 ± 7542.112267 in #3


Epoch #40: 1001it [00:03, 332.48it/s, env_step=40000, gradient_step=4000, len=331, n/ep=8, n/st=100, rew=22592.94]                                                                                


Epoch #40: test_reward: 10938.200000 ± 8873.843866, best_reward: 18299.500000 ± 7542.112267 in #3


Epoch #41: 1001it [00:03, 309.14it/s, env_step=41000, gradient_step=4100, len=144, n/ep=1, n/st=100, rew=8957.50]                                                                                 


Epoch #41: test_reward: 9428.300000 ± 4176.474902, best_reward: 18299.500000 ± 7542.112267 in #3


Epoch #42: 1001it [00:03, 301.86it/s, env_step=42000, gradient_step=4200, len=186, n/ep=2, n/st=100, rew=11665.25]                                                                                


Epoch #42: test_reward: 10731.100000 ± 3811.801423, best_reward: 18299.500000 ± 7542.112267 in #3


Epoch #43: 1001it [00:02, 354.58it/s, env_step=43000, gradient_step=4300, len=119, n/ep=1, n/st=100, rew=7456.50]                                                                                 


Epoch #43: test_reward: 10031.200000 ± 3104.917158, best_reward: 18299.500000 ± 7542.112267 in #3


Epoch #44: 1001it [00:02, 336.99it/s, env_step=44000, gradient_step=4400, len=106, n/ep=0, n/st=100, rew=6162.00]                                                                                 


Epoch #44: test_reward: 11135.900000 ± 6057.818806, best_reward: 18299.500000 ± 7542.112267 in #3


Epoch #45: 1001it [00:03, 300.19it/s, env_step=45000, gradient_step=4500, len=113, n/ep=1, n/st=100, rew=7324.00]                                                                                 


Epoch #45: test_reward: 12192.300000 ± 4116.684760, best_reward: 18299.500000 ± 7542.112267 in #3


Epoch #46: 1001it [00:03, 311.90it/s, env_step=46000, gradient_step=4600, len=212, n/ep=2, n/st=100, rew=14074.00]                                                                                


Epoch #46: test_reward: 10954.900000 ± 3390.141190, best_reward: 18299.500000 ± 7542.112267 in #3


Epoch #47: 1001it [00:02, 355.06it/s, env_step=47000, gradient_step=4700, len=140, n/ep=0, n/st=100, rew=8635.50]                                                                                 


Epoch #47: test_reward: 11285.000000 ± 4331.212071, best_reward: 18299.500000 ± 7542.112267 in #3


Epoch #48: 1001it [00:02, 369.24it/s, env_step=48000, gradient_step=4800, len=140, n/ep=0, n/st=100, rew=9117.00]                                                                                 


Epoch #48: test_reward: 9896.800000 ± 4163.750732, best_reward: 18299.500000 ± 7542.112267 in #3


Epoch #49: 1001it [00:02, 354.48it/s, env_step=49000, gradient_step=4900, len=84, n/ep=1, n/st=100, rew=4963.00]                                                                                  


Epoch #49: test_reward: 13807.600000 ± 4263.535979, best_reward: 18299.500000 ± 7542.112267 in #3


Epoch #50: 1001it [00:02, 358.60it/s, env_step=50000, gradient_step=5000, len=171, n/ep=1, n/st=100, rew=11224.00]                                                                                


Epoch #50: test_reward: 7215.000000 ± 2139.532519, best_reward: 18299.500000 ± 7542.112267 in #3


Epoch #51: 1001it [00:03, 291.43it/s, env_step=51000, gradient_step=5100, len=239, n/ep=2, n/st=100, rew=16229.25]                                                                                


Epoch #51: test_reward: 9789.600000 ± 3152.999150, best_reward: 18299.500000 ± 7542.112267 in #3


Epoch #52: 1001it [00:03, 314.24it/s, env_step=52000, gradient_step=5200, len=236, n/ep=1, n/st=100, rew=15700.50]                                                                                


Epoch #52: test_reward: 10979.700000 ± 5825.020825, best_reward: 18299.500000 ± 7542.112267 in #3


Epoch #53: 1001it [00:02, 373.35it/s, env_step=53000, gradient_step=5300, len=130, n/ep=2, n/st=100, rew=8107.50]                                                                                 


Epoch #53: test_reward: 8035.000000 ± 3161.648462, best_reward: 18299.500000 ± 7542.112267 in #3


Epoch #54: 1001it [00:02, 343.81it/s, env_step=54000, gradient_step=5400, len=140, n/ep=0, n/st=100, rew=8715.50]                                                                                 


Epoch #54: test_reward: 14193.100000 ± 5998.754295, best_reward: 18299.500000 ± 7542.112267 in #3


Epoch #55: 1001it [00:02, 336.48it/s, env_step=55000, gradient_step=5500, len=127, n/ep=1, n/st=100, rew=7984.50]                                                                                 


Epoch #55: test_reward: 9763.500000 ± 4147.609559, best_reward: 18299.500000 ± 7542.112267 in #3


Epoch #56: 1001it [00:02, 355.64it/s, env_step=56000, gradient_step=5600, len=396, n/ep=1, n/st=100, rew=28335.00]                                                                                


Epoch #56: test_reward: 6412.700000 ± 3091.359638, best_reward: 18299.500000 ± 7542.112267 in #3


Epoch #57: 1001it [00:03, 325.26it/s, env_step=57000, gradient_step=5700, len=112, n/ep=0, n/st=100, rew=7197.00]                                                                                 


Epoch #57: test_reward: 8956.400000 ± 4278.400991, best_reward: 18299.500000 ± 7542.112267 in #3


Epoch #58: 1001it [00:02, 340.42it/s, env_step=58000, gradient_step=5800, len=129, n/ep=1, n/st=100, rew=8205.00]                                                                                 


Epoch #58: test_reward: 9814.600000 ± 5964.696609, best_reward: 18299.500000 ± 7542.112267 in #3


Epoch #59: 1001it [00:03, 305.99it/s, env_step=59000, gradient_step=5900, len=131, n/ep=0, n/st=100, rew=8017.00]                                                                                 


Epoch #59: test_reward: 9084.100000 ± 3554.162108, best_reward: 18299.500000 ± 7542.112267 in #3


Epoch #60: 1001it [00:02, 353.36it/s, env_step=60000, gradient_step=6000, len=124, n/ep=2, n/st=100, rew=7749.25]                                                                                 


Epoch #60: test_reward: 14160.500000 ± 7212.197241, best_reward: 18299.500000 ± 7542.112267 in #3


Epoch #61: 1001it [00:02, 354.79it/s, env_step=61000, gradient_step=6100, len=102, n/ep=0, n/st=100, rew=6460.00]                                                                                 


Epoch #61: test_reward: 11318.800000 ± 3236.811264, best_reward: 18299.500000 ± 7542.112267 in #3


Epoch #62: 1001it [00:03, 332.27it/s, env_step=62000, gradient_step=6200, len=105, n/ep=2, n/st=100, rew=6133.50]                                                                                 


Epoch #62: test_reward: 11008.800000 ± 2412.647500, best_reward: 18299.500000 ± 7542.112267 in #3


Epoch #63: 1001it [00:03, 319.34it/s, env_step=63000, gradient_step=6300, len=217, n/ep=0, n/st=100, rew=14562.00]                                                                                


Epoch #63: test_reward: 15202.300000 ± 7028.324381, best_reward: 18299.500000 ± 7542.112267 in #3


Epoch #64: 1001it [00:02, 358.63it/s, env_step=64000, gradient_step=6400, len=127, n/ep=0, n/st=100, rew=8000.00]                                                                                 


Epoch #64: test_reward: 14190.800000 ± 7761.098025, best_reward: 18299.500000 ± 7542.112267 in #3


Epoch #65: 1001it [00:02, 360.62it/s, env_step=65000, gradient_step=6500, len=117, n/ep=0, n/st=100, rew=7445.00]                                                                                 


Epoch #65: test_reward: 8974.000000 ± 2986.545027, best_reward: 18299.500000 ± 7542.112267 in #3


Epoch #66: 1001it [00:03, 314.16it/s, env_step=66000, gradient_step=6600, len=214, n/ep=1, n/st=100, rew=14911.50]                                                                                


Epoch #66: test_reward: 16728.600000 ± 4993.608559, best_reward: 18299.500000 ± 7542.112267 in #3


Epoch #67: 1001it [00:03, 326.20it/s, env_step=67000, gradient_step=6700, len=134, n/ep=0, n/st=100, rew=7623.50]                                                                                 


Epoch #67: test_reward: 9373.400000 ± 2792.760326, best_reward: 18299.500000 ± 7542.112267 in #3


Epoch #68: 1001it [00:03, 294.42it/s, env_step=68000, gradient_step=6800, len=166, n/ep=0, n/st=100, rew=10626.00]                                                                                


Epoch #68: test_reward: 11640.800000 ± 3208.315284, best_reward: 18299.500000 ± 7542.112267 in #3


Epoch #69: 1001it [00:03, 289.40it/s, env_step=69000, gradient_step=6900, len=109, n/ep=0, n/st=100, rew=6884.50]                                                                                 


Epoch #69: test_reward: 5798.900000 ± 856.253987, best_reward: 18299.500000 ± 7542.112267 in #3


Epoch #70: 1001it [00:03, 323.32it/s, env_step=70000, gradient_step=7000, len=144, n/ep=2, n/st=100, rew=9045.75]                                                                                 


Epoch #70: test_reward: 8069.800000 ± 2734.085288, best_reward: 18299.500000 ± 7542.112267 in #3


Epoch #71: 1001it [00:03, 328.55it/s, env_step=71000, gradient_step=7100, len=224, n/ep=0, n/st=100, rew=13498.00]                                                                                


Epoch #71: test_reward: 8956.000000 ± 4243.917930, best_reward: 18299.500000 ± 7542.112267 in #3


Epoch #72: 1001it [00:03, 331.82it/s, env_step=72000, gradient_step=7200, len=208, n/ep=1, n/st=100, rew=13381.00]                                                                                


Epoch #72: test_reward: 17012.500000 ± 6807.310030, best_reward: 18299.500000 ± 7542.112267 in #3


Epoch #73: 1001it [00:03, 315.45it/s, env_step=73000, gradient_step=7300, len=130, n/ep=1, n/st=100, rew=8592.00]                                                                                 


Epoch #73: test_reward: 11442.200000 ± 4976.743952, best_reward: 18299.500000 ± 7542.112267 in #3


Epoch #74: 1001it [00:03, 325.22it/s, env_step=74000, gradient_step=7400, len=102, n/ep=0, n/st=100, rew=6127.00]                                                                                 


Epoch #74: test_reward: 12732.300000 ± 6469.862689, best_reward: 18299.500000 ± 7542.112267 in #3


Epoch #75: 1001it [00:03, 304.44it/s, env_step=75000, gradient_step=7500, len=127, n/ep=1, n/st=100, rew=7921.00]                                                                                 


Epoch #75: test_reward: 8658.100000 ± 3343.200576, best_reward: 18299.500000 ± 7542.112267 in #3


Epoch #76: 1001it [00:03, 318.20it/s, env_step=76000, gradient_step=7600, len=124, n/ep=0, n/st=100, rew=7773.20]                                                                                 


Epoch #76: test_reward: 12413.900000 ± 3023.509863, best_reward: 18299.500000 ± 7542.112267 in #3


Epoch #77: 1001it [00:02, 375.54it/s, env_step=77000, gradient_step=7700, len=205, n/ep=0, n/st=100, rew=13769.50]                                                                                


Epoch #77: test_reward: 7340.600000 ± 1771.597708, best_reward: 18299.500000 ± 7542.112267 in #3


Epoch #78: 1001it [00:02, 370.27it/s, env_step=78000, gradient_step=7800, len=101, n/ep=0, n/st=100, rew=6077.00]                                                                                 


Epoch #78: test_reward: 11935.000000 ± 5253.149912, best_reward: 18299.500000 ± 7542.112267 in #3


Epoch #79: 1001it [00:03, 292.16it/s, env_step=79000, gradient_step=7900, len=147, n/ep=2, n/st=100, rew=9201.00]                                                                                 


Epoch #79: test_reward: 10323.100000 ± 4438.491713, best_reward: 18299.500000 ± 7542.112267 in #3


Epoch #80: 1001it [00:03, 322.97it/s, env_step=80000, gradient_step=8000, len=147, n/ep=1, n/st=100, rew=9495.00]                                                                                 


Epoch #80: test_reward: 9053.300000 ± 3197.388498, best_reward: 18299.500000 ± 7542.112267 in #3


Epoch #81: 1001it [00:03, 293.97it/s, env_step=81000, gradient_step=8100, len=93, n/ep=2, n/st=100, rew=5164.25]                                                                                  


Epoch #81: test_reward: 11655.500000 ± 3362.185933, best_reward: 18299.500000 ± 7542.112267 in #3


Epoch #82: 1001it [00:02, 344.53it/s, env_step=82000, gradient_step=8200, len=112, n/ep=0, n/st=100, rew=6242.00]                                                                                 


Epoch #82: test_reward: 11403.700000 ± 3752.650638, best_reward: 18299.500000 ± 7542.112267 in #3


Epoch #83: 1001it [00:02, 368.48it/s, env_step=83000, gradient_step=8300, len=103, n/ep=1, n/st=100, rew=6374.00]                                                                                 


Epoch #83: test_reward: 10732.800000 ± 2969.218779, best_reward: 18299.500000 ± 7542.112267 in #3


Epoch #84: 1001it [00:02, 373.60it/s, env_step=84000, gradient_step=8400, len=82, n/ep=1, n/st=100, rew=4032.50]                                                                                  


Epoch #84: test_reward: 12545.700000 ± 4270.078197, best_reward: 18299.500000 ± 7542.112267 in #3


Epoch #85: 1001it [00:03, 323.23it/s, env_step=85000, gradient_step=8500, len=199, n/ep=0, n/st=100, rew=13481.00]                                                                                


Epoch #85: test_reward: 11265.000000 ± 5085.805069, best_reward: 18299.500000 ± 7542.112267 in #3


Epoch #86: 1001it [00:02, 346.59it/s, env_step=86000, gradient_step=8600, len=130, n/ep=2, n/st=100, rew=7772.25]                                                                                 


Epoch #86: test_reward: 9441.800000 ± 3491.730969, best_reward: 18299.500000 ± 7542.112267 in #3


Epoch #87: 1001it [00:02, 350.09it/s, env_step=87000, gradient_step=8700, len=164, n/ep=0, n/st=100, rew=10219.50]                                                                                


Epoch #87: test_reward: 7468.300000 ± 2364.508112, best_reward: 18299.500000 ± 7542.112267 in #3


Epoch #88: 1001it [00:02, 375.57it/s, env_step=88000, gradient_step=8800, len=157, n/ep=1, n/st=100, rew=10211.00]                                                                                


Epoch #88: test_reward: 7829.600000 ± 3005.096078, best_reward: 18299.500000 ± 7542.112267 in #3


Epoch #89: 1001it [00:02, 364.69it/s, env_step=89000, gradient_step=8900, len=141, n/ep=1, n/st=100, rew=8758.50]                                                                                 


Epoch #89: test_reward: 16555.600000 ± 7370.944773, best_reward: 18299.500000 ± 7542.112267 in #3


Epoch #90: 1001it [00:02, 353.29it/s, env_step=90000, gradient_step=9000, len=139, n/ep=0, n/st=100, rew=8586.33]                                                                                 


Epoch #90: test_reward: 9160.700000 ± 6435.997872, best_reward: 18299.500000 ± 7542.112267 in #3


Epoch #91: 1001it [00:03, 291.68it/s, env_step=91000, gradient_step=9100, len=116, n/ep=2, n/st=100, rew=6714.00]                                                                                 


Epoch #91: test_reward: 9003.900000 ± 4259.283871, best_reward: 18299.500000 ± 7542.112267 in #3


Epoch #92: 1001it [00:03, 315.21it/s, env_step=92000, gradient_step=9200, len=107, n/ep=1, n/st=100, rew=6524.00]                                                                                 


Epoch #92: test_reward: 12911.600000 ± 6771.486502, best_reward: 18299.500000 ± 7542.112267 in #3


Epoch #93: 1001it [00:02, 335.50it/s, env_step=93000, gradient_step=9300, len=134, n/ep=1, n/st=100, rew=8322.50]                                                                                 


Epoch #93: test_reward: 8677.900000 ± 2067.759727, best_reward: 18299.500000 ± 7542.112267 in #3


Epoch #94: 1001it [00:03, 291.69it/s, env_step=94000, gradient_step=9400, len=104, n/ep=0, n/st=100, rew=5706.00]                                                                                 


Epoch #94: test_reward: 12765.500000 ± 4385.214755, best_reward: 18299.500000 ± 7542.112267 in #3


Epoch #95: 1001it [00:03, 288.71it/s, env_step=95000, gradient_step=9500, len=147, n/ep=2, n/st=100, rew=9185.50]                                                                                 


Epoch #95: test_reward: 9033.000000 ± 2332.108274, best_reward: 18299.500000 ± 7542.112267 in #3


Epoch #96: 1001it [00:03, 329.28it/s, env_step=96000, gradient_step=9600, len=78, n/ep=1, n/st=100, rew=3995.00]                                                                                  


Epoch #96: test_reward: 7367.800000 ± 1805.040099, best_reward: 18299.500000 ± 7542.112267 in #3


Epoch #97: 1001it [00:03, 321.11it/s, env_step=97000, gradient_step=9700, len=155, n/ep=1, n/st=100, rew=10018.00]                                                                                


Epoch #97: test_reward: 9704.700000 ± 2063.799896, best_reward: 18299.500000 ± 7542.112267 in #3


Epoch #98: 1001it [00:02, 342.42it/s, env_step=98000, gradient_step=9800, len=137, n/ep=1, n/st=100, rew=8753.50]                                                                                 


Epoch #98: test_reward: 6068.400000 ± 2405.875483, best_reward: 18299.500000 ± 7542.112267 in #3


Epoch #99: 1001it [00:03, 330.99it/s, env_step=99000, gradient_step=9900, len=130, n/ep=1, n/st=100, rew=7771.00]                                                                                 


Epoch #99: test_reward: 8188.100000 ± 1717.397767, best_reward: 18299.500000 ± 7542.112267 in #3


Epoch #100: 1001it [00:02, 339.17it/s, env_step=100000, gradient_step=10000, len=132, n/ep=0, n/st=100, rew=8517.75]                                                                              


Epoch #100: test_reward: 14947.600000 ± 6969.469107, best_reward: 18299.500000 ± 7542.112267 in #3

InfoStats(gradient_step=10000, best_reward=18299.5, best_reward_std=7542.112267130476, train_step=100000, train_episode=633, test_step=164757, test_episode=1010, timing=TimingStats(total_time=398.9308195114136, train_time=302.6786699295044, train_time_collect=34.38520574569702, train_time_update=264.1262276172638, test_time=96.25214958190918, update_speed=330.3833733090296))

(the trained policy can be accessed via policy.policies[agents[1]])
Train with agent_opponent #31


Epoch #1: 1001it [00:03, 327.34it/s, env_step=1000, gradient_step=100, len=0, n/ep=0, n/st=100, rew=0.00]                                                                                         


Epoch #1: test_reward: 13158.900000 ± 4420.308461, best_reward: 14674.100000 ± 5093.873427 in #0


Epoch #2: 1001it [00:02, 340.53it/s, env_step=2000, gradient_step=200, len=0, n/ep=0, n/st=100, rew=0.00]                                                                                         


Epoch #2: test_reward: 12148.600000 ± 5469.124157, best_reward: 14674.100000 ± 5093.873427 in #0


Epoch #3: 1001it [00:03, 318.51it/s, env_step=3000, gradient_step=300, len=0, n/ep=0, n/st=100, rew=0.00]                                                                                         


Epoch #3: test_reward: 10277.800000 ± 4530.792995, best_reward: 14674.100000 ± 5093.873427 in #0


Epoch #4: 1001it [00:03, 295.99it/s, env_step=4000, gradient_step=400, len=0, n/ep=0, n/st=100, rew=0.00]                                                                                         


Epoch #4: test_reward: 14444.800000 ± 5294.702651, best_reward: 14674.100000 ± 5093.873427 in #0


Epoch #5: 1001it [00:03, 293.48it/s, env_step=5000, gradient_step=500, len=0, n/ep=0, n/st=100, rew=0.00]                                                                                         


Epoch #5: test_reward: 10966.800000 ± 4889.623151, best_reward: 14674.100000 ± 5093.873427 in #0


Epoch #6: 1001it [00:03, 282.89it/s, env_step=6000, gradient_step=600, len=0, n/ep=0, n/st=100, rew=0.00]                                                                                         


Epoch #6: test_reward: 14496.300000 ± 5844.055613, best_reward: 14674.100000 ± 5093.873427 in #0


Epoch #7: 1001it [00:03, 300.85it/s, env_step=7000, gradient_step=700, len=0, n/ep=0, n/st=100, rew=0.00]                                                                                         


Epoch #7: test_reward: 12094.400000 ± 7142.206637, best_reward: 14674.100000 ± 5093.873427 in #0


Epoch #8: 1001it [00:03, 288.05it/s, env_step=8000, gradient_step=800, len=78, n/ep=0, n/st=100, rew=3497.00]                                                                                     


Epoch #8: test_reward: 10372.200000 ± 2305.070489, best_reward: 14674.100000 ± 5093.873427 in #0


Epoch #9: 1001it [00:03, 304.93it/s, env_step=9000, gradient_step=900, len=86, n/ep=0, n/st=100, rew=3613.50]                                                                                     


Epoch #9: test_reward: 6255.000000 ± 2821.849642, best_reward: 14674.100000 ± 5093.873427 in #0


Epoch #10: 1001it [00:03, 298.20it/s, env_step=10000, gradient_step=1000, len=94, n/ep=0, n/st=100, rew=3651.50]                                                                                  


Epoch #10: test_reward: 10594.700000 ± 6292.144230, best_reward: 14674.100000 ± 5093.873427 in #0


Epoch #11: 1001it [00:03, 299.70it/s, env_step=11000, gradient_step=1100, len=108, n/ep=0, n/st=100, rew=4889.00]                                                                                 


Epoch #11: test_reward: 8715.500000 ± 2271.883063, best_reward: 14674.100000 ± 5093.873427 in #0


Epoch #12: 1001it [00:03, 291.86it/s, env_step=12000, gradient_step=1200, len=118, n/ep=0, n/st=100, rew=5220.50]                                                                                 


Epoch #12: test_reward: 6142.100000 ± 4113.120481, best_reward: 14674.100000 ± 5093.873427 in #0


Epoch #13: 1001it [00:03, 295.51it/s, env_step=13000, gradient_step=1300, len=128, n/ep=0, n/st=100, rew=6625.67]                                                                                 


Epoch #13: test_reward: 8774.000000 ± 4227.135768, best_reward: 14674.100000 ± 5093.873427 in #0


Epoch #14: 1001it [00:03, 330.63it/s, env_step=14000, gradient_step=1400, len=138, n/ep=0, n/st=100, rew=7422.00]                                                                                 


Epoch #14: test_reward: 9742.700000 ± 5664.027190, best_reward: 14674.100000 ± 5093.873427 in #0


Epoch #15: 1001it [00:02, 343.01it/s, env_step=15000, gradient_step=1500, len=148, n/ep=0, n/st=100, rew=7739.67]                                                                                 


Epoch #15: test_reward: 13964.900000 ± 1896.889372, best_reward: 14674.100000 ± 5093.873427 in #0


Epoch #16: 1001it [00:02, 362.34it/s, env_step=16000, gradient_step=1600, len=160, n/ep=3, n/st=100, rew=8431.00]                                                                                 


Epoch #16: test_reward: 8779.200000 ± 5478.240006, best_reward: 14674.100000 ± 5093.873427 in #0


Epoch #17: 1001it [00:03, 301.83it/s, env_step=17000, gradient_step=1700, len=168, n/ep=0, n/st=100, rew=8492.75]                                                                                 


Epoch #17: test_reward: 7814.100000 ± 6465.822460, best_reward: 14674.100000 ± 5093.873427 in #0


Epoch #18: 1001it [00:02, 365.02it/s, env_step=18000, gradient_step=1800, len=84, n/ep=0, n/st=100, rew=3991.00]                                                                                  


Epoch #18: test_reward: 9417.700000 ± 4005.250705, best_reward: 14674.100000 ± 5093.873427 in #0


Epoch #19: 1001it [00:02, 349.14it/s, env_step=19000, gradient_step=1900, len=190, n/ep=2, n/st=100, rew=10770.50]                                                                                


Epoch #19: test_reward: 10228.900000 ± 2781.574319, best_reward: 14674.100000 ± 5093.873427 in #0


Epoch #20: 1001it [00:02, 360.54it/s, env_step=20000, gradient_step=2000, len=200, n/ep=1, n/st=100, rew=10774.00]                                                                                


Epoch #20: test_reward: 13068.600000 ± 3895.484494, best_reward: 14674.100000 ± 5093.873427 in #0


Epoch #21: 1001it [00:03, 324.08it/s, env_step=21000, gradient_step=2100, len=208, n/ep=0, n/st=100, rew=11811.00]                                                                                


Epoch #21: test_reward: 12540.100000 ± 6151.163003, best_reward: 14674.100000 ± 5093.873427 in #0


Epoch #22: 1001it [00:02, 357.53it/s, env_step=22000, gradient_step=2200, len=72, n/ep=1, n/st=100, rew=2599.50]                                                                                  


Epoch #22: test_reward: 8127.200000 ± 3389.732048, best_reward: 14674.100000 ± 5093.873427 in #0


Epoch #23: 1001it [00:02, 363.59it/s, env_step=23000, gradient_step=2300, len=230, n/ep=2, n/st=100, rew=13851.75]                                                                                


Epoch #23: test_reward: 14599.400000 ± 4210.450598, best_reward: 14674.100000 ± 5093.873427 in #0


Epoch #24: 1001it [00:02, 365.30it/s, env_step=24000, gradient_step=2400, len=112, n/ep=1, n/st=100, rew=4960.00]                                                                                 


Epoch #24: test_reward: 13492.400000 ± 7231.972071, best_reward: 14674.100000 ± 5093.873427 in #0


Epoch #25: 1001it [00:03, 310.15it/s, env_step=25000, gradient_step=2500, len=249, n/ep=0, n/st=100, rew=15623.00]                                                                                


Epoch #25: test_reward: 10088.400000 ± 3969.371719, best_reward: 14674.100000 ± 5093.873427 in #0


Epoch #26: 1001it [00:03, 322.65it/s, env_step=26000, gradient_step=2600, len=157, n/ep=2, n/st=100, rew=7353.00]                                                                                 


Epoch #26: test_reward: 11895.100000 ± 4992.184020, best_reward: 14674.100000 ± 5093.873427 in #0


Epoch #27: 1001it [00:02, 350.29it/s, env_step=27000, gradient_step=2700, len=270, n/ep=2, n/st=100, rew=15993.00]                                                                                


Epoch #27: test_reward: 12770.800000 ± 3244.468918, best_reward: 14674.100000 ± 5093.873427 in #0


Epoch #28: 1001it [00:03, 318.75it/s, env_step=28000, gradient_step=2800, len=204, n/ep=4, n/st=100, rew=11976.50]                                                                                


Epoch #28: test_reward: 16579.700000 ± 7594.914905, best_reward: 16579.700000 ± 7594.914905 in #28


Epoch #29: 1001it [00:03, 314.78it/s, env_step=29000, gradient_step=2900, len=84, n/ep=1, n/st=100, rew=4244.00]                                                                                  


Epoch #29: test_reward: 16798.600000 ± 7805.461462, best_reward: 16798.600000 ± 7805.461462 in #29


Epoch #30: 1001it [00:03, 299.76it/s, env_step=30000, gradient_step=3000, len=194, n/ep=0, n/st=100, rew=10911.00]                                                                                


Epoch #30: test_reward: 12397.800000 ± 5557.444679, best_reward: 16798.600000 ± 7805.461462 in #29


Epoch #31: 1001it [00:03, 330.29it/s, env_step=31000, gradient_step=3100, len=172, n/ep=1, n/st=100, rew=9656.00]                                                                                 


Epoch #31: test_reward: 11815.100000 ± 2412.378637, best_reward: 16798.600000 ± 7805.461462 in #29


Epoch #32: 1001it [00:03, 276.87it/s, env_step=32000, gradient_step=3200, len=163, n/ep=2, n/st=100, rew=8395.50]                                                                                 


Epoch #32: test_reward: 12905.700000 ± 3542.396620, best_reward: 16798.600000 ± 7805.461462 in #29


Epoch #33: 1001it [00:03, 325.44it/s, env_step=33000, gradient_step=3300, len=68, n/ep=1, n/st=100, rew=2466.00]                                                                                  


Epoch #33: test_reward: 8466.600000 ± 4251.534975, best_reward: 16798.600000 ± 7805.461462 in #29


Epoch #34: 1001it [00:03, 298.22it/s, env_step=34000, gradient_step=3400, len=176, n/ep=0, n/st=100, rew=10165.00]                                                                                


Epoch #34: test_reward: 15217.800000 ± 7155.402697, best_reward: 16798.600000 ± 7805.461462 in #29


Epoch #35: 1001it [00:03, 318.11it/s, env_step=35000, gradient_step=3500, len=182, n/ep=1, n/st=100, rew=10411.00]                                                                                


Epoch #35: test_reward: 13424.000000 ± 7474.986395, best_reward: 16798.600000 ± 7805.461462 in #29


Epoch #36: 1001it [00:02, 344.78it/s, env_step=36000, gradient_step=3600, len=232, n/ep=2, n/st=100, rew=13999.25]                                                                                


Epoch #36: test_reward: 13774.300000 ± 5798.482285, best_reward: 16798.600000 ± 7805.461462 in #29


Epoch #37: 1001it [00:02, 337.47it/s, env_step=37000, gradient_step=3700, len=81, n/ep=2, n/st=100, rew=4092.50]                                                                                  


Epoch #37: test_reward: 14399.400000 ± 4005.594892, best_reward: 16798.600000 ± 7805.461462 in #29


Epoch #38: 1001it [00:03, 302.37it/s, env_step=38000, gradient_step=3800, len=141, n/ep=0, n/st=100, rew=7799.50]                                                                                 


Epoch #38: test_reward: 9976.200000 ± 3083.254865, best_reward: 16798.600000 ± 7805.461462 in #29


Epoch #39: 1001it [00:02, 363.94it/s, env_step=39000, gradient_step=3900, len=117, n/ep=0, n/st=100, rew=7049.50]                                                                                 


Epoch #39: test_reward: 12595.400000 ± 3818.134471, best_reward: 16798.600000 ± 7805.461462 in #29


Epoch #40: 1001it [00:02, 345.98it/s, env_step=40000, gradient_step=4000, len=400, n/ep=4, n/st=100, rew=24899.25]                                                                                


Epoch #40: test_reward: 13255.600000 ± 5189.487262, best_reward: 16798.600000 ± 7805.461462 in #29


Epoch #41: 1001it [00:02, 348.46it/s, env_step=41000, gradient_step=4100, len=185, n/ep=0, n/st=100, rew=10214.50]                                                                                


Epoch #41: test_reward: 9226.300000 ± 2296.671463, best_reward: 16798.600000 ± 7805.461462 in #29


Epoch #42: 1001it [00:03, 297.30it/s, env_step=42000, gradient_step=4200, len=229, n/ep=0, n/st=100, rew=14248.00]                                                                                


Epoch #42: test_reward: 11826.500000 ± 4204.897365, best_reward: 16798.600000 ± 7805.461462 in #29


Epoch #43: 1001it [00:02, 334.76it/s, env_step=43000, gradient_step=4300, len=125, n/ep=0, n/st=100, rew=6712.75]                                                                                 


Epoch #43: test_reward: 14050.700000 ± 3916.005466, best_reward: 16798.600000 ± 7805.461462 in #29


Epoch #44: 1001it [00:02, 340.28it/s, env_step=44000, gradient_step=4400, len=110, n/ep=1, n/st=100, rew=5546.00]                                                                                 


Epoch #44: test_reward: 10712.000000 ± 2033.614713, best_reward: 16798.600000 ± 7805.461462 in #29


Epoch #45: 1001it [00:02, 339.14it/s, env_step=45000, gradient_step=4500, len=78, n/ep=0, n/st=100, rew=3099.00]                                                                                  


Epoch #45: test_reward: 15694.800000 ± 6408.274273, best_reward: 16798.600000 ± 7805.461462 in #29


Epoch #46: 1001it [00:03, 321.87it/s, env_step=46000, gradient_step=4600, len=317, n/ep=0, n/st=100, rew=17619.00]                                                                                


Epoch #46: test_reward: 13763.000000 ± 6575.828784, best_reward: 16798.600000 ± 7805.461462 in #29


Epoch #47: 1001it [00:03, 311.55it/s, env_step=47000, gradient_step=4700, len=268, n/ep=1, n/st=100, rew=16289.00]                                                                                


Epoch #47: test_reward: 14155.500000 ± 7690.138455, best_reward: 16798.600000 ± 7805.461462 in #29


Epoch #48: 1001it [00:02, 339.96it/s, env_step=48000, gradient_step=4800, len=40, n/ep=0, n/st=100, rew=1032.00]                                                                                  


Epoch #48: test_reward: 10718.400000 ± 3776.965216, best_reward: 16798.600000 ± 7805.461462 in #29


Epoch #49: 1001it [00:03, 330.66it/s, env_step=49000, gradient_step=4900, len=135, n/ep=0, n/st=100, rew=7472.25]                                                                                 


Epoch #49: test_reward: 8960.600000 ± 5156.484931, best_reward: 16798.600000 ± 7805.461462 in #29


Epoch #50: 1001it [00:03, 328.46it/s, env_step=50000, gradient_step=5000, len=171, n/ep=1, n/st=100, rew=9851.00]                                                                                 


Epoch #50: test_reward: 17325.200000 ± 9868.987839, best_reward: 17325.200000 ± 9868.987839 in #50


Epoch #51: 1001it [00:03, 328.94it/s, env_step=51000, gradient_step=5100, len=70, n/ep=0, n/st=100, rew=3256.00]                                                                                  


Epoch #51: test_reward: 9212.600000 ± 7007.702451, best_reward: 17325.200000 ± 9868.987839 in #50


Epoch #52: 1001it [00:03, 311.28it/s, env_step=52000, gradient_step=5200, len=158, n/ep=1, n/st=100, rew=10368.50]                                                                                


Epoch #52: test_reward: 7908.800000 ± 3358.780040, best_reward: 17325.200000 ± 9868.987839 in #50


Epoch #53: 1001it [00:02, 350.57it/s, env_step=53000, gradient_step=5300, len=174, n/ep=1, n/st=100, rew=9306.50]                                                                                 


Epoch #53: test_reward: 14698.300000 ± 6110.330368, best_reward: 17325.200000 ± 9868.987839 in #50


Epoch #54: 1001it [00:03, 286.77it/s, env_step=54000, gradient_step=5400, len=288, n/ep=1, n/st=100, rew=16609.00]                                                                                


Epoch #54: test_reward: 10343.400000 ± 7430.083771, best_reward: 17325.200000 ± 9868.987839 in #50


Epoch #55: 1001it [00:02, 351.05it/s, env_step=55000, gradient_step=5500, len=267, n/ep=1, n/st=100, rew=16777.00]                                                                                


Epoch #55: test_reward: 11278.600000 ± 6534.303990, best_reward: 17325.200000 ± 9868.987839 in #50


Epoch #56: 1001it [00:03, 298.62it/s, env_step=56000, gradient_step=5600, len=138, n/ep=1, n/st=100, rew=7870.50]                                                                                 


Epoch #56: test_reward: 11996.400000 ± 6252.696941, best_reward: 17325.200000 ± 9868.987839 in #50


Epoch #57: 1001it [00:03, 286.20it/s, env_step=57000, gradient_step=5700, len=198, n/ep=0, n/st=100, rew=11944.75]                                                                                


Epoch #57: test_reward: 16063.100000 ± 8153.286509, best_reward: 17325.200000 ± 9868.987839 in #50


Epoch #58: 1001it [00:03, 277.02it/s, env_step=58000, gradient_step=5800, len=180, n/ep=1, n/st=100, rew=9839.00]                                                                                 


Epoch #58: test_reward: 11914.400000 ± 3863.063893, best_reward: 17325.200000 ± 9868.987839 in #50


Epoch #59: 1001it [00:03, 288.71it/s, env_step=59000, gradient_step=5900, len=80, n/ep=3, n/st=100, rew=4185.83]                                                                                  


Epoch #59: test_reward: 9415.200000 ± 4368.465264, best_reward: 17325.200000 ± 9868.987839 in #50


Epoch #60: 1001it [00:03, 288.61it/s, env_step=60000, gradient_step=6000, len=170, n/ep=0, n/st=100, rew=10258.00]                                                                                


Epoch #60: test_reward: 13313.300000 ± 10326.853616, best_reward: 17325.200000 ± 9868.987839 in #50


Epoch #61: 1001it [00:03, 318.61it/s, env_step=61000, gradient_step=6100, len=197, n/ep=1, n/st=100, rew=12443.00]                                                                                


Epoch #61: test_reward: 10334.000000 ± 5007.507723, best_reward: 17325.200000 ± 9868.987839 in #50


Epoch #62: 1001it [00:03, 327.79it/s, env_step=62000, gradient_step=6200, len=143, n/ep=1, n/st=100, rew=7940.50]                                                                                 


Epoch #62: test_reward: 11271.300000 ± 5040.045358, best_reward: 17325.200000 ± 9868.987839 in #50


Epoch #63: 1001it [00:02, 352.18it/s, env_step=63000, gradient_step=6300, len=118, n/ep=0, n/st=100, rew=6565.00]                                                                                 


Epoch #63: test_reward: 18257.000000 ± 9104.905897, best_reward: 18257.000000 ± 9104.905897 in #63


Epoch #64: 1001it [00:03, 289.57it/s, env_step=64000, gradient_step=6400, len=171, n/ep=0, n/st=100, rew=11389.00]                                                                                


Epoch #64: test_reward: 12178.400000 ± 5143.726027, best_reward: 18257.000000 ± 9104.905897 in #63


Epoch #65: 1001it [00:03, 284.26it/s, env_step=65000, gradient_step=6500, len=107, n/ep=0, n/st=100, rew=6060.50]                                                                                 


Epoch #65: test_reward: 9707.800000 ± 4871.270200, best_reward: 18257.000000 ± 9104.905897 in #63


Epoch #66: 1001it [00:03, 301.19it/s, env_step=66000, gradient_step=6600, len=74, n/ep=0, n/st=100, rew=3726.00]                                                                                  


Epoch #66: test_reward: 9473.300000 ± 3944.460573, best_reward: 18257.000000 ± 9104.905897 in #63


Epoch #67: 1001it [00:03, 302.37it/s, env_step=67000, gradient_step=6700, len=182, n/ep=0, n/st=100, rew=10542.50]                                                                                


Epoch #67: test_reward: 12415.200000 ± 5451.263281, best_reward: 18257.000000 ± 9104.905897 in #63


Epoch #68: 1001it [00:03, 305.15it/s, env_step=68000, gradient_step=6800, len=186, n/ep=0, n/st=100, rew=11189.00]                                                                                


Epoch #68: test_reward: 10236.600000 ± 5226.801244, best_reward: 18257.000000 ± 9104.905897 in #63


Epoch #69: 1001it [00:03, 272.54it/s, env_step=69000, gradient_step=6900, len=137, n/ep=0, n/st=100, rew=8881.00]                                                                                 


Epoch #69: test_reward: 10480.800000 ± 3354.410732, best_reward: 18257.000000 ± 9104.905897 in #63


Epoch #70: 1001it [00:03, 326.69it/s, env_step=70000, gradient_step=7000, len=185, n/ep=2, n/st=100, rew=10624.75]                                                                                


Epoch #70: test_reward: 6326.400000 ± 3063.917336, best_reward: 18257.000000 ± 9104.905897 in #63


Epoch #71: 1001it [00:03, 326.69it/s, env_step=71000, gradient_step=7100, len=210, n/ep=2, n/st=100, rew=13416.25]                                                                                


Epoch #71: test_reward: 7463.800000 ± 3127.153012, best_reward: 18257.000000 ± 9104.905897 in #63


Epoch #72: 1001it [00:03, 308.34it/s, env_step=72000, gradient_step=7200, len=254, n/ep=0, n/st=100, rew=14819.00]                                                                                


Epoch #72: test_reward: 8122.300000 ± 7924.955395, best_reward: 18257.000000 ± 9104.905897 in #63


Epoch #73: 1001it [00:03, 308.84it/s, env_step=73000, gradient_step=7300, len=248, n/ep=0, n/st=100, rew=14321.50]                                                                                


Epoch #73: test_reward: 10751.600000 ± 8712.858064, best_reward: 18257.000000 ± 9104.905897 in #63


Epoch #74: 1001it [00:03, 277.02it/s, env_step=74000, gradient_step=7400, len=95, n/ep=0, n/st=100, rew=5617.50]                                                                                  


Epoch #74: test_reward: 8341.400000 ± 3820.050345, best_reward: 18257.000000 ± 9104.905897 in #63


Epoch #75: 1001it [00:03, 268.64it/s, env_step=75000, gradient_step=7500, len=118, n/ep=0, n/st=100, rew=6421.00]                                                                                 


Epoch #75: test_reward: 13103.700000 ± 5400.359193, best_reward: 18257.000000 ± 9104.905897 in #63


Epoch #76: 1001it [00:03, 331.93it/s, env_step=76000, gradient_step=7600, len=28, n/ep=0, n/st=100, rew=1079.00]                                                                                  


Epoch #76: test_reward: 12951.400000 ± 5700.157352, best_reward: 18257.000000 ± 9104.905897 in #63


Epoch #77: 1001it [00:02, 335.16it/s, env_step=77000, gradient_step=7700, len=197, n/ep=0, n/st=100, rew=11795.25]                                                                                


Epoch #77: test_reward: 16730.600000 ± 8915.230667, best_reward: 18257.000000 ± 9104.905897 in #63


Epoch #78: 1001it [00:02, 333.82it/s, env_step=78000, gradient_step=7800, len=141, n/ep=0, n/st=100, rew=8561.50]                                                                                 


Epoch #78: test_reward: 11027.700000 ± 3580.411319, best_reward: 18257.000000 ± 9104.905897 in #63


Epoch #79: 1001it [00:03, 307.98it/s, env_step=79000, gradient_step=7900, len=138, n/ep=2, n/st=100, rew=8118.00]                                                                                 


Epoch #79: test_reward: 9458.200000 ± 4723.532191, best_reward: 18257.000000 ± 9104.905897 in #63


Epoch #80: 1001it [00:03, 285.07it/s, env_step=80000, gradient_step=8000, len=245, n/ep=2, n/st=100, rew=15324.00]                                                                                


Epoch #80: test_reward: 11652.200000 ± 6471.075920, best_reward: 18257.000000 ± 9104.905897 in #63


Epoch #81: 1001it [00:03, 279.13it/s, env_step=81000, gradient_step=8100, len=400, n/ep=0, n/st=100, rew=27312.00]                                                                                


Epoch #81: test_reward: 8213.400000 ± 2944.535148, best_reward: 18257.000000 ± 9104.905897 in #63


Epoch #82: 1001it [00:03, 330.02it/s, env_step=82000, gradient_step=8200, len=170, n/ep=2, n/st=100, rew=11040.00]                                                                                


Epoch #82: test_reward: 11012.100000 ± 3562.333798, best_reward: 18257.000000 ± 9104.905897 in #63


Epoch #83: 1001it [00:02, 350.41it/s, env_step=83000, gradient_step=8300, len=24, n/ep=0, n/st=100, rew=868.00]                                                                                   


Epoch #83: test_reward: 10541.900000 ± 5208.880272, best_reward: 18257.000000 ± 9104.905897 in #63


Epoch #84: 1001it [00:03, 300.00it/s, env_step=84000, gradient_step=8400, len=237, n/ep=1, n/st=100, rew=15447.00]                                                                                


Epoch #84: test_reward: 7076.100000 ± 7671.082094, best_reward: 18257.000000 ± 9104.905897 in #63


Epoch #85: 1001it [00:03, 302.90it/s, env_step=85000, gradient_step=8500, len=206, n/ep=1, n/st=100, rew=13125.00]                                                                                


Epoch #85: test_reward: 9324.800000 ± 4325.958224, best_reward: 18257.000000 ± 9104.905897 in #63


Epoch #86: 1001it [00:03, 274.30it/s, env_step=86000, gradient_step=8600, len=145, n/ep=3, n/st=100, rew=8623.33]                                                                                 


Epoch #86: test_reward: 12077.200000 ± 3927.575558, best_reward: 18257.000000 ± 9104.905897 in #63


Epoch #87: 1001it [00:03, 291.28it/s, env_step=87000, gradient_step=8700, len=148, n/ep=1, n/st=100, rew=9365.00]                                                                                 


Epoch #87: test_reward: 12211.700000 ± 2770.048232, best_reward: 18257.000000 ± 9104.905897 in #63


Epoch #88: 1001it [00:03, 289.18it/s, env_step=88000, gradient_step=8800, len=158, n/ep=1, n/st=100, rew=9024.00]                                                                                 


Epoch #88: test_reward: 14015.000000 ± 8468.029771, best_reward: 18257.000000 ± 9104.905897 in #63


Epoch #89: 1001it [00:02, 335.68it/s, env_step=89000, gradient_step=8900, len=154, n/ep=1, n/st=100, rew=9467.00]                                                                                 


Epoch #89: test_reward: 10055.400000 ± 3948.197771, best_reward: 18257.000000 ± 9104.905897 in #63


Epoch #90: 1001it [00:02, 352.65it/s, env_step=90000, gradient_step=9000, len=187, n/ep=0, n/st=100, rew=12062.00]                                                                                


Epoch #90: test_reward: 15356.100000 ± 7625.527883, best_reward: 18257.000000 ± 9104.905897 in #63


Epoch #91: 1001it [00:03, 308.46it/s, env_step=91000, gradient_step=9100, len=136, n/ep=0, n/st=100, rew=7682.67]                                                                                 


Epoch #91: test_reward: 12806.600000 ± 4701.476177, best_reward: 18257.000000 ± 9104.905897 in #63


Epoch #92: 1001it [00:02, 338.79it/s, env_step=92000, gradient_step=9200, len=116, n/ep=0, n/st=100, rew=6517.00]                                                                                 


Epoch #92: test_reward: 9106.200000 ± 2481.577474, best_reward: 18257.000000 ± 9104.905897 in #63


Epoch #93: 1001it [00:03, 290.37it/s, env_step=93000, gradient_step=9300, len=191, n/ep=2, n/st=100, rew=11520.50]                                                                                


Epoch #93: test_reward: 11579.100000 ± 7363.315591, best_reward: 18257.000000 ± 9104.905897 in #63


Epoch #94: 1001it [00:03, 298.11it/s, env_step=94000, gradient_step=9400, len=139, n/ep=1, n/st=100, rew=8389.00]                                                                                 


Epoch #94: test_reward: 22419.100000 ± 7204.910485, best_reward: 22419.100000 ± 7204.910485 in #94

InfoStats(gradient_step=9400, best_reward=22419.1, best_reward_std=7204.9104845237325, train_step=94000, train_episode=541, test_step=168779, test_episode=950, timing=TimingStats(total_time=396.2129762172699, train_time=299.77411437034607, train_time_collect=33.07958102226257, train_time_update=262.7720859050751, test_time=96.43886184692383, update_speed=313.5694360983777))

(the trained policy can be accessed via policy.policies[agents[1]])
Train with agent_opponent #5


Epoch #1: 1001it [00:02, 370.92it/s, env_step=1000, gradient_step=100, len=0, n/ep=0, n/st=100, rew=0.00]                                                                                         


Epoch #1: test_reward: 10743.800000 ± 2864.548509, best_reward: 10743.800000 ± 2864.548509 in #1


Epoch #2: 1001it [00:02, 357.80it/s, env_step=2000, gradient_step=200, len=0, n/ep=0, n/st=100, rew=0.00]                                                                                         


Epoch #2: test_reward: 9153.200000 ± 2848.481588, best_reward: 10743.800000 ± 2864.548509 in #1


Epoch #3: 1001it [00:02, 386.86it/s, env_step=3000, gradient_step=300, len=0, n/ep=0, n/st=100, rew=0.00]                                                                                         


Epoch #3: test_reward: 10242.800000 ± 3013.044334, best_reward: 10743.800000 ± 2864.548509 in #1


Epoch #4: 1001it [00:02, 397.31it/s, env_step=4000, gradient_step=400, len=0, n/ep=0, n/st=100, rew=0.00]                                                                                         


Epoch #4: test_reward: 6097.600000 ± 2935.808141, best_reward: 10743.800000 ± 2864.548509 in #1


Epoch #5: 1001it [00:02, 385.40it/s, env_step=5000, gradient_step=500, len=0, n/ep=0, n/st=100, rew=0.00]                                                                                         


Epoch #5: test_reward: 9356.000000 ± 4376.989925, best_reward: 10743.800000 ± 2864.548509 in #1


Epoch #6: 1001it [00:02, 402.03it/s, env_step=6000, gradient_step=600, len=56, n/ep=0, n/st=100, rew=1843.00]                                                                                     


Epoch #6: test_reward: 11593.200000 ± 3119.806206, best_reward: 11593.200000 ± 3119.806206 in #6


Epoch #7: 1001it [00:02, 353.53it/s, env_step=7000, gradient_step=700, len=70, n/ep=1, n/st=100, rew=3524.00]                                                                                     


Epoch #7: test_reward: 11714.600000 ± 2291.100749, best_reward: 11714.600000 ± 2291.100749 in #7


Epoch #8: 1001it [00:02, 359.40it/s, env_step=8000, gradient_step=800, len=78, n/ep=0, n/st=100, rew=4129.50]                                                                                     


Epoch #8: test_reward: 12921.300000 ± 2185.349219, best_reward: 12921.300000 ± 2185.349219 in #8


Epoch #9: 1001it [00:03, 291.85it/s, env_step=9000, gradient_step=900, len=90, n/ep=6, n/st=100, rew=4971.00]                                                                                     


Epoch #9: test_reward: 11485.300000 ± 5524.174944, best_reward: 12921.300000 ± 2185.349219 in #8


Epoch #10: 1001it [00:03, 294.54it/s, env_step=10000, gradient_step=1000, len=98, n/ep=0, n/st=100, rew=5214.50]                                                                                  


Epoch #10: test_reward: 11931.300000 ± 2494.552868, best_reward: 12921.300000 ± 2185.349219 in #8


Epoch #11: 1001it [00:03, 324.71it/s, env_step=11000, gradient_step=1100, len=110, n/ep=2, n/st=100, rew=5797.00]                                                                                 


Epoch #11: test_reward: 8547.800000 ± 3428.724363, best_reward: 12921.300000 ± 2185.349219 in #8


Epoch #12: 1001it [00:02, 376.18it/s, env_step=12000, gradient_step=1200, len=120, n/ep=1, n/st=100, rew=6500.00]                                                                                 


Epoch #12: test_reward: 14248.400000 ± 6483.884749, best_reward: 14248.400000 ± 6483.884749 in #12


Epoch #13: 1001it [00:02, 369.92it/s, env_step=13000, gradient_step=1300, len=130, n/ep=1, n/st=100, rew=8009.00]                                                                                 


Epoch #13: test_reward: 10035.100000 ± 3012.899681, best_reward: 14248.400000 ± 6483.884749 in #12


Epoch #14: 1001it [00:03, 331.03it/s, env_step=14000, gradient_step=1400, len=138, n/ep=0, n/st=100, rew=8230.00]                                                                                 


Epoch #14: test_reward: 11271.200000 ± 4532.294712, best_reward: 14248.400000 ± 6483.884749 in #12


Epoch #15: 1001it [00:02, 346.10it/s, env_step=15000, gradient_step=1500, len=150, n/ep=1, n/st=100, rew=8425.50]                                                                                 


Epoch #15: test_reward: 9557.400000 ± 3212.993097, best_reward: 14248.400000 ± 6483.884749 in #12


Epoch #16: 1001it [00:03, 320.01it/s, env_step=16000, gradient_step=1600, len=160, n/ep=1, n/st=100, rew=9425.50]                                                                                 


Epoch #16: test_reward: 12931.500000 ± 6197.773427, best_reward: 14248.400000 ± 6483.884749 in #12


Epoch #17: 1001it [00:03, 305.51it/s, env_step=17000, gradient_step=1700, len=170, n/ep=2, n/st=100, rew=9277.75]                                                                                 


Epoch #17: test_reward: 9730.800000 ± 4215.885264, best_reward: 14248.400000 ± 6483.884749 in #12


Epoch #18: 1001it [00:02, 395.67it/s, env_step=18000, gradient_step=1800, len=90, n/ep=1, n/st=100, rew=4535.50]                                                                                  


Epoch #18: test_reward: 11314.700000 ± 2987.462202, best_reward: 14248.400000 ± 6483.884749 in #12


Epoch #19: 1001it [00:02, 339.42it/s, env_step=19000, gradient_step=1900, len=190, n/ep=1, n/st=100, rew=12118.00]                                                                                


Epoch #19: test_reward: 9457.200000 ± 5681.847038, best_reward: 14248.400000 ± 6483.884749 in #12


Epoch #20: 1001it [00:02, 392.37it/s, env_step=20000, gradient_step=2000, len=198, n/ep=0, n/st=100, rew=12503.00]                                                                                


Epoch #20: test_reward: 16328.000000 ± 6225.249553, best_reward: 16328.000000 ± 6225.249553 in #20


Epoch #21: 1001it [00:02, 358.18it/s, env_step=21000, gradient_step=2100, len=58, n/ep=1, n/st=100, rew=2036.00]                                                                                  


Epoch #21: test_reward: 14112.600000 ± 6472.060077, best_reward: 16328.000000 ± 6225.249553 in #20


Epoch #22: 1001it [00:02, 355.60it/s, env_step=22000, gradient_step=2200, len=219, n/ep=0, n/st=100, rew=14089.50]                                                                                


Epoch #22: test_reward: 17213.400000 ± 7003.712047, best_reward: 17213.400000 ± 7003.712047 in #22


Epoch #23: 1001it [00:02, 338.93it/s, env_step=23000, gradient_step=2300, len=130, n/ep=0, n/st=100, rew=7722.75]                                                                                 


Epoch #23: test_reward: 8078.000000 ± 3992.176148, best_reward: 17213.400000 ± 7003.712047 in #22


Epoch #24: 1001it [00:02, 353.26it/s, env_step=24000, gradient_step=2400, len=239, n/ep=0, n/st=100, rew=14002.00]                                                                                


Epoch #24: test_reward: 9603.200000 ± 2817.263168, best_reward: 17213.400000 ± 7003.712047 in #22


Epoch #25: 1001it [00:02, 352.28it/s, env_step=25000, gradient_step=2500, len=247, n/ep=0, n/st=100, rew=16122.50]                                                                                


Epoch #25: test_reward: 11484.600000 ± 3892.403273, best_reward: 17213.400000 ± 7003.712047 in #22


Epoch #26: 1001it [00:02, 355.02it/s, env_step=26000, gradient_step=2600, len=257, n/ep=0, n/st=100, rew=17111.50]                                                                                


Epoch #26: test_reward: 12963.600000 ± 6369.254496, best_reward: 17213.400000 ± 7003.712047 in #22


Epoch #27: 1001it [00:03, 328.03it/s, env_step=27000, gradient_step=2700, len=267, n/ep=0, n/st=100, rew=18144.50]                                                                                


Epoch #27: test_reward: 11779.800000 ± 3396.488593, best_reward: 17213.400000 ± 7003.712047 in #22


Epoch #28: 1001it [00:02, 389.58it/s, env_step=28000, gradient_step=2800, len=134, n/ep=0, n/st=100, rew=7848.50]                                                                                 


Epoch #28: test_reward: 11606.100000 ± 4083.453451, best_reward: 17213.400000 ± 7003.712047 in #22


Epoch #29: 1001it [00:02, 363.18it/s, env_step=29000, gradient_step=2900, len=190, n/ep=0, n/st=100, rew=11130.00]                                                                                


Epoch #29: test_reward: 11418.200000 ± 5053.268503, best_reward: 17213.400000 ± 7003.712047 in #22


Epoch #30: 1001it [00:02, 401.56it/s, env_step=30000, gradient_step=3000, len=222, n/ep=0, n/st=100, rew=13659.50]                                                                                


Epoch #30: test_reward: 8794.300000 ± 3679.329777, best_reward: 17213.400000 ± 7003.712047 in #22


Epoch #31: 1001it [00:02, 353.49it/s, env_step=31000, gradient_step=3100, len=310, n/ep=1, n/st=100, rew=20185.00]                                                                                


Epoch #31: test_reward: 22527.300000 ± 3680.048914, best_reward: 22527.300000 ± 3680.048914 in #31

InfoStats(gradient_step=3100, best_reward=22527.3, best_reward_std=3680.048914077094, train_step=31000, train_episode=116, test_step=57950, test_episode=320, timing=TimingStats(total_time=124.03714418411255, train_time=88.08321332931519, train_time_collect=10.436949253082275, train_time_update=76.291921377182, test_time=35.95393085479736, update_speed=351.9399307573037))

(the trained policy can be accessed via policy.policies[agents[1]])
Train with agent_opponent #27


Epoch #1: 1001it [00:03, 325.96it/s, env_step=1000, gradient_step=100, len=0, n/ep=0, n/st=100, rew=0.00]                                                                                         


Epoch #1: test_reward: 13126.200000 ± 5278.237088, best_reward: 13126.200000 ± 5278.237088 in #1


Epoch #2: 1001it [00:02, 334.25it/s, env_step=2000, gradient_step=200, len=0, n/ep=0, n/st=100, rew=0.00]                                                                                         


Epoch #2: test_reward: 16218.800000 ± 2763.770352, best_reward: 16218.800000 ± 2763.770352 in #2


Epoch #3: 1001it [00:03, 308.95it/s, env_step=3000, gradient_step=300, len=0, n/ep=0, n/st=100, rew=0.00]                                                                                         


Epoch #3: test_reward: 19258.600000 ± 3774.855340, best_reward: 19258.600000 ± 3774.855340 in #3


Epoch #4: 1001it [00:03, 309.23it/s, env_step=4000, gradient_step=400, len=0, n/ep=0, n/st=100, rew=0.00]                                                                                         


Epoch #4: test_reward: 10107.300000 ± 5175.920402, best_reward: 19258.600000 ± 3774.855340 in #3


Epoch #5: 1001it [00:03, 315.79it/s, env_step=5000, gradient_step=500, len=0, n/ep=0, n/st=100, rew=0.00]                                                                                         


Epoch #5: test_reward: 10140.700000 ± 3728.414839, best_reward: 19258.600000 ± 3774.855340 in #3


Epoch #6: 1001it [00:03, 320.27it/s, env_step=6000, gradient_step=600, len=0, n/ep=0, n/st=100, rew=0.00]                                                                                         


Epoch #6: test_reward: 9181.200000 ± 3301.083362, best_reward: 19258.600000 ± 3774.855340 in #3


Epoch #7: 1001it [00:03, 297.34it/s, env_step=7000, gradient_step=700, len=64, n/ep=0, n/st=100, rew=2502.00]                                                                                     


Epoch #7: test_reward: 11076.900000 ± 4019.046266, best_reward: 19258.600000 ± 3774.855340 in #3


Epoch #8: 1001it [00:03, 284.25it/s, env_step=8000, gradient_step=800, len=64, n/ep=0, n/st=100, rew=2502.00]                                                                                     


Epoch #8: test_reward: 11367.700000 ± 2434.419152, best_reward: 19258.600000 ± 3774.855340 in #3


Epoch #9: 1001it [00:03, 330.87it/s, env_step=9000, gradient_step=900, len=82, n/ep=0, n/st=100, rew=3265.00]                                                                                     


Epoch #9: test_reward: 8524.100000 ± 2506.355220, best_reward: 19258.600000 ± 3774.855340 in #3


Epoch #10: 1001it [00:02, 340.55it/s, env_step=10000, gradient_step=1000, len=100, n/ep=1, n/st=100, rew=5108.50]                                                                                 


Epoch #10: test_reward: 7630.100000 ± 1937.090935, best_reward: 19258.600000 ± 3774.855340 in #3


Epoch #11: 1001it [00:03, 314.74it/s, env_step=11000, gradient_step=1100, len=110, n/ep=2, n/st=100, rew=5666.25]                                                                                 


Epoch #11: test_reward: 9583.400000 ± 3082.545059, best_reward: 19258.600000 ± 3774.855340 in #3


Epoch #12: 1001it [00:03, 317.79it/s, env_step=12000, gradient_step=1200, len=114, n/ep=0, n/st=100, rew=6072.00]                                                                                 


Epoch #12: test_reward: 9282.700000 ± 1999.115207, best_reward: 19258.600000 ± 3774.855340 in #3


Epoch #13: 1001it [00:03, 322.53it/s, env_step=13000, gradient_step=1300, len=130, n/ep=2, n/st=100, rew=6195.50]                                                                                 


Epoch #13: test_reward: 10469.400000 ± 1486.973853, best_reward: 19258.600000 ± 3774.855340 in #3


Epoch #14: 1001it [00:03, 318.32it/s, env_step=14000, gradient_step=1400, len=140, n/ep=2, n/st=100, rew=8323.25]                                                                                 


Epoch #14: test_reward: 9205.700000 ± 2179.254462, best_reward: 19258.600000 ± 3774.855340 in #3


Epoch #15: 1001it [00:03, 307.76it/s, env_step=15000, gradient_step=1500, len=150, n/ep=2, n/st=100, rew=8659.50]                                                                                 


Epoch #15: test_reward: 10321.500000 ± 3846.974844, best_reward: 19258.600000 ± 3774.855340 in #3


Epoch #16: 1001it [00:03, 309.14it/s, env_step=16000, gradient_step=1600, len=160, n/ep=2, n/st=100, rew=8942.00]                                                                                 


Epoch #16: test_reward: 11071.200000 ± 3586.167782, best_reward: 19258.600000 ± 3774.855340 in #3


Epoch #17: 1001it [00:03, 293.25it/s, env_step=17000, gradient_step=1700, len=168, n/ep=0, n/st=100, rew=9518.83]                                                                                 


Epoch #17: test_reward: 8377.500000 ± 2380.800380, best_reward: 19258.600000 ± 3774.855340 in #3


Epoch #18: 1001it [00:03, 288.42it/s, env_step=18000, gradient_step=1800, len=180, n/ep=3, n/st=100, rew=10369.17]                                                                                


Epoch #18: test_reward: 14835.400000 ± 4655.909819, best_reward: 19258.600000 ± 3774.855340 in #3


Epoch #19: 1001it [00:03, 282.76it/s, env_step=19000, gradient_step=1900, len=190, n/ep=1, n/st=100, rew=10624.00]                                                                                


Epoch #19: test_reward: 9628.200000 ± 4155.875523, best_reward: 19258.600000 ± 3774.855340 in #3


Epoch #20: 1001it [00:03, 316.40it/s, env_step=20000, gradient_step=2000, len=157, n/ep=0, n/st=100, rew=9752.25]                                                                                 


Epoch #20: test_reward: 10169.800000 ± 3562.562948, best_reward: 19258.600000 ± 3774.855340 in #3


Epoch #21: 1001it [00:03, 323.58it/s, env_step=21000, gradient_step=2100, len=209, n/ep=0, n/st=100, rew=13085.00]                                                                                


Epoch #21: test_reward: 17995.900000 ± 8671.800061, best_reward: 19258.600000 ± 3774.855340 in #3


Epoch #22: 1001it [00:03, 288.47it/s, env_step=22000, gradient_step=2200, len=214, n/ep=0, n/st=100, rew=12801.00]                                                                                


Epoch #22: test_reward: 11649.200000 ± 2824.111641, best_reward: 19258.600000 ± 3774.855340 in #3


Epoch #23: 1001it [00:03, 299.33it/s, env_step=23000, gradient_step=2300, len=230, n/ep=2, n/st=100, rew=13728.25]                                                                                


Epoch #23: test_reward: 17759.000000 ± 7246.693936, best_reward: 19258.600000 ± 3774.855340 in #3


Epoch #24: 1001it [00:02, 340.27it/s, env_step=24000, gradient_step=2400, len=72, n/ep=0, n/st=100, rew=3934.00]                                                                                  


Epoch #24: test_reward: 11638.100000 ± 6179.513127, best_reward: 19258.600000 ± 3774.855340 in #3


Epoch #25: 1001it [00:03, 329.04it/s, env_step=25000, gradient_step=2500, len=86, n/ep=1, n/st=100, rew=5091.00]                                                                                  


Epoch #25: test_reward: 8452.600000 ± 4173.540660, best_reward: 19258.600000 ± 3774.855340 in #3


Epoch #26: 1001it [00:02, 347.29it/s, env_step=26000, gradient_step=2600, len=86, n/ep=0, n/st=100, rew=5091.00]                                                                                  


Epoch #26: test_reward: 14817.400000 ± 9000.084213, best_reward: 19258.600000 ± 3774.855340 in #3


Epoch #27: 1001it [00:03, 307.30it/s, env_step=27000, gradient_step=2700, len=94, n/ep=1, n/st=100, rew=4903.00]                                                                                  


Epoch #27: test_reward: 14660.900000 ± 7753.491761, best_reward: 19258.600000 ± 3774.855340 in #3


Epoch #28: 1001it [00:03, 328.47it/s, env_step=28000, gradient_step=2800, len=196, n/ep=2, n/st=100, rew=12172.00]                                                                                


Epoch #28: test_reward: 9378.400000 ± 2227.427359, best_reward: 19258.600000 ± 3774.855340 in #3


Epoch #29: 1001it [00:03, 314.42it/s, env_step=29000, gradient_step=2900, len=157, n/ep=0, n/st=100, rew=9740.50]                                                                                 


Epoch #29: test_reward: 14687.000000 ± 3614.697138, best_reward: 19258.600000 ± 3774.855340 in #3


Epoch #30: 1001it [00:03, 329.85it/s, env_step=30000, gradient_step=3000, len=164, n/ep=1, n/st=100, rew=9969.00]                                                                                 


Epoch #30: test_reward: 10085.700000 ± 6220.265333, best_reward: 19258.600000 ± 3774.855340 in #3


Epoch #31: 1001it [00:03, 305.35it/s, env_step=31000, gradient_step=3100, len=117, n/ep=2, n/st=100, rew=6984.50]                                                                                 


Epoch #31: test_reward: 9614.800000 ± 3931.598169, best_reward: 19258.600000 ± 3774.855340 in #3


Epoch #32: 1001it [00:03, 330.03it/s, env_step=32000, gradient_step=3200, len=142, n/ep=2, n/st=100, rew=8475.50]                                                                                 


Epoch #32: test_reward: 10519.600000 ± 4458.309594, best_reward: 19258.600000 ± 3774.855340 in #3


Epoch #33: 1001it [00:03, 290.78it/s, env_step=33000, gradient_step=3300, len=90, n/ep=0, n/st=100, rew=5279.00]                                                                                  


Epoch #33: test_reward: 14001.200000 ± 6283.591327, best_reward: 19258.600000 ± 3774.855340 in #3


Epoch #34: 1001it [00:03, 311.47it/s, env_step=34000, gradient_step=3400, len=143, n/ep=2, n/st=100, rew=8894.00]                                                                                 


Epoch #34: test_reward: 14634.600000 ± 6335.038125, best_reward: 19258.600000 ± 3774.855340 in #3


Epoch #35: 1001it [00:02, 337.45it/s, env_step=35000, gradient_step=3500, len=350, n/ep=2, n/st=100, rew=21843.00]                                                                                


Epoch #35: test_reward: 10138.700000 ± 8277.092389, best_reward: 19258.600000 ± 3774.855340 in #3


Epoch #36: 1001it [00:02, 344.83it/s, env_step=36000, gradient_step=3600, len=166, n/ep=0, n/st=100, rew=10671.50]                                                                                


Epoch #36: test_reward: 9247.100000 ± 4883.396266, best_reward: 19258.600000 ± 3774.855340 in #3


Epoch #37: 1001it [00:03, 319.47it/s, env_step=37000, gradient_step=3700, len=135, n/ep=1, n/st=100, rew=8263.50]                                                                                 


Epoch #37: test_reward: 10128.800000 ± 3781.521408, best_reward: 19258.600000 ± 3774.855340 in #3


Epoch #38: 1001it [00:02, 348.15it/s, env_step=38000, gradient_step=3800, len=178, n/ep=1, n/st=100, rew=10316.00]                                                                                


Epoch #38: test_reward: 10336.300000 ± 3351.834962, best_reward: 19258.600000 ± 3774.855340 in #3


Epoch #39: 1001it [00:02, 367.22it/s, env_step=39000, gradient_step=3900, len=144, n/ep=1, n/st=100, rew=7781.00]                                                                                 


Epoch #39: test_reward: 10831.900000 ± 6476.781862, best_reward: 19258.600000 ± 3774.855340 in #3


Epoch #40: 1001it [00:02, 337.85it/s, env_step=40000, gradient_step=4000, len=228, n/ep=2, n/st=100, rew=14381.50]                                                                                


Epoch #40: test_reward: 9604.100000 ± 2594.285237, best_reward: 19258.600000 ± 3774.855340 in #3


Epoch #41: 1001it [00:02, 338.03it/s, env_step=41000, gradient_step=4100, len=177, n/ep=1, n/st=100, rew=11522.00]                                                                                


Epoch #41: test_reward: 12849.300000 ± 4362.136588, best_reward: 19258.600000 ± 3774.855340 in #3


Epoch #42: 1001it [00:03, 302.96it/s, env_step=42000, gradient_step=4200, len=111, n/ep=0, n/st=100, rew=6775.00]                                                                                 


Epoch #42: test_reward: 9481.600000 ± 3326.144681, best_reward: 19258.600000 ± 3774.855340 in #3


Epoch #43: 1001it [00:03, 311.40it/s, env_step=43000, gradient_step=4300, len=171, n/ep=0, n/st=100, rew=10987.00]                                                                                


Epoch #43: test_reward: 11048.900000 ± 4224.491081, best_reward: 19258.600000 ± 3774.855340 in #3


Epoch #44: 1001it [00:03, 325.67it/s, env_step=44000, gradient_step=4400, len=136, n/ep=1, n/st=100, rew=8631.00]                                                                                 


Epoch #44: test_reward: 9055.500000 ± 1696.621717, best_reward: 19258.600000 ± 3774.855340 in #3


Epoch #45: 1001it [00:03, 290.45it/s, env_step=45000, gradient_step=4500, len=170, n/ep=0, n/st=100, rew=10260.00]                                                                                


Epoch #45: test_reward: 9745.200000 ± 2322.533134, best_reward: 19258.600000 ± 3774.855340 in #3


Epoch #46: 1001it [00:03, 299.59it/s, env_step=46000, gradient_step=4600, len=136, n/ep=1, n/st=100, rew=9079.50]                                                                                 


Epoch #46: test_reward: 12397.900000 ± 4136.752046, best_reward: 19258.600000 ± 3774.855340 in #3


Epoch #47: 1001it [00:03, 321.74it/s, env_step=47000, gradient_step=4700, len=121, n/ep=0, n/st=100, rew=7470.25]                                                                                 


Epoch #47: test_reward: 12715.600000 ± 4036.591587, best_reward: 19258.600000 ± 3774.855340 in #3


Epoch #48: 1001it [00:03, 307.38it/s, env_step=48000, gradient_step=4800, len=114, n/ep=2, n/st=100, rew=7176.00]                                                                                 


Epoch #48: test_reward: 14485.600000 ± 4853.539723, best_reward: 19258.600000 ± 3774.855340 in #3


Epoch #49: 1001it [00:03, 295.24it/s, env_step=49000, gradient_step=4900, len=138, n/ep=0, n/st=100, rew=8759.00]                                                                                 


Epoch #49: test_reward: 12173.300000 ± 4052.519662, best_reward: 19258.600000 ± 3774.855340 in #3


Epoch #50: 1001it [00:03, 297.84it/s, env_step=50000, gradient_step=5000, len=122, n/ep=0, n/st=100, rew=7461.25]                                                                                 


Epoch #50: test_reward: 7981.100000 ± 1884.212963, best_reward: 19258.600000 ± 3774.855340 in #3


Epoch #51: 1001it [00:02, 340.82it/s, env_step=51000, gradient_step=5100, len=186, n/ep=0, n/st=100, rew=11651.00]                                                                                


Epoch #51: test_reward: 11111.300000 ± 5861.016568, best_reward: 19258.600000 ± 3774.855340 in #3


Epoch #52: 1001it [00:02, 337.46it/s, env_step=52000, gradient_step=5200, len=183, n/ep=4, n/st=100, rew=12197.12]                                                                                


Epoch #52: test_reward: 13876.900000 ± 7516.881926, best_reward: 19258.600000 ± 3774.855340 in #3


Epoch #53: 1001it [00:03, 305.20it/s, env_step=53000, gradient_step=5300, len=157, n/ep=3, n/st=100, rew=10235.33]                                                                                


Epoch #53: test_reward: 10383.600000 ± 3633.047734, best_reward: 19258.600000 ± 3774.855340 in #3


Epoch #54: 1001it [00:03, 325.13it/s, env_step=54000, gradient_step=5400, len=396, n/ep=1, n/st=100, rew=23207.00]                                                                                


Epoch #54: test_reward: 9995.700000 ± 4469.649339, best_reward: 19258.600000 ± 3774.855340 in #3


Epoch #55: 1001it [00:03, 285.19it/s, env_step=55000, gradient_step=5500, len=400, n/ep=1, n/st=100, rew=24388.00]                                                                                


Epoch #55: test_reward: 11279.600000 ± 4185.213978, best_reward: 19258.600000 ± 3774.855340 in #3


Epoch #56: 1001it [00:02, 366.70it/s, env_step=56000, gradient_step=5600, len=122, n/ep=1, n/st=100, rew=7391.00]                                                                                 


Epoch #56: test_reward: 17038.300000 ± 8792.402266, best_reward: 19258.600000 ± 3774.855340 in #3


Epoch #57: 1001it [00:02, 366.40it/s, env_step=57000, gradient_step=5700, len=129, n/ep=1, n/st=100, rew=8235.00]                                                                                 


Epoch #57: test_reward: 13441.800000 ± 5406.838444, best_reward: 19258.600000 ± 3774.855340 in #3


Epoch #58: 1001it [00:03, 311.30it/s, env_step=58000, gradient_step=5800, len=118, n/ep=0, n/st=100, rew=7361.00]                                                                                 


Epoch #58: test_reward: 14283.000000 ± 7719.262944, best_reward: 19258.600000 ± 3774.855340 in #3


Epoch #59: 1001it [00:03, 320.25it/s, env_step=59000, gradient_step=5900, len=290, n/ep=1, n/st=100, rew=20865.00]                                                                                


Epoch #59: test_reward: 9766.100000 ± 3320.228861, best_reward: 19258.600000 ± 3774.855340 in #3


Epoch #60: 1001it [00:02, 353.40it/s, env_step=60000, gradient_step=6000, len=237, n/ep=0, n/st=100, rew=15541.00]                                                                                


Epoch #60: test_reward: 12567.800000 ± 5002.907351, best_reward: 19258.600000 ± 3774.855340 in #3


Epoch #61: 1001it [00:02, 338.57it/s, env_step=61000, gradient_step=6100, len=129, n/ep=0, n/st=100, rew=7793.00]                                                                                 


Epoch #61: test_reward: 10856.300000 ± 3274.457086, best_reward: 19258.600000 ± 3774.855340 in #3


Epoch #62: 1001it [00:03, 295.56it/s, env_step=62000, gradient_step=6200, len=106, n/ep=0, n/st=100, rew=6143.00]                                                                                 


Epoch #62: test_reward: 9962.300000 ± 4370.140525, best_reward: 19258.600000 ± 3774.855340 in #3


Epoch #63: 1001it [00:03, 332.10it/s, env_step=63000, gradient_step=6300, len=342, n/ep=0, n/st=100, rew=22715.00]                                                                                


Epoch #63: test_reward: 12355.900000 ± 4946.724744, best_reward: 19258.600000 ± 3774.855340 in #3


Epoch #64: 1001it [00:02, 353.70it/s, env_step=64000, gradient_step=6400, len=136, n/ep=0, n/st=100, rew=8456.75]                                                                                 


Epoch #64: test_reward: 14888.800000 ± 8414.662118, best_reward: 19258.600000 ± 3774.855340 in #3


Epoch #65: 1001it [00:03, 325.00it/s, env_step=65000, gradient_step=6500, len=145, n/ep=2, n/st=100, rew=8219.00]                                                                                 


Epoch #65: test_reward: 16072.300000 ± 5447.809194, best_reward: 19258.600000 ± 3774.855340 in #3


Epoch #66: 1001it [00:03, 327.64it/s, env_step=66000, gradient_step=6600, len=148, n/ep=1, n/st=100, rew=9648.00]                                                                                 


Epoch #66: test_reward: 14426.600000 ± 8347.894396, best_reward: 19258.600000 ± 3774.855340 in #3


Epoch #67: 1001it [00:03, 322.89it/s, env_step=67000, gradient_step=6700, len=199, n/ep=1, n/st=100, rew=12703.00]                                                                                


Epoch #67: test_reward: 13094.600000 ± 4864.184273, best_reward: 19258.600000 ± 3774.855340 in #3


Epoch #68: 1001it [00:03, 280.96it/s, env_step=68000, gradient_step=6800, len=152, n/ep=0, n/st=100, rew=9875.00]                                                                                 


Epoch #68: test_reward: 16435.400000 ± 7589.799802, best_reward: 19258.600000 ± 3774.855340 in #3


Epoch #69: 1001it [00:02, 353.94it/s, env_step=69000, gradient_step=6900, len=64, n/ep=1, n/st=100, rew=3190.00]                                                                                  


Epoch #69: test_reward: 8589.800000 ± 2237.089350, best_reward: 19258.600000 ± 3774.855340 in #3


Epoch #70: 1001it [00:02, 356.04it/s, env_step=70000, gradient_step=7000, len=141, n/ep=2, n/st=100, rew=9196.75]                                                                                 


Epoch #70: test_reward: 8565.400000 ± 2499.860644, best_reward: 19258.600000 ± 3774.855340 in #3


Epoch #71: 1001it [00:02, 350.64it/s, env_step=71000, gradient_step=7100, len=123, n/ep=1, n/st=100, rew=7771.00]                                                                                 


Epoch #71: test_reward: 11058.100000 ± 5836.209977, best_reward: 19258.600000 ± 3774.855340 in #3


Epoch #72: 1001it [00:03, 309.02it/s, env_step=72000, gradient_step=7200, len=57, n/ep=1, n/st=100, rew=2986.00]                                                                                  


Epoch #72: test_reward: 12400.600000 ± 7165.690660, best_reward: 19258.600000 ± 3774.855340 in #3


Epoch #73: 1001it [00:02, 345.72it/s, env_step=73000, gradient_step=7300, len=180, n/ep=0, n/st=100, rew=12000.50]                                                                                


Epoch #73: test_reward: 10159.800000 ± 1970.627403, best_reward: 19258.600000 ± 3774.855340 in #3


Epoch #74: 1001it [00:03, 291.61it/s, env_step=74000, gradient_step=7400, len=136, n/ep=0, n/st=100, rew=8376.67]                                                                                 


Epoch #74: test_reward: 14755.500000 ± 4796.743463, best_reward: 19258.600000 ± 3774.855340 in #3


Epoch #75: 1001it [00:02, 341.12it/s, env_step=75000, gradient_step=7500, len=108, n/ep=1, n/st=100, rew=6424.00]                                                                                 


Epoch #75: test_reward: 10405.500000 ± 2149.012948, best_reward: 19258.600000 ± 3774.855340 in #3


Epoch #76: 1001it [00:03, 322.44it/s, env_step=76000, gradient_step=7600, len=145, n/ep=0, n/st=100, rew=9317.00]                                                                                 


Epoch #76: test_reward: 12467.800000 ± 5213.424245, best_reward: 19258.600000 ± 3774.855340 in #3


Epoch #77: 1001it [00:03, 296.73it/s, env_step=77000, gradient_step=7700, len=102, n/ep=0, n/st=100, rew=6302.50]                                                                                 


Epoch #77: test_reward: 11739.600000 ± 8868.874373, best_reward: 19258.600000 ± 3774.855340 in #3


Epoch #78: 1001it [00:03, 299.29it/s, env_step=78000, gradient_step=7800, len=85, n/ep=1, n/st=100, rew=4826.00]                                                                                  


Epoch #78: test_reward: 9442.600000 ± 2556.154463, best_reward: 19258.600000 ± 3774.855340 in #3


Epoch #79: 1001it [00:03, 303.90it/s, env_step=79000, gradient_step=7900, len=302, n/ep=0, n/st=100, rew=20315.00]                                                                                


Epoch #79: test_reward: 12251.000000 ± 6708.936861, best_reward: 19258.600000 ± 3774.855340 in #3


Epoch #80: 1001it [00:02, 337.59it/s, env_step=80000, gradient_step=8000, len=129, n/ep=0, n/st=100, rew=7907.00]                                                                                 


Epoch #80: test_reward: 14837.600000 ± 9055.225621, best_reward: 19258.600000 ± 3774.855340 in #3


Epoch #81: 1001it [00:02, 353.09it/s, env_step=81000, gradient_step=8100, len=138, n/ep=0, n/st=100, rew=8809.50]                                                                                 


Epoch #81: test_reward: 8698.500000 ± 3484.463266, best_reward: 19258.600000 ± 3774.855340 in #3


Epoch #82: 1001it [00:02, 359.74it/s, env_step=82000, gradient_step=8200, len=157, n/ep=0, n/st=100, rew=9447.00]                                                                                 


Epoch #82: test_reward: 14906.300000 ± 7705.866350, best_reward: 19258.600000 ± 3774.855340 in #3


Epoch #83: 1001it [00:03, 318.45it/s, env_step=83000, gradient_step=8300, len=122, n/ep=0, n/st=100, rew=7564.83]                                                                                 


Epoch #83: test_reward: 15389.200000 ± 6245.930592, best_reward: 19258.600000 ± 3774.855340 in #3


Epoch #84: 1001it [00:03, 317.53it/s, env_step=84000, gradient_step=8400, len=145, n/ep=1, n/st=100, rew=9058.50]                                                                                 


Epoch #84: test_reward: 12352.500000 ± 4364.319082, best_reward: 19258.600000 ± 3774.855340 in #3


Epoch #85: 1001it [00:03, 304.22it/s, env_step=85000, gradient_step=8500, len=137, n/ep=2, n/st=100, rew=8622.50]                                                                                 


Epoch #85: test_reward: 11996.500000 ± 5994.832762, best_reward: 19258.600000 ± 3774.855340 in #3


Epoch #86: 1001it [00:02, 343.19it/s, env_step=86000, gradient_step=8600, len=123, n/ep=3, n/st=100, rew=7803.33]                                                                                 


Epoch #86: test_reward: 11800.300000 ± 5335.333805, best_reward: 19258.600000 ± 3774.855340 in #3


Epoch #87: 1001it [00:02, 339.40it/s, env_step=87000, gradient_step=8700, len=257, n/ep=0, n/st=100, rew=17802.00]                                                                                


Epoch #87: test_reward: 8609.500000 ± 4519.367240, best_reward: 19258.600000 ± 3774.855340 in #3


Epoch #88: 1001it [00:03, 330.43it/s, env_step=88000, gradient_step=8800, len=340, n/ep=1, n/st=100, rew=23763.00]                                                                                


Epoch #88: test_reward: 13132.900000 ± 6695.565763, best_reward: 19258.600000 ± 3774.855340 in #3


Epoch #89: 1001it [00:03, 323.36it/s, env_step=89000, gradient_step=8900, len=215, n/ep=0, n/st=100, rew=14660.00]                                                                                


Epoch #89: test_reward: 10012.100000 ± 4954.605382, best_reward: 19258.600000 ± 3774.855340 in #3


Epoch #90: 1001it [00:02, 344.87it/s, env_step=90000, gradient_step=9000, len=134, n/ep=1, n/st=100, rew=8078.00]                                                                                 


Epoch #90: test_reward: 11666.100000 ± 6530.316630, best_reward: 19258.600000 ± 3774.855340 in #3


Epoch #91: 1001it [00:03, 325.10it/s, env_step=91000, gradient_step=9100, len=114, n/ep=0, n/st=100, rew=7073.00]                                                                                 


Epoch #91: test_reward: 8549.400000 ± 4321.561019, best_reward: 19258.600000 ± 3774.855340 in #3


Epoch #92: 1001it [00:03, 322.43it/s, env_step=92000, gradient_step=9200, len=114, n/ep=1, n/st=100, rew=7322.50]                                                                                 


Epoch #92: test_reward: 8369.100000 ± 3745.327715, best_reward: 19258.600000 ± 3774.855340 in #3


Epoch #93: 1001it [00:02, 342.17it/s, env_step=93000, gradient_step=9300, len=155, n/ep=0, n/st=100, rew=10234.00]                                                                                


Epoch #93: test_reward: 10754.500000 ± 6944.782837, best_reward: 19258.600000 ± 3774.855340 in #3


Epoch #94: 1001it [00:03, 324.30it/s, env_step=94000, gradient_step=9400, len=181, n/ep=0, n/st=100, rew=12056.00]                                                                                


Epoch #94: test_reward: 7722.600000 ± 1944.594364, best_reward: 19258.600000 ± 3774.855340 in #3


Epoch #95: 1001it [00:03, 315.90it/s, env_step=95000, gradient_step=9500, len=109, n/ep=2, n/st=100, rew=6717.00]                                                                                 


Epoch #95: test_reward: 8074.400000 ± 5171.050013, best_reward: 19258.600000 ± 3774.855340 in #3


Epoch #96: 1001it [00:03, 287.39it/s, env_step=96000, gradient_step=9600, len=191, n/ep=0, n/st=100, rew=12708.50]                                                                                


Epoch #96: test_reward: 8403.300000 ± 3000.045668, best_reward: 19258.600000 ± 3774.855340 in #3


Epoch #97: 1001it [00:02, 352.25it/s, env_step=97000, gradient_step=9700, len=70, n/ep=1, n/st=100, rew=3883.00]                                                                                  


Epoch #97: test_reward: 11091.200000 ± 4160.233378, best_reward: 19258.600000 ± 3774.855340 in #3


Epoch #98: 1001it [00:03, 304.11it/s, env_step=98000, gradient_step=9800, len=132, n/ep=0, n/st=100, rew=8006.00]                                                                                 


Epoch #98: test_reward: 11598.200000 ± 5733.575460, best_reward: 19258.600000 ± 3774.855340 in #3


Epoch #99: 1001it [00:02, 339.73it/s, env_step=99000, gradient_step=9900, len=107, n/ep=1, n/st=100, rew=6608.00]                                                                                 


Epoch #99: test_reward: 10199.000000 ± 4039.393519, best_reward: 19258.600000 ± 3774.855340 in #3


Epoch #100: 1001it [00:02, 341.11it/s, env_step=100000, gradient_step=10000, len=153, n/ep=0, n/st=100, rew=9847.00]                                                                              


Epoch #100: test_reward: 6468.200000 ± 1036.574532, best_reward: 19258.600000 ± 3774.855340 in #3

InfoStats(gradient_step=10000, best_reward=19258.6, best_reward_std=3774.8553402746443, train_step=100000, train_episode=592, test_step=177707, test_episode=1010, timing=TimingStats(total_time=422.7239112854004, train_time=313.0587546825409, train_time_collect=34.67023587226868, train_time_update=274.1503140926361, test_time=109.6651566028595, update_speed=319.42885641836017))

(the trained policy can be accessed via policy.policies[agents[1]])
Train with agent_opponent #31


Epoch #1: 1001it [00:02, 346.68it/s, env_step=1000, gradient_step=100, len=0, n/ep=0, n/st=100, rew=0.00]                                                                                         


Epoch #1: test_reward: 12392.800000 ± 3940.577059, best_reward: 12392.800000 ± 3940.577059 in #1


Epoch #2: 1001it [00:02, 351.77it/s, env_step=2000, gradient_step=200, len=0, n/ep=0, n/st=100, rew=0.00]                                                                                         


Epoch #2: test_reward: 13127.900000 ± 6586.477700, best_reward: 13127.900000 ± 6586.477700 in #2


Epoch #3: 1001it [00:03, 296.26it/s, env_step=3000, gradient_step=300, len=0, n/ep=0, n/st=100, rew=0.00]                                                                                         


Epoch #3: test_reward: 15956.700000 ± 4581.981952, best_reward: 15956.700000 ± 4581.981952 in #3


Epoch #4: 1001it [00:02, 335.53it/s, env_step=4000, gradient_step=400, len=0, n/ep=0, n/st=100, rew=0.00]                                                                                         


Epoch #4: test_reward: 11135.400000 ± 3946.348393, best_reward: 15956.700000 ± 4581.981952 in #3


Epoch #5: 1001it [00:03, 329.97it/s, env_step=5000, gradient_step=500, len=0, n/ep=0, n/st=100, rew=0.00]                                                                                         


Epoch #5: test_reward: 11770.700000 ± 3589.839886, best_reward: 15956.700000 ± 4581.981952 in #3


Epoch #6: 1001it [00:03, 301.00it/s, env_step=6000, gradient_step=600, len=0, n/ep=0, n/st=100, rew=0.00]                                                                                         


Epoch #6: test_reward: 10310.600000 ± 3582.586222, best_reward: 15956.700000 ± 4581.981952 in #3


Epoch #7: 1001it [00:03, 298.16it/s, env_step=7000, gradient_step=700, len=66, n/ep=0, n/st=100, rew=3047.00]                                                                                     


Epoch #7: test_reward: 9605.000000 ± 3213.708294, best_reward: 15956.700000 ± 4581.981952 in #3


Epoch #8: 1001it [00:03, 266.47it/s, env_step=8000, gradient_step=800, len=66, n/ep=0, n/st=100, rew=3047.00]                                                                                     


Epoch #8: test_reward: 8970.600000 ± 2233.321123, best_reward: 15956.700000 ± 4581.981952 in #3


Epoch #9: 1001it [00:03, 260.12it/s, env_step=9000, gradient_step=900, len=86, n/ep=0, n/st=100, rew=4568.00]                                                                                     


Epoch #9: test_reward: 8951.300000 ± 5156.085764, best_reward: 15956.700000 ± 4581.981952 in #3


Epoch #10: 1001it [00:03, 266.07it/s, env_step=10000, gradient_step=1000, len=86, n/ep=0, n/st=100, rew=4568.00]                                                                                  


Epoch #10: test_reward: 15105.200000 ± 8270.934661, best_reward: 15956.700000 ± 4581.981952 in #3


Epoch #11: 1001it [00:03, 319.21it/s, env_step=11000, gradient_step=1100, len=45, n/ep=0, n/st=100, rew=2095.00]                                                                                  


Epoch #11: test_reward: 14345.200000 ± 8326.537381, best_reward: 15956.700000 ± 4581.981952 in #3


Epoch #12: 1001it [00:03, 263.74it/s, env_step=12000, gradient_step=1200, len=115, n/ep=0, n/st=100, rew=5740.00]                                                                                 


Epoch #12: test_reward: 8418.900000 ± 3136.217609, best_reward: 15956.700000 ± 4581.981952 in #3


Epoch #13: 1001it [00:03, 312.40it/s, env_step=13000, gradient_step=1300, len=129, n/ep=0, n/st=100, rew=7316.00]                                                                                 


Epoch #13: test_reward: 10199.900000 ± 3473.661482, best_reward: 15956.700000 ± 4581.981952 in #3


Epoch #14: 1001it [00:03, 318.92it/s, env_step=14000, gradient_step=1400, len=140, n/ep=1, n/st=100, rew=8776.50]                                                                                 


Epoch #14: test_reward: 8431.500000 ± 2594.798152, best_reward: 15956.700000 ± 4581.981952 in #3


Epoch #15: 1001it [00:03, 299.96it/s, env_step=15000, gradient_step=1500, len=150, n/ep=1, n/st=100, rew=8904.00]                                                                                 


Epoch #15: test_reward: 12246.500000 ± 3277.404377, best_reward: 15956.700000 ± 4581.981952 in #3


Epoch #16: 1001it [00:03, 322.93it/s, env_step=16000, gradient_step=1600, len=159, n/ep=0, n/st=100, rew=9578.00]                                                                                 


Epoch #16: test_reward: 10798.200000 ± 4734.196063, best_reward: 15956.700000 ± 4581.981952 in #3


Epoch #17: 1001it [00:03, 301.82it/s, env_step=17000, gradient_step=1700, len=170, n/ep=1, n/st=100, rew=10377.00]                                                                                


Epoch #17: test_reward: 9273.500000 ± 3289.884230, best_reward: 15956.700000 ± 4581.981952 in #3


Epoch #18: 1001it [00:03, 321.97it/s, env_step=18000, gradient_step=1800, len=180, n/ep=2, n/st=100, rew=11003.00]                                                                                


Epoch #18: test_reward: 10950.600000 ± 3072.785876, best_reward: 15956.700000 ± 4581.981952 in #3


Epoch #19: 1001it [00:03, 322.64it/s, env_step=19000, gradient_step=1900, len=190, n/ep=2, n/st=100, rew=12059.25]                                                                                


Epoch #19: test_reward: 10687.200000 ± 3425.235694, best_reward: 15956.700000 ± 4581.981952 in #3


Epoch #20: 1001it [00:02, 335.40it/s, env_step=20000, gradient_step=2000, len=199, n/ep=0, n/st=100, rew=11811.00]                                                                                


Epoch #20: test_reward: 8322.000000 ± 3534.975672, best_reward: 15956.700000 ± 4581.981952 in #3


Epoch #21: 1001it [00:03, 291.52it/s, env_step=21000, gradient_step=2100, len=210, n/ep=1, n/st=100, rew=13382.00]                                                                                


Epoch #21: test_reward: 12575.900000 ± 3322.892308, best_reward: 15956.700000 ± 4581.981952 in #3


Epoch #22: 1001it [00:03, 319.07it/s, env_step=22000, gradient_step=2200, len=220, n/ep=1, n/st=100, rew=14368.00]                                                                                


Epoch #22: test_reward: 10964.300000 ± 4390.764012, best_reward: 15956.700000 ± 4581.981952 in #3


Epoch #23: 1001it [00:03, 327.03it/s, env_step=23000, gradient_step=2300, len=223, n/ep=0, n/st=100, rew=12965.50]                                                                                


Epoch #23: test_reward: 7973.000000 ± 2278.575915, best_reward: 15956.700000 ± 4581.981952 in #3


Epoch #24: 1001it [00:03, 307.25it/s, env_step=24000, gradient_step=2400, len=235, n/ep=0, n/st=100, rew=14808.00]                                                                                


Epoch #24: test_reward: 10198.800000 ± 2849.082407, best_reward: 15956.700000 ± 4581.981952 in #3


Epoch #25: 1001it [00:03, 308.19it/s, env_step=25000, gradient_step=2500, len=250, n/ep=1, n/st=100, rew=15967.00]                                                                                


Epoch #25: test_reward: 11577.600000 ± 2767.142288, best_reward: 15956.700000 ± 4581.981952 in #3


Epoch #26: 1001it [00:03, 294.34it/s, env_step=26000, gradient_step=2600, len=138, n/ep=1, n/st=100, rew=8508.00]                                                                                 


Epoch #26: test_reward: 13655.000000 ± 2998.344376, best_reward: 15956.700000 ± 4581.981952 in #3


Epoch #27: 1001it [00:03, 302.77it/s, env_step=27000, gradient_step=2700, len=128, n/ep=0, n/st=100, rew=8050.00]                                                                                 


Epoch #27: test_reward: 13231.900000 ± 5929.692209, best_reward: 15956.700000 ± 4581.981952 in #3


Epoch #28: 1001it [00:02, 344.27it/s, env_step=28000, gradient_step=2800, len=81, n/ep=1, n/st=100, rew=4618.00]                                                                                  


Epoch #28: test_reward: 9290.000000 ± 3203.822467, best_reward: 15956.700000 ± 4581.981952 in #3


Epoch #29: 1001it [00:03, 313.34it/s, env_step=29000, gradient_step=2900, len=93, n/ep=0, n/st=100, rew=4883.00]                                                                                  


Epoch #29: test_reward: 13326.100000 ± 5043.824510, best_reward: 15956.700000 ± 4581.981952 in #3


Epoch #30: 1001it [00:02, 342.07it/s, env_step=30000, gradient_step=3000, len=142, n/ep=1, n/st=100, rew=9242.50]                                                                                 


Epoch #30: test_reward: 9746.400000 ± 3971.299188, best_reward: 15956.700000 ± 4581.981952 in #3


Epoch #31: 1001it [00:02, 337.91it/s, env_step=31000, gradient_step=3100, len=155, n/ep=2, n/st=100, rew=8987.25]                                                                                 


Epoch #31: test_reward: 15199.400000 ± 4787.527884, best_reward: 15956.700000 ± 4581.981952 in #3


Epoch #32: 1001it [00:03, 331.62it/s, env_step=32000, gradient_step=3200, len=133, n/ep=0, n/st=100, rew=8245.50]                                                                                 


Epoch #32: test_reward: 10327.900000 ± 3433.659460, best_reward: 15956.700000 ± 4581.981952 in #3


Epoch #33: 1001it [00:03, 280.81it/s, env_step=33000, gradient_step=3300, len=140, n/ep=2, n/st=100, rew=8573.75]                                                                                 


Epoch #33: test_reward: 11189.900000 ± 4118.917126, best_reward: 15956.700000 ± 4581.981952 in #3


Epoch #34: 1001it [00:03, 274.15it/s, env_step=34000, gradient_step=3400, len=128, n/ep=1, n/st=100, rew=8033.00]                                                                                 


Epoch #34: test_reward: 9515.100000 ± 3134.183225, best_reward: 15956.700000 ± 4581.981952 in #3


Epoch #35: 1001it [00:03, 300.42it/s, env_step=35000, gradient_step=3500, len=170, n/ep=0, n/st=100, rew=11104.50]                                                                                


Epoch #35: test_reward: 9159.400000 ± 1783.821078, best_reward: 15956.700000 ± 4581.981952 in #3


Epoch #36: 1001it [00:03, 307.45it/s, env_step=36000, gradient_step=3600, len=110, n/ep=1, n/st=100, rew=5277.00]                                                                                 


Epoch #36: test_reward: 8653.700000 ± 2134.003283, best_reward: 15956.700000 ± 4581.981952 in #3


Epoch #37: 1001it [00:03, 322.88it/s, env_step=37000, gradient_step=3700, len=203, n/ep=1, n/st=100, rew=13305.50]                                                                                


Epoch #37: test_reward: 10658.000000 ± 3961.633199, best_reward: 15956.700000 ± 4581.981952 in #3


Epoch #38: 1001it [00:03, 272.46it/s, env_step=38000, gradient_step=3800, len=202, n/ep=0, n/st=100, rew=13676.00]                                                                                


Epoch #38: test_reward: 12294.000000 ± 6310.293812, best_reward: 15956.700000 ± 4581.981952 in #3


Epoch #39: 1001it [00:02, 333.99it/s, env_step=39000, gradient_step=3900, len=129, n/ep=1, n/st=100, rew=7838.50]                                                                                 


Epoch #39: test_reward: 8994.200000 ± 3422.923131, best_reward: 15956.700000 ± 4581.981952 in #3


Epoch #40: 1001it [00:03, 324.42it/s, env_step=40000, gradient_step=4000, len=400, n/ep=5, n/st=100, rew=26744.40]                                                                                


Epoch #40: test_reward: 10141.900000 ± 4201.118814, best_reward: 15956.700000 ± 4581.981952 in #3


Epoch #41: 1001it [00:02, 343.82it/s, env_step=41000, gradient_step=4100, len=136, n/ep=0, n/st=100, rew=8677.00]                                                                                 


Epoch #41: test_reward: 9387.500000 ± 3932.761453, best_reward: 15956.700000 ± 4581.981952 in #3


Epoch #42: 1001it [00:03, 303.69it/s, env_step=42000, gradient_step=4200, len=242, n/ep=1, n/st=100, rew=16536.00]                                                                                


Epoch #42: test_reward: 10666.600000 ± 3683.241730, best_reward: 15956.700000 ± 4581.981952 in #3


Epoch #43: 1001it [00:03, 312.07it/s, env_step=43000, gradient_step=4300, len=150, n/ep=0, n/st=100, rew=9306.00]                                                                                 


Epoch #43: test_reward: 11753.500000 ± 4029.433912, best_reward: 15956.700000 ± 4581.981952 in #3


Epoch #44: 1001it [00:03, 268.83it/s, env_step=44000, gradient_step=4400, len=181, n/ep=1, n/st=100, rew=10602.00]                                                                                


Epoch #44: test_reward: 11540.500000 ± 4872.791525, best_reward: 15956.700000 ± 4581.981952 in #3


Epoch #45: 1001it [00:03, 308.54it/s, env_step=45000, gradient_step=4500, len=193, n/ep=0, n/st=100, rew=12327.33]                                                                                


Epoch #45: test_reward: 11927.800000 ± 5702.887388, best_reward: 15956.700000 ± 4581.981952 in #3


Epoch #46: 1001it [00:03, 281.36it/s, env_step=46000, gradient_step=4600, len=212, n/ep=1, n/st=100, rew=14621.00]                                                                                


Epoch #46: test_reward: 9894.900000 ± 2358.726879, best_reward: 15956.700000 ± 4581.981952 in #3


Epoch #47: 1001it [00:02, 335.41it/s, env_step=47000, gradient_step=4700, len=225, n/ep=0, n/st=100, rew=13129.00]                                                                                


Epoch #47: test_reward: 9532.000000 ± 1831.649311, best_reward: 15956.700000 ± 4581.981952 in #3


Epoch #48: 1001it [00:02, 334.81it/s, env_step=48000, gradient_step=4800, len=262, n/ep=1, n/st=100, rew=17320.50]                                                                                


Epoch #48: test_reward: 11597.400000 ± 6677.470228, best_reward: 15956.700000 ± 4581.981952 in #3


Epoch #49: 1001it [00:03, 284.81it/s, env_step=49000, gradient_step=4900, len=201, n/ep=0, n/st=100, rew=10921.00]                                                                                


Epoch #49: test_reward: 10789.300000 ± 4454.470318, best_reward: 15956.700000 ± 4581.981952 in #3


Epoch #50: 1001it [00:03, 275.71it/s, env_step=50000, gradient_step=5000, len=213, n/ep=1, n/st=100, rew=14278.50]                                                                                


Epoch #50: test_reward: 9883.700000 ± 2236.791991, best_reward: 15956.700000 ± 4581.981952 in #3


Epoch #51: 1001it [00:03, 330.44it/s, env_step=51000, gradient_step=5100, len=239, n/ep=2, n/st=100, rew=15567.25]                                                                                


Epoch #51: test_reward: 13649.200000 ± 8350.885747, best_reward: 15956.700000 ± 4581.981952 in #3


Epoch #52: 1001it [00:03, 288.74it/s, env_step=52000, gradient_step=5200, len=317, n/ep=1, n/st=100, rew=21851.00]                                                                                


Epoch #52: test_reward: 11747.500000 ± 3156.587437, best_reward: 15956.700000 ± 4581.981952 in #3


Epoch #53: 1001it [00:03, 273.38it/s, env_step=53000, gradient_step=5300, len=171, n/ep=1, n/st=100, rew=10722.00]                                                                                


Epoch #53: test_reward: 15242.600000 ± 7643.720707, best_reward: 15956.700000 ± 4581.981952 in #3


Epoch #54: 1001it [00:03, 323.36it/s, env_step=54000, gradient_step=5400, len=165, n/ep=3, n/st=100, rew=10405.50]                                                                                


Epoch #54: test_reward: 14214.000000 ± 5704.028191, best_reward: 15956.700000 ± 4581.981952 in #3


Epoch #55: 1001it [00:03, 328.38it/s, env_step=55000, gradient_step=5500, len=201, n/ep=2, n/st=100, rew=11953.00]                                                                                


Epoch #55: test_reward: 13871.900000 ± 8073.044314, best_reward: 15956.700000 ± 4581.981952 in #3


Epoch #56: 1001it [00:03, 296.67it/s, env_step=56000, gradient_step=5600, len=142, n/ep=0, n/st=100, rew=9032.83]                                                                                 


Epoch #56: test_reward: 11458.700000 ± 4658.194372, best_reward: 15956.700000 ± 4581.981952 in #3


Epoch #57: 1001it [00:02, 339.38it/s, env_step=57000, gradient_step=5700, len=124, n/ep=1, n/st=100, rew=7685.50]                                                                                 


Epoch #57: test_reward: 11269.000000 ± 3004.507614, best_reward: 15956.700000 ± 4581.981952 in #3


Epoch #58: 1001it [00:03, 306.77it/s, env_step=58000, gradient_step=5800, len=174, n/ep=1, n/st=100, rew=10752.50]                                                                                


Epoch #58: test_reward: 14683.000000 ± 6491.344545, best_reward: 15956.700000 ± 4581.981952 in #3


Epoch #59: 1001it [00:03, 312.18it/s, env_step=59000, gradient_step=5900, len=98, n/ep=1, n/st=100, rew=5194.00]                                                                                  


Epoch #59: test_reward: 15371.100000 ± 7555.915688, best_reward: 15956.700000 ± 4581.981952 in #3


Epoch #60: 1001it [00:02, 337.72it/s, env_step=60000, gradient_step=6000, len=211, n/ep=2, n/st=100, rew=14066.00]                                                                                


Epoch #60: test_reward: 11478.400000 ± 5297.837261, best_reward: 15956.700000 ± 4581.981952 in #3


Epoch #61: 1001it [00:03, 296.86it/s, env_step=61000, gradient_step=6100, len=148, n/ep=1, n/st=100, rew=9114.00]                                                                                 


Epoch #61: test_reward: 15434.500000 ± 5783.931280, best_reward: 15956.700000 ± 4581.981952 in #3


Epoch #62: 1001it [00:03, 331.70it/s, env_step=62000, gradient_step=6200, len=199, n/ep=0, n/st=100, rew=13354.00]                                                                                


Epoch #62: test_reward: 15299.500000 ± 5677.122603, best_reward: 15956.700000 ± 4581.981952 in #3


Epoch #63: 1001it [00:03, 296.06it/s, env_step=63000, gradient_step=6300, len=184, n/ep=1, n/st=100, rew=12511.00]                                                                                


Epoch #63: test_reward: 15924.700000 ± 7580.199391, best_reward: 15956.700000 ± 4581.981952 in #3


Epoch #64: 1001it [00:03, 304.30it/s, env_step=64000, gradient_step=6400, len=70, n/ep=0, n/st=100, rew=3811.50]                                                                                  


Epoch #64: test_reward: 14404.700000 ± 6379.032498, best_reward: 15956.700000 ± 4581.981952 in #3


Epoch #65: 1001it [00:03, 298.50it/s, env_step=65000, gradient_step=6500, len=205, n/ep=1, n/st=100, rew=13681.50]                                                                                


Epoch #65: test_reward: 14033.400000 ± 6696.874154, best_reward: 15956.700000 ± 4581.981952 in #3


Epoch #66: 1001it [00:03, 285.75it/s, env_step=66000, gradient_step=6600, len=190, n/ep=0, n/st=100, rew=13061.50]                                                                                


Epoch #66: test_reward: 10990.500000 ± 4566.974781, best_reward: 15956.700000 ± 4581.981952 in #3


Epoch #67: 1001it [00:03, 264.84it/s, env_step=67000, gradient_step=6700, len=338, n/ep=0, n/st=100, rew=23083.00]                                                                                


Epoch #67: test_reward: 17836.400000 ± 6259.819873, best_reward: 17836.400000 ± 6259.819873 in #67


Epoch #68: 1001it [00:03, 281.81it/s, env_step=68000, gradient_step=6800, len=149, n/ep=1, n/st=100, rew=9631.00]                                                                                 


Epoch #68: test_reward: 8855.700000 ± 2183.620024, best_reward: 17836.400000 ± 6259.819873 in #67


Epoch #69: 1001it [00:03, 292.37it/s, env_step=69000, gradient_step=6900, len=148, n/ep=2, n/st=100, rew=9533.50]                                                                                 


Epoch #69: test_reward: 10783.900000 ± 6094.279062, best_reward: 17836.400000 ± 6259.819873 in #67


Epoch #70: 1001it [00:02, 338.24it/s, env_step=70000, gradient_step=7000, len=200, n/ep=1, n/st=100, rew=13917.00]                                                                                


Epoch #70: test_reward: 12740.300000 ± 6553.567609, best_reward: 17836.400000 ± 6259.819873 in #67


Epoch #71: 1001it [00:03, 320.29it/s, env_step=71000, gradient_step=7100, len=345, n/ep=0, n/st=100, rew=25238.00]                                                                                


Epoch #71: test_reward: 9376.400000 ± 2985.142281, best_reward: 17836.400000 ± 6259.819873 in #67


Epoch #72: 1001it [00:03, 274.60it/s, env_step=72000, gradient_step=7200, len=169, n/ep=0, n/st=100, rew=10900.50]                                                                                


Epoch #72: test_reward: 14825.400000 ± 5148.799495, best_reward: 17836.400000 ± 6259.819873 in #67


Epoch #73: 1001it [00:03, 289.10it/s, env_step=73000, gradient_step=7300, len=174, n/ep=0, n/st=100, rew=10909.00]                                                                                


Epoch #73: test_reward: 17231.800000 ± 9042.865318, best_reward: 17836.400000 ± 6259.819873 in #67


Epoch #74: 1001it [00:02, 337.26it/s, env_step=74000, gradient_step=7400, len=239, n/ep=0, n/st=100, rew=16153.00]                                                                                


Epoch #74: test_reward: 10503.800000 ± 3029.699417, best_reward: 17836.400000 ± 6259.819873 in #67


Epoch #75: 1001it [00:03, 323.31it/s, env_step=75000, gradient_step=7500, len=375, n/ep=1, n/st=100, rew=26620.00]                                                                                


Epoch #75: test_reward: 7230.800000 ± 2349.066742, best_reward: 17836.400000 ± 6259.819873 in #67


Epoch #76: 1001it [00:03, 306.91it/s, env_step=76000, gradient_step=7600, len=175, n/ep=0, n/st=100, rew=11604.00]                                                                                


Epoch #76: test_reward: 11130.100000 ± 3504.514673, best_reward: 17836.400000 ± 6259.819873 in #67


Epoch #77: 1001it [00:03, 333.30it/s, env_step=77000, gradient_step=7700, len=86, n/ep=1, n/st=100, rew=4754.00]                                                                                  


Epoch #77: test_reward: 8505.600000 ± 1704.934087, best_reward: 17836.400000 ± 6259.819873 in #67


Epoch #78: 1001it [00:03, 304.80it/s, env_step=78000, gradient_step=7800, len=369, n/ep=1, n/st=100, rew=22702.00]                                                                                


Epoch #78: test_reward: 9827.600000 ± 3050.048760, best_reward: 17836.400000 ± 6259.819873 in #67


Epoch #79: 1001it [00:03, 311.52it/s, env_step=79000, gradient_step=7900, len=219, n/ep=0, n/st=100, rew=15173.00]                                                                                


Epoch #79: test_reward: 10448.900000 ± 7449.864300, best_reward: 17836.400000 ± 6259.819873 in #67


Epoch #80: 1001it [00:03, 302.80it/s, env_step=80000, gradient_step=8000, len=400, n/ep=1, n/st=100, rew=27936.00]                                                                                


Epoch #80: test_reward: 10108.700000 ± 4352.448140, best_reward: 17836.400000 ± 6259.819873 in #67


Epoch #81: 1001it [00:03, 326.23it/s, env_step=81000, gradient_step=8100, len=279, n/ep=0, n/st=100, rew=19752.67]                                                                                


Epoch #81: test_reward: 8473.300000 ± 3899.858742, best_reward: 17836.400000 ± 6259.819873 in #67


Epoch #82: 1001it [00:03, 322.72it/s, env_step=82000, gradient_step=8200, len=117, n/ep=0, n/st=100, rew=7775.50]                                                                                 


Epoch #82: test_reward: 11822.300000 ± 4142.067094, best_reward: 17836.400000 ± 6259.819873 in #67


Epoch #83: 1001it [00:02, 338.80it/s, env_step=83000, gradient_step=8300, len=124, n/ep=0, n/st=100, rew=7971.00]                                                                                 


Epoch #83: test_reward: 11403.200000 ± 5713.034514, best_reward: 17836.400000 ± 6259.819873 in #67


Epoch #84: 1001it [00:03, 327.87it/s, env_step=84000, gradient_step=8400, len=135, n/ep=1, n/st=100, rew=8882.00]                                                                                 


Epoch #84: test_reward: 9014.400000 ± 2305.643433, best_reward: 17836.400000 ± 6259.819873 in #67


Epoch #85: 1001it [00:03, 293.20it/s, env_step=85000, gradient_step=8500, len=193, n/ep=2, n/st=100, rew=12605.25]                                                                                


Epoch #85: test_reward: 11747.700000 ± 3277.358420, best_reward: 17836.400000 ± 6259.819873 in #67


Epoch #86: 1001it [00:03, 327.85it/s, env_step=86000, gradient_step=8600, len=312, n/ep=1, n/st=100, rew=19855.00]                                                                                


Epoch #86: test_reward: 8968.000000 ± 4142.757343, best_reward: 17836.400000 ± 6259.819873 in #67


Epoch #87: 1001it [00:03, 330.71it/s, env_step=87000, gradient_step=8700, len=189, n/ep=0, n/st=100, rew=12969.50]                                                                                


Epoch #87: test_reward: 9223.800000 ± 2572.140968, best_reward: 17836.400000 ± 6259.819873 in #67


Epoch #88: 1001it [00:02, 336.87it/s, env_step=88000, gradient_step=8800, len=256, n/ep=2, n/st=100, rew=17577.50]                                                                                


Epoch #88: test_reward: 12055.500000 ± 4160.405443, best_reward: 17836.400000 ± 6259.819873 in #67


Epoch #89: 1001it [00:03, 333.13it/s, env_step=89000, gradient_step=8900, len=123, n/ep=0, n/st=100, rew=7781.50]                                                                                 


Epoch #89: test_reward: 10586.600000 ± 7102.914334, best_reward: 17836.400000 ± 6259.819873 in #67


Epoch #90: 1001it [00:03, 305.88it/s, env_step=90000, gradient_step=9000, len=208, n/ep=1, n/st=100, rew=14343.00]                                                                                


Epoch #90: test_reward: 14886.100000 ± 6663.659092, best_reward: 17836.400000 ± 6259.819873 in #67


Epoch #91: 1001it [00:02, 338.07it/s, env_step=91000, gradient_step=9100, len=146, n/ep=0, n/st=100, rew=9387.00]                                                                                 


Epoch #91: test_reward: 14142.900000 ± 3834.724748, best_reward: 17836.400000 ± 6259.819873 in #67


Epoch #92: 1001it [00:03, 325.45it/s, env_step=92000, gradient_step=9200, len=139, n/ep=0, n/st=100, rew=8394.00]                                                                                 


Epoch #92: test_reward: 14096.600000 ± 5935.854483, best_reward: 17836.400000 ± 6259.819873 in #67


Epoch #93: 1001it [00:03, 314.54it/s, env_step=93000, gradient_step=9300, len=383, n/ep=1, n/st=100, rew=25182.00]                                                                                


Epoch #93: test_reward: 11712.600000 ± 8144.266415, best_reward: 17836.400000 ± 6259.819873 in #67


Epoch #94: 1001it [00:03, 327.92it/s, env_step=94000, gradient_step=9400, len=120, n/ep=0, n/st=100, rew=6928.75]                                                                                 


Epoch #94: test_reward: 15455.300000 ± 6520.330698, best_reward: 17836.400000 ± 6259.819873 in #67


Epoch #95: 1001it [00:03, 303.62it/s, env_step=95000, gradient_step=9500, len=105, n/ep=0, n/st=100, rew=5268.00]                                                                                 


Epoch #95: test_reward: 11567.400000 ± 6745.470439, best_reward: 17836.400000 ± 6259.819873 in #67


Epoch #96: 1001it [00:03, 283.30it/s, env_step=96000, gradient_step=9600, len=110, n/ep=1, n/st=100, rew=7232.00]                                                                                 


Epoch #96: test_reward: 10601.400000 ± 6162.935343, best_reward: 17836.400000 ± 6259.819873 in #67


Epoch #97: 1001it [00:03, 328.67it/s, env_step=97000, gradient_step=9700, len=121, n/ep=0, n/st=100, rew=7254.50]                                                                                 


Epoch #97: test_reward: 10242.600000 ± 5919.274148, best_reward: 17836.400000 ± 6259.819873 in #67


Epoch #98: 1001it [00:03, 292.22it/s, env_step=98000, gradient_step=9800, len=90, n/ep=2, n/st=100, rew=4545.50]                                                                                  


Epoch #98: test_reward: 10368.700000 ± 3369.468980, best_reward: 17836.400000 ± 6259.819873 in #67


Epoch #99: 1001it [00:03, 331.62it/s, env_step=99000, gradient_step=9900, len=251, n/ep=2, n/st=100, rew=15633.50]                                                                                


Epoch #99: test_reward: 13710.100000 ± 7445.479279, best_reward: 17836.400000 ± 6259.819873 in #67


Epoch #100: 1001it [00:03, 323.63it/s, env_step=100000, gradient_step=10000, len=216, n/ep=1, n/st=100, rew=13263.00]                                                                             


Epoch #100: test_reward: 9878.600000 ± 2448.883876, best_reward: 17836.400000 ± 6259.819873 in #67

InfoStats(gradient_step=10000, best_reward=17836.4, best_reward_std=6259.819872807842, train_step=100000, train_episode=523, test_step=179965, test_episode=1010, timing=TimingStats(total_time=429.04335832595825, train_time=324.6278221607208, train_time_collect=33.34372043609619, train_time_update=287.15552520751953, test_time=104.41553616523743, update_speed=308.0450693794531))

(the trained policy can be accessed via policy.policies[agents[1]])
Train with agent_opponent #34


Epoch #1: 1001it [00:03, 306.49it/s, env_step=1000, gradient_step=100, len=0, n/ep=0, n/st=100, rew=0.00]                                                                                         


Epoch #1: test_reward: 14804.500000 ± 6946.489088, best_reward: 14804.500000 ± 6946.489088 in #1


Epoch #2: 1001it [00:03, 279.25it/s, env_step=2000, gradient_step=200, len=0, n/ep=0, n/st=100, rew=0.00]                                                                                         


Epoch #2: test_reward: 13586.900000 ± 5979.512429, best_reward: 14804.500000 ± 6946.489088 in #1


Epoch #3: 1001it [00:03, 292.75it/s, env_step=3000, gradient_step=300, len=0, n/ep=0, n/st=100, rew=0.00]                                                                                         


Epoch #3: test_reward: 16040.100000 ± 4672.789840, best_reward: 16040.100000 ± 4672.789840 in #3


Epoch #4: 1001it [00:03, 306.73it/s, env_step=4000, gradient_step=400, len=0, n/ep=0, n/st=100, rew=0.00]                                                                                         


Epoch #4: test_reward: 18257.300000 ± 4532.129854, best_reward: 18257.300000 ± 4532.129854 in #4


Epoch #5: 1001it [00:03, 304.05it/s, env_step=5000, gradient_step=500, len=48, n/ep=0, n/st=100, rew=1075.00]                                                                                     


Epoch #5: test_reward: 13488.100000 ± 2856.353145, best_reward: 18257.300000 ± 4532.129854 in #4


Epoch #6: 1001it [00:03, 310.20it/s, env_step=6000, gradient_step=600, len=48, n/ep=0, n/st=100, rew=1075.00]                                                                                     


Epoch #6: test_reward: 13701.200000 ± 5644.625493, best_reward: 18257.300000 ± 4532.129854 in #4


Epoch #7: 1001it [00:03, 292.09it/s, env_step=7000, gradient_step=700, len=48, n/ep=0, n/st=100, rew=1075.00]                                                                                     


Epoch #7: test_reward: 11067.500000 ± 5199.723882, best_reward: 18257.300000 ± 4532.129854 in #4


Epoch #8: 1001it [00:03, 276.97it/s, env_step=8000, gradient_step=800, len=80, n/ep=2, n/st=100, rew=2825.00]                                                                                     


Epoch #8: test_reward: 9639.500000 ± 3730.503190, best_reward: 18257.300000 ± 4532.129854 in #4


Epoch #9: 1001it [00:03, 274.67it/s, env_step=9000, gradient_step=900, len=84, n/ep=0, n/st=100, rew=2119.75]                                                                                     


Epoch #9: test_reward: 11768.200000 ± 3409.814241, best_reward: 18257.300000 ± 4532.129854 in #4


Epoch #10: 1001it [00:03, 277.16it/s, env_step=10000, gradient_step=1000, len=96, n/ep=0, n/st=100, rew=3364.00]                                                                                  


Epoch #10: test_reward: 11052.000000 ± 3607.247150, best_reward: 18257.300000 ± 4532.129854 in #4


Epoch #11: 1001it [00:03, 284.27it/s, env_step=11000, gradient_step=1100, len=110, n/ep=2, n/st=100, rew=4573.50]                                                                                 


Epoch #11: test_reward: 9961.000000 ± 2674.636461, best_reward: 18257.300000 ± 4532.129854 in #4


Epoch #12: 1001it [00:03, 297.42it/s, env_step=12000, gradient_step=1200, len=118, n/ep=0, n/st=100, rew=4366.00]                                                                                 


Epoch #12: test_reward: 11594.000000 ± 4636.529305, best_reward: 18257.300000 ± 4532.129854 in #4


Epoch #13: 1001it [00:03, 315.55it/s, env_step=13000, gradient_step=1300, len=130, n/ep=1, n/st=100, rew=4627.00]                                                                                 


Epoch #13: test_reward: 11431.900000 ± 6819.636947, best_reward: 18257.300000 ± 4532.129854 in #4


Epoch #14: 1001it [00:03, 306.14it/s, env_step=14000, gradient_step=1400, len=140, n/ep=1, n/st=100, rew=5731.00]                                                                                 


Epoch #14: test_reward: 11230.600000 ± 4393.712148, best_reward: 18257.300000 ± 4532.129854 in #4


Epoch #15: 1001it [00:03, 281.85it/s, env_step=15000, gradient_step=1500, len=150, n/ep=2, n/st=100, rew=7487.50]                                                                                 


Epoch #15: test_reward: 11777.000000 ± 2791.205367, best_reward: 18257.300000 ± 4532.129854 in #4


Epoch #16: 1001it [00:03, 269.23it/s, env_step=16000, gradient_step=1600, len=160, n/ep=1, n/st=100, rew=6686.00]                                                                                 


Epoch #16: test_reward: 14657.800000 ± 3066.731870, best_reward: 18257.300000 ± 4532.129854 in #4


Epoch #17: 1001it [00:03, 287.77it/s, env_step=17000, gradient_step=1700, len=168, n/ep=0, n/st=100, rew=6049.00]                                                                                 


Epoch #17: test_reward: 12226.000000 ± 4111.211038, best_reward: 18257.300000 ± 4532.129854 in #4


Epoch #18: 1001it [00:03, 303.43it/s, env_step=18000, gradient_step=1800, len=180, n/ep=2, n/st=100, rew=8871.50]                                                                                 


Epoch #18: test_reward: 10531.600000 ± 7259.256810, best_reward: 18257.300000 ± 4532.129854 in #4


Epoch #19: 1001it [00:03, 318.80it/s, env_step=19000, gradient_step=1900, len=182, n/ep=0, n/st=100, rew=7564.50]                                                                                 


Epoch #19: test_reward: 12320.800000 ± 7672.709727, best_reward: 18257.300000 ± 4532.129854 in #4


Epoch #20: 1001it [00:02, 340.06it/s, env_step=20000, gradient_step=2000, len=200, n/ep=1, n/st=100, rew=9063.00]                                                                                 


Epoch #20: test_reward: 10850.000000 ± 4465.837167, best_reward: 18257.300000 ± 4532.129854 in #4


Epoch #21: 1001it [00:03, 324.48it/s, env_step=21000, gradient_step=2100, len=52, n/ep=1, n/st=100, rew=2113.00]                                                                                  


Epoch #21: test_reward: 14777.500000 ± 7524.353092, best_reward: 18257.300000 ± 4532.129854 in #4


Epoch #22: 1001it [00:03, 280.61it/s, env_step=22000, gradient_step=2200, len=40, n/ep=1, n/st=100, rew=1524.00]                                                                                  


Epoch #22: test_reward: 17302.200000 ± 5445.467359, best_reward: 18257.300000 ± 4532.129854 in #4


Epoch #23: 1001it [00:03, 311.37it/s, env_step=23000, gradient_step=2300, len=230, n/ep=1, n/st=100, rew=9892.00]                                                                                 


Epoch #23: test_reward: 10536.200000 ± 6595.247528, best_reward: 18257.300000 ± 4532.129854 in #4


Epoch #24: 1001it [00:03, 278.25it/s, env_step=24000, gradient_step=2400, len=238, n/ep=0, n/st=100, rew=11299.33]                                                                                


Epoch #24: test_reward: 10016.700000 ± 1769.425729, best_reward: 18257.300000 ± 4532.129854 in #4


Epoch #25: 1001it [00:03, 279.76it/s, env_step=25000, gradient_step=2500, len=153, n/ep=3, n/st=100, rew=7906.33]                                                                                 


Epoch #25: test_reward: 13425.100000 ± 5302.558184, best_reward: 18257.300000 ± 4532.129854 in #4


Epoch #26: 1001it [00:03, 292.28it/s, env_step=26000, gradient_step=2600, len=260, n/ep=1, n/st=100, rew=13301.00]                                                                                


Epoch #26: test_reward: 14714.700000 ± 5084.470219, best_reward: 18257.300000 ± 4532.129854 in #4


Epoch #27: 1001it [00:03, 285.55it/s, env_step=27000, gradient_step=2700, len=131, n/ep=0, n/st=100, rew=6958.33]                                                                                 


Epoch #27: test_reward: 9616.300000 ± 3835.289534, best_reward: 18257.300000 ± 4532.129854 in #4


Epoch #28: 1001it [00:02, 344.79it/s, env_step=28000, gradient_step=2800, len=78, n/ep=0, n/st=100, rew=3703.00]                                                                                  


Epoch #28: test_reward: 10724.400000 ± 3445.938020, best_reward: 18257.300000 ± 4532.129854 in #4


Epoch #29: 1001it [00:03, 312.63it/s, env_step=29000, gradient_step=2900, len=96, n/ep=1, n/st=100, rew=5379.00]                                                                                  


Epoch #29: test_reward: 9733.000000 ± 2682.835440, best_reward: 18257.300000 ± 4532.129854 in #4


Epoch #30: 1001it [00:03, 306.79it/s, env_step=30000, gradient_step=3000, len=300, n/ep=1, n/st=100, rew=15783.00]                                                                                


Epoch #30: test_reward: 10243.300000 ± 1248.317191, best_reward: 18257.300000 ± 4532.129854 in #4


Epoch #31: 1001it [00:02, 341.47it/s, env_step=31000, gradient_step=3100, len=184, n/ep=1, n/st=100, rew=11197.00]                                                                                


Epoch #31: test_reward: 10794.800000 ± 5642.604927, best_reward: 18257.300000 ± 4532.129854 in #4


Epoch #32: 1001it [00:03, 316.87it/s, env_step=32000, gradient_step=3200, len=167, n/ep=3, n/st=100, rew=10066.00]                                                                                


Epoch #32: test_reward: 13286.600000 ± 4910.455136, best_reward: 18257.300000 ± 4532.129854 in #4


Epoch #33: 1001it [00:03, 307.30it/s, env_step=33000, gradient_step=3300, len=126, n/ep=0, n/st=100, rew=7302.00]                                                                                 


Epoch #33: test_reward: 8729.200000 ± 4845.892896, best_reward: 18257.300000 ± 4532.129854 in #4


Epoch #34: 1001it [00:03, 263.40it/s, env_step=34000, gradient_step=3400, len=90, n/ep=1, n/st=100, rew=4966.00]                                                                                  


Epoch #34: test_reward: 8541.700000 ± 1827.943984, best_reward: 18257.300000 ± 4532.129854 in #4


Epoch #35: 1001it [00:03, 301.46it/s, env_step=35000, gradient_step=3500, len=150, n/ep=0, n/st=100, rew=8318.00]                                                                                 


Epoch #35: test_reward: 9822.300000 ± 3167.039976, best_reward: 18257.300000 ± 4532.129854 in #4


Epoch #36: 1001it [00:02, 342.89it/s, env_step=36000, gradient_step=3600, len=141, n/ep=2, n/st=100, rew=8462.00]                                                                                 


Epoch #36: test_reward: 11916.800000 ± 2380.733660, best_reward: 18257.300000 ± 4532.129854 in #4


Epoch #37: 1001it [00:03, 301.42it/s, env_step=37000, gradient_step=3700, len=370, n/ep=1, n/st=100, rew=18655.00]                                                                                


Epoch #37: test_reward: 9378.700000 ± 3791.677335, best_reward: 18257.300000 ± 4532.129854 in #4


Epoch #38: 1001it [00:03, 322.07it/s, env_step=38000, gradient_step=3800, len=102, n/ep=4, n/st=100, rew=6030.88]                                                                                 


Epoch #38: test_reward: 9725.000000 ± 1799.825714, best_reward: 18257.300000 ± 4532.129854 in #4


Epoch #39: 1001it [00:03, 314.82it/s, env_step=39000, gradient_step=3900, len=166, n/ep=3, n/st=100, rew=9674.67]                                                                                 


Epoch #39: test_reward: 11818.000000 ± 3322.477600, best_reward: 18257.300000 ± 4532.129854 in #4


Epoch #40: 1001it [00:03, 309.64it/s, env_step=40000, gradient_step=4000, len=217, n/ep=4, n/st=100, rew=12134.12]                                                                                


Epoch #40: test_reward: 11633.700000 ± 2667.179186, best_reward: 18257.300000 ± 4532.129854 in #4


Epoch #41: 1001it [00:03, 287.92it/s, env_step=41000, gradient_step=4100, len=142, n/ep=1, n/st=100, rew=9200.00]                                                                                 


Epoch #41: test_reward: 12912.800000 ± 3737.096916, best_reward: 18257.300000 ± 4532.129854 in #4


Epoch #42: 1001it [00:03, 288.64it/s, env_step=42000, gradient_step=4200, len=276, n/ep=2, n/st=100, rew=15276.00]                                                                                


Epoch #42: test_reward: 9455.400000 ± 2562.664246, best_reward: 18257.300000 ± 4532.129854 in #4


Epoch #43: 1001it [00:03, 289.13it/s, env_step=43000, gradient_step=4300, len=194, n/ep=0, n/st=100, rew=11875.25]                                                                                


Epoch #43: test_reward: 9446.900000 ± 4324.634423, best_reward: 18257.300000 ± 4532.129854 in #4


Epoch #44: 1001it [00:03, 286.57it/s, env_step=44000, gradient_step=4400, len=257, n/ep=0, n/st=100, rew=16103.50]                                                                                


Epoch #44: test_reward: 10534.200000 ± 2958.990970, best_reward: 18257.300000 ± 4532.129854 in #4


Epoch #45: 1001it [00:03, 297.23it/s, env_step=45000, gradient_step=4500, len=44, n/ep=0, n/st=100, rew=1413.00]                                                                                  


Epoch #45: test_reward: 11208.400000 ± 2703.294072, best_reward: 18257.300000 ± 4532.129854 in #4


Epoch #46: 1001it [00:03, 302.93it/s, env_step=46000, gradient_step=4600, len=108, n/ep=1, n/st=100, rew=5665.00]                                                                                 


Epoch #46: test_reward: 10153.300000 ± 3754.847135, best_reward: 18257.300000 ± 4532.129854 in #4


Epoch #47: 1001it [00:03, 277.58it/s, env_step=47000, gradient_step=4700, len=168, n/ep=0, n/st=100, rew=10656.00]                                                                                


Epoch #47: test_reward: 8618.400000 ± 2079.258676, best_reward: 18257.300000 ± 4532.129854 in #4


Epoch #48: 1001it [00:03, 276.22it/s, env_step=48000, gradient_step=4800, len=182, n/ep=0, n/st=100, rew=10772.25]                                                                                


Epoch #48: test_reward: 9905.200000 ± 2836.407298, best_reward: 18257.300000 ± 4532.129854 in #4


Epoch #49: 1001it [00:03, 320.06it/s, env_step=49000, gradient_step=4900, len=258, n/ep=0, n/st=100, rew=15169.00]                                                                                


Epoch #49: test_reward: 9973.800000 ± 2935.247921, best_reward: 18257.300000 ± 4532.129854 in #4


Epoch #50: 1001it [00:03, 300.75it/s, env_step=50000, gradient_step=5000, len=291, n/ep=0, n/st=100, rew=20227.00]                                                                                


Epoch #50: test_reward: 10874.100000 ± 6061.251231, best_reward: 18257.300000 ± 4532.129854 in #4


Epoch #51: 1001it [00:03, 325.16it/s, env_step=51000, gradient_step=5100, len=161, n/ep=0, n/st=100, rew=10111.50]                                                                                


Epoch #51: test_reward: 10367.000000 ± 4851.120551, best_reward: 18257.300000 ± 4532.129854 in #4


Epoch #52: 1001it [00:03, 307.39it/s, env_step=52000, gradient_step=5200, len=245, n/ep=0, n/st=100, rew=15517.25]                                                                                


Epoch #52: test_reward: 10453.500000 ± 2543.307384, best_reward: 18257.300000 ± 4532.129854 in #4


Epoch #53: 1001it [00:03, 326.83it/s, env_step=53000, gradient_step=5300, len=170, n/ep=1, n/st=100, rew=10715.50]                                                                                


Epoch #53: test_reward: 8322.800000 ± 2357.254199, best_reward: 18257.300000 ± 4532.129854 in #4


Epoch #54: 1001it [00:03, 283.03it/s, env_step=54000, gradient_step=5400, len=208, n/ep=0, n/st=100, rew=13307.75]                                                                                


Epoch #54: test_reward: 12461.200000 ± 5796.069441, best_reward: 18257.300000 ± 4532.129854 in #4


Epoch #55: 1001it [00:03, 276.31it/s, env_step=55000, gradient_step=5500, len=124, n/ep=1, n/st=100, rew=5645.00]                                                                                 


Epoch #55: test_reward: 11448.100000 ± 1063.210464, best_reward: 18257.300000 ± 4532.129854 in #4


Epoch #56: 1001it [00:03, 289.35it/s, env_step=56000, gradient_step=5600, len=182, n/ep=1, n/st=100, rew=9745.00]                                                                                 


Epoch #56: test_reward: 8636.000000 ± 2702.664352, best_reward: 18257.300000 ± 4532.129854 in #4


Epoch #57: 1001it [00:03, 305.76it/s, env_step=57000, gradient_step=5700, len=84, n/ep=0, n/st=100, rew=4846.00]                                                                                  


Epoch #57: test_reward: 14202.600000 ± 4584.872173, best_reward: 18257.300000 ± 4532.129854 in #4


Epoch #58: 1001it [00:03, 314.55it/s, env_step=58000, gradient_step=5800, len=102, n/ep=1, n/st=100, rew=6336.00]                                                                                 


Epoch #58: test_reward: 10036.900000 ± 2359.151434, best_reward: 18257.300000 ± 4532.129854 in #4


Epoch #59: 1001it [00:03, 292.67it/s, env_step=59000, gradient_step=5900, len=42, n/ep=0, n/st=100, rew=1304.00]                                                                                  


Epoch #59: test_reward: 12131.800000 ± 5634.251925, best_reward: 18257.300000 ± 4532.129854 in #4


Epoch #60: 1001it [00:03, 301.81it/s, env_step=60000, gradient_step=6000, len=208, n/ep=0, n/st=100, rew=12815.50]                                                                                


Epoch #60: test_reward: 9786.300000 ± 3821.386111, best_reward: 18257.300000 ± 4532.129854 in #4


Epoch #61: 1001it [00:03, 283.16it/s, env_step=61000, gradient_step=6100, len=103, n/ep=0, n/st=100, rew=5961.50]                                                                                 


Epoch #61: test_reward: 14611.600000 ± 7923.101542, best_reward: 18257.300000 ± 4532.129854 in #4


Epoch #62: 1001it [00:02, 342.34it/s, env_step=62000, gradient_step=6200, len=118, n/ep=1, n/st=100, rew=7327.00]                                                                                 


Epoch #62: test_reward: 6673.500000 ± 1828.539704, best_reward: 18257.300000 ± 4532.129854 in #4


Epoch #63: 1001it [00:03, 300.98it/s, env_step=63000, gradient_step=6300, len=242, n/ep=1, n/st=100, rew=15914.50]                                                                                


Epoch #63: test_reward: 8843.000000 ± 2068.606971, best_reward: 18257.300000 ± 4532.129854 in #4


Epoch #64: 1001it [00:03, 291.58it/s, env_step=64000, gradient_step=6400, len=133, n/ep=1, n/st=100, rew=7995.00]                                                                                 


Epoch #64: test_reward: 10496.200000 ± 3435.072220, best_reward: 18257.300000 ± 4532.129854 in #4


Epoch #65: 1001it [00:03, 299.40it/s, env_step=65000, gradient_step=6500, len=230, n/ep=0, n/st=100, rew=14713.50]                                                                                


Epoch #65: test_reward: 12433.500000 ± 7923.735032, best_reward: 18257.300000 ± 4532.129854 in #4


Epoch #66: 1001it [00:03, 279.99it/s, env_step=66000, gradient_step=6600, len=152, n/ep=0, n/st=100, rew=9925.00]                                                                                 


Epoch #66: test_reward: 14623.700000 ± 6845.811158, best_reward: 18257.300000 ± 4532.129854 in #4


Epoch #67: 1001it [00:03, 303.66it/s, env_step=67000, gradient_step=6700, len=162, n/ep=0, n/st=100, rew=9869.00]                                                                                 


Epoch #67: test_reward: 12018.400000 ± 8143.232616, best_reward: 18257.300000 ± 4532.129854 in #4


Epoch #68: 1001it [00:03, 309.07it/s, env_step=68000, gradient_step=6800, len=147, n/ep=0, n/st=100, rew=8802.00]                                                                                 


Epoch #68: test_reward: 9052.100000 ± 2655.486187, best_reward: 18257.300000 ± 4532.129854 in #4


Epoch #69: 1001it [00:03, 326.47it/s, env_step=69000, gradient_step=6900, len=110, n/ep=0, n/st=100, rew=7023.00]                                                                                 


Epoch #69: test_reward: 12043.900000 ± 3695.590817, best_reward: 18257.300000 ± 4532.129854 in #4


Epoch #70: 1001it [00:03, 323.06it/s, env_step=70000, gradient_step=7000, len=181, n/ep=3, n/st=100, rew=10612.67]                                                                                


Epoch #70: test_reward: 12320.300000 ± 6704.912588, best_reward: 18257.300000 ± 4532.129854 in #4


Epoch #71: 1001it [00:03, 308.32it/s, env_step=71000, gradient_step=7100, len=158, n/ep=1, n/st=100, rew=10464.00]                                                                                


Epoch #71: test_reward: 9330.200000 ± 4090.956631, best_reward: 18257.300000 ± 4532.129854 in #4


Epoch #72: 1001it [00:03, 295.72it/s, env_step=72000, gradient_step=7200, len=154, n/ep=1, n/st=100, rew=9287.50]                                                                                 


Epoch #72: test_reward: 9685.400000 ± 2823.163906, best_reward: 18257.300000 ± 4532.129854 in #4


Epoch #73: 1001it [00:03, 314.39it/s, env_step=73000, gradient_step=7300, len=173, n/ep=2, n/st=100, rew=11393.00]                                                                                


Epoch #73: test_reward: 9496.600000 ± 2707.416340, best_reward: 18257.300000 ± 4532.129854 in #4


Epoch #74: 1001it [00:03, 276.09it/s, env_step=74000, gradient_step=7400, len=184, n/ep=0, n/st=100, rew=11290.00]                                                                                


Epoch #74: test_reward: 8916.900000 ± 4089.099423, best_reward: 18257.300000 ± 4532.129854 in #4


Epoch #75: 1001it [00:03, 325.13it/s, env_step=75000, gradient_step=7500, len=192, n/ep=0, n/st=100, rew=13101.50]                                                                                


Epoch #75: test_reward: 8523.300000 ± 4083.980780, best_reward: 18257.300000 ± 4532.129854 in #4


Epoch #76: 1001it [00:03, 277.40it/s, env_step=76000, gradient_step=7600, len=160, n/ep=0, n/st=100, rew=10606.50]                                                                                


Epoch #76: test_reward: 10213.500000 ± 5931.686813, best_reward: 18257.300000 ± 4532.129854 in #4


Epoch #77: 1001it [00:03, 270.71it/s, env_step=77000, gradient_step=7700, len=192, n/ep=1, n/st=100, rew=12037.00]                                                                                


Epoch #77: test_reward: 5858.000000 ± 5612.766715, best_reward: 18257.300000 ± 4532.129854 in #4


Epoch #78: 1001it [00:03, 325.52it/s, env_step=78000, gradient_step=7800, len=94, n/ep=0, n/st=100, rew=4752.00]                                                                                  


Epoch #78: test_reward: 10919.800000 ± 2700.445400, best_reward: 18257.300000 ± 4532.129854 in #4


Epoch #79: 1001it [00:03, 286.64it/s, env_step=79000, gradient_step=7900, len=168, n/ep=0, n/st=100, rew=10121.50]                                                                                


Epoch #79: test_reward: 10262.600000 ± 2194.627996, best_reward: 18257.300000 ± 4532.129854 in #4


Epoch #80: 1001it [00:03, 312.31it/s, env_step=80000, gradient_step=8000, len=114, n/ep=1, n/st=100, rew=6506.50]                                                                                 


Epoch #80: test_reward: 8052.100000 ± 4660.387547, best_reward: 18257.300000 ± 4532.129854 in #4


Epoch #81: 1001it [00:03, 280.77it/s, env_step=81000, gradient_step=8100, len=110, n/ep=1, n/st=100, rew=6368.00]                                                                                 


Epoch #81: test_reward: 11387.700000 ± 7719.052481, best_reward: 18257.300000 ± 4532.129854 in #4


Epoch #82: 1001it [00:03, 267.11it/s, env_step=82000, gradient_step=8200, len=177, n/ep=0, n/st=100, rew=11238.50]                                                                                


Epoch #82: test_reward: 14655.800000 ± 7656.538197, best_reward: 18257.300000 ± 4532.129854 in #4


Epoch #83: 1001it [00:03, 313.94it/s, env_step=83000, gradient_step=8300, len=76, n/ep=1, n/st=100, rew=4113.00]                                                                                  


Epoch #83: test_reward: 9597.400000 ± 4545.273330, best_reward: 18257.300000 ± 4532.129854 in #4


Epoch #84: 1001it [00:03, 329.65it/s, env_step=84000, gradient_step=8400, len=108, n/ep=0, n/st=100, rew=5561.00]                                                                                 


Epoch #84: test_reward: 5541.300000 ± 3631.711223, best_reward: 18257.300000 ± 4532.129854 in #4


Epoch #85: 1001it [00:03, 298.02it/s, env_step=85000, gradient_step=8500, len=120, n/ep=1, n/st=100, rew=7063.50]                                                                                 


Epoch #85: test_reward: 13195.700000 ± 8701.924271, best_reward: 18257.300000 ± 4532.129854 in #4


Epoch #86: 1001it [00:03, 305.57it/s, env_step=86000, gradient_step=8600, len=183, n/ep=3, n/st=100, rew=11499.17]                                                                                


Epoch #86: test_reward: 9364.800000 ± 4821.710501, best_reward: 18257.300000 ± 4532.129854 in #4


Epoch #87: 1001it [00:03, 292.88it/s, env_step=87000, gradient_step=8700, len=96, n/ep=2, n/st=100, rew=5162.50]                                                                                  


Epoch #87: test_reward: 6171.400000 ± 5339.355096, best_reward: 18257.300000 ± 4532.129854 in #4


Epoch #88: 1001it [00:03, 279.29it/s, env_step=88000, gradient_step=8800, len=92, n/ep=0, n/st=100, rew=4787.00]                                                                                  


Epoch #88: test_reward: 8366.500000 ± 4217.503699, best_reward: 18257.300000 ± 4532.129854 in #4


Epoch #89: 1001it [00:03, 315.83it/s, env_step=89000, gradient_step=8900, len=173, n/ep=1, n/st=100, rew=11168.00]                                                                                


Epoch #89: test_reward: 8194.000000 ± 5678.159491, best_reward: 18257.300000 ± 4532.129854 in #4


Epoch #90: 1001it [00:03, 290.16it/s, env_step=90000, gradient_step=9000, len=72, n/ep=0, n/st=100, rew=3755.50]                                                                                  


Epoch #90: test_reward: 9651.600000 ± 3229.376014, best_reward: 18257.300000 ± 4532.129854 in #4


Epoch #91: 1001it [00:03, 302.03it/s, env_step=91000, gradient_step=9100, len=191, n/ep=1, n/st=100, rew=12344.00]                                                                                


Epoch #91: test_reward: 6084.800000 ± 4742.990803, best_reward: 18257.300000 ± 4532.129854 in #4


Epoch #92: 1001it [00:03, 290.52it/s, env_step=92000, gradient_step=9200, len=78, n/ep=0, n/st=100, rew=4113.50]                                                                                  


Epoch #92: test_reward: 7119.700000 ± 4815.424676, best_reward: 18257.300000 ± 4532.129854 in #4


Epoch #93: 1001it [00:03, 286.29it/s, env_step=93000, gradient_step=9300, len=48, n/ep=0, n/st=100, rew=2057.00]                                                                                  


Epoch #93: test_reward: 4285.600000 ± 3135.176907, best_reward: 18257.300000 ± 4532.129854 in #4


Epoch #94: 1001it [00:02, 337.82it/s, env_step=94000, gradient_step=9400, len=144, n/ep=2, n/st=100, rew=8525.50]                                                                                 


Epoch #94: test_reward: 11913.900000 ± 7124.291550, best_reward: 18257.300000 ± 4532.129854 in #4


Epoch #95: 1001it [00:02, 337.65it/s, env_step=95000, gradient_step=9500, len=123, n/ep=2, n/st=100, rew=7348.25]                                                                                 


Epoch #95: test_reward: 15581.400000 ± 5870.598150, best_reward: 18257.300000 ± 4532.129854 in #4


Epoch #96: 1001it [00:03, 300.35it/s, env_step=96000, gradient_step=9600, len=146, n/ep=0, n/st=100, rew=8361.00]                                                                                 


Epoch #96: test_reward: 13715.800000 ± 6074.625302, best_reward: 18257.300000 ± 4532.129854 in #4


Epoch #97: 1001it [00:03, 323.14it/s, env_step=97000, gradient_step=9700, len=114, n/ep=0, n/st=100, rew=6989.50]                                                                                 


Epoch #97: test_reward: 14371.200000 ± 10830.128058, best_reward: 18257.300000 ± 4532.129854 in #4


Epoch #98: 1001it [00:03, 302.92it/s, env_step=98000, gradient_step=9800, len=112, n/ep=1, n/st=100, rew=6777.00]                                                                                 


Epoch #98: test_reward: 10466.900000 ± 4251.920847, best_reward: 18257.300000 ± 4532.129854 in #4


Epoch #99: 1001it [00:03, 281.76it/s, env_step=99000, gradient_step=9900, len=146, n/ep=0, n/st=100, rew=8702.00]                                                                                 


Epoch #99: test_reward: 6746.600000 ± 6310.726998, best_reward: 18257.300000 ± 4532.129854 in #4


Epoch #100: 1001it [00:03, 306.96it/s, env_step=100000, gradient_step=10000, len=106, n/ep=1, n/st=100, rew=5448.00]                                                                              


Epoch #100: test_reward: 11444.700000 ± 3890.270994, best_reward: 18257.300000 ± 4532.129854 in #4

InfoStats(gradient_step=10000, best_reward=18257.3, best_reward_std=4532.129853611876, train_step=100000, train_episode=575, test_step=174034, test_episode=1010, timing=TimingStats(total_time=446.8160455226898, train_time=334.85881638526917, train_time_collect=34.886048316955566, train_time_update=295.54462909698486, test_time=111.95722913742065, update_speed=298.63331979572484))

(the trained policy can be accessed via policy.policies[agents[1]])
Train with agent_opponent #34


Epoch #1: 1001it [00:03, 293.30it/s, env_step=1000, gradient_step=100, len=0, n/ep=0, n/st=100, rew=0.00]                                                                                         


Epoch #1: test_reward: 10538.900000 ± 3047.121771, best_reward: 10538.900000 ± 3047.121771 in #1


Epoch #2: 1001it [00:03, 261.33it/s, env_step=2000, gradient_step=200, len=0, n/ep=0, n/st=100, rew=0.00]                                                                                         


Epoch #2: test_reward: 13641.700000 ± 4120.912303, best_reward: 13641.700000 ± 4120.912303 in #2


Epoch #3: 1001it [00:03, 275.85it/s, env_step=3000, gradient_step=300, len=0, n/ep=0, n/st=100, rew=0.00]                                                                                         


Epoch #3: test_reward: 12405.000000 ± 6804.793678, best_reward: 13641.700000 ± 4120.912303 in #2


Epoch #4: 1001it [00:03, 291.88it/s, env_step=4000, gradient_step=400, len=0, n/ep=0, n/st=100, rew=0.00]                                                                                         


Epoch #4: test_reward: 12236.400000 ± 2752.941525, best_reward: 13641.700000 ± 4120.912303 in #2


Epoch #5: 1001it [00:03, 309.12it/s, env_step=5000, gradient_step=500, len=0, n/ep=0, n/st=100, rew=0.00]                                                                                         


Epoch #5: test_reward: 8436.700000 ± 3508.607531, best_reward: 13641.700000 ± 4120.912303 in #2


Epoch #6: 1001it [00:03, 281.79it/s, env_step=6000, gradient_step=600, len=0, n/ep=0, n/st=100, rew=0.00]                                                                                         


Epoch #6: test_reward: 11395.400000 ± 6363.122884, best_reward: 13641.700000 ± 4120.912303 in #2


Epoch #7: 1001it [00:03, 296.41it/s, env_step=7000, gradient_step=700, len=68, n/ep=0, n/st=100, rew=3348.00]                                                                                     


Epoch #7: test_reward: 11943.200000 ± 5371.028129, best_reward: 13641.700000 ± 4120.912303 in #2


Epoch #8: 1001it [00:03, 277.03it/s, env_step=8000, gradient_step=800, len=68, n/ep=0, n/st=100, rew=3348.00]                                                                                     


Epoch #8: test_reward: 11355.300000 ± 3570.404740, best_reward: 13641.700000 ± 4120.912303 in #2


Epoch #9: 1001it [00:03, 295.21it/s, env_step=9000, gradient_step=900, len=90, n/ep=1, n/st=100, rew=4554.00]                                                                                     


Epoch #9: test_reward: 9521.400000 ± 5765.972601, best_reward: 13641.700000 ± 4120.912303 in #2


Epoch #10: 1001it [00:03, 261.90it/s, env_step=10000, gradient_step=1000, len=94, n/ep=0, n/st=100, rew=4280.00]                                                                                  


Epoch #10: test_reward: 9934.000000 ± 3330.995527, best_reward: 13641.700000 ± 4120.912303 in #2


Epoch #11: 1001it [00:03, 307.80it/s, env_step=11000, gradient_step=1100, len=106, n/ep=0, n/st=100, rew=6050.33]                                                                                 


Epoch #11: test_reward: 9227.100000 ± 3510.649241, best_reward: 13641.700000 ± 4120.912303 in #2


Epoch #12: 1001it [00:03, 285.99it/s, env_step=12000, gradient_step=1200, len=120, n/ep=2, n/st=100, rew=7223.00]                                                                                 


Epoch #12: test_reward: 9707.200000 ± 3366.478540, best_reward: 13641.700000 ± 4120.912303 in #2


Epoch #13: 1001it [00:03, 294.35it/s, env_step=13000, gradient_step=1300, len=128, n/ep=0, n/st=100, rew=7451.75]                                                                                 


Epoch #13: test_reward: 10947.700000 ± 5998.002535, best_reward: 13641.700000 ± 4120.912303 in #2


Epoch #14: 1001it [00:03, 293.13it/s, env_step=14000, gradient_step=1400, len=140, n/ep=2, n/st=100, rew=8129.75]                                                                                 


Epoch #14: test_reward: 10276.500000 ± 2689.450585, best_reward: 13641.700000 ± 4120.912303 in #2


Epoch #15: 1001it [00:03, 333.65it/s, env_step=15000, gradient_step=1500, len=150, n/ep=2, n/st=100, rew=8798.50]                                                                                 


Epoch #15: test_reward: 11437.200000 ± 3851.262879, best_reward: 13641.700000 ± 4120.912303 in #2


Epoch #16: 1001it [00:03, 328.72it/s, env_step=16000, gradient_step=1600, len=159, n/ep=0, n/st=100, rew=9768.50]                                                                                 


Epoch #16: test_reward: 9512.100000 ± 4497.483395, best_reward: 13641.700000 ± 4120.912303 in #2


Epoch #17: 1001it [00:03, 315.61it/s, env_step=17000, gradient_step=1700, len=169, n/ep=0, n/st=100, rew=10220.75]                                                                                


Epoch #17: test_reward: 12259.600000 ± 5935.551587, best_reward: 13641.700000 ± 4120.912303 in #2


Epoch #18: 1001it [00:03, 312.07it/s, env_step=18000, gradient_step=1800, len=179, n/ep=0, n/st=100, rew=11561.00]                                                                                


Epoch #18: test_reward: 11470.100000 ± 5195.059046, best_reward: 13641.700000 ± 4120.912303 in #2


Epoch #19: 1001it [00:03, 311.79it/s, env_step=19000, gradient_step=1900, len=190, n/ep=1, n/st=100, rew=12955.00]                                                                                


Epoch #19: test_reward: 13284.600000 ± 7318.179940, best_reward: 13641.700000 ± 4120.912303 in #2


Epoch #20: 1001it [00:03, 290.70it/s, env_step=20000, gradient_step=2000, len=198, n/ep=0, n/st=100, rew=12153.00]                                                                                


Epoch #20: test_reward: 10398.900000 ± 6326.475377, best_reward: 13641.700000 ± 4120.912303 in #2


Epoch #21: 1001it [00:03, 315.64it/s, env_step=21000, gradient_step=2100, len=97, n/ep=0, n/st=100, rew=6046.00]                                                                                  


Epoch #21: test_reward: 10275.900000 ± 2647.611091, best_reward: 13641.700000 ± 4120.912303 in #2


Epoch #22: 1001it [00:03, 298.75it/s, env_step=22000, gradient_step=2200, len=69, n/ep=0, n/st=100, rew=2864.00]                                                                                  


Epoch #22: test_reward: 10034.700000 ± 2199.381552, best_reward: 13641.700000 ± 4120.912303 in #2


Epoch #23: 1001it [00:03, 276.72it/s, env_step=23000, gradient_step=2300, len=112, n/ep=1, n/st=100, rew=6315.50]                                                                                 


Epoch #23: test_reward: 8901.400000 ± 2455.685819, best_reward: 13641.700000 ± 4120.912303 in #2


Epoch #24: 1001it [00:03, 298.11it/s, env_step=24000, gradient_step=2400, len=174, n/ep=2, n/st=100, rew=10984.75]                                                                                


Epoch #24: test_reward: 8394.400000 ± 1795.425699, best_reward: 13641.700000 ± 4120.912303 in #2


Epoch #25: 1001it [00:03, 304.97it/s, env_step=25000, gradient_step=2500, len=215, n/ep=0, n/st=100, rew=14234.75]                                                                                


Epoch #25: test_reward: 10978.600000 ± 2912.530453, best_reward: 13641.700000 ± 4120.912303 in #2


Epoch #26: 1001it [00:03, 328.49it/s, env_step=26000, gradient_step=2600, len=259, n/ep=0, n/st=100, rew=17625.00]                                                                                


Epoch #26: test_reward: 8673.700000 ± 3579.063426, best_reward: 13641.700000 ± 4120.912303 in #2


Epoch #27: 1001it [00:03, 300.35it/s, env_step=27000, gradient_step=2700, len=118, n/ep=0, n/st=100, rew=6496.00]                                                                                 


Epoch #27: test_reward: 10224.000000 ± 5128.878708, best_reward: 13641.700000 ± 4120.912303 in #2


Epoch #28: 1001it [00:03, 289.44it/s, env_step=28000, gradient_step=2800, len=137, n/ep=0, n/st=100, rew=8180.00]                                                                                 


Epoch #28: test_reward: 9969.100000 ± 4336.032298, best_reward: 13641.700000 ± 4120.912303 in #2


Epoch #29: 1001it [00:05, 195.71it/s, env_step=29000, gradient_step=2900, len=220, n/ep=2, n/st=100, rew=13374.50]                                                                                


Epoch #29: test_reward: 9040.300000 ± 3745.454980, best_reward: 13641.700000 ± 4120.912303 in #2


Epoch #30: 1001it [00:04, 242.06it/s, env_step=30000, gradient_step=3000, len=108, n/ep=2, n/st=100, rew=6105.00]                                                                                 


Epoch #30: test_reward: 14779.600000 ± 5701.940287, best_reward: 14779.600000 ± 5701.940287 in #30


Epoch #31: 1001it [00:03, 263.96it/s, env_step=31000, gradient_step=3100, len=136, n/ep=1, n/st=100, rew=8361.00]                                                                                 


Epoch #31: test_reward: 11402.200000 ± 5184.798199, best_reward: 14779.600000 ± 5701.940287 in #30


Epoch #32: 1001it [00:04, 239.99it/s, env_step=32000, gradient_step=3200, len=209, n/ep=1, n/st=100, rew=12794.00]                                                                                


Epoch #32: test_reward: 13038.000000 ± 8884.615917, best_reward: 14779.600000 ± 5701.940287 in #30


Epoch #33: 1001it [00:04, 230.72it/s, env_step=33000, gradient_step=3300, len=115, n/ep=0, n/st=100, rew=6455.00]                                                                                 


Epoch #33: test_reward: 8486.200000 ± 3122.434877, best_reward: 14779.600000 ± 5701.940287 in #30


Epoch #34: 1001it [00:04, 242.29it/s, env_step=34000, gradient_step=3400, len=199, n/ep=0, n/st=100, rew=13142.00]                                                                                


Epoch #34: test_reward: 10942.800000 ± 5185.890797, best_reward: 14779.600000 ± 5701.940287 in #30


Epoch #35: 1001it [00:04, 245.02it/s, env_step=35000, gradient_step=3500, len=61, n/ep=1, n/st=100, rew=2604.00]                                                                                  


Epoch #35: test_reward: 18345.400000 ± 8063.698819, best_reward: 18345.400000 ± 8063.698819 in #35


Epoch #36: 1001it [00:03, 259.23it/s, env_step=36000, gradient_step=3600, len=155, n/ep=0, n/st=100, rew=9748.50]                                                                                 


Epoch #36: test_reward: 11075.100000 ± 4214.430886, best_reward: 18345.400000 ± 8063.698819 in #35


Epoch #37: 1001it [00:04, 226.58it/s, env_step=37000, gradient_step=3700, len=239, n/ep=2, n/st=100, rew=15435.50]                                                                                


Epoch #37: test_reward: 14378.400000 ± 7752.010000, best_reward: 18345.400000 ± 8063.698819 in #35


Epoch #38: 1001it [00:03, 250.36it/s, env_step=38000, gradient_step=3800, len=56, n/ep=0, n/st=100, rew=2214.00]                                                                                  


Epoch #38: test_reward: 9194.700000 ± 3597.451099, best_reward: 18345.400000 ± 8063.698819 in #35


Epoch #39: 1001it [00:04, 242.49it/s, env_step=39000, gradient_step=3900, len=389, n/ep=0, n/st=100, rew=27294.00]                                                                                


Epoch #39: test_reward: 10806.000000 ± 4689.114714, best_reward: 18345.400000 ± 8063.698819 in #35


Epoch #40: 1001it [00:04, 218.44it/s, env_step=40000, gradient_step=4000, len=400, n/ep=4, n/st=100, rew=26748.25]                                                                                


Epoch #40: test_reward: 11827.000000 ± 7036.885575, best_reward: 18345.400000 ± 8063.698819 in #35


Epoch #41: 1001it [00:04, 236.08it/s, env_step=41000, gradient_step=4100, len=227, n/ep=0, n/st=100, rew=15048.00]                                                                                


Epoch #41: test_reward: 9842.700000 ± 5473.623189, best_reward: 18345.400000 ± 8063.698819 in #35


Epoch #42: 1001it [00:03, 254.87it/s, env_step=42000, gradient_step=4200, len=137, n/ep=0, n/st=100, rew=8806.50]                                                                                 


Epoch #42: test_reward: 12751.100000 ± 8922.885592, best_reward: 18345.400000 ± 8063.698819 in #35


Epoch #43: 1001it [00:03, 257.64it/s, env_step=43000, gradient_step=4300, len=145, n/ep=0, n/st=100, rew=8744.00]                                                                                 


Epoch #43: test_reward: 11048.700000 ± 7218.839603, best_reward: 18345.400000 ± 8063.698819 in #35


Epoch #44: 1001it [00:04, 235.36it/s, env_step=44000, gradient_step=4400, len=140, n/ep=2, n/st=100, rew=8990.00]                                                                                 


Epoch #44: test_reward: 9513.200000 ± 4843.320448, best_reward: 18345.400000 ± 8063.698819 in #35


Epoch #45: 1001it [00:03, 253.89it/s, env_step=45000, gradient_step=4500, len=210, n/ep=1, n/st=100, rew=13652.00]                                                                                


Epoch #45: test_reward: 8697.300000 ± 4765.382252, best_reward: 18345.400000 ± 8063.698819 in #35


Epoch #46: 1001it [00:04, 222.67it/s, env_step=46000, gradient_step=4600, len=129, n/ep=1, n/st=100, rew=7684.00]                                                                                 


Epoch #46: test_reward: 16170.100000 ± 6119.225792, best_reward: 18345.400000 ± 8063.698819 in #35


Epoch #47: 1001it [00:03, 253.53it/s, env_step=47000, gradient_step=4700, len=204, n/ep=3, n/st=100, rew=12584.50]                                                                                


Epoch #47: test_reward: 13038.300000 ± 6532.438657, best_reward: 18345.400000 ± 8063.698819 in #35


Epoch #48: 1001it [00:04, 216.65it/s, env_step=48000, gradient_step=4800, len=158, n/ep=1, n/st=100, rew=10098.50]                                                                                


Epoch #48: test_reward: 17110.400000 ± 8077.682133, best_reward: 18345.400000 ± 8063.698819 in #35


Epoch #49: 1001it [00:04, 224.82it/s, env_step=49000, gradient_step=4900, len=90, n/ep=0, n/st=100, rew=5052.00]                                                                                  


Epoch #49: test_reward: 11566.600000 ± 2476.726234, best_reward: 18345.400000 ± 8063.698819 in #35


Epoch #50: 1001it [00:04, 244.39it/s, env_step=50000, gradient_step=5000, len=146, n/ep=0, n/st=100, rew=9428.00]                                                                                 


Epoch #50: test_reward: 14403.000000 ± 7565.794089, best_reward: 18345.400000 ± 8063.698819 in #35


Epoch #51: 1001it [00:04, 218.11it/s, env_step=51000, gradient_step=5100, len=132, n/ep=1, n/st=100, rew=8561.00]                                                                                 


Epoch #51: test_reward: 15320.000000 ± 9132.826715, best_reward: 18345.400000 ± 8063.698819 in #35


Epoch #52: 1001it [00:04, 231.66it/s, env_step=52000, gradient_step=5200, len=147, n/ep=0, n/st=100, rew=8625.00]                                                                                 


Epoch #52: test_reward: 13082.300000 ± 2429.349627, best_reward: 18345.400000 ± 8063.698819 in #35


Epoch #53: 1001it [00:04, 222.86it/s, env_step=53000, gradient_step=5300, len=145, n/ep=2, n/st=100, rew=8870.50]                                                                                 


Epoch #53: test_reward: 10888.000000 ± 2351.212623, best_reward: 18345.400000 ± 8063.698819 in #35


Epoch #54: 1001it [00:04, 237.32it/s, env_step=54000, gradient_step=5400, len=103, n/ep=0, n/st=100, rew=5991.50]                                                                                 


Epoch #54: test_reward: 13768.700000 ± 5986.592354, best_reward: 18345.400000 ± 8063.698819 in #35


Epoch #55: 1001it [00:04, 233.08it/s, env_step=55000, gradient_step=5500, len=172, n/ep=0, n/st=100, rew=11649.00]                                                                                


Epoch #55: test_reward: 9651.900000 ± 3399.591224, best_reward: 18345.400000 ± 8063.698819 in #35


Epoch #56: 1001it [00:04, 225.93it/s, env_step=56000, gradient_step=5600, len=171, n/ep=3, n/st=100, rew=10883.17]                                                                                


Epoch #56: test_reward: 9157.600000 ± 2842.233425, best_reward: 18345.400000 ± 8063.698819 in #35


Epoch #57: 1001it [00:04, 224.03it/s, env_step=57000, gradient_step=5700, len=171, n/ep=0, n/st=100, rew=11119.50]                                                                                


Epoch #57: test_reward: 12746.500000 ± 7293.432254, best_reward: 18345.400000 ± 8063.698819 in #35


Epoch #58: 1001it [00:04, 233.40it/s, env_step=58000, gradient_step=5800, len=128, n/ep=0, n/st=100, rew=7122.00]                                                                                 


Epoch #58: test_reward: 14806.200000 ± 5644.312213, best_reward: 18345.400000 ± 8063.698819 in #35


Epoch #59: 1001it [00:05, 192.83it/s, env_step=59000, gradient_step=5900, len=138, n/ep=0, n/st=100, rew=8437.50]                                                                                 


Epoch #59: test_reward: 15184.600000 ± 5583.229177, best_reward: 18345.400000 ± 8063.698819 in #35


Epoch #60: 1001it [00:05, 181.23it/s, env_step=60000, gradient_step=6000, len=204, n/ep=0, n/st=100, rew=13048.00]                                                                                


Epoch #60: test_reward: 13764.000000 ± 6627.800661, best_reward: 18345.400000 ± 8063.698819 in #35


Epoch #61: 1001it [00:05, 189.90it/s, env_step=61000, gradient_step=6100, len=116, n/ep=2, n/st=100, rew=6589.25]                                                                                 


Epoch #61: test_reward: 7996.500000 ± 2755.395770, best_reward: 18345.400000 ± 8063.698819 in #35


Epoch #62: 1001it [00:05, 183.44it/s, env_step=62000, gradient_step=6200, len=149, n/ep=0, n/st=100, rew=8205.00]                                                                                 


Epoch #62: test_reward: 11444.200000 ± 4649.205390, best_reward: 18345.400000 ± 8063.698819 in #35


Epoch #63: 1001it [00:05, 188.47it/s, env_step=63000, gradient_step=6300, len=190, n/ep=1, n/st=100, rew=12172.00]                                                                                


Epoch #63: test_reward: 10451.300000 ± 3419.556990, best_reward: 18345.400000 ± 8063.698819 in #35


Epoch #64: 1001it [00:05, 185.97it/s, env_step=64000, gradient_step=6400, len=143, n/ep=3, n/st=100, rew=8945.50]                                                                                 


Epoch #64: test_reward: 8844.400000 ± 1599.247335, best_reward: 18345.400000 ± 8063.698819 in #35


Epoch #65: 1001it [00:04, 212.33it/s, env_step=65000, gradient_step=6500, len=193, n/ep=0, n/st=100, rew=12947.00]                                                                                


Epoch #65: test_reward: 12937.400000 ± 6310.222519, best_reward: 18345.400000 ± 8063.698819 in #35


Epoch #66: 1001it [00:05, 194.08it/s, env_step=66000, gradient_step=6600, len=137, n/ep=1, n/st=100, rew=8951.00]                                                                                 


Epoch #66: test_reward: 12144.800000 ± 3890.112384, best_reward: 18345.400000 ± 8063.698819 in #35


Epoch #67: 1001it [00:05, 197.36it/s, env_step=67000, gradient_step=6700, len=206, n/ep=1, n/st=100, rew=13842.00]                                                                                


Epoch #67: test_reward: 13740.300000 ± 8208.689543, best_reward: 18345.400000 ± 8063.698819 in #35


Epoch #68: 1001it [00:04, 200.63it/s, env_step=68000, gradient_step=6800, len=138, n/ep=1, n/st=100, rew=8704.00]                                                                                 


Epoch #68: test_reward: 10743.900000 ± 6285.957723, best_reward: 18345.400000 ± 8063.698819 in #35


Epoch #69: 1001it [00:05, 184.04it/s, env_step=69000, gradient_step=6900, len=261, n/ep=2, n/st=100, rew=17742.50]                                                                                


Epoch #69: test_reward: 13266.400000 ± 6983.224688, best_reward: 18345.400000 ± 8063.698819 in #35


Epoch #70: 1001it [00:05, 177.77it/s, env_step=70000, gradient_step=7000, len=400, n/ep=1, n/st=100, rew=26991.00]                                                                                


Epoch #70: test_reward: 9527.500000 ± 2385.816642, best_reward: 18345.400000 ± 8063.698819 in #35


Epoch #71: 1001it [00:05, 179.24it/s, env_step=71000, gradient_step=7100, len=223, n/ep=0, n/st=100, rew=15590.50]                                                                                


Epoch #71: test_reward: 8765.800000 ± 2485.348619, best_reward: 18345.400000 ± 8063.698819 in #35


Epoch #72: 1001it [00:05, 181.63it/s, env_step=72000, gradient_step=7200, len=145, n/ep=0, n/st=100, rew=8807.50]                                                                                 


Epoch #72: test_reward: 11471.600000 ± 4595.692749, best_reward: 18345.400000 ± 8063.698819 in #35


Epoch #73: 1001it [00:05, 182.51it/s, env_step=73000, gradient_step=7300, len=114, n/ep=1, n/st=100, rew=7008.50]                                                                                 


Epoch #73: test_reward: 14530.200000 ± 5404.898201, best_reward: 18345.400000 ± 8063.698819 in #35


Epoch #74: 1001it [00:05, 187.16it/s, env_step=74000, gradient_step=7400, len=188, n/ep=2, n/st=100, rew=12168.25]                                                                                


Epoch #74: test_reward: 10955.500000 ± 4253.399987, best_reward: 18345.400000 ± 8063.698819 in #35


Epoch #75: 1001it [00:04, 212.70it/s, env_step=75000, gradient_step=7500, len=214, n/ep=0, n/st=100, rew=13857.00]                                                                                


Epoch #75: test_reward: 11738.500000 ± 6850.145272, best_reward: 18345.400000 ± 8063.698819 in #35


Epoch #76: 1001it [00:05, 173.43it/s, env_step=76000, gradient_step=7600, len=130, n/ep=1, n/st=100, rew=7186.00]                                                                                 


Epoch #76: test_reward: 11732.300000 ± 5304.402286, best_reward: 18345.400000 ± 8063.698819 in #35


Epoch #77: 1001it [00:05, 182.02it/s, env_step=77000, gradient_step=7700, len=139, n/ep=1, n/st=100, rew=9101.00]                                                                                 


Epoch #77: test_reward: 14060.600000 ± 9781.380099, best_reward: 18345.400000 ± 8063.698819 in #35


Epoch #78: 1001it [00:05, 191.85it/s, env_step=78000, gradient_step=7800, len=124, n/ep=1, n/st=100, rew=5977.00]                                                                                 


Epoch #78: test_reward: 11271.800000 ± 6700.983238, best_reward: 18345.400000 ± 8063.698819 in #35


Epoch #79: 1001it [00:04, 201.40it/s, env_step=79000, gradient_step=7900, len=113, n/ep=0, n/st=100, rew=6844.75]                                                                                 


Epoch #79: test_reward: 12495.900000 ± 3105.134955, best_reward: 18345.400000 ± 8063.698819 in #35


Epoch #80: 1001it [00:05, 193.05it/s, env_step=80000, gradient_step=8000, len=275, n/ep=0, n/st=100, rew=19290.00]                                                                                


Epoch #80: test_reward: 11443.100000 ± 5586.033878, best_reward: 18345.400000 ± 8063.698819 in #35


Epoch #81: 1001it [00:05, 195.03it/s, env_step=81000, gradient_step=8100, len=63, n/ep=0, n/st=100, rew=3348.50]                                                                                  


Epoch #81: test_reward: 9788.700000 ± 2387.998578, best_reward: 18345.400000 ± 8063.698819 in #35


Epoch #82: 1001it [00:05, 197.11it/s, env_step=82000, gradient_step=8200, len=64, n/ep=0, n/st=100, rew=3115.00]                                                                                  


Epoch #82: test_reward: 16856.500000 ± 9055.400347, best_reward: 18345.400000 ± 8063.698819 in #35


Epoch #83: 1001it [00:05, 193.51it/s, env_step=83000, gradient_step=8300, len=143, n/ep=0, n/st=100, rew=8375.00]                                                                                 


Epoch #83: test_reward: 10759.900000 ± 2522.260908, best_reward: 18345.400000 ± 8063.698819 in #35


Epoch #84: 1001it [00:05, 178.76it/s, env_step=84000, gradient_step=8400, len=150, n/ep=1, n/st=100, rew=9706.00]                                                                                 


Epoch #84: test_reward: 10697.800000 ± 4675.466026, best_reward: 18345.400000 ± 8063.698819 in #35


Epoch #85: 1001it [00:05, 198.19it/s, env_step=85000, gradient_step=8500, len=189, n/ep=0, n/st=100, rew=12697.00]                                                                                


Epoch #85: test_reward: 9476.800000 ± 2311.771087, best_reward: 18345.400000 ± 8063.698819 in #35


Epoch #86: 1001it [00:04, 215.63it/s, env_step=86000, gradient_step=8600, len=156, n/ep=0, n/st=100, rew=9869.00]                                                                                 


Epoch #86: test_reward: 11911.800000 ± 5695.301692, best_reward: 18345.400000 ± 8063.698819 in #35


Epoch #87: 1001it [00:04, 243.52it/s, env_step=87000, gradient_step=8700, len=241, n/ep=2, n/st=100, rew=16282.50]                                                                                


Epoch #87: test_reward: 11155.800000 ± 1737.973924, best_reward: 18345.400000 ± 8063.698819 in #35


Epoch #88: 1001it [00:04, 224.86it/s, env_step=88000, gradient_step=8800, len=103, n/ep=1, n/st=100, rew=6257.00]                                                                                 


Epoch #88: test_reward: 12214.100000 ± 2846.994080, best_reward: 18345.400000 ± 8063.698819 in #35


Epoch #89: 1001it [00:04, 240.06it/s, env_step=89000, gradient_step=8900, len=275, n/ep=1, n/st=100, rew=19131.00]                                                                                


Epoch #89: test_reward: 10191.600000 ± 6797.572982, best_reward: 18345.400000 ± 8063.698819 in #35


Epoch #90: 1001it [00:04, 201.73it/s, env_step=90000, gradient_step=9000, len=161, n/ep=0, n/st=100, rew=10351.00]                                                                                


Epoch #90: test_reward: 9871.900000 ± 1778.646083, best_reward: 18345.400000 ± 8063.698819 in #35


Epoch #91: 1001it [00:05, 190.81it/s, env_step=91000, gradient_step=9100, len=169, n/ep=0, n/st=100, rew=10460.00]                                                                                


Epoch #91: test_reward: 10555.600000 ± 3241.216784, best_reward: 18345.400000 ± 8063.698819 in #35


Epoch #92: 1001it [00:05, 192.89it/s, env_step=92000, gradient_step=9200, len=171, n/ep=1, n/st=100, rew=11605.00]                                                                                


Epoch #92: test_reward: 12031.700000 ± 7049.094510, best_reward: 18345.400000 ± 8063.698819 in #35


Epoch #93: 1001it [00:05, 193.30it/s, env_step=93000, gradient_step=9300, len=147, n/ep=1, n/st=100, rew=9581.50]                                                                                 


Epoch #93: test_reward: 12928.600000 ± 3140.401987, best_reward: 18345.400000 ± 8063.698819 in #35


Epoch #94: 1001it [00:05, 188.95it/s, env_step=94000, gradient_step=9400, len=400, n/ep=0, n/st=100, rew=29062.00]                                                                                


Epoch #94: test_reward: 11597.400000 ± 4644.467401, best_reward: 18345.400000 ± 8063.698819 in #35


Epoch #95: 1001it [00:04, 200.25it/s, env_step=95000, gradient_step=9500, len=193, n/ep=1, n/st=100, rew=11853.00]                                                                                


Epoch #95: test_reward: 10690.900000 ± 1674.218412, best_reward: 18345.400000 ± 8063.698819 in #35


Epoch #96: 1001it [00:05, 193.80it/s, env_step=96000, gradient_step=9600, len=117, n/ep=1, n/st=100, rew=7592.50]                                                                                 


Epoch #96: test_reward: 9507.900000 ± 1994.299248, best_reward: 18345.400000 ± 8063.698819 in #35


Epoch #97: 1001it [00:05, 177.86it/s, env_step=97000, gradient_step=9700, len=171, n/ep=0, n/st=100, rew=11645.00]                                                                                


Epoch #97: test_reward: 10111.700000 ± 3239.018402, best_reward: 18345.400000 ± 8063.698819 in #35


Epoch #98: 1001it [00:05, 188.94it/s, env_step=98000, gradient_step=9800, len=281, n/ep=0, n/st=100, rew=20128.00]                                                                                


Epoch #98: test_reward: 11137.400000 ± 3005.655908, best_reward: 18345.400000 ± 8063.698819 in #35


Epoch #99: 1001it [00:05, 199.85it/s, env_step=99000, gradient_step=9900, len=129, n/ep=1, n/st=100, rew=8035.50]                                                                                 


Epoch #99: test_reward: 9999.200000 ± 2406.920431, best_reward: 18345.400000 ± 8063.698819 in #35


Epoch #100: 1001it [00:05, 194.33it/s, env_step=100000, gradient_step=10000, len=151, n/ep=1, n/st=100, rew=9509.00]                                                                              


Epoch #100: test_reward: 17377.000000 ± 7626.624260, best_reward: 18345.400000 ± 8063.698819 in #35

InfoStats(gradient_step=10000, best_reward=18345.4, best_reward_std=8063.6988187803745, train_step=100000, train_episode=528, test_step=177760, test_episode=1010, timing=TimingStats(total_time=590.0176703929901, train_time=440.50773429870605, train_time_collect=41.05334520339966, train_time_update=394.0332233905792, test_time=149.50993609428406, update_speed=227.01077010418734))

(the trained policy can be accessed via policy.policies[agents[1]])


### 🦧 Play different learned policies

In [40]:
for n_agent, agent in enumerate(agents_learned):
    print("----------------------------------------------------")
    winners = 0
    for n_agent_opp, agent_opponent in enumerate(agents_learned):
        if n_agent != n_agent_opp:
            PLAYS = {"bastaushy": 0, "qostaushy": 0}
    
            env = _get_env()#render_mode="human")
    
            policies = MultiAgentPolicyManager(policies=[agent_opponent, agent], env=env)
            env = DummyVectorEnv([lambda: env])
            
            collector = Collector(policies, env)
            
            result = collector.collect(n_episode=1, reset_before_collect=True)
            print(f"Agent {n_agent} plays as Qostaushy with agent_opponent {n_agent_opp}, result: {PLAYS}")
            winners += PLAYS["qostaushy"]
    print(f"Agent {n_agent} wins as Qostaushy {winners} times")

----------------------------------------------------
Agent 0 plays as Qostaushy with agent_opponent 1, result: {'bastaushy': 0, 'qostaushy': 1}
Agent 0 plays as Qostaushy with agent_opponent 2, result: {'bastaushy': 0, 'qostaushy': 0}
Agent 0 plays as Qostaushy with agent_opponent 3, result: {'bastaushy': 1, 'qostaushy': 0}
Agent 0 plays as Qostaushy with agent_opponent 4, result: {'bastaushy': 1, 'qostaushy': 0}
Agent 0 plays as Qostaushy with agent_opponent 5, result: {'bastaushy': 0, 'qostaushy': 1}
Agent 0 plays as Qostaushy with agent_opponent 6, result: {'bastaushy': 1, 'qostaushy': 0}
Agent 0 plays as Qostaushy with agent_opponent 7, result: {'bastaushy': 0, 'qostaushy': 1}
Agent 0 plays as Qostaushy with agent_opponent 8, result: {'bastaushy': 1, 'qostaushy': 0}
Agent 0 plays as Qostaushy with agent_opponent 9, result: {'bastaushy': 0, 'qostaushy': 0}
Agent 0 plays as Qostaushy with agent_opponent 10, result: {'bastaushy': 1, 'qostaushy': 0}
Agent 0 plays as Qostaushy with agen

### 🦉 Play stronger policy with another one

In [44]:
PLAYS = {"bastaushy": 0, "qostaushy": 0}

env = _get_env(render_mode="human")

policies = MultiAgentPolicyManager(policies=[agents_learned[21], agents_learned[28]], env=env)

env = DummyVectorEnv([lambda: env])

collector = Collector(policies, env)

result = collector.collect(n_episode=1, reset_before_collect=True)
print(PLAYS)

{'bastaushy': 0, 'qostaushy': 1}


### 🦎 Play stronger policy with random policy

In [20]:
for model in range(1, 42):
    
    PLAYS = {"bastaushy": 0, "qostaushy": 0}

    env = _get_env()#render_mode="human")
    net = Net(
                state_shape=(22,),
                action_shape=env.action_space.shape or env.action_space.n,
                hidden_sizes=[256, 512, 512, 256],
                device="cuda" if torch.cuda.is_available() else "cpu",
            ).to("cuda" if torch.cuda.is_available() else "cpu")
    
    agent_learned = DQNPolicy(
                model=net,
                optim = torch.optim.Adam(net.parameters(), lr=1e-4),
                discount_factor=0.9,
                estimation_step=3,
                target_update_freq=320,
                action_space=env.action_space
            ).to("cuda" if torch.cuda.is_available() else "cpu")
    
    agent_learned.load_state_dict(torch.load(f"models/policy_dqn_256x512x512x256_{model}.pth"))
    
    policies = MultiAgentPolicyManager(policies=[RandomPolicy(action_space=env.action_space), agent_learned], env=env)
    
    env = DummyVectorEnv([lambda: env])
    
    collector = Collector(policies, env)
    
    result = collector.collect(n_episode=1000, reset_before_collect=True)
    print(f"Agent #{model} plays as qostaushy, result: {PLAYS}")

Agent #1 plays as qostaushy, result: {'bastaushy': 686, 'qostaushy': 207}
Agent #2 plays as qostaushy, result: {'bastaushy': 886, 'qostaushy': 91}
Agent #3 plays as qostaushy, result: {'bastaushy': 779, 'qostaushy': 217}
Agent #4 plays as qostaushy, result: {'bastaushy': 781, 'qostaushy': 187}
Agent #5 plays as qostaushy, result: {'bastaushy': 940, 'qostaushy': 60}
Agent #6 plays as qostaushy, result: {'bastaushy': 663, 'qostaushy': 234}
Agent #7 plays as qostaushy, result: {'bastaushy': 898, 'qostaushy': 94}
Agent #8 plays as qostaushy, result: {'bastaushy': 805, 'qostaushy': 158}
Agent #9 plays as qostaushy, result: {'bastaushy': 859, 'qostaushy': 119}
Agent #10 plays as qostaushy, result: {'bastaushy': 630, 'qostaushy': 359}
Agent #11 plays as qostaushy, result: {'bastaushy': 693, 'qostaushy': 250}
Agent #12 plays as qostaushy, result: {'bastaushy': 962, 'qostaushy': 38}
Agent #13 plays as qostaushy, result: {'bastaushy': 528, 'qostaushy': 208}
Agent #14 plays as qostaushy, result: 