<a href="https://colab.research.google.com/github/zhus-dika/togyz-qumalaq-agent/blob/main/togyzqumalaq_aec_vs_random_policy.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
#!python -m ipykernel install --user --name=venv

#  🐘 AEC environment https://pettingzoo.farama.org/api/aec/#about-aec

### 🐞 Imports

In [1]:
import gymnasium
import os
import numpy as np
from gymnasium.spaces import Discrete, MultiDiscrete
from gymnasium import spaces

from IPython.display import clear_output
import time
from pettingzoo import AECEnv
from pettingzoo.utils import agent_selector, wrappers
import matplotlib.pyplot as plt

NUM_ITERS = 400
PLAYS = {"bastaushy": 0, "qostaushy": 0}

### 🦉 Create environment

In [211]:
def env(render_mode=None):
    """
    The env function often wraps the environment in wrappers by default.
    You can find full documentation for these methods
    elsewhere in the developer documentation.
    """
    internal_render_mode = render_mode if render_mode != "ansi" else "human"
    env = raw_env(render_mode=internal_render_mode)
    # This wrapper is only for environments which print results to the terminal
    if render_mode == "ansi":
        env = wrappers.CaptureStdoutWrapper(env)
    # this wrapper helps error handling for discrete action spaces
    env = wrappers.AssertOutOfBoundsWrapper(env)
    # Provides a wide vareity of helpful user errors
    # Strongly recommended
    env = wrappers.OrderEnforcingWrapper(env)
    return env


class raw_env(AECEnv):
    """
    The metadata holds environment constants. From gymnasium, we inherit the "render_modes",
    metadata which specifies which modes can be put into the render() method.
    At least human mode should be supported.
    The "name" metadata allows the environment to be pretty printed.
    """

    metadata = {
        "render_modes": ["ansi", "human"],
        "name": "togyzqumalaq_v0"
        }

    def __init__(self, render_mode=None):
        """
        The init method takes in environment arguments and
         should define the following attributes:
        - otaular
        - tuzdyq
        - qazandar
        - possible_agents
        - render_mode

        Note: as of v1.18.1, the action_spaces and observation_spaces attributes are deprecated.
        Spaces should be defined in the action_space() and observation_space() methods.
        If these methods are not overridden, spaces will be inferred from self.observation_spaces/action_spaces, raising a warning.

        These attributes should not be changed after initialization.
        """
        self.otaular = []
        self.tuzdyq = []
        self.qazandar = []
        self.direction = []
        self.agents = ["bastaushy", "qostaushy"]
        self.possible_agents = self.agents[:]
        # optional: we can define the observation and action spaces here as attributes to be used in their corresponding methods
        self.action_spaces = {i: spaces.Discrete(9) for i in self.agents}
        self.observation_spaces = {
            i: spaces.Dict(
                {
                    "observation": MultiDiscrete([100] * 18 + [9] * 2 + [82] * 2),
                    "action_mask": Discrete(9),
                }
            )
            for i in self.agents
        }
        self.render_mode = render_mode

    # Observation space should be defined here.
    def action_space(self, agent):
        return self.action_spaces[agent]

    # Action space should be defined here.
    def observation_space(self, agent):
        return self.observation_spaces[agent]

    def render(self):
        """
        Renders the environment. In human mode, it can print to terminal, open
        up a graphical window, or open up some other display that a human can see and understand.
        """
        """Renders the environment."""
        if self.render_mode is None:
            gymnasium.logger.warn(
                "You are calling render method without specifying any render mode."
            )
            return

        if len(self.agents) == 2:
            points_bastaushy_x = np.array([i * 2 for i in range(10)])
            points_bastaushy_y = np.array([i % 5 for i in range(50)])

            x = np.arange(-3, 225, 1)
            y = -1

            text_kwargs = dict(ha='center', va='center', fontsize=12)
            plt.figure(figsize=(17, 6))

            for i in range(9):
                # qostaushy's part
                plt.scatter(np.repeat(points_bastaushy_x + 25 * i, 5)[:self.otaular[17 - i]], points_bastaushy_y[:self.otaular[17 - i]], marker='o')
                # horizontal line
                plt.plot(x, np.repeat(y, len(x)))
                # vertical lines
                plt.plot(np.repeat(25 * i - 2, len(x)), np.arange(-7, 5, 12 / len(x)))
                # bastaushy's part
                plt.scatter(np.repeat(points_bastaushy_x + 25 * i, 5)[:self.otaular[i]], points_bastaushy_y[:self.otaular[i]] - 6, marker='o')

            #last vertical line
            plt.plot(np.repeat(25 * 9 - 2, len(x)), np.arange(-7, 5, 12 / len(x)))

            for i in range(9):
                # bastaushy's qumalaqtar
                plt.text(25 * i + 10, -7, f'{i} ({self.otaular[i]})', **text_kwargs)
                # qostaushy's qumalaqtar
                plt.text(25 * i + 10, 5, f'{17 - i} ({self.otaular[17 - i]})', **text_kwargs)
            # bastaushy qazan's qumalaqtar
            plt.text(230, -4, f'qazan: {self.qazandar[0]}', **text_kwargs)
            # qostaushy qazan's qumalaqtar
            plt.text(230, 2, f'qazan: {self.qazandar[1]}', **text_kwargs);
            # bastaushy tuzdyq's qumalaqtar
            plt.text(230, -6, f'tuzdyq: {self.tuzdyq[0]}', **text_kwargs)
            # qostaushy tuzdyq's qumalaqtar
            plt.text(230, 0, f'tuzdyq: {self.tuzdyq[1]}', **text_kwargs);
            plt.show()
        else:
            if self.render_mode == "human":
                print("Game over")
        time.sleep(2)
        clear_output(True)

    def _legal_moves(self, agent):
        cur_player = self.possible_agents.index(agent)
        opp_player = (cur_player + 1) % 2
        return [item for item in range(9 * cur_player, (cur_player + 1) * 9) if self.tuzdyq[opp_player] != item and self.otaular[item] > 0]

    def observe(self, agent):
        """
        Observe should return the observation of the specified agent. This function
        should return a sane observation (though not necessarily the most up to date possible)
        at any time after reset() is called.
        """
        # observation of one agent is the previous state of the other
        legal_moves = self._legal_moves(agent) if agent == self.agent_selection else []
        action_mask = np.zeros(9, "int8")
        if self.possible_agents.index(agent) == 1:
            legal_moves = [i - 9 for i in legal_moves]
        for i in legal_moves:
            action_mask[i] = 1
        observation = tuple(
            self.otaular + self.tuzdyq + self.qazandar
        )
        return {"observation": observation, "action_mask": action_mask}

    def close(self):
        """
        Close should release any graphical displays, subprocesses, network connections
        or any other environment data which should not be kept around after the
        user is no longer using the environment.
        """
        pass

    def reset(self, seed=None, options=None):
        """
        Reset needs to initialize the following attributes
        - agents
        - rewards
        - _cumulative_rewards
        - terminations
        - truncations
        - infos
        - agent_selection
        And must set up the environment so that render(), step(), and observe()
        can be called without issues.
        Here it sets up the state dictionary which is used by step() and the observations dictionary which is used by step() and observe()
        """
        self.agents = self.possible_agents[:]
        self.rewards = {agent: 0 for agent in self.agents}
        self._cumulative_rewards = {agent: 0 for agent in self.agents}
        self.otaular = [9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9]
        self.direction = [list(range(18)), [9, 10, 11, 12, 13, 14, 15, 16, 17, 0, 1, 2, 3, 4, 5, 6, 7, 8]]
        self.tuzdyq = [-1, -1]
        self.qazandar = [0, 0]
        self.terminations = {agent: False for agent in self.agents}
        self.truncations = {agent: False for agent in self.agents}
        self.infos = {agent: {} for agent in self.agents}
        self.num_moves = 0
        observation = tuple(
            self.otaular + self.tuzdyq + self.qazandar
        )
        self.observations = {agent: observation for agent in self.agents}
        """
        Our agent_selector utility allows easy cyclic stepping through the agents list.
        """
        self._agent_selector = agent_selector(self.agents)
        self.agent_selection = self._agent_selector.next()

    def step(self, action):
        """
        step(action) takes in an action for the current agent (specified by
        agent_selection) and needs to update
        - rewards
        - _cumulative_rewards (accumulating the rewards)
        - terminations
        - truncations
        - infos
        - agent_selection (to the next agent)
        And any internal state used by observe() or render()
        """
        if (
            self.terminations[self.agent_selection]
            or self.truncations[self.agent_selection]
        ):
            # handles stepping an agent which is already dead
            # accepts a None action for the one agent, and moves the agent_selection to
            # the next dead agent,  or if there are no more dead agents, to the next live agent
            self._was_dead_step(action)
            return

        cur_player = self.possible_agents.index(self.agent_selection)
        opp_player = (cur_player + 1) % 2
        self.num_moves += 1
        if self.render_mode == "human":
            print(f'MOVE #{self.num_moves}')
        # The truncations dictionary must be updated for all players.
        self.truncations = {
            agent: self.num_moves >= NUM_ITERS for agent in self.agents
        }
        # distribute qumalaqs
        if cur_player == 1:
            action += 9
        if self.render_mode == "human":
            print(f'{self.agent_selection} made action {action}')
        num_qumalaq = self.otaular[action]
        idx_action = self.direction[cur_player].index(action)
        if self.otaular[action] == 1:
            self.otaular[self.direction[cur_player][idx_action + 1]] += 1
            self.otaular[action] -= 1
        else:
            i = 1
            while self.otaular[action] > 1:
                self.otaular[self.direction[cur_player][(idx_action + i) % 18]] += 1
                self.otaular[action] -= 1
                i += 1
        # check tuzdyq & add rewards to qazandar
        reward = 0
        if self.check_tuzdyq(self.agent_selection, action):
            reward += 3
            if self.render_mode == "human":
                print(f'{self.agent_selection} won tuzdyq {reward}')
        else:

            if num_qumalaq > 1:
                last_otau = self.direction[cur_player][(idx_action + num_qumalaq - 1) % 18]
            else:
                last_otau = self.direction[cur_player][(idx_action + num_qumalaq) % 18]

            if last_otau in range(opp_player * 9, (opp_player + 1) * 9) and self.otaular[last_otau] % 2 == 0:
                reward += self.otaular[last_otau]
                if self.render_mode == "human":
                    print(f'{self.agent_selection} won {reward}')
                self.otaular[last_otau] = 0
            if self.tuzdyq[cur_player] >= 0 and self.otaular[self.tuzdyq[cur_player]] > 0:
                reward += self.otaular[self.tuzdyq[cur_player]]
                if self.render_mode == "human":
                    print(f'{self.agent_selection} won tuzdyq {self.otaular[self.tuzdyq[cur_player]]}')
                self.otaular[self.tuzdyq[cur_player]] = 0
        if self.render_mode == "human":
            print(f'{self.agent_selection} won total {reward}')
        self.qazandar[cur_player] += reward
        self.rewards[self.agent_selection] += reward
        # check if there is a winner
        winner = self.check_for_winner()
        if winner:
            self.terminations = {i: True for i in self.agents}
            if self.render_mode == "human":
                print(f'{self.agent_selection} won the game!!!')
        # selects the next agent.
        self.agent_selection = self._agent_selector.next()
        # Adds .rewards to ._cumulative_rewards
        self._accumulate_rewards()

        if self.render_mode == "human":
            self.render()

    def check_tuzdyq(self, agent, action):
        cur_player = self.possible_agents.index(agent)
        opp_player = (cur_player + 1) % 2
        idx = self.direction[cur_player].index(action)
        num_qumalaq = self.otaular[action]

        if num_qumalaq > 1:
            last_otau = self.direction[cur_player][(idx + num_qumalaq - 1) % 18]
        else:
            last_otau = self.direction[cur_player][(idx + num_qumalaq) % 18]

        if last_otau in range(opp_player * 9, (opp_player + 1) * 9) and self.otaular[last_otau] == 3 and last_otau != 17 - cur_player * 9 and abs(last_otau - self.tuzdyq[opp_player]) != 9:
            self.tuzdyq[cur_player] = last_otau
            self.otaular[last_otau] = 0
            if self.render_mode == "human":
                print(f'{agent} got tuzdyq {last_otau}!')
            return True

        return False

    def check_atsyrau(self, agent):
        cur_player = self.possible_agents.index(agent)
        opp_player = (cur_player + 1) % 2

        for idx, i in enumerate(self.otaular[cur_player * 9: (cur_player + 1) * 9]):
            if i > 0 and idx + cur_player * 9 != self.tuzdyq[opp_player]:
                return False
        if self.render_mode == "human":
            print(f'{agent} reached atsyrau')
        return True

    def check_for_winner(self):
        cur_player = self.possible_agents.index(self.agent_selection)
        opp_player = (cur_player + 1) % 2
        if self.qazandar[cur_player] > 81:
            PLAYS[self.agent_selection] += 1
            return True
        if self.check_atsyrau(self.possible_agents[opp_player]) and self.qazandar[opp_player] <= 81:
            PLAYS[self.agent_selection] += 1
            return True
        return False

### 🦚 Testing environment

In [None]:
# env = env(render_mode="human")
# env.reset(seed=42)

# for agent in env.agent_iter():
#     observation, reward, termination, truncation, info = env.last()

#     if termination or truncation:
#         action = None
#     else:
#         mask = observation["action_mask"]
#         # this is where you would insert your policy
#         action = env.action_space(agent).sample(mask)

#     env.step(action)
# env.close()

# 🐼 DQN agent to play vs a random policy agent https://pettingzoo.farama.org/tutorials/tianshou/intermediate/

### 🐝 Imports

In [27]:
import os
from typing import Optional, Tuple

import gymnasium
import numpy as np
import torch
from copy import deepcopy
from tianshou.data import Collector, VectorReplayBuffer
from tianshou.env import DummyVectorEnv
from tianshou.env.pettingzoo_env import PettingZooEnv
from tianshou.policy import BasePolicy, DQNPolicy, RainbowPolicy, MultiAgentPolicyManager, RandomPolicy
from tianshou.trainer import OffpolicyTrainer
from tianshou.utils.net.common import Net

### 🐎 Load trained agents

In [212]:
agent1_path = "models/policy_128x256x256x128_bs64.pth"
agent2_path = "models/policy_256x512x512x256_bs128.pth"
agent3_path = "models/policy_512x1024x1024x512_bs128.pth"
agent4_path = "models/policy_128x256x512x256x128_trained_128x256x256x128.pth"

agents_learned = []
env = PettingZooEnv(env())
net1 = Net(
            state_shape=(22,),
            action_shape=env.action_space.shape or env.action_space.n,
            hidden_sizes=[128, 256, 256, 128],
            device="cuda" if torch.cuda.is_available() else "cpu",
        ).to("cuda" if torch.cuda.is_available() else "cpu")

agent1_learned = DQNPolicy(
            model=net1,
            optim = torch.optim.Adam(net1.parameters(), lr=1e-4),
            discount_factor=0.9,
            estimation_step=3,
            target_update_freq=320,
            action_space=env.action_space
        ).to("cuda" if torch.cuda.is_available() else "cpu")

agent1_learned.load_state_dict(torch.load(agent1_path))
agents_learned.append(agent1_learned)


net2 = Net(
            state_shape=(22,),
            action_shape=env.action_space.shape or env.action_space.n,
            hidden_sizes=[256, 512, 512, 256],
            device="cuda" if torch.cuda.is_available() else "cpu",
        ).to("cuda" if torch.cuda.is_available() else "cpu")

agent2_learned = DQNPolicy(
            model=net2,
            optim = torch.optim.Adam(net1.parameters(), lr=1e-4),
            discount_factor=0.9,
            estimation_step=3,
            target_update_freq=320,
            action_space=env.action_space
        ).to("cuda" if torch.cuda.is_available() else "cpu")
agent2_learned.load_state_dict(torch.load(agent2_path))
agents_learned.append(agent2_learned)


net3 = Net(
            state_shape=(22,),
            action_shape=env.action_space.shape or env.action_space.n,
            hidden_sizes=[512, 1024, 1024, 512],
            device="cuda" if torch.cuda.is_available() else "cpu",
        ).to("cuda" if torch.cuda.is_available() else "cpu")

agent3_learned = DQNPolicy(
            model=net3,
            optim = torch.optim.Adam(net3.parameters(), lr=1e-4),
            discount_factor=0.9,
            estimation_step=3,
            target_update_freq=320,
            action_space=env.action_space
        ).to("cuda" if torch.cuda.is_available() else "cpu")
agent3_learned.load_state_dict(torch.load(agent3_path))
agents_learned.append(agent3_learned)


net4 = Net(
            state_shape=(22,),
            action_shape=env.action_space.shape or env.action_space.n,
            hidden_sizes=[128, 256, 512, 256, 128],
            device="cuda" if torch.cuda.is_available() else "cpu",
        ).to("cuda" if torch.cuda.is_available() else "cpu")

agent4_learned = DQNPolicy(
            model=net4,
            optim = torch.optim.Adam(net3.parameters(), lr=1e-4),
            discount_factor=0.9,
            estimation_step=3,
            target_update_freq=320,
            action_space=env.action_space
        ).to("cuda" if torch.cuda.is_available() else "cpu")

agent4_learned.load_state_dict(torch.load(agent4_path))
agents_learned.append(agent4_learned)

### 🐫 Prepare main functions

In [217]:
def _get_agents_dqn(
    agent_learn: Optional[BasePolicy] = None,
    agent_opponent: Optional[BasePolicy] = None,
    optim: Optional[torch.optim.Optimizer] = None,
) -> Tuple[BasePolicy, torch.optim.Optimizer, list]:
    env = _get_env()
    observation_space = (
        env.observation_space["observation"]
        if isinstance(env.observation_space, gymnasium.spaces.Dict)
        else env.observation_space
    )
    if agent_learn is None:
        # model
        net = Net(
            state_shape=observation_space.shape or observation_space.n,
            action_shape=env.action_space.shape or env.action_space.n,
            hidden_sizes=[256, 512, 512, 256],
            #hidden_sizes=[1024, 2048, 2048, 1024],
            device="cuda" if torch.cuda.is_available() else "cpu",
        ).to("cuda" if torch.cuda.is_available() else "cpu")

        if optim is None:
            optim = torch.optim.Adam(net.parameters(), lr=1e-4)
        agent_learn = DQNPolicy(
            model=net,
            optim=optim,
            discount_factor=0.9,
            estimation_step=3,
            target_update_freq=320,
            action_space=env.action_space
        ).to("cuda" if torch.cuda.is_available() else "cpu")


    if agent_opponent is None:
        if agent1_path:
            agent_opponent = agent1_learned
        else:
            agent_opponent = RandomPolicy(action_space=env.action_space)

    agents = [agent_opponent, agent_learn]
    #agents = [agent_learn, agent_opponent]
    policy = MultiAgentPolicyManager(policies=agents, env=env)
    return policy, optim, env.agents


def _get_env(render_mode=None):
    """This function is needed to provide callables for DummyVectorEnv."""
    def env(render_mode=None):
        """
        The env function often wraps the environment in wrappers by default.
        You can find full documentation for these methods
        elsewhere in the developer documentation.
        """
        internal_render_mode = render_mode if render_mode != "ansi" else "human"
        env = raw_env(render_mode=internal_render_mode)
        # This wrapper is only for environments which print results to the terminal
        if render_mode == "ansi":
            env = wrappers.CaptureStdoutWrapper(env)
        # this wrapper helps error handling for discrete action spaces
        env = wrappers.AssertOutOfBoundsWrapper(env)
        # Provides a wide vareity of helpful user errors
        # Strongly recommended
        env = wrappers.OrderEnforcingWrapper(env)
        return env
    return PettingZooEnv(env(render_mode=render_mode))

###  🐑 Training code https://tianshou.org/en/stable/01_tutorials/04_tictactoe.html

In [None]:
# Before evaluate this cell run the cell with env
# ======== Step 1: Environment setup =========
train_envs = DummyVectorEnv([_get_env for _ in range(100)])
test_envs = DummyVectorEnv([_get_env for _ in range(100)])

# seed
seed = 11
np.random.seed(seed)
torch.manual_seed(seed)
train_envs.seed(seed)
test_envs.seed(seed)

# ======== Step 2: Agent setup =========
policy, optim, agents = _get_agents_dqn(agent_opponent=agents_learned[1])

# # ======== Step 3: Collector setup =========
train_collector = Collector(
    policy,
    train_envs,
    VectorReplayBuffer(20_000, len(train_envs)),
    exploration_noise=True,
)
test_collector = Collector(policy, test_envs, exploration_noise=True)
# policy.set_eps(1)

# ======== Step 4: Callback functions setup =========
def save_best_fn(policy):
    model_save_path = os.path.join("models", "dqn", "policy.pth")
    os.makedirs(os.path.join("models", "dqn"), exist_ok=True)
    torch.save(policy.policies[agents[1]].state_dict(), model_save_path)

def stop_fn(mean_rewards):
    return mean_rewards >= 21000

def train_fn(epoch, env_step):
    policy.policies[agents[1]].set_eps(0.1)

def test_fn(epoch, env_step):
    policy.policies[agents[1]].set_eps(0.05)

def reward_metric(rews):
    return rews[:, 1]

# ======== Step 5: Run the trainer =========
result = OffpolicyTrainer(
    policy=policy,
    train_collector=train_collector,
    test_collector=test_collector,
    max_epoch=150,
    step_per_epoch=1000,
    step_per_collect=50,
    episode_per_test=10,
    batch_size=256,
    train_fn=train_fn,
    test_fn=test_fn,
    stop_fn=stop_fn,
    save_best_fn=save_best_fn,
    update_per_step=0.1,
    test_in_train=False,
    reward_metric=reward_metric,
    verbose=True
).run()

# return result, policy.policies[agents[1]]
print(f"\n==========Result==========\n{result}")
print("\n(the trained policy can be accessed via policy.policies[agents[1]])")

### 🐙 Evaluate best Qostaushy agent with random policy

In [67]:
PLAYS = {"bastaushy": 0, "qostaushy": 0}
# Step 1: Load the PettingZoo environment
env = env()#render_mode="human")

# Step 2: Wrap the environment for Tianshou interfacing
env = PettingZooEnv(env)

# # Step 3: Define policies for each agent
policies = MultiAgentPolicyManager(policies=[RandomPolicy(action_space=env.action_space), agents_learned[3]], env=env)
# # Step 4: Convert the env to vector format
env = DummyVectorEnv([lambda: env])

# # Step 5: Construct the Collector, which interfaces the policies with the vectorised environment
collector = Collector(policies, env)

# # Step 6: Execute the environment with the agents playing for 1 episode, and render a frame every ? seconds
result = collector.collect(n_episode=100, reset_before_collect=True)
print(PLAYS)

{'bastaushy': 85, 'qostaushy': 13}


🐳 Experiments

1.

 net = Net(
            state_shape=observation_space.shape or observation_space.n,
            action_shape=env.action_space.shape or env.action_space.n,
            hidden_sizes=[128, 256, 256, 128],
            device="cuda" if torch.cuda.is_available() else "cpu",
        ).to("cuda" if torch.cuda.is_available() else "cpu")

train_collector.collect(n_step=64 * 100)  # batch size * training_num

res: {'bastaushy': 781, 'qostaushy': 194}

res: {'bastaushy': 790, 'qostaushy': 191}

2.

 net = Net(
            state_shape=observation_space.shape or observation_space.n,
            action_shape=env.action_space.shape or env.action_space.n,
            hidden_sizes=[128, 256, 256, 128],
            device="cuda" if torch.cuda.is_available() else "cpu",
        ).to("cuda" if torch.cuda.is_available() else "cpu")

train_collector.collect(n_step=256 * 100)  # batch size * training_num

res: 6/3

3.

net = Net(
            state_shape=observation_space.shape or observation_space.n,
            action_shape=env.action_space.shape or env.action_space.n,
            hidden_sizes=[128, 256, 256, 128],
            device="cuda" if torch.cuda.is_available() else "cpu",
        ).to("cuda" if torch.cuda.is_available() else "cpu")

train_collector.collect(n_step=128 * 100)  # batch size * training_num     

{'bastaushy': 537, 'qostaushy': 411}

4.

net = Net(
            state_shape=observation_space.shape or observation_space.n,
            action_shape=env.action_space.shape or env.action_space.n,
            hidden_sizes=[256, 512, 512, 256],
            device="cuda" if torch.cuda.is_available() else "cpu",
        ).to("cuda" if torch.cuda.is_available() else "cpu")

train_collector.collect(n_step=128 * 100)  # batch size * training_num   

res: {'bastaushy': 493, 'qostaushy': 483}

res: {'bastaushy': 512, 'qostaushy': 464}

5.

net = Net(
            state_shape=observation_space.shape or observation_space.n,
            action_shape=env.action_space.shape or env.action_space.n,
            hidden_sizes=[512, 1024, 1024, 512],
            device="cuda" if torch.cuda.is_available() else "cpu",
        ).to("cuda" if torch.cuda.is_available() else "cpu")

res: {'bastaushy': 38, 'qostaushy': 59}

res: {'bastaushy': 372, 'qostaushy': 562}

### 🦎 Play with different policies

In [77]:
PLAYS = {"bastaushy": 0, "qostaushy": 0}
# Step 1: Load the PettingZoo environment
env = env()#render_mode="human")

# Step 2: Wrap the environment for Tianshou interfacing
env = PettingZooEnv(env)

# # Step 3: Define policies for each agent
policies = MultiAgentPolicyManager(policies=[agents_learned[0], agents_learned[2]], env=env)
# # Step 4: Convert the env to vector format
env = DummyVectorEnv([lambda: env])

# # Step 5: Construct the Collector, which interfaces the policies with the vectorised environment
collector = Collector(policies, env)

# # Step 6: Execute the environment with the agents playing for 1 episode, and render a frame every 2 seconds
result = collector.collect(n_episode=1, reset_before_collect=True)
print(PLAYS)

{'bastaushy': 0, 'qostaushy': 1}


🐯 Experiment results


*   agent3 vs agent1: 0-0
*   agent1 vs agent3: 0-1
*   agent3 vs agent2: 0-1
*   agent2 vs agent3: 1-0
*   agent2 vs agent1: 0-1
*   agent1 vs agent2: 1-0
*   trained with agent1 vs agent1: 1-0
*   agent1 vs trained with agent1: 0-0
*   trained with agent1 vs agent2: 0-0
*   agent2 vs trained with agent1: 0-0
*   trained with agent1 vs agent3: 1-0
*   agent3 vs trained with agent1: 0-0



# 🦩 PPO policy training

### 🐊 Change files tianshou.utils.net.common & tianshou.policy.modelfree.pgpolicy

In [None]:
#common.py
from abc import ABC, abstractmethod
from collections.abc import Callable, Sequence
from typing import Any, Generic, TypeAlias, TypeVar, cast, no_type_check

import numpy as np
import torch
from torch import nn

from tianshou.data import to_torch_as
from tianshou.data.batch import Batch
from tianshou.data.types import RecurrentStateBatch

ModuleType = type[nn.Module]
ArgsType = tuple[Any, ...] | dict[Any, Any] | Sequence[tuple[Any, ...]] | Sequence[dict[Any, Any]]
TActionShape: TypeAlias = Sequence[int] | int | np.int64
TLinearLayer: TypeAlias = Callable[[int, int], nn.Module]
T = TypeVar("T")


def miniblock(
    input_size: int,
    output_size: int = 0,
    norm_layer: ModuleType | None = None,
    norm_args: tuple[Any, ...] | dict[Any, Any] | None = None,
    activation: ModuleType | None = None,
    act_args: tuple[Any, ...] | dict[Any, Any] | None = None,
    linear_layer: TLinearLayer = nn.Linear,
) -> list[nn.Module]:
    """Construct a miniblock with given input/output-size, norm layer and activation."""
    layers: list[nn.Module] = [linear_layer(input_size, output_size)]
    if norm_layer is not None:
        if isinstance(norm_args, tuple):
            layers += [norm_layer(output_size, *norm_args)]
        elif isinstance(norm_args, dict):
            layers += [norm_layer(output_size, **norm_args)]
        else:
            layers += [norm_layer(output_size)]
    if activation is not None:
        if isinstance(act_args, tuple):
            layers += [activation(*act_args)]
        elif isinstance(act_args, dict):
            layers += [activation(**act_args)]
        else:
            layers += [activation()]
    return layers


class MLP(nn.Module):
    """Simple MLP backbone.

    Create a MLP of size input_dim * hidden_sizes[0] * hidden_sizes[1] * ...
    * hidden_sizes[-1] * output_dim

    :param input_dim: dimension of the input vector.
    :param output_dim: dimension of the output vector. If set to 0, there
        is no final linear layer.
    :param hidden_sizes: shape of MLP passed in as a list, not including
        input_dim and output_dim.
    :param norm_layer: use which normalization before activation, e.g.,
        ``nn.LayerNorm`` and ``nn.BatchNorm1d``. Default to no normalization.
        You can also pass a list of normalization modules with the same length
        of hidden_sizes, to use different normalization module in different
        layers. Default to no normalization.
    :param activation: which activation to use after each layer, can be both
        the same activation for all layers if passed in nn.Module, or different
        activation for different Modules if passed in a list. Default to
        nn.ReLU.
    :param device: which device to create this model on. Default to None.
    :param linear_layer: use this module as linear layer. Default to nn.Linear.
    :param flatten_input: whether to flatten input data. Default to True.
    """

    def __init__(
        self,
        input_dim: int,
        output_dim: int = 0,
        hidden_sizes: Sequence[int] = (),
        norm_layer: ModuleType | Sequence[ModuleType] | None = None,
        norm_args: ArgsType | None = None,
        activation: ModuleType | Sequence[ModuleType] | None = nn.ReLU,
        act_args: ArgsType | None = None,
        device: str | int | torch.device | None = None,
        linear_layer: TLinearLayer = nn.Linear,
        flatten_input: bool = True,
    ) -> None:
        super().__init__()
        self.device = device
        if norm_layer:
            if isinstance(norm_layer, list):
                assert len(norm_layer) == len(hidden_sizes)
                norm_layer_list = norm_layer
                if isinstance(norm_args, list):
                    assert len(norm_args) == len(hidden_sizes)
                    norm_args_list = norm_args
                else:
                    norm_args_list = [norm_args for _ in range(len(hidden_sizes))]
            else:
                norm_layer_list = [norm_layer for _ in range(len(hidden_sizes))]
                norm_args_list = [norm_args for _ in range(len(hidden_sizes))]
        else:
            norm_layer_list = [None] * len(hidden_sizes)
            norm_args_list = [None] * len(hidden_sizes)
        if activation:
            if isinstance(activation, list):
                assert len(activation) == len(hidden_sizes)
                activation_list = activation
                if isinstance(act_args, list):
                    assert len(act_args) == len(hidden_sizes)
                    act_args_list = act_args
                else:
                    act_args_list = [act_args for _ in range(len(hidden_sizes))]
            else:
                activation_list = [activation for _ in range(len(hidden_sizes))]
                act_args_list = [act_args for _ in range(len(hidden_sizes))]
        else:
            activation_list = [None] * len(hidden_sizes)
            act_args_list = [None] * len(hidden_sizes)
        hidden_sizes = [input_dim, *list(hidden_sizes)]
        model = []
        for in_dim, out_dim, norm, norm_args, activ, act_args in zip(
            hidden_sizes[:-1],
            hidden_sizes[1:],
            norm_layer_list,
            norm_args_list,
            activation_list,
            act_args_list,
            strict=True,
        ):
            model += miniblock(in_dim, out_dim, norm, norm_args, activ, act_args, linear_layer)
        if output_dim > 0:
            model += [linear_layer(hidden_sizes[-1], output_dim)]
        self.output_dim = output_dim or hidden_sizes[-1]
        self.model = nn.Sequential(*model)
        self.flatten_input = flatten_input

    @no_type_check
    def forward(self, obs: np.ndarray | torch.Tensor) -> torch.Tensor:
        obs = torch.as_tensor(obs, device=self.device, dtype=torch.float32)
        if self.flatten_input:
            obs = obs.flatten(1)
        return self.model(obs)


TRecurrentState = TypeVar("TRecurrentState", bound=Any)


class NetBase(nn.Module, Generic[TRecurrentState], ABC):
    """Interface for NNs used in policies."""

    @abstractmethod
    def forward(
        self,
        obs: np.ndarray | torch.Tensor,
        state: TRecurrentState | None = None,
        info: dict[str, Any] | None = None,
    ) -> tuple[torch.Tensor, TRecurrentState | None]:
        pass


class Net(NetBase[Any]):
    """Wrapper of MLP to support more specific DRL usage.

    For advanced usage (how to customize the network), please refer to
    :ref:`build_the_network`.

    :param state_shape: int or a sequence of int of the shape of state.
    :param action_shape: int or a sequence of int of the shape of action.
    :param hidden_sizes: shape of MLP passed in as a list.
    :param norm_layer: use which normalization before activation, e.g.,
        ``nn.LayerNorm`` and ``nn.BatchNorm1d``. Default to no normalization.
        You can also pass a list of normalization modules with the same length
        of hidden_sizes, to use different normalization module in different
        layers. Default to no normalization.
    :param activation: which activation to use after each layer, can be both
        the same activation for all layers if passed in nn.Module, or different
        activation for different Modules if passed in a list. Default to
        nn.ReLU.
    :param device: specify the device when the network actually runs. Default
        to "cpu".
    :param softmax: whether to apply a softmax layer over the last layer's
        output.
    :param concat: whether the input shape is concatenated by state_shape
        and action_shape. If it is True, ``action_shape`` is not the output
        shape, but affects the input shape only.
    :param num_atoms: in order to expand to the net of distributional RL.
        Default to 1 (not use).
    :param dueling_param: whether to use dueling network to calculate Q
        values (for Dueling DQN). If you want to use dueling option, you should
        pass a tuple of two dict (first for Q and second for V) stating
        self-defined arguments as stated in
        class:`~tianshou.utils.net.common.MLP`. Default to None.
    :param linear_layer: use this module constructor, which takes the input
        and output dimension as input, as linear layer. Default to nn.Linear.

    .. seealso::

        Please refer to :class:`~tianshou.utils.net.common.MLP` for more
        detailed explanation on the usage of activation, norm_layer, etc.

        You can also refer to :class:`~tianshou.utils.net.continuous.Actor`,
        :class:`~tianshou.utils.net.continuous.Critic`, etc, to see how it's
        suggested be used.
    """

    def __init__(
        self,
        state_shape: int | Sequence[int],
        action_shape: TActionShape = 0,
        hidden_sizes: Sequence[int] = (),
        norm_layer: ModuleType | Sequence[ModuleType] | None = None,
        norm_args: ArgsType | None = None,
        activation: ModuleType | Sequence[ModuleType] | None = nn.ReLU,
        act_args: ArgsType | None = None,
        device: str | int | torch.device = "cpu",
        softmax: bool = False,
        concat: bool = False,
        num_atoms: int = 1,
        dueling_param: tuple[dict[str, Any], dict[str, Any]] | None = None,
        linear_layer: TLinearLayer = nn.Linear,
    ) -> None:
        super().__init__()
        self.device = device
        self.softmax = softmax
        self.num_atoms = num_atoms
        self.Q: MLP | None = None
        self.V: MLP | None = None

        input_dim = int(np.prod(state_shape))
        action_dim = int(np.prod(action_shape)) * num_atoms
        if concat:
            input_dim += action_dim
        self.use_dueling = dueling_param is not None
        output_dim = action_dim if not self.use_dueling and not concat else 0
        self.model = MLP(
            input_dim,
            output_dim,
            hidden_sizes,
            norm_layer,
            norm_args,
            activation,
            act_args,
            device,
            linear_layer,
        )
        if self.use_dueling:  # dueling DQN
            assert dueling_param is not None
            kwargs_update = {
                "input_dim": self.model.output_dim,
                "device": self.device,
            }
            # Important: don't change the original dict (e.g., don't use .update())
            q_kwargs = {**dueling_param[0], **kwargs_update}
            v_kwargs = {**dueling_param[1], **kwargs_update}

            q_kwargs["output_dim"] = 0 if concat else action_dim
            v_kwargs["output_dim"] = 0 if concat else num_atoms
            self.Q, self.V = MLP(**q_kwargs), MLP(**v_kwargs)
            self.output_dim = self.Q.output_dim
        else:
            self.output_dim = self.model.output_dim

    def forward(
        self,
        obs: np.ndarray | torch.Tensor,
        state: Any = None,
        info: dict[str, Any] | None = None,
    ) -> tuple[torch.Tensor, Any]:
        """Mapping: obs -> flatten (inside MLP)-> logits.

        :param obs:
        :param state: unused and returned as is
        :param info: unused
        """
        if hasattr(obs, "obs"):
            obs = obs.obs
        logits = self.model(obs)
        batch_size = logits.shape[0]
        if self.use_dueling:  # Dueling DQN
            assert self.Q is not None
            assert self.V is not None
            q, v = self.Q(logits), self.V(logits)
            if self.num_atoms > 1:
                q = q.view(batch_size, -1, self.num_atoms)
                v = v.view(batch_size, -1, self.num_atoms)
            logits = q - q.mean(dim=1, keepdim=True) + v
        elif self.num_atoms > 1:
            logits = logits.view(batch_size, -1, self.num_atoms)
        if self.softmax:
            logits = torch.softmax(logits, dim=-1)
        return logits, state


class Recurrent(NetBase[RecurrentStateBatch]):
    """Simple Recurrent network based on LSTM.

    For advanced usage (how to customize the network), please refer to
    :ref:`build_the_network`.
    """

    def __init__(
        self,
        layer_num: int,
        state_shape: int | Sequence[int],
        action_shape: TActionShape,
        device: str | int | torch.device = "cpu",
        hidden_layer_size: int = 128,
    ) -> None:
        super().__init__()
        self.device = device
        self.nn = nn.LSTM(
            input_size=hidden_layer_size,
            hidden_size=hidden_layer_size,
            num_layers=layer_num,
            batch_first=True,
        )
        self.fc1 = nn.Linear(int(np.prod(state_shape)), hidden_layer_size)
        self.fc2 = nn.Linear(hidden_layer_size, int(np.prod(action_shape)))

    def forward(
        self,
        obs: np.ndarray | torch.Tensor,
        state: RecurrentStateBatch | None = None,
        info: dict[str, Any] | None = None,
    ) -> tuple[torch.Tensor, RecurrentStateBatch]:
        """Mapping: obs -> flatten -> logits.

        In the evaluation mode, `obs` should be with shape ``[bsz, dim]``; in the
        training mode, `obs` should be with shape ``[bsz, len, dim]``. See the code
        and comment for more detail.

        :param obs:
        :param state: either None or a dict with keys 'hidden' and 'cell'
        :param info: unused
        :return: predicted action, next state as dict with keys 'hidden' and 'cell'
        """
        # Note: the original type of state is Batch but it might also be a dict
        # If it is a Batch, .issubset(state) will not work. However,
        # issubset(state.keys()) always works
        if state is not None and not {"hidden", "cell"}.issubset(state.keys()):
            raise ValueError(
                f"Expected to find keys 'hidden' and 'cell' but instead found {state.keys()}",
            )

        obs = torch.as_tensor(obs, device=self.device, dtype=torch.float32)
        # obs [bsz, len, dim] (training) or [bsz, dim] (evaluation)
        # In short, the tensor's shape in training phase is longer than which
        # in evaluation phase.
        if len(obs.shape) == 2:
            obs = obs.unsqueeze(-2)
        obs = self.fc1(obs)
        self.nn.flatten_parameters()
        if state is None:
            obs, (hidden, cell) = self.nn(obs)
        else:
            # we store the stack data in [bsz, len, ...] format
            # but pytorch rnn needs [len, bsz, ...]
            obs, (hidden, cell) = self.nn(
                obs,
                (
                    state["hidden"].transpose(0, 1).contiguous(),
                    state["cell"].transpose(0, 1).contiguous(),
                ),
            )
        obs = self.fc2(obs[:, -1])
        # please ensure the first dim is batch size: [bsz, len, ...]
        rnn_state_batch = cast(
            RecurrentStateBatch,
            Batch(
                {
                    "hidden": hidden.transpose(0, 1).detach(),
                    "cell": cell.transpose(0, 1).detach(),
                },
            ),
        )
        return obs, rnn_state_batch


class ActorCritic(nn.Module):
    """An actor-critic network for parsing parameters.

    Using ``actor_critic.parameters()`` instead of set.union or list+list to avoid
    issue #449.

    :param nn.Module actor: the actor network.
    :param nn.Module critic: the critic network.
    """

    def __init__(self, actor: nn.Module, critic: nn.Module) -> None:
        super().__init__()
        self.actor = actor
        self.critic = critic


class DataParallelNet(nn.Module):
    """DataParallel wrapper for training agent with multi-GPU.

    This class does only the conversion of input data type, from numpy array to torch's
    Tensor. If the input is a nested dictionary, the user should create a similar class
    to do the same thing.

    :param nn.Module net: the network to be distributed in different GPUs.
    """

    def __init__(self, net: nn.Module) -> None:
        super().__init__()
        self.net = nn.DataParallel(net)

    def forward(
        self,
        obs: np.ndarray | torch.Tensor,
        *args: Any,
        **kwargs: Any,
    ) -> tuple[Any, Any]:
        if not isinstance(obs, torch.Tensor):
            obs = torch.as_tensor(obs, dtype=torch.float32)
        return self.net(obs=obs.cuda(), *args, **kwargs)  # noqa: B026


class EnsembleLinear(nn.Module):
    """Linear Layer of Ensemble network.

    :param ensemble_size: Number of subnets in the ensemble.
    :param in_feature: dimension of the input vector.
    :param out_feature: dimension of the output vector.
    :param bias: whether to include an additive bias, default to be True.
    """

    def __init__(
        self,
        ensemble_size: int,
        in_feature: int,
        out_feature: int,
        bias: bool = True,
    ) -> None:
        super().__init__()

        # To be consistent with PyTorch default initializer
        k = np.sqrt(1.0 / in_feature)
        weight_data = torch.rand((ensemble_size, in_feature, out_feature)) * 2 * k - k
        self.weight = nn.Parameter(weight_data, requires_grad=True)

        self.bias_weights: nn.Parameter | None = None
        if bias:
            bias_data = torch.rand((ensemble_size, 1, out_feature)) * 2 * k - k
            self.bias_weights = nn.Parameter(bias_data, requires_grad=True)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        x = torch.matmul(x, self.weight)
        if self.bias_weights is not None:
            x = x + self.bias_weights
        return x


# TODO: fix docstring
class BranchingNet(NetBase[Any]):
    """Branching dual Q network.

    Network for the BranchingDQNPolicy, it uses a common network module, a value module
    and action "branches" one for each dimension.It allows for a linear scaling
    of Q-value the output w.r.t. the number of dimensions in the action space.
    For more info please refer to: arXiv:1711.08946.
    :param state_shape: int or a sequence of int of the shape of state.
    :param action_shape: int or a sequence of int of the shape of action.
    :param action_peer_branch: int or a sequence of int of the number of actions in
    each dimension.
    :param common_hidden_sizes: shape of the common MLP network passed in as a list.
    :param value_hidden_sizes: shape of the value MLP network passed in as a list.
    :param action_hidden_sizes: shape of the action MLP network passed in as a list.
    :param norm_layer: use which normalization before activation, e.g.,
    ``nn.LayerNorm`` and ``nn.BatchNorm1d``. Default to no normalization.
    You can also pass a list of normalization modules with the same length
    of hidden_sizes, to use different normalization module in different
    layers. Default to no normalization.
    :param activation: which activation to use after each layer, can be both
    the same activation for all layers if passed in nn.Module, or different
    activation for different Modules if passed in a list. Default to
    nn.ReLU.
    :param device: specify the device when the network actually runs. Default
    to "cpu".
    :param softmax: whether to apply a softmax layer over the last layer's
    output.
    """

    def __init__(
        self,
        state_shape: int | Sequence[int],
        num_branches: int = 0,
        action_per_branch: int = 2,
        common_hidden_sizes: list[int] | None = None,
        value_hidden_sizes: list[int] | None = None,
        action_hidden_sizes: list[int] | None = None,
        norm_layer: ModuleType | None = None,
        norm_args: ArgsType | None = None,
        activation: ModuleType | None = nn.ReLU,
        act_args: ArgsType | None = None,
        device: str | int | torch.device = "cpu",
    ) -> None:
        super().__init__()
        common_hidden_sizes = common_hidden_sizes or []
        value_hidden_sizes = value_hidden_sizes or []
        action_hidden_sizes = action_hidden_sizes or []

        self.device = device
        self.num_branches = num_branches
        self.action_per_branch = action_per_branch
        # common network
        common_input_dim = int(np.prod(state_shape))
        common_output_dim = 0
        self.common = MLP(
            common_input_dim,
            common_output_dim,
            common_hidden_sizes,
            norm_layer,
            norm_args,
            activation,
            act_args,
            device,
        )
        # value network
        value_input_dim = common_hidden_sizes[-1]
        value_output_dim = 1
        self.value = MLP(
            value_input_dim,
            value_output_dim,
            value_hidden_sizes,
            norm_layer,
            norm_args,
            activation,
            act_args,
            device,
        )
        # action branching network
        action_input_dim = common_hidden_sizes[-1]
        action_output_dim = action_per_branch
        self.branches = nn.ModuleList(
            [
                MLP(
                    action_input_dim,
                    action_output_dim,
                    action_hidden_sizes,
                    norm_layer,
                    norm_args,
                    activation,
                    act_args,
                    device,
                )
                for _ in range(self.num_branches)
            ],
        )

    def forward(
        self,
        obs: np.ndarray | torch.Tensor,
        state: Any = None,
        info: dict[str, Any] | None = None,
    ) -> tuple[torch.Tensor, Any]:
        """Mapping: obs -> model -> logits."""
        common_out = self.common(obs)
        value_out = self.value(common_out)
        value_out = torch.unsqueeze(value_out, 1)
        action_out = []
        for b in self.branches:
            action_out.append(b(common_out))
        action_scores = torch.stack(action_out, 1)
        action_scores = action_scores - torch.mean(action_scores, 2, keepdim=True)
        logits = value_out + action_scores
        return logits, state


def get_dict_state_decorator(
    state_shape: dict[str, int | Sequence[int]],
    keys: Sequence[str],
) -> tuple[Callable, int]:
    """A helper function to make Net or equivalent classes (e.g. Actor, Critic) applicable to dict state.

    The first return item, ``decorator_fn``, will alter the implementation of forward
    function of the given class by preprocessing the observation. The preprocessing is
    basically flatten the observation and concatenate them based on the ``keys`` order.
    The batch dimension is preserved if presented. The result observation shape will
    be equal to ``new_state_shape``, the second return item.

    :param state_shape: A dictionary indicating each state's shape
    :param keys: A list of state's keys. The flatten observation will be according to
        this list order.
    :returns: a 2-items tuple ``decorator_fn`` and ``new_state_shape``
    """
    original_shape = state_shape
    flat_state_shapes = []
    for k in keys:
        flat_state_shapes.append(int(np.prod(state_shape[k])))
    new_state_shape = sum(flat_state_shapes)

    def preprocess_obs(obs: Batch | dict | torch.Tensor | np.ndarray) -> torch.Tensor:
        if isinstance(obs, dict) or (isinstance(obs, Batch) and keys[0] in obs):
            if original_shape[keys[0]] == obs[keys[0]].shape:
                # No batch dim
                new_obs = torch.Tensor([obs[k] for k in keys]).flatten()
                # new_obs = torch.Tensor([obs[k] for k in keys]).reshape(1, -1)
            else:
                bsz = obs[keys[0]].shape[0]
                new_obs = torch.cat([torch.Tensor(obs[k].reshape(bsz, -1)) for k in keys], dim=1)
        else:
            new_obs = torch.Tensor(obs)
        return new_obs

    @no_type_check
    def decorator_fn(net_class):
        class new_net_class(net_class):
            def forward(self, obs: np.ndarray | torch.Tensor, *args, **kwargs) -> Any:
                return super().forward(preprocess_obs(obs), *args, **kwargs)

        return new_net_class

    return decorator_fn, new_state_shape


class BaseActor(nn.Module, ABC):
    @abstractmethod
    def get_preprocess_net(self) -> nn.Module:
        pass

    @abstractmethod
    def get_output_dim(self) -> int:
        pass

    @abstractmethod
    def forward(
        self,
        obs: np.ndarray | torch.Tensor,
        state: Any = None,
        info: dict[str, Any] | None = None,
    ) -> tuple[Any, Any]:
        # TODO: ALGO-REFACTORING. Marked to be addressed as part of Algorithm abstraction.
        #  Return type needs to be more specific
        pass


def getattr_with_matching_alt_value(obj: Any, attr_name: str, alt_value: T | None) -> T:
    """Gets the given attribute from the given object or takes the alternative value if it is not present.
    If both are present, they are required to match.

    :param obj: the object from which to obtain the attribute value
    :param attr_name: the attribute name
    :param alt_value: the alternative value for the case where the attribute is not present, which cannot be None
        if the attribute is not present
    :return: the value
    """
    v = getattr(obj, attr_name)
    if v is not None:
        if alt_value is not None and v != alt_value:
            raise ValueError(
                f"Attribute '{attr_name}' of {obj} is defined ({v}) but does not match alt. value ({alt_value})",
            )
        return v
    else:
        if alt_value is None:
            raise ValueError(
                f"Attribute '{attr_name}' of {obj} is not defined and no fallback given",
            )
        return alt_value


def get_output_dim(module: nn.Module, alt_value: int | None) -> int:
    """Retrieves value the `output_dim` attribute of the given module or uses the given alternative value if the attribute is not present.
    If both are present, they must match.

    :param module: the module
    :param alt_value: the alternative value
    :return: the value
    """
    return getattr_with_matching_alt_value(module, "output_dim", alt_value)


In [None]:
# pgpolicy.py
import warnings
from collections.abc import Callable
from dataclasses import dataclass
from typing import Any, Generic, Literal, TypeVar, cast

import gymnasium as gym
import numpy as np
import torch

from tianshou.data import (
    Batch,
    ReplayBuffer,
    SequenceSummaryStats,
    to_torch,
    to_torch_as,
)
from tianshou.data.batch import BatchProtocol
from tianshou.data.types import (
    BatchWithReturnsProtocol,
    DistBatchProtocol,
    ObsBatchProtocol,
    RolloutBatchProtocol,
)
from tianshou.policy import BasePolicy
from tianshou.policy.base import TLearningRateScheduler, TrainingStats
from tianshou.utils import RunningMeanStd
from tianshou.utils.net.continuous import ActorProb
from tianshou.utils.net.discrete import Actor

# Dimension Naming Convention
# B - Batch Size
# A - Action
# D - Dist input (usually 2, loc and scale)
# H - Dimension of hidden, can be None

TDistFnContinuous = Callable[
    [tuple[torch.Tensor, torch.Tensor]],
    torch.distributions.Distribution,
]
TDistFnDiscrete = Callable[[torch.Tensor], torch.distributions.Categorical]

TDistFnDiscrOrCont = TDistFnContinuous | TDistFnDiscrete


@dataclass(kw_only=True)
class PGTrainingStats(TrainingStats):
    loss: SequenceSummaryStats


TPGTrainingStats = TypeVar("TPGTrainingStats", bound=PGTrainingStats)


class PGPolicy(BasePolicy[TPGTrainingStats], Generic[TPGTrainingStats]):
    """Implementation of REINFORCE algorithm.

    :param actor: the actor network following the rules:
        If `self.action_type == "discrete"`: (`s_B` ->`action_values_BA`).
        If `self.action_type == "continuous"`: (`s_B` -> `dist_input_BD`).
    :param optim: optimizer for actor network.
    :param dist_fn: distribution class for computing the action.
        Maps model_output -> distribution. Typically a Gaussian distribution
        taking `model_output=mean,std` as input for continuous action spaces,
        or a categorical distribution taking `model_output=logits`
        for discrete action spaces. Note that as user, you are responsible
        for ensuring that the distribution is compatible with the action space.
    :param action_space: env's action space.
    :param discount_factor: in [0, 1].
    :param reward_normalization: if True, will normalize the *returns*
        by subtracting the running mean and dividing by the running standard deviation.
        Can be detrimental to performance! See TODO in process_fn.
    :param deterministic_eval: if True, will use deterministic action (the dist's mode)
        instead of stochastic one during evaluation. Does not affect training.
    :param observation_space: Env's observation space.
    :param action_scaling: if True, scale the action from [-1, 1] to the range
        of action_space. Only used if the action_space is continuous.
    :param action_bound_method: method to bound action to range [-1, 1].
        Only used if the action_space is continuous.
    :param lr_scheduler: if not None, will be called in `policy.update()`.

    .. seealso::

        Please refer to :class:`~tianshou.policy.BasePolicy` for more detailed explanation.
    """

    def __init__(
        self,
        *,
        actor: torch.nn.Module | ActorProb | Actor,
        optim: torch.optim.Optimizer,
        dist_fn: TDistFnDiscrOrCont,
        action_space: gym.Space,
        discount_factor: float = 0.99,
        # TODO: rename to return_normalization?
        reward_normalization: bool = False,
        deterministic_eval: bool = False,
        observation_space: gym.Space | None = None,
        # TODO: why change the default from the base?
        action_scaling: bool = True,
        action_bound_method: Literal["clip", "tanh"] | None = "clip",
        lr_scheduler: TLearningRateScheduler | None = None,
    ) -> None:
        super().__init__(
            action_space=action_space,
            observation_space=observation_space,
            action_scaling=action_scaling,
            action_bound_method=action_bound_method,
            lr_scheduler=lr_scheduler,
        )
        if action_scaling and not np.isclose(actor.max_action, 1.0):
            warnings.warn(
                "action_scaling and action_bound_method are only intended"
                "to deal with unbounded model action space, but find actor model"
                f"bound action space with max_action={actor.max_action}."
                "Consider using unbounded=True option of the actor model,"
                "or set action_scaling to False and action_bound_method to None.",
            )
        self.actor = actor
        self.optim = optim
        self.dist_fn = dist_fn
        assert 0.0 <= discount_factor <= 1.0, "discount factor should be in [0, 1]"
        self.gamma = discount_factor
        self.rew_norm = reward_normalization
        self.ret_rms = RunningMeanStd()
        self._eps = 1e-8
        self.deterministic_eval = deterministic_eval

    def process_fn(
        self,
        batch: RolloutBatchProtocol,
        buffer: ReplayBuffer,
        indices: np.ndarray,
    ) -> BatchWithReturnsProtocol:
        r"""Compute the discounted returns (Monte Carlo estimates) for each transition.

        They are added to the batch under the field `returns`.
        Note: this function will modify the input batch!

        .. math::
            G_t = \sum_{i=t}^T \gamma^{i-t}r_i

        where :math:`T` is the terminal time step, :math:`\gamma` is the
        discount factor, :math:`\gamma \in [0, 1]`.

        :param batch: a data batch which contains several episodes of data in
            sequential order. Mind that the end of each finished episode of batch
            should be marked by done flag, unfinished (or collecting) episodes will be
            recognized by buffer.unfinished_index().
        :param buffer: the corresponding replay buffer.
        :param numpy.ndarray indices: tell batch's location in buffer, batch is equal
            to buffer[indices].
        """
        v_s_ = np.full(indices.shape, self.ret_rms.mean)
        # gae_lambda = 1.0 means we use Monte Carlo estimate
        unnormalized_returns, _ = self.compute_episodic_return(
            batch,
            buffer,
            indices,
            v_s_=v_s_,
            gamma=self.gamma,
            gae_lambda=1.0,
        )
        # TODO: overridden in A2C, where mean is not subtracted. Subtracting mean
        #  can be very detrimental! It also has no theoretical grounding.
        #  This should be addressed soon!
        if self.rew_norm:
            batch.returns = (unnormalized_returns - self.ret_rms.mean) / np.sqrt(
                self.ret_rms.var + self._eps,
            )
            self.ret_rms.update(unnormalized_returns)
        else:
            batch.returns = unnormalized_returns
        batch: BatchWithReturnsProtocol
        return batch

    def forward(
        self,
        batch: ObsBatchProtocol,
        state: dict | BatchProtocol | np.ndarray | None = None,
        **kwargs: Any,
    ) -> DistBatchProtocol:
        """Compute action over the given batch data by applying the actor.

        Will sample from the dist_fn, if appropriate.
        Returns a new object representing the processed batch data
        (contrary to other methods that modify the input batch inplace).

        .. seealso::

            Please refer to :meth:`~tianshou.policy.BasePolicy.forward` for
            more detailed explanation.
        """
        # TODO - ALGO: marked for algorithm refactoring
        # print(type(batch))
        # print('batch', batch)
        # print('batch.obs.obs', batch.obs.obs)
        # print('batch.obs.mask', batch.obs.mask)
        obs = batch.obs
        # TODO: this is convoluted! See also other places where this is done.
        obs_next = obs.obs if hasattr(obs, "obs") else obs
        # action_values_BA, hidden_BH = model(obs_next, state=state, info=batch.info)
        action_dist_input_BD, hidden_BH = self.actor(obs_next, state=state, info=batch.info)
        # in the case that self.action_type == "discrete", the dist should always be Categorical, and D=A
        # therefore action_dist_input_BD is equivalent to logits_BA
        # If discrete, dist_fn will typically map loc, scale to a distribution (usually a Gaussian)
        # the action_dist_input_BD in that case is a tuple of loc_B, scale_B and needs to be unpacked

        # print('action_dist_input_BD', action_dist_input_BD)
        # print('to_torch_as(batch.obs.mask, logits)', to_torch_as(obs.mask, action_dist_input_BD))
        if isinstance(action_dist_input_BD, tuple):

            # this is for (mu, sigma) from Normal distribution
            dist = self.dist_fn(*action_dist_input_BD)

        else:  # categorical distribution
        # mask: np.array with shape=(bsz, n_act), dtype=bool, True means available
        # logits: torch.Tensor with shape=(bsz, n_act), dtype=torch.float, range=0~1
            action_dist_input_BD = to_torch_as(batch.obs.mask, action_dist_input_BD)
        dist = self.dist_fn(action_dist_input_BD)
        if self.deterministic_eval and not self.training:
            act_B = dist.mode
        else:
            act_B = dist.sample()
        # act is of dimension BA in continuous case and of dimension B in discrete
        result = Batch(logits=action_dist_input_BD, act=act_B, state=hidden_BH, dist=dist)
        return cast(DistBatchProtocol, result)

    # TODO: why does mypy complain?
    def learn(  # type: ignore
        self,
        batch: BatchWithReturnsProtocol,
        batch_size: int | None,
        repeat: int,
        *args: Any,
        **kwargs: Any,
    ) -> TPGTrainingStats:
        losses = []
        split_batch_size = batch_size or -1
        for _ in range(repeat):
            for minibatch in batch.split(split_batch_size, merge_last=True):
                self.optim.zero_grad()
                result = self(minibatch)
                dist = result.dist
                act = to_torch_as(minibatch.act, result.act)
                ret = to_torch(minibatch.returns, torch.float, result.act.device)
                log_prob = dist.log_prob(act).reshape(len(ret), -1).transpose(0, 1)
                loss = -(log_prob * ret).mean()
                loss.backward()
                self.optim.step()
                losses.append(loss.item())

        loss_summary_stat = SequenceSummaryStats.from_sequence(losses)

        return PGTrainingStats(loss=loss_summary_stat)  # type: ignore[return-value]

### 🦆 Imports

In [49]:
from typing import Optional, Tuple
from tianshou.env.pettingzoo_env import PettingZooEnv
import torch

from tianshou.data import Collector, VectorReplayBuffer
from tianshou.policy import BasePolicy, MultiAgentPolicyManager, RandomPolicy
from tianshou.env import DummyVectorEnv
from tianshou.policy import BasePolicy, PPOPolicy
from tianshou.trainer import OnpolicyTrainer
from tianshou.utils.net.common import ActorCritic, Net
from tianshou.utils.net.discrete import Actor, Critic

device = "cuda" if torch.cuda.is_available() else "cpu"
NUM_ITERS = 400
PLAYS = {"bastaushy": 0, "qostaushy": 0}

### 🐥 Prepare functions

In [218]:
def _get_agents_ppo(
    agent_learn: Optional[BasePolicy] = None,
    agent_opponent: Optional[BasePolicy] = None,
    optim: Optional[torch.optim.Optimizer] = None,
) -> Tuple[BasePolicy, torch.optim.Optimizer, list]:
    env = _get_env()
    observation_space = (
        env.observation_space["observation"]
        if isinstance(env.observation_space, gymnasium.spaces.Dict)
        else env.observation_space
    )
    assert env.observation_space["observation"].shape is not None  # for mypy
    assert isinstance(env.action_space, gymnasium.spaces.Discrete)
    if agent_learn is None:
        # model
        net = Net(state_shape=observation_space.shape, hidden_sizes=[128, 256, 256, 128], device=device).to(device)
        actor = Actor(preprocess_net=net, action_shape=env.action_space.shape, device=device).to(device)
        critic = Critic(preprocess_net=net, device=device).to(device)
        actor_critic = ActorCritic(actor=actor, critic=critic)

        # optimizer of the actor and the critic
        if optim is None:
            optim = torch.optim.Adam(actor_critic.parameters(), lr=0.0003)
        agent_learn: PPOPolicy = PPOPolicy(
            actor=actor,
            critic=critic,
            optim=optim,
            dist_fn=torch.distributions.Categorical,
            action_space=env.action_space,
            deterministic_eval=True,
            action_scaling=False,
        )

    if agent_opponent is None:
        agent_opponent = RandomPolicy(action_space=env.action_space)

    agents = [agent_opponent, agent_learn]
    #agents = [agent_learn, agent_opponent]
    policy = MultiAgentPolicyManager(policies=agents, env=env)
    return policy, optim, env.agents


def _get_env(render_mode=None):
    """This function is needed to provide callables for DummyVectorEnv."""
    def env(render_mode=None):
        """
        The env function often wraps the environment in wrappers by default.
        You can find full documentation for these methods
        elsewhere in the developer documentation.
        """
        internal_render_mode = render_mode if render_mode != "ansi" else "human"
        env = raw_env(render_mode=internal_render_mode)
        # This wrapper is only for environments which print results to the terminal
        if render_mode == "ansi":
            env = wrappers.CaptureStdoutWrapper(env)
        # this wrapper helps error handling for discrete action spaces
        env = wrappers.AssertOutOfBoundsWrapper(env)
        # Provides a wide vareity of helpful user errors
        # Strongly recommended
        env = wrappers.OrderEnforcingWrapper(env)
        return env
    return PettingZooEnv(env(render_mode=render_mode))

### 🐸 Training code

In [68]:
# Before evaluate this cell run the cell with env
# ======== Step 1: Environment setup =========

train_envs = DummyVectorEnv([_get_env for _ in range(100)])
test_envs = DummyVectorEnv([_get_env for _ in range(100)])

# seed
seed = 77
np.random.seed(seed)
torch.manual_seed(seed)
train_envs.seed(seed)
test_envs.seed(seed)

# ======== Step 2: Agent setup =========
policy, optim, agents = _get_agents_ppo()

# ======== Step 3: Collector setup =========
train_collector = Collector(
    policy=policy,
    env=train_envs,
    buffer=VectorReplayBuffer(20_000, len(train_envs)),
)
test_collector = Collector(policy=policy, env=test_envs)

#======== Step 4: Callback functions setup =========

def save_best_fn(policy):
    model_save_path = os.path.join("models", "ppo", "policy_ppo_128x256x256x128.pth")
    os.makedirs(os.path.join("models", "ppo"), exist_ok=True)
    torch.save(policy.policies[agents[1]].state_dict(), model_save_path)

def stop_fn(mean_rewards):
    return mean_rewards >= 22000

def reward_metric(rews):
    return rews[:, 1]

#======== Step 5: Run the trainer =========
result = OnpolicyTrainer(
    policy=policy,
    train_collector=train_collector,
    test_collector=test_collector,
    max_epoch=100,
    step_per_epoch=50000,
    repeat_per_collect=10,
    episode_per_test=10,
    batch_size=256,
    step_per_collect=2000,
    reward_metric=reward_metric,
    stop_fn=stop_fn,
    save_best_fn=save_best_fn
).run()

# return result, policy.policies[agents[1]]
print(f"\n==========Result==========\n{result}")
print("\n(the trained policy can be accessed via policy.policies[agents[1]])")

Epoch #1: 50001it [00:21, 2352.92it/s, env_step=50000, gradient_step=200, len=192, n/ep=14, n/st=2000, rew=10251.79]                            


Epoch #1: test_reward: 5326.200000 ± 4119.836934, best_reward: 5572.000000 ± 3150.819322 in #0


Epoch #2: 50001it [00:23, 2165.75it/s, env_step=100000, gradient_step=400, len=163, n/ep=14, n/st=2000, rew=8485.25]                            


Epoch #2: test_reward: 4131.600000 ± 3282.587004, best_reward: 5572.000000 ± 3150.819322 in #0


Epoch #3: 50001it [00:22, 2220.85it/s, env_step=150000, gradient_step=600, len=144, n/ep=11, n/st=2000, rew=7083.59]                            


Epoch #3: test_reward: 5586.000000 ± 3230.172255, best_reward: 5586.000000 ± 3230.172255 in #3


Epoch #4: 50001it [00:20, 2468.04it/s, env_step=200000, gradient_step=800, len=141, n/ep=11, n/st=2000, rew=7083.77]                            


Epoch #4: test_reward: 4204.000000 ± 2408.019933, best_reward: 5586.000000 ± 3230.172255 in #3


Epoch #5: 50001it [00:19, 2551.85it/s, env_step=250000, gradient_step=1000, len=137, n/ep=14, n/st=2000, rew=6835.57]                           


Epoch #5: test_reward: 5098.800000 ± 4635.544171, best_reward: 5586.000000 ± 3230.172255 in #3


Epoch #6: 50001it [00:19, 2510.60it/s, env_step=300000, gradient_step=1200, len=183, n/ep=16, n/st=2000, rew=9730.31]                           


Epoch #6: test_reward: 4358.200000 ± 3324.836652, best_reward: 5586.000000 ± 3230.172255 in #3


Epoch #7: 50001it [00:20, 2469.95it/s, env_step=350000, gradient_step=1400, len=183, n/ep=10, n/st=2000, rew=9454.10]                           


Epoch #7: test_reward: 5757.000000 ± 3577.310750, best_reward: 5757.000000 ± 3577.310750 in #7


Epoch #8: 50001it [00:20, 2414.89it/s, env_step=400000, gradient_step=1600, len=176, n/ep=13, n/st=2000, rew=8995.08]                           


Epoch #8: test_reward: 6098.200000 ± 4260.609670, best_reward: 6098.200000 ± 4260.609670 in #8


Epoch #9: 50001it [00:17, 2862.55it/s, env_step=450000, gradient_step=1800, len=156, n/ep=14, n/st=2000, rew=7924.71]                           


Epoch #9: test_reward: 4496.200000 ± 3081.464386, best_reward: 6098.200000 ± 4260.609670 in #8


Epoch #10: 50001it [00:19, 2569.37it/s, env_step=500000, gradient_step=2000, len=149, n/ep=11, n/st=2000, rew=7709.68]                          


Epoch #10: test_reward: 6602.500000 ± 4518.199359, best_reward: 6602.500000 ± 4518.199359 in #10


Epoch #11: 50001it [00:17, 2783.69it/s, env_step=550000, gradient_step=2200, len=180, n/ep=12, n/st=2000, rew=9105.67]                          


Epoch #11: test_reward: 3592.400000 ± 2351.290675, best_reward: 6602.500000 ± 4518.199359 in #10


Epoch #12: 50001it [00:19, 2629.72it/s, env_step=600000, gradient_step=2400, len=186, n/ep=17, n/st=2000, rew=9543.65]                          


Epoch #12: test_reward: 4511.200000 ± 2234.224734, best_reward: 6602.500000 ± 4518.199359 in #10


Epoch #13: 50001it [00:18, 2659.34it/s, env_step=650000, gradient_step=2600, len=152, n/ep=10, n/st=2000, rew=7499.35]                          


Epoch #13: test_reward: 2609.200000 ± 887.379152, best_reward: 6602.500000 ± 4518.199359 in #10


Epoch #14: 50001it [00:20, 2443.70it/s, env_step=700000, gradient_step=2800, len=171, n/ep=16, n/st=2000, rew=9055.66]                          


Epoch #14: test_reward: 4877.800000 ± 3549.715420, best_reward: 6602.500000 ± 4518.199359 in #10


Epoch #15: 50001it [00:19, 2546.54it/s, env_step=750000, gradient_step=3000, len=154, n/ep=17, n/st=2000, rew=7804.38]                          


Epoch #15: test_reward: 3852.800000 ± 2870.296737, best_reward: 6602.500000 ± 4518.199359 in #10


Epoch #16: 50001it [00:19, 2629.95it/s, env_step=800000, gradient_step=3200, len=170, n/ep=15, n/st=2000, rew=8896.10]                          


Epoch #16: test_reward: 4611.800000 ± 2678.213352, best_reward: 6602.500000 ± 4518.199359 in #10


Epoch #17: 50001it [00:18, 2727.96it/s, env_step=850000, gradient_step=3400, len=139, n/ep=11, n/st=2000, rew=6684.50]                          


Epoch #17: test_reward: 3369.800000 ± 1696.818541, best_reward: 6602.500000 ± 4518.199359 in #10


Epoch #18: 50001it [00:19, 2628.89it/s, env_step=900000, gradient_step=3600, len=116, n/ep=6, n/st=2000, rew=5218.00]                           


Epoch #18: test_reward: 4780.800000 ± 2903.210113, best_reward: 6602.500000 ± 4518.199359 in #10


Epoch #19: 50001it [00:19, 2602.10it/s, env_step=950000, gradient_step=3800, len=158, n/ep=8, n/st=2000, rew=8117.75]                           


Epoch #19: test_reward: 4059.800000 ± 1350.193897, best_reward: 6602.500000 ± 4518.199359 in #10


Epoch #20: 50001it [00:18, 2659.47it/s, env_step=1000000, gradient_step=4000, len=177, n/ep=16, n/st=2000, rew=9345.25]                         


Epoch #20: test_reward: 2745.200000 ± 2169.040931, best_reward: 6602.500000 ± 4518.199359 in #10


Epoch #21: 50001it [00:19, 2613.66it/s, env_step=1050000, gradient_step=4200, len=196, n/ep=11, n/st=2000, rew=10159.68]                        


Epoch #21: test_reward: 3393.200000 ± 1798.118839, best_reward: 6602.500000 ± 4518.199359 in #10


Epoch #22: 50001it [00:18, 2774.77it/s, env_step=1100000, gradient_step=4400, len=177, n/ep=12, n/st=2000, rew=8973.88]                         


Epoch #22: test_reward: 3774.800000 ± 1912.079747, best_reward: 6602.500000 ± 4518.199359 in #10


Epoch #23: 50001it [00:19, 2590.21it/s, env_step=1150000, gradient_step=4600, len=142, n/ep=9, n/st=2000, rew=6789.22]                          


Epoch #23: test_reward: 5076.400000 ± 3637.774023, best_reward: 6602.500000 ± 4518.199359 in #10


Epoch #24: 50001it [00:19, 2593.75it/s, env_step=1200000, gradient_step=4800, len=150, n/ep=13, n/st=2000, rew=7813.50]                         


Epoch #24: test_reward: 3706.200000 ± 1130.051486, best_reward: 6602.500000 ± 4518.199359 in #10


Epoch #25: 50001it [00:18, 2770.69it/s, env_step=1250000, gradient_step=5000, len=169, n/ep=15, n/st=2000, rew=8860.13]                         


Epoch #25: test_reward: 3112.800000 ± 2793.697936, best_reward: 6602.500000 ± 4518.199359 in #10


Epoch #26: 50001it [00:19, 2585.72it/s, env_step=1300000, gradient_step=5200, len=159, n/ep=14, n/st=2000, rew=8011.89]                         


Epoch #26: test_reward: 3768.200000 ± 1888.206228, best_reward: 6602.500000 ± 4518.199359 in #10


Epoch #27: 50001it [00:19, 2592.54it/s, env_step=1350000, gradient_step=5400, len=193, n/ep=9, n/st=2000, rew=10182.78]                         


Epoch #27: test_reward: 3174.800000 ± 1242.355247, best_reward: 6602.500000 ± 4518.199359 in #10


Epoch #28: 50001it [00:18, 2699.85it/s, env_step=1400000, gradient_step=5600, len=201, n/ep=11, n/st=2000, rew=10718.73]                        


Epoch #28: test_reward: 4321.200000 ± 2832.275156, best_reward: 6602.500000 ± 4518.199359 in #10


Epoch #29: 50001it [00:18, 2701.37it/s, env_step=1450000, gradient_step=5800, len=130, n/ep=14, n/st=2000, rew=6549.61]                         


Epoch #29: test_reward: 4474.800000 ± 4186.136854, best_reward: 6602.500000 ± 4518.199359 in #10


Epoch #30: 50001it [00:19, 2584.78it/s, env_step=1500000, gradient_step=6000, len=189, n/ep=14, n/st=2000, rew=9697.04]                         


Epoch #30: test_reward: 6022.200000 ± 3815.354605, best_reward: 6602.500000 ± 4518.199359 in #10


Epoch #31: 50001it [00:19, 2501.03it/s, env_step=1550000, gradient_step=6200, len=164, n/ep=13, n/st=2000, rew=8628.27]                         


Epoch #31: test_reward: 4164.200000 ± 2425.888448, best_reward: 6602.500000 ± 4518.199359 in #10


Epoch #32: 50001it [00:19, 2525.01it/s, env_step=1600000, gradient_step=6400, len=148, n/ep=10, n/st=2000, rew=7309.15]                         


Epoch #32: test_reward: 4856.000000 ± 2788.553890, best_reward: 6602.500000 ± 4518.199359 in #10


Epoch #33: 50001it [00:20, 2477.69it/s, env_step=1650000, gradient_step=6600, len=143, n/ep=11, n/st=2000, rew=7114.50]                         


Epoch #33: test_reward: 5252.000000 ± 2685.595800, best_reward: 6602.500000 ± 4518.199359 in #10


Epoch #34: 50001it [00:19, 2591.54it/s, env_step=1700000, gradient_step=6800, len=166, n/ep=14, n/st=2000, rew=8756.82]                         


Epoch #34: test_reward: 6460.600000 ± 3876.872662, best_reward: 6602.500000 ± 4518.199359 in #10


Epoch #35: 50001it [00:20, 2395.63it/s, env_step=1750000, gradient_step=7000, len=169, n/ep=17, n/st=2000, rew=8533.94]                         


Epoch #35: test_reward: 3292.000000 ± 1212.639105, best_reward: 6602.500000 ± 4518.199359 in #10


Epoch #36: 50001it [00:20, 2469.25it/s, env_step=1800000, gradient_step=7200, len=128, n/ep=10, n/st=2000, rew=6587.90]                         


Epoch #36: test_reward: 4945.200000 ± 3049.082249, best_reward: 6602.500000 ± 4518.199359 in #10


Epoch #37: 50001it [00:19, 2629.92it/s, env_step=1850000, gradient_step=7400, len=189, n/ep=12, n/st=2000, rew=10433.12]                        


Epoch #37: test_reward: 4568.200000 ± 2871.068435, best_reward: 6602.500000 ± 4518.199359 in #10


Epoch #38: 50001it [00:18, 2670.91it/s, env_step=1900000, gradient_step=7600, len=148, n/ep=10, n/st=2000, rew=7115.85]                         


Epoch #38: test_reward: 5214.400000 ± 2458.259677, best_reward: 6602.500000 ± 4518.199359 in #10


Epoch #39: 50001it [00:19, 2576.33it/s, env_step=1950000, gradient_step=7800, len=133, n/ep=10, n/st=2000, rew=6608.25]                         


Epoch #39: test_reward: 5086.000000 ± 1937.245674, best_reward: 6602.500000 ± 4518.199359 in #10


Epoch #40: 50001it [00:20, 2429.73it/s, env_step=2000000, gradient_step=8000, len=167, n/ep=16, n/st=2000, rew=9151.50]                         


Epoch #40: test_reward: 3670.400000 ± 1516.454760, best_reward: 6602.500000 ± 4518.199359 in #10


Epoch #41: 50001it [00:20, 2445.77it/s, env_step=2050000, gradient_step=8200, len=147, n/ep=17, n/st=2000, rew=7385.62]                         


Epoch #41: test_reward: 3598.600000 ± 2151.073230, best_reward: 6602.500000 ± 4518.199359 in #10


Epoch #42: 50001it [00:19, 2514.32it/s, env_step=2100000, gradient_step=8400, len=149, n/ep=9, n/st=2000, rew=7409.83]                          


Epoch #42: test_reward: 5013.200000 ± 3670.029667, best_reward: 6602.500000 ± 4518.199359 in #10


Epoch #43: 50001it [00:19, 2523.44it/s, env_step=2150000, gradient_step=8600, len=170, n/ep=12, n/st=2000, rew=8701.25]                         


Epoch #43: test_reward: 6328.000000 ± 4468.991743, best_reward: 6602.500000 ± 4518.199359 in #10


Epoch #44: 50001it [00:20, 2445.61it/s, env_step=2200000, gradient_step=8800, len=168, n/ep=12, n/st=2000, rew=8545.83]                         


Epoch #44: test_reward: 2319.000000 ± 2618.034263, best_reward: 6602.500000 ± 4518.199359 in #10


Epoch #45: 50001it [00:19, 2536.27it/s, env_step=2250000, gradient_step=9000, len=152, n/ep=16, n/st=2000, rew=7485.56]                         


Epoch #45: test_reward: 3732.400000 ± 2543.545211, best_reward: 6602.500000 ± 4518.199359 in #10


Epoch #46: 50001it [00:19, 2529.46it/s, env_step=2300000, gradient_step=9200, len=165, n/ep=12, n/st=2000, rew=8536.42]                         


Epoch #46: test_reward: 4376.400000 ± 2614.282586, best_reward: 6602.500000 ± 4518.199359 in #10


Epoch #47: 50001it [00:20, 2487.14it/s, env_step=2350000, gradient_step=9400, len=136, n/ep=14, n/st=2000, rew=6549.39]                         


Epoch #47: test_reward: 4850.800000 ± 2671.903322, best_reward: 6602.500000 ± 4518.199359 in #10


Epoch #48: 50001it [00:19, 2575.07it/s, env_step=2400000, gradient_step=9600, len=164, n/ep=17, n/st=2000, rew=7907.59]                         


Epoch #48: test_reward: 4584.800000 ± 3036.019657, best_reward: 6602.500000 ± 4518.199359 in #10


Epoch #49: 50001it [00:20, 2498.61it/s, env_step=2450000, gradient_step=9800, len=181, n/ep=14, n/st=2000, rew=9660.61]                         


Epoch #49: test_reward: 3338.400000 ± 1821.017035, best_reward: 6602.500000 ± 4518.199359 in #10


Epoch #50: 50001it [00:19, 2605.42it/s, env_step=2500000, gradient_step=10000, len=176, n/ep=11, n/st=2000, rew=9237.73]                        


Epoch #50: test_reward: 4193.800000 ± 2672.109347, best_reward: 6602.500000 ± 4518.199359 in #10


Epoch #51: 50001it [00:19, 2623.53it/s, env_step=2550000, gradient_step=10200, len=172, n/ep=14, n/st=2000, rew=8769.71]                        


Epoch #51: test_reward: 3710.400000 ± 2025.099859, best_reward: 6602.500000 ± 4518.199359 in #10


Epoch #52: 50001it [00:19, 2577.53it/s, env_step=2600000, gradient_step=10400, len=113, n/ep=12, n/st=2000, rew=5238.88]                        


Epoch #52: test_reward: 4558.000000 ± 2206.235527, best_reward: 6602.500000 ± 4518.199359 in #10


Epoch #53: 50001it [00:18, 2640.78it/s, env_step=2650000, gradient_step=10600, len=164, n/ep=14, n/st=2000, rew=8170.54]                        


Epoch #53: test_reward: 5211.200000 ± 1880.252792, best_reward: 6602.500000 ± 4518.199359 in #10


Epoch #54: 50001it [00:19, 2606.99it/s, env_step=2700000, gradient_step=10800, len=145, n/ep=8, n/st=2000, rew=6741.12]                         


Epoch #54: test_reward: 6237.200000 ± 4673.507479, best_reward: 6602.500000 ± 4518.199359 in #10


Epoch #55: 50001it [00:19, 2607.36it/s, env_step=2750000, gradient_step=11000, len=119, n/ep=5, n/st=2000, rew=5784.90]                         


Epoch #55: test_reward: 4586.200000 ± 2905.288757, best_reward: 6602.500000 ± 4518.199359 in #10


Epoch #56: 50001it [00:19, 2606.41it/s, env_step=2800000, gradient_step=11200, len=119, n/ep=10, n/st=2000, rew=5592.60]                        


Epoch #56: test_reward: 4481.400000 ± 1730.077004, best_reward: 6602.500000 ± 4518.199359 in #10


Epoch #57: 50001it [00:19, 2573.82it/s, env_step=2850000, gradient_step=11400, len=136, n/ep=13, n/st=2000, rew=6498.54]                        


Epoch #57: test_reward: 4687.400000 ± 2457.547729, best_reward: 6602.500000 ± 4518.199359 in #10


Epoch #58: 50001it [00:19, 2624.29it/s, env_step=2900000, gradient_step=11600, len=156, n/ep=9, n/st=2000, rew=7989.39]                         


Epoch #58: test_reward: 5284.000000 ± 2993.724904, best_reward: 6602.500000 ± 4518.199359 in #10


Epoch #59: 50001it [00:19, 2600.51it/s, env_step=2950000, gradient_step=11800, len=173, n/ep=12, n/st=2000, rew=8961.29]                        


Epoch #59: test_reward: 3549.400000 ± 1934.511422, best_reward: 6602.500000 ± 4518.199359 in #10


Epoch #60: 50001it [00:20, 2490.63it/s, env_step=3000000, gradient_step=12000, len=197, n/ep=15, n/st=2000, rew=10256.67]                       


Epoch #60: test_reward: 4800.800000 ± 3278.418546, best_reward: 6602.500000 ± 4518.199359 in #10


Epoch #61: 50001it [00:20, 2499.43it/s, env_step=3050000, gradient_step=12200, len=149, n/ep=10, n/st=2000, rew=7564.95]                        


Epoch #61: test_reward: 2620.000000 ± 1782.639167, best_reward: 6602.500000 ± 4518.199359 in #10


Epoch #62: 50001it [00:18, 2647.89it/s, env_step=3100000, gradient_step=12400, len=149, n/ep=15, n/st=2000, rew=7022.90]                        


Epoch #62: test_reward: 3423.600000 ± 2365.344592, best_reward: 6602.500000 ± 4518.199359 in #10


Epoch #63: 50001it [00:20, 2432.47it/s, env_step=3150000, gradient_step=12600, len=134, n/ep=13, n/st=2000, rew=6096.19]                        


Epoch #63: test_reward: 3952.000000 ± 1916.713646, best_reward: 6602.500000 ± 4518.199359 in #10


Epoch #64: 50001it [00:21, 2369.95it/s, env_step=3200000, gradient_step=12800, len=180, n/ep=11, n/st=2000, rew=9633.68]                        


Epoch #64: test_reward: 4388.600000 ± 2970.434318, best_reward: 6602.500000 ± 4518.199359 in #10


Epoch #65: 50001it [00:19, 2622.78it/s, env_step=3250000, gradient_step=13000, len=168, n/ep=15, n/st=2000, rew=8386.67]                        


Epoch #65: test_reward: 3751.000000 ± 3131.544316, best_reward: 6602.500000 ± 4518.199359 in #10


Epoch #66: 50001it [00:18, 2637.05it/s, env_step=3300000, gradient_step=13200, len=205, n/ep=14, n/st=2000, rew=10296.86]                       


Epoch #66: test_reward: 4132.200000 ± 3106.201726, best_reward: 6602.500000 ± 4518.199359 in #10


Epoch #67: 50001it [00:18, 2752.90it/s, env_step=3350000, gradient_step=13400, len=152, n/ep=9, n/st=2000, rew=6819.50]                         


Epoch #67: test_reward: 2992.200000 ± 2025.003002, best_reward: 6602.500000 ± 4518.199359 in #10


Epoch #68: 50001it [00:18, 2682.17it/s, env_step=3400000, gradient_step=13600, len=194, n/ep=9, n/st=2000, rew=10431.94]                        


Epoch #68: test_reward: 3881.600000 ± 1640.350889, best_reward: 6602.500000 ± 4518.199359 in #10


Epoch #69: 50001it [00:21, 2294.73it/s, env_step=3450000, gradient_step=13800, len=140, n/ep=11, n/st=2000, rew=6674.09]                        


Epoch #69: test_reward: 4849.200000 ± 2715.142457, best_reward: 6602.500000 ± 4518.199359 in #10


Epoch #70: 50001it [00:19, 2618.13it/s, env_step=3500000, gradient_step=14000, len=147, n/ep=9, n/st=2000, rew=6839.06]                         


Epoch #70: test_reward: 4185.800000 ± 2298.116176, best_reward: 6602.500000 ± 4518.199359 in #10


Epoch #71: 50001it [00:19, 2547.97it/s, env_step=3550000, gradient_step=14200, len=181, n/ep=15, n/st=2000, rew=9245.47]                        


Epoch #71: test_reward: 6512.000000 ± 2964.698703, best_reward: 6602.500000 ± 4518.199359 in #10


Epoch #72: 50001it [00:20, 2481.44it/s, env_step=3600000, gradient_step=14400, len=165, n/ep=16, n/st=2000, rew=8092.28]                        


Epoch #72: test_reward: 6213.700000 ± 4076.827395, best_reward: 6602.500000 ± 4518.199359 in #10


Epoch #73: 50001it [00:20, 2453.66it/s, env_step=3650000, gradient_step=14600, len=197, n/ep=8, n/st=2000, rew=10967.88]                        


Epoch #73: test_reward: 2853.400000 ± 1787.691931, best_reward: 6602.500000 ± 4518.199359 in #10


Epoch #74: 50001it [00:20, 2497.21it/s, env_step=3700000, gradient_step=14800, len=209, n/ep=9, n/st=2000, rew=11509.50]                        


Epoch #74: test_reward: 3363.600000 ± 2897.608780, best_reward: 6602.500000 ± 4518.199359 in #10


Epoch #75: 50001it [00:19, 2610.36it/s, env_step=3750000, gradient_step=15000, len=184, n/ep=12, n/st=2000, rew=9276.42]                        


Epoch #75: test_reward: 5571.600000 ± 3707.875597, best_reward: 6602.500000 ± 4518.199359 in #10


Epoch #76: 50001it [00:18, 2723.19it/s, env_step=3800000, gradient_step=15200, len=203, n/ep=5, n/st=2000, rew=10413.40]                        


Epoch #76: test_reward: 6105.000000 ± 3292.483834, best_reward: 6602.500000 ± 4518.199359 in #10


Epoch #77: 50001it [00:18, 2727.72it/s, env_step=3850000, gradient_step=15400, len=256, n/ep=7, n/st=2000, rew=13936.14]                        


Epoch #77: test_reward: 2995.000000 ± 2505.556745, best_reward: 6602.500000 ± 4518.199359 in #10


Epoch #78: 50001it [00:20, 2488.74it/s, env_step=3900000, gradient_step=15600, len=146, n/ep=6, n/st=2000, rew=6939.50]                         


Epoch #78: test_reward: 4142.400000 ± 2155.003814, best_reward: 6602.500000 ± 4518.199359 in #10


Epoch #79: 50001it [00:18, 2685.88it/s, env_step=3950000, gradient_step=15800, len=185, n/ep=14, n/st=2000, rew=9614.79]                        


Epoch #79: test_reward: 4051.800000 ± 2459.068189, best_reward: 6602.500000 ± 4518.199359 in #10


Epoch #80: 50001it [00:19, 2570.16it/s, env_step=4000000, gradient_step=16000, len=167, n/ep=11, n/st=2000, rew=8395.23]                        


Epoch #80: test_reward: 4231.600000 ± 2481.118425, best_reward: 6602.500000 ± 4518.199359 in #10


Epoch #81: 50001it [00:18, 2693.57it/s, env_step=4050000, gradient_step=16200, len=166, n/ep=7, n/st=2000, rew=7624.50]                         


Epoch #81: test_reward: 4549.000000 ± 3190.031881, best_reward: 6602.500000 ± 4518.199359 in #10


Epoch #82: 50001it [00:18, 2644.80it/s, env_step=4100000, gradient_step=16400, len=143, n/ep=14, n/st=2000, rew=7207.57]                        


Epoch #82: test_reward: 4128.800000 ± 1927.700848, best_reward: 6602.500000 ± 4518.199359 in #10


Epoch #83: 50001it [00:19, 2553.02it/s, env_step=4150000, gradient_step=16600, len=160, n/ep=13, n/st=2000, rew=8088.15]                        


Epoch #83: test_reward: 3168.400000 ± 2436.067454, best_reward: 6602.500000 ± 4518.199359 in #10


Epoch #84: 50001it [00:19, 2502.39it/s, env_step=4200000, gradient_step=16800, len=159, n/ep=17, n/st=2000, rew=7796.56]                        


Epoch #84: test_reward: 3946.400000 ± 2144.008358, best_reward: 6602.500000 ± 4518.199359 in #10


Epoch #85: 50001it [00:18, 2727.02it/s, env_step=4250000, gradient_step=17000, len=158, n/ep=14, n/st=2000, rew=7641.96]                        


Epoch #85: test_reward: 5063.400000 ± 3161.745474, best_reward: 6602.500000 ± 4518.199359 in #10


Epoch #86: 50001it [00:20, 2438.89it/s, env_step=4300000, gradient_step=17200, len=149, n/ep=11, n/st=2000, rew=7373.18]                        


Epoch #86: test_reward: 3849.000000 ± 1854.193140, best_reward: 6602.500000 ± 4518.199359 in #10


Epoch #87: 50001it [00:20, 2472.80it/s, env_step=4350000, gradient_step=17400, len=180, n/ep=13, n/st=2000, rew=8866.00]                        


Epoch #87: test_reward: 3398.000000 ± 2384.036409, best_reward: 6602.500000 ± 4518.199359 in #10


Epoch #88: 50001it [00:18, 2767.21it/s, env_step=4400000, gradient_step=17600, len=194, n/ep=18, n/st=2000, rew=10353.17]                       


Epoch #88: test_reward: 5092.400000 ± 4270.532782, best_reward: 6602.500000 ± 4518.199359 in #10


Epoch #89: 50001it [00:20, 2469.27it/s, env_step=4450000, gradient_step=17800, len=153, n/ep=10, n/st=2000, rew=7301.65]                        


Epoch #89: test_reward: 3865.200000 ± 2250.190783, best_reward: 6602.500000 ± 4518.199359 in #10


Epoch #90: 50001it [00:20, 2494.99it/s, env_step=4500000, gradient_step=18000, len=187, n/ep=10, n/st=2000, rew=10085.75]                       


Epoch #90: test_reward: 3568.000000 ± 2623.652873, best_reward: 6602.500000 ± 4518.199359 in #10


Epoch #91: 50001it [00:20, 2384.18it/s, env_step=4550000, gradient_step=18200, len=178, n/ep=14, n/st=2000, rew=9025.86]                        


Epoch #91: test_reward: 3832.400000 ± 1409.067436, best_reward: 6602.500000 ± 4518.199359 in #10


Epoch #92: 50001it [00:19, 2596.77it/s, env_step=4600000, gradient_step=18400, len=146, n/ep=16, n/st=2000, rew=6939.19]                        


Epoch #92: test_reward: 4641.000000 ± 2008.585821, best_reward: 6602.500000 ± 4518.199359 in #10


Epoch #93: 50001it [00:20, 2415.39it/s, env_step=4650000, gradient_step=18600, len=131, n/ep=10, n/st=2000, rew=6322.25]                        


Epoch #93: test_reward: 4366.200000 ± 3091.402588, best_reward: 6602.500000 ± 4518.199359 in #10


Epoch #94:  20%|#4     | 10000/50000 [00:04<00:19, 2047.28it/s, env_step=4660000, gradient_step=18640, len=147, n/ep=13, n/st=2000, rew=7074.42]


KeyboardInterrupt: 

### 🐃 Load ppo agents

In [165]:
agent1_learned_ppo_path = "models/policy_ppo_64x128x128x64.pth"
agent2_learned_ppo_path = "models/policy_ppo_128x256x256x128.pth"
env = _get_env()
net1 = Net(state_shape=(22,), hidden_sizes=[64, 128, 128, 64], device=device).to(device)
actor = Actor(preprocess_net=net1, action_shape=env.action_space.shape, device=device).to(device)
critic = Critic(preprocess_net=net1, device=device).to(device)
actor_critic = ActorCritic(actor=actor, critic=critic)
optim = torch.optim.Adam(actor_critic.parameters(), lr=0.0003)

agent1_learned_ppo = PPOPolicy(
    actor=actor,
    critic=critic,
    optim=optim,
    dist_fn=torch.distributions.Categorical,
    action_space=env.action_space,
    deterministic_eval=True,
    action_scaling=False,
)
agent1_learned_ppo.load_state_dict(torch.load(agent1_learned_ppo_path))
agents_learned.append(agent1_learned_ppo)


net2 = Net(state_shape=(22,), hidden_sizes=[128, 256, 256, 128], device=device).to(device)
actor = Actor(preprocess_net=net2, action_shape=env.action_space.shape, device=device).to(device)
critic = Critic(preprocess_net=net2, device=device).to(device)
actor_critic = ActorCritic(actor=actor, critic=critic)
optim = torch.optim.Adam(actor_critic.parameters(), lr=0.0003)

agent2_learned_ppo = PPOPolicy(
    actor=actor,
    critic=critic,
    optim=optim,
    dist_fn=torch.distributions.Categorical,
    action_space=env.action_space,
    deterministic_eval=True,
    action_scaling=False,
)
agent2_learned_ppo.load_state_dict(torch.load(agent2_learned_ppo_path))
agents_learned.append(agent2_learned_ppo)

### 🐟 Evaluate best Qostaushy agent with different policies

In [59]:
PLAYS = {"bastaushy": 0, "qostaushy": 0}
# Step 1: Load the PettingZoo environment
env = env()#render_mode="human")

# Step 2: Wrap the environment for Tianshou interfacing
env = PettingZooEnv(env)

# # Step 3: Define policies for each agent
policies = MultiAgentPolicyManager(policies=[agents_learned[3], agents_learned[4]], env=env)
# # Step 4: Convert the env to vector format
env = DummyVectorEnv([lambda: env])

# # Step 5: Construct the Collector, which interfaces the policies with the vectorised environment
collector = Collector(policies, env)

# # Step 6: Execute the environment with the agents playing for 1 episode, and render a frame every 2 seconds
result = collector.collect(n_episode=1, reset_before_collect=True)
print(PLAYS)

{'bastaushy': 0, 'qostaushy': 1}


### 𓃰 Play results

- *agent4_learned* vs *agent1_learned_ppo*: {'bastaushy': 1, 'qostaushy': 0}

- *agent1_learned* vs *agent1_learned_ppo*: {'bastaushy': 0, 'qostaushy': 1}

- *agent1_learned_ppo* vs *agent1_learned*: {'bastaushy': 1, 'qostaushy': 0}

- *agent2_learned* vs *agent1_learned_ppo*: {'bastaushy': 0, 'qostaushy': 1}

- *agent1_learned_ppo* vs *agent2_learned*: {'bastaushy': 0, 'qostaushy': 1}

- *agent3_learned* vs *agent1_learned_ppo*: {'bastaushy': 0, 'qostaushy': 0}

- *agent1_learned_ppo* vs *agent3_learned*: {'bastaushy': 1, 'qostaushy': 0}

# 🐎 Self-play training

### 🦘 Imports

In [80]:
import random

### 🦚 Method train agent

In [149]:
def train_agent_ppo(i, agent, seed):
    # Before evaluate this cell run the cell with env
    # ======== Step 1: Environment setup =========
    
    train_envs = DummyVectorEnv([_get_env for _ in range(100)])
    test_envs = DummyVectorEnv([_get_env for _ in range(100)])
    
    # seed
    np.random.seed(seed)
    torch.manual_seed(seed)
    train_envs.seed(seed)
    test_envs.seed(seed)
    
    # ======== Step 2: Agent setup =========
    policy, optim, agents = _get_agents_ppo(agent_opponent=agent)
    
    # ======== Step 3: Collector setup =========
    train_collector = Collector(
        policy=policy,
        env=train_envs,
        buffer=VectorReplayBuffer(20_000, len(train_envs)),
    )
    test_collector = Collector(policy=policy, env=test_envs)
    
    #======== Step 4: Callback functions setup =========
    
    def save_best_fn(policy):
        model_save_path = os.path.join("models", f'policy_ppo_128x256x256x128_{i}.pth')
        os.makedirs(os.path.join("models"), exist_ok=True)
        torch.save(policy.policies[agents[1]].state_dict(), model_save_path)
    
    def stop_fn(mean_rewards):
        return mean_rewards >= 22000
    
    def reward_metric(rews):
        return rews[:, 1]
    
    #======== Step 5: Run the trainer =========
    result = OnpolicyTrainer(
        policy=policy,
        train_collector=train_collector,
        test_collector=test_collector,
        max_epoch=50,
        step_per_epoch=50000,
        repeat_per_collect=10,
        episode_per_test=10,
        batch_size=256,
        step_per_collect=2000,
        reward_metric=reward_metric,
        stop_fn=stop_fn,
        save_best_fn=save_best_fn
    ).run()
    
    # return result, policy.policies[agents[1]]
    print(f"\n==========Result==========\n{result}")
    print("\n(the trained policy can be accessed via policy.policies[agents[1]])")
    return policy.policies[agents[1]]

In [213]:
def train_agent_dqn(i, agent, seed):
    # Before evaluate this cell run the cell with env
    # ======== Step 1: Environment setup =========
    train_envs = DummyVectorEnv([_get_env for _ in range(100)])
    test_envs = DummyVectorEnv([_get_env for _ in range(100)])
    
    # seed
    np.random.seed(seed)
    torch.manual_seed(seed)
    train_envs.seed(seed)
    test_envs.seed(seed)
    
    # ======== Step 2: Agent setup =========
    policy, optim, agents = _get_agents_dqn(agent_opponent=agent)
    
    # # ======== Step 3: Collector setup =========
    train_collector = Collector(
        policy,
        train_envs,
        VectorReplayBuffer(20_000, len(train_envs)),
        exploration_noise=True,
    )
    test_collector = Collector(policy, test_envs, exploration_noise=True)
    # policy.set_eps(1)
    
    # ======== Step 4: Callback functions setup =========
    def save_best_fn(policy):
        model_save_path = os.path.join("models", f'policy_dqn_256x512x512x256_{i}.pth')
        os.makedirs(os.path.join("models"), exist_ok=True)
        torch.save(policy.policies[agents[1]].state_dict(), model_save_path)
    
    def stop_fn(mean_rewards):
        return mean_rewards >= 22000
    
    def train_fn(epoch, env_step):
        policy.policies[agents[1]].set_eps(0.1)
    
    def test_fn(epoch, env_step):
        policy.policies[agents[1]].set_eps(0.05)
    
    def reward_metric(rews):
        return rews[:, 1]
    
    # ======== Step 5: Run the trainer =========
    result = OffpolicyTrainer(
        policy=policy,
        train_collector=train_collector,
        test_collector=test_collector,
        max_epoch=150,
        step_per_epoch=1000,
        step_per_collect=50,
        episode_per_test=10,
        batch_size=256,
        train_fn=train_fn,
        test_fn=test_fn,
        stop_fn=stop_fn,
        save_best_fn=save_best_fn,
        update_per_step=0.1,
        test_in_train=False,
        reward_metric=reward_metric,
        verbose=True
    ).run()
    
    # return result, policy.policies[agents[1]]
    print(f"\n==========Result==========\n{result}")
    print("\n(the trained policy can be accessed via policy.policies[agents[1]])")
    return policy.policies[agents[1]]

### Self-play

In [214]:
models = agents_learned
for i in range(21):
    # random select opponent
    directory = "models"

    # for path, folders, files in os.walk(directory):
    #     for filename in files:
    #         models.append(os.path.join(directory, filename))
    # agent_opponent_random = random.choice(range(len(models)))
    agent_opponent_random = random.choice(models)
    idx = models.index(agent_opponent_random)
    models.remove(agent_opponent_random)
    print(f"Train with agent_opponent #{idx}")

    # training
    seed = random.choice(range(1000))
    agent_learned = train_agent_dqn(i, agent_opponent_random, seed)
    
    #add new learned agent
    agents_learned.append(agent_learned)
    models.append(agent_learned)

Train with agent_opponent #3


Epoch #1: 1001it [00:02, 482.25it/s, env_step=1000, gradient_step=100, len=0, n/ep=0, n/st=100, rew=0.00]                                                                                         


Epoch #1: test_reward: 11991.800000 ± 3880.892315, best_reward: 13439.600000 ± 2821.158315 in #0


Epoch #2: 1001it [00:02, 461.83it/s, env_step=2000, gradient_step=200, len=0, n/ep=0, n/st=100, rew=0.00]                                                                                         


Epoch #2: test_reward: 10399.400000 ± 5205.098735, best_reward: 13439.600000 ± 2821.158315 in #0


Epoch #3: 1001it [00:01, 570.99it/s, env_step=3000, gradient_step=300, len=0, n/ep=0, n/st=100, rew=0.00]                                                                                         


Epoch #3: test_reward: 13740.200000 ± 4190.998230, best_reward: 13740.200000 ± 4190.998230 in #3


Epoch #4: 1001it [00:01, 504.13it/s, env_step=4000, gradient_step=400, len=0, n/ep=0, n/st=100, rew=0.00]                                                                                         


Epoch #4: test_reward: 7657.100000 ± 2568.765791, best_reward: 13740.200000 ± 4190.998230 in #3


Epoch #5: 1001it [00:01, 747.35it/s, env_step=5000, gradient_step=500, len=0, n/ep=0, n/st=100, rew=0.00]                                                                                         


Epoch #5: test_reward: 9299.200000 ± 4654.768067, best_reward: 13740.200000 ± 4190.998230 in #3


Epoch #6: 1001it [00:01, 674.72it/s, env_step=6000, gradient_step=600, len=0, n/ep=0, n/st=100, rew=0.00]                                                                                         


Epoch #6: test_reward: 13570.300000 ± 4098.814318, best_reward: 13740.200000 ± 4190.998230 in #3


Epoch #7: 1001it [00:01, 687.55it/s, env_step=7000, gradient_step=700, len=0, n/ep=0, n/st=100, rew=0.00]                                                                                         


Epoch #7: test_reward: 4643.900000 ± 4831.193361, best_reward: 13740.200000 ± 4190.998230 in #3


Epoch #8: 1001it [00:01, 672.16it/s, env_step=8000, gradient_step=800, len=0, n/ep=0, n/st=100, rew=0.00]                                                                                         


Epoch #8: test_reward: 10582.400000 ± 3242.438194, best_reward: 13740.200000 ± 4190.998230 in #3


Epoch #9: 1001it [00:01, 561.13it/s, env_step=9000, gradient_step=900, len=0, n/ep=0, n/st=100, rew=0.00]                                                                                         


Epoch #9: test_reward: 14848.000000 ± 6926.862955, best_reward: 14848.000000 ± 6926.862955 in #9


Epoch #10: 1001it [00:02, 476.02it/s, env_step=10000, gradient_step=1000, len=93, n/ep=0, n/st=100, rew=3251.00]                                                                                  


Epoch #10: test_reward: 12664.100000 ± 3308.619182, best_reward: 14848.000000 ± 6926.862955 in #9


Epoch #11: 1001it [00:01, 508.32it/s, env_step=11000, gradient_step=1100, len=93, n/ep=0, n/st=100, rew=3251.00]                                                                                  


Epoch #11: test_reward: 10903.200000 ± 5721.287491, best_reward: 14848.000000 ± 6926.862955 in #9


Epoch #12: 1001it [00:01, 570.96it/s, env_step=12000, gradient_step=1200, len=93, n/ep=0, n/st=100, rew=3251.00]                                                                                  


Epoch #12: test_reward: 8790.300000 ± 3606.731208, best_reward: 14848.000000 ± 6926.862955 in #9


Epoch #13: 1001it [00:01, 632.55it/s, env_step=13000, gradient_step=1300, len=121, n/ep=0, n/st=100, rew=5057.00]                                                                                 


Epoch #13: test_reward: 15100.800000 ± 4252.057121, best_reward: 15100.800000 ± 4252.057121 in #13


Epoch #14: 1001it [00:02, 499.14it/s, env_step=14000, gradient_step=1400, len=138, n/ep=0, n/st=100, rew=5101.50]                                                                                 


Epoch #14: test_reward: 9241.200000 ± 4526.050039, best_reward: 15100.800000 ± 4252.057121 in #13


Epoch #15: 1001it [00:01, 531.11it/s, env_step=15000, gradient_step=1500, len=148, n/ep=0, n/st=100, rew=5706.00]                                                                                 


Epoch #15: test_reward: 12257.800000 ± 4287.233066, best_reward: 15100.800000 ± 4252.057121 in #13


Epoch #16: 1001it [00:01, 527.75it/s, env_step=16000, gradient_step=1600, len=159, n/ep=0, n/st=100, rew=6257.00]                                                                                 


Epoch #16: test_reward: 12894.600000 ± 4876.681499, best_reward: 15100.800000 ± 4252.057121 in #13


Epoch #17: 1001it [00:02, 456.99it/s, env_step=17000, gradient_step=1700, len=170, n/ep=1, n/st=100, rew=8736.00]                                                                                 


Epoch #17: test_reward: 14117.600000 ± 5721.857779, best_reward: 15100.800000 ± 4252.057121 in #13


Epoch #18: 1001it [00:01, 534.31it/s, env_step=18000, gradient_step=1800, len=177, n/ep=0, n/st=100, rew=8703.00]                                                                                 


Epoch #18: test_reward: 15461.000000 ± 4998.661781, best_reward: 15461.000000 ± 4998.661781 in #18


Epoch #19: 1001it [00:01, 520.72it/s, env_step=19000, gradient_step=1900, len=96, n/ep=0, n/st=100, rew=3177.50]                                                                                  


Epoch #19: test_reward: 10814.200000 ± 2428.498705, best_reward: 15461.000000 ± 4998.661781 in #18


Epoch #20: 1001it [00:01, 579.38it/s, env_step=20000, gradient_step=2000, len=198, n/ep=0, n/st=100, rew=7400.00]                                                                                 


Epoch #20: test_reward: 15001.600000 ± 4041.887609, best_reward: 15461.000000 ± 4998.661781 in #18


Epoch #21: 1001it [00:02, 488.33it/s, env_step=21000, gradient_step=2100, len=209, n/ep=0, n/st=100, rew=8639.50]                                                                                 


Epoch #21: test_reward: 14644.800000 ± 2759.169940, best_reward: 15461.000000 ± 4998.661781 in #18


Epoch #22: 1001it [00:01, 547.06it/s, env_step=22000, gradient_step=2200, len=220, n/ep=1, n/st=100, rew=10318.00]                                                                                


Epoch #22: test_reward: 9187.600000 ± 3365.549084, best_reward: 15461.000000 ± 4998.661781 in #18


Epoch #23: 1001it [00:02, 451.03it/s, env_step=23000, gradient_step=2300, len=229, n/ep=0, n/st=100, rew=8389.00]                                                                                 


Epoch #23: test_reward: 13036.900000 ± 6297.948975, best_reward: 15461.000000 ± 4998.661781 in #18


Epoch #24: 1001it [00:01, 537.87it/s, env_step=24000, gradient_step=2400, len=85, n/ep=2, n/st=100, rew=2309.00]                                                                                  


Epoch #24: test_reward: 13201.200000 ± 4823.592785, best_reward: 15461.000000 ± 4998.661781 in #18


Epoch #25: 1001it [00:01, 626.51it/s, env_step=25000, gradient_step=2500, len=247, n/ep=0, n/st=100, rew=14498.00]                                                                                


Epoch #25: test_reward: 14262.300000 ± 4301.100837, best_reward: 15461.000000 ± 4998.661781 in #18


Epoch #26: 1001it [00:02, 490.66it/s, env_step=26000, gradient_step=2600, len=259, n/ep=0, n/st=100, rew=12251.00]                                                                                


Epoch #26: test_reward: 15293.200000 ± 3190.303584, best_reward: 15461.000000 ± 4998.661781 in #18


Epoch #27: 1001it [00:01, 505.97it/s, env_step=27000, gradient_step=2700, len=270, n/ep=2, n/st=100, rew=11381.00]                                                                                


Epoch #27: test_reward: 19959.100000 ± 3349.941117, best_reward: 19959.100000 ± 3349.941117 in #27


Epoch #28: 1001it [00:01, 532.54it/s, env_step=28000, gradient_step=2800, len=271, n/ep=0, n/st=100, rew=13320.00]                                                                                


Epoch #28: test_reward: 13868.200000 ± 5519.805899, best_reward: 19959.100000 ± 3349.941117 in #27


Epoch #29: 1001it [00:02, 464.09it/s, env_step=29000, gradient_step=2900, len=173, n/ep=0, n/st=100, rew=6857.00]                                                                                 


Epoch #29: test_reward: 16057.800000 ± 4239.952212, best_reward: 19959.100000 ± 3349.941117 in #27


Epoch #30: 1001it [00:01, 640.77it/s, env_step=30000, gradient_step=3000, len=136, n/ep=1, n/st=100, rew=4949.00]                                                                                 


Epoch #30: test_reward: 15287.800000 ± 5383.681079, best_reward: 19959.100000 ± 3349.941117 in #27


Epoch #31: 1001it [00:01, 512.39it/s, env_step=31000, gradient_step=3100, len=307, n/ep=0, n/st=100, rew=15123.00]                                                                                


Epoch #31: test_reward: 15247.300000 ± 5407.284606, best_reward: 19959.100000 ± 3349.941117 in #27


Epoch #32: 1001it [00:01, 623.10it/s, env_step=32000, gradient_step=3200, len=226, n/ep=2, n/st=100, rew=11142.75]                                                                                


Epoch #32: test_reward: 14933.900000 ± 3561.063899, best_reward: 19959.100000 ± 3349.941117 in #27


Epoch #33: 1001it [00:02, 432.03it/s, env_step=33000, gradient_step=3300, len=327, n/ep=0, n/st=100, rew=15259.00]                                                                                


Epoch #33: test_reward: 15214.600000 ± 6325.300186, best_reward: 19959.100000 ± 3349.941117 in #27


Epoch #34: 1001it [00:02, 467.75it/s, env_step=34000, gradient_step=3400, len=152, n/ep=0, n/st=100, rew=6244.00]                                                                                 


Epoch #34: test_reward: 16775.200000 ± 5338.642989, best_reward: 19959.100000 ± 3349.941117 in #27


Epoch #35: 1001it [00:02, 428.45it/s, env_step=35000, gradient_step=3500, len=349, n/ep=0, n/st=100, rew=17164.00]                                                                                


Epoch #35: test_reward: 9494.600000 ± 1581.586052, best_reward: 19959.100000 ± 3349.941117 in #27


Epoch #36: 1001it [00:01, 564.03it/s, env_step=36000, gradient_step=3600, len=181, n/ep=0, n/st=100, rew=9101.50]                                                                                 


Epoch #36: test_reward: 19057.700000 ± 6008.380898, best_reward: 19959.100000 ± 3349.941117 in #27


Epoch #37: 1001it [00:01, 599.67it/s, env_step=37000, gradient_step=3700, len=164, n/ep=0, n/st=100, rew=6613.50]                                                                                 


Epoch #37: test_reward: 14785.100000 ± 4828.347283, best_reward: 19959.100000 ± 3349.941117 in #27


Epoch #38: 1001it [00:01, 563.03it/s, env_step=38000, gradient_step=3800, len=183, n/ep=0, n/st=100, rew=9253.17]                                                                                 


Epoch #38: test_reward: 7118.000000 ± 3354.415955, best_reward: 19959.100000 ± 3349.941117 in #27


Epoch #39: 1001it [00:01, 539.09it/s, env_step=39000, gradient_step=3900, len=204, n/ep=0, n/st=100, rew=7945.00]                                                                                 


Epoch #39: test_reward: 9478.400000 ± 3809.095305, best_reward: 19959.100000 ± 3349.941117 in #27


Epoch #40: 1001it [00:01, 588.08it/s, env_step=40000, gradient_step=4000, len=400, n/ep=2, n/st=100, rew=19187.50]                                                                                


Epoch #40: test_reward: 15014.800000 ± 3617.534984, best_reward: 19959.100000 ± 3349.941117 in #27


Epoch #41: 1001it [00:01, 648.24it/s, env_step=41000, gradient_step=4100, len=184, n/ep=0, n/st=100, rew=9545.50]                                                                                 


Epoch #41: test_reward: 8722.600000 ± 2466.915815, best_reward: 19959.100000 ± 3349.941117 in #27


Epoch #42: 1001it [00:01, 508.34it/s, env_step=42000, gradient_step=4200, len=174, n/ep=0, n/st=100, rew=6459.00]                                                                                 


Epoch #42: test_reward: 9250.000000 ± 3012.423244, best_reward: 19959.100000 ± 3349.941117 in #27


Epoch #43: 1001it [00:01, 598.11it/s, env_step=43000, gradient_step=4300, len=222, n/ep=1, n/st=100, rew=10276.00]                                                                                


Epoch #43: test_reward: 9971.500000 ± 5028.643659, best_reward: 19959.100000 ± 3349.941117 in #27


Epoch #44: 1001it [00:01, 595.39it/s, env_step=44000, gradient_step=4400, len=198, n/ep=2, n/st=100, rew=10472.25]                                                                                


Epoch #44: test_reward: 15112.400000 ± 5026.252584, best_reward: 19959.100000 ± 3349.941117 in #27


Epoch #45: 1001it [00:01, 615.20it/s, env_step=45000, gradient_step=4500, len=197, n/ep=0, n/st=100, rew=8539.00]                                                                                 


Epoch #45: test_reward: 7718.800000 ± 2262.864768, best_reward: 19959.100000 ± 3349.941117 in #27


Epoch #46: 1001it [00:02, 491.51it/s, env_step=46000, gradient_step=4600, len=189, n/ep=0, n/st=100, rew=10572.00]                                                                                


Epoch #46: test_reward: 8399.700000 ± 1104.332020, best_reward: 19959.100000 ± 3349.941117 in #27


Epoch #47: 1001it [00:02, 470.88it/s, env_step=47000, gradient_step=4700, len=246, n/ep=0, n/st=100, rew=11231.50]                                                                                


Epoch #47: test_reward: 10576.000000 ± 5952.629150, best_reward: 19959.100000 ± 3349.941117 in #27


Epoch #48: 1001it [00:01, 541.85it/s, env_step=48000, gradient_step=4800, len=184, n/ep=2, n/st=100, rew=8151.50]                                                                                 


Epoch #48: test_reward: 13767.100000 ± 3418.943944, best_reward: 19959.100000 ± 3349.941117 in #27


Epoch #49: 1001it [00:01, 534.03it/s, env_step=49000, gradient_step=4900, len=319, n/ep=1, n/st=100, rew=16021.00]                                                                                


Epoch #49: test_reward: 16578.400000 ± 4583.226444, best_reward: 19959.100000 ± 3349.941117 in #27


Epoch #50: 1001it [00:01, 546.78it/s, env_step=50000, gradient_step=5000, len=294, n/ep=1, n/st=100, rew=14769.50]                                                                                


Epoch #50: test_reward: 13443.500000 ± 5955.050768, best_reward: 19959.100000 ± 3349.941117 in #27


Epoch #51: 1001it [00:02, 473.48it/s, env_step=51000, gradient_step=5100, len=183, n/ep=0, n/st=100, rew=8432.00]                                                                                 


Epoch #51: test_reward: 10956.400000 ± 4409.839820, best_reward: 19959.100000 ± 3349.941117 in #27


Epoch #52: 1001it [00:01, 600.90it/s, env_step=52000, gradient_step=5200, len=329, n/ep=1, n/st=100, rew=15256.00]                                                                                


Epoch #52: test_reward: 17250.000000 ± 5932.278753, best_reward: 19959.100000 ± 3349.941117 in #27


Epoch #53: 1001it [00:01, 594.38it/s, env_step=53000, gradient_step=5300, len=258, n/ep=0, n/st=100, rew=13623.00]                                                                                


Epoch #53: test_reward: 11092.500000 ± 3646.003216, best_reward: 19959.100000 ± 3349.941117 in #27


Epoch #54: 1001it [00:01, 500.52it/s, env_step=54000, gradient_step=5400, len=273, n/ep=1, n/st=100, rew=12230.00]                                                                                


Epoch #54: test_reward: 13779.800000 ± 5830.913373, best_reward: 19959.100000 ± 3349.941117 in #27


Epoch #55: 1001it [00:02, 459.05it/s, env_step=55000, gradient_step=5500, len=277, n/ep=0, n/st=100, rew=14485.50]                                                                                


Epoch #55: test_reward: 11614.200000 ± 4025.531165, best_reward: 19959.100000 ± 3349.941117 in #27


Epoch #56: 1001it [00:02, 473.05it/s, env_step=56000, gradient_step=5600, len=339, n/ep=0, n/st=100, rew=17515.50]                                                                                


Epoch #56: test_reward: 8974.200000 ± 3931.838496, best_reward: 19959.100000 ± 3349.941117 in #27


Epoch #57: 1001it [00:01, 577.49it/s, env_step=57000, gradient_step=5700, len=235, n/ep=0, n/st=100, rew=9546.00]                                                                                 


Epoch #57: test_reward: 10676.000000 ± 2727.916861, best_reward: 19959.100000 ± 3349.941117 in #27


Epoch #58: 1001it [00:01, 558.78it/s, env_step=58000, gradient_step=5800, len=161, n/ep=0, n/st=100, rew=5793.00]                                                                                 


Epoch #58: test_reward: 14956.300000 ± 5029.584636, best_reward: 19959.100000 ± 3349.941117 in #27


Epoch #59: 1001it [00:01, 540.92it/s, env_step=59000, gradient_step=5900, len=126, n/ep=0, n/st=100, rew=5534.00]                                                                                 


Epoch #59: test_reward: 13472.200000 ± 2526.946054, best_reward: 19959.100000 ± 3349.941117 in #27


Epoch #60: 1001it [00:01, 621.72it/s, env_step=60000, gradient_step=6000, len=257, n/ep=0, n/st=100, rew=13175.00]                                                                                


Epoch #60: test_reward: 10774.200000 ± 2486.575227, best_reward: 19959.100000 ± 3349.941117 in #27


Epoch #61: 1001it [00:01, 532.85it/s, env_step=61000, gradient_step=6100, len=137, n/ep=0, n/st=100, rew=5459.00]                                                                                 


Epoch #61: test_reward: 14607.600000 ± 3396.375191, best_reward: 19959.100000 ± 3349.941117 in #27


Epoch #62: 1001it [00:01, 526.40it/s, env_step=62000, gradient_step=6200, len=132, n/ep=0, n/st=100, rew=6116.00]                                                                                 


Epoch #62: test_reward: 19082.200000 ± 6853.949078, best_reward: 19959.100000 ± 3349.941117 in #27


Epoch #63: 1001it [00:02, 411.84it/s, env_step=63000, gradient_step=6300, len=132, n/ep=0, n/st=100, rew=6116.00]                                                                                 


Epoch #63: test_reward: 13734.000000 ± 7309.877537, best_reward: 19959.100000 ± 3349.941117 in #27


Epoch #64: 1001it [00:01, 510.46it/s, env_step=64000, gradient_step=6400, len=400, n/ep=1, n/st=100, rew=21237.00]                                                                                


Epoch #64: test_reward: 19111.800000 ± 8818.116282, best_reward: 19959.100000 ± 3349.941117 in #27


Epoch #65: 1001it [00:01, 619.75it/s, env_step=65000, gradient_step=6500, len=225, n/ep=1, n/st=100, rew=11040.00]                                                                                


Epoch #65: test_reward: 11866.300000 ± 5477.134288, best_reward: 19959.100000 ± 3349.941117 in #27


Epoch #66: 1001it [00:01, 504.18it/s, env_step=66000, gradient_step=6600, len=195, n/ep=0, n/st=100, rew=9839.50]                                                                                 


Epoch #66: test_reward: 15476.600000 ± 3715.705053, best_reward: 19959.100000 ± 3349.941117 in #27


Epoch #67: 1001it [00:02, 472.70it/s, env_step=67000, gradient_step=6700, len=324, n/ep=2, n/st=100, rew=15897.50]                                                                                


Epoch #67: test_reward: 13824.700000 ± 5676.263948, best_reward: 19959.100000 ± 3349.941117 in #27


Epoch #68: 1001it [00:01, 554.62it/s, env_step=68000, gradient_step=6800, len=191, n/ep=1, n/st=100, rew=9453.50]                                                                                 


Epoch #68: test_reward: 10933.300000 ± 5739.236483, best_reward: 19959.100000 ± 3349.941117 in #27


Epoch #69: 1001it [00:01, 540.27it/s, env_step=69000, gradient_step=6900, len=212, n/ep=1, n/st=100, rew=9732.00]                                                                                 


Epoch #69: test_reward: 14074.200000 ± 6029.112071, best_reward: 19959.100000 ± 3349.941117 in #27


Epoch #70: 1001it [00:01, 596.35it/s, env_step=70000, gradient_step=7000, len=96, n/ep=1, n/st=100, rew=3754.00]                                                                                  


Epoch #70: test_reward: 14675.600000 ± 7621.390729, best_reward: 19959.100000 ± 3349.941117 in #27


Epoch #71: 1001it [00:01, 528.44it/s, env_step=71000, gradient_step=7100, len=138, n/ep=1, n/st=100, rew=5715.00]                                                                                 


Epoch #71: test_reward: 13676.800000 ± 5185.869644, best_reward: 19959.100000 ± 3349.941117 in #27


Epoch #72: 1001it [00:01, 541.93it/s, env_step=72000, gradient_step=7200, len=112, n/ep=0, n/st=100, rew=4121.00]                                                                                 


Epoch #72: test_reward: 14885.900000 ± 7281.257041, best_reward: 19959.100000 ± 3349.941117 in #27


Epoch #73: 1001it [00:02, 482.84it/s, env_step=73000, gradient_step=7300, len=185, n/ep=0, n/st=100, rew=9713.50]                                                                                 


Epoch #73: test_reward: 8893.200000 ± 3252.775701, best_reward: 19959.100000 ± 3349.941117 in #27


Epoch #74: 1001it [00:02, 465.94it/s, env_step=74000, gradient_step=7400, len=98, n/ep=1, n/st=100, rew=3511.00]                                                                                  


Epoch #74: test_reward: 15404.600000 ± 7033.109998, best_reward: 19959.100000 ± 3349.941117 in #27


Epoch #75: 1001it [00:02, 479.79it/s, env_step=75000, gradient_step=7500, len=110, n/ep=1, n/st=100, rew=4242.00]                                                                                 


Epoch #75: test_reward: 13150.000000 ± 8194.499326, best_reward: 19959.100000 ± 3349.941117 in #27


Epoch #76: 1001it [00:02, 495.03it/s, env_step=76000, gradient_step=7600, len=253, n/ep=0, n/st=100, rew=12588.00]                                                                                


Epoch #76: test_reward: 16519.400000 ± 6967.735532, best_reward: 19959.100000 ± 3349.941117 in #27


Epoch #77: 1001it [00:01, 617.20it/s, env_step=77000, gradient_step=7700, len=213, n/ep=1, n/st=100, rew=10326.00]                                                                                


Epoch #77: test_reward: 11357.800000 ± 4164.142524, best_reward: 19959.100000 ± 3349.941117 in #27


Epoch #78: 1001it [00:02, 466.56it/s, env_step=78000, gradient_step=7800, len=237, n/ep=0, n/st=100, rew=11618.00]                                                                                


Epoch #78: test_reward: 11579.700000 ± 4595.270526, best_reward: 19959.100000 ± 3349.941117 in #27


Epoch #79: 1001it [00:02, 409.99it/s, env_step=79000, gradient_step=7900, len=192, n/ep=0, n/st=100, rew=9392.88]                                                                                 


Epoch #79: test_reward: 14604.600000 ± 4488.970243, best_reward: 19959.100000 ± 3349.941117 in #27


Epoch #80: 1001it [00:01, 536.66it/s, env_step=80000, gradient_step=8000, len=187, n/ep=0, n/st=100, rew=9366.50]                                                                                 


Epoch #80: test_reward: 19073.400000 ± 5580.601477, best_reward: 19959.100000 ± 3349.941117 in #27


Epoch #81: 1001it [00:02, 455.00it/s, env_step=81000, gradient_step=8100, len=161, n/ep=0, n/st=100, rew=8343.00]                                                                                 


Epoch #81: test_reward: 13541.700000 ± 5505.528241, best_reward: 19959.100000 ± 3349.941117 in #27


Epoch #82: 1001it [00:01, 513.43it/s, env_step=82000, gradient_step=8200, len=184, n/ep=2, n/st=100, rew=9112.50]                                                                                 


Epoch #82: test_reward: 12756.400000 ± 6561.645330, best_reward: 19959.100000 ± 3349.941117 in #27


Epoch #83: 1001it [00:01, 504.88it/s, env_step=83000, gradient_step=8300, len=400, n/ep=1, n/st=100, rew=20274.00]                                                                                


Epoch #83: test_reward: 16701.800000 ± 4483.472154, best_reward: 19959.100000 ± 3349.941117 in #27


Epoch #84: 1001it [00:01, 527.76it/s, env_step=84000, gradient_step=8400, len=140, n/ep=0, n/st=100, rew=5270.00]                                                                                 


Epoch #84: test_reward: 10767.100000 ± 5923.954363, best_reward: 19959.100000 ± 3349.941117 in #27


Epoch #85: 1001it [00:01, 553.93it/s, env_step=85000, gradient_step=8500, len=89, n/ep=1, n/st=100, rew=3386.00]                                                                                  


Epoch #85: test_reward: 15001.600000 ± 6131.110686, best_reward: 19959.100000 ± 3349.941117 in #27


Epoch #86: 1001it [00:01, 531.72it/s, env_step=86000, gradient_step=8600, len=211, n/ep=0, n/st=100, rew=9712.00]                                                                                 


Epoch #86: test_reward: 10034.000000 ± 5836.350709, best_reward: 19959.100000 ± 3349.941117 in #27


Epoch #87: 1001it [00:01, 624.75it/s, env_step=87000, gradient_step=8700, len=124, n/ep=0, n/st=100, rew=4295.00]                                                                                 


Epoch #87: test_reward: 16291.300000 ± 6315.557648, best_reward: 19959.100000 ± 3349.941117 in #27


Epoch #88: 1001it [00:01, 579.73it/s, env_step=88000, gradient_step=8800, len=276, n/ep=0, n/st=100, rew=11669.00]                                                                                


Epoch #88: test_reward: 14241.100000 ± 4722.242909, best_reward: 19959.100000 ± 3349.941117 in #27


Epoch #89: 1001it [00:01, 610.52it/s, env_step=89000, gradient_step=8900, len=301, n/ep=0, n/st=100, rew=13546.75]                                                                                


Epoch #89: test_reward: 13621.400000 ± 4199.015675, best_reward: 19959.100000 ± 3349.941117 in #27


Epoch #90: 1001it [00:01, 514.05it/s, env_step=90000, gradient_step=9000, len=167, n/ep=2, n/st=100, rew=6301.00]                                                                                 


Epoch #90: test_reward: 14489.400000 ± 5673.393150, best_reward: 19959.100000 ± 3349.941117 in #27


Epoch #91: 1001it [00:02, 420.64it/s, env_step=91000, gradient_step=9100, len=110, n/ep=0, n/st=100, rew=4546.00]                                                                                 


Epoch #91: test_reward: 11196.600000 ± 5509.305731, best_reward: 19959.100000 ± 3349.941117 in #27


Epoch #92: 1001it [00:01, 560.87it/s, env_step=92000, gradient_step=9200, len=124, n/ep=1, n/st=100, rew=5909.50]                                                                                 


Epoch #92: test_reward: 11529.700000 ± 5115.815244, best_reward: 19959.100000 ± 3349.941117 in #27


Epoch #93: 1001it [00:01, 617.49it/s, env_step=93000, gradient_step=9300, len=121, n/ep=2, n/st=100, rew=4547.00]                                                                                 


Epoch #93: test_reward: 11343.700000 ± 4261.615188, best_reward: 19959.100000 ± 3349.941117 in #27


Epoch #94: 1001it [00:01, 569.64it/s, env_step=94000, gradient_step=9400, len=169, n/ep=2, n/st=100, rew=6597.00]                                                                                 


Epoch #94: test_reward: 10934.000000 ± 4276.067539, best_reward: 19959.100000 ± 3349.941117 in #27


Epoch #95: 1001it [00:01, 517.08it/s, env_step=95000, gradient_step=9500, len=257, n/ep=0, n/st=100, rew=10257.50]                                                                                


Epoch #95: test_reward: 16743.100000 ± 5016.987432, best_reward: 19959.100000 ± 3349.941117 in #27


Epoch #96: 1001it [00:02, 432.58it/s, env_step=96000, gradient_step=9600, len=168, n/ep=0, n/st=100, rew=6614.00]                                                                                 


Epoch #96: test_reward: 15723.300000 ± 5740.759114, best_reward: 19959.100000 ± 3349.941117 in #27


Epoch #97: 1001it [00:01, 503.71it/s, env_step=97000, gradient_step=9700, len=280, n/ep=0, n/st=100, rew=11976.75]                                                                                


Epoch #97: test_reward: 14940.700000 ± 4430.999211, best_reward: 19959.100000 ± 3349.941117 in #27


Epoch #98: 1001it [00:01, 583.26it/s, env_step=98000, gradient_step=9800, len=48, n/ep=0, n/st=100, rew=1308.00]                                                                                  


Epoch #98: test_reward: 14452.600000 ± 6124.974713, best_reward: 19959.100000 ± 3349.941117 in #27


Epoch #99: 1001it [00:02, 472.10it/s, env_step=99000, gradient_step=9900, len=151, n/ep=0, n/st=100, rew=4402.50]                                                                                 


Epoch #99: test_reward: 13429.400000 ± 4129.970973, best_reward: 19959.100000 ± 3349.941117 in #27


Epoch #100: 1001it [00:01, 612.71it/s, env_step=100000, gradient_step=10000, len=185, n/ep=2, n/st=100, rew=8211.25]                                                                              


Epoch #100: test_reward: 16970.200000 ± 4946.643282, best_reward: 19959.100000 ± 3349.941117 in #27


Epoch #101: 1001it [00:02, 415.80it/s, env_step=101000, gradient_step=10100, len=144, n/ep=0, n/st=100, rew=6366.50]                                                                              


Epoch #101: test_reward: 14278.600000 ± 3657.368595, best_reward: 19959.100000 ± 3349.941117 in #27


Epoch #102: 1001it [00:01, 560.58it/s, env_step=102000, gradient_step=10200, len=225, n/ep=0, n/st=100, rew=8662.50]                                                                              


Epoch #102: test_reward: 18307.100000 ± 2752.173848, best_reward: 19959.100000 ± 3349.941117 in #27


Epoch #103: 1001it [00:01, 534.75it/s, env_step=103000, gradient_step=10300, len=254, n/ep=0, n/st=100, rew=11768.00]                                                                             


Epoch #103: test_reward: 15178.200000 ± 5586.778353, best_reward: 19959.100000 ± 3349.941117 in #27


Epoch #104: 1001it [00:01, 538.65it/s, env_step=104000, gradient_step=10400, len=246, n/ep=0, n/st=100, rew=7775.00]                                                                              


Epoch #104: test_reward: 12943.100000 ± 4088.857260, best_reward: 19959.100000 ± 3349.941117 in #27


Epoch #105: 1001it [00:01, 607.83it/s, env_step=105000, gradient_step=10500, len=97, n/ep=0, n/st=100, rew=3360.00]                                                                               


Epoch #105: test_reward: 15084.500000 ± 5823.708496, best_reward: 19959.100000 ± 3349.941117 in #27


Epoch #106: 1001it [00:02, 438.05it/s, env_step=106000, gradient_step=10600, len=195, n/ep=2, n/st=100, rew=9756.50]                                                                              


Epoch #106: test_reward: 12873.900000 ± 4525.438685, best_reward: 19959.100000 ± 3349.941117 in #27


Epoch #107: 1001it [00:01, 614.48it/s, env_step=107000, gradient_step=10700, len=68, n/ep=0, n/st=100, rew=1930.00]                                                                               


Epoch #107: test_reward: 9473.500000 ± 2546.756182, best_reward: 19959.100000 ± 3349.941117 in #27


Epoch #108: 1001it [00:02, 429.88it/s, env_step=108000, gradient_step=10800, len=104, n/ep=0, n/st=100, rew=3110.50]                                                                              


Epoch #108: test_reward: 13387.400000 ± 5019.884226, best_reward: 19959.100000 ± 3349.941117 in #27


Epoch #109: 1001it [00:01, 565.22it/s, env_step=109000, gradient_step=10900, len=242, n/ep=1, n/st=100, rew=10573.00]                                                                             


Epoch #109: test_reward: 10508.500000 ± 4622.544737, best_reward: 19959.100000 ± 3349.941117 in #27


Epoch #110: 1001it [00:01, 520.92it/s, env_step=110000, gradient_step=11000, len=115, n/ep=2, n/st=100, rew=3479.25]                                                                              


Epoch #110: test_reward: 12192.600000 ± 5671.108608, best_reward: 19959.100000 ± 3349.941117 in #27


Epoch #111: 1001it [00:01, 575.35it/s, env_step=111000, gradient_step=11100, len=138, n/ep=0, n/st=100, rew=4980.00]                                                                              


Epoch #111: test_reward: 12327.200000 ± 4353.277588, best_reward: 19959.100000 ± 3349.941117 in #27


Epoch #112: 1001it [00:01, 525.64it/s, env_step=112000, gradient_step=11200, len=136, n/ep=1, n/st=100, rew=4691.00]                                                                              


Epoch #112: test_reward: 10805.600000 ± 5092.322048, best_reward: 19959.100000 ± 3349.941117 in #27


Epoch #113: 1001it [00:01, 513.24it/s, env_step=113000, gradient_step=11300, len=172, n/ep=0, n/st=100, rew=6997.50]                                                                              


Epoch #113: test_reward: 10579.100000 ± 3254.385056, best_reward: 19959.100000 ± 3349.941117 in #27


Epoch #114: 1001it [00:01, 605.37it/s, env_step=114000, gradient_step=11400, len=217, n/ep=1, n/st=100, rew=8216.50]                                                                              


Epoch #114: test_reward: 8721.800000 ± 4932.115445, best_reward: 19959.100000 ± 3349.941117 in #27


Epoch #115: 1001it [00:01, 525.25it/s, env_step=115000, gradient_step=11500, len=138, n/ep=1, n/st=100, rew=5208.00]                                                                              


Epoch #115: test_reward: 11934.400000 ± 3037.231476, best_reward: 19959.100000 ± 3349.941117 in #27


Epoch #116: 1001it [00:01, 576.13it/s, env_step=116000, gradient_step=11600, len=141, n/ep=0, n/st=100, rew=5477.50]                                                                              


Epoch #116: test_reward: 15168.200000 ± 4643.492776, best_reward: 19959.100000 ± 3349.941117 in #27


Epoch #117: 1001it [00:02, 437.36it/s, env_step=117000, gradient_step=11700, len=72, n/ep=0, n/st=100, rew=2429.00]                                                                               


Epoch #117: test_reward: 10468.400000 ± 4563.181504, best_reward: 19959.100000 ± 3349.941117 in #27


Epoch #118: 1001it [00:01, 526.16it/s, env_step=118000, gradient_step=11800, len=124, n/ep=0, n/st=100, rew=4521.00]                                                                              


Epoch #118: test_reward: 12207.600000 ± 4371.946093, best_reward: 19959.100000 ± 3349.941117 in #27


Epoch #119: 1001it [00:01, 581.18it/s, env_step=119000, gradient_step=11900, len=88, n/ep=1, n/st=100, rew=2663.00]                                                                               


Epoch #119: test_reward: 12453.000000 ± 5340.758691, best_reward: 19959.100000 ± 3349.941117 in #27


Epoch #120: 1001it [00:02, 430.74it/s, env_step=120000, gradient_step=12000, len=104, n/ep=1, n/st=100, rew=3718.00]                                                                              


Epoch #120: test_reward: 10983.000000 ± 6043.681031, best_reward: 19959.100000 ± 3349.941117 in #27


Epoch #121: 1001it [00:01, 606.79it/s, env_step=121000, gradient_step=12100, len=56, n/ep=0, n/st=100, rew=1855.00]                                                                               


Epoch #121: test_reward: 9872.600000 ± 5236.320544, best_reward: 19959.100000 ± 3349.941117 in #27


Epoch #122: 1001it [00:01, 570.42it/s, env_step=122000, gradient_step=12200, len=193, n/ep=4, n/st=100, rew=7729.00]                                                                              


Epoch #122: test_reward: 7001.000000 ± 3664.680859, best_reward: 19959.100000 ± 3349.941117 in #27


Epoch #123: 1001it [00:01, 586.46it/s, env_step=123000, gradient_step=12300, len=145, n/ep=0, n/st=100, rew=6141.00]                                                                              


Epoch #123: test_reward: 19911.400000 ± 4946.248118, best_reward: 19959.100000 ± 3349.941117 in #27


Epoch #124: 1001it [00:01, 526.11it/s, env_step=124000, gradient_step=12400, len=400, n/ep=0, n/st=100, rew=21552.50]                                                                             


Epoch #124: test_reward: 8526.700000 ± 3131.547415, best_reward: 19959.100000 ± 3349.941117 in #27


Epoch #125: 1001it [00:01, 517.20it/s, env_step=125000, gradient_step=12500, len=112, n/ep=0, n/st=100, rew=4108.00]                                                                              


Epoch #125: test_reward: 8759.800000 ± 4678.353916, best_reward: 19959.100000 ± 3349.941117 in #27


Epoch #126: 1001it [00:01, 544.16it/s, env_step=126000, gradient_step=12600, len=88, n/ep=0, n/st=100, rew=2940.00]                                                                               


Epoch #126: test_reward: 12385.600000 ± 8034.809807, best_reward: 19959.100000 ± 3349.941117 in #27


Epoch #127: 1001it [00:02, 457.48it/s, env_step=127000, gradient_step=12700, len=179, n/ep=1, n/st=100, rew=7366.00]                                                                              


Epoch #127: test_reward: 12509.200000 ± 3923.221758, best_reward: 19959.100000 ± 3349.941117 in #27


Epoch #128: 1001it [00:02, 462.11it/s, env_step=128000, gradient_step=12800, len=74, n/ep=1, n/st=100, rew=3087.00]                                                                               


Epoch #128: test_reward: 9582.200000 ± 5907.719049, best_reward: 19959.100000 ± 3349.941117 in #27


Epoch #129: 1001it [00:02, 426.14it/s, env_step=129000, gradient_step=12900, len=158, n/ep=1, n/st=100, rew=5689.00]                                                                              


Epoch #129: test_reward: 9171.100000 ± 4146.346957, best_reward: 19959.100000 ± 3349.941117 in #27


Epoch #130: 1001it [00:02, 438.63it/s, env_step=130000, gradient_step=13000, len=64, n/ep=1, n/st=100, rew=2116.00]                                                                               


Epoch #130: test_reward: 17316.700000 ± 5595.670720, best_reward: 19959.100000 ± 3349.941117 in #27


Epoch #131: 1001it [00:01, 574.42it/s, env_step=131000, gradient_step=13100, len=134, n/ep=1, n/st=100, rew=6200.00]                                                                              


Epoch #131: test_reward: 10584.700000 ± 6153.919305, best_reward: 19959.100000 ± 3349.941117 in #27


Epoch #132: 1001it [00:01, 611.75it/s, env_step=132000, gradient_step=13200, len=114, n/ep=1, n/st=100, rew=4459.00]                                                                              


Epoch #132: test_reward: 15190.000000 ± 3555.337649, best_reward: 19959.100000 ± 3349.941117 in #27


Epoch #133: 1001it [00:01, 547.85it/s, env_step=133000, gradient_step=13300, len=162, n/ep=2, n/st=100, rew=7076.50]                                                                              


Epoch #133: test_reward: 11226.900000 ± 4911.626624, best_reward: 19959.100000 ± 3349.941117 in #27


Epoch #134: 1001it [00:01, 618.15it/s, env_step=134000, gradient_step=13400, len=206, n/ep=0, n/st=100, rew=7936.00]                                                                              


Epoch #134: test_reward: 12928.500000 ± 5849.515591, best_reward: 19959.100000 ± 3349.941117 in #27


Epoch #135: 1001it [00:02, 495.14it/s, env_step=135000, gradient_step=13500, len=94, n/ep=0, n/st=100, rew=3469.00]                                                                               


Epoch #135: test_reward: 16570.800000 ± 4485.559292, best_reward: 19959.100000 ± 3349.941117 in #27


Epoch #136: 1001it [00:01, 559.16it/s, env_step=136000, gradient_step=13600, len=82, n/ep=1, n/st=100, rew=3167.00]                                                                               


Epoch #136: test_reward: 8917.900000 ± 2469.564352, best_reward: 19959.100000 ± 3349.941117 in #27


Epoch #137: 1001it [00:01, 589.80it/s, env_step=137000, gradient_step=13700, len=56, n/ep=1, n/st=100, rew=1818.00]                                                                               


Epoch #137: test_reward: 13646.100000 ± 4933.170308, best_reward: 19959.100000 ± 3349.941117 in #27


Epoch #138: 1001it [00:01, 520.41it/s, env_step=138000, gradient_step=13800, len=78, n/ep=0, n/st=100, rew=2742.00]                                                                               


Epoch #138: test_reward: 15343.000000 ± 4543.627626, best_reward: 19959.100000 ± 3349.941117 in #27


Epoch #139: 1001it [00:02, 481.90it/s, env_step=139000, gradient_step=13900, len=93, n/ep=0, n/st=100, rew=3299.00]                                                                               


Epoch #139: test_reward: 10134.600000 ± 3820.748623, best_reward: 19959.100000 ± 3349.941117 in #27


Epoch #140: 1001it [00:01, 506.82it/s, env_step=140000, gradient_step=14000, len=243, n/ep=0, n/st=100, rew=10917.50]                                                                             


Epoch #140: test_reward: 15328.500000 ± 5551.112452, best_reward: 19959.100000 ± 3349.941117 in #27


Epoch #141: 1001it [00:02, 472.46it/s, env_step=141000, gradient_step=14100, len=74, n/ep=0, n/st=100, rew=2399.00]                                                                               


Epoch #141: test_reward: 14791.100000 ± 4945.088603, best_reward: 19959.100000 ± 3349.941117 in #27


Epoch #142: 1001it [00:02, 488.39it/s, env_step=142000, gradient_step=14200, len=160, n/ep=2, n/st=100, rew=6219.00]                                                                              


Epoch #142: test_reward: 17127.200000 ± 6305.302289, best_reward: 19959.100000 ± 3349.941117 in #27


Epoch #143: 1001it [00:01, 583.00it/s, env_step=143000, gradient_step=14300, len=62, n/ep=1, n/st=100, rew=1988.00]                                                                               


Epoch #143: test_reward: 7311.500000 ± 1508.230437, best_reward: 19959.100000 ± 3349.941117 in #27


Epoch #144: 1001it [00:01, 615.97it/s, env_step=144000, gradient_step=14400, len=70, n/ep=0, n/st=100, rew=2608.00]                                                                               


Epoch #144: test_reward: 14197.200000 ± 2956.632977, best_reward: 19959.100000 ± 3349.941117 in #27


Epoch #145: 1001it [00:02, 466.25it/s, env_step=145000, gradient_step=14500, len=171, n/ep=2, n/st=100, rew=6244.00]                                                                              


Epoch #145: test_reward: 13721.300000 ± 8314.543873, best_reward: 19959.100000 ± 3349.941117 in #27


Epoch #146: 1001it [00:01, 510.75it/s, env_step=146000, gradient_step=14600, len=400, n/ep=1, n/st=100, rew=19259.50]                                                                             


Epoch #146: test_reward: 13230.400000 ± 4151.170225, best_reward: 19959.100000 ± 3349.941117 in #27


Epoch #147: 1001it [00:02, 454.43it/s, env_step=147000, gradient_step=14700, len=121, n/ep=2, n/st=100, rew=4788.50]                                                                              


Epoch #147: test_reward: 13939.600000 ± 4213.805197, best_reward: 19959.100000 ± 3349.941117 in #27


Epoch #148: 1001it [00:01, 557.53it/s, env_step=148000, gradient_step=14800, len=246, n/ep=0, n/st=100, rew=11270.00]                                                                             


Epoch #148: test_reward: 13949.400000 ± 6574.501170, best_reward: 19959.100000 ± 3349.941117 in #27


Epoch #149: 1001it [00:01, 578.71it/s, env_step=149000, gradient_step=14900, len=110, n/ep=0, n/st=100, rew=3716.00]                                                                              


Epoch #149: test_reward: 16377.200000 ± 6142.789282, best_reward: 19959.100000 ± 3349.941117 in #27


Epoch #150: 1001it [00:01, 502.69it/s, env_step=150000, gradient_step=15000, len=168, n/ep=0, n/st=100, rew=6636.00]                                                                              


Epoch #150: test_reward: 12072.500000 ± 5890.316787, best_reward: 19959.100000 ± 3349.941117 in #27

InfoStats(gradient_step=15000, best_reward=19959.1, best_reward_std=3349.941117392961, train_step=150000, train_episode=788, test_step=372072, test_episode=1510, timing=TimingStats(total_time=476.80609250068665, train_time=286.3942198753357, train_time_collect=52.30290460586548, train_time_update=228.35369300842285, test_time=190.41187262535095, update_speed=523.7535871544244))

(the trained policy can be accessed via policy.policies[agents[1]])
Train with agent_opponent #3


Epoch #1: 1001it [00:01, 554.28it/s, env_step=1000, gradient_step=100, len=0, n/ep=0, n/st=100, rew=0.00]                                                                                         


Epoch #1: test_reward: 10189.200000 ± 5261.116018, best_reward: 12248.000000 ± 3206.338628 in #0


Epoch #2: 1001it [00:02, 485.32it/s, env_step=2000, gradient_step=200, len=0, n/ep=0, n/st=100, rew=0.00]                                                                                         


Epoch #2: test_reward: 8678.900000 ± 3240.095599, best_reward: 12248.000000 ± 3206.338628 in #0


Epoch #3: 1001it [00:02, 397.31it/s, env_step=3000, gradient_step=300, len=0, n/ep=0, n/st=100, rew=0.00]                                                                                         


Epoch #3: test_reward: 7541.400000 ± 3670.881970, best_reward: 12248.000000 ± 3206.338628 in #0


Epoch #4: 1001it [00:02, 424.47it/s, env_step=4000, gradient_step=400, len=0, n/ep=0, n/st=100, rew=0.00]                                                                                         


Epoch #4: test_reward: 9628.100000 ± 3171.846984, best_reward: 12248.000000 ± 3206.338628 in #0


Epoch #5: 1001it [00:02, 468.01it/s, env_step=5000, gradient_step=500, len=0, n/ep=0, n/st=100, rew=0.00]                                                                                         


Epoch #5: test_reward: 9801.800000 ± 5314.469716, best_reward: 12248.000000 ± 3206.338628 in #0


Epoch #6: 1001it [00:02, 410.37it/s, env_step=6000, gradient_step=600, len=0, n/ep=0, n/st=100, rew=0.00]                                                                                         


Epoch #6: test_reward: 12414.200000 ± 3958.440168, best_reward: 12414.200000 ± 3958.440168 in #6


Epoch #7: 1001it [00:01, 552.70it/s, env_step=7000, gradient_step=700, len=0, n/ep=0, n/st=100, rew=0.00]                                                                                         


Epoch #7: test_reward: 10752.100000 ± 5268.090706, best_reward: 12414.200000 ± 3958.440168 in #6


Epoch #8: 1001it [00:02, 440.54it/s, env_step=8000, gradient_step=800, len=0, n/ep=0, n/st=100, rew=0.00]                                                                                         


Epoch #8: test_reward: 13081.100000 ± 5616.877842, best_reward: 13081.100000 ± 5616.877842 in #8


Epoch #9: 1001it [00:02, 489.46it/s, env_step=9000, gradient_step=900, len=0, n/ep=0, n/st=100, rew=0.00]                                                                                         


Epoch #9: test_reward: 7821.600000 ± 4815.545643, best_reward: 13081.100000 ± 5616.877842 in #8


Epoch #10: 1001it [00:02, 484.38it/s, env_step=10000, gradient_step=1000, len=99, n/ep=0, n/st=100, rew=4161.00]                                                                                  


Epoch #10: test_reward: 10556.600000 ± 3611.990288, best_reward: 13081.100000 ± 5616.877842 in #8


Epoch #11: 1001it [00:02, 431.34it/s, env_step=11000, gradient_step=1100, len=103, n/ep=0, n/st=100, rew=3596.00]                                                                                 


Epoch #11: test_reward: 8141.300000 ± 2934.318560, best_reward: 13081.100000 ± 5616.877842 in #8


Epoch #12: 1001it [00:02, 464.31it/s, env_step=12000, gradient_step=1200, len=120, n/ep=2, n/st=100, rew=5058.50]                                                                                 


Epoch #12: test_reward: 10603.200000 ± 2653.971130, best_reward: 13081.100000 ± 5616.877842 in #8


Epoch #13: 1001it [00:02, 433.63it/s, env_step=13000, gradient_step=1300, len=129, n/ep=0, n/st=100, rew=4253.50]                                                                                 


Epoch #13: test_reward: 10357.600000 ± 2450.728512, best_reward: 13081.100000 ± 5616.877842 in #8


Epoch #14: 1001it [00:02, 386.76it/s, env_step=14000, gradient_step=1400, len=133, n/ep=0, n/st=100, rew=5451.00]                                                                                 


Epoch #14: test_reward: 10933.900000 ± 2284.952273, best_reward: 13081.100000 ± 5616.877842 in #8


Epoch #15: 1001it [00:02, 463.68it/s, env_step=15000, gradient_step=1500, len=148, n/ep=0, n/st=100, rew=6116.75]                                                                                 


Epoch #15: test_reward: 13325.200000 ± 4946.846365, best_reward: 13325.200000 ± 4946.846365 in #15


Epoch #16: 1001it [00:02, 386.66it/s, env_step=16000, gradient_step=1600, len=59, n/ep=0, n/st=100, rew=1973.00]                                                                                  


Epoch #16: test_reward: 12991.600000 ± 3806.481609, best_reward: 13325.200000 ± 4946.846365 in #15


Epoch #17: 1001it [00:02, 433.85it/s, env_step=17000, gradient_step=1700, len=170, n/ep=2, n/st=100, rew=7405.50]                                                                                 


Epoch #17: test_reward: 12638.900000 ± 2967.564404, best_reward: 13325.200000 ± 4946.846365 in #15


Epoch #18: 1001it [00:02, 388.90it/s, env_step=18000, gradient_step=1800, len=178, n/ep=0, n/st=100, rew=8456.17]                                                                                 


Epoch #18: test_reward: 13194.700000 ± 4968.472322, best_reward: 13325.200000 ± 4946.846365 in #15


Epoch #19: 1001it [00:02, 345.82it/s, env_step=19000, gradient_step=1900, len=190, n/ep=2, n/st=100, rew=8039.00]                                                                                 


Epoch #19: test_reward: 12985.500000 ± 2919.110866, best_reward: 13325.200000 ± 4946.846365 in #15


Epoch #20: 1001it [00:02, 397.71it/s, env_step=20000, gradient_step=2000, len=200, n/ep=1, n/st=100, rew=9431.00]                                                                                 


Epoch #20: test_reward: 13282.500000 ± 6251.285616, best_reward: 13325.200000 ± 4946.846365 in #15


Epoch #21: 1001it [00:02, 418.41it/s, env_step=21000, gradient_step=2100, len=210, n/ep=1, n/st=100, rew=10101.50]                                                                                


Epoch #21: test_reward: 15865.700000 ± 5129.303774, best_reward: 15865.700000 ± 5129.303774 in #21


Epoch #22: 1001it [00:02, 393.77it/s, env_step=22000, gradient_step=2200, len=184, n/ep=0, n/st=100, rew=9286.25]                                                                                 


Epoch #22: test_reward: 12345.400000 ± 5127.977402, best_reward: 15865.700000 ± 5129.303774 in #21


Epoch #23: 1001it [00:02, 436.21it/s, env_step=23000, gradient_step=2300, len=229, n/ep=0, n/st=100, rew=12040.50]                                                                                


Epoch #23: test_reward: 11321.500000 ± 4526.422191, best_reward: 15865.700000 ± 5129.303774 in #21


Epoch #24: 1001it [00:02, 410.40it/s, env_step=24000, gradient_step=2400, len=239, n/ep=0, n/st=100, rew=12626.50]                                                                                


Epoch #24: test_reward: 11988.800000 ± 3508.676155, best_reward: 15865.700000 ± 5129.303774 in #21


Epoch #25: 1001it [00:02, 351.87it/s, env_step=25000, gradient_step=2500, len=250, n/ep=1, n/st=100, rew=12492.00]                                                                                


Epoch #25: test_reward: 16780.000000 ± 5620.410003, best_reward: 16780.000000 ± 5620.410003 in #25


Epoch #26: 1001it [00:02, 443.22it/s, env_step=26000, gradient_step=2600, len=260, n/ep=2, n/st=100, rew=12583.00]                                                                                


Epoch #26: test_reward: 10824.600000 ± 4925.153159, best_reward: 16780.000000 ± 5620.410003 in #25


Epoch #27: 1001it [00:02, 421.26it/s, env_step=27000, gradient_step=2700, len=269, n/ep=0, n/st=100, rew=15167.00]                                                                                


Epoch #27: test_reward: 6599.200000 ± 3453.418851, best_reward: 16780.000000 ± 5620.410003 in #25


Epoch #28: 1001it [00:02, 395.12it/s, env_step=28000, gradient_step=2800, len=279, n/ep=0, n/st=100, rew=14940.00]                                                                                


Epoch #28: test_reward: 11664.800000 ± 5317.406789, best_reward: 16780.000000 ± 5620.410003 in #25


Epoch #29: 1001it [00:02, 455.13it/s, env_step=29000, gradient_step=2900, len=290, n/ep=1, n/st=100, rew=16736.00]                                                                                


Epoch #29: test_reward: 16565.900000 ± 3815.964346, best_reward: 16780.000000 ± 5620.410003 in #25


Epoch #30: 1001it [00:02, 403.43it/s, env_step=30000, gradient_step=3000, len=298, n/ep=0, n/st=100, rew=14485.00]                                                                                


Epoch #30: test_reward: 12729.100000 ± 5893.914055, best_reward: 16780.000000 ± 5620.410003 in #25


Epoch #31: 1001it [00:02, 373.04it/s, env_step=31000, gradient_step=3100, len=218, n/ep=1, n/st=100, rew=11827.50]                                                                                


Epoch #31: test_reward: 9024.600000 ± 3890.398365, best_reward: 16780.000000 ± 5620.410003 in #25


Epoch #32: 1001it [00:02, 436.92it/s, env_step=32000, gradient_step=3200, len=104, n/ep=0, n/st=100, rew=5079.00]                                                                                 


Epoch #32: test_reward: 9847.700000 ± 3973.281718, best_reward: 16780.000000 ± 5620.410003 in #25


Epoch #33: 1001it [00:02, 446.79it/s, env_step=33000, gradient_step=3300, len=122, n/ep=1, n/st=100, rew=6592.00]                                                                                 


Epoch #33: test_reward: 16730.500000 ± 5062.954182, best_reward: 16780.000000 ± 5620.410003 in #25


Epoch #34: 1001it [00:02, 412.69it/s, env_step=34000, gradient_step=3400, len=138, n/ep=0, n/st=100, rew=6504.00]                                                                                 


Epoch #34: test_reward: 13300.200000 ± 6292.761537, best_reward: 16780.000000 ± 5620.410003 in #25


Epoch #35: 1001it [00:02, 362.68it/s, env_step=35000, gradient_step=3500, len=264, n/ep=2, n/st=100, rew=14474.25]                                                                                


Epoch #35: test_reward: 13744.900000 ± 5199.434920, best_reward: 16780.000000 ± 5620.410003 in #25


Epoch #36: 1001it [00:02, 451.81it/s, env_step=36000, gradient_step=3600, len=108, n/ep=0, n/st=100, rew=5813.00]                                                                                 


Epoch #36: test_reward: 21508.100000 ± 4391.719127, best_reward: 21508.100000 ± 4391.719127 in #36


Epoch #37: 1001it [00:03, 319.31it/s, env_step=37000, gradient_step=3700, len=240, n/ep=0, n/st=100, rew=14320.00]                                                                                


Epoch #37: test_reward: 16391.700000 ± 6240.161137, best_reward: 21508.100000 ± 4391.719127 in #36


Epoch #38: 1001it [00:02, 363.87it/s, env_step=38000, gradient_step=3800, len=211, n/ep=1, n/st=100, rew=11545.50]                                                                                


Epoch #38: test_reward: 10637.400000 ± 4591.429150, best_reward: 21508.100000 ± 4391.719127 in #36


Epoch #39: 1001it [00:02, 373.34it/s, env_step=39000, gradient_step=3900, len=149, n/ep=1, n/st=100, rew=8097.00]                                                                                 


Epoch #39: test_reward: 12587.200000 ± 6230.206045, best_reward: 21508.100000 ± 4391.719127 in #36


Epoch #40: 1001it [00:02, 437.53it/s, env_step=40000, gradient_step=4000, len=154, n/ep=1, n/st=100, rew=8437.50]                                                                                 


Epoch #40: test_reward: 14336.400000 ± 3491.867214, best_reward: 21508.100000 ± 4391.719127 in #36


Epoch #41: 1001it [00:02, 365.63it/s, env_step=41000, gradient_step=4100, len=220, n/ep=1, n/st=100, rew=14031.00]                                                                                


Epoch #41: test_reward: 16248.300000 ± 2347.336493, best_reward: 21508.100000 ± 4391.719127 in #36


Epoch #42: 1001it [00:02, 383.83it/s, env_step=42000, gradient_step=4200, len=56, n/ep=2, n/st=100, rew=2232.50]                                                                                  


Epoch #42: test_reward: 13229.800000 ± 5125.238371, best_reward: 21508.100000 ± 4391.719127 in #36


Epoch #43: 1001it [00:02, 428.09it/s, env_step=43000, gradient_step=4300, len=214, n/ep=0, n/st=100, rew=11974.50]                                                                                


Epoch #43: test_reward: 14221.900000 ± 5623.218090, best_reward: 21508.100000 ± 4391.719127 in #36


Epoch #44: 1001it [00:02, 388.96it/s, env_step=44000, gradient_step=4400, len=176, n/ep=0, n/st=100, rew=9107.00]                                                                                 


Epoch #44: test_reward: 11948.400000 ± 5779.917425, best_reward: 21508.100000 ± 4391.719127 in #36


Epoch #45: 1001it [00:02, 431.25it/s, env_step=45000, gradient_step=4500, len=271, n/ep=0, n/st=100, rew=16855.00]                                                                                


Epoch #45: test_reward: 14671.200000 ± 5811.129043, best_reward: 21508.100000 ± 4391.719127 in #36


Epoch #46: 1001it [00:02, 433.44it/s, env_step=46000, gradient_step=4600, len=179, n/ep=0, n/st=100, rew=9180.00]                                                                                 


Epoch #46: test_reward: 12197.500000 ± 3678.634455, best_reward: 21508.100000 ± 4391.719127 in #36


Epoch #47: 1001it [00:02, 403.02it/s, env_step=47000, gradient_step=4700, len=226, n/ep=0, n/st=100, rew=12956.17]                                                                                


Epoch #47: test_reward: 16753.100000 ± 5305.690595, best_reward: 21508.100000 ± 4391.719127 in #36


Epoch #48: 1001it [00:02, 365.20it/s, env_step=48000, gradient_step=4800, len=275, n/ep=1, n/st=100, rew=16537.00]                                                                                


Epoch #48: test_reward: 16428.200000 ± 5866.489253, best_reward: 21508.100000 ± 4391.719127 in #36


Epoch #49: 1001it [00:02, 413.70it/s, env_step=49000, gradient_step=4900, len=199, n/ep=0, n/st=100, rew=11064.00]                                                                                


Epoch #49: test_reward: 13477.800000 ± 5507.976884, best_reward: 21508.100000 ± 4391.719127 in #36


Epoch #50: 1001it [00:02, 403.50it/s, env_step=50000, gradient_step=5000, len=229, n/ep=2, n/st=100, rew=13247.75]                                                                                


Epoch #50: test_reward: 9759.800000 ± 2831.243289, best_reward: 21508.100000 ± 4391.719127 in #36


Epoch #51: 1001it [00:02, 424.10it/s, env_step=51000, gradient_step=5100, len=140, n/ep=0, n/st=100, rew=5587.00]                                                                                 


Epoch #51: test_reward: 11446.200000 ± 5608.906236, best_reward: 21508.100000 ± 4391.719127 in #36


Epoch #52: 1001it [00:02, 369.67it/s, env_step=52000, gradient_step=5200, len=169, n/ep=0, n/st=100, rew=9475.00]                                                                                 


Epoch #52: test_reward: 13133.700000 ± 6109.194383, best_reward: 21508.100000 ± 4391.719127 in #36


Epoch #53: 1001it [00:02, 412.84it/s, env_step=53000, gradient_step=5300, len=349, n/ep=1, n/st=100, rew=21523.00]                                                                                


Epoch #53: test_reward: 15430.600000 ± 5733.441134, best_reward: 21508.100000 ± 4391.719127 in #36


Epoch #54: 1001it [00:02, 444.40it/s, env_step=54000, gradient_step=5400, len=227, n/ep=0, n/st=100, rew=12648.00]                                                                                


Epoch #54: test_reward: 16754.900000 ± 6241.793996, best_reward: 21508.100000 ± 4391.719127 in #36


Epoch #55: 1001it [00:02, 429.82it/s, env_step=55000, gradient_step=5500, len=118, n/ep=0, n/st=100, rew=5153.00]                                                                                 


Epoch #55: test_reward: 11784.400000 ± 7833.138148, best_reward: 21508.100000 ± 4391.719127 in #36


Epoch #56: 1001it [00:02, 410.30it/s, env_step=56000, gradient_step=5600, len=227, n/ep=0, n/st=100, rew=12236.50]                                                                                


Epoch #56: test_reward: 12533.800000 ± 3896.968432, best_reward: 21508.100000 ± 4391.719127 in #36


Epoch #57: 1001it [00:02, 360.41it/s, env_step=57000, gradient_step=5700, len=117, n/ep=0, n/st=100, rew=6607.00]                                                                                 


Epoch #57: test_reward: 14714.500000 ± 5076.263887, best_reward: 21508.100000 ± 4391.719127 in #36


Epoch #58: 1001it [00:02, 429.33it/s, env_step=58000, gradient_step=5800, len=117, n/ep=0, n/st=100, rew=6607.00]                                                                                 


Epoch #58: test_reward: 19091.200000 ± 4969.261370, best_reward: 21508.100000 ± 4391.719127 in #36


Epoch #59: 1001it [00:02, 362.35it/s, env_step=59000, gradient_step=5900, len=110, n/ep=1, n/st=100, rew=6075.50]                                                                                 


Epoch #59: test_reward: 10868.800000 ± 3768.240433, best_reward: 21508.100000 ± 4391.719127 in #36


Epoch #60: 1001it [00:03, 309.71it/s, env_step=60000, gradient_step=6000, len=150, n/ep=2, n/st=100, rew=8868.25]                                                                                 


Epoch #60: test_reward: 13916.600000 ± 5804.690590, best_reward: 21508.100000 ± 4391.719127 in #36


Epoch #61: 1001it [00:02, 350.93it/s, env_step=61000, gradient_step=6100, len=244, n/ep=2, n/st=100, rew=14671.00]                                                                                


Epoch #61: test_reward: 17267.600000 ± 5274.448563, best_reward: 21508.100000 ± 4391.719127 in #36


Epoch #62: 1001it [00:02, 427.85it/s, env_step=62000, gradient_step=6200, len=153, n/ep=0, n/st=100, rew=7767.50]                                                                                 


Epoch #62: test_reward: 18795.500000 ± 6702.453450, best_reward: 21508.100000 ± 4391.719127 in #36


Epoch #63: 1001it [00:02, 397.84it/s, env_step=63000, gradient_step=6300, len=148, n/ep=0, n/st=100, rew=7997.00]                                                                                 


Epoch #63: test_reward: 12351.200000 ± 5031.346317, best_reward: 21508.100000 ± 4391.719127 in #36


Epoch #64: 1001it [00:02, 426.57it/s, env_step=64000, gradient_step=6400, len=172, n/ep=0, n/st=100, rew=10341.00]                                                                                


Epoch #64: test_reward: 15679.300000 ± 7084.169620, best_reward: 21508.100000 ± 4391.719127 in #36


Epoch #65: 1001it [00:03, 326.44it/s, env_step=65000, gradient_step=6500, len=206, n/ep=0, n/st=100, rew=12888.00]                                                                                


Epoch #65: test_reward: 12566.800000 ± 3436.099207, best_reward: 21508.100000 ± 4391.719127 in #36


Epoch #66: 1001it [00:02, 351.68it/s, env_step=66000, gradient_step=6600, len=152, n/ep=1, n/st=100, rew=9148.00]                                                                                 


Epoch #66: test_reward: 14577.200000 ± 5088.631384, best_reward: 21508.100000 ± 4391.719127 in #36


Epoch #67: 1001it [00:02, 392.78it/s, env_step=67000, gradient_step=6700, len=243, n/ep=0, n/st=100, rew=14021.75]                                                                                


Epoch #67: test_reward: 15846.700000 ± 3926.715168, best_reward: 21508.100000 ± 4391.719127 in #36


Epoch #68: 1001it [00:02, 377.46it/s, env_step=68000, gradient_step=6800, len=271, n/ep=0, n/st=100, rew=16474.50]                                                                                


Epoch #68: test_reward: 9583.900000 ± 3317.794643, best_reward: 21508.100000 ± 4391.719127 in #36


Epoch #69: 1001it [00:02, 370.29it/s, env_step=69000, gradient_step=6900, len=211, n/ep=0, n/st=100, rew=12492.75]                                                                                


Epoch #69: test_reward: 9482.800000 ± 3388.929176, best_reward: 21508.100000 ± 4391.719127 in #36


Epoch #70: 1001it [00:02, 370.37it/s, env_step=70000, gradient_step=7000, len=229, n/ep=0, n/st=100, rew=13958.00]                                                                                


Epoch #70: test_reward: 10722.100000 ± 6782.767701, best_reward: 21508.100000 ± 4391.719127 in #36


Epoch #71: 1001it [00:02, 375.47it/s, env_step=71000, gradient_step=7100, len=185, n/ep=1, n/st=100, rew=11198.00]                                                                                


Epoch #71: test_reward: 17101.000000 ± 6065.808108, best_reward: 21508.100000 ± 4391.719127 in #36


Epoch #72: 1001it [00:02, 422.31it/s, env_step=72000, gradient_step=7200, len=202, n/ep=2, n/st=100, rew=12562.75]                                                                                


Epoch #72: test_reward: 10864.000000 ± 5292.164699, best_reward: 21508.100000 ± 4391.719127 in #36


Epoch #73: 1001it [00:02, 443.42it/s, env_step=73000, gradient_step=7300, len=254, n/ep=0, n/st=100, rew=16826.50]                                                                                


Epoch #73: test_reward: 13231.800000 ± 4752.638884, best_reward: 21508.100000 ± 4391.719127 in #36


Epoch #74: 1001it [00:03, 329.81it/s, env_step=74000, gradient_step=7400, len=282, n/ep=0, n/st=100, rew=16418.50]                                                                                


Epoch #74: test_reward: 13381.800000 ± 4318.575478, best_reward: 21508.100000 ± 4391.719127 in #36


Epoch #75: 1001it [00:02, 352.73it/s, env_step=75000, gradient_step=7500, len=351, n/ep=0, n/st=100, rew=21887.00]                                                                                


Epoch #75: test_reward: 9612.800000 ± 2334.403213, best_reward: 21508.100000 ± 4391.719127 in #36


Epoch #76: 1001it [00:02, 432.82it/s, env_step=76000, gradient_step=7600, len=351, n/ep=0, n/st=100, rew=22134.00]                                                                                


Epoch #76: test_reward: 11484.700000 ± 4687.197949, best_reward: 21508.100000 ± 4391.719127 in #36


Epoch #77: 1001it [00:02, 412.84it/s, env_step=77000, gradient_step=7700, len=185, n/ep=1, n/st=100, rew=10679.00]                                                                                


Epoch #77: test_reward: 11716.300000 ± 6779.262852, best_reward: 21508.100000 ± 4391.719127 in #36


Epoch #78: 1001it [00:03, 333.12it/s, env_step=78000, gradient_step=7800, len=232, n/ep=0, n/st=100, rew=13237.00]                                                                                


Epoch #78: test_reward: 7785.500000 ± 3331.217713, best_reward: 21508.100000 ± 4391.719127 in #36


Epoch #79: 1001it [00:02, 450.51it/s, env_step=79000, gradient_step=7900, len=379, n/ep=0, n/st=100, rew=23167.00]                                                                                


Epoch #79: test_reward: 12424.600000 ± 4099.990566, best_reward: 21508.100000 ± 4391.719127 in #36


Epoch #80: 1001it [00:02, 372.52it/s, env_step=80000, gradient_step=8000, len=91, n/ep=0, n/st=100, rew=4915.00]                                                                                  


Epoch #80: test_reward: 13609.600000 ± 3197.816011, best_reward: 21508.100000 ± 4391.719127 in #36


Epoch #81: 1001it [00:02, 378.53it/s, env_step=81000, gradient_step=8100, len=135, n/ep=0, n/st=100, rew=7283.67]                                                                                 


Epoch #81: test_reward: 9815.200000 ± 4656.508409, best_reward: 21508.100000 ± 4391.719127 in #36


Epoch #82: 1001it [00:02, 406.12it/s, env_step=82000, gradient_step=8200, len=180, n/ep=0, n/st=100, rew=10736.00]                                                                                


Epoch #82: test_reward: 14860.800000 ± 5754.640159, best_reward: 21508.100000 ± 4391.719127 in #36


Epoch #83: 1001it [00:02, 440.39it/s, env_step=83000, gradient_step=8300, len=192, n/ep=0, n/st=100, rew=11765.00]                                                                                


Epoch #83: test_reward: 10178.100000 ± 3659.046937, best_reward: 21508.100000 ± 4391.719127 in #36


Epoch #84: 1001it [00:02, 411.25it/s, env_step=84000, gradient_step=8400, len=86, n/ep=0, n/st=100, rew=4889.00]                                                                                  


Epoch #84: test_reward: 13775.900000 ± 4182.140731, best_reward: 21508.100000 ± 4391.719127 in #36


Epoch #85: 1001it [00:02, 427.92it/s, env_step=85000, gradient_step=8500, len=193, n/ep=0, n/st=100, rew=11645.00]                                                                                


Epoch #85: test_reward: 11362.300000 ± 4779.455535, best_reward: 21508.100000 ± 4391.719127 in #36


Epoch #86: 1001it [00:02, 375.22it/s, env_step=86000, gradient_step=8600, len=161, n/ep=2, n/st=100, rew=9418.50]                                                                                 


Epoch #86: test_reward: 15380.500000 ± 5260.770043, best_reward: 21508.100000 ± 4391.719127 in #36


Epoch #87: 1001it [00:03, 324.35it/s, env_step=87000, gradient_step=8700, len=123, n/ep=0, n/st=100, rew=6682.00]                                                                                 


Epoch #87: test_reward: 10928.400000 ± 6394.003115, best_reward: 21508.100000 ± 4391.719127 in #36


Epoch #88: 1001it [00:02, 396.10it/s, env_step=88000, gradient_step=8800, len=208, n/ep=2, n/st=100, rew=11951.75]                                                                                


Epoch #88: test_reward: 15918.600000 ± 5229.110903, best_reward: 21508.100000 ± 4391.719127 in #36


Epoch #89: 1001it [00:02, 383.98it/s, env_step=89000, gradient_step=8900, len=147, n/ep=1, n/st=100, rew=8846.50]                                                                                 


Epoch #89: test_reward: 12731.300000 ± 6154.691837, best_reward: 21508.100000 ± 4391.719127 in #36


Epoch #90: 1001it [00:02, 365.86it/s, env_step=90000, gradient_step=9000, len=161, n/ep=1, n/st=100, rew=8763.00]                                                                                 


Epoch #90: test_reward: 10583.200000 ± 6419.445284, best_reward: 21508.100000 ± 4391.719127 in #36


Epoch #91: 1001it [00:02, 411.33it/s, env_step=91000, gradient_step=9100, len=237, n/ep=0, n/st=100, rew=14666.50]                                                                                


Epoch #91: test_reward: 11635.800000 ± 5069.219522, best_reward: 21508.100000 ± 4391.719127 in #36


Epoch #92: 1001it [00:02, 374.86it/s, env_step=92000, gradient_step=9200, len=98, n/ep=1, n/st=100, rew=4857.00]                                                                                  


Epoch #92: test_reward: 9631.600000 ± 1755.406460, best_reward: 21508.100000 ± 4391.719127 in #36


Epoch #93: 1001it [00:02, 397.33it/s, env_step=93000, gradient_step=9300, len=101, n/ep=0, n/st=100, rew=5596.00]                                                                                 


Epoch #93: test_reward: 9129.400000 ± 3283.855301, best_reward: 21508.100000 ± 4391.719127 in #36


Epoch #94: 1001it [00:02, 416.64it/s, env_step=94000, gradient_step=9400, len=179, n/ep=0, n/st=100, rew=10687.33]                                                                                


Epoch #94: test_reward: 9621.200000 ± 2524.074040, best_reward: 21508.100000 ± 4391.719127 in #36


Epoch #95: 1001it [00:02, 435.04it/s, env_step=95000, gradient_step=9500, len=144, n/ep=0, n/st=100, rew=8785.50]                                                                                 


Epoch #95: test_reward: 11748.200000 ± 5308.448112, best_reward: 21508.100000 ± 4391.719127 in #36


Epoch #96: 1001it [00:02, 403.05it/s, env_step=96000, gradient_step=9600, len=192, n/ep=1, n/st=100, rew=11077.00]                                                                                


Epoch #96: test_reward: 13695.900000 ± 5034.178452, best_reward: 21508.100000 ± 4391.719127 in #36


Epoch #97: 1001it [00:02, 371.97it/s, env_step=97000, gradient_step=9700, len=137, n/ep=0, n/st=100, rew=7048.38]                                                                                 


Epoch #97: test_reward: 12563.000000 ± 4898.672739, best_reward: 21508.100000 ± 4391.719127 in #36


Epoch #98: 1001it [00:02, 356.40it/s, env_step=98000, gradient_step=9800, len=187, n/ep=3, n/st=100, rew=11100.33]                                                                                


Epoch #98: test_reward: 7802.000000 ± 3079.845710, best_reward: 21508.100000 ± 4391.719127 in #36


Epoch #99: 1001it [00:02, 337.11it/s, env_step=99000, gradient_step=9900, len=131, n/ep=1, n/st=100, rew=7361.00]                                                                                 


Epoch #99: test_reward: 9516.300000 ± 5832.053739, best_reward: 21508.100000 ± 4391.719127 in #36


Epoch #100: 1001it [00:02, 337.83it/s, env_step=100000, gradient_step=10000, len=54, n/ep=0, n/st=100, rew=2066.00]                                                                               


Epoch #100: test_reward: 10233.200000 ± 6602.063387, best_reward: 21508.100000 ± 4391.719127 in #36


Epoch #101: 1001it [00:02, 424.97it/s, env_step=101000, gradient_step=10100, len=141, n/ep=0, n/st=100, rew=7598.50]                                                                              


Epoch #101: test_reward: 11167.500000 ± 4471.596298, best_reward: 21508.100000 ± 4391.719127 in #36


Epoch #102: 1001it [00:02, 396.96it/s, env_step=102000, gradient_step=10200, len=252, n/ep=0, n/st=100, rew=15451.00]                                                                             


Epoch #102: test_reward: 14250.600000 ± 6001.572197, best_reward: 21508.100000 ± 4391.719127 in #36


Epoch #103: 1001it [00:02, 392.71it/s, env_step=103000, gradient_step=10300, len=180, n/ep=0, n/st=100, rew=10424.50]                                                                             


Epoch #103: test_reward: 13070.000000 ± 5514.464108, best_reward: 21508.100000 ± 4391.719127 in #36


Epoch #104: 1001it [00:02, 407.52it/s, env_step=104000, gradient_step=10400, len=111, n/ep=0, n/st=100, rew=6302.00]                                                                              


Epoch #104: test_reward: 8087.400000 ± 3889.659605, best_reward: 21508.100000 ± 4391.719127 in #36


Epoch #105: 1001it [00:02, 387.32it/s, env_step=105000, gradient_step=10500, len=158, n/ep=0, n/st=100, rew=8627.00]                                                                              


Epoch #105: test_reward: 9693.200000 ± 3681.345292, best_reward: 21508.100000 ± 4391.719127 in #36


Epoch #106: 1001it [00:02, 404.70it/s, env_step=106000, gradient_step=10600, len=119, n/ep=2, n/st=100, rew=5776.00]                                                                              


Epoch #106: test_reward: 10744.600000 ± 5443.026239, best_reward: 21508.100000 ± 4391.719127 in #36


Epoch #107: 1001it [00:02, 393.21it/s, env_step=107000, gradient_step=10700, len=32, n/ep=0, n/st=100, rew=1169.00]                                                                               


Epoch #107: test_reward: 11019.800000 ± 3293.435131, best_reward: 21508.100000 ± 4391.719127 in #36


Epoch #108: 1001it [00:02, 394.76it/s, env_step=108000, gradient_step=10800, len=44, n/ep=1, n/st=100, rew=1461.00]                                                                               


Epoch #108: test_reward: 9313.800000 ± 4087.464759, best_reward: 21508.100000 ± 4391.719127 in #36


Epoch #109: 1001it [00:03, 317.26it/s, env_step=109000, gradient_step=10900, len=257, n/ep=1, n/st=100, rew=15184.50]                                                                             


Epoch #109: test_reward: 11091.300000 ± 4929.531865, best_reward: 21508.100000 ± 4391.719127 in #36


Epoch #110: 1001it [00:03, 328.09it/s, env_step=110000, gradient_step=11000, len=121, n/ep=0, n/st=100, rew=5837.00]                                                                              


Epoch #110: test_reward: 8645.600000 ± 1704.953911, best_reward: 21508.100000 ± 4391.719127 in #36


Epoch #111: 1001it [00:02, 355.90it/s, env_step=111000, gradient_step=11100, len=109, n/ep=1, n/st=100, rew=5731.00]                                                                              


Epoch #111: test_reward: 9596.200000 ± 3434.686909, best_reward: 21508.100000 ± 4391.719127 in #36


Epoch #112: 1001it [00:02, 399.15it/s, env_step=112000, gradient_step=11200, len=168, n/ep=0, n/st=100, rew=9255.00]                                                                              


Epoch #112: test_reward: 11192.100000 ± 5345.044592, best_reward: 21508.100000 ± 4391.719127 in #36


Epoch #113: 1001it [00:02, 339.00it/s, env_step=113000, gradient_step=11300, len=221, n/ep=1, n/st=100, rew=14411.00]                                                                             


Epoch #113: test_reward: 10068.800000 ± 3810.077422, best_reward: 21508.100000 ± 4391.719127 in #36


Epoch #114: 1001it [00:02, 378.65it/s, env_step=114000, gradient_step=11400, len=195, n/ep=1, n/st=100, rew=9903.00]                                                                              


Epoch #114: test_reward: 8978.800000 ± 3085.749854, best_reward: 21508.100000 ± 4391.719127 in #36


Epoch #115: 1001it [00:02, 393.43it/s, env_step=115000, gradient_step=11500, len=143, n/ep=0, n/st=100, rew=7617.50]                                                                              


Epoch #115: test_reward: 9145.000000 ± 3705.532647, best_reward: 21508.100000 ± 4391.719127 in #36


Epoch #116: 1001it [00:02, 433.20it/s, env_step=116000, gradient_step=11600, len=174, n/ep=1, n/st=100, rew=8890.00]                                                                              


Epoch #116: test_reward: 9212.600000 ± 3254.450190, best_reward: 21508.100000 ± 4391.719127 in #36


Epoch #117: 1001it [00:02, 354.26it/s, env_step=117000, gradient_step=11700, len=106, n/ep=0, n/st=100, rew=5478.00]                                                                              


Epoch #117: test_reward: 11629.700000 ± 4186.160772, best_reward: 21508.100000 ± 4391.719127 in #36


Epoch #118: 1001it [00:02, 372.32it/s, env_step=118000, gradient_step=11800, len=221, n/ep=2, n/st=100, rew=11787.75]                                                                             


Epoch #118: test_reward: 11986.500000 ± 4397.585002, best_reward: 21508.100000 ± 4391.719127 in #36


Epoch #119: 1001it [00:02, 416.57it/s, env_step=119000, gradient_step=11900, len=185, n/ep=0, n/st=100, rew=9750.50]                                                                              


Epoch #119: test_reward: 9819.000000 ± 3932.662533, best_reward: 21508.100000 ± 4391.719127 in #36


Epoch #120: 1001it [00:02, 402.60it/s, env_step=120000, gradient_step=12000, len=153, n/ep=2, n/st=100, rew=8952.00]                                                                              


Epoch #120: test_reward: 10466.700000 ± 4688.374709, best_reward: 21508.100000 ± 4391.719127 in #36


Epoch #121: 1001it [00:02, 413.24it/s, env_step=121000, gradient_step=12100, len=167, n/ep=2, n/st=100, rew=9835.75]                                                                              


Epoch #121: test_reward: 12178.800000 ± 5054.120691, best_reward: 21508.100000 ± 4391.719127 in #36


Epoch #122: 1001it [00:02, 380.32it/s, env_step=122000, gradient_step=12200, len=175, n/ep=1, n/st=100, rew=10048.50]                                                                             


Epoch #122: test_reward: 13244.900000 ± 5417.893953, best_reward: 21508.100000 ± 4391.719127 in #36


Epoch #123: 1001it [00:02, 391.45it/s, env_step=123000, gradient_step=12300, len=61, n/ep=0, n/st=100, rew=3070.00]                                                                               


Epoch #123: test_reward: 12307.300000 ± 5148.184108, best_reward: 21508.100000 ± 4391.719127 in #36


Epoch #124: 1001it [00:02, 401.34it/s, env_step=124000, gradient_step=12400, len=150, n/ep=0, n/st=100, rew=9176.50]                                                                              


Epoch #124: test_reward: 11197.500000 ± 2081.721175, best_reward: 21508.100000 ± 4391.719127 in #36


Epoch #125: 1001it [00:02, 417.66it/s, env_step=125000, gradient_step=12500, len=189, n/ep=2, n/st=100, rew=10819.25]                                                                             


Epoch #125: test_reward: 10278.000000 ± 2775.412762, best_reward: 21508.100000 ± 4391.719127 in #36


Epoch #126: 1001it [00:02, 379.65it/s, env_step=126000, gradient_step=12600, len=191, n/ep=0, n/st=100, rew=10376.00]                                                                             


Epoch #126: test_reward: 9573.900000 ± 4013.293821, best_reward: 21508.100000 ± 4391.719127 in #36


Epoch #127: 1001it [00:02, 403.02it/s, env_step=127000, gradient_step=12700, len=128, n/ep=0, n/st=100, rew=7492.50]                                                                              


Epoch #127: test_reward: 11732.000000 ± 3716.985983, best_reward: 21508.100000 ± 4391.719127 in #36


Epoch #128: 1001it [00:02, 348.14it/s, env_step=128000, gradient_step=12800, len=247, n/ep=1, n/st=100, rew=14284.00]                                                                             


Epoch #128: test_reward: 7890.100000 ± 2134.430859, best_reward: 21508.100000 ± 4391.719127 in #36


Epoch #129: 1001it [00:02, 390.06it/s, env_step=129000, gradient_step=12900, len=137, n/ep=0, n/st=100, rew=7975.75]                                                                              


Epoch #129: test_reward: 12528.200000 ± 3844.312313, best_reward: 21508.100000 ± 4391.719127 in #36


Epoch #130: 1001it [00:02, 378.20it/s, env_step=130000, gradient_step=13000, len=239, n/ep=0, n/st=100, rew=12957.50]                                                                             


Epoch #130: test_reward: 17649.700000 ± 7109.132634, best_reward: 21508.100000 ± 4391.719127 in #36


Epoch #131: 1001it [00:02, 423.75it/s, env_step=131000, gradient_step=13100, len=110, n/ep=2, n/st=100, rew=5620.50]                                                                              


Epoch #131: test_reward: 8726.900000 ± 6420.020365, best_reward: 21508.100000 ± 4391.719127 in #36


Epoch #132: 1001it [00:02, 365.22it/s, env_step=132000, gradient_step=13200, len=195, n/ep=1, n/st=100, rew=12147.50]                                                                             


Epoch #132: test_reward: 9861.800000 ± 4087.782450, best_reward: 21508.100000 ± 4391.719127 in #36


Epoch #133: 1001it [00:02, 417.78it/s, env_step=133000, gradient_step=13300, len=90, n/ep=2, n/st=100, rew=4400.00]                                                                               


Epoch #133: test_reward: 12197.300000 ± 5458.456852, best_reward: 21508.100000 ± 4391.719127 in #36


Epoch #134: 1001it [00:02, 417.99it/s, env_step=134000, gradient_step=13400, len=134, n/ep=1, n/st=100, rew=7202.00]                                                                              


Epoch #134: test_reward: 11036.600000 ± 1358.921425, best_reward: 21508.100000 ± 4391.719127 in #36


Epoch #135: 1001it [00:02, 335.27it/s, env_step=135000, gradient_step=13500, len=142, n/ep=0, n/st=100, rew=7368.75]                                                                              


Epoch #135: test_reward: 15103.400000 ± 4672.195655, best_reward: 21508.100000 ± 4391.719127 in #36


Epoch #136: 1001it [00:02, 408.13it/s, env_step=136000, gradient_step=13600, len=165, n/ep=2, n/st=100, rew=9877.00]                                                                              


Epoch #136: test_reward: 12732.200000 ± 6349.133213, best_reward: 21508.100000 ± 4391.719127 in #36


Epoch #137: 1001it [00:02, 357.74it/s, env_step=137000, gradient_step=13700, len=113, n/ep=1, n/st=100, rew=6619.00]                                                                              


Epoch #137: test_reward: 10202.600000 ± 2447.828719, best_reward: 21508.100000 ± 4391.719127 in #36


Epoch #138: 1001it [00:02, 379.79it/s, env_step=138000, gradient_step=13800, len=141, n/ep=1, n/st=100, rew=8594.00]                                                                              


Epoch #138: test_reward: 9775.400000 ± 2814.786535, best_reward: 21508.100000 ± 4391.719127 in #36


Epoch #139: 1001it [00:03, 324.84it/s, env_step=139000, gradient_step=13900, len=178, n/ep=2, n/st=100, rew=10611.50]                                                                             


Epoch #139: test_reward: 9488.200000 ± 2449.068141, best_reward: 21508.100000 ± 4391.719127 in #36


Epoch #140: 1001it [00:02, 361.73it/s, env_step=140000, gradient_step=14000, len=174, n/ep=0, n/st=100, rew=9985.50]                                                                              


Epoch #140: test_reward: 11533.700000 ± 4075.422924, best_reward: 21508.100000 ± 4391.719127 in #36


Epoch #141: 1001it [00:02, 392.17it/s, env_step=141000, gradient_step=14100, len=138, n/ep=0, n/st=100, rew=8204.50]                                                                              


Epoch #141: test_reward: 9337.900000 ± 4114.093641, best_reward: 21508.100000 ± 4391.719127 in #36


Epoch #142: 1001it [00:02, 405.97it/s, env_step=142000, gradient_step=14200, len=165, n/ep=1, n/st=100, rew=9932.50]                                                                              


Epoch #142: test_reward: 13415.400000 ± 3670.904662, best_reward: 21508.100000 ± 4391.719127 in #36


Epoch #143: 1001it [00:02, 375.58it/s, env_step=143000, gradient_step=14300, len=216, n/ep=1, n/st=100, rew=12752.50]                                                                             


Epoch #143: test_reward: 11854.900000 ± 6312.662615, best_reward: 21508.100000 ± 4391.719127 in #36


Epoch #144: 1001it [00:02, 374.55it/s, env_step=144000, gradient_step=14400, len=152, n/ep=1, n/st=100, rew=8185.00]                                                                              


Epoch #144: test_reward: 9281.200000 ± 3285.967492, best_reward: 21508.100000 ± 4391.719127 in #36


Epoch #145: 1001it [00:02, 416.09it/s, env_step=145000, gradient_step=14500, len=147, n/ep=0, n/st=100, rew=8442.33]                                                                              


Epoch #145: test_reward: 13227.800000 ± 5947.282956, best_reward: 21508.100000 ± 4391.719127 in #36


Epoch #146: 1001it [00:02, 393.12it/s, env_step=146000, gradient_step=14600, len=34, n/ep=0, n/st=100, rew=1134.00]                                                                               


Epoch #146: test_reward: 11981.800000 ± 3377.370539, best_reward: 21508.100000 ± 4391.719127 in #36


Epoch #147: 1001it [00:02, 377.59it/s, env_step=147000, gradient_step=14700, len=181, n/ep=0, n/st=100, rew=9812.50]                                                                              


Epoch #147: test_reward: 10165.700000 ± 3613.264564, best_reward: 21508.100000 ± 4391.719127 in #36


Epoch #148: 1001it [00:02, 366.39it/s, env_step=148000, gradient_step=14800, len=160, n/ep=0, n/st=100, rew=9161.00]                                                                              


Epoch #148: test_reward: 11477.700000 ± 3970.943542, best_reward: 21508.100000 ± 4391.719127 in #36


Epoch #149: 1001it [00:02, 352.54it/s, env_step=149000, gradient_step=14900, len=131, n/ep=0, n/st=100, rew=7196.25]                                                                              


Epoch #149: test_reward: 12724.700000 ± 4854.932627, best_reward: 21508.100000 ± 4391.719127 in #36


Epoch #150: 1001it [00:02, 395.81it/s, env_step=150000, gradient_step=15000, len=130, n/ep=0, n/st=100, rew=7090.50]                                                                              


Epoch #150: test_reward: 11796.700000 ± 7144.509753, best_reward: 21508.100000 ± 4391.719127 in #36

InfoStats(gradient_step=15000, best_reward=21508.1, best_reward_std=4391.719126947897, train_step=150000, train_episode=790, test_step=326848, test_episode=1510, timing=TimingStats(total_time=563.2913601398468, train_time=383.026517868042, train_time_collect=50.963677406311035, train_time_update=326.17038130760193, test_time=180.2648422718048, update_speed=391.61779407575403))

(the trained policy can be accessed via policy.policies[agents[1]])
Train with agent_opponent #2


Epoch #1: 1001it [00:03, 307.50it/s, env_step=1000, gradient_step=100, len=0, n/ep=0, n/st=100, rew=0.00]                                                                                         


Epoch #1: test_reward: 13801.100000 ± 4361.703026, best_reward: 13801.100000 ± 4361.703026 in #1


Epoch #2: 1001it [00:02, 344.67it/s, env_step=2000, gradient_step=200, len=0, n/ep=0, n/st=100, rew=0.00]                                                                                         


Epoch #2: test_reward: 12846.800000 ± 2956.867051, best_reward: 13801.100000 ± 4361.703026 in #1


Epoch #3: 1001it [00:02, 342.41it/s, env_step=3000, gradient_step=300, len=0, n/ep=0, n/st=100, rew=0.00]                                                                                         


Epoch #3: test_reward: 14120.100000 ± 4001.805479, best_reward: 14120.100000 ± 4001.805479 in #3


Epoch #4: 1001it [00:03, 314.36it/s, env_step=4000, gradient_step=400, len=0, n/ep=0, n/st=100, rew=0.00]                                                                                         


Epoch #4: test_reward: 14155.200000 ± 4153.041965, best_reward: 14155.200000 ± 4153.041965 in #4


Epoch #5: 1001it [00:02, 367.44it/s, env_step=5000, gradient_step=500, len=0, n/ep=0, n/st=100, rew=0.00]                                                                                         


Epoch #5: test_reward: 13104.500000 ± 3399.959713, best_reward: 14155.200000 ± 4153.041965 in #4


Epoch #6: 1001it [00:02, 349.30it/s, env_step=6000, gradient_step=600, len=0, n/ep=0, n/st=100, rew=0.00]                                                                                         


Epoch #6: test_reward: 9998.300000 ± 3227.321894, best_reward: 14155.200000 ± 4153.041965 in #4


Epoch #7: 1001it [00:02, 392.47it/s, env_step=7000, gradient_step=700, len=0, n/ep=0, n/st=100, rew=0.00]                                                                                         


Epoch #7: test_reward: 13234.300000 ± 3258.248427, best_reward: 14155.200000 ± 4153.041965 in #4


Epoch #8: 1001it [00:02, 341.85it/s, env_step=8000, gradient_step=800, len=0, n/ep=0, n/st=100, rew=0.00]                                                                                         


Epoch #8: test_reward: 14954.500000 ± 5484.165228, best_reward: 14954.500000 ± 5484.165228 in #8


Epoch #9: 1001it [00:03, 297.55it/s, env_step=9000, gradient_step=900, len=0, n/ep=0, n/st=100, rew=0.00]                                                                                         


Epoch #9: test_reward: 7718.600000 ± 2550.185374, best_reward: 14954.500000 ± 5484.165228 in #8


Epoch #10: 1001it [00:03, 313.74it/s, env_step=10000, gradient_step=1000, len=0, n/ep=0, n/st=100, rew=0.00]                                                                                      


Epoch #10: test_reward: 7104.800000 ± 1803.586471, best_reward: 14954.500000 ± 5484.165228 in #8


Epoch #11: 1001it [00:03, 302.20it/s, env_step=11000, gradient_step=1100, len=102, n/ep=0, n/st=100, rew=2624.00]                                                                                 


Epoch #11: test_reward: 14564.200000 ± 3225.743133, best_reward: 14954.500000 ± 5484.165228 in #8


Epoch #12: 1001it [00:03, 283.92it/s, env_step=12000, gradient_step=1200, len=118, n/ep=0, n/st=100, rew=3269.00]                                                                                 


Epoch #12: test_reward: 12077.900000 ± 3337.833053, best_reward: 14954.500000 ± 5484.165228 in #8


Epoch #13: 1001it [00:03, 325.10it/s, env_step=13000, gradient_step=1300, len=118, n/ep=0, n/st=100, rew=3269.00]                                                                                 


Epoch #13: test_reward: 13349.900000 ± 2154.560811, best_reward: 14954.500000 ± 5484.165228 in #8


Epoch #14: 1001it [00:02, 335.19it/s, env_step=14000, gradient_step=1400, len=138, n/ep=0, n/st=100, rew=4755.00]                                                                                 


Epoch #14: test_reward: 15762.000000 ± 1878.057134, best_reward: 15762.000000 ± 1878.057134 in #14


Epoch #15: 1001it [00:03, 299.75it/s, env_step=15000, gradient_step=1500, len=150, n/ep=1, n/st=100, rew=5341.00]                                                                                 


Epoch #15: test_reward: 11805.100000 ± 4139.015304, best_reward: 15762.000000 ± 1878.057134 in #14


Epoch #16: 1001it [00:03, 283.09it/s, env_step=16000, gradient_step=1600, len=160, n/ep=1, n/st=100, rew=4668.00]                                                                                 


Epoch #16: test_reward: 12988.900000 ± 3079.993131, best_reward: 15762.000000 ± 1878.057134 in #14


Epoch #17: 1001it [00:03, 312.40it/s, env_step=17000, gradient_step=1700, len=164, n/ep=0, n/st=100, rew=5681.50]                                                                                 


Epoch #17: test_reward: 10150.000000 ± 6644.383252, best_reward: 15762.000000 ± 1878.057134 in #14


Epoch #18: 1001it [00:03, 274.57it/s, env_step=18000, gradient_step=1800, len=180, n/ep=2, n/st=100, rew=6764.75]                                                                                 


Epoch #18: test_reward: 9972.400000 ± 1844.651523, best_reward: 15762.000000 ± 1878.057134 in #14


Epoch #19: 1001it [00:03, 286.17it/s, env_step=19000, gradient_step=1900, len=190, n/ep=3, n/st=100, rew=7505.33]                                                                                 


Epoch #19: test_reward: 15415.800000 ± 3991.393185, best_reward: 15762.000000 ± 1878.057134 in #14


Epoch #20: 1001it [00:03, 290.74it/s, env_step=20000, gradient_step=2000, len=198, n/ep=0, n/st=100, rew=7477.00]                                                                                 


Epoch #20: test_reward: 9406.400000 ± 4993.409320, best_reward: 15762.000000 ± 1878.057134 in #14


Epoch #21: 1001it [00:03, 290.41it/s, env_step=21000, gradient_step=2100, len=143, n/ep=2, n/st=100, rew=5149.50]                                                                                 


Epoch #21: test_reward: 13144.000000 ± 3281.811817, best_reward: 15762.000000 ± 1878.057134 in #14


Epoch #22: 1001it [00:03, 312.62it/s, env_step=22000, gradient_step=2200, len=220, n/ep=1, n/st=100, rew=9710.50]                                                                                 


Epoch #22: test_reward: 13925.500000 ± 6286.175773, best_reward: 15762.000000 ± 1878.057134 in #14


Epoch #23: 1001it [00:03, 297.80it/s, env_step=23000, gradient_step=2300, len=230, n/ep=1, n/st=100, rew=9583.00]                                                                                 


Epoch #23: test_reward: 10942.600000 ± 3425.637932, best_reward: 15762.000000 ± 1878.057134 in #14


Epoch #24: 1001it [00:03, 283.29it/s, env_step=24000, gradient_step=2400, len=240, n/ep=3, n/st=100, rew=9367.50]                                                                                 


Epoch #24: test_reward: 11897.600000 ± 5217.918420, best_reward: 15762.000000 ± 1878.057134 in #14


Epoch #25: 1001it [00:03, 302.69it/s, env_step=25000, gradient_step=2500, len=244, n/ep=0, n/st=100, rew=7964.00]                                                                                 


Epoch #25: test_reward: 8843.400000 ± 3341.537137, best_reward: 15762.000000 ± 1878.057134 in #14


Epoch #26: 1001it [00:03, 294.49it/s, env_step=26000, gradient_step=2600, len=252, n/ep=0, n/st=100, rew=10707.00]                                                                                


Epoch #26: test_reward: 11623.200000 ± 2643.026667, best_reward: 15762.000000 ± 1878.057134 in #14


Epoch #27: 1001it [00:03, 282.80it/s, env_step=27000, gradient_step=2700, len=270, n/ep=1, n/st=100, rew=12875.00]                                                                                


Epoch #27: test_reward: 10139.200000 ± 2009.339633, best_reward: 15762.000000 ± 1878.057134 in #14


Epoch #28: 1001it [00:03, 301.61it/s, env_step=28000, gradient_step=2800, len=274, n/ep=0, n/st=100, rew=11912.00]                                                                                


Epoch #28: test_reward: 13020.300000 ± 3342.167801, best_reward: 15762.000000 ± 1878.057134 in #14


Epoch #29: 1001it [00:03, 305.96it/s, env_step=29000, gradient_step=2900, len=43, n/ep=0, n/st=100, rew=1055.00]                                                                                  


Epoch #29: test_reward: 12056.200000 ± 4214.228038, best_reward: 15762.000000 ± 1878.057134 in #14


Epoch #30: 1001it [00:03, 292.71it/s, env_step=30000, gradient_step=3000, len=149, n/ep=0, n/st=100, rew=6867.00]                                                                                 


Epoch #30: test_reward: 8287.000000 ± 2572.318371, best_reward: 15762.000000 ± 1878.057134 in #14


Epoch #31: 1001it [00:03, 301.88it/s, env_step=31000, gradient_step=3100, len=185, n/ep=3, n/st=100, rew=8328.00]                                                                                 


Epoch #31: test_reward: 9189.600000 ± 3895.124111, best_reward: 15762.000000 ± 1878.057134 in #14


Epoch #32: 1001it [00:03, 291.20it/s, env_step=32000, gradient_step=3200, len=316, n/ep=0, n/st=100, rew=12724.00]                                                                                


Epoch #32: test_reward: 8893.500000 ± 3338.536363, best_reward: 15762.000000 ± 1878.057134 in #14


Epoch #33: 1001it [00:03, 312.15it/s, env_step=33000, gradient_step=3300, len=330, n/ep=2, n/st=100, rew=14896.50]                                                                                


Epoch #33: test_reward: 9021.400000 ± 3379.459874, best_reward: 15762.000000 ± 1878.057134 in #14


Epoch #34: 1001it [00:03, 302.98it/s, env_step=34000, gradient_step=3400, len=155, n/ep=0, n/st=100, rew=8979.50]                                                                                 


Epoch #34: test_reward: 14171.000000 ± 5204.226667, best_reward: 15762.000000 ± 1878.057134 in #14


Epoch #35: 1001it [00:03, 290.26it/s, env_step=35000, gradient_step=3500, len=350, n/ep=1, n/st=100, rew=15832.50]                                                                                


Epoch #35: test_reward: 5918.800000 ± 1583.288780, best_reward: 15762.000000 ± 1878.057134 in #14


Epoch #36: 1001it [00:03, 294.26it/s, env_step=36000, gradient_step=3600, len=330, n/ep=4, n/st=100, rew=16272.00]                                                                                


Epoch #36: test_reward: 11243.400000 ± 2635.534413, best_reward: 15762.000000 ± 1878.057134 in #14


Epoch #37: 1001it [00:03, 273.43it/s, env_step=37000, gradient_step=3700, len=279, n/ep=2, n/st=100, rew=13048.00]                                                                                


Epoch #37: test_reward: 10673.800000 ± 3781.809033, best_reward: 15762.000000 ± 1878.057134 in #14


Epoch #38: 1001it [00:03, 298.10it/s, env_step=38000, gradient_step=3800, len=378, n/ep=0, n/st=100, rew=19201.00]                                                                                


Epoch #38: test_reward: 17693.600000 ± 7069.475330, best_reward: 17693.600000 ± 7069.475330 in #38


Epoch #39: 1001it [00:03, 296.71it/s, env_step=39000, gradient_step=3900, len=210, n/ep=1, n/st=100, rew=10707.50]                                                                                


Epoch #39: test_reward: 8513.500000 ± 5502.308811, best_reward: 17693.600000 ± 7069.475330 in #38


Epoch #40: 1001it [00:03, 296.88it/s, env_step=40000, gradient_step=4000, len=371, n/ep=7, n/st=100, rew=16340.29]                                                                                


Epoch #40: test_reward: 12891.000000 ± 4922.870565, best_reward: 17693.600000 ± 7069.475330 in #38


Epoch #41: 1001it [00:03, 293.86it/s, env_step=41000, gradient_step=4100, len=228, n/ep=1, n/st=100, rew=12499.00]                                                                                


Epoch #41: test_reward: 14628.100000 ± 5080.887254, best_reward: 17693.600000 ± 7069.475330 in #38


Epoch #42: 1001it [00:03, 290.34it/s, env_step=42000, gradient_step=4200, len=89, n/ep=0, n/st=100, rew=2663.00]                                                                                  


Epoch #42: test_reward: 11885.800000 ± 5054.266431, best_reward: 17693.600000 ± 7069.475330 in #38


Epoch #43: 1001it [00:03, 264.91it/s, env_step=43000, gradient_step=4300, len=116, n/ep=2, n/st=100, rew=5495.00]                                                                                 


Epoch #43: test_reward: 9206.600000 ± 3253.878922, best_reward: 17693.600000 ± 7069.475330 in #38


Epoch #44: 1001it [00:03, 288.64it/s, env_step=44000, gradient_step=4400, len=336, n/ep=0, n/st=100, rew=18131.00]                                                                                


Epoch #44: test_reward: 10060.700000 ± 5194.606050, best_reward: 17693.600000 ± 7069.475330 in #38


Epoch #45: 1001it [00:03, 281.54it/s, env_step=45000, gradient_step=4500, len=163, n/ep=0, n/st=100, rew=8079.50]                                                                                 


Epoch #45: test_reward: 10408.000000 ± 3577.613171, best_reward: 17693.600000 ± 7069.475330 in #38


Epoch #46: 1001it [00:03, 279.07it/s, env_step=46000, gradient_step=4600, len=163, n/ep=0, n/st=100, rew=8079.50]                                                                                 


Epoch #46: test_reward: 16421.500000 ± 5706.007593, best_reward: 17693.600000 ± 7069.475330 in #38


Epoch #47: 1001it [00:03, 278.06it/s, env_step=47000, gradient_step=4700, len=229, n/ep=0, n/st=100, rew=12102.00]                                                                                


Epoch #47: test_reward: 13435.200000 ± 3894.641159, best_reward: 17693.600000 ± 7069.475330 in #38


Epoch #48: 1001it [00:04, 235.64it/s, env_step=48000, gradient_step=4800, len=138, n/ep=0, n/st=100, rew=7364.00]                                                                                 


Epoch #48: test_reward: 14080.800000 ± 3777.757557, best_reward: 17693.600000 ± 7069.475330 in #38


Epoch #49: 1001it [00:03, 251.25it/s, env_step=49000, gradient_step=4900, len=169, n/ep=3, n/st=100, rew=8449.17]                                                                                 


Epoch #49: test_reward: 18640.300000 ± 4496.524170, best_reward: 18640.300000 ± 4496.524170 in #49


Epoch #50: 1001it [00:04, 243.14it/s, env_step=50000, gradient_step=5000, len=256, n/ep=0, n/st=100, rew=15389.00]                                                                                


Epoch #50: test_reward: 12388.800000 ± 3633.586790, best_reward: 18640.300000 ± 4496.524170 in #49


Epoch #51: 1001it [00:03, 266.48it/s, env_step=51000, gradient_step=5100, len=208, n/ep=0, n/st=100, rew=12111.00]                                                                                


Epoch #51: test_reward: 9692.200000 ± 2462.688685, best_reward: 18640.300000 ± 4496.524170 in #49


Epoch #52: 1001it [00:03, 292.45it/s, env_step=52000, gradient_step=5200, len=226, n/ep=3, n/st=100, rew=11877.17]                                                                                


Epoch #52: test_reward: 11410.400000 ± 4885.970839, best_reward: 18640.300000 ± 4496.524170 in #49


Epoch #53: 1001it [00:03, 262.94it/s, env_step=53000, gradient_step=5300, len=196, n/ep=0, n/st=100, rew=10339.25]                                                                                


Epoch #53: test_reward: 11284.800000 ± 5200.049611, best_reward: 18640.300000 ± 4496.524170 in #49


Epoch #54: 1001it [00:03, 276.69it/s, env_step=54000, gradient_step=5400, len=163, n/ep=2, n/st=100, rew=7845.25]                                                                                 


Epoch #54: test_reward: 15301.300000 ± 5993.895345, best_reward: 18640.300000 ± 4496.524170 in #49


Epoch #55: 1001it [00:03, 275.72it/s, env_step=55000, gradient_step=5500, len=43, n/ep=1, n/st=100, rew=1527.00]                                                                                  


Epoch #55: test_reward: 15600.900000 ± 3048.782132, best_reward: 18640.300000 ± 4496.524170 in #49


Epoch #56: 1001it [00:03, 295.46it/s, env_step=56000, gradient_step=5600, len=207, n/ep=0, n/st=100, rew=11297.00]                                                                                


Epoch #56: test_reward: 11960.600000 ± 5232.414743, best_reward: 18640.300000 ± 4496.524170 in #49


Epoch #57: 1001it [00:03, 267.79it/s, env_step=57000, gradient_step=5700, len=179, n/ep=0, n/st=100, rew=9399.50]                                                                                 


Epoch #57: test_reward: 17349.800000 ± 4581.376885, best_reward: 18640.300000 ± 4496.524170 in #49


Epoch #58: 1001it [00:03, 280.95it/s, env_step=58000, gradient_step=5800, len=145, n/ep=1, n/st=100, rew=6860.50]                                                                                 


Epoch #58: test_reward: 15751.100000 ± 5393.461717, best_reward: 18640.300000 ± 4496.524170 in #49


Epoch #59: 1001it [00:03, 278.07it/s, env_step=59000, gradient_step=5900, len=187, n/ep=0, n/st=100, rew=9834.50]                                                                                 


Epoch #59: test_reward: 10617.500000 ± 3220.362441, best_reward: 18640.300000 ± 4496.524170 in #49


Epoch #60: 1001it [00:03, 292.17it/s, env_step=60000, gradient_step=6000, len=200, n/ep=1, n/st=100, rew=10774.00]                                                                                


Epoch #60: test_reward: 11031.500000 ± 5752.657947, best_reward: 18640.300000 ± 4496.524170 in #49


Epoch #61: 1001it [00:03, 279.45it/s, env_step=61000, gradient_step=6100, len=400, n/ep=0, n/st=100, rew=20872.50]                                                                                


Epoch #61: test_reward: 12415.400000 ± 3336.227846, best_reward: 18640.300000 ± 4496.524170 in #49


Epoch #62: 1001it [00:03, 282.64it/s, env_step=62000, gradient_step=6200, len=118, n/ep=0, n/st=100, rew=6010.00]                                                                                 


Epoch #62: test_reward: 17032.000000 ± 5578.143562, best_reward: 18640.300000 ± 4496.524170 in #49


Epoch #63: 1001it [00:03, 264.75it/s, env_step=63000, gradient_step=6300, len=200, n/ep=2, n/st=100, rew=9970.75]                                                                                 


Epoch #63: test_reward: 13863.700000 ± 5448.111490, best_reward: 18640.300000 ± 4496.524170 in #49


Epoch #64: 1001it [00:03, 270.73it/s, env_step=64000, gradient_step=6400, len=106, n/ep=0, n/st=100, rew=5532.00]                                                                                 


Epoch #64: test_reward: 14487.200000 ± 5462.475058, best_reward: 18640.300000 ± 4496.524170 in #49


Epoch #65: 1001it [00:03, 278.17it/s, env_step=65000, gradient_step=6500, len=226, n/ep=0, n/st=100, rew=11877.00]                                                                                


Epoch #65: test_reward: 14173.400000 ± 6501.825470, best_reward: 18640.300000 ± 4496.524170 in #49


Epoch #66: 1001it [00:03, 283.76it/s, env_step=66000, gradient_step=6600, len=260, n/ep=0, n/st=100, rew=14499.25]                                                                                


Epoch #66: test_reward: 13429.700000 ± 5699.116424, best_reward: 18640.300000 ± 4496.524170 in #49


Epoch #67: 1001it [00:03, 264.88it/s, env_step=67000, gradient_step=6700, len=146, n/ep=0, n/st=100, rew=7509.00]                                                                                 


Epoch #67: test_reward: 13150.100000 ± 6514.933161, best_reward: 18640.300000 ± 4496.524170 in #49


Epoch #68: 1001it [00:03, 257.31it/s, env_step=68000, gradient_step=6800, len=260, n/ep=0, n/st=100, rew=13415.00]                                                                                


Epoch #68: test_reward: 17889.700000 ± 6433.063486, best_reward: 18640.300000 ± 4496.524170 in #49


Epoch #69: 1001it [00:03, 265.91it/s, env_step=69000, gradient_step=6900, len=136, n/ep=0, n/st=100, rew=7263.50]                                                                                 


Epoch #69: test_reward: 11442.600000 ± 7534.769966, best_reward: 18640.300000 ± 4496.524170 in #49


Epoch #70: 1001it [00:03, 268.29it/s, env_step=70000, gradient_step=7000, len=239, n/ep=3, n/st=100, rew=12632.00]                                                                                


Epoch #70: test_reward: 11020.400000 ± 5783.640483, best_reward: 18640.300000 ± 4496.524170 in #49


Epoch #71: 1001it [00:03, 267.24it/s, env_step=71000, gradient_step=7100, len=372, n/ep=0, n/st=100, rew=21939.00]                                                                                


Epoch #71: test_reward: 17094.700000 ± 6823.640935, best_reward: 18640.300000 ± 4496.524170 in #49


Epoch #72: 1001it [00:03, 257.83it/s, env_step=72000, gradient_step=7200, len=256, n/ep=0, n/st=100, rew=15581.50]                                                                                


Epoch #72: test_reward: 15970.200000 ± 5130.014947, best_reward: 18640.300000 ± 4496.524170 in #49


Epoch #73: 1001it [00:03, 252.68it/s, env_step=73000, gradient_step=7300, len=170, n/ep=0, n/st=100, rew=10216.50]                                                                                


Epoch #73: test_reward: 11134.800000 ± 4618.256940, best_reward: 18640.300000 ± 4496.524170 in #49


Epoch #74: 1001it [00:03, 276.35it/s, env_step=74000, gradient_step=7400, len=147, n/ep=0, n/st=100, rew=6746.00]                                                                                 


Epoch #74: test_reward: 11210.900000 ± 6212.638899, best_reward: 18640.300000 ± 4496.524170 in #49


Epoch #75: 1001it [00:03, 297.72it/s, env_step=75000, gradient_step=7500, len=171, n/ep=1, n/st=100, rew=9624.00]                                                                                 


Epoch #75: test_reward: 12395.600000 ± 4878.336278, best_reward: 18640.300000 ± 4496.524170 in #49


Epoch #76: 1001it [00:04, 241.48it/s, env_step=76000, gradient_step=7600, len=241, n/ep=0, n/st=100, rew=13056.50]                                                                                


Epoch #76: test_reward: 5095.700000 ± 1699.955120, best_reward: 18640.300000 ± 4496.524170 in #49


Epoch #77: 1001it [00:03, 272.36it/s, env_step=77000, gradient_step=7700, len=273, n/ep=0, n/st=100, rew=16174.50]                                                                                


Epoch #77: test_reward: 14388.500000 ± 4074.443158, best_reward: 18640.300000 ± 4496.524170 in #49


Epoch #78: 1001it [00:03, 260.56it/s, env_step=78000, gradient_step=7800, len=294, n/ep=2, n/st=100, rew=17877.75]                                                                                


Epoch #78: test_reward: 13971.900000 ± 5650.418010, best_reward: 18640.300000 ± 4496.524170 in #49


Epoch #79: 1001it [00:03, 261.00it/s, env_step=79000, gradient_step=7900, len=156, n/ep=0, n/st=100, rew=8724.00]                                                                                 


Epoch #79: test_reward: 13045.500000 ± 5146.156551, best_reward: 18640.300000 ± 4496.524170 in #49


Epoch #80: 1001it [00:03, 283.60it/s, env_step=80000, gradient_step=8000, len=316, n/ep=0, n/st=100, rew=18695.50]                                                                                


Epoch #80: test_reward: 11933.000000 ± 3546.825764, best_reward: 18640.300000 ± 4496.524170 in #49


Epoch #81: 1001it [00:03, 264.41it/s, env_step=81000, gradient_step=8100, len=78, n/ep=1, n/st=100, rew=3422.00]                                                                                  


Epoch #81: test_reward: 20077.000000 ± 4175.073700, best_reward: 20077.000000 ± 4175.073700 in #81


Epoch #82: 1001it [00:03, 285.44it/s, env_step=82000, gradient_step=8200, len=162, n/ep=0, n/st=100, rew=9393.50]                                                                                 


Epoch #82: test_reward: 12349.300000 ± 6310.634834, best_reward: 20077.000000 ± 4175.073700 in #81


Epoch #83: 1001it [00:03, 284.94it/s, env_step=83000, gradient_step=8300, len=290, n/ep=0, n/st=100, rew=16416.00]                                                                                


Epoch #83: test_reward: 16667.700000 ± 6741.224014, best_reward: 20077.000000 ± 4175.073700 in #81


Epoch #84: 1001it [00:03, 278.01it/s, env_step=84000, gradient_step=8400, len=197, n/ep=2, n/st=100, rew=11319.50]                                                                                


Epoch #84: test_reward: 11859.700000 ± 5778.540232, best_reward: 20077.000000 ± 4175.073700 in #81


Epoch #85: 1001it [00:03, 294.75it/s, env_step=85000, gradient_step=8500, len=196, n/ep=0, n/st=100, rew=11977.00]                                                                                


Epoch #85: test_reward: 14389.300000 ± 6295.151405, best_reward: 20077.000000 ± 4175.073700 in #81


Epoch #86: 1001it [00:03, 272.87it/s, env_step=86000, gradient_step=8600, len=92, n/ep=1, n/st=100, rew=4643.00]                                                                                  


Epoch #86: test_reward: 17909.700000 ± 7216.497655, best_reward: 20077.000000 ± 4175.073700 in #81


Epoch #87: 1001it [00:03, 280.17it/s, env_step=87000, gradient_step=8700, len=112, n/ep=0, n/st=100, rew=3962.00]                                                                                 


Epoch #87: test_reward: 9235.800000 ± 4489.617039, best_reward: 20077.000000 ± 4175.073700 in #81


Epoch #88: 1001it [00:03, 291.83it/s, env_step=88000, gradient_step=8800, len=231, n/ep=2, n/st=100, rew=14202.50]                                                                                


Epoch #88: test_reward: 13549.700000 ± 4200.915163, best_reward: 20077.000000 ± 4175.073700 in #81


Epoch #89: 1001it [00:03, 286.42it/s, env_step=89000, gradient_step=8900, len=150, n/ep=0, n/st=100, rew=8150.00]                                                                                 


Epoch #89: test_reward: 13884.100000 ± 6267.189122, best_reward: 20077.000000 ± 4175.073700 in #81


Epoch #90: 1001it [00:03, 282.95it/s, env_step=90000, gradient_step=9000, len=118, n/ep=1, n/st=100, rew=6192.00]                                                                                 


Epoch #90: test_reward: 14089.500000 ± 5748.577950, best_reward: 20077.000000 ± 4175.073700 in #81


Epoch #91: 1001it [00:03, 273.69it/s, env_step=91000, gradient_step=9100, len=246, n/ep=0, n/st=100, rew=14942.00]                                                                                


Epoch #91: test_reward: 13863.500000 ± 7361.861738, best_reward: 20077.000000 ± 4175.073700 in #81


Epoch #92: 1001it [00:03, 268.42it/s, env_step=92000, gradient_step=9200, len=162, n/ep=1, n/st=100, rew=9022.00]                                                                                 


Epoch #92: test_reward: 9775.400000 ± 4027.812885, best_reward: 20077.000000 ± 4175.073700 in #81


Epoch #93: 1001it [00:03, 283.96it/s, env_step=93000, gradient_step=9300, len=388, n/ep=1, n/st=100, rew=25019.00]                                                                                


Epoch #93: test_reward: 17961.100000 ± 6467.149302, best_reward: 20077.000000 ± 4175.073700 in #81


Epoch #94: 1001it [00:03, 263.63it/s, env_step=94000, gradient_step=9400, len=400, n/ep=1, n/st=100, rew=26096.00]                                                                                


Epoch #94: test_reward: 11794.800000 ± 4119.467582, best_reward: 20077.000000 ± 4175.073700 in #81


Epoch #95: 1001it [00:03, 263.29it/s, env_step=95000, gradient_step=9500, len=229, n/ep=1, n/st=100, rew=14158.00]                                                                                


Epoch #95: test_reward: 11351.400000 ± 2206.005902, best_reward: 20077.000000 ± 4175.073700 in #81


Epoch #96: 1001it [00:03, 279.98it/s, env_step=96000, gradient_step=9600, len=203, n/ep=1, n/st=100, rew=11844.50]                                                                                


Epoch #96: test_reward: 15941.800000 ± 5948.523259, best_reward: 20077.000000 ± 4175.073700 in #81


Epoch #97: 1001it [00:03, 272.34it/s, env_step=97000, gradient_step=9700, len=192, n/ep=1, n/st=100, rew=11891.00]                                                                                


Epoch #97: test_reward: 14839.200000 ± 3739.513813, best_reward: 20077.000000 ± 4175.073700 in #81


Epoch #98: 1001it [00:04, 241.24it/s, env_step=98000, gradient_step=9800, len=130, n/ep=0, n/st=100, rew=7269.00]                                                                                 


Epoch #98: test_reward: 6109.800000 ± 3496.612469, best_reward: 20077.000000 ± 4175.073700 in #81


Epoch #99: 1001it [00:03, 284.03it/s, env_step=99000, gradient_step=9900, len=215, n/ep=0, n/st=100, rew=12316.00]                                                                                


Epoch #99: test_reward: 15296.000000 ± 4013.523938, best_reward: 20077.000000 ± 4175.073700 in #81


Epoch #100: 1001it [00:03, 250.80it/s, env_step=100000, gradient_step=10000, len=226, n/ep=0, n/st=100, rew=12983.00]                                                                             


Epoch #100: test_reward: 11503.600000 ± 5460.917655, best_reward: 20077.000000 ± 4175.073700 in #81


Epoch #101: 1001it [00:04, 241.17it/s, env_step=101000, gradient_step=10100, len=241, n/ep=1, n/st=100, rew=15671.00]                                                                             


Epoch #101: test_reward: 10781.100000 ± 4313.284397, best_reward: 20077.000000 ± 4175.073700 in #81


Epoch #102: 1001it [00:03, 290.68it/s, env_step=102000, gradient_step=10200, len=277, n/ep=1, n/st=100, rew=17203.00]                                                                             


Epoch #102: test_reward: 12191.800000 ± 4967.747796, best_reward: 20077.000000 ± 4175.073700 in #81


Epoch #103: 1001it [00:03, 263.31it/s, env_step=103000, gradient_step=10300, len=146, n/ep=0, n/st=100, rew=7886.00]                                                                              


Epoch #103: test_reward: 13000.700000 ± 5167.522386, best_reward: 20077.000000 ± 4175.073700 in #81


Epoch #104: 1001it [00:03, 262.52it/s, env_step=104000, gradient_step=10400, len=145, n/ep=0, n/st=100, rew=8364.50]                                                                              


Epoch #104: test_reward: 12861.200000 ± 6852.255115, best_reward: 20077.000000 ± 4175.073700 in #81


Epoch #105: 1001it [00:03, 280.63it/s, env_step=105000, gradient_step=10500, len=148, n/ep=0, n/st=100, rew=8033.00]                                                                              


Epoch #105: test_reward: 15355.500000 ± 7445.035087, best_reward: 20077.000000 ± 4175.073700 in #81


Epoch #106: 1001it [00:03, 269.48it/s, env_step=106000, gradient_step=10600, len=165, n/ep=2, n/st=100, rew=10229.50]                                                                             


Epoch #106: test_reward: 12131.700000 ± 4908.844896, best_reward: 20077.000000 ± 4175.073700 in #81


Epoch #107: 1001it [00:04, 236.59it/s, env_step=107000, gradient_step=10700, len=148, n/ep=0, n/st=100, rew=9151.00]                                                                              


Epoch #107: test_reward: 10473.900000 ± 3128.797611, best_reward: 20077.000000 ± 4175.073700 in #81


Epoch #108: 1001it [00:03, 251.44it/s, env_step=108000, gradient_step=10800, len=122, n/ep=0, n/st=100, rew=6994.00]                                                                              


Epoch #108: test_reward: 15555.200000 ± 3257.059404, best_reward: 20077.000000 ± 4175.073700 in #81


Epoch #109: 1001it [00:03, 265.38it/s, env_step=109000, gradient_step=10900, len=112, n/ep=1, n/st=100, rew=6095.00]                                                                              


Epoch #109: test_reward: 9374.400000 ± 3770.027140, best_reward: 20077.000000 ± 4175.073700 in #81


Epoch #110: 1001it [00:03, 272.76it/s, env_step=110000, gradient_step=11000, len=161, n/ep=0, n/st=100, rew=9791.25]                                                                              


Epoch #110: test_reward: 14676.500000 ± 7283.197969, best_reward: 20077.000000 ± 4175.073700 in #81


Epoch #111: 1001it [00:03, 255.20it/s, env_step=111000, gradient_step=11100, len=120, n/ep=0, n/st=100, rew=6970.50]                                                                              


Epoch #111: test_reward: 12583.900000 ± 7179.016429, best_reward: 20077.000000 ± 4175.073700 in #81


Epoch #112: 1001it [00:03, 266.46it/s, env_step=112000, gradient_step=11200, len=261, n/ep=1, n/st=100, rew=16750.00]                                                                             


Epoch #112: test_reward: 14964.300000 ± 7561.380377, best_reward: 20077.000000 ± 4175.073700 in #81


Epoch #113: 1001it [00:03, 281.16it/s, env_step=113000, gradient_step=11300, len=264, n/ep=0, n/st=100, rew=17484.00]                                                                             


Epoch #113: test_reward: 11413.300000 ± 1842.750230, best_reward: 20077.000000 ± 4175.073700 in #81


Epoch #114: 1001it [00:04, 245.85it/s, env_step=114000, gradient_step=11400, len=219, n/ep=1, n/st=100, rew=11354.50]                                                                             


Epoch #114: test_reward: 13091.800000 ± 4225.303345, best_reward: 20077.000000 ± 4175.073700 in #81


Epoch #115: 1001it [00:03, 262.19it/s, env_step=115000, gradient_step=11500, len=166, n/ep=0, n/st=100, rew=8899.00]                                                                              


Epoch #115: test_reward: 18138.400000 ± 6645.328091, best_reward: 20077.000000 ± 4175.073700 in #81


Epoch #116: 1001it [00:03, 268.99it/s, env_step=116000, gradient_step=11600, len=121, n/ep=0, n/st=100, rew=6101.50]                                                                              


Epoch #116: test_reward: 13041.600000 ± 6798.746284, best_reward: 20077.000000 ± 4175.073700 in #81


Epoch #117: 1001it [00:04, 233.84it/s, env_step=117000, gradient_step=11700, len=226, n/ep=0, n/st=100, rew=14595.00]                                                                             


Epoch #117: test_reward: 11990.000000 ± 3939.551345, best_reward: 20077.000000 ± 4175.073700 in #81


Epoch #118: 1001it [00:03, 281.21it/s, env_step=118000, gradient_step=11800, len=250, n/ep=0, n/st=100, rew=16141.00]                                                                             


Epoch #118: test_reward: 12534.000000 ± 2626.936581, best_reward: 20077.000000 ± 4175.073700 in #81


Epoch #119: 1001it [00:03, 255.01it/s, env_step=119000, gradient_step=11900, len=94, n/ep=0, n/st=100, rew=4563.00]                                                                               


Epoch #119: test_reward: 12766.600000 ± 2870.008474, best_reward: 20077.000000 ± 4175.073700 in #81


Epoch #120: 1001it [00:03, 267.54it/s, env_step=120000, gradient_step=12000, len=225, n/ep=0, n/st=100, rew=14133.50]                                                                             


Epoch #120: test_reward: 11773.900000 ± 5920.149820, best_reward: 20077.000000 ± 4175.073700 in #81


Epoch #121: 1001it [00:03, 253.33it/s, env_step=121000, gradient_step=12100, len=96, n/ep=1, n/st=100, rew=3968.00]                                                                               


Epoch #121: test_reward: 13838.200000 ± 4540.954301, best_reward: 20077.000000 ± 4175.073700 in #81


Epoch #122: 1001it [00:03, 281.98it/s, env_step=122000, gradient_step=12200, len=168, n/ep=2, n/st=100, rew=9262.00]                                                                              


Epoch #122: test_reward: 12060.000000 ± 2751.743593, best_reward: 20077.000000 ± 4175.073700 in #81


Epoch #123: 1001it [00:03, 266.12it/s, env_step=123000, gradient_step=12300, len=149, n/ep=0, n/st=100, rew=8071.00]                                                                              


Epoch #123: test_reward: 12240.200000 ± 2822.163489, best_reward: 20077.000000 ± 4175.073700 in #81


Epoch #124: 1001it [00:03, 275.73it/s, env_step=124000, gradient_step=12400, len=192, n/ep=0, n/st=100, rew=12193.00]                                                                             


Epoch #124: test_reward: 10698.100000 ± 1310.079192, best_reward: 20077.000000 ± 4175.073700 in #81


Epoch #125: 1001it [00:04, 241.49it/s, env_step=125000, gradient_step=12500, len=248, n/ep=0, n/st=100, rew=15555.00]                                                                             


Epoch #125: test_reward: 10704.300000 ± 2654.264570, best_reward: 20077.000000 ± 4175.073700 in #81


Epoch #126: 1001it [00:03, 270.05it/s, env_step=126000, gradient_step=12600, len=111, n/ep=2, n/st=100, rew=6174.25]                                                                              


Epoch #126: test_reward: 14130.800000 ± 3619.387235, best_reward: 20077.000000 ± 4175.073700 in #81


Epoch #127: 1001it [00:03, 259.89it/s, env_step=127000, gradient_step=12700, len=171, n/ep=1, n/st=100, rew=10570.50]                                                                             


Epoch #127: test_reward: 10279.400000 ± 3780.010905, best_reward: 20077.000000 ± 4175.073700 in #81


Epoch #128: 1001it [00:03, 272.30it/s, env_step=128000, gradient_step=12800, len=139, n/ep=2, n/st=100, rew=8120.75]                                                                              


Epoch #128: test_reward: 13462.000000 ± 4791.827397, best_reward: 20077.000000 ± 4175.073700 in #81


Epoch #129: 1001it [00:04, 248.30it/s, env_step=129000, gradient_step=12900, len=167, n/ep=0, n/st=100, rew=9514.25]                                                                              


Epoch #129: test_reward: 12781.700000 ± 4046.183907, best_reward: 20077.000000 ± 4175.073700 in #81


Epoch #130: 1001it [00:03, 290.83it/s, env_step=130000, gradient_step=13000, len=142, n/ep=1, n/st=100, rew=7987.00]                                                                              


Epoch #130: test_reward: 14575.500000 ± 6950.329751, best_reward: 20077.000000 ± 4175.073700 in #81


Epoch #131: 1001it [00:03, 271.99it/s, env_step=131000, gradient_step=13100, len=179, n/ep=0, n/st=100, rew=11022.50]                                                                             


Epoch #131: test_reward: 11595.200000 ± 5384.399034, best_reward: 20077.000000 ± 4175.073700 in #81


Epoch #132: 1001it [00:03, 285.75it/s, env_step=132000, gradient_step=13200, len=359, n/ep=1, n/st=100, rew=24095.00]                                                                             


Epoch #132: test_reward: 16924.400000 ± 6368.508384, best_reward: 20077.000000 ± 4175.073700 in #81


Epoch #133: 1001it [00:03, 257.33it/s, env_step=133000, gradient_step=13300, len=201, n/ep=0, n/st=100, rew=12123.00]                                                                             


Epoch #133: test_reward: 11289.600000 ± 3351.778966, best_reward: 20077.000000 ± 4175.073700 in #81


Epoch #134: 1001it [00:03, 257.90it/s, env_step=134000, gradient_step=13400, len=146, n/ep=0, n/st=100, rew=9014.00]                                                                              


Epoch #134: test_reward: 6415.100000 ± 1286.005245, best_reward: 20077.000000 ± 4175.073700 in #81


Epoch #135: 1001it [00:03, 266.85it/s, env_step=135000, gradient_step=13500, len=223, n/ep=0, n/st=100, rew=14621.50]                                                                             


Epoch #135: test_reward: 12841.200000 ± 5385.603268, best_reward: 20077.000000 ± 4175.073700 in #81


Epoch #136: 1001it [00:03, 264.91it/s, env_step=136000, gradient_step=13600, len=287, n/ep=2, n/st=100, rew=18367.00]                                                                             


Epoch #136: test_reward: 17368.800000 ± 3138.354626, best_reward: 20077.000000 ± 4175.073700 in #81


Epoch #137: 1001it [00:04, 235.27it/s, env_step=137000, gradient_step=13700, len=138, n/ep=2, n/st=100, rew=7752.75]                                                                              


Epoch #137: test_reward: 11112.800000 ± 5348.916989, best_reward: 20077.000000 ± 4175.073700 in #81


Epoch #138: 1001it [00:03, 273.17it/s, env_step=138000, gradient_step=13800, len=138, n/ep=1, n/st=100, rew=7141.00]                                                                              


Epoch #138: test_reward: 11860.700000 ± 3851.872221, best_reward: 20077.000000 ± 4175.073700 in #81


Epoch #139: 1001it [00:03, 251.15it/s, env_step=139000, gradient_step=13900, len=269, n/ep=0, n/st=100, rew=16890.00]                                                                             


Epoch #139: test_reward: 12640.000000 ± 4324.122107, best_reward: 20077.000000 ± 4175.073700 in #81


Epoch #140: 1001it [00:04, 235.20it/s, env_step=140000, gradient_step=14000, len=139, n/ep=0, n/st=100, rew=7740.00]                                                                              


Epoch #140: test_reward: 10303.600000 ± 4136.201330, best_reward: 20077.000000 ± 4175.073700 in #81


Epoch #141: 1001it [00:04, 250.20it/s, env_step=141000, gradient_step=14100, len=205, n/ep=1, n/st=100, rew=12799.00]                                                                             


Epoch #141: test_reward: 10611.100000 ± 3365.480365, best_reward: 20077.000000 ± 4175.073700 in #81


Epoch #142: 1001it [00:03, 255.57it/s, env_step=142000, gradient_step=14200, len=161, n/ep=0, n/st=100, rew=9796.00]                                                                              


Epoch #142: test_reward: 11296.000000 ± 4800.879919, best_reward: 20077.000000 ± 4175.073700 in #81


Epoch #143: 1001it [00:04, 249.35it/s, env_step=143000, gradient_step=14300, len=104, n/ep=0, n/st=100, rew=5283.00]                                                                              


Epoch #143: test_reward: 11017.100000 ± 3398.817602, best_reward: 20077.000000 ± 4175.073700 in #81


Epoch #144: 1001it [00:03, 274.91it/s, env_step=144000, gradient_step=14400, len=353, n/ep=0, n/st=100, rew=22122.50]                                                                             


Epoch #144: test_reward: 15339.100000 ± 5531.481112, best_reward: 20077.000000 ± 4175.073700 in #81


Epoch #145: 1001it [00:03, 258.42it/s, env_step=145000, gradient_step=14500, len=293, n/ep=0, n/st=100, rew=18984.00]                                                                             


Epoch #145: test_reward: 15210.400000 ± 5093.945646, best_reward: 20077.000000 ± 4175.073700 in #81


Epoch #146: 1001it [00:03, 271.42it/s, env_step=146000, gradient_step=14600, len=192, n/ep=1, n/st=100, rew=12068.00]                                                                             


Epoch #146: test_reward: 12981.000000 ± 4918.426781, best_reward: 20077.000000 ± 4175.073700 in #81


Epoch #147: 1001it [00:03, 274.78it/s, env_step=147000, gradient_step=14700, len=190, n/ep=0, n/st=100, rew=11421.00]                                                                             


Epoch #147: test_reward: 9128.800000 ± 3058.968610, best_reward: 20077.000000 ± 4175.073700 in #81


Epoch #148: 1001it [00:04, 240.06it/s, env_step=148000, gradient_step=14800, len=172, n/ep=1, n/st=100, rew=11211.00]                                                                             


Epoch #148: test_reward: 12163.800000 ± 3408.798111, best_reward: 20077.000000 ± 4175.073700 in #81


Epoch #149: 1001it [00:04, 236.92it/s, env_step=149000, gradient_step=14900, len=123, n/ep=0, n/st=100, rew=7104.75]                                                                              


Epoch #149: test_reward: 12901.600000 ± 3483.783782, best_reward: 20077.000000 ± 4175.073700 in #81


Epoch #150: 1001it [00:03, 262.40it/s, env_step=150000, gradient_step=15000, len=177, n/ep=1, n/st=100, rew=11256.50]                                                                             


Epoch #150: test_reward: 13457.600000 ± 3946.978799, best_reward: 20077.000000 ± 4175.073700 in #81

InfoStats(gradient_step=15000, best_reward=20077.0, best_reward_std=4175.0736999483015, train_step=150000, train_episode=700, test_step=355588, test_episode=1510, timing=TimingStats(total_time=732.0679347515106, train_time=545.0654892921448, train_time_collect=52.93186092376709, train_time_update=486.1766080856323, test_time=187.00244545936584, update_speed=275.1962891556373))

(the trained policy can be accessed via policy.policies[agents[1]])
Train with agent_opponent #0


Epoch #1: 1001it [00:01, 585.30it/s, env_step=1000, gradient_step=100, len=0, n/ep=0, n/st=100, rew=0.00]                                                                                         


Epoch #1: test_reward: 12595.600000 ± 4785.515462, best_reward: 12595.600000 ± 4785.515462 in #1


Epoch #2: 1001it [00:01, 663.58it/s, env_step=2000, gradient_step=200, len=0, n/ep=0, n/st=100, rew=0.00]                                                                                         


Epoch #2: test_reward: 11961.500000 ± 4791.272759, best_reward: 12595.600000 ± 4785.515462 in #1


Epoch #3: 1001it [00:01, 590.64it/s, env_step=3000, gradient_step=300, len=0, n/ep=0, n/st=100, rew=0.00]                                                                                         


Epoch #3: test_reward: 10656.700000 ± 2774.111463, best_reward: 12595.600000 ± 4785.515462 in #1


Epoch #4: 1001it [00:01, 708.53it/s, env_step=4000, gradient_step=400, len=0, n/ep=0, n/st=100, rew=0.00]                                                                                         


Epoch #4: test_reward: 11269.100000 ± 3631.797088, best_reward: 12595.600000 ± 4785.515462 in #1


Epoch #5: 1001it [00:01, 740.37it/s, env_step=5000, gradient_step=500, len=0, n/ep=0, n/st=100, rew=0.00]                                                                                         


Epoch #5: test_reward: 12645.700000 ± 2636.476438, best_reward: 12645.700000 ± 2636.476438 in #5


Epoch #6: 1001it [00:01, 798.16it/s, env_step=6000, gradient_step=600, len=0, n/ep=0, n/st=100, rew=0.00]                                                                                         


Epoch #6: test_reward: 12877.000000 ± 2757.653677, best_reward: 12877.000000 ± 2757.653677 in #6


Epoch #7: 1001it [00:01, 767.51it/s, env_step=7000, gradient_step=700, len=64, n/ep=0, n/st=100, rew=2109.00]                                                                                     


Epoch #7: test_reward: 13534.700000 ± 4113.332008, best_reward: 13534.700000 ± 4113.332008 in #7


Epoch #8: 1001it [00:01, 559.38it/s, env_step=8000, gradient_step=800, len=78, n/ep=0, n/st=100, rew=2533.50]                                                                                     


Epoch #8: test_reward: 11452.200000 ± 3856.071026, best_reward: 13534.700000 ± 4113.332008 in #7


Epoch #9: 1001it [00:01, 673.73it/s, env_step=9000, gradient_step=900, len=90, n/ep=1, n/st=100, rew=3773.00]                                                                                     


Epoch #9: test_reward: 11917.800000 ± 3338.884059, best_reward: 13534.700000 ± 4113.332008 in #7


Epoch #10: 1001it [00:01, 537.27it/s, env_step=10000, gradient_step=1000, len=94, n/ep=0, n/st=100, rew=3448.50]                                                                                  


Epoch #10: test_reward: 13686.400000 ± 2316.136879, best_reward: 13686.400000 ± 2316.136879 in #10


Epoch #11: 1001it [00:01, 666.24it/s, env_step=11000, gradient_step=1100, len=110, n/ep=2, n/st=100, rew=4147.50]                                                                                 


Epoch #11: test_reward: 11484.600000 ± 4193.826348, best_reward: 13686.400000 ± 2316.136879 in #10


Epoch #12: 1001it [00:01, 627.50it/s, env_step=12000, gradient_step=1200, len=120, n/ep=1, n/st=100, rew=4642.00]                                                                                 


Epoch #12: test_reward: 11924.400000 ± 2216.365412, best_reward: 13686.400000 ± 2316.136879 in #10


Epoch #13: 1001it [00:01, 672.48it/s, env_step=13000, gradient_step=1300, len=120, n/ep=0, n/st=100, rew=4642.00]                                                                                 


Epoch #13: test_reward: 8951.200000 ± 4059.043060, best_reward: 13686.400000 ± 2316.136879 in #10


Epoch #14: 1001it [00:02, 490.08it/s, env_step=14000, gradient_step=1400, len=138, n/ep=0, n/st=100, rew=5107.00]                                                                                 


Epoch #14: test_reward: 13786.000000 ± 4006.930097, best_reward: 13786.000000 ± 4006.930097 in #14


Epoch #15: 1001it [00:01, 624.39it/s, env_step=15000, gradient_step=1500, len=148, n/ep=0, n/st=100, rew=6181.00]                                                                                 


Epoch #15: test_reward: 14962.900000 ± 3626.048826, best_reward: 14962.900000 ± 3626.048826 in #15


Epoch #16: 1001it [00:01, 521.88it/s, env_step=16000, gradient_step=1600, len=160, n/ep=3, n/st=100, rew=6709.67]                                                                                 


Epoch #16: test_reward: 12870.400000 ± 3054.977388, best_reward: 14962.900000 ± 3626.048826 in #15


Epoch #17: 1001it [00:01, 539.40it/s, env_step=17000, gradient_step=1700, len=169, n/ep=0, n/st=100, rew=8077.00]                                                                                 


Epoch #17: test_reward: 15325.600000 ± 3781.199894, best_reward: 15325.600000 ± 3781.199894 in #17


Epoch #18: 1001it [00:01, 540.95it/s, env_step=18000, gradient_step=1800, len=176, n/ep=0, n/st=100, rew=7262.00]                                                                                 


Epoch #18: test_reward: 9054.400000 ± 2184.297745, best_reward: 15325.600000 ± 3781.199894 in #17


Epoch #19: 1001it [00:01, 549.41it/s, env_step=19000, gradient_step=1900, len=186, n/ep=0, n/st=100, rew=8060.50]                                                                                 


Epoch #19: test_reward: 15631.200000 ± 3668.136879, best_reward: 15631.200000 ± 3668.136879 in #19


Epoch #20: 1001it [00:01, 575.15it/s, env_step=20000, gradient_step=2000, len=198, n/ep=0, n/st=100, rew=8348.00]                                                                                 


Epoch #20: test_reward: 14333.000000 ± 4826.685592, best_reward: 15631.200000 ± 3668.136879 in #19


Epoch #21: 1001it [00:02, 477.28it/s, env_step=21000, gradient_step=2100, len=208, n/ep=0, n/st=100, rew=10178.75]                                                                                


Epoch #21: test_reward: 11181.000000 ± 3243.934432, best_reward: 15631.200000 ± 3668.136879 in #19


Epoch #22: 1001it [00:01, 544.14it/s, env_step=22000, gradient_step=2200, len=220, n/ep=2, n/st=100, rew=9914.00]                                                                                 


Epoch #22: test_reward: 10814.100000 ± 3532.538930, best_reward: 15631.200000 ± 3668.136879 in #19


Epoch #23: 1001it [00:01, 539.76it/s, env_step=23000, gradient_step=2300, len=230, n/ep=1, n/st=100, rew=9705.00]                                                                                 


Epoch #23: test_reward: 15641.300000 ± 3704.193463, best_reward: 15641.300000 ± 3704.193463 in #23


Epoch #24: 1001it [00:01, 622.97it/s, env_step=24000, gradient_step=2400, len=236, n/ep=0, n/st=100, rew=11114.25]                                                                                


Epoch #24: test_reward: 13306.800000 ± 4155.625796, best_reward: 15641.300000 ± 3704.193463 in #23


Epoch #25: 1001it [00:02, 483.20it/s, env_step=25000, gradient_step=2500, len=248, n/ep=0, n/st=100, rew=11963.75]                                                                                


Epoch #25: test_reward: 13557.400000 ± 4590.494901, best_reward: 15641.300000 ± 3704.193463 in #23


Epoch #26: 1001it [00:01, 602.33it/s, env_step=26000, gradient_step=2600, len=256, n/ep=0, n/st=100, rew=13878.00]                                                                                


Epoch #26: test_reward: 15012.500000 ± 4002.032865, best_reward: 15641.300000 ± 3704.193463 in #23


Epoch #27: 1001it [00:01, 651.39it/s, env_step=27000, gradient_step=2700, len=76, n/ep=0, n/st=100, rew=2775.00]                                                                                  


Epoch #27: test_reward: 11372.200000 ± 3963.556655, best_reward: 15641.300000 ± 3704.193463 in #23


Epoch #28: 1001it [00:01, 616.20it/s, env_step=28000, gradient_step=2800, len=86, n/ep=1, n/st=100, rew=3232.00]                                                                                  


Epoch #28: test_reward: 16600.600000 ± 4776.320429, best_reward: 16600.600000 ± 4776.320429 in #28


Epoch #29: 1001it [00:01, 586.75it/s, env_step=29000, gradient_step=2900, len=86, n/ep=0, n/st=100, rew=3232.00]                                                                                  


Epoch #29: test_reward: 14147.300000 ± 5076.198854, best_reward: 16600.600000 ± 4776.320429 in #28


Epoch #30: 1001it [00:01, 638.02it/s, env_step=30000, gradient_step=3000, len=174, n/ep=0, n/st=100, rew=7759.00]                                                                                 


Epoch #30: test_reward: 8120.000000 ± 2705.195261, best_reward: 16600.600000 ± 4776.320429 in #28


Epoch #31: 1001it [00:02, 437.85it/s, env_step=31000, gradient_step=3100, len=174, n/ep=0, n/st=100, rew=7759.00]                                                                                 


Epoch #31: test_reward: 10496.300000 ± 2970.789930, best_reward: 16600.600000 ± 4776.320429 in #28


Epoch #32: 1001it [00:01, 633.50it/s, env_step=32000, gradient_step=3200, len=320, n/ep=1, n/st=100, rew=15287.00]                                                                                


Epoch #32: test_reward: 11742.200000 ± 4108.545869, best_reward: 16600.600000 ± 4776.320429 in #28


Epoch #33: 1001it [00:01, 592.04it/s, env_step=33000, gradient_step=3300, len=176, n/ep=1, n/st=100, rew=8911.00]                                                                                 


Epoch #33: test_reward: 9657.200000 ± 7198.916152, best_reward: 16600.600000 ± 4776.320429 in #28


Epoch #34: 1001it [00:01, 587.79it/s, env_step=34000, gradient_step=3400, len=242, n/ep=0, n/st=100, rew=11138.50]                                                                                


Epoch #34: test_reward: 11921.200000 ± 4733.071514, best_reward: 16600.600000 ± 4776.320429 in #28


Epoch #35: 1001it [00:01, 618.26it/s, env_step=35000, gradient_step=3500, len=349, n/ep=0, n/st=100, rew=19004.00]                                                                                


Epoch #35: test_reward: 11892.500000 ± 6061.273979, best_reward: 16600.600000 ± 4776.320429 in #28


Epoch #36: 1001it [00:01, 624.40it/s, env_step=36000, gradient_step=3600, len=356, n/ep=0, n/st=100, rew=19672.00]                                                                                


Epoch #36: test_reward: 8759.000000 ± 4394.641669, best_reward: 16600.600000 ± 4776.320429 in #28


Epoch #37: 1001it [00:01, 559.22it/s, env_step=37000, gradient_step=3700, len=368, n/ep=0, n/st=100, rew=17192.00]                                                                                


Epoch #37: test_reward: 16372.700000 ± 5819.905979, best_reward: 16600.600000 ± 4776.320429 in #28


Epoch #38: 1001it [00:01, 544.92it/s, env_step=38000, gradient_step=3800, len=242, n/ep=1, n/st=100, rew=13493.00]                                                                                


Epoch #38: test_reward: 9596.600000 ± 5073.581067, best_reward: 16600.600000 ± 4776.320429 in #28


Epoch #39: 1001it [00:01, 594.68it/s, env_step=39000, gradient_step=3900, len=388, n/ep=0, n/st=100, rew=19837.00]                                                                                


Epoch #39: test_reward: 9233.300000 ± 3122.825837, best_reward: 16600.600000 ± 4776.320429 in #28


Epoch #40: 1001it [00:01, 583.32it/s, env_step=40000, gradient_step=4000, len=332, n/ep=0, n/st=100, rew=17017.00]                                                                                


Epoch #40: test_reward: 12967.800000 ± 3970.074201, best_reward: 16600.600000 ± 4776.320429 in #28


Epoch #41: 1001it [00:01, 508.14it/s, env_step=41000, gradient_step=4100, len=168, n/ep=1, n/st=100, rew=8714.00]                                                                                 


Epoch #41: test_reward: 12052.800000 ± 2975.072160, best_reward: 16600.600000 ± 4776.320429 in #28


Epoch #42: 1001it [00:02, 390.98it/s, env_step=42000, gradient_step=4200, len=203, n/ep=0, n/st=100, rew=10525.00]                                                                                


Epoch #42: test_reward: 15541.200000 ± 3376.224305, best_reward: 16600.600000 ± 4776.320429 in #28


Epoch #43: 1001it [00:01, 597.80it/s, env_step=43000, gradient_step=4300, len=210, n/ep=0, n/st=100, rew=10809.00]                                                                                


Epoch #43: test_reward: 14239.200000 ± 6092.408880, best_reward: 16600.600000 ± 4776.320429 in #28


Epoch #44: 1001it [00:01, 603.86it/s, env_step=44000, gradient_step=4400, len=230, n/ep=0, n/st=100, rew=11226.50]                                                                                


Epoch #44: test_reward: 13742.400000 ± 2840.908383, best_reward: 16600.600000 ± 4776.320429 in #28


Epoch #45: 1001it [00:01, 548.20it/s, env_step=45000, gradient_step=4500, len=248, n/ep=0, n/st=100, rew=14950.50]                                                                                


Epoch #45: test_reward: 15471.600000 ± 3571.608775, best_reward: 16600.600000 ± 4776.320429 in #28


Epoch #46: 1001it [00:01, 520.56it/s, env_step=46000, gradient_step=4600, len=350, n/ep=1, n/st=100, rew=18660.50]                                                                                


Epoch #46: test_reward: 12540.500000 ± 3539.493841, best_reward: 16600.600000 ± 4776.320429 in #28


Epoch #47: 1001it [00:02, 411.65it/s, env_step=47000, gradient_step=4700, len=212, n/ep=0, n/st=100, rew=11578.75]                                                                                


Epoch #47: test_reward: 17038.600000 ± 3656.341018, best_reward: 17038.600000 ± 3656.341018 in #47


Epoch #48: 1001it [00:01, 531.11it/s, env_step=48000, gradient_step=4800, len=260, n/ep=0, n/st=100, rew=13422.50]                                                                                


Epoch #48: test_reward: 14653.400000 ± 5748.604686, best_reward: 17038.600000 ± 3656.341018 in #47


Epoch #49: 1001it [00:01, 552.22it/s, env_step=49000, gradient_step=4900, len=231, n/ep=0, n/st=100, rew=11868.50]                                                                                


Epoch #49: test_reward: 9936.600000 ± 5718.785102, best_reward: 17038.600000 ± 3656.341018 in #47


Epoch #50: 1001it [00:02, 416.54it/s, env_step=50000, gradient_step=5000, len=316, n/ep=2, n/st=100, rew=16452.00]                                                                                


Epoch #50: test_reward: 13203.100000 ± 4317.786457, best_reward: 17038.600000 ± 3656.341018 in #47


Epoch #51: 1001it [00:01, 563.03it/s, env_step=51000, gradient_step=5100, len=288, n/ep=1, n/st=100, rew=15222.00]                                                                                


Epoch #51: test_reward: 15687.200000 ± 5479.479534, best_reward: 17038.600000 ± 3656.341018 in #47


Epoch #52: 1001it [00:02, 401.07it/s, env_step=52000, gradient_step=5200, len=400, n/ep=1, n/st=100, rew=20723.00]                                                                                


Epoch #52: test_reward: 12514.000000 ± 4462.485787, best_reward: 17038.600000 ± 3656.341018 in #47


Epoch #53: 1001it [00:02, 483.80it/s, env_step=53000, gradient_step=5300, len=81, n/ep=0, n/st=100, rew=3584.00]                                                                                  


Epoch #53: test_reward: 11842.100000 ± 3124.557775, best_reward: 17038.600000 ± 3656.341018 in #47


Epoch #54: 1001it [00:01, 596.50it/s, env_step=54000, gradient_step=5400, len=171, n/ep=0, n/st=100, rew=10088.50]                                                                                


Epoch #54: test_reward: 7740.100000 ± 3733.316554, best_reward: 17038.600000 ± 3656.341018 in #47


Epoch #55: 1001it [00:01, 564.13it/s, env_step=55000, gradient_step=5500, len=85, n/ep=0, n/st=100, rew=3376.00]                                                                                  


Epoch #55: test_reward: 13504.000000 ± 5454.943153, best_reward: 17038.600000 ± 3656.341018 in #47


Epoch #56: 1001it [00:02, 459.54it/s, env_step=56000, gradient_step=5600, len=314, n/ep=1, n/st=100, rew=16633.00]                                                                                


Epoch #56: test_reward: 11762.500000 ± 6664.458691, best_reward: 17038.600000 ± 3656.341018 in #47


Epoch #57: 1001it [00:01, 515.85it/s, env_step=57000, gradient_step=5700, len=82, n/ep=0, n/st=100, rew=2985.00]                                                                                  


Epoch #57: test_reward: 13869.100000 ± 5498.154790, best_reward: 17038.600000 ± 3656.341018 in #47


Epoch #58: 1001it [00:02, 489.47it/s, env_step=58000, gradient_step=5800, len=183, n/ep=0, n/st=100, rew=9836.00]                                                                                 


Epoch #58: test_reward: 20198.400000 ± 5134.436974, best_reward: 20198.400000 ± 5134.436974 in #58


Epoch #59: 1001it [00:01, 505.47it/s, env_step=59000, gradient_step=5900, len=91, n/ep=1, n/st=100, rew=3314.00]                                                                                  


Epoch #59: test_reward: 13331.600000 ± 3448.264149, best_reward: 20198.400000 ± 5134.436974 in #58


Epoch #60: 1001it [00:01, 591.89it/s, env_step=60000, gradient_step=6000, len=270, n/ep=0, n/st=100, rew=15826.00]                                                                                


Epoch #60: test_reward: 15767.000000 ± 7232.943979, best_reward: 20198.400000 ± 5134.436974 in #58


Epoch #61: 1001it [00:02, 461.03it/s, env_step=61000, gradient_step=6100, len=197, n/ep=0, n/st=100, rew=10739.00]                                                                                


Epoch #61: test_reward: 19326.800000 ± 6892.829416, best_reward: 20198.400000 ± 5134.436974 in #58


Epoch #62: 1001it [00:01, 592.05it/s, env_step=62000, gradient_step=6200, len=395, n/ep=0, n/st=100, rew=23542.00]                                                                                


Epoch #62: test_reward: 16530.500000 ± 4308.160472, best_reward: 20198.400000 ± 5134.436974 in #58


Epoch #63: 1001it [00:01, 622.53it/s, env_step=63000, gradient_step=6300, len=274, n/ep=1, n/st=100, rew=15723.00]                                                                                


Epoch #63: test_reward: 15774.500000 ± 5651.955613, best_reward: 20198.400000 ± 5134.436974 in #58


Epoch #64: 1001it [00:01, 548.50it/s, env_step=64000, gradient_step=6400, len=125, n/ep=0, n/st=100, rew=6183.50]                                                                                 


Epoch #64: test_reward: 12932.900000 ± 2735.621189, best_reward: 20198.400000 ± 5134.436974 in #58


Epoch #65: 1001it [00:01, 642.51it/s, env_step=65000, gradient_step=6500, len=211, n/ep=0, n/st=100, rew=9618.50]                                                                                 


Epoch #65: test_reward: 15295.800000 ± 5950.625140, best_reward: 20198.400000 ± 5134.436974 in #58


Epoch #66: 1001it [00:02, 450.60it/s, env_step=66000, gradient_step=6600, len=47, n/ep=1, n/st=100, rew=1875.00]                                                                                  


Epoch #66: test_reward: 14690.800000 ± 5530.727254, best_reward: 20198.400000 ± 5134.436974 in #58


Epoch #67: 1001it [00:01, 591.29it/s, env_step=67000, gradient_step=6700, len=195, n/ep=0, n/st=100, rew=10721.00]                                                                                


Epoch #67: test_reward: 15264.800000 ± 5835.631736, best_reward: 20198.400000 ± 5134.436974 in #58


Epoch #68: 1001it [00:01, 626.28it/s, env_step=68000, gradient_step=6800, len=315, n/ep=0, n/st=100, rew=19127.00]                                                                                


Epoch #68: test_reward: 9564.000000 ± 2840.305899, best_reward: 20198.400000 ± 5134.436974 in #58


Epoch #69: 1001it [00:01, 594.12it/s, env_step=69000, gradient_step=6900, len=227, n/ep=0, n/st=100, rew=12137.00]                                                                                


Epoch #69: test_reward: 15988.500000 ± 5899.682250, best_reward: 20198.400000 ± 5134.436974 in #58


Epoch #70: 1001it [00:02, 429.98it/s, env_step=70000, gradient_step=7000, len=247, n/ep=1, n/st=100, rew=14629.00]                                                                                


Epoch #70: test_reward: 17044.200000 ± 6564.777373, best_reward: 20198.400000 ± 5134.436974 in #58


Epoch #71: 1001it [00:02, 450.37it/s, env_step=71000, gradient_step=7100, len=101, n/ep=0, n/st=100, rew=5115.00]                                                                                 


Epoch #71: test_reward: 17415.000000 ± 6183.829218, best_reward: 20198.400000 ± 5134.436974 in #58


Epoch #72: 1001it [00:01, 560.80it/s, env_step=72000, gradient_step=7200, len=273, n/ep=0, n/st=100, rew=15464.00]                                                                                


Epoch #72: test_reward: 15911.900000 ± 4168.354410, best_reward: 20198.400000 ± 5134.436974 in #58


Epoch #73: 1001it [00:01, 520.25it/s, env_step=73000, gradient_step=7300, len=227, n/ep=0, n/st=100, rew=12800.00]                                                                                


Epoch #73: test_reward: 15278.900000 ± 4699.144208, best_reward: 20198.400000 ± 5134.436974 in #58


Epoch #74: 1001it [00:01, 594.98it/s, env_step=74000, gradient_step=7400, len=249, n/ep=1, n/st=100, rew=12135.00]                                                                                


Epoch #74: test_reward: 14582.800000 ± 4834.765637, best_reward: 20198.400000 ± 5134.436974 in #58


Epoch #75: 1001it [00:02, 474.16it/s, env_step=75000, gradient_step=7500, len=278, n/ep=0, n/st=100, rew=16024.00]                                                                                


Epoch #75: test_reward: 18895.200000 ± 6553.250259, best_reward: 20198.400000 ± 5134.436974 in #58


Epoch #76: 1001it [00:02, 461.53it/s, env_step=76000, gradient_step=7600, len=346, n/ep=0, n/st=100, rew=20574.50]                                                                                


Epoch #76: test_reward: 16265.600000 ± 3936.646446, best_reward: 20198.400000 ± 5134.436974 in #58


Epoch #77: 1001it [00:01, 565.39it/s, env_step=77000, gradient_step=7700, len=190, n/ep=0, n/st=100, rew=9495.00]                                                                                 


Epoch #77: test_reward: 15101.700000 ± 6547.710685, best_reward: 20198.400000 ± 5134.436974 in #58


Epoch #78: 1001it [00:01, 544.54it/s, env_step=78000, gradient_step=7800, len=168, n/ep=1, n/st=100, rew=9052.00]                                                                                 


Epoch #78: test_reward: 13894.600000 ± 5923.598555, best_reward: 20198.400000 ± 5134.436974 in #58


Epoch #79: 1001it [00:01, 565.60it/s, env_step=79000, gradient_step=7900, len=275, n/ep=1, n/st=100, rew=14072.50]                                                                                


Epoch #79: test_reward: 19078.800000 ± 5245.401868, best_reward: 20198.400000 ± 5134.436974 in #58


Epoch #80: 1001it [00:01, 624.52it/s, env_step=80000, gradient_step=8000, len=288, n/ep=0, n/st=100, rew=17717.00]                                                                                


Epoch #80: test_reward: 17415.200000 ± 4901.647556, best_reward: 20198.400000 ± 5134.436974 in #58


Epoch #81: 1001it [00:01, 578.91it/s, env_step=81000, gradient_step=8100, len=357, n/ep=1, n/st=100, rew=18381.00]                                                                                


Epoch #81: test_reward: 10893.300000 ± 4036.896061, best_reward: 20198.400000 ± 5134.436974 in #58


Epoch #82: 1001it [00:02, 396.78it/s, env_step=82000, gradient_step=8200, len=203, n/ep=1, n/st=100, rew=11361.00]                                                                                


Epoch #82: test_reward: 13194.200000 ± 4260.552190, best_reward: 20198.400000 ± 5134.436974 in #58


Epoch #83: 1001it [00:02, 435.89it/s, env_step=83000, gradient_step=8300, len=100, n/ep=0, n/st=100, rew=4612.50]                                                                                 


Epoch #83: test_reward: 13511.500000 ± 3893.564363, best_reward: 20198.400000 ± 5134.436974 in #58


Epoch #84: 1001it [00:01, 571.88it/s, env_step=84000, gradient_step=8400, len=338, n/ep=0, n/st=100, rew=19346.00]                                                                                


Epoch #84: test_reward: 21057.900000 ± 4970.393032, best_reward: 21057.900000 ± 4970.393032 in #84


Epoch #85: 1001it [00:01, 575.87it/s, env_step=85000, gradient_step=8500, len=153, n/ep=1, n/st=100, rew=7536.50]                                                                                 


Epoch #85: test_reward: 7120.500000 ± 2429.504775, best_reward: 21057.900000 ± 4970.393032 in #84


Epoch #86: 1001it [00:01, 609.87it/s, env_step=86000, gradient_step=8600, len=282, n/ep=0, n/st=100, rew=16206.00]                                                                                


Epoch #86: test_reward: 11611.600000 ± 5618.212086, best_reward: 21057.900000 ± 4970.393032 in #84


Epoch #87: 1001it [00:02, 397.01it/s, env_step=87000, gradient_step=8700, len=188, n/ep=0, n/st=100, rew=9492.00]                                                                                 


Epoch #87: test_reward: 14399.800000 ± 8488.106618, best_reward: 21057.900000 ± 4970.393032 in #84


Epoch #88: 1001it [00:01, 522.40it/s, env_step=88000, gradient_step=8800, len=294, n/ep=0, n/st=100, rew=16496.00]                                                                                


Epoch #88: test_reward: 12014.400000 ± 5259.398429, best_reward: 21057.900000 ± 4970.393032 in #84


Epoch #89: 1001it [00:01, 527.38it/s, env_step=89000, gradient_step=8900, len=249, n/ep=3, n/st=100, rew=13182.00]                                                                                


Epoch #89: test_reward: 17309.200000 ± 8693.101044, best_reward: 21057.900000 ± 4970.393032 in #84


Epoch #90: 1001it [00:01, 516.85it/s, env_step=90000, gradient_step=9000, len=83, n/ep=2, n/st=100, rew=3676.25]                                                                                  


Epoch #90: test_reward: 16477.300000 ± 7721.054177, best_reward: 21057.900000 ± 4970.393032 in #84


Epoch #91: 1001it [00:02, 497.12it/s, env_step=91000, gradient_step=9100, len=84, n/ep=0, n/st=100, rew=3995.50]                                                                                  


Epoch #91: test_reward: 13685.300000 ± 5144.815877, best_reward: 21057.900000 ± 4970.393032 in #84


Epoch #92: 1001it [00:02, 484.74it/s, env_step=92000, gradient_step=9200, len=246, n/ep=0, n/st=100, rew=14482.00]                                                                                


Epoch #92: test_reward: 16150.700000 ± 4278.545735, best_reward: 21057.900000 ± 4970.393032 in #84


Epoch #93: 1001it [00:01, 643.08it/s, env_step=93000, gradient_step=9300, len=40, n/ep=1, n/st=100, rew=1435.00]                                                                                  


Epoch #93: test_reward: 17094.600000 ± 7169.396909, best_reward: 21057.900000 ± 4970.393032 in #84


Epoch #94: 1001it [00:01, 562.61it/s, env_step=94000, gradient_step=9400, len=196, n/ep=1, n/st=100, rew=10004.00]                                                                                


Epoch #94: test_reward: 14936.000000 ± 7050.089361, best_reward: 21057.900000 ± 4970.393032 in #84


Epoch #95: 1001it [00:01, 563.43it/s, env_step=95000, gradient_step=9500, len=232, n/ep=0, n/st=100, rew=12491.00]                                                                                


Epoch #95: test_reward: 10545.800000 ± 5620.257802, best_reward: 21057.900000 ± 4970.393032 in #84


Epoch #96: 1001it [00:01, 513.08it/s, env_step=96000, gradient_step=9600, len=109, n/ep=0, n/st=100, rew=5586.50]                                                                                 


Epoch #96: test_reward: 8685.400000 ± 6080.781203, best_reward: 21057.900000 ± 4970.393032 in #84


Epoch #97: 1001it [00:01, 575.68it/s, env_step=97000, gradient_step=9700, len=146, n/ep=0, n/st=100, rew=7059.00]                                                                                 


Epoch #97: test_reward: 21968.100000 ± 5232.123899, best_reward: 21968.100000 ± 5232.123899 in #97


Epoch #98: 1001it [00:01, 554.22it/s, env_step=98000, gradient_step=9800, len=387, n/ep=0, n/st=100, rew=23539.50]                                                                                


Epoch #98: test_reward: 14490.400000 ± 6068.032617, best_reward: 21968.100000 ± 5232.123899 in #97


Epoch #99: 1001it [00:01, 544.31it/s, env_step=99000, gradient_step=9900, len=254, n/ep=0, n/st=100, rew=13108.00]                                                                                


Epoch #99: test_reward: 14719.000000 ± 3316.855891, best_reward: 21968.100000 ± 5232.123899 in #97


Epoch #100: 1001it [00:01, 523.28it/s, env_step=100000, gradient_step=10000, len=92, n/ep=0, n/st=100, rew=4072.50]                                                                               


Epoch #100: test_reward: 17011.100000 ± 5641.717034, best_reward: 21968.100000 ± 5232.123899 in #97


Epoch #101: 1001it [00:01, 500.85it/s, env_step=101000, gradient_step=10100, len=175, n/ep=0, n/st=100, rew=9158.00]                                                                              


Epoch #101: test_reward: 11120.700000 ± 4258.288906, best_reward: 21968.100000 ± 5232.123899 in #97


Epoch #102: 1001it [00:02, 487.74it/s, env_step=102000, gradient_step=10200, len=358, n/ep=0, n/st=100, rew=21034.00]                                                                             


Epoch #102: test_reward: 13427.200000 ± 5649.498399, best_reward: 21968.100000 ± 5232.123899 in #97


Epoch #103: 1001it [00:02, 449.88it/s, env_step=103000, gradient_step=10300, len=322, n/ep=0, n/st=100, rew=18667.00]                                                                             


Epoch #103: test_reward: 13903.300000 ± 4316.025047, best_reward: 21968.100000 ± 5232.123899 in #97


Epoch #104: 1001it [00:02, 424.94it/s, env_step=104000, gradient_step=10400, len=244, n/ep=0, n/st=100, rew=13189.75]                                                                             


Epoch #104: test_reward: 10731.500000 ± 6244.428897, best_reward: 21968.100000 ± 5232.123899 in #97


Epoch #105: 1001it [00:02, 451.11it/s, env_step=105000, gradient_step=10500, len=230, n/ep=0, n/st=100, rew=13938.83]                                                                             


Epoch #105: test_reward: 11576.900000 ± 3894.444850, best_reward: 21968.100000 ± 5232.123899 in #97


Epoch #106: 1001it [00:01, 616.76it/s, env_step=106000, gradient_step=10600, len=126, n/ep=0, n/st=100, rew=6261.00]                                                                              


Epoch #106: test_reward: 11422.600000 ± 4826.316571, best_reward: 21968.100000 ± 5232.123899 in #97


Epoch #107: 1001it [00:02, 480.24it/s, env_step=107000, gradient_step=10700, len=98, n/ep=1, n/st=100, rew=4665.50]                                                                               


Epoch #107: test_reward: 12690.000000 ± 5267.701966, best_reward: 21968.100000 ± 5232.123899 in #97


Epoch #108: 1001it [00:01, 505.75it/s, env_step=108000, gradient_step=10800, len=259, n/ep=0, n/st=100, rew=14832.00]                                                                             


Epoch #108: test_reward: 16145.800000 ± 5145.400739, best_reward: 21968.100000 ± 5232.123899 in #97


Epoch #109: 1001it [00:02, 428.43it/s, env_step=109000, gradient_step=10900, len=200, n/ep=1, n/st=100, rew=10564.00]                                                                             


Epoch #109: test_reward: 16987.900000 ± 6301.065282, best_reward: 21968.100000 ± 5232.123899 in #97


Epoch #110: 1001it [00:02, 448.55it/s, env_step=110000, gradient_step=11000, len=186, n/ep=1, n/st=100, rew=9663.50]                                                                              


Epoch #110: test_reward: 10984.600000 ± 6246.495229, best_reward: 21968.100000 ± 5232.123899 in #97


Epoch #111: 1001it [00:02, 402.84it/s, env_step=111000, gradient_step=11100, len=255, n/ep=0, n/st=100, rew=15577.00]                                                                             


Epoch #111: test_reward: 17935.500000 ± 4406.262163, best_reward: 21968.100000 ± 5232.123899 in #97


Epoch #112: 1001it [00:01, 533.08it/s, env_step=112000, gradient_step=11200, len=362, n/ep=1, n/st=100, rew=21155.00]                                                                             


Epoch #112: test_reward: 19009.200000 ± 3361.922123, best_reward: 21968.100000 ± 5232.123899 in #97


Epoch #113: 1001it [00:01, 537.00it/s, env_step=113000, gradient_step=11300, len=102, n/ep=1, n/st=100, rew=4993.00]                                                                              


Epoch #113: test_reward: 14567.700000 ± 5012.648802, best_reward: 21968.100000 ± 5232.123899 in #97


Epoch #114: 1001it [00:02, 487.46it/s, env_step=114000, gradient_step=11400, len=131, n/ep=1, n/st=100, rew=6688.00]                                                                              


Epoch #114: test_reward: 11280.400000 ± 5375.876974, best_reward: 21968.100000 ± 5232.123899 in #97


Epoch #115: 1001it [00:01, 519.91it/s, env_step=115000, gradient_step=11500, len=150, n/ep=0, n/st=100, rew=8053.00]                                                                              


Epoch #115: test_reward: 15038.800000 ± 7464.258608, best_reward: 21968.100000 ± 5232.123899 in #97


Epoch #116: 1001it [00:01, 602.32it/s, env_step=116000, gradient_step=11600, len=257, n/ep=0, n/st=100, rew=13641.50]                                                                             


Epoch #116: test_reward: 19066.800000 ± 5220.157273, best_reward: 21968.100000 ± 5232.123899 in #97


Epoch #117: 1001it [00:01, 534.45it/s, env_step=117000, gradient_step=11700, len=198, n/ep=0, n/st=100, rew=11347.00]                                                                             


Epoch #117: test_reward: 16611.600000 ± 6468.298141, best_reward: 21968.100000 ± 5232.123899 in #97


Epoch #118: 1001it [00:02, 489.78it/s, env_step=118000, gradient_step=11800, len=376, n/ep=0, n/st=100, rew=21655.00]                                                                             


Epoch #118: test_reward: 10294.400000 ± 5057.225963, best_reward: 21968.100000 ± 5232.123899 in #97


Epoch #119: 1001it [00:02, 486.68it/s, env_step=119000, gradient_step=11900, len=204, n/ep=0, n/st=100, rew=10246.00]                                                                             


Epoch #119: test_reward: 11481.000000 ± 5516.847687, best_reward: 21968.100000 ± 5232.123899 in #97


Epoch #120: 1001it [00:01, 525.34it/s, env_step=120000, gradient_step=12000, len=238, n/ep=0, n/st=100, rew=14620.00]                                                                             


Epoch #120: test_reward: 13903.500000 ± 6882.246352, best_reward: 21968.100000 ± 5232.123899 in #97


Epoch #121: 1001it [00:01, 560.66it/s, env_step=121000, gradient_step=12100, len=173, n/ep=1, n/st=100, rew=9654.00]                                                                              


Epoch #121: test_reward: 14548.000000 ± 4454.431412, best_reward: 21968.100000 ± 5232.123899 in #97


Epoch #122: 1001it [00:02, 483.42it/s, env_step=122000, gradient_step=12200, len=400, n/ep=1, n/st=100, rew=22977.00]                                                                             


Epoch #122: test_reward: 11406.700000 ± 5434.299073, best_reward: 21968.100000 ± 5232.123899 in #97


Epoch #123: 1001it [00:01, 552.50it/s, env_step=123000, gradient_step=12300, len=399, n/ep=0, n/st=100, rew=26562.50]                                                                             


Epoch #123: test_reward: 9621.700000 ± 4151.145795, best_reward: 21968.100000 ± 5232.123899 in #97


Epoch #124: 1001it [00:01, 600.72it/s, env_step=124000, gradient_step=12400, len=122, n/ep=0, n/st=100, rew=5806.00]                                                                              


Epoch #124: test_reward: 13523.100000 ± 6607.253581, best_reward: 21968.100000 ± 5232.123899 in #97


Epoch #125: 1001it [00:02, 430.79it/s, env_step=125000, gradient_step=12500, len=104, n/ep=1, n/st=100, rew=5376.00]                                                                              


Epoch #125: test_reward: 15786.300000 ± 5582.284999, best_reward: 21968.100000 ± 5232.123899 in #97


Epoch #126: 1001it [00:01, 542.78it/s, env_step=126000, gradient_step=12600, len=166, n/ep=0, n/st=100, rew=9238.00]                                                                              


Epoch #126: test_reward: 17501.400000 ± 5472.936912, best_reward: 21968.100000 ± 5232.123899 in #97


Epoch #127: 1001it [00:01, 589.15it/s, env_step=127000, gradient_step=12700, len=268, n/ep=0, n/st=100, rew=16988.00]                                                                             


Epoch #127: test_reward: 13286.000000 ± 4800.220328, best_reward: 21968.100000 ± 5232.123899 in #97


Epoch #128: 1001it [00:02, 492.90it/s, env_step=128000, gradient_step=12800, len=380, n/ep=0, n/st=100, rew=22807.00]                                                                             


Epoch #128: test_reward: 14053.100000 ± 7514.108815, best_reward: 21968.100000 ± 5232.123899 in #97


Epoch #129: 1001it [00:01, 558.70it/s, env_step=129000, gradient_step=12900, len=273, n/ep=0, n/st=100, rew=16599.00]                                                                             


Epoch #129: test_reward: 12700.900000 ± 4463.464449, best_reward: 21968.100000 ± 5232.123899 in #97


Epoch #130: 1001it [00:01, 617.47it/s, env_step=130000, gradient_step=13000, len=181, n/ep=0, n/st=100, rew=10641.00]                                                                             


Epoch #130: test_reward: 7910.000000 ± 3121.592542, best_reward: 21968.100000 ± 5232.123899 in #97


Epoch #131: 1001it [00:02, 490.62it/s, env_step=131000, gradient_step=13100, len=184, n/ep=0, n/st=100, rew=10328.25]                                                                             


Epoch #131: test_reward: 10600.900000 ± 6273.461317, best_reward: 21968.100000 ± 5232.123899 in #97


Epoch #132: 1001it [00:02, 493.53it/s, env_step=132000, gradient_step=13200, len=257, n/ep=0, n/st=100, rew=15686.83]                                                                             


Epoch #132: test_reward: 10176.700000 ± 4641.033076, best_reward: 21968.100000 ± 5232.123899 in #97


Epoch #133: 1001it [00:01, 517.64it/s, env_step=133000, gradient_step=13300, len=76, n/ep=1, n/st=100, rew=3369.00]                                                                               


Epoch #133: test_reward: 12842.500000 ± 8042.880631, best_reward: 21968.100000 ± 5232.123899 in #97


Epoch #134: 1001it [00:02, 466.48it/s, env_step=134000, gradient_step=13400, len=196, n/ep=0, n/st=100, rew=11037.00]                                                                             


Epoch #134: test_reward: 14759.700000 ± 6503.736927, best_reward: 21968.100000 ± 5232.123899 in #97


Epoch #135: 1001it [00:02, 417.95it/s, env_step=135000, gradient_step=13500, len=152, n/ep=2, n/st=100, rew=7754.25]                                                                              


Epoch #135: test_reward: 15652.800000 ± 7597.228453, best_reward: 21968.100000 ± 5232.123899 in #97


Epoch #136: 1001it [00:01, 536.41it/s, env_step=136000, gradient_step=13600, len=132, n/ep=1, n/st=100, rew=7910.00]                                                                              


Epoch #136: test_reward: 14594.500000 ± 5700.061987, best_reward: 21968.100000 ± 5232.123899 in #97


Epoch #137: 1001it [00:01, 539.76it/s, env_step=137000, gradient_step=13700, len=122, n/ep=0, n/st=100, rew=6272.00]                                                                              


Epoch #137: test_reward: 10933.000000 ± 4668.722630, best_reward: 21968.100000 ± 5232.123899 in #97


Epoch #138: 1001it [00:01, 535.94it/s, env_step=138000, gradient_step=13800, len=202, n/ep=0, n/st=100, rew=11323.00]                                                                             


Epoch #138: test_reward: 13261.600000 ± 7949.147617, best_reward: 21968.100000 ± 5232.123899 in #97


Epoch #139: 1001it [00:02, 461.76it/s, env_step=139000, gradient_step=13900, len=192, n/ep=2, n/st=100, rew=10816.75]                                                                             


Epoch #139: test_reward: 9038.100000 ± 2767.638287, best_reward: 21968.100000 ± 5232.123899 in #97


Epoch #140: 1001it [00:02, 425.98it/s, env_step=140000, gradient_step=14000, len=186, n/ep=1, n/st=100, rew=9859.00]                                                                              


Epoch #140: test_reward: 15794.200000 ± 6648.670676, best_reward: 21968.100000 ± 5232.123899 in #97


Epoch #141: 1001it [00:01, 546.66it/s, env_step=141000, gradient_step=14100, len=121, n/ep=0, n/st=100, rew=5303.00]                                                                              


Epoch #141: test_reward: 18526.100000 ± 3115.216668, best_reward: 21968.100000 ± 5232.123899 in #97


Epoch #142: 1001it [00:02, 434.96it/s, env_step=142000, gradient_step=14200, len=246, n/ep=0, n/st=100, rew=14896.00]                                                                             


Epoch #142: test_reward: 15089.100000 ± 4663.494815, best_reward: 21968.100000 ± 5232.123899 in #97


Epoch #143: 1001it [00:01, 533.19it/s, env_step=143000, gradient_step=14300, len=176, n/ep=0, n/st=100, rew=8943.50]                                                                              


Epoch #143: test_reward: 13708.600000 ± 6725.638114, best_reward: 21968.100000 ± 5232.123899 in #97


Epoch #144: 1001it [00:01, 608.23it/s, env_step=144000, gradient_step=14400, len=146, n/ep=0, n/st=100, rew=9119.00]                                                                              


Epoch #144: test_reward: 21982.000000 ± 5761.715370, best_reward: 21982.000000 ± 5761.715370 in #144


Epoch #145: 1001it [00:02, 441.36it/s, env_step=145000, gradient_step=14500, len=202, n/ep=1, n/st=100, rew=10456.50]                                                                             


Epoch #145: test_reward: 8786.700000 ± 5074.558701, best_reward: 21982.000000 ± 5761.715370 in #144


Epoch #146: 1001it [00:01, 528.11it/s, env_step=146000, gradient_step=14600, len=399, n/ep=0, n/st=100, rew=24677.00]                                                                             


Epoch #146: test_reward: 15627.200000 ± 6562.430842, best_reward: 21982.000000 ± 5761.715370 in #144


Epoch #147: 1001it [00:02, 396.47it/s, env_step=147000, gradient_step=14700, len=253, n/ep=0, n/st=100, rew=13099.00]                                                                             


Epoch #147: test_reward: 15471.200000 ± 3610.241787, best_reward: 21982.000000 ± 5761.715370 in #144


Epoch #148: 1001it [00:01, 585.28it/s, env_step=148000, gradient_step=14800, len=173, n/ep=1, n/st=100, rew=8912.00]                                                                              


Epoch #148: test_reward: 14684.400000 ± 4527.380947, best_reward: 21982.000000 ± 5761.715370 in #144


Epoch #149: 1001it [00:02, 479.40it/s, env_step=149000, gradient_step=14900, len=155, n/ep=0, n/st=100, rew=7380.00]                                                                              


Epoch #149: test_reward: 11847.200000 ± 2924.157547, best_reward: 21982.000000 ± 5761.715370 in #144


Epoch #150: 1001it [00:02, 453.52it/s, env_step=150000, gradient_step=15000, len=302, n/ep=2, n/st=100, rew=19499.25]                                                                             


Epoch #150: test_reward: 14910.400000 ± 5529.532859, best_reward: 21982.000000 ± 5761.715370 in #144

InfoStats(gradient_step=15000, best_reward=21982.0, best_reward_std=5761.7153695752795, train_step=150000, train_episode=640, test_step=393808, test_episode=1510, timing=TimingStats(total_time=501.52312707901, train_time=285.46141266822815, train_time_collect=51.66529393196106, train_time_update=227.91453790664673, test_time=216.06171441078186, update_speed=525.4650658312776))

(the trained policy can be accessed via policy.policies[agents[1]])
Train with agent_opponent #4


Epoch #1: 1001it [00:01, 558.84it/s, env_step=1000, gradient_step=100, len=0, n/ep=0, n/st=100, rew=0.00]                                                                                         


Epoch #1: test_reward: 12483.700000 ± 5630.218931, best_reward: 12483.700000 ± 5630.218931 in #1


Epoch #2: 1001it [00:01, 527.99it/s, env_step=2000, gradient_step=200, len=0, n/ep=0, n/st=100, rew=0.00]                                                                                         


Epoch #2: test_reward: 6797.400000 ± 4196.709382, best_reward: 12483.700000 ± 5630.218931 in #1


Epoch #3: 1001it [00:02, 430.97it/s, env_step=3000, gradient_step=300, len=0, n/ep=0, n/st=100, rew=0.00]                                                                                         


Epoch #3: test_reward: 11288.500000 ± 3966.689281, best_reward: 12483.700000 ± 5630.218931 in #1


Epoch #4: 1001it [00:01, 522.59it/s, env_step=4000, gradient_step=400, len=0, n/ep=0, n/st=100, rew=0.00]                                                                                         


Epoch #4: test_reward: 10848.800000 ± 3908.971420, best_reward: 12483.700000 ± 5630.218931 in #1


Epoch #5: 1001it [00:01, 543.55it/s, env_step=5000, gradient_step=500, len=0, n/ep=0, n/st=100, rew=0.00]                                                                                         


Epoch #5: test_reward: 10256.800000 ± 4315.944133, best_reward: 12483.700000 ± 5630.218931 in #1


Epoch #6: 1001it [00:01, 525.27it/s, env_step=6000, gradient_step=600, len=0, n/ep=0, n/st=100, rew=0.00]                                                                                         


Epoch #6: test_reward: 12269.400000 ± 4796.181444, best_reward: 12483.700000 ± 5630.218931 in #1


Epoch #7: 1001it [00:01, 548.99it/s, env_step=7000, gradient_step=700, len=64, n/ep=0, n/st=100, rew=1716.00]                                                                                     


Epoch #7: test_reward: 11846.300000 ± 3901.126044, best_reward: 12483.700000 ± 5630.218931 in #1


Epoch #8: 1001it [00:02, 486.60it/s, env_step=8000, gradient_step=800, len=72, n/ep=0, n/st=100, rew=1260.00]                                                                                     


Epoch #8: test_reward: 10970.800000 ± 5553.488234, best_reward: 12483.700000 ± 5630.218931 in #1


Epoch #9: 1001it [00:02, 392.52it/s, env_step=9000, gradient_step=900, len=88, n/ep=0, n/st=100, rew=2410.50]                                                                                     


Epoch #9: test_reward: 11893.600000 ± 5643.146750, best_reward: 12483.700000 ± 5630.218931 in #1


Epoch #10: 1001it [00:02, 418.81it/s, env_step=10000, gradient_step=1000, len=100, n/ep=1, n/st=100, rew=2266.00]                                                                                 


Epoch #10: test_reward: 12840.100000 ± 4702.941770, best_reward: 12840.100000 ± 4702.941770 in #10


Epoch #11: 1001it [00:02, 362.46it/s, env_step=11000, gradient_step=1100, len=108, n/ep=0, n/st=100, rew=3195.00]                                                                                 


Epoch #11: test_reward: 11882.000000 ± 3781.417962, best_reward: 12840.100000 ± 4702.941770 in #10


Epoch #12: 1001it [00:02, 362.47it/s, env_step=12000, gradient_step=1200, len=116, n/ep=0, n/st=100, rew=1674.00]                                                                                 


Epoch #12: test_reward: 12675.400000 ± 5645.947488, best_reward: 12840.100000 ± 4702.941770 in #10


Epoch #13: 1001it [00:02, 444.20it/s, env_step=13000, gradient_step=1300, len=128, n/ep=0, n/st=100, rew=4392.00]                                                                                 


Epoch #13: test_reward: 16419.400000 ± 5303.989427, best_reward: 16419.400000 ± 5303.989427 in #13


Epoch #14: 1001it [00:02, 439.64it/s, env_step=14000, gradient_step=1400, len=140, n/ep=1, n/st=100, rew=4343.00]                                                                                 


Epoch #14: test_reward: 12851.900000 ± 3376.882718, best_reward: 16419.400000 ± 5303.989427 in #13


Epoch #15: 1001it [00:02, 426.30it/s, env_step=15000, gradient_step=1500, len=150, n/ep=2, n/st=100, rew=5543.50]                                                                                 


Epoch #15: test_reward: 14631.900000 ± 5156.263152, best_reward: 16419.400000 ± 5303.989427 in #13


Epoch #16: 1001it [00:02, 393.73it/s, env_step=16000, gradient_step=1600, len=158, n/ep=0, n/st=100, rew=6818.00]                                                                                 


Epoch #16: test_reward: 13842.500000 ± 5203.461468, best_reward: 16419.400000 ± 5303.989427 in #13


Epoch #17: 1001it [00:02, 341.77it/s, env_step=17000, gradient_step=1700, len=168, n/ep=0, n/st=100, rew=5219.50]                                                                                 


Epoch #17: test_reward: 11841.600000 ± 5660.133836, best_reward: 16419.400000 ± 5303.989427 in #13


Epoch #18: 1001it [00:02, 348.62it/s, env_step=18000, gradient_step=1800, len=180, n/ep=1, n/st=100, rew=5825.50]                                                                                 


Epoch #18: test_reward: 8795.500000 ± 4958.565584, best_reward: 16419.400000 ± 5303.989427 in #13


Epoch #19: 1001it [00:02, 364.49it/s, env_step=19000, gradient_step=1900, len=188, n/ep=0, n/st=100, rew=6980.00]                                                                                 


Epoch #19: test_reward: 11218.300000 ± 5447.700947, best_reward: 16419.400000 ± 5303.989427 in #13


Epoch #20: 1001it [00:02, 392.49it/s, env_step=20000, gradient_step=2000, len=200, n/ep=5, n/st=100, rew=8423.80]                                                                                 


Epoch #20: test_reward: 11638.200000 ± 3393.444321, best_reward: 16419.400000 ± 5303.989427 in #13


Epoch #21: 1001it [00:02, 347.60it/s, env_step=21000, gradient_step=2100, len=208, n/ep=0, n/st=100, rew=11350.50]                                                                                


Epoch #21: test_reward: 11822.500000 ± 4143.168214, best_reward: 16419.400000 ± 5303.989427 in #13


Epoch #22: 1001it [00:02, 356.48it/s, env_step=22000, gradient_step=2200, len=220, n/ep=2, n/st=100, rew=10904.50]                                                                                


Epoch #22: test_reward: 14173.500000 ± 3784.435790, best_reward: 16419.400000 ± 5303.989427 in #13


Epoch #23: 1001it [00:02, 447.03it/s, env_step=23000, gradient_step=2300, len=230, n/ep=1, n/st=100, rew=12313.00]                                                                                


Epoch #23: test_reward: 12260.000000 ± 3174.127754, best_reward: 16419.400000 ± 5303.989427 in #13


Epoch #24: 1001it [00:02, 376.86it/s, env_step=24000, gradient_step=2400, len=240, n/ep=2, n/st=100, rew=11488.00]                                                                                


Epoch #24: test_reward: 11800.600000 ± 5294.948842, best_reward: 16419.400000 ± 5303.989427 in #13


Epoch #25: 1001it [00:02, 404.68it/s, env_step=25000, gradient_step=2500, len=250, n/ep=1, n/st=100, rew=11483.00]                                                                                


Epoch #25: test_reward: 10362.500000 ± 4557.522863, best_reward: 16419.400000 ± 5303.989427 in #13


Epoch #26: 1001it [00:02, 354.60it/s, env_step=26000, gradient_step=2600, len=166, n/ep=0, n/st=100, rew=6601.67]                                                                                 


Epoch #26: test_reward: 12212.500000 ± 5239.672266, best_reward: 16419.400000 ± 5303.989427 in #13


Epoch #27: 1001it [00:02, 406.59it/s, env_step=27000, gradient_step=2700, len=81, n/ep=2, n/st=100, rew=2464.50]                                                                                  


Epoch #27: test_reward: 16256.300000 ± 5841.854895, best_reward: 16419.400000 ± 5303.989427 in #13


Epoch #28: 1001it [00:03, 316.87it/s, env_step=28000, gradient_step=2800, len=276, n/ep=0, n/st=100, rew=14297.00]                                                                                


Epoch #28: test_reward: 14463.200000 ± 5698.297304, best_reward: 16419.400000 ± 5303.989427 in #13


Epoch #29: 1001it [00:02, 405.56it/s, env_step=29000, gradient_step=2900, len=288, n/ep=0, n/st=100, rew=13247.50]                                                                                


Epoch #29: test_reward: 12347.300000 ± 4132.696167, best_reward: 16419.400000 ± 5303.989427 in #13


Epoch #30: 1001it [00:02, 433.22it/s, env_step=30000, gradient_step=3000, len=147, n/ep=0, n/st=100, rew=6604.50]                                                                                 


Epoch #30: test_reward: 13593.200000 ± 4473.061073, best_reward: 16419.400000 ± 5303.989427 in #13


Epoch #31: 1001it [00:02, 440.88it/s, env_step=31000, gradient_step=3100, len=304, n/ep=0, n/st=100, rew=16555.00]                                                                                


Epoch #31: test_reward: 12464.300000 ± 2222.198373, best_reward: 16419.400000 ± 5303.989427 in #13


Epoch #32: 1001it [00:02, 437.31it/s, env_step=32000, gradient_step=3200, len=170, n/ep=1, n/st=100, rew=8427.00]                                                                                 


Epoch #32: test_reward: 14860.000000 ± 5472.433755, best_reward: 16419.400000 ± 5303.989427 in #13


Epoch #33: 1001it [00:02, 432.19it/s, env_step=33000, gradient_step=3300, len=167, n/ep=0, n/st=100, rew=8653.00]                                                                                 


Epoch #33: test_reward: 12513.600000 ± 4328.898848, best_reward: 16419.400000 ± 5303.989427 in #13


Epoch #34: 1001it [00:02, 388.83it/s, env_step=34000, gradient_step=3400, len=203, n/ep=0, n/st=100, rew=10799.00]                                                                                


Epoch #34: test_reward: 12331.900000 ± 4612.533002, best_reward: 16419.400000 ± 5303.989427 in #13


Epoch #35: 1001it [00:02, 400.31it/s, env_step=35000, gradient_step=3500, len=162, n/ep=0, n/st=100, rew=7247.00]                                                                                 


Epoch #35: test_reward: 13201.000000 ± 3575.760870, best_reward: 16419.400000 ± 5303.989427 in #13


Epoch #36: 1001it [00:02, 446.28it/s, env_step=36000, gradient_step=3600, len=156, n/ep=0, n/st=100, rew=7859.00]                                                                                 


Epoch #36: test_reward: 15427.200000 ± 5520.216967, best_reward: 16419.400000 ± 5303.989427 in #13


Epoch #37: 1001it [00:02, 409.97it/s, env_step=37000, gradient_step=3700, len=100, n/ep=0, n/st=100, rew=4418.00]                                                                                 


Epoch #37: test_reward: 15692.900000 ± 5144.838121, best_reward: 16419.400000 ± 5303.989427 in #13


Epoch #38: 1001it [00:02, 370.70it/s, env_step=38000, gradient_step=3800, len=178, n/ep=0, n/st=100, rew=9420.00]                                                                                 


Epoch #38: test_reward: 18040.700000 ± 3494.038295, best_reward: 18040.700000 ± 3494.038295 in #38


Epoch #39: 1001it [00:02, 355.19it/s, env_step=39000, gradient_step=3900, len=267, n/ep=0, n/st=100, rew=12680.50]                                                                                


Epoch #39: test_reward: 14325.700000 ± 5411.908832, best_reward: 18040.700000 ± 3494.038295 in #38


Epoch #40: 1001it [00:02, 425.42it/s, env_step=40000, gradient_step=4000, len=146, n/ep=1, n/st=100, rew=7212.50]                                                                                 


Epoch #40: test_reward: 13749.900000 ± 2541.799931, best_reward: 18040.700000 ± 3494.038295 in #38


Epoch #41: 1001it [00:02, 417.27it/s, env_step=41000, gradient_step=4100, len=208, n/ep=0, n/st=100, rew=12181.50]                                                                                


Epoch #41: test_reward: 14918.900000 ± 4614.096758, best_reward: 18040.700000 ± 3494.038295 in #38


Epoch #42: 1001it [00:02, 450.02it/s, env_step=42000, gradient_step=4200, len=164, n/ep=0, n/st=100, rew=7557.00]                                                                                 


Epoch #42: test_reward: 15522.100000 ± 5046.952832, best_reward: 18040.700000 ± 3494.038295 in #38


Epoch #43: 1001it [00:02, 440.54it/s, env_step=43000, gradient_step=4300, len=207, n/ep=1, n/st=100, rew=10602.00]                                                                                


Epoch #43: test_reward: 11815.700000 ± 6211.437177, best_reward: 18040.700000 ± 3494.038295 in #38


Epoch #44: 1001it [00:02, 395.28it/s, env_step=44000, gradient_step=4400, len=200, n/ep=1, n/st=100, rew=9717.00]                                                                                 


Epoch #44: test_reward: 14475.500000 ± 5790.324641, best_reward: 18040.700000 ± 3494.038295 in #38


Epoch #45: 1001it [00:02, 372.99it/s, env_step=45000, gradient_step=4500, len=244, n/ep=1, n/st=100, rew=13826.00]                                                                                


Epoch #45: test_reward: 17056.400000 ± 5809.075644, best_reward: 18040.700000 ± 3494.038295 in #38


Epoch #46: 1001it [00:02, 354.92it/s, env_step=46000, gradient_step=4600, len=237, n/ep=0, n/st=100, rew=12273.00]                                                                                


Epoch #46: test_reward: 15917.500000 ± 5280.528198, best_reward: 18040.700000 ± 3494.038295 in #38


Epoch #47: 1001it [00:02, 435.19it/s, env_step=47000, gradient_step=4700, len=178, n/ep=0, n/st=100, rew=8809.50]                                                                                 


Epoch #47: test_reward: 14628.900000 ± 3449.896475, best_reward: 18040.700000 ± 3494.038295 in #38


Epoch #48: 1001it [00:02, 430.62it/s, env_step=48000, gradient_step=4800, len=147, n/ep=0, n/st=100, rew=7385.75]                                                                                 


Epoch #48: test_reward: 9310.500000 ± 4303.347610, best_reward: 18040.700000 ± 3494.038295 in #38


Epoch #49: 1001it [00:02, 427.85it/s, env_step=49000, gradient_step=4900, len=62, n/ep=1, n/st=100, rew=2156.00]                                                                                  


Epoch #49: test_reward: 17991.400000 ± 5254.787059, best_reward: 18040.700000 ± 3494.038295 in #38


Epoch #50: 1001it [00:02, 431.67it/s, env_step=50000, gradient_step=5000, len=109, n/ep=0, n/st=100, rew=4954.00]                                                                                 


Epoch #50: test_reward: 12666.700000 ± 5683.464648, best_reward: 18040.700000 ± 3494.038295 in #38


Epoch #51: 1001it [00:02, 399.01it/s, env_step=51000, gradient_step=5100, len=109, n/ep=0, n/st=100, rew=4954.00]                                                                                 


Epoch #51: test_reward: 12536.400000 ± 6153.509685, best_reward: 18040.700000 ± 3494.038295 in #38


Epoch #52: 1001it [00:02, 342.53it/s, env_step=52000, gradient_step=5200, len=266, n/ep=0, n/st=100, rew=15646.00]                                                                                


Epoch #52: test_reward: 15483.100000 ± 6082.383915, best_reward: 18040.700000 ± 3494.038295 in #38


Epoch #53: 1001it [00:02, 443.55it/s, env_step=53000, gradient_step=5300, len=127, n/ep=2, n/st=100, rew=6341.00]                                                                                 


Epoch #53: test_reward: 13883.200000 ± 5639.971344, best_reward: 18040.700000 ± 3494.038295 in #38


Epoch #54: 1001it [00:02, 368.64it/s, env_step=54000, gradient_step=5400, len=303, n/ep=1, n/st=100, rew=18490.50]                                                                                


Epoch #54: test_reward: 13921.900000 ± 4730.344860, best_reward: 18040.700000 ± 3494.038295 in #38


Epoch #55: 1001it [00:02, 399.89it/s, env_step=55000, gradient_step=5500, len=363, n/ep=0, n/st=100, rew=21147.50]                                                                                


Epoch #55: test_reward: 14345.200000 ± 4063.883581, best_reward: 18040.700000 ± 3494.038295 in #38


Epoch #56: 1001it [00:02, 361.16it/s, env_step=56000, gradient_step=5600, len=118, n/ep=1, n/st=100, rew=5596.00]                                                                                 


Epoch #56: test_reward: 15679.800000 ± 3753.051180, best_reward: 18040.700000 ± 3494.038295 in #38


Epoch #57: 1001it [00:02, 402.70it/s, env_step=57000, gradient_step=5700, len=355, n/ep=0, n/st=100, rew=21811.00]                                                                                


Epoch #57: test_reward: 14303.400000 ± 4833.205669, best_reward: 18040.700000 ± 3494.038295 in #38


Epoch #58: 1001it [00:02, 401.59it/s, env_step=58000, gradient_step=5800, len=396, n/ep=0, n/st=100, rew=22531.00]                                                                                


Epoch #58: test_reward: 15077.200000 ± 6249.901756, best_reward: 18040.700000 ± 3494.038295 in #38


Epoch #59: 1001it [00:02, 398.18it/s, env_step=59000, gradient_step=5900, len=164, n/ep=0, n/st=100, rew=8685.75]                                                                                 


Epoch #59: test_reward: 12222.600000 ± 6922.515023, best_reward: 18040.700000 ± 3494.038295 in #38


Epoch #60: 1001it [00:02, 395.13it/s, env_step=60000, gradient_step=6000, len=280, n/ep=1, n/st=100, rew=16225.50]                                                                                


Epoch #60: test_reward: 13727.600000 ± 5492.809376, best_reward: 18040.700000 ± 3494.038295 in #38


Epoch #61: 1001it [00:02, 402.97it/s, env_step=61000, gradient_step=6100, len=152, n/ep=2, n/st=100, rew=7620.50]                                                                                 


Epoch #61: test_reward: 12272.200000 ± 4247.939661, best_reward: 18040.700000 ± 3494.038295 in #38


Epoch #62: 1001it [00:02, 426.55it/s, env_step=62000, gradient_step=6200, len=400, n/ep=1, n/st=100, rew=25189.00]                                                                                


Epoch #62: test_reward: 12172.800000 ± 2966.066176, best_reward: 18040.700000 ± 3494.038295 in #38


Epoch #63: 1001it [00:03, 329.48it/s, env_step=63000, gradient_step=6300, len=283, n/ep=0, n/st=100, rew=14943.50]                                                                                


Epoch #63: test_reward: 16057.800000 ± 5702.811917, best_reward: 18040.700000 ± 3494.038295 in #38


Epoch #64: 1001it [00:02, 410.80it/s, env_step=64000, gradient_step=6400, len=254, n/ep=0, n/st=100, rew=13880.33]                                                                                


Epoch #64: test_reward: 9767.700000 ± 4024.513686, best_reward: 18040.700000 ± 3494.038295 in #38


Epoch #65: 1001it [00:02, 439.69it/s, env_step=65000, gradient_step=6500, len=231, n/ep=0, n/st=100, rew=12665.00]                                                                                


Epoch #65: test_reward: 14777.400000 ± 5701.443452, best_reward: 18040.700000 ± 3494.038295 in #38


Epoch #66: 1001it [00:02, 443.02it/s, env_step=66000, gradient_step=6600, len=248, n/ep=2, n/st=100, rew=14547.25]                                                                                


Epoch #66: test_reward: 11872.400000 ± 5786.418481, best_reward: 18040.700000 ± 3494.038295 in #38


Epoch #67: 1001it [00:02, 374.66it/s, env_step=67000, gradient_step=6700, len=400, n/ep=0, n/st=100, rew=24216.00]                                                                                


Epoch #67: test_reward: 12834.900000 ± 4152.212240, best_reward: 18040.700000 ± 3494.038295 in #38


Epoch #68: 1001it [00:02, 411.30it/s, env_step=68000, gradient_step=6800, len=200, n/ep=0, n/st=100, rew=7251.50]                                                                                 


Epoch #68: test_reward: 11771.800000 ± 3523.708098, best_reward: 18040.700000 ± 3494.038295 in #38


Epoch #69: 1001it [00:02, 390.45it/s, env_step=69000, gradient_step=6900, len=163, n/ep=1, n/st=100, rew=9140.00]                                                                                 


Epoch #69: test_reward: 12007.000000 ± 2587.448937, best_reward: 18040.700000 ± 3494.038295 in #38


Epoch #70: 1001it [00:02, 443.62it/s, env_step=70000, gradient_step=7000, len=144, n/ep=0, n/st=100, rew=7761.33]                                                                                 


Epoch #70: test_reward: 20673.600000 ± 6989.906540, best_reward: 20673.600000 ± 6989.906540 in #70


Epoch #71: 1001it [00:02, 422.36it/s, env_step=71000, gradient_step=7100, len=386, n/ep=1, n/st=100, rew=23505.00]                                                                                


Epoch #71: test_reward: 17051.200000 ± 5989.099679, best_reward: 20673.600000 ± 6989.906540 in #70


Epoch #72: 1001it [00:03, 319.52it/s, env_step=72000, gradient_step=7200, len=82, n/ep=0, n/st=100, rew=4065.50]                                                                                  


Epoch #72: test_reward: 12339.800000 ± 4899.531688, best_reward: 20673.600000 ± 6989.906540 in #70


Epoch #73: 1001it [00:02, 438.17it/s, env_step=73000, gradient_step=7300, len=305, n/ep=0, n/st=100, rew=17775.50]                                                                                


Epoch #73: test_reward: 11482.900000 ± 4019.560584, best_reward: 20673.600000 ± 6989.906540 in #70


Epoch #74: 1001it [00:02, 408.58it/s, env_step=74000, gradient_step=7400, len=184, n/ep=1, n/st=100, rew=11034.50]                                                                                


Epoch #74: test_reward: 13401.700000 ± 6171.796773, best_reward: 20673.600000 ± 6989.906540 in #70


Epoch #75: 1001it [00:02, 355.72it/s, env_step=75000, gradient_step=7500, len=333, n/ep=0, n/st=100, rew=19821.00]                                                                                


Epoch #75: test_reward: 10420.700000 ± 3642.930525, best_reward: 20673.600000 ± 6989.906540 in #70


Epoch #76: 1001it [00:02, 395.72it/s, env_step=76000, gradient_step=7600, len=198, n/ep=0, n/st=100, rew=11722.00]                                                                                


Epoch #76: test_reward: 10540.900000 ± 3542.518919, best_reward: 20673.600000 ± 6989.906540 in #70


Epoch #77: 1001it [00:02, 371.54it/s, env_step=77000, gradient_step=7700, len=161, n/ep=2, n/st=100, rew=9203.75]                                                                                 


Epoch #77: test_reward: 9946.600000 ± 4343.737658, best_reward: 20673.600000 ± 6989.906540 in #70


Epoch #78: 1001it [00:03, 312.39it/s, env_step=78000, gradient_step=7800, len=177, n/ep=0, n/st=100, rew=9969.50]                                                                                 


Epoch #78: test_reward: 12164.200000 ± 5509.117512, best_reward: 20673.600000 ± 6989.906540 in #70


Epoch #79: 1001it [00:02, 338.12it/s, env_step=79000, gradient_step=7900, len=126, n/ep=0, n/st=100, rew=6200.00]                                                                                 


Epoch #79: test_reward: 8767.300000 ± 2823.704094, best_reward: 20673.600000 ± 6989.906540 in #70


Epoch #80: 1001it [00:02, 351.67it/s, env_step=80000, gradient_step=8000, len=251, n/ep=1, n/st=100, rew=15536.00]                                                                                


Epoch #80: test_reward: 10344.700000 ± 3849.657441, best_reward: 20673.600000 ± 6989.906540 in #70


Epoch #81: 1001it [00:03, 326.30it/s, env_step=81000, gradient_step=8100, len=369, n/ep=0, n/st=100, rew=21561.00]                                                                                


Epoch #81: test_reward: 10468.600000 ± 5239.186105, best_reward: 20673.600000 ± 6989.906540 in #70


Epoch #82: 1001it [00:02, 399.37it/s, env_step=82000, gradient_step=8200, len=259, n/ep=2, n/st=100, rew=14907.50]                                                                                


Epoch #82: test_reward: 12748.800000 ± 4045.827895, best_reward: 20673.600000 ± 6989.906540 in #70


Epoch #83: 1001it [00:03, 330.39it/s, env_step=83000, gradient_step=8300, len=349, n/ep=0, n/st=100, rew=22306.00]                                                                                


Epoch #83: test_reward: 12171.100000 ± 4380.385975, best_reward: 20673.600000 ± 6989.906540 in #70


Epoch #84: 1001it [00:02, 360.56it/s, env_step=84000, gradient_step=8400, len=130, n/ep=2, n/st=100, rew=6636.75]                                                                                 


Epoch #84: test_reward: 10786.200000 ± 3196.469452, best_reward: 20673.600000 ± 6989.906540 in #70


Epoch #85: 1001it [00:02, 388.21it/s, env_step=85000, gradient_step=8500, len=140, n/ep=0, n/st=100, rew=6852.50]                                                                                 


Epoch #85: test_reward: 10493.400000 ± 3150.195778, best_reward: 20673.600000 ± 6989.906540 in #70


Epoch #86: 1001it [00:03, 320.43it/s, env_step=86000, gradient_step=8600, len=86, n/ep=1, n/st=100, rew=4209.00]                                                                                  


Epoch #86: test_reward: 10318.600000 ± 3870.345080, best_reward: 20673.600000 ± 6989.906540 in #70


Epoch #87: 1001it [00:02, 365.69it/s, env_step=87000, gradient_step=8700, len=171, n/ep=0, n/st=100, rew=10036.50]                                                                                


Epoch #87: test_reward: 12458.000000 ± 4344.110588, best_reward: 20673.600000 ± 6989.906540 in #70


Epoch #88: 1001it [00:02, 366.55it/s, env_step=88000, gradient_step=8800, len=224, n/ep=0, n/st=100, rew=11848.75]                                                                                


Epoch #88: test_reward: 8461.200000 ± 4984.248385, best_reward: 20673.600000 ± 6989.906540 in #70


Epoch #89: 1001it [00:02, 402.62it/s, env_step=89000, gradient_step=8900, len=58, n/ep=1, n/st=100, rew=2289.00]                                                                                  


Epoch #89: test_reward: 12843.500000 ± 6356.071700, best_reward: 20673.600000 ± 6989.906540 in #70


Epoch #90: 1001it [00:02, 370.31it/s, env_step=90000, gradient_step=9000, len=375, n/ep=1, n/st=100, rew=22458.50]                                                                                


Epoch #90: test_reward: 7585.200000 ± 2872.714981, best_reward: 20673.600000 ± 6989.906540 in #70


Epoch #91: 1001it [00:03, 312.59it/s, env_step=91000, gradient_step=9100, len=213, n/ep=0, n/st=100, rew=12789.00]                                                                                


Epoch #91: test_reward: 8669.700000 ± 4868.816161, best_reward: 20673.600000 ± 6989.906540 in #70


Epoch #92: 1001it [00:02, 375.96it/s, env_step=92000, gradient_step=9200, len=132, n/ep=0, n/st=100, rew=7171.00]                                                                                 


Epoch #92: test_reward: 10521.500000 ± 2778.448749, best_reward: 20673.600000 ± 6989.906540 in #70


Epoch #93: 1001it [00:02, 380.69it/s, env_step=93000, gradient_step=9300, len=299, n/ep=1, n/st=100, rew=15985.50]                                                                                


Epoch #93: test_reward: 11700.900000 ± 3699.057568, best_reward: 20673.600000 ± 6989.906540 in #70


Epoch #94: 1001it [00:02, 421.53it/s, env_step=94000, gradient_step=9400, len=84, n/ep=1, n/st=100, rew=3625.50]                                                                                  


Epoch #94: test_reward: 6256.800000 ± 2529.670524, best_reward: 20673.600000 ± 6989.906540 in #70


Epoch #95: 1001it [00:02, 428.21it/s, env_step=95000, gradient_step=9500, len=292, n/ep=1, n/st=100, rew=18831.00]                                                                                


Epoch #95: test_reward: 12821.600000 ± 5307.433338, best_reward: 20673.600000 ± 6989.906540 in #70


Epoch #96: 1001it [00:02, 389.01it/s, env_step=96000, gradient_step=9600, len=177, n/ep=2, n/st=100, rew=10175.00]                                                                                


Epoch #96: test_reward: 14551.400000 ± 4133.658917, best_reward: 20673.600000 ± 6989.906540 in #70


Epoch #97: 1001it [00:02, 401.04it/s, env_step=97000, gradient_step=9700, len=248, n/ep=0, n/st=100, rew=14510.50]                                                                                


Epoch #97: test_reward: 13081.800000 ± 6018.687246, best_reward: 20673.600000 ± 6989.906540 in #70


Epoch #98: 1001it [00:02, 382.32it/s, env_step=98000, gradient_step=9800, len=157, n/ep=0, n/st=100, rew=9212.33]                                                                                 


Epoch #98: test_reward: 8716.200000 ± 4301.515287, best_reward: 20673.600000 ± 6989.906540 in #70


Epoch #99: 1001it [00:02, 387.39it/s, env_step=99000, gradient_step=9900, len=211, n/ep=1, n/st=100, rew=12752.00]                                                                                


Epoch #99: test_reward: 9428.800000 ± 3531.254531, best_reward: 20673.600000 ± 6989.906540 in #70


Epoch #100: 1001it [00:02, 373.86it/s, env_step=100000, gradient_step=10000, len=133, n/ep=1, n/st=100, rew=6018.50]                                                                              


Epoch #100: test_reward: 7473.600000 ± 3413.079466, best_reward: 20673.600000 ± 6989.906540 in #70


Epoch #101: 1001it [00:02, 410.85it/s, env_step=101000, gradient_step=10100, len=96, n/ep=0, n/st=100, rew=4897.00]                                                                               


Epoch #101: test_reward: 9046.700000 ± 2126.169704, best_reward: 20673.600000 ± 6989.906540 in #70


Epoch #102: 1001it [00:02, 407.45it/s, env_step=102000, gradient_step=10200, len=185, n/ep=0, n/st=100, rew=10233.00]                                                                             


Epoch #102: test_reward: 11456.300000 ± 2731.355929, best_reward: 20673.600000 ± 6989.906540 in #70


Epoch #103: 1001it [00:02, 427.85it/s, env_step=103000, gradient_step=10300, len=48, n/ep=0, n/st=100, rew=2062.00]                                                                               


Epoch #103: test_reward: 16568.000000 ± 5651.008282, best_reward: 20673.600000 ± 6989.906540 in #70


Epoch #104: 1001it [00:02, 413.40it/s, env_step=104000, gradient_step=10400, len=117, n/ep=0, n/st=100, rew=6035.00]                                                                              


Epoch #104: test_reward: 11323.000000 ± 3385.113676, best_reward: 20673.600000 ± 6989.906540 in #70


Epoch #105: 1001it [00:02, 355.30it/s, env_step=105000, gradient_step=10500, len=140, n/ep=0, n/st=100, rew=7379.00]                                                                              


Epoch #105: test_reward: 8455.100000 ± 3965.484648, best_reward: 20673.600000 ± 6989.906540 in #70


Epoch #106: 1001it [00:02, 409.47it/s, env_step=106000, gradient_step=10600, len=154, n/ep=1, n/st=100, rew=8170.00]                                                                              


Epoch #106: test_reward: 11648.100000 ± 2593.271619, best_reward: 20673.600000 ± 6989.906540 in #70


Epoch #107: 1001it [00:02, 400.01it/s, env_step=107000, gradient_step=10700, len=138, n/ep=0, n/st=100, rew=7296.00]                                                                              


Epoch #107: test_reward: 10107.100000 ± 2821.925529, best_reward: 20673.600000 ± 6989.906540 in #70


Epoch #108: 1001it [00:02, 370.44it/s, env_step=108000, gradient_step=10800, len=213, n/ep=0, n/st=100, rew=12761.00]                                                                             


Epoch #108: test_reward: 8108.100000 ± 3533.404716, best_reward: 20673.600000 ± 6989.906540 in #70


Epoch #109: 1001it [00:03, 319.47it/s, env_step=109000, gradient_step=10900, len=124, n/ep=0, n/st=100, rew=7362.00]                                                                              


Epoch #109: test_reward: 9165.600000 ± 3264.575905, best_reward: 20673.600000 ± 6989.906540 in #70


Epoch #110: 1001it [00:02, 388.46it/s, env_step=110000, gradient_step=11000, len=181, n/ep=2, n/st=100, rew=10561.50]                                                                             


Epoch #110: test_reward: 8687.100000 ± 3392.391677, best_reward: 20673.600000 ± 6989.906540 in #70


Epoch #111: 1001it [00:03, 307.15it/s, env_step=111000, gradient_step=11100, len=113, n/ep=0, n/st=100, rew=5952.00]                                                                              


Epoch #111: test_reward: 8103.700000 ± 4009.352942, best_reward: 20673.600000 ± 6989.906540 in #70


Epoch #112: 1001it [00:03, 316.07it/s, env_step=112000, gradient_step=11200, len=136, n/ep=0, n/st=100, rew=7559.50]                                                                              


Epoch #112: test_reward: 9351.500000 ± 3860.773737, best_reward: 20673.600000 ± 6989.906540 in #70


Epoch #113: 1001it [00:03, 333.14it/s, env_step=113000, gradient_step=11300, len=179, n/ep=0, n/st=100, rew=9669.50]                                                                              


Epoch #113: test_reward: 10167.200000 ± 4233.338111, best_reward: 20673.600000 ± 6989.906540 in #70


Epoch #114: 1001it [00:02, 382.30it/s, env_step=114000, gradient_step=11400, len=162, n/ep=1, n/st=100, rew=9510.00]                                                                              


Epoch #114: test_reward: 8019.900000 ± 3956.889750, best_reward: 20673.600000 ± 6989.906540 in #70


Epoch #115: 1001it [00:02, 369.02it/s, env_step=115000, gradient_step=11500, len=138, n/ep=0, n/st=100, rew=8515.25]                                                                              


Epoch #115: test_reward: 8138.100000 ± 2511.021324, best_reward: 20673.600000 ± 6989.906540 in #70


Epoch #116: 1001it [00:02, 376.37it/s, env_step=116000, gradient_step=11600, len=143, n/ep=2, n/st=100, rew=8302.00]                                                                              


Epoch #116: test_reward: 10842.800000 ± 3183.319393, best_reward: 20673.600000 ± 6989.906540 in #70


Epoch #117: 1001it [00:02, 351.71it/s, env_step=117000, gradient_step=11700, len=278, n/ep=2, n/st=100, rew=16927.25]                                                                             


Epoch #117: test_reward: 11134.000000 ± 4024.964298, best_reward: 20673.600000 ± 6989.906540 in #70


Epoch #118: 1001it [00:02, 411.31it/s, env_step=118000, gradient_step=11800, len=173, n/ep=1, n/st=100, rew=10718.50]                                                                             


Epoch #118: test_reward: 6856.800000 ± 3916.193070, best_reward: 20673.600000 ± 6989.906540 in #70


Epoch #119: 1001it [00:02, 371.66it/s, env_step=119000, gradient_step=11900, len=265, n/ep=0, n/st=100, rew=16264.00]                                                                             


Epoch #119: test_reward: 9200.600000 ± 4976.621267, best_reward: 20673.600000 ± 6989.906540 in #70


Epoch #120: 1001it [00:02, 387.31it/s, env_step=120000, gradient_step=12000, len=138, n/ep=0, n/st=100, rew=7556.00]                                                                              


Epoch #120: test_reward: 12216.300000 ± 5781.799167, best_reward: 20673.600000 ± 6989.906540 in #70


Epoch #121: 1001it [00:02, 411.23it/s, env_step=121000, gradient_step=12100, len=212, n/ep=1, n/st=100, rew=13024.50]                                                                             


Epoch #121: test_reward: 13595.800000 ± 3909.022558, best_reward: 20673.600000 ± 6989.906540 in #70


Epoch #122: 1001it [00:02, 422.50it/s, env_step=122000, gradient_step=12200, len=221, n/ep=0, n/st=100, rew=13624.00]                                                                             


Epoch #122: test_reward: 10971.200000 ± 4803.834485, best_reward: 20673.600000 ± 6989.906540 in #70


Epoch #123: 1001it [00:02, 402.68it/s, env_step=123000, gradient_step=12300, len=130, n/ep=1, n/st=100, rew=7569.00]                                                                              


Epoch #123: test_reward: 9967.300000 ± 4420.219679, best_reward: 20673.600000 ± 6989.906540 in #70


Epoch #124: 1001it [00:02, 369.29it/s, env_step=124000, gradient_step=12400, len=173, n/ep=0, n/st=100, rew=9958.25]                                                                              


Epoch #124: test_reward: 11685.600000 ± 4126.693088, best_reward: 20673.600000 ± 6989.906540 in #70


Epoch #125: 1001it [00:02, 417.84it/s, env_step=125000, gradient_step=12500, len=225, n/ep=1, n/st=100, rew=13001.00]                                                                             


Epoch #125: test_reward: 11587.600000 ± 5980.144450, best_reward: 20673.600000 ± 6989.906540 in #70


Epoch #126: 1001it [00:02, 341.64it/s, env_step=126000, gradient_step=12600, len=156, n/ep=1, n/st=100, rew=9148.50]                                                                              


Epoch #126: test_reward: 8988.900000 ± 4400.090737, best_reward: 20673.600000 ± 6989.906540 in #70


Epoch #127: 1001it [00:02, 385.78it/s, env_step=127000, gradient_step=12700, len=170, n/ep=1, n/st=100, rew=10052.00]                                                                             


Epoch #127: test_reward: 9462.200000 ± 4677.275121, best_reward: 20673.600000 ± 6989.906540 in #70


Epoch #128: 1001it [00:03, 317.29it/s, env_step=128000, gradient_step=12800, len=156, n/ep=0, n/st=100, rew=9389.00]                                                                              


Epoch #128: test_reward: 6842.700000 ± 3480.873283, best_reward: 20673.600000 ± 6989.906540 in #70


Epoch #129: 1001it [00:02, 345.57it/s, env_step=129000, gradient_step=12900, len=76, n/ep=0, n/st=100, rew=3760.00]                                                                               


Epoch #129: test_reward: 8141.900000 ± 6628.435780, best_reward: 20673.600000 ± 6989.906540 in #70


Epoch #130: 1001it [00:02, 393.58it/s, env_step=130000, gradient_step=13000, len=275, n/ep=0, n/st=100, rew=17824.50]                                                                             


Epoch #130: test_reward: 7044.200000 ± 2735.721214, best_reward: 20673.600000 ± 6989.906540 in #70


Epoch #131: 1001it [00:02, 374.49it/s, env_step=131000, gradient_step=13100, len=222, n/ep=0, n/st=100, rew=14124.50]                                                                             


Epoch #131: test_reward: 8160.400000 ± 4296.339284, best_reward: 20673.600000 ± 6989.906540 in #70


Epoch #132: 1001it [00:02, 412.34it/s, env_step=132000, gradient_step=13200, len=199, n/ep=0, n/st=100, rew=12156.00]                                                                             


Epoch #132: test_reward: 7829.600000 ± 3644.255073, best_reward: 20673.600000 ± 6989.906540 in #70


Epoch #133: 1001it [00:02, 399.95it/s, env_step=133000, gradient_step=13300, len=148, n/ep=1, n/st=100, rew=8812.50]                                                                              


Epoch #133: test_reward: 7793.200000 ± 4295.469539, best_reward: 20673.600000 ± 6989.906540 in #70


Epoch #134: 1001it [00:02, 366.00it/s, env_step=134000, gradient_step=13400, len=90, n/ep=1, n/st=100, rew=5103.50]                                                                               


Epoch #134: test_reward: 9759.700000 ± 4508.959837, best_reward: 20673.600000 ± 6989.906540 in #70


Epoch #135: 1001it [00:02, 403.02it/s, env_step=135000, gradient_step=13500, len=183, n/ep=1, n/st=100, rew=11181.50]                                                                             


Epoch #135: test_reward: 6414.900000 ± 3411.866306, best_reward: 20673.600000 ± 6989.906540 in #70


Epoch #136: 1001it [00:02, 413.20it/s, env_step=136000, gradient_step=13600, len=164, n/ep=1, n/st=100, rew=10284.00]                                                                             


Epoch #136: test_reward: 10038.700000 ± 3818.694045, best_reward: 20673.600000 ± 6989.906540 in #70


Epoch #137: 1001it [00:02, 406.96it/s, env_step=137000, gradient_step=13700, len=169, n/ep=1, n/st=100, rew=10576.50]                                                                             


Epoch #137: test_reward: 10271.800000 ± 4711.404733, best_reward: 20673.600000 ± 6989.906540 in #70


Epoch #138: 1001it [00:02, 384.65it/s, env_step=138000, gradient_step=13800, len=65, n/ep=1, n/st=100, rew=2949.00]                                                                               


Epoch #138: test_reward: 7703.900000 ± 3655.986883, best_reward: 20673.600000 ± 6989.906540 in #70


Epoch #139: 1001it [00:02, 398.18it/s, env_step=139000, gradient_step=13900, len=219, n/ep=0, n/st=100, rew=13822.50]                                                                             


Epoch #139: test_reward: 7033.600000 ± 2174.301138, best_reward: 20673.600000 ± 6989.906540 in #70


Epoch #140: 1001it [00:02, 401.83it/s, env_step=140000, gradient_step=14000, len=209, n/ep=1, n/st=100, rew=12504.50]                                                                             


Epoch #140: test_reward: 6604.700000 ± 3652.575203, best_reward: 20673.600000 ± 6989.906540 in #70


Epoch #141: 1001it [00:02, 371.33it/s, env_step=141000, gradient_step=14100, len=128, n/ep=1, n/st=100, rew=7381.00]                                                                              


Epoch #141: test_reward: 9259.400000 ± 3580.354066, best_reward: 20673.600000 ± 6989.906540 in #70


Epoch #142: 1001it [00:02, 400.88it/s, env_step=142000, gradient_step=14200, len=186, n/ep=1, n/st=100, rew=11341.00]                                                                             


Epoch #142: test_reward: 9858.200000 ± 2812.730055, best_reward: 20673.600000 ± 6989.906540 in #70


Epoch #143: 1001it [00:02, 370.56it/s, env_step=143000, gradient_step=14300, len=165, n/ep=1, n/st=100, rew=8917.50]                                                                              


Epoch #143: test_reward: 9727.600000 ± 5858.238203, best_reward: 20673.600000 ± 6989.906540 in #70


Epoch #144: 1001it [00:02, 348.94it/s, env_step=144000, gradient_step=14400, len=88, n/ep=0, n/st=100, rew=4913.50]                                                                               


Epoch #144: test_reward: 10940.400000 ± 4378.491594, best_reward: 20673.600000 ± 6989.906540 in #70


Epoch #145: 1001it [00:02, 412.19it/s, env_step=145000, gradient_step=14500, len=135, n/ep=1, n/st=100, rew=7473.00]                                                                              


Epoch #145: test_reward: 7449.300000 ± 3825.655605, best_reward: 20673.600000 ± 6989.906540 in #70


Epoch #146: 1001it [00:02, 396.21it/s, env_step=146000, gradient_step=14600, len=119, n/ep=1, n/st=100, rew=5712.50]                                                                              


Epoch #146: test_reward: 10131.800000 ± 3613.530955, best_reward: 20673.600000 ± 6989.906540 in #70


Epoch #147: 1001it [00:02, 342.13it/s, env_step=147000, gradient_step=14700, len=106, n/ep=3, n/st=100, rew=5986.67]                                                                              


Epoch #147: test_reward: 8448.800000 ± 3601.264606, best_reward: 20673.600000 ± 6989.906540 in #70


Epoch #148: 1001it [00:02, 433.60it/s, env_step=148000, gradient_step=14800, len=145, n/ep=0, n/st=100, rew=8439.25]                                                                              


Epoch #148: test_reward: 11083.000000 ± 3938.794054, best_reward: 20673.600000 ± 6989.906540 in #70


Epoch #149: 1001it [00:03, 333.34it/s, env_step=149000, gradient_step=14900, len=101, n/ep=0, n/st=100, rew=5569.50]                                                                              


Epoch #149: test_reward: 13927.800000 ± 5072.497093, best_reward: 20673.600000 ± 6989.906540 in #70


Epoch #150: 1001it [00:02, 395.73it/s, env_step=150000, gradient_step=15000, len=155, n/ep=1, n/st=100, rew=8532.00]                                                                              


Epoch #150: test_reward: 12750.800000 ± 2904.937342, best_reward: 20673.600000 ± 6989.906540 in #70

InfoStats(gradient_step=15000, best_reward=20673.6, best_reward_std=6989.906540147729, train_step=150000, train_episode=776, test_step=331124, test_episode=1510, timing=TimingStats(total_time=560.9968161582947, train_time=386.6857409477234, train_time_collect=51.813976764678955, train_time_update=329.03489351272583, test_time=174.3110752105713, update_speed=387.9118987743557))

(the trained policy can be accessed via policy.policies[agents[1]])
Train with agent_opponent #0


Epoch #1: 1001it [00:01, 729.07it/s, env_step=1000, gradient_step=100, len=0, n/ep=0, n/st=100, rew=0.00]                                                                                         


Epoch #1: test_reward: 7878.600000 ± 3677.200680, best_reward: 8377.900000 ± 3235.516851 in #0


Epoch #2: 1001it [00:02, 485.80it/s, env_step=2000, gradient_step=200, len=0, n/ep=0, n/st=100, rew=0.00]                                                                                         


Epoch #2: test_reward: 10700.200000 ± 2955.814162, best_reward: 10700.200000 ± 2955.814162 in #2


Epoch #3: 1001it [00:01, 623.57it/s, env_step=3000, gradient_step=300, len=0, n/ep=0, n/st=100, rew=0.00]                                                                                         


Epoch #3: test_reward: 2682.000000 ± 1308.027217, best_reward: 10700.200000 ± 2955.814162 in #2


Epoch #4: 1001it [00:01, 735.14it/s, env_step=4000, gradient_step=400, len=0, n/ep=0, n/st=100, rew=0.00]                                                                                         


Epoch #4: test_reward: 6765.700000 ± 2913.009648, best_reward: 10700.200000 ± 2955.814162 in #2


Epoch #5: 1001it [00:01, 679.03it/s, env_step=5000, gradient_step=500, len=0, n/ep=0, n/st=100, rew=0.00]                                                                                         


Epoch #5: test_reward: 9877.200000 ± 5892.263619, best_reward: 10700.200000 ± 2955.814162 in #2


Epoch #6: 1001it [00:01, 748.74it/s, env_step=6000, gradient_step=600, len=0, n/ep=0, n/st=100, rew=0.00]                                                                                         


Epoch #6: test_reward: 11874.300000 ± 6604.649030, best_reward: 11874.300000 ± 6604.649030 in #6


Epoch #7: 1001it [00:01, 602.51it/s, env_step=7000, gradient_step=700, len=68, n/ep=0, n/st=100, rew=2420.00]                                                                                     


Epoch #7: test_reward: 10518.200000 ± 5230.415467, best_reward: 11874.300000 ± 6604.649030 in #6


Epoch #8: 1001it [00:01, 596.21it/s, env_step=8000, gradient_step=800, len=76, n/ep=0, n/st=100, rew=2881.00]                                                                                     


Epoch #8: test_reward: 13188.500000 ± 3291.089432, best_reward: 13188.500000 ± 3291.089432 in #8


Epoch #9: 1001it [00:02, 447.72it/s, env_step=9000, gradient_step=900, len=76, n/ep=0, n/st=100, rew=2881.00]                                                                                     


Epoch #9: test_reward: 11113.000000 ± 5153.474750, best_reward: 13188.500000 ± 3291.089432 in #8


Epoch #10: 1001it [00:01, 565.42it/s, env_step=10000, gradient_step=1000, len=76, n/ep=0, n/st=100, rew=2881.00]                                                                                  


Epoch #10: test_reward: 9363.200000 ± 3203.096777, best_reward: 13188.500000 ± 3291.089432 in #8


Epoch #11: 1001it [00:02, 443.53it/s, env_step=11000, gradient_step=1100, len=101, n/ep=0, n/st=100, rew=4472.25]                                                                                 


Epoch #11: test_reward: 9449.800000 ± 2000.363657, best_reward: 13188.500000 ± 3291.089432 in #8


Epoch #12: 1001it [00:02, 459.01it/s, env_step=12000, gradient_step=1200, len=101, n/ep=0, n/st=100, rew=4472.25]                                                                                 


Epoch #12: test_reward: 8602.200000 ± 3610.275026, best_reward: 13188.500000 ± 3291.089432 in #8


Epoch #13: 1001it [00:02, 484.71it/s, env_step=13000, gradient_step=1300, len=127, n/ep=0, n/st=100, rew=5844.00]                                                                                 


Epoch #13: test_reward: 11539.200000 ± 4509.826799, best_reward: 13188.500000 ± 3291.089432 in #8


Epoch #14: 1001it [00:01, 536.38it/s, env_step=14000, gradient_step=1400, len=139, n/ep=0, n/st=100, rew=6151.00]                                                                                 


Epoch #14: test_reward: 8633.800000 ± 4482.777750, best_reward: 13188.500000 ± 3291.089432 in #8


Epoch #15: 1001it [00:02, 481.85it/s, env_step=15000, gradient_step=1500, len=149, n/ep=0, n/st=100, rew=6353.75]                                                                                 


Epoch #15: test_reward: 13545.400000 ± 4525.740076, best_reward: 13545.400000 ± 4525.740076 in #15


Epoch #16: 1001it [00:01, 590.29it/s, env_step=16000, gradient_step=1600, len=159, n/ep=0, n/st=100, rew=7591.00]                                                                                 


Epoch #16: test_reward: 9911.900000 ± 4987.898965, best_reward: 13545.400000 ± 4525.740076 in #15


Epoch #17: 1001it [00:01, 585.65it/s, env_step=17000, gradient_step=1700, len=166, n/ep=0, n/st=100, rew=6949.00]                                                                                 


Epoch #17: test_reward: 16821.100000 ± 3818.017246, best_reward: 16821.100000 ± 3818.017246 in #17


Epoch #18: 1001it [00:01, 546.56it/s, env_step=18000, gradient_step=1800, len=179, n/ep=0, n/st=100, rew=8459.50]                                                                                 


Epoch #18: test_reward: 12583.500000 ± 3417.101264, best_reward: 16821.100000 ± 3818.017246 in #17


Epoch #19: 1001it [00:01, 587.51it/s, env_step=19000, gradient_step=1900, len=189, n/ep=0, n/st=100, rew=8750.00]                                                                                 


Epoch #19: test_reward: 12002.400000 ± 2934.936156, best_reward: 16821.100000 ± 3818.017246 in #17


Epoch #20: 1001it [00:01, 538.05it/s, env_step=20000, gradient_step=2000, len=200, n/ep=1, n/st=100, rew=8406.00]                                                                                 


Epoch #20: test_reward: 8197.400000 ± 3011.321444, best_reward: 16821.100000 ± 3818.017246 in #17


Epoch #21: 1001it [00:01, 510.09it/s, env_step=21000, gradient_step=2100, len=208, n/ep=0, n/st=100, rew=10299.00]                                                                                


Epoch #21: test_reward: 10381.900000 ± 6672.810854, best_reward: 16821.100000 ± 3818.017246 in #17


Epoch #22: 1001it [00:02, 498.01it/s, env_step=22000, gradient_step=2200, len=213, n/ep=0, n/st=100, rew=11497.00]                                                                                


Epoch #22: test_reward: 6116.500000 ± 5094.490500, best_reward: 16821.100000 ± 3818.017246 in #17


Epoch #23: 1001it [00:01, 549.89it/s, env_step=23000, gradient_step=2300, len=152, n/ep=0, n/st=100, rew=6964.00]                                                                                 


Epoch #23: test_reward: 15111.600000 ± 5211.951539, best_reward: 16821.100000 ± 3818.017246 in #17


Epoch #24: 1001it [00:01, 576.58it/s, env_step=24000, gradient_step=2400, len=239, n/ep=0, n/st=100, rew=13827.00]                                                                                


Epoch #24: test_reward: 14621.400000 ± 4018.626537, best_reward: 16821.100000 ± 3818.017246 in #17


Epoch #25: 1001it [00:01, 582.20it/s, env_step=25000, gradient_step=2500, len=247, n/ep=0, n/st=100, rew=12564.75]                                                                                


Epoch #25: test_reward: 13450.000000 ± 1943.657789, best_reward: 16821.100000 ± 3818.017246 in #17


Epoch #26: 1001it [00:02, 461.82it/s, env_step=26000, gradient_step=2600, len=259, n/ep=0, n/st=100, rew=13578.25]                                                                                


Epoch #26: test_reward: 7493.600000 ± 5013.905528, best_reward: 16821.100000 ± 3818.017246 in #17


Epoch #27: 1001it [00:01, 507.84it/s, env_step=27000, gradient_step=2700, len=265, n/ep=0, n/st=100, rew=14334.00]                                                                                


Epoch #27: test_reward: 10818.400000 ± 4602.562486, best_reward: 16821.100000 ± 3818.017246 in #17


Epoch #28: 1001it [00:01, 546.67it/s, env_step=28000, gradient_step=2800, len=277, n/ep=0, n/st=100, rew=12706.00]                                                                                


Epoch #28: test_reward: 8233.200000 ± 2793.409415, best_reward: 16821.100000 ± 3818.017246 in #17


Epoch #29: 1001it [00:01, 603.39it/s, env_step=29000, gradient_step=2900, len=289, n/ep=0, n/st=100, rew=13721.75]                                                                                


Epoch #29: test_reward: 9055.200000 ± 2108.108954, best_reward: 16821.100000 ± 3818.017246 in #17


Epoch #30: 1001it [00:02, 487.15it/s, env_step=30000, gradient_step=3000, len=246, n/ep=0, n/st=100, rew=12802.17]                                                                                


Epoch #30: test_reward: 13764.000000 ± 4911.373820, best_reward: 16821.100000 ± 3818.017246 in #17


Epoch #31: 1001it [00:01, 604.96it/s, env_step=31000, gradient_step=3100, len=105, n/ep=1, n/st=100, rew=3220.50]                                                                                 


Epoch #31: test_reward: 6417.800000 ± 2142.791021, best_reward: 16821.100000 ± 3818.017246 in #17


Epoch #32: 1001it [00:02, 474.98it/s, env_step=32000, gradient_step=3200, len=313, n/ep=0, n/st=100, rew=15092.00]                                                                                


Epoch #32: test_reward: 6550.400000 ± 1708.042341, best_reward: 16821.100000 ± 3818.017246 in #17


Epoch #33: 1001it [00:02, 416.53it/s, env_step=33000, gradient_step=3300, len=139, n/ep=1, n/st=100, rew=4451.00]                                                                                 


Epoch #33: test_reward: 7034.200000 ± 2843.191932, best_reward: 16821.100000 ± 3818.017246 in #17


Epoch #34: 1001it [00:02, 468.19it/s, env_step=34000, gradient_step=3400, len=199, n/ep=0, n/st=100, rew=9514.00]                                                                                 


Epoch #34: test_reward: 9128.100000 ± 3541.298532, best_reward: 16821.100000 ± 3818.017246 in #17


Epoch #35: 1001it [00:01, 501.71it/s, env_step=35000, gradient_step=3500, len=227, n/ep=0, n/st=100, rew=11546.00]                                                                                


Epoch #35: test_reward: 6134.000000 ± 5264.613414, best_reward: 16821.100000 ± 3818.017246 in #17


Epoch #36: 1001it [00:02, 462.91it/s, env_step=36000, gradient_step=3600, len=156, n/ep=0, n/st=100, rew=7505.00]                                                                                 


Epoch #36: test_reward: 13905.600000 ± 2253.500885, best_reward: 16821.100000 ± 3818.017246 in #17


Epoch #37: 1001it [00:01, 568.89it/s, env_step=37000, gradient_step=3700, len=151, n/ep=0, n/st=100, rew=6181.50]                                                                                 


Epoch #37: test_reward: 9296.800000 ± 2330.358977, best_reward: 16821.100000 ± 3818.017246 in #17


Epoch #38: 1001it [00:02, 467.16it/s, env_step=38000, gradient_step=3800, len=179, n/ep=1, n/st=100, rew=7148.50]                                                                                 


Epoch #38: test_reward: 8679.200000 ± 1999.840834, best_reward: 16821.100000 ± 3818.017246 in #17


Epoch #39: 1001it [00:01, 599.19it/s, env_step=39000, gradient_step=3900, len=255, n/ep=1, n/st=100, rew=13350.00]                                                                                


Epoch #39: test_reward: 11984.800000 ± 3694.323776, best_reward: 16821.100000 ± 3818.017246 in #17


Epoch #40: 1001it [00:01, 528.28it/s, env_step=40000, gradient_step=4000, len=341, n/ep=3, n/st=100, rew=17961.83]                                                                                


Epoch #40: test_reward: 10403.200000 ± 4019.696625, best_reward: 16821.100000 ± 3818.017246 in #17


Epoch #41: 1001it [00:01, 513.86it/s, env_step=41000, gradient_step=4100, len=308, n/ep=0, n/st=100, rew=16334.00]                                                                                


Epoch #41: test_reward: 11609.700000 ± 5535.974603, best_reward: 16821.100000 ± 3818.017246 in #17


Epoch #42: 1001it [00:01, 633.66it/s, env_step=42000, gradient_step=4200, len=119, n/ep=0, n/st=100, rew=5032.00]                                                                                 


Epoch #42: test_reward: 6634.900000 ± 2983.126362, best_reward: 16821.100000 ± 3818.017246 in #17


Epoch #43: 1001it [00:01, 548.01it/s, env_step=43000, gradient_step=4300, len=147, n/ep=1, n/st=100, rew=6359.00]                                                                                 


Epoch #43: test_reward: 14456.000000 ± 4222.501818, best_reward: 16821.100000 ± 3818.017246 in #17


Epoch #44: 1001it [00:01, 602.61it/s, env_step=44000, gradient_step=4400, len=122, n/ep=0, n/st=100, rew=5275.25]                                                                                 


Epoch #44: test_reward: 14146.000000 ± 5106.145004, best_reward: 16821.100000 ± 3818.017246 in #17


Epoch #45: 1001it [00:02, 466.02it/s, env_step=45000, gradient_step=4500, len=155, n/ep=0, n/st=100, rew=7644.00]                                                                                 


Epoch #45: test_reward: 11148.800000 ± 2686.601452, best_reward: 16821.100000 ± 3818.017246 in #17


Epoch #46: 1001it [00:01, 516.61it/s, env_step=46000, gradient_step=4600, len=154, n/ep=0, n/st=100, rew=5927.00]                                                                                 


Epoch #46: test_reward: 10841.000000 ± 4306.272843, best_reward: 16821.100000 ± 3818.017246 in #17


Epoch #47: 1001it [00:01, 560.19it/s, env_step=47000, gradient_step=4700, len=139, n/ep=0, n/st=100, rew=6883.00]                                                                                 


Epoch #47: test_reward: 10419.400000 ± 4679.553402, best_reward: 16821.100000 ± 3818.017246 in #17


Epoch #48: 1001it [00:01, 601.20it/s, env_step=48000, gradient_step=4800, len=253, n/ep=1, n/st=100, rew=8696.50]                                                                                 


Epoch #48: test_reward: 7374.100000 ± 2539.284878, best_reward: 16821.100000 ± 3818.017246 in #17


Epoch #49: 1001it [00:02, 461.34it/s, env_step=49000, gradient_step=4900, len=247, n/ep=0, n/st=100, rew=9885.00]                                                                                 


Epoch #49: test_reward: 14024.900000 ± 5069.650076, best_reward: 16821.100000 ± 3818.017246 in #17


Epoch #50: 1001it [00:01, 503.99it/s, env_step=50000, gradient_step=5000, len=236, n/ep=2, n/st=100, rew=10601.75]                                                                                


Epoch #50: test_reward: 12597.900000 ± 5499.640105, best_reward: 16821.100000 ± 3818.017246 in #17


Epoch #51: 1001it [00:02, 494.03it/s, env_step=51000, gradient_step=5100, len=141, n/ep=1, n/st=100, rew=6377.50]                                                                                 


Epoch #51: test_reward: 8880.300000 ± 3764.818563, best_reward: 16821.100000 ± 3818.017246 in #17


Epoch #52: 1001it [00:01, 523.12it/s, env_step=52000, gradient_step=5200, len=351, n/ep=0, n/st=100, rew=18451.00]                                                                                


Epoch #52: test_reward: 8405.400000 ± 1967.221553, best_reward: 16821.100000 ± 3818.017246 in #17


Epoch #53: 1001it [00:02, 451.98it/s, env_step=53000, gradient_step=5300, len=204, n/ep=3, n/st=100, rew=9657.50]                                                                                 


Epoch #53: test_reward: 14533.000000 ± 7416.192001, best_reward: 16821.100000 ± 3818.017246 in #17


Epoch #54: 1001it [00:01, 535.24it/s, env_step=54000, gradient_step=5400, len=140, n/ep=0, n/st=100, rew=5674.75]                                                                                 


Epoch #54: test_reward: 6350.200000 ± 4005.396005, best_reward: 16821.100000 ± 3818.017246 in #17


Epoch #55: 1001it [00:02, 423.54it/s, env_step=55000, gradient_step=5500, len=376, n/ep=0, n/st=100, rew=20599.50]                                                                                


Epoch #55: test_reward: 12898.400000 ± 5313.716010, best_reward: 16821.100000 ± 3818.017246 in #17


Epoch #56: 1001it [00:01, 560.71it/s, env_step=56000, gradient_step=5600, len=206, n/ep=0, n/st=100, rew=9329.00]                                                                                 


Epoch #56: test_reward: 11413.600000 ± 5433.538832, best_reward: 16821.100000 ± 3818.017246 in #17


Epoch #57: 1001it [00:02, 470.46it/s, env_step=57000, gradient_step=5700, len=307, n/ep=1, n/st=100, rew=16959.00]                                                                                


Epoch #57: test_reward: 13309.800000 ± 4399.076921, best_reward: 16821.100000 ± 3818.017246 in #17


Epoch #58: 1001it [00:01, 562.85it/s, env_step=58000, gradient_step=5800, len=365, n/ep=0, n/st=100, rew=19400.75]                                                                                


Epoch #58: test_reward: 7781.500000 ± 3007.070377, best_reward: 16821.100000 ± 3818.017246 in #17


Epoch #59: 1001it [00:01, 536.21it/s, env_step=59000, gradient_step=5900, len=168, n/ep=1, n/st=100, rew=7886.00]                                                                                 


Epoch #59: test_reward: 14569.800000 ± 5679.932919, best_reward: 16821.100000 ± 3818.017246 in #17


Epoch #60: 1001it [00:01, 541.28it/s, env_step=60000, gradient_step=6000, len=159, n/ep=0, n/st=100, rew=6417.50]                                                                                 


Epoch #60: test_reward: 11412.400000 ± 2122.682416, best_reward: 16821.100000 ± 3818.017246 in #17


Epoch #61: 1001it [00:02, 494.66it/s, env_step=61000, gradient_step=6100, len=88, n/ep=0, n/st=100, rew=3806.00]                                                                                  


Epoch #61: test_reward: 13608.300000 ± 7058.799630, best_reward: 16821.100000 ± 3818.017246 in #17


Epoch #62: 1001it [00:01, 589.87it/s, env_step=62000, gradient_step=6200, len=180, n/ep=0, n/st=100, rew=7495.50]                                                                                 


Epoch #62: test_reward: 12008.700000 ± 4327.287211, best_reward: 16821.100000 ± 3818.017246 in #17


Epoch #63: 1001it [00:02, 492.20it/s, env_step=63000, gradient_step=6300, len=257, n/ep=0, n/st=100, rew=11910.00]                                                                                


Epoch #63: test_reward: 15480.200000 ± 6327.775451, best_reward: 16821.100000 ± 3818.017246 in #17


Epoch #64: 1001it [00:01, 529.56it/s, env_step=64000, gradient_step=6400, len=261, n/ep=0, n/st=100, rew=12084.75]                                                                                


Epoch #64: test_reward: 12920.200000 ± 6297.656736, best_reward: 16821.100000 ± 3818.017246 in #17


Epoch #65: 1001it [00:02, 492.82it/s, env_step=65000, gradient_step=6500, len=145, n/ep=0, n/st=100, rew=5747.00]                                                                                 


Epoch #65: test_reward: 10470.600000 ± 3368.273510, best_reward: 16821.100000 ± 3818.017246 in #17


Epoch #66: 1001it [00:02, 460.32it/s, env_step=66000, gradient_step=6600, len=177, n/ep=0, n/st=100, rew=8827.75]                                                                                 


Epoch #66: test_reward: 12298.700000 ± 4986.795024, best_reward: 16821.100000 ± 3818.017246 in #17


Epoch #67: 1001it [00:01, 545.15it/s, env_step=67000, gradient_step=6700, len=203, n/ep=0, n/st=100, rew=10404.00]                                                                                


Epoch #67: test_reward: 11172.100000 ± 5281.103908, best_reward: 16821.100000 ± 3818.017246 in #17


Epoch #68: 1001it [00:02, 479.34it/s, env_step=68000, gradient_step=6800, len=141, n/ep=0, n/st=100, rew=5852.00]                                                                                 


Epoch #68: test_reward: 11449.600000 ± 3409.263152, best_reward: 16821.100000 ± 3818.017246 in #17


Epoch #69: 1001it [00:01, 545.71it/s, env_step=69000, gradient_step=6900, len=400, n/ep=0, n/st=100, rew=22877.00]                                                                                


Epoch #69: test_reward: 13817.400000 ± 5134.809465, best_reward: 16821.100000 ± 3818.017246 in #17


Epoch #70: 1001it [00:01, 564.14it/s, env_step=70000, gradient_step=7000, len=257, n/ep=1, n/st=100, rew=14231.00]                                                                                


Epoch #70: test_reward: 13335.300000 ± 5309.946291, best_reward: 16821.100000 ± 3818.017246 in #17


Epoch #71: 1001it [00:02, 494.27it/s, env_step=71000, gradient_step=7100, len=90, n/ep=0, n/st=100, rew=3659.00]                                                                                  


Epoch #71: test_reward: 11573.500000 ± 4877.796577, best_reward: 16821.100000 ± 3818.017246 in #17


Epoch #72: 1001it [00:02, 471.56it/s, env_step=72000, gradient_step=7200, len=79, n/ep=0, n/st=100, rew=3072.75]                                                                                  


Epoch #72: test_reward: 16054.800000 ± 3900.469223, best_reward: 16821.100000 ± 3818.017246 in #17


Epoch #73: 1001it [00:01, 543.62it/s, env_step=73000, gradient_step=7300, len=219, n/ep=1, n/st=100, rew=10529.00]                                                                                


Epoch #73: test_reward: 9228.500000 ± 3490.068459, best_reward: 16821.100000 ± 3818.017246 in #17


Epoch #74: 1001it [00:01, 523.48it/s, env_step=74000, gradient_step=7400, len=38, n/ep=1, n/st=100, rew=1253.00]                                                                                  


Epoch #74: test_reward: 10405.400000 ± 4797.916615, best_reward: 16821.100000 ± 3818.017246 in #17


Epoch #75: 1001it [00:02, 495.37it/s, env_step=75000, gradient_step=7500, len=322, n/ep=1, n/st=100, rew=15549.00]                                                                                


Epoch #75: test_reward: 15090.000000 ± 4511.506334, best_reward: 16821.100000 ± 3818.017246 in #17


Epoch #76: 1001it [00:02, 453.99it/s, env_step=76000, gradient_step=7600, len=278, n/ep=0, n/st=100, rew=13125.00]                                                                                


Epoch #76: test_reward: 14796.600000 ± 5268.235401, best_reward: 16821.100000 ± 3818.017246 in #17


Epoch #77: 1001it [00:01, 555.00it/s, env_step=77000, gradient_step=7700, len=244, n/ep=0, n/st=100, rew=13749.00]                                                                                


Epoch #77: test_reward: 14377.600000 ± 4025.053918, best_reward: 16821.100000 ± 3818.017246 in #17


Epoch #78: 1001it [00:02, 490.54it/s, env_step=78000, gradient_step=7800, len=307, n/ep=2, n/st=100, rew=14239.00]                                                                                


Epoch #78: test_reward: 11030.800000 ± 6140.829745, best_reward: 16821.100000 ± 3818.017246 in #17


Epoch #79: 1001it [00:02, 468.41it/s, env_step=79000, gradient_step=7900, len=173, n/ep=0, n/st=100, rew=7585.00]                                                                                 


Epoch #79: test_reward: 11486.400000 ± 4322.159881, best_reward: 16821.100000 ± 3818.017246 in #17


Epoch #80: 1001it [00:01, 552.67it/s, env_step=80000, gradient_step=8000, len=400, n/ep=1, n/st=100, rew=20897.50]                                                                                


Epoch #80: test_reward: 6450.900000 ± 1372.585476, best_reward: 16821.100000 ± 3818.017246 in #17


Epoch #81: 1001it [00:01, 568.05it/s, env_step=81000, gradient_step=8100, len=186, n/ep=2, n/st=100, rew=10713.50]                                                                                


Epoch #81: test_reward: 12834.600000 ± 5719.860596, best_reward: 16821.100000 ± 3818.017246 in #17


Epoch #82: 1001it [00:01, 557.44it/s, env_step=82000, gradient_step=8200, len=271, n/ep=0, n/st=100, rew=12690.00]                                                                                


Epoch #82: test_reward: 7705.900000 ± 2179.839097, best_reward: 16821.100000 ± 3818.017246 in #17


Epoch #83: 1001it [00:01, 597.72it/s, env_step=83000, gradient_step=8300, len=157, n/ep=1, n/st=100, rew=6905.00]                                                                                 


Epoch #83: test_reward: 12869.800000 ± 4159.327489, best_reward: 16821.100000 ± 3818.017246 in #17


Epoch #84: 1001it [00:02, 459.41it/s, env_step=84000, gradient_step=8400, len=328, n/ep=0, n/st=100, rew=19628.00]                                                                                


Epoch #84: test_reward: 9395.000000 ± 3833.943975, best_reward: 16821.100000 ± 3818.017246 in #17


Epoch #85: 1001it [00:01, 583.03it/s, env_step=85000, gradient_step=8500, len=187, n/ep=1, n/st=100, rew=9124.50]                                                                                 


Epoch #85: test_reward: 9694.000000 ± 4510.002461, best_reward: 16821.100000 ± 3818.017246 in #17


Epoch #86: 1001it [00:01, 561.69it/s, env_step=86000, gradient_step=8600, len=125, n/ep=2, n/st=100, rew=6050.50]                                                                                 


Epoch #86: test_reward: 14206.700000 ± 5415.980023, best_reward: 16821.100000 ± 3818.017246 in #17


Epoch #87: 1001it [00:01, 514.73it/s, env_step=87000, gradient_step=8700, len=167, n/ep=1, n/st=100, rew=7019.50]                                                                                 


Epoch #87: test_reward: 10671.400000 ± 4028.796649, best_reward: 16821.100000 ± 3818.017246 in #17


Epoch #88: 1001it [00:01, 595.33it/s, env_step=88000, gradient_step=8800, len=216, n/ep=0, n/st=100, rew=8725.50]                                                                                 


Epoch #88: test_reward: 6408.400000 ± 2360.695965, best_reward: 16821.100000 ± 3818.017246 in #17


Epoch #89: 1001it [00:01, 538.69it/s, env_step=89000, gradient_step=8900, len=148, n/ep=0, n/st=100, rew=6708.00]                                                                                 


Epoch #89: test_reward: 9389.000000 ± 5475.232287, best_reward: 16821.100000 ± 3818.017246 in #17


Epoch #90: 1001it [00:01, 542.36it/s, env_step=90000, gradient_step=9000, len=400, n/ep=1, n/st=100, rew=22065.00]                                                                                


Epoch #90: test_reward: 8815.200000 ± 2634.143648, best_reward: 16821.100000 ± 3818.017246 in #17


Epoch #91: 1001it [00:02, 415.36it/s, env_step=91000, gradient_step=9100, len=198, n/ep=0, n/st=100, rew=9614.00]                                                                                 


Epoch #91: test_reward: 8289.800000 ± 4051.783553, best_reward: 16821.100000 ± 3818.017246 in #17


Epoch #92: 1001it [00:02, 494.51it/s, env_step=92000, gradient_step=9200, len=246, n/ep=0, n/st=100, rew=13295.00]                                                                                


Epoch #92: test_reward: 10022.500000 ± 4520.036399, best_reward: 16821.100000 ± 3818.017246 in #17


Epoch #93: 1001it [00:01, 517.52it/s, env_step=93000, gradient_step=9300, len=152, n/ep=0, n/st=100, rew=6561.00]                                                                                 


Epoch #93: test_reward: 17986.300000 ± 4691.576964, best_reward: 17986.300000 ± 4691.576964 in #93


Epoch #94: 1001it [00:01, 553.51it/s, env_step=94000, gradient_step=9400, len=190, n/ep=1, n/st=100, rew=9634.00]                                                                                 


Epoch #94: test_reward: 9734.000000 ± 6028.567060, best_reward: 17986.300000 ± 4691.576964 in #93


Epoch #95: 1001it [00:01, 502.59it/s, env_step=95000, gradient_step=9500, len=167, n/ep=0, n/st=100, rew=8191.50]                                                                                 


Epoch #95: test_reward: 9664.600000 ± 3768.896183, best_reward: 17986.300000 ± 4691.576964 in #93


Epoch #96: 1001it [00:01, 563.95it/s, env_step=96000, gradient_step=9600, len=272, n/ep=0, n/st=100, rew=15617.00]                                                                                


Epoch #96: test_reward: 11027.000000 ± 4503.595031, best_reward: 17986.300000 ± 4691.576964 in #93


Epoch #97: 1001it [00:02, 476.19it/s, env_step=97000, gradient_step=9700, len=400, n/ep=0, n/st=100, rew=19973.50]                                                                                


Epoch #97: test_reward: 11891.300000 ± 3957.919177, best_reward: 17986.300000 ± 4691.576964 in #93


Epoch #98: 1001it [00:01, 562.25it/s, env_step=98000, gradient_step=9800, len=234, n/ep=0, n/st=100, rew=11398.50]                                                                                


Epoch #98: test_reward: 13793.100000 ± 4263.089313, best_reward: 17986.300000 ± 4691.576964 in #93


Epoch #99: 1001it [00:02, 456.27it/s, env_step=99000, gradient_step=9900, len=179, n/ep=0, n/st=100, rew=7967.00]                                                                                 


Epoch #99: test_reward: 6532.900000 ± 2610.344362, best_reward: 17986.300000 ± 4691.576964 in #93


Epoch #100: 1001it [00:02, 420.54it/s, env_step=100000, gradient_step=10000, len=198, n/ep=0, n/st=100, rew=9579.00]                                                                              


Epoch #100: test_reward: 11662.300000 ± 4733.313048, best_reward: 17986.300000 ± 4691.576964 in #93


Epoch #101: 1001it [00:01, 538.90it/s, env_step=101000, gradient_step=10100, len=115, n/ep=0, n/st=100, rew=4619.50]                                                                              


Epoch #101: test_reward: 14211.300000 ± 5225.690826, best_reward: 17986.300000 ± 4691.576964 in #93


Epoch #102: 1001it [00:02, 462.89it/s, env_step=102000, gradient_step=10200, len=237, n/ep=0, n/st=100, rew=11585.00]                                                                             


Epoch #102: test_reward: 13454.300000 ± 4546.724185, best_reward: 17986.300000 ± 4691.576964 in #93


Epoch #103: 1001it [00:01, 541.64it/s, env_step=103000, gradient_step=10300, len=228, n/ep=0, n/st=100, rew=11956.00]                                                                             


Epoch #103: test_reward: 12314.700000 ± 6005.555679, best_reward: 17986.300000 ± 4691.576964 in #93


Epoch #104: 1001it [00:02, 469.21it/s, env_step=104000, gradient_step=10400, len=155, n/ep=2, n/st=100, rew=7076.25]                                                                              


Epoch #104: test_reward: 10575.100000 ± 3504.912052, best_reward: 17986.300000 ± 4691.576964 in #93


Epoch #105: 1001it [00:01, 517.71it/s, env_step=105000, gradient_step=10500, len=196, n/ep=0, n/st=100, rew=10020.67]                                                                             


Epoch #105: test_reward: 14545.900000 ± 5314.079101, best_reward: 17986.300000 ± 4691.576964 in #93


Epoch #106: 1001it [00:01, 558.69it/s, env_step=106000, gradient_step=10600, len=127, n/ep=0, n/st=100, rew=5392.00]                                                                              


Epoch #106: test_reward: 11241.300000 ± 7313.359694, best_reward: 17986.300000 ± 4691.576964 in #93


Epoch #107: 1001it [00:01, 570.21it/s, env_step=107000, gradient_step=10700, len=302, n/ep=1, n/st=100, rew=16643.00]                                                                             


Epoch #107: test_reward: 8984.000000 ± 3810.271066, best_reward: 17986.300000 ± 4691.576964 in #93


Epoch #108: 1001it [00:01, 508.20it/s, env_step=108000, gradient_step=10800, len=134, n/ep=0, n/st=100, rew=6471.50]                                                                              


Epoch #108: test_reward: 13688.400000 ± 4659.651751, best_reward: 17986.300000 ± 4691.576964 in #93


Epoch #109: 1001it [00:01, 525.82it/s, env_step=109000, gradient_step=10900, len=204, n/ep=0, n/st=100, rew=11288.50]                                                                             


Epoch #109: test_reward: 8653.600000 ± 3929.618892, best_reward: 17986.300000 ± 4691.576964 in #93


Epoch #110: 1001it [00:02, 441.63it/s, env_step=110000, gradient_step=11000, len=302, n/ep=1, n/st=100, rew=17529.50]                                                                             


Epoch #110: test_reward: 8951.600000 ± 4698.472330, best_reward: 17986.300000 ± 4691.576964 in #93


Epoch #111: 1001it [00:01, 539.52it/s, env_step=111000, gradient_step=11100, len=250, n/ep=1, n/st=100, rew=12214.00]                                                                             


Epoch #111: test_reward: 15733.300000 ± 3679.891603, best_reward: 17986.300000 ± 4691.576964 in #93


Epoch #112: 1001it [00:02, 422.86it/s, env_step=112000, gradient_step=11200, len=248, n/ep=0, n/st=100, rew=13863.00]                                                                             


Epoch #112: test_reward: 13728.700000 ± 4226.060933, best_reward: 17986.300000 ± 4691.576964 in #93


Epoch #113: 1001it [00:01, 530.59it/s, env_step=113000, gradient_step=11300, len=145, n/ep=0, n/st=100, rew=6747.00]                                                                              


Epoch #113: test_reward: 9273.800000 ± 7339.061572, best_reward: 17986.300000 ± 4691.576964 in #93


Epoch #114: 1001it [00:01, 532.22it/s, env_step=114000, gradient_step=11400, len=275, n/ep=0, n/st=100, rew=14081.75]                                                                             


Epoch #114: test_reward: 11896.000000 ± 5093.470428, best_reward: 17986.300000 ± 4691.576964 in #93


Epoch #115: 1001it [00:01, 534.81it/s, env_step=115000, gradient_step=11500, len=400, n/ep=0, n/st=100, rew=21944.50]                                                                             


Epoch #115: test_reward: 12449.200000 ± 5279.864881, best_reward: 17986.300000 ± 4691.576964 in #93


Epoch #116: 1001it [00:01, 533.53it/s, env_step=116000, gradient_step=11600, len=338, n/ep=3, n/st=100, rew=17857.67]                                                                             


Epoch #116: test_reward: 14718.600000 ± 5414.287658, best_reward: 17986.300000 ± 4691.576964 in #93


Epoch #117: 1001it [00:01, 534.10it/s, env_step=117000, gradient_step=11700, len=177, n/ep=2, n/st=100, rew=7835.00]                                                                              


Epoch #117: test_reward: 10418.100000 ± 4516.136943, best_reward: 17986.300000 ± 4691.576964 in #93


Epoch #118: 1001it [00:02, 461.85it/s, env_step=118000, gradient_step=11800, len=234, n/ep=1, n/st=100, rew=12212.50]                                                                             


Epoch #118: test_reward: 9293.000000 ± 3246.528762, best_reward: 17986.300000 ± 4691.576964 in #93


Epoch #119: 1001it [00:01, 582.82it/s, env_step=119000, gradient_step=11900, len=205, n/ep=2, n/st=100, rew=9334.25]                                                                              


Epoch #119: test_reward: 7511.200000 ± 2791.857905, best_reward: 17986.300000 ± 4691.576964 in #93


Epoch #120: 1001it [00:01, 557.34it/s, env_step=120000, gradient_step=12000, len=226, n/ep=0, n/st=100, rew=11218.00]                                                                             


Epoch #120: test_reward: 5947.600000 ± 5458.595336, best_reward: 17986.300000 ± 4691.576964 in #93


Epoch #121: 1001it [00:01, 560.44it/s, env_step=121000, gradient_step=12100, len=333, n/ep=0, n/st=100, rew=17528.00]                                                                             


Epoch #121: test_reward: 6257.200000 ± 1929.358069, best_reward: 17986.300000 ± 4691.576964 in #93


Epoch #122: 1001it [00:01, 555.85it/s, env_step=122000, gradient_step=12200, len=172, n/ep=1, n/st=100, rew=7694.50]                                                                              


Epoch #122: test_reward: 8659.900000 ± 3862.707534, best_reward: 17986.300000 ± 4691.576964 in #93


Epoch #123: 1001it [00:02, 473.06it/s, env_step=123000, gradient_step=12300, len=177, n/ep=2, n/st=100, rew=7516.25]                                                                              


Epoch #123: test_reward: 11943.100000 ± 5397.840854, best_reward: 17986.300000 ± 4691.576964 in #93


Epoch #124: 1001it [00:02, 393.56it/s, env_step=124000, gradient_step=12400, len=199, n/ep=0, n/st=100, rew=10098.00]                                                                             


Epoch #124: test_reward: 12066.100000 ± 5425.659194, best_reward: 17986.300000 ± 4691.576964 in #93


Epoch #125: 1001it [00:01, 578.27it/s, env_step=125000, gradient_step=12500, len=131, n/ep=1, n/st=100, rew=5384.00]                                                                              


Epoch #125: test_reward: 15701.500000 ± 3871.360542, best_reward: 17986.300000 ± 4691.576964 in #93


Epoch #126: 1001it [00:02, 462.43it/s, env_step=126000, gradient_step=12600, len=108, n/ep=1, n/st=100, rew=5445.00]                                                                              


Epoch #126: test_reward: 11122.100000 ± 3994.194849, best_reward: 17986.300000 ± 4691.576964 in #93


Epoch #127: 1001it [00:02, 430.19it/s, env_step=127000, gradient_step=12700, len=206, n/ep=0, n/st=100, rew=10343.25]                                                                             


Epoch #127: test_reward: 10255.700000 ± 3217.786446, best_reward: 17986.300000 ± 4691.576964 in #93


Epoch #128: 1001it [00:01, 585.50it/s, env_step=128000, gradient_step=12800, len=288, n/ep=1, n/st=100, rew=16532.50]                                                                             


Epoch #128: test_reward: 13516.000000 ± 6718.910269, best_reward: 17986.300000 ± 4691.576964 in #93


Epoch #129: 1001it [00:01, 557.65it/s, env_step=129000, gradient_step=12900, len=289, n/ep=0, n/st=100, rew=14386.50]                                                                             


Epoch #129: test_reward: 8790.200000 ± 6311.827086, best_reward: 17986.300000 ± 4691.576964 in #93


Epoch #130: 1001it [00:01, 548.44it/s, env_step=130000, gradient_step=13000, len=125, n/ep=0, n/st=100, rew=6380.50]                                                                              


Epoch #130: test_reward: 13530.800000 ± 4945.492186, best_reward: 17986.300000 ± 4691.576964 in #93


Epoch #131: 1001it [00:02, 475.07it/s, env_step=131000, gradient_step=13100, len=198, n/ep=1, n/st=100, rew=10372.50]                                                                             


Epoch #131: test_reward: 7215.300000 ± 1753.085397, best_reward: 17986.300000 ± 4691.576964 in #93


Epoch #132: 1001it [00:02, 458.24it/s, env_step=132000, gradient_step=13200, len=221, n/ep=0, n/st=100, rew=10542.00]                                                                             


Epoch #132: test_reward: 10583.600000 ± 2790.002946, best_reward: 17986.300000 ± 4691.576964 in #93


Epoch #133: 1001it [00:02, 430.94it/s, env_step=133000, gradient_step=13300, len=126, n/ep=1, n/st=100, rew=5428.50]                                                                              


Epoch #133: test_reward: 8594.000000 ± 3505.030442, best_reward: 17986.300000 ± 4691.576964 in #93


Epoch #134: 1001it [00:01, 571.84it/s, env_step=134000, gradient_step=13400, len=193, n/ep=0, n/st=100, rew=9420.00]                                                                              


Epoch #134: test_reward: 12585.600000 ± 3740.655509, best_reward: 17986.300000 ± 4691.576964 in #93


Epoch #135: 1001it [00:02, 438.69it/s, env_step=135000, gradient_step=13500, len=114, n/ep=0, n/st=100, rew=5425.00]                                                                              


Epoch #135: test_reward: 12399.500000 ± 4337.297759, best_reward: 17986.300000 ± 4691.576964 in #93


Epoch #136: 1001it [00:01, 572.14it/s, env_step=136000, gradient_step=13600, len=265, n/ep=1, n/st=100, rew=12761.50]                                                                             


Epoch #136: test_reward: 11409.400000 ± 5706.758803, best_reward: 17986.300000 ± 4691.576964 in #93


Epoch #137: 1001it [00:01, 543.23it/s, env_step=137000, gradient_step=13700, len=288, n/ep=1, n/st=100, rew=12708.00]                                                                             


Epoch #137: test_reward: 7262.200000 ± 2788.955855, best_reward: 17986.300000 ± 4691.576964 in #93


Epoch #138: 1001it [00:01, 557.62it/s, env_step=138000, gradient_step=13800, len=89, n/ep=1, n/st=100, rew=3522.00]                                                                               


Epoch #138: test_reward: 13491.000000 ± 3581.231660, best_reward: 17986.300000 ± 4691.576964 in #93


Epoch #139: 1001it [00:02, 441.18it/s, env_step=139000, gradient_step=13900, len=216, n/ep=0, n/st=100, rew=11230.50]                                                                             


Epoch #139: test_reward: 9466.800000 ± 3040.765325, best_reward: 17986.300000 ± 4691.576964 in #93


Epoch #140: 1001it [00:01, 531.57it/s, env_step=140000, gradient_step=14000, len=244, n/ep=0, n/st=100, rew=13273.67]                                                                             


Epoch #140: test_reward: 10768.500000 ± 2432.088742, best_reward: 17986.300000 ± 4691.576964 in #93


Epoch #141: 1001it [00:01, 552.05it/s, env_step=141000, gradient_step=14100, len=248, n/ep=2, n/st=100, rew=13724.25]                                                                             


Epoch #141: test_reward: 15842.600000 ± 5019.277382, best_reward: 17986.300000 ± 4691.576964 in #93


Epoch #142: 1001it [00:02, 442.24it/s, env_step=142000, gradient_step=14200, len=222, n/ep=1, n/st=100, rew=11890.00]                                                                             


Epoch #142: test_reward: 11936.600000 ± 4424.004344, best_reward: 17986.300000 ± 4691.576964 in #93


Epoch #143: 1001it [00:02, 457.04it/s, env_step=143000, gradient_step=14300, len=194, n/ep=0, n/st=100, rew=10315.50]                                                                             


Epoch #143: test_reward: 9949.300000 ± 3418.500667, best_reward: 17986.300000 ± 4691.576964 in #93


Epoch #144: 1001it [00:01, 552.51it/s, env_step=144000, gradient_step=14400, len=242, n/ep=0, n/st=100, rew=13557.00]                                                                             


Epoch #144: test_reward: 16316.300000 ± 5703.814006, best_reward: 17986.300000 ± 4691.576964 in #93


Epoch #145: 1001it [00:01, 505.78it/s, env_step=145000, gradient_step=14500, len=190, n/ep=1, n/st=100, rew=9633.50]                                                                              


Epoch #145: test_reward: 14224.800000 ± 5472.971858, best_reward: 17986.300000 ± 4691.576964 in #93


Epoch #146: 1001it [00:01, 520.50it/s, env_step=146000, gradient_step=14600, len=141, n/ep=1, n/st=100, rew=6480.00]                                                                              


Epoch #146: test_reward: 10331.100000 ± 2663.977795, best_reward: 17986.300000 ± 4691.576964 in #93


Epoch #147: 1001it [00:01, 550.57it/s, env_step=147000, gradient_step=14700, len=242, n/ep=0, n/st=100, rew=13926.50]                                                                             


Epoch #147: test_reward: 11104.700000 ± 3139.873725, best_reward: 17986.300000 ± 4691.576964 in #93


Epoch #148: 1001it [00:02, 458.36it/s, env_step=148000, gradient_step=14800, len=165, n/ep=0, n/st=100, rew=7242.00]                                                                              


Epoch #148: test_reward: 11190.700000 ± 2270.851737, best_reward: 17986.300000 ± 4691.576964 in #93


Epoch #149: 1001it [00:01, 581.02it/s, env_step=149000, gradient_step=14900, len=284, n/ep=0, n/st=100, rew=16521.00]                                                                             


Epoch #149: test_reward: 9950.000000 ± 4288.284785, best_reward: 17986.300000 ± 4691.576964 in #93


Epoch #150: 1001it [00:02, 486.61it/s, env_step=150000, gradient_step=15000, len=224, n/ep=0, n/st=100, rew=12409.00]                                                                             


Epoch #150: test_reward: 10729.000000 ± 2739.146436, best_reward: 17986.300000 ± 4691.576964 in #93

InfoStats(gradient_step=15000, best_reward=17986.3, best_reward_std=4691.576964092138, train_step=150000, train_episode=653, test_step=346392, test_episode=1510, timing=TimingStats(total_time=465.6895022392273, train_time=291.1466679573059, train_time_collect=51.20975351333618, train_time_update=234.32233953475952, test_time=174.5428342819214, update_speed=515.2042475787364))

(the trained policy can be accessed via policy.policies[agents[1]])
Train with agent_opponent #3


Epoch #1: 1001it [00:02, 429.86it/s, env_step=1000, gradient_step=100, len=0, n/ep=0, n/st=100, rew=0.00]                                                                                         


Epoch #1: test_reward: 3104.600000 ± 1896.705786, best_reward: 5510.500000 ± 3895.991819 in #0


Epoch #2: 1001it [00:02, 496.50it/s, env_step=2000, gradient_step=200, len=0, n/ep=0, n/st=100, rew=0.00]                                                                                         


Epoch #2: test_reward: 4616.000000 ± 4528.955994, best_reward: 5510.500000 ± 3895.991819 in #0


Epoch #3: 1001it [00:02, 421.81it/s, env_step=3000, gradient_step=300, len=0, n/ep=0, n/st=100, rew=0.00]                                                                                         


Epoch #3: test_reward: 7557.000000 ± 3063.627817, best_reward: 7557.000000 ± 3063.627817 in #3


Epoch #4: 1001it [00:02, 414.40it/s, env_step=4000, gradient_step=400, len=0, n/ep=0, n/st=100, rew=0.00]                                                                                         


Epoch #4: test_reward: 3896.800000 ± 3672.035643, best_reward: 7557.000000 ± 3063.627817 in #3


Epoch #5: 1001it [00:02, 500.16it/s, env_step=5000, gradient_step=500, len=49, n/ep=0, n/st=100, rew=1490.00]                                                                                     


Epoch #5: test_reward: 6361.200000 ± 4529.377635, best_reward: 7557.000000 ± 3063.627817 in #3


Epoch #6: 1001it [00:01, 502.55it/s, env_step=6000, gradient_step=600, len=59, n/ep=0, n/st=100, rew=1940.00]                                                                                     


Epoch #6: test_reward: 9168.400000 ± 4407.693075, best_reward: 9168.400000 ± 4407.693075 in #6


Epoch #7: 1001it [00:02, 365.16it/s, env_step=7000, gradient_step=700, len=63, n/ep=0, n/st=100, rew=2367.00]                                                                                     


Epoch #7: test_reward: 10051.600000 ± 6267.478013, best_reward: 10051.600000 ± 6267.478013 in #7


Epoch #8: 1001it [00:02, 355.04it/s, env_step=8000, gradient_step=800, len=79, n/ep=0, n/st=100, rew=2812.00]                                                                                     


Epoch #8: test_reward: 14744.800000 ± 2847.349357, best_reward: 14744.800000 ± 2847.349357 in #8


Epoch #9: 1001it [00:02, 337.31it/s, env_step=9000, gradient_step=900, len=87, n/ep=0, n/st=100, rew=3352.00]                                                                                     


Epoch #9: test_reward: 10320.600000 ± 3534.955281, best_reward: 14744.800000 ± 2847.349357 in #8


Epoch #10: 1001it [00:02, 382.85it/s, env_step=10000, gradient_step=1000, len=99, n/ep=0, n/st=100, rew=4496.00]                                                                                  


Epoch #10: test_reward: 11127.600000 ± 4869.527086, best_reward: 14744.800000 ± 2847.349357 in #8


Epoch #11: 1001it [00:02, 351.58it/s, env_step=11000, gradient_step=1100, len=109, n/ep=0, n/st=100, rew=4609.00]                                                                                 


Epoch #11: test_reward: 17283.800000 ± 4101.555651, best_reward: 17283.800000 ± 4101.555651 in #11


Epoch #12: 1001it [00:02, 389.14it/s, env_step=12000, gradient_step=1200, len=117, n/ep=0, n/st=100, rew=5222.00]                                                                                 


Epoch #12: test_reward: 5963.200000 ± 3474.755036, best_reward: 17283.800000 ± 4101.555651 in #11


Epoch #13: 1001it [00:02, 438.14it/s, env_step=13000, gradient_step=1300, len=129, n/ep=0, n/st=100, rew=6134.00]                                                                                 


Epoch #13: test_reward: 6177.200000 ± 4605.403452, best_reward: 17283.800000 ± 4101.555651 in #11


Epoch #14: 1001it [00:02, 389.41it/s, env_step=14000, gradient_step=1400, len=139, n/ep=0, n/st=100, rew=6556.00]                                                                                 


Epoch #14: test_reward: 10605.200000 ± 5377.076990, best_reward: 17283.800000 ± 4101.555651 in #11


Epoch #15: 1001it [00:02, 343.19it/s, env_step=15000, gradient_step=1500, len=143, n/ep=0, n/st=100, rew=7026.17]                                                                                 


Epoch #15: test_reward: 9866.900000 ± 5105.106354, best_reward: 17283.800000 ± 4101.555651 in #11


Epoch #16: 1001it [00:02, 410.10it/s, env_step=16000, gradient_step=1600, len=159, n/ep=0, n/st=100, rew=7094.00]                                                                                 


Epoch #16: test_reward: 5816.600000 ± 3472.477623, best_reward: 17283.800000 ± 4101.555651 in #11


Epoch #17: 1001it [00:02, 419.55it/s, env_step=17000, gradient_step=1700, len=159, n/ep=0, n/st=100, rew=7094.00]                                                                                 


Epoch #17: test_reward: 10393.000000 ± 4077.281864, best_reward: 17283.800000 ± 4101.555651 in #11


Epoch #18: 1001it [00:02, 358.70it/s, env_step=18000, gradient_step=1800, len=180, n/ep=3, n/st=100, rew=9049.33]                                                                                 


Epoch #18: test_reward: 10310.300000 ± 4239.454400, best_reward: 17283.800000 ± 4101.555651 in #11


Epoch #19: 1001it [00:02, 351.85it/s, env_step=19000, gradient_step=1900, len=190, n/ep=1, n/st=100, rew=9969.00]                                                                                 


Epoch #19: test_reward: 8103.200000 ± 5496.078544, best_reward: 17283.800000 ± 4101.555651 in #11


Epoch #20: 1001it [00:03, 321.34it/s, env_step=20000, gradient_step=2000, len=91, n/ep=1, n/st=100, rew=4003.00]                                                                                  


Epoch #20: test_reward: 7509.800000 ± 2913.272723, best_reward: 17283.800000 ± 4101.555651 in #11


Epoch #21: 1001it [00:02, 400.52it/s, env_step=21000, gradient_step=2100, len=207, n/ep=0, n/st=100, rew=10168.25]                                                                                


Epoch #21: test_reward: 8303.000000 ± 3670.315000, best_reward: 17283.800000 ± 4101.555651 in #11


Epoch #22: 1001it [00:02, 380.48it/s, env_step=22000, gradient_step=2200, len=219, n/ep=0, n/st=100, rew=10368.00]                                                                                


Epoch #22: test_reward: 5973.600000 ± 2944.053913, best_reward: 17283.800000 ± 4101.555651 in #11


Epoch #23: 1001it [00:02, 338.91it/s, env_step=23000, gradient_step=2300, len=223, n/ep=0, n/st=100, rew=11707.50]                                                                                


Epoch #23: test_reward: 7696.000000 ± 4685.062049, best_reward: 17283.800000 ± 4101.555651 in #11


Epoch #24: 1001it [00:02, 357.96it/s, env_step=24000, gradient_step=2400, len=175, n/ep=0, n/st=100, rew=9111.75]                                                                                 


Epoch #24: test_reward: 9887.600000 ± 3077.090938, best_reward: 17283.800000 ± 4101.555651 in #11


Epoch #25: 1001it [00:02, 418.65it/s, env_step=25000, gradient_step=2500, len=169, n/ep=1, n/st=100, rew=9257.50]                                                                                 


Epoch #25: test_reward: 11798.400000 ± 3228.681935, best_reward: 17283.800000 ± 4101.555651 in #11


Epoch #26: 1001it [00:02, 400.81it/s, env_step=26000, gradient_step=2600, len=153, n/ep=0, n/st=100, rew=7777.00]                                                                                 


Epoch #26: test_reward: 8186.600000 ± 3167.505271, best_reward: 17283.800000 ± 4101.555651 in #11


Epoch #27: 1001it [00:03, 324.99it/s, env_step=27000, gradient_step=2700, len=269, n/ep=0, n/st=100, rew=16903.00]                                                                                


Epoch #27: test_reward: 9393.400000 ± 3369.712397, best_reward: 17283.800000 ± 4101.555651 in #11


Epoch #28: 1001it [00:02, 335.17it/s, env_step=28000, gradient_step=2800, len=232, n/ep=0, n/st=100, rew=12869.20]                                                                                


Epoch #28: test_reward: 7669.300000 ± 4202.846656, best_reward: 17283.800000 ± 4101.555651 in #11


Epoch #29: 1001it [00:03, 312.11it/s, env_step=29000, gradient_step=2900, len=93, n/ep=1, n/st=100, rew=4178.00]                                                                                  


Epoch #29: test_reward: 11978.300000 ± 5883.414316, best_reward: 17283.800000 ± 4101.555651 in #11


Epoch #30: 1001it [00:02, 404.13it/s, env_step=30000, gradient_step=3000, len=227, n/ep=0, n/st=100, rew=12551.00]                                                                                


Epoch #30: test_reward: 3021.700000 ± 1356.639234, best_reward: 17283.800000 ± 4101.555651 in #11


Epoch #31: 1001it [00:03, 330.38it/s, env_step=31000, gradient_step=3100, len=189, n/ep=1, n/st=100, rew=10384.00]                                                                                


Epoch #31: test_reward: 7123.400000 ± 3044.837736, best_reward: 17283.800000 ± 4101.555651 in #11


Epoch #32: 1001it [00:03, 324.12it/s, env_step=32000, gradient_step=3200, len=230, n/ep=0, n/st=100, rew=13801.00]                                                                                


Epoch #32: test_reward: 7549.000000 ± 3501.628792, best_reward: 17283.800000 ± 4101.555651 in #11


Epoch #33: 1001it [00:02, 388.58it/s, env_step=33000, gradient_step=3300, len=209, n/ep=1, n/st=100, rew=11884.00]                                                                                


Epoch #33: test_reward: 8849.500000 ± 3835.729377, best_reward: 17283.800000 ± 4101.555651 in #11


Epoch #34: 1001it [00:02, 351.10it/s, env_step=34000, gradient_step=3400, len=155, n/ep=0, n/st=100, rew=9266.50]                                                                                 


Epoch #34: test_reward: 6809.000000 ± 2784.742466, best_reward: 17283.800000 ± 4101.555651 in #11


Epoch #35: 1001it [00:02, 424.13it/s, env_step=35000, gradient_step=3500, len=98, n/ep=0, n/st=100, rew=4644.75]                                                                                  


Epoch #35: test_reward: 7182.600000 ± 2406.560915, best_reward: 17283.800000 ± 4101.555651 in #11


Epoch #36: 1001it [00:03, 332.30it/s, env_step=36000, gradient_step=3600, len=208, n/ep=0, n/st=100, rew=12002.00]                                                                                


Epoch #36: test_reward: 12397.400000 ± 3948.622626, best_reward: 17283.800000 ± 4101.555651 in #11


Epoch #37: 1001it [00:02, 419.96it/s, env_step=37000, gradient_step=3700, len=192, n/ep=2, n/st=100, rew=10849.00]                                                                                


Epoch #37: test_reward: 11815.200000 ± 3565.544357, best_reward: 17283.800000 ± 4101.555651 in #11


Epoch #38: 1001it [00:02, 402.83it/s, env_step=38000, gradient_step=3800, len=189, n/ep=1, n/st=100, rew=11362.50]                                                                                


Epoch #38: test_reward: 12488.200000 ± 5997.367636, best_reward: 17283.800000 ± 4101.555651 in #11


Epoch #39: 1001it [00:02, 401.09it/s, env_step=39000, gradient_step=3900, len=218, n/ep=1, n/st=100, rew=13631.00]                                                                                


Epoch #39: test_reward: 9058.600000 ± 3971.849146, best_reward: 17283.800000 ± 4101.555651 in #11


Epoch #40: 1001it [00:02, 346.62it/s, env_step=40000, gradient_step=4000, len=274, n/ep=2, n/st=100, rew=15438.50]                                                                                


Epoch #40: test_reward: 16243.600000 ± 4941.969389, best_reward: 17283.800000 ± 4101.555651 in #11


Epoch #41: 1001it [00:02, 428.49it/s, env_step=41000, gradient_step=4100, len=189, n/ep=1, n/st=100, rew=9308.00]                                                                                 


Epoch #41: test_reward: 5864.400000 ± 2552.452593, best_reward: 17283.800000 ± 4101.555651 in #11


Epoch #42: 1001it [00:02, 354.41it/s, env_step=42000, gradient_step=4200, len=135, n/ep=0, n/st=100, rew=6585.00]                                                                                 


Epoch #42: test_reward: 7337.600000 ± 2977.391583, best_reward: 17283.800000 ± 4101.555651 in #11


Epoch #43: 1001it [00:02, 366.92it/s, env_step=43000, gradient_step=4300, len=181, n/ep=0, n/st=100, rew=11237.00]                                                                                


Epoch #43: test_reward: 11733.200000 ± 3705.777457, best_reward: 17283.800000 ± 4101.555651 in #11


Epoch #44: 1001it [00:02, 361.16it/s, env_step=44000, gradient_step=4400, len=186, n/ep=1, n/st=100, rew=10489.50]                                                                                


Epoch #44: test_reward: 8682.200000 ± 2702.341570, best_reward: 17283.800000 ± 4101.555651 in #11


Epoch #45: 1001it [00:03, 329.84it/s, env_step=45000, gradient_step=4500, len=211, n/ep=2, n/st=100, rew=10811.50]                                                                                


Epoch #45: test_reward: 6132.400000 ± 3085.905611, best_reward: 17283.800000 ± 4101.555651 in #11


Epoch #46: 1001it [00:02, 409.13it/s, env_step=46000, gradient_step=4600, len=181, n/ep=0, n/st=100, rew=9817.00]                                                                                 


Epoch #46: test_reward: 8149.100000 ± 4486.622080, best_reward: 17283.800000 ± 4101.555651 in #11


Epoch #47: 1001it [00:02, 401.73it/s, env_step=47000, gradient_step=4700, len=158, n/ep=1, n/st=100, rew=8025.00]                                                                                 


Epoch #47: test_reward: 6409.000000 ± 5007.251242, best_reward: 17283.800000 ± 4101.555651 in #11


Epoch #48: 1001it [00:02, 386.05it/s, env_step=48000, gradient_step=4800, len=271, n/ep=0, n/st=100, rew=14825.00]                                                                                


Epoch #48: test_reward: 7593.100000 ± 4148.114956, best_reward: 17283.800000 ± 4101.555651 in #11


Epoch #49: 1001it [00:02, 353.83it/s, env_step=49000, gradient_step=4900, len=95, n/ep=0, n/st=100, rew=4571.00]                                                                                  


Epoch #49: test_reward: 11717.400000 ± 6071.639914, best_reward: 17283.800000 ± 4101.555651 in #11


Epoch #50: 1001it [00:02, 389.50it/s, env_step=50000, gradient_step=5000, len=195, n/ep=2, n/st=100, rew=9917.75]                                                                                 


Epoch #50: test_reward: 12345.600000 ± 4535.810560, best_reward: 17283.800000 ± 4101.555651 in #11


Epoch #51: 1001it [00:02, 404.18it/s, env_step=51000, gradient_step=5100, len=287, n/ep=1, n/st=100, rew=18107.50]                                                                                


Epoch #51: test_reward: 11356.800000 ± 7301.949668, best_reward: 17283.800000 ± 4101.555651 in #11


Epoch #52: 1001it [00:02, 366.12it/s, env_step=52000, gradient_step=5200, len=247, n/ep=0, n/st=100, rew=14880.00]                                                                                


Epoch #52: test_reward: 14156.400000 ± 4467.705702, best_reward: 17283.800000 ± 4101.555651 in #11


Epoch #53: 1001it [00:02, 395.53it/s, env_step=53000, gradient_step=5300, len=134, n/ep=0, n/st=100, rew=6750.00]                                                                                 


Epoch #53: test_reward: 13400.100000 ± 4942.224751, best_reward: 17283.800000 ± 4101.555651 in #11


Epoch #54: 1001it [00:02, 360.55it/s, env_step=54000, gradient_step=5400, len=231, n/ep=0, n/st=100, rew=12187.00]                                                                                


Epoch #54: test_reward: 10186.800000 ± 4641.887737, best_reward: 17283.800000 ± 4101.555651 in #11


Epoch #55: 1001it [00:02, 381.59it/s, env_step=55000, gradient_step=5500, len=175, n/ep=0, n/st=100, rew=9473.00]                                                                                 


Epoch #55: test_reward: 15165.700000 ± 4597.551197, best_reward: 17283.800000 ± 4101.555651 in #11


Epoch #56: 1001it [00:02, 409.79it/s, env_step=56000, gradient_step=5600, len=248, n/ep=1, n/st=100, rew=13761.00]                                                                                


Epoch #56: test_reward: 12189.000000 ± 4152.719639, best_reward: 17283.800000 ± 4101.555651 in #11


Epoch #57: 1001it [00:02, 379.47it/s, env_step=57000, gradient_step=5700, len=208, n/ep=2, n/st=100, rew=11370.50]                                                                                


Epoch #57: test_reward: 13732.500000 ± 4310.368923, best_reward: 17283.800000 ± 4101.555651 in #11


Epoch #58: 1001it [00:02, 369.82it/s, env_step=58000, gradient_step=5800, len=191, n/ep=0, n/st=100, rew=10271.50]                                                                                


Epoch #58: test_reward: 15000.800000 ± 2618.961924, best_reward: 17283.800000 ± 4101.555651 in #11


Epoch #59: 1001it [00:02, 349.53it/s, env_step=59000, gradient_step=5900, len=117, n/ep=1, n/st=100, rew=6332.00]                                                                                 


Epoch #59: test_reward: 17900.200000 ± 3801.816561, best_reward: 17900.200000 ± 3801.816561 in #59


Epoch #60: 1001it [00:02, 372.28it/s, env_step=60000, gradient_step=6000, len=241, n/ep=0, n/st=100, rew=14241.00]                                                                                


Epoch #60: test_reward: 12619.900000 ± 6065.288459, best_reward: 17900.200000 ± 3801.816561 in #59


Epoch #61: 1001it [00:02, 436.95it/s, env_step=61000, gradient_step=6100, len=263, n/ep=1, n/st=100, rew=15409.50]                                                                                


Epoch #61: test_reward: 12331.700000 ± 6193.780882, best_reward: 17900.200000 ± 3801.816561 in #59


Epoch #62: 1001it [00:02, 375.73it/s, env_step=62000, gradient_step=6200, len=186, n/ep=0, n/st=100, rew=11008.50]                                                                                


Epoch #62: test_reward: 5312.900000 ± 2385.113266, best_reward: 17900.200000 ± 3801.816561 in #59


Epoch #63: 1001it [00:02, 423.75it/s, env_step=63000, gradient_step=6300, len=349, n/ep=0, n/st=100, rew=21125.00]                                                                                


Epoch #63: test_reward: 15390.400000 ± 7349.341837, best_reward: 17900.200000 ± 3801.816561 in #59


Epoch #64: 1001it [00:02, 394.64it/s, env_step=64000, gradient_step=6400, len=261, n/ep=1, n/st=100, rew=16278.00]                                                                                


Epoch #64: test_reward: 15467.000000 ± 5535.325483, best_reward: 17900.200000 ± 3801.816561 in #59


Epoch #65: 1001it [00:02, 405.44it/s, env_step=65000, gradient_step=6500, len=149, n/ep=0, n/st=100, rew=6965.50]                                                                                 


Epoch #65: test_reward: 10870.300000 ± 6413.480210, best_reward: 17900.200000 ± 3801.816561 in #59


Epoch #66: 1001it [00:02, 384.56it/s, env_step=66000, gradient_step=6600, len=103, n/ep=1, n/st=100, rew=5121.00]                                                                                 


Epoch #66: test_reward: 15346.700000 ± 4386.106885, best_reward: 17900.200000 ± 3801.816561 in #59


Epoch #67: 1001it [00:02, 371.82it/s, env_step=67000, gradient_step=6700, len=261, n/ep=0, n/st=100, rew=14168.00]                                                                                


Epoch #67: test_reward: 18333.700000 ± 5595.801230, best_reward: 18333.700000 ± 5595.801230 in #67


Epoch #68: 1001it [00:02, 415.47it/s, env_step=68000, gradient_step=6800, len=313, n/ep=0, n/st=100, rew=18042.00]                                                                                


Epoch #68: test_reward: 16036.900000 ± 4694.853298, best_reward: 18333.700000 ± 5595.801230 in #67


Epoch #69: 1001it [00:02, 396.40it/s, env_step=69000, gradient_step=6900, len=365, n/ep=1, n/st=100, rew=22331.50]                                                                                


Epoch #69: test_reward: 17173.500000 ± 5065.592152, best_reward: 18333.700000 ± 5595.801230 in #67


Epoch #70: 1001it [00:02, 397.69it/s, env_step=70000, gradient_step=7000, len=338, n/ep=0, n/st=100, rew=20413.00]                                                                                


Epoch #70: test_reward: 11672.800000 ± 6675.830148, best_reward: 18333.700000 ± 5595.801230 in #67


Epoch #71: 1001it [00:02, 440.59it/s, env_step=71000, gradient_step=7100, len=90, n/ep=0, n/st=100, rew=3488.50]                                                                                  


Epoch #71: test_reward: 15355.600000 ± 5451.320046, best_reward: 18333.700000 ± 5595.801230 in #67


Epoch #72: 1001it [00:02, 358.59it/s, env_step=72000, gradient_step=7200, len=178, n/ep=0, n/st=100, rew=10924.00]                                                                                


Epoch #72: test_reward: 11054.600000 ± 3746.134360, best_reward: 18333.700000 ± 5595.801230 in #67


Epoch #73: 1001it [00:02, 391.85it/s, env_step=73000, gradient_step=7300, len=159, n/ep=0, n/st=100, rew=9650.00]                                                                                 


Epoch #73: test_reward: 10897.700000 ± 5005.800397, best_reward: 18333.700000 ± 5595.801230 in #67


Epoch #74: 1001it [00:02, 412.36it/s, env_step=74000, gradient_step=7400, len=175, n/ep=0, n/st=100, rew=10076.50]                                                                                


Epoch #74: test_reward: 16863.900000 ± 6641.002281, best_reward: 18333.700000 ± 5595.801230 in #67


Epoch #75: 1001it [00:02, 429.16it/s, env_step=75000, gradient_step=7500, len=190, n/ep=0, n/st=100, rew=11393.00]                                                                                


Epoch #75: test_reward: 13084.400000 ± 3049.165597, best_reward: 18333.700000 ± 5595.801230 in #67


Epoch #76: 1001it [00:02, 403.76it/s, env_step=76000, gradient_step=7600, len=87, n/ep=0, n/st=100, rew=4487.50]                                                                                  


Epoch #76: test_reward: 11890.400000 ± 3597.586057, best_reward: 18333.700000 ± 5595.801230 in #67


Epoch #77: 1001it [00:02, 428.24it/s, env_step=77000, gradient_step=7700, len=207, n/ep=0, n/st=100, rew=13586.50]                                                                                


Epoch #77: test_reward: 17201.200000 ± 8917.725100, best_reward: 18333.700000 ± 5595.801230 in #67


Epoch #78: 1001it [00:02, 358.99it/s, env_step=78000, gradient_step=7800, len=79, n/ep=1, n/st=100, rew=4077.00]                                                                                  


Epoch #78: test_reward: 14778.000000 ± 6424.430247, best_reward: 18333.700000 ± 5595.801230 in #67


Epoch #79: 1001it [00:02, 360.81it/s, env_step=79000, gradient_step=7900, len=169, n/ep=0, n/st=100, rew=9440.00]                                                                                 


Epoch #79: test_reward: 11529.700000 ± 5204.834177, best_reward: 18333.700000 ± 5595.801230 in #67


Epoch #80: 1001it [00:03, 317.91it/s, env_step=80000, gradient_step=8000, len=349, n/ep=0, n/st=100, rew=21909.00]                                                                                


Epoch #80: test_reward: 16138.800000 ± 4464.156915, best_reward: 18333.700000 ± 5595.801230 in #67


Epoch #81: 1001it [00:02, 409.37it/s, env_step=81000, gradient_step=8100, len=267, n/ep=0, n/st=100, rew=16681.75]                                                                                


Epoch #81: test_reward: 16073.800000 ± 5238.730014, best_reward: 18333.700000 ± 5595.801230 in #67


Epoch #82: 1001it [00:02, 402.28it/s, env_step=82000, gradient_step=8200, len=113, n/ep=0, n/st=100, rew=6373.00]                                                                                 


Epoch #82: test_reward: 11629.600000 ± 4370.360333, best_reward: 18333.700000 ± 5595.801230 in #67


Epoch #83: 1001it [00:02, 365.07it/s, env_step=83000, gradient_step=8300, len=189, n/ep=0, n/st=100, rew=11115.00]                                                                                


Epoch #83: test_reward: 12445.300000 ± 5119.078668, best_reward: 18333.700000 ± 5595.801230 in #67


Epoch #84: 1001it [00:02, 386.39it/s, env_step=84000, gradient_step=8400, len=147, n/ep=2, n/st=100, rew=8597.00]                                                                                 


Epoch #84: test_reward: 14304.200000 ± 5883.296861, best_reward: 18333.700000 ± 5595.801230 in #67


Epoch #85: 1001it [00:02, 406.73it/s, env_step=85000, gradient_step=8500, len=181, n/ep=1, n/st=100, rew=9849.00]                                                                                 


Epoch #85: test_reward: 11262.400000 ± 3625.043095, best_reward: 18333.700000 ± 5595.801230 in #67


Epoch #86: 1001it [00:02, 347.35it/s, env_step=86000, gradient_step=8600, len=215, n/ep=1, n/st=100, rew=13823.00]                                                                                


Epoch #86: test_reward: 14338.000000 ± 3615.556610, best_reward: 18333.700000 ± 5595.801230 in #67


Epoch #87: 1001it [00:02, 365.90it/s, env_step=87000, gradient_step=8700, len=125, n/ep=0, n/st=100, rew=6990.50]                                                                                 


Epoch #87: test_reward: 13527.900000 ± 4978.115636, best_reward: 18333.700000 ± 5595.801230 in #67


Epoch #88: 1001it [00:02, 427.04it/s, env_step=88000, gradient_step=8800, len=269, n/ep=1, n/st=100, rew=16331.50]                                                                                


Epoch #88: test_reward: 12342.800000 ± 4285.659921, best_reward: 18333.700000 ± 5595.801230 in #67


Epoch #89: 1001it [00:02, 367.31it/s, env_step=89000, gradient_step=8900, len=275, n/ep=1, n/st=100, rew=17661.00]                                                                                


Epoch #89: test_reward: 11057.800000 ± 5348.696398, best_reward: 18333.700000 ± 5595.801230 in #67


Epoch #90: 1001it [00:02, 374.32it/s, env_step=90000, gradient_step=9000, len=215, n/ep=0, n/st=100, rew=13588.00]                                                                                


Epoch #90: test_reward: 13717.600000 ± 3073.217409, best_reward: 18333.700000 ± 5595.801230 in #67


Epoch #91: 1001it [00:02, 343.62it/s, env_step=91000, gradient_step=9100, len=173, n/ep=1, n/st=100, rew=11140.00]                                                                                


Epoch #91: test_reward: 6581.000000 ± 4651.029241, best_reward: 18333.700000 ± 5595.801230 in #67


Epoch #92: 1001it [00:02, 368.79it/s, env_step=92000, gradient_step=9200, len=378, n/ep=1, n/st=100, rew=25329.00]                                                                                


Epoch #92: test_reward: 10105.800000 ± 7852.365768, best_reward: 18333.700000 ± 5595.801230 in #67


Epoch #93: 1001it [00:02, 369.05it/s, env_step=93000, gradient_step=9300, len=228, n/ep=1, n/st=100, rew=14459.50]                                                                                


Epoch #93: test_reward: 11918.200000 ± 7249.481606, best_reward: 18333.700000 ± 5595.801230 in #67


Epoch #94: 1001it [00:02, 389.66it/s, env_step=94000, gradient_step=9400, len=164, n/ep=3, n/st=100, rew=9580.50]                                                                                 


Epoch #94: test_reward: 13917.300000 ± 5056.523668, best_reward: 18333.700000 ± 5595.801230 in #67


Epoch #95: 1001it [00:02, 410.77it/s, env_step=95000, gradient_step=9500, len=114, n/ep=0, n/st=100, rew=6634.75]                                                                                 


Epoch #95: test_reward: 11334.600000 ± 4058.262293, best_reward: 18333.700000 ± 5595.801230 in #67


Epoch #96: 1001it [00:02, 396.10it/s, env_step=96000, gradient_step=9600, len=191, n/ep=0, n/st=100, rew=10101.00]                                                                                


Epoch #96: test_reward: 14384.800000 ± 7659.388916, best_reward: 18333.700000 ± 5595.801230 in #67


Epoch #97: 1001it [00:03, 314.82it/s, env_step=97000, gradient_step=9700, len=171, n/ep=0, n/st=100, rew=9343.00]                                                                                 


Epoch #97: test_reward: 11929.000000 ± 5949.428006, best_reward: 18333.700000 ± 5595.801230 in #67


Epoch #98: 1001it [00:02, 407.26it/s, env_step=98000, gradient_step=9800, len=140, n/ep=1, n/st=100, rew=8263.00]                                                                                 


Epoch #98: test_reward: 7568.200000 ± 5846.858743, best_reward: 18333.700000 ± 5595.801230 in #67


Epoch #99: 1001it [00:02, 367.37it/s, env_step=99000, gradient_step=9900, len=96, n/ep=0, n/st=100, rew=5142.00]                                                                                  


Epoch #99: test_reward: 9234.700000 ± 3909.337874, best_reward: 18333.700000 ± 5595.801230 in #67


Epoch #100: 1001it [00:02, 406.38it/s, env_step=100000, gradient_step=10000, len=82, n/ep=0, n/st=100, rew=3739.00]                                                                               


Epoch #100: test_reward: 17749.700000 ± 8624.767429, best_reward: 18333.700000 ± 5595.801230 in #67


Epoch #101: 1001it [00:02, 400.25it/s, env_step=101000, gradient_step=10100, len=57, n/ep=0, n/st=100, rew=2528.00]                                                                               


Epoch #101: test_reward: 13329.600000 ± 3539.455755, best_reward: 18333.700000 ± 5595.801230 in #67


Epoch #102: 1001it [00:02, 424.11it/s, env_step=102000, gradient_step=10200, len=179, n/ep=1, n/st=100, rew=10074.00]                                                                             


Epoch #102: test_reward: 11216.800000 ± 6291.687179, best_reward: 18333.700000 ± 5595.801230 in #67


Epoch #103: 1001it [00:02, 385.45it/s, env_step=103000, gradient_step=10300, len=134, n/ep=1, n/st=100, rew=8133.00]                                                                              


Epoch #103: test_reward: 15121.000000 ± 4833.666186, best_reward: 18333.700000 ± 5595.801230 in #67


Epoch #104: 1001it [00:02, 392.63it/s, env_step=104000, gradient_step=10400, len=120, n/ep=2, n/st=100, rew=7048.50]                                                                              


Epoch #104: test_reward: 11151.200000 ± 5011.756993, best_reward: 18333.700000 ± 5595.801230 in #67


Epoch #105: 1001it [00:02, 382.82it/s, env_step=105000, gradient_step=10500, len=125, n/ep=1, n/st=100, rew=5953.00]                                                                              


Epoch #105: test_reward: 10837.600000 ± 2522.134937, best_reward: 18333.700000 ± 5595.801230 in #67


Epoch #106: 1001it [00:02, 362.05it/s, env_step=106000, gradient_step=10600, len=166, n/ep=1, n/st=100, rew=10345.00]                                                                             


Epoch #106: test_reward: 13882.400000 ± 4703.041084, best_reward: 18333.700000 ± 5595.801230 in #67


Epoch #107: 1001it [00:02, 345.46it/s, env_step=107000, gradient_step=10700, len=155, n/ep=2, n/st=100, rew=8143.50]                                                                              


Epoch #107: test_reward: 12803.100000 ± 4326.131631, best_reward: 18333.700000 ± 5595.801230 in #67


Epoch #108: 1001it [00:02, 385.45it/s, env_step=108000, gradient_step=10800, len=126, n/ep=1, n/st=100, rew=7754.50]                                                                              


Epoch #108: test_reward: 13180.500000 ± 3767.607338, best_reward: 18333.700000 ± 5595.801230 in #67


Epoch #109: 1001it [00:03, 332.45it/s, env_step=109000, gradient_step=10900, len=57, n/ep=1, n/st=100, rew=2841.00]                                                                               


Epoch #109: test_reward: 14539.600000 ± 4523.107653, best_reward: 18333.700000 ± 5595.801230 in #67


Epoch #110: 1001it [00:02, 359.66it/s, env_step=110000, gradient_step=11000, len=112, n/ep=1, n/st=100, rew=6428.00]                                                                              


Epoch #110: test_reward: 12702.500000 ± 5262.230501, best_reward: 18333.700000 ± 5595.801230 in #67


Epoch #111: 1001it [00:02, 388.54it/s, env_step=111000, gradient_step=11100, len=107, n/ep=1, n/st=100, rew=5888.00]                                                                              


Epoch #111: test_reward: 14182.900000 ± 5309.301262, best_reward: 18333.700000 ± 5595.801230 in #67


Epoch #112: 1001it [00:02, 413.99it/s, env_step=112000, gradient_step=11200, len=212, n/ep=0, n/st=100, rew=13439.00]                                                                             


Epoch #112: test_reward: 15980.600000 ± 4985.167042, best_reward: 18333.700000 ± 5595.801230 in #67


Epoch #113: 1001it [00:02, 395.44it/s, env_step=113000, gradient_step=11300, len=152, n/ep=0, n/st=100, rew=8550.50]                                                                              


Epoch #113: test_reward: 10282.400000 ± 4758.505946, best_reward: 18333.700000 ± 5595.801230 in #67


Epoch #114: 1001it [00:02, 374.40it/s, env_step=114000, gradient_step=11400, len=110, n/ep=1, n/st=100, rew=6223.50]                                                                              


Epoch #114: test_reward: 10426.400000 ± 4769.943547, best_reward: 18333.700000 ± 5595.801230 in #67


Epoch #115: 1001it [00:02, 358.73it/s, env_step=115000, gradient_step=11500, len=110, n/ep=0, n/st=100, rew=6223.50]                                                                              


Epoch #115: test_reward: 10706.800000 ± 4088.859934, best_reward: 18333.700000 ± 5595.801230 in #67


Epoch #116: 1001it [00:02, 370.58it/s, env_step=116000, gradient_step=11600, len=213, n/ep=0, n/st=100, rew=12645.50]                                                                             


Epoch #116: test_reward: 8558.700000 ± 5419.682261, best_reward: 18333.700000 ± 5595.801230 in #67


Epoch #117: 1001it [00:03, 333.53it/s, env_step=117000, gradient_step=11700, len=250, n/ep=0, n/st=100, rew=16387.50]                                                                             


Epoch #117: test_reward: 10494.000000 ± 3100.136997, best_reward: 18333.700000 ± 5595.801230 in #67


Epoch #118: 1001it [00:02, 399.11it/s, env_step=118000, gradient_step=11800, len=135, n/ep=1, n/st=100, rew=7752.00]                                                                              


Epoch #118: test_reward: 13441.000000 ± 3226.312477, best_reward: 18333.700000 ± 5595.801230 in #67


Epoch #119: 1001it [00:02, 425.54it/s, env_step=119000, gradient_step=11900, len=133, n/ep=1, n/st=100, rew=8109.00]                                                                              


Epoch #119: test_reward: 11767.800000 ± 4641.211088, best_reward: 18333.700000 ± 5595.801230 in #67


Epoch #120: 1001it [00:02, 365.40it/s, env_step=120000, gradient_step=12000, len=201, n/ep=0, n/st=100, rew=12138.00]                                                                             


Epoch #120: test_reward: 8764.800000 ± 3355.907263, best_reward: 18333.700000 ± 5595.801230 in #67


Epoch #121: 1001it [00:03, 330.06it/s, env_step=121000, gradient_step=12100, len=270, n/ep=1, n/st=100, rew=18094.00]                                                                             


Epoch #121: test_reward: 15623.500000 ± 6751.455625, best_reward: 18333.700000 ± 5595.801230 in #67


Epoch #122: 1001it [00:02, 437.99it/s, env_step=122000, gradient_step=12200, len=265, n/ep=0, n/st=100, rew=16767.50]                                                                             


Epoch #122: test_reward: 17552.800000 ± 4409.141681, best_reward: 18333.700000 ± 5595.801230 in #67


Epoch #123: 1001it [00:02, 386.06it/s, env_step=123000, gradient_step=12300, len=67, n/ep=0, n/st=100, rew=2482.00]                                                                               


Epoch #123: test_reward: 11071.200000 ± 4463.814015, best_reward: 18333.700000 ± 5595.801230 in #67


Epoch #124: 1001it [00:02, 391.50it/s, env_step=124000, gradient_step=12400, len=137, n/ep=1, n/st=100, rew=7705.00]                                                                              


Epoch #124: test_reward: 11766.400000 ± 3998.423144, best_reward: 18333.700000 ± 5595.801230 in #67


Epoch #125: 1001it [00:02, 358.97it/s, env_step=125000, gradient_step=12500, len=171, n/ep=0, n/st=100, rew=10644.25]                                                                             


Epoch #125: test_reward: 12254.000000 ± 5817.934840, best_reward: 18333.700000 ± 5595.801230 in #67


Epoch #126: 1001it [00:02, 349.01it/s, env_step=126000, gradient_step=12600, len=20, n/ep=0, n/st=100, rew=778.00]                                                                                


Epoch #126: test_reward: 12883.000000 ± 3774.922330, best_reward: 18333.700000 ± 5595.801230 in #67


Epoch #127: 1001it [00:02, 382.39it/s, env_step=127000, gradient_step=12700, len=169, n/ep=2, n/st=100, rew=9650.00]                                                                              


Epoch #127: test_reward: 9869.900000 ± 3239.909642, best_reward: 18333.700000 ± 5595.801230 in #67


Epoch #128: 1001it [00:02, 365.43it/s, env_step=128000, gradient_step=12800, len=204, n/ep=1, n/st=100, rew=13518.50]                                                                             


Epoch #128: test_reward: 13096.400000 ± 6724.104315, best_reward: 18333.700000 ± 5595.801230 in #67


Epoch #129: 1001it [00:02, 404.62it/s, env_step=129000, gradient_step=12900, len=400, n/ep=0, n/st=100, rew=26771.00]                                                                             


Epoch #129: test_reward: 8361.200000 ± 1889.092893, best_reward: 18333.700000 ± 5595.801230 in #67


Epoch #130: 1001it [00:02, 404.12it/s, env_step=130000, gradient_step=13000, len=215, n/ep=0, n/st=100, rew=12010.00]                                                                             


Epoch #130: test_reward: 16219.800000 ± 5596.582132, best_reward: 18333.700000 ± 5595.801230 in #67


Epoch #131: 1001it [00:02, 412.65it/s, env_step=131000, gradient_step=13100, len=122, n/ep=0, n/st=100, rew=6289.00]                                                                              


Epoch #131: test_reward: 12223.700000 ± 5394.994755, best_reward: 18333.700000 ± 5595.801230 in #67


Epoch #132: 1001it [00:02, 338.86it/s, env_step=132000, gradient_step=13200, len=158, n/ep=1, n/st=100, rew=9184.00]                                                                              


Epoch #132: test_reward: 13362.700000 ± 4341.446074, best_reward: 18333.700000 ± 5595.801230 in #67


Epoch #133: 1001it [00:02, 335.10it/s, env_step=133000, gradient_step=13300, len=169, n/ep=0, n/st=100, rew=11276.50]                                                                             


Epoch #133: test_reward: 13879.100000 ± 5829.361036, best_reward: 18333.700000 ± 5595.801230 in #67


Epoch #134: 1001it [00:02, 344.45it/s, env_step=134000, gradient_step=13400, len=205, n/ep=0, n/st=100, rew=12698.50]                                                                             


Epoch #134: test_reward: 10530.800000 ± 3181.268074, best_reward: 18333.700000 ± 5595.801230 in #67


Epoch #135: 1001it [00:02, 410.34it/s, env_step=135000, gradient_step=13500, len=116, n/ep=1, n/st=100, rew=7002.00]                                                                              


Epoch #135: test_reward: 12387.100000 ± 5326.680025, best_reward: 18333.700000 ± 5595.801230 in #67


Epoch #136: 1001it [00:02, 401.35it/s, env_step=136000, gradient_step=13600, len=113, n/ep=0, n/st=100, rew=6731.00]                                                                              


Epoch #136: test_reward: 16527.300000 ± 7073.600074, best_reward: 18333.700000 ± 5595.801230 in #67


Epoch #137: 1001it [00:02, 377.55it/s, env_step=137000, gradient_step=13700, len=152, n/ep=1, n/st=100, rew=8497.00]                                                                              


Epoch #137: test_reward: 14545.500000 ± 3231.953442, best_reward: 18333.700000 ± 5595.801230 in #67


Epoch #138: 1001it [00:02, 389.60it/s, env_step=138000, gradient_step=13800, len=153, n/ep=0, n/st=100, rew=9589.00]                                                                              


Epoch #138: test_reward: 13372.900000 ± 4756.268946, best_reward: 18333.700000 ± 5595.801230 in #67


Epoch #139: 1001it [00:02, 361.74it/s, env_step=139000, gradient_step=13900, len=209, n/ep=1, n/st=100, rew=13439.00]                                                                             


Epoch #139: test_reward: 10808.800000 ± 2122.837714, best_reward: 18333.700000 ± 5595.801230 in #67


Epoch #140: 1001it [00:03, 305.52it/s, env_step=140000, gradient_step=14000, len=264, n/ep=0, n/st=100, rew=17055.00]                                                                             


Epoch #140: test_reward: 10312.900000 ± 4123.794865, best_reward: 18333.700000 ± 5595.801230 in #67


Epoch #141: 1001it [00:02, 346.40it/s, env_step=141000, gradient_step=14100, len=124, n/ep=0, n/st=100, rew=7442.50]                                                                              


Epoch #141: test_reward: 11556.000000 ± 4325.371360, best_reward: 18333.700000 ± 5595.801230 in #67


Epoch #142: 1001it [00:02, 396.09it/s, env_step=142000, gradient_step=14200, len=167, n/ep=1, n/st=100, rew=10890.50]                                                                             


Epoch #142: test_reward: 8147.400000 ± 4481.921735, best_reward: 18333.700000 ± 5595.801230 in #67


Epoch #143: 1001it [00:02, 375.15it/s, env_step=143000, gradient_step=14300, len=166, n/ep=1, n/st=100, rew=9899.00]                                                                              


Epoch #143: test_reward: 7012.700000 ± 2377.567204, best_reward: 18333.700000 ± 5595.801230 in #67


Epoch #144: 1001it [00:02, 401.41it/s, env_step=144000, gradient_step=14400, len=149, n/ep=1, n/st=100, rew=9292.50]                                                                              


Epoch #144: test_reward: 13042.100000 ± 5735.966674, best_reward: 18333.700000 ± 5595.801230 in #67


Epoch #145: 1001it [00:02, 419.27it/s, env_step=145000, gradient_step=14500, len=228, n/ep=0, n/st=100, rew=14064.00]                                                                             


Epoch #145: test_reward: 10973.200000 ± 5188.828631, best_reward: 18333.700000 ± 5595.801230 in #67


Epoch #146: 1001it [00:02, 378.83it/s, env_step=146000, gradient_step=14600, len=145, n/ep=1, n/st=100, rew=8639.00]                                                                              


Epoch #146: test_reward: 14739.700000 ± 3940.987289, best_reward: 18333.700000 ± 5595.801230 in #67


Epoch #147: 1001it [00:02, 376.94it/s, env_step=147000, gradient_step=14700, len=172, n/ep=0, n/st=100, rew=9568.50]                                                                              


Epoch #147: test_reward: 12498.500000 ± 4704.161588, best_reward: 18333.700000 ± 5595.801230 in #67


Epoch #148: 1001it [00:02, 392.09it/s, env_step=148000, gradient_step=14800, len=103, n/ep=0, n/st=100, rew=5760.00]                                                                              


Epoch #148: test_reward: 8709.900000 ± 4467.002674, best_reward: 18333.700000 ± 5595.801230 in #67


Epoch #149: 1001it [00:03, 318.27it/s, env_step=149000, gradient_step=14900, len=133, n/ep=0, n/st=100, rew=7994.50]                                                                              


Epoch #149: test_reward: 9207.400000 ± 4827.364751, best_reward: 18333.700000 ± 5595.801230 in #67


Epoch #150: 1001it [00:02, 344.04it/s, env_step=150000, gradient_step=15000, len=84, n/ep=1, n/st=100, rew=4612.00]                                                                               


Epoch #150: test_reward: 10862.600000 ± 5258.773777, best_reward: 18333.700000 ± 5595.801230 in #67

InfoStats(gradient_step=15000, best_reward=18333.7, best_reward_std=5595.801230386941, train_step=150000, train_episode=767, test_step=326620, test_episode=1510, timing=TimingStats(total_time=572.9335129261017, train_time=397.81662702560425, train_time_collect=52.594566345214844, train_time_update=339.2297031879425, test_time=175.11688590049744, update_speed=377.0581464166547))

(the trained policy can be accessed via policy.policies[agents[1]])
Train with agent_opponent #1


Epoch #1: 1001it [00:02, 443.86it/s, env_step=1000, gradient_step=100, len=0, n/ep=0, n/st=100, rew=0.00]                                                                                         


Epoch #1: test_reward: 11652.700000 ± 4714.504959, best_reward: 11652.700000 ± 4714.504959 in #1


Epoch #2: 1001it [00:02, 499.26it/s, env_step=2000, gradient_step=200, len=0, n/ep=0, n/st=100, rew=0.00]                                                                                         


Epoch #2: test_reward: 8919.000000 ± 3815.733245, best_reward: 11652.700000 ± 4714.504959 in #1


Epoch #3: 1001it [00:02, 444.49it/s, env_step=3000, gradient_step=300, len=0, n/ep=0, n/st=100, rew=0.00]                                                                                         


Epoch #3: test_reward: 10345.800000 ± 4932.199303, best_reward: 11652.700000 ± 4714.504959 in #1


Epoch #4: 1001it [00:01, 519.42it/s, env_step=4000, gradient_step=400, len=0, n/ep=0, n/st=100, rew=0.00]                                                                                         


Epoch #4: test_reward: 12115.700000 ± 5412.915666, best_reward: 12115.700000 ± 5412.915666 in #4


Epoch #5: 1001it [00:02, 453.49it/s, env_step=5000, gradient_step=500, len=0, n/ep=0, n/st=100, rew=0.00]                                                                                         


Epoch #5: test_reward: 12111.400000 ± 3400.301904, best_reward: 12115.700000 ± 5412.915666 in #4


Epoch #6: 1001it [00:01, 540.72it/s, env_step=6000, gradient_step=600, len=59, n/ep=0, n/st=100, rew=2400.50]                                                                                     


Epoch #6: test_reward: 13845.500000 ± 3684.962937, best_reward: 13845.500000 ± 3684.962937 in #6


Epoch #7: 1001it [00:02, 462.53it/s, env_step=7000, gradient_step=700, len=61, n/ep=0, n/st=100, rew=2358.00]                                                                                     


Epoch #7: test_reward: 8891.400000 ± 4522.996246, best_reward: 13845.500000 ± 3684.962937 in #6


Epoch #8: 1001it [00:02, 495.71it/s, env_step=8000, gradient_step=800, len=75, n/ep=0, n/st=100, rew=3409.00]                                                                                     


Epoch #8: test_reward: 14366.500000 ± 4087.502862, best_reward: 14366.500000 ± 4087.502862 in #8


Epoch #9: 1001it [00:02, 387.56it/s, env_step=9000, gradient_step=900, len=75, n/ep=0, n/st=100, rew=3409.00]                                                                                     


Epoch #9: test_reward: 12607.000000 ± 6265.212877, best_reward: 14366.500000 ± 4087.502862 in #8


Epoch #10: 1001it [00:02, 380.43it/s, env_step=10000, gradient_step=1000, len=99, n/ep=0, n/st=100, rew=4835.00]                                                                                  


Epoch #10: test_reward: 8603.900000 ± 4764.601903, best_reward: 14366.500000 ± 4087.502862 in #8


Epoch #11: 1001it [00:02, 403.53it/s, env_step=11000, gradient_step=1100, len=106, n/ep=0, n/st=100, rew=4739.00]                                                                                 


Epoch #11: test_reward: 18048.100000 ± 6842.222394, best_reward: 18048.100000 ± 6842.222394 in #11


Epoch #12: 1001it [00:02, 397.42it/s, env_step=12000, gradient_step=1200, len=106, n/ep=0, n/st=100, rew=4739.00]                                                                                 


Epoch #12: test_reward: 9560.200000 ± 2016.194574, best_reward: 18048.100000 ± 6842.222394 in #11


Epoch #13: 1001it [00:02, 380.96it/s, env_step=13000, gradient_step=1300, len=130, n/ep=2, n/st=100, rew=6458.00]                                                                                 


Epoch #13: test_reward: 11950.600000 ± 3488.215251, best_reward: 18048.100000 ± 6842.222394 in #11


Epoch #14: 1001it [00:02, 417.11it/s, env_step=14000, gradient_step=1400, len=138, n/ep=0, n/st=100, rew=7454.00]                                                                                 


Epoch #14: test_reward: 13884.700000 ± 4565.424625, best_reward: 18048.100000 ± 6842.222394 in #11


Epoch #15: 1001it [00:02, 422.10it/s, env_step=15000, gradient_step=1500, len=86, n/ep=0, n/st=100, rew=3490.00]                                                                                  


Epoch #15: test_reward: 8868.600000 ± 3723.875836, best_reward: 18048.100000 ± 6842.222394 in #11


Epoch #16: 1001it [00:03, 328.07it/s, env_step=16000, gradient_step=1600, len=158, n/ep=0, n/st=100, rew=7628.00]                                                                                 


Epoch #16: test_reward: 13848.000000 ± 5323.454724, best_reward: 18048.100000 ± 6842.222394 in #11


Epoch #17: 1001it [00:02, 357.26it/s, env_step=17000, gradient_step=1700, len=168, n/ep=0, n/st=100, rew=8442.00]                                                                                 


Epoch #17: test_reward: 10053.300000 ± 3281.274754, best_reward: 18048.100000 ± 6842.222394 in #11


Epoch #18: 1001it [00:02, 441.94it/s, env_step=18000, gradient_step=1800, len=180, n/ep=2, n/st=100, rew=10271.25]                                                                                


Epoch #18: test_reward: 9750.500000 ± 2346.152265, best_reward: 18048.100000 ± 6842.222394 in #11


Epoch #19: 1001it [00:02, 455.64it/s, env_step=19000, gradient_step=1900, len=185, n/ep=0, n/st=100, rew=10714.50]                                                                                


Epoch #19: test_reward: 12730.700000 ± 4875.961342, best_reward: 18048.100000 ± 6842.222394 in #11


Epoch #20: 1001it [00:02, 447.94it/s, env_step=20000, gradient_step=2000, len=200, n/ep=1, n/st=100, rew=11294.50]                                                                                


Epoch #20: test_reward: 12252.000000 ± 4039.105124, best_reward: 18048.100000 ± 6842.222394 in #11


Epoch #21: 1001it [00:02, 447.56it/s, env_step=21000, gradient_step=2100, len=210, n/ep=1, n/st=100, rew=11715.50]                                                                                


Epoch #21: test_reward: 13235.000000 ± 5817.712162, best_reward: 18048.100000 ± 6842.222394 in #11


Epoch #22: 1001it [00:02, 392.36it/s, env_step=22000, gradient_step=2200, len=218, n/ep=0, n/st=100, rew=11817.50]                                                                                


Epoch #22: test_reward: 11512.600000 ± 4562.064625, best_reward: 18048.100000 ± 6842.222394 in #11


Epoch #23: 1001it [00:02, 441.67it/s, env_step=23000, gradient_step=2300, len=36, n/ep=1, n/st=100, rew=822.00]                                                                                   


Epoch #23: test_reward: 15330.400000 ± 5184.106002, best_reward: 18048.100000 ± 6842.222394 in #11


Epoch #24: 1001it [00:02, 457.60it/s, env_step=24000, gradient_step=2400, len=233, n/ep=0, n/st=100, rew=12566.00]                                                                                


Epoch #24: test_reward: 6107.600000 ± 3583.494027, best_reward: 18048.100000 ± 6842.222394 in #11


Epoch #25: 1001it [00:03, 323.07it/s, env_step=25000, gradient_step=2500, len=161, n/ep=2, n/st=100, rew=8850.00]                                                                                 


Epoch #25: test_reward: 10397.500000 ± 4595.573201, best_reward: 18048.100000 ± 6842.222394 in #11


Epoch #26: 1001it [00:02, 385.52it/s, env_step=26000, gradient_step=2600, len=259, n/ep=0, n/st=100, rew=15196.50]                                                                                


Epoch #26: test_reward: 11866.900000 ± 3956.842288, best_reward: 18048.100000 ± 6842.222394 in #11


Epoch #27: 1001it [00:02, 388.42it/s, env_step=27000, gradient_step=2700, len=268, n/ep=0, n/st=100, rew=15058.50]                                                                                


Epoch #27: test_reward: 13971.400000 ± 3705.095551, best_reward: 18048.100000 ± 6842.222394 in #11


Epoch #28: 1001it [00:02, 369.68it/s, env_step=28000, gradient_step=2800, len=126, n/ep=1, n/st=100, rew=4841.00]                                                                                 


Epoch #28: test_reward: 8794.800000 ± 5440.984631, best_reward: 18048.100000 ± 6842.222394 in #11


Epoch #29: 1001it [00:02, 437.92it/s, env_step=29000, gradient_step=2900, len=286, n/ep=0, n/st=100, rew=16780.50]                                                                                


Epoch #29: test_reward: 18038.900000 ± 2047.451413, best_reward: 18048.100000 ± 6842.222394 in #11


Epoch #30: 1001it [00:02, 397.17it/s, env_step=30000, gradient_step=3000, len=299, n/ep=0, n/st=100, rew=18028.50]                                                                                


Epoch #30: test_reward: 11520.400000 ± 3127.491845, best_reward: 18048.100000 ± 6842.222394 in #11


Epoch #31: 1001it [00:02, 388.62it/s, env_step=31000, gradient_step=3100, len=82, n/ep=2, n/st=100, rew=4138.00]                                                                                  


Epoch #31: test_reward: 15118.100000 ± 3484.698278, best_reward: 18048.100000 ± 6842.222394 in #11


Epoch #32: 1001it [00:02, 427.64it/s, env_step=32000, gradient_step=3200, len=318, n/ep=0, n/st=100, rew=18800.00]                                                                                


Epoch #32: test_reward: 11645.300000 ± 4243.607052, best_reward: 18048.100000 ± 6842.222394 in #11


Epoch #33: 1001it [00:02, 450.73it/s, env_step=33000, gradient_step=3300, len=184, n/ep=1, n/st=100, rew=9253.00]                                                                                 


Epoch #33: test_reward: 11222.300000 ± 4531.009072, best_reward: 18048.100000 ± 6842.222394 in #11


Epoch #34: 1001it [00:02, 391.19it/s, env_step=34000, gradient_step=3400, len=164, n/ep=0, n/st=100, rew=7525.75]                                                                                 


Epoch #34: test_reward: 11060.600000 ± 4920.357796, best_reward: 18048.100000 ± 6842.222394 in #11


Epoch #35: 1001it [00:02, 427.98it/s, env_step=35000, gradient_step=3500, len=243, n/ep=2, n/st=100, rew=14183.00]                                                                                


Epoch #35: test_reward: 10571.900000 ± 3245.470057, best_reward: 18048.100000 ± 6842.222394 in #11


Epoch #36: 1001it [00:02, 415.14it/s, env_step=36000, gradient_step=3600, len=160, n/ep=1, n/st=100, rew=8263.50]                                                                                 


Epoch #36: test_reward: 11561.100000 ± 5199.716770, best_reward: 18048.100000 ± 6842.222394 in #11


Epoch #37: 1001it [00:02, 344.14it/s, env_step=37000, gradient_step=3700, len=246, n/ep=0, n/st=100, rew=12227.00]                                                                                


Epoch #37: test_reward: 7194.000000 ± 3388.485798, best_reward: 18048.100000 ± 6842.222394 in #11


Epoch #38: 1001it [00:02, 419.47it/s, env_step=38000, gradient_step=3800, len=229, n/ep=0, n/st=100, rew=12541.00]                                                                                


Epoch #38: test_reward: 13250.800000 ± 5866.484481, best_reward: 18048.100000 ± 6842.222394 in #11


Epoch #39: 1001it [00:02, 359.86it/s, env_step=39000, gradient_step=3900, len=162, n/ep=0, n/st=100, rew=8995.00]                                                                                 


Epoch #39: test_reward: 13787.900000 ± 6245.122392, best_reward: 18048.100000 ± 6842.222394 in #11


Epoch #40: 1001it [00:02, 368.00it/s, env_step=40000, gradient_step=4000, len=289, n/ep=4, n/st=100, rew=17373.38]                                                                                


Epoch #40: test_reward: 12216.900000 ± 3442.108204, best_reward: 18048.100000 ± 6842.222394 in #11


Epoch #41: 1001it [00:02, 416.54it/s, env_step=41000, gradient_step=4100, len=199, n/ep=0, n/st=100, rew=11810.00]                                                                                


Epoch #41: test_reward: 7379.200000 ± 3505.067155, best_reward: 18048.100000 ± 6842.222394 in #11


Epoch #42: 1001it [00:03, 327.74it/s, env_step=42000, gradient_step=4200, len=156, n/ep=0, n/st=100, rew=7543.00]                                                                                 


Epoch #42: test_reward: 10757.300000 ± 6803.758139, best_reward: 18048.100000 ± 6842.222394 in #11


Epoch #43: 1001it [00:02, 335.65it/s, env_step=43000, gradient_step=4300, len=261, n/ep=2, n/st=100, rew=14796.25]                                                                                


Epoch #43: test_reward: 7262.900000 ± 2332.648943, best_reward: 18048.100000 ± 6842.222394 in #11


Epoch #44: 1001it [00:02, 404.44it/s, env_step=44000, gradient_step=4400, len=223, n/ep=1, n/st=100, rew=10299.50]                                                                                


Epoch #44: test_reward: 13006.600000 ± 6018.334208, best_reward: 18048.100000 ± 6842.222394 in #11


Epoch #45: 1001it [00:02, 350.07it/s, env_step=45000, gradient_step=4500, len=277, n/ep=1, n/st=100, rew=15236.00]                                                                                


Epoch #45: test_reward: 8771.900000 ± 3390.850113, best_reward: 18048.100000 ± 6842.222394 in #11


Epoch #46: 1001it [00:02, 363.25it/s, env_step=46000, gradient_step=4600, len=207, n/ep=0, n/st=100, rew=9821.00]                                                                                 


Epoch #46: test_reward: 12453.400000 ± 4599.772542, best_reward: 18048.100000 ± 6842.222394 in #11


Epoch #47: 1001it [00:02, 450.34it/s, env_step=47000, gradient_step=4700, len=239, n/ep=1, n/st=100, rew=13519.50]                                                                                


Epoch #47: test_reward: 9847.200000 ± 4473.913584, best_reward: 18048.100000 ± 6842.222394 in #11


Epoch #48: 1001it [00:02, 424.65it/s, env_step=48000, gradient_step=4800, len=168, n/ep=0, n/st=100, rew=8251.00]                                                                                 


Epoch #48: test_reward: 9394.500000 ± 2106.923788, best_reward: 18048.100000 ± 6842.222394 in #11


Epoch #49: 1001it [00:02, 430.48it/s, env_step=49000, gradient_step=4900, len=273, n/ep=0, n/st=100, rew=15344.17]                                                                                


Epoch #49: test_reward: 13848.100000 ± 2956.732739, best_reward: 18048.100000 ± 6842.222394 in #11


Epoch #50: 1001it [00:02, 430.92it/s, env_step=50000, gradient_step=5000, len=115, n/ep=1, n/st=100, rew=5274.00]                                                                                 


Epoch #50: test_reward: 9101.600000 ± 4039.082079, best_reward: 18048.100000 ± 6842.222394 in #11


Epoch #51: 1001it [00:02, 434.58it/s, env_step=51000, gradient_step=5100, len=273, n/ep=0, n/st=100, rew=15703.00]                                                                                


Epoch #51: test_reward: 11515.300000 ± 3451.817899, best_reward: 18048.100000 ± 6842.222394 in #11


Epoch #52: 1001it [00:02, 386.15it/s, env_step=52000, gradient_step=5200, len=247, n/ep=0, n/st=100, rew=14018.00]                                                                                


Epoch #52: test_reward: 11066.900000 ± 4719.447647, best_reward: 18048.100000 ± 6842.222394 in #11


Epoch #53: 1001it [00:03, 315.98it/s, env_step=53000, gradient_step=5300, len=249, n/ep=0, n/st=100, rew=13983.50]                                                                                


Epoch #53: test_reward: 12937.000000 ± 3404.730503, best_reward: 18048.100000 ± 6842.222394 in #11


Epoch #54: 1001it [00:02, 350.13it/s, env_step=54000, gradient_step=5400, len=164, n/ep=0, n/st=100, rew=8893.25]                                                                                 


Epoch #54: test_reward: 11284.800000 ± 4470.352756, best_reward: 18048.100000 ± 6842.222394 in #11


Epoch #55: 1001it [00:02, 377.18it/s, env_step=55000, gradient_step=5500, len=225, n/ep=0, n/st=100, rew=12172.50]                                                                                


Epoch #55: test_reward: 11322.200000 ± 4456.591114, best_reward: 18048.100000 ± 6842.222394 in #11


Epoch #56: 1001it [00:02, 413.99it/s, env_step=56000, gradient_step=5600, len=160, n/ep=1, n/st=100, rew=8301.50]                                                                                 


Epoch #56: test_reward: 11945.600000 ± 2209.346564, best_reward: 18048.100000 ± 6842.222394 in #11


Epoch #57: 1001it [00:02, 399.45it/s, env_step=57000, gradient_step=5700, len=245, n/ep=0, n/st=100, rew=13523.00]                                                                                


Epoch #57: test_reward: 7241.600000 ± 3296.235162, best_reward: 18048.100000 ± 6842.222394 in #11


Epoch #58: 1001it [00:02, 360.57it/s, env_step=58000, gradient_step=5800, len=250, n/ep=1, n/st=100, rew=14936.00]                                                                                


Epoch #58: test_reward: 12575.400000 ± 5258.394569, best_reward: 18048.100000 ± 6842.222394 in #11


Epoch #59: 1001it [00:02, 421.66it/s, env_step=59000, gradient_step=5900, len=224, n/ep=0, n/st=100, rew=11248.50]                                                                                


Epoch #59: test_reward: 13886.200000 ± 6221.960749, best_reward: 18048.100000 ± 6842.222394 in #11


Epoch #60: 1001it [00:02, 340.46it/s, env_step=60000, gradient_step=6000, len=333, n/ep=1, n/st=100, rew=19076.00]                                                                                


Epoch #60: test_reward: 8921.300000 ± 5147.978711, best_reward: 18048.100000 ± 6842.222394 in #11


Epoch #61: 1001it [00:02, 425.48it/s, env_step=61000, gradient_step=6100, len=238, n/ep=1, n/st=100, rew=14833.00]                                                                                


Epoch #61: test_reward: 9350.900000 ± 5370.683875, best_reward: 18048.100000 ± 6842.222394 in #11


Epoch #62: 1001it [00:02, 440.67it/s, env_step=62000, gradient_step=6200, len=169, n/ep=0, n/st=100, rew=9758.00]                                                                                 


Epoch #62: test_reward: 9700.800000 ± 4676.514937, best_reward: 18048.100000 ± 6842.222394 in #11


Epoch #63: 1001it [00:02, 371.55it/s, env_step=63000, gradient_step=6300, len=230, n/ep=1, n/st=100, rew=14069.00]                                                                                


Epoch #63: test_reward: 15926.400000 ± 4352.379446, best_reward: 18048.100000 ± 6842.222394 in #11


Epoch #64: 1001it [00:02, 419.97it/s, env_step=64000, gradient_step=6400, len=325, n/ep=0, n/st=100, rew=18257.50]                                                                                


Epoch #64: test_reward: 11817.600000 ± 4354.115185, best_reward: 18048.100000 ± 6842.222394 in #11


Epoch #65: 1001it [00:02, 428.71it/s, env_step=65000, gradient_step=6500, len=309, n/ep=1, n/st=100, rew=16856.00]                                                                                


Epoch #65: test_reward: 10918.200000 ± 6478.093204, best_reward: 18048.100000 ± 6842.222394 in #11


Epoch #66: 1001it [00:02, 374.72it/s, env_step=66000, gradient_step=6600, len=226, n/ep=0, n/st=100, rew=12561.00]                                                                                


Epoch #66: test_reward: 9170.300000 ± 4181.615455, best_reward: 18048.100000 ± 6842.222394 in #11


Epoch #67: 1001it [00:02, 357.32it/s, env_step=67000, gradient_step=6700, len=254, n/ep=0, n/st=100, rew=14541.50]                                                                                


Epoch #67: test_reward: 11867.400000 ± 6933.101358, best_reward: 18048.100000 ± 6842.222394 in #11


Epoch #68: 1001it [00:02, 411.86it/s, env_step=68000, gradient_step=6800, len=211, n/ep=1, n/st=100, rew=12047.50]                                                                                


Epoch #68: test_reward: 9359.300000 ± 5242.907801, best_reward: 18048.100000 ± 6842.222394 in #11


Epoch #69: 1001it [00:02, 421.56it/s, env_step=69000, gradient_step=6900, len=283, n/ep=0, n/st=100, rew=16454.50]                                                                                


Epoch #69: test_reward: 5708.400000 ± 5521.106777, best_reward: 18048.100000 ± 6842.222394 in #11


Epoch #70: 1001it [00:02, 375.76it/s, env_step=70000, gradient_step=7000, len=273, n/ep=2, n/st=100, rew=16166.00]                                                                                


Epoch #70: test_reward: 8870.700000 ± 5243.677146, best_reward: 18048.100000 ± 6842.222394 in #11


Epoch #71: 1001it [00:02, 408.71it/s, env_step=71000, gradient_step=7100, len=157, n/ep=0, n/st=100, rew=8992.00]                                                                                 


Epoch #71: test_reward: 8188.800000 ± 5575.300204, best_reward: 18048.100000 ± 6842.222394 in #11


Epoch #72: 1001it [00:03, 322.89it/s, env_step=72000, gradient_step=7200, len=176, n/ep=0, n/st=100, rew=10419.00]                                                                                


Epoch #72: test_reward: 10631.400000 ± 4272.016976, best_reward: 18048.100000 ± 6842.222394 in #11


Epoch #73: 1001it [00:02, 379.13it/s, env_step=73000, gradient_step=7300, len=183, n/ep=1, n/st=100, rew=10469.00]                                                                                


Epoch #73: test_reward: 4938.700000 ± 1990.517423, best_reward: 18048.100000 ± 6842.222394 in #11


Epoch #74: 1001it [00:02, 381.72it/s, env_step=74000, gradient_step=7400, len=121, n/ep=0, n/st=100, rew=5345.00]                                                                                 


Epoch #74: test_reward: 7836.900000 ± 4376.311379, best_reward: 18048.100000 ± 6842.222394 in #11


Epoch #75: 1001it [00:02, 420.58it/s, env_step=75000, gradient_step=7500, len=165, n/ep=0, n/st=100, rew=10148.00]                                                                                


Epoch #75: test_reward: 13532.000000 ± 4889.862145, best_reward: 18048.100000 ± 6842.222394 in #11


Epoch #76: 1001it [00:02, 333.92it/s, env_step=76000, gradient_step=7600, len=55, n/ep=1, n/st=100, rew=2460.00]                                                                                  


Epoch #76: test_reward: 13085.500000 ± 5929.670939, best_reward: 18048.100000 ± 6842.222394 in #11


Epoch #77: 1001it [00:03, 321.57it/s, env_step=77000, gradient_step=7700, len=55, n/ep=0, n/st=100, rew=2460.00]                                                                                  


Epoch #77: test_reward: 9690.400000 ± 5577.371768, best_reward: 18048.100000 ± 6842.222394 in #11


Epoch #78: 1001it [00:03, 311.38it/s, env_step=78000, gradient_step=7800, len=208, n/ep=1, n/st=100, rew=12168.00]                                                                                


Epoch #78: test_reward: 9364.400000 ± 5255.896083, best_reward: 18048.100000 ± 6842.222394 in #11


Epoch #79: 1001it [00:02, 354.97it/s, env_step=79000, gradient_step=7900, len=135, n/ep=1, n/st=100, rew=7567.50]                                                                                 


Epoch #79: test_reward: 9130.000000 ± 4785.932386, best_reward: 18048.100000 ± 6842.222394 in #11


Epoch #80: 1001it [00:02, 383.83it/s, env_step=80000, gradient_step=8000, len=253, n/ep=0, n/st=100, rew=14833.00]                                                                                


Epoch #80: test_reward: 10125.200000 ± 3801.562331, best_reward: 18048.100000 ± 6842.222394 in #11


Epoch #81: 1001it [00:02, 417.17it/s, env_step=81000, gradient_step=8100, len=114, n/ep=0, n/st=100, rew=5844.50]                                                                                 


Epoch #81: test_reward: 6620.200000 ± 3331.932376, best_reward: 18048.100000 ± 6842.222394 in #11


Epoch #82: 1001it [00:02, 400.32it/s, env_step=82000, gradient_step=8200, len=240, n/ep=1, n/st=100, rew=14172.00]                                                                                


Epoch #82: test_reward: 11543.400000 ± 6133.940075, best_reward: 18048.100000 ± 6842.222394 in #11


Epoch #83: 1001it [00:02, 348.77it/s, env_step=83000, gradient_step=8300, len=335, n/ep=3, n/st=100, rew=20157.00]                                                                                


Epoch #83: test_reward: 10620.700000 ± 4804.834566, best_reward: 18048.100000 ± 6842.222394 in #11


Epoch #84: 1001it [00:03, 319.96it/s, env_step=84000, gradient_step=8400, len=19, n/ep=1, n/st=100, rew=613.00]                                                                                   


Epoch #84: test_reward: 19250.900000 ± 3867.904871, best_reward: 19250.900000 ± 3867.904871 in #84


Epoch #85: 1001it [00:02, 391.61it/s, env_step=85000, gradient_step=8500, len=159, n/ep=0, n/st=100, rew=8107.00]                                                                                 


Epoch #85: test_reward: 8311.400000 ± 5982.313402, best_reward: 19250.900000 ± 3867.904871 in #84


Epoch #86: 1001it [00:02, 385.57it/s, env_step=86000, gradient_step=8600, len=297, n/ep=1, n/st=100, rew=19459.50]                                                                                


Epoch #86: test_reward: 10015.400000 ± 3738.499169, best_reward: 19250.900000 ± 3867.904871 in #84


Epoch #87: 1001it [00:02, 421.37it/s, env_step=87000, gradient_step=8700, len=306, n/ep=2, n/st=100, rew=17797.50]                                                                                


Epoch #87: test_reward: 17148.800000 ± 6292.877558, best_reward: 19250.900000 ± 3867.904871 in #84


Epoch #88: 1001it [00:02, 421.46it/s, env_step=88000, gradient_step=8800, len=202, n/ep=1, n/st=100, rew=12569.50]                                                                                


Epoch #88: test_reward: 9757.700000 ± 6234.682030, best_reward: 19250.900000 ± 3867.904871 in #84


Epoch #89: 1001it [00:02, 346.88it/s, env_step=89000, gradient_step=8900, len=361, n/ep=0, n/st=100, rew=21455.00]                                                                                


Epoch #89: test_reward: 11093.800000 ± 5175.020730, best_reward: 19250.900000 ± 3867.904871 in #84


Epoch #90: 1001it [00:02, 433.89it/s, env_step=90000, gradient_step=9000, len=224, n/ep=1, n/st=100, rew=13744.50]                                                                                


Epoch #90: test_reward: 9940.100000 ± 5405.807848, best_reward: 19250.900000 ± 3867.904871 in #84


Epoch #91: 1001it [00:02, 389.08it/s, env_step=91000, gradient_step=9100, len=214, n/ep=1, n/st=100, rew=12070.00]                                                                                


Epoch #91: test_reward: 10955.900000 ± 7721.965118, best_reward: 19250.900000 ± 3867.904871 in #84


Epoch #92: 1001it [00:02, 375.26it/s, env_step=92000, gradient_step=9200, len=361, n/ep=1, n/st=100, rew=20595.00]                                                                                


Epoch #92: test_reward: 12489.700000 ± 4266.314664, best_reward: 19250.900000 ± 3867.904871 in #84


Epoch #93: 1001it [00:02, 425.48it/s, env_step=93000, gradient_step=9300, len=147, n/ep=0, n/st=100, rew=8607.00]                                                                                 


Epoch #93: test_reward: 10775.300000 ± 5165.395455, best_reward: 19250.900000 ± 3867.904871 in #84


Epoch #94: 1001it [00:02, 405.30it/s, env_step=94000, gradient_step=9400, len=97, n/ep=0, n/st=100, rew=5285.25]                                                                                  


Epoch #94: test_reward: 11815.900000 ± 4856.764611, best_reward: 19250.900000 ± 3867.904871 in #84


Epoch #95: 1001it [00:02, 397.51it/s, env_step=95000, gradient_step=9500, len=107, n/ep=0, n/st=100, rew=5080.00]                                                                                 


Epoch #95: test_reward: 9683.000000 ± 3494.663274, best_reward: 19250.900000 ± 3867.904871 in #84


Epoch #96: 1001it [00:02, 394.50it/s, env_step=96000, gradient_step=9600, len=101, n/ep=0, n/st=100, rew=5188.00]                                                                                 


Epoch #96: test_reward: 11357.900000 ± 6317.570505, best_reward: 19250.900000 ± 3867.904871 in #84


Epoch #97: 1001it [00:02, 374.89it/s, env_step=97000, gradient_step=9700, len=217, n/ep=0, n/st=100, rew=12978.50]                                                                                


Epoch #97: test_reward: 8577.200000 ± 2359.937194, best_reward: 19250.900000 ± 3867.904871 in #84


Epoch #98: 1001it [00:02, 365.31it/s, env_step=98000, gradient_step=9800, len=154, n/ep=1, n/st=100, rew=8839.00]                                                                                 


Epoch #98: test_reward: 5503.200000 ± 3618.344201, best_reward: 19250.900000 ± 3867.904871 in #84


Epoch #99: 1001it [00:02, 394.81it/s, env_step=99000, gradient_step=9900, len=39, n/ep=0, n/st=100, rew=1397.50]                                                                                  


Epoch #99: test_reward: 9171.400000 ± 4285.518316, best_reward: 19250.900000 ± 3867.904871 in #84


Epoch #100: 1001it [00:03, 324.94it/s, env_step=100000, gradient_step=10000, len=251, n/ep=2, n/st=100, rew=16093.00]                                                                             


Epoch #100: test_reward: 10555.400000 ± 3616.001665, best_reward: 19250.900000 ± 3867.904871 in #84


Epoch #101: 1001it [00:02, 369.90it/s, env_step=101000, gradient_step=10100, len=268, n/ep=0, n/st=100, rew=16471.00]                                                                             


Epoch #101: test_reward: 10870.300000 ± 6465.082444, best_reward: 19250.900000 ± 3867.904871 in #84


Epoch #102: 1001it [00:02, 361.87it/s, env_step=102000, gradient_step=10200, len=173, n/ep=0, n/st=100, rew=9573.50]                                                                              


Epoch #102: test_reward: 10669.200000 ± 4978.021792, best_reward: 19250.900000 ± 3867.904871 in #84


Epoch #103: 1001it [00:02, 390.93it/s, env_step=103000, gradient_step=10300, len=313, n/ep=1, n/st=100, rew=17418.50]                                                                             


Epoch #103: test_reward: 7699.700000 ± 3920.560063, best_reward: 19250.900000 ± 3867.904871 in #84


Epoch #104: 1001it [00:02, 412.04it/s, env_step=104000, gradient_step=10400, len=114, n/ep=1, n/st=100, rew=5373.00]                                                                              


Epoch #104: test_reward: 9014.600000 ± 2860.697964, best_reward: 19250.900000 ± 3867.904871 in #84


Epoch #105: 1001it [00:02, 387.35it/s, env_step=105000, gradient_step=10500, len=250, n/ep=0, n/st=100, rew=14557.50]                                                                             


Epoch #105: test_reward: 8297.200000 ± 3410.455858, best_reward: 19250.900000 ± 3867.904871 in #84


Epoch #106: 1001it [00:02, 363.23it/s, env_step=106000, gradient_step=10600, len=281, n/ep=0, n/st=100, rew=16468.50]                                                                             


Epoch #106: test_reward: 9563.100000 ± 5003.142242, best_reward: 19250.900000 ± 3867.904871 in #84


Epoch #107: 1001it [00:02, 372.99it/s, env_step=107000, gradient_step=10700, len=139, n/ep=0, n/st=100, rew=7839.00]                                                                              


Epoch #107: test_reward: 9651.700000 ± 3199.768181, best_reward: 19250.900000 ± 3867.904871 in #84


Epoch #108: 1001it [00:02, 361.60it/s, env_step=108000, gradient_step=10800, len=179, n/ep=0, n/st=100, rew=10580.00]                                                                             


Epoch #108: test_reward: 10917.800000 ± 5078.430876, best_reward: 19250.900000 ± 3867.904871 in #84


Epoch #109: 1001it [00:02, 359.58it/s, env_step=109000, gradient_step=10900, len=121, n/ep=1, n/st=100, rew=6744.50]                                                                              


Epoch #109: test_reward: 9852.600000 ± 2789.572017, best_reward: 19250.900000 ± 3867.904871 in #84


Epoch #110: 1001it [00:02, 356.59it/s, env_step=110000, gradient_step=11000, len=121, n/ep=1, n/st=100, rew=6746.50]                                                                              


Epoch #110: test_reward: 10195.200000 ± 5220.328664, best_reward: 19250.900000 ± 3867.904871 in #84


Epoch #111: 1001it [00:02, 381.38it/s, env_step=111000, gradient_step=11100, len=216, n/ep=0, n/st=100, rew=12819.00]                                                                             


Epoch #111: test_reward: 7293.900000 ± 3891.924420, best_reward: 19250.900000 ± 3867.904871 in #84


Epoch #112: 1001it [00:02, 375.32it/s, env_step=112000, gradient_step=11200, len=249, n/ep=0, n/st=100, rew=15436.00]                                                                             


Epoch #112: test_reward: 12412.900000 ± 6028.616034, best_reward: 19250.900000 ± 3867.904871 in #84


Epoch #113: 1001it [00:02, 441.36it/s, env_step=113000, gradient_step=11300, len=268, n/ep=2, n/st=100, rew=15477.00]                                                                             


Epoch #113: test_reward: 9985.700000 ± 4450.911548, best_reward: 19250.900000 ± 3867.904871 in #84


Epoch #114: 1001it [00:02, 390.17it/s, env_step=114000, gradient_step=11400, len=108, n/ep=0, n/st=100, rew=5815.00]                                                                              


Epoch #114: test_reward: 10547.600000 ± 3189.428889, best_reward: 19250.900000 ± 3867.904871 in #84


Epoch #115: 1001it [00:02, 417.74it/s, env_step=115000, gradient_step=11500, len=164, n/ep=0, n/st=100, rew=8964.50]                                                                              


Epoch #115: test_reward: 13174.500000 ± 3313.544665, best_reward: 19250.900000 ± 3867.904871 in #84


Epoch #116: 1001it [00:02, 367.79it/s, env_step=116000, gradient_step=11600, len=319, n/ep=1, n/st=100, rew=17633.00]                                                                             


Epoch #116: test_reward: 12304.000000 ± 5263.836928, best_reward: 19250.900000 ± 3867.904871 in #84


Epoch #117: 1001it [00:02, 358.45it/s, env_step=117000, gradient_step=11700, len=195, n/ep=0, n/st=100, rew=11030.00]                                                                             


Epoch #117: test_reward: 10755.600000 ± 4687.861478, best_reward: 19250.900000 ± 3867.904871 in #84


Epoch #118: 1001it [00:02, 393.51it/s, env_step=118000, gradient_step=11800, len=141, n/ep=1, n/st=100, rew=8371.00]                                                                              


Epoch #118: test_reward: 8163.100000 ± 4126.963737, best_reward: 19250.900000 ± 3867.904871 in #84


Epoch #119: 1001it [00:02, 436.76it/s, env_step=119000, gradient_step=11900, len=142, n/ep=1, n/st=100, rew=6279.00]                                                                              


Epoch #119: test_reward: 9700.600000 ± 2443.624693, best_reward: 19250.900000 ± 3867.904871 in #84


Epoch #120: 1001it [00:02, 396.79it/s, env_step=120000, gradient_step=12000, len=141, n/ep=0, n/st=100, rew=7657.50]                                                                              


Epoch #120: test_reward: 12315.000000 ± 4422.012777, best_reward: 19250.900000 ± 3867.904871 in #84


Epoch #121: 1001it [00:02, 375.33it/s, env_step=121000, gradient_step=12100, len=104, n/ep=0, n/st=100, rew=5267.50]                                                                              


Epoch #121: test_reward: 7998.200000 ± 3577.595528, best_reward: 19250.900000 ± 3867.904871 in #84


Epoch #122: 1001it [00:02, 433.85it/s, env_step=122000, gradient_step=12200, len=125, n/ep=1, n/st=100, rew=6851.50]                                                                              


Epoch #122: test_reward: 11697.000000 ± 2746.144534, best_reward: 19250.900000 ± 3867.904871 in #84


Epoch #123: 1001it [00:02, 420.88it/s, env_step=123000, gradient_step=12300, len=109, n/ep=0, n/st=100, rew=5257.50]                                                                              


Epoch #123: test_reward: 9393.200000 ± 4816.637267, best_reward: 19250.900000 ± 3867.904871 in #84


Epoch #124: 1001it [00:02, 366.44it/s, env_step=124000, gradient_step=12400, len=114, n/ep=0, n/st=100, rew=5258.25]                                                                              


Epoch #124: test_reward: 9312.800000 ± 3790.867178, best_reward: 19250.900000 ± 3867.904871 in #84


Epoch #125: 1001it [00:02, 406.21it/s, env_step=125000, gradient_step=12500, len=105, n/ep=0, n/st=100, rew=4207.00]                                                                              


Epoch #125: test_reward: 10265.400000 ± 4528.685818, best_reward: 19250.900000 ± 3867.904871 in #84


Epoch #126: 1001it [00:02, 405.46it/s, env_step=126000, gradient_step=12600, len=245, n/ep=0, n/st=100, rew=14245.50]                                                                             


Epoch #126: test_reward: 9396.200000 ± 3914.402376, best_reward: 19250.900000 ± 3867.904871 in #84


Epoch #127: 1001it [00:02, 371.02it/s, env_step=127000, gradient_step=12700, len=158, n/ep=4, n/st=100, rew=9124.25]                                                                              


Epoch #127: test_reward: 11530.500000 ± 4111.425087, best_reward: 19250.900000 ± 3867.904871 in #84


Epoch #128: 1001it [00:02, 416.82it/s, env_step=128000, gradient_step=12800, len=152, n/ep=0, n/st=100, rew=7391.25]                                                                              


Epoch #128: test_reward: 12131.400000 ± 7102.303671, best_reward: 19250.900000 ± 3867.904871 in #84


Epoch #129: 1001it [00:02, 386.08it/s, env_step=129000, gradient_step=12900, len=135, n/ep=1, n/st=100, rew=7157.00]                                                                              


Epoch #129: test_reward: 12347.300000 ± 2972.134151, best_reward: 19250.900000 ± 3867.904871 in #84


Epoch #130: 1001it [00:02, 415.17it/s, env_step=130000, gradient_step=13000, len=187, n/ep=2, n/st=100, rew=10946.50]                                                                             


Epoch #130: test_reward: 11528.400000 ± 3188.088901, best_reward: 19250.900000 ± 3867.904871 in #84


Epoch #131: 1001it [00:03, 316.85it/s, env_step=131000, gradient_step=13100, len=109, n/ep=2, n/st=100, rew=5306.00]                                                                              


Epoch #131: test_reward: 13049.600000 ± 4051.062977, best_reward: 19250.900000 ± 3867.904871 in #84


Epoch #132: 1001it [00:02, 421.77it/s, env_step=132000, gradient_step=13200, len=267, n/ep=0, n/st=100, rew=16666.00]                                                                             


Epoch #132: test_reward: 11084.800000 ± 4277.637147, best_reward: 19250.900000 ± 3867.904871 in #84


Epoch #133: 1001it [00:02, 402.69it/s, env_step=133000, gradient_step=13300, len=139, n/ep=1, n/st=100, rew=8069.00]                                                                              


Epoch #133: test_reward: 12509.200000 ± 4593.420246, best_reward: 19250.900000 ± 3867.904871 in #84


Epoch #134: 1001it [00:02, 341.09it/s, env_step=134000, gradient_step=13400, len=210, n/ep=1, n/st=100, rew=12473.00]                                                                             


Epoch #134: test_reward: 7957.800000 ± 4794.258792, best_reward: 19250.900000 ± 3867.904871 in #84


Epoch #135: 1001it [00:02, 354.50it/s, env_step=135000, gradient_step=13500, len=86, n/ep=0, n/st=100, rew=3957.50]                                                                               


Epoch #135: test_reward: 13541.100000 ± 3154.487358, best_reward: 19250.900000 ± 3867.904871 in #84


Epoch #136: 1001it [00:02, 353.64it/s, env_step=136000, gradient_step=13600, len=212, n/ep=0, n/st=100, rew=13242.00]                                                                             


Epoch #136: test_reward: 13849.000000 ± 6578.361392, best_reward: 19250.900000 ± 3867.904871 in #84


Epoch #137: 1001it [00:02, 430.56it/s, env_step=137000, gradient_step=13700, len=164, n/ep=0, n/st=100, rew=10321.50]                                                                             


Epoch #137: test_reward: 10510.200000 ± 3778.551913, best_reward: 19250.900000 ± 3867.904871 in #84


Epoch #138: 1001it [00:02, 390.82it/s, env_step=138000, gradient_step=13800, len=152, n/ep=1, n/st=100, rew=8748.00]                                                                              


Epoch #138: test_reward: 12736.500000 ± 6727.224469, best_reward: 19250.900000 ± 3867.904871 in #84


Epoch #139: 1001it [00:02, 383.53it/s, env_step=139000, gradient_step=13900, len=106, n/ep=0, n/st=100, rew=5197.00]                                                                              


Epoch #139: test_reward: 14767.200000 ± 4194.151471, best_reward: 19250.900000 ± 3867.904871 in #84


Epoch #140: 1001it [00:02, 385.10it/s, env_step=140000, gradient_step=14000, len=215, n/ep=0, n/st=100, rew=13312.50]                                                                             


Epoch #140: test_reward: 9611.000000 ± 4492.909614, best_reward: 19250.900000 ± 3867.904871 in #84


Epoch #141: 1001it [00:02, 338.91it/s, env_step=141000, gradient_step=14100, len=128, n/ep=2, n/st=100, rew=7189.00]                                                                              


Epoch #141: test_reward: 11017.500000 ± 3929.531403, best_reward: 19250.900000 ± 3867.904871 in #84


Epoch #142: 1001it [00:02, 393.34it/s, env_step=142000, gradient_step=14200, len=291, n/ep=1, n/st=100, rew=16346.50]                                                                             


Epoch #142: test_reward: 11964.500000 ± 4317.805722, best_reward: 19250.900000 ± 3867.904871 in #84


Epoch #143: 1001it [00:02, 394.96it/s, env_step=143000, gradient_step=14300, len=183, n/ep=1, n/st=100, rew=10574.50]                                                                             


Epoch #143: test_reward: 10729.800000 ± 2928.087628, best_reward: 19250.900000 ± 3867.904871 in #84


Epoch #144: 1001it [00:02, 432.34it/s, env_step=144000, gradient_step=14400, len=211, n/ep=2, n/st=100, rew=12542.25]                                                                             


Epoch #144: test_reward: 11293.400000 ± 2751.714818, best_reward: 19250.900000 ± 3867.904871 in #84


Epoch #145: 1001it [00:02, 427.53it/s, env_step=145000, gradient_step=14500, len=170, n/ep=0, n/st=100, rew=9696.50]                                                                              


Epoch #145: test_reward: 11589.700000 ± 3822.836749, best_reward: 19250.900000 ± 3867.904871 in #84


Epoch #146: 1001it [00:02, 396.43it/s, env_step=146000, gradient_step=14600, len=190, n/ep=1, n/st=100, rew=11193.50]                                                                             


Epoch #146: test_reward: 12522.800000 ± 4464.554016, best_reward: 19250.900000 ± 3867.904871 in #84


Epoch #147: 1001it [00:02, 414.86it/s, env_step=147000, gradient_step=14700, len=147, n/ep=0, n/st=100, rew=8985.00]                                                                              


Epoch #147: test_reward: 9755.300000 ± 3455.000522, best_reward: 19250.900000 ± 3867.904871 in #84


Epoch #148: 1001it [00:02, 346.27it/s, env_step=148000, gradient_step=14800, len=161, n/ep=2, n/st=100, rew=9080.75]                                                                              


Epoch #148: test_reward: 9064.100000 ± 6392.350216, best_reward: 19250.900000 ± 3867.904871 in #84


Epoch #149: 1001it [00:02, 369.86it/s, env_step=149000, gradient_step=14900, len=269, n/ep=0, n/st=100, rew=17281.00]                                                                             


Epoch #149: test_reward: 11198.400000 ± 6196.814572, best_reward: 19250.900000 ± 3867.904871 in #84


Epoch #150: 1001it [00:02, 411.85it/s, env_step=150000, gradient_step=15000, len=141, n/ep=0, n/st=100, rew=7680.50]                                                                              


Epoch #150: test_reward: 11507.900000 ± 3203.248615, best_reward: 19250.900000 ± 3867.904871 in #84

InfoStats(gradient_step=15000, best_reward=19250.9, best_reward_std=3867.90487085709, train_step=150000, train_episode=719, test_step=338943, test_episode=1510, timing=TimingStats(total_time=569.8102686405182, train_time=385.6999578475952, train_time_collect=52.36756229400635, train_time_update=327.4336929321289, test_time=184.11031079292297, update_speed=388.90333521703604))

(the trained policy can be accessed via policy.policies[agents[1]])
Train with agent_opponent #10


Epoch #1: 1001it [00:01, 519.20it/s, env_step=1000, gradient_step=100, len=0, n/ep=0, n/st=100, rew=0.00]                                                                                         


Epoch #1: test_reward: 14033.200000 ± 3951.446768, best_reward: 14033.200000 ± 3951.446768 in #1


Epoch #2: 1001it [00:02, 488.70it/s, env_step=2000, gradient_step=200, len=0, n/ep=0, n/st=100, rew=0.00]                                                                                         


Epoch #2: test_reward: 12209.300000 ± 4590.227446, best_reward: 14033.200000 ± 3951.446768 in #1


Epoch #3: 1001it [00:02, 485.29it/s, env_step=3000, gradient_step=300, len=0, n/ep=0, n/st=100, rew=0.00]                                                                                         


Epoch #3: test_reward: 14176.100000 ± 3688.201769, best_reward: 14176.100000 ± 3688.201769 in #3


Epoch #4: 1001it [00:02, 364.37it/s, env_step=4000, gradient_step=400, len=0, n/ep=0, n/st=100, rew=0.00]                                                                                         


Epoch #4: test_reward: 13231.200000 ± 5163.855474, best_reward: 14176.100000 ± 3688.201769 in #3


Epoch #5: 1001it [00:02, 472.22it/s, env_step=5000, gradient_step=500, len=0, n/ep=0, n/st=100, rew=0.00]                                                                                         


Epoch #5: test_reward: 12611.600000 ± 3547.655344, best_reward: 14176.100000 ± 3688.201769 in #3


Epoch #6: 1001it [00:02, 469.76it/s, env_step=6000, gradient_step=600, len=58, n/ep=0, n/st=100, rew=1954.00]                                                                                     


Epoch #6: test_reward: 13887.200000 ± 4270.740165, best_reward: 14176.100000 ± 3688.201769 in #3


Epoch #7: 1001it [00:02, 381.72it/s, env_step=7000, gradient_step=700, len=66, n/ep=0, n/st=100, rew=2171.00]                                                                                     


Epoch #7: test_reward: 11116.200000 ± 4475.703382, best_reward: 14176.100000 ± 3688.201769 in #3


Epoch #8: 1001it [00:02, 422.39it/s, env_step=8000, gradient_step=800, len=80, n/ep=2, n/st=100, rew=2783.50]                                                                                     


Epoch #8: test_reward: 11965.800000 ± 3944.408113, best_reward: 14176.100000 ± 3688.201769 in #3


Epoch #9: 1001it [00:02, 425.73it/s, env_step=9000, gradient_step=900, len=90, n/ep=1, n/st=100, rew=3603.00]                                                                                     


Epoch #9: test_reward: 18520.700000 ± 5770.846663, best_reward: 18520.700000 ± 5770.846663 in #9


Epoch #10: 1001it [00:02, 424.06it/s, env_step=10000, gradient_step=1000, len=100, n/ep=2, n/st=100, rew=3943.50]                                                                                 


Epoch #10: test_reward: 9892.100000 ± 5748.457575, best_reward: 18520.700000 ± 5770.846663 in #9


Epoch #11: 1001it [00:02, 444.67it/s, env_step=11000, gradient_step=1100, len=108, n/ep=0, n/st=100, rew=4822.00]                                                                                 


Epoch #11: test_reward: 13803.300000 ± 5688.039593, best_reward: 18520.700000 ± 5770.846663 in #9


Epoch #12: 1001it [00:02, 402.49it/s, env_step=12000, gradient_step=1200, len=114, n/ep=0, n/st=100, rew=4343.50]                                                                                 


Epoch #12: test_reward: 15391.700000 ± 3606.624407, best_reward: 18520.700000 ± 5770.846663 in #9


Epoch #13: 1001it [00:02, 408.80it/s, env_step=13000, gradient_step=1300, len=130, n/ep=2, n/st=100, rew=6043.75]                                                                                 


Epoch #13: test_reward: 13374.000000 ± 4556.956616, best_reward: 18520.700000 ± 5770.846663 in #9


Epoch #14: 1001it [00:02, 433.20it/s, env_step=14000, gradient_step=1400, len=138, n/ep=0, n/st=100, rew=5871.00]                                                                                 


Epoch #14: test_reward: 11572.600000 ± 5249.029990, best_reward: 18520.700000 ± 5770.846663 in #9


Epoch #15: 1001it [00:02, 444.38it/s, env_step=15000, gradient_step=1500, len=150, n/ep=1, n/st=100, rew=6075.00]                                                                                 


Epoch #15: test_reward: 14341.400000 ± 6635.368071, best_reward: 18520.700000 ± 5770.846663 in #9


Epoch #16: 1001it [00:02, 368.84it/s, env_step=16000, gradient_step=1600, len=158, n/ep=0, n/st=100, rew=6975.40]                                                                                 


Epoch #16: test_reward: 14435.200000 ± 4125.370815, best_reward: 18520.700000 ± 5770.846663 in #9


Epoch #17: 1001it [00:02, 401.69it/s, env_step=17000, gradient_step=1700, len=137, n/ep=0, n/st=100, rew=5497.25]                                                                                 


Epoch #17: test_reward: 15054.600000 ± 3808.716850, best_reward: 18520.700000 ± 5770.846663 in #9


Epoch #18: 1001it [00:02, 354.53it/s, env_step=18000, gradient_step=1800, len=176, n/ep=0, n/st=100, rew=8350.25]                                                                                 


Epoch #18: test_reward: 10584.600000 ± 3837.669923, best_reward: 18520.700000 ± 5770.846663 in #9


Epoch #19: 1001it [00:02, 427.80it/s, env_step=19000, gradient_step=1900, len=185, n/ep=0, n/st=100, rew=7756.00]                                                                                 


Epoch #19: test_reward: 11730.400000 ± 4181.493494, best_reward: 18520.700000 ± 5770.846663 in #9


Epoch #20: 1001it [00:02, 413.09it/s, env_step=20000, gradient_step=2000, len=180, n/ep=4, n/st=100, rew=8362.88]                                                                                 


Epoch #20: test_reward: 11076.200000 ± 4616.867808, best_reward: 18520.700000 ± 5770.846663 in #9


Epoch #21: 1001it [00:02, 413.79it/s, env_step=21000, gradient_step=2100, len=208, n/ep=0, n/st=100, rew=11306.00]                                                                                


Epoch #21: test_reward: 14028.200000 ± 5940.299551, best_reward: 18520.700000 ± 5770.846663 in #9


Epoch #22: 1001it [00:03, 328.76it/s, env_step=22000, gradient_step=2200, len=220, n/ep=1, n/st=100, rew=10083.00]                                                                                


Epoch #22: test_reward: 11154.000000 ± 2325.639869, best_reward: 18520.700000 ± 5770.846663 in #9


Epoch #23: 1001it [00:02, 412.82it/s, env_step=23000, gradient_step=2300, len=220, n/ep=0, n/st=100, rew=10083.00]                                                                                


Epoch #23: test_reward: 11907.600000 ± 5922.099479, best_reward: 18520.700000 ± 5770.846663 in #9


Epoch #24: 1001it [00:02, 412.68it/s, env_step=24000, gradient_step=2400, len=240, n/ep=1, n/st=100, rew=11537.00]                                                                                


Epoch #24: test_reward: 8883.700000 ± 4113.461585, best_reward: 18520.700000 ± 5770.846663 in #9


Epoch #25: 1001it [00:02, 431.27it/s, env_step=25000, gradient_step=2500, len=86, n/ep=0, n/st=100, rew=3003.00]                                                                                  


Epoch #25: test_reward: 17300.000000 ± 7036.484136, best_reward: 18520.700000 ± 5770.846663 in #9


Epoch #26: 1001it [00:02, 437.55it/s, env_step=26000, gradient_step=2600, len=156, n/ep=1, n/st=100, rew=6278.50]                                                                                 


Epoch #26: test_reward: 12371.200000 ± 4384.403284, best_reward: 18520.700000 ± 5770.846663 in #9


Epoch #27: 1001it [00:02, 384.09it/s, env_step=27000, gradient_step=2700, len=270, n/ep=1, n/st=100, rew=13080.00]                                                                                


Epoch #27: test_reward: 14502.400000 ± 5261.644652, best_reward: 18520.700000 ± 5770.846663 in #9


Epoch #28: 1001it [00:02, 361.73it/s, env_step=28000, gradient_step=2800, len=274, n/ep=0, n/st=100, rew=13597.00]                                                                                


Epoch #28: test_reward: 15347.700000 ± 6401.934896, best_reward: 18520.700000 ± 5770.846663 in #9


Epoch #29: 1001it [00:02, 424.65it/s, env_step=29000, gradient_step=2900, len=95, n/ep=0, n/st=100, rew=5083.50]                                                                                  


Epoch #29: test_reward: 14063.900000 ± 5529.517293, best_reward: 18520.700000 ± 5770.846663 in #9


Epoch #30: 1001it [00:02, 445.11it/s, env_step=30000, gradient_step=3000, len=174, n/ep=0, n/st=100, rew=9474.50]                                                                                 


Epoch #30: test_reward: 13184.900000 ± 5741.696343, best_reward: 18520.700000 ± 5770.846663 in #9


Epoch #31: 1001it [00:02, 339.09it/s, env_step=31000, gradient_step=3100, len=64, n/ep=0, n/st=100, rew=2022.00]                                                                                  


Epoch #31: test_reward: 12651.500000 ± 4221.188298, best_reward: 18520.700000 ± 5770.846663 in #9


Epoch #32: 1001it [00:02, 439.51it/s, env_step=32000, gradient_step=3200, len=318, n/ep=0, n/st=100, rew=16469.50]                                                                                


Epoch #32: test_reward: 15010.200000 ± 3144.925589, best_reward: 18520.700000 ± 5770.846663 in #9


Epoch #33: 1001it [00:02, 415.06it/s, env_step=33000, gradient_step=3300, len=330, n/ep=2, n/st=100, rew=18581.00]                                                                                


Epoch #33: test_reward: 14131.600000 ± 7204.694792, best_reward: 18520.700000 ± 5770.846663 in #9


Epoch #34: 1001it [00:02, 347.02it/s, env_step=34000, gradient_step=3400, len=201, n/ep=0, n/st=100, rew=10689.00]                                                                                


Epoch #34: test_reward: 15967.400000 ± 6143.587473, best_reward: 18520.700000 ± 5770.846663 in #9


Epoch #35: 1001it [00:02, 364.54it/s, env_step=35000, gradient_step=3500, len=342, n/ep=0, n/st=100, rew=19743.00]                                                                                


Epoch #35: test_reward: 11662.500000 ± 5133.350821, best_reward: 18520.700000 ± 5770.846663 in #9


Epoch #36: 1001it [00:02, 423.82it/s, env_step=36000, gradient_step=3600, len=181, n/ep=0, n/st=100, rew=8726.00]                                                                                 


Epoch #36: test_reward: 11218.400000 ± 5035.512272, best_reward: 18520.700000 ± 5770.846663 in #9


Epoch #37: 1001it [00:02, 369.55it/s, env_step=37000, gradient_step=3700, len=216, n/ep=0, n/st=100, rew=12864.00]                                                                                


Epoch #37: test_reward: 19090.800000 ± 7882.953987, best_reward: 19090.800000 ± 7882.953987 in #37


Epoch #38: 1001it [00:02, 370.14it/s, env_step=38000, gradient_step=3800, len=242, n/ep=1, n/st=100, rew=14946.00]                                                                                


Epoch #38: test_reward: 13578.300000 ± 4838.576693, best_reward: 19090.800000 ± 7882.953987 in #37


Epoch #39: 1001it [00:02, 383.32it/s, env_step=39000, gradient_step=3900, len=225, n/ep=2, n/st=100, rew=12378.50]                                                                                


Epoch #39: test_reward: 12181.000000 ± 3749.101226, best_reward: 19090.800000 ± 7882.953987 in #37


Epoch #40: 1001it [00:02, 363.46it/s, env_step=40000, gradient_step=4000, len=400, n/ep=4, n/st=100, rew=22336.00]                                                                                


Epoch #40: test_reward: 9307.800000 ± 3353.943047, best_reward: 19090.800000 ± 7882.953987 in #37


Epoch #41: 1001it [00:02, 368.37it/s, env_step=41000, gradient_step=4100, len=326, n/ep=1, n/st=100, rew=18824.50]                                                                                


Epoch #41: test_reward: 13642.000000 ± 5575.241053, best_reward: 19090.800000 ± 7882.953987 in #37


Epoch #42: 1001it [00:02, 380.74it/s, env_step=42000, gradient_step=4200, len=89, n/ep=0, n/st=100, rew=3006.50]                                                                                  


Epoch #42: test_reward: 17765.800000 ± 6756.459514, best_reward: 19090.800000 ± 7882.953987 in #37


Epoch #43: 1001it [00:03, 330.19it/s, env_step=43000, gradient_step=4300, len=142, n/ep=0, n/st=100, rew=7733.50]                                                                                 


Epoch #43: test_reward: 13323.100000 ± 5927.461724, best_reward: 19090.800000 ± 7882.953987 in #37


Epoch #44: 1001it [00:02, 364.33it/s, env_step=44000, gradient_step=4400, len=268, n/ep=2, n/st=100, rew=14934.75]                                                                                


Epoch #44: test_reward: 13035.300000 ± 4981.522700, best_reward: 19090.800000 ± 7882.953987 in #37


Epoch #45: 1001it [00:02, 407.37it/s, env_step=45000, gradient_step=4500, len=93, n/ep=1, n/st=100, rew=4370.00]                                                                                  


Epoch #45: test_reward: 9548.400000 ± 4712.563022, best_reward: 19090.800000 ± 7882.953987 in #37


Epoch #46: 1001it [00:02, 365.14it/s, env_step=46000, gradient_step=4600, len=342, n/ep=0, n/st=100, rew=21210.50]                                                                                


Epoch #46: test_reward: 15097.300000 ± 4957.860427, best_reward: 19090.800000 ± 7882.953987 in #37


Epoch #47: 1001it [00:02, 437.56it/s, env_step=47000, gradient_step=4700, len=221, n/ep=0, n/st=100, rew=12301.00]                                                                                


Epoch #47: test_reward: 11355.400000 ± 4918.597446, best_reward: 19090.800000 ± 7882.953987 in #37


Epoch #48: 1001it [00:02, 439.84it/s, env_step=48000, gradient_step=4800, len=264, n/ep=0, n/st=100, rew=15101.00]                                                                                


Epoch #48: test_reward: 13254.000000 ± 4071.733292, best_reward: 19090.800000 ± 7882.953987 in #37


Epoch #49: 1001it [00:02, 384.58it/s, env_step=49000, gradient_step=4900, len=324, n/ep=0, n/st=100, rew=19613.50]                                                                                


Epoch #49: test_reward: 10429.300000 ± 4790.648538, best_reward: 19090.800000 ± 7882.953987 in #37


Epoch #50: 1001it [00:02, 425.95it/s, env_step=50000, gradient_step=5000, len=200, n/ep=0, n/st=100, rew=10568.50]                                                                                


Epoch #50: test_reward: 12563.700000 ± 3783.961312, best_reward: 19090.800000 ± 7882.953987 in #37


Epoch #51: 1001it [00:03, 330.72it/s, env_step=51000, gradient_step=5100, len=215, n/ep=1, n/st=100, rew=12570.00]                                                                                


Epoch #51: test_reward: 13579.400000 ± 4424.421142, best_reward: 19090.800000 ± 7882.953987 in #37


Epoch #52: 1001it [00:02, 366.56it/s, env_step=52000, gradient_step=5200, len=212, n/ep=1, n/st=100, rew=12693.00]                                                                                


Epoch #52: test_reward: 9404.900000 ± 3691.783809, best_reward: 19090.800000 ± 7882.953987 in #37


Epoch #53: 1001it [00:03, 322.43it/s, env_step=53000, gradient_step=5300, len=147, n/ep=0, n/st=100, rew=7814.00]                                                                                 


Epoch #53: test_reward: 10710.000000 ± 6360.938484, best_reward: 19090.800000 ± 7882.953987 in #37


Epoch #54: 1001it [00:02, 358.18it/s, env_step=54000, gradient_step=5400, len=130, n/ep=1, n/st=100, rew=6887.50]                                                                                 


Epoch #54: test_reward: 9753.000000 ± 3414.109928, best_reward: 19090.800000 ± 7882.953987 in #37


Epoch #55: 1001it [00:02, 378.43it/s, env_step=55000, gradient_step=5500, len=245, n/ep=0, n/st=100, rew=13339.50]                                                                                


Epoch #55: test_reward: 9793.200000 ± 6449.470936, best_reward: 19090.800000 ± 7882.953987 in #37


Epoch #56: 1001it [00:02, 417.11it/s, env_step=56000, gradient_step=5600, len=188, n/ep=0, n/st=100, rew=11065.00]                                                                                


Epoch #56: test_reward: 10045.000000 ± 4071.653841, best_reward: 19090.800000 ± 7882.953987 in #37


Epoch #57: 1001it [00:02, 353.37it/s, env_step=57000, gradient_step=5700, len=174, n/ep=0, n/st=100, rew=9665.00]                                                                                 


Epoch #57: test_reward: 12965.800000 ± 5223.650406, best_reward: 19090.800000 ± 7882.953987 in #37


Epoch #58: 1001it [00:02, 384.77it/s, env_step=58000, gradient_step=5800, len=161, n/ep=3, n/st=100, rew=9094.17]                                                                                 


Epoch #58: test_reward: 12588.700000 ± 3192.217413, best_reward: 19090.800000 ± 7882.953987 in #37


Epoch #59: 1001it [00:02, 402.15it/s, env_step=59000, gradient_step=5900, len=149, n/ep=0, n/st=100, rew=7943.75]                                                                                 


Epoch #59: test_reward: 11053.600000 ± 3678.702222, best_reward: 19090.800000 ± 7882.953987 in #37


Epoch #60: 1001it [00:02, 340.06it/s, env_step=60000, gradient_step=6000, len=224, n/ep=1, n/st=100, rew=13337.00]                                                                                


Epoch #60: test_reward: 8396.600000 ± 2007.561267, best_reward: 19090.800000 ± 7882.953987 in #37


Epoch #61: 1001it [00:03, 330.53it/s, env_step=61000, gradient_step=6100, len=191, n/ep=0, n/st=100, rew=9842.00]                                                                                 


Epoch #61: test_reward: 11923.200000 ± 5566.065231, best_reward: 19090.800000 ± 7882.953987 in #37


Epoch #62: 1001it [00:02, 361.90it/s, env_step=62000, gradient_step=6200, len=283, n/ep=1, n/st=100, rew=16568.50]                                                                                


Epoch #62: test_reward: 8234.000000 ± 3424.523091, best_reward: 19090.800000 ± 7882.953987 in #37


Epoch #63: 1001it [00:02, 405.35it/s, env_step=63000, gradient_step=6300, len=157, n/ep=0, n/st=100, rew=8646.00]                                                                                 


Epoch #63: test_reward: 8116.100000 ± 3195.011313, best_reward: 19090.800000 ± 7882.953987 in #37


Epoch #64: 1001it [00:02, 409.03it/s, env_step=64000, gradient_step=6400, len=207, n/ep=0, n/st=100, rew=11536.00]                                                                                


Epoch #64: test_reward: 10785.600000 ± 3623.177340, best_reward: 19090.800000 ± 7882.953987 in #37


Epoch #65: 1001it [00:02, 357.38it/s, env_step=65000, gradient_step=6500, len=87, n/ep=0, n/st=100, rew=2834.50]                                                                                  


Epoch #65: test_reward: 9935.100000 ± 4657.684306, best_reward: 19090.800000 ± 7882.953987 in #37


Epoch #66: 1001it [00:02, 370.87it/s, env_step=66000, gradient_step=6600, len=233, n/ep=1, n/st=100, rew=12851.00]                                                                                


Epoch #66: test_reward: 13223.000000 ± 5470.957521, best_reward: 19090.800000 ± 7882.953987 in #37


Epoch #67: 1001it [00:02, 434.37it/s, env_step=67000, gradient_step=6700, len=70, n/ep=1, n/st=100, rew=3227.00]                                                                                  


Epoch #67: test_reward: 7500.600000 ± 3837.135187, best_reward: 19090.800000 ± 7882.953987 in #37


Epoch #68: 1001it [00:02, 353.90it/s, env_step=68000, gradient_step=6800, len=169, n/ep=1, n/st=100, rew=9026.50]                                                                                 


Epoch #68: test_reward: 8279.100000 ± 2557.309776, best_reward: 19090.800000 ± 7882.953987 in #37


Epoch #69: 1001it [00:03, 321.56it/s, env_step=69000, gradient_step=6900, len=223, n/ep=0, n/st=100, rew=12399.00]                                                                                


Epoch #69: test_reward: 8798.700000 ± 3817.190434, best_reward: 19090.800000 ± 7882.953987 in #37


Epoch #70: 1001it [00:02, 410.52it/s, env_step=70000, gradient_step=7000, len=78, n/ep=1, n/st=100, rew=3118.00]                                                                                  


Epoch #70: test_reward: 13516.100000 ± 5756.604111, best_reward: 19090.800000 ± 7882.953987 in #37


Epoch #71: 1001it [00:02, 375.16it/s, env_step=71000, gradient_step=7100, len=159, n/ep=0, n/st=100, rew=8264.50]                                                                                 


Epoch #71: test_reward: 13114.100000 ± 6237.476532, best_reward: 19090.800000 ± 7882.953987 in #37


Epoch #72: 1001it [00:02, 404.00it/s, env_step=72000, gradient_step=7200, len=205, n/ep=0, n/st=100, rew=12592.50]                                                                                


Epoch #72: test_reward: 15865.300000 ± 5193.893146, best_reward: 19090.800000 ± 7882.953987 in #37


Epoch #73: 1001it [00:02, 416.11it/s, env_step=73000, gradient_step=7300, len=193, n/ep=0, n/st=100, rew=10933.00]                                                                                


Epoch #73: test_reward: 13666.000000 ± 5662.207926, best_reward: 19090.800000 ± 7882.953987 in #37


Epoch #74: 1001it [00:02, 436.50it/s, env_step=74000, gradient_step=7400, len=209, n/ep=1, n/st=100, rew=11191.00]                                                                                


Epoch #74: test_reward: 8027.200000 ± 5422.558009, best_reward: 19090.800000 ± 7882.953987 in #37


Epoch #75: 1001it [00:02, 369.26it/s, env_step=75000, gradient_step=7500, len=233, n/ep=1, n/st=100, rew=13913.00]                                                                                


Epoch #75: test_reward: 10204.800000 ± 5192.931269, best_reward: 19090.800000 ± 7882.953987 in #37


Epoch #76: 1001it [00:02, 379.43it/s, env_step=76000, gradient_step=7600, len=172, n/ep=0, n/st=100, rew=9318.00]                                                                                 


Epoch #76: test_reward: 10310.600000 ± 3727.487202, best_reward: 19090.800000 ± 7882.953987 in #37


Epoch #77: 1001it [00:02, 388.86it/s, env_step=77000, gradient_step=7700, len=191, n/ep=1, n/st=100, rew=9172.50]                                                                                 


Epoch #77: test_reward: 12458.200000 ± 5164.768742, best_reward: 19090.800000 ± 7882.953987 in #37


Epoch #78: 1001it [00:02, 335.61it/s, env_step=78000, gradient_step=7800, len=183, n/ep=2, n/st=100, rew=9572.50]                                                                                 


Epoch #78: test_reward: 3703.600000 ± 2224.028381, best_reward: 19090.800000 ± 7882.953987 in #37


Epoch #79: 1001it [00:02, 407.05it/s, env_step=79000, gradient_step=7900, len=202, n/ep=1, n/st=100, rew=11308.00]                                                                                


Epoch #79: test_reward: 9964.400000 ± 2702.367673, best_reward: 19090.800000 ± 7882.953987 in #37


Epoch #80: 1001it [00:02, 387.64it/s, env_step=80000, gradient_step=8000, len=355, n/ep=0, n/st=100, rew=20502.00]                                                                                


Epoch #80: test_reward: 11506.200000 ± 4137.133810, best_reward: 19090.800000 ± 7882.953987 in #37


Epoch #81: 1001it [00:02, 345.19it/s, env_step=81000, gradient_step=8100, len=215, n/ep=0, n/st=100, rew=12033.00]                                                                                


Epoch #81: test_reward: 8717.900000 ± 5981.239277, best_reward: 19090.800000 ± 7882.953987 in #37


Epoch #82: 1001it [00:02, 366.65it/s, env_step=82000, gradient_step=8200, len=289, n/ep=1, n/st=100, rew=17153.00]                                                                                


Epoch #82: test_reward: 12344.200000 ± 6319.287203, best_reward: 19090.800000 ± 7882.953987 in #37


Epoch #83: 1001it [00:02, 407.40it/s, env_step=83000, gradient_step=8300, len=285, n/ep=0, n/st=100, rew=16260.00]                                                                                


Epoch #83: test_reward: 6997.100000 ± 3157.006033, best_reward: 19090.800000 ± 7882.953987 in #37


Epoch #84: 1001it [00:02, 334.16it/s, env_step=84000, gradient_step=8400, len=282, n/ep=1, n/st=100, rew=15492.00]                                                                                


Epoch #84: test_reward: 10193.600000 ± 5119.288216, best_reward: 19090.800000 ± 7882.953987 in #37


Epoch #85: 1001it [00:02, 428.29it/s, env_step=85000, gradient_step=8500, len=275, n/ep=0, n/st=100, rew=16708.50]                                                                                


Epoch #85: test_reward: 9365.000000 ± 3032.767911, best_reward: 19090.800000 ± 7882.953987 in #37


Epoch #86: 1001it [00:02, 350.91it/s, env_step=86000, gradient_step=8600, len=244, n/ep=1, n/st=100, rew=14695.00]                                                                                


Epoch #86: test_reward: 15623.300000 ± 7504.676623, best_reward: 19090.800000 ± 7882.953987 in #37


Epoch #87: 1001it [00:02, 399.69it/s, env_step=87000, gradient_step=8700, len=279, n/ep=0, n/st=100, rew=15671.00]                                                                                


Epoch #87: test_reward: 11423.000000 ± 2088.618970, best_reward: 19090.800000 ± 7882.953987 in #37


Epoch #88: 1001it [00:03, 313.37it/s, env_step=88000, gradient_step=8800, len=175, n/ep=2, n/st=100, rew=9486.50]                                                                                 


Epoch #88: test_reward: 11709.900000 ± 4145.211803, best_reward: 19090.800000 ± 7882.953987 in #37


Epoch #89: 1001it [00:02, 396.43it/s, env_step=89000, gradient_step=8900, len=212, n/ep=0, n/st=100, rew=12187.50]                                                                                


Epoch #89: test_reward: 8380.300000 ± 4652.891532, best_reward: 19090.800000 ± 7882.953987 in #37


Epoch #90: 1001it [00:03, 323.56it/s, env_step=90000, gradient_step=9000, len=124, n/ep=0, n/st=100, rew=6109.00]                                                                                 


Epoch #90: test_reward: 13690.800000 ± 4535.980441, best_reward: 19090.800000 ± 7882.953987 in #37


Epoch #91: 1001it [00:02, 383.47it/s, env_step=91000, gradient_step=9100, len=177, n/ep=0, n/st=100, rew=10326.50]                                                                                


Epoch #91: test_reward: 12247.800000 ± 3886.640009, best_reward: 19090.800000 ± 7882.953987 in #37


Epoch #92: 1001it [00:02, 416.78it/s, env_step=92000, gradient_step=9200, len=229, n/ep=3, n/st=100, rew=12755.17]                                                                                


Epoch #92: test_reward: 14945.900000 ± 4861.062445, best_reward: 19090.800000 ± 7882.953987 in #37


Epoch #93: 1001it [00:02, 401.42it/s, env_step=93000, gradient_step=9300, len=80, n/ep=0, n/st=100, rew=4183.50]                                                                                  


Epoch #93: test_reward: 5513.600000 ± 3071.600729, best_reward: 19090.800000 ± 7882.953987 in #37


Epoch #94: 1001it [00:02, 398.44it/s, env_step=94000, gradient_step=9400, len=168, n/ep=1, n/st=100, rew=10331.00]                                                                                


Epoch #94: test_reward: 6749.000000 ± 4079.281456, best_reward: 19090.800000 ± 7882.953987 in #37


Epoch #95: 1001it [00:02, 409.10it/s, env_step=95000, gradient_step=9500, len=160, n/ep=2, n/st=100, rew=8483.75]                                                                                 


Epoch #95: test_reward: 8016.900000 ± 5642.030830, best_reward: 19090.800000 ± 7882.953987 in #37


Epoch #96: 1001it [00:02, 424.35it/s, env_step=96000, gradient_step=9600, len=187, n/ep=1, n/st=100, rew=9928.50]                                                                                 


Epoch #96: test_reward: 8367.500000 ± 4969.060178, best_reward: 19090.800000 ± 7882.953987 in #37


Epoch #97: 1001it [00:02, 351.16it/s, env_step=97000, gradient_step=9700, len=278, n/ep=0, n/st=100, rew=17465.00]                                                                                


Epoch #97: test_reward: 11724.400000 ± 4018.398268, best_reward: 19090.800000 ± 7882.953987 in #37


Epoch #98: 1001it [00:03, 319.88it/s, env_step=98000, gradient_step=9800, len=194, n/ep=1, n/st=100, rew=11448.50]                                                                                


Epoch #98: test_reward: 9671.200000 ± 4286.119172, best_reward: 19090.800000 ± 7882.953987 in #37


Epoch #99: 1001it [00:02, 388.08it/s, env_step=99000, gradient_step=9900, len=176, n/ep=0, n/st=100, rew=10193.25]                                                                                


Epoch #99: test_reward: 9044.000000 ± 5640.154537, best_reward: 19090.800000 ± 7882.953987 in #37


Epoch #100: 1001it [00:02, 346.65it/s, env_step=100000, gradient_step=10000, len=86, n/ep=0, n/st=100, rew=4439.00]                                                                               


Epoch #100: test_reward: 9067.300000 ± 4978.620011, best_reward: 19090.800000 ± 7882.953987 in #37


Epoch #101: 1001it [00:02, 412.06it/s, env_step=101000, gradient_step=10100, len=110, n/ep=0, n/st=100, rew=5574.50]                                                                              


Epoch #101: test_reward: 10915.500000 ± 4693.478609, best_reward: 19090.800000 ± 7882.953987 in #37


Epoch #102: 1001it [00:02, 403.20it/s, env_step=102000, gradient_step=10200, len=233, n/ep=1, n/st=100, rew=13073.50]                                                                             


Epoch #102: test_reward: 14518.200000 ± 3844.861735, best_reward: 19090.800000 ± 7882.953987 in #37


Epoch #103: 1001it [00:02, 406.95it/s, env_step=103000, gradient_step=10300, len=150, n/ep=1, n/st=100, rew=8272.00]                                                                              


Epoch #103: test_reward: 7073.200000 ± 4661.769038, best_reward: 19090.800000 ± 7882.953987 in #37


Epoch #104: 1001it [00:02, 339.54it/s, env_step=104000, gradient_step=10400, len=118, n/ep=0, n/st=100, rew=6241.00]                                                                              


Epoch #104: test_reward: 6595.600000 ± 3819.074841, best_reward: 19090.800000 ± 7882.953987 in #37


Epoch #105: 1001it [00:02, 385.37it/s, env_step=105000, gradient_step=10500, len=189, n/ep=0, n/st=100, rew=9568.00]                                                                              


Epoch #105: test_reward: 9728.200000 ± 4136.692055, best_reward: 19090.800000 ± 7882.953987 in #37


Epoch #106: 1001it [00:02, 352.10it/s, env_step=106000, gradient_step=10600, len=253, n/ep=0, n/st=100, rew=15720.50]                                                                             


Epoch #106: test_reward: 8316.400000 ± 4411.949483, best_reward: 19090.800000 ± 7882.953987 in #37


Epoch #107: 1001it [00:02, 371.68it/s, env_step=107000, gradient_step=10700, len=131, n/ep=1, n/st=100, rew=7495.00]                                                                              


Epoch #107: test_reward: 11043.900000 ± 3935.225647, best_reward: 19090.800000 ± 7882.953987 in #37


Epoch #108: 1001it [00:02, 351.54it/s, env_step=108000, gradient_step=10800, len=359, n/ep=0, n/st=100, rew=21731.00]                                                                             


Epoch #108: test_reward: 10422.500000 ± 5511.994725, best_reward: 19090.800000 ± 7882.953987 in #37


Epoch #109: 1001it [00:03, 330.72it/s, env_step=109000, gradient_step=10900, len=36, n/ep=1, n/st=100, rew=1219.00]                                                                               


Epoch #109: test_reward: 6647.900000 ± 5497.058276, best_reward: 19090.800000 ± 7882.953987 in #37


Epoch #110: 1001it [00:02, 390.41it/s, env_step=110000, gradient_step=11000, len=105, n/ep=0, n/st=100, rew=5312.00]                                                                              


Epoch #110: test_reward: 7431.600000 ± 3397.104037, best_reward: 19090.800000 ± 7882.953987 in #37


Epoch #111: 1001it [00:03, 321.93it/s, env_step=111000, gradient_step=11100, len=214, n/ep=0, n/st=100, rew=11983.88]                                                                             


Epoch #111: test_reward: 4465.600000 ± 2657.427297, best_reward: 19090.800000 ± 7882.953987 in #37


Epoch #112: 1001it [00:02, 369.34it/s, env_step=112000, gradient_step=11200, len=149, n/ep=2, n/st=100, rew=8560.75]                                                                              


Epoch #112: test_reward: 7614.400000 ± 4866.531068, best_reward: 19090.800000 ± 7882.953987 in #37


Epoch #113: 1001it [00:02, 361.04it/s, env_step=113000, gradient_step=11300, len=215, n/ep=1, n/st=100, rew=11754.00]                                                                             


Epoch #113: test_reward: 10398.200000 ± 5266.155748, best_reward: 19090.800000 ± 7882.953987 in #37


Epoch #114: 1001it [00:03, 298.15it/s, env_step=114000, gradient_step=11400, len=133, n/ep=1, n/st=100, rew=7618.00]                                                                              


Epoch #114: test_reward: 8890.400000 ± 4210.761765, best_reward: 19090.800000 ± 7882.953987 in #37


Epoch #115: 1001it [00:03, 319.78it/s, env_step=115000, gradient_step=11500, len=253, n/ep=1, n/st=100, rew=14955.00]                                                                             


Epoch #115: test_reward: 9436.500000 ± 2274.060388, best_reward: 19090.800000 ± 7882.953987 in #37


Epoch #116: 1001it [00:02, 337.17it/s, env_step=116000, gradient_step=11600, len=186, n/ep=1, n/st=100, rew=11516.00]                                                                             


Epoch #116: test_reward: 13975.400000 ± 6782.818075, best_reward: 19090.800000 ± 7882.953987 in #37


Epoch #117: 1001it [00:02, 403.13it/s, env_step=117000, gradient_step=11700, len=82, n/ep=0, n/st=100, rew=4269.00]                                                                               


Epoch #117: test_reward: 10350.900000 ± 5041.827019, best_reward: 19090.800000 ± 7882.953987 in #37


Epoch #118: 1001it [00:03, 332.69it/s, env_step=118000, gradient_step=11800, len=301, n/ep=2, n/st=100, rew=18742.50]                                                                             


Epoch #118: test_reward: 7633.100000 ± 2043.349723, best_reward: 19090.800000 ± 7882.953987 in #37


Epoch #119: 1001it [00:02, 377.94it/s, env_step=119000, gradient_step=11900, len=114, n/ep=2, n/st=100, rew=5855.25]                                                                              


Epoch #119: test_reward: 9719.300000 ± 7523.598687, best_reward: 19090.800000 ± 7882.953987 in #37


Epoch #120: 1001it [00:03, 323.91it/s, env_step=120000, gradient_step=12000, len=155, n/ep=1, n/st=100, rew=8191.00]                                                                              


Epoch #120: test_reward: 8697.200000 ± 4143.230570, best_reward: 19090.800000 ± 7882.953987 in #37


Epoch #121: 1001it [00:02, 360.56it/s, env_step=121000, gradient_step=12100, len=195, n/ep=0, n/st=100, rew=9589.00]                                                                              


Epoch #121: test_reward: 9071.400000 ± 4885.599415, best_reward: 19090.800000 ± 7882.953987 in #37


Epoch #122: 1001it [00:03, 319.15it/s, env_step=122000, gradient_step=12200, len=115, n/ep=0, n/st=100, rew=5491.25]                                                                              


Epoch #122: test_reward: 8752.200000 ± 3313.169141, best_reward: 19090.800000 ± 7882.953987 in #37


Epoch #123: 1001it [00:02, 339.99it/s, env_step=123000, gradient_step=12300, len=110, n/ep=1, n/st=100, rew=5321.00]                                                                              


Epoch #123: test_reward: 11763.100000 ± 4669.424749, best_reward: 19090.800000 ± 7882.953987 in #37


Epoch #124: 1001it [00:03, 325.54it/s, env_step=124000, gradient_step=12400, len=131, n/ep=1, n/st=100, rew=7139.00]                                                                              


Epoch #124: test_reward: 10000.400000 ± 4851.847632, best_reward: 19090.800000 ± 7882.953987 in #37


Epoch #125: 1001it [00:03, 321.69it/s, env_step=125000, gradient_step=12500, len=185, n/ep=1, n/st=100, rew=10251.00]                                                                             


Epoch #125: test_reward: 10566.200000 ± 3657.615037, best_reward: 19090.800000 ± 7882.953987 in #37


Epoch #126: 1001it [00:02, 394.86it/s, env_step=126000, gradient_step=12600, len=160, n/ep=0, n/st=100, rew=9076.75]                                                                              


Epoch #126: test_reward: 11917.800000 ± 5356.890531, best_reward: 19090.800000 ± 7882.953987 in #37


Epoch #127: 1001it [00:03, 326.14it/s, env_step=127000, gradient_step=12700, len=286, n/ep=0, n/st=100, rew=18765.50]                                                                             


Epoch #127: test_reward: 10280.700000 ± 2445.985243, best_reward: 19090.800000 ± 7882.953987 in #37


Epoch #128: 1001it [00:03, 306.60it/s, env_step=128000, gradient_step=12800, len=169, n/ep=0, n/st=100, rew=9932.50]                                                                              


Epoch #128: test_reward: 6527.300000 ± 3522.147755, best_reward: 19090.800000 ± 7882.953987 in #37


Epoch #129: 1001it [00:03, 308.42it/s, env_step=129000, gradient_step=12900, len=118, n/ep=0, n/st=100, rew=5843.00]                                                                              


Epoch #129: test_reward: 14374.200000 ± 5535.517063, best_reward: 19090.800000 ± 7882.953987 in #37


Epoch #130: 1001it [00:03, 328.39it/s, env_step=130000, gradient_step=13000, len=205, n/ep=0, n/st=100, rew=11897.00]                                                                             


Epoch #130: test_reward: 13772.700000 ± 3169.999812, best_reward: 19090.800000 ± 7882.953987 in #37


Epoch #131: 1001it [00:02, 334.01it/s, env_step=131000, gradient_step=13100, len=151, n/ep=0, n/st=100, rew=8564.50]                                                                              


Epoch #131: test_reward: 8577.000000 ± 5418.623903, best_reward: 19090.800000 ± 7882.953987 in #37


Epoch #132: 1001it [00:03, 302.33it/s, env_step=132000, gradient_step=13200, len=219, n/ep=1, n/st=100, rew=13574.00]                                                                             


Epoch #132: test_reward: 10495.000000 ± 4050.705420, best_reward: 19090.800000 ± 7882.953987 in #37


Epoch #133: 1001it [00:02, 365.57it/s, env_step=133000, gradient_step=13300, len=79, n/ep=0, n/st=100, rew=3745.00]                                                                               


Epoch #133: test_reward: 12096.700000 ± 4811.608963, best_reward: 19090.800000 ± 7882.953987 in #37


Epoch #134: 1001it [00:03, 331.33it/s, env_step=134000, gradient_step=13400, len=217, n/ep=3, n/st=100, rew=11853.00]                                                                             


Epoch #134: test_reward: 10089.900000 ± 3631.751876, best_reward: 19090.800000 ± 7882.953987 in #37


Epoch #135: 1001it [00:02, 373.51it/s, env_step=135000, gradient_step=13500, len=276, n/ep=2, n/st=100, rew=17329.75]                                                                             


Epoch #135: test_reward: 13796.600000 ± 5104.888485, best_reward: 19090.800000 ± 7882.953987 in #37


Epoch #136: 1001it [00:02, 339.07it/s, env_step=136000, gradient_step=13600, len=167, n/ep=0, n/st=100, rew=9547.00]                                                                              


Epoch #136: test_reward: 10949.700000 ± 4416.611825, best_reward: 19090.800000 ± 7882.953987 in #37


Epoch #137: 1001it [00:02, 354.32it/s, env_step=137000, gradient_step=13700, len=260, n/ep=0, n/st=100, rew=15292.00]                                                                             


Epoch #137: test_reward: 9513.000000 ± 5391.716536, best_reward: 19090.800000 ± 7882.953987 in #37


Epoch #138: 1001it [00:02, 344.41it/s, env_step=138000, gradient_step=13800, len=335, n/ep=0, n/st=100, rew=20961.50]                                                                             


Epoch #138: test_reward: 13566.000000 ± 6678.627614, best_reward: 19090.800000 ± 7882.953987 in #37


Epoch #139: 1001it [00:02, 404.26it/s, env_step=139000, gradient_step=13900, len=192, n/ep=0, n/st=100, rew=11219.00]                                                                             


Epoch #139: test_reward: 9527.800000 ± 2971.343460, best_reward: 19090.800000 ± 7882.953987 in #37


Epoch #140: 1001it [00:02, 346.06it/s, env_step=140000, gradient_step=14000, len=180, n/ep=0, n/st=100, rew=10404.00]                                                                             


Epoch #140: test_reward: 16401.300000 ± 4503.731565, best_reward: 19090.800000 ± 7882.953987 in #37


Epoch #141: 1001it [00:02, 349.33it/s, env_step=141000, gradient_step=14100, len=139, n/ep=1, n/st=100, rew=7329.50]                                                                              


Epoch #141: test_reward: 10549.300000 ± 5590.850312, best_reward: 19090.800000 ± 7882.953987 in #37


Epoch #142: 1001it [00:02, 381.29it/s, env_step=142000, gradient_step=14200, len=120, n/ep=2, n/st=100, rew=6455.00]                                                                              


Epoch #142: test_reward: 10160.700000 ± 5948.730269, best_reward: 19090.800000 ± 7882.953987 in #37


Epoch #143: 1001it [00:02, 341.85it/s, env_step=143000, gradient_step=14300, len=120, n/ep=0, n/st=100, rew=6066.00]                                                                              


Epoch #143: test_reward: 6768.200000 ± 3171.425982, best_reward: 19090.800000 ± 7882.953987 in #37


Epoch #144: 1001it [00:02, 356.81it/s, env_step=144000, gradient_step=14400, len=96, n/ep=0, n/st=100, rew=5001.00]                                                                               


Epoch #144: test_reward: 11904.000000 ± 2074.774590, best_reward: 19090.800000 ± 7882.953987 in #37


Epoch #145: 1001it [00:03, 306.03it/s, env_step=145000, gradient_step=14500, len=116, n/ep=0, n/st=100, rew=6582.00]                                                                              


Epoch #145: test_reward: 9853.600000 ± 8185.806817, best_reward: 19090.800000 ± 7882.953987 in #37


Epoch #146: 1001it [00:02, 333.91it/s, env_step=146000, gradient_step=14600, len=206, n/ep=1, n/st=100, rew=12044.00]                                                                             


Epoch #146: test_reward: 10463.200000 ± 3793.084571, best_reward: 19090.800000 ± 7882.953987 in #37


Epoch #147: 1001it [00:02, 362.93it/s, env_step=147000, gradient_step=14700, len=48, n/ep=0, n/st=100, rew=1863.00]                                                                               


Epoch #147: test_reward: 16333.400000 ± 7734.970630, best_reward: 19090.800000 ± 7882.953987 in #37


Epoch #148: 1001it [00:02, 358.37it/s, env_step=148000, gradient_step=14800, len=206, n/ep=2, n/st=100, rew=12016.50]                                                                             


Epoch #148: test_reward: 8820.000000 ± 4840.906795, best_reward: 19090.800000 ± 7882.953987 in #37


Epoch #149: 1001it [00:02, 340.97it/s, env_step=149000, gradient_step=14900, len=188, n/ep=1, n/st=100, rew=11305.00]                                                                             


Epoch #149: test_reward: 9096.000000 ± 3695.104329, best_reward: 19090.800000 ± 7882.953987 in #37


Epoch #150: 1001it [00:03, 319.12it/s, env_step=150000, gradient_step=15000, len=154, n/ep=0, n/st=100, rew=7118.00]                                                                              


Epoch #150: test_reward: 12783.000000 ± 5127.839350, best_reward: 19090.800000 ± 7882.953987 in #37

InfoStats(gradient_step=15000, best_reward=19090.8, best_reward_std=7882.953986926474, train_step=150000, train_episode=755, test_step=346365, test_episode=1510, timing=TimingStats(total_time=594.7798187732697, train_time=404.3152554035187, train_time_collect=52.662532567977905, train_time_update=345.5365779399872, test_time=190.46456336975098, update_speed=370.99762622188354))

(the trained policy can be accessed via policy.policies[agents[1]])
Train with agent_opponent #2


Epoch #1: 1001it [00:02, 478.69it/s, env_step=1000, gradient_step=100, len=0, n/ep=0, n/st=100, rew=0.00]                                                                                         


Epoch #1: test_reward: 11266.200000 ± 7100.526090, best_reward: 11266.200000 ± 7100.526090 in #1


Epoch #2: 1001it [00:02, 444.83it/s, env_step=2000, gradient_step=200, len=0, n/ep=0, n/st=100, rew=0.00]                                                                                         


Epoch #2: test_reward: 5886.600000 ± 7145.061178, best_reward: 11266.200000 ± 7100.526090 in #1


Epoch #3: 1001it [00:02, 462.79it/s, env_step=3000, gradient_step=300, len=0, n/ep=0, n/st=100, rew=0.00]                                                                                         


Epoch #3: test_reward: 12034.800000 ± 6166.685930, best_reward: 12034.800000 ± 6166.685930 in #3


Epoch #4: 1001it [00:02, 469.23it/s, env_step=4000, gradient_step=400, len=0, n/ep=0, n/st=100, rew=0.00]                                                                                         


Epoch #4: test_reward: 11244.500000 ± 2987.423346, best_reward: 12034.800000 ± 6166.685930 in #3


Epoch #5: 1001it [00:02, 456.82it/s, env_step=5000, gradient_step=500, len=0, n/ep=0, n/st=100, rew=0.00]                                                                                         


Epoch #5: test_reward: 9976.500000 ± 2041.795056, best_reward: 12034.800000 ± 6166.685930 in #3


Epoch #6: 1001it [00:02, 429.17it/s, env_step=6000, gradient_step=600, len=0, n/ep=0, n/st=100, rew=0.00]                                                                                         


Epoch #6: test_reward: 11992.800000 ± 3674.729427, best_reward: 12034.800000 ± 6166.685930 in #3


Epoch #7: 1001it [00:02, 434.42it/s, env_step=7000, gradient_step=700, len=0, n/ep=0, n/st=100, rew=0.00]                                                                                         


Epoch #7: test_reward: 13656.700000 ± 4189.807204, best_reward: 13656.700000 ± 4189.807204 in #7


Epoch #8: 1001it [00:02, 435.80it/s, env_step=8000, gradient_step=800, len=73, n/ep=0, n/st=100, rew=2523.00]                                                                                     


Epoch #8: test_reward: 10438.600000 ± 2572.516130, best_reward: 13656.700000 ± 4189.807204 in #7


Epoch #9: 1001it [00:02, 374.18it/s, env_step=9000, gradient_step=900, len=88, n/ep=0, n/st=100, rew=3393.00]                                                                                     


Epoch #9: test_reward: 15681.300000 ± 5207.936003, best_reward: 15681.300000 ± 5207.936003 in #9


Epoch #10: 1001it [00:02, 436.74it/s, env_step=10000, gradient_step=1000, len=100, n/ep=2, n/st=100, rew=3309.25]                                                                                 


Epoch #10: test_reward: 9854.000000 ± 4068.366748, best_reward: 15681.300000 ± 5207.936003 in #9


Epoch #11: 1001it [00:02, 413.21it/s, env_step=11000, gradient_step=1100, len=104, n/ep=0, n/st=100, rew=3088.00]                                                                                 


Epoch #11: test_reward: 11030.500000 ± 4261.590624, best_reward: 15681.300000 ± 5207.936003 in #9


Epoch #12: 1001it [00:02, 379.16it/s, env_step=12000, gradient_step=1200, len=114, n/ep=0, n/st=100, rew=3953.00]                                                                                 


Epoch #12: test_reward: 12071.500000 ± 6380.157259, best_reward: 15681.300000 ± 5207.936003 in #9


Epoch #13: 1001it [00:02, 375.91it/s, env_step=13000, gradient_step=1300, len=130, n/ep=1, n/st=100, rew=5396.00]                                                                                 


Epoch #13: test_reward: 11114.900000 ± 4112.011344, best_reward: 15681.300000 ± 5207.936003 in #9


Epoch #14: 1001it [00:02, 394.93it/s, env_step=14000, gradient_step=1400, len=137, n/ep=0, n/st=100, rew=6143.00]                                                                                 


Epoch #14: test_reward: 11951.200000 ± 4290.487916, best_reward: 15681.300000 ± 5207.936003 in #9


Epoch #15: 1001it [00:03, 332.67it/s, env_step=15000, gradient_step=1500, len=146, n/ep=0, n/st=100, rew=4874.00]                                                                                 


Epoch #15: test_reward: 11043.400000 ± 3795.122138, best_reward: 15681.300000 ± 5207.936003 in #9


Epoch #16: 1001it [00:02, 344.68it/s, env_step=16000, gradient_step=1600, len=160, n/ep=4, n/st=100, rew=6815.38]                                                                                 


Epoch #16: test_reward: 9384.000000 ± 5367.388341, best_reward: 15681.300000 ± 5207.936003 in #9


Epoch #17: 1001it [00:02, 370.15it/s, env_step=17000, gradient_step=1700, len=170, n/ep=1, n/st=100, rew=8044.00]                                                                                 


Epoch #17: test_reward: 12634.200000 ± 3352.137491, best_reward: 15681.300000 ± 5207.936003 in #9


Epoch #18: 1001it [00:03, 321.87it/s, env_step=18000, gradient_step=1800, len=65, n/ep=2, n/st=100, rew=2224.00]                                                                                  


Epoch #18: test_reward: 9212.400000 ± 3207.488837, best_reward: 15681.300000 ± 5207.936003 in #9


Epoch #19: 1001it [00:02, 357.77it/s, env_step=19000, gradient_step=1900, len=190, n/ep=1, n/st=100, rew=9784.00]                                                                                 


Epoch #19: test_reward: 12928.600000 ± 3201.043086, best_reward: 15681.300000 ± 5207.936003 in #9


Epoch #20: 1001it [00:02, 424.64it/s, env_step=20000, gradient_step=2000, len=142, n/ep=3, n/st=100, rew=6000.50]                                                                                 


Epoch #20: test_reward: 13317.200000 ± 4668.687884, best_reward: 15681.300000 ± 5207.936003 in #9


Epoch #21: 1001it [00:02, 373.00it/s, env_step=21000, gradient_step=2100, len=210, n/ep=1, n/st=100, rew=10172.00]                                                                                


Epoch #21: test_reward: 8350.700000 ± 5216.467465, best_reward: 15681.300000 ± 5207.936003 in #9


Epoch #22: 1001it [00:02, 340.83it/s, env_step=22000, gradient_step=2200, len=216, n/ep=0, n/st=100, rew=10919.00]                                                                                


Epoch #22: test_reward: 15002.900000 ± 5326.082509, best_reward: 15681.300000 ± 5207.936003 in #9


Epoch #23: 1001it [00:02, 427.13it/s, env_step=23000, gradient_step=2300, len=166, n/ep=2, n/st=100, rew=6381.25]                                                                                 


Epoch #23: test_reward: 10957.300000 ± 5279.213181, best_reward: 15681.300000 ± 5207.936003 in #9


Epoch #24: 1001it [00:02, 362.86it/s, env_step=24000, gradient_step=2400, len=235, n/ep=0, n/st=100, rew=13156.00]                                                                                


Epoch #24: test_reward: 12357.900000 ± 5302.320331, best_reward: 15681.300000 ± 5207.936003 in #9


Epoch #25: 1001it [00:02, 355.71it/s, env_step=25000, gradient_step=2500, len=248, n/ep=0, n/st=100, rew=15698.00]                                                                                


Epoch #25: test_reward: 16324.500000 ± 4500.335615, best_reward: 16324.500000 ± 4500.335615 in #25


Epoch #26: 1001it [00:02, 409.34it/s, env_step=26000, gradient_step=2600, len=258, n/ep=0, n/st=100, rew=14829.00]                                                                                


Epoch #26: test_reward: 14150.600000 ± 3783.838374, best_reward: 16324.500000 ± 4500.335615 in #25


Epoch #27: 1001it [00:02, 375.75it/s, env_step=27000, gradient_step=2700, len=267, n/ep=0, n/st=100, rew=14776.00]                                                                                


Epoch #27: test_reward: 15961.000000 ± 6199.405988, best_reward: 16324.500000 ± 4500.335615 in #25


Epoch #28: 1001it [00:02, 416.83it/s, env_step=28000, gradient_step=2800, len=181, n/ep=0, n/st=100, rew=8653.67]                                                                                 


Epoch #28: test_reward: 16098.800000 ± 5393.245253, best_reward: 16324.500000 ± 4500.335615 in #25


Epoch #29: 1001it [00:02, 354.12it/s, env_step=29000, gradient_step=2900, len=289, n/ep=0, n/st=100, rew=16895.50]                                                                                


Epoch #29: test_reward: 13611.900000 ± 3612.941639, best_reward: 16324.500000 ± 4500.335615 in #25


Epoch #30: 1001it [00:02, 389.07it/s, env_step=30000, gradient_step=3000, len=214, n/ep=1, n/st=100, rew=10919.00]                                                                                


Epoch #30: test_reward: 11798.400000 ± 5035.653566, best_reward: 16324.500000 ± 4500.335615 in #25


Epoch #31: 1001it [00:03, 324.77it/s, env_step=31000, gradient_step=3100, len=150, n/ep=1, n/st=100, rew=7351.00]                                                                                 


Epoch #31: test_reward: 15374.400000 ± 4039.129094, best_reward: 16324.500000 ± 4500.335615 in #25


Epoch #32: 1001it [00:02, 377.96it/s, env_step=32000, gradient_step=3200, len=319, n/ep=0, n/st=100, rew=17827.25]                                                                                


Epoch #32: test_reward: 13068.300000 ± 5168.897543, best_reward: 16324.500000 ± 4500.335615 in #25


Epoch #33: 1001it [00:02, 361.32it/s, env_step=33000, gradient_step=3300, len=96, n/ep=0, n/st=100, rew=4771.00]                                                                                  


Epoch #33: test_reward: 14589.000000 ± 6124.857435, best_reward: 16324.500000 ± 4500.335615 in #25


Epoch #34: 1001it [00:02, 381.09it/s, env_step=34000, gradient_step=3400, len=173, n/ep=0, n/st=100, rew=8112.50]                                                                                 


Epoch #34: test_reward: 16203.000000 ± 5428.070025, best_reward: 16324.500000 ± 4500.335615 in #25


Epoch #35: 1001it [00:02, 385.36it/s, env_step=35000, gradient_step=3500, len=180, n/ep=0, n/st=100, rew=9568.00]                                                                                 


Epoch #35: test_reward: 14153.300000 ± 5233.317209, best_reward: 16324.500000 ± 4500.335615 in #25


Epoch #36: 1001it [00:02, 410.73it/s, env_step=36000, gradient_step=3600, len=128, n/ep=0, n/st=100, rew=6152.00]                                                                                 


Epoch #36: test_reward: 13863.800000 ± 4328.409264, best_reward: 16324.500000 ± 4500.335615 in #25


Epoch #37: 1001it [00:02, 367.13it/s, env_step=37000, gradient_step=3700, len=256, n/ep=1, n/st=100, rew=10807.00]                                                                                


Epoch #37: test_reward: 13822.600000 ± 3437.738827, best_reward: 16324.500000 ± 4500.335615 in #25


Epoch #38: 1001it [00:02, 398.58it/s, env_step=38000, gradient_step=3800, len=186, n/ep=0, n/st=100, rew=10155.50]                                                                                


Epoch #38: test_reward: 9336.600000 ± 3064.791647, best_reward: 16324.500000 ± 4500.335615 in #25


Epoch #39: 1001it [00:02, 344.22it/s, env_step=39000, gradient_step=3900, len=230, n/ep=1, n/st=100, rew=12382.50]                                                                                


Epoch #39: test_reward: 13862.700000 ± 5681.109065, best_reward: 16324.500000 ± 4500.335615 in #25


Epoch #40: 1001it [00:02, 361.30it/s, env_step=40000, gradient_step=4000, len=299, n/ep=0, n/st=100, rew=17000.00]                                                                                


Epoch #40: test_reward: 9017.100000 ± 4238.461618, best_reward: 16324.500000 ± 4500.335615 in #25


Epoch #41: 1001it [00:02, 366.39it/s, env_step=41000, gradient_step=4100, len=100, n/ep=1, n/st=100, rew=4282.00]                                                                                 


Epoch #41: test_reward: 11602.700000 ± 4225.060190, best_reward: 16324.500000 ± 4500.335615 in #25


Epoch #42: 1001it [00:02, 385.46it/s, env_step=42000, gradient_step=4200, len=160, n/ep=0, n/st=100, rew=7764.00]                                                                                 


Epoch #42: test_reward: 10736.500000 ± 2894.973135, best_reward: 16324.500000 ± 4500.335615 in #25


Epoch #43: 1001it [00:02, 358.91it/s, env_step=43000, gradient_step=4300, len=252, n/ep=1, n/st=100, rew=12111.00]                                                                                


Epoch #43: test_reward: 9624.800000 ± 4364.805993, best_reward: 16324.500000 ± 4500.335615 in #25


Epoch #44: 1001it [00:02, 373.89it/s, env_step=44000, gradient_step=4400, len=264, n/ep=1, n/st=100, rew=15642.00]                                                                                


Epoch #44: test_reward: 13233.300000 ± 4506.524959, best_reward: 16324.500000 ± 4500.335615 in #25


Epoch #45: 1001it [00:02, 405.33it/s, env_step=45000, gradient_step=4500, len=189, n/ep=0, n/st=100, rew=10468.00]                                                                                


Epoch #45: test_reward: 11008.600000 ± 4879.830186, best_reward: 16324.500000 ± 4500.335615 in #25


Epoch #46: 1001it [00:02, 413.29it/s, env_step=46000, gradient_step=4600, len=228, n/ep=1, n/st=100, rew=12544.00]                                                                                


Epoch #46: test_reward: 10054.100000 ± 4125.195255, best_reward: 16324.500000 ± 4500.335615 in #25


Epoch #47: 1001it [00:03, 319.62it/s, env_step=47000, gradient_step=4700, len=276, n/ep=1, n/st=100, rew=14348.00]                                                                                


Epoch #47: test_reward: 11808.200000 ± 4906.903133, best_reward: 16324.500000 ± 4500.335615 in #25


Epoch #48: 1001it [00:03, 330.90it/s, env_step=48000, gradient_step=4800, len=197, n/ep=0, n/st=100, rew=10047.00]                                                                                


Epoch #48: test_reward: 15543.800000 ± 5174.809848, best_reward: 16324.500000 ± 4500.335615 in #25


Epoch #49: 1001it [00:03, 306.69it/s, env_step=49000, gradient_step=4900, len=275, n/ep=0, n/st=100, rew=15190.00]                                                                                


Epoch #49: test_reward: 10807.500000 ± 3760.985223, best_reward: 16324.500000 ± 4500.335615 in #25


Epoch #50: 1001it [00:02, 411.82it/s, env_step=50000, gradient_step=5000, len=181, n/ep=1, n/st=100, rew=9278.00]                                                                                 


Epoch #50: test_reward: 12970.800000 ± 4930.116830, best_reward: 16324.500000 ± 4500.335615 in #25


Epoch #51: 1001it [00:02, 381.54it/s, env_step=51000, gradient_step=5100, len=284, n/ep=1, n/st=100, rew=15917.00]                                                                                


Epoch #51: test_reward: 11381.800000 ± 3465.475055, best_reward: 16324.500000 ± 4500.335615 in #25


Epoch #52: 1001it [00:02, 457.30it/s, env_step=52000, gradient_step=5200, len=211, n/ep=1, n/st=100, rew=11887.50]                                                                                


Epoch #52: test_reward: 13608.400000 ± 4622.849926, best_reward: 16324.500000 ± 4500.335615 in #25


Epoch #53: 1001it [00:03, 316.19it/s, env_step=53000, gradient_step=5300, len=229, n/ep=1, n/st=100, rew=13489.50]                                                                                


Epoch #53: test_reward: 12164.100000 ± 3927.330886, best_reward: 16324.500000 ± 4500.335615 in #25


Epoch #54: 1001it [00:02, 359.93it/s, env_step=54000, gradient_step=5400, len=225, n/ep=0, n/st=100, rew=12968.50]                                                                                


Epoch #54: test_reward: 13107.900000 ± 5096.398541, best_reward: 16324.500000 ± 4500.335615 in #25


Epoch #55: 1001it [00:02, 414.16it/s, env_step=55000, gradient_step=5500, len=211, n/ep=0, n/st=100, rew=11225.00]                                                                                


Epoch #55: test_reward: 7652.200000 ± 2074.380187, best_reward: 16324.500000 ± 4500.335615 in #25


Epoch #56: 1001it [00:02, 415.56it/s, env_step=56000, gradient_step=5600, len=181, n/ep=0, n/st=100, rew=10455.50]                                                                                


Epoch #56: test_reward: 13090.800000 ± 4246.151547, best_reward: 16324.500000 ± 4500.335615 in #25


Epoch #57: 1001it [00:02, 413.34it/s, env_step=57000, gradient_step=5700, len=133, n/ep=0, n/st=100, rew=6940.00]                                                                                 


Epoch #57: test_reward: 9105.600000 ± 3065.659316, best_reward: 16324.500000 ± 4500.335615 in #25


Epoch #58: 1001it [00:02, 431.78it/s, env_step=58000, gradient_step=5800, len=253, n/ep=0, n/st=100, rew=14189.00]                                                                                


Epoch #58: test_reward: 9833.500000 ± 3110.404387, best_reward: 16324.500000 ± 4500.335615 in #25


Epoch #59: 1001it [00:02, 368.29it/s, env_step=59000, gradient_step=5900, len=347, n/ep=0, n/st=100, rew=20568.00]                                                                                


Epoch #59: test_reward: 8243.900000 ± 5398.306873, best_reward: 16324.500000 ± 4500.335615 in #25


Epoch #60: 1001it [00:02, 409.66it/s, env_step=60000, gradient_step=6000, len=164, n/ep=1, n/st=100, rew=9405.00]                                                                                 


Epoch #60: test_reward: 11060.600000 ± 4330.379110, best_reward: 16324.500000 ± 4500.335615 in #25


Epoch #61: 1001it [00:02, 431.68it/s, env_step=61000, gradient_step=6100, len=291, n/ep=1, n/st=100, rew=15643.00]                                                                                


Epoch #61: test_reward: 18076.300000 ± 4565.760047, best_reward: 18076.300000 ± 4565.760047 in #61


Epoch #62: 1001it [00:03, 319.75it/s, env_step=62000, gradient_step=6200, len=283, n/ep=0, n/st=100, rew=15818.00]                                                                                


Epoch #62: test_reward: 15271.300000 ± 3128.557784, best_reward: 18076.300000 ± 4565.760047 in #61


Epoch #63: 1001it [00:02, 389.45it/s, env_step=63000, gradient_step=6300, len=277, n/ep=1, n/st=100, rew=15541.50]                                                                                


Epoch #63: test_reward: 14247.900000 ± 3769.676072, best_reward: 18076.300000 ± 4565.760047 in #61


Epoch #64: 1001it [00:02, 416.15it/s, env_step=64000, gradient_step=6400, len=132, n/ep=0, n/st=100, rew=6519.00]                                                                                 


Epoch #64: test_reward: 18523.000000 ± 5368.375434, best_reward: 18523.000000 ± 5368.375434 in #64


Epoch #65: 1001it [00:02, 412.88it/s, env_step=65000, gradient_step=6500, len=150, n/ep=1, n/st=100, rew=9084.00]                                                                                 


Epoch #65: test_reward: 12121.800000 ± 5191.553251, best_reward: 18523.000000 ± 5368.375434 in #64


Epoch #66: 1001it [00:02, 376.48it/s, env_step=66000, gradient_step=6600, len=135, n/ep=1, n/st=100, rew=6911.00]                                                                                 


Epoch #66: test_reward: 12428.300000 ± 4067.891544, best_reward: 18523.000000 ± 5368.375434 in #64


Epoch #67: 1001it [00:02, 371.03it/s, env_step=67000, gradient_step=6700, len=171, n/ep=0, n/st=100, rew=10297.00]                                                                                


Epoch #67: test_reward: 12323.800000 ± 4217.570101, best_reward: 18523.000000 ± 5368.375434 in #64


Epoch #68: 1001it [00:02, 346.52it/s, env_step=68000, gradient_step=6800, len=209, n/ep=2, n/st=100, rew=11650.75]                                                                                


Epoch #68: test_reward: 11910.300000 ± 6999.990086, best_reward: 18523.000000 ± 5368.375434 in #64


Epoch #69: 1001it [00:02, 370.48it/s, env_step=69000, gradient_step=6900, len=127, n/ep=0, n/st=100, rew=6934.00]                                                                                 


Epoch #69: test_reward: 17276.700000 ± 4822.820752, best_reward: 18523.000000 ± 5368.375434 in #64


Epoch #70: 1001it [00:02, 381.50it/s, env_step=70000, gradient_step=7000, len=249, n/ep=2, n/st=100, rew=13652.50]                                                                                


Epoch #70: test_reward: 12399.300000 ± 7013.435193, best_reward: 18523.000000 ± 5368.375434 in #64


Epoch #71: 1001it [00:02, 414.38it/s, env_step=71000, gradient_step=7100, len=170, n/ep=2, n/st=100, rew=9736.50]                                                                                 


Epoch #71: test_reward: 12682.600000 ± 5018.140556, best_reward: 18523.000000 ± 5368.375434 in #64


Epoch #72: 1001it [00:02, 389.27it/s, env_step=72000, gradient_step=7200, len=304, n/ep=1, n/st=100, rew=17512.00]                                                                                


Epoch #72: test_reward: 13273.200000 ± 4910.200562, best_reward: 18523.000000 ± 5368.375434 in #64


Epoch #73: 1001it [00:02, 366.62it/s, env_step=73000, gradient_step=7300, len=103, n/ep=0, n/st=100, rew=4935.50]                                                                                 


Epoch #73: test_reward: 10844.900000 ± 4937.375830, best_reward: 18523.000000 ± 5368.375434 in #64


Epoch #74: 1001it [00:02, 408.34it/s, env_step=74000, gradient_step=7400, len=270, n/ep=2, n/st=100, rew=15089.25]                                                                                


Epoch #74: test_reward: 13055.800000 ± 5452.691570, best_reward: 18523.000000 ± 5368.375434 in #64


Epoch #75: 1001it [00:02, 379.26it/s, env_step=75000, gradient_step=7500, len=198, n/ep=1, n/st=100, rew=11973.00]                                                                                


Epoch #75: test_reward: 12801.700000 ± 6485.469020, best_reward: 18523.000000 ± 5368.375434 in #64


Epoch #76: 1001it [00:02, 387.92it/s, env_step=76000, gradient_step=7600, len=135, n/ep=0, n/st=100, rew=7619.00]                                                                                 


Epoch #76: test_reward: 16187.600000 ± 7236.205293, best_reward: 18523.000000 ± 5368.375434 in #64


Epoch #77: 1001it [00:02, 346.46it/s, env_step=77000, gradient_step=7700, len=263, n/ep=1, n/st=100, rew=15935.50]                                                                                


Epoch #77: test_reward: 11660.800000 ± 4208.604800, best_reward: 18523.000000 ± 5368.375434 in #64


Epoch #78: 1001it [00:02, 337.03it/s, env_step=78000, gradient_step=7800, len=162, n/ep=0, n/st=100, rew=8674.00]                                                                                 


Epoch #78: test_reward: 11113.600000 ± 4605.253635, best_reward: 18523.000000 ± 5368.375434 in #64


Epoch #79: 1001it [00:03, 330.37it/s, env_step=79000, gradient_step=7900, len=189, n/ep=0, n/st=100, rew=10448.00]                                                                                


Epoch #79: test_reward: 13872.400000 ± 5233.195968, best_reward: 18523.000000 ± 5368.375434 in #64


Epoch #80: 1001it [00:02, 384.11it/s, env_step=80000, gradient_step=8000, len=323, n/ep=1, n/st=100, rew=19783.50]                                                                                


Epoch #80: test_reward: 13232.500000 ± 3313.203201, best_reward: 18523.000000 ± 5368.375434 in #64


Epoch #81: 1001it [00:03, 330.32it/s, env_step=81000, gradient_step=8100, len=182, n/ep=0, n/st=100, rew=11005.00]                                                                                


Epoch #81: test_reward: 13246.000000 ± 5638.439660, best_reward: 18523.000000 ± 5368.375434 in #64


Epoch #82: 1001it [00:02, 409.71it/s, env_step=82000, gradient_step=8200, len=166, n/ep=0, n/st=100, rew=8912.00]                                                                                 


Epoch #82: test_reward: 8765.300000 ± 4362.441221, best_reward: 18523.000000 ± 5368.375434 in #64


Epoch #83: 1001it [00:02, 385.85it/s, env_step=83000, gradient_step=8300, len=110, n/ep=1, n/st=100, rew=5145.00]                                                                                 


Epoch #83: test_reward: 13756.700000 ± 3801.865359, best_reward: 18523.000000 ± 5368.375434 in #64


Epoch #84: 1001it [00:02, 360.32it/s, env_step=84000, gradient_step=8400, len=206, n/ep=0, n/st=100, rew=12907.00]                                                                                


Epoch #84: test_reward: 10887.500000 ± 4452.566771, best_reward: 18523.000000 ± 5368.375434 in #64


Epoch #85: 1001it [00:03, 318.83it/s, env_step=85000, gradient_step=8500, len=166, n/ep=0, n/st=100, rew=8447.00]                                                                                 


Epoch #85: test_reward: 12957.300000 ± 5171.482400, best_reward: 18523.000000 ± 5368.375434 in #64


Epoch #86: 1001it [00:02, 413.98it/s, env_step=86000, gradient_step=8600, len=136, n/ep=1, n/st=100, rew=5574.00]                                                                                 


Epoch #86: test_reward: 9663.800000 ± 2568.828597, best_reward: 18523.000000 ± 5368.375434 in #64


Epoch #87: 1001it [00:02, 392.05it/s, env_step=87000, gradient_step=8700, len=126, n/ep=0, n/st=100, rew=7102.50]                                                                                 


Epoch #87: test_reward: 14973.600000 ± 6577.170489, best_reward: 18523.000000 ± 5368.375434 in #64


Epoch #88: 1001it [00:02, 347.81it/s, env_step=88000, gradient_step=8800, len=201, n/ep=0, n/st=100, rew=11913.00]                                                                                


Epoch #88: test_reward: 18165.000000 ± 7458.512573, best_reward: 18523.000000 ± 5368.375434 in #64


Epoch #89: 1001it [00:02, 387.89it/s, env_step=89000, gradient_step=8900, len=189, n/ep=1, n/st=100, rew=10881.50]                                                                                


Epoch #89: test_reward: 10183.600000 ± 3036.266365, best_reward: 18523.000000 ± 5368.375434 in #64


Epoch #90: 1001it [00:02, 432.28it/s, env_step=90000, gradient_step=9000, len=298, n/ep=1, n/st=100, rew=16658.00]                                                                                


Epoch #90: test_reward: 14965.700000 ± 5853.783256, best_reward: 18523.000000 ± 5368.375434 in #64


Epoch #91: 1001it [00:02, 394.14it/s, env_step=91000, gradient_step=9100, len=138, n/ep=0, n/st=100, rew=7819.00]                                                                                 


Epoch #91: test_reward: 11956.700000 ± 4024.629028, best_reward: 18523.000000 ± 5368.375434 in #64


Epoch #92: 1001it [00:03, 322.94it/s, env_step=92000, gradient_step=9200, len=233, n/ep=0, n/st=100, rew=12796.00]                                                                                


Epoch #92: test_reward: 13931.600000 ± 5125.988982, best_reward: 18523.000000 ± 5368.375434 in #64


Epoch #93: 1001it [00:02, 350.52it/s, env_step=93000, gradient_step=9300, len=123, n/ep=0, n/st=100, rew=6670.00]                                                                                 


Epoch #93: test_reward: 16338.200000 ± 5876.756551, best_reward: 18523.000000 ± 5368.375434 in #64


Epoch #94: 1001it [00:02, 354.68it/s, env_step=94000, gradient_step=9400, len=204, n/ep=0, n/st=100, rew=11317.00]                                                                                


Epoch #94: test_reward: 11772.000000 ± 5412.545963, best_reward: 18523.000000 ± 5368.375434 in #64


Epoch #95: 1001it [00:02, 362.13it/s, env_step=95000, gradient_step=9500, len=235, n/ep=0, n/st=100, rew=14712.00]                                                                                


Epoch #95: test_reward: 16244.100000 ± 4728.726773, best_reward: 18523.000000 ± 5368.375434 in #64


Epoch #96: 1001it [00:02, 431.39it/s, env_step=96000, gradient_step=9600, len=260, n/ep=0, n/st=100, rew=12349.00]                                                                                


Epoch #96: test_reward: 7676.300000 ± 2993.548465, best_reward: 18523.000000 ± 5368.375434 in #64


Epoch #97: 1001it [00:02, 350.26it/s, env_step=97000, gradient_step=9700, len=215, n/ep=1, n/st=100, rew=12584.00]                                                                                


Epoch #97: test_reward: 9275.200000 ± 4340.068843, best_reward: 18523.000000 ± 5368.375434 in #64


Epoch #98: 1001it [00:02, 340.78it/s, env_step=98000, gradient_step=9800, len=234, n/ep=1, n/st=100, rew=11593.00]                                                                                


Epoch #98: test_reward: 12750.800000 ± 6201.115364, best_reward: 18523.000000 ± 5368.375434 in #64


Epoch #99: 1001it [00:02, 359.92it/s, env_step=99000, gradient_step=9900, len=154, n/ep=1, n/st=100, rew=8686.50]                                                                                 


Epoch #99: test_reward: 7714.800000 ± 2611.225145, best_reward: 18523.000000 ± 5368.375434 in #64


Epoch #100: 1001it [00:02, 381.91it/s, env_step=100000, gradient_step=10000, len=160, n/ep=2, n/st=100, rew=8422.50]                                                                              


Epoch #100: test_reward: 10018.000000 ± 6894.521419, best_reward: 18523.000000 ± 5368.375434 in #64


Epoch #101: 1001it [00:02, 392.75it/s, env_step=101000, gradient_step=10100, len=194, n/ep=1, n/st=100, rew=10883.50]                                                                             


Epoch #101: test_reward: 10038.400000 ± 2558.070804, best_reward: 18523.000000 ± 5368.375434 in #64


Epoch #102: 1001it [00:02, 362.14it/s, env_step=102000, gradient_step=10200, len=194, n/ep=1, n/st=100, rew=11011.50]                                                                             


Epoch #102: test_reward: 12357.400000 ± 3833.268376, best_reward: 18523.000000 ± 5368.375434 in #64


Epoch #103: 1001it [00:02, 405.27it/s, env_step=103000, gradient_step=10300, len=103, n/ep=0, n/st=100, rew=5409.00]                                                                              


Epoch #103: test_reward: 9959.500000 ± 2591.656623, best_reward: 18523.000000 ± 5368.375434 in #64


Epoch #104: 1001it [00:02, 344.30it/s, env_step=104000, gradient_step=10400, len=255, n/ep=2, n/st=100, rew=15564.50]                                                                             


Epoch #104: test_reward: 11096.100000 ± 3475.935110, best_reward: 18523.000000 ± 5368.375434 in #64


Epoch #105: 1001it [00:02, 392.34it/s, env_step=105000, gradient_step=10500, len=187, n/ep=0, n/st=100, rew=10683.75]                                                                             


Epoch #105: test_reward: 10927.000000 ± 3189.387966, best_reward: 18523.000000 ± 5368.375434 in #64


Epoch #106: 1001it [00:02, 372.70it/s, env_step=106000, gradient_step=10600, len=183, n/ep=0, n/st=100, rew=9654.00]                                                                              


Epoch #106: test_reward: 12186.200000 ± 4878.443067, best_reward: 18523.000000 ± 5368.375434 in #64


Epoch #107: 1001it [00:02, 378.84it/s, env_step=107000, gradient_step=10700, len=95, n/ep=0, n/st=100, rew=4806.00]                                                                               


Epoch #107: test_reward: 10536.500000 ± 2034.136635, best_reward: 18523.000000 ± 5368.375434 in #64


Epoch #108: 1001it [00:03, 312.37it/s, env_step=108000, gradient_step=10800, len=145, n/ep=0, n/st=100, rew=8112.50]                                                                              


Epoch #108: test_reward: 11334.500000 ± 2700.024528, best_reward: 18523.000000 ± 5368.375434 in #64


Epoch #109: 1001it [00:02, 382.50it/s, env_step=109000, gradient_step=10900, len=124, n/ep=0, n/st=100, rew=6600.00]                                                                              


Epoch #109: test_reward: 11738.600000 ± 3708.841361, best_reward: 18523.000000 ± 5368.375434 in #64


Epoch #110: 1001it [00:02, 333.80it/s, env_step=110000, gradient_step=11000, len=220, n/ep=0, n/st=100, rew=12781.50]                                                                             


Epoch #110: test_reward: 12523.600000 ± 5417.878463, best_reward: 18523.000000 ± 5368.375434 in #64


Epoch #111: 1001it [00:02, 389.58it/s, env_step=111000, gradient_step=11100, len=167, n/ep=2, n/st=100, rew=9445.00]                                                                              


Epoch #111: test_reward: 9692.800000 ± 2163.516064, best_reward: 18523.000000 ± 5368.375434 in #64


Epoch #112: 1001it [00:02, 395.79it/s, env_step=112000, gradient_step=11200, len=192, n/ep=0, n/st=100, rew=11910.50]                                                                             


Epoch #112: test_reward: 8670.000000 ± 2716.366728, best_reward: 18523.000000 ± 5368.375434 in #64


Epoch #113: 1001it [00:02, 394.26it/s, env_step=113000, gradient_step=11300, len=245, n/ep=0, n/st=100, rew=15575.00]                                                                             


Epoch #113: test_reward: 9728.700000 ± 3386.327156, best_reward: 18523.000000 ± 5368.375434 in #64


Epoch #114: 1001it [00:02, 398.03it/s, env_step=114000, gradient_step=11400, len=203, n/ep=0, n/st=100, rew=12212.50]                                                                             


Epoch #114: test_reward: 10390.600000 ± 3111.693404, best_reward: 18523.000000 ± 5368.375434 in #64


Epoch #115: 1001it [00:02, 353.47it/s, env_step=115000, gradient_step=11500, len=93, n/ep=0, n/st=100, rew=4505.00]                                                                               


Epoch #115: test_reward: 11180.800000 ± 3864.290589, best_reward: 18523.000000 ± 5368.375434 in #64


Epoch #116: 1001it [00:02, 375.18it/s, env_step=116000, gradient_step=11600, len=81, n/ep=0, n/st=100, rew=3957.50]                                                                               


Epoch #116: test_reward: 11126.900000 ± 5424.280827, best_reward: 18523.000000 ± 5368.375434 in #64


Epoch #117: 1001it [00:02, 335.96it/s, env_step=117000, gradient_step=11700, len=127, n/ep=1, n/st=100, rew=6694.00]                                                                              


Epoch #117: test_reward: 11437.900000 ± 4505.018123, best_reward: 18523.000000 ± 5368.375434 in #64


Epoch #118: 1001it [00:02, 380.12it/s, env_step=118000, gradient_step=11800, len=113, n/ep=0, n/st=100, rew=6839.00]                                                                              


Epoch #118: test_reward: 7803.600000 ± 3021.599682, best_reward: 18523.000000 ± 5368.375434 in #64


Epoch #119: 1001it [00:03, 331.87it/s, env_step=119000, gradient_step=11900, len=43, n/ep=0, n/st=100, rew=1412.00]                                                                               


Epoch #119: test_reward: 6756.800000 ± 3072.038014, best_reward: 18523.000000 ± 5368.375434 in #64


Epoch #120: 1001it [00:03, 330.41it/s, env_step=120000, gradient_step=12000, len=400, n/ep=0, n/st=100, rew=26802.00]                                                                             


Epoch #120: test_reward: 8590.200000 ± 5510.520952, best_reward: 18523.000000 ± 5368.375434 in #64


Epoch #121: 1001it [00:02, 375.32it/s, env_step=121000, gradient_step=12100, len=157, n/ep=0, n/st=100, rew=8272.00]                                                                              


Epoch #121: test_reward: 13714.200000 ± 3073.173825, best_reward: 18523.000000 ± 5368.375434 in #64


Epoch #122: 1001it [00:03, 316.47it/s, env_step=122000, gradient_step=12200, len=189, n/ep=1, n/st=100, rew=11465.50]                                                                             


Epoch #122: test_reward: 12598.500000 ± 2577.697548, best_reward: 18523.000000 ± 5368.375434 in #64


Epoch #123: 1001it [00:02, 337.92it/s, env_step=123000, gradient_step=12300, len=171, n/ep=0, n/st=100, rew=10784.00]                                                                             


Epoch #123: test_reward: 12188.700000 ± 3835.814439, best_reward: 18523.000000 ± 5368.375434 in #64


Epoch #124: 1001it [00:02, 389.66it/s, env_step=124000, gradient_step=12400, len=193, n/ep=1, n/st=100, rew=11619.00]                                                                             


Epoch #124: test_reward: 10588.300000 ± 5890.556002, best_reward: 18523.000000 ± 5368.375434 in #64


Epoch #125: 1001it [00:02, 354.74it/s, env_step=125000, gradient_step=12500, len=215, n/ep=1, n/st=100, rew=12651.00]                                                                             


Epoch #125: test_reward: 14885.200000 ± 4205.860858, best_reward: 18523.000000 ± 5368.375434 in #64


Epoch #126: 1001it [00:03, 312.79it/s, env_step=126000, gradient_step=12600, len=266, n/ep=2, n/st=100, rew=17364.75]                                                                             


Epoch #126: test_reward: 10686.200000 ± 4665.687041, best_reward: 18523.000000 ± 5368.375434 in #64


Epoch #127: 1001it [00:03, 329.10it/s, env_step=127000, gradient_step=12700, len=117, n/ep=1, n/st=100, rew=6104.00]                                                                              


Epoch #127: test_reward: 7441.100000 ± 3517.453125, best_reward: 18523.000000 ± 5368.375434 in #64


Epoch #128: 1001it [00:02, 347.82it/s, env_step=128000, gradient_step=12800, len=194, n/ep=0, n/st=100, rew=11488.75]                                                                             


Epoch #128: test_reward: 10230.300000 ± 4261.011618, best_reward: 18523.000000 ± 5368.375434 in #64


Epoch #129: 1001it [00:02, 367.60it/s, env_step=129000, gradient_step=12900, len=86, n/ep=0, n/st=100, rew=3893.00]                                                                               


Epoch #129: test_reward: 8299.100000 ± 2020.810850, best_reward: 18523.000000 ± 5368.375434 in #64


Epoch #130: 1001it [00:02, 368.64it/s, env_step=130000, gradient_step=13000, len=186, n/ep=1, n/st=100, rew=9935.00]                                                                              


Epoch #130: test_reward: 9512.400000 ± 3793.930474, best_reward: 18523.000000 ± 5368.375434 in #64


Epoch #131: 1001it [00:02, 361.21it/s, env_step=131000, gradient_step=13100, len=133, n/ep=0, n/st=100, rew=8224.50]                                                                              


Epoch #131: test_reward: 8502.600000 ± 3039.505887, best_reward: 18523.000000 ± 5368.375434 in #64


Epoch #132: 1001it [00:02, 359.22it/s, env_step=132000, gradient_step=13200, len=305, n/ep=0, n/st=100, rew=18453.00]                                                                             


Epoch #132: test_reward: 11340.900000 ± 3920.359077, best_reward: 18523.000000 ± 5368.375434 in #64


Epoch #133: 1001it [00:03, 321.82it/s, env_step=133000, gradient_step=13300, len=339, n/ep=0, n/st=100, rew=21576.00]                                                                             


Epoch #133: test_reward: 11735.500000 ± 5988.778728, best_reward: 18523.000000 ± 5368.375434 in #64


Epoch #134: 1001it [00:02, 336.59it/s, env_step=134000, gradient_step=13400, len=146, n/ep=1, n/st=100, rew=8631.00]                                                                              


Epoch #134: test_reward: 9909.800000 ± 2871.303181, best_reward: 18523.000000 ± 5368.375434 in #64


Epoch #135: 1001it [00:02, 397.48it/s, env_step=135000, gradient_step=13500, len=165, n/ep=0, n/st=100, rew=8820.00]                                                                              


Epoch #135: test_reward: 10659.000000 ± 5630.019858, best_reward: 18523.000000 ± 5368.375434 in #64


Epoch #136: 1001it [00:02, 349.50it/s, env_step=136000, gradient_step=13600, len=211, n/ep=0, n/st=100, rew=12790.00]                                                                             


Epoch #136: test_reward: 15352.400000 ± 7843.748188, best_reward: 18523.000000 ± 5368.375434 in #64


Epoch #137: 1001it [00:02, 404.77it/s, env_step=137000, gradient_step=13700, len=155, n/ep=1, n/st=100, rew=8551.50]                                                                              


Epoch #137: test_reward: 12471.800000 ± 2875.972802, best_reward: 18523.000000 ± 5368.375434 in #64


Epoch #138: 1001it [00:02, 374.13it/s, env_step=138000, gradient_step=13800, len=182, n/ep=0, n/st=100, rew=9275.00]                                                                              


Epoch #138: test_reward: 14611.700000 ± 3888.619345, best_reward: 18523.000000 ± 5368.375434 in #64


Epoch #139: 1001it [00:02, 349.48it/s, env_step=139000, gradient_step=13900, len=152, n/ep=0, n/st=100, rew=8616.00]                                                                              


Epoch #139: test_reward: 11361.800000 ± 3501.072773, best_reward: 18523.000000 ± 5368.375434 in #64


Epoch #140: 1001it [00:02, 341.38it/s, env_step=140000, gradient_step=14000, len=151, n/ep=1, n/st=100, rew=8130.00]                                                                              


Epoch #140: test_reward: 10753.700000 ± 5864.842795, best_reward: 18523.000000 ± 5368.375434 in #64


Epoch #141: 1001it [00:02, 399.03it/s, env_step=141000, gradient_step=14100, len=145, n/ep=1, n/st=100, rew=8448.00]                                                                              


Epoch #141: test_reward: 13605.000000 ± 8801.963804, best_reward: 18523.000000 ± 5368.375434 in #64


Epoch #142: 1001it [00:02, 403.47it/s, env_step=142000, gradient_step=14200, len=140, n/ep=0, n/st=100, rew=7962.00]                                                                              


Epoch #142: test_reward: 10183.200000 ± 3420.255219, best_reward: 18523.000000 ± 5368.375434 in #64


Epoch #143: 1001it [00:02, 340.15it/s, env_step=143000, gradient_step=14300, len=253, n/ep=0, n/st=100, rew=13737.00]                                                                             


Epoch #143: test_reward: 9120.600000 ± 2713.884161, best_reward: 18523.000000 ± 5368.375434 in #64


Epoch #144: 1001it [00:02, 393.27it/s, env_step=144000, gradient_step=14400, len=147, n/ep=0, n/st=100, rew=8187.00]                                                                              


Epoch #144: test_reward: 11002.200000 ± 5281.085188, best_reward: 18523.000000 ± 5368.375434 in #64


Epoch #145: 1001it [00:03, 316.53it/s, env_step=145000, gradient_step=14500, len=239, n/ep=0, n/st=100, rew=15624.50]                                                                             


Epoch #145: test_reward: 9784.500000 ± 3339.814913, best_reward: 18523.000000 ± 5368.375434 in #64


Epoch #146: 1001it [00:03, 331.29it/s, env_step=146000, gradient_step=14600, len=119, n/ep=1, n/st=100, rew=7034.00]                                                                              


Epoch #146: test_reward: 13396.900000 ± 4909.551272, best_reward: 18523.000000 ± 5368.375434 in #64


Epoch #147: 1001it [00:02, 337.18it/s, env_step=147000, gradient_step=14700, len=103, n/ep=0, n/st=100, rew=5436.50]                                                                              


Epoch #147: test_reward: 11429.200000 ± 4798.911039, best_reward: 18523.000000 ± 5368.375434 in #64


Epoch #148: 1001it [00:02, 346.95it/s, env_step=148000, gradient_step=14800, len=226, n/ep=0, n/st=100, rew=12705.50]                                                                             


Epoch #148: test_reward: 11278.100000 ± 3380.172908, best_reward: 18523.000000 ± 5368.375434 in #64


Epoch #149: 1001it [00:02, 344.53it/s, env_step=149000, gradient_step=14900, len=203, n/ep=0, n/st=100, rew=12512.00]                                                                             


Epoch #149: test_reward: 11655.700000 ± 3775.284017, best_reward: 18523.000000 ± 5368.375434 in #64


Epoch #150: 1001it [00:02, 342.72it/s, env_step=150000, gradient_step=15000, len=161, n/ep=0, n/st=100, rew=9416.00]                                                                              


Epoch #150: test_reward: 10081.700000 ± 4755.726255, best_reward: 18523.000000 ± 5368.375434 in #64

InfoStats(gradient_step=15000, best_reward=18523.0, best_reward_std=5368.375433965102, train_step=150000, train_episode=778, test_step=341075, test_episode=1510, timing=TimingStats(total_time=597.7426607608795, train_time=405.3665051460266, train_time_collect=53.21074962615967, train_time_update=345.92297077178955, test_time=192.3761556148529, update_speed=370.0355063770377))

(the trained policy can be accessed via policy.policies[agents[1]])
Train with agent_opponent #7


Epoch #1: 1001it [00:02, 474.54it/s, env_step=1000, gradient_step=100, len=0, n/ep=0, n/st=100, rew=0.00]                                                                                         


Epoch #1: test_reward: 13600.000000 ± 7106.723858, best_reward: 13600.000000 ± 7106.723858 in #1


Epoch #2: 1001it [00:02, 420.92it/s, env_step=2000, gradient_step=200, len=0, n/ep=0, n/st=100, rew=0.00]                                                                                         


Epoch #2: test_reward: 11184.600000 ± 5776.043355, best_reward: 13600.000000 ± 7106.723858 in #1


Epoch #3: 1001it [00:02, 425.60it/s, env_step=3000, gradient_step=300, len=0, n/ep=0, n/st=100, rew=0.00]                                                                                         


Epoch #3: test_reward: 10302.000000 ± 5355.205468, best_reward: 13600.000000 ± 7106.723858 in #1


Epoch #4: 1001it [00:02, 442.35it/s, env_step=4000, gradient_step=400, len=0, n/ep=0, n/st=100, rew=0.00]                                                                                         


Epoch #4: test_reward: 6871.600000 ± 3896.745699, best_reward: 13600.000000 ± 7106.723858 in #1


Epoch #5: 1001it [00:01, 541.01it/s, env_step=5000, gradient_step=500, len=0, n/ep=0, n/st=100, rew=0.00]                                                                                         


Epoch #5: test_reward: 6422.800000 ± 6394.417077, best_reward: 13600.000000 ± 7106.723858 in #1


Epoch #6: 1001it [00:02, 429.82it/s, env_step=6000, gradient_step=600, len=59, n/ep=0, n/st=100, rew=2044.00]                                                                                     


Epoch #6: test_reward: 3450.600000 ± 4250.426054, best_reward: 13600.000000 ± 7106.723858 in #1


Epoch #7: 1001it [00:02, 470.06it/s, env_step=7000, gradient_step=700, len=67, n/ep=0, n/st=100, rew=2470.00]                                                                                     


Epoch #7: test_reward: 16299.200000 ± 4022.009269, best_reward: 16299.200000 ± 4022.009269 in #7


Epoch #8: 1001it [00:02, 410.13it/s, env_step=8000, gradient_step=800, len=67, n/ep=0, n/st=100, rew=2470.00]                                                                                     


Epoch #8: test_reward: 13400.600000 ± 4332.081652, best_reward: 16299.200000 ± 4022.009269 in #7


Epoch #9: 1001it [00:02, 372.19it/s, env_step=9000, gradient_step=900, len=85, n/ep=0, n/st=100, rew=2979.50]                                                                                     


Epoch #9: test_reward: 13471.400000 ± 3463.944434, best_reward: 16299.200000 ± 4022.009269 in #7


Epoch #10: 1001it [00:02, 406.61it/s, env_step=10000, gradient_step=1000, len=95, n/ep=0, n/st=100, rew=3240.00]                                                                                  


Epoch #10: test_reward: 11150.500000 ± 5871.247572, best_reward: 16299.200000 ± 4022.009269 in #7


Epoch #11: 1001it [00:02, 373.43it/s, env_step=11000, gradient_step=1100, len=95, n/ep=0, n/st=100, rew=3240.00]                                                                                  


Epoch #11: test_reward: 10222.200000 ± 4860.207173, best_reward: 16299.200000 ± 4022.009269 in #7


Epoch #12: 1001it [00:02, 411.66it/s, env_step=12000, gradient_step=1200, len=95, n/ep=0, n/st=100, rew=3240.00]                                                                                  


Epoch #12: test_reward: 13193.500000 ± 4361.177737, best_reward: 16299.200000 ± 4022.009269 in #7


Epoch #13: 1001it [00:02, 364.35it/s, env_step=13000, gradient_step=1300, len=130, n/ep=1, n/st=100, rew=6181.00]                                                                                 


Epoch #13: test_reward: 12056.100000 ± 3261.584935, best_reward: 16299.200000 ± 4022.009269 in #7


Epoch #14: 1001it [00:02, 426.31it/s, env_step=14000, gradient_step=1400, len=139, n/ep=0, n/st=100, rew=7010.00]                                                                                 


Epoch #14: test_reward: 11128.900000 ± 4217.780470, best_reward: 16299.200000 ± 4022.009269 in #7


Epoch #15: 1001it [00:02, 404.18it/s, env_step=15000, gradient_step=1500, len=149, n/ep=0, n/st=100, rew=6453.00]                                                                                 


Epoch #15: test_reward: 12622.000000 ± 4868.176989, best_reward: 16299.200000 ± 4022.009269 in #7


Epoch #16: 1001it [00:02, 355.56it/s, env_step=16000, gradient_step=1600, len=64, n/ep=0, n/st=100, rew=1903.50]                                                                                  


Epoch #16: test_reward: 12337.400000 ± 4129.676651, best_reward: 16299.200000 ± 4022.009269 in #7


Epoch #17: 1001it [00:02, 389.52it/s, env_step=17000, gradient_step=1700, len=64, n/ep=0, n/st=100, rew=1903.50]                                                                                  


Epoch #17: test_reward: 12501.600000 ± 4652.943051, best_reward: 16299.200000 ± 4022.009269 in #7


Epoch #18: 1001it [00:02, 392.95it/s, env_step=18000, gradient_step=1800, len=180, n/ep=3, n/st=100, rew=7576.33]                                                                                 


Epoch #18: test_reward: 10140.300000 ± 3259.563285, best_reward: 16299.200000 ± 4022.009269 in #7


Epoch #19: 1001it [00:02, 357.99it/s, env_step=19000, gradient_step=1900, len=100, n/ep=0, n/st=100, rew=3560.50]                                                                                 


Epoch #19: test_reward: 9052.800000 ± 4621.848825, best_reward: 16299.200000 ± 4022.009269 in #7


Epoch #20: 1001it [00:03, 332.34it/s, env_step=20000, gradient_step=2000, len=200, n/ep=1, n/st=100, rew=8383.00]                                                                                 


Epoch #20: test_reward: 9743.300000 ± 3905.854888, best_reward: 16299.200000 ± 4022.009269 in #7


Epoch #21: 1001it [00:02, 345.19it/s, env_step=21000, gradient_step=2100, len=207, n/ep=0, n/st=100, rew=10579.00]                                                                                


Epoch #21: test_reward: 10400.400000 ± 3588.683636, best_reward: 16299.200000 ± 4022.009269 in #7


Epoch #22: 1001it [00:02, 397.87it/s, env_step=22000, gradient_step=2200, len=220, n/ep=1, n/st=100, rew=12672.50]                                                                                


Epoch #22: test_reward: 9913.400000 ± 3392.708216, best_reward: 16299.200000 ± 4022.009269 in #7


Epoch #23: 1001it [00:02, 365.00it/s, env_step=23000, gradient_step=2300, len=138, n/ep=0, n/st=100, rew=7063.00]                                                                                 


Epoch #23: test_reward: 8422.500000 ± 3758.162802, best_reward: 16299.200000 ± 4022.009269 in #7


Epoch #24: 1001it [00:03, 331.75it/s, env_step=24000, gradient_step=2400, len=238, n/ep=0, n/st=100, rew=13023.00]                                                                                


Epoch #24: test_reward: 12686.100000 ± 6367.223452, best_reward: 16299.200000 ± 4022.009269 in #7


Epoch #25: 1001it [00:02, 413.23it/s, env_step=25000, gradient_step=2500, len=249, n/ep=0, n/st=100, rew=13559.75]                                                                                


Epoch #25: test_reward: 20102.100000 ± 4945.992609, best_reward: 20102.100000 ± 4945.992609 in #25


Epoch #26: 1001it [00:02, 437.42it/s, env_step=26000, gradient_step=2600, len=259, n/ep=0, n/st=100, rew=14361.50]                                                                                


Epoch #26: test_reward: 10735.400000 ± 3832.971907, best_reward: 20102.100000 ± 4945.992609 in #25


Epoch #27: 1001it [00:02, 359.96it/s, env_step=27000, gradient_step=2700, len=270, n/ep=1, n/st=100, rew=11750.50]                                                                                


Epoch #27: test_reward: 11232.600000 ± 4692.033103, best_reward: 20102.100000 ± 4945.992609 in #25


Epoch #28: 1001it [00:02, 420.03it/s, env_step=28000, gradient_step=2800, len=100, n/ep=0, n/st=100, rew=4192.00]                                                                                 


Epoch #28: test_reward: 11728.600000 ± 3761.210874, best_reward: 20102.100000 ± 4945.992609 in #25


Epoch #29: 1001it [00:02, 349.36it/s, env_step=29000, gradient_step=2900, len=128, n/ep=0, n/st=100, rew=5600.00]                                                                                 


Epoch #29: test_reward: 10555.400000 ± 3547.393528, best_reward: 20102.100000 ± 4945.992609 in #25


Epoch #30: 1001it [00:02, 354.34it/s, env_step=30000, gradient_step=3000, len=202, n/ep=0, n/st=100, rew=8336.00]                                                                                 


Epoch #30: test_reward: 12413.900000 ± 6285.560221, best_reward: 20102.100000 ± 4945.992609 in #25


Epoch #31: 1001it [00:02, 386.39it/s, env_step=31000, gradient_step=3100, len=310, n/ep=2, n/st=100, rew=16582.00]                                                                                


Epoch #31: test_reward: 14214.700000 ± 6046.066689, best_reward: 20102.100000 ± 4945.992609 in #25


Epoch #32: 1001it [00:02, 357.15it/s, env_step=32000, gradient_step=3200, len=318, n/ep=0, n/st=100, rew=17615.00]                                                                                


Epoch #32: test_reward: 10901.800000 ± 5725.161409, best_reward: 20102.100000 ± 4945.992609 in #25


Epoch #33: 1001it [00:02, 445.78it/s, env_step=33000, gradient_step=3300, len=263, n/ep=1, n/st=100, rew=13487.50]                                                                                


Epoch #33: test_reward: 9910.600000 ± 3945.146187, best_reward: 20102.100000 ± 4945.992609 in #25


Epoch #34: 1001it [00:02, 426.75it/s, env_step=34000, gradient_step=3400, len=243, n/ep=2, n/st=100, rew=13277.25]                                                                                


Epoch #34: test_reward: 13116.900000 ± 3202.591152, best_reward: 20102.100000 ± 4945.992609 in #25


Epoch #35: 1001it [00:02, 394.16it/s, env_step=35000, gradient_step=3500, len=200, n/ep=0, n/st=100, rew=11528.00]                                                                                


Epoch #35: test_reward: 10601.200000 ± 4398.243577, best_reward: 20102.100000 ± 4945.992609 in #25


Epoch #36: 1001it [00:02, 433.68it/s, env_step=36000, gradient_step=3600, len=200, n/ep=0, n/st=100, rew=11528.00]                                                                                


Epoch #36: test_reward: 14565.700000 ± 5331.905176, best_reward: 20102.100000 ± 4945.992609 in #25


Epoch #37: 1001it [00:02, 430.94it/s, env_step=37000, gradient_step=3700, len=198, n/ep=1, n/st=100, rew=11631.00]                                                                                


Epoch #37: test_reward: 10691.800000 ± 4498.663664, best_reward: 20102.100000 ± 4945.992609 in #25


Epoch #38: 1001it [00:02, 406.48it/s, env_step=38000, gradient_step=3800, len=126, n/ep=0, n/st=100, rew=6035.00]                                                                                 


Epoch #38: test_reward: 11843.100000 ± 6190.765776, best_reward: 20102.100000 ± 4945.992609 in #25


Epoch #39: 1001it [00:02, 338.18it/s, env_step=39000, gradient_step=3900, len=165, n/ep=1, n/st=100, rew=8656.00]                                                                                 


Epoch #39: test_reward: 9976.800000 ± 5297.469675, best_reward: 20102.100000 ± 4945.992609 in #25


Epoch #40: 1001it [00:02, 374.44it/s, env_step=40000, gradient_step=4000, len=400, n/ep=5, n/st=100, rew=23624.20]                                                                                


Epoch #40: test_reward: 10668.000000 ± 4482.313778, best_reward: 20102.100000 ± 4945.992609 in #25


Epoch #41: 1001it [00:02, 352.47it/s, env_step=41000, gradient_step=4100, len=276, n/ep=1, n/st=100, rew=16393.00]                                                                                


Epoch #41: test_reward: 8540.800000 ± 4213.697208, best_reward: 20102.100000 ± 4945.992609 in #25


Epoch #42: 1001it [00:02, 383.84it/s, env_step=42000, gradient_step=4200, len=233, n/ep=0, n/st=100, rew=12252.00]                                                                                


Epoch #42: test_reward: 12408.100000 ± 4575.297181, best_reward: 20102.100000 ± 4945.992609 in #25


Epoch #43: 1001it [00:02, 422.11it/s, env_step=43000, gradient_step=4300, len=161, n/ep=3, n/st=100, rew=7829.33]                                                                                 


Epoch #43: test_reward: 11501.300000 ± 4009.290337, best_reward: 20102.100000 ± 4945.992609 in #25


Epoch #44: 1001it [00:03, 315.47it/s, env_step=44000, gradient_step=4400, len=178, n/ep=0, n/st=100, rew=10355.00]                                                                                


Epoch #44: test_reward: 13421.500000 ± 4984.253891, best_reward: 20102.100000 ± 4945.992609 in #25


Epoch #45: 1001it [00:02, 401.47it/s, env_step=45000, gradient_step=4500, len=137, n/ep=0, n/st=100, rew=6730.00]                                                                                 


Epoch #45: test_reward: 12822.000000 ± 6129.552871, best_reward: 20102.100000 ± 4945.992609 in #25


Epoch #46: 1001it [00:02, 385.88it/s, env_step=46000, gradient_step=4600, len=157, n/ep=0, n/st=100, rew=8335.00]                                                                                 


Epoch #46: test_reward: 11927.700000 ± 3148.939918, best_reward: 20102.100000 ± 4945.992609 in #25


Epoch #47: 1001it [00:02, 400.31it/s, env_step=47000, gradient_step=4700, len=209, n/ep=0, n/st=100, rew=12315.00]                                                                                


Epoch #47: test_reward: 11332.600000 ± 4349.963223, best_reward: 20102.100000 ± 4945.992609 in #25


Epoch #48: 1001it [00:02, 355.26it/s, env_step=48000, gradient_step=4800, len=169, n/ep=0, n/st=100, rew=8112.50]                                                                                 


Epoch #48: test_reward: 10016.500000 ± 3587.988106, best_reward: 20102.100000 ± 4945.992609 in #25


Epoch #49: 1001it [00:02, 400.53it/s, env_step=49000, gradient_step=4900, len=238, n/ep=0, n/st=100, rew=14417.50]                                                                                


Epoch #49: test_reward: 11943.400000 ± 5596.552299, best_reward: 20102.100000 ± 4945.992609 in #25


Epoch #50: 1001it [00:02, 353.96it/s, env_step=50000, gradient_step=5000, len=100, n/ep=1, n/st=100, rew=4680.00]                                                                                 


Epoch #50: test_reward: 11629.800000 ± 3990.385891, best_reward: 20102.100000 ± 4945.992609 in #25


Epoch #51: 1001it [00:02, 401.03it/s, env_step=51000, gradient_step=5100, len=257, n/ep=0, n/st=100, rew=14107.50]                                                                                


Epoch #51: test_reward: 17808.500000 ± 4161.972399, best_reward: 20102.100000 ± 4945.992609 in #25


Epoch #52: 1001it [00:03, 316.67it/s, env_step=52000, gradient_step=5200, len=187, n/ep=2, n/st=100, rew=9686.50]                                                                                 


Epoch #52: test_reward: 9039.000000 ± 4790.637056, best_reward: 20102.100000 ± 4945.992609 in #25


Epoch #53: 1001it [00:02, 414.24it/s, env_step=53000, gradient_step=5300, len=199, n/ep=1, n/st=100, rew=10631.00]                                                                                


Epoch #53: test_reward: 12490.800000 ± 5670.087227, best_reward: 20102.100000 ± 4945.992609 in #25


Epoch #54: 1001it [00:02, 418.34it/s, env_step=54000, gradient_step=5400, len=111, n/ep=0, n/st=100, rew=5594.50]                                                                                 


Epoch #54: test_reward: 11702.600000 ± 5843.824179, best_reward: 20102.100000 ± 4945.992609 in #25


Epoch #55: 1001it [00:02, 340.49it/s, env_step=55000, gradient_step=5500, len=149, n/ep=0, n/st=100, rew=8936.50]                                                                                 


Epoch #55: test_reward: 12928.700000 ± 4476.581509, best_reward: 20102.100000 ± 4945.992609 in #25


Epoch #56: 1001it [00:02, 376.14it/s, env_step=56000, gradient_step=5600, len=181, n/ep=0, n/st=100, rew=10097.00]                                                                                


Epoch #56: test_reward: 13287.400000 ± 6969.276809, best_reward: 20102.100000 ± 4945.992609 in #25


Epoch #57: 1001it [00:02, 342.84it/s, env_step=57000, gradient_step=5700, len=151, n/ep=0, n/st=100, rew=7499.00]                                                                                 


Epoch #57: test_reward: 14331.700000 ± 5571.658838, best_reward: 20102.100000 ± 4945.992609 in #25


Epoch #58: 1001it [00:02, 412.58it/s, env_step=58000, gradient_step=5800, len=97, n/ep=0, n/st=100, rew=4800.00]                                                                                  


Epoch #58: test_reward: 13430.600000 ± 6096.031319, best_reward: 20102.100000 ± 4945.992609 in #25


Epoch #59: 1001it [00:02, 414.94it/s, env_step=59000, gradient_step=5900, len=243, n/ep=0, n/st=100, rew=13519.50]                                                                                


Epoch #59: test_reward: 15710.000000 ± 6089.414093, best_reward: 20102.100000 ± 4945.992609 in #25


Epoch #60: 1001it [00:02, 384.72it/s, env_step=60000, gradient_step=6000, len=311, n/ep=0, n/st=100, rew=19473.50]                                                                                


Epoch #60: test_reward: 13389.700000 ± 8358.931846, best_reward: 20102.100000 ± 4945.992609 in #25


Epoch #61: 1001it [00:02, 413.54it/s, env_step=61000, gradient_step=6100, len=182, n/ep=0, n/st=100, rew=9987.00]                                                                                 


Epoch #61: test_reward: 8373.400000 ± 5104.815358, best_reward: 20102.100000 ± 4945.992609 in #25


Epoch #62: 1001it [00:02, 427.78it/s, env_step=62000, gradient_step=6200, len=117, n/ep=1, n/st=100, rew=5561.50]                                                                                 


Epoch #62: test_reward: 11511.000000 ± 3916.864690, best_reward: 20102.100000 ± 4945.992609 in #25


Epoch #63: 1001it [00:02, 398.26it/s, env_step=63000, gradient_step=6300, len=204, n/ep=0, n/st=100, rew=11524.00]                                                                                


Epoch #63: test_reward: 16443.300000 ± 6074.173574, best_reward: 20102.100000 ± 4945.992609 in #25


Epoch #64: 1001it [00:02, 372.89it/s, env_step=64000, gradient_step=6400, len=257, n/ep=5, n/st=100, rew=14565.70]                                                                                


Epoch #64: test_reward: 14420.600000 ± 4314.378639, best_reward: 20102.100000 ± 4945.992609 in #25


Epoch #65: 1001it [00:02, 416.70it/s, env_step=65000, gradient_step=6500, len=201, n/ep=2, n/st=100, rew=11452.00]                                                                                


Epoch #65: test_reward: 10662.600000 ± 3979.825227, best_reward: 20102.100000 ± 4945.992609 in #25


Epoch #66: 1001it [00:02, 384.56it/s, env_step=66000, gradient_step=6600, len=339, n/ep=1, n/st=100, rew=19600.00]                                                                                


Epoch #66: test_reward: 9242.000000 ± 4755.542682, best_reward: 20102.100000 ± 4945.992609 in #25


Epoch #67: 1001it [00:02, 401.30it/s, env_step=67000, gradient_step=6700, len=162, n/ep=1, n/st=100, rew=8741.00]                                                                                 


Epoch #67: test_reward: 10983.100000 ± 6012.132907, best_reward: 20102.100000 ± 4945.992609 in #25


Epoch #68: 1001it [00:02, 337.89it/s, env_step=68000, gradient_step=6800, len=279, n/ep=0, n/st=100, rew=16958.00]                                                                                


Epoch #68: test_reward: 12742.400000 ± 4563.304772, best_reward: 20102.100000 ± 4945.992609 in #25


Epoch #69: 1001it [00:02, 386.63it/s, env_step=69000, gradient_step=6900, len=204, n/ep=0, n/st=100, rew=12073.50]                                                                                


Epoch #69: test_reward: 13611.300000 ± 2980.046578, best_reward: 20102.100000 ± 4945.992609 in #25


Epoch #70: 1001it [00:02, 402.96it/s, env_step=70000, gradient_step=7000, len=178, n/ep=0, n/st=100, rew=9835.00]                                                                                 


Epoch #70: test_reward: 10615.400000 ± 4832.095512, best_reward: 20102.100000 ± 4945.992609 in #25


Epoch #71: 1001it [00:02, 402.17it/s, env_step=71000, gradient_step=7100, len=259, n/ep=3, n/st=100, rew=14422.67]                                                                                


Epoch #71: test_reward: 12291.800000 ± 7420.610309, best_reward: 20102.100000 ± 4945.992609 in #25


Epoch #72: 1001it [00:02, 349.25it/s, env_step=72000, gradient_step=7200, len=116, n/ep=0, n/st=100, rew=5518.00]                                                                                 


Epoch #72: test_reward: 14457.800000 ± 5404.513628, best_reward: 20102.100000 ± 4945.992609 in #25


Epoch #73: 1001it [00:02, 363.16it/s, env_step=73000, gradient_step=7300, len=314, n/ep=0, n/st=100, rew=17163.00]                                                                                


Epoch #73: test_reward: 12656.800000 ± 4373.149433, best_reward: 20102.100000 ± 4945.992609 in #25


Epoch #74: 1001it [00:02, 363.99it/s, env_step=74000, gradient_step=7400, len=100, n/ep=0, n/st=100, rew=5666.50]                                                                                 


Epoch #74: test_reward: 12861.700000 ± 6901.566852, best_reward: 20102.100000 ± 4945.992609 in #25


Epoch #75: 1001it [00:02, 396.60it/s, env_step=75000, gradient_step=7500, len=106, n/ep=0, n/st=100, rew=5036.00]                                                                                 


Epoch #75: test_reward: 12746.000000 ± 7125.140448, best_reward: 20102.100000 ± 4945.992609 in #25


Epoch #76: 1001it [00:02, 394.35it/s, env_step=76000, gradient_step=7600, len=101, n/ep=1, n/st=100, rew=3988.00]                                                                                 


Epoch #76: test_reward: 13499.400000 ± 4419.637456, best_reward: 20102.100000 ± 4945.992609 in #25


Epoch #77: 1001it [00:02, 356.41it/s, env_step=77000, gradient_step=7700, len=157, n/ep=2, n/st=100, rew=8849.00]                                                                                 


Epoch #77: test_reward: 10872.500000 ± 5700.633531, best_reward: 20102.100000 ± 4945.992609 in #25


Epoch #78: 1001it [00:02, 351.23it/s, env_step=78000, gradient_step=7800, len=138, n/ep=0, n/st=100, rew=8193.50]                                                                                 


Epoch #78: test_reward: 8233.100000 ± 2592.366427, best_reward: 20102.100000 ± 4945.992609 in #25


Epoch #79: 1001it [00:02, 385.21it/s, env_step=79000, gradient_step=7900, len=100, n/ep=0, n/st=100, rew=3730.50]                                                                                 


Epoch #79: test_reward: 13306.100000 ± 5802.910312, best_reward: 20102.100000 ± 4945.992609 in #25


Epoch #80: 1001it [00:03, 328.21it/s, env_step=80000, gradient_step=8000, len=205, n/ep=0, n/st=100, rew=11962.50]                                                                                


Epoch #80: test_reward: 15261.800000 ± 5703.996648, best_reward: 20102.100000 ± 4945.992609 in #25


Epoch #81: 1001it [00:02, 406.52it/s, env_step=81000, gradient_step=8100, len=400, n/ep=0, n/st=100, rew=24056.00]                                                                                


Epoch #81: test_reward: 12638.200000 ± 3278.747865, best_reward: 20102.100000 ± 4945.992609 in #25


Epoch #82: 1001it [00:02, 414.56it/s, env_step=82000, gradient_step=8200, len=197, n/ep=1, n/st=100, rew=10654.00]                                                                                


Epoch #82: test_reward: 12897.500000 ± 5976.263051, best_reward: 20102.100000 ± 4945.992609 in #25


Epoch #83: 1001it [00:02, 439.78it/s, env_step=83000, gradient_step=8300, len=120, n/ep=1, n/st=100, rew=6069.00]                                                                                 


Epoch #83: test_reward: 13253.600000 ± 5232.861152, best_reward: 20102.100000 ± 4945.992609 in #25


Epoch #84: 1001it [00:02, 397.99it/s, env_step=84000, gradient_step=8400, len=122, n/ep=0, n/st=100, rew=4580.50]                                                                                 


Epoch #84: test_reward: 10852.900000 ± 3378.280700, best_reward: 20102.100000 ± 4945.992609 in #25


Epoch #85: 1001it [00:02, 352.07it/s, env_step=85000, gradient_step=8500, len=378, n/ep=1, n/st=100, rew=23919.50]                                                                                


Epoch #85: test_reward: 10634.200000 ± 3996.082151, best_reward: 20102.100000 ± 4945.992609 in #25


Epoch #86: 1001it [00:02, 386.09it/s, env_step=86000, gradient_step=8600, len=153, n/ep=0, n/st=100, rew=9290.00]                                                                                 


Epoch #86: test_reward: 17476.400000 ± 7006.910363, best_reward: 20102.100000 ± 4945.992609 in #25


Epoch #87: 1001it [00:02, 438.09it/s, env_step=87000, gradient_step=8700, len=234, n/ep=0, n/st=100, rew=13681.50]                                                                                


Epoch #87: test_reward: 14629.100000 ± 5980.932176, best_reward: 20102.100000 ± 4945.992609 in #25


Epoch #88: 1001it [00:02, 343.36it/s, env_step=88000, gradient_step=8800, len=153, n/ep=1, n/st=100, rew=8875.50]                                                                                 


Epoch #88: test_reward: 10362.600000 ± 3460.438244, best_reward: 20102.100000 ± 4945.992609 in #25


Epoch #89: 1001it [00:02, 358.98it/s, env_step=89000, gradient_step=8900, len=116, n/ep=0, n/st=100, rew=4163.00]                                                                                 


Epoch #89: test_reward: 9825.300000 ± 3370.038132, best_reward: 20102.100000 ± 4945.992609 in #25


Epoch #90: 1001it [00:02, 418.11it/s, env_step=90000, gradient_step=9000, len=196, n/ep=2, n/st=100, rew=10240.00]                                                                                


Epoch #90: test_reward: 13613.000000 ± 4100.812066, best_reward: 20102.100000 ± 4945.992609 in #25


Epoch #91: 1001it [00:02, 387.05it/s, env_step=91000, gradient_step=9100, len=96, n/ep=0, n/st=100, rew=3943.50]                                                                                  


Epoch #91: test_reward: 14771.000000 ± 5953.415104, best_reward: 20102.100000 ± 4945.992609 in #25


Epoch #92: 1001it [00:02, 403.56it/s, env_step=92000, gradient_step=9200, len=211, n/ep=0, n/st=100, rew=10689.00]                                                                                


Epoch #92: test_reward: 14135.900000 ± 5539.548762, best_reward: 20102.100000 ± 4945.992609 in #25


Epoch #93: 1001it [00:02, 430.16it/s, env_step=93000, gradient_step=9300, len=217, n/ep=0, n/st=100, rew=12184.00]                                                                                


Epoch #93: test_reward: 13118.100000 ± 7657.613459, best_reward: 20102.100000 ± 4945.992609 in #25


Epoch #94: 1001it [00:02, 403.33it/s, env_step=94000, gradient_step=9400, len=178, n/ep=2, n/st=100, rew=9589.25]                                                                                 


Epoch #94: test_reward: 13280.600000 ± 4645.237996, best_reward: 20102.100000 ± 4945.992609 in #25


Epoch #95: 1001it [00:02, 427.24it/s, env_step=95000, gradient_step=9500, len=78, n/ep=1, n/st=100, rew=4227.00]                                                                                  


Epoch #95: test_reward: 13028.700000 ± 3446.503389, best_reward: 20102.100000 ± 4945.992609 in #25


Epoch #96: 1001it [00:02, 357.72it/s, env_step=96000, gradient_step=9600, len=246, n/ep=1, n/st=100, rew=15143.00]                                                                                


Epoch #96: test_reward: 16256.000000 ± 8456.040728, best_reward: 20102.100000 ± 4945.992609 in #25


Epoch #97: 1001it [00:02, 405.82it/s, env_step=97000, gradient_step=9700, len=160, n/ep=0, n/st=100, rew=8436.75]                                                                                 


Epoch #97: test_reward: 14402.500000 ± 5053.053537, best_reward: 20102.100000 ± 4945.992609 in #25


Epoch #98: 1001it [00:02, 420.81it/s, env_step=98000, gradient_step=9800, len=300, n/ep=0, n/st=100, rew=17634.50]                                                                                


Epoch #98: test_reward: 11528.500000 ± 4645.822107, best_reward: 20102.100000 ± 4945.992609 in #25


Epoch #99: 1001it [00:02, 339.90it/s, env_step=99000, gradient_step=9900, len=92, n/ep=2, n/st=100, rew=4743.25]                                                                                  


Epoch #99: test_reward: 14363.900000 ± 5623.350059, best_reward: 20102.100000 ± 4945.992609 in #25


Epoch #100: 1001it [00:02, 393.50it/s, env_step=100000, gradient_step=10000, len=136, n/ep=1, n/st=100, rew=5905.00]                                                                              


Epoch #100: test_reward: 14375.100000 ± 5385.086971, best_reward: 20102.100000 ± 4945.992609 in #25


Epoch #101: 1001it [00:02, 358.61it/s, env_step=101000, gradient_step=10100, len=307, n/ep=0, n/st=100, rew=20563.00]                                                                             


Epoch #101: test_reward: 9579.200000 ± 4298.956078, best_reward: 20102.100000 ± 4945.992609 in #25


Epoch #102: 1001it [00:02, 406.46it/s, env_step=102000, gradient_step=10200, len=164, n/ep=0, n/st=100, rew=9437.00]                                                                              


Epoch #102: test_reward: 10134.700000 ± 5343.890438, best_reward: 20102.100000 ± 4945.992609 in #25


Epoch #103: 1001it [00:02, 360.43it/s, env_step=103000, gradient_step=10300, len=187, n/ep=0, n/st=100, rew=10085.00]                                                                             


Epoch #103: test_reward: 11632.400000 ± 3487.451195, best_reward: 20102.100000 ± 4945.992609 in #25


Epoch #104: 1001it [00:02, 409.31it/s, env_step=104000, gradient_step=10400, len=140, n/ep=1, n/st=100, rew=6446.00]                                                                              


Epoch #104: test_reward: 12528.800000 ± 4674.944209, best_reward: 20102.100000 ± 4945.992609 in #25


Epoch #105: 1001it [00:02, 347.49it/s, env_step=105000, gradient_step=10500, len=212, n/ep=1, n/st=100, rew=12765.00]                                                                             


Epoch #105: test_reward: 13107.100000 ± 7846.758572, best_reward: 20102.100000 ± 4945.992609 in #25


Epoch #106: 1001it [00:02, 365.73it/s, env_step=106000, gradient_step=10600, len=84, n/ep=0, n/st=100, rew=4521.00]                                                                               


Epoch #106: test_reward: 11294.100000 ± 4203.458444, best_reward: 20102.100000 ± 4945.992609 in #25


Epoch #107: 1001it [00:02, 408.69it/s, env_step=107000, gradient_step=10700, len=205, n/ep=0, n/st=100, rew=11207.00]                                                                             


Epoch #107: test_reward: 12253.400000 ± 5772.250414, best_reward: 20102.100000 ± 4945.992609 in #25


Epoch #108: 1001it [00:02, 396.74it/s, env_step=108000, gradient_step=10800, len=110, n/ep=0, n/st=100, rew=5458.00]                                                                              


Epoch #108: test_reward: 15791.500000 ± 5057.989981, best_reward: 20102.100000 ± 4945.992609 in #25


Epoch #109: 1001it [00:02, 441.23it/s, env_step=109000, gradient_step=10900, len=214, n/ep=1, n/st=100, rew=12142.00]                                                                             


Epoch #109: test_reward: 9481.000000 ± 6514.012496, best_reward: 20102.100000 ± 4945.992609 in #25


Epoch #110: 1001it [00:03, 323.11it/s, env_step=110000, gradient_step=11000, len=134, n/ep=1, n/st=100, rew=7070.00]                                                                              


Epoch #110: test_reward: 9266.200000 ± 3818.045725, best_reward: 20102.100000 ± 4945.992609 in #25


Epoch #111: 1001it [00:03, 313.33it/s, env_step=111000, gradient_step=11100, len=38, n/ep=1, n/st=100, rew=1248.00]                                                                               


Epoch #111: test_reward: 14302.000000 ± 4210.724759, best_reward: 20102.100000 ± 4945.992609 in #25


Epoch #112: 1001it [00:02, 338.52it/s, env_step=112000, gradient_step=11200, len=169, n/ep=2, n/st=100, rew=8720.00]                                                                              


Epoch #112: test_reward: 11166.100000 ± 3621.986594, best_reward: 20102.100000 ± 4945.992609 in #25


Epoch #113: 1001it [00:02, 366.98it/s, env_step=113000, gradient_step=11300, len=161, n/ep=1, n/st=100, rew=9032.00]                                                                              


Epoch #113: test_reward: 11249.200000 ± 4089.242076, best_reward: 20102.100000 ± 4945.992609 in #25


Epoch #114: 1001it [00:02, 382.25it/s, env_step=114000, gradient_step=11400, len=273, n/ep=0, n/st=100, rew=16260.50]                                                                             


Epoch #114: test_reward: 10208.400000 ± 2925.512372, best_reward: 20102.100000 ± 4945.992609 in #25


Epoch #115: 1001it [00:02, 370.86it/s, env_step=115000, gradient_step=11500, len=320, n/ep=0, n/st=100, rew=19341.00]                                                                             


Epoch #115: test_reward: 14995.900000 ± 8455.166249, best_reward: 20102.100000 ± 4945.992609 in #25


Epoch #116: 1001it [00:02, 406.98it/s, env_step=116000, gradient_step=11600, len=209, n/ep=1, n/st=100, rew=11487.00]                                                                             


Epoch #116: test_reward: 11813.800000 ± 3980.480896, best_reward: 20102.100000 ± 4945.992609 in #25


Epoch #117: 1001it [00:02, 428.68it/s, env_step=117000, gradient_step=11700, len=71, n/ep=0, n/st=100, rew=3510.00]                                                                               


Epoch #117: test_reward: 11944.700000 ± 9111.697384, best_reward: 20102.100000 ± 4945.992609 in #25


Epoch #118: 1001it [00:03, 321.21it/s, env_step=118000, gradient_step=11800, len=137, n/ep=1, n/st=100, rew=8131.50]                                                                              


Epoch #118: test_reward: 12347.800000 ± 5403.199308, best_reward: 20102.100000 ± 4945.992609 in #25


Epoch #119: 1001it [00:02, 373.18it/s, env_step=119000, gradient_step=11900, len=158, n/ep=1, n/st=100, rew=8293.00]                                                                              


Epoch #119: test_reward: 11550.400000 ± 5224.085206, best_reward: 20102.100000 ± 4945.992609 in #25


Epoch #120: 1001it [00:03, 321.10it/s, env_step=120000, gradient_step=12000, len=129, n/ep=0, n/st=100, rew=7062.00]                                                                              


Epoch #120: test_reward: 14116.400000 ± 6775.360185, best_reward: 20102.100000 ± 4945.992609 in #25


Epoch #121: 1001it [00:02, 396.16it/s, env_step=121000, gradient_step=12100, len=89, n/ep=0, n/st=100, rew=3340.00]                                                                               


Epoch #121: test_reward: 10259.600000 ± 3875.723628, best_reward: 20102.100000 ± 4945.992609 in #25


Epoch #122: 1001it [00:02, 397.77it/s, env_step=122000, gradient_step=12200, len=244, n/ep=2, n/st=100, rew=14233.50]                                                                             


Epoch #122: test_reward: 14379.700000 ± 5427.564427, best_reward: 20102.100000 ± 4945.992609 in #25


Epoch #123: 1001it [00:02, 352.25it/s, env_step=123000, gradient_step=12300, len=85, n/ep=1, n/st=100, rew=3812.50]                                                                               


Epoch #123: test_reward: 15423.500000 ± 5520.138155, best_reward: 20102.100000 ± 4945.992609 in #25


Epoch #124: 1001it [00:02, 407.34it/s, env_step=124000, gradient_step=12400, len=113, n/ep=0, n/st=100, rew=5562.50]                                                                              


Epoch #124: test_reward: 12718.300000 ± 6045.829059, best_reward: 20102.100000 ± 4945.992609 in #25


Epoch #125: 1001it [00:02, 367.13it/s, env_step=125000, gradient_step=12500, len=154, n/ep=1, n/st=100, rew=8677.50]                                                                              


Epoch #125: test_reward: 11087.100000 ± 3203.770293, best_reward: 20102.100000 ± 4945.992609 in #25


Epoch #126: 1001it [00:02, 365.78it/s, env_step=126000, gradient_step=12600, len=177, n/ep=2, n/st=100, rew=9645.50]                                                                              


Epoch #126: test_reward: 17136.300000 ± 6048.129348, best_reward: 20102.100000 ± 4945.992609 in #25


Epoch #127: 1001it [00:02, 345.68it/s, env_step=127000, gradient_step=12700, len=205, n/ep=1, n/st=100, rew=12462.50]                                                                             


Epoch #127: test_reward: 12888.100000 ± 6407.210492, best_reward: 20102.100000 ± 4945.992609 in #25


Epoch #128: 1001it [00:03, 315.15it/s, env_step=128000, gradient_step=12800, len=217, n/ep=1, n/st=100, rew=12309.00]                                                                             


Epoch #128: test_reward: 13734.700000 ± 4656.099313, best_reward: 20102.100000 ± 4945.992609 in #25


Epoch #129: 1001it [00:02, 335.06it/s, env_step=129000, gradient_step=12900, len=54, n/ep=1, n/st=100, rew=2267.00]                                                                               


Epoch #129: test_reward: 10531.500000 ± 4200.099624, best_reward: 20102.100000 ± 4945.992609 in #25


Epoch #130: 1001it [00:02, 340.66it/s, env_step=130000, gradient_step=13000, len=400, n/ep=0, n/st=100, rew=23076.00]                                                                             


Epoch #130: test_reward: 14912.000000 ± 4829.889523, best_reward: 20102.100000 ± 4945.992609 in #25


Epoch #131: 1001it [00:02, 337.07it/s, env_step=131000, gradient_step=13100, len=169, n/ep=0, n/st=100, rew=10471.50]                                                                             


Epoch #131: test_reward: 10827.100000 ± 3754.391334, best_reward: 20102.100000 ± 4945.992609 in #25


Epoch #132: 1001it [00:02, 391.33it/s, env_step=132000, gradient_step=13200, len=178, n/ep=1, n/st=100, rew=9460.00]                                                                              


Epoch #132: test_reward: 14658.400000 ± 7050.395212, best_reward: 20102.100000 ± 4945.992609 in #25


Epoch #133: 1001it [00:02, 345.68it/s, env_step=133000, gradient_step=13300, len=224, n/ep=0, n/st=100, rew=14279.00]                                                                             


Epoch #133: test_reward: 10146.800000 ± 2993.966092, best_reward: 20102.100000 ± 4945.992609 in #25


Epoch #134: 1001it [00:02, 344.64it/s, env_step=134000, gradient_step=13400, len=213, n/ep=1, n/st=100, rew=13104.50]                                                                             


Epoch #134: test_reward: 9101.100000 ± 3817.460319, best_reward: 20102.100000 ± 4945.992609 in #25


Epoch #135: 1001it [00:02, 381.35it/s, env_step=135000, gradient_step=13500, len=106, n/ep=0, n/st=100, rew=4869.00]                                                                              


Epoch #135: test_reward: 9851.300000 ± 7375.362744, best_reward: 20102.100000 ± 4945.992609 in #25


Epoch #136: 1001it [00:02, 390.46it/s, env_step=136000, gradient_step=13600, len=230, n/ep=0, n/st=100, rew=14370.00]                                                                             


Epoch #136: test_reward: 13711.200000 ± 4664.013396, best_reward: 20102.100000 ± 4945.992609 in #25


Epoch #137: 1001it [00:02, 351.11it/s, env_step=137000, gradient_step=13700, len=116, n/ep=0, n/st=100, rew=6018.00]                                                                              


Epoch #137: test_reward: 10411.800000 ± 2731.484644, best_reward: 20102.100000 ± 4945.992609 in #25


Epoch #138: 1001it [00:02, 372.29it/s, env_step=138000, gradient_step=13800, len=181, n/ep=1, n/st=100, rew=11333.50]                                                                             


Epoch #138: test_reward: 6840.600000 ± 4157.631229, best_reward: 20102.100000 ± 4945.992609 in #25


Epoch #139: 1001it [00:02, 355.67it/s, env_step=139000, gradient_step=13900, len=173, n/ep=2, n/st=100, rew=9146.00]                                                                              


Epoch #139: test_reward: 11702.300000 ± 3533.898529, best_reward: 20102.100000 ± 4945.992609 in #25


Epoch #140: 1001it [00:02, 357.82it/s, env_step=140000, gradient_step=14000, len=226, n/ep=0, n/st=100, rew=13095.00]                                                                             


Epoch #140: test_reward: 10745.500000 ± 4083.452883, best_reward: 20102.100000 ± 4945.992609 in #25


Epoch #141: 1001it [00:02, 379.71it/s, env_step=141000, gradient_step=14100, len=183, n/ep=4, n/st=100, rew=10858.75]                                                                             


Epoch #141: test_reward: 6409.500000 ± 2280.072729, best_reward: 20102.100000 ± 4945.992609 in #25


Epoch #142: 1001it [00:02, 350.56it/s, env_step=142000, gradient_step=14200, len=183, n/ep=2, n/st=100, rew=10919.75]                                                                             


Epoch #142: test_reward: 14576.800000 ± 5664.218513, best_reward: 20102.100000 ± 4945.992609 in #25


Epoch #143: 1001it [00:02, 362.02it/s, env_step=143000, gradient_step=14300, len=199, n/ep=0, n/st=100, rew=11818.00]                                                                             


Epoch #143: test_reward: 12375.400000 ± 3803.654248, best_reward: 20102.100000 ± 4945.992609 in #25


Epoch #144: 1001it [00:02, 394.12it/s, env_step=144000, gradient_step=14400, len=122, n/ep=0, n/st=100, rew=5286.00]                                                                              


Epoch #144: test_reward: 8722.500000 ± 2404.595735, best_reward: 20102.100000 ± 4945.992609 in #25


Epoch #145: 1001it [00:02, 351.12it/s, env_step=145000, gradient_step=14500, len=166, n/ep=0, n/st=100, rew=9621.50]                                                                              


Epoch #145: test_reward: 14670.500000 ± 6228.531308, best_reward: 20102.100000 ± 4945.992609 in #25


Epoch #146: 1001it [00:02, 348.01it/s, env_step=146000, gradient_step=14600, len=162, n/ep=0, n/st=100, rew=9237.67]                                                                              


Epoch #146: test_reward: 13701.200000 ± 5919.706408, best_reward: 20102.100000 ± 4945.992609 in #25


Epoch #147: 1001it [00:03, 316.45it/s, env_step=147000, gradient_step=14700, len=208, n/ep=0, n/st=100, rew=12968.00]                                                                             


Epoch #147: test_reward: 10604.000000 ± 2910.850872, best_reward: 20102.100000 ± 4945.992609 in #25


Epoch #148: 1001it [00:02, 399.70it/s, env_step=148000, gradient_step=14800, len=108, n/ep=1, n/st=100, rew=5990.00]                                                                              


Epoch #148: test_reward: 12575.800000 ± 6084.131899, best_reward: 20102.100000 ± 4945.992609 in #25


Epoch #149: 1001it [00:02, 394.57it/s, env_step=149000, gradient_step=14900, len=164, n/ep=1, n/st=100, rew=9135.00]                                                                              


Epoch #149: test_reward: 8721.800000 ± 3143.723964, best_reward: 20102.100000 ± 4945.992609 in #25


Epoch #150: 1001it [00:02, 339.61it/s, env_step=150000, gradient_step=15000, len=111, n/ep=1, n/st=100, rew=5431.00]                                                                              


Epoch #150: test_reward: 13934.700000 ± 6714.987089, best_reward: 20102.100000 ± 4945.992609 in #25

InfoStats(gradient_step=15000, best_reward=20102.1, best_reward_std=4945.992609173613, train_step=150000, train_episode=748, test_step=345884, test_episode=1510, timing=TimingStats(total_time=592.3921966552734, train_time=397.06606912612915, train_time_collect=51.218117237091064, train_time_update=339.8107590675354, test_time=195.3261275291443, update_speed=377.7708841506477))

(the trained policy can be accessed via policy.policies[agents[1]])
Train with agent_opponent #11


Epoch #1: 1001it [00:02, 450.87it/s, env_step=1000, gradient_step=100, len=0, n/ep=0, n/st=100, rew=0.00]                                                                                         


Epoch #1: test_reward: 4871.400000 ± 2338.522619, best_reward: 13251.400000 ± 4491.584914 in #0


Epoch #2: 1001it [00:02, 399.63it/s, env_step=2000, gradient_step=200, len=0, n/ep=0, n/st=100, rew=0.00]                                                                                         


Epoch #2: test_reward: 13378.700000 ± 3639.753647, best_reward: 13378.700000 ± 3639.753647 in #2


Epoch #3: 1001it [00:02, 419.35it/s, env_step=3000, gradient_step=300, len=0, n/ep=0, n/st=100, rew=0.00]                                                                                         


Epoch #3: test_reward: 13521.200000 ± 4586.609004, best_reward: 13521.200000 ± 4586.609004 in #3


Epoch #4: 1001it [00:02, 432.54it/s, env_step=4000, gradient_step=400, len=0, n/ep=0, n/st=100, rew=0.00]                                                                                         


Epoch #4: test_reward: 12973.700000 ± 5732.776030, best_reward: 13521.200000 ± 4586.609004 in #3


Epoch #5: 1001it [00:02, 428.25it/s, env_step=5000, gradient_step=500, len=0, n/ep=0, n/st=100, rew=0.00]                                                                                         


Epoch #5: test_reward: 15593.700000 ± 4602.340167, best_reward: 15593.700000 ± 4602.340167 in #5


Epoch #6: 1001it [00:02, 442.24it/s, env_step=6000, gradient_step=600, len=0, n/ep=0, n/st=100, rew=0.00]                                                                                         


Epoch #6: test_reward: 13406.100000 ± 3652.907129, best_reward: 15593.700000 ± 4602.340167 in #5


Epoch #7: 1001it [00:02, 448.44it/s, env_step=7000, gradient_step=700, len=0, n/ep=0, n/st=100, rew=0.00]                                                                                         


Epoch #7: test_reward: 9524.000000 ± 4007.640129, best_reward: 15593.700000 ± 4602.340167 in #5


Epoch #8: 1001it [00:02, 415.95it/s, env_step=8000, gradient_step=800, len=0, n/ep=0, n/st=100, rew=0.00]                                                                                         


Epoch #8: test_reward: 12800.000000 ± 5178.110312, best_reward: 15593.700000 ± 4602.340167 in #5


Epoch #9: 1001it [00:02, 423.96it/s, env_step=9000, gradient_step=900, len=0, n/ep=0, n/st=100, rew=0.00]                                                                                         


Epoch #9: test_reward: 11832.400000 ± 4687.030855, best_reward: 15593.700000 ± 4602.340167 in #5


Epoch #10: 1001it [00:02, 374.60it/s, env_step=10000, gradient_step=1000, len=0, n/ep=0, n/st=100, rew=0.00]                                                                                      


Epoch #10: test_reward: 12472.600000 ± 3089.628172, best_reward: 15593.700000 ± 4602.340167 in #5


Epoch #11: 1001it [00:02, 428.29it/s, env_step=11000, gradient_step=1100, len=110, n/ep=1, n/st=100, rew=4337.00]                                                                                 


Epoch #11: test_reward: 10380.000000 ± 2783.523091, best_reward: 15593.700000 ± 4602.340167 in #5


Epoch #12: 1001it [00:02, 382.43it/s, env_step=12000, gradient_step=1200, len=110, n/ep=0, n/st=100, rew=4337.00]                                                                                 


Epoch #12: test_reward: 12916.000000 ± 3201.121304, best_reward: 15593.700000 ± 4602.340167 in #5


Epoch #13: 1001it [00:02, 422.62it/s, env_step=13000, gradient_step=1300, len=128, n/ep=0, n/st=100, rew=5046.50]                                                                                 


Epoch #13: test_reward: 10401.000000 ± 2571.759009, best_reward: 15593.700000 ± 4602.340167 in #5


Epoch #14: 1001it [00:02, 366.88it/s, env_step=14000, gradient_step=1400, len=140, n/ep=1, n/st=100, rew=6444.00]                                                                                 


Epoch #14: test_reward: 11891.200000 ± 5415.413739, best_reward: 15593.700000 ± 4602.340167 in #5


Epoch #15: 1001it [00:02, 344.51it/s, env_step=15000, gradient_step=1500, len=150, n/ep=2, n/st=100, rew=7266.50]                                                                                 


Epoch #15: test_reward: 12797.100000 ± 3935.099756, best_reward: 15593.700000 ± 4602.340167 in #5


Epoch #16: 1001it [00:02, 334.07it/s, env_step=16000, gradient_step=1600, len=152, n/ep=0, n/st=100, rew=6925.00]                                                                                 


Epoch #16: test_reward: 12499.900000 ± 4095.971544, best_reward: 15593.700000 ± 4602.340167 in #5


Epoch #17: 1001it [00:02, 395.94it/s, env_step=17000, gradient_step=1700, len=169, n/ep=0, n/st=100, rew=7433.00]                                                                                 


Epoch #17: test_reward: 12696.200000 ± 4722.917526, best_reward: 15593.700000 ± 4602.340167 in #5


Epoch #18: 1001it [00:02, 378.21it/s, env_step=18000, gradient_step=1800, len=180, n/ep=2, n/st=100, rew=8924.50]                                                                                 


Epoch #18: test_reward: 15766.400000 ± 6516.688226, best_reward: 15766.400000 ± 6516.688226 in #18


Epoch #19: 1001it [00:02, 365.81it/s, env_step=19000, gradient_step=1900, len=189, n/ep=0, n/st=100, rew=10457.00]                                                                                


Epoch #19: test_reward: 13210.100000 ± 6282.097268, best_reward: 15766.400000 ± 6516.688226 in #18


Epoch #20: 1001it [00:02, 356.96it/s, env_step=20000, gradient_step=2000, len=198, n/ep=0, n/st=100, rew=9860.50]                                                                                 


Epoch #20: test_reward: 11410.700000 ± 5100.769668, best_reward: 15766.400000 ± 6516.688226 in #18


Epoch #21: 1001it [00:02, 409.96it/s, env_step=21000, gradient_step=2100, len=208, n/ep=0, n/st=100, rew=10788.75]                                                                                


Epoch #21: test_reward: 11841.800000 ± 3698.877365, best_reward: 15766.400000 ± 6516.688226 in #18


Epoch #22: 1001it [00:02, 368.40it/s, env_step=22000, gradient_step=2200, len=82, n/ep=0, n/st=100, rew=3067.00]                                                                                  


Epoch #22: test_reward: 10484.700000 ± 2714.854105, best_reward: 15766.400000 ± 6516.688226 in #18


Epoch #23: 1001it [00:02, 385.66it/s, env_step=23000, gradient_step=2300, len=228, n/ep=0, n/st=100, rew=11733.00]                                                                                


Epoch #23: test_reward: 10212.400000 ± 4842.576385, best_reward: 15766.400000 ± 6516.688226 in #18


Epoch #24: 1001it [00:02, 400.28it/s, env_step=24000, gradient_step=2400, len=228, n/ep=0, n/st=100, rew=11733.00]                                                                                


Epoch #24: test_reward: 10565.000000 ± 3895.973640, best_reward: 15766.400000 ± 6516.688226 in #18


Epoch #25: 1001it [00:02, 386.37it/s, env_step=25000, gradient_step=2500, len=249, n/ep=0, n/st=100, rew=14482.00]                                                                                


Epoch #25: test_reward: 9608.300000 ± 3012.944442, best_reward: 15766.400000 ± 6516.688226 in #18


Epoch #26: 1001it [00:02, 363.28it/s, env_step=26000, gradient_step=2600, len=257, n/ep=0, n/st=100, rew=14268.50]                                                                                


Epoch #26: test_reward: 9531.600000 ± 4006.685793, best_reward: 15766.400000 ± 6516.688226 in #18


Epoch #27: 1001it [00:02, 411.97it/s, env_step=27000, gradient_step=2700, len=269, n/ep=0, n/st=100, rew=15118.00]                                                                                


Epoch #27: test_reward: 11249.100000 ± 3248.829157, best_reward: 15766.400000 ± 6516.688226 in #18


Epoch #28: 1001it [00:03, 327.90it/s, env_step=28000, gradient_step=2800, len=175, n/ep=0, n/st=100, rew=10156.50]                                                                                


Epoch #28: test_reward: 9915.000000 ± 1790.025363, best_reward: 15766.400000 ± 6516.688226 in #18


Epoch #29: 1001it [00:03, 324.18it/s, env_step=29000, gradient_step=2900, len=116, n/ep=0, n/st=100, rew=6376.50]                                                                                 


Epoch #29: test_reward: 9236.300000 ± 2429.145778, best_reward: 15766.400000 ± 6516.688226 in #18


Epoch #30: 1001it [00:03, 273.79it/s, env_step=30000, gradient_step=3000, len=232, n/ep=2, n/st=100, rew=12787.00]                                                                                


Epoch #30: test_reward: 12380.000000 ± 5142.869938, best_reward: 15766.400000 ± 6516.688226 in #18


Epoch #31: 1001it [00:03, 272.32it/s, env_step=31000, gradient_step=3100, len=259, n/ep=2, n/st=100, rew=13922.75]                                                                                


Epoch #31: test_reward: 10361.400000 ± 3879.047491, best_reward: 15766.400000 ± 6516.688226 in #18


Epoch #32: 1001it [00:03, 306.32it/s, env_step=32000, gradient_step=3200, len=52, n/ep=1, n/st=100, rew=1838.00]                                                                                  


Epoch #32: test_reward: 12313.400000 ± 4408.247071, best_reward: 15766.400000 ± 6516.688226 in #18


Epoch #33: 1001it [00:02, 339.14it/s, env_step=33000, gradient_step=3300, len=221, n/ep=0, n/st=100, rew=13163.00]                                                                                


Epoch #33: test_reward: 11365.500000 ± 5602.353296, best_reward: 15766.400000 ± 6516.688226 in #18


Epoch #34: 1001it [00:02, 351.50it/s, env_step=34000, gradient_step=3400, len=164, n/ep=0, n/st=100, rew=9329.00]                                                                                 


Epoch #34: test_reward: 8587.700000 ± 3167.161380, best_reward: 15766.400000 ± 6516.688226 in #18


Epoch #35: 1001it [00:02, 400.13it/s, env_step=35000, gradient_step=3500, len=153, n/ep=1, n/st=100, rew=7778.00]                                                                                 


Epoch #35: test_reward: 11340.000000 ± 3992.048797, best_reward: 15766.400000 ± 6516.688226 in #18


Epoch #36: 1001it [00:02, 344.01it/s, env_step=36000, gradient_step=3600, len=234, n/ep=1, n/st=100, rew=14574.00]                                                                                


Epoch #36: test_reward: 15806.000000 ± 3121.569990, best_reward: 15806.000000 ± 3121.569990 in #36


Epoch #37: 1001it [00:03, 321.05it/s, env_step=37000, gradient_step=3700, len=185, n/ep=0, n/st=100, rew=11422.50]                                                                                


Epoch #37: test_reward: 12366.000000 ± 4838.796131, best_reward: 15806.000000 ± 3121.569990 in #36


Epoch #38: 1001it [00:02, 373.62it/s, env_step=38000, gradient_step=3800, len=248, n/ep=1, n/st=100, rew=13566.00]                                                                                


Epoch #38: test_reward: 12654.200000 ± 2251.108562, best_reward: 15806.000000 ± 3121.569990 in #36


Epoch #39: 1001it [00:02, 407.78it/s, env_step=39000, gradient_step=3900, len=205, n/ep=0, n/st=100, rew=12227.00]                                                                                


Epoch #39: test_reward: 9889.500000 ± 3164.260838, best_reward: 15806.000000 ± 3121.569990 in #36


Epoch #40: 1001it [00:02, 351.76it/s, env_step=40000, gradient_step=4000, len=400, n/ep=1, n/st=100, rew=24329.00]                                                                                


Epoch #40: test_reward: 12188.500000 ± 3322.691417, best_reward: 15806.000000 ± 3121.569990 in #36


Epoch #41: 1001it [00:02, 411.98it/s, env_step=41000, gradient_step=4100, len=165, n/ep=0, n/st=100, rew=8588.00]                                                                                 


Epoch #41: test_reward: 13233.700000 ± 5734.338184, best_reward: 15806.000000 ± 3121.569990 in #36


Epoch #42: 1001it [00:02, 407.74it/s, env_step=42000, gradient_step=4200, len=140, n/ep=0, n/st=100, rew=8175.00]                                                                                 


Epoch #42: test_reward: 10254.000000 ± 5078.787887, best_reward: 15806.000000 ± 3121.569990 in #36


Epoch #43: 1001it [00:02, 383.70it/s, env_step=43000, gradient_step=4300, len=213, n/ep=0, n/st=100, rew=12032.50]                                                                                


Epoch #43: test_reward: 12232.300000 ± 5200.275263, best_reward: 15806.000000 ± 3121.569990 in #36


Epoch #44: 1001it [00:02, 361.71it/s, env_step=44000, gradient_step=4400, len=96, n/ep=0, n/st=100, rew=4742.00]                                                                                  


Epoch #44: test_reward: 13254.700000 ± 5024.265878, best_reward: 15806.000000 ± 3121.569990 in #36


Epoch #45: 1001it [00:02, 415.16it/s, env_step=45000, gradient_step=4500, len=188, n/ep=2, n/st=100, rew=11226.75]                                                                                


Epoch #45: test_reward: 14776.400000 ± 5034.407437, best_reward: 15806.000000 ± 3121.569990 in #36


Epoch #46: 1001it [00:02, 409.02it/s, env_step=46000, gradient_step=4600, len=155, n/ep=0, n/st=100, rew=9273.50]                                                                                 


Epoch #46: test_reward: 8898.400000 ± 4955.024182, best_reward: 15806.000000 ± 3121.569990 in #36


Epoch #47: 1001it [00:02, 351.74it/s, env_step=47000, gradient_step=4700, len=119, n/ep=0, n/st=100, rew=5616.00]                                                                                 


Epoch #47: test_reward: 12189.300000 ± 5681.166782, best_reward: 15806.000000 ± 3121.569990 in #36


Epoch #48: 1001it [00:02, 377.72it/s, env_step=48000, gradient_step=4800, len=81, n/ep=1, n/st=100, rew=3539.50]                                                                                  


Epoch #48: test_reward: 13927.200000 ± 4365.718837, best_reward: 15806.000000 ± 3121.569990 in #36


Epoch #49: 1001it [00:03, 319.04it/s, env_step=49000, gradient_step=4900, len=202, n/ep=0, n/st=100, rew=11708.50]                                                                                


Epoch #49: test_reward: 13592.300000 ± 5762.855647, best_reward: 15806.000000 ± 3121.569990 in #36


Epoch #50: 1001it [00:02, 337.14it/s, env_step=50000, gradient_step=5000, len=231, n/ep=0, n/st=100, rew=14002.00]                                                                                


Epoch #50: test_reward: 12527.000000 ± 4512.624890, best_reward: 15806.000000 ± 3121.569990 in #36


Epoch #51: 1001it [00:02, 399.07it/s, env_step=51000, gradient_step=5100, len=206, n/ep=0, n/st=100, rew=11426.00]                                                                                


Epoch #51: test_reward: 10785.800000 ± 4092.442420, best_reward: 15806.000000 ± 3121.569990 in #36


Epoch #52: 1001it [00:02, 368.47it/s, env_step=52000, gradient_step=5200, len=144, n/ep=1, n/st=100, rew=7564.00]                                                                                 


Epoch #52: test_reward: 11678.700000 ± 4234.372233, best_reward: 15806.000000 ± 3121.569990 in #36


Epoch #53: 1001it [00:02, 370.33it/s, env_step=53000, gradient_step=5300, len=240, n/ep=0, n/st=100, rew=13338.50]                                                                                


Epoch #53: test_reward: 10329.700000 ± 4747.450980, best_reward: 15806.000000 ± 3121.569990 in #36


Epoch #54: 1001it [00:02, 382.04it/s, env_step=54000, gradient_step=5400, len=230, n/ep=3, n/st=100, rew=12391.33]                                                                                


Epoch #54: test_reward: 9583.600000 ± 4055.753918, best_reward: 15806.000000 ± 3121.569990 in #36


Epoch #55: 1001it [00:02, 395.60it/s, env_step=55000, gradient_step=5500, len=193, n/ep=1, n/st=100, rew=11542.00]                                                                                


Epoch #55: test_reward: 11326.600000 ± 4646.237880, best_reward: 15806.000000 ± 3121.569990 in #36


Epoch #56: 1001it [00:02, 411.91it/s, env_step=56000, gradient_step=5600, len=142, n/ep=0, n/st=100, rew=7636.00]                                                                                 


Epoch #56: test_reward: 11242.000000 ± 3066.276700, best_reward: 15806.000000 ± 3121.569990 in #36


Epoch #57: 1001it [00:02, 335.71it/s, env_step=57000, gradient_step=5700, len=169, n/ep=1, n/st=100, rew=9871.50]                                                                                 


Epoch #57: test_reward: 9630.800000 ± 5086.173666, best_reward: 15806.000000 ± 3121.569990 in #36


Epoch #58: 1001it [00:02, 377.07it/s, env_step=58000, gradient_step=5800, len=296, n/ep=0, n/st=100, rew=18345.50]                                                                                


Epoch #58: test_reward: 10750.900000 ± 3933.238474, best_reward: 15806.000000 ± 3121.569990 in #36


Epoch #59: 1001it [00:02, 413.96it/s, env_step=59000, gradient_step=5900, len=166, n/ep=0, n/st=100, rew=8038.50]                                                                                 


Epoch #59: test_reward: 11893.500000 ± 3781.745953, best_reward: 15806.000000 ± 3121.569990 in #36


Epoch #60: 1001it [00:02, 376.18it/s, env_step=60000, gradient_step=6000, len=383, n/ep=0, n/st=100, rew=22706.00]                                                                                


Epoch #60: test_reward: 11600.700000 ± 3728.059658, best_reward: 15806.000000 ± 3121.569990 in #36


Epoch #61: 1001it [00:02, 411.40it/s, env_step=61000, gradient_step=6100, len=160, n/ep=1, n/st=100, rew=9697.00]                                                                                 


Epoch #61: test_reward: 8387.000000 ± 4105.802187, best_reward: 15806.000000 ± 3121.569990 in #36


Epoch #62: 1001it [00:02, 398.25it/s, env_step=62000, gradient_step=6200, len=138, n/ep=0, n/st=100, rew=6976.00]                                                                                 


Epoch #62: test_reward: 11687.000000 ± 6054.267123, best_reward: 15806.000000 ± 3121.569990 in #36


Epoch #63: 1001it [00:02, 368.53it/s, env_step=63000, gradient_step=6300, len=149, n/ep=4, n/st=100, rew=8077.50]                                                                                 


Epoch #63: test_reward: 18043.000000 ± 5201.862589, best_reward: 18043.000000 ± 5201.862589 in #63


Epoch #64: 1001it [00:02, 368.48it/s, env_step=64000, gradient_step=6400, len=253, n/ep=0, n/st=100, rew=15326.00]                                                                                


Epoch #64: test_reward: 10465.700000 ± 5250.606537, best_reward: 18043.000000 ± 5201.862589 in #63


Epoch #65: 1001it [00:02, 393.00it/s, env_step=65000, gradient_step=6500, len=231, n/ep=0, n/st=100, rew=11498.00]                                                                                


Epoch #65: test_reward: 14109.100000 ± 3743.148526, best_reward: 18043.000000 ± 5201.862589 in #63


Epoch #66: 1001it [00:02, 382.87it/s, env_step=66000, gradient_step=6600, len=183, n/ep=0, n/st=100, rew=9972.00]                                                                                 


Epoch #66: test_reward: 9501.700000 ± 4265.980873, best_reward: 18043.000000 ± 5201.862589 in #63


Epoch #67: 1001it [00:02, 409.45it/s, env_step=67000, gradient_step=6700, len=273, n/ep=0, n/st=100, rew=15775.00]                                                                                


Epoch #67: test_reward: 8293.600000 ± 2417.058179, best_reward: 18043.000000 ± 5201.862589 in #63


Epoch #68: 1001it [00:02, 342.05it/s, env_step=68000, gradient_step=6800, len=183, n/ep=2, n/st=100, rew=9509.50]                                                                                 


Epoch #68: test_reward: 14801.700000 ± 4177.711552, best_reward: 18043.000000 ± 5201.862589 in #63


Epoch #69: 1001it [00:02, 365.85it/s, env_step=69000, gradient_step=6900, len=175, n/ep=0, n/st=100, rew=8790.50]                                                                                 


Epoch #69: test_reward: 8255.000000 ± 4586.073833, best_reward: 18043.000000 ± 5201.862589 in #63


Epoch #70: 1001it [00:02, 361.71it/s, env_step=70000, gradient_step=7000, len=192, n/ep=1, n/st=100, rew=10024.50]                                                                                


Epoch #70: test_reward: 9028.600000 ± 2151.803300, best_reward: 18043.000000 ± 5201.862589 in #63


Epoch #71: 1001it [00:02, 383.27it/s, env_step=71000, gradient_step=7100, len=199, n/ep=1, n/st=100, rew=11743.50]                                                                                


Epoch #71: test_reward: 12194.600000 ± 4650.302038, best_reward: 18043.000000 ± 5201.862589 in #63


Epoch #72: 1001it [00:02, 404.70it/s, env_step=72000, gradient_step=7200, len=154, n/ep=0, n/st=100, rew=7831.00]                                                                                 


Epoch #72: test_reward: 8890.100000 ± 2720.397267, best_reward: 18043.000000 ± 5201.862589 in #63


Epoch #73: 1001it [00:03, 330.35it/s, env_step=73000, gradient_step=7300, len=253, n/ep=1, n/st=100, rew=15112.00]                                                                                


Epoch #73: test_reward: 12105.800000 ± 2940.158968, best_reward: 18043.000000 ± 5201.862589 in #63


Epoch #74: 1001it [00:02, 337.39it/s, env_step=74000, gradient_step=7400, len=94, n/ep=0, n/st=100, rew=4638.00]                                                                                  


Epoch #74: test_reward: 10957.300000 ± 4588.799888, best_reward: 18043.000000 ± 5201.862589 in #63


Epoch #75: 1001it [00:02, 410.43it/s, env_step=75000, gradient_step=7500, len=120, n/ep=1, n/st=100, rew=5875.00]                                                                                 


Epoch #75: test_reward: 13034.200000 ± 7995.584166, best_reward: 18043.000000 ± 5201.862589 in #63


Epoch #76: 1001it [00:02, 361.19it/s, env_step=76000, gradient_step=7600, len=187, n/ep=2, n/st=100, rew=11095.75]                                                                                


Epoch #76: test_reward: 7569.400000 ± 6797.022586, best_reward: 18043.000000 ± 5201.862589 in #63


Epoch #77: 1001it [00:02, 383.35it/s, env_step=77000, gradient_step=7700, len=149, n/ep=2, n/st=100, rew=7617.75]                                                                                 


Epoch #77: test_reward: 14105.600000 ± 6617.314972, best_reward: 18043.000000 ± 5201.862589 in #63


Epoch #78: 1001it [00:02, 347.53it/s, env_step=78000, gradient_step=7800, len=166, n/ep=1, n/st=100, rew=9941.50]                                                                                 


Epoch #78: test_reward: 12558.200000 ± 3927.175290, best_reward: 18043.000000 ± 5201.862589 in #63


Epoch #79: 1001it [00:02, 371.19it/s, env_step=79000, gradient_step=7900, len=146, n/ep=1, n/st=100, rew=7943.00]                                                                                 


Epoch #79: test_reward: 11236.600000 ± 6524.025539, best_reward: 18043.000000 ± 5201.862589 in #63


Epoch #80: 1001it [00:02, 406.67it/s, env_step=80000, gradient_step=8000, len=400, n/ep=0, n/st=100, rew=25343.00]                                                                                


Epoch #80: test_reward: 10248.200000 ± 5436.261543, best_reward: 18043.000000 ± 5201.862589 in #63


Epoch #81: 1001it [00:02, 387.74it/s, env_step=81000, gradient_step=8100, len=34, n/ep=1, n/st=100, rew=754.00]                                                                                   


Epoch #81: test_reward: 13026.200000 ± 6365.225633, best_reward: 18043.000000 ± 5201.862589 in #63


Epoch #82: 1001it [00:03, 329.54it/s, env_step=82000, gradient_step=8200, len=98, n/ep=0, n/st=100, rew=4546.50]                                                                                  


Epoch #82: test_reward: 10330.800000 ± 3605.947665, best_reward: 18043.000000 ± 5201.862589 in #63


Epoch #83: 1001it [00:02, 346.68it/s, env_step=83000, gradient_step=8300, len=173, n/ep=0, n/st=100, rew=9967.50]                                                                                 


Epoch #83: test_reward: 10436.100000 ± 3532.940148, best_reward: 18043.000000 ± 5201.862589 in #63


Epoch #84: 1001it [00:02, 366.82it/s, env_step=84000, gradient_step=8400, len=145, n/ep=0, n/st=100, rew=7810.50]                                                                                 


Epoch #84: test_reward: 11017.300000 ± 3932.231607, best_reward: 18043.000000 ± 5201.862589 in #63


Epoch #85: 1001it [00:02, 346.80it/s, env_step=85000, gradient_step=8500, len=207, n/ep=2, n/st=100, rew=11540.50]                                                                                


Epoch #85: test_reward: 10582.700000 ± 3506.572145, best_reward: 18043.000000 ± 5201.862589 in #63


Epoch #86: 1001it [00:02, 339.92it/s, env_step=86000, gradient_step=8600, len=121, n/ep=0, n/st=100, rew=6952.50]                                                                                 


Epoch #86: test_reward: 14596.900000 ± 2950.441948, best_reward: 18043.000000 ± 5201.862589 in #63


Epoch #87: 1001it [00:02, 365.01it/s, env_step=87000, gradient_step=8700, len=58, n/ep=1, n/st=100, rew=2747.00]                                                                                  


Epoch #87: test_reward: 9511.300000 ± 3416.078191, best_reward: 18043.000000 ± 5201.862589 in #63


Epoch #88: 1001it [00:02, 364.62it/s, env_step=88000, gradient_step=8800, len=144, n/ep=2, n/st=100, rew=7561.75]                                                                                 


Epoch #88: test_reward: 10165.800000 ± 5654.043664, best_reward: 18043.000000 ± 5201.862589 in #63


Epoch #89: 1001it [00:02, 349.10it/s, env_step=89000, gradient_step=8900, len=147, n/ep=0, n/st=100, rew=8236.00]                                                                                 


Epoch #89: test_reward: 8594.800000 ± 2951.222689, best_reward: 18043.000000 ± 5201.862589 in #63


Epoch #90: 1001it [00:02, 354.93it/s, env_step=90000, gradient_step=9000, len=198, n/ep=0, n/st=100, rew=11355.00]                                                                                


Epoch #90: test_reward: 13790.200000 ± 4514.720408, best_reward: 18043.000000 ± 5201.862589 in #63


Epoch #91: 1001it [00:02, 368.46it/s, env_step=91000, gradient_step=9100, len=157, n/ep=0, n/st=100, rew=8530.75]                                                                                 


Epoch #91: test_reward: 15594.300000 ± 4244.124080, best_reward: 18043.000000 ± 5201.862589 in #63


Epoch #92: 1001it [00:02, 373.37it/s, env_step=92000, gradient_step=9200, len=80, n/ep=0, n/st=100, rew=3990.00]                                                                                  


Epoch #92: test_reward: 10580.400000 ± 4155.688780, best_reward: 18043.000000 ± 5201.862589 in #63


Epoch #93: 1001it [00:02, 362.01it/s, env_step=93000, gradient_step=9300, len=184, n/ep=1, n/st=100, rew=9607.00]                                                                                 


Epoch #93: test_reward: 10894.400000 ± 7299.986236, best_reward: 18043.000000 ± 5201.862589 in #63


Epoch #94: 1001it [00:02, 426.20it/s, env_step=94000, gradient_step=9400, len=152, n/ep=2, n/st=100, rew=8342.75]                                                                                 


Epoch #94: test_reward: 10644.000000 ± 3985.680770, best_reward: 18043.000000 ± 5201.862589 in #63


Epoch #95: 1001it [00:03, 332.86it/s, env_step=95000, gradient_step=9500, len=256, n/ep=0, n/st=100, rew=14636.00]                                                                                


Epoch #95: test_reward: 10033.200000 ± 2612.343500, best_reward: 18043.000000 ± 5201.862589 in #63


Epoch #96: 1001it [00:02, 384.97it/s, env_step=96000, gradient_step=9600, len=80, n/ep=1, n/st=100, rew=3947.00]                                                                                  


Epoch #96: test_reward: 9866.100000 ± 4882.057178, best_reward: 18043.000000 ± 5201.862589 in #63


Epoch #97: 1001it [00:02, 380.59it/s, env_step=97000, gradient_step=9700, len=105, n/ep=0, n/st=100, rew=6093.00]                                                                                 


Epoch #97: test_reward: 6756.700000 ± 3032.916881, best_reward: 18043.000000 ± 5201.862589 in #63


Epoch #98: 1001it [00:02, 380.26it/s, env_step=98000, gradient_step=9800, len=162, n/ep=0, n/st=100, rew=9035.25]                                                                                 


Epoch #98: test_reward: 10282.000000 ± 5623.785398, best_reward: 18043.000000 ± 5201.862589 in #63


Epoch #99: 1001it [00:02, 363.79it/s, env_step=99000, gradient_step=9900, len=196, n/ep=1, n/st=100, rew=10820.00]                                                                                


Epoch #99: test_reward: 13688.700000 ± 6171.188833, best_reward: 18043.000000 ± 5201.862589 in #63


Epoch #100: 1001it [00:03, 330.46it/s, env_step=100000, gradient_step=10000, len=141, n/ep=1, n/st=100, rew=8106.00]                                                                              


Epoch #100: test_reward: 13736.300000 ± 4608.872575, best_reward: 18043.000000 ± 5201.862589 in #63


Epoch #101: 1001it [00:03, 323.44it/s, env_step=101000, gradient_step=10100, len=183, n/ep=1, n/st=100, rew=10180.00]                                                                             


Epoch #101: test_reward: 14277.800000 ± 5873.349739, best_reward: 18043.000000 ± 5201.862589 in #63


Epoch #102: 1001it [00:02, 356.75it/s, env_step=102000, gradient_step=10200, len=57, n/ep=0, n/st=100, rew=2056.00]                                                                               


Epoch #102: test_reward: 12519.700000 ± 5945.482992, best_reward: 18043.000000 ± 5201.862589 in #63


Epoch #103: 1001it [00:02, 372.37it/s, env_step=103000, gradient_step=10300, len=106, n/ep=1, n/st=100, rew=5676.00]                                                                              


Epoch #103: test_reward: 14227.200000 ± 4584.553780, best_reward: 18043.000000 ± 5201.862589 in #63


Epoch #104: 1001it [00:02, 340.63it/s, env_step=104000, gradient_step=10400, len=139, n/ep=0, n/st=100, rew=8070.00]                                                                              


Epoch #104: test_reward: 11731.000000 ± 5244.674785, best_reward: 18043.000000 ± 5201.862589 in #63


Epoch #105: 1001it [00:02, 334.75it/s, env_step=105000, gradient_step=10500, len=185, n/ep=1, n/st=100, rew=10514.50]                                                                             


Epoch #105: test_reward: 13508.400000 ± 6684.073940, best_reward: 18043.000000 ± 5201.862589 in #63


Epoch #106: 1001it [00:02, 393.25it/s, env_step=106000, gradient_step=10600, len=71, n/ep=0, n/st=100, rew=3109.00]                                                                               


Epoch #106: test_reward: 10105.100000 ± 3197.121594, best_reward: 18043.000000 ± 5201.862589 in #63


Epoch #107: 1001it [00:02, 397.07it/s, env_step=107000, gradient_step=10700, len=163, n/ep=0, n/st=100, rew=9666.50]                                                                              


Epoch #107: test_reward: 14405.800000 ± 7510.976618, best_reward: 18043.000000 ± 5201.862589 in #63


Epoch #108: 1001it [00:03, 331.41it/s, env_step=108000, gradient_step=10800, len=181, n/ep=1, n/st=100, rew=10838.00]                                                                             


Epoch #108: test_reward: 12369.100000 ± 4805.795261, best_reward: 18043.000000 ± 5201.862589 in #63


Epoch #109: 1001it [00:02, 379.05it/s, env_step=109000, gradient_step=10900, len=158, n/ep=0, n/st=100, rew=9643.00]                                                                              


Epoch #109: test_reward: 10001.700000 ± 2734.835573, best_reward: 18043.000000 ± 5201.862589 in #63


Epoch #110: 1001it [00:02, 397.32it/s, env_step=110000, gradient_step=11000, len=191, n/ep=0, n/st=100, rew=11175.50]                                                                             


Epoch #110: test_reward: 10398.600000 ± 3372.616438, best_reward: 18043.000000 ± 5201.862589 in #63


Epoch #111: 1001it [00:02, 399.49it/s, env_step=111000, gradient_step=11100, len=204, n/ep=2, n/st=100, rew=12368.75]                                                                             


Epoch #111: test_reward: 10096.800000 ± 1969.689356, best_reward: 18043.000000 ± 5201.862589 in #63


Epoch #112: 1001it [00:02, 368.00it/s, env_step=112000, gradient_step=11200, len=218, n/ep=1, n/st=100, rew=12329.00]                                                                             


Epoch #112: test_reward: 9568.800000 ± 5734.932411, best_reward: 18043.000000 ± 5201.862589 in #63


Epoch #113: 1001it [00:02, 349.91it/s, env_step=113000, gradient_step=11300, len=54, n/ep=0, n/st=100, rew=2455.00]                                                                               


Epoch #113: test_reward: 10698.400000 ± 4769.386023, best_reward: 18043.000000 ± 5201.862589 in #63


Epoch #114: 1001it [00:02, 373.60it/s, env_step=114000, gradient_step=11400, len=273, n/ep=0, n/st=100, rew=16901.50]                                                                             


Epoch #114: test_reward: 13357.200000 ± 3524.932675, best_reward: 18043.000000 ± 5201.862589 in #63


Epoch #115: 1001it [00:03, 332.15it/s, env_step=115000, gradient_step=11500, len=169, n/ep=2, n/st=100, rew=9319.25]                                                                              


Epoch #115: test_reward: 10597.000000 ± 4657.315922, best_reward: 18043.000000 ± 5201.862589 in #63


Epoch #116: 1001it [00:02, 418.97it/s, env_step=116000, gradient_step=11600, len=276, n/ep=0, n/st=100, rew=17312.00]                                                                             


Epoch #116: test_reward: 12061.200000 ± 6042.296448, best_reward: 18043.000000 ± 5201.862589 in #63


Epoch #117: 1001it [00:02, 366.35it/s, env_step=117000, gradient_step=11700, len=156, n/ep=1, n/st=100, rew=8900.00]                                                                              


Epoch #117: test_reward: 10320.000000 ± 4490.137860, best_reward: 18043.000000 ± 5201.862589 in #63


Epoch #118: 1001it [00:02, 355.16it/s, env_step=118000, gradient_step=11800, len=166, n/ep=0, n/st=100, rew=9022.50]                                                                              


Epoch #118: test_reward: 12023.300000 ± 4319.241462, best_reward: 18043.000000 ± 5201.862589 in #63


Epoch #119: 1001it [00:02, 383.61it/s, env_step=119000, gradient_step=11900, len=225, n/ep=0, n/st=100, rew=13430.00]                                                                             


Epoch #119: test_reward: 11820.400000 ± 4358.946368, best_reward: 18043.000000 ± 5201.862589 in #63


Epoch #120: 1001it [00:02, 391.23it/s, env_step=120000, gradient_step=12000, len=78, n/ep=0, n/st=100, rew=3386.00]                                                                               


Epoch #120: test_reward: 12583.400000 ± 5035.347837, best_reward: 18043.000000 ± 5201.862589 in #63


Epoch #121: 1001it [00:02, 383.68it/s, env_step=121000, gradient_step=12100, len=211, n/ep=0, n/st=100, rew=12378.00]                                                                             


Epoch #121: test_reward: 14297.800000 ± 7662.131267, best_reward: 18043.000000 ± 5201.862589 in #63


Epoch #122: 1001it [00:02, 400.49it/s, env_step=122000, gradient_step=12200, len=104, n/ep=0, n/st=100, rew=5620.00]                                                                              


Epoch #122: test_reward: 13380.500000 ± 5205.498617, best_reward: 18043.000000 ± 5201.862589 in #63


Epoch #123: 1001it [00:02, 339.08it/s, env_step=123000, gradient_step=12300, len=129, n/ep=1, n/st=100, rew=7671.50]                                                                              


Epoch #123: test_reward: 11741.500000 ± 4436.105888, best_reward: 18043.000000 ± 5201.862589 in #63


Epoch #124: 1001it [00:02, 376.22it/s, env_step=124000, gradient_step=12400, len=194, n/ep=0, n/st=100, rew=10829.00]                                                                             


Epoch #124: test_reward: 11615.100000 ± 4916.985752, best_reward: 18043.000000 ± 5201.862589 in #63


Epoch #125: 1001it [00:02, 349.76it/s, env_step=125000, gradient_step=12500, len=400, n/ep=0, n/st=100, rew=26405.00]                                                                             


Epoch #125: test_reward: 12793.700000 ± 5376.570172, best_reward: 18043.000000 ± 5201.862589 in #63


Epoch #126: 1001it [00:02, 359.50it/s, env_step=126000, gradient_step=12600, len=190, n/ep=0, n/st=100, rew=10679.00]                                                                             


Epoch #126: test_reward: 13257.500000 ± 3552.712745, best_reward: 18043.000000 ± 5201.862589 in #63


Epoch #127: 1001it [00:02, 363.75it/s, env_step=127000, gradient_step=12700, len=85, n/ep=0, n/st=100, rew=4594.75]                                                                               


Epoch #127: test_reward: 13289.800000 ± 5408.930611, best_reward: 18043.000000 ± 5201.862589 in #63


Epoch #128: 1001it [00:03, 315.89it/s, env_step=128000, gradient_step=12800, len=138, n/ep=0, n/st=100, rew=7344.00]                                                                              


Epoch #128: test_reward: 14578.200000 ± 3335.390766, best_reward: 18043.000000 ± 5201.862589 in #63


Epoch #129: 1001it [00:02, 383.55it/s, env_step=129000, gradient_step=12900, len=198, n/ep=2, n/st=100, rew=12036.75]                                                                             


Epoch #129: test_reward: 13989.300000 ± 4127.859204, best_reward: 18043.000000 ± 5201.862589 in #63


Epoch #130: 1001it [00:02, 400.91it/s, env_step=130000, gradient_step=13000, len=119, n/ep=1, n/st=100, rew=6818.50]                                                                              


Epoch #130: test_reward: 14032.400000 ± 4794.085819, best_reward: 18043.000000 ± 5201.862589 in #63


Epoch #131: 1001it [00:02, 352.71it/s, env_step=131000, gradient_step=13100, len=216, n/ep=1, n/st=100, rew=12746.00]                                                                             


Epoch #131: test_reward: 12229.100000 ± 6630.697587, best_reward: 18043.000000 ± 5201.862589 in #63


Epoch #132: 1001it [00:02, 337.79it/s, env_step=132000, gradient_step=13200, len=193, n/ep=0, n/st=100, rew=10716.50]                                                                             


Epoch #132: test_reward: 10151.200000 ± 3982.157827, best_reward: 18043.000000 ± 5201.862589 in #63


Epoch #133: 1001it [00:02, 355.06it/s, env_step=133000, gradient_step=13300, len=157, n/ep=1, n/st=100, rew=9834.00]                                                                              


Epoch #133: test_reward: 15076.600000 ± 4546.489815, best_reward: 18043.000000 ± 5201.862589 in #63


Epoch #134: 1001it [00:02, 370.40it/s, env_step=134000, gradient_step=13400, len=112, n/ep=0, n/st=100, rew=6323.00]                                                                              


Epoch #134: test_reward: 17739.000000 ± 4871.697076, best_reward: 18043.000000 ± 5201.862589 in #63


Epoch #135: 1001it [00:03, 327.73it/s, env_step=135000, gradient_step=13500, len=64, n/ep=0, n/st=100, rew=2487.00]                                                                               


Epoch #135: test_reward: 19488.700000 ± 5727.377481, best_reward: 19488.700000 ± 5727.377481 in #135


Epoch #136: 1001it [00:02, 362.17it/s, env_step=136000, gradient_step=13600, len=263, n/ep=2, n/st=100, rew=15833.50]                                                                             


Epoch #136: test_reward: 12413.800000 ± 3524.048745, best_reward: 19488.700000 ± 5727.377481 in #135


Epoch #137: 1001it [00:02, 349.83it/s, env_step=137000, gradient_step=13700, len=226, n/ep=0, n/st=100, rew=13575.00]                                                                             


Epoch #137: test_reward: 17107.200000 ± 4985.402166, best_reward: 19488.700000 ± 5727.377481 in #135


Epoch #138: 1001it [00:02, 338.61it/s, env_step=138000, gradient_step=13800, len=152, n/ep=0, n/st=100, rew=8834.00]                                                                              


Epoch #138: test_reward: 13092.400000 ± 7740.945759, best_reward: 19488.700000 ± 5727.377481 in #135


Epoch #139: 1001it [00:05, 184.30it/s, env_step=139000, gradient_step=13900, len=239, n/ep=0, n/st=100, rew=14716.00]                                                                             


Epoch #139: test_reward: 10159.000000 ± 2736.713832, best_reward: 19488.700000 ± 5727.377481 in #135


Epoch #140: 1001it [00:03, 257.67it/s, env_step=140000, gradient_step=14000, len=189, n/ep=2, n/st=100, rew=10821.50]                                                                             


Epoch #140: test_reward: 15461.600000 ± 5406.602023, best_reward: 19488.700000 ± 5727.377481 in #135


Epoch #141: 1001it [00:03, 274.44it/s, env_step=141000, gradient_step=14100, len=260, n/ep=0, n/st=100, rew=15852.50]                                                                             


Epoch #141: test_reward: 13683.500000 ± 6401.531430, best_reward: 19488.700000 ± 5727.377481 in #135


Epoch #142: 1001it [00:03, 257.08it/s, env_step=142000, gradient_step=14200, len=199, n/ep=0, n/st=100, rew=12064.50]                                                                             


Epoch #142: test_reward: 10967.600000 ± 3288.350778, best_reward: 19488.700000 ± 5727.377481 in #135


Epoch #143: 1001it [00:04, 238.81it/s, env_step=143000, gradient_step=14300, len=262, n/ep=0, n/st=100, rew=15639.00]                                                                             


Epoch #143: test_reward: 10745.300000 ± 5518.534517, best_reward: 19488.700000 ± 5727.377481 in #135


Epoch #144: 1001it [00:04, 250.24it/s, env_step=144000, gradient_step=14400, len=184, n/ep=0, n/st=100, rew=10430.50]                                                                             


Epoch #144: test_reward: 14714.600000 ± 5967.562387, best_reward: 19488.700000 ± 5727.377481 in #135


Epoch #145: 1001it [00:03, 254.86it/s, env_step=145000, gradient_step=14500, len=400, n/ep=1, n/st=100, rew=28044.00]                                                                             


Epoch #145: test_reward: 11795.900000 ± 4358.998886, best_reward: 19488.700000 ± 5727.377481 in #135


Epoch #146: 1001it [00:04, 245.86it/s, env_step=146000, gradient_step=14600, len=127, n/ep=0, n/st=100, rew=6648.50]                                                                              


Epoch #146: test_reward: 12970.000000 ± 6503.912484, best_reward: 19488.700000 ± 5727.377481 in #135


Epoch #147: 1001it [00:03, 268.87it/s, env_step=147000, gradient_step=14700, len=78, n/ep=1, n/st=100, rew=3728.00]                                                                               


Epoch #147: test_reward: 18471.800000 ± 6308.259630, best_reward: 19488.700000 ± 5727.377481 in #135


Epoch #148: 1001it [00:03, 292.69it/s, env_step=148000, gradient_step=14800, len=110, n/ep=0, n/st=100, rew=5284.00]                                                                              


Epoch #148: test_reward: 12601.100000 ± 6843.058066, best_reward: 19488.700000 ± 5727.377481 in #135


Epoch #149: 1001it [00:04, 233.31it/s, env_step=149000, gradient_step=14900, len=188, n/ep=1, n/st=100, rew=10261.00]                                                                             


Epoch #149: test_reward: 13515.600000 ± 6881.769412, best_reward: 19488.700000 ± 5727.377481 in #135


Epoch #150: 1001it [00:03, 309.84it/s, env_step=150000, gradient_step=15000, len=142, n/ep=1, n/st=100, rew=8334.00]                                                                              


Epoch #150: test_reward: 12359.000000 ± 2153.874509, best_reward: 19488.700000 ± 5727.377481 in #135

InfoStats(gradient_step=15000, best_reward=19488.7, best_reward_std=5727.377481011707, train_step=150000, train_episode=808, test_step=327322, test_episode=1510, timing=TimingStats(total_time=617.1621088981628, train_time=423.4008746147156, train_time_collect=54.34202313423157, train_time_update=362.53940415382385, test_time=193.76123428344727, update_speed=354.274185513897))

(the trained policy can be accessed via policy.policies[agents[1]])
Train with agent_opponent #14


Epoch #1: 1001it [00:03, 276.63it/s, env_step=1000, gradient_step=100, len=0, n/ep=0, n/st=100, rew=0.00]                                                                                         


Epoch #1: test_reward: 7655.700000 ± 5672.196031, best_reward: 9153.200000 ± 4831.844633 in #0


Epoch #2: 1001it [00:03, 315.49it/s, env_step=2000, gradient_step=200, len=0, n/ep=0, n/st=100, rew=0.00]                                                                                         


Epoch #2: test_reward: 11843.200000 ± 7242.464233, best_reward: 11843.200000 ± 7242.464233 in #2


Epoch #3: 1001it [00:02, 349.11it/s, env_step=3000, gradient_step=300, len=0, n/ep=0, n/st=100, rew=0.00]                                                                                         


Epoch #3: test_reward: 8054.600000 ± 5785.167780, best_reward: 11843.200000 ± 7242.464233 in #2


Epoch #4: 1001it [00:03, 294.48it/s, env_step=4000, gradient_step=400, len=0, n/ep=0, n/st=100, rew=0.00]                                                                                         


Epoch #4: test_reward: 5353.600000 ± 5302.435803, best_reward: 11843.200000 ± 7242.464233 in #2


Epoch #5: 1001it [00:03, 332.55it/s, env_step=5000, gradient_step=500, len=0, n/ep=0, n/st=100, rew=0.00]                                                                                         


Epoch #5: test_reward: 8509.500000 ± 6356.122690, best_reward: 11843.200000 ± 7242.464233 in #2


Epoch #6: 1001it [00:03, 259.83it/s, env_step=6000, gradient_step=600, len=0, n/ep=0, n/st=100, rew=0.00]                                                                                         


Epoch #6: test_reward: 12294.500000 ± 6429.186749, best_reward: 12294.500000 ± 6429.186749 in #6


Epoch #7: 1001it [00:02, 348.20it/s, env_step=7000, gradient_step=700, len=0, n/ep=0, n/st=100, rew=0.00]                                                                                         


Epoch #7: test_reward: 13293.200000 ± 4398.915953, best_reward: 13293.200000 ± 4398.915953 in #7


Epoch #8: 1001it [00:02, 349.36it/s, env_step=8000, gradient_step=800, len=77, n/ep=0, n/st=100, rew=2480.00]                                                                                     


Epoch #8: test_reward: 13162.200000 ± 5231.686264, best_reward: 13293.200000 ± 4398.915953 in #7


Epoch #9: 1001it [00:03, 267.07it/s, env_step=9000, gradient_step=900, len=84, n/ep=0, n/st=100, rew=2955.00]                                                                                     


Epoch #9: test_reward: 16428.100000 ± 7876.365487, best_reward: 16428.100000 ± 7876.365487 in #9


Epoch #10: 1001it [00:03, 301.76it/s, env_step=10000, gradient_step=1000, len=84, n/ep=0, n/st=100, rew=2955.00]                                                                                  


Epoch #10: test_reward: 15514.500000 ± 5567.716574, best_reward: 16428.100000 ± 7876.365487 in #9


Epoch #11: 1001it [00:04, 248.08it/s, env_step=11000, gradient_step=1100, len=109, n/ep=0, n/st=100, rew=4351.00]                                                                                 


Epoch #11: test_reward: 19277.000000 ± 6500.800212, best_reward: 19277.000000 ± 6500.800212 in #11


Epoch #12: 1001it [00:03, 263.79it/s, env_step=12000, gradient_step=1200, len=109, n/ep=0, n/st=100, rew=4351.00]                                                                                 


Epoch #12: test_reward: 14061.000000 ± 5570.685595, best_reward: 19277.000000 ± 6500.800212 in #11


Epoch #13: 1001it [00:03, 253.30it/s, env_step=13000, gradient_step=1300, len=122, n/ep=0, n/st=100, rew=4534.50]                                                                                 


Epoch #13: test_reward: 13080.600000 ± 4540.661454, best_reward: 19277.000000 ± 6500.800212 in #11


Epoch #14: 1001it [00:04, 244.71it/s, env_step=14000, gradient_step=1400, len=138, n/ep=0, n/st=100, rew=5571.00]                                                                                 


Epoch #14: test_reward: 11189.300000 ± 3349.792114, best_reward: 19277.000000 ± 6500.800212 in #11


Epoch #15: 1001it [00:03, 296.26it/s, env_step=15000, gradient_step=1500, len=150, n/ep=2, n/st=100, rew=6390.75]                                                                                 


Epoch #15: test_reward: 11798.200000 ± 4203.162471, best_reward: 19277.000000 ± 6500.800212 in #11


Epoch #16: 1001it [00:04, 238.72it/s, env_step=16000, gradient_step=1600, len=157, n/ep=0, n/st=100, rew=7636.00]                                                                                 


Epoch #16: test_reward: 8206.700000 ± 2957.463307, best_reward: 19277.000000 ± 6500.800212 in #11


Epoch #17: 1001it [00:04, 240.83it/s, env_step=17000, gradient_step=1700, len=170, n/ep=1, n/st=100, rew=7430.00]                                                                                 


Epoch #17: test_reward: 7395.600000 ± 2491.356225, best_reward: 19277.000000 ± 6500.800212 in #11


Epoch #18: 1001it [00:03, 258.87it/s, env_step=18000, gradient_step=1800, len=176, n/ep=0, n/st=100, rew=7719.00]                                                                                 


Epoch #18: test_reward: 11459.200000 ± 2265.165636, best_reward: 19277.000000 ± 6500.800212 in #11


Epoch #19: 1001it [00:03, 278.02it/s, env_step=19000, gradient_step=1900, len=188, n/ep=0, n/st=100, rew=8795.33]                                                                                 


Epoch #19: test_reward: 11060.600000 ± 2903.096285, best_reward: 19277.000000 ± 6500.800212 in #11


Epoch #20: 1001it [00:03, 288.76it/s, env_step=20000, gradient_step=2000, len=199, n/ep=0, n/st=100, rew=10278.50]                                                                                


Epoch #20: test_reward: 8305.100000 ± 3444.604925, best_reward: 19277.000000 ± 6500.800212 in #11


Epoch #21: 1001it [00:03, 282.12it/s, env_step=21000, gradient_step=2100, len=171, n/ep=2, n/st=100, rew=7872.50]                                                                                 


Epoch #21: test_reward: 7661.000000 ± 5099.744719, best_reward: 19277.000000 ± 6500.800212 in #11


Epoch #22: 1001it [00:03, 278.10it/s, env_step=22000, gradient_step=2200, len=176, n/ep=0, n/st=100, rew=8481.50]                                                                                 


Epoch #22: test_reward: 10652.700000 ± 8835.662036, best_reward: 19277.000000 ± 6500.800212 in #11


Epoch #23: 1001it [00:03, 276.32it/s, env_step=23000, gradient_step=2300, len=230, n/ep=2, n/st=100, rew=12266.50]                                                                                


Epoch #23: test_reward: 11543.700000 ± 3626.795612, best_reward: 19277.000000 ± 6500.800212 in #11


Epoch #24: 1001it [00:04, 220.16it/s, env_step=24000, gradient_step=2400, len=239, n/ep=0, n/st=100, rew=12889.75]                                                                                


Epoch #24: test_reward: 14490.400000 ± 6409.674753, best_reward: 19277.000000 ± 6500.800212 in #11


Epoch #25: 1001it [00:03, 251.68it/s, env_step=25000, gradient_step=2500, len=249, n/ep=0, n/st=100, rew=13588.50]                                                                                


Epoch #25: test_reward: 11971.500000 ± 6514.213325, best_reward: 19277.000000 ± 6500.800212 in #11


Epoch #26: 1001it [00:03, 257.31it/s, env_step=26000, gradient_step=2600, len=254, n/ep=0, n/st=100, rew=12352.00]                                                                                


Epoch #26: test_reward: 11764.600000 ± 5993.231652, best_reward: 19277.000000 ± 6500.800212 in #11


Epoch #27: 1001it [00:03, 289.32it/s, env_step=27000, gradient_step=2700, len=148, n/ep=1, n/st=100, rew=8858.50]                                                                                 


Epoch #27: test_reward: 11385.600000 ± 6177.400913, best_reward: 19277.000000 ± 6500.800212 in #11


Epoch #28: 1001it [00:03, 259.31it/s, env_step=28000, gradient_step=2800, len=279, n/ep=0, n/st=100, rew=15258.50]                                                                                


Epoch #28: test_reward: 10253.300000 ± 4015.901494, best_reward: 19277.000000 ± 6500.800212 in #11


Epoch #29: 1001it [00:03, 286.18it/s, env_step=29000, gradient_step=2900, len=133, n/ep=0, n/st=100, rew=7011.00]                                                                                 


Epoch #29: test_reward: 11311.400000 ± 5216.964179, best_reward: 19277.000000 ± 6500.800212 in #11


Epoch #30: 1001it [00:04, 240.16it/s, env_step=30000, gradient_step=3000, len=38, n/ep=1, n/st=100, rew=1513.50]                                                                                  


Epoch #30: test_reward: 9920.500000 ± 5139.313558, best_reward: 19277.000000 ± 6500.800212 in #11


Epoch #31: 1001it [00:03, 279.58it/s, env_step=31000, gradient_step=3100, len=164, n/ep=0, n/st=100, rew=8280.00]                                                                                 


Epoch #31: test_reward: 11920.400000 ± 5029.324472, best_reward: 19277.000000 ± 6500.800212 in #11


Epoch #32: 1001it [00:02, 342.23it/s, env_step=32000, gradient_step=3200, len=213, n/ep=0, n/st=100, rew=10237.25]                                                                                


Epoch #32: test_reward: 14112.800000 ± 5007.805923, best_reward: 19277.000000 ± 6500.800212 in #11


Epoch #33: 1001it [00:03, 322.63it/s, env_step=33000, gradient_step=3300, len=98, n/ep=0, n/st=100, rew=4641.00]                                                                                  


Epoch #33: test_reward: 13059.800000 ± 5846.774475, best_reward: 19277.000000 ± 6500.800212 in #11


Epoch #34: 1001it [00:03, 278.52it/s, env_step=34000, gradient_step=3400, len=137, n/ep=1, n/st=100, rew=8021.50]                                                                                 


Epoch #34: test_reward: 10970.800000 ± 3685.975930, best_reward: 19277.000000 ± 6500.800212 in #11


Epoch #35: 1001it [00:03, 258.37it/s, env_step=35000, gradient_step=3500, len=350, n/ep=1, n/st=100, rew=20018.00]                                                                                


Epoch #35: test_reward: 9901.000000 ± 3734.943882, best_reward: 19277.000000 ± 6500.800212 in #11


Epoch #36: 1001it [00:03, 310.94it/s, env_step=36000, gradient_step=3600, len=216, n/ep=0, n/st=100, rew=11110.00]                                                                                


Epoch #36: test_reward: 11638.600000 ± 2667.585695, best_reward: 19277.000000 ± 6500.800212 in #11


Epoch #37: 1001it [00:03, 291.87it/s, env_step=37000, gradient_step=3700, len=256, n/ep=0, n/st=100, rew=15399.25]                                                                                


Epoch #37: test_reward: 10451.900000 ± 4696.096218, best_reward: 19277.000000 ± 6500.800212 in #11


Epoch #38: 1001it [00:03, 276.09it/s, env_step=38000, gradient_step=3800, len=218, n/ep=1, n/st=100, rew=13248.00]                                                                                


Epoch #38: test_reward: 9481.900000 ± 4192.274191, best_reward: 19277.000000 ± 6500.800212 in #11


Epoch #39: 1001it [00:03, 267.84it/s, env_step=39000, gradient_step=3900, len=70, n/ep=0, n/st=100, rew=3099.00]                                                                                  


Epoch #39: test_reward: 10544.000000 ± 5276.241314, best_reward: 19277.000000 ± 6500.800212 in #11


Epoch #40: 1001it [00:03, 291.28it/s, env_step=40000, gradient_step=4000, len=400, n/ep=5, n/st=100, rew=22878.60]                                                                                


Epoch #40: test_reward: 10685.200000 ± 4112.289917, best_reward: 19277.000000 ± 6500.800212 in #11


Epoch #41: 1001it [00:03, 266.23it/s, env_step=41000, gradient_step=4100, len=172, n/ep=0, n/st=100, rew=9769.25]                                                                                 


Epoch #41: test_reward: 9888.300000 ± 4654.374503, best_reward: 19277.000000 ± 6500.800212 in #11


Epoch #42: 1001it [00:03, 255.24it/s, env_step=42000, gradient_step=4200, len=50, n/ep=0, n/st=100, rew=1755.00]                                                                                  


Epoch #42: test_reward: 5210.600000 ± 4364.507560, best_reward: 19277.000000 ± 6500.800212 in #11


Epoch #43: 1001it [00:03, 328.34it/s, env_step=43000, gradient_step=4300, len=228, n/ep=0, n/st=100, rew=13103.50]                                                                                


Epoch #43: test_reward: 11289.400000 ± 4920.493943, best_reward: 19277.000000 ± 6500.800212 in #11


Epoch #44: 1001it [00:03, 256.51it/s, env_step=44000, gradient_step=4400, len=202, n/ep=0, n/st=100, rew=12197.00]                                                                                


Epoch #44: test_reward: 8237.800000 ± 4077.321616, best_reward: 19277.000000 ± 6500.800212 in #11


Epoch #45: 1001it [00:03, 268.39it/s, env_step=45000, gradient_step=4500, len=224, n/ep=0, n/st=100, rew=11282.33]                                                                                


Epoch #45: test_reward: 11764.700000 ± 5292.182197, best_reward: 19277.000000 ± 6500.800212 in #11


Epoch #46: 1001it [00:03, 296.84it/s, env_step=46000, gradient_step=4600, len=177, n/ep=0, n/st=100, rew=9858.00]                                                                                 


Epoch #46: test_reward: 13892.500000 ± 3744.720691, best_reward: 19277.000000 ± 6500.800212 in #11


Epoch #47: 1001it [00:03, 288.88it/s, env_step=47000, gradient_step=4700, len=229, n/ep=0, n/st=100, rew=12604.00]                                                                                


Epoch #47: test_reward: 8980.800000 ± 6633.157691, best_reward: 19277.000000 ± 6500.800212 in #11


Epoch #48: 1001it [00:03, 307.14it/s, env_step=48000, gradient_step=4800, len=264, n/ep=1, n/st=100, rew=14773.50]                                                                                


Epoch #48: test_reward: 11864.200000 ± 3794.600606, best_reward: 19277.000000 ± 6500.800212 in #11


Epoch #49: 1001it [00:03, 300.16it/s, env_step=49000, gradient_step=4900, len=131, n/ep=0, n/st=100, rew=6942.00]                                                                                 


Epoch #49: test_reward: 11156.600000 ± 4356.378386, best_reward: 19277.000000 ± 6500.800212 in #11


Epoch #50: 1001it [00:03, 299.23it/s, env_step=50000, gradient_step=5000, len=200, n/ep=0, n/st=100, rew=10841.50]                                                                                


Epoch #50: test_reward: 14835.700000 ± 5857.476983, best_reward: 19277.000000 ± 6500.800212 in #11


Epoch #51: 1001it [00:03, 320.25it/s, env_step=51000, gradient_step=5100, len=245, n/ep=1, n/st=100, rew=14487.00]                                                                                


Epoch #51: test_reward: 11115.300000 ± 3322.937347, best_reward: 19277.000000 ± 6500.800212 in #11


Epoch #52: 1001it [00:03, 306.66it/s, env_step=52000, gradient_step=5200, len=230, n/ep=0, n/st=100, rew=12170.50]                                                                                


Epoch #52: test_reward: 15147.300000 ± 5423.859365, best_reward: 19277.000000 ± 6500.800212 in #11


Epoch #53: 1001it [00:03, 277.65it/s, env_step=53000, gradient_step=5300, len=137, n/ep=0, n/st=100, rew=7105.00]                                                                                 


Epoch #53: test_reward: 12404.000000 ± 4727.931641, best_reward: 19277.000000 ± 6500.800212 in #11


Epoch #54: 1001it [00:03, 308.62it/s, env_step=54000, gradient_step=5400, len=309, n/ep=0, n/st=100, rew=16501.00]                                                                                


Epoch #54: test_reward: 8825.400000 ± 3637.485841, best_reward: 19277.000000 ± 6500.800212 in #11


Epoch #55: 1001it [00:03, 290.58it/s, env_step=55000, gradient_step=5500, len=126, n/ep=0, n/st=100, rew=5612.00]                                                                                 


Epoch #55: test_reward: 14931.300000 ± 6187.930770, best_reward: 19277.000000 ± 6500.800212 in #11


Epoch #56: 1001it [00:03, 280.40it/s, env_step=56000, gradient_step=5600, len=202, n/ep=1, n/st=100, rew=9807.50]                                                                                 


Epoch #56: test_reward: 10282.300000 ± 7165.494094, best_reward: 19277.000000 ± 6500.800212 in #11


Epoch #57: 1001it [00:03, 269.96it/s, env_step=57000, gradient_step=5700, len=239, n/ep=0, n/st=100, rew=13253.50]                                                                                


Epoch #57: test_reward: 12605.900000 ± 3585.452062, best_reward: 19277.000000 ± 6500.800212 in #11


Epoch #58: 1001it [00:03, 308.54it/s, env_step=58000, gradient_step=5800, len=150, n/ep=0, n/st=100, rew=7372.00]                                                                                 


Epoch #58: test_reward: 9102.200000 ± 4246.576098, best_reward: 19277.000000 ± 6500.800212 in #11


Epoch #59: 1001it [00:03, 265.44it/s, env_step=59000, gradient_step=5900, len=267, n/ep=0, n/st=100, rew=15719.00]                                                                                


Epoch #59: test_reward: 11610.800000 ± 4947.860988, best_reward: 19277.000000 ± 6500.800212 in #11


Epoch #60: 1001it [00:03, 292.30it/s, env_step=60000, gradient_step=6000, len=194, n/ep=0, n/st=100, rew=10681.75]                                                                                


Epoch #60: test_reward: 11320.600000 ± 6046.999011, best_reward: 19277.000000 ± 6500.800212 in #11


Epoch #61: 1001it [00:03, 267.32it/s, env_step=61000, gradient_step=6100, len=359, n/ep=1, n/st=100, rew=21317.00]                                                                                


Epoch #61: test_reward: 12760.800000 ± 5849.447594, best_reward: 19277.000000 ± 6500.800212 in #11


Epoch #62: 1001it [00:03, 323.96it/s, env_step=62000, gradient_step=6200, len=186, n/ep=2, n/st=100, rew=10627.50]                                                                                


Epoch #62: test_reward: 12295.600000 ± 4406.194462, best_reward: 19277.000000 ± 6500.800212 in #11


Epoch #63: 1001it [00:03, 300.61it/s, env_step=63000, gradient_step=6300, len=83, n/ep=0, n/st=100, rew=3316.00]                                                                                  


Epoch #63: test_reward: 8430.200000 ± 6420.552154, best_reward: 19277.000000 ± 6500.800212 in #11


Epoch #64: 1001it [00:03, 287.66it/s, env_step=64000, gradient_step=6400, len=307, n/ep=1, n/st=100, rew=17678.00]                                                                                


Epoch #64: test_reward: 9991.600000 ± 5417.226139, best_reward: 19277.000000 ± 6500.800212 in #11


Epoch #65: 1001it [00:03, 297.61it/s, env_step=65000, gradient_step=6500, len=139, n/ep=0, n/st=100, rew=7513.00]                                                                                 


Epoch #65: test_reward: 12433.400000 ± 3945.586704, best_reward: 19277.000000 ± 6500.800212 in #11


Epoch #66: 1001it [00:03, 277.36it/s, env_step=66000, gradient_step=6600, len=169, n/ep=0, n/st=100, rew=9375.00]                                                                                 


Epoch #66: test_reward: 11763.100000 ± 4184.400613, best_reward: 19277.000000 ± 6500.800212 in #11


Epoch #67: 1001it [00:03, 269.56it/s, env_step=67000, gradient_step=6700, len=271, n/ep=1, n/st=100, rew=16265.00]                                                                                


Epoch #67: test_reward: 10096.200000 ± 4715.129559, best_reward: 19277.000000 ± 6500.800212 in #11


Epoch #68: 1001it [00:03, 295.46it/s, env_step=68000, gradient_step=6800, len=279, n/ep=0, n/st=100, rew=16003.00]                                                                                


Epoch #68: test_reward: 8402.100000 ± 2904.101253, best_reward: 19277.000000 ± 6500.800212 in #11


Epoch #69: 1001it [00:03, 271.72it/s, env_step=69000, gradient_step=6900, len=400, n/ep=0, n/st=100, rew=22302.00]                                                                                


Epoch #69: test_reward: 9397.200000 ± 4429.301453, best_reward: 19277.000000 ± 6500.800212 in #11


Epoch #70: 1001it [00:03, 282.33it/s, env_step=70000, gradient_step=7000, len=131, n/ep=0, n/st=100, rew=6384.50]                                                                                 


Epoch #70: test_reward: 12106.500000 ± 3994.566140, best_reward: 19277.000000 ± 6500.800212 in #11


Epoch #71: 1001it [00:03, 262.03it/s, env_step=71000, gradient_step=7100, len=257, n/ep=0, n/st=100, rew=12437.00]                                                                                


Epoch #71: test_reward: 13113.300000 ± 6523.580889, best_reward: 19277.000000 ± 6500.800212 in #11


Epoch #72: 1001it [00:04, 238.60it/s, env_step=72000, gradient_step=7200, len=181, n/ep=1, n/st=100, rew=9381.00]                                                                                 


Epoch #72: test_reward: 9292.000000 ± 3389.751643, best_reward: 19277.000000 ± 6500.800212 in #11


Epoch #73: 1001it [00:04, 210.16it/s, env_step=73000, gradient_step=7300, len=148, n/ep=2, n/st=100, rew=7507.75]                                                                                 


Epoch #73: test_reward: 11934.300000 ± 2940.360728, best_reward: 19277.000000 ± 6500.800212 in #11


Epoch #74: 1001it [00:04, 243.65it/s, env_step=74000, gradient_step=7400, len=285, n/ep=1, n/st=100, rew=16776.00]                                                                                


Epoch #74: test_reward: 12252.400000 ± 6069.878849, best_reward: 19277.000000 ± 6500.800212 in #11


Epoch #75: 1001it [00:03, 267.20it/s, env_step=75000, gradient_step=7500, len=133, n/ep=1, n/st=100, rew=6586.50]                                                                                 


Epoch #75: test_reward: 13879.900000 ± 6488.622480, best_reward: 19277.000000 ± 6500.800212 in #11


Epoch #76: 1001it [00:04, 233.96it/s, env_step=76000, gradient_step=7600, len=286, n/ep=2, n/st=100, rew=16503.25]                                                                                


Epoch #76: test_reward: 11892.300000 ± 6397.279954, best_reward: 19277.000000 ± 6500.800212 in #11


Epoch #77: 1001it [00:04, 250.11it/s, env_step=77000, gradient_step=7700, len=149, n/ep=0, n/st=100, rew=8674.50]                                                                                 


Epoch #77: test_reward: 13210.400000 ± 7425.197185, best_reward: 19277.000000 ± 6500.800212 in #11


Epoch #78: 1001it [00:03, 272.41it/s, env_step=78000, gradient_step=7800, len=203, n/ep=0, n/st=100, rew=10063.00]                                                                                


Epoch #78: test_reward: 14085.800000 ± 4850.871114, best_reward: 19277.000000 ± 6500.800212 in #11


Epoch #79: 1001it [00:03, 279.69it/s, env_step=79000, gradient_step=7900, len=85, n/ep=1, n/st=100, rew=3268.00]                                                                                  


Epoch #79: test_reward: 16157.400000 ± 4841.720442, best_reward: 19277.000000 ± 6500.800212 in #11


Epoch #80: 1001it [00:04, 215.81it/s, env_step=80000, gradient_step=8000, len=162, n/ep=2, n/st=100, rew=8730.75]                                                                                 


Epoch #80: test_reward: 14239.800000 ± 7091.622773, best_reward: 19277.000000 ± 6500.800212 in #11


Epoch #81: 1001it [00:03, 255.66it/s, env_step=81000, gradient_step=8100, len=169, n/ep=0, n/st=100, rew=8944.00]                                                                                 


Epoch #81: test_reward: 12506.500000 ± 5260.700091, best_reward: 19277.000000 ± 6500.800212 in #11


Epoch #82: 1001it [00:03, 263.91it/s, env_step=82000, gradient_step=8200, len=251, n/ep=0, n/st=100, rew=14169.00]                                                                                


Epoch #82: test_reward: 8860.400000 ± 4314.761064, best_reward: 19277.000000 ± 6500.800212 in #11


Epoch #83: 1001it [00:04, 250.25it/s, env_step=83000, gradient_step=8300, len=232, n/ep=0, n/st=100, rew=12716.00]                                                                                


Epoch #83: test_reward: 10477.200000 ± 5442.967073, best_reward: 19277.000000 ± 6500.800212 in #11


Epoch #84: 1001it [00:03, 263.97it/s, env_step=84000, gradient_step=8400, len=114, n/ep=2, n/st=100, rew=5893.00]                                                                                 


Epoch #84: test_reward: 11929.500000 ± 4167.825842, best_reward: 19277.000000 ± 6500.800212 in #11


Epoch #85: 1001it [00:03, 301.23it/s, env_step=85000, gradient_step=8500, len=224, n/ep=0, n/st=100, rew=12874.00]                                                                                


Epoch #85: test_reward: 12034.700000 ± 5582.941627, best_reward: 19277.000000 ± 6500.800212 in #11


Epoch #86: 1001it [00:03, 267.60it/s, env_step=86000, gradient_step=8600, len=259, n/ep=1, n/st=100, rew=15500.00]                                                                                


Epoch #86: test_reward: 11291.300000 ± 6691.149857, best_reward: 19277.000000 ± 6500.800212 in #11


Epoch #87: 1001it [00:03, 281.93it/s, env_step=87000, gradient_step=8700, len=271, n/ep=0, n/st=100, rew=16058.00]                                                                                


Epoch #87: test_reward: 14075.300000 ± 5013.198421, best_reward: 19277.000000 ± 6500.800212 in #11


Epoch #88: 1001it [00:03, 293.20it/s, env_step=88000, gradient_step=8800, len=400, n/ep=1, n/st=100, rew=23727.00]                                                                                


Epoch #88: test_reward: 9237.800000 ± 3954.900398, best_reward: 19277.000000 ± 6500.800212 in #11


Epoch #89: 1001it [00:03, 292.68it/s, env_step=89000, gradient_step=8900, len=146, n/ep=0, n/st=100, rew=7474.00]                                                                                 


Epoch #89: test_reward: 16188.600000 ± 6367.240096, best_reward: 19277.000000 ± 6500.800212 in #11


Epoch #90: 1001it [00:03, 251.02it/s, env_step=90000, gradient_step=9000, len=255, n/ep=0, n/st=100, rew=15177.50]                                                                                


Epoch #90: test_reward: 12484.400000 ± 5415.111434, best_reward: 19277.000000 ± 6500.800212 in #11


Epoch #91: 1001it [00:03, 298.27it/s, env_step=91000, gradient_step=9100, len=92, n/ep=0, n/st=100, rew=4782.50]                                                                                  


Epoch #91: test_reward: 14027.700000 ± 5702.349692, best_reward: 19277.000000 ± 6500.800212 in #11


Epoch #92: 1001it [00:03, 299.70it/s, env_step=92000, gradient_step=9200, len=244, n/ep=0, n/st=100, rew=13767.67]                                                                                


Epoch #92: test_reward: 10101.000000 ± 3409.779524, best_reward: 19277.000000 ± 6500.800212 in #11


Epoch #93: 1001it [00:03, 309.37it/s, env_step=93000, gradient_step=9300, len=119, n/ep=1, n/st=100, rew=6725.00]                                                                                 


Epoch #93: test_reward: 18289.600000 ± 5813.296314, best_reward: 19277.000000 ± 6500.800212 in #11


Epoch #94: 1001it [00:03, 290.55it/s, env_step=94000, gradient_step=9400, len=279, n/ep=1, n/st=100, rew=16942.00]                                                                                


Epoch #94: test_reward: 15533.700000 ± 3951.072843, best_reward: 19277.000000 ± 6500.800212 in #11


Epoch #95: 1001it [00:03, 268.15it/s, env_step=95000, gradient_step=9500, len=231, n/ep=0, n/st=100, rew=13108.00]                                                                                


Epoch #95: test_reward: 14450.400000 ± 4108.095500, best_reward: 19277.000000 ± 6500.800212 in #11


Epoch #96: 1001it [00:03, 319.02it/s, env_step=96000, gradient_step=9600, len=161, n/ep=0, n/st=100, rew=8503.75]                                                                                 


Epoch #96: test_reward: 11301.800000 ± 4582.967637, best_reward: 19277.000000 ± 6500.800212 in #11


Epoch #97: 1001it [00:03, 307.99it/s, env_step=97000, gradient_step=9700, len=240, n/ep=1, n/st=100, rew=15060.50]                                                                                


Epoch #97: test_reward: 9709.500000 ± 3036.360165, best_reward: 19277.000000 ± 6500.800212 in #11


Epoch #98: 1001it [00:03, 316.76it/s, env_step=98000, gradient_step=9800, len=96, n/ep=0, n/st=100, rew=4163.50]                                                                                  


Epoch #98: test_reward: 14748.800000 ± 7087.181736, best_reward: 19277.000000 ± 6500.800212 in #11


Epoch #99: 1001it [00:03, 270.22it/s, env_step=99000, gradient_step=9900, len=285, n/ep=1, n/st=100, rew=18423.00]                                                                                


Epoch #99: test_reward: 10313.100000 ± 2960.073731, best_reward: 19277.000000 ± 6500.800212 in #11


Epoch #100: 1001it [00:03, 262.17it/s, env_step=100000, gradient_step=10000, len=171, n/ep=1, n/st=100, rew=8686.00]                                                                              


Epoch #100: test_reward: 11453.300000 ± 4689.960044, best_reward: 19277.000000 ± 6500.800212 in #11


Epoch #101: 1001it [00:03, 268.43it/s, env_step=101000, gradient_step=10100, len=400, n/ep=0, n/st=100, rew=24546.50]                                                                             


Epoch #101: test_reward: 8865.700000 ± 8365.135074, best_reward: 19277.000000 ± 6500.800212 in #11


Epoch #102: 1001it [00:03, 272.61it/s, env_step=102000, gradient_step=10200, len=250, n/ep=0, n/st=100, rew=15203.50]                                                                             


Epoch #102: test_reward: 18024.800000 ± 3224.131009, best_reward: 19277.000000 ± 6500.800212 in #11


Epoch #103: 1001it [00:03, 252.19it/s, env_step=103000, gradient_step=10300, len=214, n/ep=1, n/st=100, rew=13289.00]                                                                             


Epoch #103: test_reward: 11113.300000 ± 4561.172920, best_reward: 19277.000000 ± 6500.800212 in #11


Epoch #104: 1001it [00:03, 264.78it/s, env_step=104000, gradient_step=10400, len=152, n/ep=1, n/st=100, rew=7621.00]                                                                              


Epoch #104: test_reward: 10377.800000 ± 6476.200689, best_reward: 19277.000000 ± 6500.800212 in #11


Epoch #105: 1001it [00:03, 301.69it/s, env_step=105000, gradient_step=10500, len=261, n/ep=1, n/st=100, rew=15325.00]                                                                             


Epoch #105: test_reward: 13279.400000 ± 4929.384225, best_reward: 19277.000000 ± 6500.800212 in #11


Epoch #106: 1001it [00:03, 279.73it/s, env_step=106000, gradient_step=10600, len=114, n/ep=1, n/st=100, rew=4763.00]                                                                              


Epoch #106: test_reward: 9327.000000 ± 3969.254565, best_reward: 19277.000000 ± 6500.800212 in #11


Epoch #107: 1001it [00:03, 311.24it/s, env_step=107000, gradient_step=10700, len=253, n/ep=0, n/st=100, rew=16237.50]                                                                             


Epoch #107: test_reward: 9861.600000 ± 5255.293069, best_reward: 19277.000000 ± 6500.800212 in #11


Epoch #108: 1001it [00:03, 268.91it/s, env_step=108000, gradient_step=10800, len=163, n/ep=1, n/st=100, rew=8216.00]                                                                              


Epoch #108: test_reward: 7684.900000 ± 5709.685849, best_reward: 19277.000000 ± 6500.800212 in #11


Epoch #109: 1001it [00:03, 332.38it/s, env_step=109000, gradient_step=10900, len=134, n/ep=0, n/st=100, rew=6088.00]                                                                              


Epoch #109: test_reward: 14320.800000 ± 5593.879455, best_reward: 19277.000000 ± 6500.800212 in #11


Epoch #110: 1001it [00:03, 277.62it/s, env_step=110000, gradient_step=11000, len=106, n/ep=1, n/st=100, rew=5398.00]                                                                              


Epoch #110: test_reward: 8620.500000 ± 3222.425771, best_reward: 19277.000000 ± 6500.800212 in #11


Epoch #111: 1001it [00:03, 312.12it/s, env_step=111000, gradient_step=11100, len=110, n/ep=1, n/st=100, rew=5651.00]                                                                              


Epoch #111: test_reward: 16161.100000 ± 4605.675617, best_reward: 19277.000000 ± 6500.800212 in #11


Epoch #112: 1001it [00:03, 267.30it/s, env_step=112000, gradient_step=11200, len=201, n/ep=2, n/st=100, rew=11347.75]                                                                             


Epoch #112: test_reward: 9952.400000 ± 5759.997625, best_reward: 19277.000000 ± 6500.800212 in #11


Epoch #113: 1001it [00:03, 332.81it/s, env_step=113000, gradient_step=11300, len=153, n/ep=1, n/st=100, rew=8155.00]                                                                              


Epoch #113: test_reward: 10129.600000 ± 3978.818096, best_reward: 19277.000000 ± 6500.800212 in #11


Epoch #114: 1001it [00:03, 302.89it/s, env_step=114000, gradient_step=11400, len=345, n/ep=0, n/st=100, rew=21731.00]                                                                             


Epoch #114: test_reward: 12433.300000 ± 4572.063583, best_reward: 19277.000000 ± 6500.800212 in #11


Epoch #115: 1001it [00:03, 273.41it/s, env_step=115000, gradient_step=11500, len=163, n/ep=0, n/st=100, rew=9087.50]                                                                              


Epoch #115: test_reward: 8490.600000 ± 4665.583355, best_reward: 19277.000000 ± 6500.800212 in #11


Epoch #116: 1001it [00:03, 255.68it/s, env_step=116000, gradient_step=11600, len=245, n/ep=0, n/st=100, rew=14461.00]                                                                             


Epoch #116: test_reward: 13603.500000 ± 5443.638512, best_reward: 19277.000000 ± 6500.800212 in #11


Epoch #117: 1001it [00:03, 275.54it/s, env_step=117000, gradient_step=11700, len=235, n/ep=1, n/st=100, rew=14528.00]                                                                             


Epoch #117: test_reward: 12861.600000 ± 6799.173129, best_reward: 19277.000000 ± 6500.800212 in #11


Epoch #118: 1001it [00:03, 285.07it/s, env_step=118000, gradient_step=11800, len=115, n/ep=1, n/st=100, rew=5228.00]                                                                              


Epoch #118: test_reward: 9685.800000 ± 5198.351235, best_reward: 19277.000000 ± 6500.800212 in #11


Epoch #119: 1001it [00:04, 247.94it/s, env_step=119000, gradient_step=11900, len=70, n/ep=1, n/st=100, rew=3030.50]                                                                               


Epoch #119: test_reward: 6864.200000 ± 3132.641786, best_reward: 19277.000000 ± 6500.800212 in #11


Epoch #120: 1001it [00:04, 222.43it/s, env_step=120000, gradient_step=12000, len=188, n/ep=1, n/st=100, rew=10487.00]                                                                             


Epoch #120: test_reward: 8595.200000 ± 4575.661543, best_reward: 19277.000000 ± 6500.800212 in #11


Epoch #121: 1001it [00:03, 274.68it/s, env_step=121000, gradient_step=12100, len=247, n/ep=1, n/st=100, rew=12386.00]                                                                             


Epoch #121: test_reward: 3540.200000 ± 4156.982988, best_reward: 19277.000000 ± 6500.800212 in #11


Epoch #122: 1001it [00:03, 284.07it/s, env_step=122000, gradient_step=12200, len=157, n/ep=0, n/st=100, rew=8742.00]                                                                              


Epoch #122: test_reward: 10435.900000 ± 6076.328553, best_reward: 19277.000000 ± 6500.800212 in #11


Epoch #123: 1001it [00:04, 231.46it/s, env_step=123000, gradient_step=12300, len=208, n/ep=1, n/st=100, rew=10860.50]                                                                             


Epoch #123: test_reward: 10995.500000 ± 2984.602461, best_reward: 19277.000000 ± 6500.800212 in #11


Epoch #124: 1001it [00:03, 281.10it/s, env_step=124000, gradient_step=12400, len=95, n/ep=1, n/st=100, rew=5011.00]                                                                               


Epoch #124: test_reward: 8212.000000 ± 2945.986286, best_reward: 19277.000000 ± 6500.800212 in #11


Epoch #125: 1001it [00:03, 305.90it/s, env_step=125000, gradient_step=12500, len=231, n/ep=0, n/st=100, rew=12801.00]                                                                             


Epoch #125: test_reward: 11283.800000 ± 4470.065968, best_reward: 19277.000000 ± 6500.800212 in #11


Epoch #126: 1001it [00:03, 264.80it/s, env_step=126000, gradient_step=12600, len=145, n/ep=1, n/st=100, rew=8087.00]                                                                              


Epoch #126: test_reward: 5243.000000 ± 3052.807921, best_reward: 19277.000000 ± 6500.800212 in #11


Epoch #127: 1001it [00:03, 256.04it/s, env_step=127000, gradient_step=12700, len=153, n/ep=0, n/st=100, rew=8817.00]                                                                              


Epoch #127: test_reward: 9463.000000 ± 1696.188492, best_reward: 19277.000000 ± 6500.800212 in #11


Epoch #128: 1001it [00:03, 255.76it/s, env_step=128000, gradient_step=12800, len=353, n/ep=1, n/st=100, rew=21903.00]                                                                             


Epoch #128: test_reward: 13559.100000 ± 5597.728494, best_reward: 19277.000000 ± 6500.800212 in #11


Epoch #129: 1001it [00:03, 285.78it/s, env_step=129000, gradient_step=12900, len=190, n/ep=1, n/st=100, rew=10713.50]                                                                             


Epoch #129: test_reward: 9239.400000 ± 3851.287063, best_reward: 19277.000000 ± 6500.800212 in #11


Epoch #130: 1001it [00:03, 284.73it/s, env_step=130000, gradient_step=13000, len=218, n/ep=0, n/st=100, rew=13017.00]                                                                             


Epoch #130: test_reward: 8803.500000 ± 3304.754250, best_reward: 19277.000000 ± 6500.800212 in #11


Epoch #131: 1001it [00:04, 216.86it/s, env_step=131000, gradient_step=13100, len=90, n/ep=0, n/st=100, rew=4350.00]                                                                               


Epoch #131: test_reward: 10028.600000 ± 5810.477901, best_reward: 19277.000000 ± 6500.800212 in #11


Epoch #132: 1001it [00:03, 304.23it/s, env_step=132000, gradient_step=13200, len=230, n/ep=0, n/st=100, rew=14030.00]                                                                             


Epoch #132: test_reward: 11660.000000 ± 6432.388934, best_reward: 19277.000000 ± 6500.800212 in #11


Epoch #133: 1001it [00:03, 275.79it/s, env_step=133000, gradient_step=13300, len=162, n/ep=2, n/st=100, rew=8502.00]                                                                              


Epoch #133: test_reward: 10471.500000 ± 3171.231188, best_reward: 19277.000000 ± 6500.800212 in #11


Epoch #134: 1001it [00:03, 265.04it/s, env_step=134000, gradient_step=13400, len=127, n/ep=1, n/st=100, rew=5327.00]                                                                              


Epoch #134: test_reward: 14185.000000 ± 5978.116258, best_reward: 19277.000000 ± 6500.800212 in #11


Epoch #135: 1001it [00:04, 245.48it/s, env_step=135000, gradient_step=13500, len=118, n/ep=0, n/st=100, rew=6470.00]                                                                              


Epoch #135: test_reward: 6976.200000 ± 3997.054360, best_reward: 19277.000000 ± 6500.800212 in #11


Epoch #136: 1001it [00:03, 266.51it/s, env_step=136000, gradient_step=13600, len=134, n/ep=0, n/st=100, rew=6984.75]                                                                              


Epoch #136: test_reward: 13986.900000 ± 7024.190138, best_reward: 19277.000000 ± 6500.800212 in #11


Epoch #137: 1001it [00:03, 314.49it/s, env_step=137000, gradient_step=13700, len=137, n/ep=0, n/st=100, rew=7344.00]                                                                              


Epoch #137: test_reward: 12581.100000 ± 6341.824886, best_reward: 19277.000000 ± 6500.800212 in #11


Epoch #138: 1001it [00:03, 271.19it/s, env_step=138000, gradient_step=13800, len=137, n/ep=0, n/st=100, rew=7232.50]                                                                              


Epoch #138: test_reward: 11856.600000 ± 4830.828442, best_reward: 19277.000000 ± 6500.800212 in #11


Epoch #139: 1001it [00:03, 294.55it/s, env_step=139000, gradient_step=13900, len=187, n/ep=1, n/st=100, rew=11358.50]                                                                             


Epoch #139: test_reward: 15378.400000 ± 4247.923639, best_reward: 19277.000000 ± 6500.800212 in #11


Epoch #140: 1001it [00:03, 286.59it/s, env_step=140000, gradient_step=14000, len=145, n/ep=1, n/st=100, rew=8185.00]                                                                              


Epoch #140: test_reward: 15223.500000 ± 5622.464411, best_reward: 19277.000000 ± 6500.800212 in #11


Epoch #141: 1001it [00:03, 259.44it/s, env_step=141000, gradient_step=14100, len=243, n/ep=0, n/st=100, rew=14593.00]                                                                             


Epoch #141: test_reward: 9514.400000 ± 3654.886898, best_reward: 19277.000000 ± 6500.800212 in #11


Epoch #142: 1001it [00:03, 290.74it/s, env_step=142000, gradient_step=14200, len=128, n/ep=0, n/st=100, rew=7061.00]                                                                              


Epoch #142: test_reward: 11540.000000 ± 4262.367042, best_reward: 19277.000000 ± 6500.800212 in #11


Epoch #143: 1001it [00:03, 264.50it/s, env_step=143000, gradient_step=14300, len=251, n/ep=0, n/st=100, rew=15066.00]                                                                             


Epoch #143: test_reward: 9808.700000 ± 4075.791900, best_reward: 19277.000000 ± 6500.800212 in #11


Epoch #144: 1001it [00:03, 266.36it/s, env_step=144000, gradient_step=14400, len=204, n/ep=1, n/st=100, rew=11071.00]                                                                             


Epoch #144: test_reward: 9987.400000 ± 3386.680268, best_reward: 19277.000000 ± 6500.800212 in #11


Epoch #145: 1001it [00:03, 285.55it/s, env_step=145000, gradient_step=14500, len=95, n/ep=0, n/st=100, rew=4912.50]                                                                               


Epoch #145: test_reward: 11472.900000 ± 3404.495394, best_reward: 19277.000000 ± 6500.800212 in #11


Epoch #146: 1001it [00:03, 308.40it/s, env_step=146000, gradient_step=14600, len=196, n/ep=0, n/st=100, rew=10524.50]                                                                             


Epoch #146: test_reward: 11547.800000 ± 3785.222366, best_reward: 19277.000000 ± 6500.800212 in #11


Epoch #147: 1001it [00:03, 313.17it/s, env_step=147000, gradient_step=14700, len=110, n/ep=2, n/st=100, rew=5563.25]                                                                              


Epoch #147: test_reward: 13013.200000 ± 3366.866579, best_reward: 19277.000000 ± 6500.800212 in #11


Epoch #148: 1001it [00:03, 304.94it/s, env_step=148000, gradient_step=14800, len=123, n/ep=0, n/st=100, rew=6903.00]                                                                              


Epoch #148: test_reward: 11083.700000 ± 5012.303982, best_reward: 19277.000000 ± 6500.800212 in #11


Epoch #149: 1001it [00:03, 278.51it/s, env_step=149000, gradient_step=14900, len=176, n/ep=0, n/st=100, rew=10160.50]                                                                             


Epoch #149: test_reward: 10754.600000 ± 5850.648720, best_reward: 19277.000000 ± 6500.800212 in #11


Epoch #150: 1001it [00:03, 302.58it/s, env_step=150000, gradient_step=15000, len=183, n/ep=0, n/st=100, rew=8710.00]                                                                              


Epoch #150: test_reward: 9574.900000 ± 6218.050570, best_reward: 19277.000000 ± 6500.800212 in #11

InfoStats(gradient_step=15000, best_reward=19277.0, best_reward_std=6500.8002122815615, train_step=150000, train_episode=713, test_step=349616, test_episode=1510, timing=TimingStats(total_time=822.9745600223541, train_time=543.2883160114288, train_time_collect=63.859177589416504, train_time_update=471.4319431781769, test_time=279.6862440109253, update_speed=276.09649532172995))

(the trained policy can be accessed via policy.policies[agents[1]])
Train with agent_opponent #14


Epoch #1: 1001it [00:03, 333.50it/s, env_step=1000, gradient_step=100, len=0, n/ep=0, n/st=100, rew=0.00]                                                                                         


Epoch #1: test_reward: 8192.100000 ± 3197.162256, best_reward: 8192.100000 ± 3197.162256 in #1


Epoch #2: 1001it [00:03, 314.84it/s, env_step=2000, gradient_step=200, len=0, n/ep=0, n/st=100, rew=0.00]                                                                                         


Epoch #2: test_reward: 10220.600000 ± 4608.549711, best_reward: 10220.600000 ± 4608.549711 in #2


Epoch #3: 1001it [00:02, 360.81it/s, env_step=3000, gradient_step=300, len=0, n/ep=0, n/st=100, rew=0.00]                                                                                         


Epoch #3: test_reward: 17210.400000 ± 5284.975217, best_reward: 17210.400000 ± 5284.975217 in #3


Epoch #4: 1001it [00:02, 344.65it/s, env_step=4000, gradient_step=400, len=39, n/ep=0, n/st=100, rew=1257.00]                                                                                     


Epoch #4: test_reward: 10259.000000 ± 6633.416917, best_reward: 17210.400000 ± 5284.975217 in #3


Epoch #5: 1001it [00:02, 353.24it/s, env_step=5000, gradient_step=500, len=39, n/ep=0, n/st=100, rew=1257.00]                                                                                     


Epoch #5: test_reward: 12857.600000 ± 5182.293917, best_reward: 17210.400000 ± 5284.975217 in #3


Epoch #6: 1001it [00:02, 337.67it/s, env_step=6000, gradient_step=600, len=59, n/ep=0, n/st=100, rew=2365.00]                                                                                     


Epoch #6: test_reward: 9153.300000 ± 5871.248761, best_reward: 17210.400000 ± 5284.975217 in #3


Epoch #7: 1001it [00:02, 334.12it/s, env_step=7000, gradient_step=700, len=59, n/ep=0, n/st=100, rew=2365.00]                                                                                     


Epoch #7: test_reward: 12463.000000 ± 6102.542781, best_reward: 17210.400000 ± 5284.975217 in #3


Epoch #8: 1001it [00:02, 333.78it/s, env_step=8000, gradient_step=800, len=73, n/ep=0, n/st=100, rew=3247.00]                                                                                     


Epoch #8: test_reward: 10291.400000 ± 4249.982311, best_reward: 17210.400000 ± 5284.975217 in #3


Epoch #9: 1001it [00:03, 286.45it/s, env_step=9000, gradient_step=900, len=81, n/ep=0, n/st=100, rew=3303.00]                                                                                     


Epoch #9: test_reward: 13111.600000 ± 6698.833021, best_reward: 17210.400000 ± 5284.975217 in #3


Epoch #10: 1001it [00:03, 317.70it/s, env_step=10000, gradient_step=1000, len=95, n/ep=0, n/st=100, rew=2902.00]                                                                                  


Epoch #10: test_reward: 10310.900000 ± 5086.010626, best_reward: 17210.400000 ± 5284.975217 in #3


Epoch #11: 1001it [00:03, 307.19it/s, env_step=11000, gradient_step=1100, len=109, n/ep=0, n/st=100, rew=4408.50]                                                                                 


Epoch #11: test_reward: 13427.000000 ± 8441.475428, best_reward: 17210.400000 ± 5284.975217 in #3


Epoch #12: 1001it [00:03, 323.46it/s, env_step=12000, gradient_step=1200, len=109, n/ep=0, n/st=100, rew=4408.50]                                                                                 


Epoch #12: test_reward: 9101.900000 ± 5936.740157, best_reward: 17210.400000 ± 5284.975217 in #3


Epoch #13: 1001it [00:03, 299.12it/s, env_step=13000, gradient_step=1300, len=123, n/ep=0, n/st=100, rew=5060.00]                                                                                 


Epoch #13: test_reward: 4503.100000 ± 4030.562801, best_reward: 17210.400000 ± 5284.975217 in #3


Epoch #14: 1001it [00:04, 248.88it/s, env_step=14000, gradient_step=1400, len=132, n/ep=0, n/st=100, rew=7029.00]                                                                                 


Epoch #14: test_reward: 14540.300000 ± 3469.663674, best_reward: 17210.400000 ± 5284.975217 in #3


Epoch #15: 1001it [00:03, 331.96it/s, env_step=15000, gradient_step=1500, len=149, n/ep=0, n/st=100, rew=6819.00]                                                                                 


Epoch #15: test_reward: 14840.600000 ± 5764.427035, best_reward: 17210.400000 ± 5284.975217 in #3


Epoch #16: 1001it [00:03, 325.94it/s, env_step=16000, gradient_step=1600, len=157, n/ep=0, n/st=100, rew=5681.00]                                                                                 


Epoch #16: test_reward: 5332.400000 ± 4718.475245, best_reward: 17210.400000 ± 5284.975217 in #3


Epoch #17: 1001it [00:03, 269.89it/s, env_step=17000, gradient_step=1700, len=164, n/ep=0, n/st=100, rew=7575.50]                                                                                 


Epoch #17: test_reward: 11075.700000 ± 3004.806551, best_reward: 17210.400000 ± 5284.975217 in #3


Epoch #18: 1001it [00:03, 322.52it/s, env_step=18000, gradient_step=1800, len=177, n/ep=0, n/st=100, rew=8988.00]                                                                                 


Epoch #18: test_reward: 11594.600000 ± 5278.562384, best_reward: 17210.400000 ± 5284.975217 in #3


Epoch #19: 1001it [00:03, 327.50it/s, env_step=19000, gradient_step=1900, len=181, n/ep=0, n/st=100, rew=7690.50]                                                                                 


Epoch #19: test_reward: 18457.600000 ± 5562.148042, best_reward: 18457.600000 ± 5562.148042 in #19


Epoch #20: 1001it [00:03, 290.44it/s, env_step=20000, gradient_step=2000, len=199, n/ep=0, n/st=100, rew=9368.50]                                                                                 


Epoch #20: test_reward: 9336.600000 ± 2447.380118, best_reward: 18457.600000 ± 5562.148042 in #19


Epoch #21: 1001it [00:03, 309.57it/s, env_step=21000, gradient_step=2100, len=210, n/ep=2, n/st=100, rew=11187.50]                                                                                


Epoch #21: test_reward: 6563.100000 ± 5200.166968, best_reward: 18457.600000 ± 5562.148042 in #19


Epoch #22: 1001it [00:03, 273.39it/s, env_step=22000, gradient_step=2200, len=170, n/ep=3, n/st=100, rew=8213.33]                                                                                 


Epoch #22: test_reward: 12221.000000 ± 5227.707452, best_reward: 18457.600000 ± 5562.148042 in #19


Epoch #23: 1001it [00:03, 306.71it/s, env_step=23000, gradient_step=2300, len=228, n/ep=0, n/st=100, rew=12685.50]                                                                                


Epoch #23: test_reward: 7627.000000 ± 2866.232440, best_reward: 18457.600000 ± 5562.148042 in #19


Epoch #24: 1001it [00:03, 272.33it/s, env_step=24000, gradient_step=2400, len=238, n/ep=0, n/st=100, rew=12782.50]                                                                                


Epoch #24: test_reward: 6904.800000 ± 3072.468805, best_reward: 18457.600000 ± 5562.148042 in #19


Epoch #25: 1001it [00:03, 315.43it/s, env_step=25000, gradient_step=2500, len=250, n/ep=2, n/st=100, rew=13491.75]                                                                                


Epoch #25: test_reward: 5553.200000 ± 3045.774411, best_reward: 18457.600000 ± 5562.148042 in #19


Epoch #26: 1001it [00:03, 314.26it/s, env_step=26000, gradient_step=2600, len=260, n/ep=1, n/st=100, rew=12838.00]                                                                                


Epoch #26: test_reward: 8992.500000 ± 5509.543252, best_reward: 18457.600000 ± 5562.148042 in #19


Epoch #27: 1001it [00:03, 290.19it/s, env_step=27000, gradient_step=2700, len=188, n/ep=0, n/st=100, rew=8830.00]                                                                                 


Epoch #27: test_reward: 5285.500000 ± 2814.387651, best_reward: 18457.600000 ± 5562.148042 in #19


Epoch #28: 1001it [00:03, 292.37it/s, env_step=28000, gradient_step=2800, len=280, n/ep=1, n/st=100, rew=15397.50]                                                                                


Epoch #28: test_reward: 15044.900000 ± 6112.339379, best_reward: 18457.600000 ± 5562.148042 in #19


Epoch #29: 1001it [00:02, 338.51it/s, env_step=29000, gradient_step=2900, len=83, n/ep=0, n/st=100, rew=4142.00]                                                                                  


Epoch #29: test_reward: 8852.400000 ± 2927.030003, best_reward: 18457.600000 ± 5562.148042 in #19


Epoch #30: 1001it [00:03, 298.71it/s, env_step=30000, gradient_step=3000, len=250, n/ep=0, n/st=100, rew=13730.00]                                                                                


Epoch #30: test_reward: 15893.400000 ± 6731.107624, best_reward: 18457.600000 ± 5562.148042 in #19


Epoch #31: 1001it [00:03, 308.63it/s, env_step=31000, gradient_step=3100, len=308, n/ep=0, n/st=100, rew=17278.00]                                                                                


Epoch #31: test_reward: 7384.100000 ± 4434.231398, best_reward: 18457.600000 ± 5562.148042 in #19


Epoch #32: 1001it [00:03, 277.34it/s, env_step=32000, gradient_step=3200, len=161, n/ep=0, n/st=100, rew=9259.50]                                                                                 


Epoch #32: test_reward: 8234.600000 ± 5712.645065, best_reward: 18457.600000 ± 5562.148042 in #19


Epoch #33: 1001it [00:03, 318.00it/s, env_step=33000, gradient_step=3300, len=140, n/ep=0, n/st=100, rew=7605.75]                                                                                 


Epoch #33: test_reward: 6423.600000 ± 3207.752335, best_reward: 18457.600000 ± 5562.148042 in #19


Epoch #34: 1001it [00:03, 328.71it/s, env_step=34000, gradient_step=3400, len=119, n/ep=1, n/st=100, rew=6593.00]                                                                                 


Epoch #34: test_reward: 10935.200000 ± 3909.002451, best_reward: 18457.600000 ± 5562.148042 in #19


Epoch #35: 1001it [00:03, 289.36it/s, env_step=35000, gradient_step=3500, len=277, n/ep=1, n/st=100, rew=16326.50]                                                                                


Epoch #35: test_reward: 8756.500000 ± 4008.908112, best_reward: 18457.600000 ± 5562.148042 in #19


Epoch #36: 1001it [00:03, 301.68it/s, env_step=36000, gradient_step=3600, len=244, n/ep=0, n/st=100, rew=15276.00]                                                                                


Epoch #36: test_reward: 6987.700000 ± 5629.581975, best_reward: 18457.600000 ± 5562.148042 in #19


Epoch #37: 1001it [00:03, 296.76it/s, env_step=37000, gradient_step=3700, len=104, n/ep=0, n/st=100, rew=6367.00]                                                                                 


Epoch #37: test_reward: 9243.800000 ± 4489.787808, best_reward: 18457.600000 ± 5562.148042 in #19


Epoch #38: 1001it [00:03, 322.23it/s, env_step=38000, gradient_step=3800, len=57, n/ep=0, n/st=100, rew=2576.00]                                                                                  


Epoch #38: test_reward: 12220.300000 ± 4809.618593, best_reward: 18457.600000 ± 5562.148042 in #19


Epoch #39: 1001it [00:03, 299.39it/s, env_step=39000, gradient_step=3900, len=96, n/ep=0, n/st=100, rew=3986.00]                                                                                  


Epoch #39: test_reward: 19390.600000 ± 4412.388655, best_reward: 19390.600000 ± 4412.388655 in #39


Epoch #40: 1001it [00:03, 294.33it/s, env_step=40000, gradient_step=4000, len=350, n/ep=4, n/st=100, rew=19928.50]                                                                                


Epoch #40: test_reward: 8176.200000 ± 5794.619052, best_reward: 19390.600000 ± 4412.388655 in #39


Epoch #41: 1001it [00:03, 272.03it/s, env_step=41000, gradient_step=4100, len=158, n/ep=0, n/st=100, rew=5401.00]                                                                                 


Epoch #41: test_reward: 15519.400000 ± 3952.608359, best_reward: 19390.600000 ± 4412.388655 in #39


Epoch #42: 1001it [00:03, 280.83it/s, env_step=42000, gradient_step=4200, len=171, n/ep=1, n/st=100, rew=10427.50]                                                                                


Epoch #42: test_reward: 7355.300000 ± 2620.950135, best_reward: 19390.600000 ± 4412.388655 in #39


Epoch #43: 1001it [00:03, 290.18it/s, env_step=43000, gradient_step=4300, len=162, n/ep=0, n/st=100, rew=8403.75]                                                                                 


Epoch #43: test_reward: 12746.300000 ± 4110.826512, best_reward: 19390.600000 ± 4412.388655 in #39


Epoch #44: 1001it [00:03, 290.92it/s, env_step=44000, gradient_step=4400, len=190, n/ep=0, n/st=100, rew=11120.50]                                                                                


Epoch #44: test_reward: 8205.700000 ± 3228.977425, best_reward: 19390.600000 ± 4412.388655 in #39


Epoch #45: 1001it [00:03, 272.67it/s, env_step=45000, gradient_step=4500, len=227, n/ep=1, n/st=100, rew=14238.00]                                                                                


Epoch #45: test_reward: 11405.500000 ± 6366.565028, best_reward: 19390.600000 ± 4412.388655 in #39


Epoch #46: 1001it [00:03, 267.47it/s, env_step=46000, gradient_step=4600, len=203, n/ep=0, n/st=100, rew=12440.25]                                                                                


Epoch #46: test_reward: 4367.000000 ± 2490.482885, best_reward: 19390.600000 ± 4412.388655 in #39


Epoch #47: 1001it [00:04, 226.27it/s, env_step=47000, gradient_step=4700, len=192, n/ep=0, n/st=100, rew=11600.00]                                                                                


Epoch #47: test_reward: 9792.400000 ± 2945.795791, best_reward: 19390.600000 ± 4412.388655 in #39


Epoch #48: 1001it [00:04, 219.92it/s, env_step=48000, gradient_step=4800, len=192, n/ep=0, n/st=100, rew=11600.00]                                                                                


Epoch #48: test_reward: 11732.900000 ± 2670.916713, best_reward: 19390.600000 ± 4412.388655 in #39


Epoch #49: 1001it [00:03, 299.83it/s, env_step=49000, gradient_step=4900, len=179, n/ep=1, n/st=100, rew=10294.00]                                                                                


Epoch #49: test_reward: 10000.200000 ± 4859.659367, best_reward: 19390.600000 ± 4412.388655 in #39


Epoch #50: 1001it [00:04, 234.27it/s, env_step=50000, gradient_step=5000, len=136, n/ep=0, n/st=100, rew=6504.00]                                                                                 


Epoch #50: test_reward: 12503.200000 ± 3430.710125, best_reward: 19390.600000 ± 4412.388655 in #39


Epoch #51: 1001it [00:03, 284.29it/s, env_step=51000, gradient_step=5100, len=203, n/ep=0, n/st=100, rew=11581.00]                                                                                


Epoch #51: test_reward: 15323.400000 ± 5898.861639, best_reward: 19390.600000 ± 4412.388655 in #39


Epoch #52: 1001it [00:03, 319.45it/s, env_step=52000, gradient_step=5200, len=210, n/ep=0, n/st=100, rew=12066.50]                                                                                


Epoch #52: test_reward: 7061.000000 ± 3190.889186, best_reward: 19390.600000 ± 4412.388655 in #39


Epoch #53: 1001it [00:03, 303.21it/s, env_step=53000, gradient_step=5300, len=193, n/ep=0, n/st=100, rew=10081.00]                                                                                


Epoch #53: test_reward: 14138.300000 ± 5127.046519, best_reward: 19390.600000 ± 4412.388655 in #39


Epoch #54: 1001it [00:04, 236.46it/s, env_step=54000, gradient_step=5400, len=246, n/ep=0, n/st=100, rew=15247.00]                                                                                


Epoch #54: test_reward: 17214.600000 ± 5526.105685, best_reward: 19390.600000 ± 4412.388655 in #39


Epoch #55: 1001it [00:03, 263.85it/s, env_step=55000, gradient_step=5500, len=269, n/ep=0, n/st=100, rew=16394.00]                                                                                


Epoch #55: test_reward: 10771.800000 ± 5318.122560, best_reward: 19390.600000 ± 4412.388655 in #39


Epoch #56: 1001it [00:03, 295.72it/s, env_step=56000, gradient_step=5600, len=282, n/ep=0, n/st=100, rew=16934.00]                                                                                


Epoch #56: test_reward: 12433.400000 ± 5537.980249, best_reward: 19390.600000 ± 4412.388655 in #39


Epoch #57: 1001it [00:03, 279.70it/s, env_step=57000, gradient_step=5700, len=170, n/ep=1, n/st=100, rew=9767.00]                                                                                 


Epoch #57: test_reward: 8832.200000 ± 4774.128628, best_reward: 19390.600000 ± 4412.388655 in #39


Epoch #58: 1001it [00:03, 262.43it/s, env_step=58000, gradient_step=5800, len=144, n/ep=0, n/st=100, rew=6717.00]                                                                                 


Epoch #58: test_reward: 6433.800000 ± 2578.574715, best_reward: 19390.600000 ± 4412.388655 in #39


Epoch #59: 1001it [00:03, 267.26it/s, env_step=59000, gradient_step=5900, len=231, n/ep=0, n/st=100, rew=12667.00]                                                                                


Epoch #59: test_reward: 17811.700000 ± 6944.087615, best_reward: 19390.600000 ± 4412.388655 in #39


Epoch #60: 1001it [00:03, 272.95it/s, env_step=60000, gradient_step=6000, len=213, n/ep=0, n/st=100, rew=11896.50]                                                                                


Epoch #60: test_reward: 7533.900000 ± 1723.003799, best_reward: 19390.600000 ± 4412.388655 in #39


Epoch #61: 1001it [00:03, 306.67it/s, env_step=61000, gradient_step=6100, len=214, n/ep=1, n/st=100, rew=12712.00]                                                                                


Epoch #61: test_reward: 14857.600000 ± 4424.145278, best_reward: 19390.600000 ± 4412.388655 in #39


Epoch #62: 1001it [00:03, 282.87it/s, env_step=62000, gradient_step=6200, len=234, n/ep=0, n/st=100, rew=13624.50]                                                                                


Epoch #62: test_reward: 14384.600000 ± 4620.512162, best_reward: 19390.600000 ± 4412.388655 in #39


Epoch #63: 1001it [00:03, 288.79it/s, env_step=63000, gradient_step=6300, len=235, n/ep=2, n/st=100, rew=13834.50]                                                                                


Epoch #63: test_reward: 9364.300000 ± 4277.769280, best_reward: 19390.600000 ± 4412.388655 in #39


Epoch #64: 1001it [00:03, 252.55it/s, env_step=64000, gradient_step=6400, len=218, n/ep=1, n/st=100, rew=14276.50]                                                                                


Epoch #64: test_reward: 14116.500000 ± 4804.027003, best_reward: 19390.600000 ± 4412.388655 in #39


Epoch #65: 1001it [00:03, 258.12it/s, env_step=65000, gradient_step=6500, len=129, n/ep=1, n/st=100, rew=7161.00]                                                                                 


Epoch #65: test_reward: 10498.400000 ± 5279.485017, best_reward: 19390.600000 ± 4412.388655 in #39


Epoch #66: 1001it [00:03, 269.13it/s, env_step=66000, gradient_step=6600, len=173, n/ep=1, n/st=100, rew=11001.00]                                                                                


Epoch #66: test_reward: 9197.700000 ± 5047.108956, best_reward: 19390.600000 ± 4412.388655 in #39


Epoch #67: 1001it [00:03, 252.54it/s, env_step=67000, gradient_step=6700, len=143, n/ep=0, n/st=100, rew=8178.00]                                                                                 


Epoch #67: test_reward: 10738.900000 ± 3835.224125, best_reward: 19390.600000 ± 4412.388655 in #39


Epoch #68: 1001it [00:03, 312.92it/s, env_step=68000, gradient_step=6800, len=226, n/ep=0, n/st=100, rew=12784.25]                                                                                


Epoch #68: test_reward: 15959.500000 ± 4855.718551, best_reward: 19390.600000 ± 4412.388655 in #39


Epoch #69: 1001it [00:03, 303.05it/s, env_step=69000, gradient_step=6900, len=249, n/ep=0, n/st=100, rew=14705.00]                                                                                


Epoch #69: test_reward: 12594.400000 ± 5462.315245, best_reward: 19390.600000 ± 4412.388655 in #39


Epoch #70: 1001it [00:03, 306.10it/s, env_step=70000, gradient_step=7000, len=281, n/ep=0, n/st=100, rew=17431.00]                                                                                


Epoch #70: test_reward: 12242.200000 ± 5465.026510, best_reward: 19390.600000 ± 4412.388655 in #39


Epoch #71: 1001it [00:03, 302.47it/s, env_step=71000, gradient_step=7100, len=220, n/ep=0, n/st=100, rew=12276.00]                                                                                


Epoch #71: test_reward: 12808.900000 ± 6134.671311, best_reward: 19390.600000 ± 4412.388655 in #39


Epoch #72: 1001it [00:03, 314.77it/s, env_step=72000, gradient_step=7200, len=109, n/ep=0, n/st=100, rew=5877.00]                                                                                 


Epoch #72: test_reward: 13307.000000 ± 8624.811615, best_reward: 19390.600000 ± 4412.388655 in #39


Epoch #73: 1001it [00:03, 309.90it/s, env_step=73000, gradient_step=7300, len=87, n/ep=0, n/st=100, rew=3165.00]                                                                                  


Epoch #73: test_reward: 14557.100000 ± 5354.952893, best_reward: 19390.600000 ± 4412.388655 in #39


Epoch #74: 1001it [00:03, 283.75it/s, env_step=74000, gradient_step=7400, len=128, n/ep=1, n/st=100, rew=6781.50]                                                                                 


Epoch #74: test_reward: 9955.800000 ± 4266.446432, best_reward: 19390.600000 ± 4412.388655 in #39


Epoch #75: 1001it [00:03, 260.41it/s, env_step=75000, gradient_step=7500, len=153, n/ep=1, n/st=100, rew=9456.50]                                                                                 


Epoch #75: test_reward: 11501.300000 ± 4870.235478, best_reward: 19390.600000 ± 4412.388655 in #39


Epoch #76: 1001it [00:04, 246.28it/s, env_step=76000, gradient_step=7600, len=255, n/ep=1, n/st=100, rew=15383.50]                                                                                


Epoch #76: test_reward: 7859.900000 ± 4986.327916, best_reward: 19390.600000 ± 4412.388655 in #39


Epoch #77: 1001it [00:03, 303.35it/s, env_step=77000, gradient_step=7700, len=51, n/ep=0, n/st=100, rew=2289.00]                                                                                  


Epoch #77: test_reward: 18364.900000 ± 4170.440947, best_reward: 19390.600000 ± 4412.388655 in #39


Epoch #78: 1001it [00:03, 263.74it/s, env_step=78000, gradient_step=7800, len=320, n/ep=0, n/st=100, rew=20167.50]                                                                                


Epoch #78: test_reward: 11187.800000 ± 5392.143503, best_reward: 19390.600000 ± 4412.388655 in #39


Epoch #79: 1001it [00:03, 308.53it/s, env_step=79000, gradient_step=7900, len=302, n/ep=0, n/st=100, rew=18912.00]                                                                                


Epoch #79: test_reward: 15446.100000 ± 5804.867000, best_reward: 19390.600000 ± 4412.388655 in #39


Epoch #80: 1001it [00:03, 258.72it/s, env_step=80000, gradient_step=8000, len=149, n/ep=0, n/st=100, rew=8744.50]                                                                                 


Epoch #80: test_reward: 14034.000000 ± 7393.460949, best_reward: 19390.600000 ± 4412.388655 in #39


Epoch #81: 1001it [00:03, 298.17it/s, env_step=81000, gradient_step=8100, len=174, n/ep=0, n/st=100, rew=10942.00]                                                                                


Epoch #81: test_reward: 16775.900000 ± 6571.968859, best_reward: 19390.600000 ± 4412.388655 in #39


Epoch #82: 1001it [00:02, 336.11it/s, env_step=82000, gradient_step=8200, len=233, n/ep=0, n/st=100, rew=14847.00]                                                                                


Epoch #82: test_reward: 12998.700000 ± 5324.924827, best_reward: 19390.600000 ± 4412.388655 in #39


Epoch #83: 1001it [00:03, 330.99it/s, env_step=83000, gradient_step=8300, len=339, n/ep=0, n/st=100, rew=21867.50]                                                                                


Epoch #83: test_reward: 9857.000000 ± 4048.403364, best_reward: 19390.600000 ± 4412.388655 in #39


Epoch #84: 1001it [00:03, 313.94it/s, env_step=84000, gradient_step=8400, len=189, n/ep=1, n/st=100, rew=10835.00]                                                                                


Epoch #84: test_reward: 14123.900000 ± 6364.320206, best_reward: 19390.600000 ± 4412.388655 in #39


Epoch #85: 1001it [00:03, 282.45it/s, env_step=85000, gradient_step=8500, len=243, n/ep=1, n/st=100, rew=15346.00]                                                                                


Epoch #85: test_reward: 18720.800000 ± 7211.332759, best_reward: 19390.600000 ± 4412.388655 in #39


Epoch #86: 1001it [00:03, 270.36it/s, env_step=86000, gradient_step=8600, len=185, n/ep=0, n/st=100, rew=10895.00]                                                                                


Epoch #86: test_reward: 12334.900000 ± 5678.258086, best_reward: 19390.600000 ± 4412.388655 in #39


Epoch #87: 1001it [00:03, 283.43it/s, env_step=87000, gradient_step=8700, len=243, n/ep=0, n/st=100, rew=15857.00]                                                                                


Epoch #87: test_reward: 15902.100000 ± 5557.571132, best_reward: 19390.600000 ± 4412.388655 in #39


Epoch #88: 1001it [00:03, 276.17it/s, env_step=88000, gradient_step=8800, len=279, n/ep=1, n/st=100, rew=16983.50]                                                                                


Epoch #88: test_reward: 18286.500000 ± 5212.577736, best_reward: 19390.600000 ± 4412.388655 in #39


Epoch #89: 1001it [00:03, 305.71it/s, env_step=89000, gradient_step=8900, len=276, n/ep=0, n/st=100, rew=18632.50]                                                                                


Epoch #89: test_reward: 12012.000000 ± 6581.549088, best_reward: 19390.600000 ± 4412.388655 in #39


Epoch #90: 1001it [00:03, 291.03it/s, env_step=90000, gradient_step=9000, len=141, n/ep=0, n/st=100, rew=8370.00]                                                                                 


Epoch #90: test_reward: 13613.900000 ± 7105.157344, best_reward: 19390.600000 ± 4412.388655 in #39


Epoch #91: 1001it [00:03, 279.73it/s, env_step=91000, gradient_step=9100, len=278, n/ep=0, n/st=100, rew=18134.00]                                                                                


Epoch #91: test_reward: 6862.700000 ± 3094.495857, best_reward: 19390.600000 ± 4412.388655 in #39


Epoch #92: 1001it [00:03, 315.49it/s, env_step=92000, gradient_step=9200, len=162, n/ep=0, n/st=100, rew=9374.50]                                                                                 


Epoch #92: test_reward: 17234.700000 ± 8507.982499, best_reward: 19390.600000 ± 4412.388655 in #39


Epoch #93: 1001it [00:03, 288.01it/s, env_step=93000, gradient_step=9300, len=197, n/ep=1, n/st=100, rew=12406.00]                                                                                


Epoch #93: test_reward: 14694.800000 ± 6922.326369, best_reward: 19390.600000 ± 4412.388655 in #39


Epoch #94: 1001it [00:03, 306.32it/s, env_step=94000, gradient_step=9400, len=157, n/ep=0, n/st=100, rew=9837.00]                                                                                 


Epoch #94: test_reward: 21321.500000 ± 4942.571785, best_reward: 21321.500000 ± 4942.571785 in #94


Epoch #95: 1001it [00:03, 296.90it/s, env_step=95000, gradient_step=9500, len=247, n/ep=0, n/st=100, rew=16087.25]                                                                                


Epoch #95: test_reward: 19378.800000 ± 6088.456944, best_reward: 21321.500000 ± 4942.571785 in #94


Epoch #96: 1001it [00:03, 305.46it/s, env_step=96000, gradient_step=9600, len=400, n/ep=0, n/st=100, rew=26774.00]                                                                                


Epoch #96: test_reward: 10886.600000 ± 6982.276365, best_reward: 21321.500000 ± 4942.571785 in #94


Epoch #97: 1001it [00:03, 289.64it/s, env_step=97000, gradient_step=9700, len=194, n/ep=0, n/st=100, rew=12252.50]                                                                                


Epoch #97: test_reward: 13892.100000 ± 7696.028020, best_reward: 21321.500000 ± 4942.571785 in #94


Epoch #98: 1001it [00:03, 289.68it/s, env_step=98000, gradient_step=9800, len=77, n/ep=0, n/st=100, rew=3685.00]                                                                                  


Epoch #98: test_reward: 16817.400000 ± 5994.501417, best_reward: 21321.500000 ± 4942.571785 in #94


Epoch #99: 1001it [00:03, 309.07it/s, env_step=99000, gradient_step=9900, len=226, n/ep=0, n/st=100, rew=14763.00]                                                                                


Epoch #99: test_reward: 16204.300000 ± 7327.165360, best_reward: 21321.500000 ± 4942.571785 in #94


Epoch #100: 1001it [00:03, 282.40it/s, env_step=100000, gradient_step=10000, len=224, n/ep=0, n/st=100, rew=13248.50]                                                                             


Epoch #100: test_reward: 14307.400000 ± 7531.650858, best_reward: 21321.500000 ± 4942.571785 in #94


Epoch #101: 1001it [00:03, 266.83it/s, env_step=101000, gradient_step=10100, len=209, n/ep=0, n/st=100, rew=12997.00]                                                                             


Epoch #101: test_reward: 20334.300000 ± 6161.362139, best_reward: 21321.500000 ± 4942.571785 in #94


Epoch #102: 1001it [00:03, 270.55it/s, env_step=102000, gradient_step=10200, len=213, n/ep=0, n/st=100, rew=13814.75]                                                                             


Epoch #102: test_reward: 12234.200000 ± 6114.913406, best_reward: 21321.500000 ± 4942.571785 in #94


Epoch #103: 1001it [00:03, 321.45it/s, env_step=103000, gradient_step=10300, len=173, n/ep=0, n/st=100, rew=10156.50]                                                                             


Epoch #103: test_reward: 11582.600000 ± 4168.936272, best_reward: 21321.500000 ± 4942.571785 in #94


Epoch #104: 1001it [00:03, 284.98it/s, env_step=104000, gradient_step=10400, len=266, n/ep=1, n/st=100, rew=16911.50]                                                                             


Epoch #104: test_reward: 11744.500000 ± 4529.730262, best_reward: 21321.500000 ± 4942.571785 in #94


Epoch #105: 1001it [00:03, 271.87it/s, env_step=105000, gradient_step=10500, len=199, n/ep=1, n/st=100, rew=11276.00]                                                                             


Epoch #105: test_reward: 11606.000000 ± 5980.876257, best_reward: 21321.500000 ± 4942.571785 in #94


Epoch #106: 1001it [00:03, 299.06it/s, env_step=106000, gradient_step=10600, len=106, n/ep=0, n/st=100, rew=5493.00]                                                                              


Epoch #106: test_reward: 12444.600000 ± 5600.814302, best_reward: 21321.500000 ± 4942.571785 in #94


Epoch #107: 1001it [00:03, 282.19it/s, env_step=107000, gradient_step=10700, len=339, n/ep=1, n/st=100, rew=21161.00]                                                                             


Epoch #107: test_reward: 14974.300000 ± 5498.373015, best_reward: 21321.500000 ± 4942.571785 in #94


Epoch #108: 1001it [00:03, 267.94it/s, env_step=108000, gradient_step=10800, len=268, n/ep=0, n/st=100, rew=17604.00]                                                                             


Epoch #108: test_reward: 15831.600000 ± 6650.925097, best_reward: 21321.500000 ± 4942.571785 in #94


Epoch #109: 1001it [00:03, 275.36it/s, env_step=109000, gradient_step=10900, len=255, n/ep=2, n/st=100, rew=14873.00]                                                                             


Epoch #109: test_reward: 13659.900000 ± 6606.662356, best_reward: 21321.500000 ± 4942.571785 in #94


Epoch #110: 1001it [00:03, 275.05it/s, env_step=110000, gradient_step=11000, len=202, n/ep=0, n/st=100, rew=12292.25]                                                                             


Epoch #110: test_reward: 15404.300000 ± 6713.200713, best_reward: 21321.500000 ± 4942.571785 in #94


Epoch #111: 1001it [00:03, 262.87it/s, env_step=111000, gradient_step=11100, len=234, n/ep=0, n/st=100, rew=14778.50]                                                                             


Epoch #111: test_reward: 11390.100000 ± 5155.586707, best_reward: 21321.500000 ± 4942.571785 in #94


Epoch #112: 1001it [00:03, 308.85it/s, env_step=112000, gradient_step=11200, len=269, n/ep=1, n/st=100, rew=17849.00]                                                                             


Epoch #112: test_reward: 15944.200000 ± 6236.738792, best_reward: 21321.500000 ± 4942.571785 in #94


Epoch #113: 1001it [00:03, 288.34it/s, env_step=113000, gradient_step=11300, len=344, n/ep=0, n/st=100, rew=22895.50]                                                                             


Epoch #113: test_reward: 18745.900000 ± 5272.891663, best_reward: 21321.500000 ± 4942.571785 in #94


Epoch #114: 1001it [00:03, 275.94it/s, env_step=114000, gradient_step=11400, len=304, n/ep=1, n/st=100, rew=18081.50]                                                                             


Epoch #114: test_reward: 9689.900000 ± 2852.360933, best_reward: 21321.500000 ± 4942.571785 in #94


Epoch #115: 1001it [00:03, 260.44it/s, env_step=115000, gradient_step=11500, len=158, n/ep=0, n/st=100, rew=9787.50]                                                                              


Epoch #115: test_reward: 9746.700000 ± 5665.446320, best_reward: 21321.500000 ± 4942.571785 in #94


Epoch #116: 1001it [00:03, 286.58it/s, env_step=116000, gradient_step=11600, len=286, n/ep=0, n/st=100, rew=18669.50]                                                                             


Epoch #116: test_reward: 17972.900000 ± 8148.748081, best_reward: 21321.500000 ± 4942.571785 in #94


Epoch #117: 1001it [00:03, 288.72it/s, env_step=117000, gradient_step=11700, len=167, n/ep=0, n/st=100, rew=9743.00]                                                                              


Epoch #117: test_reward: 16843.400000 ± 7949.818592, best_reward: 21321.500000 ± 4942.571785 in #94


Epoch #118: 1001it [00:03, 281.44it/s, env_step=118000, gradient_step=11800, len=177, n/ep=0, n/st=100, rew=8625.50]                                                                              


Epoch #118: test_reward: 12617.900000 ± 6853.762010, best_reward: 21321.500000 ± 4942.571785 in #94


Epoch #119: 1001it [00:03, 283.09it/s, env_step=119000, gradient_step=11900, len=95, n/ep=1, n/st=100, rew=5173.00]                                                                               


Epoch #119: test_reward: 12078.800000 ± 4212.347630, best_reward: 21321.500000 ± 4942.571785 in #94


Epoch #120: 1001it [00:03, 273.69it/s, env_step=120000, gradient_step=12000, len=236, n/ep=0, n/st=100, rew=15226.00]                                                                             


Epoch #120: test_reward: 9230.700000 ± 3259.302626, best_reward: 21321.500000 ± 4942.571785 in #94


Epoch #121: 1001it [00:03, 291.55it/s, env_step=121000, gradient_step=12100, len=149, n/ep=0, n/st=100, rew=9296.50]                                                                              


Epoch #121: test_reward: 10808.700000 ± 5417.286665, best_reward: 21321.500000 ± 4942.571785 in #94


Epoch #122: 1001it [00:03, 277.54it/s, env_step=122000, gradient_step=12200, len=289, n/ep=2, n/st=100, rew=18248.75]                                                                             


Epoch #122: test_reward: 15968.800000 ± 8084.241261, best_reward: 21321.500000 ± 4942.571785 in #94


Epoch #123: 1001it [00:03, 289.79it/s, env_step=123000, gradient_step=12300, len=185, n/ep=0, n/st=100, rew=10897.00]                                                                             


Epoch #123: test_reward: 13660.400000 ± 8219.859940, best_reward: 21321.500000 ± 4942.571785 in #94


Epoch #124: 1001it [00:03, 294.01it/s, env_step=124000, gradient_step=12400, len=132, n/ep=3, n/st=100, rew=7849.17]                                                                              


Epoch #124: test_reward: 9851.800000 ± 4206.221506, best_reward: 21321.500000 ± 4942.571785 in #94


Epoch #125: 1001it [00:03, 270.62it/s, env_step=125000, gradient_step=12500, len=201, n/ep=0, n/st=100, rew=12752.50]                                                                             


Epoch #125: test_reward: 17291.500000 ± 5695.490378, best_reward: 21321.500000 ± 4942.571785 in #94


Epoch #126: 1001it [00:03, 286.22it/s, env_step=126000, gradient_step=12600, len=138, n/ep=1, n/st=100, rew=8756.50]                                                                              


Epoch #126: test_reward: 14757.000000 ± 3162.779664, best_reward: 21321.500000 ± 4942.571785 in #94


Epoch #127: 1001it [00:03, 282.97it/s, env_step=127000, gradient_step=12700, len=129, n/ep=0, n/st=100, rew=7735.00]                                                                              


Epoch #127: test_reward: 16529.100000 ± 8481.448903, best_reward: 21321.500000 ± 4942.571785 in #94


Epoch #128: 1001it [00:03, 305.10it/s, env_step=128000, gradient_step=12800, len=51, n/ep=1, n/st=100, rew=2465.00]                                                                               


Epoch #128: test_reward: 11508.400000 ± 7225.299194, best_reward: 21321.500000 ± 4942.571785 in #94


Epoch #129: 1001it [00:03, 263.28it/s, env_step=129000, gradient_step=12900, len=137, n/ep=1, n/st=100, rew=7820.50]                                                                              


Epoch #129: test_reward: 15082.300000 ± 4964.111946, best_reward: 21321.500000 ± 4942.571785 in #94


Epoch #130: 1001it [00:03, 275.20it/s, env_step=130000, gradient_step=13000, len=135, n/ep=1, n/st=100, rew=7714.00]                                                                              


Epoch #130: test_reward: 12649.700000 ± 4813.764391, best_reward: 21321.500000 ± 4942.571785 in #94


Epoch #131: 1001it [00:03, 291.69it/s, env_step=131000, gradient_step=13100, len=184, n/ep=0, n/st=100, rew=10114.50]                                                                             


Epoch #131: test_reward: 11229.700000 ± 7945.012902, best_reward: 21321.500000 ± 4942.571785 in #94


Epoch #132: 1001it [00:04, 239.94it/s, env_step=132000, gradient_step=13200, len=186, n/ep=1, n/st=100, rew=11066.00]                                                                             


Epoch #132: test_reward: 11024.600000 ± 3375.614765, best_reward: 21321.500000 ± 4942.571785 in #94


Epoch #133: 1001it [00:03, 262.62it/s, env_step=133000, gradient_step=13300, len=204, n/ep=0, n/st=100, rew=13731.50]                                                                             


Epoch #133: test_reward: 7460.200000 ± 3604.535415, best_reward: 21321.500000 ± 4942.571785 in #94


Epoch #134: 1001it [00:03, 260.82it/s, env_step=134000, gradient_step=13400, len=151, n/ep=0, n/st=100, rew=8893.50]                                                                              


Epoch #134: test_reward: 11925.800000 ± 3835.672921, best_reward: 21321.500000 ± 4942.571785 in #94


Epoch #135: 1001it [00:04, 217.70it/s, env_step=135000, gradient_step=13500, len=280, n/ep=0, n/st=100, rew=18192.50]                                                                             


Epoch #135: test_reward: 15139.100000 ± 6322.783002, best_reward: 21321.500000 ± 4942.571785 in #94


Epoch #136: 1001it [00:03, 264.56it/s, env_step=136000, gradient_step=13600, len=214, n/ep=0, n/st=100, rew=13636.50]                                                                             


Epoch #136: test_reward: 11402.700000 ± 4372.297978, best_reward: 21321.500000 ± 4942.571785 in #94


Epoch #137: 1001it [00:03, 289.75it/s, env_step=137000, gradient_step=13700, len=177, n/ep=1, n/st=100, rew=10266.00]                                                                             


Epoch #137: test_reward: 10055.500000 ± 3793.414853, best_reward: 21321.500000 ± 4942.571785 in #94


Epoch #138: 1001it [00:03, 279.65it/s, env_step=138000, gradient_step=13800, len=286, n/ep=0, n/st=100, rew=19892.50]                                                                             


Epoch #138: test_reward: 11343.100000 ± 7165.471742, best_reward: 21321.500000 ± 4942.571785 in #94


Epoch #139: 1001it [00:03, 303.64it/s, env_step=139000, gradient_step=13900, len=162, n/ep=0, n/st=100, rew=9562.00]                                                                              


Epoch #139: test_reward: 8121.600000 ± 4426.172776, best_reward: 21321.500000 ± 4942.571785 in #94


Epoch #140: 1001it [00:03, 282.73it/s, env_step=140000, gradient_step=14000, len=57, n/ep=0, n/st=100, rew=2638.50]                                                                               


Epoch #140: test_reward: 10463.000000 ± 6597.282425, best_reward: 21321.500000 ± 4942.571785 in #94


Epoch #141: 1001it [00:03, 321.39it/s, env_step=141000, gradient_step=14100, len=64, n/ep=0, n/st=100, rew=3053.00]                                                                               


Epoch #141: test_reward: 7220.500000 ± 3568.374119, best_reward: 21321.500000 ± 4942.571785 in #94


Epoch #142: 1001it [00:03, 267.58it/s, env_step=142000, gradient_step=14200, len=173, n/ep=2, n/st=100, rew=10313.00]                                                                             


Epoch #142: test_reward: 8155.900000 ± 4549.513390, best_reward: 21321.500000 ± 4942.571785 in #94


Epoch #143: 1001it [00:03, 265.04it/s, env_step=143000, gradient_step=14300, len=185, n/ep=0, n/st=100, rew=9402.00]                                                                              


Epoch #143: test_reward: 17027.000000 ± 6092.769255, best_reward: 21321.500000 ± 4942.571785 in #94


Epoch #144: 1001it [00:04, 247.62it/s, env_step=144000, gradient_step=14400, len=145, n/ep=0, n/st=100, rew=8016.25]                                                                              


Epoch #144: test_reward: 10521.500000 ± 3797.438064, best_reward: 21321.500000 ± 4942.571785 in #94


Epoch #145: 1001it [00:03, 264.32it/s, env_step=145000, gradient_step=14500, len=202, n/ep=1, n/st=100, rew=13265.00]                                                                             


Epoch #145: test_reward: 16437.200000 ± 4199.080323, best_reward: 21321.500000 ± 4942.571785 in #94


Epoch #146: 1001it [00:03, 267.87it/s, env_step=146000, gradient_step=14600, len=288, n/ep=1, n/st=100, rew=19088.00]                                                                             


Epoch #146: test_reward: 10945.700000 ± 6582.315824, best_reward: 21321.500000 ± 4942.571785 in #94


Epoch #147: 1001it [00:03, 264.29it/s, env_step=147000, gradient_step=14700, len=249, n/ep=0, n/st=100, rew=16479.25]                                                                             


Epoch #147: test_reward: 13245.600000 ± 4008.647582, best_reward: 21321.500000 ± 4942.571785 in #94


Epoch #148: 1001it [00:03, 273.86it/s, env_step=148000, gradient_step=14800, len=400, n/ep=0, n/st=100, rew=23829.00]                                                                             


Epoch #148: test_reward: 11753.500000 ± 8494.666553, best_reward: 21321.500000 ± 4942.571785 in #94


Epoch #149: 1001it [00:03, 266.67it/s, env_step=149000, gradient_step=14900, len=145, n/ep=0, n/st=100, rew=8228.00]                                                                              


Epoch #149: test_reward: 14207.700000 ± 7563.698950, best_reward: 21321.500000 ± 4942.571785 in #94


Epoch #150: 1001it [00:03, 285.30it/s, env_step=150000, gradient_step=15000, len=41, n/ep=0, n/st=100, rew=1765.00]                                                                               


Epoch #150: test_reward: 8161.400000 ± 2640.598576, best_reward: 21321.500000 ± 4942.571785 in #94

InfoStats(gradient_step=15000, best_reward=21321.5, best_reward_std=4942.571785012333, train_step=150000, train_episode=691, test_step=349607, test_episode=1510, timing=TimingStats(total_time=782.0397884845734, train_time=525.5298936367035, train_time_collect=60.250309467315674, train_time_update=457.76322746276855, test_time=256.5098948478699, update_speed=285.4261989969582))

(the trained policy can be accessed via policy.policies[agents[1]])
Train with agent_opponent #1


Epoch #1: 1001it [00:02, 341.26it/s, env_step=1000, gradient_step=100, len=0, n/ep=0, n/st=100, rew=0.00]                                                                                         


Epoch #1: test_reward: 9434.200000 ± 5602.727582, best_reward: 11187.000000 ± 4576.384534 in #0


Epoch #2: 1001it [00:03, 316.92it/s, env_step=2000, gradient_step=200, len=0, n/ep=0, n/st=100, rew=0.00]                                                                                         


Epoch #2: test_reward: 12240.200000 ± 5252.041771, best_reward: 12240.200000 ± 5252.041771 in #2


Epoch #3: 1001it [00:02, 367.20it/s, env_step=3000, gradient_step=300, len=0, n/ep=0, n/st=100, rew=0.00]                                                                                         


Epoch #3: test_reward: 9787.200000 ± 5206.701390, best_reward: 12240.200000 ± 5252.041771 in #2


Epoch #4: 1001it [00:03, 333.42it/s, env_step=4000, gradient_step=400, len=0, n/ep=0, n/st=100, rew=0.00]                                                                                         


Epoch #4: test_reward: 5362.600000 ± 4163.416679, best_reward: 12240.200000 ± 5252.041771 in #2


Epoch #5: 1001it [00:03, 312.40it/s, env_step=5000, gradient_step=500, len=0, n/ep=0, n/st=100, rew=0.00]                                                                                         


Epoch #5: test_reward: 13940.900000 ± 5245.909482, best_reward: 13940.900000 ± 5245.909482 in #5


Epoch #6: 1001it [00:02, 371.85it/s, env_step=6000, gradient_step=600, len=0, n/ep=0, n/st=100, rew=0.00]                                                                                         


Epoch #6: test_reward: 11258.500000 ± 7945.081186, best_reward: 13940.900000 ± 5245.909482 in #5


Epoch #7: 1001it [00:02, 335.60it/s, env_step=7000, gradient_step=700, len=67, n/ep=0, n/st=100, rew=2446.00]                                                                                     


Epoch #7: test_reward: 11129.700000 ± 6103.421238, best_reward: 13940.900000 ± 5245.909482 in #5


Epoch #8: 1001it [00:02, 354.09it/s, env_step=8000, gradient_step=800, len=67, n/ep=0, n/st=100, rew=2446.00]                                                                                     


Epoch #8: test_reward: 11031.600000 ± 3237.740175, best_reward: 13940.900000 ± 5245.909482 in #5


Epoch #9: 1001it [00:03, 275.46it/s, env_step=9000, gradient_step=900, len=90, n/ep=1, n/st=100, rew=3410.50]                                                                                     


Epoch #9: test_reward: 12779.900000 ± 3695.035276, best_reward: 13940.900000 ± 5245.909482 in #5


Epoch #10: 1001it [00:03, 277.54it/s, env_step=10000, gradient_step=1000, len=100, n/ep=1, n/st=100, rew=3291.00]                                                                                 


Epoch #10: test_reward: 14770.600000 ± 4718.632009, best_reward: 14770.600000 ± 4718.632009 in #10


Epoch #11: 1001it [00:03, 261.49it/s, env_step=11000, gradient_step=1100, len=110, n/ep=1, n/st=100, rew=5082.00]                                                                                 


Epoch #11: test_reward: 12881.200000 ± 5112.565243, best_reward: 14770.600000 ± 4718.632009 in #10


Epoch #12: 1001it [00:03, 268.40it/s, env_step=12000, gradient_step=1200, len=118, n/ep=0, n/st=100, rew=5928.00]                                                                                 


Epoch #12: test_reward: 15697.000000 ± 3701.981226, best_reward: 15697.000000 ± 3701.981226 in #12


Epoch #13: 1001it [00:03, 303.43it/s, env_step=13000, gradient_step=1300, len=118, n/ep=0, n/st=100, rew=5928.00]                                                                                 


Epoch #13: test_reward: 13499.200000 ± 6283.709570, best_reward: 15697.000000 ± 3701.981226 in #12


Epoch #14: 1001it [00:03, 268.83it/s, env_step=14000, gradient_step=1400, len=134, n/ep=0, n/st=100, rew=6365.00]                                                                                 


Epoch #14: test_reward: 12916.200000 ± 5318.289157, best_reward: 15697.000000 ± 3701.981226 in #12


Epoch #15: 1001it [00:03, 253.44it/s, env_step=15000, gradient_step=1500, len=149, n/ep=0, n/st=100, rew=6184.00]                                                                                 


Epoch #15: test_reward: 8925.400000 ± 4584.144854, best_reward: 15697.000000 ± 3701.981226 in #12


Epoch #16: 1001it [00:03, 268.72it/s, env_step=16000, gradient_step=1600, len=149, n/ep=0, n/st=100, rew=6184.00]                                                                                 


Epoch #16: test_reward: 11561.500000 ± 4504.953923, best_reward: 15697.000000 ± 3701.981226 in #12


Epoch #17: 1001it [00:03, 281.70it/s, env_step=17000, gradient_step=1700, len=170, n/ep=3, n/st=100, rew=8807.33]                                                                                 


Epoch #17: test_reward: 8785.200000 ± 4606.791873, best_reward: 15697.000000 ± 3701.981226 in #12


Epoch #18: 1001it [00:03, 288.52it/s, env_step=18000, gradient_step=1800, len=180, n/ep=1, n/st=100, rew=9038.00]                                                                                 


Epoch #18: test_reward: 20106.400000 ± 7515.155397, best_reward: 20106.400000 ± 7515.155397 in #18


Epoch #19: 1001it [00:03, 274.78it/s, env_step=19000, gradient_step=1900, len=190, n/ep=2, n/st=100, rew=9594.25]                                                                                 


Epoch #19: test_reward: 13504.600000 ± 5675.326250, best_reward: 20106.400000 ± 7515.155397 in #18


Epoch #20: 1001it [00:03, 287.14it/s, env_step=20000, gradient_step=2000, len=199, n/ep=0, n/st=100, rew=9922.00]                                                                                 


Epoch #20: test_reward: 9466.500000 ± 3612.918965, best_reward: 20106.400000 ± 7515.155397 in #18


Epoch #21: 1001it [00:03, 285.75it/s, env_step=21000, gradient_step=2100, len=209, n/ep=0, n/st=100, rew=11082.00]                                                                                


Epoch #21: test_reward: 10648.300000 ± 6844.636821, best_reward: 20106.400000 ± 7515.155397 in #18


Epoch #22: 1001it [00:03, 288.99it/s, env_step=22000, gradient_step=2200, len=220, n/ep=1, n/st=100, rew=11321.00]                                                                                


Epoch #22: test_reward: 6148.500000 ± 2952.122872, best_reward: 20106.400000 ± 7515.155397 in #18


Epoch #23: 1001it [00:03, 304.56it/s, env_step=23000, gradient_step=2300, len=230, n/ep=1, n/st=100, rew=13454.00]                                                                                


Epoch #23: test_reward: 12010.200000 ± 2270.929757, best_reward: 20106.400000 ± 7515.155397 in #18


Epoch #24: 1001it [00:03, 290.04it/s, env_step=24000, gradient_step=2400, len=240, n/ep=1, n/st=100, rew=12853.00]                                                                                


Epoch #24: test_reward: 9707.000000 ± 5956.815139, best_reward: 20106.400000 ± 7515.155397 in #18


Epoch #25: 1001it [00:03, 266.30it/s, env_step=25000, gradient_step=2500, len=60, n/ep=0, n/st=100, rew=2575.50]                                                                                  


Epoch #25: test_reward: 12620.400000 ± 4584.726888, best_reward: 20106.400000 ± 7515.155397 in #18


Epoch #26: 1001it [00:03, 289.32it/s, env_step=26000, gradient_step=2600, len=260, n/ep=1, n/st=100, rew=14805.50]                                                                                


Epoch #26: test_reward: 10056.600000 ± 4206.477011, best_reward: 20106.400000 ± 7515.155397 in #18


Epoch #27: 1001it [00:03, 269.95it/s, env_step=27000, gradient_step=2700, len=180, n/ep=1, n/st=100, rew=9597.50]                                                                                 


Epoch #27: test_reward: 9661.600000 ± 4238.605035, best_reward: 20106.400000 ± 7515.155397 in #18


Epoch #28: 1001it [00:03, 312.90it/s, env_step=28000, gradient_step=2800, len=135, n/ep=1, n/st=100, rew=6897.00]                                                                                 


Epoch #28: test_reward: 9047.500000 ± 5153.263146, best_reward: 20106.400000 ± 7515.155397 in #18


Epoch #29: 1001it [00:03, 316.76it/s, env_step=29000, gradient_step=2900, len=289, n/ep=0, n/st=100, rew=16560.00]                                                                                


Epoch #29: test_reward: 8748.400000 ± 5402.140765, best_reward: 20106.400000 ± 7515.155397 in #18


Epoch #30: 1001it [00:03, 301.96it/s, env_step=30000, gradient_step=3000, len=231, n/ep=0, n/st=100, rew=12500.50]                                                                                


Epoch #30: test_reward: 9721.900000 ± 3260.262764, best_reward: 20106.400000 ± 7515.155397 in #18


Epoch #31: 1001it [00:03, 302.96it/s, env_step=31000, gradient_step=3100, len=139, n/ep=0, n/st=100, rew=7549.00]                                                                                 


Epoch #31: test_reward: 9829.400000 ± 4819.165845, best_reward: 20106.400000 ± 7515.155397 in #18


Epoch #32: 1001it [00:03, 272.32it/s, env_step=32000, gradient_step=3200, len=151, n/ep=1, n/st=100, rew=8360.50]                                                                                 


Epoch #32: test_reward: 11870.200000 ± 5868.384681, best_reward: 20106.400000 ± 7515.155397 in #18


Epoch #33: 1001it [00:03, 270.20it/s, env_step=33000, gradient_step=3300, len=221, n/ep=0, n/st=100, rew=12564.00]                                                                                


Epoch #33: test_reward: 6731.100000 ± 3622.445650, best_reward: 20106.400000 ± 7515.155397 in #18


Epoch #34: 1001it [00:03, 299.75it/s, env_step=34000, gradient_step=3400, len=149, n/ep=0, n/st=100, rew=8632.50]                                                                                 


Epoch #34: test_reward: 5908.400000 ± 2999.214937, best_reward: 20106.400000 ± 7515.155397 in #18


Epoch #35: 1001it [00:03, 290.20it/s, env_step=35000, gradient_step=3500, len=246, n/ep=1, n/st=100, rew=14963.00]                                                                                


Epoch #35: test_reward: 13984.900000 ± 3598.248920, best_reward: 20106.400000 ± 7515.155397 in #18


Epoch #36: 1001it [00:03, 275.21it/s, env_step=36000, gradient_step=3600, len=131, n/ep=1, n/st=100, rew=6831.00]                                                                                 


Epoch #36: test_reward: 11300.400000 ± 4352.738338, best_reward: 20106.400000 ± 7515.155397 in #18


Epoch #37: 1001it [00:03, 316.78it/s, env_step=37000, gradient_step=3700, len=131, n/ep=1, n/st=100, rew=7596.00]                                                                                 


Epoch #37: test_reward: 11265.300000 ± 5263.693001, best_reward: 20106.400000 ± 7515.155397 in #18


Epoch #38: 1001it [00:03, 260.95it/s, env_step=38000, gradient_step=3800, len=304, n/ep=0, n/st=100, rew=18420.50]                                                                                


Epoch #38: test_reward: 9507.400000 ± 4014.058799, best_reward: 20106.400000 ± 7515.155397 in #18


Epoch #39: 1001it [00:03, 293.88it/s, env_step=39000, gradient_step=3900, len=189, n/ep=1, n/st=100, rew=10864.00]                                                                                


Epoch #39: test_reward: 8135.600000 ± 3906.198848, best_reward: 20106.400000 ± 7515.155397 in #18


Epoch #40: 1001it [00:03, 289.54it/s, env_step=40000, gradient_step=4000, len=400, n/ep=2, n/st=100, rew=21788.00]                                                                                


Epoch #40: test_reward: 7793.700000 ± 3977.051396, best_reward: 20106.400000 ± 7515.155397 in #18


Epoch #41: 1001it [00:03, 305.60it/s, env_step=41000, gradient_step=4100, len=204, n/ep=0, n/st=100, rew=12142.50]                                                                                


Epoch #41: test_reward: 10335.200000 ± 3830.883261, best_reward: 20106.400000 ± 7515.155397 in #18


Epoch #42: 1001it [00:03, 259.64it/s, env_step=42000, gradient_step=4200, len=201, n/ep=1, n/st=100, rew=11123.00]                                                                                


Epoch #42: test_reward: 6053.600000 ± 4951.796183, best_reward: 20106.400000 ± 7515.155397 in #18


Epoch #43: 1001it [00:03, 302.23it/s, env_step=43000, gradient_step=4300, len=117, n/ep=0, n/st=100, rew=6725.00]                                                                                 


Epoch #43: test_reward: 9836.400000 ± 5548.661644, best_reward: 20106.400000 ± 7515.155397 in #18


Epoch #44: 1001it [00:03, 299.58it/s, env_step=44000, gradient_step=4400, len=141, n/ep=0, n/st=100, rew=7141.50]                                                                                 


Epoch #44: test_reward: 9675.400000 ± 4127.120454, best_reward: 20106.400000 ± 7515.155397 in #18


Epoch #45: 1001it [00:03, 268.10it/s, env_step=45000, gradient_step=4500, len=101, n/ep=0, n/st=100, rew=4281.00]                                                                                 


Epoch #45: test_reward: 6857.500000 ± 2730.904804, best_reward: 20106.400000 ± 7515.155397 in #18


Epoch #46: 1001it [00:03, 277.40it/s, env_step=46000, gradient_step=4600, len=242, n/ep=0, n/st=100, rew=15091.50]                                                                                


Epoch #46: test_reward: 11423.400000 ± 3642.338705, best_reward: 20106.400000 ± 7515.155397 in #18


Epoch #47: 1001it [00:03, 312.90it/s, env_step=47000, gradient_step=4700, len=191, n/ep=0, n/st=100, rew=11187.00]                                                                                


Epoch #47: test_reward: 11700.600000 ± 5404.768269, best_reward: 20106.400000 ± 7515.155397 in #18


Epoch #48: 1001it [00:03, 325.48it/s, env_step=48000, gradient_step=4800, len=173, n/ep=0, n/st=100, rew=9815.00]                                                                                 


Epoch #48: test_reward: 9563.400000 ± 2931.472947, best_reward: 20106.400000 ± 7515.155397 in #18


Epoch #49: 1001it [00:03, 300.31it/s, env_step=49000, gradient_step=4900, len=129, n/ep=1, n/st=100, rew=5851.00]                                                                                 


Epoch #49: test_reward: 10098.300000 ± 5015.431707, best_reward: 20106.400000 ± 7515.155397 in #18


Epoch #50: 1001it [00:03, 283.31it/s, env_step=50000, gradient_step=5000, len=209, n/ep=0, n/st=100, rew=12478.00]                                                                                


Epoch #50: test_reward: 10749.600000 ± 4004.185990, best_reward: 20106.400000 ± 7515.155397 in #18


Epoch #51: 1001it [00:03, 278.43it/s, env_step=51000, gradient_step=5100, len=151, n/ep=0, n/st=100, rew=8741.00]                                                                                 


Epoch #51: test_reward: 13286.000000 ± 5605.121783, best_reward: 20106.400000 ± 7515.155397 in #18


Epoch #52: 1001it [00:03, 292.46it/s, env_step=52000, gradient_step=5200, len=119, n/ep=0, n/st=100, rew=5638.00]                                                                                 


Epoch #52: test_reward: 13281.200000 ± 4415.446360, best_reward: 20106.400000 ± 7515.155397 in #18


Epoch #53: 1001it [00:03, 293.65it/s, env_step=53000, gradient_step=5300, len=119, n/ep=0, n/st=100, rew=5638.00]                                                                                 


Epoch #53: test_reward: 11435.600000 ± 3550.998260, best_reward: 20106.400000 ± 7515.155397 in #18


Epoch #54: 1001it [00:03, 284.98it/s, env_step=54000, gradient_step=5400, len=229, n/ep=0, n/st=100, rew=12129.00]                                                                                


Epoch #54: test_reward: 9455.700000 ± 5350.544310, best_reward: 20106.400000 ± 7515.155397 in #18


Epoch #55: 1001it [00:03, 312.94it/s, env_step=55000, gradient_step=5500, len=245, n/ep=0, n/st=100, rew=13745.00]                                                                                


Epoch #55: test_reward: 11062.800000 ± 5780.330108, best_reward: 20106.400000 ± 7515.155397 in #18


Epoch #56: 1001it [00:03, 275.92it/s, env_step=56000, gradient_step=5600, len=245, n/ep=0, n/st=100, rew=14136.00]                                                                                


Epoch #56: test_reward: 6892.400000 ± 5555.421842, best_reward: 20106.400000 ± 7515.155397 in #18


Epoch #57: 1001it [00:03, 273.41it/s, env_step=57000, gradient_step=5700, len=131, n/ep=1, n/st=100, rew=6668.00]                                                                                 


Epoch #57: test_reward: 8604.900000 ± 4242.723476, best_reward: 20106.400000 ± 7515.155397 in #18


Epoch #58: 1001it [00:03, 321.10it/s, env_step=58000, gradient_step=5800, len=121, n/ep=0, n/st=100, rew=5932.50]                                                                                 


Epoch #58: test_reward: 9307.800000 ± 3622.579131, best_reward: 20106.400000 ± 7515.155397 in #18


Epoch #59: 1001it [00:02, 338.73it/s, env_step=59000, gradient_step=5900, len=175, n/ep=0, n/st=100, rew=8954.00]                                                                                 


Epoch #59: test_reward: 16737.400000 ± 4686.903396, best_reward: 20106.400000 ± 7515.155397 in #18


Epoch #60: 1001it [00:03, 277.76it/s, env_step=60000, gradient_step=6000, len=50, n/ep=2, n/st=100, rew=1985.50]                                                                                  


Epoch #60: test_reward: 12305.600000 ± 3850.258983, best_reward: 20106.400000 ± 7515.155397 in #18


Epoch #61: 1001it [00:03, 307.42it/s, env_step=61000, gradient_step=6100, len=139, n/ep=0, n/st=100, rew=6761.00]                                                                                 


Epoch #61: test_reward: 9281.700000 ± 5086.010658, best_reward: 20106.400000 ± 7515.155397 in #18


Epoch #62: 1001it [00:03, 326.41it/s, env_step=62000, gradient_step=6200, len=217, n/ep=1, n/st=100, rew=11337.00]                                                                                


Epoch #62: test_reward: 9204.000000 ± 4094.483411, best_reward: 20106.400000 ± 7515.155397 in #18


Epoch #63: 1001it [00:03, 272.25it/s, env_step=63000, gradient_step=6300, len=175, n/ep=0, n/st=100, rew=10131.33]                                                                                


Epoch #63: test_reward: 6357.000000 ± 5811.790309, best_reward: 20106.400000 ± 7515.155397 in #18


Epoch #64: 1001it [00:03, 270.24it/s, env_step=64000, gradient_step=6400, len=257, n/ep=1, n/st=100, rew=15172.00]                                                                                


Epoch #64: test_reward: 12914.200000 ± 5272.462078, best_reward: 20106.400000 ± 7515.155397 in #18


Epoch #65: 1001it [00:03, 255.88it/s, env_step=65000, gradient_step=6500, len=231, n/ep=1, n/st=100, rew=13009.00]                                                                                


Epoch #65: test_reward: 16433.800000 ± 5463.474605, best_reward: 20106.400000 ± 7515.155397 in #18


Epoch #66: 1001it [00:03, 269.75it/s, env_step=66000, gradient_step=6600, len=266, n/ep=2, n/st=100, rew=15601.75]                                                                                


Epoch #66: test_reward: 15855.900000 ± 4908.113007, best_reward: 20106.400000 ± 7515.155397 in #18


Epoch #67: 1001it [00:03, 292.83it/s, env_step=67000, gradient_step=6700, len=203, n/ep=0, n/st=100, rew=10549.00]                                                                                


Epoch #67: test_reward: 9457.200000 ± 5807.774183, best_reward: 20106.400000 ± 7515.155397 in #18


Epoch #68: 1001it [00:03, 323.38it/s, env_step=68000, gradient_step=6800, len=224, n/ep=2, n/st=100, rew=12659.25]                                                                                


Epoch #68: test_reward: 7534.000000 ± 3859.675945, best_reward: 20106.400000 ± 7515.155397 in #18


Epoch #69: 1001it [00:03, 303.39it/s, env_step=69000, gradient_step=6900, len=201, n/ep=0, n/st=100, rew=11995.25]                                                                                


Epoch #69: test_reward: 9955.800000 ± 5423.457676, best_reward: 20106.400000 ± 7515.155397 in #18


Epoch #70: 1001it [00:03, 317.08it/s, env_step=70000, gradient_step=7000, len=301, n/ep=0, n/st=100, rew=17907.00]                                                                                


Epoch #70: test_reward: 12305.200000 ± 3463.479776, best_reward: 20106.400000 ± 7515.155397 in #18


Epoch #71: 1001it [00:02, 336.45it/s, env_step=71000, gradient_step=7100, len=186, n/ep=0, n/st=100, rew=10017.00]                                                                                


Epoch #71: test_reward: 11545.200000 ± 5165.732180, best_reward: 20106.400000 ± 7515.155397 in #18


Epoch #72: 1001it [00:03, 305.76it/s, env_step=72000, gradient_step=7200, len=155, n/ep=0, n/st=100, rew=9142.00]                                                                                 


Epoch #72: test_reward: 14606.100000 ± 7972.567421, best_reward: 20106.400000 ± 7515.155397 in #18


Epoch #73: 1001it [00:03, 305.73it/s, env_step=73000, gradient_step=7300, len=268, n/ep=0, n/st=100, rew=16513.00]                                                                                


Epoch #73: test_reward: 11146.100000 ± 4559.624029, best_reward: 20106.400000 ± 7515.155397 in #18


Epoch #74: 1001it [00:03, 325.16it/s, env_step=74000, gradient_step=7400, len=265, n/ep=1, n/st=100, rew=15139.00]                                                                                


Epoch #74: test_reward: 14092.400000 ± 4699.251242, best_reward: 20106.400000 ± 7515.155397 in #18


Epoch #75: 1001it [00:03, 291.80it/s, env_step=75000, gradient_step=7500, len=195, n/ep=0, n/st=100, rew=11255.00]                                                                                


Epoch #75: test_reward: 11336.700000 ± 5874.596889, best_reward: 20106.400000 ± 7515.155397 in #18


Epoch #76: 1001it [00:03, 290.77it/s, env_step=76000, gradient_step=7600, len=189, n/ep=1, n/st=100, rew=10589.00]                                                                                


Epoch #76: test_reward: 13019.500000 ± 6173.286131, best_reward: 20106.400000 ± 7515.155397 in #18


Epoch #77: 1001it [00:03, 330.93it/s, env_step=77000, gradient_step=7700, len=153, n/ep=1, n/st=100, rew=8727.00]                                                                                 


Epoch #77: test_reward: 8273.400000 ± 5643.621057, best_reward: 20106.400000 ± 7515.155397 in #18


Epoch #78: 1001it [00:03, 298.34it/s, env_step=78000, gradient_step=7800, len=179, n/ep=0, n/st=100, rew=10355.00]                                                                                


Epoch #78: test_reward: 15394.500000 ± 7225.138466, best_reward: 20106.400000 ± 7515.155397 in #18


Epoch #79: 1001it [00:03, 277.68it/s, env_step=79000, gradient_step=7900, len=185, n/ep=0, n/st=100, rew=10685.50]                                                                                


Epoch #79: test_reward: 10952.400000 ± 4063.484396, best_reward: 20106.400000 ± 7515.155397 in #18


Epoch #80: 1001it [00:03, 295.51it/s, env_step=80000, gradient_step=8000, len=85, n/ep=1, n/st=100, rew=4473.00]                                                                                  


Epoch #80: test_reward: 13203.100000 ± 5610.300713, best_reward: 20106.400000 ± 7515.155397 in #18


Epoch #81: 1001it [00:03, 276.98it/s, env_step=81000, gradient_step=8100, len=250, n/ep=0, n/st=100, rew=16028.00]                                                                                


Epoch #81: test_reward: 6515.200000 ± 4015.776408, best_reward: 20106.400000 ± 7515.155397 in #18


Epoch #82: 1001it [00:03, 297.48it/s, env_step=82000, gradient_step=8200, len=184, n/ep=3, n/st=100, rew=10876.67]                                                                                


Epoch #82: test_reward: 4758.200000 ± 3785.345686, best_reward: 20106.400000 ± 7515.155397 in #18


Epoch #83: 1001it [00:03, 281.44it/s, env_step=83000, gradient_step=8300, len=265, n/ep=0, n/st=100, rew=15812.50]                                                                                


Epoch #83: test_reward: 12696.900000 ± 3414.873071, best_reward: 20106.400000 ± 7515.155397 in #18


Epoch #84: 1001it [00:03, 301.65it/s, env_step=84000, gradient_step=8400, len=84, n/ep=1, n/st=100, rew=3721.50]                                                                                  


Epoch #84: test_reward: 10602.000000 ± 3734.327543, best_reward: 20106.400000 ± 7515.155397 in #18


Epoch #85: 1001it [00:03, 280.17it/s, env_step=85000, gradient_step=8500, len=68, n/ep=0, n/st=100, rew=3403.00]                                                                                  


Epoch #85: test_reward: 8597.700000 ± 3987.696203, best_reward: 20106.400000 ± 7515.155397 in #18


Epoch #86: 1001it [00:03, 311.32it/s, env_step=86000, gradient_step=8600, len=148, n/ep=0, n/st=100, rew=8752.25]                                                                                 


Epoch #86: test_reward: 6938.000000 ± 4918.949563, best_reward: 20106.400000 ± 7515.155397 in #18


Epoch #87: 1001it [00:03, 282.42it/s, env_step=87000, gradient_step=8700, len=176, n/ep=1, n/st=100, rew=10103.00]                                                                                


Epoch #87: test_reward: 15163.500000 ± 3297.983968, best_reward: 20106.400000 ± 7515.155397 in #18


Epoch #88: 1001it [00:03, 301.56it/s, env_step=88000, gradient_step=8800, len=153, n/ep=0, n/st=100, rew=9226.00]                                                                                 


Epoch #88: test_reward: 7729.200000 ± 3001.371447, best_reward: 20106.400000 ± 7515.155397 in #18


Epoch #89: 1001it [00:03, 321.05it/s, env_step=89000, gradient_step=8900, len=377, n/ep=0, n/st=100, rew=24489.00]                                                                                


Epoch #89: test_reward: 12639.100000 ± 4114.896632, best_reward: 20106.400000 ± 7515.155397 in #18


Epoch #90: 1001it [00:03, 332.00it/s, env_step=90000, gradient_step=9000, len=111, n/ep=0, n/st=100, rew=5433.00]                                                                                 


Epoch #90: test_reward: 8437.400000 ± 2141.524887, best_reward: 20106.400000 ± 7515.155397 in #18


Epoch #91: 1001it [00:03, 250.41it/s, env_step=91000, gradient_step=9100, len=215, n/ep=0, n/st=100, rew=12300.75]                                                                                


Epoch #91: test_reward: 4739.200000 ± 3411.486386, best_reward: 20106.400000 ± 7515.155397 in #18


Epoch #92: 1001it [00:03, 309.50it/s, env_step=92000, gradient_step=9200, len=171, n/ep=2, n/st=100, rew=9445.50]                                                                                 


Epoch #92: test_reward: 7023.300000 ± 5810.644612, best_reward: 20106.400000 ± 7515.155397 in #18


Epoch #93: 1001it [00:03, 277.49it/s, env_step=93000, gradient_step=9300, len=177, n/ep=0, n/st=100, rew=10756.00]                                                                                


Epoch #93: test_reward: 13688.100000 ± 3877.722436, best_reward: 20106.400000 ± 7515.155397 in #18


Epoch #94: 1001it [00:03, 325.50it/s, env_step=94000, gradient_step=9400, len=184, n/ep=2, n/st=100, rew=10929.00]                                                                                


Epoch #94: test_reward: 12973.000000 ± 7066.553304, best_reward: 20106.400000 ± 7515.155397 in #18


Epoch #95: 1001it [00:02, 341.59it/s, env_step=95000, gradient_step=9500, len=179, n/ep=1, n/st=100, rew=11070.50]                                                                                


Epoch #95: test_reward: 6383.200000 ± 2555.527805, best_reward: 20106.400000 ± 7515.155397 in #18


Epoch #96: 1001it [00:03, 296.63it/s, env_step=96000, gradient_step=9600, len=147, n/ep=0, n/st=100, rew=8372.67]                                                                                 


Epoch #96: test_reward: 10349.100000 ± 3295.894854, best_reward: 20106.400000 ± 7515.155397 in #18


Epoch #97: 1001it [00:03, 305.69it/s, env_step=97000, gradient_step=9700, len=113, n/ep=2, n/st=100, rew=6229.50]                                                                                 


Epoch #97: test_reward: 14185.500000 ± 5505.639622, best_reward: 20106.400000 ± 7515.155397 in #18


Epoch #98: 1001it [00:03, 319.67it/s, env_step=98000, gradient_step=9800, len=132, n/ep=0, n/st=100, rew=7231.50]                                                                                 


Epoch #98: test_reward: 12768.200000 ± 6403.439479, best_reward: 20106.400000 ± 7515.155397 in #18


Epoch #99: 1001it [00:03, 307.81it/s, env_step=99000, gradient_step=9900, len=199, n/ep=1, n/st=100, rew=12490.50]                                                                                


Epoch #99: test_reward: 11613.900000 ± 4383.942003, best_reward: 20106.400000 ± 7515.155397 in #18


Epoch #100: 1001it [00:03, 287.88it/s, env_step=100000, gradient_step=10000, len=204, n/ep=1, n/st=100, rew=11762.00]                                                                             


Epoch #100: test_reward: 7183.000000 ± 6100.375677, best_reward: 20106.400000 ± 7515.155397 in #18


Epoch #101: 1001it [00:03, 302.77it/s, env_step=101000, gradient_step=10100, len=393, n/ep=0, n/st=100, rew=27111.00]                                                                             


Epoch #101: test_reward: 10803.000000 ± 4836.251710, best_reward: 20106.400000 ± 7515.155397 in #18


Epoch #102: 1001it [00:03, 281.79it/s, env_step=102000, gradient_step=10200, len=217, n/ep=0, n/st=100, rew=13133.00]                                                                             


Epoch #102: test_reward: 10725.600000 ± 5796.837469, best_reward: 20106.400000 ± 7515.155397 in #18


Epoch #103: 1001it [00:03, 277.71it/s, env_step=103000, gradient_step=10300, len=176, n/ep=0, n/st=100, rew=9368.50]                                                                              


Epoch #103: test_reward: 11429.300000 ± 4427.829898, best_reward: 20106.400000 ± 7515.155397 in #18


Epoch #104: 1001it [00:03, 307.40it/s, env_step=104000, gradient_step=10400, len=191, n/ep=0, n/st=100, rew=10183.00]                                                                             


Epoch #104: test_reward: 8912.000000 ± 4089.041159, best_reward: 20106.400000 ± 7515.155397 in #18


Epoch #105: 1001it [00:03, 287.38it/s, env_step=105000, gradient_step=10500, len=159, n/ep=0, n/st=100, rew=8182.00]                                                                              


Epoch #105: test_reward: 10381.000000 ± 8100.516045, best_reward: 20106.400000 ± 7515.155397 in #18


Epoch #106: 1001it [00:03, 319.76it/s, env_step=106000, gradient_step=10600, len=147, n/ep=1, n/st=100, rew=7697.00]                                                                              


Epoch #106: test_reward: 11392.900000 ± 4364.211371, best_reward: 20106.400000 ± 7515.155397 in #18


Epoch #107: 1001it [00:03, 275.22it/s, env_step=107000, gradient_step=10700, len=155, n/ep=0, n/st=100, rew=8765.25]                                                                              


Epoch #107: test_reward: 8043.000000 ± 4653.409180, best_reward: 20106.400000 ± 7515.155397 in #18


Epoch #108: 1001it [00:03, 281.09it/s, env_step=108000, gradient_step=10800, len=164, n/ep=1, n/st=100, rew=8529.00]                                                                              


Epoch #108: test_reward: 11143.700000 ± 5802.451638, best_reward: 20106.400000 ± 7515.155397 in #18


Epoch #109: 1001it [00:03, 330.18it/s, env_step=109000, gradient_step=10900, len=123, n/ep=3, n/st=100, rew=7271.33]                                                                              


Epoch #109: test_reward: 9593.700000 ± 5148.216644, best_reward: 20106.400000 ± 7515.155397 in #18


Epoch #110: 1001it [00:03, 322.53it/s, env_step=110000, gradient_step=11000, len=204, n/ep=0, n/st=100, rew=11485.00]                                                                             


Epoch #110: test_reward: 8772.800000 ± 8258.442333, best_reward: 20106.400000 ± 7515.155397 in #18


Epoch #111: 1001it [00:03, 279.26it/s, env_step=111000, gradient_step=11100, len=122, n/ep=0, n/st=100, rew=6593.83]                                                                              


Epoch #111: test_reward: 8657.400000 ± 4924.404618, best_reward: 20106.400000 ± 7515.155397 in #18


Epoch #112: 1001it [00:03, 265.61it/s, env_step=112000, gradient_step=11200, len=141, n/ep=1, n/st=100, rew=8012.00]                                                                              


Epoch #112: test_reward: 9744.500000 ± 3658.554475, best_reward: 20106.400000 ± 7515.155397 in #18


Epoch #113: 1001it [00:03, 297.39it/s, env_step=113000, gradient_step=11300, len=160, n/ep=2, n/st=100, rew=10322.00]                                                                             


Epoch #113: test_reward: 10680.400000 ± 5477.449994, best_reward: 20106.400000 ± 7515.155397 in #18


Epoch #114: 1001it [00:03, 311.88it/s, env_step=114000, gradient_step=11400, len=158, n/ep=1, n/st=100, rew=9119.00]                                                                              


Epoch #114: test_reward: 8637.200000 ± 6530.856066, best_reward: 20106.400000 ± 7515.155397 in #18


Epoch #115: 1001it [00:03, 301.57it/s, env_step=115000, gradient_step=11500, len=154, n/ep=0, n/st=100, rew=9070.00]                                                                              


Epoch #115: test_reward: 4885.900000 ± 2996.359673, best_reward: 20106.400000 ± 7515.155397 in #18


Epoch #116: 1001it [00:03, 269.30it/s, env_step=116000, gradient_step=11600, len=160, n/ep=3, n/st=100, rew=9363.67]                                                                              


Epoch #116: test_reward: 5620.100000 ± 4171.417612, best_reward: 20106.400000 ± 7515.155397 in #18


Epoch #117: 1001it [00:04, 245.91it/s, env_step=117000, gradient_step=11700, len=161, n/ep=0, n/st=100, rew=10191.50]                                                                             


Epoch #117: test_reward: 6275.500000 ± 4290.260581, best_reward: 20106.400000 ± 7515.155397 in #18


Epoch #118: 1001it [00:03, 254.91it/s, env_step=118000, gradient_step=11800, len=127, n/ep=1, n/st=100, rew=7573.50]                                                                              


Epoch #118: test_reward: 7329.700000 ± 3752.300096, best_reward: 20106.400000 ± 7515.155397 in #18


Epoch #119: 1001it [00:03, 259.10it/s, env_step=119000, gradient_step=11900, len=193, n/ep=1, n/st=100, rew=11931.00]                                                                             


Epoch #119: test_reward: 11869.400000 ± 8315.617803, best_reward: 20106.400000 ± 7515.155397 in #18


Epoch #120: 1001it [00:03, 325.14it/s, env_step=120000, gradient_step=12000, len=237, n/ep=1, n/st=100, rew=14803.00]                                                                             


Epoch #120: test_reward: 9076.900000 ± 2986.934899, best_reward: 20106.400000 ± 7515.155397 in #18


Epoch #121: 1001it [00:03, 311.88it/s, env_step=121000, gradient_step=12100, len=146, n/ep=2, n/st=100, rew=8732.00]                                                                              


Epoch #121: test_reward: 14231.900000 ± 6284.493988, best_reward: 20106.400000 ± 7515.155397 in #18


Epoch #122: 1001it [00:03, 272.07it/s, env_step=122000, gradient_step=12200, len=112, n/ep=1, n/st=100, rew=6590.00]                                                                              


Epoch #122: test_reward: 13382.300000 ± 4467.565065, best_reward: 20106.400000 ± 7515.155397 in #18


Epoch #123: 1001it [00:03, 267.92it/s, env_step=123000, gradient_step=12300, len=162, n/ep=0, n/st=100, rew=9701.00]                                                                              


Epoch #123: test_reward: 12211.500000 ± 6972.862644, best_reward: 20106.400000 ± 7515.155397 in #18


Epoch #124: 1001it [00:03, 293.18it/s, env_step=124000, gradient_step=12400, len=181, n/ep=1, n/st=100, rew=11028.00]                                                                             


Epoch #124: test_reward: 10416.500000 ± 5834.307058, best_reward: 20106.400000 ± 7515.155397 in #18


Epoch #125: 1001it [00:03, 266.13it/s, env_step=125000, gradient_step=12500, len=109, n/ep=1, n/st=100, rew=5818.50]                                                                              


Epoch #125: test_reward: 11835.400000 ± 4124.946744, best_reward: 20106.400000 ± 7515.155397 in #18


Epoch #126: 1001it [00:03, 306.13it/s, env_step=126000, gradient_step=12600, len=93, n/ep=0, n/st=100, rew=5088.00]                                                                               


Epoch #126: test_reward: 7724.200000 ± 4107.775208, best_reward: 20106.400000 ± 7515.155397 in #18


Epoch #127: 1001it [00:03, 284.21it/s, env_step=127000, gradient_step=12700, len=97, n/ep=1, n/st=100, rew=5469.00]                                                                               


Epoch #127: test_reward: 13775.300000 ± 3682.783541, best_reward: 20106.400000 ± 7515.155397 in #18


Epoch #128: 1001it [00:03, 329.27it/s, env_step=128000, gradient_step=12800, len=231, n/ep=0, n/st=100, rew=14438.50]                                                                             


Epoch #128: test_reward: 16751.800000 ± 8451.786625, best_reward: 20106.400000 ± 7515.155397 in #18


Epoch #129: 1001it [00:03, 309.64it/s, env_step=129000, gradient_step=12900, len=87, n/ep=0, n/st=100, rew=3860.00]                                                                               


Epoch #129: test_reward: 12039.800000 ± 5525.673367, best_reward: 20106.400000 ± 7515.155397 in #18


Epoch #130: 1001it [00:03, 277.76it/s, env_step=130000, gradient_step=13000, len=261, n/ep=1, n/st=100, rew=15827.00]                                                                             


Epoch #130: test_reward: 4706.500000 ± 2565.279761, best_reward: 20106.400000 ± 7515.155397 in #18


Epoch #131: 1001it [00:03, 253.40it/s, env_step=131000, gradient_step=13100, len=130, n/ep=1, n/st=100, rew=6618.00]                                                                              


Epoch #131: test_reward: 8916.900000 ± 2963.672129, best_reward: 20106.400000 ± 7515.155397 in #18


Epoch #132: 1001it [00:03, 282.45it/s, env_step=132000, gradient_step=13200, len=138, n/ep=3, n/st=100, rew=7578.83]                                                                              


Epoch #132: test_reward: 6556.100000 ± 3196.885749, best_reward: 20106.400000 ± 7515.155397 in #18


Epoch #133: 1001it [00:03, 283.98it/s, env_step=133000, gradient_step=13300, len=129, n/ep=1, n/st=100, rew=7145.00]                                                                              


Epoch #133: test_reward: 10935.500000 ± 4980.819395, best_reward: 20106.400000 ± 7515.155397 in #18


Epoch #134: 1001it [00:03, 253.02it/s, env_step=134000, gradient_step=13400, len=148, n/ep=1, n/st=100, rew=7880.50]                                                                              


Epoch #134: test_reward: 11880.100000 ± 6514.141332, best_reward: 20106.400000 ± 7515.155397 in #18


Epoch #135: 1001it [00:04, 244.81it/s, env_step=135000, gradient_step=13500, len=165, n/ep=0, n/st=100, rew=10435.00]                                                                             


Epoch #135: test_reward: 16123.300000 ± 6230.500928, best_reward: 20106.400000 ± 7515.155397 in #18


Epoch #136: 1001it [00:03, 292.62it/s, env_step=136000, gradient_step=13600, len=175, n/ep=1, n/st=100, rew=8682.50]                                                                              


Epoch #136: test_reward: 9475.100000 ± 3652.549259, best_reward: 20106.400000 ± 7515.155397 in #18


Epoch #137: 1001it [00:03, 304.09it/s, env_step=137000, gradient_step=13700, len=145, n/ep=0, n/st=100, rew=7174.00]                                                                              


Epoch #137: test_reward: 10883.500000 ± 5561.437750, best_reward: 20106.400000 ± 7515.155397 in #18


Epoch #138: 1001it [00:03, 324.36it/s, env_step=138000, gradient_step=13800, len=214, n/ep=0, n/st=100, rew=13038.00]                                                                             


Epoch #138: test_reward: 10666.100000 ± 5765.232961, best_reward: 20106.400000 ± 7515.155397 in #18


Epoch #139: 1001it [00:03, 275.11it/s, env_step=139000, gradient_step=13900, len=30, n/ep=1, n/st=100, rew=1040.00]                                                                               


Epoch #139: test_reward: 13219.400000 ± 4904.191171, best_reward: 20106.400000 ± 7515.155397 in #18


Epoch #140: 1001it [00:03, 329.11it/s, env_step=140000, gradient_step=14000, len=240, n/ep=1, n/st=100, rew=15330.50]                                                                             


Epoch #140: test_reward: 7883.200000 ± 5218.869606, best_reward: 20106.400000 ± 7515.155397 in #18


Epoch #141: 1001it [00:03, 321.89it/s, env_step=141000, gradient_step=14100, len=127, n/ep=2, n/st=100, rew=7173.75]                                                                              


Epoch #141: test_reward: 7291.600000 ± 3595.014303, best_reward: 20106.400000 ± 7515.155397 in #18


Epoch #142: 1001it [00:03, 250.97it/s, env_step=142000, gradient_step=14200, len=181, n/ep=0, n/st=100, rew=11191.50]                                                                             


Epoch #142: test_reward: 13100.200000 ± 8545.975670, best_reward: 20106.400000 ± 7515.155397 in #18


Epoch #143: 1001it [00:03, 258.55it/s, env_step=143000, gradient_step=14300, len=136, n/ep=1, n/st=100, rew=7619.00]                                                                              


Epoch #143: test_reward: 12124.500000 ± 3442.028159, best_reward: 20106.400000 ± 7515.155397 in #18


Epoch #144: 1001it [00:03, 258.85it/s, env_step=144000, gradient_step=14400, len=104, n/ep=2, n/st=100, rew=5226.00]                                                                              


Epoch #144: test_reward: 8384.700000 ± 3783.581400, best_reward: 20106.400000 ± 7515.155397 in #18


Epoch #145: 1001it [00:03, 288.62it/s, env_step=145000, gradient_step=14500, len=209, n/ep=0, n/st=100, rew=13236.00]                                                                             


Epoch #145: test_reward: 12944.300000 ± 3401.190558, best_reward: 20106.400000 ± 7515.155397 in #18


Epoch #146: 1001it [00:03, 276.05it/s, env_step=146000, gradient_step=14600, len=150, n/ep=0, n/st=100, rew=9215.50]                                                                              


Epoch #146: test_reward: 8186.800000 ± 1967.555275, best_reward: 20106.400000 ± 7515.155397 in #18


Epoch #147: 1001it [00:03, 275.55it/s, env_step=147000, gradient_step=14700, len=150, n/ep=1, n/st=100, rew=7736.00]                                                                              


Epoch #147: test_reward: 5078.000000 ± 3993.150410, best_reward: 20106.400000 ± 7515.155397 in #18


Epoch #148: 1001it [00:03, 318.31it/s, env_step=148000, gradient_step=14800, len=136, n/ep=0, n/st=100, rew=7460.50]                                                                              


Epoch #148: test_reward: 7820.100000 ± 4664.223246, best_reward: 20106.400000 ± 7515.155397 in #18


Epoch #149: 1001it [00:03, 280.94it/s, env_step=149000, gradient_step=14900, len=156, n/ep=2, n/st=100, rew=9359.50]                                                                              


Epoch #149: test_reward: 11262.900000 ± 5846.050640, best_reward: 20106.400000 ± 7515.155397 in #18


Epoch #150: 1001it [00:03, 297.26it/s, env_step=150000, gradient_step=15000, len=180, n/ep=0, n/st=100, rew=10765.50]                                                                             


Epoch #150: test_reward: 9637.700000 ± 4794.174466, best_reward: 20106.400000 ± 7515.155397 in #18

InfoStats(gradient_step=15000, best_reward=20106.4, best_reward_std=7515.155396929594, train_step=150000, train_episode=822, test_step=316220, test_episode=1510, timing=TimingStats(total_time=756.1901512145996, train_time=516.0212194919586, train_time_collect=59.08205056190491, train_time_update=449.4236614704132, test_time=240.168931722641, update_speed=290.6857205362221))

(the trained policy can be accessed via policy.policies[agents[1]])
Train with agent_opponent #4


Epoch #1: 1001it [00:02, 425.54it/s, env_step=1000, gradient_step=100, len=0, n/ep=0, n/st=100, rew=0.00]                                                                                         


Epoch #1: test_reward: 9141.700000 ± 3028.735778, best_reward: 11766.600000 ± 5636.648245 in #0


Epoch #2: 1001it [00:02, 339.14it/s, env_step=2000, gradient_step=200, len=0, n/ep=0, n/st=100, rew=0.00]                                                                                         


Epoch #2: test_reward: 5608.900000 ± 3532.160938, best_reward: 11766.600000 ± 5636.648245 in #0


Epoch #3: 1001it [00:02, 368.42it/s, env_step=3000, gradient_step=300, len=0, n/ep=0, n/st=100, rew=0.00]                                                                                         


Epoch #3: test_reward: 14504.100000 ± 2932.632757, best_reward: 14504.100000 ± 2932.632757 in #3


Epoch #4: 1001it [00:02, 370.71it/s, env_step=4000, gradient_step=400, len=0, n/ep=0, n/st=100, rew=0.00]                                                                                         


Epoch #4: test_reward: 13819.000000 ± 4140.433745, best_reward: 14504.100000 ± 2932.632757 in #3


Epoch #5: 1001it [00:02, 377.07it/s, env_step=5000, gradient_step=500, len=0, n/ep=0, n/st=100, rew=0.00]                                                                                         


Epoch #5: test_reward: 13827.300000 ± 4998.068748, best_reward: 14504.100000 ± 2932.632757 in #3


Epoch #6: 1001it [00:02, 403.92it/s, env_step=6000, gradient_step=600, len=0, n/ep=0, n/st=100, rew=0.00]                                                                                         


Epoch #6: test_reward: 15740.200000 ± 7639.742794, best_reward: 15740.200000 ± 7639.742794 in #6


Epoch #7: 1001it [00:02, 337.22it/s, env_step=7000, gradient_step=700, len=0, n/ep=0, n/st=100, rew=0.00]                                                                                         


Epoch #7: test_reward: 12289.400000 ± 4892.342102, best_reward: 15740.200000 ± 7639.742794 in #6


Epoch #8: 1001it [00:02, 370.46it/s, env_step=8000, gradient_step=800, len=80, n/ep=1, n/st=100, rew=1887.50]                                                                                     


Epoch #8: test_reward: 7442.200000 ± 3457.887268, best_reward: 15740.200000 ± 7639.742794 in #6


Epoch #9: 1001it [00:02, 334.08it/s, env_step=9000, gradient_step=900, len=90, n/ep=2, n/st=100, rew=2569.75]                                                                                     


Epoch #9: test_reward: 13850.500000 ± 5814.791815, best_reward: 15740.200000 ± 7639.742794 in #6


Epoch #10: 1001it [00:02, 354.52it/s, env_step=10000, gradient_step=1000, len=95, n/ep=0, n/st=100, rew=4222.00]                                                                                  


Epoch #10: test_reward: 12581.600000 ± 4804.879482, best_reward: 15740.200000 ± 7639.742794 in #6


Epoch #11: 1001it [00:03, 324.11it/s, env_step=11000, gradient_step=1100, len=109, n/ep=0, n/st=100, rew=4345.00]                                                                                 


Epoch #11: test_reward: 10507.400000 ± 3363.751275, best_reward: 15740.200000 ± 7639.742794 in #6


Epoch #12: 1001it [00:03, 291.20it/s, env_step=12000, gradient_step=1200, len=119, n/ep=0, n/st=100, rew=5643.50]                                                                                 


Epoch #12: test_reward: 6123.100000 ± 2750.145613, best_reward: 15740.200000 ± 7639.742794 in #6


Epoch #13: 1001it [00:02, 372.99it/s, env_step=13000, gradient_step=1300, len=130, n/ep=1, n/st=100, rew=5330.00]                                                                                 


Epoch #13: test_reward: 8681.400000 ± 6243.057267, best_reward: 15740.200000 ± 7639.742794 in #6


Epoch #14: 1001it [00:02, 334.76it/s, env_step=14000, gradient_step=1400, len=136, n/ep=0, n/st=100, rew=5400.30]                                                                                 


Epoch #14: test_reward: 8371.600000 ± 3589.108279, best_reward: 15740.200000 ± 7639.742794 in #6


Epoch #15: 1001it [00:03, 271.29it/s, env_step=15000, gradient_step=1500, len=150, n/ep=2, n/st=100, rew=7910.75]                                                                                 


Epoch #15: test_reward: 11985.400000 ± 1293.685294, best_reward: 15740.200000 ± 7639.742794 in #6


Epoch #16: 1001it [00:03, 285.48it/s, env_step=16000, gradient_step=1600, len=66, n/ep=0, n/st=100, rew=3154.00]                                                                                  


Epoch #16: test_reward: 10942.400000 ± 5772.915316, best_reward: 15740.200000 ± 7639.742794 in #6


Epoch #17: 1001it [00:03, 291.63it/s, env_step=17000, gradient_step=1700, len=169, n/ep=0, n/st=100, rew=8587.00]                                                                                 


Epoch #17: test_reward: 8583.200000 ± 3524.425082, best_reward: 15740.200000 ± 7639.742794 in #6


Epoch #18: 1001it [00:03, 268.52it/s, env_step=18000, gradient_step=1800, len=180, n/ep=1, n/st=100, rew=8105.00]                                                                                 


Epoch #18: test_reward: 12817.100000 ± 5075.514800, best_reward: 15740.200000 ± 7639.742794 in #6


Epoch #19: 1001it [00:03, 287.71it/s, env_step=19000, gradient_step=1900, len=190, n/ep=1, n/st=100, rew=9424.00]                                                                                 


Epoch #19: test_reward: 10586.800000 ± 3078.983982, best_reward: 15740.200000 ± 7639.742794 in #6


Epoch #20: 1001it [00:03, 326.42it/s, env_step=20000, gradient_step=2000, len=200, n/ep=2, n/st=100, rew=9581.00]                                                                                 


Epoch #20: test_reward: 12679.600000 ± 4023.774402, best_reward: 15740.200000 ± 7639.742794 in #6


Epoch #21: 1001it [00:03, 316.40it/s, env_step=21000, gradient_step=2100, len=207, n/ep=0, n/st=100, rew=11641.00]                                                                                


Epoch #21: test_reward: 10311.000000 ± 3650.327465, best_reward: 15740.200000 ± 7639.742794 in #6


Epoch #22: 1001it [00:02, 369.39it/s, env_step=22000, gradient_step=2200, len=220, n/ep=1, n/st=100, rew=11596.00]                                                                                


Epoch #22: test_reward: 9531.800000 ± 3638.311526, best_reward: 15740.200000 ± 7639.742794 in #6


Epoch #23: 1001it [00:02, 339.15it/s, env_step=23000, gradient_step=2300, len=230, n/ep=1, n/st=100, rew=10262.00]                                                                                


Epoch #23: test_reward: 7240.600000 ± 3587.854434, best_reward: 15740.200000 ± 7639.742794 in #6


Epoch #24: 1001it [00:02, 358.10it/s, env_step=24000, gradient_step=2400, len=51, n/ep=0, n/st=100, rew=1751.00]                                                                                  


Epoch #24: test_reward: 11190.800000 ± 3872.821421, best_reward: 15740.200000 ± 7639.742794 in #6


Epoch #25: 1001it [00:03, 279.99it/s, env_step=25000, gradient_step=2500, len=100, n/ep=1, n/st=100, rew=4024.00]                                                                                 


Epoch #25: test_reward: 9107.000000 ± 3257.223112, best_reward: 15740.200000 ± 7639.742794 in #6


Epoch #26: 1001it [00:03, 309.48it/s, env_step=26000, gradient_step=2600, len=260, n/ep=1, n/st=100, rew=12939.00]                                                                                


Epoch #26: test_reward: 7416.200000 ± 3300.959642, best_reward: 15740.200000 ± 7639.742794 in #6


Epoch #27: 1001it [00:03, 298.56it/s, env_step=27000, gradient_step=2700, len=270, n/ep=1, n/st=100, rew=12797.00]                                                                                


Epoch #27: test_reward: 11945.100000 ± 4804.921653, best_reward: 15740.200000 ± 7639.742794 in #6


Epoch #28: 1001it [00:02, 352.22it/s, env_step=28000, gradient_step=2800, len=279, n/ep=0, n/st=100, rew=15925.00]                                                                                


Epoch #28: test_reward: 11837.500000 ± 4062.714099, best_reward: 15740.200000 ± 7639.742794 in #6


Epoch #29: 1001it [00:03, 301.66it/s, env_step=29000, gradient_step=2900, len=217, n/ep=2, n/st=100, rew=11446.00]                                                                                


Epoch #29: test_reward: 9302.000000 ± 4516.139148, best_reward: 15740.200000 ± 7639.742794 in #6


Epoch #30: 1001it [00:03, 299.30it/s, env_step=30000, gradient_step=3000, len=135, n/ep=0, n/st=100, rew=6557.50]                                                                                 


Epoch #30: test_reward: 14626.500000 ± 5989.182369, best_reward: 15740.200000 ± 7639.742794 in #6


Epoch #31: 1001it [00:03, 276.10it/s, env_step=31000, gradient_step=3100, len=152, n/ep=0, n/st=100, rew=8515.50]                                                                                 


Epoch #31: test_reward: 13387.200000 ± 5354.472482, best_reward: 15740.200000 ± 7639.742794 in #6


Epoch #32: 1001it [00:03, 251.88it/s, env_step=32000, gradient_step=3200, len=131, n/ep=0, n/st=100, rew=6774.25]                                                                                 


Epoch #32: test_reward: 12075.100000 ± 3664.888169, best_reward: 15740.200000 ± 7639.742794 in #6


Epoch #33: 1001it [00:03, 287.08it/s, env_step=33000, gradient_step=3300, len=139, n/ep=0, n/st=100, rew=7824.00]                                                                                 


Epoch #33: test_reward: 11342.400000 ± 4813.821251, best_reward: 15740.200000 ± 7639.742794 in #6


Epoch #34: 1001it [00:03, 309.48it/s, env_step=34000, gradient_step=3400, len=174, n/ep=0, n/st=100, rew=9147.00]                                                                                 


Epoch #34: test_reward: 8830.700000 ± 3491.170922, best_reward: 15740.200000 ± 7639.742794 in #6


Epoch #35: 1001it [00:03, 255.89it/s, env_step=35000, gradient_step=3500, len=222, n/ep=2, n/st=100, rew=12462.00]                                                                                


Epoch #35: test_reward: 11815.200000 ± 6652.315113, best_reward: 15740.200000 ± 7639.742794 in #6


Epoch #36: 1001it [00:03, 327.15it/s, env_step=36000, gradient_step=3600, len=178, n/ep=0, n/st=100, rew=10629.00]                                                                                


Epoch #36: test_reward: 14267.800000 ± 1829.623940, best_reward: 15740.200000 ± 7639.742794 in #6


Epoch #37: 1001it [00:02, 377.66it/s, env_step=37000, gradient_step=3700, len=144, n/ep=0, n/st=100, rew=7484.50]                                                                                 


Epoch #37: test_reward: 9726.000000 ± 4562.443424, best_reward: 15740.200000 ± 7639.742794 in #6


Epoch #38: 1001it [00:03, 277.25it/s, env_step=38000, gradient_step=3800, len=157, n/ep=0, n/st=100, rew=7601.50]                                                                                 


Epoch #38: test_reward: 15736.100000 ± 5093.664112, best_reward: 15740.200000 ± 7639.742794 in #6


Epoch #39: 1001it [00:02, 339.52it/s, env_step=39000, gradient_step=3900, len=160, n/ep=0, n/st=100, rew=8801.50]                                                                                 


Epoch #39: test_reward: 12960.500000 ± 5196.336541, best_reward: 15740.200000 ± 7639.742794 in #6


Epoch #40: 1001it [00:03, 323.91it/s, env_step=40000, gradient_step=4000, len=186, n/ep=1, n/st=100, rew=9472.50]                                                                                 


Epoch #40: test_reward: 8556.000000 ± 3661.398503, best_reward: 15740.200000 ± 7639.742794 in #6


Epoch #41: 1001it [00:03, 319.61it/s, env_step=41000, gradient_step=4100, len=276, n/ep=0, n/st=100, rew=16220.00]                                                                                


Epoch #41: test_reward: 12727.000000 ± 4773.604718, best_reward: 15740.200000 ± 7639.742794 in #6


Epoch #42: 1001it [00:03, 302.85it/s, env_step=42000, gradient_step=4200, len=150, n/ep=1, n/st=100, rew=7422.00]                                                                                 


Epoch #42: test_reward: 8971.500000 ± 3940.932485, best_reward: 15740.200000 ± 7639.742794 in #6


Epoch #43: 1001it [00:03, 277.96it/s, env_step=43000, gradient_step=4300, len=196, n/ep=0, n/st=100, rew=10987.83]                                                                                


Epoch #43: test_reward: 12400.000000 ± 5719.387729, best_reward: 15740.200000 ± 7639.742794 in #6


Epoch #44: 1001it [00:03, 311.15it/s, env_step=44000, gradient_step=4400, len=173, n/ep=1, n/st=100, rew=9303.50]                                                                                 


Epoch #44: test_reward: 13747.500000 ± 3046.075024, best_reward: 15740.200000 ± 7639.742794 in #6


Epoch #45: 1001it [00:03, 282.62it/s, env_step=45000, gradient_step=4500, len=166, n/ep=0, n/st=100, rew=8321.50]                                                                                 


Epoch #45: test_reward: 12580.400000 ± 5123.109743, best_reward: 15740.200000 ± 7639.742794 in #6


Epoch #46: 1001it [00:03, 295.37it/s, env_step=46000, gradient_step=4600, len=168, n/ep=2, n/st=100, rew=8589.75]                                                                                 


Epoch #46: test_reward: 12775.800000 ± 5835.628240, best_reward: 15740.200000 ± 7639.742794 in #6


Epoch #47: 1001it [00:02, 347.41it/s, env_step=47000, gradient_step=4700, len=134, n/ep=0, n/st=100, rew=6530.00]                                                                                 


Epoch #47: test_reward: 15354.800000 ± 3536.030000, best_reward: 15740.200000 ± 7639.742794 in #6


Epoch #48: 1001it [00:03, 294.94it/s, env_step=48000, gradient_step=4800, len=330, n/ep=0, n/st=100, rew=20036.00]                                                                                


Epoch #48: test_reward: 16888.900000 ± 4966.271226, best_reward: 16888.900000 ± 4966.271226 in #48


Epoch #49: 1001it [00:03, 287.94it/s, env_step=49000, gradient_step=4900, len=120, n/ep=0, n/st=100, rew=5173.00]                                                                                 


Epoch #49: test_reward: 11139.500000 ± 2720.330541, best_reward: 16888.900000 ± 4966.271226 in #48


Epoch #50: 1001it [00:03, 293.69it/s, env_step=50000, gradient_step=5000, len=255, n/ep=0, n/st=100, rew=14603.00]                                                                                


Epoch #50: test_reward: 15490.700000 ± 3148.715359, best_reward: 16888.900000 ± 4966.271226 in #48


Epoch #51: 1001it [00:03, 329.22it/s, env_step=51000, gradient_step=5100, len=259, n/ep=0, n/st=100, rew=14736.25]                                                                                


Epoch #51: test_reward: 12437.200000 ± 3643.322132, best_reward: 16888.900000 ± 4966.271226 in #48


Epoch #52: 1001it [00:03, 290.53it/s, env_step=52000, gradient_step=5200, len=245, n/ep=0, n/st=100, rew=13741.00]                                                                                


Epoch #52: test_reward: 15024.300000 ± 6413.009123, best_reward: 16888.900000 ± 4966.271226 in #48


Epoch #53: 1001it [00:03, 320.26it/s, env_step=53000, gradient_step=5300, len=142, n/ep=0, n/st=100, rew=7273.00]                                                                                 


Epoch #53: test_reward: 10597.600000 ± 4015.038934, best_reward: 16888.900000 ± 4966.271226 in #48


Epoch #54: 1001it [00:03, 280.59it/s, env_step=54000, gradient_step=5400, len=254, n/ep=0, n/st=100, rew=13461.00]                                                                                


Epoch #54: test_reward: 15459.600000 ± 5331.340229, best_reward: 16888.900000 ± 4966.271226 in #48


Epoch #55: 1001it [00:03, 292.10it/s, env_step=55000, gradient_step=5500, len=124, n/ep=0, n/st=100, rew=6216.00]                                                                                 


Epoch #55: test_reward: 13872.000000 ± 5279.952405, best_reward: 16888.900000 ± 4966.271226 in #48


Epoch #56: 1001it [00:03, 309.94it/s, env_step=56000, gradient_step=5600, len=169, n/ep=1, n/st=100, rew=8900.50]                                                                                 


Epoch #56: test_reward: 11304.000000 ± 3397.787604, best_reward: 16888.900000 ± 4966.271226 in #48


Epoch #57: 1001it [00:02, 341.58it/s, env_step=57000, gradient_step=5700, len=325, n/ep=0, n/st=100, rew=17829.50]                                                                                


Epoch #57: test_reward: 12658.100000 ± 5663.272295, best_reward: 16888.900000 ± 4966.271226 in #48


Epoch #58: 1001it [00:03, 314.33it/s, env_step=58000, gradient_step=5800, len=234, n/ep=1, n/st=100, rew=12396.50]                                                                                


Epoch #58: test_reward: 7747.700000 ± 4368.321143, best_reward: 16888.900000 ± 4966.271226 in #48


Epoch #59: 1001it [00:03, 296.37it/s, env_step=59000, gradient_step=5900, len=259, n/ep=2, n/st=100, rew=14438.75]                                                                                


Epoch #59: test_reward: 13112.400000 ± 6595.583616, best_reward: 16888.900000 ± 4966.271226 in #48


Epoch #60: 1001it [00:02, 347.07it/s, env_step=60000, gradient_step=6000, len=301, n/ep=0, n/st=100, rew=17142.50]                                                                                


Epoch #60: test_reward: 10873.700000 ± 6765.279123, best_reward: 16888.900000 ± 4966.271226 in #48


Epoch #61: 1001it [00:03, 316.55it/s, env_step=61000, gradient_step=6100, len=205, n/ep=1, n/st=100, rew=11332.00]                                                                                


Epoch #61: test_reward: 14339.000000 ± 5879.456591, best_reward: 16888.900000 ± 4966.271226 in #48


Epoch #62: 1001it [00:03, 285.43it/s, env_step=62000, gradient_step=6200, len=188, n/ep=0, n/st=100, rew=10651.75]                                                                                


Epoch #62: test_reward: 10135.200000 ± 7680.709665, best_reward: 16888.900000 ± 4966.271226 in #48


Epoch #63: 1001it [00:03, 300.91it/s, env_step=63000, gradient_step=6300, len=259, n/ep=1, n/st=100, rew=15745.00]                                                                                


Epoch #63: test_reward: 12279.100000 ± 5196.643772, best_reward: 16888.900000 ± 4966.271226 in #48


Epoch #64: 1001it [00:03, 281.64it/s, env_step=64000, gradient_step=6400, len=319, n/ep=1, n/st=100, rew=18122.00]                                                                                


Epoch #64: test_reward: 12737.800000 ± 5731.336001, best_reward: 16888.900000 ± 4966.271226 in #48


Epoch #65: 1001it [00:02, 345.29it/s, env_step=65000, gradient_step=6500, len=287, n/ep=0, n/st=100, rew=15698.00]                                                                                


Epoch #65: test_reward: 13549.600000 ± 6060.505032, best_reward: 16888.900000 ± 4966.271226 in #48


Epoch #66: 1001it [00:03, 316.08it/s, env_step=66000, gradient_step=6600, len=231, n/ep=2, n/st=100, rew=13665.50]                                                                                


Epoch #66: test_reward: 10279.500000 ± 4323.661487, best_reward: 16888.900000 ± 4966.271226 in #48


Epoch #67: 1001it [00:03, 293.20it/s, env_step=67000, gradient_step=6700, len=135, n/ep=3, n/st=100, rew=6531.67]                                                                                 


Epoch #67: test_reward: 9839.500000 ± 4095.459565, best_reward: 16888.900000 ± 4966.271226 in #48


Epoch #68: 1001it [00:03, 289.61it/s, env_step=68000, gradient_step=6800, len=225, n/ep=0, n/st=100, rew=12174.00]                                                                                


Epoch #68: test_reward: 12923.100000 ± 4822.000217, best_reward: 16888.900000 ± 4966.271226 in #48


Epoch #69: 1001it [00:03, 285.24it/s, env_step=69000, gradient_step=6900, len=83, n/ep=1, n/st=100, rew=3321.00]                                                                                  


Epoch #69: test_reward: 13291.100000 ± 4656.291388, best_reward: 16888.900000 ± 4966.271226 in #48


Epoch #70: 1001it [00:03, 287.95it/s, env_step=70000, gradient_step=7000, len=266, n/ep=0, n/st=100, rew=16344.50]                                                                                


Epoch #70: test_reward: 10592.400000 ± 4559.523554, best_reward: 16888.900000 ± 4966.271226 in #48


Epoch #71: 1001it [00:03, 322.82it/s, env_step=71000, gradient_step=7100, len=176, n/ep=0, n/st=100, rew=10212.00]                                                                                


Epoch #71: test_reward: 10999.000000 ± 4209.729445, best_reward: 16888.900000 ± 4966.271226 in #48


Epoch #72: 1001it [00:03, 296.49it/s, env_step=72000, gradient_step=7200, len=148, n/ep=1, n/st=100, rew=6509.00]                                                                                 


Epoch #72: test_reward: 8817.800000 ± 2667.640411, best_reward: 16888.900000 ± 4966.271226 in #48


Epoch #73: 1001it [00:03, 329.59it/s, env_step=73000, gradient_step=7300, len=70, n/ep=0, n/st=100, rew=2192.00]                                                                                  


Epoch #73: test_reward: 13269.500000 ± 7479.291815, best_reward: 16888.900000 ± 4966.271226 in #48


Epoch #74: 1001it [00:03, 304.96it/s, env_step=74000, gradient_step=7400, len=119, n/ep=1, n/st=100, rew=5440.00]                                                                                 


Epoch #74: test_reward: 7858.100000 ± 4806.061058, best_reward: 16888.900000 ± 4966.271226 in #48


Epoch #75: 1001it [00:03, 301.21it/s, env_step=75000, gradient_step=7500, len=186, n/ep=0, n/st=100, rew=10825.00]                                                                                


Epoch #75: test_reward: 10246.700000 ± 4374.038433, best_reward: 16888.900000 ± 4966.271226 in #48


Epoch #76: 1001it [00:03, 262.76it/s, env_step=76000, gradient_step=7600, len=320, n/ep=1, n/st=100, rew=19595.50]                                                                                


Epoch #76: test_reward: 9832.600000 ± 3734.140067, best_reward: 16888.900000 ± 4966.271226 in #48


Epoch #77: 1001it [00:03, 312.61it/s, env_step=77000, gradient_step=7700, len=158, n/ep=0, n/st=100, rew=7943.00]                                                                                 


Epoch #77: test_reward: 15726.300000 ± 5566.276979, best_reward: 16888.900000 ± 4966.271226 in #48


Epoch #78: 1001it [00:03, 317.79it/s, env_step=78000, gradient_step=7800, len=56, n/ep=0, n/st=100, rew=2088.00]                                                                                  


Epoch #78: test_reward: 11822.800000 ± 4085.230956, best_reward: 16888.900000 ± 4966.271226 in #48


Epoch #79: 1001it [00:03, 286.29it/s, env_step=79000, gradient_step=7900, len=157, n/ep=0, n/st=100, rew=9025.00]                                                                                 


Epoch #79: test_reward: 11645.500000 ± 5190.983900, best_reward: 16888.900000 ± 4966.271226 in #48


Epoch #80: 1001it [00:02, 334.31it/s, env_step=80000, gradient_step=8000, len=96, n/ep=0, n/st=100, rew=4193.50]                                                                                  


Epoch #80: test_reward: 13739.600000 ± 5189.103876, best_reward: 16888.900000 ± 4966.271226 in #48


Epoch #81: 1001it [00:03, 296.06it/s, env_step=81000, gradient_step=8100, len=159, n/ep=2, n/st=100, rew=7987.25]                                                                                 


Epoch #81: test_reward: 11059.700000 ± 2829.292776, best_reward: 16888.900000 ± 4966.271226 in #48


Epoch #82: 1001it [00:03, 305.18it/s, env_step=82000, gradient_step=8200, len=102, n/ep=0, n/st=100, rew=4775.00]                                                                                 


Epoch #82: test_reward: 8841.900000 ± 4870.885288, best_reward: 16888.900000 ± 4966.271226 in #48


Epoch #83: 1001it [00:03, 297.23it/s, env_step=83000, gradient_step=8300, len=250, n/ep=1, n/st=100, rew=15760.00]                                                                                


Epoch #83: test_reward: 8750.300000 ± 4303.401632, best_reward: 16888.900000 ± 4966.271226 in #48


Epoch #84: 1001it [00:03, 263.69it/s, env_step=84000, gradient_step=8400, len=170, n/ep=1, n/st=100, rew=8638.00]                                                                                 


Epoch #84: test_reward: 11179.600000 ± 2681.631563, best_reward: 16888.900000 ± 4966.271226 in #48


Epoch #85: 1001it [00:03, 297.91it/s, env_step=85000, gradient_step=8500, len=88, n/ep=0, n/st=100, rew=3887.50]                                                                                  


Epoch #85: test_reward: 11116.900000 ± 5428.043450, best_reward: 16888.900000 ± 4966.271226 in #48


Epoch #86: 1001it [00:03, 285.70it/s, env_step=86000, gradient_step=8600, len=203, n/ep=2, n/st=100, rew=11111.25]                                                                                


Epoch #86: test_reward: 10582.800000 ± 3877.292916, best_reward: 16888.900000 ± 4966.271226 in #48


Epoch #87: 1001it [00:03, 300.54it/s, env_step=87000, gradient_step=8700, len=200, n/ep=1, n/st=100, rew=11699.50]                                                                                


Epoch #87: test_reward: 11544.200000 ± 2161.374831, best_reward: 16888.900000 ± 4966.271226 in #48


Epoch #88: 1001it [00:03, 317.77it/s, env_step=88000, gradient_step=8800, len=40, n/ep=0, n/st=100, rew=1315.50]                                                                                  


Epoch #88: test_reward: 11613.300000 ± 5514.801176, best_reward: 16888.900000 ± 4966.271226 in #48


Epoch #89: 1001it [00:03, 302.41it/s, env_step=89000, gradient_step=8900, len=224, n/ep=2, n/st=100, rew=12534.00]                                                                                


Epoch #89: test_reward: 14562.900000 ± 5955.287490, best_reward: 16888.900000 ± 4966.271226 in #48


Epoch #90: 1001it [00:03, 264.28it/s, env_step=90000, gradient_step=9000, len=269, n/ep=0, n/st=100, rew=14874.50]                                                                                


Epoch #90: test_reward: 11917.300000 ± 7001.984262, best_reward: 16888.900000 ± 4966.271226 in #48


Epoch #91: 1001it [00:03, 308.06it/s, env_step=91000, gradient_step=9100, len=254, n/ep=0, n/st=100, rew=12406.00]                                                                                


Epoch #91: test_reward: 11501.500000 ± 5252.696379, best_reward: 16888.900000 ± 4966.271226 in #48


Epoch #92: 1001it [00:03, 312.06it/s, env_step=92000, gradient_step=9200, len=96, n/ep=1, n/st=100, rew=3809.00]                                                                                  


Epoch #92: test_reward: 8641.000000 ± 4046.426745, best_reward: 16888.900000 ± 4966.271226 in #48


Epoch #93: 1001it [00:03, 275.18it/s, env_step=93000, gradient_step=9300, len=188, n/ep=0, n/st=100, rew=9686.00]                                                                                 


Epoch #93: test_reward: 8845.300000 ± 3326.192179, best_reward: 16888.900000 ± 4966.271226 in #48


Epoch #94: 1001it [00:03, 296.29it/s, env_step=94000, gradient_step=9400, len=235, n/ep=0, n/st=100, rew=13696.00]                                                                                


Epoch #94: test_reward: 12752.400000 ± 6325.881618, best_reward: 16888.900000 ± 4966.271226 in #48


Epoch #95: 1001it [00:03, 297.41it/s, env_step=95000, gradient_step=9500, len=182, n/ep=1, n/st=100, rew=9175.00]                                                                                 


Epoch #95: test_reward: 12465.800000 ± 4311.701979, best_reward: 16888.900000 ± 4966.271226 in #48


Epoch #96: 1001it [00:02, 341.27it/s, env_step=96000, gradient_step=9600, len=184, n/ep=2, n/st=100, rew=10489.50]                                                                                


Epoch #96: test_reward: 9926.100000 ± 3543.539486, best_reward: 16888.900000 ± 4966.271226 in #48


Epoch #97: 1001it [00:03, 283.53it/s, env_step=97000, gradient_step=9700, len=137, n/ep=1, n/st=100, rew=6546.00]                                                                                 


Epoch #97: test_reward: 14132.600000 ± 5652.663730, best_reward: 16888.900000 ± 4966.271226 in #48


Epoch #98: 1001it [00:03, 267.14it/s, env_step=98000, gradient_step=9800, len=158, n/ep=1, n/st=100, rew=6488.50]                                                                                 


Epoch #98: test_reward: 12771.700000 ± 4485.157657, best_reward: 16888.900000 ± 4966.271226 in #48


Epoch #99: 1001it [00:03, 295.68it/s, env_step=99000, gradient_step=9900, len=74, n/ep=1, n/st=100, rew=3433.00]                                                                                  


Epoch #99: test_reward: 12374.300000 ± 5870.607124, best_reward: 16888.900000 ± 4966.271226 in #48


Epoch #100: 1001it [00:03, 314.97it/s, env_step=100000, gradient_step=10000, len=236, n/ep=0, n/st=100, rew=12538.00]                                                                             


Epoch #100: test_reward: 10815.900000 ± 2723.725406, best_reward: 16888.900000 ± 4966.271226 in #48


Epoch #101: 1001it [00:03, 288.54it/s, env_step=101000, gradient_step=10100, len=98, n/ep=2, n/st=100, rew=5000.50]                                                                               


Epoch #101: test_reward: 12891.600000 ± 6224.984598, best_reward: 16888.900000 ± 4966.271226 in #48


Epoch #102: 1001it [00:02, 340.49it/s, env_step=102000, gradient_step=10200, len=201, n/ep=2, n/st=100, rew=10868.25]                                                                             


Epoch #102: test_reward: 11084.000000 ± 6833.019596, best_reward: 16888.900000 ± 4966.271226 in #48


Epoch #103: 1001it [00:03, 310.28it/s, env_step=103000, gradient_step=10300, len=359, n/ep=0, n/st=100, rew=22072.00]                                                                             


Epoch #103: test_reward: 9719.900000 ± 4305.167789, best_reward: 16888.900000 ± 4966.271226 in #48


Epoch #104: 1001it [00:03, 304.74it/s, env_step=104000, gradient_step=10400, len=208, n/ep=1, n/st=100, rew=11453.00]                                                                             


Epoch #104: test_reward: 12234.900000 ± 5949.778340, best_reward: 16888.900000 ± 4966.271226 in #48


Epoch #105: 1001it [00:03, 322.69it/s, env_step=105000, gradient_step=10500, len=40, n/ep=0, n/st=100, rew=1761.00]                                                                               


Epoch #105: test_reward: 12295.200000 ± 2168.449806, best_reward: 16888.900000 ± 4966.271226 in #48


Epoch #106: 1001it [00:03, 291.83it/s, env_step=106000, gradient_step=10600, len=200, n/ep=0, n/st=100, rew=12579.00]                                                                             


Epoch #106: test_reward: 11973.000000 ± 3414.404458, best_reward: 16888.900000 ± 4966.271226 in #48


Epoch #107: 1001it [00:03, 313.03it/s, env_step=107000, gradient_step=10700, len=58, n/ep=0, n/st=100, rew=2840.00]                                                                               


Epoch #107: test_reward: 12446.500000 ± 5961.776199, best_reward: 16888.900000 ± 4966.271226 in #48


Epoch #108: 1001it [00:03, 279.28it/s, env_step=108000, gradient_step=10800, len=135, n/ep=0, n/st=100, rew=7382.00]                                                                              


Epoch #108: test_reward: 9092.100000 ± 4438.919180, best_reward: 16888.900000 ± 4966.271226 in #48


Epoch #109: 1001it [00:03, 289.35it/s, env_step=109000, gradient_step=10900, len=144, n/ep=1, n/st=100, rew=8488.50]                                                                              


Epoch #109: test_reward: 13134.600000 ± 6241.131952, best_reward: 16888.900000 ± 4966.271226 in #48


Epoch #110: 1001it [00:03, 296.45it/s, env_step=110000, gradient_step=11000, len=227, n/ep=0, n/st=100, rew=14047.50]                                                                             


Epoch #110: test_reward: 10600.500000 ± 2767.512719, best_reward: 16888.900000 ± 4966.271226 in #48


Epoch #111: 1001it [00:03, 328.11it/s, env_step=111000, gradient_step=11100, len=220, n/ep=1, n/st=100, rew=13083.00]                                                                             


Epoch #111: test_reward: 12922.200000 ± 6339.141563, best_reward: 16888.900000 ± 4966.271226 in #48


Epoch #112: 1001it [00:02, 339.40it/s, env_step=112000, gradient_step=11200, len=224, n/ep=0, n/st=100, rew=12819.50]                                                                             


Epoch #112: test_reward: 17590.100000 ± 3905.752385, best_reward: 17590.100000 ± 3905.752385 in #112


Epoch #113: 1001it [00:03, 300.97it/s, env_step=113000, gradient_step=11300, len=140, n/ep=1, n/st=100, rew=6204.00]                                                                              


Epoch #113: test_reward: 11063.100000 ± 6018.772673, best_reward: 17590.100000 ± 3905.752385 in #112


Epoch #114: 1001it [00:03, 283.89it/s, env_step=114000, gradient_step=11400, len=158, n/ep=2, n/st=100, rew=8841.75]                                                                              


Epoch #114: test_reward: 14442.300000 ± 4960.578193, best_reward: 17590.100000 ± 3905.752385 in #112


Epoch #115: 1001it [00:03, 260.97it/s, env_step=115000, gradient_step=11500, len=198, n/ep=0, n/st=100, rew=10574.00]                                                                             


Epoch #115: test_reward: 12009.400000 ± 4678.108853, best_reward: 17590.100000 ± 3905.752385 in #112


Epoch #116: 1001it [00:03, 266.79it/s, env_step=116000, gradient_step=11600, len=141, n/ep=0, n/st=100, rew=7260.00]                                                                              


Epoch #116: test_reward: 10348.400000 ± 2840.522248, best_reward: 17590.100000 ± 3905.752385 in #112


Epoch #117: 1001it [00:03, 321.94it/s, env_step=117000, gradient_step=11700, len=149, n/ep=0, n/st=100, rew=8009.00]                                                                              


Epoch #117: test_reward: 9436.600000 ± 4063.087698, best_reward: 17590.100000 ± 3905.752385 in #112


Epoch #118: 1001it [00:03, 282.90it/s, env_step=118000, gradient_step=11800, len=200, n/ep=2, n/st=100, rew=12015.25]                                                                             


Epoch #118: test_reward: 8140.600000 ± 3154.213823, best_reward: 17590.100000 ± 3905.752385 in #112


Epoch #119: 1001it [00:03, 292.34it/s, env_step=119000, gradient_step=11900, len=203, n/ep=1, n/st=100, rew=11474.50]                                                                             


Epoch #119: test_reward: 7511.600000 ± 3201.664074, best_reward: 17590.100000 ± 3905.752385 in #112


Epoch #120: 1001it [00:03, 267.84it/s, env_step=120000, gradient_step=12000, len=166, n/ep=1, n/st=100, rew=9723.00]                                                                              


Epoch #120: test_reward: 12493.100000 ± 6382.823614, best_reward: 17590.100000 ± 3905.752385 in #112


Epoch #121: 1001it [00:03, 296.11it/s, env_step=121000, gradient_step=12100, len=153, n/ep=0, n/st=100, rew=9207.00]                                                                              


Epoch #121: test_reward: 11394.900000 ± 4569.754358, best_reward: 17590.100000 ± 3905.752385 in #112


Epoch #122: 1001it [00:03, 281.22it/s, env_step=122000, gradient_step=12200, len=108, n/ep=0, n/st=100, rew=5773.00]                                                                              


Epoch #122: test_reward: 11815.300000 ± 3033.256272, best_reward: 17590.100000 ± 3905.752385 in #112


Epoch #123: 1001it [00:03, 274.28it/s, env_step=123000, gradient_step=12300, len=125, n/ep=0, n/st=100, rew=7178.50]                                                                              


Epoch #123: test_reward: 10979.000000 ± 4647.891436, best_reward: 17590.100000 ± 3905.752385 in #112


Epoch #124: 1001it [00:03, 279.27it/s, env_step=124000, gradient_step=12400, len=114, n/ep=0, n/st=100, rew=6440.50]                                                                              


Epoch #124: test_reward: 15915.400000 ± 6768.720827, best_reward: 17590.100000 ± 3905.752385 in #112


Epoch #125: 1001it [00:03, 254.02it/s, env_step=125000, gradient_step=12500, len=148, n/ep=0, n/st=100, rew=7960.00]                                                                              


Epoch #125: test_reward: 13800.600000 ± 8228.323817, best_reward: 17590.100000 ± 3905.752385 in #112


Epoch #126: 1001it [00:03, 250.60it/s, env_step=126000, gradient_step=12600, len=311, n/ep=0, n/st=100, rew=16656.00]                                                                             


Epoch #126: test_reward: 14701.600000 ± 7575.029125, best_reward: 17590.100000 ± 3905.752385 in #112


Epoch #127: 1001it [00:03, 275.51it/s, env_step=127000, gradient_step=12700, len=231, n/ep=0, n/st=100, rew=12171.00]                                                                             


Epoch #127: test_reward: 11398.600000 ± 2998.955225, best_reward: 17590.100000 ± 3905.752385 in #112


Epoch #128: 1001it [00:03, 309.28it/s, env_step=128000, gradient_step=12800, len=172, n/ep=0, n/st=100, rew=8526.25]                                                                              


Epoch #128: test_reward: 16150.100000 ± 5921.685038, best_reward: 17590.100000 ± 3905.752385 in #112


Epoch #129: 1001it [00:03, 269.93it/s, env_step=129000, gradient_step=12900, len=123, n/ep=0, n/st=100, rew=7093.50]                                                                              


Epoch #129: test_reward: 10323.700000 ± 6599.525893, best_reward: 17590.100000 ± 3905.752385 in #112


Epoch #130: 1001it [00:03, 299.15it/s, env_step=130000, gradient_step=13000, len=127, n/ep=0, n/st=100, rew=7049.00]                                                                              


Epoch #130: test_reward: 15674.400000 ± 5186.819376, best_reward: 17590.100000 ± 3905.752385 in #112


Epoch #131: 1001it [00:03, 258.28it/s, env_step=131000, gradient_step=13100, len=193, n/ep=0, n/st=100, rew=11360.50]                                                                             


Epoch #131: test_reward: 12620.800000 ± 4966.803898, best_reward: 17590.100000 ± 3905.752385 in #112


Epoch #132: 1001it [00:03, 269.79it/s, env_step=132000, gradient_step=13200, len=212, n/ep=0, n/st=100, rew=11831.00]                                                                             


Epoch #132: test_reward: 13202.200000 ± 7129.006576, best_reward: 17590.100000 ± 3905.752385 in #112


Epoch #133: 1001it [00:03, 314.52it/s, env_step=133000, gradient_step=13300, len=103, n/ep=0, n/st=100, rew=5526.00]                                                                              


Epoch #133: test_reward: 13107.500000 ± 5529.667680, best_reward: 17590.100000 ± 3905.752385 in #112


Epoch #134: 1001it [00:03, 288.08it/s, env_step=134000, gradient_step=13400, len=150, n/ep=1, n/st=100, rew=8108.50]                                                                              


Epoch #134: test_reward: 12396.400000 ± 4051.952399, best_reward: 17590.100000 ± 3905.752385 in #112


Epoch #135: 1001it [00:03, 277.39it/s, env_step=135000, gradient_step=13500, len=174, n/ep=0, n/st=100, rew=9838.25]                                                                              


Epoch #135: test_reward: 12183.500000 ± 7075.202545, best_reward: 17590.100000 ± 3905.752385 in #112


Epoch #136: 1001it [00:03, 297.88it/s, env_step=136000, gradient_step=13600, len=223, n/ep=1, n/st=100, rew=13046.00]                                                                             


Epoch #136: test_reward: 13770.100000 ± 5716.433424, best_reward: 17590.100000 ± 3905.752385 in #112


Epoch #137: 1001it [00:03, 312.46it/s, env_step=137000, gradient_step=13700, len=182, n/ep=1, n/st=100, rew=11025.00]                                                                             


Epoch #137: test_reward: 12292.300000 ± 2909.181055, best_reward: 17590.100000 ± 3905.752385 in #112


Epoch #138: 1001it [00:03, 255.61it/s, env_step=138000, gradient_step=13800, len=191, n/ep=4, n/st=100, rew=11015.75]                                                                             


Epoch #138: test_reward: 10346.400000 ± 2474.277398, best_reward: 17590.100000 ± 3905.752385 in #112


Epoch #139: 1001it [00:03, 326.53it/s, env_step=139000, gradient_step=13900, len=166, n/ep=0, n/st=100, rew=9499.75]                                                                              


Epoch #139: test_reward: 11966.900000 ± 2489.714701, best_reward: 17590.100000 ± 3905.752385 in #112


Epoch #140: 1001it [00:03, 305.80it/s, env_step=140000, gradient_step=14000, len=239, n/ep=2, n/st=100, rew=12811.75]                                                                             


Epoch #140: test_reward: 11468.700000 ± 3482.581831, best_reward: 17590.100000 ± 3905.752385 in #112


Epoch #141: 1001it [00:03, 304.64it/s, env_step=141000, gradient_step=14100, len=62, n/ep=0, n/st=100, rew=3168.00]                                                                               


Epoch #141: test_reward: 13571.600000 ± 4629.535769, best_reward: 17590.100000 ± 3905.752385 in #112


Epoch #142: 1001it [00:03, 260.22it/s, env_step=142000, gradient_step=14200, len=203, n/ep=2, n/st=100, rew=12152.50]                                                                             


Epoch #142: test_reward: 12332.000000 ± 2711.772372, best_reward: 17590.100000 ± 3905.752385 in #112


Epoch #143: 1001it [00:03, 276.98it/s, env_step=143000, gradient_step=14300, len=190, n/ep=0, n/st=100, rew=10789.00]                                                                             


Epoch #143: test_reward: 11363.900000 ± 3065.580677, best_reward: 17590.100000 ± 3905.752385 in #112


Epoch #144: 1001it [00:03, 263.00it/s, env_step=144000, gradient_step=14400, len=83, n/ep=1, n/st=100, rew=5019.50]                                                                               


Epoch #144: test_reward: 9932.300000 ± 4975.804981, best_reward: 17590.100000 ± 3905.752385 in #112


Epoch #145: 1001it [00:03, 302.12it/s, env_step=145000, gradient_step=14500, len=145, n/ep=0, n/st=100, rew=7465.50]                                                                              


Epoch #145: test_reward: 10923.900000 ± 6025.646728, best_reward: 17590.100000 ± 3905.752385 in #112


Epoch #146: 1001it [00:03, 305.05it/s, env_step=146000, gradient_step=14600, len=125, n/ep=1, n/st=100, rew=7552.50]                                                                              


Epoch #146: test_reward: 11908.100000 ± 5909.994018, best_reward: 17590.100000 ± 3905.752385 in #112


Epoch #147: 1001it [00:04, 247.49it/s, env_step=147000, gradient_step=14700, len=150, n/ep=0, n/st=100, rew=8604.50]                                                                              


Epoch #147: test_reward: 14057.800000 ± 6133.053168, best_reward: 17590.100000 ± 3905.752385 in #112


Epoch #148: 1001it [00:03, 320.57it/s, env_step=148000, gradient_step=14800, len=136, n/ep=0, n/st=100, rew=8648.00]                                                                              


Epoch #148: test_reward: 8186.000000 ± 3597.562286, best_reward: 17590.100000 ± 3905.752385 in #112


Epoch #149: 1001it [00:03, 305.71it/s, env_step=149000, gradient_step=14900, len=183, n/ep=1, n/st=100, rew=11099.00]                                                                             


Epoch #149: test_reward: 15782.000000 ± 7065.883950, best_reward: 17590.100000 ± 3905.752385 in #112


Epoch #150: 1001it [00:03, 323.33it/s, env_step=150000, gradient_step=15000, len=54, n/ep=0, n/st=100, rew=2727.50]                                                                               


Epoch #150: test_reward: 11517.000000 ± 5389.597851, best_reward: 17590.100000 ± 3905.752385 in #112

InfoStats(gradient_step=15000, best_reward=17590.1, best_reward_std=3905.7523846245044, train_step=150000, train_episode=792, test_step=332800, test_episode=1510, timing=TimingStats(total_time=746.4389843940735, train_time=499.56022453308105, train_time_collect=59.31385111808777, train_time_update=432.8522126674652, test_time=246.87875986099243, update_speed=300.2640975674134))

(the trained policy can be accessed via policy.policies[agents[1]])
Train with agent_opponent #9


Epoch #1: 1001it [00:02, 372.48it/s, env_step=1000, gradient_step=100, len=0, n/ep=0, n/st=100, rew=0.00]                                                                                         


Epoch #1: test_reward: 10549.900000 ± 5105.840762, best_reward: 11862.700000 ± 3451.361762 in #0


Epoch #2: 1001it [00:03, 327.30it/s, env_step=2000, gradient_step=200, len=0, n/ep=0, n/st=100, rew=0.00]                                                                                         


Epoch #2: test_reward: 9177.400000 ± 5596.746612, best_reward: 11862.700000 ± 3451.361762 in #0


Epoch #3: 1001it [00:02, 364.99it/s, env_step=3000, gradient_step=300, len=0, n/ep=0, n/st=100, rew=0.00]                                                                                         


Epoch #3: test_reward: 12017.200000 ± 4608.814355, best_reward: 12017.200000 ± 4608.814355 in #3


Epoch #4: 1001it [00:02, 411.61it/s, env_step=4000, gradient_step=400, len=0, n/ep=0, n/st=100, rew=0.00]                                                                                         


Epoch #4: test_reward: 8514.900000 ± 4367.471224, best_reward: 12017.200000 ± 4608.814355 in #3


Epoch #5: 1001it [00:02, 367.21it/s, env_step=5000, gradient_step=500, len=0, n/ep=0, n/st=100, rew=0.00]                                                                                         


Epoch #5: test_reward: 10583.700000 ± 5318.559280, best_reward: 12017.200000 ± 4608.814355 in #3


Epoch #6: 1001it [00:03, 319.39it/s, env_step=6000, gradient_step=600, len=0, n/ep=0, n/st=100, rew=0.00]                                                                                         


Epoch #6: test_reward: 9751.000000 ± 5728.624407, best_reward: 12017.200000 ± 4608.814355 in #3


Epoch #7: 1001it [00:02, 343.01it/s, env_step=7000, gradient_step=700, len=69, n/ep=0, n/st=100, rew=2550.50]                                                                                     


Epoch #7: test_reward: 6266.400000 ± 1932.320015, best_reward: 12017.200000 ± 4608.814355 in #3


Epoch #8: 1001it [00:03, 308.72it/s, env_step=8000, gradient_step=800, len=72, n/ep=0, n/st=100, rew=2690.00]                                                                                     


Epoch #8: test_reward: 9482.300000 ± 2705.666131, best_reward: 12017.200000 ± 4608.814355 in #3


Epoch #9: 1001it [00:03, 308.50it/s, env_step=9000, gradient_step=900, len=90, n/ep=1, n/st=100, rew=4207.00]                                                                                     


Epoch #9: test_reward: 8947.300000 ± 3862.427010, best_reward: 12017.200000 ± 4608.814355 in #3


Epoch #10: 1001it [00:03, 269.17it/s, env_step=10000, gradient_step=1000, len=100, n/ep=2, n/st=100, rew=4071.50]                                                                                 


Epoch #10: test_reward: 8505.800000 ± 5165.588346, best_reward: 12017.200000 ± 4608.814355 in #3


Epoch #11: 1001it [00:02, 336.49it/s, env_step=11000, gradient_step=1100, len=110, n/ep=1, n/st=100, rew=5429.00]                                                                                 


Epoch #11: test_reward: 7225.500000 ± 4182.239956, best_reward: 12017.200000 ± 4608.814355 in #3


Epoch #12: 1001it [00:02, 338.54it/s, env_step=12000, gradient_step=1200, len=116, n/ep=0, n/st=100, rew=6320.50]                                                                                 


Epoch #12: test_reward: 8268.600000 ± 4216.464780, best_reward: 12017.200000 ± 4608.814355 in #3


Epoch #13: 1001it [00:03, 279.32it/s, env_step=13000, gradient_step=1300, len=129, n/ep=0, n/st=100, rew=6336.00]                                                                                 


Epoch #13: test_reward: 7471.200000 ± 3072.513915, best_reward: 12017.200000 ± 4608.814355 in #3


Epoch #14: 1001it [00:03, 285.56it/s, env_step=14000, gradient_step=1400, len=139, n/ep=0, n/st=100, rew=6514.00]                                                                                 


Epoch #14: test_reward: 6931.400000 ± 2707.979365, best_reward: 12017.200000 ± 4608.814355 in #3


Epoch #15: 1001it [00:03, 295.83it/s, env_step=15000, gradient_step=1500, len=145, n/ep=0, n/st=100, rew=7625.00]                                                                                 


Epoch #15: test_reward: 9053.100000 ± 7284.127078, best_reward: 12017.200000 ± 4608.814355 in #3


Epoch #16: 1001it [00:03, 281.92it/s, env_step=16000, gradient_step=1600, len=159, n/ep=0, n/st=100, rew=8078.50]                                                                                 


Epoch #16: test_reward: 10755.000000 ± 3764.670636, best_reward: 12017.200000 ± 4608.814355 in #3


Epoch #17: 1001it [00:03, 299.95it/s, env_step=17000, gradient_step=1700, len=169, n/ep=0, n/st=100, rew=8669.50]                                                                                 


Epoch #17: test_reward: 7026.500000 ± 2229.699318, best_reward: 12017.200000 ± 4608.814355 in #3


Epoch #18: 1001it [00:03, 255.42it/s, env_step=18000, gradient_step=1800, len=77, n/ep=0, n/st=100, rew=3088.00]                                                                                  


Epoch #18: test_reward: 10161.900000 ± 7107.247406, best_reward: 12017.200000 ± 4608.814355 in #3


Epoch #19: 1001it [00:03, 278.83it/s, env_step=19000, gradient_step=1900, len=190, n/ep=1, n/st=100, rew=10459.00]                                                                                


Epoch #19: test_reward: 14006.100000 ± 3738.173737, best_reward: 14006.100000 ± 3738.173737 in #19


Epoch #20: 1001it [00:03, 309.30it/s, env_step=20000, gradient_step=2000, len=116, n/ep=1, n/st=100, rew=6000.00]                                                                                 


Epoch #20: test_reward: 11461.600000 ± 2895.409995, best_reward: 14006.100000 ± 3738.173737 in #19


Epoch #21: 1001it [00:03, 287.08it/s, env_step=21000, gradient_step=2100, len=209, n/ep=0, n/st=100, rew=11862.88]                                                                                


Epoch #21: test_reward: 9277.800000 ± 4592.101867, best_reward: 14006.100000 ± 3738.173737 in #19


Epoch #22: 1001it [00:03, 314.21it/s, env_step=22000, gradient_step=2200, len=219, n/ep=0, n/st=100, rew=12319.00]                                                                                


Epoch #22: test_reward: 14375.800000 ± 4338.485768, best_reward: 14375.800000 ± 4338.485768 in #22


Epoch #23: 1001it [00:03, 331.64it/s, env_step=23000, gradient_step=2300, len=85, n/ep=1, n/st=100, rew=3175.00]                                                                                  


Epoch #23: test_reward: 10182.300000 ± 6285.421482, best_reward: 14375.800000 ± 4338.485768 in #22


Epoch #24: 1001it [00:03, 289.54it/s, env_step=24000, gradient_step=2400, len=235, n/ep=0, n/st=100, rew=13451.00]                                                                                


Epoch #24: test_reward: 9611.900000 ± 3106.157834, best_reward: 14375.800000 ± 4338.485768 in #22


Epoch #25: 1001it [00:03, 296.06it/s, env_step=25000, gradient_step=2500, len=250, n/ep=1, n/st=100, rew=14823.00]                                                                                


Epoch #25: test_reward: 9298.800000 ± 2640.781884, best_reward: 14375.800000 ± 4338.485768 in #22


Epoch #26: 1001it [00:02, 343.80it/s, env_step=26000, gradient_step=2600, len=258, n/ep=0, n/st=100, rew=15146.50]                                                                                


Epoch #26: test_reward: 9353.800000 ± 2642.136022, best_reward: 14375.800000 ± 4338.485768 in #22


Epoch #27: 1001it [00:03, 262.72it/s, env_step=27000, gradient_step=2700, len=140, n/ep=0, n/st=100, rew=7311.83]                                                                                 


Epoch #27: test_reward: 11124.600000 ± 4135.217436, best_reward: 14375.800000 ± 4338.485768 in #22


Epoch #28: 1001it [00:03, 308.38it/s, env_step=28000, gradient_step=2800, len=275, n/ep=0, n/st=100, rew=15262.00]                                                                                


Epoch #28: test_reward: 10344.100000 ± 4643.999234, best_reward: 14375.800000 ± 4338.485768 in #22


Epoch #29: 1001it [00:03, 259.58it/s, env_step=29000, gradient_step=2900, len=290, n/ep=1, n/st=100, rew=18562.00]                                                                                


Epoch #29: test_reward: 10665.800000 ± 2050.289043, best_reward: 14375.800000 ± 4338.485768 in #22


Epoch #30: 1001it [00:03, 274.80it/s, env_step=30000, gradient_step=3000, len=157, n/ep=0, n/st=100, rew=8922.50]                                                                                 


Epoch #30: test_reward: 9063.800000 ± 3091.913382, best_reward: 14375.800000 ± 4338.485768 in #22


Epoch #31: 1001it [00:03, 271.98it/s, env_step=31000, gradient_step=3100, len=53, n/ep=0, n/st=100, rew=1717.50]                                                                                  


Epoch #31: test_reward: 11687.600000 ± 6742.184990, best_reward: 14375.800000 ± 4338.485768 in #22


Epoch #32: 1001it [00:03, 280.77it/s, env_step=32000, gradient_step=3200, len=161, n/ep=1, n/st=100, rew=7533.00]                                                                                 


Epoch #32: test_reward: 8474.200000 ± 4309.818205, best_reward: 14375.800000 ± 4338.485768 in #22


Epoch #33: 1001it [00:03, 309.41it/s, env_step=33000, gradient_step=3300, len=227, n/ep=0, n/st=100, rew=12281.00]                                                                                


Epoch #33: test_reward: 11499.500000 ± 4794.242239, best_reward: 14375.800000 ± 4338.485768 in #22


Epoch #34: 1001it [00:03, 275.03it/s, env_step=34000, gradient_step=3400, len=131, n/ep=1, n/st=100, rew=5844.00]                                                                                 


Epoch #34: test_reward: 11031.200000 ± 3275.178188, best_reward: 14375.800000 ± 4338.485768 in #22


Epoch #35: 1001it [00:03, 291.54it/s, env_step=35000, gradient_step=3500, len=115, n/ep=0, n/st=100, rew=5264.00]                                                                                 


Epoch #35: test_reward: 13921.500000 ± 7542.938821, best_reward: 14375.800000 ± 4338.485768 in #22


Epoch #36: 1001it [00:03, 275.45it/s, env_step=36000, gradient_step=3600, len=37, n/ep=0, n/st=100, rew=1336.00]                                                                                  


Epoch #36: test_reward: 16713.400000 ± 6765.909905, best_reward: 16713.400000 ± 6765.909905 in #36


Epoch #37: 1001it [00:03, 310.67it/s, env_step=37000, gradient_step=3700, len=117, n/ep=0, n/st=100, rew=6156.50]                                                                                 


Epoch #37: test_reward: 9718.300000 ± 3477.752034, best_reward: 16713.400000 ± 6765.909905 in #36


Epoch #38: 1001it [00:03, 284.73it/s, env_step=38000, gradient_step=3800, len=169, n/ep=1, n/st=100, rew=8977.00]                                                                                 


Epoch #38: test_reward: 16798.500000 ± 5953.247118, best_reward: 16798.500000 ± 5953.247118 in #38


Epoch #39: 1001it [00:03, 283.35it/s, env_step=39000, gradient_step=3900, len=234, n/ep=3, n/st=100, rew=12925.00]                                                                                


Epoch #39: test_reward: 13555.400000 ± 4104.742823, best_reward: 16798.500000 ± 5953.247118 in #38


Epoch #40: 1001it [00:03, 316.46it/s, env_step=40000, gradient_step=4000, len=176, n/ep=0, n/st=100, rew=8745.00]                                                                                 


Epoch #40: test_reward: 14272.500000 ± 5558.419618, best_reward: 16798.500000 ± 5953.247118 in #38


Epoch #41: 1001it [00:03, 270.11it/s, env_step=41000, gradient_step=4100, len=169, n/ep=0, n/st=100, rew=8056.50]                                                                                 


Epoch #41: test_reward: 14861.600000 ± 6003.507212, best_reward: 16798.500000 ± 5953.247118 in #38


Epoch #42: 1001it [00:03, 273.82it/s, env_step=42000, gradient_step=4200, len=192, n/ep=0, n/st=100, rew=10197.00]                                                                                


Epoch #42: test_reward: 12668.600000 ± 7169.289521, best_reward: 16798.500000 ± 5953.247118 in #38


Epoch #43: 1001it [00:03, 289.31it/s, env_step=43000, gradient_step=4300, len=173, n/ep=1, n/st=100, rew=8161.00]                                                                                 


Epoch #43: test_reward: 18653.600000 ± 5146.045515, best_reward: 18653.600000 ± 5146.045515 in #43


Epoch #44: 1001it [00:03, 275.33it/s, env_step=44000, gradient_step=4400, len=234, n/ep=0, n/st=100, rew=13568.50]                                                                                


Epoch #44: test_reward: 14553.400000 ± 4181.581045, best_reward: 18653.600000 ± 5146.045515 in #43


Epoch #45: 1001it [00:03, 311.05it/s, env_step=45000, gradient_step=4500, len=282, n/ep=0, n/st=100, rew=16900.50]                                                                                


Epoch #45: test_reward: 16235.100000 ± 5070.942683, best_reward: 18653.600000 ± 5146.045515 in #43


Epoch #46: 1001it [00:03, 273.00it/s, env_step=46000, gradient_step=4600, len=221, n/ep=0, n/st=100, rew=12560.33]                                                                                


Epoch #46: test_reward: 15017.400000 ± 6031.469343, best_reward: 18653.600000 ± 5146.045515 in #43


Epoch #47: 1001it [00:03, 265.20it/s, env_step=47000, gradient_step=4700, len=135, n/ep=0, n/st=100, rew=6788.00]                                                                                 


Epoch #47: test_reward: 15210.400000 ± 8091.937410, best_reward: 18653.600000 ± 5146.045515 in #43


Epoch #48: 1001it [00:03, 315.85it/s, env_step=48000, gradient_step=4800, len=177, n/ep=0, n/st=100, rew=9705.50]                                                                                 


Epoch #48: test_reward: 11672.300000 ± 4247.440925, best_reward: 18653.600000 ± 5146.045515 in #43


Epoch #49: 1001it [00:03, 309.65it/s, env_step=49000, gradient_step=4900, len=137, n/ep=1, n/st=100, rew=7774.00]                                                                                 


Epoch #49: test_reward: 11312.400000 ± 4605.457940, best_reward: 18653.600000 ± 5146.045515 in #43


Epoch #50: 1001it [00:03, 301.97it/s, env_step=50000, gradient_step=5000, len=142, n/ep=1, n/st=100, rew=8032.50]                                                                                 


Epoch #50: test_reward: 10153.900000 ± 4496.545484, best_reward: 18653.600000 ± 5146.045515 in #43


Epoch #51: 1001it [00:03, 267.22it/s, env_step=51000, gradient_step=5100, len=251, n/ep=0, n/st=100, rew=15048.00]                                                                                


Epoch #51: test_reward: 13678.400000 ± 5768.726986, best_reward: 18653.600000 ± 5146.045515 in #43


Epoch #52: 1001it [00:03, 273.92it/s, env_step=52000, gradient_step=5200, len=342, n/ep=0, n/st=100, rew=20966.50]                                                                                


Epoch #52: test_reward: 9938.400000 ± 2669.384618, best_reward: 18653.600000 ± 5146.045515 in #43


Epoch #53: 1001it [00:03, 328.06it/s, env_step=53000, gradient_step=5300, len=139, n/ep=0, n/st=100, rew=7392.00]                                                                                 


Epoch #53: test_reward: 11340.200000 ± 5459.535105, best_reward: 18653.600000 ± 5146.045515 in #43


Epoch #54: 1001it [00:03, 277.84it/s, env_step=54000, gradient_step=5400, len=320, n/ep=0, n/st=100, rew=20711.00]                                                                                


Epoch #54: test_reward: 7751.900000 ± 7939.509663, best_reward: 18653.600000 ± 5146.045515 in #43


Epoch #55: 1001it [00:03, 304.72it/s, env_step=55000, gradient_step=5500, len=221, n/ep=0, n/st=100, rew=13838.00]                                                                                


Epoch #55: test_reward: 11829.300000 ± 5530.197140, best_reward: 18653.600000 ± 5146.045515 in #43


Epoch #56: 1001it [00:03, 274.17it/s, env_step=56000, gradient_step=5600, len=360, n/ep=0, n/st=100, rew=22675.00]                                                                                


Epoch #56: test_reward: 9075.800000 ± 7517.110147, best_reward: 18653.600000 ± 5146.045515 in #43


Epoch #57: 1001it [00:03, 280.01it/s, env_step=57000, gradient_step=5700, len=163, n/ep=0, n/st=100, rew=9063.00]                                                                                 


Epoch #57: test_reward: 15320.000000 ± 4005.632834, best_reward: 18653.600000 ± 5146.045515 in #43


Epoch #58: 1001it [00:03, 285.75it/s, env_step=58000, gradient_step=5800, len=165, n/ep=0, n/st=100, rew=9151.00]                                                                                 


Epoch #58: test_reward: 8191.000000 ± 3356.461023, best_reward: 18653.600000 ± 5146.045515 in #43


Epoch #59: 1001it [00:03, 301.61it/s, env_step=59000, gradient_step=5900, len=229, n/ep=1, n/st=100, rew=12792.50]                                                                                


Epoch #59: test_reward: 16650.300000 ± 3949.315486, best_reward: 18653.600000 ± 5146.045515 in #43


Epoch #60: 1001it [00:03, 269.35it/s, env_step=60000, gradient_step=6000, len=242, n/ep=0, n/st=100, rew=14232.25]                                                                                


Epoch #60: test_reward: 12971.400000 ± 5892.516885, best_reward: 18653.600000 ± 5146.045515 in #43


Epoch #61: 1001it [00:03, 284.62it/s, env_step=61000, gradient_step=6100, len=281, n/ep=1, n/st=100, rew=17369.00]                                                                                


Epoch #61: test_reward: 4822.600000 ± 3869.694360, best_reward: 18653.600000 ± 5146.045515 in #43


Epoch #62: 1001it [00:03, 283.66it/s, env_step=62000, gradient_step=6200, len=229, n/ep=0, n/st=100, rew=13366.50]                                                                                


Epoch #62: test_reward: 8507.000000 ± 5609.677870, best_reward: 18653.600000 ± 5146.045515 in #43


Epoch #63: 1001it [00:03, 317.35it/s, env_step=63000, gradient_step=6300, len=139, n/ep=0, n/st=100, rew=8122.00]                                                                                 


Epoch #63: test_reward: 7710.200000 ± 4684.011247, best_reward: 18653.600000 ± 5146.045515 in #43


Epoch #64: 1001it [00:03, 292.79it/s, env_step=64000, gradient_step=6400, len=201, n/ep=1, n/st=100, rew=11394.50]                                                                                


Epoch #64: test_reward: 17294.200000 ± 5592.283287, best_reward: 18653.600000 ± 5146.045515 in #43


Epoch #65: 1001it [00:03, 283.97it/s, env_step=65000, gradient_step=6500, len=213, n/ep=0, n/st=100, rew=12807.00]                                                                                


Epoch #65: test_reward: 13240.200000 ± 5266.752867, best_reward: 18653.600000 ± 5146.045515 in #43


Epoch #66: 1001it [00:03, 276.12it/s, env_step=66000, gradient_step=6600, len=194, n/ep=2, n/st=100, rew=10116.00]                                                                                


Epoch #66: test_reward: 10782.100000 ± 5627.193340, best_reward: 18653.600000 ± 5146.045515 in #43


Epoch #67: 1001it [00:03, 260.17it/s, env_step=67000, gradient_step=6700, len=166, n/ep=0, n/st=100, rew=9459.00]                                                                                 


Epoch #67: test_reward: 8712.700000 ± 3199.311771, best_reward: 18653.600000 ± 5146.045515 in #43


Epoch #68: 1001it [00:03, 296.15it/s, env_step=68000, gradient_step=6800, len=171, n/ep=0, n/st=100, rew=8815.50]                                                                                 


Epoch #68: test_reward: 14272.400000 ± 4061.513959, best_reward: 18653.600000 ± 5146.045515 in #43


Epoch #69: 1001it [00:03, 271.64it/s, env_step=69000, gradient_step=6900, len=183, n/ep=1, n/st=100, rew=9544.00]                                                                                 


Epoch #69: test_reward: 8971.700000 ± 3889.529690, best_reward: 18653.600000 ± 5146.045515 in #43


Epoch #70: 1001it [00:03, 257.94it/s, env_step=70000, gradient_step=7000, len=154, n/ep=0, n/st=100, rew=8405.00]                                                                                 


Epoch #70: test_reward: 8428.200000 ± 3288.631624, best_reward: 18653.600000 ± 5146.045515 in #43


Epoch #71: 1001it [00:03, 307.98it/s, env_step=71000, gradient_step=7100, len=157, n/ep=0, n/st=100, rew=8537.00]                                                                                 


Epoch #71: test_reward: 12232.100000 ± 3644.044688, best_reward: 18653.600000 ± 5146.045515 in #43


Epoch #72: 1001it [00:03, 257.52it/s, env_step=72000, gradient_step=7200, len=400, n/ep=1, n/st=100, rew=25425.00]                                                                                


Epoch #72: test_reward: 11603.200000 ± 6481.344688, best_reward: 18653.600000 ± 5146.045515 in #43


Epoch #73: 1001it [00:03, 274.19it/s, env_step=73000, gradient_step=7300, len=77, n/ep=1, n/st=100, rew=3427.00]                                                                                  


Epoch #73: test_reward: 13000.400000 ± 5883.190379, best_reward: 18653.600000 ± 5146.045515 in #43


Epoch #74: 1001it [00:03, 280.59it/s, env_step=74000, gradient_step=7400, len=297, n/ep=0, n/st=100, rew=19168.00]                                                                                


Epoch #74: test_reward: 13129.200000 ± 5020.461349, best_reward: 18653.600000 ± 5146.045515 in #43


Epoch #75: 1001it [00:03, 305.69it/s, env_step=75000, gradient_step=7500, len=251, n/ep=1, n/st=100, rew=15796.00]                                                                                


Epoch #75: test_reward: 12187.600000 ± 4992.797617, best_reward: 18653.600000 ± 5146.045515 in #43


Epoch #76: 1001it [00:03, 309.20it/s, env_step=76000, gradient_step=7600, len=169, n/ep=0, n/st=100, rew=8747.00]                                                                                 


Epoch #76: test_reward: 18210.700000 ± 5797.989532, best_reward: 18653.600000 ± 5146.045515 in #43


Epoch #77: 1001it [00:03, 311.83it/s, env_step=77000, gradient_step=7700, len=162, n/ep=0, n/st=100, rew=8704.50]                                                                                 


Epoch #77: test_reward: 12470.500000 ± 4389.232125, best_reward: 18653.600000 ± 5146.045515 in #43


Epoch #78: 1001it [00:02, 336.96it/s, env_step=78000, gradient_step=7800, len=283, n/ep=0, n/st=100, rew=17883.00]                                                                                


Epoch #78: test_reward: 10068.400000 ± 4626.864320, best_reward: 18653.600000 ± 5146.045515 in #43


Epoch #79: 1001it [00:03, 298.19it/s, env_step=79000, gradient_step=7900, len=195, n/ep=0, n/st=100, rew=10499.50]                                                                                


Epoch #79: test_reward: 10919.200000 ± 4371.913055, best_reward: 18653.600000 ± 5146.045515 in #43


Epoch #80: 1001it [00:04, 246.93it/s, env_step=80000, gradient_step=8000, len=194, n/ep=0, n/st=100, rew=11451.75]                                                                                


Epoch #80: test_reward: 14290.100000 ± 6293.563735, best_reward: 18653.600000 ± 5146.045515 in #43


Epoch #81: 1001it [00:03, 272.60it/s, env_step=81000, gradient_step=8100, len=179, n/ep=0, n/st=100, rew=11139.00]                                                                                


Epoch #81: test_reward: 14640.300000 ± 4450.166874, best_reward: 18653.600000 ± 5146.045515 in #43


Epoch #82: 1001it [00:03, 261.33it/s, env_step=82000, gradient_step=8200, len=93, n/ep=1, n/st=100, rew=3961.00]                                                                                  


Epoch #82: test_reward: 12231.200000 ± 3679.827083, best_reward: 18653.600000 ± 5146.045515 in #43


Epoch #83: 1001it [00:03, 309.56it/s, env_step=83000, gradient_step=8300, len=235, n/ep=1, n/st=100, rew=13670.50]                                                                                


Epoch #83: test_reward: 9883.000000 ± 2723.699433, best_reward: 18653.600000 ± 5146.045515 in #43


Epoch #84: 1001it [00:03, 310.07it/s, env_step=84000, gradient_step=8400, len=70, n/ep=0, n/st=100, rew=3507.00]                                                                                  


Epoch #84: test_reward: 11443.900000 ± 3531.577790, best_reward: 18653.600000 ± 5146.045515 in #43


Epoch #85: 1001it [00:03, 285.85it/s, env_step=85000, gradient_step=8500, len=161, n/ep=0, n/st=100, rew=9123.00]                                                                                 


Epoch #85: test_reward: 11620.400000 ± 3334.030810, best_reward: 18653.600000 ± 5146.045515 in #43


Epoch #86: 1001it [00:03, 312.93it/s, env_step=86000, gradient_step=8600, len=161, n/ep=1, n/st=100, rew=9269.50]                                                                                 


Epoch #86: test_reward: 12569.700000 ± 6361.700025, best_reward: 18653.600000 ± 5146.045515 in #43


Epoch #87: 1001it [00:03, 315.29it/s, env_step=87000, gradient_step=8700, len=97, n/ep=0, n/st=100, rew=5096.00]                                                                                  


Epoch #87: test_reward: 11507.000000 ± 5895.631077, best_reward: 18653.600000 ± 5146.045515 in #43


Epoch #88: 1001it [00:03, 301.74it/s, env_step=88000, gradient_step=8800, len=216, n/ep=0, n/st=100, rew=13377.25]                                                                                


Epoch #88: test_reward: 6959.600000 ± 2650.730133, best_reward: 18653.600000 ± 5146.045515 in #43


Epoch #89: 1001it [00:03, 305.10it/s, env_step=89000, gradient_step=8900, len=155, n/ep=0, n/st=100, rew=8962.12]                                                                                 


Epoch #89: test_reward: 10013.200000 ± 4822.930993, best_reward: 18653.600000 ± 5146.045515 in #43


Epoch #90: 1001it [00:03, 284.22it/s, env_step=90000, gradient_step=9000, len=146, n/ep=1, n/st=100, rew=8040.50]                                                                                 


Epoch #90: test_reward: 15228.200000 ± 6450.571057, best_reward: 18653.600000 ± 5146.045515 in #43


Epoch #91: 1001it [00:03, 316.52it/s, env_step=91000, gradient_step=9100, len=206, n/ep=0, n/st=100, rew=13368.00]                                                                                


Epoch #91: test_reward: 12690.700000 ± 3185.193873, best_reward: 18653.600000 ± 5146.045515 in #43


Epoch #92: 1001it [00:03, 287.32it/s, env_step=92000, gradient_step=9200, len=154, n/ep=1, n/st=100, rew=8794.00]                                                                                 


Epoch #92: test_reward: 15707.700000 ± 6665.026152, best_reward: 18653.600000 ± 5146.045515 in #43


Epoch #93: 1001it [00:03, 273.51it/s, env_step=93000, gradient_step=9300, len=135, n/ep=0, n/st=100, rew=7777.00]                                                                                 


Epoch #93: test_reward: 10589.700000 ± 4000.104225, best_reward: 18653.600000 ± 5146.045515 in #43


Epoch #94: 1001it [00:03, 309.47it/s, env_step=94000, gradient_step=9400, len=137, n/ep=1, n/st=100, rew=7774.50]                                                                                 


Epoch #94: test_reward: 8399.600000 ± 4091.574641, best_reward: 18653.600000 ± 5146.045515 in #43


Epoch #95: 1001it [00:03, 316.23it/s, env_step=95000, gradient_step=9500, len=150, n/ep=0, n/st=100, rew=8936.00]                                                                                 


Epoch #95: test_reward: 9166.700000 ± 2949.055647, best_reward: 18653.600000 ± 5146.045515 in #43


Epoch #96: 1001it [00:03, 289.10it/s, env_step=96000, gradient_step=9600, len=131, n/ep=1, n/st=100, rew=7336.00]                                                                                 


Epoch #96: test_reward: 11775.200000 ± 6314.401853, best_reward: 18653.600000 ± 5146.045515 in #43


Epoch #97: 1001it [00:03, 268.17it/s, env_step=97000, gradient_step=9700, len=288, n/ep=1, n/st=100, rew=16074.50]                                                                                


Epoch #97: test_reward: 10112.800000 ± 3786.660080, best_reward: 18653.600000 ± 5146.045515 in #43


Epoch #98: 1001it [00:03, 293.71it/s, env_step=98000, gradient_step=9800, len=195, n/ep=0, n/st=100, rew=11651.00]                                                                                


Epoch #98: test_reward: 11155.000000 ± 5234.795851, best_reward: 18653.600000 ± 5146.045515 in #43


Epoch #99: 1001it [00:02, 363.95it/s, env_step=99000, gradient_step=9900, len=186, n/ep=0, n/st=100, rew=11533.00]                                                                                


Epoch #99: test_reward: 13484.800000 ± 5120.252744, best_reward: 18653.600000 ± 5146.045515 in #43


Epoch #100: 1001it [00:02, 351.75it/s, env_step=100000, gradient_step=10000, len=151, n/ep=0, n/st=100, rew=9081.00]                                                                              


Epoch #100: test_reward: 15339.300000 ± 6418.625819, best_reward: 18653.600000 ± 5146.045515 in #43


Epoch #101: 1001it [00:03, 331.65it/s, env_step=101000, gradient_step=10100, len=127, n/ep=0, n/st=100, rew=6812.00]                                                                              


Epoch #101: test_reward: 10741.500000 ± 2562.447004, best_reward: 18653.600000 ± 5146.045515 in #43


Epoch #102: 1001it [00:02, 348.12it/s, env_step=102000, gradient_step=10200, len=216, n/ep=2, n/st=100, rew=13222.50]                                                                             


Epoch #102: test_reward: 8368.100000 ± 2530.567622, best_reward: 18653.600000 ± 5146.045515 in #43


Epoch #103: 1001it [00:03, 308.48it/s, env_step=103000, gradient_step=10300, len=159, n/ep=0, n/st=100, rew=9467.25]                                                                              


Epoch #103: test_reward: 8619.900000 ± 4112.468807, best_reward: 18653.600000 ± 5146.045515 in #43


Epoch #104: 1001it [00:02, 340.86it/s, env_step=104000, gradient_step=10400, len=139, n/ep=0, n/st=100, rew=7430.50]                                                                              


Epoch #104: test_reward: 10531.600000 ± 3821.746700, best_reward: 18653.600000 ± 5146.045515 in #43


Epoch #105: 1001it [00:02, 350.02it/s, env_step=105000, gradient_step=10500, len=232, n/ep=0, n/st=100, rew=14816.50]                                                                             


Epoch #105: test_reward: 8545.100000 ± 3886.880200, best_reward: 18653.600000 ± 5146.045515 in #43


Epoch #106: 1001it [00:03, 324.82it/s, env_step=106000, gradient_step=10600, len=133, n/ep=0, n/st=100, rew=7155.00]                                                                              


Epoch #106: test_reward: 14028.700000 ± 3619.493005, best_reward: 18653.600000 ± 5146.045515 in #43


Epoch #107: 1001it [00:02, 367.60it/s, env_step=107000, gradient_step=10700, len=157, n/ep=3, n/st=100, rew=8449.17]                                                                              


Epoch #107: test_reward: 14230.800000 ± 7294.296797, best_reward: 18653.600000 ± 5146.045515 in #43


Epoch #108: 1001it [00:02, 367.41it/s, env_step=108000, gradient_step=10800, len=72, n/ep=2, n/st=100, rew=3190.50]                                                                               


Epoch #108: test_reward: 8796.200000 ± 3446.108582, best_reward: 18653.600000 ± 5146.045515 in #43


Epoch #109: 1001it [00:02, 399.02it/s, env_step=109000, gradient_step=10900, len=114, n/ep=0, n/st=100, rew=7059.00]                                                                              


Epoch #109: test_reward: 8998.600000 ± 4864.071714, best_reward: 18653.600000 ± 5146.045515 in #43


Epoch #110: 1001it [00:02, 366.42it/s, env_step=110000, gradient_step=11000, len=229, n/ep=1, n/st=100, rew=14805.00]                                                                             


Epoch #110: test_reward: 8294.800000 ± 3099.735756, best_reward: 18653.600000 ± 5146.045515 in #43


Epoch #111: 1001it [00:02, 351.97it/s, env_step=111000, gradient_step=11100, len=123, n/ep=2, n/st=100, rew=7478.25]                                                                              


Epoch #111: test_reward: 10610.600000 ± 2333.478099, best_reward: 18653.600000 ± 5146.045515 in #43


Epoch #112: 1001it [00:02, 368.99it/s, env_step=112000, gradient_step=11200, len=156, n/ep=0, n/st=100, rew=8905.00]                                                                              


Epoch #112: test_reward: 18856.700000 ± 5695.553530, best_reward: 18856.700000 ± 5695.553530 in #112


Epoch #113: 1001it [00:02, 375.17it/s, env_step=113000, gradient_step=11300, len=161, n/ep=3, n/st=100, rew=9780.67]                                                                              


Epoch #113: test_reward: 14496.400000 ± 6687.055364, best_reward: 18856.700000 ± 5695.553530 in #112


Epoch #114: 1001it [00:02, 355.88it/s, env_step=114000, gradient_step=11400, len=192, n/ep=0, n/st=100, rew=11535.00]                                                                             


Epoch #114: test_reward: 12924.300000 ± 4981.655308, best_reward: 18856.700000 ± 5695.553530 in #112


Epoch #115: 1001it [00:02, 387.60it/s, env_step=115000, gradient_step=11500, len=191, n/ep=0, n/st=100, rew=12044.00]                                                                             


Epoch #115: test_reward: 11228.400000 ± 5963.744767, best_reward: 18856.700000 ± 5695.553530 in #112


Epoch #116: 1001it [00:02, 345.78it/s, env_step=116000, gradient_step=11600, len=315, n/ep=1, n/st=100, rew=15657.00]                                                                             


Epoch #116: test_reward: 10275.800000 ± 2134.584072, best_reward: 18856.700000 ± 5695.553530 in #112


Epoch #117: 1001it [00:02, 356.21it/s, env_step=117000, gradient_step=11700, len=171, n/ep=1, n/st=100, rew=10265.50]                                                                             


Epoch #117: test_reward: 8995.800000 ± 2768.272017, best_reward: 18856.700000 ± 5695.553530 in #112


Epoch #118: 1001it [00:02, 355.77it/s, env_step=118000, gradient_step=11800, len=161, n/ep=0, n/st=100, rew=8994.50]                                                                              


Epoch #118: test_reward: 8782.100000 ± 4541.370001, best_reward: 18856.700000 ± 5695.553530 in #112


Epoch #119: 1001it [00:02, 347.71it/s, env_step=119000, gradient_step=11900, len=193, n/ep=1, n/st=100, rew=10514.50]                                                                             


Epoch #119: test_reward: 10173.900000 ± 3650.699726, best_reward: 18856.700000 ± 5695.553530 in #112


Epoch #120: 1001it [00:03, 332.32it/s, env_step=120000, gradient_step=12000, len=118, n/ep=3, n/st=100, rew=7093.17]                                                                              


Epoch #120: test_reward: 11600.400000 ± 4284.366912, best_reward: 18856.700000 ± 5695.553530 in #112


Epoch #121: 1001it [00:02, 342.18it/s, env_step=121000, gradient_step=12100, len=129, n/ep=2, n/st=100, rew=7743.00]                                                                              


Epoch #121: test_reward: 13787.800000 ± 5410.574347, best_reward: 18856.700000 ± 5695.553530 in #112


Epoch #122: 1001it [00:02, 370.40it/s, env_step=122000, gradient_step=12200, len=165, n/ep=1, n/st=100, rew=9440.00]                                                                              


Epoch #122: test_reward: 10892.700000 ± 2121.329585, best_reward: 18856.700000 ± 5695.553530 in #112


Epoch #123: 1001it [00:02, 371.47it/s, env_step=123000, gradient_step=12300, len=147, n/ep=3, n/st=100, rew=7646.17]                                                                              


Epoch #123: test_reward: 13623.000000 ± 6777.098730, best_reward: 18856.700000 ± 5695.553530 in #112


Epoch #124: 1001it [00:03, 329.18it/s, env_step=124000, gradient_step=12400, len=202, n/ep=2, n/st=100, rew=12276.00]                                                                             


Epoch #124: test_reward: 8912.700000 ± 6542.922223, best_reward: 18856.700000 ± 5695.553530 in #112


Epoch #125: 1001it [00:03, 324.02it/s, env_step=125000, gradient_step=12500, len=93, n/ep=0, n/st=100, rew=4778.00]                                                                               


Epoch #125: test_reward: 9992.700000 ± 3879.917655, best_reward: 18856.700000 ± 5695.553530 in #112


Epoch #126: 1001it [00:03, 328.54it/s, env_step=126000, gradient_step=12600, len=196, n/ep=1, n/st=100, rew=12550.00]                                                                             


Epoch #126: test_reward: 12775.500000 ± 2238.042727, best_reward: 18856.700000 ± 5695.553530 in #112


Epoch #127: 1001it [00:03, 322.75it/s, env_step=127000, gradient_step=12700, len=86, n/ep=0, n/st=100, rew=4321.00]                                                                               


Epoch #127: test_reward: 9490.200000 ± 2508.862125, best_reward: 18856.700000 ± 5695.553530 in #112


Epoch #128: 1001it [00:03, 297.91it/s, env_step=128000, gradient_step=12800, len=172, n/ep=0, n/st=100, rew=10279.00]                                                                             


Epoch #128: test_reward: 12072.800000 ± 3547.651527, best_reward: 18856.700000 ± 5695.553530 in #112


Epoch #129: 1001it [00:03, 317.70it/s, env_step=129000, gradient_step=12900, len=79, n/ep=0, n/st=100, rew=2875.00]                                                                               


Epoch #129: test_reward: 11153.200000 ± 2088.838902, best_reward: 18856.700000 ± 5695.553530 in #112


Epoch #130: 1001it [00:02, 339.59it/s, env_step=130000, gradient_step=13000, len=211, n/ep=0, n/st=100, rew=13856.50]                                                                             


Epoch #130: test_reward: 9287.400000 ± 4550.921142, best_reward: 18856.700000 ± 5695.553530 in #112


Epoch #131: 1001it [00:02, 337.64it/s, env_step=131000, gradient_step=13100, len=150, n/ep=0, n/st=100, rew=8286.00]                                                                              


Epoch #131: test_reward: 8216.900000 ± 2127.341461, best_reward: 18856.700000 ± 5695.553530 in #112


Epoch #132: 1001it [00:02, 374.49it/s, env_step=132000, gradient_step=13200, len=160, n/ep=1, n/st=100, rew=9321.50]                                                                              


Epoch #132: test_reward: 9645.200000 ± 4906.859297, best_reward: 18856.700000 ± 5695.553530 in #112


Epoch #133: 1001it [00:02, 386.96it/s, env_step=133000, gradient_step=13300, len=184, n/ep=0, n/st=100, rew=11139.00]                                                                             


Epoch #133: test_reward: 14992.500000 ± 6448.505211, best_reward: 18856.700000 ± 5695.553530 in #112


Epoch #134: 1001it [00:03, 324.32it/s, env_step=134000, gradient_step=13400, len=129, n/ep=0, n/st=100, rew=7092.00]                                                                              


Epoch #134: test_reward: 9891.600000 ± 4049.725724, best_reward: 18856.700000 ± 5695.553530 in #112


Epoch #135: 1001it [00:03, 315.53it/s, env_step=135000, gradient_step=13500, len=28, n/ep=0, n/st=100, rew=887.00]                                                                                


Epoch #135: test_reward: 9111.800000 ± 3011.469867, best_reward: 18856.700000 ± 5695.553530 in #112


Epoch #136: 1001it [00:03, 314.52it/s, env_step=136000, gradient_step=13600, len=109, n/ep=0, n/st=100, rew=5538.00]                                                                              


Epoch #136: test_reward: 13971.600000 ± 4736.165098, best_reward: 18856.700000 ± 5695.553530 in #112


Epoch #137: 1001it [00:03, 289.77it/s, env_step=137000, gradient_step=13700, len=144, n/ep=0, n/st=100, rew=7955.83]                                                                              


Epoch #137: test_reward: 5284.400000 ± 4019.059870, best_reward: 18856.700000 ± 5695.553530 in #112


Epoch #138: 1001it [00:02, 379.06it/s, env_step=138000, gradient_step=13800, len=213, n/ep=0, n/st=100, rew=11778.00]                                                                             


Epoch #138: test_reward: 10707.000000 ± 3503.913840, best_reward: 18856.700000 ± 5695.553530 in #112


Epoch #139: 1001it [00:03, 325.11it/s, env_step=139000, gradient_step=13900, len=182, n/ep=0, n/st=100, rew=10627.00]                                                                             


Epoch #139: test_reward: 10915.700000 ± 4082.770041, best_reward: 18856.700000 ± 5695.553530 in #112


Epoch #140: 1001it [00:02, 376.82it/s, env_step=140000, gradient_step=14000, len=125, n/ep=1, n/st=100, rew=7495.00]                                                                              


Epoch #140: test_reward: 11995.100000 ± 4618.009191, best_reward: 18856.700000 ± 5695.553530 in #112


Epoch #141: 1001it [00:02, 403.09it/s, env_step=141000, gradient_step=14100, len=137, n/ep=1, n/st=100, rew=7456.00]                                                                              


Epoch #141: test_reward: 7051.000000 ± 4228.967463, best_reward: 18856.700000 ± 5695.553530 in #112


Epoch #142: 1001it [00:02, 337.08it/s, env_step=142000, gradient_step=14200, len=99, n/ep=0, n/st=100, rew=4461.00]                                                                               


Epoch #142: test_reward: 4370.400000 ± 6371.002923, best_reward: 18856.700000 ± 5695.553530 in #112


Epoch #143: 1001it [00:02, 392.87it/s, env_step=143000, gradient_step=14300, len=211, n/ep=0, n/st=100, rew=12992.50]                                                                             


Epoch #143: test_reward: 9827.900000 ± 5170.593862, best_reward: 18856.700000 ± 5695.553530 in #112


Epoch #144: 1001it [00:02, 383.20it/s, env_step=144000, gradient_step=14400, len=175, n/ep=2, n/st=100, rew=10411.25]                                                                             


Epoch #144: test_reward: 14056.600000 ± 6492.791514, best_reward: 18856.700000 ± 5695.553530 in #112


Epoch #145: 1001it [00:03, 324.05it/s, env_step=145000, gradient_step=14500, len=181, n/ep=0, n/st=100, rew=9828.00]                                                                              


Epoch #145: test_reward: 10917.200000 ± 6998.514940, best_reward: 18856.700000 ± 5695.553530 in #112


Epoch #146: 1001it [00:02, 333.97it/s, env_step=146000, gradient_step=14600, len=248, n/ep=0, n/st=100, rew=15320.50]                                                                             


Epoch #146: test_reward: 7930.700000 ± 5001.973871, best_reward: 18856.700000 ± 5695.553530 in #112


Epoch #147: 1001it [00:03, 318.72it/s, env_step=147000, gradient_step=14700, len=152, n/ep=1, n/st=100, rew=8920.00]                                                                              


Epoch #147: test_reward: 11430.600000 ± 6467.765645, best_reward: 18856.700000 ± 5695.553530 in #112


Epoch #148: 1001it [00:03, 326.15it/s, env_step=148000, gradient_step=14800, len=232, n/ep=1, n/st=100, rew=15550.50]                                                                             


Epoch #148: test_reward: 9289.900000 ± 7295.555338, best_reward: 18856.700000 ± 5695.553530 in #112


Epoch #149: 1001it [00:02, 343.26it/s, env_step=149000, gradient_step=14900, len=158, n/ep=0, n/st=100, rew=8952.00]                                                                              


Epoch #149: test_reward: 14259.900000 ± 3112.211093, best_reward: 18856.700000 ± 5695.553530 in #112


Epoch #150: 1001it [00:03, 323.61it/s, env_step=150000, gradient_step=15000, len=126, n/ep=0, n/st=100, rew=7690.00]                                                                              


Epoch #150: test_reward: 10525.000000 ± 5305.023977, best_reward: 18856.700000 ± 5695.553530 in #112

InfoStats(gradient_step=15000, best_reward=18856.7, best_reward_std=5695.553529728257, train_step=150000, train_episode=802, test_step=332253, test_episode=1510, timing=TimingStats(total_time=710.9718344211578, train_time=486.31331610679626, train_time_collect=57.3750364780426, train_time_update=421.8089499473572, test_time=224.65851831436157, update_speed=308.4431271609668))

(the trained policy can be accessed via policy.policies[agents[1]])
Train with agent_opponent #7


Epoch #1: 1001it [00:02, 444.49it/s, env_step=1000, gradient_step=100, len=0, n/ep=0, n/st=100, rew=0.00]                                                                                         


Epoch #1: test_reward: 5255.000000 ± 2164.602365, best_reward: 11822.300000 ± 3402.447709 in #0


Epoch #2: 1001it [00:02, 439.32it/s, env_step=2000, gradient_step=200, len=0, n/ep=0, n/st=100, rew=0.00]                                                                                         


Epoch #2: test_reward: 9225.200000 ± 7578.388111, best_reward: 11822.300000 ± 3402.447709 in #0


Epoch #3: 1001it [00:02, 347.69it/s, env_step=3000, gradient_step=300, len=0, n/ep=0, n/st=100, rew=0.00]                                                                                         


Epoch #3: test_reward: 12382.900000 ± 3181.089199, best_reward: 12382.900000 ± 3181.089199 in #3


Epoch #4: 1001it [00:02, 385.21it/s, env_step=4000, gradient_step=400, len=0, n/ep=0, n/st=100, rew=0.00]                                                                                         


Epoch #4: test_reward: 15543.100000 ± 5643.453968, best_reward: 15543.100000 ± 5643.453968 in #4


Epoch #5: 1001it [00:02, 419.73it/s, env_step=5000, gradient_step=500, len=0, n/ep=0, n/st=100, rew=0.00]                                                                                         


Epoch #5: test_reward: 15134.800000 ± 4079.458415, best_reward: 15543.100000 ± 5643.453968 in #4


Epoch #6: 1001it [00:02, 395.31it/s, env_step=6000, gradient_step=600, len=0, n/ep=0, n/st=100, rew=0.00]                                                                                         


Epoch #6: test_reward: 12574.000000 ± 3913.090339, best_reward: 15543.100000 ± 5643.453968 in #4


Epoch #7: 1001it [00:02, 434.05it/s, env_step=7000, gradient_step=700, len=66, n/ep=0, n/st=100, rew=1792.00]                                                                                     


Epoch #7: test_reward: 12098.800000 ± 5839.892941, best_reward: 15543.100000 ± 5643.453968 in #4


Epoch #8: 1001it [00:02, 462.48it/s, env_step=8000, gradient_step=800, len=80, n/ep=1, n/st=100, rew=2253.00]                                                                                     


Epoch #8: test_reward: 11451.800000 ± 2897.472133, best_reward: 15543.100000 ± 5643.453968 in #4


Epoch #9: 1001it [00:02, 430.68it/s, env_step=9000, gradient_step=900, len=88, n/ep=0, n/st=100, rew=2543.00]                                                                                     


Epoch #9: test_reward: 10959.000000 ± 4430.413976, best_reward: 15543.100000 ± 5643.453968 in #4


Epoch #10: 1001it [00:02, 403.79it/s, env_step=10000, gradient_step=1000, len=92, n/ep=0, n/st=100, rew=2136.00]                                                                                  


Epoch #10: test_reward: 15582.300000 ± 4153.960473, best_reward: 15582.300000 ± 4153.960473 in #10


Epoch #11: 1001it [00:02, 367.83it/s, env_step=11000, gradient_step=1100, len=92, n/ep=0, n/st=100, rew=2136.00]                                                                                  


Epoch #11: test_reward: 7899.400000 ± 3255.005505, best_reward: 15582.300000 ± 4153.960473 in #10


Epoch #12: 1001it [00:03, 321.62it/s, env_step=12000, gradient_step=1200, len=120, n/ep=1, n/st=100, rew=5389.00]                                                                                 


Epoch #12: test_reward: 12122.600000 ± 6897.623768, best_reward: 15582.300000 ± 4153.960473 in #10


Epoch #13: 1001it [00:03, 330.06it/s, env_step=13000, gradient_step=1300, len=128, n/ep=0, n/st=100, rew=6304.00]                                                                                 


Epoch #13: test_reward: 9684.700000 ± 3054.577354, best_reward: 15582.300000 ± 4153.960473 in #10


Epoch #14: 1001it [00:03, 331.14it/s, env_step=14000, gradient_step=1400, len=140, n/ep=1, n/st=100, rew=5211.00]                                                                                 


Epoch #14: test_reward: 9273.900000 ± 2798.861499, best_reward: 15582.300000 ± 4153.960473 in #10


Epoch #15: 1001it [00:02, 354.44it/s, env_step=15000, gradient_step=1500, len=150, n/ep=1, n/st=100, rew=5594.00]                                                                                 


Epoch #15: test_reward: 11679.600000 ± 4489.504298, best_reward: 15582.300000 ± 4153.960473 in #10


Epoch #16: 1001it [00:02, 359.80it/s, env_step=16000, gradient_step=1600, len=160, n/ep=1, n/st=100, rew=6426.00]                                                                                 


Epoch #16: test_reward: 13567.700000 ± 5064.984463, best_reward: 15582.300000 ± 4153.960473 in #10


Epoch #17: 1001it [00:02, 360.30it/s, env_step=17000, gradient_step=1700, len=170, n/ep=1, n/st=100, rew=7089.00]                                                                                 


Epoch #17: test_reward: 9040.400000 ± 4946.878596, best_reward: 15582.300000 ± 4153.960473 in #10


Epoch #18: 1001it [00:02, 373.91it/s, env_step=18000, gradient_step=1800, len=180, n/ep=2, n/st=100, rew=8415.50]                                                                                 


Epoch #18: test_reward: 11353.600000 ± 2877.949902, best_reward: 15582.300000 ± 4153.960473 in #10


Epoch #19: 1001it [00:03, 310.17it/s, env_step=19000, gradient_step=1900, len=188, n/ep=0, n/st=100, rew=8571.00]                                                                                 


Epoch #19: test_reward: 11855.100000 ± 2488.782170, best_reward: 15582.300000 ± 4153.960473 in #10


Epoch #20: 1001it [00:02, 391.01it/s, env_step=20000, gradient_step=2000, len=110, n/ep=0, n/st=100, rew=3842.50]                                                                                 


Epoch #20: test_reward: 13623.400000 ± 4291.450645, best_reward: 15582.300000 ± 4153.960473 in #10


Epoch #21: 1001it [00:02, 371.22it/s, env_step=21000, gradient_step=2100, len=208, n/ep=0, n/st=100, rew=10527.00]                                                                                


Epoch #21: test_reward: 9644.500000 ± 4769.640998, best_reward: 15582.300000 ± 4153.960473 in #10


Epoch #22: 1001it [00:02, 408.76it/s, env_step=22000, gradient_step=2200, len=215, n/ep=0, n/st=100, rew=10354.00]                                                                                


Epoch #22: test_reward: 9608.900000 ± 4309.537155, best_reward: 15582.300000 ± 4153.960473 in #10


Epoch #23: 1001it [00:02, 342.02it/s, env_step=23000, gradient_step=2300, len=228, n/ep=0, n/st=100, rew=11279.00]                                                                                


Epoch #23: test_reward: 7372.300000 ± 2331.310792, best_reward: 15582.300000 ± 4153.960473 in #10


Epoch #24: 1001it [00:02, 393.95it/s, env_step=24000, gradient_step=2400, len=234, n/ep=0, n/st=100, rew=11905.00]                                                                                


Epoch #24: test_reward: 13561.600000 ± 5306.288858, best_reward: 15582.300000 ± 4153.960473 in #10


Epoch #25: 1001it [00:02, 376.26it/s, env_step=25000, gradient_step=2500, len=250, n/ep=1, n/st=100, rew=12860.00]                                                                                


Epoch #25: test_reward: 9299.000000 ± 3773.731946, best_reward: 15582.300000 ± 4153.960473 in #10


Epoch #26: 1001it [00:03, 332.97it/s, env_step=26000, gradient_step=2600, len=68, n/ep=1, n/st=100, rew=3059.00]                                                                                  


Epoch #26: test_reward: 11633.600000 ± 4767.166836, best_reward: 15582.300000 ± 4153.960473 in #10


Epoch #27: 1001it [00:02, 370.69it/s, env_step=27000, gradient_step=2700, len=211, n/ep=2, n/st=100, rew=10306.50]                                                                                


Epoch #27: test_reward: 9232.600000 ± 2982.774990, best_reward: 15582.300000 ± 4153.960473 in #10


Epoch #28: 1001it [00:03, 323.29it/s, env_step=28000, gradient_step=2800, len=280, n/ep=1, n/st=100, rew=13824.00]                                                                                


Epoch #28: test_reward: 12085.800000 ± 4657.878354, best_reward: 15582.300000 ± 4153.960473 in #10


Epoch #29: 1001it [00:02, 355.66it/s, env_step=29000, gradient_step=2900, len=289, n/ep=0, n/st=100, rew=16513.50]                                                                                


Epoch #29: test_reward: 13798.600000 ± 3165.873788, best_reward: 15582.300000 ± 4153.960473 in #10


Epoch #30: 1001it [00:02, 341.57it/s, env_step=30000, gradient_step=3000, len=291, n/ep=0, n/st=100, rew=15497.50]                                                                                


Epoch #30: test_reward: 11629.100000 ± 5444.814018, best_reward: 15582.300000 ± 4153.960473 in #10


Epoch #31: 1001it [00:02, 419.31it/s, env_step=31000, gradient_step=3100, len=310, n/ep=1, n/st=100, rew=16946.00]                                                                                


Epoch #31: test_reward: 12405.700000 ± 5689.646580, best_reward: 15582.300000 ± 4153.960473 in #10


Epoch #32: 1001it [00:02, 376.21it/s, env_step=32000, gradient_step=3200, len=196, n/ep=1, n/st=100, rew=9624.00]                                                                                 


Epoch #32: test_reward: 16421.600000 ± 5590.793704, best_reward: 16421.600000 ± 5590.793704 in #32


Epoch #33: 1001it [00:02, 373.74it/s, env_step=33000, gradient_step=3300, len=145, n/ep=2, n/st=100, rew=7511.25]                                                                                 


Epoch #33: test_reward: 12290.200000 ± 3165.491109, best_reward: 16421.600000 ± 5590.793704 in #32


Epoch #34: 1001it [00:02, 397.37it/s, env_step=34000, gradient_step=3400, len=145, n/ep=0, n/st=100, rew=7511.25]                                                                                 


Epoch #34: test_reward: 12273.000000 ± 3871.317424, best_reward: 16421.600000 ± 5590.793704 in #32


Epoch #35: 1001it [00:02, 340.10it/s, env_step=35000, gradient_step=3500, len=349, n/ep=0, n/st=100, rew=20853.25]                                                                                


Epoch #35: test_reward: 15817.400000 ± 3662.199345, best_reward: 16421.600000 ± 5590.793704 in #32


Epoch #36: 1001it [00:02, 367.20it/s, env_step=36000, gradient_step=3600, len=355, n/ep=0, n/st=100, rew=19728.00]                                                                                


Epoch #36: test_reward: 16578.900000 ± 3831.478421, best_reward: 16578.900000 ± 3831.478421 in #36


Epoch #37: 1001it [00:02, 362.09it/s, env_step=37000, gradient_step=3700, len=241, n/ep=0, n/st=100, rew=13243.33]                                                                                


Epoch #37: test_reward: 12733.000000 ± 5413.237497, best_reward: 16578.900000 ± 3831.478421 in #36


Epoch #38: 1001it [00:02, 352.84it/s, env_step=38000, gradient_step=3800, len=341, n/ep=2, n/st=100, rew=19204.50]                                                                                


Epoch #38: test_reward: 11039.500000 ± 5655.474397, best_reward: 16578.900000 ± 3831.478421 in #36


Epoch #39: 1001it [00:02, 417.03it/s, env_step=39000, gradient_step=3900, len=196, n/ep=0, n/st=100, rew=10966.25]                                                                                


Epoch #39: test_reward: 9912.800000 ± 4070.482252, best_reward: 16578.900000 ± 3831.478421 in #36


Epoch #40: 1001it [00:02, 364.15it/s, env_step=40000, gradient_step=4000, len=400, n/ep=2, n/st=100, rew=23318.00]                                                                                


Epoch #40: test_reward: 14724.600000 ± 4699.543195, best_reward: 16578.900000 ± 3831.478421 in #36


Epoch #41: 1001it [00:02, 337.97it/s, env_step=41000, gradient_step=4100, len=269, n/ep=0, n/st=100, rew=14716.00]                                                                                


Epoch #41: test_reward: 12054.200000 ± 4308.763321, best_reward: 16578.900000 ± 3831.478421 in #36


Epoch #42: 1001it [00:02, 360.05it/s, env_step=42000, gradient_step=4200, len=210, n/ep=0, n/st=100, rew=11316.00]                                                                                


Epoch #42: test_reward: 12679.100000 ± 6846.407108, best_reward: 16578.900000 ± 3831.478421 in #36


Epoch #43: 1001it [00:02, 425.06it/s, env_step=43000, gradient_step=4300, len=174, n/ep=0, n/st=100, rew=9156.00]                                                                                 


Epoch #43: test_reward: 14397.800000 ± 4674.126502, best_reward: 16578.900000 ± 3831.478421 in #36


Epoch #44: 1001it [00:02, 342.16it/s, env_step=44000, gradient_step=4400, len=177, n/ep=0, n/st=100, rew=9617.00]                                                                                 


Epoch #44: test_reward: 16372.900000 ± 4948.488748, best_reward: 16578.900000 ± 3831.478421 in #36


Epoch #45: 1001it [00:02, 395.34it/s, env_step=45000, gradient_step=4500, len=226, n/ep=0, n/st=100, rew=12979.00]                                                                                


Epoch #45: test_reward: 10934.000000 ± 6095.921095, best_reward: 16578.900000 ± 3831.478421 in #36


Epoch #46: 1001it [00:02, 412.28it/s, env_step=46000, gradient_step=4600, len=204, n/ep=1, n/st=100, rew=11308.00]                                                                                


Epoch #46: test_reward: 14882.100000 ± 4714.343251, best_reward: 16578.900000 ± 3831.478421 in #36


Epoch #47: 1001it [00:02, 408.34it/s, env_step=47000, gradient_step=4700, len=96, n/ep=0, n/st=100, rew=4309.00]                                                                                  


Epoch #47: test_reward: 16512.600000 ± 7165.834344, best_reward: 16578.900000 ± 3831.478421 in #36


Epoch #48: 1001it [00:02, 389.68it/s, env_step=48000, gradient_step=4800, len=155, n/ep=0, n/st=100, rew=6895.50]                                                                                 


Epoch #48: test_reward: 11425.700000 ± 3288.603535, best_reward: 16578.900000 ± 3831.478421 in #36


Epoch #49: 1001it [00:02, 351.55it/s, env_step=49000, gradient_step=4900, len=225, n/ep=0, n/st=100, rew=12770.00]                                                                                


Epoch #49: test_reward: 13010.000000 ± 6660.809080, best_reward: 16578.900000 ± 3831.478421 in #36


Epoch #50: 1001it [00:03, 321.87it/s, env_step=50000, gradient_step=5000, len=284, n/ep=2, n/st=100, rew=16697.00]                                                                                


Epoch #50: test_reward: 11465.900000 ± 4154.106220, best_reward: 16578.900000 ± 3831.478421 in #36


Epoch #51: 1001it [00:02, 391.14it/s, env_step=51000, gradient_step=5100, len=135, n/ep=2, n/st=100, rew=6621.00]                                                                                 


Epoch #51: test_reward: 10017.200000 ± 4182.890431, best_reward: 16578.900000 ± 3831.478421 in #36


Epoch #52: 1001it [00:02, 415.76it/s, env_step=52000, gradient_step=5200, len=162, n/ep=2, n/st=100, rew=9376.50]                                                                                 


Epoch #52: test_reward: 9591.800000 ± 6239.706817, best_reward: 16578.900000 ± 3831.478421 in #36


Epoch #53: 1001it [00:02, 393.28it/s, env_step=53000, gradient_step=5300, len=274, n/ep=0, n/st=100, rew=16496.00]                                                                                


Epoch #53: test_reward: 15338.100000 ± 5657.587268, best_reward: 16578.900000 ± 3831.478421 in #36


Epoch #54: 1001it [00:02, 371.49it/s, env_step=54000, gradient_step=5400, len=139, n/ep=0, n/st=100, rew=7353.50]                                                                                 


Epoch #54: test_reward: 12263.400000 ± 5950.629970, best_reward: 16578.900000 ± 3831.478421 in #36


Epoch #55: 1001it [00:02, 346.33it/s, env_step=55000, gradient_step=5500, len=126, n/ep=0, n/st=100, rew=5401.00]                                                                                 


Epoch #55: test_reward: 9780.400000 ± 5798.534008, best_reward: 16578.900000 ± 3831.478421 in #36


Epoch #56: 1001it [00:02, 368.20it/s, env_step=56000, gradient_step=5600, len=191, n/ep=2, n/st=100, rew=11402.50]                                                                                


Epoch #56: test_reward: 14169.200000 ± 7381.169403, best_reward: 16578.900000 ± 3831.478421 in #36


Epoch #57: 1001it [00:02, 339.81it/s, env_step=57000, gradient_step=5700, len=257, n/ep=1, n/st=100, rew=13640.00]                                                                                


Epoch #57: test_reward: 12913.900000 ± 3694.315267, best_reward: 16578.900000 ± 3831.478421 in #36


Epoch #58: 1001it [00:02, 339.83it/s, env_step=58000, gradient_step=5800, len=260, n/ep=3, n/st=100, rew=15175.33]                                                                                


Epoch #58: test_reward: 11226.400000 ± 4231.433072, best_reward: 16578.900000 ± 3831.478421 in #36


Epoch #59: 1001it [00:03, 315.27it/s, env_step=59000, gradient_step=5900, len=153, n/ep=1, n/st=100, rew=7952.50]                                                                                 


Epoch #59: test_reward: 16440.400000 ± 5727.968649, best_reward: 16578.900000 ± 3831.478421 in #36


Epoch #60: 1001it [00:02, 391.36it/s, env_step=60000, gradient_step=6000, len=228, n/ep=1, n/st=100, rew=13353.00]                                                                                


Epoch #60: test_reward: 11932.000000 ± 3409.464298, best_reward: 16578.900000 ± 3831.478421 in #36


Epoch #61: 1001it [00:02, 421.42it/s, env_step=61000, gradient_step=6100, len=52, n/ep=1, n/st=100, rew=2292.00]                                                                                  


Epoch #61: test_reward: 11084.400000 ± 4039.296850, best_reward: 16578.900000 ± 3831.478421 in #36


Epoch #62: 1001it [00:02, 420.01it/s, env_step=62000, gradient_step=6200, len=89, n/ep=0, n/st=100, rew=4358.50]                                                                                  


Epoch #62: test_reward: 9699.200000 ± 3535.054082, best_reward: 16578.900000 ± 3831.478421 in #36


Epoch #63: 1001it [00:02, 369.69it/s, env_step=63000, gradient_step=6300, len=172, n/ep=0, n/st=100, rew=9289.00]                                                                                 


Epoch #63: test_reward: 11387.700000 ± 3618.041792, best_reward: 16578.900000 ± 3831.478421 in #36


Epoch #64: 1001it [00:02, 354.76it/s, env_step=64000, gradient_step=6400, len=136, n/ep=0, n/st=100, rew=6972.00]                                                                                 


Epoch #64: test_reward: 10465.000000 ± 4654.764527, best_reward: 16578.900000 ± 3831.478421 in #36


Epoch #65: 1001it [00:02, 406.87it/s, env_step=65000, gradient_step=6500, len=168, n/ep=1, n/st=100, rew=9173.00]                                                                                 


Epoch #65: test_reward: 12462.600000 ± 4464.103296, best_reward: 16578.900000 ± 3831.478421 in #36


Epoch #66: 1001it [00:02, 416.84it/s, env_step=66000, gradient_step=6600, len=192, n/ep=0, n/st=100, rew=10090.50]                                                                                


Epoch #66: test_reward: 16382.600000 ± 4129.043114, best_reward: 16578.900000 ± 3831.478421 in #36


Epoch #67: 1001it [00:02, 404.46it/s, env_step=67000, gradient_step=6700, len=136, n/ep=0, n/st=100, rew=7237.50]                                                                                 


Epoch #67: test_reward: 11695.400000 ± 4077.938209, best_reward: 16578.900000 ± 3831.478421 in #36


Epoch #68: 1001it [00:03, 329.11it/s, env_step=68000, gradient_step=6800, len=139, n/ep=1, n/st=100, rew=7596.50]                                                                                 


Epoch #68: test_reward: 11897.200000 ± 3506.235269, best_reward: 16578.900000 ± 3831.478421 in #36


Epoch #69: 1001it [00:02, 405.90it/s, env_step=69000, gradient_step=6900, len=238, n/ep=0, n/st=100, rew=14401.50]                                                                                


Epoch #69: test_reward: 9311.600000 ± 2671.922312, best_reward: 16578.900000 ± 3831.478421 in #36


Epoch #70: 1001it [00:02, 374.33it/s, env_step=70000, gradient_step=7000, len=58, n/ep=1, n/st=100, rew=2249.00]                                                                                  


Epoch #70: test_reward: 10107.700000 ± 3622.470043, best_reward: 16578.900000 ± 3831.478421 in #36


Epoch #71: 1001it [00:02, 344.50it/s, env_step=71000, gradient_step=7100, len=194, n/ep=0, n/st=100, rew=10558.25]                                                                                


Epoch #71: test_reward: 14073.400000 ± 4525.331992, best_reward: 16578.900000 ± 3831.478421 in #36


Epoch #72: 1001it [00:03, 310.91it/s, env_step=72000, gradient_step=7200, len=400, n/ep=1, n/st=100, rew=24712.00]                                                                                


Epoch #72: test_reward: 11877.200000 ± 4979.033758, best_reward: 16578.900000 ± 3831.478421 in #36


Epoch #73: 1001it [00:02, 408.71it/s, env_step=73000, gradient_step=7300, len=230, n/ep=1, n/st=100, rew=13753.00]                                                                                


Epoch #73: test_reward: 8959.400000 ± 4121.097650, best_reward: 16578.900000 ± 3831.478421 in #36


Epoch #74: 1001it [00:02, 354.01it/s, env_step=74000, gradient_step=7400, len=359, n/ep=0, n/st=100, rew=23641.00]                                                                                


Epoch #74: test_reward: 16054.500000 ± 2807.632642, best_reward: 16578.900000 ± 3831.478421 in #36


Epoch #75: 1001it [00:03, 317.07it/s, env_step=75000, gradient_step=7500, len=123, n/ep=0, n/st=100, rew=5518.17]                                                                                 


Epoch #75: test_reward: 10615.500000 ± 3675.402053, best_reward: 16578.900000 ± 3831.478421 in #36


Epoch #76: 1001it [00:02, 372.77it/s, env_step=76000, gradient_step=7600, len=72, n/ep=1, n/st=100, rew=3619.00]                                                                                  


Epoch #76: test_reward: 11487.400000 ± 3593.898696, best_reward: 16578.900000 ± 3831.478421 in #36


Epoch #77: 1001it [00:02, 384.47it/s, env_step=77000, gradient_step=7700, len=203, n/ep=0, n/st=100, rew=12066.50]                                                                                


Epoch #77: test_reward: 10679.900000 ± 3647.508724, best_reward: 16578.900000 ± 3831.478421 in #36


Epoch #78: 1001it [00:02, 420.73it/s, env_step=78000, gradient_step=7800, len=293, n/ep=2, n/st=100, rew=17465.50]                                                                                


Epoch #78: test_reward: 12347.200000 ± 3633.254761, best_reward: 16578.900000 ± 3831.478421 in #36


Epoch #79: 1001it [00:02, 361.60it/s, env_step=79000, gradient_step=7900, len=198, n/ep=1, n/st=100, rew=10970.00]                                                                                


Epoch #79: test_reward: 11098.900000 ± 4716.412481, best_reward: 16578.900000 ± 3831.478421 in #36


Epoch #80: 1001it [00:02, 353.00it/s, env_step=80000, gradient_step=8000, len=173, n/ep=1, n/st=100, rew=9204.00]                                                                                 


Epoch #80: test_reward: 12710.200000 ± 4434.393438, best_reward: 16578.900000 ± 3831.478421 in #36


Epoch #81: 1001it [00:02, 369.80it/s, env_step=81000, gradient_step=8100, len=110, n/ep=2, n/st=100, rew=5897.50]                                                                                 


Epoch #81: test_reward: 11137.700000 ± 5087.159700, best_reward: 16578.900000 ± 3831.478421 in #36


Epoch #82: 1001it [00:02, 402.67it/s, env_step=82000, gradient_step=8200, len=191, n/ep=0, n/st=100, rew=11207.00]                                                                                


Epoch #82: test_reward: 12166.700000 ± 6247.000273, best_reward: 16578.900000 ± 3831.478421 in #36


Epoch #83: 1001it [00:02, 334.93it/s, env_step=83000, gradient_step=8300, len=168, n/ep=0, n/st=100, rew=9610.00]                                                                                 


Epoch #83: test_reward: 12092.400000 ± 2425.966125, best_reward: 16578.900000 ± 3831.478421 in #36


Epoch #84: 1001it [00:02, 377.24it/s, env_step=84000, gradient_step=8400, len=138, n/ep=1, n/st=100, rew=6838.00]                                                                                 


Epoch #84: test_reward: 13874.200000 ± 3973.333658, best_reward: 16578.900000 ± 3831.478421 in #36


Epoch #85: 1001it [00:02, 372.63it/s, env_step=85000, gradient_step=8500, len=212, n/ep=1, n/st=100, rew=12365.00]                                                                                


Epoch #85: test_reward: 11210.300000 ± 3709.183523, best_reward: 16578.900000 ± 3831.478421 in #36


Epoch #86: 1001it [00:02, 350.20it/s, env_step=86000, gradient_step=8600, len=193, n/ep=0, n/st=100, rew=9375.50]                                                                                 


Epoch #86: test_reward: 13919.100000 ± 6087.446467, best_reward: 16578.900000 ± 3831.478421 in #36


Epoch #87: 1001it [00:02, 337.53it/s, env_step=87000, gradient_step=8700, len=181, n/ep=0, n/st=100, rew=10875.00]                                                                                


Epoch #87: test_reward: 12031.000000 ± 5815.771436, best_reward: 16578.900000 ± 3831.478421 in #36


Epoch #88: 1001it [00:03, 326.53it/s, env_step=88000, gradient_step=8800, len=139, n/ep=0, n/st=100, rew=6752.33]                                                                                 


Epoch #88: test_reward: 13093.800000 ± 5420.090992, best_reward: 16578.900000 ± 3831.478421 in #36


Epoch #89: 1001it [00:02, 397.85it/s, env_step=89000, gradient_step=8900, len=255, n/ep=0, n/st=100, rew=15553.50]                                                                                


Epoch #89: test_reward: 9413.400000 ± 2896.738172, best_reward: 16578.900000 ± 3831.478421 in #36


Epoch #90: 1001it [00:02, 405.34it/s, env_step=90000, gradient_step=9000, len=155, n/ep=0, n/st=100, rew=8060.83]                                                                                 


Epoch #90: test_reward: 10306.700000 ± 4933.456112, best_reward: 16578.900000 ± 3831.478421 in #36


Epoch #91: 1001it [00:02, 418.13it/s, env_step=91000, gradient_step=9100, len=134, n/ep=1, n/st=100, rew=6058.00]                                                                                 


Epoch #91: test_reward: 10263.000000 ± 4951.224172, best_reward: 16578.900000 ± 3831.478421 in #36


Epoch #92: 1001it [00:02, 422.61it/s, env_step=92000, gradient_step=9200, len=109, n/ep=0, n/st=100, rew=5472.00]                                                                                 


Epoch #92: test_reward: 11754.500000 ± 6579.573318, best_reward: 16578.900000 ± 3831.478421 in #36


Epoch #93: 1001it [00:03, 325.51it/s, env_step=93000, gradient_step=9300, len=168, n/ep=0, n/st=100, rew=9867.00]                                                                                 


Epoch #93: test_reward: 9635.700000 ± 5484.716438, best_reward: 16578.900000 ± 3831.478421 in #36


Epoch #94: 1001it [00:02, 385.86it/s, env_step=94000, gradient_step=9400, len=194, n/ep=0, n/st=100, rew=10439.00]                                                                                


Epoch #94: test_reward: 9535.200000 ± 3795.586195, best_reward: 16578.900000 ± 3831.478421 in #36


Epoch #95: 1001it [00:02, 412.34it/s, env_step=95000, gradient_step=9500, len=152, n/ep=2, n/st=100, rew=8473.25]                                                                                 


Epoch #95: test_reward: 9983.200000 ± 3527.357532, best_reward: 16578.900000 ± 3831.478421 in #36


Epoch #96: 1001it [00:02, 346.17it/s, env_step=96000, gradient_step=9600, len=118, n/ep=1, n/st=100, rew=5322.00]                                                                                 


Epoch #96: test_reward: 8137.300000 ± 5889.628512, best_reward: 16578.900000 ± 3831.478421 in #36


Epoch #97: 1001it [00:02, 347.67it/s, env_step=97000, gradient_step=9700, len=188, n/ep=0, n/st=100, rew=9875.00]                                                                                 


Epoch #97: test_reward: 13828.900000 ± 5253.576314, best_reward: 16578.900000 ± 3831.478421 in #36


Epoch #98: 1001it [00:03, 328.71it/s, env_step=98000, gradient_step=9800, len=157, n/ep=0, n/st=100, rew=8664.00]                                                                                 


Epoch #98: test_reward: 8786.900000 ± 5271.075231, best_reward: 16578.900000 ± 3831.478421 in #36


Epoch #99: 1001it [00:02, 397.18it/s, env_step=99000, gradient_step=9900, len=75, n/ep=0, n/st=100, rew=3163.00]                                                                                  


Epoch #99: test_reward: 14687.200000 ± 5053.879971, best_reward: 16578.900000 ± 3831.478421 in #36


Epoch #100: 1001it [00:03, 320.00it/s, env_step=100000, gradient_step=10000, len=253, n/ep=0, n/st=100, rew=15679.00]                                                                             


Epoch #100: test_reward: 10389.700000 ± 6016.469963, best_reward: 16578.900000 ± 3831.478421 in #36


Epoch #101: 1001it [00:02, 419.54it/s, env_step=101000, gradient_step=10100, len=168, n/ep=1, n/st=100, rew=9230.50]                                                                              


Epoch #101: test_reward: 13783.800000 ± 4185.302350, best_reward: 16578.900000 ± 3831.478421 in #36


Epoch #102: 1001it [00:02, 377.67it/s, env_step=102000, gradient_step=10200, len=164, n/ep=2, n/st=100, rew=9023.25]                                                                              


Epoch #102: test_reward: 12787.000000 ± 4663.828835, best_reward: 16578.900000 ± 3831.478421 in #36


Epoch #103: 1001it [00:03, 320.92it/s, env_step=103000, gradient_step=10300, len=147, n/ep=0, n/st=100, rew=7382.00]                                                                              


Epoch #103: test_reward: 7753.400000 ± 2265.845546, best_reward: 16578.900000 ± 3831.478421 in #36


Epoch #104: 1001it [00:02, 347.23it/s, env_step=104000, gradient_step=10400, len=48, n/ep=0, n/st=100, rew=1755.00]                                                                               


Epoch #104: test_reward: 12517.500000 ± 5465.651622, best_reward: 16578.900000 ± 3831.478421 in #36


Epoch #105: 1001it [00:02, 338.52it/s, env_step=105000, gradient_step=10500, len=202, n/ep=1, n/st=100, rew=10446.00]                                                                             


Epoch #105: test_reward: 9931.600000 ± 6361.531218, best_reward: 16578.900000 ± 3831.478421 in #36


Epoch #106: 1001it [00:02, 359.43it/s, env_step=106000, gradient_step=10600, len=183, n/ep=0, n/st=100, rew=9593.00]                                                                              


Epoch #106: test_reward: 11682.200000 ± 3577.222884, best_reward: 16578.900000 ± 3831.478421 in #36


Epoch #107: 1001it [00:02, 381.24it/s, env_step=107000, gradient_step=10700, len=90, n/ep=0, n/st=100, rew=4457.00]                                                                               


Epoch #107: test_reward: 9021.600000 ± 3827.899847, best_reward: 16578.900000 ± 3831.478421 in #36


Epoch #108: 1001it [00:02, 343.01it/s, env_step=108000, gradient_step=10800, len=188, n/ep=0, n/st=100, rew=10976.00]                                                                             


Epoch #108: test_reward: 7700.300000 ± 3794.441304, best_reward: 16578.900000 ± 3831.478421 in #36


Epoch #109: 1001it [00:03, 320.27it/s, env_step=109000, gradient_step=10900, len=193, n/ep=0, n/st=100, rew=10797.00]                                                                             


Epoch #109: test_reward: 9736.700000 ± 3396.374392, best_reward: 16578.900000 ± 3831.478421 in #36


Epoch #110: 1001it [00:02, 397.27it/s, env_step=110000, gradient_step=11000, len=400, n/ep=1, n/st=100, rew=25550.00]                                                                             


Epoch #110: test_reward: 10094.800000 ± 3432.692611, best_reward: 16578.900000 ± 3831.478421 in #36


Epoch #111: 1001it [00:02, 341.82it/s, env_step=111000, gradient_step=11100, len=204, n/ep=1, n/st=100, rew=12471.00]                                                                             


Epoch #111: test_reward: 9040.200000 ± 3516.666882, best_reward: 16578.900000 ± 3831.478421 in #36


Epoch #112: 1001it [00:03, 329.49it/s, env_step=112000, gradient_step=11200, len=24, n/ep=1, n/st=100, rew=686.00]                                                                                


Epoch #112: test_reward: 11600.300000 ± 4708.892143, best_reward: 16578.900000 ± 3831.478421 in #36


Epoch #113: 1001it [00:02, 337.80it/s, env_step=113000, gradient_step=11300, len=361, n/ep=0, n/st=100, rew=20948.00]                                                                             


Epoch #113: test_reward: 12953.700000 ± 4944.128519, best_reward: 16578.900000 ± 3831.478421 in #36


Epoch #114: 1001it [00:02, 392.17it/s, env_step=114000, gradient_step=11400, len=198, n/ep=0, n/st=100, rew=11476.00]                                                                             


Epoch #114: test_reward: 9019.000000 ± 4576.264088, best_reward: 16578.900000 ± 3831.478421 in #36


Epoch #115: 1001it [00:02, 395.42it/s, env_step=115000, gradient_step=11500, len=137, n/ep=0, n/st=100, rew=7200.00]                                                                              


Epoch #115: test_reward: 11470.800000 ± 3177.669108, best_reward: 16578.900000 ± 3831.478421 in #36


Epoch #116: 1001it [00:02, 380.87it/s, env_step=116000, gradient_step=11600, len=153, n/ep=0, n/st=100, rew=8853.00]                                                                              


Epoch #116: test_reward: 10189.000000 ± 3789.075586, best_reward: 16578.900000 ± 3831.478421 in #36


Epoch #117: 1001it [00:02, 351.08it/s, env_step=117000, gradient_step=11700, len=259, n/ep=1, n/st=100, rew=15446.00]                                                                             


Epoch #117: test_reward: 9919.100000 ± 2923.142229, best_reward: 16578.900000 ± 3831.478421 in #36


Epoch #118: 1001it [00:03, 312.94it/s, env_step=118000, gradient_step=11800, len=166, n/ep=0, n/st=100, rew=9327.00]                                                                              


Epoch #118: test_reward: 4107.800000 ± 2795.216264, best_reward: 16578.900000 ± 3831.478421 in #36


Epoch #119: 1001it [00:02, 386.52it/s, env_step=119000, gradient_step=11900, len=100, n/ep=0, n/st=100, rew=4851.00]                                                                              


Epoch #119: test_reward: 10148.500000 ± 3878.613960, best_reward: 16578.900000 ± 3831.478421 in #36


Epoch #120: 1001it [00:02, 341.89it/s, env_step=120000, gradient_step=12000, len=169, n/ep=1, n/st=100, rew=9757.50]                                                                              


Epoch #120: test_reward: 10254.100000 ± 4356.303948, best_reward: 16578.900000 ± 3831.478421 in #36


Epoch #121: 1001it [00:02, 357.54it/s, env_step=121000, gradient_step=12100, len=66, n/ep=0, n/st=100, rew=2493.00]                                                                               


Epoch #121: test_reward: 11151.900000 ± 5668.454824, best_reward: 16578.900000 ± 3831.478421 in #36


Epoch #122: 1001it [00:02, 370.44it/s, env_step=122000, gradient_step=12200, len=121, n/ep=0, n/st=100, rew=6268.17]                                                                              


Epoch #122: test_reward: 10719.600000 ± 2504.500078, best_reward: 16578.900000 ± 3831.478421 in #36


Epoch #123: 1001it [00:02, 358.70it/s, env_step=123000, gradient_step=12300, len=124, n/ep=0, n/st=100, rew=6775.75]                                                                              


Epoch #123: test_reward: 11863.900000 ± 4060.915918, best_reward: 16578.900000 ± 3831.478421 in #36


Epoch #124: 1001it [00:03, 322.25it/s, env_step=124000, gradient_step=12400, len=198, n/ep=0, n/st=100, rew=12020.00]                                                                             


Epoch #124: test_reward: 10102.000000 ± 4324.165284, best_reward: 16578.900000 ± 3831.478421 in #36


Epoch #125: 1001it [00:02, 383.03it/s, env_step=125000, gradient_step=12500, len=138, n/ep=0, n/st=100, rew=7415.50]                                                                              


Epoch #125: test_reward: 13502.700000 ± 3219.799995, best_reward: 16578.900000 ± 3831.478421 in #36


Epoch #126: 1001it [00:03, 332.50it/s, env_step=126000, gradient_step=12600, len=204, n/ep=2, n/st=100, rew=11539.50]                                                                             


Epoch #126: test_reward: 12915.600000 ± 6531.448571, best_reward: 16578.900000 ± 3831.478421 in #36


Epoch #127: 1001it [00:02, 384.26it/s, env_step=127000, gradient_step=12700, len=55, n/ep=0, n/st=100, rew=2374.00]                                                                               


Epoch #127: test_reward: 10165.300000 ± 2205.684114, best_reward: 16578.900000 ± 3831.478421 in #36


Epoch #128: 1001it [00:02, 346.31it/s, env_step=128000, gradient_step=12800, len=118, n/ep=1, n/st=100, rew=7179.00]                                                                              


Epoch #128: test_reward: 8407.600000 ± 2709.266144, best_reward: 16578.900000 ± 3831.478421 in #36


Epoch #129: 1001it [00:02, 344.32it/s, env_step=129000, gradient_step=12900, len=179, n/ep=2, n/st=100, rew=10803.25]                                                                             


Epoch #129: test_reward: 11685.100000 ± 5299.116652, best_reward: 16578.900000 ± 3831.478421 in #36


Epoch #130: 1001it [00:02, 345.65it/s, env_step=130000, gradient_step=13000, len=149, n/ep=0, n/st=100, rew=7868.00]                                                                              


Epoch #130: test_reward: 11619.400000 ± 5705.938051, best_reward: 16578.900000 ± 3831.478421 in #36


Epoch #131: 1001it [00:02, 365.63it/s, env_step=131000, gradient_step=13100, len=156, n/ep=0, n/st=100, rew=8904.75]                                                                              


Epoch #131: test_reward: 10474.600000 ± 3460.587008, best_reward: 16578.900000 ± 3831.478421 in #36


Epoch #132: 1001it [00:02, 337.70it/s, env_step=132000, gradient_step=13200, len=230, n/ep=0, n/st=100, rew=14942.00]                                                                             


Epoch #132: test_reward: 9776.500000 ± 4923.108718, best_reward: 16578.900000 ± 3831.478421 in #36


Epoch #133: 1001it [00:02, 427.03it/s, env_step=133000, gradient_step=13300, len=127, n/ep=0, n/st=100, rew=6702.00]                                                                              


Epoch #133: test_reward: 11387.600000 ± 4425.016479, best_reward: 16578.900000 ± 3831.478421 in #36


Epoch #134: 1001it [00:02, 344.43it/s, env_step=134000, gradient_step=13400, len=112, n/ep=0, n/st=100, rew=6352.50]                                                                              


Epoch #134: test_reward: 10555.300000 ± 5315.839144, best_reward: 16578.900000 ± 3831.478421 in #36


Epoch #135: 1001it [00:02, 353.21it/s, env_step=135000, gradient_step=13500, len=128, n/ep=1, n/st=100, rew=7037.50]                                                                              


Epoch #135: test_reward: 10199.500000 ± 3585.937346, best_reward: 16578.900000 ± 3831.478421 in #36


Epoch #136: 1001it [00:02, 342.85it/s, env_step=136000, gradient_step=13600, len=308, n/ep=0, n/st=100, rew=20606.00]                                                                             


Epoch #136: test_reward: 9207.100000 ± 2980.881764, best_reward: 16578.900000 ± 3831.478421 in #36


Epoch #137: 1001it [00:02, 365.77it/s, env_step=137000, gradient_step=13700, len=108, n/ep=2, n/st=100, rew=5717.50]                                                                              


Epoch #137: test_reward: 11607.700000 ± 6090.899015, best_reward: 16578.900000 ± 3831.478421 in #36


Epoch #138: 1001it [00:03, 330.49it/s, env_step=138000, gradient_step=13800, len=131, n/ep=1, n/st=100, rew=7359.00]                                                                              


Epoch #138: test_reward: 10349.400000 ± 4510.410008, best_reward: 16578.900000 ± 3831.478421 in #36


Epoch #139: 1001it [00:02, 375.47it/s, env_step=139000, gradient_step=13900, len=129, n/ep=1, n/st=100, rew=7593.00]                                                                              


Epoch #139: test_reward: 8446.700000 ± 3117.842012, best_reward: 16578.900000 ± 3831.478421 in #36


Epoch #140: 1001it [00:03, 323.49it/s, env_step=140000, gradient_step=14000, len=133, n/ep=1, n/st=100, rew=7156.00]                                                                              


Epoch #140: test_reward: 13514.100000 ± 6085.489109, best_reward: 16578.900000 ± 3831.478421 in #36


Epoch #141: 1001it [00:02, 348.97it/s, env_step=141000, gradient_step=14100, len=150, n/ep=1, n/st=100, rew=9816.50]                                                                              


Epoch #141: test_reward: 16177.400000 ± 5503.429316, best_reward: 16578.900000 ± 3831.478421 in #36


Epoch #142: 1001it [00:02, 336.67it/s, env_step=142000, gradient_step=14200, len=189, n/ep=0, n/st=100, rew=11040.00]                                                                             


Epoch #142: test_reward: 7816.700000 ± 3125.144190, best_reward: 16578.900000 ± 3831.478421 in #36


Epoch #143: 1001it [00:02, 383.42it/s, env_step=143000, gradient_step=14300, len=112, n/ep=0, n/st=100, rew=6328.00]                                                                              


Epoch #143: test_reward: 10214.100000 ± 3727.066365, best_reward: 16578.900000 ± 3831.478421 in #36


Epoch #144: 1001it [00:03, 326.55it/s, env_step=144000, gradient_step=14400, len=69, n/ep=0, n/st=100, rew=3366.00]                                                                               


Epoch #144: test_reward: 9818.200000 ± 3801.511194, best_reward: 16578.900000 ± 3831.478421 in #36


Epoch #145: 1001it [00:02, 343.69it/s, env_step=145000, gradient_step=14500, len=193, n/ep=0, n/st=100, rew=11654.00]                                                                             


Epoch #145: test_reward: 9781.600000 ± 2088.003027, best_reward: 16578.900000 ± 3831.478421 in #36


Epoch #146: 1001it [00:02, 359.48it/s, env_step=146000, gradient_step=14600, len=121, n/ep=2, n/st=100, rew=6540.50]                                                                              


Epoch #146: test_reward: 10640.300000 ± 4473.499437, best_reward: 16578.900000 ± 3831.478421 in #36


Epoch #147: 1001it [00:03, 293.37it/s, env_step=147000, gradient_step=14700, len=120, n/ep=1, n/st=100, rew=7013.50]                                                                              


Epoch #147: test_reward: 12192.800000 ± 6297.681062, best_reward: 16578.900000 ± 3831.478421 in #36


Epoch #148: 1001it [00:03, 312.11it/s, env_step=148000, gradient_step=14800, len=124, n/ep=1, n/st=100, rew=7084.50]                                                                              


Epoch #148: test_reward: 14608.200000 ± 5668.130517, best_reward: 16578.900000 ± 3831.478421 in #36


Epoch #149: 1001it [00:03, 308.03it/s, env_step=149000, gradient_step=14900, len=272, n/ep=0, n/st=100, rew=17980.00]                                                                             


Epoch #149: test_reward: 10165.200000 ± 5065.526543, best_reward: 16578.900000 ± 3831.478421 in #36


Epoch #150: 1001it [00:03, 307.38it/s, env_step=150000, gradient_step=15000, len=100, n/ep=1, n/st=100, rew=5670.00]                                                                              


Epoch #150: test_reward: 10425.000000 ± 2495.811131, best_reward: 16578.900000 ± 3831.478421 in #36

InfoStats(gradient_step=15000, best_reward=16578.9, best_reward_std=3831.478420923182, train_step=150000, train_episode=775, test_step=324602, test_episode=1510, timing=TimingStats(total_time=617.8045229911804, train_time=414.7817931175232, train_time_collect=52.694798707962036, train_time_update=355.6797730922699, test_time=203.02272987365723, update_speed=361.63593120274544))

(the trained policy can be accessed via policy.policies[agents[1]])
Train with agent_opponent #10


Epoch #1: 1001it [00:02, 410.23it/s, env_step=1000, gradient_step=100, len=0, n/ep=0, n/st=100, rew=0.00]                                                                                         


Epoch #1: test_reward: 5026.600000 ± 3745.645050, best_reward: 10267.800000 ± 4671.100294 in #0


Epoch #2: 1001it [00:02, 405.83it/s, env_step=2000, gradient_step=200, len=0, n/ep=0, n/st=100, rew=0.00]                                                                                         


Epoch #2: test_reward: 10048.600000 ± 5417.907718, best_reward: 10267.800000 ± 4671.100294 in #0


Epoch #3: 1001it [00:02, 394.58it/s, env_step=3000, gradient_step=300, len=0, n/ep=0, n/st=100, rew=0.00]                                                                                         


Epoch #3: test_reward: 10317.800000 ± 4808.343120, best_reward: 10317.800000 ± 4808.343120 in #3


Epoch #4: 1001it [00:02, 409.26it/s, env_step=4000, gradient_step=400, len=0, n/ep=0, n/st=100, rew=0.00]                                                                                         


Epoch #4: test_reward: 9087.300000 ± 4301.163216, best_reward: 10317.800000 ± 4808.343120 in #3


Epoch #5: 1001it [00:02, 393.42it/s, env_step=5000, gradient_step=500, len=0, n/ep=0, n/st=100, rew=0.00]                                                                                         


Epoch #5: test_reward: 12848.600000 ± 3371.005138, best_reward: 12848.600000 ± 3371.005138 in #5


Epoch #6: 1001it [00:02, 415.91it/s, env_step=6000, gradient_step=600, len=0, n/ep=0, n/st=100, rew=0.00]                                                                                         


Epoch #6: test_reward: 10626.600000 ± 2176.940615, best_reward: 12848.600000 ± 3371.005138 in #5


Epoch #7: 1001it [00:02, 395.34it/s, env_step=7000, gradient_step=700, len=68, n/ep=0, n/st=100, rew=2176.00]                                                                                     


Epoch #7: test_reward: 11905.900000 ± 3700.415827, best_reward: 12848.600000 ± 3371.005138 in #5


Epoch #8: 1001it [00:02, 399.65it/s, env_step=8000, gradient_step=800, len=80, n/ep=1, n/st=100, rew=2264.00]                                                                                     


Epoch #8: test_reward: 15170.700000 ± 6367.278996, best_reward: 15170.700000 ± 6367.278996 in #8


Epoch #9: 1001it [00:02, 408.92it/s, env_step=9000, gradient_step=900, len=90, n/ep=1, n/st=100, rew=3283.00]                                                                                     


Epoch #9: test_reward: 11580.000000 ± 2886.365465, best_reward: 15170.700000 ± 6367.278996 in #8


Epoch #10: 1001it [00:02, 429.70it/s, env_step=10000, gradient_step=1000, len=98, n/ep=0, n/st=100, rew=3487.00]                                                                                  


Epoch #10: test_reward: 14090.700000 ± 2592.497524, best_reward: 15170.700000 ± 6367.278996 in #8


Epoch #11: 1001it [00:02, 401.47it/s, env_step=11000, gradient_step=1100, len=108, n/ep=0, n/st=100, rew=3288.00]                                                                                 


Epoch #11: test_reward: 16584.800000 ± 2974.504490, best_reward: 16584.800000 ± 2974.504490 in #11


Epoch #12: 1001it [00:02, 357.82it/s, env_step=12000, gradient_step=1200, len=118, n/ep=0, n/st=100, rew=5315.00]                                                                                 


Epoch #12: test_reward: 12108.900000 ± 4906.879099, best_reward: 16584.800000 ± 2974.504490 in #11


Epoch #13: 1001it [00:02, 400.72it/s, env_step=13000, gradient_step=1300, len=130, n/ep=4, n/st=100, rew=4495.75]                                                                                 


Epoch #13: test_reward: 7756.200000 ± 2465.757563, best_reward: 16584.800000 ± 2974.504490 in #11


Epoch #14: 1001it [00:02, 343.83it/s, env_step=14000, gradient_step=1400, len=127, n/ep=6, n/st=100, rew=4496.67]                                                                                 


Epoch #14: test_reward: 10943.700000 ± 3549.024825, best_reward: 16584.800000 ± 2974.504490 in #11


Epoch #15: 1001it [00:02, 392.97it/s, env_step=15000, gradient_step=1500, len=148, n/ep=0, n/st=100, rew=5234.17]                                                                                 


Epoch #15: test_reward: 11461.200000 ± 4720.740552, best_reward: 16584.800000 ± 2974.504490 in #11


Epoch #16: 1001it [00:02, 397.67it/s, env_step=16000, gradient_step=1600, len=152, n/ep=0, n/st=100, rew=6760.25]                                                                                 


Epoch #16: test_reward: 13011.300000 ± 4603.691237, best_reward: 16584.800000 ± 2974.504490 in #11


Epoch #17: 1001it [00:02, 356.12it/s, env_step=17000, gradient_step=1700, len=169, n/ep=0, n/st=100, rew=9278.50]                                                                                 


Epoch #17: test_reward: 10885.000000 ± 3384.551403, best_reward: 16584.800000 ± 2974.504490 in #11


Epoch #18: 1001it [00:03, 329.57it/s, env_step=18000, gradient_step=1800, len=180, n/ep=1, n/st=100, rew=7235.00]                                                                                 


Epoch #18: test_reward: 11598.200000 ± 3433.648375, best_reward: 16584.800000 ± 2974.504490 in #11


Epoch #19: 1001it [00:02, 399.65it/s, env_step=19000, gradient_step=1900, len=190, n/ep=1, n/st=100, rew=8357.00]                                                                                 


Epoch #19: test_reward: 14021.400000 ± 4705.113435, best_reward: 16584.800000 ± 2974.504490 in #11


Epoch #20: 1001it [00:02, 385.52it/s, env_step=20000, gradient_step=2000, len=196, n/ep=0, n/st=100, rew=9744.50]                                                                                 


Epoch #20: test_reward: 12020.900000 ± 4530.297197, best_reward: 16584.800000 ± 2974.504490 in #11


Epoch #21: 1001it [00:02, 382.72it/s, env_step=21000, gradient_step=2100, len=124, n/ep=2, n/st=100, rew=4812.50]                                                                                 


Epoch #21: test_reward: 12168.900000 ± 4097.552109, best_reward: 16584.800000 ± 2974.504490 in #11


Epoch #22: 1001it [00:02, 375.99it/s, env_step=22000, gradient_step=2200, len=218, n/ep=0, n/st=100, rew=9063.50]                                                                                 


Epoch #22: test_reward: 16646.800000 ± 3633.381780, best_reward: 16646.800000 ± 3633.381780 in #22


Epoch #23: 1001it [00:03, 333.39it/s, env_step=23000, gradient_step=2300, len=230, n/ep=1, n/st=100, rew=13394.50]                                                                                


Epoch #23: test_reward: 15035.700000 ± 3838.064904, best_reward: 16646.800000 ± 3633.381780 in #22


Epoch #24: 1001it [00:02, 335.58it/s, env_step=24000, gradient_step=2400, len=239, n/ep=0, n/st=100, rew=12586.00]                                                                                


Epoch #24: test_reward: 14603.000000 ± 5129.020413, best_reward: 16646.800000 ± 3633.381780 in #22


Epoch #25: 1001it [00:02, 372.49it/s, env_step=25000, gradient_step=2500, len=180, n/ep=0, n/st=100, rew=8700.00]                                                                                 


Epoch #25: test_reward: 16407.200000 ± 4258.968157, best_reward: 16646.800000 ± 3633.381780 in #22


Epoch #26: 1001it [00:02, 340.69it/s, env_step=26000, gradient_step=2600, len=260, n/ep=1, n/st=100, rew=13854.00]                                                                                


Epoch #26: test_reward: 12755.100000 ± 6812.458300, best_reward: 16646.800000 ± 3633.381780 in #22


Epoch #27: 1001it [00:02, 340.32it/s, env_step=27000, gradient_step=2700, len=180, n/ep=0, n/st=100, rew=6968.25]                                                                                 


Epoch #27: test_reward: 14451.200000 ± 5520.306093, best_reward: 16646.800000 ± 3633.381780 in #22


Epoch #28: 1001it [00:02, 347.19it/s, env_step=28000, gradient_step=2800, len=231, n/ep=0, n/st=100, rew=11353.88]                                                                                


Epoch #28: test_reward: 16616.300000 ± 3169.407770, best_reward: 16646.800000 ± 3633.381780 in #22


Epoch #29: 1001it [00:02, 376.60it/s, env_step=29000, gradient_step=2900, len=233, n/ep=2, n/st=100, rew=11174.50]                                                                                


Epoch #29: test_reward: 15198.000000 ± 9476.583140, best_reward: 16646.800000 ± 3633.381780 in #22


Epoch #30: 1001it [00:02, 365.06it/s, env_step=30000, gradient_step=3000, len=150, n/ep=3, n/st=100, rew=7422.17]                                                                                 


Epoch #30: test_reward: 16833.100000 ± 4629.947396, best_reward: 16833.100000 ± 4629.947396 in #30


Epoch #31: 1001it [00:02, 379.26it/s, env_step=31000, gradient_step=3100, len=185, n/ep=0, n/st=100, rew=10877.00]                                                                                


Epoch #31: test_reward: 12015.600000 ± 5429.902157, best_reward: 16833.100000 ± 4629.947396 in #30


Epoch #32: 1001it [00:02, 360.45it/s, env_step=32000, gradient_step=3200, len=314, n/ep=0, n/st=100, rew=15711.00]                                                                                


Epoch #32: test_reward: 12177.000000 ± 5143.489263, best_reward: 16833.100000 ± 4629.947396 in #30


Epoch #33: 1001it [00:02, 371.30it/s, env_step=33000, gradient_step=3300, len=258, n/ep=1, n/st=100, rew=11764.00]                                                                                


Epoch #33: test_reward: 13062.500000 ± 5568.656773, best_reward: 16833.100000 ± 4629.947396 in #30


Epoch #34: 1001it [00:02, 349.85it/s, env_step=34000, gradient_step=3400, len=339, n/ep=0, n/st=100, rew=17177.00]                                                                                


Epoch #34: test_reward: 12917.500000 ± 6061.981611, best_reward: 16833.100000 ± 4629.947396 in #30


Epoch #35: 1001it [00:02, 369.80it/s, env_step=35000, gradient_step=3500, len=176, n/ep=0, n/st=100, rew=9971.50]                                                                                 


Epoch #35: test_reward: 15953.000000 ± 4158.663848, best_reward: 16833.100000 ± 4629.947396 in #30


Epoch #36: 1001it [00:02, 344.49it/s, env_step=36000, gradient_step=3600, len=129, n/ep=0, n/st=100, rew=6514.75]                                                                                 


Epoch #36: test_reward: 15605.000000 ± 5456.978523, best_reward: 16833.100000 ± 4629.947396 in #30


Epoch #37: 1001it [00:02, 405.44it/s, env_step=37000, gradient_step=3700, len=227, n/ep=0, n/st=100, rew=12699.00]                                                                                


Epoch #37: test_reward: 10996.500000 ± 4020.847703, best_reward: 16833.100000 ± 4629.947396 in #30


Epoch #38: 1001it [00:02, 393.74it/s, env_step=38000, gradient_step=3800, len=375, n/ep=0, n/st=100, rew=20441.00]                                                                                


Epoch #38: test_reward: 12574.300000 ± 4527.729808, best_reward: 16833.100000 ± 4629.947396 in #30


Epoch #39: 1001it [00:02, 347.53it/s, env_step=39000, gradient_step=3900, len=213, n/ep=0, n/st=100, rew=12191.00]                                                                                


Epoch #39: test_reward: 8758.700000 ± 3316.902261, best_reward: 16833.100000 ± 4629.947396 in #30


Epoch #40: 1001it [00:02, 382.40it/s, env_step=40000, gradient_step=4000, len=253, n/ep=1, n/st=100, rew=15438.00]                                                                                


Epoch #40: test_reward: 12994.000000 ± 5392.565679, best_reward: 16833.100000 ± 4629.947396 in #30


Epoch #41: 1001it [00:02, 385.52it/s, env_step=41000, gradient_step=4100, len=258, n/ep=1, n/st=100, rew=14298.00]                                                                                


Epoch #41: test_reward: 12794.600000 ± 5203.851731, best_reward: 16833.100000 ± 4629.947396 in #30


Epoch #42: 1001it [00:02, 388.25it/s, env_step=42000, gradient_step=4200, len=276, n/ep=0, n/st=100, rew=15960.00]                                                                                


Epoch #42: test_reward: 11393.600000 ± 4194.973616, best_reward: 16833.100000 ± 4629.947396 in #30


Epoch #43: 1001it [00:02, 334.31it/s, env_step=43000, gradient_step=4300, len=300, n/ep=1, n/st=100, rew=15507.00]                                                                                


Epoch #43: test_reward: 13310.800000 ± 6961.793516, best_reward: 16833.100000 ± 4629.947396 in #30


Epoch #44: 1001it [00:02, 375.72it/s, env_step=44000, gradient_step=4400, len=310, n/ep=1, n/st=100, rew=17251.50]                                                                                


Epoch #44: test_reward: 10916.300000 ± 3199.409197, best_reward: 16833.100000 ± 4629.947396 in #30


Epoch #45: 1001it [00:02, 367.56it/s, env_step=45000, gradient_step=4500, len=50, n/ep=1, n/st=100, rew=2013.00]                                                                                  


Epoch #45: test_reward: 12119.000000 ± 4073.896243, best_reward: 16833.100000 ± 4629.947396 in #30


Epoch #46: 1001it [00:02, 339.58it/s, env_step=46000, gradient_step=4600, len=189, n/ep=0, n/st=100, rew=10598.00]                                                                                


Epoch #46: test_reward: 13274.000000 ± 2901.585773, best_reward: 16833.100000 ± 4629.947396 in #30


Epoch #47: 1001it [00:02, 421.65it/s, env_step=47000, gradient_step=4700, len=183, n/ep=0, n/st=100, rew=9862.25]                                                                                 


Epoch #47: test_reward: 9771.100000 ± 2893.092410, best_reward: 16833.100000 ± 4629.947396 in #30


Epoch #48: 1001it [00:02, 371.62it/s, env_step=48000, gradient_step=4800, len=239, n/ep=0, n/st=100, rew=12338.00]                                                                                


Epoch #48: test_reward: 9135.600000 ± 4178.898975, best_reward: 16833.100000 ± 4629.947396 in #30


Epoch #49: 1001it [00:02, 354.60it/s, env_step=49000, gradient_step=4900, len=213, n/ep=0, n/st=100, rew=11487.00]                                                                                


Epoch #49: test_reward: 16332.500000 ± 4902.987625, best_reward: 16833.100000 ± 4629.947396 in #30


Epoch #50: 1001it [00:02, 371.18it/s, env_step=50000, gradient_step=5000, len=349, n/ep=0, n/st=100, rew=21475.00]                                                                                


Epoch #50: test_reward: 16306.100000 ± 4548.217551, best_reward: 16833.100000 ± 4629.947396 in #30


Epoch #51: 1001it [00:02, 363.49it/s, env_step=51000, gradient_step=5100, len=74, n/ep=0, n/st=100, rew=3372.00]                                                                                  


Epoch #51: test_reward: 13819.400000 ± 6890.533974, best_reward: 16833.100000 ± 4629.947396 in #30


Epoch #52: 1001it [00:02, 395.43it/s, env_step=52000, gradient_step=5200, len=276, n/ep=0, n/st=100, rew=15977.50]                                                                                


Epoch #52: test_reward: 12281.900000 ± 4189.329504, best_reward: 16833.100000 ± 4629.947396 in #30


Epoch #53: 1001it [00:02, 368.80it/s, env_step=53000, gradient_step=5300, len=339, n/ep=0, n/st=100, rew=19632.50]                                                                                


Epoch #53: test_reward: 9527.300000 ± 5121.930282, best_reward: 16833.100000 ± 4629.947396 in #30


Epoch #54: 1001it [00:02, 368.15it/s, env_step=54000, gradient_step=5400, len=400, n/ep=1, n/st=100, rew=23514.00]                                                                                


Epoch #54: test_reward: 12743.000000 ± 3366.585243, best_reward: 16833.100000 ± 4629.947396 in #30


Epoch #55: 1001it [00:02, 383.77it/s, env_step=55000, gradient_step=5500, len=99, n/ep=1, n/st=100, rew=4219.50]                                                                                  


Epoch #55: test_reward: 12412.900000 ± 5056.492786, best_reward: 16833.100000 ± 4629.947396 in #30


Epoch #56: 1001it [00:02, 374.16it/s, env_step=56000, gradient_step=5600, len=185, n/ep=1, n/st=100, rew=10326.00]                                                                                


Epoch #56: test_reward: 10913.600000 ± 5773.726564, best_reward: 16833.100000 ± 4629.947396 in #30


Epoch #57: 1001it [00:02, 403.66it/s, env_step=57000, gradient_step=5700, len=134, n/ep=0, n/st=100, rew=6963.00]                                                                                 


Epoch #57: test_reward: 8588.400000 ± 3903.710829, best_reward: 16833.100000 ± 4629.947396 in #30


Epoch #58: 1001it [00:02, 369.52it/s, env_step=58000, gradient_step=5800, len=186, n/ep=0, n/st=100, rew=10242.00]                                                                                


Epoch #58: test_reward: 12925.700000 ± 3422.350948, best_reward: 16833.100000 ± 4629.947396 in #30


Epoch #59: 1001it [00:02, 370.45it/s, env_step=59000, gradient_step=5900, len=195, n/ep=0, n/st=100, rew=11091.00]                                                                                


Epoch #59: test_reward: 10788.600000 ± 4237.720241, best_reward: 16833.100000 ± 4629.947396 in #30


Epoch #60: 1001it [00:02, 368.08it/s, env_step=60000, gradient_step=6000, len=316, n/ep=1, n/st=100, rew=18877.00]                                                                                


Epoch #60: test_reward: 8169.300000 ± 4077.317404, best_reward: 16833.100000 ± 4629.947396 in #30


Epoch #61: 1001it [00:02, 372.65it/s, env_step=61000, gradient_step=6100, len=161, n/ep=1, n/st=100, rew=9354.50]                                                                                 


Epoch #61: test_reward: 12514.400000 ± 4705.905613, best_reward: 16833.100000 ± 4629.947396 in #30


Epoch #62: 1001it [00:02, 369.39it/s, env_step=62000, gradient_step=6200, len=170, n/ep=0, n/st=100, rew=9554.00]                                                                                 


Epoch #62: test_reward: 10240.700000 ± 5250.929442, best_reward: 16833.100000 ± 4629.947396 in #30


Epoch #63: 1001it [00:02, 418.52it/s, env_step=63000, gradient_step=6300, len=162, n/ep=1, n/st=100, rew=7779.00]                                                                                 


Epoch #63: test_reward: 8911.300000 ± 3895.388660, best_reward: 16833.100000 ± 4629.947396 in #30


Epoch #64: 1001it [00:02, 358.67it/s, env_step=64000, gradient_step=6400, len=209, n/ep=0, n/st=100, rew=11667.00]                                                                                


Epoch #64: test_reward: 11029.600000 ± 4045.625000, best_reward: 16833.100000 ± 4629.947396 in #30


Epoch #65: 1001it [00:02, 415.16it/s, env_step=65000, gradient_step=6500, len=221, n/ep=0, n/st=100, rew=12382.00]                                                                                


Epoch #65: test_reward: 13790.800000 ± 3834.309033, best_reward: 16833.100000 ± 4629.947396 in #30


Epoch #66: 1001it [00:02, 338.49it/s, env_step=66000, gradient_step=6600, len=165, n/ep=0, n/st=100, rew=8578.75]                                                                                 


Epoch #66: test_reward: 10435.300000 ± 4415.771870, best_reward: 16833.100000 ± 4629.947396 in #30


Epoch #67: 1001it [00:02, 385.14it/s, env_step=67000, gradient_step=6700, len=263, n/ep=1, n/st=100, rew=16240.00]                                                                                


Epoch #67: test_reward: 9321.000000 ± 4449.494196, best_reward: 16833.100000 ± 4629.947396 in #30


Epoch #68: 1001it [00:02, 401.55it/s, env_step=68000, gradient_step=6800, len=94, n/ep=0, n/st=100, rew=4516.00]                                                                                  


Epoch #68: test_reward: 7377.200000 ± 4122.177987, best_reward: 16833.100000 ± 4629.947396 in #30


Epoch #69: 1001it [00:02, 425.05it/s, env_step=69000, gradient_step=6900, len=235, n/ep=1, n/st=100, rew=13448.50]                                                                                


Epoch #69: test_reward: 9822.600000 ± 2905.095255, best_reward: 16833.100000 ± 4629.947396 in #30


Epoch #70: 1001it [00:02, 397.61it/s, env_step=70000, gradient_step=7000, len=237, n/ep=0, n/st=100, rew=13133.25]                                                                                


Epoch #70: test_reward: 10870.800000 ± 2981.125016, best_reward: 16833.100000 ± 4629.947396 in #30


Epoch #71: 1001it [00:02, 397.86it/s, env_step=71000, gradient_step=7100, len=273, n/ep=2, n/st=100, rew=15541.50]                                                                                


Epoch #71: test_reward: 8961.800000 ± 5374.306296, best_reward: 16833.100000 ± 4629.947396 in #30


Epoch #72: 1001it [00:02, 394.71it/s, env_step=72000, gradient_step=7200, len=238, n/ep=0, n/st=100, rew=13428.00]                                                                                


Epoch #72: test_reward: 14218.000000 ± 3491.473500, best_reward: 16833.100000 ± 4629.947396 in #30


Epoch #73: 1001it [00:02, 372.25it/s, env_step=73000, gradient_step=7300, len=319, n/ep=1, n/st=100, rew=17233.00]                                                                                


Epoch #73: test_reward: 14087.000000 ± 5427.123455, best_reward: 16833.100000 ± 4629.947396 in #30


Epoch #74: 1001it [00:02, 365.38it/s, env_step=74000, gradient_step=7400, len=190, n/ep=0, n/st=100, rew=9640.00]                                                                                 


Epoch #74: test_reward: 10843.000000 ± 4239.451073, best_reward: 16833.100000 ± 4629.947396 in #30


Epoch #75: 1001it [00:02, 397.90it/s, env_step=75000, gradient_step=7500, len=277, n/ep=1, n/st=100, rew=15510.50]                                                                                


Epoch #75: test_reward: 19126.400000 ± 5267.276852, best_reward: 19126.400000 ± 5267.276852 in #75


Epoch #76: 1001it [00:02, 417.27it/s, env_step=76000, gradient_step=7600, len=210, n/ep=0, n/st=100, rew=11895.88]                                                                                


Epoch #76: test_reward: 12695.900000 ± 4642.488114, best_reward: 19126.400000 ± 5267.276852 in #75


Epoch #77: 1001it [00:02, 369.75it/s, env_step=77000, gradient_step=7700, len=296, n/ep=0, n/st=100, rew=17780.75]                                                                                


Epoch #77: test_reward: 13999.900000 ± 6112.340377, best_reward: 19126.400000 ± 5267.276852 in #75


Epoch #78: 1001it [00:02, 420.47it/s, env_step=78000, gradient_step=7800, len=194, n/ep=1, n/st=100, rew=11457.00]                                                                                


Epoch #78: test_reward: 10295.400000 ± 3449.332463, best_reward: 19126.400000 ± 5267.276852 in #75


Epoch #79: 1001it [00:02, 388.95it/s, env_step=79000, gradient_step=7900, len=77, n/ep=1, n/st=100, rew=3651.00]                                                                                  


Epoch #79: test_reward: 10178.900000 ± 5366.741105, best_reward: 19126.400000 ± 5267.276852 in #75


Epoch #80: 1001it [00:02, 342.80it/s, env_step=80000, gradient_step=8000, len=298, n/ep=0, n/st=100, rew=15980.00]                                                                                


Epoch #80: test_reward: 12012.300000 ± 5244.722949, best_reward: 19126.400000 ± 5267.276852 in #75


Epoch #81: 1001it [00:02, 354.92it/s, env_step=81000, gradient_step=8100, len=261, n/ep=0, n/st=100, rew=15850.00]                                                                                


Epoch #81: test_reward: 10266.200000 ± 4325.611675, best_reward: 19126.400000 ± 5267.276852 in #75


Epoch #82: 1001it [00:02, 400.44it/s, env_step=82000, gradient_step=8200, len=124, n/ep=0, n/st=100, rew=5557.00]                                                                                 


Epoch #82: test_reward: 8758.000000 ± 4107.230916, best_reward: 19126.400000 ± 5267.276852 in #75


Epoch #83: 1001it [00:02, 412.92it/s, env_step=83000, gradient_step=8300, len=164, n/ep=0, n/st=100, rew=8868.50]                                                                                 


Epoch #83: test_reward: 13712.200000 ± 6678.755315, best_reward: 19126.400000 ± 5267.276852 in #75


Epoch #84: 1001it [00:02, 389.49it/s, env_step=84000, gradient_step=8400, len=107, n/ep=0, n/st=100, rew=4391.00]                                                                                 


Epoch #84: test_reward: 14506.300000 ± 6863.865428, best_reward: 19126.400000 ± 5267.276852 in #75


Epoch #85: 1001it [00:02, 355.54it/s, env_step=85000, gradient_step=8500, len=227, n/ep=1, n/st=100, rew=12019.00]                                                                                


Epoch #85: test_reward: 12979.000000 ± 6412.065517, best_reward: 19126.400000 ± 5267.276852 in #75


Epoch #86: 1001it [00:02, 386.81it/s, env_step=86000, gradient_step=8600, len=231, n/ep=0, n/st=100, rew=11162.00]                                                                                


Epoch #86: test_reward: 13355.000000 ± 4695.249344, best_reward: 19126.400000 ± 5267.276852 in #75


Epoch #87: 1001it [00:02, 354.64it/s, env_step=87000, gradient_step=8700, len=171, n/ep=0, n/st=100, rew=9571.25]                                                                                 


Epoch #87: test_reward: 16637.100000 ± 6253.003110, best_reward: 19126.400000 ± 5267.276852 in #75


Epoch #88: 1001it [00:02, 346.97it/s, env_step=88000, gradient_step=8800, len=273, n/ep=0, n/st=100, rew=14556.00]                                                                                


Epoch #88: test_reward: 10304.300000 ± 4286.658163, best_reward: 19126.400000 ± 5267.276852 in #75


Epoch #89: 1001it [00:02, 413.95it/s, env_step=89000, gradient_step=8900, len=105, n/ep=0, n/st=100, rew=4873.00]                                                                                 


Epoch #89: test_reward: 12333.400000 ± 4226.300302, best_reward: 19126.400000 ± 5267.276852 in #75


Epoch #90: 1001it [00:02, 380.80it/s, env_step=90000, gradient_step=9000, len=322, n/ep=2, n/st=100, rew=18506.00]                                                                                


Epoch #90: test_reward: 7783.200000 ± 6188.058271, best_reward: 19126.400000 ± 5267.276852 in #75


Epoch #91: 1001it [00:02, 426.32it/s, env_step=91000, gradient_step=9100, len=240, n/ep=1, n/st=100, rew=14067.50]                                                                                


Epoch #91: test_reward: 12114.900000 ± 5465.299927, best_reward: 19126.400000 ± 5267.276852 in #75


Epoch #92: 1001it [00:02, 334.67it/s, env_step=92000, gradient_step=9200, len=188, n/ep=0, n/st=100, rew=11512.00]                                                                                


Epoch #92: test_reward: 12104.400000 ± 5536.488241, best_reward: 19126.400000 ± 5267.276852 in #75


Epoch #93: 1001it [00:02, 388.55it/s, env_step=93000, gradient_step=9300, len=107, n/ep=2, n/st=100, rew=5487.50]                                                                                 


Epoch #93: test_reward: 16135.800000 ± 4586.190441, best_reward: 19126.400000 ± 5267.276852 in #75


Epoch #94: 1001it [00:02, 348.79it/s, env_step=94000, gradient_step=9400, len=71, n/ep=0, n/st=100, rew=3206.00]                                                                                  


Epoch #94: test_reward: 19421.200000 ± 7138.373439, best_reward: 19421.200000 ± 7138.373439 in #94


Epoch #95: 1001it [00:02, 423.96it/s, env_step=95000, gradient_step=9500, len=312, n/ep=0, n/st=100, rew=19120.50]                                                                                


Epoch #95: test_reward: 9897.000000 ± 2533.147173, best_reward: 19421.200000 ± 7138.373439 in #94


Epoch #96: 1001it [00:02, 390.25it/s, env_step=96000, gradient_step=9600, len=137, n/ep=0, n/st=100, rew=7507.00]                                                                                 


Epoch #96: test_reward: 14160.400000 ± 7378.274164, best_reward: 19421.200000 ± 7138.373439 in #94


Epoch #97: 1001it [00:03, 331.48it/s, env_step=97000, gradient_step=9700, len=220, n/ep=0, n/st=100, rew=12200.25]                                                                                


Epoch #97: test_reward: 9708.400000 ± 4403.994873, best_reward: 19421.200000 ± 7138.373439 in #94


Epoch #98: 1001it [00:03, 323.25it/s, env_step=98000, gradient_step=9800, len=273, n/ep=0, n/st=100, rew=15241.00]                                                                                


Epoch #98: test_reward: 8718.400000 ± 3453.707782, best_reward: 19421.200000 ± 7138.373439 in #94


Epoch #99: 1001it [00:03, 292.67it/s, env_step=99000, gradient_step=9900, len=160, n/ep=0, n/st=100, rew=9361.00]                                                                                 


Epoch #99: test_reward: 11902.800000 ± 4241.260869, best_reward: 19421.200000 ± 7138.373439 in #94


Epoch #100: 1001it [00:03, 320.24it/s, env_step=100000, gradient_step=10000, len=299, n/ep=0, n/st=100, rew=17227.50]                                                                             


Epoch #100: test_reward: 10907.600000 ± 6237.817699, best_reward: 19421.200000 ± 7138.373439 in #94


Epoch #101: 1001it [00:02, 364.14it/s, env_step=101000, gradient_step=10100, len=282, n/ep=0, n/st=100, rew=16655.00]                                                                             


Epoch #101: test_reward: 9491.000000 ± 4780.290891, best_reward: 19421.200000 ± 7138.373439 in #94


Epoch #102: 1001it [00:03, 313.67it/s, env_step=102000, gradient_step=10200, len=127, n/ep=0, n/st=100, rew=7325.50]                                                                              


Epoch #102: test_reward: 9980.200000 ± 2948.490251, best_reward: 19421.200000 ± 7138.373439 in #94


Epoch #103: 1001it [00:02, 337.16it/s, env_step=103000, gradient_step=10300, len=53, n/ep=0, n/st=100, rew=2409.00]                                                                               


Epoch #103: test_reward: 8032.800000 ± 2642.457977, best_reward: 19421.200000 ± 7138.373439 in #94


Epoch #104: 1001it [00:02, 361.61it/s, env_step=104000, gradient_step=10400, len=85, n/ep=0, n/st=100, rew=4055.00]                                                                               


Epoch #104: test_reward: 16166.300000 ± 7379.691349, best_reward: 19421.200000 ± 7138.373439 in #94


Epoch #105: 1001it [00:02, 336.09it/s, env_step=105000, gradient_step=10500, len=276, n/ep=0, n/st=100, rew=15906.00]                                                                             


Epoch #105: test_reward: 11408.400000 ± 4380.826913, best_reward: 19421.200000 ± 7138.373439 in #94


Epoch #106: 1001it [00:03, 328.77it/s, env_step=106000, gradient_step=10600, len=160, n/ep=2, n/st=100, rew=9126.00]                                                                              


Epoch #106: test_reward: 9458.200000 ± 4019.056551, best_reward: 19421.200000 ± 7138.373439 in #94


Epoch #107: 1001it [00:02, 338.68it/s, env_step=107000, gradient_step=10700, len=188, n/ep=0, n/st=100, rew=10980.50]                                                                             


Epoch #107: test_reward: 11019.400000 ± 3832.665605, best_reward: 19421.200000 ± 7138.373439 in #94


Epoch #108: 1001it [00:03, 331.07it/s, env_step=108000, gradient_step=10800, len=224, n/ep=2, n/st=100, rew=12850.00]                                                                             


Epoch #108: test_reward: 7457.200000 ± 3076.460687, best_reward: 19421.200000 ± 7138.373439 in #94


Epoch #109: 1001it [00:02, 365.37it/s, env_step=109000, gradient_step=10900, len=111, n/ep=0, n/st=100, rew=6184.00]                                                                              


Epoch #109: test_reward: 8541.200000 ± 4124.047546, best_reward: 19421.200000 ± 7138.373439 in #94


Epoch #110: 1001it [00:02, 387.59it/s, env_step=110000, gradient_step=11000, len=230, n/ep=0, n/st=100, rew=13937.00]                                                                             


Epoch #110: test_reward: 9322.400000 ± 3645.062172, best_reward: 19421.200000 ± 7138.373439 in #94


Epoch #111: 1001it [00:02, 409.58it/s, env_step=111000, gradient_step=11100, len=257, n/ep=2, n/st=100, rew=14627.50]                                                                             


Epoch #111: test_reward: 11387.800000 ± 6316.782881, best_reward: 19421.200000 ± 7138.373439 in #94


Epoch #112: 1001it [00:02, 379.77it/s, env_step=112000, gradient_step=11200, len=131, n/ep=0, n/st=100, rew=7000.50]                                                                              


Epoch #112: test_reward: 10653.800000 ± 2695.286916, best_reward: 19421.200000 ± 7138.373439 in #94


Epoch #113: 1001it [00:02, 369.10it/s, env_step=113000, gradient_step=11300, len=213, n/ep=0, n/st=100, rew=11817.00]                                                                             


Epoch #113: test_reward: 6830.800000 ± 2489.875652, best_reward: 19421.200000 ± 7138.373439 in #94


Epoch #114: 1001it [00:02, 393.40it/s, env_step=114000, gradient_step=11400, len=144, n/ep=0, n/st=100, rew=8310.00]                                                                              


Epoch #114: test_reward: 8975.000000 ± 6002.505693, best_reward: 19421.200000 ± 7138.373439 in #94


Epoch #115: 1001it [00:02, 368.64it/s, env_step=115000, gradient_step=11500, len=245, n/ep=2, n/st=100, rew=14356.25]                                                                             


Epoch #115: test_reward: 11706.100000 ± 4189.807882, best_reward: 19421.200000 ± 7138.373439 in #94


Epoch #116: 1001it [00:02, 403.07it/s, env_step=116000, gradient_step=11600, len=100, n/ep=2, n/st=100, rew=4738.50]                                                                              


Epoch #116: test_reward: 9929.100000 ± 4132.068065, best_reward: 19421.200000 ± 7138.373439 in #94


Epoch #117: 1001it [00:02, 347.52it/s, env_step=117000, gradient_step=11700, len=231, n/ep=0, n/st=100, rew=12904.50]                                                                             


Epoch #117: test_reward: 12311.200000 ± 4646.229628, best_reward: 19421.200000 ± 7138.373439 in #94


Epoch #118: 1001it [00:02, 420.19it/s, env_step=118000, gradient_step=11800, len=259, n/ep=1, n/st=100, rew=15830.00]                                                                             


Epoch #118: test_reward: 13516.700000 ± 3044.510406, best_reward: 19421.200000 ± 7138.373439 in #94


Epoch #119: 1001it [00:02, 371.60it/s, env_step=119000, gradient_step=11900, len=150, n/ep=0, n/st=100, rew=7063.00]                                                                              


Epoch #119: test_reward: 11357.600000 ± 3121.124675, best_reward: 19421.200000 ± 7138.373439 in #94


Epoch #120: 1001it [00:02, 337.84it/s, env_step=120000, gradient_step=12000, len=64, n/ep=0, n/st=100, rew=2864.00]                                                                               


Epoch #120: test_reward: 11075.100000 ± 4330.933397, best_reward: 19421.200000 ± 7138.373439 in #94


Epoch #121: 1001it [00:02, 391.09it/s, env_step=121000, gradient_step=12100, len=149, n/ep=0, n/st=100, rew=5661.00]                                                                              


Epoch #121: test_reward: 10460.200000 ± 3897.217002, best_reward: 19421.200000 ± 7138.373439 in #94


Epoch #122: 1001it [00:02, 393.74it/s, env_step=122000, gradient_step=12200, len=183, n/ep=1, n/st=100, rew=8734.00]                                                                              


Epoch #122: test_reward: 11259.500000 ± 2909.453737, best_reward: 19421.200000 ± 7138.373439 in #94


Epoch #123: 1001it [00:02, 369.32it/s, env_step=123000, gradient_step=12300, len=181, n/ep=0, n/st=100, rew=10628.50]                                                                             


Epoch #123: test_reward: 9166.200000 ± 3349.987606, best_reward: 19421.200000 ± 7138.373439 in #94


Epoch #124: 1001it [00:02, 358.64it/s, env_step=124000, gradient_step=12400, len=121, n/ep=1, n/st=100, rew=6491.50]                                                                              


Epoch #124: test_reward: 8729.000000 ± 6525.440200, best_reward: 19421.200000 ± 7138.373439 in #94


Epoch #125: 1001it [00:02, 419.02it/s, env_step=125000, gradient_step=12500, len=164, n/ep=0, n/st=100, rew=8908.00]                                                                              


Epoch #125: test_reward: 12086.600000 ± 4343.508771, best_reward: 19421.200000 ± 7138.373439 in #94


Epoch #126: 1001it [00:02, 354.21it/s, env_step=126000, gradient_step=12600, len=281, n/ep=0, n/st=100, rew=17831.00]                                                                             


Epoch #126: test_reward: 9434.800000 ± 3885.835220, best_reward: 19421.200000 ± 7138.373439 in #94


Epoch #127: 1001it [00:03, 328.42it/s, env_step=127000, gradient_step=12700, len=148, n/ep=0, n/st=100, rew=8545.50]                                                                              


Epoch #127: test_reward: 10025.200000 ± 3551.345936, best_reward: 19421.200000 ± 7138.373439 in #94


Epoch #128: 1001it [00:02, 343.71it/s, env_step=128000, gradient_step=12800, len=217, n/ep=0, n/st=100, rew=12178.00]                                                                             


Epoch #128: test_reward: 11088.600000 ± 4795.554341, best_reward: 19421.200000 ± 7138.373439 in #94


Epoch #129: 1001it [00:02, 386.64it/s, env_step=129000, gradient_step=12900, len=154, n/ep=1, n/st=100, rew=8530.00]                                                                              


Epoch #129: test_reward: 13669.300000 ± 6147.050155, best_reward: 19421.200000 ± 7138.373439 in #94


Epoch #130: 1001it [00:02, 405.41it/s, env_step=130000, gradient_step=13000, len=182, n/ep=0, n/st=100, rew=9957.50]                                                                              


Epoch #130: test_reward: 10682.400000 ± 5218.489632, best_reward: 19421.200000 ± 7138.373439 in #94


Epoch #131: 1001it [00:02, 378.98it/s, env_step=131000, gradient_step=13100, len=104, n/ep=1, n/st=100, rew=5069.00]                                                                              


Epoch #131: test_reward: 9590.100000 ± 4484.847878, best_reward: 19421.200000 ± 7138.373439 in #94


Epoch #132: 1001it [00:02, 371.90it/s, env_step=132000, gradient_step=13200, len=97, n/ep=0, n/st=100, rew=4385.00]                                                                               


Epoch #132: test_reward: 8647.400000 ± 2985.657589, best_reward: 19421.200000 ± 7138.373439 in #94


Epoch #133: 1001it [00:02, 358.74it/s, env_step=133000, gradient_step=13300, len=159, n/ep=0, n/st=100, rew=8974.00]                                                                              


Epoch #133: test_reward: 9442.800000 ± 4281.783105, best_reward: 19421.200000 ± 7138.373439 in #94


Epoch #134: 1001it [00:02, 388.36it/s, env_step=134000, gradient_step=13400, len=335, n/ep=0, n/st=100, rew=21716.00]                                                                             


Epoch #134: test_reward: 12103.100000 ± 5319.386778, best_reward: 19421.200000 ± 7138.373439 in #94


Epoch #135: 1001it [00:02, 356.32it/s, env_step=135000, gradient_step=13500, len=152, n/ep=0, n/st=100, rew=9299.00]                                                                              


Epoch #135: test_reward: 9503.800000 ± 2887.812245, best_reward: 19421.200000 ± 7138.373439 in #94


Epoch #136: 1001it [00:02, 385.36it/s, env_step=136000, gradient_step=13600, len=107, n/ep=0, n/st=100, rew=5922.00]                                                                              


Epoch #136: test_reward: 11719.600000 ± 5782.346984, best_reward: 19421.200000 ± 7138.373439 in #94


Epoch #137: 1001it [00:03, 332.18it/s, env_step=137000, gradient_step=13700, len=140, n/ep=0, n/st=100, rew=6932.00]                                                                              


Epoch #137: test_reward: 12810.300000 ± 4175.340203, best_reward: 19421.200000 ± 7138.373439 in #94


Epoch #138: 1001it [00:03, 312.96it/s, env_step=138000, gradient_step=13800, len=104, n/ep=1, n/st=100, rew=5174.00]                                                                              


Epoch #138: test_reward: 13050.400000 ± 5557.640780, best_reward: 19421.200000 ± 7138.373439 in #94


Epoch #139: 1001it [00:02, 394.75it/s, env_step=139000, gradient_step=13900, len=82, n/ep=0, n/st=100, rew=3477.50]                                                                               


Epoch #139: test_reward: 12958.400000 ± 4417.742030, best_reward: 19421.200000 ± 7138.373439 in #94


Epoch #140: 1001it [00:02, 368.40it/s, env_step=140000, gradient_step=14000, len=192, n/ep=1, n/st=100, rew=11196.50]                                                                             


Epoch #140: test_reward: 11322.400000 ± 4927.720248, best_reward: 19421.200000 ± 7138.373439 in #94


Epoch #141: 1001it [00:02, 411.63it/s, env_step=141000, gradient_step=14100, len=121, n/ep=1, n/st=100, rew=6205.50]                                                                              


Epoch #141: test_reward: 10294.400000 ± 5618.867487, best_reward: 19421.200000 ± 7138.373439 in #94


Epoch #142: 1001it [00:02, 392.69it/s, env_step=142000, gradient_step=14200, len=110, n/ep=1, n/st=100, rew=5698.00]                                                                              


Epoch #142: test_reward: 12994.400000 ± 3705.546632, best_reward: 19421.200000 ± 7138.373439 in #94


Epoch #143: 1001it [00:03, 318.33it/s, env_step=143000, gradient_step=14300, len=86, n/ep=1, n/st=100, rew=4503.00]                                                                               


Epoch #143: test_reward: 9872.500000 ± 5380.581293, best_reward: 19421.200000 ± 7138.373439 in #94


Epoch #144: 1001it [00:02, 351.21it/s, env_step=144000, gradient_step=14400, len=196, n/ep=0, n/st=100, rew=10988.00]                                                                             


Epoch #144: test_reward: 7322.600000 ± 3758.915168, best_reward: 19421.200000 ± 7138.373439 in #94


Epoch #145: 1001it [00:03, 330.61it/s, env_step=145000, gradient_step=14500, len=92, n/ep=1, n/st=100, rew=4503.50]                                                                               


Epoch #145: test_reward: 10385.100000 ± 6202.908502, best_reward: 19421.200000 ± 7138.373439 in #94


Epoch #146: 1001it [00:03, 323.54it/s, env_step=146000, gradient_step=14600, len=137, n/ep=0, n/st=100, rew=7303.00]                                                                              


Epoch #146: test_reward: 9093.400000 ± 2735.736691, best_reward: 19421.200000 ± 7138.373439 in #94


Epoch #147: 1001it [00:03, 300.63it/s, env_step=147000, gradient_step=14700, len=149, n/ep=1, n/st=100, rew=7866.50]                                                                              


Epoch #147: test_reward: 11171.900000 ± 5313.752129, best_reward: 19421.200000 ± 7138.373439 in #94


Epoch #148: 1001it [00:02, 338.35it/s, env_step=148000, gradient_step=14800, len=122, n/ep=2, n/st=100, rew=6571.50]                                                                              


Epoch #148: test_reward: 10843.800000 ± 2938.068406, best_reward: 19421.200000 ± 7138.373439 in #94


Epoch #149: 1001it [00:02, 391.26it/s, env_step=149000, gradient_step=14900, len=108, n/ep=1, n/st=100, rew=6364.00]                                                                              


Epoch #149: test_reward: 8827.500000 ± 5786.522034, best_reward: 19421.200000 ± 7138.373439 in #94


Epoch #150: 1001it [00:02, 363.04it/s, env_step=150000, gradient_step=15000, len=53, n/ep=0, n/st=100, rew=1601.50]                                                                               


Epoch #150: test_reward: 13554.200000 ± 6750.761628, best_reward: 19421.200000 ± 7138.373439 in #94

InfoStats(gradient_step=15000, best_reward=19421.2, best_reward_std=7138.373439376788, train_step=150000, train_episode=726, test_step=341072, test_episode=1510, timing=TimingStats(total_time=602.4164335727692, train_time=407.08080196380615, train_time_collect=51.14881920814514, train_time_update=349.478271484375, test_time=195.335631608963, update_speed=368.47721453918285))

(the trained policy can be accessed via policy.policies[agents[1]])
Train with agent_opponent #17


Epoch #1: 1001it [00:02, 439.97it/s, env_step=1000, gradient_step=100, len=0, n/ep=0, n/st=100, rew=0.00]                                                                                         


Epoch #1: test_reward: 9608.600000 ± 4871.324116, best_reward: 9608.600000 ± 4871.324116 in #1


Epoch #2: 1001it [00:01, 503.44it/s, env_step=2000, gradient_step=200, len=0, n/ep=0, n/st=100, rew=0.00]                                                                                         


Epoch #2: test_reward: 11249.500000 ± 3115.192683, best_reward: 11249.500000 ± 3115.192683 in #2


Epoch #3: 1001it [00:02, 461.71it/s, env_step=3000, gradient_step=300, len=0, n/ep=0, n/st=100, rew=0.00]                                                                                         


Epoch #3: test_reward: 10090.500000 ± 4114.055694, best_reward: 11249.500000 ± 3115.192683 in #2


Epoch #4: 1001it [00:02, 407.20it/s, env_step=4000, gradient_step=400, len=0, n/ep=0, n/st=100, rew=0.00]                                                                                         


Epoch #4: test_reward: 13729.900000 ± 4299.608923, best_reward: 13729.900000 ± 4299.608923 in #4


Epoch #5: 1001it [00:02, 409.80it/s, env_step=5000, gradient_step=500, len=0, n/ep=0, n/st=100, rew=0.00]                                                                                         


Epoch #5: test_reward: 12472.900000 ± 4764.877133, best_reward: 13729.900000 ± 4299.608923 in #4


Epoch #6: 1001it [00:02, 422.41it/s, env_step=6000, gradient_step=600, len=0, n/ep=0, n/st=100, rew=0.00]                                                                                         


Epoch #6: test_reward: 11581.000000 ± 5677.354190, best_reward: 13729.900000 ± 4299.608923 in #4


Epoch #7: 1001it [00:02, 439.53it/s, env_step=7000, gradient_step=700, len=0, n/ep=0, n/st=100, rew=0.00]                                                                                         


Epoch #7: test_reward: 13812.500000 ± 5274.635386, best_reward: 13812.500000 ± 5274.635386 in #7


Epoch #8: 1001it [00:02, 385.93it/s, env_step=8000, gradient_step=800, len=0, n/ep=0, n/st=100, rew=0.00]                                                                                         


Epoch #8: test_reward: 9596.100000 ± 3282.256402, best_reward: 13812.500000 ± 5274.635386 in #7


Epoch #9: 1001it [00:02, 406.48it/s, env_step=9000, gradient_step=900, len=86, n/ep=0, n/st=100, rew=2584.00]                                                                                     


Epoch #9: test_reward: 14709.000000 ± 6642.583067, best_reward: 14709.000000 ± 6642.583067 in #9


Epoch #10: 1001it [00:02, 383.48it/s, env_step=10000, gradient_step=1000, len=86, n/ep=0, n/st=100, rew=2584.00]                                                                                  


Epoch #10: test_reward: 11091.700000 ± 3481.020800, best_reward: 14709.000000 ± 6642.583067 in #9


Epoch #11: 1001it [00:02, 385.61it/s, env_step=11000, gradient_step=1100, len=110, n/ep=1, n/st=100, rew=3423.50]                                                                                 


Epoch #11: test_reward: 11723.800000 ± 4834.351224, best_reward: 14709.000000 ± 6642.583067 in #9


Epoch #12: 1001it [00:02, 355.11it/s, env_step=12000, gradient_step=1200, len=120, n/ep=1, n/st=100, rew=3624.00]                                                                                 


Epoch #12: test_reward: 12098.900000 ± 3997.092905, best_reward: 14709.000000 ± 6642.583067 in #9


Epoch #13: 1001it [00:02, 430.24it/s, env_step=13000, gradient_step=1300, len=120, n/ep=0, n/st=100, rew=3624.00]                                                                                 


Epoch #13: test_reward: 11937.100000 ± 4325.587531, best_reward: 14709.000000 ± 6642.583067 in #9


Epoch #14: 1001it [00:02, 393.62it/s, env_step=14000, gradient_step=1400, len=138, n/ep=0, n/st=100, rew=5123.00]                                                                                 


Epoch #14: test_reward: 10607.300000 ± 4813.614734, best_reward: 14709.000000 ± 6642.583067 in #9


Epoch #15: 1001it [00:02, 335.18it/s, env_step=15000, gradient_step=1500, len=150, n/ep=1, n/st=100, rew=6191.00]                                                                                 


Epoch #15: test_reward: 15885.600000 ± 5141.518593, best_reward: 15885.600000 ± 5141.518593 in #15


Epoch #16: 1001it [00:02, 365.67it/s, env_step=16000, gradient_step=1600, len=160, n/ep=1, n/st=100, rew=7582.00]                                                                                 


Epoch #16: test_reward: 12078.400000 ± 3339.188470, best_reward: 15885.600000 ± 5141.518593 in #15


Epoch #17: 1001it [00:02, 416.97it/s, env_step=17000, gradient_step=1700, len=169, n/ep=0, n/st=100, rew=9807.00]                                                                                 


Epoch #17: test_reward: 12777.100000 ± 3676.323774, best_reward: 15885.600000 ± 5141.518593 in #15


Epoch #18: 1001it [00:02, 404.82it/s, env_step=18000, gradient_step=1800, len=180, n/ep=1, n/st=100, rew=9165.00]                                                                                 


Epoch #18: test_reward: 18446.000000 ± 5842.672385, best_reward: 18446.000000 ± 5842.672385 in #18


Epoch #19: 1001it [00:02, 361.76it/s, env_step=19000, gradient_step=1900, len=188, n/ep=0, n/st=100, rew=6663.50]                                                                                 


Epoch #19: test_reward: 13749.800000 ± 6045.411397, best_reward: 18446.000000 ± 5842.672385 in #18


Epoch #20: 1001it [00:02, 421.07it/s, env_step=20000, gradient_step=2000, len=200, n/ep=1, n/st=100, rew=8735.00]                                                                                 


Epoch #20: test_reward: 14248.600000 ± 5993.298394, best_reward: 18446.000000 ± 5842.672385 in #18


Epoch #21: 1001it [00:02, 365.26it/s, env_step=21000, gradient_step=2100, len=209, n/ep=0, n/st=100, rew=9988.50]                                                                                 


Epoch #21: test_reward: 11491.400000 ± 4138.843418, best_reward: 18446.000000 ± 5842.672385 in #18


Epoch #22: 1001it [00:02, 386.23it/s, env_step=22000, gradient_step=2200, len=130, n/ep=2, n/st=100, rew=5585.50]                                                                                 


Epoch #22: test_reward: 16249.300000 ± 5393.106638, best_reward: 18446.000000 ± 5842.672385 in #18


Epoch #23: 1001it [00:02, 417.26it/s, env_step=23000, gradient_step=2300, len=52, n/ep=1, n/st=100, rew=1720.00]                                                                                  


Epoch #23: test_reward: 12805.300000 ± 1972.432562, best_reward: 18446.000000 ± 5842.672385 in #18


Epoch #24: 1001it [00:02, 374.07it/s, env_step=24000, gradient_step=2400, len=240, n/ep=1, n/st=100, rew=11628.00]                                                                                


Epoch #24: test_reward: 11530.000000 ± 5221.389470, best_reward: 18446.000000 ± 5842.672385 in #18


Epoch #25: 1001it [00:02, 348.65it/s, env_step=25000, gradient_step=2500, len=250, n/ep=2, n/st=100, rew=11533.00]                                                                                


Epoch #25: test_reward: 10956.500000 ± 2908.277231, best_reward: 18446.000000 ± 5842.672385 in #18


Epoch #26: 1001it [00:02, 368.30it/s, env_step=26000, gradient_step=2600, len=259, n/ep=0, n/st=100, rew=14765.00]                                                                                


Epoch #26: test_reward: 10102.600000 ± 4781.198410, best_reward: 18446.000000 ± 5842.672385 in #18


Epoch #27: 1001it [00:02, 366.02it/s, env_step=27000, gradient_step=2700, len=270, n/ep=1, n/st=100, rew=14060.00]                                                                                


Epoch #27: test_reward: 14844.300000 ± 4985.818991, best_reward: 18446.000000 ± 5842.672385 in #18


Epoch #28: 1001it [00:02, 409.23it/s, env_step=28000, gradient_step=2800, len=280, n/ep=1, n/st=100, rew=13320.00]                                                                                


Epoch #28: test_reward: 13402.700000 ± 4187.502097, best_reward: 18446.000000 ± 5842.672385 in #18


Epoch #29: 1001it [00:02, 417.11it/s, env_step=29000, gradient_step=2900, len=180, n/ep=2, n/st=100, rew=8933.25]                                                                                 


Epoch #29: test_reward: 9245.400000 ± 2502.047170, best_reward: 18446.000000 ± 5842.672385 in #18


Epoch #30: 1001it [00:02, 352.59it/s, env_step=30000, gradient_step=3000, len=250, n/ep=3, n/st=100, rew=14347.00]                                                                                


Epoch #30: test_reward: 12472.300000 ± 4214.942231, best_reward: 18446.000000 ± 5842.672385 in #18


Epoch #31: 1001it [00:02, 352.44it/s, env_step=31000, gradient_step=3100, len=198, n/ep=1, n/st=100, rew=11753.00]                                                                                


Epoch #31: test_reward: 13295.900000 ± 5959.540695, best_reward: 18446.000000 ± 5842.672385 in #18


Epoch #32: 1001it [00:02, 377.11it/s, env_step=32000, gradient_step=3200, len=138, n/ep=1, n/st=100, rew=7559.00]                                                                                 


Epoch #32: test_reward: 10434.800000 ± 2891.356906, best_reward: 18446.000000 ± 5842.672385 in #18


Epoch #33: 1001it [00:02, 371.82it/s, env_step=33000, gradient_step=3300, len=192, n/ep=1, n/st=100, rew=9456.00]                                                                                 


Epoch #33: test_reward: 8850.400000 ± 3445.358652, best_reward: 18446.000000 ± 5842.672385 in #18


Epoch #34: 1001it [00:02, 385.91it/s, env_step=34000, gradient_step=3400, len=121, n/ep=2, n/st=100, rew=5259.50]                                                                                 


Epoch #34: test_reward: 11882.200000 ± 4587.875190, best_reward: 18446.000000 ± 5842.672385 in #18


Epoch #35: 1001it [00:02, 375.17it/s, env_step=35000, gradient_step=3500, len=130, n/ep=0, n/st=100, rew=5517.50]                                                                                 


Epoch #35: test_reward: 16399.800000 ± 5131.927392, best_reward: 18446.000000 ± 5842.672385 in #18


Epoch #36: 1001it [00:02, 340.69it/s, env_step=36000, gradient_step=3600, len=210, n/ep=1, n/st=100, rew=12329.00]                                                                                


Epoch #36: test_reward: 12329.800000 ± 4016.134804, best_reward: 18446.000000 ± 5842.672385 in #18


Epoch #37: 1001it [00:02, 354.03it/s, env_step=37000, gradient_step=3700, len=68, n/ep=0, n/st=100, rew=2906.00]                                                                                  


Epoch #37: test_reward: 17048.900000 ± 4854.288526, best_reward: 18446.000000 ± 5842.672385 in #18


Epoch #38: 1001it [00:02, 391.56it/s, env_step=38000, gradient_step=3800, len=160, n/ep=1, n/st=100, rew=8705.50]                                                                                 


Epoch #38: test_reward: 12842.800000 ± 4639.038193, best_reward: 18446.000000 ± 5842.672385 in #18


Epoch #39: 1001it [00:02, 355.24it/s, env_step=39000, gradient_step=3900, len=129, n/ep=0, n/st=100, rew=5454.50]                                                                                 


Epoch #39: test_reward: 9883.500000 ± 3644.407147, best_reward: 18446.000000 ± 5842.672385 in #18


Epoch #40: 1001it [00:02, 366.75it/s, env_step=40000, gradient_step=4000, len=124, n/ep=0, n/st=100, rew=5435.00]                                                                                 


Epoch #40: test_reward: 16747.500000 ± 4683.402807, best_reward: 18446.000000 ± 5842.672385 in #18


Epoch #41: 1001it [00:03, 315.88it/s, env_step=41000, gradient_step=4100, len=183, n/ep=0, n/st=100, rew=10464.00]                                                                                


Epoch #41: test_reward: 16895.600000 ± 5548.852696, best_reward: 18446.000000 ± 5842.672385 in #18


Epoch #42: 1001it [00:02, 334.48it/s, env_step=42000, gradient_step=4200, len=182, n/ep=0, n/st=100, rew=8649.00]                                                                                 


Epoch #42: test_reward: 14087.700000 ± 6409.067827, best_reward: 18446.000000 ± 5842.672385 in #18


Epoch #43: 1001it [00:02, 374.99it/s, env_step=43000, gradient_step=4300, len=244, n/ep=0, n/st=100, rew=9923.00]                                                                                 


Epoch #43: test_reward: 17858.000000 ± 3341.060341, best_reward: 18446.000000 ± 5842.672385 in #18


Epoch #44: 1001it [00:02, 349.22it/s, env_step=44000, gradient_step=4400, len=240, n/ep=0, n/st=100, rew=13637.50]                                                                                


Epoch #44: test_reward: 15711.800000 ± 4541.481494, best_reward: 18446.000000 ± 5842.672385 in #18


Epoch #45: 1001it [00:02, 389.23it/s, env_step=45000, gradient_step=4500, len=247, n/ep=0, n/st=100, rew=13525.00]                                                                                


Epoch #45: test_reward: 17621.300000 ± 4231.602629, best_reward: 18446.000000 ± 5842.672385 in #18


Epoch #46: 1001it [00:02, 411.12it/s, env_step=46000, gradient_step=4600, len=134, n/ep=0, n/st=100, rew=6344.00]                                                                                 


Epoch #46: test_reward: 16976.000000 ± 6072.795666, best_reward: 18446.000000 ± 5842.672385 in #18


Epoch #47: 1001it [00:02, 344.08it/s, env_step=47000, gradient_step=4700, len=150, n/ep=0, n/st=100, rew=7741.50]                                                                                 


Epoch #47: test_reward: 12785.400000 ± 6063.259259, best_reward: 18446.000000 ± 5842.672385 in #18


Epoch #48: 1001it [00:02, 376.89it/s, env_step=48000, gradient_step=4800, len=237, n/ep=2, n/st=100, rew=11856.00]                                                                                


Epoch #48: test_reward: 14221.100000 ± 3329.041798, best_reward: 18446.000000 ± 5842.672385 in #18


Epoch #49: 1001it [00:03, 327.38it/s, env_step=49000, gradient_step=4900, len=265, n/ep=0, n/st=100, rew=15659.00]                                                                                


Epoch #49: test_reward: 11167.200000 ± 3418.440370, best_reward: 18446.000000 ± 5842.672385 in #18


Epoch #50: 1001it [00:02, 382.03it/s, env_step=50000, gradient_step=5000, len=244, n/ep=0, n/st=100, rew=12522.00]                                                                                


Epoch #50: test_reward: 17272.400000 ± 4706.529001, best_reward: 18446.000000 ± 5842.672385 in #18


Epoch #51: 1001it [00:02, 337.33it/s, env_step=51000, gradient_step=5100, len=221, n/ep=0, n/st=100, rew=11380.00]                                                                                


Epoch #51: test_reward: 16165.500000 ± 5100.197432, best_reward: 18446.000000 ± 5842.672385 in #18


Epoch #52: 1001it [00:02, 386.98it/s, env_step=52000, gradient_step=5200, len=208, n/ep=0, n/st=100, rew=10384.50]                                                                                


Epoch #52: test_reward: 9237.500000 ± 3214.699123, best_reward: 18446.000000 ± 5842.672385 in #18


Epoch #53: 1001it [00:02, 358.07it/s, env_step=53000, gradient_step=5300, len=278, n/ep=0, n/st=100, rew=13756.00]                                                                                


Epoch #53: test_reward: 14095.000000 ± 3579.403973, best_reward: 18446.000000 ± 5842.672385 in #18


Epoch #54: 1001it [00:02, 378.83it/s, env_step=54000, gradient_step=5400, len=240, n/ep=0, n/st=100, rew=12499.50]                                                                                


Epoch #54: test_reward: 12726.700000 ± 5434.926495, best_reward: 18446.000000 ± 5842.672385 in #18


Epoch #55: 1001it [00:02, 393.15it/s, env_step=55000, gradient_step=5500, len=242, n/ep=0, n/st=100, rew=13421.50]                                                                                


Epoch #55: test_reward: 15761.700000 ± 5653.414633, best_reward: 18446.000000 ± 5842.672385 in #18


Epoch #56: 1001it [00:02, 343.28it/s, env_step=56000, gradient_step=5600, len=185, n/ep=0, n/st=100, rew=9924.00]                                                                                 


Epoch #56: test_reward: 13964.100000 ± 5111.393048, best_reward: 18446.000000 ± 5842.672385 in #18


Epoch #57: 1001it [00:02, 373.04it/s, env_step=57000, gradient_step=5700, len=208, n/ep=1, n/st=100, rew=11514.00]                                                                                


Epoch #57: test_reward: 14936.200000 ± 4485.787908, best_reward: 18446.000000 ± 5842.672385 in #18


Epoch #58: 1001it [00:03, 327.77it/s, env_step=58000, gradient_step=5800, len=219, n/ep=1, n/st=100, rew=11477.00]                                                                                


Epoch #58: test_reward: 10196.000000 ± 3794.141589, best_reward: 18446.000000 ± 5842.672385 in #18


Epoch #59: 1001it [00:02, 364.25it/s, env_step=59000, gradient_step=5900, len=250, n/ep=1, n/st=100, rew=12457.00]                                                                                


Epoch #59: test_reward: 16027.500000 ± 7283.025432, best_reward: 18446.000000 ± 5842.672385 in #18


Epoch #60: 1001it [00:02, 340.89it/s, env_step=60000, gradient_step=6000, len=201, n/ep=0, n/st=100, rew=10010.50]                                                                                


Epoch #60: test_reward: 14047.700000 ± 7201.002598, best_reward: 18446.000000 ± 5842.672385 in #18


Epoch #61: 1001it [00:02, 336.85it/s, env_step=61000, gradient_step=6100, len=241, n/ep=0, n/st=100, rew=11674.00]                                                                                


Epoch #61: test_reward: 8995.200000 ± 6434.107052, best_reward: 18446.000000 ± 5842.672385 in #18


Epoch #62: 1001it [00:02, 433.45it/s, env_step=62000, gradient_step=6200, len=169, n/ep=1, n/st=100, rew=9535.00]                                                                                 


Epoch #62: test_reward: 9172.300000 ± 6576.176261, best_reward: 18446.000000 ± 5842.672385 in #18


Epoch #63: 1001it [00:02, 369.00it/s, env_step=63000, gradient_step=6300, len=154, n/ep=1, n/st=100, rew=7942.00]                                                                                 


Epoch #63: test_reward: 12713.600000 ± 6249.558500, best_reward: 18446.000000 ± 5842.672385 in #18


Epoch #64: 1001it [00:02, 358.72it/s, env_step=64000, gradient_step=6400, len=171, n/ep=1, n/st=100, rew=8681.50]                                                                                 


Epoch #64: test_reward: 12279.000000 ± 5203.061541, best_reward: 18446.000000 ± 5842.672385 in #18


Epoch #65: 1001it [00:02, 371.51it/s, env_step=65000, gradient_step=6500, len=324, n/ep=1, n/st=100, rew=19927.00]                                                                                


Epoch #65: test_reward: 12700.000000 ± 4520.695367, best_reward: 18446.000000 ± 5842.672385 in #18


Epoch #66: 1001it [00:02, 362.54it/s, env_step=66000, gradient_step=6600, len=199, n/ep=0, n/st=100, rew=10540.00]                                                                                


Epoch #66: test_reward: 9949.800000 ± 2991.265511, best_reward: 18446.000000 ± 5842.672385 in #18


Epoch #67: 1001it [00:02, 380.37it/s, env_step=67000, gradient_step=6700, len=299, n/ep=0, n/st=100, rew=17509.00]                                                                                


Epoch #67: test_reward: 8659.600000 ± 3154.195435, best_reward: 18446.000000 ± 5842.672385 in #18


Epoch #68: 1001it [00:02, 359.73it/s, env_step=68000, gradient_step=6800, len=260, n/ep=0, n/st=100, rew=15400.25]                                                                                


Epoch #68: test_reward: 11336.100000 ± 4939.160788, best_reward: 18446.000000 ± 5842.672385 in #18


Epoch #69: 1001it [00:02, 363.61it/s, env_step=69000, gradient_step=6900, len=400, n/ep=1, n/st=100, rew=26285.00]                                                                                


Epoch #69: test_reward: 10251.400000 ± 6456.539324, best_reward: 18446.000000 ± 5842.672385 in #18


Epoch #70: 1001it [00:02, 340.56it/s, env_step=70000, gradient_step=7000, len=393, n/ep=0, n/st=100, rew=25118.00]                                                                                


Epoch #70: test_reward: 13859.200000 ± 6651.900312, best_reward: 18446.000000 ± 5842.672385 in #18


Epoch #71: 1001it [00:02, 394.12it/s, env_step=71000, gradient_step=7100, len=108, n/ep=0, n/st=100, rew=4231.00]                                                                                 


Epoch #71: test_reward: 9954.900000 ± 3820.290786, best_reward: 18446.000000 ± 5842.672385 in #18


Epoch #72: 1001it [00:02, 413.03it/s, env_step=72000, gradient_step=7200, len=207, n/ep=0, n/st=100, rew=11070.00]                                                                                


Epoch #72: test_reward: 7354.400000 ± 2219.389430, best_reward: 18446.000000 ± 5842.672385 in #18


Epoch #73: 1001it [00:02, 359.07it/s, env_step=73000, gradient_step=7300, len=123, n/ep=0, n/st=100, rew=6186.00]                                                                                 


Epoch #73: test_reward: 11825.200000 ± 3677.665015, best_reward: 18446.000000 ± 5842.672385 in #18


Epoch #74: 1001it [00:02, 421.71it/s, env_step=74000, gradient_step=7400, len=160, n/ep=1, n/st=100, rew=8935.00]                                                                                 


Epoch #74: test_reward: 4949.100000 ± 3433.143238, best_reward: 18446.000000 ± 5842.672385 in #18


Epoch #75: 1001it [00:02, 406.00it/s, env_step=75000, gradient_step=7500, len=298, n/ep=0, n/st=100, rew=16869.75]                                                                                


Epoch #75: test_reward: 8343.700000 ± 3307.029394, best_reward: 18446.000000 ± 5842.672385 in #18


Epoch #76: 1001it [00:02, 371.81it/s, env_step=76000, gradient_step=7600, len=151, n/ep=0, n/st=100, rew=7808.50]                                                                                 


Epoch #76: test_reward: 10191.000000 ± 6552.441270, best_reward: 18446.000000 ± 5842.672385 in #18


Epoch #77: 1001it [00:02, 389.29it/s, env_step=77000, gradient_step=7700, len=283, n/ep=1, n/st=100, rew=16082.00]                                                                                


Epoch #77: test_reward: 13365.500000 ± 3484.338538, best_reward: 18446.000000 ± 5842.672385 in #18


Epoch #78: 1001it [00:02, 396.18it/s, env_step=78000, gradient_step=7800, len=240, n/ep=0, n/st=100, rew=14578.00]                                                                                


Epoch #78: test_reward: 14249.800000 ± 5231.256537, best_reward: 18446.000000 ± 5842.672385 in #18


Epoch #79: 1001it [00:02, 395.43it/s, env_step=79000, gradient_step=7900, len=155, n/ep=2, n/st=100, rew=7512.50]                                                                                 


Epoch #79: test_reward: 10131.900000 ± 6636.233306, best_reward: 18446.000000 ± 5842.672385 in #18


Epoch #80: 1001it [00:02, 351.39it/s, env_step=80000, gradient_step=8000, len=230, n/ep=1, n/st=100, rew=11313.50]                                                                                


Epoch #80: test_reward: 10582.500000 ± 3452.581969, best_reward: 18446.000000 ± 5842.672385 in #18


Epoch #81: 1001it [00:02, 399.05it/s, env_step=81000, gradient_step=8100, len=198, n/ep=0, n/st=100, rew=9046.00]                                                                                 


Epoch #81: test_reward: 10074.500000 ± 3575.820304, best_reward: 18446.000000 ± 5842.672385 in #18


Epoch #82: 1001it [00:02, 397.57it/s, env_step=82000, gradient_step=8200, len=145, n/ep=0, n/st=100, rew=7189.00]                                                                                 


Epoch #82: test_reward: 11878.300000 ± 5179.697328, best_reward: 18446.000000 ± 5842.672385 in #18


Epoch #83: 1001it [00:02, 399.47it/s, env_step=83000, gradient_step=8300, len=117, n/ep=0, n/st=100, rew=4596.00]                                                                                 


Epoch #83: test_reward: 7110.000000 ± 2357.359667, best_reward: 18446.000000 ± 5842.672385 in #18


Epoch #84: 1001it [00:02, 370.21it/s, env_step=84000, gradient_step=8400, len=184, n/ep=0, n/st=100, rew=10584.00]                                                                                


Epoch #84: test_reward: 9411.200000 ± 4661.085063, best_reward: 18446.000000 ± 5842.672385 in #18


Epoch #85: 1001it [00:02, 361.94it/s, env_step=85000, gradient_step=8500, len=196, n/ep=0, n/st=100, rew=11136.00]                                                                                


Epoch #85: test_reward: 11032.800000 ± 5797.933180, best_reward: 18446.000000 ± 5842.672385 in #18


Epoch #86: 1001it [00:02, 383.23it/s, env_step=86000, gradient_step=8600, len=376, n/ep=0, n/st=100, rew=23880.50]                                                                                


Epoch #86: test_reward: 15375.400000 ± 7225.122562, best_reward: 18446.000000 ± 5842.672385 in #18


Epoch #87: 1001it [00:02, 375.74it/s, env_step=87000, gradient_step=8700, len=155, n/ep=1, n/st=100, rew=8637.00]                                                                                 


Epoch #87: test_reward: 12338.500000 ± 5408.981461, best_reward: 18446.000000 ± 5842.672385 in #18


Epoch #88: 1001it [00:02, 364.87it/s, env_step=88000, gradient_step=8800, len=71, n/ep=0, n/st=100, rew=3178.00]                                                                                  


Epoch #88: test_reward: 16477.300000 ± 5843.767775, best_reward: 18446.000000 ± 5842.672385 in #18


Epoch #89: 1001it [00:03, 327.57it/s, env_step=89000, gradient_step=8900, len=251, n/ep=1, n/st=100, rew=14298.00]                                                                                


Epoch #89: test_reward: 15355.100000 ± 5219.926023, best_reward: 18446.000000 ± 5842.672385 in #18


Epoch #90: 1001it [00:02, 376.48it/s, env_step=90000, gradient_step=9000, len=332, n/ep=1, n/st=100, rew=20019.50]                                                                                


Epoch #90: test_reward: 9278.800000 ± 5295.356396, best_reward: 18446.000000 ± 5842.672385 in #18


Epoch #91: 1001it [00:02, 399.13it/s, env_step=91000, gradient_step=9100, len=181, n/ep=0, n/st=100, rew=8686.00]                                                                                 


Epoch #91: test_reward: 15874.600000 ± 4279.072544, best_reward: 18446.000000 ± 5842.672385 in #18


Epoch #92: 1001it [00:02, 369.68it/s, env_step=92000, gradient_step=9200, len=133, n/ep=1, n/st=100, rew=7124.50]                                                                                 


Epoch #92: test_reward: 10119.800000 ± 3550.968848, best_reward: 18446.000000 ± 5842.672385 in #18


Epoch #93: 1001it [00:02, 420.47it/s, env_step=93000, gradient_step=9300, len=277, n/ep=0, n/st=100, rew=15493.50]                                                                                


Epoch #93: test_reward: 7512.400000 ± 2534.813926, best_reward: 18446.000000 ± 5842.672385 in #18


Epoch #94: 1001it [00:03, 321.73it/s, env_step=94000, gradient_step=9400, len=80, n/ep=0, n/st=100, rew=2923.00]                                                                                  


Epoch #94: test_reward: 10332.300000 ± 4333.165818, best_reward: 18446.000000 ± 5842.672385 in #18


Epoch #95: 1001it [00:02, 384.94it/s, env_step=95000, gradient_step=9500, len=45, n/ep=0, n/st=100, rew=1832.00]                                                                                  


Epoch #95: test_reward: 14507.100000 ± 5831.972367, best_reward: 18446.000000 ± 5842.672385 in #18


Epoch #96: 1001it [00:03, 318.35it/s, env_step=96000, gradient_step=9600, len=116, n/ep=0, n/st=100, rew=6066.50]                                                                                 


Epoch #96: test_reward: 13109.100000 ± 3907.360246, best_reward: 18446.000000 ± 5842.672385 in #18


Epoch #97: 1001it [00:02, 398.95it/s, env_step=97000, gradient_step=9700, len=138, n/ep=1, n/st=100, rew=5081.50]                                                                                 


Epoch #97: test_reward: 11052.600000 ± 4209.737669, best_reward: 18446.000000 ± 5842.672385 in #18


Epoch #98: 1001it [00:02, 420.34it/s, env_step=98000, gradient_step=9800, len=24, n/ep=1, n/st=100, rew=707.00]                                                                                   


Epoch #98: test_reward: 12269.600000 ± 3659.698326, best_reward: 18446.000000 ± 5842.672385 in #18


Epoch #99: 1001it [00:02, 368.58it/s, env_step=99000, gradient_step=9900, len=129, n/ep=1, n/st=100, rew=6669.00]                                                                                 


Epoch #99: test_reward: 9137.100000 ± 5778.477662, best_reward: 18446.000000 ± 5842.672385 in #18


Epoch #100: 1001it [00:03, 315.74it/s, env_step=100000, gradient_step=10000, len=130, n/ep=0, n/st=100, rew=6628.50]                                                                              


Epoch #100: test_reward: 11228.600000 ± 5065.970789, best_reward: 18446.000000 ± 5842.672385 in #18


Epoch #101: 1001it [00:02, 422.83it/s, env_step=101000, gradient_step=10100, len=282, n/ep=1, n/st=100, rew=15264.00]                                                                             


Epoch #101: test_reward: 13515.600000 ± 4732.618201, best_reward: 18446.000000 ± 5842.672385 in #18


Epoch #102: 1001it [00:02, 335.26it/s, env_step=102000, gradient_step=10200, len=120, n/ep=0, n/st=100, rew=6190.00]                                                                              


Epoch #102: test_reward: 11144.100000 ± 5156.787866, best_reward: 18446.000000 ± 5842.672385 in #18


Epoch #103: 1001it [00:02, 335.53it/s, env_step=103000, gradient_step=10300, len=174, n/ep=0, n/st=100, rew=9287.00]                                                                              


Epoch #103: test_reward: 13860.800000 ± 5589.281811, best_reward: 18446.000000 ± 5842.672385 in #18


Epoch #104: 1001it [00:02, 373.31it/s, env_step=104000, gradient_step=10400, len=216, n/ep=1, n/st=100, rew=13432.00]                                                                             


Epoch #104: test_reward: 11356.400000 ± 4237.071446, best_reward: 18446.000000 ± 5842.672385 in #18


Epoch #105: 1001it [00:02, 355.10it/s, env_step=105000, gradient_step=10500, len=138, n/ep=0, n/st=100, rew=7863.50]                                                                              


Epoch #105: test_reward: 13660.600000 ± 3576.691549, best_reward: 18446.000000 ± 5842.672385 in #18


Epoch #106: 1001it [00:02, 353.74it/s, env_step=106000, gradient_step=10600, len=317, n/ep=0, n/st=100, rew=17406.00]                                                                             


Epoch #106: test_reward: 9632.200000 ± 4255.476796, best_reward: 18446.000000 ± 5842.672385 in #18


Epoch #107: 1001it [00:03, 295.97it/s, env_step=107000, gradient_step=10700, len=220, n/ep=0, n/st=100, rew=13503.00]                                                                             


Epoch #107: test_reward: 6792.300000 ± 4707.697867, best_reward: 18446.000000 ± 5842.672385 in #18


Epoch #108: 1001it [00:03, 272.65it/s, env_step=108000, gradient_step=10800, len=201, n/ep=0, n/st=100, rew=11836.00]                                                                             


Epoch #108: test_reward: 14231.200000 ± 5081.083444, best_reward: 18446.000000 ± 5842.672385 in #18


Epoch #109: 1001it [00:03, 313.73it/s, env_step=109000, gradient_step=10900, len=188, n/ep=2, n/st=100, rew=11373.00]                                                                             


Epoch #109: test_reward: 11913.200000 ± 4258.830210, best_reward: 18446.000000 ± 5842.672385 in #18


Epoch #110: 1001it [00:02, 352.87it/s, env_step=110000, gradient_step=11000, len=224, n/ep=3, n/st=100, rew=13692.50]                                                                             


Epoch #110: test_reward: 12511.500000 ± 4792.413468, best_reward: 18446.000000 ± 5842.672385 in #18


Epoch #111: 1001it [00:02, 370.11it/s, env_step=111000, gradient_step=11100, len=108, n/ep=4, n/st=100, rew=5483.25]                                                                              


Epoch #111: test_reward: 9279.200000 ± 4004.741510, best_reward: 18446.000000 ± 5842.672385 in #18


Epoch #112: 1001it [00:03, 308.73it/s, env_step=112000, gradient_step=11200, len=72, n/ep=1, n/st=100, rew=2575.00]                                                                               


Epoch #112: test_reward: 11416.700000 ± 3924.522545, best_reward: 18446.000000 ± 5842.672385 in #18


Epoch #113: 1001it [00:03, 293.91it/s, env_step=113000, gradient_step=11300, len=194, n/ep=0, n/st=100, rew=11881.50]                                                                             


Epoch #113: test_reward: 9286.800000 ± 4303.184607, best_reward: 18446.000000 ± 5842.672385 in #18


Epoch #114: 1001it [00:03, 312.70it/s, env_step=114000, gradient_step=11400, len=111, n/ep=0, n/st=100, rew=5462.00]                                                                              


Epoch #114: test_reward: 11993.200000 ± 3247.937586, best_reward: 18446.000000 ± 5842.672385 in #18


Epoch #115: 1001it [00:03, 298.12it/s, env_step=115000, gradient_step=11500, len=189, n/ep=2, n/st=100, rew=10847.75]                                                                             


Epoch #115: test_reward: 10571.400000 ± 2849.291673, best_reward: 18446.000000 ± 5842.672385 in #18


Epoch #116: 1001it [00:02, 357.71it/s, env_step=116000, gradient_step=11600, len=99, n/ep=0, n/st=100, rew=5322.50]                                                                               


Epoch #116: test_reward: 8672.500000 ± 3362.753671, best_reward: 18446.000000 ± 5842.672385 in #18


Epoch #117: 1001it [00:03, 317.19it/s, env_step=117000, gradient_step=11700, len=224, n/ep=1, n/st=100, rew=13243.50]                                                                             


Epoch #117: test_reward: 10507.600000 ± 5724.288239, best_reward: 18446.000000 ± 5842.672385 in #18


Epoch #118: 1001it [00:02, 344.98it/s, env_step=118000, gradient_step=11800, len=208, n/ep=1, n/st=100, rew=11970.00]                                                                             


Epoch #118: test_reward: 10642.200000 ± 3376.774491, best_reward: 18446.000000 ± 5842.672385 in #18


Epoch #119: 1001it [00:02, 363.93it/s, env_step=119000, gradient_step=11900, len=57, n/ep=0, n/st=100, rew=2177.00]                                                                               


Epoch #119: test_reward: 11698.300000 ± 5061.197053, best_reward: 18446.000000 ± 5842.672385 in #18


Epoch #120: 1001it [00:02, 345.61it/s, env_step=120000, gradient_step=12000, len=257, n/ep=1, n/st=100, rew=15847.00]                                                                             


Epoch #120: test_reward: 9515.900000 ± 4449.333983, best_reward: 18446.000000 ± 5842.672385 in #18


Epoch #121: 1001it [00:02, 357.34it/s, env_step=121000, gradient_step=12100, len=150, n/ep=0, n/st=100, rew=7729.50]                                                                              


Epoch #121: test_reward: 15127.700000 ± 5760.261887, best_reward: 18446.000000 ± 5842.672385 in #18


Epoch #122: 1001it [00:02, 355.77it/s, env_step=122000, gradient_step=12200, len=251, n/ep=0, n/st=100, rew=14629.00]                                                                             


Epoch #122: test_reward: 14120.100000 ± 4930.550688, best_reward: 18446.000000 ± 5842.672385 in #18


Epoch #123: 1001it [00:02, 342.54it/s, env_step=123000, gradient_step=12300, len=94, n/ep=1, n/st=100, rew=3699.00]                                                                               


Epoch #123: test_reward: 9816.200000 ± 5154.110802, best_reward: 18446.000000 ± 5842.672385 in #18


Epoch #124: 1001it [00:03, 311.54it/s, env_step=124000, gradient_step=12400, len=255, n/ep=1, n/st=100, rew=13123.00]                                                                             


Epoch #124: test_reward: 15163.000000 ± 4212.611731, best_reward: 18446.000000 ± 5842.672385 in #18


Epoch #125: 1001it [00:03, 327.92it/s, env_step=125000, gradient_step=12500, len=196, n/ep=0, n/st=100, rew=12836.00]                                                                             


Epoch #125: test_reward: 9771.800000 ± 3663.175639, best_reward: 18446.000000 ± 5842.672385 in #18


Epoch #126: 1001it [00:02, 340.46it/s, env_step=126000, gradient_step=12600, len=138, n/ep=1, n/st=100, rew=8038.00]                                                                              


Epoch #126: test_reward: 12093.300000 ± 5269.269267, best_reward: 18446.000000 ± 5842.672385 in #18


Epoch #127: 1001it [00:02, 354.36it/s, env_step=127000, gradient_step=12700, len=151, n/ep=1, n/st=100, rew=7603.00]                                                                              


Epoch #127: test_reward: 12407.800000 ± 5187.086539, best_reward: 18446.000000 ± 5842.672385 in #18


Epoch #128: 1001it [00:02, 349.86it/s, env_step=128000, gradient_step=12800, len=241, n/ep=1, n/st=100, rew=15624.00]                                                                             


Epoch #128: test_reward: 11081.900000 ± 4268.156521, best_reward: 18446.000000 ± 5842.672385 in #18


Epoch #129: 1001it [00:03, 332.31it/s, env_step=129000, gradient_step=12900, len=148, n/ep=0, n/st=100, rew=8387.00]                                                                              


Epoch #129: test_reward: 10499.000000 ± 4557.182134, best_reward: 18446.000000 ± 5842.672385 in #18


Epoch #130: 1001it [00:03, 305.53it/s, env_step=130000, gradient_step=13000, len=145, n/ep=0, n/st=100, rew=7675.50]                                                                              


Epoch #130: test_reward: 10409.800000 ± 4040.258229, best_reward: 18446.000000 ± 5842.672385 in #18


Epoch #131: 1001it [00:02, 393.05it/s, env_step=131000, gradient_step=13100, len=142, n/ep=2, n/st=100, rew=7695.75]                                                                              


Epoch #131: test_reward: 10586.100000 ± 6115.509144, best_reward: 18446.000000 ± 5842.672385 in #18


Epoch #132: 1001it [00:02, 398.00it/s, env_step=132000, gradient_step=13200, len=172, n/ep=0, n/st=100, rew=10211.50]                                                                             


Epoch #132: test_reward: 14375.300000 ± 5671.526744, best_reward: 18446.000000 ± 5842.672385 in #18


Epoch #133: 1001it [00:02, 410.00it/s, env_step=133000, gradient_step=13300, len=143, n/ep=0, n/st=100, rew=8706.50]                                                                              


Epoch #133: test_reward: 9136.400000 ± 3020.335054, best_reward: 18446.000000 ± 5842.672385 in #18


Epoch #134: 1001it [00:02, 352.54it/s, env_step=134000, gradient_step=13400, len=93, n/ep=1, n/st=100, rew=3901.00]                                                                               


Epoch #134: test_reward: 11443.600000 ± 4873.313600, best_reward: 18446.000000 ± 5842.672385 in #18


Epoch #135: 1001it [00:02, 337.18it/s, env_step=135000, gradient_step=13500, len=266, n/ep=2, n/st=100, rew=16260.25]                                                                             


Epoch #135: test_reward: 16800.600000 ± 8508.755082, best_reward: 18446.000000 ± 5842.672385 in #18


Epoch #136: 1001it [00:02, 378.15it/s, env_step=136000, gradient_step=13600, len=151, n/ep=0, n/st=100, rew=7979.50]                                                                              


Epoch #136: test_reward: 9203.400000 ± 2801.640384, best_reward: 18446.000000 ± 5842.672385 in #18


Epoch #137: 1001it [00:02, 365.85it/s, env_step=137000, gradient_step=13700, len=236, n/ep=2, n/st=100, rew=13420.25]                                                                             


Epoch #137: test_reward: 8360.400000 ± 3270.718154, best_reward: 18446.000000 ± 5842.672385 in #18


Epoch #138: 1001it [00:02, 400.58it/s, env_step=138000, gradient_step=13800, len=173, n/ep=0, n/st=100, rew=10400.50]                                                                             


Epoch #138: test_reward: 10589.400000 ± 4359.883673, best_reward: 18446.000000 ± 5842.672385 in #18


Epoch #139: 1001it [00:02, 366.20it/s, env_step=139000, gradient_step=13900, len=144, n/ep=1, n/st=100, rew=7418.00]                                                                              


Epoch #139: test_reward: 8711.600000 ± 3048.850282, best_reward: 18446.000000 ± 5842.672385 in #18


Epoch #140: 1001it [00:02, 374.12it/s, env_step=140000, gradient_step=14000, len=151, n/ep=0, n/st=100, rew=8804.67]                                                                              


Epoch #140: test_reward: 12997.500000 ± 5468.836901, best_reward: 18446.000000 ± 5842.672385 in #18


Epoch #141: 1001it [00:03, 316.53it/s, env_step=141000, gradient_step=14100, len=210, n/ep=1, n/st=100, rew=12201.00]                                                                             


Epoch #141: test_reward: 12321.700000 ± 4118.285178, best_reward: 18446.000000 ± 5842.672385 in #18


Epoch #142: 1001it [00:02, 350.24it/s, env_step=142000, gradient_step=14200, len=171, n/ep=1, n/st=100, rew=10593.50]                                                                             


Epoch #142: test_reward: 12793.200000 ± 6676.566555, best_reward: 18446.000000 ± 5842.672385 in #18


Epoch #143: 1001it [00:03, 323.10it/s, env_step=143000, gradient_step=14300, len=150, n/ep=0, n/st=100, rew=8219.00]                                                                              


Epoch #143: test_reward: 11560.600000 ± 3560.234043, best_reward: 18446.000000 ± 5842.672385 in #18


Epoch #144: 1001it [00:02, 343.22it/s, env_step=144000, gradient_step=14400, len=146, n/ep=0, n/st=100, rew=7776.00]                                                                              


Epoch #144: test_reward: 9233.600000 ± 3705.410725, best_reward: 18446.000000 ± 5842.672385 in #18


Epoch #145: 1001it [00:02, 374.46it/s, env_step=145000, gradient_step=14500, len=148, n/ep=2, n/st=100, rew=8424.00]                                                                              


Epoch #145: test_reward: 14267.600000 ± 3014.208327, best_reward: 18446.000000 ± 5842.672385 in #18


Epoch #146: 1001it [00:02, 339.11it/s, env_step=146000, gradient_step=14600, len=107, n/ep=0, n/st=100, rew=5998.00]                                                                              


Epoch #146: test_reward: 9573.800000 ± 4297.819582, best_reward: 18446.000000 ± 5842.672385 in #18


Epoch #147: 1001it [00:03, 329.58it/s, env_step=147000, gradient_step=14700, len=264, n/ep=0, n/st=100, rew=17210.00]                                                                             


Epoch #147: test_reward: 11236.600000 ± 5600.301781, best_reward: 18446.000000 ± 5842.672385 in #18


Epoch #148: 1001it [00:03, 309.55it/s, env_step=148000, gradient_step=14800, len=177, n/ep=0, n/st=100, rew=10357.75]                                                                             


Epoch #148: test_reward: 9324.700000 ± 3283.245408, best_reward: 18446.000000 ± 5842.672385 in #18


Epoch #149: 1001it [00:03, 312.51it/s, env_step=149000, gradient_step=14900, len=254, n/ep=1, n/st=100, rew=16719.50]                                                                             


Epoch #149: test_reward: 11443.100000 ± 5427.516346, best_reward: 18446.000000 ± 5842.672385 in #18


Epoch #150: 1001it [00:03, 324.14it/s, env_step=150000, gradient_step=15000, len=209, n/ep=2, n/st=100, rew=11578.50]                                                                             


Epoch #150: test_reward: 10412.500000 ± 4873.129595, best_reward: 18446.000000 ± 5842.672385 in #18

InfoStats(gradient_step=15000, best_reward=18446.0, best_reward_std=5842.6723851333645, train_step=150000, train_episode=764, test_step=352302, test_episode=1510, timing=TimingStats(total_time=624.8493559360504, train_time=414.5568380355835, train_time_collect=52.39285898208618, train_time_update=355.622234582901, test_time=210.29251790046692, update_speed=361.8321692889908))

(the trained policy can be accessed via policy.policies[agents[1]])
Train with agent_opponent #4


Epoch #1: 1001it [00:02, 424.79it/s, env_step=1000, gradient_step=100, len=0, n/ep=0, n/st=100, rew=0.00]                                                                                         


Epoch #1: test_reward: 4651.100000 ± 2025.837429, best_reward: 6748.900000 ± 2658.795458 in #0


Epoch #2: 1001it [00:02, 442.65it/s, env_step=2000, gradient_step=200, len=0, n/ep=0, n/st=100, rew=0.00]                                                                                         


Epoch #2: test_reward: 8870.600000 ± 3696.125653, best_reward: 8870.600000 ± 3696.125653 in #2


Epoch #3: 1001it [00:02, 470.95it/s, env_step=3000, gradient_step=300, len=0, n/ep=0, n/st=100, rew=0.00]                                                                                         


Epoch #3: test_reward: 11000.600000 ± 4535.009133, best_reward: 11000.600000 ± 4535.009133 in #3


Epoch #4: 1001it [00:02, 431.20it/s, env_step=4000, gradient_step=400, len=39, n/ep=0, n/st=100, rew=879.00]                                                                                      


Epoch #4: test_reward: 10839.200000 ± 4908.517022, best_reward: 11000.600000 ± 4535.009133 in #3


Epoch #5: 1001it [00:02, 478.31it/s, env_step=5000, gradient_step=500, len=49, n/ep=0, n/st=100, rew=1505.00]                                                                                     


Epoch #5: test_reward: 10008.200000 ± 8458.740684, best_reward: 11000.600000 ± 4535.009133 in #3


Epoch #6: 1001it [00:02, 443.35it/s, env_step=6000, gradient_step=600, len=55, n/ep=0, n/st=100, rew=1935.00]                                                                                     


Epoch #6: test_reward: 11772.600000 ± 6969.354291, best_reward: 11772.600000 ± 6969.354291 in #6


Epoch #7: 1001it [00:02, 401.30it/s, env_step=7000, gradient_step=700, len=55, n/ep=0, n/st=100, rew=1935.00]                                                                                     


Epoch #7: test_reward: 11398.500000 ± 5420.343702, best_reward: 11772.600000 ± 6969.354291 in #6


Epoch #8: 1001it [00:03, 327.94it/s, env_step=8000, gradient_step=800, len=78, n/ep=0, n/st=100, rew=2670.00]                                                                                     


Epoch #8: test_reward: 10116.900000 ± 5160.847148, best_reward: 11772.600000 ± 6969.354291 in #6


Epoch #9: 1001it [00:02, 382.87it/s, env_step=9000, gradient_step=900, len=89, n/ep=0, n/st=100, rew=3514.00]                                                                                     


Epoch #9: test_reward: 6923.000000 ± 5353.390869, best_reward: 11772.600000 ± 6969.354291 in #6


Epoch #10: 1001it [00:02, 364.99it/s, env_step=10000, gradient_step=1000, len=93, n/ep=0, n/st=100, rew=3132.00]                                                                                  


Epoch #10: test_reward: 9650.100000 ± 4382.710770, best_reward: 11772.600000 ± 6969.354291 in #6


Epoch #11: 1001it [00:02, 378.24it/s, env_step=11000, gradient_step=1100, len=108, n/ep=0, n/st=100, rew=4301.00]                                                                                 


Epoch #11: test_reward: 8899.800000 ± 5380.922259, best_reward: 11772.600000 ± 6969.354291 in #6


Epoch #12: 1001it [00:03, 332.32it/s, env_step=12000, gradient_step=1200, len=119, n/ep=0, n/st=100, rew=5760.00]                                                                                 


Epoch #12: test_reward: 12324.600000 ± 4739.300037, best_reward: 12324.600000 ± 4739.300037 in #12


Epoch #13: 1001it [00:02, 409.91it/s, env_step=13000, gradient_step=1300, len=129, n/ep=0, n/st=100, rew=5349.00]                                                                                 


Epoch #13: test_reward: 9708.700000 ± 4539.774401, best_reward: 12324.600000 ± 4739.300037 in #12


Epoch #14: 1001it [00:02, 378.77it/s, env_step=14000, gradient_step=1400, len=102, n/ep=0, n/st=100, rew=4386.25]                                                                                 


Epoch #14: test_reward: 11604.800000 ± 3880.366524, best_reward: 12324.600000 ± 4739.300037 in #12


Epoch #15: 1001it [00:02, 378.03it/s, env_step=15000, gradient_step=1500, len=150, n/ep=2, n/st=100, rew=7049.00]                                                                                 


Epoch #15: test_reward: 11934.400000 ± 3108.191860, best_reward: 12324.600000 ± 4739.300037 in #12


Epoch #16: 1001it [00:02, 391.87it/s, env_step=16000, gradient_step=1600, len=158, n/ep=0, n/st=100, rew=6276.50]                                                                                 


Epoch #16: test_reward: 8684.300000 ± 3934.619246, best_reward: 12324.600000 ± 4739.300037 in #12


Epoch #17: 1001it [00:02, 340.84it/s, env_step=17000, gradient_step=1700, len=169, n/ep=0, n/st=100, rew=8104.50]                                                                                 


Epoch #17: test_reward: 10663.300000 ± 4257.387087, best_reward: 12324.600000 ± 4739.300037 in #12


Epoch #18: 1001it [00:02, 356.40it/s, env_step=18000, gradient_step=1800, len=180, n/ep=1, n/st=100, rew=8899.00]                                                                                 


Epoch #18: test_reward: 8626.500000 ± 4216.143813, best_reward: 12324.600000 ± 4739.300037 in #12


Epoch #19: 1001it [00:02, 391.43it/s, env_step=19000, gradient_step=1900, len=189, n/ep=0, n/st=100, rew=9341.00]                                                                                 


Epoch #19: test_reward: 10725.300000 ± 3873.470383, best_reward: 12324.600000 ± 4739.300037 in #12


Epoch #20: 1001it [00:02, 396.76it/s, env_step=20000, gradient_step=2000, len=199, n/ep=0, n/st=100, rew=9732.00]                                                                                 


Epoch #20: test_reward: 8306.600000 ± 4800.314244, best_reward: 12324.600000 ± 4739.300037 in #12


Epoch #21: 1001it [00:03, 317.92it/s, env_step=21000, gradient_step=2100, len=207, n/ep=0, n/st=100, rew=10215.75]                                                                                


Epoch #21: test_reward: 11878.300000 ± 3177.453070, best_reward: 12324.600000 ± 4739.300037 in #12


Epoch #22: 1001it [00:02, 352.56it/s, env_step=22000, gradient_step=2200, len=220, n/ep=2, n/st=100, rew=11396.25]                                                                                


Epoch #22: test_reward: 9045.200000 ± 2942.600374, best_reward: 12324.600000 ± 4739.300037 in #12


Epoch #23: 1001it [00:03, 327.45it/s, env_step=23000, gradient_step=2300, len=169, n/ep=2, n/st=100, rew=8165.25]                                                                                 


Epoch #23: test_reward: 8291.400000 ± 3835.808655, best_reward: 12324.600000 ± 4739.300037 in #12


Epoch #24: 1001it [00:03, 332.72it/s, env_step=24000, gradient_step=2400, len=79, n/ep=0, n/st=100, rew=3531.00]                                                                                  


Epoch #24: test_reward: 6676.600000 ± 5715.534624, best_reward: 12324.600000 ± 4739.300037 in #12


Epoch #25: 1001it [00:02, 364.27it/s, env_step=25000, gradient_step=2500, len=172, n/ep=1, n/st=100, rew=8676.00]                                                                                 


Epoch #25: test_reward: 9076.800000 ± 4586.795653, best_reward: 12324.600000 ± 4739.300037 in #12


Epoch #26: 1001it [00:02, 371.63it/s, env_step=26000, gradient_step=2600, len=87, n/ep=0, n/st=100, rew=4606.00]                                                                                  


Epoch #26: test_reward: 7936.800000 ± 2507.057909, best_reward: 12324.600000 ± 4739.300037 in #12


Epoch #27: 1001it [00:02, 345.05it/s, env_step=27000, gradient_step=2700, len=153, n/ep=2, n/st=100, rew=7647.25]                                                                                 


Epoch #27: test_reward: 7244.600000 ± 4304.351570, best_reward: 12324.600000 ± 4739.300037 in #12


Epoch #28: 1001it [00:02, 387.04it/s, env_step=28000, gradient_step=2800, len=105, n/ep=1, n/st=100, rew=4704.50]                                                                                 


Epoch #28: test_reward: 7896.400000 ± 2654.232213, best_reward: 12324.600000 ± 4739.300037 in #12


Epoch #29: 1001it [00:02, 388.16it/s, env_step=29000, gradient_step=2900, len=212, n/ep=0, n/st=100, rew=11539.75]                                                                                


Epoch #29: test_reward: 10122.700000 ± 5029.755323, best_reward: 12324.600000 ± 4739.300037 in #12


Epoch #30: 1001it [00:03, 312.40it/s, env_step=30000, gradient_step=3000, len=25, n/ep=0, n/st=100, rew=678.00]                                                                                   


Epoch #30: test_reward: 7121.600000 ± 2248.762824, best_reward: 12324.600000 ± 4739.300037 in #12


Epoch #31: 1001it [00:03, 332.90it/s, env_step=31000, gradient_step=3100, len=117, n/ep=1, n/st=100, rew=6056.00]                                                                                 


Epoch #31: test_reward: 7878.500000 ± 3075.242210, best_reward: 12324.600000 ± 4739.300037 in #12


Epoch #32: 1001it [00:02, 386.91it/s, env_step=32000, gradient_step=3200, len=186, n/ep=0, n/st=100, rew=8535.50]                                                                                 


Epoch #32: test_reward: 15266.900000 ± 3321.243636, best_reward: 15266.900000 ± 3321.243636 in #32


Epoch #33: 1001it [00:02, 407.36it/s, env_step=33000, gradient_step=3300, len=91, n/ep=1, n/st=100, rew=4677.00]                                                                                  


Epoch #33: test_reward: 8448.800000 ± 3608.150546, best_reward: 15266.900000 ± 3321.243636 in #32


Epoch #34: 1001it [00:03, 318.94it/s, env_step=34000, gradient_step=3400, len=123, n/ep=1, n/st=100, rew=6800.00]                                                                                 


Epoch #34: test_reward: 12421.900000 ± 2610.812649, best_reward: 15266.900000 ± 3321.243636 in #32


Epoch #35: 1001it [00:02, 358.15it/s, env_step=35000, gradient_step=3500, len=135, n/ep=1, n/st=100, rew=5571.00]                                                                                 


Epoch #35: test_reward: 11228.500000 ± 4077.024289, best_reward: 15266.900000 ± 3321.243636 in #32


Epoch #36: 1001it [00:02, 334.52it/s, env_step=36000, gradient_step=3600, len=80, n/ep=0, n/st=100, rew=3654.00]                                                                                  


Epoch #36: test_reward: 11105.800000 ± 3780.742726, best_reward: 15266.900000 ± 3321.243636 in #32


Epoch #37: 1001it [00:02, 393.77it/s, env_step=37000, gradient_step=3700, len=143, n/ep=0, n/st=100, rew=8498.50]                                                                                 


Epoch #37: test_reward: 9829.400000 ± 5622.861410, best_reward: 15266.900000 ± 3321.243636 in #32


Epoch #38: 1001it [00:03, 324.93it/s, env_step=38000, gradient_step=3800, len=137, n/ep=1, n/st=100, rew=7934.50]                                                                                 


Epoch #38: test_reward: 8690.700000 ± 2393.743013, best_reward: 15266.900000 ± 3321.243636 in #32


Epoch #39: 1001it [00:02, 377.76it/s, env_step=39000, gradient_step=3900, len=188, n/ep=0, n/st=100, rew=6410.00]                                                                                 


Epoch #39: test_reward: 14615.700000 ± 6173.347035, best_reward: 15266.900000 ± 3321.243636 in #32


Epoch #40: 1001it [00:02, 348.82it/s, env_step=40000, gradient_step=4000, len=145, n/ep=0, n/st=100, rew=7877.25]                                                                                 


Epoch #40: test_reward: 12023.400000 ± 5199.195961, best_reward: 15266.900000 ± 3321.243636 in #32


Epoch #41: 1001it [00:02, 389.49it/s, env_step=41000, gradient_step=4100, len=206, n/ep=1, n/st=100, rew=12017.00]                                                                                


Epoch #41: test_reward: 13315.100000 ± 5634.641700, best_reward: 15266.900000 ± 3321.243636 in #32


Epoch #42: 1001it [00:02, 382.44it/s, env_step=42000, gradient_step=4200, len=136, n/ep=1, n/st=100, rew=6955.50]                                                                                 


Epoch #42: test_reward: 9700.400000 ± 6232.659837, best_reward: 15266.900000 ± 3321.243636 in #32


Epoch #43: 1001it [00:02, 420.26it/s, env_step=43000, gradient_step=4300, len=123, n/ep=0, n/st=100, rew=6700.50]                                                                                 


Epoch #43: test_reward: 8751.400000 ± 2107.384360, best_reward: 15266.900000 ± 3321.243636 in #32


Epoch #44: 1001it [00:02, 357.22it/s, env_step=44000, gradient_step=4400, len=209, n/ep=0, n/st=100, rew=13015.00]                                                                                


Epoch #44: test_reward: 14204.600000 ± 6484.762944, best_reward: 15266.900000 ± 3321.243636 in #32


Epoch #45: 1001it [00:03, 315.86it/s, env_step=45000, gradient_step=4500, len=209, n/ep=1, n/st=100, rew=12239.00]                                                                                


Epoch #45: test_reward: 14829.600000 ± 5428.030788, best_reward: 15266.900000 ± 3321.243636 in #32


Epoch #46: 1001it [00:02, 382.43it/s, env_step=46000, gradient_step=4600, len=29, n/ep=1, n/st=100, rew=805.00]                                                                                   


Epoch #46: test_reward: 12881.500000 ± 3923.039466, best_reward: 15266.900000 ± 3321.243636 in #32


Epoch #47: 1001it [00:02, 342.21it/s, env_step=47000, gradient_step=4700, len=229, n/ep=0, n/st=100, rew=12191.17]                                                                                


Epoch #47: test_reward: 13930.800000 ± 6092.098420, best_reward: 15266.900000 ± 3321.243636 in #32


Epoch #48: 1001it [00:02, 347.19it/s, env_step=48000, gradient_step=4800, len=139, n/ep=0, n/st=100, rew=7709.75]                                                                                 


Epoch #48: test_reward: 12647.200000 ± 3656.089982, best_reward: 15266.900000 ± 3321.243636 in #32


Epoch #49: 1001it [00:02, 334.81it/s, env_step=49000, gradient_step=4900, len=225, n/ep=2, n/st=100, rew=12121.75]                                                                                


Epoch #49: test_reward: 11018.500000 ± 3759.627887, best_reward: 15266.900000 ± 3321.243636 in #32


Epoch #50: 1001it [00:02, 336.07it/s, env_step=50000, gradient_step=5000, len=274, n/ep=1, n/st=100, rew=16523.50]                                                                                


Epoch #50: test_reward: 12038.900000 ± 4312.759243, best_reward: 15266.900000 ± 3321.243636 in #32


Epoch #51: 1001it [00:02, 339.40it/s, env_step=51000, gradient_step=5100, len=265, n/ep=1, n/st=100, rew=14218.00]                                                                                


Epoch #51: test_reward: 8599.000000 ± 3302.113172, best_reward: 15266.900000 ± 3321.243636 in #32


Epoch #52: 1001it [00:03, 324.61it/s, env_step=52000, gradient_step=5200, len=259, n/ep=0, n/st=100, rew=14743.00]                                                                                


Epoch #52: test_reward: 10010.800000 ± 4959.252561, best_reward: 15266.900000 ± 3321.243636 in #32


Epoch #53: 1001it [00:02, 390.35it/s, env_step=53000, gradient_step=5300, len=267, n/ep=0, n/st=100, rew=13981.50]                                                                                


Epoch #53: test_reward: 7508.200000 ± 4955.769220, best_reward: 15266.900000 ± 3321.243636 in #32


Epoch #54: 1001it [00:03, 331.49it/s, env_step=54000, gradient_step=5400, len=102, n/ep=0, n/st=100, rew=4812.25]                                                                                 


Epoch #54: test_reward: 8886.300000 ± 3476.229683, best_reward: 15266.900000 ± 3321.243636 in #32


Epoch #55: 1001it [00:02, 339.38it/s, env_step=55000, gradient_step=5500, len=90, n/ep=1, n/st=100, rew=4071.00]                                                                                  


Epoch #55: test_reward: 13031.900000 ± 6205.860931, best_reward: 15266.900000 ± 3321.243636 in #32


Epoch #56: 1001it [00:02, 378.32it/s, env_step=56000, gradient_step=5600, len=155, n/ep=1, n/st=100, rew=8748.00]                                                                                 


Epoch #56: test_reward: 10956.700000 ± 2738.525298, best_reward: 15266.900000 ± 3321.243636 in #32


Epoch #57: 1001it [00:03, 306.71it/s, env_step=57000, gradient_step=5700, len=253, n/ep=1, n/st=100, rew=13882.00]                                                                                


Epoch #57: test_reward: 11689.200000 ± 4144.084719, best_reward: 15266.900000 ± 3321.243636 in #32


Epoch #58: 1001it [00:03, 331.25it/s, env_step=58000, gradient_step=5800, len=245, n/ep=1, n/st=100, rew=12119.50]                                                                                


Epoch #58: test_reward: 10498.500000 ± 3290.716495, best_reward: 15266.900000 ± 3321.243636 in #32


Epoch #59: 1001it [00:03, 329.23it/s, env_step=59000, gradient_step=5900, len=269, n/ep=1, n/st=100, rew=16042.50]                                                                                


Epoch #59: test_reward: 12163.200000 ± 4853.385824, best_reward: 15266.900000 ± 3321.243636 in #32


Epoch #60: 1001it [00:02, 337.59it/s, env_step=60000, gradient_step=6000, len=147, n/ep=2, n/st=100, rew=8440.00]                                                                                 


Epoch #60: test_reward: 11431.300000 ± 6571.064953, best_reward: 15266.900000 ± 3321.243636 in #32


Epoch #61: 1001it [00:02, 344.24it/s, env_step=61000, gradient_step=6100, len=105, n/ep=1, n/st=100, rew=5944.00]                                                                                 


Epoch #61: test_reward: 14334.100000 ± 4641.291403, best_reward: 15266.900000 ± 3321.243636 in #32


Epoch #62: 1001it [00:02, 348.57it/s, env_step=62000, gradient_step=6200, len=221, n/ep=0, n/st=100, rew=13139.50]                                                                                


Epoch #62: test_reward: 12716.100000 ± 2783.477194, best_reward: 15266.900000 ± 3321.243636 in #32


Epoch #63: 1001it [00:03, 326.44it/s, env_step=63000, gradient_step=6300, len=283, n/ep=0, n/st=100, rew=16200.50]                                                                                


Epoch #63: test_reward: 12502.900000 ± 4546.932734, best_reward: 15266.900000 ± 3321.243636 in #32


Epoch #64: 1001it [00:03, 319.00it/s, env_step=64000, gradient_step=6400, len=283, n/ep=0, n/st=100, rew=16200.50]                                                                                


Epoch #64: test_reward: 11298.300000 ± 3199.885062, best_reward: 15266.900000 ± 3321.243636 in #32


Epoch #65: 1001it [00:03, 333.11it/s, env_step=65000, gradient_step=6500, len=293, n/ep=0, n/st=100, rew=16986.00]                                                                                


Epoch #65: test_reward: 9087.100000 ± 3370.243743, best_reward: 15266.900000 ± 3321.243636 in #32


Epoch #66: 1001it [00:03, 328.04it/s, env_step=66000, gradient_step=6600, len=313, n/ep=2, n/st=100, rew=18999.00]                                                                                


Epoch #66: test_reward: 10161.500000 ± 5165.652200, best_reward: 15266.900000 ± 3321.243636 in #32


Epoch #67: 1001it [00:02, 369.07it/s, env_step=67000, gradient_step=6700, len=154, n/ep=0, n/st=100, rew=8665.00]                                                                                 


Epoch #67: test_reward: 16056.100000 ± 5613.178893, best_reward: 16056.100000 ± 5613.178893 in #67


Epoch #68: 1001it [00:02, 343.37it/s, env_step=68000, gradient_step=6800, len=102, n/ep=1, n/st=100, rew=4165.00]                                                                                 


Epoch #68: test_reward: 12398.200000 ± 3872.601059, best_reward: 16056.100000 ± 5613.178893 in #67


Epoch #69: 1001it [00:02, 383.86it/s, env_step=69000, gradient_step=6900, len=224, n/ep=2, n/st=100, rew=13331.00]                                                                                


Epoch #69: test_reward: 15292.700000 ± 4625.083827, best_reward: 16056.100000 ± 5613.178893 in #67


Epoch #70: 1001it [00:02, 340.36it/s, env_step=70000, gradient_step=7000, len=158, n/ep=1, n/st=100, rew=8664.00]                                                                                 


Epoch #70: test_reward: 15463.200000 ± 4726.268841, best_reward: 16056.100000 ± 5613.178893 in #67


Epoch #71: 1001it [00:03, 315.87it/s, env_step=71000, gradient_step=7100, len=264, n/ep=0, n/st=100, rew=15139.00]                                                                                


Epoch #71: test_reward: 12794.000000 ± 3938.974080, best_reward: 16056.100000 ± 5613.178893 in #67


Epoch #72: 1001it [00:03, 293.69it/s, env_step=72000, gradient_step=7200, len=169, n/ep=2, n/st=100, rew=9347.25]                                                                                 


Epoch #72: test_reward: 12236.100000 ± 2773.861657, best_reward: 16056.100000 ± 5613.178893 in #67


Epoch #73: 1001it [00:03, 314.86it/s, env_step=73000, gradient_step=7300, len=215, n/ep=1, n/st=100, rew=13193.00]                                                                                


Epoch #73: test_reward: 12227.600000 ± 7239.068078, best_reward: 16056.100000 ± 5613.178893 in #67


Epoch #74: 1001it [00:02, 389.13it/s, env_step=74000, gradient_step=7400, len=110, n/ep=0, n/st=100, rew=6350.00]                                                                                 


Epoch #74: test_reward: 11092.800000 ± 2662.153181, best_reward: 16056.100000 ± 5613.178893 in #67


Epoch #75: 1001it [00:02, 353.61it/s, env_step=75000, gradient_step=7500, len=163, n/ep=2, n/st=100, rew=8941.75]                                                                                 


Epoch #75: test_reward: 11718.000000 ± 6489.799997, best_reward: 16056.100000 ± 5613.178893 in #67


Epoch #76: 1001it [00:03, 299.01it/s, env_step=76000, gradient_step=7600, len=270, n/ep=1, n/st=100, rew=16699.50]                                                                                


Epoch #76: test_reward: 9058.400000 ± 2899.287747, best_reward: 16056.100000 ± 5613.178893 in #67


Epoch #77: 1001it [00:03, 324.27it/s, env_step=77000, gradient_step=7700, len=182, n/ep=0, n/st=100, rew=11082.00]                                                                                


Epoch #77: test_reward: 10495.800000 ± 2522.647887, best_reward: 16056.100000 ± 5613.178893 in #67


Epoch #78: 1001it [00:02, 374.10it/s, env_step=78000, gradient_step=7800, len=400, n/ep=1, n/st=100, rew=23616.00]                                                                                


Epoch #78: test_reward: 8420.300000 ± 5481.348667, best_reward: 16056.100000 ± 5613.178893 in #67


Epoch #79: 1001it [00:02, 340.75it/s, env_step=79000, gradient_step=7900, len=119, n/ep=1, n/st=100, rew=6098.50]                                                                                 


Epoch #79: test_reward: 9498.300000 ± 6109.361048, best_reward: 16056.100000 ± 5613.178893 in #67


Epoch #80: 1001it [00:03, 318.51it/s, env_step=80000, gradient_step=8000, len=310, n/ep=1, n/st=100, rew=17574.50]                                                                                


Epoch #80: test_reward: 12069.000000 ± 3612.568421, best_reward: 16056.100000 ± 5613.178893 in #67


Epoch #81: 1001it [00:03, 301.65it/s, env_step=81000, gradient_step=8100, len=148, n/ep=0, n/st=100, rew=8882.50]                                                                                 


Epoch #81: test_reward: 11787.900000 ± 4068.992958, best_reward: 16056.100000 ± 5613.178893 in #67


Epoch #82: 1001it [00:03, 299.42it/s, env_step=82000, gradient_step=8200, len=201, n/ep=0, n/st=100, rew=11900.50]                                                                                


Epoch #82: test_reward: 10961.200000 ± 3516.850858, best_reward: 16056.100000 ± 5613.178893 in #67


Epoch #83: 1001it [00:03, 323.09it/s, env_step=83000, gradient_step=8300, len=149, n/ep=0, n/st=100, rew=8906.00]                                                                                 


Epoch #83: test_reward: 11790.100000 ± 5718.600379, best_reward: 16056.100000 ± 5613.178893 in #67


Epoch #84: 1001it [00:03, 326.04it/s, env_step=84000, gradient_step=8400, len=172, n/ep=2, n/st=100, rew=10791.50]                                                                                


Epoch #84: test_reward: 12867.700000 ± 5347.917371, best_reward: 16056.100000 ± 5613.178893 in #67


Epoch #85: 1001it [00:03, 282.03it/s, env_step=85000, gradient_step=8500, len=325, n/ep=1, n/st=100, rew=20524.00]                                                                                


Epoch #85: test_reward: 6844.600000 ± 3091.006833, best_reward: 16056.100000 ± 5613.178893 in #67


Epoch #86: 1001it [00:03, 295.39it/s, env_step=86000, gradient_step=8600, len=233, n/ep=0, n/st=100, rew=14569.00]                                                                                


Epoch #86: test_reward: 7568.200000 ± 3034.713357, best_reward: 16056.100000 ± 5613.178893 in #67


Epoch #87: 1001it [00:02, 352.05it/s, env_step=87000, gradient_step=8700, len=204, n/ep=0, n/st=100, rew=12165.12]                                                                                


Epoch #87: test_reward: 10318.400000 ± 4117.756287, best_reward: 16056.100000 ± 5613.178893 in #67


Epoch #88: 1001it [00:02, 401.49it/s, env_step=88000, gradient_step=8800, len=142, n/ep=1, n/st=100, rew=9127.00]                                                                                 


Epoch #88: test_reward: 10890.000000 ± 4280.241021, best_reward: 16056.100000 ± 5613.178893 in #67


Epoch #89: 1001it [00:02, 338.69it/s, env_step=89000, gradient_step=8900, len=166, n/ep=0, n/st=100, rew=10179.50]                                                                                


Epoch #89: test_reward: 7533.000000 ± 3187.329039, best_reward: 16056.100000 ± 5613.178893 in #67


Epoch #90: 1001it [00:02, 343.09it/s, env_step=90000, gradient_step=9000, len=182, n/ep=0, n/st=100, rew=10662.00]                                                                                


Epoch #90: test_reward: 8538.600000 ± 4686.968321, best_reward: 16056.100000 ± 5613.178893 in #67


Epoch #91: 1001it [00:03, 325.23it/s, env_step=91000, gradient_step=9100, len=150, n/ep=1, n/st=100, rew=9090.00]                                                                                 


Epoch #91: test_reward: 10200.300000 ± 2988.462215, best_reward: 16056.100000 ± 5613.178893 in #67


Epoch #92: 1001it [00:02, 373.36it/s, env_step=92000, gradient_step=9200, len=141, n/ep=1, n/st=100, rew=7856.00]                                                                                 


Epoch #92: test_reward: 10233.000000 ± 5535.665181, best_reward: 16056.100000 ± 5613.178893 in #67


Epoch #93: 1001it [00:03, 316.97it/s, env_step=93000, gradient_step=9300, len=156, n/ep=0, n/st=100, rew=8148.50]                                                                                 


Epoch #93: test_reward: 7656.400000 ± 4917.409627, best_reward: 16056.100000 ± 5613.178893 in #67


Epoch #94: 1001it [00:02, 388.19it/s, env_step=94000, gradient_step=9400, len=340, n/ep=1, n/st=100, rew=22212.50]                                                                                


Epoch #94: test_reward: 11404.100000 ± 5004.584727, best_reward: 16056.100000 ± 5613.178893 in #67


Epoch #95: 1001it [00:03, 321.58it/s, env_step=95000, gradient_step=9500, len=99, n/ep=1, n/st=100, rew=4973.00]                                                                                  


Epoch #95: test_reward: 13009.500000 ± 4644.930748, best_reward: 16056.100000 ± 5613.178893 in #67


Epoch #96: 1001it [00:02, 375.12it/s, env_step=96000, gradient_step=9600, len=152, n/ep=0, n/st=100, rew=8285.00]                                                                                 


Epoch #96: test_reward: 12083.100000 ± 6615.686154, best_reward: 16056.100000 ± 5613.178893 in #67


Epoch #97: 1001it [00:02, 345.70it/s, env_step=97000, gradient_step=9700, len=212, n/ep=1, n/st=100, rew=13891.50]                                                                                


Epoch #97: test_reward: 6431.400000 ± 2729.885866, best_reward: 16056.100000 ± 5613.178893 in #67


Epoch #98: 1001it [00:02, 368.16it/s, env_step=98000, gradient_step=9800, len=213, n/ep=0, n/st=100, rew=13219.50]                                                                                


Epoch #98: test_reward: 12592.900000 ± 4720.563705, best_reward: 16056.100000 ± 5613.178893 in #67


Epoch #99: 1001it [00:02, 378.23it/s, env_step=99000, gradient_step=9900, len=92, n/ep=1, n/st=100, rew=4074.00]                                                                                  


Epoch #99: test_reward: 14305.500000 ± 5425.055046, best_reward: 16056.100000 ± 5613.178893 in #67


Epoch #100: 1001it [00:02, 350.93it/s, env_step=100000, gradient_step=10000, len=129, n/ep=1, n/st=100, rew=7161.00]                                                                              


Epoch #100: test_reward: 7367.200000 ± 3137.192879, best_reward: 16056.100000 ± 5613.178893 in #67


Epoch #101: 1001it [00:03, 317.67it/s, env_step=101000, gradient_step=10100, len=209, n/ep=1, n/st=100, rew=11383.50]                                                                             


Epoch #101: test_reward: 6681.100000 ± 3565.304235, best_reward: 16056.100000 ± 5613.178893 in #67


Epoch #102: 1001it [00:02, 351.50it/s, env_step=102000, gradient_step=10200, len=146, n/ep=0, n/st=100, rew=7507.50]                                                                              


Epoch #102: test_reward: 8437.200000 ± 3505.504608, best_reward: 16056.100000 ± 5613.178893 in #67


Epoch #103: 1001it [00:03, 268.78it/s, env_step=103000, gradient_step=10300, len=162, n/ep=0, n/st=100, rew=8143.50]                                                                              


Epoch #103: test_reward: 14717.400000 ± 5236.394336, best_reward: 16056.100000 ± 5613.178893 in #67


Epoch #104: 1001it [00:03, 333.35it/s, env_step=104000, gradient_step=10400, len=178, n/ep=1, n/st=100, rew=9470.00]                                                                              


Epoch #104: test_reward: 3725.500000 ± 1349.453982, best_reward: 16056.100000 ± 5613.178893 in #67


Epoch #105: 1001it [00:02, 386.63it/s, env_step=105000, gradient_step=10500, len=167, n/ep=0, n/st=100, rew=8862.00]                                                                              


Epoch #105: test_reward: 11233.200000 ± 4872.570591, best_reward: 16056.100000 ± 5613.178893 in #67


Epoch #106: 1001it [00:02, 340.59it/s, env_step=106000, gradient_step=10600, len=89, n/ep=0, n/st=100, rew=4619.50]                                                                               


Epoch #106: test_reward: 11866.100000 ± 4700.838424, best_reward: 16056.100000 ± 5613.178893 in #67


Epoch #107: 1001it [00:02, 351.28it/s, env_step=107000, gradient_step=10700, len=145, n/ep=0, n/st=100, rew=8289.17]                                                                              


Epoch #107: test_reward: 7804.800000 ± 2436.089851, best_reward: 16056.100000 ± 5613.178893 in #67


Epoch #108: 1001it [00:03, 322.91it/s, env_step=108000, gradient_step=10800, len=162, n/ep=0, n/st=100, rew=8702.00]                                                                              


Epoch #108: test_reward: 6071.000000 ± 2926.846392, best_reward: 16056.100000 ± 5613.178893 in #67


Epoch #109: 1001it [00:02, 341.80it/s, env_step=109000, gradient_step=10900, len=204, n/ep=0, n/st=100, rew=12810.00]                                                                             


Epoch #109: test_reward: 8558.300000 ± 4819.871618, best_reward: 16056.100000 ± 5613.178893 in #67


Epoch #110: 1001it [00:02, 353.00it/s, env_step=110000, gradient_step=11000, len=205, n/ep=0, n/st=100, rew=13016.50]                                                                             


Epoch #110: test_reward: 8232.200000 ± 4300.433392, best_reward: 16056.100000 ± 5613.178893 in #67


Epoch #111: 1001it [00:03, 293.87it/s, env_step=111000, gradient_step=11100, len=267, n/ep=1, n/st=100, rew=15629.00]                                                                             


Epoch #111: test_reward: 10552.900000 ± 5142.259123, best_reward: 16056.100000 ± 5613.178893 in #67


Epoch #112: 1001it [00:03, 333.04it/s, env_step=112000, gradient_step=11200, len=161, n/ep=0, n/st=100, rew=8658.00]                                                                              


Epoch #112: test_reward: 6895.800000 ± 4762.515256, best_reward: 16056.100000 ± 5613.178893 in #67


Epoch #113: 1001it [00:03, 308.28it/s, env_step=113000, gradient_step=11300, len=159, n/ep=0, n/st=100, rew=9003.50]                                                                              


Epoch #113: test_reward: 10816.900000 ± 8671.660042, best_reward: 16056.100000 ± 5613.178893 in #67


Epoch #114: 1001it [00:02, 338.69it/s, env_step=114000, gradient_step=11400, len=102, n/ep=1, n/st=100, rew=5393.00]                                                                              


Epoch #114: test_reward: 5780.400000 ± 4882.349172, best_reward: 16056.100000 ± 5613.178893 in #67


Epoch #115: 1001it [00:02, 382.69it/s, env_step=115000, gradient_step=11500, len=151, n/ep=1, n/st=100, rew=8276.00]                                                                              


Epoch #115: test_reward: 9153.200000 ± 4150.440165, best_reward: 16056.100000 ± 5613.178893 in #67


Epoch #116: 1001it [00:02, 339.44it/s, env_step=116000, gradient_step=11600, len=400, n/ep=0, n/st=100, rew=25623.00]                                                                             


Epoch #116: test_reward: 4707.600000 ± 2606.631972, best_reward: 16056.100000 ± 5613.178893 in #67


Epoch #117: 1001it [00:03, 313.55it/s, env_step=117000, gradient_step=11700, len=161, n/ep=0, n/st=100, rew=8914.50]                                                                              


Epoch #117: test_reward: 12414.300000 ± 6466.714576, best_reward: 16056.100000 ± 5613.178893 in #67


Epoch #118: 1001it [00:03, 310.54it/s, env_step=118000, gradient_step=11800, len=212, n/ep=0, n/st=100, rew=10837.50]                                                                             


Epoch #118: test_reward: 9752.700000 ± 8081.698424, best_reward: 16056.100000 ± 5613.178893 in #67


Epoch #119: 1001it [00:03, 320.66it/s, env_step=119000, gradient_step=11900, len=137, n/ep=0, n/st=100, rew=8512.00]                                                                              


Epoch #119: test_reward: 6257.300000 ± 2477.494462, best_reward: 16056.100000 ± 5613.178893 in #67


Epoch #120: 1001it [00:03, 333.11it/s, env_step=120000, gradient_step=12000, len=149, n/ep=1, n/st=100, rew=8750.50]                                                                              


Epoch #120: test_reward: 8649.100000 ± 5136.665853, best_reward: 16056.100000 ± 5613.178893 in #67


Epoch #121: 1001it [00:03, 319.74it/s, env_step=121000, gradient_step=12100, len=201, n/ep=0, n/st=100, rew=12214.00]                                                                             


Epoch #121: test_reward: 8941.300000 ± 4492.101959, best_reward: 16056.100000 ± 5613.178893 in #67


Epoch #122: 1001it [00:03, 312.02it/s, env_step=122000, gradient_step=12200, len=124, n/ep=2, n/st=100, rew=6684.50]                                                                              


Epoch #122: test_reward: 9994.800000 ± 1982.958588, best_reward: 16056.100000 ± 5613.178893 in #67


Epoch #123: 1001it [00:03, 290.29it/s, env_step=123000, gradient_step=12300, len=140, n/ep=0, n/st=100, rew=8038.50]                                                                              


Epoch #123: test_reward: 7816.600000 ± 3681.483402, best_reward: 16056.100000 ± 5613.178893 in #67


Epoch #124: 1001it [00:03, 329.20it/s, env_step=124000, gradient_step=12400, len=119, n/ep=1, n/st=100, rew=6138.00]                                                                              


Epoch #124: test_reward: 9658.200000 ± 2556.147953, best_reward: 16056.100000 ± 5613.178893 in #67


Epoch #125: 1001it [00:02, 361.19it/s, env_step=125000, gradient_step=12500, len=178, n/ep=1, n/st=100, rew=11283.00]                                                                             


Epoch #125: test_reward: 11544.300000 ± 4190.457852, best_reward: 16056.100000 ± 5613.178893 in #67


Epoch #126: 1001it [00:02, 335.31it/s, env_step=126000, gradient_step=12600, len=57, n/ep=0, n/st=100, rew=2250.00]                                                                               


Epoch #126: test_reward: 12249.400000 ± 3936.312188, best_reward: 16056.100000 ± 5613.178893 in #67


Epoch #127: 1001it [00:03, 287.18it/s, env_step=127000, gradient_step=12700, len=132, n/ep=0, n/st=100, rew=7598.50]                                                                              


Epoch #127: test_reward: 5750.400000 ± 5862.284660, best_reward: 16056.100000 ± 5613.178893 in #67


Epoch #128: 1001it [00:03, 298.52it/s, env_step=128000, gradient_step=12800, len=292, n/ep=0, n/st=100, rew=18577.50]                                                                             


Epoch #128: test_reward: 10174.500000 ± 6279.622747, best_reward: 16056.100000 ± 5613.178893 in #67


Epoch #129: 1001it [00:03, 328.43it/s, env_step=129000, gradient_step=12900, len=207, n/ep=1, n/st=100, rew=12530.00]                                                                             


Epoch #129: test_reward: 8595.100000 ± 3273.239174, best_reward: 16056.100000 ± 5613.178893 in #67


Epoch #130: 1001it [00:02, 338.73it/s, env_step=130000, gradient_step=13000, len=297, n/ep=0, n/st=100, rew=19596.00]                                                                             


Epoch #130: test_reward: 10120.900000 ± 3357.425783, best_reward: 16056.100000 ± 5613.178893 in #67


Epoch #131: 1001it [00:02, 340.21it/s, env_step=131000, gradient_step=13100, len=103, n/ep=0, n/st=100, rew=4866.50]                                                                              


Epoch #131: test_reward: 11661.500000 ± 6561.579859, best_reward: 16056.100000 ± 5613.178893 in #67


Epoch #132: 1001it [00:03, 320.24it/s, env_step=132000, gradient_step=13200, len=142, n/ep=3, n/st=100, rew=7904.50]                                                                              


Epoch #132: test_reward: 8260.500000 ± 3246.873365, best_reward: 16056.100000 ± 5613.178893 in #67


Epoch #133: 1001it [00:02, 344.30it/s, env_step=133000, gradient_step=13300, len=117, n/ep=0, n/st=100, rew=6645.50]                                                                              


Epoch #133: test_reward: 6925.000000 ± 5640.879595, best_reward: 16056.100000 ± 5613.178893 in #67


Epoch #134: 1001it [00:03, 299.74it/s, env_step=134000, gradient_step=13400, len=153, n/ep=1, n/st=100, rew=9066.50]                                                                              


Epoch #134: test_reward: 6444.300000 ± 3180.967967, best_reward: 16056.100000 ± 5613.178893 in #67


Epoch #135: 1001it [00:03, 329.67it/s, env_step=135000, gradient_step=13500, len=179, n/ep=2, n/st=100, rew=10685.25]                                                                             


Epoch #135: test_reward: 10822.600000 ± 4537.975853, best_reward: 16056.100000 ± 5613.178893 in #67


Epoch #136: 1001it [00:03, 317.83it/s, env_step=136000, gradient_step=13600, len=129, n/ep=1, n/st=100, rew=6704.00]                                                                              


Epoch #136: test_reward: 8168.400000 ± 2667.350566, best_reward: 16056.100000 ± 5613.178893 in #67


Epoch #137: 1001it [00:03, 317.38it/s, env_step=137000, gradient_step=13700, len=83, n/ep=0, n/st=100, rew=4489.00]                                                                               


Epoch #137: test_reward: 13308.600000 ± 4855.869875, best_reward: 16056.100000 ± 5613.178893 in #67


Epoch #138: 1001it [00:03, 324.65it/s, env_step=138000, gradient_step=13800, len=117, n/ep=0, n/st=100, rew=6654.50]                                                                              


Epoch #138: test_reward: 13804.300000 ± 4105.329342, best_reward: 16056.100000 ± 5613.178893 in #67


Epoch #139: 1001it [00:03, 293.71it/s, env_step=139000, gradient_step=13900, len=215, n/ep=0, n/st=100, rew=12385.50]                                                                             


Epoch #139: test_reward: 8900.700000 ± 2826.004602, best_reward: 16056.100000 ± 5613.178893 in #67


Epoch #140: 1001it [00:02, 340.63it/s, env_step=140000, gradient_step=14000, len=189, n/ep=2, n/st=100, rew=11339.00]                                                                             


Epoch #140: test_reward: 8107.800000 ± 2143.734209, best_reward: 16056.100000 ± 5613.178893 in #67


Epoch #141: 1001it [00:02, 357.85it/s, env_step=141000, gradient_step=14100, len=242, n/ep=1, n/st=100, rew=15728.00]                                                                             


Epoch #141: test_reward: 10441.300000 ± 2447.295898, best_reward: 16056.100000 ± 5613.178893 in #67


Epoch #142: 1001it [00:03, 322.22it/s, env_step=142000, gradient_step=14200, len=119, n/ep=1, n/st=100, rew=7140.00]                                                                              


Epoch #142: test_reward: 10559.500000 ± 3191.120344, best_reward: 16056.100000 ± 5613.178893 in #67


Epoch #143: 1001it [00:02, 368.31it/s, env_step=143000, gradient_step=14300, len=111, n/ep=2, n/st=100, rew=6198.50]                                                                              


Epoch #143: test_reward: 15180.700000 ± 6402.123445, best_reward: 16056.100000 ± 5613.178893 in #67


Epoch #144: 1001it [00:03, 312.54it/s, env_step=144000, gradient_step=14400, len=103, n/ep=0, n/st=100, rew=5332.00]                                                                              


Epoch #144: test_reward: 7437.700000 ± 3742.289247, best_reward: 16056.100000 ± 5613.178893 in #67


Epoch #145: 1001it [00:03, 313.18it/s, env_step=145000, gradient_step=14500, len=136, n/ep=3, n/st=100, rew=7462.00]                                                                              


Epoch #145: test_reward: 7041.500000 ± 4326.656036, best_reward: 16056.100000 ± 5613.178893 in #67


Epoch #146: 1001it [00:03, 301.55it/s, env_step=146000, gradient_step=14600, len=137, n/ep=0, n/st=100, rew=7843.00]                                                                              


Epoch #146: test_reward: 7744.500000 ± 2413.517361, best_reward: 16056.100000 ± 5613.178893 in #67


Epoch #147: 1001it [00:02, 340.20it/s, env_step=147000, gradient_step=14700, len=127, n/ep=0, n/st=100, rew=6490.00]                                                                              


Epoch #147: test_reward: 10449.800000 ± 4725.424823, best_reward: 16056.100000 ± 5613.178893 in #67


Epoch #148: 1001it [00:03, 288.44it/s, env_step=148000, gradient_step=14800, len=213, n/ep=0, n/st=100, rew=12199.50]                                                                             


Epoch #148: test_reward: 12858.700000 ± 5598.952242, best_reward: 16056.100000 ± 5613.178893 in #67


Epoch #149: 1001it [00:03, 309.50it/s, env_step=149000, gradient_step=14900, len=161, n/ep=0, n/st=100, rew=8885.00]                                                                              


Epoch #149: test_reward: 11176.100000 ± 5186.595655, best_reward: 16056.100000 ± 5613.178893 in #67


Epoch #150: 1001it [00:03, 323.71it/s, env_step=150000, gradient_step=15000, len=126, n/ep=1, n/st=100, rew=6719.50]                                                                              


Epoch #150: test_reward: 17879.900000 ± 4215.689942, best_reward: 17879.900000 ± 4215.689942 in #150

InfoStats(gradient_step=15000, best_reward=17879.9, best_reward_std=4215.68994234633, train_step=150000, train_episode=844, test_step=310461, test_episode=1510, timing=TimingStats(total_time=629.1210973262787, train_time=440.1674313545227, train_time_collect=54.099361419677734, train_time_update=379.1930375099182, test_time=188.95366597175598, update_speed=340.7794155474124))

(the trained policy can be accessed via policy.policies[agents[1]])


### 🦧 Play different learned policies

In [215]:
for n_agent, agent in enumerate(agents_learned):
    print("----------------------------------------------------")
    winners = 0
    for n_agent_opp, agent_opponent in enumerate(agents_learned):
        if agent != agent_opponent:
            PLAYS = {"bastaushy": 0, "qostaushy": 0}
    
            env = _get_env()#render_mode="human")
    
            policies = MultiAgentPolicyManager(policies=[agent_opponent, agent], env=env)
            env = DummyVectorEnv([lambda: env])
            
            collector = Collector(policies, env)
            
            result = collector.collect(n_episode=1, reset_before_collect=True)
            print(f"Agent {n_agent} plays with agent_opponent {n_agent_opp} {PLAYS}")
            winners += PLAYS["qostaushy"]
    print(f"Agent {n_agent} wins as Qostaushy {winners}")

----------------------------------------------------
Agent 0 plays with agent_opponent 1 {'bastaushy': 1, 'qostaushy': 0}
Agent 0 plays with agent_opponent 2 {'bastaushy': 1, 'qostaushy': 0}
Agent 0 plays with agent_opponent 3 {'bastaushy': 1, 'qostaushy': 0}
Agent 0 plays with agent_opponent 4 {'bastaushy': 0, 'qostaushy': 1}
Agent 0 plays with agent_opponent 5 {'bastaushy': 0, 'qostaushy': 1}
Agent 0 plays with agent_opponent 6 {'bastaushy': 0, 'qostaushy': 1}
Agent 0 plays with agent_opponent 7 {'bastaushy': 0, 'qostaushy': 1}
Agent 0 plays with agent_opponent 8 {'bastaushy': 0, 'qostaushy': 1}
Agent 0 plays with agent_opponent 9 {'bastaushy': 1, 'qostaushy': 0}
Agent 0 plays with agent_opponent 10 {'bastaushy': 0, 'qostaushy': 1}
Agent 0 plays with agent_opponent 11 {'bastaushy': 0, 'qostaushy': 1}
Agent 0 plays with agent_opponent 12 {'bastaushy': 1, 'qostaushy': 0}
Agent 0 plays with agent_opponent 13 {'bastaushy': 1, 'qostaushy': 0}
Agent 0 plays with agent_opponent 14 {'bastaus

In [220]:
PLAYS = {"bastaushy": 0, "qostaushy": 0}

env = _get_env(render_mode="human")

policies = MultiAgentPolicyManager(policies=[agents_learned[0], agents_learned[6]], env=env)

env = DummyVectorEnv([lambda: env])

collector = Collector(policies, env)

result = collector.collect(n_episode=1, reset_before_collect=True)
print(PLAYS)

{'bastaushy': 0, 'qostaushy': 1}
