# 🐌 Prepare imports

## 🦛 Install needed packages

In [1]:
! pip install gdown



In [2]:
import gdown

# Fetch the dependency list from Google Drive and store it next to the
# notebook as requirements.txt; it is installed by the following cell.
url = "https://drive.google.com/uc?id=15uPcQurzdz2aO2Bgf3WLEsQ3nAaMaM8T"
output = 'requirements.txt'
gdown.download(url=url, output=output, quiet=False);

Downloading...
From: https://drive.google.com/uc?id=15uPcQurzdz2aO2Bgf3WLEsQ3nAaMaM8T
To: /kaggle/working/requirements.txt
100%|██████████| 2.36k/2.36k [00:00<00:00, 6.73MB/s]


In [3]:
! pip install -r requirements.txt
! pip install tianshou coloredlogs
! pip install --upgrade ipython

Collecting tianshou@ git+https://github.com/thu-ml/tianshou.git@ade85ab32baab721605508dfd9d460015a1832e5 (from -r requirements.txt (line 107))
  Cloning https://github.com/thu-ml/tianshou.git (to revision ade85ab32baab721605508dfd9d460015a1832e5) to /tmp/pip-install-khjs8b2z/tianshou_db26d01f0cf14a5c90423b974baf9d3e
  Running command git clone --filter=blob:none --quiet https://github.com/thu-ml/tianshou.git /tmp/pip-install-khjs8b2z/tianshou_db26d01f0cf14a5c90423b974baf9d3e
  Running command git rev-parse -q --verify 'sha^ade85ab32baab721605508dfd9d460015a1832e5'
  Running command git fetch -q https://github.com/thu-ml/tianshou.git ade85ab32baab721605508dfd9d460015a1832e5
  Running command git checkout -q ade85ab32baab721605508dfd9d460015a1832e5
  Resolved https://github.com/thu-ml/tianshou.git to commit ade85ab32baab721605508dfd9d460015a1832e5
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyprojec

In [4]:
# ! pip install -U ipywidgets

## 🐦 Imports

In [5]:
import gymnasium
import numpy as np
from gymnasium.spaces import Discrete, MultiDiscrete
from gymnasium import spaces
import random

from IPython.display import clear_output
import time
from pettingzoo import AECEnv
from pettingzoo.utils import agent_selector, wrappers
import matplotlib.pyplot as plt
from tianshou.env.pettingzoo_env import PettingZooEnv

from typing import Optional, Tuple
from tianshou.policy import BasePolicy, DQNPolicy, RainbowPolicy, MultiAgentPolicyManager, RandomPolicy
from tianshou.trainer import OffpolicyTrainer
from tianshou.utils.net.common import Net
from tianshou.env import DummyVectorEnv
from copy import deepcopy
from tianshou.data import Collector, VectorReplayBuffer

from torch.utils.tensorboard import SummaryWriter
from tianshou.utils import TensorboardLogger

import torch

import random
import os

# Move limit per game: the environment marks every agent as truncated once
# this many moves have been played.
NUM_ITERS = 400
# Global win counters per side; incremented by the environment each time a
# game is decided and reset by the evaluation code before a new series.
PLAYS = dict(bastaushy=0, qostaushy=0)

  from jax import xla_computation as _xla_computation


# 🐘 AEC environment https://pettingzoo.farama.org/api/aec/#about-aec

## 🦉 Create environment

In [6]:
class TogyzQumalaqEnv(AECEnv):
    """
    Two-player Togyz Qumalaq board game as a PettingZoo AEC environment.

    Board state is kept in plain Python lists:
    - otaular: 18 pit counts; pits 0-8 belong to "bastaushy", pits 9-17 to
      "qostaushy".
    - tuzdyq: for each player, the index of the opponent pit they own as
      tuzdyq, or -1 while they have none.
    - qazandar: for each player, the number of qumalaqs captured so far.
    - direction: for each player, the order in which the 18 pits are visited
      when sowing, starting from that player's own first pit.

    The "name" metadata allows the environment to be pretty printed.
    """

    metadata = {
        "render_modes": ["ansi", "human"],
        "name": "togyzqumalaq_v0"
        }

    def __init__(self, render_mode=None):
        """
        The init method takes in environment arguments and
         should define the following attributes:
        - otaular
        - tuzdyq
        - qazandar
        - possible_agents
        - render_mode

        The board lists are created empty here and only filled by reset().
        """
        self.otaular = []
        self.tuzdyq = []
        self.qazandar = []
        self.direction = []
        self.agents = ["bastaushy", "qostaushy"]
        self.possible_agents = self.agents[:]
        # One discrete action per own pit (0-8); step() maps it to a board index.
        self.action_spaces = {i: spaces.Discrete(9) for i in self.agents}
        # Observation layout: 18 pit counts, 2 tuzdyq indices, 2 qazan totals
        # and 1 flag holding the index of the agent to move (23 values).
        self.observation_spaces = {
            i: spaces.Dict(
                {
                    "observation":
                        MultiDiscrete([100] * 18 + [10] * 2 + [162] * 2 + [2]),
                    "action_mask":
                        Discrete(9),
                }
            )
            for i in self.agents
        }
        self.render_mode = render_mode

    # Action space should be defined here.
    def action_space(self, agent):
        """Return the action space of `agent`."""
        return self.action_spaces[agent]

    # Observation space should be defined here.
    def observation_space(self, agent):
        """Return the observation space of `agent`."""
        return self.observation_spaces[agent]

    def render(self):
        """
        Draw the current board with matplotlib.

        Does nothing except emit a warning when no render mode was given.
        While both agents are alive the board is plotted: qostaushy's pits on
        the top row, bastaushy's pits on the bottom row, and the qazan /
        tuzdyq values of both players on the right. Once the agent list no
        longer holds two agents, "Game over" is printed in human mode instead.
        Every call that gets past the render-mode check sleeps for one second.
        """
        if self.render_mode is None:
            gymnasium.logger.warn(
                "You are calling render method without "
                "specifying any render mode."
            )
            return

        if len(self.agents) == 2:
            # Marker grid for one pit: 10 columns x 5 rows, so at most 50
            # qumalaqs are drawn per pit (the label still shows the real count).
            points_bastaushy_x = np.array([i * 2 for i in range(10)])
            points_bastaushy_y = np.array([i % 5 for i in range(50)])

            qazandar = self.qazandar
            otaular = self.otaular
            tuzdyq = self.tuzdyq
            x = np.arange(-3, 225, 1)
            y = -1

            text_kwargs = dict(ha='center', va='center', fontsize=12)
            plt.figure(figsize=(15, 4))

            for i in range(9):
                # qostaushy's part
                plt.scatter(np.repeat(
                    points_bastaushy_x + 25 * i, 5)[:otaular[17 - i]],
                            points_bastaushy_y[:otaular[17 - i]], marker='o')
                # vertical lines; linspace guarantees exactly len(x) samples,
                # which a float-step arange does not
                plt.plot(np.repeat(25 * i - 2, len(x)),
                         np.linspace(-7, 5, len(x), endpoint=False))
                # bastaushy's part
                plt.scatter(np.repeat(points_bastaushy_x + 25 * i, 5)[:otaular[i]],
                                points_bastaushy_y[:otaular[i]] - 6, marker='o')

            # horizontal line
            x_lims = np.arange(-3, 245, 1)
            plt.plot(x_lims, np.repeat(y, len(x_lims)))
            # last vertical line
            plt.plot(np.repeat(25 * 9 - 2, 13),
                     np.arange(-7, 6, 1))

            for i in range(9):
                # bastaushy's qumalaqtar
                plt.text(25 * i + 10, -7,
                         f'{i} ({otaular[i]})', **text_kwargs)
                # qostaushy's qumalaqtar
                plt.text(25 * i + 10, 5,
                         f'{17 - i} ({otaular[17 - i]})', **text_kwargs)
            # bastaushy qazan's qumalaqtar
            plt.text(235, -4,
                     f'qazan: {qazandar[0]}', **text_kwargs)
            # qostaushy qazan's qumalaqtar
            plt.text(235, 2,
                     f'qazan: {qazandar[1]}', **text_kwargs)
            # bastaushy tuzdyq's qumalaqtar
            plt.text(235, -6,
                     f'tuzdyq: {tuzdyq[0]}', **text_kwargs)
            # qostaushy tuzdyq's qumalaqtar
            plt.text(235, 0,
                     f'tuzdyq: {tuzdyq[1]}', **text_kwargs)
            plt.xticks([])
            plt.yticks([])
            plt.show()
        else:
            if self.render_mode == "human":
                print("Game over")
        time.sleep(1)
        #clear_output()

    def _legal_moves(self, agent):
        """
        Return the actions (0-8) `agent` may play: own pits that are not empty
        and are not the opponent's tuzdyq.
        """
        cur_player = self.possible_agents.index(agent)
        opp_player = (cur_player + 1) % 2
        offset = cur_player * 9
        return [
            item for item in range(9)
            if self.tuzdyq[opp_player] != item + offset
            and self.otaular[item + offset] > 0
        ]

    def _last_otau(self, cur_player, idx_action, num_qumalaq):
        """
        Return the board index of the pit that receives the last qumalaq.

        `idx_action` is the position of the source pit inside
        direction[cur_player]. A single qumalaq moves to the next pit; with
        more than one, the first qumalaq stays in the source pit, so the last
        one lands num_qumalaq - 1 pits further on.
        """
        offset = num_qumalaq - 1 if num_qumalaq > 1 else num_qumalaq
        return self.direction[cur_player][(idx_action + offset) % 18]

    def observe(self, agent):
        """
        Observe should return the observation of the specified agent. This function
        should return a sane observation (though not necessarily the most up to date possible)
        at any time after reset() is called.

        Returns a dict with
        - "observation": tuple of the 18 pit counts, both tuzdyq indices, both
          qazan totals and the index of the agent whose turn it is;
        - "action_mask": int8 array of length 9 with 1 for every legal move.
          It is all zeros when `agent` is not the agent to move.
        """
        legal_moves = self._legal_moves(agent) if agent == self.agent_selection else []
        action_mask = np.zeros(9, "int8")

        for i in legal_moves:
            action_mask[i] = 1
        observation = tuple(
            self.otaular + self.tuzdyq + self.qazandar + [self.possible_agents.index(self.agent_selection)]
        )
        return {"observation": observation, "action_mask": action_mask}

    def close(self):
        """
        Close should release any graphical displays, subprocesses, network connections
        or any other environment data which should not be kept around after the
        user is no longer using the environment.
        """
        pass

    def reset(self, seed=None, options=None):
        """
        Reset needs to initialize the following attributes
        - agents
        - rewards
        - _cumulative_rewards
        - terminations
        - truncations
        - infos
        - agent_selection

        The board is set to 9 qumalaqs in each of the 18 pits, no tuzdyq and
        empty qazans. `seed` and `options` are accepted but not used.
        """
        self.agents = self.possible_agents[:]
        self.rewards = {agent: 0 for agent in self.agents}
        self._cumulative_rewards = {agent: 0 for agent in self.agents}
        self.otaular = [9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9]
        # Sowing order per player: each starts at their own first pit and
        # continues through the opponent's pits.
        self.direction = [list(range(18)), [9, 10, 11, 12, 13, 14, 15, 16, 17, 0, 1, 2, 3, 4, 5, 6, 7, 8]]
        self.tuzdyq = [-1, -1]
        self.qazandar = [0, 0]
        self.terminations = {agent: False for agent in self.agents}
        self.truncations = {agent: False for agent in self.agents}
        self.infos = {agent: {} for agent in self.agents}
        self.num_moves = 0
        observation = tuple(
            self.otaular + self.tuzdyq + self.qazandar + [0]
        )
        self.observations = {agent: observation for agent in self.agents}
        # Our agent_selector utility allows easy cyclic stepping through the agents list.
        self._agent_selector = agent_selector(self.agents)
        self.agent_selection = self._agent_selector.next()

    def step(self, action):
        """
        step(action) takes in an action for the current agent (specified by
        agent_selection) and needs to update
        - rewards
        - _cumulative_rewards (accumulating the rewards)
        - terminations
        - truncations
        - infos
        - agent_selection (to the next agent)
        And any internal state used by observe() or render()

        `action` is the index (0-8) of one of the current agent's own pits.
        The move is zero-sum: whatever the mover gains is subtracted from the
        opponent's reward.
        """
        if (
            self.terminations[self.agent_selection]
            or self.truncations[self.agent_selection]
        ):
            # handles stepping an agent which is already dead
            # accepts a None action for the one agent, and moves the agent_selection to
            # the next dead agent,  or if there are no more dead agents, to the next live agent
            self._was_dead_step(action)
            return
        # The acting agent's cumulative reward was already reported through
        # last(), so it starts again from zero (PettingZoo AEC convention).
        self._cumulative_rewards[self.agent_selection] = 0
        self.rewards = {agent: 0 for agent in self.agents}
        cur_player = self.possible_agents.index(self.agent_selection)
        opp_player = (cur_player + 1) % 2
        self.num_moves += 1
        if self.render_mode == "human":
            print(f'MOVE #{self.num_moves}')
        # The truncations dictionary must be updated for all players.
        self.truncations = {
            agent: self.num_moves >= NUM_ITERS for agent in self.agents
        }
        # distribute qumalaqs
        if cur_player == 1:
            # qostaushy's pits live at board indices 9-17
            action += 9
        if self.render_mode == "human":
            print(f'{self.agent_selection} made action {action}')
        num_qumalaq = self.otaular[action]
        idx_action = self.direction[cur_player].index(action)
        if num_qumalaq == 1:
            # a single qumalaq simply moves to the next pit
            self.otaular[self.direction[cur_player][idx_action + 1]] += 1
            self.otaular[action] -= 1
        else:
            # Sowing starts in the source pit and returns to it every 18
            # qumalaqs, so ceil(num_qumalaq / 18) of them stay there. The old
            # formula left one too many for exact multiples of 18 above 18
            # (e.g. 36), which disagreed with the last-pit computation.
            coef = max(1, (num_qumalaq + 17) // 18)
            i = 1
            while self.otaular[action] > coef:
                # when the lap reaches the source pit the +1/-1 cancel out,
                # i.e. that qumalaq stays in the source pit
                self.otaular[self.direction[cur_player][(idx_action + i) % 18]] += 1
                self.otaular[action] -= 1
                i += 1
        # check tuzdyq & add rewards to qazandar
        reward = 0
        if self.tuzdyq[cur_player] < 0 and self.check_tuzdyq(self.agent_selection, action, num_qumalaq):
            # check_tuzdyq emptied the new tuzdyq pit, which held exactly 3
            reward += 3
            if self.render_mode == "human":
                print(f'{self.agent_selection} won tuzdyq {reward}')

            #******* awarding a rewards for receiving tuzdyq **********
            # self.rewards[self.agent_selection] += 50
            # self.rewards[self.possible_agents[opp_player]] -= 50
        else:
            last_otau = self._last_otau(cur_player, idx_action, num_qumalaq)

            # an even count in the opponent pit where sowing ended is captured
            if (last_otau in range(opp_player * 9, (opp_player + 1) * 9) and
                    self.otaular[last_otau] % 2 == 0):
                reward += self.otaular[last_otau]
                if self.render_mode == "human":
                    print(f'{self.agent_selection} won {reward}')
                self.otaular[last_otau] = 0
            # collect whatever has accumulated in the mover's tuzdyq
            if (self.tuzdyq[cur_player] >= 0 and
                    self.otaular[self.tuzdyq[cur_player]] > 0):
                reward += self.otaular[self.tuzdyq[cur_player]]
                if self.render_mode == "human":
                    print(f'{self.agent_selection} won tuzdyq {self.otaular[self.tuzdyq[cur_player]]}')
                self.otaular[self.tuzdyq[cur_player]] = 0
        if self.render_mode == "human":
            print(f'{self.agent_selection} won total {reward}')
        self.qazandar[cur_player] += reward

        #******* awarding a rewards from otaular **********
        self.rewards[self.agent_selection] += reward
        self.rewards[self.possible_agents[opp_player]] -= reward

        # check if there is a winner
        winner = self.check_for_winner()
        if winner:
            self.terminations = {i: True for i in self.agents}
            if self.render_mode == "human":
                print(f'{self.agent_selection} won the game!!!')

            #******* awarding a reward for winning a game **********
            # bonus: the opponent's qazan plus everything left in their pits
            self.rewards[self.agent_selection] += self.qazandar[opp_player]
            self.rewards[self.possible_agents[opp_player]] -= self.qazandar[opp_player]
            for i in range(9):
                self.rewards[self.agent_selection] += self.otaular[i + 9 * opp_player]
                self.rewards[self.possible_agents[opp_player]] -= self.otaular[i + 9 * opp_player]

        # selects the next agent.
        self.agent_selection = self._agent_selector.next()
        # Adds .rewards to ._cumulative_rewards
        self._accumulate_rewards()

        # invariants: rewards are zero-sum and no qumalaq is created or lost
        total_rewards = sum(self.rewards.values())
        assert total_rewards == 0, f"Error: Total reward is not zero: {total_rewards}"
        total_qumalaqs = sum(self.otaular) + sum(self.qazandar)
        assert total_qumalaqs == 162, f"Error: Total qumalaqs is not equal to 162: {total_qumalaqs}"
        if self.render_mode == "human":
            self.render()

    def check_tuzdyq(self, agent, action, num_qumalaq):
        """
        Try to create a tuzdyq for `agent` in the pit where sowing ended.

        `action` is the board index of the source pit and `num_qumalaq` the
        number of qumalaqs it held before sowing. A tuzdyq is created when the
        last pit is on the opponent's side, holds exactly 3 qumalaqs, is not
        the opponent's ninth pit and does not mirror the opponent's tuzdyq
        (index difference of 9). On success the pit is recorded in
        self.tuzdyq, emptied, and True is returned; the caller is responsible
        for crediting the qumalaqs. Otherwise returns False without changes.
        """
        cur_player = self.possible_agents.index(agent)
        opp_player = (cur_player + 1) % 2
        idx = self.direction[cur_player].index(action)
        last_otau = self._last_otau(cur_player, idx, num_qumalaq)

        if (last_otau in range(opp_player * 9, (opp_player + 1) * 9) and
                self.otaular[last_otau] == 3 and last_otau != 17 - cur_player * 9 and
                abs(last_otau - self.tuzdyq[opp_player]) != 9):
            self.tuzdyq[cur_player] = last_otau
            self.otaular[last_otau] = 0
            if self.render_mode == "human":
                print(f'{agent} got tuzdyq {last_otau}!')
            return True

        return False

    def check_atsyrau(self, agent):
        """
        Return True when `agent` has no qumalaq left to move: every own pit is
        either empty or is the opponent's tuzdyq.
        """
        cur_player = self.possible_agents.index(agent)
        opp_player = (cur_player + 1) % 2

        for idx, i in enumerate(
                self.otaular[cur_player * 9: (cur_player + 1) * 9]):
            if i > 0 and idx + cur_player * 9 != self.tuzdyq[opp_player]:
                return False
        if self.render_mode == "human":
            print(f'{agent} reached atsyrau')
        return True

    def check_for_winner(self):
        """
        Return True when the agent that just moved has won, and count the win
        in the global PLAYS dictionary.

        The mover wins by holding more than 81 qumalaqs in their qazan, or
        when the opponent is in atsyrau (see check_atsyrau) while holding at
        most 81.
        """
        cur_player = self.possible_agents.index(self.agent_selection)
        opp_player = (cur_player + 1) % 2
        if self.qazandar[cur_player] > 81:
            PLAYS[self.agent_selection] += 1
            return True
        if (self.check_atsyrau(self.possible_agents[opp_player])
                and self.qazandar[opp_player] <= 81):
            PLAYS[self.agent_selection] += 1
            return True
        return False


def _get_env(render_mode=None):
    """
    Build a wrapped TogyzQumalaqEnv and adapt it for Tianshou.

    The function itself can be passed as a factory to DummyVectorEnv.
    """
    def env(render_mode=None):
        """Create the raw environment and apply the PettingZoo wrappers."""
        # "ansi" is served by running the env in "human" mode and capturing
        # what it prints to stdout.
        wants_ansi = render_mode == "ansi"
        wrapped = TogyzQumalaqEnv(
            render_mode="human" if wants_ansi else render_mode
        )
        if wants_ansi:
            wrapped = wrappers.CaptureStdoutWrapper(wrapped)
        # Helps with error handling for discrete action spaces.
        wrapped = wrappers.AssertOutOfBoundsWrapper(wrapped)
        # Provides a wide variety of helpful user errors; strongly recommended.
        return wrappers.OrderEnforcingWrapper(wrapped)

    return PettingZooEnv(env(render_mode=render_mode))

  and should_run_async(code)


# 🐦 Prepare data

## 🐆 Create folders

### 🦂 Log folders

In [7]:
# Directory that receives the TensorBoard logs of every training run.
LOGS_PATH = os.path.join('/', 'kaggle', 'working', 'logs')
# Idempotent replacement for `!mkdir logs`: re-running the cell no longer
# fails when the folder already exists.
os.makedirs(LOGS_PATH, exist_ok=True)

### 🦆 Model folders

In [8]:
# Directory that receives the saved policy checkpoints (*.pth).
MODELS_PATH = os.path.join('/', 'kaggle', 'working', 'models')
# Idempotent replacement for `!mkdir models`: re-running the cell no longer
# fails when the folder already exists.
os.makedirs(MODELS_PATH, exist_ok=True)

## 🐠 Training parameters

In [9]:
# Candidate hidden-layer layouts for the Q-network; one of them is picked by
# index before training starts.
NET_ARCHS = [
    [64, 128, 128, 64],
    [1024, 2048, 2048, 1024],
    [2048, 4096, 4096, 2048],
    [2048, 4096, 8192, 4096, 2048],
]

# Trainer schedule (passed to OffpolicyTrainer).
MAX_EPOCHS = 4_000
STEP_PER_EPOCH = 10_000
STEP_PER_COLLECT = 1_000
EPISODE_PER_TEST = 200
UPDATE_PER_STEP = 1 / STEP_PER_COLLECT
BATCH_SIZE = 1024

# Optimizer learning rate and DQN discount factor.
LR = 1e-4
DF = 0.99

# Exploration rates. TEST_EPS is applied before each test phase; TRAIN_EPS is
# not referenced by the training code visible in this notebook.
TRAIN_EPS = 0.15
TEST_EPS = 0.05

# Number of parallel environments used for training and for testing.
TRAIN_NUM = 100
TEST_NUM = 10

# Seat index -> agent name.
agent_idx2name = dict(enumerate(("bastaushy", "qostaushy")))

SEED = 888

# 🐼 DQN agent to play vs a random policy agent https://pettingzoo.farama.org/tutorials/tianshou/intermediate/

## 🐜 Set net architectures

In [10]:
def arch2str(net_arch):
    """
    Encode a list of layer sizes as a string joined by 'x'.

    Example: [64, 128, 64] -> '64x128x64'. An empty sequence yields ''
    instead of raising IndexError.
    """
    return 'x'.join(str(size) for size in net_arch)

def str2arch(inp):
    """
    Decode a string of 'x'-separated layer sizes into a list of ints.

    Example: '64x128x64' -> [64, 128, 64].
    """
    return list(map(int, inp.split('x')))

## 🐫 Prepare main functions

In [11]:
def _get_agents_dqn(
    agent_train: Optional[BasePolicy] = None,
    agent_opponent: Optional[BasePolicy] = None,
    optim: Optional[torch.optim.Optimizer] = None,
    agent_type = None,
    net_arch = [64, 128, 128, 64]
) -> Tuple[BasePolicy, torch.optim.Optimizer, list]:
    """
    Assemble the two-seat policy manager used for training.

    Args:
        agent_train: policy to train; a new DQNPolicy with hidden layers
            `net_arch` is created when omitted.
        agent_opponent: opposing policy; a RandomPolicy when omitted.
        optim: optimizer for a newly created agent; Adam with learning rate
            LR when omitted. It is returned untouched (possibly None) when
            `agent_train` is supplied.
        agent_type: "qostaushy" seats the trained agent second; any other
            value seats it first.
        net_arch: hidden layer sizes of a newly created Q-network.

    Returns:
        (policy manager, optimizer, list of agent names of the environment).
    """
    env = _get_env()
    # The environment exposes a Dict space; the network only sees the
    # "observation" part of it.
    obs_space = env.observation_space
    if isinstance(obs_space, gymnasium.spaces.Dict):
        obs_space = obs_space["observation"]

    if agent_train is None:
        device = "cuda" if torch.cuda.is_available() else "cpu"
        # model
        q_net = Net(
            state_shape=obs_space.shape or obs_space.n,
            action_shape=env.action_space.shape or env.action_space.n,
            hidden_sizes=net_arch,
            device=device
        )
        if optim is None:
            optim = torch.optim.Adam(q_net.parameters(), lr=LR)
        agent_train = DQNPolicy(
            model=q_net,
            optim=optim,
            discount_factor=DF,
            estimation_step=3,
            target_update_freq=320,
            action_space=env.action_space,
            observation_space=obs_space
        ).to(device)

    if agent_opponent is None:
        agent_opponent = RandomPolicy(action_space=env.action_space)

    # Seat order follows env.agents: bastaushy first, qostaushy second.
    agents = (
        [agent_opponent, agent_train]
        if agent_type == "qostaushy"
        else [agent_train, agent_opponent]
    )
    policy = MultiAgentPolicyManager(policies=agents, env=env)
    return policy, optim, env.agents

In [12]:
def play_vs_random(agent, type, seed=777):
    """
    Play 100 episodes of `agent` against a RandomPolicy and print the global
    PLAYS win counters.

    Args:
        agent: policy under evaluation.
        type: 'qostaushy' seats `agent` second; any other value seats it first.
        seed: seed applied to the environment's action space.
    """
    game = _get_env()

    random_opponent = RandomPolicy(action_space=game.action_space)
    game.action_space.seed(seed)

    seats = [random_opponent, agent] if type == 'qostaushy' else [agent, random_opponent]
    manager = MultiAgentPolicyManager(policies=seats, env=game)

    # Single-worker vector env wrapping the instance created above.
    vec_env = DummyVectorEnv([lambda: game])
    Collector(manager, vec_env).collect(n_episode=100)

    print(f'Agent {type} vs random', PLAYS)

In [13]:
def play_vs_others(agent, type, opponents):
    """
    Play one episode of `agent` against every policy in `opponents` and print
    the global PLAYS win counters afterwards.

    Args:
        agent: policy under evaluation.
        type: "bastaushy" seats `agent` first; any other value seats it second.
        opponents: iterable of opposing policies.
    """
    for agent_opponent in opponents:
        env = _get_env()#render_mode='human')

        if type == "bastaushy":
            seats = [agent, agent_opponent]
        else:
            seats = [agent_opponent, agent]
        policies = MultiAgentPolicyManager(policies=seats, env=env)

        # Every vector-env worker gets its own environment instance. The
        # previous `lambda: env` handed the same object to all ten workers,
        # so they would have shared (and corrupted) a single board.
        test_envs = DummyVectorEnv([_get_env for _ in range(10)])

        collector = Collector(policies, test_envs, exploration_noise=False)
        collector.collect(n_episode=1)

    print(f'{type} plays vs others: {PLAYS}')

## 🐧 Training agents

### 🦚 Method train agent

In [14]:
def train_agent_dqn(index, net_arch, agent_idx, agent_train=None, agent_opponent=None, seed=777):
    """
    Train one DQN agent with Tianshou's OffpolicyTrainer and return it.

    Args:
        index: run identifier; used for the log sub-folder and the checkpoint
            file name.
        net_arch: hidden layer sizes of the Q-network.
        agent_idx: seat of the trained agent (0 = bastaushy, 1 = qostaushy).
        agent_train: existing policy to continue training; a new DQNPolicy is
            created when omitted.
        agent_opponent: opposing policy; a RandomPolicy when omitted.
        seed: seed for numpy, torch and the vectorized environments.

    Returns:
        The trained policy of seat `agent_idx`.

    Side effects: writes TensorBoard logs under LOGS_PATH/<index>, saves the
    best policy under MODELS_PATH and, on every new best, resets the global
    PLAYS counters and plays the saved policy against a random opponent.
    """
    # ======== Step 1: Environment setup =========
    train_envs = DummyVectorEnv([_get_env for _ in range(TRAIN_NUM)])
    test_envs = DummyVectorEnv([_get_env for _ in range(TEST_NUM)])

    # seed
    np.random.seed(seed)
    torch.manual_seed(seed)
    train_envs.seed(seed)
    test_envs.seed(seed)

    # ======== Step 2: Agent setup =========
    policy, optim, agents = _get_agents_dqn(agent_train=agent_train, agent_opponent=agent_opponent, agent_type=agent_idx2name[agent_idx], net_arch=net_arch)

    # Reference environment for the space lookups in save_best_fn. It is
    # created here so the function no longer depends on a notebook-global
    # `env` that only exists if another cell happened to run first.
    env = _get_env()
    device = "cuda" if torch.cuda.is_available() else "cpu"

    # ======== Step 3: Collector setup =========
    train_collector = Collector(
        policy,
        train_envs,
        VectorReplayBuffer(100_000, len(train_envs)),
        exploration_noise=True,
    )
    test_collector = Collector(policy, test_envs, exploration_noise=True)

    # Pre-fill the replay buffer before training starts.
    train_collector.collect(n_step=BATCH_SIZE * TRAIN_NUM)  # batch size * training_num
    log_path = os.path.join(LOGS_PATH, str(index))
    writer = SummaryWriter(log_path)
    logger = TensorboardLogger(writer)

    # ======== Step 4: Callback functions setup =========
    def save_best_fn(policy):
        """Save the trained seat's weights, then evaluate the saved file vs random."""
        model_save_path = os.path.join(MODELS_PATH, f'policy_dqn_{arch2str(net_arch)}_{index}.pth')
        os.makedirs(MODELS_PATH, exist_ok=True)

        torch.save(policy.policies[agents[agent_idx]].state_dict(), model_save_path)
        # Reset the win counters in place before the evaluation games.
        PLAYS["bastaushy"], PLAYS["qostaushy"] = 0, 0
        # Rebuild the policy from the checkpoint so the evaluation exercises
        # exactly what was written to disk. 23 = 18 pits + 2 tuzdyq +
        # 2 qazans + 1 side-to-move flag (see TogyzQumalaqEnv.observe).
        net = Net(
            state_shape=(23,),
            action_shape=env.action_space.shape or env.action_space.n,
            hidden_sizes=net_arch,
            device=device
        )

        agent_trained = DQNPolicy(
            model=net,
            optim=torch.optim.Adam(net.parameters(), lr=LR),
            discount_factor=DF,
            estimation_step=3,
            target_update_freq=320,
            action_space=env.action_space,
            observation_space=env.observation_space
        ).to(device)
        agent_trained.load_state_dict(torch.load(model_save_path, map_location=torch.device('cpu')))
        play_vs_random(agent_trained, agent_idx2name[agent_idx], seed=seed)

    def stop_fn(mean_rewards):
        """Stop training once the mean test reward reaches 160."""
        return mean_rewards >= 160

    def get_train_eps(epoch):
        """Exploration rate decaying linearly from 1.0 with the epoch, floored at 0.01."""
        return max(0.01, 1.0 - epoch / MAX_EPOCHS)

    def train_fn(epoch, env_step):
        """Apply the decayed exploration rate before each training phase."""
        policy.policies[agents[agent_idx]].set_eps(get_train_eps(epoch))

    def test_fn(epoch, env_step):
        """Apply the constant test exploration rate before each test phase."""
        policy.policies[agents[agent_idx]].set_eps(TEST_EPS)

    def reward_metric(rews):
        """Select the reward column of the trained seat."""
        return rews[:, agent_idx]

    # ======== Step 5: Run the trainer =========
    result = OffpolicyTrainer(
        policy=policy,
        train_collector=train_collector,
        test_collector=test_collector,
        max_epoch=MAX_EPOCHS,
        step_per_epoch=STEP_PER_EPOCH,
        step_per_collect=STEP_PER_COLLECT,
        episode_per_test=EPISODE_PER_TEST,
        batch_size=BATCH_SIZE,
        train_fn=train_fn,
        test_fn=test_fn,
        stop_fn=stop_fn,
        save_best_fn=save_best_fn,
        update_per_step=UPDATE_PER_STEP,
        test_in_train=False,
        reward_metric=reward_metric,
        verbose=True,
        show_progress=True,
        logger=logger
    ).run()

    print(f"\n==========Result==========\n{result}")
    print(f"\n(the trained policy can be accessed via policy.policies[agents[{agent_idx}]])")
    return policy.policies[agents[agent_idx]]

# 🐑 Self-play training

### 🐸 Download previous models

In [15]:
# urls = ["https://drive.google.com/uc?id=1gU6xKG0OsC9PnJ7v2nCqAh9p-U6rco4u",
#         "https://drive.google.com/uc?id=1El8_10upA_es-tyqQfGxzdHfvBEqIkc0",  
#         "https://drive.google.com/uc?id=1rGM4TK_QnjWdToy7fkUPmdgL0ybh4w3L",
#         "https://drive.google.com/uc?id=14bwcfNbiOVlNCpIguqpfCcHeeXtpZvsJ", 
#        ]

# for idx, url in enumerate(urls):
#     output = f'models/policy_dqn_2048x4096x4096x2048_{idx}.pth'
#     gdown.download(url, output, quiet=False);

### 🐊 Load trained models

In [16]:
# import copy

# env = _get_env()

# agents_trained = []

# net_arch = [2048, 4096, 4096, 2048]
# for idx in range(len(urls)):

#     agent_idx = idx % 2
#     net = Net(
#             state_shape=(23,),
#             action_shape=env.action_space.shape or env.action_space.n,
#             hidden_sizes=net_arch,
#             device="cuda" if torch.cuda.is_available() else "cpu"
#             )
            
#     agent_trained1 = DQNPolicy(
#             model=net,
#             optim = torch.optim.Adam(net.parameters(), lr=LR),
#             discount_factor=DF,
#             estimation_step=3,
#             target_update_freq=320,
#             action_space=env.action_space,
#             observation_space=env.observation_space
#     ).to("cuda" if torch.cuda.is_available() else "cpu")
    
#     arch_name = arch2str(net_arch)
#     agent_trained1.load_state_dict(torch.load(f"{MODELS_PATH}/policy_dqn_{arch_name}_{idx}.pth", map_location=torch.device('cpu')))
#     agent_trained = copy.deepcopy(agent_trained1)
#     agents_trained.append(agent_trained)

### 🪲 Play trained models

In [17]:
# for i in range(len(agents_trained)):
    
#     agent_idx = i % 2
    
#     PLAYS = {"bastaushy": 0, "qostaushy": 0}
    
#     play_vs_others(agents_trained[i], agent_idx2name[agent_idx], agents_trained[(agent_idx + 1) % 2 : len(agents_trained) : 2])
#     print('----------------------------------------------')

### 🪲 Play trained models vs random

In [18]:
# for i in range(len(agents_trained)):
#     agent_idx = i % 2
#     # play trained agents with random policy
#     PLAYS = {"bastaushy": 0, "qostaushy": 0}
#     play_vs_random(agents_trained[i], agent_idx2name[agent_idx], seed=891)

## 🐣 Main loop

### 🐟 Set parameters 

In [19]:
# Hidden-layer layout used for every network in the self-play loop
# (index 3 is the last entry of NET_ARCHS).
net_arch = NET_ARCHS[3]

# String form of the layout; it is part of the checkpoint file names.
arch_name = arch2str(net_arch)

### 🐨 Self-play

In [None]:
def _make_dqn_policy(env, hidden_sizes):
    """Build a freshly initialised DQNPolicy for `env` with its own Adam optimizer."""
    device = "cuda" if torch.cuda.is_available() else "cpu"
    # 23 = 18 pits + 2 tuzdyq + 2 qazans + 1 side-to-move flag.
    net = Net(
        state_shape=(23,),
        action_shape=env.action_space.shape or env.action_space.n,
        hidden_sizes=hidden_sizes,
        device=device
    )
    return DQNPolicy(
        model=net,
        # The optimizer is bound to this network's own parameters; the
        # opponent/previous policies used to be wired to another network's.
        optim=torch.optim.Adam(net.parameters(), lr=LR),
        discount_factor=DF,
        estimation_step=3,
        target_update_freq=320,
        action_space=env.action_space,
        observation_space=env.observation_space
    ).to(device)


# Rounds alternate between the two seats: even rounds train bastaushy, odd
# rounds train qostaushy. Round i saves its best policy under index i.
for i in range(2):
    agent_idx = i % 2
    ### training
    env = _get_env()

    # NOTE(review): this policy is only used by code that is currently
    # disabled; kept so the name stays defined for later cells.
    agent_trained = _make_dqn_policy(env, net_arch)

    if i > 1:
        agent_trained_opp = _make_dqn_policy(env, net_arch)
        agent_trained_prev = _make_dqn_policy(env, net_arch)

        ###load policy models
        # Opponents are the checkpoints of the other seat saved by earlier
        # rounds, i.e. indices below i with the opposite parity. This replaces
        # a lookup on `agents_trained`, which no active cell defines.
        idx_opp = random.choice(range((i + 1) % 2, i, 2))
        agent_trained_opp.load_state_dict(torch.load(f"{MODELS_PATH}/policy_dqn_{arch_name}_{idx_opp}.pth", map_location=torch.device('cpu')))
        # Previous checkpoint of the same seat (loaded but not passed on; the
        # round below starts from a new network).
        agent_trained_prev.load_state_dict(torch.load(f"{MODELS_PATH}/policy_dqn_{arch_name}_{i-2}.pth", map_location=torch.device('cpu')))

        print(f'{agent_idx2name[agent_idx]} is training vs {agent_idx2name[(agent_idx + 1) % 2]} {idx_opp}')
        train_agent_dqn(index=i, net_arch=net_arch, agent_idx=agent_idx, agent_opponent=agent_trained_opp, seed=SEED)
    else:
        # The first round of each seat trains against a random opponent.
        train_agent_dqn(index=i, net_arch=net_arch, agent_idx=agent_idx)

  agent_trained.load_state_dict(torch.load(f"{MODELS_PATH}/policy_dqn_{arch2str(net_arch)}_{index}.pth", map_location=torch.device('cpu')))


Agent bastaushy vs random {'bastaushy': 11, 'qostaushy': 89}


Epoch #1: 10001it [00:03, 2813.73it/s, bastaushy/loss=112.376, env_step=10000, len=135, n/ep=17, n/st=1000, rew=-6.47]                           
  agent_trained.load_state_dict(torch.load(f"{MODELS_PATH}/policy_dqn_{arch2str(net_arch)}_{index}.pth", map_location=torch.device('cpu')))


Agent bastaushy vs random {'bastaushy': 26, 'qostaushy': 74}
Epoch #1: test_reward: -61.985000 ± 74.022056, best_reward: -61.985000 ± 74.022056 in #1


Epoch #2: 10001it [00:02, 4525.38it/s, bastaushy/loss=120.483, env_step=20000, len=106, n/ep=10, n/st=1000, rew=19.00]                           


Epoch #2: test_reward: -77.695000 ± 61.040249, best_reward: -61.985000 ± 74.022056 in #1


Epoch #3: 10001it [00:02, 4457.85it/s, bastaushy/loss=118.002, env_step=30000, len=113, n/ep=8, n/st=1000, rew=44.50]                           


Epoch #3: test_reward: -73.480000 ± 71.138735, best_reward: -61.985000 ± 74.022056 in #1


Epoch #4: 10001it [00:02, 4406.91it/s, bastaushy/loss=117.743, env_step=40000, len=118, n/ep=8, n/st=1000, rew=-0.12]                           


Agent bastaushy vs random {'bastaushy': 17, 'qostaushy': 83}
Epoch #4: test_reward: -41.975000 ± 85.702243, best_reward: -41.975000 ± 85.702243 in #4


Epoch #5: 10001it [00:02, 4416.40it/s, bastaushy/loss=115.810, env_step=50000, len=143, n/ep=6, n/st=1000, rew=-3.67]                            


Epoch #5: test_reward: -71.885000 ± 73.300967, best_reward: -41.975000 ± 85.702243 in #4


Epoch #6: 10001it [00:02, 4411.46it/s, bastaushy/loss=115.632, env_step=60000, len=96, n/ep=6, n/st=1000, rew=-1.00]                             


Epoch #6: test_reward: -71.980000 ± 70.445721, best_reward: -41.975000 ± 85.702243 in #4


Epoch #7: 10001it [00:02, 4481.52it/s, bastaushy/loss=116.998, env_step=70000, len=127, n/ep=12, n/st=1000, rew=-26.92]                           


Epoch #7: test_reward: -51.665000 ± 79.837665, best_reward: -41.975000 ± 85.702243 in #4


Epoch #8: 10001it [00:02, 4422.27it/s, bastaushy/loss=116.561, env_step=80000, len=123, n/ep=7, n/st=1000, rew=12.57]                            


Epoch #8: test_reward: -56.625000 ± 77.573219, best_reward: -41.975000 ± 85.702243 in #4


Epoch #9: 10001it [00:02, 4483.43it/s, bastaushy/loss=116.718, env_step=90000, len=162, n/ep=4, n/st=1000, rew=84.50]                            


Epoch #9: test_reward: -81.625000 ± 59.120338, best_reward: -41.975000 ± 85.702243 in #4


Epoch #10: 10001it [00:02, 4355.38it/s, bastaushy/loss=117.511, env_step=100000, len=99, n/ep=8, n/st=1000, rew=1.62]                             


Agent bastaushy vs random {'bastaushy': 41, 'qostaushy': 59}
Epoch #10: test_reward: -40.175000 ± 82.674448, best_reward: -40.175000 ± 82.674448 in #10


Epoch #11: 10001it [00:02, 4427.47it/s, bastaushy/loss=118.565, env_step=110000, len=119, n/ep=10, n/st=1000, rew=35.50]                           


Agent bastaushy vs random {'bastaushy': 48, 'qostaushy': 52}
Epoch #11: test_reward: -1.450000 ± 86.758213, best_reward: -1.450000 ± 86.758213 in #11


Epoch #12: 10001it [00:02, 4608.00it/s, bastaushy/loss=119.749, env_step=120000, len=123, n/ep=4, n/st=1000, rew=1.00]                             


Epoch #12: test_reward: -48.990000 ± 79.531880, best_reward: -1.450000 ± 86.758213 in #11


Epoch #13: 10001it [00:02, 4571.52it/s, bastaushy/loss=120.632, env_step=130000, len=127, n/ep=10, n/st=1000, rew=-2.20]                           


Epoch #13: test_reward: -32.935000 ± 84.388629, best_reward: -1.450000 ± 86.758213 in #11


Epoch #14: 10001it [00:02, 4449.42it/s, bastaushy/loss=123.024, env_step=140000, len=156, n/ep=2, n/st=1000, rew=2.00]                             


Epoch #14: test_reward: -50.190000 ± 79.096295, best_reward: -1.450000 ± 86.758213 in #11


Epoch #15: 10001it [00:02, 4404.14it/s, bastaushy/loss=125.265, env_step=150000, len=127, n/ep=11, n/st=1000, rew=6.09]                            


Epoch #15: test_reward: -97.750000 ± 43.052729, best_reward: -1.450000 ± 86.758213 in #11


Epoch #16: 10001it [00:02, 4518.03it/s, bastaushy/loss=126.121, env_step=160000, len=140, n/ep=5, n/st=1000, rew=-15.60]                           


Epoch #16: test_reward: -67.635000 ± 66.485425, best_reward: -1.450000 ± 86.758213 in #11


Epoch #17: 10001it [00:02, 4597.73it/s, bastaushy/loss=125.496, env_step=170000, len=127, n/ep=9, n/st=1000, rew=-4.33]                           


Epoch #17: test_reward: -65.405000 ± 70.275252, best_reward: -1.450000 ± 86.758213 in #11


Epoch #18: 10001it [00:02, 4634.70it/s, bastaushy/loss=126.364, env_step=180000, len=101, n/ep=9, n/st=1000, rew=-20.44]                           


Epoch #18: test_reward: -55.900000 ± 81.239153, best_reward: -1.450000 ± 86.758213 in #11


Epoch #19: 10001it [00:02, 4455.56it/s, bastaushy/loss=125.718, env_step=190000, len=92, n/ep=6, n/st=1000, rew=93.33]                             


Epoch #19: test_reward: -71.105000 ± 69.850655, best_reward: -1.450000 ± 86.758213 in #11


Epoch #20: 10001it [00:02, 4424.75it/s, bastaushy/loss=126.848, env_step=200000, len=103, n/ep=7, n/st=1000, rew=38.86]                            


Epoch #20: test_reward: -73.815000 ± 67.331128, best_reward: -1.450000 ± 86.758213 in #11


Epoch #21: 10001it [00:02, 4469.12it/s, bastaushy/loss=126.887, env_step=210000, len=139, n/ep=7, n/st=1000, rew=-13.14]                           


Epoch #21: test_reward: -54.390000 ± 79.592009, best_reward: -1.450000 ± 86.758213 in #11


Epoch #22: 10001it [00:02, 4488.43it/s, bastaushy/loss=124.502, env_step=220000, len=108, n/ep=5, n/st=1000, rew=58.00]                            


Epoch #22: test_reward: -42.135000 ± 84.616705, best_reward: -1.450000 ± 86.758213 in #11


Epoch #23: 10001it [00:02, 4466.04it/s, bastaushy/loss=125.061, env_step=230000, len=159, n/ep=10, n/st=1000, rew=5.50]                           


Epoch #23: test_reward: -56.155000 ± 77.700392, best_reward: -1.450000 ± 86.758213 in #11


Epoch #24: 10001it [00:02, 4422.29it/s, bastaushy/loss=124.475, env_step=240000, len=105, n/ep=9, n/st=1000, rew=-11.67]                           


Epoch #24: test_reward: -53.735000 ± 77.458600, best_reward: -1.450000 ± 86.758213 in #11


Epoch #25: 10001it [00:02, 4464.30it/s, bastaushy/loss=125.316, env_step=250000, len=110, n/ep=7, n/st=1000, rew=65.57]                             


Epoch #25: test_reward: -72.080000 ± 65.867015, best_reward: -1.450000 ± 86.758213 in #11


Epoch #26: 10001it [00:02, 4458.98it/s, bastaushy/loss=125.203, env_step=260000, len=125, n/ep=8, n/st=1000, rew=1.25]                             


Epoch #26: test_reward: -73.650000 ± 61.849313, best_reward: -1.450000 ± 86.758213 in #11


Epoch #27: 10001it [00:02, 4454.49it/s, bastaushy/loss=125.727, env_step=270000, len=140, n/ep=4, n/st=1000, rew=38.75]                            


Epoch #27: test_reward: -72.065000 ± 66.410095, best_reward: -1.450000 ± 86.758213 in #11


Epoch #28: 10001it [00:02, 4495.00it/s, bastaushy/loss=126.303, env_step=280000, len=113, n/ep=8, n/st=1000, rew=40.62]                            


Epoch #28: test_reward: -49.810000 ± 80.393370, best_reward: -1.450000 ± 86.758213 in #11


Epoch #29: 10001it [00:02, 4413.17it/s, bastaushy/loss=128.129, env_step=290000, len=99, n/ep=10, n/st=1000, rew=37.80]                            


Epoch #29: test_reward: -52.140000 ± 78.135270, best_reward: -1.450000 ± 86.758213 in #11


Epoch #30: 10001it [00:02, 4424.34it/s, bastaushy/loss=126.362, env_step=300000, len=111, n/ep=6, n/st=1000, rew=35.67]                            


Epoch #30: test_reward: -52.565000 ± 78.221134, best_reward: -1.450000 ± 86.758213 in #11


Epoch #31: 10001it [00:02, 4464.92it/s, bastaushy/loss=124.852, env_step=310000, len=103, n/ep=7, n/st=1000, rew=-14.71]                           


Epoch #31: test_reward: -51.155000 ± 78.460888, best_reward: -1.450000 ± 86.758213 in #11


Epoch #32: 10001it [00:02, 4423.39it/s, bastaushy/loss=124.444, env_step=320000, len=125, n/ep=14, n/st=1000, rew=-12.36]                           


Epoch #32: test_reward: -50.150000 ± 78.914115, best_reward: -1.450000 ± 86.758213 in #11


Epoch #33: 10001it [00:02, 4487.03it/s, bastaushy/loss=127.362, env_step=330000, len=149, n/ep=7, n/st=1000, rew=65.00]                            


Epoch #33: test_reward: -84.495000 ± 50.314312, best_reward: -1.450000 ± 86.758213 in #11


Epoch #34: 10001it [00:02, 4490.33it/s, bastaushy/loss=126.637, env_step=340000, len=142, n/ep=11, n/st=1000, rew=61.36]                           


Epoch #34: test_reward: -66.790000 ± 69.809855, best_reward: -1.450000 ± 86.758213 in #11


Epoch #35: 10001it [00:02, 4482.72it/s, bastaushy/loss=124.831, env_step=350000, len=128, n/ep=10, n/st=1000, rew=-29.90]                           


Epoch #35: test_reward: -56.890000 ± 76.332810, best_reward: -1.450000 ± 86.758213 in #11


Epoch #36: 10001it [00:02, 4533.63it/s, bastaushy/loss=122.546, env_step=360000, len=129, n/ep=4, n/st=1000, rew=57.00]                            


Epoch #36: test_reward: -66.175000 ± 69.184784, best_reward: -1.450000 ± 86.758213 in #11


Epoch #37: 10001it [00:02, 4527.96it/s, bastaushy/loss=118.917, env_step=370000, len=88, n/ep=9, n/st=1000, rew=12.56]                             


Epoch #37: test_reward: -31.880000 ± 85.757773, best_reward: -1.450000 ± 86.758213 in #11


Epoch #38: 10001it [00:02, 4364.07it/s, bastaushy/loss=117.005, env_step=380000, len=125, n/ep=2, n/st=1000, rew=94.00]                            


Epoch #38: test_reward: -77.095000 ± 59.637371, best_reward: -1.450000 ± 86.758213 in #11


Epoch #39: 10001it [00:02, 4437.10it/s, bastaushy/loss=113.125, env_step=390000, len=118, n/ep=10, n/st=1000, rew=-1.80]                           


Epoch #39: test_reward: -58.110000 ± 76.999272, best_reward: -1.450000 ± 86.758213 in #11


Epoch #40: 10001it [00:02, 4476.29it/s, bastaushy/loss=110.700, env_step=400000, len=130, n/ep=8, n/st=1000, rew=-2.38]                           


Epoch #40: test_reward: -67.280000 ± 69.933051, best_reward: -1.450000 ± 86.758213 in #11


Epoch #41: 10001it [00:02, 4579.01it/s, bastaushy/loss=110.735, env_step=410000, len=135, n/ep=7, n/st=1000, rew=-7.14]                            


Epoch #41: test_reward: -73.765000 ± 67.471177, best_reward: -1.450000 ± 86.758213 in #11


Epoch #42: 10001it [00:02, 4424.09it/s, bastaushy/loss=109.333, env_step=420000, len=125, n/ep=7, n/st=1000, rew=37.43]                            


Epoch #42: test_reward: -68.715000 ± 68.873535, best_reward: -1.450000 ± 86.758213 in #11


Epoch #43: 10001it [00:02, 4503.62it/s, bastaushy/loss=104.146, env_step=430000, len=136, n/ep=10, n/st=1000, rew=14.40]                           


Epoch #43: test_reward: -70.240000 ± 67.312201, best_reward: -1.450000 ± 86.758213 in #11


Epoch #44: 10001it [00:02, 4434.59it/s, bastaushy/loss=102.324, env_step=440000, len=112, n/ep=12, n/st=1000, rew=-2.75]                           


Epoch #44: test_reward: -53.235000 ± 79.543069, best_reward: -1.450000 ± 86.758213 in #11


Epoch #45: 10001it [00:02, 4444.05it/s, bastaushy/loss=99.173, env_step=450000, len=138, n/ep=6, n/st=1000, rew=-9.17]                             


Epoch #45: test_reward: -56.960000 ± 77.177447, best_reward: -1.450000 ± 86.758213 in #11


Epoch #46: 10001it [00:02, 4650.15it/s, bastaushy/loss=98.205, env_step=460000, len=123, n/ep=8, n/st=1000, rew=45.00]                            


Epoch #46: test_reward: -61.940000 ± 72.817899, best_reward: -1.450000 ± 86.758213 in #11


Epoch #47: 10001it [00:02, 4592.44it/s, bastaushy/loss=99.969, env_step=470000, len=108, n/ep=9, n/st=1000, rew=74.89]                            


Epoch #47: test_reward: -68.335000 ± 69.903096, best_reward: -1.450000 ± 86.758213 in #11


Epoch #48: 10001it [00:02, 4582.55it/s, bastaushy/loss=98.948, env_step=480000, len=132, n/ep=13, n/st=1000, rew=9.00]                            


Epoch #48: test_reward: -66.425000 ± 71.162521, best_reward: -1.450000 ± 86.758213 in #11


Epoch #49: 10001it [00:02, 4590.78it/s, bastaushy/loss=99.742, env_step=490000, len=133, n/ep=13, n/st=1000, rew=-3.69]                           


Epoch #49: test_reward: -57.555000 ± 74.575579, best_reward: -1.450000 ± 86.758213 in #11


Epoch #50: 10001it [00:02, 4488.82it/s, bastaushy/loss=100.327, env_step=500000, len=112, n/ep=9, n/st=1000, rew=-11.89]                           


Epoch #50: test_reward: -74.395000 ± 63.646594, best_reward: -1.450000 ± 86.758213 in #11


Epoch #51: 10001it [00:02, 4580.67it/s, bastaushy/loss=98.892, env_step=510000, len=106, n/ep=4, n/st=1000, rew=-3.00]                            


Epoch #51: test_reward: -75.275000 ± 62.475030, best_reward: -1.450000 ± 86.758213 in #11


Epoch #52: 10001it [00:02, 4567.77it/s, bastaushy/loss=99.119, env_step=520000, len=102, n/ep=8, n/st=1000, rew=23.25]                            


Epoch #52: test_reward: -84.120000 ± 53.356121, best_reward: -1.450000 ± 86.758213 in #11


Epoch #53: 10001it [00:02, 4513.73it/s, bastaushy/loss=98.730, env_step=530000, len=115, n/ep=12, n/st=1000, rew=50.17]                           


Epoch #53: test_reward: -78.160000 ± 62.195855, best_reward: -1.450000 ± 86.758213 in #11


Epoch #54: 10001it [00:02, 4469.82it/s, bastaushy/loss=100.007, env_step=540000, len=127, n/ep=9, n/st=1000, rew=70.56]                           


Epoch #54: test_reward: -81.130000 ± 57.312766, best_reward: -1.450000 ± 86.758213 in #11


Epoch #55: 10001it [00:02, 4514.02it/s, bastaushy/loss=102.171, env_step=550000, len=102, n/ep=10, n/st=1000, rew=1.50]                           


Epoch #55: test_reward: -73.670000 ± 63.776101, best_reward: -1.450000 ± 86.758213 in #11


Epoch #56: 10001it [00:02, 4529.12it/s, bastaushy/loss=104.065, env_step=560000, len=135, n/ep=3, n/st=1000, rew=-40.00]                           


Epoch #56: test_reward: -70.880000 ± 66.964659, best_reward: -1.450000 ± 86.758213 in #11


Epoch #57: 10001it [00:02, 4515.30it/s, bastaushy/loss=103.550, env_step=570000, len=136, n/ep=9, n/st=1000, rew=-12.67]                           


Epoch #57: test_reward: -81.665000 ± 58.348974, best_reward: -1.450000 ± 86.758213 in #11


Epoch #58: 10001it [00:02, 4550.17it/s, bastaushy/loss=103.735, env_step=580000, len=147, n/ep=10, n/st=1000, rew=37.20]                           


Epoch #58: test_reward: -77.720000 ± 60.129790, best_reward: -1.450000 ± 86.758213 in #11


Epoch #59: 10001it [00:02, 4493.41it/s, bastaushy/loss=102.778, env_step=590000, len=104, n/ep=10, n/st=1000, rew=19.90]                           


Epoch #59: test_reward: -77.015000 ± 63.860823, best_reward: -1.450000 ± 86.758213 in #11


Epoch #60: 10001it [00:02, 4527.76it/s, bastaushy/loss=103.262, env_step=600000, len=141, n/ep=9, n/st=1000, rew=-33.22]                           


Epoch #60: test_reward: -79.750000 ± 56.642718, best_reward: -1.450000 ± 86.758213 in #11


Epoch #61: 10001it [00:02, 4511.86it/s, bastaushy/loss=102.821, env_step=610000, len=145, n/ep=9, n/st=1000, rew=32.33]                           


Epoch #61: test_reward: -65.075000 ± 72.645711, best_reward: -1.450000 ± 86.758213 in #11


Epoch #62: 10001it [00:02, 4506.41it/s, bastaushy/loss=102.196, env_step=620000, len=106, n/ep=6, n/st=1000, rew=-3.33]                           


Epoch #62: test_reward: -81.040000 ± 58.509216, best_reward: -1.450000 ± 86.758213 in #11


Epoch #63: 10001it [00:02, 4420.55it/s, bastaushy/loss=101.109, env_step=630000, len=134, n/ep=7, n/st=1000, rew=20.00]                           


Epoch #63: test_reward: -64.670000 ± 70.441118, best_reward: -1.450000 ± 86.758213 in #11


Epoch #64: 10001it [00:02, 4446.29it/s, bastaushy/loss=97.867, env_step=640000, len=120, n/ep=2, n/st=1000, rew=19.00]                            


Epoch #64: test_reward: -59.500000 ± 74.808556, best_reward: -1.450000 ± 86.758213 in #11


Epoch #65: 10001it [00:02, 4422.69it/s, bastaushy/loss=97.440, env_step=650000, len=134, n/ep=6, n/st=1000, rew=57.00]                            


Epoch #65: test_reward: -74.260000 ± 64.476836, best_reward: -1.450000 ± 86.758213 in #11


Epoch #66: 10001it [00:02, 4450.55it/s, bastaushy/loss=95.963, env_step=660000, len=137, n/ep=6, n/st=1000, rew=-58.50]                           


Epoch #66: test_reward: -65.210000 ± 70.387470, best_reward: -1.450000 ± 86.758213 in #11


Epoch #67: 10001it [00:02, 4414.98it/s, bastaushy/loss=95.282, env_step=670000, len=146, n/ep=7, n/st=1000, rew=19.14]                            


Epoch #67: test_reward: -50.235000 ± 77.505676, best_reward: -1.450000 ± 86.758213 in #11


Epoch #68: 10001it [00:02, 4475.44it/s, bastaushy/loss=95.392, env_step=680000, len=118, n/ep=7, n/st=1000, rew=62.71]                            


Epoch #68: test_reward: -56.860000 ± 74.527716, best_reward: -1.450000 ± 86.758213 in #11


Epoch #69: 10001it [00:02, 4506.40it/s, bastaushy/loss=96.289, env_step=690000, len=138, n/ep=8, n/st=1000, rew=24.62]                           


Epoch #69: test_reward: -81.415000 ± 53.873396, best_reward: -1.450000 ± 86.758213 in #11


Epoch #70: 10001it [00:02, 4257.20it/s, bastaushy/loss=96.582, env_step=700000, len=120, n/ep=12, n/st=1000, rew=30.42]                           


Epoch #70: test_reward: -72.420000 ± 62.539776, best_reward: -1.450000 ± 86.758213 in #11


Epoch #71: 10001it [00:02, 4461.99it/s, bastaushy/loss=96.641, env_step=710000, len=121, n/ep=7, n/st=1000, rew=40.00]                            


Epoch #71: test_reward: -66.415000 ± 73.740442, best_reward: -1.450000 ± 86.758213 in #11


Epoch #72: 10001it [00:02, 4472.81it/s, bastaushy/loss=96.263, env_step=720000, len=157, n/ep=8, n/st=1000, rew=-6.62]                           


Epoch #72: test_reward: -67.525000 ± 69.929746, best_reward: -1.450000 ± 86.758213 in #11


Epoch #73: 10001it [00:02, 4492.56it/s, bastaushy/loss=96.914, env_step=730000, len=115, n/ep=5, n/st=1000, rew=20.20]                           


Epoch #73: test_reward: -71.375000 ± 67.524546, best_reward: -1.450000 ± 86.758213 in #11


Epoch #74: 10001it [00:02, 4457.81it/s, bastaushy/loss=97.827, env_step=740000, len=120, n/ep=12, n/st=1000, rew=17.67]                           


Epoch #74: test_reward: -71.340000 ± 66.746793, best_reward: -1.450000 ± 86.758213 in #11


Epoch #75: 10001it [00:02, 4490.70it/s, bastaushy/loss=96.883, env_step=750000, len=123, n/ep=11, n/st=1000, rew=21.91]                           


Epoch #75: test_reward: -82.775000 ± 54.677275, best_reward: -1.450000 ± 86.758213 in #11


Epoch #76: 10001it [00:02, 4454.80it/s, bastaushy/loss=96.422, env_step=760000, len=100, n/ep=6, n/st=1000, rew=34.83]                            


Epoch #76: test_reward: -68.225000 ± 69.306813, best_reward: -1.450000 ± 86.758213 in #11


Epoch #77: 10001it [00:02, 4491.12it/s, bastaushy/loss=96.624, env_step=770000, len=115, n/ep=14, n/st=1000, rew=26.57]                           


Epoch #77: test_reward: -74.365000 ± 62.145489, best_reward: -1.450000 ± 86.758213 in #11


Epoch #78: 10001it [00:02, 4591.45it/s, bastaushy/loss=96.683, env_step=780000, len=124, n/ep=6, n/st=1000, rew=33.83]                             


Epoch #78: test_reward: -79.255000 ± 58.611773, best_reward: -1.450000 ± 86.758213 in #11


Epoch #79: 10001it [00:02, 4573.86it/s, bastaushy/loss=95.151, env_step=790000, len=127, n/ep=10, n/st=1000, rew=-0.30]                           


Epoch #79: test_reward: -63.410000 ± 72.112287, best_reward: -1.450000 ± 86.758213 in #11


Epoch #80: 10001it [00:02, 4489.49it/s, bastaushy/loss=93.542, env_step=800000, len=119, n/ep=8, n/st=1000, rew=-44.38]                           


Epoch #80: test_reward: -66.755000 ± 71.305645, best_reward: -1.450000 ± 86.758213 in #11


Epoch #81: 10001it [00:02, 4594.03it/s, bastaushy/loss=92.791, env_step=810000, len=96, n/ep=9, n/st=1000, rew=12.22]                            


Epoch #81: test_reward: -80.375000 ± 55.242324, best_reward: -1.450000 ± 86.758213 in #11


Epoch #82: 10001it [00:02, 4461.40it/s, bastaushy/loss=92.673, env_step=820000, len=109, n/ep=8, n/st=1000, rew=-2.12]                           


Epoch #82: test_reward: -63.100000 ± 73.215504, best_reward: -1.450000 ± 86.758213 in #11


Epoch #83: 10001it [00:02, 4437.23it/s, bastaushy/loss=92.310, env_step=830000, len=117, n/ep=8, n/st=1000, rew=44.75]                            


Epoch #83: test_reward: -62.160000 ± 75.392403, best_reward: -1.450000 ± 86.758213 in #11


Epoch #84: 10001it [00:02, 4467.13it/s, bastaushy/loss=91.697, env_step=840000, len=115, n/ep=5, n/st=1000, rew=-94.20]                           


Epoch #84: test_reward: -66.450000 ± 68.966640, best_reward: -1.450000 ± 86.758213 in #11


Epoch #85: 10001it [00:02, 4529.19it/s, bastaushy/loss=91.452, env_step=850000, len=131, n/ep=10, n/st=1000, rew=18.70]                           


Epoch #85: test_reward: -73.730000 ± 64.680346, best_reward: -1.450000 ± 86.758213 in #11


Epoch #86: 10001it [00:02, 4430.35it/s, bastaushy/loss=90.739, env_step=860000, len=107, n/ep=8, n/st=1000, rew=2.12]                            


Epoch #86: test_reward: -69.190000 ± 69.024227, best_reward: -1.450000 ± 86.758213 in #11


Epoch #87: 10001it [00:02, 4491.27it/s, bastaushy/loss=89.597, env_step=870000, len=128, n/ep=9, n/st=1000, rew=8.56]                              


Epoch #87: test_reward: -62.890000 ± 73.366327, best_reward: -1.450000 ± 86.758213 in #11


Epoch #88: 10001it [00:02, 4439.32it/s, bastaushy/loss=87.755, env_step=880000, len=129, n/ep=8, n/st=1000, rew=2.50]                            


Epoch #88: test_reward: -74.170000 ± 63.920741, best_reward: -1.450000 ± 86.758213 in #11


Epoch #89: 10001it [00:02, 4461.55it/s, bastaushy/loss=89.057, env_step=890000, len=118, n/ep=7, n/st=1000, rew=-34.86]                           


Epoch #89: test_reward: -69.430000 ± 67.652976, best_reward: -1.450000 ± 86.758213 in #11


Epoch #90: 10001it [00:02, 4315.22it/s, bastaushy/loss=89.193, env_step=900000, len=150, n/ep=10, n/st=1000, rew=22.50]                           


Epoch #90: test_reward: -76.365000 ± 60.706357, best_reward: -1.450000 ± 86.758213 in #11


Epoch #91: 10001it [00:02, 4462.24it/s, bastaushy/loss=89.338, env_step=910000, len=126, n/ep=6, n/st=1000, rew=60.67]                           


Epoch #91: test_reward: -48.955000 ± 78.753178, best_reward: -1.450000 ± 86.758213 in #11


Epoch #92: 10001it [00:02, 4469.27it/s, bastaushy/loss=88.749, env_step=920000, len=137, n/ep=12, n/st=1000, rew=-25.67]                           


Epoch #92: test_reward: -57.155000 ± 75.500669, best_reward: -1.450000 ± 86.758213 in #11


Epoch #93: 10001it [00:02, 4307.70it/s, bastaushy/loss=88.087, env_step=930000, len=109, n/ep=6, n/st=1000, rew=-33.17]                           


Epoch #93: test_reward: -47.490000 ± 80.322039, best_reward: -1.450000 ± 86.758213 in #11


Epoch #94: 10001it [00:02, 4454.17it/s, bastaushy/loss=86.924, env_step=940000, len=151, n/ep=11, n/st=1000, rew=-11.18]                           


Epoch #94: test_reward: -34.250000 ± 82.991912, best_reward: -1.450000 ± 86.758213 in #11


Epoch #95: 10001it [00:02, 4500.98it/s, bastaushy/loss=85.940, env_step=950000, len=131, n/ep=10, n/st=1000, rew=1.70]                           


Epoch #95: test_reward: -32.430000 ± 84.377634, best_reward: -1.450000 ± 86.758213 in #11


Epoch #96: 10001it [00:02, 4342.49it/s, bastaushy/loss=86.772, env_step=960000, len=100, n/ep=10, n/st=1000, rew=20.10]                           


Epoch #96: test_reward: -29.085000 ± 84.932372, best_reward: -1.450000 ± 86.758213 in #11


Epoch #97: 10001it [00:02, 4497.01it/s, bastaushy/loss=88.206, env_step=970000, len=102, n/ep=8, n/st=1000, rew=22.75]                           


Epoch #97: test_reward: -18.970000 ± 87.236226, best_reward: -1.450000 ± 86.758213 in #11


Epoch #98: 10001it [00:02, 4556.11it/s, bastaushy/loss=88.449, env_step=980000, len=124, n/ep=10, n/st=1000, rew=56.40]                           


Epoch #98: test_reward: -46.310000 ± 82.179340, best_reward: -1.450000 ± 86.758213 in #11


Epoch #99: 10001it [00:02, 4360.99it/s, bastaushy/loss=87.683, env_step=990000, len=92, n/ep=6, n/st=1000, rew=66.00]                             


Epoch #99: test_reward: -30.215000 ± 86.200805, best_reward: -1.450000 ± 86.758213 in #11


Epoch #100: 10001it [00:02, 4569.53it/s, bastaushy/loss=87.185, env_step=1000000, len=140, n/ep=3, n/st=1000, rew=-27.00]                           


Epoch #100: test_reward: -41.975000 ± 84.216651, best_reward: -1.450000 ± 86.758213 in #11


Epoch #101: 10001it [00:02, 4555.25it/s, bastaushy/loss=86.413, env_step=1010000, len=132, n/ep=15, n/st=1000, rew=10.27]                           


Epoch #101: test_reward: -33.515000 ± 85.779309, best_reward: -1.450000 ± 86.758213 in #11


Epoch #102: 10001it [00:02, 4392.01it/s, bastaushy/loss=85.775, env_step=1020000, len=129, n/ep=4, n/st=1000, rew=6.50]                             


Epoch #102: test_reward: -50.340000 ± 80.259232, best_reward: -1.450000 ± 86.758213 in #11


Epoch #103: 10001it [00:02, 4469.53it/s, bastaushy/loss=85.041, env_step=1030000, len=96, n/ep=8, n/st=1000, rew=70.50]                            


Epoch #103: test_reward: -55.915000 ± 76.918449, best_reward: -1.450000 ± 86.758213 in #11


Epoch #104: 10001it [00:02, 4529.48it/s, bastaushy/loss=85.156, env_step=1040000, len=110, n/ep=12, n/st=1000, rew=1.67]                            


Epoch #104: test_reward: -24.855000 ± 86.882300, best_reward: -1.450000 ± 86.758213 in #11


Epoch #105: 10001it [00:02, 4347.09it/s, bastaushy/loss=84.727, env_step=1050000, len=114, n/ep=9, n/st=1000, rew=30.56]                           


Epoch #105: test_reward: -35.010000 ± 84.813029, best_reward: -1.450000 ± 86.758213 in #11


Epoch #106: 10001it [00:02, 4449.14it/s, bastaushy/loss=84.553, env_step=1060000, len=132, n/ep=7, n/st=1000, rew=41.29]                             


Epoch #106: test_reward: -53.365000 ± 76.433512, best_reward: -1.450000 ± 86.758213 in #11


Epoch #107: 10001it [00:02, 4404.78it/s, bastaushy/loss=83.760, env_step=1070000, len=148, n/ep=8, n/st=1000, rew=69.50]                           


Epoch #107: test_reward: -65.375000 ± 71.809988, best_reward: -1.450000 ± 86.758213 in #11


Epoch #108: 10001it [00:02, 4317.25it/s, bastaushy/loss=83.993, env_step=1080000, len=116, n/ep=8, n/st=1000, rew=-0.88]                            


Epoch #108: test_reward: -37.770000 ± 85.766468, best_reward: -1.450000 ± 86.758213 in #11


Epoch #109: 10001it [00:02, 4404.17it/s, bastaushy/loss=83.037, env_step=1090000, len=141, n/ep=11, n/st=1000, rew=23.45]                           


Epoch #109: test_reward: -37.865000 ± 87.994868, best_reward: -1.450000 ± 86.758213 in #11


Epoch #110: 10001it [00:02, 4434.75it/s, bastaushy/loss=82.790, env_step=1100000, len=137, n/ep=5, n/st=1000, rew=-56.20]                           


Epoch #110: test_reward: -36.005000 ± 84.536530, best_reward: -1.450000 ± 86.758213 in #11


Epoch #111: 10001it [00:02, 4345.68it/s, bastaushy/loss=83.446, env_step=1110000, len=122, n/ep=14, n/st=1000, rew=-2.29]                           


Epoch #111: test_reward: -22.250000 ± 88.320085, best_reward: -1.450000 ± 86.758213 in #11


Epoch #112: 10001it [00:02, 4471.59it/s, bastaushy/loss=85.067, env_step=1120000, len=118, n/ep=7, n/st=1000, rew=-13.71]                           


Epoch #112: test_reward: -59.565000 ± 76.853144, best_reward: -1.450000 ± 86.758213 in #11


Epoch #113: 10001it [00:02, 4381.58it/s, bastaushy/loss=86.093, env_step=1130000, len=109, n/ep=10, n/st=1000, rew=2.00]                            


Epoch #113: test_reward: -47.635000 ± 80.829894, best_reward: -1.450000 ± 86.758213 in #11


Epoch #114: 10001it [00:02, 4298.18it/s, bastaushy/loss=87.563, env_step=1140000, len=119, n/ep=7, n/st=1000, rew=12.29]                            


Epoch #114: test_reward: -63.905000 ± 72.491765, best_reward: -1.450000 ± 86.758213 in #11


Epoch #115: 10001it [00:02, 4447.78it/s, bastaushy/loss=87.832, env_step=1150000, len=114, n/ep=7, n/st=1000, rew=17.14]                            


Epoch #115: test_reward: -46.680000 ± 81.265784, best_reward: -1.450000 ± 86.758213 in #11


Epoch #116: 10001it [00:02, 4484.95it/s, bastaushy/loss=86.889, env_step=1160000, len=123, n/ep=8, n/st=1000, rew=47.88]                            


Epoch #116: test_reward: -27.950000 ± 84.067101, best_reward: -1.450000 ± 86.758213 in #11


Epoch #117: 10001it [00:02, 4395.80it/s, bastaushy/loss=88.237, env_step=1170000, len=119, n/ep=8, n/st=1000, rew=70.75]                            


Epoch #117: test_reward: -47.980000 ± 81.108998, best_reward: -1.450000 ± 86.758213 in #11


Epoch #118: 10001it [00:02, 4519.62it/s, bastaushy/loss=87.639, env_step=1180000, len=129, n/ep=10, n/st=1000, rew=1.50]                           


Epoch #118: test_reward: -21.455000 ± 86.408090, best_reward: -1.450000 ± 86.758213 in #11


Epoch #119: 10001it [00:02, 4394.98it/s, bastaushy/loss=87.996, env_step=1190000, len=149, n/ep=8, n/st=1000, rew=22.25]                            


Epoch #119: test_reward: -48.320000 ± 80.701472, best_reward: -1.450000 ± 86.758213 in #11


Epoch #120: 10001it [00:02, 4387.00it/s, bastaushy/loss=87.879, env_step=1200000, len=127, n/ep=10, n/st=1000, rew=-73.10]                           


Epoch #120: test_reward: -29.140000 ± 86.729524, best_reward: -1.450000 ± 86.758213 in #11


Epoch #121: 10001it [00:02, 4418.02it/s, bastaushy/loss=88.470, env_step=1210000, len=81, n/ep=3, n/st=1000, rew=-35.67]                           


Epoch #121: test_reward: -47.915000 ± 78.646028, best_reward: -1.450000 ± 86.758213 in #11


Epoch #122: 10001it [00:02, 4550.95it/s, bastaushy/loss=88.291, env_step=1220000, len=143, n/ep=9, n/st=1000, rew=11.00]                           


Epoch #122: test_reward: -28.020000 ± 85.686928, best_reward: -1.450000 ± 86.758213 in #11


Epoch #123: 10001it [00:02, 4258.17it/s, bastaushy/loss=87.939, env_step=1230000, len=126, n/ep=12, n/st=1000, rew=15.00]                           


Epoch #123: test_reward: -56.745000 ± 76.809635, best_reward: -1.450000 ± 86.758213 in #11


Epoch #124: 10001it [00:02, 4407.04it/s, bastaushy/loss=85.514, env_step=1240000, len=114, n/ep=7, n/st=1000, rew=15.43]                            


Epoch #124: test_reward: -51.675000 ± 77.870401, best_reward: -1.450000 ± 86.758213 in #11


Epoch #125: 10001it [00:02, 4452.51it/s, bastaushy/loss=84.970, env_step=1250000, len=127, n/ep=9, n/st=1000, rew=48.89]                            


Epoch #125: test_reward: -34.030000 ± 84.720771, best_reward: -1.450000 ± 86.758213 in #11


Epoch #126: 10001it [00:02, 4301.36it/s, bastaushy/loss=85.781, env_step=1260000, len=148, n/ep=7, n/st=1000, rew=9.86]                            


Epoch #126: test_reward: -32.490000 ± 84.019164, best_reward: -1.450000 ± 86.758213 in #11


Epoch #127: 10001it [00:02, 4432.27it/s, bastaushy/loss=83.961, env_step=1270000, len=118, n/ep=7, n/st=1000, rew=18.43]                           


Epoch #127: test_reward: -31.545000 ± 84.183953, best_reward: -1.450000 ± 86.758213 in #11


Epoch #128: 10001it [00:02, 4386.07it/s, bastaushy/loss=84.226, env_step=1280000, len=145, n/ep=10, n/st=1000, rew=23.90]                            


Epoch #128: test_reward: -33.855000 ± 86.986171, best_reward: -1.450000 ± 86.758213 in #11


Epoch #129: 10001it [00:02, 4390.01it/s, bastaushy/loss=84.618, env_step=1290000, len=118, n/ep=11, n/st=1000, rew=-23.00]                           


Epoch #129: test_reward: -47.725000 ± 79.890296, best_reward: -1.450000 ± 86.758213 in #11


Epoch #130: 10001it [00:02, 4425.04it/s, bastaushy/loss=82.981, env_step=1300000, len=114, n/ep=8, n/st=1000, rew=27.88]                           


Epoch #130: test_reward: -59.030000 ± 75.339227, best_reward: -1.450000 ± 86.758213 in #11


Epoch #131: 10001it [00:02, 4501.44it/s, bastaushy/loss=81.729, env_step=1310000, len=167, n/ep=4, n/st=1000, rew=0.50]                            


Epoch #131: test_reward: -49.630000 ± 80.283704, best_reward: -1.450000 ± 86.758213 in #11


Epoch #132: 10001it [00:02, 4434.33it/s, bastaushy/loss=80.994, env_step=1320000, len=135, n/ep=10, n/st=1000, rew=-1.30]                           


Epoch #132: test_reward: -54.960000 ± 77.118016, best_reward: -1.450000 ± 86.758213 in #11


Epoch #133: 10001it [00:02, 4630.77it/s, bastaushy/loss=79.921, env_step=1330000, len=121, n/ep=9, n/st=1000, rew=34.78]                            


Epoch #133: test_reward: -38.905000 ± 83.797410, best_reward: -1.450000 ± 86.758213 in #11


Epoch #134: 10001it [00:02, 4507.58it/s, bastaushy/loss=80.945, env_step=1340000, len=110, n/ep=7, n/st=1000, rew=-7.86]                           


Epoch #134: test_reward: -31.595000 ± 85.006417, best_reward: -1.450000 ± 86.758213 in #11


Epoch #135: 10001it [00:02, 4429.59it/s, bastaushy/loss=80.647, env_step=1350000, len=109, n/ep=5, n/st=1000, rew=-3.60]                           


Epoch #135: test_reward: -41.825000 ± 82.484267, best_reward: -1.450000 ± 86.758213 in #11


Epoch #136: 10001it [00:02, 4420.87it/s, bastaushy/loss=79.192, env_step=1360000, len=109, n/ep=11, n/st=1000, rew=-41.64]                           


Epoch #136: test_reward: -29.730000 ± 87.371317, best_reward: -1.450000 ± 86.758213 in #11


Epoch #137: 10001it [00:02, 4440.80it/s, bastaushy/loss=79.448, env_step=1370000, len=93, n/ep=7, n/st=1000, rew=40.86]                            


Epoch #137: test_reward: -32.650000 ± 86.005857, best_reward: -1.450000 ± 86.758213 in #11


Epoch #138: 10001it [00:02, 4505.40it/s, bastaushy/loss=78.482, env_step=1380000, len=89, n/ep=8, n/st=1000, rew=-28.38]                            


Epoch #138: test_reward: -22.825000 ± 87.454356, best_reward: -1.450000 ± 86.758213 in #11


Epoch #139: 10001it [00:02, 4512.63it/s, bastaushy/loss=76.970, env_step=1390000, len=142, n/ep=13, n/st=1000, rew=1.85]                           


Epoch #139: test_reward: -31.070000 ± 85.275701, best_reward: -1.450000 ± 86.758213 in #11


Epoch #140: 10001it [00:02, 4474.35it/s, bastaushy/loss=77.686, env_step=1400000, len=132, n/ep=7, n/st=1000, rew=13.00]                            


Epoch #140: test_reward: -40.785000 ± 83.006016, best_reward: -1.450000 ± 86.758213 in #11


Epoch #141: 10001it [00:02, 4293.11it/s, bastaushy/loss=78.424, env_step=1410000, len=133, n/ep=7, n/st=1000, rew=35.29]                           


Epoch #141: test_reward: -36.415000 ± 83.405412, best_reward: -1.450000 ± 86.758213 in #11


Epoch #142: 10001it [00:02, 4447.24it/s, bastaushy/loss=78.839, env_step=1420000, len=129, n/ep=12, n/st=1000, rew=32.83]                           


Epoch #142: test_reward: -20.985000 ± 86.485286, best_reward: -1.450000 ± 86.758213 in #11


Epoch #143: 10001it [00:02, 4504.42it/s, bastaushy/loss=79.249, env_step=1430000, len=138, n/ep=8, n/st=1000, rew=19.88]                            


Epoch #143: test_reward: -39.405000 ± 83.103435, best_reward: -1.450000 ± 86.758213 in #11


Epoch #144: 10001it [00:02, 4393.81it/s, bastaushy/loss=77.792, env_step=1440000, len=118, n/ep=10, n/st=1000, rew=59.70]                           


Epoch #144: test_reward: -21.830000 ± 86.349760, best_reward: -1.450000 ± 86.758213 in #11


Epoch #145: 10001it [00:02, 4337.78it/s, bastaushy/loss=78.537, env_step=1450000, len=134, n/ep=8, n/st=1000, rew=48.62]                           


Epoch #145: test_reward: -42.770000 ± 80.729778, best_reward: -1.450000 ± 86.758213 in #11


Epoch #146: 10001it [00:02, 4528.55it/s, bastaushy/loss=79.158, env_step=1460000, len=128, n/ep=10, n/st=1000, rew=-21.50]                           


Epoch #146: test_reward: -20.540000 ± 88.916412, best_reward: -1.450000 ± 86.758213 in #11


Epoch #147: 10001it [00:02, 4421.25it/s, bastaushy/loss=77.226, env_step=1470000, len=108, n/ep=8, n/st=1000, rew=76.88]                            


Epoch #147: test_reward: -15.120000 ± 87.481573, best_reward: -1.450000 ± 86.758213 in #11


Epoch #148: 10001it [00:02, 4509.60it/s, bastaushy/loss=76.921, env_step=1480000, len=126, n/ep=10, n/st=1000, rew=2.80]                            


Epoch #148: test_reward: -21.620000 ± 86.873791, best_reward: -1.450000 ± 86.758213 in #11


Epoch #149: 10001it [00:02, 4428.38it/s, bastaushy/loss=76.392, env_step=1490000, len=102, n/ep=9, n/st=1000, rew=11.00]                            


Epoch #149: test_reward: -9.820000 ± 88.147193, best_reward: -1.450000 ± 86.758213 in #11


Epoch #150: 10001it [00:02, 4483.13it/s, bastaushy/loss=77.255, env_step=1500000, len=131, n/ep=11, n/st=1000, rew=27.64]                           


Epoch #150: test_reward: -2.080000 ± 87.904457, best_reward: -1.450000 ± 86.758213 in #11


Epoch #151: 10001it [00:02, 4462.68it/s, bastaushy/loss=76.493, env_step=1510000, len=118, n/ep=7, n/st=1000, rew=15.57]                           


Epoch #151: test_reward: -4.345000 ± 87.292130, best_reward: -1.450000 ± 86.758213 in #11


Epoch #152: 10001it [00:02, 4420.40it/s, bastaushy/loss=75.579, env_step=1520000, len=125, n/ep=8, n/st=1000, rew=-19.00]                           


Agent bastaushy vs random {'bastaushy': 50, 'qostaushy': 50}
Epoch #152: test_reward: 15.680000 ± 86.942726, best_reward: 15.680000 ± 86.942726 in #152


Epoch #153: 10001it [00:02, 4348.90it/s, bastaushy/loss=76.323, env_step=1530000, len=132, n/ep=10, n/st=1000, rew=41.40]                           


Epoch #153: test_reward: -9.630000 ± 89.036358, best_reward: 15.680000 ± 86.942726 in #152


Epoch #154: 10001it [00:02, 4365.03it/s, bastaushy/loss=76.956, env_step=1540000, len=116, n/ep=12, n/st=1000, rew=20.58]                           


Epoch #154: test_reward: -16.840000 ± 85.608612, best_reward: 15.680000 ± 86.942726 in #152


Epoch #155: 10001it [00:02, 4407.21it/s, bastaushy/loss=77.114, env_step=1550000, len=137, n/ep=7, n/st=1000, rew=35.43]                             


Epoch #155: test_reward: -3.135000 ± 88.022592, best_reward: 15.680000 ± 86.942726 in #152


Epoch #156: 10001it [00:02, 4504.61it/s, bastaushy/loss=77.154, env_step=1560000, len=101, n/ep=7, n/st=1000, rew=14.43]                           


Epoch #156: test_reward: -46.960000 ± 79.693465, best_reward: 15.680000 ± 86.942726 in #152


Epoch #157: 10001it [00:02, 4466.84it/s, bastaushy/loss=77.666, env_step=1570000, len=130, n/ep=6, n/st=1000, rew=-25.17]                           


Epoch #157: test_reward: -25.755000 ± 85.431054, best_reward: 15.680000 ± 86.942726 in #152


Epoch #158: 10001it [00:02, 4440.85it/s, bastaushy/loss=78.111, env_step=1580000, len=129, n/ep=10, n/st=1000, rew=-21.70]                           


Epoch #158: test_reward: -28.850000 ± 88.162733, best_reward: 15.680000 ± 86.942726 in #152


Epoch #159: 10001it [00:02, 4432.61it/s, bastaushy/loss=79.693, env_step=1590000, len=138, n/ep=6, n/st=1000, rew=30.33]                           


Epoch #159: test_reward: -53.235000 ± 77.626991, best_reward: 15.680000 ± 86.942726 in #152


Epoch #160: 10001it [00:02, 4428.47it/s, bastaushy/loss=78.924, env_step=1600000, len=103, n/ep=7, n/st=1000, rew=-35.43]                           


Epoch #160: test_reward: 7.780000 ± 90.644534, best_reward: 15.680000 ± 86.942726 in #152


Epoch #161: 10001it [00:02, 4469.46it/s, bastaushy/loss=80.038, env_step=1610000, len=129, n/ep=13, n/st=1000, rew=-6.08]                           


Epoch #161: test_reward: -3.390000 ± 88.325749, best_reward: 15.680000 ± 86.942726 in #152


Epoch #162: 10001it [00:02, 4359.80it/s, bastaushy/loss=80.546, env_step=1620000, len=96, n/ep=6, n/st=1000, rew=94.83]                            


Epoch #162: test_reward: -20.780000 ± 87.586252, best_reward: 15.680000 ± 86.942726 in #152


Epoch #163: 10001it [00:02, 4390.87it/s, bastaushy/loss=78.806, env_step=1630000, len=130, n/ep=7, n/st=1000, rew=-40.43]                            


Epoch #163: test_reward: -10.630000 ± 88.555763, best_reward: 15.680000 ± 86.942726 in #152


Epoch #164: 10001it [00:02, 4352.15it/s, bastaushy/loss=79.352, env_step=1640000, len=127, n/ep=7, n/st=1000, rew=-11.29]                           


Epoch #164: test_reward: -34.870000 ± 86.420039, best_reward: 15.680000 ± 86.942726 in #152


Epoch #165: 10001it [00:02, 4460.24it/s, bastaushy/loss=78.060, env_step=1650000, len=98, n/ep=6, n/st=1000, rew=96.33]                            


Epoch #165: test_reward: -16.115000 ± 89.387369, best_reward: 15.680000 ± 86.942726 in #152


Epoch #166: 10001it [00:02, 4475.11it/s, bastaushy/loss=76.197, env_step=1660000, len=136, n/ep=6, n/st=1000, rew=34.83]                           


Epoch #166: test_reward: -24.145000 ± 87.780601, best_reward: 15.680000 ± 86.942726 in #152


Epoch #167: 10001it [00:02, 4456.62it/s, bastaushy/loss=76.572, env_step=1670000, len=144, n/ep=5, n/st=1000, rew=57.80]                           


Epoch #167: test_reward: -0.595000 ± 87.843730, best_reward: 15.680000 ± 86.942726 in #152


Epoch #168: 10001it [00:02, 4476.90it/s, bastaushy/loss=77.088, env_step=1680000, len=116, n/ep=10, n/st=1000, rew=54.30]                           


Agent bastaushy vs random {'bastaushy': 62, 'qostaushy': 38}
Epoch #168: test_reward: 16.430000 ± 85.201556, best_reward: 16.430000 ± 85.201556 in #168


Epoch #169: 10001it [00:02, 4462.35it/s, bastaushy/loss=76.640, env_step=1690000, len=140, n/ep=13, n/st=1000, rew=4.77]                            


Epoch #169: test_reward: 3.790000 ± 89.466451, best_reward: 16.430000 ± 85.201556 in #168


Epoch #170: 10001it [00:02, 4407.95it/s, bastaushy/loss=75.508, env_step=1700000, len=138, n/ep=5, n/st=1000, rew=52.20]                           


Epoch #170: test_reward: -6.215000 ± 91.356657, best_reward: 16.430000 ± 85.201556 in #168


Epoch #171: 10001it [00:02, 4334.76it/s, bastaushy/loss=74.113, env_step=1710000, len=119, n/ep=5, n/st=1000, rew=90.00]                            


Epoch #171: test_reward: -17.410000 ± 88.690596, best_reward: 16.430000 ± 85.201556 in #168


Epoch #172: 10001it [00:02, 4359.26it/s, bastaushy/loss=71.845, env_step=1720000, len=113, n/ep=9, n/st=1000, rew=-22.11]                            


Epoch #172: test_reward: -27.500000 ± 86.632442, best_reward: 16.430000 ± 85.201556 in #168


Epoch #173: 10001it [00:02, 4411.99it/s, bastaushy/loss=71.940, env_step=1730000, len=125, n/ep=7, n/st=1000, rew=23.14]                           


Epoch #173: test_reward: -18.925000 ± 87.928206, best_reward: 16.430000 ± 85.201556 in #168


Epoch #174: 10001it [00:02, 4401.74it/s, bastaushy/loss=71.115, env_step=1740000, len=147, n/ep=5, n/st=1000, rew=16.20]                            


Epoch #174: test_reward: -22.725000 ± 87.802445, best_reward: 16.430000 ± 85.201556 in #168


Epoch #175: 10001it [00:02, 4225.10it/s, bastaushy/loss=70.726, env_step=1750000, len=127, n/ep=8, n/st=1000, rew=70.38]                            


Epoch #175: test_reward: -12.560000 ± 89.657160, best_reward: 16.430000 ± 85.201556 in #168


Epoch #176: 10001it [00:02, 4367.48it/s, bastaushy/loss=71.101, env_step=1760000, len=128, n/ep=8, n/st=1000, rew=-6.38]                           


Agent bastaushy vs random {'bastaushy': 55, 'qostaushy': 45}
Epoch #176: test_reward: 16.770000 ± 88.019868, best_reward: 16.770000 ± 88.019868 in #176


Epoch #177: 10001it [00:02, 4407.60it/s, bastaushy/loss=70.255, env_step=1770000, len=106, n/ep=11, n/st=1000, rew=28.91]                           


Epoch #177: test_reward: -27.515000 ± 87.252563, best_reward: 16.770000 ± 88.019868 in #176


Epoch #178: 10001it [00:02, 4563.36it/s, bastaushy/loss=70.138, env_step=1780000, len=128, n/ep=15, n/st=1000, rew=-16.07]                           


Epoch #178: test_reward: -31.995000 ± 86.144849, best_reward: 16.770000 ± 88.019868 in #176


Epoch #179: 10001it [00:02, 4564.57it/s, bastaushy/loss=70.703, env_step=1790000, len=108, n/ep=13, n/st=1000, rew=7.31]                            


Epoch #179: test_reward: -12.965000 ± 90.310652, best_reward: 16.770000 ± 88.019868 in #176


Epoch #180: 10001it [00:02, 4600.71it/s, bastaushy/loss=72.211, env_step=1800000, len=164, n/ep=7, n/st=1000, rew=-12.14]                           


Epoch #180: test_reward: -19.220000 ± 88.524356, best_reward: 16.770000 ± 88.019868 in #176


Epoch #181: 10001it [00:02, 4635.57it/s, bastaushy/loss=74.067, env_step=1810000, len=107, n/ep=8, n/st=1000, rew=22.12]                            


Epoch #181: test_reward: -18.530000 ± 86.555930, best_reward: 16.770000 ± 88.019868 in #176


Epoch #182: 10001it [00:02, 4590.48it/s, bastaushy/loss=74.228, env_step=1820000, len=134, n/ep=5, n/st=1000, rew=50.00]                            


Epoch #182: test_reward: -6.380000 ± 86.886855, best_reward: 16.770000 ± 88.019868 in #176


Epoch #183: 10001it [00:02, 4561.53it/s, bastaushy/loss=75.282, env_step=1830000, len=121, n/ep=15, n/st=1000, rew=8.73]                            


Epoch #183: test_reward: 14.305000 ± 84.996364, best_reward: 16.770000 ± 88.019868 in #176


Epoch #184: 10001it [00:02, 4596.10it/s, bastaushy/loss=76.454, env_step=1840000, len=110, n/ep=8, n/st=1000, rew=32.50]                            


Epoch #184: test_reward: -6.475000 ± 88.653479, best_reward: 16.770000 ± 88.019868 in #176


Epoch #185: 10001it [00:02, 4580.90it/s, bastaushy/loss=77.362, env_step=1850000, len=117, n/ep=8, n/st=1000, rew=-19.75]                           


Epoch #185: test_reward: -17.585000 ± 88.832949, best_reward: 16.770000 ± 88.019868 in #176


Epoch #186: 10001it [00:02, 4642.69it/s, bastaushy/loss=79.062, env_step=1860000, len=141, n/ep=7, n/st=1000, rew=37.14]                            


Epoch #186: test_reward: -14.815000 ± 88.839410, best_reward: 16.770000 ± 88.019868 in #176


Epoch #187: 10001it [00:02, 4667.24it/s, bastaushy/loss=79.427, env_step=1870000, len=132, n/ep=9, n/st=1000, rew=-9.67]                            


Epoch #187: test_reward: -8.470000 ± 88.649755, best_reward: 16.770000 ± 88.019868 in #176


Epoch #188: 10001it [00:02, 4690.65it/s, bastaushy/loss=79.751, env_step=1880000, len=134, n/ep=9, n/st=1000, rew=-46.22]                           


Epoch #188: test_reward: -3.890000 ± 88.953684, best_reward: 16.770000 ± 88.019868 in #176


Epoch #189: 10001it [00:02, 4707.01it/s, bastaushy/loss=79.051, env_step=1890000, len=122, n/ep=9, n/st=1000, rew=7.11]                             


Epoch #189: test_reward: -9.950000 ± 88.677153, best_reward: 16.770000 ± 88.019868 in #176


Epoch #190: 10001it [00:02, 4608.74it/s, bastaushy/loss=77.735, env_step=1900000, len=114, n/ep=17, n/st=1000, rew=30.53]                           


Epoch #190: test_reward: -29.855000 ± 87.488308, best_reward: 16.770000 ± 88.019868 in #176


Epoch #191: 10001it [00:02, 4646.09it/s, bastaushy/loss=76.380, env_step=1910000, len=111, n/ep=8, n/st=1000, rew=72.25]                             


Epoch #191: test_reward: 0.805000 ± 91.373667, best_reward: 16.770000 ± 88.019868 in #176


Epoch #192: 10001it [00:02, 4751.26it/s, bastaushy/loss=76.854, env_step=1920000, len=130, n/ep=6, n/st=1000, rew=-60.83]                            


Epoch #192: test_reward: -15.295000 ± 86.856709, best_reward: 16.770000 ± 88.019868 in #176


Epoch #193: 10001it [00:02, 4704.46it/s, bastaushy/loss=77.108, env_step=1930000, len=122, n/ep=11, n/st=1000, rew=41.09]                           


Epoch #193: test_reward: -10.665000 ± 88.802493, best_reward: 16.770000 ± 88.019868 in #176


Epoch #194: 10001it [00:02, 4596.53it/s, bastaushy/loss=76.696, env_step=1940000, len=127, n/ep=7, n/st=1000, rew=62.86]                           


Epoch #194: test_reward: 3.945000 ± 89.185212, best_reward: 16.770000 ± 88.019868 in #176


Epoch #195: 10001it [00:02, 4678.62it/s, bastaushy/loss=76.273, env_step=1950000, len=127, n/ep=6, n/st=1000, rew=2.17]                            


Epoch #195: test_reward: -9.910000 ± 88.689131, best_reward: 16.770000 ± 88.019868 in #176


Epoch #196: 10001it [00:02, 4650.46it/s, bastaushy/loss=75.374, env_step=1960000, len=131, n/ep=9, n/st=1000, rew=-18.33]                           


Epoch #196: test_reward: 10.765000 ± 86.393691, best_reward: 16.770000 ± 88.019868 in #176


Epoch #197: 10001it [00:02, 4691.61it/s, bastaushy/loss=76.170, env_step=1970000, len=140, n/ep=10, n/st=1000, rew=76.20]                           


Epoch #197: test_reward: -9.015000 ± 89.183938, best_reward: 16.770000 ± 88.019868 in #176


Epoch #198: 10001it [00:02, 4085.06it/s, bastaushy/loss=75.100, env_step=1980000, len=117, n/ep=12, n/st=1000, rew=32.00]                           


Epoch #198: test_reward: -38.135000 ± 84.984627, best_reward: 16.770000 ± 88.019868 in #176


Epoch #199: 10001it [00:02, 4568.81it/s, bastaushy/loss=75.159, env_step=1990000, len=129, n/ep=7, n/st=1000, rew=20.14]                           


Epoch #199: test_reward: -34.890000 ± 83.144741, best_reward: 16.770000 ± 88.019868 in #176


Epoch #200: 10001it [00:02, 4526.35it/s, bastaushy/loss=75.628, env_step=2000000, len=134, n/ep=8, n/st=1000, rew=-1.25]                            


Epoch #200: test_reward: 9.305000 ± 88.911878, best_reward: 16.770000 ± 88.019868 in #176


Epoch #201: 10001it [00:02, 4488.81it/s, bastaushy/loss=76.469, env_step=2010000, len=118, n/ep=8, n/st=1000, rew=68.62]                            


Epoch #201: test_reward: -23.060000 ± 88.352852, best_reward: 16.770000 ± 88.019868 in #176


Epoch #202: 10001it [00:02, 4537.97it/s, bastaushy/loss=76.420, env_step=2020000, len=124, n/ep=12, n/st=1000, rew=-0.08]                           


Epoch #202: test_reward: -34.895000 ± 85.074638, best_reward: 16.770000 ± 88.019868 in #176


Epoch #203: 10001it [00:02, 4578.60it/s, bastaushy/loss=75.149, env_step=2030000, len=131, n/ep=5, n/st=1000, rew=53.40]                             


Agent bastaushy vs random {'bastaushy': 43, 'qostaushy': 57}
Epoch #203: test_reward: 17.850000 ± 85.630529, best_reward: 17.850000 ± 85.630529 in #203


Epoch #204: 10001it [00:02, 4594.75it/s, bastaushy/loss=73.326, env_step=2040000, len=93, n/ep=7, n/st=1000, rew=34.57]                             


Epoch #204: test_reward: -1.595000 ± 89.921082, best_reward: 17.850000 ± 85.630529 in #203


Epoch #205: 10001it [00:02, 4615.49it/s, bastaushy/loss=72.898, env_step=2050000, len=119, n/ep=9, n/st=1000, rew=49.22]                            


Epoch #205: test_reward: -15.945000 ± 87.797619, best_reward: 17.850000 ± 85.630529 in #203


Epoch #206: 10001it [00:02, 4611.93it/s, bastaushy/loss=73.155, env_step=2060000, len=114, n/ep=11, n/st=1000, rew=5.91]                           


Epoch #206: test_reward: -8.985000 ± 88.774798, best_reward: 17.850000 ± 85.630529 in #203


Epoch #207: 10001it [00:02, 4553.26it/s, bastaushy/loss=71.437, env_step=2070000, len=150, n/ep=3, n/st=1000, rew=88.33]                           


Epoch #207: test_reward: -5.240000 ± 91.249890, best_reward: 17.850000 ± 85.630529 in #203


Epoch #208: 10001it [00:02, 4458.41it/s, bastaushy/loss=69.963, env_step=2080000, len=137, n/ep=7, n/st=1000, rew=37.00]                            


Epoch #208: test_reward: -19.320000 ± 89.277923, best_reward: 17.850000 ± 85.630529 in #203


Epoch #209: 10001it [00:02, 4616.93it/s, bastaushy/loss=68.479, env_step=2090000, len=122, n/ep=6, n/st=1000, rew=-3.00]                           


Epoch #209: test_reward: 1.725000 ± 88.512143, best_reward: 17.850000 ± 85.630529 in #203


Epoch #210: 10001it [00:02, 4623.99it/s, bastaushy/loss=67.666, env_step=2100000, len=118, n/ep=11, n/st=1000, rew=-26.18]                           


Epoch #210: test_reward: -9.160000 ± 88.894625, best_reward: 17.850000 ± 85.630529 in #203


Epoch #211: 10001it [00:02, 4696.51it/s, bastaushy/loss=66.233, env_step=2110000, len=119, n/ep=12, n/st=1000, rew=30.58]                           


Epoch #211: test_reward: -26.605000 ± 87.205212, best_reward: 17.850000 ± 85.630529 in #203


Epoch #212: 10001it [00:02, 4656.64it/s, bastaushy/loss=66.628, env_step=2120000, len=116, n/ep=10, n/st=1000, rew=38.80]                           


Epoch #212: test_reward: -20.280000 ± 89.848381, best_reward: 17.850000 ± 85.630529 in #203


Epoch #213: 10001it [00:02, 4649.34it/s, bastaushy/loss=66.771, env_step=2130000, len=109, n/ep=10, n/st=1000, rew=58.60]                           


Epoch #213: test_reward: -0.740000 ± 89.816549, best_reward: 17.850000 ± 85.630529 in #203


Epoch #214: 10001it [00:02, 4696.65it/s, bastaushy/loss=68.570, env_step=2140000, len=128, n/ep=7, n/st=1000, rew=42.86]                            


Agent bastaushy vs random {'bastaushy': 59, 'qostaushy': 41}
Epoch #214: test_reward: 19.120000 ± 86.293891, best_reward: 19.120000 ± 86.293891 in #214


Epoch #215: 10001it [00:02, 4414.31it/s, bastaushy/loss=69.348, env_step=2150000, len=113, n/ep=6, n/st=1000, rew=6.17]                             


Epoch #215: test_reward: -2.650000 ± 92.120288, best_reward: 19.120000 ± 86.293891 in #214


Epoch #216: 10001it [00:02, 4567.74it/s, bastaushy/loss=68.793, env_step=2160000, len=101, n/ep=6, n/st=1000, rew=65.33]                            


Epoch #216: test_reward: -16.080000 ± 88.982884, best_reward: 19.120000 ± 86.293891 in #214


Epoch #217: 10001it [00:02, 4614.96it/s, bastaushy/loss=69.206, env_step=2170000, len=93, n/ep=5, n/st=1000, rew=62.40]                            


Epoch #217: test_reward: 2.070000 ± 89.850905, best_reward: 19.120000 ± 86.293891 in #214


Epoch #218: 10001it [00:02, 4406.69it/s, bastaushy/loss=70.087, env_step=2180000, len=109, n/ep=9, n/st=1000, rew=10.00]                            


Epoch #218: test_reward: 4.540000 ± 90.813096, best_reward: 19.120000 ± 86.293891 in #214


Epoch #219: 10001it [00:02, 4591.65it/s, bastaushy/loss=71.120, env_step=2190000, len=94, n/ep=5, n/st=1000, rew=-17.80]                           


Epoch #219: test_reward: 3.730000 ± 89.428726, best_reward: 19.120000 ± 86.293891 in #214


Epoch #220: 10001it [00:02, 4576.35it/s, bastaushy/loss=71.279, env_step=2200000, len=127, n/ep=11, n/st=1000, rew=21.91]                           


Epoch #220: test_reward: -11.350000 ± 88.925798, best_reward: 19.120000 ± 86.293891 in #214


Epoch #221: 10001it [00:02, 4474.82it/s, bastaushy/loss=69.733, env_step=2210000, len=132, n/ep=9, n/st=1000, rew=34.67]                           


Epoch #221: test_reward: 6.570000 ± 88.944056, best_reward: 19.120000 ± 86.293891 in #214


Epoch #222: 10001it [00:02, 4568.76it/s, bastaushy/loss=69.668, env_step=2220000, len=101, n/ep=9, n/st=1000, rew=32.00]                            


Epoch #222: test_reward: 0.875000 ± 89.687398, best_reward: 19.120000 ± 86.293891 in #214


Epoch #223: 10001it [00:02, 4637.94it/s, bastaushy/loss=69.584, env_step=2230000, len=119, n/ep=7, n/st=1000, rew=92.71]                             


Epoch #223: test_reward: -8.850000 ± 89.548520, best_reward: 19.120000 ± 86.293891 in #214


Epoch #224: 10001it [00:02, 4622.19it/s, bastaushy/loss=69.167, env_step=2240000, len=116, n/ep=4, n/st=1000, rew=95.25]                           


Epoch #224: test_reward: -6.205000 ± 89.540119, best_reward: 19.120000 ± 86.293891 in #214


Epoch #225: 10001it [00:02, 4501.04it/s, bastaushy/loss=68.942, env_step=2250000, len=97, n/ep=5, n/st=1000, rew=-15.60]                           


Epoch #225: test_reward: 5.325000 ± 88.878059, best_reward: 19.120000 ± 86.293891 in #214


Epoch #226: 10001it [00:02, 4579.94it/s, bastaushy/loss=69.899, env_step=2260000, len=118, n/ep=10, n/st=1000, rew=3.70]                            


Agent bastaushy vs random {'bastaushy': 59, 'qostaushy': 41}
Epoch #226: test_reward: 22.510000 ± 85.844918, best_reward: 22.510000 ± 85.844918 in #226


Epoch #227: 10001it [00:02, 4482.43it/s, bastaushy/loss=69.938, env_step=2270000, len=108, n/ep=6, n/st=1000, rew=-4.67]                           


Epoch #227: test_reward: 5.095000 ± 88.939507, best_reward: 22.510000 ± 85.844918 in #226


Epoch #228: 10001it [00:02, 4577.68it/s, bastaushy/loss=70.664, env_step=2280000, len=112, n/ep=8, n/st=1000, rew=73.88]                           


Epoch #228: test_reward: 18.445000 ± 87.429955, best_reward: 22.510000 ± 85.844918 in #226


Epoch #229: 10001it [00:02, 4535.29it/s, bastaushy/loss=70.782, env_step=2290000, len=129, n/ep=7, n/st=1000, rew=42.86]                           


Epoch #229: test_reward: -2.410000 ± 89.090975, best_reward: 22.510000 ± 85.844918 in #226


Epoch #230: 10001it [00:02, 4636.52it/s, bastaushy/loss=71.122, env_step=2300000, len=142, n/ep=9, n/st=1000, rew=31.89]                            


Epoch #230: test_reward: 11.745000 ± 88.837830, best_reward: 22.510000 ± 85.844918 in #226


Epoch #231: 10001it [00:02, 4610.03it/s, bastaushy/loss=72.628, env_step=2310000, len=109, n/ep=5, n/st=1000, rew=-9.80]                            


Epoch #231: test_reward: -6.785000 ± 88.068262, best_reward: 22.510000 ± 85.844918 in #226


Epoch #232: 10001it [00:02, 4578.69it/s, bastaushy/loss=72.851, env_step=2320000, len=110, n/ep=9, n/st=1000, rew=12.56]                           


Epoch #232: test_reward: -4.155000 ± 88.392822, best_reward: 22.510000 ± 85.844918 in #226


Epoch #233: 10001it [00:02, 4527.44it/s, bastaushy/loss=73.444, env_step=2330000, len=108, n/ep=10, n/st=1000, rew=73.00]                           


Epoch #233: test_reward: 3.675000 ± 87.369842, best_reward: 22.510000 ± 85.844918 in #226


Epoch #234: 10001it [00:02, 4615.36it/s, bastaushy/loss=73.022, env_step=2340000, len=126, n/ep=2, n/st=1000, rew=95.00]                           


Epoch #234: test_reward: 13.895000 ± 89.451126, best_reward: 22.510000 ± 85.844918 in #226


Epoch #235: 10001it [00:02, 4615.10it/s, bastaushy/loss=74.643, env_step=2350000, len=119, n/ep=6, n/st=1000, rew=4.00]                             


Epoch #235: test_reward: 21.810000 ± 87.425419, best_reward: 22.510000 ± 85.844918 in #226


Epoch #236: 10001it [00:02, 4482.95it/s, bastaushy/loss=74.946, env_step=2360000, len=139, n/ep=9, n/st=1000, rew=49.44]                           


Agent bastaushy vs random {'bastaushy': 62, 'qostaushy': 38}
Epoch #236: test_reward: 24.375000 ± 85.785164, best_reward: 24.375000 ± 85.785164 in #236


Epoch #237: 10001it [00:02, 4687.99it/s, bastaushy/loss=75.808, env_step=2370000, len=100, n/ep=11, n/st=1000, rew=45.64]                           


Epoch #237: test_reward: -0.090000 ± 90.215974, best_reward: 24.375000 ± 85.785164 in #236


Epoch #238: 10001it [00:02, 4616.15it/s, bastaushy/loss=76.493, env_step=2380000, len=132, n/ep=8, n/st=1000, rew=30.88]                            


Epoch #238: test_reward: -0.630000 ± 90.827986, best_reward: 24.375000 ± 85.785164 in #236


Epoch #239: 10001it [00:02, 4636.39it/s, bastaushy/loss=77.645, env_step=2390000, len=121, n/ep=10, n/st=1000, rew=5.90]                           


Epoch #239: test_reward: 4.350000 ± 87.614197, best_reward: 24.375000 ± 85.785164 in #236


Epoch #240: 10001it [00:02, 4473.64it/s, bastaushy/loss=79.805, env_step=2400000, len=118, n/ep=7, n/st=1000, rew=67.29]                            


Epoch #240: test_reward: -5.520000 ± 90.143938, best_reward: 24.375000 ± 85.785164 in #236


Epoch #241: 10001it [00:02, 4620.27it/s, bastaushy/loss=81.376, env_step=2410000, len=127, n/ep=7, n/st=1000, rew=14.71]                            


Epoch #241: test_reward: 10.745000 ± 87.717102, best_reward: 24.375000 ± 85.785164 in #236


Epoch #242: 10001it [00:02, 4567.22it/s, bastaushy/loss=81.705, env_step=2420000, len=116, n/ep=4, n/st=1000, rew=37.50]                           


Epoch #242: test_reward: 5.445000 ± 88.731657, best_reward: 24.375000 ± 85.785164 in #236


Epoch #243: 10001it [00:02, 4560.83it/s, bastaushy/loss=81.884, env_step=2430000, len=118, n/ep=9, n/st=1000, rew=-51.33]                           


Epoch #243: test_reward: -7.290000 ± 90.314926, best_reward: 24.375000 ± 85.785164 in #236


Epoch #244: 10001it [00:02, 4635.16it/s, bastaushy/loss=82.247, env_step=2440000, len=122, n/ep=7, n/st=1000, rew=40.43]                            


Epoch #244: test_reward: 10.190000 ± 88.870827, best_reward: 24.375000 ± 85.785164 in #236


Epoch #245: 10001it [00:02, 4592.88it/s, bastaushy/loss=80.506, env_step=2450000, len=129, n/ep=10, n/st=1000, rew=14.10]                           


Epoch #245: test_reward: 17.085000 ± 87.328219, best_reward: 24.375000 ± 85.785164 in #236


Epoch #246: 10001it [00:02, 4601.90it/s, bastaushy/loss=79.245, env_step=2460000, len=114, n/ep=7, n/st=1000, rew=68.14]                           


Epoch #246: test_reward: 9.050000 ± 89.018130, best_reward: 24.375000 ± 85.785164 in #236


Epoch #247: 10001it [00:02, 4238.02it/s, bastaushy/loss=79.252, env_step=2470000, len=109, n/ep=8, n/st=1000, rew=26.75]                           


Epoch #247: test_reward: -1.515000 ± 91.177682, best_reward: 24.375000 ± 85.785164 in #236


Epoch #248: 10001it [00:02, 4506.38it/s, bastaushy/loss=78.408, env_step=2480000, len=127, n/ep=8, n/st=1000, rew=44.50]                            


Epoch #248: test_reward: 17.965000 ± 86.704232, best_reward: 24.375000 ± 85.785164 in #236


Epoch #249: 10001it [00:02, 4623.22it/s, bastaushy/loss=77.034, env_step=2490000, len=122, n/ep=15, n/st=1000, rew=30.53]                           


Epoch #249: test_reward: 14.475000 ± 87.618716, best_reward: 24.375000 ± 85.785164 in #236


Epoch #250: 10001it [00:02, 4617.24it/s, bastaushy/loss=75.385, env_step=2500000, len=101, n/ep=5, n/st=1000, rew=53.60]                            


Epoch #250: test_reward: 5.460000 ± 89.421409, best_reward: 24.375000 ± 85.785164 in #236


Epoch #251: 10001it [00:02, 4598.24it/s, bastaushy/loss=74.236, env_step=2510000, len=131, n/ep=11, n/st=1000, rew=-7.18]                           


Epoch #251: test_reward: 22.030000 ± 86.134773, best_reward: 24.375000 ± 85.785164 in #236


Epoch #252: 10001it [00:02, 4632.74it/s, bastaushy/loss=74.520, env_step=2520000, len=115, n/ep=5, n/st=1000, rew=21.60]                            


Agent bastaushy vs random {'bastaushy': 65, 'qostaushy': 35}
Epoch #252: test_reward: 29.765000 ± 84.006010, best_reward: 29.765000 ± 84.006010 in #252


Epoch #253: 10001it [00:02, 4574.15it/s, bastaushy/loss=75.004, env_step=2530000, len=112, n/ep=9, n/st=1000, rew=8.44]                            


Epoch #253: test_reward: 21.480000 ± 85.026052, best_reward: 29.765000 ± 84.006010 in #252


Epoch #254: 10001it [00:02, 4396.83it/s, bastaushy/loss=76.442, env_step=2540000, len=127, n/ep=6, n/st=1000, rew=34.17]                           


Epoch #254: test_reward: 16.070000 ± 85.926859, best_reward: 29.765000 ± 84.006010 in #252


Epoch #255: 10001it [00:02, 4272.86it/s, bastaushy/loss=76.659, env_step=2550000, len=114, n/ep=10, n/st=1000, rew=-38.20]                           


Epoch #255: test_reward: 25.120000 ± 84.977148, best_reward: 29.765000 ± 84.006010 in #252


Epoch #256: 10001it [00:02, 4471.65it/s, bastaushy/loss=77.969, env_step=2560000, len=112, n/ep=6, n/st=1000, rew=32.00]                            


Epoch #256: test_reward: 9.900000 ± 86.970512, best_reward: 29.765000 ± 84.006010 in #252


Epoch #257: 10001it [00:02, 4463.78it/s, bastaushy/loss=77.995, env_step=2570000, len=124, n/ep=7, n/st=1000, rew=14.43]                            


Agent bastaushy vs random {'bastaushy': 70, 'qostaushy': 30}
Epoch #257: test_reward: 50.870000 ± 72.843003, best_reward: 50.870000 ± 72.843003 in #257


Epoch #258: 10001it [00:02, 4571.27it/s, bastaushy/loss=77.781, env_step=2580000, len=120, n/ep=7, n/st=1000, rew=8.71]                            


Agent bastaushy vs random {'bastaushy': 65, 'qostaushy': 35}
Epoch #258: test_reward: 55.940000 ± 68.757737, best_reward: 55.940000 ± 68.757737 in #258


Epoch #259: 10001it [00:02, 4553.41it/s, bastaushy/loss=79.071, env_step=2590000, len=83, n/ep=6, n/st=1000, rew=31.17]                              


Epoch #259: test_reward: 7.110000 ± 89.673786, best_reward: 55.940000 ± 68.757737 in #258


Epoch #260: 10001it [00:02, 4524.87it/s, bastaushy/loss=80.131, env_step=2600000, len=127, n/ep=5, n/st=1000, rew=53.00]                            


Epoch #260: test_reward: 23.705000 ± 86.880251, best_reward: 55.940000 ± 68.757737 in #258


Epoch #261: 10001it [00:02, 4559.03it/s, bastaushy/loss=79.621, env_step=2610000, len=110, n/ep=11, n/st=1000, rew=25.82]                           


Epoch #261: test_reward: 24.915000 ± 84.706421, best_reward: 55.940000 ± 68.757737 in #258


Epoch #262: 10001it [00:02, 4616.92it/s, bastaushy/loss=79.421, env_step=2620000, len=125, n/ep=14, n/st=1000, rew=8.00]                           


Epoch #262: test_reward: 36.730000 ± 79.518156, best_reward: 55.940000 ± 68.757737 in #258


Epoch #263: 10001it [00:02, 4588.21it/s, bastaushy/loss=79.566, env_step=2630000, len=119, n/ep=9, n/st=1000, rew=34.67]                            


Epoch #263: test_reward: 5.325000 ± 88.316303, best_reward: 55.940000 ± 68.757737 in #258


Epoch #264: 10001it [00:02, 4535.18it/s, bastaushy/loss=76.993, env_step=2640000, len=103, n/ep=10, n/st=1000, rew=-19.40]                           


Epoch #264: test_reward: 29.220000 ± 81.777635, best_reward: 55.940000 ± 68.757737 in #258


Epoch #265: 10001it [00:02, 4529.38it/s, bastaushy/loss=76.785, env_step=2650000, len=122, n/ep=9, n/st=1000, rew=8.33]                             


Epoch #265: test_reward: 24.250000 ± 86.243362, best_reward: 55.940000 ± 68.757737 in #258


Epoch #266: 10001it [00:02, 4595.28it/s, bastaushy/loss=76.556, env_step=2660000, len=115, n/ep=10, n/st=1000, rew=23.00]                           


Epoch #266: test_reward: 32.400000 ± 84.008809, best_reward: 55.940000 ± 68.757737 in #258


Epoch #267: 10001it [00:02, 4577.38it/s, bastaushy/loss=76.143, env_step=2670000, len=117, n/ep=9, n/st=1000, rew=31.56]                           


Epoch #267: test_reward: 31.875000 ± 82.974149, best_reward: 55.940000 ± 68.757737 in #258


Epoch #268: 10001it [00:02, 4493.55it/s, bastaushy/loss=77.284, env_step=2680000, len=125, n/ep=4, n/st=1000, rew=-1.75]                           


Epoch #268: test_reward: 6.125000 ± 87.828978, best_reward: 55.940000 ± 68.757737 in #258


Epoch #269: 10001it [00:02, 4498.37it/s, bastaushy/loss=77.679, env_step=2690000, len=139, n/ep=6, n/st=1000, rew=-31.67]                           


Epoch #269: test_reward: 41.010000 ± 79.411334, best_reward: 55.940000 ± 68.757737 in #258


Epoch #270: 10001it [00:02, 4532.18it/s, bastaushy/loss=76.546, env_step=2700000, len=128, n/ep=8, n/st=1000, rew=69.38]                            


Epoch #270: test_reward: 21.595000 ± 86.592615, best_reward: 55.940000 ± 68.757737 in #258


Epoch #271: 10001it [00:02, 4625.25it/s, bastaushy/loss=77.311, env_step=2710000, len=129, n/ep=6, n/st=1000, rew=-0.33]                           


Epoch #271: test_reward: 12.630000 ± 89.249891, best_reward: 55.940000 ± 68.757737 in #258


Epoch #272: 10001it [00:02, 4576.46it/s, bastaushy/loss=76.183, env_step=2720000, len=116, n/ep=4, n/st=1000, rew=96.50]                            


Epoch #272: test_reward: 19.045000 ± 84.051609, best_reward: 55.940000 ± 68.757737 in #258


Epoch #273: 10001it [00:02, 4585.95it/s, bastaushy/loss=75.164, env_step=2730000, len=115, n/ep=9, n/st=1000, rew=31.56]                           


Epoch #273: test_reward: 17.050000 ± 88.028220, best_reward: 55.940000 ± 68.757737 in #258


Epoch #274: 10001it [00:02, 4569.51it/s, bastaushy/loss=76.174, env_step=2740000, len=130, n/ep=11, n/st=1000, rew=28.82]                           


Epoch #274: test_reward: 18.055000 ± 86.846025, best_reward: 55.940000 ± 68.757737 in #258


Epoch #275: 10001it [00:02, 4556.92it/s, bastaushy/loss=75.909, env_step=2750000, len=132, n/ep=11, n/st=1000, rew=27.55]                           


Epoch #275: test_reward: 45.315000 ± 73.319000, best_reward: 55.940000 ± 68.757737 in #258


Epoch #276: 10001it [00:02, 4586.60it/s, bastaushy/loss=76.782, env_step=2760000, len=115, n/ep=7, n/st=1000, rew=12.71]                             


Epoch #276: test_reward: 21.220000 ± 88.264611, best_reward: 55.940000 ± 68.757737 in #258


Epoch #277: 10001it [00:02, 4509.68it/s, bastaushy/loss=75.999, env_step=2770000, len=116, n/ep=10, n/st=1000, rew=37.10]                           


Epoch #277: test_reward: 27.290000 ± 84.086776, best_reward: 55.940000 ± 68.757737 in #258


Epoch #278: 10001it [00:02, 4585.35it/s, bastaushy/loss=74.723, env_step=2780000, len=128, n/ep=10, n/st=1000, rew=56.50]                           


Epoch #278: test_reward: -12.975000 ± 88.698390, best_reward: 55.940000 ± 68.757737 in #258


Epoch #279: 10001it [00:02, 4639.85it/s, bastaushy/loss=73.514, env_step=2790000, len=108, n/ep=8, n/st=1000, rew=-26.25]                            


Epoch #279: test_reward: 22.915000 ± 85.049267, best_reward: 55.940000 ± 68.757737 in #258


Epoch #280: 10001it [00:02, 4589.64it/s, bastaushy/loss=74.344, env_step=2800000, len=130, n/ep=7, n/st=1000, rew=14.29]                            


Epoch #280: test_reward: 10.135000 ± 90.174147, best_reward: 55.940000 ± 68.757737 in #258


Epoch #281: 10001it [00:02, 4580.07it/s, bastaushy/loss=74.344, env_step=2810000, len=114, n/ep=9, n/st=1000, rew=30.00]                           


Epoch #281: test_reward: 26.690000 ± 84.090094, best_reward: 55.940000 ± 68.757737 in #258


Epoch #282: 10001it [00:02, 4654.37it/s, bastaushy/loss=76.391, env_step=2820000, len=150, n/ep=6, n/st=1000, rew=-2.33]                            


Epoch #282: test_reward: 4.925000 ± 89.910841, best_reward: 55.940000 ± 68.757737 in #258


Epoch #283: 10001it [00:02, 4617.12it/s, bastaushy/loss=77.706, env_step=2830000, len=141, n/ep=6, n/st=1000, rew=32.33]                            


Epoch #283: test_reward: 19.740000 ± 86.868075, best_reward: 55.940000 ± 68.757737 in #258


Epoch #284: 10001it [00:02, 4531.92it/s, bastaushy/loss=79.278, env_step=2840000, len=98, n/ep=9, n/st=1000, rew=11.33]                              


Epoch #284: test_reward: 0.735000 ± 90.626126, best_reward: 55.940000 ± 68.757737 in #258


Epoch #285: 10001it [00:02, 4544.15it/s, bastaushy/loss=80.463, env_step=2850000, len=116, n/ep=9, n/st=1000, rew=-11.78]                           


Epoch #285: test_reward: 26.885000 ± 83.569921, best_reward: 55.940000 ± 68.757737 in #258


Epoch #286: 10001it [00:02, 4612.26it/s, bastaushy/loss=79.889, env_step=2860000, len=120, n/ep=5, n/st=1000, rew=57.20]                            


Epoch #286: test_reward: 35.055000 ± 80.317507, best_reward: 55.940000 ± 68.757737 in #258


Epoch #287: 10001it [00:02, 4607.30it/s, bastaushy/loss=79.981, env_step=2870000, len=114, n/ep=5, n/st=1000, rew=56.20]                            


Epoch #287: test_reward: 45.195000 ± 78.088264, best_reward: 55.940000 ± 68.757737 in #258


Epoch #288: 10001it [00:02, 4429.51it/s, bastaushy/loss=79.923, env_step=2880000, len=99, n/ep=5, n/st=1000, rew=65.40]                             


Epoch #288: test_reward: 23.705000 ± 87.101883, best_reward: 55.940000 ± 68.757737 in #258


Epoch #289: 10001it [00:02, 4579.84it/s, bastaushy/loss=80.458, env_step=2890000, len=140, n/ep=11, n/st=1000, rew=11.18]                           


Epoch #289: test_reward: 29.095000 ± 82.754915, best_reward: 55.940000 ± 68.757737 in #258


Epoch #290: 10001it [00:02, 4647.32it/s, bastaushy/loss=79.993, env_step=2900000, len=137, n/ep=9, n/st=1000, rew=32.67]                            


Epoch #290: test_reward: 29.025000 ± 81.569690, best_reward: 55.940000 ± 68.757737 in #258


Epoch #291: 10001it [00:02, 4484.03it/s, bastaushy/loss=80.025, env_step=2910000, len=121, n/ep=10, n/st=1000, rew=19.60]                           


Epoch #291: test_reward: 7.350000 ± 89.268682, best_reward: 55.940000 ± 68.757737 in #258


Epoch #292: 10001it [00:02, 4557.49it/s, bastaushy/loss=79.252, env_step=2920000, len=121, n/ep=13, n/st=1000, rew=11.00]                           


Epoch #292: test_reward: 25.770000 ± 85.095224, best_reward: 55.940000 ± 68.757737 in #258


Epoch #293: 10001it [00:02, 4536.26it/s, bastaushy/loss=78.335, env_step=2930000, len=95, n/ep=9, n/st=1000, rew=15.89]                            


Epoch #293: test_reward: 36.255000 ± 80.951467, best_reward: 55.940000 ± 68.757737 in #258


Epoch #294: 10001it [00:02, 4495.47it/s, bastaushy/loss=76.540, env_step=2940000, len=121, n/ep=10, n/st=1000, rew=38.50]                           


Epoch #294: test_reward: 20.365000 ± 83.944337, best_reward: 55.940000 ± 68.757737 in #258


Epoch #295: 10001it [00:02, 4571.80it/s, bastaushy/loss=76.569, env_step=2950000, len=127, n/ep=11, n/st=1000, rew=29.55]                           


Epoch #295: test_reward: 22.150000 ± 88.225946, best_reward: 55.940000 ± 68.757737 in #258


Epoch #296: 10001it [00:02, 4384.86it/s, bastaushy/loss=75.800, env_step=2960000, len=115, n/ep=7, n/st=1000, rew=-10.00]                           


Epoch #296: test_reward: 37.240000 ± 81.524183, best_reward: 55.940000 ± 68.757737 in #258


Epoch #297: 10001it [00:02, 4403.52it/s, bastaushy/loss=76.502, env_step=2970000, len=111, n/ep=10, n/st=1000, rew=53.40]                           


Epoch #297: test_reward: 35.385000 ± 80.335837, best_reward: 55.940000 ± 68.757737 in #258


Epoch #298: 10001it [00:02, 4482.28it/s, bastaushy/loss=77.801, env_step=2980000, len=131, n/ep=13, n/st=1000, rew=-20.38]                           


Epoch #298: test_reward: 39.285000 ± 79.889698, best_reward: 55.940000 ± 68.757737 in #258


Epoch #299: 10001it [00:02, 4438.83it/s, bastaushy/loss=77.613, env_step=2990000, len=123, n/ep=10, n/st=1000, rew=20.40]                           


Epoch #299: test_reward: 47.445000 ± 74.592137, best_reward: 55.940000 ± 68.757737 in #258


Epoch #300: 10001it [00:02, 4507.52it/s, bastaushy/loss=77.542, env_step=3000000, len=102, n/ep=5, n/st=1000, rew=-22.20]                           


Epoch #300: test_reward: 31.205000 ± 82.120296, best_reward: 55.940000 ± 68.757737 in #258


Epoch #301: 10001it [00:02, 4466.25it/s, bastaushy/loss=77.613, env_step=3010000, len=121, n/ep=8, n/st=1000, rew=72.25]                            


Epoch #301: test_reward: 27.935000 ± 83.801735, best_reward: 55.940000 ± 68.757737 in #258


Epoch #302: 10001it [00:02, 4402.84it/s, bastaushy/loss=78.148, env_step=3020000, len=108, n/ep=5, n/st=1000, rew=-94.60]                           


Epoch #302: test_reward: 30.890000 ± 83.782026, best_reward: 55.940000 ± 68.757737 in #258


Epoch #303: 10001it [00:02, 4516.62it/s, bastaushy/loss=77.630, env_step=3030000, len=162, n/ep=5, n/st=1000, rew=-20.40]                           


Epoch #303: test_reward: 37.140000 ± 80.381219, best_reward: 55.940000 ± 68.757737 in #258


Epoch #304: 10001it [00:02, 4585.49it/s, bastaushy/loss=77.557, env_step=3040000, len=124, n/ep=7, n/st=1000, rew=1.43]                            


Epoch #304: test_reward: 30.310000 ± 83.349948, best_reward: 55.940000 ± 68.757737 in #258


Epoch #305: 10001it [00:02, 4499.52it/s, bastaushy/loss=76.988, env_step=3050000, len=115, n/ep=7, n/st=1000, rew=36.14]                           


Epoch #305: test_reward: 40.910000 ± 80.499391, best_reward: 55.940000 ± 68.757737 in #258


Epoch #306: 10001it [00:02, 4544.03it/s, bastaushy/loss=77.644, env_step=3060000, len=126, n/ep=12, n/st=1000, rew=59.58]                           


Epoch #306: test_reward: 32.300000 ± 83.267581, best_reward: 55.940000 ± 68.757737 in #258


Epoch #307: 10001it [00:02, 4560.33it/s, bastaushy/loss=78.790, env_step=3070000, len=95, n/ep=3, n/st=1000, rew=91.67]                            


Epoch #307: test_reward: 29.445000 ± 81.902118, best_reward: 55.940000 ± 68.757737 in #258


Epoch #308: 10001it [00:02, 4555.37it/s, bastaushy/loss=77.306, env_step=3080000, len=140, n/ep=10, n/st=1000, rew=18.00]                           


Epoch #308: test_reward: 15.250000 ± 87.370805, best_reward: 55.940000 ± 68.757737 in #258


Epoch #309: 10001it [00:02, 4604.31it/s, bastaushy/loss=76.582, env_step=3090000, len=125, n/ep=6, n/st=1000, rew=35.50]                             


Epoch #309: test_reward: 12.090000 ± 88.142849, best_reward: 55.940000 ± 68.757737 in #258


Epoch #310: 10001it [00:02, 4520.21it/s, bastaushy/loss=76.041, env_step=3100000, len=127, n/ep=9, n/st=1000, rew=-35.33]                           


Epoch #310: test_reward: 27.885000 ± 84.660981, best_reward: 55.940000 ± 68.757737 in #258


Epoch #311: 10001it [00:02, 4642.71it/s, bastaushy/loss=75.438, env_step=3110000, len=165, n/ep=7, n/st=1000, rew=-37.29]                           


Epoch #311: test_reward: 43.065000 ± 77.536384, best_reward: 55.940000 ± 68.757737 in #258


Epoch #312: 10001it [00:02, 4617.87it/s, bastaushy/loss=75.733, env_step=3120000, len=129, n/ep=5, n/st=1000, rew=47.60]                           


Epoch #312: test_reward: 26.795000 ± 82.539281, best_reward: 55.940000 ± 68.757737 in #258


Epoch #313: 10001it [00:02, 4645.15it/s, bastaushy/loss=75.407, env_step=3130000, len=152, n/ep=4, n/st=1000, rew=-46.00]                           


Epoch #313: test_reward: 15.975000 ± 89.152983, best_reward: 55.940000 ± 68.757737 in #258


Epoch #314: 10001it [00:02, 4709.58it/s, bastaushy/loss=76.346, env_step=3140000, len=88, n/ep=10, n/st=1000, rew=19.60]                            


Epoch #314: test_reward: 20.165000 ± 87.154046, best_reward: 55.940000 ± 68.757737 in #258


Epoch #315: 10001it [00:02, 4671.74it/s, bastaushy/loss=77.157, env_step=3150000, len=130, n/ep=7, n/st=1000, rew=61.00]                           


Epoch #315: test_reward: 32.760000 ± 83.909430, best_reward: 55.940000 ± 68.757737 in #258


Epoch #316: 10001it [00:02, 4681.43it/s, bastaushy/loss=75.525, env_step=3160000, len=134, n/ep=10, n/st=1000, rew=-19.60]                           


Epoch #316: test_reward: 29.300000 ± 84.334038, best_reward: 55.940000 ± 68.757737 in #258


Epoch #317: 10001it [00:02, 4710.14it/s, bastaushy/loss=75.971, env_step=3170000, len=122, n/ep=10, n/st=1000, rew=55.60]                           


Epoch #317: test_reward: -1.315000 ± 89.066861, best_reward: 55.940000 ± 68.757737 in #258


Epoch #318: 10001it [00:02, 4643.43it/s, bastaushy/loss=77.125, env_step=3180000, len=123, n/ep=3, n/st=1000, rew=-34.00]                           


Epoch #318: test_reward: 21.205000 ± 87.872652, best_reward: 55.940000 ± 68.757737 in #258


Epoch #319: 10001it [00:02, 4659.78it/s, bastaushy/loss=78.083, env_step=3190000, len=114, n/ep=9, n/st=1000, rew=30.89]                            


Epoch #319: test_reward: 22.350000 ± 79.764638, best_reward: 55.940000 ± 68.757737 in #258


Epoch #320: 10001it [00:02, 4668.13it/s, bastaushy/loss=78.335, env_step=3200000, len=120, n/ep=8, n/st=1000, rew=-2.38]                            


Epoch #320: test_reward: 27.560000 ± 83.384989, best_reward: 55.940000 ± 68.757737 in #258


Epoch #321: 10001it [00:02, 4774.43it/s, bastaushy/loss=79.045, env_step=3210000, len=133, n/ep=12, n/st=1000, rew=3.58]                             


Epoch #321: test_reward: 51.095000 ± 74.136536, best_reward: 55.940000 ± 68.757737 in #258


Epoch #322: 10001it [00:02, 4634.56it/s, bastaushy/loss=77.496, env_step=3220000, len=115, n/ep=8, n/st=1000, rew=2.38]                            


Epoch #322: test_reward: 50.570000 ± 75.276591, best_reward: 55.940000 ± 68.757737 in #258


Epoch #323: 10001it [00:02, 4704.00it/s, bastaushy/loss=77.527, env_step=3230000, len=126, n/ep=13, n/st=1000, rew=11.92]                           


Epoch #323: test_reward: 50.835000 ± 71.983524, best_reward: 55.940000 ± 68.757737 in #258


Epoch #324: 10001it [00:02, 4703.87it/s, bastaushy/loss=77.391, env_step=3240000, len=128, n/ep=7, n/st=1000, rew=-39.14]                           


Epoch #324: test_reward: 37.640000 ± 83.378777, best_reward: 55.940000 ± 68.757737 in #258


Epoch #325: 10001it [00:02, 4700.35it/s, bastaushy/loss=75.875, env_step=3250000, len=116, n/ep=5, n/st=1000, rew=19.40]                           


Agent bastaushy vs random {'bastaushy': 82, 'qostaushy': 18}
Epoch #325: test_reward: 56.680000 ± 70.902804, best_reward: 56.680000 ± 70.902804 in #325


Epoch #326: 10001it [00:02, 4622.56it/s, bastaushy/loss=77.111, env_step=3260000, len=95, n/ep=11, n/st=1000, rew=5.55]                             


Epoch #326: test_reward: 43.135000 ± 78.587574, best_reward: 56.680000 ± 70.902804 in #325


Epoch #327: 10001it [00:02, 4629.79it/s, bastaushy/loss=75.954, env_step=3270000, len=125, n/ep=7, n/st=1000, rew=67.57]                           


Epoch #327: test_reward: 30.045000 ± 80.643617, best_reward: 56.680000 ± 70.902804 in #325


Epoch #328: 10001it [00:02, 4679.98it/s, bastaushy/loss=76.198, env_step=3280000, len=118, n/ep=8, n/st=1000, rew=-24.12]                           


Epoch #328: test_reward: 44.565000 ± 74.549821, best_reward: 56.680000 ± 70.902804 in #325


Epoch #329: 10001it [00:02, 4628.78it/s, bastaushy/loss=75.742, env_step=3290000, len=101, n/ep=4, n/st=1000, rew=-0.75]                            


Epoch #329: test_reward: 25.490000 ± 84.263396, best_reward: 56.680000 ± 70.902804 in #325


Epoch #330: 10001it [00:02, 4654.03it/s, bastaushy/loss=75.117, env_step=3300000, len=157, n/ep=5, n/st=1000, rew=51.40]                           


Epoch #330: test_reward: 42.260000 ± 79.315209, best_reward: 56.680000 ± 70.902804 in #325


Epoch #331: 10001it [00:02, 4695.13it/s, bastaushy/loss=73.820, env_step=3310000, len=125, n/ep=9, n/st=1000, rew=32.89]                            


Epoch #331: test_reward: 32.970000 ± 82.780487, best_reward: 56.680000 ± 70.902804 in #325


Epoch #332: 10001it [00:02, 4737.44it/s, bastaushy/loss=75.116, env_step=3320000, len=107, n/ep=13, n/st=1000, rew=48.85]                           


Epoch #332: test_reward: 47.670000 ± 79.032785, best_reward: 56.680000 ± 70.902804 in #325


Epoch #333: 10001it [00:02, 4679.48it/s, bastaushy/loss=75.749, env_step=3330000, len=150, n/ep=5, n/st=1000, rew=18.80]                            


Epoch #333: test_reward: 37.160000 ± 82.223138, best_reward: 56.680000 ± 70.902804 in #325


Epoch #334: 10001it [00:02, 4585.95it/s, bastaushy/loss=75.287, env_step=3340000, len=101, n/ep=12, n/st=1000, rew=0.67]                            


Epoch #334: test_reward: 37.075000 ± 80.688161, best_reward: 56.680000 ± 70.902804 in #325


Epoch #335: 10001it [00:02, 4674.78it/s, bastaushy/loss=74.893, env_step=3350000, len=103, n/ep=9, n/st=1000, rew=55.44]                            


Epoch #335: test_reward: 22.830000 ± 87.939758, best_reward: 56.680000 ± 70.902804 in #325


Epoch #336: 10001it [00:02, 4659.76it/s, bastaushy/loss=75.283, env_step=3360000, len=116, n/ep=8, n/st=1000, rew=26.25]                           


Epoch #336: test_reward: 51.380000 ± 74.774164, best_reward: 56.680000 ± 70.902804 in #325


Epoch #337: 10001it [00:02, 4684.01it/s, bastaushy/loss=74.383, env_step=3370000, len=147, n/ep=9, n/st=1000, rew=55.89]                           


Agent bastaushy vs random {'bastaushy': 75, 'qostaushy': 25}
Epoch #337: test_reward: 58.520000 ± 68.972818, best_reward: 58.520000 ± 68.972818 in #337


Epoch #338: 10001it [00:02, 4727.76it/s, bastaushy/loss=74.794, env_step=3380000, len=102, n/ep=11, n/st=1000, rew=60.91]                           


Epoch #338: test_reward: 32.255000 ± 84.387143, best_reward: 58.520000 ± 68.972818 in #337


Epoch #339: 10001it [00:02, 4588.33it/s, bastaushy/loss=74.399, env_step=3390000, len=132, n/ep=7, n/st=1000, rew=41.00]                           


Agent bastaushy vs random {'bastaushy': 87, 'qostaushy': 13}
Epoch #339: test_reward: 61.105000 ± 67.787786, best_reward: 61.105000 ± 67.787786 in #339


Epoch #340: 10001it [00:02, 4502.45it/s, bastaushy/loss=74.980, env_step=3400000, len=139, n/ep=10, n/st=1000, rew=-18.10]                           


Epoch #340: test_reward: 60.265000 ± 70.083484, best_reward: 61.105000 ± 67.787786 in #339


Epoch #341: 10001it [00:02, 4518.29it/s, bastaushy/loss=76.765, env_step=3410000, len=122, n/ep=13, n/st=1000, rew=-18.00]                           


Epoch #341: test_reward: 57.655000 ± 72.623040, best_reward: 61.105000 ± 67.787786 in #339


Epoch #342: 10001it [00:02, 4440.31it/s, bastaushy/loss=76.627, env_step=3420000, len=91, n/ep=3, n/st=1000, rew=94.00]                             


Epoch #342: test_reward: 36.810000 ± 84.151969, best_reward: 61.105000 ± 67.787786 in #339


Epoch #343: 10001it [00:02, 4516.17it/s, bastaushy/loss=75.352, env_step=3430000, len=92, n/ep=7, n/st=1000, rew=16.43]                            


Epoch #343: test_reward: 52.645000 ± 73.950855, best_reward: 61.105000 ± 67.787786 in #339


Epoch #344: 10001it [00:02, 4555.06it/s, bastaushy/loss=76.183, env_step=3440000, len=120, n/ep=9, n/st=1000, rew=11.89]                           


Agent bastaushy vs random {'bastaushy': 82, 'qostaushy': 18}
Epoch #344: test_reward: 64.960000 ± 59.417324, best_reward: 64.960000 ± 59.417324 in #344


Epoch #345: 10001it [00:02, 4537.78it/s, bastaushy/loss=76.992, env_step=3450000, len=137, n/ep=9, n/st=1000, rew=10.67]                            


Epoch #345: test_reward: 27.425000 ± 82.402818, best_reward: 64.960000 ± 59.417324 in #344


Epoch #346: 10001it [00:02, 4413.38it/s, bastaushy/loss=75.480, env_step=3460000, len=123, n/ep=12, n/st=1000, rew=34.33]                           


Epoch #346: test_reward: 34.010000 ± 81.554766, best_reward: 64.960000 ± 59.417324 in #344


Epoch #347: 10001it [00:02, 4594.68it/s, bastaushy/loss=77.141, env_step=3470000, len=110, n/ep=4, n/st=1000, rew=-4.50]                            


Epoch #347: test_reward: 57.070000 ± 73.931963, best_reward: 64.960000 ± 59.417324 in #344


Epoch #348: 10001it [00:02, 4563.17it/s, bastaushy/loss=75.593, env_step=3480000, len=136, n/ep=5, n/st=1000, rew=-14.00]                           


Epoch #348: test_reward: 38.265000 ± 81.373182, best_reward: 64.960000 ± 59.417324 in #344


Epoch #349: 10001it [00:02, 4402.12it/s, bastaushy/loss=75.335, env_step=3490000, len=139, n/ep=9, n/st=1000, rew=-10.11]                           


Epoch #349: test_reward: 40.645000 ± 78.645527, best_reward: 64.960000 ± 59.417324 in #344


Epoch #350: 10001it [00:02, 4479.62it/s, bastaushy/loss=75.635, env_step=3500000, len=130, n/ep=8, n/st=1000, rew=1.00]                             


Epoch #350: test_reward: 45.665000 ± 76.748634, best_reward: 64.960000 ± 59.417324 in #344


Epoch #351: 10001it [00:02, 4531.42it/s, bastaushy/loss=74.012, env_step=3510000, len=94, n/ep=10, n/st=1000, rew=-40.60]                           


Epoch #351: test_reward: 41.155000 ± 79.018865, best_reward: 64.960000 ± 59.417324 in #344


Epoch #352: 10001it [00:02, 4451.06it/s, bastaushy/loss=73.670, env_step=3520000, len=125, n/ep=5, n/st=1000, rew=-29.20]                           


Epoch #352: test_reward: 29.640000 ± 84.275384, best_reward: 64.960000 ± 59.417324 in #344


Epoch #353: 10001it [00:02, 4523.42it/s, bastaushy/loss=75.399, env_step=3530000, len=104, n/ep=8, n/st=1000, rew=2.50]                             


Epoch #353: test_reward: 33.670000 ± 82.680476, best_reward: 64.960000 ± 59.417324 in #344


Epoch #354: 10001it [00:02, 4563.86it/s, bastaushy/loss=76.004, env_step=3540000, len=127, n/ep=14, n/st=1000, rew=26.93]                           


Epoch #354: test_reward: 26.595000 ± 86.407181, best_reward: 64.960000 ± 59.417324 in #344


Epoch #355: 10001it [00:02, 4450.29it/s, bastaushy/loss=76.822, env_step=3550000, len=131, n/ep=10, n/st=1000, rew=39.20]                           


Epoch #355: test_reward: 57.670000 ± 68.797900, best_reward: 64.960000 ± 59.417324 in #344


Epoch #356: 10001it [00:02, 4626.65it/s, bastaushy/loss=78.817, env_step=3560000, len=131, n/ep=10, n/st=1000, rew=-13.90]                           


Epoch #356: test_reward: 29.705000 ± 82.262251, best_reward: 64.960000 ± 59.417324 in #344


Epoch #357: 10001it [00:02, 4587.88it/s, bastaushy/loss=76.756, env_step=3570000, len=107, n/ep=7, n/st=1000, rew=73.86]                           


Epoch #357: test_reward: 18.765000 ± 86.867426, best_reward: 64.960000 ± 59.417324 in #344


Epoch #358: 10001it [00:02, 4424.70it/s, bastaushy/loss=78.182, env_step=3580000, len=107, n/ep=7, n/st=1000, rew=-9.29]                           


Epoch #358: test_reward: 38.760000 ± 79.461578, best_reward: 64.960000 ± 59.417324 in #344


Epoch #359: 10001it [00:02, 4515.46it/s, bastaushy/loss=78.394, env_step=3590000, len=132, n/ep=10, n/st=1000, rew=24.60]                           


Epoch #359: test_reward: 27.985000 ± 84.156014, best_reward: 64.960000 ± 59.417324 in #344


Epoch #360: 10001it [00:02, 4590.32it/s, bastaushy/loss=78.946, env_step=3600000, len=114, n/ep=11, n/st=1000, rew=77.27]                           


Epoch #360: test_reward: 51.465000 ± 76.959787, best_reward: 64.960000 ± 59.417324 in #344


Epoch #361: 10001it [00:02, 4648.77it/s, bastaushy/loss=78.553, env_step=3610000, len=150, n/ep=5, n/st=1000, rew=16.40]                           


Epoch #361: test_reward: 34.700000 ± 83.176980, best_reward: 64.960000 ± 59.417324 in #344


Epoch #362: 10001it [00:02, 4696.85it/s, bastaushy/loss=78.159, env_step=3620000, len=132, n/ep=7, n/st=1000, rew=68.86]                            


Epoch #362: test_reward: 44.955000 ± 78.268340, best_reward: 64.960000 ± 59.417324 in #344


Epoch #363: 10001it [00:02, 4620.06it/s, bastaushy/loss=78.071, env_step=3630000, len=125, n/ep=7, n/st=1000, rew=15.43]                           


Epoch #363: test_reward: 22.080000 ± 85.089915, best_reward: 64.960000 ± 59.417324 in #344


Epoch #364: 10001it [00:02, 4630.10it/s, bastaushy/loss=76.641, env_step=3640000, len=154, n/ep=6, n/st=1000, rew=-1.83]                            


Epoch #364: test_reward: 35.060000 ± 78.691654, best_reward: 64.960000 ± 59.417324 in #344


Epoch #365: 10001it [00:02, 4619.26it/s, bastaushy/loss=77.678, env_step=3650000, len=121, n/ep=13, n/st=1000, rew=7.62]                            


Epoch #365: test_reward: 46.565000 ± 75.878559, best_reward: 64.960000 ± 59.417324 in #344


Epoch #366: 10001it [00:02, 4667.17it/s, bastaushy/loss=77.084, env_step=3660000, len=117, n/ep=8, n/st=1000, rew=24.25]                            


Epoch #366: test_reward: 49.965000 ± 73.113978, best_reward: 64.960000 ± 59.417324 in #344


Epoch #367: 10001it [00:02, 4610.01it/s, bastaushy/loss=77.831, env_step=3670000, len=117, n/ep=8, n/st=1000, rew=65.62]                           


Epoch #367: test_reward: 31.390000 ± 82.966125, best_reward: 64.960000 ± 59.417324 in #344


Epoch #368: 10001it [00:02, 4675.83it/s, bastaushy/loss=74.994, env_step=3680000, len=116, n/ep=4, n/st=1000, rew=41.00]                           


Epoch #368: test_reward: 28.450000 ± 80.697692, best_reward: 64.960000 ± 59.417324 in #344


Epoch #369: 10001it [00:02, 4637.49it/s, bastaushy/loss=75.819, env_step=3690000, len=117, n/ep=6, n/st=1000, rew=92.17]                            


Epoch #369: test_reward: 37.180000 ± 80.170678, best_reward: 64.960000 ± 59.417324 in #344


Epoch #370: 10001it [00:02, 4561.87it/s, bastaushy/loss=74.731, env_step=3700000, len=146, n/ep=6, n/st=1000, rew=3.17]                            


Agent bastaushy vs random {'bastaushy': 82, 'qostaushy': 18}
Epoch #370: test_reward: 69.335000 ± 55.199663, best_reward: 69.335000 ± 55.199663 in #370


Epoch #371: 10001it [00:02, 4606.86it/s, bastaushy/loss=75.190, env_step=3710000, len=137, n/ep=9, n/st=1000, rew=-13.33]                           


Epoch #371: test_reward: 55.500000 ± 70.675880, best_reward: 69.335000 ± 55.199663 in #370


Epoch #372: 10001it [00:02, 4653.33it/s, bastaushy/loss=75.557, env_step=3720000, len=128, n/ep=10, n/st=1000, rew=2.60]                            


Epoch #372: test_reward: 64.795000 ± 58.877610, best_reward: 69.335000 ± 55.199663 in #370


Epoch #373: 10001it [00:02, 4638.11it/s, bastaushy/loss=75.019, env_step=3730000, len=142, n/ep=12, n/st=1000, rew=48.00]                           


Epoch #373: test_reward: 39.720000 ± 77.533293, best_reward: 69.335000 ± 55.199663 in #370


Epoch #374: 10001it [00:02, 4540.88it/s, bastaushy/loss=75.740, env_step=3740000, len=98, n/ep=11, n/st=1000, rew=28.27]                             


Epoch #374: test_reward: 39.185000 ± 75.628703, best_reward: 69.335000 ± 55.199663 in #370


Epoch #375: 10001it [00:02, 4642.18it/s, bastaushy/loss=74.107, env_step=3750000, len=114, n/ep=13, n/st=1000, rew=7.08]                            


Epoch #375: test_reward: 48.555000 ± 74.756919, best_reward: 69.335000 ± 55.199663 in #370


Epoch #376: 10001it [00:02, 4551.19it/s, bastaushy/loss=74.271, env_step=3760000, len=139, n/ep=8, n/st=1000, rew=71.50]                            


Epoch #376: test_reward: 55.055000 ± 71.148942, best_reward: 69.335000 ± 55.199663 in #370


Epoch #377: 10001it [00:02, 4664.46it/s, bastaushy/loss=74.891, env_step=3770000, len=108, n/ep=9, n/st=1000, rew=10.89]                           


Epoch #377: test_reward: 46.855000 ± 77.253181, best_reward: 69.335000 ± 55.199663 in #370


Epoch #378: 10001it [00:02, 4616.85it/s, bastaushy/loss=76.028, env_step=3780000, len=142, n/ep=6, n/st=1000, rew=-28.00]                           


Epoch #378: test_reward: 51.425000 ± 71.379299, best_reward: 69.335000 ± 55.199663 in #370


Epoch #379: 10001it [00:02, 4659.12it/s, bastaushy/loss=74.720, env_step=3790000, len=113, n/ep=5, n/st=1000, rew=19.80]                           


Epoch #379: test_reward: 43.190000 ± 81.281018, best_reward: 69.335000 ± 55.199663 in #370


Epoch #380: 10001it [00:02, 4700.92it/s, bastaushy/loss=75.717, env_step=3800000, len=125, n/ep=9, n/st=1000, rew=9.33]                             


Epoch #380: test_reward: 52.760000 ± 74.209719, best_reward: 69.335000 ± 55.199663 in #370


Epoch #381: 10001it [00:02, 4710.67it/s, bastaushy/loss=75.796, env_step=3810000, len=111, n/ep=7, n/st=1000, rew=65.29]                           


Epoch #381: test_reward: 42.925000 ± 78.115807, best_reward: 69.335000 ± 55.199663 in #370


Epoch #382: 10001it [00:02, 4649.93it/s, bastaushy/loss=74.884, env_step=3820000, len=116, n/ep=9, n/st=1000, rew=33.67]                           


Epoch #382: test_reward: 61.700000 ± 69.293867, best_reward: 69.335000 ± 55.199663 in #370


Epoch #383: 10001it [00:02, 4698.38it/s, bastaushy/loss=74.574, env_step=3830000, len=110, n/ep=7, n/st=1000, rew=-13.43]                           


Epoch #383: test_reward: 61.090000 ± 69.492963, best_reward: 69.335000 ± 55.199663 in #370


Epoch #384: 10001it [00:02, 4714.62it/s, bastaushy/loss=73.555, env_step=3840000, len=143, n/ep=7, n/st=1000, rew=22.71]                            


Epoch #384: test_reward: 54.090000 ± 70.298307, best_reward: 69.335000 ± 55.199663 in #370


Epoch #385: 10001it [00:02, 4745.37it/s, bastaushy/loss=74.118, env_step=3850000, len=135, n/ep=6, n/st=1000, rew=-0.50]                           


Epoch #385: test_reward: 40.555000 ± 76.841440, best_reward: 69.335000 ± 55.199663 in #370


Epoch #386: 10001it [00:02, 4680.11it/s, bastaushy/loss=73.812, env_step=3860000, len=107, n/ep=11, n/st=1000, rew=11.18]                           


Epoch #386: test_reward: 66.335000 ± 62.030096, best_reward: 69.335000 ± 55.199663 in #370


Epoch #387: 10001it [00:02, 4666.00it/s, bastaushy/loss=71.817, env_step=3870000, len=131, n/ep=12, n/st=1000, rew=-31.58]                           


Epoch #387: test_reward: 45.175000 ± 76.884552, best_reward: 69.335000 ± 55.199663 in #370


Epoch #388: 10001it [00:02, 4683.59it/s, bastaushy/loss=71.941, env_step=3880000, len=133, n/ep=4, n/st=1000, rew=-45.25]                           


Epoch #388: test_reward: 41.175000 ± 77.867544, best_reward: 69.335000 ± 55.199663 in #370


Epoch #389: 10001it [00:02, 4634.25it/s, bastaushy/loss=72.488, env_step=3890000, len=113, n/ep=13, n/st=1000, rew=-38.15]                           


Epoch #389: test_reward: 48.165000 ± 77.153339, best_reward: 69.335000 ± 55.199663 in #370


Epoch #390: 10001it [00:02, 4648.33it/s, bastaushy/loss=71.230, env_step=3900000, len=119, n/ep=7, n/st=1000, rew=14.29]                            


Epoch #390: test_reward: 67.000000 ± 61.745688, best_reward: 69.335000 ± 55.199663 in #370


Epoch #391: 10001it [00:02, 4692.12it/s, bastaushy/loss=70.665, env_step=3910000, len=100, n/ep=6, n/st=1000, rew=67.33]                            


Epoch #391: test_reward: 33.530000 ± 83.189657, best_reward: 69.335000 ± 55.199663 in #370


Epoch #392: 10001it [00:02, 4520.44it/s, bastaushy/loss=71.653, env_step=3920000, len=103, n/ep=8, n/st=1000, rew=70.88]                            


Epoch #392: test_reward: 32.200000 ± 82.309355, best_reward: 69.335000 ± 55.199663 in #370


Epoch #393: 10001it [00:02, 4641.35it/s, bastaushy/loss=72.157, env_step=3930000, len=93, n/ep=12, n/st=1000, rew=45.25]                           


Epoch #393: test_reward: 35.860000 ± 81.485645, best_reward: 69.335000 ± 55.199663 in #370


Epoch #394: 10001it [00:02, 4737.36it/s, bastaushy/loss=72.355, env_step=3940000, len=99, n/ep=6, n/st=1000, rew=32.67]                             


Epoch #394: test_reward: 37.280000 ± 78.921363, best_reward: 69.335000 ± 55.199663 in #370


Epoch #395: 10001it [00:02, 4601.32it/s, bastaushy/loss=71.184, env_step=3950000, len=94, n/ep=9, n/st=1000, rew=35.33]                              


Epoch #395: test_reward: 50.225000 ± 74.372336, best_reward: 69.335000 ± 55.199663 in #370


Epoch #396: 10001it [00:02, 4700.00it/s, bastaushy/loss=71.056, env_step=3960000, len=116, n/ep=8, n/st=1000, rew=-3.62]                           


Epoch #396: test_reward: 28.045000 ± 84.092705, best_reward: 69.335000 ± 55.199663 in #370


Epoch #397: 10001it [00:02, 4649.46it/s, bastaushy/loss=72.904, env_step=3970000, len=121, n/ep=6, n/st=1000, rew=59.50]                           


Epoch #397: test_reward: 61.780000 ± 65.096556, best_reward: 69.335000 ± 55.199663 in #370


Epoch #398: 10001it [00:02, 4518.49it/s, bastaushy/loss=72.987, env_step=3980000, len=139, n/ep=8, n/st=1000, rew=4.12]                            


Epoch #398: test_reward: 45.425000 ± 77.730074, best_reward: 69.335000 ± 55.199663 in #370


Epoch #399: 10001it [00:02, 4470.12it/s, bastaushy/loss=72.941, env_step=3990000, len=124, n/ep=8, n/st=1000, rew=37.88]                           


Epoch #399: test_reward: 44.805000 ± 75.640578, best_reward: 69.335000 ± 55.199663 in #370


Epoch #400: 10001it [00:02, 4646.76it/s, bastaushy/loss=74.107, env_step=4000000, len=99, n/ep=9, n/st=1000, rew=54.00]                            


Epoch #400: test_reward: 46.610000 ± 74.747896, best_reward: 69.335000 ± 55.199663 in #370


Epoch #401: 10001it [00:02, 4625.01it/s, bastaushy/loss=76.576, env_step=4010000, len=131, n/ep=13, n/st=1000, rew=32.08]                           


Epoch #401: test_reward: 4.020000 ± 90.203213, best_reward: 69.335000 ± 55.199663 in #370


Epoch #402: 10001it [00:02, 4658.75it/s, bastaushy/loss=76.009, env_step=4020000, len=114, n/ep=10, n/st=1000, rew=6.30]                           


Epoch #402: test_reward: 36.205000 ± 78.607589, best_reward: 69.335000 ± 55.199663 in #370


Epoch #403: 10001it [00:02, 4661.88it/s, bastaushy/loss=76.065, env_step=4030000, len=128, n/ep=6, n/st=1000, rew=27.67]                            


Epoch #403: test_reward: 31.050000 ± 82.348512, best_reward: 69.335000 ± 55.199663 in #370


Epoch #404: 10001it [00:02, 4538.58it/s, bastaushy/loss=76.324, env_step=4040000, len=108, n/ep=12, n/st=1000, rew=19.08]                            


Epoch #404: test_reward: 49.005000 ± 76.720955, best_reward: 69.335000 ± 55.199663 in #370


Epoch #405: 10001it [00:02, 4628.64it/s, bastaushy/loss=76.977, env_step=4050000, len=113, n/ep=11, n/st=1000, rew=58.73]                           


Epoch #405: test_reward: 28.710000 ± 82.329921, best_reward: 69.335000 ± 55.199663 in #370


Epoch #406: 10001it [00:02, 4627.17it/s, bastaushy/loss=76.526, env_step=4060000, len=95, n/ep=12, n/st=1000, rew=60.83]                           


Epoch #406: test_reward: 56.225000 ± 70.809282, best_reward: 69.335000 ± 55.199663 in #370


Epoch #407: 10001it [00:02, 4370.59it/s, bastaushy/loss=75.848, env_step=4070000, len=137, n/ep=8, n/st=1000, rew=24.12]                           


Epoch #407: test_reward: 41.025000 ± 78.563760, best_reward: 69.335000 ± 55.199663 in #370


Epoch #408: 10001it [00:02, 4447.74it/s, bastaushy/loss=76.100, env_step=4080000, len=145, n/ep=10, n/st=1000, rew=19.90]                           


Epoch #408: test_reward: 42.015000 ± 77.952260, best_reward: 69.335000 ± 55.199663 in #370


Epoch #409: 10001it [00:02, 4569.05it/s, bastaushy/loss=75.438, env_step=4090000, len=118, n/ep=6, n/st=1000, rew=-28.00]                           


Agent bastaushy vs random {'bastaushy': 91, 'qostaushy': 9}
Epoch #409: test_reward: 73.750000 ± 53.761208, best_reward: 73.750000 ± 53.761208 in #409


Epoch #410: 10001it [00:02, 4566.70it/s, bastaushy/loss=74.171, env_step=4100000, len=124, n/ep=8, n/st=1000, rew=3.00]                             


Epoch #410: test_reward: 42.460000 ± 75.035448, best_reward: 73.750000 ± 53.761208 in #409


Epoch #411: 10001it [00:02, 4603.45it/s, bastaushy/loss=71.296, env_step=4110000, len=98, n/ep=8, n/st=1000, rew=46.12]                              


Epoch #411: test_reward: 41.440000 ± 77.789179, best_reward: 73.750000 ± 53.761208 in #409
