In [51]:
# Put the local CybORG, tianshou and gym checkouts on the import path so the
# imports in the following cells resolve.
# NOTE(review): these are absolute, machine-specific paths — adjust them (or
# install the packages instead) before running the notebook on another machine.
import sys
sys.path.append('/tmp-data/zhx/DriverOrderOfflineRL/cage-challenge-1/CybORG')
sys.path.append('/tmp-data/zhx/DriverOrderOfflineRL/tianshou')
sys.path.append('/tmp-data/zhx/DriverOrderOfflineRL/gym')

### Argument definitions

In [52]:
import argparse
import datetime
import os
import pprint

import numpy as np
import torch
from torch.utils.tensorboard import SummaryWriter
from torch import nn

from tianshou.data import Collector, VectorReplayBuffer
from tianshou.policy import DQNPolicy
from tianshou.policy.modelbased.icm import ICMPolicy
from tianshou.trainer import offpolicy_trainer
from tianshou.utils import TensorboardLogger, WandbLogger
from tianshou.utils.net.discrete import IntrinsicCuriosityModule
from tianshou.env import SubprocVectorEnv, DummyVectorEnv

from typing import Any, Dict, Optional, Sequence, Tuple, Union


def get_args():
    """Return the experiment configuration as an ``argparse.Namespace``.

    Every option is declared on an ``ArgumentParser`` and then an *empty*
    argument list is parsed, so ``sys.argv`` is not consulted and each option
    ends up with its declared default.
    """
    default_device = "cuda" if torch.cuda.is_available() else "cpu"

    # (flag, add_argument keyword arguments), in declaration order.
    option_table = (
        ("--task", dict(type=str, default="cyborg")),
        ("--seed", dict(type=int, default=0)),
        ("--scale-obs", dict(type=int, default=0)),
        ("--eps-test", dict(type=float, default=0.05)),
        ("--eps-train", dict(type=float, default=1.)),
        ("--eps-train-final", dict(type=float, default=0.05)),
        ("--buffer-size", dict(type=int, default=5000)),
        ("--lr", dict(type=float, default=0.0001)),
        ("--gamma", dict(type=float, default=0.99)),
        ("--n-step", dict(type=int, default=1)),
        ("--target-update-freq", dict(type=int, default=1000)),
        ("--epoch", dict(type=int, default=2000)),
        ("--step-per-epoch", dict(type=int, default=10)),
        ("--step-per-collect", dict(type=int, default=100)),
        ("--update-per-step", dict(type=float, default=1)),
        ("--batch-size", dict(type=int, default=32)),
        ("--training-num", dict(type=int, default=10)),
        ("--test-num", dict(type=int, default=10)),
        ("--logdir", dict(type=str, default="log")),
        ("--render", dict(type=float, default=0.)),
        ("--device", dict(type=str, default=default_device)),
        ("--frames-stack", dict(type=int, default=1)),
        ("--resume-path", dict(type=str, default=None)),
        ("--resume-id", dict(type=str, default=None)),
        (
            "--logger",
            dict(type=str, default="wandb", choices=["tensorboard", "wandb"]),
        ),
        ("--wandb-project", dict(type=str, default="cyborg.dqn")),
        (
            "--watch",
            dict(
                default=False,
                action="store_true",
                help="watch the play of pre-trained policy only",
            ),
        ),
        ("--save-buffer-name", dict(type=str, default=None)),
        (
            "--icm-lr-scale",
            dict(
                type=float,
                default=0.,
                help="use intrinsic curiosity module with this lr scale",
            ),
        ),
        (
            "--icm-reward-scale",
            dict(
                type=float,
                default=0.01,
                help="scaling factor for intrinsic curiosity reward",
            ),
        ),
        (
            "--icm-forward-loss-weight",
            dict(
                type=float,
                default=0.2,
                help="weight for the forward model loss in ICM",
            ),
        ),
    )

    cli = argparse.ArgumentParser()
    for flag, options in option_table:
        cli.add_argument(flag, **options)
    # Parse an empty list rather than the process arguments.
    return cli.parse_args(args=[])


### env

In [53]:
# Build the default configuration namespace that the following cells read from.
args = get_args()

In [54]:
import inspect
from pprint import pprint
from CybORG import CybORG
from CybORG.Shared.Actions import *
from CybORG.Agents import RedMeanderAgent, B_lineAgent
from CybORG.Agents.Wrappers import *

# Scenario file, located relative to the source file that defines `CybORG`.
# Using dirname/join instead of slicing off a fixed number of characters keeps
# this correct whatever the defining file happens to be called.
path = os.path.join(
    os.path.dirname(str(inspect.getfile(CybORG))),
    'Shared', 'Scenarios', 'Scenario1b.yaml',
)

# seed
np.random.seed(args.seed)
torch.manual_seed(args.seed)


def make_blue_env():
    """Create a new Blue-agent ChallengeWrapper around a fresh CybORG simulation.

    Each call builds its own CybORG instance (Red played by RedMeanderAgent),
    with the episode length capped at ``args.step_per_epoch``.
    """
    return ChallengeWrapper(
        env=CybORG(path, 'sim', agents={'Red': RedMeanderAgent}),
        agent_name="Blue",
        max_steps=args.step_per_epoch,
    )


# env
CYBORG = CybORG(path, 'sim', agents={'Red': RedMeanderAgent})
# Stand-alone environment, used by later cells to read the space sizes.
env = ChallengeWrapper(env=CYBORG, agent_name="Blue", max_steps=args.step_per_epoch)
# One environment each for training and testing, built by the shared factory.
train_envs = DummyVectorEnv([make_blue_env for _ in range(1)])
test_envs = DummyVectorEnv([make_blue_env for _ in range(1)])

In [55]:
# Store the observation/action sizes on `args`: the space's `.shape` is used
# when it is truthy, otherwise its `.n` attribute.
args.state_shape = env.observation_space.shape or env.observation_space.n
args.action_shape = env.action_space.shape or env.action_space.n
print("Observations shape:", args.state_shape)
print("Actions shape:", args.action_shape)

Observations shape: (52,)
Actions shape: 54


### network

In [56]:
class D5QN(nn.Module):
    """Small fully-connected Q-network: ``obs -> Linear(512) -> ReLU -> Linear(actions)``.

    Reference for the DQN algorithm: Human-level control through deep
    reinforcement learning.

    :param state_shape: observation size, as an int or a shape sequence; the
        product of its entries is the input width of the first layer.
    :param action_shape: number of actions, as an int or a shape sequence; the
        product of its entries is the width of the Q-value head.
    :param device: device that observations are moved to in :meth:`forward`.
    :param features_only: if True, omit the Q-value head so the network outputs
        the hidden features instead of Q-values.
    :param output_dim: only used when ``features_only`` is True; if given, an
        extra ``Linear(512, output_dim) -> ReLU`` projection is appended.

    After construction ``self.output_dim`` holds the width of the network
    output (number of actions, 512, or the requested ``output_dim``).
    """

    def __init__(
        self,
        state_shape: Union[int, Sequence[int]],
        action_shape: Union[int, Sequence[int]],
        device: Union[str, int, torch.device] = "cpu",
        features_only: bool = False,
        output_dim: Optional[int] = None,
    ) -> None:
        super().__init__()
        self.device = device
        hidden_dim = 512
        # np.prod accepts both a plain int and a shape tuple.
        input_dim = int(np.prod(state_shape))
        layers = [nn.Linear(input_dim, hidden_dim), nn.ReLU(inplace=True)]
        self.output_dim = hidden_dim
        if not features_only:
            self.output_dim = int(np.prod(action_shape))
            layers.append(nn.Linear(hidden_dim, self.output_dim))
        elif output_dim is not None:
            layers.extend(
                [nn.Linear(hidden_dim, output_dim), nn.ReLU(inplace=True)]
            )
            self.output_dim = output_dim
        # A flat Sequential keeps the default parameter names
        # (net.0.*, net.2.*) so previously saved weights still load.
        self.net = nn.Sequential(*layers)

    def forward(
        self,
        obs: Union[np.ndarray, torch.Tensor],
        state: Optional[Any] = None,
        info: Optional[Dict[str, Any]] = None,
    ) -> Tuple[torch.Tensor, Any]:
        r"""Mapping: s -> Q(s, \*).

        :param obs: observation batch; converted to a float32 tensor on
            ``self.device``.
        :param state: passed through unchanged.
        :param info: unused; accepted for interface compatibility.
        :return: ``(network output, state)``.
        """
        obs = torch.as_tensor(obs, device=self.device, dtype=torch.float32)
        return self.net(obs), state

### define policy

In [57]:
# Q-network (moved onto the configured device) and its Adam optimiser
net = D5QN(args.state_shape[0], args.action_shape, args.device)
net = net.to(args.device)
optim = torch.optim.Adam(net.parameters(), lr=args.lr)

# DQN policy wrapping the network; arguments are spelled out by keyword
policy = DQNPolicy(
    model=net,
    optim=optim,
    discount_factor=args.gamma,
    estimation_step=args.n_step,
    target_update_freq=args.target_update_freq,
)

In [58]:
# Optional Intrinsic Curiosity Module: only enabled for a positive lr scale.
if args.icm_lr_scale > 0:
    # Feature extractor for ICM: a plain module mapping an observation tensor
    # to a `feature_dim`-wide feature tensor. It is built directly so that
    # both the module and its output width are known in this cell.
    feature_dim = 512
    feature_net = nn.Sequential(
        nn.Linear(args.state_shape[0], feature_dim), nn.ReLU(inplace=True)
    )
    # Plain Python int rather than a NumPy scalar for the layer sizes.
    action_dim = int(np.prod(args.action_shape))
    icm_net = IntrinsicCuriosityModule(
        feature_net,
        feature_dim,
        action_dim,
        hidden_sizes=[512],
        device=args.device
    )
    icm_optim = torch.optim.Adam(icm_net.parameters(), lr=args.lr)
    # Wrap the DQN policy; the wrapped policy replaces `policy` from here on.
    policy = ICMPolicy(
        policy, icm_net, icm_optim, args.icm_lr_scale, args.icm_reward_scale,
        args.icm_forward_loss_weight
    ).to(args.device)

### load policy

In [59]:
# load a previous policy
if args.resume_path:
    loaded_state = torch.load(args.resume_path, map_location=args.device)
    # save_checkpoint_fn wraps the weights as {"model": state_dict}, whereas
    # save_best_fn stores the bare state_dict; accept either file layout.
    if isinstance(loaded_state, dict) and "model" in loaded_state:
        loaded_state = loaded_state["model"]
    policy.load_state_dict(loaded_state)
    print("Loaded agent from: ", args.resume_path)

### replay buffer

In [60]:
# Replay buffer for training, split into one sub-buffer per training env
# (`buffer_num=len(train_envs)`), with a total size of `args.buffer_size`.
# `save_only_last_obs` is disabled here; `ignore_obs_next` and `stack_num` are
# memory-related options — presumably they can be dropped when RAM is not a
# concern (TODO confirm against the tianshou ReplayBuffer documentation).
buffer = VectorReplayBuffer(
    args.buffer_size,
    buffer_num=len(train_envs),
    ignore_obs_next=True,
    save_only_last_obs=False,
    stack_num=args.frames_stack
)

### collector

In [61]:
# Collectors run `policy` in the vectorised environments. Only the training
# collector is given the replay buffer; both are created with
# `exploration_noise=True`.
train_collector = Collector(policy, train_envs, buffer, exploration_noise=True)
test_collector = Collector(policy, test_envs, exploration_noise=True)

### log

In [62]:
# Log directory layout: <logdir>/<task>/<algo_name>/<seed>/<timestamp>.
# The algorithm name reflects whether the ICM wrapper is enabled.
now = datetime.datetime.now().strftime("%y%m%d-%H%M%S")
args.algo_name = "dqn_icm" if args.icm_lr_scale > 0 else "dqn"
log_name = os.path.join(args.task, args.algo_name, str(args.seed), now)
log_path = os.path.join(args.logdir, log_name)

In [63]:
# Logger setup. For wandb the WandbLogger is constructed first and the
# SummaryWriter is attached to it afterwards via `logger.load(writer)`; for
# tensorboard the writer is wrapped directly in a TensorboardLogger.
# NOTE(review): the statement order is kept as-is — presumably the wandb run
# has to exist before the SummaryWriter is created; confirm before reordering.
if args.logger == "wandb":
    logger = WandbLogger(
        save_interval=1,
        name=log_name.replace(os.path.sep, "__"),
        run_id=args.resume_id,
        config=args,
        project=args.wandb_project,
    )
# The writer is needed for both logger kinds; the full config is recorded as text.
writer = SummaryWriter(log_path)
writer.add_text("args", str(args))
if args.logger == "tensorboard":
    logger = TensorboardLogger(writer)
else:  # wandb
    logger.load(writer)

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mhongxi[0m. Use [1m`wandb login --relogin`[0m to force relogin
/usr/bin/nvidia-modprobe: unrecognized option: "-s"

ERROR: Invalid commandline, please run `/usr/bin/nvidia-modprobe --help` for usage information.


/usr/bin/nvidia-modprobe: unrecognized option: "-s"

ERROR: Invalid commandline, please run `/usr/bin/nvidia-modprobe --help` for usage information.


  from IPython.core.display import HTML, display  # type: ignore


### train helper functions

In [64]:
def save_best_fn(policy):
    """Write the weights of ``policy`` to ``policy.pth`` inside ``log_path``."""
    best_weights_file = os.path.join(log_path, "policy.pth")
    torch.save(policy.state_dict(), best_weights_file)

def stop_fn(mean_rewards):
    """Return whether ``mean_rewards`` has reached the stopping target of 20."""
    reward_target = 20
    return mean_rewards >= reward_target

def train_fn(epoch, env_step):
    """Set the training exploration rate for the current environment step.

    Epsilon is interpolated linearly from ``args.eps_train`` to
    ``args.eps_train_final`` over the first 1e6 environment steps and held at
    the final value afterwards. The value is also written to the logger
    whenever ``env_step`` is a multiple of 1000.
    """
    # nature DQN setting, linear decay in the first 1M steps
    decay_steps = 1e6
    if env_step > decay_steps:
        eps = args.eps_train_final
    else:
        eps_span = args.eps_train - args.eps_train_final
        eps = args.eps_train - env_step / decay_steps * eps_span
    policy.set_eps(eps)

    on_log_step = env_step % 1000 == 0
    if on_log_step:
        logger.write("train/env_step", env_step, {"train/eps": eps})

def test_fn(epoch, env_step):
    """Switch the policy to the evaluation exploration rate ``args.eps_test``.

    Both arguments are accepted but not used.
    """
    policy.set_eps(args.eps_test)

def save_checkpoint_fn(epoch, env_step, gradient_step):
    """Save the current policy weights as ``checkpoint.pth`` and return its path.

    The file holds a dict with a single ``"model"`` entry containing the
    policy's state_dict. The three counters are accepted but not used.
    """
    # see also: https://pytorch.org/tutorials/beginner/saving_loading_models.html
    checkpoint_file = os.path.join(log_path, "checkpoint.pth")
    payload = {"model": policy.state_dict()}
    torch.save(payload, checkpoint_file)
    return checkpoint_file

# watch agent's performance
def watch():
    """Evaluate the current policy on the test environments and print the mean reward.

    The policy is put in eval mode with ``args.eps_test`` exploration and the
    test environments are seeded with ``args.seed``. Then either:

    * ``args.save_buffer_name`` is set: ``args.buffer_size`` steps are collected
      into a fresh replay buffer, which is saved to that path as HDF5; or
    * otherwise: ``args.test_num`` episodes are run with the shared
      ``test_collector``.
    """
    print("Setup test envs ...")
    policy.eval()
    policy.set_eps(args.eps_test)
    test_envs.seed(args.seed)
    if args.save_buffer_name:
        print(f"Generate buffer with size {args.buffer_size}")
        # Observations here are flat vectors, not frame stacks, so the whole
        # observation must be stored: `save_only_last_obs=True` would keep only
        # the last entry along the first observation axis. This now matches the
        # setting of the training buffer.
        dump_buffer = VectorReplayBuffer(
            args.buffer_size,
            buffer_num=len(test_envs),
            ignore_obs_next=True,
            save_only_last_obs=False,
            stack_num=args.frames_stack
        )
        collector = Collector(policy, test_envs, dump_buffer, exploration_noise=True)
        result = collector.collect(n_step=args.buffer_size)
        print(f"Save buffer into {args.save_buffer_name}")
        # Unfortunately, pickle will cause oom with 1M buffer size
        dump_buffer.save_hdf5(args.save_buffer_name)
    else:
        print("Testing agent ...")
        test_collector.reset()
        result = test_collector.collect(
            n_episode=args.test_num, render=args.render
        )
    rew = result["rews"].mean()
    print(f"Mean reward (over {result['n/ep']} episodes): {rew}")

In [65]:
### train core

In [None]:
if args.watch:
    # Evaluation-only mode. An if/else is used instead of `exit(0)`: inside a
    # notebook `exit` acts on the kernel rather than simply skipping the rest
    # of the cell.
    watch()
else:
    # test train_collector and start filling replay buffer
    train_collector.collect(n_step=args.batch_size * args.training_num)
    # trainer
    result = offpolicy_trainer(
        policy,
        train_collector,
        test_collector,
        args.epoch,
        args.step_per_epoch,
        args.step_per_collect,
        args.test_num,
        args.batch_size,
        train_fn=train_fn,
        test_fn=test_fn,
        stop_fn=stop_fn,
        save_best_fn=save_best_fn,
        logger=logger,
        update_per_step=args.update_per_step,
        test_in_train=False,
        resume_from_log=args.resume_id is not None,
        save_checkpoint_fn=save_checkpoint_fn,
    )

    # The notebook binds the name `pprint` both to the module (`import pprint`)
    # and, in a later cell, to the function (`from pprint import pprint`), so
    # `pprint.pprint(...)` fails with AttributeError once both cells have run.
    # Import the function under an unambiguous local name instead.
    from pprint import pprint as print_result

    print_result(result)
    # Final evaluation of the trained policy.
    watch()

Epoch #1: 101it [00:10,  9.71it/s, env_step=100, len=100, loss=92.949, n/ep=1, n/st=100, rew=-842.70]                         


Epoch #1: test_reward: -285.820000 ± 168.290902, best_reward: -285.820000 ± 168.290902 in #1


Epoch #2: 101it [00:08, 12.47it/s, env_step=200, len=100, loss=66.129, n/ep=1, n/st=100, rew=-318.60]                         


Epoch #2: test_reward: -713.090000 ± 285.517202, best_reward: -285.820000 ± 168.290902 in #1


Epoch #3: 101it [00:07, 13.33it/s, env_step=300, len=100, loss=56.188, n/ep=1, n/st=100, rew=-759.70]                         


Epoch #3: test_reward: -831.650000 ± 214.775903, best_reward: -285.820000 ± 168.290902 in #1


Epoch #4: 101it [00:10,  9.67it/s, env_step=400, len=100, loss=35.148, n/ep=1, n/st=100, rew=-254.20]                         


Epoch #4: test_reward: -730.520000 ± 240.348097, best_reward: -285.820000 ± 168.290902 in #1


Epoch #5: 101it [00:28,  3.59it/s, env_step=500, len=100, loss=19.899, n/ep=1, n/st=100, rew=-256.10]                         


Epoch #5: test_reward: -845.760000 ± 173.374844, best_reward: -285.820000 ± 168.290902 in #1


Epoch #6: 101it [00:08, 11.79it/s, env_step=600, len=100, loss=12.948, n/ep=1, n/st=100, rew=-84.40]                         


Epoch #6: test_reward: -671.900000 ± 322.736038, best_reward: -285.820000 ± 168.290902 in #1


Epoch #7: 101it [00:13,  7.39it/s, env_step=700, len=100, loss=8.648, n/ep=1, n/st=100, rew=-188.40]                         


Epoch #7: test_reward: -611.690000 ± 360.334779, best_reward: -285.820000 ± 168.290902 in #1


Epoch #8: 101it [00:10,  9.58it/s, env_step=800, len=100, loss=7.304, n/ep=1, n/st=100, rew=-225.30]                         


Epoch #8: test_reward: -731.490000 ± 270.565413, best_reward: -285.820000 ± 168.290902 in #1


Epoch #9: 101it [00:04, 23.17it/s, env_step=900, len=100, loss=9.258, n/ep=1, n/st=100, rew=-738.00]                         


Epoch #9: test_reward: -502.700000 ± 317.856166, best_reward: -285.820000 ± 168.290902 in #1


Epoch #10: 101it [00:13,  7.60it/s, env_step=1000, len=100, loss=8.268, n/ep=1, n/st=100, rew=-258.60]                         


Epoch #10: test_reward: -554.730000 ± 339.279021, best_reward: -285.820000 ± 168.290902 in #1


Epoch #11: 101it [00:10,  9.80it/s, env_step=1100, len=100, loss=11.095, n/ep=1, n/st=100, rew=-293.80]                         


Epoch #11: test_reward: -836.810000 ± 197.974324, best_reward: -285.820000 ± 168.290902 in #1


Epoch #12: 101it [00:24,  4.12it/s, env_step=1200, len=100, loss=10.064, n/ep=1, n/st=100, rew=-662.20]                         


Epoch #12: test_reward: -888.450000 ± 48.562707, best_reward: -285.820000 ± 168.290902 in #1


Epoch #13: 101it [00:24,  4.17it/s, env_step=1300, len=100, loss=8.525, n/ep=1, n/st=100, rew=-228.90]                         


Epoch #13: test_reward: -779.250000 ± 232.670979, best_reward: -285.820000 ± 168.290902 in #1


Epoch #14: 101it [00:13,  7.25it/s, env_step=1400, len=100, loss=7.389, n/ep=1, n/st=100, rew=-253.90]                         


Epoch #14: test_reward: -570.120000 ± 322.016987, best_reward: -285.820000 ± 168.290902 in #1


Epoch #15: 101it [00:18,  5.46it/s, env_step=1500, len=100, loss=7.092, n/ep=1, n/st=100, rew=-161.60]                         


Epoch #15: test_reward: -392.170000 ± 191.119523, best_reward: -285.820000 ± 168.290902 in #1


Epoch #16: 101it [00:28,  3.49it/s, env_step=1600, len=100, loss=6.888, n/ep=1, n/st=100, rew=-273.90]                          


Epoch #16: test_reward: -425.520000 ± 242.266419, best_reward: -285.820000 ± 168.290902 in #1


Epoch #17: 101it [00:28,  3.58it/s, env_step=1700, len=100, loss=6.049, n/ep=1, n/st=100, rew=-231.80]                         


Epoch #17: test_reward: -320.890000 ± 204.591234, best_reward: -285.820000 ± 168.290902 in #1


Epoch #18: 101it [00:27,  3.71it/s, env_step=1800, len=100, loss=5.859, n/ep=1, n/st=100, rew=-65.40]                         


Epoch #18: test_reward: -249.690000 ± 81.046437, best_reward: -249.690000 ± 81.046437 in #18


Epoch #19: 101it [00:26,  3.80it/s, env_step=1900, len=100, loss=5.330, n/ep=1, n/st=100, rew=-268.40]                         


Epoch #19: test_reward: -624.850000 ± 312.886137, best_reward: -249.690000 ± 81.046437 in #18


Epoch #20: 101it [00:25,  3.89it/s, env_step=2000, len=100, loss=5.319, n/ep=1, n/st=100, rew=-203.10]                         


Epoch #20: test_reward: -333.260000 ± 156.101142, best_reward: -249.690000 ± 81.046437 in #18


Epoch #21: 101it [00:08, 11.72it/s, env_step=2100, len=100, loss=7.862, n/ep=1, n/st=100, rew=-234.20]                         


Epoch #21: test_reward: -406.820000 ± 269.677284, best_reward: -249.690000 ± 81.046437 in #18


Epoch #22: 101it [00:25,  3.90it/s, env_step=2200, len=100, loss=5.301, n/ep=1, n/st=100, rew=-193.40]                         


Epoch #22: test_reward: -426.350000 ± 274.812632, best_reward: -249.690000 ± 81.046437 in #18


Epoch #23: 101it [00:16,  6.26it/s, env_step=2300, len=100, loss=4.670, n/ep=1, n/st=100, rew=-310.60]                         


Epoch #23: test_reward: -353.370000 ± 179.759857, best_reward: -249.690000 ± 81.046437 in #18


Epoch #24: 101it [00:41,  2.41it/s, env_step=2400, len=100, loss=4.489, n/ep=1, n/st=100, rew=-312.30]                         


Epoch #24: test_reward: -261.770000 ± 64.312192, best_reward: -249.690000 ± 81.046437 in #18


Epoch #25: 101it [00:35,  2.81it/s, env_step=2500, len=100, loss=4.497, n/ep=1, n/st=100, rew=-162.70]                         


Epoch #25: test_reward: -309.260000 ± 217.297870, best_reward: -249.690000 ± 81.046437 in #18


Epoch #26: 101it [00:13,  7.49it/s, env_step=2600, len=100, loss=4.302, n/ep=1, n/st=100, rew=-276.40]                         


Epoch #26: test_reward: -800.980000 ± 209.986189, best_reward: -249.690000 ± 81.046437 in #18


Epoch #27: 101it [00:27,  3.63it/s, env_step=2700, len=100, loss=4.472, n/ep=1, n/st=100, rew=-113.10]                         


Epoch #27: test_reward: -452.130000 ± 270.464593, best_reward: -249.690000 ± 81.046437 in #18


Epoch #28: 101it [00:03, 27.03it/s, env_step=2800, len=100, loss=3.847, n/ep=1, n/st=100, rew=-172.20]                          


Epoch #28: test_reward: -716.580000 ± 245.557215, best_reward: -249.690000 ± 81.046437 in #18


Epoch #29: 101it [00:09, 10.44it/s, env_step=2900, len=100, loss=3.818, n/ep=1, n/st=100, rew=-250.90]                         


Epoch #29: test_reward: -629.250000 ± 251.530206, best_reward: -249.690000 ± 81.046437 in #18


Epoch #30: 101it [00:25,  4.04it/s, env_step=3000, len=100, loss=3.548, n/ep=1, n/st=100, rew=-243.50]                         


Epoch #30: test_reward: -484.860000 ± 264.596316, best_reward: -249.690000 ± 81.046437 in #18


Epoch #31: 101it [00:22,  4.40it/s, env_step=3100, len=100, loss=5.630, n/ep=1, n/st=100, rew=-525.40]                          


Epoch #31: test_reward: -218.860000 ± 35.732176, best_reward: -218.860000 ± 35.732176 in #31


Epoch #32: 101it [00:24,  4.09it/s, env_step=3200, len=100, loss=5.046, n/ep=1, n/st=100, rew=-192.10]                         


Epoch #32: test_reward: -250.150000 ± 29.820102, best_reward: -218.860000 ± 35.732176 in #31


Epoch #33: 101it [00:31,  3.25it/s, env_step=3300, len=100, loss=4.451, n/ep=1, n/st=100, rew=-190.70]                         


Epoch #33: test_reward: -267.100000 ± 35.795503, best_reward: -218.860000 ± 35.732176 in #31


Epoch #34: 101it [00:35,  2.84it/s, env_step=3400, len=100, loss=4.579, n/ep=1, n/st=100, rew=-292.30]                         


Epoch #34: test_reward: -279.800000 ± 38.129018, best_reward: -218.860000 ± 35.732176 in #31


Epoch #35: 101it [00:26,  3.87it/s, env_step=3500, len=100, loss=4.210, n/ep=1, n/st=100, rew=-78.30]                         


Epoch #35: test_reward: -257.800000 ± 40.119771, best_reward: -218.860000 ± 35.732176 in #31


Epoch #36: 101it [00:30,  3.28it/s, env_step=3600, len=100, loss=4.160, n/ep=1, n/st=100, rew=-216.10]                         


Epoch #36: test_reward: -274.060000 ± 54.552382, best_reward: -218.860000 ± 35.732176 in #31


Epoch #37: 101it [00:25,  3.89it/s, env_step=3700, len=100, loss=4.066, n/ep=1, n/st=100, rew=-254.10]                         


Epoch #37: test_reward: -226.140000 ± 76.448678, best_reward: -218.860000 ± 35.732176 in #31


Epoch #38: 101it [00:26,  3.82it/s, env_step=3800, len=100, loss=3.873, n/ep=1, n/st=100, rew=-30.00]                         


Epoch #38: test_reward: -229.970000 ± 32.485999, best_reward: -218.860000 ± 35.732176 in #31


Epoch #39: 101it [00:06, 15.81it/s, env_step=3900, len=100, loss=3.409, n/ep=1, n/st=100, rew=-249.60]                         


Epoch #39: test_reward: -231.810000 ± 56.843055, best_reward: -218.860000 ± 35.732176 in #31


Epoch #40: 101it [00:21,  4.77it/s, env_step=4000, len=100, loss=4.058, n/ep=1, n/st=100, rew=-806.80]                         


Epoch #40: test_reward: -344.240000 ± 231.750561, best_reward: -218.860000 ± 35.732176 in #31


Epoch #41: 101it [00:14,  7.19it/s, env_step=4100, len=100, loss=5.244, n/ep=1, n/st=100, rew=-291.80]                         


Epoch #41: test_reward: -244.800000 ± 30.321807, best_reward: -218.860000 ± 35.732176 in #31


Epoch #42: 101it [00:28,  3.60it/s, env_step=4200, len=100, loss=5.616, n/ep=1, n/st=100, rew=-294.30]                         


Epoch #42: test_reward: -295.930000 ± 44.319613, best_reward: -218.860000 ± 35.732176 in #31


Epoch #43: 101it [00:17,  5.90it/s, env_step=4300, len=100, loss=4.790, n/ep=1, n/st=100, rew=-497.90]                         


Epoch #43: test_reward: -278.740000 ± 19.182346, best_reward: -218.860000 ± 35.732176 in #31


Epoch #44: 101it [00:25,  3.99it/s, env_step=4400, len=100, loss=4.421, n/ep=1, n/st=100, rew=-65.80]                         


Epoch #44: test_reward: -259.020000 ± 46.421047, best_reward: -218.860000 ± 35.732176 in #31


Epoch #45: 101it [00:05, 19.30it/s, env_step=4500, len=100, loss=4.648, n/ep=1, n/st=100, rew=-246.80]                         


Epoch #45: test_reward: -283.250000 ± 59.260362, best_reward: -218.860000 ± 35.732176 in #31


Epoch #46: 101it [00:06, 16.46it/s, env_step=4600, len=100, loss=4.072, n/ep=1, n/st=100, rew=-228.30]                          


Epoch #46: test_reward: -246.450000 ± 51.764800, best_reward: -218.860000 ± 35.732176 in #31


Epoch #47: 101it [00:09, 10.94it/s, env_step=4700, len=100, loss=4.522, n/ep=1, n/st=100, rew=-217.90]                         


Epoch #47: test_reward: -201.270000 ± 45.011266, best_reward: -201.270000 ± 45.011266 in #47


Epoch #48: 101it [00:04, 23.37it/s, env_step=4800, len=100, loss=4.516, n/ep=1, n/st=100, rew=-607.50]                          


Epoch #48: test_reward: -261.860000 ± 40.026821, best_reward: -201.270000 ± 45.011266 in #47


Epoch #49: 101it [00:03, 32.73it/s, env_step=4900, len=100, loss=4.997, n/ep=1, n/st=100, rew=-214.10]                         


Epoch #49: test_reward: -254.970000 ± 54.879560, best_reward: -201.270000 ± 45.011266 in #47


Epoch #50: 101it [00:07, 12.67it/s, env_step=5000, len=100, loss=4.486, n/ep=1, n/st=100, rew=-200.50]                         


Epoch #50: test_reward: -302.700000 ± 85.971065, best_reward: -201.270000 ± 45.011266 in #47


Epoch #51: 101it [00:02, 40.89it/s, env_step=5100, len=100, loss=4.740, n/ep=1, n/st=100, rew=-114.90]                         


Epoch #51: test_reward: -226.660000 ± 22.475329, best_reward: -201.270000 ± 45.011266 in #47


Epoch #52: 101it [00:01, 50.52it/s, env_step=5200, len=100, loss=4.513, n/ep=1, n/st=100, rew=-108.60]                          


Epoch #52: test_reward: -223.730000 ± 25.244209, best_reward: -201.270000 ± 45.011266 in #47


Epoch #53: 101it [00:06, 14.50it/s, env_step=5300, len=100, loss=4.858, n/ep=1, n/st=100, rew=-102.60]                         


Epoch #53: test_reward: -245.140000 ± 33.046277, best_reward: -201.270000 ± 45.011266 in #47


Epoch #54: 101it [00:05, 19.05it/s, env_step=5400, len=100, loss=4.481, n/ep=1, n/st=100, rew=-282.30]                          


Epoch #54: test_reward: -199.490000 ± 40.908934, best_reward: -199.490000 ± 40.908934 in #54


Epoch #55: 101it [00:14,  7.05it/s, env_step=5500, len=100, loss=4.480, n/ep=1, n/st=100, rew=-143.00]                         


Epoch #55: test_reward: -344.390000 ± 207.901936, best_reward: -199.490000 ± 40.908934 in #54


Epoch #56: 101it [00:04, 20.26it/s, env_step=5600, len=100, loss=4.253, n/ep=1, n/st=100, rew=-76.10]                         


Epoch #56: test_reward: -616.840000 ± 260.478580, best_reward: -199.490000 ± 40.908934 in #54


Epoch #57: 101it [00:13,  7.42it/s, env_step=5700, len=100, loss=4.773, n/ep=1, n/st=100, rew=-240.40]                         


Epoch #57: test_reward: -238.530000 ± 57.075582, best_reward: -199.490000 ± 40.908934 in #54


Epoch #58: 101it [00:03, 31.40it/s, env_step=5800, len=100, loss=4.117, n/ep=1, n/st=100, rew=-89.50]                         


Epoch #58: test_reward: -232.630000 ± 62.143271, best_reward: -199.490000 ± 40.908934 in #54


Epoch #59: 101it [00:08, 11.76it/s, env_step=5900, len=100, loss=4.589, n/ep=1, n/st=100, rew=-822.10]                         


Epoch #59: test_reward: -278.720000 ± 48.534437, best_reward: -199.490000 ± 40.908934 in #54


Epoch #60: 101it [00:01, 75.01it/s, env_step=6000, len=100, loss=4.072, n/ep=1, n/st=100, rew=-185.70]                          


Epoch #60: test_reward: -274.180000 ± 35.749400, best_reward: -199.490000 ± 40.908934 in #54


Epoch #61: 101it [00:04, 24.79it/s, env_step=6100, len=100, loss=4.101, n/ep=1, n/st=100, rew=-248.10]                         


Epoch #61: test_reward: -200.860000 ± 85.467844, best_reward: -199.490000 ± 40.908934 in #54


Epoch #62: 101it [00:01, 73.15it/s, env_step=6200, len=100, loss=4.217, n/ep=1, n/st=100, rew=-71.40]                          


Epoch #62: test_reward: -241.070000 ± 56.269495, best_reward: -199.490000 ± 40.908934 in #54


Epoch #63: 101it [00:06, 16.14it/s, env_step=6300, len=100, loss=3.894, n/ep=1, n/st=100, rew=-168.10]                         


Epoch #63: test_reward: -176.290000 ± 47.673944, best_reward: -176.290000 ± 47.673944 in #63


Epoch #64: 101it [00:07, 14.17it/s, env_step=6400, len=100, loss=4.389, n/ep=1, n/st=100, rew=-194.80]                         


Epoch #64: test_reward: -301.340000 ± 221.433589, best_reward: -176.290000 ± 47.673944 in #63


Epoch #65: 101it [00:09, 10.30it/s, env_step=6500, len=100, loss=4.208, n/ep=1, n/st=100, rew=-253.00]                          


Epoch #65: test_reward: -190.030000 ± 58.252297, best_reward: -176.290000 ± 47.673944 in #63


Epoch #66: 101it [00:04, 20.80it/s, env_step=6600, len=100, loss=3.614, n/ep=1, n/st=100, rew=-217.90]                          


Epoch #66: test_reward: -299.020000 ± 166.707581, best_reward: -176.290000 ± 47.673944 in #63


Epoch #67: 101it [00:06, 14.73it/s, env_step=6700, len=100, loss=3.569, n/ep=1, n/st=100, rew=-163.80]                          


Epoch #67: test_reward: -228.960000 ± 40.812895, best_reward: -176.290000 ± 47.673944 in #63


Epoch #68: 101it [00:08, 12.58it/s, env_step=6800, len=100, loss=3.716, n/ep=1, n/st=100, rew=-239.10]                         


Epoch #68: test_reward: -216.590000 ± 53.199388, best_reward: -176.290000 ± 47.673944 in #63


Epoch #69: 101it [00:05, 17.71it/s, env_step=6900, len=100, loss=3.580, n/ep=1, n/st=100, rew=-74.60]                         


Epoch #69: test_reward: -361.150000 ± 228.482552, best_reward: -176.290000 ± 47.673944 in #63


Epoch #70: 101it [00:13,  7.73it/s, env_step=7000, len=100, loss=3.925, n/ep=1, n/st=100, rew=-196.90]                         


Epoch #70: test_reward: -398.940000 ± 271.746184, best_reward: -176.290000 ± 47.673944 in #63


Epoch #71: 101it [00:09, 10.37it/s, env_step=7100, len=100, loss=4.202, n/ep=1, n/st=100, rew=-247.00]                          


Epoch #71: test_reward: -276.300000 ± 47.542108, best_reward: -176.290000 ± 47.673944 in #63


Epoch #72: 101it [00:04, 24.02it/s, env_step=7200, len=100, loss=4.229, n/ep=1, n/st=100, rew=-102.30]                          


Epoch #72: test_reward: -143.720000 ± 48.588842, best_reward: -143.720000 ± 48.588842 in #72


Epoch #73: 101it [00:02, 39.69it/s, env_step=7300, len=100, loss=3.673, n/ep=1, n/st=100, rew=-209.10]                         


Epoch #73: test_reward: -114.030000 ± 58.173861, best_reward: -114.030000 ± 58.173861 in #73


Epoch #74: 101it [00:05, 16.95it/s, env_step=7400, len=100, loss=3.689, n/ep=1, n/st=100, rew=-173.90]                         


Epoch #74: test_reward: -183.630000 ± 58.015240, best_reward: -114.030000 ± 58.173861 in #73


Epoch #75: 101it [00:04, 21.60it/s, env_step=7500, len=100, loss=3.492, n/ep=1, n/st=100, rew=-382.70]                         


Epoch #75: test_reward: -156.150000 ± 91.154706, best_reward: -114.030000 ± 58.173861 in #73


Epoch #76: 101it [00:03, 28.32it/s, env_step=7600, len=100, loss=4.100, n/ep=1, n/st=100, rew=-189.70]                          


Epoch #76: test_reward: -175.160000 ± 38.522570, best_reward: -114.030000 ± 58.173861 in #73


Epoch #77: 101it [00:05, 18.72it/s, env_step=7700, len=100, loss=4.043, n/ep=1, n/st=100, rew=-153.10]                         


Epoch #77: test_reward: -132.690000 ± 56.975248, best_reward: -114.030000 ± 58.173861 in #73


Epoch #78: 101it [00:04, 20.33it/s, env_step=7800, len=100, loss=3.810, n/ep=1, n/st=100, rew=-185.80]                         


Epoch #78: test_reward: -152.200000 ± 48.571576, best_reward: -114.030000 ± 58.173861 in #73


Epoch #79: 101it [00:05, 19.00it/s, env_step=7900, len=100, loss=4.029, n/ep=1, n/st=100, rew=-143.40]                         


Epoch #79: test_reward: -135.440000 ± 19.503395, best_reward: -114.030000 ± 58.173861 in #73


Epoch #80: 101it [00:05, 17.70it/s, env_step=8000, len=100, loss=4.069, n/ep=1, n/st=100, rew=-187.80]                         


Epoch #80: test_reward: -134.100000 ± 46.191623, best_reward: -114.030000 ± 58.173861 in #73


Epoch #81: 101it [00:06, 16.29it/s, env_step=8100, len=100, loss=4.167, n/ep=1, n/st=100, rew=-451.20]                         


Epoch #81: test_reward: -167.990000 ± 65.984156, best_reward: -114.030000 ± 58.173861 in #73


Epoch #82: 101it [00:05, 19.38it/s, env_step=8200, len=100, loss=3.979, n/ep=1, n/st=100, rew=-113.90]                         


Epoch #82: test_reward: -171.970000 ± 63.001048, best_reward: -114.030000 ± 58.173861 in #73


Epoch #83: 101it [00:06, 15.90it/s, env_step=8300, len=100, loss=4.014, n/ep=1, n/st=100, rew=-170.00]                          


Epoch #83: test_reward: -139.890000 ± 52.985459, best_reward: -114.030000 ± 58.173861 in #73


Epoch #84: 101it [00:06, 15.14it/s, env_step=8400, len=100, loss=5.002, n/ep=1, n/st=100, rew=-694.90]                         


Epoch #84: test_reward: -103.640000 ± 55.452217, best_reward: -103.640000 ± 55.452217 in #84


Epoch #85: 101it [00:03, 29.94it/s, env_step=8500, len=100, loss=4.824, n/ep=1, n/st=100, rew=-280.80]                         


Epoch #85: test_reward: -91.800000 ± 65.556312, best_reward: -91.800000 ± 65.556312 in #85


Epoch #86: 101it [00:16,  6.03it/s, env_step=8600, len=100, loss=4.423, n/ep=1, n/st=100, rew=-263.90]                         


Epoch #86: test_reward: -156.590000 ± 69.627961, best_reward: -91.800000 ± 65.556312 in #85


Epoch #87: 101it [00:04, 22.75it/s, env_step=8700, len=100, loss=5.283, n/ep=1, n/st=100, rew=-891.00]                         


Epoch #87: test_reward: -127.730000 ± 79.292169, best_reward: -91.800000 ± 65.556312 in #85


Epoch #88: 101it [00:09, 11.16it/s, env_step=8800, len=100, loss=5.787, n/ep=1, n/st=100, rew=-386.10]                         


Epoch #88: test_reward: -122.080000 ± 64.270768, best_reward: -91.800000 ± 65.556312 in #85


Epoch #89: 101it [00:05, 18.89it/s, env_step=8900, len=100, loss=5.512, n/ep=1, n/st=100, rew=-222.90]                         


Epoch #89: test_reward: -108.560000 ± 61.557229, best_reward: -91.800000 ± 65.556312 in #85


Epoch #90: 101it [00:15,  6.56it/s, env_step=9000, len=100, loss=5.194, n/ep=1, n/st=100, rew=-235.90]                         


Epoch #90: test_reward: -115.640000 ± 59.706335, best_reward: -91.800000 ± 65.556312 in #85


Epoch #91: 101it [00:01, 76.36it/s, env_step=9100, len=100, loss=5.718, n/ep=1, n/st=100, rew=-166.30]                          


Epoch #91: test_reward: -133.700000 ± 77.113968, best_reward: -91.800000 ± 65.556312 in #85


Epoch #92: 101it [00:10,  9.47it/s, env_step=9200, len=100, loss=4.730, n/ep=1, n/st=100, rew=-37.70]                         


Epoch #92: test_reward: -138.630000 ± 73.534564, best_reward: -91.800000 ± 65.556312 in #85


Epoch #93: 101it [00:06, 15.26it/s, env_step=9300, len=100, loss=4.737, n/ep=1, n/st=100, rew=-111.40]                         


Epoch #93: test_reward: -127.210000 ± 64.638927, best_reward: -91.800000 ± 65.556312 in #85


Epoch #94: 101it [00:08, 11.97it/s, env_step=9400, len=100, loss=4.180, n/ep=1, n/st=100, rew=-282.00]                         


Epoch #94: test_reward: -138.200000 ± 82.384804, best_reward: -91.800000 ± 65.556312 in #85


Epoch #95: 101it [00:07, 12.95it/s, env_step=9500, len=100, loss=5.233, n/ep=1, n/st=100, rew=-309.70]                          


Epoch #95: test_reward: -89.040000 ± 55.656432, best_reward: -89.040000 ± 55.656432 in #95


Epoch #96: 101it [00:04, 22.50it/s, env_step=9600, len=100, loss=5.144, n/ep=1, n/st=100, rew=-225.40]                         


Epoch #96: test_reward: -112.970000 ± 45.312163, best_reward: -89.040000 ± 55.656432 in #95


Epoch #97: 101it [00:04, 21.69it/s, env_step=9700, len=100, loss=4.968, n/ep=1, n/st=100, rew=-219.40]                         


Epoch #97: test_reward: -144.560000 ± 202.897202, best_reward: -89.040000 ± 55.656432 in #95


Epoch #98: 101it [00:09, 10.32it/s, env_step=9800, len=100, loss=5.035, n/ep=1, n/st=100, rew=-397.00]                         


Epoch #98: test_reward: -112.700000 ± 64.781263, best_reward: -89.040000 ± 55.656432 in #95


Epoch #99: 101it [00:04, 20.35it/s, env_step=9900, len=100, loss=4.760, n/ep=1, n/st=100, rew=-322.90]                          


Epoch #99: test_reward: -97.200000 ± 50.869362, best_reward: -89.040000 ± 55.656432 in #95


Epoch #100: 101it [00:15,  6.39it/s, env_step=10000, len=100, loss=4.362, n/ep=1, n/st=100, rew=-219.80]                         


Epoch #100: test_reward: -113.230000 ± 64.411133, best_reward: -89.040000 ± 55.656432 in #95


Epoch #101: 101it [00:02, 44.66it/s, env_step=10100, len=100, loss=5.204, n/ep=1, n/st=100, rew=-46.40]                         


Epoch #101: test_reward: -105.540000 ± 58.696460, best_reward: -89.040000 ± 55.656432 in #95


Epoch #102: 101it [00:18,  5.53it/s, env_step=10200, len=100, loss=5.042, n/ep=1, n/st=100, rew=-242.90]                         


Epoch #102: test_reward: -93.910000 ± 58.801266, best_reward: -89.040000 ± 55.656432 in #95


Epoch #103: 101it [00:02, 47.34it/s, env_step=10300, len=100, loss=4.876, n/ep=1, n/st=100, rew=-154.00]                         


Epoch #103: test_reward: -146.560000 ± 121.039979, best_reward: -89.040000 ± 55.656432 in #95


Epoch #104: 101it [00:17,  5.61it/s, env_step=10400, len=100, loss=4.317, n/ep=1, n/st=100, rew=-384.10]                         


Epoch #104: test_reward: -56.050000 ± 16.038345, best_reward: -56.050000 ± 16.038345 in #104


Epoch #105: 101it [00:01, 50.53it/s, env_step=10500, len=100, loss=5.018, n/ep=1, n/st=100, rew=-200.80]                         


Epoch #105: test_reward: -91.760000 ± 58.562995, best_reward: -56.050000 ± 16.038345 in #104


Epoch #106: 101it [00:16,  6.28it/s, env_step=10600, len=100, loss=5.365, n/ep=1, n/st=100, rew=-270.40]                         


Epoch #106: test_reward: -111.700000 ± 53.875746, best_reward: -56.050000 ± 16.038345 in #104


Epoch #107: 101it [00:09, 11.12it/s, env_step=10700, len=100, loss=5.229, n/ep=1, n/st=100, rew=-237.40]                          


Epoch #107: test_reward: -146.670000 ± 62.985745, best_reward: -56.050000 ± 16.038345 in #104


Epoch #108: 101it [00:04, 23.29it/s, env_step=10800, len=100, loss=4.852, n/ep=1, n/st=100, rew=-286.90]                         


Epoch #108: test_reward: -78.760000 ± 45.144550, best_reward: -56.050000 ± 16.038345 in #104


Epoch #109: 101it [00:05, 18.10it/s, env_step=10900, len=100, loss=5.200, n/ep=1, n/st=100, rew=-68.20]                         


Epoch #109: test_reward: -147.230000 ± 112.997346, best_reward: -56.050000 ± 16.038345 in #104


Epoch #110: 101it [00:06, 15.48it/s, env_step=11000, len=100, loss=5.489, n/ep=1, n/st=100, rew=-222.50]                         


Epoch #110: test_reward: -80.550000 ± 42.190218, best_reward: -56.050000 ± 16.038345 in #104


Epoch #111: 101it [00:08, 12.17it/s, env_step=11100, len=100, loss=5.033, n/ep=1, n/st=100, rew=-35.20]                          


Epoch #111: test_reward: -112.070000 ± 63.963131, best_reward: -56.050000 ± 16.038345 in #104


Epoch #112: 101it [00:03, 31.70it/s, env_step=11200, len=100, loss=5.375, n/ep=1, n/st=100, rew=-478.00]                         


Epoch #112: test_reward: -124.150000 ± 27.650253, best_reward: -56.050000 ± 16.038345 in #104


Epoch #113: 101it [00:07, 13.13it/s, env_step=11300, len=100, loss=5.815, n/ep=1, n/st=100, rew=-322.60]                         


Epoch #113: test_reward: -91.680000 ± 48.388817, best_reward: -56.050000 ± 16.038345 in #104


Epoch #114: 101it [00:09, 10.48it/s, env_step=11400, len=100, loss=5.313, n/ep=1, n/st=100, rew=-257.80]                         


Epoch #114: test_reward: -196.260000 ± 217.693143, best_reward: -56.050000 ± 16.038345 in #104


Epoch #115: 101it [00:05, 19.30it/s, env_step=11500, len=100, loss=5.305, n/ep=1, n/st=100, rew=-248.10]                         


Epoch #115: test_reward: -162.560000 ± 175.071992, best_reward: -56.050000 ± 16.038345 in #104


Epoch #116: 101it [00:13,  7.29it/s, env_step=11600, len=100, loss=5.080, n/ep=1, n/st=100, rew=-125.60]                         


Epoch #116: test_reward: -90.610000 ± 56.619951, best_reward: -56.050000 ± 16.038345 in #104


Epoch #117: 101it [00:08, 12.49it/s, env_step=11700, len=100, loss=4.856, n/ep=1, n/st=100, rew=-236.70]                         


Epoch #117: test_reward: -146.690000 ± 47.119199, best_reward: -56.050000 ± 16.038345 in #104


Epoch #118: 101it [00:08, 12.05it/s, env_step=11800, len=100, loss=5.619, n/ep=1, n/st=100, rew=-223.80]                         


Epoch #118: test_reward: -136.390000 ± 65.969909, best_reward: -56.050000 ± 16.038345 in #104


Epoch #119: 101it [00:11,  9.13it/s, env_step=11900, len=100, loss=5.227, n/ep=1, n/st=100, rew=-122.70]                          


Epoch #119: test_reward: -121.430000 ± 75.366784, best_reward: -56.050000 ± 16.038345 in #104


Epoch #120: 101it [00:14,  7.21it/s, env_step=12000, len=100, loss=4.972, n/ep=1, n/st=100, rew=-266.60]                         


Epoch #120: test_reward: -82.090000 ± 39.434894, best_reward: -56.050000 ± 16.038345 in #104


Epoch #121: 101it [00:07, 12.88it/s, env_step=12100, len=100, loss=5.860, n/ep=1, n/st=100, rew=-222.40]                          


Epoch #121: test_reward: -104.850000 ± 54.308825, best_reward: -56.050000 ± 16.038345 in #104


Epoch #122: 101it [00:16,  6.23it/s, env_step=12200, len=100, loss=4.971, n/ep=1, n/st=100, rew=-296.10]                         


Epoch #122: test_reward: -120.140000 ± 49.310530, best_reward: -56.050000 ± 16.038345 in #104


Epoch #123: 101it [00:07, 13.42it/s, env_step=12300, len=100, loss=4.993, n/ep=1, n/st=100, rew=-271.50]                          


Epoch #123: test_reward: -80.920000 ± 31.897674, best_reward: -56.050000 ± 16.038345 in #104


Epoch #124: 101it [00:13,  7.31it/s, env_step=12400, len=100, loss=5.323, n/ep=1, n/st=100, rew=-39.20]                         


Epoch #124: test_reward: -75.790000 ± 34.868050, best_reward: -56.050000 ± 16.038345 in #104


Epoch #125: 101it [00:02, 50.38it/s, env_step=12500, len=100, loss=5.462, n/ep=1, n/st=100, rew=-36.70]                          


Epoch #125: test_reward: -103.640000 ± 47.163125, best_reward: -56.050000 ± 16.038345 in #104


Epoch #126: 101it [00:18,  5.59it/s, env_step=12600, len=100, loss=5.565, n/ep=1, n/st=100, rew=-70.30]                         


Epoch #126: test_reward: -56.300000 ± 24.872756, best_reward: -56.050000 ± 16.038345 in #104


Epoch #127: 101it [00:04, 22.53it/s, env_step=12700, len=100, loss=4.853, n/ep=1, n/st=100, rew=-306.30]                          


Epoch #127: test_reward: -94.380000 ± 37.600899, best_reward: -56.050000 ± 16.038345 in #104


Epoch #128: 101it [00:11,  8.83it/s, env_step=12800, len=100, loss=5.574, n/ep=1, n/st=100, rew=-210.70]                         


Epoch #128: test_reward: -112.660000 ± 44.170945, best_reward: -56.050000 ± 16.038345 in #104


Epoch #129: 101it [00:05, 19.53it/s, env_step=12900, len=100, loss=6.039, n/ep=1, n/st=100, rew=-553.90]                          


Epoch #129: test_reward: -84.650000 ± 48.948856, best_reward: -56.050000 ± 16.038345 in #104


Epoch #130: 101it [00:01, 78.64it/s, env_step=13000, len=100, loss=5.097, n/ep=1, n/st=100, rew=-54.90]                          


Epoch #130: test_reward: -78.930000 ± 59.575482, best_reward: -56.050000 ± 16.038345 in #104


Epoch #131: 101it [00:05, 17.25it/s, env_step=13100, len=100, loss=5.342, n/ep=1, n/st=100, rew=-260.00]                         


Epoch #131: test_reward: -138.300000 ± 164.532659, best_reward: -56.050000 ± 16.038345 in #104


Epoch #132: 101it [00:05, 18.49it/s, env_step=13200, len=100, loss=5.430, n/ep=1, n/st=100, rew=-257.70]                         


Epoch #132: test_reward: -66.850000 ± 51.464867, best_reward: -56.050000 ± 16.038345 in #104


Epoch #133: 101it [00:07, 13.02it/s, env_step=13300, len=100, loss=4.714, n/ep=1, n/st=100, rew=-171.40]                         


Epoch #133: test_reward: -72.400000 ± 28.672600, best_reward: -56.050000 ± 16.038345 in #104


Epoch #134: 101it [00:05, 17.35it/s, env_step=13400, len=100, loss=4.461, n/ep=1, n/st=100, rew=-345.90]                         


Epoch #134: test_reward: -67.550000 ± 39.646696, best_reward: -56.050000 ± 16.038345 in #104


Epoch #135: 101it [00:14,  6.91it/s, env_step=13500, len=100, loss=4.337, n/ep=1, n/st=100, rew=-233.50]                         


Epoch #135: test_reward: -74.060000 ± 39.939935, best_reward: -56.050000 ± 16.038345 in #104


Epoch #136: 101it [00:04, 21.78it/s, env_step=13600, len=100, loss=4.563, n/ep=1, n/st=100, rew=-262.30]                         


Epoch #136: test_reward: -111.650000 ± 59.537438, best_reward: -56.050000 ± 16.038345 in #104


Epoch #137: 101it [00:07, 12.73it/s, env_step=13700, len=100, loss=4.404, n/ep=1, n/st=100, rew=-269.30]                          


Epoch #137: test_reward: -89.610000 ± 55.164236, best_reward: -56.050000 ± 16.038345 in #104


Epoch #138: 101it [00:01, 85.06it/s, env_step=13800, len=100, loss=3.712, n/ep=1, n/st=100, rew=-31.30]                          


Epoch #138: test_reward: -52.600000 ± 35.363965, best_reward: -52.600000 ± 35.363965 in #138


Epoch #139: 101it [00:06, 15.73it/s, env_step=13900, len=100, loss=3.603, n/ep=1, n/st=100, rew=-264.70]                         


Epoch #139: test_reward: -43.930000 ± 30.389309, best_reward: -43.930000 ± 30.389309 in #139


Epoch #140: 101it [00:08, 11.75it/s, env_step=14000, len=100, loss=3.988, n/ep=1, n/st=100, rew=-163.00]                         


Epoch #140: test_reward: -52.240000 ± 40.005955, best_reward: -43.930000 ± 30.389309 in #139


Epoch #141: 101it [00:01, 72.35it/s, env_step=14100, len=100, loss=3.948, n/ep=1, n/st=100, rew=-225.00]                          


Epoch #141: test_reward: -54.330000 ± 31.567707, best_reward: -43.930000 ± 30.389309 in #139


Epoch #142: 101it [00:03, 29.70it/s, env_step=14200, len=100, loss=3.498, n/ep=1, n/st=100, rew=-191.80]                         


Epoch #142: test_reward: -64.540000 ± 27.009117, best_reward: -43.930000 ± 30.389309 in #139


Epoch #143: 101it [00:03, 31.84it/s, env_step=14300, len=100, loss=4.695, n/ep=1, n/st=100, rew=-587.70]                         


Epoch #143: test_reward: -69.000000 ± 35.067050, best_reward: -43.930000 ± 30.389309 in #139


Epoch #144: 101it [00:06, 16.36it/s, env_step=14400, len=100, loss=4.614, n/ep=1, n/st=100, rew=-535.60]                          


Epoch #144: test_reward: -61.150000 ± 44.570399, best_reward: -43.930000 ± 30.389309 in #139


Epoch #145: 101it [00:13,  7.54it/s, env_step=14500, len=100, loss=4.596, n/ep=1, n/st=100, rew=-69.10]                         


Epoch #145: test_reward: -49.510000 ± 21.290207, best_reward: -43.930000 ± 30.389309 in #139


Epoch #146: 101it [00:07, 14.16it/s, env_step=14600, len=100, loss=4.492, n/ep=1, n/st=100, rew=-229.00]                          


Epoch #146: test_reward: -87.780000 ± 42.237751, best_reward: -43.930000 ± 30.389309 in #139


Epoch #147: 101it [00:20,  4.81it/s, env_step=14700, len=100, loss=4.954, n/ep=1, n/st=100, rew=-100.80]                         


Epoch #147: test_reward: -75.110000 ± 26.554414, best_reward: -43.930000 ± 30.389309 in #139


Epoch #148: 101it [00:01, 58.61it/s, env_step=14800, len=100, loss=4.236, n/ep=1, n/st=100, rew=-284.80]                          


Epoch #148: test_reward: -71.890000 ± 32.086771, best_reward: -43.930000 ± 30.389309 in #139


Epoch #149: 101it [00:15,  6.52it/s, env_step=14900, len=100, loss=4.432, n/ep=1, n/st=100, rew=-295.80]                          


Epoch #149: test_reward: -85.630000 ± 37.486399, best_reward: -43.930000 ± 30.389309 in #139


Epoch #150: 101it [00:06, 14.78it/s, env_step=15000, len=100, loss=3.751, n/ep=1, n/st=100, rew=-311.20]                         


Epoch #150: test_reward: -71.500000 ± 31.031049, best_reward: -43.930000 ± 30.389309 in #139


Epoch #151: 101it [00:10,  9.32it/s, env_step=15100, len=100, loss=4.272, n/ep=1, n/st=100, rew=-67.30]                         


Epoch #151: test_reward: -65.320000 ± 19.724695, best_reward: -43.930000 ± 30.389309 in #139


Epoch #152: 101it [00:02, 39.07it/s, env_step=15200, len=100, loss=4.225, n/ep=1, n/st=100, rew=-197.90]                         


Epoch #152: test_reward: -51.330000 ± 15.215522, best_reward: -43.930000 ± 30.389309 in #139


Epoch #153: 101it [00:10,  9.43it/s, env_step=15300, len=100, loss=4.088, n/ep=1, n/st=100, rew=-279.20]                         


Epoch #153: test_reward: -53.520000 ± 40.608442, best_reward: -43.930000 ± 30.389309 in #139


Epoch #154: 101it [00:04, 22.98it/s, env_step=15400, len=100, loss=4.244, n/ep=1, n/st=100, rew=-240.90]                          


Epoch #154: test_reward: -56.950000 ± 16.372126, best_reward: -43.930000 ± 30.389309 in #139


Epoch #155: 101it [00:02, 46.66it/s, env_step=15500, len=100, loss=4.350, n/ep=1, n/st=100, rew=-657.90]                         


Epoch #155: test_reward: -74.750000 ± 40.526097, best_reward: -43.930000 ± 30.389309 in #139


Epoch #156: 101it [00:03, 29.13it/s, env_step=15600, len=100, loss=4.479, n/ep=1, n/st=100, rew=-223.50]                         


Epoch #156: test_reward: -74.750000 ± 45.602352, best_reward: -43.930000 ± 30.389309 in #139


Epoch #157: 101it [00:06, 14.46it/s, env_step=15700, len=100, loss=4.522, n/ep=1, n/st=100, rew=-50.20]                         


Epoch #157: test_reward: -68.060000 ± 39.788596, best_reward: -43.930000 ± 30.389309 in #139


Epoch #158: 101it [00:06, 15.70it/s, env_step=15800, len=100, loss=4.243, n/ep=1, n/st=100, rew=-255.60]                         


Epoch #158: test_reward: -54.720000 ± 20.504575, best_reward: -43.930000 ± 30.389309 in #139


Epoch #159: 101it [00:09, 10.78it/s, env_step=15900, len=100, loss=4.520, n/ep=1, n/st=100, rew=-228.00]                          


Epoch #159: test_reward: -62.870000 ± 27.164685, best_reward: -43.930000 ± 30.389309 in #139


Epoch #160: 101it [00:12,  8.01it/s, env_step=16000, len=100, loss=4.827, n/ep=1, n/st=100, rew=-202.80]                         


Epoch #160: test_reward: -45.990000 ± 25.356753, best_reward: -43.930000 ± 30.389309 in #139


Epoch #161: 101it [00:06, 14.80it/s, env_step=16100, len=100, loss=4.032, n/ep=1, n/st=100, rew=-226.00]                          


Epoch #161: test_reward: -54.030000 ± 12.092812, best_reward: -43.930000 ± 30.389309 in #139


Epoch #162: 101it [00:13,  7.74it/s, env_step=16200, len=100, loss=4.944, n/ep=1, n/st=100, rew=-206.60]                         


Epoch #162: test_reward: -64.760000 ± 31.420286, best_reward: -43.930000 ± 30.389309 in #139


Epoch #163: 101it [00:06, 14.94it/s, env_step=16300, len=100, loss=4.253, n/ep=1, n/st=100, rew=-239.30]                         


Epoch #163: test_reward: -63.020000 ± 20.720512, best_reward: -43.930000 ± 30.389309 in #139


Epoch #164: 101it [00:09, 11.18it/s, env_step=16400, len=100, loss=4.702, n/ep=1, n/st=100, rew=-126.60]                         


Epoch #164: test_reward: -58.500000 ± 22.394374, best_reward: -43.930000 ± 30.389309 in #139


Epoch #165: 101it [00:06, 15.93it/s, env_step=16500, len=100, loss=4.161, n/ep=1, n/st=100, rew=-208.80]                          


Epoch #165: test_reward: -72.570000 ± 23.560562, best_reward: -43.930000 ± 30.389309 in #139


Epoch #166: 101it [00:15,  6.67it/s, env_step=16600, len=100, loss=4.477, n/ep=1, n/st=100, rew=-131.40]                         


Epoch #166: test_reward: -52.200000 ± 18.299672, best_reward: -43.930000 ± 30.389309 in #139


Epoch #167: 101it [00:04, 23.10it/s, env_step=16700, len=100, loss=4.024, n/ep=1, n/st=100, rew=-240.60]                         


Epoch #167: test_reward: -65.940000 ± 28.512040, best_reward: -43.930000 ± 30.389309 in #139


Epoch #168: 101it [00:06, 15.02it/s, env_step=16800, len=100, loss=4.111, n/ep=1, n/st=100, rew=-134.80]                         


Epoch #168: test_reward: -83.370000 ± 37.795557, best_reward: -43.930000 ± 30.389309 in #139


Epoch #169: 101it [00:04, 24.10it/s, env_step=16900, len=100, loss=3.949, n/ep=1, n/st=100, rew=-104.90]                          


Epoch #169: test_reward: -79.380000 ± 44.446008, best_reward: -43.930000 ± 30.389309 in #139


Epoch #170: 101it [00:03, 33.16it/s, env_step=17000, len=100, loss=4.119, n/ep=1, n/st=100, rew=-220.50]                         


Epoch #170: test_reward: -83.630000 ± 35.239014, best_reward: -43.930000 ± 30.389309 in #139


Epoch #171: 101it [00:01, 60.67it/s, env_step=17100, len=100, loss=3.890, n/ep=1, n/st=100, rew=-222.50]                          


Epoch #171: test_reward: -50.680000 ± 20.841871, best_reward: -43.930000 ± 30.389309 in #139


Epoch #172: 101it [00:04, 21.88it/s, env_step=17200, len=100, loss=3.773, n/ep=1, n/st=100, rew=-109.80]                         


Epoch #172: test_reward: -56.530000 ± 27.103913, best_reward: -43.930000 ± 30.389309 in #139


Epoch #173: 101it [00:13,  7.68it/s, env_step=17300, len=100, loss=3.407, n/ep=1, n/st=100, rew=-247.40]                         


Epoch #173: test_reward: -65.750000 ± 27.168741, best_reward: -43.930000 ± 30.389309 in #139


Epoch #174: 101it [00:07, 14.19it/s, env_step=17400, len=100, loss=4.137, n/ep=1, n/st=100, rew=-517.70]                          


Epoch #174: test_reward: -58.020000 ± 25.952449, best_reward: -43.930000 ± 30.389309 in #139


Epoch #175: 101it [00:17,  5.81it/s, env_step=17500, len=100, loss=4.077, n/ep=1, n/st=100, rew=-200.10]                         


Epoch #175: test_reward: -50.330000 ± 21.598799, best_reward: -43.930000 ± 30.389309 in #139


Epoch #176: 101it [00:03, 33.55it/s, env_step=17600, len=100, loss=4.699, n/ep=1, n/st=100, rew=-59.20]                          


Epoch #176: test_reward: -54.920000 ± 23.521939, best_reward: -43.930000 ± 30.389309 in #139


Epoch #177: 101it [00:15,  6.39it/s, env_step=17700, len=100, loss=3.892, n/ep=1, n/st=100, rew=-151.20]                         


Epoch #177: test_reward: -76.200000 ± 39.173103, best_reward: -43.930000 ± 30.389309 in #139


Epoch #178: 101it [00:02, 48.28it/s, env_step=17800, len=100, loss=3.896, n/ep=1, n/st=100, rew=-178.00]                          


Epoch #178: test_reward: -61.280000 ± 29.814922, best_reward: -43.930000 ± 30.389309 in #139


Epoch #179: 101it [00:12,  7.85it/s, env_step=17900, len=100, loss=3.568, n/ep=1, n/st=100, rew=-287.40]                         


Epoch #179: test_reward: -63.740000 ± 31.216861, best_reward: -43.930000 ± 30.389309 in #139


Epoch #180: 101it [00:05, 18.62it/s, env_step=18000, len=100, loss=3.738, n/ep=1, n/st=100, rew=-95.90]                         


Epoch #180: test_reward: -74.150000 ± 31.241103, best_reward: -43.930000 ± 30.389309 in #139


Epoch #181: 101it [00:07, 13.10it/s, env_step=18100, len=100, loss=3.719, n/ep=1, n/st=100, rew=-250.90]                          


Epoch #181: test_reward: -74.320000 ± 30.031377, best_reward: -43.930000 ± 30.389309 in #139


Epoch #182: 101it [00:08, 11.42it/s, env_step=18200, len=100, loss=3.772, n/ep=1, n/st=100, rew=-202.70]                         


Epoch #182: test_reward: -67.040000 ± 24.868985, best_reward: -43.930000 ± 30.389309 in #139


Epoch #183: 101it [00:08, 11.35it/s, env_step=18300, len=100, loss=3.833, n/ep=1, n/st=100, rew=-148.90]                          


Epoch #183: test_reward: -93.880000 ± 34.811774, best_reward: -43.930000 ± 30.389309 in #139


Epoch #184: 101it [00:11,  9.14it/s, env_step=18400, len=100, loss=3.493, n/ep=1, n/st=100, rew=-251.20]                         


Epoch #184: test_reward: -63.460000 ± 45.961031, best_reward: -43.930000 ± 30.389309 in #139


Epoch #185: 101it [00:04, 22.42it/s, env_step=18500, len=100, loss=3.843, n/ep=1, n/st=100, rew=-117.90]                         


Epoch #185: test_reward: -77.860000 ± 36.391790, best_reward: -43.930000 ± 30.389309 in #139


Epoch #186: 101it [00:08, 11.93it/s, env_step=18600, len=100, loss=3.360, n/ep=1, n/st=100, rew=-195.70]                          


Epoch #186: test_reward: -67.020000 ± 24.167449, best_reward: -43.930000 ± 30.389309 in #139


Epoch #187: 101it [00:01, 70.42it/s, env_step=18700, len=100, loss=4.104, n/ep=1, n/st=100, rew=-647.50]                          


Epoch #187: test_reward: -66.740000 ± 27.416973, best_reward: -43.930000 ± 30.389309 in #139


Epoch #188: 101it [00:06, 15.63it/s, env_step=18800, len=100, loss=4.101, n/ep=1, n/st=100, rew=-214.40]                         


Epoch #188: test_reward: -49.230000 ± 26.584283, best_reward: -43.930000 ± 30.389309 in #139


Epoch #189: 101it [00:12,  7.89it/s, env_step=18900, len=100, loss=4.062, n/ep=1, n/st=100, rew=-235.50]                         


Epoch #189: test_reward: -66.890000 ± 47.962036, best_reward: -43.930000 ± 30.389309 in #139


Epoch #190: 101it [00:02, 37.39it/s, env_step=19000, len=100, loss=4.783, n/ep=1, n/st=100, rew=-186.40]                          


Epoch #190: test_reward: -32.190000 ± 27.678275, best_reward: -32.190000 ± 27.678275 in #190


Epoch #191: 101it [00:09, 10.89it/s, env_step=19100, len=100, loss=4.318, n/ep=1, n/st=100, rew=-188.20]                         


Epoch #191: test_reward: -54.770000 ± 24.830950, best_reward: -32.190000 ± 27.678275 in #190


Epoch #192: 101it [00:04, 25.09it/s, env_step=19200, len=100, loss=4.363, n/ep=1, n/st=100, rew=-131.20]                         


Epoch #192: test_reward: -52.930000 ± 31.571761, best_reward: -32.190000 ± 27.678275 in #190


Epoch #193: 101it [00:06, 15.16it/s, env_step=19300, len=100, loss=3.747, n/ep=1, n/st=100, rew=-289.70]                          


Epoch #193: test_reward: -55.770000 ± 23.772087, best_reward: -32.190000 ± 27.678275 in #190


Epoch #194: 101it [00:03, 30.31it/s, env_step=19400, len=100, loss=3.554, n/ep=1, n/st=100, rew=-206.50]                          


Epoch #194: test_reward: -40.200000 ± 24.746515, best_reward: -32.190000 ± 27.678275 in #190


Epoch #195: 101it [00:04, 21.54it/s, env_step=19500, len=100, loss=3.483, n/ep=1, n/st=100, rew=-90.10]                          


Epoch #195: test_reward: -42.570000 ± 31.707635, best_reward: -32.190000 ± 27.678275 in #190


Epoch #196: 101it [00:06, 15.29it/s, env_step=19600, len=100, loss=3.441, n/ep=1, n/st=100, rew=-284.10]                         


Epoch #196: test_reward: -53.200000 ± 27.196985, best_reward: -32.190000 ± 27.678275 in #190


Epoch #197: 101it [00:06, 14.70it/s, env_step=19700, len=100, loss=3.121, n/ep=1, n/st=100, rew=-90.40]                          


Epoch #197: test_reward: -57.810000 ± 35.439962, best_reward: -32.190000 ± 27.678275 in #190


Epoch #198: 101it [00:20,  4.99it/s, env_step=19800, len=100, loss=3.422, n/ep=1, n/st=100, rew=-248.70]                         


Epoch #198: test_reward: -52.580000 ± 43.027382, best_reward: -32.190000 ± 27.678275 in #190


Epoch #199: 101it [00:04, 20.44it/s, env_step=19900, len=100, loss=3.870, n/ep=1, n/st=100, rew=-569.20]                         


Epoch #199: test_reward: -46.020000 ± 26.957626, best_reward: -32.190000 ± 27.678275 in #190


Epoch #200: 101it [00:13,  7.29it/s, env_step=20000, len=100, loss=3.775, n/ep=1, n/st=100, rew=-71.80]                         


Epoch #200: test_reward: -30.980000 ± 16.434646, best_reward: -30.980000 ± 16.434646 in #200


Epoch #201: 101it [00:02, 36.48it/s, env_step=20100, len=100, loss=4.050, n/ep=1, n/st=100, rew=-219.70]                          


Epoch #201: test_reward: -53.390000 ± 50.622178, best_reward: -30.980000 ± 16.434646 in #200


Epoch #202: 101it [00:08, 11.53it/s, env_step=20200, len=100, loss=3.940, n/ep=1, n/st=100, rew=-250.40]                          


Epoch #202: test_reward: -67.450000 ± 44.050772, best_reward: -30.980000 ± 16.434646 in #200


Epoch #203: 101it [00:02, 48.53it/s, env_step=20300, len=100, loss=4.188, n/ep=1, n/st=100, rew=-301.80]                          


Epoch #203: test_reward: -43.040000 ± 23.892141, best_reward: -30.980000 ± 16.434646 in #200


Epoch #204: 101it [00:05, 18.49it/s, env_step=20400, len=100, loss=4.332, n/ep=1, n/st=100, rew=-256.50]                         


Epoch #204: test_reward: -70.980000 ± 41.914480, best_reward: -30.980000 ± 16.434646 in #200


Epoch #205: 101it [00:10,  9.87it/s, env_step=20500, len=100, loss=4.242, n/ep=1, n/st=100, rew=-506.80]                         


Epoch #205: test_reward: -39.070000 ± 21.228427, best_reward: -30.980000 ± 16.434646 in #200


Epoch #206: 101it [00:04, 22.96it/s, env_step=20600, len=100, loss=3.992, n/ep=1, n/st=100, rew=-209.50]                         


Epoch #206: test_reward: -42.150000 ± 44.552334, best_reward: -30.980000 ± 16.434646 in #200


Epoch #207: 101it [00:04, 20.64it/s, env_step=20700, len=100, loss=3.999, n/ep=1, n/st=100, rew=-129.10]                         


Epoch #207: test_reward: -43.420000 ± 26.981319, best_reward: -30.980000 ± 16.434646 in #200


Epoch #208: 101it [00:03, 31.99it/s, env_step=20800, len=100, loss=3.470, n/ep=1, n/st=100, rew=-288.70]                         


Epoch #208: test_reward: -36.570000 ± 20.118501, best_reward: -30.980000 ± 16.434646 in #200


Epoch #209: 101it [00:03, 26.40it/s, env_step=20900, len=100, loss=3.791, n/ep=1, n/st=100, rew=-280.40]                         


Epoch #209: test_reward: -65.880000 ± 36.822732, best_reward: -30.980000 ± 16.434646 in #200


Epoch #210: 101it [00:09, 10.85it/s, env_step=21000, len=100, loss=3.949, n/ep=1, n/st=100, rew=-331.00]                         


Epoch #210: test_reward: -51.710000 ± 27.231799, best_reward: -30.980000 ± 16.434646 in #200


Epoch #211: 101it [00:01, 60.09it/s, env_step=21100, len=100, loss=3.990, n/ep=1, n/st=100, rew=-198.00]                          


Epoch #211: test_reward: -57.950000 ± 32.129216, best_reward: -30.980000 ± 16.434646 in #200


Epoch #212: 101it [00:04, 24.04it/s, env_step=21200, len=100, loss=4.352, n/ep=1, n/st=100, rew=-268.50]                         


Epoch #212: test_reward: -67.260000 ± 60.801286, best_reward: -30.980000 ± 16.434646 in #200


Epoch #213: 101it [00:08, 11.59it/s, env_step=21300, len=100, loss=3.925, n/ep=1, n/st=100, rew=-125.10]                         


Epoch #213: test_reward: -60.750000 ± 28.194405, best_reward: -30.980000 ± 16.434646 in #200


Epoch #214: 101it [00:04, 23.46it/s, env_step=21400, len=100, loss=4.085, n/ep=1, n/st=100, rew=-214.70]                         


Epoch #214: test_reward: -61.380000 ± 32.487499, best_reward: -30.980000 ± 16.434646 in #200


Epoch #215: 101it [00:18,  5.46it/s, env_step=21500, len=100, loss=3.632, n/ep=1, n/st=100, rew=-177.70]                         


Epoch #215: test_reward: -72.910000 ± 21.453925, best_reward: -30.980000 ± 16.434646 in #200


Epoch #216: 101it [00:03, 27.48it/s, env_step=21600, len=100, loss=3.095, n/ep=1, n/st=100, rew=-319.30]                         


Epoch #216: test_reward: -53.670000 ± 30.026823, best_reward: -30.980000 ± 16.434646 in #200


Epoch #217: 101it [00:14,  7.11it/s, env_step=21700, len=100, loss=3.683, n/ep=1, n/st=100, rew=-276.60]                         


Epoch #217: test_reward: -54.090000 ± 34.366421, best_reward: -30.980000 ± 16.434646 in #200


Epoch #218: 101it [00:04, 22.42it/s, env_step=21800, len=100, loss=3.831, n/ep=1, n/st=100, rew=-369.70]                         


Epoch #218: test_reward: -50.310000 ± 29.307591, best_reward: -30.980000 ± 16.434646 in #200


Epoch #219: 101it [00:11,  8.64it/s, env_step=21900, len=100, loss=3.784, n/ep=1, n/st=100, rew=-197.90]                         


Epoch #219: test_reward: -80.080000 ± 44.861293, best_reward: -30.980000 ± 16.434646 in #200


Epoch #220: 101it [00:03, 27.63it/s, env_step=22000, len=100, loss=3.514, n/ep=1, n/st=100, rew=-86.10]                         


Epoch #220: test_reward: -45.150000 ± 21.486705, best_reward: -30.980000 ± 16.434646 in #200


Epoch #221: 101it [00:15,  6.56it/s, env_step=22100, len=100, loss=4.207, n/ep=1, n/st=100, rew=-302.30]                         


Epoch #221: test_reward: -57.120000 ± 31.725693, best_reward: -30.980000 ± 16.434646 in #200


Epoch #222: 101it [00:04, 20.79it/s, env_step=22200, len=100, loss=3.746, n/ep=1, n/st=100, rew=-39.70]                         


Epoch #222: test_reward: -35.290000 ± 17.944216, best_reward: -30.980000 ± 16.434646 in #200


Epoch #223: 101it [00:14,  7.15it/s, env_step=22300, len=100, loss=3.635, n/ep=1, n/st=100, rew=-77.80]                         


Epoch #223: test_reward: -58.380000 ± 23.890492, best_reward: -30.980000 ± 16.434646 in #200


Epoch #224: 101it [00:06, 14.54it/s, env_step=22400, len=100, loss=3.029, n/ep=1, n/st=100, rew=-140.50]                          


Epoch #224: test_reward: -49.450000 ± 29.760620, best_reward: -30.980000 ± 16.434646 in #200


Epoch #225: 101it [00:13,  7.23it/s, env_step=22500, len=100, loss=3.161, n/ep=1, n/st=100, rew=-255.30]                         


Epoch #225: test_reward: -53.110000 ± 43.885748, best_reward: -30.980000 ± 16.434646 in #200


Epoch #226: 101it [00:05, 18.46it/s, env_step=22600, len=100, loss=3.293, n/ep=1, n/st=100, rew=-64.20]                          


Epoch #226: test_reward: -25.560000 ± 18.763912, best_reward: -25.560000 ± 18.763912 in #226


Epoch #227: 101it [00:14,  7.20it/s, env_step=22700, len=100, loss=3.130, n/ep=1, n/st=100, rew=-234.30]                         


Epoch #227: test_reward: -78.540000 ± 44.792705, best_reward: -25.560000 ± 18.763912 in #226


Epoch #228: 101it [00:08, 11.86it/s, env_step=22800, len=100, loss=3.221, n/ep=1, n/st=100, rew=-209.30]                         


Epoch #228: test_reward: -58.650000 ± 33.965402, best_reward: -25.560000 ± 18.763912 in #226


Epoch #229: 101it [00:09, 10.50it/s, env_step=22900, len=100, loss=2.900, n/ep=1, n/st=100, rew=-59.30]                          


Epoch #229: test_reward: -37.050000 ± 27.460672, best_reward: -25.560000 ± 18.763912 in #226


Epoch #230: 101it [00:02, 42.76it/s, env_step=23000, len=100, loss=3.006, n/ep=1, n/st=100, rew=-214.20]                          


Epoch #230: test_reward: -42.560000 ± 51.220547, best_reward: -25.560000 ± 18.763912 in #226


Epoch #231: 101it [00:09, 10.14it/s, env_step=23100, len=100, loss=3.176, n/ep=1, n/st=100, rew=-82.60]                         


Epoch #231: test_reward: -55.540000 ± 37.922241, best_reward: -25.560000 ± 18.763912 in #226


Epoch #232: 101it [00:05, 17.98it/s, env_step=23200, len=100, loss=3.338, n/ep=1, n/st=100, rew=-45.50]                         


Epoch #232: test_reward: -35.700000 ± 31.265124, best_reward: -25.560000 ± 18.763912 in #226


Epoch #233: 101it [00:25,  3.89it/s, env_step=23300, len=100, loss=3.730, n/ep=1, n/st=100, rew=-599.60]                         


Epoch #233: test_reward: -17.080000 ± 13.172532, best_reward: -17.080000 ± 13.172532 in #233


Epoch #234: 101it [00:07, 13.18it/s, env_step=23400, len=100, loss=3.711, n/ep=1, n/st=100, rew=-480.60]                          


Epoch #234: test_reward: -44.260000 ± 44.093859, best_reward: -17.080000 ± 13.172532 in #233


Epoch #235: 101it [00:11,  8.49it/s, env_step=23500, len=100, loss=3.824, n/ep=1, n/st=100, rew=-200.40]                         


Epoch #235: test_reward: -22.350000 ± 11.731688, best_reward: -17.080000 ± 13.172532 in #233


Epoch #236: 101it [00:07, 13.58it/s, env_step=23600, len=100, loss=3.645, n/ep=1, n/st=100, rew=-215.30]                          


Epoch #236: test_reward: -28.610000 ± 26.163389, best_reward: -17.080000 ± 13.172532 in #233


Epoch #237: 101it [00:03, 28.70it/s, env_step=23700, len=100, loss=3.265, n/ep=1, n/st=100, rew=-187.40]                         


Epoch #237: test_reward: -45.820000 ± 36.771288, best_reward: -17.080000 ± 13.172532 in #233


Epoch #238: 101it [00:07, 13.98it/s, env_step=23800, len=100, loss=2.988, n/ep=1, n/st=100, rew=-36.70]                          


Epoch #238: test_reward: -75.040000 ± 47.719853, best_reward: -17.080000 ± 13.172532 in #233


Epoch #239: 101it [00:10,  9.99it/s, env_step=23900, len=100, loss=3.248, n/ep=1, n/st=100, rew=-210.10]                         


Epoch #239: test_reward: -63.080000 ± 46.521582, best_reward: -17.080000 ± 13.172532 in #233


Epoch #240: 101it [00:07, 13.74it/s, env_step=24000, len=100, loss=3.128, n/ep=1, n/st=100, rew=-190.20]                         


Epoch #240: test_reward: -56.860000 ± 18.340294, best_reward: -17.080000 ± 13.172532 in #233


Epoch #241: 101it [00:03, 30.84it/s, env_step=24100, len=100, loss=3.408, n/ep=1, n/st=100, rew=-208.70]                         


Epoch #241: test_reward: -80.870000 ± 32.854134, best_reward: -17.080000 ± 13.172532 in #233


Epoch #242: 101it [00:01, 71.86it/s, env_step=24200, len=100, loss=3.281, n/ep=1, n/st=100, rew=-178.60]                          


Epoch #242: test_reward: -108.270000 ± 47.579640, best_reward: -17.080000 ± 13.172532 in #233


Epoch #243: 101it [00:05, 18.54it/s, env_step=24300, len=100, loss=3.462, n/ep=1, n/st=100, rew=-84.00]                         


Epoch #243: test_reward: -106.720000 ± 28.440563, best_reward: -17.080000 ± 13.172532 in #233


Epoch #244: 101it [00:03, 30.46it/s, env_step=24400, len=100, loss=3.318, n/ep=1, n/st=100, rew=-176.90]                          


Epoch #244: test_reward: -91.390000 ± 34.945084, best_reward: -17.080000 ± 13.172532 in #233


Epoch #245: 101it [00:05, 19.47it/s, env_step=24500, len=100, loss=3.911, n/ep=1, n/st=100, rew=-790.00]                          


Epoch #245: test_reward: -69.380000 ± 47.789597, best_reward: -17.080000 ± 13.172532 in #233


Epoch #246: 101it [00:14,  7.12it/s, env_step=24600, len=100, loss=3.613, n/ep=1, n/st=100, rew=-269.60]                         


Epoch #246: test_reward: -52.530000 ± 36.062170, best_reward: -17.080000 ± 13.172532 in #233


Epoch #247: 101it [00:04, 24.15it/s, env_step=24700, len=100, loss=3.897, n/ep=1, n/st=100, rew=-116.20]                         


Epoch #247: test_reward: -64.930000 ± 41.542173, best_reward: -17.080000 ± 13.172532 in #233


Epoch #248: 101it [00:11,  8.51it/s, env_step=24800, len=100, loss=3.520, n/ep=1, n/st=100, rew=-49.50]                         


Epoch #248: test_reward: -83.030000 ± 56.808785, best_reward: -17.080000 ± 13.172532 in #233


Epoch #249: 101it [00:05, 17.21it/s, env_step=24900, len=100, loss=3.148, n/ep=1, n/st=100, rew=-27.00]                         


Epoch #249: test_reward: -83.040000 ± 47.638119, best_reward: -17.080000 ± 13.172532 in #233


Epoch #250: 101it [00:14,  6.87it/s, env_step=25000, len=100, loss=3.182, n/ep=1, n/st=100, rew=-209.70]                         


Epoch #250: test_reward: -70.010000 ± 42.777271, best_reward: -17.080000 ± 13.172532 in #233


Epoch #251: 101it [00:05, 19.13it/s, env_step=25100, len=100, loss=3.833, n/ep=1, n/st=100, rew=-293.80]                         


Epoch #251: test_reward: -130.570000 ± 34.457134, best_reward: -17.080000 ± 13.172532 in #233


Epoch #252: 101it [00:16,  6.26it/s, env_step=25200, len=100, loss=3.217, n/ep=1, n/st=100, rew=-201.70]                         


Epoch #252: test_reward: -113.990000 ± 31.141659, best_reward: -17.080000 ± 13.172532 in #233


Epoch #253: 101it [00:05, 18.44it/s, env_step=25300, len=100, loss=2.973, n/ep=1, n/st=100, rew=-211.90]                         


Epoch #253: test_reward: -102.920000 ± 29.777099, best_reward: -17.080000 ± 13.172532 in #233


Epoch #254: 101it [00:17,  5.77it/s, env_step=25400, len=100, loss=3.069, n/ep=1, n/st=100, rew=-226.80]                         


Epoch #254: test_reward: -121.340000 ± 25.705066, best_reward: -17.080000 ± 13.172532 in #233


Epoch #255: 101it [00:07, 14.14it/s, env_step=25500, len=100, loss=3.100, n/ep=1, n/st=100, rew=-263.60]                         


Epoch #255: test_reward: -135.300000 ± 105.263488, best_reward: -17.080000 ± 13.172532 in #233


Epoch #256: 101it [00:09, 10.87it/s, env_step=25600, len=100, loss=3.039, n/ep=1, n/st=100, rew=-229.00]                         


Epoch #256: test_reward: -52.440000 ± 28.089329, best_reward: -17.080000 ± 13.172532 in #233


Epoch #257: 101it [00:03, 27.19it/s, env_step=25700, len=100, loss=3.105, n/ep=1, n/st=100, rew=-202.60]                         


Epoch #257: test_reward: -104.430000 ± 30.217447, best_reward: -17.080000 ± 13.172532 in #233


Epoch #258: 101it [00:16,  6.27it/s, env_step=25800, len=100, loss=2.810, n/ep=1, n/st=100, rew=-269.10]                         


Epoch #258: test_reward: -102.040000 ± 45.673213, best_reward: -17.080000 ± 13.172532 in #233


Epoch #259: 101it [00:03, 29.16it/s, env_step=25900, len=100, loss=2.662, n/ep=1, n/st=100, rew=-61.10]                         


Epoch #259: test_reward: -101.960000 ± 35.634539, best_reward: -17.080000 ± 13.172532 in #233


Epoch #260: 101it [00:16,  6.12it/s, env_step=26000, len=100, loss=2.579, n/ep=1, n/st=100, rew=-286.70]                          


Epoch #260: test_reward: -69.890000 ± 19.456796, best_reward: -17.080000 ± 13.172532 in #233


Epoch #261: 101it [00:02, 36.72it/s, env_step=26100, len=100, loss=2.840, n/ep=1, n/st=100, rew=-215.80]                         


Epoch #261: test_reward: -77.980000 ± 26.961558, best_reward: -17.080000 ± 13.172532 in #233


Epoch #262: 101it [00:12,  7.82it/s, env_step=26200, len=100, loss=2.673, n/ep=1, n/st=100, rew=-186.80]                         


Epoch #262: test_reward: -56.810000 ± 19.370723, best_reward: -17.080000 ± 13.172532 in #233


Epoch #263: 101it [00:05, 18.28it/s, env_step=26300, len=100, loss=3.008, n/ep=1, n/st=100, rew=-257.30]                         


Epoch #263: test_reward: -64.850000 ± 39.867236, best_reward: -17.080000 ± 13.172532 in #233


Epoch #264: 101it [00:15,  6.39it/s, env_step=26400, len=100, loss=2.497, n/ep=1, n/st=100, rew=-272.30]                         


Epoch #264: test_reward: -95.400000 ± 51.047037, best_reward: -17.080000 ± 13.172532 in #233


Epoch #265: 101it [00:02, 37.04it/s, env_step=26500, len=100, loss=2.429, n/ep=1, n/st=100, rew=-196.40]                         


Epoch #265: test_reward: -52.360000 ± 31.763350, best_reward: -17.080000 ± 13.172532 in #233


Epoch #266: 101it [00:16,  6.29it/s, env_step=26600, len=100, loss=2.522, n/ep=1, n/st=100, rew=-244.70]                         


Epoch #266: test_reward: -29.590000 ± 14.587080, best_reward: -17.080000 ± 13.172532 in #233


Epoch #267: 101it [00:05, 17.98it/s, env_step=26700, len=100, loss=2.774, n/ep=1, n/st=100, rew=-117.60]                         


Epoch #267: test_reward: -91.300000 ± 53.075606, best_reward: -17.080000 ± 13.172532 in #233


Epoch #268: 101it [00:14,  6.83it/s, env_step=26800, len=100, loss=2.315, n/ep=1, n/st=100, rew=-288.70]                         


Epoch #268: test_reward: -95.800000 ± 29.425193, best_reward: -17.080000 ± 13.172532 in #233


Epoch #269: 101it [00:06, 16.75it/s, env_step=26900, len=100, loss=2.270, n/ep=1, n/st=100, rew=-202.20]                         


Epoch #269: test_reward: -62.600000 ± 17.246855, best_reward: -17.080000 ± 13.172532 in #233


Epoch #270: 101it [00:07, 13.31it/s, env_step=27000, len=100, loss=2.358, n/ep=1, n/st=100, rew=-243.70]                          


Epoch #270: test_reward: -77.180000 ± 38.997995, best_reward: -17.080000 ± 13.172532 in #233


Epoch #271: 101it [00:05, 19.02it/s, env_step=27100, len=100, loss=2.450, n/ep=1, n/st=100, rew=-306.90]                         


Epoch #271: test_reward: -80.320000 ± 41.574507, best_reward: -17.080000 ± 13.172532 in #233


Epoch #272: 101it [00:15,  6.70it/s, env_step=27200, len=100, loss=2.287, n/ep=1, n/st=100, rew=-254.10]                         


Epoch #272: test_reward: -69.520000 ± 37.092258, best_reward: -17.080000 ± 13.172532 in #233


Epoch #273: 101it [00:05, 19.74it/s, env_step=27300, len=100, loss=2.664, n/ep=1, n/st=100, rew=-221.50]                         


Epoch #273: test_reward: -75.170000 ± 24.467082, best_reward: -17.080000 ± 13.172532 in #233


Epoch #274: 101it [00:05, 17.37it/s, env_step=27400, len=100, loss=2.451, n/ep=1, n/st=100, rew=-334.90]                         


Epoch #274: test_reward: -84.400000 ± 31.532364, best_reward: -17.080000 ± 13.172532 in #233


Epoch #275: 101it [00:03, 33.08it/s, env_step=27500, len=100, loss=2.397, n/ep=1, n/st=100, rew=-80.00]                          


Epoch #275: test_reward: -89.400000 ± 44.568890, best_reward: -17.080000 ± 13.172532 in #233


Epoch #276: 101it [00:04, 23.17it/s, env_step=27600, len=100, loss=2.733, n/ep=1, n/st=100, rew=-428.20]                         
Epoch #280: 101it [00:01, 71.20it/s, env_step=28000, len=100, loss=2.898, n/ep=1, n/st=100, rew=-231.60]                          


Epoch #280: test_reward: -101.070000 ± 34.364896, best_reward: -17.080000 ± 13.172532 in #233


Epoch #281: 101it [00:11,  9.11it/s, env_step=28100, len=100, loss=2.839, n/ep=1, n/st=100, rew=-174.60]                         


Epoch #281: test_reward: -90.160000 ± 41.724146, best_reward: -17.080000 ± 13.172532 in #233


Epoch #282: 101it [00:05, 16.99it/s, env_step=28200, len=100, loss=3.212, n/ep=1, n/st=100, rew=-155.20]                          


Epoch #282: test_reward: -90.300000 ± 38.826589, best_reward: -17.080000 ± 13.172532 in #233


Epoch #283: 101it [00:03, 30.26it/s, env_step=28300, len=100, loss=2.706, n/ep=1, n/st=100, rew=-206.90]                         


Epoch #283: test_reward: -69.260000 ± 30.491054, best_reward: -17.080000 ± 13.172532 in #233


Epoch #284: 101it [00:02, 50.47it/s, env_step=28400, len=100, loss=3.591, n/ep=1, n/st=100, rew=-739.30]                          


Epoch #284: test_reward: -132.740000 ± 171.668350, best_reward: -17.080000 ± 13.172532 in #233


Epoch #285: 101it [00:06, 16.32it/s, env_step=28500, len=100, loss=3.279, n/ep=1, n/st=100, rew=-284.90]                         


Epoch #285: test_reward: -178.590000 ± 158.372046, best_reward: -17.080000 ± 13.172532 in #233


Epoch #286: 101it [00:14,  7.07it/s, env_step=28600, len=100, loss=3.191, n/ep=1, n/st=100, rew=-197.40]                         


Epoch #286: test_reward: -82.610000 ± 39.466073, best_reward: -17.080000 ± 13.172532 in #233


Epoch #287: 101it [00:06, 15.90it/s, env_step=28700, len=100, loss=3.322, n/ep=1, n/st=100, rew=-162.90]                         


Epoch #287: test_reward: -92.180000 ± 35.420836, best_reward: -17.080000 ± 13.172532 in #233


Epoch #288: 101it [00:08, 11.60it/s, env_step=28800, len=100, loss=3.442, n/ep=1, n/st=100, rew=-233.60]                         


Epoch #288: test_reward: -100.480000 ± 42.537461, best_reward: -17.080000 ± 13.172532 in #233


Epoch #289: 101it [00:05, 16.98it/s, env_step=28900, len=100, loss=3.239, n/ep=1, n/st=100, rew=-126.10]                         


Epoch #289: test_reward: -81.620000 ± 31.394611, best_reward: -17.080000 ± 13.172532 in #233


Epoch #290: 101it [00:04, 22.26it/s, env_step=29000, len=100, loss=3.141, n/ep=1, n/st=100, rew=-255.60]                         


Epoch #290: test_reward: -87.790000 ± 37.763195, best_reward: -17.080000 ± 13.172532 in #233


Epoch #291: 101it [00:04, 20.29it/s, env_step=29100, len=100, loss=3.748, n/ep=1, n/st=100, rew=-628.40]                         


Epoch #291: test_reward: -88.480000 ± 40.832580, best_reward: -17.080000 ± 13.172532 in #233


Epoch #292: 101it [00:10, 10.03it/s, env_step=29200, len=100, loss=3.746, n/ep=1, n/st=100, rew=-41.60]                         


Epoch #292: test_reward: -80.010000 ± 40.521117, best_reward: -17.080000 ± 13.172532 in #233


Epoch #293: 101it [00:07, 12.78it/s, env_step=29300, len=100, loss=3.902, n/ep=1, n/st=100, rew=-297.80]                          


Epoch #293: test_reward: -51.890000 ± 17.469256, best_reward: -17.080000 ± 13.172532 in #233


Epoch #294: 101it [00:06, 15.86it/s, env_step=29400, len=100, loss=3.649, n/ep=1, n/st=100, rew=-163.80]                         


Epoch #294: test_reward: -44.130000 ± 15.377779, best_reward: -17.080000 ± 13.172532 in #233


Epoch #295: 101it [00:05, 18.75it/s, env_step=29500, len=100, loss=3.770, n/ep=1, n/st=100, rew=-189.90]                          


Epoch #295: test_reward: -68.260000 ± 35.221618, best_reward: -17.080000 ± 13.172532 in #233


Epoch #296: 101it [00:09, 10.66it/s, env_step=29600, len=100, loss=3.176, n/ep=1, n/st=100, rew=-254.00]                         


Epoch #296: test_reward: -60.420000 ± 24.646127, best_reward: -17.080000 ± 13.172532 in #233


Epoch #297: 101it [00:02, 35.33it/s, env_step=29700, len=100, loss=3.549, n/ep=1, n/st=100, rew=-279.30]                         


Epoch #297: test_reward: -72.830000 ± 36.938734, best_reward: -17.080000 ± 13.172532 in #233


Epoch #298: 101it [00:08, 11.98it/s, env_step=29800, len=100, loss=3.365, n/ep=1, n/st=100, rew=-193.20]                         


Epoch #298: test_reward: -77.310000 ± 40.684283, best_reward: -17.080000 ± 13.172532 in #233


Epoch #299: 101it [00:05, 19.26it/s, env_step=29900, len=100, loss=3.436, n/ep=1, n/st=100, rew=-163.50]                         


Epoch #299: test_reward: -65.860000 ± 32.233498, best_reward: -17.080000 ± 13.172532 in #233


Epoch #300: 101it [00:15,  6.43it/s, env_step=30000, len=100, loss=3.039, n/ep=1, n/st=100, rew=-83.90]                         


Epoch #300: test_reward: -49.930000 ± 21.091138, best_reward: -17.080000 ± 13.172532 in #233


Epoch #301: 101it [00:03, 27.42it/s, env_step=30100, len=100, loss=3.652, n/ep=1, n/st=100, rew=-162.00]                         


Epoch #301: test_reward: -60.130000 ± 30.119729, best_reward: -17.080000 ± 13.172532 in #233


Epoch #302: 101it [00:03, 30.01it/s, env_step=30200, len=100, loss=3.545, n/ep=1, n/st=100, rew=-235.70]                         


Epoch #302: test_reward: -73.170000 ± 23.380977, best_reward: -17.080000 ± 13.172532 in #233


Epoch #303: 101it [00:01, 64.09it/s, env_step=30300, len=100, loss=3.665, n/ep=1, n/st=100, rew=-231.00]                          


Epoch #303: test_reward: -77.660000 ± 33.542516, best_reward: -17.080000 ± 13.172532 in #233


Epoch #304: 101it [00:03, 26.72it/s, env_step=30400, len=100, loss=3.250, n/ep=1, n/st=100, rew=-214.40]                         


Epoch #304: test_reward: -53.600000 ± 36.958274, best_reward: -17.080000 ± 13.172532 in #233


Epoch #305: 101it [00:02, 46.83it/s, env_step=30500, len=100, loss=3.358, n/ep=1, n/st=100, rew=-272.10]                          


Epoch #305: test_reward: -63.570000 ± 36.667862, best_reward: -17.080000 ± 13.172532 in #233


Epoch #306: 101it [00:03, 27.00it/s, env_step=30600, len=100, loss=3.004, n/ep=1, n/st=100, rew=-165.10]                         


Epoch #306: test_reward: -42.090000 ± 26.320465, best_reward: -17.080000 ± 13.172532 in #233


Epoch #307: 101it [00:10,  9.70it/s, env_step=30700, len=100, loss=2.912, n/ep=1, n/st=100, rew=-151.90]                         


Epoch #307: test_reward: -59.430000 ± 22.636831, best_reward: -17.080000 ± 13.172532 in #233


Epoch #308: 101it [00:06, 15.79it/s, env_step=30800, len=100, loss=3.005, n/ep=1, n/st=100, rew=-184.10]                          


Epoch #308: test_reward: -62.370000 ± 38.243171, best_reward: -17.080000 ± 13.172532 in #233


Epoch #309: 101it [00:05, 19.21it/s, env_step=30900, len=100, loss=2.856, n/ep=1, n/st=100, rew=-289.10]                         


Epoch #309: test_reward: -44.680000 ± 20.033861, best_reward: -17.080000 ± 13.172532 in #233


Epoch #310: 101it [00:07, 12.98it/s, env_step=31000, len=100, loss=2.996, n/ep=1, n/st=100, rew=-163.10]                         


Epoch #310: test_reward: -61.130000 ± 30.951964, best_reward: -17.080000 ± 13.172532 in #233


Epoch #311: 101it [00:17,  5.73it/s, env_step=31100, len=100, loss=3.081, n/ep=1, n/st=100, rew=-188.20]                         


Epoch #311: test_reward: -63.800000 ± 37.722354, best_reward: -17.080000 ± 13.172532 in #233


Epoch #312: 101it [00:01, 65.01it/s, env_step=31200, len=100, loss=3.027, n/ep=1, n/st=100, rew=-266.40]                          


Epoch #312: test_reward: -62.830000 ± 30.728946, best_reward: -17.080000 ± 13.172532 in #233


Epoch #313: 101it [00:03, 27.64it/s, env_step=31300, len=100, loss=3.790, n/ep=1, n/st=100, rew=-730.90]                          


Epoch #313: test_reward: -78.250000 ± 41.707080, best_reward: -17.080000 ± 13.172532 in #233


Epoch #314: 101it [00:04, 21.11it/s, env_step=31400, len=100, loss=3.511, n/ep=1, n/st=100, rew=-183.80]                         


Epoch #314: test_reward: -83.440000 ± 26.092612, best_reward: -17.080000 ± 13.172532 in #233


Epoch #315: 101it [00:07, 13.46it/s, env_step=31500, len=100, loss=3.528, n/ep=1, n/st=100, rew=-207.30]                          


Epoch #315: test_reward: -67.310000 ± 20.052354, best_reward: -17.080000 ± 13.172532 in #233


Epoch #316: 101it [00:06, 15.35it/s, env_step=31600, len=100, loss=3.689, n/ep=1, n/st=100, rew=-125.00]                         


Epoch #316: test_reward: -54.900000 ± 18.297978, best_reward: -17.080000 ± 13.172532 in #233


Epoch #317: 101it [00:05, 18.93it/s, env_step=31700, len=100, loss=3.892, n/ep=1, n/st=100, rew=-93.10]                         


Epoch #317: test_reward: -83.710000 ± 44.124765, best_reward: -17.080000 ± 13.172532 in #233


Epoch #318: 101it [00:09, 10.19it/s, env_step=31800, len=100, loss=3.461, n/ep=1, n/st=100, rew=-232.60]                         


Epoch #318: test_reward: -78.800000 ± 35.554972, best_reward: -17.080000 ± 13.172532 in #233


Epoch #319: 101it [00:02, 43.07it/s, env_step=31900, len=100, loss=3.480, n/ep=1, n/st=100, rew=-98.80]                         


Epoch #319: test_reward: -87.900000 ± 31.804968, best_reward: -17.080000 ± 13.172532 in #233


Epoch #320: 101it [00:07, 12.93it/s, env_step=32000, len=100, loss=3.282, n/ep=1, n/st=100, rew=-88.60]                         


Epoch #320: test_reward: -63.060000 ± 28.727624, best_reward: -17.080000 ± 13.172532 in #233


Epoch #321: 101it [00:07, 13.10it/s, env_step=32100, len=100, loss=3.348, n/ep=1, n/st=100, rew=-64.70]                         


Epoch #321: test_reward: -57.180000 ± 27.880918, best_reward: -17.080000 ± 13.172532 in #233


Epoch #322: 101it [00:01, 56.29it/s, env_step=32200, len=100, loss=3.575, n/ep=1, n/st=100, rew=-297.40]                          


Epoch #322: test_reward: -51.010000 ± 22.769122, best_reward: -17.080000 ± 13.172532 in #233


Epoch #323: 101it [00:01, 57.99it/s, env_step=32300, len=100, loss=3.241, n/ep=1, n/st=100, rew=-155.20]                          


Epoch #323: test_reward: -62.060000 ± 33.490243, best_reward: -17.080000 ± 13.172532 in #233


Epoch #324: 101it [00:03, 32.28it/s, env_step=32400, len=100, loss=3.266, n/ep=1, n/st=100, rew=-140.60]                         


Epoch #324: test_reward: -63.720000 ± 26.693662, best_reward: -17.080000 ± 13.172532 in #233


Epoch #325: 101it [00:03, 33.40it/s, env_step=32500, len=100, loss=3.272, n/ep=1, n/st=100, rew=-142.70]                         


Epoch #325: test_reward: -74.060000 ± 34.364202, best_reward: -17.080000 ± 13.172532 in #233


Epoch #326: 101it [00:06, 14.64it/s, env_step=32600, len=100, loss=3.100, n/ep=1, n/st=100, rew=-233.70]                         


Epoch #326: test_reward: -87.110000 ± 43.472415, best_reward: -17.080000 ± 13.172532 in #233


Epoch #327: 101it [00:03, 27.97it/s, env_step=32700, len=100, loss=3.137, n/ep=1, n/st=100, rew=-303.30]                         


Epoch #327: test_reward: -84.430000 ± 37.145122, best_reward: -17.080000 ± 13.172532 in #233


Epoch #328: 101it [00:02, 47.87it/s, env_step=32800, len=100, loss=3.012, n/ep=1, n/st=100, rew=-200.80]                          


Epoch #328: test_reward: -64.770000 ± 29.642809, best_reward: -17.080000 ± 13.172532 in #233


Epoch #329: 101it [00:10,  9.77it/s, env_step=32900, len=100, loss=2.745, n/ep=1, n/st=100, rew=-192.70]                         


Epoch #329: test_reward: -76.510000 ± 38.855462, best_reward: -17.080000 ± 13.172532 in #233


Epoch #330: 101it [00:01, 74.18it/s, env_step=33000, len=100, loss=3.056, n/ep=1, n/st=100, rew=-68.60]                          


Epoch #330: test_reward: -75.140000 ± 35.136995, best_reward: -17.080000 ± 13.172532 in #233


Epoch #331: 101it [00:02, 36.89it/s, env_step=33100, len=100, loss=3.391, n/ep=1, n/st=100, rew=-189.50]                         


Epoch #331: test_reward: -76.410000 ± 26.105611, best_reward: -17.080000 ± 13.172532 in #233


Epoch #332: 101it [00:12,  8.09it/s, env_step=33200, len=100, loss=3.088, n/ep=1, n/st=100, rew=-275.70]                          


Epoch #332: test_reward: -75.840000 ± 39.803045, best_reward: -17.080000 ± 13.172532 in #233


Epoch #333: 101it [00:01, 80.28it/s, env_step=33300, len=100, loss=2.744, n/ep=1, n/st=100, rew=-215.50]                          


Epoch #333: test_reward: -106.570000 ± 34.755030, best_reward: -17.080000 ± 13.172532 in #233


Epoch #334: 101it [00:05, 18.68it/s, env_step=33400, len=100, loss=2.752, n/ep=1, n/st=100, rew=-243.40]                         


Epoch #334: test_reward: -68.330000 ± 37.976468, best_reward: -17.080000 ± 13.172532 in #233


Epoch #335: 101it [00:15,  6.67it/s, env_step=33500, len=100, loss=2.632, n/ep=1, n/st=100, rew=-293.50]                         


Epoch #335: test_reward: -73.060000 ± 28.582099, best_reward: -17.080000 ± 13.172532 in #233


Epoch #336: 101it [00:02, 50.33it/s, env_step=33600, len=100, loss=2.664, n/ep=1, n/st=100, rew=-220.60]                         


Epoch #336: test_reward: -55.240000 ± 11.631870, best_reward: -17.080000 ± 13.172532 in #233


Epoch #337: 101it [00:09, 11.05it/s, env_step=33700, len=100, loss=2.529, n/ep=1, n/st=100, rew=-109.30]                          


Epoch #337: test_reward: -53.460000 ± 20.489763, best_reward: -17.080000 ± 13.172532 in #233


Epoch #338: 101it [00:01, 70.95it/s, env_step=33800, len=100, loss=2.740, n/ep=1, n/st=100, rew=-221.70]                          


Epoch #338: test_reward: -73.700000 ± 25.046637, best_reward: -17.080000 ± 13.172532 in #233


Epoch #339: 101it [00:03, 26.42it/s, env_step=33900, len=100, loss=2.517, n/ep=1, n/st=100, rew=-42.20]                         


Epoch #339: test_reward: -75.600000 ± 30.966111, best_reward: -17.080000 ± 13.172532 in #233


Epoch #340: 101it [00:12,  8.18it/s, env_step=34000, len=100, loss=2.934, n/ep=1, n/st=100, rew=-716.50]                         


Epoch #340: test_reward: -67.210000 ± 25.772483, best_reward: -17.080000 ± 13.172532 in #233


Epoch #341: 101it [00:02, 36.48it/s, env_step=34100, len=100, loss=2.997, n/ep=1, n/st=100, rew=-33.10]                          


Epoch #341: test_reward: -69.970000 ± 39.747605, best_reward: -17.080000 ± 13.172532 in #233


Epoch #342: 101it [00:02, 37.57it/s, env_step=34200, len=100, loss=2.833, n/ep=1, n/st=100, rew=-101.90]                          


Epoch #342: test_reward: -74.420000 ± 35.043567, best_reward: -17.080000 ± 13.172532 in #233


Epoch #343: 101it [00:08, 12.32it/s, env_step=34300, len=100, loss=2.976, n/ep=1, n/st=100, rew=-389.00]                         


Epoch #343: test_reward: -71.520000 ± 28.933745, best_reward: -17.080000 ± 13.172532 in #233


Epoch #344: 101it [00:01, 53.37it/s, env_step=34400, len=100, loss=2.669, n/ep=1, n/st=100, rew=-68.10]                         


Epoch #344: test_reward: -105.060000 ± 93.180837, best_reward: -17.080000 ± 13.172532 in #233


Epoch #345: 101it [00:05, 17.38it/s, env_step=34500, len=100, loss=2.797, n/ep=1, n/st=100, rew=-276.80]                          


Epoch #345: test_reward: -87.460000 ± 32.199727, best_reward: -17.080000 ± 13.172532 in #233


Epoch #346: 101it [00:03, 26.80it/s, env_step=34600, len=100, loss=2.809, n/ep=1, n/st=100, rew=-222.50]                          


Epoch #346: test_reward: -94.300000 ± 48.219394, best_reward: -17.080000 ± 13.172532 in #233


Epoch #347: 101it [00:01, 70.19it/s, env_step=34700, len=100, loss=2.663, n/ep=1, n/st=100, rew=-123.90]                          


Epoch #347: test_reward: -70.190000 ± 24.767053, best_reward: -17.080000 ± 13.172532 in #233


Epoch #348: 101it [00:05, 18.76it/s, env_step=34800, len=100, loss=2.637, n/ep=1, n/st=100, rew=-280.10]                          


Epoch #348: test_reward: -78.080000 ± 33.795467, best_reward: -17.080000 ± 13.172532 in #233


Epoch #349: 101it [00:01, 75.40it/s, env_step=34900, len=100, loss=2.493, n/ep=1, n/st=100, rew=-114.80]                          


Epoch #349: test_reward: -96.630000 ± 49.099125, best_reward: -17.080000 ± 13.172532 in #233


Epoch #350: 101it [00:03, 25.44it/s, env_step=35000, len=100, loss=2.373, n/ep=1, n/st=100, rew=-230.00]                          


Epoch #350: test_reward: -119.210000 ± 42.833432, best_reward: -17.080000 ± 13.172532 in #233


Epoch #351: 101it [00:05, 18.67it/s, env_step=35100, len=100, loss=2.896, n/ep=1, n/st=100, rew=-209.20]                         


Epoch #351: test_reward: -76.100000 ± 21.334010, best_reward: -17.080000 ± 13.172532 in #233


Epoch #352: 101it [00:07, 14.26it/s, env_step=35200, len=100, loss=3.284, n/ep=1, n/st=100, rew=-173.10]                         


Epoch #352: test_reward: -87.720000 ± 37.457864, best_reward: -17.080000 ± 13.172532 in #233


Epoch #353: 101it [00:01, 82.24it/s, env_step=35300, len=100, loss=3.208, n/ep=1, n/st=100, rew=-604.50]                          


Epoch #353: test_reward: -61.830000 ± 34.577277, best_reward: -17.080000 ± 13.172532 in #233


Epoch #354: 101it [00:13,  7.74it/s, env_step=35400, len=100, loss=3.142, n/ep=1, n/st=100, rew=-79.30]                         


Epoch #354: test_reward: -70.920000 ± 33.529444, best_reward: -17.080000 ± 13.172532 in #233


Epoch #355: 101it [00:03, 31.39it/s, env_step=35500, len=100, loss=2.743, n/ep=1, n/st=100, rew=-232.10]                         


Epoch #355: test_reward: -79.630000 ± 34.036541, best_reward: -17.080000 ± 13.172532 in #233


Epoch #356: 101it [00:02, 44.42it/s, env_step=35600, len=100, loss=2.763, n/ep=1, n/st=100, rew=-198.70]                          


Epoch #356: test_reward: -83.060000 ± 42.122991, best_reward: -17.080000 ± 13.172532 in #233


Epoch #357: 101it [00:07, 13.39it/s, env_step=35700, len=100, loss=2.617, n/ep=1, n/st=100, rew=-238.10]                         


Epoch #357: test_reward: -86.560000 ± 37.372107, best_reward: -17.080000 ± 13.172532 in #233


Epoch #358: 101it [00:04, 25.17it/s, env_step=35800, len=100, loss=2.611, n/ep=1, n/st=100, rew=-195.30]                         


Epoch #358: test_reward: -68.730000 ± 33.169988, best_reward: -17.080000 ± 13.172532 in #233


Epoch #359: 101it [00:02, 41.48it/s, env_step=35900, len=100, loss=3.005, n/ep=1, n/st=100, rew=-125.80]                         


Epoch #359: test_reward: -61.830000 ± 24.711497, best_reward: -17.080000 ± 13.172532 in #233


Epoch #360: 101it [00:08, 12.10it/s, env_step=36000, len=100, loss=2.990, n/ep=1, n/st=100, rew=-211.70]                          


Epoch #360: test_reward: -57.080000 ± 16.947259, best_reward: -17.080000 ± 13.172532 in #233


Epoch #361: 101it [00:01, 56.54it/s, env_step=36100, len=100, loss=3.133, n/ep=1, n/st=100, rew=-90.80]                          


Epoch #361: test_reward: -92.010000 ± 36.707941, best_reward: -17.080000 ± 13.172532 in #233


Epoch #362: 101it [00:07, 13.41it/s, env_step=36200, len=100, loss=3.155, n/ep=1, n/st=100, rew=-594.80]                          


Epoch #362: test_reward: -84.420000 ± 37.868108, best_reward: -17.080000 ± 13.172532 in #233


Epoch #363: 101it [00:01, 79.12it/s, env_step=36300, len=100, loss=3.164, n/ep=1, n/st=100, rew=-302.60]                          


Epoch #363: test_reward: -74.010000 ± 32.759531, best_reward: -17.080000 ± 13.172532 in #233


Epoch #364: 101it [00:03, 26.17it/s, env_step=36400, len=100, loss=3.049, n/ep=1, n/st=100, rew=-226.50]                          


Epoch #364: test_reward: -52.480000 ± 12.178325, best_reward: -17.080000 ± 13.172532 in #233


Epoch #365: 101it [00:14,  6.75it/s, env_step=36500, len=100, loss=2.654, n/ep=1, n/st=100, rew=-218.30]                          


Epoch #365: test_reward: -101.070000 ± 35.497072, best_reward: -17.080000 ± 13.172532 in #233


Epoch #366: 101it [00:05, 18.06it/s, env_step=36600, len=100, loss=2.831, n/ep=1, n/st=100, rew=-139.40]                         


Epoch #366: test_reward: -55.630000 ± 23.600044, best_reward: -17.080000 ± 13.172532 in #233


Epoch #367: 101it [00:09, 11.07it/s, env_step=36700, len=100, loss=2.964, n/ep=1, n/st=100, rew=-155.30]                         


Epoch #367: test_reward: -79.180000 ± 38.432741, best_reward: -17.080000 ± 13.172532 in #233


Epoch #368: 101it [00:08, 12.59it/s, env_step=36800, len=100, loss=2.536, n/ep=1, n/st=100, rew=-108.20]                         


Epoch #368: test_reward: -85.880000 ± 42.256971, best_reward: -17.080000 ± 13.172532 in #233


Epoch #369: 101it [00:05, 19.52it/s, env_step=36900, len=100, loss=2.421, n/ep=1, n/st=100, rew=-271.60]                          


Epoch #369: test_reward: -84.230000 ± 32.688776, best_reward: -17.080000 ± 13.172532 in #233


Epoch #370: 101it [00:03, 26.87it/s, env_step=37000, len=100, loss=2.912, n/ep=1, n/st=100, rew=-151.20]                          


Epoch #370: test_reward: -71.300000 ± 28.007428, best_reward: -17.080000 ± 13.172532 in #233


Epoch #371: 101it [00:03, 28.65it/s, env_step=37100, len=100, loss=2.679, n/ep=1, n/st=100, rew=-236.60]                          


Epoch #371: test_reward: -68.520000 ± 35.310474, best_reward: -17.080000 ± 13.172532 in #233


Epoch #372: 101it [00:05, 18.58it/s, env_step=37200, len=100, loss=2.867, n/ep=1, n/st=100, rew=-233.70]                         


Epoch #372: test_reward: -78.750000 ± 28.760711, best_reward: -17.080000 ± 13.172532 in #233


Epoch #373: 101it [00:13,  7.53it/s, env_step=37300, len=100, loss=2.674, n/ep=1, n/st=100, rew=-212.50]                         


Epoch #373: test_reward: -75.950000 ± 35.684710, best_reward: -17.080000 ± 13.172532 in #233


Epoch #374: 101it [00:03, 27.83it/s, env_step=37400, len=100, loss=2.873, n/ep=1, n/st=100, rew=-181.00]                          


Epoch #374: test_reward: -88.560000 ± 35.542656, best_reward: -17.080000 ± 13.172532 in #233


Epoch #375: 101it [00:09, 10.64it/s, env_step=37500, len=100, loss=2.695, n/ep=1, n/st=100, rew=-42.70]                         


Epoch #375: test_reward: -81.420000 ± 26.231500, best_reward: -17.080000 ± 13.172532 in #233


Epoch #376: 101it [00:01, 81.28it/s, env_step=37600, len=100, loss=2.484, n/ep=1, n/st=100, rew=-241.80]                          


Epoch #376: test_reward: -78.230000 ± 33.261270, best_reward: -17.080000 ± 13.172532 in #233


Epoch #377: 101it [00:03, 27.73it/s, env_step=37700, len=100, loss=2.591, n/ep=1, n/st=100, rew=-222.40]                         


Epoch #377: test_reward: -39.490000 ± 13.083535, best_reward: -17.080000 ± 13.172532 in #233


Epoch #378: 101it [00:18,  5.38it/s, env_step=37800, len=100, loss=2.376, n/ep=1, n/st=100, rew=-276.40]                         


Epoch #378: test_reward: -77.460000 ± 45.407581, best_reward: -17.080000 ± 13.172532 in #233


Epoch #379: 101it [00:02, 48.45it/s, env_step=37900, len=100, loss=2.475, n/ep=1, n/st=100, rew=-74.10]                          


Epoch #379: test_reward: -59.850000 ± 22.147743, best_reward: -17.080000 ± 13.172532 in #233


Epoch #380: 101it [00:06, 16.62it/s, env_step=38000, len=100, loss=2.265, n/ep=1, n/st=100, rew=-259.00]                         


Epoch #380: test_reward: -67.030000 ± 28.325291, best_reward: -17.080000 ± 13.172532 in #233


Epoch #381: 101it [00:01, 82.34it/s, env_step=38100, len=100, loss=2.803, n/ep=1, n/st=100, rew=-326.40]                          


Epoch #381: test_reward: -48.340000 ± 10.719627, best_reward: -17.080000 ± 13.172532 in #233


Epoch #382: 101it [00:06, 16.30it/s, env_step=38200, len=100, loss=2.405, n/ep=1, n/st=100, rew=-173.60]                          


Epoch #382: test_reward: -56.470000 ± 23.446836, best_reward: -17.080000 ± 13.172532 in #233


Epoch #383: 101it [00:09, 11.14it/s, env_step=38300, len=100, loss=2.753, n/ep=1, n/st=100, rew=-182.20]                         


Epoch #383: test_reward: -56.640000 ± 22.889701, best_reward: -17.080000 ± 13.172532 in #233


Epoch #384: 101it [00:03, 27.17it/s, env_step=38400, len=100, loss=2.645, n/ep=1, n/st=100, rew=-323.40]                         


Epoch #384: test_reward: -60.000000 ± 26.321474, best_reward: -17.080000 ± 13.172532 in #233


Epoch #385: 101it [00:02, 38.56it/s, env_step=38500, len=100, loss=2.340, n/ep=1, n/st=100, rew=-202.20]                          


Epoch #385: test_reward: -41.830000 ± 16.638452, best_reward: -17.080000 ± 13.172532 in #233


Epoch #386: 101it [00:05, 18.40it/s, env_step=38600, len=100, loss=2.596, n/ep=1, n/st=100, rew=-220.60]                         


Epoch #386: test_reward: -57.190000 ± 36.513243, best_reward: -17.080000 ± 13.172532 in #233


Epoch #387: 101it [00:05, 19.59it/s, env_step=38700, len=100, loss=2.505, n/ep=1, n/st=100, rew=-185.20]                          


Epoch #387: test_reward: -38.470000 ± 23.753991, best_reward: -17.080000 ± 13.172532 in #233


Epoch #388: 101it [00:09, 10.71it/s, env_step=38800, len=100, loss=2.398, n/ep=1, n/st=100, rew=-168.00]                          


Epoch #388: test_reward: -33.740000 ± 18.842250, best_reward: -17.080000 ± 13.172532 in #233


Epoch #389: 101it [00:03, 26.23it/s, env_step=38900, len=100, loss=2.183, n/ep=1, n/st=100, rew=-167.90]                         


Epoch #389: test_reward: -55.280000 ± 27.939177, best_reward: -17.080000 ± 13.172532 in #233


Epoch #390: 101it [00:03, 29.22it/s, env_step=39000, len=100, loss=2.264, n/ep=1, n/st=100, rew=-190.40]                          


Epoch #390: test_reward: -40.570000 ± 19.709848, best_reward: -17.080000 ± 13.172532 in #233


Epoch #391: 101it [00:05, 17.81it/s, env_step=39100, len=100, loss=2.146, n/ep=1, n/st=100, rew=-90.60]                          


Epoch #391: test_reward: -53.640000 ± 29.497227, best_reward: -17.080000 ± 13.172532 in #233


Epoch #392: 101it [00:06, 14.73it/s, env_step=39200, len=100, loss=2.251, n/ep=1, n/st=100, rew=-194.00]                          


Epoch #392: test_reward: -42.510000 ± 19.518527, best_reward: -17.080000 ± 13.172532 in #233


Epoch #393: 101it [00:10,  9.24it/s, env_step=39300, len=100, loss=1.764, n/ep=1, n/st=100, rew=-199.20]                         


Epoch #393: test_reward: -51.590000 ± 29.194193, best_reward: -17.080000 ± 13.172532 in #233


Epoch #394: 101it [00:02, 38.78it/s, env_step=39400, len=100, loss=1.870, n/ep=1, n/st=100, rew=-216.50]                          


Epoch #394: test_reward: -44.350000 ± 21.928212, best_reward: -17.080000 ± 13.172532 in #233


Epoch #395: 101it [00:12,  8.41it/s, env_step=39500, len=100, loss=1.858, n/ep=1, n/st=100, rew=-63.60]                          


Epoch #395: test_reward: -48.610000 ± 28.268373, best_reward: -17.080000 ± 13.172532 in #233


Epoch #396: 101it [00:01, 89.16it/s, env_step=39600, len=100, loss=1.885, n/ep=1, n/st=100, rew=-128.50]                          


Epoch #396: test_reward: -37.450000 ± 31.351053, best_reward: -17.080000 ± 13.172532 in #233


Epoch #397: 101it [00:03, 25.77it/s, env_step=39700, len=100, loss=1.766, n/ep=1, n/st=100, rew=-153.40]                          


Epoch #397: test_reward: -42.400000 ± 29.784325, best_reward: -17.080000 ± 13.172532 in #233


Epoch #398: 101it [00:09, 10.22it/s, env_step=39800, len=100, loss=1.607, n/ep=1, n/st=100, rew=-36.20]                          


Epoch #398: test_reward: -54.670000 ± 27.711191, best_reward: -17.080000 ± 13.172532 in #233


Epoch #399: 101it [00:02, 37.17it/s, env_step=39900, len=100, loss=1.669, n/ep=1, n/st=100, rew=-176.80]                         


Epoch #399: test_reward: -56.790000 ± 32.770305, best_reward: -17.080000 ± 13.172532 in #233


Epoch #400: 101it [00:07, 13.42it/s, env_step=40000, len=100, loss=1.994, n/ep=1, n/st=100, rew=-180.40]                         


Epoch #400: test_reward: -43.510000 ± 38.411963, best_reward: -17.080000 ± 13.172532 in #233


Epoch #401: 101it [00:01, 72.39it/s, env_step=40100, len=100, loss=1.987, n/ep=1, n/st=100, rew=-59.30]                          


Epoch #401: test_reward: -43.080000 ± 24.876005, best_reward: -17.080000 ± 13.172532 in #233


Epoch #402: 101it [00:08, 12.61it/s, env_step=40200, len=100, loss=1.920, n/ep=1, n/st=100, rew=-38.90]                          


Epoch #402: test_reward: -47.030000 ± 27.618076, best_reward: -17.080000 ± 13.172532 in #233


Epoch #403: 101it [00:04, 21.12it/s, env_step=40300, len=100, loss=1.696, n/ep=1, n/st=100, rew=-168.10]                         


Epoch #403: test_reward: -22.040000 ± 16.695101, best_reward: -17.080000 ± 13.172532 in #233


Epoch #404: 101it [00:02, 47.12it/s, env_step=40400, len=100, loss=1.579, n/ep=1, n/st=100, rew=-292.00]                          


Epoch #404: test_reward: -34.280000 ± 18.818119, best_reward: -17.080000 ± 13.172532 in #233


Epoch #405: 101it [00:03, 25.92it/s, env_step=40500, len=100, loss=1.666, n/ep=1, n/st=100, rew=-31.40]                          


Epoch #405: test_reward: -42.970000 ± 20.080640, best_reward: -17.080000 ± 13.172532 in #233


Epoch #406: 101it [00:06, 16.01it/s, env_step=40600, len=100, loss=1.513, n/ep=1, n/st=100, rew=-220.50]                          


Epoch #406: test_reward: -54.000000 ± 41.381469, best_reward: -17.080000 ± 13.172532 in #233


Epoch #407: 101it [00:04, 20.48it/s, env_step=40700, len=100, loss=1.688, n/ep=1, n/st=100, rew=-182.20]                          


Epoch #407: test_reward: -32.970000 ± 21.860286, best_reward: -17.080000 ± 13.172532 in #233


Epoch #408: 101it [00:09, 10.50it/s, env_step=40800, len=100, loss=1.624, n/ep=1, n/st=100, rew=-39.50]                         


Epoch #408: test_reward: -40.470000 ± 18.907937, best_reward: -17.080000 ± 13.172532 in #233


Epoch #409: 101it [00:03, 27.48it/s, env_step=40900, len=100, loss=1.462, n/ep=1, n/st=100, rew=-36.60]                          


Epoch #409: test_reward: -39.430000 ± 18.783453, best_reward: -17.080000 ± 13.172532 in #233


Epoch #410: 101it [00:02, 37.51it/s, env_step=41000, len=100, loss=1.426, n/ep=1, n/st=100, rew=-178.60]                         


Epoch #410: test_reward: -53.910000 ± 18.581197, best_reward: -17.080000 ± 13.172532 in #233


Epoch #411: 101it [00:02, 33.86it/s, env_step=41100, len=100, loss=1.822, n/ep=1, n/st=100, rew=-377.70]                         


Epoch #411: test_reward: -43.150000 ± 15.761298, best_reward: -17.080000 ± 13.172532 in #233


Epoch #412: 101it [00:06, 14.45it/s, env_step=41200, len=100, loss=1.455, n/ep=1, n/st=100, rew=-194.90]                          


Epoch #412: test_reward: -55.590000 ± 6.932453, best_reward: -17.080000 ± 13.172532 in #233


Epoch #413: 101it [00:16,  6.19it/s, env_step=41300, len=100, loss=1.341, n/ep=1, n/st=100, rew=-194.10]                         


Epoch #413: test_reward: -71.460000 ± 30.137392, best_reward: -17.080000 ± 13.172532 in #233


Epoch #414: 101it [00:01, 61.41it/s, env_step=41400, len=100, loss=2.231, n/ep=1, n/st=100, rew=-715.70]                          


Epoch #414: test_reward: -70.740000 ± 26.680487, best_reward: -17.080000 ± 13.172532 in #233


Epoch #415: 101it [00:07, 13.33it/s, env_step=41500, len=100, loss=1.962, n/ep=1, n/st=100, rew=-343.50]                          


Epoch #415: test_reward: -58.580000 ± 22.456171, best_reward: -17.080000 ± 13.172532 in #233


Epoch #416: 101it [00:01, 78.54it/s, env_step=41600, len=100, loss=2.224, n/ep=1, n/st=100, rew=-43.30]                          


Epoch #416: test_reward: -52.210000 ± 5.960277, best_reward: -17.080000 ± 13.172532 in #233


Epoch #417: 101it [00:05, 17.37it/s, env_step=41700, len=100, loss=2.081, n/ep=1, n/st=100, rew=-84.20]                          


Epoch #417: test_reward: -60.040000 ± 21.872046, best_reward: -17.080000 ± 13.172532 in #233


Epoch #418: 101it [00:08, 11.35it/s, env_step=41800, len=100, loss=1.788, n/ep=1, n/st=100, rew=-147.70]                         


Epoch #418: test_reward: -49.330000 ± 13.174297, best_reward: -17.080000 ± 13.172532 in #233


Epoch #419: 101it [00:04, 23.36it/s, env_step=41900, len=100, loss=1.836, n/ep=1, n/st=100, rew=-205.00]                         


Epoch #419: test_reward: -62.920000 ± 21.856340, best_reward: -17.080000 ± 13.172532 in #233


Epoch #420: 101it [00:12,  8.21it/s, env_step=42000, len=100, loss=1.872, n/ep=1, n/st=100, rew=-199.90]                          


Epoch #420: test_reward: -52.250000 ± 6.349213, best_reward: -17.080000 ± 13.172532 in #233


Epoch #421: 101it [00:01, 86.01it/s, env_step=42100, len=100, loss=2.050, n/ep=1, n/st=100, rew=-218.80]                          


Epoch #421: test_reward: -62.710000 ± 20.323309, best_reward: -17.080000 ± 13.172532 in #233


Epoch #422: 101it [00:04, 20.80it/s, env_step=42200, len=100, loss=1.652, n/ep=1, n/st=100, rew=-256.60]                          


Epoch #422: test_reward: -56.570000 ± 20.559380, best_reward: -17.080000 ± 13.172532 in #233


Epoch #423: 101it [00:09, 10.71it/s, env_step=42300, len=100, loss=1.946, n/ep=1, n/st=100, rew=-241.80]                         


Epoch #423: test_reward: -63.350000 ± 21.651478, best_reward: -17.080000 ± 13.172532 in #233


Epoch #424: 101it [00:03, 31.53it/s, env_step=42400, len=100, loss=1.965, n/ep=1, n/st=100, rew=-177.70]                         


Epoch #424: test_reward: -58.090000 ± 17.221757, best_reward: -17.080000 ± 13.172532 in #233


Epoch #425: 101it [00:02, 37.75it/s, env_step=42500, len=100, loss=2.191, n/ep=1, n/st=100, rew=-169.10]                          


Epoch #425: test_reward: -66.030000 ± 23.243969, best_reward: -17.080000 ± 13.172532 in #233


Epoch #426: 101it [00:11,  8.70it/s, env_step=42600, len=100, loss=1.820, n/ep=1, n/st=100, rew=-241.00]                         


Epoch #426: test_reward: -53.490000 ± 6.343256, best_reward: -17.080000 ± 13.172532 in #233


Epoch #427: 101it [00:05, 18.34it/s, env_step=42700, len=100, loss=1.666, n/ep=1, n/st=100, rew=-221.70]                         


Epoch #427: test_reward: -44.570000 ± 15.427446, best_reward: -17.080000 ± 13.172532 in #233


Epoch #428: 101it [00:12,  8.12it/s, env_step=42800, len=100, loss=1.814, n/ep=1, n/st=100, rew=-230.40]                         


Epoch #428: test_reward: -52.030000 ± 23.713332, best_reward: -17.080000 ± 13.172532 in #233


Epoch #429: 101it [00:04, 21.00it/s, env_step=42900, len=100, loss=1.639, n/ep=1, n/st=100, rew=-220.80]                         


Epoch #429: test_reward: -51.930000 ± 12.496964, best_reward: -17.080000 ± 13.172532 in #233


Epoch #430: 101it [00:09, 11.14it/s, env_step=43000, len=100, loss=1.754, n/ep=1, n/st=100, rew=-172.10]                         


Epoch #430: test_reward: -50.660000 ± 2.091028, best_reward: -17.080000 ± 13.172532 in #233


Epoch #431: 101it [00:08, 12.29it/s, env_step=43100, len=100, loss=1.990, n/ep=1, n/st=100, rew=-207.70]                         


Epoch #431: test_reward: -65.590000 ± 39.554632, best_reward: -17.080000 ± 13.172532 in #233


Epoch #432: 101it [00:06, 15.09it/s, env_step=43200, len=100, loss=1.945, n/ep=1, n/st=100, rew=-202.60]                          


Epoch #432: test_reward: -53.270000 ± 7.669426, best_reward: -17.080000 ± 13.172532 in #233


Epoch #433: 101it [00:12,  7.85it/s, env_step=43300, len=100, loss=1.931, n/ep=1, n/st=100, rew=-371.90]                         


Epoch #433: test_reward: -60.150000 ± 30.072155, best_reward: -17.080000 ± 13.172532 in #233


Epoch #434: 101it [00:01, 60.29it/s, env_step=43400, len=100, loss=2.079, n/ep=1, n/st=100, rew=-283.20]                         


Epoch #434: test_reward: -50.060000 ± 10.674755, best_reward: -17.080000 ± 13.172532 in #233


Epoch #435: 101it [00:01, 64.70it/s, env_step=43500, len=100, loss=1.809, n/ep=1, n/st=100, rew=-279.90]                          


Epoch #435: test_reward: -48.680000 ± 14.095872, best_reward: -17.080000 ± 13.172532 in #233


Epoch #436: 101it [00:06, 15.74it/s, env_step=43600, len=100, loss=1.803, n/ep=1, n/st=100, rew=-107.30]                         


Epoch #436: test_reward: -42.870000 ± 12.054630, best_reward: -17.080000 ± 13.172532 in #233


Epoch #437: 101it [00:02, 35.37it/s, env_step=43700, len=100, loss=1.713, n/ep=1, n/st=100, rew=-119.40]                          


Epoch #437: test_reward: -57.830000 ± 40.510592, best_reward: -17.080000 ± 13.172532 in #233


Epoch #438: 101it [00:05, 17.22it/s, env_step=43800, len=100, loss=1.726, n/ep=1, n/st=100, rew=-98.40]                          


Epoch #438: test_reward: -59.850000 ± 32.557680, best_reward: -17.080000 ± 13.172532 in #233


Epoch #439: 101it [00:06, 16.50it/s, env_step=43900, len=100, loss=1.773, n/ep=1, n/st=100, rew=-176.70]                          


Epoch #439: test_reward: -73.650000 ± 47.394857, best_reward: -17.080000 ± 13.172532 in #233


Epoch #440: 101it [00:05, 19.35it/s, env_step=44000, len=100, loss=1.797, n/ep=1, n/st=100, rew=-52.80]                          


Epoch #440: test_reward: -40.350000 ± 9.930181, best_reward: -17.080000 ± 13.172532 in #233


Epoch #441: 101it [00:06, 14.45it/s, env_step=44100, len=100, loss=1.755, n/ep=1, n/st=100, rew=-165.00]                          


Epoch #441: test_reward: -54.960000 ± 25.687320, best_reward: -17.080000 ± 13.172532 in #233


Epoch #442: 101it [00:04, 24.13it/s, env_step=44200, len=100, loss=1.729, n/ep=1, n/st=100, rew=-202.20]                          


Epoch #442: test_reward: -71.810000 ± 37.128519, best_reward: -17.080000 ± 13.172532 in #233


Epoch #443: 101it [00:03, 31.41it/s, env_step=44300, len=100, loss=1.728, n/ep=1, n/st=100, rew=-28.70]                          


Epoch #443: test_reward: -78.430000 ± 31.223422, best_reward: -17.080000 ± 13.172532 in #233


Epoch #444: 101it [00:06, 15.87it/s, env_step=44400, len=100, loss=1.774, n/ep=1, n/st=100, rew=-206.40]                         


Epoch #444: test_reward: -82.130000 ± 45.696106, best_reward: -17.080000 ± 13.172532 in #233


Epoch #445: 101it [00:01, 79.64it/s, env_step=44500, len=100, loss=1.588, n/ep=1, n/st=100, rew=-251.90]                          


Epoch #445: test_reward: -50.950000 ± 35.668621, best_reward: -17.080000 ± 13.172532 in #233


Epoch #446: 101it [00:10,  9.32it/s, env_step=44600, len=100, loss=1.692, n/ep=1, n/st=100, rew=-56.40]                         


Epoch #446: test_reward: -58.630000 ± 38.542523, best_reward: -17.080000 ± 13.172532 in #233


Epoch #447: 101it [00:02, 45.86it/s, env_step=44700, len=100, loss=1.567, n/ep=1, n/st=100, rew=-239.70]                         


Epoch #447: test_reward: -60.950000 ± 24.946833, best_reward: -17.080000 ± 13.172532 in #233


Epoch #448: 101it [00:01, 79.03it/s, env_step=44800, len=100, loss=1.613, n/ep=1, n/st=100, rew=-303.00]                          


Epoch #448: test_reward: -52.470000 ± 29.670425, best_reward: -17.080000 ± 13.172532 in #233


Epoch #449: 101it [00:08, 12.41it/s, env_step=44900, len=100, loss=1.530, n/ep=1, n/st=100, rew=-216.80]                          


Epoch #449: test_reward: -73.620000 ± 35.199170, best_reward: -17.080000 ± 13.172532 in #233


Epoch #450: 101it [00:03, 27.87it/s, env_step=45000, len=100, loss=1.573, n/ep=1, n/st=100, rew=-156.20]                         


Epoch #450: test_reward: -47.560000 ± 28.897827, best_reward: -17.080000 ± 13.172532 in #233


Epoch #451: 101it [00:02, 37.01it/s, env_step=45100, len=100, loss=1.674, n/ep=1, n/st=100, rew=-215.20]                         


Epoch #451: test_reward: -74.340000 ± 32.942957, best_reward: -17.080000 ± 13.172532 in #233


Epoch #452: 101it [00:13,  7.75it/s, env_step=45200, len=100, loss=1.712, n/ep=1, n/st=100, rew=-48.90]                         


Epoch #452: test_reward: -55.770000 ± 28.060722, best_reward: -17.080000 ± 13.172532 in #233


Epoch #453: 101it [00:06, 16.33it/s, env_step=45300, len=100, loss=1.581, n/ep=1, n/st=100, rew=-185.80]                          


Epoch #453: test_reward: -49.920000 ± 11.933633, best_reward: -17.080000 ± 13.172532 in #233


Epoch #454: 101it [00:13,  7.32it/s, env_step=45400, len=100, loss=1.640, n/ep=1, n/st=100, rew=-71.50]                          


Epoch #454: test_reward: -45.830000 ± 14.769093, best_reward: -17.080000 ± 13.172532 in #233


Epoch #455: 101it [00:04, 22.14it/s, env_step=45500, len=100, loss=2.404, n/ep=1, n/st=100, rew=-638.00]                          


Epoch #455: test_reward: -74.750000 ± 32.085207, best_reward: -17.080000 ± 13.172532 in #233


Epoch #456: 101it [00:04, 21.65it/s, env_step=45600, len=100, loss=2.481, n/ep=1, n/st=100, rew=-130.00]                         


Epoch #456: test_reward: -61.960000 ± 26.555271, best_reward: -17.080000 ± 13.172532 in #233


Epoch #457: 101it [00:08, 11.88it/s, env_step=45700, len=100, loss=2.078, n/ep=1, n/st=100, rew=-258.20]                         


Epoch #457: test_reward: -101.810000 ± 43.717375, best_reward: -17.080000 ± 13.172532 in #233


Epoch #458: 101it [00:05, 17.63it/s, env_step=45800, len=100, loss=2.317, n/ep=1, n/st=100, rew=-115.70]                          


Epoch #458: test_reward: -71.650000 ± 34.394135, best_reward: -17.080000 ± 13.172532 in #233


Epoch #459: 101it [00:11,  8.51it/s, env_step=45900, len=100, loss=2.280, n/ep=1, n/st=100, rew=-252.50]                          


Epoch #459: test_reward: -44.720000 ± 20.696850, best_reward: -17.080000 ± 13.172532 in #233


Epoch #460: 101it [00:03, 28.21it/s, env_step=46000, len=100, loss=2.146, n/ep=1, n/st=100, rew=-246.30]                          


Epoch #460: test_reward: -89.510000 ± 40.652981, best_reward: -17.080000 ± 13.172532 in #233


Epoch #461: 101it [00:10,  9.96it/s, env_step=46100, len=100, loss=2.270, n/ep=1, n/st=100, rew=-216.70]                          


Epoch #461: test_reward: -70.080000 ± 32.698465, best_reward: -17.080000 ± 13.172532 in #233


Epoch #462: 101it [00:01, 73.65it/s, env_step=46200, len=100, loss=1.949, n/ep=1, n/st=100, rew=-220.20]                          


Epoch #462: test_reward: -81.010000 ± 30.209219, best_reward: -17.080000 ± 13.172532 in #233


Epoch #463: 101it [00:03, 26.58it/s, env_step=46300, len=100, loss=1.893, n/ep=1, n/st=100, rew=-201.70]                         


Epoch #463: test_reward: -62.690000 ± 21.586915, best_reward: -17.080000 ± 13.172532 in #233


Epoch #464: 101it [00:09, 11.18it/s, env_step=46400, len=100, loss=2.042, n/ep=1, n/st=100, rew=-196.50]                         


Epoch #464: test_reward: -63.260000 ± 32.193422, best_reward: -17.080000 ± 13.172532 in #233


Epoch #465: 101it [00:05, 17.04it/s, env_step=46500, len=100, loss=1.998, n/ep=1, n/st=100, rew=-190.10]                         


Epoch #465: test_reward: -70.720000 ± 33.023894, best_reward: -17.080000 ± 13.172532 in #233


Epoch #466: 101it [00:07, 12.73it/s, env_step=46600, len=100, loss=1.804, n/ep=1, n/st=100, rew=-253.80]                          


Epoch #466: test_reward: -73.340000 ± 43.928241, best_reward: -17.080000 ± 13.172532 in #233


Epoch #467: 101it [00:08, 12.41it/s, env_step=46700, len=100, loss=1.968, n/ep=1, n/st=100, rew=-42.60]                          


Epoch #467: test_reward: -58.840000 ± 40.166160, best_reward: -17.080000 ± 13.172532 in #233


Epoch #468: 101it [00:07, 13.38it/s, env_step=46800, len=100, loss=1.819, n/ep=1, n/st=100, rew=-234.20]                          


Epoch #468: test_reward: -73.650000 ± 45.953634, best_reward: -17.080000 ± 13.172532 in #233


Epoch #469: 101it [00:02, 47.59it/s, env_step=46900, len=100, loss=2.035, n/ep=1, n/st=100, rew=-173.30]                          


Epoch #469: test_reward: -68.790000 ± 29.822054, best_reward: -17.080000 ± 13.172532 in #233


Epoch #470: 101it [00:04, 20.27it/s, env_step=47000, len=100, loss=1.871, n/ep=1, n/st=100, rew=-101.20]                          


Epoch #470: test_reward: -70.780000 ± 36.352931, best_reward: -17.080000 ± 13.172532 in #233


Epoch #471: 101it [00:08, 11.63it/s, env_step=47100, len=100, loss=2.055, n/ep=1, n/st=100, rew=-64.50]                         


Epoch #471: test_reward: -56.640000 ± 21.923467, best_reward: -17.080000 ± 13.172532 in #233


Epoch #472: 101it [00:06, 15.96it/s, env_step=47200, len=100, loss=1.889, n/ep=1, n/st=100, rew=-162.50]                         


Epoch #472: test_reward: -64.080000 ± 27.442551, best_reward: -17.080000 ± 13.172532 in #233


Epoch #473: 101it [00:15,  6.39it/s, env_step=47300, len=100, loss=2.082, n/ep=1, n/st=100, rew=-182.90]                         


Epoch #473: test_reward: -60.520000 ± 23.646217, best_reward: -17.080000 ± 13.172532 in #233


Epoch #474: 101it [00:01, 77.24it/s, env_step=47400, len=100, loss=1.805, n/ep=1, n/st=100, rew=-98.40]                          


Epoch #474: test_reward: -58.670000 ± 17.934997, best_reward: -17.080000 ± 13.172532 in #233


Epoch #475: 101it [00:04, 21.42it/s, env_step=47500, len=100, loss=1.942, n/ep=1, n/st=100, rew=-200.10]                         


Epoch #475: test_reward: -84.050000 ± 44.920981, best_reward: -17.080000 ± 13.172532 in #233


Epoch #476: 101it [00:10,  9.36it/s, env_step=47600, len=100, loss=1.690, n/ep=1, n/st=100, rew=-160.70]                         


Epoch #476: test_reward: -72.660000 ± 30.613402, best_reward: -17.080000 ± 13.172532 in #233


Epoch #477: 101it [00:03, 29.60it/s, env_step=47700, len=100, loss=1.915, n/ep=1, n/st=100, rew=-195.20]                          


Epoch #477: test_reward: -77.050000 ± 35.432732, best_reward: -17.080000 ± 13.172532 in #233


Epoch #478: 101it [00:12,  8.11it/s, env_step=47800, len=100, loss=1.612, n/ep=1, n/st=100, rew=-205.70]                          


Epoch #478: test_reward: -80.480000 ± 28.647541, best_reward: -17.080000 ± 13.172532 in #233


Epoch #479: 101it [00:01, 62.68it/s, env_step=47900, len=100, loss=1.820, n/ep=1, n/st=100, rew=-237.70]                          


Epoch #479: test_reward: -74.210000 ± 26.328937, best_reward: -17.080000 ± 13.172532 in #233


Epoch #480: 101it [00:01, 52.93it/s, env_step=48000, len=100, loss=1.738, n/ep=1, n/st=100, rew=-171.20]                          


Epoch #480: test_reward: -76.450000 ± 32.919394, best_reward: -17.080000 ± 13.172532 in #233


Epoch #481: 101it [00:15,  6.64it/s, env_step=48100, len=100, loss=1.690, n/ep=1, n/st=100, rew=-35.60]                         


Epoch #481: test_reward: -75.560000 ± 27.745385, best_reward: -17.080000 ± 13.172532 in #233


Epoch #482: 101it [00:03, 31.31it/s, env_step=48200, len=100, loss=1.724, n/ep=1, n/st=100, rew=-32.30]                          


Epoch #482: test_reward: -88.470000 ± 45.187964, best_reward: -17.080000 ± 13.172532 in #233


Epoch #483: 101it [00:08, 12.52it/s, env_step=48300, len=100, loss=1.587, n/ep=1, n/st=100, rew=-209.90]                         


Epoch #483: test_reward: -64.630000 ± 31.570589, best_reward: -17.080000 ± 13.172532 in #233


Epoch #484: 101it [00:01, 58.60it/s, env_step=48400, len=100, loss=1.513, n/ep=1, n/st=100, rew=-160.10]                          


Epoch #484: test_reward: -68.870000 ± 28.181521, best_reward: -17.080000 ± 13.172532 in #233


Epoch #485: 101it [00:07, 13.51it/s, env_step=48500, len=100, loss=1.523, n/ep=1, n/st=100, rew=-79.80]                         


Epoch #485: test_reward: -72.180000 ± 39.800246, best_reward: -17.080000 ± 13.172532 in #233


Epoch #486: 101it [00:09, 11.21it/s, env_step=48600, len=100, loss=1.492, n/ep=1, n/st=100, rew=-184.30]                         


Epoch #486: test_reward: -52.480000 ± 23.838112, best_reward: -17.080000 ± 13.172532 in #233


Epoch #487: 101it [00:01, 60.73it/s, env_step=48700, len=100, loss=1.465, n/ep=1, n/st=100, rew=-285.80]                          


Epoch #487: test_reward: -75.550000 ± 30.518527, best_reward: -17.080000 ± 13.172532 in #233


Epoch #488: 101it [00:16,  6.13it/s, env_step=48800, len=100, loss=1.437, n/ep=1, n/st=100, rew=-33.10]                          


Epoch #488: test_reward: -88.760000 ± 26.033140, best_reward: -17.080000 ± 13.172532 in #233


Epoch #489: 101it [00:02, 35.01it/s, env_step=48900, len=100, loss=1.389, n/ep=1, n/st=100, rew=-174.50]                          


Epoch #489: test_reward: -73.080000 ± 34.220573, best_reward: -17.080000 ± 13.172532 in #233


Epoch #490: 101it [00:03, 27.89it/s, env_step=49000, len=100, loss=1.941, n/ep=1, n/st=100, rew=-374.70]                          


Epoch #490: test_reward: -85.720000 ± 36.776781, best_reward: -17.080000 ± 13.172532 in #233


Epoch #491: 101it [00:05, 17.41it/s, env_step=49100, len=100, loss=1.811, n/ep=1, n/st=100, rew=-157.10]                         


Epoch #491: test_reward: -73.360000 ± 34.333605, best_reward: -17.080000 ± 13.172532 in #233


Epoch #492: 101it [00:04, 20.73it/s, env_step=49200, len=100, loss=1.800, n/ep=1, n/st=100, rew=-182.20]                          


Epoch #492: test_reward: -97.530000 ± 47.619556, best_reward: -17.080000 ± 13.172532 in #233


Epoch #493: 101it [00:02, 39.16it/s, env_step=49300, len=100, loss=1.713, n/ep=1, n/st=100, rew=-134.60]                         


Epoch #493: test_reward: -80.750000 ± 34.499399, best_reward: -17.080000 ± 13.172532 in #233


Epoch #494: 101it [00:03, 31.32it/s, env_step=49400, len=100, loss=2.060, n/ep=1, n/st=100, rew=-449.00]                         


Epoch #494: test_reward: -77.740000 ± 30.400763, best_reward: -17.080000 ± 13.172532 in #233


Epoch #495: 101it [00:05, 17.37it/s, env_step=49500, len=100, loss=2.238, n/ep=1, n/st=100, rew=-32.20]                          


Epoch #495: test_reward: -88.480000 ± 38.078283, best_reward: -17.080000 ± 13.172532 in #233


Epoch #496: 101it [00:10,  9.60it/s, env_step=49600, len=100, loss=2.161, n/ep=1, n/st=100, rew=-248.40]                         


Epoch #496: test_reward: -69.170000 ± 29.087250, best_reward: -17.080000 ± 13.172532 in #233


Epoch #497: 101it [00:01, 74.92it/s, env_step=49700, len=100, loss=1.991, n/ep=1, n/st=100, rew=-145.50]                          


Epoch #497: test_reward: -63.610000 ± 29.577845, best_reward: -17.080000 ± 13.172532 in #233


Epoch #498: 101it [00:03, 25.70it/s, env_step=49800, len=100, loss=2.085, n/ep=1, n/st=100, rew=-217.30]                         


Epoch #498: test_reward: -83.140000 ± 40.483878, best_reward: -17.080000 ± 13.172532 in #233


Epoch #499: 101it [00:07, 13.13it/s, env_step=49900, len=100, loss=2.741, n/ep=1, n/st=100, rew=-581.20]                         


Epoch #499: test_reward: -87.160000 ± 29.932932, best_reward: -17.080000 ± 13.172532 in #233


Epoch #500: 101it [00:05, 19.68it/s, env_step=50000, len=100, loss=2.658, n/ep=1, n/st=100, rew=-44.00]                          


Epoch #500: test_reward: -68.190000 ± 35.998485, best_reward: -17.080000 ± 13.172532 in #233


Epoch #501: 101it [00:16,  6.31it/s, env_step=50100, len=100, loss=2.504, n/ep=1, n/st=100, rew=-205.80]                         


Epoch #501: test_reward: -61.890000 ± 30.647984, best_reward: -17.080000 ± 13.172532 in #233


Epoch #502: 101it [00:04, 21.80it/s, env_step=50200, len=100, loss=2.540, n/ep=1, n/st=100, rew=-49.00]                         


Epoch #502: test_reward: -84.070000 ± 45.283442, best_reward: -17.080000 ± 13.172532 in #233


Epoch #503: 101it [00:12,  8.19it/s, env_step=50300, len=100, loss=2.347, n/ep=1, n/st=100, rew=-148.40]                          


Epoch #503: test_reward: -78.750000 ± 27.645370, best_reward: -17.080000 ± 13.172532 in #233


Epoch #504: 101it [00:01, 80.52it/s, env_step=50400, len=100, loss=2.575, n/ep=1, n/st=100, rew=-345.00]                          


Epoch #504: test_reward: -72.710000 ± 43.358170, best_reward: -17.080000 ± 13.172532 in #233


Epoch #505: 101it [00:04, 24.20it/s, env_step=50500, len=100, loss=2.579, n/ep=1, n/st=100, rew=-229.50]                         


Epoch #505: test_reward: -55.120000 ± 30.301875, best_reward: -17.080000 ± 13.172532 in #233


Epoch #506: 101it [00:10,  9.74it/s, env_step=50600, len=100, loss=2.316, n/ep=1, n/st=100, rew=-190.90]                         


Epoch #506: test_reward: -72.470000 ± 35.113560, best_reward: -17.080000 ± 13.172532 in #233


Epoch #507: 101it [00:02, 38.66it/s, env_step=50700, len=100, loss=1.981, n/ep=1, n/st=100, rew=-107.10]                          


Epoch #507: test_reward: -64.390000 ± 27.082742, best_reward: -17.080000 ± 13.172532 in #233


Epoch #508: 101it [00:05, 20.01it/s, env_step=50800, len=100, loss=2.017, n/ep=1, n/st=100, rew=-186.70]                          


Epoch #508: test_reward: -73.620000 ± 38.848933, best_reward: -17.080000 ± 13.172532 in #233


Epoch #509: 101it [00:08, 11.52it/s, env_step=50900, len=100, loss=2.249, n/ep=1, n/st=100, rew=-223.10]                         


Epoch #509: test_reward: -49.480000 ± 29.179198, best_reward: -17.080000 ± 13.172532 in #233


Epoch #510: 101it [00:08, 11.98it/s, env_step=51000, len=100, loss=2.188, n/ep=1, n/st=100, rew=-222.40]                          


Epoch #510: test_reward: -64.800000 ± 30.977024, best_reward: -17.080000 ± 13.172532 in #233


Epoch #511: 101it [00:08, 12.35it/s, env_step=51100, len=100, loss=2.963, n/ep=1, n/st=100, rew=-416.90]                         


Epoch #511: test_reward: -61.570000 ± 28.971954, best_reward: -17.080000 ± 13.172532 in #233


Epoch #512: 101it [00:01, 78.87it/s, env_step=51200, len=100, loss=2.494, n/ep=1, n/st=100, rew=-181.30]                          


Epoch #512: test_reward: -84.780000 ± 38.095139, best_reward: -17.080000 ± 13.172532 in #233


Epoch #513: 101it [00:02, 38.88it/s, env_step=51300, len=100, loss=2.341, n/ep=1, n/st=100, rew=-131.00]                          


Epoch #513: test_reward: -89.970000 ± 44.175741, best_reward: -17.080000 ± 13.172532 in #233


Epoch #514: 101it [00:09, 10.46it/s, env_step=51400, len=100, loss=2.277, n/ep=1, n/st=100, rew=-239.80]                         


Epoch #514: test_reward: -99.280000 ± 34.404151, best_reward: -17.080000 ± 13.172532 in #233


Epoch #515: 101it [00:02, 50.30it/s, env_step=51500, len=100, loss=2.484, n/ep=1, n/st=100, rew=-383.00]                          


Epoch #515: test_reward: -82.210000 ± 30.297308, best_reward: -17.080000 ± 13.172532 in #233


Epoch #516: 101it [00:12,  8.25it/s, env_step=51600, len=100, loss=2.409, n/ep=1, n/st=100, rew=-188.20]                          


Epoch #516: test_reward: -43.290000 ± 16.188357, best_reward: -17.080000 ± 13.172532 in #233


Epoch #517: 101it [00:01, 77.70it/s, env_step=51700, len=100, loss=2.281, n/ep=1, n/st=100, rew=-69.70]                          


Epoch #517: test_reward: -58.530000 ± 29.018203, best_reward: -17.080000 ± 13.172532 in #233


Epoch #518: 101it [00:02, 35.64it/s, env_step=51800, len=100, loss=2.697, n/ep=1, n/st=100, rew=-140.10]                          


Epoch #518: test_reward: -71.660000 ± 27.230468, best_reward: -17.080000 ± 13.172532 in #233


Epoch #519: 101it [00:09, 10.68it/s, env_step=51900, len=100, loss=2.476, n/ep=1, n/st=100, rew=-69.40]                         


Epoch #519: test_reward: -69.340000 ± 33.552711, best_reward: -17.080000 ± 13.172532 in #233


Epoch #520: 101it [00:05, 17.06it/s, env_step=52000, len=100, loss=2.449, n/ep=1, n/st=100, rew=-300.40]                         


Epoch #520: test_reward: -69.490000 ± 32.946425, best_reward: -17.080000 ± 13.172532 in #233


Epoch #521: 101it [00:08, 11.46it/s, env_step=52100, len=100, loss=2.551, n/ep=1, n/st=100, rew=-146.80]                         


Epoch #521: test_reward: -52.200000 ± 27.738565, best_reward: -17.080000 ± 13.172532 in #233


Epoch #522: 101it [00:01, 61.34it/s, env_step=52200, len=100, loss=2.570, n/ep=1, n/st=100, rew=-82.60]                          


Epoch #522: test_reward: -60.180000 ± 23.897732, best_reward: -17.080000 ± 13.172532 in #233


Epoch #523: 101it [00:01, 52.98it/s, env_step=52300, len=100, loss=2.653, n/ep=1, n/st=100, rew=-80.50]                          


Epoch #523: test_reward: -60.770000 ± 27.645869, best_reward: -17.080000 ± 13.172532 in #233


Epoch #524: 101it [00:11,  8.49it/s, env_step=52400, len=100, loss=2.742, n/ep=1, n/st=100, rew=-188.20]                         


Epoch #524: test_reward: -76.890000 ± 36.090288, best_reward: -17.080000 ± 13.172532 in #233


Epoch #525: 101it [00:04, 24.87it/s, env_step=52500, len=100, loss=2.048, n/ep=1, n/st=100, rew=-77.70]                          


Epoch #525: test_reward: -58.640000 ± 38.497901, best_reward: -17.080000 ± 13.172532 in #233


Epoch #526: 101it [00:04, 24.73it/s, env_step=52600, len=100, loss=2.196, n/ep=1, n/st=100, rew=-186.90]                          


Epoch #526: test_reward: -73.800000 ± 32.588924, best_reward: -17.080000 ± 13.172532 in #233


Epoch #527: 101it [00:08, 11.53it/s, env_step=52700, len=100, loss=2.132, n/ep=1, n/st=100, rew=-199.50]                         


Epoch #527: test_reward: -60.230000 ± 15.798041, best_reward: -17.080000 ± 13.172532 in #233


Epoch #528: 101it [00:08, 12.39it/s, env_step=52800, len=100, loss=2.433, n/ep=1, n/st=100, rew=-176.80]                          


Epoch #528: test_reward: -59.050000 ± 32.409759, best_reward: -17.080000 ± 13.172532 in #233


Epoch #529: 101it [00:10,  9.83it/s, env_step=52900, len=100, loss=2.247, n/ep=1, n/st=100, rew=-169.10]                         


Epoch #529: test_reward: -86.890000 ± 34.088516, best_reward: -17.080000 ± 13.172532 in #233


Epoch #530: 101it [00:04, 25.19it/s, env_step=53000, len=100, loss=2.230, n/ep=1, n/st=100, rew=-99.30]                          


Epoch #530: test_reward: -96.370000 ± 46.677426, best_reward: -17.080000 ± 13.172532 in #233


Epoch #531: 101it [00:14,  6.92it/s, env_step=53100, len=100, loss=2.221, n/ep=1, n/st=100, rew=-175.50]                          


Epoch #531: test_reward: -67.090000 ± 34.306456, best_reward: -17.080000 ± 13.172532 in #233


Epoch #532: 101it [00:02, 43.30it/s, env_step=53200, len=100, loss=2.531, n/ep=1, n/st=100, rew=-185.60]                         


Epoch #532: test_reward: -51.790000 ± 10.314693, best_reward: -17.080000 ± 13.172532 in #233


Epoch #533: 101it [00:12,  8.30it/s, env_step=53300, len=100, loss=2.551, n/ep=1, n/st=100, rew=-443.50]                          


Epoch #533: test_reward: -66.660000 ± 19.573870, best_reward: -17.080000 ± 13.172532 in #233


Epoch #534: 101it [00:01, 68.90it/s, env_step=53400, len=100, loss=2.488, n/ep=1, n/st=100, rew=-202.70]                          


Epoch #534: test_reward: -86.160000 ± 37.347000, best_reward: -17.080000 ± 13.172532 in #233


Epoch #535: 101it [00:05, 18.75it/s, env_step=53500, len=100, loss=2.522, n/ep=1, n/st=100, rew=-28.40]                         


Epoch #535: test_reward: -76.380000 ± 34.682843, best_reward: -17.080000 ± 13.172532 in #233


Epoch #536: 101it [00:07, 14.41it/s, env_step=53600, len=100, loss=2.512, n/ep=1, n/st=100, rew=-138.60]                         


Epoch #536: test_reward: -62.170000 ± 18.825039, best_reward: -17.080000 ± 13.172532 in #233


Epoch #537: 101it [00:01, 51.68it/s, env_step=53700, len=100, loss=2.189, n/ep=1, n/st=100, rew=-120.50]                          


Epoch #537: test_reward: -77.750000 ± 30.293803, best_reward: -17.080000 ± 13.172532 in #233


Epoch #538: 101it [00:11,  8.68it/s, env_step=53800, len=100, loss=2.258, n/ep=1, n/st=100, rew=-112.10]                         


Epoch #538: test_reward: -90.660000 ± 38.223794, best_reward: -17.080000 ± 13.172532 in #233


Epoch #539: 101it [00:01, 63.41it/s, env_step=53900, len=100, loss=2.119, n/ep=1, n/st=100, rew=-95.50]                          


Epoch #539: test_reward: -61.780000 ± 28.494203, best_reward: -17.080000 ± 13.172532 in #233


Epoch #540: 101it [00:03, 29.92it/s, env_step=54000, len=100, loss=1.781, n/ep=1, n/st=100, rew=-250.90]                         


Epoch #540: test_reward: -69.270000 ± 25.704671, best_reward: -17.080000 ± 13.172532 in #233


Epoch #541: 101it [00:08, 12.14it/s, env_step=54100, len=100, loss=2.082, n/ep=1, n/st=100, rew=-89.10]                         


Epoch #541: test_reward: -76.040000 ± 33.579375, best_reward: -17.080000 ± 13.172532 in #233


Epoch #542: 101it [00:03, 32.93it/s, env_step=54200, len=100, loss=2.215, n/ep=1, n/st=100, rew=-245.90]                         


Epoch #542: test_reward: -89.240000 ± 48.934327, best_reward: -17.080000 ± 13.172532 in #233


Epoch #543: 101it [00:02, 36.96it/s, env_step=54300, len=100, loss=2.273, n/ep=1, n/st=100, rew=-202.20]                          


Epoch #543: test_reward: -94.740000 ± 45.781223, best_reward: -17.080000 ± 13.172532 in #233


Epoch #544: 101it [00:12,  8.07it/s, env_step=54400, len=100, loss=2.070, n/ep=1, n/st=100, rew=-241.70]                         


Epoch #544: test_reward: -54.630000 ± 25.285966, best_reward: -17.080000 ± 13.172532 in #233


Epoch #545: 101it [00:04, 23.81it/s, env_step=54500, len=100, loss=2.005, n/ep=1, n/st=100, rew=-163.50]                          


Epoch #545: test_reward: -53.870000 ± 20.156341, best_reward: -17.080000 ± 13.172532 in #233


Epoch #546: 101it [00:15,  6.64it/s, env_step=54600, len=100, loss=2.038, n/ep=1, n/st=100, rew=-221.10]                         


Epoch #546: test_reward: -86.590000 ± 48.678403, best_reward: -17.080000 ± 13.172532 in #233


Epoch #547: 101it [00:04, 21.04it/s, env_step=54700, len=100, loss=1.901, n/ep=1, n/st=100, rew=-212.30]                         


Epoch #547: test_reward: -71.870000 ± 42.147409, best_reward: -17.080000 ± 13.172532 in #233


Epoch #548: 101it [00:14,  6.99it/s, env_step=54800, len=100, loss=1.986, n/ep=1, n/st=100, rew=-198.50]                          


Epoch #548: test_reward: -62.100000 ± 42.452774, best_reward: -17.080000 ± 13.172532 in #233


Epoch #549: 101it [00:04, 25.02it/s, env_step=54900, len=100, loss=1.913, n/ep=1, n/st=100, rew=-176.90]                          


Epoch #549: test_reward: -53.260000 ± 19.208342, best_reward: -17.080000 ± 13.172532 in #233


Epoch #550: 101it [00:02, 45.81it/s, env_step=55000, len=100, loss=1.723, n/ep=1, n/st=100, rew=-158.50]                          


Epoch #550: test_reward: -55.510000 ± 35.450429, best_reward: -17.080000 ± 13.172532 in #233


Epoch #551: 101it [00:10, 10.02it/s, env_step=55100, len=100, loss=1.849, n/ep=1, n/st=100, rew=-217.00]                         


Epoch #551: test_reward: -47.410000 ± 25.644627, best_reward: -17.080000 ± 13.172532 in #233


Epoch #552: 101it [00:05, 20.16it/s, env_step=55200, len=100, loss=2.034, n/ep=1, n/st=100, rew=-49.90]                          


Epoch #552: test_reward: -53.640000 ± 21.920639, best_reward: -17.080000 ± 13.172532 in #233


Epoch #553: 101it [00:04, 21.47it/s, env_step=55300, len=100, loss=1.933, n/ep=1, n/st=100, rew=-264.80]                         


Epoch #553: test_reward: -78.740000 ± 36.308159, best_reward: -17.080000 ± 13.172532 in #233


Epoch #554: 101it [00:02, 45.94it/s, env_step=55400, len=100, loss=1.909, n/ep=1, n/st=100, rew=-207.60]                         


Epoch #554: test_reward: -77.640000 ± 39.093176, best_reward: -17.080000 ± 13.172532 in #233


Epoch #555: 101it [00:01, 60.81it/s, env_step=55500, len=100, loss=1.857, n/ep=1, n/st=100, rew=-166.40]                          


Epoch #555: test_reward: -73.810000 ± 42.706942, best_reward: -17.080000 ± 13.172532 in #233


Epoch #556: 101it [00:07, 13.54it/s, env_step=55600, len=100, loss=1.813, n/ep=1, n/st=100, rew=-38.20]                         


Epoch #556: test_reward: -38.670000 ± 13.369596, best_reward: -17.080000 ± 13.172532 in #233


Epoch #557: 101it [00:05, 19.20it/s, env_step=55700, len=100, loss=1.541, n/ep=1, n/st=100, rew=-159.20]                         


Epoch #557: test_reward: -61.940000 ± 21.142620, best_reward: -17.080000 ± 13.172532 in #233


Epoch #558: 101it [00:05, 19.60it/s, env_step=55800, len=100, loss=1.617, n/ep=1, n/st=100, rew=-164.00]                          


Epoch #558: test_reward: -47.310000 ± 34.136299, best_reward: -17.080000 ± 13.172532 in #233


Epoch #559: 101it [00:11,  9.04it/s, env_step=55900, len=100, loss=1.708, n/ep=1, n/st=100, rew=-43.50]                         


Epoch #559: test_reward: -59.640000 ± 36.613145, best_reward: -17.080000 ± 13.172532 in #233


Epoch #560: 101it [00:04, 23.97it/s, env_step=56000, len=100, loss=2.000, n/ep=1, n/st=100, rew=-154.90]                          


Epoch #560: test_reward: -51.270000 ± 39.405610, best_reward: -17.080000 ± 13.172532 in #233


Epoch #561: 101it [00:02, 46.33it/s, env_step=56100, len=100, loss=1.863, n/ep=1, n/st=100, rew=-51.30]                          


Epoch #561: test_reward: -52.350000 ± 25.340570, best_reward: -17.080000 ± 13.172532 in #233


Epoch #562: 101it [00:09, 11.16it/s, env_step=56200, len=100, loss=1.752, n/ep=1, n/st=100, rew=-130.50]                         


Epoch #562: test_reward: -43.350000 ± 24.268302, best_reward: -17.080000 ± 13.172532 in #233


Epoch #563: 101it [00:03, 25.59it/s, env_step=56300, len=100, loss=1.638, n/ep=1, n/st=100, rew=-268.40]                          


Epoch #563: test_reward: -51.950000 ± 27.670824, best_reward: -17.080000 ± 13.172532 in #233


Epoch #564: 101it [00:02, 36.86it/s, env_step=56400, len=100, loss=1.640, n/ep=1, n/st=100, rew=-204.40]                         


Epoch #564: test_reward: -50.010000 ± 27.272457, best_reward: -17.080000 ± 13.172532 in #233


Epoch #565: 101it [00:05, 19.01it/s, env_step=56500, len=100, loss=1.527, n/ep=1, n/st=100, rew=-165.40]                         


Epoch #565: test_reward: -46.330000 ± 29.740008, best_reward: -17.080000 ± 13.172532 in #233


Epoch #566: 101it [00:02, 43.21it/s, env_step=56600, len=100, loss=1.656, n/ep=1, n/st=100, rew=-35.40]                          


Epoch #566: test_reward: -59.800000 ± 36.164071, best_reward: -17.080000 ± 13.172532 in #233


Epoch #567: 101it [00:02, 41.88it/s, env_step=56700, len=100, loss=1.571, n/ep=1, n/st=100, rew=-192.50]                          


Epoch #567: test_reward: -78.570000 ± 48.039506, best_reward: -17.080000 ± 13.172532 in #233


Epoch #568: 101it [00:06, 15.02it/s, env_step=56800, len=100, loss=1.490, n/ep=1, n/st=100, rew=-130.30]                         


Epoch #568: test_reward: -82.220000 ± 33.863219, best_reward: -17.080000 ± 13.172532 in #233


Epoch #569: 101it [00:04, 22.82it/s, env_step=56900, len=100, loss=1.725, n/ep=1, n/st=100, rew=-165.70]                         


Epoch #569: test_reward: -65.060000 ± 37.518961, best_reward: -17.080000 ± 13.172532 in #233


Epoch #570: 101it [00:03, 25.34it/s, env_step=57000, len=100, loss=1.396, n/ep=1, n/st=100, rew=-192.10]                          


Epoch #570: test_reward: -71.680000 ± 41.019600, best_reward: -17.080000 ± 13.172532 in #233


Epoch #571: 101it [00:13,  7.43it/s, env_step=57100, len=100, loss=1.615, n/ep=1, n/st=100, rew=-265.60]                         


Epoch #571: test_reward: -78.890000 ± 32.889647, best_reward: -17.080000 ± 13.172532 in #233


Epoch #572: 101it [00:04, 22.69it/s, env_step=57200, len=100, loss=1.604, n/ep=1, n/st=100, rew=-35.90]                          


Epoch #572: test_reward: -80.560000 ± 39.276919, best_reward: -17.080000 ± 13.172532 in #233


Epoch #573: 101it [00:09, 10.58it/s, env_step=57300, len=100, loss=1.537, n/ep=1, n/st=100, rew=-32.20]                          


Epoch #573: test_reward: -53.180000 ± 25.359054, best_reward: -17.080000 ± 13.172532 in #233


Epoch #574: 101it [00:03, 27.61it/s, env_step=57400, len=100, loss=1.640, n/ep=1, n/st=100, rew=-36.50]                          


Epoch #574: test_reward: -60.330000 ± 31.413024, best_reward: -17.080000 ± 13.172532 in #233


Epoch #575: 101it [00:01, 88.90it/s, env_step=57500, len=100, loss=1.468, n/ep=1, n/st=100, rew=-190.90]                          


Epoch #575: test_reward: -60.190000 ± 37.096858, best_reward: -17.080000 ± 13.172532 in #233


Epoch #576: 101it [00:02, 33.69it/s, env_step=57600, len=100, loss=1.462, n/ep=1, n/st=100, rew=-87.90]                          


Epoch #576: test_reward: -56.810000 ± 37.033754, best_reward: -17.080000 ± 13.172532 in #233


Epoch #577: 101it [00:01, 84.10it/s, env_step=57700, len=100, loss=1.407, n/ep=1, n/st=100, rew=-264.30]                          


Epoch #577: test_reward: -84.430000 ± 34.123014, best_reward: -17.080000 ± 13.172532 in #233


Epoch #578: 101it [00:02, 42.12it/s, env_step=57800, len=100, loss=2.194, n/ep=1, n/st=100, rew=-442.50]                         


Epoch #578: test_reward: -65.410000 ± 31.397881, best_reward: -17.080000 ± 13.172532 in #233


Epoch #579: 101it [00:06, 15.92it/s, env_step=57900, len=100, loss=2.170, n/ep=1, n/st=100, rew=-269.50]                          


Epoch #579: test_reward: -67.220000 ± 35.071464, best_reward: -17.080000 ± 13.172532 in #233


Epoch #580: 101it [00:01, 71.84it/s, env_step=58000, len=100, loss=2.124, n/ep=1, n/st=100, rew=-112.40]                          


Epoch #580: test_reward: -62.700000 ± 38.660290, best_reward: -17.080000 ± 13.172532 in #233


Epoch #581: 101it [00:03, 26.25it/s, env_step=58100, len=100, loss=1.792, n/ep=1, n/st=100, rew=-225.20]                         


Epoch #581: test_reward: -43.970000 ± 19.304303, best_reward: -17.080000 ± 13.172532 in #233


Epoch #582: 101it [00:09, 10.11it/s, env_step=58200, len=100, loss=1.988, n/ep=1, n/st=100, rew=-85.90]                         


Epoch #582: test_reward: -64.720000 ± 37.633065, best_reward: -17.080000 ± 13.172532 in #233


Epoch #583: 101it [00:01, 74.33it/s, env_step=58300, len=100, loss=1.920, n/ep=1, n/st=100, rew=-230.10]                          


Epoch #583: test_reward: -55.180000 ± 34.723761, best_reward: -17.080000 ± 13.172532 in #233


Epoch #584: 101it [00:03, 29.31it/s, env_step=58400, len=100, loss=1.857, n/ep=1, n/st=100, rew=-60.50]                         


Epoch #584: test_reward: -57.000000 ± 29.118070, best_reward: -17.080000 ± 13.172532 in #233


Epoch #585: 101it [00:11,  8.74it/s, env_step=58500, len=100, loss=1.640, n/ep=1, n/st=100, rew=-216.00]                         


Epoch #585: test_reward: -69.500000 ± 25.898687, best_reward: -17.080000 ± 13.172532 in #233


Epoch #586: 101it [00:02, 43.76it/s, env_step=58600, len=100, loss=1.735, n/ep=1, n/st=100, rew=-55.20]                          


Epoch #586: test_reward: -70.560000 ± 35.682775, best_reward: -17.080000 ± 13.172532 in #233


Epoch #587: 101it [00:02, 38.21it/s, env_step=58700, len=100, loss=1.723, n/ep=1, n/st=100, rew=-171.90]                          


Epoch #587: test_reward: -57.340000 ± 31.760139, best_reward: -17.080000 ± 13.172532 in #233


Epoch #588: 101it [00:09, 11.11it/s, env_step=58800, len=100, loss=1.743, n/ep=1, n/st=100, rew=-48.20]                         


Epoch #588: test_reward: -110.520000 ± 53.541251, best_reward: -17.080000 ± 13.172532 in #233


Epoch #589: 101it [00:03, 25.89it/s, env_step=58900, len=100, loss=1.542, n/ep=1, n/st=100, rew=-253.40]                         


Epoch #589: test_reward: -59.960000 ± 28.661933, best_reward: -17.080000 ± 13.172532 in #233


Epoch #590: 101it [00:02, 38.86it/s, env_step=59000, len=100, loss=1.499, n/ep=1, n/st=100, rew=-226.60]                          


Epoch #590: test_reward: -67.230000 ± 42.616148, best_reward: -17.080000 ± 13.172532 in #233


Epoch #591: 101it [00:11,  8.88it/s, env_step=59100, len=100, loss=1.481, n/ep=1, n/st=100, rew=-233.70]                         


Epoch #591: test_reward: -86.020000 ± 35.118593, best_reward: -17.080000 ± 13.172532 in #233


Epoch #592: 101it [00:02, 41.85it/s, env_step=59200, len=100, loss=1.609, n/ep=1, n/st=100, rew=-31.80]                         


Epoch #592: test_reward: -53.870000 ± 22.755002, best_reward: -17.080000 ± 13.172532 in #233


Epoch #593: 101it [00:10,  9.71it/s, env_step=59300, len=100, loss=1.593, n/ep=1, n/st=100, rew=-34.60]                          


Epoch #593: test_reward: -56.230000 ± 14.010000, best_reward: -17.080000 ± 13.172532 in #233


Epoch #594: 101it [00:03, 30.42it/s, env_step=59400, len=100, loss=1.767, n/ep=1, n/st=100, rew=-252.90]                         


Epoch #594: test_reward: -73.080000 ± 33.613414, best_reward: -17.080000 ± 13.172532 in #233


Epoch #595: 101it [00:07, 12.94it/s, env_step=59500, len=100, loss=1.853, n/ep=1, n/st=100, rew=-294.50]                          


Epoch #595: test_reward: -86.360000 ± 43.938939, best_reward: -17.080000 ± 13.172532 in #233


Epoch #596: 101it [00:01, 79.19it/s, env_step=59600, len=100, loss=1.661, n/ep=1, n/st=100, rew=-217.90]                          


Epoch #596: test_reward: -85.810000 ± 39.219394, best_reward: -17.080000 ± 13.172532 in #233


Epoch #597: 101it [00:04, 22.80it/s, env_step=59700, len=100, loss=1.724, n/ep=1, n/st=100, rew=-252.30]                          


Epoch #597: test_reward: -73.090000 ± 34.490070, best_reward: -17.080000 ± 13.172532 in #233


Epoch #598: 101it [00:04, 23.60it/s, env_step=59800, len=100, loss=1.491, n/ep=1, n/st=100, rew=-92.20]                         


Epoch #598: test_reward: -61.530000 ± 32.410864, best_reward: -17.080000 ± 13.172532 in #233


Epoch #599: 101it [00:01, 67.18it/s, env_step=59900, len=100, loss=1.642, n/ep=1, n/st=100, rew=-111.10]                          


Epoch #599: test_reward: -72.750000 ± 35.048659, best_reward: -17.080000 ± 13.172532 in #233


Epoch #600: 101it [00:03, 25.43it/s, env_step=60000, len=100, loss=1.554, n/ep=1, n/st=100, rew=-202.70]                         


Epoch #600: test_reward: -85.150000 ± 33.658647, best_reward: -17.080000 ± 13.172532 in #233


Epoch #601: 101it [00:10,  9.66it/s, env_step=60100, len=100, loss=1.899, n/ep=1, n/st=100, rew=-87.80]                         


Epoch #601: test_reward: -84.080000 ± 39.204535, best_reward: -17.080000 ± 13.172532 in #233


Epoch #602: 101it [00:02, 42.13it/s, env_step=60200, len=100, loss=1.679, n/ep=1, n/st=100, rew=-198.90]                          


Epoch #602: test_reward: -75.620000 ± 48.337973, best_reward: -17.080000 ± 13.172532 in #233


Epoch #603: 101it [00:05, 18.84it/s, env_step=60300, len=100, loss=1.519, n/ep=1, n/st=100, rew=-163.30]                          


Epoch #603: test_reward: -53.770000 ± 30.319665, best_reward: -17.080000 ± 13.172532 in #233


Epoch #604: 101it [00:05, 19.29it/s, env_step=60400, len=100, loss=1.287, n/ep=1, n/st=100, rew=-210.70]                         


Epoch #604: test_reward: -73.700000 ± 31.801698, best_reward: -17.080000 ± 13.172532 in #233


Epoch #605: 101it [00:03, 25.58it/s, env_step=60500, len=100, loss=1.367, n/ep=1, n/st=100, rew=-172.70]                          


Epoch #605: test_reward: -76.280000 ± 36.505035, best_reward: -17.080000 ± 13.172532 in #233


Epoch #606: 101it [00:09, 10.86it/s, env_step=60600, len=100, loss=1.857, n/ep=1, n/st=100, rew=-297.70]                         


Epoch #606: test_reward: -70.860000 ± 33.949292, best_reward: -17.080000 ± 13.172532 in #233


Epoch #607: 101it [00:02, 35.05it/s, env_step=60700, len=100, loss=1.584, n/ep=1, n/st=100, rew=-79.40]                          


Epoch #607: test_reward: -60.060000 ± 34.188659, best_reward: -17.080000 ± 13.172532 in #233


Epoch #608: 101it [00:06, 14.58it/s, env_step=60800, len=100, loss=1.686, n/ep=1, n/st=100, rew=-193.40]                          


Epoch #608: test_reward: -54.190000 ± 27.559951, best_reward: -17.080000 ± 13.172532 in #233


Epoch #609: 101it [00:01, 62.55it/s, env_step=60900, len=100, loss=1.487, n/ep=1, n/st=100, rew=-169.90]                          


Epoch #609: test_reward: -81.460000 ± 45.724090, best_reward: -17.080000 ± 13.172532 in #233


Epoch #610: 101it [00:04, 23.67it/s, env_step=61000, len=100, loss=1.529, n/ep=1, n/st=100, rew=-33.70]                         


Epoch #610: test_reward: -64.530000 ± 33.721656, best_reward: -17.080000 ± 13.172532 in #233


Epoch #611: 101it [00:06, 15.70it/s, env_step=61100, len=100, loss=1.450, n/ep=1, n/st=100, rew=-129.30]                         


Epoch #611: test_reward: -81.650000 ± 32.541243, best_reward: -17.080000 ± 13.172532 in #233


Epoch #612: 101it [00:01, 89.20it/s, env_step=61200, len=100, loss=1.615, n/ep=1, n/st=100, rew=-226.40]                          


Epoch #612: test_reward: -74.090000 ± 32.797544, best_reward: -17.080000 ± 13.172532 in #233


Epoch #613: 101it [00:03, 27.58it/s, env_step=61300, len=100, loss=1.998, n/ep=1, n/st=100, rew=-214.60]                          


Epoch #613: test_reward: -81.550000 ± 46.478043, best_reward: -17.080000 ± 13.172532 in #233


Epoch #614: 101it [00:09, 10.94it/s, env_step=61400, len=100, loss=2.554, n/ep=1, n/st=100, rew=-696.40]                          


Epoch #614: test_reward: -90.560000 ± 29.760988, best_reward: -17.080000 ± 13.172532 in #233


Epoch #615: 101it [00:03, 28.69it/s, env_step=61500, len=100, loss=2.486, n/ep=1, n/st=100, rew=-260.80]                          


Epoch #615: test_reward: -80.790000 ± 39.900813, best_reward: -17.080000 ± 13.172532 in #233


Epoch #616: 101it [00:02, 42.71it/s, env_step=61600, len=100, loss=2.611, n/ep=1, n/st=100, rew=-157.50]                         


Epoch #616: test_reward: -60.660000 ± 23.707897, best_reward: -17.080000 ± 13.172532 in #233


Epoch #617: 101it [00:03, 26.74it/s, env_step=61700, len=100, loss=2.373, n/ep=1, n/st=100, rew=-37.30]                          


Epoch #617: test_reward: -85.710000 ± 41.506637, best_reward: -17.080000 ± 13.172532 in #233


Epoch #618: 101it [00:03, 25.62it/s, env_step=61800, len=100, loss=2.456, n/ep=1, n/st=100, rew=-19.90]                         


Epoch #618: test_reward: -100.150000 ± 45.422005, best_reward: -17.080000 ± 13.172532 in #233


Epoch #619: 101it [00:08, 11.68it/s, env_step=61900, len=100, loss=2.555, n/ep=1, n/st=100, rew=-250.00]                         


Epoch #619: test_reward: -68.820000 ± 27.197162, best_reward: -17.080000 ± 13.172532 in #233


Epoch #620: 101it [00:04, 22.21it/s, env_step=62000, len=100, loss=2.228, n/ep=1, n/st=100, rew=-92.50]                          


Epoch #620: test_reward: -69.520000 ± 31.883281, best_reward: -17.080000 ± 13.172532 in #233


Epoch #621: 101it [00:05, 17.02it/s, env_step=62100, len=100, loss=2.229, n/ep=1, n/st=100, rew=-113.10]                          


Epoch #621: test_reward: -82.750000 ± 34.735407, best_reward: -17.080000 ± 13.172532 in #233


Epoch #622: 101it [00:11,  8.97it/s, env_step=62200, len=100, loss=2.245, n/ep=1, n/st=100, rew=-214.30]                         


Epoch #622: test_reward: -69.100000 ± 37.541923, best_reward: -17.080000 ± 13.172532 in #233


Epoch #623: 101it [00:03, 26.75it/s, env_step=62300, len=100, loss=2.257, n/ep=1, n/st=100, rew=-214.20]                         


Epoch #623: test_reward: -104.380000 ± 32.358053, best_reward: -17.080000 ± 13.172532 in #233


Epoch #624: 101it [00:02, 40.24it/s, env_step=62400, len=100, loss=2.422, n/ep=1, n/st=100, rew=-157.30]                          


Epoch #624: test_reward: -68.600000 ± 33.213732, best_reward: -17.080000 ± 13.172532 in #233


Epoch #625: 101it [00:07, 13.26it/s, env_step=62500, len=100, loss=2.497, n/ep=1, n/st=100, rew=-146.80]                         


Epoch #625: test_reward: -72.210000 ± 34.429506, best_reward: -17.080000 ± 13.172532 in #233


Epoch #626: 101it [00:03, 27.90it/s, env_step=62600, len=100, loss=2.152, n/ep=1, n/st=100, rew=-124.10]                         


Epoch #626: test_reward: -76.490000 ± 29.094895, best_reward: -17.080000 ± 13.172532 in #233


Epoch #627: 101it [00:14,  6.79it/s, env_step=62700, len=100, loss=2.161, n/ep=1, n/st=100, rew=-236.30]                          


Epoch #627: test_reward: -74.390000 ± 38.260148, best_reward: -17.080000 ± 13.172532 in #233


Epoch #628: 101it [00:06, 15.87it/s, env_step=62800, len=100, loss=1.989, n/ep=1, n/st=100, rew=-157.50]                         


Epoch #628: test_reward: -60.920000 ± 14.743188, best_reward: -17.080000 ± 13.172532 in #233


Epoch #629: 101it [00:18,  5.46it/s, env_step=62900, len=100, loss=2.412, n/ep=1, n/st=100, rew=-62.10]                         


Epoch #629: test_reward: -57.590000 ± 22.315710, best_reward: -17.080000 ± 13.172532 in #233


Epoch #630: 101it [00:01, 69.40it/s, env_step=63000, len=100, loss=1.961, n/ep=1, n/st=100, rew=-207.60]                          


Epoch #630: test_reward: -93.010000 ± 49.034507, best_reward: -17.080000 ± 13.172532 in #233


Epoch #631: 101it [00:22,  4.48it/s, env_step=63100, len=100, loss=2.095, n/ep=1, n/st=100, rew=-156.20]                         


Epoch #631: test_reward: -78.470000 ± 40.062103, best_reward: -17.080000 ± 13.172532 in #233


Epoch #632: 101it [00:04, 24.96it/s, env_step=63200, len=100, loss=2.640, n/ep=1, n/st=100, rew=-47.70]                          


Epoch #632: test_reward: -66.660000 ± 30.648204, best_reward: -17.080000 ± 13.172532 in #233


Epoch #633: 101it [00:09, 10.31it/s, env_step=63300, len=100, loss=2.508, n/ep=1, n/st=100, rew=-221.00]                         


Epoch #633: test_reward: -80.120000 ± 42.557744, best_reward: -17.080000 ± 13.172532 in #233


Epoch #634: 101it [00:01, 51.18it/s, env_step=63400, len=100, loss=2.109, n/ep=1, n/st=100, rew=-295.60]                         


Epoch #634: test_reward: -78.280000 ± 50.361013, best_reward: -17.080000 ± 13.172532 in #233


Epoch #635: 101it [00:15,  6.69it/s, env_step=63500, len=100, loss=2.260, n/ep=1, n/st=100, rew=-93.60]                         


Epoch #635: test_reward: -46.830000 ± 23.652867, best_reward: -17.080000 ± 13.172532 in #233


Epoch #636: 101it [00:01, 69.03it/s, env_step=63600, len=100, loss=2.383, n/ep=1, n/st=100, rew=-249.90]                          


Epoch #636: test_reward: -72.320000 ± 36.122674, best_reward: -17.080000 ± 13.172532 in #233


Epoch #637: 101it [00:15,  6.60it/s, env_step=63700, len=100, loss=2.204, n/ep=1, n/st=100, rew=-131.70]                         


Epoch #637: test_reward: -64.500000 ± 40.284141, best_reward: -17.080000 ± 13.172532 in #233


Epoch #638: 101it [00:04, 25.10it/s, env_step=63800, len=100, loss=2.106, n/ep=1, n/st=100, rew=-178.80]                          


Epoch #638: test_reward: -88.440000 ± 38.650542, best_reward: -17.080000 ± 13.172532 in #233


Epoch #639: 101it [00:12,  8.41it/s, env_step=63900, len=100, loss=2.220, n/ep=1, n/st=100, rew=-194.60]                          


Epoch #639: test_reward: -78.330000 ± 41.486915, best_reward: -17.080000 ± 13.172532 in #233


Epoch #640: 101it [00:02, 41.66it/s, env_step=64000, len=100, loss=2.077, n/ep=1, n/st=100, rew=-136.70]                          


Epoch #640: test_reward: -50.940000 ± 24.485759, best_reward: -17.080000 ± 13.172532 in #233


Epoch #641: 101it [00:03, 28.70it/s, env_step=64100, len=100, loss=2.196, n/ep=1, n/st=100, rew=-82.70]                          


Epoch #641: test_reward: -43.620000 ± 27.162724, best_reward: -17.080000 ± 13.172532 in #233


Epoch #642: 101it [00:06, 16.34it/s, env_step=64200, len=100, loss=2.501, n/ep=1, n/st=100, rew=-73.60]                          


Epoch #642: test_reward: -43.020000 ± 26.093057, best_reward: -17.080000 ± 13.172532 in #233


Epoch #643: 101it [00:07, 14.40it/s, env_step=64300, len=100, loss=2.407, n/ep=1, n/st=100, rew=-247.80]                          


Epoch #643: test_reward: -31.610000 ± 19.234472, best_reward: -17.080000 ± 13.172532 in #233


Epoch #644: 101it [00:09, 10.98it/s, env_step=64400, len=100, loss=2.179, n/ep=1, n/st=100, rew=-139.30]                         


Epoch #644: test_reward: -56.440000 ± 31.936850, best_reward: -17.080000 ± 13.172532 in #233


Epoch #645: 101it [00:01, 83.83it/s, env_step=64500, len=100, loss=2.293, n/ep=1, n/st=100, rew=-203.10]                          


Epoch #645: test_reward: -58.560000 ± 32.501268, best_reward: -17.080000 ± 13.172532 in #233


Epoch #646: 101it [00:09, 10.81it/s, env_step=64600, len=100, loss=2.045, n/ep=1, n/st=100, rew=-116.00]                         


Epoch #646: test_reward: -41.540000 ± 30.558344, best_reward: -17.080000 ± 13.172532 in #233


Epoch #647: 101it [00:06, 15.74it/s, env_step=64700, len=100, loss=1.968, n/ep=1, n/st=100, rew=-192.90]                         


Epoch #647: test_reward: -42.720000 ± 26.823900, best_reward: -17.080000 ± 13.172532 in #233


Epoch #648: 101it [00:16,  6.23it/s, env_step=64800, len=100, loss=2.037, n/ep=1, n/st=100, rew=-229.00]                         


Epoch #648: test_reward: -53.850000 ± 25.087656, best_reward: -17.080000 ± 13.172532 in #233


Epoch #649: 101it [00:03, 29.19it/s, env_step=64900, len=100, loss=2.387, n/ep=1, n/st=100, rew=-220.70]                          


Epoch #649: test_reward: -42.400000 ± 35.252319, best_reward: -17.080000 ± 13.172532 in #233


Epoch #650: 101it [00:20,  4.83it/s, env_step=65000, len=100, loss=1.837, n/ep=1, n/st=100, rew=-246.90]                          


Epoch #650: test_reward: -36.440000 ± 16.922246, best_reward: -17.080000 ± 13.172532 in #233


Epoch #651: 101it [00:01, 64.43it/s, env_step=65100, len=100, loss=1.974, n/ep=1, n/st=100, rew=-30.60]                          


Epoch #651: test_reward: -45.350000 ± 26.211534, best_reward: -17.080000 ± 13.172532 in #233


Epoch #652: 101it [00:08, 11.49it/s, env_step=65200, len=100, loss=2.120, n/ep=1, n/st=100, rew=-188.80]                         


Epoch #652: test_reward: -49.720000 ± 25.421715, best_reward: -17.080000 ± 13.172532 in #233


Epoch #653: 101it [00:05, 19.33it/s, env_step=65300, len=100, loss=2.175, n/ep=1, n/st=100, rew=-131.60]                          


Epoch #653: test_reward: -53.510000 ± 41.817376, best_reward: -17.080000 ± 13.172532 in #233


Epoch #654: 101it [00:16,  5.94it/s, env_step=65400, len=100, loss=1.970, n/ep=1, n/st=100, rew=-71.10]                         


Epoch #654: test_reward: -44.390000 ± 40.729902, best_reward: -17.080000 ± 13.172532 in #233


Epoch #655: 101it [00:01, 80.43it/s, env_step=65500, len=100, loss=2.237, n/ep=1, n/st=100, rew=-168.60]                          


Epoch #655: test_reward: -45.870000 ± 26.235588, best_reward: -17.080000 ± 13.172532 in #233


Epoch #656: 101it [00:08, 11.57it/s, env_step=65600, len=100, loss=2.024, n/ep=1, n/st=100, rew=-39.50]                          


Epoch #656: test_reward: -39.450000 ± 26.617823, best_reward: -17.080000 ± 13.172532 in #233


Epoch #657: 101it [00:02, 37.58it/s, env_step=65700, len=100, loss=1.635, n/ep=1, n/st=100, rew=-199.60]                         


Epoch #657: test_reward: -52.180000 ± 21.898530, best_reward: -17.080000 ± 13.172532 in #233


Epoch #658: 101it [00:04, 22.72it/s, env_step=65800, len=100, loss=1.824, n/ep=1, n/st=100, rew=-147.50]                         


Epoch #658: test_reward: -57.260000 ± 26.493780, best_reward: -17.080000 ± 13.172532 in #233


Epoch #659: 101it [00:13,  7.76it/s, env_step=65900, len=100, loss=1.842, n/ep=1, n/st=100, rew=-259.20]                         


Epoch #659: test_reward: -72.300000 ± 29.379278, best_reward: -17.080000 ± 13.172532 in #233


Epoch #660: 101it [00:01, 79.38it/s, env_step=66000, len=100, loss=1.750, n/ep=1, n/st=100, rew=-58.80]                          


Epoch #660: test_reward: -52.040000 ± 28.260828, best_reward: -17.080000 ± 13.172532 in #233


Epoch #661: 101it [00:07, 12.78it/s, env_step=66100, len=100, loss=2.137, n/ep=1, n/st=100, rew=-91.50]                         


Epoch #661: test_reward: -59.760000 ± 38.429759, best_reward: -17.080000 ± 13.172532 in #233


Epoch #662: 101it [00:06, 16.23it/s, env_step=66200, len=100, loss=1.936, n/ep=1, n/st=100, rew=-90.30]                          


Epoch #662: test_reward: -52.100000 ± 28.008035, best_reward: -17.080000 ± 13.172532 in #233


Epoch #663: 101it [00:13,  7.23it/s, env_step=66300, len=100, loss=1.860, n/ep=1, n/st=100, rew=-274.60]                         


Epoch #663: test_reward: -48.710000 ± 23.054347, best_reward: -17.080000 ± 13.172532 in #233


Epoch #664: 101it [00:03, 29.87it/s, env_step=66400, len=100, loss=1.779, n/ep=1, n/st=100, rew=-142.40]                         


Epoch #664: test_reward: -52.750000 ± 27.456666, best_reward: -17.080000 ± 13.172532 in #233


Epoch #665: 101it [00:16,  6.17it/s, env_step=66500, len=100, loss=1.354, n/ep=1, n/st=100, rew=-37.60]                         


Epoch #665: test_reward: -65.560000 ± 48.507303, best_reward: -17.080000 ± 13.172532 in #233


Epoch #666: 101it [00:05, 17.56it/s, env_step=66600, len=100, loss=1.650, n/ep=1, n/st=100, rew=-179.40]                          


Epoch #666: test_reward: -57.430000 ± 29.769651, best_reward: -17.080000 ± 13.172532 in #233


Epoch #667: 101it [00:09, 10.12it/s, env_step=66700, len=100, loss=1.644, n/ep=1, n/st=100, rew=-205.00]                          


Epoch #667: test_reward: -54.610000 ± 21.461801, best_reward: -17.080000 ± 13.172532 in #233


Epoch #668: 101it [00:03, 29.49it/s, env_step=66800, len=100, loss=1.509, n/ep=1, n/st=100, rew=-321.50]                          


Epoch #668: test_reward: -69.900000 ± 29.560244, best_reward: -17.080000 ± 13.172532 in #233


Epoch #669: 101it [00:06, 14.95it/s, env_step=66900, len=100, loss=1.516, n/ep=1, n/st=100, rew=-197.80]                         


Epoch #669: test_reward: -76.510000 ± 40.050629, best_reward: -17.080000 ± 13.172532 in #233


Epoch #670: 101it [00:04, 23.92it/s, env_step=67000, len=100, loss=1.567, n/ep=1, n/st=100, rew=-238.20]                          


Epoch #670: test_reward: -56.840000 ± 36.357013, best_reward: -17.080000 ± 13.172532 in #233


Epoch #671: 101it [00:03, 25.36it/s, env_step=67100, len=100, loss=1.836, n/ep=1, n/st=100, rew=-155.20]                          


Epoch #671: test_reward: -48.900000 ± 23.758619, best_reward: -17.080000 ± 13.172532 in #233


Epoch #672: 101it [00:14,  6.87it/s, env_step=67200, len=100, loss=1.553, n/ep=1, n/st=100, rew=-157.60]                         


Epoch #672: test_reward: -75.930000 ± 38.777237, best_reward: -17.080000 ± 13.172532 in #233


Epoch #673: 101it [00:05, 17.69it/s, env_step=67300, len=100, loss=1.453, n/ep=1, n/st=100, rew=-163.00]                         


Epoch #673: test_reward: -63.900000 ± 34.448657, best_reward: -17.080000 ± 13.172532 in #233


Epoch #674: 101it [00:05, 20.09it/s, env_step=67400, len=100, loss=1.527, n/ep=1, n/st=100, rew=-181.30]                          


Epoch #674: test_reward: -66.610000 ± 24.681307, best_reward: -17.080000 ± 13.172532 in #233


Epoch #675: 101it [00:02, 34.48it/s, env_step=67500, len=100, loss=1.553, n/ep=1, n/st=100, rew=-403.30]                         


Epoch #675: test_reward: -50.560000 ± 35.444695, best_reward: -17.080000 ± 13.172532 in #233


Epoch #676: 101it [00:06, 16.61it/s, env_step=67600, len=100, loss=1.880, n/ep=1, n/st=100, rew=-105.60]                         


Epoch #676: test_reward: -59.330000 ± 28.614683, best_reward: -17.080000 ± 13.172532 in #233


Epoch #677: 101it [00:08, 11.70it/s, env_step=67700, len=100, loss=1.994, n/ep=1, n/st=100, rew=-172.70]                         


Epoch #677: test_reward: -56.280000 ± 31.809929, best_reward: -17.080000 ± 13.172532 in #233


Epoch #678: 101it [00:01, 73.81it/s, env_step=67800, len=100, loss=1.705, n/ep=1, n/st=100, rew=-142.10]                          


Epoch #678: test_reward: -41.860000 ± 26.673927, best_reward: -17.080000 ± 13.172532 in #233


Epoch #679: 101it [00:12,  8.22it/s, env_step=67900, len=100, loss=2.046, n/ep=1, n/st=100, rew=-398.00]                         


Epoch #679: test_reward: -59.970000 ± 31.873627, best_reward: -17.080000 ± 13.172532 in #233


Epoch #680: 101it [00:03, 28.40it/s, env_step=68000, len=100, loss=1.866, n/ep=1, n/st=100, rew=-243.00]                         


Epoch #680: test_reward: -70.070000 ± 57.626661, best_reward: -17.080000 ± 13.172532 in #233


Epoch #681: 101it [00:07, 12.72it/s, env_step=68100, len=100, loss=2.146, n/ep=1, n/st=100, rew=-271.90]                         


Epoch #681: test_reward: -57.980000 ± 49.759759, best_reward: -17.080000 ± 13.172532 in #233


Epoch #682: 101it [00:02, 44.68it/s, env_step=68200, len=100, loss=1.937, n/ep=1, n/st=100, rew=-30.20]                          


Epoch #682: test_reward: -79.780000 ± 35.447646, best_reward: -17.080000 ± 13.172532 in #233


Epoch #683: 101it [00:06, 15.34it/s, env_step=68300, len=100, loss=1.665, n/ep=1, n/st=100, rew=-158.40]                         


Epoch #683: test_reward: -58.370000 ± 33.945193, best_reward: -17.080000 ± 13.172532 in #233


Epoch #684: 101it [00:04, 20.49it/s, env_step=68400, len=100, loss=1.821, n/ep=1, n/st=100, rew=-261.60]                         


Epoch #684: test_reward: -67.440000 ± 34.720346, best_reward: -17.080000 ± 13.172532 in #233


Epoch #685: 101it [00:04, 23.05it/s, env_step=68500, len=100, loss=1.523, n/ep=1, n/st=100, rew=-102.20]                          


Epoch #685: test_reward: -44.240000 ± 23.945613, best_reward: -17.080000 ± 13.172532 in #233


Epoch #686: 101it [00:12,  8.00it/s, env_step=68600, len=100, loss=1.835, n/ep=1, n/st=100, rew=-76.60]                         


Epoch #686: test_reward: -58.670000 ± 31.290320, best_reward: -17.080000 ± 13.172532 in #233


Epoch #687: 101it [00:01, 54.49it/s, env_step=68700, len=100, loss=1.814, n/ep=1, n/st=100, rew=-84.90]                         


Epoch #687: test_reward: -61.740000 ± 32.093993, best_reward: -17.080000 ± 13.172532 in #233


Epoch #688: 101it [00:14,  7.02it/s, env_step=68800, len=100, loss=1.745, n/ep=1, n/st=100, rew=-231.00]                         


Epoch #688: test_reward: -66.530000 ± 23.760431, best_reward: -17.080000 ± 13.172532 in #233


Epoch #689: 101it [00:01, 84.52it/s, env_step=68900, len=100, loss=1.895, n/ep=1, n/st=100, rew=-175.10]                          


Epoch #689: test_reward: -53.500000 ± 21.671179, best_reward: -17.080000 ± 13.172532 in #233


Epoch #690: 101it [00:04, 23.37it/s, env_step=69000, len=100, loss=1.724, n/ep=1, n/st=100, rew=-174.20]                         


Epoch #690: test_reward: -46.320000 ± 42.716690, best_reward: -17.080000 ± 13.172532 in #233


Epoch #691: 101it [00:17,  5.65it/s, env_step=69100, len=100, loss=1.728, n/ep=1, n/st=100, rew=-74.30]                         


Epoch #691: test_reward: -48.220000 ± 25.841238, best_reward: -17.080000 ± 13.172532 in #233


Epoch #692: 101it [00:07, 13.72it/s, env_step=69200, len=100, loss=1.643, n/ep=1, n/st=100, rew=-28.80]                         


Epoch #692: test_reward: -42.560000 ± 32.818781, best_reward: -17.080000 ± 13.172532 in #233


Epoch #693: 101it [00:12,  8.39it/s, env_step=69300, len=100, loss=1.533, n/ep=1, n/st=100, rew=-46.60]                         


Epoch #693: test_reward: -55.090000 ± 25.560143, best_reward: -17.080000 ± 13.172532 in #233


Epoch #694: 101it [00:05, 17.32it/s, env_step=69400, len=100, loss=1.710, n/ep=1, n/st=100, rew=-209.00]                          


Epoch #694: test_reward: -74.920000 ± 46.813840, best_reward: -17.080000 ± 13.172532 in #233


Epoch #695: 101it [00:09, 10.94it/s, env_step=69500, len=100, loss=1.714, n/ep=1, n/st=100, rew=-179.90]                         


Epoch #695: test_reward: -40.830000 ± 17.179351, best_reward: -17.080000 ± 13.172532 in #233


Epoch #696: 101it [00:07, 12.77it/s, env_step=69600, len=100, loss=1.574, n/ep=1, n/st=100, rew=-108.60]                          


Epoch #696: test_reward: -72.160000 ± 39.153370, best_reward: -17.080000 ± 13.172532 in #233


Epoch #697: 101it [00:02, 45.80it/s, env_step=69700, len=100, loss=1.620, n/ep=1, n/st=100, rew=-89.40]                         


Epoch #697: test_reward: -62.870000 ± 34.113547, best_reward: -17.080000 ± 13.172532 in #233


Epoch #698: 101it [00:07, 12.77it/s, env_step=69800, len=100, loss=1.757, n/ep=1, n/st=100, rew=-164.70]                         


Epoch #698: test_reward: -44.520000 ± 24.029890, best_reward: -17.080000 ± 13.172532 in #233


Epoch #699: 101it [00:03, 33.32it/s, env_step=69900, len=100, loss=1.862, n/ep=1, n/st=100, rew=-52.90]                         


Epoch #699: test_reward: -78.870000 ± 47.253996, best_reward: -17.080000 ± 13.172532 in #233


Epoch #700: 101it [00:06, 15.09it/s, env_step=70000, len=100, loss=1.718, n/ep=1, n/st=100, rew=-176.30]                         


Epoch #700: test_reward: -54.950000 ± 30.088943, best_reward: -17.080000 ± 13.172532 in #233


Epoch #701: 101it [00:08, 11.33it/s, env_step=70100, len=100, loss=1.831, n/ep=1, n/st=100, rew=-32.40]                         


Epoch #701: test_reward: -57.830000 ± 43.036126, best_reward: -17.080000 ± 13.172532 in #233


Epoch #702: 101it [00:08, 12.22it/s, env_step=70200, len=100, loss=1.851, n/ep=1, n/st=100, rew=-186.40]                         


Epoch #702: test_reward: -82.940000 ± 40.727882, best_reward: -17.080000 ± 13.172532 in #233


Epoch #703: 101it [00:07, 13.32it/s, env_step=70300, len=100, loss=1.634, n/ep=1, n/st=100, rew=-71.40]                         


Epoch #703: test_reward: -50.420000 ± 20.170612, best_reward: -17.080000 ± 13.172532 in #233


Epoch #704: 101it [00:02, 44.46it/s, env_step=70400, len=100, loss=1.409, n/ep=1, n/st=100, rew=-58.30]                          


Epoch #704: test_reward: -70.700000 ± 34.451909, best_reward: -17.080000 ± 13.172532 in #233


Epoch #705: 101it [00:15,  6.53it/s, env_step=70500, len=100, loss=1.704, n/ep=1, n/st=100, rew=-70.00]                         


Epoch #705: test_reward: -57.700000 ± 30.172206, best_reward: -17.080000 ± 13.172532 in #233


Epoch #706: 101it [00:02, 35.27it/s, env_step=70600, len=100, loss=1.626, n/ep=1, n/st=100, rew=-37.80]                         

libgomp: Thread creation failed: Resource temporarily unavailable
