In [1]:
import gymnasium as gym
from stk_actor.wrappers import StuckStopWrapper
import torch
import torch.nn.functional as F

path = 'stk_actor/trained_agents/'
agent = 'normed_a2c_num5_best'

class PreprocessObservationWrapper(gym.ObservationWrapper):
    def __init__(self, env):
        """
        A Gym wrapper to preprocess mixed observation space (continuous + discrete)
        into a flat tensor.
        
        Args:
            env: The Gym environment to wrap.
        """
        super().__init__(env)
        self.observation_space = self._get_flat_observation_space(env.observation_space)
        self.mean = torch.load(path+f'{agent}/buffer_mean', map_location='cpu')
        self.std = torch.load(path+f'{agent}/buffer_std', map_location='cpu')

    def _get_flat_observation_space(self, observation_space):
        """
        Create a flat observation space based on the original observation space.
        
        Args:
            observation_space: Original observation space with 'continuous' and 'discrete' components.
        
        Returns:
            A flattened observation space.
        """
        continuous_dim = observation_space['continuous'].shape[0]
        discrete_dims = sum(space.n for space in observation_space['discrete'])
        flat_dim = continuous_dim + discrete_dims
        return gym.spaces.Box(low=-float('inf'), high=float('inf'), shape=(flat_dim,), dtype=float)

    def observation(self, obs):
        """
        Process the observation into a flat tensor.
        
        Args:
            obs: The raw observation from the environment.
        
        Returns:
            A preprocessed flat tensor.
        """
        continuous_obs, discrete_obs = obs['continuous'], obs['discrete']
        continuous_tensor = torch.FloatTensor(continuous_obs)
        
        discrete_tensors = [
            F.one_hot(torch.tensor(x), num_classes=num_classes.n).float()
            for x, num_classes in zip(discrete_obs, self.env.observation_space['discrete'])
        ]
        
        flat_tensor = torch.cat([continuous_tensor] + discrete_tensors)
        normed_flat_tensor = (flat_tensor - self.mean) / (self.std + 1e-8)
        return normed_flat_tensor

import gymnasium as gym
from gymnasium import Wrapper

class SkipFirstNStepsWrapper(Wrapper):
    def __init__(self, env, n):
        super().__init__(env)
        self.n = n

    def reset(self, **kwargs):
        # Reset the environment
        obs, info = self.env.reset(**kwargs)
        # Skip the first n steps
        for _ in range(self.n):
            obs, _, done, truncated, info = self.env.step(self.env.action_space.sample())
            if done or truncated:
                obs, info = self.env.reset(**kwargs)
        return obs, info

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.distributions import Categorical
from typing import Dict, List, Tuple, Union, Type
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.distributions import Categorical

import gymnasium as gym
from gymnasium import spaces

def get_device(device: Union[torch.device, str] = "auto") -> torch.device:
    if device == "auto":
        device = "cuda"
    device = torch.device(device)
    if device.type == torch.device("cuda").type and not torch.cuda.is_available():
        return torch.device("cpu")
    return device

class BaseFeaturesExtractor(nn.Module):
    def __init__(self, observation_space: gym.Space, features_dim: int = 0) -> None:
        super().__init__()
        assert features_dim > 0
        self._observation_space = observation_space
        self._features_dim = features_dim
    @property
    def features_dim(self) -> int:
        return self._features_dim

def get_flattened_obs_dim(observation_space: spaces.Space) -> int:
    if isinstance(observation_space, spaces.MultiDiscrete):
        return sum(observation_space.nvec)
    else:
        return spaces.utils.flatdim(observation_space)

class FlattenExtractor(BaseFeaturesExtractor):
    def __init__(self, observation_space: gym.Space) -> None:
        super().__init__(observation_space, get_flattened_obs_dim(observation_space))
        self.flatten = nn.Flatten()
    def forward(self, observations: torch.Tensor) -> torch.Tensor:
        return self.flatten(observations)
    
class MlpExtractor(nn.Module):
    def __init__(
        self,
        feature_dim: int,
        net_arch: Union[List[int], Dict[str, List[int]]],
        activation_fn: Type[nn.Module],
        device: Union[torch.device, str] = "auto",
    ) -> None:
        super().__init__()
        # device = torch.get_device(device)
        policy_net: List[nn.Module] = []
        value_net: List[nn.Module] = []
        last_layer_dim_pi = feature_dim
        last_layer_dim_vf = feature_dim

        if isinstance(net_arch, dict):
            pi_layers_dims = net_arch.get("pi", []) 
            vf_layers_dims = net_arch.get("vf", []) 
        else:
            pi_layers_dims = vf_layers_dims = net_arch
        for curr_layer_dim in pi_layers_dims:
            policy_net.append(nn.Linear(last_layer_dim_pi, curr_layer_dim))
            policy_net.append(activation_fn())
            last_layer_dim_pi = curr_layer_dim
        for curr_layer_dim in vf_layers_dims:
            value_net.append(nn.Linear(last_layer_dim_vf, curr_layer_dim))
            value_net.append(activation_fn())
            last_layer_dim_vf = curr_layer_dim

        self.latent_dim_pi = last_layer_dim_pi
        self.latent_dim_vf = last_layer_dim_vf
        self.policy_net = nn.Sequential(*policy_net)#.to(device)
        self.value_net = nn.Sequential(*value_net)#.to(device)

    def forward(self, features: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
        """
        :return: latent_policy, latent_value of the specified network.
            If all layers are shared, then ``latent_policy == latent_value``
        """
        return self.forward_actor(features), self.forward_critic(features)

    def forward_actor(self, features: torch.Tensor) -> torch.Tensor:
        return self.policy_net(features)

    def forward_critic(self, features: torch.Tensor) -> torch.Tensor:
        return self.value_net(features)

    
class Policy(nn.Module):
    def __init__(self, observation_space, action_dims, net_arch, activation_fn,):
        super().__init__()
        self.features_extractor = FlattenExtractor(observation_space)
        self.pi_features_extractor = self.features_extractor
        self.vf_features_extractor = self.features_extractor
        self.mlp_extractor = MlpExtractor(
            self.features_extractor.features_dim,
            net_arch=net_arch,
            activation_fn=activation_fn,
        )
        self.action_net = nn.Linear(net_arch[-1], sum(action_dims))
        self.value_net = nn.Linear(net_arch[-1], 1)


class UnifiedSACPolicy(nn.Module):
    def __init__(self, observation_space, action_dims, net_arch, activation_fn):
        super().__init__()
        
        self.shared = Policy(
            observation_space,
            action_dims,
            net_arch=net_arch,
            activation_fn=activation_fn
        )
        self.action_dims = action_dims
    
    def forward(self, x):
        x = self.shared.features_extractor(x)
        x = self.shared.mlp_extractor.policy_net(x)
        x = self.shared.action_net(x)
        return x
    
    def sample(self, x, deterministic=False):
        logits = self.forward(x)
        
        # Split logits for each action dimension
        split_logits = torch.split(logits, self.action_dims, dim=-1)
        
        actions = []
        log_probs = []
        probs = []
        
        for logit in split_logits:
            distribution = Categorical(logits=logit)
            if deterministic:
                action = torch.argmax(logit, dim=-1)
            else:
                action = distribution.sample()
            
            log_prob = distribution.log_prob(action)
            prob = F.softmax(logit, dim=-1)
            
            actions.append(action)
            log_probs.append(log_prob)
            probs.append(prob)
        
        return (
            torch.stack(actions),
            torch.stack(log_probs),
            probs
        )
    
#policy = torch.load('policy_512_512_512_512_SiLU_3_statedict', map_location='cuda')


from stable_baselines3 import PPO, A2C
from stable_baselines3.common.env_util import make_vec_env
import gymnasium as gym
from pystk2_gymnasium import AgentSpec
from bbrl.agents.gymnasium import ParallelGymAgent, make_env
from functools import partial

tracks = [
    'abyss',
    'black_forest',
    'candela_city',
    'cocoa_temple',
    'cornfield_crossing',
    'fortmagma',
    'gran_paradiso_island',
    'hacienda',
    'lighthouse',
    'mines',
    'minigolf',
    'olivermath',
    'ravenbridge_mansion',
    'sandtrack',
    'scotland',
    'snowmountain',
    'snowtuxpeak',
    'stk_enterprise',
    'volcano_island',
    'xr591',
    'zengarden',

# # #   ==================   #

#     'fortmagma',
#     'ravenbridge_mansion',
#     'snowmountain',
#     'cocoa_temple',
#     'sandtrack',    
#     'scotland', 
#     'stk_enterprise',
#     'volcano_island', # 1104
#     'xr591', # 864   
]


karts = [12]
n_envs = len(tracks)*len(karts)

print('making', n_envs, 'environments')
vec_env = make_vec_env(
    "supertuxkart/flattened_multidiscrete-v0",
    # seed=12,
    n_envs=n_envs, 
    wrapper_class=lambda x : (
        SkipFirstNStepsWrapper(
            StuckStopWrapper(
                PreprocessObservationWrapper(x),
                n=92,
            ), 
            n=20,
        )
    ), 
    env_kwargs={
    'render_mode':None, 'agent':AgentSpec(use_ai=False, name="walid"), #'track':'minigolf', 
    'laps':1,
    'difficulty':2, 
    'num_kart':12, #'difficulty':0
})

ix = 0
for num_kart in enumerate(karts):
    for track in enumerate(tracks):
        venv = vec_env.envs[ix]
        venv.env.default_track = track
        venv.env.num_kart = num_kart
        print(ix, track, )
        ix+=1

net_arch=[1024,1024,1024]
activation_fn=torch.nn.Tanh
filename = path+f'{agent}/statedict'

action_dims = [space.n for space in vec_env.action_space]
unified_policy = UnifiedSACPolicy(
    vec_env.observation_space, 
    action_dims, 
    net_arch=net_arch, 
    activation_fn=activation_fn
)
unified_policy.load_state_dict(torch.load(filename, map_location='cpu'))


making 21 environments
..:: Antarctica Rendering Engine 2.0 ::..


  self.mean = torch.load(path+f'{agent}/buffer_mean', map_location='cpu')
  self.std = torch.load(path+f'{agent}/buffer_std', map_location='cpu')


0 (0, 'abyss')
1 (1, 'black_forest')
2 (2, 'candela_city')
3 (3, 'cocoa_temple')
4 (4, 'cornfield_crossing')
5 (5, 'fortmagma')
6 (6, 'gran_paradiso_island')
7 (7, 'hacienda')
8 (8, 'lighthouse')
9 (9, 'mines')
10 (10, 'minigolf')
11 (11, 'olivermath')
12 (12, 'ravenbridge_mansion')
13 (13, 'sandtrack')
14 (14, 'scotland')
15 (15, 'snowmountain')
16 (16, 'snowtuxpeak')
17 (17, 'stk_enterprise')
18 (18, 'volcano_island')
19 (19, 'xr591')
20 (20, 'zengarden')


  unified_policy.load_state_dict(torch.load(filename, map_location='cpu'))


<All keys matched successfully>

In [2]:
from stable_baselines3 import A2C

steps = [(
    # 1024*8,
    20*n_envs,
    300_000,
)]
for n_steps, total_timesteps in steps:
    # model = PPO(
    #     "MlpPolicy", 
    #     vec_env, 
    #     verbose=1, 
    #     policy_kwargs = dict(net_arch=net_arch, activation_fn=activation_fn,),
    #     device='cpu',
    #     learning_rate=0.0001,
    #     batch_size=128,
    #     n_epochs=100,
    #     n_steps=n_steps,
    #     tensorboard_log="./outputs/",
    #     # ent_coef=0.001,
    #     clip_range=0.2,
    # )
    model = A2C(
        "MlpPolicy", 
        vec_env, 
        verbose=1, 
        policy_kwargs = dict(net_arch=net_arch, activation_fn=activation_fn,),
        device='cpu',
        learning_rate=0.0006,
        n_steps=n_steps,
        tensorboard_log="./outputs/",
        use_rms_prop=False,
        normalize_advantage=True,
    )
    print('DOING', n_steps, total_timesteps)
    model.policy.load_state_dict(unified_policy.shared.state_dict())
    model.learn(total_timesteps=total_timesteps, progress_bar=True, log_interval=1)

        


Using cpu device
..:: Antarctica Rendering Engine 2.0 ::..
..:: Antarctica Rendering Engine 2.0 ::..
..:: Antarctica Rendering Engine 2.0 ::..
..:: Antarctica Rendering Engine 2.0 ::..
..:: Antarctica Rendering Engine 2.0 ::..
..:: Antarctica Rendering Engine 2.0 ::..
..:: Antarctica Rendering Engine 2.0 ::..
..:: Antarctica Rendering Engine 2.0 ::..
..:: Antarctica Rendering Engine 2.0 ::..
..:: Antarctica Rendering Engine 2.0 ::..
..:: Antarctica Rendering Engine 2.0 ::..
..:: Antarctica Rendering Engine 2.0 ::..
..:: Antarctica Rendering Engine 2.0 ::..
..:: Antarctica Rendering Engine 2.0 ::..
..:: Antarctica Rendering Engine 2.0 ::..
..:: Antarctica Rendering Engine 2.0 ::..
..:: Antarctica Rendering Engine 2.0 ::..
..:: Antarctica Rendering Engine 2.0 ::..
..:: Antarctica Rendering Engine 2.0 ::..
..:: Antarctica Rendering Engine 2.0 ::..
DOING 420 300000
Logging to ./outputs/A2C_12


Output()

---------------------------------
| rollout/           |          |
|    ep_len_mean     | 348      |
|    ep_rew_mean     | 458      |
| time/              |          |
|    fps             | 56       |
|    iterations      | 1        |
|    time_elapsed    | 154      |
|    total_timesteps | 8820     |
---------------------------------


------------------------------------
| rollout/              |          |
|    ep_len_mean        | 502      |
|    ep_rew_mean        | 520      |
| time/                 |          |
|    fps                | 54       |
|    iterations         | 2        |
|    time_elapsed       | 322      |
|    total_timesteps    | 17640    |
| train/                |          |
|    entropy_loss       | -0.222   |
|    explained_variance | 0.14     |
|    learning_rate      | 0.0006   |
|    n_updates          | 1        |
|    policy_loss        | -0.0102  |
|    value_loss         | 2.53e+03 |
------------------------------------


------------------------------------
| rollout/              |          |
|    ep_len_mean        | 535      |
|    ep_rew_mean        | 499      |
| time/                 |          |
|    fps                | 51       |
|    iterations         | 3        |
|    time_elapsed       | 517      |
|    total_timesteps    | 26460    |
| train/                |          |
|    entropy_loss       | -0.244   |
|    explained_variance | 0.264    |
|    learning_rate      | 0.0006   |
|    n_updates          | 2        |
|    policy_loss        | -0.0199  |
|    value_loss         | 1.95e+03 |
------------------------------------


------------------------------------
| rollout/              |          |
|    ep_len_mean        | 576      |
|    ep_rew_mean        | 508      |
| time/                 |          |
|    fps                | 51       |
|    iterations         | 4        |
|    time_elapsed       | 685      |
|    total_timesteps    | 35280    |
| train/                |          |
|    entropy_loss       | -0.241   |
|    explained_variance | 0.372    |
|    learning_rate      | 0.0006   |
|    n_updates          | 3        |
|    policy_loss        | 0.000122 |
|    value_loss         | 1.51e+03 |
------------------------------------


------------------------------------
| rollout/              |          |
|    ep_len_mean        | 584      |
|    ep_rew_mean        | 516      |
| time/                 |          |
|    fps                | 51       |
|    iterations         | 5        |
|    time_elapsed       | 851      |
|    total_timesteps    | 44100    |
| train/                |          |
|    entropy_loss       | -0.237   |
|    explained_variance | 0.368    |
|    learning_rate      | 0.0006   |
|    n_updates          | 4        |
|    policy_loss        | 0.00375  |
|    value_loss         | 1.55e+03 |
------------------------------------


------------------------------------
| rollout/              |          |
|    ep_len_mean        | 582      |
|    ep_rew_mean        | 507      |
| time/                 |          |
|    fps                | 52       |
|    iterations         | 6        |
|    time_elapsed       | 1001     |
|    total_timesteps    | 52920    |
| train/                |          |
|    entropy_loss       | -0.228   |
|    explained_variance | 0.245    |
|    learning_rate      | 0.0006   |
|    n_updates          | 5        |
|    policy_loss        | -0.01    |
|    value_loss         | 1.48e+03 |
------------------------------------


------------------------------------
| rollout/              |          |
|    ep_len_mean        | 588      |
|    ep_rew_mean        | 499      |
| time/                 |          |
|    fps                | 52       |
|    iterations         | 7        |
|    time_elapsed       | 1171     |
|    total_timesteps    | 61740    |
| train/                |          |
|    entropy_loss       | -0.228   |
|    explained_variance | 0.183    |
|    learning_rate      | 0.0006   |
|    n_updates          | 6        |
|    policy_loss        | -0.0222  |
|    value_loss         | 2.08e+03 |
------------------------------------


------------------------------------
| rollout/              |          |
|    ep_len_mean        | 593      |
|    ep_rew_mean        | 488      |
| time/                 |          |
|    fps                | 52       |
|    iterations         | 8        |
|    time_elapsed       | 1341     |
|    total_timesteps    | 70560    |
| train/                |          |
|    entropy_loss       | -0.254   |
|    explained_variance | 0.209    |
|    learning_rate      | 0.0006   |
|    n_updates          | 7        |
|    policy_loss        | 0.0029   |
|    value_loss         | 1.15e+03 |
------------------------------------


------------------------------------
| rollout/              |          |
|    ep_len_mean        | 597      |
|    ep_rew_mean        | 489      |
| time/                 |          |
|    fps                | 51       |
|    iterations         | 9        |
|    time_elapsed       | 1526     |
|    total_timesteps    | 79380    |
| train/                |          |
|    entropy_loss       | -0.237   |
|    explained_variance | 0.315    |
|    learning_rate      | 0.0006   |
|    n_updates          | 8        |
|    policy_loss        | -0.00889 |
|    value_loss         | 1.38e+03 |
------------------------------------


-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 608       |
|    ep_rew_mean        | 482       |
| time/                 |           |
|    fps                | 52        |
|    iterations         | 10        |
|    time_elapsed       | 1694      |
|    total_timesteps    | 88200     |
| train/                |           |
|    entropy_loss       | -0.242    |
|    explained_variance | 0.328     |
|    learning_rate      | 0.0006    |
|    n_updates          | 9         |
|    policy_loss        | -0.000463 |
|    value_loss         | 1.08e+03  |
-------------------------------------


------------------------------------
| rollout/              |          |
|    ep_len_mean        | 616      |
|    ep_rew_mean        | 485      |
| time/                 |          |
|    fps                | 52       |
|    iterations         | 11       |
|    time_elapsed       | 1861     |
|    total_timesteps    | 97020    |
| train/                |          |
|    entropy_loss       | -0.249   |
|    explained_variance | 0.337    |
|    learning_rate      | 0.0006   |
|    n_updates          | 10       |
|    policy_loss        | -0.00171 |
|    value_loss         | 1.96e+03 |
------------------------------------


-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 611       |
|    ep_rew_mean        | 492       |
| time/                 |           |
|    fps                | 52        |
|    iterations         | 12        |
|    time_elapsed       | 2027      |
|    total_timesteps    | 105840    |
| train/                |           |
|    entropy_loss       | -0.237    |
|    explained_variance | 0.38      |
|    learning_rate      | 0.0006    |
|    n_updates          | 11        |
|    policy_loss        | -0.000392 |
|    value_loss         | 1.08e+03  |
-------------------------------------


------------------------------------
| rollout/              |          |
|    ep_len_mean        | 616      |
|    ep_rew_mean        | 491      |
| time/                 |          |
|    fps                | 52       |
|    iterations         | 13       |
|    time_elapsed       | 2190     |
|    total_timesteps    | 114660   |
| train/                |          |
|    entropy_loss       | -0.238   |
|    explained_variance | 0.321    |
|    learning_rate      | 0.0006   |
|    n_updates          | 12       |
|    policy_loss        | -0.00715 |
|    value_loss         | 1.99e+03 |
------------------------------------


------------------------------------
| rollout/              |          |
|    ep_len_mean        | 606      |
|    ep_rew_mean        | 501      |
| time/                 |          |
|    fps                | 52       |
|    iterations         | 14       |
|    time_elapsed       | 2351     |
|    total_timesteps    | 123480   |
| train/                |          |
|    entropy_loss       | -0.231   |
|    explained_variance | 0.238    |
|    learning_rate      | 0.0006   |
|    n_updates          | 13       |
|    policy_loss        | 0.0148   |
|    value_loss         | 2.31e+03 |
------------------------------------


------------------------------------
| rollout/              |          |
|    ep_len_mean        | 619      |
|    ep_rew_mean        | 490      |
| time/                 |          |
|    fps                | 52       |
|    iterations         | 15       |
|    time_elapsed       | 2501     |
|    total_timesteps    | 132300   |
| train/                |          |
|    entropy_loss       | -0.234   |
|    explained_variance | 0.37     |
|    learning_rate      | 0.0006   |
|    n_updates          | 14       |
|    policy_loss        | 0.00869  |
|    value_loss         | 1.73e+03 |
------------------------------------


------------------------------------
| rollout/              |          |
|    ep_len_mean        | 625      |
|    ep_rew_mean        | 516      |
| time/                 |          |
|    fps                | 53       |
|    iterations         | 16       |
|    time_elapsed       | 2632     |
|    total_timesteps    | 141120   |
| train/                |          |
|    entropy_loss       | -0.232   |
|    explained_variance | 0.392    |
|    learning_rate      | 0.0006   |
|    n_updates          | 15       |
|    policy_loss        | 0.0142   |
|    value_loss         | 1.85e+03 |
------------------------------------


------------------------------------
| rollout/              |          |
|    ep_len_mean        | 623      |
|    ep_rew_mean        | 516      |
| time/                 |          |
|    fps                | 54       |
|    iterations         | 17       |
|    time_elapsed       | 2746     |
|    total_timesteps    | 149940   |
| train/                |          |
|    entropy_loss       | -0.236   |
|    explained_variance | 0.395    |
|    learning_rate      | 0.0006   |
|    n_updates          | 16       |
|    policy_loss        | -0.0142  |
|    value_loss         | 1.4e+03  |
------------------------------------


------------------------------------
| rollout/              |          |
|    ep_len_mean        | 618      |
|    ep_rew_mean        | 526      |
| time/                 |          |
|    fps                | 55       |
|    iterations         | 18       |
|    time_elapsed       | 2859     |
|    total_timesteps    | 158760   |
| train/                |          |
|    entropy_loss       | -0.231   |
|    explained_variance | 0.364    |
|    learning_rate      | 0.0006   |
|    n_updates          | 17       |
|    policy_loss        | 0.0121   |
|    value_loss         | 976      |
------------------------------------


------------------------------------
| rollout/              |          |
|    ep_len_mean        | 608      |
|    ep_rew_mean        | 499      |
| time/                 |          |
|    fps                | 55       |
|    iterations         | 19       |
|    time_elapsed       | 3014     |
|    total_timesteps    | 167580   |
| train/                |          |
|    entropy_loss       | -0.224   |
|    explained_variance | 0.0734   |
|    learning_rate      | 0.0006   |
|    n_updates          | 18       |
|    policy_loss        | -0.0057  |
|    value_loss         | 1.88e+03 |
------------------------------------


------------------------------------
| rollout/              |          |
|    ep_len_mean        | 598      |
|    ep_rew_mean        | 469      |
| time/                 |          |
|    fps                | 55       |
|    iterations         | 20       |
|    time_elapsed       | 3182     |
|    total_timesteps    | 176400   |
| train/                |          |
|    entropy_loss       | -0.242   |
|    explained_variance | 0.412    |
|    learning_rate      | 0.0006   |
|    n_updates          | 19       |
|    policy_loss        | 0.00806  |
|    value_loss         | 1.11e+03 |
------------------------------------


------------------------------------
| rollout/              |          |
|    ep_len_mean        | 604      |
|    ep_rew_mean        | 468      |
| time/                 |          |
|    fps                | 54       |
|    iterations         | 21       |
|    time_elapsed       | 3370     |
|    total_timesteps    | 185220   |
| train/                |          |
|    entropy_loss       | -0.241   |
|    explained_variance | 0.261    |
|    learning_rate      | 0.0006   |
|    n_updates          | 20       |
|    policy_loss        | 0.00308  |
|    value_loss         | 1.68e+03 |
------------------------------------


------------------------------------
| rollout/              |          |
|    ep_len_mean        | 619      |
|    ep_rew_mean        | 476      |
| time/                 |          |
|    fps                | 54       |
|    iterations         | 22       |
|    time_elapsed       | 3551     |
|    total_timesteps    | 194040   |
| train/                |          |
|    entropy_loss       | -0.252   |
|    explained_variance | 0.512    |
|    learning_rate      | 0.0006   |
|    n_updates          | 21       |
|    policy_loss        | -0.00594 |
|    value_loss         | 1.16e+03 |
------------------------------------


------------------------------------
| rollout/              |          |
|    ep_len_mean        | 596      |
|    ep_rew_mean        | 490      |
| time/                 |          |
|    fps                | 54       |
|    iterations         | 23       |
|    time_elapsed       | 3722     |
|    total_timesteps    | 202860   |
| train/                |          |
|    entropy_loss       | -0.244   |
|    explained_variance | 0.3      |
|    learning_rate      | 0.0006   |
|    n_updates          | 22       |
|    policy_loss        | -0.00448 |
|    value_loss         | 2.12e+03 |
------------------------------------


------------------------------------
| rollout/              |          |
|    ep_len_mean        | 591      |
|    ep_rew_mean        | 482      |
| time/                 |          |
|    fps                | 54       |
|    iterations         | 24       |
|    time_elapsed       | 3874     |
|    total_timesteps    | 211680   |
| train/                |          |
|    entropy_loss       | -0.244   |
|    explained_variance | 0.241    |
|    learning_rate      | 0.0006   |
|    n_updates          | 23       |
|    policy_loss        | -0.00844 |
|    value_loss         | 2.48e+03 |
------------------------------------


------------------------------------
| rollout/              |          |
|    ep_len_mean        | 596      |
|    ep_rew_mean        | 476      |
| time/                 |          |
|    fps                | 55       |
|    iterations         | 25       |
|    time_elapsed       | 3979     |
|    total_timesteps    | 220500   |
| train/                |          |
|    entropy_loss       | -0.242   |
|    explained_variance | 0.337    |
|    learning_rate      | 0.0006   |
|    n_updates          | 24       |
|    policy_loss        | -0.0162  |
|    value_loss         | 1.72e+03 |
------------------------------------


------------------------------------
| rollout/              |          |
|    ep_len_mean        | 591      |
|    ep_rew_mean        | 460      |
| time/                 |          |
|    fps                | 56       |
|    iterations         | 26       |
|    time_elapsed       | 4087     |
|    total_timesteps    | 229320   |
| train/                |          |
|    entropy_loss       | -0.226   |
|    explained_variance | 0.283    |
|    learning_rate      | 0.0006   |
|    n_updates          | 25       |
|    policy_loss        | -0.00197 |
|    value_loss         | 1.73e+03 |
------------------------------------


------------------------------------
| rollout/              |          |
|    ep_len_mean        | 602      |
|    ep_rew_mean        | 463      |
| time/                 |          |
|    fps                | 56       |
|    iterations         | 27       |
|    time_elapsed       | 4239     |
|    total_timesteps    | 238140   |
| train/                |          |
|    entropy_loss       | -0.235   |
|    explained_variance | 0.384    |
|    learning_rate      | 0.0006   |
|    n_updates          | 26       |
|    policy_loss        | 0.0014   |
|    value_loss         | 1.09e+03 |
------------------------------------


------------------------------------
| rollout/              |          |
|    ep_len_mean        | 612      |
|    ep_rew_mean        | 473      |
| time/                 |          |
|    fps                | 55       |
|    iterations         | 28       |
|    time_elapsed       | 4435     |
|    total_timesteps    | 246960   |
| train/                |          |
|    entropy_loss       | -0.234   |
|    explained_variance | 0.277    |
|    learning_rate      | 0.0006   |
|    n_updates          | 27       |
|    policy_loss        | -0.0048  |
|    value_loss         | 1.41e+03 |
------------------------------------


------------------------------------
| rollout/              |          |
|    ep_len_mean        | 606      |
|    ep_rew_mean        | 461      |
| time/                 |          |
|    fps                | 55       |
|    iterations         | 29       |
|    time_elapsed       | 4644     |
|    total_timesteps    | 255780   |
| train/                |          |
|    entropy_loss       | -0.241   |
|    explained_variance | 0.301    |
|    learning_rate      | 0.0006   |
|    n_updates          | 28       |
|    policy_loss        | 0.00591  |
|    value_loss         | 1.65e+03 |
------------------------------------


------------------------------------
| rollout/              |          |
|    ep_len_mean        | 627      |
|    ep_rew_mean        | 484      |
| time/                 |          |
|    fps                | 54       |
|    iterations         | 30       |
|    time_elapsed       | 4837     |
|    total_timesteps    | 264600   |
| train/                |          |
|    entropy_loss       | -0.263   |
|    explained_variance | 0.26     |
|    learning_rate      | 0.0006   |
|    n_updates          | 29       |
|    policy_loss        | 0.0166   |
|    value_loss         | 1.32e+03 |
------------------------------------


------------------------------------
| rollout/              |          |
|    ep_len_mean        | 631      |
|    ep_rew_mean        | 468      |
| time/                 |          |
|    fps                | 54       |
|    iterations         | 31       |
|    time_elapsed       | 4973     |
|    total_timesteps    | 273420   |
| train/                |          |
|    entropy_loss       | -0.248   |
|    explained_variance | 0.288    |
|    learning_rate      | 0.0006   |
|    n_updates          | 30       |
|    policy_loss        | -0.00271 |
|    value_loss         | 1.8e+03  |
------------------------------------


-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 628       |
|    ep_rew_mean        | 478       |
| time/                 |           |
|    fps                | 54        |
|    iterations         | 32        |
|    time_elapsed       | 5148      |
|    total_timesteps    | 282240    |
| train/                |           |
|    entropy_loss       | -0.257    |
|    explained_variance | 0.458     |
|    learning_rate      | 0.0006    |
|    n_updates          | 31        |
|    policy_loss        | -0.000971 |
|    value_loss         | 1.66e+03  |
-------------------------------------


------------------------------------
| rollout/              |          |
|    ep_len_mean        | 620      |
|    ep_rew_mean        | 480      |
| time/                 |          |
|    fps                | 54       |
|    iterations         | 33       |
|    time_elapsed       | 5372     |
|    total_timesteps    | 291060   |
| train/                |          |
|    entropy_loss       | -0.238   |
|    explained_variance | 0.318    |
|    learning_rate      | 0.0006   |
|    n_updates          | 32       |
|    policy_loss        | 0.0149   |
|    value_loss         | 1.63e+03 |
------------------------------------


------------------------------------
| rollout/              |          |
|    ep_len_mean        | 642      |
|    ep_rew_mean        | 529      |
| time/                 |          |
|    fps                | 53       |
|    iterations         | 34       |
|    time_elapsed       | 5583     |
|    total_timesteps    | 299880   |
| train/                |          |
|    entropy_loss       | -0.253   |
|    explained_variance | 0.2      |
|    learning_rate      | 0.0006   |
|    n_updates          | 33       |
|    policy_loss        | 0.0188   |
|    value_loss         | 2.5e+03  |
------------------------------------


------------------------------------
| rollout/              |          |
|    ep_len_mean        | 639      |
|    ep_rew_mean        | 542      |
| time/                 |          |
|    fps                | 53       |
|    iterations         | 35       |
|    time_elapsed       | 5730     |
|    total_timesteps    | 308700   |
| train/                |          |
|    entropy_loss       | -0.241   |
|    explained_variance | 0.188    |
|    learning_rate      | 0.0006   |
|    n_updates          | 34       |
|    policy_loss        | 0.0218   |
|    value_loss         | 1.91e+03 |
------------------------------------


In [3]:
model.save(f'final_a2c_{n_steps}')
f'final_a2c_{n_steps}'

'final_a2c_420'