In [1]:
import gymnasium as gym
from stk_actor.wrappers import StuckStopWrapper
import torch
import torch.nn.functional as F

path = 'stk_actor/trained_agents/'
agent = 'normed_a2c_num5_best'

class PreprocessObservationWrapper(gym.ObservationWrapper):
    def __init__(self, env):
        """
        A Gym wrapper to preprocess mixed observation space (continuous + discrete)
        into a flat tensor.
        
        Args:
            env: The Gym environment to wrap.
        """
        super().__init__(env)
        self.observation_space = self._get_flat_observation_space(env.observation_space)
        self.mean = torch.load(path+f'{agent}/buffer_mean', map_location='cpu')
        self.std = torch.load(path+f'{agent}/buffer_std', map_location='cpu')

    def _get_flat_observation_space(self, observation_space):
        """
        Create a flat observation space based on the original observation space.
        
        Args:
            observation_space: Original observation space with 'continuous' and 'discrete' components.
        
        Returns:
            A flattened observation space.
        """
        continuous_dim = observation_space['continuous'].shape[0]
        discrete_dims = sum(space.n for space in observation_space['discrete'])
        flat_dim = continuous_dim + discrete_dims
        return gym.spaces.Box(low=-float('inf'), high=float('inf'), shape=(flat_dim,), dtype=float)

    def observation(self, obs):
        """
        Process the observation into a flat tensor.
        
        Args:
            obs: The raw observation from the environment.
        
        Returns:
            A preprocessed flat tensor.
        """
        continuous_obs, discrete_obs = obs['continuous'], obs['discrete']
        continuous_tensor = torch.FloatTensor(continuous_obs)
        
        discrete_tensors = [
            F.one_hot(torch.tensor(x), num_classes=num_classes.n).float()
            for x, num_classes in zip(discrete_obs, self.env.observation_space['discrete'])
        ]
        
        flat_tensor = torch.cat([continuous_tensor] + discrete_tensors)
        normed_flat_tensor = (flat_tensor - self.mean) / (self.std + 1e-8)
        return normed_flat_tensor

import gymnasium as gym
from gymnasium import Wrapper

class SkipFirstNStepsWrapper(Wrapper):
    def __init__(self, env, n):
        super().__init__(env)
        self.n = n

    def reset(self, **kwargs):
        # Reset the environment
        obs, info = self.env.reset(**kwargs)
        # Skip the first n steps
        for _ in range(self.n):
            obs, _, done, truncated, info = self.env.step(self.env.action_space.sample())
            if done or truncated:
                obs, info = self.env.reset(**kwargs)
        return obs, info

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.distributions import Categorical
from typing import Dict, List, Tuple, Union, Type
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.distributions import Categorical

import gymnasium as gym
from gymnasium import spaces

def get_device(device: Union[torch.device, str] = "auto") -> torch.device:
    if device == "auto":
        device = "cuda"
    device = torch.device(device)
    if device.type == torch.device("cuda").type and not torch.cuda.is_available():
        return torch.device("cpu")
    return device

class BaseFeaturesExtractor(nn.Module):
    def __init__(self, observation_space: gym.Space, features_dim: int = 0) -> None:
        super().__init__()
        assert features_dim > 0
        self._observation_space = observation_space
        self._features_dim = features_dim
    @property
    def features_dim(self) -> int:
        return self._features_dim

def get_flattened_obs_dim(observation_space: spaces.Space) -> int:
    if isinstance(observation_space, spaces.MultiDiscrete):
        return sum(observation_space.nvec)
    else:
        return spaces.utils.flatdim(observation_space)

class FlattenExtractor(BaseFeaturesExtractor):
    def __init__(self, observation_space: gym.Space) -> None:
        super().__init__(observation_space, get_flattened_obs_dim(observation_space))
        self.flatten = nn.Flatten()
    def forward(self, observations: torch.Tensor) -> torch.Tensor:
        return self.flatten(observations)
    
class MlpExtractor(nn.Module):
    def __init__(
        self,
        feature_dim: int,
        net_arch: Union[List[int], Dict[str, List[int]]],
        activation_fn: Type[nn.Module],
        device: Union[torch.device, str] = "auto",
    ) -> None:
        super().__init__()
        # device = torch.get_device(device)
        policy_net: List[nn.Module] = []
        value_net: List[nn.Module] = []
        last_layer_dim_pi = feature_dim
        last_layer_dim_vf = feature_dim

        if isinstance(net_arch, dict):
            pi_layers_dims = net_arch.get("pi", []) 
            vf_layers_dims = net_arch.get("vf", []) 
        else:
            pi_layers_dims = vf_layers_dims = net_arch
        for curr_layer_dim in pi_layers_dims:
            policy_net.append(nn.Linear(last_layer_dim_pi, curr_layer_dim))
            policy_net.append(activation_fn())
            last_layer_dim_pi = curr_layer_dim
        for curr_layer_dim in vf_layers_dims:
            value_net.append(nn.Linear(last_layer_dim_vf, curr_layer_dim))
            value_net.append(activation_fn())
            last_layer_dim_vf = curr_layer_dim

        self.latent_dim_pi = last_layer_dim_pi
        self.latent_dim_vf = last_layer_dim_vf
        self.policy_net = nn.Sequential(*policy_net)#.to(device)
        self.value_net = nn.Sequential(*value_net)#.to(device)

    def forward(self, features: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
        """
        :return: latent_policy, latent_value of the specified network.
            If all layers are shared, then ``latent_policy == latent_value``
        """
        return self.forward_actor(features), self.forward_critic(features)

    def forward_actor(self, features: torch.Tensor) -> torch.Tensor:
        return self.policy_net(features)

    def forward_critic(self, features: torch.Tensor) -> torch.Tensor:
        return self.value_net(features)

    
class Policy(nn.Module):
    def __init__(self, observation_space, action_dims, net_arch, activation_fn,):
        super().__init__()
        self.features_extractor = FlattenExtractor(observation_space)
        self.pi_features_extractor = self.features_extractor
        self.vf_features_extractor = self.features_extractor
        self.mlp_extractor = MlpExtractor(
            self.features_extractor.features_dim,
            net_arch=net_arch,
            activation_fn=activation_fn,
        )
        self.action_net = nn.Linear(net_arch[-1], sum(action_dims))
        self.value_net = nn.Linear(net_arch[-1], 1)


class UnifiedSACPolicy(nn.Module):
    def __init__(self, observation_space, action_dims, net_arch, activation_fn):
        super().__init__()
        
        self.shared = Policy(
            observation_space,
            action_dims,
            net_arch=net_arch,
            activation_fn=activation_fn
        )
        self.action_dims = action_dims
    
    def forward(self, x):
        x = self.shared.features_extractor(x)
        x = self.shared.mlp_extractor.policy_net(x)
        x = self.shared.action_net(x)
        return x
    
    def sample(self, x, deterministic=False):
        logits = self.forward(x)
        
        # Split logits for each action dimension
        split_logits = torch.split(logits, self.action_dims, dim=-1)
        
        actions = []
        log_probs = []
        probs = []
        
        for logit in split_logits:
            distribution = Categorical(logits=logit)
            if deterministic:
                action = torch.argmax(logit, dim=-1)
            else:
                action = distribution.sample()
            
            log_prob = distribution.log_prob(action)
            prob = F.softmax(logit, dim=-1)
            
            actions.append(action)
            log_probs.append(log_prob)
            probs.append(prob)
        
        return (
            torch.stack(actions),
            torch.stack(log_probs),
            probs
        )
    
#policy = torch.load('policy_512_512_512_512_SiLU_3_statedict', map_location='cuda')


from stable_baselines3 import PPO, A2C
from stable_baselines3.common.env_util import make_vec_env
import gymnasium as gym
from pystk2_gymnasium import AgentSpec
from bbrl.agents.gymnasium import ParallelGymAgent, make_env
from functools import partial

tracks = [
    'abyss',
    'black_forest',
    'candela_city',
    'cocoa_temple',
    'cornfield_crossing',
    'fortmagma',
    'gran_paradiso_island',
    'hacienda',
    'lighthouse',
    'mines',
    'minigolf',
    'olivermath',
    'ravenbridge_mansion',
    'sandtrack',
    'scotland',
    'snowmountain',
    'snowtuxpeak',
    'stk_enterprise',
    'volcano_island',
    'xr591',
    'zengarden',

# # #   ==================   #

#     'fortmagma',
#     'ravenbridge_mansion',
#     'snowmountain',
#     'cocoa_temple',
#     'sandtrack',    
#     'scotland', 
#     'stk_enterprise',
#     'volcano_island', # 1104
#     'xr591', # 864   
]

# karts = [4,12]
karts = [3,12]
n_envs = len(tracks)*len(karts)

print('making', n_envs, 'environments')
vec_env = make_vec_env(
    "supertuxkart/flattened_multidiscrete-v0",
    # seed=12,
    n_envs=n_envs, 
    wrapper_class=lambda x : (
        SkipFirstNStepsWrapper(
            StuckStopWrapper(
                PreprocessObservationWrapper(x),
                n=128,
            ), 
            n=19,
        )
    ), 
    env_kwargs={
    'render_mode':None, 'agent':AgentSpec(use_ai=False, name="walid"), #'track':'minigolf', 
    'laps':1,
    'difficulty':2, 
    'num_kart':12, #'difficulty':0
})

ix = 0
for num_kart in enumerate(karts):
    for track in enumerate(tracks):
        venv = vec_env.envs[ix]
        venv.env.default_track = track
        venv.env.num_kart = num_kart
        print(ix, track, )
        ix+=1



net_arch=[1024,1024,1024]
activation_fn=torch.nn.Tanh
# filename = 'policy_normed_1024_1024_1024_Tanh_statedict_2'
filename = path+f'{agent}/statedict'

action_dims = [space.n for space in vec_env.action_space]
unified_policy = UnifiedSACPolicy(
    vec_env.observation_space, 
    action_dims, 
    net_arch=net_arch, 
    activation_fn=activation_fn
)
unified_policy.load_state_dict(torch.load(filename, map_location='cpu'))


making 42 environments
..:: Antarctica Rendering Engine 2.0 ::..


  self.mean = torch.load(path+f'{agent}/buffer_mean', map_location='cpu')
  self.std = torch.load(path+f'{agent}/buffer_std', map_location='cpu')


..:: Antarctica Rendering Engine 2.0 ::..
..:: Antarctica Rendering Engine 2.0 ::..
..:: Antarctica Rendering Engine 2.0 ::..
..:: Antarctica Rendering Engine 2.0 ::..
..:: Antarctica Rendering Engine 2.0 ::..
..:: Antarctica Rendering Engine 2.0 ::..
..:: Antarctica Rendering Engine 2.0 ::..
..:: Antarctica Rendering Engine 2.0 ::..
0 (0, 'abyss')
1 (1, 'black_forest')
2 (2, 'candela_city')
3 (3, 'cocoa_temple')
4 (4, 'cornfield_crossing')
5 (5, 'fortmagma')
6 (6, 'gran_paradiso_island')
7 (7, 'hacienda')
8 (8, 'lighthouse')
9 (9, 'mines')
10 (10, 'minigolf')
11 (11, 'olivermath')
12 (12, 'ravenbridge_mansion')
13 (13, 'sandtrack')
14 (14, 'scotland')
15 (15, 'snowmountain')
16 (16, 'snowtuxpeak')
17 (17, 'stk_enterprise')
18 (18, 'volcano_island')
19 (19, 'xr591')
20 (20, 'zengarden')
21 (0, 'abyss')
22 (1, 'black_forest')
23 (2, 'candela_city')
24 (3, 'cocoa_temple')
25 (4, 'cornfield_crossing')
26 (5, 'fortmagma')
27 (6, 'gran_paradiso_island')
28 (7, 'hacienda')
29 (8, 'lighthouse

  unified_policy.load_state_dict(torch.load(filename, map_location='cpu'))


<All keys matched successfully>

..:: Antarctica Rendering Engine 2.0 ::..


In [None]:
from stable_baselines3 import A2C


steps = [(
    # 1024*8,
    5*n_envs,
    1_000_000,
)]
for n_steps, total_timesteps in steps:
    # model = PPO(
    #     "MlpPolicy", 
    #     vec_env, 
    #     verbose=1, 
    #     policy_kwargs = dict(net_arch=net_arch, activation_fn=activation_fn,),
    #     device='cpu',
    #     learning_rate=0.0001,
    #     batch_size=128,
    #     n_epochs=100,
    #     n_steps=n_steps,
    #     tensorboard_log="./outputs/",
    #     # ent_coef=0.001,
    #     clip_range=0.2,
    # )
    model = A2C(
        "MlpPolicy", 
        vec_env, 
        verbose=1, 
        policy_kwargs = dict(net_arch=net_arch, activation_fn=activation_fn,),
        device='cpu',
        # learning_rate=0.001,
        n_steps=n_steps,
        tensorboard_log="./outputs/",
        use_rms_prop=False,
        normalize_advantage=True,
    )
    print('DOING', n_steps, total_timesteps)
    model.policy.load_state_dict(unified_policy.shared.state_dict())
    model.learn(total_timesteps=total_timesteps, progress_bar=True, log_interval=1)

        


..:: Antarctica Rendering Engine 2.0 ::..
Using cpu device
..:: Antarctica Rendering Engine 2.0 ::..
..:: Antarctica Rendering Engine 2.0 ::..
..:: Antarctica Rendering Engine 2.0 ::..
..:: Antarctica Rendering Engine 2.0 ::..
..:: Antarctica Rendering Engine 2.0 ::..
..:: Antarctica Rendering Engine 2.0 ::..
..:: Antarctica Rendering Engine 2.0 ::..
..:: Antarctica Rendering Engine 2.0 ::..
..:: Antarctica Rendering Engine 2.0 ::..
..:: Antarctica Rendering Engine 2.0 ::..
..:: Antarctica Rendering Engine 2.0 ::..
..:: Antarctica Rendering Engine 2.0 ::..
..:: Antarctica Rendering Engine 2.0 ::..
..:: Antarctica Rendering Engine 2.0 ::..
..:: Antarctica Rendering Engine 2.0 ::..
..:: Antarctica Rendering Engine 2.0 ::..
..:: Antarctica Rendering Engine 2.0 ::..
..:: Antarctica Rendering Engine 2.0 ::..
..:: Antarctica Rendering Engine 2.0 ::..
..:: Antarctica Rendering Engine 2.0 ::..
..:: Antarctica Rendering Engine 2.0 ::..
..:: Antarctica Rendering Engine 2.0 ::..
..:: Antarctica R

Output()

----------------------------
| time/              |     |
|    fps             | 9   |
|    iterations      | 1   |
|    time_elapsed    | 21  |
|    total_timesteps | 210 |
----------------------------


------------------------------------
| time/                 |          |
|    fps                | 16       |
|    iterations         | 2        |
|    time_elapsed       | 24       |
|    total_timesteps    | 420      |
| train/                |          |
|    entropy_loss       | -0.125   |
|    explained_variance | 0.428    |
|    learning_rate      | 0.0007   |
|    n_updates          | 1        |
|    policy_loss        | -0.0792  |
|    value_loss         | 151      |
------------------------------------


------------------------------------
| time/                 |          |
|    fps                | 22       |
|    iterations         | 3        |
|    time_elapsed       | 28       |
|    total_timesteps    | 630      |
| train/                |          |
|    entropy_loss       | -0.184   |
|    explained_variance | 0.397    |
|    learning_rate      | 0.0007   |
|    n_updates          | 2        |
|    policy_loss        | -0.00817 |
|    value_loss         | 107      |
------------------------------------


------------------------------------
| time/                 |          |
|    fps                | 26       |
|    iterations         | 4        |
|    time_elapsed       | 31       |
|    total_timesteps    | 840      |
| train/                |          |
|    entropy_loss       | -0.216   |
|    explained_variance | 0.598    |
|    learning_rate      | 0.0007   |
|    n_updates          | 3        |
|    policy_loss        | -0.144   |
|    value_loss         | 11.3     |
------------------------------------


------------------------------------
| time/                 |          |
|    fps                | 30       |
|    iterations         | 5        |
|    time_elapsed       | 34       |
|    total_timesteps    | 1050     |
| train/                |          |
|    entropy_loss       | -0.183   |
|    explained_variance | -0.109   |
|    learning_rate      | 0.0007   |
|    n_updates          | 4        |
|    policy_loss        | -0.0129  |
|    value_loss         | 7        |
------------------------------------


------------------------------------
| time/                 |          |
|    fps                | 33       |
|    iterations         | 6        |
|    time_elapsed       | 37       |
|    total_timesteps    | 1260     |
| train/                |          |
|    entropy_loss       | -0.182   |
|    explained_variance | -0.18    |
|    learning_rate      | 0.0007   |
|    n_updates          | 5        |
|    policy_loss        | -0.0222  |
|    value_loss         | 6.97     |
------------------------------------


------------------------------------
| time/                 |          |
|    fps                | 36       |
|    iterations         | 7        |
|    time_elapsed       | 40       |
|    total_timesteps    | 1470     |
| train/                |          |
|    entropy_loss       | -0.292   |
|    explained_variance | -0.39    |
|    learning_rate      | 0.0007   |
|    n_updates          | 6        |
|    policy_loss        | 0.0358   |
|    value_loss         | 9.33     |
------------------------------------


------------------------------------
| time/                 |          |
|    fps                | 38       |
|    iterations         | 8        |
|    time_elapsed       | 44       |
|    total_timesteps    | 1680     |
| train/                |          |
|    entropy_loss       | -0.257   |
|    explained_variance | -0.0343  |
|    learning_rate      | 0.0007   |
|    n_updates          | 7        |
|    policy_loss        | -0.0219  |
|    value_loss         | 8.27     |
------------------------------------


------------------------------------
| time/                 |          |
|    fps                | 39       |
|    iterations         | 9        |
|    time_elapsed       | 47       |
|    total_timesteps    | 1890     |
| train/                |          |
|    entropy_loss       | -0.246   |
|    explained_variance | 0.179    |
|    learning_rate      | 0.0007   |
|    n_updates          | 8        |
|    policy_loss        | 0.0822   |
|    value_loss         | 11       |
------------------------------------


------------------------------------
| time/                 |          |
|    fps                | 41       |
|    iterations         | 10       |
|    time_elapsed       | 50       |
|    total_timesteps    | 2100     |
| train/                |          |
|    entropy_loss       | -0.228   |
|    explained_variance | 0.11     |
|    learning_rate      | 0.0007   |
|    n_updates          | 9        |
|    policy_loss        | 0.0734   |
|    value_loss         | 35.9     |
------------------------------------


------------------------------------
| time/                 |          |
|    fps                | 42       |
|    iterations         | 11       |
|    time_elapsed       | 53       |
|    total_timesteps    | 2310     |
| train/                |          |
|    entropy_loss       | -0.279   |
|    explained_variance | 0.215    |
|    learning_rate      | 0.0007   |
|    n_updates          | 10       |
|    policy_loss        | 0.0285   |
|    value_loss         | 31.2     |
------------------------------------


------------------------------------
| time/                 |          |
|    fps                | 44       |
|    iterations         | 12       |
|    time_elapsed       | 57       |
|    total_timesteps    | 2520     |
| train/                |          |
|    entropy_loss       | -0.219   |
|    explained_variance | 0.439    |
|    learning_rate      | 0.0007   |
|    n_updates          | 11       |
|    policy_loss        | 0.0241   |
|    value_loss         | 38       |
------------------------------------


------------------------------------
| time/                 |          |
|    fps                | 45       |
|    iterations         | 13       |
|    time_elapsed       | 60       |
|    total_timesteps    | 2730     |
| train/                |          |
|    entropy_loss       | -0.249   |
|    explained_variance | -5.16    |
|    learning_rate      | 0.0007   |
|    n_updates          | 12       |
|    policy_loss        | -0.0354  |
|    value_loss         | 146      |
------------------------------------


------------------------------------
| time/                 |          |
|    fps                | 46       |
|    iterations         | 14       |
|    time_elapsed       | 63       |
|    total_timesteps    | 2940     |
| train/                |          |
|    entropy_loss       | -0.268   |
|    explained_variance | 0.822    |
|    learning_rate      | 0.0007   |
|    n_updates          | 13       |
|    policy_loss        | 0.00771  |
|    value_loss         | 36.9     |
------------------------------------


------------------------------------
| time/                 |          |
|    fps                | 47       |
|    iterations         | 15       |
|    time_elapsed       | 66       |
|    total_timesteps    | 3150     |
| train/                |          |
|    entropy_loss       | -0.264   |
|    explained_variance | 0.944    |
|    learning_rate      | 0.0007   |
|    n_updates          | 14       |
|    policy_loss        | 0.077    |
|    value_loss         | 12.3     |
------------------------------------


------------------------------------
| time/                 |          |
|    fps                | 47       |
|    iterations         | 16       |
|    time_elapsed       | 70       |
|    total_timesteps    | 3360     |
| train/                |          |
|    entropy_loss       | -0.188   |
|    explained_variance | -0.628   |
|    learning_rate      | 0.0007   |
|    n_updates          | 15       |
|    policy_loss        | 0.0265   |
|    value_loss         | 49.5     |
------------------------------------


------------------------------------
| time/                 |          |
|    fps                | 48       |
|    iterations         | 17       |
|    time_elapsed       | 73       |
|    total_timesteps    | 3570     |
| train/                |          |
|    entropy_loss       | -0.211   |
|    explained_variance | 0.775    |
|    learning_rate      | 0.0007   |
|    n_updates          | 16       |
|    policy_loss        | 0.0543   |
|    value_loss         | 10.1     |
------------------------------------


------------------------------------
| time/                 |          |
|    fps                | 49       |
|    iterations         | 18       |
|    time_elapsed       | 76       |
|    total_timesteps    | 3780     |
| train/                |          |
|    entropy_loss       | -0.172   |
|    explained_variance | 0.85     |
|    learning_rate      | 0.0007   |
|    n_updates          | 17       |
|    policy_loss        | 0.00149  |
|    value_loss         | 9.26     |
------------------------------------


------------------------------------
| time/                 |          |
|    fps                | 50       |
|    iterations         | 19       |
|    time_elapsed       | 79       |
|    total_timesteps    | 3990     |
| train/                |          |
|    entropy_loss       | -0.18    |
|    explained_variance | 0.933    |
|    learning_rate      | 0.0007   |
|    n_updates          | 18       |
|    policy_loss        | 0.0164   |
|    value_loss         | 6.41     |
------------------------------------


------------------------------------
| time/                 |          |
|    fps                | 50       |
|    iterations         | 20       |
|    time_elapsed       | 82       |
|    total_timesteps    | 4200     |
| train/                |          |
|    entropy_loss       | -0.247   |
|    explained_variance | 0.859    |
|    learning_rate      | 0.0007   |
|    n_updates          | 19       |
|    policy_loss        | -0.00675 |
|    value_loss         | 13.7     |
------------------------------------


------------------------------------
| time/                 |          |
|    fps                | 51       |
|    iterations         | 21       |
|    time_elapsed       | 86       |
|    total_timesteps    | 4410     |
| train/                |          |
|    entropy_loss       | -0.214   |
|    explained_variance | 0.89     |
|    learning_rate      | 0.0007   |
|    n_updates          | 20       |
|    policy_loss        | 0.0121   |
|    value_loss         | 14.5     |
------------------------------------


------------------------------------
| time/                 |          |
|    fps                | 51       |
|    iterations         | 22       |
|    time_elapsed       | 89       |
|    total_timesteps    | 4620     |
| train/                |          |
|    entropy_loss       | -0.232   |
|    explained_variance | 0.951    |
|    learning_rate      | 0.0007   |
|    n_updates          | 21       |
|    policy_loss        | 0.121    |
|    value_loss         | 6.49     |
------------------------------------


------------------------------------
| time/                 |          |
|    fps                | 52       |
|    iterations         | 23       |
|    time_elapsed       | 92       |
|    total_timesteps    | 4830     |
| train/                |          |
|    entropy_loss       | -0.282   |
|    explained_variance | 0.957    |
|    learning_rate      | 0.0007   |
|    n_updates          | 22       |
|    policy_loss        | -0.00106 |
|    value_loss         | 7.62     |
------------------------------------


------------------------------------
| time/                 |          |
|    fps                | 52       |
|    iterations         | 24       |
|    time_elapsed       | 95       |
|    total_timesteps    | 5040     |
| train/                |          |
|    entropy_loss       | -0.295   |
|    explained_variance | 0.859    |
|    learning_rate      | 0.0007   |
|    n_updates          | 23       |
|    policy_loss        | 0.0147   |
|    value_loss         | 33.4     |
------------------------------------


------------------------------------
| time/                 |          |
|    fps                | 52       |
|    iterations         | 25       |
|    time_elapsed       | 99       |
|    total_timesteps    | 5250     |
| train/                |          |
|    entropy_loss       | -0.262   |
|    explained_variance | 0.982    |
|    learning_rate      | 0.0007   |
|    n_updates          | 24       |
|    policy_loss        | -0.00291 |
|    value_loss         | 5.18     |
------------------------------------


------------------------------------
| time/                 |          |
|    fps                | 53       |
|    iterations         | 26       |
|    time_elapsed       | 102      |
|    total_timesteps    | 5460     |
| train/                |          |
|    entropy_loss       | -0.241   |
|    explained_variance | 0.957    |
|    learning_rate      | 0.0007   |
|    n_updates          | 25       |
|    policy_loss        | 0.066    |
|    value_loss         | 13.5     |
------------------------------------


------------------------------------
| time/                 |          |
|    fps                | 53       |
|    iterations         | 27       |
|    time_elapsed       | 105      |
|    total_timesteps    | 5670     |
| train/                |          |
|    entropy_loss       | -0.21    |
|    explained_variance | 0.963    |
|    learning_rate      | 0.0007   |
|    n_updates          | 26       |
|    policy_loss        | 0.00209  |
|    value_loss         | 13.5     |
------------------------------------


------------------------------------
| time/                 |          |
|    fps                | 53       |
|    iterations         | 28       |
|    time_elapsed       | 108      |
|    total_timesteps    | 5880     |
| train/                |          |
|    entropy_loss       | -0.2     |
|    explained_variance | 0.967    |
|    learning_rate      | 0.0007   |
|    n_updates          | 27       |
|    policy_loss        | 0.161    |
|    value_loss         | 12.9     |
------------------------------------


------------------------------------
| time/                 |          |
|    fps                | 54       |
|    iterations         | 29       |
|    time_elapsed       | 112      |
|    total_timesteps    | 6090     |
| train/                |          |
|    entropy_loss       | -0.227   |
|    explained_variance | 0.962    |
|    learning_rate      | 0.0007   |
|    n_updates          | 28       |
|    policy_loss        | 0.0481   |
|    value_loss         | 16.8     |
------------------------------------


------------------------------------
| time/                 |          |
|    fps                | 54       |
|    iterations         | 30       |
|    time_elapsed       | 115      |
|    total_timesteps    | 6300     |
| train/                |          |
|    entropy_loss       | -0.174   |
|    explained_variance | 0.955    |
|    learning_rate      | 0.0007   |
|    n_updates          | 29       |
|    policy_loss        | -0.0274  |
|    value_loss         | 20.9     |
------------------------------------


------------------------------------
| time/                 |          |
|    fps                | 54       |
|    iterations         | 31       |
|    time_elapsed       | 118      |
|    total_timesteps    | 6510     |
| train/                |          |
|    entropy_loss       | -0.235   |
|    explained_variance | 0.973    |
|    learning_rate      | 0.0007   |
|    n_updates          | 30       |
|    policy_loss        | -0.0066  |
|    value_loss         | 11.4     |
------------------------------------


------------------------------------
| time/                 |          |
|    fps                | 55       |
|    iterations         | 32       |
|    time_elapsed       | 122      |
|    total_timesteps    | 6720     |
| train/                |          |
|    entropy_loss       | -0.248   |
|    explained_variance | 0.932    |
|    learning_rate      | 0.0007   |
|    n_updates          | 31       |
|    policy_loss        | -0.00429 |
|    value_loss         | 29       |
------------------------------------


------------------------------------
| time/                 |          |
|    fps                | 55       |
|    iterations         | 33       |
|    time_elapsed       | 125      |
|    total_timesteps    | 6930     |
| train/                |          |
|    entropy_loss       | -0.275   |
|    explained_variance | 0.965    |
|    learning_rate      | 0.0007   |
|    n_updates          | 32       |
|    policy_loss        | 0.0298   |
|    value_loss         | 15.4     |
------------------------------------


------------------------------------
| time/                 |          |
|    fps                | 55       |
|    iterations         | 34       |
|    time_elapsed       | 128      |
|    total_timesteps    | 7140     |
| train/                |          |
|    entropy_loss       | -0.314   |
|    explained_variance | 0.903    |
|    learning_rate      | 0.0007   |
|    n_updates          | 33       |
|    policy_loss        | 0.0484   |
|    value_loss         | 41.7     |
------------------------------------


------------------------------------
| time/                 |          |
|    fps                | 55       |
|    iterations         | 35       |
|    time_elapsed       | 131      |
|    total_timesteps    | 7350     |
| train/                |          |
|    entropy_loss       | -0.369   |
|    explained_variance | 0.947    |
|    learning_rate      | 0.0007   |
|    n_updates          | 34       |
|    policy_loss        | -0.0631  |
|    value_loss         | 20.5     |
------------------------------------


------------------------------------
| time/                 |          |
|    fps                | 55       |
|    iterations         | 36       |
|    time_elapsed       | 135      |
|    total_timesteps    | 7560     |
| train/                |          |
|    entropy_loss       | -0.291   |
|    explained_variance | 0.893    |
|    learning_rate      | 0.0007   |
|    n_updates          | 35       |
|    policy_loss        | -0.0229  |
|    value_loss         | 39.2     |
------------------------------------


------------------------------------
| time/                 |          |
|    fps                | 56       |
|    iterations         | 37       |
|    time_elapsed       | 138      |
|    total_timesteps    | 7770     |
| train/                |          |
|    entropy_loss       | -0.295   |
|    explained_variance | 0.925    |
|    learning_rate      | 0.0007   |
|    n_updates          | 36       |
|    policy_loss        | 0.00636  |
|    value_loss         | 32       |
------------------------------------


------------------------------------
| time/                 |          |
|    fps                | 56       |
|    iterations         | 38       |
|    time_elapsed       | 141      |
|    total_timesteps    | 7980     |
| train/                |          |
|    entropy_loss       | -0.245   |
|    explained_variance | 0.934    |
|    learning_rate      | 0.0007   |
|    n_updates          | 37       |
|    policy_loss        | -0.0449  |
|    value_loss         | 36.8     |
------------------------------------


------------------------------------
| time/                 |          |
|    fps                | 56       |
|    iterations         | 39       |
|    time_elapsed       | 144      |
|    total_timesteps    | 8190     |
| train/                |          |
|    entropy_loss       | -0.229   |
|    explained_variance | 0.864    |
|    learning_rate      | 0.0007   |
|    n_updates          | 38       |
|    policy_loss        | -0.00606 |
|    value_loss         | 70.2     |
------------------------------------


------------------------------------
| time/                 |          |
|    fps                | 56       |
|    iterations         | 40       |
|    time_elapsed       | 148      |
|    total_timesteps    | 8400     |
| train/                |          |
|    entropy_loss       | -0.322   |
|    explained_variance | 0.973    |
|    learning_rate      | 0.0007   |
|    n_updates          | 39       |
|    policy_loss        | -0.0357  |
|    value_loss         | 15.6     |
------------------------------------


------------------------------------
| time/                 |          |
|    fps                | 56       |
|    iterations         | 41       |
|    time_elapsed       | 151      |
|    total_timesteps    | 8610     |
| train/                |          |
|    entropy_loss       | -0.312   |
|    explained_variance | 0.775    |
|    learning_rate      | 0.0007   |
|    n_updates          | 40       |
|    policy_loss        | -0.00786 |
|    value_loss         | 118      |
------------------------------------


------------------------------------
| time/                 |          |
|    fps                | 56       |
|    iterations         | 42       |
|    time_elapsed       | 154      |
|    total_timesteps    | 8820     |
| train/                |          |
|    entropy_loss       | -0.304   |
|    explained_variance | 0.926    |
|    learning_rate      | 0.0007   |
|    n_updates          | 41       |
|    policy_loss        | -0.00146 |
|    value_loss         | 46.9     |
------------------------------------


------------------------------------
| time/                 |          |
|    fps                | 57       |
|    iterations         | 43       |
|    time_elapsed       | 158      |
|    total_timesteps    | 9030     |
| train/                |          |
|    entropy_loss       | -0.267   |
|    explained_variance | 0.847    |
|    learning_rate      | 0.0007   |
|    n_updates          | 42       |
|    policy_loss        | 0.0134   |
|    value_loss         | 92.7     |
------------------------------------


------------------------------------
| time/                 |          |
|    fps                | 57       |
|    iterations         | 44       |
|    time_elapsed       | 161      |
|    total_timesteps    | 9240     |
| train/                |          |
|    entropy_loss       | -0.319   |
|    explained_variance | 0.91     |
|    learning_rate      | 0.0007   |
|    n_updates          | 43       |
|    policy_loss        | 0.0482   |
|    value_loss         | 53.8     |
------------------------------------


------------------------------------
| time/                 |          |
|    fps                | 57       |
|    iterations         | 45       |
|    time_elapsed       | 165      |
|    total_timesteps    | 9450     |
| train/                |          |
|    entropy_loss       | -0.301   |
|    explained_variance | 0.942    |
|    learning_rate      | 0.0007   |
|    n_updates          | 44       |
|    policy_loss        | -0.104   |
|    value_loss         | 35.6     |
------------------------------------


------------------------------------
| time/                 |          |
|    fps                | 57       |
|    iterations         | 46       |
|    time_elapsed       | 168      |
|    total_timesteps    | 9660     |
| train/                |          |
|    entropy_loss       | -0.278   |
|    explained_variance | 0.928    |
|    learning_rate      | 0.0007   |
|    n_updates          | 45       |
|    policy_loss        | -0.0149  |
|    value_loss         | 44.3     |
------------------------------------


------------------------------------
| time/                 |          |
|    fps                | 57       |
|    iterations         | 47       |
|    time_elapsed       | 171      |
|    total_timesteps    | 9870     |
| train/                |          |
|    entropy_loss       | -0.322   |
|    explained_variance | 0.924    |
|    learning_rate      | 0.0007   |
|    n_updates          | 46       |
|    policy_loss        | -0.0167  |
|    value_loss         | 44.3     |
------------------------------------


------------------------------------
| time/                 |          |
|    fps                | 57       |
|    iterations         | 48       |
|    time_elapsed       | 175      |
|    total_timesteps    | 10080    |
| train/                |          |
|    entropy_loss       | -0.292   |
|    explained_variance | 0.924    |
|    learning_rate      | 0.0007   |
|    n_updates          | 47       |
|    policy_loss        | 0.0129   |
|    value_loss         | 40.9     |
------------------------------------


------------------------------------
| time/                 |          |
|    fps                | 57       |
|    iterations         | 49       |
|    time_elapsed       | 178      |
|    total_timesteps    | 10290    |
| train/                |          |
|    entropy_loss       | -0.263   |
|    explained_variance | 0.965    |
|    learning_rate      | 0.0007   |
|    n_updates          | 48       |
|    policy_loss        | 0.0296   |
|    value_loss         | 19.7     |
------------------------------------


------------------------------------
| time/                 |          |
|    fps                | 57       |
|    iterations         | 50       |
|    time_elapsed       | 181      |
|    total_timesteps    | 10500    |
| train/                |          |
|    entropy_loss       | -0.314   |
|    explained_variance | 0.956    |
|    learning_rate      | 0.0007   |
|    n_updates          | 49       |
|    policy_loss        | 0.0241   |
|    value_loss         | 25.4     |
------------------------------------


------------------------------------
| time/                 |          |
|    fps                | 57       |
|    iterations         | 51       |
|    time_elapsed       | 184      |
|    total_timesteps    | 10710    |
| train/                |          |
|    entropy_loss       | -0.293   |
|    explained_variance | 0.848    |
|    learning_rate      | 0.0007   |
|    n_updates          | 50       |
|    policy_loss        | -0.019   |
|    value_loss         | 88.2     |
------------------------------------


In [None]:
model.save(f'final_a2c_{n_steps}')
f'final_a2c_{n_steps}'

'final_a2c_2048'