In [1]:
import gymnasium as gym
from stk_actor.wrappers import PreprocessObservationWrapper
import torch.nn as nn
from torch.distributions import Categorical
from typing import Dict, List, Tuple, Union, Type
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.distributions import Categorical
from gymnasium import spaces

def get_device(device: Union[torch.device, str] = "auto") -> torch.device:
    if device == "auto":
        device = "cuda"
    device = torch.device(device)
    if device.type == torch.device("cuda").type and not torch.cuda.is_available():
        return torch.device("cpu")
    return device

class BaseFeaturesExtractor(nn.Module):
    def __init__(self, observation_space: gym.Space, features_dim: int = 0) -> None:
        super().__init__()
        assert features_dim > 0
        self._observation_space = observation_space
        self._features_dim = features_dim
    @property
    def features_dim(self) -> int:
        return self._features_dim

def get_flattened_obs_dim(observation_space: spaces.Space) -> int:
    if isinstance(observation_space, spaces.MultiDiscrete):
        return sum(observation_space.nvec)
    else:
        return spaces.utils.flatdim(observation_space)

class FlattenExtractor(BaseFeaturesExtractor):
    def __init__(self, observation_space: gym.Space) -> None:
        super().__init__(observation_space, get_flattened_obs_dim(observation_space))
        self.flatten = nn.Flatten()
    def forward(self, observations: torch.Tensor) -> torch.Tensor:
        return self.flatten(observations)
    
class MlpExtractor(nn.Module):
    def __init__(
        self,
        feature_dim: int,
        net_arch: Union[List[int], Dict[str, List[int]]],
        activation_fn: Type[nn.Module],
        device: Union[torch.device, str] = "auto",
    ) -> None:
        super().__init__()
        # device = torch.get_device(device)
        policy_net: List[nn.Module] = []
        value_net: List[nn.Module] = []
        last_layer_dim_pi = feature_dim
        last_layer_dim_vf = feature_dim

        if isinstance(net_arch, dict):
            pi_layers_dims = net_arch.get("pi", []) 
            vf_layers_dims = net_arch.get("vf", []) 
        else:
            pi_layers_dims = vf_layers_dims = net_arch
        for curr_layer_dim in pi_layers_dims:
            policy_net.append(nn.Linear(last_layer_dim_pi, curr_layer_dim))
            policy_net.append(activation_fn())
            last_layer_dim_pi = curr_layer_dim
        for curr_layer_dim in vf_layers_dims:
            value_net.append(nn.Linear(last_layer_dim_vf, curr_layer_dim))
            value_net.append(activation_fn())
            last_layer_dim_vf = curr_layer_dim

        self.latent_dim_pi = last_layer_dim_pi
        self.latent_dim_vf = last_layer_dim_vf
        self.policy_net = nn.Sequential(*policy_net)#.to(device)
        self.value_net = nn.Sequential(*value_net)#.to(device)

    def forward(self, features: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
        """
        :return: latent_policy, latent_value of the specified network.
            If all layers are shared, then ``latent_policy == latent_value``
        """
        return self.forward_actor(features), self.forward_critic(features)

    def forward_actor(self, features: torch.Tensor) -> torch.Tensor:
        return self.policy_net(features)

    def forward_critic(self, features: torch.Tensor) -> torch.Tensor:
        return self.value_net(features)

    
class Policy(nn.Module):
    def __init__(self, observation_space, action_dims, net_arch, activation_fn,):
        super().__init__()
        self.features_extractor = FlattenExtractor(observation_space)
        self.pi_features_extractor = self.features_extractor
        self.vf_features_extractor = self.features_extractor
        self.mlp_extractor = MlpExtractor(
            self.features_extractor.features_dim,
            net_arch=net_arch,
            activation_fn=activation_fn,
        )
        self.action_net = nn.Linear(net_arch[-1], sum(action_dims))
        self.value_net = nn.Linear(net_arch[-1], 1)


class UnifiedSACPolicy(nn.Module):
    def __init__(self, observation_space, action_dims, net_arch, activation_fn):
        super().__init__()
        
        self.shared = Policy(
            observation_space,
            action_dims,
            net_arch=net_arch,
            activation_fn=activation_fn
        )
        self.action_dims = action_dims
    
    def forward(self, x):
        x = self.shared.features_extractor(x)
        x = self.shared.mlp_extractor.policy_net(x)
        x = self.shared.action_net(x)
        return x
    
    def sample(self, x, deterministic=False):
        logits = self.forward(x)
        
        # Split logits for each action dimension
        split_logits = torch.split(logits, self.action_dims, dim=-1)
        
        actions = []
        log_probs = []
        probs = []
        
        for logit in split_logits:
            distribution = Categorical(logits=logit)
            if deterministic:
                action = torch.argmax(logit, dim=-1)
            else:
                action = distribution.sample()
            
            log_prob = distribution.log_prob(action)
            prob = F.softmax(logit, dim=-1)
            
            actions.append(action)
            log_probs.append(log_prob)
            probs.append(prob)
        
        return (
            torch.stack(actions),
            torch.stack(log_probs),
            probs
        )
    
#policy = torch.load('policy_512_512_512_512_SiLU_3_statedict', map_location='cuda')


from stable_baselines3 import PPO, A2C
from stable_baselines3.common.env_util import make_vec_env
import gymnasium as gym
from pystk2_gymnasium import AgentSpec
from bbrl.agents.gymnasium import ParallelGymAgent, make_env
from functools import partial

vec_env = make_vec_env("supertuxkart/flattened_multidiscrete-v0", seed=0, n_envs=21, wrapper_class=lambda x : (PreprocessObservationWrapper(x, ret_dict=False)), env_kwargs={
    'render_mode':None, 'agent':AgentSpec(use_ai=False, name="walid"), 'track':'minigolf', 'laps':2#'difficulty':0, #'num_kart':2, 'difficulty':0
})

tracks = ['abyss',
 'black_forest',
 'candela_city',
 'cocoa_temple',
 'cornfield_crossing',
 'fortmagma',
 'gran_paradiso_island',
 'hacienda',
 'lighthouse',
 'mines',
 'minigolf',
 'olivermath',
 'ravenbridge_mansion',
 'sandtrack',
 'scotland',
 'snowmountain',
 'snowtuxpeak',
 'stk_enterprise',
 'volcano_island',
 'xr591',
 'zengarden']
for i,venv in enumerate(vec_env.envs):
    print(i, tracks[i%len(tracks)])
    venv.env.default_track = tracks[i%len(tracks)]


net_arch=[512,512,512,512]
activation_fn=torch.nn.SiLU
filename = 'policy_512_512_512_512_SiLU_3_statedict'

action_dims = [space.n for space in vec_env.action_space]
unified_policy = UnifiedSACPolicy(
    vec_env.observation_space, 
    action_dims, 
    net_arch=net_arch, 
    activation_fn=activation_fn
)
unified_policy.load_state_dict(torch.load(filename, map_location='cpu'))


..:: Antarctica Rendering Engine 2.0 ::..
0 abyss
1 black_forest
2 candela_city
3 cocoa_temple
4 cornfield_crossing
5 fortmagma
6 gran_paradiso_island
7 hacienda
8 lighthouse
9 mines
10 minigolf
11 olivermath
12 ravenbridge_mansion
13 sandtrack
14 scotland
15 snowmountain
16 snowtuxpeak
17 stk_enterprise
18 volcano_island
19 xr591
20 zengarden


  unified_policy.load_state_dict(torch.load(filename, map_location='cpu'))


<All keys matched successfully>

In [2]:
steps = [(
    #16384,
    3000,
    3_000_000,
)]
for n_steps, total_timesteps in steps:
    model = PPO(
        "MlpPolicy", 
        vec_env, 
        verbose=1, 
        policy_kwargs = dict(net_arch=net_arch, activation_fn=activation_fn,),
        device='cpu',
        learning_rate=0.0001,
        batch_size=128,
        n_steps=n_steps,
        tensorboard_log="./outputs/",
        ent_coef=0.001,
        clip_range=0.001,
    )
    print('DOING', n_steps, total_timesteps)
    model.policy.load_state_dict(unified_policy.shared.state_dict())
    model.policy.load_state_dict(
        PPO.load(
            "ppti_ppo_1500", 
            custom_objects={'policy_kwargs' : dict(net_arch=net_arch, activation_fn=activation_fn), }
        ).policy.state_dict()
    )
    model.learn(total_timesteps=total_timesteps, progress_bar=True)#callback=SummaryWriterCallback())
    model.save(f'ppti_ppo_{n_steps}')


Using cpu device


We recommend using a `batch_size` that is a factor of `n_steps * n_envs`.
Info: (n_steps=3000 and n_envs=21)


..:: Antarctica Rendering Engine 2.0 ::..
..:: Antarctica Rendering Engine 2.0 ::..
..:: Antarctica Rendering Engine 2.0 ::..
..:: Antarctica Rendering Engine 2.0 ::..
..:: Antarctica Rendering Engine 2.0 ::..
..:: Antarctica Rendering Engine 2.0 ::..
..:: Antarctica Rendering Engine 2.0 ::..
..:: Antarctica Rendering Engine 2.0 ::..
..:: Antarctica Rendering Engine 2.0 ::..
..:: Antarctica Rendering Engine 2.0 ::..
..:: Antarctica Rendering Engine 2.0 ::..
..:: Antarctica Rendering Engine 2.0 ::..
..:: Antarctica Rendering Engine 2.0 ::..
..:: Antarctica Rendering Engine 2.0 ::..
..:: Antarctica Rendering Engine 2.0 ::..
..:: Antarctica Rendering Engine 2.0 ::..
..:: Antarctica Rendering Engine 2.0 ::..
..:: Antarctica Rendering Engine 2.0 ::..
..:: Antarctica Rendering Engine 2.0 ::..
..:: Antarctica Rendering Engine 2.0 ::..
DOING 3000 3000000
Logging to ./outputs/PPO_5


Output()

---------------------------------
| rollout/           |          |
|    ep_len_mean     | 855      |
|    ep_rew_mean     | 657      |
| time/              |          |
|    fps             | 89       |
|    iterations      | 1        |
|    time_elapsed    | 705      |
|    total_timesteps | 63000    |
---------------------------------


---------------------------------------
| rollout/                |           |
|    ep_len_mean          | 843       |
|    ep_rew_mean          | 619       |
| time/                   |           |
|    fps                  | 87        |
|    iterations           | 2         |
|    time_elapsed         | 1441      |
|    total_timesteps      | 126000    |
| train/                  |           |
|    approx_kl            | 1.9916539 |
|    clip_fraction        | 0.548     |
|    clip_range           | 0.001     |
|    entropy_loss         | -0.134    |
|    explained_variance   | 0.891     |
|    learning_rate        | 0.0001    |
|    loss                 | 44.8      |
|    n_updates            | 10        |
|    policy_gradient_loss | 0.106     |
|    value_loss           | 141       |
---------------------------------------


----------------------------------------
| rollout/                |            |
|    ep_len_mean          | 845        |
|    ep_rew_mean          | 578        |
| time/                   |            |
|    fps                  | 86         |
|    iterations           | 3          |
|    time_elapsed         | 2177       |
|    total_timesteps      | 189000     |
| train/                  |            |
|    approx_kl            | 0.35606694 |
|    clip_fraction        | 0.403      |
|    clip_range           | 0.001      |
|    entropy_loss         | -0.105     |
|    explained_variance   | 0.879      |
|    learning_rate        | 0.0001     |
|    loss                 | 47.5       |
|    n_updates            | 20         |
|    policy_gradient_loss | 0.0422     |
|    value_loss           | 130        |
----------------------------------------


---------------------------------------
| rollout/                |           |
|    ep_len_mean          | 1.1e+03   |
|    ep_rew_mean          | 458       |
| time/                   |           |
|    fps                  | 86        |
|    iterations           | 4         |
|    time_elapsed         | 2915      |
|    total_timesteps      | 252000    |
| train/                  |           |
|    approx_kl            | 4.7284894 |
|    clip_fraction        | 0.435     |
|    clip_range           | 0.001     |
|    entropy_loss         | -0.108    |
|    explained_variance   | 0.911     |
|    learning_rate        | 0.0001    |
|    loss                 | 22.8      |
|    n_updates            | 30        |
|    policy_gradient_loss | 0.0669    |
|    value_loss           | 128       |
---------------------------------------


--------------------------------------
| rollout/                |          |
|    ep_len_mean          | 1.27e+03 |
|    ep_rew_mean          | 336      |
| time/                   |          |
|    fps                  | 86       |
|    iterations           | 5        |
|    time_elapsed         | 3653     |
|    total_timesteps      | 315000   |
| train/                  |          |
|    approx_kl            | 8.185188 |
|    clip_fraction        | 0.711    |
|    clip_range           | 0.001    |
|    entropy_loss         | -0.0951  |
|    explained_variance   | 0.906    |
|    learning_rate        | 0.0001   |
|    loss                 | 15.6     |
|    n_updates            | 40       |
|    policy_gradient_loss | 0.142    |
|    value_loss           | 49.7     |
--------------------------------------


----------------------------------------
| rollout/                |            |
|    ep_len_mean          | 1.06e+03   |
|    ep_rew_mean          | 413        |
| time/                   |            |
|    fps                  | 86         |
|    iterations           | 6          |
|    time_elapsed         | 4394       |
|    total_timesteps      | 378000     |
| train/                  |            |
|    approx_kl            | 0.06829537 |
|    clip_fraction        | 0.436      |
|    clip_range           | 0.001      |
|    entropy_loss         | -0.105     |
|    explained_variance   | 0.928      |
|    learning_rate        | 0.0001     |
|    loss                 | 14.7       |
|    n_updates            | 50         |
|    policy_gradient_loss | 0.0695     |
|    value_loss           | 62.5       |
----------------------------------------


---------------------------------------
| rollout/                |           |
|    ep_len_mean          | 886       |
|    ep_rew_mean          | 527       |
| time/                   |           |
|    fps                  | 85        |
|    iterations           | 7         |
|    time_elapsed         | 5129      |
|    total_timesteps      | 441000    |
| train/                  |           |
|    approx_kl            | 1.1489141 |
|    clip_fraction        | 0.47      |
|    clip_range           | 0.001     |
|    entropy_loss         | -0.104    |
|    explained_variance   | 0.925     |
|    learning_rate        | 0.0001    |
|    loss                 | 11.1      |
|    n_updates            | 60        |
|    policy_gradient_loss | 0.0491    |
|    value_loss           | 71.6      |
---------------------------------------


----------------------------------------
| rollout/                |            |
|    ep_len_mean          | 858        |
|    ep_rew_mean          | 553        |
| time/                   |            |
|    fps                  | 85         |
|    iterations           | 8          |
|    time_elapsed         | 5866       |
|    total_timesteps      | 504000     |
| train/                  |            |
|    approx_kl            | 0.24628992 |
|    clip_fraction        | 0.454      |
|    clip_range           | 0.001      |
|    entropy_loss         | -0.121     |
|    explained_variance   | 0.896      |
|    learning_rate        | 0.0001     |
|    loss                 | 29.9       |
|    n_updates            | 70         |
|    policy_gradient_loss | 0.0382     |
|    value_loss           | 108        |
----------------------------------------


----------------------------------------
| rollout/                |            |
|    ep_len_mean          | 920        |
|    ep_rew_mean          | 456        |
| time/                   |            |
|    fps                  | 85         |
|    iterations           | 9          |
|    time_elapsed         | 6615       |
|    total_timesteps      | 567000     |
| train/                  |            |
|    approx_kl            | 0.24628979 |
|    clip_fraction        | 0.443      |
|    clip_range           | 0.001      |
|    entropy_loss         | -0.105     |
|    explained_variance   | 0.92       |
|    learning_rate        | 0.0001     |
|    loss                 | 27.3       |
|    n_updates            | 80         |
|    policy_gradient_loss | 0.0343     |
|    value_loss           | 84.8       |
----------------------------------------


----------------------------------------
| rollout/                |            |
|    ep_len_mean          | 905        |
|    ep_rew_mean          | 468        |
| time/                   |            |
|    fps                  | 85         |
|    iterations           | 10         |
|    time_elapsed         | 7367       |
|    total_timesteps      | 630000     |
| train/                  |            |
|    approx_kl            | 0.20149677 |
|    clip_fraction        | 0.45       |
|    clip_range           | 0.001      |
|    entropy_loss         | -0.105     |
|    explained_variance   | 0.92       |
|    learning_rate        | 0.0001     |
|    loss                 | 28.3       |
|    n_updates            | 90         |
|    policy_gradient_loss | 0.0304     |
|    value_loss           | 74.6       |
----------------------------------------


----------------------------------------
| rollout/                |            |
|    ep_len_mean          | 953        |
|    ep_rew_mean          | 423        |
| time/                   |            |
|    fps                  | 85         |
|    iterations           | 11         |
|    time_elapsed         | 8121       |
|    total_timesteps      | 693000     |
| train/                  |            |
|    approx_kl            | 0.48035473 |
|    clip_fraction        | 0.416      |
|    clip_range           | 0.001      |
|    entropy_loss         | -0.0977    |
|    explained_variance   | 0.92       |
|    learning_rate        | 0.0001     |
|    loss                 | 35.1       |
|    n_updates            | 100        |
|    policy_gradient_loss | 0.0324     |
|    value_loss           | 85.4       |
----------------------------------------


----------------------------------------
| rollout/                |            |
|    ep_len_mean          | 945        |
|    ep_rew_mean          | 498        |
| time/                   |            |
|    fps                  | 85         |
|    iterations           | 12         |
|    time_elapsed         | 8873       |
|    total_timesteps      | 756000     |
| train/                  |            |
|    approx_kl            | 0.27986655 |
|    clip_fraction        | 0.332      |
|    clip_range           | 0.001      |
|    entropy_loss         | -0.0666    |
|    explained_variance   | 0.927      |
|    learning_rate        | 0.0001     |
|    loss                 | 50.6       |
|    n_updates            | 110        |
|    policy_gradient_loss | 0.038      |
|    value_loss           | 57         |
----------------------------------------


---------------------------------------
| rollout/                |           |
|    ep_len_mean          | 850       |
|    ep_rew_mean          | 522       |
| time/                   |           |
|    fps                  | 85        |
|    iterations           | 13        |
|    time_elapsed         | 9625      |
|    total_timesteps      | 819000    |
| train/                  |           |
|    approx_kl            | 1.5812303 |
|    clip_fraction        | 0.398     |
|    clip_range           | 0.001     |
|    entropy_loss         | -0.0803   |
|    explained_variance   | 0.911     |
|    learning_rate        | 0.0001    |
|    loss                 | 36.1      |
|    n_updates            | 120       |
|    policy_gradient_loss | 0.0606    |
|    value_loss           | 93.7      |
---------------------------------------


----------------------------------------
| rollout/                |            |
|    ep_len_mean          | 846        |
|    ep_rew_mean          | 542        |
| time/                   |            |
|    fps                  | 84         |
|    iterations           | 14         |
|    time_elapsed         | 10376      |
|    total_timesteps      | 882000     |
| train/                  |            |
|    approx_kl            | 0.39989844 |
|    clip_fraction        | 0.344      |
|    clip_range           | 0.001      |
|    entropy_loss         | -0.0807    |
|    explained_variance   | 0.921      |
|    learning_rate        | 0.0001     |
|    loss                 | 16.6       |
|    n_updates            | 130        |
|    policy_gradient_loss | 0.0281     |
|    value_loss           | 80.6       |
----------------------------------------


----------------------------------------
| rollout/                |            |
|    ep_len_mean          | 835        |
|    ep_rew_mean          | 563        |
| time/                   |            |
|    fps                  | 84         |
|    iterations           | 15         |
|    time_elapsed         | 11128      |
|    total_timesteps      | 945000     |
| train/                  |            |
|    approx_kl            | 0.14013131 |
|    clip_fraction        | 0.315      |
|    clip_range           | 0.001      |
|    entropy_loss         | -0.0873    |
|    explained_variance   | 0.913      |
|    learning_rate        | 0.0001     |
|    loss                 | 16.3       |
|    n_updates            | 140        |
|    policy_gradient_loss | 0.0266     |
|    value_loss           | 91.4       |
----------------------------------------


----------------------------------------
| rollout/                |            |
|    ep_len_mean          | 859        |
|    ep_rew_mean          | 534        |
| time/                   |            |
|    fps                  | 84         |
|    iterations           | 16         |
|    time_elapsed         | 11881      |
|    total_timesteps      | 1008000    |
| train/                  |            |
|    approx_kl            | 0.50188035 |
|    clip_fraction        | 0.338      |
|    clip_range           | 0.001      |
|    entropy_loss         | -0.0775    |
|    explained_variance   | 0.915      |
|    learning_rate        | 0.0001     |
|    loss                 | 20.6       |
|    n_updates            | 150        |
|    policy_gradient_loss | 0.0465     |
|    value_loss           | 98.2       |
----------------------------------------


----------------------------------------
| rollout/                |            |
|    ep_len_mean          | 873        |
|    ep_rew_mean          | 514        |
| time/                   |            |
|    fps                  | 84         |
|    iterations           | 17         |
|    time_elapsed         | 12633      |
|    total_timesteps      | 1071000    |
| train/                  |            |
|    approx_kl            | 0.27697086 |
|    clip_fraction        | 0.3        |
|    clip_range           | 0.001      |
|    entropy_loss         | -0.0946    |
|    explained_variance   | 0.92       |
|    learning_rate        | 0.0001     |
|    loss                 | 10.1       |
|    n_updates            | 160        |
|    policy_gradient_loss | 0.0202     |
|    value_loss           | 89.5       |
----------------------------------------


---------------------------------------
| rollout/                |           |
|    ep_len_mean          | 1.12e+03  |
|    ep_rew_mean          | 402       |
| time/                   |           |
|    fps                  | 84        |
|    iterations           | 18        |
|    time_elapsed         | 13391     |
|    total_timesteps      | 1134000   |
| train/                  |           |
|    approx_kl            | 3.5516844 |
|    clip_fraction        | 0.347     |
|    clip_range           | 0.001     |
|    entropy_loss         | -0.0651   |
|    explained_variance   | 0.931     |
|    learning_rate        | 0.0001    |
|    loss                 | 32.7      |
|    n_updates            | 170       |
|    policy_gradient_loss | 0.0526    |
|    value_loss           | 89.9      |
---------------------------------------


--------------------------------------
| rollout/                |          |
|    ep_len_mean          | 1.38e+03 |
|    ep_rew_mean          | 224      |
| time/                   |          |
|    fps                  | 84       |
|    iterations           | 19       |
|    time_elapsed         | 14150    |
|    total_timesteps      | 1197000  |
| train/                  |          |
|    approx_kl            | 0.754445 |
|    clip_fraction        | 0.301    |
|    clip_range           | 0.001    |
|    entropy_loss         | -0.0576  |
|    explained_variance   | 0.918    |
|    learning_rate        | 0.0001   |
|    loss                 | 10.7     |
|    n_updates            | 180      |
|    policy_gradient_loss | 0.0525   |
|    value_loss           | 29.6     |
--------------------------------------


----------------------------------------
| rollout/                |            |
|    ep_len_mean          | 1.5e+03    |
|    ep_rew_mean          | 146        |
| time/                   |            |
|    fps                  | 84         |
|    iterations           | 20         |
|    time_elapsed         | 14909      |
|    total_timesteps      | 1260000    |
| train/                  |            |
|    approx_kl            | 0.25081944 |
|    clip_fraction        | 0.282      |
|    clip_range           | 0.001      |
|    entropy_loss         | -0.0364    |
|    explained_variance   | 0.954      |
|    learning_rate        | 0.0001     |
|    loss                 | 12.4       |
|    n_updates            | 190        |
|    policy_gradient_loss | 0.0165     |
|    value_loss           | 25.3       |
----------------------------------------


--------------------------------------
| rollout/                |          |
|    ep_len_mean          | 1.49e+03 |
|    ep_rew_mean          | 140      |
| time/                   |          |
|    fps                  | 84       |
|    iterations           | 21       |
|    time_elapsed         | 15669    |
|    total_timesteps      | 1323000  |
| train/                  |          |
|    approx_kl            | 1.02942  |
|    clip_fraction        | 0.265    |
|    clip_range           | 0.001    |
|    entropy_loss         | -0.0534  |
|    explained_variance   | 0.95     |
|    learning_rate        | 0.0001   |
|    loss                 | 10.3     |
|    n_updates            | 200      |
|    policy_gradient_loss | 0.0304   |
|    value_loss           | 21.5     |
--------------------------------------


---------------------------------------
| rollout/                |           |
|    ep_len_mean          | 1.48e+03  |
|    ep_rew_mean          | 118       |
| time/                   |           |
|    fps                  | 84        |
|    iterations           | 22        |
|    time_elapsed         | 16435     |
|    total_timesteps      | 1386000   |
| train/                  |           |
|    approx_kl            | 0.4558342 |
|    clip_fraction        | 0.228     |
|    clip_range           | 0.001     |
|    entropy_loss         | -0.0428   |
|    explained_variance   | 0.937     |
|    learning_rate        | 0.0001    |
|    loss                 | 8.75      |
|    n_updates            | 210       |
|    policy_gradient_loss | 0.0298    |
|    value_loss           | 21.3      |
---------------------------------------


--------------------------------------
| rollout/                |          |
|    ep_len_mean          | 1.49e+03 |
|    ep_rew_mean          | 116      |
| time/                   |          |
|    fps                  | 84       |
|    iterations           | 23       |
|    time_elapsed         | 17200    |
|    total_timesteps      | 1449000  |
| train/                  |          |
|    approx_kl            | 0.832254 |
|    clip_fraction        | 0.285    |
|    clip_range           | 0.001    |
|    entropy_loss         | -0.0617  |
|    explained_variance   | 0.947    |
|    learning_rate        | 0.0001   |
|    loss                 | 1.33     |
|    n_updates            | 220      |
|    policy_gradient_loss | 0.0385   |
|    value_loss           | 19.1     |
--------------------------------------


----------------------------------------
| rollout/                |            |
|    ep_len_mean          | 1.5e+03    |
|    ep_rew_mean          | 117        |
| time/                   |            |
|    fps                  | 84         |
|    iterations           | 24         |
|    time_elapsed         | 17967      |
|    total_timesteps      | 1512000    |
| train/                  |            |
|    approx_kl            | 0.24242583 |
|    clip_fraction        | 0.147      |
|    clip_range           | 0.001      |
|    entropy_loss         | -0.0282    |
|    explained_variance   | 0.962      |
|    learning_rate        | 0.0001     |
|    loss                 | 1.29       |
|    n_updates            | 230        |
|    policy_gradient_loss | 0.0227     |
|    value_loss           | 15.6       |
----------------------------------------


---------------------------------------
| rollout/                |           |
|    ep_len_mean          | 1.5e+03   |
|    ep_rew_mean          | 106       |
| time/                   |           |
|    fps                  | 84        |
|    iterations           | 25        |
|    time_elapsed         | 18732     |
|    total_timesteps      | 1575000   |
| train/                  |           |
|    approx_kl            | 0.8086234 |
|    clip_fraction        | 0.211     |
|    clip_range           | 0.001     |
|    entropy_loss         | -0.0429   |
|    explained_variance   | 0.964     |
|    learning_rate        | 0.0001    |
|    loss                 | 2.25      |
|    n_updates            | 240       |
|    policy_gradient_loss | 0.0196    |
|    value_loss           | 13.4      |
---------------------------------------


---------------------------------------
| rollout/                |           |
|    ep_len_mean          | 1.5e+03   |
|    ep_rew_mean          | 68.2      |
| time/                   |           |
|    fps                  | 83        |
|    iterations           | 26        |
|    time_elapsed         | 19502     |
|    total_timesteps      | 1638000   |
| train/                  |           |
|    approx_kl            | 1.0310566 |
|    clip_fraction        | 0.255     |
|    clip_range           | 0.001     |
|    entropy_loss         | -0.0392   |
|    explained_variance   | 0.965     |
|    learning_rate        | 0.0001    |
|    loss                 | 1.11      |
|    n_updates            | 250       |
|    policy_gradient_loss | 0.0225    |
|    value_loss           | 12        |
---------------------------------------


----------------------------------------
| rollout/                |            |
|    ep_len_mean          | 1.5e+03    |
|    ep_rew_mean          | 70.6       |
| time/                   |            |
|    fps                  | 83         |
|    iterations           | 27         |
|    time_elapsed         | 20267      |
|    total_timesteps      | 1701000    |
| train/                  |            |
|    approx_kl            | 0.22980233 |
|    clip_fraction        | 0.181      |
|    clip_range           | 0.001      |
|    entropy_loss         | -0.0336    |
|    explained_variance   | 0.952      |
|    learning_rate        | 0.0001     |
|    loss                 | 6.03       |
|    n_updates            | 260        |
|    policy_gradient_loss | 0.0169     |
|    value_loss           | 13.3       |
----------------------------------------


----------------------------------------
| rollout/                |            |
|    ep_len_mean          | 1.49e+03   |
|    ep_rew_mean          | 80.9       |
| time/                   |            |
|    fps                  | 83         |
|    iterations           | 28         |
|    time_elapsed         | 21032      |
|    total_timesteps      | 1764000    |
| train/                  |            |
|    approx_kl            | 0.18089044 |
|    clip_fraction        | 0.0953     |
|    clip_range           | 0.001      |
|    entropy_loss         | -0.0199    |
|    explained_variance   | 0.963      |
|    learning_rate        | 0.0001     |
|    loss                 | 0.745      |
|    n_updates            | 270        |
|    policy_gradient_loss | 0.0114     |
|    value_loss           | 13.1       |
----------------------------------------


---------------------------------------
| rollout/                |           |
|    ep_len_mean          | 1.49e+03  |
|    ep_rew_mean          | 92.5      |
| time/                   |           |
|    fps                  | 83        |
|    iterations           | 29        |
|    time_elapsed         | 21800     |
|    total_timesteps      | 1827000   |
| train/                  |           |
|    approx_kl            | 2.9805934 |
|    clip_fraction        | 0.0859    |
|    clip_range           | 0.001     |
|    entropy_loss         | -0.0172   |
|    explained_variance   | 0.957     |
|    learning_rate        | 0.0001    |
|    loss                 | 7.77      |
|    n_updates            | 280       |
|    policy_gradient_loss | 0.0153    |
|    value_loss           | 13.2      |
---------------------------------------


----------------------------------------
| rollout/                |            |
|    ep_len_mean          | 1.5e+03    |
|    ep_rew_mean          | 82.1       |
| time/                   |            |
|    fps                  | 83         |
|    iterations           | 30         |
|    time_elapsed         | 22568      |
|    total_timesteps      | 1890000    |
| train/                  |            |
|    approx_kl            | 0.28053057 |
|    clip_fraction        | 0.0973     |
|    clip_range           | 0.001      |
|    entropy_loss         | -0.0168    |
|    explained_variance   | 0.96       |
|    learning_rate        | 0.0001     |
|    loss                 | 27.1       |
|    n_updates            | 290        |
|    policy_gradient_loss | 0.0124     |
|    value_loss           | 14.8       |
----------------------------------------


---------------------------------------
| rollout/                |           |
|    ep_len_mean          | 1.5e+03   |
|    ep_rew_mean          | 88.5      |
| time/                   |           |
|    fps                  | 83        |
|    iterations           | 31        |
|    time_elapsed         | 23339     |
|    total_timesteps      | 1953000   |
| train/                  |           |
|    approx_kl            | 2.2089455 |
|    clip_fraction        | 0.138     |
|    clip_range           | 0.001     |
|    entropy_loss         | -0.0239   |
|    explained_variance   | 0.955     |
|    learning_rate        | 0.0001    |
|    loss                 | 2.3       |
|    n_updates            | 300       |
|    policy_gradient_loss | 0.0219    |
|    value_loss           | 13.2      |
---------------------------------------


---------------------------------------
| rollout/                |           |
|    ep_len_mean          | 1.5e+03   |
|    ep_rew_mean          | 96.4      |
| time/                   |           |
|    fps                  | 83        |
|    iterations           | 32        |
|    time_elapsed         | 24106     |
|    total_timesteps      | 2016000   |
| train/                  |           |
|    approx_kl            | 0.3350855 |
|    clip_fraction        | 0.137     |
|    clip_range           | 0.001     |
|    entropy_loss         | -0.0227   |
|    explained_variance   | 0.967     |
|    learning_rate        | 0.0001    |
|    loss                 | 3.31      |
|    n_updates            | 310       |
|    policy_gradient_loss | 0.0207    |
|    value_loss           | 10.9      |
---------------------------------------


----------------------------------------
| rollout/                |            |
|    ep_len_mean          | 1.5e+03    |
|    ep_rew_mean          | 104        |
| time/                   |            |
|    fps                  | 83         |
|    iterations           | 33         |
|    time_elapsed         | 24872      |
|    total_timesteps      | 2079000    |
| train/                  |            |
|    approx_kl            | 0.88348216 |
|    clip_fraction        | 0.125      |
|    clip_range           | 0.001      |
|    entropy_loss         | -0.0379    |
|    explained_variance   | 0.969      |
|    learning_rate        | 0.0001     |
|    loss                 | 1.72       |
|    n_updates            | 320        |
|    policy_gradient_loss | 0.0178     |
|    value_loss           | 11.1       |
----------------------------------------


---------------------------------------
| rollout/                |           |
|    ep_len_mean          | 1.49e+03  |
|    ep_rew_mean          | 95        |
| time/                   |           |
|    fps                  | 83        |
|    iterations           | 34        |
|    time_elapsed         | 25638     |
|    total_timesteps      | 2142000   |
| train/                  |           |
|    approx_kl            | 0.3305661 |
|    clip_fraction        | 0.0899    |
|    clip_range           | 0.001     |
|    entropy_loss         | -0.0167   |
|    explained_variance   | 0.946     |
|    learning_rate        | 0.0001    |
|    loss                 | 0.593     |
|    n_updates            | 330       |
|    policy_gradient_loss | 0.0127    |
|    value_loss           | 15.3      |
---------------------------------------


----------------------------------------
| rollout/                |            |
|    ep_len_mean          | 1.49e+03   |
|    ep_rew_mean          | 80.1       |
| time/                   |            |
|    fps                  | 83         |
|    iterations           | 35         |
|    time_elapsed         | 26401      |
|    total_timesteps      | 2205000    |
| train/                  |            |
|    approx_kl            | 0.14985324 |
|    clip_fraction        | 0.097      |
|    clip_range           | 0.001      |
|    entropy_loss         | -0.0181    |
|    explained_variance   | 0.954      |
|    learning_rate        | 0.0001     |
|    loss                 | 3.52       |
|    n_updates            | 340        |
|    policy_gradient_loss | 0.012      |
|    value_loss           | 12.5       |
----------------------------------------


----------------------------------------
| rollout/                |            |
|    ep_len_mean          | 1.49e+03   |
|    ep_rew_mean          | 78.8       |
| time/                   |            |
|    fps                  | 83         |
|    iterations           | 36         |
|    time_elapsed         | 27165      |
|    total_timesteps      | 2268000    |
| train/                  |            |
|    approx_kl            | 0.40532517 |
|    clip_fraction        | 0.0899     |
|    clip_range           | 0.001      |
|    entropy_loss         | -0.0152    |
|    explained_variance   | 0.962      |
|    learning_rate        | 0.0001     |
|    loss                 | 1.17       |
|    n_updates            | 350        |
|    policy_gradient_loss | 0.0209     |
|    value_loss           | 12.6       |
----------------------------------------


----------------------------------------
| rollout/                |            |
|    ep_len_mean          | 1.5e+03    |
|    ep_rew_mean          | 69.4       |
| time/                   |            |
|    fps                  | 83         |
|    iterations           | 37         |
|    time_elapsed         | 27929      |
|    total_timesteps      | 2331000    |
| train/                  |            |
|    approx_kl            | 0.44565266 |
|    clip_fraction        | 0.0827     |
|    clip_range           | 0.001      |
|    entropy_loss         | -0.0147    |
|    explained_variance   | 0.959      |
|    learning_rate        | 0.0001     |
|    loss                 | 9.03       |
|    n_updates            | 360        |
|    policy_gradient_loss | 0.0124     |
|    value_loss           | 13.6       |
----------------------------------------


--------------------------------------
| rollout/                |          |
|    ep_len_mean          | 1.49e+03 |
|    ep_rew_mean          | 76.1     |
| time/                   |          |
|    fps                  | 83       |
|    iterations           | 38       |
|    time_elapsed         | 28697    |
|    total_timesteps      | 2394000  |
| train/                  |          |
|    approx_kl            | 0.39989  |
|    clip_fraction        | 0.0803   |
|    clip_range           | 0.001    |
|    entropy_loss         | -0.0189  |
|    explained_variance   | 0.964    |
|    learning_rate        | 0.0001   |
|    loss                 | 6.54     |
|    n_updates            | 370      |
|    policy_gradient_loss | 0.0099   |
|    value_loss           | 10.8     |
--------------------------------------


----------------------------------------
| rollout/                |            |
|    ep_len_mean          | 1.49e+03   |
|    ep_rew_mean          | 85.8       |
| time/                   |            |
|    fps                  | 83         |
|    iterations           | 39         |
|    time_elapsed         | 29457      |
|    total_timesteps      | 2457000    |
| train/                  |            |
|    approx_kl            | 0.21995556 |
|    clip_fraction        | 0.0759     |
|    clip_range           | 0.001      |
|    entropy_loss         | -0.016     |
|    explained_variance   | 0.967      |
|    learning_rate        | 0.0001     |
|    loss                 | 17.7       |
|    n_updates            | 380        |
|    policy_gradient_loss | 0.0125     |
|    value_loss           | 9.73       |
----------------------------------------


---------------------------------------
| rollout/                |           |
|    ep_len_mean          | 1.49e+03  |
|    ep_rew_mean          | 95.8      |
| time/                   |           |
|    fps                  | 83        |
|    iterations           | 40        |
|    time_elapsed         | 30206     |
|    total_timesteps      | 2520000   |
| train/                  |           |
|    approx_kl            | 0.9548488 |
|    clip_fraction        | 0.0774    |
|    clip_range           | 0.001     |
|    entropy_loss         | -0.0222   |
|    explained_variance   | 0.968     |
|    learning_rate        | 0.0001    |
|    loss                 | 0.479     |
|    n_updates            | 390       |
|    policy_gradient_loss | 0.00963   |
|    value_loss           | 10.3      |
---------------------------------------


----------------------------------------
| rollout/                |            |
|    ep_len_mean          | 1.5e+03    |
|    ep_rew_mean          | 86         |
| time/                   |            |
|    fps                  | 83         |
|    iterations           | 41         |
|    time_elapsed         | 30952      |
|    total_timesteps      | 2583000    |
| train/                  |            |
|    approx_kl            | 0.70789516 |
|    clip_fraction        | 0.0685     |
|    clip_range           | 0.001      |
|    entropy_loss         | -0.0123    |
|    explained_variance   | 0.976      |
|    learning_rate        | 0.0001     |
|    loss                 | 1.07       |
|    n_updates            | 400        |
|    policy_gradient_loss | 0.0121     |
|    value_loss           | 8.41       |
----------------------------------------


----------------------------------------
| rollout/                |            |
|    ep_len_mean          | 1.5e+03    |
|    ep_rew_mean          | 67.6       |
| time/                   |            |
|    fps                  | 83         |
|    iterations           | 42         |
|    time_elapsed         | 31702      |
|    total_timesteps      | 2646000    |
| train/                  |            |
|    approx_kl            | 0.71174777 |
|    clip_fraction        | 0.0715     |
|    clip_range           | 0.001      |
|    entropy_loss         | -0.012     |
|    explained_variance   | 0.961      |
|    learning_rate        | 0.0001     |
|    loss                 | 2.41       |
|    n_updates            | 410        |
|    policy_gradient_loss | 0.013      |
|    value_loss           | 11.7       |
----------------------------------------


----------------------------------------
| rollout/                |            |
|    ep_len_mean          | 1.5e+03    |
|    ep_rew_mean          | 62.3       |
| time/                   |            |
|    fps                  | 83         |
|    iterations           | 43         |
|    time_elapsed         | 32456      |
|    total_timesteps      | 2709000    |
| train/                  |            |
|    approx_kl            | 0.27907774 |
|    clip_fraction        | 0.0433     |
|    clip_range           | 0.001      |
|    entropy_loss         | -0.0103    |
|    explained_variance   | 0.963      |
|    learning_rate        | 0.0001     |
|    loss                 | 0.724      |
|    n_updates            | 420        |
|    policy_gradient_loss | 0.00796    |
|    value_loss           | 8.36       |
----------------------------------------


----------------------------------------
| rollout/                |            |
|    ep_len_mean          | 1.5e+03    |
|    ep_rew_mean          | 65.4       |
| time/                   |            |
|    fps                  | 83         |
|    iterations           | 44         |
|    time_elapsed         | 33204      |
|    total_timesteps      | 2772000    |
| train/                  |            |
|    approx_kl            | 0.41051355 |
|    clip_fraction        | 0.0534     |
|    clip_range           | 0.001      |
|    entropy_loss         | -0.0116    |
|    explained_variance   | 0.972      |
|    learning_rate        | 0.0001     |
|    loss                 | 0.794      |
|    n_updates            | 430        |
|    policy_gradient_loss | 0.013      |
|    value_loss           | 8.25       |
----------------------------------------


----------------------------------------
| rollout/                |            |
|    ep_len_mean          | 1.5e+03    |
|    ep_rew_mean          | 78.9       |
| time/                   |            |
|    fps                  | 83         |
|    iterations           | 45         |
|    time_elapsed         | 33951      |
|    total_timesteps      | 2835000    |
| train/                  |            |
|    approx_kl            | 0.23730262 |
|    clip_fraction        | 0.0638     |
|    clip_range           | 0.001      |
|    entropy_loss         | -0.0132    |
|    explained_variance   | 0.963      |
|    learning_rate        | 0.0001     |
|    loss                 | 1.74       |
|    n_updates            | 440        |
|    policy_gradient_loss | 0.0115     |
|    value_loss           | 10         |
----------------------------------------


----------------------------------------
| rollout/                |            |
|    ep_len_mean          | 1.5e+03    |
|    ep_rew_mean          | 87.7       |
| time/                   |            |
|    fps                  | 83         |
|    iterations           | 46         |
|    time_elapsed         | 34700      |
|    total_timesteps      | 2898000    |
| train/                  |            |
|    approx_kl            | 0.51432586 |
|    clip_fraction        | 0.0752     |
|    clip_range           | 0.001      |
|    entropy_loss         | -0.0127    |
|    explained_variance   | 0.969      |
|    learning_rate        | 0.0001     |
|    loss                 | 5.98       |
|    n_updates            | 450        |
|    policy_gradient_loss | 0.0113     |
|    value_loss           | 9.7        |
----------------------------------------


----------------------------------------
| rollout/                |            |
|    ep_len_mean          | 1.5e+03    |
|    ep_rew_mean          | 113        |
| time/                   |            |
|    fps                  | 83         |
|    iterations           | 47         |
|    time_elapsed         | 35448      |
|    total_timesteps      | 2961000    |
| train/                  |            |
|    approx_kl            | 0.58305323 |
|    clip_fraction        | 0.0587     |
|    clip_range           | 0.001      |
|    entropy_loss         | -0.0107    |
|    explained_variance   | 0.971      |
|    learning_rate        | 0.0001     |
|    loss                 | 5.41       |
|    n_updates            | 460        |
|    policy_gradient_loss | 0.0102     |
|    value_loss           | 10.4       |
----------------------------------------


---------------------------------------
| rollout/                |           |
|    ep_len_mean          | 1.5e+03   |
|    ep_rew_mean          | 113       |
| time/                   |           |
|    fps                  | 83        |
|    iterations           | 48        |
|    time_elapsed         | 36195     |
|    total_timesteps      | 3024000   |
| train/                  |           |
|    approx_kl            | 1.0745189 |
|    clip_fraction        | 0.0591    |
|    clip_range           | 0.001     |
|    entropy_loss         | -0.012    |
|    explained_variance   | 0.968     |
|    learning_rate        | 0.0001    |
|    loss                 | 0.799     |
|    n_updates            | 470       |
|    policy_gradient_loss | 0.016     |
|    value_loss           | 11.9      |
---------------------------------------


In [3]:
model.save(f'ppti_ppo_{n_steps}')