In [4]:
import torch
from torch import nn
from deeplotx import MultiHeadFeedForward
from stable_baselines3.common.policies import ActorCriticPolicy


class MyActorCritic(nn.Module):
    def __init__(self, feature_dim: int, policy_output_dim: int, value_output_dim: int, device: str = 'cpu', dtype: torch.dtype = torch.float32):
        super().__init__()  
        self.latent_dim_pi = policy_output_dim  
        self.latent_dim_vf = value_output_dim  
        self.policy_net = nn.Sequential(  
            MultiHeadFeedForward(feature_dim=feature_dim, num_heads=50, device=device, dtype=dtype), nn.Linear(in_features=feature_dim, out_features=policy_output_dim, device=torch.device(device), dtype=dtype)
        )  
        self.value_net = nn.Sequential(  
            MultiHeadFeedForward(feature_dim=feature_dim, num_heads=50, device=device, dtype=dtype), nn.Linear(in_features=feature_dim, out_features=value_output_dim, device=torch.device(device), dtype=dtype)
        )
    
    def forward(self, x: torch.Tensor) -> tuple[torch.Tensor, torch.Tensor]:
        return self.policy_net.forward(x), self.value_net.forward(x)
    
    def forward_actor(self, x: torch.Tensor):  
        return self.policy_net.forward(x)  
  
    def forward_critic(self, x: torch.Tensor):  
        return self.value_net.forward(x)


class MyPolicy(ActorCriticPolicy):
    def _build_mlp_extractor(self) -> None:  
        self.mlp_extractor = MyActorCritic(self.features_dim, 64, 64)

In [5]:
import numpy as np
from gymnasium import spaces, Env


class VectorClassificationEnv(Env):  
    def __init__(self, features: np.ndarray, labels: np.ndarray):  
        super().__init__()  
        self.observation_space = spaces.Box(  
            low=-np.inf, high=np.inf,   
            shape=(features.shape[1],),   
            dtype=np.float32  
        )  
        self.action_space = spaces.Discrete(2)  
        self.features = features
        self.labels = labels  
        self.ptr = 0  
      
    def step(self, action: int):
        true_label = self.labels[self.ptr]  
        reward = 1.0 if action == true_label else -1.0  
        self.ptr += 1  
        terminated = self.ptr >= len(self.features)  
        if not terminated:  
            observation = self.features[self.ptr]  
        else:  
            observation = np.zeros(self.observation_space.shape)  
        return observation, reward, terminated, False, {}  
    
    def reset(self, seed=None, options=None):  
        super().reset(seed=seed)  
        self.ptr = 0  
        observation = self.features[self.ptr]
        return observation, {}

In [6]:
from stable_baselines3 import PPO
from stable_baselines3.common.callbacks import EvalCallback, CheckpointCallback  

# 创建 RL 环境
env = VectorClassificationEnv(np.random.randn(1000, 128), np.random.randint(0, 2, 1000))

# 定义 PPO 算法实现
ppo = PPO(  
    policy=MyPolicy,                            # 策略网络类型
    env=env,                                    # 环境实例
    learning_rate=2e-6,                         # 学习率  
    n_steps=2048,                               # 单个 rollout 的采样时间步
    batch_size=64,                              # 批次大小  
    n_epochs=10,                                # 在单个 rollout buffer 上的训练轮数
    gamma=0.99,                                 # 折扣因子
    gae_lambda=0.95,                            # GAE lambda 参数  
    clip_range=0.2,                             # PPO 裁剪范围  
    clip_range_vf=None,                         # 价值函数裁剪范围  
    normalize_advantage=True,                   # 是否标准化优势  
    ent_coef=0.0,                               # 熵系数
    vf_coef=0.5,                                # 价值函数系数
    max_grad_norm=0.5,                          # 梯度裁剪最大范数
    use_sde=False,                              # 是否使用状态依赖探索 (SDE)
    sde_sample_freq=-1,                         # SDE采样频率
    rollout_buffer_class=None,                  # rollout 缓冲区类
    rollout_buffer_kwargs=None,                 # rollout 缓冲区参数  
    target_kl=None,                             # 目标 KL 散度  
    stats_window_size=100,                      # 统计窗口大小  
    tensorboard_log=None,                       # TensorBoard 日志路径, None 表示不记录日志  
    policy_kwargs=None,                         # 策略额外参数  
    verbose=2,                                  # 日志详细程度  
    seed=None,                                  # 随机种子  
    device="auto",                              # 计算设备  
    _init_setup_model=True                      # 是否初始化模型  
)

# 创建训练过程回调函数
eval_callback = EvalCallback(env, best_model_save_path='./logs/', log_path='./logs/', eval_freq=500)  
checkpoint_callback = CheckpointCallback(save_freq=1000, save_path='./logs/')

# 开始训练
ppo.learn(  
    total_timesteps=50000,  
    callback=[eval_callback, checkpoint_callback],  
    log_interval=10,  
    tb_log_name="ppo_run",  
    progress_bar=True  
)

Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.


Output()

---------------------------------
| eval/              |          |
|    mean_ep_length  | 1e+03    |
|    mean_reward     | 34       |
| time/              |          |
|    total_timesteps | 500      |
---------------------------------


---------------------------------
| eval/              |          |
|    mean_ep_length  | 1e+03    |
|    mean_reward     | 34       |
| time/              |          |
|    total_timesteps | 1000     |
---------------------------------


---------------------------------
| eval/              |          |
|    mean_ep_length  | 1e+03    |
|    mean_reward     | 34       |
| time/              |          |
|    total_timesteps | 1500     |
---------------------------------


---------------------------------
| eval/              |          |
|    mean_ep_length  | 1e+03    |
|    mean_reward     | 34       |
| time/              |          |
|    total_timesteps | 2000     |
---------------------------------


------------------------------------------
| eval/                   |              |
|    mean_ep_length       | 1e+03        |
|    mean_reward          | 32           |
| time/                   |              |
|    total_timesteps      | 2500         |
| train/                  |              |
|    approx_kl            | 0.0001294725 |
|    clip_fraction        | 0            |
|    clip_range           | 0.2          |
|    entropy_loss         | -0.693       |
|    explained_variance   | -0.377       |
|    learning_rate        | 2e-06        |
|    loss                 | 3.95         |
|    n_updates            | 10           |
|    policy_gradient_loss | -0.000682    |
|    value_loss           | 8.01         |
------------------------------------------


---------------------------------
| eval/              |          |
|    mean_ep_length  | 1e+03    |
|    mean_reward     | 32       |
| time/              |          |
|    total_timesteps | 3000     |
---------------------------------


---------------------------------
| eval/              |          |
|    mean_ep_length  | 1e+03    |
|    mean_reward     | 32       |
| time/              |          |
|    total_timesteps | 3500     |
---------------------------------


---------------------------------
| eval/              |          |
|    mean_ep_length  | 1e+03    |
|    mean_reward     | 32       |
| time/              |          |
|    total_timesteps | 4000     |
---------------------------------


-------------------------------------------
| eval/                   |               |
|    mean_ep_length       | 1e+03         |
|    mean_reward          | 32            |
| time/                   |               |
|    total_timesteps      | 4500          |
| train/                  |               |
|    approx_kl            | 2.5820074e-05 |
|    clip_fraction        | 0             |
|    clip_range           | 0.2           |
|    entropy_loss         | -0.693        |
|    explained_variance   | -0.101        |
|    learning_rate        | 2e-06         |
|    loss                 | 4.69          |
|    n_updates            | 20            |
|    policy_gradient_loss | -0.000371     |
|    value_loss           | 8.63          |
-------------------------------------------


---------------------------------
| eval/              |          |
|    mean_ep_length  | 1e+03    |
|    mean_reward     | 32       |
| time/              |          |
|    total_timesteps | 5000     |
---------------------------------


---------------------------------
| eval/              |          |
|    mean_ep_length  | 1e+03    |
|    mean_reward     | 32       |
| time/              |          |
|    total_timesteps | 5500     |
---------------------------------


---------------------------------
| eval/              |          |
|    mean_ep_length  | 1e+03    |
|    mean_reward     | 32       |
| time/              |          |
|    total_timesteps | 6000     |
---------------------------------


-------------------------------------------
| eval/                   |               |
|    mean_ep_length       | 1e+03         |
|    mean_reward          | 60            |
| time/                   |               |
|    total_timesteps      | 6500          |
| train/                  |               |
|    approx_kl            | 3.0226918e-05 |
|    clip_fraction        | 0             |
|    clip_range           | 0.2           |
|    entropy_loss         | -0.693        |
|    explained_variance   | -0.037        |
|    learning_rate        | 2e-06         |
|    loss                 | 3.68          |
|    n_updates            | 30            |
|    policy_gradient_loss | -0.000372     |
|    value_loss           | 6.75          |
-------------------------------------------


---------------------------------
| eval/              |          |
|    mean_ep_length  | 1e+03    |
|    mean_reward     | 60       |
| time/              |          |
|    total_timesteps | 7000     |
---------------------------------


---------------------------------
| eval/              |          |
|    mean_ep_length  | 1e+03    |
|    mean_reward     | 60       |
| time/              |          |
|    total_timesteps | 7500     |
---------------------------------


---------------------------------
| eval/              |          |
|    mean_ep_length  | 1e+03    |
|    mean_reward     | 60       |
| time/              |          |
|    total_timesteps | 8000     |
---------------------------------


------------------------------------------
| eval/                   |              |
|    mean_ep_length       | 1e+03        |
|    mean_reward          | 62           |
| time/                   |              |
|    total_timesteps      | 8500         |
| train/                  |              |
|    approx_kl            | 8.176168e-06 |
|    clip_fraction        | 0            |
|    clip_range           | 0.2          |
|    entropy_loss         | -0.693       |
|    explained_variance   | -0.0638      |
|    learning_rate        | 2e-06        |
|    loss                 | 3.92         |
|    n_updates            | 40           |
|    policy_gradient_loss | -0.00031     |
|    value_loss           | 9.31         |
------------------------------------------


---------------------------------
| eval/              |          |
|    mean_ep_length  | 1e+03    |
|    mean_reward     | 62       |
| time/              |          |
|    total_timesteps | 9000     |
---------------------------------


---------------------------------
| eval/              |          |
|    mean_ep_length  | 1e+03    |
|    mean_reward     | 62       |
| time/              |          |
|    total_timesteps | 9500     |
---------------------------------


---------------------------------
| eval/              |          |
|    mean_ep_length  | 1e+03    |
|    mean_reward     | 62       |
| time/              |          |
|    total_timesteps | 10000    |
---------------------------------


-------------------------------------------
| eval/                   |               |
|    mean_ep_length       | 1e+03         |
|    mean_reward          | 96            |
| time/                   |               |
|    total_timesteps      | 10500         |
| train/                  |               |
|    approx_kl            | 1.6331876e-05 |
|    clip_fraction        | 0             |
|    clip_range           | 0.2           |
|    entropy_loss         | -0.693        |
|    explained_variance   | -0.0487       |
|    learning_rate        | 2e-06         |
|    loss                 | 3.83          |
|    n_updates            | 50            |
|    policy_gradient_loss | -0.000366     |
|    value_loss           | 8.37          |
-------------------------------------------


---------------------------------
| eval/              |          |
|    mean_ep_length  | 1e+03    |
|    mean_reward     | 96       |
| time/              |          |
|    total_timesteps | 11000    |
---------------------------------


---------------------------------
| eval/              |          |
|    mean_ep_length  | 1e+03    |
|    mean_reward     | 96       |
| time/              |          |
|    total_timesteps | 11500    |
---------------------------------


---------------------------------
| eval/              |          |
|    mean_ep_length  | 1e+03    |
|    mean_reward     | 96       |
| time/              |          |
|    total_timesteps | 12000    |
---------------------------------


-------------------------------------------
| eval/                   |               |
|    mean_ep_length       | 1e+03         |
|    mean_reward          | 86            |
| time/                   |               |
|    total_timesteps      | 12500         |
| train/                  |               |
|    approx_kl            | 1.0283897e-05 |
|    clip_fraction        | 0             |
|    clip_range           | 0.2           |
|    entropy_loss         | -0.693        |
|    explained_variance   | 0.0199        |
|    learning_rate        | 2e-06         |
|    loss                 | 3.9           |
|    n_updates            | 60            |
|    policy_gradient_loss | -0.000346     |
|    value_loss           | 10            |
-------------------------------------------


---------------------------------
| eval/              |          |
|    mean_ep_length  | 1e+03    |
|    mean_reward     | 86       |
| time/              |          |
|    total_timesteps | 13000    |
---------------------------------


---------------------------------
| eval/              |          |
|    mean_ep_length  | 1e+03    |
|    mean_reward     | 86       |
| time/              |          |
|    total_timesteps | 13500    |
---------------------------------


---------------------------------
| eval/              |          |
|    mean_ep_length  | 1e+03    |
|    mean_reward     | 86       |
| time/              |          |
|    total_timesteps | 14000    |
---------------------------------


------------------------------------------
| eval/                   |              |
|    mean_ep_length       | 1e+03        |
|    mean_reward          | 70           |
| time/                   |              |
|    total_timesteps      | 14500        |
| train/                  |              |
|    approx_kl            | 8.176436e-05 |
|    clip_fraction        | 0            |
|    clip_range           | 0.2          |
|    entropy_loss         | -0.693       |
|    explained_variance   | -0.0233      |
|    learning_rate        | 2e-06        |
|    loss                 | 4.35         |
|    n_updates            | 70           |
|    policy_gradient_loss | -0.000602    |
|    value_loss           | 8.04         |
------------------------------------------


---------------------------------
| eval/              |          |
|    mean_ep_length  | 1e+03    |
|    mean_reward     | 70       |
| time/              |          |
|    total_timesteps | 15000    |
---------------------------------


---------------------------------
| eval/              |          |
|    mean_ep_length  | 1e+03    |
|    mean_reward     | 70       |
| time/              |          |
|    total_timesteps | 15500    |
---------------------------------


---------------------------------
| eval/              |          |
|    mean_ep_length  | 1e+03    |
|    mean_reward     | 70       |
| time/              |          |
|    total_timesteps | 16000    |
---------------------------------


------------------------------------------
| eval/                   |              |
|    mean_ep_length       | 1e+03        |
|    mean_reward          | 68           |
| time/                   |              |
|    total_timesteps      | 16500        |
| train/                  |              |
|    approx_kl            | 3.702563e-05 |
|    clip_fraction        | 0            |
|    clip_range           | 0.2          |
|    entropy_loss         | -0.693       |
|    explained_variance   | -0.104       |
|    learning_rate        | 2e-06        |
|    loss                 | 3.82         |
|    n_updates            | 80           |
|    policy_gradient_loss | -0.000509    |
|    value_loss           | 8.64         |
------------------------------------------


---------------------------------
| eval/              |          |
|    mean_ep_length  | 1e+03    |
|    mean_reward     | 68       |
| time/              |          |
|    total_timesteps | 17000    |
---------------------------------


---------------------------------
| eval/              |          |
|    mean_ep_length  | 1e+03    |
|    mean_reward     | 68       |
| time/              |          |
|    total_timesteps | 17500    |
---------------------------------


---------------------------------
| eval/              |          |
|    mean_ep_length  | 1e+03    |
|    mean_reward     | 68       |
| time/              |          |
|    total_timesteps | 18000    |
---------------------------------


-------------------------------------------
| eval/                   |               |
|    mean_ep_length       | 1e+03         |
|    mean_reward          | 106           |
| time/                   |               |
|    total_timesteps      | 18500         |
| train/                  |               |
|    approx_kl            | 2.5251153e-05 |
|    clip_fraction        | 0             |
|    clip_range           | 0.2           |
|    entropy_loss         | -0.692        |
|    explained_variance   | -0.0742       |
|    learning_rate        | 2e-06         |
|    loss                 | 4.15          |
|    n_updates            | 90            |
|    policy_gradient_loss | -0.000475     |
|    value_loss           | 9.16          |
-------------------------------------------


---------------------------------
| eval/              |          |
|    mean_ep_length  | 1e+03    |
|    mean_reward     | 106      |
| time/              |          |
|    total_timesteps | 19000    |
---------------------------------


---------------------------------
| eval/              |          |
|    mean_ep_length  | 1e+03    |
|    mean_reward     | 106      |
| time/              |          |
|    total_timesteps | 19500    |
---------------------------------


---------------------------------
| eval/              |          |
|    mean_ep_length  | 1e+03    |
|    mean_reward     | 106      |
| time/              |          |
|    total_timesteps | 20000    |
---------------------------------
------------------------------
| time/              |       |
|    fps             | 7     |
|    iterations      | 10    |
|    time_elapsed    | 2697  |
|    total_timesteps | 20480 |
------------------------------


-------------------------------------------
| eval/                   |               |
|    mean_ep_length       | 1e+03         |
|    mean_reward          | 162           |
| time/                   |               |
|    total_timesteps      | 20500         |
| train/                  |               |
|    approx_kl            | 0.00031076826 |
|    clip_fraction        | 0             |
|    clip_range           | 0.2           |
|    entropy_loss         | -0.693        |
|    explained_variance   | -0.0159       |
|    learning_rate        | 2e-06         |
|    loss                 | 3.54          |
|    n_updates            | 100           |
|    policy_gradient_loss | -0.00131      |
|    value_loss           | 7.89          |
-------------------------------------------


---------------------------------
| eval/              |          |
|    mean_ep_length  | 1e+03    |
|    mean_reward     | 162      |
| time/              |          |
|    total_timesteps | 21000    |
---------------------------------


---------------------------------
| eval/              |          |
|    mean_ep_length  | 1e+03    |
|    mean_reward     | 162      |
| time/              |          |
|    total_timesteps | 21500    |
---------------------------------


---------------------------------
| eval/              |          |
|    mean_ep_length  | 1e+03    |
|    mean_reward     | 162      |
| time/              |          |
|    total_timesteps | 22000    |
---------------------------------


---------------------------------
| eval/              |          |
|    mean_ep_length  | 1e+03    |
|    mean_reward     | 162      |
| time/              |          |
|    total_timesteps | 22500    |
---------------------------------


------------------------------------------
| eval/                   |              |
|    mean_ep_length       | 1e+03        |
|    mean_reward          | 170          |
| time/                   |              |
|    total_timesteps      | 23000        |
| train/                  |              |
|    approx_kl            | 3.597475e-05 |
|    clip_fraction        | 0            |
|    clip_range           | 0.2          |
|    entropy_loss         | -0.693       |
|    explained_variance   | -0.0691      |
|    learning_rate        | 2e-06        |
|    loss                 | 5.64         |
|    n_updates            | 110          |
|    policy_gradient_loss | -0.000688    |
|    value_loss           | 8.76         |
------------------------------------------


---------------------------------
| eval/              |          |
|    mean_ep_length  | 1e+03    |
|    mean_reward     | 170      |
| time/              |          |
|    total_timesteps | 23500    |
---------------------------------


---------------------------------
| eval/              |          |
|    mean_ep_length  | 1e+03    |
|    mean_reward     | 170      |
| time/              |          |
|    total_timesteps | 24000    |
---------------------------------


---------------------------------
| eval/              |          |
|    mean_ep_length  | 1e+03    |
|    mean_reward     | 170      |
| time/              |          |
|    total_timesteps | 24500    |
---------------------------------


------------------------------------------
| eval/                   |              |
|    mean_ep_length       | 1e+03        |
|    mean_reward          | 120          |
| time/                   |              |
|    total_timesteps      | 25000        |
| train/                  |              |
|    approx_kl            | 8.371123e-05 |
|    clip_fraction        | 0            |
|    clip_range           | 0.2          |
|    entropy_loss         | -0.693       |
|    explained_variance   | -0.0551      |
|    learning_rate        | 2e-06        |
|    loss                 | 4.91         |
|    n_updates            | 120          |
|    policy_gradient_loss | -0.000798    |
|    value_loss           | 9.22         |
------------------------------------------


---------------------------------
| eval/              |          |
|    mean_ep_length  | 1e+03    |
|    mean_reward     | 120      |
| time/              |          |
|    total_timesteps | 25500    |
---------------------------------


---------------------------------
| eval/              |          |
|    mean_ep_length  | 1e+03    |
|    mean_reward     | 120      |
| time/              |          |
|    total_timesteps | 26000    |
---------------------------------


---------------------------------
| eval/              |          |
|    mean_ep_length  | 1e+03    |
|    mean_reward     | 120      |
| time/              |          |
|    total_timesteps | 26500    |
---------------------------------


------------------------------------------
| eval/                   |              |
|    mean_ep_length       | 1e+03        |
|    mean_reward          | 118          |
| time/                   |              |
|    total_timesteps      | 27000        |
| train/                  |              |
|    approx_kl            | 5.245171e-05 |
|    clip_fraction        | 0            |
|    clip_range           | 0.2          |
|    entropy_loss         | -0.692       |
|    explained_variance   | 0.0168       |
|    learning_rate        | 2e-06        |
|    loss                 | 4.57         |
|    n_updates            | 130          |
|    policy_gradient_loss | -0.000945    |
|    value_loss           | 8.37         |
------------------------------------------


---------------------------------
| eval/              |          |
|    mean_ep_length  | 1e+03    |
|    mean_reward     | 118      |
| time/              |          |
|    total_timesteps | 27500    |
---------------------------------


---------------------------------
| eval/              |          |
|    mean_ep_length  | 1e+03    |
|    mean_reward     | 118      |
| time/              |          |
|    total_timesteps | 28000    |
---------------------------------


---------------------------------
| eval/              |          |
|    mean_ep_length  | 1e+03    |
|    mean_reward     | 118      |
| time/              |          |
|    total_timesteps | 28500    |
---------------------------------


------------------------------------------
| eval/                   |              |
|    mean_ep_length       | 1e+03        |
|    mean_reward          | 66           |
| time/                   |              |
|    total_timesteps      | 29000        |
| train/                  |              |
|    approx_kl            | 0.0001706291 |
|    clip_fraction        | 0            |
|    clip_range           | 0.2          |
|    entropy_loss         | -0.692       |
|    explained_variance   | -0.0379      |
|    learning_rate        | 2e-06        |
|    loss                 | 5.03         |
|    n_updates            | 140          |
|    policy_gradient_loss | -0.00117     |
|    value_loss           | 9.43         |
------------------------------------------


---------------------------------
| eval/              |          |
|    mean_ep_length  | 1e+03    |
|    mean_reward     | 66       |
| time/              |          |
|    total_timesteps | 29500    |
---------------------------------


---------------------------------
| eval/              |          |
|    mean_ep_length  | 1e+03    |
|    mean_reward     | 66       |
| time/              |          |
|    total_timesteps | 30000    |
---------------------------------


---------------------------------
| eval/              |          |
|    mean_ep_length  | 1e+03    |
|    mean_reward     | 66       |
| time/              |          |
|    total_timesteps | 30500    |
---------------------------------


-----------------------------------------
| eval/                   |             |
|    mean_ep_length       | 1e+03       |
|    mean_reward          | 110         |
| time/                   |             |
|    total_timesteps      | 31000       |
| train/                  |             |
|    approx_kl            | 6.58276e-05 |
|    clip_fraction        | 0           |
|    clip_range           | 0.2         |
|    entropy_loss         | -0.692      |
|    explained_variance   | -0.0472     |
|    learning_rate        | 2e-06       |
|    loss                 | 5.24        |
|    n_updates            | 150         |
|    policy_gradient_loss | -0.000895   |
|    value_loss           | 8.86        |
-----------------------------------------


---------------------------------
| eval/              |          |
|    mean_ep_length  | 1e+03    |
|    mean_reward     | 110      |
| time/              |          |
|    total_timesteps | 31500    |
---------------------------------


---------------------------------
| eval/              |          |
|    mean_ep_length  | 1e+03    |
|    mean_reward     | 110      |
| time/              |          |
|    total_timesteps | 32000    |
---------------------------------


---------------------------------
| eval/              |          |
|    mean_ep_length  | 1e+03    |
|    mean_reward     | 110      |
| time/              |          |
|    total_timesteps | 32500    |
---------------------------------


-------------------------------------------
| eval/                   |               |
|    mean_ep_length       | 1e+03         |
|    mean_reward          | 170           |
| time/                   |               |
|    total_timesteps      | 33000         |
| train/                  |               |
|    approx_kl            | 0.00010648908 |
|    clip_fraction        | 0             |
|    clip_range           | 0.2           |
|    entropy_loss         | -0.692        |
|    explained_variance   | -0.0721       |
|    learning_rate        | 2e-06         |
|    loss                 | 4.29          |
|    n_updates            | 160           |
|    policy_gradient_loss | -0.00134      |
|    value_loss           | 6.85          |
-------------------------------------------


---------------------------------
| eval/              |          |
|    mean_ep_length  | 1e+03    |
|    mean_reward     | 170      |
| time/              |          |
|    total_timesteps | 33500    |
---------------------------------


---------------------------------
| eval/              |          |
|    mean_ep_length  | 1e+03    |
|    mean_reward     | 170      |
| time/              |          |
|    total_timesteps | 34000    |
---------------------------------


---------------------------------
| eval/              |          |
|    mean_ep_length  | 1e+03    |
|    mean_reward     | 170      |
| time/              |          |
|    total_timesteps | 34500    |
---------------------------------


-------------------------------------------
| eval/                   |               |
|    mean_ep_length       | 1e+03         |
|    mean_reward          | 218           |
| time/                   |               |
|    total_timesteps      | 35000         |
| train/                  |               |
|    approx_kl            | 0.00076225173 |
|    clip_fraction        | 0             |
|    clip_range           | 0.2           |
|    entropy_loss         | -0.692        |
|    explained_variance   | 0.00855       |
|    learning_rate        | 2e-06         |
|    loss                 | 4.7           |
|    n_updates            | 170           |
|    policy_gradient_loss | -0.00254      |
|    value_loss           | 8.2           |
-------------------------------------------


---------------------------------
| eval/              |          |
|    mean_ep_length  | 1e+03    |
|    mean_reward     | 218      |
| time/              |          |
|    total_timesteps | 35500    |
---------------------------------


---------------------------------
| eval/              |          |
|    mean_ep_length  | 1e+03    |
|    mean_reward     | 218      |
| time/              |          |
|    total_timesteps | 36000    |
---------------------------------


---------------------------------
| eval/              |          |
|    mean_ep_length  | 1e+03    |
|    mean_reward     | 218      |
| time/              |          |
|    total_timesteps | 36500    |
---------------------------------


-------------------------------------------
| eval/                   |               |
|    mean_ep_length       | 1e+03         |
|    mean_reward          | 254           |
| time/                   |               |
|    total_timesteps      | 37000         |
| train/                  |               |
|    approx_kl            | 0.00016690194 |
|    clip_fraction        | 0             |
|    clip_range           | 0.2           |
|    entropy_loss         | -0.691        |
|    explained_variance   | -0.0492       |
|    learning_rate        | 2e-06         |
|    loss                 | 3.84          |
|    n_updates            | 180           |
|    policy_gradient_loss | -0.00165      |
|    value_loss           | 7.6           |
-------------------------------------------


---------------------------------
| eval/              |          |
|    mean_ep_length  | 1e+03    |
|    mean_reward     | 254      |
| time/              |          |
|    total_timesteps | 37500    |
---------------------------------


---------------------------------
| eval/              |          |
|    mean_ep_length  | 1e+03    |
|    mean_reward     | 254      |
| time/              |          |
|    total_timesteps | 38000    |
---------------------------------


---------------------------------
| eval/              |          |
|    mean_ep_length  | 1e+03    |
|    mean_reward     | 254      |
| time/              |          |
|    total_timesteps | 38500    |
---------------------------------


-------------------------------------------
| eval/                   |               |
|    mean_ep_length       | 1e+03         |
|    mean_reward          | 198           |
| time/                   |               |
|    total_timesteps      | 39000         |
| train/                  |               |
|    approx_kl            | 0.00054252055 |
|    clip_fraction        | 0             |
|    clip_range           | 0.2           |
|    entropy_loss         | -0.689        |
|    explained_variance   | -0.0152       |
|    learning_rate        | 2e-06         |
|    loss                 | 3.83          |
|    n_updates            | 190           |
|    policy_gradient_loss | -0.00236      |
|    value_loss           | 7.33          |
-------------------------------------------


---------------------------------
| eval/              |          |
|    mean_ep_length  | 1e+03    |
|    mean_reward     | 198      |
| time/              |          |
|    total_timesteps | 39500    |
---------------------------------


---------------------------------
| eval/              |          |
|    mean_ep_length  | 1e+03    |
|    mean_reward     | 198      |
| time/              |          |
|    total_timesteps | 40000    |
---------------------------------


---------------------------------
| eval/              |          |
|    mean_ep_length  | 1e+03    |
|    mean_reward     | 198      |
| time/              |          |
|    total_timesteps | 40500    |
---------------------------------
------------------------------
| time/              |       |
|    fps             | 7     |
|    iterations      | 20    |
|    time_elapsed    | 5433  |
|    total_timesteps | 40960 |
------------------------------


-------------------------------------------
| eval/                   |               |
|    mean_ep_length       | 1e+03         |
|    mean_reward          | 226           |
| time/                   |               |
|    total_timesteps      | 41000         |
| train/                  |               |
|    approx_kl            | 0.00011797494 |
|    clip_fraction        | 0             |
|    clip_range           | 0.2           |
|    entropy_loss         | -0.688        |
|    explained_variance   | -0.0045       |
|    learning_rate        | 2e-06         |
|    loss                 | 3.66          |
|    n_updates            | 200           |
|    policy_gradient_loss | -0.00154      |
|    value_loss           | 9.9           |
-------------------------------------------


---------------------------------
| eval/              |          |
|    mean_ep_length  | 1e+03    |
|    mean_reward     | 226      |
| time/              |          |
|    total_timesteps | 41500    |
---------------------------------


---------------------------------
| eval/              |          |
|    mean_ep_length  | 1e+03    |
|    mean_reward     | 226      |
| time/              |          |
|    total_timesteps | 42000    |
---------------------------------


---------------------------------
| eval/              |          |
|    mean_ep_length  | 1e+03    |
|    mean_reward     | 226      |
| time/              |          |
|    total_timesteps | 42500    |
---------------------------------


---------------------------------
| eval/              |          |
|    mean_ep_length  | 1e+03    |
|    mean_reward     | 226      |
| time/              |          |
|    total_timesteps | 43000    |
---------------------------------


-------------------------------------------
| eval/                   |               |
|    mean_ep_length       | 1e+03         |
|    mean_reward          | 248           |
| time/                   |               |
|    total_timesteps      | 43500         |
| train/                  |               |
|    approx_kl            | 0.00019773385 |
|    clip_fraction        | 0             |
|    clip_range           | 0.2           |
|    entropy_loss         | -0.687        |
|    explained_variance   | 0.0615        |
|    learning_rate        | 2e-06         |
|    loss                 | 3.24          |
|    n_updates            | 210           |
|    policy_gradient_loss | -0.00253      |
|    value_loss           | 7.05          |
-------------------------------------------


---------------------------------
| eval/              |          |
|    mean_ep_length  | 1e+03    |
|    mean_reward     | 248      |
| time/              |          |
|    total_timesteps | 44000    |
---------------------------------


---------------------------------
| eval/              |          |
|    mean_ep_length  | 1e+03    |
|    mean_reward     | 248      |
| time/              |          |
|    total_timesteps | 44500    |
---------------------------------


---------------------------------
| eval/              |          |
|    mean_ep_length  | 1e+03    |
|    mean_reward     | 248      |
| time/              |          |
|    total_timesteps | 45000    |
---------------------------------


-------------------------------------------
| eval/                   |               |
|    mean_ep_length       | 1e+03         |
|    mean_reward          | 250           |
| time/                   |               |
|    total_timesteps      | 45500         |
| train/                  |               |
|    approx_kl            | 0.00021582525 |
|    clip_fraction        | 0             |
|    clip_range           | 0.2           |
|    entropy_loss         | -0.686        |
|    explained_variance   | 0.00632       |
|    learning_rate        | 2e-06         |
|    loss                 | 4.52          |
|    n_updates            | 220           |
|    policy_gradient_loss | -0.00238      |
|    value_loss           | 8.38          |
-------------------------------------------


---------------------------------
| eval/              |          |
|    mean_ep_length  | 1e+03    |
|    mean_reward     | 250      |
| time/              |          |
|    total_timesteps | 46000    |
---------------------------------


---------------------------------
| eval/              |          |
|    mean_ep_length  | 1e+03    |
|    mean_reward     | 250      |
| time/              |          |
|    total_timesteps | 46500    |
---------------------------------


---------------------------------
| eval/              |          |
|    mean_ep_length  | 1e+03    |
|    mean_reward     | 250      |
| time/              |          |
|    total_timesteps | 47000    |
---------------------------------


-------------------------------------------
| eval/                   |               |
|    mean_ep_length       | 1e+03         |
|    mean_reward          | 260           |
| time/                   |               |
|    total_timesteps      | 47500         |
| train/                  |               |
|    approx_kl            | 0.00024886185 |
|    clip_fraction        | 0             |
|    clip_range           | 0.2           |
|    entropy_loss         | -0.684        |
|    explained_variance   | 0.0898        |
|    learning_rate        | 2e-06         |
|    loss                 | 5.52          |
|    n_updates            | 230           |
|    policy_gradient_loss | -0.00252      |
|    value_loss           | 9.49          |
-------------------------------------------


---------------------------------
| eval/              |          |
|    mean_ep_length  | 1e+03    |
|    mean_reward     | 260      |
| time/              |          |
|    total_timesteps | 48000    |
---------------------------------


---------------------------------
| eval/              |          |
|    mean_ep_length  | 1e+03    |
|    mean_reward     | 260      |
| time/              |          |
|    total_timesteps | 48500    |
---------------------------------


---------------------------------
| eval/              |          |
|    mean_ep_length  | 1e+03    |
|    mean_reward     | 260      |
| time/              |          |
|    total_timesteps | 49000    |
---------------------------------


-------------------------------------------
| eval/                   |               |
|    mean_ep_length       | 1e+03         |
|    mean_reward          | 270           |
| time/                   |               |
|    total_timesteps      | 49500         |
| train/                  |               |
|    approx_kl            | 0.00055756443 |
|    clip_fraction        | 0             |
|    clip_range           | 0.2           |
|    entropy_loss         | -0.683        |
|    explained_variance   | -0.0185       |
|    learning_rate        | 2e-06         |
|    loss                 | 3.22          |
|    n_updates            | 240           |
|    policy_gradient_loss | -0.00343      |
|    value_loss           | 7.33          |
-------------------------------------------


---------------------------------
| eval/              |          |
|    mean_ep_length  | 1e+03    |
|    mean_reward     | 270      |
| time/              |          |
|    total_timesteps | 50000    |
---------------------------------


---------------------------------
| eval/              |          |
|    mean_ep_length  | 1e+03    |
|    mean_reward     | 270      |
| time/              |          |
|    total_timesteps | 50500    |
---------------------------------


---------------------------------
| eval/              |          |
|    mean_ep_length  | 1e+03    |
|    mean_reward     | 270      |
| time/              |          |
|    total_timesteps | 51000    |
---------------------------------


<stable_baselines3.ppo.ppo.PPO at 0x24385095ee0>

In [1]:
vec_env = ppo.get_env()
obs = vec_env.reset()
for i in range(1000):
    action, _states = ppo.predict(obs, deterministic=True)
    obs, reward, done, info = vec_env.step(action)
    print('Action:', action)
    print('State:', _states)
    print('Observation:', obs)
    print('Reward:', reward)
    if done:
      obs = vec_env.reset() # env resets automatically

NameError: name 'ppo' is not defined