- 定义超参数
- 创建环境或模拟器
- 设计策略网络和价值模型
- 创建回放缓冲区和数据加载器
- 运行训练循环并分析结果




In [None]:
from collections import defaultdict  # 返回缺失键的默认值
import matplotlib.pyplot as plt  # 提供类似 MATLAB 的绘图框架
import torch  # PyTorch 的顶级包，一个深度学习框架
# PyTorch 模块的基类，使用 tensordict.TensorDict 作为输入和输出
from tensordict.nn import TensorDictModule
from tensordict.nn.distributions import NormalParamExtractor  # 从输入张量中提取正态分布的参数
from torch import nn  # PyTorch 的子包，提供用于构建神经网络的模块和类
from torchrl.collectors import SyncDataCollector  # 使用 actor 模块从多个环境中同步收集数据的类
from torchrl.data.replay_buffers import ReplayBuffer  # 存储从环境中收集的数据并允许进行训练采样的类
# 从重放缓冲存储器中无放回地采样数据的类
from torchrl.data.replay_buffers.samplers import SamplerWithoutReplacement
# 将数据存储为内存中的张量并允许延迟访问它们的类
from torchrl.data.replay_buffers.storages import LazyTensorStorage
from torchrl.envs import (
    Compose,  # 将多个环境变换组合成一个变换的类
    DoubleToFloat,  # 将环境观察或动作空间中的双精度张量转换为单精度张量的类
    ObservationNorm,  # 使用平均值和标准差的运行统计信息对观察进行归一化的类
    StepCounter,  # 计算在环境中执行了多少步并将其添加到观察空间作为额外维度的类
    TransformedEnv,  # 对观察、动作、奖励或完成标志应用变换的环境抽象基类
)
from torchrl.envs.libs.gym import GymEnv  # 实现了 torchrl.envs.Env 接口的 gym 环境包装器类
# 检查环境是否符合预期规范并设置支持探索模式（如随机 vs 确定性动作）的函数
from torchrl.envs.utils import check_env_specs, set_exploration_mode
# 根据观察输出动作概率分布的 actor（策略网络）类
from torchrl.modules import ProbabilisticActor, TanhNormal, ValueOperator
from torchrl.objectives import ClipPPOLoss  # 实现策略梯度方法中近端策略优化（PPO）损失函数剪裁版本的类
from torchrl.objectives.value import GAE  # 计算策略梯度方法中广义优势估计（GAE）信号的类
from tqdm import tqdm  # 提供循环和迭代器进度条


In [None]:
# Run on the first CUDA device when one is usable at runtime, otherwise on CPU.
# ``torch.cuda.is_available()`` checks the runtime; the deprecated
# ``torch.has_cuda`` flag only reports whether this PyTorch build was compiled
# with CUDA support.
device = "cuda:0" if torch.cuda.is_available() else "cpu"
num_cells = 256  # number of cells in each layer
lr = 3e-4  # learning rate handed to the Adam optimizer
max_grad_norm = 1.0  # threshold used for gradient-norm clipping in the training loop


In [None]:
# ``frame_skip`` is forwarded to the Gym environment; both frame budgets below
# are divided by it.
frame_skip = 1
# Frames gathered by the collector per batch.
frames_per_batch = 1_000 // frame_skip
# Total frames gathered over the whole run.
# For a complete training, bring the number of frames up to 1M
total_frames = 50_000 // frame_skip


In [None]:
# cardinality of the sub-samples gathered from the current data in the inner loop
# Size of each sub-sample drawn from the current batch in the inner loop.
sub_batch_size = 64
# Number of optimisation passes over each collected batch.
num_epochs = 10
# Clip value for the PPO loss: see the equation in the intro for more context.
clip_epsilon = 0.2
# gamma and lmbda are passed to the GAE advantage estimator.
gamma = 0.99
lmbda = 0.95
# Entropy coefficient for the PPO loss.
entropy_eps = 1e-4


In [None]:
# Wrap the Gym "InvertedDoublePendulum-v4" task as a TorchRL environment on the
# selected device.
base_env = GymEnv(
    "InvertedDoublePendulum-v4",
    frame_skip=frame_skip,
    device=device,
)


In [None]:
# Transform pipeline applied on top of the base environment, in this order:
# observation normalisation, double-to-float cast, step counting.
env_transforms = Compose(
    # normalize observations
    ObservationNorm(in_keys=["observation"]),
    DoubleToFloat(in_keys=["observation"]),
    StepCounter(),
)
env = TransformedEnv(base_env, env_transforms)


In [None]:
# Initialise the statistics of the first transform (the ObservationNorm) from
# 1000 iterations of environment data.
env.transform[0].init_stats(
    num_iter=1000,
    cat_dim=0,
    reduce_dim=0,
)


In [None]:
# Show the shape of the ``loc`` tensor held by the ObservationNorm transform.
obs_norm_loc = env.transform[0].loc
print("normalization constant shape:", obs_norm_loc.shape)


In [None]:
# Print each environment spec next to its label; attributes are looked up
# lazily so they are read in the same order as they are printed.
for spec_label, spec_attr in (
    ("observation_spec:", "observation_spec"),
    ("reward_spec:", "reward_spec"),
    ("input_spec:", "input_spec"),
    ("action_spec (as defined by input_spec):", "action_spec"),
):
    print(spec_label, getattr(env, spec_attr))


In [None]:
# Run TorchRL's spec check on the transformed environment before using it.
check_env_specs(env)


In [None]:
# Roll the environment out for a few steps (no policy is passed to ``rollout``)
# and inspect the result. The step count lives in one variable so the printed
# label can no longer disagree with the number of steps actually requested.
n_rollout_steps = 6
rollout = env.rollout(n_rollout_steps)
print(f"rollout of {n_rollout_steps} steps:", rollout)
print("Shape of the rollout TensorDict:", rollout.batch_size)


In [None]:
# Policy backbone: three (LazyLinear -> Tanh) hidden blocks, then a linear head
# with two outputs per action dimension, split by NormalParamExtractor.
actor_hidden = []
for _layer in range(3):
    actor_hidden.append(nn.LazyLinear(num_cells, device=device))
    actor_hidden.append(nn.Tanh())

actor_net = nn.Sequential(
    *actor_hidden,
    nn.LazyLinear(2 * env.action_spec.shape[-1], device=device),
    NormalParamExtractor(),
)


In [None]:
# Expose the actor network through a TensorDict interface: it reads
# "observation" and writes "loc" and "scale".
policy_module = TensorDictModule(
    actor_net,
    in_keys=["observation"],
    out_keys=["loc", "scale"],
)


In [None]:
# Wrap the TensorDict policy in a probabilistic actor that builds a TanhNormal
# distribution from "loc"/"scale", bounded by the action spec's limits.
action_bounds = env.action_spec.space
policy_module = ProbabilisticActor(
    module=policy_module,
    spec=env.action_spec,
    in_keys=["loc", "scale"],
    distribution_class=TanhNormal,
    distribution_kwargs=dict(
        min=action_bounds.minimum,
        max=action_bounds.maximum,
    ),
    # we'll need the log-prob for the numerator of the importance weights
    return_log_prob=True,
)


In [None]:
# Value backbone: three (LazyLinear -> Tanh) hidden blocks followed by a
# single-output linear head.
value_hidden = []
for _layer in range(3):
    value_hidden.append(nn.LazyLinear(num_cells, device=device))
    value_hidden.append(nn.Tanh())

value_net = nn.Sequential(*value_hidden, nn.LazyLinear(1, device=device))

# TensorDict wrapper that feeds "observation" to the value network.
value_module = ValueOperator(module=value_net, in_keys=["observation"])


In [None]:
# Smoke-test both modules on a freshly reset environment (one reset per module).
for run_label, run_module in (
    ("Running policy:", policy_module),
    ("Running value:", value_module),
):
    print(run_label, run_module(env.reset()))


In [None]:
# Collector that runs the policy in the environment and yields batches of
# ``frames_per_batch`` frames until ``total_frames`` have been gathered.
collector_options = dict(
    frames_per_batch=frames_per_batch,
    total_frames=total_frames,
    split_trajs=False,
    device=device,
)
collector = SyncDataCollector(env, policy_module, **collector_options)


In [None]:
# Buffer sized to hold one collected batch, sampled without replacement.
buffer_storage = LazyTensorStorage(frames_per_batch)
replay_buffer = ReplayBuffer(
    storage=buffer_storage,
    sampler=SamplerWithoutReplacement(),
)


In [None]:
# Generalised Advantage Estimation module, driven by the value network.
advantage_module = GAE(
    gamma=gamma, lmbda=lmbda, value_network=value_module, average_gae=True
)

# Clipped PPO objective. The entropy bonus is enabled only when the entropy
# coefficient is non-zero.
loss_module = ClipPPOLoss(
    actor=policy_module,
    critic=value_module,
    advantage_key="advantage",
    clip_epsilon=clip_epsilon,
    entropy_bonus=bool(entropy_eps),
    entropy_coef=entropy_eps,
    # these keys match by default but we set this for completeness
    value_target_key=advantage_module.value_target_key,
    critic_coef=1.0,
    # Use the shared hyperparameter rather than a second hard-coded 0.99, so
    # the loss and the advantage estimator cannot drift apart.
    gamma=gamma,
    loss_critic_type="smooth_l1",
)

# One Adam optimizer over every parameter registered in the loss module.
optim = torch.optim.Adam(loss_module.parameters(), lr)
# The scheduler is stepped once per collected batch, so its horizon is the
# number of batches; the learning rate is annealed down to 0.
scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(
    optim, T_max=total_frames // frames_per_batch, eta_min=0.0
)


In [None]:
# Main PPO training loop. For every batch yielded by the collector it runs
# ``num_epochs`` passes of mini-batch updates, records training metrics in
# ``logs``, evaluates the policy every 10 batches and steps the LR scheduler.
logs = defaultdict(list)
# The progress bar counts raw environment frames, hence the frame_skip factor.
pbar = tqdm(total=total_frames * frame_skip)
# Kept from the last evaluation so the description stays complete between evaluations.
eval_str = ""

# We iterate over the collector until it reaches the total number of frames it was
# designed to collect:
for i, tensordict_data in enumerate(collector):
    # we now have a batch of data to work with. Let's learn something from it.
    for _ in range(num_epochs):
        # We'll need an "advantage" signal to make PPO work.
        # We re-compute it at each epoch as its value depends on the value
        # network which is updated in the inner loop.
        advantage_module(tensordict_data)
        # Flatten the batch and push a CPU copy into the replay buffer.
        data_view = tensordict_data.reshape(-1)
        replay_buffer.extend(data_view.cpu())
        for _ in range(frames_per_batch // sub_batch_size):
            # Draw one mini-batch and move it back to the training device.
            subdata, *_ = replay_buffer.sample(sub_batch_size)
            loss_vals = loss_module(subdata.to(device))
            # Total loss = policy objective + critic loss + entropy term.
            loss_value = (
                loss_vals["loss_objective"]
                + loss_vals["loss_critic"]
                + loss_vals["loss_entropy"]
            )

            # Optimization: backward, grad clipping and optim step
            loss_value.backward()
            # this is not strictly mandatory but it's good practice to keep
            # your gradient norm bounded
            torch.nn.utils.clip_grad_norm_(
                loss_module.parameters(), max_grad_norm)
            optim.step()
            optim.zero_grad()

    # Per-batch training metrics: mean reward, max step count, current LR.
    logs["reward"].append(tensordict_data["next", "reward"].mean().item())
    pbar.update(tensordict_data.numel() * frame_skip)
    cum_reward_str = (
        f"average reward={logs['reward'][-1]: 4.4f} (init={logs['reward'][0]: 4.4f})"
    )
    logs["step_count"].append(tensordict_data["step_count"].max().item())
    stepcount_str = f"step count (max): {logs['step_count'][-1]}"
    logs["lr"].append(optim.param_groups[0]["lr"])
    lr_str = f"lr policy: {logs['lr'][-1]: 4.4f}"
    if i % 10 == 0:
        # We evaluate the policy once every 10 batches of data.
        # Evaluation is rather simple: execute the policy without exploration
        # (take the expected value of the action distribution) for a given
        # number of steps (1000, which is our env horizon).
        # The ``rollout`` method of the env can take a policy as argument:
        # it will then execute this policy at each step.
        with set_exploration_mode("mean"), torch.no_grad():
            # execute a rollout with the trained policy
            eval_rollout = env.rollout(1000, policy_module)
            logs["eval reward"].append(
                eval_rollout["next", "reward"].mean().item())
            logs["eval reward (sum)"].append(
                eval_rollout["next", "reward"].sum().item()
            )
            logs["eval step_count"].append(
                eval_rollout["step_count"].max().item())
            eval_str = (
                f"eval cumulative reward: {logs['eval reward (sum)'][-1]: 4.4f} "
                f"(init: {logs['eval reward (sum)'][0]: 4.4f}), "
                f"eval step-count: {logs['eval step_count'][-1]}"
            )
            # Drop the evaluation rollout once its metrics have been logged.
            del eval_rollout
    pbar.set_description(
        ", ".join([eval_str, cum_reward_str, stepcount_str, lr_str]))

    # We're also using a learning rate scheduler. Like the gradient clipping,
    # this is a nice-to-have but nothing necessary for PPO to work.
    scheduler.step()


In [None]:
# Draw the four logged curves on a 2x2 grid, one (log key, title) pair per panel.
plt.figure(figsize=(10, 10))
plot_panels = (
    ("reward", "training rewards (average)"),
    ("step_count", "Max step count (training)"),
    ("eval reward (sum)", "Return (test)"),
    ("eval step_count", "Max step count (test)"),
)
for panel_index, (log_key, panel_title) in enumerate(plot_panels, start=1):
    plt.subplot(2, 2, panel_index)
    plt.plot(logs[log_key])
    plt.title(panel_title)
plt.show()


## Stable Baselines3
1. import dependency
2. load environment
3. train an RL model
4. save and reload model

## import dependency

In [None]:
%pip install stable-baselines3[extra]
%pip install pyglet


In [None]:
# Path
import os
# Environment
import gym
# PPO
from stable_baselines3 import PPO
# Vectorize Environment
from stable_baselines3.common.vec_env import DummyVecEnv
# test the performance
from stable_baselines3.common.evaluation import evaluate_policy




## load environment

### OpenAI Gym Spaces
- Box
- Discrete
- Tuple
- Dict
- MultiBinary
- MultiDiscrete

In [None]:
# Gym id of the task used throughout the PPO/DQN section, and a plain
# (non-vectorized) instance of it for the random-policy test.
environment_name = 'CartPole-v0'
env = gym.make(environment_name)

In [None]:
# Play a handful of episodes with uniformly sampled actions and print each
# episode's accumulated reward. Uses the 4-tuple ``env.step`` return.
episodes = 5
for episode in range(episodes):
    state = env.reset()
    score = 0
    done = False

    while not done:
        # env.render()
        action = env.action_space.sample()
        n_state, reward, done, info = env.step(action)
        score = score + reward

    print('Episode:{} ,Score:{}'.format(episode, score))
env.close()
        

In [None]:
env.action_space

In [None]:
env.observation_space

## train an RL model

In [None]:
# Directory that receives the TensorBoard logs of every run in this notebook.
log_path = os.path.join('train','logs')

# Build the vectorized env from a factory that creates its own environment.
# The previous ``lambda: env`` closed over the very name that is rebound to the
# DummyVecEnv on the same line, which only worked because the factory happens
# to be called during construction. The id is bound as a default argument so
# the factory does not depend on later reassignments of ``environment_name``.
env = DummyVecEnv([lambda env_id=environment_name: gym.make(env_id)])
model = PPO('MlpPolicy',env=env, verbose=1,tensorboard_log=log_path)

In [None]:
# Train the PPO model for 20,000 timesteps.
model.learn(total_timesteps=20000)

## Save and Reload Model

In [None]:
# Save the trained model under train/model/PPO_Model_CartPole.
PPO_Path = os.path.join('train','model','PPO_Model_CartPole')
model.save(PPO_Path)

In [None]:
# Drop the in-memory model so the next cell really reloads it from disk.
del model

In [None]:
# Restore the saved model from PPO_Path and attach the current environment.
model = PPO.load(PPO_Path,env=env)

In [None]:
# Continue training the reloaded model for another 10,000 timesteps.
model.learn(total_timesteps=10000)

## testing and evaluating

In [None]:
# Evaluate the model over 10 episodes without rendering; the cell displays
# the value returned by evaluate_policy.
evaluate_policy(model=model,env=env,n_eval_episodes=10,render=False)

In [None]:
env.close()

## Test model

In [None]:
# Play a handful of episodes with actions predicted by the trained model and
# print each episode's accumulated reward.
# NOTE(review): ``env`` is presumably still the DummyVecEnv from the training
# cells, in which case obs/reward/done (and so the score) are length-1 arrays.
episodes = 5
for episode in range(episodes):
    obs = env.reset()
    done = False
    score = 0

    while not done:
        # env.render()
        action, state = model.predict(obs)
        obs, reward, done, info = env.step(action)
        score += reward

    print('Episode:{} ,Score:{}'.format(episode, score))
env.close()


In [None]:
env.close()

## View tensorboard

In [None]:
training_log_path = os.path.join('train','logs','PPO_1')

!tensorboard --logdir={training_log_path}

## add callbacks, alternate algorithms and architectures

In [None]:
from stable_baselines3.common.callbacks import EvalCallback, StopTrainingOnRewardThreshold


In [None]:
# Stop-training callback that uses a reward threshold of 200.
stop_callback = StopTrainingOnRewardThreshold(reward_threshold=200, verbose=1)
# Evaluation callback: runs every 10000 steps on ``env``, saves the best model
# under train/model and invokes the stop callback on each new best.
eval_callback = EvalCallback(
    env,
    callback_on_new_best=stop_callback,
    eval_freq=10000,
    best_model_save_path=os.path.join('train','model'),
    verbose=1,
)

In [None]:
# Fresh PPO model (default MLP policy) that logs to train/logs.
log_path = os.path.join('train', 'logs')
model = PPO(
    'MlpPolicy',
    env,
    verbose=1,
    tensorboard_log=log_path,
)

In [None]:
# Train for 10,000 timesteps with the evaluation callback attached.
model.learn(total_timesteps=10000, callback=eval_callback)

## change policies

In [None]:
# Custom network architecture: four hidden layers of 128 units each for the
# policy ("pi") and value ("vf") networks.
new_arch = {"pi": [128] * 4, "vf": [128] * 4}
model = PPO(
    'MlpPolicy',
    env,
    verbose=1,
    tensorboard_log=log_path,
    policy_kwargs=dict(net_arch=new_arch),
)

In [None]:
# Train the custom-architecture model for 10,000 timesteps with the evaluation callback.
model.learn(total_timesteps=10000, callback=eval_callback)


In [None]:
## use alternate algorithm

In [None]:
# Swap the algorithm: build a DQN model on the same environment and log directory.
from stable_baselines3 import DQN
model = DQN('MlpPolicy',env,verbose=1,tensorboard_log=log_path)

In [None]:
# Train the DQN model for 10,000 timesteps with the evaluation callback.
model.learn(total_timesteps=10000,callback=eval_callback)

# import dependencies

In [1]:
import gym
from stable_baselines3 import A2C
from stable_baselines3.common.vec_env import VecFrameStack
from stable_baselines3.common.evaluation import evaluate_policy
from stable_baselines3.common.env_util import make_atari_env
import os

In [None]:
# test environment

In [None]:
! python -m atari_py.import_roms .\ROMS\ROMS

In [2]:
# Gym id of the Atari task used in this section, and a plain (non-vectorized)
# instance of it for the random-policy test.
environment_name = 'Breakout-v0'
env = gym.make(environment_name)


In [None]:
env.reset()

In [None]:
env.action_space

In [None]:
env.observation_space

In [3]:
# Play a handful of Breakout episodes with uniformly sampled actions and print
# each episode's accumulated reward. Uses the 4-tuple ``env.step`` return.
episodes = 5
for episode in range(episodes):
    state = env.reset()
    score = 0
    done = False

    while not done:
        # env.render()
        action = env.action_space.sample()
        n_state, reward, done, info = env.step(action)
        score = score + reward

    print('Episode:{} ,Score:{}'.format(episode, score))
env.close()


Episode:0 ,Score:2.0
Episode:1 ,Score:2.0
Episode:2 ,Score:2.0
Episode:3 ,Score:1.0
Episode:4 ,Score:0.0


## Vectorize Environment and train model

In [4]:
# Four Breakout environments (seed 0), wrapped so that 4 frames are stacked
# per observation.
env = VecFrameStack(
    make_atari_env('Breakout-v0', n_envs=4, seed=0),
    n_stack=4,
)
# Reset once; the cell displays the initial stacked observations.
env.reset()

array([[[[0, 0, 0, 0],
         [0, 0, 0, 0],
         [0, 0, 0, 0],
         ...,
         [0, 0, 0, 0],
         [0, 0, 0, 0],
         [0, 0, 0, 0]],

        [[0, 0, 0, 0],
         [0, 0, 0, 0],
         [0, 0, 0, 0],
         ...,
         [0, 0, 0, 0],
         [0, 0, 0, 0],
         [0, 0, 0, 0]],

        [[0, 0, 0, 0],
         [0, 0, 0, 0],
         [0, 0, 0, 0],
         ...,
         [0, 0, 0, 0],
         [0, 0, 0, 0],
         [0, 0, 0, 0]],

        ...,

        [[0, 0, 0, 0],
         [0, 0, 0, 0],
         [0, 0, 0, 0],
         ...,
         [0, 0, 0, 0],
         [0, 0, 0, 0],
         [0, 0, 0, 0]],

        [[0, 0, 0, 0],
         [0, 0, 0, 0],
         [0, 0, 0, 0],
         ...,
         [0, 0, 0, 0],
         [0, 0, 0, 0],
         [0, 0, 0, 0]],

        [[0, 0, 0, 0],
         [0, 0, 0, 0],
         [0, 0, 0, 0],
         ...,
         [0, 0, 0, 0],
         [0, 0, 0, 0],
         [0, 0, 0, 0]]],


       [[[0, 0, 0, 0],
         [0, 0, 0, 0],
         [0, 0

In [5]:
env.close()

In [6]:
# A2C with a CNN policy on the frame-stacked Atari env, logging to train/logs.
log_path = os.path.join('train','logs')
model = A2C('CnnPolicy',env,verbose=1,tensorboard_log=log_path)


Using cpu device
Wrapping the env in a VecTransposeImage.


In [7]:
# Train the A2C model for 100,000 timesteps.
model.learn(total_timesteps=100000)

Logging to train\logs\A2C_1
------------------------------------
| rollout/              |          |
|    ep_len_mean        | 281      |
|    ep_rew_mean        | 1.51     |
| time/                 |          |
|    fps                | 215      |
|    iterations         | 100      |
|    time_elapsed       | 9        |
|    total_timesteps    | 2000     |
| train/                |          |
|    entropy_loss       | -1.36    |
|    explained_variance | -0.0607  |
|    learning_rate      | 0.0007   |
|    n_updates          | 99       |
|    policy_loss        | -0.815   |
|    value_loss         | 0.952    |
------------------------------------
------------------------------------
| rollout/              |          |
|    ep_len_mean        | 291      |
|    ep_rew_mean        | 1.69     |
| time/                 |          |
|    fps                | 214      |
|    iterations         | 200      |
|    time_elapsed       | 18       |
|    total_timesteps    | 4000     |
| train/  

<stable_baselines3.a2c.a2c.A2C at 0x20512590520>

In [None]:
# Save the trained A2C model, then restore it from disk.
a2c_path = os.path.join('train','model','A2C')
model.save(a2c_path)
# ``load`` is a classmethod that returns a NEW model instead of loading in
# place, so calling it on the instance and dropping the result did nothing.
# Rebind ``model`` to the loaded object.
model = A2C.load(a2c_path, env=env)

## evaluate test

In [9]:
# Single Breakout evaluation environment (seed 1) with the same 4-frame
# stacking as in training.
env = VecFrameStack(
    make_atari_env('Breakout-v0', n_envs=1, seed=1),
    n_stack=4,
)
# Evaluate over 40 episodes without rendering; the cell displays the result.
evaluate_policy(model, env, n_eval_episodes=40, render=False)

(6.175, 2.8362607425975486)