# Import packages

In [45]:
import os
import gym
from stable_baselines3 import PPO
from stable_baselines3.common.vec_env import DummyVecEnv
from stable_baselines3.common.evaluation import evaluate_policy

# Load environments

In [6]:
# Name of the gym environment; reused later when the environment is re-created for training.
environment_name = "CartPole-v0"
env = gym.make(environment_name)

In [7]:
# Baseline: play a few rendered episodes with randomly sampled actions and
# print the total reward collected in each one.
episodes = 5
for episode in range(1, episodes + 1):
    state = env.reset()
    score = 0
    while True:
        env.render()
        action = env.action_space.sample()
        n_state, reward, done, info = env.step(action)
        score += reward
        # the episode ends as soon as the environment reports done
        if done:
            break
    print('episode:{} score:{}'.format(episode, score))
env.close()

episode:1 score:19.0
episode:2 score:28.0
episode:3 score:23.0
episode:4 score:36.0
episode:5 score:26.0


# NOTE(review): this runs after env.close() above and discards the returned
# observation; it looks like a leftover cell — confirm it is still needed.
env.reset()

# train an agent

In [8]:
# Directory passed to the models as tensorboard_log.
# Create the folders on disk before running the training cells.
log_path = os.path.join('Training', 'Logs')
log_path

'Training\\Logs'

In [9]:
# Re-create the environment and wrap it in a DummyVecEnv before handing it to PPO.
env = gym.make(environment_name)
env = DummyVecEnv([lambda: env])
# Training logs are written under log_path (viewed with TensorBoard later on).
model = PPO(
    'MlpPolicy',
    env,
    verbose=1,
    tensorboard_log=log_path,
)

Using cpu device


In [10]:
# Train the PPO model for 20000 timesteps.
model.learn(total_timesteps=20000)

Logging to Training\Logs\PPO_4
-----------------------------
| time/              |      |
|    fps             | 923  |
|    iterations      | 1    |
|    time_elapsed    | 2    |
|    total_timesteps | 2048 |
-----------------------------
-----------------------------------------
| time/                   |             |
|    fps                  | 952         |
|    iterations           | 2           |
|    time_elapsed         | 4           |
|    total_timesteps      | 4096        |
| train/                  |             |
|    approx_kl            | 0.008253496 |
|    clip_fraction        | 0.094       |
|    clip_range           | 0.2         |
|    entropy_loss         | -0.686      |
|    explained_variance   | 0.00826     |
|    learning_rate        | 0.0003      |
|    loss                 | 5.01        |
|    n_updates            | 10          |
|    policy_gradient_loss | -0.0152     |
|    value_loss           | 51.7        |
-----------------------------------------
---

<stable_baselines3.ppo.ppo.PPO at 0x2d49081cfc8>

# Save and reload model

In [11]:
# File path (without extension) where the trained PPO model is saved.
# It lives under 'Training/Saved Models', the same directory this notebook uses
# for save_path; the earlier 'Taining'/'Save Models' spelling silently created
# a separate, misspelled directory tree.
PPO_Path = os.path.join('Training', 'Saved Models', 'PPO_Model_CartPole')

In [12]:
# Persist the trained model to PPO_Path.
model.save(PPO_Path)

In [13]:
# Drop the in-memory model so the next cell has to load it back from disk.
del model

In [14]:
# Load the saved model, attach the current environment, and continue training
# for another 10000 timesteps.
model = PPO.load(PPO_Path, env=env)
model.learn(total_timesteps=10000)

Logging to Training\Logs\PPO_5
-----------------------------
| time/              |      |
|    fps             | 1823 |
|    iterations      | 1    |
|    time_elapsed    | 1    |
|    total_timesteps | 2048 |
-----------------------------
------------------------------------------
| time/                   |              |
|    fps                  | 1237         |
|    iterations           | 2            |
|    time_elapsed         | 3            |
|    total_timesteps      | 4096         |
| train/                  |              |
|    approx_kl            | 0.0072634863 |
|    clip_fraction        | 0.0615       |
|    clip_range           | 0.2          |
|    entropy_loss         | -0.553       |
|    explained_variance   | 0.354        |
|    learning_rate        | 0.0003       |
|    loss                 | 36.4         |
|    n_updates            | 110          |
|    policy_gradient_loss | -0.00478     |
|    value_loss           | 112          |
----------------------------

<stable_baselines3.ppo.ppo.PPO at 0x2d4fb413e48>

# Evaluating

In [15]:
# Evaluate the model over 10 rendered episodes; the cell displays the returned
# tuple (mean reward, standard deviation of the reward per the SB3 docs).
evaluate_policy(model, env, n_eval_episodes=10, render=True)



(200.0, 0.0)

In [16]:
# Close the environment once the rendered evaluation is finished.
env.close()

# Test model

In [17]:
# Roll out the trained model for a few rendered episodes and print each
# episode's accumulated score.
episodes = 5
for episode in range(1, episodes + 1):
    obs = env.reset()
    score = 0
    while True:
        env.render()
        # predict returns a pair; only the action is needed here
        action, _ = model.predict(obs)
        obs, reward, done, info = env.step(action)
        score += reward
        if done:
            break
    print('episode:{} score:{}'.format(episode, score))
# The environment is intentionally left open here; it is closed in the following cell.

episode:1 score:[200.]
episode:2 score:[200.]
episode:3 score:[200.]
episode:4 score:[200.]
episode:5 score:[200.]


In [18]:
# Close the environment after the rendered test episodes.
env.close()

In [19]:
# Grab a fresh observation to feed to model.predict in the next cell.
obs = env.reset()

In [20]:
model.predict(obs) # returns the action and the next state (the state is None in the output shown below)

(array([0], dtype=int64), None)

In [21]:
# Draw one random action from the environment's action space for comparison.
env.action_space.sample()

1

# viewing logs in tensorboard

In [22]:
training_log_path = os.path.join(log_path,'PPO_1') # the numeric suffix identifies which training run (in chronological order) the logs belong to

In [23]:
!tensorboard --logdir={training_log_path} # localhost:6006

2021-12-07 15:28:43.632786: W tensorflow/stream_executor/platform/default/dso_loader.cc:55] Could not load dynamic library 'cudart64_100.dll'; dlerror: cudart64_100.dll not found
2021-12-07 15:28:43.633094: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.
usage: tensorboard [-h] [--helpfull] [--logdir PATH] [--host ADDR]
                   [--port PORT] [--purge_orphaned_data BOOL] [--db URI]
                   [--db_import] [--inspect] [--version_tb] [--tag TAG]
                   [--event_file PATH] [--path_prefix PATH]
                   [--window_title TEXT] [--max_reload_threads COUNT]
                   [--reload_interval SECONDS] [--reload_task TYPE]
                   [--reload_multifile BOOL]
                   [--reload_multifile_inactive_secs SECONDS]
                   [--generic_data TYPE]
                   [--samples_per_plugin SAMPLES_PER_PLUGIN]
                   [--debugger_data_server_g

# add a callback to the training stage

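回调是在训练过程的特定阶段调用的一组函数，可以使用回调来获取训练期间内部状态和模型统计信息的视图。下面介绍的 EarlyStopping、ModelCheckpoint 和 LearningRateScheduler 是 Keras 中的回调，仅作为背景说明；本笔记后面的代码使用的是 stable-baselines3 提供的 EvalCallback 和 StopTrainingOnRewardThreshold。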

**EarlyStopping**

从字面上理解， EarlyStopping 就是提前终止训练，主要是为了防止过拟合。过拟合是机器学习从业者的噩梦，简单说，就是在训练数据集上精度很高，但在测试数据集上精度很低。解决过拟合有多种手段，有时还需要多种手段并用，其中一种方法是尽早终止训练过程。 EarlyStopping 函数有好几种度量参数，通过修改这些参数，可以控制合适的时机停止训练过程。下面是一些相关度量参数：  

`monitor`：  

监控的度量指标，比如:acc, val_acc, loss和val_loss等  

`min_delta`：  

监控值的最小变化。  

例如，min_delta = 1表示如果监控值的绝对变化小于1，则视为没有改善；连续 patience 个 epoch 没有改善后，训练将停止  

`patience`：  

没有改善的epoch数，如果过了数个epoch之后结果没有改善，训练将停止  

`restore_best_weights`：  

如果要在停止后将模型恢复为监控指标最佳时的权重，请将此参数设置为True  


**ModelCheckpoint**
 

此回调用于在训练周期中保存模型检查点。保存检查点的作用在于保存训练中间的模型，下次再训练时，可以加载模型，而无需重新训练，减少训练时间。它有以下一些相关参数：

 

`filepath`：
要保存模型的文件路径

 

`monitor`：
监控的度量指标，比如：

acc, val_acc, loss和val_loss等

 

`save_best_only`：
如果您不想最新的最佳模型被覆盖，请将此值设置为True

 

`save_weights_only`: 如果设为True，将只保存模型权重

 

`mode`：
auto，min或max。

例如，如果监控的度量指标是val_loss，并且想要最小化它，则设置mode =’min’。

 

`period`：
检查点之间的间隔（epoch数）。

**LearningRateScheduler**
 

在深度学习中，学习率的选择也是一件让人头疼的事情，值选择小了，可能会收敛缓慢，值选大了，可能会导致震荡，无法到达局部最优点。后来专家们设计出一种自适应的学习率，比如在训练开始阶段，选择比较大的学习率值，加速收敛，训练一段时间之后，选择小的学习率值，防止震荡。 LearningRateScheduler 用于定义学习率的变化策略，参数如下：

 

`schedule`：
一个函数，以epoch数（整数，从0开始计数）和当前学习速率，作为输入，返回一个新的学习速率作为输出（浮点数）。

 

`verbose`：


0：

静默模式，

1：

详细输出信息。


In [24]:
from stable_baselines3.common.callbacks import EvalCallback, StopTrainingOnRewardThreshold

In [25]:
# Directory handed to EvalCallback as best_model_save_path.
save_path = os.path.join('Training', 'Saved Models')

In [28]:
stop_callback = StopTrainingOnRewardThreshold(reward_threshold=200,verbose=1) # stop training once the reward exceeds the given threshold


In [30]:
# Periodically evaluate the model and save the best one under save_path;
# stop_callback is registered to run whenever a new best model is found.
eval_callback = EvalCallback(
    env,
    callback_on_new_best=stop_callback,
    best_model_save_path=save_path,
    verbose=1,
)

In [31]:
# Start from a new, untrained PPO model for the callback-driven run.
model = PPO('MlpPolicy', env, verbose=1, tensorboard_log=log_path)

Using cpu device


In [32]:
# Train with the evaluation callback attached; per the output below, training
# stops early once the mean evaluation reward reaches the threshold.
model.learn(total_timesteps=20000, callback=eval_callback)

Logging to Training\Logs\PPO_6
-----------------------------
| time/              |      |
|    fps             | 1845 |
|    iterations      | 1    |
|    time_elapsed    | 1    |
|    total_timesteps | 2048 |
-----------------------------
-----------------------------------------
| time/                   |             |
|    fps                  | 1271        |
|    iterations           | 2           |
|    time_elapsed         | 3           |
|    total_timesteps      | 4096        |
| train/                  |             |
|    approx_kl            | 0.008285078 |
|    clip_fraction        | 0.114       |
|    clip_range           | 0.2         |
|    entropy_loss         | -0.686      |
|    explained_variance   | -0.00127    |
|    learning_rate        | 0.0003      |
|    loss                 | 6.67        |
|    n_updates            | 10          |
|    policy_gradient_loss | -0.0168     |
|    value_loss           | 51.4        |
-----------------------------------------
---



Eval num_timesteps=10000, episode_reward=200.00 +/- 0.00
Episode length: 200.00 +/- 0.00
-----------------------------------------
| eval/                   |             |
|    mean_ep_length       | 200         |
|    mean_reward          | 200         |
| time/                   |             |
|    total_timesteps      | 10000       |
| train/                  |             |
|    approx_kl            | 0.008597443 |
|    clip_fraction        | 0.0816      |
|    clip_range           | 0.2         |
|    entropy_loss         | -0.614      |
|    explained_variance   | 0.339       |
|    learning_rate        | 0.0003      |
|    loss                 | 26.3        |
|    n_updates            | 40          |
|    policy_gradient_loss | -0.0168     |
|    value_loss           | 61.1        |
-----------------------------------------
New best mean reward!
Stopping training because the mean reward 200.00  is above the threshold 200


<stable_baselines3.ppo.ppo.PPO at 0x2d4afb274c8>

# Changing Policies

In [36]:
# Custom network layout passed to PPO via policy_kwargs: four layers of 128
# units for each of the 'pi' and 'vf' entries.
net_arch = [{'pi': [128] * 4, 'vf': [128] * 4}]

In [37]:
# New PPO model that uses the custom net_arch instead of the default layout.
model = PPO(
    'MlpPolicy',
    env,
    verbose=1,
    tensorboard_log=log_path,
    policy_kwargs=dict(net_arch=net_arch),
)

Using cpu device


In [38]:
# Build fresh callbacks for this model instead of reusing eval_callback from the
# previous run. An EvalCallback keeps its best mean reward across learn() calls,
# and the previous run already reached 200 (the stop threshold), so the reused
# instance could never register a "new best": no model would be saved and the
# stop callback would never fire for this architecture.
# NOTE: the best model of this run is written to the same save_path and
# therefore replaces the best model saved by the previous run.
stop_callback = StopTrainingOnRewardThreshold(reward_threshold=200, verbose=1)
eval_callback = EvalCallback(
    env,
    callback_on_new_best=stop_callback,
    best_model_save_path=save_path,
    verbose=1,
)
model.learn(total_timesteps=20000, callback=eval_callback)

Logging to Training\Logs\PPO_7
-----------------------------
| time/              |      |
|    fps             | 1329 |
|    iterations      | 1    |
|    time_elapsed    | 1    |
|    total_timesteps | 2048 |
-----------------------------
-----------------------------------------
| time/                   |             |
|    fps                  | 834         |
|    iterations           | 2           |
|    time_elapsed         | 4           |
|    total_timesteps      | 4096        |
| train/                  |             |
|    approx_kl            | 0.013661863 |
|    clip_fraction        | 0.205       |
|    clip_range           | 0.2         |
|    entropy_loss         | -0.681      |
|    explained_variance   | -0.0138     |
|    learning_rate        | 0.0003      |
|    loss                 | 2.67        |
|    n_updates            | 10          |
|    policy_gradient_loss | -0.0229     |
|    value_loss           | 17.3        |
-----------------------------------------
---

<stable_baselines3.ppo.ppo.PPO at 0x2d4afb80dc8>

# Using an Alternate Algorithm

In [39]:
from stable_baselines3 import DQN

In [41]:
# Swap the algorithm: same policy name, environment, and log directory, but DQN instead of PPO.
model = DQN('MlpPolicy', env, verbose=1, tensorboard_log=log_path)

Using cpu device


In [42]:
# Train the DQN model for 20000 timesteps (no evaluation callback is attached here).
model.learn(total_timesteps=20000)

Logging to Training\Logs\DQN_1
----------------------------------
| rollout/            |          |
|    exploration_rate | 0.954    |
| time/               |          |
|    episodes         | 4        |
|    fps              | 2338     |
|    time_elapsed     | 0        |
|    total_timesteps  | 97       |
----------------------------------
----------------------------------
| rollout/            |          |
|    exploration_rate | 0.926    |
| time/               |          |
|    episodes         | 8        |
|    fps              | 2747     |
|    time_elapsed     | 0        |
|    total_timesteps  | 155      |
----------------------------------
----------------------------------
| rollout/            |          |
|    exploration_rate | 0.896    |
| time/               |          |
|    episodes         | 12       |
|    fps              | 3068     |
|    time_elapsed     | 0        |
|    total_timesteps  | 219      |
----------------------------------
------------------------

----------------------------------
| rollout/            |          |
|    exploration_rate | 0.05     |
| time/               |          |
|    episodes         | 108      |
|    fps              | 5108     |
|    time_elapsed     | 0        |
|    total_timesteps  | 2204     |
----------------------------------
----------------------------------
| rollout/            |          |
|    exploration_rate | 0.05     |
| time/               |          |
|    episodes         | 112      |
|    fps              | 5133     |
|    time_elapsed     | 0        |
|    total_timesteps  | 2266     |
----------------------------------
----------------------------------
| rollout/            |          |
|    exploration_rate | 0.05     |
| time/               |          |
|    episodes         | 116      |
|    fps              | 5128     |
|    time_elapsed     | 0        |
|    total_timesteps  | 2325     |
----------------------------------
----------------------------------
| rollout/          

----------------------------------
| rollout/            |          |
|    exploration_rate | 0.05     |
| time/               |          |
|    episodes         | 216      |
|    fps              | 5445     |
|    time_elapsed     | 0        |
|    total_timesteps  | 4511     |
----------------------------------
----------------------------------
| rollout/            |          |
|    exploration_rate | 0.05     |
| time/               |          |
|    episodes         | 220      |
|    fps              | 5426     |
|    time_elapsed     | 0        |
|    total_timesteps  | 4587     |
----------------------------------
----------------------------------
| rollout/            |          |
|    exploration_rate | 0.05     |
| time/               |          |
|    episodes         | 224      |
|    fps              | 5430     |
|    time_elapsed     | 0        |
|    total_timesteps  | 4666     |
----------------------------------
----------------------------------
| rollout/          

----------------------------------
| rollout/            |          |
|    exploration_rate | 0.05     |
| time/               |          |
|    episodes         | 324      |
|    fps              | 5608     |
|    time_elapsed     | 1        |
|    total_timesteps  | 7012     |
----------------------------------
----------------------------------
| rollout/            |          |
|    exploration_rate | 0.05     |
| time/               |          |
|    episodes         | 328      |
|    fps              | 5617     |
|    time_elapsed     | 1        |
|    total_timesteps  | 7107     |
----------------------------------
----------------------------------
| rollout/            |          |
|    exploration_rate | 0.05     |
| time/               |          |
|    episodes         | 332      |
|    fps              | 5650     |
|    time_elapsed     | 1        |
|    total_timesteps  | 7245     |
----------------------------------
----------------------------------
| rollout/          

----------------------------------
| rollout/            |          |
|    exploration_rate | 0.05     |
| time/               |          |
|    episodes         | 432      |
|    fps              | 5680     |
|    time_elapsed     | 1        |
|    total_timesteps  | 9288     |
----------------------------------
----------------------------------
| rollout/            |          |
|    exploration_rate | 0.05     |
| time/               |          |
|    episodes         | 436      |
|    fps              | 5695     |
|    time_elapsed     | 1        |
|    total_timesteps  | 9410     |
----------------------------------
----------------------------------
| rollout/            |          |
|    exploration_rate | 0.05     |
| time/               |          |
|    episodes         | 440      |
|    fps              | 5707     |
|    time_elapsed     | 1        |
|    total_timesteps  | 9526     |
----------------------------------
----------------------------------
| rollout/          

----------------------------------
| rollout/            |          |
|    exploration_rate | 0.05     |
| time/               |          |
|    episodes         | 540      |
|    fps              | 5742     |
|    time_elapsed     | 2        |
|    total_timesteps  | 11789    |
----------------------------------
----------------------------------
| rollout/            |          |
|    exploration_rate | 0.05     |
| time/               |          |
|    episodes         | 544      |
|    fps              | 5745     |
|    time_elapsed     | 2        |
|    total_timesteps  | 11899    |
----------------------------------
----------------------------------
| rollout/            |          |
|    exploration_rate | 0.05     |
| time/               |          |
|    episodes         | 548      |
|    fps              | 5770     |
|    time_elapsed     | 2        |
|    total_timesteps  | 12060    |
----------------------------------
----------------------------------
| rollout/          

----------------------------------
| rollout/            |          |
|    exploration_rate | 0.05     |
| time/               |          |
|    episodes         | 648      |
|    fps              | 5814     |
|    time_elapsed     | 2        |
|    total_timesteps  | 14368    |
----------------------------------
----------------------------------
| rollout/            |          |
|    exploration_rate | 0.05     |
| time/               |          |
|    episodes         | 652      |
|    fps              | 5819     |
|    time_elapsed     | 2        |
|    total_timesteps  | 14449    |
----------------------------------
----------------------------------
| rollout/            |          |
|    exploration_rate | 0.05     |
| time/               |          |
|    episodes         | 656      |
|    fps              | 5828     |
|    time_elapsed     | 2        |
|    total_timesteps  | 14554    |
----------------------------------
----------------------------------
| rollout/          

----------------------------------
| rollout/            |          |
|    exploration_rate | 0.05     |
| time/               |          |
|    episodes         | 756      |
|    fps              | 5799     |
|    time_elapsed     | 2        |
|    total_timesteps  | 16465    |
----------------------------------
----------------------------------
| rollout/            |          |
|    exploration_rate | 0.05     |
| time/               |          |
|    episodes         | 760      |
|    fps              | 5802     |
|    time_elapsed     | 2        |
|    total_timesteps  | 16559    |
----------------------------------
----------------------------------
| rollout/            |          |
|    exploration_rate | 0.05     |
| time/               |          |
|    episodes         | 764      |
|    fps              | 5802     |
|    time_elapsed     | 2        |
|    total_timesteps  | 16659    |
----------------------------------
----------------------------------
| rollout/          

----------------------------------
| rollout/            |          |
|    exploration_rate | 0.05     |
| time/               |          |
|    episodes         | 864      |
|    fps              | 5859     |
|    time_elapsed     | 3        |
|    total_timesteps  | 18886    |
----------------------------------
----------------------------------
| rollout/            |          |
|    exploration_rate | 0.05     |
| time/               |          |
|    episodes         | 868      |
|    fps              | 5857     |
|    time_elapsed     | 3        |
|    total_timesteps  | 18954    |
----------------------------------
----------------------------------
| rollout/            |          |
|    exploration_rate | 0.05     |
| time/               |          |
|    episodes         | 872      |
|    fps              | 5858     |
|    time_elapsed     | 3        |
|    total_timesteps  | 19044    |
----------------------------------
----------------------------------
| rollout/          

<stable_baselines3.dqn.dqn.DQN at 0x2d4afb84b08>