# 大作业：使用DDPG解决四轴飞行器悬浮任务

# Step1 安装依赖

In [None]:
!pip uninstall -y parl  # 说明：AIStudio预装的parl版本太老，容易跟其他库产生兼容性冲突，建议先卸载
!pip uninstall -y pandas scikit-learn # 提示：在AIStudio中卸载这两个库再import parl可避免warning提示，不卸载也不影响parl的使用
!pip uninstall -y paddlepaddle-gpu

!pip install paddlepaddle==1.6.3 
!pip install parl==1.3.1
!pip install rlschool==0.3.1

# 说明：安装日志中出现两条红色的关于 paddlehub 和 visualdl 的 ERROR 与parl无关，可以忽略，不影响使用

Uninstalling parl-1.3.1:
  Successfully uninstalled parl-1.3.1
Looking in indexes: https://pypi.mirrors.ustc.edu.cn/simple/
Looking in indexes: https://pypi.mirrors.ustc.edu.cn/simple/
Collecting parl==1.3.1
[?25l  Downloading https://mirrors.tuna.tsinghua.edu.cn/pypi/web/packages/62/79/590af38a920792c71afb73fad7583967928b4d0ba9fca76250d935c7fda8/parl-1.3.1-py2.py3-none-any.whl (521kB)
[K     |████████████████████████████████| 522kB 2.7MB/s eta 0:00:01
Installing collected packages: parl
Successfully installed parl-1.3.1
Looking in indexes: https://pypi.mirrors.ustc.edu.cn/simple/


In [None]:
# 检查依赖包版本是否正确
!pip list | grep paddlepaddle
!pip list | grep parl
!pip list | grep rlschool

paddlepaddle         1.6.3          
parl                 1.3.1          
rlschool             0.3.1          


# Step2 导入依赖

In [None]:
import os
import numpy as np

import parl
from parl import layers
from paddle import fluid
from parl.utils import logger
from parl.utils import action_mapping # 将神经网络输出映射到对应的 实际动作取值范围 内
from parl.utils import ReplayMemory # 经验回放

from rlschool import make_env  # 使用 RLSchool 创建飞行器环境

os.environ['CUDA_VISIBLE_DEVICES'] = ''

# Step3 设置超参数

In [None]:
######################################################################
######################################################################
#
# 1. Set the learning rates here; try raising or lowering them and compare
#
######################################################################
######################################################################
ACTOR_LR = 0.0002   # learning rate for the Actor network update
CRITIC_LR = 0.005   # learning rate for the Critic network update
DEBUG = False       # when True, DDPG prints the learning rate as it creates each optimizer

GAMMA = 0.99        # reward discount factor, usually between 0.9 and 0.999
TAU = 0.001         # soft-update coefficient for syncing target_model with model
MEMORY_SIZE = 1e6   # replay memory capacity; larger values use more RAM (cast to int at use site)
MEMORY_WARMUP_SIZE = 1e4      # transitions that must be stored before the agent starts learning from sampled batches
REWARD_SCALE = 0.01       # scaling factor applied to the reward before it is stored
BATCH_SIZE = 256          # number of transitions sampled from the replay memory per learn step
TRAIN_TOTAL_STEPS = 1e6   # total number of training steps
TEST_EVERY_STEPS = 1e4    # evaluate every N steps; each evaluation averages the reward of 5 episodes

# Step4 搭建Model、Algorithm、Agent架构
* `Agent`把产生的数据传给`algorithm`，`algorithm`根据`model`的模型结构计算出`Loss`，再使用`SGD`或者其他优化器不断地优化参数。`PARL`的这种架构可以很方便地应用在各类深度强化学习问题中。

## （1）Model
* 分别搭建`Actor`、`Critic`的`Model`结构，构建`QuadrotorModel`。

In [None]:
class ActorModel(parl.Model):
    """Policy network: maps an observation to ``act_dim`` tanh-squashed outputs.

    ``model_tag == 1`` builds a single hidden layer; any other value builds
    two hidden layers. All layers use a Normal(0.0, 0.1) weight initializer.
    """

    def __init__(self, act_dim, model_tag):
        """Create the fully connected layers selected by ``model_tag``.

        Args:
            act_dim: size of the output (action) layer.
            model_tag: 1 for the one-hidden-layer variant, otherwise two.
        """
        self.model_tag = model_tag
        hidden_width = 100

        def dense(size, act):
            # A fresh initializer object is built for every layer.
            return layers.fc(
                size=size,
                act=act,
                param_attr=fluid.initializer.Normal(loc=0.0, scale=0.1))

        # Layers are created in the order fc1, fc2, (fc3).
        self.fc1 = dense(hidden_width, 'relu')
        if self.model_tag == 1:
            self.fc2 = dense(act_dim, 'tanh')
        else:
            self.fc2 = dense(hidden_width, 'relu')
            self.fc3 = dense(act_dim, 'tanh')

    def policy(self, obs):
        """Run the actor forward pass and return the output of the tanh layer."""
        hidden = self.fc1(obs)
        if self.model_tag == 1:
            return self.fc2(hidden)
        return self.fc3(self.fc2(hidden))

In [None]:
class CriticModel(parl.Model):
    """Q network: scores an (observation, action) pair with a single value.

    ``model_tag == 1`` concatenates observation and action before one hidden
    layer; any other value feeds the observation through a first hidden layer
    and joins the action in front of the second hidden layer.
    """

    def __init__(self, model_tag):
        """Create the fully connected layers selected by ``model_tag``."""
        self.model_tag = model_tag
        hidden_width = 100

        # Layers are created in the order fc1, fc2, (fc3).
        self.fc1 = layers.fc(
            size=hidden_width,
            act='relu',
            param_attr=fluid.initializer.Normal(loc=0.0, scale=0.1))
        if self.model_tag == 1:
            self.fc2 = layers.fc(size=1, act=None)
        else:
            self.fc2 = layers.fc(
                size=hidden_width,
                act='relu',
                param_attr=fluid.initializer.Normal(loc=0.0, scale=0.1))
            self.fc3 = layers.fc(size=1, act=None)

    def value(self, obs, act):
        """Return Q(s, a) for the given state and action inputs.

        The size-1 output of the last layer is squeezed on axis 1 before it
        is returned.
        """
        if self.model_tag == 1:
            joint = layers.concat([obs, act], axis=1)
            q_value = self.fc2(self.fc1(joint))
        else:
            obs_features = self.fc1(obs)
            joint = layers.concat([obs_features, act], axis=1)
            q_value = self.fc3(self.fc2(joint))
        return layers.squeeze(q_value, axes=[1])

In [None]:
class QuadrotorModel(parl.Model):
    """Bundles an ActorModel and a CriticModel behind one model object."""

    def __init__(self, act_dim, model_tag):
        """Build the actor first, then the critic, both with ``model_tag``."""
        self.model_tag = model_tag
        self.actor_model = ActorModel(act_dim, model_tag)
        self.critic_model = CriticModel(model_tag)

    def policy(self, obs):
        """Delegate to the actor's ``policy``."""
        action = self.actor_model.policy(obs)
        return action

    def value(self, obs, act):
        """Delegate to the critic's ``value``."""
        q_value = self.critic_model.value(obs, act)
        return q_value

    def get_actor_params(self):
        """Return the actor sub-model's parameters (used by DDPG's actor update)."""
        actor_params = self.actor_model.parameters()
        return actor_params

## （2）Algorithm
* 可以通过 `from parl.algorithms import DDPG` 从`parl`库中快速引入`DDPG`算法，无需自己重新写算法；本文将该导入注释掉，改为在下面的单元中直接给出`DDPG`的实现（加入了`DEBUG`打印）。

In [None]:
# from parl.algorithms import DDPG

import parl
from parl import layers
from copy import deepcopy
from paddle import fluid


class DDPG(parl.Algorithm):
    """DDPG algorithm: builds the actor and critic update ops for a model.

    Keeps the online ``model`` plus a deep-copied ``target_model`` that is
    used to compute the critic's target Q value.
    """

    def __init__(self,
                 model,
                 gamma=None,
                 tau=None,
                 actor_lr=None,
                 critic_lr=None):
        """Store the hyperparameters and create the target model.

        Args:
            model (parl.Model): forward network holding both actor and critic.
                                It must implement get_actor_params().
            gamma (float): reward discount factor.
            tau (float): soft-update coefficient used when syncing
                         self.target_model with self.model.
            actor_lr (float): learning rate of the actor.
            critic_lr (float): learning rate of the critic.
        """
        assert isinstance(gamma, float)
        assert isinstance(tau, float)
        assert isinstance(actor_lr, float)
        assert isinstance(critic_lr, float)
        self.gamma = gamma
        self.tau = tau
        self.actor_lr = actor_lr
        self.critic_lr = critic_lr

        self.model = model
        self.target_model = deepcopy(model)

    def predict(self, obs):
        """Predict an action with the actor of self.model."""
        return self.model.policy(obs)

    def learn(self, obs, action, reward, next_obs, terminal):
        """Build the DDPG updates for the actor and then the critic.

        Returns:
            (actor_cost, critic_cost) as produced by the two helper methods.
        """
        actor_cost = self._actor_learn(obs)
        critic_cost = self._critic_learn(obs, action, reward, next_obs,
                                         terminal)
        return actor_cost, critic_cost

    def _actor_learn(self, obs):
        """Build the actor update: minimise the mean of -Q(obs, policy(obs)).

        Only the parameters returned by model.get_actor_params() are passed
        to the optimizer.
        """
        action = self.model.policy(obs)
        Q = self.model.value(obs, action)
        cost = layers.reduce_mean(-1.0 * Q)
        # NOTE(review): self.actor_lr is read as a plain value right here;
        # presumably changing the attribute after the program has been built
        # does not affect this optimizer -- confirm.
        optimizer = fluid.optimizer.AdamOptimizer(self.actor_lr)
        optimizer.minimize(cost, parameter_list=self.model.get_actor_params())
        if DEBUG:
            print(self.actor_lr)
        return cost

    def _critic_learn(self, obs, action, reward, next_obs, terminal):
        """Build the critic update: mean squared error against the target Q.

        The target is reward + (1 - terminal) * gamma * Q_target(next_obs,
        policy_target(next_obs)), with gradients stopped on the target.
        """
        next_action = self.target_model.policy(next_obs)
        next_Q = self.target_model.value(next_obs, next_action)

        # terminal arrives as bool; cast it so it can mask the bootstrap term.
        terminal = layers.cast(terminal, dtype='float32')
        target_Q = reward + (1.0 - terminal) * self.gamma * next_Q
        target_Q.stop_gradient = True

        Q = self.model.value(obs, action)
        cost = layers.square_error_cost(Q, target_Q)
        cost = layers.reduce_mean(cost)
        # NOTE(review): same caveat as the actor optimizer -- the rate is
        # read once at this point; confirm whether later changes take effect.
        optimizer = fluid.optimizer.AdamOptimizer(self.critic_lr)
        if DEBUG:
            print(self.critic_lr)
        optimizer.minimize(cost)
        return cost

    def sync_target(self, decay=None, share_vars_parallel_executor=None):
        """Copy parameters from self.model to self.target_model.

        ``decay`` is forwarded to model.sync_weights_to; when it is None it
        defaults to 1.0 - self.tau (the soft update).
        """
        if decay is None:
            decay = 1.0 - self.tau
        self.model.sync_weights_to(
            self.target_model,
            decay=decay,
            share_vars_parallel_executor=share_vars_parallel_executor)


## （3）Agent

In [None]:

class QuadrotorAgent(parl.Agent):
    """Agent that builds the predict/learn programs and feeds them numpy data."""

    def __init__(self, algorithm, obs_dim, act_dim=4):
        """Store the dimensions, let the base class initialise, sync the target.

        Args:
            algorithm: the algorithm object (exposes predict/learn/sync_target).
            obs_dim (int): length of one observation vector.
            act_dim (int): length of one action vector.
        """
        assert isinstance(obs_dim, int)
        assert isinstance(act_dim, int)
        self.obs_dim = obs_dim
        self.act_dim = act_dim
        super(QuadrotorAgent, self).__init__(algorithm)

        # At the very start, fully synchronise target_model with model.
        self.alg.sync_target(decay=0)

    def build_program(self):
        """Define the prediction program and the learning program."""
        self.pred_program = fluid.Program()
        self.learn_program = fluid.Program()

        with fluid.program_guard(self.pred_program):
            obs = layers.data(
                name='obs', shape=[self.obs_dim], dtype='float32')
            self.pred_act = self.alg.predict(obs)

        with fluid.program_guard(self.learn_program):
            obs = layers.data(
                name='obs', shape=[self.obs_dim], dtype='float32')
            act = layers.data(
                name='act', shape=[self.act_dim], dtype='float32')
            reward = layers.data(name='reward', shape=[], dtype='float32')
            next_obs = layers.data(
                name='next_obs', shape=[self.obs_dim], dtype='float32')
            terminal = layers.data(name='terminal', shape=[], dtype='bool')
            # Only the critic cost is kept for fetching; the actor cost is unused.
            _, self.critic_cost = self.alg.learn(obs, act, reward, next_obs,
                                                 terminal)

    def predict(self, obs):
        """Run the actor and shrink each action towards its per-sample mean.

        A leading axis is added to ``obs`` before it is fed. The callers in
        this file already pass a batched observation, so the fed array has
        one extra axis.
        NOTE(review): this presumably relies on the first fc layer flattening
        the trailing axes -- confirm before changing either side.

        Returns:
            The fetched action array after the deviation of every component
            from the mean over axis 1 has been scaled by 0.1.
        """
        obs = np.expand_dims(obs, axis=0)
        act = self.fluid_executor.run(
            self.pred_program, feed={'obs': obs},
            fetch_list=[self.pred_act])[0]
        # keepdims=True keeps the mean as a column so the subtraction
        # broadcasts per sample for any batch size; without it only a batch
        # of one broadcasts as intended.
        act_mean = act.mean(axis=1, keepdims=True)
        act = act_mean + (act - act_mean) * 0.1
        return act

    def learn(self, obs, act, reward, next_obs, terminal):
        """Run one learn step on a batch, then soft-sync the target model.

        Returns:
            The fetched critic cost.
        """
        feed = {
            'obs': obs,
            'act': act,
            'reward': reward,
            'next_obs': next_obs,
            'terminal': terminal
        }
        critic_cost = self.fluid_executor.run(
            self.learn_program, feed=feed, fetch_list=[self.critic_cost])[0]
        self.alg.sync_target()

        return critic_cost


# Step4 Training && Test（训练&&测试）

In [None]:
def run_episode(env, agent, rpm):
    """Play one training episode, storing transitions and learning as it goes.

    Args:
        env: environment exposing reset(), step() and action_space.
        agent: agent exposing predict() and learn().
        rpm: replay memory exposing append(), size() and sample_batch().

    Returns:
        (total_reward, steps): the unscaled episode reward and its length.
    """
    obs = env.reset()
    episode_reward = 0
    step_count = 0
    done = False
    while not done:
        step_count += 1
        policy_input = np.expand_dims(obs, axis=0).astype('float32')
        raw_action = np.squeeze(agent.predict(policy_input))

        # Add exploration noise, then keep the result inside [-1.0, 1.0].
        noisy_action = np.random.normal(raw_action, 1.0)
        bounded_action = np.clip(noisy_action, -1.0, 1.0)
        # Map from [-1.0, 1.0] to the environment's real action range.
        action = action_mapping(bounded_action, env.action_space.low[0],
                                env.action_space.high[0])

        next_obs, reward, done, info = env.step(action)
        rpm.append(obs, action, REWARD_SCALE * reward, next_obs, done)

        # Learn only once the replay memory holds enough warm-up data.
        if rpm.size() > MEMORY_WARMUP_SIZE:
            sample_obs, sample_act, sample_reward, sample_next_obs, \
                    sample_terminal = rpm.sample_batch(BATCH_SIZE)
            agent.learn(sample_obs, sample_act, sample_reward,
                        sample_next_obs, sample_terminal)

        obs = next_obs
        episode_reward += reward
    return episode_reward, step_count

# Evaluate the agent: run 5 episodes and average their total rewards.
def evaluate(env, agent, render=False):
    """Return the mean total reward over 5 noise-free episodes.

    Args:
        env: environment exposing reset(), step(), render() and action_space.
        agent: agent exposing predict().
        render: when True, env.render() is called after every step.
    """
    episode_returns = []
    for _ in range(5):
        obs = env.reset()
        episode_return = 0
        done = False
        while not done:
            policy_input = np.expand_dims(obs, axis=0).astype('float32')
            raw_action = np.squeeze(agent.predict(policy_input))
            # The action must lie in [-1.0, 1.0] before it is mapped.
            bounded_action = np.clip(raw_action, -1.0, 1.0)
            action = action_mapping(bounded_action, env.action_space.low[0],
                                    env.action_space.high[0])

            obs, reward, done, info = env.step(action)
            episode_return += reward

            if render:
                env.render()
        episode_returns.append(episode_return)
    return np.mean(episode_returns)

# Step 5 创建环境和Agent，创建经验池，启动训练，定期保存模型

In [None]:
from parl.utils.scheduler import LinearDecayScheduler
def main(ACTOR_LR=0.0002, CRITIC_LR=0.001, model_tag=1, load_model=False, go_steps=1, f_best='',
         stop_below_reward=9000, patience=2):
    """Train a DDPG agent on the Quadrotor hovering task with periodic evaluation.

    Args:
        ACTOR_LR: initial actor learning rate.
        CRITIC_LR: initial critic learning rate.
        model_tag: network variant passed to QuadrotorModel (1 or other).
        load_model: when truthy, restore the agent from ``f_best`` first.
        go_steps: number of steps to advance both learning-rate schedulers by
            after a checkpoint has been restored.
        f_best: path of the checkpoint to restore.
        stop_below_reward: training stops as soon as an evaluation reward is
            below this value. With the default of 9000 a run that starts from
            scratch normally stops at its first evaluation, which happens
            right after the first episode; pass ``float('-inf')`` to disable.
        patience: training stops once this many consecutive evaluations have
            failed to improve on the previous one.
    """
    # Create the quadrotor environment.
    env = make_env("Quadrotor", task="hovering_control")
    env.reset()
    obs_dim = env.observation_space.shape[0]
    act_dim = env.action_space.shape[0]

    # Build the agent by nesting QuadrotorModel, DDPG and QuadrotorAgent.
    model = QuadrotorModel(act_dim, model_tag=model_tag)
    algorithm = DDPG(model, gamma=GAMMA, tau=TAU, actor_lr=ACTOR_LR, critic_lr=CRITIC_LR)
    agent = QuadrotorAgent(algorithm, obs_dim, act_dim)

    # Replay memory shipped with parl.utils.
    rpm = ReplayMemory(int(MEMORY_SIZE), obs_dim, act_dim)

    logger.info(f'Params: ACTOR_LR={ACTOR_LR}, CRITIC_LR={CRITIC_LR}, model_tag={model_tag}, Pid {os.getpid()}')
    test_flag = 0
    total_steps = 0
    early_stop = 0
    last_reward = -1e9
    actor_lr_scheduler = LinearDecayScheduler(ACTOR_LR, int(1e7))
    critic_lr_scheduler = LinearDecayScheduler(CRITIC_LR, int(1e7))

    if load_model:
        if os.path.exists(f_best):
            agent.restore(f_best)
            actor_lr_scheduler.step(step_num=go_steps)
            critic_lr_scheduler.step(step_num=go_steps)
            logger.info(f'load model success. Pid {os.getpid()}')
        else:
            # Make the fallback visible instead of silently training from scratch.
            logger.info(f'Checkpoint {f_best!r} not found, training from scratch. Pid {os.getpid()}')

    while total_steps < TRAIN_TOTAL_STEPS:
        _, steps = run_episode(env, agent, rpm)
        total_steps += steps

        # Linearly decayed learning rates, floored at 1% of the initial value.
        # NOTE(review): the optimizers read these attributes when the agent's
        # programs are built; assigning them here presumably does not change
        # the rate used by the already-built learn program -- confirm.
        agent.alg.actor_lr = max(actor_lr_scheduler.step(step_num=steps), ACTOR_LR/100)
        agent.alg.critic_lr = max(critic_lr_scheduler.step(step_num=steps), CRITIC_LR/100)

        # Evaluate each time total_steps crosses a multiple of TEST_EVERY_STEPS.
        if total_steps // TEST_EVERY_STEPS >= test_flag:
            while total_steps // TEST_EVERY_STEPS >= test_flag:
                test_flag += 1

            evaluate_reward = evaluate(env, agent)
            logger.info('Steps {}, Test reward: {}, Pid {}'.format(total_steps, evaluate_reward, os.getpid()))

            # Save a checkpoint for every evaluation with a positive reward;
            # the file name encodes step count, reward and hyperparameters.
            if evaluate_reward > 0:
                ckpt = f'model_dir/steps_{total_steps}_evaluate_reward_{int(evaluate_reward)}_ACTOR_LR_{ACTOR_LR}_CRITIC_LR_{CRITIC_LR}_model_tag_{model_tag}.ckpt'
                # Make sure the checkpoint directory exists before saving.
                os.makedirs('model_dir', exist_ok=True)
                agent.save(ckpt)
                logger.info(f'Current actor_lr: {agent.alg.actor_lr}  critic_lr: {agent.alg.critic_lr}  Pid {os.getpid()} ckpt {ckpt}')

            # Count consecutive evaluations that did not beat the previous one.
            if evaluate_reward > last_reward:
                early_stop = 0
            else:
                early_stop += 1
            last_reward = evaluate_reward

            # Early stop: reward under the threshold, or no improvement for
            # `patience` evaluations in a row.
            if evaluate_reward < stop_below_reward or early_stop >= patience:
                logger.info(f'No good results, stop training. Pid {os.getpid()}')
                break


In [74]:
def parallel():
    """Run main() for every learning-rate / model combination in a 4-worker pool.

    The grid is 5 actor rates x 3 critic rates x 2 model tags. Exceptions
    raised inside a worker are printed by the parent instead of being
    silently dropped with the unused AsyncResult.
    """
    from multiprocessing import Pool

    def _report_failure(exc):
        # Called in the parent process when a submitted task raised.
        print('Subprocess failed: %r' % (exc,))

    print('Parent process %s.' % os.getpid())

    p = Pool(4)
    for ACTOR_LR in [0.0001, 0.0002, 0.0005, 0.001, 0.002]:
        for CRITIC_LR in [0.001, 0.005, 0.01]:
            for model_tag in [1, 2]:
                p.apply_async(main, args=(ACTOR_LR, CRITIC_LR, model_tag),
                              error_callback=_report_failure)

    print('Waiting for all subprocesses done...')
    p.close()
    p.join()
    print('All subprocesses done.')


def one(ACTOR_LR=0.0002, CRITIC_LR=0.001, model_tag=1, load_model=False, go_steps=1, f_best=''):
    """Run a single training job by forwarding every argument to main()."""
    main(
        ACTOR_LR=ACTOR_LR,
        CRITIC_LR=CRITIC_LR,
        model_tag=model_tag,
        load_model=load_model,
        go_steps=go_steps,
        f_best=f_best,
    )


def find_best():
    """Return the path of the saved checkpoint with the highest evaluation reward.

    Walks ``./model_dir`` and reads the reward from file names that contain
    ``reward_<int>_ACTOR``. Files whose names do not match (for example
    ``.DS_Store``) are skipped instead of aborting the search with a
    ValueError. Among equal rewards the file seen last wins.

    Returns:
        ``'model_dir/' + <file name>``, or just ``'model_dir/'`` when no
        matching file exists.
    """
    import re  # local import: not among the notebook's top-level imports

    reward_pattern = re.compile(r'reward_(-?\d+)_ACTOR')
    best = -1000000
    best_f = ''
    for _, _, files in os.walk('./model_dir'):
        for f in files:
            match = reward_pattern.search(f)
            if match is None:
                continue
            reward = int(match.group(1))
            if reward >= best:
                best = reward
                best_f = f
    return 'model_dir/' + best_f


def fine_tune(ACTOR_LR=0.0002, CRITIC_LR=0.005, episodes=10):
    """Repeatedly resume training from the current best checkpoint.

    Each of the ``episodes`` rounds looks up the best checkpoint again and
    starts one training run (model_tag 2) restored from it.
    """
    for _ in range(episodes):
        checkpoint = find_best()
        logger.info(f'Current best: {checkpoint}, finetune it...')
        run_options = dict(
            ACTOR_LR=ACTOR_LR,
            CRITIC_LR=CRITIC_LR,
            model_tag=2,
            load_model=True,
            go_steps=1,
            f_best=checkpoint,
        )
        one(**run_options)

In [76]:
from multiprocessing import Pool


def _report_failure(exc):
    # Called in the parent process when a submitted task raised; without it
    # the exception would stay hidden in the unused AsyncResult.
    print('Subprocess failed: %r' % (exc,))


print('Parent process %s.' % os.getpid())

# Two workers, each fine-tuning with a different pair of learning rates.
p = Pool(2)
p.apply_async(fine_tune, args=(0.0002, 0.005, 30), error_callback=_report_failure)
p.apply_async(fine_tune, args=(0.00002, 0.0005, 30), error_callback=_report_failure)

print('Waiting for all subprocesses done...')
p.close()
p.join()
print('All subprocesses done.')

Parent process 61.
[32m[06-21 15:41:52 MainThread @<ipython-input-74-b64c3768500b>:37][0m Current best: model_dir/steps_348_evaluate_reward_9451_ACTOR_LR_0.0002_CRITIC_LR_0.005_model_tag_2.ckpt, finetune it...
[32m[06-21 15:41:52 MainThread @<ipython-input-74-b64c3768500b>:37][0m Current best: model_dir/steps_348_evaluate_reward_9451_ACTOR_LR_0.0002_CRITIC_LR_0.005_model_tag_2.ckpt, finetune it...
Waiting for all subprocesses done...
[32m[06-21 15:41:53 MainThread @<ipython-input-69-6450d574507d>:26][0m Params: ACTOR_LR=2e-05, CRITIC_LR=0.0005, model_tag=2, Pid 1186
[32m[06-21 15:41:53 MainThread @<ipython-input-69-6450d574507d>:51][0m load model success. Pid 1186
[32m[06-21 15:41:53 MainThread @<ipython-input-69-6450d574507d>:26][0m Params: ACTOR_LR=0.0002, CRITIC_LR=0.005, model_tag=2, Pid 1185
[32m[06-21 15:41:53 MainThread @<ipython-input-69-6450d574507d>:51][0m load model success. Pid 1185
[32m[06-21 15:42:52 MainThread @<ipython-input-69-6450d574507d>:68][0m Steps 4

# 验收测评

In [77]:
######################################################################
######################################################################
#
# 7. 请选择你训练的最好的一次模型文件做评估
#
######################################################################
######################################################################

def test(total_steps, evaluate_reward, ACTOR_LR=0.0002, CRITIC_LR=0.001, model_tag=1, render=False):
    """Restore one saved checkpoint and log its evaluation reward.

    The checkpoint file name is rebuilt from the arguments, so they must
    match the values the file was saved with.

    Returns:
        -1 when the checkpoint file does not exist, otherwise None.
    """
    ckpt = f'model_dir/steps_{total_steps}_evaluate_reward_{int(evaluate_reward)}_ACTOR_LR_{ACTOR_LR}_CRITIC_LR_{CRITIC_LR}_model_tag_{model_tag}.ckpt'

    env = make_env("Quadrotor", task="hovering_control")
    env.reset()
    obs_dim, act_dim = env.observation_space.shape[0], env.action_space.shape[0]

    agent = QuadrotorAgent(
        DDPG(
            QuadrotorModel(act_dim, model_tag=model_tag),
            gamma=GAMMA,
            tau=TAU,
            actor_lr=ACTOR_LR,
            critic_lr=CRITIC_LR),
        obs_dim,
        act_dim)

    # Bail out early when there is nothing to load.
    if not os.path.exists(ckpt):
        logger.info(f'No Model file {ckpt}')
        return -1

    agent.restore(ckpt)
    logger.info(f'Test Model file {ckpt}')
    mean_reward = evaluate(env, agent, render=render)
    logger.info('Evaluate reward: {}'.format(mean_reward))


In [81]:
def test_best():
    """Evaluate the saved checkpoint whose file name carries the highest reward.

    Only files named
    ``steps_<n>_evaluate_reward_<r>_ACTOR_LR_<a>_CRITIC_LR_<c>_model_tag_<t>.<ext>``
    are considered, so stray files in ``./model_dir`` no longer abort the
    search. When no checkpoint is found a message is logged and nothing is
    evaluated, where the original raised an IndexError. Among equal rewards
    the file seen last wins.
    """
    import re  # local import: not among the notebook's top-level imports

    name_pattern = re.compile(
        r'^steps_([^_]+)_evaluate_reward_(-?\d+)'
        r'_ACTOR_LR_([^_]+)_CRITIC_LR_([^_]+)_model_tag_(\d+)\.')
    best = -1000000
    best_match = None
    for _, _, files in os.walk('./model_dir'):
        for f in files:
            match = name_pattern.match(f)
            if match is None:
                continue
            reward = int(match.group(2))
            if reward >= best:
                best = reward
                best_match = match

    if best_match is None:
        logger.info('No checkpoint found under ./model_dir')
        return

    steps, reward_text, actor_lr, critic_lr, model_tag = best_match.groups()
    # test() rebuilds the file name from these values, so they are passed in
    # the same form they were parsed from it.
    test(steps, reward_text, float(actor_lr), float(critic_lr), int(model_tag), render=False)

test_best()

[32m[06-21 16:48:32 MainThread @<ipython-input-77-466d854624d9>:22][0m Test Model file model_dir/steps_409_evaluate_reward_9487_ACTOR_LR_2e-05_CRITIC_LR_0.0005_model_tag_2.ckpt
[32m[06-21 16:49:23 MainThread @<ipython-input-77-466d854624d9>:27][0m Evaluate reward: 9348.914547714336


### 小结
#### 1. 对训练代码进行了一定优化
- 通过设置early_stop判断条件，对大量的参数组合进行快速验证
- 基于较优组合进行后续微调
- 修改predict函数，将各维动作输出向其均值收缩（与均值的偏差缩小为原来的0.1倍），使各个输出值彼此接近
```
    def predict(self, obs):
        obs = np.expand_dims(obs, axis=0)
        act = self.fluid_executor.run(
            self.pred_program, feed={'obs': obs},
            fetch_list=[self.pred_act])[0]
        # print(act)
        # 调整输出到均值附近
        act_mean = act.mean(axis=1)
        act = act_mean + (act - act_mean) * 0.1
        return act
```      
- 增加学习率线性衰减（使用`LinearDecayScheduler`，并以初始学习率的1/100作为下限）
```
        # 可以用 parl.utils.scheduler 中的 LinearDecayScheduler 修改学习率
        agent.alg.actor_lr = max(actor_lr_scheduler.step(step_num=steps), ACTOR_LR/100)
        agent.alg.critic_lr = max(critic_lr_scheduler.step(step_num=steps), CRITIC_LR/100)
```

#### 2. 对大量参数组合进行了实验
- 模型网络结构方面，实验了隐层为1层和2层fc两种，1层效果较差，最终选择了2层结构
- 按正态分布进行权重初始化，不过很难看出是否有明显效果
- 对初始学习率，最开始尝试时按以下组合进行了实验，最优组合为ACTOR_LR=0.0002，CRITIC_LR=0.005
```
    for ACTOR_LR in [0.0001, 0.0002, 0.0005, 0.001, 0.002]:  # 0.0002
        for CRITIC_LR in [0.001, 0.005, 0.01]:  # 0.001
```
之后进行微调，分别按(0.0002, 0.005)和(0.00002, 0.0005)两组学习率进行了尝试，感觉效果差不多，目前最优的模型是用(0.00002, 0.0005)微调得到的。

#### 3. 训练过程
- Baidu AIStudio
- Google Colab
- MacBook Pro
> 用各种机器进行了训练，对于不好的参数组合，全部100万步训练完成甚至都见不到reward为正的情况，对于目前定下的组合，大概训练了20万步后reward达到5000。
> 在39万步时，下载了中间结果，后来Colab超时断开连接了，所以后续是基于这个39万步的结果继续微调。微调波动也很大，前几万步结果还好，后续会突然变负，变负就终止。

#### 4. 最终结果
- 最优模型为 model_dir/steps_409_evaluate_reward_9487_ACTOR_LR_2e-05_CRITIC_LR_0.0005_model_tag_2.ckpt
- **Evaluate reward: 9348.914547714336**