# Step1 安装依赖

In [2]:
!pip uninstall -y parl  # Note: the parl version preinstalled on AIStudio is too old and tends to conflict with other libraries, so uninstall it first
!pip uninstall -y pandas scikit-learn # Tip: on AIStudio, uninstalling these two libraries before importing parl avoids warning messages; leaving them installed does not affect parl

!pip install gym
!pip install atari-py # Required dependency for the Gym Atari games; this assignment uses the Atari Pong environment
!pip install paddlepaddle==1.6.3
!pip install parl==1.3.1

# Note: the two red ERROR lines about paddlehub and visualdl in the install log are unrelated to parl and can be ignored

Uninstalling parl-1.1.2:
  Successfully uninstalled parl-1.1.2
Uninstalling pandas-0.23.4:
  Successfully uninstalled pandas-0.23.4
Uninstalling scikit-learn-0.20.0:
  Successfully uninstalled scikit-learn-0.20.0
Looking in indexes: https://pypi.mirrors.ustc.edu.cn/simple/
Looking in indexes: https://pypi.mirrors.ustc.edu.cn/simple/
Collecting atari-py
[?25l  Downloading https://mirrors.tuna.tsinghua.edu.cn/pypi/web/packages/8f/ba/1d22e9d2f332f07aaa57041f5dd569c2cb40a92bd6374a0b743ec3dfae97/atari_py-0.2.6-cp37-cp37m-manylinux1_x86_64.whl (2.8MB)
[K     |████████████████████████████████| 2.8MB 458kB/s eta 0:00:01
Installing collected packages: atari-py
Successfully installed atari-py-0.2.6
Looking in indexes: https://pypi.mirrors.ustc.edu.cn/simple/
Collecting paddlepaddle==1.6.3
[?25l  Downloading https://mirrors.tuna.tsinghua.edu.cn/pypi/web/packages/96/28/e72bebb3c9b3d98eb9b15d9f6d85150f3cbd63e695e59882ff9f04846686/paddlepaddle-1.6.3-cp37-cp37m-manylinux1_x86_64.whl (90.9MB)
[K  

In [3]:
# Check that the installed dependency versions are the expected ones
!pip list | grep paddlepaddle
!pip list | grep parl

paddlepaddle         1.6.3          
parl                 1.3.1          


# Step2 导入依赖

In [4]:
import os
import gym
import numpy as np

import paddle.fluid as fluid
import parl
from parl import layers
from parl.utils import logger

# Step3 设置超参数

In [5]:
######################################################################
######################################################################
#
# 1. Set the learning rate; try raising or lowering it and observe the effect
#
# NOTE(review): in the visible notebook the training call passes its learning
# rate as an explicit argument, so this constant does not appear to be read.
#
######################################################################
######################################################################

LEARNING_RATE = 0.004

# Step4 搭建Model、Algorithm、Agent架构
* `Agent`把产生的数据传给`algorithm`，`algorithm`根据`model`的模型结构计算出`Loss`，使用`SGD`或者其他优化器不断地优化。`PARL`的这种架构可以很方便地应用在各类深度强化学习问题中。

#### （1）Model
`Model`用来定义前向(`Forward`)网络，用户可以自由的定制自己的网络结构。

In [6]:
class Model(parl.Model):
    """Policy network: one hidden fully connected layer followed by a softmax layer."""

    def __init__(self, act_dim, hid_size, activation='tanh'):
        """Create the two fully connected layers.

        Args:
            act_dim: number of discrete actions; also the width of the output layer.
            hid_size: multiplier for the hidden layer, whose width is act_dim * hid_size.
            activation: activation name passed to the hidden layer (default 'tanh').
        """
        hidden_units = act_dim * hid_size

        # Layers are created in forward order: hidden layer first, output layer second.
        self.fc1 = layers.fc(size=hidden_units, act=activation)
        self.fc2 = layers.fc(size=act_dim, act='softmax')

    def forward(self, obs):
        """Map an observation batch to action probabilities.

        The original author's note says the model instance can also be called
        directly as model(obs) -- presumably provided by parl.Model; confirm.
        """
        hidden = self.fc1(obs)
        return self.fc2(hidden)


#### （2）Algorithm
* `Algorithm` 定义了具体的算法来更新前向网络(`Model`)，也就是通过定义损失函数来更新`Model`，和算法相关的计算都放在`algorithm`中。

In [7]:
from parl.algorithms import PolicyGradient # 直接从parl库中导入PolicyGradient算法，无需重复写算法

#### （3）Agent
* `Agent`负责算法与环境的交互，在交互过程中把生成的数据提供给`Algorithm`来更新模型(`Model`)，数据的预处理流程也一般定义在这里。

In [8]:
class Agent(parl.Agent):
    """Bridges the environment and the algorithm.

    Builds one fluid program for action prediction and one for policy updates,
    and converts numpy inputs into the feeds those programs expect.
    """

    def __init__(self, algorithm, obs_dim, act_dim):
        """Store the observation/action sizes, then initialise the base agent.

        Args:
            algorithm: the algorithm object handed to parl.Agent.
            obs_dim: length of the flattened observation vector.
            act_dim: number of discrete actions.
        """
        # The dimensions are assigned before the base constructor runs;
        # presumably parl.Agent.__init__ triggers build_program(), which reads
        # self.obs_dim -- confirm against the PARL version in use.
        self.obs_dim = obs_dim
        self.act_dim = act_dim
        super(Agent, self).__init__(algorithm)

    def build_program(self):
        """Define the prediction and learning programs with their input variables."""
        self.pred_program = fluid.Program()
        self.learn_program = fluid.Program()

        # Graph used to predict action probabilities from an observation.
        with fluid.program_guard(self.pred_program):
            obs = layers.data(
                name='obs', shape=[self.obs_dim], dtype='float32')
            self.act_prob = self.alg.predict(obs)

        # Graph used to update the policy network from (obs, act, reward) batches.
        with fluid.program_guard(self.learn_program):
            obs = layers.data(
                name='obs', shape=[self.obs_dim], dtype='float32')
            act = layers.data(name='act', shape=[1], dtype='int64')
            reward = layers.data(name='reward', shape=[], dtype='float32')
            self.cost = self.alg.learn(obs, act, reward)

    def _action_probabilities(self, obs):
        """Run the prediction program on one observation and return its action probabilities."""
        batch = np.expand_dims(obs, axis=0)  # add a leading batch axis
        act_prob = self.fluid_executor.run(
            self.pred_program,
            feed={'obs': batch.astype('float32')},
            fetch_list=[self.act_prob])[0]
        return np.squeeze(act_prob, axis=0)  # drop the batch axis again

    def sample(self, obs):
        """Draw a random action according to the predicted action probabilities."""
        act_prob = self._action_probabilities(obs)
        return np.random.choice(range(self.act_dim), p=act_prob)

    def predict(self, obs):
        """Return the action with the highest predicted probability."""
        return np.argmax(self._action_probabilities(obs))

    def learn(self, obs, act, reward):
        """Run one update of the learning program and return the fetched cost.

        Args:
            obs: numpy array of observations (cast to float32 before feeding).
            act: numpy array of actions; a trailing axis is appended and the
                values are cast to int64.
            reward: numpy array of per-step reward signals (cast to float32).
        """
        act = np.expand_dims(act, axis=-1)
        feed = {
            'obs': obs.astype('float32'),
            'act': act.astype('int64'),
            'reward': reward.astype('float32')
        }
        cost = self.fluid_executor.run(
            self.learn_program, feed=feed, fetch_list=[self.cost])[0]
        return cost


### Step 5 Training && Test（训练&&测试）

In [9]:
def run_episode(env, agent):
    """Play one episode with sampled actions and record the trajectory.

    Returns:
        (obs_list, action_list, reward_list): the preprocessed observations,
        the sampled actions and the rewards, one entry per step.
    """
    obs_list, action_list, reward_list = [], [], []
    frame = env.reset()
    done = False
    while not done:
        # preprocess() is documented to flatten the frame into an 80x80 = 6400 vector.
        features = preprocess(frame)
        obs_list.append(features)

        action = agent.sample(features)  # stochastic action for exploration
        action_list.append(action)

        frame, reward, done, _info = env.step(action)
        reward_list.append(reward)
    return obs_list, action_list, reward_list


# Evaluate the agent: play 5 episodes and average the episode rewards.
def evaluate(env, agent, render=False):
    """Return the mean total reward over 5 episodes played with agent.predict.

    Args:
        env: environment exposing reset(), step() and render().
        agent: object whose predict(obs) returns the action to take.
        render: when True, env.render() is called after every step.
    """
    episode_returns = []
    for _ in range(5):
        frame = env.reset()
        total = 0
        finished = False
        while not finished:
            # preprocess() is documented to flatten the frame into an 80x80 = 6400 vector.
            action = agent.predict(preprocess(frame))  # greedy action
            frame, reward, finished, _ = env.step(action)
            total += reward
            if render:
                env.render()
        episode_returns.append(total)
    return np.mean(episode_returns)

### Step6 创建环境和Agent，启动训练，保存模型

In [10]:

# Pong frame preprocessing
def preprocess(image):
    """Turn a 210x160x3 uint8 frame into a flat 6400-element (80x80) float vector.

    The frame is cropped to rows 35..194, downsampled by 2 in both directions
    using the first colour channel, and binarised: the two background values
    (144 and 109) become 0 and every other non-zero pixel becomes 1.

    Args:
        image: numpy array of shape (210, 160, 3). It is not modified.

    Returns:
        A 1-D float64 numpy array of length 6400.
    """
    # Crop and downsample in one slice, then copy: basic slicing returns a
    # view, and the in-place masking below would otherwise overwrite the
    # caller's frame.
    frame = image[35:195:2, ::2, 0].copy()
    frame[frame == 144] = 0  # erase background (type 1)
    frame[frame == 109] = 0  # erase background (type 2)
    frame[frame != 0] = 1  # everything that is left (paddles, ball) becomes 1
    # np.float was only an alias of the builtin float and has been removed
    # from NumPy; float64 is the same dtype.
    return frame.astype(np.float64).ravel()


# From the per-step reward list of one episode, compute the return G_t of every step.
def calc_reward_to_go(reward_list, gamma=0.99):
    """Compute normalised discounted returns for one episode.

    Args:
        reward_list: sequence of per-step rewards (ints or floats).
        gamma: discount factor.

    Returns:
        A float64 numpy array with one entry per step holding
        G_t = r_t + gamma * G_{t+1}, shifted to zero mean and, when the
        standard deviation is non-zero, scaled to unit standard deviation.
        An empty input yields an empty array.
    """
    # Force a float array: with integer rewards the discounted values would be
    # truncated and the in-place normalisation below would fail.
    returns = np.array(reward_list, dtype=np.float64)
    if returns.size == 0:
        return returns
    for i in range(len(returns) - 2, -1, -1):
        # G_t = r_t + gamma * r_{t+1} + ... = r_t + gamma * G_{t+1}
        returns[i] += gamma * returns[i + 1]
    # Normalise the episode returns.
    returns -= np.mean(returns)
    std = np.std(returns)
    if std > 0:  # constant returns (e.g. a single step) would otherwise produce NaN
        returns /= std
    return returns


def main(LEARNING_RATE, hid_size, activation, render=False):
    """Train a policy-gradient agent on Pong-v0 and checkpoint its parameters.

    Runs 3000 training episodes. Every 100 episodes the agent is evaluated and,
    if the evaluation reward beats the best seen in this run, saved to the
    "best" checkpoint. The final parameters are saved to a separate checkpoint.
    If a "best" checkpoint for the same hyper-parameters already exists it is
    loaded before training starts.

    Args:
        LEARNING_RATE: learning rate passed to PolicyGradient; also part of
            the checkpoint file names.
        hid_size: hidden-layer multiplier passed to Model.
        activation: hidden-layer activation name passed to Model.
        render: forwarded to evaluate(); when True evaluation steps are rendered.
    """
    best_ckpt = f'./model_best_lr_{LEARNING_RATE}_hids_{hid_size}_{activation}.ckpt'
    final_ckpt = f'./model_lr_{LEARNING_RATE}_hids_{hid_size}_{activation}.ckpt'

    # Create the environment; it is closed in the finally clause so the
    # emulator (and any render window) is released even if training fails.
    env = gym.make('Pong-v0')
    try:
        obs_dim = 80 * 80  # length of the vector produced by preprocess()
        act_dim = env.action_space.n
        logger.info('obs_dim {}, act_dim {}, LEARNING_RATE {}, hid_size {}, activation {}, Pid {}'.format(
            obs_dim, act_dim, LEARNING_RATE, hid_size, activation, os.getpid()))

        # Build the agent by nesting Model -> PolicyGradient -> Agent.
        model = Model(act_dim=act_dim, hid_size=hid_size, activation=activation)
        alg = PolicyGradient(model, lr=LEARNING_RATE)
        agent = Agent(alg, obs_dim=obs_dim, act_dim=act_dim)

        # Resume from the best checkpoint of a previous run, if there is one.
        if os.path.exists(best_ckpt):
            agent.restore(best_ckpt)
            logger.info(f'Successfully loaded model {best_ckpt}')

        # NOTE(review): best_reward restarts at -100 even after a resume, so
        # the first evaluation of this run overwrites the loaded best
        # checkpoint whatever its score was.
        best_reward = -100
        for i in range(3000):
            obs_list, action_list, reward_list = run_episode(env, agent)

            # Log every 10 episodes at the start and end of the run, and every
            # 100 episodes in between (100 < i < 2900).
            log_every = 100 if 100 < i < 2900 else 10
            if i % log_every == 0:
                logger.info("Train Episode {}, Reward Sum {}. Pid {}".format(i, sum(reward_list), os.getpid()))

            batch_obs = np.array(obs_list)
            batch_action = np.array(action_list)
            batch_reward = calc_reward_to_go(reward_list)

            agent.learn(batch_obs, batch_action, batch_reward)
            if (i + 1) % 100 == 0:
                total_reward = evaluate(env, agent, render=render)
                logger.info('Episode {}, Test reward: {}, Pid {}'.format(i + 1, total_reward, os.getpid()))
                if total_reward > best_reward:
                    best_reward = total_reward
                    agent.save(best_ckpt)

        # Save the final parameters (separate from the best checkpoint).
        agent.save(final_ckpt)
    finally:
        env.close()


In [11]:
def parallel():
    """Train several hyper-parameter combinations concurrently in a process pool.

    Each (learning rate, hidden-size multiplier, activation) combination is
    submitted to main() in a pool of 4 worker processes. According to the
    original author this only works in CPU mode.

    Exceptions raised inside a worker are reported after all workers finish;
    previously they were discarded silently because the async results were
    never retrieved.
    """
    from multiprocessing import Pool
    print('Parent process %s.' % os.getpid())
    pool = Pool(4)
    pending = []
    for lr in [0.0015, 0.0025]:
        for hs in [10]:
            for act in ['tanh', 'relu']:
                handle = pool.apply_async(main, args=(lr, hs, act))
                pending.append(((lr, hs, act), handle))
    print('Waiting for all subprocesses done...')
    pool.close()
    pool.join()
    # All tasks are finished here, so get() returns at once; it re-raises any
    # exception the worker hit, which is reported instead of being lost.
    for job_args, handle in pending:
        try:
            handle.get()
        except Exception as exc:
            print('Training run %s failed: %r' % (job_args, exc))
    print('All subprocesses done.')


def one(lr, hs, act):
    """Train a single configuration in the current process, without rendering.

    Args:
        lr: learning rate.
        hs: hidden-size multiplier.
        act: hidden-layer activation name.
    """
    main(LEARNING_RATE=lr, hid_size=hs, activation=act, render=False)

In [12]:
# Train one configuration: learning rate 0.004, hidden-size multiplier 20, 'relu' activation.
# Uncomment parallel() instead to train several configurations at once.
# parallel()
one(0.004, 20, 'relu')

[32m[06-20 16:15:51 MainThread @<ipython-input-10-486e4fc4cd66>:31][0m obs_dim 6400, act_dim 6, LEARNING_RATE 0.004, hid_size 20, activation relu, Pid 60
[32m[06-20 16:15:51 MainThread @machine_info.py:88][0m Cannot find available GPU devices, using CPU now.
[32m[06-20 16:15:51 MainThread @machine_info.py:88][0m Cannot find available GPU devices, using CPU now.
[32m[06-20 16:15:51 MainThread @<ipython-input-10-486e4fc4cd66>:48][0m Successfully loaded model ./model_best_lr_0.004_hids_20_relu.ckpt
[32m[06-20 16:16:00 MainThread @<ipython-input-10-486e4fc4cd66>:57][0m Train Episode 0, Reward Sum 6.0. Pid 60
[32m[06-20 16:17:31 MainThread @<ipython-input-10-486e4fc4cd66>:57][0m Train Episode 10, Reward Sum 15.0. Pid 60
[32m[06-20 16:19:01 MainThread @<ipython-input-10-486e4fc4cd66>:57][0m Train Episode 20, Reward Sum 9.0. Pid 60
[32m[06-20 16:20:33 MainThread @<ipython-input-10-486e4fc4cd66>:57][0m Train Episode 30, Reward Sum -4.0. Pid 60
[32m[06-20 16:22:02 MainThread @<

In [20]:
def test(LEARNING_RATE, hid_size, activation, render=False, gamma=0.99):
    """Evaluate the final checkpoint saved for one hyper-parameter combination.

    Args:
        LEARNING_RATE: learning rate used in training; selects the checkpoint
            file and is passed to PolicyGradient.
        hid_size: hidden-layer multiplier passed to Model; selects the checkpoint file.
        activation: hidden-layer activation name; selects the checkpoint file.
        render: forwarded to evaluate(); when True evaluation steps are rendered.
        gamma: only included in the log message.

    Returns:
        The mean evaluation reward, or -21 when no checkpoint file exists
        for the given combination.
    """
    ckpt = f'./model_lr_{LEARNING_RATE}_hids_{hid_size}_{activation}.ckpt'

    # Create the environment; closed in the finally clause on every path.
    env = gym.make('Pong-v0')
    try:
        obs_dim = 80 * 80
        act_dim = env.action_space.n
        model = Model(act_dim=act_dim, hid_size=hid_size, activation=activation)
        alg = PolicyGradient(model, lr=LEARNING_RATE)
        agent = Agent(alg, obs_dim=obs_dim, act_dim=act_dim)

        if not os.path.exists(ckpt):
            logger.info(
                'No model for obs_dim {}, act_dim {}, LEARNING_RATE {}, hid_size {}, activation {}, gamma {}'.format(
                    obs_dim, act_dim, LEARNING_RATE, hid_size, activation, gamma))
            return -21

        # Load the saved parameters and evaluate them.
        agent.restore(ckpt)
        total_reward = evaluate(env, agent, render=render)
        logger.info(
            'obs_dim {}, act_dim {}, LEARNING_RATE {}, hid_size {}, activation {}, gamma {}, Test reward: {}'.format(
                obs_dim, act_dim, LEARNING_RATE, hid_size, activation, gamma, total_reward))
        # Return the score on success as well, matching the -21 failure path.
        return total_reward
    finally:
        env.close()

# Evaluate the final checkpoint for learning rate 0.004, hidden-size multiplier 20, 'relu' activation.
test(0.004, 20, 'relu')

[32m[06-21 00:06:45 MainThread @machine_info.py:88][0m Cannot find available GPU devices, using CPU now.
[32m[06-21 00:06:45 MainThread @machine_info.py:88][0m Cannot find available GPU devices, using CPU now.
[32m[06-21 00:07:21 MainThread @<ipython-input-20-401b1b14b375>:25][0m obs_dim 6400, act_dim 6, LEARNING_RATE 0.004, hid_size 20, activation relu, gamma 0.99, Test reward: 12.2


### 小结
#### 1. 对训练脚本进行了一定优化
- 增加多进程并行训练函数，充分利用机器多核，提高训练速度；不过只适用于CPU版，GPU版会出错退出
- 增加最优模型保存策略

#### 2. 对大量参数组合进行了实验
- 模型网络结构方面，实验了隐层为1层和2层fc两种，2层训练太慢，最终选择了1层结构
- 对于隐层节点数，实验了10倍act_dim和20倍两种，20倍的效果更好
- 对于激活函数，实验了tanh, sigmoid和relu, 发现tanh和relu较好，但relu对学习率的敏感度更低
- 对于Gamma，短暂实验了0.9，0.99，0.995，由于一开始的实验结果都不好，后来固定为0.99
- 对学习率，实验了[0.001, 0.002, 0.0025, 0.003, 0.004, 0.005, 0.01], 结果 relu + 0.004效果最好

#### 3. 训练机器
- Baidu AIStudio
- Google Colab
- MacBook Pro 

一开始没加入最优模型保存策略，也没提前终止不可行方案，各处攒资源大概用了100 CPU核时才发现本笔记版本中的方案，
大概训练了1000Episodes，然后将中间的最优模型上传到这里，继续进行训练，所以现在的模型实际训练了大概4000个Episodes。

#### 4. 最终结果
- 训练时最后一个Episode的 Test Reward = 15.2
- 单独加载最后保存的模型进行多次实验，基本能够稳定使得 Test Reward > 10.0
