# 使用DDPG解决四轴飞行器悬停任务

In [18]:
import os
import pickle
import numpy as np

import parl
from paddle import fluid
from parl import layers
from parl.core.fluid.plutils import fetch_value, set_value  # PARL的helper functions，帮助加载或保存weights
from parl.utils import logger, action_mapping, ReplayMemory  # TODO: check whether tensorboard works in notebook
from rlschool import make_env  # RLSchool 创建飞行器环境

## 创建飞行器环境

In [3]:
# Build the RLSchool quadrotor simulator for the hovering-control task and
# reset it once before reading its spaces.
env = make_env("Quadrotor", task="hovering_control")
env.reset()

# First entry of each space's shape: the sizes used for the network input
# (observation) and output (action).
obs_shape = env.observation_space.shape
act_shape = env.action_space.shape
obs_dim, act_dim = obs_shape[0], act_shape[0]

## Model

In [4]:
class ActorModel(parl.Model):
    """Policy network: two tanh hidden layers and a tanh output layer."""

    def __init__(self, act_dim):
        """Create the three fully connected layers.

        Args:
            act_dim: Width of the output layer.
        """
        first_width = 64
        second_width = 64
        self.fc1 = layers.fc(size=first_width, act='tanh')
        self.fc2 = layers.fc(size=second_width, act='tanh')
        # The tanh activation bounds every output component to [-1, 1].
        self.fc3 = layers.fc(size=act_dim, act='tanh')

    def policy(self, obs):
        """Return the output of fc3(fc2(fc1(obs)))."""
        return self.fc3(self.fc2(self.fc1(obs)))

In [5]:
class CriticModel(parl.Model):
    """Q network that scores an (observation, action) pair."""

    def __init__(self):
        """Create two tanh hidden layers and a linear single-unit output."""
        first_width = 64
        second_width = 64
        self.fc1 = layers.fc(size=first_width, act='tanh')
        self.fc2 = layers.fc(size=second_width, act='tanh')
        self.fc3 = layers.fc(size=1, act=None)

    def value(self, obs, act):
        """Return the Q estimate for ``obs`` and ``act``.

        Args:
            obs: Observation input, passed through fc1 on its own.
            act: Action input, concatenated with the fc1 output on axis 1.

        Returns:
            The fc3 output with axis 1 squeezed away.
        """
        obs_features = self.fc1(obs)
        # The action enters the network after the first layer.
        joined = layers.concat([obs_features, act], axis=1)
        q_value = self.fc3(self.fc2(joined))
        # fc3 has a single unit, so axis 1 has size 1 and can be dropped.
        return layers.squeeze(q_value, axes=[1])

In [6]:
class QuadrotorModel(parl.Model):
    """Bundles an ActorModel and a CriticModel behind a single model."""

    def __init__(self, act_dim):
        """Create the actor (with ``act_dim`` outputs) and the critic."""
        self.actor_model = ActorModel(act_dim)
        self.critic_model = CriticModel()

    def policy(self, obs):
        """Delegate to the actor's ``policy``."""
        actor = self.actor_model
        return actor.policy(obs)

    def value(self, obs, act):
        """Delegate to the critic's ``value``."""
        critic = self.critic_model
        return critic.value(obs, act)

    def get_actor_params(self):
        """Return ``parameters()`` of the actor sub-model only."""
        actor = self.actor_model
        return actor.parameters()

    def load(self, ckpt, use_gpu=False, actor_only=False):
        """Restore weights from a pickle file written by ``save``.

        Args:
            ckpt: Path of the checkpoint file.
            use_gpu: Forwarded unchanged to ``set_value``.
            actor_only: When true, only the actor's parameters are restored.

        Raises:
            KeyError: If the file has no entry for a requested parameter.
        """
        # NOTE(review): pickle.load can execute arbitrary code embedded in the
        # file; only load checkpoints that come from a trusted source.
        with open(ckpt, 'rb') as ckpt_file:
            stored = pickle.load(ckpt_file)

        targets = self.get_actor_params() if actor_only else self.parameters()
        for param in targets:
            set_value(param, stored[param], use_gpu)

    def save(self, ckpt):
        """Pickle a mapping from every entry of ``parameters()`` to its value.

        Args:
            ckpt: Path of the checkpoint file to write.
        """
        snapshot = {param: fetch_value(param) for param in self.parameters()}
        with open(ckpt, 'wb') as ckpt_file:
            pickle.dump(snapshot, ckpt_file)

In [7]:
# Combined actor/critic model sized for the environment's action dimension.
model = QuadrotorModel(act_dim=act_dim)

## Algorithm

设置DDPG超参以及训练设置

In [8]:
# DDPG hyperparameters and training schedule, looked up by key elsewhere in
# this notebook.
hparams = dict(
    # Passed to parl.algorithms.DDPG.
    gamma=0.99,
    tau=0.001,
    actor_lr=0.0002,
    critic_lr=0.001,
    # Replay memory capacity (cast to int where the memory is created).
    memory_size=1e6,
    # Factor applied to each reward before it is stored in replay memory.
    reward_scale=0.01,
    # Learning starts once the replay memory holds more than this many items.
    rpm_init_size=1e4,
    # Number of transitions sampled for each learning step.
    batch_size=256,
    # Training stops once this many environment steps have been taken.
    train_total_steps=1e6,
    # Interval, in training steps, between evaluation runs.
    test_every_steps=1e4,
)

In [9]:
# Forward the four DDPG-specific hyperparameters under their own names.
ddpg_kwargs = {
    key: hparams[key] for key in ('gamma', 'tau', 'actor_lr', 'critic_lr')
}
algorithm = parl.algorithms.DDPG(model, **ddpg_kwargs)

## Agent

In [10]:
class QuadrotorAgent(parl.Agent):
    """Agent holding the prediction and learning programs for an algorithm."""

    def __init__(self, algorithm, obs_dim, act_dim=4):
        """Store the dimensions, initialise the base agent, sync the target.

        Args:
            algorithm: Algorithm object handed to ``parl.Agent``.
            obs_dim: Observation size; must be an ``int``.
            act_dim: Action size; must be an ``int``.
        """
        assert isinstance(obs_dim, int)
        assert isinstance(act_dim, int)
        # build_program reads both attributes, so they are set before the base
        # constructor runs (presumably the base constructor invokes
        # build_program -- confirm against parl.Agent).
        self.obs_dim, self.act_dim = obs_dim, act_dim
        super(QuadrotorAgent, self).__init__(algorithm)

        # At start-up the target model is synchronised completely (decay=0).
        self.alg.sync_target(decay=0)

    def build_program(self):
        """Define the prediction program and the learning program."""
        self.pred_program = fluid.Program()
        self.learn_program = fluid.Program()

        # Prediction: observation in, action out.
        with fluid.program_guard(self.pred_program):
            pred_obs = layers.data(
                name='obs', shape=[self.obs_dim], dtype='float32')
            self.pred_act = self.alg.predict(pred_obs)

        # Learning: one transition batch in; the second value returned by
        # alg.learn is kept so it can be fetched as the critic cost.
        with fluid.program_guard(self.learn_program):
            obs_in = layers.data(
                name='obs', shape=[self.obs_dim], dtype='float32')
            act_in = layers.data(
                name='act', shape=[self.act_dim], dtype='float32')
            reward_in = layers.data(name='reward', shape=[], dtype='float32')
            next_obs_in = layers.data(
                name='next_obs', shape=[self.obs_dim], dtype='float32')
            terminal_in = layers.data(name='terminal', shape=[], dtype='bool')
            learn_outputs = self.alg.learn(obs_in, act_in, reward_in,
                                           next_obs_in, terminal_in)
            _, self.critic_cost = learn_outputs

    def predict(self, obs):
        """Run the prediction program on ``obs`` and return the action.

        A leading axis is added to ``obs`` before it is fed.
        NOTE(review): the episode runners in this notebook also add a batch
        axis before calling this method -- confirm the extra axis is intended.
        """
        batched = np.expand_dims(obs, axis=0)
        fetched = self.fluid_executor.run(
            self.pred_program,
            feed={'obs': batched},
            fetch_list=[self.pred_act])
        return fetched[0]

    def learn(self, obs, act, reward, next_obs, terminal):
        """Run one learning step, then sync the target model.

        Args:
            obs, act, reward, next_obs, terminal: Batch arrays fed to the
                learning program under the inputs of the same name.

        Returns:
            The fetched critic cost.
        """
        feed = dict(
            obs=obs,
            act=act,
            reward=reward,
            next_obs=next_obs,
            terminal=terminal,
        )
        fetched = self.fluid_executor.run(
            self.learn_program, feed=feed, fetch_list=[self.critic_cost])
        critic_cost = fetched[0]
        # The target model is synchronised after every learning step, using
        # the algorithm's default decay.
        self.alg.sync_target()
        return critic_cost

In [11]:
# Agent wrapping the DDPG algorithm, sized from the environment's spaces.
agent = QuadrotorAgent(algorithm, obs_dim=obs_dim, act_dim=act_dim)

[06-05 14:26:44 MainThread @machine_info.py:82] nvidia-smi -L found gpu count: 8
[06-05 14:26:44 MainThread @machine_info.py:82] nvidia-smi -L found gpu count: 8
[06-05 14:26:52 MainThread @machine_info.py:82] nvidia-smi -L found gpu count: 8


## Replay Memory

In [13]:
# 'memory_size' is written as a float (1e6), so convert it to an integer
# before handing it to ReplayMemory.
memory_capacity = int(hparams['memory_size'])
rpm = ReplayMemory(memory_capacity, obs_dim, act_dim)

## Evaluation

In [14]:
def run_evaluate_episode(env, agent):
    """Run one episode with the agent's prediction and no exploration noise.

    Args:
        env: Environment exposing ``reset``, ``step`` and ``action_space``.
        agent: Object whose ``predict`` returns an action for an observation.

    Returns:
        A ``(total_reward, steps)`` tuple: the sum of the rewards returned by
        ``env.step`` and the number of steps taken.
    """
    episode_return = 0
    step_count = 0
    obs = env.reset()
    done = False
    while not done:
        net_input = np.expand_dims(obs, axis=0).astype('float32')
        raw_action = np.squeeze(agent.predict(net_input))
        # Rescale the prediction to the bounds of the first action component.
        env_action = action_mapping(raw_action, env.action_space.low[0],
                                    env.action_space.high[0])

        obs, reward, done, _ = env.step(env_action)
        episode_return += reward
        step_count += 1
    return episode_return, step_count

## Training

In [15]:
def run_train_episode(env, agent, rpm):
    """Run one training episode with exploration noise and learning steps.

    Every transition is appended to ``rpm``. Once ``rpm`` holds more than
    ``hparams['rpm_init_size']`` transitions, one batch of
    ``hparams['batch_size']`` is sampled and learned from after each step.

    Args:
        env: Environment exposing ``reset``, ``step`` and ``action_space``.
        agent: Agent providing ``predict`` and ``learn``.
        rpm: Replay memory providing ``append``, ``size`` and
            ``sample_batch``.

    Returns:
        A ``(total_reward, steps)`` tuple: the sum of the unscaled rewards
        returned by ``env.step`` and the number of steps taken.
    """
    obs = env.reset()
    total_reward, steps = 0, 0
    while True:
        steps += 1
        batch_obs = np.expand_dims(obs, axis=0)
        action = agent.predict(batch_obs.astype('float32'))
        action = np.squeeze(action)

        # Add exploration noise, and clip to [-1.0, 1.0]
        action = np.clip(np.random.normal(action, 1.0), -1.0, 1.0)
        # Only the environment sees the rescaled action.
        env_action = action_mapping(action, env.action_space.low[0],
                                    env.action_space.high[0])

        next_obs, reward, done, info = env.step(env_action)
        # Store the normalized action, not the rescaled one: the actor's
        # output layer is a tanh, so the critic must be trained on actions in
        # the same [-1, 1] range it is later queried with.
        rpm.append(obs, action, hparams['reward_scale'] * reward, next_obs,
                   done)

        if rpm.size() > hparams['rpm_init_size']:
            batch_obs, batch_action, batch_reward, batch_next_obs, \
                    batch_terminal = rpm.sample_batch(hparams['batch_size'])
            agent.learn(batch_obs, batch_action, batch_reward,
                        batch_next_obs, batch_terminal)

        obs = next_obs
        total_reward += reward

        if done:
            break
    return total_reward, steps

In [16]:
# Directory that receives the periodic checkpoints. exist_ok=True makes the
# call safe to repeat and removes the race between checking for the directory
# and creating it.
model_dir = 'model_dir'
os.makedirs(model_dir, exist_ok=True)

In [None]:
# Main training loop: run training episodes until the step budget is used up,
# evaluating and checkpointing each time another 'test_every_steps' boundary
# has been crossed.
test_flag = 0
total_steps = 0
while total_steps < hparams['train_total_steps']:
    train_reward, steps = run_train_episode(env, agent, rpm)
    total_steps += steps
    logger.info('Steps: {} Reward: {}'.format(total_steps, train_reward))

    if total_steps // hparams['test_every_steps'] >= test_flag:
        # An episode can cross several boundaries at once; advance the flag
        # past all of them so only one evaluation is run.
        while total_steps // hparams['test_every_steps'] >= test_flag:
            test_flag += 1

        evaluate_reward, eval_steps = run_evaluate_episode(env, agent)
        logger.info('Steps {}, Eval reward: {}, Episode length: {}'.format(
            total_steps, evaluate_reward, eval_steps))

        # Write the checkpoint with model.save so that its format is the one
        # QuadrotorModel.load reads back.
        ckpt = 'steps_{}.ckpt'.format(total_steps)
        model.save(os.path.join(model_dir, ckpt))

[06-05 14:30:56 MainThread @<ipython-input-19-fb61c282226d>:6] Steps: 465 Reward: -5593.8415187394885
[06-05 14:30:59 MainThread @<ipython-input-19-fb61c282226d>:14] Steps 465, Eval reward: -2667.921910168716, Episode length: 210
[06-05 14:31:01 MainThread @<ipython-input-19-fb61c282226d>:6] Steps: 674 Reward: -2541.9234849159525
[06-05 14:31:03 MainThread @<ipython-input-19-fb61c282226d>:6] Steps: 877 Reward: -2576.3027437993796
[06-05 14:31:07 MainThread @<ipython-input-19-fb61c282226d>:6] Steps: 1229 Reward: -4461.984281137321
[06-05 14:31:09 MainThread @<ipython-input-19-fb61c282226d>:6] Steps: 1444 Reward: -2560.112552749142
[06-05 14:31:12 MainThread @<ipython-input-19-fb61c282226d>:6] Steps: 1747 Reward: -3784.297229037673
