<h1> Proximal Policy Optimization Algorithm</h1>
<h2><a href="https://arxiv.org/abs/1707.06347">Arxiv</a></h2>

In [1]:
import ray
from ray.rllib.evaluation import PolicyEvaluator, TorchPolicyGraph
from ray.rllib.utils.annotations import override
from flow.utils.registry import make_create_env

In [2]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.distributions import Normal
import numpy as np

In [20]:
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import numpy as np
from threading import Lock

try:
    import torch
    import torch.nn.functional as F
except ImportError:
    pass  # soft dep

from ray.rllib.evaluation import TorchPolicyGraph
from ray.rllib.agents.ppo.ppo_policy_graph import PPOPolicyGraph, PPOLoss
from ray.rllib.utils.annotations import override

In [4]:
ppo_policy_graph

<module 'ray.rllib.agents.ppo.ppo_policy_graph' from '/opt/conda/envs/flow/lib/python3.5/site-packages/ray/rllib/agents/ppo/ppo_policy_graph.py'>

In [6]:
ray.init(num_cpus=3, include_webui=False, ignore_reinit_error=True)

Process STDOUT and STDERR is being redirected to /tmp/ray/session_2019-04-04_07-04-58_1260/logs.
Waiting for redis server at 127.0.0.1:29382 to respond...
Waiting for redis server at 127.0.0.1:51379 to respond...
Starting the Plasma object store with 13.355121049 GB memory using /dev/shm.


{'node_ip_address': '169.237.32.118',
 'object_store_addresses': ['/tmp/ray/session_2019-04-04_07-04-58_1260/sockets/plasma_store'],
 'raylet_socket_names': ['/tmp/ray/session_2019-04-04_07-04-58_1260/sockets/raylet'],
 'redis_address': '169.237.32.118:29382',
 'webui_url': ''}

In [11]:
benchmark_name = 'figureeight0'
benchmark = __import__(
    "flow.benchmarks.%s" % benchmark_name, fromlist=["flow_params"])
flow_params = benchmark.flow_params
HORIZON = flow_params['env'].horizon

In [12]:
# Create Environment
num_envs = 3
create_env, env_name = make_create_env(params=flow_params, version=0)

In [13]:
env = create_env()
observation_space = env.observation_space
action_space = env.action_space

 Starting SUMO on port 54405


In [14]:
class ActorCritic(nn.Module):
    def __init__(self, obs_space, action_space, fcnet_hiddens, std=0.0):
        super(ActorCritic, self).__init__()
        num_inputs = obs_space.shape[0]
        num_outputs = action_space.shape[0]
        last_layer_size = num_inputs
        layers = []
        
        for size in fcnet_hiddens:
            layers.append(nn.Linear(last_layer_size, size))
            layers.append(nn.ReLU())
            last_layer_size = size
            
        layers.append(nn.Linear(fcnet_hiddens[-1], num_outputs))
        
        self.critic = nn.Sequential(*layers)
        self.actor = nn.Sequential(*layers)
        self.log_std = nn.Parameter(torch.ones(1, num_outputs) * std)
        self.apply(self.init_weights)
        
    def init_weights(self, m):
        if isinstance(m, nn.Linear):
            nn.init.normal_(m.weight, mean=0., std=0.1)
            nn.init.constant_(m.bias, 0.1)
       
    def forward(self, x):
        value = self.critic(x)
        mu    = self.actor(x)
        std   = self.log_std.exp().view(mu.shape)
        dist  = Normal(mu, std)
        return dist, value

    
class PPOLoss(nn.Module):
    def forward(self, model, state, action, log_prob, return_, advantage):
        dist, value = model(state)
        entropy = dist.entropy().mean()
        new_log_prob = dist.log_prob(action)

        ratio = (new_log_prob - log_prob).exp()
        surr1 = ratio * advantage
        surr2 = torch.clamp(ratio, 1.0 - clip_param, 1.0 + clip_param) * advantage

        actor_loss  = - torch.min(surr1, surr2).mean()
        critic_loss = (return_ - value).pow(2).mean()

        loss = 0.5 * critic_loss + actor_loss - 0.001 * entropy
    

class PPOTorchPolicyGraph(TorchPolicyGraph):
    def __init__(self, observation_space, action_space, config):
        self.observation_space = observation_space
        self.action_space = action_space
        self.loss = PPOLoss()

class TorchPolicyGraph(PolicyGraph):
    """Template for a PyTorch policy and loss to use with RLlib.
    This is similar to TFPolicyGraph, but for PyTorch.
    Attributes:
        observation_space (gym.Space): observation space of the policy.
        action_space (gym.Space): action space of the policy.
        lock (Lock): Lock that must be held around PyTorch ops on this graph.
            This is necessary when using the async sampler.
    """

    def __init__(self, observation_space, action_space, model, loss,
                 loss_inputs):
        """Build a policy graph from policy and loss torch modules.
        Note that module inputs will be CPU tensors. The model and loss modules
        are responsible for moving inputs to the right device.
        Arguments:
            observation_space (gym.Space): observation space of the policy.
            action_space (gym.Space): action space of the policy.
            model (nn.Module): PyTorch policy module. Given observations as
                input, this module must return a list of outputs where the
                first item is action logits, and the rest can be any value.
            loss (nn.Module): Loss defined as a PyTorch module. The inputs for
                this module are defined by the `loss_inputs` param. This module
                returns a single scalar loss. Note that this module should
                internally be using the model module.
            loss_inputs (list): List of SampleBatch columns that will be
                passed to the loss module's forward() function when computing
                the loss. For example, ["obs", "action", "advantages"].
        """
        self.observation_space = observation_space
        self.action_space = action_space
        self.lock = Lock()
        self._model = model
        self._loss = loss
        self._loss_inputs = loss_inputs
        self._optimizer = self.optimizer()

    @override(PolicyGraph)
    def compute_actions(self,
                        obs_batch,
                        state_batches=None,
                        prev_action_batch=None,
                        prev_reward_batch=None,
                        info_batch=None,
                        episodes=None,
                        **kwargs):
        with self.lock:
            with torch.no_grad():
                ob = torch.from_numpy(obs_batch).float()
                model_out = self._model(ob, state_batches)
                dist, value = model_out
                actions = dist.sample()
                return (actions.cpu().numpy(), dist, value)           
            
    @override(PolicyGraph)
    def compute_gradients(self, postprocessed_batch):
        with self.lock:
            loss_in = []
            for key in self._loss_inputs:
                loss_in.append(torch.from_numpy(postprocessed_batch[key]))
            loss_out = self._loss(*loss_in)
            self._optimizer.zero_grad()
            loss_out.backward()
            # Note that return values are just references;
            # calling zero_grad will modify the values
            grads = []
            for p in self._model.parameters():
                if p.grad is not None:
                    grads.append(p.grad.data.numpy())
                else:
                    grads.append(None)
            return grads, {}

    @override(PolicyGraph)
    def apply_gradients(self, gradients):
        with self.lock:
            for g, p in zip(gradients, self._model.parameters()):
                if g is not None:
                    p.grad = torch.from_numpy(g)
            self._optimizer.step()
            return {}

    @override(PolicyGraph)
    def get_weights(self):
        with self.lock:
            return self._model.state_dict()

    @override(PolicyGraph)
    def set_weights(self, weights):
        with self.lock:
            self._model.load_state_dict(weights)

    @override(PolicyGraph)
    def get_initial_state(self):
        return [s.numpy() for s in self._model.state_init()]

    def extra_action_out(self, model_out):
        """Returns dict of extra info to include in experience batch.
        Arguments:
            model_out (list): Outputs of the policy model module."""
        return {}

    def optimizer(self):
        """Custom PyTorch optimizer to use."""
        return torch.optim.Adam(self._model.parameters())


In [90]:
fcnet_hiddens = [100, 50, 25]
model = ActorCritic(observation_space, action_space, fcnet_hiddens, std=0.05).cpu()
ppoloss = PPOLoss()
loss_inputs = ["model", "state", "action", "log_prob", "return_", "advantage"]

TypeError: issubclass() arg 2 must be a class or tuple of classes

In [91]:
ppo_graph = TorchPolicyGraph(observation_space, action_space, model, ppoloss, loss_inputs)

In [21]:
evaluator = PolicyEvaluator(env_creator=create_env, policy_graph=TorchPolicyGraph)

 Starting SUMO on port 49568


TypeError: __init__() missing 2 required positional arguments: 'loss' and 'loss_inputs'

In [17]:
result = evaluator.sample()

In [19]:
result["obs"].shape

(100, 28)

# Train

In [None]:
num_inputs  = envs.observation_space.shape[0]
num_outputs = envs.action_space.shape[0]
device_id = ray.put(device)

#Hyper params:
lr = 5e-4
training_iter = 500
num_rollouts = 1
num_steps = HORIZON * num_rollouts
mini_batch_size = 128
num_sgd_iter = 10
fcnet_hiddens = [100, 50, 25]
gae_lambda = 0.97

model = ActorCritic(num_inputs, num_outputs, fcnet_hiddens).to(device)
optimizer = optim.Adam(model.parameters(), lr=lr)

In [None]:
now = datetime.datetime.now()
now = str(now).replace(':', '-').replace(' ', '-')
now = now[:now.find('.')]
result_path = './result/ppo/' + now
os.makedirs(result_path)
image_path = result_path + '/reward_history.png'

test_rewards, num_iters = [], []

state = envs.reset()

for num_iter in trange(training_iter):
    state = envs.reset()
    trajectory = {'log_probs':[], 'values':[], 'states':[], 
                  'actions':[], 'rewards':[], 'masks':[]}

    for _ in range(num_steps):
        state = torch.FloatTensor(state).to(device)
        dist, value = model(state)
        action = dist.sample()
        next_state, reward, done, _ = envs.step(action.cpu().numpy())
        log_prob = dist.log_prob(action)
        append_trajectory(trajectory, log_prob, value, state, action, reward, done, device)
        state = next_state

    if num_iter % 25 == 0:
        model_id = ray.put(model)
        results_ids = [ev.test_env.remote(device_id, model_id) for ev in evs]
        test_reward = np.mean(ray.get(results_ids))
        test_rewards.append(test_reward)
        num_iters.append(num_iter)
        plot_and_save(num_iters, test_rewards, image_path)
        model_path = result_path + '/checkpoint' + str(num_iter) + '.pt'
        torch.save(model.state_dict(), model_path)
        send_line(url, headers, 'epoch: {}'.format(num_iter), image_path)
       
    next_state = torch.FloatTensor(next_state).to(device)
    _, next_value = model(next_state)
    returns = compute_gae(next_value, trajectory, tau=gae_lambda)
    cat_trajectory(trajectory, returns)
    
    ppo_update(model, optimizer, num_sgd_iter, mini_batch_size, trajectory)

# Test

In [None]:
model = ActorCritic(num_inputs, num_outputs, fcnet_hiddens).to(device)

model_path = './result/ppo/2019-04-02-02-06-26/checkpoint475.pt'
model.load_state_dict(torch.load(model_path))
model.eval()
model_id = ray.put(model)

In [None]:
env = make_vis_env(benchmark_name)

test_env(env, device, model)