In [41]:
import gym
import ray

import os, sys, pickle, time, math
sys.path.append(os.path.abspath(os.path.join(os.getcwd(), '..')))
# ----------
# Pytorch
import torch
# Use double precision for every floating-point tensor created from here on.
dtype = torch.float64
torch.set_default_dtype(dtype)
# `args` is never defined in this notebook, so `args.gpu_index` raised NameError
# whenever CUDA was available. The GPU index is an explicit, tunable constant instead.
GPU_INDEX = 0
# Run on the selected GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device('cuda', index=GPU_INDEX) \
    if torch.cuda.is_available() else torch.device('cpu')
# ----------
# flow
from flow.utils.registry import make_create_env
import importlib

# Name of the benchmark module to load from the `flow.benchmarks` package.
benchmark_name = 'multi_merge'
# importlib.import_module returns the leaf submodule itself, which is what the
# previous `__import__(..., fromlist=[...])` call was being used to obtain.
benchmark = importlib.import_module("flow.benchmarks.%s" % benchmark_name)
# Parameter set exposed by the benchmark module under `buffered_obs_flow_params`.
flow_params = benchmark.buffered_obs_flow_params

# ----------
# PyTorch-RL
from models.mlp_policy import MultiAgentPolicy
from models.mlp_critic import Value
from core.a2c import a2c_step
from core.common import estimate_advantages
from core.multi_agent import MultiAgent
from utils.remote_vector_env import MultiAgentVecEnv, dict_to_array
from utils.replay_memory import MultiAgentMemory

In [4]:
# Start the local Ray runtime.
# `ignore_reinit_error=True` lets this cell be re-run in a live kernel without
# raising because Ray has already been initialized.
ray.init(ignore_reinit_error=True)

Process STDOUT and STDERR is being redirected to /tmp/ray/session_2019-05-03_21-32-12_30/logs.
Waiting for redis server at 127.0.0.1:35324 to respond...
Waiting for redis server at 127.0.0.1:20943 to respond...
Starting the Plasma object store with 3.2789331959999997 GB memory using /dev/shm.

View the web UI at http://localhost:8888/notebooks/ray_ui.ipynb?token=b1cbcb449c52e1eb09a8c097007e9bff2522911e4e72fd0c



{'node_ip_address': '168.150.112.131',
 'object_store_addresses': ['/tmp/ray/session_2019-05-03_21-32-12_30/sockets/plasma_store'],
 'raylet_socket_names': ['/tmp/ray/session_2019-05-03_21-32-12_30/sockets/raylet'],
 'redis_address': '168.150.112.131:35324',
 'webui_url': 'http://localhost:8888/notebooks/ray_ui.ipynb?token=b1cbcb449c52e1eb09a8c097007e9bff2522911e4e72fd0c'}

# Set env

In [5]:
# Build the environment factory from the benchmark parameters, then instantiate
# one local environment to read the observation and action dimensions from it.
create_env, env_name = make_create_env(version=0, params=flow_params)
sample_env = create_env()
state_dim, action_dim = (
    space.shape[0]
    for space in (sample_env.observation_space, sample_env.action_space)
)
# Vectorized multi-agent wrapper around two environments built by the same factory.
env = MultiAgentVecEnv(create_env, remote_env_batch_wait_ms=0, num_envs=2)

 Starting SUMO on port 48028


# Set Policy

In [6]:
# define actor and critic
policy_net = MultiAgentPolicy(state_dim, action_dim, activation='relu')
value_net = Value(state_dim)
optimizer_policy = torch.optim.Adam(policy_net.parameters(), lr=3e-4)
optimizer_value = torch.optim.Adam(value_net.parameters(), lr=3e-4)

# Sampling

In [7]:
# Reset the vectorized environment and keep whatever it returns as the current `state`.
state = env.reset()

Launching env 0 in remote actor
Launching env 1 in remote actor


In [8]:
# Bundle the vectorized environment and the policy network into the agent used for sampling.
agent = MultiAgent(env, policy_net)

In [9]:
# Collect samples with the current policy. 300 is the budget passed to the agent
# (presumably a step/transition count — TODO confirm against MultiAgent.collect_samples).
agent.collect_samples(300)

# Training

In [None]:
# Iterate over the batches produced by the agent (600 is the argument given to the
# generator) and compute advantages and returns for each one.
# NOTE(review): nothing consumes these results inside the loop; the PPO-update cell
# below reads the variables left over from the last iteration.
for batch in agent.batch_generator(600):
    # Cast to the notebook-wide `dtype` (float64), not float32 via `.float()`:
    # the networks are created after torch.set_default_dtype(dtype), so float32 inputs
    # would presumably not match their parameters.
    states = torch.tensor(batch.state).to(dtype).to(device)
    actions = torch.tensor(batch.action).to(dtype).to(device)
    rewards = torch.tensor(batch.reward).to(dtype).to(device)
    masks = torch.tensor(batch.mask).to(dtype).to(device)
    # Value estimates and log-probabilities are fixed targets here, so no gradients are tracked.
    with torch.no_grad():
        values = value_net(states)
        fixed_log_probs = policy_net.get_log_prob(states, actions)
    # gamma=0.99, tau=0.95 (positional arguments of estimate_advantages).
    advantages, returns = estimate_advantages(rewards, masks, values, 0.99, 0.95)
    
    

In [None]:

# Perform the mini-batch PPO update on the tensors left by the training loop.
# NOTE(review): `optim_epochs`, `optim_batch_size`, `ppo_step` and `args` are not defined
# or imported anywhere in the visible notebook — they must be provided before this cell runs.
optim_iter_num = int(math.ceil(states.shape[0] / optim_batch_size))
for _ in range(optim_epochs):
    # Shuffle the sample order once per epoch. torch.randperm produces an int64 index
    # tensor directly on `device`, replacing the previous numpy/LongTensor round trip
    # (neither `np` nor `LongTensor` is imported in this notebook).
    perm = torch.randperm(states.shape[0], device=device)

    # Indexing with an index tensor already returns new tensors, so no explicit clone is needed.
    states, actions, returns, advantages, fixed_log_probs = \
        states[perm], actions[perm], returns[perm], advantages[perm], fixed_log_probs[perm]

    for i in range(optim_iter_num):
        # Slice out the i-th mini-batch; the last one may be shorter than optim_batch_size.
        ind = slice(i * optim_batch_size, min((i + 1) * optim_batch_size, states.shape[0]))
        states_b, actions_b, advantages_b, returns_b, fixed_log_probs_b = \
            states[ind], actions[ind], advantages[ind], returns[ind], fixed_log_probs[ind]

        ppo_step(policy_net, value_net, optimizer_policy, optimizer_value, 1, states_b, actions_b, returns_b,
                 advantages_b, fixed_log_probs_b, args.clip_epsilon, args.l2_reg)

# Debug

In [9]:
# TODO: seed the random number generators here (e.g. torch.manual_seed) so the debug rollout below is reproducible.

In [55]:
# Manual rollout for debugging: start from an empty memory and a freshly reset environment.
agent.memory = MultiAgentMemory()
state = agent.env.reset()
for i in range(1000):
    # Action selection needs no gradients.
    with torch.no_grad():
        action = agent.policy_net.select_action(state)
    next_state, rew, done, info = agent.env.step(action)
    # One 1/0 flag per entry of `done`, set when that entry's "__all__" field is truthy.
    need_reset = [1 if env_done["__all__"] else 0 for env_done in done.values()]
    # Convert the dict-structured step data for storage (presumably flattened per agent —
    # TODO confirm against dict_to_array). Note that `state` and `done` are rebound here.
    action, state, listed_next_state, rew, done, info, id_list = dict_to_array(
        action, state, next_state, rew, done, info)
    # mask is 0 where the converted `done` entry is truthy and 1 otherwise.
    mask = [0 if finished else 1 for finished in done]
    agent.memory.push(state, action, mask, listed_next_state, rew)
    state = next_state
    # Pass the flags to env.reset and overwrite the states it returns.
    reseted_state = env.reset(need_reset)
    for key, value in reseted_state.items():
        state[key] = value

env: 0 is reseted at 699
env: 1 is reseted at 699


In [37]:
def estimate_advantages(rewards, masks, values, gamma, tau):
    """Compute normalized advantages and returns with a backward recursion.

    Walking the samples from last to first:

        delta[i]     = rewards[i] + gamma * values[i + 1] * masks[i] - values[i]
        advantage[i] = delta[i] + gamma * tau * advantage[i + 1] * masks[i]

    where the "next" value and advantage are taken as 0 past the final sample.
    A mask of 0 therefore cuts the bootstrap across that sample.

    Note: this notebook-local definition shadows the `estimate_advantages`
    imported from `core.common` at the top of the notebook.

    Args:
        rewards: tensor indexed along its first axis, one entry per sample.
        masks: tensor indexed like `rewards`; multiplies the bootstrapped terms.
        values: 2-D tensor of value estimates, read as `values[i, 0]`.
        gamma: discount factor applied to the next value and next advantage.
        tau: extra decay applied to the next advantage.

    Returns:
        (advantages, returns): `returns` is `values` plus the un-normalized
        advantages. `advantages` is mean-centered and, when the standard
        deviation is defined and non-zero, scaled to unit standard deviation.
    """
    num_steps = rewards.size(0)
    # Allocate zero-initialized buffers with the dtype and device of `values`.
    # The previous uninitialized float32 CPU `torch.FloatTensor` buffers broke
    # `values + advantages` for GPU inputs and lost precision for float64 inputs.
    deltas = values.new_zeros((num_steps, 1))
    advantages = values.new_zeros((num_steps, 1))

    prev_value = 0
    prev_advantage = 0

    for i in reversed(range(num_steps)):
        deltas[i] = rewards[i] + gamma * prev_value * masks[i] - values[i]
        advantages[i] = deltas[i] + gamma * tau * prev_advantage * masks[i]

        prev_value = values[i, 0]
        prev_advantage = advantages[i, 0]

    returns = values + advantages

    normalized = advantages - advantages.mean()
    # The standard deviation is undefined for a single sample and zero for constant
    # advantages; dividing in those cases would fill the result with NaN, so only
    # the centering is applied then.
    if num_steps > 1:
        std = advantages.std()
        if std > 0:
            normalized = normalized / std

    return normalized, returns