In [1]:
from ray.rllib.agents.a3c.a2c import A2CTrainer, A2C_DEFAULT_CONFIG
from ray.rllib.agents.a3c.a3c_torch_policy_graph import A3CTorchPolicyGraph
from ray.tune.registry import register_env
from ray.rllib.evaluation.sample_batch import DEFAULT_POLICY_ID
from ray import tune
import ray, json
from copy import deepcopy

from flow.multiagent_envs import MultiWaveAttenuationPOEnv
from flow.utils.registry import make_create_env
from flow.utils.rllib import FlowParamsEncoder, get_flow_params

In [2]:
num_cpus = 3
num_rollouts = 3
horizon = 750
gae_lambda = 0.97
step_size = 5e-4
num_iter = 10
benchmark_name = "multi_merge"
exp_name = "test_ir"

In [3]:
ray.init(num_cpus=num_cpus, logging_level=40, ignore_reinit_error=True)

{'node_ip_address': '169.237.32.118',
 'object_store_address': '/tmp/ray/session_2019-05-25_19-52-15_9233/sockets/plasma_store',
 'raylet_socket_name': '/tmp/ray/session_2019-05-25_19-52-15_9233/sockets/raylet',
 'redis_address': '169.237.32.118:61149',
 'webui_url': None}

In [4]:
config = deepcopy(A2C_DEFAULT_CONFIG)
config["use_pytorch"] = True

benchmark = __import__(
            "flow.benchmarks.%s" % benchmark_name, fromlist=["flow_params"])
flow_params = benchmark.buffered_obs_flow_params

# save the flow params for replay
flow_json = json.dumps(
    flow_params, cls=FlowParamsEncoder, sort_keys=True, indent=4)
config['env_config']['flow_params'] = flow_json
create_env, env_name = make_create_env(params=flow_params, version=0)
register_env(env_name, create_env)
env = create_env()
default_policy = (A3CTorchPolicyGraph, env.observation_space, env.action_space, {})
policy_graph = {DEFAULT_POLICY_ID: default_policy}
config["multiagent"] = {
        'policy_graphs': policy_graph,
        'policy_mapping_fn': tune.function(lambda agent_id: DEFAULT_POLICY_ID)
    }

In [5]:
agent = A2CTrainer(config, env_name)

2019-05-25 19:50:11,005	INFO policy_evaluator.py:728 -- Built policy map: {'default_policy': <ray.rllib.agents.a3c.a3c_torch_policy_graph.A3CTorchPolicyGraph object at 0x7fdbb84c4710>}
2019-05-25 19:50:11,010	INFO policy_evaluator.py:729 -- Built preprocessor map: {'default_policy': <ray.rllib.models.preprocessors.NoPreprocessor object at 0x7fdbb8f8ff28>}
2019-05-25 19:50:11,013	INFO policy_evaluator.py:343 -- Built filter map: {'default_policy': <ray.rllib.utils.filter.NoFilter object at 0x7fdbb8f9a358>}


In [6]:
agent.train()

[2m[36m(pid=9134)[0m Loading configuration... done.
[2m[36m(pid=9134)[0m Success.
[2m[36m(pid=9134)[0m Loading configuration... done.
[2m[36m(pid=9136)[0m Loading configuration... done.
[2m[36m(pid=9136)[0m Success.
[2m[36m(pid=9136)[0m Loading configuration... done.
[2m[36m(pid=9134)[0m Loading configuration... done.
[2m[36m(pid=9134)[0m Success.
[2m[36m(pid=9134)[0m Loading configuration... done.
[2m[36m(pid=9136)[0m 2019-05-25 19:50:22,443	INFO policy_evaluator.py:437 -- Generating sample batch of size 20
[2m[36m(pid=9136)[0m Loading configuration... done.
[2m[36m(pid=9136)[0m Success.
[2m[36m(pid=9136)[0m Loading configuration... done.
[2m[36m(pid=9136)[0m 2019-05-25 19:50:24,003	INFO sampler.py:308 -- Raw obs from env: { 0: { 'flow_1.0': np.ndarray((12,), dtype=float32, min=0.0, max=1.0, mean=0.508),
[2m[36m(pid=9136)[0m        'flow_1.1': np.ndarray((12,), dtype=float32, min=0.016, max=0.839, mean=0.193)}}
[2m[36m(pid=9136)[0m 2019-

2019-05-25 19:50:26,233	INFO policy_evaluator.py:564 -- Training on concatenated sample batches:

{ 'data': { 'actions': np.ndarray((260, 1), dtype=float32, min=-2.662, max=2.625, mean=-0.05),
            'advantages': np.ndarray((260,), dtype=float32, min=-7.812, max=0.081, mean=-3.402),
            'agent_index': np.ndarray((260,), dtype=int64, min=0.0, max=2.0, mean=1.023),
            'dones': np.ndarray((260,), dtype=bool, min=0.0, max=1.0, mean=0.008),
            'eps_id': np.ndarray((260,), dtype=int64, min=120875377.0, max=1053936329.0, mean=583817157.031),
            'infos': np.ndarray((260,), dtype=object, head={'cost2': 0.0, 'cost1': 0.6585099526248288, 'outflow': 445.5445544554455, 'mean_vel': 18.57710320265973}),
            'new_obs': np.ndarray((260, 12), dtype=float32, min=-0.07, max=1.0, mean=0.357),
            'obs': np.ndarray((260, 12), dtype=float32, min=-0.07, max=1.0, mean=0.355),
            'prev_actions': np.ndarray((260, 1), dtype=float32, min=-2.662, max

{'config': {'batch_mode': 'truncate_episodes',
  'callbacks': {'on_episode_end': None,
   'on_episode_start': None,
   'on_episode_step': None,
   'on_postprocess_traj': None,
   'on_sample_end': None,
   'on_train_result': None},
  'clip_actions': True,
  'clip_rewards': None,
  'collect_metrics_timeout': 180,
  'compress_observations': False,
  'custom_resources_per_worker': {},
  'entropy_coeff': 0.01,
  'env': 'MultiWaveAttenuationMergePOEnvBufferedObs-v0',
  'gamma': 0.99,
  'grad_clip': 40.0,
  'horizon': None,
  'ignore_worker_failures': False,
  'input': 'sampler',
  'input_evaluation': ['is', 'wis'],
  'lambda': 1.0,
  'local_evaluator_tf_session_args': {'inter_op_parallelism_threads': 8,
   'intra_op_parallelism_threads': 8},
  'log_level': 'INFO',
  'lr': 0.0001,
  'lr_schedule': None,
  'metrics_smoothing_episodes': 100,
  'min_iter_time_s': 10,
  'model': {'conv_activation': 'relu',
   'conv_filters': None,
   'custom_model': None,
   'custom_options': {},
   'custom_prepr

In [5]:
env.reset()

{'flow_1.0': array([0.87369645, 0.03827296, 0.05081967, 0.05117471, 0.05344194,
        0.12477846, 1.        , 1.        , 1.        , 1.        ,
        1.        , 0.        ], dtype=float32),
 'flow_1.1': array([0.55678636, 0.03979726, 0.02676601, 0.03755222, 0.02405487,
        0.53835654, 0.87369645, 0.03827296, 0.05081967, 0.05117471,
        0.05344194, 0.12477846], dtype=float32)}

In [6]:
env.step({})

({'flow_1.0': array([0.8768942 , 0.03911753, 0.05128663, 0.05162269, 0.05406003,
         0.11426529, 1.        , 1.        , 1.        , 1.        ,
         1.        , 0.        ], dtype=float32),
  'flow_1.1': array([0.56502277, 0.03925874, 0.02723876, 0.0368665 , 0.02449937,
         0.5316009 , 0.8768942 , 0.03911753, 0.05128663, 0.05162269,
         0.05406003, 0.11426529], dtype=float32)},
 {'flow_1.0': -0.326915748365307, 'flow_1.1': -0.326915748365307},
 {'__all__': False, 'flow_1.0': False, 'flow_1.1': False},
 {'flow_1.0': {'cost1': 0.6508070239119207,
   'cost2': 0.0,
   'mean_vel': 18.551605749902766,
   'outflow': 445.5445544554455},
  'flow_1.1': {'cost1': 0.6508070239119207,
   'cost2': 0.0,
   'mean_vel': 18.551605749902766,
   'outflow': 445.5445544554455}})