In [1]:
import gym, pickle, argparse, json, logging
from copy import deepcopy
import ray
from gail.gail import GAILTrainer

from ray import tune
from ray.rllib.agents.ppo.ppo_policy_graph import PPOPolicyGraph
from ray.rllib.agents.ppo.ppo import DEFAULT_CONFIG
from ray.rllib.agents import Trainer
from ray.rllib.evaluation import PolicyEvaluator, SampleBatch, MultiAgentSampleBatchBuilder
from ray.rllib.offline.json_writer import JsonWriter
from ray.rllib.offline.json_reader import JsonReader
from ray.rllib.evaluation.sample_batch import DEFAULT_POLICY_ID
from ray.rllib.evaluation.metrics import collect_metrics
from ray.tune.registry import register_env
from ray.rllib.utils.annotations import override

from flow.utils.registry import make_create_env
from flow.utils.rllib import FlowParamsEncoder, get_flow_params
# Logger for this notebook, named after the current module (__name__).
logger = logging.getLogger(__name__)
# Start the local Ray runtime. ignore_reinit_error=True lets this cell be re-run in a
# live kernel without raising because Ray is already initialised.
ray.init(ignore_reinit_error=True)

2019-05-28 20:56:42,910	INFO node.py:469 -- Process STDOUT and STDERR is being redirected to /tmp/ray/session_2019-05-28_20-56-42_11088/logs.
2019-05-28 20:56:43,026	INFO services.py:407 -- Waiting for redis server at 127.0.0.1:26818 to respond...
2019-05-28 20:56:43,165	INFO services.py:407 -- Waiting for redis server at 127.0.0.1:51969 to respond...
2019-05-28 20:56:43,169	INFO services.py:804 -- Starting Redis shard with 6.68 GB max memory.
2019-05-28 20:56:43,217	INFO node.py:483 -- Process STDOUT and STDERR is being redirected to /tmp/ray/session_2019-05-28_20-56-42_11088/logs.
2019-05-28 20:56:43,221	INFO services.py:1427 -- Starting the Plasma object store with 10.02 GB memory using /dev/shm.


{'node_ip_address': '169.237.32.118',
 'object_store_address': '/tmp/ray/session_2019-05-28_20-56-42_11088/sockets/plasma_store',
 'raylet_socket_name': '/tmp/ray/session_2019-05-28_20-56-42_11088/sockets/raylet',
 'redis_address': '169.237.32.118:26818',
 'webui_url': None}

In [3]:
# --- Experiment identity ---------------------------------------------------
# Name of the module under flow.benchmarks that supplies `flow_params`.
benchmark_name = "multi_merge"
exp_name = "test_ir"

# --- Rollout sizing --------------------------------------------------------
# num_workers is min(num_cpus, num_rollouts); train_batch_size is
# horizon * num_rollouts.
num_rollouts = 3
num_cpus = 3
horizon = 750

# --- Optimisation ----------------------------------------------------------
step_size = 5e-4    # copied into config["lr"]
gae_lambda = 0.97   # copied into config["lambda"]
num_iter = 10

In [4]:
# Start from RLlib's PPO defaults (deep-copied so DEFAULT_CONFIG itself is never
# mutated) and override only what this experiment changes.
config = deepcopy(DEFAULT_CONFIG)
config["num_workers"] = min(num_cpus, num_rollouts)
config["train_batch_size"] = horizon * num_rollouts
# Floor division keeps the batch size an int. True division ("/") produced the
# float 375.0, which showed up in the worker log as
# "Generating sample batch of size 375.0".
config["sample_batch_size"] = horizon // 2
config["use_gae"] = True
config["horizon"] = horizon
config["lambda"] = gae_lambda
config["lr"] = step_size
# Very large value; presumably meant to make value-function clipping a no-op -- confirm.
config["vf_clip_param"] = 1e6
config["num_sgd_iter"] = 10
config['clip_actions'] = False  # FIXME(ev) temporary ray bug
config["model"]["fcnet_hiddens"] = [128, 64, 32]
config["observation_filter"] = "NoFilter"
config["entropy_coeff"] = 0.0
# NOTE(review): absolute, machine-specific path. The JsonWriter / JsonReader cells use
# the relative "./expert_sample"; the two only coincide when the kernel's working
# directory is /headless/rl_project/flow_codes/ModelBased.
config["expert_path"] = '/headless/rl_project/flow_codes/ModelBased/expert_sample'

# Load flow.benchmarks.<benchmark_name>. The non-empty fromlist makes __import__
# return the leaf submodule rather than the top-level `flow` package.
benchmark = __import__(
    "flow.benchmarks.%s" % benchmark_name, fromlist=["flow_params"])
flow_params = benchmark.flow_params

# save the flow params for replay
flow_json = json.dumps(
    flow_params, cls=FlowParamsEncoder, sort_keys=True, indent=4)
config['env_config']['flow_params'] = flow_json

In [5]:
# Build the Flow environment factory, register it with Tune under its generated
# name, and instantiate one local copy so its spaces can be read below.
create_env, env_name = make_create_env(params=flow_params, version=0)
register_env(env_name, create_env)
env = create_env()

# A single shared PPO policy for every agent. The trailing {} is presumably the
# per-policy config slot, left empty (the original note here read
# "we don't need this config").
POLICY_ID = DEFAULT_POLICY_ID
default_policy = (
    PPOPolicyGraph,
    env.observation_space,
    env.action_space,
    {},
)
policy_graph = {POLICY_ID: default_policy}

# Every agent id maps onto the one policy, and that policy is the only one trained.
config["multiagent"] = dict(
    policy_graphs=policy_graph,
    policy_mapping_fn=tune.function(lambda agent_id: POLICY_ID),
    policies_to_train=[POLICY_ID],
)

In [6]:
# Build the GAIL trainer from the config assembled above, for the registered Flow env.
# NOTE(review): the json_reader log line printed right after this call suggests the
# constructor opens config["expert_path"] -- confirm in gail.gail.
agent = GAILTrainer(config, env_name)

2019-05-28 20:57:00,132	INFO json_reader.py:65 -- Found 2 input files.
2019-05-28 20:57:01,284	INFO policy_evaluator.py:311 -- Creating policy evaluation worker 0 on CPU (please ignore any CUDA init errors)
  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "
2019-05-28 20:57:02,787	INFO policy_evaluator.py:728 -- Built policy map: {'default_policy': <ray.rllib.agents.ppo.ppo_policy_graph.PPOPolicyGraph object at 0x7ff2ad4f8710>}
2019-05-28 20:57:02,788	INFO policy_evaluator.py:729 -- Built preprocessor map: {'default_policy': <ray.rllib.models.preprocessors.NoPreprocessor object at 0x7ff2ad4f8358>}
2019-05-28 20:57:02,790	INFO policy_evaluator.py:343 -- Built filter map: {'default_policy': <ray.rllib.utils.filter.NoFilter object at 0x7ff2ad2aa2b0>}


In [7]:
# Load previously saved policy weights from disk.
# NOTE: pickle.load can execute arbitrary code; only use weight files you trust.
with open('1buf.wgt', 'rb') as weights_file:
    weights = pickle.load(weights_file)

# The file stores the weights under 'default'; re-key them under the policy id
# used by this notebook before handing them to the trainer.
weights[POLICY_ID] = weights.pop('default')
agent.set_weights(weights)

In [8]:
# Writer for the recorded sample batches. The path is relative to the kernel's working
# directory; in the recorded run it resolved to the same directory as
# config["expert_path"] (see the json_writer log line in the sampling cell's output).
writer = JsonWriter("./expert_sample")

In [9]:
# Number of batches to record; each one results in one writer.write() call.
num_expert_batches = 10

for _ in range(num_expert_batches):
    # Sample one train-batch worth of experience with the agent's current weights
    # and append it to the JSON dataset. (The bare `samples.count` expression that
    # used to sit here was a no-op: nothing is displayed from inside a loop.)
    samples = agent.sample(agent.train_batch_size)
    writer.write(sample_batch=samples)

[2m[36m(pid=11124)[0m 2019-05-28 20:58:01,748	INFO policy_evaluator.py:437 -- Generating sample batch of size 375.0
[2m[36m(pid=11122)[0m Loading configuration... done.
[2m[36m(pid=11120)[0m Loading configuration... done.
[2m[36m(pid=11122)[0m Success.
[2m[36m(pid=11122)[0m Loading configuration... done.
[2m[36m(pid=11120)[0m Success.
[2m[36m(pid=11120)[0m Loading configuration... done.
[2m[36m(pid=11124)[0m Loading configuration... done.
[2m[36m(pid=11124)[0m Success.
[2m[36m(pid=11124)[0m Loading configuration... done.
[2m[36m(pid=11124)[0m 2019-05-28 20:58:03,241	INFO sampler.py:308 -- Raw obs from env: { 0: { 'flow_1.0': np.ndarray((12,), dtype=float32, min=0.0, max=1.0, mean=0.504),
[2m[36m(pid=11124)[0m        'flow_1.1': np.ndarray((12,), dtype=float32, min=0.009, max=0.804, mean=0.19)}}
[2m[36m(pid=11124)[0m 2019-05-28 20:58:03,241	INFO sampler.py:309 -- Info return from env: {0: {'flow_1.0': {}, 'flow_1.1': {}}}
[2m[36m(pid=11124)[0m 2

[2m[36m(pid=11122)[0m Loading configuration... done.
[2m[36m(pid=11122)[0m Success.
[2m[36m(pid=11122)[0m Loading configuration... done.


2019-05-28 20:58:08,222	INFO json_writer.py:97 -- Writing to new output file <_io.TextIOWrapper name='/headless/rl_project/flow_codes/ModelBased/expert_sample/output-2019-05-28_20-58-08_worker-0_0.json' mode='w' encoding='UTF-8'>


[2m[36m(pid=11124)[0m 2019-05-28 20:58:08,053	INFO policy_evaluator.py:474 -- Completed sample batch:
[2m[36m(pid=11124)[0m 
[2m[36m(pid=11124)[0m { 'data': { 'action_prob': np.ndarray((774,), dtype=float32, min=0.008, max=15.001, mean=2.86),
[2m[36m(pid=11124)[0m             'actions': np.ndarray((774, 1), dtype=float32, min=-1.552, max=3.258, mean=0.658),
[2m[36m(pid=11124)[0m             'advantages': np.ndarray((774,), dtype=float32, min=-10.719, max=27.468, mean=1.353),
[2m[36m(pid=11124)[0m             'agent_index': np.ndarray((774,), dtype=int64, min=0.0, max=6.0, mean=3.021),
[2m[36m(pid=11124)[0m             'behaviour_logits': np.ndarray((774, 2), dtype=float32, min=-3.662, max=1.646, mean=-0.72),
[2m[36m(pid=11124)[0m             'dones': np.ndarray((774,), dtype=bool, min=0.0, max=1.0, mean=0.01),
[2m[36m(pid=11124)[0m             'eps_id': np.ndarray((774,), dtype=int64, min=697840902.0, max=1883949945.0, mean=1718446357.605),
[2m[36m(pid=1112

In [10]:
# Sanity check: open the recorded data (path relative to the kernel's working
# directory) and read one batch back.
reader = JsonReader("./expert_sample")
sample = reader.next()
# Cell output: the `count` of the batch just read.
sample.count

2019-05-28 20:59:09,314	INFO json_reader.py:65 -- Found 1 input files.


2364

In [39]:
# Read another batch and display its `count`. Each call to next() returns a new batch
# (the recorded outputs differ between consecutive calls), so this cell is not idempotent.
reader.next().count

4461

In [40]:
# Run a single train() call on the GAIL trainer; the returned result dict is the cell output.
agent.train()

[2m[36m(pid=11124)[0m Loading configuration... done.
[2m[36m(pid=11124)[0m Success.
[2m[36m(pid=11120)[0m Loading configuration... done.
[2m[36m(pid=11120)[0m Success.
[2m[36m(pid=11124)[0m Loading configuration... done.
[2m[36m(pid=11120)[0m Loading configuration... done.
[2m[36m(pid=11122)[0m Loading configuration... done.
[2m[36m(pid=11122)[0m Success.
[2m[36m(pid=11122)[0m Loading configuration... done.
[2m[36m(pid=11120)[0m Loading configuration... done.
[2m[36m(pid=11120)[0m Success.
[2m[36m(pid=11120)[0m Loading configuration... done.
[2m[36m(pid=11124)[0m Loading configuration... done.
[2m[36m(pid=11124)[0m Success.
[2m[36m(pid=11124)[0m Loading configuration... done.


2019-05-28 21:00:29,951	INFO policy_evaluator.py:564 -- Training on concatenated sample batches:

{ 'data': { 'action_prob': np.ndarray((2379,), dtype=float32, min=0.001, max=15.399, mean=2.565),
            'actions': np.ndarray((2379, 1), dtype=float32, min=-5.937, max=5.656, mean=0.704),
            'advantages': np.ndarray((2379,), dtype=float32, min=-13.093, max=24.699, mean=2.184),
            'agent_index': np.ndarray((2379,), dtype=int64, min=0.0, max=6.0, mean=3.275),
            'behaviour_logits': np.ndarray((2379, 2), dtype=float32, min=-3.678, max=2.013, mean=-0.566),
            'dones': np.ndarray((2379,), dtype=bool, min=0.0, max=1.0, mean=0.012),
            'eps_id': np.ndarray((2379,), dtype=int64, min=165897188.0, max=1160255321.0, mean=672477232.312),
            'infos': np.ndarray((2379,), dtype=object, head={'mean_vel': 12.803015796527372, 'cost1': 0.49515484300469237, 'cost2': 0.0, 'outflow': 1512.0}),
            'new_obs': np.ndarray((2379, 12), dtype=float32

{'config': {'batch_mode': 'truncate_episodes',
  'callbacks': {'on_episode_end': None,
   'on_episode_start': None,
   'on_episode_step': None,
   'on_postprocess_traj': None,
   'on_sample_end': None,
   'on_train_result': None},
  'clip_actions': False,
  'clip_param': 0.3,
  'clip_rewards': None,
  'collect_metrics_timeout': 180,
  'compress_observations': False,
  'custom_resources_per_worker': {},
  'entropy_coeff': 0.0,
  'env': 'MultiWaveAttenuationMergePOEnv-v0',
  'expert_path': '/headless/rl_project/flow_codes/ModelBased/expert_sample',
  'gamma': 0.99,
  'grad_clip': None,
  'horizon': 750,
  'ignore_worker_failures': False,
  'input': 'sampler',
  'input_evaluation': ['is', 'wis'],
  'kl_coeff': 0.2,
  'kl_target': 0.01,
  'lambda': 0.97,
  'local_evaluator_tf_session_args': {'inter_op_parallelism_threads': 8,
   'intra_op_parallelism_threads': 8},
  'log_level': 'INFO',
  'lr': 0.0005,
  'lr_schedule': None,
  'metrics_smoothing_episodes': 100,
  'model': {'conv_activation