In [2]:
import argparse
import importlib
import json
from copy import deepcopy
from itertools import product
from logging import getLogger

import numpy as np

import ray
# get_agent_class is importable from either module path; try both.
try:
    from ray.rllib.agents.agent import get_agent_class
except ImportError:
    from ray.rllib.agents.registry import get_agent_class
from ray.tune import Experiment, run_experiments
from ray.tune.registry import register_env

from flow.utils.registry import make_create_env
from flow.utils.rllib import FlowParamsEncoder
# number of parallel workers
num_cpus = 2
# number of rollouts collected per training iteration
num_rollouts = 1
# name of the benchmark module (looked up under flow.benchmarks) to train on
benchmark_name = "multi_merge"

In [3]:
# Import the benchmark module and fetch its flow_params
benchmark = importlib.import_module("flow.benchmarks.%s" % benchmark_name)
flow_params = benchmark.flow_params

# get the env name and a creator for the environment
create_env, env_name = make_create_env(params=flow_params, version=0)

register_env(env_name, create_env)

# initialize a ray instance; ignore_reinit_error lets this cell be re-run in
# a kernel where ray is already up instead of raising
ray.init(ignore_reinit_error=True)

alg_run = "PPO"

horizon = flow_params["env"].horizon
agent_cls = get_agent_class(alg_run)
# Deep copy, not .copy(): the nested "model", "env_config" and "callbacks"
# dicts are written to below and in later cells, and a shallow copy would
# leak those writes into the agent class's shared default config.
config = deepcopy(agent_cls._default_config)
config["num_workers"] = min(num_cpus, num_rollouts)
# one training batch holds num_rollouts rollouts of `horizon` steps each
config["train_batch_size"] = horizon * num_rollouts
config["use_gae"] = True
config["horizon"] = horizon
gae_lambda = 0.97
step_size = 5e-4
config["lambda"] = gae_lambda
config["lr"] = step_size
config["vf_clip_param"] = 1e6
config["num_sgd_iter"] = 10
config['clip_actions'] = False  # FIXME(ev) temporary ray bug
config["model"]["fcnet_hiddens"] = [100, 50, 25]
config["observation_filter"] = "NoFilter"
config["entropy_coeff"] = 0.0

# save the flow params for replay
flow_json = json.dumps(
    flow_params, cls=FlowParamsEncoder, sort_keys=True, indent=4)
config['env_config']['flow_params'] = flow_json
config['env_config']['run'] = alg_run

Process STDOUT and STDERR is being redirected to /tmp/ray/session_2019-04-25_22-58-02_1112/logs.
Waiting for redis server at 127.0.0.1:29882 to respond...
Waiting for redis server at 127.0.0.1:42786 to respond...
Starting the Plasma object store with 13.355121049 GB memory using /dev/shm.

View the web UI at http://localhost:8888/notebooks/ray_ui.ipynb?token=a8bc63089d4e8b15c268160cd9169712900c9505fae01a8b



In [4]:
def on_episode_start(info):
    """Reset the per-episode cost buffers when an episode begins.

    Parameters
    ----------
    info : dict
        Callback payload; ``info["episode"]`` must expose a mutable
        ``user_data`` mapping.

    Notes
    -----
    Stores a fresh empty list under each of ``"cost1"`` and ``"cost2"``.
    Nothing in the visible cells appends to these lists yet.
    """
    scratch = info["episode"].user_data
    for cost_key in ("cost1", "cost2"):
        # a new list per key so the two buffers never alias each other
        scratch[cost_key] = []


def on_episode_step(info):
    """Per-step episode callback; currently records nothing.

    Parameters
    ----------
    info : dict
        Callback payload. This hook does not read any of its keys.

    Returns
    -------
    None

    Notes
    -----
    An earlier version called ``info["env"].poll()`` here, discarded all five
    returned values and rebound the ``info`` parameter in the process. Polling
    the environment from inside a callback pulls step results outside the
    sampler's own loop (NOTE(review): confirm against the RLlib BaseEnv docs),
    so that call was removed rather than kept as an unused side effect.

    TODO: once per-step costs are available, append them to the ``"cost1"`` /
    ``"cost2"`` lists that ``on_episode_start`` creates in
    ``episode.user_data``.
    """
    return None


def on_episode_end(info):
    """End-of-episode callback; intentionally a no-op placeholder.

    Parameters
    ----------
    info : dict
        Callback payload (unused).

    Returns
    -------
    None
    """
    # Template for a future implementation (left disabled, as before):
    #   episode = info["episode"]
    #   pole_angle = np.mean(episode.user_data["pole_angles"])
    #   print("episode {} ended with length {} and pole angles {}".format(
    #       episode.episode_id, episode.length, pole_angle))
    #   episode.custom_metrics["pole_angle"] = pole_angle
    return None

In [5]:
# Attach the three episode hooks to the trainer config. Each one is wrapped
# with ray.tune.function before being stored (presumably so Tune passes the
# callable through as-is -- confirm against the Tune docs).
config['callbacks'].update({
    'on_episode_start': ray.tune.function(on_episode_start),
    'on_episode_step': ray.tune.function(on_episode_step),
    'on_episode_end': ray.tune.function(on_episode_end),
})

In [6]:
# tuning parameters
# NOTE(review): none of these three lists is referenced by the experiment
# spec below; they are kept because other cells may read them.
eta = [[1.0, 0.3]]
reward_scale = [1.0]  # , 0.5]
t_min = [3.0]  # , 5.0, 10.0]

# Experiment spec handed to Tune: which algorithm to run, on which registered
# env, with which trainer config, and when to checkpoint / stop.
exp_tag = {
    "run": alg_run,
    "env": env_name,
    "config": {
        **config
    },
    "checkpoint_freq": 25,
    # The run stops at iteration 10, before the first periodic checkpoint
    # (every 25 iterations) would be written, so also checkpoint when the
    # trial ends; otherwise nothing is saved for replay.
    "checkpoint_at_end": True,
    "max_failures": 999,
    "stop": {
        "training_iteration": 10
    },
    "num_samples": 1,
}

# launch the single experiment under the name 'test'
trials = run_experiments(
    {'test': exp_tag}
)

== Status ==
Using FIFO scheduling algorithm.
Resources requested: 0/4 CPUs, 0/1 GPUs
Memory usage on this node: 5.1/33.4 GB

Created LogSyncer for /headless/ray_results/test/PPO_MultiWaveAttenuationMergePOEnv-v0_0_2019-04-25_22-58-466sm12mxd -> 
== Status ==
Using FIFO scheduling algorithm.
Resources requested: 2/4 CPUs, 0/1 GPUs
Memory usage on this node: 5.1/33.4 GB
Result logdir: /headless/ray_results/test
RUNNING trials:
 - PPO_MultiWaveAttenuationMergePOEnv-v0_0:	RUNNING

Result for PPO_MultiWaveAttenuationMergePOEnv-v0_0:
  custom_metrics: {}
  date: 2019-04-25_22-59-22
  done: false
  episode_len_mean: 350.0
  episode_reward_max: 290.93073690926167
  episode_reward_mean: 290.93073690926167
  episode_reward_min: 290.93073690926167
  episodes_this_iter: 1
  episodes_total: 1
  experiment_id: d54538960e9648069d9a1dea8bd3d931
  hostname: kronos
  info:
    default:
      cur_kl_coeff: 0.20000000298023224
      cur_lr: 0.0005000000237487257
      entropy: 1.4203226566314697
      kl

== Status ==
Using FIFO scheduling algorithm.
Resources requested: 2/4 CPUs, 0/1 GPUs
Memory usage on this node: 5.9/33.4 GB
Result logdir: /headless/ray_results/test
RUNNING trials:
 - PPO_MultiWaveAttenuationMergePOEnv-v0_0:	RUNNING [pid=1154], 54 s, 8 iter, 9236 ts, 313 rew

Result for PPO_MultiWaveAttenuationMergePOEnv-v0_0:
  custom_metrics: {}
  date: 2019-04-25_23-00-06
  done: false
  episode_len_mean: 350.0
  episode_reward_max: 489.31080349554145
  episode_reward_mean: 323.89000861402775
  episode_reward_min: 133.69298764399164
  episodes_this_iter: 1
  episodes_total: 9
  experiment_id: d54538960e9648069d9a1dea8bd3d931
  hostname: kronos
  info:
    default:
      cur_kl_coeff: 0.11249998956918716
      cur_lr: 0.0005000000237487257
      entropy: 1.3064906597137451
      kl: 0.0029544616118073463
      policy_loss: 7.330104563152418e-05
      total_loss: 34.25816345214844
      vf_explained_var: 0.3161628246307373
      vf_loss: 34.25775909423828
    grad_time_ms: 260.98
  