# TODO

* find the valid horizon

In [1]:
import gym, pickle, argparse, json, logging
import importlib
from copy import deepcopy

import numpy as np
from gym import ObservationWrapper
import ray

from ray import tune
from ray.rllib.agents.ppo.ppo_policy_graph import PPOPolicyGraph
from ray.rllib.agents.ppo.ppo import DEFAULT_CONFIG
from ray.rllib.agents import Trainer
from ray.rllib.evaluation import PolicyEvaluator, SampleBatch
from ray.rllib.evaluation.sample_batch import DEFAULT_POLICY_ID
from ray.rllib.evaluation.metrics import collect_metrics
from ray.rllib.offline.json_reader import JsonReader
from ray.tune.registry import register_env
from ray.tune.logger import pretty_print
from ray.rllib.utils.annotations import override

from flow.utils.registry import make_create_env
from flow.utils.rllib import FlowParamsEncoder, get_flow_params
# Module-level logger; MEIRTrainer uses it for its debug messages.
logger = logging.getLogger(__name__)

In [2]:
# --- Experiment identification -------------------------------------------
benchmark_name = "multi_merge"  # module name looked up under flow.benchmarks
exp_name = "test_ir"

# --- Compute resources ---------------------------------------------------
num_cpus = 3       # CPUs handed to ray.init
num_rollouts = 3   # rollouts per training batch (also caps the worker count)

# --- Rollout / optimisation hyper-parameters -----------------------------
horizon = 750      # steps per rollout
gae_lambda = 0.97  # stored as config["lambda"]
step_size = 5e-4   # stored as config["lr"]
num_iter = 10

In [3]:
# Start (or re-use) the local Ray runtime; only ERROR-level messages
# (numeric level 40) are emitted by Ray's own logging.
ray.init(
    num_cpus=num_cpus,
    logging_level=logging.ERROR,
    ignore_reinit_error=True,
)

{'node_ip_address': '169.237.32.118',
 'object_store_address': '/tmp/ray/session_2019-05-27_06-26-35_28706/sockets/plasma_store',
 'raylet_socket_name': '/tmp/ray/session_2019-05-27_06-26-35_28706/sockets/raylet',
 'redis_address': '169.237.32.118:25493',
 'webui_url': None}

In [4]:
# Start from a private copy of the PPO defaults so the shared
# DEFAULT_CONFIG dict is never mutated, then apply experiment overrides.
config = deepcopy(DEFAULT_CONFIG)
config["num_workers"] = min(num_cpus, num_rollouts)
config["train_batch_size"] = horizon * num_rollouts
# Batch sizes are step counts, so use integer division: `horizon / 2`
# would store the float 375.0 instead of the int 375.
config["sample_batch_size"] = horizon // 2
config["use_gae"] = True
config["horizon"] = horizon
config["lambda"] = gae_lambda
config["lr"] = step_size
config["vf_clip_param"] = 1e6
config["num_sgd_iter"] = 10
config['clip_actions'] = False  # FIXME(ev) temporary ray bug
config["model"]["fcnet_hiddens"] = [128, 64, 32]
config["observation_filter"] = "NoFilter"
config["entropy_coeff"] = 0.0

# Resolve the benchmark module by name; import_module returns the leaf
# module directly, which is what the __import__/fromlist idiom emulated.
benchmark = importlib.import_module("flow.benchmarks.%s" % benchmark_name)
flow_params = benchmark.buffered_obs_flow_params

# save the flow params for replay
flow_json = json.dumps(
    flow_params, cls=FlowParamsEncoder, sort_keys=True, indent=4)
config['env_config']['flow_params'] = flow_json

In [5]:
# Build the environment factory from the flow params, register it under
# its generated name, and instantiate one copy to read its spaces.
create_env, env_name = make_create_env(params=flow_params, version=0)
register_env(env_name, create_env)
env = create_env()

# Every agent is mapped onto one shared PPO policy.
default_policy = (
    PPOPolicyGraph,
    env.observation_space,
    env.action_space,
    {},
)
policy_graph = {DEFAULT_POLICY_ID: default_policy}
config["multiagent"] = dict(
    policy_graphs=policy_graph,
    policy_mapping_fn=tune.function(lambda agent_id: DEFAULT_POLICY_ID),
)

In [6]:
class MEIRTrainer(Trainer):
    """Inverse-RL style trainer that learns a linear reward over observations.

    The reward used for policy optimisation is ``obs . theta``.  Each call to
    ``_train`` first optimises the PPO policy under the current ``theta`` and
    then moves ``theta`` towards the difference between the expert's mean
    observation and the current policy's mean observation.
    """

    _name = "MEIR"
    _default_config = DEFAULT_CONFIG
    _policy_graph = PPOPolicyGraph

    @override(Trainer)
    def _init(self, config, env_creator):
        """Create the evaluators, load expert data and initialise ``theta``.

        Args:
            config: trainer configuration dict; ``num_workers``,
                ``train_batch_size`` and ``num_sgd_iter`` are read here.
            env_creator: callable passed through to the evaluator factories.
        """
        self.local_evaluator = self.make_local_evaluator(
            env_creator, self._policy_graph)
        self.remote_evaluators = self.make_remote_evaluators(
            env_creator, self._policy_graph, config["num_workers"])

        self.train_batch_size = config["train_batch_size"]
        self.num_sgd_iter = config["num_sgd_iter"]
        # Number of policy-optimisation rounds per theta update.
        self.num_train = 1

        # Expert demonstrations; only the first batch returned by the
        # reader is used.
        self.expert_path = './expert_sample'
        expert_reader = JsonReader(self.expert_path)
        self.expert_samples = expert_reader.next()
        self.expert_features = self.calculate_expected_feature(
            self.expert_samples)
        # One reward weight per feature, drawn uniformly from [0, 1).
        self.theta = np.random.uniform(size=self.expert_features.shape)

    def _sync_weights(self):
        """Push the local evaluator's weights to every remote evaluator."""
        weights = ray.put(self.local_evaluator.get_weights())
        for e in self.remote_evaluators:
            e.set_weights.remote(weights)

    def sample(self, sample_size):
        """Collect at least ``sample_size`` steps and return them as one batch.

        Remote evaluators are sampled in rounds until enough steps have been
        gathered.  When there are no remote evaluators the local evaluator is
        used instead (previously this case looped forever).

        Args:
            sample_size: minimum number of steps to collect.

        Returns:
            The concatenation of all collected batches.
        """
        batches = []
        collected = 0
        while collected < sample_size:
            if self.remote_evaluators:
                new_batches = ray.get(
                    [e.sample.remote() for e in self.remote_evaluators])
            else:
                new_batches = [self.local_evaluator.sample()]
            batches.extend(new_batches)
            collected += sum(b.count for b in new_batches)
        return SampleBatch.concat_samples(batches)

    def calculate_expected_feature(self, samples):
        """Return the mean of ``samples["obs"]`` along axis 0."""
        features = np.mean(samples["obs"], axis=0)
        return features

    def train_policy_by_samples(self, samples):
        """Run ``num_sgd_iter`` learning passes on ``samples``, then update KL.

        Args:
            samples: batch handed to the local evaluator's ``learn_on_batch``.
        """
        # Defined up front so the KL update below is safe even when
        # num_sgd_iter is 0.
        fetches = {}
        for _ in range(self.num_sgd_iter):
            fetches = self.local_evaluator.learn_on_batch(samples)

        def update(pi, pi_id):
            # The result may be keyed by policy id, or be the policy's own
            # result dict; in the latter case the stats sit under
            # "learner_stats" (as in this notebook's logged training output).
            # Looking only at fetches[pi_id] therefore never found "kl".
            stats = fetches.get(pi_id, fetches)
            stats = stats.get("learner_stats", stats)
            if "kl" in stats:
                pi.update_kl(stats["kl"])
            else:
                logger.debug(
                    "No data for {}, not updating kl".format(pi_id))
        self.local_evaluator.foreach_trainable_policy(update)

    def set_new_rewards(self, samples):
        """Overwrite rewards with ``obs . theta`` and re-run postprocessing.

        Args:
            samples: batch whose ``"rewards"`` column is replaced in place.

        Returns:
            Whatever the policy's ``postprocess_trajectory`` returns.
        """
        # Use this trainer's own weights; the original read the notebook
        # global `agent`, which breaks for any other instance.
        samples["rewards"] = samples["obs"].dot(self.theta.T)
        policy = self.get_policy()
        # NOTE(review): `samples` is a concatenation of several sampled
        # batches but is postprocessed as a single trajectory -- confirm
        # this is intended.
        return policy.postprocess_trajectory(samples)

    def update_theta(self, samples, learning_rate=0.01):
        """Move ``theta`` towards ``expert_features - mean(samples["obs"])``.

        Args:
            samples: batch collected with the current policy.
            learning_rate: step size of the update.
        """
        features = self.calculate_expected_feature(samples)
        update = self.expert_features - features
        self.theta += learning_rate * update

    @override(Trainer)
    def _train(self):
        """Optimise the policy under the current reward, then update theta.

        Returns:
            The metrics dict from the last policy-optimisation round.
        """
        # optimize policy under estimated reward
        for train_iter in range(self.num_train):
            # set local weights to remote
            self._sync_weights()

            # collect samples with new reward fnc
            samples = self.set_new_rewards(self.sample(self.train_batch_size))
            samples.shuffle()

            # train local based on samples
            self.train_policy_by_samples(samples)
            res = collect_metrics(self.local_evaluator, self.remote_evaluators)
            # pretty_print only returns a string; print it so it is shown.
            print(pretty_print(res))

        # Push the freshly trained weights before sampling, so the feature
        # expectation comes from the current policy, not the previous one.
        self._sync_weights()
        samples = self.sample(self.train_batch_size)
        self.update_theta(samples)

        return res

    @override(Trainer)
    def __getstate__(self):
        """Return the base trainer state extended with ``theta``."""
        state = super().__getstate__()
        state["theta"] = self.theta
        return state

    @override(Trainer)
    def __setstate__(self, state):
        """Restore the base trainer state and, if present, ``theta``.

        Args:
            state: dict previously produced by ``__getstate__``.
        """
        super().__setstate__(state)
        if "theta" in state:
            self.theta = state["theta"]

In [7]:
# Instantiate the trainer on the registered Flow environment.
agent = MEIRTrainer(config=config, env=env_name)

2019-05-27 06:26:41,191	INFO policy_evaluator.py:311 -- Creating policy evaluation worker 0 on CPU (please ignore any CUDA init errors)
  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "
2019-05-27 06:26:42,651	INFO policy_evaluator.py:728 -- Built policy map: {'default_policy': <ray.rllib.agents.ppo.ppo_policy_graph.PPOPolicyGraph object at 0x7f13cc059588>}
2019-05-27 06:26:42,652	INFO policy_evaluator.py:729 -- Built preprocessor map: {'default_policy': <ray.rllib.models.preprocessors.NoPreprocessor object at 0x7f13cc0591d0>}
2019-05-27 06:26:42,653	INFO policy_evaluator.py:343 -- Built filter map: {'default_policy': <ray.rllib.utils.filter.NoFilter object at 0x7f13cc2f5860>}
2019-05-27 06:26:42,734	INFO json_reader.py:65 -- Found 1 input files.


# Debug

In [32]:
# Mean expert observation per feature. The expert batch lives on the
# trainer; the bare name `expert_samples` is never defined at notebook level.
np.mean(agent.expert_samples["obs"], axis=0)

array([0.53669256, 0.10492238, 0.18206705, 0.04042373, 0.09086839,
       0.39702156, 0.88553596, 0.504822  , 0.64543855, 0.5248907 ,
       0.5279983 , 0.09511073], dtype=float32)

In [8]:
# Draw one training-batch worth of experience from the evaluators.
samples = agent.sample(sample_size=agent.train_batch_size)

[2m[36m(pid=28496)[0m Loading configuration... done.
[2m[36m(pid=28496)[0m Success.
[2m[36m(pid=28494)[0m Loading configuration... done.
[2m[36m(pid=28497)[0m Loading configuration... done.
[2m[36m(pid=28497)[0m Success.
[2m[36m(pid=28496)[0m Loading configuration... done.
[2m[36m(pid=28494)[0m Success.
[2m[36m(pid=28494)[0m Loading configuration... done.
[2m[36m(pid=28497)[0m Loading configuration... done.
[2m[36m(pid=28496)[0m 2019-05-27 06:21:52,947	INFO policy_evaluator.py:311 -- Creating policy evaluation worker 2 on CPU (please ignore any CUDA init errors)
[2m[36m(pid=28496)[0m 2019-05-27 06:21:52.950040: I tensorflow/core/platform/cpu_feature_guard.cc:141] Your CPU supports instructions that this TensorFlow binary was not compiled to use: AVX2 AVX512F FMA
[2m[36m(pid=28494)[0m 2019-05-27 06:21:52,964	INFO policy_evaluator.py:311 -- Creating policy evaluation worker 3 on CPU (please ignore any CUDA init errors)
[2m[36m(pid=28494)[0m 2019-05-

[2m[36m(pid=28496)[0m Loading configuration... done.
[2m[36m(pid=28496)[0m Success.
[2m[36m(pid=28496)[0m Loading configuration... done.
[2m[36m(pid=28497)[0m 2019-05-27 06:22:01,168	INFO policy_evaluator.py:474 -- Completed sample batch:
[2m[36m(pid=28497)[0m 
[2m[36m(pid=28497)[0m { 'data': { 'action_prob': np.ndarray((996,), dtype=float32, min=0.001, max=0.4, mean=0.285),
[2m[36m(pid=28497)[0m             'actions': np.ndarray((996, 1), dtype=float32, min=-3.35, max=2.893, mean=0.037),
[2m[36m(pid=28497)[0m             'advantages': np.ndarray((996,), dtype=float32, min=-21.575, max=-0.285, mean=-14.963),
[2m[36m(pid=28497)[0m             'agent_index': np.ndarray((996,), dtype=int64, min=0.0, max=4.0, mean=2.399),
[2m[36m(pid=28497)[0m             'behaviour_logits': np.ndarray((996, 2), dtype=float32, min=-0.002, max=0.005, mean=0.001),
[2m[36m(pid=28497)[0m             'dones': np.ndarray((996,), dtype=bool, min=0.0, max=1.0, mean=0.006),
[2m[36

# Train

In [8]:
# Run a single training iteration and display its result dict.
train_result = agent.train()
train_result

[2m[36m(pid=28747)[0m Loading configuration... done.
[2m[36m(pid=28747)[0m Success.
[2m[36m(pid=28746)[0m Loading configuration... done.
[2m[36m(pid=28746)[0m Success.
[2m[36m(pid=28747)[0m Loading configuration... done.
[2m[36m(pid=28746)[0m Loading configuration... done.
[2m[36m(pid=28744)[0m Loading configuration... done.
[2m[36m(pid=28744)[0m Success.
[2m[36m(pid=28744)[0m Loading configuration... done.
[2m[36m(pid=28747)[0m 2019-05-27 06:26:53,865	INFO policy_evaluator.py:311 -- Creating policy evaluation worker 2 on CPU (please ignore any CUDA init errors)
[2m[36m(pid=28747)[0m 2019-05-27 06:26:53.867931: I tensorflow/core/platform/cpu_feature_guard.cc:141] Your CPU supports instructions that this TensorFlow binary was not compiled to use: AVX2 AVX512F FMA
[2m[36m(pid=28746)[0m 2019-05-27 06:26:53,909	INFO policy_evaluator.py:311 -- Creating policy evaluation worker 3 on CPU (please ignore any CUDA init errors)
[2m[36m(pid=28746)[0m 2019-05-

2019-05-27 06:27:02,726	INFO policy_evaluator.py:564 -- Training on concatenated sample batches:

{ 'data': { 'action_prob': np.ndarray((2995,), dtype=float32, min=0.001, max=0.4, mean=0.282),
            'actions': np.ndarray((2995, 1), dtype=float32, min=-3.057, max=3.39, mean=0.005),
            'advantages': np.ndarray((2995,), dtype=float32, min=0.773, max=98.911, mean=46.491),
            'agent_index': np.ndarray((2995,), dtype=int64, min=0.0, max=5.0, mean=2.307),
            'behaviour_logits': np.ndarray((2995, 2), dtype=float32, min=-0.005, max=0.007, mean=-0.0),
            'dones': np.ndarray((2995,), dtype=bool, min=0.0, max=1.0, mean=0.006),
            'eps_id': np.ndarray((2995,), dtype=int64, min=442523900.0, max=1814317348.0, mean=1613769252.759),
            'infos': np.ndarray((2995,), dtype=object, head={'mean_vel': 2.6695644103721983, 'cost2': 0.0, 'outflow': 900.0, 'cost1': 0.10428317597198752}),
            'new_obs': np.ndarray((2995, 12), dtype=float32, min=-

[2m[36m(pid=28744)[0m 2019-05-27 06:27:02,572	INFO policy_evaluator.py:474 -- Completed sample batch:
[2m[36m(pid=28744)[0m 
[2m[36m(pid=28744)[0m { 'data': { 'action_prob': np.ndarray((1106,), dtype=float32, min=0.004, max=0.399, mean=0.282),
[2m[36m(pid=28744)[0m             'actions': np.ndarray((1106, 1), dtype=float32, min=-3.057, max=2.751, mean=0.022),
[2m[36m(pid=28744)[0m             'advantages': np.ndarray((1106,), dtype=float32, min=-21.252, max=-0.298, mean=-14.419),
[2m[36m(pid=28744)[0m             'agent_index': np.ndarray((1106,), dtype=int64, min=0.0, max=5.0, mean=2.618),
[2m[36m(pid=28744)[0m             'behaviour_logits': np.ndarray((1106, 2), dtype=float32, min=-0.005, max=0.007, mean=0.0),
[2m[36m(pid=28744)[0m             'dones': np.ndarray((1106,), dtype=bool, min=0.0, max=1.0, mean=0.006),
[2m[36m(pid=28744)[0m             'eps_id': np.ndarray((1106,), dtype=int64, min=1808980993.0, max=1814317348.0, mean=1809559982.693),
[2m[36m

2019-05-27 06:27:03,103	INFO policy_evaluator.py:586 -- Training output:

{ 'learner_stats': { 'cur_kl_coeff': 0.2,
                     'cur_lr': 0.0005000000237487257,
                     'entropy': 1.4193064,
                     'kl': -1.5921107e-10,
                     'model': {},
                     'policy_loss': -46.491283,
                     'total_loss': 3045.5593,
                     'vf_explained_var': 0.0001270771,
                     'vf_loss': 3092.0508}}



[2m[36m(pid=28744)[0m Loading configuration... done.
[2m[36m(pid=28744)[0m Success.
[2m[36m(pid=28744)[0m Loading configuration... done.
[2m[36m(pid=28747)[0m Loading configuration... done.
[2m[36m(pid=28747)[0m Success.
[2m[36m(pid=28747)[0m Loading configuration... done.
[2m[36m(pid=28746)[0m Loading configuration... done.
[2m[36m(pid=28746)[0m Success.
[2m[36m(pid=28746)[0m Loading configuration... done.


{'config': {'batch_mode': 'truncate_episodes',
  'callbacks': {'on_episode_end': None,
   'on_episode_start': None,
   'on_episode_step': None,
   'on_postprocess_traj': None,
   'on_sample_end': None,
   'on_train_result': None},
  'clip_actions': False,
  'clip_param': 0.3,
  'clip_rewards': None,
  'collect_metrics_timeout': 180,
  'compress_observations': False,
  'custom_resources_per_worker': {},
  'entropy_coeff': 0.0,
  'env': 'MultiWaveAttenuationMergePOEnvBufferedObs-v0',
  'gamma': 0.99,
  'grad_clip': None,
  'horizon': 750,
  'ignore_worker_failures': False,
  'input': 'sampler',
  'input_evaluation': ['is', 'wis'],
  'kl_coeff': 0.2,
  'kl_target': 0.01,
  'lambda': 0.97,
  'local_evaluator_tf_session_args': {'inter_op_parallelism_threads': 8,
   'intra_op_parallelism_threads': 8},
  'log_level': 'INFO',
  'lr': 0.0005,
  'lr_schedule': None,
  'metrics_smoothing_episodes': 100,
  'model': {'conv_activation': 'relu',
   'conv_filters': None,
   'custom_model': None,
   'c