# TODO

* find the valid horizon

In [1]:
import argparse
import json
import logging
import pickle
from copy import deepcopy

import gym
import numpy as np
import ray
from gym import ObservationWrapper
from ray import tune
from ray.rllib.agents import Trainer
from ray.rllib.agents.ppo.ppo import DEFAULT_CONFIG
from ray.rllib.agents.ppo.ppo_policy_graph import PPOPolicyGraph
from ray.rllib.evaluation import PolicyEvaluator, MultiAgentBatch
from ray.rllib.evaluation.metrics import collect_metrics
from ray.rllib.evaluation.postprocessing import discount
from ray.rllib.offline.json_reader import JsonReader
from ray.rllib.utils import merge_dicts
from ray.rllib.utils.annotations import override
from ray.tune.logger import pretty_print
from ray.tune.registry import register_env

from flow.utils.registry import make_create_env
from flow.utils.rllib import FlowParamsEncoder, get_flow_params
from meir import MEIRTrainer
logger = logging.getLogger(__name__)

In [2]:
# --- experiment identity -------------------------------------------------
# Name of the module under ``flow.benchmarks`` that supplies the flow params.
benchmark_name = "multi_merge"
exp_name = "test_ir"

# --- compute budget ------------------------------------------------------
# CPUs handed to ``ray.init``; together with ``num_rollouts`` it caps the
# number of rollout workers.
num_cpus = 3
num_rollouts = 3
num_iter = 10

# --- rollout / optimisation settings -------------------------------------
# Maximum episode length; also used to size the train and sample batches.
horizon = 750
# Written to the PPO config as "lambda" and "lr" respectively.
gae_lambda = 0.97
step_size = 0.0005

In [3]:
# Start (or re-attach to) the local Ray runtime.  ``logging.ERROR`` is the
# named form of the numeric level 40; ``ignore_reinit_error`` lets this cell
# be re-run without raising when Ray is already up.
ray.init(
    num_cpus=num_cpus,
    logging_level=logging.ERROR,
    ignore_reinit_error=True,
)

{'node_ip_address': '169.237.32.118',
 'object_store_address': '/tmp/ray/session_2019-05-27_21-31-03_8222/sockets/plasma_store',
 'raylet_socket_name': '/tmp/ray/session_2019-05-27_21-31-03_8222/sockets/raylet',
 'redis_address': '169.237.32.118:27703',
 'webui_url': None}

In [6]:
# Work on a private copy of the PPO defaults so the shared dict is untouched.
config = deepcopy(DEFAULT_CONFIG)
config["num_workers"] = min(num_cpus, num_rollouts)
config["train_batch_size"] = horizon * num_rollouts
# Floor division keeps the batch size an int; ``horizon / 2`` would store a
# float (375.0) under Python 3.
config["sample_batch_size"] = horizon // 2
config["use_gae"] = True
config["horizon"] = horizon
config["lambda"] = gae_lambda
config["lr"] = step_size
config["vf_clip_param"] = 1e6
config["num_sgd_iter"] = 10
config['clip_actions'] = False  # FIXME(ev) temporary ray bug
config["model"]["fcnet_hiddens"] = [128, 64, 32]
config["observation_filter"] = "NoFilter"
config["entropy_coeff"] = 0.0

# Keys below are not PPO options; MEIRTrainer._init reads them.
config["num_train"] = 2
config["expert_path"] = './expert_sample'
config["theta_lr"] = 0.1

# Import flow.benchmarks.<benchmark_name>; a non-empty ``fromlist`` makes
# ``__import__`` return the leaf module rather than the top-level package.
benchmark = __import__(
    "flow.benchmarks.%s" % benchmark_name, fromlist=["flow_params"])
flow_params = benchmark.custom_rew_flow_params

# save the flow params for replay
flow_json = json.dumps(
    flow_params, cls=FlowParamsEncoder, sort_keys=True, indent=4)
config['env_config']['flow_params'] = flow_json

In [7]:
# Build the env factory from the flow params, register it under the
# generated name, and create one local instance to read its spaces.
create_env, env_name = make_create_env(params=flow_params, version=0)
register_env(env_name, create_env)
env = create_env()

# Every agent is mapped onto one shared PPO policy.
POLICY_ID = 'rl'
default_policy = (
    PPOPolicyGraph,
    env.observation_space,
    env.action_space,
    {},
)
policy_graph = {POLICY_ID: default_policy}
config["multiagent"] = dict(
    policy_graphs=policy_graph,
    policy_mapping_fn=tune.function(lambda agent_id: POLICY_ID),
    policies_to_train=[POLICY_ID],
)

In [6]:
class CustomEnvPolicyEvaluator(PolicyEvaluator):
    """PolicyEvaluator exposing the environment's ``theta`` accessors.

    The two methods let a trainer push ``theta`` into, or read it back from,
    the environment held by this evaluator (locally or through ``.remote``).
    """

    def set_theta_to_env(self, theta):
        """Forward ``theta`` to ``self.env.set_theta``."""
        target_env = self.env
        target_env.set_theta(theta)

    def get_theta_from_env(self):
        """Return the value reported by ``self.env.get_theta``."""
        target_env = self.env
        return target_env.get_theta()

In [7]:
class MEIRTrainer(Trainer):
    """Trainer alternating PPO policy updates with an update of ``theta``.

    One ``_train`` call pushes the current ``theta`` into every evaluator's
    environment, runs ``num_train`` rounds of sampling plus PPO SGD, and then
    moves ``theta`` toward the expert's mean observation vector.

    Config keys read in addition to the PPO defaults:
        num_train: number of sample/SGD rounds per ``_train`` call.
        expert_path: location handed to ``JsonReader`` for expert samples.
        theta_lr: step size used for the ``theta`` update.
    """

    _allow_unknown_configs = True
    _name = "MEIR"
    _default_config = DEFAULT_CONFIG
    _policy_graph = PPOPolicyGraph

    @override(Trainer)
    def _init(self, config, env_name):
        """Build the evaluators, load expert samples and initialise theta.

        Args:
            config: trainer config; this method reads ``self.config``.
            env_name: forwarded unchanged as the ``env_creator`` argument of
                the evaluator factories below (NOTE(review): presumably the
                env creator supplied by the base Trainer - confirm).
        """
        self._policy_graph = self.config["multiagent"]["policy_graphs"]

        self.local_evaluator = self.make_local_evaluator(
            env_name, self._policy_graph, self.config)
        self.remote_evaluators = self.make_remote_evaluators(
            env_name, self._policy_graph, self.config["num_workers"])

        self.train_batch_size = self.config["train_batch_size"]
        self.num_sgd_iter = self.config["num_sgd_iter"]
        self.num_train = self.config["num_train"]
        self.expert_path = self.config["expert_path"]
        self.theta_lr = self.config["theta_lr"]

        # Only the first batch returned by the reader is used as expert data.
        expert_reader = JsonReader(self.expert_path)
        self.expert_samples = expert_reader.next()
        self.expert_features = self.calculate_expected_feature(
            self.expert_samples)
        # theta has the same shape as the feature vector it is matched to.
        self.theta = np.random.uniform(size=self.expert_features.shape)

    def make_local_evaluator(self,
                             env_creator,
                             policy_graph,
                             extra_config=None):
        """Convenience method to return configured local evaluator."""

        return self._make_evaluator(
            CustomEnvPolicyEvaluator,
            env_creator,
            policy_graph,
            0,
            merge_dicts(
                # important: allow local tf to use more CPUs for optimization
                merge_dicts(
                    self.config, {
                        "tf_session_args": self.
                        config["local_evaluator_tf_session_args"]
                    }),
                extra_config or {}))

    def make_remote_evaluators(self, env_creator, policy_graph, count):
        """Convenience method to return a number of remote evaluators."""

        remote_args = {
            "num_cpus": self.config["num_cpus_per_worker"],
            "num_gpus": self.config["num_gpus_per_worker"],
            "resources": self.config["custom_resources_per_worker"],
        }

        cls = CustomEnvPolicyEvaluator.as_remote(**remote_args).remote

        return [
            self._make_evaluator(cls, env_creator, policy_graph, i + 1,
                                 self.config) for i in range(count)
        ]

    def sample(self, sample_size):
        """Collect at least ``sample_size`` steps under the current theta.

        Theta and the local policy weights are pushed to the evaluators
        first.  With remote evaluators, every round gathers one batch from
        each of them; without any, batches come from the local evaluator
        (the original loop never terminated in that case).

        Returns:
            The collected batches merged by ``MultiAgentBatch.concat_samples``.
        """
        self.set_theta_to_evaluators()

        collected = []
        n_steps = 0

        if self.remote_evaluators:
            # set local weights to remote
            weights = ray.put(self.local_evaluator.get_weights())
            for e in self.remote_evaluators:
                e.set_weights.remote(weights)

            while n_steps < sample_size:
                batches = ray.get(
                    [e.sample.remote() for e in self.remote_evaluators])
                collected.extend(batches)
                n_steps += sum(b.count for b in batches)
        else:
            while n_steps < sample_size:
                batch = self.local_evaluator.sample()
                collected.append(batch)
                n_steps += batch.count

        return MultiAgentBatch.concat_samples(collected)

    def calculate_expected_feature(self, samples):
        """Return the mean observation vector of ``samples``.

        Args:
            samples: either an object indexable by ``"obs"`` or a
                multi-agent batch exposing ``policy_batches``.  In the
                latter case the observations of all policy batches are
                stacked first (assumes they share one observation size).
        """
        policy_batches = getattr(samples, "policy_batches", None)
        if policy_batches is None:
            obs = samples["obs"]
        else:
            obs = np.concatenate(
                [batch["obs"] for batch in policy_batches.values()], axis=0)
        return np.mean(obs, axis=0)

    def train_policy_by_samples(self, samples):
        """Run ``num_sgd_iter`` learning passes on ``samples``, then update KL.

        The KL coefficient of every trainable policy is updated from the
        stats of the last pass; policies without stats are skipped.
        """
        # Stays empty when num_sgd_iter == 0 so the KL update is a no-op
        # instead of a NameError.
        fetches = {}
        for _ in range(self.num_sgd_iter):
            fetches = self.local_evaluator.learn_on_batch(samples)

        def update(pi, pi_id):
            if pi_id in fetches:
                pi.update_kl(fetches[pi_id]['learner_stats']["kl"])
            else:
                logger.debug(
                    "No data for {}, not updating kl".format(pi_id))

        self.local_evaluator.foreach_trainable_policy(update)

    def set_theta_to_evaluators(self):
        """Push the current theta to the local and all remote environments."""
        self.local_evaluator.set_theta_to_env(self.theta)
        for e in self.remote_evaluators:
            e.set_theta_to_env.remote(self.theta)

    def update_theta(self, samples, learning_rate=0.01):
        """Step theta toward the expert features.

        Args:
            samples: batch whose mean observation is compared to the expert's.
            learning_rate: multiplier applied to the feature difference.

        Returns:
            Norm of ``expert_features - features`` for the given samples.
        """
        features = self.calculate_expected_feature(samples)
        update = self.expert_features - features
        self.theta += learning_rate * update
        return np.linalg.norm(update)

    @override(Trainer)
    def _train(self):
        """Run one iteration: policy optimisation followed by a theta update.

        Returns:
            The metrics dict of the last policy round (or a fresh collection
            when ``num_train`` is 0) with ``custom_metrics["theta_norm"]``
            set to the feature-difference norm.
        """
        self.set_theta_to_evaluators()

        res = None
        # optimize policy under estimated reward
        for _ in range(self.num_train):
            # collect samples with new reward fnc
            samples = self.sample(self.train_batch_size)

            # train local based on samples
            self.train_policy_by_samples(samples)
            res = collect_metrics(self.local_evaluator, self.remote_evaluators)
            # pretty_print only builds a string; log it so it is not lost.
            logger.info(pretty_print(res))

        if res is None:
            # No policy round ran; still hand back a metrics dict.
            res = collect_metrics(self.local_evaluator, self.remote_evaluators)

        samples = self.sample(self.train_batch_size)
        norm = self.update_theta(samples, self.theta_lr)

        res.setdefault("custom_metrics", {})["theta_norm"] = norm
        return res

    @override(Trainer)
    def __getstate__(self):
        """Extend the base trainer state with the current theta."""
        state = super().__getstate__()
        state["theta"] = self.theta
        return state

    @override(Trainer)
    def __setstate__(self, state):
        """Restore the base trainer state and theta when it was saved."""
        super().__setstate__(state)
        if "theta" in state:
            self.theta = state["theta"]


In [8]:
# Instantiate the trainer defined above with the assembled config and the
# registered environment name.
agent = MEIRTrainer(config, env_name)

2019-05-27 21:31:27,440	INFO policy_evaluator.py:311 -- Creating policy evaluation worker 0 on CPU (please ignore any CUDA init errors)
  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "
2019-05-27 21:31:28,875	INFO policy_evaluator.py:728 -- Built policy map: {'rl': <ray.rllib.agents.ppo.ppo_policy_graph.PPOPolicyGraph object at 0x7f4b84609c88>}
2019-05-27 21:31:28,876	INFO policy_evaluator.py:729 -- Built preprocessor map: {'rl': <ray.rllib.models.preprocessors.NoPreprocessor object at 0x7f4b84609940>}
2019-05-27 21:31:28,877	INFO policy_evaluator.py:343 -- Built filter map: {'rl': <ray.rllib.utils.filter.NoFilter object at 0x7f4b8460cdd8>}
2019-05-27 21:31:28,962	INFO json_reader.py:65 -- Found 1 input files.


In [8]:
# Run the SGD passes on an already collected batch.  Requires `agent` and
# `multi_samples` (created in a later cell) to exist in the kernel.
agent.train_policy_by_samples(multi_samples)

NameError: name 'agent' is not defined

In [9]:
# Single learning pass on the local evaluator.  Requires `agent` and
# `multi_samples` (created in a later cell) to exist in the kernel.
agent.local_evaluator.learn_on_batch(multi_samples)

NameError: name 'agent' is not defined

In [7]:
# Gather at least `train_batch_size` steps from the evaluators.
multi_samples = agent.sample(agent.train_batch_size)

[2m[36m(pid=6927)[0m Loading configuration... done.
[2m[36m(pid=6927)[0m Success.
[2m[36m(pid=6927)[0m Loading configuration... done.
[2m[36m(pid=6930)[0m Loading configuration... done.
[2m[36m(pid=6930)[0m Success.
[2m[36m(pid=6930)[0m Loading configuration... done.
[2m[36m(pid=6931)[0m Loading configuration... done.
[2m[36m(pid=6931)[0m Success.
[2m[36m(pid=6931)[0m Loading configuration... done.
[2m[36m(pid=6927)[0m 2019-05-27 21:04:21,855	INFO policy_evaluator.py:311 -- Creating policy evaluation worker 2 on CPU (please ignore any CUDA init errors)
[2m[36m(pid=6927)[0m 2019-05-27 21:04:21.857309: I tensorflow/core/platform/cpu_feature_guard.cc:141] Your CPU supports instructions that this TensorFlow binary was not compiled to use: AVX2 AVX512F FMA
[2m[36m(pid=6930)[0m 2019-05-27 21:04:22,035	INFO policy_evaluator.py:311 -- Creating policy evaluation worker 3 on CPU (please ignore any CUDA init errors)
[2m[36m(pid=6930)[0m 2019-05-27 21:04:22.0

[2m[36m(pid=6931)[0m 2019-05-27 21:04:30,348	INFO policy_evaluator.py:474 -- Completed sample batch:
[2m[36m(pid=6931)[0m 
[2m[36m(pid=6931)[0m { 'count': 375,
[2m[36m(pid=6931)[0m   'policy_batches': { 'rl': { 'data': { 'action_prob': np.ndarray((1030,), dtype=float32, min=0.0, max=0.399, mean=0.278),
[2m[36m(pid=6931)[0m                                         'actions': np.ndarray((1030, 1), dtype=float32, min=-2.951, max=4.228, mean=-0.026),
[2m[36m(pid=6931)[0m                                         'advantages': np.ndarray((1030,), dtype=float32, min=0.64, max=118.401, mean=47.033),
[2m[36m(pid=6931)[0m                                         'agent_index': np.ndarray((1030,), dtype=int64, min=0.0, max=4.0, mean=2.329),
[2m[36m(pid=6931)[0m                                         'behaviour_logits': np.ndarray((1030, 2), dtype=float32, min=-0.004, max=0.006, mean=0.002),
[2m[36m(pid=6931)[0m                                         'dones': np.ndarray((

In [37]:
# Pick out the batch belonging to the single 'rl' policy.
sample = multi_samples.policy_batches["rl"]

In [39]:
# Inspect the observation rows of the 'rl' batch.
sample["obs"]

array([[ 0.36196366,  0.6380364 ,  1.        , ...,  1.        ,
         1.        ,  0.        ],
       [ 0.3697982 ,  0.6302018 ,  1.        , ...,  1.        ,
         1.        ,  0.        ],
       [ 0.35862324,  0.64137673,  1.        , ...,  1.        ,
         1.        ,  0.        ],
       ...,
       [ 0.33383006, -0.05302032,  0.0212647 , ...,  0.00737061,
         0.01090145,  0.52218354],
       [ 0.33250976, -0.06333038,  0.02053326, ...,  0.01444553,
         0.01105476,  0.5197702 ],
       [ 0.3248492 , -0.0670458 ,  0.01973958, ..., -0.01763957,
         0.010934  ,  0.5175963 ]], dtype=float32)

In [40]:
# Inspect the per-row rewards of the 'rl' batch.
sample["rewards"]

array([2.2152696, 2.1731894, 2.1667137, ..., 1.7010003, 1.6543605,
       1.6315465], dtype=float32)

In [41]:
# Inspect the agent_index column of the 'rl' batch.
sample["agent_index"]

array([0, 0, 0, ..., 5, 5, 5])

In [39]:
# Distinct eps_id values present in the batch.
set(sample["eps_id"])

{747833057, 988082960}

In [42]:
# Print one line per row: t, reward, advantage, done flag.
# Note: `R` holds the "advantages" column, not a return.
for t, r, R, d in zip(sample["t"], sample["rewards"], sample["advantages"], sample["dones"]):
    print(t, r, R, d)

100 -0.6333765 -4.452097 False
101 -0.63606596 -3.976601 False
102 -0.6352114 -3.4785948 False
103 -0.6365751 -2.960939 False
104 -0.63839877 -2.420503 False
105 -0.6410927 -1.8558179 False
106 -0.6455265 -1.2649575 False
107 -0.6455265 -0.6450524 True
100 -0.6333765 -18.280396 False
101 -0.63606596 -18.376608 False
102 -0.6352114 -18.473948 False
103 -0.6365751 -18.576174 False
104 -0.63839877 -18.68123 False
105 -0.6410927 -18.788738 False
106 -0.6455265 -18.897936 False
107 -0.65703934 -19.006937 False
108 -0.658573 -19.107588 False
109 -0.6617328 -19.211756 False
110 -0.664001 -19.316866 False
111 -0.6780474 -19.423962 False
112 -0.68264073 -19.520853 False
113 -0.6847072 -19.61694 False
114 -0.6876112 -19.7149 False
115 -0.6897008 -19.81383 False
116 -0.7259685 -19.914776 False
117 -0.7269529 -19.982077 False
118 -0.7279562 -20.051197 False
119 -0.7274213 -20.122076 False
120 -0.74273753 -20.196487 False
121 -0.7475499 -20.25798 False
122 -0.74694234 -20.316977 False
123 -0.740030

244 -0.85590637 -21.506037 False
245 -0.8580206 -21.503834 False
246 -0.8617463 -21.499231 False
247 -0.8635688 -21.49061 False
248 -0.86485434 -21.479729 False
249 -0.8674602 -21.467033 False
250 -0.8702177 -21.451092 False
251 -0.87391156 -21.431618 False
252 -0.87630224 -21.407522 False
253 -0.86638236 -21.379911 False
254 -0.86834073 -21.362856 False
255 -0.87065876 -21.341686 False
256 -0.87270916 -21.317234 False
257 -0.87508684 -21.289639 False
258 -0.87840265 -21.258392 False
259 -0.88430345 -21.222452 False
260 -0.88931185 -21.178835 False
261 -0.8717333 -21.12824 False
262 -0.8755939 -21.093897 False
263 -0.8790983 -21.0541 False
264 -0.8824948 -21.008865 False
265 -0.88781554 -20.958344 False
266 -0.89115316 -20.900242 False
267 -0.89405036 -20.836185 False
268 -0.89883566 -20.766449 False
269 -0.90125215 -20.688831 False
270 -0.90475315 -20.605595 False
271 -0.9070437 -20.515165 False
272 -0.9078942 -20.418724 False
273 -0.911102 -20.317375 False
274 -0.91309875 -20.208427 

86 -0.48923227 -12.315406 False
87 -0.49029604 -12.315093 False
88 -0.49169558 -12.313591 False
89 -0.4953892 -12.3107195 False
90 -0.49725667 -12.303767 False
91 -0.4969456 -12.294511 False
92 -0.4971596 -12.285252 False
93 -0.49795428 -12.275498 False
94 -0.4991962 -12.264403 False
95 -0.5025249 -12.251591 False
96 -0.50252295 -12.234831 False
97 -0.5020445 -12.217384 False
98 -0.50265676 -12.199721 False
99 -0.5018924 -12.180672 False
100 -0.5051327 -12.161638 False
101 -0.50613135 -12.138301 False
102 -0.50668246 -12.113116 False
103 -0.5079531 -12.0863285 False
104 -0.5099129 -12.056898 False
105 -0.5133087 -12.024413 False
106 -0.5146088 -11.987015 False
107 -0.5155978 -11.946673 False
108 -0.5166486 -11.903727 False
109 -0.5170045 -11.857773 False
110 -0.51818323 -11.809677 False
111 -0.522196 -11.758314 False
112 -0.5235066 -11.700353 False
113 -0.52425724 -11.638831 False
114 -0.5264126 -11.57406 False
115 -0.52922046 -11.504146 False
116 -0.54060996 -11.428601 False
117 -0.54

# Debug

In [7]:
# Collect a fresh batch of at least `train_batch_size` steps for debugging.
samples = agent.sample(agent.train_batch_size)

[2m[36m(pid=7456)[0m Loading configuration... done.
[2m[36m(pid=7456)[0m Success.
[2m[36m(pid=7456)[0m Loading configuration... done.
[2m[36m(pid=7454)[0m Loading configuration... done.
[2m[36m(pid=7454)[0m Success.
[2m[36m(pid=7454)[0m Loading configuration... done.
[2m[36m(pid=7457)[0m Loading configuration... done.
[2m[36m(pid=7457)[0m Success.
[2m[36m(pid=7457)[0m Loading configuration... done.
[2m[36m(pid=7456)[0m 2019-05-27 21:18:22,497	INFO policy_evaluator.py:311 -- Creating policy evaluation worker 1 on CPU (please ignore any CUDA init errors)
[2m[36m(pid=7456)[0m 2019-05-27 21:18:22.499366: I tensorflow/core/platform/cpu_feature_guard.cc:141] Your CPU supports instructions that this TensorFlow binary was not compiled to use: AVX2 AVX512F FMA
[2m[36m(pid=7454)[0m 2019-05-27 21:18:22,702	INFO policy_evaluator.py:311 -- Creating policy evaluation worker 3 on CPU (please ignore any CUDA init errors)
[2m[36m(pid=7454)[0m 2019-05-27 21:18:22.7

[2m[36m(pid=7454)[0m Loading configuration... done.
[2m[36m(pid=7454)[0m Success.
[2m[36m(pid=7454)[0m Loading configuration... done.
[2m[36m(pid=7457)[0m Loading configuration... done.
[2m[36m(pid=7457)[0m Success.
[2m[36m(pid=7457)[0m Loading configuration... done.
[2m[36m(pid=7456)[0m 2019-05-27 21:18:31,288	INFO policy_evaluator.py:474 -- Completed sample batch:
[2m[36m(pid=7456)[0m 
[2m[36m(pid=7456)[0m { 'count': 375,
[2m[36m(pid=7456)[0m   'policy_batches': { 'rl': { 'data': { 'action_prob': np.ndarray((1200,), dtype=float32, min=0.006, max=0.401, mean=0.28),
[2m[36m(pid=7456)[0m                                         'actions': np.ndarray((1200, 1), dtype=float32, min=-2.804, max=2.912, mean=-0.02),
[2m[36m(pid=7456)[0m                                         'advantages': np.ndarray((1200,), dtype=float32, min=0.839, max=94.008, mean=39.505),
[2m[36m(pid=7456)[0m                                         'agent_index': np.ndarray((1200,), 

In [12]:
# Shuffle every per-policy batch.  The return value is not captured, so this
# presumably reorders each batch in place - confirm against the batch class.
# Note: the loop rebinds the notebook-level name `sample`.
for sample in samples.policy_batches.values():
    sample.shuffle()

# Train

In [9]:
# Run one training iteration and display the returned result dict
# (presumably dispatches to MEIRTrainer._train via the base Trainer).
agent.train()

[2m[36m(pid=8260)[0m Loading configuration... done.
[2m[36m(pid=8260)[0m Success.
[2m[36m(pid=8260)[0m Loading configuration... done.
[2m[36m(pid=8261)[0m Loading configuration... done.
[2m[36m(pid=8261)[0m Success.
[2m[36m(pid=8258)[0m Loading configuration... done.
[2m[36m(pid=8258)[0m Success.
[2m[36m(pid=8261)[0m Loading configuration... done.
[2m[36m(pid=8258)[0m Loading configuration... done.
[2m[36m(pid=8260)[0m 2019-05-27 21:31:40,532	INFO policy_evaluator.py:311 -- Creating policy evaluation worker 3 on CPU (please ignore any CUDA init errors)
[2m[36m(pid=8260)[0m 2019-05-27 21:31:40.534995: I tensorflow/core/platform/cpu_feature_guard.cc:141] Your CPU supports instructions that this TensorFlow binary was not compiled to use: AVX2 AVX512F FMA
[2m[36m(pid=8258)[0m 2019-05-27 21:31:40,561	INFO policy_evaluator.py:311 -- Creating policy evaluation worker 1 on CPU (please ignore any CUDA init errors)
[2m[36m(pid=8258)[0m 2019-05-27 21:31:40.5

[2m[36m(pid=8260)[0m Loading configuration... done.
[2m[36m(pid=8260)[0m Success.
[2m[36m(pid=8260)[0m Loading configuration... done.
[2m[36m(pid=8261)[0m Loading configuration... done.
[2m[36m(pid=8261)[0m Success.
[2m[36m(pid=8261)[0m Loading configuration... done.
[2m[36m(pid=8258)[0m 2019-05-27 21:31:48,686	INFO policy_evaluator.py:474 -- Completed sample batch:
[2m[36m(pid=8258)[0m 
[2m[36m(pid=8258)[0m { 'count': 375,
[2m[36m(pid=8258)[0m   'policy_batches': { 'rl': { 'data': { 'action_prob': np.ndarray((1021,), dtype=float32, min=0.001, max=0.403, mean=0.284),
[2m[36m(pid=8258)[0m                                         'actions': np.ndarray((1021, 1), dtype=float32, min=-3.34, max=2.92, mean=-0.012),
[2m[36m(pid=8258)[0m                                         'advantages': np.ndarray((1021,), dtype=float32, min=0.431, max=77.092, mean=33.514),
[2m[36m(pid=8258)[0m                                         'agent_index': np.ndarray((1021,), 

2019-05-27 21:31:54,339	INFO policy_evaluator.py:564 -- Training on concatenated sample batches:

{ 'count': 2250,
  'policy_batches': { 'rl': { 'data': { 'action_prob': np.ndarray((6003,), dtype=float32, min=0.0, max=0.403, mean=0.282),
                                        'actions': np.ndarray((6003, 1), dtype=float32, min=-3.57, max=3.873, mean=0.001),
                                        'advantages': np.ndarray((6003,), dtype=float32, min=0.408, max=77.767, mean=34.04),
                                        'agent_index': np.ndarray((6003,), dtype=int64, min=0.0, max=5.0, mean=2.382),
                                        'behaviour_logits': np.ndarray((6003, 2), dtype=float32, min=-0.011, max=0.001, mean=-0.003),
                                        'dones': np.ndarray((6003,), dtype=bool, min=0.0, max=1.0, mean=0.006),
                                        'eps_id': np.ndarray((6003,), dtype=int64, min=196699457.0, max=1946196971.0, mean=1063088589.475),
         

[2m[36m(pid=8258)[0m Loading configuration... done.
[2m[36m(pid=8258)[0m Success.
[2m[36m(pid=8258)[0m Loading configuration... done.
[2m[36m(pid=8260)[0m Loading configuration... done.
[2m[36m(pid=8260)[0m Success.
[2m[36m(pid=8260)[0m Loading configuration... done.
[2m[36m(pid=8261)[0m Loading configuration... done.
[2m[36m(pid=8261)[0m Success.
[2m[36m(pid=8261)[0m Loading configuration... done.
[2m[36m(pid=8260)[0m Loading configuration... done.
[2m[36m(pid=8260)[0m Success.
[2m[36m(pid=8260)[0m Loading configuration... done.
[2m[36m(pid=8261)[0m Loading configuration... done.
[2m[36m(pid=8261)[0m Success.
[2m[36m(pid=8261)[0m Loading configuration... done.
[2m[36m(pid=8258)[0m Loading configuration... done.
[2m[36m(pid=8258)[0m Success.
[2m[36m(pid=8258)[0m Loading configuration... done.
[2m[36m(pid=8258)[0m Loading configuration... done.
[2m[36m(pid=8258)[0m Success.
[2m[36m(pid=8258)[0m Loading configuration... done.

{'config': {'batch_mode': 'truncate_episodes',
  'callbacks': {'on_episode_end': None,
   'on_episode_start': None,
   'on_episode_step': None,
   'on_postprocess_traj': None,
   'on_sample_end': None,
   'on_train_result': None},
  'clip_actions': False,
  'clip_param': 0.3,
  'clip_rewards': None,
  'collect_metrics_timeout': 180,
  'compress_observations': False,
  'custom_resources_per_worker': {},
  'entropy_coeff': 0.0,
  'env': 'MultiWaveAttenuationMergePOEnvCustomRew-v0',
  'expert_path': './expert_sample',
  'gamma': 0.99,
  'grad_clip': None,
  'horizon': 750,
  'ignore_worker_failures': False,
  'input': 'sampler',
  'input_evaluation': ['is', 'wis'],
  'kl_coeff': 0.2,
  'kl_target': 0.01,
  'lambda': 0.97,
  'local_evaluator_tf_session_args': {'inter_op_parallelism_threads': 8,
   'intra_op_parallelism_threads': 8},
  'log_level': 'INFO',
  'lr': 0.0005,
  'lr_schedule': None,
  'metrics_smoothing_episodes': 100,
  'model': {'conv_activation': 'relu',
   'conv_filters': No