Commit

support for custom summaries, parameter min/max-bounds, agent argument batch_size mandatory, removed estimator argument capacity, major internal changes
AlexKuhnle committed Mar 8, 2020
1 parent 1470ea8 commit b36f4c3
Showing 74 changed files with 1,789 additions and 1,429 deletions.
9 changes: 7 additions & 2 deletions UPDATE_NOTES.md
@@ -6,8 +6,13 @@ This file records all major updates and new features, starting from version 0.5.

### Latest

- Changed independent mode of `agent.act` to use final values of parameters and avoid TensorFlow conditions
- Extended `"tensorflow"` format of `agent.save` to include an optimized Protobuf model with an act-only graph as `.pb` file
- Changed independent mode of `agent.act` to use final values of dynamic hyperparameters and avoid TensorFlow conditions
- Extended `"tensorflow"` format of `agent.save` to include an optimized Protobuf model with an act-only graph as `.pb` file, and `Agent.load` format `"pb-actonly"` to load act-only agent based on Protobuf model
- Support for custom summaries via new `summarizer` argument value `custom` to specify summary type, and `Agent.summarize(...)` to record summary values
- Added min/max-bounds for dynamic hyperparameters to assert valid range and infer other arguments
- Argument `batch_size` now mandatory for all agent classes
- Removed `Estimator` argument `capacity`, now always automatically inferred
- Internal changes related to agent arguments `memory`, `update` and `reward_estimation`
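
The entries above change the agent-facing API: summaries can be declared under a new `custom` key of the `summarizer` argument and recorded with `Agent.summarize(...)`, and `batch_size` must now be passed explicitly. A rough sketch of how this might look; the exact `Agent.summarize` signature and the summary-spec fields are assumptions, not shown in full in this diff:

```python
from tensorforce import Agent, Environment

environment = Environment.create(environment='gym', level='CartPole-v1')

agent = Agent.create(
    agent='a2c', environment=environment,
    batch_size=10,  # batch_size is now mandatory for all agent classes
    # hypothetical custom summary declared under the new `custom` summarizer key
    summarizer=dict(
        directory='summaries',
        custom=dict(my_loss=dict(type='scalar'))
    )
)

# record a value for the declared custom summary (assumed signature: name, value)
agent.summarize('my_loss', 0.42)
```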



5 changes: 5 additions & 0 deletions docs/agents/agent.rst
@@ -31,6 +31,11 @@ Get and assign variables
.. automethod:: tensorforce.agents.TensorforceAgent.get_variable
.. automethod:: tensorforce.agents.TensorforceAgent.assign_variable

Custom summaries
----------------

.. automethod:: tensorforce.agents.TensorforceAgent.summarize

Advanced functions for specialized use cases
--------------------------------------------

15 changes: 6 additions & 9 deletions run.py
@@ -21,10 +21,7 @@
import matplotlib
import numpy as np

from tensorforce.agents import Agent
from tensorforce.core.utils.json_encoder import NumpyJSONEncoder
from tensorforce.environments import Environment
from tensorforce.execution import Runner
from tensorforce import Agent, Environment, Runner

matplotlib.use('Agg')
import matplotlib.pyplot as plt
@@ -141,10 +138,10 @@ def main():
agent_seconds = [list() for _ in range(args.episodes)]

def callback(r, p):
rewards[r.episodes - 1].append(r.episode_rewards[-1])
timesteps[r.episodes - 1].append(r.episode_timesteps[-1])
seconds[r.episodes - 1].append(r.episode_seconds[-1])
agent_seconds[r.episodes - 1].append(r.episode_agent_seconds[-1])
rewards[r.episodes - 1].append(float(r.episode_rewards[-1]))
timesteps[r.episodes - 1].append(int(r.episode_timesteps[-1]))
seconds[r.episodes - 1].append(float(r.episode_seconds[-1]))
agent_seconds[r.episodes - 1].append(float(r.episode_agent_seconds[-1]))
return True

if args.environment is None:
@@ -203,7 +200,7 @@ def callback(r, p):
json.dumps(dict(
rewards=rewards, timesteps=timesteps, seconds=seconds,
agent_seconds=agent_seconds
), cls=NumpyJSONEncoder)
))
)

if args.seaborn:
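
The callback above now casts episode statistics to plain Python `float`/`int`, which lets the results be serialized with a plain `json.dumps` instead of the removed `NumpyJSONEncoder`; NumPy scalar types such as `np.float32` or `np.int64` are rejected by the standard encoder. A standalone illustration, independent of Tensorforce:

```python
import json

import numpy as np

value = np.float32(1.5)

# json.dumps(dict(reward=value))  # TypeError: Object of type float32 is not JSON serializable
print(json.dumps(dict(reward=float(value))))  # after casting: {"reward": 1.5}
```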
2 changes: 1 addition & 1 deletion setup.py
@@ -105,7 +105,7 @@
extras_require=dict(
tf=['tensorflow==2.0.1'],
tf_gpu=['tensorflow-gpu==2.0.1'],
tfa=['tensorflow-addons'],
tfa=['tensorflow-addons==0.6.0'],
docs=['m2r', 'recommonmark', 'sphinx', 'sphinx-rtd-theme'],
tune=['hpbandster'],
envs=['gym[all]', 'gym-retro', 'mazeexp', 'vizdoom'],
56 changes: 33 additions & 23 deletions tensorforce/agents/a2c.py
@@ -57,13 +57,18 @@ class AdvantageActorCritic(TensorforceAgent):
(<span style="color:#00C000"><b>default</b></span>: not given, better implicitly
specified via `environment` argument for `Agent.create(...)`).
batch_size (parameter, long > 0): Number of timesteps per update batch
(<span style="color:#C00000"><b>required</b></span>).
network ("auto" | specification): Policy network configuration, see
[networks](../modules/networks.html)
(<span style="color:#00C000"><b>default</b></span>: "auto", automatically configured
network).
batch_size (parameter, long > 0): Number of episodes per update batch
(<span style="color:#00C000"><b>default</b></span>: 10 episodes).
memory (int > 0): Batch memory capacity, has to fit at least maximum batch_size + maximum
network/estimator horizon + 1 timesteps
(<span style="color:#00C000"><b>default</b></span>: minimum capacity, usually does not
need to be changed).
update_frequency ("never" | parameter, long > 0): Frequency of updates
(<span style="color:#00C000"><b>default</b></span>: batch_size).
learning_rate (parameter, float > 0.0): Optimizer learning rate
@@ -88,9 +93,6 @@ class AdvantageActorCritic(TensorforceAgent):
the critic loss
(<span style="color:#00C000"><b>default</b></span>: 1.0).
memory (int > 0): Memory capacity, has to fit at least around batch_size + one episode
(<span style="color:#00C000"><b>default</b></span>: minimum required size).
preprocessing (dict[specification]): Preprocessing as layer or list of layers, see
[preprocessing](../modules/preprocessing.html), specified per state-type or -name and
for reward
@@ -155,6 +157,11 @@ class AdvantageActorCritic(TensorforceAgent):
summary writer (<span style="color:#00C000"><b>default</b></span>: 10).</li>
<li><b>max-summaries</b> (<i>int > 0</i>) &ndash; maximum number of summaries to keep
(<span style="color:#00C000"><b>default</b></span>: 5).</li>
<li><b>custom</b> (<i>dict[spec]</i>) &ndash; custom summaries which are recorded via
`agent.summarize(...)`, specification with either type "scalar", type "histogram" with
optional "buckets", type "image" with optional "max_outputs"
(<span style="color:#00C000"><b>default</b></span>: 3), or type "audio"
(<span style="color:#00C000"><b>default</b></span>: no custom summaries).</li>
<li><b>labels</b> (<i>"all" | iter[string]</i>) &ndash; all excluding "*-histogram"
labels, or list of summaries to record, from the following labels
(<span style="color:#00C000"><b>default</b></span>: only "graph"):</li>
@@ -179,8 +186,9 @@ class AdvantageActorCritic(TensorforceAgent):
<li>"variables": variable mean and variance scalars</li>
<li>"variables-histogram": variable histograms</li>
</ul>
recorder (specification): Experience traces recorder configuration with the following
attributes (<span style="color:#00C000"><b>default</b></span>: no recorder):
recorder (specification): Experience traces recorder configuration, currently not including
internal states, with the following attributes
(<span style="color:#00C000"><b>default</b></span>: no recorder):
<ul>
<li><b>directory</b> (<i>path</i>) &ndash; recorder directory
(<span style="color:#C00000"><b>required</b></span>).</li>
@@ -193,18 +201,20 @@
"""

def __init__(
# Required
self, states, actions, batch_size,
# Environment
self, states, actions, max_episode_timesteps,
max_episode_timesteps=None,
# Network
network='auto',
# Memory
memory=None,
# Optimization
batch_size=10, update_frequency=None, learning_rate=3e-4,
update_frequency=None, learning_rate=3e-4,
# Reward estimation
horizon=0, discount=0.99, state_action_value=False, estimate_terminal=False,
# Critic
critic_network='auto', critic_optimizer=1.0,
# Memory
memory=None,
# Preprocessing
preprocessing=None,
# Exploration
@@ -217,38 +227,38 @@ def __init__(
):
self.spec = OrderedDict(
agent='a2c',
states=states, actions=actions, max_episode_timesteps=max_episode_timesteps,
states=states, actions=actions, batch_size=batch_size,
max_episode_timesteps=max_episode_timesteps,
network=network,
batch_size=batch_size, update_frequency=update_frequency, learning_rate=learning_rate,
memory=memory,
update_frequency=update_frequency, learning_rate=learning_rate,
horizon=horizon, discount=discount, state_action_value=state_action_value,
estimate_terminal=estimate_terminal,
estimate_terminal=estimate_terminal,
critic_network=critic_network, critic_optimizer=critic_optimizer,
preprocessing=preprocessing,
exploration=exploration, variable_noise=variable_noise,
l2_regularization=l2_regularization, entropy_regularization=entropy_regularization,
name=name, device=device, parallel_interactions=parallel_interactions, seed=seed,
execution=execution, saver=saver, summarizer=summarizer, recorder=recorder,
config=config
execution=execution, saver=saver, summarizer=summarizer, recorder=recorder,
config=config
)

policy = dict(network=network, temperature=1.0)
if memory is None:
memory = dict(type='recent', capacity=(batch_size + max_episode_timesteps + horizon))
memory = dict(type='recent')
else:
memory = dict(type='recent', capacity=memory)
if update_frequency is None:
update = dict(unit='timesteps', batch_size=batch_size)
else:
update = dict(unit='timesteps', batch_size=batch_size, frequency=update_frequency)
update = dict(unit='timesteps', batch_size=batch_size)
if update_frequency is not None:
update['frequency'] = update_frequency
optimizer = dict(type='adam', learning_rate=learning_rate)
objective = 'policy_gradient'
reward_estimation = dict(
horizon=horizon, discount=discount, estimate_horizon='early',
estimate_actions=state_action_value, estimate_terminal=estimate_terminal,
estimate_advantage=True
)
# State value doesn't exist for Beta
baseline_policy = dict(network=critic_network, distributions=dict(float='gaussian'))
baseline_policy = dict(network=critic_network)
if state_action_value:
baseline_objective = dict(type='value', value='action')
else:
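
For orientation, a sketch of constructing the reworked A2C agent directly; the environment, the hyperparameter values, and the commented decaying learning-rate spec (including the bound-argument names) are assumptions for illustration only:

```python
from tensorforce import Environment
from tensorforce.agents import AdvantageActorCritic

environment = Environment.create(environment='gym', level='CartPole-v1')

# batch_size is now required; memory capacity is inferred automatically when left as None
agent = AdvantageActorCritic(
    states=environment.states(), actions=environment.actions(),
    max_episode_timesteps=environment.max_episode_timesteps(),
    batch_size=10, update_frequency=2, learning_rate=3e-4
)

# hypothetical decaying learning rate using the new min/max-bounds (argument names assumed):
# learning_rate=dict(type='decaying', decay='exponential', unit='timesteps',
#                    num_steps=10000, initial_value=3e-4, decay_rate=0.5, min_value=1e-5)
```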
49 changes: 30 additions & 19 deletions tensorforce/agents/ac.py
@@ -57,13 +57,18 @@ class ActorCritic(TensorforceAgent):
(<span style="color:#00C000"><b>default</b></span>: not given, better implicitly
specified via `environment` argument for `Agent.create(...)`).
batch_size (parameter, long > 0): Number of timesteps per update batch
(<span style="color:#C00000"><b>required</b></span>).
network ("auto" | specification): Policy network configuration, see
[networks](../modules/networks.html)
(<span style="color:#00C000"><b>default</b></span>: "auto", automatically configured
network).
batch_size (parameter, long > 0): Number of episodes per update batch
(<span style="color:#00C000"><b>default</b></span>: 10 episodes).
memory (int > 0): Batch memory capacity, has to fit at least maximum batch_size + maximum
network/estimator horizon + 1 timesteps
(<span style="color:#00C000"><b>default</b></span>: minimum capacity, usually does not
need to be changed).
update_frequency ("never" | parameter, long > 0): Frequency of updates
(<span style="color:#00C000"><b>default</b></span>: batch_size).
learning_rate (parameter, float > 0.0): Optimizer learning rate
@@ -88,9 +93,6 @@ class ActorCritic(TensorforceAgent):
the critic loss
(<span style="color:#00C000"><b>default</b></span>: 1.0).
memory (int > 0): Memory capacity, has to fit at least around batch_size + one episode
(<span style="color:#00C000"><b>default</b></span>: minimum required size).
preprocessing (dict[specification]): Preprocessing as layer or list of layers, see
[preprocessing](../modules/preprocessing.html), specified per state-type or -name and
for reward
@@ -155,6 +157,11 @@ class ActorCritic(TensorforceAgent):
summary writer (<span style="color:#00C000"><b>default</b></span>: 10).</li>
<li><b>max-summaries</b> (<i>int > 0</i>) &ndash; maximum number of summaries to keep
(<span style="color:#00C000"><b>default</b></span>: 5).</li>
<li><b>custom</b> (<i>dict[spec]</i>) &ndash; custom summaries which are recorded via
`agent.summarize(...)`, specification with either type "scalar", type "histogram" with
optional "buckets", type "image" with optional "max_outputs"
(<span style="color:#00C000"><b>default</b></span>: 3), or type "audio"
(<span style="color:#00C000"><b>default</b></span>: no custom summaries).</li>
<li><b>labels</b> (<i>"all" | iter[string]</i>) &ndash; all excluding "*-histogram"
labels, or list of summaries to record, from the following labels
(<span style="color:#00C000"><b>default</b></span>: only "graph"):</li>
@@ -179,8 +186,9 @@ class ActorCritic(TensorforceAgent):
<li>"variables": variable mean and variance scalars</li>
<li>"variables-histogram": variable histograms</li>
</ul>
recorder (specification): Experience traces recorder configuration with the following
attributes (<span style="color:#00C000"><b>default</b></span>: no recorder):
recorder (specification): Experience traces recorder configuration, currently not including
internal states, with the following attributes
(<span style="color:#00C000"><b>default</b></span>: no recorder):
<ul>
<li><b>directory</b> (<i>path</i>) &ndash; recorder directory
(<span style="color:#C00000"><b>required</b></span>).</li>
@@ -193,18 +201,20 @@
"""

def __init__(
# Required
self, states, actions, batch_size,
# Environment
self, states, actions, max_episode_timesteps,
max_episode_timesteps=None,
# Network
network='auto',
# Memory
memory=None,
# Optimization
batch_size=10, update_frequency=None, learning_rate=3e-4,
update_frequency=None, learning_rate=3e-4,
# Reward estimation
horizon=0, discount=0.99, state_action_value=False, estimate_terminal=False,
# Critic
critic_network='auto', critic_optimizer=1.0,
# Memory
memory=None,
# Preprocessing
preprocessing=None,
# Exploration
@@ -217,23 +227,25 @@
):
self.spec = OrderedDict(
agent='ac',
states=states, actions=actions, max_episode_timesteps=max_episode_timesteps,
states=states, actions=actions, batch_size=batch_size,
max_episode_timesteps=max_episode_timesteps,
network=network,
batch_size=batch_size, update_frequency=update_frequency, learning_rate=learning_rate,
memory=memory,
update_frequency=update_frequency, learning_rate=learning_rate,
horizon=horizon, discount=discount, state_action_value=state_action_value,
estimate_terminal=estimate_terminal,
estimate_terminal=estimate_terminal,
critic_network=critic_network, critic_optimizer=critic_optimizer,
preprocessing=preprocessing,
exploration=exploration, variable_noise=variable_noise,
l2_regularization=l2_regularization, entropy_regularization=entropy_regularization,
name=name, device=device, parallel_interactions=parallel_interactions, seed=seed,
execution=execution, saver=saver, summarizer=summarizer, recorder=recorder,
config=config
execution=execution, saver=saver, summarizer=summarizer, recorder=recorder,
config=config
)

policy = dict(network=network, temperature=1.0)
if memory is None:
memory = dict(type='recent', capacity=(batch_size + max_episode_timesteps + horizon))
memory = dict(type='recent')
else:
memory = dict(type='recent', capacity=memory)
if update_frequency is None:
@@ -246,8 +258,7 @@ def __init__(
horizon=horizon, discount=discount, estimate_horizon='early',
estimate_actions=state_action_value, estimate_terminal=estimate_terminal
)
# State value doesn't exist for Beta
baseline_policy = dict(network=critic_network, distributions=dict(float='gaussian'))
baseline_policy = dict(network=critic_network)
if state_action_value:
baseline_objective = dict(type='value', value='action')
else:
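
The ActorCritic constructor mirrors the A2C changes above. A sketch of spec-based creation via `Agent.create`, with the now-mandatory `batch_size` included in the spec; the spec layout and values are illustrative assumptions:

```python
from tensorforce import Agent, Environment

environment = Environment.create(environment='gym', level='CartPole-v1')

# dict-style agent spec, e.g. as loaded from a JSON config; batch_size can no longer be omitted
agent = Agent.create(
    agent=dict(agent='ac', batch_size=10, network='auto', critic_network='auto'),
    environment=environment
)
```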
