Commit

support for custom summaries, parameter min/max-bounds, agent argument batch_size mandatory, removed estimator argument capacity, major internal changes
AlexKuhnle committed Mar 8, 2020
1 parent 1470ea8 commit b36f4c3
Showing 74 changed files with 1,789 additions and 1,429 deletions.
9 changes: 7 additions & 2 deletions UPDATE_NOTES.md
@@ -6,8 +6,13 @@ This file records all major updates and new features, starting from version 0.5.

### Latest

- Changed independent mode of `agent.act` to use final values of parameters and avoid TensorFlow conditions
- Extended `"tensorflow"` format of `agent.save` to include an optimized Protobuf model with an act-only graph as `.pb` file
- Changed independent mode of `agent.act` to use final values of dynamic hyperparameters and avoid TensorFlow conditions
- Extended `"tensorflow"` format of `agent.save` to include an optimized Protobuf model with an act-only graph as `.pb` file, and `Agent.load` format `"pb-actonly"` to load act-only agent based on Protobuf model
- Support for custom summaries via new `summarizer` argument value `custom` to specify summary type, and `Agent.summarize(...)` to record summary values
- Added min/max-bounds for dynamic hyperparameters to assert valid range and infer other arguments
- Argument `batch_size` now mandatory for all agent classes
- Removed `Estimator` argument `capacity`, now always automatically inferred
- Internal changes related to agent arguments `memory`, `update` and `reward_estimation`
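
The entries above change the agent-facing API: summaries can be declared under a new `custom` key of the `summarizer` argument and recorded with `Agent.summarize(...)`, and `batch_size` must now be passed explicitly. A rough sketch of how this might look; the exact `Agent.summarize` signature and the summary-spec fields are assumptions, not shown in full in this diff:

```python
from tensorforce import Agent, Environment

environment = Environment.create(environment='gym', level='CartPole-v1')

agent = Agent.create(
    agent='a2c', environment=environment,
    batch_size=10,  # batch_size is now mandatory for all agent classes
    # hypothetical custom summary declared under the new `custom` summarizer key
    summarizer=dict(
        directory='summaries',
        custom=dict(my_loss=dict(type='scalar'))
    )
)

# record a value for the declared custom summary (assumed signature: name, value)
agent.summarize('my_loss', 0.42)
```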



5 changes: 5 additions & 0 deletions docs/agents/agent.rst
@@ -31,6 +31,11 @@ Get and assign variables
.. automethod:: tensorforce.agents.TensorforceAgent.get_variable
.. automethod:: tensorforce.agents.TensorforceAgent.assign_variable

Custom summaries
----------------

.. automethod:: tensorforce.agents.TensorforceAgent.summarize

Advanced functions for specialized use cases
--------------------------------------------

15 changes: 6 additions & 9 deletions run.py
@@ -21,10 +21,7 @@
import matplotlib
import numpy as np

from tensorforce.agents import Agent
from tensorforce.core.utils.json_encoder import NumpyJSONEncoder
from tensorforce.environments import Environment
from tensorforce.execution import Runner
from tensorforce import Agent, Environment, Runner

matplotlib.use('Agg')
import matplotlib.pyplot as plt
@@ -141,10 +138,10 @@ def main():
agent_seconds = [list() for _ in range(args.episodes)]

def callback(r, p):
rewards[r.episodes - 1].append(r.episode_rewards[-1])
timesteps[r.episodes - 1].append(r.episode_timesteps[-1])
seconds[r.episodes - 1].append(r.episode_seconds[-1])
agent_seconds[r.episodes - 1].append(r.episode_agent_seconds[-1])
rewards[r.episodes - 1].append(float(r.episode_rewards[-1]))
timesteps[r.episodes - 1].append(int(r.episode_timesteps[-1]))
seconds[r.episodes - 1].append(float(r.episode_seconds[-1]))
agent_seconds[r.episodes - 1].append(float(r.episode_agent_seconds[-1]))
return True

if args.environment is None:
@@ -203,7 +200,7 @@ def callback(r, p):
json.dumps(dict(
rewards=rewards, timesteps=timesteps, seconds=seconds,
agent_seconds=agent_seconds
), cls=NumpyJSONEncoder)
))
)

if args.seaborn:
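
The callback above now casts episode statistics to plain Python `float`/`int`, which lets the results be serialized with a plain `json.dumps` instead of the removed `NumpyJSONEncoder`; NumPy scalar types such as `np.float32` or `np.int64` are rejected by the standard encoder. A standalone illustration, independent of Tensorforce:

```python
import json

import numpy as np

value = np.float32(1.5)

# json.dumps(dict(reward=value))  # TypeError: Object of type float32 is not JSON serializable
print(json.dumps(dict(reward=float(value))))  # after casting: {"reward": 1.5}
```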
2 changes: 1 addition & 1 deletion setup.py
@@ -105,7 +105,7 @@
extras_require=dict(
tf=['tensorflow==2.0.1'],
tf_gpu=['tensorflow-gpu==2.0.1'],
tfa=['tensorflow-addons'],
tfa=['tensorflow-addons==0.6.0'],
docs=['m2r', 'recommonmark', 'sphinx', 'sphinx-rtd-theme'],
tune=['hpbandster'],
envs=['gym[all]', 'gym-retro', 'mazeexp', 'vizdoom'],
56 changes: 33 additions & 23 deletions tensorforce/agents/a2c.py
@@ -57,13 +57,18 @@ class AdvantageActorCritic(TensorforceAgent):
(<span style="color:#00C000"><b>default</b></span>: not given, better implicitly
specified via `environment` argument for `Agent.create(...)`).
batch_size (parameter, long > 0): Number of timesteps per update batch
(<span style="color:#C00000"><b>required</b></span>).
network ("auto" | specification): Policy network configuration, see
[networks](../modules/networks.html)
(<span style="color:#00C000"><b>default</b></span>: "auto", automatically configured
network).
batch_size (parameter, long > 0): Number of episodes per update batch
(<span style="color:#00C000"><b>default</b></span>: 10 episodes).
memory (int > 0): Batch memory capacity, has to fit at least maximum batch_size + maximum
network/estimator horizon + 1 timesteps
(<span style="color:#00C000"><b>default</b></span>: minimum capacity, usually does not
need to be changed).
update_frequency ("never" | parameter, long > 0): Frequency of updates
(<span style="color:#00C000"><b>default</b></span>: batch_size).
learning_rate (parameter, float > 0.0): Optimizer learning rate
@@ -88,9 +93,6 @@ class AdvantageActorCritic(TensorforceAgent):
the critic loss
(<span style="color:#00C000"><b>default</b></span>: 1.0).
memory (int > 0): Memory capacity, has to fit at least around batch_size + one episode
(<span style="color:#00C000"><b>default</b></span>: minimum required size).
preprocessing (dict[specification]): Preprocessing as layer or list of layers, see
[preprocessing](../modules/preprocessing.html), specified per state-type or -name and
for reward
@@ -155,6 +157,11 @@ class AdvantageActorCritic(TensorforceAgent):
summary writer (<span style="color:#00C000"><b>default</b></span>: 10).</li>
<li><b>max-summaries</b> (<i>int > 0</i>) &ndash; maximum number of summaries to keep
(<span style="color:#00C000"><b>default</b></span>: 5).</li>
<li><b>custom</b> (<i>dict[spec]</i>) &ndash; custom summaries which are recorded via
`agent.summarize(...)`, specification with either type "scalar", type "histogram" with
optional "buckets", type "image" with optional "max_outputs"
(<span style="color:#00C000"><b>default</b></span>: 3), or type "audio"
(<span style="color:#00C000"><b>default</b></span>: no custom summaries).</li>
<li><b>labels</b> (<i>"all" | iter[string]</i>) &ndash; all excluding "*-histogram"
labels, or list of summaries to record, from the following labels
(<span style="color:#00C000"><b>default</b></span>: only "graph"):</li>
@@ -179,8 +186,9 @@ class AdvantageActorCritic(TensorforceAgent):
<li>"variables": variable mean and variance scalars</li>
<li>"variables-histogram": variable histograms</li>
</ul>
recorder (specification): Experience traces recorder configuration with the following
attributes (<span style="color:#00C000"><b>default</b></span>: no recorder):
recorder (specification): Experience traces recorder configuration, currently not including
internal states, with the following attributes
(<span style="color:#00C000"><b>default</b></span>: no recorder):
<ul>
<li><b>directory</b> (<i>path</i>) &ndash; recorder directory
(<span style="color:#C00000"><b>required</b></span>).</li>
@@ -193,18 +201,20 @@
"""

def __init__(
# Required
self, states, actions, batch_size,
# Environment
self, states, actions, max_episode_timesteps,
max_episode_timesteps=None,
# Network
network='auto',
# Memory
memory=None,
# Optimization
batch_size=10, update_frequency=None, learning_rate=3e-4,
update_frequency=None, learning_rate=3e-4,
# Reward estimation
horizon=0, discount=0.99, state_action_value=False, estimate_terminal=False,
# Critic
critic_network='auto', critic_optimizer=1.0,
# Memory
memory=None,
# Preprocessing
preprocessing=None,
# Exploration
@@ -217,38 +227,38 @@ def __init__(
):
self.spec = OrderedDict(
agent='a2c',
states=states, actions=actions, max_episode_timesteps=max_episode_timesteps,
states=states, actions=actions, batch_size=batch_size,
max_episode_timesteps=max_episode_timesteps,
network=network,
batch_size=batch_size, update_frequency=update_frequency, learning_rate=learning_rate,
memory=memory,
update_frequency=update_frequency, learning_rate=learning_rate,
horizon=horizon, discount=discount, state_action_value=state_action_value,
estimate_terminal=estimate_terminal,
estimate_terminal=estimate_terminal,
critic_network=critic_network, critic_optimizer=critic_optimizer,
preprocessing=preprocessing,
exploration=exploration, variable_noise=variable_noise,
l2_regularization=l2_regularization, entropy_regularization=entropy_regularization,
name=name, device=device, parallel_interactions=parallel_interactions, seed=seed,
execution=execution, saver=saver, summarizer=summarizer, recorder=recorder,
config=config
execution=execution, saver=saver, summarizer=summarizer, recorder=recorder,
config=config
)

policy = dict(network=network, temperature=1.0)
if memory is None:
memory = dict(type='recent', capacity=(batch_size + max_episode_timesteps + horizon))
memory = dict(type='recent')
else:
memory = dict(type='recent', capacity=memory)
if update_frequency is None:
update = dict(unit='timesteps', batch_size=batch_size)
else:
update = dict(unit='timesteps', batch_size=batch_size, frequency=update_frequency)
update = dict(unit='timesteps', batch_size=batch_size)
if update_frequency is not None:
update['frequency'] = update_frequency
optimizer = dict(type='adam', learning_rate=learning_rate)
objective = 'policy_gradient'
reward_estimation = dict(
horizon=horizon, discount=discount, estimate_horizon='early',
estimate_actions=state_action_value, estimate_terminal=estimate_terminal,
estimate_advantage=True
)
# State value doesn't exist for Beta
baseline_policy = dict(network=critic_network, distributions=dict(float='gaussian'))
baseline_policy = dict(network=critic_network)
if state_action_value:
baseline_objective = dict(type='value', value='action')
else:
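
For orientation, a sketch of constructing the reworked A2C agent directly; the environment, the hyperparameter values, and the commented decaying learning-rate spec (including the bound-argument names) are assumptions for illustration only:

```python
from tensorforce import Environment
from tensorforce.agents import AdvantageActorCritic

environment = Environment.create(environment='gym', level='CartPole-v1')

# batch_size is now required; memory capacity is inferred automatically when left as None
agent = AdvantageActorCritic(
    states=environment.states(), actions=environment.actions(),
    max_episode_timesteps=environment.max_episode_timesteps(),
    batch_size=10, update_frequency=2, learning_rate=3e-4
)

# hypothetical decaying learning rate using the new min/max-bounds (argument names assumed):
# learning_rate=dict(type='decaying', decay='exponential', unit='timesteps',
#                    num_steps=10000, initial_value=3e-4, decay_rate=0.5, min_value=1e-5)
```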
49 changes: 30 additions & 19 deletions tensorforce/agents/ac.py
@@ -57,13 +57,18 @@ class ActorCritic(TensorforceAgent):
(<span style="color:#00C000"><b>default</b></span>: not given, better implicitly
specified via `environment` argument for `Agent.create(...)`).
batch_size (parameter, long > 0): Number of timesteps per update batch
(<span style="color:#C00000"><b>required</b></span>).
network ("auto" | specification): Policy network configuration, see
[networks](../modules/networks.html)
(<span style="color:#00C000"><b>default</b></span>: "auto", automatically configured
network).
batch_size (parameter, long > 0): Number of episodes per update batch
(<span style="color:#00C000"><b>default</b></span>: 10 episodes).
memory (int > 0): Batch memory capacity, has to fit at least maximum batch_size + maximum
network/estimator horizon + 1 timesteps
(<span style="color:#00C000"><b>default</b></span>: minimum capacity, usually does not
need to be changed).
update_frequency ("never" | parameter, long > 0): Frequency of updates
(<span style="color:#00C000"><b>default</b></span>: batch_size).
learning_rate (parameter, float > 0.0): Optimizer learning rate
@@ -88,9 +93,6 @@ class ActorCritic(TensorforceAgent):
the critic loss
(<span style="color:#00C000"><b>default</b></span>: 1.0).
memory (int > 0): Memory capacity, has to fit at least around batch_size + one episode
(<span style="color:#00C000"><b>default</b></span>: minimum required size).
preprocessing (dict[specification]): Preprocessing as layer or list of layers, see
[preprocessing](../modules/preprocessing.html), specified per state-type or -name and
for reward
@@ -155,6 +157,11 @@ class ActorCritic(TensorforceAgent):
summary writer (<span style="color:#00C000"><b>default</b></span>: 10).</li>
<li><b>max-summaries</b> (<i>int > 0</i>) &ndash; maximum number of summaries to keep
(<span style="color:#00C000"><b>default</b></span>: 5).</li>
<li><b>custom</b> (<i>dict[spec]</i>) &ndash; custom summaries which are recorded via
`agent.summarize(...)`, specification with either type "scalar", type "histogram" with
optional "buckets", type "image" with optional "max_outputs"
(<span style="color:#00C000"><b>default</b></span>: 3), or type "audio"
(<span style="color:#00C000"><b>default</b></span>: no custom summaries).</li>
<li><b>labels</b> (<i>"all" | iter[string]</i>) &ndash; all excluding "*-histogram"
labels, or list of summaries to record, from the following labels
(<span style="color:#00C000"><b>default</b></span>: only "graph"):</li>
@@ -179,8 +186,9 @@ class ActorCritic(TensorforceAgent):
<li>"variables": variable mean and variance scalars</li>
<li>"variables-histogram": variable histograms</li>
</ul>
recorder (specification): Experience traces recorder configuration with the following
attributes (<span style="color:#00C000"><b>default</b></span>: no recorder):
recorder (specification): Experience traces recorder configuration, currently not including
internal states, with the following attributes
(<span style="color:#00C000"><b>default</b></span>: no recorder):
<ul>
<li><b>directory</b> (<i>path</i>) &ndash; recorder directory
(<span style="color:#C00000"><b>required</b></span>).</li>
@@ -193,18 +201,20 @@
"""

def __init__(
# Required
self, states, actions, batch_size,
# Environment
self, states, actions, max_episode_timesteps,
max_episode_timesteps=None,
# Network
network='auto',
# Memory
memory=None,
# Optimization
batch_size=10, update_frequency=None, learning_rate=3e-4,
update_frequency=None, learning_rate=3e-4,
# Reward estimation
horizon=0, discount=0.99, state_action_value=False, estimate_terminal=False,
# Critic
critic_network='auto', critic_optimizer=1.0,
# Memory
memory=None,
# Preprocessing
preprocessing=None,
# Exploration
@@ -217,23 +227,25 @@
):
self.spec = OrderedDict(
agent='ac',
states=states, actions=actions, max_episode_timesteps=max_episode_timesteps,
states=states, actions=actions, batch_size=batch_size,
max_episode_timesteps=max_episode_timesteps,
network=network,
batch_size=batch_size, update_frequency=update_frequency, learning_rate=learning_rate,
memory=memory,
update_frequency=update_frequency, learning_rate=learning_rate,
horizon=horizon, discount=discount, state_action_value=state_action_value,
estimate_terminal=estimate_terminal,
estimate_terminal=estimate_terminal,
critic_network=critic_network, critic_optimizer=critic_optimizer,
preprocessing=preprocessing,
exploration=exploration, variable_noise=variable_noise,
l2_regularization=l2_regularization, entropy_regularization=entropy_regularization,
name=name, device=device, parallel_interactions=parallel_interactions, seed=seed,
execution=execution, saver=saver, summarizer=summarizer, recorder=recorder,
config=config
execution=execution, saver=saver, summarizer=summarizer, recorder=recorder,
config=config
)

policy = dict(network=network, temperature=1.0)
if memory is None:
memory = dict(type='recent', capacity=(batch_size + max_episode_timesteps + horizon))
memory = dict(type='recent')
else:
memory = dict(type='recent', capacity=memory)
if update_frequency is None:
@@ -246,8 +258,7 @@ def __init__(
horizon=horizon, discount=discount, estimate_horizon='early',
estimate_actions=state_action_value, estimate_terminal=estimate_terminal
)
# State value doesn't exist for Beta
baseline_policy = dict(network=critic_network, distributions=dict(float='gaussian'))
baseline_policy = dict(network=critic_network)
if state_action_value:
baseline_objective = dict(type='value', value='action')
else:
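
The ActorCritic constructor mirrors the A2C changes above. A sketch of spec-based creation via `Agent.create`, with the now-mandatory `batch_size` included in the spec; the spec layout and values are illustrative assumptions:

```python
from tensorforce import Agent, Environment

environment = Environment.create(environment='gym', level='CartPole-v1')

# dict-style agent spec, e.g. as loaded from a JSON config; batch_size can no longer be omitted
agent = Agent.create(
    agent=dict(agent='ac', batch_size=10, network='auto', critic_network='auto'),
    environment=environment
)
```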
