In [47]:
import torch

print(torch.cuda.is_available())
print(torch.cuda.device_count())
print(torch.cuda.current_device())
print(torch.cuda.get_device_name(torch.cuda.current_device()))

True
1
0
GeForce RTX 3080 Ti


In [55]:
import numpy as np

from tensorforce import Agent, Environment, Runner


class MultiactorEnvironment(Environment):
    """
    Example multi-actor environment, illustrating best-practice implementation pattern.
    State space: position in [0, 10].
    Action space: movement in {-1, 0, 1}.
    Random start in [3, 7].
    Actor 1 perspective as is, actor 2 perspective mirrored.
    Positive reward for being closer to 10.
    """

    def __init__(self):
        super().__init__()

    def states(self):
        return dict(type='int', num_values=11)

    def actions(self):
        return dict(type='int', num_values=3)

    def num_actors(self):
        return 2  # Indicates that environment has multiple actors

    def reset(self):
        # Always for multi-actor environments: initialize parallel indices
        self._parallel_indices = np.arange(self.num_actors())

        # Single shared environment logic, plus per-actor perspective
        self._states = 3 + np.random.randint(5)
        self.second_actor = True
        states = np.stack([self._states, 10 - self._states], axis=0)

        # Always for multi-actor environments: return per-actor values
        return self._parallel_indices.copy(), states

    def execute(self, actions):
        # Single shared environment logic, plus per-actor perspective
        if self.second_actor:
            self.second_actor = self.second_actor and not (np.random.random_sample() < 0.1)
            terminal = np.stack([False, not self.second_actor], axis=0)
            delta = (actions[0] - 1) - (actions[1] - 1)
            self._states = np.clip(self._states + delta, a_min=0, a_max=10)
            states = np.stack([self._states, 10 - self._states], axis=0)
        else:
            terminal = np.stack([False], axis=0)
            delta = (actions[0] - 1)
            self._states = np.clip(self._states + delta, a_min=0, a_max=10)
            states = np.stack([self._states], axis=0)
        reward = (states - 5.0) / 5.0

        # Always for multi-actor environments: update parallel indices, and return per-actor values
        self._parallel_indices = self._parallel_indices[~terminal]
        return self._parallel_indices.copy(), states, terminal, reward


environment = Environment.create(
    environment=MultiactorEnvironment, max_episode_timesteps=10000
)


agent = Agent.create(
    agent='ppo',
    environment=environment,  # alternatively: states, actions, (max_episode_timesteps)
    network={"type": "auto", "rnn": False},
    use_beta_distribution=False,
    memory="minimum",
    batch_size=12,
    update_frequency=1,
    learning_rate=0.001813150053725916,
    multi_step=5,
    subsampling_fraction=0.9131375430837279,
    likelihood_ratio_clipping=0.09955676846552193,
    discount=0.9985351346308641,
    return_processing=None,
    advantage_processing=None,
    predict_terminal_values=False,
    reward_processing=None,
    baseline={"type": "auto", "rnn":False},
    baseline_optimizer={"optimizer": "adam", "learning_rate": 0.003670157218888348, "multi_step":10},
    l2_regularization=0.0,
    entropy_regularization=0.0011393096635237982,
    state_preprocessing="linear_normalization",
    exploration=0.0,
    variable_noise=0.0
    
)


In [53]:
runner = Runner(
    # agent='json/ppo.json',
    agent=agent,
    environment=MultiactorEnvironment,
    max_episode_timesteps=10
)

AssertionError: 

In [8]:
runner.run(num_episodes=100)

Episodes:   0%|          | 0/100 [00:00, return=0.00, ts/ep=0, sec/ep=0.00, ms/ts=0.0, agent=0.0%]

In [15]:
runner.states

[None, None]

In [58]:
dir(runner)

['__class__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__le__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 '__weakref__',
 'agent',
 'close',
 'environments',
 'evaluation',
 'handle_act',
 'handle_act_evaluation',
 'handle_act_joint',
 'handle_observe',
 'handle_observe_evaluation',
 'handle_observe_joint',
 'handle_terminal',
 'handle_terminal_evaluation',
 'is_agent_external',
 'is_environment_external',
 'is_environment_remote',
 'num_vectorized',
 'run']

In [60]:
dir(runner.agent)

['__class__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__le__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 '__weakref__',
 '_is_agent',
 '_process_states_input',
 'act',
 'actions_spec',
 'auxiliaries_spec',
 'close',
 'config',
 'create',
 'deterministic_spec',
 'episodes',
 'experience',
 'fn_act',
 'get_architecture',
 'get_specification',
 'initial_internals',
 'initialize',
 'internals_spec',
 'is_initialized',
 'load',
 'max_episode_timesteps',
 'model',
 'observe',
 'parallel_interactions',
 'parallel_spec',
 'pretrain',
 'recorder',
 'reset',
 'restore',
 'reward_buffer',
 'reward_spec',
 'save',
 'spec',
 'states_spec',
 'terminal_buffer',
 'terminal_spec',
 'timestep_completed',
 'timestep_counter',
 'timesteps',
 'tracked_tensors',

In [76]:
runner.agent.updates

0

In [86]:
runner.environments[0].states()

{'type': 'int', 'num_values': 11}