In [107]:
import torch

print(torch.cuda.is_available())
print(torch.cuda.device_count())
print(torch.cuda.current_device())
print(torch.cuda.get_device_name(torch.cuda.current_device()))

True
1
0
GeForce RTX 3080 Ti


In [108]:
!pwd

/workspace/liul-storage/gpu-multiagent


In [109]:
!pip install ipywidgets

Looking in indexes: https://pypi.org/simple, https://pypi.ngc.nvidia.com


In [116]:
import numpy as np

from tensorforce import Agent, Environment, Runner


class MultiactorEnvironment(Environment):
    """
    Example multi-actor environment, illustrating best-practice implementation pattern.
    State space: position in [0, 10].
    Action space: movement in {-1, 0, 1}.
    Random start in [3, 7].
    Actor 1 perspective as is, actor 2 perspective mirrored.
    Positive reward for being closer to 10.
    """

    def __init__(self):
        super().__init__()

    def states(self):
        return dict(type='int', num_values=11)

    def actions(self):
        return dict(type='int', num_values=3)

    def num_actors(self):
        return 2  # Indicates that environment has multiple actors

    def reset(self):
        # Always for multi-actor environments: initialize parallel indices
        self._parallel_indices = np.arange(self.num_actors())

        # Single shared environment logic, plus per-actor perspective
        self._states = 3 + np.random.randint(5)
        self.second_actor = True
        states = np.stack([self._states, 10 - self._states], axis=0)

        # Always for multi-actor environments: return per-actor values
        return self._parallel_indices.copy(), states

    def execute(self, actions):
        # Single shared environment logic, plus per-actor perspective
        if self.second_actor:
            self.second_actor = self.second_actor and not (np.random.random_sample() < 0.1)
            terminal = np.stack([False, not self.second_actor], axis=0)
            delta = (actions[0] - 1) - (actions[1] - 1)
            self._states = np.clip(self._states + delta, a_min=0, a_max=10)
            states = np.stack([self._states, 10 - self._states], axis=0)
        else:
            terminal = np.stack([False], axis=0)
            delta = (actions[0] - 1)
            self._states = np.clip(self._states + delta, a_min=0, a_max=10)
            states = np.stack([self._states], axis=0)
        reward = (states - 5.0) / 5.0

        # Always for multi-actor environments: update parallel indices, and return per-actor values
        self._parallel_indices = self._parallel_indices[~terminal]
        return self._parallel_indices.copy(), states, terminal, reward


In [122]:
# def main():
# Multi-actor runner, automatically if environment.num_actors() > 1

environment = Environment.create(
    environment=MultiactorEnvironment, max_episode_timesteps=10000
)

'''
agent = Agent.create(
    agent='tensorforce',
    environment=environment,  # alternatively: states, actions, (max_episode_timesteps)
    memory=10000,
    update=dict(unit='timesteps', batch_size=64),
    optimizer=dict(type='adam', learning_rate=3e-4),
    policy=dict(network='auto'),
    objective='policy_gradient',
    reward_estimation=dict(horizon=20)
)
'''
agent = Agent.create(
    agent='ppo',
    environment=environment,  # alternatively: states, actions, (max_episode_timesteps)
    # memory=10000,
    # update=dict(unit='timesteps', batch_size=64),
    # optimizer=dict(type='adam', learning_rate=3e-4),
    # policy=dict(network='auto'),
    # objective='policy_gradient',
    # reward_estimation=dict(horizon=20),
    network={"type": "auto", "rnn": False},
    use_beta_distribution=False,
    memory="minimum",
    batch_size=12,
    update_frequency=1,
    learning_rate=0.001813150053725916,
    multi_step=5,
    subsampling_fraction=0.9131375430837279,
    likelihood_ratio_clipping=0.09955676846552193,
    discount=0.9985351346308641,
    #return_processing=null,
    #advantage_processing=null,
    predict_terminal_values=False,
    #reward_processing=null,
    baseline={"type": "auto", "rnn":False},
    baseline_optimizer={"optimizer": "adam", "learning_rate": 0.003670157218888348, "multi_step":10},
    l2_regularization=0.0,
    entropy_regularization=0.0011393096635237982,
    state_preprocessing="linear_normalization",
    exploration=0.0,
    variable_noise=0.0
)

'''
runner = Runner(
    # agent='json/ppo.json',
    agent=agent,
    environment=MultiactorEnvironment,
    # max_episode_timesteps=10
)
'''

"\nrunner = Runner(\n    # agent='json/ppo.json',\n    agent=agent,\n    environment=MultiactorEnvironment,\n    # max_episode_timesteps=10\n)\n"

In [123]:
agent.parallel_spec

TensorSpec(type=int, shape=(), num_values=1)

In [133]:
states = environment.reset()
states[1][1:]

array([4])

In [135]:
states = environment.reset()
actions = agent.act(states=states[1], parallel=0)

IndexError: tuple index out of range

In [None]:
for _ in range(2):
    # Initialize episode
    states = environment.reset()
    terminal = False
    while not terminal:
        # Episode timestep
        actions = agent.act(states=states)
        states, terminal, reward = environment.execute(actions=actions)
        agent.observe(terminal=terminal, reward=reward)
        print(states)
        print(actions)

AssertionError: 

IndexError: tuple index out of range

In [None]:
runner.run(num_episodes=1000)

Episodes:   0%|          | 0/1000 [00:00, return=0.00, ts/ep=0, sec/ep=0.00, ms/ts=0.0, agent=0.0%]



In [None]:
dir(runner)

['__class__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__le__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 '__weakref__',
 'actions',
 'agent',
 'batch_agent_calls',
 'callback',
 'callback_episode_frequency',
 'callback_timestep_frequency',
 'close',
 'environments',
 'episode_agent_second',
 'episode_agent_seconds',
 'episode_return',
 'episode_returns',
 'episode_seconds',
 'episode_start',
 'episode_timestep',
 'episode_timesteps',
 'episodes',
 'evaluation',
 'evaluation_agent_second',
 'evaluation_agent_seconds',
 'evaluation_callback',
 'evaluation_returns',
 'evaluation_run',
 'evaluation_seconds',
 'evaluation_start',
 'evaluation_timesteps',
 'handle_act',
 'handle_act_evaluation',
 'handle_act_joint',
 'handle_observe',
 'handle_ob

In [None]:
runner.evaluation_agent_seconds

[2.001223921775818,
 2.0195528268814087,
 0.008112788200378418,
 0.01914513111114502,
 0.013635396957397461,
 0.013635396957397461,
 0.008235573768615723,
 0.01928102970123291,
 0.010624885559082031,
 0.01901841163635254,
 0.00402224063873291,
 0.023464560508728027,
 0.0068738460540771484,
 0.020610332489013672,
 0.013749241828918457,
 0.013749241828918457,
 0.013769984245300293,
 0.013769984245300293,
 0.013679742813110352,
 0.013679742813110352,
 0.005470156669616699,
 0.02170121669769287,
 0.006799459457397461,
 0.020642757415771484,
 0.013522624969482422,
 0.013522624969482422,
 0.007580280303955078,
 0.019850730895996094,
 0.004013538360595703,
 0.022961854934692383,
 0.0040149688720703125,
 0.023369312286376953,
 0.013428568840026855,
 0.013428568840026855,
 0.013542532920837402,
 0.013542532920837402,
 0.013486146926879883,
 0.013486146926879883,
 0.014803886413574219,
 0.014803886413574219,
 0.013579368591308594,
 0.013579368591308594,
 0.013577461242675781,
 0.0135774612426757

In [None]:
runner.states

[None, None]

In [None]:
np.stack([2, 10 - 2], axis=0)

array([2, 8])

In [None]:
env = MultiactorEnvironment()

In [None]:
env.states()

{'type': 'int', 'num_values': 11}

In [None]:
env.reset()

(array([0, 1]), array([5, 5]))