In [3]:
import torch

print(torch.cuda.is_available())
print(torch.cuda.device_count())
print(torch.cuda.current_device())
print(torch.cuda.get_device_name(torch.cuda.current_device()))

True
1
0
GeForce RTX 3080 Ti


In [4]:
from tensorforce import Agent, Environment
import numpy as np

class MDPEnv(Environment):

    def __init__(self):
        super().__init__()
    
    # State1 active , State2 inactive
    def states(self):
        return dict(type='float', shape=(1,))
    
    # Actions 0 noting,1 pp,2 major pp
    def actions(self):
        return dict(type='int', num_values=3)

    # Optional: should only be defined if environment has a natural fixed
    # maximum episode length; otherwise specify maximum number of training
    # timesteps via Environment.create(..., max_episode_timesteps=???)
    def max_episode_timesteps(self):
        return super().max_episode_timesteps()

    # Optional additional steps to close environment
    def close(self):
        super().close()
    
    # Assume initial state inavtive
    def reset(self):
        self._parallel_indices = np.arange(2)
        state = np.array([1])
        return state

    def execute(self, actions):
        if actions == 0:
            if np.random.uniform(0, 1, 1)[0] < 0.9:
                next_state = np.array([0])
            else:
                next_state = np.array([0])
        else:
            if np.random.uniform(0, 1, 1)[0] < 0.1:
                next_state = np.array([1])
            else:
                next_state = np.array([1])
        
        # next_state = np.random.random(size=(8,))
        terminal = False  # Always False if no "natural" terminal state
        reward = np.random.random()
        return next_state, terminal, reward

In [5]:
environment = Environment.create(
    environment=MDPEnv, max_episode_timesteps=10000
)


In [6]:
import numpy as np

# Instantiate a Tensorforce agent
agent = Agent.create(
    agent='tensorforce',
    environment=environment,  # alternatively: states, actions, (max_episode_timesteps)
    memory=10000,
    update=dict(unit='timesteps', batch_size=64),
    optimizer=dict(type='adam', learning_rate=3e-4),
    policy=dict(network='auto'),
    objective='policy_gradient',
    reward_estimation=dict(horizon=20)
)

agent2 = Agent.create(
    agent='tensorforce',
    environment=environment,  # alternatively: states, actions, (max_episode_timesteps)
    memory=10000,
    update=dict(unit='timesteps', batch_size=64),
    optimizer=dict(type='adam', learning_rate=3e-4),
    policy=dict(network='auto'),
    objective='policy_gradient',
    reward_estimation=dict(horizon=20)
)

# Train for 300 episodes
for _ in range(2):
    # Initialize episode
    states = environment.reset()
    terminal = False
    while not terminal:
        # Episode timestep
        actions = agent.act(states=states)
        states, terminal, reward = environment.execute(actions=actions)
        agent.observe(terminal=terminal, reward=reward)
        print(states)
        print(actions)
        
        '''
        actions = agent2.act(states=states)
        states, terminal, reward = environment.execute(actions=actions)
        agent2.observe(terminal=terminal, reward=reward)
        print(states)
        print(actions)
        '''
        
agent.close()
environment.close()



[1]
2
[1]
1
[0]
0
[0]
0
[1]
2
[1]
2
[1]
2
[0]
0
[1]
1
[1]
1
[0]
0
[0]
0
[1]
2
[1]
2
[1]
1
[1]
1
[1]
1
[1]
1
[0]
0
[1]
2
[0]
0
[0]
0
[1]
2
[1]
1
[1]
1
[1]
1
[1]
1
[0]
0
[1]
1
[0]
0
[1]
1
[1]
1
[1]
1
[1]
2
[1]
2
[0]
0
[1]
2
[0]
0
[1]
2
[0]
0
[1]
1
[0]
0
[1]
1
[1]
2
[1]
2
[1]
1
[1]
1
[1]
1
[0]
0
[1]
2
[1]
2
[1]
1
[1]
2
[1]
1
[1]
2
[1]
2
[1]
1
[0]
0
[0]
0
[0]
0
[1]
2
[1]
2
[1]
2
[1]
2
[1]
1
[0]
0
[1]
2
[0]
0
[1]
2
[1]
1
[0]
0
[1]
2
[1]
2
[0]
0
[1]
2
[1]
1
[0]
0
[1]
2
[1]
1
[1]
2
[0]
0
[1]
2
[1]
1
[1]
2
[0]
0
[1]
2
[1]
2
[1]
1
[0]
0
[1]
1
[0]
0
[0]
0
[0]
0
[0]
0
[1]
1
[1]
1
[1]
2
[1]
1
[1]
2
[0]
0
[1]
1
[0]
0
[1]
1
[1]
1
[1]
2
[1]
2
[1]
1
[1]
2
[1]
1
[1]
1
[1]
1
[0]
0
[1]
1
[0]
0
[1]
2
[0]
0
[0]
0
[0]
0
[0]
0
[1]
2
[1]
1
[0]
0
[0]
0
[1]
1
[0]
0
[1]
2
[0]
0
[1]
1
[0]
0
[1]
2
[1]
1
[1]
1
[1]
1
[1]
2
[1]
2
[1]
1
[1]
1
[1]
2
[1]
2
[0]
0
[0]
0
[1]
2
[0]
0
[0]
0
[1]
2
[1]
1
[1]
2
[0]
0
[1]
2
[1]
2
[1]
1
[0]
0
[1]
2
[1]
1
[1]
1
[0]
0
[0]
0
[0]
0
[1]
1
[1]
1
[1]
1
[1]
1
[1]
1
[0]
0
[1]
2
[0]
0
[0]


In [7]:
import numpy as np

from tensorforce import Environment, Runner


class MultiactorEnvironment(Environment):
    """
    Example multi-actor environment, illustrating best-practice implementation pattern.
    State space: position in [0, 10].
    Action space: movement in {-1, 0, 1}.
    Random start in [3, 7].
    Actor 1 perspective as is, actor 2 perspective mirrored.
    Positive reward for being closer to 10.
    """

    def __init__(self):
        super().__init__()

    def states(self):
        return dict(type='int', num_values=11)

    def actions(self):
        return dict(type='int', num_values=3)

    def num_actors(self):
        return 2  # Indicates that environment has multiple actors

    def reset(self):
        # Always for multi-actor environments: initialize parallel indices
        self._parallel_indices = np.arange(self.num_actors())

        # Single shared environment logic, plus per-actor perspective
        self._states = 3 + np.random.randint(5)
        self.second_actor = True
        states = np.stack([self._states, 10 - self._states], axis=0)

        # Always for multi-actor environments: return per-actor values
        return self._parallel_indices.copy(), states

    def execute(self, actions):
        # Single shared environment logic, plus per-actor perspective
        if self.second_actor:
            self.second_actor = self.second_actor and not (np.random.random_sample() < 0.1)
            terminal = np.stack([False, not self.second_actor], axis=0)
            delta = (actions[0] - 1) - (actions[1] - 1)
            self._states = np.clip(self._states + delta, a_min=0, a_max=10)
            states = np.stack([self._states, 10 - self._states], axis=0)
        else:
            terminal = np.stack([False], axis=0)
            delta = (actions[0] - 1)
            self._states = np.clip(self._states + delta, a_min=0, a_max=10)
            states = np.stack([self._states], axis=0)
        reward = (states - 5.0) / 5.0

        # Always for multi-actor environments: update parallel indices, and return per-actor values
        self._parallel_indices = self._parallel_indices[~terminal]
        return self._parallel_indices.copy(), states, terminal, reward


In [13]:
def main():
    # Multi-actor runner, automatically if environment.num_actors() > 1
    runner = Runner(
        agent='/json/ppo.json',
        environment=MultiactorEnvironment,
        max_episode_timesteps=10
    )
    runner.run(num_episodes=1000)

In [14]:
main()

TensorforceError: Invalid value for Agent.create argument agent: /liul-storage/ppo.json.

In [12]:
!pwd

/workspace
