In [2]:
import torch

print(torch.cuda.is_available())
print(torch.cuda.device_count())
print(torch.cuda.current_device())
print(torch.cuda.get_device_name(torch.cuda.current_device()))

True
1
0
GeForce RTX 3080 Ti


In [3]:
from tensorforce import Agent, Environment
import numpy as np

class MDPEnv(Environment):

    def __init__(self):
        super().__init__()
    
    # State1 active , State2 inactive
    def states(self):
        return dict(type='float', shape=(1,))
    
    # Actions 0 noting,1 pp,2 major pp
    def actions(self):
        return dict(type='int', num_values=3)

    # Optional: should only be defined if environment has a natural fixed
    # maximum episode length; otherwise specify maximum number of training
    # timesteps via Environment.create(..., max_episode_timesteps=???)
    def max_episode_timesteps(self):
        return super().max_episode_timesteps()

    # Optional additional steps to close environment
    def close(self):
        super().close()
    
    # Assume initial state inavtive
    def reset(self):
        state = np.array([1])
        return state

    def execute(self, actions):
        if actions == 0:
            if np.random.uniform(0, 1, 1)[0] < 0.9:
                next_state = np.array([0])
            else:
                next_state = np.array([0])
        else:
            if np.random.uniform(0, 1, 1)[0] < 0.1:
                next_state = np.array([1])
            else:
                next_state = np.array([1])
        
        # next_state = np.random.random(size=(8,))
        terminal = False  # Always False if no "natural" terminal state
        reward = np.random.random()
        return next_state, terminal, reward

In [4]:
environment = Environment.create(
    environment=MDPEnv, max_episode_timesteps=10000
)


In [5]:
import numpy as np

# Instantiate a Tensorforce agent
agent = Agent.create(
    agent='tensorforce',
    environment=environment,  # alternatively: states, actions, (max_episode_timesteps)
    memory=10000,
    update=dict(unit='timesteps', batch_size=64),
    optimizer=dict(type='adam', learning_rate=3e-4),
    policy=dict(network='auto'),
    objective='policy_gradient',
    reward_estimation=dict(horizon=20)
)

agent2 = Agent.create(
    agent='tensorforce',
    environment=environment,  # alternatively: states, actions, (max_episode_timesteps)
    memory=10000,
    update=dict(unit='timesteps', batch_size=64),
    optimizer=dict(type='adam', learning_rate=3e-4),
    policy=dict(network='auto'),
    objective='policy_gradient',
    reward_estimation=dict(horizon=20)
)

# Train for 300 episodes
for _ in range(2):
    # Initialize episode
    states = environment.reset()
    terminal = False
    while not terminal:
        # Episode timestep
        actions = agent.act(states=states)
        states, terminal, reward = environment.execute(actions=actions)
        agent.observe(terminal=terminal, reward=reward)
        print(states)
        print(actions)
        
        '''
        actions = agent2.act(states=states)
        states, terminal, reward = environment.execute(actions=actions)
        agent2.observe(terminal=terminal, reward=reward)
        print(states)
        print(actions)
        '''
        
agent.close()
environment.close()



[1]
1
[0]
0
[1]
1
[1]
2
[1]
2
[1]
1
[0]
0
[0]
0
[0]
0
[1]
1
[1]
2
[1]
1
[1]
2
[1]
1
[1]
1
[1]
1
[0]
0
[1]
1
[1]
1
[1]
1
[1]
2
[1]
2
[1]
2
[0]
0
[1]
2
[0]
0
[1]
2
[1]
2
[0]
0
[0]
0
[0]
0
[1]
2
[0]
0
[1]
1
[1]
2
[1]
2
[1]
2
[1]
2
[0]
0
[0]
0
[1]
2
[1]
2
[1]
2
[0]
0
[1]
2
[1]
2
[0]
0
[0]
0
[1]
1
[1]
1
[1]
2
[1]
2
[0]
0
[0]
0
[1]
1
[1]
1
[1]
1
[0]
0
[1]
1
[1]
2
[0]
0
[1]
1
[1]
1
[0]
0
[1]
1
[1]
1
[0]
0
[0]
0
[1]
1
[0]
0
[0]
0
[1]
2
[0]
0
[1]
1
[1]
1
[1]
2
[1]
1
[0]
0
[0]
0
[1]
1
[1]
1
[1]
1
[1]
1
[0]
0
[1]
1
[1]
2
[1]
1
[0]
0
[0]
0
[1]
1
[1]
1
[0]
0
[1]
2
[1]
2
[1]
1
[1]
1
[1]
2
[1]
2
[1]
1
[0]
0
[1]
1
[1]
2
[1]
2
[1]
2
[0]
0
[0]
0
[0]
0
[1]
2
[1]
1
[1]
2
[1]
2
[0]
0
[1]
2
[1]
2
[1]
2
[0]
0
[1]
1
[1]
2
[1]
2
[0]
0
[1]
1
[1]
2
[1]
2
[1]
1
[0]
0
[0]
0
[1]
1
[1]
1
[1]
2
[1]
2
[1]
2
[1]
1
[0]
0
[1]
2
[1]
1
[0]
0
[1]
1
[1]
1
[1]
1
[0]
0
[1]
2
[1]
1
[1]
1
[0]
0
[1]
1
[1]
2
[1]
1
[1]
2
[1]
1
[1]
2
[1]
2
[0]
0
[1]
2
[1]
1
[1]
1
[1]
2
[0]
0
[0]
0
[1]
2
[0]
0
[0]
0
[1]
2
[1]
1
[0]
0
[1]
1
[0]
0
[1]
