In [1]:
# import OpenAI gym
import gym

# create the environment
env = gym.make('FetchReach-v1')

# initialize the env
env.reset()

{'observation': array([ 1.34183265e+00,  7.49100387e-01,  5.34722720e-01,  1.97805133e-04,
         7.15193042e-05,  7.73933014e-06,  5.51992816e-08, -2.42927453e-06,
         4.73325650e-06, -2.28455228e-06]),
 'achieved_goal': array([1.34183265, 0.74910039, 0.53472272]),
 'desired_goal': array([1.3752391 , 0.6333266 , 0.53754559])}

### Display Fetch

In [2]:
# show a few frames of CartPole
for i in range(100):
    # display the env (optional)
#     env.render()
    # randomly chose an action from all available actions
    action = env.action_space.sample()
    # agent takes an action and interacts with the env, receiving state, reward, done and info
    state, reward, done, info = env.step(action)
    # if episode is over reset the env
    if done:
        env.reset()

### Fetch Environment Space

In [3]:
print("state space is {}\n".format(env.observation_space))
print("action space is {}\n".format(env.action_space))
print("example state is {}\n".format(env.reset()))
print("example action is {}\n".format(env.action_space.sample()))

state space is Dict(achieved_goal:Box(-inf, inf, (3,), float32), desired_goal:Box(-inf, inf, (3,), float32), observation:Box(-inf, inf, (10,), float32))

action space is Box(-1.0, 1.0, (4,), float32)

example state is {'observation': array([ 1.34183265e+00,  7.49100387e-01,  5.34722720e-01,  1.97805133e-04,
        7.15193042e-05,  7.73933014e-06,  5.51992816e-08, -2.42927453e-06,
        4.73325650e-06, -2.28455228e-06]), 'achieved_goal': array([1.34183265, 0.74910039, 0.53472272]), 'desired_goal': array([1.21307256, 0.79961713, 0.65835359])}

example action is [-0.9026402  -0.26175523  0.3022741  -0.46299675]



### The Reinforcement Learning Loop

In [4]:
episodes = 10

# run environment for 10 episodes
for ep in range(episodes):
    episode_reward = 0
    count_while = 0
    while True:
        # randomly chose an action from all available actions
        action = env.action_space.sample()
        # agent takes an action and interacts with the env, receiving state, reward, done and info
        state, reward, done, info = env.step(action)
        episode_reward += 1
        count_while += 1
        # if episode is over reset the env
        if done:
            print("Episode {} done with reward: {} in {} iterations".format(ep, episode_reward,count_while))
            env.reset()
            break

Episode 0 done with reward: 50 in 50 iterations
Episode 1 done with reward: 50 in 50 iterations
Episode 2 done with reward: 50 in 50 iterations
Episode 3 done with reward: 50 in 50 iterations
Episode 4 done with reward: 50 in 50 iterations
Episode 5 done with reward: 50 in 50 iterations
Episode 6 done with reward: 50 in 50 iterations
Episode 7 done with reward: 50 in 50 iterations
Episode 8 done with reward: 50 in 50 iterations
Episode 9 done with reward: 50 in 50 iterations


### Episodes and Timesteps

In [5]:
episodes = 10
max_timesteps = 200
# run environment for 10 episodes
for ep in range(episodes):
    timestep = 0
    while timestep < max_timesteps:
        # randomly chose an action from all available actions
        action = env.action_space.sample()
        # agent takes an action and interacts with the env, receiving state, reward, done and info
        state, reward, done, info = env.step(action)
#         print(reward)
        timestep += 1
        # if episode is over reset the env
        if done:
            print("Episode {} done after {} timesteps".format(ep, timestep))
            env.reset()
            break

Episode 0 done after 50 timesteps
Episode 1 done after 50 timesteps
Episode 2 done after 50 timesteps
Episode 3 done after 50 timesteps
Episode 4 done after 50 timesteps
Episode 5 done after 50 timesteps
Episode 6 done after 50 timesteps
Episode 7 done after 50 timesteps
Episode 8 done after 50 timesteps
Episode 9 done after 50 timesteps


### Interacting with the Environment: actions, done and env.step()

In [6]:
episodes = 1

max_timesteps = 10

for ep in range(episodes):
    timestep = 0
    while timestep < max_timesteps:
        # randomly chose an action from all available actions
        action = env.action_space.sample()
        # agent takes an actiona nd interacts with the env, receiving state, reward, done and info
        state, reward, done, info = env.step(action)
        timestep += 1
        print("Timestep {}: agent took action {}\n".format(timestep, action))
        print("Timestep {}: state {}, reward {}, done {}, info {}\n\n-----".format(timestep, state, reward, done, info))
        # if episode is over reset the env
        if done:
            env.reset()
            break

Timestep 1: agent took action [ 0.7560593  -0.6924245  -0.24547774 -0.39658237]

Timestep 1: state {'observation': array([ 1.36410827e+00,  7.27643151e-01,  5.26939023e-01,  0.00000000e+00,
        0.00000000e+00,  1.95180012e-02, -1.82222498e-02, -6.66460540e-03,
        4.39849041e-04,  3.49064332e-04]), 'achieved_goal': array([1.36410827, 0.72764315, 0.52693902]), 'desired_goal': array([1.19615138, 0.73346856, 0.62849142])}, reward -1.0, done False, info {'is_success': 0.0}

-----
Timestep 2: agent took action [-0.17267902  0.40358633  0.47622138  0.9253453 ]

Timestep 2: state {'observation': array([ 1.36114808e+00,  7.38084262e-01,  5.40679097e-01,  0.00000000e+00,
        0.00000000e+00, -7.01393735e-03,  1.24937954e-02,  1.31277325e-02,
        1.42609132e-04,  1.06358565e-03]), 'achieved_goal': array([1.36114808, 0.73808426, 0.5406791 ]), 'desired_goal': array([1.19615138, 0.73346856, 0.62849142])}, reward -1.0, done False, info {'is_success': 0.0}

-----
Timestep 3: agent took

In [8]:
from gym.wrappers import FilterObservation, FlattenObservation
env = gym.make('FetchReach-v1')
# create single observation space array
env = FlattenObservation(FilterObservation(env, ['desired_goal','observation']))
env.reset()


array([ 1.4501722e+00,  6.2254441e-01,  4.4336995e-01,  1.3418326e+00,
        7.4910039e-01,  5.3472275e-01,  1.9780513e-04,  7.1519302e-05,
        7.7393297e-06,  5.5199283e-08, -2.4292744e-06,  4.7332564e-06,
       -2.2845522e-06], dtype=float32)