In [1]:
import gymnasium as gym

In [2]:
# Initialise the environment; render_mode="human" renders the simulation on screen
env = gym.make("LunarLander-v3", render_mode="human")

try:
    # Seed the action sampler as well: env.reset(seed=...) only seeds the
    # environment itself, so without this the random actions differ per run.
    env.action_space.seed(42)

    # Reset the environment to generate the first observation
    observation, info = env.reset(seed=42)
    for _ in range(1000):
        # this is where you would insert your policy (here: a random action)
        action = env.action_space.sample()

        # step (transition) through the environment with the action,
        # receiving the next observation, the reward, and whether the episode
        # has terminated or been truncated
        observation, reward, terminated, truncated, info = env.step(action)

        # If the episode has ended then we can reset to start a new episode
        if terminated or truncated:
            observation, info = env.reset()
finally:
    # Always release the environment, even when the cell is interrupted or a
    # step raises; otherwise the next gym.make() rebinds `env` without closing it.
    env.close()

In [3]:
# Inspect the action space: the recorded output shows four discrete actions.
env.action_space

Discrete(4)

In [4]:
# Inspect the observation space: a Box with 8 float32 components and per-component bounds.
env.observation_space

Box([ -2.5        -2.5       -10.        -10.         -6.2831855 -10.
  -0.         -0.       ], [ 2.5        2.5       10.        10.         6.2831855 10.
  1.         1.       ], (8,), float32)

In [5]:
gym.spaces.Box?

[0;31mInit signature:[0m
[0mgym[0m[0;34m.[0m[0mspaces[0m[0;34m.[0m[0mBox[0m[0;34m([0m[0;34m[0m
[0;34m[0m    [0mlow[0m[0;34m:[0m [0;34m'SupportsFloat | NDArray[Any]'[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mhigh[0m[0;34m:[0m [0;34m'SupportsFloat | NDArray[Any]'[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mshape[0m[0;34m:[0m [0;34m'Sequence[int] | None'[0m [0;34m=[0m [0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mdtype[0m[0;34m:[0m [0;34m'type[np.floating[Any]] | type[np.integer[Any]]'[0m [0;34m=[0m [0;34m<[0m[0;32mclass[0m [0;34m'numpy.float32'[0m[0;34m>[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mseed[0m[0;34m:[0m [0;34m'int | np.random.Generator | None'[0m [0;34m=[0m [0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0;31mDocstring:[0m     
A (possibly unbounded) box in :math:`\mathbb{R}^n`.

Specifically, a Box represents the Cartesian product of n closed intervals.
Each

In [6]:
# Per-component lower bounds of the observation space, shown as a plain list.
list(env.observation_space.low)

[np.float32(-2.5),
 np.float32(-2.5),
 np.float32(-10.0),
 np.float32(-10.0),
 np.float32(-6.2831855),
 np.float32(-10.0),
 np.float32(-0.0),
 np.float32(-0.0)]

In [7]:
# Per-component upper bounds of the observation space, shown as a plain list.
list(env.observation_space.high)

[np.float32(2.5),
 np.float32(2.5),
 np.float32(10.0),
 np.float32(10.0),
 np.float32(6.2831855),
 np.float32(10.0),
 np.float32(1.0),
 np.float32(1.0)]

In [8]:
# Draw one random sample from the observation Box to see what an observation vector looks like.
env.observation_space.sample()

array([-1.9686588 ,  0.58947736,  6.9609694 ,  9.632183  ,  1.3206633 ,
        0.08854495,  0.689009  ,  0.01023463], dtype=float32)

In [9]:
?env

[0;31mType:[0m           TimeLimit
[0;31mString form:[0m    <TimeLimit<OrderEnforcing<PassiveEnvChecker<LunarLander<LunarLander-v3>>>>>
[0;31mFile:[0m           ~/projects/RL-Introduction/venv/lib/python3.10/site-packages/gymnasium/wrappers/common.py
[0;31mDocstring:[0m     
Limits the number of steps for an environment through truncating the environment if a maximum number of timesteps is exceeded.

If a truncation is not defined inside the environment itself, this is the only place that the truncation signal is issued.
Critically, this is different from the `terminated` signal that originates from the underlying environment as part of the MDP.
No vector wrapper exists.

Example using the TimeLimit wrapper:
    >>> from gymnasium.wrappers import TimeLimit
    >>> from gymnasium.envs.classic_control import CartPoleEnv

    >>> spec = gym.spec("CartPole-v1")
    >>> spec.max_episode_steps
    500
    >>> env = gym.make("CartPole-v1")
    >>> env  # TimeLimit is included within t

In [10]:
# Check the innermost environment's observation space; the output matches the wrapped env's space.
env.unwrapped.observation_space

Box([ -2.5        -2.5       -10.        -10.         -6.2831855 -10.
  -0.         -0.       ], [ 2.5        2.5       10.        10.         6.2831855 10.
  1.         1.       ], (8,), float32)

In [11]:
# Step limit recorded in the environment's spec.
env.spec.max_episode_steps

1000

In [12]:
# Keep a handle on the innermost LunarLander instance, beneath the wrappers added by gym.make.
orig_env = env.unwrapped

In [13]:
orig_env?

[0;31mType:[0m        LunarLander
[0;31mString form:[0m <LunarLander<LunarLander-v3>>
[0;31mFile:[0m        ~/projects/RL-Introduction/venv/lib/python3.10/site-packages/gymnasium/envs/box2d/lunar_lander.py
[0;31mDocstring:[0m  
## Description
This environment is a classic rocket trajectory optimization problem.
According to Pontryagin's maximum principle, it is optimal to fire the
engine at full throttle or turn it off. This is the reason why this
environment has discrete actions: engine on or off.

There are two environment versions: discrete or continuous.
The landing pad is always at coordinates (0,0). The coordinates are the
first two numbers in the state vector.
Landing outside of the landing pad is possible. Fuel is infinite, so an agent
can learn to fly and then land on its first attempt.

To see a heuristic landing, run:
```shell
python gymnasium/envs/box2d/lunar_lander.py
```

## Action Space
There are four discrete actions available:
- 0: do nothing
- 1: fire left ori

In [14]:
# With default arguments the object returned by gym.make has TimeLimit as its outermost wrapper.
type(gym.make("LunarLander-v3", render_mode="human"))

gymnasium.wrappers.common.TimeLimit

In [15]:
# With max_episode_steps=-1 the outermost wrapper is OrderEnforcing, i.e. no TimeLimit is applied.
type(gym.make("LunarLander-v3", render_mode="human", max_episode_steps=-1))

gymnasium.wrappers.common.OrderEnforcing

In [16]:
gym.wrappers.common.OrderEnforcing?

[0;31mInit signature:[0m
[0mgym[0m[0;34m.[0m[0mwrappers[0m[0;34m.[0m[0mcommon[0m[0;34m.[0m[0mOrderEnforcing[0m[0;34m([0m[0;34m[0m
[0;34m[0m    [0menv[0m[0;34m:[0m [0;34m'gym.Env[ObsType, ActType]'[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mdisable_render_order_enforcing[0m[0;34m:[0m [0;34m'bool'[0m [0;34m=[0m [0;32mFalse[0m[0;34m,[0m[0;34m[0m
[0;34m[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0;31mDocstring:[0m     
Will produce an error if ``step`` or ``render`` is called before ``reset``.

No vector version of the wrapper exists.

Example:
    >>> import gymnasium as gym
    >>> from gymnasium.wrappers import OrderEnforcing
    >>> env = gym.make("CartPole-v1", render_mode="human")
    >>> env = OrderEnforcing(env)
    >>> env.step(0)
    Traceback (most recent call last):
        ...
    gymnasium.error.ResetNeeded: Cannot call env.step() before calling env.reset()
    >>> env.render()
    Traceback (most recent call last):
        ...
  

In [17]:
# .unwrapped still reaches the underlying LunarLander instance when TimeLimit is absent.
gym.make("LunarLander-v3", render_mode="human", max_episode_steps=-1).unwrapped

<gymnasium.envs.box2d.lunar_lander.LunarLander at 0x7f9495cc8a00>

In [18]:
from typing import TypeVar
from enum import Enum

In [19]:
# Placeholder type used to annotate the observation argument of `steer`.
Observation = TypeVar("Observation")
# An `Action` TypeVar was considered too; `Action` is defined as an Enum instead.
# Action = TypeVar("Action")

In [20]:
class Action(Enum):
    """Discrete LunarLander actions; each member's value is the integer action id."""

    Nothing = 0  # do nothing
    Left = 1  # fire left orientation engine
    Main = 2  # fire main engine
    Right = 3  # fire right orientation engine

In [21]:
def steer(obs: Observation, max_angular_velocity: float = 1.0) -> int:
    """Choose a side-engine action that rotates the lander back towards angle 0.

    Args:
        obs: Observation with exactly eight components, unpacked in the order
            x, y, vel_x, vel_y, angle, angular_velocity, leg1_contact,
            leg2_contact. Only angle and angular_velocity are used.
        max_angular_velocity: Spin rate (in observation units) beyond which the
            engine is no longer fired in the correcting direction.

    Returns:
        The integer value of an ``Action`` member (not the member itself), so
        the result can be passed directly to ``env.step``.
    """
    _x, _y, _vel_x, _vel_y, angle, angular_velocity, _leg1, _leg2 = obs

    # NOTE(review): sign convention follows Gymnasium's built-in LunarLander
    # heuristic, where action 3 (Right) counters positive angle / positive
    # angular velocity. The rate limit therefore caps spin in the *correcting*
    # direction: stop firing once the lander already rotates back fast enough,
    # but keep correcting when it is spinning away from upright.
    if angle > 0 and angular_velocity > -max_angular_velocity:
        return Action.Right.value
    if angle < 0 and angular_velocity < max_angular_velocity:
        return Action.Left.value
    return Action.Nothing.value

In [22]:
# Create a new LunarLander environment that renders on screen
env = gym.make("LunarLander-v3", render_mode="human")

# First observation comes from a reset seeded with 42
observation, info = env.reset(seed=42)
for _ in range(1000):
    # Policy: the hand-written `steer` controller decides from the latest observation
    action = steer(observation)

    # Advance the simulation by one step with the chosen action
    observation, reward, terminated, truncated, info = env.step(action)

    # Episode still running: carry on with the next step
    if not (terminated or truncated):
        continue

    # Episode finished (terminated or truncated): begin a new one
    observation, info = env.reset()

In [23]:
# Release the environment used by the steering rollout in the previous cell.
env.close()