In [1]:
import gymnasium as gym
from ray.rllib.algorithms.ppo import PPOConfig

In [2]:
# Define your problem using python and Farama-Foudation's gymnasium API:

class SimpleCorridor(gym.Env):
    """Corridor in which an agent must learn to move right to reach the exit.
    -----------------------
    | S | 1 | 2 | 3 | G |
    -----------------------
    S = Start, G = goal, corridor_length = 5

    Possible actions to chose from are: 0=left, 1=right
    Observations are floats indicating the current field index, e.g. 0.0 for
    starting position, 1.0 for the field next to the starting position, etc.
    Rewards are -0.1 for all steps, except when reaching the goal (+1.0).
    """

    def __init__(self, config):
        self.end_pos = config["corridor_length"]
        self.cur_pos = 0
        self.action_space = gym.spaces.Discrete(2)
        self.observation_space = gym.spaces.Box(0.0, self.end_pos, shape=(1,))

    def reset(self, *, seed=None, options=None):
        """Resets the episode.
        Returns:
            Initial observation of the new episode and an info dict.
        """

        self.cur_pos = 0
        return [self.cur_pos], {}

    def step(self, action):
        """Takes a single step in the episode given action.

        Returns:
            New observation, reward, terminatd-flag, truncated-flag, info-dict (empty).
        """

        # Walk left
        if action == 0 and self.cur_pos > 0:
            self.cur_pos -= 1
        # Walk right
        elif action == 1:
            self.cur_pos += 1

        # Set terminated flag when endo of corridor (goal) reached.
        terminated = self.cur_pos >= self.end_pos
        truncated = False

        # +1 when goal reached, otherwise -1.
        reward = 1.0 if terminated else -0.1
        return [self.cur_pos], reward, terminated, truncated, {}
        

In [3]:
# Create an RLlib Algorithm instance from a PPOConfig object.

config = (
    PPOConfig().environment(
        # Env class to use (here: our gym.Env sub-class from above).
        env=SimpleCorridor,
        # Config dict to be passed to our custom env's constructor.
        # Use corridor with 20 fields (including S and G).
        env_config = {"corridor_length":28},
    )
    # Parallelize environment rollouts.
    .env_runners(num_env_runners=3)
)

# Construct the actual (PPO) algorithm object from the config.
algo = config.build()

`UnifiedLogger` will be removed in Ray 2.7.
  return UnifiedLogger(config, logdir, loggers=None)
The `JsonLogger interface is deprecated in favor of the `ray.tune.json.JsonLoggerCallback` interface and will be removed in Ray 2.7.
  self._loggers.append(cls(self.config, self.logdir, self.trial))
The `CSVLogger interface is deprecated in favor of the `ray.tune.csv.CSVLoggerCallback` interface and will be removed in Ray 2.7.
  self._loggers.append(cls(self.config, self.logdir, self.trial))
The `TBXLogger interface is deprecated in favor of the `ray.tune.tensorboardx.TBXLoggerCallback` interface and will be removed in Ray 2.7.
  self._loggers.append(cls(self.config, self.logdir, self.trial))
2024-05-24 04:36:18,884	INFO worker.py:1749 -- Started a local Ray instance.


In [4]:
# Train for n iterations and report results (mean, episode rewards).
# Since we have to move at least 19 times in the env to reach the goal and
# each move gives us -0.1 reward (except the last move at the end: +1.0),
# we can expect to reach an optimal episode reward of -0.1*18 + 1.0 = -0.8

for i in range(10):
    results = algo.train()
    print(f"Iter: {i}; avg. reward={results['episode_reward_mean']}")

# Perform inference (action computations) based on given env observations.
# Note that we are using a slightly different env here (len 10 instead of 20),
# however, this should still work as the agent has (hopefully) learned
# to "just always walk right!"

env = SimpleCorridor({"corridor_length": 10})

# Get the initial observation (should be: [0.0] for the starting position).
obs, info = env.reset()
terminated = truncated = False
total_reward = 0.0

# Play one episode.
while not terminated and not truncated:
    # Compute a single action, given the current observation
    # from the environment.
    action = algo.compute_single_action(obs)
    # Apply the computed action in the environment.
    obs, reward, terminated, truncated, info = env.step(action)
    # Sum up rewards for reporting purposes.
    total_reward += reward

# Report results.
print(f"Played 1 episode; total-reward={total_reward}")




KeyError: 'episode_reward_mean'