In [1]:
!pip install torch
!pip install gymnasium
!pip install minigrid
!pip install rl_zoo3



In [2]:
import gymnasium as gym
import minigrid
import numpy as np

np.set_printoptions(formatter={'float': lambda x: "{0:0.2f}".format(x)})

pygame 2.5.2 (SDL 2.28.3, Python 3.11.4)
Hello from the pygame community. https://www.pygame.org/contribute.html


In [3]:
def print_obs(obs):
    """Print the first channel of ``obs['image']`` as a text grid.

    The first two axes of the image are swapped before printing, so the second
    image axis runs down the printed rows and the first axis runs across the
    columns. Every value is followed by a single space and rows are separated
    by newlines.

    The grid size is taken from the array itself, so agent views other than
    the default 7x7 are printed in full instead of being cropped or raising
    an IndexError.

    Args:
        obs: Mapping with an ``'image'`` entry holding an array of shape
            ``(width, height, channels)``.
    """
    grid = obs['image'][:, :, 0].transpose((1, 0))
    lines = [''.join(str(cell) + ' ' for cell in row) for row in grid]
    print('\n'.join(lines))

In [4]:
def print_ont_hot(mat):
    """Print the active channel indices of a one-hot encoded grid.

    The first two axes are swapped before printing (same orientation as
    ``print_obs``). For every cell, each channel index whose value is greater
    than zero is printed followed by a space; a cell with no active channel
    prints nothing, so columns are only aligned when every cell has exactly
    one active channel.

    Grid and channel sizes are read from the array, so shapes other than
    ``(7, 7, 9)`` are handled as well.

    Args:
        mat: Array of shape ``(width, height, channels)``.
    """
    grid = mat.transpose((1, 0, 2))
    lines = []
    for row in grid:
        active = [
            str(channel) + ' '
            for cell in row
            for channel, value in enumerate(cell)
            if value > 0
        ]
        lines.append(''.join(active))
    print('\n'.join(lines))

In [5]:
def compute_score(task, policy):
  """Evaluate ``policy`` on ``task`` over ten fixed-seed episodes.

  Each episode resets the task with its own seed, then repeatedly asks the
  policy for an action until the task reports termination or truncation. The
  episode score is the sum of the rewards returned by ``task.step``.

  Prints the best score and the (mean, std) pair, all rounded to 3 decimals.

  Args:
    task: Object exposing ``reset(seed=...) -> (obs, info)`` and
      ``step(action) -> (obs, reward, terminated, truncated, info)``.
    policy: Callable mapping an observation to an action.

  Returns:
    Array of length 10 with the cumulative reward of every episode.
  """
  seed_by_episode = [42, 34, 50, 1, 9, 7, 43, 56, 90, 11]
  num_episodes = 10
  score_by_episode = np.zeros(num_episodes)

  for episode_idx in range(num_episodes):
    observation, info = task.reset(seed=seed_by_episode[episode_idx])
    episode_return = 0

    # Roll the episode out until the environment signals its end.
    while True:
      action = policy(observation)
      observation, reward, terminated, truncated, info = task.step(action)
      episode_return += reward
      if terminated or truncated:
        break

    score_by_episode[episode_idx] = episode_return

  score_best = round(score_by_episode.max(), 3)
  score_mean = round(score_by_episode.mean(), 3)
  score_std  = round(score_by_episode.std(), 3)

  print(f"Best score: {score_best}")
  print(f"Average score: {score_mean, score_std}")

  return score_by_episode

## My Custom Code

----

I carefully read through all the wrappers in https://github.com/Farama-Foundation/Minigrid/blob/master/minigrid/wrappers.py and gained a lot of inspiration.

About `DiscreteObsWrapper`:

- The `DictObservationSpaceWrapper` suggested that I could remove the tuples `(object, color, state)` unrelated to these three tasks to reduce the state space, allowing us to obtain some discrete values (approximately 9).

- The `OneHotPartialObsWrapper` suggested that I could one-hot encode the discrete values, resulting in an image of shape `(7, 7, 9)`.

- Then, based on this image, I proceeded with subsequent CNN operations.

---

About `ActionReward`:

- I have looked into **Reward Shaping**, but I'm not sure how to integrate it with this task. 

- As a tentative solution, I have written an `ActionReward` to incentivize successful subtasks, addressing the issue of having to complete the entire task to receive a **non-zero** reward originally.

- The reward brought by sub-tasks is given by $\left(1 - 0.9 \times \frac{\text{step count}}{\text{max steps}}\right) \times \left(1 - \left(\frac{2}{3}\right)^{|\text{sub-tasks}|}\right)$. It ensures that the overall reward remains within the range $[0, \text{reward})$.


In [6]:
%%writefile wrappers.py

import numpy as np
from gymnasium.core import ObservationWrapper, Wrapper
from gymnasium import spaces


class DiscreteObsWrapper(ObservationWrapper):
    """Observation wrapper that one-hot encodes the MiniGrid ``image`` grid.

    Every ``(object, color, state)`` cell of ``obs["image"]`` is collapsed into
    one of nine categories (the color is validated and then discarded) and the
    category is one-hot encoded, producing a ``(7, 7, 9)`` ``uint8`` array:

        0: unseen
        1: empty
        2: wall
        3-5: door, offset by its state (open / closed / locked)
        6: key
        7: box
        8: ball

    Any other object type triggers an ``AssertionError``.
    """

    def __init__(self, env):
        super().__init__(env)

        # The wrapped observation is the bare one-hot grid, not a dict.
        self.observation_space = spaces.Box(low=0, high=1, shape=(7, 7, 9), dtype="uint8")

    @staticmethod
    def _category_of(cell):
        """Map one ``(object, color, state)`` cell to its category index."""
        # MiniGrid object-type codes handled by this wrapper.
        UNSEEN, EMPTY, WALL, DOOR, KEY, BALL, BOX = 0, 1, 2, 4, 5, 6, 7

        (obj, color, state) = cell

        if obj == UNSEEN:
            assert color == 0
            assert state == 0
            return 0
        if obj == EMPTY:
            assert color == 0
            assert state == 0
            return 1
        if obj == WALL:
            assert color == 5
            assert state == 0
            return 2
        if obj == DOOR:
            assert color < 6, f'Unknown Color ({color})'
            # Doors occupy three consecutive categories, one per door state.
            return 3 + state

        # Pickable objects: checked in the order key, box, ball.
        for code, category in ((KEY, 6), (BOX, 7), (BALL, 8)):
            if obj == code:
                assert color < 6, f'Unknown Color ({color})'
                assert state == 0
                return category

        assert False, f'Unknown Object ({obj})'

    @classmethod
    def encode_image(cls, image: np.ndarray):
        """Encode the 7x7 cells of ``image`` into a ``(7, 7, 9)`` one-hot array."""
        return np.array([
            [cls.one_hot_item(cls._category_of(image[i][j]), 9) for j in range(7)]
            for i in range(7)
        ])

    @classmethod
    def one_hot_item(cls, ith: int, n: int):
        """Return a length-``n`` ``uint8`` vector with a single 1 at index ``ith``."""
        vector = np.zeros(shape=(n,), dtype="uint8")
        vector[ith] = 1
        return vector

    def observation(self, obs):
        """Replace the dict observation by the one-hot encoding of its image."""
        return self.encode_image(obs["image"])


class ActionReward(Wrapper):
    """Reward-shaping wrapper that pays for sub-tasks reached during an episode.

    A sub-task counts as reached the first time the corresponding object (key,
    door, ball or box) shows up in the observation cell ``obs['image'][3, 5]``
    (presumably the cell directly in front of the agent for the default 7x7
    view -- TODO confirm). Each reached sub-task adds a weight to a counter:
    key 1, door 1, ball 0.5, box 1.5.

    Unless the episode ends with a non-zero environment reward, the reward is
    replaced by

        (1 - 0.9 * step_count / max_steps) * (1 - (2 / 3) ** counter)

    which is 0 while nothing has been reached and stays below the
    environment's own success reward for the same step count.

    NOTE(review): the shaped value is returned on every non-final step, so it
    accumulates over an episode; confirm that this is the intended shaping.
    """

    # MiniGrid object-type codes (first entry of each image cell).
    OBJECT_TO_IDX = {
        "unseen": 0,
        "empty": 1,
        "wall": 2,
        "floor": 3,
        "door": 4,
        "key": 5,
        "ball": 6,
        "box": 7,
        "goal": 8,
        "lava": 9,
        "agent": 10,
    }

    def __init__(self, env):
        """Wrap ``env`` and start with no sub-task reached.

        Args:
            env: The environment to apply the wrapper to.
        """
        super().__init__(env)
        self._clear_progress()

    def _clear_progress(self):
        """Forget every sub-task reached so far."""
        self.met_key = False
        self.met_door = False
        self.met_ball = False
        self.met_box = False

    def step(self, action):
        """Step the environment with `action` and apply the shaped reward."""
        obs, reward, terminated, truncated, info = self.env.step(action)

        obj, _color, _state = obs['image'][3, 5]

        # Latch each flag the first time its object is seen. The previous
        # guard (`if self.met_key and ...`) could never turn a flag on.
        if obj == self.OBJECT_TO_IDX["key"]:
            self.met_key = True
        if obj == self.OBJECT_TO_IDX["door"]:
            self.met_door = True
        if obj == self.OBJECT_TO_IDX["ball"]:
            self.met_ball = True
        if obj == self.OBJECT_TO_IDX["box"]:
            self.met_box = True

        counter = 0.
        if self.met_key:
            counter += 1
        if self.met_door:
            counter += 1
        if self.met_ball:
            counter += 0.5
        if self.met_box:
            counter += 1.5

        done = terminated or truncated
        if done:
            # The counter above already captured this episode's progress.
            self._clear_progress()

        if not done or reward == 0.:
            # Read the counters from the base environment instead of relying
            # on the wrapper chain forwarding unknown attributes.
            base_env = self.unwrapped
            reward = 1 - 0.9 * (base_env.step_count / base_env.max_steps)
            # `**` binds tighter than `/`, so the base must be parenthesised;
            # `2/3 ** counter` evaluated to 2 / (3 ** counter).
            reward = reward * (1 - (2 / 3) ** counter)

        return obs, reward, terminated, truncated, info

    def reset(self, **kwargs):
        """Reset the environment and the sub-task flags."""
        # An episode may be abandoned before it terminates, so clear here too.
        self._clear_progress()
        obs, info = self.env.reset(**kwargs)
        return obs, info



__all__  = [
    'DiscreteObsWrapper',
    'ActionReward',
]

Overwriting wrappers.py


I was inspired by the work at https://github.com/lcswillems/rl-starter-files/blob/master/model.py#L27.

In [7]:
%%writefile models.py

from typing import Dict, List, Tuple, Type, Union

import gymnasium as gym
from stable_baselines3.common.torch_layers import BaseFeaturesExtractor
import torch
from torch import nn

class MinigridFeaturesExtractor(BaseFeaturesExtractor):
    """Small CNN feature extractor: three 2x2 convolutions and a linear head.

    Args:
        observation_space: Space of the observations fed to the extractor.
        features_dim: Size of the feature vector produced by the linear head.
        normalized_image: Accepted for signature compatibility; not used here.
    """

    def __init__(self, observation_space: gym.Space, features_dim: int = 512, normalized_image: bool = False) -> None:
        super().__init__(observation_space, features_dim)

        # NOTE(review): the first axis of the observation shape is taken as the
        # channel axis, which holds for channels-first inputs only -- confirm
        # the layout of the observations that actually reach this module.
        in_channels = observation_space.shape[0]

        # Conv/ReLU pairs are appended in order so the Sequential indices
        # (0, 2, 4 for the convolutions, 6 for Flatten) stay as before.
        layers = []
        for out_channels in (16, 32, 64):
            layers.append(nn.Conv2d(in_channels, out_channels, (2, 2)))
            layers.append(nn.ReLU())
            in_channels = out_channels
        layers.append(nn.Flatten())
        self.cnn = nn.Sequential(*layers)

        # Infer the flattened size from one forward pass on a sampled observation.
        with torch.no_grad():
            sample = torch.as_tensor(observation_space.sample()[None]).float()
            n_flatten = self.cnn(sample).shape[1]

        self.linear = nn.Sequential(nn.Linear(n_flatten, features_dim), nn.ReLU())

    def forward(self, observations: torch.Tensor) -> torch.Tensor:
        """Return the feature vectors for a batch of observations."""
        conv_features = self.cnn(observations)
        return self.linear(conv_features)


Overwriting models.py


Replace the original `features_extractor_class` in `CnnPolicy`.

In [8]:
%%writefile policies.py

from stable_baselines3 import PPO
from stable_baselines3.ppo.policies import CnnPolicy

from models import MinigridFeaturesExtractor


class MinigridCnnPolicy(CnnPolicy):
    """``CnnPolicy`` that always uses ``MinigridFeaturesExtractor``."""

    def __init__(self, *args, **kwargs):
        # The extractor class is fixed here; supplying it again through
        # kwargs raises a duplicate-keyword TypeError.
        super().__init__(*args, features_extractor_class=MinigridFeaturesExtractor, **kwargs)


# Register the policy under a string alias on PPO.
PPO.policy_aliases["MinigridCnnPolicy"] = MinigridCnnPolicy


Overwriting policies.py


Define all configurations for my hyperparameters.

In [9]:
%%writefile reward.yaml

# Following https://github.com/lcswillems/rl-starter-files
# Shared PPO defaults; the task entries below pull them in through the
# `minigrid-defaults` anchor and override individual keys.
MiniGrid-Empty-Random-5x5-v0: &minigrid-defaults
  env_wrapper: minigrid.wrappers.FlatObsWrapper # See GH/1320#issuecomment-1421108191
  normalize: true
  n_envs: 8  # number of environment copies running in parallel
  n_timesteps: !!float 1e5
  n_steps: 128  # Steps collected per environment for each update (rollout size is n_steps * n_envs)
  batch_size: 64  # Minibatch size used for each gradient step
  gae_lambda: 0.95  #  Factor for trade-off of bias vs variance for Generalized Advantage Estimator
  gamma: 0.99
  n_epochs: 10  # Number of epochs when optimizing the surrogate loss
  ent_coef: 0.0  # Entropy coefficient for the loss calculation
  learning_rate: 2.5e-4  # The learning rate, it can be a function
  clip_range: 0.2  # Clipping parameter, it can be a function
  policy: 'MlpPolicy'
  policy_kwargs: "dict(
    normalize_images=False,
  )"


MiniGrid-Unlock-v0:
  <<: *minigrid-defaults
  n_timesteps: !!float 1e5

  env_wrapper:
    - wrappers.ActionReward
    - wrappers.DiscreteObsWrapper
  policy: 'policies.MinigridCnnPolicy'
  policy_kwargs: "dict(
    normalize_images=False,
    net_arch=dict(pi=[64], vf=[64]),
  )"


MiniGrid-UnlockPickup-v0:
  <<: *minigrid-defaults
  n_timesteps: !!float 1e5

  env_wrapper:
    - wrappers.ActionReward
    - wrappers.DiscreteObsWrapper
  policy: 'policies.MinigridCnnPolicy'
  policy_kwargs: "dict(
    normalize_images=False,
    net_arch=dict(pi=[64], vf=[64]),
  )"


MiniGrid-BlockedUnlockPickup-v0:
  <<: *minigrid-defaults
  n_timesteps: !!float 1e5

  env_wrapper:
    - wrappers.ActionReward
    - wrappers.DiscreteObsWrapper
  policy: 'policies.MinigridCnnPolicy'
  policy_kwargs: "dict(
    normalize_images=False,
    net_arch=dict(pi=[64], vf=[64]),
  )"


Overwriting reward.yaml


In [10]:
%%writefile without-reward.yaml

# Following https://github.com/lcswillems/rl-starter-files
# Shared PPO defaults; the task entries below pull them in through the
# `minigrid-defaults` anchor and override individual keys.
MiniGrid-Empty-Random-5x5-v0: &minigrid-defaults
  env_wrapper: minigrid.wrappers.FlatObsWrapper # See GH/1320#issuecomment-1421108191
  normalize: true
  n_envs: 8  # number of environment copies running in parallel
  n_timesteps: !!float 1e5
  n_steps: 128  # Steps collected per environment for each update (rollout size is n_steps * n_envs)
  batch_size: 64  # Minibatch size used for each gradient step
  gae_lambda: 0.95  #  Factor for trade-off of bias vs variance for Generalized Advantage Estimator
  gamma: 0.99
  n_epochs: 10  # Number of epochs when optimizing the surrogate loss
  ent_coef: 0.0  # Entropy coefficient for the loss calculation
  learning_rate: 2.5e-4  # The learning rate, it can be a function
  clip_range: 0.2  # Clipping parameter, it can be a function
  policy: 'MlpPolicy'
  policy_kwargs: "dict(
    normalize_images=False,
  )"


MiniGrid-Unlock-v0:
  <<: *minigrid-defaults
  n_timesteps: !!float 1e5

  env_wrapper:
    - wrappers.DiscreteObsWrapper
  policy: 'policies.MinigridCnnPolicy'
  policy_kwargs: "dict(
    normalize_images=False,
    net_arch=dict(pi=[64], vf=[64]),
  )"


MiniGrid-UnlockPickup-v0:
  <<: *minigrid-defaults
  n_timesteps: !!float 3e5

  env_wrapper:
    - wrappers.DiscreteObsWrapper
  policy: 'policies.MinigridCnnPolicy'
  policy_kwargs: "dict(
    normalize_images=False,
    net_arch=dict(pi=[64], vf=[64]),
  )"


MiniGrid-BlockedUnlockPickup-v0:
  <<: *minigrid-defaults
  n_timesteps: !!float 3e5

  env_wrapper:
    - wrappers.DiscreteObsWrapper
  policy: 'policies.MinigridCnnPolicy'
  policy_kwargs: "dict(
    normalize_images=False,
    net_arch=dict(pi=[64], vf=[64]),
  )"


Overwriting without-reward.yaml


## Point 1.1
Solve the [Minigrid Unlock](https://minigrid.farama.org/environments/minigrid/UnlockEnv/) task.


![](https://minigrid.farama.org/_images/UnlockEnv.gif)

In [11]:
# Train an agent to solve the task
! python -m rl_zoo3.train --algo ppo --env MiniGrid-Unlock-v0 \
    --eval-freq 10000 \
    --conf-file reward.yaml

Seed: 4191266599
Loading hyperparameters from: reward.yaml
Default hyperparameters for environment (ones being tuned will be overridden):
OrderedDict([('batch_size', 64),
             ('clip_range', 0.2),
             ('ent_coef', 0.0),
             ('env_wrapper',
              ['wrappers.ActionReward', 'wrappers.DiscreteObsWrapper']),
             ('gae_lambda', 0.95),
             ('gamma', 0.99),
             ('learning_rate', 0.00025),
             ('n_envs', 8),
             ('n_epochs', 10),
             ('n_steps', 128),
             ('n_timesteps', 100000.0),
             ('normalize', True),
             ('policy', 'policies.MinigridCnnPolicy'),
             ('policy_kwargs',
              'dict( normalize_images=False, net_arch=dict(pi=[64], vf=[64]), '
              ')')])
Using 8 environments
Creating test environment
Normalization activated: {'gamma': 0.99, 'norm_reward': False, 'training': False}
Normalization activated: {'gamma': 0.99}
Using cpu device
Log path: logs/pp

In [12]:
# Train an agent to solve the task
! python -m rl_zoo3.train --algo ppo --env MiniGrid-Unlock-v0 \
    --eval-freq 10000 \
    --trained-agent logs/ppo/MiniGrid-Unlock-v0_1/MiniGrid-Unlock-v0.zip \
    --conf-file without-reward.yaml

Traceback (most recent call last):
  File "<frozen runpy>", line 198, in _run_module_as_main
  File "<frozen runpy>", line 88, in _run_code
  File "/Users/rjy/anaconda3/lib/python3.11/site-packages/rl_zoo3/train.py", line 279, in <module>
    train()
  File "/Users/rjy/anaconda3/lib/python3.11/site-packages/rl_zoo3/train.py", line 192, in train
    assert args.trained_agent.endswith(".zip") and os.path.isfile(
AssertionError: The trained_agent must be a valid path to a .zip file


In [13]:
first_task = gym.make("MiniGrid-Unlock-v0")

In [14]:
from wrappers import DiscreteObsWrapper, ActionReward
from stable_baselines3 import PPO

first_env = DiscreteObsWrapper(first_task)
first_model = PPO.load("logs/ppo/MiniGrid-Unlock-v0_2/best_model.zip")

def first_policy(observation):
    """Choose an action for the Unlock task from a raw MiniGrid observation.

    The observation is one-hot encoded with ``first_env`` and passed to
    ``first_model.predict`` with its default arguments.
    """
    encoded = first_env.observation(observation)
    action, _state = first_model.predict(encoded)
    return action

Exception: Can't get attribute '_function_setstate' on <module 'cloudpickle.cloudpickle' from '/Users/rjy/anaconda3/lib/python3.11/site-packages/cloudpickle/cloudpickle.py'>
Exception: Can't get attribute '_function_setstate' on <module 'cloudpickle.cloudpickle' from '/Users/rjy/anaconda3/lib/python3.11/site-packages/cloudpickle/cloudpickle.py'>
Exception: Can't get attribute '_function_setstate' on <module 'cloudpickle.cloudpickle' from '/Users/rjy/anaconda3/lib/python3.11/site-packages/cloudpickle/cloudpickle.py'>


In [15]:
compute_score(task=first_task, policy=first_policy)

Best score: 0.984
Average score: (0.877, 0.192)


array([0.85, 0.93, 0.88, 0.97, 0.97, 0.98, 0.97, 0.32, 0.96, 0.93])

## Point 1.2
Solve the [Minigrid Unlock and Pickup](https://minigrid.farama.org/environments/minigrid/UnlockPickupEnv/) task.

![](https://minigrid.farama.org/_images/UnlockPickupEnv.gif)

In [16]:
second_task = gym.make("MiniGrid-UnlockPickup-v0")

In [17]:
print_obs(second_task.reset()[0])

0 0 0 0 0 0 0 
0 0 0 0 0 0 0 
2 2 2 2 4 2 0 
2 1 5 1 1 2 0 
2 1 1 1 1 2 0 
2 1 1 1 1 2 0 
2 1 1 1 1 2 0 


In [18]:
print_obs(second_task.step(5)[0])

0 0 0 0 0 0 0 
0 0 0 0 0 0 0 
2 2 2 2 4 2 0 
2 1 5 1 1 2 0 
2 1 1 1 1 2 0 
2 1 1 1 1 2 0 
2 1 1 1 1 2 0 


In [19]:
# Train an agent to solve the task
! python -m rl_zoo3.train --algo ppo --env MiniGrid-UnlockPickup-v0 \
    --eval-freq 10000 \
    --trained-agent logs/ppo/MiniGrid-Unlock-v0_1/best_model.zip \
    --conf-file reward.yaml

Traceback (most recent call last):
  File "<frozen runpy>", line 198, in _run_module_as_main
  File "<frozen runpy>", line 88, in _run_code
  File "/Users/rjy/anaconda3/lib/python3.11/site-packages/rl_zoo3/train.py", line 279, in <module>
    train()
  File "/Users/rjy/anaconda3/lib/python3.11/site-packages/rl_zoo3/train.py", line 192, in train
    assert args.trained_agent.endswith(".zip") and os.path.isfile(
AssertionError: The trained_agent must be a valid path to a .zip file


In [20]:
# Train an agent to solve the task
! python -m rl_zoo3.train --algo ppo --env MiniGrid-UnlockPickup-v0 \
    --eval-freq 10000 \
    --trained-agent logs/ppo/MiniGrid-UnlockPickup-v0_1/best_model.zip \
    --conf-file reward.yaml

Traceback (most recent call last):
  File "<frozen runpy>", line 198, in _run_module_as_main
  File "<frozen runpy>", line 88, in _run_code
  File "/Users/rjy/anaconda3/lib/python3.11/site-packages/rl_zoo3/train.py", line 279, in <module>
    train()
  File "/Users/rjy/anaconda3/lib/python3.11/site-packages/rl_zoo3/train.py", line 192, in train
    assert args.trained_agent.endswith(".zip") and os.path.isfile(
AssertionError: The trained_agent must be a valid path to a .zip file


In [21]:
# Train an agent to solve the task
! python -m rl_zoo3.train --algo ppo --env MiniGrid-UnlockPickup-v0 \
    --eval-freq 10000 \
    --trained-agent logs/ppo/MiniGrid-UnlockPickup-v0_2/best_model.zip \
    --conf-file reward.yaml

Traceback (most recent call last):
  File "<frozen runpy>", line 198, in _run_module_as_main
  File "<frozen runpy>", line 88, in _run_code
  File "/Users/rjy/anaconda3/lib/python3.11/site-packages/rl_zoo3/train.py", line 279, in <module>
    train()
  File "/Users/rjy/anaconda3/lib/python3.11/site-packages/rl_zoo3/train.py", line 192, in train
    assert args.trained_agent.endswith(".zip") and os.path.isfile(
AssertionError: The trained_agent must be a valid path to a .zip file


In [22]:
# Train an agent to solve the task
! python -m rl_zoo3.train --algo ppo --env MiniGrid-UnlockPickup-v0 \
    --eval-freq 10000 \
    --trained-agent logs/ppo/MiniGrid-UnlockPickup-v0_3/MiniGrid-UnlockPickup-v0.zip \
    --conf-file without-reward.yaml

Traceback (most recent call last):
  File "<frozen runpy>", line 198, in _run_module_as_main
  File "<frozen runpy>", line 88, in _run_code
  File "/Users/rjy/anaconda3/lib/python3.11/site-packages/rl_zoo3/train.py", line 279, in <module>
    train()
  File "/Users/rjy/anaconda3/lib/python3.11/site-packages/rl_zoo3/train.py", line 192, in train
    assert args.trained_agent.endswith(".zip") and os.path.isfile(
AssertionError: The trained_agent must be a valid path to a .zip file


In [23]:
from wrappers import DiscreteObsWrapper
from stable_baselines3 import PPO

second_env = DiscreteObsWrapper(second_task)
second_model = PPO.load("logs/ppo/MiniGrid-UnlockPickup-v0_4/best_model.zip")

def second_policy(observation):
  """Choose an action for the UnlockPickup task from a raw MiniGrid observation.

  The observation is one-hot encoded with ``second_env`` and passed to
  ``second_model.predict`` with its default arguments.
  """
  encoded = second_env.observation(observation)
  action, _state = second_model.predict(encoded)
  return action

Exception: Can't get attribute '_function_setstate' on <module 'cloudpickle.cloudpickle' from '/Users/rjy/anaconda3/lib/python3.11/site-packages/cloudpickle/cloudpickle.py'>
Exception: Can't get attribute '_function_setstate' on <module 'cloudpickle.cloudpickle' from '/Users/rjy/anaconda3/lib/python3.11/site-packages/cloudpickle/cloudpickle.py'>
Exception: Can't get attribute '_function_setstate' on <module 'cloudpickle.cloudpickle' from '/Users/rjy/anaconda3/lib/python3.11/site-packages/cloudpickle/cloudpickle.py'>


In [24]:
compute_score(task=second_task, policy=second_policy)

Best score: 0.819
Average score: (0.316, 0.329)


array([0.49, 0.67, 0.00, 0.00, 0.00, 0.82, 0.47, 0.00, 0.00, 0.70])

## Point 1.3
Solve the [Minigrid Blocked, Unlock and Pickup](https://minigrid.farama.org/environments/minigrid/BlockedUnlockPickupEnv/) task.

![](https://minigrid.farama.org/_images/BlockedUnlockPickupEnv.gif)

In [25]:

# Train an agent to solve the task
! python -m rl_zoo3.train --algo ppo --env MiniGrid-BlockedUnlockPickup-v0 \
    --eval-freq 10000 \
    --trained-agent logs/ppo/MiniGrid-UnlockPickup-v0_4/best_model.zip \
    --conf-file reward.yaml

Seed: 3314051353
Loading hyperparameters from: reward.yaml
Default hyperparameters for environment (ones being tuned will be overridden):
OrderedDict([('batch_size', 64),
             ('clip_range', 0.2),
             ('ent_coef', 0.0),
             ('env_wrapper',
              ['wrappers.ActionReward', 'wrappers.DiscreteObsWrapper']),
             ('gae_lambda', 0.95),
             ('gamma', 0.99),
             ('learning_rate', 0.00025),
             ('n_envs', 8),
             ('n_epochs', 10),
             ('n_steps', 128),
             ('n_timesteps', 100000.0),
             ('normalize', True),
             ('policy', 'policies.MinigridCnnPolicy'),
             ('policy_kwargs',
              'dict( normalize_images=False, net_arch=dict(pi=[64], vf=[64]), '
              ')')])
Using 8 environments
Creating test environment
Normalization activated: {'gamma': 0.99, 'norm_reward': False, 'training': False}
Normalization activated: {'gamma': 0.99}
Loading pretrained agent
Exception

In [26]:
# Train an agent to solve the task
! python -m rl_zoo3.train --algo ppo --env MiniGrid-BlockedUnlockPickup-v0 \
    --eval-freq 10000 \
    --trained-agent logs/ppo/MiniGrid-BlockedUnlockPickup-v0_1/best_model.zip \
    --conf-file reward.yaml

Traceback (most recent call last):
  File "<frozen runpy>", line 198, in _run_module_as_main
  File "<frozen runpy>", line 88, in _run_code
  File "/Users/rjy/anaconda3/lib/python3.11/site-packages/rl_zoo3/train.py", line 279, in <module>
    train()
  File "/Users/rjy/anaconda3/lib/python3.11/site-packages/rl_zoo3/train.py", line 192, in train
    assert args.trained_agent.endswith(".zip") and os.path.isfile(
AssertionError: The trained_agent must be a valid path to a .zip file


In [27]:
# Train an agent to solve the task
! python -m rl_zoo3.train --algo ppo --env MiniGrid-BlockedUnlockPickup-v0 \
    --eval-freq 10000 \
    --trained-agent logs/ppo/MiniGrid-BlockedUnlockPickup-v0_2/best_model.zip \
    --conf-file reward.yaml

Traceback (most recent call last):
  File "<frozen runpy>", line 198, in _run_module_as_main
  File "<frozen runpy>", line 88, in _run_code
  File "/Users/rjy/anaconda3/lib/python3.11/site-packages/rl_zoo3/train.py", line 279, in <module>
    train()
  File "/Users/rjy/anaconda3/lib/python3.11/site-packages/rl_zoo3/train.py", line 192, in train
    assert args.trained_agent.endswith(".zip") and os.path.isfile(
AssertionError: The trained_agent must be a valid path to a .zip file


In [28]:
# Train an agent to solve the task
! python -m rl_zoo3.train --algo ppo --env MiniGrid-BlockedUnlockPickup-v0 \
    --eval-freq 10000 \
    --trained-agent logs/ppo/MiniGrid-BlockedUnlockPickup-v0_3/best_model.zip \
    --conf-file reward.yaml

Traceback (most recent call last):
  File "<frozen runpy>", line 198, in _run_module_as_main
  File "<frozen runpy>", line 88, in _run_code
  File "/Users/rjy/anaconda3/lib/python3.11/site-packages/rl_zoo3/train.py", line 279, in <module>
    train()
  File "/Users/rjy/anaconda3/lib/python3.11/site-packages/rl_zoo3/train.py", line 192, in train
    assert args.trained_agent.endswith(".zip") and os.path.isfile(
AssertionError: The trained_agent must be a valid path to a .zip file


In [29]:
# Train an agent to solve the task
! python -m rl_zoo3.train --algo ppo --env MiniGrid-BlockedUnlockPickup-v0 \
    --eval-freq 10000 \
    --trained-agent logs/ppo/MiniGrid-BlockedUnlockPickup-v0_4/best_model.zip \
    --conf-file reward.yaml

Traceback (most recent call last):
  File "<frozen runpy>", line 198, in _run_module_as_main
  File "<frozen runpy>", line 88, in _run_code
  File "/Users/rjy/anaconda3/lib/python3.11/site-packages/rl_zoo3/train.py", line 279, in <module>
    train()
  File "/Users/rjy/anaconda3/lib/python3.11/site-packages/rl_zoo3/train.py", line 192, in train
    assert args.trained_agent.endswith(".zip") and os.path.isfile(
AssertionError: The trained_agent must be a valid path to a .zip file


In [30]:
# Train an agent to solve the task
! python -m rl_zoo3.train --algo ppo --env MiniGrid-BlockedUnlockPickup-v0 \
    --eval-freq 10000 \
    --trained-agent logs/ppo/MiniGrid-BlockedUnlockPickup-v0_5/best_model.zip \
    --conf-file reward.yaml

Traceback (most recent call last):
  File "<frozen runpy>", line 198, in _run_module_as_main
  File "<frozen runpy>", line 88, in _run_code
  File "/Users/rjy/anaconda3/lib/python3.11/site-packages/rl_zoo3/train.py", line 279, in <module>
    train()
  File "/Users/rjy/anaconda3/lib/python3.11/site-packages/rl_zoo3/train.py", line 192, in train
    assert args.trained_agent.endswith(".zip") and os.path.isfile(
AssertionError: The trained_agent must be a valid path to a .zip file


In [31]:
third_task = gym.make("MiniGrid-BlockedUnlockPickup-v0")

In [32]:
from wrappers import DiscreteObsWrapper
from stable_baselines3 import PPO

third_env = DiscreteObsWrapper(third_task)
third_model = PPO.load("logs/ppo/MiniGrid-BlockedUnlockPickup-v0_6/best_model.zip")

def third_policy(observation):
  """Choose an action for the BlockedUnlockPickup task from a raw MiniGrid observation.

  The observation is one-hot encoded with ``third_env`` and passed to
  ``third_model.predict`` with its default arguments.
  """
  encoded = third_env.observation(observation)
  action, _state = third_model.predict(encoded)
  return action

In [33]:
compute_score(task=third_task, policy=third_policy)

Best score: 0.758
Average score: (0.223, 0.341)


array([0.00, 0.00, 0.00, 0.76, 0.00, 0.73, 0.00, 0.75, 0.00, 0.00])