<a href="https://colab.research.google.com/github/araffin/rl-tutorial-jnrr19/blob/sb3/5_custom_gym_env.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install "stable-baselines3[extra]>=2.0.0a4"
from stable_baselines3 import PPO, A2C, DQN
from stable_baselines3.common.env_util import make_vec_env
from stable_baselines3.common.env_checker import check_env
import numpy as np
import gymnasium as gym
from gymnasium import spaces



In [71]:
# Apple game environment

class AppleGameEnv(gym.Env):
    """
    Custom Environment that follows gym interface.
    This is a simple env where the agent must learn to play this addictive apple game 
    """

    # Because of google colab, we cannot implement the GUI ('human' render mode)
    metadata = {"render_modes": ["console"]}

    def __init__(self, grid_size=8, render_mode="console"):
        super(AppleGameEnv, self).__init__()
        self.render_mode = render_mode
        self.width = 10
        self.length = 10
        self.num_turns = 1000  # maximum number of turns
        self.observation_space = spaces.Box(
            low=0, high=9, shape=(self.width * self.length,), dtype=np.float32 #flattened board instead of nxn
        )
        self.board = np.zeros((grid_size * grid_size,), dtype=np.float32)
        # action space is picking two corners of the grid
        # normalizing the action space. will transform back to the grid size later
        self.action_space = spaces.Box(
            low=-1,
            high=1,
            shape=(4,),
            dtype=np.float32
        )
        # self.action_space = spaces.Box(low=np.array([1,1,1,1]), high=np.array([self.length,self.width,self.length,self.width]), dtype=np.int8)
    
    def reset(self, seed=None, options=None):
        """
        Important: the observation must be a numpy array
        :return: (np.array)
        """
        super().reset(seed=seed, options=options)
        self.num_turns = 1000
        self.board = np.random.randint(1,10,size=(self.width*self.length),dtype = np.float32)
        return np.array(self.board),{}



    def step(self, action):
        self.num_turns -= 1
        # extract the two points and map from [-1,1] to coordinate ranges
        action = np.zeros_like(action)
        action[[0,2]] = ((action[[0,2]] + 1) / 2) * (self.width + 1)  # map to [0,width+1] 
        action[[1,3]] = ((action[[1,3]] + 1) / 2) * (self.length + 1) # map to [0,length+1]
        x1,y1 = action[:2].astype(np.int8)
        x2,y2 = action[2:].astype(np.int8)
        top = min(y1,y2)
        bottom = max(y1,y2)
        left = min(x1,x2)
        right = max(x1,x2)

        # check if the sum of the points in the rectangle formed by the two corners is 10
        # if 10, set all entries in the rectangle to 0
        board_2d = self.board.reshape((self.length,self.width))
        rectangle = board_2d[left:right+1,top:bottom+1]
        sum = np.sum(rectangle)
        reward = 0
        if sum == 10:
            board_2d[left:right,top-1:bottom] = 0
            reward = (right-left)*(bottom-top)
            self.board = board_2d.flatten()

        terminated = bool((self.num_turns <= 0) or np.all(self.board == 0))
        truncated = False
        board_2d = self.board.reshape((self.length,self.width))
        return (
            np.array(self.board).astype(np.float32),
            reward,
            terminated,
            truncated,
            {}
        ) 

    def render(self):
        board2d = self.board.reshape((self.length,self.width))
        for i in range(self.length):
            for j in range(self.width):
              if j == self.grid_size - 1:
                  print(board2d[i,j])
              else:
                  print(board2d[i,j],end="|")             
            if i != self.grid_size - 1:
                print("-"*(2*self.width-1))

    def close(self):
        pass

In [70]:
env = AppleGameEnv(grid_size=8)
check_env(env,warn=True)
vec_env = make_vec_env(AppleGameEnv, n_envs=1, env_kwargs=dict(grid_size=8))
model = PPO("MlpPolicy", env, verbose=1).learn(100000)

TypeError: The reset() method must accept a `seed` parameter

In [72]:
action_space = spaces.Box(low=np.array([0,0,0,0]), high=np.array([9,9,9,9]), dtype=np.int8)
action = action_space.sample()
action

array([6, 9, 7, 8], dtype=int8)

In [73]:
x1,y1 = action[:2]
x2,y2 = action[2:]
print(x1,y1)
print(x2,y2)
top = min(y1,y2)
bottom = max(y1,y2)
left = min(x1,x2)
right = max(x1,x2)
print(top,bottom)
print(left,right)

6 9
7 8
8 9
6 7


In [74]:
board = np.random.randint(1,10,size=(10,10),dtype=np.int8)
board

array([[6, 9, 7, 4, 6, 4, 2, 5, 8, 3],
       [4, 6, 5, 5, 1, 3, 7, 5, 3, 6],
       [3, 2, 2, 5, 6, 2, 4, 2, 3, 7],
       [4, 3, 7, 4, 6, 7, 2, 6, 4, 2],
       [1, 6, 4, 6, 3, 9, 6, 3, 2, 1],
       [7, 2, 1, 1, 5, 4, 1, 4, 8, 5],
       [9, 7, 4, 9, 5, 4, 7, 6, 4, 1],
       [9, 8, 2, 1, 2, 9, 5, 8, 1, 4],
       [8, 4, 6, 9, 2, 9, 7, 9, 7, 7],
       [7, 9, 7, 2, 8, 7, 9, 2, 1, 8]], dtype=int8)

In [43]:
# 5, 7, 5, 8, 4, 4, 4,

In [76]:
rectangle = board[left:right+1,top:bottom+1]
rectangle

array([[4, 1],
       [1, 4]], dtype=int8)

In [47]:
np.sum(rectangle)

75

In [23]:
GRID_SIZE = 3
env = TicTacToeEnv(grid_size=GRID_SIZE)
check_env(env,warn=True)

vec_env = make_vec_env(TicTacToeEnv, n_envs=1, env_kwargs=dict(grid_size=GRID_SIZE))

In [30]:
model = PPO("MlpPolicy", env, verbose=1).learn(100000)

Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 3.49     |
|    ep_rew_mean     | -2.5     |
| time/              |          |
|    fps             | 6547     |
|    iterations      | 1        |
|    time_elapsed    | 0        |
|    total_timesteps | 2048     |
---------------------------------
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 3.5         |
|    ep_rew_mean          | -2.36       |
| time/                   |             |
|    fps                  | 4837        |
|    iterations           | 2           |
|    time_elapsed         | 0           |
|    total_timesteps      | 4096        |
| train/                  |             |
|    approx_kl            | 0.016748298 |
|    clip_fraction        | 0.13        |
|    clip_range           | 0.2         |
|    entropy_loss   