# **More complicated gymnasium environment**




### **1. Preparation**
Import necessary dependencies

In [10]:
from typing import Any, SupportsFloat, Dict

import numpy as np
import gymnasium as gym
from gymnasium.core import ActType, ObsType

In [11]:
import gymnasium as gym
from stable_baselines3 import PPO
from stable_baselines3.common.env_checker import check_env
from stable_baselines3.common.env_util import make_vec_env


## **2. Golf2DEnv: A Simple 2D Golf Game Environment**
### Overview
`Golf2DEnv` is a simple 2D reinforcement learning environment implemented with Gymnasium. The goal of the game is to move a point (the "ball") to the target position `[5,5]` on a grid whose x and y coordinates each range from 0 to 10. The environment provides a discrete action space for movement and assigns rewards based on the Manhattan distance from the target.

### Environment Description
- The environment is represented as an 11x11 grid (coordinates 0 to 10 inclusive on each axis).
- The starting position is `[0,0]`.
- The target position is `[5,5]`.
- The observation space is a 2D integer coordinate within the range `[0,10]`.
- The environment supports only console-based rendering.

### Game Objective
The agent needs to reach the goal position `[5,5]` from any given starting position. The reward function is based on the negative Manhattan distance from the target, encouraging the agent to minimize its distance to the goal.

### Actions
The environment provides a discrete action space with four possible movements:

| Action | Description |
|--------|-------------|
| 0 | Move left (decrease x-coordinate) |
| 1 | Move up (increase y-coordinate) |
| 2 | Move right (increase x-coordinate) |
| 3 | Move down (decrease y-coordinate) |

### Step Method Breakdown
Each step in the environment follows these operations:
1. The agent takes an action from the discrete action space.
2. The agent's position updates based on the chosen action.
3. The position is clipped to the grid bounds `[0, 10]` on each axis.
4. The reward is computed as the negative Manhattan distance from the target `[5,5]`.
5. The environment checks if the agent has reached the goal (`done` state).
6. The method returns the new observation, the reward, the terminated flag (`done`), the truncated flag (always `False` here), and an info dictionary.

### Expected Results
- The agent receives negative rewards until it reaches `[5,5]`.
- The optimal strategy minimizes the number of steps to the goal.
- Upon reaching `[5,5]`, the episode terminates.
- The environment provides a simple testing ground for reinforcement learning algorithms with discrete action spaces and reward shaping based on distance minimization.


In [12]:
class Golf2DEnv(gym.Env):
    """Grid-walking environment: move the ball from ``[0, 0]`` to ``[5, 5]``.

    Observation:
        ``Box(low=0, high=10, shape=(2,), dtype=int32)`` holding ``[x, y]``.
    Actions (``Discrete(4)``):
        0 = left (x - 1), 1 = up (y + 1), 2 = right (x + 1), 3 = down (y - 1).
        Any other value leaves the position unchanged.
    Reward:
        Negative Manhattan distance to the target, computed after the move.
    Termination:
        The episode terminates when the position equals the target. ``step``
        always reports ``truncated=False``.
    """

    # Gymnasium looks up ``render_modes``; ``render.modes`` was the legacy Gym key.
    metadata = {'render_modes': ['console']}

    # Goal cell used by both the reward and the termination check.
    _TARGET = (5, 5)

    def __init__(self):
        super().__init__()

        self.pos = np.array([0, 0], dtype=np.int32)
        self.action_space = gym.spaces.Discrete(4)
        # 2D Box
        self.observation_space = gym.spaces.Box(
            low=0,
            high=10,
            shape=(2,),
            dtype=np.int32,
        )

    def reset(
            self,
            *,
            seed: int | None = None,
            options: dict[str, Any] | None = None,
    ) -> tuple[ObsType, dict[str, Any]]:  # type: ignore
        """Put the ball back at ``[0, 0]`` and return ``(observation, info)``.

        ``seed`` is forwarded to ``gym.Env.reset`` so ``self.np_random`` is
        seeded as the Gymnasium API expects. ``options`` is accepted but unused.
        """
        super().reset(seed=seed)
        self.pos = np.array([0, 0], dtype=np.int32)
        # NOTE: a random start could be drawn here with
        # ``self.np_random.integers(low=0, high=10, size=(2,))``.
        # Return a copy so later in-place moves do not alter this observation.
        return self.pos.copy(), {}

    def _get_reward(self) -> int:
        """Return the negative Manhattan distance to the target as a plain int."""
        return -int(np.abs(self.pos - self._TARGET).sum())

    def _is_done(self) -> bool:
        """Return True when the ball sits exactly on the target."""
        return bool(np.array_equal(self.pos, self._TARGET))

    def step(
            self, action: ActType
    ) -> tuple[ObsType, SupportsFloat, bool, bool, dict[str, Any]]:
        """Apply one move and return ``(obs, reward, terminated, truncated, info)``."""
        if action == 0:
            self.pos[0] -= 1
        elif action == 1:
            self.pos[1] += 1
        elif action == 2:
            self.pos[0] += 1
        elif action == 3:
            self.pos[1] -= 1

        # Clip in place so the array keeps its int32 dtype and stays in bounds.
        np.clip(self.pos, 0, 10, out=self.pos)

        reward = self._get_reward()
        terminated = self._is_done()

        # Hand out a copy: ``self.pos`` is mutated in place on the next step.
        return self.pos.copy(), reward, terminated, False, {}

    def render(self, mode='console'):
        """Print the current position when ``mode`` is ``'console'``."""
        if mode == 'console':
            print(f"Current Position: {self.pos}")

    def close(self):
        """Nothing to release."""
        pass


### Train And Test

- **Algorithm**  
  - The code uses **PPO (Proximal Policy Optimization)**, a reinforcement learning algorithm.  
  - PPO optimizes the policy by limiting the step size of updates, ensuring stable learning.  
  - It supports both discrete and continuous action spaces; here it is applied to a discrete one (four movement actions).  

- **Process**  
  - **Training:**  
    - A vectorized environment (`make_vec_env`) is created with 10 parallel instances.  
    - A PPO model is initialized with the `MlpPolicy` (Multi-Layer Perceptron policy).  
    - The model is trained for `100000` timesteps.  

  - **Testing:**  
    - The trained model is tested in a single instance of the environment.  
    - The environment is reset before testing.  
    - The model predicts actions step by step for up to 100 iterations or until `done`.  
    - Observations, actions, rewards, and termination signals are printed.  

- **Validation**  
  - The test phase evaluates whether the trained policy performs well in the environment.  
  - Observing the reward values and termination conditions helps determine policy effectiveness.  
  - If performance is suboptimal, hyperparameters (e.g., learning rate, network architecture) can be adjusted to improve learning.  


In [13]:
def run_golf2d_env():
    """Train PPO on ``Golf2DEnv``, then print one rollout of the trained policy."""

    def fit_policy(env_cls):
        # Ten copies of the environment are wrapped into one vectorized env.
        vec_env = make_vec_env(lambda: env_cls(), n_envs=10)
        agent = PPO('MlpPolicy', env=vec_env, verbose=0)
        agent.learn(total_timesteps=100_000)
        return agent

    def rollout(agent, env, max_steps=100):
        # Step a single environment until it terminates or the step budget runs out.
        observation, _ = env.reset()
        remaining = max_steps
        while remaining > 0:
            remaining -= 1
            move, _ = agent.predict(observation)
            observation, reward, finished, _, _ = env.step(move)
            print(f'obs: {observation}, action: {move}, reward: {reward}, done: {finished}')
            if finished:
                return

    # Training runs first; the evaluation environment is created afterwards.
    rollout(fit_policy(Golf2DEnv), Golf2DEnv())


run_golf2d_env()

obs: [0 1], action: 1, reward: -9, done: False
obs: [0 2], action: 1, reward: -8, done: False
obs: [1 2], action: 2, reward: -7, done: False
obs: [1 1], action: 3, reward: -8, done: False
obs: [1 2], action: 1, reward: -7, done: False
obs: [2 2], action: 2, reward: -6, done: False
obs: [3 2], action: 2, reward: -5, done: False
obs: [3 3], action: 1, reward: -4, done: False
obs: [3 4], action: 1, reward: -3, done: False
obs: [4 4], action: 2, reward: -2, done: False
obs: [4 3], action: 3, reward: -3, done: False
obs: [4 4], action: 1, reward: -2, done: False
obs: [4 5], action: 1, reward: -1, done: False
obs: [5 5], action: 2, reward: 0, done: True


## **3. Golf2DBoxEnv: Multi Factor Observation In Dict Type**

### Overview
`Golf2DBoxEnv` is a custom Gymnasium environment designed for a simple 2D grid-based game. The environment simulates an agent navigating a grid (x and y coordinates from 0 to 10) to collect chests. The agent's position and the positions of the chests are represented in a 2D space, and the agent can move in four cardinal directions to interact with the environment.

### Environment Description
- The environment consists of an 11x11 grid (coordinates 0 to 10 inclusive on each axis).
- The agent starts at the position `[0, 0]`.
- The grid contains randomly generated chests; each episode has either 2 or 3 chests.
- The observation space is a dictionary containing:
  - `agent_pos`: A 2D vector representing the agent's current position on the grid (values range from 0 to 10).
  - `magic_box`: A 3x2 matrix representing the positions of the chests, padded with `-1` if there are fewer than 3 chests.

### Game Objective
- The goal of the game is for the agent to collect all the chests on the grid.
- The game ends when all chests have been collected.

### Action Behavior
The action space is discrete with 4 possible actions:
- **0**: Move left (decrease the x-coordinate).
- **1**: Move up (increase the y-coordinate).
- **2**: Move right (increase the x-coordinate).
- **3**: Move down (decrease the y-coordinate).
- The agent's position is clipped to ensure it stays within the bounds of the grid (0 to 10).

### Step Method
The `step` method allows the agent to interact with the environment. Here is the breakdown of its steps:
1. **Action Execution**:
   - The agent's position is updated based on the chosen action.
2. **Position Clipping**:
   - The agent's position is clipped to ensure it remains within the grid bounds.
3. **Reward Calculation**:
   - A reward of `10` is given if the agent collects a chest.
   - The collected chest is removed from the environment.
4. **Termination Check**:
   - The episode ends if all chests are collected.
5. **Observation Return**:
   - The updated observation (agent position and chest positions) is returned.

### Expected Results
- The agent will navigate the grid to collect chests.
- The game will terminate once all chests are collected.
- The agent will receive a reward of `10` for each chest collected.
- The observation space will reflect the agent's current position and the remaining chests.

In [14]:

class Golf2DBoxEnv(gym.Env):
    """Grid environment in which the agent walks around collecting chests.

    Observation (``Dict``):
        ``agent_pos``: ``Box(0, 10, shape=(2,), int32)`` -- the agent's ``[x, y]``.
        ``magic_box``: ``Box(-1, 10, shape=(3, 2), int32)`` -- one row per
        remaining chest; unused rows are filled with ``-1``.
    Actions (``Discrete(4)``):
        0 = left (x - 1), 1 = up (y + 1), 2 = right (x + 1), 3 = down (y - 1).
        Any other value leaves the position unchanged.
    Reward:
        ``10`` on a step that ends on a chest (at most one chest is collected
        per step), otherwise ``0``.
    Termination:
        The episode terminates when no chests remain. ``step`` always reports
        ``truncated=False``.
    """

    # Gymnasium looks up ``render_modes``; ``render.modes`` was the legacy Gym key.
    metadata = {'render_modes': ['console']}

    def __init__(self):
        super().__init__()

        self.pos = np.array([0, 0], dtype=np.int32)
        self.action_space = gym.spaces.Discrete(4)

        self.observation_space = gym.spaces.Dict({
            "agent_pos": gym.spaces.Box(low=0, high=10, shape=(2,), dtype=np.int32),
            # ``low`` must be -1: rows without a chest are padded with -1.
            "magic_box": gym.spaces.Box(low=-1, high=10, shape=(3, 2), dtype=np.int32),
        })

        self.chests = self._generate_chests()

    def _generate_chests(self) -> np.ndarray:
        """Draw 2 or 3 chest positions with coordinates in ``0..10``.

        Uses the environment's own ``self.np_random`` so that
        ``reset(seed=...)`` makes the layout reproducible.
        """
        # The upper bound of ``integers`` is exclusive: this yields 2 or 3.
        num_chests = int(self.np_random.integers(2, 4))
        return self.np_random.integers(
            low=0, high=11, size=(num_chests, 2), dtype=np.int32
        )

    def _get_obs(self) -> dict[str, np.ndarray]:
        """Build the observation dict from the current position and chests."""
        padded_chests = np.full((3, 2), -1, dtype=np.int32)
        padded_chests[:len(self.chests)] = self.chests
        return {
            # Copy so later in-place moves do not alter this observation.
            'agent_pos': self.pos.copy(),
            'magic_box': padded_chests,
        }

    def _get_reward(self) -> int:
        """Collect the first chest at the agent's position, if any.

        Returns ``10`` and removes that chest from ``self.chests`` when one is
        found; returns ``0`` otherwise.
        """
        # Row indices of chests whose (x, y) both match the agent's position.
        hits = np.flatnonzero((self.chests == self.pos).all(axis=1))
        if hits.size == 0:
            return 0
        self.chests = np.delete(self.chests, hits[0], axis=0)
        return 10

    def _is_done(self) -> bool:
        """Return True once every chest has been collected."""
        return len(self.chests) == 0

    def reset(
            self,
            *,
            seed: int | None = None,
            options: dict[str, Any] | None = None,
    ) -> tuple[ObsType, dict[str, Any]]:  # type: ignore
        """Return the agent to ``[0, 0]``, draw new chests, return ``(obs, info)``.

        ``seed`` is forwarded to ``gym.Env.reset`` so ``self.np_random`` is
        seeded before the chests are generated. ``options`` is accepted but
        unused.
        """
        super().reset(seed=seed)

        self.pos = np.array([0, 0], dtype=np.int32)
        self.chests = self._generate_chests()

        return self._get_obs(), {}

    def step(
            self, action: ActType
    ) -> tuple[ObsType, SupportsFloat, bool, bool, dict[str, Any]]:
        """Apply one move and return ``(obs, reward, terminated, truncated, info)``."""
        if action == 0:
            self.pos[0] -= 1
        elif action == 1:
            self.pos[1] += 1
        elif action == 2:
            self.pos[0] += 1
        elif action == 3:
            self.pos[1] -= 1

        # Clip in place so the array keeps its int32 dtype and stays in bounds.
        np.clip(self.pos, 0, 10, out=self.pos)

        reward = self._get_reward()
        terminated = self._is_done()

        return self._get_obs(), reward, terminated, False, {}

    def render(self, mode='console'):
        """Print the agent position and remaining chests when ``mode`` is ``'console'``."""
        if mode == 'console':
            print(f"Agent Position: {self.pos}")
            print(f"Remaining Chests: {self.chests}")

    def close(self):
        """Nothing to release."""
        pass


### Train And Test  

- **Algorithm**  
  - The code uses **PPO (Proximal Policy Optimization)**, a policy gradient reinforcement learning algorithm.  
  - PPO optimizes the policy by **limiting large updates**, ensuring stable and consistent training.  
  - It supports both **discrete and continuous action spaces**; this task uses a discrete one (four movement actions).  

- **Process**  
  - **Training:**  
    - A vectorized environment (`make_vec_env`) is created with **10 parallel instances**.  
    - The PPO model is initialized with the **MultiInputPolicy**, which processes multiple observation inputs.  
    - The model is trained for **100,000 timesteps** to learn an optimal movement strategy.  

  - **Testing:**  
    - The trained model is evaluated in a **single instance** of `Golf2DBoxEnv`.  
    - The environment is **reset** before testing.  
    - The model **predicts actions** and interacts with the environment for **up to 100 steps** or until all chests are collected.  
    - The agent’s **observations, actions, rewards, and termination status** are printed to monitor performance.  

- **Validation**  
  - The test phase **verifies** whether the agent has learned to collect the chests efficiently.  
  - Observing **reward trends** and **termination conditions** helps assess learning success.  
  - If performance is poor, **hyperparameters** such as **learning rate, training steps, or reward function** may need adjustment.  


In [15]:
def run_golf2d_box_env():
    """Train PPO on ``Golf2DBoxEnv``, then print one rollout of the trained policy."""

    def fit_policy(env_cls):
        # Ten copies of the environment are wrapped into one vectorized env.
        vec_env = make_vec_env(lambda: env_cls(), n_envs=10)
        # ``MultiInputPolicy`` is used because the observation is a Dict space.
        agent = PPO('MultiInputPolicy', env=vec_env, verbose=0)
        agent.learn(total_timesteps=100_000)
        return agent

    def rollout(agent, env, max_steps=100):
        # Step a single environment until it terminates or the step budget runs out.
        observation, _ = env.reset()
        remaining = max_steps
        while remaining > 0:
            remaining -= 1
            move, _ = agent.predict(observation)
            observation, reward, finished, _, _ = env.step(move)
            print(f'obs: {observation}, action: {move}, reward: {reward}, done: {finished}')
            if finished:
                return

    # Training runs first; the evaluation environment is created afterwards.
    rollout(fit_policy(Golf2DBoxEnv), Golf2DBoxEnv())


run_golf2d_box_env()

obs: {'agent_pos': array([0, 0]), 'magic_box': array([[ 8,  1],
       [10,  8],
       [ 3,  2]])}, action: 0, reward: 0, done: False
obs: {'agent_pos': array([0, 1]), 'magic_box': array([[ 8,  1],
       [10,  8],
       [ 3,  2]])}, action: 1, reward: 0, done: False
obs: {'agent_pos': array([0, 2]), 'magic_box': array([[ 8,  1],
       [10,  8],
       [ 3,  2]])}, action: 1, reward: 0, done: False
obs: {'agent_pos': array([1, 2]), 'magic_box': array([[ 8,  1],
       [10,  8],
       [ 3,  2]])}, action: 2, reward: 0, done: False
obs: {'agent_pos': array([2, 2]), 'magic_box': array([[ 8,  1],
       [10,  8],
       [ 3,  2]])}, action: 2, reward: 0, done: False
obs: {'agent_pos': array([2, 1]), 'magic_box': array([[ 8,  1],
       [10,  8],
       [ 3,  2]])}, action: 3, reward: 0, done: False
obs: {'agent_pos': array([2, 0]), 'magic_box': array([[ 8,  1],
       [10,  8],
       [ 3,  2]])}, action: 3, reward: 0, done: False
obs: {'agent_pos': array([2, 0]), 'magic_box': array([[

## **4. Golf2DMultiDiscreteBoxEnv: Multi-Discrete Action Sample**

### Overview  
`Golf2DMultiDiscreteBoxEnv` is a **custom reinforcement learning environment** implemented using **Gymnasium**. The environment simulates a simple **2D grid-based movement** where an agent navigates toward a target using discrete directional actions and step sizes.  

### Environment Description  
- The environment represents a **2D grid** with boundaries between `[0, 10]` for both x and y coordinates.  
- The agent starts at position `[0, 0]` and moves toward the target at `[5, 5]`.  
- The action space consists of a **direction and a step size**, influencing the agent's movement.  
- The episode terminates once the agent reaches the target.  

### Game Objective  
- The agent's goal is to **reach the target position `[5, 5]`** as efficiently as possible.  
- The reward function is based on the **negative Manhattan distance**, meaning the closer the agent is to the goal, the higher the reward.  
- The agent should optimize its movement strategy to minimize the number of steps taken.  

### Action Behavior  
- The action space is **MultiDiscrete([4, 5])**, where:  
  - **First dimension (Direction)**:  
    - `0`: Left  
    - `1`: Up  
    - `2`: Right  
    - `3`: Down  
  - **Second dimension (Step Size)**:  
    - Ranges from `1` to `5`, indicating the number of steps moved in the chosen direction.  

### Step Method Process  
1. **Unpack the action** into `direction` and `steps`.  
2. **Convert steps** from `[0-4]` to `[1-5]` to ensure a valid movement range.  
3. **Update the agent’s position** based on the chosen direction and step size.  
4. **Clip the position** within the valid grid boundaries `[0, 10]`.  
5. **Calculate the reward** based on the **negative Manhattan distance** to the goal.  
6. **Check if the episode is done**, i.e., the agent reaches `[5, 5]`.  
7. **Return the new state, reward, done flag, and additional information**.  

### Expected Outcome  
- The agent learns to **navigate efficiently** toward `[5, 5]` using optimal step sizes and directions.  
- **Higher rewards** are achieved when the agent reaches the target faster.  
- The environment provides a simple but effective testbed for **discrete action-space reinforcement learning** algorithms.  

**Key Features of MultiDiscrete:**

1. Vector of Discrete Actions:
- The action is represented as a vector (or array) of integers.
- Each element in the vector corresponds to a separate discrete action with its own range of possible values.

2. Independent Ranges:
- Each dimension of the vector can have a different range of values.
- For example, one dimension might have 3 possible values (0, 1, 2), while another might have 5 possible values (0, 1, 2, 3, 4).

3. Use Cases:
- MultiDiscrete is commonly used in environments where multiple discrete decisions need to be made simultaneously.
- Examples include controlling multiple agents, selecting actions for different components of a system, or making decisions in a multi-dimensional discrete space.

In [22]:
class Golf2DMultiDiscreteBoxEnv(gym.Env):
    """Grid environment with a two-part action: a direction and a step count.

    Observation:
        ``Box(low=0, high=10, shape=(2,), dtype=int32)`` holding ``[x, y]``.
    Actions (``MultiDiscrete([4, 5])``):
        ``action[0]`` is the direction -- 0 = left, 1 = up, 2 = right, 3 = down.
        ``action[1]`` is in ``0..4`` and is mapped to a move of ``1..5`` cells.
        Any other direction value leaves the position unchanged.
    Reward:
        Negative Manhattan distance to the target ``[5, 5]`` after the move.
    Termination:
        The episode terminates when the position equals the target. ``step``
        always reports ``truncated=False``.
    """

    # Gymnasium looks up ``render_modes``; ``render.modes`` was the legacy Gym key.
    metadata = {'render_modes': ['console']}

    # Goal cell used by both the reward and the termination check.
    _TARGET = (5, 5)

    def __init__(self):
        super().__init__()

        # Initial position
        self.pos = np.array([0, 0], dtype=np.int32)

        # First dimension: direction (0: left, 1: up, 2: right, 3: down)
        # Second dimension: raw value 0-4, interpreted as 1 to 5 cells
        self.action_space = gym.spaces.MultiDiscrete([4, 5])

        # The observation is the 2D position
        self.observation_space = gym.spaces.Box(
            low=0,
            high=10,
            shape=(2,),
            dtype=np.int32,
        )

    def reset(
            self,
            *,
            seed: int | None = None,
            options: dict[str, Any] | None = None,
    ) -> tuple[np.ndarray, dict[str, Any]]:
        """Put the agent back at ``[0, 0]`` and return ``(observation, info)``.

        ``seed`` is forwarded to ``gym.Env.reset`` so ``self.np_random`` is
        seeded as the Gymnasium API expects. ``options`` is accepted but unused.
        """
        super().reset(seed=seed)
        self.pos = np.array([0, 0], dtype=np.int32)
        # Return a copy so later in-place moves do not alter this observation.
        return self.pos.copy(), {}

    def _get_reward(self) -> int:
        """Return the negative Manhattan distance to the target as a plain int."""
        return -int(np.abs(self.pos - self._TARGET).sum())

    def _is_done(self) -> bool:
        """Return True when the agent sits exactly on the target."""
        return bool(np.array_equal(self.pos, self._TARGET))

    def step(
            self, action: np.ndarray
    ) -> tuple[np.ndarray, SupportsFloat, bool, bool, dict[str, Any]]:
        """Apply one move and return ``(obs, reward, terminated, truncated, info)``."""
        direction, raw_steps = action
        # The action encodes the distance as 0-4; the actual move is 1-5 cells.
        distance = int(raw_steps) + 1

        if direction == 0:  # Left
            self.pos[0] -= distance
        elif direction == 1:  # Up
            self.pos[1] += distance
        elif direction == 2:  # Right
            self.pos[0] += distance
        elif direction == 3:  # Down
            self.pos[1] -= distance

        # Clip in place so the array keeps its int32 dtype and stays in bounds.
        np.clip(self.pos, 0, 10, out=self.pos)

        reward = self._get_reward()
        terminated = self._is_done()

        # Hand out a copy: ``self.pos`` is mutated in place on the next step.
        return self.pos.copy(), reward, terminated, False, {}

    def render(self, mode='console'):
        """Print the current position when ``mode`` is ``'console'``."""
        if mode == 'console':
            print(f"Current Position: {self.pos}")

    def close(self):
        """Nothing to release."""
        pass

### Train And Test

- **Algorithm**  
  - The code uses **PPO (Proximal Policy Optimization)**, a reinforcement learning algorithm.  
  - PPO optimizes the policy by limiting the step size of updates, ensuring stable learning.  
  - It supports both discrete and continuous action spaces; here it is applied to a `MultiDiscrete` action space.  

- **Process**  
  - **Training:**  
    - A vectorized environment (`make_vec_env`) is created with 10 parallel instances of the `Golf2DMultiDiscreteBoxEnv`.  
    - A PPO model is initialized with the `MlpPolicy` (Multi-Layer Perceptron policy).  
    - The model is trained for `300,000` timesteps using the `learn` method.  

  - **Testing:**  
    - The trained model is tested in a single instance of the `Golf2DMultiDiscreteBoxEnv`.  
    - The environment is reset before testing to initialize the observation and info.  
    - The model predicts actions step by step for up to 100 iterations or until the environment signals `done`.  
    - Observations, actions, rewards, and termination signals are printed during each step.  

- **Validation**  
  - The test phase evaluates whether the trained policy performs well in the environment.  
  - Observing the reward values and termination conditions helps determine the effectiveness of the learned policy.  
  - If performance is suboptimal, hyperparameters (e.g., learning rate, network architecture) or the training duration can be adjusted to improve learning.  
  - The vectorized training environment ensures efficient exploration and faster convergence by leveraging parallel environments.

In [24]:
def run_golf2d_multidiscrete_box_env():
    """Train PPO on ``Golf2DMultiDiscreteBoxEnv``, then print one rollout."""

    def fit_policy(env_cls):
        # Ten copies of the environment are wrapped into one vectorized env.
        vec_env = make_vec_env(lambda: env_cls(), n_envs=10)
        agent = PPO('MlpPolicy', env=vec_env, verbose=0)
        agent.learn(total_timesteps=300_000)
        return agent

    def rollout(agent, env, max_steps=100):
        # Step a single environment until it terminates or the step budget runs out.
        observation, _ = env.reset()
        remaining = max_steps
        while remaining > 0:
            remaining -= 1
            move, _ = agent.predict(observation)
            observation, reward, finished, _, _ = env.step(move)
            print(f'obs: {observation}, action: {move}, reward: {reward}, done: {finished}')
            if finished:
                return

    # Training runs first; the evaluation environment is created afterwards.
    rollout(fit_policy(Golf2DMultiDiscreteBoxEnv), Golf2DMultiDiscreteBoxEnv())


run_golf2d_multidiscrete_box_env()

obs: [5 0], action: [2 4], reward: -5, done: False
obs: [5 5], action: [1 4], reward: 0, done: True
