### Reinforcement Learning - Resource Manager

Kudos to:

https://www.gymlibrary.dev/content/environment_creation/

https://www.youtube.com/watch?v=bD6V3rcr_54&ab_channel=NicholasRenotte 


Version 0.1:

- First experiment while learning Gym.
- There is no penalty for wandering around; the only reward is given when the target is reached, so the agent is pretty dumb right now.


### Imports

In [1]:
import gym
from gym import spaces
import numpy as np
import pygame
from gym.envs.registration import register

### Environment

In [2]:
#Create Gym Environment for Resource Manager
#The environment is a 2D grid with 4 possible actions: up, down, left, right
#The agent can move in any direction but cannot move outside the grid

class ResourceManagerEnv(gym.Env):
    """Square grid world in which an agent walks towards a random target.

    The agent and the target each occupy one cell of a ``grid_size`` x
    ``grid_size`` grid. Every step moves the agent by one cell (clamped to the
    grid). The episode is done, with reward 1, when the agent stands on the
    target; every other step yields reward 0.

    The observation is a ``(grid_size, grid_size)`` float32 array that is 1 at
    the agent's cell and 0 elsewhere. It does not encode the target position.

    Args:
        grid_size: Number of cells along each side of the grid. Must be at
            least 2 so that agent and target can occupy different cells.
        render_mode: ``None``, ``"human"`` (pygame window) or ``"rgb_array"``.

    Raises:
        ValueError: If ``grid_size`` is smaller than 2.
        AssertionError: If ``render_mode`` is not a supported mode.
    """

    metadata = {"render_modes": ["human", "rgb_array"], "render_fps": 4}

    def __init__(self, grid_size=10, render_mode=None):
        assert render_mode is None or render_mode in self.metadata["render_modes"]
        if grid_size < 2:
            # With a single cell the target could never differ from the agent
            # and reset() would loop forever.
            raise ValueError(f"grid_size must be at least 2, got {grid_size}")

        # Define Grid Size
        self.grid_size = grid_size
        self.window_size = 500

        # Action Space:
        # 0: right (+x), 1: up (+y), 2: left (-x), 3: down (-y)
        self.action_space = spaces.Discrete(4)

        # Map the action to the corresponding movement
        self.action_to_direction = {
            0: np.array([1, 0]),
            1: np.array([0, 1]),
            2: np.array([-1, 0]),
            3: np.array([0, -1]),
        }

        # Observation Space:
        # a 2D grid with the agent's position marked as 1 and the rest as 0
        self.observation_space = spaces.Box(
            low=0, high=1, shape=(grid_size, grid_size), dtype=np.float32
        )

        # Rendering stays disabled during the initial reset so that merely
        # constructing the environment never opens a window, and so that
        # reset() does not depend on the base class providing `render_mode`.
        self.render_mode = None
        self.window = None
        self.clock = None
        self.reset()
        self.render_mode = render_mode

    def reset(self, seed=None, options=None):
        """Place agent and target on random, distinct cells.

        Args:
            seed: Optional seed forwarded to ``gym.Env.reset``.
            options: Accepted for Gym API compatibility; currently unused.

        Returns:
            Tuple ``(observation, info)``.
        """
        super().reset(seed=seed)

        # Choose the agent's initial position at random
        self.agent_position = self.np_random.integers(
            low=0, high=self.grid_size, size=(2,)
        )

        # Re-draw the target until it differs from the agent's position
        self.target_position = self.agent_position
        while np.array_equal(self.target_position, self.agent_position):
            self.target_position = self.np_random.integers(
                low=0, high=self.grid_size, size=(2,)
            )

        observation = self.get_obs()
        info = self.get_info()

        if self.render_mode == "human":
            self.render_frame()

        return observation, info

    def get_obs(self):
        """Return the grid observation with a 1 at the agent's cell."""
        observation = np.zeros((self.grid_size, self.grid_size), dtype=np.float32)
        observation[tuple(self.agent_position)] = 1
        return observation

    def get_info(self):
        """Return a dict with the current agent and target positions."""
        return {
            'agent_position': self.agent_position,
            'target_position': self.target_position,
        }

    def step(self, action):
        """Move the agent one cell and report the outcome.

        Args:
            action: Integer in ``[0, 3]``; see ``action_to_direction``.

        Returns:
            Tuple ``(observation, reward, done, info)``. ``reward`` is 1 when
            the agent reached the target on this step and 0 otherwise.
        """
        direction = self.action_to_direction[action]

        # Clamp so the agent cannot leave the grid
        self.agent_position = np.clip(
            self.agent_position + direction,
            0,
            self.grid_size - 1,
        )

        # Done as soon as the agent stands on the target; plain bool for callers
        done = bool(np.array_equal(self.agent_position, self.target_position))

        # Simple reward function
        reward = 1 if done else 0

        observation = self.get_obs()
        info = self.get_info()

        if self.render_mode == "human":
            self.render_frame()

        return observation, reward, done, info

    def render(self):
        """Return an RGB frame in ``"rgb_array"`` mode; otherwise return None."""
        if self.render_mode == "rgb_array":
            return self.render_frame()

    def render_frame(self):
        """Draw the current state.

        In ``"human"`` mode the frame is shown in a pygame window (created on
        first use) and nothing is returned. Otherwise the frame is returned as
        an array with axes transposed to (row, column, channel).
        """
        if self.window is None and self.render_mode == "human":
            pygame.init()
            pygame.display.init()
            self.window = pygame.display.set_mode(
                (self.window_size, self.window_size)
            )
        if self.clock is None and self.render_mode == "human":
            self.clock = pygame.time.Clock()

        canvas = pygame.Surface((self.window_size, self.window_size))
        canvas.fill((255, 255, 255))
        # The size of a single grid square in pixels
        pix_square_size = self.window_size / self.grid_size

        # First we draw the target
        pygame.draw.rect(
            canvas,
            (255, 0, 0),
            pygame.Rect(
                pix_square_size * self.target_position,
                (pix_square_size, pix_square_size),
            ),
        )
        # Now we draw the agent
        pygame.draw.circle(
            canvas,
            (0, 0, 255),
            (self.agent_position + 0.5) * pix_square_size,
            pix_square_size / 3,
        )
        # Finally, add some gridlines
        for x in range(self.grid_size + 1):
            pygame.draw.line(
                canvas,
                0,
                (0, pix_square_size * x),
                (self.window_size, pix_square_size * x),
                width=3,
            )
            pygame.draw.line(
                canvas,
                0,
                (pix_square_size * x, 0),
                (pix_square_size * x, self.window_size),
                width=3,
            )

        if self.render_mode == "human":
            # Copy our drawings from `canvas` to the visible window
            self.window.blit(canvas, canvas.get_rect())
            pygame.event.pump()
            pygame.display.update()
            # Keep human rendering at the predefined framerate; tick() adds
            # the delay needed to hold it stable.
            self.clock.tick(self.metadata["render_fps"])
        else:  # rgb_array
            return np.transpose(
                np.array(pygame.surfarray.pixels3d(canvas)), axes=(1, 0, 2)
            )

    def close(self):
        """Shut down the pygame window, if one was opened."""
        if self.window is not None:
            pygame.display.quit()
            pygame.quit()
            # Forget the dead window/clock so a later render re-creates them
            # instead of drawing onto a torn-down display.
            self.window = None
            self.clock = None

In [3]:
# Register the environment so it can also be created through gym.make().
# The entry point is the class itself: the previous 'gym_env:GridTesting'
# string named a module and class that do not exist in this notebook.
register(
    id='gym_env/GridTesting-v005',
    entry_point=ResourceManagerEnv,
    max_episode_steps=300,
)

In [4]:
# Build the environment directly (not via gym.make) on a 10x10 grid,
# rendering to a pygame window.
env = ResourceManagerEnv(
    grid_size=10,
    render_mode="human",
)


In [5]:
# Run the environment for a few episodes with a random policy
episodes = 5
# A random walk can take arbitrarily long to hit the target, and `env` was
# built directly, so no time-limit wrapper applies. Cap each episode at the
# same 300 steps used as max_episode_steps at registration.
max_steps_per_episode = 300

for episode in range(episodes):
    # reset() returns (observation, info)
    state, info = env.reset()
    done = False
    steps_taken = 0

    # No explicit env.render() call: in human mode step() already draws the frame.
    while not done and steps_taken < max_steps_per_episode:
        action = env.action_space.sample()
        state, reward, done, info = env.step(action)
        steps_taken += 1
        print(f"Agent position: {info['agent_position']}, Target position: {info['target_position']}, Reward: {reward}, Episode: {episode}")

Agent position: [8 3], Target position: [6 4], Reward: 0, Episode: 0
Agent position: [8 2], Target position: [6 4], Reward: 0, Episode: 0
Agent position: [8 1], Target position: [6 4], Reward: 0, Episode: 0
Agent position: [7 1], Target position: [6 4], Reward: 0, Episode: 0
Agent position: [7 0], Target position: [6 4], Reward: 0, Episode: 0
Agent position: [7 1], Target position: [6 4], Reward: 0, Episode: 0
Agent position: [7 0], Target position: [6 4], Reward: 0, Episode: 0
Agent position: [6 0], Target position: [6 4], Reward: 0, Episode: 0
Agent position: [5 0], Target position: [6 4], Reward: 0, Episode: 0
Agent position: [5 0], Target position: [6 4], Reward: 0, Episode: 0
Agent position: [5 1], Target position: [6 4], Reward: 0, Episode: 0
Agent position: [6 1], Target position: [6 4], Reward: 0, Episode: 0
Agent position: [6 2], Target position: [6 4], Reward: 0, Episode: 0
Agent position: [6 1], Target position: [6 4], Reward: 0, Episode: 0
Agent position: [6 2], Target posi

KeyboardInterrupt: 