In [None]:
## Only run this cell if you are using Google Colab.
## Skip it when running inside the docker container.
## Technically you can run these commands on your local machine (a linux distro with apt) to install everything into a local environment, preferably using conda or a python venv, but overall this is not recommended.
## NOTE: the %cd lines below use the /content/ directory, so the paths need adjusting outside of Colab.

# System packages installed through apt
!apt-get update && apt-get install swig cmake ffmpeg freeglut3-dev xvfb
# Clone rl-baselines3-zoo and install the packages listed in its requirements.txt
!git clone https://github.com/DLR-RM/rl-baselines3-zoo
%cd /content/rl-baselines3-zoo/
%pip install -r requirements.txt
%cd /content/
# Clone the gym-maze repository and install it as a package (imported later as `gym_maze`)
!git clone https://github.com/yusenz/gym-maze.git
%cd /content/gym-maze
%pip install .
%cd /content/
# OpenCV build without GUI dependencies (imported later as `cv2`)
%pip install opencv-python-headless

In [None]:
# Sanity check to see if you have the GPU set up correctly.
# With a working GPU this should display True, 0, and the name of your GPU
# (or some Tesla device on Colab).
# Without CUDA it displays False followed by a short notice; the device
# queries are skipped because they raise an error when no GPU is usable.
import torch
print(torch.cuda.is_available())
if torch.cuda.is_available():
    print(torch.cuda.current_device())
    print(torch.cuda.get_device_name(torch.cuda.current_device()))
else:
    print('CUDA is not available: no GPU device to report')

## Section 1: Toy environment recap
A simple 2D maze environment where an agent (blue dot) finds its way from the top left corner (blue square) to the goal at the bottom right corner (red square). 
The objective is to find the shortest path from the start to the goal.

<kbd>![Simple 2D maze environment](http://i.giphy.com/Ar3aKxkAAh3y0.gif)</kbd>

#### Action space
The agent may only choose to go up, down, right, or left ("N", "S", "E", "W"; or alternatively: 0, 1, 2, 3). If the way is blocked, it will remain at the same location. 

#### Observation space
The observation space is the (x, y) coordinate of the agent. The top left cell is `[0. 0.]`, and the bottom right cell is `[size-1, size-1]`.

#### Reward
A reward of 1 is given when the agent reaches the goal. For every step in the maze, the agent receives a reward of -0.1/(number of cells). This penalty can be adjusted by the `penalty=0.1` parameter, and the normalization is controlled by `penalty_normalize='size'`. I have implemented a few other normalizations: 'none', 'sqrt_size', 'log_size'. You can use `env = gym.make('maze-sample-10x10-v0',penalty=penalty,penalty_normalize=penalty_normalize)` to change the penalty and normalization.

#### End condition
The maze is reset when the agent reaches the goal (terminated), or when the maximum number of time steps (10,000 for the 10x10 maze) is reached (truncated). 

## Section 2: The agent
Again, this block of code defines the base agent. I saved `self.terminated` and `self.truncated` in the agent as well, so that the value can be checked in `update_step()` without changing the calling interface.

In [None]:
import numpy as np
import gym
class BaseAgent:
    """Base class holding the bookkeeping shared by every agent.

    It seeds the environment, stores the observation/action spaces, and
    tracks the per-episode cumulative reward and step count. Subclasses
    must implement ``select_action``.

    Attributes set in ``__init__``:
        env: the environment passed in.
        random_state: ``np.random.RandomState`` seeded with ``ran_seed + 3``.
        observation_space / action_space: the spaces taken from ``env``.
        observation_space_shape / action_space_shape: ``high - low + 1`` of
            the flattened space, or ``None`` when the flattened space is not
            a non-scalar ``gym.spaces.Box`` bounded on both sides.
        terminated / truncated / info: result of the latest step (not
            cumulative); meant to be written by the training loop.
        verbose: verbosity level used by the ``update_*`` messages.
        cumulative_reward / num_steps: running totals for the current episode.
    """

    def __init__(self, env, verbose=1, ran_seed=42):
        """Seed ``env`` and its spaces and initialise the episode counters.

        Args:
            env: environment exposing ``seed()``, ``observation_space`` and
                ``action_space``.
            verbose: verbosity level; values above 0 enable the base-class
                notices in ``update_rollout`` / ``update_replay``.
            ran_seed: base random seed; the env, action space, observation
                space and agent RNG use ``ran_seed`` + 0, 1, 2 and 3.
        """
        self.env = env
        # random seed is only set once when the agent is initialized
        self.env.seed(ran_seed)
        # the spaces are seeded explicitly, in addition to env.seed()
        self.env.action_space.seed(ran_seed+1)
        self.env.observation_space.seed(ran_seed+2)
        self.random_state = np.random.RandomState(ran_seed+3)
        # these values reflect the return value of each step, not cumulative
        self.terminated = False
        self.truncated = False
        self.info = None
        # Both *_shape attributes are always defined; they are None when the
        # extent cannot be derived (previously the attribute was left unset).
        self.observation_space = env.observation_space
        self.observation_space_shape = self._bounded_extent(self.observation_space)
        self.action_space = env.action_space
        self.action_space_shape = self._bounded_extent(self.action_space)
        self.verbose = verbose
        self.cumulative_reward = 0
        self.num_steps = 0

    @staticmethod
    def _bounded_extent(space):
        """Return ``high - low + 1`` of the flattened ``space``, else ``None``.

        ``None`` is returned unless the flattened space is a
        ``gym.spaces.Box`` with a non-empty shape that is bounded on both
        sides.
        """
        flat = gym.spaces.utils.flatten_space(space)
        if not isinstance(flat, gym.spaces.Box):
            return None
        if flat.shape is None or len(flat.shape) == 0:
            return None
        if not flat.is_bounded("both"):
            return None
        return flat.high - flat.low + 1

    def select_action(self, state):
        """Choose an action for ``state``; must be overridden by subclasses."""
        raise NotImplementedError

    def update_step(self, reward: float):
        """Add ``reward`` to the episode total and count one more step."""
        self.cumulative_reward += reward
        self.num_steps += 1

    def update_episode(self):
        """End-of-episode hook; the base version only resets the counters."""
        self.reset_episode()

    def update_rollout(self):
        """Rollout hook; the base version changes nothing and says so if verbose."""
        if self.verbose > 0:
            print('update_rollout in base class is called, nothing is changed')

    def update_replay(self):
        """Replay hook; the base version changes nothing and says so if verbose."""
        if self.verbose > 0:
            print('update_replay in base class is called, nothing is changed')

    def reset_episode(self):
        """Reset the per-episode counters and the latest step flags."""
        self.cumulative_reward = 0
        self.num_steps = 0
        self.terminated = False
        self.truncated = False
        self.info = None


In [None]:
class RandomAgent(BaseAgent):
    """Agent that ignores the state and draws every action from
    ``self.action_space.sample()``; it learns nothing between steps."""

    def __init__(self, *args, **kwargs):
        """Forward all arguments unchanged to ``BaseAgent.__init__``."""
        # set before the base initialiser runs
        self.cumulative_reward = 0
        super().__init__(*args, **kwargs)

    def select_action(self, state):
        """Return a sample from the action space; ``state`` is not used."""
        sampled = self.action_space.sample()
        if self.verbose > 1:
            print('Random agent selected action: ', sampled)
        return sampled

    def update_step(self, old_state, action, reward, new_state):
        """Record ``reward`` only; the transition itself is discarded."""
        super().update_step(reward)

    def update_episode(self, terminated, truncated):
        """Report how the episode ended (if verbose) and reset the counters."""
        if self.verbose > 0:
            outcomes = (
                (terminated, 'Episode terminated'),
                (truncated, 'Episode truncated'),
            )
            for happened, message in outcomes:
                if happened:
                    print(message)
        super().update_episode()

    def update_rollout(self):
        """Nothing to update for a random policy."""

    def update_replay(self):
        """Nothing to update for a random policy."""

## Section 3: The training loop


In [None]:
import gym
import gym_maze
import numpy as np
import sys
from matplotlib import pyplot as plt
import IPython.display as display
import cv2


def main_loop(agent, args):
    """Run the agent/environment interaction loop and plot the rewards.

    Args:
        agent: object providing ``select_action(state)``,
            ``update_step(old_state, action, reward, new_state)`` and
            ``update_episode(terminated, truncated)``, plus the attributes
            ``num_steps`` and ``cumulative_reward``. After every step its
            ``terminated``, ``truncated`` and ``info`` attributes are
            overwritten with the values returned by ``env.step``.
        args: dict with the keys
            ``'NUM_EPISODES'`` (maximum number of episodes),
            ``'MAX_STEPS_TOTAL'`` (step budget over all episodes),
            ``'RENDER_MAZE'`` (whether to render frames),
            ``'RENDER_EVERY'`` (render only episodes divisible by this),
            ``'verbose'`` (values above 1 print every selected action),
            ``'display_handle'`` (object whose ``update()`` receives the
            rendered image) and ``'env'`` (the environment).

    Returns:
        ``(cumulative_reward_array, num_steps_array)``: two lists with one
        entry per finished episode, read from the agent just before
        ``update_episode`` is called. An episode cut short by the total
        step budget is not recorded.
    """
    ## experiment parameters
    # Training terminates on either reaching NUM_EPISODES or MAX_STEPS_TOTAL
    NUM_EPISODES = args['NUM_EPISODES']
    MAX_STEPS_TOTAL = args['MAX_STEPS_TOTAL']
    RENDER_MAZE = args['RENDER_MAZE']
    RENDER_EVERY = args['RENDER_EVERY']  # render every RENDER_EVERY episode because rendering can be slow
    verbose = args['verbose']
    display_handle = args['display_handle']

    ## environment
    env = args['env']
    # initial reset kept so the environment sees the same sequence of calls
    # as before; its return value is not needed
    env.reset()

    ## additional parameter initialization
    total_steps = 0
    cumulative_reward_array = []
    num_steps_array = []

    ## main loop
    for episode in range(NUM_EPISODES):
        # The raw observation is kept separate from its flattened form so
        # that it is never flattened twice (flatten is not idempotent for
        # every space type).
        # NOTE(review): assumes env.reset() returns only the observation,
        # as the original code did; confirm against the environment.
        raw_obv = env.reset()
        # truncation is handled by the gym environment
        for _ in range(MAX_STEPS_TOTAL):
            total_steps += 1
            # flatten the states for the agent
            old_obv = gym.spaces.utils.flatten(env.observation_space, raw_obv)
            action = agent.select_action(old_obv)
            if np.any(action == np.array([0, 1, 2, 3])):
                action = int(action)  # needs to be explicitly converted to int

            if verbose > 1:
                print('Selected action: ', action)
            raw_obv, reward, terminated, truncated, info = env.step(action)
            agent.terminated = terminated
            agent.truncated = truncated
            agent.info = info
            if RENDER_MAZE and episode % RENDER_EVERY == 0:
                frame = env.render(mode="rgb_array")
                # IPython display does not accept a raw rgb array, so the
                # frame is encoded as PNG bytes first. Reversing the last
                # axis turns RGB into the BGR order used by cv2.imencode.
                bgr_frame = frame[:, :, ::-1]
                bgr_frame = cv2.resize(bgr_frame, (320, 320))
                _, frame_png = cv2.imencode('.png', bgr_frame)
                frame_bytes = frame_png.tobytes()
                display_handle.update(display.Image(data=frame_bytes))
            new_obv = gym.spaces.utils.flatten(env.observation_space, raw_obv)
            agent.update_step(old_obv, action, reward, new_obv)
            if terminated or truncated:
                print(f'Episode {episode} finished after {int(agent.num_steps)} steps with total reward {agent.cumulative_reward}')
                cumulative_reward_array.append(agent.cumulative_reward)
                num_steps_array.append(agent.num_steps)
                agent.update_episode(terminated, truncated)
                break
            if total_steps >= MAX_STEPS_TOTAL:
                break
        if total_steps >= MAX_STEPS_TOTAL:
            break

    plt.plot(cumulative_reward_array)
    plt.title(f'Cumulative reward per episode for agent {agent.__class__.__name__}')
    plt.xlabel('Episode')
    plt.ylabel('Cumulative reward')
    plt.show()

    # plot against number of steps spent
    num_steps_cumulative = np.cumsum(num_steps_array)
    plt.plot(num_steps_cumulative, cumulative_reward_array)
    plt.title(f'Cumulative reward against number of steps for agent {agent.__class__.__name__}')
    plt.xlabel('Number of steps')
    plt.ylabel('Cumulative reward')
    plt.show()
    return cumulative_reward_array, num_steps_array

