# Gymnasium Exploration

## Initialize Environment

In [7]:
import torch
from torch import nn
from torchvision import transforms as T
from PIL import Image
import numpy as np
from pathlib import Path
from collections import deque
import random, datetime, os, sys

%matplotlib inline



In [8]:
# A tool kit for RL
import gymnasium as gym
import matplotlib.pyplot as plt

from tensordict import TensorDict
from torchrl.data import TensorDictReplayBuffer, LazyMemmapStorage

# NES Emulator
from nes_py.wrappers import JoypadSpace

In [9]:
# Add the src directory to the import path.
# Guarded so that re-running this cell does not keep appending duplicate entries.
if '../src' not in sys.path:
    sys.path.append('../src')

In [10]:
# Create the Mario environment
env = gym.make("ALE/MarioBros-v5", render_mode="rgb_array")
# Gymnasium's reset() returns an (observation, info) pair; unpack it so that
# `state` holds the initial frame rather than the whole tuple.
state, info = env.reset()

In [11]:
# Report the environment's action and observation spaces
print(
    f"Action space: {env.action_space}",
    f"Observation space: {env.observation_space}",
    sep="\n",
)

Action space: Discrete(18)
Observation space: Box(0, 255, (210, 160, 3), uint8)


In [12]:
# Take a single step with action index 0 and inspect what the environment returns
next_state, reward, done, trunc, info = env.step(action=0)
print(
    f"Next state: {next_state.shape}",
    f"Reward: {reward}",
    f"Done?: {done}",
    f"Info: {info}",
    sep="\n",
)

Next state: (210, 160, 3)
Reward: 0.0
Done?: False
Info: {'lives': 5, 'episode_frame_number': 4, 'frame_number': 4}


## Preprocess Data

In the game of Mario, color information is not important. We can convert the image to grayscale to reduce the dimensionality of the observation space.

We use Wrappers to preprocess the data before feeding it to the agent.

In [None]:
class SkipFrame(gym.Wrapper):
    """Repeat every action for a fixed number of frames and return the last result."""

    def __init__(self, env, skip):
        """
        Initialize the SkipFrame wrapper.

        Parameters:
        env (gym.Env): The environment to wrap.
        skip (int): The number of frames each action is repeated for (at least 1).

        Raises:
        ValueError: If skip is smaller than 1.
        """
        super().__init__(env)
        if skip < 1:
            raise ValueError(f"skip must be at least 1, got {skip}")
        self._skip = skip

    def step(self, action):
        """
        Repeat the action for the specified number of frames and sum the reward.

        The repetition stops early as soon as the wrapped environment reports
        that the episode has terminated or been truncated.

        Parameters:
        action: The action to perform on the environment.

        Returns:
        obs: The observation after the last performed step.
        total_reward: The sum of rewards obtained from repeated actions.
        done: Whether the episode has ended (terminated).
        trunk: Whether the episode was truncated.
        info: Additional info from the last performed step.
        """
        total_reward = 0.0
        for _ in range(self._skip):
            obs, reward, done, trunk, info = self.env.step(action)
            total_reward += reward
            # Do not step an environment whose episode is already over
            if done or trunk:
                break
        return obs, total_reward, done, trunk, info


class GrayScaleObservation(gym.ObservationWrapper):
    """Convert RGB frames of the wrapped environment to grayscale tensors."""

    def __init__(self, env):
        """
        Initialize the GrayScaleObservation wrapper.

        The advertised observation space keeps only the first two axes
        (height, width) of the wrapped environment's observation space.

        Parameters:
        env (gym.Env): The environment to wrap.
        """
        super().__init__(env)
        height_width = self.observation_space.shape[:2]
        self.observation_space = gym.spaces.Box(
            low=0, high=255, shape=height_width, dtype=np.uint8
        )

    def permute_orientation(self, observation):
        """
        Permute the observation array from [H, W, C] to [C, H, W].

        Parameters:
        observation (numpy.ndarray): The original observation array.

        Returns:
        observation (torch.Tensor): The permuted observation as a float tensor.
        """
        channel_first = np.transpose(observation, (2, 0, 1))
        # np.transpose returns a view; copy it so the tensor gets its own
        # contiguous buffer
        return torch.tensor(channel_first.copy(), dtype=torch.float)

    def observation(self, observation):
        """
        Convert the observation to grayscale.

        Parameters:
        observation (numpy.ndarray): The original [H, W, C] observation array.

        Returns:
        observation (torch.Tensor): The grayscale observation tensor of shape
            [1, H, W]. The singleton channel axis is kept so that torchvision
            transforms expecting [..., H, W] input can be applied afterwards.
        """
        channel_first = self.permute_orientation(observation)
        to_gray = T.Grayscale()
        return to_gray(channel_first)


class ResizeObservation(gym.ObservationWrapper):
    """Resize every observation of the wrapped environment to a fixed size."""

    def __init__(self, env, shape):
        """
        Initialize the ResizeObservation wrapper.

        Parameters:
        env (gym.Env): The environment to wrap.
        shape (int or tuple): The desired shape for the observation. An int
            is expanded to a square (shape, shape); otherwise a
            (height, width) pair is expected.

        Raises:
        ValueError: If shape is a sequence whose length is not 2.
        """
        super().__init__(env)
        if isinstance(shape, int):
            self.shape = (shape, shape)
        else:
            self.shape = tuple(shape)
        if len(self.shape) != 2:
            raise ValueError(
                f"shape must be an int or a (height, width) pair, got {shape!r}"
            )

        wrapped_shape = self.observation_space.shape
        # A 3-D wrapped space is treated as channel-last ([H, W, C]); remember
        # that so observation() can move the channel axis around the resize.
        self._channels_last = len(wrapped_shape) == 3
        # Any axes after height and width are carried over unchanged
        self.observation_space = gym.spaces.Box(
            low=0, high=255, shape=self.shape + wrapped_shape[2:], dtype=np.uint8
        )

    def observation(self, observation):
        """
        Resize the observation to the specified shape.

        Pixel values are only interpolated, not rescaled; any normalization
        is left to the consumer of the observation.

        Parameters:
        observation (numpy.ndarray or torch.Tensor): The original observation.
            Either [H, W, C] (when the wrapped space is 3-D) or [H, W] /
            [1, H, W] (when the wrapped space is 2-D).

        Returns:
        observation (torch.Tensor): The resized observation tensor, laid out
            like the advertised observation space.
        """
        frame = torch.as_tensor(observation)
        resize = T.Resize(self.shape, antialias=True)
        if self._channels_last:
            # torchvision resizes the last two axes: go [H, W, C] -> [C, H, W]
            # for the resize and back to [H, W, C] afterwards
            return resize(frame.permute(2, 0, 1)).permute(1, 2, 0)
        if frame.ndim == 2:
            # Add the leading channel axis torchvision needs for interpolation
            frame = frame.unsqueeze(0)
        # Drop the singleton channel axis so the result is [H, W]
        return resize(frame).squeeze(0)


After applying the wrappers above together with `FrameStack` (applied in the next cell), the final wrapped state consists of 4 consecutive gray-scaled frames stacked together. Each time Mario takes an action, the environment responds with a state of this structure. The structure is represented by a 3-D array of shape [4, 84, 84].

In [14]:
import gymnasium as gym
from gymnasium.wrappers import FrameStack

# import wrappers from ../src/env/wrappers.py
# NOTE: this rebinds the SkipFrame / GrayScaleObservation / ResizeObservation
# names defined earlier in the notebook to the versions from the src package.
from env.wrappers import SkipFrame, GrayScaleObservation, ResizeObservation

# Apply Wrappers to environment.
# `env` is rebound in place, so guard against wrapping it a second time when
# this cell is re-run without recreating the base environment first.
if not isinstance(env, FrameStack):
    env = SkipFrame(env, skip=4)
    env = GrayScaleObservation(env)
    env = ResizeObservation(env, shape=84)

    env = FrameStack(env, num_stack=4)

# Print the observation space
print(f"Observation space after preprocessing: {env.observation_space}")

Observation space after preprocessing: Box(0, 255, (4, 84, 84), uint8)


## Agent

We create a class Mario to represent our agent in the game. Mario should be able to:

- Act according to the optimal action policy based on the current state (of the environment).

- Remember experiences. Experience = (current state, current action, reward, next state). Mario caches and later recalls his experiences to update his action policy.

- Learn a better action policy over time.

In [15]:
class Mario:
    """
    Skeleton of the Mario agent.

    Every method is currently a placeholder that does nothing and returns
    None; the class only fixes the interface the agent is expected to expose.
    """

    def __init__(self):
        """Create the agent (placeholder: no state is set up yet)."""
        # `self` is required here: without it, Mario() raises a TypeError
        # because Python always passes the new instance to __init__.
        pass

    def act(self, state):
        """Given a state, choose an epsilon-greedy action"""
        pass

    def cache(self, experience):
        """Add the experience to memory"""
        pass

    def recall(self):
        """Sample experiences from memory"""
        pass

    def learn(self):
        """Update online action value (Q) function with a batch of experiences"""
        pass