# 1. Setup Mario Environment

# 1.1 Install and Import Dependencies

In [4]:
!pip install gym_super_mario_bros==7.3.0 nes_py
# gym-super-mario-bros - super mario environment based on openai gymnasium
# nes-py - emulator that allows python to interact with super-mario through simulated joystick



In [1]:
# Import the Game
import gym_super_mario_bros

# Import Joypad wrapper
from nes_py.wrappers import JoypadSpace

# Import the SIMPLIFIED controls
from gym_super_mario_bros.actions import SIMPLE_MOVEMENT

from matplotlib import pyplot as plt

In [3]:
# SIMPLE MOVEMENT - Simplify number of actions that mario is able to do
SIMPLE_MOVEMENT

[['NOOP'],
 ['right'],
 ['right', 'A'],
 ['right', 'B'],
 ['right', 'A', 'B'],
 ['A'],
 ['left']]

## 1.2 Setup Game

In [2]:
# Create gym environment
env = gym_super_mario_bros.make('SuperMarioBros-v0',apply_api_compatibility=True, render_mode='human') # Standard Super Mario Game Visual Environment

# There are 256 discrete actions the AI can make
env.action_space 

  logger.warn(
  logger.warn(


Discrete(256)

In [3]:
# Wrap environment in JoypadSpace 
env = JoypadSpace(env, SIMPLE_MOVEMENT)

# JoypadSpace.reset Configuration
JoypadSpace.reset = lambda self, **kwargs: self.env.reset(**kwargs)

# Reduced the number of distinct actions to 7 (SIMPLE_MOVEMENT), making the model significantly easier to train
env.action_space

Discrete(7)

In [5]:
env.observation_space.shape # size of data frame returned from the environment (x, y, RGB)

(240, 256, 3)

## 1.3 Test Environment with Random Actions

In [7]:
env.close() # Run this cell to shut down python instance

# 2. Preprocess Environment

## 2.1 Data Analysis

In [8]:
state = env.reset()
state

OSError: exception: access violation reading 0x000000000003C200

In [None]:
# Data Frame Immediately after Action
env.step(1)[0]

In [None]:
# Reward 
env.step(1)[1]

In [None]:
# Game ended (terminated) - dead
env.step(1)[2]

In [None]:
# Game ended due to max_steps or timeout (truncated)
env.step(1)[3]

In [None]:
# Dictionary of Game Info
env.step(1)[4]

In [None]:
plt.imshow(state[0])

## 2.2 Imports for Preprocessing

In [1]:
'''Install PyTorch'''
!pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu121

'''Install stable-baselines for Reinforcement Learning''' 
!pip install stable-baselines3[extra] 
# Models Included - A2C, DDP, DQN, HER, PPO, SAV, TD3

Looking in indexes: https://download.pytorch.org/whl/cu121


In [4]:
'''Import Frame Stacker Wrapper and GrayScaling Wrapper '''
from gym.wrappers import GrayScaleObservation 
# GrayScaleObservation - reduces amount of data that the model needs to process

''' Import Vectorization Wrappers '''
from stable_baselines3.common.vec_env import VecFrameStack, DummyVecEnv

# VecFrameStack - work with stacked environments. model can see last several frames to identify Mario's movement(velocity, trajectory, etc.), other patterns and environment interactions.
# DummyVecEnv - wraps environment in a vectorization wrapper

## 2.3 Create new enviornment with preprocessing wrappers

In [5]:
''' 1. Create Base Environment '''
env = gym_super_mario_bros.make('SuperMarioBros-v0',apply_api_compatibility=True, render_mode='human')
''' 2. Simplify Controls '''
env = JoypadSpace(env, SIMPLE_MOVEMENT)
''' 3. Gray Scale '''
env = GrayScaleObservation(env, keep_dim=True) #Reduces number of data to process by factor of 3 (61,440 vs 184,320 points per frame)
# gray_env = GrayScaleObservation(env, keep_dim=True)
''' 4. Wrap inside Dummy Environment '''
env = DummyVecEnv([lambda: env]) # passes list of environment
''' 5. Stack Frames '''
env = VecFrameStack(env, 4, channels_order='last') # stack 4 frames stored as the last set of data in the array

  logger.warn(
  logger.warn(


In [10]:
state = env.reset()



In [None]:
state.shape

In [None]:
plt.imshow(state[0])

In [None]:
env.action_space

In [11]:
'''Step Through the Frames Performing an action from SIMPLE_MOVEMENT or sample from action space'''
obs, reward, done, info  = env.step([1]) # env.action_space.sample()

  if not isinstance(terminated, (bool, np.bool8)):


In [None]:
state.shape

In [None]:
'''Plot Progressive Steps'''
plt.figure(figsize=(20,16))
for idx in range(state.shape[3]):
    plt.subplot(1,4,idx+1)
    plt.imshow(state[0][:,:,idx])
plt.show()

# 3. Train Proximal Policy Optimization (PPO) RL Model
Agent - Mario
Reward - 
Environment - NES Emulator
Action - SIMPLE_MOVEMENT

## 3.1 Imports for Training

In [6]:
''' Import OS for file path management '''
import os

''' Import PPO Algorithm '''
from stable_baselines3 import PPO

''' Import Base Callback for Saving Models '''
from stable_baselines3.common.callbacks import BaseCallback

## 3.2 Implement Model Saving

In [7]:
class TrainAndLoggingCallback(BaseCallback):

    def __init__(self, check_freq, save_path, verbose=1): # check frequency - how many iterations between saves
        super(TrainAndLoggingCallback, self).__init__(verbose)
        self.check_freq = check_freq
        self.save_path = save_path

    def _init_callback(self):
        if self.save_path is not None:
            os.makedirs(self.save_path, exist_ok=True)

    def _on_step(self):
        if self.n_calls % self.check_freq == 0:
            model_path = os.path.join(self.save_path, 'best_model_{}'.format(self.n_calls))
            self.model.save(model_path)

        return True

## 3.3 Create Storage Directories

In [8]:
CHECKPOINT_DIR = './train/' # Store all saved models
LOG_DIR = './logs/' # Store tensorflow logs of each time the learning algorithm is run

## 3.4 Create Callback

In [9]:
callback = TrainAndLoggingCallback(check_freq=100000, save_path=CHECKPOINT_DIR) # (frequency of model saves, where to store models)

## 3.5 Create Model

In [17]:
model = PPO('CnnPolicy', 
            env, 
            verbose=1, 
            tensorboard_log=LOG_DIR, 
            learning_rate=0.000001, 
            n_steps=512)
# Policy - Use CNN Policy as it is optimized for images
# verbose = 1 - get more training info
# tensorboard_log - see metrics on training performance
# learning_rate - keep low to ensure good convergence
# n_steps - number of frames to run per game before updating the network

Using cuda device
Wrapping the env in a VecTransposeImage.


## 3.6 Train Model

In [None]:
model.learn(
    total_timesteps=10000000, 
    callback=callback)
# total_timesteps - number of frames to run

In [16]:
env.close()

In [None]:
model.learn(total_timesteps=10000000, callback=callback)

## 3.7 Continue Training

In [10]:
model = PPO.load('./train/best_model_1300000', tensorboard_log=LOG_DIR)
model.set_env(env)

Wrapping the env in a VecTransposeImage.


In [None]:
model.learn(
    total_timesteps=10000000, 
    callback = callback)

Logging to ./logs/PPO_3


  if not isinstance(terminated, (bool, np.bool8)):
  return (self.ram[0x86] - self.ram[0x071c]) % 256


----------------------------
| time/              |     |
|    fps             | 100 |
|    iterations      | 1   |
|    time_elapsed    | 5   |
|    total_timesteps | 512 |
----------------------------
-----------------------------------------
| time/                   |             |
|    fps                  | 86          |
|    iterations           | 2           |
|    time_elapsed         | 11          |
|    total_timesteps      | 1024        |
| train/                  |             |
|    approx_kl            | 0.009035625 |
|    clip_fraction        | 0.0865      |
|    clip_range           | 0.2         |
|    entropy_loss         | -0.349      |
|    explained_variance   | 0.949       |
|    learning_rate        | 1e-06       |
|    loss                 | 60.9        |
|    n_updates            | 35160       |
|    policy_gradient_loss | -0.00465    |
|    value_loss           | 141         |
-----------------------------------------
-----------------------------------------

# 4. Test Model

# 5. Iterate