## Initial Setup for the Mario Game

In [None]:
!pip install gym_super_mario_bros==7.3.0 nes_py

In [None]:
# Import the game from OpenAI Gym
import gym_super_mario_bros

# Import the Joypad Wrapper
from nes_py.wrappers import JoypadSpace

# Import the Simplified controls
from gym_super_mario_bros.actions import SIMPLE_MOVEMENT

In [None]:
# Set up the game: the original Super Mario Bros, wrapped so the agent chooses
# from the SIMPLE_MOVEMENT button combos (7 controls instead of 256)
env = JoypadSpace(
    gym_super_mario_bros.make('SuperMarioBros-v0'),
    SIMPLE_MOVEMENT,
)

# Handy inspection snippets:
# env.observation_space.shape # the game frames
# env.action_space # the possible actions (controls)

In [None]:
# Play the game with random actions for a fixed number of frames.
done = True # flag for whether or not to restart the game
try:
    for frame in range(100000): # loop through each frame in the game
        if done: # start the game by resetting the environment
            env.reset()
        state, reward, done, info = env.step(env.action_space.sample()) # perform a random action
        # state: frame from the game (colored image)
        # reward: point system to train mario to go as far to the right as possible
        # done: whether the game ended (mario is dead)
        # info: coins collected, status, etc.
        env.render() # render the game onto the screen
finally:
    # Close the game even if the loop is interrupted or a step raises,
    # so the render window is not left open.
    env.close()

## Environment Preprocessing

In [None]:
# Install pytorch
!pip install torch==1.10.1+cu113 torchvision==0.11.2+cu113 torchaudio===0.10.1+cu113 -f https://download.pytorch.org
# CUDA 11.3 used due to parallel processing abilities allowing more data processing with less power consumption

In [None]:
# Install stable baselines for Reinforcement Learning
!pip install stable-baselines3[extra] # Proximal Policy Optimization used to train the RL Model (made by OpenAI)

In [None]:
# Import the GrayScaling Wrapper (the frame stacker is VecFrameStack, imported below)
from gym.wrappers import GrayScaleObservation

# Import Vectorization Wrappers
from stable_baselines3.common.vec_env import VecFrameStack, DummyVecEnv # vectorization for frame stack

# Import Matplotlib to show the impact of frame stacking
from matplotlib import pyplot as plt

In [None]:
# Preprocessing pipeline: every stage wraps the one before it, and only the
# final, fully wrapped environment is bound to `env`.

# Stage 1: the raw game
base_env = gym_super_mario_bros.make('SuperMarioBros-v0')

# Stage 2: restrict the controls to SIMPLE_MOVEMENT
joypad_env = JoypadSpace(base_env, SIMPLE_MOVEMENT)

# Stage 3: grayscale the frames, cutting the data to 33% (61440 pixels per frame)
gray_env = GrayScaleObservation(joypad_env, keep_dim=True)

# Stage 4: put the single environment inside a vectorized (dummy) environment,
# which the frame stacker operates on
vec_env = DummyVecEnv([lambda: gray_env])

# Stage 5: stack the 4 most recent frames along the last (channel) axis
env = VecFrameStack(vec_env, 4, channels_order='last')

In [None]:
state = env.reset() # reset the environment and keep the first observation
# plt.imshow(state[0]) # optional: plot the observation, which is now grayscale rather than a color frame

In [None]:
# Take one step with action index 5 (an index into SIMPLE_MOVEMENT; presumably the jump button - confirm against gym_super_mario_bros.actions).
# The action is passed as a one-element list: one entry for the single environment inside the vectorized wrapper.
state, reward, done, info = env.step([5])

In [None]:
# Visualize the stacked frames side by side, one subplot per frame
num_frames = state.shape[3] # the frames are stacked along the last axis
plt.figure(figsize=(20,16))
for idx in range(num_frames):
    # The column count follows the stack size, so changing the number of
    # stacked frames does not break the subplot grid.
    plt.subplot(1, num_frames, idx+1)
    plt.imshow(state[0][:,:,idx]) # state[0] is the observation of the first (only) environment
plt.show()

## Training the Reinforcement Learning Model

In [None]:
# Import os for file path management
import os

# Import PPO for algos
from stable_baselines3 import PPO

# Import Base Callback for saving models
from stable_baselines3.common.callbacks import BaseCallback

In [None]:
# Callback to save the model every `check_freq` steps (takes a LOT of space!)
class TrainAndLoggingCallback(BaseCallback):
    """Periodically checkpoint the model being trained.

    Every time the inherited step counter ``n_calls`` is a multiple of
    ``check_freq``, the current model is saved to
    ``<save_path>/best_model_<n_calls>``.

    Args:
        check_freq: Number of callback steps between two checkpoints.
        save_path: Directory that receives the checkpoints; it is created
            when the callback is initialised. ``None`` disables saving.
        verbose: Verbosity level forwarded to ``BaseCallback``.
    """

    def __init__(self, check_freq, save_path, verbose=1):
        super().__init__(verbose)
        self.check_freq = check_freq
        self.save_path = save_path

    def _init_callback(self) -> None:
        """Create the checkpoint directory if a save path was given."""
        if self.save_path is not None:
            os.makedirs(self.save_path, exist_ok=True)

    def _on_step(self) -> bool:
        """Save a checkpoint on every ``check_freq``-th call.

        Returns:
            Always ``True``, so this callback never asks training to stop.
        """
        # Mirror the None check in _init_callback: without a save path there is
        # nowhere to write, and os.path.join(None, ...) would raise TypeError.
        if self.save_path is not None and self.n_calls % self.check_freq == 0:
            model_path = os.path.join(self.save_path, f'best_model_{self.n_calls}')
            self.model.save(model_path)

        return True

In [None]:
# Output locations used during training
LOG_DIR = "./logs/"          # passed to PPO as its tensorboard log directory
CHECKPOINT_DIR = "./train/"  # where the callback writes model checkpoints

In [None]:
# Build the model-saving callback: one checkpoint into CHECKPOINT_DIR every 10,000 steps
callback = TrainAndLoggingCallback(
    save_path=CHECKPOINT_DIR,
    check_freq=10000,
)

In [None]:
# Create the AI model (a PPO agent) that will be trained on the preprocessed environment
model = PPO(
    'CnnPolicy',              # CNN (Convolutional Neural Network) policy: processes images fast; the alternative is the MLP policy
    env,
    verbose=1,                # get back a lot of information from the training
    tensorboard_log=LOG_DIR,  # directory for the tensorboard logs
    learning_rate=0.000001,   # the rate at which the model learns (training can take a long time)
    n_steps=512,              # the number of frames to wait before updating the model
)

In [None]:
# Train the AI model for one million timesteps; the callback saves checkpoints along the way.
# !-- This takes a while --! (GPU preferred)
model.learn(
    total_timesteps=1_000_000,
    callback=callback,
)

In [None]:
# Save the model as it stands after training under the name 'testModel'
# (separate from the periodic checkpoints written by the callback)
model.save('testModel')

## Testing the RL Model

In [None]:
# Load model
# The path follows the callback's naming scheme (CHECKPOINT_DIR + 'best_model_<n_calls>'),
# so this checkpoint only exists if training ran long enough to reach call 1,000,000.
model = PPO.load('./train/best_model_1000000') # loads the saved checkpoint into `model`

In [None]:
# Reset the environment to get an initial state.
# NOTE(review): the playback cell below calls env.reset() again before its loop, so this reset looks redundant.
state = env.reset()

In [None]:
# Start the game
state = env.reset()
# Loop through the game, letting the trained model choose every action.
# The loop never checks `done`, so it has no natural end: interrupting the
# kernel is the expected way to stop playback.
try:
    while True:
        action, _ = model.predict(state) # gets the action from the model's prediction for the given state that is passed in
        state, reward, done, info = env.step(action) # getting the state, reward, done and info from the environment (same as above)
        env.render() # rendering the environment onto the screen
except KeyboardInterrupt:
    pass # a manual stop is the normal way out of the loop, not an error
finally:
    # Close the game like the random-play loop does, so the render window is
    # released instead of being left open after the interrupt.
    # NOTE(review): a closed environment presumably has to be rebuilt
    # (re-run the preprocessing cell) before playing again - confirm.
    env.close()