In [1]:
import gym
import gym_super_mario_bros
from gym.wrappers import GrayScaleObservation
from gym_super_mario_bros.actions import SIMPLE_MOVEMENT
from stable_baselines3 import PPO
from stable_baselines3.common.vec_env import DummyVecEnv, VecFrameStack
from stable_baselines3.common.evaluation import evaluate_policy

ModuleNotFoundError: No module named 'gym_super_mario_bros'

In [3]:
# Diagnostic: list the installed packages whose name contains "gym".
# NOTE(review): `!pip` runs whichever pip is first on the shell PATH, which may
# not be the kernel's environment; `%pip` is the safer spelling -- confirm.
!pip list | grep gym

gym                       0.26.2
gym-notices               0.0.8


In [None]:
# Build the training environment in stages:
#   raw game -> reduced action set -> grayscale -> vectorized -> 4-frame stack.
# Only the finished, vectorized environment is bound to `env`.


class SimpleMovementActions(gym.ActionWrapper):
    """Expose a short list of button combinations as a Discrete action space.

    Each entry of ``combos`` is a list of button names (for example
    ``['right', 'A']``). The agent picks an index into ``combos``; the wrapper
    forwards the matching controller byte to the wrapped environment.

    Args:
        env: the environment to wrap.
        combos: list of button-name lists, e.g. ``SIMPLE_MOVEMENT``.

    Raises:
        KeyError: if a combination names a button missing from ``BUTTON_BITS``.
    """

    # One bit per controller button.
    # NOTE(review): bit layout mirrors the button map of nes_py's JoypadSpace
    # wrapper -- confirm against the installed nes_py version.
    BUTTON_BITS = {
        'right': 0b10000000,
        'left': 0b01000000,
        'down': 0b00100000,
        'up': 0b00010000,
        'start': 0b00001000,
        'select': 0b00000100,
        'B': 0b00000010,
        'A': 0b00000001,
        'NOOP': 0b00000000,
    }

    def __init__(self, env, combos):
        super().__init__(env)
        # Pre-compute one controller byte per allowed combination.
        self._codes = []
        for combo in combos:
            code = 0
            for button in combo:
                code |= self.BUTTON_BITS[button]
            self._codes.append(code)
        self.action_space = gym.spaces.Discrete(len(self._codes))

    def action(self, action):
        """Translate a discrete action index into its controller byte."""
        return self._codes[action]


# Create the Super Mario Bros environment
mario_env = gym_super_mario_bros.make('SuperMarioBros-v0')

# Limit actions to the SIMPLE_MOVEMENT button combinations.
# (`gym.wrappers` has no `ActionWrapper`, and `gym.ActionWrapper` is an
# abstract base that accepts only `env`, so a concrete subclass is required.)
mario_env = SimpleMovementActions(mario_env, SIMPLE_MOVEMENT)

# Convert observations to grayscale to reduce complexity
mario_env = GrayScaleObservation(mario_env, keep_dim=True)

# Stack 4 frames to give the agent a sense of motion.
# The factory closes over `mario_env`, a name that is never rebound to the
# vectorized wrapper, so it keeps returning the single wrapped game.
env = VecFrameStack(DummyVecEnv([lambda: mario_env]), n_stack=4)

In [None]:
# Create a PPO (Proximal Policy Optimization) agent that uses the CNN policy
# on the prepared environment, with progress logging enabled.
model = PPO(policy='CnnPolicy', env=env, verbose=1)

In [None]:
# Run training for a budget of one million environment steps
training_budget = 1_000_000
model.learn(total_timesteps=training_budget)

In [None]:
# Persist the trained agent to disk so it can be reloaded later
model.save(path="mario_ppo")

In [None]:
# Restore the agent from the checkpoint written by the save cell
model = PPO.load(path="mario_ppo")

In [None]:
# Evaluate the trained agent over 10 episodes.
# evaluate_policy returns two values here; the second is bound to a real name
# instead of `_`, because assigning to `_` in IPython overrides the shell's
# "last output" variable for the rest of the session.
mean_reward, std_reward = evaluate_policy(model, env, n_eval_episodes=10)
print(f'Mean reward: {mean_reward}')

In [None]:
# Watch the agent play one episode, rendering every step.
obs = env.reset()
done = False
try:
    while not done:
        # The second return value of predict() is unused here; it is kept out
        # of `_` so IPython's last-output variable is not clobbered.
        action, _state = model.predict(obs)
        obs, reward, done, info = env.step(action)
        # `env` wraps a single game in a vectorized env, so `done` comes back
        # as a one-element array; read that element explicitly.
        done = bool(done[0])
        env.render()
finally:
    # Release the environment even if the loop is interrupted
    # (e.g. by stopping the cell) or a step raises.
    env.close()