# 2025 DL Lab8: RL Assignment_Super Mario World

**Your Answer:**    
Hi I'm XXX, XXXXXXXXXX.

## Overview
This project implements a **Deep Reinforcement Learning** pipeline to train an autonomous agent for Super Mario World. Leveraging the **Proximal Policy Optimization (PPO)** algorithm, the system interacts with the **stable-retro** environment to master the YoshiIsland1 level. Key components include a custom Vision Backbone for extracting features from raw pixel data and a suite of Environment Wrappers that handle frame preprocessing, action discretization, and reward shaping to facilitate efficient learning.

Reward function implement  
should do something in the beginning (monster attack)  
Custom PPO implement  
pre train weight 差不多，主要是 reward function  
model weight capacity 1GB  
class name 不要動 (可以新增，但是原本有的不要動)

## Imports

In [1]:
import os
import numpy as np
import retro
from stable_baselines3.common.vec_env import DummyVecEnv, SubprocVecEnv

from eval import evaluate_policy, record_video
from custom_policy import VisionBackbonePolicy, CustomPPO

  from .autonotebook import tqdm as notebook_tqdm


## Configuration

In [2]:
# Game Settings
GAME = "SuperMarioWorld-Snes"
STATE = "YoshiIsland1"

# Training Settings
TOTAL_STEPS = 8_000_000
TRAIN_CHUNK = 800_000
N_ENVS = 16
LEARNING_RATE = 2.5e-4

# Evaluation & Recording Settings
EVAL_EPISODES = 3
EVAL_MAX_STEPS = 18000
RECORD_STEPS = 18000

# Directories
LOG_DIR = "./runs_smw"
VIDEO_DIR       = os.path.join(LOG_DIR, "videos")
CKPT_DIR        = os.path.join(LOG_DIR, "checkpoints")
TENSORBOARD_LOG = os.path.join(LOG_DIR, "tb")

os.makedirs(LOG_DIR,   exist_ok=True)
os.makedirs(CKPT_DIR,  exist_ok=True)
os.makedirs(VIDEO_DIR, exist_ok=True)

## Environment Functions

In [3]:
from wrappers import make_base_env
def _make_env_thunk(game: str, state: str):
    """Return a function that creates an environment (for multiprocessing)."""
    def _thunk():
        return make_base_env(game, state)
    return _thunk

def make_vec_env(game: str, state: str, n_envs: int, use_subproc: bool = True):
    """Create a vectorized environment (multiple envs running in parallel)."""
    env_fns = [_make_env_thunk(game, state) for _ in range(n_envs)]
    
    if use_subproc and n_envs > 1:
        vec_env = SubprocVecEnv(env_fns)
    else:
        vec_env = DummyVecEnv(env_fns)

    return vec_env


## Initialize Env & Model

In [4]:
# 1. Create Training Environment
train_env = make_vec_env(GAME, STATE, n_envs=N_ENVS)
print(f"Environment created: {GAME} - {STATE} with {N_ENVS} parallel envs.")

checkpoint_path = "None"
checkpoint_path = "runs_smw/checkpoints/p84_step_100000.zip"

# 2. Initialize Model
if os.path.exists(checkpoint_path):
    print(f"Loading model from {checkpoint_path}...")
    # 讀取現有模型
    model = CustomPPO.load(
        checkpoint_path, 
        env=train_env,
        device="cuda:0" # 確保使用 GPU
    )
else:
    model = CustomPPO(
        VisionBackbonePolicy,
        train_env,
        policy_kwargs   = dict(normalize_images=False),
        n_epochs        = 10,
        n_steps         = 512,
        batch_size      = 512,
        learning_rate   = LEARNING_RATE,
        verbose         = 1,
        gamma           = 0.99,
        kl_coef         = 1,
        clip_range      = 0.5,
        tensorboard_log = TENSORBOARD_LOG,
    )

Environment created: SuperMarioWorld-Snes - YoshiIsland1 with 16 parallel envs.
Loading model from runs_smw/checkpoints/p84_step_100000.zip...


## Training Loop

In [5]:
best_mean = -1e18
trained = 100000
round_idx = 0

try:
    while trained < TOTAL_STEPS:
        round_idx += 1
        chunk = min(TRAIN_CHUNK, TOTAL_STEPS - trained)
        # chunk = 2000

        print(f"\n=== Round {round_idx} | Learn {chunk} steps (Total trained: {trained}) ===")
        
        # --- Train ---
        model.learn(total_timesteps=chunk, reset_num_timesteps=False)
        trained += chunk

        # --- Save Checkpoint ---
        ckpt_path = os.path.join(CKPT_DIR, f"p84_step_{trained}.zip")
        model.save(ckpt_path)
        print(f"Saved checkpoint: {ckpt_path}")

        # --- Evaluate ---
        mean_ret, best_ret = evaluate_policy(
            model,
            GAME,
            STATE,
            n_episodes=EVAL_EPISODES,
            max_steps=EVAL_MAX_STEPS,
        )
        print(f"[EVAL] Mean Return: {mean_ret:.3f}, Best Return: {best_ret:.3f}")

        # --- Save Best Model ---
        if mean_ret > best_mean:
            best_mean = mean_ret
            best_path = os.path.join(LOG_DIR, "best_model.zip")
            model.save(best_path)
            print(f"New best record. Saved to {best_path}")

        # --- Record Video ---
        record_video(
            model,
            GAME,
            STATE,
            VIDEO_DIR,
            video_len=RECORD_STEPS,
            prefix=f"step_{trained}_mean_{mean_ret:.2f}",
        )

except KeyboardInterrupt:
    print("\nTraining interrupted manually.")

finally:
    train_env.close()
    print("Training finished. Environment closed.")


=== Round 1 | Learn 800000 steps (Total trained: 100000) ===
Logging to ./runs_smw/tb/MyPPO_0


-------------------------------
| time/              |        |
|    fps             | 1448   |
|    iterations      | 1      |
|    time_elapsed    | 5      |
|    total_timesteps | 114688 |
-------------------------------
---------------------------------------
| time/                   |           |
|    fps                  | 617       |
|    iterations           | 2         |
|    time_elapsed         | 26        |
|    total_timesteps      | 122880    |
| train/                  |           |
|    approx_kl            | 0.0395276 |
|    clip_fraction        | 0.00526   |
|    clip_range           | 0.5       |
|    entropy_loss         | -2.37     |
|    explained_variance   | 0.973     |
|    learning_rate        | 0.00025   |
|    loss                 | 3.93      |
|    n_updates            | 270       |
|    policy_gradient_loss | -0.00955  |
|    value_loss           | 10.3      |
---------------------------------------
----------------------------------------
| time/        

Saved checkpoint: ./runs_smw/checkpoints/p84_step_900000.zip
[EVAL] Mean Return: -79.600, Best Return: -79.600
New best record. Saved to ./runs_smw/best_model.zip
Saved video to ./runs_smw/videos/step_900000_mean_-79.60.mp4

=== Round 2 | Learn 800000 steps (Total trained: 900000) ===
Logging to ./runs_smw/tb/MyPPO_0
-------------------------------
| time/              |        |
|    fps             | 1371   |
|    iterations      | 1      |
|    time_elapsed    | 5      |
|    total_timesteps | 917504 |
-------------------------------
------------------------------------------
| time/                   |              |
|    fps                  | 593          |
|    iterations           | 2            |
|    time_elapsed         | 27           |
|    total_timesteps      | 925696       |
| train/                  |              |
|    approx_kl            | 0.0060264776 |
|    clip_fraction        | 6.1e-05      |
|    clip_range           | 0.5          |
|    entropy_loss         |

Saved checkpoint: ./runs_smw/checkpoints/p84_step_1700000.zip
[EVAL] Mean Return: -79.600, Best Return: -79.600
Saved video to ./runs_smw/videos/step_1700000_mean_-79.60.mp4

=== Round 3 | Learn 800000 steps (Total trained: 1700000) ===
Logging to ./runs_smw/tb/MyPPO_0
--------------------------------
| time/              |         |
|    fps             | 1487    |
|    iterations      | 1       |
|    time_elapsed    | 5       |
|    total_timesteps | 1720320 |
--------------------------------
------------------------------------------
| time/                   |              |
|    fps                  | 639          |
|    iterations           | 2            |
|    time_elapsed         | 25           |
|    total_timesteps      | 1728512      |
| train/                  |              |
|    approx_kl            | 0.0035677743 |
|    clip_fraction        | 0            |
|    clip_range           | 0.5          |
|    entropy_loss         | -2.37        |
|    explained_variance   

Saved checkpoint: ./runs_smw/checkpoints/p84_step_2500000.zip
[EVAL] Mean Return: 64.300, Best Return: 64.300
New best record. Saved to ./runs_smw/best_model.zip
Saved video to ./runs_smw/videos/step_2500000_mean_64.30.mp4

=== Round 4 | Learn 800000 steps (Total trained: 2500000) ===
Logging to ./runs_smw/tb/MyPPO_0
--------------------------------
| time/              |         |
|    fps             | 1480    |
|    iterations      | 1       |
|    time_elapsed    | 5       |
|    total_timesteps | 2523136 |
--------------------------------
------------------------------------------
| time/                   |              |
|    fps                  | 616          |
|    iterations           | 2            |
|    time_elapsed         | 26           |
|    total_timesteps      | 2531328      |
| train/                  |              |
|    approx_kl            | 0.0067733848 |
|    clip_fraction        | 1.22e-05     |
|    clip_range           | 0.5          |
|    entropy_loss   

Saved checkpoint: ./runs_smw/checkpoints/p84_step_3300000.zip
[EVAL] Mean Return: -79.600, Best Return: -79.600
Saved video to ./runs_smw/videos/step_3300000_mean_-79.60.mp4

=== Round 5 | Learn 800000 steps (Total trained: 3300000) ===
Logging to ./runs_smw/tb/MyPPO_0
--------------------------------
| time/              |         |
|    fps             | 1375    |
|    iterations      | 1       |
|    time_elapsed    | 5       |
|    total_timesteps | 3325952 |
--------------------------------
-----------------------------------------
| time/                   |             |
|    fps                  | 593         |
|    iterations           | 2           |
|    time_elapsed         | 27          |
|    total_timesteps      | 3334144     |
| train/                  |             |
|    approx_kl            | 0.012092808 |
|    clip_fraction        | 0.00118     |
|    clip_range           | 0.5         |
|    entropy_loss         | -2.33       |
|    explained_variance   | 0.997    

Saved checkpoint: ./runs_smw/checkpoints/p84_step_4100000.zip
[EVAL] Mean Return: 207.800, Best Return: 207.800
New best record. Saved to ./runs_smw/best_model.zip
Saved video to ./runs_smw/videos/step_4100000_mean_207.80.mp4

=== Round 6 | Learn 800000 steps (Total trained: 4100000) ===
Logging to ./runs_smw/tb/MyPPO_0
--------------------------------
| time/              |         |
|    fps             | 1399    |
|    iterations      | 1       |
|    time_elapsed    | 5       |
|    total_timesteps | 4128768 |
--------------------------------
-----------------------------------------
| time/                   |             |
|    fps                  | 609         |
|    iterations           | 2           |
|    time_elapsed         | 26          |
|    total_timesteps      | 4136960     |
| train/                  |             |
|    approx_kl            | 0.008921746 |
|    clip_fraction        | 0.00011     |
|    clip_range           | 0.5         |
|    entropy_loss         |

Saved checkpoint: ./runs_smw/checkpoints/p84_step_4900000.zip
[EVAL] Mean Return: -79.600, Best Return: -79.600
Saved video to ./runs_smw/videos/step_4900000_mean_-79.60.mp4

=== Round 7 | Learn 800000 steps (Total trained: 4900000) ===
Logging to ./runs_smw/tb/MyPPO_0
--------------------------------
| time/              |         |
|    fps             | 1436    |
|    iterations      | 1       |
|    time_elapsed    | 5       |
|    total_timesteps | 4931584 |
--------------------------------
------------------------------------------
| time/                   |              |
|    fps                  | 594          |
|    iterations           | 2            |
|    time_elapsed         | 27           |
|    total_timesteps      | 4939776      |
| train/                  |              |
|    approx_kl            | 0.0050386842 |
|    clip_fraction        | 4.88e-05     |
|    clip_range           | 0.5          |
|    entropy_loss         | -2.34        |
|    explained_variance   

Saved checkpoint: ./runs_smw/checkpoints/p84_step_5700000.zip
[EVAL] Mean Return: -79.600, Best Return: -79.600
Saved video to ./runs_smw/videos/step_5700000_mean_-79.60.mp4

=== Round 8 | Learn 800000 steps (Total trained: 5700000) ===
Logging to ./runs_smw/tb/MyPPO_0
--------------------------------
| time/              |         |
|    fps             | 1462    |
|    iterations      | 1       |
|    time_elapsed    | 5       |
|    total_timesteps | 5734400 |
--------------------------------
------------------------------------------
| time/                   |              |
|    fps                  | 611          |
|    iterations           | 2            |
|    time_elapsed         | 26           |
|    total_timesteps      | 5742592      |
| train/                  |              |
|    approx_kl            | 0.0065685813 |
|    clip_fraction        | 0.000366     |
|    clip_range           | 0.5          |
|    entropy_loss         | -2.38        |
|    explained_variance   

Saved checkpoint: ./runs_smw/checkpoints/p84_step_6500000.zip
[EVAL] Mean Return: -79.600, Best Return: -79.600
Saved video to ./runs_smw/videos/step_6500000_mean_-79.60.mp4

=== Round 9 | Learn 800000 steps (Total trained: 6500000) ===
Logging to ./runs_smw/tb/MyPPO_0
--------------------------------
| time/              |         |
|    fps             | 1432    |
|    iterations      | 1       |
|    time_elapsed    | 5       |
|    total_timesteps | 6537216 |
--------------------------------
-----------------------------------------
| time/                   |             |
|    fps                  | 560         |
|    iterations           | 2           |
|    time_elapsed         | 29          |
|    total_timesteps      | 6545408     |
| train/                  |             |
|    approx_kl            | 0.011839173 |
|    clip_fraction        | 0.000659    |
|    clip_range           | 0.5         |
|    entropy_loss         | -2.35       |
|    explained_variance   | 0.995    

Saved checkpoint: ./runs_smw/checkpoints/p84_step_7300000.zip
[EVAL] Mean Return: -79.600, Best Return: -79.600
Saved video to ./runs_smw/videos/step_7300000_mean_-79.60.mp4

=== Round 10 | Learn 700000 steps (Total trained: 7300000) ===
Logging to ./runs_smw/tb/MyPPO_0
--------------------------------
| time/              |         |
|    fps             | 1349    |
|    iterations      | 1       |
|    time_elapsed    | 6       |
|    total_timesteps | 7340032 |
--------------------------------
-----------------------------------------
| time/                   |             |
|    fps                  | 585         |
|    iterations           | 2           |
|    time_elapsed         | 27          |
|    total_timesteps      | 7348224     |
| train/                  |             |
|    approx_kl            | 0.009833697 |
|    clip_fraction        | 0.0049      |
|    clip_range           | 0.5         |
|    entropy_loss         | -2.37       |
|    explained_variance   | 0.99    

Saved checkpoint: ./runs_smw/checkpoints/p84_step_8000000.zip
[EVAL] Mean Return: -79.600, Best Return: -79.600
Saved video to ./runs_smw/videos/step_8000000_mean_-79.60.mp4
Training finished. Environment closed.


## Display Video

In [6]:
from IPython.display import Video
import glob

list_of_files = glob.glob(os.path.join(VIDEO_DIR, '*.mp4')) 
if list_of_files:
    latest_file = max(list_of_files, key=os.path.getctime)
    print(f"Playing: {latest_file}")
    display(Video(latest_file, embed=True, width=600))
else:
    print("No videos found yet.")

Playing: ./runs_smw/videos/step_8000000_mean_-79.60.mp4
