# 2025 DL Lab8: RL Assignment_Super Mario World

**Your Answer:**    
Hi I'm XXX, XXXXXXXXXX.

## Overview
This project implements a **Deep Reinforcement Learning** pipeline to train an autonomous agent for Super Mario World. Leveraging the **Proximal Policy Optimization (PPO)** algorithm, the system interacts with the **stable-retro** environment to master the YoshiIsland1 level. Key components include a custom Vision Backbone for extracting features from raw pixel data and a suite of Environment Wrappers that handle frame preprocessing, action discretization, and reward shaping to facilitate efficient learning.

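Implement the reward function.  
The agent has to act right at the start of the level (an enemy attacks early).  
Implement the custom PPO.  
The pre-trained weights make little difference; the reward function is what mainly matters.  
Model weights are limited to 1 GB.  
Do not change existing class names (new classes may be added, but the original ones must stay as they are).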

## Imports

In [1]:
import os
import numpy as np
import retro
from stable_baselines3.common.vec_env import DummyVecEnv, SubprocVecEnv
from stable_baselines3.common.vec_env import VecNormalize

from eval import evaluate_policy, record_video
from custom_policy import VisionBackbonePolicy, CustomPPO

  from .autonotebook import tqdm as notebook_tqdm


## Configuration

In [2]:
# --- Game selection ---
GAME = "SuperMarioWorld-Snes"
STATE = "YoshiIsland1"

# --- Training schedule ---
# TOTAL_STEPS is the overall step budget and TRAIN_CHUNK the size of one
# training round (6_553_600 / 327_680 = 20 rounds when starting from zero).
TOTAL_STEPS = 6_553_600
TRAIN_CHUNK = 327_680
N_ENVS = 16
LEARNING_RATE = 1e-4

# --- Evaluation and video recording ---
EVAL_EPISODES = 3
EVAL_MAX_STEPS = 18000
RECORD_STEPS = 1800

# --- Output locations, all rooted at LOG_DIR ---
LOG_DIR = "./runs_smw"
VIDEO_DIR = os.path.join(LOG_DIR, "videos")
CKPT_DIR = os.path.join(LOG_DIR, "checkpoints")
TENSORBOARD_LOG = os.path.join(LOG_DIR, "tb")

# Create the folders this notebook writes into directly. TENSORBOARD_LOG is
# intentionally not created here.
for _out_dir in (LOG_DIR, CKPT_DIR, VIDEO_DIR):
    os.makedirs(_out_dir, exist_ok=True)
del _out_dir  # keep the notebook namespace free of the loop variable

## Environment Functions

In [3]:
from wrappers import make_base_env
def _make_env_thunk(game: str, state: str):
    """Build a zero-argument factory for a single base environment.

    The ``make_base_env(game, state)`` call is deferred until the returned
    callable is invoked, so the caller can hand the factory to a vectorized
    environment instead of an already-built environment.
    """
    return lambda: make_base_env(game, state)

def make_vec_env(game: str, state: str, n_envs: int, use_subproc: bool = True):
    """Build a vectorized environment holding ``n_envs`` copies of the game.

    Args:
        game: Game identifier forwarded to every environment factory.
        state: Save-state / level name forwarded to every environment factory.
        n_envs: Number of environment factories to create.
        use_subproc: When true and ``n_envs > 1``, the factories are wrapped in
            ``SubprocVecEnv``; in every other case ``DummyVecEnv`` is used.

    Returns:
        The constructed ``SubprocVecEnv`` or ``DummyVecEnv``.
    """
    factories = [_make_env_thunk(game, state) for _ in range(n_envs)]

    # A subprocess-backed wrapper is only chosen for more than one environment.
    wants_subprocesses = use_subproc and n_envs > 1
    vec_env_cls = SubprocVecEnv if wants_subprocesses else DummyVecEnv
    return vec_env_cls(factories)

## Initialize Env & Model

In [4]:
# Step 1: build the vectorized training environment.
train_env = make_vec_env(GAME, STATE, n_envs=N_ENVS)
# train_env = VecNormalize(train_env, norm_obs=True, norm_reward=True, clip_obs=10., clip_reward=10.)
print(f"Environment created: {GAME} - {STATE} with {N_ENVS} parallel envs.")

# Checkpoint to resume from. Pointing this at a path that does not exist
# (for example "None") makes the cell build a brand-new model instead.
# checkpoint_path = "None"
# checkpoint_path = "runs_smw/checkpoints/SF84_step_1600000.zip"
# checkpoint_path = "runs_smw/checkpoints/SF84G_6553600.zip"
checkpoint_path = "runs_smw/checkpoints/SF84G_5.zip"

# Step 2: create the model, either from scratch or from the checkpoint.
if not os.path.exists(checkpoint_path):
    print(f"Fail to load {checkpoint_path}...")
    # NOTE(review): these hyperparameters are only passed when a new model is
    # built. The training log reports learning_rate 0.0002 rather than
    # LEARNING_RATE, which suggests a loaded checkpoint keeps its own saved
    # values -- confirm before relying on the settings below when resuming.
    model = CustomPPO(
        VisionBackbonePolicy,
        train_env,
        policy_kwargs={"normalize_images": False},
        n_epochs=4,
        n_steps=512,
        batch_size=512,
        learning_rate=LEARNING_RATE,
        verbose=1,
        gamma=0.99,
        kl_coef=1,
        clip_range=0.125,
        tensorboard_log=TENSORBOARD_LOG,
    )
else:
    print(f"Loading model from {checkpoint_path}...")
    # Restore the existing model and attach it to the training environment,
    # requesting the first CUDA device.
    model = CustomPPO.load(checkpoint_path, env=train_env, device="cuda:0")

Environment created: SuperMarioWorld-Snes - YoshiIsland1 with 16 parallel envs.
Loading model from runs_smw/checkpoints/SF84G_5.zip...


In [5]:
# Write the current model (the one loaded or built in the previous cell) to
# "policy" through model.save, before any further training happens.
model.save("policy")

## Training Loop

In [None]:
# Chunked training loop: train TRAIN_CHUNK steps at a time until TOTAL_STEPS is
# reached, and after every chunk save a checkpoint, evaluate the policy and
# record a video. The training environment is always closed at the end.

best_mean = -1e18  # only used by the (currently disabled) best-model block below

# Resume from the model's own step counter instead of a hard-coded number, so
# the budget stays correct whichever checkpoint was loaded and also when a
# brand-new model (counter 0) was created. learn() is called with
# reset_num_timesteps=False, so this counter is what the logs continue from.
trained = model.num_timesteps
round_idx = 0

# Tag used for checkpoint file names and for the video sub-folder.
label = "TNL"

# The video prefix below contains a "<label>/" sub-path. How record_video turns
# the prefix into a file path is not visible here, so create the sub-folder
# defensively (harmless if record_video already does it).
os.makedirs(os.path.join(VIDEO_DIR, label), exist_ok=True)

try:
    while trained < TOTAL_STEPS:
        round_idx += 1
        # Never train past the overall budget on the final round.
        chunk = min(TRAIN_CHUNK, TOTAL_STEPS - trained)
        # chunk = 2000

        print(f"\n=== Round {round_idx} | Learn {chunk} steps (Total trained: {trained}) ===")

        # --- Train ---
        model.learn(total_timesteps=chunk, reset_num_timesteps=False, tb_log_name='tunnel')
        # Advance by the requested chunk so the loop is guaranteed to end.
        trained += chunk

        # Index of the chunk just completed (integer division keeps it an int).
        tagged_label = f"{label}_{trained // TRAIN_CHUNK}"

        # --- Save Checkpoint ---
        ckpt_path = os.path.join(CKPT_DIR, f"{tagged_label}.zip")
        model.save(ckpt_path)
        print(f"Saved checkpoint: {ckpt_path}")

        # --- Evaluate ---
        mean_ret, best_ret = evaluate_policy(
            model,
            GAME,
            STATE,
            n_episodes=EVAL_EPISODES,
            max_steps=EVAL_MAX_STEPS,
        )
        print(f"[EVAL] Mean Return: {mean_ret:.3f}, Best Return: {best_ret:.3f}")

        # --- Save Best Model (disabled) ---
        # if mean_ret > best_mean:
        #     best_mean = mean_ret
        #     best_path = os.path.join(LOG_DIR, "best_model.zip")
        #     model.save(best_path)
        #     print(f"New best record. Saved to {best_path}")

        # --- Record Video ---
        record_video(
            model,
            GAME,
            STATE,
            VIDEO_DIR,
            video_len=RECORD_STEPS,
            prefix=f"{label}/{tagged_label}_{mean_ret:.2f}",
        )

except KeyboardInterrupt:
    print("\nTraining interrupted manually.")

finally:
    # Runs on normal completion, on interruption and on errors alike.
    train_env.close()
    print("Training finished. Environment closed.")

# To inspect the training curves, run:
#   tensorboard --logdir=./runs_smw/tb


=== Round 1 | Learn 327680 steps (Total trained: 1638400) ===
Logging to ./runs_smw/tb/tunnel_0


--------------------------------
| time/              |         |
|    fps             | 1080    |
|    iterations      | 1       |
|    time_elapsed    | 7       |
|    total_timesteps | 1646592 |
--------------------------------
---------------------------------------
| time/                 |             |
|    fps                | 895         |
|    iterations         | 2           |
|    time_elapsed       | 18          |
|    total_timesteps    | 1654784     |
| train/                |             |
|    approx_kl          | 0.011073049 |
|    entropy_loss       | -2.01       |
|    explained_variance | 0.836       |
|    learning_rate      | 0.0002      |
|    loss               | 0.213       |
|    mean_step_reward   | 0.085263595 |
|    n_updates          | 804         |
|    policyGradLoss     | -0.00368    |
|    value_loss         | 0.701       |
---------------------------------------
---------------------------------------
| time/                 |             |
|    fps 

Saved checkpoint: ./runs_smw/checkpoints/TNL_6.zip
[EVAL] Mean Return: -1467.902, Best Return: -1467.076
Saved video to ./runs_smw/videos/step_1966080_mean_-1467.90.mp4

=== Round 2 | Learn 327680 steps (Total trained: 1966080) ===
Logging to ./runs_smw/tb/tunnel_0
--------------------------------
| time/              |         |
|    fps             | 1131    |
|    iterations      | 1       |
|    time_elapsed    | 7       |
|    total_timesteps | 1974272 |
--------------------------------
---------------------------------------
| time/                 |             |
|    fps                | 904         |
|    iterations         | 2           |
|    time_elapsed       | 18          |
|    total_timesteps    | 1982464     |
| train/                |             |
|    approx_kl          | 0.011450709 |
|    entropy_loss       | -2.08       |
|    explained_variance | 0.862       |
|    learning_rate      | 0.0002      |
|    loss               | 0.147       |
|    mean_step_reward  

Saved checkpoint: ./runs_smw/checkpoints/TNL_7.zip
[EVAL] Mean Return: 17.604, Best Return: 17.717
Saved video to ./runs_smw/videos/step_2293760_mean_17.60.mp4

=== Round 3 | Learn 327680 steps (Total trained: 2293760) ===
Logging to ./runs_smw/tb/tunnel_0
--------------------------------
| time/              |         |
|    fps             | 1115    |
|    iterations      | 1       |
|    time_elapsed    | 7       |
|    total_timesteps | 2301952 |
--------------------------------
---------------------------------------
| time/                 |             |
|    fps                | 909         |
|    iterations         | 2           |
|    time_elapsed       | 18          |
|    total_timesteps    | 2310144     |
| train/                |             |
|    approx_kl          | 0.011228785 |
|    entropy_loss       | -2.07       |
|    explained_variance | 0.924       |
|    learning_rate      | 0.0002      |
|    loss               | 0.0705      |
|    mean_step_reward   | 0.0821

Saved checkpoint: ./runs_smw/checkpoints/TNL_8.zip
[EVAL] Mean Return: -1266.321, Best Return: -1265.443
Saved video to ./runs_smw/videos/step_2621440_mean_-1266.32.mp4

=== Round 4 | Learn 327680 steps (Total trained: 2621440) ===
Logging to ./runs_smw/tb/tunnel_0
--------------------------------
| time/              |         |
|    fps             | 1091    |
|    iterations      | 1       |
|    time_elapsed    | 7       |
|    total_timesteps | 2629632 |
--------------------------------
---------------------------------------
| time/                 |             |
|    fps                | 910         |
|    iterations         | 2           |
|    time_elapsed       | 17          |
|    total_timesteps    | 2637824     |
| train/                |             |
|    approx_kl          | 0.016054943 |
|    entropy_loss       | -2.08       |
|    explained_variance | 0.914       |
|    learning_rate      | 0.0002      |
|    loss               | -0.00388    |
|    mean_step_reward  

Saved checkpoint: ./runs_smw/checkpoints/TNL_9.zip
[EVAL] Mean Return: -59658.954, Best Return: -59654.252
Saved video to ./runs_smw/videos/step_2949120_mean_-59658.95.mp4

=== Round 5 | Learn 327680 steps (Total trained: 2949120) ===
Logging to ./runs_smw/tb/tunnel_0
--------------------------------
| time/              |         |
|    fps             | 1073    |
|    iterations      | 1       |
|    time_elapsed    | 7       |
|    total_timesteps | 2957312 |
--------------------------------
---------------------------------------
| time/                 |             |
|    fps                | 880         |
|    iterations         | 2           |
|    time_elapsed       | 18          |
|    total_timesteps    | 2965504     |
| train/                |             |
|    approx_kl          | 0.015108988 |
|    entropy_loss       | -1.88       |
|    explained_variance | 0.945       |
|    learning_rate      | 0.0002      |
|    loss               | 0.0151      |
|    mean_step_rewar

Saved checkpoint: ./runs_smw/checkpoints/TNL_10.zip
[EVAL] Mean Return: -1251.617, Best Return: -1250.742
Saved video to ./runs_smw/videos/step_3276800_mean_-1251.62.mp4

=== Round 6 | Learn 327680 steps (Total trained: 3276800) ===
Logging to ./runs_smw/tb/tunnel_0
--------------------------------
| time/              |         |
|    fps             | 1094    |
|    iterations      | 1       |
|    time_elapsed    | 7       |
|    total_timesteps | 3284992 |
--------------------------------
---------------------------------------
| time/                 |             |
|    fps                | 903         |
|    iterations         | 2           |
|    time_elapsed       | 18          |
|    total_timesteps    | 3293184     |
| train/                |             |
|    approx_kl          | 0.011255363 |
|    entropy_loss       | -2.07       |
|    explained_variance | 0.905       |
|    learning_rate      | 0.0002      |
|    loss               | 0.103       |
|    mean_step_reward 

Saved checkpoint: ./runs_smw/checkpoints/TNL_11.zip
[EVAL] Mean Return: 93.283, Best Return: 93.446
Saved video to ./runs_smw/videos/step_3604480_mean_93.28.mp4

=== Round 7 | Learn 327680 steps (Total trained: 3604480) ===
Logging to ./runs_smw/tb/tunnel_0
--------------------------------
| time/              |         |
|    fps             | 1090    |
|    iterations      | 1       |
|    time_elapsed    | 7       |
|    total_timesteps | 3612672 |
--------------------------------
---------------------------------------
| time/                 |             |
|    fps                | 895         |
|    iterations         | 2           |
|    time_elapsed       | 18          |
|    total_timesteps    | 3620864     |
| train/                |             |
|    approx_kl          | 0.016525116 |
|    entropy_loss       | -1.98       |
|    explained_variance | 0.976       |
|    learning_rate      | 0.0002      |
|    loss               | -0.0126     |
|    mean_step_reward   | 0.098

Saved checkpoint: ./runs_smw/checkpoints/TNL_12.zip
[EVAL] Mean Return: -1265.940, Best Return: -1265.113
Saved video to ./runs_smw/videos/step_3932160_mean_-1265.94.mp4

=== Round 8 | Learn 327680 steps (Total trained: 3932160) ===
Logging to ./runs_smw/tb/tunnel_0
--------------------------------
| time/              |         |
|    fps             | 1104    |
|    iterations      | 1       |
|    time_elapsed    | 7       |
|    total_timesteps | 3940352 |
--------------------------------
---------------------------------------
| time/                 |             |
|    fps                | 897         |
|    iterations         | 2           |
|    time_elapsed       | 18          |
|    total_timesteps    | 3948544     |
| train/                |             |
|    approx_kl          | 0.012774943 |
|    entropy_loss       | -2          |
|    explained_variance | 0.865       |
|    learning_rate      | 0.0002      |
|    loss               | 0.142       |
|    mean_step_reward 

Saved checkpoint: ./runs_smw/checkpoints/TNL_13.zip
[EVAL] Mean Return: 104.800, Best Return: 105.013
Saved video to ./runs_smw/videos/step_4259840_mean_104.80.mp4

=== Round 9 | Learn 327680 steps (Total trained: 4259840) ===
Logging to ./runs_smw/tb/tunnel_0
--------------------------------
| time/              |         |
|    fps             | 1088    |
|    iterations      | 1       |
|    time_elapsed    | 7       |
|    total_timesteps | 4268032 |
--------------------------------
---------------------------------------
| time/                 |             |
|    fps                | 894         |
|    iterations         | 2           |
|    time_elapsed       | 18          |
|    total_timesteps    | 4276224     |
| train/                |             |
|    approx_kl          | 0.015124194 |
|    entropy_loss       | -2.01       |
|    explained_variance | 0.871       |
|    learning_rate      | 0.0002      |
|    loss               | 0.135       |
|    mean_step_reward   | 0.

Saved checkpoint: ./runs_smw/checkpoints/TNL_14.zip
[EVAL] Mean Return: 50.118, Best Return: 50.231
Saved video to ./runs_smw/videos/step_4587520_mean_50.12.mp4

=== Round 10 | Learn 327680 steps (Total trained: 4587520) ===
Logging to ./runs_smw/tb/tunnel_0
--------------------------------
| time/              |         |
|    fps             | 1117    |
|    iterations      | 1       |
|    time_elapsed    | 7       |
|    total_timesteps | 4595712 |
--------------------------------
----------------------------------------
| time/                 |              |
|    fps                | 893          |
|    iterations         | 2            |
|    time_elapsed       | 18           |
|    total_timesteps    | 4603904      |
| train/                |              |
|    approx_kl          | 0.0141735375 |
|    entropy_loss       | -1.98        |
|    explained_variance | 0.863        |
|    learning_rate      | 0.0002       |
|    loss               | 0.096        |
|    mean_step_rew

Saved checkpoint: ./runs_smw/checkpoints/TNL_15.zip
[EVAL] Mean Return: 183.317, Best Return: 183.674
Saved video to ./runs_smw/videos/step_4915200_mean_183.32.mp4

=== Round 11 | Learn 327680 steps (Total trained: 4915200) ===
Logging to ./runs_smw/tb/tunnel_0
--------------------------------
| time/              |         |
|    fps             | 1103    |
|    iterations      | 1       |
|    time_elapsed    | 7       |
|    total_timesteps | 4923392 |
--------------------------------
---------------------------------------
| time/                 |             |
|    fps                | 899         |
|    iterations         | 2           |
|    time_elapsed       | 18          |
|    total_timesteps    | 4931584     |
| train/                |             |
|    approx_kl          | 0.016582938 |
|    entropy_loss       | -1.95       |
|    explained_variance | 0.935       |
|    learning_rate      | 0.0002      |
|    loss               | -0.0172     |
|    mean_step_reward   | 0

Saved checkpoint: ./runs_smw/checkpoints/TNL_16.zip
[EVAL] Mean Return: -1261.999, Best Return: -1261.121
Saved video to ./runs_smw/videos/step_5242880_mean_-1262.00.mp4

=== Round 12 | Learn 327680 steps (Total trained: 5242880) ===
Logging to ./runs_smw/tb/tunnel_0
--------------------------------
| time/              |         |
|    fps             | 1133    |
|    iterations      | 1       |
|    time_elapsed    | 7       |
|    total_timesteps | 5251072 |
--------------------------------
---------------------------------------
| time/                 |             |
|    fps                | 909         |
|    iterations         | 2           |
|    time_elapsed       | 18          |
|    total_timesteps    | 5259264     |
| train/                |             |
|    approx_kl          | 0.019979257 |
|    entropy_loss       | -1.92       |
|    explained_variance | 0.906       |
|    learning_rate      | 0.0002      |
|    loss               | 0.0425      |
|    mean_step_reward

Saved checkpoint: ./runs_smw/checkpoints/TNL_17.zip
[EVAL] Mean Return: -1265.712, Best Return: -1264.834
Saved video to ./runs_smw/videos/step_5570560_mean_-1265.71.mp4

=== Round 13 | Learn 327680 steps (Total trained: 5570560) ===
Logging to ./runs_smw/tb/tunnel_0
--------------------------------
| time/              |         |
|    fps             | 1083    |
|    iterations      | 1       |
|    time_elapsed    | 7       |
|    total_timesteps | 5578752 |
--------------------------------
---------------------------------------
| time/                 |             |
|    fps                | 881         |
|    iterations         | 2           |
|    time_elapsed       | 18          |
|    total_timesteps    | 5586944     |
| train/                |             |
|    approx_kl          | 0.019700333 |
|    entropy_loss       | -1.93       |
|    explained_variance | 0.936       |
|    learning_rate      | 0.0002      |
|    loss               | 0.115       |
|    mean_step_reward

Saved checkpoint: ./runs_smw/checkpoints/TNL_18.zip
[EVAL] Mean Return: 160.551, Best Return: 160.858
Saved video to ./runs_smw/videos/step_5898240_mean_160.55.mp4

=== Round 14 | Learn 327680 steps (Total trained: 5898240) ===
Logging to ./runs_smw/tb/tunnel_0
--------------------------------
| time/              |         |
|    fps             | 1086    |
|    iterations      | 1       |
|    time_elapsed    | 7       |
|    total_timesteps | 5906432 |
--------------------------------
---------------------------------------
| time/                 |             |
|    fps                | 895         |
|    iterations         | 2           |
|    time_elapsed       | 18          |
|    total_timesteps    | 5914624     |
| train/                |             |
|    approx_kl          | 0.012466939 |
|    entropy_loss       | -1.92       |
|    explained_variance | 0.925       |
|    learning_rate      | 0.0002      |
|    loss               | -0.012      |
|    mean_step_reward   | 0

Saved checkpoint: ./runs_smw/checkpoints/TNL_19.zip
[EVAL] Mean Return: 102.878, Best Return: 103.041
Saved video to ./runs_smw/videos/step_6225920_mean_102.88.mp4

=== Round 15 | Learn 327680 steps (Total trained: 6225920) ===
Logging to ./runs_smw/tb/tunnel_0
--------------------------------
| time/              |         |
|    fps             | 1124    |
|    iterations      | 1       |
|    time_elapsed    | 7       |
|    total_timesteps | 6234112 |
--------------------------------
-------------------------------------
| time/                 |           |
|    fps                | 898       |
|    iterations         | 2         |
|    time_elapsed       | 18        |
|    total_timesteps    | 6242304   |
| train/                |           |
|    approx_kl          | 0.0169594 |
|    entropy_loss       | -1.93     |
|    explained_variance | 0.937     |
|    learning_rate      | 0.0002    |
|    loss               | 0.134     |
|    mean_step_reward   | 0.1155926 |
|    n_update

Saved checkpoint: ./runs_smw/checkpoints/TNL_20.zip
[EVAL] Mean Return: -0.563, Best Return: -0.500
Saved video to ./runs_smw/videos/step_6553600_mean_-0.56.mp4
Training finished. Environment closed.


'\ntensorboard --logdir=./runs_smw/tb\n'

## Display Video

In [7]:
# from IPython.display import Video
# import glob

# list_of_files = glob.glob(os.path.join(VIDEO_DIR, '*.mp4')) 
# if list_of_files:
#     latest_file = max(list_of_files, key=os.path.getctime)
#     print(f"Playing: {latest_file}")
#     display(Video(latest_file, embed=True, width=600))
# else:
#     print("No videos found yet.")