# 2025 DL Lab8: RL Assignment_Super Mario World

**Your Answer:**    
Hi I'm XXX, XXXXXXXXXX.

## Overview
This project implements a **Deep Reinforcement Learning** pipeline to train an autonomous agent for Super Mario World. Leveraging the **Proximal Policy Optimization (PPO)** algorithm, the system interacts with the **stable-retro** environment to master the YoshiIsland1 level. Key components include a custom Vision Backbone for extracting features from raw pixel data and a suite of Environment Wrappers that handle frame preprocessing, action discretization, and reward shaping to facilitate efficient learning.

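Implement the reward function.  
The agent should do something at the very beginning (monster attack).  
Implement the custom PPO.  
Pretrained weights make little difference; the reward function is what matters most.  
Model weight capacity: 1 GB.  
Do not change the existing class names (new classes may be added, but the original ones must stay untouched).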

## Imports

In [1]:
import os
import numpy as np
import retro
from stable_baselines3.common.vec_env import DummyVecEnv, SubprocVecEnv
from stable_baselines3.common.vec_env import VecNormalize

from eval import evaluate_policy, record_video
from custom_policy import VisionBackbonePolicy, CustomPPO

  from .autonotebook import tqdm as notebook_tqdm


## Configuration

In [None]:
# ---------------------------------------------------------------------------
# Run configuration: every tunable used by the cells below lives here.
# ---------------------------------------------------------------------------

# Game / level selection
GAME = "SuperMarioWorld-Snes"
STATE = "YoshiIsland1"

# Training schedule
N_ENVS = 16
LEARNING_RATE = 0.0002
TRAIN_CHUNK = 327_680            # steps requested from each learn() call
TOTAL_STEPS = 20 * TRAIN_CHUNK   # 6_553_600 steps overall

# Evaluation and video recording
EVAL_EPISODES = 3
EVAL_MAX_STEPS = 18_000
RECORD_STEPS = 1_800

# Output locations, all rooted at LOG_DIR
LOG_DIR = "./runs_smw"
VIDEO_DIR, CKPT_DIR, TENSORBOARD_LOG = (
    os.path.join(LOG_DIR, leaf) for leaf in ("videos", "checkpoints", "tb")
)

# Create the folders this notebook writes to directly; an already-existing
# directory is not an error.
for _path in (LOG_DIR, CKPT_DIR, VIDEO_DIR):
    os.makedirs(_path, exist_ok=True)

## Environment Functions

In [3]:
from wrappers import make_base_env
def _make_env_thunk(game: str, state: str):
    """Build a zero-argument environment factory.

    Args:
        game: Game identifier forwarded to ``make_base_env``.
        state: State (level) identifier forwarded to ``make_base_env``.

    Returns:
        A callable taking no arguments that returns
        ``make_base_env(game, state)`` each time it is invoked. Nothing is
        constructed until the callable is actually called.
    """
    # Capturing game/state as arguments of this outer function gives every
    # returned factory its own fixed pair of values.
    return lambda: make_base_env(game, state)

def make_vec_env(game: str, state: str, n_envs: int, use_subproc: bool = True):
    """Create a vectorized environment made of ``n_envs`` copies of one level.

    Args:
        game: Game identifier handed to every environment factory.
        state: State (level) identifier handed to every environment factory.
        n_envs: How many environment factories to build.
        use_subproc: When true and ``n_envs`` is greater than one, wrap the
            factories in ``SubprocVecEnv``; in every other case
            ``DummyVecEnv`` is used.

    Returns:
        The constructed ``SubprocVecEnv`` or ``DummyVecEnv`` instance.
    """
    factories = []
    for _ in range(n_envs):
        factories.append(_make_env_thunk(game, state))

    # A single environment never goes through SubprocVecEnv, whatever
    # use_subproc says.
    wrapper_cls = SubprocVecEnv if (use_subproc and n_envs > 1) else DummyVecEnv
    return wrapper_cls(factories)


## Initialize Env & Model

In [None]:
# 1. Create Training Environment
train_env = make_vec_env(GAME, STATE, n_envs=N_ENVS)
# train_env = VecNormalize(train_env, norm_obs=True, norm_reward=True, clip_obs=10., clip_reward=10.)
print(f"Environment created: {GAME} - {STATE} with {N_ENVS} parallel envs.")

# Path of a saved model to resume from. Leave as None to start from freshly
# initialised weights. A real None is used instead of the string "None" so
# that a stray file literally named "None" can never be picked up as a
# checkpoint.
checkpoint_path = None
# checkpoint_path = "runs_smw/checkpoints/SF84_step_1600000.zip"
# checkpoint_path = "runs_smw/checkpoints/SF84G_6553600.zip"

# 2. Initialize Model
if checkpoint_path is not None and os.path.exists(checkpoint_path):
    print(f"Loading model from {checkpoint_path}...")
    # Load the existing model and attach it to the freshly created env.
    model = CustomPPO.load(
        checkpoint_path,
        env=train_env,
        # "auto" instead of a hard-coded "cuda:0" so that resuming also works
        # on a machine without CUDA. Assumes CustomPPO.load follows the
        # Stable-Baselines3 `device` convention -- TODO confirm.
        device="auto",
    )
else:
    # Reached both when no checkpoint is configured and when the configured
    # path does not exist; in either case a new model is built.
    print(f"Fail to load {checkpoint_path}...")
    model = CustomPPO(
        VisionBackbonePolicy,
        train_env,
        policy_kwargs   = dict(normalize_images=False),
        n_epochs        = 4,
        n_steps         = 512,
        batch_size      = 512,
        learning_rate   = LEARNING_RATE,
        verbose         = 1,
        gamma           = 0.99,
        kl_coef         = 1,
        clip_range      = 0.125,
        tensorboard_log = TENSORBOARD_LOG,
    )

Environment created: SuperMarioWorld-Snes - YoshiIsland1 with 16 parallel envs.
Fail to load None...
Using cuda:0 device


## Training Loop

In [None]:
# Train in rounds of at most TRAIN_CHUNK steps until TOTAL_STEPS is reached.
# After every round: save a checkpoint, evaluate the policy, keep a copy of
# the model with the best mean return so far, and record a video.
best_mean = float("-inf")  # any real evaluation result beats this
trained = 0                # steps requested from model.learn() so far
round_idx = 0              # 1-based counter of completed/ongoing rounds

try:
    while trained < TOTAL_STEPS:
        round_idx += 1
        # The last round may be shorter when TOTAL_STEPS is not a multiple
        # of TRAIN_CHUNK.
        chunk = min(TRAIN_CHUNK, TOTAL_STEPS - trained)
        # chunk = 2000

        print(f"\n=== Round {round_idx} | Learn {chunk} steps (Total trained: {trained}) ===")

        # --- Train ---
        # reset_num_timesteps=False keeps the model's timestep counter (and
        # the tensorboard run) continuing across rounds instead of restarting.
        model.learn(total_timesteps=chunk, reset_num_timesteps=False, tb_log_name='SF84G_1220')
        trained += chunk

        # --- Save Checkpoint ---
        # Name checkpoints by round number. Deriving the index from
        # int(trained / TRAIN_CHUNK) gives a short final round the same index
        # as the previous one and silently overwrites that checkpoint.
        ckpt_path = os.path.join(CKPT_DIR, f"SF84G_{round_idx}.zip")
        model.save(ckpt_path)
        print(f"Saved checkpoint: {ckpt_path}")

        # --- Evaluate ---
        mean_ret, best_ret = evaluate_policy(
            model,
            GAME,
            STATE,
            n_episodes=EVAL_EPISODES,
            max_steps=EVAL_MAX_STEPS,
        )
        print(f"[EVAL] Mean Return: {mean_ret:.3f}, Best Return: {best_ret:.3f}")

        # --- Save Best Model ---
        if mean_ret > best_mean:
            best_mean = mean_ret
            best_path = os.path.join(LOG_DIR, "best_model.zip")
            model.save(best_path)
            print(f"New best record. Saved to {best_path}")

        # --- Record Video ---
        record_video(
            model,
            GAME,
            STATE,
            VIDEO_DIR,
            video_len=RECORD_STEPS,
            prefix=f"step_{trained}_mean_{mean_ret:.2f}",
        )

except KeyboardInterrupt:
    # Allow stopping a long run by hand without a traceback.
    print("\nTraining interrupted manually.")

finally:
    # Always release the training environments, whether the loop completed,
    # was interrupted, or raised.
    train_env.close()
    print("Training finished. Environment closed.")

# Shell command for monitoring the run, kept as a bare string literal:
"""
tensorboard --logdir=./runs_smw/tb
"""


=== Round 1 | Learn 327680 steps (Total trained: 0) ===
Logging to ./runs_smw/tb/SF84G_1220_0


-----------------------------
| time/              |      |
|    fps             | 859  |
|    iterations      | 1    |
|    time_elapsed    | 9    |
|    total_timesteps | 8192 |
-----------------------------
----------------------------------------
| time/                 |              |
|    fps                | 687          |
|    iterations         | 2            |
|    time_elapsed       | 23           |
|    total_timesteps    | 16384        |
| train/                |              |
|    approx_kl          | 0.09738616   |
|    entropy_loss       | -2.44        |
|    explained_variance | 0.0146       |
|    learning_rate      | 0.00025      |
|    loss               | -0.0611      |
|    mean_step_reward   | -0.004130859 |
|    n_updates          | 4            |
|    policyGradLoss     | -0.0634      |
|    value_loss         | 0.138        |
----------------------------------------
---------------------------------------
| time/                 |             |
|    fps     

Saved checkpoint: ./runs_smw/checkpoints/SF84G_1.zip
[EVAL] Mean Return: -0.310, Best Return: -0.310
New best record. Saved to ./runs_smw/best_model.zip
Saved video to ./runs_smw/videos/step_327680_mean_-0.31.mp4

=== Round 2 | Learn 327680 steps (Total trained: 327680) ===
Logging to ./runs_smw/tb/SF84G_1220_0
-------------------------------
| time/              |        |
|    fps             | 752    |
|    iterations      | 1      |
|    time_elapsed    | 10     |
|    total_timesteps | 335872 |
-------------------------------
---------------------------------------
| time/                 |             |
|    fps                | 663         |
|    iterations         | 2           |
|    time_elapsed       | 24          |
|    total_timesteps    | 344064      |
| train/                |             |
|    approx_kl          | 0.049360733 |
|    entropy_loss       | -2.21       |
|    explained_variance | 0.856       |
|    learning_rate      | 0.00025     |
|    loss              

Saved checkpoint: ./runs_smw/checkpoints/SF84G_2.zip
[EVAL] Mean Return: 6.860, Best Return: 6.860
New best record. Saved to ./runs_smw/best_model.zip
Saved video to ./runs_smw/videos/step_655360_mean_6.86.mp4

=== Round 3 | Learn 327680 steps (Total trained: 655360) ===
Logging to ./runs_smw/tb/SF84G_1220_0
-------------------------------
| time/              |        |
|    fps             | 725    |
|    iterations      | 1      |
|    time_elapsed    | 11     |
|    total_timesteps | 663552 |
-------------------------------
---------------------------------------
| time/                 |             |
|    fps                | 645         |
|    iterations         | 2           |
|    time_elapsed       | 25          |
|    total_timesteps    | 671744      |
| train/                |             |
|    approx_kl          | 0.09827865  |
|    entropy_loss       | -2.01       |
|    explained_variance | 0.866       |
|    learning_rate      | 0.00025     |
|    loss               | 

## Display Video

In [None]:
# from IPython.display import Video
# import glob

# list_of_files = glob.glob(os.path.join(VIDEO_DIR, '*.mp4')) 
# if list_of_files:
#     latest_file = max(list_of_files, key=os.path.getctime)
#     print(f"Playing: {latest_file}")
#     display(Video(latest_file, embed=True, width=600))
# else:
#     print("No videos found yet.")