# 2025 DL Lab8: RL Assignment_Super Mario World

**Your Answer:**    
Hi I'm XXX, XXXXXXXXXX.

## Overview
This project implements a **Deep Reinforcement Learning** pipeline to train an autonomous agent for Super Mario World. Leveraging the **Proximal Policy Optimization (PPO)** algorithm, the system interacts with the **stable-retro** environment to master the YoshiIsland1 level. Key components include a custom Vision Backbone for extracting features from raw pixel data and a suite of Environment Wrappers that handle frame preprocessing, action discretization, and reward shaping to facilitate efficient learning.

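- Implement the reward function.
- The agent should do something at the beginning of the level (monster attack).
- Implement a custom PPO.
- The pretrained weights make little difference; what mainly matters is the reward function.
- Model weight capacity: 1 GB.
- Do not change the existing class names (new ones may be added, but the original ones must be left untouched).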

## Imports

In [1]:
import os
import numpy as np
import retro
from stable_baselines3.common.vec_env import DummyVecEnv, SubprocVecEnv
from stable_baselines3.common.vec_env import VecNormalize

from eval import evaluate_policy, record_video
from custom_policy import VisionBackbonePolicy, CustomPPO

  from .autonotebook import tqdm as notebook_tqdm


## Configuration

In [2]:
# --- Game ---------------------------------------------------------------
GAME = "SuperMarioWorld-Snes"   # game identifier handed to the env factory
STATE = "YoshiIsland1"          # level / save state handed to the env factory

# --- Training budget ----------------------------------------------------
N_ENVS = 16                     # number of parallel environments
LEARNING_RATE = 1e-4
TRAIN_CHUNK = 327_680           # timesteps trained per round (one learn() call)
TOTAL_STEPS = 13_107_200        # overall budget, i.e. 40 rounds of TRAIN_CHUNK

# --- Evaluation & recording ---------------------------------------------
EVAL_EPISODES = 3               # episodes per evaluation pass
EVAL_MAX_STEPS = 18000          # step cap per evaluation episode
RECORD_STEPS = 1200             # length of each recorded video, in steps

# --- Output locations (everything lives under LOG_DIR) ------------------
LOG_DIR = "./runs_smw"
VIDEO_DIR, CKPT_DIR, TENSORBOARD_LOG = (
    os.path.join(LOG_DIR, sub_dir) for sub_dir in ("videos", "checkpoints", "tb")
)

# Create the folders this notebook writes to directly. TENSORBOARD_LOG is
# deliberately not created here; it is only passed on to the model.
for _directory in (LOG_DIR, CKPT_DIR, VIDEO_DIR):
    os.makedirs(_directory, exist_ok=True)

## Environment Functions

In [3]:
from wrappers import make_base_env
def _make_env_thunk(game: str, state: str):
    """Return a function that creates an environment (for multiprocessing)."""
    def _thunk():
        return make_base_env(game, state)
    return _thunk

def make_vec_env(game: str, state: str, n_envs: int, use_subproc: bool = True):
    """Build a vectorized environment made of ``n_envs`` copies of one level.

    Args:
        game: Game identifier forwarded to each environment factory.
        state: Level / save state forwarded to each environment factory.
        n_envs: How many environment factories to create.
        use_subproc: When true and more than one environment is requested,
            wrap the factories in ``SubprocVecEnv``; otherwise use
            ``DummyVecEnv``.

    Returns:
        The ``SubprocVecEnv`` or ``DummyVecEnv`` wrapping the factories.
    """
    factories = [_make_env_thunk(game, state) for _ in range(n_envs)]

    # A single environment (or an explicit opt-out) stays on DummyVecEnv.
    if not use_subproc or n_envs <= 1:
        return DummyVecEnv(factories)

    return SubprocVecEnv(factories)

## Initialize Env & Model

In [4]:
# 1. Create Training Environment
train_env = make_vec_env(GAME, STATE, n_envs=N_ENVS)
# train_env = VecNormalize(train_env, norm_obs=True, norm_reward=True, clip_obs=10., clip_reward=10.)
print(f"Environment created: {GAME} - {STATE} with {N_ENVS} parallel envs.")

# Checkpoint to resume from. Use None (not the string "None") to start a
# fresh model, so the decision never depends on whether a file that happens
# to be called "None" exists in the working directory.
# Author's note from an earlier run: "6225920 (19)" showed breakage
# (exact meaning not documented).
checkpoint_path = None
# checkpoint_path = "runs_smw/checkpoints/P_RWD_18.zip"

# Progress bookkeeping shared with the training-loop cell.
best_mean = -1e18   # best evaluation mean so far (sentinel: lower than any real return)
trained = 0         # timesteps trained so far
round_idx = 0       # number of completed training rounds

# 2. Initialize Model
if checkpoint_path is not None and os.path.exists(checkpoint_path):
    # Resume from the existing checkpoint.
    model = CustomPPO.load(
        checkpoint_path,
        env=train_env,
        device="cuda:0",  # pin to the first GPU (requires CUDA to be available)
    )
    # Continue counting from what the checkpoint has already seen; integer
    # division keeps the round index an exact int.
    trained = model.num_timesteps
    round_idx = trained // TRAIN_CHUNK
    print(f"[Success] Loaded model from {checkpoint_path}")
    print(f"trained: {trained}, round_index: {round_idx}")
else:
    print(f"[Fail] Can't load {checkpoint_path}. Will use new model")
    model = CustomPPO(
        VisionBackbonePolicy,
        train_env,
        # NOTE(review): presumably image scaling is handled by the wrappers /
        # backbone, hence normalize_images=False — confirm.
        policy_kwargs   = dict(normalize_images=False),
        n_epochs        = 4,
        n_steps         = 512,      # per env: 512 * N_ENVS (16) = 8192 steps per rollout
        batch_size      = 512,
        learning_rate   = LEARNING_RATE,
        verbose         = 1,
        gamma           = 0.96875,  # = 1 - 1/32
        kl_coef         = 1,        # CustomPPO-specific argument (see custom_policy)
        clip_range      = 0.125,
        ent_coef        = 0.025,
        tensorboard_log = TENSORBOARD_LOG,
    )

Environment created: SuperMarioWorld-Snes - YoshiIsland1 with 16 parallel envs.
[Fail] Can't load None. Will use new model
Using cuda:0 device


In [5]:
# model.save("policy")

In [6]:
# import os
# import glob
# from custom_policy import CustomPPO
# from eval import record_video  # 確保 eval.py 在同一目錄下

# # ================= 設定區 =================
# # 遊戲設定 (請確保跟訓練時一致)
# # target_numbers = [3932160, 6225920, 12451840] 

# # 方法 B: 自動搜尋資料夾下所有 PIPE_{number}.zip (如果你想全部測的話，把下面解註解)
# files = glob.glob(os.path.join(CKPT_DIR, "SF84G_*.zip"))
# target_numbers = list(range(35, 40))

# # ================= 執行迴圈 =================
# print(f"準備測試以下 Checkpoints: {target_numbers}")

# for num in target_numbers:
#     model_path = os.path.join(CKPT_DIR, f"PIPE_{num}.zip")
    
#     # 檢查檔案是否存在
#     if not os.path.exists(model_path):
#         print(f"⚠️ 找不到檔案: {model_path}，跳過。")
#         continue
    
#     print(f"\n[{num}] 正在載入模型: {model_path} ...")
    
#     try:
#         # 1. 載入模型 (不需要 env 參數也能載入權重)
#         # 如果你有改過 CustomPPO 的參數，load 會自動讀取 zip 裡的設定
#         model = CustomPPO.load(model_path, device="auto") # device="auto" 會自動用 GPU
        
#         # 2. 錄製影片
#         prefix_name = f"test_{num}"
#         print(f"[{num}] 正在錄影 (長度 {RECORD_STEPS} steps)...")
        
#         record_video(
#             model=model,
#             game=GAME,
#             state=STATE,
#             out_dir=VIDEO_DIR,
#             video_len=RECORD_STEPS,
#             prefix=prefix_name
#         )
#         print(f"✅ 完成！影片已儲存為 {prefix_name}.mp4")
        
#     except Exception as e:
#         print(f"❌ 發生錯誤 (Model: {num}): {e}")

# print("\n所有測試結束。")

## Training Loop

In [7]:
# Run name: reused for the TensorBoard run, the checkpoint file names and the
# per-run video sub-folder.
label = "NoRun"

try:
    # Train in fixed-size rounds until the total budget is used up; every
    # round is followed by a checkpoint, an evaluation and a recorded video.
    while trained < TOTAL_STEPS:
        round_idx += 1

        # Tag is the zero-based chunk index, computed before `trained` advances.
        tagged_label = f"{label}_{trained // TRAIN_CHUNK}"

        # The last round may be shorter than a full chunk.
        chunk = min(TRAIN_CHUNK, TOTAL_STEPS - trained)
        # chunk = 2000

        print(f"\n=== Round {round_idx} | Learn {chunk} steps (Total trained: {trained}) ===")

        # --- Train ---
        # reset_num_timesteps=False keeps the model's step counter running
        # across rounds instead of restarting it at each learn() call.
        model.learn(total_timesteps=chunk, reset_num_timesteps=False, tb_log_name=label)
        trained += chunk

        # --- Save Checkpoint ---
        ckpt_path = os.path.join(CKPT_DIR, tagged_label + ".zip")
        model.save(ckpt_path)
        print(f"Saved checkpoint: {ckpt_path}")

        # --- Evaluate ---
        mean_ret, best_ret = evaluate_policy(
            model, GAME, STATE, n_episodes=EVAL_EPISODES, max_steps=EVAL_MAX_STEPS
        )
        print(f"[EVAL] Mean Return: {mean_ret:.3f}, Best Return: {best_ret:.3f}")

        # --- Save Best Model --- (currently disabled)
        # if mean_ret > best_mean:
        #     best_mean = mean_ret
        #     best_path = os.path.join(LOG_DIR, "best_model.zip")
        #     model.save(best_path)
        #     print(f"New best record. Saved to {best_path}")

        # --- Record Video ---
        # The prefix contains a sub-folder, so make sure it exists first.
        out_path = os.path.join(VIDEO_DIR, label)
        os.makedirs(out_path, exist_ok=True)
        video_prefix = f"{label}/{tagged_label}_{mean_ret:.2f}"
        record_video(model, GAME, STATE, VIDEO_DIR, video_len=RECORD_STEPS, prefix=video_prefix)

except KeyboardInterrupt:
    # Allow stopping a long run by hand without a traceback.
    print("\nTraining interrupted manually.")

finally:
    # Always release the environments, whether the run finished, was
    # interrupted or failed.
    train_env.close()
    print("Training finished. Environment closed.")

"""
tensorboard --logdir=./runs_smw/tb
"""


=== Round 1 | Learn 327680 steps (Total trained: 0) ===
Logging to ./runs_smw/tb/NoRun_0


-----------------------------
| time/              |      |
|    fps             | 1169 |
|    iterations      | 1    |
|    time_elapsed    | 7    |
|    total_timesteps | 8192 |
-----------------------------
------------------------------------------
| time/                 |                |
|    fps                | 893            |
|    iterations         | 2              |
|    time_elapsed       | 18             |
|    total_timesteps    | 16384          |
| train/                |                |
|    approx_kl          | 0.0051269354   |
|    entropy_loss       | -2.3           |
|    explained_variance | -0.00392       |
|    learning_rate      | 0.0001         |
|    loss               | -0.0482        |
|    mean_step_reward   | -0.00036650547 |
|    n_updates          | 4              |
|    policyGradLoss     | -0.00885       |
|    value_loss         | 0.0429         |
------------------------------------------
----------------------------------------
| time/           

Saved checkpoint: ./runs_smw/checkpoints/NoRun_0.zip
[EVAL] Mean Return: -1.154, Best Return: -0.274
Saved video to ./runs_smw/videos/NoRun/NoRun_0_-1.15.mp4

=== Round 2 | Learn 327680 steps (Total trained: 327680) ===
Logging to ./runs_smw/tb/NoRun_0
-------------------------------
| time/              |        |
|    fps             | 1144   |
|    iterations      | 1      |
|    time_elapsed    | 7      |
|    total_timesteps | 335872 |
-------------------------------
----------------------------------------
| time/                 |              |
|    fps                | 878          |
|    iterations         | 2            |
|    time_elapsed       | 18           |
|    total_timesteps    | 344064       |
| train/                |              |
|    approx_kl          | 0.0023141312 |
|    entropy_loss       | -2.25        |
|    explained_variance | 0.599        |
|    learning_rate      | 0.0001       |
|    loss               | -0.0583      |
|    mean_step_reward   | -0.01

Saved checkpoint: ./runs_smw/checkpoints/NoRun_1.zip
[EVAL] Mean Return: -1.151, Best Return: -0.271
Saved video to ./runs_smw/videos/NoRun/NoRun_1_-1.15.mp4

=== Round 3 | Learn 327680 steps (Total trained: 655360) ===
Logging to ./runs_smw/tb/NoRun_0
-------------------------------
| time/              |        |
|    fps             | 978    |
|    iterations      | 1      |
|    time_elapsed    | 8      |
|    total_timesteps | 663552 |
-------------------------------
----------------------------------------
| time/                 |              |
|    fps                | 833          |
|    iterations         | 2            |
|    time_elapsed       | 19           |
|    total_timesteps    | 671744       |
| train/                |              |
|    approx_kl          | 0.0028044062 |
|    entropy_loss       | -2.19        |
|    explained_variance | 0.382        |
|    learning_rate      | 0.0001       |
|    loss               | -0.0507      |
|    mean_step_reward   | -0.00

Saved checkpoint: ./runs_smw/checkpoints/NoRun_2.zip
[EVAL] Mean Return: -1.154, Best Return: -0.274
Saved video to ./runs_smw/videos/NoRun/NoRun_2_-1.15.mp4

=== Round 4 | Learn 327680 steps (Total trained: 983040) ===
Logging to ./runs_smw/tb/NoRun_0
-------------------------------
| time/              |        |
|    fps             | 993    |
|    iterations      | 1      |
|    time_elapsed    | 8      |
|    total_timesteps | 991232 |
-------------------------------
----------------------------------------
| time/                 |              |
|    fps                | 913          |
|    iterations         | 2            |
|    time_elapsed       | 17           |
|    total_timesteps    | 999424       |
| train/                |              |
|    approx_kl          | 0.0022459424 |
|    entropy_loss       | -2.22        |
|    explained_variance | 0.754        |
|    learning_rate      | 0.0001       |
|    loss               | -0.0571      |
|    mean_step_reward   | -0.00

Saved checkpoint: ./runs_smw/checkpoints/NoRun_3.zip
[EVAL] Mean Return: -1.174, Best Return: -0.294
Saved video to ./runs_smw/videos/NoRun/NoRun_3_-1.17.mp4

=== Round 5 | Learn 327680 steps (Total trained: 1310720) ===
Logging to ./runs_smw/tb/NoRun_0
--------------------------------
| time/              |         |
|    fps             | 1276    |
|    iterations      | 1       |
|    time_elapsed    | 6       |
|    total_timesteps | 1318912 |
--------------------------------
-----------------------------------------
| time/                 |               |
|    fps                | 915           |
|    iterations         | 2             |
|    time_elapsed       | 17            |
|    total_timesteps    | 1327104       |
| train/                |               |
|    approx_kl          | 0.0038887302  |
|    entropy_loss       | -2.22         |
|    explained_variance | 0.54          |
|    learning_rate      | 0.0001        |
|    loss               | -0.0589       |
|    mean_s

Saved checkpoint: ./runs_smw/checkpoints/NoRun_4.zip
[EVAL] Mean Return: -113.362, Best Return: -109.938
Saved video to ./runs_smw/videos/NoRun/NoRun_4_-113.36.mp4

=== Round 6 | Learn 327680 steps (Total trained: 1638400) ===
Logging to ./runs_smw/tb/NoRun_0
--------------------------------
| time/              |         |
|    fps             | 1010    |
|    iterations      | 1       |
|    time_elapsed    | 8       |
|    total_timesteps | 1646592 |
--------------------------------
----------------------------------------
| time/                 |              |
|    fps                | 826          |
|    iterations         | 2            |
|    time_elapsed       | 19           |
|    total_timesteps    | 1654784      |
| train/                |              |
|    approx_kl          | 0.0028427192 |
|    entropy_loss       | -2.21        |
|    explained_variance | 0.688        |
|    learning_rate      | 0.0001       |
|    loss               | -0.0552      |
|    mean_step_re

Saved checkpoint: ./runs_smw/checkpoints/NoRun_5.zip
[EVAL] Mean Return: -1.154, Best Return: -0.274
Saved video to ./runs_smw/videos/NoRun/NoRun_5_-1.15.mp4

=== Round 7 | Learn 327680 steps (Total trained: 1966080) ===
Logging to ./runs_smw/tb/NoRun_0
--------------------------------
| time/              |         |
|    fps             | 998     |
|    iterations      | 1       |
|    time_elapsed    | 8       |
|    total_timesteps | 1974272 |
--------------------------------
----------------------------------------
| time/                 |              |
|    fps                | 823          |
|    iterations         | 2            |
|    time_elapsed       | 19           |
|    total_timesteps    | 1982464      |
| train/                |              |
|    approx_kl          | 0.0021975925 |
|    entropy_loss       | -2.25        |
|    explained_variance | 0.803        |
|    learning_rate      | 0.0001       |
|    loss               | -0.0673      |
|    mean_step_reward  

Saved checkpoint: ./runs_smw/checkpoints/NoRun_6.zip
[EVAL] Mean Return: -1.174, Best Return: -0.294
Saved video to ./runs_smw/videos/NoRun/NoRun_6_-1.17.mp4

=== Round 8 | Learn 327680 steps (Total trained: 2293760) ===
Logging to ./runs_smw/tb/NoRun_0
--------------------------------
| time/              |         |
|    fps             | 977     |
|    iterations      | 1       |
|    time_elapsed    | 8       |
|    total_timesteps | 2301952 |
--------------------------------
----------------------------------------
| time/                 |              |
|    fps                | 886          |
|    iterations         | 2            |
|    time_elapsed       | 18           |
|    total_timesteps    | 2310144      |
| train/                |              |
|    approx_kl          | 0.003479539  |
|    entropy_loss       | -2.25        |
|    explained_variance | 0.691        |
|    learning_rate      | 0.0001       |
|    loss               | -0.0635      |
|    mean_step_reward  

Saved checkpoint: ./runs_smw/checkpoints/NoRun_7.zip
[EVAL] Mean Return: -1.151, Best Return: -0.271
Saved video to ./runs_smw/videos/NoRun/NoRun_7_-1.15.mp4

=== Round 9 | Learn 327680 steps (Total trained: 2621440) ===
Logging to ./runs_smw/tb/NoRun_0
--------------------------------
| time/              |         |
|    fps             | 1202    |
|    iterations      | 1       |
|    time_elapsed    | 6       |
|    total_timesteps | 2629632 |
--------------------------------
----------------------------------------
| time/                 |              |
|    fps                | 889          |
|    iterations         | 2            |
|    time_elapsed       | 18           |
|    total_timesteps    | 2637824      |
| train/                |              |
|    approx_kl          | 0.003803788  |
|    entropy_loss       | -2.25        |
|    explained_variance | 0.802        |
|    learning_rate      | 0.0001       |
|    loss               | -0.066       |
|    mean_step_reward  

Saved checkpoint: ./runs_smw/checkpoints/NoRun_8.zip
[EVAL] Mean Return: -1.151, Best Return: -0.271
Saved video to ./runs_smw/videos/NoRun/NoRun_8_-1.15.mp4

=== Round 10 | Learn 327680 steps (Total trained: 2949120) ===
Logging to ./runs_smw/tb/NoRun_0
--------------------------------
| time/              |         |
|    fps             | 1074    |
|    iterations      | 1       |
|    time_elapsed    | 7       |
|    total_timesteps | 2957312 |
--------------------------------
-----------------------------------------
| time/                 |               |
|    fps                | 849           |
|    iterations         | 2             |
|    time_elapsed       | 19            |
|    total_timesteps    | 2965504       |
| train/                |               |
|    approx_kl          | 0.0034970483  |
|    entropy_loss       | -2.23         |
|    explained_variance | 0.788         |
|    learning_rate      | 0.0001        |
|    loss               | -0.0618       |
|    mean_

Saved checkpoint: ./runs_smw/checkpoints/NoRun_9.zip
[EVAL] Mean Return: -1.151, Best Return: -0.271
Saved video to ./runs_smw/videos/NoRun/NoRun_9_-1.15.mp4

=== Round 11 | Learn 327680 steps (Total trained: 3276800) ===
Logging to ./runs_smw/tb/NoRun_0
--------------------------------
| time/              |         |
|    fps             | 983     |
|    iterations      | 1       |
|    time_elapsed    | 8       |
|    total_timesteps | 3284992 |
--------------------------------
-----------------------------------------
| time/                 |               |
|    fps                | 862           |
|    iterations         | 2             |
|    time_elapsed       | 19            |
|    total_timesteps    | 3293184       |
| train/                |               |
|    approx_kl          | 0.0027364679  |
|    entropy_loss       | -2.21         |
|    explained_variance | 0.772         |
|    learning_rate      | 0.0001        |
|    loss               | -0.0613       |
|    mean_

Saved checkpoint: ./runs_smw/checkpoints/NoRun_10.zip
[EVAL] Mean Return: -1.154, Best Return: -0.274
Saved video to ./runs_smw/videos/NoRun/NoRun_10_-1.15.mp4

=== Round 12 | Learn 327680 steps (Total trained: 3604480) ===
Logging to ./runs_smw/tb/NoRun_0
--------------------------------
| time/              |         |
|    fps             | 1375    |
|    iterations      | 1       |
|    time_elapsed    | 5       |
|    total_timesteps | 3612672 |
--------------------------------
-----------------------------------------
| time/                 |               |
|    fps                | 930           |
|    iterations         | 2             |
|    time_elapsed       | 17            |
|    total_timesteps    | 3620864       |
| train/                |               |
|    approx_kl          | 0.003470254   |
|    entropy_loss       | -2.23         |
|    explained_variance | 0.762         |
|    learning_rate      | 0.0001        |
|    loss               | -0.0616       |
|    mea

Saved checkpoint: ./runs_smw/checkpoints/NoRun_11.zip
[EVAL] Mean Return: -1.158, Best Return: -0.278
Saved video to ./runs_smw/videos/NoRun/NoRun_11_-1.16.mp4

=== Round 13 | Learn 327680 steps (Total trained: 3932160) ===
Logging to ./runs_smw/tb/NoRun_0
--------------------------------
| time/              |         |
|    fps             | 1080    |
|    iterations      | 1       |
|    time_elapsed    | 7       |
|    total_timesteps | 3940352 |
--------------------------------
-----------------------------------------
| time/                 |               |
|    fps                | 861           |
|    iterations         | 2             |
|    time_elapsed       | 19            |
|    total_timesteps    | 3948544       |
| train/                |               |
|    approx_kl          | 0.0043416447  |
|    entropy_loss       | -2.24         |
|    explained_variance | 0.866         |
|    learning_rate      | 0.0001        |
|    loss               | -0.0717       |
|    mea

Saved checkpoint: ./runs_smw/checkpoints/NoRun_12.zip
[EVAL] Mean Return: -115.210, Best Return: -111.573
Saved video to ./runs_smw/videos/NoRun/NoRun_12_-115.21.mp4

=== Round 14 | Learn 327680 steps (Total trained: 4259840) ===
Logging to ./runs_smw/tb/NoRun_0
--------------------------------
| time/              |         |
|    fps             | 1169    |
|    iterations      | 1       |
|    time_elapsed    | 7       |
|    total_timesteps | 4268032 |
--------------------------------
----------------------------------------
| time/                 |              |
|    fps                | 910          |
|    iterations         | 2            |
|    time_elapsed       | 17           |
|    total_timesteps    | 4276224      |
| train/                |              |
|    approx_kl          | 0.0035106766 |
|    entropy_loss       | -2.22        |
|    explained_variance | 0.881        |
|    learning_rate      | 0.0001       |
|    loss               | -0.0623      |
|    mean_step

Saved checkpoint: ./runs_smw/checkpoints/NoRun_13.zip
[EVAL] Mean Return: -1.158, Best Return: -0.278
Saved video to ./runs_smw/videos/NoRun/NoRun_13_-1.16.mp4

=== Round 15 | Learn 327680 steps (Total trained: 4587520) ===
Logging to ./runs_smw/tb/NoRun_0
--------------------------------
| time/              |         |
|    fps             | 977     |
|    iterations      | 1       |
|    time_elapsed    | 8       |
|    total_timesteps | 4595712 |
--------------------------------
----------------------------------------
| time/                 |              |
|    fps                | 847          |
|    iterations         | 2            |
|    time_elapsed       | 19           |
|    total_timesteps    | 4603904      |
| train/                |              |
|    approx_kl          | 0.0056002457 |
|    entropy_loss       | -2.22        |
|    explained_variance | 0.912        |
|    learning_rate      | 0.0001       |
|    loss               | -0.0668      |
|    mean_step_rewar

Saved checkpoint: ./runs_smw/checkpoints/NoRun_14.zip
[EVAL] Mean Return: -1.158, Best Return: -0.278
Saved video to ./runs_smw/videos/NoRun/NoRun_14_-1.16.mp4

=== Round 16 | Learn 327680 steps (Total trained: 4915200) ===
Logging to ./runs_smw/tb/NoRun_0
--------------------------------
| time/              |         |
|    fps             | 1255    |
|    iterations      | 1       |
|    time_elapsed    | 6       |
|    total_timesteps | 4923392 |
--------------------------------
----------------------------------------
| time/                 |              |
|    fps                | 902          |
|    iterations         | 2            |
|    time_elapsed       | 18           |
|    total_timesteps    | 4931584      |
| train/                |              |
|    approx_kl          | 0.0026454246 |
|    entropy_loss       | -2.24        |
|    explained_variance | 0.839        |
|    learning_rate      | 0.0001       |
|    loss               | -0.0623      |
|    mean_step_rewar

Saved checkpoint: ./runs_smw/checkpoints/NoRun_15.zip
[EVAL] Mean Return: -20.290, Best Return: -16.653
Saved video to ./runs_smw/videos/NoRun/NoRun_15_-20.29.mp4

=== Round 17 | Learn 327680 steps (Total trained: 5242880) ===
Logging to ./runs_smw/tb/NoRun_0
--------------------------------
| time/              |         |
|    fps             | 1106    |
|    iterations      | 1       |
|    time_elapsed    | 7       |
|    total_timesteps | 5251072 |
--------------------------------
-----------------------------------------
| time/                 |               |
|    fps                | 860           |
|    iterations         | 2             |
|    time_elapsed       | 19            |
|    total_timesteps    | 5259264       |
| train/                |               |
|    approx_kl          | 0.0037950207  |
|    entropy_loss       | -2.2          |
|    explained_variance | 0.942         |
|    learning_rate      | 0.0001        |
|    loss               | -0.0674       |
|    

Saved checkpoint: ./runs_smw/checkpoints/NoRun_16.zip
[EVAL] Mean Return: -61.995, Best Return: -58.279
Saved video to ./runs_smw/videos/NoRun/NoRun_16_-62.00.mp4

=== Round 18 | Learn 327680 steps (Total trained: 5570560) ===
Logging to ./runs_smw/tb/NoRun_0
--------------------------------
| time/              |         |
|    fps             | 970     |
|    iterations      | 1       |
|    time_elapsed    | 8       |
|    total_timesteps | 5578752 |
--------------------------------
-----------------------------------------
| time/                 |               |
|    fps                | 902           |
|    iterations         | 2             |
|    time_elapsed       | 18            |
|    total_timesteps    | 5586944       |
| train/                |               |
|    approx_kl          | 0.003356682   |
|    entropy_loss       | -2.21         |
|    explained_variance | 0.872         |
|    learning_rate      | 0.0001        |
|    loss               | -0.0667       |
|    

Saved checkpoint: ./runs_smw/checkpoints/NoRun_17.zip
[EVAL] Mean Return: -19.909, Best Return: -16.286
Saved video to ./runs_smw/videos/NoRun/NoRun_17_-19.91.mp4

=== Round 19 | Learn 327680 steps (Total trained: 5898240) ===
Logging to ./runs_smw/tb/NoRun_0
--------------------------------
| time/              |         |
|    fps             | 1000    |
|    iterations      | 1       |
|    time_elapsed    | 8       |
|    total_timesteps | 5906432 |
--------------------------------
----------------------------------------
| time/                 |              |
|    fps                | 830          |
|    iterations         | 2            |
|    time_elapsed       | 19           |
|    total_timesteps    | 5914624      |
| train/                |              |
|    approx_kl          | 0.005558746  |
|    entropy_loss       | -2.16        |
|    explained_variance | 0.897        |
|    learning_rate      | 0.0001       |
|    loss               | -0.0668      |
|    mean_step_re

Saved checkpoint: ./runs_smw/checkpoints/NoRun_18.zip
[EVAL] Mean Return: -20.436, Best Return: -16.693
Saved video to ./runs_smw/videos/NoRun/NoRun_18_-20.44.mp4

=== Round 20 | Learn 327680 steps (Total trained: 6225920) ===
Logging to ./runs_smw/tb/NoRun_0
--------------------------------
| time/              |         |
|    fps             | 997     |
|    iterations      | 1       |
|    time_elapsed    | 8       |
|    total_timesteps | 6234112 |
--------------------------------
-----------------------------------------
| time/                 |               |
|    fps                | 871           |
|    iterations         | 2             |
|    time_elapsed       | 18            |
|    total_timesteps    | 6242304       |
| train/                |               |
|    approx_kl          | 0.0045519597  |
|    entropy_loss       | -2.17         |
|    explained_variance | 0.859         |
|    learning_rate      | 0.0001        |
|    loss               | -0.0704       |
|    

Saved checkpoint: ./runs_smw/checkpoints/NoRun_19.zip
[EVAL] Mean Return: -108.035, Best Return: -104.265
Saved video to ./runs_smw/videos/NoRun/NoRun_19_-108.03.mp4

=== Round 21 | Learn 327680 steps (Total trained: 6553600) ===
Logging to ./runs_smw/tb/NoRun_0
--------------------------------
| time/              |         |
|    fps             | 986     |
|    iterations      | 1       |
|    time_elapsed    | 8       |
|    total_timesteps | 6561792 |
--------------------------------
-----------------------------------------
| time/                 |               |
|    fps                | 914           |
|    iterations         | 2             |
|    time_elapsed       | 17            |
|    total_timesteps    | 6569984       |
| train/                |               |
|    approx_kl          | 0.005004029   |
|    entropy_loss       | -2.17         |
|    explained_variance | 0.905         |
|    learning_rate      | 0.0001        |
|    loss               | -0.0629       |
| 

Saved checkpoint: ./runs_smw/checkpoints/NoRun_20.zip
[EVAL] Mean Return: -64.174, Best Return: -60.471
Saved video to ./runs_smw/videos/NoRun/NoRun_20_-64.17.mp4

=== Round 22 | Learn 327680 steps (Total trained: 6881280) ===
Logging to ./runs_smw/tb/NoRun_0
--------------------------------
| time/              |         |
|    fps             | 1118    |
|    iterations      | 1       |
|    time_elapsed    | 7       |
|    total_timesteps | 6889472 |
--------------------------------
-----------------------------------------
| time/                 |               |
|    fps                | 888           |
|    iterations         | 2             |
|    time_elapsed       | 18            |
|    total_timesteps    | 6897664       |
| train/                |               |
|    approx_kl          | 0.004008344   |
|    entropy_loss       | -2.18         |
|    explained_variance | 0.927         |
|    learning_rate      | 0.0001        |
|    loss               | -0.0661       |
|    

Saved checkpoint: ./runs_smw/checkpoints/NoRun_21.zip
[EVAL] Mean Return: -20.517, Best Return: -16.827
Saved video to ./runs_smw/videos/NoRun/NoRun_21_-20.52.mp4

=== Round 23 | Learn 327680 steps (Total trained: 7208960) ===
Logging to ./runs_smw/tb/NoRun_0
--------------------------------
| time/              |         |
|    fps             | 987     |
|    iterations      | 1       |
|    time_elapsed    | 8       |
|    total_timesteps | 7217152 |
--------------------------------
-----------------------------------------
| time/                 |               |
|    fps                | 913           |
|    iterations         | 2             |
|    time_elapsed       | 17            |
|    total_timesteps    | 7225344       |
| train/                |               |
|    approx_kl          | 0.004614373   |
|    entropy_loss       | -2.16         |
|    explained_variance | 0.82          |
|    learning_rate      | 0.0001        |
|    loss               | -0.0678       |
|    

Saved checkpoint: ./runs_smw/checkpoints/NoRun_22.zip
[EVAL] Mean Return: -19.921, Best Return: -16.271
Saved video to ./runs_smw/videos/NoRun/NoRun_22_-19.92.mp4

=== Round 24 | Learn 327680 steps (Total trained: 7536640) ===
Logging to ./runs_smw/tb/NoRun_0
--------------------------------
| time/              |         |
|    fps             | 980     |
|    iterations      | 1       |
|    time_elapsed    | 8       |
|    total_timesteps | 7544832 |
--------------------------------
-----------------------------------------
| time/                 |               |
|    fps                | 841           |
|    iterations         | 2             |
|    time_elapsed       | 19            |
|    total_timesteps    | 7553024       |
| train/                |               |
|    approx_kl          | 0.004592429   |
|    entropy_loss       | -2.18         |
|    explained_variance | 0.9           |
|    learning_rate      | 0.0001        |
|    loss               | -0.0659       |
|    

Saved checkpoint: ./runs_smw/checkpoints/NoRun_23.zip
[EVAL] Mean Return: -21.911, Best Return: -18.275
Saved video to ./runs_smw/videos/NoRun/NoRun_23_-21.91.mp4

=== Round 25 | Learn 327680 steps (Total trained: 7864320) ===
Logging to ./runs_smw/tb/NoRun_0
--------------------------------
| time/              |         |
|    fps             | 987     |
|    iterations      | 1       |
|    time_elapsed    | 8       |
|    total_timesteps | 7872512 |
--------------------------------
----------------------------------------
| time/                 |              |
|    fps                | 875          |
|    iterations         | 2            |
|    time_elapsed       | 18           |
|    total_timesteps    | 7880704      |
| train/                |              |
|    approx_kl          | 0.003466377  |
|    entropy_loss       | -2.18        |
|    explained_variance | 0.948        |
|    learning_rate      | 0.0001       |
|    loss               | -0.0689      |
|    mean_step_re

Saved checkpoint: ./runs_smw/checkpoints/NoRun_24.zip
[EVAL] Mean Return: -20.010, Best Return: -16.480
Saved video to ./runs_smw/videos/NoRun/NoRun_24_-20.01.mp4

=== Round 26 | Learn 327680 steps (Total trained: 8192000) ===
Logging to ./runs_smw/tb/NoRun_0
--------------------------------
| time/              |         |
|    fps             | 1162    |
|    iterations      | 1       |
|    time_elapsed    | 7       |
|    total_timesteps | 8200192 |
--------------------------------
-----------------------------------------
| time/                 |               |
|    fps                | 883           |
|    iterations         | 2             |
|    time_elapsed       | 18            |
|    total_timesteps    | 8208384       |
| train/                |               |
|    approx_kl          | 0.0039495346  |
|    entropy_loss       | -2.16         |
|    explained_variance | 0.903         |
|    learning_rate      | 0.0001        |
|    loss               | -0.0676       |
|    

Saved checkpoint: ./runs_smw/checkpoints/NoRun_25.zip
[EVAL] Mean Return: -107.441, Best Return: -103.871
Saved video to ./runs_smw/videos/NoRun/NoRun_25_-107.44.mp4

=== Round 27 | Learn 327680 steps (Total trained: 8519680) ===
Logging to ./runs_smw/tb/NoRun_0
--------------------------------
| time/              |         |
|    fps             | 1075    |
|    iterations      | 1       |
|    time_elapsed    | 7       |
|    total_timesteps | 8527872 |
--------------------------------
----------------------------------------
| time/                 |              |
|    fps                | 865          |
|    iterations         | 2            |
|    time_elapsed       | 18           |
|    total_timesteps    | 8536064      |
| train/                |              |
|    approx_kl          | 0.006065361  |
|    entropy_loss       | -2.14        |
|    explained_variance | 0.947        |
|    learning_rate      | 0.0001       |
|    loss               | -0.0663      |
|    mean_step

Saved checkpoint: ./runs_smw/checkpoints/NoRun_26.zip
[EVAL] Mean Return: -20.055, Best Return: -16.472
Saved video to ./runs_smw/videos/NoRun/NoRun_26_-20.06.mp4

=== Round 28 | Learn 327680 steps (Total trained: 8847360) ===
Logging to ./runs_smw/tb/NoRun_0
--------------------------------
| time/              |         |
|    fps             | 982     |
|    iterations      | 1       |
|    time_elapsed    | 8       |
|    total_timesteps | 8855552 |
--------------------------------
----------------------------------------
| time/                 |              |
|    fps                | 919          |
|    iterations         | 2            |
|    time_elapsed       | 17           |
|    total_timesteps    | 8863744      |
| train/                |              |
|    approx_kl          | 0.0047173956 |
|    entropy_loss       | -2.15        |
|    explained_variance | 0.898        |
|    learning_rate      | 0.0001       |
|    loss               | -0.0691      |
|    mean_step_re

Saved checkpoint: ./runs_smw/checkpoints/NoRun_27.zip
[EVAL] Mean Return: -20.255, Best Return: -16.872
Saved video to ./runs_smw/videos/NoRun/NoRun_27_-20.26.mp4

=== Round 29 | Learn 327680 steps (Total trained: 9175040) ===
Logging to ./runs_smw/tb/NoRun_0
--------------------------------
| time/              |         |
|    fps             | 1022    |
|    iterations      | 1       |
|    time_elapsed    | 8       |
|    total_timesteps | 9183232 |
--------------------------------
----------------------------------------
| time/                 |              |
|    fps                | 832          |
|    iterations         | 2            |
|    time_elapsed       | 19           |
|    total_timesteps    | 9191424      |
| train/                |              |
|    approx_kl          | 0.006542147  |
|    entropy_loss       | -2.12        |
|    explained_variance | 0.865        |
|    learning_rate      | 0.0001       |
|    loss               | -0.0692      |
|    mean_step_re

Saved checkpoint: ./runs_smw/checkpoints/NoRun_28.zip
[EVAL] Mean Return: -64.092, Best Return: -60.602
Saved video to ./runs_smw/videos/NoRun/NoRun_28_-64.09.mp4

=== Round 30 | Learn 327680 steps (Total trained: 9502720) ===
Logging to ./runs_smw/tb/NoRun_0
--------------------------------
| time/              |         |
|    fps             | 993     |
|    iterations      | 1       |
|    time_elapsed    | 8       |
|    total_timesteps | 9510912 |
--------------------------------
-----------------------------------------
| time/                 |               |
|    fps                | 834           |
|    iterations         | 2             |
|    time_elapsed       | 19            |
|    total_timesteps    | 9519104       |
| train/                |               |
|    approx_kl          | 0.0059357784  |
|    entropy_loss       | -2.13         |
|    explained_variance | 0.801         |
|    learning_rate      | 0.0001        |
|    loss               | -0.0643       |
|    

Saved checkpoint: ./runs_smw/checkpoints/NoRun_29.zip
[EVAL] Mean Return: -20.239, Best Return: -16.789
Saved video to ./runs_smw/videos/NoRun/NoRun_29_-20.24.mp4

=== Round 31 | Learn 327680 steps (Total trained: 9830400) ===
Logging to ./runs_smw/tb/NoRun_0
--------------------------------
| time/              |         |
|    fps             | 1044    |
|    iterations      | 1       |
|    time_elapsed    | 7       |
|    total_timesteps | 9838592 |
--------------------------------
-----------------------------------------
| time/                 |               |
|    fps                | 845           |
|    iterations         | 2             |
|    time_elapsed       | 19            |
|    total_timesteps    | 9846784       |
| train/                |               |
|    approx_kl          | 0.0035274522  |
|    entropy_loss       | -2.17         |
|    explained_variance | 0.971         |
|    learning_rate      | 0.0001        |
|    loss               | -0.0566       |
|    

Saved checkpoint: ./runs_smw/checkpoints/NoRun_30.zip
[EVAL] Mean Return: -20.932, Best Return: -17.522
Saved video to ./runs_smw/videos/NoRun/NoRun_30_-20.93.mp4

=== Round 32 | Learn 327680 steps (Total trained: 10158080) ===
Logging to ./runs_smw/tb/NoRun_0
---------------------------------
| time/              |          |
|    fps             | 1252     |
|    iterations      | 1        |
|    time_elapsed    | 6        |
|    total_timesteps | 10166272 |
---------------------------------
----------------------------------------
| time/                 |              |
|    fps                | 896          |
|    iterations         | 2            |
|    time_elapsed       | 18           |
|    total_timesteps    | 10174464     |
| train/                |              |
|    approx_kl          | 0.0047856756 |
|    entropy_loss       | -2.1         |
|    explained_variance | 0.993        |
|    learning_rate      | 0.0001       |
|    loss               | -0.0656      |
|    mean

Saved checkpoint: ./runs_smw/checkpoints/NoRun_31.zip
[EVAL] Mean Return: -19.804, Best Return: -16.420
Saved video to ./runs_smw/videos/NoRun/NoRun_31_-19.80.mp4

=== Round 33 | Learn 327680 steps (Total trained: 10485760) ===
Logging to ./runs_smw/tb/NoRun_0
---------------------------------
| time/              |          |
|    fps             | 1012     |
|    iterations      | 1        |
|    time_elapsed    | 8        |
|    total_timesteps | 10493952 |
---------------------------------
-----------------------------------------
| time/                 |               |
|    fps                | 834           |
|    iterations         | 2             |
|    time_elapsed       | 19            |
|    total_timesteps    | 10502144      |
| train/                |               |
|    approx_kl          | 0.0038890382  |
|    entropy_loss       | -2.14         |
|    explained_variance | 0.967         |
|    learning_rate      | 0.0001        |
|    loss               | -0.059       

Saved checkpoint: ./runs_smw/checkpoints/NoRun_32.zip
[EVAL] Mean Return: -121.385, Best Return: -119.932
Saved video to ./runs_smw/videos/NoRun/NoRun_32_-121.39.mp4

=== Round 34 | Learn 327680 steps (Total trained: 10813440) ===
Logging to ./runs_smw/tb/NoRun_0
---------------------------------
| time/              |          |
|    fps             | 984      |
|    iterations      | 1        |
|    time_elapsed    | 8        |
|    total_timesteps | 10821632 |
---------------------------------
----------------------------------------
| time/                 |              |
|    fps                | 881          |
|    iterations         | 2            |
|    time_elapsed       | 18           |
|    total_timesteps    | 10829824     |
| train/                |              |
|    approx_kl          | 0.0050168196 |
|    entropy_loss       | -2.14        |
|    explained_variance | 0.836        |
|    learning_rate      | 0.0001       |
|    loss               | -0.0652      |
|    m

Saved checkpoint: ./runs_smw/checkpoints/NoRun_33.zip
[EVAL] Mean Return: -121.446, Best Return: -119.926
Saved video to ./runs_smw/videos/NoRun/NoRun_33_-121.45.mp4

=== Round 35 | Learn 327680 steps (Total trained: 11141120) ===
Logging to ./runs_smw/tb/NoRun_0
---------------------------------
| time/              |          |
|    fps             | 1281     |
|    iterations      | 1        |
|    time_elapsed    | 6        |
|    total_timesteps | 11149312 |
---------------------------------
----------------------------------------
| time/                 |              |
|    fps                | 912          |
|    iterations         | 2            |
|    time_elapsed       | 17           |
|    total_timesteps    | 11157504     |
| train/                |              |
|    approx_kl          | 0.00747968   |
|    entropy_loss       | -2.1         |
|    explained_variance | 0.928        |
|    learning_rate      | 0.0001       |
|    loss               | -0.0632      |
|    m

Saved checkpoint: ./runs_smw/checkpoints/NoRun_34.zip
[EVAL] Mean Return: -19.928, Best Return: -16.411
Saved video to ./runs_smw/videos/NoRun/NoRun_34_-19.93.mp4

=== Round 36 | Learn 327680 steps (Total trained: 11468800) ===
Logging to ./runs_smw/tb/NoRun_0
---------------------------------
| time/              |          |
|    fps             | 971      |
|    iterations      | 1        |
|    time_elapsed    | 8        |
|    total_timesteps | 11476992 |
---------------------------------
-----------------------------------------
| time/                 |               |
|    fps                | 884           |
|    iterations         | 2             |
|    time_elapsed       | 18            |
|    total_timesteps    | 11485184      |
| train/                |               |
|    approx_kl          | 0.006480677   |
|    entropy_loss       | -2.14         |
|    explained_variance | 0.917         |
|    learning_rate      | 0.0001        |
|    loss               | -0.0599      

Saved checkpoint: ./runs_smw/checkpoints/NoRun_35.zip
[EVAL] Mean Return: -20.983, Best Return: -17.520
Saved video to ./runs_smw/videos/NoRun/NoRun_35_-20.98.mp4

=== Round 37 | Learn 327680 steps (Total trained: 11796480) ===
Logging to ./runs_smw/tb/NoRun_0
---------------------------------
| time/              |          |
|    fps             | 976      |
|    iterations      | 1        |
|    time_elapsed    | 8        |
|    total_timesteps | 11804672 |
---------------------------------
-----------------------------------------
| time/                 |               |
|    fps                | 908           |
|    iterations         | 2             |
|    time_elapsed       | 18            |
|    total_timesteps    | 11812864      |
| train/                |               |
|    approx_kl          | 0.0056221383  |
|    entropy_loss       | -2.15         |
|    explained_variance | 0.91          |
|    learning_rate      | 0.0001        |
|    loss               | -0.0538      

Saved checkpoint: ./runs_smw/checkpoints/NoRun_36.zip
[EVAL] Mean Return: -19.612, Best Return: -16.055
Saved video to ./runs_smw/videos/NoRun/NoRun_36_-19.61.mp4

=== Round 38 | Learn 327680 steps (Total trained: 12124160) ===
Logging to ./runs_smw/tb/NoRun_0
---------------------------------
| time/              |          |
|    fps             | 1053     |
|    iterations      | 1        |
|    time_elapsed    | 7        |
|    total_timesteps | 12132352 |
---------------------------------
----------------------------------------
| time/                 |              |
|    fps                | 844          |
|    iterations         | 2            |
|    time_elapsed       | 19           |
|    total_timesteps    | 12140544     |
| train/                |              |
|    approx_kl          | 0.004830826  |
|    entropy_loss       | -2.14        |
|    explained_variance | 0.735        |
|    learning_rate      | 0.0001       |
|    loss               | -0.066       |
|    mean

Saved checkpoint: ./runs_smw/checkpoints/NoRun_37.zip
[EVAL] Mean Return: -19.606, Best Return: -16.009
Saved video to ./runs_smw/videos/NoRun/NoRun_37_-19.61.mp4

=== Round 39 | Learn 327680 steps (Total trained: 12451840) ===
Logging to ./runs_smw/tb/NoRun_0
---------------------------------
| time/              |          |
|    fps             | 977      |
|    iterations      | 1        |
|    time_elapsed    | 8        |
|    total_timesteps | 12460032 |
---------------------------------
----------------------------------------
| time/                 |              |
|    fps                | 911          |
|    iterations         | 2            |
|    time_elapsed       | 17           |
|    total_timesteps    | 12468224     |
| train/                |              |
|    approx_kl          | 0.006671565  |
|    entropy_loss       | -2.07        |
|    explained_variance | 0.861        |
|    learning_rate      | 0.0001       |
|    loss               | -0.0618      |
|    mean

Saved checkpoint: ./runs_smw/checkpoints/NoRun_38.zip
[EVAL] Mean Return: -19.542, Best Return: -16.078
Saved video to ./runs_smw/videos/NoRun/NoRun_38_-19.54.mp4

=== Round 40 | Learn 327680 steps (Total trained: 12779520) ===
Logging to ./runs_smw/tb/NoRun_0
---------------------------------
| time/              |          |
|    fps             | 1216     |
|    iterations      | 1        |
|    time_elapsed    | 6        |
|    total_timesteps | 12787712 |
---------------------------------
----------------------------------------
| time/                 |              |
|    fps                | 884          |
|    iterations         | 2            |
|    time_elapsed       | 18           |
|    total_timesteps    | 12795904     |
| train/                |              |
|    approx_kl          | 0.006091073  |
|    entropy_loss       | -2.12        |
|    explained_variance | 0.972        |
|    learning_rate      | 0.0001       |
|    loss               | -0.0847      |
|    mean

Saved checkpoint: ./runs_smw/checkpoints/NoRun_39.zip
[EVAL] Mean Return: -19.669, Best Return: -16.179
Saved video to ./runs_smw/videos/NoRun/NoRun_39_-19.67.mp4
Training finished. Environment closed.


'\ntensorboard --logdir=./runs_smw/tb\n'

## Display Video

In [None]:
from IPython.display import Video, display
import glob

# Play the newest recorded evaluation video inline in the notebook.
# Recordings are saved inside per-run subfolders of VIDEO_DIR (for example
# VIDEO_DIR/NoRun/NoRun_39_-19.67.mp4), so the search must be recursive:
# a flat "*.mp4" pattern only matches files placed directly in VIDEO_DIR.
# With recursive=True, "**" also matches zero directories, so videos stored
# directly in VIDEO_DIR are still found.
list_of_files = glob.glob(os.path.join(VIDEO_DIR, '**', '*.mp4'), recursive=True)
if list_of_files:
    # Newest file according to os.path.getctime.
    latest_file = max(list_of_files, key=os.path.getctime)
    print(f"Playing: {latest_file}")
    # embed=True stores the video data in the notebook output itself.
    display(Video(latest_file, embed=True, width=600))
else:
    print("No videos found yet.")

In [None]:
import cv2

# Step through a recorded video one frame at a time in an OpenCV window.
# Controls: press 'n' to show the next frame, 'q' to quit. Any other key is
# ignored and the current frame stays on screen.
video_path = "runs_smw/videos/test_16.mp4"
cap = cv2.VideoCapture(video_path)

try:
    if not cap.isOpened():
        # Report the failure instead of silently skipping the loop.
        print(f"Could not open video: {video_path}")

    while cap.isOpened():
        ret, frame = cap.read()
        if not ret:
            # End of the video (or a read error): nothing more to show.
            break

        cv2.imshow("Frame-by-Frame", frame)

        # Key point: block here until the user presses a recognised key.
        quit_requested = False
        while True:
            raw_key = cv2.waitKey(0)
            if raw_key < 0:
                # No key was delivered (presumably the window is gone);
                # stop rather than spin forever waiting for input.
                quit_requested = True
                break
            # Keep only the low byte so the comparison with ord(...) also
            # works when the platform sets extra bits in the return value.
            key = raw_key & 0xFF
            if key == ord('q'):
                quit_requested = True
                break
            if key == ord('n'):
                break

        if quit_requested:
            break
finally:
    # Always free the capture handle and close the window, even on error.
    cap.release()
    cv2.destroyAllWindows()