# 2025 DL Lab8: RL Assignment_Super Mario World

**Your Answer:**    
Hi I'm XXX, XXXXXXXXXX.

## Overview
This project implements a **Deep Reinforcement Learning** pipeline to train an autonomous agent for Super Mario World. Leveraging the **Proximal Policy Optimization (PPO)** algorithm, the system interacts with the **stable-retro** environment to master the YoshiIsland1 level. Key components include a custom Vision Backbone for extracting features from raw pixel data and a suite of Environment Wrappers that handle frame preprocessing, action discretization, and reward shaping to facilitate efficient learning.

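- Implement the reward function.
- Something must be done right at the beginning of the level (monster attack).
- Implement the custom PPO.
- The pretrained weights are all roughly equivalent; what mainly matters is the reward function.
- Model weight size limit: 1 GB.
- Do not change the existing class names (new classes may be added, but the original ones must stay as they are).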

## Imports

In [1]:
import os
import numpy as np
import retro
from stable_baselines3.common.vec_env import DummyVecEnv, SubprocVecEnv
from stable_baselines3.common.vec_env import VecNormalize

from eval import evaluate_policy, record_video
from custom_policy import VisionBackbonePolicy, CustomPPO

  from .autonotebook import tqdm as notebook_tqdm


## Configuration

In [None]:
# --- Game ---
GAME = "SuperMarioWorld-Snes"
STATE = "YoshiIsland1"

# --- Training budget (all values are environment steps) ---
# BASE_CHUNK happens to equal one rollout of the model configured in the
# initialisation cell (n_steps=512 across N_ENVS=16 environments).
BASE_CHUNK = 8192
TRAIN_CHUNK = 32 * BASE_CHUNK      # steps trained per round of the training loop
TOTAL_STEPS = 160 * TRAIN_CHUNK    # overall budget at which the training loop stops
N_ENVS = 16                        # number of parallel environments

# --- Evaluation & recording ---
EVAL_EPISODES = 3
EVAL_MAX_STEPS = 18000
RECORD_STEPS = 1200

# --- Output locations (everything lives under LOG_DIR) ---
LOG_DIR = "./runs_smw"
VIDEO_DIR = os.path.join(LOG_DIR, "videos")
CKPT_DIR = os.path.join(LOG_DIR, "checkpoints")
TENSORBOARD_LOG = os.path.join(LOG_DIR, "tb")

# The TensorBoard directory is intentionally not created here; only the
# log root, checkpoint and video directories are.
for _directory in (LOG_DIR, CKPT_DIR, VIDEO_DIR):
    os.makedirs(_directory, exist_ok=True)

## Environment Functions

In [3]:
from wrappers import make_base_env
def _make_env_thunk(game: str, state: str):
    """Return a function that creates an environment (for multiprocessing)."""
    def _thunk():
        return make_base_env(game, state)
    return _thunk

def make_vec_env(game: str, state: str, n_envs: int, use_subproc: bool = True):
    """Create a vectorized environment holding ``n_envs`` copies of one level.

    Args:
        game: Game identifier passed to every environment factory.
        state: State identifier passed to every environment factory.
        n_envs: Number of environment copies to create; must be at least 1.
        use_subproc: When true and more than one environment is requested,
            use ``SubprocVecEnv``; otherwise use ``DummyVecEnv``.

    Returns:
        The constructed ``SubprocVecEnv`` or ``DummyVecEnv``.

    Raises:
        ValueError: If ``n_envs`` is smaller than 1.
    """
    # Fail early with a clear message instead of handing an empty factory
    # list to the vec-env constructor.
    if n_envs < 1:
        raise ValueError(f"n_envs must be >= 1, got {n_envs}")

    env_fns = [_make_env_thunk(game, state) for _ in range(n_envs)]

    # A single environment never uses the subprocess variant, even when
    # use_subproc is set.
    vec_env_cls = SubprocVecEnv if (use_subproc and n_envs > 1) else DummyVecEnv
    return vec_env_cls(env_fns)

## Initialize Env & Model

In [4]:
# 1. Create Training Environment
train_env = make_vec_env(GAME, STATE, n_envs=N_ENVS)
# Optional observation/reward normalisation, currently disabled:
# train_env = VecNormalize(train_env, norm_obs=True, norm_reward=True, clip_obs=10., clip_reward=10.)
print(f"Environment created: {GAME} - {STATE} with {N_ENVS} parallel envs.")

# Checkpoint to resume from. Point this at a path that does not exist
# (e.g. "None") to start from a freshly initialised model instead.
checkpoint_path = "runs_smw/checkpoints/Run_41.zip"

# Progress bookkeeping shared with the training loop.
best_mean = -1e18  # NOTE(review): initialised here but not updated by the visible cells
trained = 0        # environment steps trained so far
round_idx = 0      # number of completed training rounds

# 2. Initialize Model
if os.path.exists(checkpoint_path):
    # Resume from the existing checkpoint.
    model = CustomPPO.load(
        checkpoint_path,
        env=train_env,
        # "auto" still selects the GPU when one is available, but no longer
        # fails on a CPU-only host the way a hard-coded "cuda:0" does. It also
        # matches the new-model branch below, which passes no device.
        # Assumes CustomPPO.load follows the stable-baselines3 signature —
        # TODO confirm.
        device="auto",
    )
    # Continue counting from the step total stored in the checkpoint.
    trained = model.num_timesteps
    round_idx = trained // TRAIN_CHUNK
    print(f"[Sucess] Loaded model from {checkpoint_path}")
    print(f"trained: {trained}, round_index: {round_idx}")
else:
    print(f"[Fail] Can't load {checkpoint_path}. Will use new model")
    # Hyperparameters for a fresh run. kl_coef is not a stock PPO argument,
    # so it is presumably consumed by CustomPPO — verify in custom_policy.
    model = CustomPPO(
        VisionBackbonePolicy,
        train_env,
        policy_kwargs   = dict(normalize_images=False),
        n_epochs        = 4,
        n_steps         = 512,
        batch_size      = 512,
        learning_rate   = 1e-4,
        verbose         = 1,
        gamma           = 0.96875,
        kl_coef         = 1,
        clip_range      = 0.125,
        ent_coef        = 0.0375,
        tensorboard_log = TENSORBOARD_LOG,
    )

Environment created: SuperMarioWorld-Snes - YoshiIsland1 with 16 parallel envs.
[Sucess] Loaded model from runs_smw/checkpoints/Run_41.zip
trained: 11010048, round_index: 42


In [None]:
# import os
# from custom_policy import CustomPPO
# from wrappers import make_base_env  # [新增] 必須引入這行來建立環境

# # ================= 設定區 =================
# # 請確保這些變數有被定義 (這裡沿用你原本的變數名稱)
# # GAME = "SuperMarioWorld-Snes"
# # STATE = "Level1" 
# # CKPT_DIR = "./"
# # RECORD_STEPS = 2000
# PSVD_DIR = "./runs_smw/preserved/"

# target_numbers = list(range(70, 128))
# # target_numbers = [124, 137, 147, 151, 179]

# # ================= 執行迴圈 =================
# for num in target_numbers:
#     model_path = os.path.join(CKPT_DIR, f"S2K_{num}.zip")
    
#     if not os.path.exists(model_path):
#         # print(f"⚠️ 找不到檔案: {model_path}，跳過。")
#         continue
    
#     # print(f"\n[{num}] 正在載入模型: {model_path} ...")
    
#     env = None
#     try:
#         model = CustomPPO.load(model_path, device="auto")
#         env = make_base_env(game=GAME, state=STATE)
        
#         obs, info = env.reset()
#         final_score = 0
#         final_coins = 0 # [新增] 初始化金幣紀錄
        
#         for step in range(RECORD_STEPS):
#             action, _ = model.predict(obs, deterministic=True)
#             obs, reward, terminated, truncated, info = env.step(action)
            
#             # 從 info 中讀取當前數值 
#             final_score = info.get("score", final_score)
#             final_coins = info.get("coins", final_coins)
            
#             if terminated or truncated:
#                 break
        
#         # 修改後的印出格式
#         print(f"[{num}] coins: {final_coins} | score: {final_score}")
        
#     except Exception as e:
#         print(f"❌ 發生錯誤 (Model: {num}): {e}")
#     finally:
#         if env is not None:
#             env.close()

# print("\n所有測試結束。")

In [None]:
# import os
# import glob
# from custom_policy import CustomPPO
# from eval import record_video  # 確保 eval.py 在同一目錄下
# PSVD_DIR = "./runs_smw/preserved/"
# CKPT_DIR
# # ================= 設定區 =================
# # target_numbers = list(range(38, 40))
# target_numbers = [126]

# # ================= 執行迴圈 =================
# print(f"準備測試以下 Checkpoints: {target_numbers}")

# for num in target_numbers:
#     model_path = os.path.join(PSVD_DIR, f"S2K_{num}.zip")
    
#     # 檢查檔案是否存在
#     if not os.path.exists(model_path):
#         print(f"⚠️ 找不到檔案: {model_path}，跳過。")
#         continue
    
#     print(f"\n[{num}] 正在載入模型: {model_path} ...")
    
#     try:
#         # 1. 載入模型 (不需要 env 參數也能載入權重)
#         # 如果你有改過 CustomPPO 的參數，load 會自動讀取 zip 裡的設定
#         model = CustomPPO.load(model_path, device="auto") # device="auto" 會自動用 GPU
        
#         # 2. 錄製影片
#         prefix_name = f"test_{num}"
#         print(f"[{num}] 正在錄影 (長度 {RECORD_STEPS} steps)...")
        
#         record_video(
#             model=model,
#             game=GAME,
#             state=STATE,
#             out_dir=VIDEO_DIR,
#             video_len=RECORD_STEPS,
#             prefix=prefix_name
#         )
#         print(f"✅ 完成！影片已儲存為 {prefix_name}.mp4")
        
#     except Exception as e:
#         print(f"❌ 發生錯誤 (Model: {num}): {e}")

# print("\n所有測試結束。")

## Training Loop

In [None]:
try:
    # Train in rounds of at most TRAIN_CHUNK steps until the overall step
    # budget is used up. Each round: learn -> checkpoint -> evaluate -> record.
    while trained < TOTAL_STEPS:
        round_idx += 1
        label = "Run"

        # The checkpoint/video tag is the number of whole chunks completed
        # *before* this round starts, so it trails round_idx.
        completed_chunks = int(trained / TRAIN_CHUNK)
        tagged_label = f"{label}_{completed_chunks}"

        # The final round may be shorter so the budget is never exceeded.
        # (For a quick smoke test, override with e.g. chunk = 2000.)
        chunk = min(TRAIN_CHUNK, TOTAL_STEPS - trained)

        print(f"\n=== Round {round_idx} | Learn {chunk} steps (Total trained: {trained}) ===")

        # --- Train ---
        # reset_num_timesteps=False keeps the model's step counter running
        # across rounds instead of restarting it on every learn() call.
        model.learn(
            total_timesteps=chunk,
            reset_num_timesteps=False,
            tb_log_name=label,
        )
        trained += chunk

        # --- Save Checkpoint ---
        ckpt_path = os.path.join(CKPT_DIR, f"{tagged_label}.zip")
        model.save(ckpt_path)
        print(f"Saved checkpoint: {ckpt_path}")

        # --- Evaluate ---
        mean_ret, best_ret = evaluate_policy(
            model,
            GAME,
            STATE,
            n_episodes=EVAL_EPISODES,
            max_steps=EVAL_MAX_STEPS,
        )
        print(f"[EVAL] Mean Return: {mean_ret:.3f}, Best Return: {best_ret:.3f}")

        # --- Record Video ---
        # The prefix contains a sub-directory ("Run/..."), so make sure that
        # directory exists under VIDEO_DIR before recording.
        out_path = os.path.join(VIDEO_DIR, label)
        os.makedirs(out_path, exist_ok=True)
        video_prefix = f"{label}/{tagged_label}_{mean_ret:.2f}"
        record_video(
            model,
            GAME,
            STATE,
            VIDEO_DIR,
            video_len=RECORD_STEPS,
            prefix=video_prefix,
        )

except KeyboardInterrupt:
    # A manual stop is an expected way to end a long run; it is not re-raised.
    print("\nTraining interrupted manually.")

finally:
    # Always release the environments, whether training finished, was
    # interrupted, or failed.
    train_env.close()
    print("Training finished. Environment closed.")

"""
tensorboard --logdir=./runs_smw/tb
"""


=== Round 43 | Learn 262144 steps (Total trained: 11010048) ===
Logging to ./runs_smw/tb/Run_0


---------------------------------
| time/              |          |
|    fps             | 1043     |
|    iterations      | 1        |
|    time_elapsed    | 7        |
|    total_timesteps | 11018240 |
---------------------------------
---------------------------------------
| time/                 |             |
|    fps                | 878         |
|    iterations         | 2           |
|    time_elapsed       | 18          |
|    total_timesteps    | 11026432    |
| train/                |             |
|    approx_kl          | 0.01558065  |
|    entropy_loss       | -1.87       |
|    explained_variance | 0.894       |
|    learning_rate      | 0.0001      |
|    loss               | 0.0654      |
|    mean_step_reward   | 0.068051636 |
|    n_updates          | 3.12 %      |
|    policyGradLoss     | -0.0105     |
|    value_loss         | 0.497       |
---------------------------------------
---------------------------------------
| time/                 |             |
| 

Saved checkpoint: ./runs_smw/checkpoints/Run_42.zip
[EVAL] Mean Return: 18.363, Best Return: 19.696
Saved video to ./runs_smw/videos/Run/Run_42_18.36.mp4

=== Round 44 | Learn 262144 steps (Total trained: 11272192) ===
Logging to ./runs_smw/tb/Run_0
---------------------------------
| time/              |          |
|    fps             | 1143     |
|    iterations      | 1        |
|    time_elapsed    | 7        |
|    total_timesteps | 11280384 |
---------------------------------
---------------------------------------
| time/                 |             |
|    fps                | 912         |
|    iterations         | 2           |
|    time_elapsed       | 17          |
|    total_timesteps    | 11288576    |
| train/                |             |
|    approx_kl          | 0.021025116 |
|    entropy_loss       | -2.01       |
|    explained_variance | 0.901       |
|    learning_rate      | 0.0001      |
|    loss               | 0.0317      |
|    mean_step_reward   | 0.0625

Saved checkpoint: ./runs_smw/checkpoints/Run_43.zip
[EVAL] Mean Return: 93.320, Best Return: 97.320
Saved video to ./runs_smw/videos/Run/Run_43_93.32.mp4

=== Round 45 | Learn 262144 steps (Total trained: 11534336) ===
Logging to ./runs_smw/tb/Run_0
---------------------------------
| time/              |          |
|    fps             | 1147     |
|    iterations      | 1        |
|    time_elapsed    | 7        |
|    total_timesteps | 11542528 |
---------------------------------
---------------------------------------
| time/                 |             |
|    fps                | 923         |
|    iterations         | 2           |
|    time_elapsed       | 17          |
|    total_timesteps    | 11550720    |
| train/                |             |
|    approx_kl          | 0.015474277 |
|    entropy_loss       | -1.95       |
|    explained_variance | 0.904       |
|    learning_rate      | 0.0001      |
|    loss               | 0.0301      |
|    mean_step_reward   | 0.0629

Saved checkpoint: ./runs_smw/checkpoints/Run_44.zip
[EVAL] Mean Return: 125.770, Best Return: 132.270
Saved video to ./runs_smw/videos/Run/Run_44_125.77.mp4

=== Round 46 | Learn 262144 steps (Total trained: 11796480) ===
Logging to ./runs_smw/tb/Run_0
---------------------------------
| time/              |          |
|    fps             | 1146     |
|    iterations      | 1        |
|    time_elapsed    | 7        |
|    total_timesteps | 11804672 |
---------------------------------
---------------------------------------
| time/                 |             |
|    fps                | 915         |
|    iterations         | 2           |
|    time_elapsed       | 17          |
|    total_timesteps    | 11812864    |
| train/                |             |
|    approx_kl          | 0.019280529 |
|    entropy_loss       | -1.94       |
|    explained_variance | 0.945       |
|    learning_rate      | 0.0001      |
|    loss               | -0.0302     |
|    mean_step_reward   | 0.0

Saved checkpoint: ./runs_smw/checkpoints/Run_45.zip
[EVAL] Mean Return: 39.753, Best Return: 41.753
Saved video to ./runs_smw/videos/Run/Run_45_39.75.mp4

=== Round 47 | Learn 262144 steps (Total trained: 12058624) ===
Logging to ./runs_smw/tb/Run_0
---------------------------------
| time/              |          |
|    fps             | 1118     |
|    iterations      | 1        |
|    time_elapsed    | 7        |
|    total_timesteps | 12066816 |
---------------------------------
---------------------------------------
| time/                 |             |
|    fps                | 899         |
|    iterations         | 2           |
|    time_elapsed       | 18          |
|    total_timesteps    | 12075008    |
| train/                |             |
|    approx_kl          | 0.021754365 |
|    entropy_loss       | -1.92       |
|    explained_variance | 0.941       |
|    learning_rate      | 0.0001      |
|    loss               | -0.0139     |
|    mean_step_reward   | 0.0676

Saved checkpoint: ./runs_smw/checkpoints/Run_46.zip
[EVAL] Mean Return: 37.008, Best Return: 39.008
Saved video to ./runs_smw/videos/Run/Run_46_37.01.mp4

=== Round 48 | Learn 262144 steps (Total trained: 12320768) ===
Logging to ./runs_smw/tb/Run_0
---------------------------------
| time/              |          |
|    fps             | 1151     |
|    iterations      | 1        |
|    time_elapsed    | 7        |
|    total_timesteps | 12328960 |
---------------------------------
---------------------------------------
| time/                 |             |
|    fps                | 883         |
|    iterations         | 2           |
|    time_elapsed       | 18          |
|    total_timesteps    | 12337152    |
| train/                |             |
|    approx_kl          | 0.016809124 |
|    entropy_loss       | -1.97       |
|    explained_variance | 0.942       |
|    learning_rate      | 0.0001      |
|    loss               | -0.0243     |
|    mean_step_reward   | 0.0811

Saved checkpoint: ./runs_smw/checkpoints/Run_47.zip
[EVAL] Mean Return: 73.633, Best Return: 80.466
Saved video to ./runs_smw/videos/Run/Run_47_73.63.mp4

=== Round 49 | Learn 262144 steps (Total trained: 12582912) ===
Logging to ./runs_smw/tb/Run_0
---------------------------------
| time/              |          |
|    fps             | 1187     |
|    iterations      | 1        |
|    time_elapsed    | 6        |
|    total_timesteps | 12591104 |
---------------------------------
---------------------------------------
| time/                 |             |
|    fps                | 900         |
|    iterations         | 2           |
|    time_elapsed       | 18          |
|    total_timesteps    | 12599296    |
| train/                |             |
|    approx_kl          | 0.018929526 |
|    entropy_loss       | -1.99       |
|    explained_variance | 0.925       |
|    learning_rate      | 0.0001      |
|    loss               | -0.00349    |
|    mean_step_reward   | 0.0651

Saved checkpoint: ./runs_smw/checkpoints/Run_48.zip
[EVAL] Mean Return: 123.777, Best Return: 129.444
Saved video to ./runs_smw/videos/Run/Run_48_123.78.mp4

=== Round 50 | Learn 262144 steps (Total trained: 12845056) ===
Logging to ./runs_smw/tb/Run_0
---------------------------------
| time/              |          |
|    fps             | 1132     |
|    iterations      | 1        |
|    time_elapsed    | 7        |
|    total_timesteps | 12853248 |
---------------------------------
--------------------------------------
| time/                 |            |
|    fps                | 895        |
|    iterations         | 2          |
|    time_elapsed       | 18         |
|    total_timesteps    | 12861440   |
| train/                |            |
|    approx_kl          | 0.01586888 |
|    entropy_loss       | -1.92      |
|    explained_variance | 0.935      |
|    learning_rate      | 0.0001     |
|    loss               | -0.044     |
|    mean_step_reward   | 0.09457708 |
| 

Saved checkpoint: ./runs_smw/checkpoints/Run_49.zip
[EVAL] Mean Return: 136.725, Best Return: 143.725
Saved video to ./runs_smw/videos/Run/Run_49_136.73.mp4

=== Round 51 | Learn 262144 steps (Total trained: 13107200) ===
Logging to ./runs_smw/tb/Run_0
---------------------------------
| time/              |          |
|    fps             | 1075     |
|    iterations      | 1        |
|    time_elapsed    | 7        |
|    total_timesteps | 13115392 |
---------------------------------
---------------------------------------
| time/                 |             |
|    fps                | 854         |
|    iterations         | 2           |
|    time_elapsed       | 19          |
|    total_timesteps    | 13123584    |
| train/                |             |
|    approx_kl          | 0.015643349 |
|    entropy_loss       | -1.99       |
|    explained_variance | 0.947       |
|    learning_rate      | 0.0001      |
|    loss               | 0.0562      |
|    mean_step_reward   | 0.0

Saved checkpoint: ./runs_smw/checkpoints/Run_50.zip
[EVAL] Mean Return: 17.825, Best Return: 19.158
Saved video to ./runs_smw/videos/Run/Run_50_17.82.mp4

=== Round 52 | Learn 262144 steps (Total trained: 13369344) ===
Logging to ./runs_smw/tb/Run_0
---------------------------------
| time/              |          |
|    fps             | 1177     |
|    iterations      | 1        |
|    time_elapsed    | 6        |
|    total_timesteps | 13377536 |
---------------------------------
---------------------------------------
| time/                 |             |
|    fps                | 918         |
|    iterations         | 2           |
|    time_elapsed       | 17          |
|    total_timesteps    | 13385728    |
| train/                |             |
|    approx_kl          | 0.015201175 |
|    entropy_loss       | -1.98       |
|    explained_variance | 0.9         |
|    learning_rate      | 0.0001      |
|    loss               | 0.117       |
|    mean_step_reward   | 0.0717

Saved checkpoint: ./runs_smw/checkpoints/Run_51.zip
[EVAL] Mean Return: 21.089, Best Return: 21.756
Saved video to ./runs_smw/videos/Run/Run_51_21.09.mp4

=== Round 53 | Learn 262144 steps (Total trained: 13631488) ===
Logging to ./runs_smw/tb/Run_0
---------------------------------
| time/              |          |
|    fps             | 1429     |
|    iterations      | 1        |
|    time_elapsed    | 5        |
|    total_timesteps | 13639680 |
---------------------------------
--------------------------------------
| time/                 |            |
|    fps                | 1089       |
|    iterations         | 2          |
|    time_elapsed       | 15         |
|    total_timesteps    | 13647872   |
| train/                |            |
|    approx_kl          | 0.01611774 |
|    entropy_loss       | -1.94      |
|    explained_variance | 0.935      |
|    learning_rate      | 0.0001     |
|    loss               | 0.0552     |
|    mean_step_reward   | 0.08250463 |
|    

Saved checkpoint: ./runs_smw/checkpoints/Run_52.zip
[EVAL] Mean Return: 67.517, Best Return: 71.350
Saved video to ./runs_smw/videos/Run/Run_52_67.52.mp4

=== Round 54 | Learn 262144 steps (Total trained: 13893632) ===
Logging to ./runs_smw/tb/Run_0
---------------------------------
| time/              |          |
|    fps             | 1092     |
|    iterations      | 1        |
|    time_elapsed    | 7        |
|    total_timesteps | 13901824 |
---------------------------------
---------------------------------------
| time/                 |             |
|    fps                | 857         |
|    iterations         | 2           |
|    time_elapsed       | 19          |
|    total_timesteps    | 13910016    |
| train/                |             |
|    approx_kl          | 0.024414096 |
|    entropy_loss       | -1.92       |
|    explained_variance | 0.957       |
|    learning_rate      | 0.0001      |
|    loss               | -0.0553     |
|    mean_step_reward   | 0.0913

Saved checkpoint: ./runs_smw/checkpoints/Run_53.zip
[EVAL] Mean Return: 68.776, Best Return: 73.276
Saved video to ./runs_smw/videos/Run/Run_53_68.78.mp4

=== Round 55 | Learn 262144 steps (Total trained: 14155776) ===
Logging to ./runs_smw/tb/Run_0
---------------------------------
| time/              |          |
|    fps             | 1084     |
|    iterations      | 1        |
|    time_elapsed    | 7        |
|    total_timesteps | 14163968 |
---------------------------------
---------------------------------------
| time/                 |             |
|    fps                | 861         |
|    iterations         | 2           |
|    time_elapsed       | 19          |
|    total_timesteps    | 14172160    |
| train/                |             |
|    approx_kl          | 0.020663043 |
|    entropy_loss       | -1.87       |
|    explained_variance | 0.953       |
|    learning_rate      | 0.0001      |
|    loss               | -0.00526    |
|    mean_step_reward   | 0.1048

Saved checkpoint: ./runs_smw/checkpoints/Run_54.zip
[EVAL] Mean Return: 135.513, Best Return: 143.013
Saved video to ./runs_smw/videos/Run/Run_54_135.51.mp4

=== Round 56 | Learn 262144 steps (Total trained: 14417920) ===
Logging to ./runs_smw/tb/Run_0
---------------------------------
| time/              |          |
|    fps             | 1179     |
|    iterations      | 1        |
|    time_elapsed    | 6        |
|    total_timesteps | 14426112 |
---------------------------------
---------------------------------------
| time/                 |             |
|    fps                | 908         |
|    iterations         | 2           |
|    time_elapsed       | 18          |
|    total_timesteps    | 14434304    |
| train/                |             |
|    approx_kl          | 0.017235681 |
|    entropy_loss       | -1.92       |
|    explained_variance | 0.915       |
|    learning_rate      | 0.0001      |
|    loss               | 0.0118      |
|    mean_step_reward   | 0.0

Saved checkpoint: ./runs_smw/checkpoints/Run_55.zip
[EVAL] Mean Return: 71.794, Best Return: 74.461
Saved video to ./runs_smw/videos/Run/Run_55_71.79.mp4

=== Round 57 | Learn 262144 steps (Total trained: 14680064) ===
Logging to ./runs_smw/tb/Run_0
---------------------------------
| time/              |          |
|    fps             | 1089     |
|    iterations      | 1        |
|    time_elapsed    | 7        |
|    total_timesteps | 14688256 |
---------------------------------
---------------------------------------
| time/                 |             |
|    fps                | 864         |
|    iterations         | 2           |
|    time_elapsed       | 18          |
|    total_timesteps    | 14696448    |
| train/                |             |
|    approx_kl          | 0.020974278 |
|    entropy_loss       | -1.9        |
|    explained_variance | 0.938       |
|    learning_rate      | 0.0001      |
|    loss               | 0.0118      |
|    mean_step_reward   | 0.0925

Saved checkpoint: ./runs_smw/checkpoints/Run_56.zip
[EVAL] Mean Return: 75.021, Best Return: 79.188
Saved video to ./runs_smw/videos/Run/Run_56_75.02.mp4

=== Round 58 | Learn 262144 steps (Total trained: 14942208) ===
Logging to ./runs_smw/tb/Run_0
---------------------------------
| time/              |          |
|    fps             | 1097     |
|    iterations      | 1        |
|    time_elapsed    | 7        |
|    total_timesteps | 14950400 |
---------------------------------
---------------------------------------
| time/                 |             |
|    fps                | 876         |
|    iterations         | 2           |
|    time_elapsed       | 18          |
|    total_timesteps    | 14958592    |
| train/                |             |
|    approx_kl          | 0.020961206 |
|    entropy_loss       | -1.94       |
|    explained_variance | 0.959       |
|    learning_rate      | 0.0001      |
|    loss               | -0.0384     |
|    mean_step_reward   | 0.1060

Saved checkpoint: ./runs_smw/checkpoints/Run_57.zip
[EVAL] Mean Return: 147.342, Best Return: 154.675
Saved video to ./runs_smw/videos/Run/Run_57_147.34.mp4

=== Round 59 | Learn 262144 steps (Total trained: 15204352) ===
Logging to ./runs_smw/tb/Run_0
---------------------------------
| time/              |          |
|    fps             | 1191     |
|    iterations      | 1        |
|    time_elapsed    | 6        |
|    total_timesteps | 15212544 |
---------------------------------
---------------------------------------
| time/                 |             |
|    fps                | 906         |
|    iterations         | 2           |
|    time_elapsed       | 18          |
|    total_timesteps    | 15220736    |
| train/                |             |
|    approx_kl          | 0.022775905 |
|    entropy_loss       | -1.91       |
|    explained_variance | 0.933       |
|    learning_rate      | 0.0001      |
|    loss               | -0.00594    |
|    mean_step_reward   | 0.0

Saved checkpoint: ./runs_smw/checkpoints/Run_58.zip
[EVAL] Mean Return: 136.068, Best Return: 143.401
Saved video to ./runs_smw/videos/Run/Run_58_136.07.mp4

=== Round 60 | Learn 262144 steps (Total trained: 15466496) ===
Logging to ./runs_smw/tb/Run_0
---------------------------------
| time/              |          |
|    fps             | 1127     |
|    iterations      | 1        |
|    time_elapsed    | 7        |
|    total_timesteps | 15474688 |
---------------------------------
---------------------------------------
| time/                 |             |
|    fps                | 900         |
|    iterations         | 2           |
|    time_elapsed       | 18          |
|    total_timesteps    | 15482880    |
| train/                |             |
|    approx_kl          | 0.016051877 |
|    entropy_loss       | -1.93       |
|    explained_variance | 0.936       |
|    learning_rate      | 0.0001      |
|    loss               | -0.0346     |
|    mean_step_reward   | 0.0

Saved checkpoint: ./runs_smw/checkpoints/Run_59.zip
[EVAL] Mean Return: 141.007, Best Return: 147.841
Saved video to ./runs_smw/videos/Run/Run_59_141.01.mp4

=== Round 61 | Learn 262144 steps (Total trained: 15728640) ===
Logging to ./runs_smw/tb/Run_0
---------------------------------
| time/              |          |
|    fps             | 1156     |
|    iterations      | 1        |
|    time_elapsed    | 7        |
|    total_timesteps | 15736832 |
---------------------------------
--------------------------------------
| time/                 |            |
|    fps                | 886        |
|    iterations         | 2          |
|    time_elapsed       | 18         |
|    total_timesteps    | 15745024   |
| train/                |            |
|    approx_kl          | 0.02045311 |
|    entropy_loss       | -1.85      |
|    explained_variance | 0.938      |
|    learning_rate      | 0.0001     |
|    loss               | -0.0255    |
|    mean_step_reward   | 0.11964978 |
| 

Saved checkpoint: ./runs_smw/checkpoints/Run_60.zip
[EVAL] Mean Return: 145.982, Best Return: 153.316
Saved video to ./runs_smw/videos/Run/Run_60_145.98.mp4

=== Round 62 | Learn 262144 steps (Total trained: 15990784) ===
Logging to ./runs_smw/tb/Run_0
---------------------------------
| time/              |          |
|    fps             | 1167     |
|    iterations      | 1        |
|    time_elapsed    | 7        |
|    total_timesteps | 15998976 |
---------------------------------
--------------------------------------
| time/                 |            |
|    fps                | 1016       |
|    iterations         | 2          |
|    time_elapsed       | 16         |
|    total_timesteps    | 16007168   |
| train/                |            |
|    approx_kl          | 0.01771575 |
|    entropy_loss       | -1.83      |
|    explained_variance | 0.965      |
|    learning_rate      | 0.0001     |
|    loss               | -0.0198    |
|    mean_step_reward   | 0.14327188 |
| 

Saved checkpoint: ./runs_smw/checkpoints/Run_61.zip
[EVAL] Mean Return: 153.950, Best Return: 160.783
Saved video to ./runs_smw/videos/Run/Run_61_153.95.mp4

=== Round 63 | Learn 262144 steps (Total trained: 16252928) ===
Logging to ./runs_smw/tb/Run_0
---------------------------------
| time/              |          |
|    fps             | 1398     |
|    iterations      | 1        |
|    time_elapsed    | 5        |
|    total_timesteps | 16261120 |
---------------------------------
--------------------------------------
| time/                 |            |
|    fps                | 1081       |
|    iterations         | 2          |
|    time_elapsed       | 15         |
|    total_timesteps    | 16269312   |
| train/                |            |
|    approx_kl          | 0.01940117 |
|    entropy_loss       | -1.87      |
|    explained_variance | 0.903      |
|    learning_rate      | 0.0001     |
|    loss               | 0.0304     |
|    mean_step_reward   | 0.09081156 |
| 

Saved checkpoint: ./runs_smw/checkpoints/Run_62.zip
[EVAL] Mean Return: 152.207, Best Return: 159.207
Saved video to ./runs_smw/videos/Run/Run_62_152.21.mp4

=== Round 64 | Learn 262144 steps (Total trained: 16515072) ===
Logging to ./runs_smw/tb/Run_0
---------------------------------
| time/              |          |
|    fps             | 1115     |
|    iterations      | 1        |
|    time_elapsed    | 7        |
|    total_timesteps | 16523264 |
---------------------------------
---------------------------------------
| time/                 |             |
|    fps                | 862         |
|    iterations         | 2           |
|    time_elapsed       | 18          |
|    total_timesteps    | 16531456    |
| train/                |             |
|    approx_kl          | 0.022123989 |
|    entropy_loss       | -1.86       |
|    explained_variance | 0.946       |
|    learning_rate      | 0.0001      |
|    loss               | 0.0488      |
|    mean_step_reward   | 0.1

Saved checkpoint: ./runs_smw/checkpoints/Run_63.zip
[EVAL] Mean Return: 140.905, Best Return: 148.405
Saved video to ./runs_smw/videos/Run/Run_63_140.91.mp4

=== Round 65 | Learn 262144 steps (Total trained: 16777216) ===
Logging to ./runs_smw/tb/Run_0
---------------------------------
| time/              |          |
|    fps             | 1198     |
|    iterations      | 1        |
|    time_elapsed    | 6        |
|    total_timesteps | 16785408 |
---------------------------------
---------------------------------------
| time/                 |             |
|    fps                | 919         |
|    iterations         | 2           |
|    time_elapsed       | 17          |
|    total_timesteps    | 16793600    |
| train/                |             |
|    approx_kl          | 0.018639578 |
|    entropy_loss       | -1.88       |
|    explained_variance | 0.961       |
|    learning_rate      | 0.0001      |
|    loss               | 0.0065      |
|    mean_step_reward   | 0.1

Saved checkpoint: ./runs_smw/checkpoints/Run_64.zip
[EVAL] Mean Return: 127.230, Best Return: 132.564
Saved video to ./runs_smw/videos/Run/Run_64_127.23.mp4

=== Round 66 | Learn 262144 steps (Total trained: 17039360) ===
Logging to ./runs_smw/tb/Run_0
---------------------------------
| time/              |          |
|    fps             | 1168     |
|    iterations      | 1        |
|    time_elapsed    | 7        |
|    total_timesteps | 17047552 |
---------------------------------
---------------------------------------
| time/                 |             |
|    fps                | 896         |
|    iterations         | 2           |
|    time_elapsed       | 18          |
|    total_timesteps    | 17055744    |
| train/                |             |
|    approx_kl          | 0.020678706 |
|    entropy_loss       | -1.96       |
|    explained_variance | 0.933       |
|    learning_rate      | 0.0001      |
|    loss               | -0.0626     |
|    mean_step_reward   | 0.0

Saved checkpoint: ./runs_smw/checkpoints/Run_65.zip
[EVAL] Mean Return: 153.134, Best Return: 160.134
Saved video to ./runs_smw/videos/Run/Run_65_153.13.mp4

=== Round 67 | Learn 262144 steps (Total trained: 17301504) ===
Logging to ./runs_smw/tb/Run_0
---------------------------------
| time/              |          |
|    fps             | 1147     |
|    iterations      | 1        |
|    time_elapsed    | 7        |
|    total_timesteps | 17309696 |
---------------------------------
---------------------------------------
| time/                 |             |
|    fps                | 874         |
|    iterations         | 2           |
|    time_elapsed       | 18          |
|    total_timesteps    | 17317888    |
| train/                |             |
|    approx_kl          | 0.022051994 |
|    entropy_loss       | -1.86       |
|    explained_variance | 0.944       |
|    learning_rate      | 0.0001      |
|    loss               | -0.0175     |
|    mean_step_reward   | 0.1

Saved checkpoint: ./runs_smw/checkpoints/Run_66.zip
[EVAL] Mean Return: 151.392, Best Return: 158.392
Saved video to ./runs_smw/videos/Run/Run_66_151.39.mp4

=== Round 68 | Learn 262144 steps (Total trained: 17563648) ===
Logging to ./runs_smw/tb/Run_0
---------------------------------
| time/              |          |
|    fps             | 1073     |
|    iterations      | 1        |
|    time_elapsed    | 7        |
|    total_timesteps | 17571840 |
---------------------------------
---------------------------------------
| time/                 |             |
|    fps                | 856         |
|    iterations         | 2           |
|    time_elapsed       | 19          |
|    total_timesteps    | 17580032    |
| train/                |             |
|    approx_kl          | 0.014710287 |
|    entropy_loss       | -1.89       |
|    explained_variance | 0.934       |
|    learning_rate      | 0.0001      |
|    loss               | 0.0482      |
|    mean_step_reward   | 0.1

Saved checkpoint: ./runs_smw/checkpoints/Run_67.zip
[EVAL] Mean Return: 144.909, Best Return: 152.242
Saved video to ./runs_smw/videos/Run/Run_67_144.91.mp4

=== Round 69 | Learn 262144 steps (Total trained: 17825792) ===
Logging to ./runs_smw/tb/Run_0
---------------------------------
| time/              |          |
|    fps             | 1169     |
|    iterations      | 1        |
|    time_elapsed    | 7        |
|    total_timesteps | 17833984 |
---------------------------------
---------------------------------------
| time/                 |             |
|    fps                | 914         |
|    iterations         | 2           |
|    time_elapsed       | 17          |
|    total_timesteps    | 17842176    |
| train/                |             |
|    approx_kl          | 0.020849967 |
|    entropy_loss       | -1.81       |
|    explained_variance | 0.949       |
|    learning_rate      | 0.0001      |
|    loss               | -0.0268     |
|    mean_step_reward   | 0.1

Saved checkpoint: ./runs_smw/checkpoints/Run_68.zip
[EVAL] Mean Return: 140.062, Best Return: 147.562
Saved video to ./runs_smw/videos/Run/Run_68_140.06.mp4

=== Round 70 | Learn 262144 steps (Total trained: 18087936) ===
Logging to ./runs_smw/tb/Run_0
---------------------------------
| time/              |          |
|    fps             | 1137     |
|    iterations      | 1        |
|    time_elapsed    | 7        |
|    total_timesteps | 18096128 |
---------------------------------
---------------------------------------
| time/                 |             |
|    fps                | 876         |
|    iterations         | 2           |
|    time_elapsed       | 18          |
|    total_timesteps    | 18104320    |
| train/                |             |
|    approx_kl          | 0.020916387 |
|    entropy_loss       | -1.91       |
|    explained_variance | 0.943       |
|    learning_rate      | 0.0001      |
|    loss               | 0.106       |
|    mean_step_reward   | 0.0

Saved checkpoint: ./runs_smw/checkpoints/Run_69.zip
[EVAL] Mean Return: 146.236, Best Return: 152.570
Saved video to ./runs_smw/videos/Run/Run_69_146.24.mp4

=== Round 71 | Learn 262144 steps (Total trained: 18350080) ===
Logging to ./runs_smw/tb/Run_0
---------------------------------
| time/              |          |
|    fps             | 1096     |
|    iterations      | 1        |
|    time_elapsed    | 7        |
|    total_timesteps | 18358272 |
---------------------------------
---------------------------------------
| time/                 |             |
|    fps                | 873         |
|    iterations         | 2           |
|    time_elapsed       | 18          |
|    total_timesteps    | 18366464    |
| train/                |             |
|    approx_kl          | 0.020652438 |
|    entropy_loss       | -1.87       |
|    explained_variance | 0.962       |
|    learning_rate      | 0.0001      |
|    loss               | -0.0145     |
|    mean_step_reward   | 0.1

Saved checkpoint: ./runs_smw/checkpoints/Run_70.zip
[EVAL] Mean Return: 151.463, Best Return: 158.463
Saved video to ./runs_smw/videos/Run/Run_70_151.46.mp4

=== Round 72 | Learn 262144 steps (Total trained: 18612224) ===
Logging to ./runs_smw/tb/Run_0
---------------------------------
| time/              |          |
|    fps             | 1488     |
|    iterations      | 1        |
|    time_elapsed    | 5        |
|    total_timesteps | 18620416 |
---------------------------------
---------------------------------------
| time/                 |             |
|    fps                | 1112        |
|    iterations         | 2           |
|    time_elapsed       | 14          |
|    total_timesteps    | 18628608    |
| train/                |             |
|    approx_kl          | 0.022071071 |
|    entropy_loss       | -1.89       |
|    explained_variance | 0.949       |
|    learning_rate      | 0.0001      |
|    loss               | -0.0389     |
|    mean_step_reward   | 0.1

Saved checkpoint: ./runs_smw/checkpoints/Run_71.zip
[EVAL] Mean Return: 94.413, Best Return: 99.746
Saved video to ./runs_smw/videos/Run/Run_71_94.41.mp4

=== Round 73 | Learn 262144 steps (Total trained: 18874368) ===
Logging to ./runs_smw/tb/Run_0
---------------------------------
| time/              |          |
|    fps             | 1102     |
|    iterations      | 1        |
|    time_elapsed    | 7        |
|    total_timesteps | 18882560 |
---------------------------------
---------------------------------------
| time/                 |             |
|    fps                | 877         |
|    iterations         | 2           |
|    time_elapsed       | 18          |
|    total_timesteps    | 18890752    |
| train/                |             |
|    approx_kl          | 0.024615325 |
|    entropy_loss       | -1.83       |
|    explained_variance | 0.969       |
|    learning_rate      | 0.0001      |
|    loss               | -0.0482     |
|    mean_step_reward   | 0.1219

Saved checkpoint: ./runs_smw/checkpoints/Run_72.zip
[EVAL] Mean Return: 87.489, Best Return: 92.156
Saved video to ./runs_smw/videos/Run/Run_72_87.49.mp4

=== Round 74 | Learn 262144 steps (Total trained: 19136512) ===
Logging to ./runs_smw/tb/Run_0
---------------------------------
| time/              |          |
|    fps             | 1115     |
|    iterations      | 1        |
|    time_elapsed    | 7        |
|    total_timesteps | 19144704 |
---------------------------------
---------------------------------------
| time/                 |             |
|    fps                | 891         |
|    iterations         | 2           |
|    time_elapsed       | 18          |
|    total_timesteps    | 19152896    |
| train/                |             |
|    approx_kl          | 0.025727585 |
|    entropy_loss       | -1.83       |
|    explained_variance | 0.971       |
|    learning_rate      | 0.0001      |
|    loss               | -0.043      |
|    mean_step_reward   | 0.1230

Saved checkpoint: ./runs_smw/checkpoints/Run_73.zip
[EVAL] Mean Return: 139.329, Best Return: 145.996
Saved video to ./runs_smw/videos/Run/Run_73_139.33.mp4

=== Round 75 | Learn 262144 steps (Total trained: 19398656) ===
Logging to ./runs_smw/tb/Run_0
---------------------------------
| time/              |          |
|    fps             | 1108     |
|    iterations      | 1        |
|    time_elapsed    | 7        |
|    total_timesteps | 19406848 |
---------------------------------
--------------------------------------
| time/                 |            |
|    fps                | 861        |
|    iterations         | 2          |
|    time_elapsed       | 19         |
|    total_timesteps    | 19415040   |
| train/                |            |
|    approx_kl          | 0.0186177  |
|    entropy_loss       | -1.86      |
|    explained_variance | 0.912      |
|    learning_rate      | 0.0001     |
|    loss               | -0.00412   |
|    mean_step_reward   | 0.09057943 |
| 

Saved checkpoint: ./runs_smw/checkpoints/Run_74.zip
[EVAL] Mean Return: 147.675, Best Return: 154.675
Saved video to ./runs_smw/videos/Run/Run_74_147.67.mp4

=== Round 76 | Learn 262144 steps (Total trained: 19660800) ===
Logging to ./runs_smw/tb/Run_0
---------------------------------
| time/              |          |
|    fps             | 1161     |
|    iterations      | 1        |
|    time_elapsed    | 7        |
|    total_timesteps | 19668992 |
---------------------------------
--------------------------------------
| time/                 |            |
|    fps                | 916        |
|    iterations         | 2          |
|    time_elapsed       | 17         |
|    total_timesteps    | 19677184   |
| train/                |            |
|    approx_kl          | 0.01890644 |
|    entropy_loss       | -1.85      |
|    explained_variance | 0.946      |
|    learning_rate      | 0.0001     |
|    loss               | 0.0672     |
|    mean_step_reward   | 0.10538682 |
| 

Saved checkpoint: ./runs_smw/checkpoints/Run_75.zip
[EVAL] Mean Return: 151.602, Best Return: 158.268
Saved video to ./runs_smw/videos/Run/Run_75_151.60.mp4

=== Round 77 | Learn 262144 steps (Total trained: 19922944) ===
Logging to ./runs_smw/tb/Run_0
---------------------------------
| time/              |          |
|    fps             | 1071     |
|    iterations      | 1        |
|    time_elapsed    | 7        |
|    total_timesteps | 19931136 |
---------------------------------
---------------------------------------
| time/                 |             |
|    fps                | 854         |
|    iterations         | 2           |
|    time_elapsed       | 19          |
|    total_timesteps    | 19939328    |
| train/                |             |
|    approx_kl          | 0.023568373 |
|    entropy_loss       | -1.83       |
|    explained_variance | 0.961       |
|    learning_rate      | 0.0001      |
|    loss               | -0.0447     |
|    mean_step_reward   | 0.1

Saved checkpoint: ./runs_smw/checkpoints/Run_76.zip
[EVAL] Mean Return: 155.424, Best Return: 162.090
Saved video to ./runs_smw/videos/Run/Run_76_155.42.mp4

=== Round 78 | Learn 262144 steps (Total trained: 20185088) ===
Logging to ./runs_smw/tb/Run_0
---------------------------------
| time/              |          |
|    fps             | 1185     |
|    iterations      | 1        |
|    time_elapsed    | 6        |
|    total_timesteps | 20193280 |
---------------------------------
---------------------------------------
| time/                 |             |
|    fps                | 917         |
|    iterations         | 2           |
|    time_elapsed       | 17          |
|    total_timesteps    | 20201472    |
| train/                |             |
|    approx_kl          | 0.018473424 |
|    entropy_loss       | -1.86       |
|    explained_variance | 0.92        |
|    learning_rate      | 0.0001      |
|    loss               | 0.000739    |
|    mean_step_reward   | 0.1

Saved checkpoint: ./runs_smw/checkpoints/Run_77.zip
[EVAL] Mean Return: 82.918, Best Return: 87.251
Saved video to ./runs_smw/videos/Run/Run_77_82.92.mp4

=== Round 79 | Learn 262144 steps (Total trained: 20447232) ===
Logging to ./runs_smw/tb/Run_0
---------------------------------
| time/              |          |
|    fps             | 1139     |
|    iterations      | 1        |
|    time_elapsed    | 7        |
|    total_timesteps | 20455424 |
---------------------------------
---------------------------------------
| time/                 |             |
|    fps                | 893         |
|    iterations         | 2           |
|    time_elapsed       | 18          |
|    total_timesteps    | 20463616    |
| train/                |             |
|    approx_kl          | 0.025453366 |
|    entropy_loss       | -1.83       |
|    explained_variance | 0.966       |
|    learning_rate      | 0.0001      |
|    loss               | -0.0405     |
|    mean_step_reward   | 0.1287

Saved checkpoint: ./runs_smw/checkpoints/Run_78.zip
[EVAL] Mean Return: 153.911, Best Return: 160.911
Saved video to ./runs_smw/videos/Run/Run_78_153.91.mp4

=== Round 80 | Learn 262144 steps (Total trained: 20709376) ===
Logging to ./runs_smw/tb/Run_0
---------------------------------
| time/              |          |
|    fps             | 1081     |
|    iterations      | 1        |
|    time_elapsed    | 7        |
|    total_timesteps | 20717568 |
---------------------------------
--------------------------------------
| time/                 |            |
|    fps                | 861        |
|    iterations         | 2          |
|    time_elapsed       | 19         |
|    total_timesteps    | 20725760   |
| train/                |            |
|    approx_kl          | 0.02174374 |
|    entropy_loss       | -1.86      |
|    explained_variance | 0.969      |
|    learning_rate      | 0.0001     |
|    loss               | -0.0596    |
|    mean_step_reward   | 0.1058431  |
| 

Saved checkpoint: ./runs_smw/checkpoints/Run_79.zip
[EVAL] Mean Return: 130.623, Best Return: 138.123
Saved video to ./runs_smw/videos/Run/Run_79_130.62.mp4

=== Round 81 | Learn 262144 steps (Total trained: 20971520) ===
Logging to ./runs_smw/tb/Run_0
---------------------------------
| time/              |          |
|    fps             | 1122     |
|    iterations      | 1        |
|    time_elapsed    | 7        |
|    total_timesteps | 20979712 |
---------------------------------
--------------------------------------
| time/                 |            |
|    fps                | 900        |
|    iterations         | 2          |
|    time_elapsed       | 18         |
|    total_timesteps    | 20987904   |
| train/                |            |
|    approx_kl          | 0.02540932 |
|    entropy_loss       | -1.81      |
|    explained_variance | 0.96       |
|    learning_rate      | 0.0001     |
|    loss               | -0.059     |
|    mean_step_reward   | 0.11206593 |
| 

Saved checkpoint: ./runs_smw/checkpoints/Run_80.zip
[EVAL] Mean Return: 153.688, Best Return: 160.688
Saved video to ./runs_smw/videos/Run/Run_80_153.69.mp4

=== Round 82 | Learn 262144 steps (Total trained: 21233664) ===
Logging to ./runs_smw/tb/Run_0
---------------------------------
| time/              |          |
|    fps             | 1407     |
|    iterations      | 1        |
|    time_elapsed    | 5        |
|    total_timesteps | 21241856 |
---------------------------------
---------------------------------------
| time/                 |             |
|    fps                | 1051        |
|    iterations         | 2           |
|    time_elapsed       | 15          |
|    total_timesteps    | 21250048    |
| train/                |             |
|    approx_kl          | 0.028966289 |
|    entropy_loss       | -1.82       |
|    explained_variance | 0.966       |
|    learning_rate      | 0.0001      |
|    loss               | -0.0447     |
|    mean_step_reward   | 0.1

Saved checkpoint: ./runs_smw/checkpoints/Run_81.zip
[EVAL] Mean Return: 152.947, Best Return: 159.947
Saved video to ./runs_smw/videos/Run/Run_81_152.95.mp4

=== Round 83 | Learn 262144 steps (Total trained: 21495808) ===
Logging to ./runs_smw/tb/Run_0
---------------------------------
| time/              |          |
|    fps             | 1097     |
|    iterations      | 1        |
|    time_elapsed    | 7        |
|    total_timesteps | 21504000 |
---------------------------------
---------------------------------------
| time/                 |             |
|    fps                | 852         |
|    iterations         | 2           |
|    time_elapsed       | 19          |
|    total_timesteps    | 21512192    |
| train/                |             |
|    approx_kl          | 0.025006909 |
|    entropy_loss       | -1.85       |
|    explained_variance | 0.971       |
|    learning_rate      | 0.0001      |
|    loss               | -0.0546     |
|    mean_step_reward   | 0.1

Saved checkpoint: ./runs_smw/checkpoints/Run_82.zip
[EVAL] Mean Return: 151.997, Best Return: 158.664
Saved video to ./runs_smw/videos/Run/Run_82_152.00.mp4

=== Round 84 | Learn 262144 steps (Total trained: 21757952) ===
Logging to ./runs_smw/tb/Run_0
---------------------------------
| time/              |          |
|    fps             | 1087     |
|    iterations      | 1        |
|    time_elapsed    | 7        |
|    total_timesteps | 21766144 |
---------------------------------
---------------------------------------
| time/                 |             |
|    fps                | 859         |
|    iterations         | 2           |
|    time_elapsed       | 19          |
|    total_timesteps    | 21774336    |
| train/                |             |
|    approx_kl          | 0.018201653 |
|    entropy_loss       | -1.82       |
|    explained_variance | 0.949       |
|    learning_rate      | 0.0001      |
|    loss               | -0.0106     |
|    mean_step_reward   | 0.1

Saved checkpoint: ./runs_smw/checkpoints/Run_83.zip
[EVAL] Mean Return: 153.640, Best Return: 160.640
Saved video to ./runs_smw/videos/Run/Run_83_153.64.mp4

=== Round 85 | Learn 262144 steps (Total trained: 22020096) ===
Logging to ./runs_smw/tb/Run_0
---------------------------------
| time/              |          |
|    fps             | 1148     |
|    iterations      | 1        |
|    time_elapsed    | 7        |
|    total_timesteps | 22028288 |
---------------------------------
---------------------------------------
| time/                 |             |
|    fps                | 884         |
|    iterations         | 2           |
|    time_elapsed       | 18          |
|    total_timesteps    | 22036480    |
| train/                |             |
|    approx_kl          | 0.025538363 |
|    entropy_loss       | -1.78       |
|    explained_variance | 0.941       |
|    learning_rate      | 0.0001      |
|    loss               | 0.0957      |
|    mean_step_reward   | 0.1

Saved checkpoint: ./runs_smw/checkpoints/Run_84.zip
[EVAL] Mean Return: 153.382, Best Return: 160.048
Saved video to ./runs_smw/videos/Run/Run_84_153.38.mp4

=== Round 86 | Learn 262144 steps (Total trained: 22282240) ===
Logging to ./runs_smw/tb/Run_0
---------------------------------
| time/              |          |
|    fps             | 1167     |
|    iterations      | 1        |
|    time_elapsed    | 7        |
|    total_timesteps | 22290432 |
---------------------------------
--------------------------------------
| time/                 |            |
|    fps                | 911        |
|    iterations         | 2          |
|    time_elapsed       | 17         |
|    total_timesteps    | 22298624   |
| train/                |            |
|    approx_kl          | 0.01948456 |
|    entropy_loss       | -1.89      |
|    explained_variance | 0.944      |
|    learning_rate      | 0.0001     |
|    loss               | 0.00423    |
|    mean_step_reward   | 0.11124082 |
| 

Saved checkpoint: ./runs_smw/checkpoints/Run_85.zip
[EVAL] Mean Return: 142.183, Best Return: 148.850
Saved video to ./runs_smw/videos/Run/Run_85_142.18.mp4

=== Round 87 | Learn 262144 steps (Total trained: 22544384) ===
Logging to ./runs_smw/tb/Run_0
---------------------------------
| time/              |          |
|    fps             | 1145     |
|    iterations      | 1        |
|    time_elapsed    | 7        |
|    total_timesteps | 22552576 |
---------------------------------
---------------------------------------
| time/                 |             |
|    fps                | 892         |
|    iterations         | 2           |
|    time_elapsed       | 18          |
|    total_timesteps    | 22560768    |
| train/                |             |
|    approx_kl          | 0.027785089 |
|    entropy_loss       | -1.85       |
|    explained_variance | 0.961       |
|    learning_rate      | 0.0001      |
|    loss               | 0.0692      |
|    mean_step_reward   | 0.1

Saved checkpoint: ./runs_smw/checkpoints/Run_86.zip
[EVAL] Mean Return: 154.329, Best Return: 161.329
Saved video to ./runs_smw/videos/Run/Run_86_154.33.mp4

=== Round 88 | Learn 262144 steps (Total trained: 22806528) ===
Logging to ./runs_smw/tb/Run_0
---------------------------------
| time/              |          |
|    fps             | 1090     |
|    iterations      | 1        |
|    time_elapsed    | 7        |
|    total_timesteps | 22814720 |
---------------------------------
---------------------------------------
| time/                 |             |
|    fps                | 874         |
|    iterations         | 2           |
|    time_elapsed       | 18          |
|    total_timesteps    | 22822912    |
| train/                |             |
|    approx_kl          | 0.020861886 |
|    entropy_loss       | -1.86       |
|    explained_variance | 0.951       |
|    learning_rate      | 0.0001      |
|    loss               | -0.00875    |
|    mean_step_reward   | 0.1

Saved checkpoint: ./runs_smw/checkpoints/Run_87.zip
[EVAL] Mean Return: 151.580, Best Return: 158.247
Saved video to ./runs_smw/videos/Run/Run_87_151.58.mp4

=== Round 89 | Learn 262144 steps (Total trained: 23068672) ===
Logging to ./runs_smw/tb/Run_0
---------------------------------
| time/              |          |
|    fps             | 1120     |
|    iterations      | 1        |
|    time_elapsed    | 7        |
|    total_timesteps | 23076864 |
---------------------------------
--------------------------------------
| time/                 |            |
|    fps                | 866        |
|    iterations         | 2          |
|    time_elapsed       | 18         |
|    total_timesteps    | 23085056   |
| train/                |            |
|    approx_kl          | 0.02536338 |
|    entropy_loss       | -1.84      |
|    explained_variance | 0.948      |
|    learning_rate      | 0.0001     |
|    loss               | -0.0187    |
|    mean_step_reward   | 0.10599108 |
| 

Saved checkpoint: ./runs_smw/checkpoints/Run_88.zip
[EVAL] Mean Return: 149.858, Best Return: 156.858
Saved video to ./runs_smw/videos/Run/Run_88_149.86.mp4

=== Round 90 | Learn 262144 steps (Total trained: 23330816) ===
Logging to ./runs_smw/tb/Run_0
---------------------------------
| time/              |          |
|    fps             | 1165     |
|    iterations      | 1        |
|    time_elapsed    | 7        |
|    total_timesteps | 23339008 |
---------------------------------
---------------------------------------
| time/                 |             |
|    fps                | 906         |
|    iterations         | 2           |
|    time_elapsed       | 18          |
|    total_timesteps    | 23347200    |
| train/                |             |
|    approx_kl          | 0.029423356 |
|    entropy_loss       | -1.77       |
|    explained_variance | 0.977       |
|    learning_rate      | 0.0001      |
|    loss               | -0.0383     |
|    mean_step_reward   | 0.1

Saved checkpoint: ./runs_smw/checkpoints/Run_89.zip
[EVAL] Mean Return: -10.139, Best Return: -10.139
Saved video to ./runs_smw/videos/Run/Run_89_-10.14.mp4

=== Round 91 | Learn 262144 steps (Total trained: 23592960) ===
Logging to ./runs_smw/tb/Run_0
---------------------------------
| time/              |          |
|    fps             | 1421     |
|    iterations      | 1        |
|    time_elapsed    | 5        |
|    total_timesteps | 23601152 |
---------------------------------
---------------------------------------
| time/                 |             |
|    fps                | 1072        |
|    iterations         | 2           |
|    time_elapsed       | 15          |
|    total_timesteps    | 23609344    |
| train/                |             |
|    approx_kl          | 0.030730005 |
|    entropy_loss       | -1.82       |
|    explained_variance | 0.963       |
|    learning_rate      | 0.0001      |
|    loss               | -0.0559     |
|    mean_step_reward   | 0.1

Saved checkpoint: ./runs_smw/checkpoints/Run_90.zip
[EVAL] Mean Return: 152.850, Best Return: 160.184
Saved video to ./runs_smw/videos/Run/Run_90_152.85.mp4

=== Round 92 | Learn 262144 steps (Total trained: 23855104) ===
Logging to ./runs_smw/tb/Run_0
---------------------------------
| time/              |          |
|    fps             | 1094     |
|    iterations      | 1        |
|    time_elapsed    | 7        |
|    total_timesteps | 23863296 |
---------------------------------
---------------------------------------
| time/                 |             |
|    fps                | 868         |
|    iterations         | 2           |
|    time_elapsed       | 18          |
|    total_timesteps    | 23871488    |
| train/                |             |
|    approx_kl          | 0.027209083 |
|    entropy_loss       | -1.82       |
|    explained_variance | 0.949       |
|    learning_rate      | 0.0001      |
|    loss               | -0.0153     |
|    mean_step_reward   | 0.1

Saved checkpoint: ./runs_smw/checkpoints/Run_91.zip
[EVAL] Mean Return: 153.251, Best Return: 160.417
Saved video to ./runs_smw/videos/Run/Run_91_153.25.mp4

=== Round 93 | Learn 262144 steps (Total trained: 24117248) ===
Logging to ./runs_smw/tb/Run_0
---------------------------------
| time/              |          |
|    fps             | 1133     |
|    iterations      | 1        |
|    time_elapsed    | 7        |
|    total_timesteps | 24125440 |
---------------------------------
---------------------------------------
| time/                 |             |
|    fps                | 911         |
|    iterations         | 2           |
|    time_elapsed       | 17          |
|    total_timesteps    | 24133632    |
| train/                |             |
|    approx_kl          | 0.026826901 |
|    entropy_loss       | -1.8        |
|    explained_variance | 0.963       |
|    learning_rate      | 0.0001      |
|    loss               | -0.045      |
|    mean_step_reward   | 0.1

Saved checkpoint: ./runs_smw/checkpoints/Run_92.zip
[EVAL] Mean Return: 156.097, Best Return: 163.097
Saved video to ./runs_smw/videos/Run/Run_92_156.10.mp4

=== Round 94 | Learn 262144 steps (Total trained: 24379392) ===
Logging to ./runs_smw/tb/Run_0
---------------------------------
| time/              |          |
|    fps             | 1086     |
|    iterations      | 1        |
|    time_elapsed    | 7        |
|    total_timesteps | 24387584 |
---------------------------------
---------------------------------------
| time/                 |             |
|    fps                | 887         |
|    iterations         | 2           |
|    time_elapsed       | 18          |
|    total_timesteps    | 24395776    |
| train/                |             |
|    approx_kl          | 0.023879819 |
|    entropy_loss       | -1.82       |
|    explained_variance | 0.956       |
|    learning_rate      | 0.0001      |
|    loss               | 0.0244      |
|    mean_step_reward   | 0.1

Saved checkpoint: ./runs_smw/checkpoints/Run_93.zip
[EVAL] Mean Return: 154.867, Best Return: 161.867
Saved video to ./runs_smw/videos/Run/Run_93_154.87.mp4

=== Round 95 | Learn 262144 steps (Total trained: 24641536) ===
Logging to ./runs_smw/tb/Run_0
---------------------------------
| time/              |          |
|    fps             | 1105     |
|    iterations      | 1        |
|    time_elapsed    | 7        |
|    total_timesteps | 24649728 |
---------------------------------
--------------------------------------
| time/                 |            |
|    fps                | 917        |
|    iterations         | 2          |
|    time_elapsed       | 17         |
|    total_timesteps    | 24657920   |
| train/                |            |
|    approx_kl          | 0.02180229 |
|    entropy_loss       | -1.81      |
|    explained_variance | 0.947      |
|    learning_rate      | 0.0001     |
|    loss               | 0.0156     |
|    mean_step_reward   | 0.11843361 |
| 

Saved checkpoint: ./runs_smw/checkpoints/Run_94.zip
[EVAL] Mean Return: 155.614, Best Return: 162.614
Saved video to ./runs_smw/videos/Run/Run_94_155.61.mp4

=== Round 96 | Learn 262144 steps (Total trained: 24903680) ===
Logging to ./runs_smw/tb/Run_0
---------------------------------
| time/              |          |
|    fps             | 1122     |
|    iterations      | 1        |
|    time_elapsed    | 7        |
|    total_timesteps | 24911872 |
---------------------------------
---------------------------------------
| time/                 |             |
|    fps                | 897         |
|    iterations         | 2           |
|    time_elapsed       | 18          |
|    total_timesteps    | 24920064    |
| train/                |             |
|    approx_kl          | 0.02600263  |
|    entropy_loss       | -1.87       |
|    explained_variance | 0.948       |
|    learning_rate      | 0.0001      |
|    loss               | -0.021      |
|    mean_step_reward   | 0.1

Saved checkpoint: ./runs_smw/checkpoints/Run_95.zip
[EVAL] Mean Return: 150.701, Best Return: 157.867
Saved video to ./runs_smw/videos/Run/Run_95_150.70.mp4

=== Round 97 | Learn 262144 steps (Total trained: 25165824) ===
Logging to ./runs_smw/tb/Run_0
---------------------------------
| time/              |          |
|    fps             | 1108     |
|    iterations      | 1        |
|    time_elapsed    | 7        |
|    total_timesteps | 25174016 |
---------------------------------
--------------------------------------
| time/                 |            |
|    fps                | 903        |
|    iterations         | 2          |
|    time_elapsed       | 18         |
|    total_timesteps    | 25182208   |
| train/                |            |
|    approx_kl          | 0.02213607 |
|    entropy_loss       | -1.81      |
|    explained_variance | 0.963      |
|    learning_rate      | 0.0001     |
|    loss               | 0.0161     |
|    mean_step_reward   | 0.11792127 |
| 

Saved checkpoint: ./runs_smw/checkpoints/Run_96.zip
[EVAL] Mean Return: 156.155, Best Return: 163.155
Saved video to ./runs_smw/videos/Run/Run_96_156.16.mp4

=== Round 98 | Learn 262144 steps (Total trained: 25427968) ===
Logging to ./runs_smw/tb/Run_0
---------------------------------
| time/              |          |
|    fps             | 1101     |
|    iterations      | 1        |
|    time_elapsed    | 7        |
|    total_timesteps | 25436160 |
---------------------------------
--------------------------------------
| time/                 |            |
|    fps                | 908        |
|    iterations         | 2          |
|    time_elapsed       | 18         |
|    total_timesteps    | 25444352   |
| train/                |            |
|    approx_kl          | 0.02808245 |
|    entropy_loss       | -1.83      |
|    explained_variance | 0.979      |
|    learning_rate      | 0.0001     |
|    loss               | 0.0494     |
|    mean_step_reward   | 0.12276604 |
| 

Saved checkpoint: ./runs_smw/checkpoints/Run_97.zip
[EVAL] Mean Return: 156.083, Best Return: 163.083
Saved video to ./runs_smw/videos/Run/Run_97_156.08.mp4

=== Round 99 | Learn 262144 steps (Total trained: 25690112) ===
Logging to ./runs_smw/tb/Run_0
---------------------------------
| time/              |          |
|    fps             | 1127     |
|    iterations      | 1        |
|    time_elapsed    | 7        |
|    total_timesteps | 25698304 |
---------------------------------
---------------------------------------
| time/                 |             |
|    fps                | 916         |
|    iterations         | 2           |
|    time_elapsed       | 17          |
|    total_timesteps    | 25706496    |
| train/                |             |
|    approx_kl          | 0.032261856 |
|    entropy_loss       | -1.78       |
|    explained_variance | 0.962       |
|    learning_rate      | 0.0001      |
|    loss               | 0.00995     |
|    mean_step_reward   | 0.1

Saved checkpoint: ./runs_smw/checkpoints/Run_98.zip
[EVAL] Mean Return: 156.810, Best Return: 163.810
Saved video to ./runs_smw/videos/Run/Run_98_156.81.mp4

=== Round 100 | Learn 262144 steps (Total trained: 25952256) ===
Logging to ./runs_smw/tb/Run_0
---------------------------------
| time/              |          |
|    fps             | 1095     |
|    iterations      | 1        |
|    time_elapsed    | 7        |
|    total_timesteps | 25960448 |
---------------------------------
--------------------------------------
| time/                 |            |
|    fps                | 891        |
|    iterations         | 2          |
|    time_elapsed       | 18         |
|    total_timesteps    | 25968640   |
| train/                |            |
|    approx_kl          | 0.01693855 |
|    entropy_loss       | -1.85      |
|    explained_variance | 0.958      |
|    learning_rate      | 0.0001     |
|    loss               | 0.0392     |
|    mean_step_reward   | 0.1109699  |
|

Saved checkpoint: ./runs_smw/checkpoints/Run_99.zip
[EVAL] Mean Return: 20.751, Best Return: 22.084
Saved video to ./runs_smw/videos/Run/Run_99_20.75.mp4

=== Round 101 | Learn 262144 steps (Total trained: 26214400) ===
Logging to ./runs_smw/tb/Run_0
---------------------------------
| time/              |          |
|    fps             | 1096     |
|    iterations      | 1        |
|    time_elapsed    | 7        |
|    total_timesteps | 26222592 |
---------------------------------
---------------------------------------
| time/                 |             |
|    fps                | 905         |
|    iterations         | 2           |
|    time_elapsed       | 18          |
|    total_timesteps    | 26230784    |
| train/                |             |
|    approx_kl          | 0.015362481 |
|    entropy_loss       | -1.83       |
|    explained_variance | 0.931       |
|    learning_rate      | 0.0001      |
|    loss               | 0.0127      |
|    mean_step_reward   | 0.118

Saved checkpoint: ./runs_smw/checkpoints/Run_100.zip
[EVAL] Mean Return: 155.387, Best Return: 162.387
Saved video to ./runs_smw/videos/Run/Run_100_155.39.mp4

=== Round 102 | Learn 262144 steps (Total trained: 26476544) ===
Logging to ./runs_smw/tb/Run_0
---------------------------------
| time/              |          |
|    fps             | 1119     |
|    iterations      | 1        |
|    time_elapsed    | 7        |
|    total_timesteps | 26484736 |
---------------------------------
---------------------------------------
| time/                 |             |
|    fps                | 913         |
|    iterations         | 2           |
|    time_elapsed       | 17          |
|    total_timesteps    | 26492928    |
| train/                |             |
|    approx_kl          | 0.023912037 |
|    entropy_loss       | -1.75       |
|    explained_variance | 0.974       |
|    learning_rate      | 0.0001      |
|    loss               | -0.0436     |
|    mean_step_reward   | 

Saved checkpoint: ./runs_smw/checkpoints/Run_101.zip
[EVAL] Mean Return: 89.473, Best Return: 94.473
Saved video to ./runs_smw/videos/Run/Run_101_89.47.mp4

=== Round 103 | Learn 262144 steps (Total trained: 26738688) ===
Logging to ./runs_smw/tb/Run_0
---------------------------------
| time/              |          |
|    fps             | 1098     |
|    iterations      | 1        |
|    time_elapsed    | 7        |
|    total_timesteps | 26746880 |
---------------------------------
--------------------------------------
| time/                 |            |
|    fps                | 905        |
|    iterations         | 2          |
|    time_elapsed       | 18         |
|    total_timesteps    | 26755072   |
| train/                |            |
|    approx_kl          | 0.02153026 |
|    entropy_loss       | -1.8       |
|    explained_variance | 0.974      |
|    learning_rate      | 0.0001     |
|    loss               | -0.0175    |
|    mean_step_reward   | 0.11655527 |
| 

Saved checkpoint: ./runs_smw/checkpoints/Run_102.zip
[EVAL] Mean Return: 95.328, Best Return: 100.328
Saved video to ./runs_smw/videos/Run/Run_102_95.33.mp4

=== Round 104 | Learn 262144 steps (Total trained: 27000832) ===
Logging to ./runs_smw/tb/Run_0
---------------------------------
| time/              |          |
|    fps             | 1092     |
|    iterations      | 1        |
|    time_elapsed    | 7        |
|    total_timesteps | 27009024 |
---------------------------------
---------------------------------------
| time/                 |             |
|    fps                | 899         |
|    iterations         | 2           |
|    time_elapsed       | 18          |
|    total_timesteps    | 27017216    |
| train/                |             |
|    approx_kl          | 0.021303087 |
|    entropy_loss       | -1.8        |
|    explained_variance | 0.957       |
|    learning_rate      | 0.0001      |
|    loss               | 0.0981      |
|    mean_step_reward   | 0.

Saved checkpoint: ./runs_smw/checkpoints/Run_103.zip
[EVAL] Mean Return: 154.514, Best Return: 161.180
Saved video to ./runs_smw/videos/Run/Run_103_154.51.mp4

=== Round 105 | Learn 262144 steps (Total trained: 27262976) ===
Logging to ./runs_smw/tb/Run_0
---------------------------------
| time/              |          |
|    fps             | 1158     |
|    iterations      | 1        |
|    time_elapsed    | 7        |
|    total_timesteps | 27271168 |
---------------------------------
---------------------------------------
| time/                 |             |
|    fps                | 919         |
|    iterations         | 2           |
|    time_elapsed       | 17          |
|    total_timesteps    | 27279360    |
| train/                |             |
|    approx_kl          | 0.021946821 |
|    entropy_loss       | -1.82       |
|    explained_variance | 0.964       |
|    learning_rate      | 0.0001      |
|    loss               | -0.0229     |
|    mean_step_reward   | 

## Display Video

In [None]:
# Show one recorded evaluation video inline in the notebook.
import glob
import os

# `display` is imported explicitly so the cell does not rely on the
# notebook front end injecting it into the global namespace.
from IPython.display import Video, display

# Disabled alternative: play the most recently created video of a labelled run.
# label = "Dec22A"

# list_of_files = glob.glob(os.path.join(VIDEO_DIR, label, '*.mp4'))
# if list_of_files:
#     latest_file = max(list_of_files, key=os.path.getctime)
#     print(f"Playing: {latest_file}")
#     latest_file = "runs_smw/videos/Dec22A/Dec22A_73_596.54.mp4"
#     print(f"Playing: {latest_file}")
#     display(Video(latest_file, embed=True, width=768))
# else:
#     print("No videos found yet.")

video = "./runs_smw/videos/test_126.mp4"
if os.path.isfile(video):
    # embed=True places the video data in the cell output itself, so the
    # saved notebook grows by roughly the size of the file.
    display(Video(video, embed=True, width=768))
else:
    # Report a wrong or stale path instead of rendering an unplayable widget.
    print(f"Video not found: {video}")

In [None]:
# Step through a recorded video one frame at a time in an OpenCV window.
import cv2

video_path = "runs_smw/videos/test_16.mp4"
cap = cv2.VideoCapture(video_path)

try:
    if not cap.isOpened():
        # Without this message a bad path just skips the loop silently.
        print(f"Could not open video: {video_path}")

    while cap.isOpened():
        ret, frame = cap.read()
        if not ret:
            # No more frames (or a read error): stop stepping.
            break

        cv2.imshow("Frame-by-Frame", frame)

        # waitKey(0) blocks until a key is pressed. Press 'q' to quit;
        # 'n' (or any other key) advances to the next frame.
        # The result is masked to its low byte because some platforms are
        # reported to return extra high bits in the key code.
        key = cv2.waitKey(0) & 0xFF
        if key == ord('q'):
            break
finally:
    # Always release the capture and close the window, even if the loop
    # is interrupted by an exception (e.g. a kernel interrupt).
    cap.release()
    cv2.destroyAllWindows()

In [None]:
"""
[070] coins: 12 | score: 3540
[071] coins: 10 | score: 2260
[072] coins: 11 | score: 2760
[073] coins:  2 | score:  690
[074] coins: 12 | score: 3450
[075] coins: 12 | score: 3515
[076] coins: 12 | score: 3545
[077] coins: 12 | score: 3545
[078] coins: 10 | score: 2460
[079] coins: 12 | score: 3515
[080] coins: 12 | score: 3580
[081] coins: 11 | score: 2750
[082] coins: 12 | score: 3545
[083] coins: 12 | score: 3565
[084] coins: 11 | score: 3475
[085] coins:  0 | score:    0
[086] coins: 12 | score: 3535
[087] coins: 12 | score: 3560
[088] coins:  9 | score: 1420
[089] coins: 11 | score: 3640
[090] coins:  1 | score:  380
[091] coins: 10 | score: 2440
[092] coins: 12 | score: 3570
[093] coins: 12 | score: 3490
[094] coins: 11 | score: 2745
[095] coins: 12 | score: 3565
[096] coins:  0 | score:    0
[097] coins: 12 | score: 3490
[098] coins: 12 | score: 3570
[099] coins:  2 | score:  560
[100] coins:  2 | score:  660
[101] coins: 12 | score: 3580
[102] coins:  9 | score: 1420
[103] coins: 12 | score: 3575
[104] coins: 12 | score: 3585
[105] coins: 12 | score: 3580
[106] coins: 12 | score: 3525
[107] coins:  2 | score:  540
[108] coins:  2 | score:  660
[109] coins: 10 | score: 2420
[110] coins:  1 | score:  140
[111] coins: 11 | score: 2680
[112] coins:  2 | score:  580
[113] coins:  2 | score:  580
[114] coins:  2 | score:  560
[115] coins: 11 | score: 2765
[116] coins:  2 | score:  560
[117] coins:  0 | score:    0
[118] coins: 12 | score: 3570
[119] coins:  1 | score:  340
[120] coins: 11 | score: 2735
[121] coins: 12 | score: 3570
[122] coins: 12 | score: 3515
[123] coins: 12 | score: 3580
[124] coins: 12 | score: 3585
[125] coins: 12 | score: 3560
[126] coins: 12 | score: 3595
[127] coins: 12 | score: 3515

All tests finished.
TODO: in the reward logging, record which actions were taken during the first 10 frames, then inspect them.
"""