# 2025 DL Lab8: RL Assignment_Super Mario World

**Your Answer:**    
Hi I'm XXX, XXXXXXXXXX.

## Overview
This project implements a **Deep Reinforcement Learning** pipeline to train an autonomous agent for Super Mario World. Leveraging the **Proximal Policy Optimization (PPO)** algorithm, the system interacts with the **stable-retro** environment to master the YoshiIsland1 level. Key components include a custom Vision Backbone for extracting features from raw pixel data and a suite of Environment Wrappers that handle frame preprocessing, action discretization, and reward shaping to facilitate efficient learning.

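Implement the reward function.  
Handle the start of the level explicitly (an enemy attacks right at the beginning).  
Implement the custom PPO.  
Pretrained weights make little difference; the reward function is what matters most.  
Model weight size limit: 1 GB.  
Do not rename existing classes (new ones may be added, but the original ones must stay unchanged).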

## Imports

In [1]:
import os
import numpy as np
import retro
from stable_baselines3.common.vec_env import DummyVecEnv, SubprocVecEnv
from stable_baselines3.common.vec_env import VecNormalize

from eval import evaluate_policy, record_video
from custom_policy import VisionBackbonePolicy, CustomPPO

  from .autonotebook import tqdm as notebook_tqdm


## Configuration

In [2]:
# --- Game ---
GAME = "SuperMarioWorld-Snes"
STATE = "YoshiIsland1"

# --- Training budget ---
N_ENVS = 16
LEARNING_RATE = 1e-4
TRAIN_CHUNK = 327_680      # timesteps per learn() call, i.e. between checkpoints
TOTAL_STEPS = 13_107_200   # 40 chunks of TRAIN_CHUNK

# --- Evaluation & recording ---
EVAL_EPISODES = 3
EVAL_MAX_STEPS = 18000
RECORD_STEPS = 1800

# --- Output locations (everything lives under LOG_DIR) ---
LOG_DIR = "./runs_smw"
VIDEO_DIR = os.path.join(LOG_DIR, "videos")
CKPT_DIR = os.path.join(LOG_DIR, "checkpoints")
TENSORBOARD_LOG = os.path.join(LOG_DIR, "tb")

# Create the output folders up front; the TensorBoard folder is not created here.
for _out_dir in (LOG_DIR, CKPT_DIR, VIDEO_DIR):
    os.makedirs(_out_dir, exist_ok=True)

## Environment Functions

In [3]:
from wrappers import make_base_env
def _make_env_thunk(game: str, state: str):
    """Return a function that creates an environment (for multiprocessing)."""
    def _thunk():
        return make_base_env(game, state)
    return _thunk

def make_vec_env(game: str, state: str, n_envs: int, use_subproc: bool = True):
    """Create a vectorized environment made of ``n_envs`` copies of one game state.

    Args:
        game: Game identifier forwarded to ``make_base_env``.
        state: Save-state / level name forwarded to ``make_base_env``.
        n_envs: Number of environment copies; must be at least 1.
        use_subproc: When True and ``n_envs > 1``, use ``SubprocVecEnv``;
            otherwise use ``DummyVecEnv``.

    Returns:
        A ``SubprocVecEnv`` or ``DummyVecEnv`` wrapping the environments.

    Raises:
        ValueError: If ``n_envs`` is smaller than 1.
    """
    # Fail early with a clear message instead of letting an empty factory
    # list reach the vec-env constructor.
    if n_envs < 1:
        raise ValueError(f"n_envs must be >= 1, got {n_envs}")

    env_fns = [_make_env_thunk(game, state) for _ in range(n_envs)]

    # A single environment always uses DummyVecEnv, regardless of use_subproc.
    if use_subproc and n_envs > 1:
        return SubprocVecEnv(env_fns)
    return DummyVecEnv(env_fns)

## Initialize Env & Model

In [4]:
# 1. Create the training environment
train_env = make_vec_env(GAME, STATE, n_envs=N_ENVS)
# train_env = VecNormalize(train_env, norm_obs=True, norm_reward=True, clip_obs=10., clip_reward=10.)
print(f"Environment created: {GAME} - {STATE} with {N_ENVS} parallel envs.")

# Checkpoint to resume from. Leave as None to train a new model from scratch;
# a real None (rather than the string "None") cannot collide with a file name.
checkpoint_path = None
# checkpoint_path = "runs_smw/checkpoints/SF84_step_1600000.zip"
# checkpoint_path = "runs_smw/checkpoints/SF84G_6553600.zip"
# checkpoint_path = "runs_smw/checkpoints/SF84G_5.zip"

# 2. Initialize the model: resume from the checkpoint if one was requested
#    and exists on disk, otherwise build a new model.
if checkpoint_path is not None and os.path.exists(checkpoint_path):
    print(f"Loading model from {checkpoint_path}...")
    # Load the existing model and attach it to the new training environment.
    model = CustomPPO.load(
        checkpoint_path,
        env=train_env,
        device="cuda:0",  # request the GPU
    )
else:
    # Distinguish "nothing requested" from "requested file is missing" so the
    # log does not report a load failure when none was attempted.
    if checkpoint_path is None:
        print("No checkpoint specified; training a new model from scratch.")
    else:
        print(f"Checkpoint {checkpoint_path} not found; training a new model from scratch.")
    model = CustomPPO(
        VisionBackbonePolicy,
        train_env,
        policy_kwargs   = dict(normalize_images=False),
        n_epochs        = 4,
        n_steps         = 512,
        batch_size      = 512,
        learning_rate   = LEARNING_RATE,
        verbose         = 1,
        gamma           = 0.99,
        kl_coef         = 1,
        clip_range      = 0.125,
        tensorboard_log = TENSORBOARD_LOG,
    )

Environment created: SuperMarioWorld-Snes - YoshiIsland1 with 16 parallel envs.
Fail to load None...
Using cuda:0 device


In [5]:
# model.save("policy")

## Training Loop

In [6]:
# Train in fixed-size chunks; after each chunk save a checkpoint, evaluate the
# policy and record a video. The training environment is always closed on
# exit, including on Ctrl-C or an error.
best_mean = -1e18  # only used by the disabled "Save Best Model" step below
trained = 0        # timesteps requested from model.learn() so far in this run
round_idx = 0
label = "PIPE"     # run name: TensorBoard log name, checkpoint and video prefix

try:
    # All videos of this run go into one sub-folder of VIDEO_DIR. The folder
    # does not change between rounds, so create it once.
    out_path = os.path.join(VIDEO_DIR, label)
    os.makedirs(out_path, exist_ok=True)

    while trained < TOTAL_STEPS:
        round_idx += 1
        chunk = min(TRAIN_CHUNK, TOTAL_STEPS - trained)
        # chunk = 2000
        # Index of the chunk about to be trained (integer division keeps it
        # exact and avoids a float round-trip).
        tagged_label = f"{label}_{trained // TRAIN_CHUNK}"

        print(f"\n=== Round {round_idx} | Learn {chunk} steps (Total trained: {trained}) ===")

        # --- Train ---
        # reset_num_timesteps=False keeps the model's timestep counter
        # running across chunks.
        model.learn(total_timesteps=chunk, reset_num_timesteps=False, tb_log_name=label)
        trained += chunk

        # --- Save Checkpoint ---
        ckpt_path = os.path.join(CKPT_DIR, f"{tagged_label}.zip")
        model.save(ckpt_path)
        print(f"Saved checkpoint: {ckpt_path}")

        # --- Evaluate ---
        mean_ret, best_ret = evaluate_policy(
            model,
            GAME,
            STATE,
            n_episodes=EVAL_EPISODES,
            max_steps=EVAL_MAX_STEPS,
        )
        print(f"[EVAL] Mean Return: {mean_ret:.3f}, Best Return: {best_ret:.3f}")

        # --- Save Best Model (disabled) ---
        # if mean_ret > best_mean:
        #     best_mean = mean_ret
        #     best_path = os.path.join(LOG_DIR, "best_model.zip")
        #     model.save(best_path)
        #     print(f"New best record. Saved to {best_path}")

        # --- Record Video ---
        # The prefix includes the sub-folder created before the loop and the
        # evaluation score.
        record_video(
            model,
            GAME,
            STATE,
            VIDEO_DIR,
            video_len=RECORD_STEPS,
            prefix=f"{label}/{tagged_label}_{mean_ret:.2f}",
        )

except KeyboardInterrupt:
    print("\nTraining interrupted manually.")

finally:
    train_env.close()
    print("Training finished. Environment closed.")

# Monitor training with:
#   tensorboard --logdir=./runs_smw/tb


=== Round 1 | Learn 327680 steps (Total trained: 0) ===
Logging to ./runs_smw/tb/PIPE_0


-----------------------------
| time/              |      |
|    fps             | 1118 |
|    iterations      | 1    |
|    time_elapsed    | 7    |
|    total_timesteps | 8192 |
-----------------------------
---------------------------------------
| time/                 |             |
|    fps                | 931         |
|    iterations         | 2           |
|    time_elapsed       | 17          |
|    total_timesteps    | 16384       |
| train/                |             |
|    approx_kl          | 0.005228821 |
|    entropy_loss       | -2.48       |
|    explained_variance | -0.00291    |
|    learning_rate      | 0.0001      |
|    loss               | -0.0387     |
|    mean_step_reward   | 0.008536389 |
|    n_updates          | 4           |
|    policyGradLoss     | -0.0122     |
|    value_loss         | 0.0917      |
---------------------------------------
----------------------------------------
| time/                 |              |
|    fps                | 87

Saved checkpoint: ./runs_smw/checkpoints/PIPE_0.zip
[EVAL] Mean Return: -0.274, Best Return: -0.274
Saved video to ./runs_smw/videos/PIPE/PIPE_0_-0.27.mp4

=== Round 2 | Learn 327680 steps (Total trained: 327680) ===
Logging to ./runs_smw/tb/PIPE_0
-------------------------------
| time/              |        |
|    fps             | 1094   |
|    iterations      | 1      |
|    time_elapsed    | 7      |
|    total_timesteps | 335872 |
-------------------------------
-----------------------------------------
| time/                 |               |
|    fps                | 920           |
|    iterations         | 2             |
|    time_elapsed       | 17            |
|    total_timesteps    | 344064        |
| train/                |               |
|    approx_kl          | 0.00090827805 |
|    entropy_loss       | -2.25         |
|    explained_variance | 0.871         |
|    learning_rate      | 0.0001        |
|    loss               | 0.105         |
|    mean_step_reward  

Saved checkpoint: ./runs_smw/checkpoints/PIPE_1.zip
[EVAL] Mean Return: -0.274, Best Return: -0.274
Saved video to ./runs_smw/videos/PIPE/PIPE_1_-0.27.mp4

=== Round 3 | Learn 327680 steps (Total trained: 655360) ===
Logging to ./runs_smw/tb/PIPE_0
-------------------------------
| time/              |        |
|    fps             | 1104   |
|    iterations      | 1      |
|    time_elapsed    | 7      |
|    total_timesteps | 663552 |
-------------------------------
---------------------------------------
| time/                 |             |
|    fps                | 927         |
|    iterations         | 2           |
|    time_elapsed       | 17          |
|    total_timesteps    | 671744      |
| train/                |             |
|    approx_kl          | 0.00288287  |
|    entropy_loss       | -2.22       |
|    explained_variance | 0.927       |
|    learning_rate      | 0.0001      |
|    loss               | 0.045       |
|    mean_step_reward   | 0.048408493 |
|    n_

Saved checkpoint: ./runs_smw/checkpoints/PIPE_2.zip
[EVAL] Mean Return: -0.274, Best Return: -0.274
Saved video to ./runs_smw/videos/PIPE/PIPE_2_-0.27.mp4

=== Round 4 | Learn 327680 steps (Total trained: 983040) ===
Logging to ./runs_smw/tb/PIPE_0
-------------------------------
| time/              |        |
|    fps             | 1117   |
|    iterations      | 1      |
|    time_elapsed    | 7      |
|    total_timesteps | 991232 |
-------------------------------
----------------------------------------
| time/                 |              |
|    fps                | 927          |
|    iterations         | 2            |
|    time_elapsed       | 17           |
|    total_timesteps    | 999424       |
| train/                |              |
|    approx_kl          | 0.0044983127 |
|    entropy_loss       | -2.19        |
|    explained_variance | 0.916        |
|    learning_rate      | 0.0001       |
|    loss               | 0.137        |
|    mean_step_reward   | 0.0544358

Saved checkpoint: ./runs_smw/checkpoints/PIPE_3.zip
[EVAL] Mean Return: 270.864, Best Return: 270.864
Saved video to ./runs_smw/videos/PIPE/PIPE_3_270.86.mp4

=== Round 5 | Learn 327680 steps (Total trained: 1310720) ===
Logging to ./runs_smw/tb/PIPE_0
--------------------------------
| time/              |         |
|    fps             | 1129    |
|    iterations      | 1       |
|    time_elapsed    | 7       |
|    total_timesteps | 1318912 |
--------------------------------
----------------------------------------
| time/                 |              |
|    fps                | 942          |
|    iterations         | 2            |
|    time_elapsed       | 17           |
|    total_timesteps    | 1327104      |
| train/                |              |
|    approx_kl          | 0.0041003097 |
|    entropy_loss       | -2.11        |
|    explained_variance | 0.851        |
|    learning_rate      | 0.0001       |
|    loss               | 0.346        |
|    mean_step_reward   

Saved checkpoint: ./runs_smw/checkpoints/PIPE_4.zip
[EVAL] Mean Return: -0.274, Best Return: -0.274
Saved video to ./runs_smw/videos/PIPE/PIPE_4_-0.27.mp4

=== Round 6 | Learn 327680 steps (Total trained: 1638400) ===
Logging to ./runs_smw/tb/PIPE_0
--------------------------------
| time/              |         |
|    fps             | 1132    |
|    iterations      | 1       |
|    time_elapsed    | 7       |
|    total_timesteps | 1646592 |
--------------------------------
---------------------------------------
| time/                 |             |
|    fps                | 948         |
|    iterations         | 2           |
|    time_elapsed       | 17          |
|    total_timesteps    | 1654784     |
| train/                |             |
|    approx_kl          | 0.008576376 |
|    entropy_loss       | -2          |
|    explained_variance | 0.87        |
|    learning_rate      | 0.0001      |
|    loss               | 0.39        |
|    mean_step_reward   | 0.07470573  |

Saved checkpoint: ./runs_smw/checkpoints/PIPE_5.zip
[EVAL] Mean Return: 5.688, Best Return: 5.688
Saved video to ./runs_smw/videos/PIPE/PIPE_5_5.69.mp4

=== Round 7 | Learn 327680 steps (Total trained: 1966080) ===
Logging to ./runs_smw/tb/PIPE_0
--------------------------------
| time/              |         |
|    fps             | 1107    |
|    iterations      | 1       |
|    time_elapsed    | 7       |
|    total_timesteps | 1974272 |
--------------------------------
---------------------------------------
| time/                 |             |
|    fps                | 920         |
|    iterations         | 2           |
|    time_elapsed       | 17          |
|    total_timesteps    | 1982464     |
| train/                |             |
|    approx_kl          | 0.010430824 |
|    entropy_loss       | -2.04       |
|    explained_variance | 0.874       |
|    learning_rate      | 0.0001      |
|    loss               | 0.291       |
|    mean_step_reward   | 0.075343445 |
| 

Saved checkpoint: ./runs_smw/checkpoints/PIPE_6.zip
[EVAL] Mean Return: 5.644, Best Return: 5.644
Saved video to ./runs_smw/videos/PIPE/PIPE_6_5.64.mp4

=== Round 8 | Learn 327680 steps (Total trained: 2293760) ===
Logging to ./runs_smw/tb/PIPE_0
--------------------------------
| time/              |         |
|    fps             | 1111    |
|    iterations      | 1       |
|    time_elapsed    | 7       |
|    total_timesteps | 2301952 |
--------------------------------
---------------------------------------
| time/                 |             |
|    fps                | 939         |
|    iterations         | 2           |
|    time_elapsed       | 17          |
|    total_timesteps    | 2310144     |
| train/                |             |
|    approx_kl          | 0.009665249 |
|    entropy_loss       | -1.94       |
|    explained_variance | 0.903       |
|    learning_rate      | 0.0001      |
|    loss               | 0.145       |
|    mean_step_reward   | 0.08902687  |
| 

Saved checkpoint: ./runs_smw/checkpoints/PIPE_7.zip
[EVAL] Mean Return: 67.332, Best Return: 67.382
Saved video to ./runs_smw/videos/PIPE/PIPE_7_67.33.mp4

=== Round 9 | Learn 327680 steps (Total trained: 2621440) ===
Logging to ./runs_smw/tb/PIPE_0
--------------------------------
| time/              |         |
|    fps             | 1111    |
|    iterations      | 1       |
|    time_elapsed    | 7       |
|    total_timesteps | 2629632 |
--------------------------------
----------------------------------------
| time/                 |              |
|    fps                | 931          |
|    iterations         | 2            |
|    time_elapsed       | 17           |
|    total_timesteps    | 2637824      |
| train/                |              |
|    approx_kl          | 0.0063825687 |
|    entropy_loss       | -1.95        |
|    explained_variance | 0.832        |
|    learning_rate      | 0.0001       |
|    loss               | 0.623        |
|    mean_step_reward   | 0

Saved checkpoint: ./runs_smw/checkpoints/PIPE_8.zip
[EVAL] Mean Return: 18.170, Best Return: 18.220
Saved video to ./runs_smw/videos/PIPE/PIPE_8_18.17.mp4

=== Round 10 | Learn 327680 steps (Total trained: 2949120) ===
Logging to ./runs_smw/tb/PIPE_0
--------------------------------
| time/              |         |
|    fps             | 1077    |
|    iterations      | 1       |
|    time_elapsed    | 7       |
|    total_timesteps | 2957312 |
--------------------------------
---------------------------------------
| time/                 |             |
|    fps                | 930         |
|    iterations         | 2           |
|    time_elapsed       | 17          |
|    total_timesteps    | 2965504     |
| train/                |             |
|    approx_kl          | 0.008207271 |
|    entropy_loss       | -2          |
|    explained_variance | 0.862       |
|    learning_rate      | 0.0001      |
|    loss               | 0.387       |
|    mean_step_reward   | 0.086823076 

Saved checkpoint: ./runs_smw/checkpoints/PIPE_9.zip
[EVAL] Mean Return: 57.136, Best Return: 57.236
Saved video to ./runs_smw/videos/PIPE/PIPE_9_57.14.mp4

=== Round 11 | Learn 327680 steps (Total trained: 3276800) ===
Logging to ./runs_smw/tb/PIPE_0
--------------------------------
| time/              |         |
|    fps             | 1124    |
|    iterations      | 1       |
|    time_elapsed    | 7       |
|    total_timesteps | 3284992 |
--------------------------------
---------------------------------------
| time/                 |             |
|    fps                | 918         |
|    iterations         | 2           |
|    time_elapsed       | 17          |
|    total_timesteps    | 3293184     |
| train/                |             |
|    approx_kl          | 0.009624194 |
|    entropy_loss       | -1.97       |
|    explained_variance | 0.93        |
|    learning_rate      | 0.0001      |
|    loss               | 0.155       |
|    mean_step_reward   | 0.0943924   

Saved checkpoint: ./runs_smw/checkpoints/PIPE_10.zip
[EVAL] Mean Return: 280.729, Best Return: 280.829
Saved video to ./runs_smw/videos/PIPE/PIPE_10_280.73.mp4

=== Round 12 | Learn 327680 steps (Total trained: 3604480) ===
Logging to ./runs_smw/tb/PIPE_0
--------------------------------
| time/              |         |
|    fps             | 1117    |
|    iterations      | 1       |
|    time_elapsed    | 7       |
|    total_timesteps | 3612672 |
--------------------------------
---------------------------------------
| time/                 |             |
|    fps                | 959         |
|    iterations         | 2           |
|    time_elapsed       | 17          |
|    total_timesteps    | 3620864     |
| train/                |             |
|    approx_kl          | 0.009023877 |
|    entropy_loss       | -1.92       |
|    explained_variance | 0.894       |
|    learning_rate      | 0.0001      |
|    loss               | 0.252       |
|    mean_step_reward   | 0.09732

Saved checkpoint: ./runs_smw/checkpoints/PIPE_11.zip
[EVAL] Mean Return: -114.586, Best Return: -114.536
Saved video to ./runs_smw/videos/PIPE/PIPE_11_-114.59.mp4

=== Round 13 | Learn 327680 steps (Total trained: 3932160) ===
Logging to ./runs_smw/tb/PIPE_0
--------------------------------
| time/              |         |
|    fps             | 1111    |
|    iterations      | 1       |
|    time_elapsed    | 7       |
|    total_timesteps | 3940352 |
--------------------------------
---------------------------------------
| time/                 |             |
|    fps                | 940         |
|    iterations         | 2           |
|    time_elapsed       | 17          |
|    total_timesteps    | 3948544     |
| train/                |             |
|    approx_kl          | 0.012517181 |
|    entropy_loss       | -1.89       |
|    explained_variance | 0.897       |
|    learning_rate      | 0.0001      |
|    loss               | 0.13        |
|    mean_step_reward   | 0.10

Saved checkpoint: ./runs_smw/checkpoints/PIPE_12.zip
[EVAL] Mean Return: 82.358, Best Return: 82.508
Saved video to ./runs_smw/videos/PIPE/PIPE_12_82.36.mp4

=== Round 14 | Learn 327680 steps (Total trained: 4259840) ===
Logging to ./runs_smw/tb/PIPE_0
--------------------------------
| time/              |         |
|    fps             | 1109    |
|    iterations      | 1       |
|    time_elapsed    | 7       |
|    total_timesteps | 4268032 |
--------------------------------
----------------------------------------
| time/                 |              |
|    fps                | 928          |
|    iterations         | 2            |
|    time_elapsed       | 17           |
|    total_timesteps    | 4276224      |
| train/                |              |
|    approx_kl          | 0.0085531175 |
|    entropy_loss       | -1.9         |
|    explained_variance | 0.847        |
|    learning_rate      | 0.0001       |
|    loss               | 0.227        |
|    mean_step_reward   

Saved checkpoint: ./runs_smw/checkpoints/PIPE_13.zip
[EVAL] Mean Return: 145.686, Best Return: 145.886
Saved video to ./runs_smw/videos/PIPE/PIPE_13_145.69.mp4

=== Round 15 | Learn 327680 steps (Total trained: 4587520) ===
Logging to ./runs_smw/tb/PIPE_0
--------------------------------
| time/              |         |
|    fps             | 1129    |
|    iterations      | 1       |
|    time_elapsed    | 7       |
|    total_timesteps | 4595712 |
--------------------------------
----------------------------------------
| time/                 |              |
|    fps                | 934          |
|    iterations         | 2            |
|    time_elapsed       | 17           |
|    total_timesteps    | 4603904      |
| train/                |              |
|    approx_kl          | 0.0123361945 |
|    entropy_loss       | -1.91        |
|    explained_variance | 0.888        |
|    learning_rate      | 0.0001       |
|    loss               | 0.19         |
|    mean_step_reward

Saved checkpoint: ./runs_smw/checkpoints/PIPE_14.zip
[EVAL] Mean Return: 274.507, Best Return: 274.557
Saved video to ./runs_smw/videos/PIPE/PIPE_14_274.51.mp4

=== Round 16 | Learn 327680 steps (Total trained: 4915200) ===
Logging to ./runs_smw/tb/PIPE_0
--------------------------------
| time/              |         |
|    fps             | 1115    |
|    iterations      | 1       |
|    time_elapsed    | 7       |
|    total_timesteps | 4923392 |
--------------------------------
---------------------------------------
| time/                 |             |
|    fps                | 943         |
|    iterations         | 2           |
|    time_elapsed       | 17          |
|    total_timesteps    | 4931584     |
| train/                |             |
|    approx_kl          | 0.009272688 |
|    entropy_loss       | -1.95       |
|    explained_variance | 0.905       |
|    learning_rate      | 0.0001      |
|    loss               | 0.25        |
|    mean_step_reward   | 0.09772

Saved checkpoint: ./runs_smw/checkpoints/PIPE_15.zip
[EVAL] Mean Return: 23.708, Best Return: 23.758
Saved video to ./runs_smw/videos/PIPE/PIPE_15_23.71.mp4

=== Round 17 | Learn 327680 steps (Total trained: 5242880) ===
Logging to ./runs_smw/tb/PIPE_0
--------------------------------
| time/              |         |
|    fps             | 1074    |
|    iterations      | 1       |
|    time_elapsed    | 7       |
|    total_timesteps | 5251072 |
--------------------------------
---------------------------------------
| time/                 |             |
|    fps                | 917         |
|    iterations         | 2           |
|    time_elapsed       | 17          |
|    total_timesteps    | 5259264     |
| train/                |             |
|    approx_kl          | 0.008452401 |
|    entropy_loss       | -1.9        |
|    explained_variance | 0.905       |
|    learning_rate      | 0.0001      |
|    loss               | 0.136       |
|    mean_step_reward   | 0.11494108

Saved checkpoint: ./runs_smw/checkpoints/PIPE_16.zip
[EVAL] Mean Return: 186.451, Best Return: 186.751
Saved video to ./runs_smw/videos/PIPE/PIPE_16_186.45.mp4

=== Round 18 | Learn 327680 steps (Total trained: 5570560) ===
Logging to ./runs_smw/tb/PIPE_0
--------------------------------
| time/              |         |
|    fps             | 1138    |
|    iterations      | 1       |
|    time_elapsed    | 7       |
|    total_timesteps | 5578752 |
--------------------------------
---------------------------------------
| time/                 |             |
|    fps                | 939         |
|    iterations         | 2           |
|    time_elapsed       | 17          |
|    total_timesteps    | 5586944     |
| train/                |             |
|    approx_kl          | 0.009525218 |
|    entropy_loss       | -1.95       |
|    explained_variance | 0.922       |
|    learning_rate      | 0.0001      |
|    loss               | 0.166       |
|    mean_step_reward   | 0.11724

Saved checkpoint: ./runs_smw/checkpoints/PIPE_17.zip
[EVAL] Mean Return: 186.889, Best Return: 187.189
Saved video to ./runs_smw/videos/PIPE/PIPE_17_186.89.mp4

=== Round 19 | Learn 327680 steps (Total trained: 5898240) ===
Logging to ./runs_smw/tb/PIPE_0
--------------------------------
| time/              |         |
|    fps             | 1115    |
|    iterations      | 1       |
|    time_elapsed    | 7       |
|    total_timesteps | 5906432 |
--------------------------------
---------------------------------------
| time/                 |             |
|    fps                | 931         |
|    iterations         | 2           |
|    time_elapsed       | 17          |
|    total_timesteps    | 5914624     |
| train/                |             |
|    approx_kl          | 0.009112071 |
|    entropy_loss       | -1.95       |
|    explained_variance | 0.933       |
|    learning_rate      | 0.0001      |
|    loss               | 0.174       |
|    mean_step_reward   | 0.12543

Saved checkpoint: ./runs_smw/checkpoints/PIPE_18.zip
[EVAL] Mean Return: -0.278, Best Return: -0.278
Saved video to ./runs_smw/videos/PIPE/PIPE_18_-0.28.mp4

=== Round 20 | Learn 327680 steps (Total trained: 6225920) ===
Logging to ./runs_smw/tb/PIPE_0
--------------------------------
| time/              |         |
|    fps             | 1111    |
|    iterations      | 1       |
|    time_elapsed    | 7       |
|    total_timesteps | 6234112 |
--------------------------------
---------------------------------------
| time/                 |             |
|    fps                | 935         |
|    iterations         | 2           |
|    time_elapsed       | 17          |
|    total_timesteps    | 6242304     |
| train/                |             |
|    approx_kl          | 0.013216671 |
|    entropy_loss       | -1.9        |
|    explained_variance | 0.953       |
|    learning_rate      | 0.0001      |
|    loss               | 0.1         |
|    mean_step_reward   | 0.12600796

Saved checkpoint: ./runs_smw/checkpoints/PIPE_19.zip
[EVAL] Mean Return: 188.412, Best Return: 188.712
Saved video to ./runs_smw/videos/PIPE/PIPE_19_188.41.mp4

=== Round 21 | Learn 327680 steps (Total trained: 6553600) ===
Logging to ./runs_smw/tb/PIPE_0
--------------------------------
| time/              |         |
|    fps             | 1124    |
|    iterations      | 1       |
|    time_elapsed    | 7       |
|    total_timesteps | 6561792 |
--------------------------------
---------------------------------------
| time/                 |             |
|    fps                | 949         |
|    iterations         | 2           |
|    time_elapsed       | 17          |
|    total_timesteps    | 6569984     |
| train/                |             |
|    approx_kl          | 0.009955311 |
|    entropy_loss       | -1.92       |
|    explained_variance | 0.948       |
|    learning_rate      | 0.0001      |
|    loss               | 0.315       |
|    mean_step_reward   | 0.13553

Saved checkpoint: ./runs_smw/checkpoints/PIPE_20.zip
[EVAL] Mean Return: 188.004, Best Return: 188.304
Saved video to ./runs_smw/videos/PIPE/PIPE_20_188.00.mp4

=== Round 22 | Learn 327680 steps (Total trained: 6881280) ===
Logging to ./runs_smw/tb/PIPE_0
--------------------------------
| time/              |         |
|    fps             | 1082    |
|    iterations      | 1       |
|    time_elapsed    | 7       |
|    total_timesteps | 6889472 |
--------------------------------
---------------------------------------
| time/                 |             |
|    fps                | 925         |
|    iterations         | 2           |
|    time_elapsed       | 17          |
|    total_timesteps    | 6897664     |
| train/                |             |
|    approx_kl          | 0.008886844 |
|    entropy_loss       | -1.91       |
|    explained_variance | 0.927       |
|    learning_rate      | 0.0001      |
|    loss               | 0.0368      |
|    mean_step_reward   | 0.11615

Saved checkpoint: ./runs_smw/checkpoints/PIPE_21.zip
[EVAL] Mean Return: 162.465, Best Return: 162.715
Saved video to ./runs_smw/videos/PIPE/PIPE_21_162.47.mp4

=== Round 23 | Learn 327680 steps (Total trained: 7208960) ===
Logging to ./runs_smw/tb/PIPE_0
--------------------------------
| time/              |         |
|    fps             | 1091    |
|    iterations      | 1       |
|    time_elapsed    | 7       |
|    total_timesteps | 7217152 |
--------------------------------
---------------------------------------
| time/                 |             |
|    fps                | 915         |
|    iterations         | 2           |
|    time_elapsed       | 17          |
|    total_timesteps    | 7225344     |
| train/                |             |
|    approx_kl          | 0.008005666 |
|    entropy_loss       | -1.91       |
|    explained_variance | 0.974       |
|    learning_rate      | 0.0001      |
|    loss               | 0.0915      |
|    mean_step_reward   | 0.13999

Saved checkpoint: ./runs_smw/checkpoints/PIPE_22.zip
[EVAL] Mean Return: 187.482, Best Return: 187.782
Saved video to ./runs_smw/videos/PIPE/PIPE_22_187.48.mp4

=== Round 24 | Learn 327680 steps (Total trained: 7536640) ===
Logging to ./runs_smw/tb/PIPE_0
--------------------------------
| time/              |         |
|    fps             | 1110    |
|    iterations      | 1       |
|    time_elapsed    | 7       |
|    total_timesteps | 7544832 |
--------------------------------
---------------------------------------
| time/                 |             |
|    fps                | 932         |
|    iterations         | 2           |
|    time_elapsed       | 17          |
|    total_timesteps    | 7553024     |
| train/                |             |
|    approx_kl          | 0.008957449 |
|    entropy_loss       | -1.93       |
|    explained_variance | 0.979       |
|    learning_rate      | 0.0001      |
|    loss               | 0.086       |
|    mean_step_reward   | 0.13369

Saved checkpoint: ./runs_smw/checkpoints/PIPE_23.zip
[EVAL] Mean Return: 188.283, Best Return: 188.583
Saved video to ./runs_smw/videos/PIPE/PIPE_23_188.28.mp4

=== Round 25 | Learn 327680 steps (Total trained: 7864320) ===
Logging to ./runs_smw/tb/PIPE_0
--------------------------------
| time/              |         |
|    fps             | 1078    |
|    iterations      | 1       |
|    time_elapsed    | 7       |
|    total_timesteps | 7872512 |
--------------------------------
---------------------------------------
| time/                 |             |
|    fps                | 919         |
|    iterations         | 2           |
|    time_elapsed       | 17          |
|    total_timesteps    | 7880704     |
| train/                |             |
|    approx_kl          | 0.008159247 |
|    entropy_loss       | -1.84       |
|    explained_variance | 0.933       |
|    learning_rate      | 0.0001      |
|    loss               | 0.0392      |
|    mean_step_reward   | 0.13215

Saved checkpoint: ./runs_smw/checkpoints/PIPE_24.zip
[EVAL] Mean Return: 189.383, Best Return: 189.683
Saved video to ./runs_smw/videos/PIPE/PIPE_24_189.38.mp4

=== Round 26 | Learn 327680 steps (Total trained: 8192000) ===
Logging to ./runs_smw/tb/PIPE_0
--------------------------------
| time/              |         |
|    fps             | 1057    |
|    iterations      | 1       |
|    time_elapsed    | 7       |
|    total_timesteps | 8200192 |
--------------------------------
---------------------------------------
| time/                 |             |
|    fps                | 926         |
|    iterations         | 2           |
|    time_elapsed       | 17          |
|    total_timesteps    | 8208384     |
| train/                |             |
|    approx_kl          | 0.019005472 |
|    entropy_loss       | -1.86       |
|    explained_variance | 0.901       |
|    learning_rate      | 0.0001      |
|    loss               | 0.145       |
|    mean_step_reward   | 0.11991

Saved checkpoint: ./runs_smw/checkpoints/PIPE_25.zip
[EVAL] Mean Return: 188.982, Best Return: 189.282
Saved video to ./runs_smw/videos/PIPE/PIPE_25_188.98.mp4

=== Round 27 | Learn 327680 steps (Total trained: 8519680) ===
Logging to ./runs_smw/tb/PIPE_0
--------------------------------
| time/              |         |
|    fps             | 1142    |
|    iterations      | 1       |
|    time_elapsed    | 7       |
|    total_timesteps | 8527872 |
--------------------------------
---------------------------------------
| time/                 |             |
|    fps                | 947         |
|    iterations         | 2           |
|    time_elapsed       | 17          |
|    total_timesteps    | 8536064     |
| train/                |             |
|    approx_kl          | 0.010166215 |
|    entropy_loss       | -1.82       |
|    explained_variance | 0.964       |
|    learning_rate      | 0.0001      |
|    loss               | 0.0195      |
|    mean_step_reward   | 0.13724

Saved checkpoint: ./runs_smw/checkpoints/PIPE_26.zip
[EVAL] Mean Return: 191.448, Best Return: 191.748
Saved video to ./runs_smw/videos/PIPE/PIPE_26_191.45.mp4

=== Round 28 | Learn 327680 steps (Total trained: 8847360) ===
Logging to ./runs_smw/tb/PIPE_0
--------------------------------
| time/              |         |
|    fps             | 1132    |
|    iterations      | 1       |
|    time_elapsed    | 7       |
|    total_timesteps | 8855552 |
--------------------------------
---------------------------------------
| time/                 |             |
|    fps                | 915         |
|    iterations         | 2           |
|    time_elapsed       | 17          |
|    total_timesteps    | 8863744     |
| train/                |             |
|    approx_kl          | 0.011459177 |
|    entropy_loss       | -1.9        |
|    explained_variance | 0.978       |
|    learning_rate      | 0.0001      |
|    loss               | 0.126       |
|    mean_step_reward   | 0.13848

Saved checkpoint: ./runs_smw/checkpoints/PIPE_27.zip
[EVAL] Mean Return: 189.635, Best Return: 189.935
Saved video to ./runs_smw/videos/PIPE/PIPE_27_189.63.mp4

=== Round 29 | Learn 327680 steps (Total trained: 9175040) ===
Logging to ./runs_smw/tb/PIPE_0
--------------------------------
| time/              |         |
|    fps             | 1114    |
|    iterations      | 1       |
|    time_elapsed    | 7       |
|    total_timesteps | 9183232 |
--------------------------------
---------------------------------------
| time/                 |             |
|    fps                | 929         |
|    iterations         | 2           |
|    time_elapsed       | 17          |
|    total_timesteps    | 9191424     |
| train/                |             |
|    approx_kl          | 0.010408166 |
|    entropy_loss       | -1.9        |
|    explained_variance | 0.969       |
|    learning_rate      | 0.0001      |
|    loss               | 0.0655      |
|    mean_step_reward   | 0.13948

Saved checkpoint: ./runs_smw/checkpoints/PIPE_28.zip
[EVAL] Mean Return: 282.261, Best Return: 282.361
Saved video to ./runs_smw/videos/PIPE/PIPE_28_282.26.mp4

=== Round 30 | Learn 327680 steps (Total trained: 9502720) ===
Logging to ./runs_smw/tb/PIPE_0
--------------------------------
| time/              |         |
|    fps             | 1131    |
|    iterations      | 1       |
|    time_elapsed    | 7       |
|    total_timesteps | 9510912 |
--------------------------------
----------------------------------------
| time/                 |              |
|    fps                | 944          |
|    iterations         | 2            |
|    time_elapsed       | 17           |
|    total_timesteps    | 9519104      |
| train/                |              |
|    approx_kl          | 0.0128744645 |
|    entropy_loss       | -1.81        |
|    explained_variance | 0.97         |
|    learning_rate      | 0.0001       |
|    loss               | 0.0153       |
|    mean_step_reward

Saved checkpoint: ./runs_smw/checkpoints/PIPE_29.zip
[EVAL] Mean Return: 190.652, Best Return: 190.952
Saved video to ./runs_smw/videos/PIPE/PIPE_29_190.65.mp4

=== Round 31 | Learn 327680 steps (Total trained: 9830400) ===
Logging to ./runs_smw/tb/PIPE_0
--------------------------------
| time/              |         |
|    fps             | 1126    |
|    iterations      | 1       |
|    time_elapsed    | 7       |
|    total_timesteps | 9838592 |
--------------------------------
---------------------------------------
| time/                 |             |
|    fps                | 925         |
|    iterations         | 2           |
|    time_elapsed       | 17          |
|    total_timesteps    | 9846784     |
| train/                |             |
|    approx_kl          | 0.012118503 |
|    entropy_loss       | -1.87       |
|    explained_variance | 0.984       |
|    learning_rate      | 0.0001      |
|    loss               | 0.00433     |
|    mean_step_reward   | 0.14239

Saved checkpoint: ./runs_smw/checkpoints/PIPE_30.zip
[EVAL] Mean Return: 189.467, Best Return: 189.767
Saved video to ./runs_smw/videos/PIPE/PIPE_30_189.47.mp4

=== Round 32 | Learn 327680 steps (Total trained: 10158080) ===
Logging to ./runs_smw/tb/PIPE_0
---------------------------------
| time/              |          |
|    fps             | 1119     |
|    iterations      | 1        |
|    time_elapsed    | 7        |
|    total_timesteps | 10166272 |
---------------------------------
---------------------------------------
| time/                 |             |
|    fps                | 929         |
|    iterations         | 2           |
|    time_elapsed       | 17          |
|    total_timesteps    | 10174464    |
| train/                |             |
|    approx_kl          | 0.012197516 |
|    entropy_loss       | -1.78       |
|    explained_variance | 0.991       |
|    learning_rate      | 0.0001      |
|    loss               | -0.0172     |
|    mean_step_reward   |

Saved checkpoint: ./runs_smw/checkpoints/PIPE_31.zip
[EVAL] Mean Return: 190.587, Best Return: 190.887
Saved video to ./runs_smw/videos/PIPE/PIPE_31_190.59.mp4

=== Round 33 | Learn 327680 steps (Total trained: 10485760) ===
Logging to ./runs_smw/tb/PIPE_0
---------------------------------
| time/              |          |
|    fps             | 1083     |
|    iterations      | 1        |
|    time_elapsed    | 7        |
|    total_timesteps | 10493952 |
---------------------------------
----------------------------------------
| time/                 |              |
|    fps                | 935          |
|    iterations         | 2            |
|    time_elapsed       | 17           |
|    total_timesteps    | 10502144     |
| train/                |              |
|    approx_kl          | 0.0118289385 |
|    entropy_loss       | -1.83        |
|    explained_variance | 0.991        |
|    learning_rate      | 0.0001       |
|    loss               | -0.0093      |
|    mean_ste

Saved checkpoint: ./runs_smw/checkpoints/PIPE_32.zip
[EVAL] Mean Return: 190.989, Best Return: 191.289
Saved video to ./runs_smw/videos/PIPE/PIPE_32_190.99.mp4

=== Round 34 | Learn 327680 steps (Total trained: 10813440) ===
Logging to ./runs_smw/tb/PIPE_0
---------------------------------
| time/              |          |
|    fps             | 1084     |
|    iterations      | 1        |
|    time_elapsed    | 7        |
|    total_timesteps | 10821632 |
---------------------------------
---------------------------------------
| time/                 |             |
|    fps                | 927         |
|    iterations         | 2           |
|    time_elapsed       | 17          |
|    total_timesteps    | 10829824    |
| train/                |             |
|    approx_kl          | 0.012063393 |
|    entropy_loss       | -1.81       |
|    explained_variance | 0.977       |
|    learning_rate      | 0.0001      |
|    loss               | 0.0266      |
|    mean_step_reward   |

Saved checkpoint: ./runs_smw/checkpoints/PIPE_33.zip
[EVAL] Mean Return: 190.782, Best Return: 191.082
Saved video to ./runs_smw/videos/PIPE/PIPE_33_190.78.mp4

=== Round 35 | Learn 327680 steps (Total trained: 11141120) ===
Logging to ./runs_smw/tb/PIPE_0
---------------------------------
| time/              |          |
|    fps             | 1118     |
|    iterations      | 1        |
|    time_elapsed    | 7        |
|    total_timesteps | 11149312 |
---------------------------------
----------------------------------------
| time/                 |              |
|    fps                | 937          |
|    iterations         | 2            |
|    time_elapsed       | 17           |
|    total_timesteps    | 11157504     |
| train/                |              |
|    approx_kl          | 0.0122812735 |
|    entropy_loss       | -1.84        |
|    explained_variance | 0.982        |
|    learning_rate      | 0.0001       |
|    loss               | 0.0841       |
|    mean_ste

Saved checkpoint: ./runs_smw/checkpoints/PIPE_34.zip
[EVAL] Mean Return: 189.250, Best Return: 189.550
Saved video to ./runs_smw/videos/PIPE/PIPE_34_189.25.mp4

=== Round 36 | Learn 327680 steps (Total trained: 11468800) ===
Logging to ./runs_smw/tb/PIPE_0
---------------------------------
| time/              |          |
|    fps             | 1150     |
|    iterations      | 1        |
|    time_elapsed    | 7        |
|    total_timesteps | 11476992 |
---------------------------------
---------------------------------------
| time/                 |             |
|    fps                | 948         |
|    iterations         | 2           |
|    time_elapsed       | 17          |
|    total_timesteps    | 11485184    |
| train/                |             |
|    approx_kl          | 0.013888847 |
|    entropy_loss       | -1.83       |
|    explained_variance | 0.993       |
|    learning_rate      | 0.0001      |
|    loss               | -0.031      |
|    mean_step_reward   |

Saved checkpoint: ./runs_smw/checkpoints/PIPE_35.zip
[EVAL] Mean Return: 191.029, Best Return: 191.329
Saved video to ./runs_smw/videos/PIPE/PIPE_35_191.03.mp4

=== Round 37 | Learn 327680 steps (Total trained: 11796480) ===
Logging to ./runs_smw/tb/PIPE_0
---------------------------------
| time/              |          |
|    fps             | 1064     |
|    iterations      | 1        |
|    time_elapsed    | 7        |
|    total_timesteps | 11804672 |
---------------------------------
----------------------------------------
| time/                 |              |
|    fps                | 902          |
|    iterations         | 2            |
|    time_elapsed       | 18           |
|    total_timesteps    | 11812864     |
| train/                |              |
|    approx_kl          | 0.0140617415 |
|    entropy_loss       | -1.85        |
|    explained_variance | 0.978        |
|    learning_rate      | 0.0001       |
|    loss               | 0.00761      |
|    mean_ste

Saved checkpoint: ./runs_smw/checkpoints/PIPE_36.zip
[EVAL] Mean Return: 191.346, Best Return: 191.646
Saved video to ./runs_smw/videos/PIPE/PIPE_36_191.35.mp4

=== Round 38 | Learn 327680 steps (Total trained: 12124160) ===
Logging to ./runs_smw/tb/PIPE_0
---------------------------------
| time/              |          |
|    fps             | 1117     |
|    iterations      | 1        |
|    time_elapsed    | 7        |
|    total_timesteps | 12132352 |
---------------------------------
---------------------------------------
| time/                 |             |
|    fps                | 944         |
|    iterations         | 2           |
|    time_elapsed       | 17          |
|    total_timesteps    | 12140544    |
| train/                |             |
|    approx_kl          | 0.010707034 |
|    entropy_loss       | -1.84       |
|    explained_variance | 0.952       |
|    learning_rate      | 0.0001      |
|    loss               | 0.0712      |
|    mean_step_reward   |

Saved checkpoint: ./runs_smw/checkpoints/PIPE_37.zip
[EVAL] Mean Return: 191.734, Best Return: 192.034
Saved video to ./runs_smw/videos/PIPE/PIPE_37_191.73.mp4

=== Round 39 | Learn 327680 steps (Total trained: 12451840) ===
Logging to ./runs_smw/tb/PIPE_0
---------------------------------
| time/              |          |
|    fps             | 1110     |
|    iterations      | 1        |
|    time_elapsed    | 7        |
|    total_timesteps | 12460032 |
---------------------------------
---------------------------------------
| time/                 |             |
|    fps                | 914         |
|    iterations         | 2           |
|    time_elapsed       | 17          |
|    total_timesteps    | 12468224    |
| train/                |             |
|    approx_kl          | 0.008797487 |
|    entropy_loss       | -1.84       |
|    explained_variance | 0.983       |
|    learning_rate      | 0.0001      |
|    loss               | 0.0475      |
|    mean_step_reward   |

Saved checkpoint: ./runs_smw/checkpoints/PIPE_38.zip
[EVAL] Mean Return: 191.578, Best Return: 191.878
Saved video to ./runs_smw/videos/PIPE/PIPE_38_191.58.mp4

=== Round 40 | Learn 327680 steps (Total trained: 12779520) ===
Logging to ./runs_smw/tb/PIPE_0
---------------------------------
| time/              |          |
|    fps             | 1124     |
|    iterations      | 1        |
|    time_elapsed    | 7        |
|    total_timesteps | 12787712 |
---------------------------------
---------------------------------------
| time/                 |             |
|    fps                | 944         |
|    iterations         | 2           |
|    time_elapsed       | 17          |
|    total_timesteps    | 12795904    |
| train/                |             |
|    approx_kl          | 0.012831092 |
|    entropy_loss       | -1.79       |
|    explained_variance | 0.991       |
|    learning_rate      | 0.0001      |
|    loss               | -0.0336     |
|    mean_step_reward   |

Saved checkpoint: ./runs_smw/checkpoints/PIPE_39.zip
[EVAL] Mean Return: 191.202, Best Return: 191.502
Saved video to ./runs_smw/videos/PIPE/PIPE_39_191.20.mp4
Training finished. Environment closed.


'\ntensorboard --logdir=./runs_smw/tb\n'

## Display Video

In [7]:
# Optional cell (currently disabled): play the most recently written evaluation
# video inline in the notebook. Uncomment the lines below to use it.
#
# NOTE(review): the training log above reports videos being saved under
# ./runs_smw/videos/PIPE/, i.e. one directory below VIDEO_DIR. The non-recursive
# pattern '*.mp4' below only matches files directly inside VIDEO_DIR, so as
# written this cell would likely print "No videos found yet.". When re-enabling,
# consider glob.glob(os.path.join(VIDEO_DIR, '**', '*.mp4'), recursive=True).
# `display` is presumably the notebook-provided builtin -- confirm it is
# available if this is ever run outside Jupyter.

# from IPython.display import Video
# import glob

# list_of_files = glob.glob(os.path.join(VIDEO_DIR, '*.mp4')) 
# if list_of_files:
#     latest_file = max(list_of_files, key=os.path.getctime)
#     print(f"Playing: {latest_file}")
#     display(Video(latest_file, embed=True, width=600))
# else:
#     print("No videos found yet.")