In [None]:
import matplotlib.pyplot as plt
import supersuit as ss
from stable_baselines3 import PPO
from stable_baselines3.common.callbacks import CheckpointCallback
from ctf_env import CaptureTheFlagPZ

# --- Training configuration (tune here rather than inside the calls below) ---
FRAME_SIZE = 84             # side length, in pixels, of the resized square observation
FRAME_STACK = 3             # number of consecutive frames stacked into one observation
TOTAL_TIMESTEPS = 200_000   # training budget passed to PPO.learn
CHECKPOINT_EVERY = 50_000   # approximate number of timesteps between intermediate checkpoints
CHECKPOINT_DIR = "./checkpoints/"

# Initialize PettingZoo Env
env = CaptureTheFlagPZ(render_mode="rgb_array")

# Preprocess observations: shrink to FRAME_SIZE x FRAME_SIZE, collapse the colour
# channels to a single grayscale channel, then stack the last FRAME_STACK frames.
env = ss.resize_v1(env, x_size=FRAME_SIZE, y_size=FRAME_SIZE)
env = ss.color_reduction_v0(env, mode='full')
env = ss.frame_stack_v1(env, FRAME_STACK)

# Convert to SB3 Vector Env
# This allows PPO to see "Red" and "Blue" as just two independent samples in a batch
vec_env = ss.pettingzoo_env_to_vec_env_v1(env)
# Wrap the vectorized env in the SB3 VecEnv interface. num_vec_envs=1 keeps a single
# copy of the game; the agents of that one game are what PPO trains on in parallel.
vec_env = ss.concat_vec_envs_v1(vec_env, num_vec_envs=1, num_cpus=0, base_class='stable_baselines3')

print(f"Observation Space: {vec_env.observation_space.shape}")
# Should be (84, 84, 3) -> 84x84 pixels, 3 stacked frames

# Train with PPO
# We use CnnPolicy because we are using images
# Added device="cpu" to bypass the GPU error
model = PPO("CnnPolicy", vec_env, verbose=1, batch_size=256, learning_rate=1e-4, device="cpu")

# Save intermediate checkpoints so an interrupted run does not lose all progress.
# CheckpointCallback counts calls to the vectorized env's step(), and every call
# advances `num_envs` timesteps, so the frequency is divided by the number of envs.
checkpoint_callback = CheckpointCallback(
    save_freq=max(CHECKPOINT_EVERY // vec_env.num_envs, 1),
    save_path=CHECKPOINT_DIR,
    name_prefix="ctf_ppo",
)

print("Starting Training...")
model.learn(total_timesteps=TOTAL_TIMESTEPS, callback=checkpoint_callback)
print("Training Finished!")

# Save the Champion (final weights; loaded again later as the Gen 1 opponent)
model.save("ctf_champion")

Observation Space: (84, 84, 3)
Using cpu device
Wrapping the env in a VecTransposeImage.




Starting Training...




-----------------------------
| time/              |      |
|    fps             | 406  |
|    iterations      | 1    |
|    time_elapsed    | 10   |
|    total_timesteps | 4096 |
-----------------------------
-----------------------------------------
| time/                   |             |
|    fps                  | 230         |
|    iterations           | 2           |
|    time_elapsed         | 35          |
|    total_timesteps      | 8192        |
| train/                  |             |
|    approx_kl            | 0.011069603 |
|    clip_fraction        | 0.0549      |
|    clip_range           | 0.2         |
|    entropy_loss         | -1.09       |
|    explained_variance   | -0.029      |
|    learning_rate        | 0.0001      |
|    loss                 | 0.118       |
|    n_updates            | 10          |
|    policy_gradient_loss | -0.00432    |
|    value_loss           | 0.62        |
-----------------------------------------
----------------------------------

In [None]:
import supersuit as ss
from stable_baselines3 import PPO
from ctf_env import CaptureTheFlagPZ

# 1. Load the "Old" Champion (Gen 1)
#    Loaded on the CPU to avoid device-related errors.
old_champion = PPO.load("ctf_champion", device="cpu")

# 2. Define the Training Environment
#    Each preprocessing stage gets its own name so the pipeline can be inspected
#    step by step; `env` is the fully wrapped result.
ctf_raw = CaptureTheFlagPZ(render_mode="rgb_array")
ctf_resized = ss.resize_v1(ctf_raw, x_size=84, y_size=84)
ctf_gray = ss.color_reduction_v0(ctf_resized, mode='full')
env = ss.frame_stack_v1(ctf_gray, 3)

# 3. The "Gauntlet" Wrapper (not implemented yet)
#    Goal: Blue's actions come from the old model, Red's actions come from the new model.
#    That requires either a custom rollout loop or a wrapper that exposes the game as a
#    single-agent Gym environment in which the opponent is part of the environment dynamics.
#    NOTE(review): the original note suggested a library like 'shimmy' for this conversion;
#    confirm that it actually offers such a wrapper before relying on it.
gauntlet_note = (
    "To train Gen 2 vs Gen 1, we need to convert the environment so 'Blue' "
    "is just a part of the game (like a moving wall)."
)
print(gauntlet_note)

To train Gen 2 vs Gen 1, we need to convert the environment so 'Blue' is just a part of the game (like a moving wall).
