In [2]:
import torch
import matplotlib.pyplot as plt
import supersuit as ss
from stable_baselines3 import PPO
from stable_baselines3.common.callbacks import CheckpointCallback
from stable_baselines3.common.vec_env import VecFrameStack
from ctf_env import CaptureTheFlagPZ

device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using device: {device}")
print(torch.cuda.is_available())
print(torch.version.cuda)

# Initialize PettingZoo Env
env = CaptureTheFlagPZ(render_mode="rgb_array")


# env = ss.resize_v1(env, x_size=84, y_size=84)
# env = ss.color_reduction_v0(env, mode='full')
# env = ss.frame_stack_v1(env, 3)

# Convert to SB3 Vector Env
# This allows PPO to see "Red" and "Blue" as just two independent samples in a batch
vec_env = ss.pettingzoo_env_to_vec_env_v1(env)
# Concatenate them so PPO trains on 2 agents at once
# num_vec_envs=4 to better utilize GPU
vec_env = ss.concat_vec_envs_v1(vec_env, num_vec_envs=4, num_cpus=0, base_class='stable_baselines3')


print(f"Observation Space: {vec_env.observation_space.shape}")
# Should be (84, 84, 3) -> 84x84 pixels, 3 stacked frames

# Train with PPO
# We use CnnPolicy because we are using images
# Added device="cpu" to bypass the GPU error
# Increased batch size to fully utilize GPU memory
# model = PPO(
#     "CnnPolicy", 
#     vec_env, 
#     verbose=1, 
#     batch_size=512, 
#     learning_rate=1e-4, 
#     device=device)

model = PPO(
    "MultiInputPolicy", 
    vec_env, 
    verbose=1, 
    batch_size=1024, 
    learning_rate=1e-4, 
    ent_coef=0.01,    # High exploration
    n_steps=2048,     # More data per update
    device=device
)



print("Starting Training...")
model.learn(total_timesteps=3_000_00)
print("Training Finished!")

# 5. Save the Champion
model.save("ctf_champion")
# 3 million timesteps takes 444 min (8h) to train on a CPU  
# On GPU it takes aound 35 min 


Using device: cuda
True
12.8
Observation Space: None
Using cuda device
Starting Training...
------------------------------
| time/              |       |
|    fps             | 1974  |
|    iterations      | 1     |
|    time_elapsed    | 16    |
|    total_timesteps | 32768 |
------------------------------
-----------------------------------------
| time/                   |             |
|    fps                  | 1525        |
|    iterations           | 2           |
|    time_elapsed         | 42          |
|    total_timesteps      | 65536       |
| train/                  |             |
|    approx_kl            | 0.008895783 |
|    clip_fraction        | 0.0149      |
|    clip_range           | 0.2         |
|    entropy_loss         | -1.09       |
|    explained_variance   | -0.0206     |
|    learning_rate        | 0.0001      |
|    loss                 | 0.42        |
|    n_updates            | 10          |
|    policy_gradient_loss | -0.00121    |
|    value_loss    

In [9]:
import torch

print("PyTorch version:", torch.__version__)
print("CUDA available:", torch.cuda.is_available())
if torch.cuda.is_available():
    print("CUDA version:", torch.version.cuda)
    print("GPU name:", torch.cuda.get_device_name(torch.cuda.current_device()))

PyTorch version: 2.9.1+cu128
CUDA available: True
CUDA version: 12.8
GPU name: NVIDIA GeForce RTX 5070 Ti


In [2]:
import supersuit as ss
from stable_baselines3 import PPO
from ctf_env import CaptureTheFlagPZ

#  Load the "Old" Champion (Gen 1)
# We load it on CPU to avoid errors
old_champion = PPO.load("ctf_champion", device="cpu")

# 2. Define the Training Environment
env = CaptureTheFlagPZ(render_mode="rgb_array")
env = ss.resize_v1(env, x_size=84, y_size=84)
env = ss.color_reduction_v0(env, mode='full')
env = ss.frame_stack_v1(env, 3)

# 3. The "Gauntlet" Wrapper
# We need a way to tell the environment: "Blue actions come from the Old Model, Red actions come from the New Model"
# PettingZoo doesn't support this out of the box easily, so we usually just run a custom loop or
# use a library like 'shimmy' to convert it to a Single-Agent Gym environment where the Opponent is part of the environment.

print("To train Gen 2 vs Gen 1, we need to convert the environment so 'Blue' is just a part of the game (like a moving wall).")

To train Gen 2 vs Gen 1, we need to convert the environment so 'Blue' is just a part of the game (like a moving wall).
