In [None]:
import torch
import matplotlib.pyplot as plt
import supersuit as ss
from stable_baselines3 import PPO
from stable_baselines3.common.callbacks import CheckpointCallback
from stable_baselines3.common.vec_env import VecFrameStack
from ctf_env import CaptureTheFlagPZ

device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using device: {device}")
print(torch.cuda.is_available())
print(torch.version.cuda)

# Initialize PettingZoo Env
env = CaptureTheFlagPZ(render_mode="rgb_array")


# env = ss.resize_v1(env, x_size=84, y_size=84)
# env = ss.color_reduction_v0(env, mode='full')
# env = ss.frame_stack_v1(env, 3)

# Convert to SB3 Vector Env
# This allows PPO to see "Red" and "Blue" as just two independent samples in a batch
vec_env = ss.pettingzoo_env_to_vec_env_v1(env)
# Concatenate them so PPO trains on 2 agents at once
# num_vec_envs=4 to better utilize GPU
vec_env = ss.concat_vec_envs_v1(vec_env, num_vec_envs=4, num_cpus=0, base_class='stable_baselines3')


print(f"Observation Space: {vec_env.observation_space.shape}")
# Should be (84, 84, 3) -> 84x84 pixels, 3 stacked frames

# Train with PPO
# We use CnnPolicy because we are using images
# Added device="cpu" to bypass the GPU error
# Increased batch size to fully utilize GPU memory
# model = PPO(
#     "CnnPolicy", 
#     vec_env, 
#     verbose=1, 
#     batch_size=512, 
#     learning_rate=1e-4, 
#     device=device)

model = PPO(
    "MultiInputPolicy", 
    vec_env, 
    verbose=1, 
    batch_size=4096, 
    learning_rate=1e-4, 
    ent_coef=0.05,    # High exploration
    n_steps=2048,     # More data per update
    device=device
)



print("Starting Training...")
model.learn(total_timesteps=5_000_00)
print("Training Finished!")

# 5. Save the Champion
model.save("ctf_champion")
# 3 million timesteps takes 444 min (8h) to train on a CPU  
# On GPU it takes aound 35 min 


  from pkg_resources import resource_stream, resource_exists


Using device: cuda
True
12.8
Observation Space: None
Using cuda device
Starting Training...
DEBUG: blue_2 PICKED UP FLAG!
DEBUG: red_1 PICKED UP FLAG!
------------------------------
| time/              |       |
|    fps             | 1919  |
|    iterations      | 1     |
|    time_elapsed    | 17    |
|    total_timesteps | 32768 |
------------------------------
DEBUG: blue_1 PICKED UP FLAG!
------------------------------------------
| time/                   |              |
|    fps                  | 1534         |
|    iterations           | 2            |
|    time_elapsed         | 42           |
|    total_timesteps      | 65536        |
| train/                  |              |
|    approx_kl            | 0.0014995602 |
|    clip_fraction        | 0            |
|    clip_range           | 0.2          |
|    entropy_loss         | -1.1         |
|    explained_variance   | -0.00469     |
|    learning_rate        | 0.0001       |
|    loss                 | 12.1         |


In [2]:
import torch

print("PyTorch version:", torch.__version__)
print("CUDA available:", torch.cuda.is_available())
if torch.cuda.is_available():
    print("CUDA version:", torch.version.cuda)
    print("GPU name:", torch.cuda.get_device_name(torch.cuda.current_device()))

PyTorch version: 2.9.1+cu128
CUDA available: True
CUDA version: 12.8
GPU name: NVIDIA GeForce RTX 5070 Ti


In [2]:
# --- NY CELL: FORTSÄTT TRÄNA (RESUME) ---

# 1. Definiera hur mycket MER vi ska träna
extra_timesteps = 1_500_000 
load_filename = "ctf_champion"      # Filen från förra cellen
save_filename = "ctf_champion_2m"   # Det nya namnet

print(f"Resuming training from {load_filename}...")

# 2. Ladda modellen och koppla den till din existerande vec_env
# Vi behåller samma device och miljöinställningar
try:
    model = PPO.load(load_filename, env=vec_env, device=device)
    
    # Valfritt: Om du vill att de ska utforska lite mindre nu när de kan grunderna,
    # kan du sänka ent_coef här (t.ex. till 0.05), annars behålls 0.1 från originalet.
    # model.ent_coef = 0.05 

    print("Model loaded successfully. Starting training...")
    
    # 3. Träna vidare
    # reset_num_timesteps=False är nyckeln för att fortsätta timeregnskapet
    model.learn(total_timesteps=extra_timesteps, reset_num_timesteps=False)
    
    print("Extra training finished!")
    
    # 4. Spara den uppdaterade versionen
    model.save(save_filename)
    print(f"Model saved as: {save_filename}")

except FileNotFoundError:
    print(f"FEL: Hittade inte filen '{load_filename}.zip'. Kontrollera namnet!")

Resuming training from ctf_champion...
Model loaded successfully. Starting training...
DEBUG: red_2 PICKED UP FLAG!
DEBUG: blue_1 TAGGED red_2!
DEBUG: red_1 PICKED UP FLAG!
DEBUG: blue_2 TAGGED red_1!
DEBUG: red_2 PICKED UP FLAG!
DEBUG: blue_1 PICKED UP FLAG!
-------------------------------
| time/              |        |
|    fps             | 1472   |
|    iterations      | 1      |
|    time_elapsed    | 22     |
|    total_timesteps | 557056 |
-------------------------------
DEBUG: red_1 PICKED UP FLAG!
DEBUG: red_2 PICKED UP FLAG!
------------------------------------------
| time/                   |              |
|    fps                  | 1227         |
|    iterations           | 2            |
|    time_elapsed         | 53           |
|    total_timesteps      | 589824       |
| train/                  |              |
|    approx_kl            | 0.0021548925 |
|    clip_fraction        | 0            |
|    clip_range           | 0.2          |
|    entropy_loss         | 

In [None]:
model = PPO.load("ctf_final_boss", env=vec_env, device=device, ent_coef=0.01, learning_rate=1e-4)

# 2. Kör en sista finslipning (t.ex. 1 miljon steg till)
print("Starting Final Polish Training (Low Exploration)...")
model.learn(total_timesteps=1_000_000, reset_num_timesteps=False)

# 3. Spara den slutgiltiga versionen
model.save("ctf_final_boss")
print("Training Finished! Time to watch the pros.")

Starting Final Polish Training (Low Exploration)...
DEBUG: blue_2 PICKED UP FLAG!
DEBUG: blue_2 PICKED UP FLAG!
DEBUG: red_2 PICKED UP FLAG!
--------------------------------
| time/              |         |
|    fps             | 1769    |
|    iterations      | 1       |
|    time_elapsed    | 18      |
|    total_timesteps | 3080192 |
--------------------------------
DEBUG: red_1 PICKED UP FLAG!
DEBUG: red_2 PICKED UP FLAG!
DEBUG: blue_1 PICKED UP FLAG!
------------------------------------------
| time/                   |              |
|    fps                  | 1341         |
|    iterations           | 2            |
|    time_elapsed         | 48           |
|    total_timesteps      | 3112960      |
| train/                  |              |
|    approx_kl            | 0.0008769829 |
|    clip_fraction        | 0            |
|    clip_range           | 0.2          |
|    entropy_loss         | -1.08        |
|    explained_variance   | 0.0266       |
|    learning_rate      