In [1]:
import os
import shutil
import json
import ray
from ray.rllib.algorithms.ppo import PPOConfig
from ray.rllib.env.wrappers.pettingzoo_env import ParallelPettingZooEnv
from ray.rllib.env import PettingZooEnv
from ray.tune.registry import register_env
from spoiled_broth.rl.game_env import GameEnv
from ray.rllib.algorithms.callbacks import DefaultCallbacks
from collections import defaultdict
from pathlib import Path
import supersuit as ss
import numpy as np
import csv
import sys

import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)

In [2]:
# --- Experiment settings -------------------------------------------------
rl_model = 'PPO'              # algorithm label; written to config.txt
map_nr = 'kitchen'            # map identifier forwarded to the environment
n_iterations = 20             # number of training iterations to run
save_every_n_iterations = 5   # checkpoint period, in iterations

# Two-component reward weight pair for each agent (w<agent>_<component>).
w1_1, w1_2 = 1.0, 0.0
w2_1, w2_2 = 0.7071, 0.7071

# Weight pairs keyed by agent id, in the form handed to the environment.
reward_weights = {"ai_rl_1": (w1_1, w1_2), "ai_rl_2": (w2_1, w2_2)}

In [3]:
# Base directory under which all run artefacts are stored.
brigit = "/mnt/lustre/home/samuloza"
# Per-map output directory. The trailing slash matters: run names are
# appended to this string directly when the training directory is built.
base_save_dir = f"{brigit}/data/samuel_lozano/cooked/saved_models/map_{map_nr}/"
os.makedirs(base_save_dir, exist_ok=True)

In [4]:
def env_creator(config):
    """Build a ``GameEnv`` from a mapping of constructor keyword arguments.

    Args:
        config: Mapping whose items are forwarded to ``GameEnv`` as keyword
            arguments (this notebook passes ``reward_weights`` and ``map_nr``).

    Returns:
        The newly constructed ``GameEnv`` instance.
    """
    env_kwargs = dict(config)
    return GameEnv(**env_kwargs)

#def env_creator_2(config):
#    env = GameEnv(reward_weights=reward_weights, map_nr=map_nr)
#    env = ss.pad_observations_v0(env)
#    env = ss.pad_action_space_v0(env)
#    return PettingZooEnv(env)

In [None]:
# Initialize Ray
# (disabled) Optional custom temp directory for Ray; see the matching
# commented-out `_temp_dir` argument in the ray.init() call below.
#os.environ["RAY_TMPDIR"] = f"/data/samuel_lozano/tmp_ray/"
# Shut down any Ray instance still attached to this kernel so that re-running
# the cell starts from a clean state.
ray.shutdown()
ray.init(ignore_reinit_error=True, #_temp_dir=os.environ["RAY_TMPDIR"]
         )

# Register the environment with RLLib
# The registered factory builds a GameEnv through env_creator(config) and wraps
# it in ParallelPettingZooEnv; "spoiled_broth" is the name used in the PPO config.
register_env("spoiled_broth", lambda config: ParallelPettingZooEnv(env_creator(config)))


In [None]:
# Verify observation space consistency
def verify_observation_space():
    """Check that a freshly reset environment yields an observation inside its declared space.

    A temporary environment is created with the notebook's ``reward_weights``
    and ``map_nr``, reset once, and the observation of agent ``"ai_rl_1"`` is
    compared against that agent's observation space. The space, the sample's
    shape and its min/max are printed for inspection.

    Returns:
        The observation space reported by the environment for ``"ai_rl_1"``.

    Raises:
        AssertionError: If the sampled observation is not contained in the space.
    """
    test_env = env_creator({"reward_weights": reward_weights, "map_nr": map_nr})
    try:
        obs_space = test_env.observation_space("ai_rl_1")
        # reset() is indexed with [0] to get the per-agent observations mapping.
        sample_obs = test_env.reset()[0]["ai_rl_1"]

        print(f"Observation space: {obs_space}")
        print(f"Sample observation shape: {sample_obs.shape}")
        print(f"Sample observation min/max: {np.min(sample_obs)}, {np.max(sample_obs)}")

        assert obs_space.contains(sample_obs), "Observation doesn't match observation space!"
    finally:
        # Always release the temporary environment, including when the check
        # above fails; previously a failed assertion skipped close().
        test_env.close()
    return obs_space

# Get verified observation space
obs_space = verify_observation_space()

In [7]:
# Define separate policies for each agent
def policy_mapping_fn(agent_id, episode=None, worker=None, **kwargs):
    """Return the id of the policy dedicated to ``agent_id``.

    Every agent gets its own policy, named ``policy_<agent_id>``. The
    ``episode``, ``worker`` and extra keyword arguments are accepted but ignored.
    """
    return "policy_{}".format(agent_id)

In [8]:
class PureRewardFromInfoCallback(DefaultCallbacks):
    """Accumulate the ``"pure_reward"`` values found in infos and expose them as custom metrics.

    Per episode, a running total per agent is kept in
    ``episode.user_data["pure_rewards"]`` and copied into
    ``episode.custom_metrics`` when the episode ends.

    NOTE(review): the PPO config in this notebook enables the new API stack
    (RLModule/Learner + EnvRunner/ConnectorV2). Confirm that these hooks and the
    ``episode.user_data`` / ``episode.custom_metrics`` attributes are available
    on that stack for the installed Ray version.
    """

    def on_episode_start(self, *, episode, **kwargs):
        """Start each episode with an empty per-agent accumulator (missing agents default to 0.0)."""
        episode.user_data["pure_rewards"] = defaultdict(float)

    def on_postprocess_trajectory(self, *, episode, agent_id, policy_id, postprocessed_batch, original_batches, **kwargs):
        """Add every ``"pure_reward"`` entry found for ``agent_id`` to that agent's running total."""
        # The entry for this agent is unpacked as exactly two items and only the
        # second is used. NOTE(review): presumably it iterates as info dicts;
        # verify against the structure RLlib passes in `original_batches`.
        _, infos = original_batches[agent_id]
        for info in infos:
            # Items without a "pure_reward" key are skipped.
            if "pure_reward" in info:
                episode.user_data["pure_rewards"][agent_id] += info["pure_reward"]

    def on_episode_end(self, *, episode, **kwargs):
        """Publish the accumulated totals as ``pure_reward_<agent_id>`` custom metrics."""
        for agent_id, pure_reward in episode.user_data["pure_rewards"].items():
            episode.custom_metrics[f"pure_reward_{agent_id}"] = pure_reward

        # NOTE(review): despite the key name, this stores the SUM over agents,
        # not a mean; the training loop prints it as "Total pure reward".
        episode.custom_metrics["pure_reward_mean"] = sum(episode.user_data["pure_rewards"].values())

In [None]:
# Configuration for multi-agent training
# Environment settings shared by the space probe below and by the env runners,
# so the spaces given to the policies come from an identically configured env.
env_config = {
    "reward_weights": reward_weights,
    "map_nr": map_nr,
}


def _policy_spec(env, agent_id):
    """Return the (policy_class, obs_space, action_space, config) tuple for one agent."""
    return (
        None,  # Use default PPO policy
        env.observation_space(agent_id),
        env.action_space(agent_id),
        {},
    )


# Read the per-agent spaces from ONE temporary environment and close it
# afterwards. Previously four separate environments were created just to read
# their spaces and were never closed.
_probe_env = env_creator(env_config)
try:
    policy_specs = {
        "policy_ai_rl_1": _policy_spec(_probe_env, "ai_rl_1"),
        "policy_ai_rl_2": _policy_spec(_probe_env, "ai_rl_2"),
    }
finally:
    _probe_env.close()

config = (
    PPOConfig()
    .api_stack(
        enable_rl_module_and_learner=True,
        enable_env_runner_and_connector_v2=True
    )
    .environment(
        env="spoiled_broth",
        env_config=env_config,
        clip_actions=True,
    )
    .multi_agent(
        # One independent policy per agent; both are trained.
        policies=policy_specs,
        policy_mapping_fn=policy_mapping_fn,
        policies_to_train=["policy_ai_rl_1", "policy_ai_rl_2"]
    )
    #.resources(num_gpus=1)  # Set to 1 if you have a GPU
    .env_runners(num_env_runners=2)
    .training(
        train_batch_size=4000,
        minibatch_size=500,
        num_epochs=10
    )
    .callbacks(PureRewardFromInfoCallback)
)

In [None]:
# Build algorithm with error handling
try:
    # Instantiate the algorithm from the configuration defined above.
    algo = config.build_algo()
    # Directory where Ray writes this run's logs; its last path component is
    # reused as the run identifier (NAME_RAY) for the saved-model folder.
    full_run_dir = algo.logdir
    NAME_RAY = Path(full_run_dir).name
    print(f"Algorithm built successfully: {NAME_RAY}")
except Exception as e:
    # Print a short summary, then re-raise so the cell still fails visibly and
    # later cells do not run against a missing `algo`.
    print(f"Error building algorithm: {e}")
    raise

In [11]:
# Save training config
# Per-run output folder: <base_save_dir>/<NAME_RAY>. Ray's own run directory
# is mirrored into it so logs and saved models live side by side.
training_dir = Path(base_save_dir) / NAME_RAY
training_dir.mkdir(parents=True, exist_ok=True)
shutil.copytree(full_run_dir, training_dir, dirs_exist_ok=True)

def safe_config_dict(d):
    """Return a shallow copy of ``d`` that ``json.dumps`` can always serialize.

    Each top-level value that JSON cannot encode is replaced by ``str(value)``;
    values that already serialize are kept unchanged. Keys of a type JSON does
    not accept are converted with ``str`` as well.

    Args:
        d: Mapping to sanitize (e.g. the dict form of an algorithm config).

    Returns:
        A new ``dict`` that is safe to pass to ``json.dumps``.
    """
    def make_serializable(o):
        try:
            json.dumps(o)
            return o
        except (TypeError, ValueError):
            # TypeError: unsupported type (possibly nested, or a bad nested key).
            # ValueError: e.g. a circular reference inside the value.
            return str(o)

    def make_key(k):
        # json.dumps only accepts str, int, float, bool or None as dict keys.
        if k is None or isinstance(k, (str, int, float, bool)):
            return k
        return str(k)

    return {
        make_key(k): make_serializable(v)
        for k, v in d.items()
    }

config_path = f"{training_dir}/config.txt"

# Human-readable summary of the notebook-level settings; the complete RLlib
# configuration follows it as indented JSON.
header_lines = [
    "==== Training parameters ====\n",
    f"RL model: {rl_model}\n",
    f"Map id: {map_nr}\n",
    f"Number of iterations: {n_iterations}\n",
    f"Saved every N iterations: {save_every_n_iterations}\n",
    f"Reward weights ai_rl_1: {w1_1}, {w1_2}\n",
    f"Reward weights ai_rl_2: {w2_1}, {w2_2}\n",
    "\n==== Complete configuration ====\n",
]

with open(config_path, "w") as f:
    f.writelines(header_lines)
    f.write(json.dumps(safe_config_dict(config.to_dict()), indent=4))

csv_file_path = f'{training_dir}/reward_data.csv'

# Write the header only when the file is new or empty. The file is opened in
# append mode so existing rows survive, but re-running this cell used to add a
# duplicate header row in the middle of the data.
_csv_path = Path(csv_file_path)
if not _csv_path.exists() or _csv_path.stat().st_size == 0:
    with open(csv_file_path, mode='a', newline='') as file:
        writer = csv.writer(file)
        writer.writerow(["iteration", "total_reward", "reward_agent_1", "reward_agent_2", "pure_reward_mean", "pure_reward_agent_1", "pure_reward_agent_2"])

In [None]:
def _lookup_custom_metric(result, *names, default=0.0):
    """Return the first custom metric in ``result`` matching any of ``names``.

    Both the top-level ``custom_metrics`` dict and the one nested under
    ``env_runners`` are searched. Each name is tried with a ``_mean`` suffix
    (assumed to be the aggregated form RLlib reports; confirm for the installed
    version) and then verbatim. ``default`` is returned when nothing matches.
    """
    sources = (
        result.get("custom_metrics") or {},
        (result.get("env_runners") or {}).get("custom_metrics") or {},
    )
    for metrics in sources:
        for name in names:
            for key in (f"{name}_mean", name):
                if key in metrics:
                    return metrics[key]
    return default


for i in range(n_iterations):
    result = algo.train()
    reward_agent_1 = result['env_runners']['module_episode_returns_mean'].get('policy_ai_rl_1', 0)
    reward_agent_2 = result['env_runners']['module_episode_returns_mean'].get('policy_ai_rl_2', 0)
    total_reward = result['env_runners']['episode_return_mean']

    # The callback publishes metrics as "pure_reward_<agent_id>", i.e.
    # "pure_reward_ai_rl_1"/"pure_reward_ai_rl_2". The keys previously read here
    # ("pure_reward_agent_1"/"_2") never matched, so the values were always 0.0.
    # The old names are kept as a fallback.
    pure_reward_agent_1 = _lookup_custom_metric(result, "pure_reward_ai_rl_1", "pure_reward_agent_1")
    pure_reward_agent_2 = _lookup_custom_metric(result, "pure_reward_ai_rl_2", "pure_reward_agent_2")
    pure_reward_mean = _lookup_custom_metric(result, "pure_reward_mean")

    print(f"Iteration {i}:")
    print(f"  Agent 1 reward: {reward_agent_1}")
    print(f"  Agent 2 reward: {reward_agent_2}")
    print(f"  Total reward: {total_reward}")
    print(f"  Pure reward agent 1: {pure_reward_agent_1}")
    print(f"  Pure reward agent 2: {pure_reward_agent_2}")
    print(f"  Total pure reward: {pure_reward_mean}")

    # Append one row per iteration so partial results survive an interrupted run.
    with open(csv_file_path, mode='a', newline='') as file:
        writer = csv.writer(file)
        writer.writerow([i, total_reward, reward_agent_1, reward_agent_2, pure_reward_mean, pure_reward_agent_1, pure_reward_agent_2])

    # Save checkpoint every N iterations (starting at iteration 0, as before)
    # and additionally after the last iteration, so the final model is kept.
    is_last_iteration = i == n_iterations - 1
    if i % save_every_n_iterations == 0 or is_last_iteration:
        checkpoint_obtained = algo.save()
        checkpoint_path = Path(checkpoint_obtained.checkpoint.path)

        custom_checkpoint_dir = Path(f"{training_dir}/Checkpoint_{i}")
        custom_checkpoint_dir.mkdir(parents=True, exist_ok=True)

        # Copy the checkpoint contents into the per-run folder under a
        # predictable name (Checkpoint_<iteration>).
        for item in checkpoint_path.iterdir():
            dst = custom_checkpoint_dir / item.name
            if item.is_dir():
                shutil.copytree(item, dst, dirs_exist_ok=True)
            else:
                shutil.copy2(item, dst)

        print(f"Checkpoint {i} saved in {custom_checkpoint_dir}")

# Reload params

In [29]:
from ray.rllib.algorithms.algorithm import Algorithm
import torch

In [None]:
# Map whose saved models are reloaded; used to build the "map_<...>" folder name.
# NOTE(review): this is the integer 1 while training above uses map_nr = 'kitchen',
# so a different map folder is targeted. Confirm this is intended.
reload_map_nr = 1

# Name of the training-run folder to reload (a Ray run directory name).
training_id = "PPO_spoiled_broth_2025-05-18_22-22-47j6l54y_m"

# Iteration number of the checkpoint to load (folder "Checkpoint_<checkpoint_nr>").
checkpoint_nr = 0

In [None]:
# Restore the algorithm from a checkpoint
# NOTE(review): this path starts at /data/... whereas the training section saves
# under f"{brigit}/data/..."; confirm the location is valid on the machine used.
reload_checkpoint_dir = f"/data/samuel_lozano/cooked/saved_models/map_{reload_map_nr}/{training_id}/Checkpoint_{checkpoint_nr}/"

reload_algo = Algorithm.from_checkpoint(reload_checkpoint_dir)

# Fetch the two per-agent policies by the ids used in the multi-agent config.
# NOTE(review): training enables the new API stack; verify that get_policy()
# returns usable objects for checkpoints produced that way.
policy_module_1 = reload_algo.get_policy("policy_ai_rl_1")
policy_module_2 = reload_algo.get_policy("policy_ai_rl_2")

# Compare weights (if you are using PyTorch)
# NOTE(review): params1/params2 are reassigned in the next cell before being
# used; also confirm get_parameters() exists on the returned objects.
params1 = policy_module_1.get_parameters()
params2 = policy_module_2.get_parameters()

In [None]:
# Named parameters of each policy's torch model, keyed by parameter name.
params1 = dict(policy_module_1.model.named_parameters())
params2 = dict(policy_module_2.model.named_parameters())

# Report, for every parameter name present in both models, whether the two
# tensors are numerically equal. Names found in only one model are skipped.
for name in params1:
    if name in params2:
        p1, p2 = params1[name], params2[name]
        # Tensors of different shape are reported as different up front;
        # torch.allclose on mismatched shapes would raise or broadcast.
        are_equal = p1.shape == p2.shape and torch.allclose(p1, p2)
        print(f"{name}: {'Equal' if are_equal else 'Different'}")

<bound method Deprecated.<locals>._inner.<locals>._ctor of PPO(env=spoiled_broth; env-runners=2; learners=0; multi-agent=True)>


In [None]:
def params_to_vector(params):
    """Concatenate all tensors of a parameter dict into a single 1-D tensor.

    Args:
        params: Mapping from parameter name to value. Values that are not
            ``torch.Tensor`` instances are ignored.

    Returns:
        A 1-D tensor with every tensor value flattened and concatenated in the
        mapping's iteration order. If ``params`` contains no tensors, an empty
        1-D tensor is returned instead of ``torch.cat`` raising on an empty list.
    """
    vectors = [
        tensor.flatten()
        for tensor in params.values()
        if isinstance(tensor, torch.Tensor)  # skip non-tensor entries
    ]
    if not vectors:
        return torch.empty(0)
    return torch.cat(vectors)

# Flatten both parameter sets and measure how far apart they are with the
# norm of their element-wise difference (0.0 means identical weights).
v1, v2 = params_to_vector(params1), params_to_vector(params2)

diff = (v1 - v2).norm().item()
print(f"Diferencia entre parámetros de ambas políticas: {diff}")