# player

> This module trains, evaluates, and optimizes RL agents on the electricity market environment.

In [None]:
# | default_exp player

In [None]:
# | hide
from nbdev.showdoc import *

In [None]:
# | export
import pickle
from abc import ABC
from dataclasses import dataclass
from pathlib import Path

import numpy as np
import optuna
import torch
import yaml
from sb3_contrib import MaskablePPO
from sb3_contrib.common.maskable.policies import MaskableActorCriticPolicy
from sb3_contrib.common.wrappers import ActionMasker
from stable_baselines3 import A2C
from stable_baselines3.common.callbacks import CheckpointCallback
from stable_baselines3.common.monitor import Monitor
from tqdm.notebook import tqdm

from electricity_market.env import ElectricityMarketEnv, EnvConfig
from electricity_market.utils import EvaluationData, TrainingData

In [None]:
# | export
# Number of episodes each agent runs for every seed in SEEDS.
N_EPISODES = 3
# Number of Optuna trials used by MaskablePPOAgent.optimize (passed as n_trials).
# NOTE: the identifier is spelled "TRAILS"; it is kept as-is because other code refers to it.
N_TRAILS = 10
# Seeds handed to env.reset(seed=...) by the training/evaluation loops.
SEEDS = [
    111111,
    121212,
    123456,
    200000,
    217890,
    222222,
    224775,
    234567,
    253084,
    285234,
    312135,
    314831,
    333333,
    345678,
    406339,
    444444,
    471678,
    555555,
    562845,
    666666,
    701753,
    755460,
    761386,
    777777,
    789391,
    888888,
    993068,
    979797,
    987654,
    999999,
]
# Default environment configuration shared by the agents created in this notebook.
ENV_CONFIG = EnvConfig()

# When True, the configuration and seed list above are replaced by a tiny setup
# (10 timesteps, a single seed) so the notebook runs quickly.
QUICK_MODE = True

if QUICK_MODE:
    ENV_CONFIG = EnvConfig(max_timestep=10)
    SEEDS = [10000]

# Results collected by the notebook cells below, keyed by agent name.
training_data_per_agent = {}
evaluation_data_per_agent = {}

In [None]:
# | export


@dataclass
class TrainingData:
    """Per-episode record of a training run.

    NOTE: this definition shadows the ``TrainingData`` imported from
    ``electricity_market.utils`` at the top of the module.
    """

    # Step counter recorded after each episode (ModelAgent.train stores the running total).
    steps: list[int]
    # Episode indices, one per entry in ``rewards``.
    episodes: list[int]
    # Summed reward of each episode.
    rewards: list[float]


@dataclass
class EvaluationData:
    """Per-episode record of an evaluation run.

    NOTE: this definition shadows the ``EvaluationData`` imported from
    ``electricity_market.utils`` at the top of the module.
    """

    # Episode indices, one per entry in ``rewards``.
    episodes: list[int]
    # Summed reward of each episode.
    rewards: list[float]

In [None]:
# | export


class Agent(ABC):
    """Base class shared by all agents; it only stores the agent's name."""

    def __init__(self, name):
        # The name is what identifies this agent to the rest of the code.
        self.name = name

    def evaluate(self):
        """
        Run an evaluation and hand back EvaluationData.

        The base class provides no implementation; subclasses override this.
        """
        raise NotImplementedError()


class ModelAgent(Agent):
    """
    Agent that drives an environment with a learned model.

    The model is expected to expose ``predict``, ``learn``, ``save`` and ``load``
    (in this module: ``A2C`` and ``MaskablePPO``); the environment is expected to
    follow the five-value ``step`` convention
    ``(obs, reward, terminated, truncated, info)``.
    """

    def __init__(self, name, env, model, device):
        """
        Store the collaborators of the agent.

        Args:
            name: Agent name, forwarded to ``Agent``.
            env: Environment the agent interacts with.
            model: Model used to pick actions and to learn.
            device: Device string the model was created with (e.g. "cpu"/"cuda").
        """
        super().__init__(name)
        self.device = device
        self.model = model
        self.env = env

    def _predict_action(self, obs, deterministic: bool = False):
        """Query the model for one action, adding action masks for maskable agents."""
        # The raw observation is passed straight to ``predict``: the model moves
        # it to its own device. Wrapping it in a device tensor first breaks on
        # CUDA because the model converts its input back to a NumPy array.
        if isinstance(self, MaskableAgent):
            action, _ = self.model.predict(
                obs,
                action_masks=MaskableAgent.mask_fn(self.env),
                deterministic=deterministic,
            )
        else:
            action, _ = self.model.predict(obs, deterministic=deterministic)
        return action

    def train(self) -> TrainingData:
        """
        Train the model, and return TrainingData.

        For every seed in ``SEEDS`` and ``N_EPISODES`` episodes per seed, one
        episode is played with the current model and ``model.learn`` is then
        called with the number of steps just taken as ``total_timesteps``.

        Returns:
            TrainingData with the cumulative step count, the episode index and
            the summed reward of every episode.
        """
        all_rewards = []
        all_steps = []
        total_steps = 0
        checkpoint_callback = CheckpointCallback(save_freq=1000, save_path="./logs/")

        # Training loop
        for seed in tqdm(SEEDS, desc="seeds"):
            for _ in tqdm(range(N_EPISODES), desc="Training episodes"):
                obs, _ = self.env.reset(seed=seed)
                episode_rewards = []
                steps = 0
                done = False

                while not done:
                    action = self._predict_action(obs)
                    obs, reward, terminated, truncated, _ = self.env.step(action)
                    episode_rewards.append(reward)
                    steps += 1
                    # An episode is over on termination OR truncation; looping
                    # on termination alone would keep stepping a finished env
                    # (and keep re-triggering learning) after a truncation.
                    done = terminated or truncated

                self.model.learn(total_timesteps=steps, callback=checkpoint_callback)

                all_rewards.append(np.sum(episode_rewards))
                total_steps += steps
                all_steps.append(total_steps)

        return TrainingData(
            steps=all_steps,
            episodes=list(range(len(all_rewards))),
            rewards=all_rewards,
        )

    def evaluate(self, render: bool = False) -> EvaluationData:
        """
        Evaluate the model, and return EvaluationData.

        Plays ``N_EPISODES`` episodes for every seed in ``SEEDS`` using
        deterministic predictions.

        Args:
            render: When True, ``env.render()`` is called after every step.

        Returns:
            EvaluationData with the episode index and summed reward per episode.
        """
        all_rewards = []

        for seed in tqdm(SEEDS, desc="seeds"):
            for _ in tqdm(range(N_EPISODES), desc="Evaluation episodes"):
                obs, _ = self.env.reset(seed=seed)
                episode_rewards = []
                done = False

                while not done:
                    action = self._predict_action(obs, deterministic=True)
                    obs, reward, terminated, truncated, _ = self.env.step(action)
                    episode_rewards.append(reward)

                    if render:
                        self.env.render()

                    done = terminated or truncated

                all_rewards.append(np.sum(episode_rewards))

        return EvaluationData(
            episodes=list(range(len(all_rewards))),
            rewards=all_rewards,
        )

    def save_model(self, model_path: Path) -> None:
        """Persist the model to ``model_path`` via the model's own ``save``."""
        self.model.save(str(model_path))

    def load_model(self, model_path: Path) -> None:
        """
        Replace ``self.model`` with the model stored at ``model_path``.

        ``load`` on Stable-Baselines3 algorithms is a classmethod that returns a
        new model instead of mutating the receiver, so its result must be kept;
        otherwise the call has no effect on this agent.
        """
        self.model = type(self.model).load(
            str(model_path), env=self.env, device=self.device
        )


class MaskableAgent(Agent):
    @staticmethod
    def mask_fn(env):
        """
        Return the action mask reported by ``env``.

        If ``env`` is a ``Monitor`` wrapper, the mask is taken from the
        environment it wraps (``env.env``); otherwise from ``env`` itself.
        """
        mask_source = env.env if isinstance(env, Monitor) else env
        return mask_source.action_masks()

In [None]:
# | export


class MaskableRandomAgent(MaskableAgent):
    """Baseline agent that picks uniformly among the currently valid actions."""

    def __init__(
        self,
        env_config: EnvConfig | None = None,
        render_mode: str | None = None,
        name: str = "MaskableRandomAgent",
    ):
        """
        Initialize the agent and create the environment.

        Args:
            env_config: Configuration forwarded to ``ElectricityMarketEnv``.
            render_mode: Render mode forwarded to ``ElectricityMarketEnv``.
            name: Agent name.
        """
        super().__init__(name=name)
        self.env = ActionMasker(
            ElectricityMarketEnv(env_config, render_mode=render_mode), self.mask_fn
        )

    def evaluate(self, render: bool = False) -> EvaluationData:
        """
        Evaluates the random agent by executing episodes in the environment.

        Plays ``N_EPISODES`` episodes for every seed in ``SEEDS``; at each step a
        random action is drawn from those allowed by the env's action mask.

        Args:
            render: When True, ``env.render()`` is called after every step.

        Returns:
            EvaluationData with the episode index and summed reward per episode.
        """
        all_rewards = []

        for seed in tqdm(SEEDS, desc="seeds"):
            # One generator per seed keeps the action choices reproducible from
            # run to run while still differing between the episodes of a seed.
            rng = np.random.default_rng(seed)

            for _ in tqdm(range(N_EPISODES), desc="episodes", leave=False):
                self.env.reset(seed=seed)
                done = False
                total_reward = 0.0

                while not done:
                    action_mask = self.env.action_masks()
                    valid_actions = np.where(action_mask)[0]  # Get valid actions
                    action = rng.choice(valid_actions)  # Select random valid action

                    _, reward, terminated, truncated, _ = self.env.step(action)
                    total_reward += reward

                    if render:
                        self.env.render()

                    # The episode ends on termination or truncation.
                    done = terminated or truncated

                all_rewards.append(total_reward)

        return EvaluationData(
            episodes=list(range(len(all_rewards))),
            rewards=all_rewards,
        )

In [None]:
# | export


class A2CAgent(ModelAgent):
    """ModelAgent that wires an A2C model to the Electricity Market Environment."""

    def __init__(
        self,
        env_config: EnvConfig | None = None,
        render_mode: str | None = None,
        name: str = "A2CAgent",
    ):
        """
        Build the monitored environment and the A2C model, then register both.

        Args:
            env_config: Configuration forwarded to ``ElectricityMarketEnv``.
            render_mode: Render mode forwarded to ``ElectricityMarketEnv``.
            name: Agent name; also used for the tensorboard log directory.
        """
        # Use the GPU when torch reports one, otherwise stay on the CPU.
        if torch.cuda.is_available():
            target_device = "cuda"
        else:
            target_device = "cpu"

        market_env = ElectricityMarketEnv(env_config, render_mode=render_mode)
        monitored_env = Monitor(market_env)

        a2c_model = A2C(
            "MlpPolicy",
            monitored_env,
            verbose=0,
            tensorboard_log=f"./{name}_tensorboard/",
            device=target_device,
        )
        super().__init__(
            name=name, env=monitored_env, model=a2c_model, device=target_device
        )

In [None]:
# | export


class MaskablePPOAgent(ModelAgent, MaskableAgent):
    """Maskable PPO Agent for the Electricity Market Environment."""

    def __init__(
        self,
        env_config: EnvConfig | None = None,
        render_mode: str | None = None,
        name: str = "MaskablePPOAgent",
    ):
        """
        Build the masked, monitored environment and a default MaskablePPO model.

        Args:
            env_config: Configuration forwarded to ``ElectricityMarketEnv``;
                ``EnvConfig()`` is stored on the agent when omitted.
            render_mode: Render mode forwarded to ``ElectricityMarketEnv``.
            name: Agent name; also used for the tensorboard log directory and
                the Optuna study name.
        """
        env = Monitor(
            ActionMasker(
                ElectricityMarketEnv(env_config, render_mode=render_mode),
                self.mask_fn,
            )
        )
        device = "cuda" if torch.cuda.is_available() else "cpu"
        model = MaskablePPO(
            MaskableActorCriticPolicy,
            env,
            verbose=0,
            tensorboard_log=f"./{name}_tensorboard/",
            device=device,
        )
        super().__init__(name=name, env=env, model=model, device=device)
        self.optimized_hyperparameters = {}
        self.env_config = env_config or EnvConfig()

    @staticmethod
    def collect_episodes_rewards(model, env) -> list[float]:
        """
        Collect the summed reward of every evaluation episode.

        Plays ``N_EPISODES`` episodes for every seed in ``SEEDS`` with
        deterministic, action-masked predictions.

        Args:
            model: Model whose ``predict`` accepts ``action_masks``.
            env: Environment to evaluate on.

        Returns:
            One summed reward per episode.
        """
        rewards = []
        for seed in tqdm(SEEDS, desc="seeds"):
            for _ in tqdm(range(N_EPISODES), desc="Collect episodes"):
                obs, _ = env.reset(seed=seed)
                episode_return = 0
                done = False

                while not done:
                    action, _ = model.predict(
                        obs, deterministic=True, action_masks=MaskableAgent.mask_fn(env)
                    )
                    obs, reward, terminated, truncated, _ = env.step(action)
                    episode_return += reward
                    # Stop on truncation as well as termination; otherwise an
                    # episode that only ever truncates would never finish.
                    done = terminated or truncated
                rewards.append(episode_return)

        return rewards

    def optimize(self) -> None:
        """
        Search MaskablePPO hyperparameters with Optuna and rebuild the model.

        Runs ``N_TRAILS`` trials that maximize the mean episode reward. The best
        parameters are stored in ``self.optimized_hyperparameters`` and
        ``self.model`` is replaced by a fresh, untrained MaskablePPO created
        with them. Nothing is returned; call ``train`` afterwards to train it.
        """

        def objective(trial):
            # Hyperparameter search space
            learning_rate = trial.suggest_float("learning_rate", 1e-5, 1e-3, log=True)
            n_steps = trial.suggest_int("n_steps", 32, 1024, log=True)
            batch_size = trial.suggest_int("batch_size", 16, 256, log=True)
            gamma = trial.suggest_float("gamma", 0.9, 0.9999)
            gae_lambda = trial.suggest_float("gae_lambda", 0.8, 1.0)
            ent_coef = trial.suggest_float("ent_coef", 0.0, 0.02)
            vf_coef = trial.suggest_float("vf_coef", 0.1, 1.0)
            clip_range = trial.suggest_float("clip_range", 0.1, 0.3)
            max_grad_norm = trial.suggest_float("max_grad_norm", 0.1, 1.0)

            # Each trial gets its own environment and model.
            # NOTE(review): render_mode is hard-coded to "human" here rather
            # than reusing the agent's render mode - confirm this is intended.
            env = Monitor(
                ActionMasker(
                    ElectricityMarketEnv(self.env_config, render_mode="human"),
                    self.mask_fn,
                )
            )

            model = MaskablePPO(
                MaskableActorCriticPolicy,
                env,
                learning_rate=learning_rate,
                n_steps=n_steps,
                batch_size=batch_size,
                gamma=gamma,
                gae_lambda=gae_lambda,
                ent_coef=ent_coef,
                vf_coef=vf_coef,
                clip_range=clip_range,
                max_grad_norm=max_grad_norm,
                verbose=0,
                tensorboard_log=f"./{self.name}_tensorboard/",
                device=self.device,
            )

            model.learn(
                total_timesteps=self.env_config.max_timestep,
                use_masking=True,
                reset_num_timesteps=False,
            )

            # Collect rewards for evaluation
            episode_rewards = self.collect_episodes_rewards(model, env)

            # The mean reward is the value Optuna maximizes.
            return float(np.mean(episode_rewards))

        # Create (or reopen) the persistent study for this agent.
        # NOTE(review): the objective never calls trial.report/should_prune, so
        # the configured pruner presumably has nothing to act on - confirm.
        study = optuna.create_study(
            study_name=self.name,
            storage="sqlite:///optuna_study.db",
            load_if_exists=True,
            direction="maximize",
            pruner=optuna.pruners.HyperbandPruner(),
            sampler=optuna.samplers.TPESampler(),
        )

        # Optimize using the defined objective function
        study.optimize(objective, n_trials=N_TRAILS, n_jobs=-1, show_progress_bar=True)

        # Save the best hyperparameters found by the study.
        self.optimized_hyperparameters = study.best_params

        # Rebuild (not train) the model with the optimized hyperparameters.
        self.model = MaskablePPO(
            MaskableActorCriticPolicy,
            self.env,
            **self.optimized_hyperparameters,
            verbose=0,
            tensorboard_log=f"./{self.name}_tensorboard/",
            device=self.device,
        )

    def export_hyperparameters(self, filename: str):
        """
        Export the optimized hyperparameters to a YAML file.

        Args:
            filename: Path of the YAML file to write (overwritten if present).
        """
        with open(filename, "w", encoding="utf-8") as file:
            yaml.dump(self.optimized_hyperparameters, file)

### Evaluation MaskableRandom on ElectricityMarketEnv


In [None]:
# | hide
# Random baseline: nothing to train, so the agent is only evaluated.
maskable_random_agent = MaskableRandomAgent(render_mode="human", env_config=ENV_CONFIG)

# Store the evaluation result under the agent's name.
evaluation_data_per_agent[maskable_random_agent.name] = maskable_random_agent.evaluate()

seeds:   0%|          | 0/30 [00:00<?, ?it/s]

episodes:   0%|          | 0/3 [00:00<?, ?it/s]

episodes:   0%|          | 0/3 [00:00<?, ?it/s]

episodes:   0%|          | 0/3 [00:00<?, ?it/s]

episodes:   0%|          | 0/3 [00:00<?, ?it/s]

episodes:   0%|          | 0/3 [00:00<?, ?it/s]

episodes:   0%|          | 0/3 [00:00<?, ?it/s]

episodes:   0%|          | 0/3 [00:00<?, ?it/s]

episodes:   0%|          | 0/3 [00:00<?, ?it/s]

episodes:   0%|          | 0/3 [00:00<?, ?it/s]

episodes:   0%|          | 0/3 [00:00<?, ?it/s]

episodes:   0%|          | 0/3 [00:00<?, ?it/s]

episodes:   0%|          | 0/3 [00:00<?, ?it/s]

episodes:   0%|          | 0/3 [00:00<?, ?it/s]

episodes:   0%|          | 0/3 [00:00<?, ?it/s]

episodes:   0%|          | 0/3 [00:00<?, ?it/s]

episodes:   0%|          | 0/3 [00:00<?, ?it/s]

episodes:   0%|          | 0/3 [00:00<?, ?it/s]

episodes:   0%|          | 0/3 [00:00<?, ?it/s]

episodes:   0%|          | 0/3 [00:00<?, ?it/s]

episodes:   0%|          | 0/3 [00:00<?, ?it/s]

episodes:   0%|          | 0/3 [00:00<?, ?it/s]

episodes:   0%|          | 0/3 [00:00<?, ?it/s]

episodes:   0%|          | 0/3 [00:00<?, ?it/s]

episodes:   0%|          | 0/3 [00:00<?, ?it/s]

episodes:   0%|          | 0/3 [00:00<?, ?it/s]

episodes:   0%|          | 0/3 [00:00<?, ?it/s]

episodes:   0%|          | 0/3 [00:00<?, ?it/s]

episodes:   0%|          | 0/3 [00:00<?, ?it/s]

episodes:   0%|          | 0/3 [00:00<?, ?it/s]

episodes:   0%|          | 0/3 [00:00<?, ?it/s]

### Evaluation A2C on ElectricityMarketEnv


In [None]:
# | hide
# A2C agent: train, save the model, then evaluate.
a2c_agent = A2CAgent(render_mode="human", env_config=ENV_CONFIG)

training_data_per_agent[a2c_agent.name] = a2c_agent.train()

# The path is given as a plain string; save_model converts it with str() anyway.
a2c_agent.save_model(f"{a2c_agent.name}.model")

evaluation_data_per_agent[a2c_agent.name] = a2c_agent.evaluate()

seeds:   0%|          | 0/30 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

seeds:   0%|          | 0/30 [00:00<?, ?it/s]

Evaluation episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Evaluation episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Evaluation episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Evaluation episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Evaluation episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Evaluation episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Evaluation episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Evaluation episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Evaluation episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Evaluation episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Evaluation episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Evaluation episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Evaluation episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Evaluation episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Evaluation episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Evaluation episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Evaluation episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Evaluation episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Evaluation episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Evaluation episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Evaluation episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Evaluation episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Evaluation episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Evaluation episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Evaluation episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Evaluation episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Evaluation episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Evaluation episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Evaluation episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Evaluation episodes:   0%|          | 0/3 [00:00<?, ?it/s]

### Evaluation MaskablePPO with default hyperparameters on ElectricityMarketEnv

In [None]:
# | hide
# MaskablePPO with its default hyperparameters: train, save the model, then evaluate.
maskable_ppo_agent = MaskablePPOAgent(render_mode="human", env_config=ENV_CONFIG)

training_data_per_agent[maskable_ppo_agent.name] = maskable_ppo_agent.train()

# The path is given as a plain string; save_model converts it with str() anyway.
maskable_ppo_agent.save_model(f"{maskable_ppo_agent.name}.model")

evaluation_data_per_agent[maskable_ppo_agent.name] = maskable_ppo_agent.evaluate()

seeds:   0%|          | 0/30 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

seeds:   0%|          | 0/30 [00:00<?, ?it/s]

Evaluation episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Evaluation episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Evaluation episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Evaluation episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Evaluation episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Evaluation episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Evaluation episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Evaluation episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Evaluation episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Evaluation episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Evaluation episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Evaluation episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Evaluation episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Evaluation episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Evaluation episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Evaluation episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Evaluation episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Evaluation episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Evaluation episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Evaluation episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Evaluation episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Evaluation episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Evaluation episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Evaluation episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Evaluation episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Evaluation episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Evaluation episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Evaluation episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Evaluation episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Evaluation episodes:   0%|          | 0/3 [00:00<?, ?it/s]

### Evaluation MaskablePPO with optimized hyperparameters on ElectricityMarketEnv

In [None]:
# | hide
# MaskablePPO with Optuna-tuned hyperparameters. The distinct name keeps its
# results, tensorboard logs and Optuna study separate from the default agent.
optimized_maskable_ppo_agent = MaskablePPOAgent(
    render_mode="human", env_config=ENV_CONFIG, name="OptimizedMaskablePPOAgent"
)
# Runs the hyperparameter search and replaces the agent's model with one built
# from the best parameters found.
optimized_maskable_ppo_agent.optimize()

# Write the best hyperparameters to "<agent name>.yaml".
optimized_maskable_ppo_agent.export_hyperparameters(
    f"{optimized_maskable_ppo_agent.name}.yaml"
)


# Train the rebuilt model, save it, then evaluate.
training_data_per_agent[optimized_maskable_ppo_agent.name] = (
    optimized_maskable_ppo_agent.train()
)

optimized_maskable_ppo_agent.save_model(f"{optimized_maskable_ppo_agent.name}.model")

evaluation_data_per_agent[optimized_maskable_ppo_agent.name] = (
    optimized_maskable_ppo_agent.evaluate()
)

[I 2025-02-25 12:38:55,670] A new study created in RDB with name: OptimizedMaskablePPOAgent


  0%|          | 0/10 [00:00<?, ?it/s]

seeds:   0%|          | 0/30 [00:00<?, ?it/s]

Collect episodes:   0%|          | 0/3 [00:00<?, ?it/s]

seeds:   0%|          | 0/30 [00:00<?, ?it/s]

Collect episodes:   0%|          | 0/3 [00:00<?, ?it/s]

seeds:   0%|          | 0/30 [00:00<?, ?it/s]

Collect episodes:   0%|          | 0/3 [00:00<?, ?it/s]

seeds:   0%|          | 0/30 [00:00<?, ?it/s]

Collect episodes:   0%|          | 0/3 [00:00<?, ?it/s]

seeds:   0%|          | 0/30 [00:00<?, ?it/s]

Collect episodes:   0%|          | 0/3 [00:00<?, ?it/s]

seeds:   0%|          | 0/30 [00:00<?, ?it/s]

Collect episodes:   0%|          | 0/3 [00:00<?, ?it/s]

seeds:   0%|          | 0/30 [00:00<?, ?it/s]

Collect episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Collect episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Collect episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Collect episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Collect episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Collect episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Collect episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Collect episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Collect episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Collect episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Collect episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Collect episodes:   0%|          | 0/3 [00:00<?, ?it/s]

seeds:   0%|          | 0/30 [00:00<?, ?it/s]

Collect episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Collect episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Collect episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Collect episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Collect episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Collect episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Collect episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Collect episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Collect episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Collect episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Collect episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Collect episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Collect episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Collect episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Collect episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Collect episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Collect episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Collect episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Collect episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Collect episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Collect episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Collect episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Collect episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Collect episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Collect episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Collect episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Collect episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Collect episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Collect episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Collect episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Collect episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Collect episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Collect episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Collect episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Collect episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Collect episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Collect episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Collect episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Collect episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Collect episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Collect episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Collect episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Collect episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Collect episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Collect episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Collect episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Collect episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Collect episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Collect episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Collect episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Collect episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Collect episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Collect episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Collect episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Collect episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Collect episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Collect episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Collect episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Collect episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Collect episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Collect episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Collect episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Collect episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Collect episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Collect episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Collect episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Collect episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Collect episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Collect episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Collect episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Collect episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Collect episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Collect episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Collect episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Collect episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Collect episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Collect episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Collect episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Collect episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Collect episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Collect episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Collect episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Collect episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Collect episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Collect episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Collect episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Collect episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Collect episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Collect episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Collect episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Collect episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Collect episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Collect episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Collect episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Collect episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Collect episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Collect episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Collect episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Collect episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Collect episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Collect episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Collect episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Collect episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Collect episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Collect episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Collect episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Collect episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Collect episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Collect episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Collect episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Collect episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Collect episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Collect episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Collect episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Collect episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Collect episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Collect episodes:   0%|          | 0/3 [00:00<?, ?it/s]

[I 2025-02-25 12:52:33,302] Trial 6 finished with value: -1111.838161590745 and parameters: {'learning_rate': 4.216434487133055e-05, 'n_steps': 558, 'batch_size': 88, 'gamma': 0.9314515310549413, 'gae_lambda': 0.9722758806015634, 'ent_coef': 0.019206341253203866, 'vf_coef': 0.6328293085991877, 'clip_range': 0.19097023946789943, 'max_grad_norm': 0.7241013016372778}. Best is trial 6 with value: -1111.838161590745.


Collect episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Collect episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Collect episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Collect episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Collect episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Collect episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Collect episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Collect episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Collect episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Collect episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Collect episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Collect episodes:   0%|          | 0/3 [00:00<?, ?it/s]

[I 2025-02-25 12:54:05,945] Trial 5 finished with value: -1076.7420001980922 and parameters: {'learning_rate': 2.404115796240019e-05, 'n_steps': 834, 'batch_size': 80, 'gamma': 0.9139661876538311, 'gae_lambda': 0.8776957811475, 'ent_coef': 0.007404749837599179, 'vf_coef': 0.4957318995045149, 'clip_range': 0.25375863285614053, 'max_grad_norm': 0.207700126718677}. Best is trial 5 with value: -1076.7420001980922.


Collect episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Collect episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Collect episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Collect episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Collect episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Collect episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Collect episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Collect episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Collect episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Collect episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Collect episodes:   0%|          | 0/3 [00:00<?, ?it/s]

seeds:   0%|          | 0/30 [00:00<?, ?it/s]

Collect episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Collect episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Collect episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Collect episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Collect episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Collect episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Collect episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Collect episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Collect episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Collect episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Collect episodes:   0%|          | 0/3 [00:00<?, ?it/s]

seeds:   0%|          | 0/30 [00:00<?, ?it/s]

Collect episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Collect episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Collect episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Collect episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Collect episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Collect episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Collect episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Collect episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Collect episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Collect episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Collect episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Collect episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Collect episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Collect episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Collect episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Collect episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Collect episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Collect episodes:   0%|          | 0/3 [00:00<?, ?it/s]

[I 2025-02-25 12:58:47,976] Trial 3 finished with value: -896.6917305231227 and parameters: {'learning_rate': 0.00012221158737106634, 'n_steps': 715, 'batch_size': 78, 'gamma': 0.9478286685170438, 'gae_lambda': 0.9617971780239106, 'ent_coef': 0.013724239519701626, 'vf_coef': 0.6994882453071006, 'clip_range': 0.22677020332287795, 'max_grad_norm': 0.9895480134866416}. Best is trial 3 with value: -896.6917305231227.
[I 2025-02-25 12:58:52,028] Trial 2 finished with value: -865.0008481581021 and parameters: {'learning_rate': 5.133676441334798e-05, 'n_steps': 89, 'batch_size': 109, 'gamma': 0.9045368648800091, 'gae_lambda': 0.8650784066479856, 'ent_coef': 0.014126543187027843, 'vf_coef': 0.888006276779061, 'clip_range': 0.1348818286388682, 'max_grad_norm': 0.3983007739518821}. Best is trial 2 with value: -865.0008481581021.


Collect episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Collect episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Collect episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Collect episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Collect episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Collect episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Collect episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Collect episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Collect episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Collect episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Collect episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Collect episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Collect episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Collect episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Collect episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Collect episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Collect episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Collect episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Collect episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Collect episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Collect episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Collect episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Collect episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Collect episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Collect episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Collect episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Collect episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Collect episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Collect episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Collect episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Collect episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Collect episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Collect episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Collect episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Collect episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Collect episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Collect episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Collect episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Collect episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Collect episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Collect episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Collect episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Collect episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Collect episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Collect episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Collect episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Collect episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Collect episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Collect episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Collect episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Collect episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Collect episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Collect episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Collect episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Collect episodes:   0%|          | 0/3 [00:00<?, ?it/s]

[I 2025-02-25 13:04:29,656] Trial 7 finished with value: -545.5181878422944 and parameters: {'learning_rate': 0.00014115388380201517, 'n_steps': 65, 'batch_size': 193, 'gamma': 0.9337988932454525, 'gae_lambda': 0.845236590702226, 'ent_coef': 0.016435940113877386, 'vf_coef': 0.6260372545630986, 'clip_range': 0.23558248052570355, 'max_grad_norm': 0.856221821497002}. Best is trial 7 with value: -545.5181878422944.


Collect episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Collect episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Collect episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Collect episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Collect episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Collect episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Collect episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Collect episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Collect episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Collect episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Collect episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Collect episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Collect episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Collect episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Collect episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Collect episodes:   0%|          | 0/3 [00:00<?, ?it/s]

[I 2025-02-25 13:06:03,485] Trial 8 finished with value: -979.2744558901655 and parameters: {'learning_rate': 9.441093292304602e-05, 'n_steps': 105, 'batch_size': 187, 'gamma': 0.9539448784274425, 'gae_lambda': 0.8462971707969031, 'ent_coef': 0.004339837854222888, 'vf_coef': 0.6208686103702934, 'clip_range': 0.13890905400273074, 'max_grad_norm': 0.4785159951889255}. Best is trial 7 with value: -545.5181878422944.


Collect episodes:   0%|          | 0/3 [00:00<?, ?it/s]

[I 2025-02-25 13:06:10,180] Trial 4 finished with value: -401.07181326555235 and parameters: {'learning_rate': 0.0009734665649384904, 'n_steps': 153, 'batch_size': 191, 'gamma': 0.9387791178876315, 'gae_lambda': 0.939375832380646, 'ent_coef': 0.010904404445921992, 'vf_coef': 0.49016665675959226, 'clip_range': 0.19598911025462018, 'max_grad_norm': 0.8570647419226998}. Best is trial 4 with value: -401.07181326555235.


Collect episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Collect episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Collect episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Collect episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Collect episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Collect episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Collect episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Collect episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Collect episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Collect episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Collect episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Collect episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Collect episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Collect episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Collect episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Collect episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Collect episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Collect episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Collect episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Collect episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Collect episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Collect episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Collect episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Collect episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Collect episodes:   0%|          | 0/3 [00:00<?, ?it/s]

[I 2025-02-25 13:09:14,138] Trial 9 finished with value: -594.4433871183192 and parameters: {'learning_rate': 0.0004371208131380942, 'n_steps': 392, 'batch_size': 131, 'gamma': 0.9466215904470251, 'gae_lambda': 0.9970768249677274, 'ent_coef': 0.004924503205376562, 'vf_coef': 0.7228074805646898, 'clip_range': 0.2194810247375943, 'max_grad_norm': 0.6068036454851875}. Best is trial 4 with value: -401.07181326555235.


Collect episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Collect episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Collect episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Collect episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Collect episodes:   0%|          | 0/3 [00:00<?, ?it/s]

[I 2025-02-25 13:10:03,195] Trial 1 finished with value: 149.91076317400518 and parameters: {'learning_rate': 0.0007433352616407535, 'n_steps': 942, 'batch_size': 115, 'gamma': 0.9293743812835875, 'gae_lambda': 0.8077916237515688, 'ent_coef': 0.0015431815538238048, 'vf_coef': 0.9169650454157497, 'clip_range': 0.18174976443729085, 'max_grad_norm': 0.503883935779693}. Best is trial 1 with value: 149.91076317400518.


Collect episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Collect episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Collect episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Collect episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Collect episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Collect episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Collect episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Collect episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Collect episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Collect episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Collect episodes:   0%|          | 0/3 [00:00<?, ?it/s]

[I 2025-02-25 13:11:42,711] Trial 0 finished with value: 1081.5583625907677 and parameters: {'learning_rate': 0.00047500421534876563, 'n_steps': 39, 'batch_size': 243, 'gamma': 0.9045992835885459, 'gae_lambda': 0.8142285666986598, 'ent_coef': 0.010356658628031389, 'vf_coef': 0.1017093795613027, 'clip_range': 0.2017832310530917, 'max_grad_norm': 0.28056068887681723}. Best is trial 0 with value: 1081.5583625907677.


seeds:   0%|          | 0/30 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

seeds:   0%|          | 0/30 [00:00<?, ?it/s]

Evaluation episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Evaluation episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Evaluation episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Evaluation episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Evaluation episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Evaluation episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Evaluation episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Evaluation episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Evaluation episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Evaluation episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Evaluation episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Evaluation episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Evaluation episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Evaluation episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Evaluation episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Evaluation episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Evaluation episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Evaluation episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Evaluation episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Evaluation episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Evaluation episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Evaluation episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Evaluation episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Evaluation episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Evaluation episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Evaluation episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Evaluation episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Evaluation episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Evaluation episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Evaluation episodes:   0%|          | 0/3 [00:00<?, ?it/s]

### Adding expert knowledge to the masking function to make learning more efficient

In [None]:
# | export


def is_action_safe(self, action: int) -> bool:
    """Return whether ``action`` keeps the battery strictly inside its safe range.

    The state of charge the battery would reach is the current state of
    charge plus ``self._charge_amount(action)``. The action counts as safe
    only when that value lies strictly between the two bounds unpacked from
    ``self._battery_safe_range``; landing exactly on either bound is rejected.
    """
    delta = self._charge_amount(action)
    projected_state_of_charge = self._current_state_of_charge + delta
    lower_bound, upper_bound = self._battery_safe_range
    return lower_bound < projected_state_of_charge < upper_bound


def expert_knowledge_action_masks(self) -> np.ndarray:
    """Build the boolean action mask that also applies the safety rule.

    One entry is produced per action index in ``range(self.action_space.n)``.
    An entry is truthy only when ``self._is_action_valid(action)`` and
    ``self.is_action_safe(action)`` both are; the safety check is skipped
    for actions that are already invalid.

    Returns:
        A 1-D ``bool`` array. If no action passes both checks, the entry at
        index ``len(mask) // 2`` is switched on so the mask is never all
        ``False``.
    """
    flags = []
    for action in range(self.action_space.n):
        allowed = self._is_action_valid(action)
        if allowed:
            # Only consult the safety rule for actions that are valid at all.
            allowed = self.is_action_safe(action)
        flags.append(allowed)
    mask = np.array(flags, dtype=bool)

    if not mask.any():
        # Nothing is allowed: force the middle action on.
        # NOTE(review): presumably the middle index is the "do nothing"
        # action — confirm against the environment's action encoding.
        mask[len(mask) // 2] = True
    return mask

In [None]:
# | hide
# Patch the expert-knowledge helpers onto ElectricityMarketEnv at class level.
# Install the safety predicate that the expert mask calls as a method.
ElectricityMarketEnv.is_action_safe = is_action_safe
# Override the environment's action_masks with the expert-knowledge version.
ElectricityMarketEnv.action_masks = expert_knowledge_action_masks



### Evaluation of MaskableRandomAgent with Expert Knowledge on ElectricityMarketEnv

In [None]:
# | hide
# Evaluate a MaskableRandomAgent and record the result under the agent's name.
# NOTE(review): this presumably picks up the expert-knowledge `action_masks`
# patched onto ElectricityMarketEnv earlier in the notebook — confirm the
# agent builds its environment from that class.
expert_maskable_random_agent = MaskableRandomAgent(
    render_mode="human", env_config=ENV_CONFIG, name="ExpertMaskableRandomAgent"
)

# Store whatever evaluate() returns in the shared per-agent results mapping.
evaluation_data_per_agent[expert_maskable_random_agent.name] = (
    expert_maskable_random_agent.evaluate()
)

seeds:   0%|          | 0/30 [00:00<?, ?it/s]

episodes:   0%|          | 0/3 [00:00<?, ?it/s]

episodes:   0%|          | 0/3 [00:00<?, ?it/s]

episodes:   0%|          | 0/3 [00:00<?, ?it/s]

episodes:   0%|          | 0/3 [00:00<?, ?it/s]

episodes:   0%|          | 0/3 [00:00<?, ?it/s]

episodes:   0%|          | 0/3 [00:00<?, ?it/s]

episodes:   0%|          | 0/3 [00:00<?, ?it/s]

episodes:   0%|          | 0/3 [00:00<?, ?it/s]

episodes:   0%|          | 0/3 [00:00<?, ?it/s]

episodes:   0%|          | 0/3 [00:00<?, ?it/s]

episodes:   0%|          | 0/3 [00:00<?, ?it/s]

episodes:   0%|          | 0/3 [00:00<?, ?it/s]

episodes:   0%|          | 0/3 [00:00<?, ?it/s]

episodes:   0%|          | 0/3 [00:00<?, ?it/s]

episodes:   0%|          | 0/3 [00:00<?, ?it/s]

episodes:   0%|          | 0/3 [00:00<?, ?it/s]

episodes:   0%|          | 0/3 [00:00<?, ?it/s]

episodes:   0%|          | 0/3 [00:00<?, ?it/s]

episodes:   0%|          | 0/3 [00:00<?, ?it/s]

episodes:   0%|          | 0/3 [00:00<?, ?it/s]

episodes:   0%|          | 0/3 [00:00<?, ?it/s]

episodes:   0%|          | 0/3 [00:00<?, ?it/s]

episodes:   0%|          | 0/3 [00:00<?, ?it/s]

episodes:   0%|          | 0/3 [00:00<?, ?it/s]

episodes:   0%|          | 0/3 [00:00<?, ?it/s]

episodes:   0%|          | 0/3 [00:00<?, ?it/s]

episodes:   0%|          | 0/3 [00:00<?, ?it/s]

episodes:   0%|          | 0/3 [00:00<?, ?it/s]

episodes:   0%|          | 0/3 [00:00<?, ?it/s]

episodes:   0%|          | 0/3 [00:00<?, ?it/s]

### Evaluation of MaskablePPO with default hyperparameters and Expert Knowledge on ElectricityMarketEnv

In [None]:
# | hide
# Train, save, and evaluate a MaskablePPOAgent without calling optimize()
# first (the section heading describes this as the default-hyperparameter run).
expert_maskable_ppo_agent = MaskablePPOAgent(
    render_mode="human", env_config=ENV_CONFIG, name="ExpertMaskablePPOAgent"
)

# Record whatever train() returns under the agent's name.
training_data_per_agent[expert_maskable_ppo_agent.name] = (
    expert_maskable_ppo_agent.train()
)

# Runs after training; passes "<agent name>.model" to save_model.
expert_maskable_ppo_agent.save_model(f"{expert_maskable_ppo_agent.name}.model")

# Record whatever evaluate() returns under the same name.
evaluation_data_per_agent[expert_maskable_ppo_agent.name] = (
    expert_maskable_ppo_agent.evaluate()
)

seeds:   0%|          | 0/30 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

seeds:   0%|          | 0/30 [00:00<?, ?it/s]

Evaluation episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Evaluation episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Evaluation episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Evaluation episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Evaluation episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Evaluation episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Evaluation episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Evaluation episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Evaluation episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Evaluation episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Evaluation episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Evaluation episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Evaluation episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Evaluation episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Evaluation episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Evaluation episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Evaluation episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Evaluation episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Evaluation episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Evaluation episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Evaluation episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Evaluation episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Evaluation episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Evaluation episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Evaluation episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Evaluation episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Evaluation episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Evaluation episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Evaluation episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Evaluation episodes:   0%|          | 0/3 [00:00<?, ?it/s]

### Evaluation of MaskablePPO with optimized hyperparameters and Expert Knowledge on ElectricityMarketEnv


In [None]:
# | hide
# Optimize, export, train, save, and evaluate a MaskablePPOAgent, in that order.
optimized_expert_maskable_ppo_agent = MaskablePPOAgent(
    render_mode="human", env_config=ENV_CONFIG, name="OptimizedExpertMaskablePPOAgent"
)
# NOTE(review): optimize() presumably runs the Optuna hyperparameter search and
# keeps the best parameters on the agent for the later train() call — confirm
# in MaskablePPOAgent.
optimized_expert_maskable_ppo_agent.optimize()

# Passes "<agent name>.yaml" to export_hyperparameters.
optimized_expert_maskable_ppo_agent.export_hyperparameters(
    f"{optimized_expert_maskable_ppo_agent.name}.yaml"
)

# Record whatever train() returns under the agent's name.
training_data_per_agent[optimized_expert_maskable_ppo_agent.name] = (
    optimized_expert_maskable_ppo_agent.train()
)

# Runs after training; passes "<agent name>.model" to save_model.
optimized_expert_maskable_ppo_agent.save_model(
    f"{optimized_expert_maskable_ppo_agent.name}.model"
)

# Record whatever evaluate() returns under the same name.
evaluation_data_per_agent[optimized_expert_maskable_ppo_agent.name] = (
    optimized_expert_maskable_ppo_agent.evaluate()
)

[I 2025-02-25 14:46:00,405] A new study created in RDB with name: OptimizedExpertMaskablePPOAgent


  0%|          | 0/10 [00:00<?, ?it/s]

seeds:   0%|          | 0/30 [00:00<?, ?it/s]

Collect episodes:   0%|          | 0/3 [00:00<?, ?it/s]

seeds:   0%|          | 0/30 [00:00<?, ?it/s]

Collect episodes:   0%|          | 0/3 [00:00<?, ?it/s]

seeds:   0%|          | 0/30 [00:00<?, ?it/s]

Collect episodes:   0%|          | 0/3 [00:00<?, ?it/s]

seeds:   0%|          | 0/30 [00:00<?, ?it/s]

Collect episodes:   0%|          | 0/3 [00:00<?, ?it/s]

seeds:   0%|          | 0/30 [00:00<?, ?it/s]

Collect episodes:   0%|          | 0/3 [00:00<?, ?it/s]

seeds:   0%|          | 0/30 [00:00<?, ?it/s]

Collect episodes:   0%|          | 0/3 [00:00<?, ?it/s]

seeds:   0%|          | 0/30 [00:00<?, ?it/s]

Collect episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Collect episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Collect episodes:   0%|          | 0/3 [00:00<?, ?it/s]

seeds:   0%|          | 0/30 [00:00<?, ?it/s]

Collect episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Collect episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Collect episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Collect episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Collect episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Collect episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Collect episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Collect episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Collect episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Collect episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Collect episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Collect episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Collect episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Collect episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Collect episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Collect episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Collect episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Collect episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Collect episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Collect episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Collect episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Collect episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Collect episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Collect episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Collect episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Collect episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Collect episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Collect episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Collect episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Collect episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Collect episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Collect episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Collect episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Collect episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Collect episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Collect episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Collect episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Collect episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Collect episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Collect episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Collect episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Collect episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Collect episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Collect episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Collect episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Collect episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Collect episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Collect episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Collect episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Collect episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Collect episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Collect episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Collect episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Collect episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Collect episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Collect episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Collect episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Collect episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Collect episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Collect episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Collect episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Collect episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Collect episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Collect episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Collect episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Collect episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Collect episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Collect episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Collect episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Collect episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Collect episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Collect episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Collect episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Collect episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Collect episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Collect episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Collect episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Collect episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Collect episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Collect episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Collect episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Collect episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Collect episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Collect episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Collect episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Collect episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Collect episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Collect episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Collect episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Collect episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Collect episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Collect episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Collect episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Collect episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Collect episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Collect episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Collect episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Collect episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Collect episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Collect episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Collect episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Collect episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Collect episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Collect episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Collect episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Collect episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Collect episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Collect episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Collect episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Collect episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Collect episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Collect episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Collect episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Collect episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Collect episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Collect episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Collect episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Collect episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Collect episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Collect episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Collect episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Collect episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Collect episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Collect episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Collect episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Collect episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Collect episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Collect episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Collect episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Collect episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Collect episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Collect episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Collect episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Collect episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Collect episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Collect episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Collect episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Collect episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Collect episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Collect episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Collect episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Collect episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Collect episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Collect episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Collect episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Collect episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Collect episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Collect episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Collect episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Collect episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Collect episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Collect episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Collect episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Collect episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Collect episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Collect episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Collect episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Collect episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Collect episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Collect episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Collect episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Collect episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Collect episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Collect episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Collect episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Collect episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Collect episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Collect episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Collect episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Collect episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Collect episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Collect episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Collect episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Collect episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Collect episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Collect episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Collect episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Collect episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Collect episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Collect episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Collect episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Collect episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Collect episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Collect episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Collect episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Collect episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Collect episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Collect episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Collect episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Collect episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Collect episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Collect episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Collect episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Collect episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Collect episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Collect episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Collect episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Collect episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Collect episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Collect episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Collect episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Collect episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Collect episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Collect episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Collect episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Collect episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Collect episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Collect episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Collect episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Collect episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Collect episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Collect episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Collect episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Collect episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Collect episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Collect episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Collect episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Collect episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Collect episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Collect episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Collect episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Collect episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Collect episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Collect episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Collect episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Collect episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Collect episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Collect episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Collect episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Collect episodes:   0%|          | 0/3 [00:00<?, ?it/s]

[I 2025-02-25 17:37:42,208] Trial 1 finished with value: 2167.4861813861403 and parameters: {'learning_rate': 0.0003013782953792595, 'n_steps': 733, 'batch_size': 247, 'gamma': 0.9561809235961739, 'gae_lambda': 0.8521382431828594, 'ent_coef': 0.016112422385531015, 'vf_coef': 0.8820560002986977, 'clip_range': 0.2605218719180772, 'max_grad_norm': 0.2541676920780071}. Best is trial 1 with value: 2167.4861813861403.
[I 2025-02-25 17:38:05,965] Trial 2 finished with value: 2171.3972779789174 and parameters: {'learning_rate': 0.00010411425232915082, 'n_steps': 282, 'batch_size': 160, 'gamma': 0.9109791740437164, 'gae_lambda': 0.940028753772623, 'ent_coef': 0.010151278083469642, 'vf_coef': 0.7574533418881059, 'clip_range': 0.11068308973902517, 'max_grad_norm': 0.7304943615636335}. Best is trial 2 with value: 2171.3972779789174.
[I 2025-02-25 17:38:26,132] Trial 7 finished with value: 2170.9305270996865 and parameters: {'learning_rate': 3.153798632408535e-05, 'n_steps': 230, 'batch_size': 95, 

Collect episodes:   0%|          | 0/3 [00:00<?, ?it/s]

[I 2025-02-25 17:38:38,020] Trial 6 finished with value: 2171.20315513742 and parameters: {'learning_rate': 3.662559264627972e-05, 'n_steps': 74, 'batch_size': 76, 'gamma': 0.9375170539146958, 'gae_lambda': 0.9379372283498484, 'ent_coef': 0.000233058325136164, 'vf_coef': 0.5047900228465663, 'clip_range': 0.10544347135449249, 'max_grad_norm': 0.5988212047146474}. Best is trial 2 with value: 2171.3972779789174.
[I 2025-02-25 17:38:41,838] Trial 5 finished with value: 2169.911584578915 and parameters: {'learning_rate': 0.00019506285158486523, 'n_steps': 54, 'batch_size': 55, 'gamma': 0.9911787769574796, 'gae_lambda': 0.8343857488128679, 'ent_coef': 0.0036270385569936183, 'vf_coef': 0.58679510913267, 'clip_range': 0.28297981567386676, 'max_grad_norm': 0.7909494061523988}. Best is trial 2 with value: 2171.3972779789174.
[I 2025-02-25 17:39:24,382] Trial 0 finished with value: 2175.652307607354 and parameters: {'learning_rate': 0.00042357193050137054, 'n_steps': 747, 'batch_size': 36, 'gamma

seeds:   0%|          | 0/30 [00:00<?, ?it/s]

Collect episodes:   0%|          | 0/3 [00:00<?, ?it/s]

seeds:   0%|          | 0/30 [00:00<?, ?it/s]

Collect episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Collect episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Collect episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Collect episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Collect episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Collect episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Collect episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Collect episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Collect episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Collect episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Collect episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Collect episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Collect episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Collect episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Collect episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Collect episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Collect episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Collect episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Collect episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Collect episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Collect episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Collect episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Collect episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Collect episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Collect episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Collect episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Collect episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Collect episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Collect episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Collect episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Collect episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Collect episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Collect episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Collect episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Collect episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Collect episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Collect episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Collect episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Collect episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Collect episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Collect episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Collect episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Collect episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Collect episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Collect episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Collect episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Collect episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Collect episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Collect episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Collect episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Collect episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Collect episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Collect episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Collect episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Collect episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Collect episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Collect episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Collect episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Collect episodes:   0%|          | 0/3 [00:00<?, ?it/s]

[I 2025-02-25 17:57:10,029] Trial 8 finished with value: 2170.055078716262 and parameters: {'learning_rate': 0.00013185759763542724, 'n_steps': 921, 'batch_size': 54, 'gamma': 0.9068261347929678, 'gae_lambda': 0.8786786724817562, 'ent_coef': 0.00638246751933137, 'vf_coef': 0.13591984816063743, 'clip_range': 0.28033338524185314, 'max_grad_norm': 0.35656920790810454}. Best is trial 0 with value: 2175.652307607354.
[I 2025-02-25 17:57:15,484] Trial 9 finished with value: 2171.342910783646 and parameters: {'learning_rate': 9.804872997228437e-05, 'n_steps': 100, 'batch_size': 34, 'gamma': 0.9601555906509587, 'gae_lambda': 0.9718339416049988, 'ent_coef': 0.011038474437473498, 'vf_coef': 0.8655720118898798, 'clip_range': 0.2783440900293963, 'max_grad_norm': 0.931289650910097}. Best is trial 0 with value: 2175.652307607354.


seeds:   0%|          | 0/30 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

seeds:   0%|          | 0/30 [00:00<?, ?it/s]

Evaluation episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Evaluation episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Evaluation episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Evaluation episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Evaluation episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Evaluation episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Evaluation episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Evaluation episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Evaluation episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Evaluation episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Evaluation episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Evaluation episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Evaluation episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Evaluation episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Evaluation episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Evaluation episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Evaluation episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Evaluation episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Evaluation episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Evaluation episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Evaluation episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Evaluation episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Evaluation episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Evaluation episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Evaluation episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Evaluation episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Evaluation episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Evaluation episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Evaluation episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Evaluation episodes:   0%|          | 0/3 [00:00<?, ?it/s]

In [None]:
# | hide

# Persist both per-agent result dictionaries as pickle files so they can be
# reloaded later without re-running the notebook. Relative paths mean the
# files are written to the current working directory.
for _pkl_name, _pkl_obj in (
    ("training_data_per_agent.pkl", training_data_per_agent),
    ("evaluation_data_per_agent.pkl", evaluation_data_per_agent),
):
    # Binary mode is required by pickle; the context manager closes the
    # handle even if serialization raises.
    with open(_pkl_name, "wb") as fh:
        pickle.dump(_pkl_obj, fh)

In [None]:
# | hide
# Final notebook cell: run nbdev's export step. Per the nbdev docs this writes
# the cells marked for export into the Python module named by the notebook's
# `default_exp` directive (here: `player`).
import nbdev

nbdev.nbdev_export()