# player

> This module handles the training and hyperparameter optimization of RL agents on the electricity market environment.

In [None]:
# | default_exp player

In [None]:
# | hide
from nbdev.showdoc import *

In [None]:
# | export
import pickle
import shutil
from abc import ABC
from pathlib import Path

import numpy as np
import optuna
import torch
import yaml
from sb3_contrib import MaskablePPO
from sb3_contrib.common.maskable.policies import MaskableActorCriticPolicy
from sb3_contrib.common.wrappers import ActionMasker
from stable_baselines3 import A2C
from stable_baselines3.common.callbacks import CheckpointCallback
from stable_baselines3.common.monitor import Monitor
from tqdm.notebook import tqdm

from electricity_market.env import ElectricityMarketEnv, EnvConfig
from electricity_market.utils import EvaluationData

In [None]:
# | export
# Number of `learn` calls `ModelAgent.train` makes for each training seed.
N_TRAIN_EPISODES = 3
# Number of Optuna trials run by `MaskablePPOAgent.optimize` (passed as `n_trials`).
N_TRAILS = 10
# Seeds handed to `env.reset` by `ModelAgent.train`, one outer iteration per seed.
TRAIN_SEEDS = [
    111111,
    121212,
    123456,
    200000,
    217890,
    222222,
    224775,
    234567,
    253084,
    285234,
    312135,
    314831,
    333333,
    345678,
    406339,
    444444,
    471678,
    555555,
    562845,
    666666,
    701753,
    755460,
    761386,
    777777,
    789391,
    888888,
    993068,
    979797,
    987654,
    999999,
]
# Seeds handed to `env.reset` by `Agent.evaluate`; one episode is run per seed.
EVALUATE_SEEDS = [
    117127,
    136901,
    223246,
    243382,
    245720,
    248832,
    288598,
    374487,
    447331,
    447851,
    490428,
    553737,
    557309,
    571504,
    601426,
    632202,
    653634,
    844596,
    848937,
    849735,
    865470,
    866822,
    876563,
    880689,
    887591,
    911016,
    920528,
    963993,
    967995,
    992634,
]
# Environment configuration passed to the agents built in the notebook cells.
ENV_CONFIG = EnvConfig()

# Output directories; both are removed by the cleanup cell when QUICK_MODE is off.
TENSORBOARD_PATH = Path("../tensorboard")
LOGS_PATH = Path("../logs")

# Set QUICK_MODE = True for CI
QUICK_MODE = True

# Quick mode shrinks the run: an EnvConfig with max_timestep=10, a single seed
# each for training and evaluation, and both output-path constants set to None.
if QUICK_MODE:
    ENV_CONFIG = EnvConfig(max_timestep=10)
    TRAIN_SEEDS = [10000]
    EVALUATE_SEEDS = [90000]
    TENSORBOARD_PATH = None
    LOGS_PATH = None

# Maps an agent's name to the value returned by that agent's `evaluate()` call.
evaluation_data_per_agent = {}

In [None]:
# | hide

# Remove output directories left over from an earlier full run. Quick mode
# skips this step (its path constants are None).
if not QUICK_MODE:
    for stale_dir in (TENSORBOARD_PATH, LOGS_PATH):
        shutil.rmtree(stale_dir, ignore_errors=True)

In [None]:
# | export


class Agent(ABC):
    """Base class that pairs a named policy with an environment and a device.

    Subclasses provide `choose_action`; `evaluate` uses it to drive the
    environment.
    """

    def __init__(self, name, env, device):
        self.name, self.device, self.env = name, device, env

    def evaluate(self, render: bool = False) -> EvaluationData:
        """
        Run one episode per entry of `EVALUATE_SEEDS` and collect the returns.

        Every episode starts with `env.reset(seed=...)` and stops as soon as
        the environment reports termination or truncation. When `render` is
        true, `env.render()` is called after each step.

        Returns an `EvaluationData` whose `episodes` are 0-based indices and
        whose `rewards` hold the summed reward of the matching episode.
        """
        episode_returns = []

        for eval_seed in tqdm(EVALUATE_SEEDS, desc="seeds"):
            observation, _ = self.env.reset(seed=eval_seed)
            step_rewards = []

            # At least one step is always taken, then the loop exits on the
            # first terminal signal of either kind.
            while True:
                policy_input = torch.tensor(observation, dtype=torch.float64).to(
                    self.device
                )
                chosen = self.choose_action(policy_input)
                observation, reward, terminated, truncated, _ = self.env.step(chosen)
                step_rewards.append(reward)

                if render:
                    self.env.render()

                if terminated or truncated:
                    break

            episode_returns.append(np.sum(step_rewards))

        episode_ids = list(range(len(episode_returns)))
        return EvaluationData(episodes=episode_ids, rewards=episode_returns)

    def choose_action(self, obs_tensor):
        """Map an observation tensor to an action; subclasses must override."""
        raise NotImplementedError


class ModelAgent(Agent):
    """Agent whose actions come from a trainable model.

    The only methods this class calls on the model are `learn`, `predict`,
    `save` and `load`.
    """

    def __init__(self, name, env, env_config, model, device):
        """
        Store the model and configuration next to the base `Agent` state.

        `env_config` may be None; a default `EnvConfig()` is substituted in that
        case, because `train` reads `env_config.max_timestep`.
        """
        super().__init__(name, device=device, env=env)
        self.model = model
        self.env_config = env_config if env_config is not None else EnvConfig()

    def train(self) -> None:
        """
        Train the model.

        For each seed in `TRAIN_SEEDS`, and `N_TRAIN_EPISODES` times per seed,
        the environment is reset with that seed and `model.learn` is called
        with `total_timesteps=env_config.max_timestep`.
        `reset_num_timesteps=False` is passed so that successive `learn` calls
        continue one run instead of starting a new one.
        """
        # NOTE(review): the checkpoint directory is hard-coded rather than
        # derived from the module-level LOGS_PATH constant - confirm intended.
        checkpoint_callback = CheckpointCallback(save_freq=1000, save_path="../logs/")

        for seed in tqdm(TRAIN_SEEDS, desc="seeds"):
            for _ in tqdm(range(N_TRAIN_EPISODES), desc="Training episodes"):
                # NOTE(review): the model may keep its own last observation
                # between `learn` calls; verify that resetting the environment
                # here has the intended effect on the next rollout.
                self.env.reset(seed=seed)
                self.model.learn(
                    total_timesteps=self.env_config.max_timestep,
                    callback=checkpoint_callback,
                    reset_num_timesteps=False,
                    tb_log_name=self.name,
                )

    def choose_action(self, obs_tensor):
        """Return the model's deterministic prediction for `obs_tensor`."""
        action, _ = self.model.predict(obs_tensor, deterministic=True)
        return action

    def save_model(self, model_path: Path) -> None:
        """Save the current model to `model_path` (converted with `str`)."""
        self.model.save(str(model_path))

    def load_model(self, model_path: Path) -> None:
        """Replace `self.model` with the model loaded from `model_path`, bound to `self.env`."""
        self.model = self.model.load(str(model_path), env=self.env)


class MaskableAgent(Agent):
    """Agent mixin for environments that expose `action_masks()`."""

    @staticmethod
    def mask_fn(env):
        """
        Return the action mask reported by the unwrapped environment.

        This is the callable the agents in this module hand to `ActionMasker`.
        """
        base_env = env.unwrapped
        return base_env.action_masks()


class MaskableModelAgent(MaskableAgent, ModelAgent):
    """Model-backed agent whose predictions are restricted by the action mask."""

    def choose_action(self, obs_tensor):
        """
        Return the model's deterministic action for `obs_tensor`.

        The current action mask is obtained through `MaskableAgent.mask_fn` and
        forwarded to `model.predict` as `action_masks`.
        """
        action, _ = self.model.predict(
            obs_tensor,
            action_masks=MaskableAgent.mask_fn(self.env),
            deterministic=True,
        )
        # Return the prediction: `Agent.evaluate` passes this value to
        # `env.step`, so falling off the end (None) would break evaluation.
        return action

In [None]:
# | export


class MaskableRandomAgent(MaskableAgent):
    """Baseline agent that picks at random among the actions the mask allows."""

    def __init__(
        self,
        env_config: EnvConfig | None = None,
        render_mode: str | None = None,
        name: str = "MaskableRandomAgent",
    ):
        """
        Build an `ElectricityMarketEnv` wrapped in `ActionMasker` and hand it,
        together with the detected device, to the base class.
        """
        if torch.cuda.is_available():
            device = "cuda"
        else:
            device = "cpu"
        market_env = ElectricityMarketEnv(env_config, render_mode=render_mode)
        masked_env = ActionMasker(market_env, self.mask_fn)
        super().__init__(name, device=device, env=masked_env)

    def choose_action(self, obs_tensor):
        """Sample one currently allowed action; the observation is not used."""
        mask = self.env.action_masks()
        # Indices (along the first axis) where the mask is truthy.
        allowed = np.nonzero(mask)[0]
        return np.random.choice(allowed)

In [None]:
# | export


class A2CAgent(ModelAgent):
    """A2C Agent for the Electricity Market Environment."""

    def __init__(
        self,
        env_config: EnvConfig | None = None,
        render_mode: str | None = None,
        name: str = "A2CAgent",
    ):
        """
        Build a `Monitor`-wrapped `ElectricityMarketEnv` and an A2C model on it.

        The model uses the "MlpPolicy" policy, logs to "./tensorboard/", and runs
        on CUDA when `torch.cuda.is_available()` is true, otherwise on the CPU.
        When `env_config` is None, a default `EnvConfig()` is stored on the agent
        so that `train` can read `max_timestep` from it.
        """
        device = "cuda" if torch.cuda.is_available() else "cpu"
        env = Monitor(
            ElectricityMarketEnv(env_config, render_mode=render_mode),
        )
        model = A2C(
            "MlpPolicy",
            env,
            verbose=0,
            tensorboard_log="./tensorboard/",
            device=device,
        )
        super().__init__(
            name=name,
            env=env,
            model=model,
            device=device,
            # `ModelAgent.train` dereferences `env_config.max_timestep`, so an
            # agent created with the default argument must not store None.
            env_config=env_config if env_config is not None else EnvConfig(),
        )

In [None]:
# | export


class MaskablePPOAgent(ModelAgent, MaskableAgent):
    """Maskable PPO Agent for the Electricity Market Environment."""

    def __init__(
        self,
        env_config: EnvConfig | None = None,
        render_mode: str | None = None,
        name: str = "MaskablePPOAgent",
    ):
        """
        Build the masked, monitored environment and a default MaskablePPO model.

        The environment is an `ElectricityMarketEnv` wrapped first in
        `ActionMasker` (using `mask_fn`) and then in `Monitor`. When
        `env_config` is falsy, a default `EnvConfig()` is stored on the agent.
        """
        env = Monitor(
            ActionMasker(
                ElectricityMarketEnv(env_config, render_mode=render_mode),
                self.mask_fn,
            )
        )
        device = "cuda" if torch.cuda.is_available() else "cpu"
        model = MaskablePPO(
            MaskableActorCriticPolicy,
            env,
            verbose=0,
            tensorboard_log="./tensorboard/",
            device=device,
        )
        super().__init__(
            name=name, env=env, model=model, device=device, env_config=env_config
        )
        # Filled in by `optimize`; stays empty until then.
        self.optimized_hyperparameters = {}
        self.env_config = env_config or EnvConfig()

    def choose_action(self, obs_tensor):
        """Return the model's deterministic action under the current action mask."""
        action, _ = self.model.predict(
            obs_tensor, deterministic=True, action_masks=MaskableAgent.mask_fn(self.env)
        )
        return action

    def optimize(self) -> None:
        """
        Search for MaskablePPO hyperparameters with Optuna.

        A study named after this agent is created, or loaded if it already
        exists, in the SQLite storage "sqlite:///optuna_study.db". It runs
        `N_TRAILS` trials with `n_jobs=-1`, maximizing the mean evaluation
        reward of a freshly trained agent. `ValueError` is passed to `catch`,
        so a trial raising it does not abort the study.

        Afterwards the best parameters are stored in
        `self.optimized_hyperparameters` and `self.model` is replaced by a new
        MaskablePPO built from them. That model is untrained, so `train` must
        be called again before evaluating.
        """

        def objective(trial):
            # One entry per tuned MaskablePPO keyword argument. The sampling
            # order follows the order of the dict literal.
            sampled = {
                "learning_rate": trial.suggest_float(
                    "learning_rate", 1e-5, 1e-3, log=True
                ),
                "n_steps": trial.suggest_int("n_steps", 32, 1024, log=True),
                "batch_size": trial.suggest_int("batch_size", 16, 256, log=True),
                "gae_lambda": trial.suggest_float("gae_lambda", 0.8, 1.0),
                "ent_coef": trial.suggest_float("ent_coef", 0.0, 0.02),
                "vf_coef": trial.suggest_float("vf_coef", 0.1, 1.0),
                "clip_range": trial.suggest_float("clip_range", 0.1, 0.3),
                "max_grad_norm": trial.suggest_float("max_grad_norm", 0.1, 1.0),
            }

            # Each trial gets its own agent and environment, so trials do not
            # share state with this agent.
            agent = MaskablePPOAgent(
                self.env_config,
            )
            agent.model = MaskablePPO(
                MaskableActorCriticPolicy,
                agent.env,
                **sampled,
                verbose=0,
                device=self.device,
            )
            agent.train()

            return np.mean(agent.evaluate().rewards)

        # NOTE(review): the objective never reports intermediate values, so the
        # configured pruner presumably has nothing to act on - confirm.
        study = optuna.create_study(
            study_name=self.name,
            storage="sqlite:///optuna_study.db",
            load_if_exists=True,
            direction="maximize",
            pruner=optuna.pruners.HyperbandPruner(),
            sampler=optuna.samplers.TPESampler(),
        )

        study.optimize(
            objective,
            n_trials=N_TRAILS,
            n_jobs=-1,
            show_progress_bar=True,
            catch=(ValueError,),
        )

        self.optimized_hyperparameters = study.best_params

        self.model = MaskablePPO(
            MaskableActorCriticPolicy,
            self.env,
            **self.optimized_hyperparameters,
            verbose=0,
            tensorboard_log="./tensorboard/",
            device=self.device,
        )

    def export_hyperparameters(self, filename: str) -> None:
        """
        Write `self.optimized_hyperparameters` to `filename` as YAML.

        The file is opened for writing, so existing content is replaced.
        """
        with open(filename, "w", encoding="utf-8") as file:
            yaml.dump(self.optimized_hyperparameters, file)

### Evaluation MaskableRandom on ElectricityMarketEnv


In [None]:
# | hide
# Random-action baseline: it has no model, so it is evaluated without training.
maskable_random_agent = MaskableRandomAgent(render_mode="human", env_config=ENV_CONFIG)

# Store the evaluation result under the agent's name.
evaluation_data_per_agent[maskable_random_agent.name] = maskable_random_agent.evaluate()

seeds:   0%|          | 0/30 [00:00<?, ?it/s]

### Evaluation A2C on ElectricityMarketEnv


In [None]:
# | hide
# A2C agent: train first, then evaluate.
a2c_agent = A2CAgent(render_mode="human", env_config=ENV_CONFIG)

a2c_agent.train()

# The trained model is only written to disk on full (non-quick) runs.
if not QUICK_MODE:
    a2c_agent.save_model(f"{a2c_agent.name}.model")

# Store the evaluation result under the agent's name.
evaluation_data_per_agent[a2c_agent.name] = a2c_agent.evaluate()

seeds:   0%|          | 0/30 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

seeds:   0%|          | 0/30 [00:00<?, ?it/s]

### Evaluation MaskablePPO with default hyperparameters on ElectricityMarketEnv

In [None]:
# | hide
# MaskablePPO with the model built in the constructor (no tuned hyperparameters).
maskable_ppo_agent = MaskablePPOAgent(render_mode="human", env_config=ENV_CONFIG)

maskable_ppo_agent.train()

# The trained model is only written to disk on full (non-quick) runs.
if not QUICK_MODE:
    maskable_ppo_agent.save_model(f"{maskable_ppo_agent.name}.model")

# Store the evaluation result under the agent's name.
evaluation_data_per_agent[maskable_ppo_agent.name] = maskable_ppo_agent.evaluate()

seeds:   0%|          | 0/30 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

seeds:   0%|          | 0/30 [00:00<?, ?it/s]

### Evaluation MaskablePPO with optimized hyperparameters on ElectricityMarketEnv

In [None]:
# | hide
# MaskablePPO with tuned hyperparameters; the distinct name is also used as the
# Optuna study name and as the key in `evaluation_data_per_agent`.
optimized_maskable_ppo_agent = MaskablePPOAgent(
    render_mode="human", env_config=ENV_CONFIG, name="OptimizedMaskablePPOAgent"
)
# Runs the hyperparameter search and replaces the agent's model with a new,
# untrained one built from the best parameters found.
optimized_maskable_ppo_agent.optimize()

# The best hyperparameters are only exported on full (non-quick) runs.
if not QUICK_MODE:
    optimized_maskable_ppo_agent.export_hyperparameters(
        f"{optimized_maskable_ppo_agent.name}.yaml"
    )

# Train the model that `optimize` installed.
optimized_maskable_ppo_agent.train()

# The trained model is only written to disk on full (non-quick) runs.
if not QUICK_MODE:
    optimized_maskable_ppo_agent.save_model(
        f"{optimized_maskable_ppo_agent.name}.model"
    )

# Store the evaluation result under the agent's name.
evaluation_data_per_agent[optimized_maskable_ppo_agent.name] = (
    optimized_maskable_ppo_agent.evaluate()
)

[I 2025-03-03 22:47:11,957] A new study created in RDB with name: OptimizedMaskablePPOAgent


  0%|          | 0/10 [00:00<?, ?it/s]

seeds:   0%|          | 0/30 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

seeds:   0%|          | 0/30 [00:00<?, ?it/s]

seeds:   0%|          | 0/30 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

seeds:   0%|          | 0/30 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

seeds:   0%|          | 0/30 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

seeds:   0%|          | 0/30 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

seeds:   0%|          | 0/30 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

seeds:   0%|          | 0/30 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

seeds:   0%|          | 0/30 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

seeds:   0%|          | 0/30 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

[I 2025-03-04 03:22:18,121] Trial 6 finished with value: -172.14040192639015 and parameters: {'learning_rate': 2.156591944927807e-05, 'n_steps': 311, 'batch_size': 170, 'gae_lambda': 0.9717942135775777, 'ent_coef': 0.00562915958158533, 'vf_coef': 0.5437199380463767, 'clip_range': 0.22798784087870153, 'max_grad_norm': 0.43910598424665215}. Best is trial 6 with value: -172.14040192639015.


seeds:   0%|          | 0/30 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

[I 2025-03-04 03:30:54,222] Trial 4 finished with value: 1904.0271516585292 and parameters: {'learning_rate': 0.00020661972370519664, 'n_steps': 575, 'batch_size': 231, 'gae_lambda': 0.9514806516892822, 'ent_coef': 0.005421411454956131, 'vf_coef': 0.2579467377236896, 'clip_range': 0.1495891117730842, 'max_grad_norm': 0.9281860007318072}. Best is trial 4 with value: 1904.0271516585292.


seeds:   0%|          | 0/30 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

seeds:   0%|          | 0/30 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

seeds:   0%|          | 0/30 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

[I 2025-03-04 04:13:58,177] Trial 7 finished with value: 1915.629888452545 and parameters: {'learning_rate': 0.00010214731033085636, 'n_steps': 917, 'batch_size': 118, 'gae_lambda': 0.8854746150447337, 'ent_coef': 0.006804212135342727, 'vf_coef': 0.1684745446454396, 'clip_range': 0.13019053087221258, 'max_grad_norm': 0.5122672798516843}. Best is trial 7 with value: 1915.629888452545.


Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

[I 2025-03-04 04:18:33,287] Trial 2 finished with value: 1066.638346091307 and parameters: {'learning_rate': 2.7050092127571763e-05, 'n_steps': 88, 'batch_size': 123, 'gae_lambda': 0.8903698243918552, 'ent_coef': 0.0056898887869421145, 'vf_coef': 0.18717242610251583, 'clip_range': 0.20481956900927875, 'max_grad_norm': 0.21206356043186225}. Best is trial 7 with value: 1915.629888452545.


Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

seeds:   0%|          | 0/30 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

seeds:   0%|          | 0/30 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

[I 2025-03-04 05:44:18,936] Trial 3 finished with value: 1404.3698288166795 and parameters: {'learning_rate': 2.6109196671096892e-05, 'n_steps': 38, 'batch_size': 203, 'gae_lambda': 0.9454715549718409, 'ent_coef': 0.009193884985091225, 'vf_coef': 0.9448209250170455, 'clip_range': 0.22339278345283103, 'max_grad_norm': 0.21800877782307565}. Best is trial 7 with value: 1915.629888452545.


Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

[I 2025-03-04 05:44:38,803] Trial 0 finished with value: 1920.091579628642 and parameters: {'learning_rate': 0.000258508699159723, 'n_steps': 398, 'batch_size': 42, 'gae_lambda': 0.9437295185392839, 'ent_coef': 0.008362300516304235, 'vf_coef': 0.5644764548744111, 'clip_range': 0.12159650395951578, 'max_grad_norm': 0.9902247429571766}. Best is trial 0 with value: 1920.091579628642.


Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

seeds:   0%|          | 0/30 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

[I 2025-03-04 06:13:36,711] Trial 8 finished with value: 1908.9607000637523 and parameters: {'learning_rate': 8.715112259433786e-05, 'n_steps': 249, 'batch_size': 201, 'gae_lambda': 0.8336930136069418, 'ent_coef': 0.013438159255011982, 'vf_coef': 0.39698203142504873, 'clip_range': 0.23140974787283536, 'max_grad_norm': 0.6520782948294915}. Best is trial 0 with value: 1920.091579628642.


Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

seeds:   0%|          | 0/30 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

[I 2025-03-04 06:50:24,017] Trial 9 finished with value: 1761.023969445187 and parameters: {'learning_rate': 0.0001321135642783727, 'n_steps': 655, 'batch_size': 60, 'gae_lambda': 0.9912255340848429, 'ent_coef': 0.0077390732685984, 'vf_coef': 0.2807907837545579, 'clip_range': 0.2666148361957634, 'max_grad_norm': 0.6065738210924674}. Best is trial 0 with value: 1920.091579628642.


Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

seeds:   0%|          | 0/30 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

[I 2025-03-04 07:12:47,702] Trial 1 finished with value: 1841.6095483760816 and parameters: {'learning_rate': 1.6178424048973947e-05, 'n_steps': 35, 'batch_size': 25, 'gae_lambda': 0.8792705645703804, 'ent_coef': 0.01559676066141568, 'vf_coef': 0.5887775807546184, 'clip_range': 0.26618822773587225, 'max_grad_norm': 0.25736963269540786}. Best is trial 0 with value: 1920.091579628642.


seeds:   0%|          | 0/30 [00:00<?, ?it/s]

[I 2025-03-04 07:15:37,471] Trial 5 finished with value: 1629.3278669505364 and parameters: {'learning_rate': 0.0008033582885000316, 'n_steps': 32, 'batch_size': 30, 'gae_lambda': 0.8547126746310278, 'ent_coef': 0.011756398405212429, 'vf_coef': 0.6374034328260573, 'clip_range': 0.2465735780562984, 'max_grad_norm': 0.191973024590956}. Best is trial 0 with value: 1920.091579628642.


seeds:   0%|          | 0/30 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

seeds:   0%|          | 0/30 [00:00<?, ?it/s]

### Adding expert knowledge to the masking function to make learning more efficient

In [None]:
# | export


def is_action_safe(self, action: int) -> bool:
    """Report whether `action` keeps the battery strictly inside its safe range.

    The charge amount associated with `action` is added to the current state
    of charge, and the result is compared against the two bounds stored in
    `self._battery_safe_range`. Both comparisons are strict, so landing
    exactly on either bound counts as unsafe.
    """
    delta = self._charge_amount(action)
    projected_state_of_charge = self._current_state_of_charge + delta
    lower_bound, upper_bound = self._battery_safe_range
    return upper_bound > projected_state_of_charge > lower_bound


def expert_knowledge_action_masks(self) -> np.ndarray:
    """Build a boolean mask of the actions that are both valid and safe.

    Entry ``i`` is True only when ``self._is_action_valid(i)`` and
    ``self.is_action_safe(i)`` both hold. If that rules out every action,
    the middle action index is enabled so the mask is never all-False.
    """
    n_actions = self.action_space.n
    allowed = np.zeros(n_actions, dtype=bool)
    for candidate in range(n_actions):
        allowed[candidate] = self._is_action_valid(
            candidate
        ) and self.is_action_safe(candidate)
    if not allowed.any():
        # Fallback: always leave one action selectable.
        # NOTE(review): presumably the middle index is the neutral/no-op action - confirm.
        allowed[len(allowed) // 2] = True
    return allowed

In [None]:
# | hide
# Monkey-patch ElectricityMarketEnv so that its `action_masks` method is the
# expert-knowledge mask defined in this notebook.
ElectricityMarketEnv.action_masks = expert_knowledge_action_masks
# Attach `is_action_safe` as a method too; the expert mask calls it through `self`.
ElectricityMarketEnv.is_action_safe = is_action_safe



### Evaluating MaskableRandomAgent with Expert Knowledge on ElectricityMarketEnv

In [None]:
# | hide
# Random-policy agent; it needs no training, so it is evaluated right away.
expert_maskable_random_agent = MaskableRandomAgent(
    name="ExpertMaskableRandomAgent",
    env_config=ENV_CONFIG,
    render_mode="human",
)

# Store the evaluation result under the agent's name alongside the other agents.
evaluation_data_per_agent[expert_maskable_random_agent.name] = (
    expert_maskable_random_agent.evaluate()
)

seeds:   0%|          | 0/30 [00:00<?, ?it/s]

### Evaluating MaskablePPO with default hyperparameters and Expert Knowledge on ElectricityMarketEnv

In [None]:
# | hide
expert_maskable_ppo_agent = MaskablePPOAgent(
    name="ExpertMaskablePPOAgent",
    env_config=ENV_CONFIG,
    render_mode="human",
)

# Train directly, without running a hyperparameter search first.
expert_maskable_ppo_agent.train()

# QUICK_MODE (used for CI) skips writing the trained model to disk.
if not QUICK_MODE:
    expert_maskable_ppo_agent.save_model(f"{expert_maskable_ppo_agent.name}.model")

# Store the evaluation result under the agent's name alongside the other agents.
evaluation_data_per_agent[expert_maskable_ppo_agent.name] = (
    expert_maskable_ppo_agent.evaluate()
)

seeds:   0%|          | 0/30 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

seeds:   0%|          | 0/30 [00:00<?, ?it/s]

### Evaluating MaskablePPO with optimized hyperparameters and Expert Knowledge on ElectricityMarketEnv


In [None]:
# | hide
optimized_expert_maskable_ppo_agent = MaskablePPOAgent(
    name="OptimizedExpertMaskablePPOAgent",
    env_config=ENV_CONFIG,
    render_mode="human",
)

# Step 1: run the agent's hyperparameter optimization before the final training.
optimized_expert_maskable_ppo_agent.optimize()

# QUICK_MODE (used for CI) skips every write to disk in this cell.
if not QUICK_MODE:
    optimized_expert_maskable_ppo_agent.export_hyperparameters(
        f"{optimized_expert_maskable_ppo_agent.name}.yaml"
    )

# Step 2: train the agent after the optimization step.
optimized_expert_maskable_ppo_agent.train()

if not QUICK_MODE:
    optimized_expert_maskable_ppo_agent.save_model(
        f"{optimized_expert_maskable_ppo_agent.name}.model"
    )

# Step 3: store the evaluation result under the agent's name alongside the other agents.
evaluation_data_per_agent[optimized_expert_maskable_ppo_agent.name] = (
    optimized_expert_maskable_ppo_agent.evaluate()
)

[I 2025-03-04 08:05:54,625] A new study created in RDB with name: OptimizedExpertMaskablePPOAgent


  0%|          | 0/10 [00:00<?, ?it/s]

seeds:   0%|          | 0/30 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

seeds:   0%|          | 0/30 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

seeds:   0%|          | 0/30 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

seeds:   0%|          | 0/30 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

seeds:   0%|          | 0/30 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

seeds:   0%|          | 0/30 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

seeds:   0%|          | 0/30 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

seeds:   0%|          | 0/30 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

seeds:   0%|          | 0/30 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

[I 2025-03-04 13:00:37,374] Trial 2 finished with value: 1912.7412644624676 and parameters: {'learning_rate': 2.582024849444009e-05, 'n_steps': 429, 'batch_size': 154, 'gae_lambda': 0.837220732795052, 'ent_coef': 0.007396087261032374, 'vf_coef': 0.5502673435551507, 'clip_range': 0.2503830644646013, 'max_grad_norm': 0.8013788795893468}. Best is trial 2 with value: 1912.7412644624676.


seeds:   0%|          | 0/30 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

seeds:   0%|          | 0/30 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

seeds:   0%|          | 0/30 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

[I 2025-03-04 14:40:08,357] Trial 1 finished with value: 1955.1274975056645 and parameters: {'learning_rate': 1.974572093846395e-05, 'n_steps': 612, 'batch_size': 71, 'gae_lambda': 0.9724243578514046, 'ent_coef': 0.003368269804256756, 'vf_coef': 0.5029765181864725, 'clip_range': 0.28655565142923256, 'max_grad_norm': 0.6574466411036742}. Best is trial 1 with value: 1955.1274975056645.


seeds:   0%|          | 0/30 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

[I 2025-03-04 14:43:03,122] Trial 7 finished with value: 1937.8358341086516 and parameters: {'learning_rate': 7.786901550803252e-05, 'n_steps': 589, 'batch_size': 67, 'gae_lambda': 0.8782256542378953, 'ent_coef': 0.01721282379454497, 'vf_coef': 0.11616945043593437, 'clip_range': 0.21449009277545894, 'max_grad_norm': 0.42783550252834857}. Best is trial 1 with value: 1955.1274975056645.


Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

seeds:   0%|          | 0/30 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

[I 2025-03-04 15:20:44,290] Trial 4 finished with value: 1943.9533680616248 and parameters: {'learning_rate': 0.00020389220790828556, 'n_steps': 47, 'batch_size': 110, 'gae_lambda': 0.9319685551479397, 'ent_coef': 0.014790809380979285, 'vf_coef': 0.4033327590898912, 'clip_range': 0.17529639544269207, 'max_grad_norm': 0.9551966589215483}. Best is trial 1 with value: 1955.1274975056645.


Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

seeds:   0%|          | 0/30 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

[I 2025-03-04 17:10:06,722] Trial 5 finished with value: 1937.36848160581 and parameters: {'learning_rate': 0.00014004724928608868, 'n_steps': 170, 'batch_size': 32, 'gae_lambda': 0.9023009210305164, 'ent_coef': 0.01584760403016155, 'vf_coef': 0.6959463623967596, 'clip_range': 0.2916853556463839, 'max_grad_norm': 0.7962551447256957}. Best is trial 1 with value: 1955.1274975056645.


Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

seeds:   0%|          | 0/30 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

[I 2025-03-04 18:13:13,704] Trial 9 finished with value: 1951.209096247934 and parameters: {'learning_rate': 0.00011196760109081355, 'n_steps': 481, 'batch_size': 241, 'gae_lambda': 0.9833260118635945, 'ent_coef': 0.003021932585647171, 'vf_coef': 0.37580179618994425, 'clip_range': 0.24779265544799703, 'max_grad_norm': 0.7763755295614958}. Best is trial 1 with value: 1955.1274975056645.


Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

seeds:   0%|          | 0/30 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

seeds:   0%|          | 0/30 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

[I 2025-03-04 18:48:50,217] Trial 8 finished with value: 1968.066803876237 and parameters: {'learning_rate': 0.0004915709371380327, 'n_steps': 301, 'batch_size': 45, 'gae_lambda': 0.9472260681398598, 'ent_coef': 0.00551598278144674, 'vf_coef': 0.5937149506354293, 'clip_range': 0.13218597297209073, 'max_grad_norm': 0.3242090243115361}. Best is trial 8 with value: 1968.066803876237.


seeds:   0%|          | 0/30 [00:00<?, ?it/s]

[I 2025-03-04 18:52:24,615] Trial 3 finished with value: 1961.9655875023236 and parameters: {'learning_rate': 1.3186650516530553e-05, 'n_steps': 205, 'batch_size': 18, 'gae_lambda': 0.9521510979109759, 'ent_coef': 0.018745960419003477, 'vf_coef': 0.8709934186582685, 'clip_range': 0.16650705525637016, 'max_grad_norm': 0.4968876093894483}. Best is trial 8 with value: 1968.066803876237.


Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

[I 2025-03-04 18:55:11,278] Trial 6 finished with value: 1923.591313894112 and parameters: {'learning_rate': 8.625594616843281e-05, 'n_steps': 688, 'batch_size': 17, 'gae_lambda': 0.9053898867388253, 'ent_coef': 0.00012629332383871317, 'vf_coef': 0.7159966887527807, 'clip_range': 0.2858114137408776, 'max_grad_norm': 0.510045631486288}. Best is trial 8 with value: 1968.066803876237.


seeds:   0%|          | 0/30 [00:00<?, ?it/s]

[I 2025-03-04 18:57:52,051] Trial 0 finished with value: 1912.7660500327072 and parameters: {'learning_rate': 1.9689571191100637e-05, 'n_steps': 153, 'batch_size': 16, 'gae_lambda': 0.8213223334258669, 'ent_coef': 0.008032203826223012, 'vf_coef': 0.807085712387601, 'clip_range': 0.256278616410658, 'max_grad_norm': 0.10314355001748177}. Best is trial 8 with value: 1968.066803876237.


seeds:   0%|          | 0/30 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

seeds:   0%|          | 0/30 [00:00<?, ?it/s]

In [None]:
# | hide
# Persist the `evaluation_data_per_agent` dict to disk so results survive the
# notebook session; skipped when QUICK_MODE is on (the CI configuration).
if not QUICK_MODE:
    with Path("evaluation_data_per_agent.pkl").open("wb") as pickle_file:
        pickle.dump(evaluation_data_per_agent, pickle_file)

In [None]:
# | hide
# Run the nbdev export step for this project (cells tagged `# | export`
# are the ones intended for the generated module).
from nbdev import nbdev_export

nbdev_export()