# player

> This module handles training, evaluating, and hyperparameter optimization of RL agents on the electricity market environment.

In [None]:
# | default_exp player

In [None]:
# | hide
from nbdev.showdoc import *

In [None]:
# | export
import pickle
import shutil
from abc import ABC
from pathlib import Path

import numpy as np
import optuna
import torch
import yaml
from sb3_contrib import MaskablePPO
from sb3_contrib.common.maskable.policies import MaskableActorCriticPolicy
from sb3_contrib.common.wrappers import ActionMasker
from stable_baselines3 import A2C
from stable_baselines3.common.callbacks import CheckpointCallback
from stable_baselines3.common.monitor import Monitor
from tqdm.notebook import tqdm

from electricity_market.env import ElectricityMarketEnv, EnvConfig
from electricity_market.utils import EvaluationData

In [None]:
# | export
# Number of learning rounds run for each training seed.
N_TRAIN_EPISODES = 3
# Number of Optuna trials run during hyperparameter optimization.
# NOTE: "TRAILS" is a misspelling of "trials"; the name is kept because other code refers to it.
N_TRAILS = 10
# Seeds passed to env.reset() while training.
TRAIN_SEEDS = [
    111111,
    121212,
    123456,
    200000,
    217890,
    222222,
    224775,
    234567,
    253084,
    285234,
    312135,
    314831,
    333333,
    345678,
    406339,
    444444,
    471678,
    555555,
    562845,
    666666,
    701753,
    755460,
    761386,
    777777,
    789391,
    888888,
    993068,
    979797,
    987654,
    999999,
]
# Seeds passed to env.reset() while evaluating; disjoint from TRAIN_SEEDS.
EVALUATE_SEEDS = [
    117127,
    136901,
    223246,
    243382,
    245720,
    248832,
    288598,
    374487,
    447331,
    447851,
    490428,
    553737,
    557309,
    571504,
    601426,
    632202,
    653634,
    844596,
    848937,
    849735,
    865470,
    866822,
    876563,
    880689,
    887591,
    911016,
    920528,
    963993,
    967995,
    992634,
]
# Environment configuration handed to the agents created in the evaluation cells.
ENV_CONFIG = EnvConfig()

# Output directories; the cleanup cell deletes both before a full (non-quick) run.
TENSORBOARD_PATH = Path("../tensorboard")
LOGS_PATH = Path("../logs")

# Keep QUICK_MODE = True for CI; set it to False for a full training/evaluation run.
QUICK_MODE = True

# Quick mode swaps in EnvConfig(max_timestep=10), a single training seed and a
# single evaluation seed, and sets both output paths to None.
if QUICK_MODE:
    ENV_CONFIG = EnvConfig(max_timestep=10)
    TRAIN_SEEDS = [10000]
    EVALUATE_SEEDS = [90000]
    TENSORBOARD_PATH = None
    LOGS_PATH = None

# Maps agent name -> EvaluationData; filled in by the evaluation cells.
evaluation_data_per_agent = {}

In [None]:
# | hide

# Full runs start from a clean slate: drop output left behind by earlier runs.
# In quick mode both paths are None, so nothing is removed.
if not QUICK_MODE:
    for stale_dir in (TENSORBOARD_PATH, LOGS_PATH):
        shutil.rmtree(stale_dir, ignore_errors=True)

In [None]:
# | export


class Agent(ABC):
    """Base class for all agents: stores the environment and runs evaluation."""

    def __init__(self, name, env, device):
        self.name = name
        self.env = env
        self.device = device

    def evaluate(self, render: bool = False) -> EvaluationData:
        """
        Run one episode per seed in EVALUATE_SEEDS and collect the returns.

        Each episode resets the environment with the seed, then repeatedly asks
        ``choose_action`` for an action until the environment reports that the
        episode terminated or was truncated. When ``render`` is true the
        environment is rendered after every step.

        Returns an EvaluationData whose ``rewards`` holds the summed reward of
        each episode and whose ``episodes`` holds the matching indices.
        """
        returns_per_seed = []

        for eval_seed in tqdm(EVALUATE_SEEDS, desc="seeds"):
            observation, _ = self.env.reset(seed=eval_seed)
            step_rewards = []

            while True:
                # Subclasses receive the observation as a float64 tensor on self.device.
                action = self.choose_action(
                    torch.tensor(observation, dtype=torch.float64).to(self.device)
                )
                observation, reward, terminated, truncated, _ = self.env.step(action)
                step_rewards.append(reward)

                if render:
                    self.env.render()

                if terminated or truncated:
                    break

            returns_per_seed.append(np.sum(step_rewards))

        return EvaluationData(
            episodes=list(range(len(returns_per_seed))),
            rewards=returns_per_seed,
        )

    def choose_action(self, obs_tensor):
        """Pick an action for the given observation; subclasses must override."""
        raise NotImplementedError


class ModelAgent(Agent):
    """
    Agent that delegates training and action selection to a wrapped model.

    The model must provide the ``learn``, ``predict``, ``save`` and ``load``
    methods used below (the Stable-Baselines3 algorithm interface).
    """

    def __init__(self, name, env, env_config, model, device):
        super().__init__(name, device=device, env=env)
        self.model = model
        self.env_config = env_config

    def train(self) -> None:
        """
        Train the model.

        For every seed in TRAIN_SEEDS the environment is reset with that seed
        and the model learns for ``env_config.max_timestep`` timesteps; this is
        repeated N_TRAIN_EPISODES times per seed.
        """
        checkpoint_callback = CheckpointCallback(save_freq=1000, save_path="../logs/")

        for seed in tqdm(TRAIN_SEEDS, desc="seeds"):
            for _ in tqdm(range(N_TRAIN_EPISODES), desc="Training episodes"):
                self.env.reset(seed=seed)
                self.model.learn(
                    total_timesteps=self.env_config.max_timestep,
                    callback=checkpoint_callback,
                    # Keep one continuous timestep counter across the repeated learn() calls.
                    reset_num_timesteps=False,
                    tb_log_name=self.name,
                )

    def choose_action(self, obs_tensor):
        """
        Return the model's deterministic action for the given observation.

        Tensors are moved to host memory and converted to a NumPy array first:
        the model's ``predict`` works on NumPy input, and a tensor that lives on
        an accelerator cannot be converted implicitly.
        """
        if isinstance(obs_tensor, torch.Tensor):
            obs_tensor = obs_tensor.detach().cpu().numpy()
        action, _ = self.model.predict(obs_tensor, deterministic=True)
        return action

    def save_model(self, model_path: Path) -> None:
        """Save the wrapped model to ``model_path``."""
        self.model.save(str(model_path))

    def load_model(self, model_path: Path) -> None:
        """
        Replace the wrapped model with the one stored at ``model_path``.

        ``load`` builds and returns a new model instead of updating the
        existing one in place, so its result has to be assigned back.
        """
        self.model = self.model.load(
            str(model_path), env=self.env, device=self.device
        )


class MaskableAgent(Agent):
    """Base for agents that work with the environment's action mask."""

    @staticmethod
    def mask_fn(env):
        """
        Return the action mask reported by the unwrapped environment.
        """
        base_env = env.unwrapped
        return base_env.action_masks()


class MaskableModelAgent(MaskableAgent, ModelAgent):
    """Model-backed agent whose predictions are restricted by the action mask."""

    def choose_action(self, obs_tensor):
        """
        Return the model's deterministic action, limited to currently valid actions.

        The mask is read from the environment through ``MaskableAgent.mask_fn``.
        """
        # predict() works on NumPy input; a tensor on an accelerator must be
        # brought back to host memory before it can be converted.
        if isinstance(obs_tensor, torch.Tensor):
            obs_tensor = obs_tensor.detach().cpu().numpy()
        action, _ = self.model.predict(
            obs_tensor,
            action_masks=MaskableAgent.mask_fn(self.env),
            deterministic=True,
        )
        # The original fell off the end here and therefore always returned None.
        return action

In [None]:
# | export


class MaskableRandomAgent(MaskableAgent):
    """Untrained baseline that picks a random action among the valid ones."""

    def __init__(
        self,
        env_config: EnvConfig | None = None,
        render_mode: str | None = None,
        name: str = "MaskableRandomAgent",
    ):
        """
        Build the action-masked environment and register it with the base agent.
        """
        market_env = ElectricityMarketEnv(env_config, render_mode=render_mode)
        super().__init__(
            name,
            device="cuda" if torch.cuda.is_available() else "cpu",
            env=ActionMasker(market_env, self.mask_fn),
        )

    def choose_action(self, obs_tensor):
        """
        Sample one action from those the current mask allows.

        The observation is ignored; only the environment's action mask matters.
        """
        allowed = np.nonzero(self.env.action_masks())[0]
        return np.random.choice(allowed)

In [None]:
# | export


class A2CAgent(ModelAgent):
    """A2C Agent for the Electricity Market Environment."""

    def __init__(
        self,
        env_config: EnvConfig | None = None,
        render_mode: str | None = None,
        name: str = "A2CAgent",
    ):
        """
        Create the monitored environment and an A2C model bound to it.

        ``env_config`` may be omitted; the agent then stores a default
        ``EnvConfig()`` so that ``train()`` can read ``max_timestep`` from it.
        """
        device = "cuda" if torch.cuda.is_available() else "cpu"
        env = Monitor(
            ElectricityMarketEnv(env_config, render_mode=render_mode),
        )
        # NOTE(review): the log directory is hard-coded here and does not follow
        # the module-level TENSORBOARD_PATH setting - confirm which is intended.
        model = A2C(
            "MlpPolicy",
            env,
            verbose=0,
            tensorboard_log="./tensorboard/",
            device=device,
        )
        super().__init__(
            name=name,
            env=env,
            model=model,
            device=device,
            # train() dereferences env_config.max_timestep, so never store None.
            env_config=env_config if env_config is not None else EnvConfig(),
        )

In [None]:
# | export


class MaskablePPOAgent(ModelAgent, MaskableAgent):
    """Maskable PPO Agent for the Electricity Market Environment."""

    def __init__(
        self,
        env_config: EnvConfig | None = None,
        render_mode: str | None = None,
        name: str = "MaskablePPOAgent",
    ):
        """
        Create the monitored, action-masked environment and a MaskablePPO model.

        ``env_config`` may be omitted, in which case a default ``EnvConfig()``
        is stored on the agent.
        """
        env = Monitor(
            ActionMasker(
                ElectricityMarketEnv(env_config, render_mode=render_mode),
                self.mask_fn,
            )
        )
        device = "cuda" if torch.cuda.is_available() else "cpu"
        model = MaskablePPO(
            MaskableActorCriticPolicy,
            env,
            verbose=0,
            tensorboard_log="./tensorboard/",
            device=device,
        )
        super().__init__(
            name=name, env=env, model=model, device=device, env_config=env_config
        )
        self.optimized_hyperparameters = {}
        self.env_config = env_config or EnvConfig()

    def choose_action(self, obs_tensor):
        """
        Return the model's deterministic action, limited to currently valid actions.

        Without this override the method resolves to ``ModelAgent.choose_action``
        (ModelAgent comes first in the base-class list), which predicts without
        an action mask.
        """
        # predict() works on NumPy input; a tensor on an accelerator must be
        # brought back to host memory before it can be converted.
        if isinstance(obs_tensor, torch.Tensor):
            obs_tensor = obs_tensor.detach().cpu().numpy()
        action, _ = self.model.predict(
            obs_tensor,
            action_masks=self.mask_fn(self.env),
            deterministic=True,
        )
        return action

    def optimize(self) -> None:
        """
        Optimize the agent with hyperparameters.

        Runs an Optuna study of N_TRAILS trials. Every trial trains a fresh
        agent with sampled hyperparameters and is scored by its mean evaluation
        reward (higher is better). Afterwards the best parameters are stored in
        ``optimized_hyperparameters`` and ``self.model`` is replaced by a new,
        untrained MaskablePPO model configured with them.
        """

        def objective(trial):
            # Sampling order is kept stable so parameter names line up across trials.
            hyperparameters = {
                "learning_rate": trial.suggest_float(
                    "learning_rate", 1e-5, 1e-3, log=True
                ),
                "n_steps": trial.suggest_int("n_steps", 32, 1024, log=True),
                "batch_size": trial.suggest_int("batch_size", 16, 256, log=True),
                "gae_lambda": trial.suggest_float("gae_lambda", 0.8, 1.0),
                "ent_coef": trial.suggest_float("ent_coef", 0.0, 0.02),
                "vf_coef": trial.suggest_float("vf_coef", 0.1, 1.0),
                "clip_range": trial.suggest_float("clip_range", 0.1, 0.3),
                "max_grad_norm": trial.suggest_float("max_grad_norm", 0.1, 1.0),
            }

            # Each trial gets its own agent (and environment) so trials stay independent.
            agent = MaskablePPOAgent(
                self.env_config,
            )
            agent.model = MaskablePPO(
                MaskableActorCriticPolicy,
                agent.env,
                **hyperparameters,
                verbose=0,
                device=self.device,
            )
            agent.train()

            return float(np.mean(agent.evaluate().rewards))

        study = optuna.create_study(
            study_name=self.name,
            storage="sqlite:///optuna_study.db",
            load_if_exists=True,
            direction="maximize",
            pruner=optuna.pruners.HyperbandPruner(),
            sampler=optuna.samplers.TPESampler(),
        )

        study.optimize(
            objective,
            n_trials=N_TRAILS,
            n_jobs=-1,
            show_progress_bar=True,
            # A trial that raises ValueError is recorded as failed instead of aborting the study.
            catch=(ValueError,),
        )

        self.optimized_hyperparameters = study.best_params

        self.model = MaskablePPO(
            MaskableActorCriticPolicy,
            self.env,
            **self.optimized_hyperparameters,
            verbose=0,
            tensorboard_log="./tensorboard/",
            device=self.device,
        )

    def export_hyperparameters(self, filename: str):
        """
        Export the optimized hyperparameters to a YAML file.
        """
        # Explicit encoding keeps the output independent of the platform default.
        with open(filename, "w", encoding="utf-8") as file:
            yaml.dump(self.optimized_hyperparameters, file)

### Evaluation MaskableRandom on ElectricityMarketEnv


In [None]:
# | hide
# Baseline: an untrained agent that samples among the currently valid actions.
maskable_random_agent = MaskableRandomAgent(env_config=ENV_CONFIG, render_mode="human")

evaluation_data_per_agent[maskable_random_agent.name] = (
    maskable_random_agent.evaluate()
)

seeds:   0%|          | 0/30 [00:00<?, ?it/s]

### Evaluation A2C on ElectricityMarketEnv


In [None]:
# | hide
# Train A2C on the training seeds, then score it on the evaluation seeds.
a2c_agent = A2CAgent(env_config=ENV_CONFIG, render_mode="human")
a2c_agent.train()

# The trained model is written to disk on full runs only.
if not QUICK_MODE:
    a2c_agent.save_model(f"{a2c_agent.name}.model")

evaluation_data_per_agent[a2c_agent.name] = (
    a2c_agent.evaluate()
)

seeds:   0%|          | 0/30 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

seeds:   0%|          | 0/30 [00:00<?, ?it/s]

### Evaluation MaskablePPO with default hyperparameters on ElectricityMarketEnv

In [None]:
# | hide
# Train MaskablePPO with its default hyperparameters, then score it.
maskable_ppo_agent = MaskablePPOAgent(env_config=ENV_CONFIG, render_mode="human")
maskable_ppo_agent.train()

# The trained model is written to disk on full runs only.
if not QUICK_MODE:
    maskable_ppo_agent.save_model(f"{maskable_ppo_agent.name}.model")

evaluation_data_per_agent[maskable_ppo_agent.name] = (
    maskable_ppo_agent.evaluate()
)

seeds:   0%|          | 0/30 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

seeds:   0%|          | 0/30 [00:00<?, ?it/s]

### Evaluation MaskablePPO with optimized hyperparameters on ElectricityMarketEnv

In [None]:
# | hide
# Search hyperparameters first, then train the re-created model and score it.
optimized_maskable_ppo_agent = MaskablePPOAgent(
    name="OptimizedMaskablePPOAgent",
    env_config=ENV_CONFIG,
    render_mode="human",
)
optimized_maskable_ppo_agent.optimize()

# The best hyperparameters are exported on full runs only.
if not QUICK_MODE:
    optimized_maskable_ppo_agent.export_hyperparameters(
        f"{optimized_maskable_ppo_agent.name}.yaml"
    )

optimized_maskable_ppo_agent.train()

# The trained model is written to disk on full runs only.
if not QUICK_MODE:
    optimized_maskable_ppo_agent.save_model(
        f"{optimized_maskable_ppo_agent.name}.model"
    )

evaluation_data_per_agent[
    optimized_maskable_ppo_agent.name
] = optimized_maskable_ppo_agent.evaluate()

[I 2025-03-02 11:59:15,998] A new study created in RDB with name: OptimizedMaskablePPOAgent


  0%|          | 0/10 [00:00<?, ?it/s]

seeds:   0%|          | 0/30 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

seeds:   0%|          | 0/30 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

seeds:   0%|          | 0/30 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

seeds:   0%|          | 0/30 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

seeds:   0%|          | 0/30 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

seeds:   0%|          | 0/30 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

seeds:   0%|          | 0/30 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

seeds:   0%|          | 0/30 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

seeds:   0%|          | 0/30 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

seeds:   0%|          | 0/30 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

[I 2025-03-02 17:04:00,517] Trial 7 finished with value: -2098.291834159794 and parameters: {'learning_rate': 0.0006484246332550967, 'n_steps': 553, 'batch_size': 249, 'gae_lambda': 0.9097701889421966, 'ent_coef': 0.0021025283263649296, 'vf_coef': 0.6467125566276877, 'clip_range': 0.13913945308583192, 'max_grad_norm': 0.45248375251952055}. Best is trial 7 with value: -2098.291834159794.


seeds:   0%|          | 0/30 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

seeds:   0%|          | 0/30 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

seeds:   0%|          | 0/30 [00:00<?, ?it/s]

[I 2025-03-02 17:28:34,254] Trial 0 finished with value: -2955.3475798111554 and parameters: {'learning_rate': 0.0005296890581354876, 'n_steps': 837, 'batch_size': 136, 'gae_lambda': 0.9910698427958351, 'ent_coef': 0.010719147093698114, 'vf_coef': 0.8768317486604473, 'clip_range': 0.22373438704392665, 'max_grad_norm': 0.20813790932403967}. Best is trial 7 with value: -2098.291834159794.


seeds:   0%|          | 0/30 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

[I 2025-03-02 17:35:40,308] Trial 1 finished with value: -1700.057099092567 and parameters: {'learning_rate': 0.0007011489237451647, 'n_steps': 110, 'batch_size': 126, 'gae_lambda': 0.975814946939633, 'ent_coef': 0.007393724117954046, 'vf_coef': 0.38431599158866425, 'clip_range': 0.2853699665194148, 'max_grad_norm': 0.9664534187295596}. Best is trial 1 with value: -1700.057099092567.


Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

[I 2025-03-02 17:50:37,020] Trial 5 finished with value: -2033.1621546715191 and parameters: {'learning_rate': 6.0784953187579085e-05, 'n_steps': 849, 'batch_size': 97, 'gae_lambda': 0.8044523714500413, 'ent_coef': 0.019571529146181755, 'vf_coef': 0.5506906923355321, 'clip_range': 0.1679521310259069, 'max_grad_norm': 0.18738440380660487}. Best is trial 1 with value: -1700.057099092567.


seeds:   0%|          | 0/30 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

seeds:   0%|          | 0/30 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

seeds:   0%|          | 0/30 [00:00<?, ?it/s]

seeds:   0%|          | 0/30 [00:00<?, ?it/s]

[I 2025-03-02 18:09:38,317] Trial 3 finished with value: -1711.7098679725007 and parameters: {'learning_rate': 0.0009007976399841046, 'n_steps': 136, 'batch_size': 90, 'gae_lambda': 0.844792149791187, 'ent_coef': 0.01224290712008484, 'vf_coef': 0.18610150501175993, 'clip_range': 0.1614964151682085, 'max_grad_norm': 0.11770872148087524}. Best is trial 1 with value: -1700.057099092567.


Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

[I 2025-03-02 18:17:18,912] Trial 4 finished with value: -159.381109133015 and parameters: {'learning_rate': 0.0003697306188106052, 'n_steps': 387, 'batch_size': 66, 'gae_lambda': 0.9864871207861574, 'ent_coef': 0.00573213020170553, 'vf_coef': 0.11751975297839251, 'clip_range': 0.15589429178226066, 'max_grad_norm': 0.15605445238155394}. Best is trial 4 with value: -159.381109133015.


Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

[I 2025-03-02 18:20:00,794] Trial 2 finished with value: -9682.97743724463 and parameters: {'learning_rate': 3.79369352402382e-05, 'n_steps': 57, 'batch_size': 88, 'gae_lambda': 0.9900708949909685, 'ent_coef': 0.003357943505417913, 'vf_coef': 0.877113536064356, 'clip_range': 0.25255524408202423, 'max_grad_norm': 0.6601193001281527}. Best is trial 4 with value: -159.381109133015.
[I 2025-03-02 18:20:09,479] Trial 6 finished with value: -425.34065364470206 and parameters: {'learning_rate': 7.923360508186175e-05, 'n_steps': 116, 'batch_size': 68, 'gae_lambda': 0.8927862123909321, 'ent_coef': 0.0069990692116141395, 'vf_coef': 0.1658496559602054, 'clip_range': 0.22477209427901212, 'max_grad_norm': 0.2585568364327615}. Best is trial 4 with value: -159.381109133015.


Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

seeds:   0%|          | 0/30 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

[I 2025-03-02 18:47:05,857] Trial 9 finished with value: -1357.1734271638243 and parameters: {'learning_rate': 0.0005296812024818421, 'n_steps': 95, 'batch_size': 108, 'gae_lambda': 0.9898635510122622, 'ent_coef': 0.0140331560907602, 'vf_coef': 0.5109813998274576, 'clip_range': 0.214744327351942, 'max_grad_norm': 0.6180647245332661}. Best is trial 4 with value: -159.381109133015.


Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

seeds:   0%|          | 0/30 [00:00<?, ?it/s]

[I 2025-03-02 18:59:07,987] Trial 8 finished with value: -2612.580105158454 and parameters: {'learning_rate': 0.00028715568166124756, 'n_steps': 46, 'batch_size': 41, 'gae_lambda': 0.8346685362419539, 'ent_coef': 0.0006254583421351811, 'vf_coef': 0.3988437835309965, 'clip_range': 0.10845180788992524, 'max_grad_norm': 0.3661814619306646}. Best is trial 4 with value: -159.381109133015.


seeds:   0%|          | 0/30 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

seeds:   0%|          | 0/30 [00:00<?, ?it/s]

### Adding expert knowledge to the masking function to make learning more efficient

In [None]:
# | export


def is_action_safe(self, action: int) -> bool:
    """Check whether `action` keeps the battery strictly inside its safe range.

    The state of charge the action would lead to is the current state of
    charge plus the charge amount the environment reports for `action`.

    Args:
        action: Index of the discrete action to check.

    Returns:
        True when the resulting state of charge lies strictly between the two
        bounds of `self._battery_safe_range`; False otherwise, including when
        it lands exactly on one of the bounds.
    """
    delta = self._charge_amount(action)
    projected_soc = self._current_state_of_charge + delta
    lower_bound, upper_bound = self._battery_safe_range
    # Both comparisons are strict: hitting a bound exactly counts as unsafe.
    return lower_bound < projected_soc < upper_bound


def expert_knowledge_action_masks(self) -> np.ndarray:
    """Build the action mask that combines validity with the battery safety rule.

    An action is enabled only when `self._is_action_valid` accepts it and
    `self.is_action_safe` accepts it as well; the safety check is skipped for
    actions that are already invalid.

    Returns:
        A boolean array with one entry per discrete action. If no action
        passes both checks, the entry at the middle index is enabled so the
        mask is never all-False (presumably the "do nothing" action; confirm
        against ElectricityMarketEnv).
    """
    n_actions = self.action_space.n
    allowed = np.zeros(n_actions, dtype=bool)
    for candidate in range(n_actions):
        if self._is_action_valid(candidate) and self.is_action_safe(candidate):
            allowed[candidate] = True
    if not allowed.any():
        # Fallback: always leave exactly one action selectable.
        allowed[len(allowed) // 2] = True
    return allowed

In [None]:
# | hide
# Monkey-patch ElectricityMarketEnv at class level: replace its `action_masks`
# with the expert-knowledge mask defined in this notebook.
ElectricityMarketEnv.action_masks = expert_knowledge_action_masks
# Attach `is_action_safe` as a method too, because the expert mask calls it
# on `self`.
ElectricityMarketEnv.is_action_safe = is_action_safe



### Evaluation of MaskableRandomAgent with Expert Knowledge on ElectricityMarketEnv

In [None]:
# | hide
# Random-policy baseline, created after ElectricityMarketEnv.action_masks was
# overridden with the expert-knowledge mask earlier in this notebook
# (presumably the agent samples only from actions that mask allows; confirm
# in MaskableRandomAgent).
expert_maskable_random_agent = MaskableRandomAgent(
    render_mode="human", env_config=ENV_CONFIG, name="ExpertMaskableRandomAgent"
)

# No training step for this agent: evaluate it directly and store the result
# under the agent's name in the shared evaluation_data_per_agent dict.
evaluation_data_per_agent[expert_maskable_random_agent.name] = (
    expert_maskable_random_agent.evaluate()
)

seeds:   0%|          | 0/30 [00:00<?, ?it/s]

### Evaluation of MaskablePPO with default hyperparameters and Expert Knowledge on ElectricityMarketEnv

In [None]:
# | hide
# MaskablePPO agent, created after ElectricityMarketEnv.action_masks was
# overridden with the expert-knowledge mask earlier in this notebook.
expert_maskable_ppo_agent = MaskablePPOAgent(
    render_mode="human", env_config=ENV_CONFIG, name="ExpertMaskablePPOAgent"
)

# Train without running optimize() first, i.e. no hyperparameter search is
# done for this agent.
expert_maskable_ppo_agent.train()


# Persist the trained model only outside quick (CI) mode.
if not QUICK_MODE:
    expert_maskable_ppo_agent.save_model(f"{expert_maskable_ppo_agent.name}.model")

# Evaluate the trained agent and store the result under the agent's name in
# the shared evaluation_data_per_agent dict.
evaluation_data_per_agent[expert_maskable_ppo_agent.name] = (
    expert_maskable_ppo_agent.evaluate()
)

seeds:   0%|          | 0/30 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

seeds:   0%|          | 0/30 [00:00<?, ?it/s]

### Evaluation of MaskablePPO with optimized hyperparameters and Expert Knowledge on ElectricityMarketEnv


In [None]:
# | hide
# MaskablePPO agent whose hyperparameters are searched before training,
# created after ElectricityMarketEnv.action_masks was overridden with the
# expert-knowledge mask earlier in this notebook.
optimized_expert_maskable_ppo_agent = MaskablePPOAgent(
    render_mode="human", env_config=ENV_CONFIG, name="OptimizedExpertMaskablePPOAgent"
)
# Run the hyperparameter search (the captured output below shows an Optuna
# study named after the agent).
optimized_expert_maskable_ppo_agent.optimize()

# Write the hyperparameters to a YAML file named after the agent; skipped in
# quick (CI) mode.
if not QUICK_MODE:
    optimized_expert_maskable_ppo_agent.export_hyperparameters(
        f"{optimized_expert_maskable_ppo_agent.name}.yaml"
    )

# NOTE(review): presumably train() picks up the hyperparameters found by
# optimize(); confirm in MaskablePPOAgent.
optimized_expert_maskable_ppo_agent.train()

# Persist the trained model only outside quick (CI) mode.
if not QUICK_MODE:
    optimized_expert_maskable_ppo_agent.save_model(
        f"{optimized_expert_maskable_ppo_agent.name}.model"
    )

# Evaluate the trained agent and store the result under the agent's name in
# the shared evaluation_data_per_agent dict.
evaluation_data_per_agent[optimized_expert_maskable_ppo_agent.name] = (
    optimized_expert_maskable_ppo_agent.evaluate()
)

[I 2025-03-02 19:38:40,365] A new study created in RDB with name: OptimizedExpertMaskablePPOAgent


  0%|          | 0/10 [00:00<?, ?it/s]

seeds:   0%|          | 0/30 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

seeds:   0%|          | 0/30 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

seeds:   0%|          | 0/30 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

seeds:   0%|          | 0/30 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

seeds:   0%|          | 0/30 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

seeds:   0%|          | 0/30 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

seeds:   0%|          | 0/30 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

seeds:   0%|          | 0/30 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

seeds:   0%|          | 0/30 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

seeds:   0%|          | 0/30 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

[I 2025-03-03 00:26:49,740] Trial 4 finished with value: -17520.799999999996 and parameters: {'learning_rate': 3.758008930883173e-05, 'n_steps': 319, 'batch_size': 242, 'gae_lambda': 0.9771001833286587, 'ent_coef': 0.018594125969999622, 'vf_coef': 0.7455807690984315, 'clip_range': 0.2351316479638441, 'max_grad_norm': 0.797234262485128}. Best is trial 4 with value: -17520.799999999996.


seeds:   0%|          | 0/30 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

seeds:   0%|          | 0/30 [00:00<?, ?it/s]

[I 2025-03-03 00:46:09,252] Trial 2 finished with value: -17518.436128881844 and parameters: {'learning_rate': 3.35833243912162e-05, 'n_steps': 116, 'batch_size': 192, 'gae_lambda': 0.8674583020043023, 'ent_coef': 0.007935015225616017, 'vf_coef': 0.656559360548311, 'clip_range': 0.1420952983056653, 'max_grad_norm': 0.5266661158166861}. Best is trial 2 with value: -17518.436128881844.


seeds:   0%|          | 0/30 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

[I 2025-03-03 01:11:22,679] Trial 1 finished with value: -17515.66136008465 and parameters: {'learning_rate': 3.940847543548621e-05, 'n_steps': 268, 'batch_size': 99, 'gae_lambda': 0.9118970205690571, 'ent_coef': 0.005413585866677664, 'vf_coef': 0.17486847995716487, 'clip_range': 0.24176666076828096, 'max_grad_norm': 0.21207910007381486}. Best is trial 1 with value: -17515.66136008465.


Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

seeds:   0%|          | 0/30 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

[I 2025-03-03 02:08:54,788] Trial 0 finished with value: -15928.375476046856 and parameters: {'learning_rate': 3.092952404201151e-05, 'n_steps': 146, 'batch_size': 62, 'gae_lambda': 0.987861048112466, 'ent_coef': 0.011296527212628396, 'vf_coef': 0.9633867557909092, 'clip_range': 0.13454499916877788, 'max_grad_norm': 0.2993995130923701}. Best is trial 0 with value: -15928.375476046856.


Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

seeds:   0%|          | 0/30 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

[I 2025-03-03 02:32:30,408] Trial 5 finished with value: -8501.912046704478 and parameters: {'learning_rate': 0.00025489981868470835, 'n_steps': 38, 'batch_size': 192, 'gae_lambda': 0.8416755028586217, 'ent_coef': 0.01922902601246564, 'vf_coef': 0.5011744081253736, 'clip_range': 0.28727392489707493, 'max_grad_norm': 0.41848691795954784}. Best is trial 5 with value: -8501.912046704478.


Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

seeds:   0%|          | 0/30 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

[I 2025-03-03 03:05:08,268] Trial 3 finished with value: -17499.82791260961 and parameters: {'learning_rate': 0.0004479550976773692, 'n_steps': 216, 'batch_size': 34, 'gae_lambda': 0.8453612246441569, 'ent_coef': 0.017514690527727403, 'vf_coef': 0.4792628966087221, 'clip_range': 0.10215341589616643, 'max_grad_norm': 0.2554352072858279}. Best is trial 5 with value: -8501.912046704478.


Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

seeds:   0%|          | 0/30 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

seeds:   0%|          | 0/30 [00:00<?, ?it/s]

[I 2025-03-03 03:27:01,507] Trial 9 finished with value: -16365.784196277169 and parameters: {'learning_rate': 0.0008754113953125168, 'n_steps': 872, 'batch_size': 200, 'gae_lambda': 0.9084107926374108, 'ent_coef': 0.01362649896362265, 'vf_coef': 0.9875714679848316, 'clip_range': 0.15300503794949846, 'max_grad_norm': 0.4363374452198766}. Best is trial 5 with value: -8501.912046704478.


Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

[I 2025-03-03 03:30:24,965] Trial 6 finished with value: -14143.715910605508 and parameters: {'learning_rate': 0.00016451012905195706, 'n_steps': 114, 'batch_size': 23, 'gae_lambda': 0.9867199494218222, 'ent_coef': 0.010148094506326717, 'vf_coef': 0.67441731357266, 'clip_range': 0.1123490624764338, 'max_grad_norm': 0.10046815269287063}. Best is trial 5 with value: -8501.912046704478.


Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

seeds:   0%|          | 0/30 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

[I 2025-03-03 03:43:18,814] Trial 8 finished with value: -17517.57827213949 and parameters: {'learning_rate': 3.4550892027772096e-05, 'n_steps': 519, 'batch_size': 43, 'gae_lambda': 0.803786025813684, 'ent_coef': 0.008949539577471173, 'vf_coef': 0.23615859101744172, 'clip_range': 0.2862536661181533, 'max_grad_norm': 0.14082152973099402}. Best is trial 5 with value: -8501.912046704478.


Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

seeds:   0%|          | 0/30 [00:00<?, ?it/s]

[I 2025-03-03 03:46:26,279] Trial 7 finished with value: -16365.601534662588 and parameters: {'learning_rate': 0.0008503178405514465, 'n_steps': 515, 'batch_size': 16, 'gae_lambda': 0.9473881136841314, 'ent_coef': 0.000996887501408048, 'vf_coef': 0.8428982922127194, 'clip_range': 0.24784501235908254, 'max_grad_norm': 0.4017039539652397}. Best is trial 5 with value: -8501.912046704478.


seeds:   0%|          | 0/30 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

Training episodes:   0%|          | 0/3 [00:00<?, ?it/s]

seeds:   0%|          | 0/30 [00:00<?, ?it/s]

In [None]:
# | hide
# Persist the module-level `evaluation_data_per_agent` dict to disk.
# Skipped in quick mode (the CI configuration), so quick runs leave no
# pickle file behind in the working directory.
if not QUICK_MODE:
    evaluation_pickle_path = Path("evaluation_data_per_agent.pkl")
    with evaluation_pickle_path.open("wb") as pickle_file:
        pickle.dump(evaluation_data_per_agent, pickle_file)

In [None]:
# | hide
from nbdev import nbdev_export

# Run nbdev's export step for this project's notebooks.
nbdev_export()