# electricity_market_player

> Train, tune (with Optuna), and evaluate a `MaskablePPO` agent on the `ElectricityMarketEnv` environment.

In [None]:
#| default_exp electricity_market_player

In [None]:
#| hide
from nbdev.showdoc import *

In [None]:
#| export
import optuna
from sb3_contrib import MaskablePPO
from sb3_contrib.common.maskable.policies import MaskableActorCriticPolicy
from sb3_contrib.common.wrappers import ActionMasker
from sb3_contrib.common.maskable.evaluation import evaluate_policy
from stable_baselines3.common.monitor import Monitor
from stable_baselines3.common.vec_env import DummyVecEnv

from electricity_market.electricity_market_env import ElectricityMarketEnv


In [None]:
#| export
def mask_fn(env):
    """Return the action mask reported by `env`.

    Used as the mask callback handed to `ActionMasker`; it simply forwards
    to the environment's own `action_masks()` method.
    """
    current_mask = env.action_masks()
    return current_mask


# Configuration handed to ElectricityMarketEnv; an empty dict is passed as-is.
env_config = {}

# NOTE(review): this cell carries `#| export`, so nbdev will presumably copy this
# training run into the generated module and execute it on import — confirm intended.

# Baseline run: MaskablePPO with library-default hyperparameters on a single
# monitored, action-masked environment.
env = DummyVecEnv(
    [lambda: Monitor(ActionMasker(ElectricityMarketEnv(env_config), mask_fn))]
)
model = MaskablePPO(
    policy=MaskableActorCriticPolicy,
    env=env,
    seed=123456,
    verbose=0,
)

print("Training")
model.learn(total_timesteps=10_000, use_masking=True)

print("Evaluation")
mean_reward, std_reward = evaluate_policy(model=model, env=env, n_eval_episodes=10)
print(f"mean_reward:{mean_reward:.2f} +/- {std_reward:.2f}")

Training
Evaluation
mean_reward:15043267.07 +/- 66641.68


In [None]:
#| export
def optimize_agent(trial):
    """Optuna objective: train a MaskablePPO agent and score it.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial object used to sample `learning_rate`, `n_steps` and
        `batch_size` (all on a log scale).

    Returns
    -------
    float
        Mean episode reward over 10 deterministic evaluation episodes,
        which the study maximizes.
    """
    # Define hyperparameters to optimize. The suggestion names must match
    # MaskablePPO keyword arguments, because the best trial's params are
    # later unpacked straight into the MaskablePPO constructor.
    learning_rate = trial.suggest_float('learning_rate', 1e-5, 1e-1, log=True)
    n_steps = trial.suggest_int('n_steps', 16, 2048, log=True)
    batch_size = trial.suggest_int('batch_size', 16, 256, log=True)

    # Create a fresh environment for this trial
    env = DummyVecEnv([lambda: Monitor(ActionMasker(ElectricityMarketEnv(env_config), mask_fn))])

    try:
        # Create the MaskablePPO model with suggested hyperparameters
        model = MaskablePPO(
            MaskableActorCriticPolicy,
            env,
            learning_rate=learning_rate,
            n_steps=n_steps,
            batch_size=batch_size,
            verbose=0,
            seed=123456
        )

        # Train the model (masking made explicit, matching the other cells)
        model.learn(total_timesteps=int(1e5), use_masking=True)

        # Evaluate the model using action masking
        mean_reward, _ = evaluate_policy(
            model, env, n_eval_episodes=10, deterministic=True, use_masking=True
        )
    finally:
        # Release the per-trial environment even if training or evaluation
        # fails; otherwise every trial leaves an unclosed environment behind.
        env.close()

    return mean_reward

# Set up Optuna study.
# The sampler is seeded (same seed as the models) so that the sequence of
# suggested hyperparameters is repeatable across fresh kernel runs. TPESampler
# is the sampler Optuna uses by default for single-objective studies, so only
# the seeding changes, not the search algorithm.
# NOTE(review): this cell carries `#| export`, so nbdev will presumably copy the
# study into the generated module and run it on import — confirm intended.
study = optuna.create_study(
    direction="maximize",
    sampler=optuna.samplers.TPESampler(seed=123456),
)
study.optimize(optimize_agent, n_trials=10)

# Print best trial
print("Best trial:", study.best_trial)

[I 2025-02-09 00:31:02,471] A new study created in memory with name: no-name-12667c86-4a68-45eb-ad42-fc168dc62ae8
[I 2025-02-09 00:32:07,232] Trial 0 finished with value: 20043323.9037663 and parameters: {'learning_rate': 0.01879331175017602, 'n_steps': 601, 'batch_size': 66}. Best is trial 0 with value: 20043323.9037663.
[I 2025-02-09 00:33:06,966] Trial 1 finished with value: 14970182.111348 and parameters: {'learning_rate': 1.3617257402106057e-05, 'n_steps': 180, 'batch_size': 142}. Best is trial 0 with value: 20043323.9037663.
[I 2025-02-09 00:34:29,021] Trial 2 finished with value: 17070417.523969598 and parameters: {'learning_rate': 0.001832540881813732, 'n_steps': 28, 'batch_size': 29}. Best is trial 0 with value: 20043323.9037663.
[I 2025-02-09 00:35:24,500] Trial 3 finished with value: 17010377.707562797 and parameters: {'learning_rate': 0.003259978663445662, 'n_steps': 154, 'batch_size': 235}. Best is trial 0 with value: 20043323.9037663.
[I 2025-02-09 00:36:27,582] Trial 4 f

Best trial: FrozenTrial(number=0, state=1, values=[20043323.9037663], datetime_start=datetime.datetime(2025, 2, 9, 0, 31, 2, 472342), datetime_complete=datetime.datetime(2025, 2, 9, 0, 32, 7, 232559), params={'learning_rate': 0.01879331175017602, 'n_steps': 601, 'batch_size': 66}, user_attrs={}, system_attrs={}, intermediate_values={}, distributions={'learning_rate': FloatDistribution(high=0.1, log=True, low=1e-05, step=None), 'n_steps': IntDistribution(high=2048, log=True, low=16, step=1), 'batch_size': IntDistribution(high=256, log=True, low=16, step=1)}, trial_id=0, value=None)


In [None]:
#| export
# Re-train with the hyperparameters of the best Optuna trial, reusing the
# module-level `env`, then evaluate exactly like the baseline run.
# NOTE(review): this cell carries `#| export`, so nbdev will presumably copy this
# run into the generated module and execute it on import — confirm intended.
model = MaskablePPO(
    policy=MaskableActorCriticPolicy,
    env=env,
    seed=123456,
    verbose=0,
    **study.best_trial.params,
)

print("Training")
model.learn(total_timesteps=10_000, use_masking=True)

print("Evaluation")
mean_reward, std_reward = evaluate_policy(model=model, env=env, n_eval_episodes=10)
print(f"mean_reward:{mean_reward:.2f} +/- {std_reward:.2f}")

Training
Evaluation
mean_reward:16380868.28 +/- 58081.60


In [None]:
#| hide
import nbdev

# Run nbdev's notebook export step.
nbdev.nbdev_export()