## Experimenting different exploration strategies
By adjusting action noise (sigma)

In [3]:
import numpy as np
import torch
from envs.aquaculture_env import AquacultureEnv
from utils.plot_callback import PlotCallback
from stable_baselines3 import TD3
from stable_baselines3.common.env_checker import check_env
from stable_baselines3.common.noise import NormalActionNoise

print("CUDA available:", torch.cuda.is_available())

base_env = AquacultureEnv(region="north_sulawesi")
check_env(base_env)

noise_sigmas = [0.05, 0.10, 0.20]

for sigma in noise_sigmas:
    env = AquacultureEnv(region="north_sulawesi")
    n_actions = env.action_space.shape[-1]

    action_noise = NormalActionNoise(
        mean=np.zeros(n_actions),
        sigma=np.ones(n_actions) * sigma
    )

    cb = PlotCallback(
        window=1,
        save_path=f"td3_rewards_sigma_{sigma:.2f}.png",
        title=f"TD3 Training (σ={sigma:.2f})"
    )

    model = TD3(
        "MlpPolicy",
        env,
        action_noise=action_noise,
        verbose=1,
        tensorboard_log=f"./aqua_tensorboard/sigma_{sigma:.2f}",
        learning_rate=1e-3,
        batch_size=256,
        gamma=0.99,
        device="cuda"
    )

    print(f"\n=== Training with σ = {sigma:.2f} ===")
    model.learn(total_timesteps=180 * 300, callback=cb)

CUDA available: True
Using cuda device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.

=== Training with σ = 0.05 ===
Logging to ./aqua_tensorboard/sigma_0.05\TD3_3




---------------------------------
| rollout/           |          |
|    ep_len_mean     | 180      |
|    ep_rew_mean     | 47       |
| time/              |          |
|    episodes        | 4        |
|    fps             | 95       |
|    time_elapsed    | 7        |
|    total_timesteps | 720      |
| train/             |          |
|    actor_loss      | -2.09    |
|    critic_loss     | 0.148    |
|    learning_rate   | 0.001    |
|    n_updates       | 619      |
---------------------------------
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 180      |
|    ep_rew_mean     | 17.5     |
| time/              |          |
|    episodes        | 8        |
|    fps             | 91       |
|    time_elapsed    | 15       |
|    total_timesteps | 1440     |
| train/             |          |
|    actor_loss      | -4.36    |
|    critic_loss     | 0.329    |
|    learning_rate   | 0.001    |
|    n_updates       | 1339     |
--------------

In [22]:
import numpy as np
import torch
import optuna
import os
from datetime import datetime
from stable_baselines3 import TD3
from stable_baselines3.common.callbacks import EvalCallback
from stable_baselines3.common.noise import NormalActionNoise
from envs.aquaculture_env import AquacultureEnv
from utils.plot_callback import PlotCallback

PLOT_DIR = "plots"
os.makedirs(PLOT_DIR, exist_ok=True)
RUN_ID   = datetime.now().strftime("%Y%m%d_%H%M%S")

FIXED_NOISE_SCALE = 0.20

eval_env = AquacultureEnv(region="north_sulawesi")
eval_cb  = EvalCallback(
    eval_env,
    n_eval_episodes=20,
    eval_freq=180 * 5,
    deterministic=True
)

def objective(trial: optuna.Trial) -> float:
    lr    = trial.suggest_float("learning_rate", 1e-4,   1e-3,  log=True)
    gamma = trial.suggest_float("gamma",          0.98,   0.999)
    tau   = trial.suggest_float("tau",            1e-4,   1e-3,  log=True)
    batch = trial.suggest_categorical("batch_size", [128, 256])
    net   = trial.suggest_categorical(
                "net_arch",
                [[256, 256], [400, 300], [512, 512, 256]]
            )
    
    env = AquacultureEnv(region="north_sulawesi")
    n_actions = env.action_space.shape[-1]
    action_noise = NormalActionNoise(
        mean=np.zeros(n_actions),
        sigma=np.ones(n_actions) * FIXED_NOISE_SCALE
    )
    
    model = TD3(
        "MlpPolicy",
        env,
        learning_rate=lr,
        gamma=gamma,
        tau=tau,
        batch_size=batch,
        target_policy_noise=FIXED_NOISE_SCALE,
        target_noise_clip=FIXED_NOISE_SCALE,
        policy_kwargs=dict(net_arch=net),
        action_noise=action_noise,
        verbose=0,
        tensorboard_log="./aqua_tensorboard",
        device="cuda" if torch.cuda.is_available() else "cpu",
    )

    fname = f"{RUN_ID}_trial{trial.number:02d}_rewards.png"
    save_path = os.path.join(PLOT_DIR, fname)

    plot_cb = PlotCallback(
        window=1,
        save_path=save_path,
        title=f"Trial {trial.number} Rewards"
    )

    model.learn(180 * 100, callback=[eval_cb, plot_cb])
    return eval_cb.best_mean_reward


study = optuna.create_study(
    direction="maximize",
    sampler=optuna.samplers.TPESampler()
)
study.optimize(objective, n_trials=10, timeout=3 * 3600)

print("Best value:", study.best_value)
print("Best params:", study.best_params)

[I 2025-04-25 02:01:26,286] A new study created in memory with name: no-name-9d607ae2-751a-4ad4-8e3f-3bfe607221fb


Eval num_timesteps=900, episode_reward=22.10 +/- 1.91
Episode length: 180.00 +/- 0.00
New best mean reward!
Eval num_timesteps=1800, episode_reward=104.98 +/- 0.90
Episode length: 180.00 +/- 0.00
New best mean reward!
Eval num_timesteps=2700, episode_reward=123.31 +/- 0.99
Episode length: 180.00 +/- 0.00
New best mean reward!
Eval num_timesteps=3600, episode_reward=133.00 +/- 0.96
Episode length: 180.00 +/- 0.00
New best mean reward!
Eval num_timesteps=4500, episode_reward=143.41 +/- 0.86
Episode length: 180.00 +/- 0.00
New best mean reward!
Eval num_timesteps=5400, episode_reward=159.97 +/- 0.83
Episode length: 180.00 +/- 0.00
New best mean reward!
Eval num_timesteps=6300, episode_reward=157.02 +/- 0.86
Episode length: 180.00 +/- 0.00
Eval num_timesteps=7200, episode_reward=139.08 +/- 1.30
Episode length: 180.00 +/- 0.00
Eval num_timesteps=8100, episode_reward=145.64 +/- 0.88
Episode length: 180.00 +/- 0.00
Eval num_timesteps=9000, episode_reward=156.33 +/- 0.82
Episode length: 180.00

[I 2025-04-25 02:07:09,304] Trial 0 finished with value: 164.0621950901259 and parameters: {'learning_rate': 0.0006271977619625876, 'gamma': 0.9978334421101541, 'tau': 0.00045369660541676556, 'batch_size': 256, 'net_arch': [256, 256]}. Best is trial 0 with value: 164.0621950901259.


Eval num_timesteps=18000, episode_reward=135.05 +/- 0.95
Episode length: 180.00 +/- 0.00
✅ Training curve saved to: plots\20250425_020126_trial00_rewards.png
📊 Total reward: 11794.75
📉 Reward variation (std dev): 29.75
Eval num_timesteps=900, episode_reward=97.15 +/- 1.38
Episode length: 180.00 +/- 0.00
Eval num_timesteps=1800, episode_reward=149.69 +/- 1.32
Episode length: 180.00 +/- 0.00
Eval num_timesteps=2700, episode_reward=153.26 +/- 0.99
Episode length: 180.00 +/- 0.00
Eval num_timesteps=3600, episode_reward=143.45 +/- 1.86
Episode length: 180.00 +/- 0.00
Eval num_timesteps=4500, episode_reward=134.35 +/- 1.05
Episode length: 180.00 +/- 0.00
Eval num_timesteps=5400, episode_reward=147.24 +/- 1.25
Episode length: 180.00 +/- 0.00
Eval num_timesteps=6300, episode_reward=126.37 +/- 1.64
Episode length: 180.00 +/- 0.00
Eval num_timesteps=7200, episode_reward=119.70 +/- 1.12
Episode length: 180.00 +/- 0.00
Eval num_timesteps=8100, episode_reward=129.32 +/- 1.31
Episode length: 180.00 

[I 2025-04-25 02:13:54,678] Trial 1 finished with value: 164.0621950901259 and parameters: {'learning_rate': 0.0008970062217683625, 'gamma': 0.990594816317376, 'tau': 0.0001379818654885925, 'batch_size': 256, 'net_arch': [512, 512, 256]}. Best is trial 0 with value: 164.0621950901259.


Eval num_timesteps=18000, episode_reward=142.69 +/- 1.92
Episode length: 180.00 +/- 0.00
✅ Training curve saved to: plots\20250425_020126_trial01_rewards.png
📊 Total reward: 11640.05
📉 Reward variation (std dev): 19.04
Eval num_timesteps=900, episode_reward=29.57 +/- 1.93
Episode length: 180.00 +/- 0.00
Eval num_timesteps=1800, episode_reward=82.10 +/- 1.35
Episode length: 180.00 +/- 0.00
Eval num_timesteps=2700, episode_reward=109.84 +/- 1.26
Episode length: 180.00 +/- 0.00
Eval num_timesteps=3600, episode_reward=119.52 +/- 1.60
Episode length: 180.00 +/- 0.00
Eval num_timesteps=4500, episode_reward=118.70 +/- 1.56
Episode length: 180.00 +/- 0.00
Eval num_timesteps=5400, episode_reward=131.54 +/- 1.22
Episode length: 180.00 +/- 0.00
Eval num_timesteps=6300, episode_reward=128.77 +/- 1.23
Episode length: 180.00 +/- 0.00
Eval num_timesteps=7200, episode_reward=137.28 +/- 1.02
Episode length: 180.00 +/- 0.00
Eval num_timesteps=8100, episode_reward=159.76 +/- 1.12
Episode length: 180.00 +

[I 2025-04-25 02:20:43,964] Trial 2 finished with value: 168.76820621352527 and parameters: {'learning_rate': 0.00034611380503877517, 'gamma': 0.9904807274700478, 'tau': 0.00018948152964641272, 'batch_size': 256, 'net_arch': [512, 512, 256]}. Best is trial 2 with value: 168.76820621352527.


Eval num_timesteps=18000, episode_reward=165.73 +/- 1.18
Episode length: 180.00 +/- 0.00
✅ Training curve saved to: plots\20250425_020126_trial02_rewards.png
📊 Total reward: 12018.75
📉 Reward variation (std dev): 27.09
Eval num_timesteps=900, episode_reward=48.08 +/- 1.76
Episode length: 180.00 +/- 0.00
Eval num_timesteps=1800, episode_reward=57.37 +/- 1.62
Episode length: 180.00 +/- 0.00
Eval num_timesteps=2700, episode_reward=62.30 +/- 1.69
Episode length: 180.00 +/- 0.00
Eval num_timesteps=3600, episode_reward=57.09 +/- 1.20
Episode length: 180.00 +/- 0.00
Eval num_timesteps=4500, episode_reward=58.18 +/- 1.90
Episode length: 180.00 +/- 0.00
Eval num_timesteps=5400, episode_reward=59.83 +/- 1.54
Episode length: 180.00 +/- 0.00
Eval num_timesteps=6300, episode_reward=86.62 +/- 1.46
Episode length: 180.00 +/- 0.00
Eval num_timesteps=7200, episode_reward=121.18 +/- 1.17
Episode length: 180.00 +/- 0.00
Eval num_timesteps=8100, episode_reward=81.49 +/- 1.45
Episode length: 180.00 +/- 0.0

[I 2025-04-25 02:27:06,193] Trial 3 finished with value: 181.24490643223982 and parameters: {'learning_rate': 0.00018913354014134178, 'gamma': 0.9877576938330571, 'tau': 0.00010552262741371088, 'batch_size': 256, 'net_arch': [256, 256]}. Best is trial 3 with value: 181.24490643223982.


Eval num_timesteps=18000, episode_reward=172.14 +/- 0.47
Episode length: 180.00 +/- 0.00
✅ Training curve saved to: plots\20250425_020126_trial03_rewards.png
📊 Total reward: 10882.02
📉 Reward variation (std dev): 36.64
Eval num_timesteps=900, episode_reward=49.49 +/- 1.53
Episode length: 180.00 +/- 0.00
Eval num_timesteps=1800, episode_reward=70.46 +/- 1.28
Episode length: 180.00 +/- 0.00
Eval num_timesteps=2700, episode_reward=51.10 +/- 2.48
Episode length: 180.00 +/- 0.00
Eval num_timesteps=3600, episode_reward=124.95 +/- 1.54
Episode length: 180.00 +/- 0.00
Eval num_timesteps=4500, episode_reward=149.07 +/- 0.66
Episode length: 180.00 +/- 0.00
Eval num_timesteps=5400, episode_reward=145.51 +/- 1.19
Episode length: 180.00 +/- 0.00
Eval num_timesteps=6300, episode_reward=148.07 +/- 1.00
Episode length: 180.00 +/- 0.00
Eval num_timesteps=7200, episode_reward=140.08 +/- 0.78
Episode length: 180.00 +/- 0.00
Eval num_timesteps=8100, episode_reward=130.70 +/- 1.40
Episode length: 180.00 +/

[I 2025-04-25 02:33:53,895] Trial 4 finished with value: 181.24490643223982 and parameters: {'learning_rate': 0.0009773747200034217, 'gamma': 0.9909576095641196, 'tau': 0.00017950031503255874, 'batch_size': 256, 'net_arch': [512, 512, 256]}. Best is trial 3 with value: 181.24490643223982.


Eval num_timesteps=18000, episode_reward=81.99 +/- 2.34
Episode length: 180.00 +/- 0.00
✅ Training curve saved to: plots\20250425_020126_trial04_rewards.png
📊 Total reward: 10570.96
📉 Reward variation (std dev): 24.26
Eval num_timesteps=900, episode_reward=72.14 +/- 1.11
Episode length: 180.00 +/- 0.00
Eval num_timesteps=1800, episode_reward=131.74 +/- 1.21
Episode length: 180.00 +/- 0.00
Eval num_timesteps=2700, episode_reward=134.27 +/- 0.82
Episode length: 180.00 +/- 0.00
Eval num_timesteps=3600, episode_reward=143.06 +/- 1.57
Episode length: 180.00 +/- 0.00
Eval num_timesteps=4500, episode_reward=136.47 +/- 1.63
Episode length: 180.00 +/- 0.00
Eval num_timesteps=5400, episode_reward=147.14 +/- 0.94
Episode length: 180.00 +/- 0.00
Eval num_timesteps=6300, episode_reward=152.74 +/- 0.91
Episode length: 180.00 +/- 0.00
Eval num_timesteps=7200, episode_reward=111.04 +/- 1.08
Episode length: 180.00 +/- 0.00
Eval num_timesteps=8100, episode_reward=100.90 +/- 1.01
Episode length: 180.00 +

[I 2025-04-25 02:40:42,662] Trial 5 finished with value: 181.24490643223982 and parameters: {'learning_rate': 0.00035098180568504265, 'gamma': 0.9837028049390338, 'tau': 0.0009659869309588304, 'batch_size': 256, 'net_arch': [512, 512, 256]}. Best is trial 3 with value: 181.24490643223982.


Eval num_timesteps=18000, episode_reward=144.97 +/- 1.08
Episode length: 180.00 +/- 0.00
✅ Training curve saved to: plots\20250425_020126_trial05_rewards.png
📊 Total reward: 11724.91
📉 Reward variation (std dev): 16.95
Eval num_timesteps=900, episode_reward=27.04 +/- 1.77
Episode length: 180.00 +/- 0.00
Eval num_timesteps=1800, episode_reward=109.38 +/- 1.58
Episode length: 180.00 +/- 0.00
Eval num_timesteps=2700, episode_reward=123.62 +/- 1.06
Episode length: 180.00 +/- 0.00
Eval num_timesteps=3600, episode_reward=126.56 +/- 1.30
Episode length: 180.00 +/- 0.00
Eval num_timesteps=4500, episode_reward=130.35 +/- 1.70
Episode length: 180.00 +/- 0.00
Eval num_timesteps=5400, episode_reward=133.64 +/- 1.50
Episode length: 180.00 +/- 0.00
Eval num_timesteps=6300, episode_reward=118.21 +/- 1.45
Episode length: 180.00 +/- 0.00
Eval num_timesteps=7200, episode_reward=116.89 +/- 0.98
Episode length: 180.00 +/- 0.00
Eval num_timesteps=8100, episode_reward=36.12 +/- 4.71
Episode length: 180.00 +

[I 2025-04-25 02:47:35,178] Trial 6 finished with value: 181.24490643223982 and parameters: {'learning_rate': 0.000134628908155563, 'gamma': 0.9836258156404177, 'tau': 0.00010110069420307066, 'batch_size': 128, 'net_arch': [512, 512, 256]}. Best is trial 3 with value: 181.24490643223982.


Eval num_timesteps=18000, episode_reward=157.50 +/- 1.18
Episode length: 180.00 +/- 0.00
✅ Training curve saved to: plots\20250425_020126_trial06_rewards.png
📊 Total reward: 10339.19
📉 Reward variation (std dev): 29.38
Eval num_timesteps=900, episode_reward=56.62 +/- 1.07
Episode length: 180.00 +/- 0.00
Eval num_timesteps=1800, episode_reward=51.30 +/- 1.51
Episode length: 180.00 +/- 0.00
Eval num_timesteps=2700, episode_reward=133.33 +/- 0.95
Episode length: 180.00 +/- 0.00
Eval num_timesteps=3600, episode_reward=142.21 +/- 0.92
Episode length: 180.00 +/- 0.00
Eval num_timesteps=4500, episode_reward=138.06 +/- 1.29
Episode length: 180.00 +/- 0.00
Eval num_timesteps=5400, episode_reward=136.27 +/- 0.88
Episode length: 180.00 +/- 0.00
Eval num_timesteps=6300, episode_reward=96.32 +/- 0.94
Episode length: 180.00 +/- 0.00
Eval num_timesteps=7200, episode_reward=128.62 +/- 1.75
Episode length: 180.00 +/- 0.00
Eval num_timesteps=8100, episode_reward=92.78 +/- 2.13
Episode length: 180.00 +/-

[I 2025-04-25 02:53:45,429] Trial 7 finished with value: 181.24490643223982 and parameters: {'learning_rate': 0.00026873682207451826, 'gamma': 0.9926244017930383, 'tau': 0.00024638152862258055, 'batch_size': 256, 'net_arch': [400, 300]}. Best is trial 3 with value: 181.24490643223982.


Eval num_timesteps=18000, episode_reward=141.97 +/- 0.83
Episode length: 180.00 +/- 0.00
✅ Training curve saved to: plots\20250425_020126_trial07_rewards.png
📊 Total reward: 12312.75
📉 Reward variation (std dev): 31.42
Eval num_timesteps=900, episode_reward=54.59 +/- 1.75
Episode length: 180.00 +/- 0.00
Eval num_timesteps=1800, episode_reward=44.53 +/- 1.31
Episode length: 180.00 +/- 0.00
Eval num_timesteps=2700, episode_reward=108.74 +/- 1.13
Episode length: 180.00 +/- 0.00
Eval num_timesteps=3600, episode_reward=124.00 +/- 1.07
Episode length: 180.00 +/- 0.00
Eval num_timesteps=4500, episode_reward=128.02 +/- 1.16
Episode length: 180.00 +/- 0.00
Eval num_timesteps=5400, episode_reward=132.46 +/- 0.78
Episode length: 180.00 +/- 0.00
Eval num_timesteps=6300, episode_reward=136.37 +/- 0.78
Episode length: 180.00 +/- 0.00
Eval num_timesteps=7200, episode_reward=140.64 +/- 1.35
Episode length: 180.00 +/- 0.00
Eval num_timesteps=8100, episode_reward=146.70 +/- 0.82
Episode length: 180.00 +

[I 2025-04-25 02:59:38,302] Trial 8 finished with value: 181.24490643223982 and parameters: {'learning_rate': 0.00012071371362115884, 'gamma': 0.9895423786874181, 'tau': 0.00011548526523647217, 'batch_size': 256, 'net_arch': [400, 300]}. Best is trial 3 with value: 181.24490643223982.


Eval num_timesteps=18000, episode_reward=138.67 +/- 1.26
Episode length: 180.00 +/- 0.00
✅ Training curve saved to: plots\20250425_020126_trial08_rewards.png
📊 Total reward: 11099.67
📉 Reward variation (std dev): 24.01
Eval num_timesteps=900, episode_reward=29.89 +/- 1.83
Episode length: 180.00 +/- 0.00
Eval num_timesteps=1800, episode_reward=122.91 +/- 1.07
Episode length: 180.00 +/- 0.00
Eval num_timesteps=2700, episode_reward=152.10 +/- 0.79
Episode length: 180.00 +/- 0.00
Eval num_timesteps=3600, episode_reward=153.09 +/- 1.06
Episode length: 180.00 +/- 0.00
Eval num_timesteps=4500, episode_reward=167.17 +/- 0.91
Episode length: 180.00 +/- 0.00
Eval num_timesteps=5400, episode_reward=144.91 +/- 1.24
Episode length: 180.00 +/- 0.00
Eval num_timesteps=6300, episode_reward=155.16 +/- 1.54
Episode length: 180.00 +/- 0.00
Eval num_timesteps=7200, episode_reward=146.29 +/- 1.54
Episode length: 180.00 +/- 0.00
Eval num_timesteps=8100, episode_reward=146.36 +/- 0.70
Episode length: 180.00 

[I 2025-04-25 03:05:31,753] Trial 9 finished with value: 181.24490643223982 and parameters: {'learning_rate': 0.0003554717159908246, 'gamma': 0.9822910707292413, 'tau': 0.000742908718306906, 'batch_size': 128, 'net_arch': [400, 300]}. Best is trial 3 with value: 181.24490643223982.


Eval num_timesteps=18000, episode_reward=143.67 +/- 1.18
Episode length: 180.00 +/- 0.00
✅ Training curve saved to: plots\20250425_020126_trial09_rewards.png
📊 Total reward: 12821.56
📉 Reward variation (std dev): 25.55
Best value: 181.24490643223982
Best params: {'learning_rate': 0.00018913354014134178, 'gamma': 0.9877576938330571, 'tau': 0.00010552262741371088, 'batch_size': 256, 'net_arch': [256, 256]}


In [25]:
from envs.aquaculture_env import AquacultureEnv
from utils.plot_callback import PlotCallback
from stable_baselines3 import TD3
from stable_baselines3.common.noise import NormalActionNoise
from stable_baselines3.common.env_checker import check_env
from stable_baselines3.common.callbacks import BaseCallback
import torch
import numpy as np
import matplotlib.pyplot as plt
import os

print("CUDA Available:", torch.cuda.is_available())

env = AquacultureEnv(region="north_sulawesi")
check_env(env)

FIXED_NOISE_SCALE = 0.20
n_actions = env.action_space.shape[0]
action_noise = NormalActionNoise(mean=np.zeros(n_actions), sigma=FIXED_NOISE_SCALE * np.ones(n_actions))

best_params = {
    "learning_rate": 0.0002,
    "gamma": 0.9878,
    "tau": 0.0001,
    "batch_size": 256,
    "net_arch": [256, 256]
}

model = TD3(
    "MlpPolicy",
    env,
    action_noise=action_noise,
    verbose=1,
    tensorboard_log="./aqua_tensorboard",
    learning_rate=best_params["learning_rate"],
    gamma=best_params["gamma"],
    tau=best_params["tau"],
    batch_size=best_params["batch_size"],
    policy_kwargs=dict(net_arch=best_params["net_arch"]),
    device="cuda" if torch.cuda.is_available() else "cpu"
)

plot_cb = PlotCallback(window=1, save_path="plots/td3_training_rewards.png", title="TD3 Training Rewards (σ=0.20)")
model.learn(total_timesteps=180 * 300, callback=plot_cb)

model_save_path = "./saved_model/td3_best_model"
os.makedirs(os.path.dirname(model_save_path), exist_ok=True)
model.save(model_save_path)
print(f"💾 Model saved to {model_save_path}")

CUDA Available: True
Using cuda device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
Logging to ./aqua_tensorboard\TD3_3




---------------------------------
| rollout/           |          |
|    ep_len_mean     | 180      |
|    ep_rew_mean     | 33.5     |
| time/              |          |
|    episodes        | 4        |
|    fps             | 103      |
|    time_elapsed    | 6        |
|    total_timesteps | 720      |
| train/             |          |
|    actor_loss      | -0.492   |
|    critic_loss     | 0.227    |
|    learning_rate   | 0.0002   |
|    n_updates       | 619      |
---------------------------------
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 180      |
|    ep_rew_mean     | 36.8     |
| time/              |          |
|    episodes        | 8        |
|    fps             | 93       |
|    time_elapsed    | 15       |
|    total_timesteps | 1440     |
| train/             |          |
|    actor_loss      | -0.541   |
|    critic_loss     | 0.213    |
|    learning_rate   | 0.0002   |
|    n_updates       | 1339     |
--------------

In [26]:
import time
from utils.calculation import Calculation

# Reset environment
obs, _ = env.reset()
terminated = False
truncated = False

# Accumulators
total_reward         = 0.0
total_value_gain     = 0.0
total_feed_cost      = 0.0
total_heat_cost      = 0.0
total_oxygen_cost    = 0.0

print("Action space:", env.action_space)

while not (terminated or truncated):
    action, _ = model.predict(obs, deterministic=True)
    raw = env.denormalize(obs)
    biomass, fish_count, temp, do_level, uia = raw
    feed_amount = Calculation.compute_feed_weight(action[0], biomass)

    print(f"""\n--- Day {env.day + 1} ---
Raw-obs: biomass={biomass:.1f} g, count={fish_count:.0f}, temp={temp:.2f}°C, DO={do_level:.2f} mg/L, UIA={uia:.3f} mg/L
Action: feed_rate={action[0]:.3f} → feed_amt={feed_amount:.2f} g, temp_set={action[1]:.3f}, aeration_rate={action[2]:.2f} mg/L
""")

    obs, reward, terminated, truncated, info = env.step(action)
    total_reward      += reward
    total_value_gain  += info['fish_value']
    total_feed_cost   += info['feed_cost']
    total_heat_cost   += info['heat_cost']
    total_oxygen_cost += info['oxygenation_cost']

    print(f"""Reward Breakdown:
Fish Value Gain:     {info['fish_value']:.2f}
Feed Cost:           {info['feed_cost']:.2f}
Heat Cost:           {info['heat_cost']:.2f}
Oxygenation Cost:    {info['oxygenation_cost']:.2f}
→ Net Reward:        {info['reward']:.2f}
""")

    env.render()

    if hasattr(env, 'exit_requested') and env.exit_requested:
        break

    time.sleep(0.01)

env.close()

# Final totals
print("\n=== Episode Summary ===")
print(f"Total Fish Value Gain:  {total_value_gain:.2f}")
print(f"Total Feed Cost:        {total_feed_cost:.2f}")
print(f"Total Heat Cost:        {total_heat_cost:.2f}")
print(f"Total Oxygen Cost:      {total_oxygen_cost:.2f}")
print(f"Total Net Reward:       {total_reward:.2f}")

Action space: Box([ 0.  24.   0.3], [ 1. 40.  1.], (3,), float32)

--- Day 1 ---
Raw-obs: biomass=1421.5 g, count=100, temp=24.93°C, DO=0.60 mg/L, UIA=0.060 mg/L
Action: feed_rate=0.617 → feed_amt=87.76 g, temp_set=39.998, aeration_rate=1.00 mg/L

Reward Breakdown:
Fish Value Gain:     0.61
Feed Cost:           0.11
Heat Cost:           0.10
Oxygenation Cost:    0.01
→ Net Reward:        0.39


--- Day 2 ---
Raw-obs: biomass=1520.1 g, count=100, temp=28.52°C, DO=1.00 mg/L, UIA=0.060 mg/L
Action: feed_rate=0.553 → feed_amt=84.08 g, temp_set=39.964, aeration_rate=1.00 mg/L

Reward Breakdown:
Fish Value Gain:     0.81
Feed Cost:           0.11
Heat Cost:           0.10
Oxygenation Cost:    0.01
→ Net Reward:        0.59


--- Day 3 ---
Raw-obs: biomass=1651.2 g, count=100, temp=31.03°C, DO=1.00 mg/L, UIA=0.060 mg/L
Action: feed_rate=0.739 → feed_amt=121.98 g, temp_set=34.767, aeration_rate=1.00 mg/L

Reward Breakdown:
Fish Value Gain:     0.90
Feed Cost:           0.15
Heat Cost:         