In [None]:
# Import necessary libraries
from gymnasium import Env
from gymnasium.spaces import Discrete, Box
import numpy as np
from stable_baselines3 import DQN
from stable_baselines3.common.env_checker import check_env
import torch
from envs.aquaculture_env import AquacultureEnv
from utils.plot_callback import PlotCallback


# Discretized environment to handle continuous action space
class DiscretizedAquacultureEnv(Env):
    def __init__(self, region="guangdong"):
        self.base_env = AquacultureEnv(region=region)
        
        # Discretize the action space
        self.feed_bins = 40
        self.temp_bins = 16
        self.air_bins = 10

        self.discrete_actions = [
            (feed, temp, air)
            for feed in np.linspace(self.base_env.action_space.low[0], self.base_env.action_space.high[0], self.feed_bins)
            for temp in np.linspace(self.base_env.action_space.low[1], self.base_env.action_space.high[1], self.temp_bins)
            for air in np.linspace(self.base_env.action_space.low[2], self.base_env.action_space.high[2], self.air_bins)
        ]
        
        self.action_space = Discrete(len(self.discrete_actions))
        self.observation_space = self.base_env.observation_space

    def reset(self, **kwargs):
        obs, info = self.base_env.reset(**kwargs)
        return obs, info

    def step(self, action_idx):
        action = np.array(self.discrete_actions[action_idx], dtype=np.float32)
        obs, reward, terminated, truncated, info = self.base_env.step(action)
        return obs, reward, terminated, truncated, info

    def render(self, mode='human'):
        return self.base_env.render()

    def close(self):
        self.base_env.close()


In [2]:
# Check if CUDA is available for model training
print(torch.cuda.is_available())

# Create the environment and validate it
env = DiscretizedAquacultureEnv(region="north_sulawesi")
check_env(env, warn=True)

# Initialize the DQN model
for eps in [0.05, 0.01, 0.001]:
    plot_cb = PlotCallback(
        window=10,
        save_path=f"training_rewards_eps_{eps}.png",
        title=f"Training Rewards (Final Îµ={eps})"
    )

    model = DQN(
        "MlpPolicy",
        env,
        verbose=1,
        tensorboard_log=f"./aqua_tensorboard_dqn_eps_{eps}",
        learning_rate=1e-4,
        batch_size=256,
        gamma=0.99,
        exploration_final_eps=eps,
        device="cuda"
    )
    print(f"\nðŸš€ Training with exploration_final_eps = {eps}")
    model.learn(total_timesteps=180 * 10, callback=plot_cb)

False
Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.

ðŸš€ Training with exploration_final_eps = 0.05
Logging to ./aqua_tensorboard_dqn_eps_0.05\DQN_4
----------------------------------
| rollout/            |          |
|    ep_len_mean      | 180      |
|    ep_rew_mean      | -18.1    |
|    exploration_rate | 0.05     |
| time/               |          |
|    episodes         | 4        |
|    fps              | 419      |
|    time_elapsed     | 1        |
|    total_timesteps  | 720      |
| train/              |          |
|    learning_rate    | 0.0001   |
|    loss             | 0.252    |
|    n_updates        | 154      |
----------------------------------
----------------------------------
| rollout/            |          |
|    ep_len_mean      | 180      |
|    ep_rew_mean      | -13.6    |
|    exploration_rate | 0.05     |
| time/               |          |
|    episodes         | 8        |
|    fps              | 423     

In [8]:
import optuna
import os
from datetime import datetime
from stable_baselines3.common.callbacks import EvalCallback
from stable_baselines3.common.env_checker import check_env
from utils.plot_callback import PlotCallback

PLOT_DIR = "plots"
os.makedirs(PLOT_DIR, exist_ok=True)
RUN_ID   = datetime.now().strftime("%Y%m%d_%H%M%S")

eval_env = DiscretizedAquacultureEnv(region="north_sulawesi")
eval_cb  = EvalCallback(
    eval_env,
    n_eval_episodes=20,
    eval_freq=180 * 5,
    deterministic=True
)

def objective(trial: optuna.Trial) -> float:
    lr     = trial.suggest_float("learning_rate", 1e-4, 5e-4, log=True)
    gamma  = trial.suggest_float("gamma",           0.98,  0.995)
    batch  = trial.suggest_categorical("batch_size", [32, 64])
    buf    = trial.suggest_categorical("buffer_size", [500_000, 1000_000])
    tgt_i  = trial.suggest_categorical("target_update_interval", [5000, 10000])
    net    = trial.suggest_categorical(
                "net_arch",
                [[64, 64], [128, 128]]
             )
    env = DiscretizedAquacultureEnv(region="north_sulawesi")

    model = DQN(
        "MlpPolicy",
        env,
        learning_rate=lr,
        gamma=gamma,
        batch_size=batch,
        buffer_size=buf,
        target_update_interval=tgt_i,
        exploration_final_eps=0.01,
        policy_kwargs=dict(net_arch=net),
        verbose=0,
        tensorboard_log="./aqua_tensorboard",
        device="cuda" if torch.cuda.is_available() else "cpu",
    )

    plot_cb = PlotCallback(
        window=1,
        save_path=os.path.join(PLOT_DIR, f"{RUN_ID}_trial{trial.number:02d}_rewards.png"),
        title=f"DQN Trial {trial.number}"
    )

    model.learn(180 * 100, callback=[eval_cb, plot_cb])
    return eval_cb.best_mean_reward

study = optuna.create_study(direction="maximize", sampler=optuna.samplers.TPESampler())
study.optimize(objective, n_trials=16, timeout=3 * 3600)

print("Best value:", study.best_value)
print("Best params:", study.best_params)

[I 2025-04-25 17:29:10,986] A new study created in memory with name: no-name-d305c081-5c76-40a2-a3bb-2cc59fc639f8


Eval num_timesteps=900, episode_reward=0.10 +/- 0.97
Episode length: 180.00 +/- 0.00
New best mean reward!
Eval num_timesteps=1800, episode_reward=0.01 +/- 0.99
Episode length: 180.00 +/- 0.00
Eval num_timesteps=2700, episode_reward=5.53 +/- 2.14
Episode length: 180.00 +/- 0.00
New best mean reward!
Eval num_timesteps=3600, episode_reward=58.48 +/- 1.45
Episode length: 180.00 +/- 0.00
New best mean reward!
Eval num_timesteps=4500, episode_reward=101.09 +/- 2.08
Episode length: 180.00 +/- 0.00
New best mean reward!
Eval num_timesteps=5400, episode_reward=101.60 +/- 1.54
Episode length: 180.00 +/- 0.00
New best mean reward!
Eval num_timesteps=6300, episode_reward=101.10 +/- 1.41
Episode length: 180.00 +/- 0.00
Eval num_timesteps=7200, episode_reward=55.85 +/- 2.27
Episode length: 180.00 +/- 0.00
Eval num_timesteps=8100, episode_reward=10.95 +/- 3.02
Episode length: 180.00 +/- 0.00
Eval num_timesteps=9000, episode_reward=-14.89 +/- 3.98
Episode length: 180.00 +/- 0.00
Eval num_timesteps=9

[I 2025-04-25 17:30:31,746] Trial 0 finished with value: 180.81518916226923 and parameters: {'learning_rate': 0.00010290358907837275, 'gamma': 0.9826490514542094, 'batch_size': 32, 'buffer_size': 1000000, 'target_update_interval': 5000, 'net_arch': [128, 128]}. Best is trial 0 with value: 180.81518916226923.


Eval num_timesteps=18000, episode_reward=38.78 +/- 3.93
Episode length: 180.00 +/- 0.00
âœ… Training curve saved to: plots\20250425_172910_trial00_rewards.png
ðŸ“Š Total reward: 7957.21
ðŸ“‰ Reward variation (std dev): 61.83
Eval num_timesteps=900, episode_reward=-105.45 +/- 2.48
Episode length: 180.00 +/- 0.00
Eval num_timesteps=1800, episode_reward=24.47 +/- 1.47
Episode length: 180.00 +/- 0.00
Eval num_timesteps=2700, episode_reward=32.06 +/- 1.31
Episode length: 180.00 +/- 0.00
Eval num_timesteps=3600, episode_reward=-83.21 +/- 1.93
Episode length: 180.00 +/- 0.00
Eval num_timesteps=4500, episode_reward=20.98 +/- 1.66
Episode length: 180.00 +/- 0.00
Eval num_timesteps=5400, episode_reward=-195.85 +/- 2.64
Episode length: 180.00 +/- 0.00
Eval num_timesteps=6300, episode_reward=15.62 +/- 1.92
Episode length: 180.00 +/- 0.00
Eval num_timesteps=7200, episode_reward=-21.02 +/- 3.24
Episode length: 180.00 +/- 0.00
Eval num_timesteps=8100, episode_reward=-1.31 +/- 1.99
Episode length: 180

[I 2025-04-25 17:31:55,459] Trial 1 finished with value: 180.81518916226923 and parameters: {'learning_rate': 0.00013078368361118422, 'gamma': 0.9871982329503499, 'batch_size': 64, 'buffer_size': 500000, 'target_update_interval': 10000, 'net_arch': [128, 128]}. Best is trial 0 with value: 180.81518916226923.


Eval num_timesteps=18000, episode_reward=119.74 +/- 1.05
Episode length: 180.00 +/- 0.00
âœ… Training curve saved to: plots\20250425_172910_trial01_rewards.png
ðŸ“Š Total reward: 2639.53
ðŸ“‰ Reward variation (std dev): 76.50
Eval num_timesteps=900, episode_reward=47.53 +/- 1.46
Episode length: 180.00 +/- 0.00
Eval num_timesteps=1800, episode_reward=28.36 +/- 2.40
Episode length: 180.00 +/- 0.00
Eval num_timesteps=2700, episode_reward=29.51 +/- 2.15
Episode length: 180.00 +/- 0.00
Eval num_timesteps=3600, episode_reward=9.98 +/- 2.53
Episode length: 180.00 +/- 0.00
Eval num_timesteps=4500, episode_reward=-73.76 +/- 5.67
Episode length: 180.00 +/- 0.00
Eval num_timesteps=5400, episode_reward=80.44 +/- 2.11
Episode length: 180.00 +/- 0.00
Eval num_timesteps=6300, episode_reward=32.54 +/- 5.20
Episode length: 180.00 +/- 0.00
Eval num_timesteps=7200, episode_reward=-35.65 +/- 2.04
Episode length: 180.00 +/- 0.00
Eval num_timesteps=8100, episode_reward=-58.13 +/- 3.32
Episode length: 180.00

[I 2025-04-25 17:33:18,729] Trial 2 finished with value: 186.3059869751334 and parameters: {'learning_rate': 0.00011556060083557823, 'gamma': 0.9949461829904909, 'batch_size': 64, 'buffer_size': 1000000, 'target_update_interval': 5000, 'net_arch': [128, 128]}. Best is trial 2 with value: 186.3059869751334.


Eval num_timesteps=18000, episode_reward=171.06 +/- 0.78
Episode length: 180.00 +/- 0.00
âœ… Training curve saved to: plots\20250425_172910_trial02_rewards.png
ðŸ“Š Total reward: 8020.43
ðŸ“‰ Reward variation (std dev): 77.06
Eval num_timesteps=900, episode_reward=36.86 +/- 0.19
Episode length: 180.00 +/- 0.00
Eval num_timesteps=1800, episode_reward=29.23 +/- 2.49
Episode length: 180.00 +/- 0.00
Eval num_timesteps=2700, episode_reward=-113.69 +/- 3.00
Episode length: 180.00 +/- 0.00
Eval num_timesteps=3600, episode_reward=-184.11 +/- 10.90
Episode length: 180.00 +/- 0.00
Eval num_timesteps=4500, episode_reward=17.18 +/- 1.96
Episode length: 180.00 +/- 0.00
Eval num_timesteps=5400, episode_reward=-39.32 +/- 3.44
Episode length: 180.00 +/- 0.00
Eval num_timesteps=6300, episode_reward=-89.28 +/- 3.81
Episode length: 180.00 +/- 0.00
Eval num_timesteps=7200, episode_reward=-126.04 +/- 6.38
Episode length: 180.00 +/- 0.00
Eval num_timesteps=8100, episode_reward=-84.02 +/- 27.47
Episode lengt

[I 2025-04-25 17:34:30,722] Trial 3 finished with value: 186.3059869751334 and parameters: {'learning_rate': 0.00033430456481121045, 'gamma': 0.9917043138355404, 'batch_size': 32, 'buffer_size': 500000, 'target_update_interval': 10000, 'net_arch': [64, 64]}. Best is trial 2 with value: 186.3059869751334.


Eval num_timesteps=18000, episode_reward=118.07 +/- 1.02
Episode length: 180.00 +/- 0.00
âœ… Training curve saved to: plots\20250425_172910_trial03_rewards.png
ðŸ“Š Total reward: 298.16
ðŸ“‰ Reward variation (std dev): 70.48
Eval num_timesteps=900, episode_reward=-139.57 +/- 3.35
Episode length: 180.00 +/- 0.00
Eval num_timesteps=1800, episode_reward=-52.15 +/- 3.41
Episode length: 180.00 +/- 0.00
Eval num_timesteps=2700, episode_reward=-16.07 +/- 2.42
Episode length: 180.00 +/- 0.00
Eval num_timesteps=3600, episode_reward=0.80 +/- 2.42
Episode length: 180.00 +/- 0.00
Eval num_timesteps=4500, episode_reward=-32.13 +/- 2.39
Episode length: 180.00 +/- 0.00
Eval num_timesteps=5400, episode_reward=20.29 +/- 1.43
Episode length: 180.00 +/- 0.00
Eval num_timesteps=6300, episode_reward=10.56 +/- 1.28
Episode length: 180.00 +/- 0.00
Eval num_timesteps=7200, episode_reward=-236.89 +/- 3.01
Episode length: 180.00 +/- 0.00
Eval num_timesteps=8100, episode_reward=51.13 +/- 2.34
Episode length: 180

[I 2025-04-25 17:35:43,309] Trial 4 finished with value: 186.3059869751334 and parameters: {'learning_rate': 0.00018404149583961342, 'gamma': 0.9859821217639309, 'batch_size': 32, 'buffer_size': 500000, 'target_update_interval': 10000, 'net_arch': [64, 64]}. Best is trial 2 with value: 186.3059869751334.


Eval num_timesteps=18000, episode_reward=-77.78 +/- 6.69
Episode length: 180.00 +/- 0.00
âœ… Training curve saved to: plots\20250425_172910_trial04_rewards.png
ðŸ“Š Total reward: -873.56
ðŸ“‰ Reward variation (std dev): 61.85
Eval num_timesteps=900, episode_reward=-18.31 +/- 2.35
Episode length: 180.00 +/- 0.00
Eval num_timesteps=1800, episode_reward=108.82 +/- 1.37
Episode length: 180.00 +/- 0.00
Eval num_timesteps=2700, episode_reward=57.52 +/- 4.68
Episode length: 180.00 +/- 0.00
Eval num_timesteps=3600, episode_reward=-75.13 +/- 6.99
Episode length: 180.00 +/- 0.00
Eval num_timesteps=4500, episode_reward=31.89 +/- 4.19
Episode length: 180.00 +/- 0.00
Eval num_timesteps=5400, episode_reward=-1.27 +/- 4.50
Episode length: 180.00 +/- 0.00
Eval num_timesteps=6300, episode_reward=-56.97 +/- 28.55
Episode length: 180.00 +/- 0.00
Eval num_timesteps=7200, episode_reward=-1.08 +/- 1.96
Episode length: 180.00 +/- 0.00
Eval num_timesteps=8100, episode_reward=-15.63 +/- 1.94
Episode length: 18

[I 2025-04-25 17:36:55,220] Trial 5 finished with value: 186.3059869751334 and parameters: {'learning_rate': 0.0004260295172958155, 'gamma': 0.9868218818271846, 'batch_size': 32, 'buffer_size': 500000, 'target_update_interval': 10000, 'net_arch': [64, 64]}. Best is trial 2 with value: 186.3059869751334.


Eval num_timesteps=18000, episode_reward=174.93 +/- 0.80
Episode length: 180.00 +/- 0.00
âœ… Training curve saved to: plots\20250425_172910_trial05_rewards.png
ðŸ“Š Total reward: 4352.11
ðŸ“‰ Reward variation (std dev): 68.38
Eval num_timesteps=900, episode_reward=7.83 +/- 2.48
Episode length: 180.00 +/- 0.00
Eval num_timesteps=1800, episode_reward=98.05 +/- 1.18
Episode length: 180.00 +/- 0.00
Eval num_timesteps=2700, episode_reward=5.42 +/- 2.58
Episode length: 180.00 +/- 0.00
Eval num_timesteps=3600, episode_reward=100.98 +/- 1.36
Episode length: 180.00 +/- 0.00
Eval num_timesteps=4500, episode_reward=25.03 +/- 9.65
Episode length: 180.00 +/- 0.00
Eval num_timesteps=5400, episode_reward=-28.45 +/- 4.22
Episode length: 180.00 +/- 0.00
Eval num_timesteps=6300, episode_reward=3.25 +/- 2.50
Episode length: 180.00 +/- 0.00
Eval num_timesteps=7200, episode_reward=107.93 +/- 2.04
Episode length: 180.00 +/- 0.00
Eval num_timesteps=8100, episode_reward=-30.92 +/- 3.88
Episode length: 180.00 

[I 2025-04-25 17:38:19,948] Trial 6 finished with value: 186.3059869751334 and parameters: {'learning_rate': 0.00014076983870370925, 'gamma': 0.9924411770042789, 'batch_size': 64, 'buffer_size': 1000000, 'target_update_interval': 10000, 'net_arch': [128, 128]}. Best is trial 2 with value: 186.3059869751334.


Eval num_timesteps=18000, episode_reward=145.88 +/- 1.11
Episode length: 180.00 +/- 0.00
âœ… Training curve saved to: plots\20250425_172910_trial06_rewards.png
ðŸ“Š Total reward: 5083.13
ðŸ“‰ Reward variation (std dev): 54.22
Eval num_timesteps=900, episode_reward=18.49 +/- 0.58
Episode length: 180.00 +/- 0.00
Eval num_timesteps=1800, episode_reward=28.65 +/- 1.84
Episode length: 180.00 +/- 0.00
Eval num_timesteps=2700, episode_reward=54.36 +/- 0.89
Episode length: 180.00 +/- 0.00
Eval num_timesteps=3600, episode_reward=108.61 +/- 0.96
Episode length: 180.00 +/- 0.00
Eval num_timesteps=4500, episode_reward=90.41 +/- 0.67
Episode length: 180.00 +/- 0.00
Eval num_timesteps=5400, episode_reward=81.73 +/- 2.83
Episode length: 180.00 +/- 0.00
Eval num_timesteps=6300, episode_reward=89.41 +/- 0.73
Episode length: 180.00 +/- 0.00
Eval num_timesteps=7200, episode_reward=8.03 +/- 3.04
Episode length: 180.00 +/- 0.00
Eval num_timesteps=8100, episode_reward=-10.47 +/- 29.14
Episode length: 180.00

[I 2025-04-25 17:39:32,763] Trial 7 finished with value: 186.3059869751334 and parameters: {'learning_rate': 0.00017050885266499573, 'gamma': 0.9820074338497919, 'batch_size': 32, 'buffer_size': 1000000, 'target_update_interval': 10000, 'net_arch': [64, 64]}. Best is trial 2 with value: 186.3059869751334.


âœ… Training curve saved to: plots\20250425_172910_trial07_rewards.png
ðŸ“Š Total reward: 5736.90
ðŸ“‰ Reward variation (std dev): 50.59
Eval num_timesteps=900, episode_reward=-1.80 +/- 1.31
Episode length: 180.00 +/- 0.00
Eval num_timesteps=1800, episode_reward=13.92 +/- 2.31
Episode length: 180.00 +/- 0.00
Eval num_timesteps=2700, episode_reward=-122.15 +/- 2.97
Episode length: 180.00 +/- 0.00
Eval num_timesteps=3600, episode_reward=-91.98 +/- 6.68
Episode length: 180.00 +/- 0.00
Eval num_timesteps=4500, episode_reward=63.46 +/- 1.53
Episode length: 180.00 +/- 0.00
Eval num_timesteps=5400, episode_reward=-16.74 +/- 3.64
Episode length: 180.00 +/- 0.00
Eval num_timesteps=6300, episode_reward=34.68 +/- 3.88
Episode length: 180.00 +/- 0.00
Eval num_timesteps=7200, episode_reward=32.23 +/- 2.12
Episode length: 180.00 +/- 0.00
Eval num_timesteps=8100, episode_reward=-65.72 +/- 3.91
Episode length: 180.00 +/- 0.00
Eval num_timesteps=9000, episode_reward=61.72 +/- 1.79
Episode length: 180.0

[I 2025-04-25 17:40:54,950] Trial 8 finished with value: 186.3059869751334 and parameters: {'learning_rate': 0.00026569943815810506, 'gamma': 0.9944819981776538, 'batch_size': 64, 'buffer_size': 500000, 'target_update_interval': 10000, 'net_arch': [128, 128]}. Best is trial 2 with value: 186.3059869751334.


Eval num_timesteps=18000, episode_reward=123.71 +/- 9.48
Episode length: 180.00 +/- 0.00
âœ… Training curve saved to: plots\20250425_172910_trial08_rewards.png
ðŸ“Š Total reward: 4009.29
ðŸ“‰ Reward variation (std dev): 74.51
Eval num_timesteps=900, episode_reward=26.43 +/- 1.95
Episode length: 180.00 +/- 0.00
Eval num_timesteps=1800, episode_reward=57.22 +/- 1.70
Episode length: 180.00 +/- 0.00
Eval num_timesteps=2700, episode_reward=-59.49 +/- 2.35
Episode length: 180.00 +/- 0.00
Eval num_timesteps=3600, episode_reward=78.05 +/- 1.40
Episode length: 180.00 +/- 0.00
Eval num_timesteps=4500, episode_reward=81.13 +/- 3.14
Episode length: 180.00 +/- 0.00
Eval num_timesteps=5400, episode_reward=55.39 +/- 7.27
Episode length: 180.00 +/- 0.00
Eval num_timesteps=6300, episode_reward=-136.67 +/- 3.21
Episode length: 180.00 +/- 0.00
Eval num_timesteps=7200, episode_reward=-27.03 +/- 3.72
Episode length: 180.00 +/- 0.00
Eval num_timesteps=8100, episode_reward=11.63 +/- 2.39
Episode length: 180.

[I 2025-04-25 17:42:09,377] Trial 9 finished with value: 186.3059869751334 and parameters: {'learning_rate': 0.0003144203221988339, 'gamma': 0.9874347421668123, 'batch_size': 64, 'buffer_size': 1000000, 'target_update_interval': 10000, 'net_arch': [64, 64]}. Best is trial 2 with value: 186.3059869751334.


Eval num_timesteps=18000, episode_reward=142.30 +/- 1.03
Episode length: 180.00 +/- 0.00
âœ… Training curve saved to: plots\20250425_172910_trial09_rewards.png
ðŸ“Š Total reward: 3856.73
ðŸ“‰ Reward variation (std dev): 73.94
Eval num_timesteps=900, episode_reward=25.37 +/- 0.84
Episode length: 180.00 +/- 0.00
Eval num_timesteps=1800, episode_reward=9.88 +/- 2.87
Episode length: 180.00 +/- 0.00
Eval num_timesteps=2700, episode_reward=103.70 +/- 0.48
Episode length: 180.00 +/- 0.00
Eval num_timesteps=3600, episode_reward=41.17 +/- 2.20
Episode length: 180.00 +/- 0.00
Eval num_timesteps=4500, episode_reward=-30.96 +/- 2.31
Episode length: 180.00 +/- 0.00
Eval num_timesteps=5400, episode_reward=-87.12 +/- 3.72
Episode length: 180.00 +/- 0.00
Eval num_timesteps=6300, episode_reward=6.76 +/- 2.66
Episode length: 180.00 +/- 0.00
Eval num_timesteps=7200, episode_reward=73.00 +/- 1.40
Episode length: 180.00 +/- 0.00
Eval num_timesteps=8100, episode_reward=-54.56 +/- 13.54
Episode length: 180.0

[I 2025-04-25 17:43:33,312] Trial 10 finished with value: 186.3059869751334 and parameters: {'learning_rate': 0.00010837906104857781, 'gamma': 0.9905448964962523, 'batch_size': 64, 'buffer_size': 1000000, 'target_update_interval': 5000, 'net_arch': [128, 128]}. Best is trial 2 with value: 186.3059869751334.


Eval num_timesteps=18000, episode_reward=157.28 +/- 0.99
Episode length: 180.00 +/- 0.00
âœ… Training curve saved to: plots\20250425_172910_trial10_rewards.png
ðŸ“Š Total reward: 5764.39
ðŸ“‰ Reward variation (std dev): 90.77
Eval num_timesteps=900, episode_reward=-4.29 +/- 1.32
Episode length: 180.00 +/- 0.00
Eval num_timesteps=1800, episode_reward=3.65 +/- 2.64
Episode length: 180.00 +/- 0.00
Eval num_timesteps=2700, episode_reward=37.66 +/- 3.10
Episode length: 180.00 +/- 0.00
Eval num_timesteps=3600, episode_reward=68.56 +/- 2.70
Episode length: 180.00 +/- 0.00
Eval num_timesteps=4500, episode_reward=7.23 +/- 3.05
Episode length: 180.00 +/- 0.00
Eval num_timesteps=5400, episode_reward=58.48 +/- 2.06
Episode length: 180.00 +/- 0.00
Eval num_timesteps=6300, episode_reward=15.76 +/- 3.19
Episode length: 180.00 +/- 0.00
Eval num_timesteps=7200, episode_reward=-97.98 +/- 3.35
Episode length: 180.00 +/- 0.00
Eval num_timesteps=8100, episode_reward=109.26 +/- 1.32
Episode length: 180.00 +

[I 2025-04-25 17:44:44,228] Trial 11 finished with value: 186.3059869751334 and parameters: {'learning_rate': 0.00039155081485713983, 'gamma': 0.994868830456089, 'batch_size': 32, 'buffer_size': 500000, 'target_update_interval': 5000, 'net_arch': [64, 64]}. Best is trial 2 with value: 186.3059869751334.


Eval num_timesteps=18000, episode_reward=169.92 +/- 1.07
Episode length: 180.00 +/- 0.00
âœ… Training curve saved to: plots\20250425_172910_trial11_rewards.png
ðŸ“Š Total reward: 7285.41
ðŸ“‰ Reward variation (std dev): 75.08
Eval num_timesteps=900, episode_reward=-7.05 +/- 1.25
Episode length: 180.00 +/- 0.00
Eval num_timesteps=1800, episode_reward=-3.37 +/- 3.18
Episode length: 180.00 +/- 0.00
Eval num_timesteps=2700, episode_reward=40.01 +/- 1.75
Episode length: 180.00 +/- 0.00
Eval num_timesteps=3600, episode_reward=5.23 +/- 1.84
Episode length: 180.00 +/- 0.00
Eval num_timesteps=4500, episode_reward=12.87 +/- 4.05
Episode length: 180.00 +/- 0.00
Eval num_timesteps=5400, episode_reward=6.00 +/- 2.48
Episode length: 180.00 +/- 0.00
Eval num_timesteps=6300, episode_reward=22.15 +/- 1.70
Episode length: 180.00 +/- 0.00
Eval num_timesteps=7200, episode_reward=6.02 +/- 2.35
Episode length: 180.00 +/- 0.00
Eval num_timesteps=8100, episode_reward=121.27 +/- 0.93
Episode length: 180.00 +/-

[I 2025-04-25 17:46:04,276] Trial 12 finished with value: 186.3059869751334 and parameters: {'learning_rate': 0.00023408236598323798, 'gamma': 0.9907504394842735, 'batch_size': 32, 'buffer_size': 500000, 'target_update_interval': 5000, 'net_arch': [128, 128]}. Best is trial 2 with value: 186.3059869751334.


Eval num_timesteps=18000, episode_reward=79.11 +/- 1.71
Episode length: 180.00 +/- 0.00
âœ… Training curve saved to: plots\20250425_172910_trial12_rewards.png
ðŸ“Š Total reward: 7313.16
ðŸ“‰ Reward variation (std dev): 73.59
Eval num_timesteps=900, episode_reward=-10.17 +/- 1.70
Episode length: 180.00 +/- 0.00
Eval num_timesteps=1800, episode_reward=-80.49 +/- 3.41
Episode length: 180.00 +/- 0.00
Eval num_timesteps=2700, episode_reward=-31.38 +/- 1.75
Episode length: 180.00 +/- 0.00
Eval num_timesteps=3600, episode_reward=-113.99 +/- 4.58
Episode length: 180.00 +/- 0.00
Eval num_timesteps=4500, episode_reward=-36.91 +/- 11.15
Episode length: 180.00 +/- 0.00
Eval num_timesteps=5400, episode_reward=0.45 +/- 3.49
Episode length: 180.00 +/- 0.00
Eval num_timesteps=6300, episode_reward=-3.32 +/- 2.65
Episode length: 180.00 +/- 0.00
Eval num_timesteps=7200, episode_reward=38.28 +/- 2.08
Episode length: 180.00 +/- 0.00
Eval num_timesteps=8100, episode_reward=120.30 +/- 2.09
Episode length: 18

[I 2025-04-25 17:47:18,077] Trial 13 finished with value: 186.3059869751334 and parameters: {'learning_rate': 0.00031720836644187937, 'gamma': 0.9918376190696623, 'batch_size': 64, 'buffer_size': 1000000, 'target_update_interval': 5000, 'net_arch': [64, 64]}. Best is trial 2 with value: 186.3059869751334.


Eval num_timesteps=18000, episode_reward=153.18 +/- 1.00
Episode length: 180.00 +/- 0.00
âœ… Training curve saved to: plots\20250425_172910_trial13_rewards.png
ðŸ“Š Total reward: 6865.00
ðŸ“‰ Reward variation (std dev): 77.75
Eval num_timesteps=900, episode_reward=-144.94 +/- 2.46
Episode length: 180.00 +/- 0.00
Eval num_timesteps=1800, episode_reward=-3.38 +/- 2.46
Episode length: 180.00 +/- 0.00
Eval num_timesteps=2700, episode_reward=-26.39 +/- 2.67
Episode length: 180.00 +/- 0.00
Eval num_timesteps=3600, episode_reward=-32.63 +/- 2.50
Episode length: 180.00 +/- 0.00
Eval num_timesteps=4500, episode_reward=-25.41 +/- 12.39
Episode length: 180.00 +/- 0.00
Eval num_timesteps=5400, episode_reward=20.18 +/- 1.46
Episode length: 180.00 +/- 0.00
Eval num_timesteps=6300, episode_reward=61.05 +/- 1.77
Episode length: 180.00 +/- 0.00
Eval num_timesteps=7200, episode_reward=86.04 +/- 2.24
Episode length: 180.00 +/- 0.00
Eval num_timesteps=8100, episode_reward=90.87 +/- 1.69
Episode length: 18

[I 2025-04-25 17:48:43,187] Trial 14 finished with value: 186.3059869751334 and parameters: {'learning_rate': 0.0004956895877833771, 'gamma': 0.9932509044902064, 'batch_size': 64, 'buffer_size': 1000000, 'target_update_interval': 5000, 'net_arch': [128, 128]}. Best is trial 2 with value: 186.3059869751334.


Eval num_timesteps=18000, episode_reward=160.45 +/- 0.64
Episode length: 180.00 +/- 0.00
âœ… Training curve saved to: plots\20250425_172910_trial14_rewards.png
ðŸ“Š Total reward: 9446.17
ðŸ“‰ Reward variation (std dev): 65.00
Eval num_timesteps=900, episode_reward=-193.32 +/- 2.96
Episode length: 180.00 +/- 0.00
Eval num_timesteps=1800, episode_reward=-21.93 +/- 1.94
Episode length: 180.00 +/- 0.00
Eval num_timesteps=2700, episode_reward=-23.80 +/- 21.15
Episode length: 180.00 +/- 0.00
Eval num_timesteps=3600, episode_reward=-84.01 +/- 3.07
Episode length: 180.00 +/- 0.00
Eval num_timesteps=4500, episode_reward=-12.12 +/- 1.76
Episode length: 180.00 +/- 0.00
Eval num_timesteps=5400, episode_reward=-1.07 +/- 1.62
Episode length: 180.00 +/- 0.00
Eval num_timesteps=6300, episode_reward=71.66 +/- 2.57
Episode length: 180.00 +/- 0.00
Eval num_timesteps=7200, episode_reward=41.82 +/- 4.58
Episode length: 180.00 +/- 0.00
Eval num_timesteps=8100, episode_reward=69.14 +/- 1.44
Episode length: 1

[I 2025-04-25 17:49:55,209] Trial 15 finished with value: 186.3059869751334 and parameters: {'learning_rate': 0.00019076264179196379, 'gamma': 0.9893050542392028, 'batch_size': 32, 'buffer_size': 500000, 'target_update_interval': 5000, 'net_arch': [64, 64]}. Best is trial 2 with value: 186.3059869751334.


Eval num_timesteps=18000, episode_reward=-129.73 +/- 4.42
Episode length: 180.00 +/- 0.00
âœ… Training curve saved to: plots\20250425_172910_trial15_rewards.png
ðŸ“Š Total reward: 2862.38
ðŸ“‰ Reward variation (std dev): 87.29
Best value: 186.3059869751334
Best params: {'learning_rate': 0.00011556060083557823, 'gamma': 0.9949461829904909, 'batch_size': 64, 'buffer_size': 1000000, 'target_update_interval': 5000, 'net_arch': [128, 128]}


In [9]:
import torch
import numpy as np
import os
from stable_baselines3 import DQN
from stable_baselines3.common.env_checker import check_env
from utils.plot_callback import PlotCallback

# Check CUDA and env
print("CUDA Available:", torch.cuda.is_available())
env = DiscretizedAquacultureEnv(region="north_sulawesi")
check_env(env, warn=True)

# Best hyperparameters from tuning
best_params = {
    "learning_rate": 1e-4,
    "gamma": 0.995,
    "batch_size": 64,
    "buffer_size": 1000000,
    "target_update_interval": 5000,
    "net_arch": [128, 128],
    "exploration_final_eps": 0.01
}

# Instantiate DQN with fixed final epsilon
model = DQN(
    "MlpPolicy",
    env,
    learning_rate=best_params["learning_rate"],
    gamma=best_params["gamma"],
    batch_size=best_params["batch_size"],
    buffer_size=best_params["buffer_size"],
    target_update_interval=best_params["target_update_interval"],
    exploration_final_eps=best_params["exploration_final_eps"],
    policy_kwargs=dict(net_arch=best_params["net_arch"]),
    verbose=1,
    tensorboard_log="./aqua_tensorboard",
    device="cuda" if torch.cuda.is_available() else "cpu"
)

# Training callback
plot_cb = PlotCallback(
    window=1,
    save_path="plots/dqn_training_rewards.png",
    title=f"DQN Training Rewards (Îµ={best_params['exploration_final_eps']})"
)

# Train
model.learn(total_timesteps=180 * 300, callback=plot_cb)

# Save
model_save_path = "./saved_model/dqn_best_model"
os.makedirs(os.path.dirname(model_save_path), exist_ok=True)
model.save(model_save_path)
print(f"ðŸ’¾ Model saved to {model_save_path}")

CUDA Available: False
Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
Logging to ./aqua_tensorboard\DQN_46
----------------------------------
| rollout/            |          |
|    ep_len_mean      | 180      |
|    ep_rew_mean      | 16.8     |
|    exploration_rate | 0.868    |
| time/               |          |
|    episodes         | 4        |
|    fps              | 555      |
|    time_elapsed     | 1        |
|    total_timesteps  | 720      |
| train/              |          |
|    learning_rate    | 0.0001   |
|    loss             | 0.202    |
|    n_updates        | 154      |
----------------------------------
----------------------------------
| rollout/            |          |
|    ep_len_mean      | 180      |
|    ep_rew_mean      | 13.9     |
|    exploration_rate | 0.736    |
| time/               |          |
|    episodes         | 8        |
|    fps              | 539      |
|    time_elapsed     | 2        |
|    to

In [5]:
import time
from utils.calculation import Calculation

obs, _ = env.reset()
terminated = False
truncated = False
total_reward = 0

print("Discrete action space size:", env.action_space.n)

while not (terminated or truncated):
    action_idx, _ = model.predict(obs, deterministic=True)
    action = env.discrete_actions[action_idx]

    raw = env.base_env.denormalize(obs)
    biomass, fish_count, temp, do_level, uia = raw
    feed_amount = Calculation.compute_feed_weight(action[0], biomass)
    print(f"""\n--- Day {env.base_env.day + 1} ---
Raw-obs: biomass={biomass:.1f} g, count={fish_count:.0f}, temp={temp:.2f}Â°C, DO={do_level:.2f} mg/L, UIA={uia:.3f} mg/L
Action: feed_rate={action[0]:.3f} â†’ feed_amt={feed_amount:.2f} g, temp_set={action[1]:.3f}, aeration_rate={action[2]:.2f}mg/L
""")

    obs, reward, terminated, truncated, info = env.step(action_idx)
    total_reward += reward

    print(f"""Reward Breakdown:
Fish Value Gain:     {info['fish_value']} $
Feed Cost:           {info['feed_cost']:.2f} $
Heat Cost:           {info['heat_cost']:.2f} $
Oxygenation Cost:    {info['oxygenation_cost']:.2f} $
â†’ Net Reward:        {info['reward']:.2f} $
""")

    env.render()

    if hasattr(env.base_env, 'exit_requested') and env.base_env.exit_requested:
        break

    time.sleep(0.01)

env.close()
print("Total Reward:", total_reward)

TypeError: AquacultureEnv.__init__() got an unexpected keyword argument 'discrete'