In [4]:
# Hyperparameter grid for the SAC sweep. Each entry supplies the four values
# that the training cell reads by key: learning_rate, gamma, tau and ent_coef.
# ent_coef is either a fixed float, 'auto' (learned coefficient), or
# 'auto_<x>' (learned coefficient whose *initial value* is <x>).
hyperparams = [
    # 01: Fast learning, strong future emphasis, slow target smoothing, automatic entropy tuning
    {"learning_rate": 1e-3, "gamma": 0.99, "tau": 0.005, "ent_coef": 'auto'},

    # 02: Same as #1 but with a fixed entropy coefficient instead of a learned one
    {"learning_rate": 1e-3, "gamma": 0.99, "tau": 0.005, "ent_coef": 0.1},

    # 03: Faster target smoothing (τ=0.01), moderate fixed entropy coefficient
    {"learning_rate": 1e-3, "gamma": 0.99, "tau": 0.01,  "ent_coef": 0.05},

    # 04: Lower future emphasis (γ=0.98), slow τ, high entropy coefficient for more exploration
    {"learning_rate": 1e-3, "gamma": 0.98, "tau": 0.005, "ent_coef": 0.2},

    # 05: Lower γ, faster τ, low entropy coefficient (less exploration)
    {"learning_rate": 1e-3, "gamma": 0.98, "tau": 0.01,  "ent_coef": 0.01},

    # 06: Lower γ, faster τ, automatic entropy tuning started from an initial
    #     coefficient of 0.1 (the suffix is the starting value, not a target entropy)
    {"learning_rate": 1e-3, "gamma": 0.98, "tau": 0.01,  "ent_coef": 'auto_0.1'},

    # 07: Half learning rate, high γ, slow τ, high entropy coefficient for exploration
    {"learning_rate": 5e-4, "gamma": 0.99, "tau": 0.005, "ent_coef": 0.2},

    # 08: Half LR, high γ, faster τ, moderate entropy coefficient
    {"learning_rate": 5e-4, "gamma": 0.99, "tau": 0.01,  "ent_coef": 0.05},

    # 09: Half LR, lower γ, slow τ, automatic entropy tuning
    {"learning_rate": 5e-4, "gamma": 0.98, "tau": 0.005, "ent_coef": 'auto'},

    # 10: Half LR, γ=0.98, slow τ, low entropy coefficient for a conservative policy
    {"learning_rate": 5e-4, "gamma": 0.98, "tau": 0.005, "ent_coef": 0.01},

    # 11: Half LR, γ=0.98, faster τ, moderate entropy coefficient
    {"learning_rate": 5e-4, "gamma": 0.98, "tau": 0.01,  "ent_coef": 0.05},

    # 12: Half LR, γ=0.98, faster τ, high entropy coefficient for max exploration
    {"learning_rate": 5e-4, "gamma": 0.98, "tau": 0.01,  "ent_coef": 0.2},
]

# Every config is later indexed by exactly these keys, so a typo or a missing
# entry should fail here rather than part-way through a long training sweep.
_REQUIRED_KEYS = {"learning_rate", "gamma", "tau", "ent_coef"}

print(f"Total configs: {len(hyperparams)}\n")
for i, config in enumerate(hyperparams, 1):
    assert set(config) == _REQUIRED_KEYS, (
        f"Config #{i} has keys {sorted(config)}, expected {sorted(_REQUIRED_KEYS)}"
    )
    print(f"[Hyperparameter {i:03}] {config}")

Total configs: 12

[Hyperparameter 001] {'learning_rate': 0.001, 'gamma': 0.99, 'tau': 0.005, 'ent_coef': 'auto'}
[Hyperparameter 002] {'learning_rate': 0.001, 'gamma': 0.99, 'tau': 0.005, 'ent_coef': 0.1}
[Hyperparameter 003] {'learning_rate': 0.001, 'gamma': 0.99, 'tau': 0.01, 'ent_coef': 0.05}
[Hyperparameter 004] {'learning_rate': 0.001, 'gamma': 0.98, 'tau': 0.005, 'ent_coef': 0.2}
[Hyperparameter 005] {'learning_rate': 0.001, 'gamma': 0.98, 'tau': 0.01, 'ent_coef': 0.01}
[Hyperparameter 006] {'learning_rate': 0.001, 'gamma': 0.98, 'tau': 0.01, 'ent_coef': 'auto_0.1'}
[Hyperparameter 007] {'learning_rate': 0.0005, 'gamma': 0.99, 'tau': 0.005, 'ent_coef': 0.2}
[Hyperparameter 008] {'learning_rate': 0.0005, 'gamma': 0.99, 'tau': 0.01, 'ent_coef': 0.05}
[Hyperparameter 009] {'learning_rate': 0.0005, 'gamma': 0.98, 'tau': 0.005, 'ent_coef': 'auto'}
[Hyperparameter 010] {'learning_rate': 0.0005, 'gamma': 0.98, 'tau': 0.005, 'ent_coef': 0.01}
[Hyperparameter 011] {'learning_rate': 0.000

In [None]:
import matplotlib.pyplot as plt

from envs.aquaculture_env import AquacultureEnv
from stable_baselines3 import SAC
import torch

# --- Sweep configuration -----------------------------------------------------
EPISODES = 100                # evaluation episodes per config; also scales the training budget
TIMESTEPS = 180 * EPISODES    # 180 steps per episode (the SB3 log in this notebook reports ep_len_mean = 180)
SEED = 42                     # same seed for every config so runs are repeatable and comparable
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"  # resolved once, not per config
region = "north_sulawesi"
all_rewards = []              # all_rewards[k] holds the evaluation returns of config k+1


def _run_episode(model, env):
    """Roll out one episode with the deterministic policy and return its summed reward.

    The episode ends when the environment reports either ``terminated`` or
    ``truncated``; ignoring ``truncated`` would loop forever on an environment
    that ends episodes through a time limit.
    """
    obs, _ = env.reset()
    terminated = truncated = False
    total_r = 0
    while not (terminated or truncated):
        action, _ = model.predict(obs, deterministic=True)
        obs, reward, terminated, truncated, _ = env.step(action)
        total_r += reward
    return total_r


for idx, hp in enumerate(hyperparams, 1):
    print(f"Training config #{idx}...")
    env = AquacultureEnv(region=region)
    model = SAC(
        "MlpPolicy",
        env,
        verbose=0,
        learning_rate=hp["learning_rate"],
        gamma=hp["gamma"],
        tau=hp["tau"],
        ent_coef=hp["ent_coef"],
        batch_size=256,
        seed=SEED,
        device=DEVICE,
    )

    # learn() resets the environment itself, so no manual reset is needed first.
    model.learn(total_timesteps=TIMESTEPS)

    # Post-training evaluation: these are returns of the trained policy,
    # not the learning curve observed during training.
    rewards = [_run_episode(model, env) for _ in range(EPISODES)]
    all_rewards.append(rewards)

    # One environment is created per config; release it before the next one.
    env.close()

    # Plot and save this config's evaluation returns.
    fig, ax = plt.subplots(figsize=(12, 5))
    ax.plot(rewards, label=f"Config #{idx}")
    ax.set_xlabel("Episode")
    ax.set_ylabel("Total Reward")
    ax.set_title(f"SAC Rewards - Config #{idx}")
    ax.legend()
    ax.grid(True)
    fig.savefig(f"sac_rewards_config_{idx:02}.png")
    plt.close(fig)  # avoid accumulating open figures across the sweep

print("✅ All configs trained and saved.")


Training config #1...
Training config #2...
Training config #3...
Training config #4...
Training config #5...
Training config #6...
Training config #7...
Training config #8...
Training config #9...
Training config #10...
Training config #11...


In [None]:
from envs.aquaculture_env import AquacultureEnv
from stable_baselines3 import SAC
from stable_baselines3.common.env_checker import check_env
import torch

# Show whether a CUDA device is visible to torch before building the model.
print(torch.cuda.is_available())

# Build the environment and run SB3's interface checker on it before training.
env = AquacultureEnv(region="north_sulawesi")
check_env(env)

# Training budget expressed as episodes x steps-per-episode instead of a bare product.
STEPS_PER_EPISODE = 180   # matches the ep_len_mean that SB3 reports for this env
TRAIN_EPISODES = 300

# SAC Model
model = SAC(
    "MlpPolicy",
    env,
    verbose=1,
    tensorboard_log="./aqua_tensorboard",
    learning_rate=1e-4,
    batch_size=256,
    gamma=0.99,
    # Pick the device explicitly (same rule as the sweep cell) rather than
    # hard-coding "cuda" on a machine where the check above may print False.
    device="cuda" if torch.cuda.is_available() else "cpu",
)

# Trailing semicolon keeps the returned SAC object's repr out of the cell output.
model.learn(total_timesteps=STEPS_PER_EPISODE * TRAIN_EPISODES);

False
Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.




Logging to ./aqua_tensorboard\SAC_1
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 180      |
|    ep_rew_mean     | -49.2    |
| time/              |          |
|    episodes        | 4        |
|    fps             | 62       |
|    time_elapsed    | 11       |
|    total_timesteps | 720      |
| train/             |          |
|    actor_loss      | -6.88    |
|    critic_loss     | 0.262    |
|    ent_coef        | 0.941    |
|    ent_coef_loss   | -0.286   |
|    learning_rate   | 0.0001   |
|    n_updates       | 619      |
---------------------------------
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 180      |
|    ep_rew_mean     | -38.1    |
| time/              |          |
|    episodes        | 8        |
|    fps             | 55       |
|    time_elapsed    | 25       |
|    total_timesteps | 1440     |
| train/             |          |
|    actor_loss      | -11.5    |
|    critic_

<stable_baselines3.sac.sac.SAC at 0x1a4f2e22660>

In [3]:
import time
from utils.calculation import Calculation

# Roll out one episode with the trained policy, printing the de-normalised
# observation, the chosen action and the reward breakdown for every step.
obs, _ = env.reset()
terminated = False
truncated = False
total_reward = 0

print("Action space:", env.action_space)

try:
    while not (terminated or truncated):
        action, _states = model.predict(obs, deterministic=True)

        # Convert the observation back to raw values for display.
        raw = env.denormalize(obs)
        biomass, fish_count, temp, do_level, uia = raw
        feed_amount = Calculation.compute_feed_weight(action[0], biomass)
        print(
            f"\n--- Day {env.day + 1} ---\n"
            f"Raw-obs: biomass={biomass:.1f} g, count={fish_count:.0f}, temp={temp:.2f}°C, DO={do_level:.2f} mg/L, UIA={uia:.3f} mg/L\n"
            f"Action: feed_rate={action[0]:.3f} → feed_amt={feed_amount:.2f} g, temp_set={action[1]:.3f}, aeration_rate={action[2]:.2f}mg/L\n"
        )

        obs, reward, terminated, truncated, info = env.step(action)
        total_reward += reward

        # Every monetary line uses two decimals; fish_value previously printed
        # at full float precision (e.g. 0.14317488889083824).
        print(
            "Reward Breakdown:\n"
            f"Fish Value Gain:     {info['fish_value']:.2f} $\n"
            f"Feed Cost:           {info['feed_cost']:.2f} $\n"
            f"Heat Cost:           {info['heat_cost']:.2f} $\n"
            f"Oxygenation Cost:    {info['oxygenation_cost']:.2f} $\n"
            f"→ Net Reward:        {info['reward']:.2f} $\n"
        )

        env.render()

        # Stop early if the environment exposes and sets an exit flag.
        if getattr(env, "exit_requested", False):
            break

        time.sleep(0.01)  # brief pause between rendered steps
finally:
    # Close the environment even if a step, render or print call raises.
    env.close()

print("Total Reward:", total_reward)


Action space: Box([ 0.  24.   0.3], [ 1. 40.  1.], (3,), float32)

--- Day 1 ---
Raw-obs: biomass=1349.0 g, count=100, temp=24.00°C, DO=0.60 mg/L, UIA=0.060 mg/L
Action: feed_rate=0.954 → feed_amt=128.73 g, temp_set=39.346, aeration_rate=0.89mg/L

Reward Breakdown:
Fish Value Gain:     0.14317488889083824 $
Feed Cost:           0.15 $
Heat Cost:           2.35 $
Oxygenation Cost:    0.19 $
→ Net Reward:        -1.76 $


--- Day 2 ---
Raw-obs: biomass=1386.9 g, count=100, temp=27.44°C, DO=0.89 mg/L, UIA=0.060 mg/L
Action: feed_rate=0.941 → feed_amt=130.47 g, temp_set=39.572, aeration_rate=0.84mg/L

Reward Breakdown:
Fish Value Gain:     0.2594268597669848 $
Feed Cost:           0.15 $
Heat Cost:           2.36 $
Oxygenation Cost:    0.18 $
→ Net Reward:        -1.53 $


--- Day 3 ---
Raw-obs: biomass=1455.5 g, count=100, temp=29.85°C, DO=0.84 mg/L, UIA=0.060 mg/L
Action: feed_rate=0.941 → feed_amt=136.93 g, temp_set=39.413, aeration_rate=0.88mg/L

Reward Breakdown:
Fish Value Gain:     