In [None]:
import gymnasium_robotics
import gymnasium as gym
from stable_baselines3 import SAC, HerReplayBuffer # Import HerReplayBuffer
from stable_baselines3.common.env_checker import check_env
from stable_baselines3.common.callbacks import EvalCallback
# Removed StopTrainingOnRewardThreshold for simplicity unless needed
import os
import numpy as np # Needed for success rate calculation in custom callback if desired
from stable_baselines3.common.vec_env import DummyVecEnv, VecNormalize

# --- Configuration ---
env_id = "FetchReach-v3" # Or FetchPickAndPlace-v3, etc.
# Define path for saving the best model (EvalCallback needs this)
save_dir = f"./her_sac_{env_id}_results/"
best_model_save_path = os.path.join(save_dir, "best_model")
final_model_save_path = os.path.join(save_dir, f"her_sac_{env_id}_final")

# Create directories if they don't exist
os.makedirs(save_dir, exist_ok=True)

# --- Environment Setup ---
# IMPORTANT: For HER, use the non-vectorized environment
env = gym.make(env_id, reward_type='dense')  # Normal gym env
env = DummyVecEnv([lambda: env])             # Make it a VecEnv
env = VecNormalize(env, norm_obs=True, norm_reward=False)  # Now normalize it
# Optional: Check the environment
# check_env(env)

# --- Callbacks for Evaluation ---
# EvalCallback logs evaluation results to the console and saves the best model.
eval_callback = EvalCallback(env, # Use a separate eval env (or the same, just resets)
                             best_model_save_path=best_model_save_path,
                             log_path=save_dir, # Logs eval results (like eval_success.npy) here
                             eval_freq=5000,    # Evaluate every N environment steps
                             n_eval_episodes=10, # Number of episodes for evaluation
                             deterministic=True, # Use deterministic actions for evaluation
                             render=False,
                             verbose=1)         # Print eval results to console

# --- Agent Definition (SAC + HER) ---
model = SAC(
    "MultiInputPolicy",
    env,
    replay_buffer_class=HerReplayBuffer,
    replay_buffer_kwargs=dict(
        n_sampled_goal=8,
        goal_selection_strategy='future',
    ),
    verbose=1,            # Set to 1 to print info (like episode stats) to console
    # tensorboard_log=None, # Explicitly None or just remove the parameter
    buffer_size=int(1e6),
    learning_starts=1000,
    batch_size=512,      # Larger batch size often good for HER
    learning_rate=1e-4,   # Tunable - may need adjustment
    gamma=0.99,           # Tunable for goal-based tasks
    tau=0.005,             # Tunable
    # ent_coef='auto' is the default for automatic alpha tuning
)

# --- Training ---
print(f"Starting training SAC + HER on {env_id}...")
print(f"Results and best model will be saved in: {save_dir}")

# Train the agent
# log_interval controls how often training episode stats (mean reward/length) are printed.
# For Fetch (50 steps/ep), log_interval=50 means stats every ~50 episodes.
model.learn(
    total_timesteps=1000000, # Adjust total steps as needed
    log_interval=50,        # Print training stats avg over last 50 episodes
    callback=eval_callback  # Run evaluation periodically and save best model
)

# --- Save Final Model ---
model.save(final_model_save_path)
print(f"Training finished. Final model saved to {final_model_save_path}")
print(f"Best model during training saved in {best_model_save_path}.zip")

# --- Save the VecNormalize wrapper ---
env.save(os.path.join(save_dir, "vecnormalize.pkl"))  # <--- Add this here AFTER training
print(f"Saved VecNormalize statistics to {save_dir}/vecnormalize.pkl")

env.close()

2025-04-27 15:24:07.661848: I tensorflow/core/util/port.cc:110] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-04-27 15:24:07.663551: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2025-04-27 15:24:07.697122: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F AVX512_VNNI AVX512_BF16 AVX_VNNI AMX_TILE AMX_INT8 AMX_BF16 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


Using cuda device
Starting training SAC + HER on FetchReach-v3...
Results and best model will be saved in: ./her_sac_FetchReach-v3_results/
---------------------------------
| rollout/           |          |
|    success_rate    | 0        |
| time/              |          |
|    episodes        | 50       |
|    fps             | 183      |
|    time_elapsed    | 13       |
|    total_timesteps | 2500     |
| train/             |          |
|    actor_loss      | -18.7    |
|    critic_loss     | 0.0951   |
|    ent_coef        | 0.861    |
|    ent_coef_loss   | -0.997   |
|    learning_rate   | 0.0001   |
|    n_updates       | 1499     |
---------------------------------




Eval num_timesteps=5000, episode_reward=-20.13 +/- 4.48
Episode length: 50.00 +/- 0.00
Success rate: 0.00%
---------------------------------
| eval/              |          |
|    mean_ep_length  | 50       |
|    mean_reward     | -20.1    |
|    success_rate    | 0.0      |
| time/              |          |
|    total_timesteps | 5000     |
| train/             |          |
|    actor_loss      | -36.5    |
|    critic_loss     | 0.0791   |
|    ent_coef        | 0.671    |
|    ent_coef_loss   | -2.62    |
|    learning_rate   | 0.0001   |
|    n_updates       | 3999     |
---------------------------------
New best mean reward!
---------------------------------
| rollout/           |          |
|    success_rate    | 0        |
| time/              |          |
|    episodes        | 100      |
|    fps             | 145      |
|    time_elapsed    | 34       |
|    total_timesteps | 5000     |
---------------------------------
---------------------------------
| rollout/           

In [3]:
env.save("./her_sac_FetchReach-v3_results/vecnormalize.pkl")
print("✅ VecNormalize saved successfully!")

✅ VecNormalize saved successfully!


NEW BASE MODEL CODE:

In [21]:
import gymnasium_robotics
import gymnasium as gym
from stable_baselines3 import SAC, HerReplayBuffer
from stable_baselines3.common.env_checker import check_env
from stable_baselines3.common.callbacks import EvalCallback, StopTrainingOnRewardThreshold
from stable_baselines3.common.vec_env import DummyVecEnv, VecNormalize
import os
import numpy as np

# --- Configuration ---
env_id = "FetchReach-v3"
save_dir = f"./her_sac_{env_id}_results/"
best_model_save_path = os.path.join(save_dir, "best_model")
final_model_save_path = os.path.join(save_dir, f"her_sac_{env_id}_final")

os.makedirs(save_dir, exist_ok=True)

# --- Environment Setup ---
env = gym.make(env_id, reward_type='dense')
env = DummyVecEnv([lambda: env])
env = VecNormalize(env, norm_obs=True, norm_reward=False)

# --- Agent Definition (SAC + HER) ---
model = SAC(
    "MultiInputPolicy",
    env,
    replay_buffer_class=HerReplayBuffer,
    replay_buffer_kwargs=dict(
        n_sampled_goal=8,
        goal_selection_strategy='future',
    ),
    verbose=1,
    buffer_size=int(1e6),
    learning_starts=1000,
    batch_size=512,
    learning_rate=1e-4,
    gamma=0.99,
    tau=0.005,
)

# --- Callbacks ---

# Stop criteria: if mean reward > -5 (which roughly means success in FetchReach dense reward)
success_stop_callback = StopTrainingOnRewardThreshold(
    reward_threshold=-5,  # Reward close to 0 means successful reach
    verbose=1
)

# EvalCallback with embedded stop callback
eval_callback = EvalCallback(
    env,
    best_model_save_path=best_model_save_path,
    log_path=save_dir,
    eval_freq=5000,
    n_eval_episodes=10,
    deterministic=True,
    render=False,
    callback_on_new_best=success_stop_callback,
    verbose=1,
)

# --- Training ---
print(f"🚀 Starting training SAC + HER on {env_id}...")
print(f"📂 Results and best model will be saved in: {save_dir}")

model.learn(
    total_timesteps=1_000_000,
    log_interval=50,
    callback=eval_callback
)

# --- Save Final Model ---
model.save(final_model_save_path)
print(f"✅ Training finished. Final model saved to {final_model_save_path}")
print(f"✅ Best model during training saved in {best_model_save_path}.zip")

# --- Save VecNormalize statistics ---
env.save(os.path.join(save_dir, "vecnormalize.pkl"))
print(f"✅ Saved VecNormalize statistics to {save_dir}/vecnormalize.pkl")

env.close()

# --- Final 1000 Episode Evaluation ---

print("\n🎯 Starting final evaluation over 1000 episodes...")

# Reload environment with saved normalization
eval_env = gym.make(env_id, reward_type='dense')
eval_env = DummyVecEnv([lambda: eval_env])
eval_env = VecNormalize.load(os.path.join(save_dir, "vecnormalize.pkl"), eval_env)
eval_env.training = False
eval_env.norm_reward = False

# Reload best model
model = SAC.load(best_model_save_path, env=eval_env)

successes = []

for _ in range(1000):
    obs = eval_env.reset()
    done = False
    while not done:
        action, _ = model.predict(obs, deterministic=True)
        obs, reward, done, info = eval_env.step(action)
        done = done[0]  # DummyVecEnv
        info = info[0]
    successes.append(info['is_success'])

success_rate = np.mean(successes) * 100.0
print(f"\n✅ Final Evaluation: Success Rate = {success_rate:.2f}% over 1000 episodes.")

if success_rate >= 90.0:
    print("🏆 Problem Solved! ✅ (Success > 90%)")
else:
    print("⚡ Problem Not Fully Solved. (Success < 90%)")


Using cuda device
🚀 Starting training SAC + HER on FetchReach-v3...
📂 Results and best model will be saved in: ./her_sac_FetchReach-v3_results/
---------------------------------
| rollout/           |          |
|    success_rate    | 0        |
| time/              |          |
|    episodes        | 50       |
|    fps             | 192      |
|    time_elapsed    | 13       |
|    total_timesteps | 2500     |
| train/             |          |
|    actor_loss      | -18.6    |
|    critic_loss     | 0.113    |
|    ent_coef        | 0.861    |
|    ent_coef_loss   | -0.996   |
|    learning_rate   | 0.0001   |
|    n_updates       | 1499     |
---------------------------------




Eval num_timesteps=5000, episode_reward=-20.14 +/- 7.42
Episode length: 50.00 +/- 0.00
Success rate: 0.00%
---------------------------------
| eval/              |          |
|    mean_ep_length  | 50       |
|    mean_reward     | -20.1    |
|    success_rate    | 0.0      |
| time/              |          |
|    total_timesteps | 5000     |
| train/             |          |
|    actor_loss      | -36      |
|    critic_loss     | 0.0929   |
|    ent_coef        | 0.671    |
|    ent_coef_loss   | -2.62    |
|    learning_rate   | 0.0001   |
|    n_updates       | 3999     |
---------------------------------
New best mean reward!
---------------------------------
| rollout/           |          |
|    success_rate    | 0        |
| time/              |          |
|    episodes        | 100      |
|    fps             | 150      |
|    time_elapsed    | 33       |
|    total_timesteps | 5000     |
---------------------------------
---------------------------------
| rollout/           

IsADirectoryError: [Errno 21] Is a directory: 'her_sac_FetchReach-v3_results/best_model'

In [24]:
# --- Final 1000 Episode Evaluation ---

print("\n🎯 Starting final evaluation over 1000 episodes...")

# Reload environment with saved normalization
eval_env = gym.make(env_id, reward_type='dense')
eval_env = DummyVecEnv([lambda: eval_env])
eval_env = VecNormalize.load(os.path.join(save_dir, "vecnormalize.pkl"), eval_env)
eval_env.training = False
eval_env.norm_reward = False

# Correct model load with .zip
model = SAC.load(os.path.join(best_model_save_path, "best_model.zip"), env=eval_env)

successes = []

for _ in range(1000):
    obs = eval_env.reset()
    done = False
    while not done:
        action, _ = model.predict(obs, deterministic=True)
        obs, reward, done, info = eval_env.step(action)
        done = done[0]  # DummyVecEnv
        info = info[0]
    successes.append(info['is_success'])

success_rate = np.mean(successes) * 100.0
print(f"\n✅ Final Evaluation: Success Rate = {success_rate:.2f}% over 1000 episodes.")

if success_rate >= 90.0:
    print("🏆 Problem Solved! ✅ (Success > 90%)")
else:
    print("⚡ Problem Not Fully Solved. (Success < 90%)")


🎯 Starting final evaluation over 1000 episodes...

✅ Final Evaluation: Success Rate = 36.60% over 1000 episodes.
⚡ Problem Not Fully Solved. (Success < 90%)


NEW BASE MODEL -- OPTIMIZED

In [None]:
import gymnasium_robotics
import gymnasium as gym
from stable_baselines3 import SAC, HerReplayBuffer
from stable_baselines3.common.callbacks import EvalCallback, CallbackList, BaseCallback
from stable_baselines3.common.vec_env import DummyVecEnv, VecNormalize
import os
import numpy as np

# --- Custom Callback ---
class StopTrainingOnSuccessRate(BaseCallback):
    """
    Custom callback to stop training once success rate exceeds threshold.
    """
    def __init__(self, success_threshold=0.90, eval_freq=5000, verbose=1):
        super().__init__(verbose)
        self.success_threshold = success_threshold
        self.eval_freq = eval_freq

    def _on_step(self) -> bool:
        if self.num_timesteps % self.eval_freq == 0:
            successes = []
            for _ in range(10):  # 10 evaluation episodes
                obs = self.training_env.reset()
                done = False
                while not done:
                    action, _ = self.model.predict(obs, deterministic=True)
                    obs, reward, done, infos = self.training_env.step(action)
                    done = done[0]
                    infos = infos[0]
                successes.append(infos['is_success'])

            success_rate = np.mean(successes)

            if self.verbose > 0:
                print(f"✅ [Success Monitor] Success rate over 10 episodes: {success_rate*100:.2f}%")

            if success_rate >= self.success_threshold:
                print(f"🏆 [Stopping Early] Success rate {success_rate*100:.2f}% exceeded threshold {self.success_threshold*100:.2f}%!")
                return False  # Stop training

        return True  # Continue training otherwise

# --- Configuration ---
env_id = "FetchReach-v3"
save_dir = f"./her_sac_{env_id}_results/"
best_model_save_path = os.path.join(save_dir, "best_model")
final_model_save_path = os.path.join(save_dir, f"her_sac_{env_id}_final")

os.makedirs(save_dir, exist_ok=True)

# --- Environment Setup ---
env = gym.make(env_id, reward_type='dense')
env = DummyVecEnv([lambda: env])
env = VecNormalize(env, norm_obs=True, norm_reward=False)

# --- Agent Definition (SAC + HER) ---
model = SAC(
    "MultiInputPolicy",
    env,
    replay_buffer_class=HerReplayBuffer,
    replay_buffer_kwargs=dict(
        n_sampled_goal=8,
        goal_selection_strategy='future',
    ),
    verbose=1,
    buffer_size=int(1e6),
    learning_starts=1000,
    batch_size=512,
    learning_rate=1e-4,
    gamma=0.99,
    tau=0.005,
)

# --- Callbacks ---

# 1. Eval callback to monitor reward
eval_callback = EvalCallback(
    env,
    best_model_save_path=best_model_save_path,
    log_path=save_dir,
    eval_freq=5000,
    n_eval_episodes=10,
    deterministic=True,
    render=False,
    verbose=1,
)

# 2. Custom stop callback (stop if real success >90%)
success_stop_callback = StopTrainingOnSuccessRate(
    success_threshold=0.90,  # 90% success required
    eval_freq=5000,
    verbose=1
)

# 3. Combine them
callback = CallbackList([eval_callback, success_stop_callback])

# --- Training ---
print(f"🚀 Starting training SAC + HER on {env_id} with success-rate-based stopping...")
print(f"📂 Results and best model will be saved in: {save_dir}")

model.learn(
    total_timesteps=1_000_000,
    log_interval=50,
    callback=callback
)

# --- Save Final Model ---
model.save(final_model_save_path)
print(f"✅ Training finished. Final model saved to {final_model_save_path}")
print(f"✅ Best model during training saved in {best_model_save_path}")

# --- Save VecNormalize statistics ---
env.save(os.path.join(save_dir, "vecnormalize.pkl"))
print(f"✅ Saved VecNormalize statistics to {save_dir}/vecnormalize.pkl")

env.close()

# --- Final 1000 Episode Evaluation ---

print("\n🎯 Starting final evaluation over 1000 episodes...")

# Reload environment with saved normalization
eval_env = gym.make(env_id, reward_type='dense')
eval_env = DummyVecEnv([lambda: eval_env])
eval_env = VecNormalize.load(os.path.join(save_dir, "vecnormalize.pkl"), eval_env)
eval_env.training = False
eval_env.norm_reward = False

# Correctly load model
model = SAC.load(os.path.join(best_model_save_path, "best_model.zip"), env=eval_env)

successes = []

for _ in range(1000):
    obs = eval_env.reset()
    done = False
    while not done:
        action, _ = model.predict(obs, deterministic=True)
        obs, reward, done, infos = eval_env.step(action)
        done = done[0]
        infos = infos[0]
    successes.append(infos['is_success'])

success_rate = np.mean(successes) * 100.0
print(f"\n✅ Final Evaluation: Success Rate = {success_rate:.2f}% over 1000 episodes.")

if success_rate >= 90.0:
    print("🏆 Problem Solved! ✅ (Success > 90%)")
else:
    print("⚡ Problem Not Fully Solved. (Success < 90%)")


Using cuda device
🚀 Starting training SAC + HER on FetchReach-v3 with success-rate-based stopping...
📂 Results and best model will be saved in: ./her_sac_FetchReach-v3_results/
---------------------------------
| rollout/           |          |
|    success_rate    | 0        |
| time/              |          |
|    episodes        | 50       |
|    fps             | 191      |
|    time_elapsed    | 13       |
|    total_timesteps | 2500     |
| train/             |          |
|    actor_loss      | -18.2    |
|    critic_loss     | 0.127    |
|    ent_coef        | 0.861    |
|    ent_coef_loss   | -1       |
|    learning_rate   | 0.0001   |
|    n_updates       | 1499     |
---------------------------------
Eval num_timesteps=5000, episode_reward=-20.08 +/- 5.24
Episode length: 50.00 +/- 0.00
Success rate: 0.00%
---------------------------------
| eval/              |          |
|    mean_ep_length  | 50       |
|    mean_reward     | -20.1    |
|    success_rate    | 0.0      |
| 

In [18]:
import gymnasium as gym
from stable_baselines3 import SAC
from stable_baselines3.common.vec_env import DummyVecEnv, VecNormalize
import os

# --- Paths ---
save_dir = "./her_sac_FetchReach-v3_results/"
model_path = os.path.join(save_dir, "finetuned_model_v2.zip")  # Your latest fine-tuned model
vecnormalize_path = os.path.join(save_dir, "vecnormalize.pkl")

# --- Load environment ---
env = gym.make("FetchReach-v3", reward_type='dense')
env = DummyVecEnv([lambda: env])
env = VecNormalize.load(vecnormalize_path, env)
env.training = True

# --- Load model ---
model = SAC.load(model_path, env=env)

# --- Lower learning rate ---
model.learning_rate = 5e-5
model.actor.optimizer.param_groups[0]['lr'] = 5e-5
model.critic.optimizer.param_groups[0]['lr'] = 5e-5

# Increase HER aggressive sampling
if hasattr(model.replay_buffer, 'n_sampled_goal'):
    model.replay_buffer.n_sampled_goal = 16  # 🔥 More future goals per transition
    model.replay_buffer.goal_selection_strategy = 'future'
    print("✅ Aggressive HER sampling applied.")

# (❌ Skip clearing pos / full, don't reset manually!)

# Fine-tune
model.learn(
    total_timesteps=300_000,
    log_interval=50,
    # callback=eval_callback (optional)
)

# Save model
model.save(os.path.join(save_dir, "finetuned_model_v3_aggressive"))
env.save(os.path.join(save_dir, "vecnormalize.pkl"))

print("✅ Fine-tuning with aggressive HER sampling completed!")


✅ Aggressive HER sampling applied.


ValueError: Strategy future for sampling goals not supported!

In [1]:
import gymnasium_robotics
import gymnasium as gym
from stable_baselines3 import SAC, HerReplayBuffer
from stable_baselines3.common.callbacks import EvalCallback, CallbackList, BaseCallback
from stable_baselines3.common.vec_env import DummyVecEnv, VecNormalize
import os
import numpy as np

# --- Custom Callback ---
class StopTrainingOnSuccessRate(BaseCallback):
    """
    Custom callback to stop training once success rate exceeds threshold.
    """
    def __init__(self, success_threshold=0.90, eval_freq=5000, verbose=1):
        super().__init__(verbose)
        self.success_threshold = success_threshold
        self.eval_freq = eval_freq

    def _on_step(self) -> bool:
        if self.num_timesteps % self.eval_freq == 0:
            successes = []
            for _ in range(10):  # 10 evaluation episodes
                obs = self.training_env.reset()
                done = False
                while not done:
                    action, _ = self.model.predict(obs, deterministic=True)
                    obs, reward, done, infos = self.training_env.step(action)
                    done = done[0]
                    infos = infos[0]
                successes.append(infos['is_success'])

            success_rate = np.mean(successes)

            if self.verbose > 0:
                print(f"✅ [Success Monitor] Success rate over 10 episodes: {success_rate*100:.2f}%")

            if success_rate >= self.success_threshold:
                print(f"🏆 [Stopping Early] Success rate {success_rate*100:.2f}% exceeded threshold {self.success_threshold*100:.2f}%!")
                return False  # Stop training

        return True  # Continue training otherwise

# --- Configuration ---
env_id = "FetchReach-v3"
save_dir = f"./her_sac_{env_id}_results/"
best_model_save_path = os.path.join(save_dir, "best_model")
final_model_save_path = os.path.join(save_dir, f"her_sac_{env_id}_final")

os.makedirs(save_dir, exist_ok=True)

# --- Environment Setup ---
env = gym.make(env_id, reward_type='dense')
env = DummyVecEnv([lambda: env])
env = VecNormalize(env, norm_obs=True, norm_reward=False)

# --- Agent Definition (SAC + HER) ---
model = SAC(
    "MultiInputPolicy",
    env,
    replay_buffer_class=HerReplayBuffer,
    replay_buffer_kwargs=dict(
        n_sampled_goal=8,
        goal_selection_strategy='future',
    ),
    verbose=1,
    buffer_size=int(1e6),
    learning_starts=1000,
    batch_size=512,
    learning_rate=1e-4,
    gamma=0.99,
    tau=0.005,
)

# --- Callbacks ---

# 1. Eval callback to monitor reward
eval_callback = EvalCallback(
    env,
    best_model_save_path=best_model_save_path,
    log_path=save_dir,
    eval_freq=5000,
    n_eval_episodes=10,
    deterministic=True,
    render=False,
    verbose=1,
)

# 2. Custom stop callback (stop if real success >90%)
success_stop_callback = StopTrainingOnSuccessRate(
    success_threshold=0.90,  # 90% success required
    eval_freq=5000,
    verbose=1
)

# 3. Combine them
callback = CallbackList([eval_callback, success_stop_callback])

# --- Training ---
print(f"🚀 Starting training SAC + HER on {env_id} with success-rate-based stopping...")
print(f"📂 Results and best model will be saved in: {save_dir}")

model.learn(
    total_timesteps=1_000_000,
    log_interval=50,
    callback=callback
)

# --- Save Final Model ---
model.save(final_model_save_path)
print(f"✅ Training finished. Final model saved to {final_model_save_path}")
print(f"✅ Best model during training saved in {best_model_save_path}")

# --- Save VecNormalize statistics ---
env.save(os.path.join(save_dir, "vecnormalize.pkl"))
print(f"✅ Saved VecNormalize statistics to {save_dir}/vecnormalize.pkl")

env.close()

# --- Final 1000 Episode Evaluation ---

print("\n🎯 Starting final evaluation over 1000 episodes...")

# Reload environment with saved normalization
eval_env = gym.make(env_id, reward_type='dense')
eval_env = DummyVecEnv([lambda: eval_env])
eval_env = VecNormalize.load(os.path.join(save_dir, "vecnormalize.pkl"), eval_env)
eval_env.training = False
eval_env.norm_reward = False

# Correctly load model
model = SAC.load(os.path.join(best_model_save_path, "best_model.zip"), env=eval_env)

successes = []

for _ in range(1000):
    obs = eval_env.reset()
    done = False
    while not done:
        action, _ = model.predict(obs, deterministic=True)
        obs, reward, done, infos = eval_env.step(action)
        done = done[0]
        infos = infos[0]
    successes.append(infos['is_success'])

success_rate = np.mean(successes) * 100.0
print(f"\n✅ Final Evaluation: Success Rate = {success_rate:.2f}% over 1000 episodes.")

if success_rate >= 90.0:
    print("🏆 Problem Solved! ✅ (Success > 90%)")
else:
    print("⚡ Problem Not Fully Solved. (Success < 90%)")


2025-04-30 21:04:21.738511: I tensorflow/core/util/port.cc:110] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-04-30 21:04:22.035434: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2025-04-30 21:04:24.191089: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F AVX512_VNNI AVX512_BF16 AVX_VNNI AMX_TILE AMX_INT8 AMX_BF16 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


Using cuda device
🚀 Starting training SAC + HER on FetchReach-v3 with success-rate-based stopping...
📂 Results and best model will be saved in: ./her_sac_FetchReach-v3_results/
---------------------------------
| rollout/           |          |
|    success_rate    | 0        |
| time/              |          |
|    episodes        | 50       |
|    fps             | 146      |
|    time_elapsed    | 17       |
|    total_timesteps | 2500     |
| train/             |          |
|    actor_loss      | -18.9    |
|    critic_loss     | 0.108    |
|    ent_coef        | 0.861    |
|    ent_coef_loss   | -0.999   |
|    learning_rate   | 0.0001   |
|    n_updates       | 1499     |
---------------------------------




Eval num_timesteps=5000, episode_reward=-19.80 +/- 4.74
Episode length: 50.00 +/- 0.00
Success rate: 0.00%
---------------------------------
| eval/              |          |
|    mean_ep_length  | 50       |
|    mean_reward     | -19.8    |
|    success_rate    | 0.0      |
| time/              |          |
|    total_timesteps | 5000     |
| train/             |          |
|    actor_loss      | -36.3    |
|    critic_loss     | 0.0934   |
|    ent_coef        | 0.672    |
|    ent_coef_loss   | -2.61    |
|    learning_rate   | 0.0001   |
|    n_updates       | 3999     |
---------------------------------
New best mean reward!
✅ [Success Monitor] Success rate over 10 episodes: 0.00%
---------------------------------
| rollout/           |          |
|    success_rate    | 0        |
| time/              |          |
|    episodes        | 100      |
|    fps             | 126      |
|    time_elapsed    | 39       |
|    total_timesteps | 5000     |
--------------------------------