In [None]:
import numpy as np
if not hasattr(np, "bool8"):
    np.bool8 = np.bool_
import torch
import gym
from skopt import gp_minimize
from skopt.space import Real, Integer
from skopt.utils import use_named_args

from tqdm import tqdm

In [71]:
# Import your PPO implementation
from PPO import RolloutBuffer, ppo_update, device  # citeturn3file0
from Train_policy_func import Policy, Policy_v2  # citeturn2file3

In [None]:
def evaluate_policy(policy, env, n_episodes=10, seed=2000): # different seed from training
    returns = []
    for i in range(n_episodes):
        # state, done, ep_ret = env.reset(seed=seed+i), False, 0.0
        obs = env.reset(seed=seed+i)            # reset now returns (obs, info)
        state    = obs
        done     = False
        ep_ret   = 0.0

        while not done:
            a, _, _ = policy.act(state)
            # state, r, done, _ = env.step(a)
            # ep_ret += r
            next_obs, reward, terminated, truncated, _ = env.step(a)  
            done   = bool(terminated or truncated)  
            state  = next_obs  
            ep_ret += reward
        returns.append(ep_ret)
    return np.mean(returns), returns

In [73]:
# Fix seeds for reproducibility
np.random.seed(42)
torch.manual_seed(42)

<torch._C.Generator at 0x224d63efc90>

In [74]:
# Create the environment
ENV_ID = "CartPole-v0"

In [75]:
def make_env():
    env = gym.make(ENV_ID)
    return env

In [76]:
# Define the search space for PPO hyperparameters
dim_space = [
    Real(1e-5, 1e-2, "log-uniform", name="learning_rate"),
    Real(0.5, 0.999, name="gamma"),
    Real(0.8, 0.99, name="gae_lambda"),
    Real(0.1, 0.3, name="clip_eps"),
    Real (0.0, 1, name="value_coef"),
    Real(0.0, 0.05, name="entropy_coef"),
    Integer(1,16, name="actor_length"),
    Integer(64, 512, name="traj_length"),
    Integer(2, 10, name="epochs"),
    Integer(32, 256, name="batch_size"),
]

In [None]:
@use_named_args(dim_space)
def objective(learning_rate, gamma, gae_lambda, clip_eps, value_coef, entropy_coef,
              actor_length, traj_length, epochs, batch_size):
    """
    Train a PPO agent with given hyperparameters and return negative mean reward.
    """

    # Ensure integer hyperparameters are Python ints
    traj_length = int(traj_length)
    epochs = int(epochs)
    batch_size = int(batch_size)

    # New policy and optimizer per trial
    policy = Policy_v2().to(device)
    optimizer = torch.optim.Adam(policy.parameters(), lr=learning_rate)
    buffer = RolloutBuffer()

    env = make_env()
    obs = env.reset(seed=42)[0]
    # total_timesteps = 10000
    updates = 200
    steps = 0

    # # Collect rollouts and update until budget exhausted
    # while steps < total_timesteps:
    #     for _ in range(n_steps):
    #         action, logp, value = policy.act(obs)
    #         next_obs, reward, terminated, truncated, _ = env.step(action)
    #         done = bool(terminated or truncated)
    #         buffer.store(obs, action, logp, reward, torch.tensor(value), done)
    #         obs = next_obs
    #         steps += 1
    #         if done:
    #             obs = env.reset()[0]

    for update in tqdm(range(updates)):
        buffer.clear() # clear the buffer for the next batch of trajectories
        actor_rewards     = []

        # 1) Collect trajectories
        for N in range(actor_length):
            state = env.reset(seed=N) # seed for reproducibility
            steps = 0

            # for tracking episode returns within this batch
            current_ep_reward   = 0.0
            reward_per_actor = []
            
            # 1.1 one trajectory per actor
            while steps < traj_length:
                action, logp, value = policy.act(state)
                # next_state, reward, done, _ = env.step(action)
                next_state, reward, terminated, truncated, _ = env.step(action)
                done = bool(terminated or truncated)

                buffer.store(state, action, logp, reward, value, done)
                state = next_state
                steps += 1

                # accumulate for this episode
                current_ep_reward += reward

                if done:
                    actor_rewards.append(current_ep_reward)
                    current_ep_reward = 0.0
                    state = env.reset(seed=2*N + steps) # reset for the next episode
                    # break

            # in case the last episode didn’t terminate exactly on traj_length
            if current_ep_reward > 0.0:
                actor_rewards.append(current_ep_reward)

        # Perform PPO update
        ppo_update(
            policy, optimizer, buffer,
            gamma=gamma,
            lam=gae_lambda,
            c1=value_coef,
            c2=entropy_coef,
            clip_eps=clip_eps,
            epochs=epochs,
            batch_size=batch_size
        )

    # Evaluate performance
    mean_reward, _ = evaluate_policy(policy, env, n_episodes=5)
    # We minimize the negative of performance
    return -mean_reward

In [78]:
if __name__ == "__main__":
    # Run Bayesian optimization
    result = gp_minimize(
        func=objective,
        dimensions=dim_space,
        n_calls=10, #to increase
        random_state=42
    )

    # Best hyperparameters
    best_params = {dim.name: val for dim, val in zip(dim_space, result.x)}
    print("Best hyperparameters found:")
    for key, val in best_params.items():
        print(f"  {key}: {val}")


  logger.warn(
100%|██████████| 200/200 [03:38<00:00,  1.09s/it]
100%|██████████| 200/200 [01:24<00:00,  2.36it/s]
100%|██████████| 200/200 [08:24<00:00,  2.52s/it]
100%|██████████| 200/200 [00:25<00:00,  7.85it/s]
100%|██████████| 200/200 [03:55<00:00,  1.18s/it]
100%|██████████| 200/200 [08:10<00:00,  2.45s/it]
100%|██████████| 200/200 [01:23<00:00,  2.41it/s]
100%|██████████| 200/200 [11:24<00:00,  3.42s/it]
100%|██████████| 200/200 [04:29<00:00,  1.35s/it]
100%|██████████| 200/200 [03:37<00:00,  1.09s/it]


Best hyperparameters found:
  learning_rate: 0.003363987115958797
  gamma: 0.7244273125515132
  gae_lambda: 0.8750785448403448
  clip_eps: 0.2853317731587589
  value_coef: 0.727271995856421
  entropy_coef: 0.016327038440291774
  actor_length: 10
  traj_length: 297
  epochs: 10
  batch_size: 221


In [None]:
# # Optionally, retrain a final model on full budget
# print("Retraining final model with best hyperparameters...")

# # Cast integer hyperparameters
# actor_length = int(best_params['actor_length'])
# epochs       = int(best_params['epochs'])
# batch_size   = int(best_params['batch_size'])

# # Re-initialize policy, optimizer and buffer
# policy    = Policy_v2().to(device)
# optimizer = torch.optim.Adam(policy.parameters(), lr=best_params['learning_rate'])
# buffer    = RolloutBuffer()

# # Create & seed the env
# env = make_env()
# state, _ = env.reset(seed=42)

# steps = 0
# total_timesteps = 2000   # e.g. increase for final training

# while steps < total_timesteps:
#     # collect one batch of rollouts
#     for _ in range(actor_length):
#         action, logp, value = policy.act(state)
#         next_state, reward, terminated, truncated, _ = env.step(action)
#         done = bool(terminated or truncated)

#         buffer.store(state, action, logp, reward, torch.tensor(value), done)
#         state = next_state
#         steps += 1

#         if done:
#             state, _ = env.reset()

#     # perform PPO update
#     ppo_update(
#         policy, optimizer, buffer,
#         gamma=best_params['gamma'],
#         lam=best_params['gae_lambda'],
#         c1=best_params['value_coef'],
#         c2=best_params['entropy_coef'],
#         clip_eps=best_params['clip_eps'],
#         epochs=epochs,
#         batch_size=batch_size
#     )

# # Final evaluation
# mean_reward, rewards = evaluate_policy(policy, env, n_episodes=10)
# print(f"Final mean reward: {mean_reward}")
