In [2]:
import gymnasium as gym
from stable_baselines3 import PPO
from stable_baselines3.common.evaluation import evaluate_policy
import numpy as np

In [7]:
# Create environment
env = gym.make("CartPole-v1")

In [8]:
model = PPO("MlpPolicy", env, verbose=1, 
            learning_rate=0.1, 
            n_steps=2048,
            batch_size=64,
            gae_lambda=0.95,
            gamma=0.99,
            n_epochs=10,
            clip_range=0.2,
            seed=0)

Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.


In [9]:
model.learn(total_timesteps=10000)

---------------------------------
| rollout/           |          |
|    ep_len_mean     | 21.6     |
|    ep_rew_mean     | 21.6     |
| time/              |          |
|    fps             | 902      |
|    iterations      | 1        |
|    time_elapsed    | 2        |
|    total_timesteps | 2048     |
---------------------------------
--------------------------------------
| rollout/                |          |
|    ep_len_mean          | 9.37     |
|    ep_rew_mean          | 9.37     |
| time/                   |          |
|    fps                  | 671      |
|    iterations           | 2        |
|    time_elapsed         | 6        |
|    total_timesteps      | 4096     |
| train/                  |          |
|    approx_kl            | 11.74526 |
|    clip_fraction        | 0.993    |
|    clip_range           | 0.2      |
|    entropy_loss         | -0.0117  |
|    explained_variance   | -0.00116 |
|    learning_rate        | 0.1      |
|    loss                 | 3.76    

<stable_baselines3.ppo.ppo.PPO at 0x22cddc362d0>

In [10]:
eval_env = gym.make("CartPole-v1")
mean_reward, std_reward = evaluate_policy(model, eval_env, n_eval_episodes=10, deterministic=True)
print(f"Mean reward: {mean_reward} ± {std_reward}")

Mean reward: 9.6 ± 0.4898979485566356


In [11]:
import itertools
import random

In [13]:
hyperparam_choices = {
    "learning_rate": [1e-4, 3e-4, 1e-3],
    "gamma": [0.95, 0.99, 0.999],
    "n_steps": [512, 1024, 2048],
    "batch_size": [32, 64, 128]
}

# Generate random combinations of hyperparameters
param_combinations = list(itertools.product(
    hyperparam_choices["learning_rate"],
    hyperparam_choices["gamma"],
    hyperparam_choices["n_steps"],
    hyperparam_choices["batch_size"]
))

# Randomly select a subset of combinations to evaluate
random.shuffle(param_combinations)
param_combinations = param_combinations[:5]  # Evaluate 5 random sets

best_mean_reward = -np.inf
best_params = None

for (lr, gm, ns, bs) in param_combinations:
    # Create and train model with these hyperparams
    env = gym.make("CartPole-v1")
    model = PPO("MlpPolicy", env, verbose=0,
                learning_rate=lr,
                gamma=gm,
                n_steps=ns,
                batch_size=bs,
                seed=0)
    model.learn(total_timesteps=10000)

    # Evaluate the model
    eval_env = gym.make("CartPole-v1")
    mean_reward, std_reward = evaluate_policy(model, eval_env, n_eval_episodes=100)
    print(f"Params: lr={lr}, gamma={gm}, n_steps={ns}, batch_size={bs} -> mean_reward={mean_reward:.2f}")

    # Update best
    if mean_reward > best_mean_reward:
        best_mean_reward = mean_reward
        best_params = (lr, gm, ns, bs)

print("Best hyperparams found:", best_params, "with mean reward:", best_mean_reward)


Params: lr=0.0001, gamma=0.999, n_steps=2048, batch_size=32 -> mean_reward=430.09
Params: lr=0.0003, gamma=0.999, n_steps=1024, batch_size=32 -> mean_reward=439.43
Params: lr=0.0001, gamma=0.999, n_steps=2048, batch_size=128 -> mean_reward=122.41
Params: lr=0.001, gamma=0.999, n_steps=1024, batch_size=32 -> mean_reward=500.00
Params: lr=0.0001, gamma=0.99, n_steps=512, batch_size=32 -> mean_reward=182.81
Best hyperparams found: (0.001, 0.999, 1024, 32) with mean reward: 500.0


In [18]:
def train_with_seeds(seeds, n_timesteps=10000):
    rewards = []

    for seed in seeds:
        # Reset the environment with a specific seed
        env = gym.make('CartPole-v1')
        env.reset(seed=seed)

        # Train the model
        model = PPO('MlpPolicy', env, verbose=0,learning_rate = 0.001, gamma=0.999,n_steps=1024,batch_size=32)
        model.learn(total_timesteps=n_timesteps)

        # Evaluate the model
        mean_reward, _ = evaluate_policy(model, env, n_eval_episodes=100)
        rewards.append(mean_reward)

        env.close()  # Close the environment after training

    print(f"Mean rewards with different seeds: {rewards}")

    
# Example of running with different seeds
seeds = [0,1,2]
train_with_seeds(seeds)

Mean rewards with different seeds: [500.0, 421.13, 500.0]
