<a href="https://colab.research.google.com/github/thisisWooyeol/Deep-RL-Course/blob/master/hyperparam-optimization/optuna_LunarLander_v2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Install Dependencies and Stable Baselines3 Using Pip

In [1]:
!pip install stable-baselines3
!pip install swig
!pip install gymnasium[box2d]

Collecting stable-baselines3
  Downloading stable_baselines3-2.2.1-py3-none-any.whl (181 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m181.7/181.7 kB[0m [31m3.5 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting gymnasium<0.30,>=0.28.1 (from stable-baselines3)
  Downloading gymnasium-0.29.1-py3-none-any.whl (953 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m953.9/953.9 kB[0m [31m10.4 MB/s[0m eta [36m0:00:00[0m
Collecting farama-notifications>=0.0.1 (from gymnasium<0.30,>=0.28.1->stable-baselines3)
  Downloading Farama_Notifications-0.0.4-py3-none-any.whl (2.5 kB)
Installing collected packages: farama-notifications, gymnasium, stable-baselines3
Successfully installed farama-notifications-0.0.4 gymnasium-0.29.1 stable-baselines3-2.2.1
Collecting swig
  Downloading swig-4.1.1.post1-py2.py3-none-manylinux_2_5_x86_64.manylinux1_x86_64.whl (1.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.8/1.8 MB[0m [31m10.0 MB/s[0m 

In [2]:
# Optional: install SB3 contrib to have access to additional algorithms
!pip install sb3-contrib

Collecting sb3-contrib
  Downloading sb3_contrib-2.2.1-py3-none-any.whl (80 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m80.7/80.7 kB[0m [31m2.3 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: sb3-contrib
Successfully installed sb3-contrib-2.2.1


In [3]:
# Optuna will be used in the last part when doing hyperparameter tuning
!pip install optuna

Collecting optuna
  Downloading optuna-3.5.0-py3-none-any.whl (413 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m413.4/413.4 kB[0m [31m5.3 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting alembic>=1.5.0 (from optuna)
  Downloading alembic-1.13.1-py3-none-any.whl (233 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m233.4/233.4 kB[0m [31m9.4 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting colorlog (from optuna)
  Downloading colorlog-6.8.0-py3-none-any.whl (11 kB)
Collecting Mako (from alembic>=1.5.0->optuna)
  Downloading Mako-1.3.0-py3-none-any.whl (78 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m78.6/78.6 kB[0m [31m9.8 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: Mako, colorlog, alembic, optuna
Successfully installed Mako-1.3.0 alembic-1.13.1 colorlog-6.8.0 optuna-3.5.0


## Imports

In [4]:
import gymnasium as gym
import numpy as np

The first thing you need to import is the RL model, check the documentation to know what you can use on which problem

In [5]:
# Use PPO as the previous practice to only compare the effect of hyperparameter selections
from stable_baselines3 import PPO

In [6]:
from stable_baselines3.common.env_util import make_vec_env
from stable_baselines3.common.evaluation import evaluate_policy

  and should_run_async(code)


## Automatic Hyperparameter Tuning of PPO on LunarLander-v2 environment

### Imports

In [7]:
import optuna
from optuna.pruners import MedianPruner
from optuna.samplers import TPESampler
from optuna.visualization import plot_optimization_history, plot_param_importances

### Config

In [13]:
N_TRIALS = 100  # Maximum number of trials
N_JOBS = 1  # Number of jobs to run in parallel
N_STARTUP_TRIALS = 5  # Stop random sampling after N_STARTUP_TRIALS
N_EVALUATIONS = 3  # Number of evaluations during the training
N_TIMESTEPS = int(1e6)  # Training budget
EVAL_FREQ = int(N_TIMESTEPS / N_EVALUATIONS)
N_EVAL_ENVS = 5
N_EVAL_EPISODES = 10
TIMEOUT = int(60 * 60 * 6)  # 6 hours

ENV_ID = "LunarLander-v2"

DEFAULT_HYPERPARAMS = {
    "policy": "MlpPolicy",
    "env": ENV_ID,
}

### Define the search space

In [14]:
from typing import Any, Dict
import torch
import torch.nn as nn

def sample_ppo_params(trial: optuna.Trial) -> Dict[str, Any]:
    """
    Samper for PPO hyperparameters.

    :param trial: Optuna trial object
    :return: the sampled hyperparameters for the given trial.
    """
    learning_rate = trial.suggest_float("learning_rate", 1e-4, 0.1, log=True)
    n_steps = 2 ** trial.suggest_int("exponent_n_steps", 8, 10)
    batch_size = 2 ** trial.suggest_int("exponent_batch_size", 6, 9)
    n_epochs = trial.suggest_int("n_epochs", 1, 8)
    gamma = 1.0 - trial.suggest_float("1-gamma", 0.0001, 0.1, log=True)
    gae_lambda = trial.suggest_float("gae_lambda", 0.95, 0.99)
    ent_coef = trial.suggest_float("ent_coef", 0.000001, 0.1, log=True)
    target_kl = trial.suggest_categorical("target_kl", [None, 0.02, 0.05])

    # Display true values
    trial.set_user_attr("n_steps", n_steps)
    trial.set_user_attr("batch_size", batch_size)
    trial.set_user_attr("gamma", gamma)

    return {
        "learning_rate": learning_rate,
        "n_steps": n_steps,
        "batch_size": batch_size,
        "n_epochs": n_epochs,
        "gamma": gamma,
        "gae_lambda": gae_lambda,
        "ent_coef": ent_coef,
        "target_kl": target_kl
    }

### Define the objective function

In [15]:
from stable_baselines3.common.callbacks import EvalCallback

class TrialEvalCallback(EvalCallback):
    """
    Callback used for evaluating and reporting a trial.

    :param eval_env: Evaluation environement
    :param trial: Optuna trial object
    :param n_eval_episodes: Number of evaluation episodes
    :param eval_freq:   Evaluate the agent every ``eval_freq`` call of the callback.
    :param deterministic: Whether the evaluation should
        use a stochastic or deterministic policy.
    :param verbose:
    """

    def __init__(
        self,
        eval_env: gym.Env,
        trial: optuna.Trial,
        n_eval_episodes: int = 5,
        eval_freq: int = 500000,
        deterministic: bool = True,
        verbose: int = 0,
    ):
        super().__init__(
            eval_env=eval_env,
            n_eval_episodes=n_eval_episodes,
            eval_freq=eval_freq,
            deterministic=deterministic,
            verbose=verbose,
        )
        self.trial = trial
        self.eval_idx = 0
        self.is_pruned = False

    def _on_step(self) -> bool:
        if self.eval_freq > 0 and self.n_calls % self.eval_freq == 0:
            # Evaluate policy (done in the parent class)
            super()._on_step()
            self.eval_idx += 1
            # Send report to Optuna
            self.trial.report(self.last_mean_reward, self.eval_idx)
            # Prune trial if need
            if self.trial.should_prune():
                self.is_pruned = True
                return False
        return True

### Define the objective function

In [16]:
def objective(trial: optuna.Trial) -> float:
    """
    Objective function using by Optuna to evaluate
    one configuration (i.e., one set of hyperparameters).

    Given a trial object, it will sample hyperparameters,
    evaluate it and report the result (mean episodic reward after training)

    :param trial: Optuna trial object
    :return: Mean episodic reward after training
    """

    kwargs = DEFAULT_HYPERPARAMS.copy()
    kwargs.update(sample_ppo_params(trial))

    model = PPO(**kwargs)

    eval_env = make_vec_env(env_id=ENV_ID, n_envs=N_EVAL_ENVS)
    eval_callback = TrialEvalCallback(
        eval_env=eval_env,
        trial=trial,
        n_eval_episodes=N_EVAL_EPISODES,
        eval_freq=EVAL_FREQ,
    )

    # Train model with sampled hyperparameters
    nan_encountered = False
    try:
        model.learn(N_TIMESTEPS, callback=eval_callback)
    except AssertionError as e:
        # Sometimes, random hyperparams can generate NaN
        print(e)
        nan_encountered = True
    finally:
        # Free memory
        model.env.close()
        eval_env.close()

    # Tell the optimizer that the trial failed
    if nan_encountered:
        return float('nan')

    if eval_callback.is_pruned:
        raise optuna.exceptions.TrialPruned()

    return eval_callback.last_mean_reward

### The optimization loop

In [17]:
import torch

# Set pytorch num threads to 1 for faster training
torch.set_num_threads(1)
# Select the sampler: e.g. random, TPESampler, CMAES, ...
sampler = TPESampler(n_startup_trials=N_STARTUP_TRIALS)
# Do not prune before 1/3 of the max budget is used
pruner = MedianPruner(
    n_startup_trials=N_STARTUP_TRIALS, n_warmup_steps=N_EVALUATIONS // 3
)
# Create the study and start the hyperparameter optimization
study = optuna.create_study(
    sampler=sampler, pruner=pruner, direction="maximize",
    study_name="optuna-ppo-LunarLander-v2"
)

try:
    study.optimize(objective, n_trials=N_TRIALS, n_jobs=N_JOBS, timeout=TIMEOUT)
except KeyboardInterrupt:
    pass

print("Number of finished trials: ", len(study.trials))

print("Best trial:")
trial = study.best_trial

print(f'    Value: {trial.value}')

print('  Params:')
for key, value in trial.params.items():
    print(f'    {key}: {value}')

print('  User attrs:')
for key, value in trial.user_attrs.items():
    print(f'    {key}: {value}')

# Write report
study.trials_dataframe().to_csv("study_results_ppo_lunarlander-v2.csv")

fig1 = plot_optimization_history(study)
fig2 = plot_param_importances(study)

fig1.show()
fig2.show()

[I 2024-01-01 05:15:39,413] A new study created in memory with name: optuna-ppo-LunarLander-v2
[I 2024-01-01 05:56:38,990] Trial 0 finished with value: -111.15117599999999 and parameters: {'learning_rate': 0.0006332020014163926, 'exponent_n_steps': 10, 'exponent_batch_size': 9, 'n_epochs': 3, '1-gamma': 0.00021325260857701014, 'gae_lambda': 0.9762677622387805, 'ent_coef': 7.787768451566595e-05, 'target_kl': None}. Best is trial 0 with value: -111.15117599999999.
[W 2024-01-01 06:11:09,736] Trial 1 failed with parameters: {'learning_rate': 0.0002104140946664786, 'exponent_n_steps': 8, 'exponent_batch_size': 6, 'n_epochs': 4, '1-gamma': 0.00023497568846401876, 'gae_lambda': 0.9588360473793212, 'ent_coef': 0.035969056628878116, 'target_kl': 0.02} because of the following error: KeyboardInterrupt().
Traceback (most recent call last):
  File "/usr/local/lib/python3.10/dist-packages/optuna/study/_optimize.py", line 200, in _run_trial
    value_or_values = func(trial)
  File "<ipython-input-1

Number of finished trials:  2
Best trial:
    Value: -111.15117599999999
  Params:
    learning_rate: 0.0006332020014163926
    exponent_n_steps: 10
    exponent_batch_size: 9
    n_epochs: 3
    1-gamma: 0.00021325260857701014
    gae_lambda: 0.9762677622387805
    ent_coef: 7.787768451566595e-05
    target_kl: None
  User attrs:
    n_steps: 1024
    batch_size: 512
    gamma: 0.999786747391423


ValueError: ignored


`should_run_async` will not call `transform_cell` automatically in the future. Please pass the result to `transformed_cell` argument and any exception that happen during thetransform in `preprocessing_exc_tuple` in IPython 7.17 and above.

