In [1]:
from pathlib import Path

import yaml
import optuna
from optuna.samplers import RandomSampler
from stable_baselines3 import PPO
from stable_baselines3.common.evaluation import evaluate_policy

from megai_man.env import make_venv
from megai_man.callbacks import (
    StageLoggingCallback,
    TrainingStatsLoggerCallback,
    StopTrainingOnTimeBudget,
)

In [2]:
def sample_params(trial: optuna.Trial):
    """Draw one PPO hyperparameter configuration from the search space.

    Every value is requested from ``trial``. The suggestions are issued in
    the same order as the keys of the returned dict.

    Args:
        trial: Optuna trial used to suggest each hyperparameter.

    Returns:
        dict: Mapping of hyperparameter name to the suggested value.
    """
    return {
        # Rollout length and minibatch size.
        "n_steps": trial.suggest_categorical(
            "n_steps", [128, 256, 512, 1024, 2048]
        ),
        "batch_size": trial.suggest_categorical("batch_size", [32, 64, 128]),
        # Log-uniform: the range spans four orders of magnitude.
        "learning_rate": trial.suggest_float(
            "learning_rate", 1e-6, 1e-2, log=True
        ),
        # Discretised range: 0.1 to 0.3 in steps of 0.1.
        "clip_range": trial.suggest_float("clip_range", 0.1, 0.3, step=0.1),
        "vf_coef": trial.suggest_float("vf_coef", 0.5, 1.0),
        "ent_coef": trial.suggest_float("ent_coef", 1e-5, 1e-1, log=True),
        "gae_lambda": trial.suggest_float("gae_lambda", 0.9, 1.0, log=True),
        "n_epochs": trial.suggest_int("n_epochs", 4, 10),
        "gamma": trial.suggest_categorical("gamma", [0.99, 0.995, 0.999]),
        "max_grad_norm": trial.suggest_float("max_grad_norm", 0.5, 1),
    }


def optimizer(tensorboard_log: str, sample_fn, timesteps_per_trial=1_000_000):
    """Build the per-trial objective used by the hyperparameter search.

    Args:
        tensorboard_log: Directory handed to ``PPO`` as ``tensorboard_log``.
        sample_fn: Callable taking the trial and returning a dict of extra
            keyword arguments for ``PPO``.
        timesteps_per_trial: Timestep count passed to ``model.learn`` for
            each trial.

    Returns:
        A callable ``optimize_agent(trial)`` that trains a PPO model, runs a
        single deterministic evaluation episode and returns the mean reward
        reported by ``evaluate_policy``.
    """

    def optimize_agent(trial):
        """Train and evaluate one sampled configuration; return its reward."""
        env_kwargs = {
            "n_envs": 8,
            "state": "CutMan",
            "screen": None,
            "frameskip": 4,
            "frame_stack": 4,
            "truncate_if_no_improvement": True,
            "obs_space": "screen",
            "action_space": "multi_discrete",
            "crop_img": False,
            "invincible": False,
            "no_enemies": False,
            "render_mode": None,
            "damage_terminate": False,
            "fixed_damage_punishment": 0.05,
            "forward_factor": 0.05,
            "backward_factor": 0.055,
            "time_punishment_factor": 0,
            "multi_input": True,
            "curriculum": False,
            "screen_rewards": False,
            "score_reward": 0,
            "distance_only_on_ground": True,
            "term_back_screen": True,
            "_enforce_subproc": False,
        }

        venv = make_venv(**env_kwargs)
        try:
            model_params = sample_fn(trial)
            model = PPO(
                policy="MultiInputPolicy",
                env=venv,
                tensorboard_log=tensorboard_log,
                verbose=0,
                seed=666,
                device="cuda",
                **model_params,
            )
            model.learn(
                timesteps_per_trial,
                callback=[
                    StageLoggingCallback(),
                    TrainingStatsLoggerCallback(),
                    StopTrainingOnTimeBudget(budget=60 * 60),  # 1 hour
                ],
                log_interval=1,
            )
        finally:
            # Always release the training environments, even when sampling,
            # model construction or training raises; otherwise they stay
            # alive in the kernel after a failed trial.
            venv.close()

        # Evaluate on a single environment built with the same settings.
        eval_venv = make_venv(**{**env_kwargs, "n_envs": 1})
        try:
            reward, _ = evaluate_policy(
                model,
                eval_venv,
                n_eval_episodes=1,
                deterministic=True,
            )
        finally:
            # Same guarantee for the evaluation environment.
            eval_venv.close()
        return reward

    return optimize_agent


def tune(sample_fn, name, n_trials=500, timesteps_per_trial=1_000_000):
    """Run (or resume) a random-search study and dump its best parameters.

    The study is stored in ``studies/{name}.db`` and reloaded if it already
    exists. Trials already in state ``COMPLETE`` count towards ``n_trials``,
    so only the remainder is executed. The best parameters are written to
    ``{name}_opt.yml`` in the current working directory.

    Args:
        sample_fn: Callable taking a trial and returning model keyword
            arguments; forwarded to ``optimizer``.
        name: Study name; also used for the database file, the log directory
            (``logs/{name}``) and the output YAML file.
        n_trials: Total number of completed trials the study should reach.
        timesteps_per_trial: Training timesteps per trial, forwarded to
            ``optimizer``.
    """
    db_path = f"studies/{name}.db"
    # The storage directory must exist before the database file is touched.
    Path(db_path).parent.mkdir(parents=True, exist_ok=True)
    Path(db_path).touch(exist_ok=True)

    study = optuna.create_study(
        storage=f"sqlite:///{db_path}",
        sampler=RandomSampler(seed=666),
        study_name=name,
        direction="maximize",
        load_if_exists=True,
    )

    completed = sum(1 for t in study.trials if t.state.name == "COMPLETE")
    # Clamp at zero: a study that already exceeds the target must take the
    # "already finished" branch rather than pass a negative count on.
    remaining = max(n_trials - completed, 0)

    if remaining:
        study.optimize(
            optimizer(f"logs/{name}", sample_fn, timesteps_per_trial),
            n_trials=remaining,
            n_jobs=1,
            gc_after_trial=True,
            show_progress_bar=True,
        )
    else:
        print("Total trials already finished.")

    # Resolve the best parameters before opening the file, so that a failure
    # here (presumably when no trial has completed -- confirm against Optuna)
    # does not truncate an existing results file.
    best_params = study.best_params
    with open(f"{name}_opt.yml", "w") as fp:
        yaml.safe_dump(best_params, fp)


# Launch (or resume) the CutMan random search. `n_trials` is the overall
# target: trials already completed in the stored study are subtracted by
# `tune` before optimisation starts.
tune(
    sample_params,
    "cutman_random_searcher",
    n_trials=100,
    timesteps_per_trial=5_000_000,
)

[I 2024-06-30 16:35:31,437] Using an existing study with name 'cutman_random_searcher' instead of creating a new one.


  0%|          | 0/84 [00:00<?, ?it/s]