## Testing on-the-fly using trained DDQNs

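This notebook walks through testing a trained factored DDQN model: it loads a saved checkpoint, reads off the greedy parameter choice for every fitness value, and measures the resulting runtime over repeated seeded runs.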

## Import the necessary packages and developed modules

In [16]:
import sys

sys.path.append("..")
from onemax_mpdac.models.factored_ddqn import BranchingQNetwork
from onemax_mpdac.eval import ollga_mp_single_run
from joblib import Parallel, delayed
import torch
import numpy as np

## Example Testing Configuration
- problem_size: 500
- state_dim: 2
- net_arch: [50, 50]
- n_eval_episodes: 100 (one run per seed 0–99 in the final cell)
- num_workers: 4

## Initialize Network and Load the checkpoint


In [17]:
# Problem size; also selects the checkpoint file and the instance set name.
n = 500

# Network hyperparameters must match the ones the checkpoint was trained with,
# otherwise load_state_dict reports missing/unexpected keys or shape mismatches.
# NOTE(review): state_dim=2 matches the (n, f(x)) states built later in this
# notebook; action_dim=7 / n_action=4 presumably mean 7 choices for each of the
# 4 controlled parameters -- confirm against BranchingQNetwork.
q_net = BranchingQNetwork(
    state_dim=2,
    action_dim=7,
    n_action=4,
    net_arch=[50, 50],
)

# weights_only=True restricts unpickling to tensors and plain containers, so
# loading the checkpoint cannot execute arbitrary pickled code, and it avoids
# the warning recent torch releases emit when the argument is left implicit.
# map_location="cpu" lets a checkpoint that was saved on a GPU load on a
# CPU-only machine; the policy extraction below runs on CPU tensors anyway.
q_net.load_state_dict(
    torch.load(
        f"../resources/ddqn_ckpts/onemax_n{n}_fmp_as_09998.pt",
        map_location="cpu",
        weights_only=True,
    )
)

  q_net.load_state_dict(torch.load(f"../resources/ddqn_ckpts/onemax_n{n}_fmp_as_09998.pt"))


<All keys matched successfully>

## Set the configuration for the benchmark


In [18]:
# Benchmark settings handed to ollga_mp_single_run in the evaluation cell.
# The four inner lists of "action_choices" are indexed, in this order, as
# lambda1, mutation rate, lambda2 and crossover rate by
# get_actions_for_all_states.
bench_params = {
    "name": "OLLGAFactTheory",
    "discrete_action": True,
    "action_choices": [
        [2**exp for exp in range(7)],  # 1, 2, 4, 8, 16, 32, 64
        [0.25, 0.542, 0.833, 1.125, 1.417, 1.708, 2.0],
        [2**exp for exp in range(7)],  # 1, 2, 4, 8, 16, 32, 64
        [0.25, 0.542, 0.833, 1.125, 1.417, 1.708, 2.0],
    ],
    "problem": "OneMax",
    # The instance set is chosen by the problem size n defined earlier.
    "instance_set_path": f"om_ollga_{n}_medium.csv",
    "observation_description": "n,f(x)",
    "reward_choice": "imp_minus_evals_shifting",
    "alias": "evenly_spread",
    "seed": 123,
}

# Settings passed separately as eval_env_params to ollga_mp_single_run; note
# that "reward_choice" here differs from the one in bench_params.
eval_env_params = {
    "reward_choice": "minus_evals",
    "cutoff": 1e5,
}

## Get the policy from the learned Factored DDQN


In [19]:
def get_actions_for_all_states(bench_params: dict, model: BranchingQNetwork, n: int):
    """Build the greedy parameter table for every fitness value below ``n``.

    The model is queried once on the batch of states ``[n, fx]`` for
    ``fx = 0 .. n - 1``. For each state, the arg-max over the last axis of the
    model output selects one index per branch, and each index is looked up in
    the matching list of ``bench_params["action_choices"]``.

    Args:
        bench_params: Benchmark settings; only ``"action_choices"`` is read.
            Its entries 0..3 hold the candidate values for lambda1, mutation
            rate, lambda2 and crossover rate, respectively.
        model: Callable mapping a float tensor of states to Q-values.
            Assumes the output is 3-D with exactly four branches on axis 1
            and the per-branch choices on axis 2 -- TODO confirm against
            BranchingQNetwork.
        n: Problem size; also the number of states evaluated.

    Returns:
        A list with one entry per fitness value, each of the form
        ``[np.int64 lambda1, np.float64 mutation_rate,
        np.int64 lambda2, np.float64 crossover_rate]``.
    """
    # One row per fitness value; the first column is the constant problem size.
    state_grid = torch.tensor(np.array([(n, fx) for fx in range(n)])).float()

    # Inference only: no gradients are needed to read off the greedy indices.
    with torch.no_grad():
        greedy_indices = model(state_grid).argmax(dim=2).cpu().numpy().tolist()

    policy = []
    for lbd1_idx, mr_idx, lbd2_idx, cr_idx in greedy_indices:
        choice_table = bench_params["action_choices"]
        row = [
            np.int64(choice_table[0][lbd1_idx]),
            np.float64(choice_table[1][mr_idx]),
            np.int64(choice_table[2][lbd2_idx]),
            np.float64(choice_table[3][cr_idx]),
        ]
        policy.append(row)
    return policy


# Greedy parameter table: one entry per fitness value 0 .. n - 1.
policy = get_actions_for_all_states(
    bench_params=bench_params,
    model=q_net,
    n=n,
)

## Run the test and observe the expected running time (ERT)

In [21]:
# Evaluate the fixed policy in 100 independent runs spread over 4 workers.
# Each run receives its own seed (0 .. 99); results are collected into an array.
runtimes = np.array(
    Parallel(n_jobs=4)(
        delayed(ollga_mp_single_run)(
            bench_params=bench_params,
            eval_env_params=eval_env_params,
            policy=policy,
            seed=run_seed,
        )
        for run_seed in range(100)
    )
)

# Report mean and standard deviation over the runs.
print(f"Runtime: {runtimes.mean():.2f} ± {runtimes.std():.2f}")

Runtime: 2423.48 ± 229.28
