In [None]:
import gymnasium as gym
from gymnasium import spaces
import numpy as np


class XoptEnv(gym.Env):
    """
    One-dimensional Gymnasium environment for xopt-style optimization with RL.

    The state is a single scalar ``x``. Every action nudges ``x`` by
    ``action[0] * action_scale``; the result is clipped to the variable bounds
    and the reward is ``target_function(x)`` at the new position, so a
    reward-maximizing agent learns to move toward the maximizer of
    ``target_function``. Example: for f(x) = -(x-2)^2 + 4 the optimal value is
    4 at x=2.

    Episodes never terminate or truncate on their own (``step`` always returns
    ``terminated=False`` and ``truncated=False``); wrap the environment in a
    time limit if finite episodes are required.

    Args:
        target_function: Callable mapping the scalar ``x`` to the objective
            value that should be maximized.
        initial_x_range: ``(low, high)`` interval from which ``reset`` samples
            the starting ``x`` uniformly.
        action_scale: Multiplier that converts an action in [-1, 1] into a
            change of ``x``.
        x_bounds: ``(min, max)`` limits that ``x`` is clipped to after each
            step. Stored as ``self.variables["x"]``.
    """

    metadata = {"render_modes": ["human"], "render_fps": 30}

    def __init__(
        self,
        target_function,
        initial_x_range=(-5, 5),
        action_scale=0.1,
        x_bounds=(-5.0, 5.0),
    ):
        super().__init__()

        x_min, x_max = (float(bound) for bound in x_bounds)
        self.variables = {"x": [x_min, x_max]}  # xopt's variable boundaries

        self.target_function = target_function

        # The observation space has to contain every value x can take: the
        # starting point (drawn from initial_x_range) as well as any position
        # reachable by stepping (clipped to the variable bounds). Using only
        # initial_x_range would under-declare the space whenever it is
        # narrower than the bounds.
        init_low, init_high = initial_x_range
        low = np.array([min(x_min, init_low)], dtype=np.float32)
        high = np.array([max(x_max, init_high)], dtype=np.float32)
        self.observation_space = spaces.Box(
            low=low, high=high, shape=(1,), dtype=np.float32
        )

        self.action_scale = action_scale
        self.action_space = spaces.Box(
            low=np.array([-1.0], dtype=np.float32),
            high=np.array([1.0], dtype=np.float32),
            shape=(1,),
            dtype=np.float32,
        )

        # Stays None until reset() has been called.
        self.current_x = None
        self.initial_x_range = initial_x_range

    def _get_obs(self):
        """Return the current position as a float32 array of shape (1,)."""
        return np.array([self.current_x], dtype=np.float32)

    def _calculate_reward(self, x):
        """The reward is the objective value we want to maximize."""
        return self.target_function(x)

    def reset(self, seed=None, options=None):
        """
        Start a new episode from a random position.

        Seeds the environment's random generator via the parent ``reset`` and
        samples ``x`` uniformly from ``initial_x_range``.

        Returns:
            Tuple ``(observation, info)`` where ``info`` is an empty dict.
        """
        super().reset(seed=seed)

        # Sample a random starting x within the initial range
        self.current_x = float(
            self.np_random.uniform(
                low=self.initial_x_range[0], high=self.initial_x_range[1]
            )
        )

        observation = self._get_obs()
        info = {}
        return observation, info

    def step(self, action):
        """
        Move ``x`` by the scaled action, clip it to the bounds and evaluate.

        Args:
            action: Array-like whose first element is the unscaled step.

        Returns:
            Tuple ``(observation, reward, terminated, truncated, info)``.
            ``reward`` is the objective value at the new position and is also
            reported as ``info["objective_value"]``; ``terminated`` and
            ``truncated`` are always ``False``.

        Raises:
            RuntimeError: If called before ``reset``.
        """
        if self.current_x is None:
            raise RuntimeError("reset() must be called before step().")

        # Accept any array-like (list, 0-d or 1-d array) and work in Python
        # floats so the state does not inherit the action's float32 precision.
        raw_action = float(np.asarray(action).reshape(-1)[0])
        delta_x = raw_action * self.action_scale

        new_x = self.current_x + delta_x

        x_min, x_max = self.variables["x"]
        self.current_x = float(np.clip(new_x, x_min, x_max))

        # Plain float so downstream consumers do not receive numpy scalars.
        reward = float(self._calculate_reward(self.current_x))

        terminated = False
        truncated = False

        observation = self._get_obs()
        info = {"objective_value": reward}

        return observation, reward, terminated, truncated, info

In [None]:
from stable_baselines3 import PPO

# Number of environment copies (presumably for running PPO on a vectorized
# environment — confirm against the training code in this cell).
N_ENVS = 4


def target_function(x):
    """Return -(x - 2.0)**2 + 4.0, a downward parabola peaking at 4.0 for x = 2.0."""
    deviation = x - 2.0
    penalty = deviation ** 2
    return -penalty + 4.0


# Imported here so this block is self-contained; make_vec_env is part of
# stable_baselines3, which this cell already depends on.
from stable_baselines3.common.env_util import make_vec_env

# Build N_ENVS independent copies of the environment so PPO collects rollouts
# from several random starting points in parallel. Previously N_ENVS was unused
# and `vec_env` held a single, un-vectorized XoptEnv.
vec_env = make_vec_env(
    XoptEnv,
    n_envs=N_ENVS,
    env_kwargs={"target_function": target_function},
)

model = PPO(
    "MlpPolicy",
    vec_env,
    verbose=0,
    gamma=0.99,
    n_steps=256,  # steps collected per environment before each policy update
    ent_coef=0.01,
    device="auto",
)

TIMESTEPS = 25000
print(f"Starting PPO training for {TIMESTEPS} timesteps...")
model.learn(total_timesteps=TIMESTEPS)
print("Training complete.")

In [None]:
from xopt.generators.rl import RLModelGenerator
from xopt import Xopt, Evaluator


def objective_function(input_data: dict) -> dict:
    """Evaluate the objective for one candidate point.

    Looks up the value stored under "x" in ``input_data``, passes it to
    ``target_function`` and returns the result in a dict under the key "f".
    """
    return {"f": target_function(input_data["x"])}


# TODO This is currently unused. Needs to align with the env, the objective function, and the xopt model wrapper
# NOTE(review): "unused" does not match the code below, where `vocs` is passed to
# both RLModelGenerator and Xopt — presumably the generator does not consume it
# yet; confirm. The "x" bounds here must match the bounds used by XoptEnv, and
# the objective key "f" must match the key returned by objective_function.
vocs = {"variables": {"x": [-5.0, 5.0]}, "objectives": {"f": "MAXIMIZE"}}


# Create the RL-backed generator and hand it `model` (defined outside this block).
generator = RLModelGenerator(vocs=vocs)
generator.set_model(model)

# The evaluator calls objective_function for each candidate input.
evaluator = Evaluator(function=objective_function)

X = Xopt(generator=generator, evaluator=evaluator, vocs=vocs)
# Evaluation budget for the run (presumably X.run() stops once it is reached — confirm).
X.max_evaluations = 100


print("\n--- Xopt Configuration Summary ---")
print(X)

# Evaluate two randomly chosen points before starting the generator-driven run.
print("\nPerforming 2 random initial evaluations...")
X.random_evaluate(2)

print("\n--- Initial Optimization Results ---")
print(X.data)

# Start the optimization run using the generator and evaluator configured above.
X.run()