In [1]:
import keras
import numpy as np
import gymnasium as gym
import tensorflow as tf
import tqdm.notebook as tqdm
import matplotlib.pyplot as plt
import gymnasium.utils.save_video

2025-04-13 12:44:22.278534: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1744512262.293209  141991 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1744512262.301517  141991 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1744512262.324159  141991 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1744512262.324183  141991 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1744512262.324185  141991 computation_placer.cc:177] computation placer alr

In [2]:
gpus = tf.config.experimental.list_physical_devices('GPU')
if gpus:
    try:
        for gpu in gpus:
              tf.config.experimental.set_memory_growth(gpu, True)
    except RuntimeError as e:
        print(e)

In [8]:
num_states = saving_env.single_observation_space.shape[0]
print("Size of State Space ->  {}".format(num_states))
num_actions = saving_env.single_action_space.shape[0]
print("Size of Action Space ->  {}".format(num_actions))

upper_bound = saving_env.single_action_space.high[0]
lower_bound = saving_env.single_action_space.low[0]

print("Max Value of Action ->  {}".format(upper_bound))
print("Min Value of Action ->  {}".format(lower_bound))

Size of State Space ->  24
Size of Action Space ->  4
Max Value of Action ->  1.0
Min Value of Action ->  -1.0


In [9]:
rng = np.random.default_rng()

In [10]:
class ReplayBuffer:

    def __init__(self, max_size: int, observation_space: gym.spaces.Space, action_space: gym.spaces.Space, seed: int | None = 42):
        """Stores the replay history with a maximum of `max_size` entries, removing old entries as needed.

        Parameters:
            max_size: maximal number of entries to keep
            observation_space: specification of the observation space
            action_space: specification of the action space
            seed: seed to initialize the internal random number generator for reproducibility"""

        self.current_observations = np.zeros((max_size, *observation_space.shape), dtype=observation_space.dtype)
        self.next_observations = np.zeros((max_size, *observation_space.shape), dtype=observation_space.dtype)
        self.actions = np.zeros((max_size, *action_space.shape), dtype=action_space.dtype)
        self.rewards = np.zeros((max_size,), dtype=np.float32)
        self.dones = np.zeros((max_size,), dtype=np.float32)
        
        self.max_size = max_size
        self.rng = np.random.default_rng(seed=seed)
        self.buffer_pointer = 0
        self.current_size = 0
        
    def add(self, current_observations: np.ndarray, actions: np.ndarray, rewards: np.ndarray, next_observations: np.ndarray, dones: np.ndarray) -> None:
        """Add a new entry to the buffer.

        Parameters:
            current_observations: environment state observed at the current step
            actions: action taken by the model
            rewards: reward received after taking the action
            next_observations: environment state obversed after taking the action
            dones: whether the episode has ended or not"""

        batch_size = current_observations.shape[0]
        idxs = (np.arange(batch_size) + self.buffer_pointer) % self.max_size

        self.current_observations[idxs] = current_observations
        self.actions[idxs] = actions
        self.rewards[idxs] = rewards
        self.next_observations[idxs] = next_observations
        self.dones[idxs] = dones

        self.buffer_pointer = (idxs[-1] + 1) % self.max_size
        self.current_size = min(self.max_size, self.current_size + batch_size)
    
    def sample(self, n_samples: int, replace: bool = True) -> tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray, np.ndarray]:
        """Randomly samples `n_samples` from the buffer.

        Parameters:
            n_samples: number of samples to select
            replace: sample with or without replacement

        Returns:
            current observations, actions, rewards, next observations, dones"""

        return self[self.rng.choice(self.current_size, size=n_samples, replace=replace)]

    def clear(self) -> None:
        """Clears the buffer"""

        self.buffer_pointer = 0
        self.current_size = 0

    def __getitem__(self, index: int | np.ndarray) -> tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray, np.ndarray]:
        """Gets a sample at `index`

        Parameters:
            index: index of the sample to get

        Returns:
            current observations, actions, rewards, next observations, dones"""

        return (
            self.current_observations[index],
            self.actions[index],
            self.rewards[index],
            self.next_observations[index],
            self.dones[index]
        )
        
    def __len__(self) -> int:
        """Returns the number of entries in the buffer"""

        return self.current_size

In [571]:
def play_game(
    model: keras.Model,
    buffer: ReplayBuffer,
    env: gym.Env,
    steps: int,
    observations: np.ndarray | None = None,
    one_episode: bool = False,
    no_noise: bool = False
) -> np.ndarray:
    """Plays the environment `env` using model `model` for a total of `steps` steps.

    Arguments:
        model: model to use
        buffer: buffer to store results to
        env: environment to use
        steps: total number of steps to record
        observations: observation to start from
        one_episode: exist as soon as one of the environments finishes

    Returns:
        the last observations
    """

    if observations is None:
        observations, _ = env.reset()

    for _ in range(steps // env.num_envs):
        actions = model(observations, training=False).numpy()
        
        new_observations, rewards, terminateds, truncated, _ = env.step(actions)

        equal_observations = np.all(np.isclose(observations, new_observations), axis=1)

        dones = terminateds | truncated | equal_observations

        rewards = np.where(equal_observations, -100, rewards)

        rewards = np.where(rewards == -100, -50, rewards)

        print(rewards)
        
        buffer.add(
            current_observations=observations,
            actions=actions,
            rewards=rewards,
            next_observations=new_observations,
            dones=dones,
        )

        if one_episode and np.any(dones):
            observations = None
            break
        
        observations = new_observations
    
    return observations

In [638]:
actor_model = keras.models.load_model('./history/actor-model-78.keras')

In [639]:
saving_steps = 1000

In [640]:
buffer_size = 2 ** 16

In [641]:
saving_env = gym.make_vec("BipedalWalker-v3", hardcore=False, render_mode="rgb_array_list", num_envs=1)

In [642]:
def save_gameplay(
    model: tf.keras.Model,
    max_steps: int = 1000,
    env: gym.Env | None = None,
):
    save_buffer = ReplayBuffer(
        max_size=buffer_size,
        observation_space=saving_env.single_observation_space,
        action_space=saving_env.single_action_space
    )
    
    _ = play_game(
        model=model,
        env=env,
        steps=max_steps,
        buffer=save_buffer,
        one_episode=True
    )

    render = env.render()

    gym.utils.save_video.save_video(
        frames=render[0],
        video_folder="videos",
        fps=env.metadata["render_fps"],
    )

    return render

In [643]:
res = save_gameplay(actor_model, max_steps=saving_steps, env=saving_env)

[-0.1689446]
[-0.31295511]
[-0.35015011]
[-0.29425481]
[-0.25102684]
[-0.22957689]
[-0.2116753]
[-0.25465897]
[-0.28279477]
[-0.28274065]
[-0.15310231]
[-0.229366]
[-0.19697209]
[-0.16072549]
[-0.14549541]
[-0.21468282]
[-0.11527488]
[-0.19352396]
[-0.09312549]
[-0.1721883]
[-0.071459]
[-0.15067394]
[-0.05040178]
[-0.04100777]
[-0.10877983]
[-0.15129691]
[-0.02252436]
[-0.08263274]
[-0.03441077]
[0.07673378]
[0.07246058]
[0.10863782]
[0.10025559]
[0.15275899]
[0.21078253]
[0.14180039]
[0.08141942]
[0.1450213]
[0.1042174]
[0.04458494]
[0.06687116]
[-0.00880321]
[-0.00289559]
[0.04605253]
[0.10211919]
[0.16023058]
[0.12038131]
[0.06151672]
[0.00344715]
[-0.05462335]
[-0.11385649]
[-0.1721641]
[-0.23256207]
[-0.13813214]
[-0.07586938]
[0.00659382]
[0.08984207]
[0.04957263]
[-0.11482409]
[-0.20378612]
[-0.17127663]
[-0.22279167]
[-0.2866472]
[-0.31953219]
[-0.2122843]
[-0.43288282]
[-0.41121063]
[-0.41753766]
[-0.3705633]
[-0.26981747]
[-0.35662013]
[-0.4035725]
[-0.39259273]
[-0.40233037]