* https://github.com/araffin/rl-tutorial-jnrr19

In [1]:
from rich import print, pretty
pretty.install()
import gymnasium as gym
import numpy as np
from stable_baselines3 import PPO

In [2]:
# Create the CartPole-v1 environment, then a PPO agent that uses the
# "MlpPolicy" policy on that environment.
env = gym.make("CartPole-v1")
model = PPO(policy="MlpPolicy", env=env, verbose=0)

In [3]:
# helper function to evaluate the algorithm
from stable_baselines3.common.base_class import BaseAlgorithm

def evaluate(
    model: BaseAlgorithm,
    num_episodes: int = 100,
    deterministic: bool = True,
) -> float:
    """
    Evaluate an RL agent for `num_episodes` on the environment attached to it.

    The environment is obtained from `model.get_env()`; it must be a
    vectorized environment that wraps exactly one sub-environment.

    :param model: the RL Agent
    :param num_episodes: the number of episodes to evaluate it
    :param deterministic: whether to use deterministic or stochastic actions
    :return: Mean reward for the last `num_episodes`
    :raises ValueError: if the model has no environment attached, or if the
        attached vectorized environment holds more than one sub-environment
    """
    vec_env = model.get_env()
    if vec_env is None:
        raise ValueError(
            "The model has no environment attached; create it with an `env` "
            "before calling `evaluate`."
        )
    # The episode loop below reads a single `done` flag, so it only works
    # for a single environment.
    num_envs = getattr(vec_env, "num_envs", 1)
    if num_envs != 1:
        raise ValueError(
            f"`evaluate` only supports a single environment, got {num_envs}."
        )

    # Reset once up front; the loop relies on the vectorized env starting a
    # new episode by itself after `done` (as Stable-Baselines3 VecEnvs do).
    obs = vec_env.reset()
    all_episode_rewards = []
    for _ in range(num_episodes):
        episode_reward = 0.0
        done = False
        while not done:
            # _states are only useful if using LSTM policies
            # `deterministic` is to use deterministic actions
            action, _states = model.predict(obs, deterministic=deterministic)
            # rewards and dones are arrays because we are using a
            # vectorized env; index 0 is the only sub-environment
            obs, rewards, dones, _infos = vec_env.step(action)
            episode_reward += float(rewards[0])
            done = bool(dones[0])
        all_episode_rewards.append(episode_reward)

    # Convert to a built-in float so the result matches the annotation
    mean_episode_reward = float(np.mean(all_episode_rewards))
    print(f"Mean reward: {mean_episode_reward:.2f} - Num episodes: {num_episodes}")

    return mean_episode_reward

### Evaluate agent performance before training

In [4]:
# Baseline: score the agent before any training has been done
mean_reward_before_train = evaluate(
    model,
    num_episodes=100,
    deterministic=True,
)

In [5]:
# a similar function to the `evaluate` helper is provided by stable baselines
from stable_baselines3.common.evaluation import evaluate_policy

# Start again from a fresh environment and a fresh PPO agent
env = gym.make("CartPole-v1")
model = PPO(policy="MlpPolicy", env=env, verbose=0)

# Score the agent over 100 episodes and report mean and spread of the reward
mean_reward, std_reward = evaluate_policy(
    model,
    env,
    n_eval_episodes=100,
    deterministic=True,
    warn=False,
)
print(f"mean_reward: {mean_reward:.2f} +/- {std_reward:.2f}")

### Train the agent and evaluate it

In [6]:
# Train the agent for 10,000 timesteps
model.learn(total_timesteps=10_000)

# Score the trained agent over 100 episodes
mean_reward, std_reward = evaluate_policy(
    model,
    env,
    n_eval_episodes=100,
    deterministic=True,
    warn=False,
)
print(f"mean_reward: {mean_reward:.2f} +/- {std_reward:.2f}")

In [7]:
import base64
from pathlib import Path

from IPython import display as ipythondisplay


def show_videos(video_path="", prefix=""):
    """
    Embed every matching ``.mp4`` file of a folder as an inline HTML video.

    Taken from https://github.com/eleurent/highway-env

    :param video_path: (str) Path to the folder containing videos
    :param prefix: (str) Filter the videos, showing only the ones whose file
        name starts with this prefix
    """
    # Imported locally under an alias so the block stays self-contained and
    # the name does not clash with variables called `html`.
    from html import escape as escape_html

    video_tags = []
    # Sort the matches so the videos are always shown in the same order; the
    # order in which glob() yields entries is not specified.
    for mp4 in sorted(Path(video_path).glob("{}*.mp4".format(prefix))):
        # Inline the file content as a base64 data URI
        video_b64 = base64.b64encode(mp4.read_bytes()).decode("ascii")
        video_tags.append(
            """<video alt="{}" autoplay
                    loop controls style="height: 400px;">
                    <source src="data:video/mp4;base64,{}" type="video/mp4" />
                </video>""".format(
                # Escape the path: it is placed inside a quoted HTML attribute
                escape_html(str(mp4)),
                video_b64,
            )
        )
    ipythondisplay.display(ipythondisplay.HTML(data="<br>".join(video_tags)))

from stable_baselines3.common.vec_env import VecVideoRecorder, DummyVecEnv


def record_video(env_id, model, video_length=500, prefix="", video_folder="videos/"):
    """
    Run `model` in a freshly created `env_id` environment while recording it.

    :param env_id: (str) id passed to `gym.make` to build the environment
    :param model: (RL model) agent whose `predict` method picks the actions
    :param video_length: (int) number of steps to run and record
    :param prefix: (str) name prefix given to the video recorder
    :param video_folder: (str) folder given to the video recorder
    """
    eval_env = DummyVecEnv([lambda: gym.make(env_id, render_mode="rgb_array")])
    # Start the video at step=0 and record `video_length` steps
    eval_env = VecVideoRecorder(
        eval_env,
        video_folder=video_folder,
        record_video_trigger=lambda step: step == 0,
        video_length=video_length,
        name_prefix=prefix,
    )

    try:
        obs = eval_env.reset()
        for _ in range(video_length):
            action, _ = model.predict(obs)
            obs, _, _, _ = eval_env.step(action)
    finally:
        # Always close the video recorder, even if the rollout raises, so it
        # is never left open
        eval_env.close()

In [11]:
# Record 500 steps of the agent on CartPole, then display every video in
# the "videos" folder whose name starts with "ppo"
record_video(
    env_id="CartPole-v1",
    model=model,
    video_length=500,
    prefix="ppo-cartpole",
)
show_videos(video_path="videos", prefix="ppo")

Saving video to /Users/uche/Documents/Projects/RL/videos/ppo-cartpole-step-0-to-step-500.mp4
Moviepy - Building video /Users/uche/Documents/Projects/RL/videos/ppo-cartpole-step-0-to-step-500.mp4.
Moviepy - Writing video /Users/uche/Documents/Projects/RL/videos/ppo-cartpole-step-0-to-step-500.mp4



                                                                                                                                                                         

Moviepy - Done !
Moviepy - video ready /Users/uche/Documents/Projects/RL/videos/ppo-cartpole-step-0-to-step-500.mp4
