In [2]:
import gymnasium as gym
from tqdm import tqdm
import numpy as np

print(f"{gym.__version__}")

0.29.1


In [3]:
import torch as th
import torch.nn as nn
import torch.optim as optim
from torch.nn import functional as F
from torch.optim.lr_scheduler import StepLR

In [4]:
from stable_baselines3 import PPO, A2C, SAC, TD3
from stable_baselines3.common.evaluation import evaluate_policy

In [5]:
# Example for continuous actions
# env_id = "LunarLanderContinuous-v2"

# Example for discrete actions
env_id = "Pendulum-v1"

In [6]:
env = gym.make(env_id, render_mode="human")

## Train Expert Model

We create an expert RL agent and let it learn to solve a task by interacting with the evironment.


In [7]:
# ppo_expert = TD3("MlpPolicy", env_id, verbose=1)
# ppo_expert.learn(total_timesteps=1e5)
# ppo_expert.save("ppo_expert")
ppo_expert = TD3.load("ppo_expert", env)
# ppo_expert.learn(total_timesteps=1e5)
# ppo_expert.save("ppo_expert")

Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.


check the performance of the trained agent

In [8]:
env = gym.make(env_id)

In [9]:
mean_reward, std_reward = evaluate_policy(ppo_expert, env, n_eval_episodes=10)

print(f"Mean reward = {mean_reward} +/- {std_reward}")



Mean reward = -289.604587072134 +/- 104.41954344153768


## Create Student

We also create a student RL agent, which will later be trained with the expert dataset


In [10]:
student = TD3("MlpPolicy", env_id, verbose=1)

Using cuda device
Creating environment from the given name 'Pendulum-v1'
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.


In [11]:
# only valid for continuous actions
# sac_student = SAC('MlpPolicy', env_id, verbose=1, policy_kwargs=dict(net_arch=[64, 64]))


We now let our expert interact with the environment (except we already have expert data) and store resultant expert observations and actions to build an expert dataset.


In [12]:
num_interactions = int(4e4 + 1)
# num_interactions = int(510)

In [13]:
def calculate_returns(rewards, dones, discount_factor=0.99):
    returns = []
    G = 0  # Initialize the return for the current episode

    # Iterate backwards through rewards
    for reward, done in zip(reversed(rewards), reversed(dones)):
        if done:
            G = 0  # Reset the return at the end of each episode
        G = reward + discount_factor * G  # Update the return
        returns.insert(0, G)  # Insert the return at the beginning of the list

    return returns

In [33]:
expert_observations = np.empty((num_interactions,) + env.observation_space.shape, dtype=np.float32)
expert_actions = np.empty((num_interactions,) + (env.action_space.shape[0],), dtype=np.float32)
expert_rewards = np.empty((num_interactions,) + (1,), dtype=np.float32)
expert_dones = np.empty((num_interactions,) + (1,), dtype=np.float32)
expert_next_observations = np.empty((num_interactions,) + env.observation_space.shape, dtype=np.float32)

obs, _ = env.reset()

for i in tqdm(range(num_interactions)):
    action, _ = ppo_expert.predict(obs, deterministic=True)
    expert_observations[i] = obs
    expert_actions[i] = action
    obs, reward, terminated, truncated, info = env.step(action)
    expert_rewards[i] = reward
    expert_dones[i] = terminated * (1 - truncated)
    if terminated or truncated:
        obs, _ = env.reset()
expert_next_observations = expert_observations[1:].copy()
expert_observations = expert_observations[:-1]
expert_actions = expert_actions[:-1]
expert_rewards = expert_rewards[:-1]
expert_dones = expert_dones[:-1]

# expert_returns = calculate_returns(expert_returns, expert_dones)
# for idx, done in enumerate(expert_dones):
#     if done:
#         print(idx)
#         break
# print(expert_returns[:idx+3])

np.savez_compressed(
    "expert_data",
    expert_observations=expert_observations,
    expert_actions=expert_actions,
    expert_rewards=expert_rewards,
    expert_dones=expert_dones,
    expert_next_observations=expert_next_observations,
)

# # Load the data
# data = np.load("expert_data.npz")

# # Access the saved arrays using their keys
# expert_observations = data['expert_observations']
# expert_actions = data['expert_actions']
# expert_rewards = data['expert_rewards']
# expert_dones = data['expert_dones']
# expert_next_observations = data['expert_next_observations']

100%|██████████| 40001/40001 [00:37<00:00, 1076.15it/s]


In [34]:
print(expert_observations[0].dtype)
print(expert_next_observations[0].dtype)
print(expert_actions[0].dtype)
print(expert_rewards[0].dtype)
print(expert_dones[0].dtype)

float32
float32
float32
float32
float32




- To seamlessly use PyTorch in the training process, we subclass an `ExpertDataset` from PyTorch's base `Dataset`.
- Note that we initialize the dataset with the previously generated expert observations and actions.
- We further implement Python's `__getitem__` and `__len__` magic functions to allow PyTorch's dataset-handling to access arbitrary rows in the dataset and inform it about the length of the dataset.
- For more information about PyTorch's datasets, you can read: https://pytorch.org/docs/stable/data.html.


In [15]:
from torch.utils.data.dataset import Dataset, random_split

In [16]:
class ExpertDataSet(Dataset):
    def __init__(
        self,
        expert_observations,
        expert_actions,
        expert_rewards,
        expert_dones,
        expert_next_observations,
    ):
        self.observations = expert_observations
        self.actions = expert_actions
        self.rewards = expert_rewards
        self.dones = expert_dones
        self.next_observations = expert_next_observations

    def __getitem__(self, index):
        return (
            self.observations[index],
            self.actions[index],
            self.rewards[index],
            self.dones[index],
            self.next_observations[index],
        )

    def __len__(self):
        return len(self.dones)



We now instantiate the `ExpertDataSet` and split it into training and test datasets.


In [35]:
expert_dataset = ExpertDataSet(
    expert_observations,
    expert_actions,
    expert_rewards,
    expert_dones,
    expert_next_observations,
)



train_size = int(1 * len(expert_dataset))



test_size = len(expert_dataset) - train_size



train_expert_dataset, test_expert_dataset = random_split(
    expert_dataset, [train_size, test_size]
)

In [36]:
print("test_expert_dataset: ", len(test_expert_dataset))
print("train_expert_dataset: ", len(train_expert_dataset))

test_expert_dataset:  0
train_expert_dataset:  40000




NOTE: The supervised learning section of this code is adapted from: https://github.com/pytorch/examples/blob/master/mnist/main.py
1. We extract the policy network of our RL student agent.
2. We load the (labeled) expert dataset containing expert observations as inputs and expert actions as targets.
3. We perform supervised learning, that is, we adjust the policy network's parameters such that given expert observations as inputs to the network, its outputs match the targets (expert actions).
By training the policy network in this way the corresponding RL student agent is taught to behave like the expert agent that was used to created the expert dataset (Behavior Cloning).


In [19]:
# def pretrain_agent(
#     student,
#     batch_size=64,
#     epochs=1000,
#     learning_rate=0.001,
#     log_interval=100,
#     no_cuda=False,
#     seed=1,
#     test_batch_size=64,
# ):
#     use_cuda = not no_cuda and th.cuda.is_available()
#     th.manual_seed(seed)
#     device = th.device("cuda" if use_cuda else "cpu")
#     kwargs = {"num_workers": 1, "pin_memory": True} if use_cuda else {}

#     if isinstance(env.action_space, gym.spaces.Box):
#         criterion = nn.MSELoss()
#     else:
#         criterion = nn.CrossEntropyLoss()

#     # Extract initial policy
#     actor = student.policy.actor.to(device)
#     critic = student.policy.critic.to(device)

#     def train(actor, critic, device, train_loader, actor_optimizer, critic_optimizer):
#         actor.train()
#         critic.train()

#         for batch_idx, (data, target_action, target_return) in enumerate(train_loader):
#             data, target_action, target_return = (
#                 data.to(device),
#                 target_action.to(device),
#                 target_return.to(device),
#             )

#             target_action = target_action.float()
#             target_return = target_return.float()

#             action = actor(data)
#             actor_loss = criterion(action, target_action)

#             actor_optimizer.zero_grad()
#             actor_loss.backward()
#             actor_optimizer.step()

#             current_returns = critic(data, target_action)
#             critic_loss = sum(
#                 F.mse_loss(current_return, target_return)
#                 for current_return in current_returns
#             )

#             critic_optimizer.zero_grad()
#             critic_loss.backward()
#             critic_optimizer.step()

#             if batch_idx % log_interval == 0:
#                 print(
#                     "Train Epoch: {} [{}/{} ({:.0f}%)]\tActor Loss: {:.6f}\tCritic Loss: {:.6f}".format(
#                         epoch,
#                         batch_idx * len(data),
#                         len(train_loader.dataset),
#                         100.0 * batch_idx / len(train_loader),
#                         actor_loss.item(),
#                         critic_loss.item(),
#                     )
#                 )

#     def test(actor, critic, device, test_loader):
#         actor.eval()
#         critic.eval()
#         actor_loss = 0
#         critic_loss = 0
#         with th.no_grad():
#             for data, target_action, target_return in test_loader:
#                 data, target_action, target_return = (
#                     data.to(device),
#                     target_action.to(device),
#                     target_return.to(device),
#                 )

#                 target_action = target_action.float()
#                 target_return = target_return.float()

#                 action = actor(data)
#                 actor_loss += criterion(action, target_action)
#                 current_returns = critic(data, target_action)
#                 critic_loss += sum(
#                     F.mse_loss(current_return, target_return)
#                     for current_return in current_returns
#                 )

#         actor_loss /= len(test_loader.dataset)
#         critic_loss /= len(test_loader.dataset)
#         print(
#             "\nTest set: Average actor loss: {:.4f}, Average critic loss: {:.4f}\n".format(
#                 actor_loss.item(), critic_loss.item()
#             )
#         )

#     # Here, we use PyTorch `DataLoader` to our load previously created `ExpertDataset` for training
#     # and testing
#     train_loader = th.utils.data.DataLoader(
#         dataset=train_expert_dataset, batch_size=batch_size, shuffle=True, **kwargs
#     )
#     test_loader = th.utils.data.DataLoader(
#         dataset=test_expert_dataset,
#         batch_size=test_batch_size,
#         shuffle=True,
#         **kwargs,
#     )

#     # Define an Optimizer and a learning rate schedule.
#     actor_optimizer = optim.Adam(actor.parameters(), lr=learning_rate)
#     critic_optimizer = optim.Adam(critic.parameters(), lr=learning_rate)

#     # Now we are finally ready to train the policy model.
#     for epoch in range(1, epochs + 1):
#         train(actor, critic, device, train_loader, actor_optimizer, critic_optimizer)
#         test(actor, critic, device, test_loader)

#     # Implant the trained policy network back into the RL student agent
#     student.policy.actor = actor
#     student.policy.critic = critic

In [None]:
def pretrain_agent(
    student,
    batch_size=64,
    epochs=1000,
    learning_rate=0.001,
    log_interval=100,
    no_cuda=False,
    seed=1,
    test_batch_size=64,
):
    use_cuda = not no_cuda and th.cuda.is_available()
    th.manual_seed(seed)
    device = th.device("cuda" if use_cuda else "cpu")
    kwargs = {"num_workers": 1, "pin_memory": True} if use_cuda else {}

    # Extract initial policy
    actor = student.policy.actor.to(device)
    critic = student.policy.critic.to(device)

    def train(actor, critic, device, train_loader, actor_optimizer, critic_optimizer):
        actor.train()
        critic.train()

        for batch_idx, (data, target_action, target_return) in enumerate(train_loader):
            data, target_action, target_return = (
                data.to(device),
                target_action.to(device),
                target_return.to(device),
            )

            target_action = target_action.float()
            target_return = target_return.float()

            action = actor(data)
            actor_loss = F.mse_loss(action, target_action)

            actor_optimizer.zero_grad()
            actor_loss.backward()
            actor_optimizer.step()

            current_returns = critic(data, target_action)
            critic_loss = sum(
                F.mse_loss(current_return, target_return)
                for current_return in current_returns
            )

            critic_optimizer.zero_grad()
            critic_loss.backward()
            critic_optimizer.step()

            if batch_idx % log_interval == 0:
                print(
                    "Train Epoch: {} [{}/{} ({:.0f}%)]\tActor Loss: {:.6f}\tCritic Loss: {:.6f}".format(
                        epoch,
                        batch_idx * len(data),
                        len(train_loader.dataset),
                        100.0 * batch_idx / len(train_loader),
                        actor_loss.item(),
                        critic_loss.item(),
                    )
                )

    def test(actor, critic, device, test_loader):
        actor.eval()
        critic.eval()
        actor_loss = 0
        critic_loss = 0
        with th.no_grad():
            for data, target_action, target_return in test_loader:
                data, target_action, target_return = (
                    data.to(device),
                    target_action.to(device),
                    target_return.to(device),
                )

                target_action = target_action.float()
                target_return = target_return.float()

                action = actor(data)
                actor_loss += F.mse_loss(action, target_action)
                current_returns = critic(data, target_action)
                critic_loss += sum(
                    F.mse_loss(current_return, target_return)
                    for current_return in current_returns
                )

        actor_loss /= len(test_loader.dataset)
        critic_loss /= len(test_loader.dataset)
        print(
            "\nTest set: Average actor loss: {:.4f}, Average critic loss: {:.4f}\n".format(
                actor_loss.item(), critic_loss.item()
            )
        )

    # Here, we use PyTorch `DataLoader` to our load previously created `ExpertDataset` for training
    # and testing
    train_loader = th.utils.data.DataLoader(
        dataset=train_expert_dataset, batch_size=batch_size, shuffle=True, **kwargs
    )
    test_loader = th.utils.data.DataLoader(
        dataset=test_expert_dataset,
        batch_size=test_batch_size,
        shuffle=True,
        **kwargs,
    )

    # Define an Optimizer and a learning rate schedule.
    actor_optimizer = optim.Adam(actor.parameters(), lr=learning_rate)
    critic_optimizer = optim.Adam(critic.parameters(), lr=learning_rate)

    # Now we are finally ready to train the policy model.
    for epoch in range(1, epochs + 1):
        train(actor, critic, device, train_loader, actor_optimizer, critic_optimizer)
        test(actor, critic, device, test_loader)

    # Implant the trained policy network back into the RL student agent
    student.policy.actor = actor
    student.policy.critic = critic

In [39]:
from stable_baselines3.common.utils import polyak_update


def train(
    student,
    batch_size=64,
    epochs=1000,
    log_interval=100,
    no_cuda=False,
    seed=42,
) -> None:
    use_cuda = not no_cuda and th.cuda.is_available()
    th.manual_seed(seed)
    device = th.device("cuda" if use_cuda else "cpu")
    kwargs = {"num_workers": 0, "pin_memory": True} if use_cuda else {}

    # Here, we use PyTorch `DataLoader` to our load previously created `ExpertDataset` for training
    # and testing
    train_loader = th.utils.data.DataLoader(
        dataset=train_expert_dataset, batch_size=batch_size, shuffle=True, **kwargs
    )

    student.policy.to(device)

    # Switch to train mode (this affects batch norm / dropout)
    student.policy.set_training_mode(True)

    # Update learning rate according to lr schedule
    # student._update_learning_rate([student.actor.optimizer, student.critic.optimizer])

    for epoch in range(1, epochs + 1):
        for batch_idx, (
            observations,
            actions,
            rewards,
            dones,
            next_observations,
        ) in enumerate(train_loader):
            observations, actions, rewards, dones, next_observations = (
                observations.to(device),
                actions.to(device),
                rewards.to(device),
                dones.to(device),
                next_observations.to(device),
            )
            student._n_updates += 1

            with th.no_grad():
                # Select action according to policy and add clipped noise
                next_actions = student.actor_target(next_observations)

                # Compute the next Q-values: min over all critics targets
                next_q_values = th.cat(
                    student.critic_target(next_observations, next_actions),
                    dim=1,
                )
                next_q_values, _ = th.min(next_q_values, dim=1, keepdim=True)
                target_q_values = rewards + (1 - dones) * student.gamma * next_q_values

            # Get current Q-values estimates for each critic network
            current_q_values = student.critic(observations, actions)

            # Compute critic loss
            critic_loss = sum(
                F.mse_loss(current_q, target_q_values) for current_q in current_q_values
            )
            assert isinstance(critic_loss, th.Tensor)

            # Optimize the critics
            student.critic.optimizer.zero_grad()
            critic_loss.backward()
            student.critic.optimizer.step()

            # Delayed policy updates
            if student._n_updates % student.policy_delay == 0:
                # Compute actor loss
                actor_loss = -student.critic.q1_forward(
                    observations, student.actor(observations)
                ).mean()

                # Optimize the actor
                student.actor.optimizer.zero_grad()
                actor_loss.backward()
                student.actor.optimizer.step()

                polyak_update(
                    student.critic.parameters(),
                    student.critic_target.parameters(),
                    student.tau,
                )
                polyak_update(
                    student.actor.parameters(),
                    student.actor_target.parameters(),
                    student.tau,
                )
                # Copy running stats, see GH issue #996
                polyak_update(
                    student.critic_batch_norm_stats,
                    student.critic_batch_norm_stats_target,
                    1.0,
                )
                polyak_update(
                    student.actor_batch_norm_stats,
                    student.actor_batch_norm_stats_target,
                    1.0,
                )
    
        if batch_idx % log_interval == 0:
            print(
                "Train Epoch: {} [{}/{} ({:.0f}%)]\tActor Loss: {:.6f}\tCritic Loss: {:.6f}".format(
                    epoch,
                    batch_idx * len(dones),
                    len(train_loader.dataset),
                    100.0 * batch_idx / len(train_loader),
                    actor_loss.item(),
                    critic_loss.item(),
                )
            )

KeyboardInterrupt: 

Evaluate the agent before pretraining, it should be random

In [31]:
mean_reward, std_reward = evaluate_policy(student, env, n_eval_episodes=10)

print(f"Mean reward = {mean_reward} +/- {std_reward}")



Mean reward = -1344.4609348239378 +/- 59.453153859642484




Having defined the training procedure we can now run the pretraining!


In [34]:
train(
    student,
    batch_size=64,
    epochs=100,
    log_interval=100,
    no_cuda=True,
    seed=42,
)
student.save("student")



Finally, let us test how well our RL agent student learned to mimic the behavior of the expert


In [None]:
mean_reward, std_reward = evaluate_policy(student, env, n_eval_episodes=10)

print(f"Mean reward = {mean_reward} +/- {std_reward}")

Mean reward = -147.9170643418096 +/- 69.77307351132359


In [None]:
student.save("student")

In [None]:
env = gym.make(env_id, render_mode="human")

In [None]:
student = TD3.load("student", env)
student.learn(total_timesteps=1e5)
student.save("student")

Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 200      |
|    ep_rew_mean     | -435     |
| time/              |          |
|    episodes        | 4        |
|    fps             | 23       |
|    time_elapsed    | 34       |
|    total_timesteps | 800      |
| train/             |          |
|    actor_loss      | 7.43     |
|    critic_loss     | 40.8     |
|    learning_rate   | 0.001    |
|    n_updates       | 600      |
---------------------------------
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 200      |
|    ep_rew_mean     | -811     |
| time/              |          |
|    episodes        | 8        |
|    fps             | 22       |
|    time_elapsed    | 70       |
|    total_timesteps | 1600     |
| train/             |          |
|    actor_loss      | 18.4     |
|    critic_loss     | 32.2     |
|    

KeyboardInterrupt: 