Import the Dependencies

In [368]:
# Importing Packages
import numpy as np
import gymnasium as gym
from gymnasium import spaces
from stable_baselines3.ppo import MlpPolicy
from stable_baselines3.common.evaluation import evaluate_policy
from stable_baselines3 import PPO
from imitation.algorithms.adversarial.airl import AIRL
from imitation.util import util
from imitation.data import rollout
from imitation.data.wrappers import RolloutInfoWrapper
from imitation.util.util import make_vec_env
from imitation.rewards.reward_nets import BasicShapedRewardNet
from imitation.rewards.reward_nets import RewardNet
from imitation.util.networks import RunningNorm
from imitation.util import networks, util
import matplotlib.pyplot as plt
import pandas as pd


In [369]:
SEED = 42

Define the true reward function's weights

In [370]:
num_features = 50
weights = np.random.uniform(-1, 1, num_features)

In [371]:
import gymnasium as gym
from gymnasium import spaces

"""
Define the environment
- STATES: the set of all possible observations for an agent
- ACTIONS: the set of all possible actions an agent can take
- STEP: determines how actions lead to changes in states

In this case, the the states and actions are num_features-dimensional
""" 

# Use base class from Gym
class CustomEnv(gym.Env):
    def __init__(self, num_options: int = 2, weights=None):
        super().__init__()

        self.num_options = num_options
        # Possible states and actions are defined as a  "box" with a range from [-1, 1]
        self.observation_space = spaces.Box(low=-1, high=1, shape=(num_features,), dtype=np.float32)
        self.action_space = spaces.Box(low=-1, high=1, shape=(num_features,), dtype=np.float32)
        self.state = None
        self.max_steps = 100  # Define a fixed number of steps per episode 
        """
        Episode:
            - set of interactions between agent and env from starting state until terminal state
            - agent uses interactions to improve reward
        Steps:
            - episodes terminate after max_steps
        """
        self.current_step = 0

        # Initialize weights for reward calculation
        self.weights = weights

    # Resets the environment to an initial state
    def reset(self, seed=None, options=None):
        super().reset(seed=seed, options=options)
        if seed is not None:
            np.random.seed(seed)
        self.state = self.observation_space.sample()
        self.current_step = 0  # Reset the step counter
        return self.state, {}

    def step(self, action):
        # Calculate the reward based on the weights
        reward = np.dot(self.state * action, self.weights)
        
        # Update the state
        self.state = np.random.uniform(low=-1, high=1, size=(num_features,)).astype(np.float32)
        
        # Increment step counter
        self.current_step += 1
        
        # Define the done condition based on max_steps
        done = self.current_step >= self.max_steps
        
        # Since done is used, no need for truncated in this context
        truncated = False  

        info = {
            "obs": self.state,
            "rews": reward,
        }

        return self.state, reward, done, truncated, info

    def render(self, mode='human'):
        pass

Register the custom environment to use OpenAI's Gym API (intializes the environment with the true reward function's weights to determine state->action behavior)

In [372]:
gym.register(id='CustomEnv-v0', entry_point=lambda: CustomEnv(weights=weights), max_episode_steps=100)

Create a vectorized environment for efficient training (train multiple instances of same environment simultaneously)

In [373]:
venv = util.make_vec_env("CustomEnv-v0", rng=np.random.default_rng(SEED), n_envs=4, post_wrappers=[lambda env, _: RolloutInfoWrapper(env)])

Initalize a PPO agent to learn the environment over 10000 steps using the "MlpPolicy"
PPO (Proximal Policy Optimization):
    1. collects experiences (states, actions, rewards over some episodes)
    2. try to find advantageous actions
    3. Update policy

Using the trained policy, collect data of the trajectories (interactions of the trained agent with the environment from some episodes)

In [374]:
expert_policy = PPO('MlpPolicy', venv, verbose=1)
expert_policy.learn(total_timesteps=10000)

# Collect rollouts
rollouts = rollout.rollout(
    expert_policy,
    venv,
    rollout.make_sample_until(min_episodes=60),
    rng=np.random.default_rng(SEED),
)

Using cpu device
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 100      |
|    ep_rew_mean     | -3.62    |
| time/              |          |
|    fps             | 22177    |
|    iterations      | 1        |
|    time_elapsed    | 0        |
|    total_timesteps | 8192     |
---------------------------------
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 100         |
|    ep_rew_mean          | 17.9        |
| time/                   |             |
|    fps                  | 7091        |
|    iterations           | 2           |
|    time_elapsed         | 2           |
|    total_timesteps      | 16384       |
| train/                  |             |
|    approx_kl            | 0.063750505 |
|    clip_fraction        | 0.463       |
|    clip_range           | 0.2         |
|    entropy_loss         | -70.9       |
|    explained_variance   | -0.0106     |
|    learning

In [375]:
save_rollouts = pd.DataFrame(rollouts)
save_rollouts.to_csv("rollouts.csv")

Define a Simple Linear Model Reward Function to Learn During Training

In [403]:

# Inherit from imitation's RewardNet
class LinearRewardNet(RewardNet):
    def __init__(self, observation_space, action_space):
        super().__init__(observation_space, action_space)

        # Define a linear layer that maps the combined state and action to a reward
        input_dim = observation_space.shape[0] + action_space.shape[0]  # total input dimensions
        self.linear = nn.Linear(input_dim, 1)  # Linear layer to produce a single reward value

    def forward(self,
        state: th.Tensor,  
        action: th.Tensor,  
        # next_state: th.Tensor,  
        done: th.Tensor,  
    ) -> th.Tensor:
        # Concatenate state and action to form the input to the linear model
        x = th.cat((state, action), dim=-1)  

        # Compute the reward using the linear layer
        reward = self.linear(x)  

        # Squeeze to remove the extra dimension
        return reward.squeeze(-1) 

In [404]:
# Imitation library's nonlinear model
reward_net = BasicShapedRewardNet(
    observation_space=venv.observation_space,
    action_space=venv.action_space,
    normalize_input_layer=RunningNorm,
)

In [405]:
reward_net = LinearRewardNet(
    observation_space=venv.observation_space,
    action_space=venv.action_space,
)

Initialize AIRL to be trained on the environment, with the expert data, and the same MlpPolicy as the generator to train the data

In [406]:
airl_trainer = AIRL(
    venv=venv,
    demonstrations=rollouts,
    demo_batch_size=60,
    gen_algo= PPO(
        'MlpPolicy',
        venv,
        verbose=1,
    ),
    reward_net=reward_net,
)

Using cpu device


In [407]:
learner_rewards_before_training, _ = evaluate_policy(airl_trainer.gen_algo, venv, 100, return_episode_rewards=True)

# Train AIRL
airl_trainer.train(total_timesteps=20000)

# Evaluate policy after training
learner_rewards_after_training, _ = evaluate_policy(airl_trainer.gen_algo, venv, 100, return_episode_rewards=True)


# Print results
print("Rewards before training:", learner_rewards_before_training)
print("Rewards after training:", learner_rewards_after_training)

print("Mean Rewards before training:", np.mean(learner_rewards_before_training))
print("Mean Rewards after training:", np.mean(learner_rewards_after_training))

round:   0%|          | 0/2 [00:00<?, ?it/s]


TypeError: forward() takes 3 positional arguments but 5 were given

In [402]:
import torch as th
import numpy as np
from sklearn.metrics import mean_squared_error

# Assuming rollouts is your collected rollout data
# Extract states, actions, and rewards from rollouts
states = obs_array  # Assuming this is an array of shape (num_timesteps, num_features)
actions = acts_array  # Assuming this is an array of shape (num_timesteps, num_features)
true_rewards = rews_array  # Assuming rewards are stored in this key

# Create an instance of the linear reward network
observation_space = spaces.Box(low=-1, high=1, shape=(num_features,), dtype=np.float32)
action_space = spaces.Box(low=-1, high=1, shape=(num_features,), dtype=np.float32)
reward_net = LinearRewardNet(observation_space, action_space)

# Convert states and actions to tensors
states_tensor = th.tensor(states, dtype=th.float32)
actions_tensor = th.tensor(actions, dtype=th.float32)

# Use the reward network to predict rewards
predicted_rewards = reward_net(states_tensor, actions_tensor).detach().numpy()

# Calculate the Mean Squared Error
mse = mean_squared_error(true_rewards, predicted_rewards)

# Print the MSE
print("Mean Squared Error:", mse)

TypeError: forward() missing 2 required positional arguments: 'next_state' and 'done'

In [401]:
import ast
import csv

def parse_array_string(array_str):
    # Remove the prefix "array(" and the suffix ")" if present
    if 'array(' in array_str:
        array_str = array_str[array_str.index('(') + 1: array_str.rindex(')')]
    
    # Replace " " with ", " for easier conversion
    array_str = array_str.replace(' ', ', ')
    
    # Convert the cleaned string to a list and then to a NumPy array
    return np.array(eval(array_str))

# Replace 'data.csv' with the path to your CSV file
csv_file_path = 'data.csv'

# Initialize lists to hold the extracted data
obs_list = []
acts_list = []
rews_list = []

# Read the CSV file
with open('./rollouts.csv', mode='r') as file:
    reader = csv.DictReader(file)
    
    for row in reader:
        # Extract 'obs', 'acts', and 'rews' columns
        obs = row['obs']
        acts = row['acts']
        rews = row['rews']

        # Parse and append to lists if they are not empty
        if obs:  # Check if obs is not empty
            obs_list.append(obs_array)
        
        if acts:  # Check if acts is not empty
            acts_list.append(acts_array)

        if rews:  # Check if rews is not empty
            rews_list.append(rews_array)

# Convert lists to NumPy arrays if needed
obs_array = np.array(obs_list)
acts_array = np.array(acts_list)
rews_array = np.array(rews_list)