In [1]:
# Importing data manipulation/visualization packages
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# Import ML framework library
import torch as th
import torch.nn as nn
import torch.nn.init as init

# Importing gym packages
import gymnasium as gym
from gymnasium import spaces

# Importing IRL libraries
from stable_baselines3.ppo import MlpPolicy
from stable_baselines3.common.evaluation import evaluate_policy
from stable_baselines3 import PPO

# Importing imitation library
from imitation.algorithms.adversarial.airl import AIRL
from imitation.util import util
from imitation.data import rollout
from imitation.data.wrappers import RolloutInfoWrapper
from imitation.util.util import make_vec_env
from imitation.rewards.reward_nets import BasicShapedRewardNet
from imitation.rewards.reward_nets import RewardNet
from imitation.util.networks import RunningNorm
from imitation.util import networks, util

# Import miscellaneuous packages
import random
from scipy.stats import norm

pygame 2.5.0 (SDL 2.28.0, Python 3.9.20)
Hello from the pygame community. https://www.pygame.org/contribute.html


In [2]:
# Setting the seed
SEED = 42
np.random.seed(SEED)
th.manual_seed(SEED)
if th.cuda.is_available():
    th.cuda.manual_seed_all(SEED)

In [3]:
# Arbitrary weights for the importance of the engagement level and section number
a, b = 1, 0.5

# Arbitary thresholds for what to what scores should be considered extremely interested, mildy interested, and not interested
theta_1 = 0.9
theta_2 = 0.4

# Define actions
ACTION_WAIT = 0
ACTION_READ_FREE = 1
ACTION_READ_PAY = 2

# Number of chapters
NUM_CHAPTERS = 24

In [None]:
# Helper function to calculate how interested an agent is
def interest_score (engagement_level, section_number):
    norm_section_number = (section_number - 1) / (NUM_CHAPTERS - 1)
    return a * engagement_level + b * norm_section_number

In [None]:
# Helper function to draw from Bernoulli (in my case, want to draw to choose between 2 values)
def probabilistic_choice(options):
    actions, probabilities = zip(*options)
    return random.choices(actions, weights=probabilities, k=1)[0]

In [None]:
class ExpertPolicy:
	def __call__ (self, state, obs, dones=None, infos=None):

		state = state[0] # Since we are only using 1 environment in the vectorized env so the batch size is 1
		section_number, engagement_level, time, price = state

		score = interest_score(engagement_level, section_number)

		if score > theta_1:
			if price == 1:
				action = probabilistic_choice([(ACTION_READ_PAY, 0.7), (ACTION_WAIT, 0.3)])
			else:
				action = probabilistic_choice([(ACTION_READ_FREE, 0.9), (ACTION_WAIT, 0.1)])
		elif theta_2 < score < theta_1:
			if time < 24: # time interval since last read is short
				if price == 1:
					action = probabilistic_choice([(ACTION_READ_PAY, 0.5), (ACTION_WAIT, 0.5)])
				else:
					action = probabilistic_choice([(ACTION_READ_FREE, 0.7), (ACTION_WAIT, 0.3)])
			else: # time interval since last read is long (reader is less engaged)
				if price == 1:
					action = probabilistic_choice([(ACTION_READ_PAY, 0.3), (ACTION_WAIT, 0.7)])
				else:
					action = probabilistic_choice([(ACTION_READ_FREE, 0.5), (ACTION_WAIT, 0.5)])			
		else:
			if time < 36: # time interval since last read is short
				if price == 1:
					action = probabilistic_choice([(ACTION_READ_PAY, 0.2), (ACTION_WAIT, 0.8)])
				else:
					action = probabilistic_choice([(ACTION_READ_FREE, 0.4), (ACTION_WAIT, 0.6)])
			else: # time interval since last read is long (reader is less engaged)
				if price == 1:
					action = probabilistic_choice([(ACTION_READ_PAY, 0.1), (ACTION_WAIT, 0.9)])
				else:
					action = probabilistic_choice([(ACTION_READ_FREE, 0.2), (ACTION_WAIT, 0.8)])
		return [action], None	

In [None]:
def true_reward (state, action):

	
	section_number, engagement_level, time, price = state
	score = interest_score(engagement_level, section_number)
	
	if score > theta_1:
		if price == 1:
			if action == ACTION_READ_PAY:
				return 5
			else: # Wait
				return -1
		if price == 0:
			if action == ACTION_READ_FREE:
				return 9
			else: # Wait
				return -1
	elif theta_2 < score < theta_1:
		if time < 24: # time interval since last read is short
			if price == 1:
				if action == ACTION_READ_PAY:
					return 4
				else: # Wait
					return -1
			if price == 0:
				if action == ACTION_READ_FREE:
					return 7
				else: # Wait
					return -1
		else: # time interval since last read is long (reader is less engaged)
			if price == 1:
				if action == ACTION_READ_PAY:
					return 3
				else: # Wait
					return -1
			if price == 0:
				if action == ACTION_READ_FREE:
					return 6
				else: # Wait
					return -1
	else:
		if time < 36: # time interval since last read is short
			if price == 1:
				if action == ACTION_READ_PAY:
					return 2
				else: # Wait
					return -1
			if price == 0:
				if action == ACTION_READ_FREE:
					return 5
				else: # Wait
					return -1
		else: # time interval since last read is long (reader is less engaged)
			if price == 1:
				if action == ACTION_READ_PAY:
					return 1
				else: # Wait
					return -1
			if price == 0:
				if ACTION_READ_FREE:
					return 3
				else: # Wait
					return -1		

In [None]:
class EBookEnv(gym.Env):
    def __init__(self):
        super(EBookEnv).__init__()

        # state =  [section_number, engagement_level, time, price]
        self.observation_space = spaces.Box(low=0, high=1, shape=(4,), dtype=np.float32)

        # action = [wait, read_without_payment, read_with_payment]
        self.action_space = spaces.Discrete(3)

        self.state = None
        self.times_bought = 0

    def reset(self, seed=None, options=None):
        # Resetting the state so that the reader begins at section 1, at a 0.8 engagement level, time 0, and price 0
        self.state = np.array([1, 0.8, 0, 0], dtype=np.float32)
        self.times_bought = 0
        return self.state, {}

    def step(self, action):
        # Calculate the reward
        reward = true_reward(self.state, action)

        section_number, engagement_level, time, price = self.state

        """ *** Update the state features *** """
        if action == ACTION_READ_PAY or action == ACTION_READ_FREE: # Section number only increases if the current action is to read (pay or not pay)
            section_number += 1
            time = 0 # Reset the time because the new time interval starts at beginning of chapters

        # Update the number of times a chapter was bought
        if action == ACTION_READ_PAY:
            self.times_bought += 1

        gamma = 0.7 # parameter to change probability of price being a 1
        if section_number < 4: # Ensure that the first 4 sections are free
            price = 0
        else:
            price = probabilistic_choice([(1, 1 - gamma ** section_number), (0, gamma ** section_number)])
        
        time += 1 # Time is always incremented
        engagement_level = norm.cdf(np.random.normal(0, 1)) # Draw the engagement index from a N~(0, 1) and normalize to [0, 1]

        # Save the new state
        self.state = np.array([section_number, engagement_level, time, price], dtype=np.float32)

        # The terminating condition is when you reach the end of a book or if the time interval gets very large
        done = time >= 108 or section_number > NUM_CHAPTERS

        if done:
            print(f"Agent bought chapters {self.times_bought} times")
        truncated = False
        info = {"obs": self.state, "rews": reward}
        return self.state, float(reward), done, truncated, info

    def render(self, mode='human'):
        pass


In [9]:
# Register the gym for compatability with OpenAI Gym
gym.register(id='EBookEnv-v0', entry_point=lambda: EBookEnv())
venv = util.make_vec_env("EBookEnv-v0", rng=np.random.default_rng(SEED), n_envs=1, post_wrappers=[lambda env, _: RolloutInfoWrapper(env)])

In [10]:
# Collect 500 trajectories of the expert behavior 
expert_policy = ExpertPolicy()
trajectories = rollout.rollout(
    expert_policy,
    venv,
    rollout.make_sample_until(min_episodes=500),
    rng=np.random.default_rng(SEED),
)


  logger.warn(f"{pre} is not within the observation space.")


Agent bought chapters 19 times
Agent bought chapters 21 times
Agent bought chapters 19 times
Agent bought chapters 20 times
Agent bought chapters 19 times
Agent bought chapters 20 times
Agent bought chapters 21 times
Agent bought chapters 19 times
Agent bought chapters 19 times
Agent bought chapters 19 times
Agent bought chapters 20 times
Agent bought chapters 20 times
Agent bought chapters 21 times
Agent bought chapters 20 times
Agent bought chapters 21 times
Agent bought chapters 20 times
Agent bought chapters 20 times
Agent bought chapters 20 times
Agent bought chapters 20 times
Agent bought chapters 21 times
Agent bought chapters 20 times
Agent bought chapters 21 times
Agent bought chapters 21 times
Agent bought chapters 19 times
Agent bought chapters 21 times
Agent bought chapters 21 times
Agent bought chapters 19 times
Agent bought chapters 20 times
Agent bought chapters 20 times
Agent bought chapters 21 times
Agent bought chapters 20 times
Agent bought chapters 19 times
Agent bo

In [None]:
learner = PPO(
    env=venv,
    policy=MlpPolicy,
    batch_size=64,
    ent_coef=0.0,
    learning_rate=0.0005,
    gamma=0.95,
    clip_range=0.1,
    vf_coef=0.1,
    n_epochs=5,
    seed=SEED,
)

"""
    Initialize a reward network. 
"""
reward_net = BasicShapedRewardNet(
    observation_space=venv.observation_space,
    action_space=venv.action_space,
    normalize_input_layer=RunningNorm,
)

# Intialize parameters for AIRL model
airl_trainer = AIRL(
    demonstrations=trajectories,
    demo_batch_size=2048,
    gen_replay_buffer_capacity=512,
    n_disc_updates_per_round=16,
    venv=venv,
    gen_algo=learner,
    reward_net=reward_net,
    allow_variable_horizon=True
)

Running with `allow_variable_horizon` set to True. Some algorithms are biased towards shorter or longer episodes, which may significantly confound results. Additionally, even unbiased algorithms can exploit the information leak from the termination condition, producing spuriously high performance. See https://imitation.readthedocs.io/en/latest/getting-started/variable-horizon.html for more information.


In [12]:
venv.seed(SEED)

airl_trainer.train(20000)  # Train for 2_000_000 steps to match expert by learning a reward and a policy

round:   0%|          | 0/9 [00:00<?, ?it/s]

Agent bought chapters 14 times
Agent bought chapters 9 times
Agent bought chapters 8 times
Agent bought chapters 7 times
Agent bought chapters 12 times
Agent bought chapters 12 times
Agent bought chapters 14 times
Agent bought chapters 13 times
Agent bought chapters 11 times
Agent bought chapters 15 times
Agent bought chapters 13 times
Agent bought chapters 13 times
Agent bought chapters 13 times
Agent bought chapters 12 times
Agent bought chapters 14 times
Agent bought chapters 14 times
Agent bought chapters 17 times
Agent bought chapters 13 times
Agent bought chapters 11 times
Agent bought chapters 11 times
Agent bought chapters 13 times
Agent bought chapters 12 times
Agent bought chapters 12 times
Agent bought chapters 6 times
Agent bought chapters 12 times
Agent bought chapters 12 times
Agent bought chapters 10 times
Agent bought chapters 10 times
Agent bought chapters 13 times
Agent bought chapters 9 times
Agent bought chapters 11 times
Agent bought chapters 12 times
Agent bought 

round:  11%|█         | 1/9 [00:01<00:14,  1.77s/it]

Agent bought chapters 14 times
Agent bought chapters 12 times
Agent bought chapters 13 times
Agent bought chapters 14 times
Agent bought chapters 11 times
Agent bought chapters 14 times
Agent bought chapters 15 times
Agent bought chapters 11 times
Agent bought chapters 12 times
Agent bought chapters 9 times
Agent bought chapters 14 times
Agent bought chapters 10 times
Agent bought chapters 16 times
Agent bought chapters 13 times
Agent bought chapters 10 times
Agent bought chapters 9 times
Agent bought chapters 13 times
Agent bought chapters 13 times
Agent bought chapters 8 times
Agent bought chapters 15 times
Agent bought chapters 16 times
Agent bought chapters 13 times
Agent bought chapters 12 times
Agent bought chapters 9 times
Agent bought chapters 12 times
Agent bought chapters 9 times
Agent bought chapters 7 times
Agent bought chapters 9 times
Agent bought chapters 13 times
Agent bought chapters 7 times
Agent bought chapters 10 times
Agent bought chapters 15 times
Agent bought cha

round:  22%|██▏       | 2/9 [00:03<00:11,  1.63s/it]

Agent bought chapters 11 times
Agent bought chapters 15 times
Agent bought chapters 11 times
Agent bought chapters 13 times
Agent bought chapters 10 times
Agent bought chapters 15 times
Agent bought chapters 9 times
Agent bought chapters 11 times
Agent bought chapters 13 times
Agent bought chapters 12 times
Agent bought chapters 14 times
Agent bought chapters 13 times
Agent bought chapters 9 times
Agent bought chapters 13 times
Agent bought chapters 9 times
Agent bought chapters 10 times
Agent bought chapters 10 times
Agent bought chapters 11 times
Agent bought chapters 10 times
Agent bought chapters 10 times
Agent bought chapters 13 times
Agent bought chapters 8 times
Agent bought chapters 9 times
Agent bought chapters 11 times
Agent bought chapters 10 times
Agent bought chapters 13 times
Agent bought chapters 5 times
Agent bought chapters 10 times
Agent bought chapters 9 times
Agent bought chapters 10 times
Agent bought chapters 11 times
Agent bought chapters 13 times
Agent bought ch

round:  33%|███▎      | 3/9 [00:04<00:09,  1.60s/it]

Agent bought chapters 8 times
Agent bought chapters 15 times
Agent bought chapters 14 times
Agent bought chapters 14 times
Agent bought chapters 11 times
Agent bought chapters 10 times
Agent bought chapters 11 times
Agent bought chapters 8 times
Agent bought chapters 15 times
Agent bought chapters 12 times
Agent bought chapters 11 times
Agent bought chapters 7 times
Agent bought chapters 8 times
Agent bought chapters 11 times
Agent bought chapters 7 times
Agent bought chapters 15 times
Agent bought chapters 16 times
Agent bought chapters 7 times
Agent bought chapters 13 times
Agent bought chapters 12 times
Agent bought chapters 15 times
Agent bought chapters 14 times
Agent bought chapters 12 times
Agent bought chapters 8 times
Agent bought chapters 10 times
Agent bought chapters 10 times
Agent bought chapters 10 times
Agent bought chapters 12 times
Agent bought chapters 10 times
Agent bought chapters 12 times
Agent bought chapters 11 times
Agent bought chapters 13 times
Agent bought ch

round:  44%|████▍     | 4/9 [00:06<00:08,  1.60s/it]

Agent bought chapters 11 times
Agent bought chapters 14 times
Agent bought chapters 12 times
Agent bought chapters 8 times
Agent bought chapters 13 times
Agent bought chapters 13 times
Agent bought chapters 14 times
Agent bought chapters 14 times
Agent bought chapters 8 times
Agent bought chapters 14 times
Agent bought chapters 10 times
Agent bought chapters 14 times
Agent bought chapters 6 times
Agent bought chapters 12 times
Agent bought chapters 15 times
Agent bought chapters 11 times
Agent bought chapters 12 times
Agent bought chapters 13 times
Agent bought chapters 9 times
Agent bought chapters 7 times
Agent bought chapters 9 times
Agent bought chapters 11 times
Agent bought chapters 14 times
Agent bought chapters 13 times
Agent bought chapters 7 times
Agent bought chapters 15 times
Agent bought chapters 12 times
Agent bought chapters 10 times
Agent bought chapters 14 times
Agent bought chapters 14 times
Agent bought chapters 10 times
Agent bought chapters 12 times
Agent bought ch

round:  56%|█████▌    | 5/9 [00:08<00:06,  1.60s/it]

Agent bought chapters 7 times
Agent bought chapters 14 times
Agent bought chapters 11 times
Agent bought chapters 16 times
Agent bought chapters 13 times
Agent bought chapters 14 times
Agent bought chapters 13 times
Agent bought chapters 16 times
Agent bought chapters 10 times
Agent bought chapters 13 times
Agent bought chapters 17 times
Agent bought chapters 13 times
Agent bought chapters 9 times
Agent bought chapters 15 times
Agent bought chapters 18 times
Agent bought chapters 14 times
Agent bought chapters 9 times
Agent bought chapters 16 times
Agent bought chapters 11 times
Agent bought chapters 12 times
Agent bought chapters 9 times
Agent bought chapters 11 times
Agent bought chapters 13 times
Agent bought chapters 13 times
Agent bought chapters 12 times
Agent bought chapters 12 times
Agent bought chapters 11 times
Agent bought chapters 14 times
Agent bought chapters 15 times
Agent bought chapters 8 times
Agent bought chapters 14 times
Agent bought chapters 15 times
Agent bought 

round:  67%|██████▋   | 6/9 [00:09<00:04,  1.64s/it]

Agent bought chapters 16 times
Agent bought chapters 12 times
Agent bought chapters 12 times
Agent bought chapters 14 times
Agent bought chapters 14 times
Agent bought chapters 11 times
Agent bought chapters 14 times
Agent bought chapters 13 times
Agent bought chapters 12 times
Agent bought chapters 12 times
Agent bought chapters 12 times
Agent bought chapters 10 times
Agent bought chapters 13 times
Agent bought chapters 15 times
Agent bought chapters 15 times
Agent bought chapters 10 times
Agent bought chapters 16 times
Agent bought chapters 20 times
Agent bought chapters 16 times
Agent bought chapters 16 times
Agent bought chapters 15 times
Agent bought chapters 14 times
Agent bought chapters 15 times
Agent bought chapters 14 times
Agent bought chapters 14 times
Agent bought chapters 15 times
Agent bought chapters 18 times
Agent bought chapters 14 times
Agent bought chapters 17 times
Agent bought chapters 12 times
Agent bought chapters 16 times
Agent bought chapters 14 times
Agent bo

round:  78%|███████▊  | 7/9 [00:11<00:03,  1.63s/it]

Agent bought chapters 10 times
Agent bought chapters 16 times
Agent bought chapters 14 times
Agent bought chapters 18 times
Agent bought chapters 15 times
Agent bought chapters 13 times
Agent bought chapters 11 times
Agent bought chapters 15 times
Agent bought chapters 14 times
Agent bought chapters 16 times
Agent bought chapters 14 times
Agent bought chapters 15 times
Agent bought chapters 19 times
Agent bought chapters 16 times
Agent bought chapters 19 times
Agent bought chapters 15 times
Agent bought chapters 20 times
Agent bought chapters 15 times
Agent bought chapters 14 times
Agent bought chapters 13 times
Agent bought chapters 19 times
Agent bought chapters 13 times
Agent bought chapters 20 times
Agent bought chapters 15 times
Agent bought chapters 17 times
Agent bought chapters 17 times
Agent bought chapters 15 times
Agent bought chapters 16 times
Agent bought chapters 16 times
Agent bought chapters 18 times
Agent bought chapters 15 times
Agent bought chapters 18 times
Agent bo

round:  89%|████████▉ | 8/9 [00:12<00:01,  1.60s/it]

Agent bought chapters 16 times
Agent bought chapters 18 times
Agent bought chapters 18 times
Agent bought chapters 14 times
Agent bought chapters 19 times
Agent bought chapters 18 times
Agent bought chapters 20 times
Agent bought chapters 16 times
Agent bought chapters 17 times
Agent bought chapters 16 times
Agent bought chapters 16 times
Agent bought chapters 19 times
Agent bought chapters 19 times
Agent bought chapters 15 times
Agent bought chapters 16 times
Agent bought chapters 13 times
Agent bought chapters 16 times
Agent bought chapters 19 times
Agent bought chapters 15 times
Agent bought chapters 19 times
Agent bought chapters 14 times
Agent bought chapters 16 times
Agent bought chapters 15 times
Agent bought chapters 19 times
Agent bought chapters 18 times
Agent bought chapters 17 times
Agent bought chapters 16 times
Agent bought chapters 20 times
Agent bought chapters 18 times
Agent bought chapters 16 times
Agent bought chapters 17 times
Agent bought chapters 15 times
Agent bo

round: 100%|██████████| 9/9 [00:14<00:00,  1.61s/it]


In [None]:
# Helper function to calculate the behavorial cloning loss
def behavioral_cloning_loss(expert_trajs, learner_policy, device='cpu'):
    total_loss = 0
    num_samples = 0

    for traj in expert_trajs:
        states = traj.obs  
        expert_actions = traj.acts  
        
        states_tensor = th.tensor(states, dtype=th.float32, device=device)
        
        with th.no_grad():
            learner_actions = learner_policy(states_tensor)[0]

        learner_actions_np = learner_actions.cpu().numpy()
        
    #     # Calculate the loss (number of mismatched actions)
        loss = np.sum(learner_actions_np != expert_actions)
        total_loss += loss
        num_samples += len(states)
    return total_loss / num_samples  # Average loss

bc_loss = behavioral_cloning_loss(trajectories, learner.policy)
print(f"Behavioral Cloning Loss: {bc_loss}")

22341
Behavioral Cloning Loss: 0.02238037688554675


  loss = np.sum(learner_actions_np != expert_actions)
