In [2]:
# import warnings
# warnings.filterwarnings("ignore")
    
# Standard imports that we will need in the rest of the notebook.
import numpy as np
from numpy import inf

# Discrete distributions and sampling
from gym import Env, spaces, utils
from gym.utils import seeding

# Inverse reinforcement learning
from reinforce_learning import vi_rational
from inv_reinforce_learning_state_action import compute_s_a_visitations, vi_boltzmann, compute_D

# Plot
import matplotlib.pyplot as plt

# Data processing
import arviz as az

# Bayesian inference
import pymc3 as pm
import theano.tensor as tt
from theano.compile.ops import as_op

print('Loading')

# Load the precomputed arrays that describe the problem from ./data
# (paths are relative to the notebook's working directory).
# NOTE(review): state_vis_freq and prob_action_given_state are not used by any
# of the cells visible here -- confirm they are needed further down.
state_vis_freq = np.load('./data/state_vis_freq.npy')
prob_action_given_state = np.load('./data/prob_action_given_state.npy')
transition_proba = np.load('./data/transition_proba.npy')
# allow_pickle=True lets np.load unpickle object arrays (the feature names are
# stored that way). Unpickling can execute arbitrary code, so only load files
# from a trusted source.
state_feature_name = np.load('./data/state_feature_name.npy', allow_pickle=True)
action_feature_name = np.load('./data/action_feature_name.npy', allow_pickle=True)
state_feature = np.load('./data/state_feature.npy')
action_feature = np.load('./data/action_feature.npy')
initial_state_dist = np.load('./data/initial_state_dist.npy')

# Project-local module providing the environment description and the MDP wrapper.
from environment import MDP, Environment

# Bundle the feature names, feature matrices, transition probabilities and
# initial state distribution into an Environment, then wrap it in an MDP.
environment = Environment(state_feature_name, action_feature_name, 
                          state_feature, action_feature, transition_proba, initial_state_dist)
mdp = MDP(environment)

# Boolean mask of the (state, action) pairs whose transition probabilities,
# summed over axis 2 of mdp.T, add up to one.
# The comparison is tolerance-based (np.isclose) rather than an exact `== 1`:
# sums of floating-point probabilities frequently come out as
# 0.9999999999999999 or 1.0000000000000002, and an exact test would silently
# mark such pairs as invalid.
mdp.valid_sa = np.isclose(mdp.T.sum(axis=2), 1.0)

print('Done')

Loading
Done


In [3]:
def generate_trajectories(mdp, policy, timesteps=35, num_traj=50):
    '''
    Roll out a stochastic policy in the MDP and record every visited
    (state, action) pair.

    Actions are drawn with ``np.random.choice``, so the result depends on
    NumPy's global random state.

    Parameters
    ----------
    mdp : object
        Instance of the MDP class. It must expose ``nA`` (number of actions),
        ``reset()`` and ``step(action)``; the value returned by ``reset`` and
        ``step`` is used as the row index into ``policy``.
    policy : 2D numpy array
        Array of shape (mdp.nS, mdp.nA); ``policy[s, a]`` is the probability
        of taking action ``a`` in state ``s``.
    timesteps : int
        Number of (state, action) pairs recorded per trajectory.
    num_traj : int
        Number of trajectories to generate.

    Returns
    -------
    3D numpy array
        Integer array of shape (num_traj, timesteps, 2); the last axis holds
        the state (index 0) and the action taken in it (index 1).
    '''
    rollouts = np.zeros((num_traj, timesteps, 2), dtype=int)

    state = mdp.reset()
    # Iterating over the array yields views, so writing into `record`
    # fills `rollouts` in place.
    for episode in rollouts:
        for record in episode:
            chosen = np.random.choice(mdp.nA, p=policy[state, :])
            record[:] = (state, chosen)
            state = mdp.step(chosen)
        # Reset after every trajectory, including the last one, so the MDP
        # is left freshly reset when the function returns.
        state = mdp.reset()

    return rollouts

In [4]:
# Experiment configuration.
# gamma is passed to vi_boltzmann below -- presumably the discount factor
# (1 = undiscounted); confirm against vi_boltzmann's signature.
gamma = 1
# Number of trajectories to sample and the number of steps in each one;
# both are forwarded to generate_trajectories, and traj_len is also passed
# to vi_boltzmann.
n_traj=10000
traj_len = 35

# Seed NumPy's global RNG, which generate_trajectories draws actions from
# via np.random.choice.
np.random.seed(0)

print('State feature names: ',  environment.state_feature_names)
print('Action feature names: ',  environment.action_feature_names)
# The "true" reward weights and the reward
# One weight per state feature (column of the state feature matrix), all zero
# except index 24, which in the printed feature names corresponds to
# 'Eod Positive Affect And Well-Being:Higher'.
theta_state_expert = np.zeros(environment.state_feature_matrix.shape[1])
theta_state_expert[24] = 1.0
# One weight per action feature; indices 1, 4, 6 and 8 are set to 1.
# NOTE(review): the printed action feature names are truncated in the saved
# output, so which features these indices select cannot be checked here.
theta_action_expert = np.zeros(environment.action_feature_matrix.shape[1])
theta_action_expert[[1,4,6,8]] = 1.0

State feature names:  ['Gender:Male' 'Gender:Female' 'Age:Younger Than 30'
 'Age:Between 30 To 60' 'Age:60 And Older' 'Current Daytime Interval:Wake'
 'Current Daytime Interval:Morning' 'Current Daytime Interval:Afternoon'
 'Current Daytime Interval:Evening' 'Current Daytime Interval:Bed'
 'Current Interval Pain Score:Lower' 'Current Interval Pain Score:Normal'
 'Current Interval Pain Score:Higher'
 'Current Interval Pain Score:Not Recorded'
 'Current Interval Fatigue Score:Lower'
 'Current Interval Fatigue Score:Normal'
 'Current Interval Fatigue Score:Higher'
 'Current Interval Fatigue Score:Not Recorded'
 'Last Interval Activity Bouts:Lower'
 'Last Interval Activity Bouts:Normal'
 'Last Interval Activity Bouts:Higher'
 'Last Interval Activity Bouts:Not Recorded'
 'Eod Positive Affect And Well-Being:Lower'
 'Eod Positive Affect And Well-Being:Normal'
 'Eod Positive Affect And Well-Being:Higher'
 'Eod Positive Affect And Well-Being:Not Recorded']
Action feature names:  ['Activitybouts

In [5]:
# Linear rewards: feature matrix times weight vector, for states and for
# actions. np.asarray + np.squeeze convert the product to a plain ndarray and
# drop any length-1 axes (presumably because the feature matrices may be
# np.matrix-like -- confirm against the Environment class).
r_s_expert = np.squeeze(np.asarray(np.dot(environment.state_feature_matrix, theta_state_expert)))
r_a_expert = np.squeeze(np.asarray(np.dot(environment.action_feature_matrix, theta_action_expert)))

In [6]:
# Compute the expert's policy from the "true" state and action rewards.
# vi_boltzmann is imported from inv_reinforce_learning_state_action; judging by
# its name and the unpacked result it presumably runs Boltzmann (soft) value
# iteration and returns state values, state-action values and a stochastic
# policy -- confirm against its definition. traj_len is passed as the last
# argument (presumably the planning horizon).
V, Q, policy_expert = vi_boltzmann(mdp, gamma, r_s_expert, r_a_expert, traj_len)

print('State feature names: ', environment.state_feature_names)
print('Action feature names: ', environment.action_feature_names)
# NOTE(review): the disabled print below refers to `theta_expert`, which is not
# defined in any visible cell; the weights here are theta_state_expert and
# theta_action_expert.
# print("My  theta: ", theta_expert)
print("My policy:")
print(policy_expert)

State feature names:  ['Gender:Male' 'Gender:Female' 'Age:Younger Than 30'
 'Age:Between 30 To 60' 'Age:60 And Older' 'Current Daytime Interval:Wake'
 'Current Daytime Interval:Morning' 'Current Daytime Interval:Afternoon'
 'Current Daytime Interval:Evening' 'Current Daytime Interval:Bed'
 'Current Interval Pain Score:Lower' 'Current Interval Pain Score:Normal'
 'Current Interval Pain Score:Higher'
 'Current Interval Pain Score:Not Recorded'
 'Current Interval Fatigue Score:Lower'
 'Current Interval Fatigue Score:Normal'
 'Current Interval Fatigue Score:Higher'
 'Current Interval Fatigue Score:Not Recorded'
 'Last Interval Activity Bouts:Lower'
 'Last Interval Activity Bouts:Normal'
 'Last Interval Activity Bouts:Higher'
 'Last Interval Activity Bouts:Not Recorded'
 'Eod Positive Affect And Well-Being:Lower'
 'Eod Positive Affect And Well-Being:Normal'
 'Eod Positive Affect And Well-Being:Higher'
 'Eod Positive Affect And Well-Being:Not Recorded']
Action feature names:  ['Activitybouts

In [7]:
# Generate expert trajectories using the given expert policy:
# n_traj trajectories of traj_len steps each, returned as an integer array of
# shape (n_traj, traj_len, 2) holding (state, action) pairs.
trajectories = generate_trajectories(mdp, policy_expert, traj_len, n_traj)

In [15]:
# Persist the sampled (state, action) trajectories next to the input data.
# NOTE(review): this cell's execution count (15) does not follow the previous
# cell (7); re-run the notebook top to bottom to confirm the saved file matches
# the code above.
np.save("data/generated_behavior_instances.npy", trajectories)