Run the following commands in a terminal to create a virtual environment for this tutorial:

In [None]:
# Create and activate a virtual environment named "tutorial-env".
# Tested on python3.6+. Run these commands in a terminal, not in a Jupyter cell.
python3.7 -m venv tutorial-env
source tutorial-env/bin/activate

# Install the COBS requirements (expects requirements.txt in the current directory)
pip3 install -r requirements.txt

# Register the environment as a Jupyter kernel.
# Note: the kernel is named "tutorial_env" (underscore), unlike the "tutorial-env" directory.
pip3 install ipykernel
python -m ipykernel install --user --name=tutorial_env

# Now, restart jupyter notebook and change the Kernel by selecting Kernel > Change kernel > tutorial_env

For the first tutorial, we run an experiment in which we increase the size of the dataset to see how sample-efficient the methods are. The environment is the Graph environment with a horizon of 4, and a tabular function class is used for the Q-functions.

In [15]:
# Imports
import numpy as np
import json
from copy import deepcopy

from ope.envs.graph import Graph
from ope.policies.basics import BasicPolicy

from ope.experiment_tools.experiment import ExperimentRunner, analysis
from ope.experiment_tools.config import Config
from ope.experiment_tools.factory import setup_params

In [18]:
# Read the experiment configuration (a JSON file stored under cfgs/)
configuration_filename = "toy_graph_pomdp_cfg.json"
config_path = 'cfgs/{0}'.format(configuration_filename)
with open(config_path, 'r') as config_file:
    param = json.load(config_file)

In [19]:
# Pass the loaded JSON config through setup_params and keep the result under the same name.
# NOTE(review): this rebinds `param`, so re-running this cell feeds the already-processed
# value back into setup_params. Re-run the configuration-loading cell first if you repeat it.
param = setup_params(param) # Setup parameters
runner = ExperimentRunner() # Instantiate a runner for an experiment

In [22]:
# Queue 5 experiments whose dataset sizes double each time: 8, 16, 32, 64, 128 trajectories
for N in range(5):

    # Work on a private copy so the shared `param` dict is never modified
    configuration = deepcopy(param['experiment'])
    configuration['num_traj'] = 8 * (2 ** N)

    # Wrap the experiment settings in a Config object
    cfg = Config(configuration)

    # Build the environment from the values in the config file.
    # To use a different environment, replace this construction.
    graph_kwargs = dict(
        make_pomdp=cfg.is_pomdp,
        number_of_pomdp_states=cfg.pomdp_horizon,
        transitions_deterministic=not cfg.stochastic_env,
        max_length=cfg.horizon,
        sparse_rewards=cfg.sparse_rewards,
        stochastic_rewards=cfg.stochastic_rewards,
    )
    env = Graph(**graph_kwargs)

    # Seed numpy's global random generator for this experiment
    np.random.seed(cfg.seed)

    # The processor transforms a state before it is stored in a transition
    # {(processor(x), a, r, processor(x'), done)}; here it is the identity.
    processor = lambda x: x

    # Absorbing state used to pad episodes that end before the horizon is reached.
    # This is environment dependent.
    absorbing_state = processor(np.array([env.n_dim - 1]))

    # Policies. BasicPolicy takes the form [P(a=0), P(a=1), ..., P(a=n)].
    # P(a=0) is clamped to at least .001. Swap in other policies here.
    actions = [0, 1]
    p_eval = max(.001, cfg.eval_policy)
    pi_e = BasicPolicy(actions, [p_eval, 1 - p_eval])
    p_base = max(.001, cfg.base_policy)
    pi_b = BasicPolicy(actions, [p_base, 1 - p_base])

    # Attach the environment, policies, processor and absorbing state to the config
    experiment_objects = {
        'env': env,
        'pi_e': pi_e,
        'pi_b': pi_b,
        'processor': processor,
        'absorbing_state': absorbing_state
    }
    cfg.add(experiment_objects)
    cfg.add({'models': param['models']})

    # Hand the finished configuration to the runner
    runner.add(cfg)

{1: 0, 2: 0, 3: 0, 4: 0, 5: 0, 6: 0, 7: 1, 8: 1, 9: 1, 10: 1, 11: 1, 12: 1, 13: 2, 14: 2, 15: 2, 16: 2, 17: 2, 18: 2, 19: 3, 20: 3, 21: 3, 22: 3, 23: 3, 24: 3, 25: 4, 26: 4, 27: 4, 28: 4, 29: 4, 30: 4, 0: 0, 31: 5}
{1: 0, 2: 0, 3: 0, 4: 0, 5: 0, 6: 0, 7: 1, 8: 1, 9: 1, 10: 1, 11: 1, 12: 1, 13: 2, 14: 2, 15: 2, 16: 2, 17: 2, 18: 2, 19: 3, 20: 3, 21: 3, 22: 3, 23: 3, 24: 3, 25: 4, 26: 4, 27: 4, 28: 4, 29: 4, 30: 4, 0: 0, 31: 5}
{1: 0, 2: 0, 3: 0, 4: 0, 5: 0, 6: 0, 7: 1, 8: 1, 9: 1, 10: 1, 11: 1, 12: 1, 13: 2, 14: 2, 15: 2, 16: 2, 17: 2, 18: 2, 19: 3, 20: 3, 21: 3, 22: 3, 23: 3, 24: 3, 25: 4, 26: 4, 27: 4, 28: 4, 29: 4, 30: 4, 0: 0, 31: 5}
{1: 0, 2: 0, 3: 0, 4: 0, 5: 0, 6: 0, 7: 1, 8: 1, 9: 1, 10: 1, 11: 1, 12: 1, 13: 2, 14: 2, 15: 2, 16: 2, 17: 2, 18: 2, 19: 3, 20: 3, 21: 3, 22: 3, 23: 3, 24: 3, 25: 4, 26: 4, 27: 4, 28: 4, 29: 4, 30: 4, 0: 0, 31: 5}
{1: 0, 2: 0, 3: 0, 4: 0, 5: 0, 6: 0, 7: 1, 8: 1, 9: 1, 10: 1, 11: 1, 12: 1, 13: 2, 14: 2, 15: 2, 16: 2, 17: 2, 18: 2, 19: 3, 20: 3, 21: 3, 2

In [None]:
# Run the experiments
# `results` is whatever runner.run() returns; the analysis cell iterates over it one entry at a time.
results = runner.run()

In [14]:
# Report every experiment's results, numbering the experiments from 1.
# Each row in the result is (OPE estimator, V(pi_e), MSE Error from on-policy: (V(pi_e) - True)**2)
separator = '*' * 20
for experiment_number, result in enumerate(results, start=1):
    print('Result Experiment %s' % experiment_number)
    analysis(result)
    print(separator)
    print()

Result Experiment 1
Results:  

         ON POLICY:    -1.1875. Error:     0.0000
            AM FQE:    -0.8876. Error:     0.0900
            DR FQE:    -1.1128. Error:     0.0056
           WDR FQE:    -1.3082. Error:     0.0146
         MAGIC FQE:    -0.8876. Error:     0.0900
           SDR FQE:    -1.1128. Error:     0.0056
        AM Retrace:    -1.0343. Error:     0.0235
        DR Retrace:    -1.0963. Error:     0.0083
       WDR Retrace:    -1.1978. Error:     0.0001
     MAGIC Retrace:    -1.0238. Error:     0.0268
       SDR Retrace:    -1.0963. Error:     0.0083
    AM Tree-Backup:    -1.0196. Error:     0.0282
    DR Tree-Backup:    -1.0969. Error:     0.0082
   WDR Tree-Backup:    -1.1976. Error:     0.0001
 MAGIC Tree-Backup:    -1.0272. Error:     0.0257
   SDR Tree-Backup:    -1.0969. Error:     0.0082
   AM Q^pi(lambda):    -1.2358. Error:     0.0023
   DR Q^pi(lambda):    -1.0358. Error:     0.0230
  WDR Q^pi(lambda):    -1.0311. Error:     0.0245
MAGIC Q^pi(lambda)

For the second tutorial, we run the same experiment as before but with a different environment and a different Q-function class. The environment is the Pixel-Gridworld (Pix-GW) environment with a horizon of 5, and a neural-network (NN) function class is used for the Q-functions.

In [None]:
from ope.envs.gridworld import Gridworld
from ope.policies.epsilon_greedy_policy import EGreedyPolicy
from ope.policies.tabular_model import TabularPolicy

In [None]:
# Read the neural-network example configuration (a JSON file stored under cfgs/)
configuration_filename = "nn_example_cfg.json"
config_path = 'cfgs/{0}'.format(configuration_filename)
with open(config_path, 'r') as config_file:
    param = json.load(config_file)

In [None]:
# Pass the loaded JSON config through setup_params and keep the result under the same name.
# NOTE(review): this rebinds `param`, so re-running this cell feeds the already-processed
# value back into setup_params. Re-run the configuration-loading cell first if you repeat it.
param = setup_params(param)
# A fresh runner, so configurations queued on any earlier runner are not part of this run
runner = ExperimentRunner() # make new runner

In [None]:
# to_grid and from_grid are particular to Gridworld: they convert between a flat
# cell index and a one-hot 'image' of the grid. Neither depends on the loop
# variable, so they are defined once instead of on every iteration.
def to_grid(x, gridsize=(8, 8)):
    """Convert a flat grid index into a one-hot 'image' of shape ``gridsize``.

    Args:
        x: numpy array; its first element (after flattening) is the cell index.
        gridsize: (rows, columns) of the grid.

    Returns:
        A float array of shape ``gridsize``. It is all zeros when the index is
        at or beyond rows * columns (the value ``from_grid`` produces for an
        all-zero image). Otherwise exactly one cell is set to 1, placed in
        row-major order so that ``from_grid``'s argmax over the flattened
        image recovers the same index.
    """
    n_rows, n_cols = gridsize
    x = x.reshape(-1)
    x = x[0]
    out = np.zeros((n_rows, n_cols))
    # Derive the off-grid threshold from gridsize rather than hard-coding 64
    if x >= n_rows * n_cols:
        return out
    # Row-major layout: the row is index // columns (not // rows), which also
    # keeps the lookup in bounds for non-square grids
    out[x // n_cols, x % n_cols] = 1.
    return out


def from_grid(x, gridsize=(8, 8)):
    """Convert a 3-D 'image' into a length-1 array holding the flat grid index.

    Args:
        x: numpy array. Only 3-dimensional inputs are converted.
        gridsize: (rows, columns) of the grid.

    Returns:
        For a 3-D input: ``np.array([rows * columns])`` if the image sums to
        zero, otherwise ``np.array([argmax of the flattened image])``.
        Any other input is returned unchanged.
    """
    if len(x.shape) == 3:
        if np.sum(x) == 0:
            x = np.array([gridsize[0] * gridsize[1]])
        else:
            x = np.array([np.argmax(x.reshape(-1))])
    return x


# Queue 5 experiments whose dataset sizes double each time: 8, 16, 32, 64, 128 trajectories
for N in range(5):
    configuration = deepcopy(param['experiment']) # Make sure to deepcopy as to never change original
    configuration['num_traj'] = 8*2**N # Increase dataset size

    # Wrap the experiment settings in a Config object
    cfg = Config(configuration)

    # initialize environment with the parameters from the config file.
    env = Gridworld(slippage=.2*cfg.stochastic_env)

    # Set the seed and read the deviation probabilities used by the two policies below
    np.random.seed(cfg.seed)
    eval_policy = cfg.eval_policy
    base_policy = cfg.base_policy

    # processor processes the state for storage,  {(processor(x), a, r, processor(x'), done)}
    processor = lambda x: x

    # Set up e-greedy policy using epsilon-optimal
    policy = env.best_policy()

    # absorbing state for padding if episode ends before horizon is reached. This is environment dependent.
    absorbing_state = processor(np.array([len(policy)]))

    # Setup policies. Both wrap the same tabular policy and differ only in prob_deviation.
    pi_e = EGreedyPolicy(model=TabularPolicy(policy, absorbing=absorbing_state), processor=from_grid, prob_deviation=eval_policy, action_space_dim=env.n_actions)
    pi_b = EGreedyPolicy(model=TabularPolicy(policy, absorbing=absorbing_state), processor=from_grid, prob_deviation=base_policy, action_space_dim=env.n_actions)

    cfg.add({
        'env': env,
        'pi_e': pi_e,
        'pi_b': pi_b,
        'processor': processor,
        'absorbing_state': absorbing_state,
        'convert_from_int_to_img': to_grid, # if environment state is an int, can convert to image through this function
    })
    cfg.add({'models': param['models']})

    runner.add(cfg)

In [None]:
# Run the experiments
# `results` is whatever runner.run() returns; the analysis cell iterates over it one entry at a time.
results = runner.run()

In [None]:
# Report every experiment's results, numbering the experiments from 1.
# Each row in the result is (OPE estimator, V(pi_e), MSE Error from on-policy: (V(pi_e) - True)**2)
separator = '*' * 20
for experiment_number, result in enumerate(results, start=1):
    print('Result Experiment %s' % experiment_number)
    analysis(result)
    print(separator)
    print()