In [7]:
from gridworld import GridworldMdp
from fast_agents import FastMyopicAgent, FastOptimalAgent
from mdp_interface import Mdp
from agent_runner import get_reward_from_trajectory
import numpy as np

In [2]:
# Compact array display: 5 digits of precision, lines up to 200 characters wide.
np.set_printoptions(
    linewidth=200,
    precision=5,
)

In [3]:
def gen_gridworld_arr(gridworld):
    """Encode a gridworld as a 3-channel int8 array.

    Channels:
        0 -- ``gridworld.walls`` converted to an array,
        1 -- reward values, written at each key of ``gridworld.rewards``,
        2 -- a single 1 at ``gridworld.get_start_state()``.

    The spatial shape is taken from ``gridworld.walls`` itself rather than
    from notebook-level ``width``/``height`` globals, so the function works
    on a fresh kernel and for any grid size.

    Args:
        gridworld: object exposing ``walls`` (2-D array-like), ``rewards``
            (mapping from ``(x, y)`` to a reward) and ``get_start_state()``
            (returning ``(x, y)``).

    Returns:
        ``np.ndarray`` of shape ``(3,) + np.array(gridworld.walls).shape``
        and dtype ``int8``.
    """
    walls = np.array(gridworld.walls)
    # NOTE(review): int8 silently truncates/wraps rewards that are non-integer
    # or outside [-128, 127] -- confirm the reward range used by the generator.
    arr = np.zeros((3,) + walls.shape, dtype=np.int8)
    arr[0] = walls

    # NOTE(review): rewards and the start state are indexed [x, y] on the same
    # axes as `walls`; confirm `walls` is not stored as [y][x] before using
    # non-square grids.
    for (x, y) in gridworld.rewards:
        arr[1, x, y] = gridworld.rewards[(x, y)]

    (x, y) = gridworld.get_start_state()
    arr[2, x, y] = 1

    return arr

In [4]:
def gen_random_connected(height, width, num_rewards):
    """Generate a random connected, noise-free gridworld, retrying on failure.

    Calls ``GridworldMdp.generate_random_connected`` up to five times and
    returns the first result that does not raise.

    Args:
        height: forwarded as ``height=``.
        width: forwarded as ``width=``.
        num_rewards: forwarded as ``num_rewards=``.

    Returns:
        Whatever ``GridworldMdp.generate_random_connected`` returns.

    Raises:
        ValueError: if every attempt raised; the last underlying exception is
            attached as ``__cause__`` so the real failure stays visible.
    """
    max_attempts = 5
    last_error = None
    for _ in range(max_attempts):
        try:
            return GridworldMdp.generate_random_connected(
                height=height, width=width, num_rewards=num_rewards, noise=0)
        except Exception as err:
            # Only ordinary errors are retried; KeyboardInterrupt / SystemExit
            # propagate so the cell can still be interrupted.
            last_error = err
    raise ValueError('Could not generate Gridworld') from last_error

In [12]:
%%time

# Grid generation parameters.
height=7
width=7
num_rewards=4

gamma = 0.9
myopic_horizon = 2

# Label values written into the last channel of each trial.
INTERVENE = 1
NOT_INTERVENE = 0

num_grids = 100
episode_length = 8
num_interventions = 3


def _count_decision_points(moves_left, interventions_left):
    """Count the trials gen_data's recursion records for a single grid.

    Mirrors the branching in gen_data: a node with no moves or no
    interventions left records nothing; every other node records one trial
    and branches into a "don't intervene" child (same intervention budget)
    and an "intervene" child (budget reduced by one).
    """
    if moves_left == 0 or interventions_left == 0:
        return 0
    return (1
            + _count_decision_points(moves_left - 1, interventions_left)
            + _count_decision_points(moves_left - 1, interventions_left - 1))


# Derived from episode_length and num_interventions so it can no longer drift
# out of sync with them (evaluates to 92 for episode_length=8, num_interventions=3).
num_trials_per_grid = _count_decision_points(episode_length, num_interventions)

num_entries_per_trial = 5
num_trials = num_trials_per_grid * num_grids

def get_minibatch(gridworld, curr_state, action):
    """Take one step from ``curr_state`` and package the transition.

    Wraps ``gridworld`` and ``curr_state`` in an ``Mdp``, performs ``action``
    on it, and returns ``(transition, next_state)`` where ``transition`` is
    the tuple ``(curr_state, action, next_state, reward)``.
    """
    next_state, reward = Mdp(gridworld, curr_state).perform_action(action)
    transition = (curr_state, action, next_state, reward)
    return transition, next_state

def gen_data():
    """Build the intervention dataset over ``num_grids`` random gridworlds.

    For each grid, every combination of "follow the myopic agent" versus
    "take the optimal agent's action" is explored recursively from the start
    state for ``episode_length`` moves, with at most ``num_interventions``
    optimal-agent actions. Each node that still has an intervention available
    contributes one trial of shape ``(num_entries_per_trial, height, width)``:

        channels 0-2 -- the array from ``gen_gridworld_arr`` for that grid,
        channel 3    -- ``moves_left`` at that node (broadcast over the grid),
        channel 4    -- ``INTERVENE`` if the intervening branch's return is
                        strictly greater, else ``NOT_INTERVENE`` (broadcast).

    Trials are collected in a list and stacked at the end, so the output
    always has exactly one row per recorded trial. A preallocated buffer sized
    from a hand-maintained count would instead raise ``IndexError`` when the
    count is too small, or leave all-zero rows when it is too large.

    Returns:
        Float array of shape ``(n_trials, num_entries_per_trial, height, width)``,
        with trials in the order the recursion finishes them.
    """
    records = []

    for i in range(num_grids):
        print(f'Iter {i}', end='\r')
        gridworld = gen_random_connected(height, width, num_rewards)
        gridworld_arr = gen_gridworld_arr(gridworld)
        start_state = gridworld.get_start_state()

        def recurse(trajectory, curr_state, moves_left, interventions_left):
            """Return the best trajectory reward reachable from this node."""
            if moves_left == 0:
                return get_reward_from_trajectory(trajectory)

            # Branch 1: take the myopic agent's action, intervention budget unchanged.
            myopic_agent = FastMyopicAgent(horizon=myopic_horizon, num_iters=moves_left)
            myopic_agent.set_mdp(gridworld)
            action = myopic_agent.get_action(curr_state)
            minibatch, next_state = get_minibatch(gridworld, curr_state, action)
            r1 = recurse(trajectory + [minibatch], next_state, moves_left-1, interventions_left)

            if interventions_left == 0:
                # No budget left: nothing to decide here, so no trial is recorded.
                return r1

            # Branch 2: intervene with the optimal agent's action, spending one intervention.
            optimal_agent = FastOptimalAgent(num_iters=moves_left)
            optimal_agent.set_mdp(gridworld)
            action = optimal_agent.get_action(curr_state)
            minibatch, next_state = get_minibatch(gridworld, curr_state, action)
            r2 = recurse(trajectory + [minibatch], next_state, moves_left-1, interventions_left-1)

            # Recorded only after both subtrees finish, so deeper nodes come first.
            entry = np.zeros((num_entries_per_trial, height, width))
            entry[:3] = gridworld_arr
            entry[3] = moves_left
            entry[4] = INTERVENE if r1 < r2 else NOT_INTERVENE
            records.append(entry)
            return max(r1,r2)

        recurse([], start_state, episode_length, num_interventions)

    if not records:
        # e.g. num_grids == 0 or no interventions allowed: empty but correctly shaped.
        return np.zeros((0, num_entries_per_trial, height, width))
    return np.stack(records)

# Generate the full dataset once and keep it in `data` for the cells below.
data = gen_data()

CPU times: user 16.1 s, sys: 158 ms, total: 16.2 s
Wall time: 16.2 s


In [39]:
# Pull one scalar per trial: channel 3 at grid cell (0, 0).
# NOTE(review): in the visible gen_data, channel 3 holds moves_left and the
# INTERVENE / NOT_INTERVENE label is written to channel 4. The recorded outputs
# of this and the following cells (shape (10000,), negative values) match
# neither that layout nor num_trials, so they appear to come from an earlier
# version of the generator -- re-run from a fresh kernel and confirm the index.
y = data[:,3,0,0]
y.shape

(10000,)

In [40]:
# Fraction of entries of y that are exactly zero.
(y==0).mean()

0.2803

In [41]:
# Fraction of entries of y that are strictly positive.
(y>0).mean()

0.2389

In [42]:
# Fraction of entries of y that are strictly negative.
(y<0).mean()

0.4808

In [43]:
# Mean of y over all trials.
y.mean()

5.379641657349001