In [1]:
from gridworld import GridworldMdp
from agents import OptimalAgent, MyopicAgent, UncalibratedAgent
from mdp_interface import Mdp
from agent_runner import get_reward_from_trajectory, run_agent
import numpy as np

In [13]:
# Display-only setting: print floats with 5 digits of precision and allow
# 200-character lines so array rows are not wrapped.
np.set_printoptions(precision=5, linewidth=200)

In [2]:
def gen_gridworld_arr(gridworld):
    """Encode a gridworld as a 3-channel int8 array.

    Channel 0 is a copy of ``gridworld.walls``, channel 1 holds the reward
    stored under each ``(x, y)`` key of ``gridworld.rewards``, and channel 2
    has a single 1 at the position returned by ``gridworld.get_start_state()``.

    The spatial shape is taken from ``gridworld.walls`` itself rather than
    from notebook-level ``height``/``width`` variables, so the function works
    regardless of which other cells have been executed.

    Args:
        gridworld: object exposing ``walls`` (2-D array-like), ``rewards``
            (mapping from ``(x, y)`` to a number) and ``get_start_state()``
            returning an ``(x, y)`` pair.

    Returns:
        ``np.ndarray`` of dtype int8 with shape ``(3,) + walls.shape``.
    """
    walls = np.array(gridworld.walls)
    arr = np.zeros((3,) + walls.shape, dtype=np.int8)
    arr[0] = walls

    # NOTE(review): rewards and the start state are written at [x, y] while
    # channel 0 keeps whatever layout `walls` uses; if `walls` is stored as
    # [y][x] the channels are transposed relative to each other -- confirm.
    # NOTE(review): values are cast to int8 on assignment; presumably rewards
    # are small integers -- verify against the gridworld generator.
    for (x, y), reward in gridworld.rewards.items():
        arr[1, x, y] = reward

    x, y = gridworld.get_start_state()
    arr[2, x, y] = 1

    return arr

In [4]:
def gen_random_connected(height, width, num_rewards, max_attempts=5):
    """Generate a random connected gridworld, retrying on failure.

    Calls ``GridworldMdp.generate_random_connected`` with ``noise=0`` up to
    ``max_attempts`` times and returns the first result that does not raise.

    Args:
        height: grid height passed through to the generator.
        width: grid width passed through to the generator.
        num_rewards: number of rewards passed through to the generator.
        max_attempts: how many times to try before giving up (default 5,
            matching the previous hard-coded retry count).

    Returns:
        Whatever ``GridworldMdp.generate_random_connected`` returns.

    Raises:
        ValueError: if every attempt raised; the last underlying exception is
            attached as ``__cause__``.
    """
    last_error = None
    for _ in range(max_attempts):
        try:
            return GridworldMdp.generate_random_connected(
                height=height, width=width, num_rewards=num_rewards, noise=0)
        except Exception as err:
            # Only ordinary errors are retried; KeyboardInterrupt/SystemExit
            # propagate so a long generation loop can still be interrupted.
            last_error = err
    raise ValueError('Could not generate Gridworld') from last_error

In [38]:
%%time

# Geometry of each generated gridworld and how many rewards it contains.
height, width = 7, 7
num_rewards = 4

# NOTE(review): num_trials and gamma are not referenced by the cells shown
# here; presumably they are used elsewhere -- confirm before removing.
num_trials, gamma = 1, 0.9

# Dataset size: grids to generate, start states sampled per grid, and the
# episode length handed to run_agent.
num_grids = 10_000
num_start_states, episode_length = 1, 10

def gen_data():
    """Build a dataset of gridworlds labelled with the effect of an intervention.

    For each of ``num_grids`` random gridworlds and each of
    ``num_start_states`` random start states, the first action chosen by a
    ``MyopicAgent(horizon=2)`` is compared with the one chosen by an
    ``OptimalAgent``. When they differ, ``run_agent`` is called twice with
    ``episode_length`` -- once plainly and once with
    ``first_optimal=optimal_agent`` -- and the label is the intervened reward
    minus the plain reward. When the actions agree the label stays 0.

    Reads the notebook-level variables ``height``, ``width``, ``num_rewards``,
    ``num_grids``, ``num_start_states`` and ``episode_length``.

    Returns:
        ``np.ndarray`` of shape ``(num_grids * num_start_states, 4, width,
        height)``. Channels 0-2 are the output of ``gen_gridworld_arr`` for
        the trial's start state; channel 3 is filled with the scalar label.
    """
    agent = MyopicAgent(horizon=2)
    optimal_agent = OptimalAgent()
    total_trials = num_grids * num_start_states
    data = np.zeros((total_trials, 4, width, height))
    trial = 0

    # Bookkeeping for the summary line printed at the end.
    total_actions = 0
    diff_actions = 0

    for i in range(num_grids):
        print(f'IterI {i}', end='\r')
        gridworld = gen_random_connected(height, width, num_rewards)
        mdp = Mdp(gridworld)
        for _ in range(num_start_states):
            start_state = gridworld.get_random_start_state()
            mdp.gridworld.start_state = start_state

            # Encode the grid only after the start state has been assigned so
            # the start-state channel matches the state the label is computed
            # from. Encoding once per grid beforehand stored the grid's
            # original start state for every trial.
            # NOTE(review): assumes get_start_state() reflects the
            # `start_state` attribute set above -- confirm in GridworldMdp.
            gridworld_arr = gen_gridworld_arr(mdp.gridworld)

            agent.set_mdp(gridworld)
            optimal_agent.set_mdp(gridworld)

            agent_action = agent.get_action(start_state)
            optimal_action = optimal_agent.get_action(start_state)

            # Default label of 0 when both agents pick the same first action.
            r1, r2 = 0.0, 0.0

            total_actions += 1
            if agent_action != optimal_action:
                diff_actions += 1

                agent_trajectory = run_agent(agent, mdp, episode_length=episode_length)
                r1 = get_reward_from_trajectory(agent_trajectory)
                intervened_trajectory = run_agent(
                    agent, mdp, episode_length=episode_length, first_optimal=optimal_agent)
                r2 = get_reward_from_trajectory(intervened_trajectory)

            data[trial, :3] = gridworld_arr
            # The scalar label is broadcast across the whole 4th channel.
            data[trial, 3] = r2 - r1

            trial += 1

    print()
    print(data.shape)
    print(f'Out of {total_actions} actions, {diff_actions} were different')
    return data

# Run the full generation; the recorded output below shows roughly 4 minutes
# of wall time for the 10000-grid configuration.
data = gen_data()

IterI 9999
(10000, 4, 7, 7)
Out of 10000 actions, 7209 were different
CPU times: user 3min 58s, sys: 2.66 s, total: 4min
Wall time: 3min 58s


In [39]:
# gen_data fills channel 3 of every sample with one scalar (r2 - r1), so
# reading position [0, 0] of that channel yields one label per trial.
y = data[:,3,0,0]
y.shape

(10000,)

In [40]:
# Fraction of trials with a zero label: the intervention changed nothing,
# including trials where both agents chose the same first action.
(y==0).mean()

0.2803

In [41]:
# Fraction of trials where the intervened run collected more reward.
(y>0).mean()

0.2389

In [42]:
# Fraction of trials where the intervened run collected less reward.
(y<0).mean()

0.4808

In [43]:
# Average reward difference (intervened minus plain) over all trials.
y.mean()

5.379641657349001