In [1]:
from gridworld import GridworldMdp
from agents import OptimalAgent, MyopicAgent, UncalibratedAgent
from mdp_interface import Mdp
from agent_runner import get_reward_from_trajectory, run_agent
import numpy as np

In [2]:
class Intervention:

    def __init__(self,trial_length=10,num_interventions=3,gamma=0.9):
        self.steps_left = trial_length
        self.interventions_left = num_interventions
        self.optimal_agent = OptimalAgent(gamma=gamma)

    def set_mdp(self,mdp):
        self.optimal_agent.set_mdp(mdp)

    def get_optimal_action(self,state):
        return self.optimal_agent.get_action(state)

    def will_intervene(self,state,agent):
        raise NotImplemented("Cannot call will_intervene for Intervention")

    def get_action(self,state,agent):
        if self.will_intervene(state,agent):
            self.interventions_left -= 1
            self.steps_left -= 1
            return self.get_optimal_action(state)
        self.steps_left -= 1
        return agent.get_action(state)

In [3]:
class RandomIntervention(Intervention):

    def will_intervene(self, state, agent):
        prob = self.interventions_left / self.steps_left
        return np.random.rand() < prob

In [4]:
class StrategicIntervention(Intervention):

    def __init__(self, trial_length=10, num_interventions=3, gamma=0.9, qval_threshold=2):
        super().__init__(trial_length=trial_length, num_interventions=num_interventions, gamma=gamma)
        self.qval_threshold = qval_threshold

    def will_intervene(self, state, agent):
        agent_action = agent.get_action()
        optimal_action = self.get_optimal_action()
        mu = self.optimal_agent.extend_state_to_mu(state)
        agent_qval = self.optimal_agent.qvalue(mu,agent_action)
        optimal_qval = self.optimal_agent.qvalue(mu,optimal_action)
        return optimal_qval - agent_qval > self.qval_threshold

In [97]:
height=7
width=7
num_rewards=4

def gen_random_connected():
    for _ in range(5):
        try:
            return GridworldMdp.generate_random_connected(height=height,width=width,num_rewards=num_rewards,noise=0)
        except:
            pass
    raise ValueError('Could not generate Gridworld')

In [17]:
num_trials = 1000
gamma = 0.9

In [18]:
def run_trial(agent, intervention, trial_length):

    mdp = gen_random_connected()
    env = Mdp(mdp)
    agent.set_mdp(mdp)
    intervention.set_mdp(mdp)
    trajectory = []

    for _ in range(trial_length):
        curr_state = env.get_current_state()
        action = intervention.get_action(curr_state,agent)
        next_state, reward = env.perform_action(action)
        minibatch = (curr_state, action, next_state, reward)
        agent.inform_minibatch(*minibatch)
        trajectory.append(minibatch)

    reward = get_reward_from_trajectory(trajectory,gamma)
    return reward

In [19]:
def flatten_mdp(mdp):
    size = height * width
    arr = np.zeros(size * 2, dtype=np.int8)
    for y in range(height):
        for x in range(width):
            if mdp.walls[y][x]:
                arr[x * width + y] = 1
            elif (x,y) in mdp.rewards:
                arr[size + x * width + y] = mdp.rewards[(x,y)]
            elif (x,y) == mdp.get_start_state():
                arr[x * width + y] = -1
    return arr

In [133]:
%%time
num_grids = 10
num_start_states = 20
episode_length = 10

def gen_data():
    agent = MyopicAgent(horizon=2)
    optimal_agent = OptimalAgent()
    total_trials = num_grids * num_start_states
    mdp_size = width * height * 2
    data = np.zeros((total_trials,mdp_size+2))
    trial = 0

    # number of times when intervention actually made a difference
    count = 0
    total_actions = 0
    diff_actions = 0

    for i in range(num_grids):
        print(f'IterI {i}', end='\r')
        gridworld = gen_random_connected()
        mdp = Mdp(gridworld)
        # flat_gridworld = flatten_mdp(gridworld)
        # perform intervention with various episode lengths left
        for j in range(num_start_states):
            #print(f'IterJ: {j}')
            start_state = gridworld.get_random_start_state()
            mdp.gridworld.start_state = start_state
            #print(mdp.gridworld, end='\n\n')
            
            agent.set_mdp(gridworld)
            optimal_agent.set_mdp(gridworld)
            
            agent_action = agent.get_action(start_state)
            optimal_action = optimal_agent.get_action(start_state)
            #print(f'Start State: {start_state}')
            #print(f'Agent Action: {agent_action}')
            #print(f'Optimal Action: {optimal_action}')
            
            total_actions += 1
            if agent_action != optimal_action:
                diff_actions+=1
                
                #print(mdp.gridworld, end='\n\n')
                #action_res = mdp.perform_action(agent_action)
                #print(action_res)
                #print(mdp.gridworld, end='\n\n')
                agent_trajectory = run_agent(agent,mdp,episode_length=episode_length)
                r1 = get_reward_from_trajectory(agent_trajectory)
                mdp.gridworld.start_state = start_state
                
                
                #print(mdp.gridworld, end='\n\n')
                #action_res = mdp.perform_action(optimal_action)
                #print(action_res)
                #print(mdp.gridworld, end='\n\n')
                intervened_trajectory = run_agent(agent,mdp,episode_length=episode_length, first_optimal=optimal_agent)
                r2 = get_reward_from_trajectory(intervened_trajectory)
                
                #for t1,t2 in zip(agent_trajectory, intervened_trajectory): print(f'{t1}\t|\t{t2}')
                
            #data[trial,:mdp_size] = flat_mdp
            #x,y = start_state
            #data[trial,-2] = y * width + x
            #data[trial,-1] = r2 - r1

            #print(f'Trajectory: {len(trajectory)}')
            #for t in trajectory:
            #    print(t)
                
                #if r1!=r2: print(f'Rewards: ##########{r1},{r2}#########')
                #else: print(f'Rewards: {r1},{r2}')
                if r1 != r2:
                    count += 1
            trial += 1
            #print('---------------------------------------')
    print()
    print(f'Intervention Helped: {count}/{num_grids*num_start_states} times')
    #print(total_actions, diff_actions)
    return data

data = gen_data()

IterI 9
Intervention Helped: 145/200 times
CPU times: user 3.37 s, sys: 24 ms, total: 3.39 s
Wall time: 3.39 s


In [59]:
%%time
num_grids = 1
num_start_states = 20
episode_length = 10

def gen_data():
    optimal_agent = OptimalAgent()
    total_trials = num_grids * num_start_states
    mdp_size = width * height * 2
    trial = 0

    # number of times when intervention actually made a difference
    cnt = 0
    total_actions = 0

    for i in range(num_grids):
        print(f'Iter {i}', end='\r')
        mdp = gen_random_connected()
        env = Mdp(mdp)
        optimal_agent.set_mdp(mdp)
        print(mdp, end='\n\n')
        # perform intervention with various episode lengths left
        for j in range(num_start_states):
            print(f'Iter: {j}')
            start_state = mdp.get_random_start_state()
            env.state = start_state
            optimal_action = optimal_agent.get_action(start_state)
            r2 = 0.0
            total_actions += 1
            if True:
                print(f'Start State: {start_state}')
                print(f'Optimal Action: {optimal_action}')
                env.perform_action(optimal_action)
                trajectory = run_agent(optimal_agent,env,episode_length=episode_length)
                r2 = get_reward_from_trajectory(trajectory)
            print(f'Trajectory: {len(trajectory)}')
            for t in trajectory:
                print(t)
            print(f'Rewards: {r2}')
            if r1 != r2:
                cnt += 1
            trial += 1
            print('---------------------------------------')
    print()
    print(cnt)
    print(total_actions, diff_actions)

gen_data()

XXXX 0
X 2X
X AX
XXXX

Iter: 0
Start State: (1, 1)
Optimal Action: (1, 0)
Trajectory: 10
((2, 2), (0, -1), (2, 1), -0.01)
((2, 1), (0, 0), (2, 1), 2.0)
((2, 1), (0, 0), (2, 1), 2.0)
((2, 1), (0, 0), (2, 1), 2.0)
((2, 1), (0, 0), (2, 1), 2.0)
((2, 1), (0, 0), (2, 1), 2.0)
((2, 1), (0, 0), (2, 1), 2.0)
((2, 1), (0, 0), (2, 1), 2.0)
((2, 1), (0, 0), (2, 1), 2.0)
((2, 1), (0, 0), (2, 1), 2.0)
Rewards: 11.016431198000001


NameError: name 'r1' is not defined