In [1]:
from gridworld import GridworldMdp
from agents import OptimalAgent, MyopicAgent, UncalibratedAgent
from mdp_interface import Mdp
from agent_runner import get_reward_from_trajectory
import numpy as np

In [2]:
class Intervention:
    """Base class for an overseer that may override an agent's chosen action.

    An intervention tracks how many steps remain in the trial and how many
    interventions it is still allowed to make. Whenever it decides to
    intervene, the agent's action is replaced by the action chosen by an
    internal ``OptimalAgent``.

    Subclasses must implement ``will_intervene``.
    """

    def __init__(self,trial_length=10,num_interventions=3,gamma=0.9):
        """Set up the step/intervention counters and the optimal agent.

        Args:
            trial_length: initial value of ``steps_left``.
            num_interventions: initial value of ``interventions_left``.
            gamma: forwarded to ``OptimalAgent(gamma=...)``.
        """
        self.steps_left = trial_length
        self.interventions_left = num_interventions
        self.optimal_agent = OptimalAgent(gamma=gamma)

    def set_mdp(self,mdp):
        """Forward ``mdp`` to the internal optimal agent."""
        self.optimal_agent.set_mdp(mdp)

    def get_optimal_action(self,state):
        """Return the internal optimal agent's action for ``state``."""
        return self.optimal_agent.get_action(state)

    def will_intervene(self,state,agent_action):
        """Decide whether to override ``agent_action`` in ``state``.

        Raises:
            NotImplementedError: always; subclasses provide the policy.
        """
        # NotImplemented is a non-callable constant used for binary-operator
        # dispatch; the exception class for abstract methods is
        # NotImplementedError.
        raise NotImplementedError("Cannot call will_intervene for Intervention")

    def get_action(self,state,agent_action):
        """Return the action to execute and consume one step of the trial.

        ``will_intervene`` is consulted before any counter is updated, so the
        policy sees the counters as they were at the start of this step.

        Returns:
            The optimal agent's action if intervening, else ``agent_action``.
        """
        intervene = self.will_intervene(state,agent_action)
        # Every call uses up one step, whether or not we intervene.
        self.steps_left -= 1
        if intervene:
            self.interventions_left -= 1
            return self.get_optimal_action(state)
        return agent_action

In [3]:
class RandomIntervention(Intervention):
    """Intervention that overrides the agent at randomly chosen steps."""

    def will_intervene(self, state, agent_action):
        """Randomly decide whether to intervene on this step.

        The chance of intervening is the number of interventions still
        available divided by the number of steps still left; ``state`` and
        ``agent_action`` are not consulted.
        """
        # NOTE(review): steps_left == 0 would divide by zero here; the visible
        # callers never ask for more actions than trial_length.
        intervene_probability = self.interventions_left / self.steps_left
        draw = np.random.rand()
        return draw < intervene_probability

In [4]:
def gen_random_connected(GridworldMdp, height, width, num_rewards, noise, max_attempts=5):
    """Generate a random connected gridworld, retrying on failure.

    Args:
        GridworldMdp: object exposing ``generate_random_connected(height=,
            width=, num_rewards=, noise=)``.
        height, width, num_rewards, noise: forwarded unchanged to
            ``generate_random_connected``.
        max_attempts: how many times to try before giving up.

    Returns:
        Whatever ``GridworldMdp.generate_random_connected`` returns on the
        first attempt that does not raise.

    Raises:
        RuntimeError: if every attempt raised; the last failure is chained as
            the cause.
    """
    last_error = None
    for _ in range(max_attempts):
        try:
            # Forward the caller's arguments rather than fixed 8x8 / 4 / 0.
            return GridworldMdp.generate_random_connected(
                height=height, width=width, num_rewards=num_rewards, noise=noise)
        except Exception as err:
            # Generation can fail; remember why and retry. Catching Exception
            # (not a bare except) lets KeyboardInterrupt still abort the run.
            last_error = err
    # Raising a plain string is itself a TypeError in Python 3.
    raise RuntimeError('Could not generate Gridworld') from last_error

In [5]:
# Number of independent trials averaged for each (trial_length,
# num_interventions) setting in the experiment cells below.
num_trials = 1000
# Read as a global by run_trial and passed to get_reward_from_trajectory when
# scoring a trajectory (presumably the discount factor; it matches the default
# gamma of Intervention).
gamma = 0.9

In [6]:
def run_trial(agent, intervention, trial_length):
    """Run one trial on a freshly generated gridworld and return its reward.

    A new 8x8 gridworld with 4 rewards and no noise is generated and handed to
    both ``agent`` and ``intervention``. On each of ``trial_length`` steps the
    agent proposes an action, the intervention may replace it, the resulting
    action is executed, and the agent is informed of the transition.

    Returns:
        ``get_reward_from_trajectory`` applied to the collected transitions and
        the module-level ``gamma``.
    """
    grid = gen_random_connected(GridworldMdp, height=8,width=8,num_rewards=4,noise=0)
    env = Mdp(grid)
    agent.set_mdp(grid)
    intervention.set_mdp(grid)

    transitions = []
    for _step in range(trial_length):
        state = env.get_current_state()
        proposed = agent.get_action(state)
        # The intervention has the final say on what is actually executed.
        executed = intervention.get_action(state, proposed)
        successor, step_reward = env.perform_action(executed)
        # The agent is told about the executed action, not the one it proposed.
        agent.inform_minibatch(state, executed, successor, step_reward)
        transitions.append((state, executed, successor, step_reward))

    return get_reward_from_trajectory(transitions, gamma)

In [7]:
%%time
## Myopic Agent, Random Intervention
# For each trial length, sweep the intervention budget over 0%, 10%, ..., 90%
# of the trial's steps and print the mean reward over num_trials trials,
# one tab-separated value per budget and one output line per trial length.
for trial_length in [10,20,30]:
    print(trial_length, end=': ')
    for num_interventions in (np.arange(0,1,.1) * trial_length):
        # The budget is a float here; int() truncates it toward zero.
        num_interventions = int(num_interventions)
        results = []
        for _ in range(num_trials):
            # A new agent and intervention per trial: the intervention's
            # step/intervention counters are consumed as the trial runs.
            agent = MyopicAgent(horizon=3)
            intervention = RandomIntervention(trial_length=trial_length, num_interventions=num_interventions)
            results.append(run_trial(agent,intervention, trial_length))
        print(f'{np.mean(results):.2f}', end='\t')
    print()

10: 13.38	15.07	16.63	17.47	18.66	18.73	19.14	19.99	21.35	21.08	
20: 20.83	23.89	26.07	28.47	29.33	31.61	32.41	33.61	35.24	35.45	
30: 21.99	27.71	31.38	32.57	35.42	36.53	37.43	38.51	38.85	39.57	
CPU times: user 14min 31s, sys: 2.93 s, total: 14min 34s
Wall time: 14min 39s


In [11]:
# Mean rewards copied by hand from the printed output of the myopic-agent
# sweep. Rows: trial_length 10, 20, 30. Columns: intervention budget of
# 0%, 10%, ..., 90% of the trial length.
myopic = np.array([[13.38,15.07,16.63,17.47,18.66,18.73,19.14,19.99,21.35,21.08],
                   [20.83,23.89,26.07,28.47,29.33,31.61,32.41,33.61,35.24,35.45],
                   [21.99,27.71,31.38,32.57,35.42,36.53,37.43,38.51,38.85,39.57]])

In [8]:
%%time
## Overconfident Agent, Random Intervention
# For each trial length, sweep the intervention budget over 0%, 10%, ..., 90%
# of the trial's steps and print the mean reward over num_trials trials,
# one tab-separated value per budget and one output line per trial length.
for trial_length in [10,20,30]:
    print(trial_length, end=': ')
    for num_interventions in (np.arange(0,1,.1) * trial_length):
        # The budget is a float here; int() truncates it toward zero.
        num_interventions = int(num_interventions)
        results = []
        for _ in range(num_trials):
            # A new agent and intervention per trial: the intervention's
            # step/intervention counters are consumed as the trial runs.
            agent = UncalibratedAgent(calibration_factor=5)
            intervention = RandomIntervention(trial_length=trial_length, num_interventions=num_interventions)
            results.append(run_trial(agent,intervention, trial_length))
        print(f'{np.mean(results):.2f}', end='\t')
    print()

10: 21.57	21.95	21.53	21.64	21.60	21.01	21.47	21.52	21.69	22.32	
20: 35.66	35.11	35.61	35.39	34.67	35.22	36.02	34.73	34.10	34.32	
30: 39.08	38.74	40.70	39.35	39.25	40.21	40.01	40.72	39.09	39.78	
CPU times: user 33min 16s, sys: 5.07 s, total: 33min 21s
Wall time: 45min 1s


In [12]:
# Mean rewards copied by hand from the printed output of the overconfident-agent
# sweep. Rows: trial_length 10, 20, 30. Columns: intervention budget of
# 0%, 10%, ..., 90% of the trial length.
overconfident = np.array([[21.57,21.95,21.53,21.64,21.60,21.01,21.47,21.52,21.69,22.32],
                          [35.66,35.11,35.61,35.39,34.67,35.22,36.02,34.73,34.10,34.32],
                          [39.08,38.74,40.70,39.35,39.25,40.21,40.01,40.72,39.09,39.78]])

In [9]:
%%time
## Underconfident Agent, Random Intervention
# For each trial length, sweep the intervention budget over 0%, 10%, ..., 90%
# of the trial's steps and print the mean reward over num_trials trials,
# one tab-separated value per budget and one output line per trial length.
for trial_length in [10,20,30]:
    print(trial_length, end=': ')
    for num_interventions in (np.arange(0,1,.1) * trial_length):
        # The budget is a float here; int() truncates it toward zero.
        num_interventions = int(num_interventions)
        results = []
        for _ in range(num_trials):
            # A new agent and intervention per trial: the intervention's
            # step/intervention counters are consumed as the trial runs.
            agent = UncalibratedAgent(calibration_factor=0.2)
            intervention = RandomIntervention(trial_length=trial_length, num_interventions=num_interventions)
            results.append(run_trial(agent,intervention, trial_length))
        print(f'{np.mean(results):.2f}', end='\t')
    print()

10: 22.42	22.82	21.54	21.67	21.61	22.12	21.64	21.95	21.60	21.78	
20: 35.63	35.38	35.26	35.29	35.50	35.61	35.01	35.07	35.52	33.90	
30: 39.97	40.60	40.51	40.57	39.85	38.83	39.84	40.73	40.54	39.43	
CPU times: user 34min 38s, sys: 8.41 s, total: 34min 47s
Wall time: 1h 2min 24s


In [13]:
# Mean rewards copied by hand from the printed output of the underconfident-agent
# sweep. Rows: trial_length 10, 20, 30. Columns: intervention budget of
# 0%, 10%, ..., 90% of the trial length.
underconfident = np.array([[22.42,22.82,21.54,21.67,21.61,22.12,21.64,21.95,21.60,21.78],
                           [35.63,35.38,35.26,35.29,35.50,35.61,35.01,35.07,35.52,33.90],
                           [39.97,40.60,40.51,40.57,39.85,38.83,39.84,40.73,40.54,39.43]])

In [10]:
%%time
## Optimal Agent, Random Intervention
# For each trial length, sweep the intervention budget over 0%, 10%, ..., 90%
# of the trial's steps and print the mean reward over num_trials trials,
# one tab-separated value per budget and one output line per trial length.
for trial_length in [10,20,30]:
    print(trial_length, end=': ')
    for num_interventions in (np.arange(0,1,.1) * trial_length):
        # The budget is a float here; int() truncates it toward zero.
        num_interventions = int(num_interventions)
        results = []
        for _ in range(num_trials):
            # A new agent and intervention per trial: the intervention's
            # step/intervention counters are consumed as the trial runs.
            agent = OptimalAgent()
            intervention = RandomIntervention(trial_length=trial_length, num_interventions=num_interventions)
            results.append(run_trial(agent,intervention, trial_length))
        print(f'{np.mean(results):.2f}', end='\t')
    print()

10: 22.62	22.32	21.89	21.63	21.48	22.09	21.72	21.95	21.94	21.36	
20: 34.67	36.26	35.91	34.95	34.71	35.23	35.25	35.53	35.77	34.97	
30: 40.81	39.20	40.48	39.78	40.30	40.32	40.41	40.18	39.00	39.51	
CPU times: user 19min 18s, sys: 2.64 s, total: 19min 20s
Wall time: 19min 23s


In [14]:
# Mean rewards copied by hand from the printed output of the optimal-agent
# sweep. Rows: trial_length 10, 20, 30. Columns: intervention budget of
# 0%, 10%, ..., 90% of the trial length.
optimal = np.array([[22.62,22.32,21.89,21.63,21.48,22.09,21.72,21.95,21.94,21.36],
                    [34.67,36.26,35.91,34.95,34.71,35.23,35.25,35.53,35.77,34.97],
                    [40.81,39.20,40.48,39.78,40.30,40.32,40.41,40.18,39.00,39.51]])

In [24]:
# Element-wise ratio of the myopic agent's mean reward to the optimal agent's
# for the same trial length and intervention budget (below 1 = myopic earned less).
myopic / optimal

array([[0.59151194, 0.67517921, 0.75970763, 0.80767453, 0.86871508, 0.84789498, 0.88121547, 0.91070615, 0.97310848, 0.98689139],
       [0.60080761, 0.65885273, 0.72598162, 0.81459227, 0.84500144, 0.89724666, 0.91943262, 0.94596116, 0.98518311, 1.01372605],
       [0.53883852, 0.70688776, 0.77519763, 0.81875314, 0.87890819, 0.90600198, 0.92625588, 0.95843703, 0.99615385, 1.0015186 ]])

In [25]:
# Element-wise ratio of the underconfident agent's mean reward to the optimal
# agent's for the same trial length and intervention budget.
underconfident / optimal

array([[0.99115827, 1.02240143, 0.98401096, 1.00184928, 1.00605214, 1.00135808, 0.99631676, 1.        , 0.98450319, 1.01966292],
       [1.02768965, 0.97573083, 0.98189919, 1.00972818, 1.02276001, 1.01078626, 0.99319149, 0.98705319, 0.9930109 , 0.96940234],
       [0.97941681, 1.03571429, 1.00074111, 1.01985923, 0.98883375, 0.96304563, 0.98589458, 1.0136884 , 1.03948718, 0.9979752 ]])

In [26]:
# Element-wise ratio of the overconfident agent's mean reward to the optimal
# agent's for the same trial length and intervention budget.
overconfident / optimal

array([[0.9535809 , 0.98342294, 0.98355413, 1.00046232, 1.00558659, 0.9511091 , 0.98848987, 0.98041002, 0.98860529, 1.04494382],
       [1.02855495, 0.96828461, 0.99164578, 1.01258941, 0.99884759, 0.99971615, 1.02184397, 0.97748382, 0.95331283, 0.98141264],
       [0.95760843, 0.98826531, 1.00543478, 0.98919055, 0.97394541, 0.99727183, 0.99010146, 1.01343952, 1.00230769, 1.00683371]])