In [None]:
import numpy as np

# custom libraries 
from envs import FourRooms
from agents import GPIAgent
from runners import run_experiment_episodic
from utils import load_results

%load_ext autoreload
%autoreload 2

In [None]:
# Load the precomputed representations identified by 'fourrooms_lrs_stay'.
# Per the original note the result is a nested dict of the form
# {room: {lambda: {LR, deltas}, policy}}; the cells below index it as
# four_rooms_agents[room][agent_lambda]['LR'].
four_rooms_agents = load_results('fourrooms_lrs_stay')

In [None]:
# Evaluate a GPI agent for each agent lambda over `n_eps` episodes with a
# random start state, and record the mean return and its standard error.
goals = [80, 24, 41]
goal_rewards = [5, 10, 5]
env_lambda_ = 0.5
n_eps = 50
agent_lambdas = [0.0, 0.5, 1.0]
rooms = [0, 1, 2, 3]  # loop-invariant, so defined once up front

# Maps agent lambda -> {'mean': mean return, 'std': standard error of the mean}.
# The 'std' key name is kept for compatibility even though it holds a
# standard error, not a standard deviation.
results = {}
for agent_lambda_ in agent_lambdas:
    env = FourRooms(
        goals=goals,
        start_state=-1,  # random start state
        goal_rewards=goal_rewards,
        lambda_=env_lambda_,
        discount=0.97)

    w = env.r  # presumably the environment's reward vector - TODO confirm
    # One representation per room, all learned with this agent lambda.
    LRs = [four_rooms_agents[r][agent_lambda_]['LR'] for r in rooms]

    # NOTE(review): every other GPIAgent call in this notebook passes the
    # current observation as the third positional argument; this call used to
    # omit it, which shifts `w` and `LRs` into the wrong slots. Confirm against
    # the GPIAgent signature. The literal 5 is presumably the number of
    # actions - TODO confirm.
    gpi_agent = GPIAgent(
        env._layout.size, 5, env.get_obs(), w, LRs)

    gpi_results = run_experiment_episodic(
        env, gpi_agent, n_eps, display_eps=10, respect_done=True, max_ep_len=40
    )

    returns = gpi_results['return hist']
    mean_gpi = np.mean(returns)
    ste_gpi = np.std(returns) / np.sqrt(n_eps)  # standard error over n_eps episodes
    results[agent_lambda_] = {'mean': mean_gpi, 'std': ste_gpi}


In [None]:
# Search over fixed start states for the one where the agent whose lambda
# matches the environment (0.5) most outperforms the average of the 0.0 and
# 1.0 agents, and keep the trajectories from that start state.
goals = [80, 24, 41]
goal_rewards = [5, 10, 5]
env_lambda_ = 0.5
agent_lambdas = [0.0, 0.5, 1.0]
rooms = [0, 1, 2, 3]
eps_per_start = 1  # episodes actually run per (start state, agent lambda) pair

# Build a throwaway environment to enumerate the candidate start states, so
# this cell does not depend on an `env` left over from another cell.
probe_env = FourRooms(
    goals=goals,
    start_state=-1,
    goal_rewards=goal_rewards,
    lambda_=env_lambda_,
    discount=0.97)
candidate_starts = [
    s for s in probe_env._possible_reward_states if s not in goals]

max_diff = -np.inf
# Initialised so later cells see None (not a NameError) if nothing is selected.
best_start = None
best_traj_results = None
for start_state in candidate_starts:
    # NOTE: `results` is rebound for every start state, so after this cell it
    # holds the values for the last start state only (and no longer the
    # random-start results computed in the earlier cell).
    results = {}
    traj_results = {}
    for agent_lambda_ in agent_lambdas:
        env = FourRooms(
            goals=goals,
            start_state=start_state,  # fixed start state under evaluation
            goal_rewards=goal_rewards,
            lambda_=env_lambda_,
            discount=0.97)

        w = env.r  # presumably the environment's reward vector - TODO confirm
        # pre-programmed allowed goals are: 80 (r0), 24 (r1), 41 (r2), 95 (r3)

        LRs = [four_rooms_agents[r][agent_lambda_]['LR'] for r in rooms]
        gpi_agent = GPIAgent(
            env._layout.size, 5, env.get_obs(), w, LRs)

        gpi_results = run_experiment_episodic(
            env, gpi_agent, eps_per_start,
            display_eps=1, respect_done=True, max_ep_len=40
        )

        returns = gpi_results['return hist']
        mean_gpi = np.mean(returns)
        # Normalise by the number of episodes actually run, not by an
        # unrelated episode count.
        ste_gpi = np.std(returns) / np.sqrt(eps_per_start)
        results[agent_lambda_] = {'mean': mean_gpi, 'std': ste_gpi}
        traj_results[agent_lambda_] = gpi_results['trajectory']

    # Advantage of the lambda=0.5 agent over the mean of the two extremes.
    diff = results[0.5]['mean'] - np.mean(
        [results[0.0]['mean'], results[1.0]['mean']])
    if diff > max_diff:
        max_diff = diff
        best_start = start_state
        best_traj_results = traj_results

In [None]:
# Roll out a single episode from a fixed start state (69) for each agent
# lambda and keep the episode data returned by the runner.
goals = [80, 24, 41]
goal_rewards = [5, 10, 5]
env_lambda_ = 0.5
agent_lambdas = [0.0, 0.5, 1.0]
rooms = [0, 1, 2, 3]
n_rollouts = 1  # episodes actually run per agent lambda

# Maps agent lambda -> gpi_results['episode_data'] for that agent's rollout.
episode_datas = {}
for agent_lambda_ in agent_lambdas:
    env = FourRooms(
        goals=goals,
        start_state=69,  # fixed start state
        goal_rewards=goal_rewards,
        lambda_=env_lambda_,
        discount=0.97)

    w = env.r  # presumably the environment's reward vector - TODO confirm

    LRs = [four_rooms_agents[r][agent_lambda_]['LR'] for r in rooms]
    gpi_agent = GPIAgent(
        env._layout.size, 5, env.get_obs(), w, LRs)

    gpi_results = run_experiment_episodic(
        env, gpi_agent, n_rollouts,
        display_eps=1, respect_done=True, max_ep_len=40
    )

    # Summary statistics are kept for interactive inspection; they are not
    # stored in `episode_datas`. The standard error is normalised by the
    # number of episodes actually run.
    returns = gpi_results['return hist']
    mean_gpi = np.mean(returns)
    ste_gpi = np.std(returns) / np.sqrt(n_rollouts)
    episode_datas[agent_lambda_] = gpi_results['episode_data']
