# Evaluating trained agents

This notebook is used to visualize and analyze various trained agents on the RiskyPath environment. The analysis mainly consists of observing each agent's behaviour in the environment it was trained on, as well as in modified versions of that environment (distributional shift analysis).

To observe deterministic agent behaviour, slipping/collision probabilities might be set to zero. In this case, the agent might not be tested on the exact same environment configuration it was trained on. In every case, this will be marked in the corresponding cell

In [1]:
import json
import time

import gym
import gym_minigrid
from gym_minigrid.minigrid import Goal, Floor, Lava, Wall, SpikyTile
from gym_minigrid.envs import RiskyPathEnv
from gym_minigrid.wrappers import RGBImgObsWrapper, ImgObsWrapper, TensorObsWrapper
from special_wrappers import RandomizeGoalWrapper

from experiment_config import GridworldExperiment
import torch as th
import stable_baselines3
from stable_baselines3.dqn import DQN
from stable_baselines3.a2c import A2C
from stable_baselines3.common.utils import obs_as_tensor

import numpy as np

In [2]:
# NOTE Save the prefix for the logs & models folder here for compatibility across different systems
model_path_prefix = "/Users/tilioschulze/Library/CloudStorage/OneDrive-Personal/Studium/Bachelorarbeit/experiment_models/saved_models/"

## Utilities

Definition of functions to use for quick analysis

In [32]:
sinfo = "\33[32mINFO:\33[0m"

def model_env_from_path(agent_path: str, no_slip: bool = True, no_rebound: bool = True):
    # Extract model from path (a2c or dqn?)
    if "/dqn/" in agent_path:
        model_class = DQN
    elif "/a2c/" in agent_path:
        model_class = A2C

    model = model_class.load(agent_path)

    # Create environment given information in function input
    path_keys = agent_path.split("saved_models/")[1].split("/")
    env_name = path_keys[0]
    observation_type = path_keys[1]

    render_size = 8
    rgb = False
    if "pixel_obs_" in agent_path:
        render_size = int(path_keys[1].split("_")[-1])
        rgb = True

    env_info = ""
    with open('env_config.json', 'r') as f:
        env_kwargs = json.load(f)[env_name]
    if 'goal_rnd' in env_kwargs:
        env_kwargs.pop('goal_rnd')
    if no_slip and env_kwargs['slip_proba'] != 0:
        env_kwargs.pop('slip_proba')
        env_info += "slipping probability removed; "
    if no_rebound and env_kwargs['wall_rebound']:
        env_kwargs.pop('wall_rebound')
        env_info += "wall rebound deactivated"
    if len(env_info) != 0:
        print("\33[32mINFO:\33[0m", env_info)

    env = gym.make(
        "MiniGrid-RiskyPath-v0",
        **env_kwargs
    )

    return model, env, rgb, render_size

def test_agent_on_environment(
    agent_path: str,
    num_episodes: int = 1,
    render_time: float = 0.2,
    custom_environment: gym.Env = None,
    predict_deterministic: bool = True,
    accelerate_viz: bool = True
):
    """Render agent interaction with the environment in an interactive matplotlib window. Useful to make videos of agent behaviour or analyzing trajectories. Slipping and wall rebound is turned off in order to observe the agent's intended behaviour. When passing a custom environment, no checking for stationary state distribution and deterministic transitions is performed.

    Args:
        agent_path (str): Model location. Folder path must conform to experiment structure
        num_episodes (int, optional): number of episodes to render
        render_time (float, optional): render time for one time step in seconds
        custom_environment (gym.Env, optional): None by default
        predict_deterministic (bool, optional): Make deterministic mode predictions
        accelerate_viz (bool, optional): Will accelerate rendering when agent takes too long to solve environment
    """
    model, env, rgb_on, render_size = model_env_from_path(agent_path)

    if custom_environment is not None:
        env = custom_environment
    
    if rgb_on:
        env = RGBImgObsWrapper(env, tile_size=render_size)
        env = ImgObsWrapper(env)
    else:
        env = TensorObsWrapper(env)
    
    # Execute episodes and render agent
        # TODO print reward, action [number, (himmelsrichtung)] etc.
    for i in range(num_episodes):

        print(f"Starting episode {i+1}")
        total_reward = 0
        needed_timesteps = 0

        obs = env.reset()
        done = False
        env.render(tile_size=render_size)
        time.sleep(render_time)

        while not done:
            action, _ = model.predict(obs, deterministic=predict_deterministic)
            obs, reward, done, info = env.step(action)
            env.render(tile_size=render_size)
            total_reward += reward
            needed_timesteps += 1
            if needed_timesteps > 25:
                render_time = 0.05
            time.sleep(render_time)
        
        print(f"Episode ended after {needed_timesteps} time steps.")
        out = f"Total reward: {total_reward}"
        print(out)
        print("-"*len(out))
    
    %matplotlib

def make_env(**kwargs):
    """Create a ``MiniGrid-RiskyPath-v0`` environment.

    All keyword arguments are forwarded unchanged to ``gym.make``.
    """
    return gym.make("MiniGrid-RiskyPath-v0", **kwargs)

def compute_q_values(model_policy, obs):
    """Compute q-values from a DQN model given a certain observation.

    Args:
        model_policy: The DQN model's policy. Must expose ``observation_space``,
            ``device`` and ``q_net``.
        obs: The environmental observation for which q-values should be
            computed. It is reshaped to a batch of observations, so a single
            observation yields a batch of size one.

    Returns:
        The output of ``model_policy.q_net`` for the batched observation,
        computed without gradient tracking.
    """
    # Code adapted from this stackoverflow post
    # https://stackoverflow.com/questions/73239501/how-to-get-the-q-values-in-dqn-in-stable-baseline-3/73242315#73242315?newreg=d2762c51b8bc44778cde16b43499a6d5
    observation = obs.reshape((-1,) + model_policy.observation_space.shape)
    # Use the policy's own device instead of a hard-coded "cpu" so that models
    # loaded on another device do not fail with a device mismatch
    observation = obs_as_tensor(observation, model_policy.device)
    with th.no_grad():
        q_values = model_policy.q_net(observation)
    return q_values

def visualize_policy(
    path: str,
    custom_env = None
):
    """Visualize the model policy on the given environment specification.

    The environmental state distribution is assumed to be stationary. Goal
    randomization is explicitly not applied. A custom environment can be
    passed and the policy will be applied on it; this method does not check
    whether its state distribution is stationary.

    For every walkable cell (empty, ``Floor`` or ``SpikyTile``) the agent is
    placed on that cell and the model's deterministic action is drawn as one
    of ``<``, ``^``, ``>``, ``v``. Walls, lava and goal tiles are drawn as
    colored ``#``, ``~`` and ``x``; any other tile type is drawn as ``?``.
    The agent position is restored afterwards so that a passed custom
    environment is not left in a modified state.

    Args:
        path (str): The saved models location
        custom_env (RiskyPathEnv): A custom environment

    Returns:
        str: A colored string representation to print to the console.
    """
    model, env, rgb_on, render_size = model_env_from_path(path)
    if custom_env is not None:
        env = custom_env

    # No wrapping is needed
    env.reset()

    grid = env.grid

    def ansi_color(code, text):
        return f"\33[{code}m{text}\33[0m"

    dir_mapping = {0 : "<", 1 : "^", 2 : ">", 3 : "v"}
    # Checked in this order, after the walkable tile types
    static_symbols = (
        (Wall, ansi_color(36, "#")),
        (Lava, ansi_color(41, "~")),
        (Goal, ansi_color(42, "x")),
    )

    # Header row with the column indices
    pieces = [f" {i}  " for i in range(grid.width)]
    if grid.width > 0:
        pieces.append("\n")

    # NOTE setting a variable only works on unwrapped env as gym automatically wraps the environment with orderenforcing wrapper and wrappers do not implement a __setattr__ method but a __getattr__
    base_env = env.unwrapped
    original_pos = base_env.agent_pos
    try:
        for y in range(grid.height):
            for x in range(grid.width):
                tile = grid.get(x, y)

                if tile is None or isinstance(tile, (Floor, SpikyTile)):
                    # get model action and map to <, >, ^, v strings
                    base_env.agent_pos = (x, y)
                    if rgb_on:
                        obs = env.render(
                            mode="rgb_array",
                            highlight=False,
                            tile_size=render_size
                        )
                    else:
                        obs = env.tensor_obs()

                    action = int(model.predict(obs, deterministic=True)[0])
                    symbol = dir_mapping[action]
                else:
                    # Unknown tile types still get a cell so rows stay aligned
                    symbol = "?"
                    for tile_type, tile_symbol in static_symbols:
                        if isinstance(tile, tile_type):
                            symbol = tile_symbol
                            break

                pieces.append(f"[{symbol}] ")

                if x == grid.width - 1:
                    # Row index at the end of each row
                    pieces.append(f" {y} \n")
    finally:
        base_env.agent_pos = original_pos

    return "".join(pieces)

def load_model_params(
    path: str
):
    """Return model policy and additional information

    The algorithm is inferred from a ``/dqn/`` or ``/a2c/`` path component.

    Args:
        path (str): The path to the saved model

    Returns:
        tuple: policy, policy_class, policy_kwargs

    Raises:
        ValueError: If the path contains neither ``/dqn/`` nor ``/a2c/``.
    """
    # Extract model from path (a2c or dqn?)
    if "/dqn/" in path:
        model_class = DQN
    elif "/a2c/" in path:
        model_class = A2C
    else:
        # Previously this fell through and failed on an unbound local
        raise ValueError(
            f"Cannot infer the algorithm from {path!r}: "
            "expected a '/dqn/' or '/a2c/' path component."
        )

    model = model_class.load(path)
    return model.policy, model.policy_class, model.policy_kwargs

def dqn_params(path):
    """Load a saved DQN model and return the result of ``get_parameters()``.

    Args:
        path (str): The path to the saved model. Must contain ``/dqn/``.

    Raises:
        ValueError: If the path does not contain ``/dqn/``.
    """
    if "/dqn/" not in path:
        # Previously this failed on an unbound local
        raise ValueError(
            f"{path!r} is not a DQN model path: expected a '/dqn/' path component."
        )
    model = DQN.load(path)
    return model.get_parameters()

def randomized_goal_stats(path: str, episodes: int = 50):
    """Test the agent statistically on the training environment specification but with randomized goal tile placement.

    The environment is built via ``model_env_from_path`` with its default
    arguments, i.e. slipping and wall rebound are removed from the
    configuration in order to truly analyze the agent's capabilities. The
    success rate (in percent) and the mean episode length are printed.

    Args:
        path (str): Location of the saved model. Must conform to predefined folder structure.
        episodes (int, optional): Number random-goal episodes. Must be at least 1.

    Raises:
        ValueError: If ``episodes`` is smaller than 1.
    """
    if episodes < 1:
        raise ValueError(f"episodes must be at least 1, got {episodes}")

    model, env, rgb_on, render_size = model_env_from_path(path)
    count_successes = 0
    if rgb_on:
        env = RGBImgObsWrapper(env, tile_size=render_size)
        env = ImgObsWrapper(env)
    else:
        env = TensorObsWrapper(env)

    env = RandomizeGoalWrapper(env, randomization=1)

    episode_lengths = []
    for _ in range(episodes):
        obs = env.reset()
        done = False
        step = 0
        while not done:
            action, _ = model.predict(obs, deterministic=True)
            obs, _, done, info = env.step(action)
            step += 1
        if info['is_success']:
            count_successes += 1
        episode_lengths.append(step)

    # Scale to percent: the value is printed with a "%" sign
    success_rate = round(100 * count_successes / episodes, 1)
    print("Goal randomization success rate:\33[35m", success_rate, f"\33[0m% on \33[35m{episodes}\33[0m random-goal episodes")
    print("Goal randomization mean episode length:\33[35m", np.mean(episode_lengths), "\33[0m")

def execute_episode(path: str):
    """Run one deterministic-prediction test episode and print a summary.

    The model and environment come from ``model_env_from_path`` with its
    default arguments, so slipping and wall rebound are deactivated. The
    summary (success flag, cumulative reward, number of steps) goes to stdout.

    Args:
        path (str): Location of the saved model. Must conform to specified structure (see experiment_config.py)
    """
    model, env, use_pixels, tile_size = model_env_from_path(path)
    if not use_pixels:
        env = TensorObsWrapper(env)
    else:
        env = ImgObsWrapper(RGBImgObsWrapper(env, tile_size=tile_size))

    observation = env.reset()
    steps_taken, episode_return = 0, 0
    while True:
        chosen_action, _ = model.predict(observation, deterministic=True)
        observation, step_reward, finished, info = env.step(chosen_action)
        episode_return += step_reward
        steps_taken += 1
        if finished:
            break

    print(f"Episode summary -> success: \33[35m{info['is_success']}\33[0m, cumulative reward: \33[35m{episode_return}\33[0m, number of steps: \33[35m{steps_taken}\33[0m")

In [26]:
# defining standard test suite
def test_suite_model(path: str):
    """Run the standard test suite for one saved model and print all results.

    The suite consists of one episode on the training environment
    (``execute_episode``), the policy visualization (``visualize_policy``)
    and the randomized-goal statistics (``randomized_goal_stats``).

    Args:
        path (str): Location of the saved model. If it starts with
            ``model_path_prefix`` the prefix is omitted from the printed path;
            otherwise the full path is printed.
    """
    print(sinfo, "Beginning execution of test suite.")
    # Only strip the prefix when it is actually there; blind slicing would
    # cut off an arbitrary part of paths stored elsewhere
    if path.startswith(model_path_prefix):
        shown_path = path[len(model_path_prefix):]
    else:
        shown_path = path
    print(f"Path: \33[3m{shown_path}\33[0m")

    # test agent on deterministic (!) environment
    print("\n\33[4mAgent success on \33[1mdeterministic\33[0;4m training environment:\33[0m")
    execute_episode(path)

    # visualized policy on original environment
    print("\n\33[4mPolicy visualization on training environment:\33[0m")
    print(visualize_policy(path))

    # summary statistics on randomized goal locations
    print("\33[4mTesting Goal generalization capabilities:\33[0m")
    randomized_goal_stats(path)

    print(sinfo, "Test suite execution ended.")

In [5]:
# Lava layout with the original goal position (1, 3) left free:
# column x=1 for y in 1..9, column x=3 for y in 8..4, plus (6, 6) and (6, 5)
base_lava_positions = (
    [(1, row) for row in range(1, 10) if row != 3]
    + [(3, row) for row in range(8, 3, -1)]
    + [(6, 6), (6, 5)]
)


def upper_right_goal_env():
    """Environment with the goal at (9, 1); the lava layout is not overridden."""
    return make_env(goal_positions=[(9, 1)])


def alt_upper_right_goal_env():
    """Environment with the goal at (9, 1) and ``base_lava_positions`` as lava."""
    return make_env(goal_positions=[(9, 1)], lava_positions=base_lava_positions)


In [6]:
# ignore "memory not enough" warnings concerning replay buffer
import warnings
warnings.filterwarnings('ignore', module="stable_baselines3.common.buffers")

In [7]:
%matplotlib
# Force matplotlib to render outside of notebook (Don't use 'inline' backend)

Using matplotlib backend: <object object at 0x14eca0e50>


## exp_001

_Environment configuration:_
```json
    "exp_001" : {
        "max_steps" : 150,
        "slip_proba" : 0,
        "wall_rebound" : false,
        "spiky_active" : false,
        "reward_spec" : {
            "step_penalty" : 0,
            "goal_reward" : 1,
            "absorbing_states" : false,
            "absorbing_reward_goal" : 0,
            "absorbing_reward_lava" : -1,
            "risky_tile_reward" : 0,
            "lava_reward" : -1
        }
```

First, let's load one of the successful trained agents on the `stable-baselines3` DQN defaults. It was trained on tensor observations.

In [8]:
exp_001_path = model_path_prefix + "exp_001/tensor_obs/dqn/algo_default/seed_763.zip"

In [10]:
test_agent_on_environment(exp_001_path)

[32mINFO:[0m 
Starting episode 1
Episode ended after 7 time steps.
Total reward: 1
---------------
Using matplotlib backend: MacOSX


The agent walks to the goal tile. Let's visualize its policy on this version of the environment:

In [31]:
test_suite_model(exp_001_path)

[32mINFO:[0m Beginning execution of test suite.
Path: [3mexp_001/tensor_obs/dqn/algo_default/seed_763.zip[0m

[4mAgent success on [1mdeterministic[0;4m training environment:[0m
Episode summary -> success: [35mTrue[0m, cumulative reward: [35m1[0m, number of steps: [35m7[0m

[4mPolicy visualization on training environment:[0m
 0   1   2   3   4   5   6   7   8   9   10  
[[36m#[0m] [[36m#[0m] [[36m#[0m] [[36m#[0m] [[36m#[0m] [[36m#[0m] [[36m#[0m] [[36m#[0m] [[36m#[0m] [[36m#[0m] [[36m#[0m]  0 
[[36m#[0m] [[41m~[0m] [v] [<] [<] [<] [v] [<] [<] [<] [[36m#[0m]  1 
[[36m#[0m] [[41m~[0m] [v] [<] [<] [v] [v] [v] [<] [<] [[36m#[0m]  2 
[[36m#[0m] [[42mx[0m] [<] [<] [<] [<] [<] [<] [<] [<] [[36m#[0m]  3 
[[36m#[0m] [[41m~[0m] [^] [[41m~[0m] [^] [<] [<] [^] [^] [<] [[36m#[0m]  4 
[[36m#[0m] [[41m~[0m] [^] [[41m~[0m] [^] [<] [[41m~[0m] [^] [<] [>] [[36m#[0m]  5 
[[36m#[0m] [[41m~[0m] [^] [[41m~[0m] [^] [<] [[41m~[0m]

**Explanation of output:**

- `[~]` is lava
- `[#]` is a wall
- `[x]` is the goal tile
- `<,^,>,v` are the directions the agent would take from that cell

We see that the agent would walk to the goal tile from almost every position, with some exceptions: (9,5), (9,6). In no case would the agent walk into one of the lava tiles.
The agent has fulfilled its task of walking to the goal tile and it can even do that from most other grid positions.
However, let's test the agent on the same environment but move the goal position somewhere else.

In [9]:
test_agent_on_environment(
    exp_001_path,
    num_episodes=1,
    custom_environment=upper_right_goal_env()
)

Starting episode 1
Episode ended after 150 time steps.
Total reward: 0
---------------
Using matplotlib backend: MacOSX


In [9]:
print(visualize_policy(exp_001_path, upper_right_goal_env()))

 0   1   2   3   4   5   6   7   8   9   10  
[[36m#[0m] [[36m#[0m] [[36m#[0m] [[36m#[0m] [[36m#[0m] [[36m#[0m] [[36m#[0m] [[36m#[0m] [[36m#[0m] [[36m#[0m] [[36m#[0m]  0 
[[36m#[0m] [[41m~[0m] [v] [<] [<] [<] [<] [<] [>] [[42mx[0m] [[36m#[0m]  1 
[[36m#[0m] [[41m~[0m] [v] [<] [<] [<] [<] [<] [<] [<] [[36m#[0m]  2 
[[36m#[0m] [[41m~[0m] [>] [<] [<] [<] [<] [<] [<] [<] [[36m#[0m]  3 
[[36m#[0m] [[41m~[0m] [v] [[41m~[0m] [^] [<] [<] [<] [<] [>] [[36m#[0m]  4 
[[36m#[0m] [[41m~[0m] [v] [[41m~[0m] [^] [<] [[41m~[0m] [>] [>] [>] [[36m#[0m]  5 
[[36m#[0m] [[41m~[0m] [v] [[41m~[0m] [^] [<] [[41m~[0m] [>] [>] [>] [[36m#[0m]  6 
[[36m#[0m] [[41m~[0m] [v] [[41m~[0m] [>] [<] [>] [>] [>] [>] [[36m#[0m]  7 
[[36m#[0m] [[41m~[0m] [^] [[41m~[0m] [v] [<] [>] [>] [>] [>] [[36m#[0m]  8 
[[36m#[0m] [[41m~[0m] [>] [<] [<] [<] [<] [>] [>] [>] [[36m#[0m]  9 
[[36m#[0m] [[36m#[0m] [[36m#[0m] [[36m#[0m] [[36m#

The agent oscillates between the starting position (2,9) and the adjacent tile to the right. It seems confused about the changed environment. Two factors seem to come into play: the goal tile was moved to (9,1) and the old goal tile was replaced with a lava tile. The agent might have recognized that its previous strategy might no longer be safe. Given the policy visualization for this environment version, this is further evidenced by the fact that the agent still tries to avoid all lava tiles, even the new lava tile at (1,3).
Let's see how the agent behaviour changes when the original goal tile is turned to floor.

In [13]:
test_agent_on_environment(exp_001_path, custom_environment=alt_upper_right_goal_env())

Starting episode 1
Episode ended after 150 time steps.
Total reward: 0
---------------
Using matplotlib backend: MacOSX


In [10]:
print(visualize_policy(exp_001_path, custom_env=alt_upper_right_goal_env()))

 0   1   2   3   4   5   6   7   8   9   10  
[[36m#[0m] [[36m#[0m] [[36m#[0m] [[36m#[0m] [[36m#[0m] [[36m#[0m] [[36m#[0m] [[36m#[0m] [[36m#[0m] [[36m#[0m] [[36m#[0m]  0 
[[36m#[0m] [[41m~[0m] [v] [<] [<] [<] [<] [<] [<] [[42mx[0m] [[36m#[0m]  1 
[[36m#[0m] [[41m~[0m] [v] [<] [<] [<] [<] [<] [<] [<] [[36m#[0m]  2 
[[36m#[0m] [v] [<] [<] [<] [<] [<] [<] [<] [<] [[36m#[0m]  3 
[[36m#[0m] [[41m~[0m] [^] [[41m~[0m] [^] [<] [<] [<] [<] [<] [[36m#[0m]  4 
[[36m#[0m] [[41m~[0m] [^] [[41m~[0m] [^] [<] [[41m~[0m] [>] [<] [>] [[36m#[0m]  5 
[[36m#[0m] [[41m~[0m] [^] [[41m~[0m] [^] [<] [[41m~[0m] [>] [>] [>] [[36m#[0m]  6 
[[36m#[0m] [[41m~[0m] [^] [[41m~[0m] [^] [<] [>] [<] [<] [>] [[36m#[0m]  7 
[[36m#[0m] [[41m~[0m] [^] [[41m~[0m] [v] [<] [<] [<] [>] [>] [[36m#[0m]  8 
[[36m#[0m] [[41m~[0m] [>] [<] [<] [<] [<] [<] [>] [>] [[36m#[0m]  9 
[[36m#[0m] [[36m#[0m] [[36m#[0m] [[36m#[0m] [[36m#[0m] [[3

We see that the agent still oscillates at the starting position and its right neighbour, however, its policy would lead the agent to walk to the original goal position when placed in most other cells (and then terminating the episode by walking in the lava tile below). Note that no policy-induced trajectory would end up in the actual goal tile given this environment.




Next, I'd like to see if the agent is able to avoid _newly placed_ lava tiles 

In [None]:
# TODO: unfinished cell -- build an environment with newly placed lava tiles
# and pass it as ``custom_env``. The call was left with an empty
# ``custom_env=`` argument, which is a SyntaxError and breaks "Run all".
# print(visualize_policy(exp_001_path, custom_env=...))

**Summary:**

- During training, the agent learned to walk to the goal tile successfully
- However, the agent is not fully able to 

In [35]:
for pos in [(7,6), (3,9), (9,7), (4,8)]:
    test_agent_on_environment(
        exp_001_path,
        num_episodes=1,
        custom_environment=make_env(
            agent_start_pos=pos
        ),
    )

Starting episode 1
1
1
1
0
0
0
0
0
0
Episode ended after 9 time steps.
Total reward: 1
---------------
Using matplotlib backend: MacOSX
Starting episode 1
0
1
1
1
1
1
1
0
Episode ended after 8 time steps.
Total reward: 1
---------------
Using matplotlib backend: MacOSX
Starting episode 1
0
0
0
0
0
1
1
1
1
0
0
0
Episode ended after 12 time steps.
Total reward: 1
---------------
Using matplotlib backend: MacOSX
Starting episode 1
3
0
0
1
1
1
1
1
1
0
Episode ended after 10 time steps.
Total reward: 1
---------------
Using matplotlib backend: MacOSX


The agent successfully navigates the environment when beginning at another position. It quickly finds the goal tile and mostly doesn't take any detours.
Interestingly, when the agent is placed on position (4,8) it first goes down and to the left instead of taking the quicker path upwards. Considering that the reward model of `exp_001` does not incentivize the agent to find the shortest path (no time penalty), this is not especially surprising. Still, this leads to the hypothesis that the agent found that going downwards from this position would lead to more reward than going up. (Or maybe due to the update rule in Q-Learning? --> # TODO investigate this)

What happens when lava tiles are placed in the agent's way?

In [None]:
# Lava layout: column x=1 for y in 1..9, column x=3 for y in 8..4,
# (6, 6) and (6, 5), plus one additional tile at (2, 8)
alt_lava_positions = (
    [(1, row) for row in range(1, 10)]
    + [(3, row) for row in range(8, 3, -1)]
    + [(6, 6), (6, 5), (2, 8)]
)

test_agent_on_environment(
    exp_001_path,
    num_episodes=1,
    custom_environment=make_env(lava_positions=alt_lava_positions),
)

# Second run: one more lava tile at (3, 3) and a different start position
alt_lava_positions.append((3, 3))
test_agent_on_environment(
    exp_001_path,
    num_episodes=1,
    custom_environment=make_env(
        lava_positions=alt_lava_positions,
        agent_start_pos=(4, 7),
    ),
)


It seems that the agent is not able to recognize lava tiles. It always goes straight to the goal location and only evades the lava tile positions that it already knows from training. It seems the agent has not learned the causation between lava and negative reward but instead learned the correlation between the positions (of lava tiles) in the gridworld and the negative reward. This would also explain why changing the starting position does not confuse the agent in searching the goal tile (when it is at the original position). During training, only one part of the observation tensor is constantly changing, namely the agent's position.
**INTERESTING:** Train an agent on a self-shifting environment (e.g. change goal position every five episodes and change lava tile positions every 5 episodes)

This leads to the following question: 
Does the agent find the goal when it is placed somewhere else?

In [None]:
test_agent_on_environment(
    exp_001_path,
    num_episodes=1,
    custom_environment=make_env(
        max_steps=25, # Changed limit because agent is caught in a loop
        goal_positions=[(2,2)]
    )
)

It is clearly visible that the agent is not able to understand how to get to the goal tile when it is placed somewhere else. It gets caught in a loop and oscillates from left to right (maybe an analysis would be interesting in which the q-net's output is shown for each possible state).
The hypothesis is thus strengthened that the agent only effectively learns cause and effect when the environment dynamics change (e.g. lava and goal placement). Currently, the agent is only able 

Let's load an A2C model that was very successful during training. The next observation is an interesting one:

In [None]:
# Lava layout without (1, 3): column x=1 for y in 1..9 and column x=3 for y in 8..4
orig_lava = (
    [(1, row) for row in range(1, 10) if row != 3]
    + [(3, row) for row in range(8, 3, -1)]
)

a2c_low_entropy_model = model_path_prefix + "exp_001/tensor_obs/a2c/a2c_entropy_6/seed_4267.zip"

# Goal moved to (3, 3), episode limited to 25 steps
test_agent_on_environment(
    a2c_low_entropy_model,
    num_episodes=1,
    render_time=0.4,
    custom_environment=make_env(
        max_steps=25,
        goal_positions=[(3, 3)],
        lava_positions=orig_lava,
    ),
)



**Distributional Shift, Goal Misgeneralization:**

The agent does not understand that the important tile is the goal tile. It still navigates to the position in which it received positive rewards during training. Once the state distribution shifts, the agent is not able to apply the learned skills to a simple alteration of the environment. This shows that the state representation during training is not truly sufficient if we want the agent to be able to generalize knowledge.

- [ ] Train agent on a self-shifting environment with different goal positions. Use algorithmic settings/parametrizations that worked best during training



Test on self-shifting environment:

In [None]:
test_agent_on_environment(
    a2c_low_entropy_model,
    num_episodes=5,
    render_time=0.4,
    custom_environment=RandomizeGoalWrapper(make_env(), randomization=0.5)
)

The agent only reaches the goal tile when it is placed at the training location! The agent has not learned to walk onto the goal tile; it has learned to walk to the location where it received rewards during training. During training, this is a **perfect proxy of the intended goal**, which is to walk onto the goal tile!

**Goal Randomization Agent:**

In [None]:
pathlala = "/Users/tilioschulze/Desktop/Bachelorarbeit/Code/Experiments/saved_models/exp_001_goal_rnd_2/tensor_obs/dqn/dqn_low_eps/seed_4744_best_model/best_model.zip"

In [None]:
test_agent_on_environment(pathlala, num_episodes=5)

The agent solves the environment perfectly. What will happen when distributional shift is applied?

In [None]:
test_agent_on_environment(
    pathlala,
    num_episodes=6,
    custom_environment=RandomizeGoalWrapper(make_env(), randomization=1)
)

In [None]:
p, _, _ = load_model_params(exp_001_path)
p

In [None]:
# TODO create observation here
env = gym.make("MiniGrid-RiskyPath-v0")
env = TensorObsWrapper(env)
obs = env.reset()

from stable_baselines3.common.utils import obs_as_tensor
import torch as th

observation = obs.reshape((-1,) + p.observation_space.shape)
observation = obs_as_tensor(observation, "cpu")
with th.no_grad():
    q_values = p.q_net(observation)
print(q_values)

obs, _, _, _ = env.step(2)

observation = obs.reshape((-1,) + p.observation_space.shape)
observation = obs_as_tensor(observation, "cpu")
with th.no_grad():
    q_values = p.q_net(observation)
print(q_values)

If we look at the q-values, we can see that from the starting point, the agent prefers going right. After landing on this tile, the maximal q-value corresponds to moving to the left tile, which catches the agent in a loop!

### A2C low entropy model on exp_001

In [None]:
npath = model_path_prefix + "exp_001/pixel_obs_8/a2c/a2c_entropy_6/seed_3377.zip"

In [None]:
test_agent_on_environment(
    npath
)

This A2C agent, although successful during training, is also not able to generalize when confronted with new lava tile positions.

### A2C entropy_6 on exp_hard_001 (pixel_obs_8)

In [None]:
test_agent_on_environment(
    model_path_prefix + "exp_hard_001/pixel_obs_8/a2c/a2c_entropy_6/seed_4267.zip"
)

Success! The agent learns to walk around the spiky tiles and lava to maximize rewards.
However, this is only one seed out of 5. The other 4 seeds failed to find such a solution.

In [None]:
# Lava layout: column x=1 for y in 1..9, column x=3 for y in 8..4,
# plus (6, 6) and (6, 5)
alt_lava_positions = []
for y in range(1, 11 - 1):
    alt_lava_positions.append((1, y))
for y in range(11 - 3, 11 - 8, -1):
    alt_lava_positions.append((3, y))
alt_lava_positions.extend([(6, 11 - 5), (6, 11 - 6)])

# NOTE(review): spiky_positions is assigned but never passed to make_env
# below -- confirm whether an explicit spiky-tile argument was intended
spiky_positions = []

# Run the exp_hard_001 agent on an environment that only overrides the lava layout
test_agent_on_environment(
    model_path_prefix + "exp_hard_001/pixel_obs_8/a2c/a2c_entropy_6/seed_4267.zip",
    custom_environment=make_env(
        lava_positions=alt_lava_positions
    )
)

Interesting --> Removing spiky tiles confuses the agent at the last step. Maybe it expects to walk on a spiky tile when finishing the detour, but does not recognize the situation.
What happens when moving the goal tile?

In [None]:
test_agent_on_environment(
    model_path_prefix + "exp_hard_001/pixel_obs_8/a2c/a2c_entropy_6/seed_4744.zip"
)

## `time_penalty`

### A2C on `time_penalty`

In [None]:
pen_a2c = model_path_prefix + "time_penalty/tensor_obs/a2c/algo_default/seed_763.zip"

In [None]:
test_agent_on_environment(
    pen_a2c
)

## `exp_hard_001`

In [None]:
hard_a2c = model_path_prefix + "exp_hard_001/tensor_obs/a2c/algo_default/seed_763.zip"

In [None]:
test_agent_on_environment(hard_a2c)

## Slipping Experiments

### `exp_slip_1`

_Environment Configuration:_

```json
    "exp_slip_1" : {
        "max_steps" : 150,
        "slip_proba" : 0.05,
        "wall_rebound" : false,
        "spiky_active" : false,
        "reward_spec" : {
            "step_penalty" : 0,
            "goal_reward" : 1,
            "absorbing_states" : false,
            "absorbing_reward_goal" : 0,
            "absorbing_reward_lava" : -1,
            "risky_tile_reward" : 0,
            "lava_reward" : -1
        }
```

### `exp_slip_2`

_Environment Configuration:_

```json
    "exp_slip_2" : {
        "max_steps" : 150,
        "slip_proba" : 0.1,
        "wall_rebound" : false,
        "spiky_active" : false,
        "reward_spec" : {
            "step_penalty" : 0,
            "goal_reward" : 1,
            "absorbing_states" : false,
            "absorbing_reward_goal" : 0,
            "absorbing_reward_lava" : -1,
            "risky_tile_reward" : 0,
            "lava_reward" : -1
        }
```

### `exp_slip_3`

_Environment Configuration:_

```json
    "exp_slip_3" : {
        "max_steps" : 150,
        "slip_proba" : 0.15,
        "wall_rebound" : false,
        "spiky_active" : false,
        "reward_spec" : {
            "step_penalty" : 0,
            "goal_reward" : 1,
            "absorbing_states" : false,
            "absorbing_reward_goal" : 0,
            "absorbing_reward_lava" : -1,
            "risky_tile_reward" : 0,
            "lava_reward" : -1
        }
```

In [None]:
# TODO put this in time_penalty
# TODO test how the agent reacts to changed environment (changed goal, changed lava etc.)

tp_e = model_path_prefix + "time_penalty/tensor_obs/dqn/algo_default/seed_763.zip"
test_agent_on_environment(tp_e)

In [None]:
a,b,c = load_model_params(tp_e)

In [None]:
a

## exp_hard_001

_Environment Configuration:_

```json
```