# Evaluating trained agents

This notebook is used to visualize and analyze various trained agents on the RiskyPath environment. The analysis focuses on observing each agent's behaviour in the environment it was trained on, and also in modified versions of that environment (distributional shift analysis).

In [14]:
import json
import os
import time

import gym
import gym_minigrid
from gym_minigrid.envs import RiskyPathEnv
from gym_minigrid.wrappers import RGBImgObsWrapper, ImgObsWrapper, TensorObsWrapper

from experiment_config import GridworldExperiment

import torch as th

import stable_baselines3
from stable_baselines3.dqn import DQN
from stable_baselines3.a2c import A2C
from stable_baselines3.common.utils import obs_as_tensor

## Utilities

Definition of functions to use for quick analysis

In [15]:
def test_agent_on_environment(
    agent_path: str,
    num_episodes: int = 1,
    render_time: float = 0.2,
    custom_environment: gym.Env = None,
    predict_deterministic: bool = True,
    accelerate_viz: bool = True
):
    # Extract model from path (a2c or dqn?)
    if "/dqn/" in agent_path:
        model_class = DQN
    elif "/a2c/" in agent_path:
        model_class = A2C

    model = model_class.load(agent_path)
    
    # Create environment given information in function input
    path_keys = agent_path.split("saved_models/")[1].split("/")
    env_name = path_keys[0]
    observation_type = path_keys[1]

    render_size = 8
    rgb = False
    if "pixel_obs_" in agent_path:
        render_size = int(path_keys[2].split("_")[-1])
        rgb = True

    if custom_environment is None:
        with open('env_config.json', 'r') as f:
            env_kwargs = json.load(f)[env_name]
            
        env = gym.make(
            "MiniGrid-RiskyPath-v0",
            **env_kwargs
        )
    else:
        env = custom_environment
    
    if rgb:
        env = RGBImgObsWrapper(env, tile_size=render_size)
        env = ImgObsWrapper(env)
    else:
        env = TensorObsWrapper(env)
    
    # Execute episodes and render agent
        # TODO print reward, action [number, (himmelsrichtung)] etc.
    for i in range(num_episodes):

        print(f"Starting episode {i+1}")
        total_reward = 0
        needed_timesteps = 0

        obs = env.reset()
        done = False
        env.render(tile_size=render_size)
        time.sleep(render_time)

        while not done:
            action, _ = model.predict(obs, deterministic=predict_deterministic)
            obs, reward, done, info = env.step(action)
            env.render(tile_size=render_size)
            total_reward += reward
            needed_timesteps += 1
            if needed_timesteps > 25:
                render_time = 0.05
            time.sleep(render_time)
        
        print(f"Episode ended after {needed_timesteps} time steps.")
        out = f"Total reward: {total_reward}"
        print(out)
        print("-"*len(out))
    %matplotlib


def make_env(
    **kwargs
):
    """Build a ``MiniGrid-RiskyPath-v0`` environment.

    All keyword arguments are forwarded unchanged to ``gym.make``.
    """
    return gym.make("MiniGrid-RiskyPath-v0", **kwargs)

def load_model_params(
    path: str
):
    """Load a saved agent and return its policy-related attributes.

    Args:
        path: Path to the saved model archive. It must contain ``/dqn/`` or
            ``/a2c/`` so the matching algorithm class can be selected.

    Returns:
        Tuple ``(policy, policy_class, policy_kwargs)`` read from the loaded
        model.

    Raises:
        ValueError: If the algorithm cannot be inferred from ``path``.
    """
    # Extract model from path (a2c or dqn?)
    if "/dqn/" in path:
        model_class = DQN
    elif "/a2c/" in path:
        model_class = A2C
    else:
        # Without this branch an unknown path fails later with an
        # UnboundLocalError that does not mention the real cause.
        raise ValueError(
            f"Cannot infer algorithm from path {path!r}: "
            "expected it to contain '/dqn/' or '/a2c/'."
        )

    model = model_class.load(path)
    return model.policy, model.policy_class, model.policy_kwargs

In [16]:
# ignore "memory not enough" warnings concerning replay buffer
import warnings
warnings.filterwarnings('ignore', module="stable_baselines3.common.buffers")

In [17]:
# NOTE Save the prefix for the logs & models folder here for compatibility across different systems.
# The prefix can be overridden without editing the notebook by setting the
# RISKYPATH_MODEL_PATH_PREFIX environment variable; the literal below is only the fallback.
# The prefix must contain "saved_models/", because test_agent_on_environment splits the path on it.
model_path_prefix = os.environ.get(
    "RISKYPATH_MODEL_PATH_PREFIX",
    "/Users/tilioschulze/Library/CloudStorage/OneDrive-Personal/Studium/Bachelorarbeit/experiment_models/saved_models/",
)
# Model paths are built by plain string concatenation, so the prefix has to end with "/".
if not model_path_prefix.endswith("/"):
    model_path_prefix += "/"

In [18]:
%matplotlib
# Force matplotlib to render outside of the notebook (don't use the 'inline' backend).
# Without an argument the magic selects a GUI backend and reports which one it picked.
# NOTE(review): presumably required so that env.render() can open a live window - confirm.

Using matplotlib backend: MacOSX


## exp_001

In [19]:
# Saved agent for exp_001: DQN on tensor observations, seed 763 (as encoded in the path)
exp_001_path = model_path_prefix + "exp_001/tensor_obs/dqn/algo_default/seed_763.zip"

In [20]:

# Baseline: one episode in the environment configured for "exp_001" in env_config.json
test_agent_on_environment(exp_001_path)

Starting episode 1
Episode ended after 7 time steps.
Total reward: 1
---------------
Using matplotlib backend: MacOSX


The agent solves the environment as expected.
What would happen if the agent had to start from another position?

In [21]:
# Run one episode per alternative start position of the agent.
alternative_start_positions = [(7,6), (3,9), (9,7), (4,8)]
for pos in alternative_start_positions:
    shifted_start_env = make_env(agent_start_pos=pos)
    test_agent_on_environment(
        exp_001_path,
        num_episodes=1,
        custom_environment=shifted_start_env,
    )

Starting episode 1
Episode ended after 9 time steps.
Total reward: 1
---------------
Using matplotlib backend: MacOSX
Starting episode 1
Episode ended after 8 time steps.
Total reward: 1
---------------
Using matplotlib backend: MacOSX
Starting episode 1
Episode ended after 12 time steps.
Total reward: 1
---------------
Using matplotlib backend: MacOSX
Starting episode 1
Episode ended after 10 time steps.
Total reward: 1
---------------
Using matplotlib backend: MacOSX


The agent successfully navigates the environment when beginning at another position. It quickly finds the goal tile and mostly doesn't take any detours.
Interestingly, when the agent is placed on position (4,8) it first goes down and to the left instead of taking the quicker path upwards. Considering that the reward model of `exp_001` does not incentivize the agent to find the shortest path (no time penalty), this is not especially surprising. Still, this leads to the hypothesis that the agent found that going downwards from this position would lead to more reward than going up. (Or maybe due to the update rule in Q-Learning? --> # TODO investigate this)

What happens when lava tiles are placed in the agent's way?

In [22]:
# Alternative lava layout, built in three parts:
# a full column at x=1, a partial column at x=3, and a few single tiles.
alt_lava_positions = [(1, y) for y in range(1, 11 - 1)]
alt_lava_positions += [(3, y) for y in range(11 - 3, 11 - 8, -1)]
alt_lava_positions += [(6, 11 - 5), (6, 11 - 6), (2, 8)]

# First run: default start position, alternative lava layout.
first_lava_env = make_env(lava_positions=alt_lava_positions)
test_agent_on_environment(
    exp_001_path,
    num_episodes=1,
    custom_environment=first_lava_env,
)

# Second run: one more lava tile and a different start position.
alt_lava_positions.append((3,3))
second_lava_env = make_env(
    lava_positions=alt_lava_positions,
    agent_start_pos=(4,7),
)
test_agent_on_environment(
    exp_001_path,
    num_episodes=1,
    custom_environment=second_lava_env,
)


Starting episode 1
Episode ended after 1 time steps.
Total reward: -1
----------------
Using matplotlib backend: MacOSX
Starting episode 1
Episode ended after 5 time steps.
Total reward: -1
----------------
Using matplotlib backend: MacOSX


It seems that the agent is not able to recognize lava tiles. It always goes straight to the goal location and only evades the lava tile positions that it already knows from training. It seems the agent has not learned the causation between lava and negative reward but instead learned the correlation between the positions (of lava tiles) in the gridworld and the negative reward. This would also explain why changing the starting position does not confuse the agent in searching the goal tile (when it is at the original position). During training, only one part of the observation tensor is constantly changing, namely the agent's position.
**INTERESTING:** Train an agent on a self-shifting environment (e.g. change goal position every five episodes and change lava tile positions every 5 episodes)

This leads to the following question: 
Does the agent find the goal when the goal is placed somewhere else?

In [23]:
# Same agent, but the goal tile is moved to (2,2).
moved_goal_env = make_env(
    max_steps=25, # Changed limit because agent is caught in a loop
    goal_positions=[(2,2)],
)
test_agent_on_environment(
    exp_001_path,
    num_episodes=1,
    custom_environment=moved_goal_env,
)

Starting episode 1
Episode ended after 25 time steps.
Total reward: 0
---------------
Using matplotlib backend: MacOSX


It is clearly visible that the agent is not able to reach the goal tile when the goal is placed somewhere else. It gets caught in a loop and oscillates from left to right (an analysis that shows the q-net's output for each possible state might be interesting here).
This strengthens the hypothesis that the agent only effectively learns cause and effect when the environment dynamics change during training (e.g. lava and goal placement). Currently, the agent is only able to reach the goal at the position it had during training.

In [24]:
# Keep only the policy object (first element of the returned tuple).
# Indexing instead of unpacking into `_` avoids rebinding `_`, which IPython
# uses for the most recent cell output.
p = load_model_params(exp_001_path)[0]
# Display the network architecture of the loaded policy
p

DQNPolicy(
  (q_net): QNetwork(
    (features_extractor): FlattenExtractor(
      (flatten): Flatten(start_dim=1, end_dim=-1)
    )
    (q_net): Sequential(
      (0): Linear(in_features=484, out_features=64, bias=True)
      (1): ReLU()
      (2): Linear(in_features=64, out_features=64, bias=True)
      (3): ReLU()
      (4): Linear(in_features=64, out_features=4, bias=True)
    )
  )
  (q_net_target): QNetwork(
    (features_extractor): FlattenExtractor(
      (flatten): Flatten(start_dim=1, end_dim=-1)
    )
    (q_net): Sequential(
      (0): Linear(in_features=484, out_features=64, bias=True)
      (1): ReLU()
      (2): Linear(in_features=64, out_features=64, bias=True)
      (3): ReLU()
      (4): Linear(in_features=64, out_features=4, bias=True)
    )
  )
)

In [25]:
# TODO create observation here
def q_values_for_observation(policy, raw_obs):
    """Return the Q-values that ``policy.q_net`` assigns to one raw observation."""
    # Add a leading batch dimension in front of the observation-space shape.
    batched_obs = raw_obs.reshape((-1,) + policy.observation_space.shape)
    # Place the input on the device the policy lives on instead of assuming "cpu".
    obs_tensor = obs_as_tensor(batched_obs, policy.device)
    with th.no_grad():
        return policy.q_net(obs_tensor)


env = gym.make("MiniGrid-RiskyPath-v0")
env = TensorObsWrapper(env)
obs = env.reset()

# Q-values for the initial observation
q_values = q_values_for_observation(p, obs)
print(q_values)

# NOTE(review): the action index is hard-coded. The Q-values printed for the
# initial observation peak at index 1, so confirm that 2 is the intended move.
# Indexing (instead of unpacking into `_`) keeps IPython's `_` intact.
obs = env.step(2)[0]

# Q-values for the observation after that step
q_values = q_values_for_observation(p, obs)
print(q_values)

tensor([[-0.9756,  0.9524,  0.8197,  0.9148]])
tensor([[ 0.9259, -0.9978,  0.6336,  0.7985]])


If we look at the q-values, we can see that from the starting point, the agent prefers going right. After landing on this tile, the maximal q-value corresponds to moving to the left tile, which catches the agent in a loop!

## `time_penalty`

### A2C on `time_penalty`

In [26]:
# Saved agent for time_penalty: A2C on tensor observations, seed 763 (as encoded in the path)
pen_a2c = model_path_prefix + "time_penalty/tensor_obs/a2c/algo_default/seed_763.zip"

In [27]:
# One episode in the environment configured for "time_penalty" in env_config.json
test_agent_on_environment(
    pen_a2c
)

Starting episode 1
Episode ended after 1 time steps.
Total reward: -1.1
------------------
Using matplotlib backend: MacOSX


## `exp_hard_001`

In [28]:
# Saved agent for exp_hard_001: A2C on tensor observations, seed 763 (as encoded in the path)
hard_a2c = model_path_prefix + "exp_hard_001/tensor_obs/a2c/algo_default/seed_763.zip"

In [29]:
# One episode in the environment configured for "exp_hard_001" in env_config.json
test_agent_on_environment(hard_a2c)

Starting episode 1
Episode ended after 150 time steps.
Total reward: 0
---------------
Using matplotlib backend: MacOSX


## Slipping Experiments

### `exp_slip_1`

### `exp_slip_2`

### `exp_slip_3`

In [30]:
# TODO put this in time_penalty
# TODO test how the agent reacts to changed environment (changed goal, changed lava etc.)

# Saved agent for time_penalty: DQN on tensor observations, seed 763 (as encoded in the path)
tp_e = model_path_prefix + "time_penalty/tensor_obs/dqn/algo_default/seed_763.zip"
# One episode in the environment configured for "time_penalty" in env_config.json
test_agent_on_environment(tp_e)

Starting episode 1
Episode ended after 7 time steps.
Total reward: 0.30000000000000004
---------------------------------
Using matplotlib backend: MacOSX


In [31]:
# a = policy object, b = policy class, c = policy kwargs (the tuple returned by load_model_params)
a,b,c = load_model_params(tp_e)

In [32]:
# Display the network architecture of the loaded policy
a

DQNPolicy(
  (q_net): QNetwork(
    (features_extractor): FlattenExtractor(
      (flatten): Flatten(start_dim=1, end_dim=-1)
    )
    (q_net): Sequential(
      (0): Linear(in_features=484, out_features=64, bias=True)
      (1): ReLU()
      (2): Linear(in_features=64, out_features=64, bias=True)
      (3): ReLU()
      (4): Linear(in_features=64, out_features=4, bias=True)
    )
  )
  (q_net_target): QNetwork(
    (features_extractor): FlattenExtractor(
      (flatten): Flatten(start_dim=1, end_dim=-1)
    )
    (q_net): Sequential(
      (0): Linear(in_features=484, out_features=64, bias=True)
      (1): ReLU()
      (2): Linear(in_features=64, out_features=64, bias=True)
      (3): ReLU()
      (4): Linear(in_features=64, out_features=4, bias=True)
    )
  )
)

In [33]:
# TODO create observation here
# Fresh default environment; `obs` holds its initial observation.
# `obs_as_tensor` and `th` are imported in the import cell at the top of the notebook.
env = gym.make("MiniGrid-RiskyPath-v0")
env = TensorObsWrapper(env)
obs = env.reset()

In [34]:
# Use this code to extract the q_values from the model
# This code is adapted from the following stackoverflow post
# https://stackoverflow.com/questions/73239501/how-to-get-the-q-values-in-dqn-in-stable-baseline-3/73242315#73242315?newreg=d2762c51b8bc44778cde16b43499a6d5
# TODO use this in evaluation script
# TODO make policy visualization on gridworld map with this!
# Add a leading batch dimension in front of the observation-space shape.
observation = obs.reshape((-1,) + a.observation_space.shape)
# Place the input on the device the policy lives on instead of assuming "cpu",
# so the cell also works when the model was loaded onto an accelerator.
observation = obs_as_tensor(observation, a.device)
with th.no_grad():
    q_values = a.q_net(observation)
q_values

tensor([[-1.0951,  0.2638,  0.0765,  0.1530]])