# Evaluate Models

Play each game with each model and save the results to a CSV file. This file can then be used to create plots and to compare the models.

In [1]:
import os
import itertools
import random
import re
from builtins import range
from pathlib import Path

# Set the TensorFlow logging level to suppress debug messages
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'
import numpy as np
import pandas as pd
import tensorflow as tf
from tqdm.auto import tqdm

import Training.TrainingScript as TrainingScript
from Ensemble import EnsembleMethods, Ensemble
from Soup import Soup

In [2]:
# Input: CSV with one row per trained model (read below with pd.read_csv).
# Relative paths are resolved against the kernel's working directory.
RESULTS_CSV = "./../../results.csv"
# Output: CSV that receives the combined evaluation table at the end of the notebook.
EVALUATION_RESULTS_CSV = "./../../evaluation_results.csv"

# Number of episodes played per evaluated model, ensemble or soup.
NUM_REPETITIONS = 15
# Number of episodes played per game for the random-play baseline.
NUM_REPETITIONS_FOR_RANDOM_BASELINE = 250

# How many of the best single models (highest mean reward) are combined per Top-N ensemble/soup.
N_FOR_TOP_N_ENSEMBLES = 3
# How many of the last snapshots of one training run are combined per snapshot ensemble/soup.
M_FOR_SNAPSHOT_ENSEMBLES_AND_SOUPS = 3
# Ensemble combination methods that the ensemble evaluation cells iterate over.
ENSEMBLE_METHODS_USED = [
    EnsembleMethods.AVERAGE,
    EnsembleMethods.LOGISTIC_AVERAGE,
    EnsembleMethods.AVERAGE_WITH_CONFIDENCE,
    EnsembleMethods.LOGISTIC_AVERAGE_WITH_CONFIDENCE,
    EnsembleMethods.MAJORITY_VOTE,
]

In [3]:
# Load the training results; the cells below use its "game", "training_model" and "model_path" columns.
results_df = pd.read_csv(RESULTS_CSV)
results_df.head()

Unnamed: 0,game,training_model,reward_history,loss_history,model_path
0,breakout,with_huber_loss_and_adam,"[0.0, 2.0, 0.0, 0.0, 0.0, 0.0, 2.0, 6.0, 0.0, ...","[0.01637558452785015, 0.011548655107617378, 0....",./../models/with_huber_loss_and_adam/breakout/...
1,breakout,with_huber_loss_and_adam,"[2.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 3.0, 0.0, ...","[0.0017978374380618334, 0.008414973504841328, ...",./../models/with_huber_loss_and_adam/breakout/...
2,enduro,with_huber_loss_and_adam,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.013470819219946861, 0.00460647139698267, 0....",./../models/with_huber_loss_and_adam/enduro/st...
3,enduro,mnih2015,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.004595236387103796, 0.005559221841394901, 0...",./../models/mnih2015/enduro/started_at_2023-09...
4,enduro,mnih2013,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.10596290975809097, 0.11753647774457932, 0.1...",./../models/mnih2013/enduro/started_at_2023-09...


In [4]:
# Inspect the training runs of a single model flavor (here "mnih2013").
results_df[results_df["training_model"] == "mnih2013"]

Unnamed: 0,game,training_model,reward_history,loss_history,model_path
4,enduro,mnih2013,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.10596290975809097, 0.11753647774457932, 0.1...",./../models/mnih2013/enduro/started_at_2023-09...
5,enduro,mnih2013,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.03629424795508385, 0.07472917437553406, 0.0...",./../models/mnih2013/enduro/started_at_2023-09...
10,breakout,mnih2013,"[0.0, 3.0, 4.0, 0.0, 1.0, 2.0, 0.0, 1.0, 4.0, ...","[0.01985565945506096, 0.01905544474720955, 0.0...",./../models/mnih2013/breakout/started_at_2023-...
12,breakout,mnih2013,"[0.0, 5.0, 0.0, 0.0, 2.0, 0.0, 1.0, 0.0, 0.0, ...","[0.001379762077704072, 0.0012381058186292648, ...",./../models/mnih2013/breakout/started_at_2023-...
14,breakout,mnih2013,"[0.0, 1.0, 0.0, 0.0, 0.0, 3.0, 1.0, 3.0, 1.0, ...","[0.015244378708302975, 0.014382739551365376, 0...",./../models/mnih2013/breakout/started_at_2023-...
18,enduro,mnih2013,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.010004216805100441, 0.012425143271684647, 0...",./../models/mnih2013/enduro/started_at_2023-09...
30,seaquest,mnih2013,"[40.0, 0.0, 140.0, 60.0, 0.0, 20.0, 20.0, 0.0,...","[0.02893028035759926, 0.027503740042448044, 0....",./../models/mnih2013/seaquest/started_at_2023-...
31,seaquest,mnih2013,"[100.0, 40.0, 0.0, 40.0, 0.0, 40.0, 40.0, 40.0...","[0.05895446613430977, 0.05621371418237686, 25....",./../models/mnih2013/seaquest/started_at_2023-...
32,breakout,mnih2013,"[1.0, 4.0, 2.0, 0.0, 0.0, 0.0, 0.0, 2.0, 0.0, ...","[0.0011551892384886742, 0.0012054498074576259,...",./../models/mnih2013/breakout/started_at_2023-...
33,breakout,mnih2013,"[1.0, 3.0, 1.0, 1.0, 1.0, 2.0, 1.0, 2.0, 5.0, ...","[0.04612644761800766, 0.04612863063812256, 0.0...",./../models/mnih2013/breakout/started_at_2023-...


In [5]:
# Distinct games and training-model names found in the results table.
list_of_games = list(results_df["game"].unique())
list_of_algorithms = list(results_df["training_model"].unique())

print(list_of_games, list_of_algorithms)

['breakout', 'enduro', 'seaquest'] ['with_huber_loss_and_adam', 'mnih2015', 'mnih2013', 'interpretable_cnn']


In [6]:
# Empty frame that declares the result columns.
# NOTE(review): nothing in this notebook reads it before evaluation_data_df is reassigned in the
# "Save the Evaluation Data" section - possibly removable.
evaluation_data_df = pd.DataFrame(columns=["game", "model", "model_id", "episode_rewards", "mean", "standard_deviation"])

## Methods to Run Evaluations

In [7]:
@tf.function(autograph=False)
def get_action_from_model(model, state):
    """Return, for every row of ``state``, the index of the largest model output.

    ``model`` is called on ``state`` and the argmax is taken along axis 1
    (presumably the action axis of a ``(batch, num_actions)`` Q-value tensor - TODO confirm).
    """
    return tf.argmax(model(state), axis=1)


In [8]:
def evaluate_model(game: str, model: tf.keras.Model, num_repetitions: int = 10, epsilon: float = 0.05) -> list:
    """Play ``num_repetitions`` episodes of ``game`` with ``model`` and return the episode rewards.

    In every step the greedy action from ``get_action_from_model`` is used, except
    that with probability ``epsilon`` a random action is sampled instead.

    Args:
        game: Name of the game, passed to ``TrainingScript.create_env``.
        model: Model that is called on a batch containing the current state.
        num_repetitions: Number of episodes to play.
        epsilon: Probability of taking a random action in a step.

    Returns:
        A list with the summed reward of each episode.
    """
    env = TrainingScript.create_env(game)
    # env = gym.wrappers.RecordVideo(env, video_folder='./video/', episode_trigger=lambda episode_id: episode_id % num_repetitions == 0)
    rewards = []
    try:
        for i in (tbar := tqdm(range(num_repetitions), leave=False)):
            state, _ = env.reset()
            done = False
            episode_reward = 0
            step = 0
            while not done:
                if random.random() < epsilon:
                    # To help the agent when it gets "stuck"
                    # Happens mainly in Breakout caused by a bug in the game
                    action = env.action_space.sample()
                else:
                    # Scale the observation by 1/255 and add a batch axis before querying the model.
                    model_input = tf.convert_to_tensor(state, dtype=tf.float32) / 255.0
                    model_input = tf.expand_dims(model_input, axis=0)
                    action = get_action_from_model(model, model_input).numpy()[0]

                state, reward, terminated, truncated, _ = env.step(action)
                done = terminated or truncated
                episode_reward += reward
                # Count every environment step, including the randomly chosen ones.
                step += 1

                tbar.set_description(f"Episode {i}  -  Step: {step}, Reward: {episode_reward}")

            rewards.append(episode_reward)
    finally:
        # Release the environment even if an episode raises or the cell is interrupted.
        env.close()
    return rewards

In [9]:
def load_model(path: str):
    """Load the Keras model stored at ``path`` and compile it with default settings.

    The model is loaded with ``compile=False`` and then compiled by a plain
    ``compile()`` call without arguments.
    """
    loaded = tf.keras.models.load_model(path, compile=False)
    loaded.compile()
    return loaded

In [10]:
def load_models(path_list: list[Path]) -> list:
    """Load every model in ``path_list`` with ``load_model`` and return them in the same order."""
    loaded_models = []
    for model_path in path_list:
        loaded_models.append(load_model(model_path))
    return loaded_models

In [11]:
def random_play(game: str, num_repetitions: int = 5) -> list:
    """Play ``num_repetitions`` episodes of ``game`` with uniformly sampled actions.

    Args:
        game: Name of the game, passed to ``TrainingScript.create_env``.
        num_repetitions: Number of episodes to play.

    Returns:
        A list with the summed reward of each episode.
    """
    env = TrainingScript.create_env(game)
    rewards = []

    try:
        for _ in tqdm(range(num_repetitions), unit="episode", desc="Random play " + game.capitalize(), leave=False):
            # The observations are not needed because the actions are sampled blindly.
            env.reset()
            done = False
            episode_reward = 0
            while not done:
                _, reward, terminated, truncated, _ = env.step(env.action_space.sample())
                done = terminated or truncated
                episode_reward += reward

            rewards.append(episode_reward)
    finally:
        # Release the environment even if an episode raises or the cell is interrupted.
        env.close()

    return rewards

In [12]:
def get_snapshots(dir) -> list:
    """Return the snapshot files of one training run, oldest first.

    Args:
        dir: Either the directory that holds the snapshots or the path of a
            ``.keras`` file inside it (for example ``snapshot_final.keras``).

    Returns:
        A list of path strings: every ``snapshot_<idx>.keras`` file ordered by its
        numeric index, followed by ``snapshot_final.keras`` as the last entry.
        The final snapshot is appended without checking that the file exists.
    """
    directory = Path(dir)
    # A checkpoint path is reduced to its folder. The suffix check also covers a
    # checkpoint file that does not exist, which an is-file test alone would
    # mistake for a directory.
    if directory.is_file() or (directory.suffix == ".keras" and not directory.is_dir()):
        directory = directory.parent

    snapshot_re = re.compile(r"snapshot_(?P<idx>\d+)\.keras")
    numbered = {}
    for file in directory.glob("*.keras"):
        found = snapshot_re.fullmatch(file.name)
        if found:
            numbered[int(found["idx"])] = str(file)

    # Order by the numeric index: sorting the path strings would put snapshot_10
    # before snapshot_2 and break the "last M snapshots" selection.
    ordered = [numbered[idx] for idx in sorted(numbered)]
    # The final snapshot is always the newest, whatever the numbering looks like.
    ordered.append(str(directory / "snapshot_final.keras"))
    return ordered

## Random Baseline for Each Game

In [13]:
# Play every game with random actions to obtain a reference score per game.
random_baseline = []
baseline_games = tqdm(results_df["game"].unique(), unit="game", desc="Calculate random baseline")
for game_name in baseline_games:
    rewards = random_play(game_name, num_repetitions=NUM_REPETITIONS_FOR_RANDOM_BASELINE)
    baseline_record = {
        "game": game_name,
        "model": "random play",
        "model_id": 0,
        "episode_rewards": rewards,
        "mean": np.mean(rewards),
        "standard_deviation": np.std(rewards)
    }
    random_baseline.append(baseline_record)

Calculate random baseline:   0%|          | 0/3 [00:00<?, ?game/s]

Random play Breakout:   0%|          | 0/250 [00:00<?, ?episode/s]

Random play Enduro:   0%|          | 0/250 [00:00<?, ?episode/s]

Random play Seaquest:   0%|          | 0/250 [00:00<?, ?episode/s]

In [14]:
# One row per game with the random-play episode rewards, their mean and standard deviation.
random_baseline_df = pd.DataFrame.from_records(random_baseline)
random_baseline_df

Unnamed: 0,game,model,model_id,episode_rewards,mean,standard_deviation
0,breakout,random play,0,"[0.0, 2.0, 1.0, 0.0, 0.0, 0.0, 2.0, 0.0, 0.0, ...",1.264,1.397964
1,enduro,random play,0,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",0.0,0.0
2,seaquest,random play,0,"[20.0, 80.0, 160.0, 120.0, 0.0, 100.0, 120.0, ...",80.16,61.812413


## Evaluate each Model on each Game

In [15]:
# Evaluate every trained model listed in results_df on the game it was trained for.
evaluation_results = []
game_model_pairs = list(itertools.product(list_of_games, list_of_algorithms))
for game_name, model_name in (t := tqdm(game_model_pairs)):
    t.set_description(f"Evaluation models for {game_name.capitalize()} using {model_name.capitalize()}")
    is_match = (results_df["game"] == game_name) & (results_df["training_model"] == model_name)
    fitting_models = results_df.loc[is_match, "model_path"].tolist()

    for idx, m_path in enumerate(fitting_models):
        # Paths in the CSV are relative; resolve() anchors them at the current working directory.
        model = load_model(str(Path(m_path).resolve()))
        rewards = evaluate_model(game_name, model, num_repetitions=NUM_REPETITIONS)

        single_model_record = {
            "game": game_name,
            "model": model_name,
            "model_id": idx,
            "episode_rewards": rewards,
            "mean": np.mean(rewards),
            "standard_deviation": np.std(rewards),
            "model_path": m_path
        }
        evaluation_results.append(single_model_record)

  0%|          | 0/12 [00:00<?, ?it/s]

OSError: No file or directory found at C:\Users\timwi\Documents\Uni\DeepReinforcementLearning\src\models\with_huber_loss_and_adam\breakout\started_at_2023-09-23_14-53-39\snapshot_final.keras

In [None]:
# Table built from the single-model records in evaluation_results; the ensemble and soup
# sections below select their member models from it.
evaluation_single_models_df = pd.DataFrame.from_records(evaluation_results)
evaluation_single_models_df.head()

## Evaluate as Ensembles

In [None]:
# Model flavors used below: every training model except "interpretable_cnn"
# ("random play" is subtracted as well). Note that a set has no guaranteed iteration order.
models_used = set(list_of_algorithms) - {"random play", "interpretable_cnn"}

In [None]:
def evaluate_ensemble(game: str, model: tf.keras.Model, num_repetitions: int = 10, epsilon: float = 0.05) -> list:
    """Play ``num_repetitions`` episodes of ``game`` with an ensemble and return the episode rewards.

    In every step the action with the largest value returned by ``model.predict``
    is used, except that with probability ``epsilon`` a random action is sampled instead.

    Args:
        game: Name of the game, passed to ``TrainingScript.create_env``.
        model: Ensemble whose ``predict`` is called on a batch containing the current state.
        num_repetitions: Number of episodes to play.
        epsilon: Probability of taking a random action in a step.

    Returns:
        A list with the summed reward of each episode.
    """
    env = TrainingScript.create_env(game)
    rewards = []
    try:
        for i in (tbar := tqdm(range(num_repetitions), leave=False)):
            state, _ = env.reset()
            done = False
            episode_reward = 0
            step = 0
            while not done:
                if random.random() < epsilon:
                    # To help the agent when it gets "stuck"
                    # Happens mainly in Breakout caused by a bug in the game
                    action = env.action_space.sample()
                else:
                    # Scale the observation by 1/255 and add a batch axis before querying the ensemble.
                    model_input = tf.convert_to_tensor(state, dtype=tf.float32) / 255.0
                    model_input = tf.expand_dims(model_input, axis=0)
                    q_values = model.predict(model_input, verbose=0)
                    action = tf.argmax(q_values, axis=1).numpy()[0]

                state, reward, terminated, truncated, _ = env.step(action)
                done = terminated or truncated
                episode_reward += reward
                # Count every environment step, including the randomly chosen ones.
                step += 1

                tbar.set_description(f"Episode {i}  -  Step: {step}, Reward: {episode_reward}")

            rewards.append(episode_reward)
    finally:
        # Release the environment even if an episode raises or the cell is interrupted.
        env.close()
    return rewards

### Uniform Ensembles consisting of the best $n$ models of a flavor:

In [None]:
# Collects one result record per (game, model flavor, ensemble method) in the loop below.
evaluation_results_uniform_ensembles = []

In [None]:
ensemble_jobs = list(itertools.product(list_of_games, models_used))
for game_name, model_name in (t := tqdm(ensemble_jobs, desc="Build and Ensemble Models")):
    t.set_description(f"Build and Ensemble Models ({model_name.capitalize()} for {game_name})")

    # Rank this flavor's models for the game (highest mean first, lowest spread as
    # tie-breaker) and keep the Top-N.
    is_candidate = (evaluation_single_models_df["model"] == model_name) & (evaluation_single_models_df["game"] == game_name)
    model_selection = (
        evaluation_single_models_df[is_candidate]
        .sort_values(by=["mean", "standard_deviation"], ascending=[False, True])
        .head(N_FOR_TOP_N_ENSEMBLES)
    )
    model_paths = [str(Path(m_path).resolve()) for m_path in model_selection["model_path"]]
    models = load_models(model_paths)

    # The same member models are combined once per ensemble method.
    for idx, ensemble_method in enumerate(ENSEMBLE_METHODS_USED):
        ensemble = Ensemble(models, ensemble_method)
        progress_text = f"Build and Ensemble Models ({model_name.capitalize()} for {game_name} with {ensemble_method})"
        t.set_description(progress_text)

        print(progress_text)
        rewards = evaluate_ensemble(game_name, ensemble, NUM_REPETITIONS)
        uniform_ensemble_record = {
            "game": game_name,
            "model": f"Top-{N_FOR_TOP_N_ENSEMBLES} Ensemble ({ensemble_method}) with {model_name}",
            "model_id": idx,
            "episode_rewards": rewards,
            "mean": np.mean(rewards),
            "standard_deviation": np.std(rewards),
            "model_path": model_paths
        }
        evaluation_results_uniform_ensembles.append(uniform_ensemble_record)


In [None]:
# Table of the uniform-ensemble records; head() previews the first rows.
evaluation_results_uniform_ensembles_df = pd.DataFrame.from_records(evaluation_results_uniform_ensembles)
evaluation_results_uniform_ensembles_df.head()

### Mixed Ensemble

One Ensemble containing the best $n$ models of each model type (e.g. 3 models of Mnih 2013, 3 models of Mnih 2015, and 3 models of Mnih 2015 with Huber loss and Adam).

In [None]:
# Collects one result record per (game, ensemble method) in the loop below.
evaluation_results_mixed_ensembles = []

In [None]:
for game_name in (t := tqdm(list_of_games)):
    t.set_description(f"Evaluate mixed Ensemble for {game_name}")
    # Select Top-N models for each model type:
    # NOTE(review): this iterates list_of_algorithms rather than models_used, so models of the
    # "interpretable_cnn" flavor are part of the mixed ensemble - confirm that this is intended.
    model_paths = []
    for model_name in list_of_algorithms:
        model_selection = evaluation_single_models_df[
                          (evaluation_single_models_df["model"] == model_name) & (evaluation_single_models_df["game"] == game_name)
                        ].sort_values(by=["mean", "standard_deviation"], ascending=[False, True])[:N_FOR_TOP_N_ENSEMBLES]

        model_paths.extend([str(Path(m_path).resolve()) for m_path in list(model_selection["model_path"])])

    # The member models are loaded once and shared by all ensemble methods.
    models = load_models(model_paths)
    for ensemble_method in ENSEMBLE_METHODS_USED:
        t.set_description(f"Evaluate mixed Ensemble ({ensemble_method}) for {game_name}")
        # Pass the method explicitly; without it every iteration evaluated the ensemble's
        # default method while the record was labelled with a different one.
        ensemble = Ensemble(models, ensemble_method)
        rewards = evaluate_ensemble(game_name, ensemble, num_repetitions=NUM_REPETITIONS)
        evaluation_results_mixed_ensembles.append({
            "game": game_name,
            "model": f"Top-{N_FOR_TOP_N_ENSEMBLES} Mixed Ensemble ({ensemble_method})",
            "model_id": 0,
            "episode_rewards": rewards,
            "mean": np.mean(rewards),
            "standard_deviation": np.std(rewards),
            "model_path": model_paths
        })


In [None]:
# Table of the mixed-ensemble records; head() previews the first rows.
evaluation_results_mixed_ensembles_df = pd.DataFrame.from_records(evaluation_results_mixed_ensembles)
evaluation_results_mixed_ensembles_df.head()

### Snapshot Ensemble

Ensembles consisting of the last $M$ training snapshots of a model. In the original paper, the snapshots are selected on the fly during training: a model is saved whenever a local minimum is reached, and the learning rate is increased again after each snapshot has been taken. This leads to models that differ more from one another and achieve better results. This is not easily applicable to RL, so I use the $M$ newest snapshots instead.

In [None]:
# Collects one result record per (training run, ensemble method) in the loop below.
evaluation_results_snapshot_ensembles = []

In [None]:
for game_name, model_name in (t := tqdm(list(itertools.product(list_of_games, models_used)),
                                        desc="Build Snapshot Ensemble")):
    t.set_description(f"Build Snapshot Ensemble ({model_name.capitalize()} for {game_name})")

    model_paths_for_game_and_model = \
        evaluation_single_models_df[(evaluation_single_models_df["model"] == model_name) & (evaluation_single_models_df["game"] == game_name)]["model_path"]

    # One snapshot ensemble per training run of this flavor and game.
    for model_idx, unique_model_path in enumerate(model_paths_for_game_and_model.unique()):
        model_paths = get_snapshots(unique_model_path)
        # Select M last snapshots:
        model_paths = model_paths[-M_FOR_SNAPSHOT_ENSEMBLES_AND_SOUPS:]
        models = load_models(model_paths)

        # The inner loop must not reuse the run index: model_id has to identify the
        # training run the snapshots come from, not the position of the ensemble method.
        for ensemble_method in ENSEMBLE_METHODS_USED:
            ensemble = Ensemble(models, ensemble_method)

            rewards = evaluate_ensemble(game_name, ensemble, NUM_REPETITIONS)
            evaluation_results_snapshot_ensembles.append({
                "game": game_name,
                "model": f"{M_FOR_SNAPSHOT_ENSEMBLES_AND_SOUPS}-Snapshot Ensemble ({ensemble_method}) with {model_name}",
                "model_id": model_idx,
                "episode_rewards": rewards,
                "mean": np.mean(rewards),
                "standard_deviation": np.std(rewards),
                "model_path": model_paths
            })

In [None]:
# Table of the snapshot-ensemble records; head() previews the first rows.
evaluation_results_snapshot_ensembles_df = pd.DataFrame.from_records(evaluation_results_snapshot_ensembles)
evaluation_results_snapshot_ensembles_df.head()

## Evaluate as Soups

In [None]:
# Recomputes the same set of model flavors as in the ensemble section:
# every training model except "interpretable_cnn" ("random play" is subtracted as well).
models_used = set(list_of_algorithms) - {"random play", "interpretable_cnn"}

### Uniform Soups created from the best $n$ models of a flavor:

In [None]:
# Collects one result record per (game, model flavor) in the loop below.
evaluation_results_uniform_soups = []

In [None]:
# Soup is imported in the import cell at the top of the notebook together with the
# other project modules, so this cell only contains the evaluation itself.
for game_name, model_name in (t := tqdm(list(itertools.product(list_of_games, models_used)))):
    t.set_description(f"Cooking the Soup Model ({model_name.capitalize()} for {game_name})")

    # Select Top-N models:
    model_selection = evaluation_single_models_df[
                          (evaluation_single_models_df["model"] == model_name) & (evaluation_single_models_df["game"] == game_name)
                        ].sort_values(by=["mean", "standard_deviation"], ascending=[False, True])[:N_FOR_TOP_N_ENSEMBLES]

    model_paths = [str(Path(m_path).resolve()) for m_path in list(model_selection["model_path"])]
    models = load_models(model_paths)

    # The soup is a single model built from the selected models, so it is evaluated
    # with evaluate_model rather than evaluate_ensemble.
    soup = Soup(models).get_soup_model()

    rewards = evaluate_model(game_name, soup, NUM_REPETITIONS)
    evaluation_results_uniform_soups.append({
        "game": game_name,
        "model": f"Top-{N_FOR_TOP_N_ENSEMBLES} Soup of {model_name}",
        "model_id": 0,
        "episode_rewards": rewards,
        "mean": np.mean(rewards),
        "standard_deviation": np.std(rewards),
        "model_path": model_paths
    })


In [None]:
# Table of the uniform-soup records; head() previews the first rows.
evaluation_results_uniform_soups_df = pd.DataFrame.from_records(evaluation_results_uniform_soups)
evaluation_results_uniform_soups_df.head()

### Snapshot Soup

Works like the Snapshot Ensemble, but as a Soup.

In [None]:
# Collects one result record per training run in the loop below.
evaluation_results_snapshot_soups = []

In [None]:
soup_jobs = list(itertools.product(list_of_games, models_used))
for game_name, model_name in (t := tqdm(soup_jobs, desc="Cooking Soup Model")):
    t.set_description(f"Cooking Soup Model ({model_name.capitalize()} for {game_name})")

    # All training runs of this flavor for this game, taken from the training results table.
    is_training_run = (results_df["training_model"] == model_name) & (results_df["game"] == game_name)
    model_paths_for_game_and_model = results_df[is_training_run]["model_path"]

    for idx, unique_model_path in enumerate(model_paths_for_game_and_model.unique()):
        # Keep only the M last snapshots of this run.
        model_paths = get_snapshots(unique_model_path)[-M_FOR_SNAPSHOT_ENSEMBLES_AND_SOUPS:]
        models = load_models(model_paths)

        soup = Soup(models).get_soup_model()

        rewards = evaluate_model(game_name, soup, NUM_REPETITIONS)
        snapshot_soup_record = {
            "game": game_name,
            "model": f"{M_FOR_SNAPSHOT_ENSEMBLES_AND_SOUPS}-Snapshot Soup {model_name}",
            "model_id": idx,
            "episode_rewards": rewards,
            "mean": np.mean(rewards),
            "standard_deviation": np.std(rewards),
            "model_path": model_paths
        }
        evaluation_results_snapshot_soups.append(snapshot_soup_record)

In [None]:
# Table of the snapshot-soup records; head() previews the first rows.
evaluation_results_snapshot_soups_df = pd.DataFrame.from_records(evaluation_results_snapshot_soups)
evaluation_results_snapshot_soups_df.head()

## Save the Evaluation Data

In [None]:
# Stack all partial result tables and renumber the rows from 0.
evaluation_frames = [
    random_baseline_df,
    evaluation_single_models_df,
    # Ensembles:
    evaluation_results_uniform_ensembles_df,
    evaluation_results_mixed_ensembles_df,
    evaluation_results_snapshot_ensembles_df,
    # Soups:
    evaluation_results_uniform_soups_df,
    evaluation_results_snapshot_soups_df
]
evaluation_data_df = pd.concat(evaluation_frames).reset_index(drop=True)
evaluation_data_df.head()

In [None]:
# Persist the combined table; index=False leaves the row index out of the file.
evaluation_data_df.to_csv(EVALUATION_RESULTS_CSV, index=False)