# Evaluate Models

Play each game with each model and save the results to a csv file. This file can be used to create plots and compare the models.

In [1]:
import os
import itertools
import random
import re
from builtins import range
from pathlib import Path

# Set the TensorFlow logging level to suppress debug messages
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'
import numpy as np
import pandas as pd
import tensorflow as tf
from tqdm.auto import tqdm

import Training.TrainingScript as TrainingScript
from Ensemble import EnsembleMethods, Ensemble

In [2]:
RESULTS_CSV = "./../../results.csv"
EVALUATION_RESULTS_CSV = "./../../evaluation_results.csv"
MODELS = "./../../models/"

NUM_REPETITIONS = 2#5
NUM_REPETITIONS_FOR_RANDOM_BASELINE = 2#50

N_FOR_TOP_N_ENSEMBLES = 2
M_FOR_SNAPSHOT_ENSEMBLES_AND_SOUPS = 3
ENSEMBLE_METHODS_USED = [
    EnsembleMethods.AVERAGE,
    EnsembleMethods.LOGISTIC_AVERAGE,
    # EnsembleMethods.AVERAGE_WITH_CONFIDENCE,
    # EnsembleMethods.LOGISTIC_AVERAGE_WITH_CONFIDENCE,
    EnsembleMethods.MAJORITY_VOTE,
]

In [3]:
results_df = pd.read_csv(RESULTS_CSV)
results_df.head()

Unnamed: 0,game,training_model,reward_history,loss_history,model_path
0,seaquest,mnih2013,"[100.0, 0.0, 100.0, 100.0, 0.0, 120.0, 40.0, 2...","[0.012014569714665413, 12.65832805633545, 0.01...",mnih2013/seaquest/started_at_2023-09-25_12-09-...
1,enduro,interpretable_cnn,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[8.388333662878722e-05, 0.00010635607759468257...",interpretable_cnn/enduro/started_at_2023-09-25...
2,breakout,interpretable_cnn,"[0.0, 2.0, 0.0, 1.0, 4.0, 2.0, 0.0, 1.0, 0.0, ...","[5.418467117124237e-05, 4.053674638271332e-05,...",interpretable_cnn/breakout/started_at_2023-09-...
3,enduro,mnih2013,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.021065175533294678, 0.022660288959741592, 0...",mnih2013/enduro/started_at_2023-09-25_12-49-42...
4,seaquest,mnih2013,"[0.0, 40.0, 20.0, 20.0, 20.0, 120.0, 160.0, 20...","[0.02780219167470932, 0.027884984388947487, 0....",mnih2013/seaquest/started_at_2023-09-25_12-41-...


In [4]:
results_df[results_df["training_model"] == "mnih2013"]

Unnamed: 0,game,training_model,reward_history,loss_history,model_path
0,seaquest,mnih2013,"[100.0, 0.0, 100.0, 100.0, 0.0, 120.0, 40.0, 2...","[0.012014569714665413, 12.65832805633545, 0.01...",mnih2013/seaquest/started_at_2023-09-25_12-09-...
3,enduro,mnih2013,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.021065175533294678, 0.022660288959741592, 0...",mnih2013/enduro/started_at_2023-09-25_12-49-42...
4,seaquest,mnih2013,"[0.0, 40.0, 20.0, 20.0, 20.0, 120.0, 160.0, 20...","[0.02780219167470932, 0.027884984388947487, 0....",mnih2013/seaquest/started_at_2023-09-25_12-41-...
5,breakout,mnih2013,"[0.0, 0.0, 2.0, 0.0, 2.0, 6.0, 1.0, 1.0, 1.0, ...","[0.017719639465212822, 0.016713574528694153, 0...",mnih2013/breakout/started_at_2023-09-25_13-30-...
6,enduro,mnih2013,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.042389389127492905, 0.031015925109386444, 0...",mnih2013/enduro/started_at_2023-09-25_13-26-14...
7,breakout,mnih2013,"[1.0, 1.0, 0.0, 1.0, 5.0, 0.0, 1.0, 1.0, 0.0, ...","[0.043728306889534, 0.03957980498671532, 0.035...",mnih2013/breakout/started_at_2023-09-26_04-54-...
8,breakout,mnih2013,"[0.0, 2.0, 0.0, 0.0, 3.0, 0.0, 2.0, 0.0, 3.0, ...","[0.04034668952226639, 0.004771257750689983, 0....",mnih2013/breakout/started_at_2023-09-26_04-49-...
24,enduro,mnih2013,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.05833699181675911, 0.04303630813956261, 0.0...",mnih2013/enduro/started_at_2023-09-27_20-28-55...
25,seaquest,mnih2013,"[120.0, 40.0, 120.0, 220.0, 80.0, 40.0, 180.0,...","[0.012835360132157803, 0.012768534012138844, 1...",mnih2013/seaquest/started_at_2023-09-27_23-56-...


In [5]:
list_of_games = list(results_df["game"].unique())
list_of_games = list_of_games[:1]
list_of_algorithms = list(results_df["training_model"].unique())

print(list_of_games, list_of_algorithms)

['seaquest'] ['mnih2013', 'interpretable_cnn', 'mnih2015', 'with_huber_loss_and_adam']


In [6]:
evaluation_data_df = pd.DataFrame(columns=["game", "model", "model_id", "episode_rewards", "mean", "standard_deviation"])

In [7]:
list_of_models = list(results_df["model_path"].unique())
models_dict = {
    model_path: tf.keras.models.load_model(MODELS + model_path, compile=False)
    for model_path in list_of_models
}

  function = cls._parse_function_from_config(


## Methods to Run Evaluations

In [8]:
@tf.function(autograph=False)
def get_action_from_model(model, state):
    q_values = model(state)
    return tf.argmax(q_values, axis=1)

In [9]:
@tf.function(autograph=False)
def get_action(model, state):
    state = tf.cast(tf.convert_to_tensor(state, dtype=tf.uint8), dtype=tf.float32) / 255.0
    state = tf.expand_dims(state, axis=0)
    q_values = model(state)
    return tf.argmax(q_values, axis=1)[0]

In [10]:
def evaluate_model(game: str, model: tf.keras.Model, num_repetitions: int = 10):
    env = TrainingScript.create_env(game)
    # env = gym.wrappers.RecordVideo(env, video_folder='./video/', episode_trigger=lambda episode_id: episode_id % num_repetitions == 0)
    rewards = []
    for i in (tbar := tqdm(range(num_repetitions), leave=False)):
        state, _ = env.reset()
        done = False
        episode_reward = 0
        step = 0
        while not done:
            if random.random() < 0.05:
                # To help the agent when it gets "stuck"
                # Happens mainly in Breakout caused by a bug in the game
                action = env.action_space.sample()
            else:
                step += 1
                # state = tf.convert_to_tensor(state, dtype=tf.float32) / 255.0
                # state = tf.expand_dims(state, axis=0)
                # action = get_action_from_model(model, state)
                # action = action.numpy()[0]
                action = get_action(model, state)

            state, reward, terminated, truncated, _ = env.step(action)
            done = terminated or truncated
            episode_reward += reward

            tbar.set_description(f"Episode {i}  -  Step: {step}, Reward: {episode_reward}")

        rewards.append(episode_reward)
    return rewards

In [11]:
def load_model(path: str):
    model = tf.keras.models.load_model(MODELS + path, compile=False)
    model.compile()
    return model

In [12]:
def load_models(path_list: list[Path]) -> list:
    return [load_model(path) for path in path_list]

In [13]:
def random_play(game: str, num_repetitions: int = 5):
    env = TrainingScript.create_env(game)
    rewards = []

    for _ in tqdm(range(num_repetitions), unit="episode", desc="Random play " + game.capitalize(), leave=False):
        state, _ = env.reset()
        done = False
        episode_reward = 0
        step = 0
        while not done:
            step += 1

            state, reward, terminated, truncated, _ = env.step(env.action_space.sample())
            done = terminated or truncated
            episode_reward += reward

        rewards.append(episode_reward)

    return rewards

## Random Baseline for Each Game

In [15]:
random_baseline = []
for game_name in tqdm(results_df["game"].unique(), unit="game", desc="Calculate random baseline"):
    rewards = random_play(game_name, num_repetitions=NUM_REPETITIONS_FOR_RANDOM_BASELINE)
    random_baseline.append(
        {
            "game": game_name,
            "model": "random play",
            "model_id": 0,
            "episode_rewards": rewards,
            "mean": np.mean(rewards),
            "standard_deviation": np.std(rewards)
        }
    )

Calculate random baseline:   0%|          | 0/3 [00:00<?, ?game/s]

Random play Seaquest:   0%|          | 0/2 [00:00<?, ?episode/s]

Random play Enduro:   0%|          | 0/2 [00:00<?, ?episode/s]

Random play Breakout:   0%|          | 0/2 [00:00<?, ?episode/s]

In [16]:
random_baseline_df = pd.DataFrame.from_records(random_baseline)
random_baseline_df

Unnamed: 0,game,model,model_id,episode_rewards,mean,standard_deviation
0,seaquest,random play,0,"[100.0, 20.0]",60.0,40.0
1,enduro,random play,0,"[0.0, 0.0]",0.0,0.0
2,breakout,random play,0,"[0.0, 0.0]",0.0,0.0


## Evaluate each Model on each Game

In [17]:
evaluation_results = []
for game_name, model_name in (t := tqdm(list(itertools.product(list_of_games, list_of_algorithms)))):
    t.set_description(f"Evaluation models for {game_name.capitalize()} using {model_name.capitalize()}")
    fitting_models = list(
        results_df[(results_df["game"] == game_name) & (results_df["training_model"] == model_name)]["model_path"]
    )

    for idx, m_path in enumerate(fitting_models):
        model = models_dict[m_path]
        rewards = evaluate_model(game_name, model, num_repetitions=NUM_REPETITIONS)

        evaluation_results.append({
            "game": game_name,
            "model": model_name,
            "model_id": idx,
            "episode_rewards": rewards,
            "mean": np.mean(rewards),
            "standard_deviation": np.std(rewards),
            "model_path": m_path
        })

  0%|          | 0/4 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

In [18]:
evaluation_single_models_df = pd.DataFrame.from_records(evaluation_results)
evaluation_single_models_df.head()

Unnamed: 0,game,model,model_id,episode_rewards,mean,standard_deviation,model_path
0,seaquest,mnih2013,0,"[40.0, 40.0]",40.0,0.0,mnih2013/seaquest/started_at_2023-09-25_12-09-...
1,seaquest,mnih2013,1,"[0.0, 40.0]",20.0,20.0,mnih2013/seaquest/started_at_2023-09-25_12-41-...
2,seaquest,mnih2013,2,"[40.0, 60.0]",50.0,10.0,mnih2013/seaquest/started_at_2023-09-27_23-56-...
3,seaquest,interpretable_cnn,0,"[100.0, 20.0]",60.0,40.0,interpretable_cnn/seaquest/started_at_2023-09-...
4,seaquest,mnih2015,0,"[120.0, 140.0]",130.0,10.0,mnih2015/seaquest/started_at_2023-09-26_05-09-...


## Evaluate as Ensembles

In [19]:
models_used = set(list_of_algorithms) - {"random play", "interpretable_cnn"}

In [20]:
def evaluate_ensemble(game: str, model: Ensemble, num_repetitions: int = 10):
    env = TrainingScript.create_env(game)
    rewards = []
    for i in (tbar := tqdm(range(num_repetitions), leave=False)):
        state, _ = env.reset()
        done = False
        episode_reward = 0
        step = 0
        while not done:
            if random.random() < 0.05:
                # To help the agent when it gets "stuck"
                # Happens mainly in Breakout caused by a bug in the game
                action = env.action_space.sample()
            else:
                step += 1
                if model.ensemble_method == EnsembleMethods.MAJORITY_VOTE:
                    state = tf.convert_to_tensor(state, dtype=tf.float32) / 255.0
                    state = tf.expand_dims(state, axis=0)
                    q_values = model(state)
                    action = tf.argmax(q_values, axis=1)
                    # action = get_action_from_model(model, state)
                    action = action.numpy()[0]

                else:
                    action = get_action(model, state)

            state, reward, terminated, truncated, _ = env.step(action)
            done = terminated or truncated
            episode_reward += reward

            tbar.set_description(f"Episode {i}  -  Step: {step}, Reward: {episode_reward}")

        rewards.append(episode_reward)
    return rewards

### Uniform Ensembles consisting of the best $n$ models of a flavor:

In [21]:
evaluation_results_uniform_ensembles = []

In [22]:
for game_name, model_name in (t := tqdm(list(itertools.product(list_of_games, models_used)),
                                        desc="Build and Ensemble Models")):
    t.set_description(f"Build and Ensemble Models ({model_name.capitalize()} for {game_name})")

    # Select Top-N models:
    model_selection = evaluation_single_models_df[
                          (evaluation_single_models_df["model"] == model_name) & (evaluation_single_models_df["game"] == game_name)
                      ].sort_values(by=["mean", "standard_deviation"], ascending=[False, True])[:N_FOR_TOP_N_ENSEMBLES]
    model_paths = [m_path for m_path in list(model_selection["model_path"])]
    models = [models_dict[m_path] for m_path in model_paths]

    for idx, ensemble_method in enumerate(ENSEMBLE_METHODS_USED):
        ensemble = Ensemble(models, ensemble_method)
        t.set_description(
            f"Build and Ensemble Models ({model_name.capitalize()} for {game_name} with {ensemble_method})")

        print(f"Build and Ensemble Models ({model_name.capitalize()} for {game_name} with {ensemble_method})")
        rewards = evaluate_ensemble(game_name, ensemble, NUM_REPETITIONS)
        evaluation_results_uniform_ensembles.append({
            "game": game_name,
            "model": f"Top-{N_FOR_TOP_N_ENSEMBLES} Ensemble ({ensemble_method}) with {model_name}",
            "model_id": idx,
            "episode_rewards": rewards,
            "mean": np.mean(rewards),
            "standard_deviation": np.std(rewards),
            "model_path": model_paths
        })

Build and Ensemble Models:   0%|          | 0/3 [00:00<?, ?it/s]

Build and Ensemble Models (Mnih2013 for seaquest with average)


  0%|          | 0/2 [00:00<?, ?it/s]

Build and Ensemble Models (Mnih2013 for seaquest with logistic_average)


  0%|          | 0/2 [00:00<?, ?it/s]

Build and Ensemble Models (Mnih2013 for seaquest with majority_vote)


  0%|          | 0/2 [00:00<?, ?it/s]

Build and Ensemble Models (Mnih2015 for seaquest with average)


  0%|          | 0/2 [00:00<?, ?it/s]

Build and Ensemble Models (Mnih2015 for seaquest with logistic_average)


  0%|          | 0/2 [00:00<?, ?it/s]

Build and Ensemble Models (Mnih2015 for seaquest with majority_vote)


  0%|          | 0/2 [00:00<?, ?it/s]

Build and Ensemble Models (With_huber_loss_and_adam for seaquest with average)


  0%|          | 0/2 [00:00<?, ?it/s]

Build and Ensemble Models (With_huber_loss_and_adam for seaquest with logistic_average)


  0%|          | 0/2 [00:00<?, ?it/s]

Build and Ensemble Models (With_huber_loss_and_adam for seaquest with majority_vote)


  0%|          | 0/2 [00:00<?, ?it/s]

In [23]:
evaluation_results_uniform_ensembles_df = pd.DataFrame.from_records(evaluation_results_uniform_ensembles)
evaluation_results_uniform_ensembles_df.head()

Unnamed: 0,game,model,model_id,episode_rewards,mean,standard_deviation,model_path
0,seaquest,Top-2 Ensemble (average) with mnih2013,0,"[60.0, 120.0]",90.0,30.0,[mnih2013/seaquest/started_at_2023-09-27_23-56...
1,seaquest,Top-2 Ensemble (logistic_average) with mnih2013,1,"[40.0, 40.0]",40.0,0.0,[mnih2013/seaquest/started_at_2023-09-27_23-56...
2,seaquest,Top-2 Ensemble (majority_vote) with mnih2013,2,"[40.0, 40.0]",40.0,0.0,[mnih2013/seaquest/started_at_2023-09-27_23-56...
3,seaquest,Top-2 Ensemble (average) with mnih2015,0,"[220.0, 100.0]",160.0,60.0,[mnih2015/seaquest/started_at_2023-09-26_05-09...
4,seaquest,Top-2 Ensemble (logistic_average) with mnih2015,1,"[140.0, 80.0]",110.0,30.0,[mnih2015/seaquest/started_at_2023-09-26_05-09...


### Mixed Ensemble

One Ensemble containing the best $n$ models of each model type (e.g. 3 models of Mnih 2013, 3 models of Mnih 2015, and 3 models of Mnih 2015 with Huber loss and Adam).

In [24]:
evaluation_results_mixed_ensembles = []

In [25]:
for game_name in (t := tqdm(list_of_games)):
    t.set_description(f"Evaluate mixed Ensemble for {game_name}")
    # Select Top-N models for each model type:
    model_paths = []
    for model_name in list_of_algorithms:
        model_selection = evaluation_single_models_df[
                          (evaluation_single_models_df["model"] == model_name) & (evaluation_single_models_df["game"] == game_name)
                        ].sort_values(by=["mean", "standard_deviation"], ascending=[False, True])[:N_FOR_TOP_N_ENSEMBLES]

        model_paths.extend([m_path for m_path in list(model_selection["model_path"])])

    models = [models_dict[m_path] for m_path in model_paths]
    for ensemble_method in ENSEMBLE_METHODS_USED:
        t.set_description(f"Evaluate mixed Ensemble ({ensemble_method}) for {game_name}")
        ensemble = Ensemble(models)
        rewards = evaluate_ensemble(game_name, ensemble, num_repetitions=NUM_REPETITIONS)
        evaluation_results_mixed_ensembles.append({
            "game": game_name,
            "model": f"Top-{N_FOR_TOP_N_ENSEMBLES} Mixed Ensemble ({ensemble_method})",
            "model_id": 0,
            "episode_rewards": rewards,
            "mean": np.mean(rewards),
            "standard_deviation": np.std(rewards),
            "model_path": model_paths
        })


  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

In [26]:
evaluation_results_mixed_ensembles_df = pd.DataFrame.from_records(evaluation_results_mixed_ensembles)
evaluation_results_mixed_ensembles_df.head()

Unnamed: 0,game,model,model_id,episode_rewards,mean,standard_deviation,model_path
0,seaquest,Top-2 Mixed Ensemble (average),0,"[40.0, 160.0]",100.0,60.0,[mnih2013/seaquest/started_at_2023-09-27_23-56...
1,seaquest,Top-2 Mixed Ensemble (logistic_average),0,"[120.0, 120.0]",120.0,0.0,[mnih2013/seaquest/started_at_2023-09-27_23-56...
2,seaquest,Top-2 Mixed Ensemble (majority_vote),0,"[100.0, 140.0]",120.0,20.0,[mnih2013/seaquest/started_at_2023-09-27_23-56...


### Snapshot Ensemble

Ensembles consisting of the last $M$ training snapshots of a model. In the original paper, the snapshot get selected on the fly during training by saving models at local minima and increasing the learning rate after a model was selected. This leads to models that are more different from another and achieve higher results. This is not easily applicable for RL, so I use the $M$ newest snapshots instead.

In [27]:
evaluation_results_snapshot_ensembles = []

In [51]:
# def get_snapshots(dir) -> list:
#     if os.path.isfile(dir):
#         dir = os.path.dirname(dir)
#
#     print(dir)
#     snapshots = dict()
#     snapshot_re = re.compile(r"snapshot_(?P<idx>\d+)\.keras")
#
#     for file in Path(dir).glob("*.keras"):
#         f_name = str(file.name)
#         if snapshot_re.match(f_name):
#             snapshots[int(snapshot_re.match(f_name)["idx"])] = str(file)
#
#     snapshots[len(snapshots)] = str(Path(dir) / "snapshot_final.keras")
#     sorted_snapshots = list(zip(*sorted(list(snapshots.items()), key=lambda a: a[1])))[1]
#
#     print(sorted_snapshots)
#
#     return sorted_snapshots

def get_snapshots(models_dir) -> list:
    models_dir = models_dir[:-len("snapshot_final.keras")]

    snapshots = dict()
    snapshot_re = re.compile(r"snapshot_(?P<idx>\d+)\.keras")

    for file in Path(MODELS + models_dir).glob("*.keras"):
        f_name = str(file.name)
        if snapshot_re.match(f_name):
            snapshots[int(snapshot_re.match(f_name)["idx"])] = models_dir + f_name

    snapshots[len(snapshots)] = models_dir + "snapshot_final.keras"
    sorted_snapshots = list(zip(*sorted(list(snapshots.items()), key=lambda a: a[1])))[1]


    return sorted_snapshots

In [None]:
for game_name, model_name in (t := tqdm(list(itertools.product(list_of_games, models_used)),
                                        desc="Build Snapshot Ensemble")):
    t.set_description(f"Build Snapshot Ensemble ({model_name.capitalize()} for {game_name})")

    model_paths_for_game_and_model = \
        evaluation_single_models_df[(evaluation_single_models_df["model"] == model_name) & (evaluation_single_models_df["game"] == game_name)]["model_path"]

    for idx, unique_model_path in enumerate(model_paths_for_game_and_model.unique()):
        print(unique_model_path)
        model_paths = get_snapshots(unique_model_path)
        # Select M last snapshots:
        model_paths = model_paths[-M_FOR_SNAPSHOT_ENSEMBLES_AND_SOUPS:]
        assert len(model_paths) == M_FOR_SNAPSHOT_ENSEMBLES_AND_SOUPS
        models = load_models(model_paths)

        for idx, ensemble_method in enumerate(ENSEMBLE_METHODS_USED):
            ensemble = Ensemble(models, ensemble_method)

            rewards = evaluate_ensemble(game_name, ensemble, NUM_REPETITIONS)
            evaluation_results_snapshot_ensembles.append({
                "game": game_name,
                "model": f"{M_FOR_SNAPSHOT_ENSEMBLES_AND_SOUPS}-Snapshot Ensemble ({ensemble_method}) with {model_name}",
                "model_id": idx,
                "episode_rewards": rewards,
                "mean": np.mean(rewards),
                "standard_deviation": np.std(rewards),
                "model_path": model_paths
            })

Build Snapshot Ensemble:   0%|          | 0/3 [00:00<?, ?it/s]

mnih2013/seaquest/started_at_2023-09-25_12-09-15/snapshot_final.keras
mnih2013/seaquest/started_at_2023-09-25_12-09-15/
snapshot_0.keras
snapshot_1.keras
snapshot_10.keras
snapshot_11.keras
snapshot_12.keras
snapshot_13.keras
snapshot_14.keras
snapshot_15.keras
snapshot_16.keras
snapshot_17.keras
snapshot_18.keras
snapshot_19.keras
snapshot_2.keras
snapshot_20.keras
snapshot_21.keras
snapshot_22.keras
snapshot_23.keras
snapshot_24.keras
snapshot_25.keras
snapshot_26.keras
snapshot_27.keras
snapshot_28.keras
snapshot_29.keras
snapshot_3.keras
snapshot_30.keras
snapshot_31.keras
snapshot_32.keras
snapshot_4.keras
snapshot_5.keras
snapshot_6.keras
snapshot_7.keras
snapshot_8.keras
snapshot_9.keras
snapshot_final.keras
{0: 'mnih2013/seaquest/started_at_2023-09-25_12-09-15/snapshot_0.keras', 1: 'mnih2013/seaquest/started_at_2023-09-25_12-09-15/snapshot_1.keras', 10: 'mnih2013/seaquest/started_at_2023-09-25_12-09-15/snapshot_10.keras', 11: 'mnih2013/seaquest/started_at_2023-09-25_12-09-15/sn

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

mnih2013/seaquest/started_at_2023-09-25_12-41-34/snapshot_final.keras
mnih2013/seaquest/started_at_2023-09-25_12-41-34/
snapshot_0.keras
snapshot_1.keras
snapshot_10.keras
snapshot_11.keras
snapshot_12.keras
snapshot_13.keras
snapshot_14.keras
snapshot_15.keras
snapshot_16.keras
snapshot_17.keras
snapshot_18.keras
snapshot_19.keras
snapshot_2.keras
snapshot_20.keras
snapshot_21.keras
snapshot_22.keras
snapshot_23.keras
snapshot_24.keras
snapshot_25.keras
snapshot_26.keras
snapshot_27.keras
snapshot_28.keras
snapshot_29.keras
snapshot_3.keras
snapshot_30.keras
snapshot_31.keras
snapshot_32.keras
snapshot_4.keras
snapshot_5.keras
snapshot_6.keras
snapshot_7.keras
snapshot_8.keras
snapshot_9.keras
snapshot_final.keras
{0: 'mnih2013/seaquest/started_at_2023-09-25_12-41-34/snapshot_0.keras', 1: 'mnih2013/seaquest/started_at_2023-09-25_12-41-34/snapshot_1.keras', 10: 'mnih2013/seaquest/started_at_2023-09-25_12-41-34/snapshot_10.keras', 11: 'mnih2013/seaquest/started_at_2023-09-25_12-41-34/sn

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

mnih2013/seaquest/started_at_2023-09-27_23-56-28/snapshot_final.keras
mnih2013/seaquest/started_at_2023-09-27_23-56-28/
snapshot_0.keras
snapshot_1.keras
snapshot_10.keras
snapshot_11.keras
snapshot_12.keras
snapshot_13.keras
snapshot_14.keras
snapshot_15.keras
snapshot_16.keras
snapshot_17.keras
snapshot_18.keras
snapshot_19.keras
snapshot_2.keras
snapshot_20.keras
snapshot_21.keras
snapshot_22.keras
snapshot_23.keras
snapshot_24.keras
snapshot_25.keras
snapshot_26.keras
snapshot_27.keras
snapshot_28.keras
snapshot_29.keras
snapshot_3.keras
snapshot_30.keras
snapshot_31.keras
snapshot_32.keras
snapshot_4.keras
snapshot_5.keras
snapshot_6.keras
snapshot_7.keras
snapshot_8.keras
snapshot_9.keras
snapshot_final.keras
{0: 'mnih2013/seaquest/started_at_2023-09-27_23-56-28/snapshot_0.keras', 1: 'mnih2013/seaquest/started_at_2023-09-27_23-56-28/snapshot_1.keras', 10: 'mnih2013/seaquest/started_at_2023-09-27_23-56-28/snapshot_10.keras', 11: 'mnih2013/seaquest/started_at_2023-09-27_23-56-28/sn

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

mnih2015/seaquest/started_at_2023-09-26_05-09-11/snapshot_final.keras
mnih2015/seaquest/started_at_2023-09-26_05-09-11/
snapshot_0.keras
snapshot_1.keras
snapshot_10.keras
snapshot_11.keras
snapshot_12.keras
snapshot_13.keras
snapshot_14.keras
snapshot_15.keras
snapshot_16.keras
snapshot_17.keras
snapshot_18.keras
snapshot_19.keras
snapshot_2.keras
snapshot_20.keras
snapshot_21.keras
snapshot_22.keras
snapshot_23.keras
snapshot_24.keras
snapshot_25.keras
snapshot_26.keras
snapshot_27.keras
snapshot_28.keras
snapshot_29.keras
snapshot_3.keras
snapshot_30.keras
snapshot_31.keras
snapshot_32.keras
snapshot_4.keras
snapshot_5.keras
snapshot_6.keras
snapshot_7.keras
snapshot_8.keras
snapshot_9.keras
snapshot_final.keras
{0: 'mnih2015/seaquest/started_at_2023-09-26_05-09-11/snapshot_0.keras', 1: 'mnih2015/seaquest/started_at_2023-09-26_05-09-11/snapshot_1.keras', 10: 'mnih2015/seaquest/started_at_2023-09-26_05-09-11/snapshot_10.keras', 11: 'mnih2015/seaquest/started_at_2023-09-26_05-09-11/sn

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

mnih2015/seaquest/started_at_2023-09-26_05-31-20/snapshot_final.keras
mnih2015/seaquest/started_at_2023-09-26_05-31-20/
snapshot_0.keras
snapshot_1.keras
snapshot_10.keras
snapshot_11.keras
snapshot_12.keras
snapshot_13.keras
snapshot_14.keras
snapshot_15.keras
snapshot_16.keras
snapshot_17.keras
snapshot_18.keras
snapshot_19.keras
snapshot_2.keras
snapshot_20.keras
snapshot_21.keras
snapshot_22.keras
snapshot_23.keras
snapshot_24.keras
snapshot_25.keras
snapshot_26.keras
snapshot_27.keras
snapshot_28.keras
snapshot_29.keras
snapshot_3.keras
snapshot_30.keras
snapshot_31.keras
snapshot_32.keras
snapshot_4.keras
snapshot_5.keras
snapshot_6.keras
snapshot_7.keras
snapshot_8.keras
snapshot_9.keras
snapshot_final.keras
{0: 'mnih2015/seaquest/started_at_2023-09-26_05-31-20/snapshot_0.keras', 1: 'mnih2015/seaquest/started_at_2023-09-26_05-31-20/snapshot_1.keras', 10: 'mnih2015/seaquest/started_at_2023-09-26_05-31-20/snapshot_10.keras', 11: 'mnih2015/seaquest/started_at_2023-09-26_05-31-20/sn

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

mnih2015/seaquest/started_at_2023-09-26_06-55-14/snapshot_final.keras
mnih2015/seaquest/started_at_2023-09-26_06-55-14/
snapshot_0.keras
snapshot_1.keras
snapshot_10.keras
snapshot_11.keras
snapshot_12.keras
snapshot_13.keras
snapshot_14.keras
snapshot_15.keras
snapshot_16.keras
snapshot_17.keras
snapshot_18.keras
snapshot_19.keras
snapshot_2.keras
snapshot_20.keras
snapshot_21.keras
snapshot_22.keras
snapshot_23.keras
snapshot_24.keras
snapshot_25.keras
snapshot_26.keras
snapshot_27.keras
snapshot_28.keras
snapshot_29.keras
snapshot_3.keras
snapshot_30.keras
snapshot_31.keras
snapshot_32.keras
snapshot_4.keras
snapshot_5.keras
snapshot_6.keras
snapshot_7.keras
snapshot_8.keras
snapshot_9.keras
snapshot_final.keras
{0: 'mnih2015/seaquest/started_at_2023-09-26_06-55-14/snapshot_0.keras', 1: 'mnih2015/seaquest/started_at_2023-09-26_06-55-14/snapshot_1.keras', 10: 'mnih2015/seaquest/started_at_2023-09-26_06-55-14/snapshot_10.keras', 11: 'mnih2015/seaquest/started_at_2023-09-26_06-55-14/sn

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

with_huber_loss_and_adam/seaquest/started_at_2023-09-26_10-45-03/snapshot_final.keras
with_huber_loss_and_adam/seaquest/started_at_2023-09-26_10-45-03/
snapshot_0.keras
snapshot_1.keras
snapshot_10.keras
snapshot_11.keras
snapshot_12.keras
snapshot_13.keras
snapshot_14.keras
snapshot_15.keras
snapshot_16.keras
snapshot_17.keras
snapshot_18.keras
snapshot_19.keras
snapshot_2.keras
snapshot_20.keras
snapshot_21.keras
snapshot_22.keras
snapshot_23.keras
snapshot_24.keras
snapshot_25.keras
snapshot_26.keras
snapshot_27.keras
snapshot_28.keras
snapshot_29.keras
snapshot_3.keras
snapshot_30.keras
snapshot_31.keras
snapshot_32.keras
snapshot_4.keras
snapshot_5.keras
snapshot_6.keras
snapshot_7.keras
snapshot_8.keras
snapshot_9.keras
snapshot_final.keras
{0: 'with_huber_loss_and_adam/seaquest/started_at_2023-09-26_10-45-03/snapshot_0.keras', 1: 'with_huber_loss_and_adam/seaquest/started_at_2023-09-26_10-45-03/snapshot_1.keras', 10: 'with_huber_loss_and_adam/seaquest/started_at_2023-09-26_10-45

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

In [54]:
evaluation_results_snapshot_ensembles_df = pd.DataFrame.from_records(evaluation_results_snapshot_ensembles)
evaluation_results_snapshot_ensembles_df.head()

Unnamed: 0,game,model,model_id,episode_rewards,mean,standard_deviation,model_path
0,seaquest,3-Snapshot Ensemble (average) with mnih2013,0,"[20.0, 20.0]",20.0,0.0,(mnih2013\seaquest\started_at_2023-09-25_12-09...
1,seaquest,3-Snapshot Ensemble (logistic_average) with mn...,1,"[60.0, 60.0]",60.0,0.0,(mnih2013\seaquest\started_at_2023-09-25_12-09...
2,seaquest,3-Snapshot Ensemble (majority_vote) with mnih2013,2,"[20.0, 20.0]",20.0,0.0,(mnih2013\seaquest\started_at_2023-09-25_12-09...
3,seaquest,3-Snapshot Ensemble (average) with mnih2013,0,"[60.0, 20.0]",40.0,20.0,(mnih2013\seaquest\started_at_2023-09-25_12-41...
4,seaquest,3-Snapshot Ensemble (average) with mnih2013,0,"[80.0, 20.0]",50.0,30.0,(mnih2013/seaquest/started_at_2023-09-25_12-09...


## Evaluate as Soups

In [55]:
models_used = set(list_of_algorithms) - {"random play", "interpretable_cnn"}

### Uniform Soups created from the best $n$ models of a flavor:

In [56]:
evaluation_results_uniform_soups = []

In [57]:
from Soup import Soup

for game_name, model_name in (t := tqdm(list(itertools.product(list_of_games, models_used)))):
    t.set_description(f"Cooking the Soup Model ({model_name.capitalize()} for {game_name})")

    # Select Top-N models:
    model_selection = evaluation_single_models_df[
                          (evaluation_single_models_df["model"] == model_name) & (evaluation_single_models_df["game"] == game_name)
                        ].sort_values(by=["mean", "standard_deviation"], ascending=[False, True])[:N_FOR_TOP_N_ENSEMBLES]

    model_paths = [m_path for m_path in list(model_selection["model_path"])]
    models = [models_dict[m_path] for m_path in model_paths]

    soup = Soup(models).get_soup_model()

    rewards = evaluate_model(game_name, soup, NUM_REPETITIONS)
    evaluation_results_uniform_soups.append({
        "game": game_name,
        "model": f"Top-{N_FOR_TOP_N_ENSEMBLES} Soup of {model_name}",
        "model_id": 0,
        "episode_rewards": rewards,
        "mean": np.mean(rewards),
        "standard_deviation": np.std(rewards),
        "model_path": model_paths
    })

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

In [58]:
evaluation_results_uniform_soups_df = pd.DataFrame.from_records(evaluation_results_uniform_soups)
evaluation_results_uniform_soups_df.head()

Unnamed: 0,game,model,model_id,episode_rewards,mean,standard_deviation,model_path
0,seaquest,Top-2 Soup of mnih2013,0,"[40.0, 0.0]",20.0,20.0,[mnih2013/seaquest/started_at_2023-09-27_23-56...
1,seaquest,Top-2 Soup of mnih2015,0,"[100.0, 160.0]",130.0,30.0,[mnih2015/seaquest/started_at_2023-09-26_05-09...
2,seaquest,Top-2 Soup of with_huber_loss_and_adam,0,"[0.0, 0.0]",0.0,0.0,[with_huber_loss_and_adam/seaquest/started_at_...


### Snapshot Soup

Works like the Snapshot Ensemble, but as a Soup.

In [59]:
evaluation_results_snapshot_soups = []

In [60]:
for game_name, model_name in (t := tqdm(list(itertools.product(list_of_games, models_used)),
                                        desc="Cooking Soup Model")):
    t.set_description(f"Cooking Soup Model ({model_name.capitalize()} for {game_name})")

    model_paths_for_game_and_model = \
        results_df[(results_df["training_model"] == model_name) & (results_df["game"] == game_name)]["model_path"]

    for idx, unique_model_path in enumerate(model_paths_for_game_and_model.unique()):
        model_paths = get_snapshots(unique_model_path)
        # Select M last snapshots:
        model_paths = model_paths[-M_FOR_SNAPSHOT_ENSEMBLES_AND_SOUPS:]
        assert len(model_paths) == M_FOR_SNAPSHOT_ENSEMBLES_AND_SOUPS

        models = load_models(model_paths)

        soup = Soup(models).get_soup_model()

        rewards = evaluate_model(game_name, soup, NUM_REPETITIONS)
        evaluation_results_snapshot_soups.append({
            "game": game_name,
            "model": f"{M_FOR_SNAPSHOT_ENSEMBLES_AND_SOUPS}-Snapshot Soup {model_name}",
            "model_id": idx,
            "episode_rewards": rewards,
            "mean": np.mean(rewards),
            "standard_deviation": np.std(rewards),
            "model_path": model_paths
        })

Cooking Soup Model:   0%|          | 0/3 [00:00<?, ?it/s]

mnih2013/seaquest/started_at_2023-09-25_12-09-15/
snapshot_0.keras
snapshot_1.keras
snapshot_10.keras
snapshot_11.keras
snapshot_12.keras
snapshot_13.keras
snapshot_14.keras
snapshot_15.keras
snapshot_16.keras
snapshot_17.keras
snapshot_18.keras
snapshot_19.keras
snapshot_2.keras
snapshot_20.keras
snapshot_21.keras
snapshot_22.keras
snapshot_23.keras
snapshot_24.keras
snapshot_25.keras
snapshot_26.keras
snapshot_27.keras
snapshot_28.keras
snapshot_29.keras
snapshot_3.keras
snapshot_30.keras
snapshot_31.keras
snapshot_32.keras
snapshot_4.keras
snapshot_5.keras
snapshot_6.keras
snapshot_7.keras
snapshot_8.keras
snapshot_9.keras
snapshot_final.keras
{0: 'mnih2013/seaquest/started_at_2023-09-25_12-09-15/snapshot_0.keras', 1: 'mnih2013/seaquest/started_at_2023-09-25_12-09-15/snapshot_1.keras', 10: 'mnih2013/seaquest/started_at_2023-09-25_12-09-15/snapshot_10.keras', 11: 'mnih2013/seaquest/started_at_2023-09-25_12-09-15/snapshot_11.keras', 12: 'mnih2013/seaquest/started_at_2023-09-25_12-09-1

  function = cls._parse_function_from_config(


  0%|          | 0/2 [00:00<?, ?it/s]

mnih2013/seaquest/started_at_2023-09-25_12-41-34/
snapshot_0.keras
snapshot_1.keras
snapshot_10.keras
snapshot_11.keras
snapshot_12.keras
snapshot_13.keras
snapshot_14.keras
snapshot_15.keras
snapshot_16.keras
snapshot_17.keras
snapshot_18.keras
snapshot_19.keras
snapshot_2.keras
snapshot_20.keras
snapshot_21.keras
snapshot_22.keras
snapshot_23.keras
snapshot_24.keras
snapshot_25.keras
snapshot_26.keras
snapshot_27.keras
snapshot_28.keras
snapshot_29.keras
snapshot_3.keras
snapshot_30.keras
snapshot_31.keras
snapshot_32.keras
snapshot_4.keras
snapshot_5.keras
snapshot_6.keras
snapshot_7.keras
snapshot_8.keras
snapshot_9.keras
snapshot_final.keras
{0: 'mnih2013/seaquest/started_at_2023-09-25_12-41-34/snapshot_0.keras', 1: 'mnih2013/seaquest/started_at_2023-09-25_12-41-34/snapshot_1.keras', 10: 'mnih2013/seaquest/started_at_2023-09-25_12-41-34/snapshot_10.keras', 11: 'mnih2013/seaquest/started_at_2023-09-25_12-41-34/snapshot_11.keras', 12: 'mnih2013/seaquest/started_at_2023-09-25_12-41-3

  0%|          | 0/2 [00:00<?, ?it/s]

mnih2013/seaquest/started_at_2023-09-27_23-56-28/
snapshot_0.keras
snapshot_1.keras
snapshot_10.keras
snapshot_11.keras
snapshot_12.keras
snapshot_13.keras
snapshot_14.keras
snapshot_15.keras
snapshot_16.keras
snapshot_17.keras
snapshot_18.keras
snapshot_19.keras
snapshot_2.keras
snapshot_20.keras
snapshot_21.keras
snapshot_22.keras
snapshot_23.keras
snapshot_24.keras
snapshot_25.keras
snapshot_26.keras
snapshot_27.keras
snapshot_28.keras
snapshot_29.keras
snapshot_3.keras
snapshot_30.keras
snapshot_31.keras
snapshot_32.keras
snapshot_4.keras
snapshot_5.keras
snapshot_6.keras
snapshot_7.keras
snapshot_8.keras
snapshot_9.keras
snapshot_final.keras
{0: 'mnih2013/seaquest/started_at_2023-09-27_23-56-28/snapshot_0.keras', 1: 'mnih2013/seaquest/started_at_2023-09-27_23-56-28/snapshot_1.keras', 10: 'mnih2013/seaquest/started_at_2023-09-27_23-56-28/snapshot_10.keras', 11: 'mnih2013/seaquest/started_at_2023-09-27_23-56-28/snapshot_11.keras', 12: 'mnih2013/seaquest/started_at_2023-09-27_23-56-2

  0%|          | 0/2 [00:00<?, ?it/s]

mnih2015/seaquest/started_at_2023-09-26_05-09-11/
snapshot_0.keras
snapshot_1.keras
snapshot_10.keras
snapshot_11.keras
snapshot_12.keras
snapshot_13.keras
snapshot_14.keras
snapshot_15.keras
snapshot_16.keras
snapshot_17.keras
snapshot_18.keras
snapshot_19.keras
snapshot_2.keras
snapshot_20.keras
snapshot_21.keras
snapshot_22.keras
snapshot_23.keras
snapshot_24.keras
snapshot_25.keras
snapshot_26.keras
snapshot_27.keras
snapshot_28.keras
snapshot_29.keras
snapshot_3.keras
snapshot_30.keras
snapshot_31.keras
snapshot_32.keras
snapshot_4.keras
snapshot_5.keras
snapshot_6.keras
snapshot_7.keras
snapshot_8.keras
snapshot_9.keras
snapshot_final.keras
{0: 'mnih2015/seaquest/started_at_2023-09-26_05-09-11/snapshot_0.keras', 1: 'mnih2015/seaquest/started_at_2023-09-26_05-09-11/snapshot_1.keras', 10: 'mnih2015/seaquest/started_at_2023-09-26_05-09-11/snapshot_10.keras', 11: 'mnih2015/seaquest/started_at_2023-09-26_05-09-11/snapshot_11.keras', 12: 'mnih2015/seaquest/started_at_2023-09-26_05-09-1

  0%|          | 0/2 [00:00<?, ?it/s]

mnih2015/seaquest/started_at_2023-09-26_05-31-20/
snapshot_0.keras
snapshot_1.keras
snapshot_10.keras
snapshot_11.keras
snapshot_12.keras
snapshot_13.keras
snapshot_14.keras
snapshot_15.keras
snapshot_16.keras
snapshot_17.keras
snapshot_18.keras
snapshot_19.keras
snapshot_2.keras
snapshot_20.keras
snapshot_21.keras
snapshot_22.keras
snapshot_23.keras
snapshot_24.keras
snapshot_25.keras
snapshot_26.keras
snapshot_27.keras
snapshot_28.keras
snapshot_29.keras
snapshot_3.keras
snapshot_30.keras
snapshot_31.keras
snapshot_32.keras
snapshot_4.keras
snapshot_5.keras
snapshot_6.keras
snapshot_7.keras
snapshot_8.keras
snapshot_9.keras
snapshot_final.keras
{0: 'mnih2015/seaquest/started_at_2023-09-26_05-31-20/snapshot_0.keras', 1: 'mnih2015/seaquest/started_at_2023-09-26_05-31-20/snapshot_1.keras', 10: 'mnih2015/seaquest/started_at_2023-09-26_05-31-20/snapshot_10.keras', 11: 'mnih2015/seaquest/started_at_2023-09-26_05-31-20/snapshot_11.keras', 12: 'mnih2015/seaquest/started_at_2023-09-26_05-31-2

  0%|          | 0/2 [00:00<?, ?it/s]

mnih2015/seaquest/started_at_2023-09-26_06-55-14/
snapshot_0.keras
snapshot_1.keras
snapshot_10.keras
snapshot_11.keras
snapshot_12.keras
snapshot_13.keras
snapshot_14.keras
snapshot_15.keras
snapshot_16.keras
snapshot_17.keras
snapshot_18.keras
snapshot_19.keras
snapshot_2.keras
snapshot_20.keras
snapshot_21.keras
snapshot_22.keras
snapshot_23.keras
snapshot_24.keras
snapshot_25.keras
snapshot_26.keras
snapshot_27.keras
snapshot_28.keras
snapshot_29.keras
snapshot_3.keras
snapshot_30.keras
snapshot_31.keras
snapshot_32.keras
snapshot_4.keras
snapshot_5.keras
snapshot_6.keras
snapshot_7.keras
snapshot_8.keras
snapshot_9.keras
snapshot_final.keras
{0: 'mnih2015/seaquest/started_at_2023-09-26_06-55-14/snapshot_0.keras', 1: 'mnih2015/seaquest/started_at_2023-09-26_06-55-14/snapshot_1.keras', 10: 'mnih2015/seaquest/started_at_2023-09-26_06-55-14/snapshot_10.keras', 11: 'mnih2015/seaquest/started_at_2023-09-26_06-55-14/snapshot_11.keras', 12: 'mnih2015/seaquest/started_at_2023-09-26_06-55-1

  0%|          | 0/2 [00:00<?, ?it/s]

with_huber_loss_and_adam/seaquest/started_at_2023-09-26_10-45-03/
snapshot_0.keras
snapshot_1.keras
snapshot_10.keras
snapshot_11.keras
snapshot_12.keras
snapshot_13.keras
snapshot_14.keras
snapshot_15.keras
snapshot_16.keras
snapshot_17.keras
snapshot_18.keras
snapshot_19.keras
snapshot_2.keras
snapshot_20.keras
snapshot_21.keras
snapshot_22.keras
snapshot_23.keras
snapshot_24.keras
snapshot_25.keras
snapshot_26.keras
snapshot_27.keras
snapshot_28.keras
snapshot_29.keras
snapshot_3.keras
snapshot_30.keras
snapshot_31.keras
snapshot_32.keras
snapshot_4.keras
snapshot_5.keras
snapshot_6.keras
snapshot_7.keras
snapshot_8.keras
snapshot_9.keras
snapshot_final.keras
{0: 'with_huber_loss_and_adam/seaquest/started_at_2023-09-26_10-45-03/snapshot_0.keras', 1: 'with_huber_loss_and_adam/seaquest/started_at_2023-09-26_10-45-03/snapshot_1.keras', 10: 'with_huber_loss_and_adam/seaquest/started_at_2023-09-26_10-45-03/snapshot_10.keras', 11: 'with_huber_loss_and_adam/seaquest/started_at_2023-09-26_1

  0%|          | 0/2 [00:00<?, ?it/s]

with_huber_loss_and_adam/seaquest/started_at_2023-09-26_11-35-06/
snapshot_0.keras
snapshot_1.keras
snapshot_10.keras
snapshot_11.keras
snapshot_12.keras
snapshot_13.keras
snapshot_14.keras
snapshot_15.keras
snapshot_16.keras
snapshot_17.keras
snapshot_18.keras
snapshot_19.keras
snapshot_2.keras
snapshot_20.keras
snapshot_21.keras
snapshot_22.keras
snapshot_23.keras
snapshot_24.keras
snapshot_25.keras
snapshot_26.keras
snapshot_27.keras
snapshot_28.keras
snapshot_29.keras
snapshot_3.keras
snapshot_30.keras
snapshot_31.keras
snapshot_32.keras
snapshot_4.keras
snapshot_5.keras
snapshot_6.keras
snapshot_7.keras
snapshot_8.keras
snapshot_9.keras
snapshot_final.keras
{0: 'with_huber_loss_and_adam/seaquest/started_at_2023-09-26_11-35-06/snapshot_0.keras', 1: 'with_huber_loss_and_adam/seaquest/started_at_2023-09-26_11-35-06/snapshot_1.keras', 10: 'with_huber_loss_and_adam/seaquest/started_at_2023-09-26_11-35-06/snapshot_10.keras', 11: 'with_huber_loss_and_adam/seaquest/started_at_2023-09-26_1

  0%|          | 0/2 [00:00<?, ?it/s]

with_huber_loss_and_adam/seaquest/started_at_2023-09-26_12-05-08/
snapshot_0.keras
snapshot_1.keras
snapshot_10.keras
snapshot_11.keras
snapshot_12.keras
snapshot_13.keras
snapshot_14.keras
snapshot_15.keras
snapshot_16.keras
snapshot_17.keras
snapshot_18.keras
snapshot_19.keras
snapshot_2.keras
snapshot_20.keras
snapshot_21.keras
snapshot_22.keras
snapshot_23.keras
snapshot_24.keras
snapshot_25.keras
snapshot_26.keras
snapshot_27.keras
snapshot_28.keras
snapshot_29.keras
snapshot_3.keras
snapshot_30.keras
snapshot_31.keras
snapshot_32.keras
snapshot_4.keras
snapshot_5.keras
snapshot_6.keras
snapshot_7.keras
snapshot_8.keras
snapshot_9.keras
snapshot_final.keras
{0: 'with_huber_loss_and_adam/seaquest/started_at_2023-09-26_12-05-08/snapshot_0.keras', 1: 'with_huber_loss_and_adam/seaquest/started_at_2023-09-26_12-05-08/snapshot_1.keras', 10: 'with_huber_loss_and_adam/seaquest/started_at_2023-09-26_12-05-08/snapshot_10.keras', 11: 'with_huber_loss_and_adam/seaquest/started_at_2023-09-26_1

  0%|          | 0/2 [00:00<?, ?it/s]

KeyboardInterrupt: 

In [61]:
evaluation_results_snapshot_soups_df = pd.DataFrame.from_records(evaluation_results_snapshot_soups)
evaluation_results_snapshot_soups_df.head()

Unnamed: 0,game,model,model_id,episode_rewards,mean,standard_deviation,model_path
0,seaquest,3-Snapshot Soup mnih2013,0,"[0.0, 60.0]",30.0,30.0,(mnih2013/seaquest/started_at_2023-09-25_12-09...
1,seaquest,3-Snapshot Soup mnih2013,1,"[40.0, 40.0]",40.0,0.0,(mnih2013/seaquest/started_at_2023-09-25_12-41...
2,seaquest,3-Snapshot Soup mnih2013,2,"[40.0, 60.0]",50.0,10.0,(mnih2013/seaquest/started_at_2023-09-27_23-56...
3,seaquest,3-Snapshot Soup mnih2015,0,"[20.0, 60.0]",40.0,20.0,(mnih2015/seaquest/started_at_2023-09-26_05-09...
4,seaquest,3-Snapshot Soup mnih2015,1,"[260.0, 180.0]",220.0,40.0,(mnih2015/seaquest/started_at_2023-09-26_05-31...


## Save the Evaluation Data

In [62]:
evaluation_data_df = pd.concat([
    random_baseline_df,
    evaluation_single_models_df,
    # Ensembles:
    evaluation_results_uniform_ensembles_df,
    evaluation_results_mixed_ensembles_df,
    evaluation_results_snapshot_ensembles_df,
    # Soups:
    evaluation_results_uniform_soups_df,
    evaluation_results_snapshot_soups_df
])
evaluation_data_df.reset_index(drop=True, inplace=True)
evaluation_data_df.head()

Unnamed: 0,game,model,model_id,episode_rewards,mean,standard_deviation,model_path
0,seaquest,random play,0,"[100.0, 20.0]",60.0,40.0,
1,enduro,random play,0,"[0.0, 0.0]",0.0,0.0,
2,breakout,random play,0,"[0.0, 0.0]",0.0,0.0,
3,seaquest,mnih2013,0,"[40.0, 40.0]",40.0,0.0,mnih2013/seaquest/started_at_2023-09-25_12-09-...
4,seaquest,mnih2013,1,"[0.0, 40.0]",20.0,20.0,mnih2013/seaquest/started_at_2023-09-25_12-41-...


In [63]:
evaluation_data_df.to_csv(EVALUATION_RESULTS_CSV, index=False)