# Calculating and Visalizing Significance Tests

Creates "Critical Difference" diagrams (cd diagrams) for the results of the experiments that use Wilcoxon signed-rank test for significance testing. Based on https://github.com/hfawaz/cd-diagram by Hassan Fawaz.

In [1]:
import shutil
from ast import literal_eval

import pandas as pd

Load the results from the csv file and reformat them to the format required by cd-diagram scripts.

In [2]:
RESULTS_CSV = "./../../results.csv"
EVALUATION_RESULTS_CSV = "./../../evaluation_results.csv"

IMAGES_SAVE_FOLDER = "./../../latex/cdd"

In [3]:
USE_CDD_LABELS = False

In [4]:
evaluation_data_df = pd.read_csv(EVALUATION_RESULTS_CSV)
evaluation_data_df["episode_rewards"] = evaluation_data_df["episode_rewards"].apply(literal_eval)
evaluation_data_df.rename(columns={"mean": "rewards"}, inplace=True)

# create a new column for the mean of reward grouped by model and game
evaluation_data_df["mean"] = evaluation_data_df.groupby(["model", "game"])["rewards"].transform("mean")

In [5]:
naming_scheme = {
    "random play": "Random Play",
    "interpretable_cnn": "Interpretable CNN",
    "mnih2013": "Mnih et al. (2013)",
    "mnih2015": "Mnih et al. (2015)",
    "with_huber_loss_and_adam": "Mnih et al. (2015) with Huber Loss and Adam",
    "Top-3 Soup of mnih2013": "Top-3 Soup of Mnih et al. (2013)",
    "Top-3 Soup of mnih2015": "Top-3 Soup of Mnih et al. (2015)",
    "Top-3 Soup of with_huber_loss_and_adam": "Top-3 Soup of Mnih et al. (2015) with Huber Loss and Adam",
    "Top-3 Mixed Ensemble (average)": "Top-3 Mixed Ensemble (average)",
    "Top-3 Mixed Ensemble (logistic_average)": "Top-3 Mixed Ensemble (logistic average)",
    "Top-3 Mixed Ensemble (majority_vote)": "Top-3 Mixed Ensemble (majority vote)",
    "Top-3 Ensemble (average) with mnih2013": "Top-3 Ensemble (average) with Mnih et al. (2013)",
    "Top-3 Ensemble (average) with mnih2015": "Top-3 Ensemble (average) with Mnih et al. (2015)",
    "Top-3 Ensemble (average) with with_huber_loss_and_adam": "Top-3 Ensemble (average) with Mnih et al. (2015) with Huber Loss and Adam",
    "Top-3 Ensemble (logistic_average) with mnih2013": "Top-3 Ensemble (logistic average) with Mnih et al. (2013)",
    "Top-3 Ensemble (logistic_average) with mnih2015": "Top-3 Ensemble (logistic average) with Mnih et al. (2015)",
    "Top-3 Ensemble (logistic_average) with with_huber_loss_and_adam": "Top-3 Ensemble (logistic average) with Mnih et al. (2015) with Huber Loss and Adam",
    "Top-3 Ensemble (majority_vote) with mnih2013": "Top-3 Ensemble (majority vote) with Mnih et al. (2013)",
    "Top-3 Ensemble (majority_vote) with mnih2015": "Top-3 Ensemble (majority vote) with Mnih et al. (2015)",
    "Top-3 Ensemble (majority_vote) with with_huber_loss_and_adam": "Top-3 Ensemble (majority vote) with Mnih et al. (2015) with Huber Loss and Adam",
    "3-Snapshot Ensemble (average) with mnih2013": "3-Snapshot Ensemble (average) with Mnih et al. (2013)",
    "3-Snapshot Ensemble (average) with mnih2015": "3-Snapshot Ensemble (average) with Mnih et al. (2015)",
    "3-Snapshot Ensemble (average) with with_huber_loss_and_adam": "3-Snapshot Ensemble (average) with Mnih et al. (2015) with Huber Loss and Adam",
    "3-Snapshot Ensemble (logistic_average) with mnih2013": "3-Snapshot Ensemble (logistic average) with Mnih et al. (2013)",
    "3-Snapshot Ensemble (logistic_average) with mnih2015": "3-Snapshot Ensemble (logistic average) with Mnih et al. (2015)",
    "3-Snapshot Ensemble (logistic_average) with with_huber_loss_and_adam": "3-Snapshot Ensemble (logistic average) with Mnih et al. (2015) with Huber Loss and Adam",
    "3-Snapshot Ensemble (majority_vote) with mnih2013": "3-Snapshot Ensemble (majority vote) with Mnih et al. (2013)",
    "3-Snapshot Ensemble (majority_vote) with mnih2015": "3-Snapshot Ensemble (majority vote) with Mnih et al. (2015)",
    "3-Snapshot Ensemble (majority_vote) with with_huber_loss_and_adam": "3-Snapshot Ensemble (majority vote) with Mnih et al. (2015) with Huber Loss and Adam",
    "3-Snapshot Soup mnih2013": "3-Snapshot Soup of Mnih et al. (2013)",
    "3-Snapshot Soup mnih2015": "3-Snapshot Soup of Mnih et al. (2015)",
    "3-Snapshot Soup with_huber_loss_and_adam": "3-Snapshot Soup of Mnih et al. (2015) with Huber Loss and Adam",
}
evaluation_data_df["model"] = evaluation_data_df["model"].apply(lambda x: naming_scheme[x])

In [6]:
evaluation_data_df_for_cdd = evaluation_data_df[["model", "game", "mean"]]
evaluation_data_df_for_cdd.drop_duplicates(inplace=True)
evaluation_data_df_for_cdd = evaluation_data_df_for_cdd.rename(columns={"mean": "accuracy", "model": "classifier_name", "game": "dataset_name"}, inplace=False)
# evaluation_data_df_exploded = evaluation_data_df_exploded[evaluation_data_df_exploded["classifier_name"] != "Interpretable CNN"]
evaluation_data_df_for_cdd = evaluation_data_df_for_cdd[evaluation_data_df_for_cdd["classifier_name"] != "Interpretable CNN"]
evaluation_data_df_for_cdd.reset_index(inplace=True, drop=True)
evaluation_data_df_for_cdd

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  evaluation_data_df_for_cdd.drop_duplicates(inplace=True)


Unnamed: 0,classifier_name,dataset_name,accuracy
0,Random Play,seaquest,81.840000
1,Random Play,enduro,0.000000
2,Random Play,breakout,1.332000
3,Mnih et al. (2013),seaquest,55.333333
4,Mnih et al. (2015),seaquest,98.000000
...,...,...,...
88,3-Snapshot Soup of Mnih et al. (2013),enduro,0.000000
89,3-Snapshot Soup of Mnih et al. (2015),enduro,0.000000
90,3-Snapshot Soup of Mnih et al. (2015) with Hub...,breakout,1.433333
91,3-Snapshot Soup of Mnih et al. (2013),breakout,1.366667


In [7]:
# Normalize the accuracy per game to [0, 1]
evaluation_data_df_for_cdd["accuracy"] = evaluation_data_df_for_cdd.groupby("dataset_name")["accuracy"].transform(lambda x: (x - x.min()) / (x.max() - x.min()))
evaluation_data_df_for_cdd.reset_index(inplace=True, drop=True)

## Create the cd-diagrams

Loads the code by Hassan Fawaz et al. and creates the cd-diagram. I slightly modified the code to make it work with the data we have and save the diagram as a pdf file.

In [8]:
import CreateCDDiagram as cdd

In [9]:
def rename_cdd_to(new_name: str):
    return  shutil.copy("./cd-diagram.pdf", f"{IMAGES_SAVE_FOLDER}/cdd_{new_name}.pdf")

### All models

In [10]:
cdd.draw_cd_diagram(df_perf=evaluation_data_df_for_cdd, title='All Models', labels=USE_CDD_LABELS)
rename_cdd_to("all_models")



['Random Play' 'Mnih et al. (2013)' 'Mnih et al. (2015)'
 'Mnih et al. (2015) with Huber Loss and Adam'
 'Top-3 Ensemble (average) with Mnih et al. (2015) with Huber Loss and Adam'
 'Top-3 Ensemble (logistic average) with Mnih et al. (2015) with Huber Loss and Adam'
 'Top-3 Ensemble (majority vote) with Mnih et al. (2015) with Huber Loss and Adam'
 'Top-3 Ensemble (average) with Mnih et al. (2013)'
 'Top-3 Ensemble (logistic average) with Mnih et al. (2013)'
 'Top-3 Ensemble (majority vote) with Mnih et al. (2013)'
 'Top-3 Ensemble (average) with Mnih et al. (2015)'
 'Top-3 Ensemble (logistic average) with Mnih et al. (2015)'
 'Top-3 Ensemble (majority vote) with Mnih et al. (2015)'
 'Top-3 Mixed Ensemble (average)'
 'Top-3 Mixed Ensemble (logistic average)'
 'Top-3 Mixed Ensemble (majority vote)'
 '3-Snapshot Ensemble (average) with Mnih et al. (2015) with Huber Loss and Adam'
 '3-Snapshot Ensemble (logistic average) with Mnih et al. (2015) with Huber Loss and Adam'
 '3-Snapshot Ensem

ValueError: zero_method 'wilcox' and 'pratt' do not work if x - y is zero for all elements.

### Ensembles Only

In [11]:
all_ensembles = evaluation_data_df_for_cdd[
        evaluation_data_df_for_cdd["classifier_name"].str.contains("Ensemble")
    ].dropna()
all_ensembles

Unnamed: 0,classifier_name,dataset_name,accuracy
12,Top-3 Ensemble (average) with Mnih et al. (201...,seaquest,0.313589
13,Top-3 Ensemble (logistic average) with Mnih et...,seaquest,0.271777
14,Top-3 Ensemble (majority vote) with Mnih et al...,seaquest,0.209059
15,Top-3 Ensemble (average) with Mnih et al. (2013),seaquest,0.188153
16,Top-3 Ensemble (logistic average) with Mnih et...,seaquest,0.125436
...,...,...,...
70,3-Snapshot Ensemble (logistic average) with Mn...,breakout,0.182796
71,3-Snapshot Ensemble (majority vote) with Mnih ...,breakout,0.172043
72,3-Snapshot Ensemble (average) with Mnih et al....,breakout,0.086022
73,3-Snapshot Ensemble (logistic average) with Mn...,breakout,0.129032


In [12]:
cdd.draw_cd_diagram(df_perf=all_ensembles, title='All Ensembles', labels=USE_CDD_LABELS)
rename_cdd_to("all_ensembles")

['Top-3 Ensemble (average) with Mnih et al. (2015) with Huber Loss and Adam'
 'Top-3 Ensemble (logistic average) with Mnih et al. (2015) with Huber Loss and Adam'
 'Top-3 Ensemble (majority vote) with Mnih et al. (2015) with Huber Loss and Adam'
 'Top-3 Ensemble (average) with Mnih et al. (2013)'
 'Top-3 Ensemble (logistic average) with Mnih et al. (2013)'
 'Top-3 Ensemble (majority vote) with Mnih et al. (2013)'
 'Top-3 Ensemble (average) with Mnih et al. (2015)'
 'Top-3 Ensemble (logistic average) with Mnih et al. (2015)'
 'Top-3 Ensemble (majority vote) with Mnih et al. (2015)'
 'Top-3 Mixed Ensemble (average)'
 'Top-3 Mixed Ensemble (logistic average)'
 'Top-3 Mixed Ensemble (majority vote)'
 '3-Snapshot Ensemble (average) with Mnih et al. (2015) with Huber Loss and Adam'
 '3-Snapshot Ensemble (logistic average) with Mnih et al. (2015) with Huber Loss and Adam'
 '3-Snapshot Ensemble (majority vote) with Mnih et al. (2015) with Huber Loss and Adam'
 '3-Snapshot Ensemble (average) wi



'./../../latex/cdd/cdd_all_ensembles.pdf'

In [13]:
all_soups = evaluation_data_df_for_cdd[
        evaluation_data_df_for_cdd["classifier_name"].str.contains("Soup")
    ].dropna()
all_soups

Unnamed: 0,classifier_name,dataset_name,accuracy
75,Top-3 Soup of Mnih et al. (2015) with Huber Lo...,seaquest,0.188153
76,Top-3 Soup of Mnih et al. (2013),seaquest,0.198606
77,Top-3 Soup of Mnih et al. (2015),seaquest,0.271777
78,Top-3 Soup of Mnih et al. (2015) with Huber Lo...,enduro,0.0
79,Top-3 Soup of Mnih et al. (2013),enduro,0.0
80,Top-3 Soup of Mnih et al. (2015),enduro,0.0
81,Top-3 Soup of Mnih et al. (2015) with Huber Lo...,breakout,1.0
82,Top-3 Soup of Mnih et al. (2013),breakout,0.419355
83,Top-3 Soup of Mnih et al. (2015),breakout,0.064516
84,3-Snapshot Soup of Mnih et al. (2015) with Hub...,seaquest,0.212544


In [14]:
cdd.draw_cd_diagram(df_perf=all_soups, title='All Soups', labels=USE_CDD_LABELS)
rename_cdd_to("all_soups")

['Top-3 Soup of Mnih et al. (2015) with Huber Loss and Adam'
 'Top-3 Soup of Mnih et al. (2013)' 'Top-3 Soup of Mnih et al. (2015)'
 '3-Snapshot Soup of Mnih et al. (2015) with Huber Loss and Adam'
 '3-Snapshot Soup of Mnih et al. (2013)'
 '3-Snapshot Soup of Mnih et al. (2015)']
3-Snapshot Soup of Mnih et al. (2013)                             0.0
3-Snapshot Soup of Mnih et al. (2015)                             1.0
3-Snapshot Soup of Mnih et al. (2015) with Huber Loss and Adam    0.0
Top-3 Soup of Mnih et al. (2013)                                  0.0
Top-3 Soup of Mnih et al. (2015)                                  0.0
Top-3 Soup of Mnih et al. (2015) with Huber Loss and Adam         1.0
dtype: float64
Top-3 Soup of Mnih et al. (2015)                                  3.833333
3-Snapshot Soup of Mnih et al. (2013)                             3.500000
3-Snapshot Soup of Mnih et al. (2015) with Huber Loss and Adam    3.500000
Top-3 Soup of Mnih et al. (2013)                           



'./../../latex/cdd/cdd_all_soups.pdf'

### Best Models

In [15]:
best_models_per_method_list = []
for model_name in ["2013", "2015\)$", "Huber Loss and Adam"]:
    print("Model:", model_name)

    # Find best ensemble for each model type:
    best_ensembles = evaluation_data_df_for_cdd[
        evaluation_data_df_for_cdd["classifier_name"].str.contains(model_name) &
        evaluation_data_df_for_cdd["classifier_name"].str.contains("Ensemble")
    ].dropna()
    if len(best_ensembles) > 0:
        best_ensembles = best_ensembles.sort_values(by=["accuracy"], ascending=False, inplace=False)
        print(f"Best Ensemble for {model_name}: {best_ensembles.iloc[0]['classifier_name']}")
        temp_selection = best_ensembles[best_ensembles["classifier_name"] == best_ensembles.iloc[0]["classifier_name"]]
        best_models_per_method_list.append(temp_selection)

    # Find best soup for each model type:
    best_soups = evaluation_data_df_for_cdd[
        evaluation_data_df_for_cdd["classifier_name"].str.contains(model_name) &
        evaluation_data_df_for_cdd["classifier_name"].str.contains("Soup")
    ].dropna()
    if len(best_soups) > 0:
        best_soups = best_soups.sort_values(by=["accuracy"], ascending=False, inplace=False)
        print(f"Best Soup for {model_name}: {best_soups.iloc[0]['classifier_name']}")
        temp_selection = best_soups[best_soups["classifier_name"] == best_soups.iloc[0]["classifier_name"]]
        best_models_per_method_list.append(temp_selection)


# Find best mixed ensemble:
best_mixed_ensemble = evaluation_data_df_for_cdd[
    evaluation_data_df_for_cdd["classifier_name"].str.contains("Mixed")
].dropna()
if len(best_mixed_ensemble) > 0:
    best_mixed_ensemble = best_mixed_ensemble.sort_values(by=["accuracy"], ascending=False, inplace=False)
    print(f"Best Mixed Ensemble: {best_mixed_ensemble.iloc[0]['classifier_name']}")
    temp_selection = best_mixed_ensemble[best_mixed_ensemble["classifier_name"] == best_mixed_ensemble.iloc[0]["classifier_name"]]
    best_models_per_method_list.append(temp_selection)

selected_data = pd.concat(best_models_per_method_list)
selected_data = pd.concat([
    selected_data,
    evaluation_data_df_for_cdd[
        (~evaluation_data_df_for_cdd["classifier_name"].str.contains("Ensemble")) &
        (~evaluation_data_df_for_cdd["classifier_name"].str.contains("Soup")) &
        (~evaluation_data_df_for_cdd["classifier_name"].str.contains("Interpretable CNN")) &
        (~evaluation_data_df_for_cdd["classifier_name"].str.contains("Random Play"))
    ],
])

selected_data.reset_index(inplace=True, drop=True)
selected_data

Model: 2013
Best Ensemble for 2013: Top-3 Ensemble (average) with Mnih et al. (2013)
Best Soup for 2013: Top-3 Soup of Mnih et al. (2013)
Model: 2015\)$
Best Ensemble for 2015\)$: Top-3 Ensemble (logistic average) with Mnih et al. (2015)
Best Soup for 2015\)$: 3-Snapshot Soup of Mnih et al. (2015)
Model: Huber Loss and Adam
Best Ensemble for Huber Loss and Adam: 3-Snapshot Ensemble (majority vote) with Mnih et al. (2015) with Huber Loss and Adam
Best Soup for Huber Loss and Adam: Top-3 Soup of Mnih et al. (2015) with Huber Loss and Adam
Best Mixed Ensemble: Top-3 Mixed Ensemble (average)


Unnamed: 0,classifier_name,dataset_name,accuracy
0,Top-3 Ensemble (average) with Mnih et al. (2013),enduro,0.922764
1,Top-3 Ensemble (average) with Mnih et al. (2013),breakout,0.258065
2,Top-3 Ensemble (average) with Mnih et al. (2013),seaquest,0.188153
3,Top-3 Soup of Mnih et al. (2013),breakout,0.419355
4,Top-3 Soup of Mnih et al. (2013),seaquest,0.198606
5,Top-3 Soup of Mnih et al. (2013),enduro,0.0
6,Top-3 Ensemble (logistic average) with Mnih et...,seaquest,0.794425
7,Top-3 Ensemble (logistic average) with Mnih et...,breakout,0.129032
8,Top-3 Ensemble (logistic average) with Mnih et...,enduro,0.0
9,3-Snapshot Soup of Mnih et al. (2015),seaquest,1.0


In [16]:
cdd.draw_cd_diagram(df_perf=selected_data, title='Selection of Best Models', labels=USE_CDD_LABELS)
rename_cdd_to("selection_of_best_models")

['Top-3 Ensemble (average) with Mnih et al. (2013)'
 'Top-3 Soup of Mnih et al. (2013)'
 'Top-3 Ensemble (logistic average) with Mnih et al. (2015)'
 '3-Snapshot Soup of Mnih et al. (2015)'
 '3-Snapshot Ensemble (majority vote) with Mnih et al. (2015) with Huber Loss and Adam'
 'Top-3 Soup of Mnih et al. (2015) with Huber Loss and Adam'
 'Top-3 Mixed Ensemble (average)' 'Mnih et al. (2013)'
 'Mnih et al. (2015)' 'Mnih et al. (2015) with Huber Loss and Adam']
3-Snapshot Ensemble (majority vote) with Mnih et al. (2015) with Huber Loss and Adam    0.0
3-Snapshot Soup of Mnih et al. (2015)                                                   1.0
Mnih et al. (2013)                                                                      0.0
Mnih et al. (2015)                                                                      0.0
Mnih et al. (2015) with Huber Loss and Adam                                             0.0
Top-3 Ensemble (average) with Mnih et al. (2013)                             



'./../../latex/cdd/cdd_selection_of_best_models.pdf'

## Compare Ensembles with Soups and with Individual Models

Artificial "datasets" are created based on the model architecture and the game. This is necessary because of how the CDD library is implemented. This way the Mnih2013 model can be compared to the Mnih2013 ensemble, for example.

In [17]:
def create_artificial_dataset_name(model_name: str, game_name: str):
    if "huber loss" in model_name.lower():
        short = "with_huber_loss_and_adam"
    elif "2015" in model_name:
        short = "mnih2015"
    elif "2013" in model_name:
        short = "mnih2013"
    else:
        raise ValueError(f"Unknown model name: {model_name}")

    return f"{short} on {game_name}"

In [18]:
eval_df_temp = evaluation_data_df_for_cdd.copy()
eval_df_temp = eval_df_temp[(eval_df_temp["classifier_name"] != "Interpretable CNN") & (eval_df_temp["classifier_name"] != "Random Play") & (~eval_df_temp["classifier_name"].str.contains("Mixed"))]

In [19]:
eval_df_temp["artificial_dataset_name"] = eval_df_temp.apply(lambda row: create_artificial_dataset_name(row["classifier_name"], row["dataset_name"]), axis=1)
eval_df_temp

Unnamed: 0,classifier_name,dataset_name,accuracy,artificial_dataset_name
3,Mnih et al. (2013),seaquest,0.268293,mnih2013 on seaquest
4,Mnih et al. (2015),seaquest,0.491289,mnih2015 on seaquest
5,Mnih et al. (2015) with Huber Loss and Adam,seaquest,0.209059,with_huber_loss_and_adam on seaquest
6,Mnih et al. (2013),enduro,0.345528,mnih2013 on enduro
7,Mnih et al. (2015),enduro,0.000000,mnih2015 on enduro
...,...,...,...,...
88,3-Snapshot Soup of Mnih et al. (2013),enduro,0.000000,mnih2013 on enduro
89,3-Snapshot Soup of Mnih et al. (2015),enduro,0.000000,mnih2015 on enduro
90,3-Snapshot Soup of Mnih et al. (2015) with Hub...,breakout,0.301075,with_huber_loss_and_adam on breakout
91,3-Snapshot Soup of Mnih et al. (2013),breakout,0.279570,mnih2013 on breakout


In [20]:
# Rename classifiers to either Soup, Ensemble, or Individual Model
eval_df_temp["classifier_name"] = eval_df_temp["classifier_name"].apply(lambda x: "Soup" if "Soup" in x else ("Ensemble" if "Ensemble" in x else "Individual Model"))
eval_df_temp["accuracy"] = eval_df_temp.groupby(["artificial_dataset_name", "classifier_name"])["accuracy"].transform("mean")
eval_df_temp["dataset_name"] = eval_df_temp["artificial_dataset_name"]
eval_df_temp.drop(columns=["artificial_dataset_name"], inplace=True)
eval_df_temp.drop_duplicates(inplace=True)
eval_df_temp

Unnamed: 0,classifier_name,dataset_name,accuracy
3,Individual Model,mnih2013 on seaquest,0.268293
4,Individual Model,mnih2015 on seaquest,0.491289
5,Individual Model,with_huber_loss_and_adam on seaquest,0.209059
6,Individual Model,mnih2013 on enduro,0.345528
7,Individual Model,mnih2015 on enduro,0.0
8,Individual Model,with_huber_loss_and_adam on enduro,0.0
9,Individual Model,mnih2013 on breakout,0.376344
10,Individual Model,mnih2015 on breakout,0.333333
11,Individual Model,with_huber_loss_and_adam on breakout,0.537634
12,Ensemble,with_huber_loss_and_adam on seaquest,0.200929


In [21]:
eval_df_temp2 = eval_df_temp.copy()
eval_df_temp2 = eval_df_temp2[~eval_df_temp2["dataset_name"].str.contains("enduro")]

In [22]:
cdd.draw_cd_diagram(df_perf=eval_df_temp, title='Comparing Ensembles, Soups, and Individual Models', labels=USE_CDD_LABELS)
rename_cdd_to("Ensembles_vs_Soups_vs_Individual_Models")

['Individual Model' 'Ensemble' 'Soup']
Ensemble            1.0
Individual Model    5.0
Soup                2.0
dtype: float64
Ensemble            2.333333
Soup                2.166667
Individual Model    1.500000
dtype: float64
('Ensemble', 'Individual Model', 0.23531375547707434, False)
('Individual Model', 'Soup', 0.6325851216960414, False)
('Ensemble', 'Soup', 0.721814730439508, False)
Index(['Ensemble', 'Soup', 'Individual Model'], dtype='object')
[0, 1, 2]




'./../../latex/cdd/cdd_Ensembles_vs_Soups_vs_Individual_Models.pdf'

## Random Play vs. Mixed Ensembles

In [23]:
selected_data = pd.concat([
    evaluation_data_df_for_cdd[evaluation_data_df_for_cdd["classifier_name"] == "Random Play"],
    evaluation_data_df_for_cdd[evaluation_data_df_for_cdd["classifier_name"].str.contains("Mixed")],
])

selected_data.reset_index(inplace=True, drop=True)
selected_data

Unnamed: 0,classifier_name,dataset_name,accuracy
0,Random Play,seaquest,0.406829
1,Random Play,enduro,0.0
2,Random Play,breakout,0.268387
3,Top-3 Mixed Ensemble (average),seaquest,0.585366
4,Top-3 Mixed Ensemble (logistic average),seaquest,0.574913
5,Top-3 Mixed Ensemble (majority vote),seaquest,0.825784
6,Top-3 Mixed Ensemble (average),enduro,1.0
7,Top-3 Mixed Ensemble (logistic average),enduro,0.735772
8,Top-3 Mixed Ensemble (majority vote),enduro,0.845528
9,Top-3 Mixed Ensemble (average),breakout,0.193548


In [24]:
cdd.draw_cd_diagram(df_perf=selected_data, title='Random Play vs. Mixed Ensembles', labels=USE_CDD_LABELS)
rename_cdd_to("Random_Play_vs_Mixed_Ensembles")

['Random Play' 'Top-3 Mixed Ensemble (average)'
 'Top-3 Mixed Ensemble (logistic average)'
 'Top-3 Mixed Ensemble (majority vote)']
Random Play                                0.0
Top-3 Mixed Ensemble (average)             1.0
Top-3 Mixed Ensemble (logistic average)    1.0
Top-3 Mixed Ensemble (majority vote)       1.0
dtype: float64
Random Play                                3.666667
Top-3 Mixed Ensemble (average)             2.333333
Top-3 Mixed Ensemble (logistic average)    2.333333
Top-3 Mixed Ensemble (majority vote)       1.666667
dtype: float64
('Random Play', 'Top-3 Mixed Ensemble (logistic average)', 0.25, False)
('Random Play', 'Top-3 Mixed Ensemble (majority vote)', 0.25, False)
('Random Play', 'Top-3 Mixed Ensemble (average)', 0.5, False)
('Top-3 Mixed Ensemble (average)', 'Top-3 Mixed Ensemble (majority vote)', 0.75, False)
('Top-3 Mixed Ensemble (average)', 'Top-3 Mixed Ensemble (logistic average)', 1.0, False)
('Top-3 Mixed Ensemble (logistic average)', 'Top-3 Mixed Ense

'./../../latex/cdd/cdd_Random_Play_vs_Mixed_Ensembles.pdf'