In [6]:
import os

import attrs
import numpy as np
import pandas as pd

In [7]:
@attrs.define
class RunReader:
    """Read the summary metrics and the model tag stored in one run directory.

    Only ``path`` is accepted by the constructor. The other fields are filled
    in by ``__attrs_post_init__`` from files below ``path``:

    * ``metrics/score_mean`` -> ``score_mean``
    * ``metrics/score_std``  -> ``score_std``
    * ``tags/model``         -> ``name``

    A file that does not exist leaves the field at its default
    (``np.nan`` for the metrics, ``""`` for the name).
    """

    path: str = attrs.field()
    score_mean: float = attrs.field(init=False)
    score_std: float = attrs.field(init=False)
    name: str = attrs.field(init=False)

    def __attrs_post_init__(self):
        """Populate the metric and name fields from the files under ``path``."""
        for metric in ("score_mean", "score_std"):
            self._set_content(
                metric,
                os.path.join(self.path, "metrics", metric),
                self._read_metric,
            )
        self._set_content(
            "name",
            os.path.join(self.path, "tags", "model"),
            lambda f: f.read(),
            "",
        )

    @staticmethod
    def _read_metric(f):
        """Return the second space-separated token of the file as a float.

        NOTE(review): presumably the files follow MLflow's
        ``timestamp value step`` layout -- confirm.
        """
        return float(f.read().split(" ")[1])

    def _set_content(self, attribute, path, func, default=np.nan):
        """Set ``attribute`` to ``func(open file)``, or to ``default`` if the file is absent.

        Parameters
        ----------
        attribute : name of the field to assign on ``self``.
        path : file to open in text mode.
        func : callable receiving the open file object and returning the value.
        default : value used when the file cannot be found.
        """
        try:
            with open(path, "r") as f:
                setattr(self, attribute, func(f))
        except (FileNotFoundError, NotADirectoryError):
            # NotADirectoryError is raised on POSIX when a component of
            # ``path`` (e.g. ``self.path`` itself) is a regular file; treat it
            # like a missing file instead of aborting the whole load.
            setattr(self, attribute, default)

    def dict(self):
        """Return ``name``, ``score_mean`` and ``score_std`` as a plain dict."""
        return {
            "name": self.name,
            "score_mean": self.score_mean,
            "score_std": self.score_std,
        }

In [8]:
@attrs.define
class ExperimentReader:
    """Collect the runs stored below one experiment directory.

    ``path`` must contain a ``meta.yaml`` file; ``dataset`` is taken from its
    ``name:`` entry. Every sub-directory of ``path`` is loaded as a
    ``RunReader`` and stored in ``runs``.
    """

    path: str = attrs.field()
    dataset: str = attrs.field(init=False)
    runs: list[RunReader] = attrs.field(init=False)

    def __attrs_post_init__(self):
        """Read the dataset name from ``meta.yaml`` and load all run directories."""
        with open(os.path.join(self.path, "meta.yaml"), "r") as f:
            self.dataset = self._parse_dataset(f.readlines())
        # Only directories can hold a run; regular files (meta.yaml and any
        # stray file) are skipped. Sorting makes the run order deterministic,
        # since os.listdir returns entries in arbitrary order.
        entries = (
            os.path.join(self.path, entry)
            for entry in sorted(os.listdir(self.path))
        )
        self.runs = [
            RunReader(entry) for entry in entries if os.path.isdir(entry)
        ]

    @staticmethod
    def _parse_dataset(lines):
        """Extract the experiment name from the lines of ``meta.yaml``.

        Prefers the line starting with ``name:`` so the result does not depend
        on key order and keeps names that contain spaces. Falls back to the
        second space-separated token of the last line when no such line exists.
        """
        for line in lines:
            if line.startswith("name:"):
                return line.split(":", 1)[1].strip()
        return lines[-1].split(" ")[1].strip()

    def table(self):
        """Return one row per run with columns ``name``, ``score_mean``, ``score_std``."""
        # Explicit columns keep the frame well-formed (and groupable by
        # "name") even when the experiment contains no runs.
        return pd.DataFrame(
            [run.dict() for run in self.runs],
            columns=["name", "score_mean", "score_std"],
        )

    def means(self):
        """Return the highest ``score_mean`` per model name."""
        return self.table().groupby("name").score_mean.max()

    def stds(self):
        """Return the lowest ``score_std`` per model name.

        NOTE(review): this minimum is taken independently of ``means()``, so
        the value may come from a different run than the one with the highest
        ``score_mean`` -- confirm that this pairing is intended.
        """
        return self.table().groupby("name").score_std.min()

    def names(self):
        """Return the set of distinct model names found in the runs."""
        return {run.name for run in self.runs}

In [9]:
# Load one experiment directory and list the distinct model names of its runs.
experiment = ExperimentReader(path="mlruns/100940072706001416")
experiment.names()

{'DecisionTreeRegressor',
 'ExplainableBoostingRegressor',
 'SparseAdditiveBoostingRegressor',
 'XGBRegressor',
 'randomforestregressor',
 'ridgecv'}

In [10]:
# Dataset name that ExperimentReader parsed from the experiment's meta.yaml.
experiment.dataset

'657_fri_c2_250_10'

In [11]:
# Datasets to include; this list also fixes the row order of the tables.
datasets = ['215_2dplanes', '344_mv', '562_cpu_small', '197_cpu_act', '294_satellite_image', '573_cpu_act', '227_cpu_small', '564_fried', '201_pol']

# os.listdir returns entries in arbitrary order, so skipping non-experiment
# entries by position is unreliable. Keep exactly the directories that hold
# a meta.yaml, which ExperimentReader needs.
dirs = sorted(
    d for d in os.listdir("mlruns")
    if os.path.isfile(os.path.join("mlruns", d, "meta.yaml"))
)

mean_list = []
std_list = []
found_datasets = []  # dataset of each collected row, in collection order
for dir_ in dirs:
    experiment = ExperimentReader(os.path.join("mlruns", dir_))
    if experiment.dataset in datasets:
        mean_list.append(experiment.means())
        std_list.append(experiment.stds())
        found_datasets.append(experiment.dataset)

# One row per experiment, one column per model name.
mean_df = pd.concat(mean_list, axis=1).T
std_df = pd.concat(std_list, axis=1).T

# Label each row with the dataset it was actually read from (not with the
# position in `datasets`), then arrange the rows in the order of `datasets`.
mean_df.index = found_datasets
std_df.index = found_datasets
row_order = [d for d in datasets if d in found_datasets]
mean_df = mean_df.loc[row_order]
std_df = std_df.loc[row_order]
mean_df

name,DecisionTreeRegressor,ExplainableBoostingRegressor,OptunaSearchCV,SparseAdditiveBoostingRegressor,XGBRegressor,randomforestregressor,ridgecv
215_2dplanes,0.69809,0.778966,-0.000146,-0.00015,0.774709,0.08343,0.466857
344_mv,0.989932,0.958052,-0.015755,0.115376,0.988259,0.990098,0.599386
562_cpu_small,0.707279,0.781496,0.750862,0.416292,0.79723,0.791282,0.359278
197_cpu_act,0.749382,0.814054,0.776858,0.761979,0.827954,0.819547,0.37073
294_satellite_image,0.756934,0.629802,-0.242676,0.521694,0.740567,0.758952,0.347154
573_cpu_act,0.749382,0.814054,0.776954,-0.078339,0.827954,0.819547,0.37073
227_cpu_small,0.707279,0.781496,0.729649,0.238467,0.79723,0.791282,0.359278
564_fried,0.614116,0.787943,0.663389,0.324129,0.774388,0.748663,0.498473
201_pol,0.930462,0.662913,-0.28579,-0.242656,0.91641,0.941184,0.08098


In [12]:
# Fold the OptunaSearchCV column into SparseAdditiveBoostingRegressor by taking
# the row-wise maximum of the two, keep only rows where that merged score is
# present, and drop the now-redundant OptunaSearchCV column.
thesis_table = (
    mean_df
    .assign(
        SparseAdditiveBoostingRegressor=lambda scores: scores[
            ["SparseAdditiveBoostingRegressor", "OptunaSearchCV"]
        ].max(axis=1)
    )
    .loc[lambda scores: scores["SparseAdditiveBoostingRegressor"].notna()]
    .drop(columns=["OptunaSearchCV"])
)
thesis_table

name,DecisionTreeRegressor,ExplainableBoostingRegressor,SparseAdditiveBoostingRegressor,XGBRegressor,randomforestregressor,ridgecv
215_2dplanes,0.69809,0.778966,-0.000146,0.774709,0.08343,0.466857
344_mv,0.989932,0.958052,0.115376,0.988259,0.990098,0.599386
562_cpu_small,0.707279,0.781496,0.750862,0.79723,0.791282,0.359278
197_cpu_act,0.749382,0.814054,0.776858,0.827954,0.819547,0.37073
294_satellite_image,0.756934,0.629802,0.521694,0.740567,0.758952,0.347154
573_cpu_act,0.749382,0.814054,0.776954,0.827954,0.819547,0.37073
227_cpu_small,0.707279,0.781496,0.729649,0.79723,0.791282,0.359278
564_fried,0.614116,0.787943,0.663389,0.774388,0.748663,0.498473
201_pol,0.930462,0.662913,-0.242656,0.91641,0.941184,0.08098


In [14]:
# Fold the OptunaSearchCV column into SparseAdditiveBoostingRegressor by taking
# the row-wise minimum of the two standard deviations, then drop OptunaSearchCV.
# NOTE(review): thesis_table takes the row-wise *maximum* mean of the same two
# columns, so the std kept here may belong to the other column -- confirm.
std_table = (
    std_df
    .assign(
        SparseAdditiveBoostingRegressor=lambda spreads: spreads[
            ["SparseAdditiveBoostingRegressor", "OptunaSearchCV"]
        ].min(axis=1)
    )
    .drop(columns=["OptunaSearchCV"])
)
# Standard error = std / sqrt(5), restricted to the rows kept in thesis_table.
# NOTE(review): 5 is presumably the number of scores behind each std -- confirm.
ste_table = std_table.div(np.sqrt(5)).loc[thesis_table.index]
ste_table

name,DecisionTreeRegressor,ExplainableBoostingRegressor,SparseAdditiveBoostingRegressor,XGBRegressor,randomforestregressor,ridgecv
215_2dplanes,0.001649,0.001544,7.2e-05,0.001576,0.002397,0.003678
344_mv,6e-05,0.000705,0.001335,0.000183,0.000485,0.002025
562_cpu_small,0.009778,0.004991,0.006242,0.006822,0.006463,0.01482
197_cpu_act,0.007284,0.00432,0.004832,0.004635,0.006497,0.014121
294_satellite_image,0.015286,0.018081,0.016266,0.015747,0.013389,0.063308
573_cpu_act,0.007284,0.00432,0.006776,0.004635,0.006497,0.014121
227_cpu_small,0.009778,0.004991,0.0059,0.006822,0.006463,0.01482
564_fried,0.001297,0.001453,0.000957,0.001,0.001052,0.001484
201_pol,0.002242,0.007636,0.006554,0.001389,0.001419,0.006831


In [15]:
# Render every cell as "mean (standard error)", both with three decimals.
str_table = (
    thesis_table.map("{:.3f}".format)
    + " ("
    + ste_table.map("{:.3f}".format)
    + ")"
)
str_table

name,DecisionTreeRegressor,ExplainableBoostingRegressor,SparseAdditiveBoostingRegressor,XGBRegressor,randomforestregressor,ridgecv
215_2dplanes,0.698 (0.002),0.779 (0.002),-0.000 (0.000),0.775 (0.002),0.083 (0.002),0.467 (0.004)
344_mv,0.990 (0.000),0.958 (0.001),0.115 (0.001),0.988 (0.000),0.990 (0.000),0.599 (0.002)
562_cpu_small,0.707 (0.010),0.781 (0.005),0.751 (0.006),0.797 (0.007),0.791 (0.006),0.359 (0.015)
197_cpu_act,0.749 (0.007),0.814 (0.004),0.777 (0.005),0.828 (0.005),0.820 (0.006),0.371 (0.014)
294_satellite_image,0.757 (0.015),0.630 (0.018),0.522 (0.016),0.741 (0.016),0.759 (0.013),0.347 (0.063)
573_cpu_act,0.749 (0.007),0.814 (0.004),0.777 (0.007),0.828 (0.005),0.820 (0.006),0.371 (0.014)
227_cpu_small,0.707 (0.010),0.781 (0.005),0.730 (0.006),0.797 (0.007),0.791 (0.006),0.359 (0.015)
564_fried,0.614 (0.001),0.788 (0.001),0.663 (0.001),0.774 (0.001),0.749 (0.001),0.498 (0.001)
201_pol,0.930 (0.002),0.663 (0.008),-0.243 (0.007),0.916 (0.001),0.941 (0.001),0.081 (0.007)


In [16]:
from pmlb import fetch_data

# Fetch both "cpu_act" datasets from PMLB.
cpu_act_197, cpu_act_573 = (
    fetch_data(dataset_name)
    for dataset_name in ("197_cpu_act", "573_cpu_act")
)

In [20]:
# True only when every cell of the two frames compares equal.
(cpu_act_197 == cpu_act_573).all(axis=None)

True

In [17]:
# Print the row labels (dataset names) of thesis_table on a single line.
# The previous name `thesis_table_` is not defined anywhere in this notebook
# and raised NameError on a fresh kernel.
print(list(thesis_table.index))

['215_2dplanes', '344_mv', '562_cpu_small', '197_cpu_act', '294_satellite_image', '573_cpu_act', '227_cpu_small', '564_fried', '201_pol']
