In [None]:
import pandas as pd
import scipy.stats as st
import re
from collections import OrderedDict
from pathlib import Path
import os 
import json

# Matches "data/experiments/<name>/<digits>/<name>/" and captures the three
# path components as groups 1-3; names may contain letters, digits, "-" and "_".
# Written as a raw string: "\-" is not a valid escape sequence in a regular
# string literal and triggers an invalid-escape warning on recent Pythons.
EXPERIMENTS_PATH_REGEX = r"data/experiments/([0-9a-zA-Z\-_]+)/([0-9]+)/([0-9a-zA-Z\-_]+)/"

def get_experiment_identifier(matcher):
    """Build an experiment key by joining items 1, 2 and 3 of ``matcher`` with underscores.

    ``matcher`` is indexed with ``[1]``, ``[2]`` and ``[3]``; callers in this
    notebook pass the match object produced from ``EXPERIMENTS_PATH_REGEX``.
    """
    parts = (matcher[position] for position in (1, 2, 3))
    return "{}_{}_{}".format(*parts)

def load_stats_results(path):
    """Load one experiment's cross-validation statistics from a CSV file.

    Parameters
    ----------
    path : str
        Path to a ``cross_val_stats.csv`` file. It must contain the directory
        layout described by ``EXPERIMENTS_PATH_REGEX``.

    Returns
    -------
    tuple
        ``(experiment_identifier, cross_val_stats)``: the key built by
        ``get_experiment_identifier`` from the path, and the DataFrame read
        from ``path`` with ``pd.read_csv``.

    Raises
    ------
    ValueError
        If ``path`` does not match the expected directory layout.
    """
    m = re.search(EXPERIMENTS_PATH_REGEX + "cross_val_stats.csv", path)
    if m is None:
        # Report the offending path instead of failing later with an opaque
        # "'NoneType' object is not subscriptable".
        raise ValueError(
            "Path does not match the expected experiment layout: {!r}".format(path)
        )
    experiment_identifier = get_experiment_identifier(m)
    cross_val_stats = pd.read_csv(path)
    return (experiment_identifier, cross_val_stats)

def generate_cross_val_summary(cross_val_df):
    """Yield ``(metric, (mean, ci_lower, ci_upper))`` for each numeric column.

    The bounds are the 95% Student-t confidence interval for the column mean:
    ``n - 1`` degrees of freedom, scaled by the standard error of the mean
    (``std / sqrt(n)``), where ``n`` is the number of non-missing values in
    that column.

    Parameters
    ----------
    cross_val_df : pandas.DataFrame
        One row per cross-validation run, one column per metric. Non-numeric
        columns are ignored.

    Yields
    ------
    tuple
        ``(metric_name, (mean, lower, upper))``. ``lower`` and ``upper`` are
        NaN when a column has fewer than two non-missing values, because the
        interval is undefined in that case.
    """
    # Restrict to numeric columns so a stray text column cannot break the
    # reductions below.
    numeric_df = cross_val_df.select_dtypes(include="number")
    mean_s = numeric_df.mean()
    sem_s = numeric_df.sem()      # sample std (ddof=1) / sqrt(n)
    count_s = numeric_df.count()  # non-missing observations per column
    for metric in mean_s.index:
        n = int(count_s[metric])
        if n < 2:
            lower = upper = float("nan")
        else:
            # A confidence interval for the mean uses n - 1 degrees of freedom
            # and the standard error as scale, not the raw standard deviation.
            lower, upper = st.t.interval(0.95, n - 1, loc=mean_s[metric], scale=sem_s[metric])
        yield (metric, (mean_s[metric], lower, upper))
        
def generate_overall_stats(experiments_stats_):
    """Summarise the cross-validation frame of every experiment.

    ``experiments_stats_`` is an iterable of ``(experiment_identifier, DataFrame)``
    pairs. The result is an ``OrderedDict`` keyed by identifier (in input
    order) whose values are ``OrderedDict``s built from
    ``generate_cross_val_summary`` for that experiment's frame.
    """
    return OrderedDict(
        (key, OrderedDict(generate_cross_val_summary(frame)))
        for key, frame in experiments_stats_
    )

def gen_comparison_table(metric, overall_stats_df_):
    """Build a per-experiment table for one metric, sorted by mean score.

    Parameters
    ----------
    metric : str
        Key looked up in each experiment's summary, e.g. MSE, MAE, R2 or
        CrossEntropy.
    overall_stats_df_ : mapping
        Maps experiment identifier to a mapping of
        ``metric -> (mean, conf_lower, conf_upper)``.

    Returns
    -------
    pandas.DataFrame
        Columns ``experiment_key``, ``mean_score``, ``conf_lower`` and
        ``conf_upper``, ordered by ascending ``mean_score``.
    """
    column_names = ["experiment_key", "mean_score", "conf_lower", "conf_upper"]
    rows = [
        [key, *summary[metric]]
        for key, summary in overall_stats_df_.items()
    ]
    table = pd.DataFrame(rows, columns=column_names)
    return table.sort_values(by="mean_score")

def load_params(best_params_list_):
    """Load every ``best_params.json`` file into one DataFrame.

    Parameters
    ----------
    best_params_list_ : iterable of str
        Paths to ``best_params.json`` files. Each must contain the directory
        layout described by ``EXPERIMENTS_PATH_REGEX``.

    Returns
    -------
    pandas.DataFrame
        One row per file: the JSON contents plus an ``experiment_key`` column
        holding the identifier derived from the file's path.

    Raises
    ------
    ValueError
        If a path does not match the expected directory layout.
    """
    records = []
    for path in best_params_list_:
        m = re.search(EXPERIMENTS_PATH_REGEX + "best_params.json", path)
        if m is None:
            # Report the offending path instead of failing later with an
            # opaque "'NoneType' object is not subscriptable".
            raise ValueError(
                "Path does not match the expected experiment layout: {!r}".format(path)
            )
        experiment_identifier = get_experiment_identifier(m)
        # Context manager so the file handle is closed deterministically.
        with open(path) as params_file:
            params = json.load(params_file)
        params["experiment_key"] = experiment_identifier
        records.append(params)
    return pd.DataFrame(records)

In [None]:
# Notebook configuration: fill these in before running the cells below.
# exec_dir  - directory the notebook switches into with os.chdir.
# data_path - root directory searched recursively for cross_val_stats.csv
#             and best_params.json files.
exec_dir = ""
data_path = ""

In [None]:
# Switch to the configured working directory. An empty exec_dir means "stay in
# the current directory"; os.chdir("") would raise FileNotFoundError.
if exec_dir:
    os.chdir(exec_dir)

# Recursively collect result files below data_path. Sorted so the order does
# not depend on filesystem enumeration order.
cross_val_stats_list = sorted(str(p) for p in Path(data_path).glob('**/cross_val_stats.csv'))
best_params_list = sorted(str(p) for p in Path(data_path).glob('**/best_params.json'))

In [None]:
# Read every cross-validation CSV as an (identifier, DataFrame) pair.
experiments_stats = [load_stats_results(csv_path) for csv_path in cross_val_stats_list]
# Per-experiment, per-metric summary; despite the name this is an OrderedDict.
overall_stats_df = generate_overall_stats(experiments_stats)
# Best hyper-parameters, one row per experiment, keyed by "experiment_key".
params_df = load_params(best_params_list)

In [None]:
# MSE comparison across experiments, joined with each experiment's parameters.
stat_df = gen_comparison_table(metric="MSE", overall_stats_df_=overall_stats_df)
pd.merge(stat_df, params_df, on="experiment_key")

In [None]:
# MAE comparison across experiments, joined with each experiment's parameters.
stat_df = gen_comparison_table(metric="MAE", overall_stats_df_=overall_stats_df)
pd.merge(stat_df, params_df, on="experiment_key")

In [None]:
# CrossEntropy comparison across experiments, joined with each experiment's parameters.
stat_df = gen_comparison_table(metric="CrossEntropy", overall_stats_df_=overall_stats_df)
pd.merge(stat_df, params_df, on="experiment_key")

In [None]:
# R2 comparison across experiments, joined with each experiment's parameters.
stat_df = gen_comparison_table(metric="R2", overall_stats_df_=overall_stats_df)
pd.merge(stat_df, params_df, on="experiment_key")