In [1]:
import ast
import os
import pandas as pd

# Dataset identifiers processed by this notebook.
data_names = [
    'us-election-2020',
    'bretschneider-th-main',
    'bretschneider-th-school',
    'twitter-hate-speech-tsa',
    'waseem-and-hovy-2016',
]
multi_class_data_names = ['davidson-thon', 'founta-2018-thas']

# Columns describing the dataset a run was trained on.
data_cols = ['data_name', 'num_classes', 'data_type', 'label_col']

# Columns describing the experiment variant and its hyper-parameters.
variant_cols = [
    'variant',
    'sampling_modifiedRS_mode',
    'sampling_modifiedRS_rho',
    'sampling_weightedRS_percentage',
    'loss',
    'wce_alpha',
    'fl_gamma',
]

# Metric columns; per-label F1 is tracked for label indices 0..3.
metrics_cols = [
    'val_f1_macro',
    'test_f1_macro',
    *(f'test_f1_per_label_{label}' for label in range(4)),
    'test_accuracy',
    'test_precision_macro',
    'test_precision_weighted',
    'test_recall_macro',
    'test_recall_weighted',
    'test_auprc',
]

# Full column layout of a per-run results table; 'mlflow_run_id' is kept last.
COLS = [*data_cols, *variant_cols, 'pl_seed', *metrics_cols, 'mlflow_run_id']

In [2]:
def get_value_by_run_id(data_dir, run_id, key):
    """Read one logged value for a run from the files under its directory.

    The directory ``data_dir``/``run_id`` is walked recursively and the first
    file whose name equals ``key`` is used. Its last non-blank line is taken
    as the raw value.

    Args:
        data_dir: Directory containing one sub-directory per run. A trailing
            path separator is optional.
        run_id: Name of the run's sub-directory.
        key: File name to look for (a column name such as ``"variant"`` or
            ``"test_f1_macro"``).

    Returns:
        The parsed value. Text that is a valid Python literal is converted
        with ``ast.literal_eval`` (lists become tuples); anything else is
        returned as a stripped string. If no usable value is found (no such
        file, an empty file, or a value that parses to ``None``) the result
        is ``0`` when ``"test"`` occurs in ``key`` and ``"ignored"`` otherwise.

    Raises:
        IndexError: If a ``val``/``test`` file's last line has fewer than two
            whitespace-separated tokens.
    """
    value = None
    for root, _, files in os.walk(os.path.join(data_dir, run_id)):
        if key not in files:
            continue
        with open(os.path.join(root, key), "r") as f:
            # Drop blank lines so empty files and trailing newlines are harmless.
            lines = [line.strip() for line in f if line.strip()]
        if lines:
            value = lines[-1]
            if "val" in key or "test" in key:
                # Metric lines carry the value as their second token
                # (presumably "<timestamp> <value> <step>" -- TODO confirm).
                value = value.split()[1]
            if value == "sampling_modifiedRS":  # Forgot to specify in the variant value
                value += "_oversampling"
            try:
                value = ast.literal_eval(value)
                if isinstance(value, list):
                    # Tuples are hashable, so the value can serve as a groupby key.
                    value = tuple(value)
            except (ValueError, SyntaxError):
                # Not a Python literal: keep the plain string.
                pass
        # Only the first matching file is considered.
        break
    if value is None:
        value = 0 if "test" in key else "ignored"
    return value

def get_log_by_data_name(data_name):
    """Build the per-run results table for one dataset and save it as CSV.

    Every sub-directory of ``../logs/<data_name>/`` is treated as one run.
    For each run, every column in ``COLS`` except ``mlflow_run_id`` is read
    with ``get_value_by_run_id``; ``mlflow_run_id`` is set to the directory
    name.

    Args:
        data_name: Dataset identifier; used as the log sub-directory name and
            in the output file name.

    Returns:
        A DataFrame with columns ``COLS`` and one row per run. The same table
        is written to ``results_<data_name>.csv`` in the working directory.

    Raises:
        FileNotFoundError: If ``../logs/<data_name>/`` does not exist.
    """
    data_runs_dir = f'../logs/{data_name}/'
    # os.listdir returns entries in arbitrary order and includes plain files;
    # keep only directories (a file can never hold run values) and sort them
    # so the row order is reproducible.
    run_ids = sorted(
        entry for entry in os.listdir(data_runs_dir)
        if os.path.isdir(os.path.join(data_runs_dir, entry))
    )
    rows = []
    for run_id in run_ids:
        run_result = {
            col_name: get_value_by_run_id(data_runs_dir, run_id, col_name)
            for col_name in COLS
            if col_name != 'mlflow_run_id'  # filled from the directory name below
        }
        run_result['mlflow_run_id'] = run_id
        rows.append(run_result)
    df = pd.DataFrame(columns=COLS, data=rows)
    df.to_csv(f"results_{data_name}.csv", index=False)
    return df

def aggregate_results(df):
    """Reduce a per-run table to the best configuration of each variant.

    Rows that share the same values in all ``variant_cols`` are collapsed
    into one row: every column in ``metrics_cols`` is averaged and every
    column in ``data_cols`` keeps its first value (presumably this averages
    over random seeds -- verify against the logged runs). Then, for each
    distinct ``variant``, only the collapsed row with the highest
    ``val_f1_macro`` is kept.

    Args:
        df: Per-run results containing ``variant_cols``, ``metrics_cols``
            and ``data_cols``.

    Returns:
        A DataFrame with one row per variant and a fresh 0..n-1 index.
    """
    # Metrics are averaged; dataset descriptors are carried over unchanged.
    how_to_aggregate = dict.fromkeys(metrics_cols, 'mean')
    how_to_aggregate.update(dict.fromkeys(data_cols, 'first'))

    per_config = df.groupby(variant_cols, as_index=False).agg(how_to_aggregate)

    # Index label of the best-validation configuration within each variant.
    best_row_labels = per_config.groupby(["variant"])['val_f1_macro'].idxmax()
    return per_config.loc[best_row_labels].reset_index(drop=True)

In [3]:
# Collect the best-per-variant summary of every dataset and concatenate once,
# instead of re-copying a growing frame (and concatenating onto an empty one)
# on every iteration.
agg_frames = []
for data_name in data_names + multi_class_data_names:
    df = get_log_by_data_name(data_name)
    df_val_f1_max = aggregate_results(df)
    agg_frames.append(df_val_f1_max)
df_all_agg_results = pd.concat(agg_frames, ignore_index=True)

# mergesort is stable, so rows of the same dataset keep the order in which
# aggregate_results produced them.
df_all_agg_results = (
    df_all_agg_results
    .sort_values(by=['data_name'], kind='mergesort')
    .reset_index(drop=True)
)
df_all_agg_results[["data_name", "variant", "test_f1_macro", "test_f1_per_label_0", "test_f1_per_label_1", "test_f1_per_label_2", "test_f1_per_label_3"]]

Unnamed: 0,data_name,variant,test_f1_macro,test_f1_per_label_0,test_f1_per_label_1,test_f1_per_label_2,test_f1_per_label_3
0,bretschneider-th-main,baseline,0.688885,0.957432,0.420338,0.0,0.0
1,bretschneider-th-main,sampling_modifiedRS_oversampling,0.737174,0.960542,0.513807,0.0,0.0
2,bretschneider-th-school,baseline,0.698243,0.944139,0.452347,0.0,0.0
3,bretschneider-th-school,sampling_modifiedRS_oversampling,0.762313,0.949306,0.575321,0.0,0.0
4,davidson-thon,baseline,0.747009,0.404631,0.945373,0.891022,0.0
5,davidson-thon,sampling_weightedRS,0.746566,0.414412,0.933087,0.892199,0.0
6,davidson-thon,wce,0.756658,0.424845,0.944801,0.900328,0.0
7,founta-2018-thas,baseline,0.627033,0.8655,0.520021,0.781696,0.340915
8,twitter-hate-speech-tsa,baseline,0.872058,0.98254,0.761577,0.0,0.0
9,us-election-2020,baseline,0.756243,0.957343,0.555144,0.0,0.0
