In [35]:
import os
import json
import pandas as pd

def extract_scores(base_path, data_names, json_files, task_types):
    """
    Collect one XGBoost score per (method, dataset) pair from JSON result files.

    For every dataset/method combination the file
    ``<base_path>/<data_name>/<method>.json`` is read. Regression datasets
    contribute ``info["best_rmse_scores"]["XGBRegressor"]["RMSE"]`` and
    classification datasets contribute
    ``info["best_avg_scores"]["XGBClassifier"]["roc_auc"]``.

    Args:
        base_path (str): The base directory path.
        data_names (iterable of str): Dataset folder names under ``base_path``.
        json_files (iterable of str): Method names, i.e. JSON file names
            WITHOUT the ``.json`` extension (it is appended here).
        task_types (iterable of str): Task type of each dataset, aligned with
            ``data_names`` ("regression" or "classification").

    Returns:
        pd.DataFrame: Methods as rows and datasets as columns. A cell is left
        missing (None/NaN) when the file does not exist, is not valid JSON,
        lacks the expected keys, or the task type is not recognised.

    Raises:
        ValueError: If ``data_names`` and ``task_types`` differ in length.
    """
    # Materialise the inputs: lengths are compared below and json_files is
    # traversed once per dataset, which a one-shot iterator would not survive.
    data_names = list(data_names)
    json_files = list(json_files)
    task_types = list(task_types)

    # zip() would silently drop the unmatched tail, hiding a mis-specified config.
    if len(data_names) != len(task_types):
        raise ValueError(
            "data_names and task_types must have the same length "
            f"(got {len(data_names)} and {len(task_types)})"
        )

    scores = {}

    for data_name, task_type in zip(data_names, task_types):
        scores[data_name] = {}
        for method_file in json_files:
            json_path = os.path.join(base_path, data_name, method_file + '.json')
            try:
                # JSON text is UTF-8; do not depend on the platform default encoding.
                with open(json_path, 'r', encoding='utf-8') as f:
                    info = json.load(f)

                if task_type == "regression":
                    value = info["best_rmse_scores"]["XGBRegressor"]["RMSE"]
                elif task_type == "classification":
                    value = info["best_avg_scores"]["XGBClassifier"]["roc_auc"]
                else:
                    value = None
            except (FileNotFoundError, KeyError, TypeError, json.JSONDecodeError):
                # Best effort: a missing or malformed result leaves a gap in
                # the table instead of aborting the whole extraction.
                value = None

            scores[data_name][method_file] = value

    # from_dict(orient='index') puts datasets on the rows; transpose so the
    # returned table has methods as rows and datasets as columns.
    df = pd.DataFrame.from_dict(scores, orient='index')
    return df.T

In [37]:
# Root folder holding one sub-folder of MLE evaluation results per dataset.
# The TABSYN_MLE_EVAL_DIR environment variable overrides the machine-specific
# default, so the notebook can run on another checkout without editing this cell.
base_path = os.environ.get('TABSYN_MLE_EVAL_DIR', '/Users/.../Desktop/tabsyn-main/eval/mle')

# Dataset folder -> task type. Keeping both in one mapping guarantees that
# data_names and task_types stay aligned element by element.
dataset_tasks = {
    'adult_equal': "classification",
    'default_equal': "classification",
    'shoppers_equal': "classification",
    'magic_equal': "classification",
    'beijing_equal': "regression",
    'news_equal': "regression",
}
data_names = list(dataset_tasks)
task_types = list(dataset_tasks.values())

# Method names; each corresponds to a '<method>.json' file inside a dataset folder.
methods = [
    'real',
    'diffusion_on_copula',
    'simple_KDE_VAE_encoding',
    'KDE_VAE_encoding',
    'smote',
    'simple_KDE',
    'tabsyn',
    'TabKDE',
]

In [39]:
# Build the score table from the datasets, methods and task types configured above.
scores_df = extract_scores(
    base_path=base_path,
    data_names=data_names,
    json_files=methods,
    task_types=task_types,
)

In [41]:
# Show the extracted score table as the cell output.
scores_df

Unnamed: 0,adult_equal,default_equal,shoppers_equal,magic_equal,beijing_equal,news_equal
real,0.926847,0.761084,0.926525,0.932755,0.49064,0.826638
diffusion_on_copula,0.890665,0.751845,0.924022,0.923454,0.71903,0.845462
simple_KDE_VAE_encoding,0.875744,0.72112,0.845009,0.896882,0.861018,1.09327
KDE_VAE_encoding,0.86568,0.727966,0.839495,0.897164,0.764813,0.850746
smote,0.903335,0.737619,0.914611,0.925335,0.575278,0.864778
simple_KDE,0.898869,0.738493,0.920458,0.914361,0.79011,1.019222
tabsyn,0.909962,0.763339,0.915018,0.919091,0.612587,0.89243
TabKDE,0.881403,0.740202,0.92121,0.914743,0.701989,0.887821


In [45]:
# Persist the score table as "mle_scores.csv" in the current working directory.
current_dir = os.getcwd()
scores_df.to_csv(
    path_or_buf=os.path.join(current_dir, "mle_scores.csv"),
)