In [None]:
import json
import pandas as pd
from collections import defaultdict
import numpy as np
import os
from utils.utils import load_json

In [None]:
from scipy.stats import ttest_rel

def calculate_means(list_folder_paths, baseline_name=None, output_format="pandas", num_runs=3, args_print=None):
    """Aggregate ``lincls.json`` results into one row per (experiment, sample_images).

    Every folder in ``list_folder_paths`` is walked recursively. A ``lincls.json``
    located at least two directory levels below the folder contributes its
    records to the experiment named after the first path component below that
    folder. Files sitting directly in the folder or in the experiment directory
    itself are ignored.

    Each record read from ``lincls.json`` must provide ``args`` (a dict that
    contains ``sample_images`` after the optional config merge), ``best_acc1``
    and ``best_acc5``. If a ``config.json`` sits next to ``lincls.json``, its
    entries are merged over the record's ``args``.

    Args:
        list_folder_paths: Iterable of root folders to scan.
        baseline_name: Experiment name used as the reference for the paired
            t-tests. When it is ``None`` or no such experiment was collected,
            both p-value columns are ``None``.
        output_format: ``"markdown"`` or ``"latex"`` prints the table in that
            format; any other value only prints a heading.
        num_runs: Exact number of records expected in every
            (experiment, sample_images) group.
        args_print: Names of run arguments to report as extra columns (taken
            from the first run of each group; ``None`` when absent). Defaults
            to no extra columns.

    Returns:
        pandas.DataFrame with columns ``key``, ``name``, ``sample_images``,
        the entries of ``args_print``, ``mean_acc1``, ``std_acc1``,
        ``mean_acc5``, ``std_acc5``, ``pvalue_acc1`` and ``pvalue_acc5``.

    Raises:
        AssertionError: If a group does not contain exactly ``num_runs``
            records, or if the baseline group for the same ``sample_images``
            has a different number of runs.

    Notes:
        * Standard deviations use ``np.std`` with its default ``ddof=0``.
        * P-values come from ``ttest_rel(baseline, experiment,
          alternative='less')`` and are rounded to 3 decimals. Runs are paired
          by collection order; sub-directories are visited in sorted order so
          this pairing is reproducible across machines.
          NOTE(review): this assumes run directories are named consistently
          across experiments so that sorted order aligns matching runs -
          confirm against the training setup.
        * If the baseline has no runs for a given ``sample_images``, the
          p-values of that row are ``None``.
    """
    # Copy so the caller's list is never shared or mutated; None means "no extra columns".
    args_print = list(args_print) if args_print is not None else []

    # experiment name -> sample_images -> (args per run, best_acc1 per run, best_acc5 per run)
    experiment_results = defaultdict(lambda: defaultdict(lambda: ([], [], [])))

    for folder_path in list_folder_paths:
        for root, dirs, files in os.walk(folder_path):
            # Sorting in place makes os.walk descend in a deterministic order,
            # which fixes the run order used to pair samples in the t-test.
            dirs.sort()
            if "lincls.json" not in files:
                continue

            # The first component below folder_path is the experiment name;
            # results must live at least one directory deeper than that.
            parts = os.path.relpath(root, folder_path).split(os.sep)
            if len(parts) < 2:
                continue
            experiment_name = parts[0]

            data = load_json(os.path.join(root, "lincls.json"))

            # The config is identical for every record of this file: read it once.
            config_path = os.path.join(root, "config.json")
            config = load_json(config_path) if os.path.exists(config_path) else None

            for record in data:
                # Merge into a copy so the loaded record is left untouched.
                curr_args = dict(record["args"])
                if config is not None:
                    curr_args.update(config)
                # Group by the merged value (config.json may override it).
                run_args, acc1_runs, acc5_runs = experiment_results[experiment_name][curr_args["sample_images"]]
                run_args.append(curr_args)
                acc1_runs.append(record["best_acc1"])
                acc5_runs.append(record["best_acc5"])

    # .get() does not trigger the defaultdict factory, so an unknown baseline stays None.
    baseline_results = experiment_results.get(baseline_name)

    mean_results = {}
    for name, results in experiment_results.items():
        for k, (run_args, acc1_runs, acc5_runs) in results.items():
            item_key = f"{name}_{k}"
            assert len(run_args) == num_runs, f"Expected {num_runs} runs, but got {len(run_args)} on {item_key}"
            first_args = run_args[0]
            assert float(first_args["sample_images"]) == float(k), f"Expected {k} sample images, but got {first_args['sample_images']}"

            row = [item_key, name, float(first_args["sample_images"])]
            row += [first_args.get(arg, None) for arg in args_print]
            row += [
                # mean acc1 and std acc1
                np.mean(acc1_runs),
                np.std(acc1_runs),
                # mean acc5 and std acc5
                np.mean(acc5_runs),
                np.std(acc5_runs),
            ]

            # Compute p-values only if the baseline has runs for the same sample_images.
            # A non-inserting lookup is required: indexing the defaultdict would create
            # an empty baseline group and trip the run-count assertions.
            baseline_group = baseline_results.get(k) if baseline_results is not None else None
            if baseline_group is not None:
                baseline_acc1 = baseline_group[1]
                baseline_acc5 = baseline_group[2]
                assert len(baseline_acc1) == len(acc1_runs), f"Expected {len(acc1_runs)} runs, but got {len(baseline_acc1)}"
                assert len(baseline_acc5) == len(acc5_runs), f"Expected {len(acc5_runs)} runs, but got {len(baseline_acc5)}"
                # One-sided paired test; alternative='less' puts the baseline first.
                p_value_acc1 = round(ttest_rel(baseline_acc1, acc1_runs, nan_policy='raise', alternative='less')[1], 3)
                p_value_acc5 = round(ttest_rel(baseline_acc5, acc5_runs, nan_policy='raise', alternative='less')[1], 3)
                row.extend([p_value_acc1, p_value_acc5])
            else:
                row.extend([None, None])

            mean_results[item_key] = row

    # Create a DataFrame with values
    columns = ["key", "name", "sample_images"] + args_print + ["mean_acc1", "std_acc1", "mean_acc5", "std_acc5", "pvalue_acc1", "pvalue_acc5"]
    df = pd.DataFrame(list(mean_results.values()), columns=columns)

    # Output based on the specified format
    fmt = output_format.lower()
    if fmt == "markdown":
        print("### Results in Markdown")
        print(df.to_markdown(index=False))
    elif fmt == "latex":
        print("### Results in LaTeX")
        print(df.to_latex(index=False, caption="Experiment Results", label="tab:experiment_results"))
    else:  # Default is pandas: only the heading is printed, the frame is returned
        print("### Results as Pandas DataFrame")

    return df

In [None]:
# Folders scanned for results, and the experiment used as the p-value reference.
list_folder_paths = ["results/eval_saved_dcl", "baselines"]
baseline = "tuning_20241028_153626_dcl_0_0_0_0_-1_0"
output_format = "pandas"

# Run arguments reported next to the metrics in the summary table.
args_print = ["loss_type", "glofnd", "alpha", "start_update", "lr_lda"]

# Columns, in display order, selected by the per-sample-size tables below.
sort_order = ["sign_acc1", "pvalue_acc1", "mean_acc1", "std_acc1", *args_print]

df = calculate_means(
    list_folder_paths,
    baseline_name=baseline,
    output_format=output_format,
    args_print=args_print,
)
# Flag rows whose acc1 p-value is below 0.05.
df["sign_acc1"] = df["pvalue_acc1"] < 0.05

In [None]:
# Never truncate cell contents when the frame is displayed.
pd.set_option("display.max_colwidth", None)

# Round the acc1 summary statistics to two decimals for display.
rounded_columns = ["mean_acc1", "std_acc1"]
df[rounded_columns] = df[rounded_columns].round(2)

In [None]:
# Number of summary rows divided by 4.
# NOTE(review): 4 presumably matches the sample_images values shown below
# (1, 0.1, 0.01, 0.001) - confirm.
df.shape[0] / 4

In [None]:
# Rows with sample_images == 1, best mean acc1 first.
df.loc[df["sample_images"] == 1].sort_values("mean_acc1", ascending=False)[sort_order]

In [None]:
# Rows with sample_images == 0.1, best mean acc1 first.
df.loc[df["sample_images"] == 0.1].sort_values("mean_acc1", ascending=False)[sort_order]

In [None]:
# Rows with sample_images == 0.01, best mean acc1 first.
df.loc[df["sample_images"] == 0.01].sort_values("mean_acc1", ascending=False)[sort_order]

In [None]:
# Rows with sample_images == 0.001, best mean acc1 first.
df.loc[df["sample_images"] == 0.001].sort_values("mean_acc1", ascending=False)[sort_order]