In [1]:
import json
import pandas as pd
from collections import defaultdict
from statistics import mean

def analyse_retriever(file, max_fpr=0.4):
    """Print the best (k, threshold) setting per retriever model.

    Every metric is averaged over all results in ``file`` that share the same
    model and the same (k, threshold) pair. For each model the pair with the
    highest averaged recall@k is kept, considering only pairs whose averaged
    false-positive rate is strictly below ``max_fpr``. On equal recall the
    pair encountered first in the file wins. Models without any admissible
    pair are left out of the table.

    Args:
        file: Path to a JSON file containing a list of entries. Each entry
            has a "model" name and a "results" list whose items carry "k",
            "threshold", "recall@k", "false_positive_rate", "precision@k"
            and "f1@k".
        max_fpr: Exclusive upper bound on the averaged false-positive rate.
            Defaults to 0.4.

    Returns:
        None. The table (or a short notice when no model has an admissible
        pair) is printed to stdout, followed by an empty line.
    """
    metric_names = ("recall@k", "false_positive_rate", "precision@k", "f1@k")

    # JSON is UTF-8 by specification; do not rely on the platform locale.
    with open(file, encoding="utf-8") as f:
        data = json.load(f)

    # model -> (k, threshold) -> list of metric dicts, one per matching result
    model_data = defaultdict(lambda: defaultdict(list))
    for entry in data:
        per_setting = model_data[entry["model"]]
        for result in entry["results"]:
            key = (result["k"], result["threshold"])
            per_setting[key].append({name: result[name] for name in metric_names})

    # Average per setting and keep the admissible setting with the best recall.
    best_metrics = {}
    for model, param_dict in model_data.items():
        best_recall = -1
        for (k, threshold), results in param_dict.items():
            averaged = {name: mean(r[name] for r in results) for name in metric_names}
            if averaged["false_positive_rate"] < max_fpr and averaged["recall@k"] > best_recall:
                best_recall = averaged["recall@k"]
                best_metrics[model] = {"k": k, "threshold": threshold, **averaged}

    if not best_metrics:
        # An empty frame has no string-typed "model" column, so the .str
        # accessors below would fail; report the situation explicitly instead.
        print(f"No (k, threshold) setting with averaged false_positive_rate < {max_fpr} found in {file}")
        print()
        return

    df_best = (
        pd.DataFrame.from_dict(best_metrics, orient="index")
        .reset_index()
        .rename(columns={"index": "model"})
    )

    # Temporary sort keys: models whose name contains "SPhilBERTa" come first,
    # the rest are ordered by the number between underscores in the name
    # (e.g. the 03 in "BEmargin_03_0"); names without such a number sort last.
    df_best = (
        df_best
        .assign(
            X=pd.to_numeric(
                df_best["model"].str.extract(r'_(\d+)_', expand=False),
                errors='coerce',
            ),
            SPhilBERTa_priority=df_best["model"].str.contains("SPhilBERTa", regex=False).astype(int),
        )
        .sort_values(by=["SPhilBERTa_priority", "X"], ascending=[False, True])
        .drop(columns=["SPhilBERTa_priority", "X"])
    )

    print(df_best.to_string(index=False))
    print()

def analyse_reranker(file_path, positive_label="1"):
    """Print the best decision threshold per reranker model.

    The F1 score of the ``positive_label`` class is averaged over all entries
    in ``file_path`` that share the same model and threshold. For each model
    the threshold with the highest averaged F1 is kept; on equal scores the
    threshold that appears first in the file wins.

    Args:
        file_path: Path to a JSON file containing a list of entries. Each
            entry has a "model" name, a "threshold", and a
            "classification_report" mapping class labels to a dict that
            includes "f1-score".
        positive_label: Key of the class inside "classification_report" whose
            F1 score is evaluated. Defaults to "1".

    Returns:
        None. The table (or a short notice when the file holds no entries)
        is printed to stdout, followed by an empty line.
    """
    # JSON is UTF-8 by specification; do not rely on the platform locale.
    with open(file_path, encoding="utf-8") as f:
        data = json.load(f)

    # model -> threshold -> list of F1 scores, one per matching entry
    model_data = defaultdict(lambda: defaultdict(list))
    for entry in data:
        f1_score = entry["classification_report"][positive_label]["f1-score"]
        model_data[entry["model"]][entry["threshold"]].append(f1_score)

    # Average per threshold, then keep the best-scoring threshold per model.
    best_metrics = {}
    for model, threshold_dict in model_data.items():
        averaged = {threshold: mean(scores) for threshold, scores in threshold_dict.items()}
        best_threshold = max(averaged, key=averaged.get)
        best_metrics[model] = {
            "threshold": best_threshold,
            "f1_score": averaged[best_threshold],
        }

    if not best_metrics:
        # An empty frame has no string-typed "model" column, so the .str
        # accessor below would fail; report the situation explicitly instead.
        print(f"No reranker results found in {file_path}")
        print()
        return

    df_best = (
        pd.DataFrame.from_dict(best_metrics, orient="index")
        .reset_index()
        .rename(columns={"index": "model"})
    )

    # Order rows by the number between underscores in the model name
    # (format like 'Name_X_Y'); names without one sort last. A stable sort
    # keeps models with equal or missing X in their original order.
    df_best = (
        df_best
        .assign(
            X=pd.to_numeric(
                df_best["model"].str.extract(r'_(\d+)_', expand=False),
                errors='coerce',
            )
        )
        .sort_values(by="X", ascending=True, kind="mergesort")
        .drop(columns=["X"])
    )

    print(df_best.to_string(index=False))
    print()




In [29]:
# Report the best retriever setting per model, one table per results file.
for retriever_results in (
    "results/BE/eval_BE_S_Ge.json",
    "results/BE/eval_BE_M_Ge.json",
):
    analyse_retriever(retriever_results)

        model  k  threshold  recall@k  false_positive_rate  precision@k     f1@k
   SPhilBERTa  7       0.75     0.825             0.382490     0.623232 0.671670
BEmargin_03_0  3       0.55     0.925             0.298156     0.728326 0.785846
BEmargin_04_0  3       0.55     0.945             0.353088     0.715826 0.782504
BEmargin_05_0  3       0.60     0.910             0.235920     0.744162 0.794176
BEmargin_06_0  3       0.60     0.960             0.353892     0.750828 0.811674
BEmargin_07_0  3       0.65     0.950             0.301316     0.786660 0.834178
BEmargin_08_0  5       0.70     0.895             0.256860     0.746576 0.788342
BEmargin_09_0  5       0.70     0.935             0.384044     0.699826 0.762672
BEmargin_10_0  3       0.75     0.895             0.305568     0.734994 0.782510

        model  k  threshold  recall@k  false_positive_rate  precision@k    f1@k
   SPhilBERTa  3       0.75   0.55000              0.39740      0.47221 0.48555
BEmargin_03_0  3       0.55  

In [34]:
# Report the best reranker threshold per model, one table per CEP results file.
for cep_results in (
    "results/CEP/EVAL-CEP-S1.json",
    "results/CEP/EVAL-CEP-S2.json",
    "results/CEP/EVAL-CEP-M1.json",
    "results/CEP/EVAL-CEP-M2.json",
):
    analyse_reranker(cep_results)

         model  threshold  f1_score
CEPweight_00_5        0.2  0.983918
CEPweight_01_0        0.2  0.980650
CEPweight_02_0        0.2  0.974802
CEPweight_03_0        0.2  0.986550
CEPweight_04_0        0.4  0.980994
CEPweight_05_0        0.2  0.981287
CEPweight_06_0        0.2  0.985913
CEPweight_07_0        0.2  0.983918
CEPweight_08_0        0.2  0.980650
CEPweight_09_0        0.2  0.986550
CEPweight_10_0        0.2  0.980994
CEPweight_11_0        0.2  0.983918
CEPweight_12_0        0.2  0.975439

         model  threshold  f1_score
CEPweight_00_5        0.2  0.947369
CEPweight_01_0        0.2  0.941754
CEPweight_02_0        0.3  0.949491
CEPweight_03_0        0.4  0.948259
CEPweight_04_0        0.3  0.945069
CEPweight_05_0        0.6  0.950946
CEPweight_06_0        0.2  0.953885
CEPweight_07_0        0.2  0.943402
CEPweight_08_0        0.2  0.944645
CEPweight_09_0        0.2  0.952369
CEPweight_10_0        0.8  0.951413
CEPweight_11_0        0.2  0.949571
CEPweight_12_0        0.2  

In [36]:
# Report the best reranker threshold per model, one table per CES results file.
for ces_results in (
    "results/CES/EVAL-CES-S1.json",
    "results/CES/EVAL-CES-M1.json",
):
    analyse_reranker(ces_results)

         model  threshold  f1_score
CESweight_00_5        0.6  0.994987
CESweight_01_0        0.7  0.997619
CESweight_02_0        0.2  0.997619
CESweight_03_0        0.2  0.997619
CESweight_04_0        0.2  0.997619
CESweight_05_0        0.2  0.997619
CESweight_06_0        0.2  1.000000
CESweight_07_0        0.2  0.997619
CESweight_08_0        0.4  1.000000
CESweight_09_0        0.2  0.997619
CESweight_10_0        0.5  0.997619
CESweight_11_0        0.4  0.992607
CESweight_12_0        0.2  0.997619

         model  threshold  f1_score
CESweight_00_5        0.2  0.973684
CESweight_01_0        0.2  0.973684
CESweight_02_0        0.2  0.973684
CESweight_03_0        0.2  1.000000
CESweight_04_0        0.2  1.000000
CESweight_05_0        0.2  0.947368
CESweight_06_0        0.2  0.973684
CESweight_07_0        0.2  0.976190
CESweight_08_0        0.2  1.000000
CESweight_09_0        0.2  1.000000
CESweight_10_0        0.2  1.000000
CESweight_11_0        0.2  0.973684
CESweight_12_0        0.2  