In [7]:
from __future__ import annotations

from pathlib import Path
from typing import Any, Dict, List
import json

import pandas as pd

In [None]:
def _load_evaluated_results(file_path: Path) -> List[Dict[str, Any]]:
    """Load evaluated results (with factuality) from a JSON file."""
    with file_path.open("r", encoding="utf-8") as file:
        payload: Dict[str, Any] = json.load(file)
    return payload.get("results", []) or []


def _parse_dataset_name(dataset_name: str) -> Dict[str, str]:
    """Parse dataset name to extract domain, level, and evaluation type.
    
    Expected format: {domain}_{level}_{type}
    Examples: finance_advanced_adv, health_basic_non_adv, law_advanced_adv
    """
    parts: List[str] = dataset_name.split("_")
    
    # Determine domain (first part)
    domain: str = parts[0] if len(parts) > 0 else "unknown"
    
    # Determine level (second part)
    level: str = parts[1] if len(parts) > 1 else "unknown"
    
    # Determine evaluation type (last part or last two parts)
    # Handle both "_adv" and "_non_adv" endings
    if len(parts) >= 3:
        if parts[-2] == "non" and parts[-1] == "adv":
            eval_type: str = "non_adversarial"
        elif parts[-1] == "adv":
            eval_type: str = "adversarial"
        else:
            eval_type: str = "unknown"
    else:
        eval_type: str = "unknown"
    
    return {
        "domain": domain,
        "level": level,
        "evaluation_type": eval_type,
    }


def build_factuality_dataframe(results_dir: Path) -> pd.DataFrame:
    """Build a dataframe with query, answer, number_of_facts, correct_facts, question_number.

    This aggregates all evaluated factuality results from the given directory.
    """
    records: List[Dict[str, Any]] = []

    json_files: List[Path] = sorted(
        [
            path
            for path in results_dir.iterdir()
            if path.suffix == ".json" and (path.name.endswith("_adv.json") or path.name.endswith("_non_adv.json")) and "golden" not in path.name
        ]
    )

    question_counter: int = 1
    for json_file in json_files:
        dataset_name: str = json_file.stem
        parsed_dataset: Dict[str, str] = _parse_dataset_name(dataset_name)
        results = _load_evaluated_results(json_file)
        for _index, item in enumerate(results, start=1):
            factuality: Dict[str, Any] = item.get("factuality") or {}
            facts: List[str] = factuality.get("facts") or []
            fact_check: Dict[str, Any] = factuality.get("fact_check") or {}
            decisions: List[Dict[str, Any]] = fact_check.get("decisions") or []

            correct_count: int = sum(1 for decision in decisions if decision.get("correct") is True)

            response: Dict[str, Any] = item.get("response") or {}

            records.append(
                {
                    "dataset": dataset_name,
                    "domain": parsed_dataset["domain"],
                    "level": parsed_dataset["level"],
                    "evaluation_type": parsed_dataset["evaluation_type"],
                    "query": item.get("query") or item.get("question"),
                    "answer": factuality.get("answer") or response.get("answer"),
                    "number_of_facts": len(facts),
                    "correct_facts": correct_count,
                    "question_number": question_counter,
                }
            )
            question_counter += 1

    return pd.DataFrame.from_records(records)


# Load results from both adversarial and non-adversarial directories
adversarial_directory: Path = Path(
    "/Users/aszelestey/projects/AdversaRiskQA/data/results/factuality_evaluations/adversarial/Qwen-Qwen3-30B-Instruct-2507"
)
non_adversarial_directory: Path = Path(
    "/Users/aszelestey/projects/AdversaRiskQA/data/results/factuality_evaluations/non_adversial/Qwen-Qwen3-30B-Instruct-2507"
)

# Build dataframes from both directories
adversarial_df: pd.DataFrame = build_factuality_dataframe(adversarial_directory)
non_adversarial_df: pd.DataFrame = build_factuality_dataframe(non_adversarial_directory)

# Combine both dataframes
factuality_df: pd.DataFrame = pd.concat([adversarial_df, non_adversarial_df], ignore_index=True)

factuality_df.head()


Unnamed: 0,dataset,domain,level,evaluation_type,query,answer,number_of_facts,correct_facts,question_number
0,finance_advanced_adv,finance,advanced,adversarial,How can multiple IRRs distort a go/no-go decis...,Answer in 5 sentences or less.\n\nMultiple IRR...,6,6,1
1,finance_advanced_adv,finance,advanced,adversarial,Where can a payback rule cause management to r...,The statement is false because the payback per...,6,5,2
2,finance_advanced_adv,finance,advanced,adversarial,How do post-earnings announcement drifts chall...,Answer in 5 sentences or less.\n\nPost-earning...,6,5,3
3,finance_advanced_adv,finance,advanced,adversarial,How robust is beta estimation for thinly trade...,"Hmm, there's a misunderstanding in your questi...",10,10,4
4,finance_advanced_adv,finance,advanced,adversarial,When does leverage mask deteriorating core pro...,"Okay, the user is asking about when leverage c...",7,7,5


In [12]:
median_facts = factuality_df["number_of_facts"].median()
print("Median number_of_facts:", median_facts)

Median number_of_facts: 8.0


In [None]:
from typing import Tuple


def _compute_precision(supported_facts: int, total_facts: int) -> float:
    """Compute factual precision Prec(y) = S(y) / (S(y) + N(y))."""
    if total_facts <= 0:
        return 0.0
    return supported_facts / float(total_facts)


def _compute_recall_at_k(supported_facts: int, k: int) -> float:
    """Compute factual recall R_K(y) = min(S(y) / K, 1)."""
    if k <= 0:
        raise ValueError("K must be a positive integer.")
    return min(supported_facts / float(k), 1.0)


def _compute_f1_at_k_for_row(row: pd.Series, k: int) -> float:  # type: ignore[type-arg]
    """Compute F_1@K for a single dataframe row."""
    supported_facts: int = int(row["correct_facts"])
    total_facts: int = int(row["number_of_facts"])

    if supported_facts <= 0:
        return 0.0

    precision: float = _compute_precision(supported_facts=supported_facts, total_facts=total_facts)
    recall_k: float = _compute_recall_at_k(supported_facts=supported_facts, k=k)

    if precision + recall_k == 0.0:
        return 0.0

    return 2.0 * precision * recall_k / (precision + recall_k)


def compute_f1_at_k_for_dataframe(df: pd.DataFrame, k: int) -> Tuple[pd.DataFrame, float]:
    """Return a copy of the dataframe with F_1@K column and its mean value.

    Args:
        df: Input dataframe with `number_of_facts` and `correct_facts` columns.
        k: Hyperparameter K from the SAFE definition.
    """
    df_with_f1 = df.copy()
    df_with_f1["F1_at_K"] = df_with_f1.apply(_compute_f1_at_k_for_row, axis=1, k=k)
    mean_f1_at_k: float = float(df_with_f1["F1_at_K"].mean())
    return df_with_f1, mean_f1_at_k


K: int = int(median_facts)

factuality_df_with_f1, mean_f1_at_k = compute_f1_at_k_for_dataframe(factuality_df, k=K)

print("Using K=", K)
print("Mean F_1@K:", mean_f1_at_k)

factuality_df_with_f1[factuality_df_with_f1["dataset"] == "non"].head()


Using K= 8
Mean F_1@K: 0.8069702294542597


Unnamed: 0,dataset,domain,level,evaluation_type,query,answer,number_of_facts,correct_facts,question_number,F1_at_K
0,finance_advanced_adv,finance,advanced,adversarial,How can multiple IRRs distort a go/no-go decis...,Answer in 5 sentences or less.\n\nMultiple IRR...,6,6,1,0.857143
1,finance_advanced_adv,finance,advanced,adversarial,Where can a payback rule cause management to r...,The statement is false because the payback per...,6,5,2,0.714286
2,finance_advanced_adv,finance,advanced,adversarial,How do post-earnings announcement drifts chall...,Answer in 5 sentences or less.\n\nPost-earning...,6,5,3,0.714286
3,finance_advanced_adv,finance,advanced,adversarial,How robust is beta estimation for thinly trade...,"Hmm, there's a misunderstanding in your questi...",10,10,4,1.0
4,finance_advanced_adv,finance,advanced,adversarial,When does leverage mask deteriorating core pro...,"Okay, the user is asking about when leverage c...",7,7,5,0.933333


In [14]:
from typing import Dict as TypingDict


def aggregate_f1_by_dataset(df_with_f1: pd.DataFrame) -> pd.DataFrame:
    """Aggregate F_1@K scores and mean number_of_facts per dataset.

    Returns a dataframe with one row per dataset containing:
    - mean_F1_at_K
    - mean_number_of_facts
    """
    grouped = (
        df_with_f1.groupby(["domain", "level", "evaluation_type"], as_index=False)[["F1_at_K", "number_of_facts"]]
        .mean()
        .rename(columns={"F1_at_K": "mean_F1_at_K", "number_of_facts": "mean_number_of_facts"})
    )
    return grouped


per_dataset_f1: pd.DataFrame = aggregate_f1_by_dataset(factuality_df_with_f1)

per_dataset_f1


Unnamed: 0,domain,level,evaluation_type,mean_F1_at_K,mean_number_of_facts
0,finance,advanced,adversarial,0.842473,8.16
1,finance,basic,adversarial,0.781781,7.1
2,health,advanced,adversarial,0.810276,8.43
3,health,basic,adversarial,0.859827,9.09
4,law,advanced,adversarial,0.770403,7.86
5,law,basic,adversarial,0.720898,7.03


In [6]:
# Compute K as the median of per-dataset median number_of_facts

per_dataset_median_facts = factuality_df.groupby("dataset")["number_of_facts"].median()
K_from_datasets: int = int(per_dataset_median_facts.median())

factuality_df_with_f1_datasetK, mean_f1_at_k_datasetK = compute_f1_at_k_for_dataframe(
    factuality_df, k=K_from_datasets
)

print("Per-dataset median number_of_facts:")
print(per_dataset_median_facts)
print("Using K (median over dataset medians) =", K_from_datasets)
print("Mean F_1@K with this K:", mean_f1_at_k_datasetK)

per_dataset_f1_datasetK: pd.DataFrame = aggregate_f1_by_dataset(factuality_df_with_f1_datasetK)

per_dataset_f1_datasetK


Per-dataset median number_of_facts:
dataset
finance_advanced_adv        8.0
finance_advanced_non_adv    7.0
finance_basic_adv           6.0
finance_basic_non_adv       6.5
health_advanced_adv         8.0
health_advanced_non_adv     9.5
health_basic_adv            8.5
health_basic_non_adv        9.5
law_advanced_adv            8.0
law_advanced_non_adv        8.0
law_basic_adv               8.0
law_basic_non_adv           6.5
Name: number_of_facts, dtype: float64
Using K (median over dataset medians) = 8
Mean F_1@K with this K: 0.8069702294542597


Unnamed: 0,domain,level,evaluation_type,mean_F1_at_K,mean_number_of_facts
0,finance,advanced,adversarial,0.842473,8.16
1,finance,basic,adversarial,0.781781,7.1
2,health,advanced,adversarial,0.810276,8.43
3,health,basic,adversarial,0.859827,9.09
4,law,advanced,adversarial,0.770403,7.86
5,law,basic,adversarial,0.720898,7.03
