In [1]:
import pandas as pd
import scipy.stats as st

# Directory prefix for the per-dataset Excel workbooks. Later cells build file
# names by plain string concatenation, so the trailing slash is required.
DATASETS_PATH = "../data/validation/results_analysis/"

# Datasets whose workbooks are loaded. The full three-dataset list is kept
# commented out for reference; only GDPR is currently enabled.
# DATASETS = ["AI_Act", "DSA", "GDPR"]
DATASETS = ["GDPR"]

In [2]:
# Read one obligation-analysis workbook per configured dataset, keeping the
# frames in the same order as DATASETS.
dfs = [
    pd.read_excel(DATASETS_PATH + f"obligation_analysis_{dataset}.xlsx")
    for dataset in DATASETS
]

# Display how many workbooks were loaded.
len(dfs)

1

In [3]:
# Stack the per-dataset frames into a single table and save it next to the inputs.
# ignore_index=True gives the combined frame a fresh 0..n-1 index: each workbook
# is read with its own default 0..n-1 index, so without it row labels would
# repeat as soon as more than one dataset is enabled.
file_path = DATASETS_PATH + "obligation_analysis_Overall.xlsx"
overall_df = pd.concat(dfs, ignore_index=True)
overall_df.to_excel(file_path, index=False)

In [4]:
# Columns that are scored in the workbooks: the obligation-type classification,
# followed by a "Value" and an "Extraction Method" column for every component.
elements = ["ObligationTypeClassification"] + [
    f"{component}-{aspect}"
    for component in (
        "Addressees",
        "Objects",
        "Predicates",
        "Specifications",
        "Pre-Conditions",
        "Beneficiaries",
    )
    for aspect in ("Value", "Extraction Method")
]

In [5]:
import ast
import numpy as np

# Collapse every scored column of overall_df to a single 0/1 value per row and
# add a matching "<element>_Agreement" column.


def safe_mean(value):
    """Reduce one cell to a single float score.

    :param value: A number, a list/tuple of numbers, the string form of either
        (e.g. "[1, 0]"), or a missing value.
    :return: The mean of the contained numbers, the number itself as a float,
        or NaN when the cell is missing, empty, or cannot be parsed.
    """
    if isinstance(value, str):
        try:
            value = ast.literal_eval(value)  # "[1, 0]" -> [1, 0]
        except (SyntaxError, ValueError):
            return np.nan  # Unparseable text is treated as missing
    if isinstance(value, (list, tuple)):
        # An empty sequence has no mean; return NaN directly instead of letting
        # np.mean emit a "mean of empty slice" warning.
        return np.mean(value) if len(value) > 0 else np.nan
    elif pd.notna(value):  # Single number
        return float(value)
    return np.nan  # Missing value


for element in elements:
    agreement_column = element + "_Agreement"

    # Once this loop has run, overall_df[element] holds collapsed 0/1 scores, so
    # processing it again would mark every non-missing row as "agreed". Skip
    # elements that were already processed so re-running the cell is harmless.
    if agreement_column in overall_df.columns:
        continue

    # Mean score per row (e.g. [1, 0] -> 0.5)
    value_mean = overall_df[element].apply(safe_mean)

    # Floor the mean so anything below 1 becomes 0; missing stays NaN
    overall_df[element] = value_mean.apply(lambda x: int(np.floor(x)) if pd.notna(x) else np.nan)

    # Agreement is 1 only when the mean is exactly 0 or 1.
    # NOTE(review): a missing mean (NaN) is also recorded as 0 here, i.e. it is
    # indistinguishable from a disagreement — confirm this is intended.
    overall_df[agreement_column] = value_mean.apply(lambda x: 1 if x in [0, 1] else 0)

overall_df.sample(n=10)

Unnamed: 0,ID,ReviewerPair,ObligationTypeClassification,Predicate,Addressees-Value,Addressees-Extraction Method,Objects-Value,Objects-Extraction Method,Predicates-Value,Predicates-Extraction Method,...,Objects-Value_Agreement,Objects-Extraction Method_Agreement,Predicates-Value_Agreement,Predicates-Extraction Method_Agreement,Specifications-Value_Agreement,Specifications-Extraction Method_Agreement,Pre-Conditions-Value_Agreement,Pre-Conditions-Extraction Method_Agreement,Beneficiaries-Value_Agreement,Beneficiaries-Extraction Method_Agreement
18,033.005.001.001,R2-R3,1,shall document,1,1,1,1,1,1,...,1,1,1,1,1,1,0,0,1,1
22,037.004.001.001,R2-R3,1,may designate,1,1,1,1,1,1,...,1,1,1,1,0,0,1,1,0,0
11,071.002.001.001,R1-R4,0,shall include,1,0,0,0,1,1,...,0,0,1,1,1,1,0,0,1,1
29,071.001.001.001,R2-R3,1,shall draw up,1,1,1,1,1,1,...,1,1,1,1,0,0,0,0,1,1
25,046.004.001.001,R2-R3,1,shall apply,1,1,1,1,1,1,...,1,1,1,1,1,1,0,0,1,1
19,035.003.001.001,R2-R3,1,shall carry out a data protection impact asses...,1,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1,1
4,036.004.001.001,R1-R4,1,shall consult,1,1,0,0,0,1,...,0,0,0,1,0,1,0,0,0,1
17,030.003.001.001,R2-R3,1,shall be in writing,0,0,1,1,1,1,...,1,1,1,1,1,1,0,0,1,1
30,078.003.001.001,R2-R3,1,shall be brought,1,1,1,1,1,1,...,1,1,1,1,1,1,0,0,1,1
23,037.004.001.002,R2-R3,1,shall designate,1,1,1,1,1,1,...,1,1,1,1,0,0,1,1,0,0


In [6]:
def calculate_metrics(column, target_df):
    """Compute the accuracy of a 0/1-scored column.

    :param column: Name of the column in ``target_df`` to evaluate.
    :param target_df: DataFrame holding the scores (1 = correct, 0 = wrong).
    :return: Dictionary with a single key "Accuracy": the number of 1s divided
        by the number of 0s and 1s. Rows holding any other value (including
        missing values) are ignored; the result is 0 when no row is 0 or 1.
    """
    scores = target_df[column]
    n_correct = scores.eq(1).sum()
    n_wrong = scores.eq(0).sum()
    n_scored = n_correct + n_wrong

    if n_scored > 0:
        accuracy = n_correct / n_scored
    else:
        accuracy = 0

    return {"Accuracy": accuracy}


def compute_accuracy_with_ci(predictions, confidence=0.95):
    """
    Compute accuracy and confidence intervals using both Wald and Wilson methods.

    :param predictions: Sized sequence of binary outcomes (0/1 or False/True),
        e.g. a list or a boolean pandas Series
    :param confidence: Confidence level, strictly between 0 and 1
        (default is 0.95 for a 95% CI)
    :return: Dictionary with "accuracy" (float), "wald_ci" and "wilson_ci"
        (each a (lower, upper) tuple of floats clamped to [0, 1])
    :raises ValueError: If ``predictions`` is empty or ``confidence`` is not
        strictly between 0 and 1
    """
    n = len(predictions)
    if n == 0:
        raise ValueError("Prediction list cannot be empty")
    # A level such as 95 or 1.0 would make the z-score NaN/inf and silently
    # propagate into every bound, so reject it up front.
    if not 0 < confidence < 1:
        raise ValueError("confidence must be strictly between 0 and 1")

    correct_predictions = sum(predictions)
    p_hat = correct_predictions / n

    # Two-sided z-score for the requested confidence level
    z = st.norm.ppf(1 - (1 - confidence) / 2)

    # Wald Confidence Interval: p_hat +/- z * sqrt(p_hat * (1 - p_hat) / n)
    wald_margin = z * np.sqrt((p_hat * (1 - p_hat)) / n)
    wald_ci = (
        float(max(0, p_hat - wald_margin)),
        float(min(1, p_hat + wald_margin)),
    )

    # Wilson Score Interval
    denominator = 1 + (z ** 2 / n)
    center_adjusted_probability = p_hat + (z ** 2 / (2 * n))
    adjusted_standard_error = np.sqrt((p_hat * (1 - p_hat) / n) + (z ** 2 / (4 * n ** 2)))

    lower_bound = (center_adjusted_probability - z * adjusted_standard_error) / denominator
    upper_bound = (center_adjusted_probability + z * adjusted_standard_error) / denominator
    wilson_ci = (float(max(0, lower_bound)), float(min(1, upper_bound)))

    # Plain floats keep the result uniform (no mix of int and np.float64) and
    # readable when printed.
    return {
        "accuracy": float(p_hat),
        "wald_ci": wald_ci,
        "wilson_ci": wilson_ci
    }


# Accuracy per element: share of rows scored 1 among the rows scored 0 or 1.
accuracy_results = {}
for element in elements:
    element_metrics = calculate_metrics(element, overall_df)

    # The Wald / Wilson intervals are printed for inspection only; they are not
    # stored. Here every row counts towards n, and anything that is not 1
    # (including a missing score) counts as incorrect.
    is_correct = overall_df[element] == 1
    element_m2 = compute_accuracy_with_ci(is_correct, confidence=0.95)
    print(f"Element {element}, ", element_m2)

    accuracy_results[element] = element_metrics["Accuracy"]

# Agreement per element: the same metric applied to the "<element>_Agreement" column.
accuracy_results_agreement = {
    element: calculate_metrics(element + "_Agreement", overall_df)["Accuracy"]
    for element in elements
}

# Summary table with one row per element, in the order of `elements`.
df = pd.DataFrame({
    "Lines": accuracy_results.keys(),
    "Accuracy": accuracy_results.values(),
    "Agreement": accuracy_results_agreement.values()
})

Element ObligationTypeClassification,  {'accuracy': 0.8387096774193549, 'wald_ci': (np.float64(0.7092372690930426), np.float64(0.9681820857456671)), 'wilson_ci': (np.float64(0.6736564604120674), np.float64(0.9290737555663471))}
Element Addressees-Value,  {'accuracy': 0.7419354838709677, 'wald_ci': (np.float64(0.5879022399247333), np.float64(0.8959687278172022)), 'wilson_ci': (np.float64(0.567538641905662), np.float64(0.8629829409360625))}
Element Addressees-Extraction Method,  {'accuracy': 0.7096774193548387, 'wald_ci': (np.float64(0.5498916325957811), np.float64(0.8694632061138964)), 'wilson_ci': (np.float64(0.5340767066925007), np.float64(0.8390419984369939))}
Element Objects-Value,  {'accuracy': 0.7741935483870968, 'wald_ci': (np.float64(0.6270096874670537), np.float64(0.9213774093071398)), 'wilson_ci': (np.float64(0.6018758161665795), np.float64(0.886048644387375))}
Element Objects-Extraction Method,  {'accuracy': 0.8387096774193549, 'wald_ci': (np.float64(0.7092372690930426), np.f

In [7]:
# Save the per-element accuracy/agreement summary next to the other result files.
# Reuses DATASETS_PATH instead of repeating the directory literal, so the output
# location follows the configuration cell (the resulting path is unchanged).
df.to_excel(DATASETS_PATH + "obligation_analysis_Overall_Accuracy_Agreement.xlsx", index=False)