In [8]:
import joblib
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm
import scipy.stats as stats
import random
import re
import utilities as ut
import modularised_utils as mut
import networkx as nx

from matplotlib.animation import FuncAnimation
from IPython.display import HTML

# Single seed for the notebook. It seeds NumPy's global RNG here and is also
# read as a module-level name by the omega-contamination evaluation further down.
seed = 0
np.random.seed(seed)

# White-grid seaborn theme for every figure.
sns.set_theme(style="whitegrid")

In [2]:
experiment = 'slc'
setting    = 'gaussian'

# Directory holding the cross-validated results for the chosen setting.
if setting == 'gaussian':
    path = f"data/{experiment}/results"

elif setting == 'empirical':
    path = f"data/{experiment}/results_empirical"

else:
    # Fail here rather than with a NameError on `path` in a later cell.
    raise ValueError(f"Unknown setting: {setting}")

saved_folds = joblib.load(f"data/{experiment}/cv_folds.pkl")

# Load the original data dictionary
all_data      = ut.load_all_data(experiment)

# Low-level (LL) and high-level (HL) model dictionaries; everything below is
# just a shorter name for one of their entries.
LLmodel       = all_data['LLmodel']
HLmodel       = all_data['HLmodel']
Dll_samples   = LLmodel['data']
Dhl_samples   = HLmodel['data']
ll_graph      = LLmodel['graph']
hl_graph      = HLmodel['graph']
I_ll_relevant = LLmodel['intervention_set']
# Same object as I_ll_relevant; both names are used by later cells.
ll_interventions = LLmodel['intervention_set']
hl_interventions = HLmodel['intervention_set']
omega         = all_data['abstraction_data']['omega']
ll_var_names  = list(ll_graph.nodes())
hl_var_names  = list(hl_graph.nodes())

Data loaded for 'slc'.


In [None]:
# Load dictionaries containing the results for each optimization method
if setting == 'gaussian':
    diroca_results = joblib.load(f"{path}/diroca_cv_results.pkl")
    gradca_results = joblib.load(f"{path}/gradca_cv_results.pkl")
    baryca_results = joblib.load(f"{path}/baryca_cv_results.pkl")

elif setting == 'empirical':
    diroca_results = joblib.load(f"{path}/diroca_cv_results_empirical.pkl")
    gradca_results = joblib.load(f"{path}/gradca_cv_results_empirical.pkl")
    baryca_results = joblib.load(f"{path}/baryca_cv_results_empirical.pkl")
    abslingam_results = joblib.load(f"{path}/abslingam_cv_results_empirical.pkl")

else:
    # An unknown setting used to skip every branch below and silently leave
    # `results_to_evaluate` empty.
    raise ValueError(f"Unknown setting: {setting}")


def create_diroca_label(run_id):
    """Parses a run_id and creates a simplified label if epsilon and delta are equal.

    The two numbers found in ``run_id`` are compared numerically, so
    ``'eps_1.0_delta_1'`` is simplified exactly like ``'eps_1_delta_1'``.
    Any run_id that does not contain exactly two numbers keeps its full name.
    """
    # Use regular expression to find numbers for epsilon and delta
    matches = re.findall(r'(\d+\.?\d*)', run_id)
    if len(matches) == 2:
        eps, delta = (float(m) for m in matches)
        if eps == delta:
            # Handle integer conversion for clean labels like '1' instead of '1.0'
            val = int(eps) if eps.is_integer() else eps
            return f"DIROCA (eps_delta_{val})"
    # Otherwise, or if parsing fails, use the full original name
    return f"DIROCA ({run_id})"


def _split_runs_by_fold(cv_results, make_label):
    """Turn ``{fold: {run: data}}`` into ``{make_label(run): {fold: {run: data}}}``.

    The run ids are taken from the first fold; a fold that lacks a given run
    is simply left out of that run's dictionary. Returns an empty dict when
    ``cv_results`` is empty.
    """
    per_method = {}
    if not cv_results:
        return per_method
    first_fold_key = next(iter(cv_results))
    for run_id in cv_results[first_fold_key]:
        per_method[make_label(run_id)] = {
            fold_key: {run_id: fold_results[run_id]}
            for fold_key, fold_results in cv_results.items()
            if run_id in fold_results
        }
    return per_method


results_to_evaluate = {}

# Insertion order is kept as before because it drives the printed listing:
# empirical -> Abs-LiNGAM, DIROCA, GradCA, BARYCA; gaussian -> GradCA, BARYCA, DIROCA.
if setting == 'empirical':
    results_to_evaluate.update(
        _split_runs_by_fold(abslingam_results, lambda style: f"Abs-LiNGAM ({style})")
    )
    # Unpack each DIROCA hyperparameter run with the clean label
    results_to_evaluate.update(_split_runs_by_fold(diroca_results, create_diroca_label))
    results_to_evaluate['GradCA'] = gradca_results
    results_to_evaluate['BARYCA'] = baryca_results

elif setting == 'gaussian':
    results_to_evaluate['GradCA'] = gradca_results
    results_to_evaluate['BARYCA'] = baryca_results
    # create a separate entry for each DIROCA run, named after the raw run id
    results_to_evaluate.update(
        _split_runs_by_fold(diroca_results, lambda run_id: f"DIROCA ({run_id})")
    )

label_map_gaussian = {
                        'DIROCA (eps_delta_0.111)': 'DiRoCA_star',
                        'DIROCA (eps_delta_1)': 'DIROCA_1',
                        'DIROCA (eps_delta_2)': 'DIROCA_2',
                        'DIROCA (eps_delta_4)': 'DIROCA_4',
                        'DIROCA (eps_delta_8)': 'DIROCA_8',
                        'GradCA': 'GradCA',
                        'BARYCA': 'BARYCA'
                    }

label_map_empirical = {
                        'DIROCA (eps_0.328_delta_0.107)': 'DiRoCA_star',
                        'DIROCA (eps_delta_1)': 'DIROCA_1',
                        'DIROCA (eps_delta_2)': 'DIROCA_2',
                        'DIROCA (eps_delta_4)': 'DIROCA_4',
                        'DIROCA (eps_delta_8)': 'DIROCA_8',
                        'GradCA': 'GradCA',
                        'BARYCA': 'BARYCA',
                        'Abs-LiNGAM (Perfect)': 'Abslin_p',
                        'Abs-LiNGAM (Noisy)': 'Abslin_n'
                    }

# Rename to the canonical internal keys; names without a mapping are kept as-is.
if setting == 'empirical':
    results_to_evaluate = {label_map_empirical.get(key, key): value for key, value in results_to_evaluate.items()}

elif setting == 'gaussian':
    results_to_evaluate = {label_map_gaussian.get(key, key): value for key, value in results_to_evaluate.items()}

print("\nMethods available for evaluation:")
for key in results_to_evaluate.keys():
    print(f"  - {key}")

# F-misspecification

In [4]:
def apply_structural_contamination(
    linear_data,
    graph,
    coeffs,
    noise,
    nonlinear_func=np.sin,
    k=1
):
    """
    Regenerate SCM samples using a purely nonlinear mechanism for every variable.

    Variables are visited in topological order. A variable without parents is
    set to its exogenous noise; every other variable is set to

        k * sum_over_parents(nonlinear_func(parent)) + noise

    where the parent values are the ones already regenerated by this function.

    NOTE(review): the linear term (parent values times the ``coeffs`` weights)
    is not part of the output. The previous version computed it and then
    discarded it; that dead computation was removed without changing any
    result. If the intended model is ``linear + k * nonlinear + noise``, the
    term has to be added back explicitly -- confirm with the author.

    Args:
        linear_data (np.ndarray): Original SCM output. Only its first
            dimension (number of samples) is used.
        graph: DAG exposing ``predecessors(node)`` and accepted by
            ``nx.topological_sort``.
        coeffs (dict): Edge weights {(parent, child): weight}. Currently
            unused; kept so existing callers keep working.
        noise (np.ndarray): Exogenous noise, n_samples x n_variables. The
            result has the same shape and dtype.
        nonlinear_func (callable): Nonlinear function applied element-wise to
            the parent values.
        k (float): Multiplier of the nonlinear term. No range is enforced.

    Returns:
        np.ndarray: Contaminated data, same shape as ``noise``.
    """
    n_samples = linear_data.shape[0]
    topo_order = list(nx.topological_sort(graph))
    # NOTE(review): column j of `noise` and of the output is taken to be the
    # j-th variable in topological order -- verify this matches the column
    # order of the stored data.
    var_index = {var: idx for idx, var in enumerate(topo_order)}

    contaminated = np.zeros_like(noise)

    for var in topo_order:
        parents = list(graph.predecessors(var))

        if parents:
            parent_vals = contaminated[:, [var_index[p] for p in parents]]
            nonlinear_part = k * nonlinear_func(parent_vals).sum(axis=1)
        else:
            nonlinear_part = np.zeros(n_samples)

        var_idx = var_index[var]
        contaminated[:, var_idx] = nonlinear_part + noise[:, var_idx]

    return contaminated

def sin(x):
    """Return ``np.sin(x)``; named wrapper so it can be passed as ``nonlinear_func``."""
    return np.sin(x)

def tanh(x):
    """Return ``np.tanh(x)``; named wrapper so it can be passed as ``nonlinear_func``."""
    return np.tanh(x)

def square(x):
    """Return ``x**2``; named wrapper so it can be passed as ``nonlinear_func``."""
    return x**2

In [5]:
# Nonlinearity injected by apply_structural_contamination in the next cell.
nonlinear_func = sin
# Number of times the whole fold/method evaluation is repeated.
num_trials = 100

In [6]:
# One record per (trial, fold, method, run): the abstraction error of the
# learned T on contaminated test data, averaged over the LL interventions.
f_spec_records = []
for trial in range(num_trials):
    for i, fold_info in enumerate(saved_folds):
        for method_name, results_dict in results_to_evaluate.items():
            fold_results = results_dict.get(f'fold_{i}', {})
            for run_key, run_data in fold_results.items():
                T_learned, test_indices = run_data['T_matrix'], run_data['test_indices']

                errors_per_intervention = []
                for iota in I_ll_relevant:
                    # Clean test samples and the matching exogenous noise for
                    # the LL intervention and its HL image under omega.
                    Dll_clean = Dll_samples[iota][test_indices]
                    Dhl_clean = Dhl_samples[omega[iota]][test_indices]
                    noise_ll = LLmodel['noise'][iota][test_indices]
                    noise_hl = HLmodel['noise'][omega[iota]][test_indices]

                    Dll_cont = apply_structural_contamination(
                        linear_data=Dll_clean, graph=ll_graph, coeffs=LLmodel['coeffs'],
                        noise=noise_ll, nonlinear_func=nonlinear_func,
                    )
                    Dhl_cont = apply_structural_contamination(
                        linear_data=Dhl_clean, graph=hl_graph, coeffs=HLmodel['coeffs'],
                        noise=noise_hl, nonlinear_func=nonlinear_func,
                    )

                    # The error measure depends on the setting.
                    if setting not in ('gaussian', 'empirical'):
                        raise ValueError(f"Unknown setting: {setting}")
                    if setting == 'gaussian':
                        error = ut.calculate_abstraction_error(T_learned, Dll_cont, Dhl_cont)
                    else:
                        error = ut.calculate_empirical_error(T_learned, Dll_cont, Dhl_cont)

                    # NaN errors are dropped from the average.
                    if np.isnan(error):
                        continue
                    errors_per_intervention.append(error)

                if errors_per_intervention:
                    avg_error = np.mean(errors_per_intervention)
                else:
                    avg_error = np.nan

                f_spec_records.append(
                    {'method': method_name, 'trial': trial, 'fold': i, 'error': avg_error}
                )

In [None]:
# Compile results into a DataFrame
f_spec_df = pd.DataFrame(f_spec_records)

print("\n--- F-Misspecification Evaluation Complete ---")
for banner_line in (
    "="*65,
    "Overall Performance (Averaged Across All Nonlinearity Strengths)",
    "="*65,
    f"{'Method/Run':<35} | {'Mean ± Std'}",
    "="*65,
):
    print(banner_line)

# Per-method mean / std / count of the error. Note that 'sem' and 'ci95' are
# plain copies of the standard deviation (no division by sqrt(count)), so the
# printed "±" value is the std, as the table header says.
summary = (
    f_spec_df.groupby('method')['error']
    .agg(['mean', 'std', 'count'])
    .assign(sem=lambda tbl: tbl['std'], ci95=lambda tbl: tbl['sem'])
)

for method_name, row in summary.sort_values('mean').iterrows():
    print(f"{method_name:<35} | {row['mean']:.4f} ± {row['ci95']:.4f}")


In [9]:
# Raw method name -> canonical internal key. The entries shared by both
# settings are written once and spliced into each map.
_shared_label_map = {
    'DIROCA (eps_delta_1)': 'DIROCA_1',
    'DIROCA (eps_delta_2)': 'DIROCA_2',
    'DIROCA (eps_delta_4)': 'DIROCA_4',
    'DIROCA (eps_delta_8)': 'DIROCA_8',
    'GradCA': 'GradCA',
    'BARYCA': 'BARYCA',
}
label_map_empirical = {
    'DIROCA (eps_0.328_delta_0.107)': 'DiRoCA_star',
    **_shared_label_map,
    'Abs-LiNGAM (Perfect)': 'Abslin_p',
    'Abs-LiNGAM (Noisy)': 'Abslin_n',
}
label_map_gaussian = {
    'DIROCA (eps_delta_0.111)': 'DiRoCA_star',
    **_shared_label_map,
}

# Canonical internal key -> mathtext label shown in figures.
print_label_map  = {
    'DiRoCA_star':  r'DiRoCA$_{\epsilon_\ell^*, \epsilon_h^*}$',
    'DIROCA_1':     r'DiRoCA$_{1,1}$',
    'DIROCA_2':     r'DiRoCA$_{2,2}$',
    'DIROCA_4':     r'DiRoCA$_{4,4}$',
    'DIROCA_8':     r'DiRoCA$_{8,8}$',
    'GradCA':       r'GRAD$_{(\tau, \omega)}$',
    'BARYCA':       r'BARY$_{(\tau, \omega)}$',
    'Abslin_p':     r'AbsLin$_{\text{p}}$',
    'Abslin_n':     r'AbsLin$_{\text{n}}$'
}

# Serif / Computer-Modern look using matplotlib's own mathtext (no LaTeX run).
_rc_overrides = {
    "text.usetex": False,
    "font.family": "serif",
    "font.serif": ["Computer Modern Roman", "CMU Serif", "DejaVu Serif"],
    "mathtext.fontset": "cm",
    "mathtext.rm": "serif"
}
plt.rcParams.update(_rc_overrides)

# Plot / legend order of the methods, and the same order as display labels.
methods_to_plot = ['DiRoCA_star', 'DIROCA_1', 'DIROCA_2', 'DIROCA_4', 'DIROCA_8', 'GradCA', 'BARYCA', 'Abslin_p', 'Abslin_n']
display_names = list(map(print_label_map.__getitem__, methods_to_plot))

# Colours are declared per internal key and re-keyed by display label, which
# is what the seaborn `palette` argument is matched against.
_method_colors = {
    'DiRoCA_star': '#1f77b4',
    'DIROCA_1': 'gold',
    'DIROCA_2': 'darkorange',
    'DIROCA_4': 'lightskyblue',
    'DIROCA_8': 'violet',
    'GradCA': '#2ca02c',
    'BARYCA': '#d62728',
    'Abslin_p': '#9467bd',
    'Abslin_n': '#8c564b',
}
color_map = {print_label_map[key]: shade for key, shade in _method_colors.items()}

def _create_diroca_label(run_id):
    """If epsilon==delta in run_id string, compress to eps_delta_v form."""
    matches = re.findall(r'(\d+\.?\d*)', run_id)
    if len(matches) == 2:
        eps, delta = matches
        if eps == delta:
            val = int(float(eps)) if float(eps).is_integer() else float(eps)
            return f"DIROCA (eps_delta_{val})"
    return f"DIROCA ({run_id})"

def build_results_to_evaluate(experiment, setting):
    """Rebuild results_to_evaluate for a given (experiment, setting).

    Loads the cross-validation result pickles for the setting, splits the
    DIROCA (and, for the empirical setting, Abs-LiNGAM) results into one entry
    per run, and renames every entry to its canonical internal key.

    Args:
        experiment (str): Experiment folder name under ``data/``.
        setting (str): ``'gaussian'`` or ``'empirical'``.

    Returns:
        dict: ``{method_key: {fold_key: {run_id: run_data}}}``, inserted in the
        order DIROCA runs, GradCA, BARYCA, Abs-LiNGAM styles.

    Raises:
        ValueError: If ``setting`` is neither ``'gaussian'`` nor ``'empirical'``.
            Previously any other value loaded the empirical files but skipped
            the empirical labelling and the Abs-LiNGAM entries.
    """
    if setting == 'gaussian':
        path, suffix, label_map = f"data/{experiment}/results", "", label_map_gaussian
    elif setting == 'empirical':
        path, suffix, label_map = f"data/{experiment}/results_empirical", "_empirical", label_map_empirical
    else:
        raise ValueError(f"Unknown setting: {setting}")

    diroca_results = joblib.load(f"{path}/diroca_cv_results{suffix}.pkl")
    gradca_results = joblib.load(f"{path}/gradca_cv_results{suffix}.pkl")
    baryca_results = joblib.load(f"{path}/baryca_cv_results{suffix}.pkl")
    abslingam_results = None
    if setting == 'empirical':
        abslingam_results = joblib.load(f"{path}/abslingam_cv_results{suffix}.pkl")

    def _split_by_run(cv_results, make_label):
        """{fold: {run: data}} -> {make_label(run): {fold: {run: data}}}; run ids come from the first fold."""
        per_method = {}
        if not cv_results:
            return per_method
        first_fold_key = next(iter(cv_results))
        for run_id in cv_results[first_fold_key]:
            per_method[make_label(run_id)] = {
                fold_key: {run_id: fold_res[run_id]}
                for fold_key, fold_res in cv_results.items()
                if run_id in fold_res
            }
        return per_method

    results_to_evaluate = {}

    # DIROCA variants: only the empirical run ids go through label compression.
    if setting == 'empirical':
        diroca_namer = _create_diroca_label
    else:
        diroca_namer = lambda run_id: f"DIROCA ({run_id})"
    results_to_evaluate.update(_split_by_run(diroca_results, diroca_namer))

    # Baselines
    results_to_evaluate['GradCA'] = gradca_results
    results_to_evaluate['BARYCA'] = baryca_results

    # Abs-LiNGAM only for empirical (if present)
    if setting == 'empirical':
        results_to_evaluate.update(
            _split_by_run(abslingam_results, lambda style: f"Abs-LiNGAM ({style})")
        )

    # Map to canonical internal keys; names without a mapping are kept as-is.
    return {label_map.get(k, k): v for k, v in results_to_evaluate.items()}

def run_k_sweep(experiment, setting, k_values, num_trials=5, strength=1, scaled=True, nonlinear_func=None):
    """Compute f_spec_df for (experiment, setting) over k_values.

    For every k, trial, fold, method and run, the test samples of each
    low-level intervention (and of its high-level image under omega) are
    regenerated with ``apply_structural_contamination`` and the abstraction
    error of the learned ``T_matrix`` is averaged over the interventions.

    Args:
        experiment (str): Experiment folder name under ``data/``.
        setting (str): ``'gaussian'`` or ``'empirical'``; selects the result
            files and the error function.
        k_values (iterable): Values passed as ``k`` to the contamination.
        num_trials (int): Number of repetitions of the whole evaluation per k.
        strength: Unused; kept for backward compatibility.
        scaled: Unused; kept for backward compatibility.
        nonlinear_func (callable | None): Nonlinearity for the contamination.
            ``None`` falls back to ``np.sin`` (the default of
            ``apply_structural_contamination``); previously ``None`` was
            forwarded and called, which raised a TypeError.

    Returns:
        tuple: ``(df, keep)``. ``df`` has columns ``method, k_value, trial,
        fold, error, display_name`` restricted to methods in
        ``methods_to_plot``; ``keep`` lists those methods in plot order. Both
        are empty when nothing was evaluated.

    Raises:
        ValueError: If ``setting`` is neither ``'gaussian'`` nor ``'empirical'``.
    """
    # Pick the error measure once; an unknown setting used to be treated as
    # 'empirical' here.
    if setting == 'gaussian':
        error_fn = ut.calculate_abstraction_error
    elif setting == 'empirical':
        error_fn = ut.calculate_empirical_error
    else:
        raise ValueError(f"Unknown setting: {setting}")

    if nonlinear_func is None:
        nonlinear_func = np.sin

    # Load folds & data
    folds_path = f"data/{experiment}/cv_folds.pkl"
    saved_folds = joblib.load(folds_path)
    all_data = ut.load_all_data(experiment)

    LLmodel       = all_data['LLmodel']
    HLmodel       = all_data['HLmodel']
    Dll_samples   = LLmodel['data']
    Dhl_samples   = HLmodel['data']
    ll_graph      = LLmodel['graph']
    hl_graph      = HLmodel['graph']
    I_ll_relevant = LLmodel['intervention_set']
    omega         = all_data['abstraction_data']['omega']

    results_to_evaluate = build_results_to_evaluate(experiment, setting)

    # Compute records
    records = []
    for k in k_values:
        for trial in range(num_trials):
            for i, _fold_info in enumerate(saved_folds):
                for method_name, results_dict in results_to_evaluate.items():
                    fold_results = results_dict.get(f'fold_{i}', {})
                    for run_data in fold_results.values():
                        T_learned = run_data['T_matrix']
                        test_indices = run_data['test_indices']

                        errors = []
                        for iota in I_ll_relevant:
                            Dll_clean = Dll_samples[iota][test_indices]
                            Dhl_clean = Dhl_samples[omega[iota]][test_indices]

                            noise_ll = LLmodel['noise'][iota][test_indices]
                            noise_hl = HLmodel['noise'][omega[iota]][test_indices]

                            Dll_cont = apply_structural_contamination(
                                linear_data=Dll_clean,
                                graph=ll_graph,
                                coeffs=LLmodel['coeffs'],
                                noise=noise_ll,
                                nonlinear_func=nonlinear_func,
                                k=k
                            )
                            Dhl_cont = apply_structural_contamination(
                                linear_data=Dhl_clean,
                                graph=hl_graph,
                                coeffs=HLmodel['coeffs'],
                                noise=noise_hl,
                                nonlinear_func=nonlinear_func,
                                k=k
                            )

                            err = error_fn(T_learned, Dll_cont, Dhl_cont)
                            # NaN errors are dropped from the average.
                            if not np.isnan(err):
                                errors.append(err)

                        avg_error = np.mean(errors) if errors else np.nan
                        records.append({
                            'method': method_name,
                            'k_value': k,
                            'trial': trial,
                            'fold': i,
                            'error': avg_error
                        })

    # Explicit columns keep the frame well-formed when `records` is empty;
    # without them `df['method']` raised a KeyError.
    df = pd.DataFrame(records, columns=['method', 'k_value', 'trial', 'fold', 'error'])
    present_methods = set(df['method'])
    keep = [m for m in methods_to_plot if m in present_methods]
    df = df[df['method'].isin(keep)].copy()
    df['display_name'] = df['method'].map(print_label_map)
    return df, keep


In [None]:
# Sweep settings for the 2x2 figure. Note that `num_trials` and
# `nonlinear_func` rebind the notebook-level names used by the earlier
# F-misspecification cells.
k_values = np.linspace(0, 100, 15)
num_trials = 1
nonlinear_func = tanh  

fig, axes = plt.subplots(2, 2, figsize=(18, 12), sharey=False)
axes = np.array(axes).reshape(2, 2)

# (row, col) -> (experiment, setting)
panels = [
    ((0, 0), ('slc', 'gaussian'),   "Gaussian"),
    ((0, 1), ('slc', 'empirical'),  "Empirical"),
    ((1, 0), ('lilucas', 'gaussian'),  "Gaussian"),
    ((1, 1), ('lilucas', 'empirical'), "Empirical"),
]

present_disp_names_global = []

# The loop variables are deliberately NOT called `experiment` / `setting`:
# using those names overwrote the notebook-level values, so cells below this
# one ran with setting == 'empirical' on the results loaded for the original
# setting.
for (r, c), (panel_experiment, panel_setting), title_str in panels:
    ax = axes[r, c]

    df_panel, keep_methods = run_k_sweep(
        panel_experiment, panel_setting, k_values,
        num_trials=num_trials, nonlinear_func=nonlinear_func
    )

    disp_order = [print_label_map[m] for m in methods_to_plot if m in keep_methods]

    if not df_panel.empty and disp_order:
        sns.lineplot(
            data=df_panel,
            x='k_value',
            y='error',
            hue='display_name',
            hue_order=disp_order,
            palette=color_map,
            marker='o',
            linewidth=2.5,
            markersize=8,
            errorbar='sd',
            ax=ax,
            legend=False  
        )

        present_disp_names_global.extend(disp_order)

    ax.set_title(title_str, fontsize=30)
    ax.set_xlabel(r'$k$', fontsize=32)
    # Only the left column carries the y-axis label.
    if c == 0:
        ax.set_ylabel('Abstraction Error', fontsize=32)
    else:
        ax.set_ylabel('')
    ax.tick_params(axis='both', labelsize=18)
    ax.grid(True, linestyle='--', alpha=0.7)


# De-duplicate the labels that were actually drawn, keeping the canonical
# display order; the set is built once instead of once per label.
drawn_names = set(present_disp_names_global)
present_disp_names_global = [dn for dn in display_names if dn in drawn_names]

# Per-panel legends are disabled, so build one shared legend from proxy lines.
ordered_handles = [
    plt.Line2D([], [], linestyle='-', linewidth=6,
               label=dn, color=color_map.get(dn, '#000000'))
    for dn in present_disp_names_global
]

fig.legend(
    ordered_handles,
    present_disp_names_global,
    loc='lower center',
    ncol=min(6, len(present_disp_names_global)),
    fontsize=20,
    frameon=False
)

# Leave room at the bottom for the shared legend.
fig.tight_layout(rect=[0, 0.12, 1, 0.97])
plt.show()

# ω-misspecification

In [4]:
def contaminate_omega_semantic(
    original_omega,
    ll_interventions,
    hl_interventions,
    num_misalignments,
    seed=None,
    delta=None,
    return_changed=False,
):
    """Return a copy of ``original_omega`` with some LL interventions re-mapped.

    Up to ``num_misalignments`` non-None keys of ``original_omega`` are sampled
    with ``random.Random(seed)``. Each one is re-pointed to a different
    high-level intervention, chosen at random among those whose number of
    intervened variables (``len(x.vv())``) equals that of the low-level
    intervention. If none exists, the candidates are the high-level
    interventions with the smallest difference in that number; when ``delta``
    is given and this smallest difference exceeds it, the key is left as is.

    ``ll_interventions`` is accepted but not used. ``original_omega`` is not
    modified.

    Returns:
        The contaminated mapping, or ``(mapping, number_of_changed_keys)``
        when ``return_changed`` is true.
    """
    rng = random.Random(seed)
    corrupted = dict(original_omega)

    # The None key is never a candidate for corruption.
    pool = [key for key in original_omega if key is not None]
    victims = rng.sample(pool, k=min(num_misalignments, len(pool)))

    n_changed = 0
    for ll_key in victims:
        true_target = original_omega[ll_key]
        size = len(ll_key.vv())

        # Every non-None HL intervention other than the true target.
        alternatives = [
            hl for hl in hl_interventions
            if hl is not None and hl != true_target
        ]

        # 1) prefer alternatives of exactly the same size
        candidates = [hl for hl in alternatives if len(hl.vv()) == size]

        # 2) otherwise fall back to the closest size, subject to the delta cap
        if not candidates:
            if not alternatives:
                continue
            gaps = [abs(len(hl.vv()) - size) for hl in alternatives]
            smallest_gap = min(gaps)
            if delta is not None and smallest_gap > delta:
                continue
            candidates = [hl for hl, gap in zip(alternatives, gaps) if gap == smallest_gap]

        replacement = rng.choice(candidates)
        if replacement != true_target:
            corrupted[ll_key] = replacement
            n_changed += 1

    return (corrupted, n_changed) if return_changed else corrupted


def evaluate_omega_contamination(original_omega, ll_interventions, hl_interventions, total_interventions, results_to_evaluate, saved_folds, 
                                Dll_samples, Dhl_samples, setting, num_trials, delta, base_seed=None):
    """Evaluates omega contamination across different levels.

    For each trial a contaminated copy of ``original_omega`` is drawn with
    ``contaminate_omega_semantic``; each learned ``T_matrix`` is then scored on
    its test indices, pairing every low-level intervention with the high-level
    intervention the contaminated map assigns to it. Interventions that are
    None, or that map to None, are skipped, and NaN errors are dropped from
    the per-run average.

    Only one contamination level is evaluated: ``int(total_interventions * 1.0)``.

    Args:
        original_omega (dict): Clean LL -> HL intervention map.
        ll_interventions: Low-level interventions to evaluate.
        hl_interventions: High-level interventions available as new targets.
        total_interventions (int): Number of re-mappable LL interventions.
        results_to_evaluate (dict): ``{method: {fold_key: {run_id: run_data}}}``.
        saved_folds: Iterable of folds; only its length/order is used.
        Dll_samples, Dhl_samples (dict): Samples per intervention.
        setting (str): ``'gaussian'`` or ``'empirical'``.
        num_trials (int): Number of independently contaminated maps.
        delta: Cap forwarded to ``contaminate_omega_semantic``.
        base_seed (int | None): Seed of trial 0; trial ``t`` uses
            ``base_seed + t``. ``None`` falls back to the notebook-level
            ``seed``. Previously every trial used that same seed, so all
            trials drew the identical contaminated map.

    Returns:
        list[dict]: One record per (level, trial, fold, method, run) with keys
        ``method, num_misalignments, trial, fold, error``.

    Raises:
        ValueError: If ``setting`` is neither ``'gaussian'`` nor ``'empirical'``
            (previously this left ``error`` unbound).
    """
    if setting == 'gaussian':
        error_fn = ut.calculate_abstraction_error
    elif setting == 'empirical':
        error_fn = ut.calculate_empirical_error
    else:
        raise ValueError(f"Unknown setting: {setting}")

    if base_seed is None:
        base_seed = seed  # notebook-level seed

    contamination_levels = [int(total_interventions * 1.0)]
    omega_contamination_records = []
    
    for num_misalignments in contamination_levels:
        for trial in range(num_trials):
            # A distinct, reproducible seed per trial so trials actually differ.
            contaminated_omega = contaminate_omega_semantic(
                original_omega, ll_interventions, hl_interventions, num_misalignments, 
                seed=base_seed + trial, delta=delta, return_changed=False
            )
            
            for fold_id, _fold_info in enumerate(saved_folds):
                for method_name, results_dict in results_to_evaluate.items():
                    fold_results = results_dict.get(f'fold_{fold_id}', {})
                    
                    for run_data in fold_results.values():
                        T_learned = run_data['T_matrix']
                        test_indices = run_data['test_indices']
                        
                        errors_per_intervention = []
                        
                        for ll_intervention in ll_interventions:
                            if ll_intervention is None:
                                continue
                                
                            contaminated_hl_intervention = contaminated_omega[ll_intervention]
                            if contaminated_hl_intervention is None:
                                continue
                            
                            Dll_test = Dll_samples[ll_intervention][test_indices]
                            Dhl_test = Dhl_samples[contaminated_hl_intervention][test_indices]
                            
                            error = error_fn(T_learned, Dll_test, Dhl_test)
                            
                            if not np.isnan(error):
                                errors_per_intervention.append(error)
                        
                        avg_error = np.mean(errors_per_intervention) if errors_per_intervention else np.nan
                        
                        omega_contamination_records.append({
                            'method': method_name,
                            'num_misalignments': num_misalignments,
                            'trial': trial,
                            'fold': fold_id,
                            'error': avg_error
                        })
    
    return omega_contamination_records

In [5]:
# Number of keys of omega that are not None, i.e. the interventions that can be re-mapped
total_interventions = sum(ll is not None for ll in omega)
# Caps on the complexity gap passed to the contamination, one evaluation per value
deltas = [1]
# Number of contaminated maps drawn per evaluation
num_trials = 100

In [None]:
# One evaluation and one printed table per delta cap.
for delta in deltas:
    omega_contamination_records = evaluate_omega_contamination(
        original_omega=omega,
        ll_interventions=ll_interventions,
        hl_interventions=hl_interventions,
        total_interventions=total_interventions,
        results_to_evaluate=results_to_evaluate,
        saved_folds=saved_folds,
        Dll_samples=Dll_samples,
        Dhl_samples=Dhl_samples,
        setting=setting,
        num_trials=num_trials,
        delta=delta,
    )
    omega_contamination_df = pd.DataFrame(omega_contamination_records)

    print("=== OMEGA CONTAMINATION EVALUATION RESULTS ===")
    print("="*60)

    # Overall performance summary: per-method mean and std of the error,
    # rounded to two decimals and then ordered by the (rounded) mean.
    overall_performance = (
        omega_contamination_df.groupby('method')['error']
        .agg(['mean', 'std'])
        .round(2)
        .sort_values('mean')
        .rename(columns={'mean': 'Mean Error', 'std': 'Std Error'})
    )

    print("Overall Performance (All Contamination Levels):")
    print("="*60)
    for method, row in overall_performance.iterrows():
        print(f"{method:<15} | {row['Mean Error']:6.2f} ± {row['Std Error']:5.2f}")