In [119]:
import joblib
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm
import scipy.stats as stats
import random
import re
import utilities as ut
import modularised_utils as mut

from matplotlib.animation import FuncAnimation
from IPython.display import HTML

sns.set_theme(style="whitegrid")
seed = 0
np.random.seed(seed)

In [120]:
experiment = 'lilucas'
setting    = 'empirical'

if setting == 'gaussian':
    path = f"data/{experiment}/results"

elif setting == 'empirical':
    path = f"data/{experiment}/results_empirical"

saved_folds = joblib.load(f"data/{experiment}/cv_folds.pkl")

# Load the original data dictionary
all_data      = ut.load_all_data(experiment)

Dll_samples   = all_data['LLmodel']['data']
Dhl_samples   = all_data['HLmodel']['data']
I_ll_relevant = all_data['LLmodel']['intervention_set']
omega         = all_data['abstraction_data']['omega']
ll_var_names  = list(all_data['LLmodel']['graph'].nodes())
hl_var_names  = list(all_data['HLmodel']['graph'].nodes())

Data loaded for 'lilucas'.


In [121]:
# Load dictionaries containing the results for each optimization method
if setting == 'gaussian':
    diroca_results = joblib.load(f"{path}/diroca_cv_results.pkl")
    gradca_results = joblib.load(f"{path}/gradca_cv_results.pkl")
    baryca_results = joblib.load(f"{path}/baryca_cv_results.pkl")

elif setting == 'empirical':
    diroca_results = joblib.load(f"{path}/diroca_cv_results_empirical.pkl")
    gradca_results = joblib.load(f"{path}/gradca_cv_results_empirical.pkl")
    baryca_results = joblib.load(f"{path}/baryca_cv_results_empirical.pkl")
    abslingam_results = joblib.load(f"{path}/abslingam_cv_results_empirical.pkl")

results_to_evaluate = {}

if setting == 'empirical':
    if abslingam_results:
        first_fold_key = list(abslingam_results.keys())[0]
        for style in abslingam_results[first_fold_key].keys():
            method_name = f"Abs-LiNGAM ({style})"
            new_abslingam_dict = {}
            for fold_key, fold_results in abslingam_results.items():
                if style in fold_results:
                    new_abslingam_dict[fold_key] = {style: fold_results[style]}
            results_to_evaluate[method_name] = new_abslingam_dict
    
    def create_diroca_label(run_id):
        """Parses a run_id and creates a simplified label if epsilon and delta are equal."""
        # Use regular expression to find numbers for epsilon and delta
        matches = re.findall(r'(\d+\.?\d*)', run_id)
        if len(matches) == 2:
            eps, delta = matches
            # If they are the same, use the simplified format
            if eps == delta:
                # Handle integer conversion for clean labels like '1' instead of '1.0'
                val = int(float(eps)) if float(eps).is_integer() else float(eps)
                return f"DIROCA (eps_delta_{val})"
        # Otherwise, or if parsing fails, use the full original name
        return f"DIROCA ({run_id})"

    # Unpack each DIROCA hyperparameter run with the new, clean label
    if diroca_results:
        first_fold_key = list(diroca_results.keys())[0]
        for run_id in diroca_results[first_fold_key].keys():
            method_name = create_diroca_label(run_id) # Use the new helper to create the name
            new_diroca_dict = {}
            for fold_key, fold_results in diroca_results.items():
                if run_id in fold_results:
                    new_diroca_dict[fold_key] = {run_id: fold_results[run_id]}
            results_to_evaluate[method_name] = new_diroca_dict

    results_to_evaluate['GradCA'] = gradca_results
    results_to_evaluate['BARYCA'] = baryca_results

elif setting == 'gaussian':
    results_to_evaluate['GradCA'] = gradca_results
    results_to_evaluate['BARYCA'] = baryca_results

    if diroca_results:
        first_fold_key = list(diroca_results.keys())[0]
        diroca_run_ids = list(diroca_results[first_fold_key].keys())

        # create a separate entry for each DIROCA run
        for run_id in diroca_run_ids:
            method_name = f"DIROCA ({run_id})"
            
            new_diroca_dict = {}
            for fold_key, fold_results in diroca_results.items():
                # For each fold grab the data for the current run_id
                if run_id in fold_results:
                    new_diroca_dict[fold_key] = {run_id: fold_results[run_id]}
            
            results_to_evaluate[method_name] = new_diroca_dict

label_map_gaussian = {
                        'DIROCA (eps_delta_0.111)': 'DiRoCA_star',
                        'DIROCA (eps_delta_1)': 'DIROCA_1',
                        'DIROCA (eps_delta_2)': 'DIROCA_2',
                        'DIROCA (eps_delta_4)': 'DIROCA_4',
                        'DIROCA (eps_delta_8)': 'DIROCA_8',
                        'GradCA': 'GradCA',
                        'BARYCA': 'BARYCA'
                    }

label_map_empirical = {
                        'DIROCA (eps_0.328_delta_0.107)': 'DiRoCA_star',
                        'DIROCA (eps_delta_1)': 'DIROCA_1',
                        'DIROCA (eps_delta_2)': 'DIROCA_2',
                        'DIROCA (eps_delta_4)': 'DIROCA_4',
                        'DIROCA (eps_delta_8)': 'DIROCA_8',
                        'GradCA': 'GradCA',
                        'BARYCA': 'BARYCA',
                        'Abs-LiNGAM (Perfect)': 'Abslin_p',
                        'Abs-LiNGAM (Noisy)': 'Abslin_n'
                    }

if setting == 'empirical':
    results_to_evaluate = {label_map_empirical.get(key, key): value for key, value in results_to_evaluate.items()}

elif setting == 'gaussian':
    results_to_evaluate = {label_map_gaussian.get(key, key): value for key, value in results_to_evaluate.items()}

print("\nMethods available for evaluation:")
for key in results_to_evaluate.keys():
    print(f"  - {key}")


Methods available for evaluation:
  - Abslin_p
  - Abslin_n
  - DiRoCA_star
  - DIROCA_1
  - DIROCA_2
  - DIROCA_4
  - DIROCA_8
  - GradCA
  - BARYCA


## ω contamination

In [122]:
def contaminate_omega_map(original_omega, num_misalignments):
    
    """Randomly re-wires a subset of entries in the omega map."""
    
    omega_keys = [k for k in original_omega.keys() if k is not None]
    omega_vals = [original_omega[k] for k in omega_keys if original_omega[k] is not None]

    contaminated_omega = original_omega.copy()
    
    # Bound the number of misalignments by the number of eligible keys.
    num_to_corrupt = min(num_misalignments, len(omega_keys))
    # Randomly select keys to corrupt.
    to_corrupt = random.sample(omega_keys, k=num_to_corrupt)
    
    # Create a random permutation of available targets (ensuring change)
    # Use the set of targets from eligible keys.
    all_targets = list(set(omega_vals))

    for key in to_corrupt:
        original_target = original_omega[key]
        # Only corrupt if there's an alternative available.
        available_targets = [t for t in all_targets if t != original_target]
        if available_targets:
            new_target = random.choice(available_targets)
            contaminated_omega[key] = new_target
            
    return contaminated_omega

In [139]:
results_to_evaluate.keys()

dict_keys(['Abslin_p', 'Abslin_n', 'DiRoCA_star', 'DIROCA_1', 'DIROCA_2', 'DIROCA_4', 'DIROCA_8', 'GradCA', 'BARYCA'])

In [140]:
max_misalignments = len(I_ll_relevant) 
misalignment_levels = [max_misalignments] #range(0, max_misalignments)

# Number of random contaminations to average over for each setting
num_trials = 20

omega_spec_records = []
print("Omega-misspecification evaluation")

for num_misalignments in tqdm(misalignment_levels, desc="Misalignment Level"):
    for trial in range(num_trials):
        rng_seed = 10_000 * num_misalignments + trial
        random.seed(rng_seed)
        np.random.seed(rng_seed % (2**32 - 1))  

        # Create a new scrambled omega map
        omega_cont = contaminate_omega_map(omega, num_misalignments)
        
        for i, fold_info in enumerate(saved_folds):
            for method_name, results_dict in results_to_evaluate.items():
                print(method_name)
                fold_results = results_dict.get(f'fold_{i}', {})
                for run_key, run_data in fold_results.items():
                    print(run_data,'\n')
                    if 'DIROCA' in method_name:
                        method_label = method_name
                    else:
                        method_label = method_name

                    T_learned = run_data['T_matrix']
                    # print(T_learned)
                    test_indices = run_data['test_indices']
                    
                    errors_per_intervention = []
                    for iota in I_ll_relevant:
                        Dll_test = Dll_samples[iota][test_indices]
                        # Use the contaminated omega map
                        Dhl_test = Dhl_samples[omega_cont[iota]][test_indices]
                        
                        if setting == 'gaussian':
                            error = ut.calculate_abstraction_error(T_learned, Dll_test, Dhl_test)
                        elif setting == 'empirical':
                            error = ut.calculate_empirical_error(T_learned, Dll_test, Dhl_test)
                            
                        if not np.isnan(error): errors_per_intervention.append(error)
                    
                    avg_error = np.mean(errors_per_intervention) if errors_per_intervention else np.nan
                    
                    record = {
                                'method': method_label, 
                                'misalignments': num_misalignments,
                                'trial': trial,
                                'fold': i,
                                'error': avg_error
                            }
                    omega_spec_records.append(record)

omega_spec_df = pd.DataFrame(omega_spec_records)
print("\n\n--- Omega-Misspecification Evaluation Complete ---")


Omega-misspecification evaluation


Misalignment Level:   0%|          | 0/1 [00:00<?, ?it/s]

Abslin_p
{'T_matrix': array([[ 2.,  1.,  1.,  0., -0., -0.],
       [-0.,  2.,  1.,  1.,  0.,  1.],
       [ 1.,  0.,  1.,  2.,  0.,  1.]]), 'test_indices': array([   4,    6,    9, ..., 9957, 9961, 9970])} 

Abslin_n
{'T_matrix': array([[ 2.,  0.,  0.,  0., -0., -0.],
       [-0.,  2.,  0.,  0.,  0.,  1.],
       [ 0.,  0.,  1.,  2.,  0.,  0.]]), 'test_indices': array([   4,    6,    9, ..., 9957, 9961, 9970])} 

DiRoCA_star
{'T_matrix': array([[ 1.4377378 ,  1.6595978 ,  1.1388961 , -0.43553984, -1.1244984 ,
        -1.7990888 ],
       [ 0.11589024,  2.6199744 ,  0.5087681 ,  0.3946025 , -1.1558418 ,
        -1.7261349 ],
       [ 1.301204  ,  2.2361352 ,  0.77944314,  0.94663936, -1.6503682 ,
        -2.323713  ]], dtype=float32), 'optimization_params': {'L': {'pert_U': array([[-0.15704052, -0.13614765, -0.1433823 , -0.06725273,  0.1265779 ,
        -0.05060528],
       [-0.15649827, -0.14371066, -0.14210129, -0.14229603, -0.07991871,
        -0.12832484],
       [-0.15695576, -0.1

Misalignment Level: 100%|██████████| 1/1 [00:03<00:00,  3.19s/it]

Abslin_p
{'T_matrix': array([[ 2.,  1.,  1.,  0.,  0.,  0.],
       [ 0.,  2.,  1.,  1., -0.,  1.],
       [ 1.,  0.,  1.,  2.,  0.,  1.]]), 'test_indices': array([   2,   15,   17, ..., 9992, 9993, 9999])} 

Abslin_n
{'T_matrix': array([[ 2.,  0.,  0.,  0.,  0.,  0.],
       [ 0.,  2.,  1.,  0., -0.,  1.],
       [ 0.,  0.,  0.,  2.,  0.,  0.]]), 'test_indices': array([   2,   15,   17, ..., 9992, 9993, 9999])} 

DiRoCA_star
{'T_matrix': array([[ 1.4285892 ,  1.6613975 ,  1.1421654 , -0.4280189 , -1.1200155 ,
        -1.8258438 ],
       [ 0.11943465,  2.6370888 ,  0.5108727 ,  0.387137  , -1.1581727 ,
        -1.7462215 ],
       [ 1.2911476 ,  2.2353175 ,  0.7801043 ,  0.9591839 , -1.6438861 ,
        -2.3582993 ]], dtype=float32), 'optimization_params': {'L': {'pert_U': array([[-0.15532711, -0.13541321, -0.14270917, -0.06383638,  0.13068616,
        -0.0441705 ],
       [-0.15566649, -0.14413702, -0.14235963, -0.14279   , -0.08290987,
        -0.12930666],
       [-0.04800574, -0.0




In [128]:
print("\n" + "="*65)
print("Overall Performance (Averaged Across All Misalignment Levels)")
print("="*65)
print(f"{'Method/Run':<35} | {'Mean ± Std'}")
print("="*65)

summary = omega_spec_df.groupby('method')['error'].agg(['mean', 'std', 'count'])
summary['sem'] = summary['std'] #/ np.sqrt(summary['count'])
# summary['ci95'] = 1.96 * summary['sem']
summary['ci95'] = summary['sem']


for method_name, row in summary.sort_values('mean').iterrows():
    print(f"{method_name:<35} | {row['mean']:.4f} ± {row['ci95']:.4f}")
print("="*65)


Overall Performance (Averaged Across All Misalignment Levels)
Method/Run                          | Mean ± Std
GradCA                              | 311.4219 ± 2.8687
DIROCA_4                            | 313.1202 ± 8.2254
DIROCA_8                            | 313.2869 ± 8.3172
DIROCA_2                            | 317.0301 ± 7.7172
DIROCA_1                            | 334.2854 ± 3.7350
Abslin_n                            | 355.5115 ± 2.8590
Abslin_p                            | 400.4831 ± 2.7764
DiRoCA_star                         | 470.7565 ± 4.3437
BARYCA                              | 561.3300 ± 4.0691


In [129]:
# Paired t-tests for contamination analysis
from scipy.stats import ttest_rel
import warnings
warnings.filterwarnings('ignore')

# Get the best method (lowest mean error)
best_method = summary['mean'].idxmin()
best_mean = summary.loc[best_method, 'mean']

print(f"\nBest method: {best_method} (mean error: {best_mean:.4f})")
print("\nPaired t-tests (best vs others):")
print("="*60)

# Get data for best method
best_data = omega_spec_df[omega_spec_df['method'] == best_method]['error'].values

# Compare against all other methods
for method in summary.index:
    if method != best_method:
        other_data = omega_spec_df[omega_spec_df['method'] == method]['error'].values
        
        # Ensure same number of observations for paired test
        min_len = min(len(best_data), len(other_data))
        if min_len > 0:
            t_stat, p_value = ttest_rel(best_data[:min_len], other_data[:min_len])
            other_mean = summary.loc[method, 'mean']
            diff = best_mean - other_mean
            
            print(f"{best_method} vs {method:<30} | p={p_value:.4f} | diff={diff:.4f} | {'Significant' if p_value < 0.05 else 'Not significant'}")
        else:
            print(f"{best_method} vs {method:<30} | No data available")



Best method: GradCA (mean error: 311.4219)

Paired t-tests (best vs others):
GradCA vs Abslin_n                       | p=0.0000 | diff=-44.0896 | Significant
GradCA vs Abslin_p                       | p=0.0000 | diff=-89.0611 | Significant
GradCA vs BARYCA                         | p=0.0000 | diff=-249.9080 | Significant
GradCA vs DIROCA_1                       | p=0.0000 | diff=-22.8635 | Significant
GradCA vs DIROCA_2                       | p=0.0000 | diff=-5.6081 | Significant
GradCA vs DIROCA_4                       | p=0.0184 | diff=-1.6983 | Significant
GradCA vs DIROCA_8                       | p=0.0096 | diff=-1.8650 | Significant
GradCA vs DiRoCA_star                    | p=0.0000 | diff=-159.3346 | Significant


In [130]:
import random
import pandas as pd
from tqdm import tqdm

# ======================================================================
# 1. The Corrected Helper Function
# ======================================================================
def contaminate_omega_map(original_omega, num_misalignments, seed=None):
    """
    Randomly re-wires a subset of entries in the omega map using a specific seed
    for reproducible randomness.
    """
    # Create a local random number generator for this specific run
    rng = random.Random(seed)
    
    omega_keys = [k for k in original_omega.keys() if k is not None]
    omega_vals = [original_omega[k] for k in omega_keys if original_omega[k] is not None]

    contaminated_omega = original_omega.copy()
    
    num_to_corrupt = min(num_misalignments, len(omega_keys))
    
    # Use the local generator for sampling
    to_corrupt = rng.sample(omega_keys, k=num_to_corrupt)
    
    all_targets = list(set(omega_vals))

    for key in to_corrupt:
        original_target = original_omega[key]
        available_targets = [t for t in all_targets if t != original_target]
        if available_targets:
            # Use the local generator for choice
            new_target = rng.choice(available_targets)
            contaminated_omega[key] = new_target
            
    return contaminated_omega

# ======================================================================
# 2. The Corrected Evaluation Loop
# ======================================================================
max_misalignments = len(I_ll_relevant) -1
misalignment_levels = np.linspace(0, max_misalignments, 11, dtype=int)
num_trials = 3 # You can now increase this to get variance

omega_spec_records = []
print("Omega-misspecification evaluation")

for num_misalignments in tqdm(misalignment_levels, desc="Misalignment Level"):
    for trial in range(num_trials):
        
        # --- THIS IS THE FIX ---
        # Pass the trial number as the seed to get a different map for each trial
        omega_cont = contaminate_omega_map(omega, num_misalignments, seed=trial)
        # --- END OF FIX ---
        
        for i, fold_info in enumerate(saved_folds):
            for method_name, results_dict in results_to_evaluate.items():
                fold_results = results_dict.get(f'fold_{i}', {})
                for run_key, run_data in fold_results.items():

                    if 'DIROCA' in method_name:
                        method_label = method_name
                    else:
                        method_label = method_name

                    T_learned = run_data['T_matrix']
                    test_indices = run_data['test_indices']
                    
                    errors_per_intervention = []
                    for iota in I_ll_relevant:
                        Dll_test = Dll_samples[iota][test_indices]
                        # Use the contaminated omega map
                        Dhl_test = Dhl_samples[omega_cont[iota]][test_indices]
                        
                        if setting == 'gaussian':
                            error = ut.calculate_abstraction_error(T_learned, Dll_test, Dhl_test)
                        elif setting == 'empirical':
                            error = ut.calculate_empirical_error(T_learned, Dll_test, Dhl_test)
                            
                        if not np.isnan(error): errors_per_intervention.append(error)
                    
                    avg_error = np.mean(errors_per_intervention) if errors_per_intervention else np.nan

                    record = {
                        'method': method_name, 
                        'misalignments': num_misalignments,
                        'trial': trial,
                        'fold': i,
                        'error': avg_error
                    }
                    omega_spec_records.append(record)

omega_spec_df = pd.DataFrame(omega_spec_records)
print("\n\n--- Omega-Misspecification Evaluation Complete ---")

Omega-misspecification evaluation


Misalignment Level: 100%|██████████| 11/11 [00:02<00:00,  3.77it/s]



--- Omega-Misspecification Evaluation Complete ---





In [131]:
print("\n" + "="*65)
print("Overall Performance (Averaged Across All Misalignment Levels)")
print("="*65)
print(f"{'Method/Run':<35} | {'Mean ± Std'}")
print("="*65)

summary = omega_spec_df.groupby('method')['error'].agg(['mean', 'std', 'count'])
summary['sem'] = summary['std'] #/ np.sqrt(summary['count'])
# summary['ci95'] = 1.96 * summary['sem']
summary['ci95'] = summary['sem']


for method_name, row in summary.sort_values('mean').iterrows():
    print(f"{method_name:<35} | {row['mean']:.4f} ± {row['ci95']:.4f}")
print("="*65)


Overall Performance (Averaged Across All Misalignment Levels)
Method/Run                          | Mean ± Std
GradCA                              | 311.3339 ± 2.8196
DIROCA_4                            | 312.8305 ± 8.0162
DIROCA_8                            | 313.0358 ± 8.1734
DIROCA_2                            | 316.7920 ± 8.1246
DIROCA_1                            | 334.2950 ± 3.5959
Abslin_n                            | 355.6131 ± 2.4828
Abslin_p                            | 400.4104 ± 2.6991
DiRoCA_star                         | 470.9156 ± 4.0462
BARYCA                              | 561.7603 ± 3.8158
