In [1]:
import itertools
import joblib
import pickle 

import networkx as nx
import numpy as np
import pandas as pd
import cvxpy as cp

import matplotlib.pyplot as plt
import matplotlib.pylab as pl
import seaborn as sns
import random 
import joblib

import numpy as np
from sklearn.linear_model import Lasso
from sklearn.model_selection import LeaveOneGroupOut
from sklearn.metrics import mean_squared_error

import numpy as np
import networkx as nx

from src.CBN import CausalBayesianNetwork as CBN
import modularised_utils as mut
import Linear_Additive_Noise_Models as lanm
import operations as ops
import evaluation_utils as evut
import opt_utils as oput
import params

np.random.seed(0)



In [2]:
# Experiment tag; interpolated into the data/{experiment}/... paths used by the dump cells below.
experiment = 'battery_discrete'

In [3]:
# Load the pickled base (WMG file) and abstract (LRCS file) model objects.
# Later cells only read their nodes() and edges().
# NOTE(review): joblib.load unpickles arbitrary objects — only point it at trusted files.
M_base = joblib.load('batteries/scms/M_WMG_bins_5_avg_2.pkl')
M_abst = joblib.load('batteries/scms/M_LRCS_bins_5.pkl') 

In [4]:
# Load the pickled data frames that accompany the two models loaded above.
# NOTE(review): joblib.load unpickles arbitrary objects — only point it at trusted files.
df_base = joblib.load('batteries/dfs/df_WMG_bins_5_avg_2.pkl')
df_abst = joblib.load('batteries/dfs/df_LRCS_bins_5.pkl')

In [5]:
# Base frame: drop the 2nd and 3rd columns (by position), recode the listed raw
# values to consecutive integer levels, then rename the ML columns to match the graph.
# NOTE(review): replace() is applied to every remaining column, not only the comma-gap one.
# NOTE(review): the column drop is positional, so re-running this cell on the
# already-cleaned frame would drop further columns.
df_base = (
    df_base
    .drop(columns=df_base.columns[[1, 2]])
    .replace({75: 0, 110: 1, 150: 2, 170: 3, 180: 4, 200: 5})
    .rename(columns={'binned ML_avg0': 'ML0', 'binned ML_avg1': 'ML1'})
)

# Abstract frame: drop the 2nd column (by position), recode, then rename to match the graph.
df_abst = (
    df_abst
    .drop(columns=df_abst.columns[[1]])
    .replace({75: 0, 100: 1, 200: 2})
    .rename(columns={'Comma gap (µm)': 'CG', 'binned ML': 'ML'})
)

In [6]:
# Rebuild each model's structure as a plain networkx DiGraph (same nodes, same edges).
Gll, Ghl = nx.DiGraph(), nx.DiGraph()
for _digraph, _source_model in ((Gll, M_base), (Ghl, M_abst)):
    _digraph.add_nodes_from(_source_model.nodes())
    _digraph.add_edges_from(_source_model.edges())

In [7]:
# Plain-array copies of the two cleaned frames (column order as in the frames).
df_base_np = df_base.to_numpy()
df_abst_np = df_abst.to_numpy()

# Estimate the per-graph coefficients with the project helper.
# NOTE(review): these are fitted on the full frames, i.e. before the truncation to
# min_samples performed in the following cell — confirm that is intended.
ll_coeffs = mut.get_coefficients(df_base_np, Gll)
hl_coeffs = mut.get_coefficients(df_abst_np, Ghl)

# Gll / Ghl are rebound here: from nx.DiGraph objects to CBN objects built from the
# coefficient keys. Presumably the keys are the graph edges — verify against
# mut.get_coefficients.
Gll = CBN(list(ll_coeffs.keys()))
Ghl = CBN(list(hl_coeffs.keys()))

In [8]:
# Row counts of each data set before truncation (reused later as N for the radius bounds).
num_llsamples, num_hlsamples = df_base.shape[0], df_abst.shape[0]
min_samples = min(num_llsamples, num_hlsamples)

# Keep the first min_samples rows of each data set and continue with plain arrays.
# NOTE(review): df_base / df_abst change type here (DataFrame -> ndarray), so this
# cell cannot be re-run without re-running the cells that build the frames.
df_base = df_base[:min_samples].to_numpy()
df_abst = df_abst[:min_samples].to_numpy()

# Node counts of the low-level and high-level graphs.
l, h = len(Gll.nodes()), len(Ghl.nodes())

In [9]:
# Abduction step: derive exogenous-noise estimates for each model from its data,
# graph and coefficients. Each call returns three values; judging by the names they
# are the noise samples, their mean and their covariance — TODO confirm against
# mut.lan_abduction.
U_ll_hat, mu_U_ll_hat, Sigma_U_ll_hat = mut.lan_abduction(df_base, Gll, ll_coeffs)
U_hl_hat, mu_U_hl_hat, Sigma_U_hl_hat = mut.lan_abduction(df_abst, Ghl, hl_coeffs)

In [11]:
# Base-level interventions on CG. Values are the integer levels produced by the
# recoding of the base frame (raw 75 -> 0, 110 -> 1, 180 -> 4, 200 -> 5).
# None stands for the observational (no-intervention) regime.
iota0 = None
iota1 = ops.Intervention({'CG': 0})
iota2 = ops.Intervention({'CG': 1})
iota3 = ops.Intervention({'CG': 4})
iota4 = ops.Intervention({'CG': 5})

# Abstract-level interventions on CG, in the integer levels produced by the
# recoding of the abstract frame (raw 75 -> 0, 100 -> 1, 200 -> 2).
iota0_prime = None
iota1_prime = ops.Intervention({'CG': 0})
iota2_prime = ops.Intervention({'CG': 1})
iota3_prime = ops.Intervention({'CG': 2})


# Mapping from each base-level intervention to its abstract-level counterpart.
# iota3 and iota4 both map to iota3_prime.
omega = {
    iota0: iota0_prime,
    iota1: iota1_prime,
    iota2: iota2_prime,
    iota3: iota3_prime,
    iota4: iota3_prime
}

# Intervention sets, de-duplicated in first-seen order. Going through set() made the
# order depend on object hashes, which can differ between kernel sessions unless
# Intervention defines a stable __hash__; these lists are pickled and iterated
# below, so a deterministic order keeps runs reproducible.
Ill = list(omega)
Ihl = list(dict.fromkeys(omega.values()))


In [12]:
# Data sets keyed by intervention; only the observational pair (key None) is stored.
Ds = {}
Ds[None] = (df_base, df_abst)
    
# Persist graphs, intervention sets and coefficients under data/{experiment}/.
# NOTE(review): the target directory must already exist; nothing here creates it.
joblib.dump((Gll, Ill), f"data/{experiment}/LL.pkl")
joblib.dump(ll_coeffs, f"data/{experiment}/ll_coeffs.pkl")

joblib.dump((Ghl, Ihl), f"data/{experiment}/HL.pkl")
joblib.dump(hl_coeffs, f"data/{experiment}/hl_coeffs.pkl")

joblib.dump(Ds, f"data/{experiment}/Ds.pkl")

joblib.dump(omega, f"data/{experiment}/omega.pkl")
# NOTE(review): only the high-level exogenous estimates are dumped in this cell;
# the low-level triple (U_ll_hat, ...) is not — confirm that is intended.
joblib.dump((U_hl_hat, mu_U_hl_hat, Sigma_U_hl_hat), f"data/{experiment}/exogenous_HL.pkl")

['data/battery_discrete/exogenous_HL.pkl']

In [13]:
# One LinearAddSCM per intervention, keyed by the intervention object itself
# (None is the observational model).
LLmodels = {iota: lanm.LinearAddSCM(Gll, ll_coeffs, iota) for iota in Ill}

HLmodels = {eta: lanm.LinearAddSCM(Ghl, hl_coeffs, eta) for eta in Ihl}

In [14]:
# Persist the per-intervention model dictionaries next to the other experiment artefacts.
joblib.dump(LLmodels, f"data/{experiment}/LLmodels.pkl")
joblib.dump(HLmodels, f"data/{experiment}/HLmodels.pkl")

['data/battery_discrete/HLmodels.pkl']

# EMPIRICAL PERSPECTIVE

In [15]:
# Structural matrices for every low-level / high-level intervention, computed by the
# project helper; they are consumed by the barycentric optimisation further down.
L_matrices = oput.compute_struc_matrices(LLmodels, Ill)
H_matrices = oput.compute_struc_matrices(HLmodels, Ihl)

In [16]:
# Empirical radii for the low- and high-level models, rounded to 3 decimals.
# m is the number of graph nodes (l / h).
# NOTE(review): N uses the per-data-set row counts taken before the truncation to
# min_samples — confirm this is the intended sample size.
ll_bound = round(evut.compute_empirical_radius(N=num_llsamples, eta=0.05, c1=1000.0, c2=1.0, alpha=2.0, m=l), 3)
hl_bound = round(evut.compute_empirical_radius(N=num_hlsamples, eta=0.05, c1=1000.0, c2=1.0, alpha=2.0, m=h), 3)

In [51]:
# Default radii: the empirical bounds computed above.
epsilon, delta = ll_bound, hl_bound

# Values forwarded to the optimisers as eta_max / eta_min.
eta_max = 0.001
eta_min = 0.001

# NOTE(review): the progress bars recorded under the sweep cell show a total of 1000
# iterations, so those outputs look stale relative to this value — re-run to confirm.
max_iter = 10000
num_steps_min = 5
num_steps_max = 5

# Forwarded as robust_L / robust_H; the 'T_0.00' run further down overrides both to False.
robust_L = True 
robust_H = True

initialization = 'random' # 'random'

tol  = 1e-4
seed = 23

In [52]:
# Keyword arguments for the ERICA optimisation entry points. 'epsilon' and 'delta'
# are overwritten in place by the sweep cell below.
opt_params_erica = {
                        'U_L': U_ll_hat,
                        'U_H': U_hl_hat,
                        'L_models': LLmodels,
                        'H_models': HLmodels,
                        'omega': omega,
                        'epsilon': epsilon,
                        'delta': delta,
                        'eta_min': eta_min,
                        'eta_max': eta_max,
                        'num_steps_min': num_steps_min,
                        'num_steps_max': num_steps_max,
                        'max_iter': max_iter,
                        'tol': tol,
                        'seed': seed,
                        'robust_L': robust_L,
                        'robust_H': robust_H,
                        'initialization': initialization,
                        # Reuse the notebook-level tag instead of repeating the literal, so
                        # the optimiser and the dump paths cannot drift apart.
                        'experiment': experiment
                    }

In [19]:
# Registry of trained abstractions: method key -> {'optimization_params', 'T_matrix'}.
# NOTE(review): re-running this cell discards every result stored so far.
diroca_train_results_empirical = {}

In [20]:
# Radii to sweep. The ll_bound entry acts as a sentinel: for it, epsilon and delta are
# set to the empirical pair (ll_bound, hl_bound) instead of one shared value.
eps_delta_values     = [8, ll_bound, 1, 2, 4]

for eps_delta in eps_delta_values:
    print(f"Training for ε=δ = {eps_delta}")
    use_empirical_bounds = (eps_delta == ll_bound)

    # Write the radii for this run into the shared keyword dict
    opt_params_erica['epsilon'] = ll_bound if use_empirical_bounds else eps_delta
    opt_params_erica['delta']   = hl_bound if use_empirical_bounds else eps_delta

    # Run ERICA optimization
    params_empirical, T_empirical = oput.run_empirical_erica_optimization_batt(**opt_params_erica)

    # Store the optimisation parameters and the transformation matrix under a key
    # that encodes the radii used
    if use_empirical_bounds:
        result_key = 'T_'+str(ll_bound)+'-'+str(hl_bound)
    else:
        result_key = 'T_'+str(eps_delta)
    diroca_train_results_empirical[result_key] = {
        'optimization_params': params_empirical,
        'T_matrix': T_empirical
    }

print("\nTraining completed. T matrices stored in trained_results dictionary.")
print("Available ε=δ values:", list(diroca_train_results_empirical.keys()))



Training for ε=δ = 8


 22%|██▏       | 219/1000 [00:04<00:16, 48.02it/s]


Converged at iteration 220
Training for ε=δ = 0.346


 28%|██▊       | 278/1000 [00:05<00:15, 47.31it/s]


Converged at iteration 279
Training for ε=δ = 1


 27%|██▋       | 273/1000 [00:05<00:14, 50.78it/s]


Converged at iteration 274
Training for ε=δ = 2


 26%|██▌       | 258/1000 [00:05<00:17, 43.09it/s]


Converged at iteration 259
Training for ε=δ = 4


 22%|██▏       | 219/1000 [00:04<00:17, 45.20it/s]

Converged at iteration 220

Training completed. T matrices stored in trained_results dictionary.
Available ε=δ values: ['T_8', 'T_0.346-0.393', 'T_1', 'T_2', 'T_4']





In [53]:
# Same keyword set with both robustness flags switched off.
# NOTE(review): this calls run_empirical_erica_optimization, whereas the sweep cell calls
# run_empirical_erica_optimization_batt — confirm the different entry point is intended.
# NOTE(review): opt_params_erica still carries whatever epsilon/delta the sweep wrote last.
params_enrico, T_enrico = oput.run_empirical_erica_optimization(**{**opt_params_erica, 'robust_L': False, 'robust_H': False})

 45%|████▍     | 4452/10000 [00:05<00:06, 844.00it/s]


Converged at iteration 4453


In [54]:
# Register the non-robust run under the key 'T_0.00'.
diroca_train_results_empirical['T_0.00'] = {
                                'optimization_params': params_enrico,
                                'T_matrix': T_enrico
                            }

In [23]:
# Keyword arguments for the barycentric optimisation: exogenous estimates, the
# structural matrices computed earlier, and the shared tol / seed.
# max_iter is fixed at 1000 here, independent of the notebook-level max_iter.
opt_params_bary = {
                        'U_ll_hat':U_ll_hat,
                        'U_hl_hat':U_hl_hat,
                        'L_matrices':L_matrices,
                        'H_matrices':H_matrices,
                        'max_iter':1000,
                        'tol':tol,
                        'seed':seed
                    }
                                 

In [24]:
# Barycentric baseline; the helper returns only the transformation matrix.
T_bary = oput.run_empirical_bary_optim(**opt_params_bary)
# Placeholder so this entry has the same 'L' / 'H' layout as the other methods.
# NOTE(review): there is no 'pert_U' entry here, unlike the entries that later cells
# read 'pert_U' from.
params_bary = {'L':{}, 'H':{}}

  0%|          | 0/1000 [00:00<?, ?it/s]

100%|██████████| 1000/1000 [00:00<00:00, 2982.04it/s]


In [25]:
# Register the barycentric baseline under the key 'T_b'.
diroca_train_results_empirical['T_b'] = {
                                'optimization_params': params_bary,
                                'T_matrix': T_bary
                            }

In [26]:
# Keyword arguments for the smoothing-based optimisation. It reuses the shared
# eta_min / num_steps_min / tol / seed and has its own iteration budget and noise settings.
opt_params_smooth = {
                        'U_L': U_ll_hat,
                        'U_H': U_hl_hat,
                        'L_models': LLmodels,
                        'H_models': HLmodels,
                        'omega': omega,
                        'eta_min': eta_min,
                        'num_steps_min': num_steps_min,
                        'max_iter': 300, #300
                        'tol': tol,
                        'seed': seed,
                        'noise_sigma': 0.1, #0.1
                        'num_noise_samples': 10
                        }

In [27]:
# Run the smoothing-based optimisation; returns its parameters and the transformation matrix.
params_smooth, T_smooth = oput.run_empirical_smooth_optimization_batt(**opt_params_smooth)

100%|██████████| 300/300 [00:13<00:00, 22.03it/s]


In [28]:
# Register the smoothing-based result under the key 'T_s'.
diroca_train_results_empirical['T_s'] = {
                                'optimization_params': params_smooth,
                                'T_matrix': T_smooth
                            }

In [29]:
# Abs-LiNGAM baselines on the truncated base / abstract arrays; the result is indexed
# below by 'Perfect' and 'Noisy', each holding a 'T' entry.
linabs_results = evut.run_abs_lingam_complete(df_base, df_abst)

In [30]:
# Register the two Abs-LiNGAM baselines ('Perfect' -> 'T_pa', 'Noisy' -> 'T_na').
# Their T is transposed before storing — presumably to match the orientation of the
# other T matrices; verify against evut.run_abs_lingam_complete.
# 'pert_U' is filled with the unperturbed abduction estimates for both levels.
diroca_train_results_empirical['T_pa'] = {'optimization_params':{'L':{'pert_U':U_ll_hat},'H':{'pert_U':U_hl_hat}}, 'T_matrix': linabs_results['Perfect']['T'].T}
diroca_train_results_empirical['T_na'] = {'optimization_params':{'L':{'pert_U':U_ll_hat},'H':{'pert_U':U_hl_hat}}, 'T_matrix': linabs_results['Noisy']['T'].T}

In [55]:
# Persist every trained abstraction registered so far.
joblib.dump(diroca_train_results_empirical, f"data/{experiment}/diroca_train_results_empirical.pkl")

['data/battery_discrete/diroca_train_results_empirical.pkl']

# Downstream Evaluation

In [56]:
# def downstream_evaluation(T, df_base, df_abst, noise_level, noise_in):
#     from sklearn.linear_model import Lasso
#     from sklearn.model_selection import LeaveOneGroupOut
#     from sklearn.metrics import mean_squared_error
#     import numpy as np

#     from sklearn.linear_model import Lasso
#     from sklearn.model_selection import LeaveOneGroupOut
#     from sklearn.metrics import mean_squared_error
#     import numpy as np

#     # Define fixed hyperparameters
#     lasso_params = {'alpha': 0.1, 'max_iter': 1000, 'tol': 0.01}
    
#     # Copy inputs
#     df_base_noisy = df_base.copy()
#     df_abst_noisy = df_abst.copy()
#     df_abst_noisy = df_abst_noisy.astype(float)

#     if noise_in == 'both':
#         df_base_noisy += np.random.normal(0, noise_level, df_base.shape)
#         df_abst_noisy += np.random.normal(0, noise_level, df_abst.shape)
#     elif noise_in == 'base':
#         df_base_noisy += np.random.normal(0, noise_level, df_base.shape)
#         df_abst_noisy = df_abst
#     elif noise_in =='abst':
#         df_abst_noisy += np.random.normal(0, noise_level, df_abst.shape)
#         df_base_noisy = df_base
#     elif noise_in == 'none':
#         df_base_noisy, df_abst_noisy = df_base, df_abst
    

#     # Generate transformed and real abstract samples
#     tau_samples = T @ df_base_noisy.T
#     abst_samples = df_abst_noisy.T

#     # Step 1: Transpose to get (N, dim)
#     X_real = abst_samples.T
#     X_gen  = tau_samples.T

#     # Step 2: Define target labels and intervention groupings
#     y = df_abst[:, 1]
#     groups = df_abst[:, 0]
#     # y = df_abst_noisy[:, "binned ML"]
#     # groups = df_abst_noisy[:, "Comma gap (µm)"]


#     assert X_real.shape[0] == len(y) == len(groups), "Mismatch in number of samples"

#     # Step 3: Combine real and generated data
#     X_all = np.concatenate([X_real, X_gen], axis=0)
#     y_all = np.concatenate([y, y], axis=0)
#     groups_all  = np.concatenate([groups, groups], axis=0)

#     logo = LeaveOneGroupOut()

#     # Mode 1: Real → Real
#     mse_real = []
#     for train_idx, test_idx in logo.split(X_real, y, groups=groups):
#         model = Lasso().fit(X_real[train_idx], y[train_idx])
#         y_pred = model.predict(X_real[test_idx])
#         mse_real.append(mean_squared_error(y[test_idx], y_pred))

#    # Mode 2: Augmented → Real
#     mse_aug = []
#     for train_idx, test_idx in logo.split(X_real, y, groups=groups):
#         # Use noisy y for training with transformed data
#         y_train = df_abst_noisy[train_idx, 1]
#         model = Lasso().fit(X_gen[train_idx], y_train)
#         y_pred = model.predict(X_real[test_idx])
#         mse_aug.append(mean_squared_error(y[test_idx], y_pred))

#     # Mode 3: Real + Augmented → Real
#     mse_mix = []
#     for test_group in np.unique(groups):
#         test_mask = (groups == test_group)
#         test_idx_real = np.where(test_mask)[0]

#         train_mask_real = (groups != test_group)
#         train_idx_real = np.where(train_mask_real)[0]
#         train_idx_gen = np.arange(len(y)) + len(y)
#         train_idx_all = np.concatenate([train_idx_real, train_idx_gen])

#         model = Lasso(**lasso_params).fit(X_all[train_idx_all], y_all[train_idx_all])

#         y_pred = model.predict(X_real[test_idx_real])
#         mse_mix.append(mean_squared_error(y[test_idx_real], y_pred))

#     return {
#         "Real": (np.mean(mse_real), np.std(mse_real)),
#         "Aug": (np.mean(mse_aug), np.std(mse_aug)),
#         "AugReal": (np.mean(mse_mix), np.std(mse_mix))
#     }

In [57]:
def _lasso_holdout_mse(train_rows, test_rows, **lasso_kwargs):
    """Fit Lasso (column 0 -> column 1) on train_rows and return the MSE on test_rows."""
    model = Lasso(**lasso_kwargs).fit(train_rows[:, 0].reshape(-1, 1), train_rows[:, 1])
    y_pred = model.predict(test_rows[:, 0].reshape(-1, 1))
    return mean_squared_error(test_rows[:, 1], y_pred)


def downstream_evaluation_paper(T, df_base, df_abst):
    """
    Leave-one-comma-gap-out Lasso evaluation of an abstraction matrix.

    For every unique value in column 0 of ``df_abst`` the abstract rows carrying that
    value are held out, a Lasso regressor of column 1 on column 0 is fitted on a
    training set, and the MSE on the held-out abstract rows is recorded. Three
    training sets are compared:

    * ``"Real"``    - the remaining abstract rows only (default Lasso settings).
    * ``"Aug"``     - the remaining abstract rows plus *all* transformed base rows.
    * ``"AugReal"`` - the remaining abstract rows plus the transformed base rows whose
      column 0 differs from the held-out value.

    The transformed base rows are ``(T @ df_base.T).T``.

    Args:
        T: Abstraction matrix; ``T @ df_base.T`` must be defined and yield at least
            two rows (read as feature and target).
        df_base: 2-D array of base-level samples, one sample per row.
        df_abst: 2-D array of abstract-level samples; column 0 is the comma gap and
            column 1 the mass loading.

    Returns:
        dict mapping ``"Real"``, ``"Aug"`` and ``"AugReal"`` to ``(mean, std)`` of the
        per-held-out-group MSE.
    """
    #lasso_params = {'alpha': 0.1, 'max_iter': 1000, 'tol': 0.01}
    lasso_params = {'alpha': 0.0001, 'max_iter': 500, 'tol': 0.0001}

    comma_gaps = np.unique(df_abst[:, 0])
    # Transformed base samples, one per row; computed once for all folds.
    tau_rows = (T @ df_base.T).T

    mse_real, mse_aug, mse_mix = [], [], []
    for cg in comma_gaps:
        held_out = (df_abst[:, 0] == cg)
        test_rows = df_abst[held_out]
        real_train = df_abst[~held_out]

        # Scenario 1: real data only. This fit uses Lasso's defaults, unlike the
        # two augmented scenarios (kept as in the original evaluation).
        mse_real.append(_lasso_holdout_mse(real_train, test_rows))

        # Scenario 2: real data plus every transformed sample.
        aug_train = np.concatenate([real_train, tau_rows])
        mse_aug.append(_lasso_holdout_mse(aug_train, test_rows, **lasso_params))

        # Scenario 3: real data plus transformed samples outside the held-out group.
        # NOTE(review): the group test is an exact equality on transformed (float)
        # values, so it may exclude few or no transformed rows — confirm intent.
        tau_train = tau_rows[tau_rows[:, 0] != cg]
        mix_train = np.concatenate([real_train, tau_train])
        mse_mix.append(_lasso_holdout_mse(mix_train, test_rows, **lasso_params))

    return {
        "Real": (np.mean(mse_real), np.std(mse_real)),
        "Aug": (np.mean(mse_aug), np.std(mse_aug)),
        "AugReal": (np.mean(mse_mix), np.std(mse_mix))
    }

In [58]:
# print("\n" + "="*80)
# print(f"{'Method':<15} {'Real':<25} {'Aug':<25} {'AugReal':<25}")
# # print(f"{'Method':<15} {'Real':<25} ")

# print("="*80)

# for method in list(diroca_train_results_empirical.keys()):
#     T = diroca_train_results_empirical[method]['T_matrix']
#     d = downstream_evaluation_paper(T, df_base, df_abst)#, noise_level=0.0, noise_in='both')
    
#     real_str = f"{d['Real'][0]:.3f} ± {d['Real'][1]:.3f}"
#     aug_str = f"{d['Aug'][0]:.3f} ± {d['Aug'][1]:.3f}"
#     augreal_str = f"{d['AugReal'][0]:.3f} ± {d['AugReal'][1]:.3f}"
    
#     print(f"{method:<15} {real_str:<25} {aug_str:<25} {augreal_str:<25}")
#     # print(f"{method:<15} {real_str:<25} ")

# print("="*80)

In [59]:
def print_ordered_results(results_dict, scenario_name):
    """Print a ranked table of one scenario's errors, largest mean first.

    Args:
        results_dict: Mapping method name -> {scenario name -> (mean, spread)}.
        scenario_name: Key of the scenario to tabulate.

    The second tuple element is printed under the "CI" heading exactly as supplied.
    """
    # Collect (method, (mean, spread)) pairs and rank them by mean, worst first.
    # sorted() is stable, so ties keep the insertion order of results_dict.
    ranked = sorted(
        ((name, per_scenario[scenario_name]) for name, per_scenario in results_dict.items()),
        key=lambda pair: pair[1][0],
        reverse=True,
    )

    header = f"{'Rank':<5} {'Method':<15} {'Error (mean ± CI)':<35}"
    print(f"\n{scenario_name} Scenario")
    print("=" * 80)
    print(header)
    print("-" * 80)

    position = 0
    for method, (mean, std) in ranked:
        position += 1
        print(f"{position:<5} {method:<15} {mean:>8.4f} ± {std:<8.4f}")

In [60]:
# # Using dictionary comprehension
# keys_to_remove = ['T_pa', 'T_na']
# filtered_dict = {k: v for k, v in diroca_train_results_empirical.items() if k not in keys_to_remove}

In [61]:
# Print results for each scenario
print("\nAbstraction Performance Evaluation")
print("="*80)

# Evaluate every trained abstraction exactly once. The previous version re-ran the
# full evaluation three times per method just to print three views of the same result.
downstream_results = {
    method: downstream_evaluation_paper(trained['T_matrix'], df_base, df_abst)
    for method, trained in diroca_train_results_empirical.items()
}

# Real → Real, Aug → Real, Real+Aug → Real
for scenario_name in ('Real', 'Aug', 'AugReal'):
    print_ordered_results(downstream_results, scenario_name)

print("\n" + "="*80)


Abstraction Performance Evaluation

Real Scenario
Rank  Method          Error (mean ± CI)                  
--------------------------------------------------------------------------------
1     T_8               5.5559 ± 2.3877  
2     T_0.346-0.393     5.5559 ± 2.3877  
3     T_1               5.5559 ± 2.3877  
4     T_2               5.5559 ± 2.3877  
5     T_4               5.5559 ± 2.3877  
6     T_0.00            5.5559 ± 2.3877  
7     T_b               5.5559 ± 2.3877  
8     T_s               5.5559 ± 2.3877  
9     T_pa              5.5559 ± 2.3877  
10    T_na              5.5559 ± 2.3877  

Aug Scenario
Rank  Method          Error (mean ± CI)                  
--------------------------------------------------------------------------------
1     T_b              13.9859 ± 11.3037 
2     T_8               9.0901 ± 6.1204  
3     T_4               9.0901 ± 6.1204  
4     T_0.346-0.393     8.8546 ± 6.0513  
5     T_0.00            8.7645 ± 6.5790  
6     T_2               7.7

In [62]:
# Alias (same dict object, not a copy) used by the evaluation cells below.
T_results_emp = diroca_train_results_empirical

In [41]:
# Evaluation data generated from the models, the intervention map and the exogenous
# estimates. The next cell indexes it by intervention (None = observational).
data = evut.generate_empirical_data(LLmodels, HLmodels, omega, U_ll_hat, U_hl_hat)

In [42]:
# Which regimes to evaluate on: observational (key None) and/or interventional.
test_observ        = True
test_interv        = True
metric             = 'fro'
num_iter           = 20

if test_observ and test_interv:
    test_data = data

elif test_observ:
    test_data = {None: data[None]}

elif test_interv:
    test_data = {k: v for k, v in data.items() if k is not None}

else:
    # Previously this combination left test_data undefined (or stale from an earlier
    # run), which only surfaced later as a confusing NameError or wrong results.
    raise ValueError("At least one of test_observ / test_interv must be True.")

In [63]:
def mod_noise(U_samples, intervention, var_map=None):
    """
    Modify exogenous noise for exact interventions by setting the entire column
    to the intervention value for each intervened variable.

    Args:
        U_samples: Original noise samples (n_samples x n_variables). Not modified;
            a copy is returned.
        intervention: Intervention object exposing ``vv()`` (variable -> value
            mapping) and ``Phi()`` (intervened variables), or None for no change.
        var_map: Optional mapping from variable name to column index. Defaults to
            the legacy map ``{'CG': 0, 'ML1': 1, 'ML2': 2, 'S': 0, 'T': 1}``.
            NOTE(review): the legacy map has no entry for 'ML0', which this
            notebook uses as a column name — pass an explicit map for such graphs.

    Returns:
        Copy of ``U_samples`` with each intervened column overwritten.
    """
    import warnings

    U_modified = U_samples.copy()
    if intervention is None:
        return U_modified

    if var_map is None:
        var_map = {'CG': 0, 'ML1': 1, 'ML2': 2, 'S': 0, 'T': 1}

    intervention_dict = intervention.vv()
    for var in intervention.Phi():
        value = intervention_dict[var]

        if isinstance(var, (int, np.integer)):
            # Integer variables are taken to be column indices already.
            var_idx = var
        else:
            var_name = str(var)
            if var_name not in var_map:
                # Legacy behaviour (fall back to column 0) is kept, but no longer silent.
                warnings.warn(
                    f"mod_noise: variable {var_name!r} not in var_map; falling back to column 0",
                    stacklevel=2,
                )
            var_idx = var_map.get(var_name, 0)

        # Set entire column to intervention value
        U_modified[:, var_idx] = value

    return U_modified

In [64]:
# Per-method abstraction error over all base-level interventions.
# Every entry created here is overwritten inside the loop below.
results_single = {method: {'errors': [], 'mean': 0, 'ci': 0} for method in T_results_emp.keys()}

for name, method_data in T_results_emp.items():
    T = method_data['T_matrix']
    errors = []  # One distance per intervention
    # NOTE(review): this value is never used; scale_factor is reassigned after the loop.
    scale_factor = 1/np.sqrt(len(Ill))

    for iota in Ill:
        # Matrices of the low-level model and of its mapped high-level model.
        L_i = LLmodels[iota].F
        H_i = HLmodels[omega[iota]].F
        # if iota is not None:
        #     D_l = L_i @ mod_noise(U_ll_hat, iota).T
        #     D_h = H_i @ mod_noise(U_hl_hat, omega[iota]).T
        # else:
        # Push the (unmodified) exogenous estimates through each model.
        D_l = L_i @ U_ll_hat.T
        D_h = H_i @ U_hl_hat.T
        
        # Normalisation is disabled, so these are plain aliases of D_l / D_h.
        base_norm = D_l#/ np.linalg.norm(D_l, 'fro')
        abst_norm = D_h#/ np.linalg.norm(D_h, 'fro')
        
        # Distance between the abstracted low-level samples and the high-level samples.
        tau_base = T @ base_norm
        dist = evut.compute_empirical_distance(tau_base, abst_norm, 'fro')
        errors.append(dist)  # Store individual errors

    # Mean and spread across interventions.
    # NOTE(review): std_error is the standard deviation (np.std), and ci is 1.96 * std
    # without dividing by sqrt(n) — confirm this is the intended interval.
    mean_error = np.mean(errors)
    std_error = np.std(errors)
    ci = 1.96 * std_error

    # Store all statistics
    results_single[name] = {
        'errors': errors,
        'mean': mean_error,
        'ci': ci
    }

# Scale factor relative to the worst mean.
# NOTE(review): it is computed but not applied, because the scaling loop below is
# commented out.
max_mean = max(v['mean'] for v in results_single.values())
scale_factor = 1/max_mean

# # Scale means and CIs
# for method in results_single:
#     results_single[method]['mean'] *= scale_factor
#     results_single[method]['ci'] *= scale_factor

# Sort by mean error (ascending: best method first)
results_single = dict(sorted(results_single.items(), key=lambda x: x[1]['mean']))

# Print results
print("\n" + "="*100)
print(f"{'Method':<15} {'Error (mean ± CI)':<35}")
print("="*100)

for method, stats in results_single.items():
    print(f"{method:<15} {stats['mean']:>8.4f} ± {stats['ci']:<8.4f}")


Method          Error (mean ± CI)                  
T_0.346-0.393     7.5298 ± 0.0000  
T_8               8.0453 ± 0.0000  
T_4               8.0453 ± 0.0000  
T_0.00            8.1113 ± 0.0000  
T_s               8.7232 ± 0.0000  
T_2              10.1014 ± 0.0000  
T_1              10.1788 ± 0.0000  
T_b              12.1699 ± 0.0000  
T_na             27.0588 ± 0.0000  
T_pa             29.5961 ± 0.0000  


In [65]:
# Abduction estimates of the exogenous noise, keyed by level ('L' low, 'H' high).
hat_dict = {'L': U_ll_hat, 'H': U_hl_hat}

In [66]:
# Key of the trained run whose perturbed noise ('pert_U') is read in the next cell.
worst = 'T_8'

In [67]:
# Perturbed exogenous noise stored by the selected run, for each level.
U_worst_L = diroca_train_results_empirical[worst]['optimization_params']['L']['pert_U']
U_worst_H = diroca_train_results_empirical[worst]['optimization_params']['H']['pert_U']


In [68]:
# Same layout as hat_dict, but holding the perturbed noise of the selected run.
worst_dict = {'L': U_worst_L, 'H': U_worst_H}

In [69]:
# Select which noise dictionary serves as the centre: 'hat' or 'worst'.
# NOTE(review): any other value leaves center_matrix undefined (or stale from an earlier run).
center = 'worst'
if center == 'hat':
    center_matrix = hat_dict
elif center == 'worst':
    center_matrix = worst_dict

In [70]:
# Passed as `coverage` to evut.generate_perturbation_matrix; it is also read as a
# global inside downstream_evaluation_paper_with_noise.
coverage_type='uniform'

In [71]:
# 'boundary' perturbation matrices with first argument 10, for each level.
# NOTE(review): these are generated from hat_dict, not from the center_matrix selected
# above — confirm which one is intended.
pert_L = evut.generate_perturbation_matrix(10, 'boundary', 'L', hat_dict, coverage = coverage_type)
pert_H = evut.generate_perturbation_matrix(10, 'boundary', 'H', hat_dict, coverage = coverage_type)

In [100]:
# Grids 0.0, 1.0, ..., 9.0 (the upper bound 10.0 is exclusive) as plain Python lists.
rad_values = np.arange(0.0, 10.0, 1).tolist()  
noise_levels = np.arange(0.0, 10.0, 1).tolist()  

In [101]:
def downstream_evaluation_paper_with_noise(T, df_base, df_abst, rad, noise_level, noise_in):
    """
    Leave-one-comma-gap-out Lasso evaluation on optionally noise-corrupted data.

    Gaussian noise with standard deviation ``noise_level`` is added to float copies
    of the base data, the abstract data, or both, depending on ``noise_in``
    ('both', 'base', 'abst' or 'none'). Any other value, or ``noise_level == 0``,
    adds no noise. For each unique value in column 0 of the (possibly noisy)
    abstract data, the matching rows are held out and a default Lasso regressor of
    column 1 on column 0 is scored by MSE on them, for three training sets:

    * ``"Real"``    - the remaining abstract rows.
    * ``"Aug"``     - the remaining abstract rows plus all transformed base rows.
    * ``"AugReal"`` - the remaining abstract rows plus the transformed base rows
      whose column 0 differs from the held-out value.

    ``rad`` is only forwarded to ``evut.generate_perturbation_matrix``, whose
    results are not used by the evaluation. Reads the notebook-level ``hat_dict``
    and ``coverage_type``.

    NOTE(review): when noise is added to the abstract data, column 0 is noisy too,
    so the hold-out groups are built from noisy values — confirm this is intended.

    Returns:
        dict mapping ``"Real"``, ``"Aug"`` and ``"AugReal"`` to ``(mean, std)`` of the
        per-group MSE.
    """
    base_rows = df_base.copy().astype(float)
    abst_rows = df_abst.copy().astype(float)

    # The generated matrices are never used below. The calls are kept, in the same
    # order, so that any side effect of the helper (e.g. consuming random numbers
    # — TODO confirm) stays exactly as before.
    evut.generate_perturbation_matrix(rad, 'boundary', 'L', hat_dict, coverage = coverage_type)
    evut.generate_perturbation_matrix(rad, 'boundary', 'H', hat_dict, coverage = coverage_type)

    if noise_level == 0:
        noise_in = 'none'

    # Base noise is drawn before abstract noise, matching the original draw order.
    if noise_in in ('both', 'base'):
        base_rows += np.random.normal(0, noise_level, df_base.shape)
    if noise_in in ('both', 'abst'):
        abst_rows += np.random.normal(0, noise_level, df_abst.shape)

    def holdout_mse(train_rows, test_rows):
        """Fit default Lasso (column 0 -> column 1) and return the MSE on test_rows."""
        regressor = Lasso().fit(train_rows[:, 0].reshape(-1, 1), train_rows[:, 1])
        predicted = regressor.predict(test_rows[:, 0].reshape(-1, 1))
        return mean_squared_error(test_rows[:, 1], predicted)

    # Transformed (possibly noisy) base samples, one per row.
    tau_rows = (T @ base_rows.T).T

    mse_real, mse_aug, mse_mix = [], [], []
    for cg in np.unique(abst_rows[:, 0]):
        held_out = (abst_rows[:, 0] == cg)
        test_rows = abst_rows[held_out]
        real_train = abst_rows[~held_out]

        # Scenario 1: Before abstraction (Real → Real)
        mse_real.append(holdout_mse(real_train, test_rows))

        # Scenario 2: real rows plus every transformed row
        mse_aug.append(holdout_mse(np.concatenate([real_train, tau_rows]), test_rows))

        # Scenario 3: real rows plus transformed rows outside the held-out group
        tau_train = tau_rows[tau_rows[:, 0] != cg]
        mse_mix.append(holdout_mse(np.concatenate([real_train, tau_train]), test_rows))

    return {
        "Real": (np.mean(mse_real), np.std(mse_real)),
        "Aug": (np.mean(mse_aug), np.std(mse_aug)),
        "AugReal": (np.mean(mse_mix), np.std(mse_mix))
    }

In [102]:
def print_ordered_noisy_results(results_dict, scenario_name, noise_level, noise_in):
    """Print a ranked table of one scenario's errors for a given noise setting.

    Args:
        results_dict: Mapping method name -> {scenario name -> (mean, spread)}.
        scenario_name: Key of the scenario to tabulate.
        noise_level: Shown in the table title only.
        noise_in: Shown in the table title only.

    Rows are ordered by mean, largest first. The second tuple element is printed
    under the "CI" heading exactly as supplied.
    """
    # sorted() is stable, so methods with equal means keep results_dict order.
    ranked = sorted(
        ((name, per_scenario[scenario_name]) for name, per_scenario in results_dict.items()),
        key=lambda pair: pair[1][0],
        reverse=True,
    )

    rule_width = 100
    print(f"\n{scenario_name} Scenario (Noise Level: {noise_level}, Applied to: {noise_in})")
    print("=" * rule_width)
    print(f"{'Rank':<5} {'Method':<15} {'Error (mean ± CI)':<35}")
    print("-" * rule_width)

    position = 0
    for method, (mean, std) in ranked:
        position += 1
        print(f"{position:<5} {method:<15} {mean:>8.4f} ± {std:<8.4f}")

# Print results for each scenario and noise configuration
def evaluate_and_print_noisy_results(diroca_train_results_empirical, df_base, df_abst, noise_level, noise_in, rad=None):
    """Evaluate every trained abstraction once under one noise setting and print the tables.

    NOTE(review): a later cell redefines this name with a different signature
    (``rad`` as the fourth positional argument); once that cell has run, this
    definition is shadowed.

    Args:
        diroca_train_results_empirical: Mapping method name -> dict with a 'T_matrix' entry.
        df_base: Base-level samples forwarded to the evaluation.
        df_abst: Abstract-level samples forwarded to the evaluation.
        noise_level: Noise standard deviation forwarded to the evaluation.
        noise_in: Where noise is applied ('both', 'base', 'abst' or 'none').
        rad: Radius forwarded to the evaluation. When omitted, the notebook-level
            variable ``rad`` is used, which is what the original body read implicitly.

    Raises:
        NameError: if ``rad`` is omitted and no notebook-level ``rad`` exists.
    """
    if rad is None:
        # Backward compatibility: the body used to read a free variable `rad`, which
        # is neither a parameter nor defined in the cells shown above this function.
        if 'rad' not in globals():
            raise NameError("name 'rad' is not defined; pass rad=... explicitly")
        rad = globals()['rad']

    print("\nAbstraction Performance Evaluation with Noise")
    print("="*100)
    print(f"Noise Configuration: Level = {noise_level}, Applied to: {noise_in}")
    print("="*100)

    # Collect all results
    results = {method: downstream_evaluation_paper_with_noise(
        diroca_train_results_empirical[method]['T_matrix'],
        df_base, df_abst, rad=rad,
        noise_level=noise_level,
        noise_in=noise_in
    ) for method in diroca_train_results_empirical.keys()}

    # Print results for each scenario
    print_ordered_noisy_results(results, 'Real', noise_level, noise_in)
    print_ordered_noisy_results(results, 'Aug', noise_level, noise_in)
    print_ordered_noisy_results(results, 'AugReal', noise_level, noise_in)

    print("\n" + "="*100)

In [103]:
def evaluate_and_print_noisy_results(diroca_train_results_empirical, df_base, df_abst, rad, noise_level, noise_in, n_runs=5):
    """Repeat the noisy downstream evaluation and print averaged, ranked tables.

    Every method is evaluated ``n_runs`` times. For each method and scenario
    the reported pair is (average of the per-run means, average of the
    per-run stds).

    Args:
        diroca_train_results_empirical: Mapping ``method -> result``; each
            result must expose a ``'T_matrix'`` entry.
        df_base: Forwarded unchanged to
            ``downstream_evaluation_paper_with_noise``.
        df_abst: Forwarded unchanged to
            ``downstream_evaluation_paper_with_noise``.
        rad: Forwarded to the evaluation as ``rad`` and echoed in the banner.
        noise_level: Forwarded to the evaluation and echoed in the output.
        noise_in: Forwarded to the evaluation and echoed in the output.
        n_runs: Number of repetitions to average over. Defaults to 5.

    Returns:
        None. All output goes to stdout.

    Raises:
        ValueError: If ``n_runs`` is smaller than 1 (there would be nothing
            to average).
    """
    if n_runs < 1:
        raise ValueError(f"n_runs must be at least 1, got {n_runs}")

    scenarios = ('Real', 'Aug', 'AugReal')

    print("\nAbstraction Performance Evaluation with Noise")
    print("="*100)
    # Report the noise level and the radius separately; the banner used to
    # print the radius under the "Level" label.
    print(f"Noise Configuration: Level = {noise_level}, Radius = {rad}, Applied to: {noise_in}")
    print("="*100)

    # per_run[method][scenario] collects one (mean, std) pair per repetition.
    per_run = {
        method: {scenario: [] for scenario in scenarios}
        for method in diroca_train_results_empirical
    }

    for _ in range(n_runs):
        # Methods are evaluated in dictionary order within every repetition.
        for method, trained in diroca_train_results_empirical.items():
            outcome = downstream_evaluation_paper_with_noise(
                trained['T_matrix'],
                df_base, df_abst, rad=rad,
                noise_level=noise_level,
                noise_in=noise_in,
            )
            for scenario in scenarios:
                per_run[method][scenario].append(outcome[scenario])

    # Average the per-run means and the per-run stds independently.
    final_results = {
        method: {
            scenario: (
                np.mean([mean for mean, _ in pairs]),
                np.mean([std for _, std in pairs]),
            )
            for scenario, pairs in by_scenario.items()
        }
        for method, by_scenario in per_run.items()
    }

    # One ranked table per scenario.
    for scenario in scenarios:
        print_ordered_noisy_results(final_results, scenario, noise_level, noise_in)

    print("\n" + "="*100)


#for rad in rad_values:
# Sweep every entry of `noise_levels` at a fixed radius of 0.1, passing
# noise_in='both' on each call.
for noise_level in noise_levels:
    print(f"Noise Level: {noise_level}")
    evaluate_and_print_noisy_results(
        diroca_train_results_empirical, df_base, df_abst,
        rad=0.1, noise_level=noise_level, noise_in='both',
    )

Noise Level: 0.0

Abstraction Performance Evaluation with Noise
Noise Configuration: Level = 0.1, Applied to: both

Real Scenario (Noise Level: 0.0, Applied to: both)
Rank  Method          Error (mean ± CI)                  
----------------------------------------------------------------------------------------------------
1     T_8               5.5559 ± 2.3877  
2     T_0.346-0.393     5.5559 ± 2.3877  
3     T_1               5.5559 ± 2.3877  
4     T_2               5.5559 ± 2.3877  
5     T_4               5.5559 ± 2.3877  
6     T_0.00            5.5559 ± 2.3877  
7     T_b               5.5559 ± 2.3877  
8     T_s               5.5559 ± 2.3877  
9     T_pa              5.5559 ± 2.3877  
10    T_na              5.5559 ± 2.3877  

Aug Scenario (Noise Level: 0.0, Applied to: both)
Rank  Method          Error (mean ± CI)                  
----------------------------------------------------------------------------------------------------
1     T_b               5.2976 ± 5.7994  
2 

In [82]:
def print_ordered_noisy_results(results_dict, scenario_name, noise_level, noise_in):
    """Print a ranked table of per-method errors for one evaluation scenario.

    Rows are ordered from the largest mean error to the smallest, so rank 1 is
    the worst-performing method and the last row is the best. Methods with
    equal means keep the order they have in ``results_dict``.

    The second number in every row is labelled as a standard deviation: the
    loop that feeds this table unpacks each result as ``mean, std`` and
    averages those values, so the figure is not a confidence interval.

    Args:
        results_dict: Mapping ``method -> {scenario -> (mean, std)}``.
        scenario_name: Key selecting which scenario's tuple is displayed.
        noise_level: Value echoed in the table title; not used in any computation.
        noise_in: Label echoed in the table title; not used in any computation.

    Raises:
        KeyError: If a method has no entry for ``scenario_name``.
    """
    # Pair every method with its (mean, std) tuple and sort on the mean,
    # largest first (worst to best).
    ranked = sorted(
        ((method, scores[scenario_name]) for method, scores in results_dict.items()),
        key=lambda entry: entry[1][0],
        reverse=True,
    )

    print(f"\n{scenario_name} Scenario (Noise Level: {noise_level}, Applied to: {noise_in})")
    print("=" * 100)
    print(f"{'Rank':<5} {'Method':<15} {'Error (mean ± std)':<35}")
    print("-" * 100)

    for rank, (method, (mean, std)) in enumerate(ranked, 1):
        print(f"{rank:<5} {method:<15} {mean:>8.4f} ± {std:<8.4f}")

# Noise levels to sweep: 11 evenly spaced values from 0 to 1 inclusive.
noise_levels = np.linspace(0, 1, 11)

# For every noise level: evaluate each method five times at a fixed radius of
# 0.1, average the per-run statistics, and print one ranked table per scenario.
for noise_level in noise_levels:
    print("\nAbstraction Performance Evaluation with Noise")
    print("=" * 100)
    print(f"Noise Configuration: Level = {noise_level:.1f}, Applied to: both")
    print("=" * 100)

    # One flat list per scenario and statistic. Within each list the entries
    # are laid out run by run, and inside a run in method order.
    accumulated_results = {
        scenario: {'mean': [], 'std': []}
        for scenario in ['Real', 'Aug', 'AugReal']
    }

    # Repeat the evaluation so the reported numbers are averages over runs.
    for _ in range(5):
        # Evaluate every method once for this repetition.
        results = {}
        for method in diroca_train_results_empirical.keys():
            results[method] = downstream_evaluation_paper_with_noise(
                diroca_train_results_empirical[method]['T_matrix'],
                df_base, df_abst,
                rad=0.1,                  # fixed radius
                noise_level=noise_level,  # varying noise level
                noise_in='both',
            )

        # Append this repetition's numbers to the flat per-scenario lists.
        for scenario, collected in accumulated_results.items():
            for method, outcome in results.items():
                mean, std = outcome[scenario]
                collected['mean'].append(mean)
                collected['std'].append(std)

    # A method at position p owns entries p, p + n_methods, p + 2*n_methods, ...
    # of every flat list, so a stride slice recovers its per-run values.
    n_methods = len(diroca_train_results_empirical)
    final_results = {}
    for position, method in enumerate(diroca_train_results_empirical.keys()):
        method_results = {}
        for scenario in ['Real', 'Aug', 'AugReal']:
            means = accumulated_results[scenario]['mean'][position::n_methods]
            stds = accumulated_results[scenario]['std'][position::n_methods]
            method_results[scenario] = (np.mean(means), np.mean(stds))
        final_results[method] = method_results

    # One ranked table per scenario.
    for scenario in ['Real', 'Aug', 'AugReal']:
        print_ordered_noisy_results(final_results, scenario, noise_level, 'both')

    print("\n" + "=" * 100)


Abstraction Performance Evaluation with Noise
Noise Configuration: Level = 0.0, Applied to: both

Real Scenario (Noise Level: 0.0, Applied to: both)
Rank  Method          Error (mean ± CI)                  
----------------------------------------------------------------------------------------------------
1     T_0.00            0.5220 ± 0.6203  
2     T_na              0.5195 ± 0.6185  
3     T_pa              0.5187 ± 0.6246  
4     T_4               0.5182 ± 0.6285  
5     T_0.346-0.393     0.5176 ± 0.6195  
6     T_2               0.5166 ± 0.6244  
7     T_1               0.5161 ± 0.6214  
8     T_8               0.5153 ± 0.6146  
9     T_b               0.5117 ± 0.6131  
10    T_s               0.5108 ± 0.6162  

Aug Scenario (Noise Level: 0.0, Applied to: both)
Rank  Method          Error (mean ± CI)                  
----------------------------------------------------------------------------------------------------
1     T_na              2.9441 ± 3.0479  
2     T_pa         