In [1]:
import random
import itertools
import joblib
import pickle 
import os

from itertools import chain, combinations
from typing import Optional
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.mixture import GaussianMixture
from sklearn.linear_model import LinearRegression

from scipy.stats import wasserstein_distance
from scipy.stats import norm
from scipy.stats import wishart
import networkx as nx
import matplotlib.pyplot as plt

from scipy.linalg import sqrtm
from scipy.special import rel_entr
from scipy.spatial.distance import jensenshannon

from src.CBN import CausalBayesianNetwork as CBN
import modularised_utils as mut
import Linear_Additive_Noise_Models as lanm
import operations as ops

from sklearn.linear_model import LinearRegression
import params
import numpy as np
import pandas as pd
import networkx as nx
import statsmodels.api as sm

In [2]:
experiment = 'synth1'

In [3]:
ll_mu_hat       = np.array([0, 0, 0])  
ll_Sigma_hat    = np.diag([1, 2, 1]) 


hl_mu_hat       = np.array([0, 0])  
hl_Sigma_hat    = np.diag([1, 2])

In [4]:
# Define the radius of the Wasserstein balls (epsilon, delta) and the size for both models.
epsilon         = params.radius[experiment][0]
ll_num_envs     = params.n_envs[experiment][0]

delta           = params.radius[experiment][1]
hl_num_envs     = params.n_envs[experiment][1]

# Define the number of samples per environment. Currently every environment has the same number of samples
num_llsamples   = params.n_samples[experiment][0]
num_hlsamples   = params.n_samples[experiment][1]

distance_err    = 'wass'

In [5]:
# Ambiguity set construction: Based on epsilon and delta include distribution (as many as the num_envs) that
# pass the "gelbrich" test.
ll_moments = mut.sample_moments_U(mu_hat    = ll_mu_hat,
                                  Sigma_hat = ll_Sigma_hat,
                                  bound     = epsilon,
                                  num_envs  = ll_num_envs)

A_ll       = mut.sample_distros_Gelbrich(ll_moments) #Low-level: A_epsilon


hl_moments = mut.sample_moments_U(mu_hat    = hl_mu_hat,
                                  Sigma_hat = hl_Sigma_hat,
                                  bound     = delta,
                                  num_envs  = hl_num_envs)

A_hl       = mut.sample_distros_Gelbrich(hl_moments) #High-level A_delta

In [6]:
S = 'Smoking'
T = 'Tar'
C = 'Cancer'

S_ = 'Smoking_'
C_ = 'Cancer_'

ll_endogenous_coeff_dict = {(S, T): 0.3, (T, C): 0.2}
ll_causal_graph          = CBN(list(ll_endogenous_coeff_dict.keys()))
llcm                     = lanm.LinearAddSCM(ll_causal_graph, ll_endogenous_coeff_dict)
Dll_noise                = A_ll[0].sample(10000)[0]
Dll_samples              = llcm.sample_settings(Dll_noise)

hl_endogenous_coeff_dict = {(S_, C_): 0.6}
hl_causal_graph          = CBN(list(hl_endogenous_coeff_dict.keys()))
hlcm                     = lanm.LinearAddSCM(hl_causal_graph, hl_endogenous_coeff_dict)
Dhl_noise                = A_hl[0].sample(10000)[0]
Dhl_samples              = hlcm.sample_settings(Dhl_noise)

In [7]:
coefficients = mut.get_mle_coefficients(Dll_samples, ll_causal_graph)
print(coefficients)

{('Smoking', 'Tar'): 0.2984797773630235, ('Tar', 'Cancer'): 0.19987539997230558}


In [9]:
U, mu  = mut.lan_abduction(Dll_samples, ll_causal_graph, coefficients)

In [11]:
mean_U = np.mean(U, axis=0)
mean_U

  arr = asanyarray(a)


ValueError: operands could not be broadcast together with shapes (10000,3) (3,3) 

In [10]:
mut.lan_abduction(Dll_samples, ll_causal_graph, coefficients)

(array([[-0.44571664, -0.33276221,  0.32544466],
        [-1.17890942,  0.91576935,  2.03253243],
        [-0.37662672,  1.03540905, -0.45704497],
        ...,
        [ 0.53363363,  0.29664976, -2.07140003],
        [-0.02825778, -0.85853373, -0.44567322],
        [-0.4507428 , -1.66739107, -0.61139118]]),
 array([ 0.05542259,  0.07281238, -0.04902966]),
 array([[ 0.90956111,  0.00511329, -0.00201476],
        [ 0.00511329,  1.89729062,  0.01115016],
        [-0.00201476,  0.01115016,  1.01399216]]))

In [7]:
ll_coefs               = mut.get_coefficients(Dll_samples, ll_causal_graph)
U_l, mean_U_l, cov_U_l = mut.lan_abduction(Dll_samples, ll_causal_graph, ll_coefs)

hl_coefs               = mut.get_coefficients(Dhl_samples, hl_causal_graph)
U_h, mean_U_h, cov_U_h = mut.lan_abduction(Dhl_samples, hl_causal_graph, hl_coefs)

In [8]:
dag = CBN(list(ll_coefs.keys()))
cm  = lanm.LinearAddSCM(dag, ll_coefs)

In [9]:
cm.return_adjacency_matrix()

array([[0.        , 0.30635831, 0.        ],
       [0.        , 0.        , 0.20882779],
       [0.        , 0.        , 0.        ]])

In [None]:
import numpy as np
from scipy.optimize import minimize

# Function to generate perturbed datasets
def generate_perturbed_datasets(xi_hat, num_datasets, epsilon, p=2):
    """
    Generate perturbed datasets based on the Wasserstein ball.
    
    Args:
    xi_hat (numpy.ndarray): Original empirical samples, shape (N, m)
    num_datasets (int): Number of perturbed datasets to generate
    epsilon (float): Wasserstein ball radius
    p (int): Power for the Wasserstein distance constraint (default is 2)
    
    Returns:
    list of numpy.ndarray: List of perturbed datasets, each of shape (N, m)
    """
    N, m = xi_hat.shape
    perturbed_datasets = []
    
    # Target distribution or something you want to optimize over (optional)
    some_target_distribution = np.random.randn(N, m)

    # Loss function to minimize (e.g., distance to target distribution)
    def objective_function(Theta_flat, xi_hat):
        Theta = Theta_flat.reshape(N, m)
        perturbed_samples = xi_hat + Theta
        # Define a loss function here, e.g., distance from some target distribution
        loss = np.linalg.norm(perturbed_samples - some_target_distribution)
        return loss

    # Wasserstein constraint
    def wasserstein_constraint(Theta_flat):
        Theta = Theta_flat.reshape(N, m)
        return epsilon**p - np.mean(np.linalg.norm(Theta, ord=p, axis=1))

    # Initial guess for perturbations (start with zeros)
    Theta_init = np.zeros((N * m,))  # Flattened version of Theta_init

    # Define bounds to ensure samples remain within some space (if needed)
    bounds = [(-epsilon, epsilon)] * (N * m)  # Bounds for each element of Theta

    # Generate multiple perturbed datasets
    for _ in range(num_datasets):
        result = minimize(objective_function, Theta_init, args=(xi_hat,),
                          constraints={'type': 'ineq', 'fun': wasserstein_constraint},
                          bounds=bounds)
        
        # Get the optimal perturbations and generate the perturbed dataset
        Theta_opt = result.x.reshape(N, m)  # Reshape to get the original matrix form
        perturbed_dataset = xi_hat + Theta_opt  # Apply the perturbation
        perturbed_datasets.append(perturbed_dataset)

    return perturbed_datasets


# Example usage
xi_hat = np.array([[ 1.51008893,  1.30967393,  1.21715588],
                   [ 0.61357709,  0.52529855,  0.80942376],
                   [ 0.83850483,  1.8185908 , -0.9543258 ],
                   [ 1.14229802,  0.19042338,  1.34538866],
                   [ 0.79083913, -0.09300395,  2.11915202],
                   [ 1.34298599,  1.29666316,  0.11071287],
                   [ 0.9738364 ,  0.71673072,  0.87294646],
                   [ 0.9225889 ,  0.01205517,  0.09270841],
                   [ 0.94860394,  1.14796928,  0.57379993],
                   [ 0.51783036, -1.57670005,  0.89880883]])

epsilon = 0.1  # Define the Wasserstein ball radius
num_datasets = 5  # Number of perturbed datasets to generate

# Generate perturbed datasets
perturbed_datasets = generate_perturbed_datasets(xi_hat, num_datasets, epsilon)

# Example: Access the first perturbed dataset
print(perturbed_datasets)


In [16]:
import numpy as np

# Function to compute empirical distribution P_N
def empirical_distribution(P_N):
    N = len(P_N)
    return P_N, np.ones(N) / N  # Return the samples and their weights (1/N)

# Example dataset: N samples with k variables each
N = 5  # Number of samples
k = 2  # Number of variables per sample

# Create a sample dataset (replace this with your actual dataset)
data = np.random.randn(N, k)

# Compute empirical distribution P_N
samples, weights = empirical_distribution(data)

# Output the samples and weights
print("Samples (x_i):")
print(samples)

print("\nWeights (1/N for each sample):")
print(weights)

Samples (x_i):
[[ 0.59052003  0.39646013]
 [-0.3465302  -1.50803031]
 [-0.93230998  0.89260516]
 [-0.55444732  0.71291643]
 [ 0.73847934  0.35594886]]

Weights (1/N for each sample):
[0.2 0.2 0.2 0.2 0.2]


In [15]:
perturbed_samples

array([[ 6.19190006e-02,  1.52367358e+00],
       [ 4.64995785e-01, -5.84862328e-01],
       [-2.94821146e-01,  5.65221370e-02],
       [-1.58874056e+00,  1.47683924e+00],
       [-1.98604682e+00, -1.93759398e+00],
       [ 4.72642466e-01, -2.92036146e-01],
       [ 1.05801366e+00, -1.33273531e+00],
       [-4.98287478e-01,  6.37078417e-01],
       [ 1.82538087e+00,  5.93908124e-01],
       [ 5.33864011e-01,  3.80632599e-01],
       [-2.04494684e-01,  1.20420164e+00],
       [ 1.20613446e+00,  8.46666191e-01],
       [-9.17067302e-01,  4.08418214e-01],
       [ 1.94066240e+00,  2.00187183e+00],
       [-1.35542056e+00,  9.23502599e-01],
       [ 5.11578693e-01,  1.10774924e+00],
       [ 2.57793032e-01,  1.10987304e+00],
       [-5.26314554e-01, -7.90926453e-01],
       [-2.77185385e-01, -9.99674160e-01],
       [ 5.14191336e-01, -1.23366912e+00],
       [ 1.13791194e+00,  4.54684160e-01],
       [-2.06002885e+00,  1.31312254e+00],
       [ 4.17410634e-01, -1.77609429e-01],
       [-8.