In [2]:
import joblib
import numpy as np
import torch

import modularised_utils as mut
import opt_utils as oput

import Linear_Additive_Noise_Models as lanm
import operations as ops
from scipy.linalg import sqrtm

import params

np.random.seed(0)

In [3]:
experiment = 'synth1'

In [4]:
# Define the radius of the Wasserstein balls (epsilon, delta) and the size for both models.
epsilon         = params.radius[experiment][0]
ll_num_envs     = params.n_envs[experiment][0]

delta           = params.radius[experiment][1]
hl_num_envs     = params.n_envs[experiment][1]

# Define the number of samples per environment. Currently every environment has the same number of samples
num_llsamples   = params.n_samples[experiment][0]
num_hlsamples   = params.n_samples[experiment][1]

In [5]:
Dll = mut.load_samples(experiment)[None][0] 
Gll = mut.load_ll_model(experiment)[0]
Ill = mut.load_ll_model(experiment)[1]


Dhl = mut.load_samples(experiment)[None][1] 
Ghl = mut.load_hl_model(experiment)[0]
Ihl = mut.load_hl_model(experiment)[1]

omega = mut.load_omega_map(experiment)

In [6]:
ll_coeffs = mut.get_coefficients(Dll, Gll)
hl_coeffs = mut.get_coefficients(Dhl, Ghl) 

In [7]:
# # [Not suggested] In case we want to explore also the interventional --> worse estimation!
# Dlls, Dhls = [], []
# for dpair in list(mut.load_samples(experiment).values()):
#     Dlls.append(dpair[0])
#     Dhls.append(dpair[1])
    
# ll_coeffs = mut.get_coefficients(Dlls, Gll)
# hl_coeffs = mut.get_coefficients(Dhls, Ghl) 

In [8]:
U_ll_hat, mu_U_ll_hat, Sigma_U_ll_hat = mut.lan_abduction(Dll, Gll, ll_coeffs)
U_hl_hat, mu_U_hl_hat, Sigma_U_hl_hat = mut.lan_abduction(Dhl, Ghl, hl_coeffs)

In [9]:
LLmodels = {}
for iota in Ill:
    LLmodels[iota] = lanm.LinearAddSCM(Gll, ll_coeffs, iota)
    
HLmodels, Dhl_samples = {}, {}
for eta in Ihl:
    HLmodels[eta] = lanm.LinearAddSCM(Ghl, hl_coeffs, eta)

### Barycenter

In [9]:
L_matrices = []  # List of L_i matrices
for iota in Ill:
    L_matrices.append(LLmodels[iota].compute_mechanism())

H_matrices = []  # List of H_i matrices
for eta in Ihl:
    H_matrices.append(HLmodels[eta].compute_mechanism())

mu_bary_L, Sigma_bary_L = oput.compute_gauss_barycenter(L_matrices, mu_U_ll_hat, Sigma_U_ll_hat)
mu_bary_H, Sigma_bary_H = oput.compute_gauss_barycenter(H_matrices, mu_U_hl_hat, Sigma_U_hl_hat)

print("Low-level barycenter Mean:", mu_bary_L)
print("Low-level barycenter Covariance:", Sigma_bary_L)
print( )
print("High-level barycenter Mean:", mu_bary_H)
print("High-level barycenter Covariance:", Sigma_bary_H)

Low-level barycenter Mean: [-0.00678588 -0.01069607 -0.00015191]
Low-level barycenter Covariance: [[1.04033442 0.28379336 0.03931234]
 [0.28379336 2.0108559  0.21143766]
 [0.03931234 0.21143766 0.99009455]]

High-level barycenter Mean: [ 0.0042843  -0.00863504]
High-level barycenter Covariance: [[1.35385779 0.58226716]
 [0.58226716 0.97012678]]


In [17]:
V                 = oput.sample_projection(mu_U_ll_hat.shape[0], mu_U_hl_hat.shape[0], use_stiefel=False)
mu_bary_L_proj    = V @ mu_bary_L
Sigma_bary_L_proj = V @ Sigma_bary_L @ V.T

monge, A = oput.monge_map(mu_bary_L_proj, Sigma_bary_L_proj, mu_bary_H, Sigma_bary_H)
T        = V.T @ A

In [18]:
x = np.array([0.5, 0.1, -0.2])  # Example point from the first Gaussian (l = 3)

print(T.T @ x)
# # Apply the Monge map to the point x
# T_x = T_func(x_proj)

[0.32158873 0.29159566]


In [95]:
# Ambiguity set construction: Based on epsilon and delta include distribution (as many as the num_envs) that
# pass the "gelbrich" test.
ll_moments = mut.sample_moments_U(mu_hat    = mu_U_ll_hat,
                                  Sigma_hat = Sigma_U_ll_hat,
                                  bound     = epsilon,
                                  num_envs  = ll_num_envs)

A_ll       = mut.sample_distros_Gelbrich(ll_moments) #Low-level: A_epsilon


hl_moments = mut.sample_moments_U(mu_hat    = mu_U_hl_hat,
                                  Sigma_hat = Sigma_U_hl_hat,
                                  bound     = delta,
                                  num_envs  = hl_num_envs)

A_hl       = mut.sample_distros_Gelbrich(hl_moments) #High-level A_delta

In [10]:
abstraction_errors             = {}
abstraction_env_errors         = {}
max_env_avg_interv_error_value = -np.inf
max_env_avg_interv_error_key   = None
distance_err                   = 'wass'

for lenv in A_ll:

    Dll_noise      = lenv.sample(num_llsamples)[0]
    ll_environment = mut.get_exogenous_distribution(Dll_noise)

    for henv in A_hl:
        Dhl_noise      = henv.sample(num_hlsamples)[0]
        hl_environment = mut.get_exogenous_distribution(Dhl_noise)

        total_ui_error = 0
        num_distros    = len(Ill)

        n, m  = len(LLmodels[None].endogenous_vars), len(HLmodels[None].endogenous_vars)

        T     = mut.sample_stoch_matrix(n, m)

        for iota in Ill:
            llcm   = LLmodels[iota]
            hlcm   = HLmodels[omega[iota]]
            llmech = llcm.compute_mechanism()
            hlmech = hlcm.compute_mechanism()
            error  = mut.ui_error_dist(distance_err, lenv, henv, llmech, hlmech, T)

            total_ui_error += error

        avg_interv_error = total_ui_error/num_distros

        if avg_interv_error > max_env_avg_interv_error_value:
            max_env_avg_interv_error_value = avg_interv_error
            max_env_avg_interv_error_key   = (lenv, henv)

        abstraction_errors[str(T)] = avg_interv_error
        abstraction_env_errors['ll: '+str(ll_environment.means_)+' hl: '+str(hl_environment.means_)] = avg_interv_error

Found Intel OpenMP ('libiomp') and LLVM OpenMP ('libomp') loaded at
the same time. Both libraries are known to be incompatible and this
can cause random crashes or deadlocks on Linux when loaded in the
same Python program.
Using threadpoolctl may cause crashes or deadlocks. For more
information and possible workarounds, please see
    https://github.com/joblib/threadpoolctl/blob/master/multiple_openmp.md



In [11]:
max_tau   = max(abstraction_errors, key=abstraction_errors.get)
max_error = abstraction_errors[max_tau]

print(f"Abstraction: {max_tau}, Error: {max_error}")
print('==============================================================================' )
max_lenv = max_env_avg_interv_error_key[0]
max_henv = max_env_avg_interv_error_key[1]

print(f"max LL mean vector = {max_lenv.means_}")
print(f"max LL covariance = {max_lenv.covariances_}")
print( )

print(f"max HL mean vector = {max_henv.means_}")
print(f"max HL covariance = {max_henv.covariances_}")
print('==============================================================================' )
print(f"max environment, average interventional abstraction error = {max_env_avg_interv_error_value}")

Abstraction: [[0.21761458 0.78238542]
 [0.75095088 0.24904912]
 [0.86664525 0.13335475]], Error: 1.080267499893765
max LL mean vector = [[0.03909886 0.02270429 0.149256  ]]
max LL covariance = [[[0.82603265 0.         0.        ]
  [0.         2.03002425 0.        ]
  [0.         0.         0.84745966]]]

max HL mean vector = [[ 0.07752375 -0.03284999]]
max HL covariance = [[[1.03175598 0.        ]
  [0.         0.77764308]]]
max environment, average interventional abstraction error = 1.080267499893765


In [22]:
LLmodels[None].compute_mechanism()

array([[1.        , 0.28018097, 0.06072652],
       [0.        , 1.        , 0.21674035],
       [0.        , 0.        , 1.        ]])

In [None]:
experiments     = ['synth1_gnd', 'little_lucas']

for experiment in experiments:

    # Define the radius of the Wasserstein balls (epsilon, delta) and the size for both models.
    epsilon         = params.radius[experiment][0]
    ll_num_envs     = params.n_envs[experiment][0]

    delta           = params.radius[experiment][1]
    hl_num_envs     = params.n_envs[experiment][1]

    # Define the number of samples per environment. Currently every environment has the same number of samples
    num_llsamples   = params.n_samples[experiment][0]
    num_hlsamples   = params.n_samples[experiment][1]

    Dll = mut.load_samples(experiment)[None][0] 
    Gll = mut.load_ll_model(experiment)[0]
    Ill = mut.load_ll_model(experiment)[1]


    Dhl = mut.load_samples(experiment)[None][1] 
    Ghl = mut.load_hl_model(experiment)[0]
    Ihl = mut.load_hl_model(experiment)[1]

    omega = mut.load_omega_map(experiment)

    ll_coeffs = mut.get_coefficients(Dll, Gll)
    hl_coeffs = mut.get_coefficients(Dhl, Ghl) 
    num_experiments = 100


In [None]:
for experiment in experiments:
    errors = []
    for n in num_experiments:
        run_opt(n)
        plot_abstraction_error(m)
        errors.append(abst_error(n))
    avg_abst_error = np.mean(errors)
    std_abst_error = np.std(errors)

In [45]:
mu_L    = torch.from_numpy(mu_U_ll_hat)
Sigma_L = torch.from_numpy(Sigma_U_ll_hat)

mu_H    = torch.from_numpy(mu_U_hl_hat)
Sigma_H = torch.from_numpy(Sigma_U_hl_hat)

l = mu_L.shape[0]
h = mu_H.shape[0]

num_intervs = len(Ill)
# Define T as a tensor with requires_grad=True for automatic differentiation
T = torch.randn(h, l, requires_grad=True)

# Compute the objective function as the expectation over samples
objective = 0
for i in range(num_intervs):
    L_i = torch.from_numpy(LLmodels[iota].compute_mechanism())
    H_i = torch.from_numpy(HLmodels[omega[iota]].compute_mechanism())
    
    # 1st term: || T (L_i * mu_L) - (H_i * mu_H) ||_2^2
    # Fix matrix-vector multiplication and norm calculation
    L_i_mu_L = L_i @ mu_L  # Result: (m,)
    H_i_mu_H = H_i @ mu_H  # Result: (n,)
    
    term1 = torch.norm(T.float() @ L_i_mu_L.float() - H_i_mu_H.float())**2  # Now this is dimensionally consistent
   
    # 2nd term: Tr(T L_i Sigma_L L_i^T T^T)
    term2 = torch.trace(T.float() @ L_i.float() @ Sigma_L.float() @ L_i.T.float() @ T.T.float())
    
    # 3rd term: Tr(H_i Sigma_H H_i^T)
    term3 = torch.trace(H_i.float() @ Sigma_H.float() @ H_i.T.float())
    
    # Ensure positive-definiteness for Cholesky decomposition
    L_i_Sigma_L = T.float() @ L_i.float() @ Sigma_L.float() @ L_i.T.float() @ T.T.float()
    H_i_Sigma_H = H_i.float() @ Sigma_H.float() @ H_i.T.float()
    #L_i_Sigma_L = L_i_Sigma_L + torch.eye(L_i_Sigma_L.shape[0]) * 1e-6
    #H_i_Sigma_H = H_i_Sigma_H + torch.eye(H_i_Sigma_H.shape[0]) * 1e-6

    # 4th term: -2 * || (T L_i Sigma_L L_i^T T^T)^(1/2) * (H_i Sigma_H H_i^T)^(1/2) ||_*
    # Whether you compute the matrix square root via torch.sqrt() or Cholesky decomposition, 
    # the singular values of the resulting matrix product will be the same.
    term4 = -2 * torch.norm(torch.linalg.cholesky(L_i_Sigma_L) @ torch.linalg.cholesky(H_i_Sigma_H), 'nuc')
    #term4 = -2 * torch.norm(torch.sqrt(L_i_Sigma_L) @ torch.sqrt(H_i_Sigma_H), 'nuc')

    
    # Sum up terms
    objective += term1 + term2 + term3 + term4

# Average the objective over all interventions
objective /= num_intervs

# Compute gradients (subgradients for nuclear norm)
objective.backward()

# Get the gradient of T
grad_T = T.grad

# Print the gradient
print("Gradient of T:", grad_T)

# Update T using an optimizer (e.g., stochastic gradient descent)
optimizer = torch.optim.Adam([T], lr=0.01)
optimizer = torch.optim.SGD([T], lr=0.01)

# Perform one step of gradient descent
optimizer.step()

# Optionally, zero out gradients for the next step
optimizer.zero_grad()


Gradient of T: tensor([[-6.6168,  1.0648,  8.8707],
        [ 2.5843, -4.8567, -1.3703]])


In [8]:
OLD STUFFF

In [20]:
def augmented_lagrangian(L, H, Q_vars, W_vars, mu_L, mu_H, Sigma_L, Sigma_H,
                         lambda_eps, lambda_del, lambda_Q, lambda_W, rho_Q, rho_W):
    loss = 0

    # Loss components from the original objective
    for i in range(N):
        loss += cp.norm(L[i] @ mu_L - H[i] @ mu_H, 'fro')**2
        loss += cp.trace(L[i] @ Sigma_L @ L[i].T)
        loss += cp.trace(H[i] @ Sigma_H @ H[i].T)
        loss -= 2 * cp.norm(Q_vars[i], 'fro') * cp.norm(W_vars[i], 'fro')

    # To compute the expected value, we will average the loss
    loss /= N  # Average over the number of samples

    # Penalties for epsilon constraint
    loss += lambda_eps * (epsilon**2 - cp.norm(mu_L - mu_L_hat)**2 - cp.norm(cp.sqrt(Sigma_L) - cp.sqrt(Sigma_L_hat))**2)

    # Penalties for delta constraint
    loss += lambda_del * (delta**2 - cp.norm(mu_H - mu_H_hat)**2 - cp.norm(cp.sqrt(Sigma_H) - cp.sqrt(Sigma_H_hat))**2)

    # Lagrange multiplier terms for Q
    for i in range(N):
        loss += cp.sum(lambda_Q[i] * (Q_vars[i] - cp.sqrt(L[i] @ Sigma_L @ L[i].T)))

    # Lagrange multiplier terms for W
    for i in range(N):
        loss += cp.sum(lambda_W[i] * (W_vars[i] - cp.sqrt(H[i] @ Sigma_H @ H[i].T)))

    # Penalty terms for Q
    loss += (rho_Q / 2) * sum(cp.norm(Q_vars[i] - cp.sqrt(L[i] @ Sigma_L @ L[i].T), 'fro')**2 for i in range(N))

    # Penalty terms for W
    loss += (rho_W / 2) * sum(cp.norm(W_vars[i] - cp.sqrt(H[i] @ Sigma_H @ H[i].T), 'fro')**2 for i in range(N))

    return cp.Maximize(loss)


In [21]:
import numpy as np

# Step 1: Initialize parameters
N = 5  # Number of samples
dim_mu_L = 3  # Dimension of mu_L
dim_mu_H = 3  # Dimension of mu_H
dim_Sigma = 3  # Dimension for covariance matrices

# Step 2: Generate example L and H matrices
L = [np.random.rand(4, dim_mu_L) for _ in range(N)]  # Example L_i matrices
H = [np.random.rand(4, dim_mu_H) for _ in range(N)]  # Example H_i matrices

# Step 3: Define initial estimates for mu_L, Sigma_L, mu_H, and Sigma_H
mu_L_hat = np.random.rand(dim_mu_L)  # Initial estimate for mu_L
Sigma_L_hat = np.random.rand(dim_Sigma, dim_Sigma)
Sigma_L_hat = Sigma_L_hat @ Sigma_L_hat.T  # Make Sigma_L_hat symmetric and positive semi-definite

mu_H_hat = np.random.rand(dim_mu_H)  # Initial estimate for mu_H
Sigma_H_hat = np.random.rand(dim_Sigma, dim_Sigma)
Sigma_H_hat = Sigma_H_hat @ Sigma_H_hat.T  # Make Sigma_H_hat symmetric and positive semi-definite

# Step 4: Define constraint parameters
epsilon = 0.1  # Parameter for epsilon constraint
delta = 0.1    # Parameter for delta constraint
alpha = 0.1    # Parameter for proximal operator

# Step 5: Initialize Lagrange multipliers
lambda_eps = 0.0  # Lagrange multiplier for epsilon constraint
lambda_del = 0.0  # Lagrange multiplier for delta constraint
lambda_Q = [np.zeros((4, 4)) for _ in range(N)]  # Lagrange multipliers for Q_i
lambda_W = [np.zeros((4, 4)) for _ in range(N)]  # Lagrange multipliers for W_i

# Step 6: Define penalty parameters
rho_Q = 1.0  # Penalty parameter for Q constraints
rho_W = 1.0  # Penalty parameter for W constraints

# Print initialized values
print("Initialized mu_L_hat:", mu_L_hat)
print("Initialized Sigma_L_hat:\n", Sigma_L_hat)
print("Initialized mu_H_hat:", mu_H_hat)
print("Initialized Sigma_H_hat:\n", Sigma_H_hat)
print("Initialized Lagrange multipliers for Q:\n", lambda_Q)
print("Initialized Lagrange multipliers for W:\n", lambda_W)


Initialized mu_L_hat: [0.90415869 0.34825547 0.51398949]
Initialized Sigma_L_hat:
 [[1.15835009 1.14381192 1.08191875]
 [1.14381192 1.66689119 1.30431101]
 [1.08191875 1.30431101 1.16743324]]
Initialized mu_H_hat: [0.45913576 0.98003258 0.49261809]
Initialized Sigma_H_hat:
 [[0.56694418 0.13732236 0.29177477]
 [0.13732236 0.03876096 0.1114772 ]
 [0.29177477 0.1114772  0.45306785]]
Initialized Lagrange multipliers for Q:
 [array([[0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.]]), array([[0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.]]), array([[0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.]]), array([[0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.]]), array([[0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.]])]
Initialized Lagrange multipliers for W:
 [array([[0., 0., 0., 0.

In [22]:
# Step 7: Compute the augmented Lagrangian
# This step assumes you have defined your augmented_lagrangian function already.

result = augmented_lagrangian(L, H, Q_vars, W_vars, mu_L_hat, mu_H_hat, Sigma_L_hat, Sigma_H_hat,
                              lambda_eps, lambda_del, lambda_Q, lambda_W, rho_Q, rho_W)

# Print the result
print("Augmented Lagrangian result:", result)


Augmented Lagrangian result: maximize (power(Pnorm(reshape([-0.68077198 -0.41429305  0.04588054 -0.90939219], (4,), F), 2), 2.0) + trace([[1.75 2.73 0.95 0.90]
 [2.73 4.26 1.51 1.40]
 [0.95 1.51 0.56 0.48]
 [0.90 1.40 0.48 0.47]]) + trace([[1.33 1.10 0.28 0.98]
 [1.10 0.97 0.27 0.78]
 [0.28 0.27 0.08 0.19]
 [0.98 0.78 0.19 0.74]]) + -2.0 @ Pnorm(reshape(var197, (16,), F), 2) @ Pnorm(reshape(var202, (16,), F), 2) + power(Pnorm(reshape([-0.40805684 -0.38390375  0.18957148 -0.31252703], (4,), F), 2), 2.0) + trace([[3.00 3.78 2.80 3.44]
 [3.78 4.80 3.54 4.39]
 [2.80 3.54 2.62 3.19]
 [3.44 4.39 3.19 4.25]]) + trace([[1.11 1.04 0.37 1.40]
 [1.04 1.04 0.39 1.34]
 [0.37 0.39 0.15 0.49]
 [1.40 1.34 0.49 1.78]]) + -2.0 @ Pnorm(reshape(var198, (16,), F), 2) @ Pnorm(reshape(var203, (16,), F), 2) + power(Pnorm(reshape([-0.76014819 -0.18013291  0.57865513  0.27663614], (4,), F), 2), 2.0) + trace([[3.01 2.80 3.90 3.30]
 [2.80 2.97 3.96 3.32]
 [3.90 3.96 5.35 4.49]
 [3.30 3.32 4.49 3.78]]) + trace([[1

This use of ``*`` has resulted in matrix multiplication.
Using ``*`` for matrix multiplication has been deprecated since CVXPY 1.1.
    Use ``*`` for matrix-scalar and vector-scalar multiplication.
    Use ``@`` for matrix-matrix and matrix-vector multiplication.
    Use ``multiply`` for elementwise multiplication.
This code path has been hit 11 times so far.

This use of ``*`` has resulted in matrix multiplication.
Using ``*`` for matrix multiplication has been deprecated since CVXPY 1.1.
    Use ``*`` for matrix-scalar and vector-scalar multiplication.
    Use ``@`` for matrix-matrix and matrix-vector multiplication.
    Use ``multiply`` for elementwise multiplication.
This code path has been hit 12 times so far.

This use of ``*`` has resulted in matrix multiplication.
Using ``*`` for matrix multiplication has been deprecated since CVXPY 1.1.
    Use ``*`` for matrix-scalar and vector-scalar multiplication.
    Use ``@`` for matrix-matrix and matrix-vector multiplication.
    Use ``

In [18]:
Q_vars = [cp.Variable((4, 4)) for _ in range(N)]  # Adjust size as necessary
W_vars = [cp.Variable((4, 4)) for _ in range(N)]  # Adjust size as necessary

In [67]:
#augmented_lagrangian computes the augmented lagrangian for the optimization problem
def augmented_lagrangian(LLmodels, HLmodels, Q_vars, W_vars, mu_L, mu_H, Sigma_L, Sigma_H,
                     mu_U_ll_hat, mu_U_hl_hat, Sigma_U_ll_hat, Sigma_U_hl_hat,
                     lambda_eps, lambda_del, lambda_Q, lambda_W, 
                     rho_Q, rho_W):
    #LLmodels are the ll models for every iota
    #HLmodels are the hl models for every omega_iota
    #mu_U_ll_hat is the mean vector of the low-level model
    #mu_U_hl_hat is the mean vector of the high-level model
    #Sigma_U_ll_hat is the covariance matrix of the low-level model
    #Sigma_U_hl_hat is the covariance matrix of the high-level model

    #mu_L is the mean vector of the low-level model
    #mu_H is the mean vector of the high-level model
    #Sigma_L is the covariance matrix of the low-level model
    #Sigma_H is the covariance matrix of the high-level model

    #lambda_eps is the Lagrange multiplier for the epsilon constraint (ll)
    #lambda_del is the Lagrange multiplier for the delta constraint (hl)

    #lambda_Q is the Lagrange multiplier for the Q_i constraints for every i
    #lambda_W is the Lagrange multiplier for the W_i constraints for every i

    #rho_Q is the penalty parameter for the Q_i constraints: same for all i
    #rho_W is the penalty parameter for the W_i constraints: same for all i

    #Q_vars are the Q_i variables for every i
    #W_vars are the W_i variables for every i

    loss = 0

    # Loss components from the original objective
    for iota in Ill:

        llcm = LLmodels[iota]
        hlcm = HLmodels[omega[iota]]
        llmech = llcm.compute_mechanism() 
        hlmech = hlcm.compute_mechanism()

        loss += cp.norm(llmech @ mu_L - hlmech @ mu_H, 'fro')**2
        loss += cp.trace(llmech @ Sigma_L @ llmech.T)
        loss += cp.trace(hlmech @ Sigma_H @ hlmech.T)
        loss -= 2 * cp.norm(Q_vars[iota], 'fro') * cp.norm(W_vars[iota], 'fro')

    # To compute the expected value, we will average the loss
    loss /= len(Ill)  # Average over the number of samples

    #Penalties for epsilon constraint
    loss += lambda_eps * (epsilon**2 - cp.norm(mu_L - mu_U_ll_hat)**2 - cp.norm(cp.sqrt(Sigma_L) - cp.sqrt(Sigma_U_ll_hat))**2)

    # Penalties for delta constraint
    loss += lambda_del * (delta**2 - cp.norm(mu_H - mu_U_hl_hat)**2 - cp.norm(cp.sqrt(Sigma_H) - cp.sqrt(Sigma_U_hl_hat))**2)

    # Lagrange multiplier terms for Q and W
    for iota in Ill:
        llcm = LLmodels[iota]
        hlcm = HLmodels[omega[iota]]
        llmech = llcm.compute_mechanism() 
        hlmech = hlcm.compute_mechanism()
        loss += cp.sum(lambda_Q[iota] * (Q_vars[iota] - cp.sqrt(llmech @ Sigma_L @ llmech.T)))
        loss += cp.sum(lambda_W[iota] * (W_vars[iota] - cp.sqrt(hlmech @ Sigma_H @ hlmech.T)))

    # Penalty terms for Q and W
    for iota in Ill:
        llcm = LLmodels[iota]
        hlcm = HLmodels[omega[iota]]
        llmech = llcm.compute_mechanism() 
        hlmech = hlcm.compute_mechanism()
        loss += (rho_Q / 2) * cp.norm(Q_vars[iota] - cp.sqrt(llmech @ Sigma_L @ llmech.T), 'fro')**2
        loss += (rho_W / 2) * cp.norm(W_vars[iota] - cp.sqrt(hlmech @ Sigma_H @ hlmech.T), 'fro')**2 

    return loss
    # return cp.Maximize(loss)


In [None]:
check the updates and add proper variable names
use cvxpy to solve the optimization problem

In [112]:
# Define the update functions
def update_mu_L(LLmodels, HLmodels, mu_H, lambda_eps, mu_U_ll_hat):
    N = len(LLmodels)
    E_LL = sum(LL_i.T @ LL_i for LL_i in LLmodels) / N
    E_LH_mu_H = sum(LL_i.T @ HL_i @ mu_H for LL_i, HL_i in zip(LLmodels, HLmodels)) / N
    reg_term = (lambda_eps / 2) * np.eye(mu_U_ll_hat.shape[0])
    mu_L_expr = np.linalg.inv(E_LL + reg_term) @ (E_LH_mu_H + (lambda_eps / 2) * mu_U_ll_hat)
    return mu_L_expr

def update_mu_H(HLmodels, LLmodels, mu_L, lambda_delta, mu_U_hl_hat):
    N = len(HLmodels)
    E_HH = sum(HL_iota.T @ HL_iota for HL_iota in HLmodels) / N
    E_HL_mu_L = sum(HL_iota.T @ LL_iota @ mu_L for LL_iota, HL_iota in zip(LLmodels, HLmodels)) / N
    reg_term = (lambda_delta / 2) * np.eye(mu_U_hl_hat.shape[0])
    mu_H_expr = np.linalg.inv(E_HH + reg_term) @ (E_HL_mu_L + (lambda_delta / 2) * mu_U_hl_hat)
    return mu_H_expr

def update_Sigma_L(LLmodels, lambda_eps, Sigma_U_ll_hat, lambda_Q, Sigma_L_k):
    # Ensure that lambda_eps is a scalar or a single value
    if np.isscalar(lambda_eps) or lambda_eps.size == 1:
        lambda_eps_value = lambda_eps if np.isscalar(lambda_eps) else lambda_eps.item()
        if lambda_eps_value > 0:
            # Check shapes of LLmodels and lambda_Q
            LL_sum = np.sum([LL_i @ LL_i.T for LL_i in LLmodels], axis=0)  # Should result in (4, 4) if each LL_i is (4, 3)
            lambda_Q_sum = np.sum(lambda_Q, axis=0)  # Ensure this matches the expected shape
            
            # Make sure lambda_Q_sum is shaped correctly for the operation
            if lambda_Q_sum.shape != LL_sum.shape:
                raise ValueError(f"Shape mismatch: LL_sum shape {LL_sum.shape} and lambda_Q_sum shape {lambda_Q_sum.shape}")

            Sigma_L_updated = np.linalg.inv(LL_sum + lambda_eps_value * np.eye(Sigma_L_k.shape[0])) @ (
                lambda_eps_value * Sigma_U_ll_hat + lambda_Q_sum
            )
        else:
            Sigma_L_updated = np.zeros_like(Sigma_L_k)  # Handle the case where the condition is not met
    else:
        raise ValueError("lambda_eps should be a scalar or a single value")

    return Sigma_L_updated

def update_Sigma_H(HLmodels, lambda_delta, Sigma_U_hl_hat, lambda_W, Sigma_H_k):
    if np.isscalar(lambda_delta) or lambda_delta.size == 1:
        lambda_delta_value = lambda_delta if np.isscalar(lambda_delta) else lambda_delta.item()
        if lambda_delta_value > 0:
            Sigma_H_updated = np.linalg.inv(np.sum(HLmodels, axis=0) + lambda_delta_value * np.eye(Sigma_H_k.shape[0])) @ (lambda_delta_value * Sigma_U_hl_hat + np.sum(lambda_W, axis=0))
        else:
            Sigma_H_updated = np.zeros_like(Sigma_H_k)
    else:
        raise ValueError("lambda_delta should be a scalar or a single value")
    return Sigma_H_updated

def update_Q_i(Q_i_k, lambda_Q_i, rho_Q, alpha):
    norm_Q = np.linalg.norm(Q_i_k, 'fro')
    if norm_Q > alpha:
        Q_i_updated = (1 - (alpha / norm_Q)) * (Q_i_k - (1 / rho_Q) * lambda_Q_i)
    else:
        Q_i_updated = np.zeros_like(Q_i_k)
    return Q_i_updated

def update_W_i(W_i_k, lambda_W_i, rho_W, alpha):
    norm_W = np.linalg.norm(W_i_k, 'fro')
    if norm_W > alpha:
        W_i_updated = (1 - (alpha / norm_W)) * (W_i_k - (1 / rho_W) * lambda_W_i)
    else:
        W_i_updated = np.zeros_like(W_i_k)
    return W_i_updated

def update_lambda_Q(lambda_Q_i_k, rho_Q, Q_i_k_plus_1, LL_i, Sigma_L_k_plus_1):
    term = np.sqrt(LL_i @ Sigma_L_k_plus_1 @ LL_i.T)
    lambda_Q_i_updated = lambda_Q_i_k + rho_Q * (Q_i_k_plus_1 - term)
    return lambda_Q_i_updated

def update_lambda_W(lambda_W_i_k, rho_W, W_i_k_plus_1, HL_i, Sigma_H_k_plus_1):
    term = np.sqrt(HL_i @ Sigma_H_k_plus_1 @ HL_i.T)
    lambda_W_i_updated = lambda_W_i_k + rho_W * (W_i_k_plus_1 - term)
    return lambda_W_i_updated

def update_lambda_epsilon(lambda_eps_k, mu_L_k_plus_1, mu_U_ll_hat, Sigma_L_k_plus_1, Sigma_U_ll_hat, epsilon):
    lambda_eps_updated = lambda_eps_k + (epsilon**2 - np.linalg.norm(mu_L_k_plus_1 - mu_U_ll_hat)**2 - np.linalg.norm(np.sqrt(Sigma_L_k_plus_1) - np.sqrt(Sigma_U_ll_hat))**2)
    return lambda_eps_updated

def update_lambda_delta(lambda_del_k, mu_H_k_plus_1, mu_U_hl_hat, Sigma_H_k_plus_1, Sigma_U_hl_hat, delta):
    lambda_del_updated = lambda_del_k + (delta**2 - np.linalg.norm(mu_H_k_plus_1 - mu_U_hl_hat)**2 - np.linalg.norm(np.sqrt(Sigma_H_k_plus_1) - np.sqrt(Sigma_U_hl_hat))**2)
    return lambda_del_updated

def compute_monge_map(mu_a, Sigma_a, mu_b, Sigma_b):
    # Check dimensions
    if mu_a.shape[0] != 3 or mu_b.shape[0] != 2:
        raise ValueError("Mean vectors must be of shape (3,) for Gaussian A and (2,) for Gaussian B.")
    
    if Sigma_a.shape != (3, 3) or Sigma_b.shape != (2, 2):
        raise ValueError("Covariance matrices must be of shape (3, 3) for Gaussian A and (2, 2) for Gaussian B.")

    # Compute the square root of the covariance matrices
    Sigma_a_sqrt = np.linalg.cholesky(Sigma_a)

    # Use a 2D projection from 3D to 2D, we need a method to match the dimensions
    # Here we assume a simple linear map for projection
    A = Sigma_a_sqrt[:2, :2]  # Take the first two rows/columns for the projection

    # Compute the transformation matrix
    A_inv = np.linalg.inv(A)
    B_inv = np.linalg.inv(Sigma_b)
    
    # Calculate the optimal transformation using the covariance matrices
    transformation_matrix = A_inv @ Sigma_b @ A_inv

    # Compute the inverse square root of the transformation matrix
    transformation_matrix_sqrt_inv = np.linalg.inv(np.linalg.cholesky(transformation_matrix))

    def T(x):
        # Ensure the input x is in the expected shape (3,)
        if x.shape[0] != 3:
            raise ValueError("Input x must be a 3D vector (shape: (3,)).")
        # Map the 3D vector to 2D
        return transformation_matrix_sqrt_inv @ (x - mu_a)[:2] + mu_b

    return T

In [115]:
# Optimization Parameters Initialization
num_iterations = 100000  # Number of iterations for optimization
rho_Q = 1.0  # Penalty parameter for Q updates
rho_W = 1.0  # Penalty parameter for W updates
alpha = 0.5  # Proximal parameter
epsilon = 0.1  # Epsilon constraint value
delta = 0.1  # Delta constraint value

# Example initialization (replace these with actual data)
LLmodels = [np.random.rand(4, 3) for _ in range(5)]  # Example L_i matrices
HLmodels = [np.random.rand(4, 2) for _ in range(5)]  # Example H_i matrices
mu_U_ll_hat = np.random.rand(3)  # Target mean for mu_L
mu_U_hl_hat = np.random.rand(2)  # Target mean for mu_H
Sigma_U_ll_hat = np.eye(3)  # Target covariance for Sigma_L
Sigma_U_hl_hat = np.eye(2)  # Target covariance for Sigma_H

# Initialize variables
mu_L = np.random.rand(3)  # Initial estimate for mu_L
mu_H = np.random.rand(2)  # Initial estimate for mu_H
Sigma_L = np.eye(3)  # Initial estimate for Sigma_L
Sigma_H = np.eye(2)  # Initial estimate for Sigma_H
Q_vars = [np.random.rand(4, 4) for _ in range(5)]  # Initial Q matrices
W_vars = [np.random.rand(4, 4) for _ in range(5)]  # Initial W matrices
lambda_Q = [np.zeros((4, 4)) for _ in range(5)]  # Initial lambda_Q
lambda_W = [np.zeros((4, 4)) for _ in range(5)]  # Initial lambda_W
lambda_eps = 0.0  # Initial lambda_eps
lambda_del = 0.0  # Initial lambda_delta

# Optimization Loop
for k in range(num_iterations):
    # Update mu_L and mu_H
    mu_L = update_mu_L(LLmodels, HLmodels, mu_H, lambda_eps, mu_U_ll_hat)
    mu_H = update_mu_H(HLmodels, LLmodels, mu_L, lambda_delta, mu_U_hl_hat)

    # Update Sigma_L and Sigma_H
    Sigma_L = Sigma_U_ll_hat #update_Sigma_L(LLmodels, rho_Q, Sigma_U_ll_hat, lambda_Q, Sigma_L)
    Sigma_H = Sigma_U_hl_hat #update_Sigma_H(HLmodels, rho_W, Sigma_U_hl_hat, lambda_W, Sigma_H)

    # Update Q and W
    for iota in range(len(Q_vars)):
        Q_vars[iota] = update_Q_i(Q_vars[iota], lambda_Q[iota], rho_Q, alpha)
        W_vars[iota] = update_W_i(W_vars[iota], lambda_W[iota], rho_W, alpha)

    # Update lambda multipliers
    for iota in range(len(Q_vars)):
        lambda_Q[iota] = update_lambda_Q(lambda_Q[iota], rho_Q, Q_vars[iota], LLmodels[iota], Sigma_L)
        lambda_W[iota] = update_lambda_W(lambda_W[iota], rho_W, W_vars[iota], HLmodels[iota], Sigma_H)

    # Update lambda_eps and lambda_delta
    lambda_eps = update_lambda_epsilon(lambda_eps, mu_L, mu_U_ll_hat, Sigma_L, Sigma_U_ll_hat, epsilon)
    lambda_del = update_lambda_delta(lambda_del, mu_H, mu_U_hl_hat, Sigma_H, Sigma_U_hl_hat, delta)

# Print the final results
print("Final mu_L:", mu_L)
print("Final Sigma_L:\n", Sigma_L)
print("Final mu_H:", mu_H)
print("Final Sigma_H:\n", Sigma_H)
print( )
tau = compute_monge_map(mu_L, Sigma_L, mu_H, Sigma_H)
x_sample = np.array([1.5, 2.5, 3.5])
mapped_point = tau(x_sample)

print("Mapped point:", mapped_point)

Final mu_L: [0.55675283 0.06727336 0.64644982]
Final Sigma_L:
 [[1. 0. 0.]
 [0. 1. 0.]
 [0. 0. 1.]]
Final mu_H: [0.38240578 0.64187901]
Final Sigma_H:
 [[1. 0.]
 [0. 1.]]

Mapped point: [1.32565294 3.07460566]


In [108]:
import torch

# Define the dimensions
dim_mu_L = 3
dim_mu_H = 2
dim_Sigma = 2
N = 5  # Number of samples

# Initialize parameters
L = [torch.randn(dim_Sigma, dim_mu_L) for _ in range(N)]
H = [torch.randn(dim_Sigma, dim_mu_H) for _ in range(N)]
Q = [torch.randn(dim_Sigma, dim_mu_L) for _ in range(N)]
W = [torch.randn(dim_Sigma, dim_mu_H) for _ in range(N)]
epsilon = 0.1
delta = 0.1
mu_L_hat = torch.randn(dim_mu_L)
Sigma_L_hat = torch.randn(dim_Sigma, dim_Sigma)
mu_H_hat = torch.randn(dim_mu_H)
Sigma_H_hat = torch.randn(dim_Sigma, dim_Sigma)
alpha = 0.5
rho_Q = 1.0
rho_W = 1.0
num_iterations = 100

# Initialize variables
mu_L = torch.zeros(dim_mu_L, requires_grad=True)
mu_H = torch.zeros(dim_mu_H, requires_grad=True)
Sigma_L = torch.eye(dim_Sigma, requires_grad=True)
Sigma_H = torch.eye(dim_Sigma, requires_grad=True)

Q_vars = [torch.zeros(Q[i].shape, requires_grad=True) for i in range(N)]
W_vars = [torch.zeros(W[i].shape, requires_grad=True) for i in range(N)]

lambda_Q = [torch.zeros(Q[i].shape) for i in range(N)]
lambda_W = [torch.zeros(W[i].shape) for i in range(N)]
lambda_eps = torch.tensor(0.0, requires_grad=True)
lambda_del = torch.tensor(0.0, requires_grad=True)

optimizer = torch.optim.Adam([mu_L, mu_H, Sigma_L, Sigma_H, lambda_eps, lambda_del] + Q_vars + W_vars, lr=0.01)

for k in range(num_iterations):
    optimizer.zero_grad()

    # Compute the augmented Lagrangian
    lagrangian = 0
    for i in range(N):
        lagrangian += torch.norm(L[i] @ mu_L - H[i] @ mu_H)**2
        lagrangian += torch.norm(L[i] @ Sigma_L @ L[i].T - Q_vars[i])**2
        lagrangian += torch.norm(H[i] @ Sigma_H @ H[i].T - W_vars[i])**2

    lagrangian += lambda_eps * (epsilon**2 - torch.norm(mu_L - mu_L_hat)**2 - torch.norm(Sigma_L - Sigma_L_hat)**2)
    lagrangian += lambda_del * (delta**2 - torch.norm(mu_H - mu_H_hat)**2 - torch.norm(Sigma_H - Sigma_H_hat)**2)

    # Compute gradients
    lagrangian.backward()

    # Update variables
    optimizer.step()

    # Update Lagrange multipliers
    with torch.no_grad():
        for i in range(N):
            lambda_Q[i] += rho_Q * (Q_vars[i] - torch.sqrt(L[i] @ Sigma_L @ L[i].T))
            lambda_W[i] += rho_W * (W_vars[i] - torch.sqrt(H[i] @ Sigma_H @ H[i].T))

        lambda_eps += (epsilon**2 - torch.norm(mu_L - mu_L_hat)**2 - torch.norm(Sigma_L - Sigma_L_hat)**2)
        lambda_del += (delta**2 - torch.norm(mu_H - mu_H_hat)**2 - torch.norm(Sigma_H - Sigma_H_hat)**2)

# Print the results
print("mu_L:", mu_L)
print("Sigma_L:", Sigma_L)
print("mu_H:", mu_H)
print("Sigma_H:", Sigma_H)
print("Q_vars:", Q_vars)
print("W_vars:", W_vars)


RuntimeError: mat1 and mat2 shapes cannot be multiplied (2x3 and 2x2)

In [70]:
import numpy as np



# Example usage
mu_a = np.array([1.0, 2.0, 3.0])  # Mean of the 3D Gaussian
Sigma_a = np.array([[1.0, 0.5, 0.2], [0.5, 1.0, 0.3], [0.2, 0.3, 1.0]])  # Covariance of the 3D Gaussian

mu_b = np.array([3.0, 4.0])  # Mean of the 2D Gaussian
Sigma_b = np.array([[1.0, 0.2], [0.2, 1.0]])  # Covariance of the 2D Gaussian

# Compute the Monge map
monge_map = compute_monge_map(mu_a, Sigma_a, mu_b, Sigma_b)

# Example input point from the 3D Gaussian
x_sample = np.array([1.5, 2.5, 3.5])
mapped_point = monge_map(x_sample)

print("Mapped point:", mapped_point)


Mapped point: [3.53163521 6.39094344]


In [137]:
import cvxpy as cp
import numpy as np

def update_mu_L(LLmodels, HLmodels, mu_H, lambda_eps, mu_U_ll_hat):
    """
    Update mu_L using the specified formula in a CVXPY context.
    
    Parameters:
    - LLmodels: List of L_i matrices (square numpy arrays).
    - HLmodels: List of H_i matrices (square numpy arrays).
    - mu_H: Current estimate of mu_H (cvxpy Variable).
    - lambda_eps: Regularization parameter for epsilon.
    - mu_U_ll_hat: Reference mean vector of the low-level model.
    
    Returns:
    - Updated mu_L as a cvxpy Variable.
    """
    # Number of samples
    N = len(LLmodels)

    # CVXPY variable for mu_L (3-dimensional)
    mu_L = cp.Variable(mu_U_ll_hat.shape[0])
    
    # Compute the expected value of L_i^T L_i
    E_LL = sum(LL_i @ LL_i.T for LL_i in LLmodels) / N  # Correct dimension handling
    
    # Compute the expected value of L_i^T H_i mu_H
    E_LH_mu_H = sum(LL_i @ HL_i @ mu_H for LL_i, HL_i in zip(LLmodels, HLmodels)) / N
    
    # Regularization term
    reg_term = (lambda_eps / 2) * np.eye(mu_U_ll_hat.shape[0])
    
    # Construct the update expression using cp.inv for matrix inversion
    mu_L_expr = cp.inv(E_LL + reg_term) @ (E_LH_mu_H + (lambda_eps / 2) * mu_U_ll_hat)

    return mu_L, mu_L_expr  # Returning mu_L variable for optimization and its expression

# Example usage
d_LL = 3  # Dimension of low-level model
d_HL = 2  # Dimension of high-level model

# Create square matrices for LLmodels and HLmodels
LLmodels = [np.random.rand(d_LL, d_LL) for _ in range(5)]  # Example L_i matrices (3x3)
HLmodels = [np.random.rand(d_HL, d_HL) for _ in range(5)]  # Example H_i matrices (2x2)

# Current estimate for mu_H as a CVXPY variable (2-dimensional)
mu_H = cp.Variable(d_HL)  
lambda_eps = 0.1  # Regularization parameter
# Reference mean vector for mu_L (3-dimensional)
mu_U_ll_hat = np.random.rand(d_LL)  

# Update mu_L
mu_L, mu_L_expr = update_mu_L(LLmodels, HLmodels, mu_H, lambda_eps, mu_U_ll_hat)

# Print the CVXPY expression for the updated mu_L
print("Update expression for mu_L:", mu_L_expr)


ValueError: matmul: Input operand 1 has a mismatch in its core dimension 0, with gufunc signature (n?,k),(k,m?)->(n?,m?) (size 2 is different from 3)

In [166]:
import cvxpy as cp
import numpy as np

# Define your problem data
L = np.random.rand(10, 3, 3)  # Example data, replace with actual values
H = np.random.rand(10, 2, 2)
U_L = np.random.rand(5, 3)
U_H = np.random.rand(5, 2)
N = 5
epsilon = 1.0
delta = 1.0
num_samples = L.shape[0]
m, n = H.shape[1], L.shape[1]

# Initialize variables
T = np.random.rand(m, n)
Theta = np.random.rand(N, n)
Phi = np.random.rand(N, m)

# Alternating optimization parameters
max_iters = 100
tol = 1e-4

# Define the update functions
def update_T(L, H, U_L, U_H, Theta, Phi, T_prev):
    """Update step for T using CVXPY with fixed Theta and Phi."""
    T_var = cp.Variable((m, n), nonneg=True)
    objective = 0
    for i in range(num_samples):
        Li = L[i]
        Hi = H[i]
        A = T_var @ Li @ (U_L.T + Theta.T) - Hi @ (U_H.T + Phi.T)
        objective += cp.norm(A, "fro")**2
    objective = cp.Minimize(objective / num_samples)
    prob = cp.Problem(objective)
    prob.solve()
    return T_var.value

def update_Theta(L, H, U_L, U_H, T, Phi, Theta_prev, epsilon, N):
    """Update step for Theta using CVXPY with fixed T and Phi."""
    Theta_var = cp.Variable((N, n))
    objective = 0
    for i in range(num_samples):
        Li = L[i]
        Hi = H[i]
        A = T @ Li @ (U_L.T + Theta_var.T) - Hi @ (U_H.T + Phi.T)
        objective += cp.norm(A, "fro")**2
    constraints = [cp.norm(Theta_var, "fro") <= np.sqrt(N * epsilon**2)]
    objective = cp.Minimize(-objective / num_samples)
    prob = cp.Problem(objective, constraints)
    prob.solve()
    return Theta_var.value

def update_Phi(L, H, U_L, U_H, T, Theta, Phi_prev, delta, N):
    """Update step for Phi using CVXPY with fixed T and Theta."""
    Phi_var = cp.Variable((N, m))
    objective = 0
    for i in range(num_samples):
        Li = L[i]
        Hi = H[i]
        A = T @ Li @ (U_L.T + Theta.T) - Hi @ (U_H.T + Phi_var.T)
        objective += cp.norm(A, "fro")**2
    constraints = [cp.norm(Phi_var, "fro") <= np.sqrt(N * delta**2)]
    objective = cp.Minimize(-objective / num_samples)
    prob = cp.Problem(objective, constraints)
    prob.solve()
    return Phi_var.value

# Main optimization loop
for iteration in range(max_iters):
    T_prev, Theta_prev, Phi_prev = T.copy(), Theta.copy(), Phi.copy()

    # Update each variable in turn
    T = update_T(L, H, U_L, U_H, Theta, Phi, T)
    Theta = update_Theta(L, H, U_L, U_H, T, Phi, Theta, epsilon, N)
    Phi = update_Phi(L, H, U_L, U_H, T, Theta, Phi, delta, N)

    # Check for convergence
    if (np.linalg.norm(T - T_prev, 'fro') < tol and
        np.linalg.norm(Theta - Theta_prev, 'fro') < tol and
        np.linalg.norm(Phi - Phi_prev, 'fro') < tol):
        print(f"Converged in {iteration + 1} iterations.")
        break

# Final optimized values of T, Theta, and Phi
print("Optimized T:", T)
print("Optimized Theta:", Theta)
print("Optimized Phi:", Phi)


DCPError: Problem does not follow DCP rules. Specifically:
The objective is not DCP, even though each sub-expression is.
You are trying to minimize a function that is concave.

In [183]:
import cvxpy as cp
import numpy as np

# Define your problem data
L = np.random.rand(10, 3, 3)  # Example data, replace with actual values
H = np.random.rand(10, 2, 2)
U_L = np.random.rand(5, 3)
U_H = np.random.rand(5, 2)
N = 5
epsilon = 1.0
delta = 1.0
num_samples = L.shape[0]
m, n = H.shape[1], L.shape[1]

# Initialize variables
T = np.random.rand(m, n)
Theta = np.random.rand(N, n)
Phi = np.random.rand(N, m)

# Alternating optimization parameters
max_iters = 100
tol = 1e-4

# Define the update functions
def update_T(L, H, U_L, U_H, Theta, Phi, T_prev):
    """Update step for T using CVXPY with fixed Theta and Phi."""
    T_var = cp.Variable((m, n), nonneg=True)
    objective = 0
    for i in range(num_samples):
        Li = L[i]
        Hi = H[i]
        A = T_var @ Li @ (U_L.T + Theta.T) - Hi @ (U_H.T + Phi.T)
        objective += cp.norm(A, "fro")**2
    objective = cp.Minimize(objective / num_samples)
    prob = cp.Problem(objective)
    prob.solve()
    return T_var.value

def update_Theta(L, H, U_L, U_H, T, Phi, Theta_prev, epsilon, N):
    """Update step for Theta using CVXPY with fixed T and Phi."""
    Theta_var = cp.Variable((N, n))
    objective = 0
    for i in range(num_samples):
        Li = L[i]
        Hi = H[i]
        A = T @ Li @ (U_L.T + Theta_var.T) - Hi @ (U_H.T + Phi.T)
        objective = cp.norm(A, "fro")**2
    constraints = [cp.norm(Theta_var, "fro") <= np.sqrt(N * epsilon**2)]
    # Minimize the negative of the objective to handle maximization
    objective = cp.Minimize(objective / num_samples)
    prob = cp.Problem(objective, constraints)
    prob.solve()
    return Theta_var.value

def update_Phi(L, H, U_L, U_H, T, Theta, Phi_prev, delta, N):
    """Update step for Phi using CVXPY with fixed T and Theta."""
    Phi_var = cp.Variable((N, m))
    objective = 0
    for i in range(num_samples):
        Li = L[i]
        Hi = H[i]
        A = T @ Li @ (U_L.T + Theta.T) - Hi @ (U_H.T + Phi_var.T)
        objective = cp.norm(A, "fro")**2
    constraints = [cp.norm(Phi_var, "fro") <= np.sqrt(N * delta**2)]
    # Minimize the negative of the objective to handle maximization
    objective = cp.Minimize(objective / num_samples)
    prob = cp.Problem(objective, constraints)
    prob.solve()
    return Phi_var.value

# Main optimization loop
for iteration in range(max_iters):
    T_prev, Theta_prev, Phi_prev = T.copy(), Theta.copy(), Phi.copy()

    # Update each variable in turn
    T     = update_T(L, H, U_L, U_H, Theta, Phi, T)
    Theta = update_Theta(L, H, U_L, U_H, T, Phi, Theta, epsilon, N)
    Phi   = update_Phi(L, H, U_L, U_H, T, Theta, Phi, delta, N)

    # Check for convergence
    if (np.linalg.norm(T - T_prev, 'fro') < tol and
        np.linalg.norm(Theta - Theta_prev, 'fro') < tol and
        np.linalg.norm(Phi - Phi_prev, 'fro') < tol):
        print(f"Converged in {iteration + 1} iterations.")
        break

# Final optimized values of T, Theta, and Phi
print("Optimized T:", T)
print("Optimized Theta:", Theta)
print("Optimized Phi:", Phi)


Converged in 17 iterations.
Optimized T: [[1.76400242e-06 2.47997573e-06 2.61594958e-06]
 [1.72439876e-06 2.44073676e-06 2.54596391e-06]]
Optimized Theta: [[-0.0015276  -0.00126539 -0.00157781]
 [-0.004604   -0.00381353 -0.00475515]
 [-0.00521074 -0.00431356 -0.00537965]
 [-0.00543193 -0.0044979  -0.00560906]
 [-0.00454818 -0.00376629 -0.00469665]]
Optimized Phi: [[-0.10597937 -0.31120573]
 [-0.36405932 -0.86442975]
 [-0.9319195  -0.15612672]
 [-0.71643512 -0.56644337]
 [-0.56412914 -0.53022537]]


In [203]:
U_L = U_ll_hat
U_H = U_hl_hat

num_samples, n = U_L.shape
num_samples, m = U_H.shape

epsilon     = 1.0
delta       = 1.0
alpha       = 0.01  # Learning rate for ascent steps in Theta and Phi

# Initialize variables
T     = np.random.rand(m, n)
Theta = np.random.rand(N, n)
Phi   = np.random.rand(N, m)

# Project onto Frobenius ball function
def project_onto_frobenius_ball(matrix, radius):
    norm = np.linalg.norm(matrix, 'fro')
    if norm > radius:
        return matrix * (radius / norm)
    return matrix

# Update function for T 
def update_T(U_L, U_H, Theta, Phi):
    T_var = cp.Variable((m, n), nonneg=True)
    objective = 0
    for iota in Ill:
        Li = LLmodels[iota].compute_mechanism() 
        Hi = HLmodels[omega[iota]].compute_mechanism()
        A  = T_var @ Li @ (U_L.T + Theta.T) - Hi @ (U_H.T + Phi.T)

        objective += cp.norm(A, "fro")**2

    objective = cp.Minimize(objective / num_samples)
    prob = cp.Problem(objective)
    prob.solve()
    return T_var.value

# Gradient ascent step for Theta
def ascent_step_Theta(U_L, U_H, T, Phi, Theta, epsilon, N, alpha):
    gradient = np.zeros_like(Theta)
    for iota in Ill:
        Li = LLmodels[iota].compute_mechanism() 
        Hi = HLmodels[omega[iota]].compute_mechanism()
        A  = T @ Li @ (U_L.T + Theta.T) - Hi @ (U_H.T + Phi.T)

        gradient += ((T @ Li).T @ A).T  # Compute gradient wrt Theta

    gradient /= num_samples
    Theta += alpha * gradient  # Ascent step
    return project_onto_frobenius_ball(Theta, np.sqrt(N * epsilon**2))

# Gradient ascent step for Phi
def ascent_step_Phi(U_L, U_H, T, Theta, Phi, delta, N, alpha):
    gradient = np.zeros_like(Phi)
    for iota in Ill:
        Li = LLmodels[iota].compute_mechanism() 
        Hi = HLmodels[omega[iota]].compute_mechanism()
        A  = T @ Li @ (U_L.T + Theta.T) - Hi @ (U_H.T + Phi.T)

        gradient += (Hi @ A).T  # Compute gradient wrt Phi

    gradient /= num_samples
    Phi += alpha * gradient  # Ascent step
    return project_onto_frobenius_ball(Phi, np.sqrt(N * delta**2))

# Main optimization loop
max_iters = 100
tol = 1e-4

for iteration in range(max_iters):
    T_prev, Theta_prev, Phi_prev = T.copy(), Theta.copy(), Phi.copy()

    # Minimize wrt T
    T = update_T(L, H, U_L, U_H, Theta, Phi)

    # Maximize wrt Theta and Phi using gradient ascent
    Theta = ascent_step_Theta(U_L, U_H, T, Phi, Theta, epsilon, N, alpha)
    Phi   = ascent_step_Phi(U_L, U_H, T, Theta, Phi, delta, N, alpha)

    # Check for convergence
    if (np.linalg.norm(T - T_prev, 'fro') < tol and
        np.linalg.norm(Theta - Theta_prev, 'fro') < tol and
        np.linalg.norm(Phi - Phi_prev, 'fro') < tol):
        print(f"Converged in {iteration + 1} iterations.")
        break

# Final optimized values of T, Theta, and Phi
print("Optimized T:", T)
print("Optimized Theta:", Theta)
print("Optimized Phi:", Phi)

Optimized T: [[0.19407151 0.50730026 0.48335842]
 [0.19189463 0.13342697 0.2741195 ]]
Optimized Theta: [[ 0.04010824  0.48980465  0.32861929]
 [ 0.4817851   0.13822311  0.41436349]
 [-0.95775434 -1.00583146 -0.69239412]
 [-0.17044718  0.71039676  0.5405109 ]
 [ 0.29193497  0.59663659  0.62026516]]
Optimized Phi: [[ 0.38564357 -0.17346259]
 [ 0.04767077  0.3532042 ]
 [-1.68878638 -1.13244729]
 [ 0.2707614  -0.07152461]
 [ 0.19612339  0.34677054]]


In [215]:
x_sample = np.array([1.5, 2.5, 3.5])
mapped_point = T @ x_sample
print(f'{x_sample} maps to {mapped_point}')

[1.5 2.5 3.5] maps to [3.25111237 1.58082761]


Barycenter Mean: [-0.2  2.8]
Barycenter Covariance: [[0.88284666 0.23464565]
 [0.23464565 2.50711389]]
