In [13]:
import json
import pickle
import time
from datetime import datetime, timedelta
from itertools import product

import numpy as np
import seaborn as sns
import torch
import joblib
import os

import matplotlib.pyplot as plt
from tqdm import tqdm

# Local modules
import modularised_utils as mut
import opt_utils as oput
import evaluation_utils as evut
import Linear_Additive_Noise_Models as lanm
import operations as ops
import params

np.random.seed(0)

In [2]:
experiment       = 'synth1'
abduction        = False
coeff_estimation = False

# Define the radius of the Wasserstein balls (epsilon, delta) and the size for both models.
epsilon, delta           = params.radius[experiment]
ll_num_envs, hl_num_envs = params.n_envs[experiment]

# Define the number of samples per environment. Currently every environment has the same number of samples
num_llsamples, num_hlsamples  = params.n_samples[experiment]

# Load ground truth abstraction
Tau = mut.load_T(experiment)

In [3]:
Dll_obs  = mut.load_samples(experiment)[None][0] 
Gll, Ill = mut.load_model(experiment, 'LL')
l        = len(Gll.nodes())

Dhl_obs  = mut.load_samples(experiment)[None][1] 
Ghl, Ihl = mut.load_model(experiment, 'HL')
h        = len(Ghl.nodes())

omega    = mut.load_omega_map(experiment)

In [4]:
if coeff_estimation == True:
    ll_coeffs = mut.get_coefficients(Dll_obs, Gll)
    hl_coeffs = mut.get_coefficients(Dhl_obs, Ghl) 
else:
    ll_coeffs = mut.load_coeffs(experiment, 'LL')
    hl_coeffs = mut.load_coeffs(experiment, 'HL')

In [5]:
if abduction == True:
    U_ll_hat, mu_U_ll_hat, Sigma_U_ll_hat = mut.lan_abduction(Dll_obs, Gll, ll_coeffs)
    U_hl_hat, mu_U_hl_hat, Sigma_U_hl_hat = mut.lan_abduction(Dhl_obs, Ghl, hl_coeffs)
else:
    U_ll_hat, mu_U_ll_hat, Sigma_U_ll_hat = mut.load_exogenous(experiment, 'LL')
    U_hl_hat, mu_U_hl_hat, Sigma_U_hl_hat = mut.load_exogenous(experiment, 'HL')

In [6]:
A_ll = mut.generate_perturbed_datasets(D = U_ll_hat, bound = epsilon, num_envs = ll_num_envs) #Low-level: A_epsilon
A_hl = mut.generate_perturbed_datasets(D = U_hl_hat, bound = delta, num_envs = hl_num_envs) #High-level A_delta

In [7]:
LLmodels = {}
for iota in Ill:
    LLmodels[iota] = lanm.LinearAddSCM(Gll, ll_coeffs, iota)
    
HLmodels, Dhl_samples = {}, {}
for eta in Ihl:
    HLmodels[eta] = lanm.LinearAddSCM(Ghl, hl_coeffs, eta)

In [8]:
# U_L = U_ll_hat
# U_H = U_hl_hat

# num_samples, n = U_L.shape
# num_samples, m = U_H.shape

# epsilon = 0.5  # Radius of the Wasserstein ball for the low-level model
# delta   = 0.5 # Radius of the Wasserstein ball for the high-level model
# alpha   = 0.0001 # Learning rate for ascent steps in Theta and Phi

# # Initialize variables
# T     = np.random.rand(m, n)
# Theta = np.random.rand(num_samples, n)
# Phi   = np.random.rand(num_samples, m)

# # Project onto Frobenius ball function
# def project_onto_frobenius_ball(matrix, radius):
#     norm = np.linalg.norm(matrix, 'fro')
#     if norm > radius:
#         return matrix * (radius / norm)
#     return matrix

# # Update function for T 
# def update_T(U_L, U_H, Theta, Phi):
#     T_var = cp.Variable((m, n), nonneg=True)
#     objective = 0
#     for iota in Ill:
#         Li = LLmodels[iota].F() 
#         Hi = HLmodels[omega[iota]].F()
#         A  = T_var @ Li @ (U_L.T + Theta.T) - Hi @ (U_H.T + Phi.T)

#         objective += cp.norm(A, "fro")**2

#     objective = cp.Minimize(objective / num_samples)
#     prob      = cp.Problem(objective)
#     prob.solve()
#     return T_var.value

# # Gradient ascent step for Theta
# def ascent_step_Theta(U_L, U_H, T, Phi, Theta, epsilon, alpha):
#     gradient = np.zeros_like(Theta)
#     for iota in Ill:
#         Li = LLmodels[iota].F() 
#         Hi = HLmodels[omega[iota]].F()
#         A  = T @ Li @ (U_L.T + Theta.T) - Hi @ (U_H.T + Phi.T)

#         gradient += ((T @ Li).T @ A).T  # Compute gradient wrt Theta

#     gradient /= num_samples
#     Theta += alpha * gradient  # Ascent step
#     return project_onto_frobenius_ball(Theta, np.sqrt(num_samples * epsilon**2))

# # Gradient ascent step for Phi
# def ascent_step_Phi(U_L, U_H, T, Theta, Phi, delta, alpha):
#     gradient = np.zeros_like(Phi)
#     for iota in Ill:
#         Li = LLmodels[iota]._compute_reduced_form() 
#         Hi = HLmodels[omega[iota]]._compute_reduced_form()
#         A  = T @ Li @ (U_L.T + Theta.T) - Hi @ (U_H.T + Phi.T)

#         gradient += (Hi @ A).T  # Compute gradient wrt Phi

#     gradient /= num_samples
#     Phi += alpha * gradient  # Ascent step
#     return project_onto_frobenius_ball(Phi, np.sqrt(num_samples * delta**2))

# # Main optimization loop
# max_iters = 100
# tol = 1e-5

# for iteration in tqdm(range(max_iters)):
#     T_prev, Theta_prev, Phi_prev = T.copy(), Theta.copy(), Phi.copy()

#     # Minimize wrt T
#     T = update_T(U_L, U_H, Theta, Phi)

#     # Maximize wrt Theta and Phi using gradient ascent
#     Theta = ascent_step_Theta(U_L, U_H, T, Phi, Theta, epsilon, alpha)
#     Phi   = ascent_step_Phi(U_L, U_H, T, Theta, Phi, delta, alpha)

#     # Check for convergence
#     if (np.linalg.norm(T - T_prev, 'fro') < tol and
#         np.linalg.norm(Theta - Theta_prev, 'fro') < tol and
#         np.linalg.norm(Phi - Phi_prev, 'fro') < tol):
#         print(f"Converged in {iteration + 1} iterations.")
#         break

# # Final optimized values of T, Theta, and Phi
# print("Optimized T:", T)
# print("Optimized Theta:", Theta)
# print("Optimized Phi:", Phi)

In [27]:
# Projection onto Frobenius ball
def project_onto_frobenius_ball(matrix, radius):
    norm = torch.norm(matrix, p='fro')
    if norm > radius:
        return matrix * (radius / norm)
    return matrix

# Objective function
def objective(U_L, U_H, T, Theta, Phi, L, H):
    loss_iota = 0
    for iota in Ill:
        L_i       = torch.from_numpy(LLmodels[iota].F).float()
        H_i       = torch.from_numpy(HLmodels[omega[iota]].F).float()
        pert_L_i  = U_L + Theta
        pert_H_i  = U_H + Phi
       
        loss_iota = loss_iota + torch.norm(T @ L_i @ pert_L_i.T - H_i @ pert_H_i.T, p='fro')**2

    loss = loss_iota/len(Ill)

    return loss                                                          

def run_empirical_optimization(theta_hatL, theta_hatH, initial_theta,
                               epsilon, delta, eta_min, eta_max,
                               num_stpes_min, num_stpes_max, max_iter, tol, seed,
                               robust_L, robust_H, plot_epochs, display_results):

    torch.manual_seed(seed) 

    # Start timing
    start_time = time.time()
    erica      = robust_L or robust_H
   
    U_L = torch.from_numpy(theta_hatL['U_L']).float()
    U_H = torch.from_numpy(theta_hatH['U_H']).float()
    num_samples = U_L.shape[0]
    l = U_L.shape[1]
    h = U_H.shape[1]

# Initialize variables with requires_grad=True
    T = torch.randn(h, l, requires_grad=True)
    if initial_theta == 'random':
        Theta = torch.randn(num_samples, l, requires_grad=True)
        Phi = torch.randn(num_samples, h, requires_grad=True)
    elif initial_theta == 'projected':
        Theta = torch.randn(num_samples, l, requires_grad=True)
        Phi = torch.randn(num_samples, h, requires_grad=True)
        with torch.no_grad():
            Theta.data = project_onto_frobenius_ball(Theta, torch.sqrt(torch.tensor(num_samples) * epsilon**2))
            Phi.data = project_onto_frobenius_ball(Phi, torch.sqrt(torch.tensor(num_samples) * delta**2))
# Create optimizers
    optimizer_T = torch.optim.SGD([T], lr=eta_min)
    optimizer_theta = torch.optim.SGD([Theta], lr=eta_max)
    optimizer_phi = torch.optim.SGD([Phi], lr=eta_max)

    prev_objective = float('inf')
    epoch_objectives = {'T_objectives_overall': [], 'theta_objectives_overall': []}

    for iteration in tqdm(range(max_iter)):
        # Minimization step for T
        for _ in range(num_stpes_min):
            optimizer_T.zero_grad()
            loss_min = objective(U_L, U_H, T, Theta, Phi, LLmodels, HLmodels)
            loss_min.backward()
            optimizer_T.step()
        
        # Maximization step for Theta and Phi
        for _ in range(num_stpes_max):
            optimizer_theta.zero_grad()
            optimizer_phi.zero_grad()
            
            loss_max = objective(U_L, U_H, T, Theta, Phi, LLmodels, HLmodels)
            loss_max.backward()
            
            # Update using optimizers
            optimizer_theta.step()
            optimizer_phi.step()
            
            # Project back onto constraint sets
            with torch.no_grad():
                Theta.data = project_onto_frobenius_ball(Theta, torch.sqrt(torch.tensor(num_samples) * epsilon**2))
                Phi.data = project_onto_frobenius_ball(Phi, torch.sqrt(torch.tensor(num_samples) * delta**2))
        
        # Check convergence
        with torch.no_grad():
            current_objective = objective(U_L, U_H, T, Theta, Phi, LLmodels, HLmodels).item()
            objective_change = abs(prev_objective - current_objective)
            if objective_change < tol:
                print(f"Converged in {iteration + 1} iterations.")
                break
            prev_objective = current_objective

        if plot_epochs:
            oput.plot_epoch_objectives(epoch_objectives, erica)


    
    U_L_final = U_L + Theta
    U_H_final = U_H + Phi

    paramsL      = {'U_L': U_L_final.detach().numpy(), 'radius': epsilon}
    paramsH      = {'U_H': U_H_final.detach().numpy(), 'radius': delta}
    T            = T.detach().numpy()
    end_time     = time.time()
    elapsed_time = end_time - start_time

    if display_results == True:
        oput.print_results(T, paramsL, paramsH, elapsed_time)


    return T, Theta, Phi

In [28]:
epsilon = .9
delta   = .8

theta_hatL   = {'U_L': U_ll_hat, 'radius': epsilon}
theta_hatH   = {'U_H': U_hl_hat, 'radius': delta}

seed = 23

In [29]:
params_empirical = {
                    'theta_hatL': theta_hatL,      # Initial low-level parameters
                    'theta_hatH': theta_hatH,      # Initial high-level parameters
                    'initial_theta': 'random',     # Added: initialization method for Theta/Phi
                    'epsilon': 0.5,               # Low-level radius
                    'delta': 0.5,                 # High-level radius
                    'eta_min': 0.001,             # Added: minimum learning rate
                    'eta_max': 0.01,              # Added: maximum learning rate
                    'num_stpes_min': 4,           # Steps for minimization
                    'num_stpes_max': 3,           # Steps for maximization
                    'max_iter': 10,               # Maximum iterations
                    'tol': 1e-5,                  # Convergence tolerance
                    'seed': seed,                 # Random seed
                    'robust_L': True,             # Added: robustness flag for low-level
                    'robust_H': True,             # Added: robustness flag for high-level
                    'plot_epochs': False,         # Added: whether to plot epoch objectives
                    'display_results': False      # Whether to display results
                   }

results = run_empirical_optimization(**params_empirical)

100%|██████████| 10/10 [00:01<00:00,  6.58it/s]


In [30]:
results

(array([[nan, nan, nan],
        [nan, nan, nan]], dtype=float32),
 tensor([[nan, nan, nan],
         [nan, nan, nan],
         [nan, nan, nan],
         ...,
         [nan, nan, nan],
         [nan, nan, nan],
         [nan, nan, nan]], requires_grad=True),
 tensor([[nan, nan],
         [nan, nan],
         [nan, nan],
         ...,
         [nan, nan],
         [nan, nan],
         [nan, nan]], requires_grad=True))

In [14]:
# Tau  = mut.load_T(experiment)
# #Tau = torch.from_numpy(Tau).float()
# diff = torch.norm(T-Tau, 'fro')

# print(f"Estimated T =  {T}, \n \nGround Truth T = {Tau} \n \nFrobenius Distance = {diff}")

Estimated T =  [[1.52829830e-02 4.73341307e-01 1.25829370e-11]
 [2.26566373e-02 5.95564466e-01 3.49693126e-12]], 
 
Ground Truth T = [[1 2 1]
 [0 1 0]] 
 
Frobenius Distance = 0


In [None]:
x_sample = np.array([1.5, 2.5, 3.5])
mapped_point = T @ x_sample
print(f'{x_sample} maps to {mapped_point}')

In [None]:
#WITHOUT CVXPY
U_L = U_ll_hat
U_H = U_hl_hat

num_samples, n = U_L.shape
num_samples, m = U_H.shape

# Parameters
epsilon = 1.0
delta = 1.0
alpha = 0.01  # Learning rate for ascent steps in Theta and Phi
learning_rate_T = 0.001  # Learning rate for descent step in T

# Initialize primal and dual variables
T = np.random.rand(m, n)
Theta = np.random.rand(num_samples, n)
Phi = np.random.rand(num_samples, m)

# Define a function to project onto Frobenius ball
def project_onto_frobenius_ball(matrix, radius):
    norm = np.linalg.norm(matrix, 'fro')
    if norm > radius:
        return matrix * (radius / norm)
    return matrix

# Define a function to project onto the non-negative orthant
def project_onto_non_negative(matrix):
    return np.maximum(matrix, 0)

# Gradient descent step for T
def descent_step_T(U_L, U_H, T, Theta, Phi, learning_rate_T):
    gradient = np.zeros_like(T)
    for iota in Ill:
        Li = LLmodels[iota].compute_mechanism() 
        Hi = HLmodels[omega[iota]].compute_mechanism()
        A = T @ Li @ (U_L.T + Theta.T) - Hi @ (U_H.T + Phi.T)
        
        # Compute gradient with respect to T
        gradient += A @ (Li @ (U_L.T + Theta.T)).T
        
    gradient /= num_samples
    T = T - learning_rate_T * gradient  # Gradient descent step
    return project_onto_non_negative(T)  # Ensure non-negativity

# Gradient ascent step for Theta
def ascent_step_Theta(U_L, U_H, T, Phi, Theta, epsilon, alpha):
    gradient = np.zeros_like(Theta)
    for iota in Ill:
        Li = LLmodels[iota].compute_mechanism() 
        Hi = HLmodels[omega[iota]].compute_mechanism()
        A = T @ Li @ (U_L.T + Theta.T) - Hi @ (U_H.T + Phi.T)

        gradient += ((T @ Li).T @ A).T  # Compute gradient wrt Theta

    gradient /= num_samples
    Theta += alpha * gradient  # Ascent step
    return project_onto_frobenius_ball(Theta, np.sqrt(num_samples * epsilon**2))

# Gradient ascent step for Phi
def ascent_step_Phi(U_L, U_H, T, Theta, Phi, delta, alpha):
    gradient = np.zeros_like(Phi)
    for iota in Ill:
        Li = LLmodels[iota].compute_mechanism() 
        Hi = HLmodels[omega[iota]].compute_mechanism()
        A = T @ Li @ (U_L.T + Theta.T) - Hi @ (U_H.T + Phi.T)

        gradient += (Hi @ A).T  # Compute gradient wrt Phi

    gradient /= num_samples
    Phi += alpha * gradient  # Ascent step
    return project_onto_frobenius_ball(Phi, np.sqrt(num_samples * delta**2))

# Main optimization loop
max_iters = 100
tol = 1e-4

for iteration in range(max_iters):
    T_prev, Theta_prev, Phi_prev = T.copy(), Theta.copy(), Phi.copy()

    # Gradient descent step for T
    T = descent_step_T(U_L, U_H, T, Theta, Phi, learning_rate_T)

    # Gradient ascent steps for Theta and Phi
    Theta = ascent_step_Theta(U_L, U_H, T, Phi, Theta, epsilon, alpha)
    Phi = ascent_step_Phi(U_L, U_H, T, Theta, Phi, delta, alpha)

    # Check for convergence
    if (np.linalg.norm(T - T_prev, 'fro') < tol and
        np.linalg.norm(Theta - Theta_prev, 'fro') < tol and
        np.linalg.norm(Phi - Phi_prev, 'fro') < tol):
        print(f"Converged in {iteration + 1} iterations.")
        break

# Final optimized values of T, Theta, and Phi
print("Optimized T:", T)
print("Optimized Theta:", Theta)
print("Optimized Phi:", Phi)
