In [19]:
# user defined R installation
import os
os.environ['R_HOME'] = 'D:/Program Files/R-4.5.0' #path to your R installation
%load_ext autoreload
%autoreload 2

import numpy as np
import pandas as pd
import rpy2.robjects as robjects
from rpy2.robjects import globalenv
from rpy2.robjects.packages import importr
from rpy2.robjects.vectors import FloatVector

from causaloptim_python import run_experiment, extract_prob_dict



# Load R packages
#igraph = importr('igraph')
causaloptim = importr('causaloptim')
base = importr('base')

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


#### Data Generation

True ATE: 0.19831845457157393


#### causaloptim

In [None]:
# Set seed and generate synthetic data in Python
np.random.seed(299128)
n = 5000
Z = np.random.binomial(1, 0.5, size=n)
U = np.random.binomial(1, 0.5, size=n)

# X ~ Z + U
logit_X = -1 + 1.5 * Z + 1.2 * U
p_X = 1 / (1 + np.exp(-logit_X))
X = np.random.binomial(1, p_X)

# Y ~ X + U
beta_X, beta_U = 1.0, 1.5
logit_Y = -0.5 + beta_X * X + beta_U * U
p_Y = 1 / (1 + np.exp(-logit_Y))
Y = np.random.binomial(1, p_Y)

df = pd.DataFrame({'Y': Y, 'X': X, 'Z': Z})

# Ground truth ATE
logit_Y1 = -0.5 + beta_X * 1 + beta_U * U
logit_Y0 = -0.5 + beta_X * 0 + beta_U * U
p_Y1 = 1 / (1 + np.exp(-logit_Y1))
p_Y0 = 1 / (1 + np.exp(-logit_Y0))
ATE_true = np.mean(p_Y1 - p_Y0)
print("True ATE:", ATE_true)


graph_str = "(Z -+ X, X -+ Y, Ur -+ X, Ur -+ Y)"
leftside = [1, 0, 0, 0]
latent   = [0, 0, 0, 1]
nvals    = [2, 2, 2, 2]
rlconnect = [0, 0, 0, 0]
monotone = [0, 0, 0, 0]

prob_dict = extract_prob_dict(df)


bounds = run_experiment(graph_str, leftside, latent, nvals, rlconnect, monotone, prob_dict)
print("Bounds:", bounds)

True ATE: 0.19725243674534862
Bounds: (<rpy2.robjects.vectors.FloatVector object at 0x000001762E758510> [14]
R classes: ('numeric',)
[-0.117957], <rpy2.robjects.vectors.FloatVector object at 0x000001762E727B10> [14]
R classes: ('numeric',)
[0.545062])


#### Varying Synthetic Data

In [None]:
def simulate_with_target_ate(
    n=5000,
    min_ate=0.05,
    max_ate=0.95,
    max_attempts=1000,
    coef_ranges=None,
    intercept_ranges=None,
    noise_std=0.5
):
    """
    Simulates data for binary treatment effect analysis.
    Fixes clustering by:
    - Using continuous U ~ N(0,1)
    - Adding Gaussian noise to logits

    Returns dict with true ATE and all coefficients.
    """
    if coef_ranges is None:
        coef_ranges = {
            'b_Z': (0.0, 3.0),
            'b_U_X': (0.0, 3.0),
            'b_X_Y': (0.5, 6.0),
            'b_U_Y': (0.0, 3.0)
        }
    if intercept_ranges is None:
        intercept_ranges = {
            'intercept_X': (-2.0, 2.0),
            'intercept_Y': (-4.0, 2.0)
        }

    for attempt in range(max_attempts):
        seed = np.random.randint(0, 1e6)
        np.random.seed(seed)

        # Coefficients
        b_Z = np.random.uniform(*coef_ranges['b_Z'])
        b_U_X = np.random.uniform(*coef_ranges['b_U_X'])
        b_X_Y = np.random.uniform(*coef_ranges['b_X_Y'])
        b_U_Y = np.random.uniform(*coef_ranges['b_U_Y'])

        intercept_X = np.random.uniform(*intercept_ranges['intercept_X'])
        intercept_Y = np.random.uniform(*intercept_ranges['intercept_Y'])

        # Generate continuous latent confounder
        Z = np.random.binomial(1, 0.5, size=n)
        U = np.random.normal(0, 1, size=n)

        # Generate treatment X
        logit_X = intercept_X + b_Z * Z + b_U_X * U
        p_X = 1 / (1 + np.exp(-logit_X))
        X = np.random.binomial(1, p_X)

        # Generate outcome Y with noise
        logit_Y = intercept_Y + b_X_Y * X + b_U_Y * U + np.random.normal(0, noise_std, size=n)
        p_Y = 1 / (1 + np.exp(-logit_Y))
        Y = np.random.binomial(1, p_Y)

        # Counterfactual outcomes with same noise structure
        logit_Y1 = intercept_Y + b_X_Y * 1 + b_U_Y * U + np.random.normal(0, noise_std, size=n)
        logit_Y0 = intercept_Y + b_X_Y * 0 + b_U_Y * U + np.random.normal(0, noise_std, size=n)
        p_Y1 = 1 / (1 + np.exp(-logit_Y1))
        p_Y0 = 1 / (1 + np.exp(-logit_Y0))
        ATE_true = np.mean(p_Y1 - p_Y0)

        if min_ate <= ATE_true <= max_ate:
            return {
                'seed': seed,
                'b_Z': b_Z,
                'b_U_X': b_U_X,
                'b_X_Y': b_X_Y,
                'b_U_Y': b_U_Y,
                'intercept_X': intercept_X,
                'intercept_Y': intercept_Y,
                'ATE_true': ATE_true,
                'Z': Z,
                'U': U,
                'X': X,
                'Y': Y
            }

    raise ValueError(f"Failed to generate ATE in range [{min_ate}, {max_ate}] after {max_attempts} attempts")

def simulate_uniform_ate_distribution(
    n=5000,
    n_bins=10,
    samples_per_bin=50,
    bin_range=(0.05, 0.95),
    simulate_kwargs=None,
    max_attempts=30000,
    verbose=True
):
    if simulate_kwargs is None:
        simulate_kwargs = {}

    bins = np.linspace(*bin_range, n_bins + 1)
    bin_counts = np.zeros(n_bins, dtype=int)
    data = []

    attempts = 0
    total_required = n_bins * samples_per_bin

    while sum(bin_counts) < total_required and attempts < max_attempts:
        sim = simulate_with_target_ate(n=n, min_ate=bin_range[0], max_ate=bin_range[1], **simulate_kwargs)
        ate = sim["ATE_true"]
        bin_idx = np.digitize(ate, bins) - 1

        if 0 <= bin_idx < n_bins and bin_counts[bin_idx] < samples_per_bin:
            bin_counts[bin_idx] += 1
            data.append(sim)

        attempts += 1

    if attempts >= max_attempts and sum(bin_counts) < total_required:
        print(f"⚠️ Warning: Only filled {sum(bin_counts)} / {total_required} samples after {attempts} attempts.")
        if verbose:
            for i in range(n_bins):
                print(f"  Bin {i} ({bins[i]:.2f}-{bins[i+1]:.2f}): {bin_counts[i]} samples")

    return data




sim_data = simulate_uniform_ate_distribution(
    n=5000,
    n_bins=5,
    samples_per_bin=25,
    simulate_kwargs={'noise_std': 1.0},  # more variation!
    verbose=True
)

df_uniform = pd.DataFrame(sim_data)
df_uniform.ATE_true.hist(bins=20)


KeyboardInterrupt: 

In [149]:
results = []

n = 50000
graph_str = "(Z -+ X, X -+ Y, Ur -+ X, Ur -+ Y)"
leftside = [1, 0, 0, 0]
latent   = [0, 0, 0, 1]
nvals    = [2, 2, 2, 2]
rlconnect = [0, 0, 0, 0]
monotone = [0, 0, 0, 0]

for i in range(500):
    # Simulate data with diverse ATEs
    sim = simulate_with_target_ate(n=n, min_ate=0.05, max_ate=0.95)

    # Create dataframe and get probabilities
    df = pd.DataFrame({'Y': sim['Y'], 'X': sim['X'], 'Z': sim['Z']})
    prob_dict = extract_prob_dict(df)

    # Run causaloptim
    r_bounds = run_experiment(
        graph_str, leftside, latent, nvals, rlconnect, monotone, prob_dict
    )
    bound_lower = float(r_bounds[0][0])
    bound_upper = float(r_bounds[1][0])

    # Save result
    results.append({
        'seed': sim['seed'],
        'b_Z': sim['b_Z'],
        'b_U_X': sim['b_U_X'],
        'b_X_Y': sim['b_X_Y'],
        'b_U_Y': sim['b_U_Y'],
        'ATE_true': sim['ATE_true'],
        'bound_lower': bound_lower,
        'bound_upper': bound_upper
    })
df_results = pd.DataFrame(results)


In [159]:
df_results.ATE_true.describe()

count    500.000000
mean       0.388889
std        0.123048
min        0.102664
25%        0.320806
50%        0.408023
75%        0.473789
max        0.602265
Name: ATE_true, dtype: float64

#### Iteration testing area

In [123]:
def simulate_with_target_ate(n=5000, min_ate=0.05, max_ate=0.95, max_attempts=1000):
    for attempt in range(max_attempts):
        seed = np.random.randint(0, 1e6)
        np.random.seed(seed)

        # Random coefficients
        b_Z = np.round(np.random.uniform(0.5, 2.0), 3)
        b_U_X = np.round(np.random.uniform(0.5, 2.0), 3)
        b_X_Y = np.round(np.random.uniform(0.5, 5.0), 3)   # wider range to stretch ATE
        b_U_Y = np.round(np.random.uniform(0.0, 2.0), 3)

        # Generate data
        Z = np.random.binomial(1, 0.5, size=n)
        U = np.random.binomial(1, 0.5, size=n)

        logit_X = -1 + b_Z * Z + b_U_X * U
        p_X = 1 / (1 + np.exp(-logit_X))
        X = np.random.binomial(1, p_X)

        logit_Y = -0.5 + b_X_Y * X + b_U_Y * U
        p_Y = 1 / (1 + np.exp(-logit_Y))
        Y = np.random.binomial(1, p_Y)

        # Compute true ATE
        logit_Y1 = -0.5 + b_X_Y * 1 + b_U_Y * U
        logit_Y0 = -0.5 + b_X_Y * 0 + b_U_Y * U
        p_Y1 = 1 / (1 + np.exp(-logit_Y1))
        p_Y0 = 1 / (1 + np.exp(-logit_Y0))
        ATE_true = np.mean(p_Y1 - p_Y0)

        if min_ate <= ATE_true <= max_ate:
            return {
                'seed': seed,
                'b_Z': b_Z,
                'b_U_X': b_U_X,
                'b_X_Y': b_X_Y,
                'b_U_Y': b_U_Y,
                'ATE_true': ATE_true,
                'Z': Z,
                'U': U,
                'X': X,
                'Y': Y
            }

    raise ValueError(f"Failed to generate ATE in range [{min_ate}, {max_ate}] after {max_attempts} attempts")


sim = simulate_with_target_ate(n=5000, min_ate=0.05, max_ate=0.95)

df = pd.DataFrame({'Y': sim['Y'], 'X': sim['X'], 'Z': sim['Z']})
prob_dict = extract_prob_dict(df)
r_bounds = run_experiment(
    graph_str, leftside, latent, nvals, rlconnect, monotone, prob_dict
)
bound_lower = float(r_bounds[0][0])
bound_upper = float(r_bounds[1][0])

print(f"Seed: {sim['seed']}")
print(f"Coefficients: b_Z={sim['b_Z']}, b_U_X={sim['b_U_X']}, b_X_Y={sim['b_X_Y']}, b_U_Y={sim['b_U_Y']}")
print(f"True ATE: {sim['ATE_true']}")
print(f"Bounds: {bound_lower:.3f} — {bound_upper:.3f}")

Seed: 437838
Coefficients: b_Z=0.677, b_U_X=1.496, b_X_Y=2.038, b_U_Y=0.409
True ATE: 0.421055270147257
Bounds: -0.172 — 0.673


In [110]:
seed = np.random.randint(0, 1e6)
np.random.seed(seed)

# Random coefficients
b_Z = np.round(np.random.uniform(0.5, 2.0), 3)
b_U_X = np.round(np.random.uniform(0.5, 2.0), 3)
b_X_Y = np.round(np.random.uniform(0.5, 2.0), 3)
b_U_Y = np.round(np.random.uniform(0.5, 2.0), 3)

# Simulate data
Z = np.random.binomial(1, 0.5, size=n)
U = np.random.binomial(1, 0.5, size=n)

logit_X = -1 + b_Z * Z + b_U_X * U
p_X = 1 / (1 + np.exp(-logit_X))
X = np.random.binomial(1, p_X)

logit_Y = -0.5 + b_X_Y * X + b_U_Y * U
p_Y = 1 / (1 + np.exp(-logit_Y))
Y = np.random.binomial(1, p_Y)

# True ATE
logit_Y1 = -0.5 + b_X_Y * 1 + b_U_Y * U
logit_Y0 = -0.5 + b_X_Y * 0 + b_U_Y * U
p_Y1 = 1 / (1 + np.exp(-logit_Y1))
p_Y0 = 1 / (1 + np.exp(-logit_Y0))
ATE_true = np.mean(p_Y1 - p_Y0)

# Estimate bounds
df = pd.DataFrame({'Y': Y, 'X': X, 'Z': Z})
prob_dict = extract_prob_dict(df)
r_bounds = run_experiment(
graph_str, leftside, latent, nvals, rlconnect, monotone, prob_dict
)
bound_lower = float(r_bounds[0][0])
bound_upper = float(r_bounds[1][0])

print(f"Seed: {seed}")
print(f"Coefficients:")
print(f"  b_Z: {b_Z}")
print(f"  b_U_X: {b_U_X}")
print(f"  b_X_Y: {b_X_Y}")
print(f"  b_U_Y: {b_U_Y}")
print(f"True ATE: {ATE_true}")
print(f"Probabilities =1:")
print(f" p(Y=1): {p_Y}")
print(f" p(X=1): {p_X}")

print(f"Bounds:")
print(f"  Lower: {bound_lower}")
print(f"  Upper: {bound_upper}")

Seed: 289586
Coefficients:
  b_Z: 0.908
  b_U_X: 1.942
  b_X_Y: 1.967
  b_U_Y: 0.871
True ATE: 0.37794317326984606
Probabilities =1:
 p(Y=1): [0.91197567 0.91197567 0.37754067 ... 0.91197567 0.81260097 0.37754067]
 p(X=1): [0.8641271  0.8641271  0.47701621 ... 0.8641271  0.47701621 0.26894142]
Bounds:
  Lower: -0.1325433177613982
  Upper: 0.7056929857810956
