In [8]:
# user defined R installation
import os
os.environ['R_HOME'] = 'D:/Program Files/R-4.5.0' #path to your R installation
%load_ext autoreload
%autoreload 2



import numpy as np
import pandas as pd
import rpy2.robjects as robjects
from rpy2.robjects import globalenv
from rpy2.robjects.packages import importr
from rpy2.robjects.vectors import FloatVector

import sys
sys.path.append('../')
from causaloptim_py.causaloptim_python import run_experiment, extract_prob_dict
from binaryIV import simulate_deterministic_data_with_probabilistic_ate

from entropy_bounds.utils import entropy_dist

import matplotlib.pyplot as plt
import seaborn as sns
import math




# Load R packages
#igraph = importr('igraph')
causaloptim = importr('causaloptim')
base = importr('base')

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [None]:
x = np.random.normal(0, 1, 1)[0]
print(type(x))

In [None]:
# Required imports
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

def pick_from_bimodal(n=1, mu1=1, sigma1=0.5, mu2=-1, sigma2=0.5):
    """
    Generate samples from a bimodal distribution.

    Args:
        n (int): Number of samples to pick. Default is 1.
        mu1 (float): Mean of the first normal distribution. Default is 1.
        sigma1 (float): Standard deviation of the first normal distribution. Default is 0.5.
        mu2 (float): Mean of the second normal distribution. Default is -1.
        sigma2 (float): Standard deviation of the second normal distribution. Default is 0.5.

    Returns:
        float or np.ndarray: A single sample if n=1, otherwise an array of n samples.
    """
    N=20000
    X1 = np.random.normal(mu1, sigma1, N)
    X2 = np.random.normal(mu2, sigma2, N)
    X = np.concatenate([X1, X2])
    # sns.histplot(X, bins=30, kde=True)
    # Pick n random samples from the bimodal distribution
    samples = np.random.choice(X, n, replace=False)
    # if n is one sample, return the sample as a single value
    if n == 1:
        return samples[0]
    return samples

    # # Example usage
    # samples = pick_from_bimodal(n=20000, mu1=1, sigma1=0.5, mu2=-1, sigma2=0.5)
    # print(samples)
    # sns.histplot(samples, bins=30, kde=True)
    # plt.show() 

def simulate_deterministic_data_with_probabilistic_ate(
    n=500,
    seed=None,
    b_U_X = np.random.normal(0, 1, 1)[0],
    b_U_Y = np.random.normal(0, 1, 1)[0],
    b_Z = pick_from_bimodal(),
    b_X_Y = pick_from_bimodal(),
    intercept_X = 0,
    intercept_Y = 0
):
    """
    Simulate deterministic (binary) data for causal analysis, 
    while computing the Average Treatment Effect (ATE) from smooth logistic potential outcome probabilities.

    Args:
        n (int): Number of samples to generate. Default is 500.
        seed (int, optional): Random seed for reproducibility. Default is None.
        b_U_X (float): Coefficient for the effect of unobserved confounder U on X. Default is drawn from N(0, 1).
        b_U_Y (float): Coefficient for the effect of unobserved confounder U on Y. Default is drawn from N(0, 1).
        b_Z (float): Coefficient for the effect of instrument Z on X. Default is drawn from a bimodal distribution.
        b_X_Y (float): Coefficient for the effect of treatment X on Y. Default is drawn from a bimodal distribution.
        intercept_X (float): Intercept for the logistic model of X. Default is 0.
        intercept_Y (float): Intercept for the logistic model of Y. Default is 0.

    Returns:
        dict: A dictionary containing:
            - seed (int): The random seed used.
            - intercept_X (float): Intercept for X.
            - intercept_Y (float): Intercept for Y.
            - b_Z (float): Coefficient for Z.
            - b_U_X (float): Coefficient for U on X.
            - b_X_Y (float): Coefficient for X on Y.
            - b_U_Y (float): Coefficient for U on Y.
            - ATE_true (float): True Average Treatment Effect.
            - p_Y1 (np.ndarray): Probabilities of Y=1 under treatment.
            - p_Y0 (np.ndarray): Probabilities of Y=1 under control.
            - Z (np.ndarray): Instrument variable.
            - U (np.ndarray): Unobserved confounder.
            - X (np.ndarray): Treatment assignment.
            - Y (np.ndarray): Outcome variable.
    """
    if seed is None:
        seed = np.random.randint(0, 1e6)
    np.random.seed(seed)

    b_U_X = np.random.normal(0, 1, 1)[0]
    b_U_Y = np.random.normal(0, 1, 1)[0]
    b_Z = pick_from_bimodal()
    b_X_Y = pick_from_bimodal()
    intercept_X = 0
    intercept_Y = 0

    # print(f"Seed: {seed}, b_U_X: {b_U_X}, b_U_Y: {b_U_Y}, b_Z: {b_Z}, b_X_Y: {b_X_Y}")




    # Binary variables
    Z = np.random.binomial(1, 0.5, size=n)
    U = np.random.binomial(1, 0.5, size=n)

    # Treatment assignment
    logit_X = intercept_X + b_Z * Z + b_U_X * U
    p_X = 1 / (1 + np.exp(-logit_X))
    X = np.random.binomial(1, p_X)

    # print("b_Z:", b_Z)
    # print("b_U_X:", b_U_X)
    # print("Mean of p_X:", np.mean(p_X))
    # print("Mean of X:", np.mean(X))

    # Deterministic outcome
    logit_Y = intercept_Y + b_X_Y * X + b_U_Y * U
    p_Y = 1 / (1 + np.exp(-logit_Y))
    Y = np.random.binomial(1, p_Y)

    # Probabilistic potential outcomes
    logit_Y1 = intercept_Y + b_X_Y * 1 + b_U_Y * U
    logit_Y0 = intercept_Y + b_X_Y * 0 + b_U_Y * U
    p_Y1 = 1 / (1 + np.exp(-logit_Y1))
    p_Y0 = 1 / (1 + np.exp(-logit_Y0))
    ATE_true = np.mean(p_Y1 - p_Y0)

    return {
        'seed': seed,
        'intercept_X': intercept_X,
        'intercept_Y': intercept_Y,
        'b_Z': b_Z,
        'b_U_X': b_U_X,
        'b_X_Y': b_X_Y,
        'b_U_Y': b_U_Y,
        'ATE_true': ATE_true,
        'p_Y1': p_Y1,
        'p_Y0': p_Y0,
        'Z': Z,
        'U': U,
        'X': X,
        'Y': Y
    }


results = []

for b_X_Y in np.arange(-7, 7, 0.1):  # Use np.arange for 0.5 steps
    sim = simulate_deterministic_data_with_probabilistic_ate(
        b_X_Y= b_X_Y
    )

    df = pd.DataFrame({'Y': sim['Y'], 'X': sim['X'], 'Z': sim['Z']})
    prob_dict = extract_prob_dict(df)

    graph_str = "(Z -+ X, X -+ Y, Ur -+ X, Ur -+ Y)"
    leftside = [1, 0, 0, 0]
    latent = [0, 0, 0, 1]
    nvals = [2, 2, 2, 2]
    rlconnect = [0, 0, 0, 0]
    monotone = [0, 0, 0, 0]

    bounds = run_experiment(graph_str, leftside, latent, nvals, rlconnect, monotone, prob_dict)
    bound_lower = float(bounds[0][0])
    bound_upper = float(bounds[1][0])
    bounds_valid = bound_lower <= sim['ATE_true'] <= bound_upper



    results.append({
        'b_X_Y': b_X_Y,
        'b_Z': sim['b_Z'],
        'b_U_X': sim['b_U_X'],
        'b_U_Y': sim['b_U_Y'],
        'entropy_Y': entropy_dist(sim['Y']),
        'entropy_Z': entropy_dist(sim['Z']),
        'entropy_U': entropy_dist(sim['U']),
        'corr_X_Y': np.corrcoef(sim['X'], sim['Y'])[0, 1],
        'corr_X_Z': np.corrcoef(sim['X'], sim['Z'])[0, 1],
        'corr_Y_Z': np.corrcoef(sim['Y'], sim['Z'])[0, 1],
        'ATE_true': sim['ATE_true'],
        'bound_lower': bound_lower,
        'bound_upper': bound_upper,
        'bound_width': bound_upper - bound_lower,
        'bounds_valid': bounds_valid
    })

df_results = pd.DataFrame(results)

In [30]:
df_results


Unnamed: 0,b_X_Y,b_Z,b_U_X,b_U_Y,entropy_Y,entropy_Z,entropy_U,ATE_true,bound_lower,bound_upper,bound width,bounds_valid
0,-7.0,1.487896,-1.850996,-0.078011,-0.0,-0.0,-0.0,-0.281539,-0.520000,0.180000,0.700000,True
1,-6.9,-1.295102,0.581143,1.299944,-0.0,-0.0,-0.0,0.145468,-0.182372,0.370513,0.552885,True
2,-6.8,-0.731034,-0.349651,-0.774432,-0.0,-0.0,-0.0,-0.112717,-0.436859,0.372756,0.809615,True
3,-6.7,-0.145602,-0.283114,-0.175917,-0.0,-0.0,-0.0,-0.251478,-0.601732,0.327765,0.929497,True
4,-6.6,-1.852444,-1.550785,2.066214,-0.0,-0.0,-0.0,0.155826,-0.358303,0.267547,0.625850,True
...,...,...,...,...,...,...,...,...,...,...,...,...
135,6.5,-1.124842,-0.905324,-0.386564,-0.0,-0.0,-0.0,0.201990,-0.289071,0.498039,0.787110,True
136,6.6,-0.578951,1.670480,0.467024,-0.0,-0.0,-0.0,0.006099,-0.373109,0.518198,0.891307,True
137,6.7,0.885803,-2.894412,0.875244,-0.0,-0.0,-0.0,0.201051,-0.438920,0.373877,0.812797,True
138,6.8,1.472115,0.707489,0.416913,-0.0,-0.0,-0.0,-0.064594,-0.399680,0.367987,0.767667,True


In [None]:
## Plot df_results as line chart, x axis should be b_X_Y, y axis should be ATE_true, bound_lower and bound_upper.
plt.figure(figsize=(10, 6))
sns.lineplot(data=df_results, x='b_X_Y', y='ATE_true', label='True ATE', color='blue')
sns.lineplot(data=df_results, x='b_X_Y', y='bound_lower', label='Lower Bound', color='orange')
sns.lineplot(data=df_results, x='b_X_Y', y='bound_upper', label='Upper Bound', color='green')
plt.axhline(0, color='red', linestyle='--', label='Zero Line')
plt.title('Causaloptim Bounds vs True ATE')
plt.xlabel('b_X_Y Coefficient')
plt.ylabel('ATE Value')
plt.legend()
plt.grid(True)
plt.show()


In [None]:
# Perform OLS regression using statsmodels
import statsmodels.api as sm
valid_df = df_results[df_results['bounds_valid'] == True]
# Add a constant for the intercept
X = sm.add_constant(valid_df[['b_X_Y', 'b_Z', 'b_U_X', 'b_U_Y', 'corr_X_Y', 'corr_X_Z', 'corr_Y_Z']])
y = valid_df['bound_width']

# Fit the OLS model
ols_model = sm.OLS(y, X).fit()

# Print the summary
print(ols_model.summary())

                            OLS Regression Results                            
Dep. Variable:            bound_width   R-squared:                       0.004
Model:                            OLS   Adj. R-squared:                 -0.010
Method:                 Least Squares   F-statistic:                    0.2848
Date:                Tue, 22 Apr 2025   Prob (F-statistic):              0.888
Time:                        12:07:01   Log-Likelihood:                 195.65
No. Observations:                 279   AIC:                            -381.3
Df Residuals:                     274   BIC:                            -363.1
Df Model:                           4                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const          0.7645      0.007    105.302      0.0