# Front-door causal bootstrapping (Algorithm 2)

This notebook implements **Algorithm 2** from *Causal bootstrapping* (Little & Badawy, 2020).  
The front-door setting uses a mediator **Z** on the path from **Y** to **X** and allows unmeasured confounding between **Y** and **X**.

Weights (Algorithm 2 / Eq. 14):
\[
w_i = \frac{\hat p(z_i\mid y^*)}{N\,\hat p(z_i\mid y_i)}
\]
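Here \(y^*\) is the intervention value, \(y_i\) and \(z_i\) are the observed values for row \(i\), and \(N\) is the number of rows. Indices \(i\) are then resampled with replacement, with probability proportional to \(w_i\), to produce a bootstrap sample approximating \(p(x\mid do(y = y^*))\).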

Key reference: Algorithm 2 and Eq. (14) in Little & Badawy (2020).


In [None]:
import numpy as np
import pandas as pd
from scipy.spatial.distance import cdist

def _ensure_2d(arr):
    """Convert ``arr`` to an ndarray, turning a 1-D input into a single column.

    A 1-D input of length n comes back with shape (n, 1). Inputs of any other
    dimensionality keep the shape ``np.asarray`` gives them.
    """
    as_array = np.asarray(arr)
    return as_array[:, np.newaxis] if as_array.ndim == 1 else as_array

def gaussian_kernel_matrix(A, B=None, bandwidth=1.0):
    """
    Pairwise Gaussian/RBF kernel between the rows of A and the rows of B.

    Entry (i, j) is exp(-0.5 * ||a_i - b_j||^2 / h^2) with h = bandwidth.
    A: (n,d) or (n,), B: (m,d) or (m,); 1-D inputs are treated as one column.
    When B is None the kernel is evaluated between A and itself.
    Returns: (n,m) array.
    """
    left = _ensure_2d(A)
    right = left if B is None else _ensure_2d(B)
    # Scale the Euclidean distances by the bandwidth before squaring.
    scaled = cdist(left, right, metric="euclidean") / float(bandwidth)
    return np.exp(-0.5 * scaled ** 2)

def kernel_y_vector(y_data, y_star, *, discrete=True, bandwidth_y=1.0):
    """
    Evaluate the outcome kernel K[y_i - y*] at every observation.

    - discrete=True  -> indicator of y_i == y* (Kronecker delta), as floats
    - discrete=False -> Gaussian kernel exp(-0.5 * ((y_i - y*) / bandwidth_y)^2)
    Returns an array with one entry per element of y_data.
    """
    observed = np.asarray(y_data)
    if not discrete:
        standardized = (observed - y_star) / float(bandwidth_y)
        return np.exp(-0.5 * standardized ** 2)
    return (observed == y_star).astype(float)

def phat_y_given_S(y_data, S_data, y_star, *, discrete_y=True, bandwidth_S=1.0, bandwidth_y=1.0, eps=1e-12):
    """
    Kernel-regression estimate of p_hat(y* | S_i) evaluated at every observed S_i:
      p_hat(y*|S_i) = sum_j K_S(S_i, S_j) * K_Y(y_j, y*) / sum_j K_S(S_i, S_j)

    K_S is a Gaussian kernel with bandwidth ``bandwidth_S``; K_Y is the
    Kronecker delta (discrete_y=True) or a Gaussian kernel with bandwidth
    ``bandwidth_y``. The denominator is floored at ``eps`` before dividing.
    Returns: (N,) vector over i.
    """
    s_weights = gaussian_kernel_matrix(S_data, bandwidth=bandwidth_S)  # (N,N)
    y_weights = kernel_y_vector(y_data, y_star, discrete=discrete_y, bandwidth_y=bandwidth_y)  # (N,)
    # Floor the row totals so the division below never hits zero.
    row_totals = np.maximum(s_weights.sum(axis=1), eps)
    return (s_weights @ y_weights) / row_totals

def phat_z_given_y(z_data, y_data, *, bandwidth_z=1.0, eps=1e-12):
    """
    Per-class kernel density of the mediator, used by the front-door
    bootstrap (Algorithm 2): for each distinct value v of y, estimate
    p_hat(z_i | y=v) at every observed z_i.

      p_hat(z_i | y=v) ∝ (1/N_v) * sum_{j:y_j=v} K_z(z_i, z_j)

    K_z is the Gaussian kernel with bandwidth ``bandwidth_z``. Every estimate
    is floored at ``eps``.
    Returns: dict mapping y_value -> (N,) vector, entry i is p_hat(z_i | y_value)
    """
    z_arr = _ensure_2d(z_data)
    labels = np.asarray(y_data)
    classes = np.unique(labels)

    pairwise = gaussian_kernel_matrix(z_arr, bandwidth=bandwidth_z)  # (N,N)

    def _class_density(value):
        # Average the kernel columns belonging to this class.
        members = (labels == value)
        count = int(members.sum())
        if count == 0:
            # No row compares equal to this value: fall back to the floor.
            return np.full(len(labels), eps)
        mean_kernel = pairwise[:, members].sum(axis=1) / float(count)
        return np.maximum(mean_kernel, eps)

    return {value: _class_density(value) for value in classes}


## Choose columns

- `y_col`: intervention / prediction target **Y** (assumed *discrete* here, matching Algorithm 2)
- `z_cols`: mediator(s) **Z**
- `x_cols`: features **X** used to train your model (typically exclude Z so you learn the effect of Y on X, not the mediation variable itself)

Kernel settings:
- `bandwidth_z` is the Gaussian kernel bandwidth for the KDE over the mediator space; larger values give smoother density estimates.


In [None]:
# --- USER INPUTS ---
# Load the dataset to be bootstrapped.
df = pd.read_csv("heart_disease_preprocessed.csv")  # change if needed

# Intervention / target column Y.
y_col = "heartdiseasepresence"
z_cols = ["ca"]  # mediator(s) Z

# Default X: everything except Y and Z
x_cols = [c for c in df.columns if c not in ([y_col] + z_cols)]

# Kernel bandwidth for the mediator density estimate, and the resampling seed.
bandwidth_z = 1.0
random_seed = 0

# Echo the configuration; at most the first 10 feature names are printed,
# followed by "..." when there are more.
print("y_col:", y_col)
print("z_cols:", z_cols)
print("x_cols:", x_cols[:10], "..." if len(x_cols) > 10 else "")
print("N:", len(df))

In [None]:
def causal_bootstrap_frontdoor(df, *, x_cols, y_col, z_cols,
                             bandwidth_z=1.0, random_seed=0, eps=1e-12):
    """Front-door causal bootstrap (Algorithm 2), discrete Y.

    For every distinct value y* of ``df[y_col]``, rows are drawn with
    replacement with probability proportional to

        w_i = p_hat(z_i | y*) / (N * p_hat(z_i | y_i)),

    and the ``x_cols`` of the drawn rows are labelled with y*. Each y*
    contributes as many rows as it has in ``df``; the per-class blocks are
    concatenated in the sorted order of the class values.

    Parameters
    ----------
    df : DataFrame holding the columns named below.
    x_cols : list of feature columns copied into the output.
    y_col : name of the discrete intervention/target column.
    z_cols : list of mediator columns (converted to float).
    bandwidth_z : Gaussian kernel bandwidth passed to ``phat_z_given_y``.
    random_seed : seed for the resampling generator (and for the size-repair
        resample at the end, if it is needed).
    eps : floor applied to density estimates so the weights stay finite.

    Returns
    -------
    DataFrame with columns x_cols + [y_col], same number of rows as df.
    An empty ``df`` yields an empty frame with those columns.

    NOTE(review): y values that do not compare equal to themselves (e.g. NaN)
    cannot be looked up in the per-class density table and are not supported.
    """
    N = len(df)
    if N == 0:
        # Nothing to resample; pd.concat would raise on an empty block list.
        empty = df[x_cols].copy()
        empty[y_col] = df[y_col]
        return empty.reset_index(drop=True)

    y = df[y_col].to_numpy()
    z = df[z_cols].to_numpy(dtype=float)

    rng = np.random.default_rng(random_seed)
    y_vals = np.unique(y)

    # Precompute p_hat(z_i | y=v) for all v
    phat_z = phat_z_given_y(z, y, bandwidth_z=bandwidth_z, eps=eps)

    # The denominator N * p_hat(z_i | y_i) does not depend on y*, so build it
    # once instead of once per class.
    own_density = np.array([phat_z[yi][i] for i, yi in enumerate(y)], dtype=float)
    denom = float(N) * np.maximum(own_density, eps)

    out_rows = []
    for y_star in y_vals:
        n_star = int((y == y_star).sum())
        if n_star == 0:
            continue

        # weights for each i: p_hat(z_i|y_star) / (N * p_hat(z_i|y_i))
        w = phat_z[y_star] / denom

        w_sum = w.sum()
        if w_sum <= 0:
            continue
        p = w / w_sum
        # An int population is sampled as np.arange(N), i.e. row positions.
        idx = rng.choice(N, size=n_star, replace=True, p=p)

        block = df.iloc[idx][x_cols].copy()
        block[y_col] = y_star
        out_rows.append(block)

    df_star = pd.concat(out_rows, ignore_index=True)
    if len(df_star) != N:
        # A skipped class leaves a row-count mismatch; resample to N rows.
        df_star = df_star.sample(n=N, replace=True, random_state=random_seed).reset_index(drop=True)
    return df_star

In [None]:
# Run the front-door causal bootstrap with the settings chosen above.
df_frontdoor = causal_bootstrap_frontdoor(
    df,
    x_cols=x_cols,
    y_col=y_col,
    z_cols=z_cols,
    bandwidth_z=bandwidth_z,
    random_seed=random_seed,
)

# Preview the first rows and the shape of the bootstrapped frame.
df_frontdoor.head(), df_frontdoor.shape

In [None]:
# Write the bootstrapped dataset to CSV, leaving out the row index.
out_path = "heart_disease_preprocessed_frontdoor_BOOTSTRAPPED.csv"
df_frontdoor.to_csv(out_path, index=False)
out_path  # show the written file name as the cell output