# Propensity score matching study

A sample calculation illustrating propensity score matching on simulated data.

In [17]:
import numpy as np
import statsmodels.api as sm

from scipy.stats import norm, bernoulli
from sklearn.neighbors import NearestNeighbors

In [37]:
# simulate a confounded observational data set
# (no random_state is passed to the draws, so the numbers change on every run)
s = 10000

# two independent binary covariates, each equal to 1 with probability 0.1
x1 = bernoulli.rvs(p=.1, size=s)
x2 = bernoulli.rvs(p=.1, size=s)

# treatment depends on x1 only:
# units with x1 == 1 are treated with probability p1, all others with probability p2
p1 = .9
p2 = .1
d = np.where(x1 == 1, bernoulli.rvs(p=p1, size=s), bernoulli.rvs(p=p2, size=s))

# post-treatment dummy: driven by d, but it does not enter the outcome
x3 = np.where(d == 1, bernoulli.rvs(p=.8, size=s), bernoulli.rvs(p=.2, size=s))

# outcome depends on x1, x2 and the treatment
# the true treatment effect (ATE) is 2
y = x1 + x2 + 2 * d

# naive estimate: mean outcome of the treated minus mean outcome of the controls
# (biased, because x1 drives both treatment and outcome)
y[d == 1].mean() - y[d == 0].mean()

np.float64(2.4990729265738634)

In [38]:
# chatgpt for the win
def greedy_ps_matching(d, y, x1, x2, with_replacement=False, caliper=None, seed=None):
    """Greedy 1:1 nearest-neighbour matching on an estimated propensity score.

    The propensity score is the fitted probability from a logistic regression
    of ``d`` on an intercept, ``x1`` and ``x2``. Treated units (``d == 1``) are
    processed in index order; each one is paired with the still-available
    control unit (``d == 0``) whose propensity score is closest. Exact ties in
    distance are broken at random.

    Parameters
    ----------
    d : array-like
        Treatment indicator; units with ``d == 1`` are treated, ``d == 0`` controls.
    y : array-like
        Outcome, same length as ``d``.
    x1, x2 : array-like
        Covariates used in the propensity score model.
    with_replacement : bool, default False
        If False, a control is removed from the pool once it has been matched.
    caliper : float or None, default None
        Maximum allowed absolute propensity score difference. Treated units
        without any control inside the caliper stay unmatched.
    seed : int or None, default None
        Seed for the random generator used for tie-breaking.

    Returns
    -------
    matches : list of (treated_index, control_index) tuples
    ATT : float
        Mean of ``y[treated] - y[control]`` over the matched pairs; NaN when
        no pair could be formed.
    ps : numpy.ndarray
        Estimated propensity score for every unit.
    """
    rng = np.random.default_rng(seed)
    d = np.asarray(d)
    y = np.asarray(y)

    # logistic regression of the treatment on the covariates (plus intercept)
    design = sm.add_constant(np.column_stack([x1, x2]))
    ps = np.asarray(sm.Logit(d, design).fit(disp=False).predict(design))

    treated = np.flatnonzero(d == 1)
    controls = np.flatnonzero(d == 0)
    control_ps = ps[controls]
    # available[i] tells whether controls[i] may still be used as a match
    available = np.ones(controls.size, dtype=bool)

    matches = []
    for t in treated:
        pool = np.flatnonzero(available)
        if pool.size == 0:
            # control pool exhausted (only possible without replacement):
            # the remaining treated units cannot be matched
            break
        diffs = np.abs(control_ps[pool] - ps[t])

        # apply caliper if needed
        if caliper is not None:
            within = diffs <= caliper
            if not within.any():
                continue
            pool = pool[within]
            diffs = diffs[within]

        # all controls at the minimal distance; pick one of them at random
        candidates = pool[diffs == diffs.min()]
        pick = rng.choice(candidates)
        matches.append((t, controls[pick]))

        if not with_replacement:
            available[pick] = False

    # compute ATT
    if not matches:
        # indexing y with an empty (float) index array would raise, so report NaN
        return matches, np.nan, ps

    t_idx = np.array([t for t, _ in matches])
    c_idx = np.array([c for _, c in matches])
    ATT = np.mean(y[t_idx] - y[c_idx])

    return matches, ATT, ps


In [39]:
# greedy 1:1 matching without replacement on the simulated data
matches, att, ps = greedy_ps_matching(d=d, y=y, x1=x1, x2=x2, with_replacement=False)

# report the estimate together with how many treated units found a match
print(f"""
ATT: {att}

treated units: {d.sum()}
matched treated units: {len(matches)}
""")


ATT: 2.1241830065359477

treated units: 1836
matched treated units: 1836



In [40]:
"""
Note that the result with the stratified approach is closer to the truth (compare "propensity_score_stratification_study.ipynb")
which makes sense, since via the matching we drop a lot of control units since only about 18% are in treatment.
"""

'\nNote that the result with the stratified approach is closer to the truth (compare "propensity_score_stratification_study.ipynb")\nwhich makes sense, since via the matching we drop a lot of control units since only about 18% are in treatment.\n'