In [37]:
from causalml.dataset import synthetic_data
from causalml.inference.meta import BaseSRegressor, BaseTRegressor, BaseXRegressor, BaseRRegressor, LRSRegressor
from causalml.inference.meta import LRSRegressor
from causalml.inference.meta import XGBTRegressor, MLPTRegressor
from causalml.inference.meta import BaseXRegressor
from causalml.inference.meta import BaseRRegressor
from xgboost import XGBRegressor
import pandas as pd
from sklearn.linear_model import LogisticRegression

# Read in Ace data
ace_data = pd.read_csv('ace_data.csv')

# y, X, treatment, _, _, e = synthetic_data(mode=1, n=1000, p=5, sigma=1.0)

# # Print the shape of the data
# print(X.shape)
# print(y.shape)
# print(treatment.shape)

In [32]:
# Get the column names of ace_data
column_names = ace_data.columns
print(column_names)

Index(['GENHLTH', 'MARITAL', '_SEX', 'MENTHLTH', '_EDUCAG', '_INCOMG1',
       'POORHLTH', 'ADDEPEV3', '_AGEG5YR', '_AGE65YR', '_AGE80', '_AGE_G',
       'DECIDE', 'DIFFALON', 'ACEDEPRS', 'ACEDRINK', 'ACEDRUGS', 'ACEPRISN',
       'ACEDIVRC', 'ACEPUNCH', 'ACEHURT1', 'ACESWEAR', 'ACETOUCH', 'ACETTHEM',
       'ACEHVSEX'],
      dtype='object')


Simple Example with Drugs as Treatment and Mental Health as Response

In [33]:
# filter out rows that have `nan` values in the 'ACEDRUGS' or 'MENTHLTH' columns
ace_data = ace_data.dropna(subset=['ACEDRUGS', 'MENTHLTH'])

# Filter the dataset to only include rows where the 'ACEDRUGS' column is less than 2
ace_data = ace_data[ace_data['ACEDRUGS'] < 3] # Only two levels of treatment

# Declare the treatment and outcome columns
treatment = ace_data['ACEDRUGS']
y = ace_data['MENTHLTH']

# # Subtract 1 from the treatment column
treatment = treatment - 1

print(treatment.unique())

# Declare X
X = ace_data[['_AGE_G', '_SEX', '_EDUCAG', '_INCOMG1']]

# Print the shapes of X, treatment, and y
print(X.shape)
print(y.shape)
print(treatment.shape)

[1. 0.]
(59508, 4)
(59508,)
(59508,)


# Propensity Score
Propensity score, which is the probability of receiving the treatment given the observed features.

In the context of causal inference, the propensity score is a balancing score: conditional on the propensity score, the distribution of observed covariates will be the same between treated and untreated subjects.

To create e with non-synthetic data, you would typically use a binary classification model where the features are your covariates and the target is whether or not the subject received treatment. The predicted probability of receiving treatment is your propensity score.

This code fits a logistic regression model to predict the treatment given the features, and then uses this model to compute the propensity score. Note that this is a very basic example and in practice you might need to consider more sophisticated models or methods to estimate the propensity score, depending on the complexity of your data.

In [39]:
# Calculate the propensity score (basic and prompt engineered could be wrong)
model = LogisticRegression()
model.fit(X, y)

# The propensity score
e = model.predict_proba(X)[:, 1]
print(len(e))

59508


In [34]:
learner_s = LRSRegressor()
ate_s = learner_s.estimate_ate(X=X, treatment=treatment, y=y)
print(ate_s)
print('ATE estimate: {:.03f}'.format(ate_s[0][0]))
print('ATE lower bound: {:.03f}'.format(ate_s[1][0]))
print('ATE upper bound: {:.03f}'.format(ate_s[2][0]))


(array([11.37570596]), array([10.38730163]), array([12.36411028]))
ATE estimate: 11.376
ATE lower bound: 10.387
ATE upper bound: 12.364


In [35]:
nn = MLPTRegressor(hidden_layer_sizes=(10, 10),
                 learning_rate_init=.1,
                 early_stopping=True,
                 random_state=42)
te, lb, ub = nn.estimate_ate(X, treatment, y)
print('Average Treatment Effect (Neural Network (MLP)): {:.2f} ({:.2f}, {:.2f})'.format(te[0], lb[0], ub[0]))

Average Treatment Effect (Neural Network (MLP)): 10.57 (9.62, 11.52)


In [40]:
xl = BaseXRegressor(learner=XGBRegressor(random_state=42))
te, lb, ub = xl.estimate_ate(X, treatment, y, e)
print('Average Treatment Effect (BaseXRegressor using XGBoost): {:.2f} ({:.2f}, {:.2f})'.format(te[0], lb[0], ub[0]))

Average Treatment Effect (BaseXRegressor using XGBoost): 12.37 (11.45, 13.29)


In [41]:
rl = BaseRRegressor(learner=XGBRegressor(random_state=42))
te, lb, ub =  rl.estimate_ate(X=X, p=e, treatment=treatment, y=y)
print('Average Treatment Effect (BaseRRegressor using XGBoost): {:.2f} ({:.2f}, {:.2f})'.format(te[0], lb[0], ub[0]))

Average Treatment Effect (BaseRRegressor using XGBoost): 1.32 (1.31, 1.32)
