In [71]:
from causalml.dataset import synthetic_data
from causalml.inference.meta import BaseXRegressor, BaseRRegressor, LRSRegressor, BaseDRLearner, MLPTRegressor, BaseRLearner, BaseSRegressor
from xgboost import XGBRegressor
import pandas as pd
from sklearn.linear_model import LogisticRegression

# Read in Ace data
ace_data = pd.read_csv('ace_data.csv')

# y, X, treatment, _, _, e = synthetic_data(mode=1, n=1000, p=5, sigma=1.0)

# # Print the shape of the data
# print(X.shape)
# print(y.shape)
# print(treatment.shape)

In [72]:
# Get the column names of ace_data
column_names = ace_data.columns
print(column_names)

Index(['FINALWT', 'GENHLTH', 'MARITAL', '_SEX', 'MENTHLTH', '_EDUCAG',
       '_INCOMG1', 'POORHLTH', 'ADDEPEV3', '_AGEG5YR', '_AGE65YR', '_AGE80',
       '_AGE_G', 'DECIDE', 'DIFFALON', 'ACEDEPRS', 'ACEDRINK', 'ACEDRUGS',
       'ACEPRISN', 'ACEDIVRC', 'ACEPUNCH', 'ACEHURT1', 'ACESWEAR', 'ACETOUCH',
       'ACETTHEM', 'ACEHVSEX'],
      dtype='object')


In [73]:
# Declare the list of treatments with multiple levels
multi_levels = ['ACEPUNCH', 'ACEHURT1', 'ACESWEAR', 'ACETOUCH', 'ACETTHEM', 'ACEHVSEX']

In [74]:
def perform_analysis(data, target_col, treatment_col, feature_cols, sample_weights_col = 'FINALWT'):

    # Drop NAs in the treatment and target columns
    data = data.dropna(subset=[treatment_col, target_col])
    
    # Filter out unwanted responses depending on the treatment
    if treatment_col == 'ACEDIVRC':
        # Create a dictionary mapping the recoded treatment levels to their original labels
        treatment_labels = {0: 'Yes', 1: 'No', 2: 'Parents not married'}

        # Keep only the responses that are 1, 2, or 8 (1 = Yes, 2 = No, 8 = Not Married)
        data = data[data[treatment_col].isin([1, 2, 8])]
        # Recode the treatment variables
        data[treatment_col] = data[treatment_col].map({1: 1, 2: 0, 8: 2})
        
        # Print the control group
        print('Control Group is {} i.e. Parents Married'.format(treatment_labels[0]))
        
        # Declare the treatment and target
        treatment = data[treatment_col]
        y = data[target_col]

        # Declare X
        X = data[feature_cols]
        
        # Declare the sample weights
        sample_weights = data[sample_weights_col]
        
        # Perform the analysis
        
        # Estimate the ATE using the LRS Regressor
        learner_s = LRSRegressor()
        te, lb, ub = learner_s.estimate_ate(X=X, treatment=treatment, y=y)
        # Print all the treatment average effects
        for treatment_level in range(len(te)):
            print('ATE with LRS Regressor on treatment {}: {:.2f} ({:.2f}, {:.2f})'\
                .format(treatment_labels[treatment_level + 1], te[treatment_level], lb[treatment_level], ub[treatment_level]))

        # Estimate the ATE using the Neural Network (MLP)
        nn = MLPTRegressor(hidden_layer_sizes=(10, 10),
                        learning_rate_init=.1,
                        early_stopping=True,
                        random_state=42)
        te, lb, ub = nn.estimate_ate(X, treatment, y)
        # Print all the treatment average effects
        for treatment_level in range(len(te)):
            print('ATE with Neural Network (MLP) on treatment level {}: {:.2f} ({:.2f}, {:.2f})'\
                  .format(treatment_labels[treatment_level + 1], te[treatment_level], lb[treatment_level], ub[treatment_level]))

        # Estimate the ATE using the BaseXRegressor
        xl = BaseXRegressor(learner=XGBRegressor(random_state=42))
        te, lb, ub = xl.estimate_ate(X, treatment, y)
        # Print all the treatment average effects
        for treatment_level in range(len(te)):
            print('ATE with BaseXRegressor using XGBoost on treatment level {}: {:.2f} ({:.2f}, {:.2f})'\
                  .format(treatment_labels[treatment_level + 1], te[treatment_level], lb[treatment_level], ub[treatment_level]))
        
        # Estimate the ATE using the BaseDRLearner
        dr = BaseDRLearner(learner=XGBRegressor(random_state=42))
        te, lb, ub = dr.estimate_ate(X, treatment, y)
        # Print all the treatment average effects
        for treatment_level in range(len(te)):
            print('ATE with BaseDRLearner using XGBoost on treatment level {}: {:.2f} ({:.2f}, {:.2f})'\
                .format(treatment_labels[treatment_level + 1], te[treatment_level], lb[treatment_level], ub[treatment_level]))
        
        # Estimate the ATE using the BaseRLearner
        rl = BaseRLearner(learner=XGBRegressor(random_state=42))
        te, lb, ub = rl.estimate_ate(X, treatment, y)
        # Print all the treatment average effects
        for treatment_level in range(len(te)):
            print('ATE with BaseRLearner using XGBoost on treatment level {}: {:.2f} ({:.2f}, {:.2f})'\
                .format(treatment_labels[treatment_level + 1], te[treatment_level], lb[treatment_level], ub[treatment_level]))
            
        # Print whitespace
        print('\n')
        
    elif treatment_col in multi_levels:
        
        # Create a dictionary mapping the recoded treatment levels to their original labels
        treatment_labels = {0: 'Never', 1: 'Once', 2: 'More than once'}
        
        # Print the control group
        print('Control Group is {} i.e. Never had ACE'.format(treatment_labels[0]))
        
        # Keep only the responses that are 1 or 2 or 3 (1 = Never, 2 = Once, 3 = More than once)
        data = data[data[treatment_col].isin([1, 2, 3])]
        # Recode the treatment and target variables
        data[treatment_col] = data[treatment_col].map({1: 0, 2: 1, 3: 2})
        
        # Declare the treatment and target
        treatment = data[treatment_col]
        y = data[target_col]

        # Declare X
        X = data[feature_cols]
        
        # Declare the sample weights
        sample_weights = data[sample_weights_col]

        # Perform the analysis -----------------------------------------
        
        # Estimate the ATE using the LRS Regressor
        learner_s = LRSRegressor()
        te, lb, ub = learner_s.estimate_ate(X=X, treatment=treatment, y=y)
        # Print all the treatment average effects
        for treatment_level in range(len(te)):
            print('ATE with LRS Regressor on treatment {}: {:.2f} ({:.2f}, {:.2f})'\
                .format(treatment_labels[treatment_level + 1], te[treatment_level], lb[treatment_level], ub[treatment_level]))

        # Estimate the ATE using the Neural Network (MLP)
        nn = MLPTRegressor(hidden_layer_sizes=(10, 10),
                        learning_rate_init=.1,
                        early_stopping=True,
                        random_state=42)
        te, lb, ub = nn.estimate_ate(X, treatment, y)
        # Print all the treatment average effects
        for treatment_level in range(len(te)):
            print('ATE with Neural Network (MLP) on treatment level {}: {:.2f} ({:.2f}, {:.2f})'\
                  .format(treatment_labels[treatment_level + 1], te[treatment_level], lb[treatment_level], ub[treatment_level]))

        # Estimate the ATE using the BaseXRegressor
        xl = BaseXRegressor(learner=XGBRegressor(random_state=42))
        te, lb, ub = xl.estimate_ate(X, treatment, y)
        # Print all the treatment average effects
        for treatment_level in range(len(te)):
            print('ATE with BaseXRegressor using XGBoost on treatment level {}: {:.2f} ({:.2f}, {:.2f})'\
                  .format(treatment_labels[treatment_level + 1], te[treatment_level], lb[treatment_level], ub[treatment_level]))
        
        # Estimate the ATE using the BaseDRLearner
        dr = BaseDRLearner(learner=XGBRegressor(random_state=42))
        te, lb, ub = dr.estimate_ate(X, treatment, y)
        # Print all the treatment average effects
        for treatment_level in range(len(te)):
            print('ATE with BaseDRLearner using XGBoost on treatment level {}: {:.2f} ({:.2f}, {:.2f})'\
                .format(treatment_labels[treatment_level + 1], te[treatment_level], lb[treatment_level], ub[treatment_level]))
        
        # Estimate the ATE using the BaseRLearner
        rl = BaseRLearner(learner=XGBRegressor(random_state=42))
        te, lb, ub = rl.estimate_ate(X, treatment, y)
        # Print all the treatment average effects
        for treatment_level in range(len(te)):
            print('ATE with BaseRLearner using XGBoost on treatment level {}: {:.2f} ({:.2f}, {:.2f})'\
                .format(treatment_labels[treatment_level + 1], te[treatment_level], lb[treatment_level], ub[treatment_level]))
            
        # Print whitespace
        print('\n')
    else:
        # Create a dictionary mapping the recoded treatment levels to their original labels
        treatment_labels = {0: 'No', 1: 'Yes'}
        
        # Print the control group
        print('Control Group is {} i.e. No Answer to ACE Question'.format(treatment_labels[0]))
        
        # Keep only the responses that are 1 or 2 or 3 (1 = Yes, 2 = No)
        data = data[data[treatment_col].isin([1, 2])]
        
        # Recode the treatment variable
        data[treatment_col] = data[treatment_col].map({2: 0, 1: 1})
        
        # Declare the treatment and target
        treatment = data[treatment_col]
        y = data[target_col]

        # Declare X
        X = data[feature_cols]
        
        # Declare the sample weights
        sample_weights = data[sample_weights_col]
        
        # Calculate the propensity score
        model = LogisticRegression()
        model.fit(X, y)
        e = model.predict_proba(X)[:, 1]
        
        # Perform the analysis -----------------------------------------
        
        # Estimate the ATE using the LRS Regressor
        learner_s = LRSRegressor()
        te, lb, ub = learner_s.estimate_ate(X=X, treatment=treatment, y=y)
        # Print all the treatment average effects
        for treatment_level in range(len(te)):
            print('ATE with LRS Regressor on treatment {}: {:.2f} ({:.2f}, {:.2f})'\
                .format(treatment_labels[treatment_level + 1], te[treatment_level], lb[treatment_level], ub[treatment_level]))

        # Estimate the ATE using the Neural Network (MLP)
        nn = MLPTRegressor(hidden_layer_sizes=(10, 10),
                        learning_rate_init=.1,
                        early_stopping=True,
                        random_state=42)
        te, lb, ub = nn.estimate_ate(X, treatment, y)
        # Print all the treatment average effects
        for treatment_level in range(len(te)):
            print('ATE with Neural Network (MLP) on treatment level {}: {:.2f} ({:.2f}, {:.2f})'\
                  .format(treatment_labels[treatment_level + 1], te[treatment_level], lb[treatment_level], ub[treatment_level]))

        # Estimate the ATE using the BaseXRegressor
        xl = BaseXRegressor(learner=XGBRegressor(random_state=42))
        te, lb, ub = xl.estimate_ate(X, treatment, y, e)
        # Print all the treatment average effects
        for treatment_level in range(len(te)):
            print('ATE with BaseXRegressor using XGBoost on treatment level {}: {:.2f} ({:.2f}, {:.2f})'\
                  .format(treatment_labels[treatment_level + 1], te[treatment_level], lb[treatment_level], ub[treatment_level]))
        
        # Estimate the ATE using the BaseDRLearner
        dr = BaseDRLearner(learner=XGBRegressor(random_state=42))
        te, lb, ub = dr.estimate_ate(X, treatment, y, e)
        # Print all the treatment average effects
        for treatment_level in range(len(te)):
            print('ATE with BaseDRLearner using XGBoost on treatment level {}: {:.2f} ({:.2f}, {:.2f})'\
                .format(treatment_labels[treatment_level + 1], te[treatment_level], lb[treatment_level], ub[treatment_level]))
        
        # Estimate the ATE using the BaseRLearner
        rl = BaseRLearner(learner=XGBRegressor(random_state=42))
        te, lb, ub = rl.estimate_ate(X, treatment, y, e)
        # Print all the treatment average effects
        for treatment_level in range(len(te)):
            print('ATE with BaseRLearner using XGBoost on treatment level {}: {:.2f} ({:.2f}, {:.2f})'\
                .format(treatment_labels[treatment_level + 1], te[treatment_level], lb[treatment_level], ub[treatment_level]))
        
        # Print whitespace
        print('\n')
    
    

In [75]:
def recode_target(data, target_col):
    # If Target is ADDEPEV3, recode the target variable to binary
    if target_col == 'ADDEPEV3':
        data[target_col] = data[target_col].map({1: 1, 2: 0})
    
    # If target is MENTHLTH, filter out responses that are 77 or 99 and recode 88 to 0
    if target_col == 'MENTHLTH':
        
        # Filter out responses that are 77
        data = data[data[target_col] != 77]
        
        # Filter out responses that are 99
        data = data[data[target_col] != 99]
        
        # Replace the value 88 to 0 and leave the rest as they are
        data[target_col] = data[target_col].replace({88: 0})
    
    return data

In [76]:
# Declare the feature columns
feature_cols = ['_AGE_G', '_SEX', '_EDUCAG', '_INCOMG1']

# Declare the target columns
target_cols = ['ADDEPEV3', 'MENTHLTH']

# Declare the treatment columns
treatment_cols = ['ACEDEPRS', 'ACEDRINK', 'ACEDRUGS', 'ACEPRISN',
       'ACEDIVRC', 'ACEPUNCH', 'ACEHURT1', 'ACESWEAR', 'ACETOUCH', 'ACETTHEM',
       'ACEHVSEX']

# Declare the sample weights column
sample_weights_col = 'FINALWT'

# Iterate over all combinations of target and treatment columns
for target in target_cols:
    # Recode the target variable
    ace_data_df = recode_target(ace_data, target)
    for treatment in treatment_cols:
        print(f"Performing analysis for target {target} and treatment {treatment}")
        perform_analysis(ace_data_df, target, treatment, feature_cols, sample_weights_col)


Performing analysis for target ADDEPEV3 and treatment ACEDEPRS
Control Group is No i.e. No Answer to ACE Question
ATE with LRS Regressor on treatment Yes: 0.28 (0.27, 0.29)
ATE with Neural Network (MLP) on treatment level Yes: 0.30 (0.29, 0.31)
ATE with BaseXRegressor using XGBoost on treatment level Yes: 0.26 (0.25, 0.27)
ATE with BaseDRLearner using XGBoost on treatment level Yes: 0.26 (0.25, 0.27)
ATE with BaseRLearner using XGBoost on treatment level Yes: 0.25 (0.25, 0.25)


Performing analysis for target ADDEPEV3 and treatment ACEDRINK
Control Group is No i.e. No Answer to ACE Question
ATE with LRS Regressor on treatment Yes: 0.13 (0.12, 0.14)
ATE with Neural Network (MLP) on treatment level Yes: 0.14 (0.13, 0.14)
ATE with BaseXRegressor using XGBoost on treatment level Yes: 0.12 (0.11, 0.12)
ATE with BaseDRLearner using XGBoost on treatment level Yes: 0.11 (0.11, 0.12)
ATE with BaseRLearner using XGBoost on treatment level Yes: 0.11 (0.11, 0.11)


Performing analysis for target A

In [77]:
# def generate_results(learner, X, treatment, y, treatment_labels, learner_name):
#     te, lb, ub = learner.estimate_ate(X, treatment, y)
#     # Print all the treatment average effects
#     for treatment_level in range(len(te)):
#         print('ATE with {} on treatment level {}: {:.2f} ({:.2f}, {:.2f})'\
#               .format(learner_name, treatment_labels[treatment_level + 1], te[treatment_level], lb[treatment_level], ub[treatment_level]))

# def perform_analysis(data, target, treatment, feature_cols, sample_weights_col):
#     # Extract the relevant data
#     X = data[feature_cols]
#     y = data[target]
#     treatment = data[treatment]
#     sample_weights = data[sample_weights_col]

#     # Perform the analysis -----------------------------------------
    
#     # Estimate the ATE using the LRS Regressor
#     learner_s = LRSRegressor()
#     generate_results(learner_s, X, treatment, y, 'LRS Regressor')

#     # Estimate the ATE using the Neural Network (MLP)
#     nn = MLPTRegressor(hidden_layer_sizes=(10, 10),
#                     learning_rate_init=.1,
#                     early_stopping=True,
#                     random_state=42)
#     generate_results(nn, X, treatment, y, 'Neural Network (MLP)')

#     # Estimate the ATE using the BaseXRegressor
#     xl = BaseXRegressor(learner=XGBRegressor(random_state=42))
#     generate_results(xl, X, treatment, y, 'BaseXRegressor using XGBoost')

#     # Estimate the ATE using the BaseDRLearner
#     dr = BaseDRLearner(learner=XGBRegressor(random_state=42))
#     generate_results(dr, X, treatment, y, 'BaseDRLearner using XGBoost')

#     # Estimate the ATE using the BaseRLearner
#     rl = BaseRLearner(learner=XGBRegressor(random_state=42))
#     generate_results(rl, X, treatment, y, 'BaseRLearner using XGBoost')
    
#     # Print whitespace
#     print('\n')

In [78]:
# # Declare the feature columns
# feature_cols = ['_AGE_G', '_SEX', '_EDUCAG', '_INCOMG1']

# # Declare the target columns
# target_cols = ['ADDEPEV3', 'MENTHLTH']

# # Declare the treatment columns
# treatment_cols = ['ACEDEPRS', 'ACEDRINK', 'ACEDRUGS', 'ACEPRISN',
#        'ACEDIVRC', 'ACEPUNCH', 'ACEHURT1', 'ACESWEAR', 'ACETOUCH', 'ACETTHEM',
#        'ACEHVSEX']

# # Declare the sample weights column
# sample_weights_col = 'FINALWT'

# # Recode the target columns
# for target in target_cols:
#     ace_data = recode_target(ace_data, target)

# # Iterate over all combinations of target and treatment columns
# for target in target_cols:
#     for treatment in treatment_cols:
#         print(f"Performing analysis for target {target} and treatment {treatment}")
#         perform_analysis(ace_data, target, treatment, feature_cols, sample_weights_col)

Copilot on Interpretation:

For the binary target ADDEPEV3 with treatment ACEDEPRS:

The Average Treatment Effect (ATE) is a measure of the difference in mean (average) outcomes between units that received the treatment and those that did not. In this case, the ATE is the difference in the average outcome of ADDEPEV3 (whether a person has experienced a depressive episode) between those who have experienced ACEDEPRS (a form of adverse childhood experience) and those who have not.

The results from the different models (LRS Regressor, Neural Network, BaseXRegressor using XGBoost, and BaseRRegressor using XGBoost) are all around 0.25 to 0.30. This suggests that, on average, experiencing ACEDEPRS increases the likelihood of having a depressive episode (ADDEPEV3) by about 25% to 30%. The numbers in parentheses are the lower and upper bounds of a 95% confidence interval for the ATE, indicating the range within which we can be 95% confident that the true ATE lies.

In simpler terms: These results suggest that people who have experienced this particular adverse childhood experience are about 25% to 30% more likely to have had a depressive episode.

For the continuous target MENTHLTH with treatment ACEHVSEX:

The interpretation is similar, but now the ATE represents the difference in the average number of days of poor mental health (MENTHLTH) between those who have experienced ACEHVSEX (another form of adverse childhood experience) and those who have not.

The results from the different models suggest that, on average, experiencing ACEHVSEX increases the number of days of poor mental health by about 10 to 13 days. The BaseRRegressor using XGBoost model seems to be an outlier with an ATE of 0.26, which might suggest some issue with the model or the data.

In simpler terms: These results suggest that people who have experienced this particular adverse childhood experience have, on average, 10 to 13 more days of poor mental health.

Simple Example with Drugs as Treatment and Mental Health as Response

In [79]:
# # filter out rows that have `nan` values in the 'ACEDRUGS' or 'MENTHLTH' columns
# ace_data = ace_data.dropna(subset=['ACEDRUGS', 'MENTHLTH'])

# # Filter the dataset to only include rows where the 'ACEDRUGS' column is less than 2
# ace_data = ace_data[ace_data['ACEDRUGS'] < 3] # Only two levels of treatment

# # Declare the treatment
# treatment = ace_data['ACEDRUGS']

# # Declare the target
# # y = ace_data['MENTHLTH']
# y = ace_data['ACEDEPRS']

# # # Subtract 1 from the treatment column
# treatment = treatment - 1 
# # TODO I need to confirm what 0 and 1 should mean for CausalML i.e. YES and NO or NO and YES

# print(treatment.unique())

# # Declare X
# X = ace_data[['_AGE_G', '_SEX', '_EDUCAG', '_INCOMG1']]

# # Print the shapes of X, treatment, and y
# print(X.shape)
# print(y.shape)
# print(treatment.shape)

# Propensity Score
Propensity score, which is the probability of receiving the treatment given the observed features.

In the context of causal inference, the propensity score is a balancing score: conditional on the propensity score, the distribution of observed covariates will be the same between treated and untreated subjects.

To create e with non-synthetic data, you would typically use a binary classification model where the features are your covariates and the target is whether or not the subject received treatment. The predicted probability of receiving treatment is your propensity score.

This code fits a logistic regression model to predict the treatment given the features, and then uses this model to compute the propensity score. Note that this is a very basic example and in practice you might need to consider more sophisticated models or methods to estimate the propensity score, depending on the complexity of your data.

In [80]:
# # Calculate the propensity score (basic and prompt engineered could be wrong)
# model = LogisticRegression()
# model.fit(X, y)

# # The propensity score
# e = model.predict_proba(X)[:, 1]
# print(len(e))

In [81]:
# learner_s = LRSRegressor()
# ate_s = learner_s.estimate_ate(X=X, treatment=treatment, y=y)
# print(ate_s)
# print('ATE estimate: {:.03f}'.format(ate_s[0][0]))
# print('ATE lower bound: {:.03f}'.format(ate_s[1][0]))
# print('ATE upper bound: {:.03f}'.format(ate_s[2][0]))


In [82]:
# nn = MLPTRegressor(hidden_layer_sizes=(10, 10),
#                  learning_rate_init=.1,
#                  early_stopping=True,
#                  random_state=42)
# te, lb, ub = nn.estimate_ate(X, treatment, y)
# print('Average Treatment Effect (Neural Network (MLP)): {:.2f} ({:.2f}, {:.2f})'.format(te[0], lb[0], ub[0]))

In [83]:
# xl = BaseXRegressor(learner=XGBRegressor(random_state=42))
# te, lb, ub = xl.estimate_ate(X, treatment, y, e)
# print('Average Treatment Effect (BaseXRegressor using XGBoost): {:.2f} ({:.2f}, {:.2f})'.format(te[0], lb[0], ub[0]))

In [84]:
# rl = BaseRRegressor(learner=XGBRegressor(random_state=42))
# te, lb, ub =  rl.estimate_ate(X=X, p=e, treatment=treatment, y=y)
# print('Average Treatment Effect (BaseRRegressor using XGBoost): {:.2f} ({:.2f}, {:.2f})'.format(te[0], lb[0], ub[0]))