From 87950fbe4be9b207fd37ac476694e05096998e02 Mon Sep 17 00:00:00 2001 From: Paul Lo Date: Sat, 30 Apr 2022 18:52:22 -0700 Subject: [PATCH] Fix Filter F doesn't work with latest statsmodels' F test fvalue format (#505) * Fix Filter F doesn't work with latest statsmodels' F test fvalue format * Fix Lint/ Black for coding style compliance --- causalml/feature_selection/filters.py | 2 +- causalml/optimize/__init__.py | 2 +- causalml/optimize/pns.py | 48 ++++++++++---------- tests/test_feature_selection.py | 64 +++++++++++++++++++++++++++ 4 files changed, 91 insertions(+), 25 deletions(-) create mode 100644 tests/test_feature_selection.py diff --git a/causalml/feature_selection/filters.py b/causalml/feature_selection/filters.py index 81ac94d3..da9897eb 100644 --- a/causalml/feature_selection/filters.py +++ b/causalml/feature_selection/filters.py @@ -47,7 +47,7 @@ def _filter_F_one_feature(data, treatment_indicator, feature_name, y_name): { "feature": feature_name, # for the interaction, not the main effect "method": "F-statistic", - "score": F_test.fvalue[0][0], + "score": float(F_test.fvalue), "p_value": F_test.pvalue, "misc": "df_num: {}, df_denom: {}".format( F_test.df_num, F_test.df_denom diff --git a/causalml/optimize/__init__.py b/causalml/optimize/__init__.py index 1d1ba9c0..6379fc0c 100644 --- a/causalml/optimize/__init__.py +++ b/causalml/optimize/__init__.py @@ -2,4 +2,4 @@ from .unit_selection import CounterfactualUnitSelector from .utils import get_treatment_costs, get_actual_value, get_uplift_best from .value_optimization import CounterfactualValueEstimator -from .pns import get_pns_bounds \ No newline at end of file +from .pns import get_pns_bounds diff --git a/causalml/optimize/pns.py b/causalml/optimize/pns.py index 6f835b70..ce02a547 100644 --- a/causalml/optimize/pns.py +++ b/causalml/optimize/pns.py @@ -2,8 +2,8 @@ import pandas as pd -def get_pns_bounds(data_exp, data_obs, T, Y, 
type="PNS"): + """ Args ---- data_exp : DataFrame @@ -29,47 +29,49 @@ def get_pns_bounds(data_exp, data_obs, T, Y, type='PNS'): of an intervention. The experimental and observational data are either assumed to come to the same population, - or from random samples of the population. If the data are from a sample, the bounds may + or from random samples of the population. If the data are from a sample, the bounds may be incorrectly calculated because the relevant quantities in the Tian-Pearl equations are defined e.g. as P(YifT), not P(YifT \mid S) where S corresponds to sample selection. Bareinboim and Pearl (https://www.pnas.org/doi/10.1073/pnas.1510507113) discuss conditions under which P(YifT) can be recovered from P(YifT \mid S). - ''' + """ # Probabilities calculated from observational data Y1 = data_obs[Y].mean() - T1Y0 = data_obs.loc[(data_obs[T] == 1) & (data_obs[Y] == 0)].shape[0] / data_obs.shape[0] - T1Y1 = data_obs.loc[(data_obs[T] == 1) & (data_obs[Y] == 1)].shape[0] / data_obs.shape[0] - T0Y0 = data_obs.loc[(data_obs[T] == 0) & (data_obs[Y] == 0)].shape[0] / data_obs.shape[0] - T0Y1 = data_obs.loc[(data_obs[T] == 0) & (data_obs[Y] == 1)].shape[0] / data_obs.shape[0] + T1Y0 = ( + data_obs.loc[(data_obs[T] == 1) & (data_obs[Y] == 0)].shape[0] + / data_obs.shape[0] + ) + T1Y1 = ( + data_obs.loc[(data_obs[T] == 1) & (data_obs[Y] == 1)].shape[0] + / data_obs.shape[0] + ) + T0Y0 = ( + data_obs.loc[(data_obs[T] == 0) & (data_obs[Y] == 0)].shape[0] + / data_obs.shape[0] + ) + T0Y1 = ( + data_obs.loc[(data_obs[T] == 0) & (data_obs[Y] == 1)].shape[0] + / data_obs.shape[0] + ) # Probabilities calculated from experimental data Y1doT1 = data_exp.loc[data_exp[T] == 1, Y].mean() Y1doT0 = data_exp.loc[data_exp[T] == 0, Y].mean() Y0doT0 = 1 - Y1doT0 - if type == 'PNS': + if type == "PNS": - lb_args = [ - 0, - Y1doT1 - Y1doT0, - Y1 - Y1doT0, - Y1doT1 - Y1 - ] + lb_args = [0, Y1doT1 - Y1doT0, Y1 - Y1doT0, Y1doT1 - Y1] - ub_args = [ - Y1doT1, - Y0doT0, - T1Y1 + T0Y0, - 
Y1doT1 - Y1doT0 + T1Y0 + T0Y1 - ] + ub_args = [Y1doT1, Y0doT0, T1Y1 + T0Y0, Y1doT1 - Y1doT0 + T1Y0 + T0Y1] - if type == 'PN': + if type == "PN": lb_args = [0, (Y1 - Y1doT0) / T1Y1] ub_args = [1, (Y0doT0 - T0Y0) / T1Y1] - if type == 'PS': + if type == "PS": lb_args = [0, (Y1doT1 - Y1) / T0Y0] ub_args = [1, (Y1doT1 - T1Y1) / T0Y0] diff --git a/tests/test_feature_selection.py b/tests/test_feature_selection.py new file mode 100644 index 00000000..39ba91bf --- /dev/null +++ b/tests/test_feature_selection.py @@ -0,0 +1,64 @@ +import numpy as np +from causalml.feature_selection.filters import FilterSelect + +from .const import RANDOM_SEED, CONVERSION + + +def test_filter_f(generate_classification_data): + # generate uplift classification data + np.random.seed(RANDOM_SEED) + df, X_names = generate_classification_data() + y_name = CONVERSION + + # test F filter + method = "F" + filter_f = FilterSelect() + f_imp = filter_f.get_importance( + df, X_names, y_name, method, treatment_group="treatment1" + ) + + # each row represents the rank and importance score of each feature + # and spot check if it's sorted properly + assert f_imp.shape[0] == len(X_names) + assert f_imp["rank"].values[0] == 1 + assert f_imp["score"].values[0] >= f_imp["score"].values[1] + + +def test_filter_lr(generate_classification_data): + # generate uplift classification data + np.random.seed(RANDOM_SEED) + df, X_names = generate_classification_data() + y_name = CONVERSION + + # test LR filter + method = "LR" + filter_obj = FilterSelect() + imp = filter_obj.get_importance( + df, X_names, y_name, method, treatment_group="treatment1" + ) + + # each row represents the rank and importance score of each feature + # and spot check if it's sorted properly + assert imp.shape[0] == len(X_names) + assert imp["rank"].values[0] == 1 + assert imp["score"].values[0] >= imp["score"].values[1] + + +def test_filter_kl(generate_classification_data): + # generate uplift classification data + np.random.seed(RANDOM_SEED) + df, 
X_names = generate_classification_data() + y_name = CONVERSION + + # test KL filter + method = "KL" + filter_obj = FilterSelect() + imp = filter_obj.get_importance( + df, X_names, y_name, method, treatment_group="treatment1" + ) + + # each row represents the rank and importance score of each feature + # and spot check if it's sorted properly + assert imp.shape[0] == len(X_names) + assert imp["rank"].values[0] == 1 + assert imp["score"].values[0] >= imp["score"].values[1]