Skip to content

Commit

Permalink
Fix Filter F doesn't work with latest statsmodels' F test fvalue form…
Browse files Browse the repository at this point in the history
…at (#505)

* Fix Filter F doesn't work with latest statsmodels' F test fvalue format
* Fix Lint/Black for coding style compliance
  • Loading branch information
paullo0106 committed May 1, 2022
1 parent 1cd2906 commit 87950fb
Show file tree
Hide file tree
Showing 4 changed files with 91 additions and 25 deletions.
2 changes: 1 addition & 1 deletion causalml/feature_selection/filters.py
Original file line number Diff line number Diff line change
Expand Up @@ -47,7 +47,7 @@ def _filter_F_one_feature(data, treatment_indicator, feature_name, y_name):
{
"feature": feature_name, # for the interaction, not the main effect
"method": "F-statistic",
"score": F_test.fvalue[0][0],
"score": float(F_test.fvalue),
"p_value": F_test.pvalue,
"misc": "df_num: {}, df_denom: {}".format(
F_test.df_num, F_test.df_denom
Expand Down
2 changes: 1 addition & 1 deletion causalml/optimize/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,4 +2,4 @@
from .unit_selection import CounterfactualUnitSelector
from .utils import get_treatment_costs, get_actual_value, get_uplift_best
from .value_optimization import CounterfactualValueEstimator
from .pns import get_pns_bounds
from .pns import get_pns_bounds
48 changes: 25 additions & 23 deletions causalml/optimize/pns.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,8 @@
import pandas as pd


def get_pns_bounds(data_exp, data_obs, T, Y, type='PNS'):
'''
def get_pns_bounds(data_exp, data_obs, T, Y, type="PNS"):
"""
Args
----
data_exp : DataFrame
Expand All @@ -29,47 +29,49 @@ def get_pns_bounds(data_exp, data_obs, T, Y, type='PNS'):
of an intervention.
The experimental and observational data are either assumed to come from the same population,
or from random samples of the population. If the data are from a sample, the bounds may
or from random samples of the population. If the data are from a sample, the bounds may
be incorrectly calculated because the relevant quantities in the Tian-Pearl equations are
defined e.g. as P(YifT), not P(YifT \mid S) where S corresponds to sample selection.
Bareinboim and Pearl (https://www.pnas.org/doi/10.1073/pnas.1510507113) discuss conditions
under which P(YifT) can be recovered from P(YifT \mid S).
'''
"""

# Probabilities calculated from observational data
Y1 = data_obs[Y].mean()
T1Y0 = data_obs.loc[(data_obs[T] == 1) & (data_obs[Y] == 0)].shape[0] / data_obs.shape[0]
T1Y1 = data_obs.loc[(data_obs[T] == 1) & (data_obs[Y] == 1)].shape[0] / data_obs.shape[0]
T0Y0 = data_obs.loc[(data_obs[T] == 0) & (data_obs[Y] == 0)].shape[0] / data_obs.shape[0]
T0Y1 = data_obs.loc[(data_obs[T] == 0) & (data_obs[Y] == 1)].shape[0] / data_obs.shape[0]
T1Y0 = (
data_obs.loc[(data_obs[T] == 1) & (data_obs[Y] == 0)].shape[0]
/ data_obs.shape[0]
)
T1Y1 = (
data_obs.loc[(data_obs[T] == 1) & (data_obs[Y] == 1)].shape[0]
/ data_obs.shape[0]
)
T0Y0 = (
data_obs.loc[(data_obs[T] == 0) & (data_obs[Y] == 0)].shape[0]
/ data_obs.shape[0]
)
T0Y1 = (
data_obs.loc[(data_obs[T] == 0) & (data_obs[Y] == 1)].shape[0]
/ data_obs.shape[0]
)

# Probabilities calculated from experimental data
Y1doT1 = data_exp.loc[data_exp[T] == 1, Y].mean()
Y1doT0 = data_exp.loc[data_exp[T] == 0, Y].mean()
Y0doT0 = 1 - Y1doT0

if type == 'PNS':
if type == "PNS":

lb_args = [
0,
Y1doT1 - Y1doT0,
Y1 - Y1doT0,
Y1doT1 - Y1
]
lb_args = [0, Y1doT1 - Y1doT0, Y1 - Y1doT0, Y1doT1 - Y1]

ub_args = [
Y1doT1,
Y0doT0,
T1Y1 + T0Y0,
Y1doT1 - Y1doT0 + T1Y0 + T0Y1
]
ub_args = [Y1doT1, Y0doT0, T1Y1 + T0Y0, Y1doT1 - Y1doT0 + T1Y0 + T0Y1]

if type == 'PN':
if type == "PN":

lb_args = [0, (Y1 - Y1doT0) / T1Y1]
ub_args = [1, (Y0doT0 - T0Y0) / T1Y1]

if type == 'PS':
if type == "PS":

lb_args = [0, (Y1doT1 - Y1) / T0Y0]
ub_args = [1, (Y1doT1 - T1Y1) / T0Y0]
Expand Down
64 changes: 64 additions & 0 deletions tests/test_feature_selection.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,64 @@
import numpy as np
from causalml.feature_selection.filters import FilterSelect

from .const import RANDOM_SEED, CONVERSION


def test_filter_f(generate_classification_data):
    """F-statistic filter: one row per feature, ranked with scores descending."""
    # deterministic uplift classification data
    np.random.seed(RANDOM_SEED)
    df, X_names = generate_classification_data()

    # run the F filter against the first treatment group
    f_imp = FilterSelect().get_importance(
        df, X_names, CONVERSION, "F", treatment_group="treatment1"
    )

    # every feature gets a row; the top row is rank 1 and has the best score
    assert len(f_imp) == len(X_names)
    assert f_imp["rank"].iloc[0] == 1
    assert f_imp["score"].iloc[0] >= f_imp["score"].iloc[1]


def test_filter_lr(generate_classification_data):
    """Likelihood-ratio filter: one row per feature, ranked with scores descending."""
    # deterministic uplift classification data
    np.random.seed(RANDOM_SEED)
    df, X_names = generate_classification_data()

    # run the LR filter against the first treatment group
    imp = FilterSelect().get_importance(
        df, X_names, CONVERSION, "LR", treatment_group="treatment1"
    )

    # every feature gets a row; the top row is rank 1 and has the best score
    assert len(imp) == len(X_names)
    assert imp["rank"].iloc[0] == 1
    assert imp["score"].iloc[0] >= imp["score"].iloc[1]


def test_filter_kl(generate_classification_data):
    """KL-divergence filter: one row per feature, ranked with scores descending."""
    # deterministic uplift classification data
    np.random.seed(RANDOM_SEED)
    df, X_names = generate_classification_data()

    # run the KL filter against the first treatment group
    imp = FilterSelect().get_importance(
        df, X_names, CONVERSION, "KL", treatment_group="treatment1"
    )

    # every feature gets a row; the top row is rank 1 and has the best score
    assert len(imp) == len(X_names)
    assert imp["rank"].iloc[0] == 1
    assert imp["score"].iloc[0] >= imp["score"].iloc[1]

0 comments on commit 87950fb

Please sign in to comment.