Skip to content

Commit

Permalink
Fix Filter F doesn't work with latest statsmodels' F test fvalue form…
Browse files Browse the repository at this point in the history
…at (#505)

* Fix Filter F doesn't work with latest statsmodels' F test fvalue format
* Fix Lint/Black for coding style compliance
  • Loading branch information
paullo0106 committed May 1, 2022
1 parent 1cd2906 commit 87950fb
Show file tree
Hide file tree
Showing 4 changed files with 91 additions and 25 deletions.
2 changes: 1 addition & 1 deletion causalml/feature_selection/filters.py
Original file line number Diff line number Diff line change
Expand Up @@ -47,7 +47,7 @@ def _filter_F_one_feature(data, treatment_indicator, feature_name, y_name):
{
"feature": feature_name, # for the interaction, not the main effect
"method": "F-statistic",
"score": F_test.fvalue[0][0],
"score": float(F_test.fvalue),
"p_value": F_test.pvalue,
"misc": "df_num: {}, df_denom: {}".format(
F_test.df_num, F_test.df_denom
Expand Down
2 changes: 1 addition & 1 deletion causalml/optimize/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,4 +2,4 @@
from .unit_selection import CounterfactualUnitSelector
from .utils import get_treatment_costs, get_actual_value, get_uplift_best
from .value_optimization import CounterfactualValueEstimator
from .pns import get_pns_bounds
from .pns import get_pns_bounds
48 changes: 25 additions & 23 deletions causalml/optimize/pns.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,8 @@
import pandas as pd


def get_pns_bounds(data_exp, data_obs, T, Y, type='PNS'):
'''
def get_pns_bounds(data_exp, data_obs, T, Y, type="PNS"):
"""
Args
----
data_exp : DataFrame
Expand All @@ -29,47 +29,49 @@ def get_pns_bounds(data_exp, data_obs, T, Y, type='PNS'):
of an intervention.
The experimental and observational data are either assumed to come from the same population,
or from random samples of the population. If the data are from a sample, the bounds may
or from random samples of the population. If the data are from a sample, the bounds may
be incorrectly calculated because the relevant quantities in the Tian-Pearl equations are
defined e.g. as P(YifT), not P(YifT \mid S) where S corresponds to sample selection.
Bareinboim and Pearl (https://www.pnas.org/doi/10.1073/pnas.1510507113) discuss conditions
under which P(YifT) can be recovered from P(YifT \mid S).
'''
"""

# Probabilities calculated from observational data
Y1 = data_obs[Y].mean()
T1Y0 = data_obs.loc[(data_obs[T] == 1) & (data_obs[Y] == 0)].shape[0] / data_obs.shape[0]
T1Y1 = data_obs.loc[(data_obs[T] == 1) & (data_obs[Y] == 1)].shape[0] / data_obs.shape[0]
T0Y0 = data_obs.loc[(data_obs[T] == 0) & (data_obs[Y] == 0)].shape[0] / data_obs.shape[0]
T0Y1 = data_obs.loc[(data_obs[T] == 0) & (data_obs[Y] == 1)].shape[0] / data_obs.shape[0]
T1Y0 = (
data_obs.loc[(data_obs[T] == 1) & (data_obs[Y] == 0)].shape[0]
/ data_obs.shape[0]
)
T1Y1 = (
data_obs.loc[(data_obs[T] == 1) & (data_obs[Y] == 1)].shape[0]
/ data_obs.shape[0]
)
T0Y0 = (
data_obs.loc[(data_obs[T] == 0) & (data_obs[Y] == 0)].shape[0]
/ data_obs.shape[0]
)
T0Y1 = (
data_obs.loc[(data_obs[T] == 0) & (data_obs[Y] == 1)].shape[0]
/ data_obs.shape[0]
)

# Probabilities calculated from experimental data
Y1doT1 = data_exp.loc[data_exp[T] == 1, Y].mean()
Y1doT0 = data_exp.loc[data_exp[T] == 0, Y].mean()
Y0doT0 = 1 - Y1doT0

if type == 'PNS':
if type == "PNS":

lb_args = [
0,
Y1doT1 - Y1doT0,
Y1 - Y1doT0,
Y1doT1 - Y1
]
lb_args = [0, Y1doT1 - Y1doT0, Y1 - Y1doT0, Y1doT1 - Y1]

ub_args = [
Y1doT1,
Y0doT0,
T1Y1 + T0Y0,
Y1doT1 - Y1doT0 + T1Y0 + T0Y1
]
ub_args = [Y1doT1, Y0doT0, T1Y1 + T0Y0, Y1doT1 - Y1doT0 + T1Y0 + T0Y1]

if type == 'PN':
if type == "PN":

lb_args = [0, (Y1 - Y1doT0) / T1Y1]
ub_args = [1, (Y0doT0 - T0Y0) / T1Y1]

if type == 'PS':
if type == "PS":

lb_args = [0, (Y1doT1 - Y1) / T0Y0]
ub_args = [1, (Y1doT1 - T1Y1) / T0Y0]
Expand Down
64 changes: 64 additions & 0 deletions tests/test_feature_selection.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,64 @@
import numpy as np
from causalml.feature_selection.filters import FilterSelect

from .const import RANDOM_SEED, CONVERSION


def test_filter_f(generate_classification_data):
    """F-statistic filter: one row per feature, ranked with scores descending."""
    # deterministic uplift classification data
    np.random.seed(RANDOM_SEED)
    df, X_names = generate_classification_data()

    # run the F filter against the first treatment group
    f_imp = FilterSelect().get_importance(
        df, X_names, CONVERSION, "F", treatment_group="treatment1"
    )

    # every feature gets a row; the top row is rank 1 and has the best score
    assert len(f_imp) == len(X_names)
    assert f_imp["rank"].iloc[0] == 1
    assert f_imp["score"].iloc[0] >= f_imp["score"].iloc[1]


def test_filter_lr(generate_classification_data):
    """Likelihood-ratio filter: one row per feature, ranked with scores descending."""
    # deterministic uplift classification data
    np.random.seed(RANDOM_SEED)
    df, X_names = generate_classification_data()

    # run the LR filter against the first treatment group
    imp = FilterSelect().get_importance(
        df, X_names, CONVERSION, "LR", treatment_group="treatment1"
    )

    # every feature gets a row; the top row is rank 1 and has the best score
    assert len(imp) == len(X_names)
    assert imp["rank"].iloc[0] == 1
    assert imp["score"].iloc[0] >= imp["score"].iloc[1]


def test_filter_kl(generate_classification_data):
    """KL-divergence filter: one row per feature, ranked with scores descending."""
    # deterministic uplift classification data
    np.random.seed(RANDOM_SEED)
    df, X_names = generate_classification_data()

    # run the KL filter against the first treatment group
    imp = FilterSelect().get_importance(
        df, X_names, CONVERSION, "KL", treatment_group="treatment1"
    )

    # every feature gets a row; the top row is rank 1 and has the best score
    assert len(imp) == len(X_names)
    assert imp["rank"].iloc[0] == 1
    assert imp["score"].iloc[0] >= imp["score"].iloc[1]

0 comments on commit 87950fb

Please sign in to comment.