Merge pull request #482 from uber/add_pns_method

Add function for calculating PNS bounds
uber · Mar 29, 2022 · 2e7d08f · 2e7d08f
2 parents 548a2c5 + 8a6e8bd
commit 2e7d08f
Show file tree

Hide file tree

Showing 5 changed files with 377 additions and 0 deletions.
diff --git a/causalml/optimize/__init__.py b/causalml/optimize/__init__.py
@@ -2,3 +2,4 @@
 from .unit_selection import CounterfactualUnitSelector
 from .utils import get_treatment_costs, get_actual_value, get_uplift_best
 from .value_optimization import CounterfactualValueEstimator
+from .pns import get_pns_bounds
diff --git a/causalml/optimize/pns.py b/causalml/optimize/pns.py
@@ -0,0 +1,80 @@
+import numpy as np
+import pandas as pd
+
+
+def get_pns_bounds(data_exp, data_obs, T, Y, type='PNS'):
+    '''
+    Calculate the Tian-Pearl bounds for a probability of causation.
+
+    Args
+    ----
+    data_exp : DataFrame
+        Data from an experiment.
+    data_obs : DataFrame
+        Data from an observational study
+    T : str
+        Name of the binary treatment indicator
+    Y : str
+        Name of the binary outcome indicator
+    type : str
+        Type of probability of causation desired. Acceptable args are:
+        * 'PNS': Probability of necessary and sufficient causation
+        * 'PS': Probability of sufficient causation
+        * 'PN': Probability of necessary causation
+
+    Returns
+    -------
+    (lower_bound, upper_bound) : tuple
+        The largest of the candidate lower bounds and the smallest of the
+        candidate upper bounds for the requested probability.
+
+    Raises
+    ------
+    ValueError
+        If `type` is not one of 'PNS', 'PN' or 'PS'; if `type` is 'PN' and
+        the observational data contain no rows with T = 1 and Y = 1; or if
+        `type` is 'PS' and they contain no rows with T = 0 and Y = 0 (the
+        bounds divide by the share of those rows).
+
+    Notes
+    -----
+    Based on Equation (24) in Tian and Pearl: https://ftp.cs.ucla.edu/pub/stat_ser/r271-A.pdf
+
+    In the variable names used below, `1` and `0` denote the value taken by a binary
+    variable and `do` denotes an intervention. For example, `T1Y0` is the observational
+    P(T=1, Y=0) and `Y1doT0` is the experimental P(Y=1 | do(T=0)).
+
+    The experimental and observational data are either assumed to come from the same
+    population, or from random samples of the population. If the data are from a sample,
+    the bounds may be incorrectly calculated because the relevant quantities in the
+    Tian-Pearl equations are defined e.g. as P(YifT), not P(YifT | S) where S corresponds
+    to sample selection.
+    Bareinboim and Pearl (https://www.pnas.org/doi/10.1073/pnas.1510507113) discuss conditions
+    under which P(YifT) can be recovered from P(YifT | S).
+    '''
+
+    # Fail early with a clear message instead of an UnboundLocalError further down
+    if type not in ('PNS', 'PN', 'PS'):
+        raise ValueError(
+            "type must be one of 'PNS', 'PN' or 'PS', got {!r}".format(type)
+        )
+
+    # Probabilities calculated from observational data
+    n_obs = data_obs.shape[0]
+    obs_t = data_obs[T]
+    obs_y = data_obs[Y]
+
+    def joint(t, y):
+        # Share of observational rows with T == t and Y == y
+        return int(((obs_t == t) & (obs_y == y)).sum()) / n_obs
+
+    Y1 = obs_y.mean()
+    T1Y0 = joint(1, 0)
+    T1Y1 = joint(1, 1)
+    T0Y0 = joint(0, 0)
+    T0Y1 = joint(0, 1)
+
+    # Probabilities calculated from experimental data
+    Y1doT1 = data_exp.loc[data_exp[T] == 1, Y].mean()
+    Y1doT0 = data_exp.loc[data_exp[T] == 0, Y].mean()
+    Y0doT0 = 1 - Y1doT0
+
+    if type == 'PNS':
+
+        lb_args = [
+            0,
+            Y1doT1 - Y1doT0,
+            Y1 - Y1doT0,
+            Y1doT1 - Y1,
+        ]
+
+        ub_args = [
+            Y1doT1,
+            Y0doT0,
+            T1Y1 + T0Y0,
+            Y1doT1 - Y1doT0 + T1Y0 + T0Y1,
+        ]
+
+    elif type == 'PN':
+
+        # The PN bounds divide by P(T=1, Y=1); a zero share would silently
+        # produce inf/nan bounds, so reject it explicitly.
+        if T1Y1 == 0:
+            raise ValueError(
+                'PN bounds are undefined: data_obs has no rows with '
+                '{} == 1 and {} == 1'.format(T, Y)
+            )
+
+        lb_args = [0, (Y1 - Y1doT0) / T1Y1]
+        ub_args = [1, (Y0doT0 - T0Y0) / T1Y1]
+
+    else:  # type == 'PS'
+
+        # The PS bounds divide by P(T=0, Y=0); same reasoning as for PN.
+        if T0Y0 == 0:
+            raise ValueError(
+                'PS bounds are undefined: data_obs has no rows with '
+                '{} == 0 and {} == 0'.format(T, Y)
+            )
+
+        lb_args = [0, (Y1doT1 - Y1) / T0Y0]
+        ub_args = [1, (Y1doT1 - T1Y1) / T0Y0]
+
+    lower_bound = max(lb_args)
+    upper_bound = min(ub_args)
+
+    return lower_bound, upper_bound
diff --git a/docs/methodology.rst b/docs/methodology.rst
@@ -261,6 +261,57 @@ The counterfactual value estimation method implemented in the package predicts t
 
 where :math:`Y_w` is the probability of a favourable event (such as conversion) under a given treatment :math:`w`, :math:`v` is the value of the favourable event, :math:`cc_w` is the cost of the treatment triggered in case of a favourable event, and :math:`ic_w` is the cost associated with the treatment whether or not the outcome is favourable. This method builds upon the ideas discussed in :cite:`zhao2019uplift`.
 
+Probabilities of causation
+--------------------------
+
+A cause is said to be *necessary* for an outcome if the outcome would not have occurred in the absence of the cause. A cause is said to be *sufficient* for an outcome if the outcome would have occurred in the presence of the cause. A cause is said to be *necessary and sufficient* if both of the above two conditions hold. :cite:`tian2000probabilities` show that we can calculate bounds for the probability that a cause is of each of the above three types.
+
+To understand how the bounds for the probabilities of causation are calculated, we need special notation to represent counterfactual quantities. Let :math:`y_t` represent the proposition “:math:`y` would occur if the treatment group was set to ‘treatment’”, :math:`y^{\prime}_c` represent the proposition “:math:`y` would not occur if the treatment group was set to ‘control’”, and similarly for the remaining two combinations of the (by assumption) binary outcome and treatment variables.
+
+Then the probability that the treatment is *sufficient* for :math:`y` to occur can be defined as
+
+.. math::
+
+    PS = P(y_t \mid c, y^{\prime})
+
+This is the probability that :math:`y` would occur if the treatment was set to :math:`t` when in fact the treatment was set to control and the outcome did not occur.
+
+The probability that the treatment is *necessary* for :math:`y` to occur can be defined as
+
+.. math::
+    PN = P(y^{\prime}_c \mid t, y)
+
+This is the probability that :math:`y` would not occur if the treatment was set to control, while in actuality both :math:`y` occurs and the treatment takes place.
+
+Finally, the probability that the treatment is both necessary and sufficient is defined as 
+
+.. math::
+    PNS = P(y_t, y^{\prime}_c)
+
+and states that :math:`y` would occur if the treatment took place; and :math:`y` would not occur if the treatment did not take place. PNS is related to PN and PS as follows:
+
+.. math::
+    PNS = P(t, y)PN + P(c, y^{\prime})PS
+
+In bounding the above three quantities, we utilize observational data in addition to experimental data. The observational data is characterized in terms of the joint probabilities:
+
+.. math::
+    P_{TY} = \{P(t, y),  P(c, y), P(t, y^{\prime}), P(c, y^{\prime})\}
+
+Given this, :cite:`tian2000probabilities` use the program developed in :cite:`balke1995probabilistic` to obtain sharp bounds of the above three quantities. The main idea in this program is to turn the bounding task into a linear programming problem (for a modern implementation of their approach see `here <https://cran.r-project.org/web/packages/causaloptim/vignettes/vertexenum-speed.html>`_).
+
+Using the linear programming approach and given certain constraints together with observational data, :cite:`tian2000probabilities` find that the sharp lower bound for PNS is given by
+
+.. math::
+    \max\{0, P(y_t) - P(y_c), P(y) - P(y_c), P(y_t) - P(y)\}
+
+and the sharp upper bound is given by
+
+.. math::
+    \min\{P(y_t), P(y^{\prime}_c), P(t, y) + P(c, y^{\prime}), P(y_t) - P(y_c) + P(t, y^{\prime}) + P(c, y)\}
+
+They use a similar routine to find the bounds for PS and PN. The `get_pns_bounds()` function calculates the bounds for each of the three probabilities of causation using the results in :cite:`tian2000probabilities`.
+
 Selected traditional methods
 ----------------------------
 

diff --git a/docs/refs.bib b/docs/refs.bib
@@ -415,6 +415,23 @@ @article{zhao2020feature
   year={2020}
 }
 
+@article{tian2000probabilities,
+  title={Probabilities of causation: Bounds and identification},
+  author={Tian, Jin and Pearl, Judea},
+  journal={Annals of Mathematics and Artificial Intelligence},
+  volume={28},
+  number={1},
+  pages={287--313},
+  year={2000},
+  publisher={Springer}
+}
+
+@book{balke1995probabilistic,
+  title={Probabilistic counterfactuals: semantics, computation, and applications},
+  author={Balke, Alexander Abraham},
+  year={1995},
+  publisher={University of California, Los Angeles}
+}
 @misc{kennedy2020optimal,
       title={Optimal doubly robust estimation of heterogeneous causal effects},
       author={Edward H. Kennedy},

diff --git a/examples/necessary_and_sufficient.ipynb b/examples/necessary_and_sufficient.ipynb
@@ -0,0 +1,228 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Calculating the probabilities of necessary and sufficient causation\n",
+    "\n",
+    "Consider the causal effect of a voucher on customer conversion. We can distinguish between the following types of causation:\n",
+    "\n",
+    "* **Necessary**: If the customer doesn't get the voucher, they will not convert\n",
+    "* **Sufficient**: If the customer gets the voucher, they will convert\n",
+    "* **Necessary and sufficient**: The customer will convert if and only if they receive the voucher\n",
+    "\n",
+    "In general, we would like many interventions to be of the last type. If the voucher is not necessary for a given customer, we might be wasting money by targeting them; if the voucher is not sufficient, we may not fulfil the goal of the campaign, which is to cause customers to convert.\n",
+    "\n",
+    "[Tian and Pearl (2000)](https://ftp.cs.ucla.edu/pub/stat_ser/r271-A.pdf) provided a way to combine experimental and observational data to derive bounds for the probability of each of the above types of causation. In this notebook, we replicate the example from their paper. "
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import numpy as np\n",
+    "import pandas as pd\n",
+    "\n",
+    "from causalml.optimize import get_pns_bounds"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "[Tian and Pearl (2000, p. 306)](https://ftp.cs.ucla.edu/pub/stat_ser/r271-A.pdf) imagine a setup where we have both experimental and observational data about the efficacy of a certain drug. The experimental data looks as follows:\n",
+    "\n",
+    "|           | Treatment | Control |\n",
+    "|-----------|-----------|---------|\n",
+    "| Deaths    | 16        | 14      |\n",
+    "| Survivals | 984       | 986     |\n",
+    "\n",
+    "Therefore, based on the experiment, it looks like there isn't much of a difference in the rate of deaths in the treatment and control groups. However, in addition to the experimental data, we also have the following data that is from an observational study, i.e. a study in which we simply observe the outcomes for those who choose to use the drug vs. those who don't:\n",
+    "\n",
+    "|           | Treatment | Control |\n",
+    "|-----------|-----------|---------|\n",
+    "| Deaths    | 2         | 28      |\n",
+    "| Survivals | 998       | 972     |\n",
+    "\n",
+    "Because people self-select to use the drug, the data shown in the table is very likely confounded. However, Tian and Pearl argue that the above two datasets can be combined to obtain information that is not visible by looking at either of the datasets independently, namely the probabilities of necessary and sufficient causation (PNS). More specifically, it is possible to derive bounds for PNS by combining the two data sources. To see how, let's generate the datasets:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "num_samples = 2000\n",
+    "half = int(num_samples / 2)\n",
+    "treatment = np.tile([0, 1], half)\n",
+    "recovery = np.zeros(num_samples)\n",
+    "\n",
+    "df_rct = pd.DataFrame({'treatment': treatment, 'death': recovery})\n",
+    "df_obs = pd.DataFrame({'treatment': treatment, 'death': recovery})"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Set the label to `1' for 16 treatment and 14 control observations\n",
+    "df_rct.loc[df_rct.loc[df_rct['treatment'] == 1].sample(n=16).index, 'death'] = 1\n",
+    "df_rct.loc[df_rct.loc[df_rct['treatment'] == 0].sample(n=14).index, 'death'] = 1"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "treatment\n",
+       "0    14.0\n",
+       "1    16.0\n",
+       "Name: death, dtype: float64"
+      ]
+     },
+     "execution_count": 4,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "df_rct.groupby('treatment')['death'].sum()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Set the label to `1' for 2 treatment and 28 control observations\n",
+    "df_obs.loc[df_obs.loc[df_obs['treatment'] == 1].sample(n=2).index, 'death'] = 1\n",
+    "df_obs.loc[df_obs.loc[df_obs['treatment'] == 0].sample(n=28).index, 'death'] = 1"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "treatment\n",
+       "0    28.0\n",
+       "1     2.0\n",
+       "Name: death, dtype: float64"
+      ]
+     },
+     "execution_count": 6,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "df_obs.groupby('treatment')['death'].sum()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "With these data, we can now use the `get_pns_bounds()` function to calculate the relevant bounds. Let's do it for each of the three types of bound:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 7,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "pns_lb, pns_ub = get_pns_bounds(df_rct, df_obs, 'treatment', 'death', type='PNS')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 8,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "pn_lb, pn_ub = get_pns_bounds(df_rct, df_obs, 'treatment', 'death', type='PN')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 9,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "ps_lb, ps_ub = get_pns_bounds(df_rct, df_obs, 'treatment', 'death', type='PS')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 10,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "\n",
+      "Bounds for the probability of necessary causation: [1.0, 1]\n",
+      "Bounds for the probability of sufficient causation: [0.002, 0.031]\n",
+      "Bounds for the probability of necessary and sufficient causation: [0.002, 0.016]\n",
+      "\n"
+     ]
+    }
+   ],
+   "source": [
+    "print(f'''\n",
+    "Bounds for the probability of necessary causation: [{round(pn_lb, 3)}, {round(pn_ub, 3)}]\n",
+    "Bounds for the probability of sufficient causation: [{round(ps_lb, 3)}, {round(ps_ub, 3)}]\n",
+    "Bounds for the probability of necessary and sufficient causation: [{round(pns_lb, 3)}, {round(pns_ub, 3)}]\n",
+    "''')"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "So, by combining experimental and observational data, we arrive at the conclusion that the participants who died and took the drug would have definitely survived without taking the drug. Those who survived and did not take the drug would have had between 0.2% and 3.1% risk of dying had they taken the drug. This illustrates how combining experimental and observational data can lead to additional insights compared to analysing either data source separately."
+   ]
+  }
+ ],
+ "metadata": {
+  "interpreter": {
+   "hash": "1b5c1e8782fc5f664c4fe135feb4dd5f062247c917b91ce86cc8a320dfc2f525"
+  },
+  "kernelspec": {
+   "display_name": "Python 3.10.1 64-bit ('acme': conda)",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.9.5"
+  },
+  "orig_nbformat": 4
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}