python version of ABBA stats library
showard committed Sep 28, 2012
1 parent a3a0be0 commit d4bd243
Showing 9 changed files with 481 additions and 0 deletions.
4 changes: 4 additions & 0 deletions python/.gitignore
@@ -0,0 +1,4 @@
*.pyc
MANIFEST
build
dist
1 change: 1 addition & 0 deletions python/CHANGES.txt
@@ -0,0 +1 @@
v0.1.0, 2012/09/27 -- Initial release.
16 changes: 16 additions & 0 deletions python/LICENSE.txt
@@ -0,0 +1,16 @@
Copyright (c) 2012 Thumbtack, Inc.

Permission is hereby granted, free of charge, to any person obtaining a copy of this software and
associated documentation files (the "Software"), to deal in the Software without restriction,
including without limitation the rights to use, copy, modify, merge, publish, distribute,
sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all copies or substantial
portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT
NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES
OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
2 changes: 2 additions & 0 deletions python/MANIFEST.in
@@ -0,0 +1,2 @@
include *.txt
recursive-include docs *.txt
33 changes: 33 additions & 0 deletions python/abba/__init__.py
@@ -0,0 +1,33 @@
'''Tools for statistical analysis of A/B test results.

ABBA provides several statistical tools for analysis of binomial data, typically resulting from A/B
tests:

* Wald and Agresti-Coull confidence intervals on binomial proportions
* Confidence intervals on the difference and ratio of two binomial proportions
* Hypothesis tests for inequality of two binomial proportions
* Multiple test correction for control of familywise error rate

Some simple example usage::

>>> import abba.stats
>>> abba.stats.confidence_interval_on_proportion(
... num_successes=50, num_trials=200, confidence_level=0.99)
ValueWithInterval(value=0.25, lower_bound=0.17962262748069852, upper_bound=0.33643200973247306)
>>> experiment = abba.stats.Experiment(
... num_trials=5, baseline_num_successes=50, baseline_num_trials=200)
>>> results = experiment.get_results(num_successes=70, num_trials=190)
>>> results.relative_improvement
ValueWithInterval(value=0.4736842105263157, lower_bound=-0.014130868125315277, upper_bound=0.90421878236700903)
>>> results.two_tailed_p_value
0.047886616311815511

ABBA requires SciPy for underlying statistical functions.

For more info, see the docstrings, unit tests, and the ABBA website (including an interactive
Javascript version) at http://www.thumbtack.com/labs/abba/.
'''

__version__ = '0.1.0'
309 changes: 309 additions & 0 deletions python/abba/stats.py
@@ -0,0 +1,309 @@
# Copyright (c) 2012 Thumbtack, Inc.

import collections
import math

from scipy import stats

def get_z_critical_value(alpha, two_tailed=True):
"""
Returns the z critical value for a particular alpha = 1 - confidence level. By default returns
a two-tailed z-value, meaning the actual tail probability is alpha / 2.
"""
if two_tailed:
alpha /= 2
return stats.distributions.norm.ppf(1 - alpha)
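# Illustrative values (rounded): get_z_critical_value(0.05) is the familiar
# two-tailed 1.96, since P(Z > 1.96) = 0.025 for a standard normal Z, and
# get_z_critical_value(0.05, two_tailed=False) is the one-tailed 1.64, since
# P(Z > 1.64) = 0.05.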

# a value with confidence interval bounds (not necessarily centered around the point estimate)
ValueWithInterval = collections.namedtuple(
'ValueWithInterval',
('value', 'lower_bound', 'upper_bound'),
)

class ValueWithError(object):
"""
A value with standard error, from which a confidence interval can be derived.
"""
def __init__(self, value, error):
self.value = value
self.error = error

def confidence_interval_width(self, z_critical_value):
"""
z_critical_value should be the value at which the right-tail probability for a standard
normal distribution equals half the desired alpha = 1 - confidence level:

P(Z > z_critical_value) = alpha / 2

where Z is an N(0, 1) random variable. Use get_z_critical_value(), or see
http://en.wikipedia.org/wiki/Standard_normal_table.
"""
return z_critical_value * self.error

def value_with_interval(self, z_critical_value, estimated_value=None):
width = self.confidence_interval_width(z_critical_value)
return ValueWithInterval(
value=estimated_value if estimated_value is not None else self.value,
lower_bound=self.value - width,
upper_bound=self.value + width,
)
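# Illustrative example with made-up numbers: a value of 0.25 with standard
# error 0.03 at the two-tailed 95% critical value 1.96 has interval half-width
# 1.96 * 0.03 ~= 0.059, so ValueWithError(0.25, 0.03).value_with_interval(1.96)
# yields roughly ValueWithInterval(value=0.25, lower_bound=0.191, upper_bound=0.309).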

class BinomialDistribution(object):
def __init__(self, num_trials, probability):
self.num_trials = num_trials
self.probability = probability
self.expectation = num_trials * probability
self.standard_deviation = math.sqrt(self.expectation * (1 - probability))
self._binomial = stats.binom(num_trials, probability)

def mass(self, count):
return self._binomial.pmf(count)

def cdf(self, count):
return self._binomial.cdf(count)

def survival(self, count):
return 1 - self.cdf(count)

def inverse_cdf(self, probability):
return self._binomial.ppf(probability)

def inverse_survival(self, probability):
return self._binomial.isf(probability)
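# Illustrative sanity check with made-up numbers: BinomialDistribution(10, 0.5)
# has expectation 5, standard deviation sqrt(2.5) ~= 1.58, mass(5) ~= 0.246,
# and cdf(5) ~= 0.623, matching scipy.stats.binom(10, 0.5) directly.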

class Proportion(object):
def __init__(self, num_successes, num_trials):
"""
Represents a binomial proportion with num_successes successful samples out of num_trials
total.
"""
self.num_successes = num_successes
self.num_trials = num_trials

def p_estimate(self, z_critical_value=0):
"""
Generate an adjusted estimate and error using the "Agresti-Coull Interval", see
http://en.wikipedia.org/wiki/Binomial_proportion_confidence_interval#Agresti-Coull_Interval.

The estimated value is an adjusted best estimate for the actual probability. For example, if
0 successes were observed out of 10 samples, it's unlikely the actual probability is zero,
so the adjusted estimate will be slightly above zero.

A z_critical_value of zero yields the ordinary Wald interval.
"""
adjusted_num_trials = float(self.num_trials + z_critical_value**2)
interval_center = (self.num_successes + z_critical_value**2 / 2) / adjusted_num_trials
standard_error = math.sqrt(interval_center * (1 - interval_center) / adjusted_num_trials)
return ValueWithError(interval_center, standard_error)
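# Worked example of the docstring's zero-successes case (illustrative numbers):
# with num_successes=0, num_trials=10, and z_critical_value=1.96,
# adjusted_num_trials ~= 13.84 and interval_center ~= 1.92 / 13.84 ~= 0.139,
# so the adjusted estimate is indeed slightly above zero (standard error ~= 0.093).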

def mixed_estimate(self, z_critical_value):
"""
Returns a ValueWithInterval with an MLE value and upper/lower bounds from the Agresti-Coull
interval.
"""
return (
self.p_estimate(z_critical_value=z_critical_value)
.value_with_interval(z_critical_value, estimated_value=self.p_estimate().value)
)

def confidence_interval_on_proportion(num_successes, num_trials, confidence_level=0.95):
'''Convenience function with a more straightforward interface.'''
return Proportion(num_successes, num_trials).mixed_estimate(
get_z_critical_value(1 - confidence_level)
)

class ProportionComparison(object):
def __init__(self, baseline, variation):
self.baseline = baseline
self.variation = variation

def difference_estimate(self, z_critical_value):
"""
Generate an estimate of the difference in success rates between the variation and the
baseline.
"""
baseline_p = self.baseline.p_estimate(z_critical_value=z_critical_value)
variation_p = self.variation.p_estimate(z_critical_value=z_critical_value)
difference = variation_p.value - baseline_p.value
standard_error = math.sqrt(baseline_p.error ** 2 + variation_p.error ** 2)
return ValueWithError(difference, standard_error)
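# The errors of the two independent estimates combine in quadrature; e.g. with
# made-up errors of 0.03 and 0.04, the difference has standard error
# sqrt(0.03**2 + 0.04**2) = 0.05.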

def difference_ratio(self, z_critical_value):
"""
Return the difference in success rates as a proportion of the baseline success rate.
"""
baseline_value = self.baseline.p_estimate(z_critical_value=z_critical_value).value
difference = self.difference_estimate(z_critical_value=z_critical_value)
ratio = difference.value / baseline_value
error = difference.error / baseline_value
return ValueWithError(ratio, error)

def z_test(self, z_multiplier=1):
"""
Perform a large-sample z-test of null hypothesis H0: p_baseline == p_variation against
alternative hypothesis H1: p_baseline < p_variation. Return the (one-tailed) p-value.
z_multiplier: test z-value will be multiplied by this factor before computing a p-value.
See http://en.wikipedia.org/wiki/Statistical_hypothesis_testing#Common_test_statistics,
"Two-proportion z-test, pooled for d0 = 0".
"""
pooled_stats = Proportion(
self.baseline.num_successes + self.variation.num_successes,
self.baseline.num_trials + self.variation.num_trials,
)
pooled_p_value = pooled_stats.p_estimate().value
pooled_variance_of_difference = (
pooled_p_value * (1 - pooled_p_value)
* (1.0 / self.baseline.num_trials + 1.0 / self.variation.num_trials)
)
pooled_standard_error_of_difference = math.sqrt(pooled_variance_of_difference)
test_z_value = self.difference_estimate(0).value / pooled_standard_error_of_difference
adjusted_p_value = stats.distributions.norm.sf(test_z_value * z_multiplier)
return adjusted_p_value
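# Worked example using the module docstring's data (baseline 50/200,
# variation 70/190): pooled p ~= 120/390 ~= 0.308, pooled standard error
# ~= sqrt(0.308 * 0.692 * (1/200 + 1/190)) ~= 0.047, test z-value
# ~= (0.368 - 0.250) / 0.047 ~= 2.5, one-tailed p-value ~= 0.006
# (uncorrected for multiple testing, unlike iterated_test below).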

def _binomial_coverage_interval(self, distribution, coverage_alpha):
"""
For the given binomial distribution, compute an interval that covers at least (1 -
coverage_alpha) of the total probability mass, centered at the expectation (unless we're at
the boundary). Uses the normal approximation.
"""
if distribution.num_trials < 1000:
# don't even bother trying to optimize for small-ish sample sizes
return (0, distribution.num_trials)
else:
return (
int(math.floor(distribution.inverse_cdf(coverage_alpha / 2))),
int(math.ceil(distribution.inverse_survival(coverage_alpha / 2))),
)
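# Rough illustration with made-up numbers: for num_trials = 1,000,000,
# probability = 0.3, and coverage_alpha = 1e-5, the standard deviation is
# sqrt(210000) ~= 458, so the interval spans roughly the expectation
# +/- 4.4 * 458 ~= 2000 counts: about 4000 terms to sum instead of a million.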

def _probability_union(self, probability, num_tests):
"""
Given the probability of an event, compute the probability that it happens at least once in
num_tests independent tests. This is used to adjust a p-value for multiple comparisons.
When used to adjust alpha instead, this is called a Sidak correction (the logic is the same,
the formula is inverted):
http://en.wikipedia.org/wiki/Bonferroni_correction#.C5.A0id.C3.A1k_correction
"""
return 1 - (1 - probability)**num_tests
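# Example: a per-test p-value of 0.05 over num_tests = 3 independent tests
# becomes 1 - (1 - 0.05)**3 ~= 0.143.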

def iterated_test(self, num_tests, coverage_alpha, improvement_only=False):
"""
Compute a p-value testing null hypothesis H0: p_baseline == p_variation against alternative
hypothesis H1: p_baseline != p_variation by summing p-values conditioned on individual
baseline success counts. This provides a more accurate correction for multiple testing but
scales like O(sqrt(self.baseline.num_trials)), so can eventually get slow for very large
values.

Lower coverage_alpha increases accuracy at the cost of longer runtime. Roughly, the result
will be accurate to within no more than coverage_alpha (but this ignores error due to the
normal approximation, so isn't guaranteed).

If improvement_only=True, computes the p-value for the alternative hypothesis
H1: p_baseline < p_variation instead.
"""
observed_delta = self.variation.p_estimate().value - self.baseline.p_estimate().value
if observed_delta == 0 and not improvement_only:
# a trivial case that the code below does not handle well
return 1

pooled_proportion = (
(self.baseline.num_successes + self.variation.num_successes)
/ float(self.baseline.num_trials + self.variation.num_trials)
)
variation_distribution = BinomialDistribution(self.variation.num_trials, pooled_proportion)
baseline_distribution = BinomialDistribution(self.baseline.num_trials, pooled_proportion)

baseline_limits = self._binomial_coverage_interval(baseline_distribution, coverage_alpha)
p_value = 0
for baseline_successes in xrange(baseline_limits[0], baseline_limits[1] + 1):
baseline_proportion = 1.0 * baseline_successes / self.baseline.num_trials
if improvement_only:
lower_trial_count = -1
upper_trial_count = math.ceil(
(baseline_proportion + observed_delta) * self.variation.num_trials
)
else:
observed_absolute_delta = abs(observed_delta)
lower_trial_count = math.floor(
(baseline_proportion - observed_absolute_delta) * self.variation.num_trials
)
upper_trial_count = math.ceil(
(baseline_proportion + observed_absolute_delta) * self.variation.num_trials
)

# p-value of variation success counts "at least as extreme" for this particular
# baseline success count
p_value_at_baseline = (
variation_distribution.cdf(lower_trial_count)
+ variation_distribution.survival(upper_trial_count - 1)
)

# this is exact because we're conditioning on the baseline count, so the multiple
# tests are independent.
adjusted_p_value = self._probability_union(p_value_at_baseline, num_tests)

baseline_probability = baseline_distribution.mass(baseline_successes)
p_value += baseline_probability * adjusted_p_value

# the remaining baseline values we didn't cover contribute less than coverage_alpha to the
# sum, so adding that amount gives us a conservative upper bound.
return p_value + coverage_alpha

Results = collections.namedtuple(
'Results',
(
'num_successes',
'num_trials',
'proportion', # ValueWithInterval
'improvement', # ValueWithInterval
'relative_improvement', # ValueWithInterval
'two_tailed_p_value', # two-tailed p-value for trial != baseline
'improvement_one_tailed_p_value', # one-tailed p-value for trial > baseline
),
)

class Experiment(object):
P_VALUE_PRECISION = 1e-5

def __init__(self, num_trials, baseline_num_successes, baseline_num_trials,
confidence_level=0.95):
"""
num_trials: number of variations (trial groups) that will be compared to the baseline;
used to correct for multiple testing
confidence_level: used for all confidence intervals generated
"""
self.num_comparisons = max(1, num_trials)
self._baseline = Proportion(baseline_num_successes, baseline_num_trials)
alpha = (1 - confidence_level) / self.num_comparisons # Bonferroni correction
self._z_critical_value = get_z_critical_value(alpha)

def get_baseline_proportion(self):
return self._baseline.mixed_estimate(self._z_critical_value)

def get_results(self, num_successes, num_trials):
trial = Proportion(num_successes, num_trials)
comparison = ProportionComparison(self._baseline, trial)
return Results(
num_successes=num_successes,
num_trials=num_trials,
proportion=trial.mixed_estimate(self._z_critical_value),
improvement=comparison.difference_estimate(self._z_critical_value)
.value_with_interval(
self._z_critical_value,
estimated_value=comparison.difference_estimate(0).value,
),
relative_improvement=comparison.difference_ratio(self._z_critical_value)
.value_with_interval(
self._z_critical_value,
estimated_value=comparison.difference_ratio(0).value,
),
two_tailed_p_value=comparison.iterated_test(
self.num_comparisons,
self.P_VALUE_PRECISION,
),
improvement_one_tailed_p_value=comparison.iterated_test(
self.num_comparisons,
self.P_VALUE_PRECISION,
improvement_only=True,
),
)
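# Illustrative note on the correction, using the module docstring's example
# (num_trials=5 variations at the default confidence_level=0.95): the
# Bonferroni-corrected alpha is 0.05 / 5 = 0.01 per comparison, so every
# interval uses the two-tailed critical value get_z_critical_value(0.01) ~= 2.58
# instead of the uncorrected 1.96.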
Empty file added python/abba/test/__init__.py
Empty file.