Merge pull request #240 from zalando/chi-square-restructured

Changed chi-square test, removed frequencies computation
zalando · Sep 10, 2018 · 19b91f3 · 19b91f3
2 parents 92e7fc7 + 3c39814
commit 19b91f3
Show file tree

Hide file tree

Showing 3 changed files with 120 additions and 126 deletions.
diff --git a/expan/core/experiment.py b/expan/core/experiment.py
@@ -306,49 +306,36 @@ def _quantile_filtering(self, data, kpis, percentile, threshold_type):
             flags = flags | data[column].apply(method_table[threshold_type])
         return flags
 
-
-    def chi_square_test_result_and_statistics(self, variant_column, weights, min_counts=5, alpha=0.05):
-        """ Tests the consistency of variant split with the hypothesized distribution.
-        
-        :param variant_column: variant column from the input data frame
-        :param weights: dict with variant names as keys, weights as values
-                        ({<variant_name>:<weight>, ...}
-        :param min_counts: minimum number of observed and expected frequencies (should be at least 5), see 
-                            http://docs.scipy.org/doc/scipy-0.16.1/reference/generated/scipy.stats.chisquare.html
-        :param alpha: significance level, 0.05 by default
-        :return: True(if split is consistent with the given split) or
-                 False(if split is not consistent with the given split)
-        :rtype:  Boolean, float, float, pd.Series, pd.Series
+    def run_goodness_of_fit_test(self, observed_freqs, expected_freqs, alpha=0.01, min_counts=5):
+        """ Checks the validity of observed and expected counts and runs chi-square test for goodness of fit.
+
+        :param observed_freqs: observed frequencies
+        :type  observed_freqs: pd.Series
+        :param expected_freqs: expected frequencies
+        :type  expected_freqs: pd.Series
+        :param alpha: significance level
+        :type  alpha: float
+        :param min_counts: minimum number of observations to run chi-square test
+        :type  min_counts: int
+        :return split_is_unbiased: False if split is biased and True if split is correct
+                p_value: corresponding chi-square p-value
+        :rtype: bool, float
         """
-        if not hasattr(variant_column, '__len__'):
-            raise ValueError("Variant split check was cancelled since input variant column is empty or doesn't exist.")
-        if not hasattr(weights, '__len__'):
-            raise ValueError("Variant split check was cancelled since input weights are empty or doesn't exist.")
-        if len(weights) <= 1 or len(variant_column) <= 1:
-            raise ValueError("Variant split check was cancelled since input weights or the number if categories "
-                             "is less than 2.")
-
-        # Count number of observations per each variant
-        variant_column = pd.Series(variant_column).dropna(axis=0)
-        observed_freqs = variant_column.value_counts()
+
+        if not isinstance(observed_freqs, pd.Series) or not isinstance(expected_freqs, pd.Series):
+            raise ValueError("Observed and expected frequencies should be of type Series.")
+        if observed_freqs.empty or expected_freqs.empty:
+            raise ValueError("Variant split check was cancelled since expected or observed frequencies are empty.")
 
         # Ensure at least a frequency of min_counts at every location in observed_counts.
         # It's recommended not to conduct the test if the frequency in any category is less than min_counts.
-        if len(observed_freqs[observed_freqs < min_counts]) >= 1:
-            raise ValueError("Chi-square test is not valid for small expected or observed frequencies.")
-
-        # If there are less than 2 categories left after dropping counts less than 5 we can't conduct the test.
-        if len(observed_freqs) < 2:
-            raise ValueError("If the number of categories is less than 2 Chi-square test is not applicable.")
-
-        # Calculate expected counts given corresponding weights,
-        # weights are filtered out of categories which were dropped before.
-        total_count = observed_freqs.sum()
-        weights = {k: v for (k, v) in weights.items() if k in observed_freqs.index.values}
-        expected_freqs = pd.Series(weights)
-        expected_freqs *= total_count
-
-        # Compute chi-square and p-value statistics
-        chi_square_val, p_val = statx.chi_square(observed_freqs.sort_index(), expected_freqs.sort_index())
-
-        return p_val >= alpha, p_val, chi_square_val, observed_freqs, expected_freqs
+        valid_observed_freqs = observed_freqs[observed_freqs > min_counts]
+        valid_expected_freqs = expected_freqs.filter(valid_observed_freqs.keys())
+
+        if len(valid_observed_freqs) == len(valid_expected_freqs) and len(valid_observed_freqs) >= 2:
+            _, p_value = statx.chi_square(valid_observed_freqs.sort_index(), valid_expected_freqs.sort_index())
+            split_is_unbiased = p_value >= alpha
+        else:
+            raise ValueError("Variant split check was cancelled since observed or expected frequencies "
+                             "are less than 2.")
+        return split_is_unbiased, p_value
diff --git a/expan/core/statistics.py b/expan/core/statistics.py
@@ -5,7 +5,7 @@
 
 import numpy as np
 import pandas as pd
-from scipy import stats
+import scipy
 from expan.core.results import BaseTestStatistics, SampleStatistics, SimpleTestStatistics
 
 logger = logging.getLogger(__name__)
@@ -210,7 +210,7 @@ def estimate_sample_size(x, mde, r, alpha=0.05, beta=0.2):
     if r <= 0:
         raise ValueError("Variant split ratio needs to be higher than 0.")
 
-    ppf = stats.norm.ppf
+    ppf = scipy.stats.norm.ppf
     c1 = (ppf(1.0 - alpha/2.0) - ppf(beta))**2
     c2 = (1.0 + r) * c1 * (1.0 + 1.0 / r)
     return c2 * x.var() / (mde * x.mean())**2
@@ -485,9 +485,9 @@ def normal_difference(mean1, std1, n1, mean2, std2, n2, percentiles=[2.5, 97.5],
 
     # Mapping percentiles via standard error
     if relative:
-        return dict([(round(p, 5), stats.t.ppf(p / 100.0, df=d_free) * st_error) for p in percentiles])
+        return dict([(round(p, 5), scipy.stats.t.ppf(p / 100.0, df=d_free) * st_error) for p in percentiles])
     else:
-        return dict([(round(p, 5), mean + stats.t.ppf(p / 100.0, df=d_free) * st_error) for p in percentiles])
+        return dict([(round(p, 5), mean + scipy.stats.t.ppf(p / 100.0, df=d_free) * st_error) for p in percentiles])
 
 
 def compute_statistical_power_from_samples(x, y, alpha=0.05):
@@ -503,7 +503,7 @@ def compute_statistical_power_from_samples(x, y, alpha=0.05):
     :return: statistical power---the probability of a test to detect an effect if the effect actually exists
     :rtype: float
     """
-    z_1_minus_alpha = stats.norm.ppf(1 - alpha/2.)
+    z_1_minus_alpha = scipy.stats.norm.ppf(1 - alpha/2.)
     _x = np.array(x, dtype=float)
     _x = _x[~np.isnan(_x)]
     _y = np.array(y, dtype=float)
@@ -553,7 +553,7 @@ def compute_statistical_power(mean1, std1, n1, mean2, std2, n2, z_1_minus_alpha)
 
     tmp = (n1 * n2 * effect_size**2) / ((n1 + n2) * std**2)
     z_beta = z_1_minus_alpha - np.sqrt(tmp)
-    beta = stats.norm.cdf(z_beta)
+    beta = scipy.stats.norm.cdf(z_beta)
     power = 1 - beta
     return power
 
@@ -618,12 +618,12 @@ def compute_p_value(mean1, std1, n1, mean2, std2, n2):
         t = np.sign(mean_diff) * 1000
     else:
         t = mean_diff / st_error
-    p = stats.t.cdf(-abs(t), df=d_free) * 2
+    p = scipy.stats.t.cdf(-abs(t), df=d_free) * 2
     return p
 
 
 def chi_square(observed_freqs, expected_freqs, ddof=0):
-    """ Compute chi-square statistics and p-values given observed and expected frequencies and degrees of freedom. 
+    """ Computes chi-square statistics and p-values given observed and expected frequencies and degrees of freedom.
 
     :param observed_freqs: observed frequencies 
     :type  observed_freqs: pd.Series or array-like
@@ -635,6 +635,6 @@ def chi_square(observed_freqs, expected_freqs, ddof=0):
     :return: chi-square statistics and p-value
     :rtype:  float, float
     """
-    chi_square_val, p_val = stats.chisquare(f_obs=observed_freqs, f_exp=expected_freqs, ddof=ddof, axis=None)
+    chi_square_val, p_val = scipy.stats.chisquare(f_obs=observed_freqs, f_exp=expected_freqs, ddof=ddof, axis=None)
 
     return chi_square_val, p_val
diff --git a/tests/tests_core/test_experiment.py b/tests/tests_core/test_experiment.py
@@ -360,108 +360,115 @@ def test_outlier_filtering_derived_kpi(self):
         )
         self.assertIn('derived_kpi', data.columns)
 
-    def test_chi_square_test_result_and_statistics_same_weights(self):
+    def test_run_goodness_of_fit_test_true(self):
         exp = self.getExperiment()
-        data = ['A'] * 23 + ['B'] * 18 + ['C'] * 17 + ['D'] * 19 + ['E'] * 23
-        weights = {'A':  0.2, 'B':  0.2, 'C':  0.2, 'D':  0.2, 'E':  0.2}
-        result = exp.chi_square_test_result_and_statistics(data, weights)
+        observed_freqs = pd.Series([230, 190, 190, 190, 230], ['A', 'B', 'C', 'D', 'E'])
+        expected_freqs = pd.Series([206, 206, 206, 206, 206], ['A', 'B', 'C', 'D', 'E'])
+        result = exp.run_goodness_of_fit_test(observed_freqs, expected_freqs)
         self.assertEqual(result[0], True)
-        self.assertAlmostEqual(result[1], 0.8087921354109989)
-        self.assertAlmostEqual(result[2], 1.6)
-        self.assertEqual(result[3]['A'], 23)
-        self.assertEqual(result[3]['B'], 18)
-        self.assertEqual(result[3]['C'], 17)
-        self.assertEqual(result[3]['D'], 19)
-        self.assertEqual(result[3]['E'], 23)
+        self.assertAlmostEqual(result[1], 0.05357161207695437)
 
-        self.assertTrue(all([val == 20 for val in result[4]]))
-
-    def test_chi_square_test_result_and_statistics_different_weights(self):
+    def test_run_goodness_of_fit_test_false(self):
         exp = self.getExperiment()
-        data = ['A'] * 23 + ['B'] * 18 + ['C'] * 17 + ['D'] * 19 + ['E'] * 23
-        weights = {'A':  0.25, 'B':  0.15, 'C':  0.10, 'D':  0.40, 'E':  0.10}
-        result = exp.chi_square_test_result_and_statistics(data, weights)
+        observed_freqs = pd.Series([230, 180, 170, 190, 230], ['A', 'B', 'C', 'D', 'E'])
+        expected_freqs = pd.Series([250, 150, 100, 400, 100], ['A', 'B', 'C', 'D', 'E'])
+        result = exp.run_goodness_of_fit_test(observed_freqs, expected_freqs)
         self.assertEqual(result[0], False)
-        self.assertAlmostEqual(result[1], 9.064563321754584e-07)
-        self.assertAlmostEqual(result[2], 33.585)
-        self.assertEqual(len(result[3]), 5)
-        self.assertEqual(len(result[4]), 5)
+        self.assertAlmostEqual(result[1], 1.5123889771594655e-147)
 
-    def test_chi_square_test_result_and_statistics_2_categories(self):
+    def test_run_goodness_of_fit_test_2_variants(self):
         exp = self.getExperiment()
-        data = ['A'] * 17 + ['B'] * 17
-        weights = {'A': 0.5, 'B': 0.5}
-        result = exp.chi_square_test_result_and_statistics(data, weights)
+        observed_freqs = pd.Series([170, 170], ['A', 'B'])
+        expected_freqs = pd.Series([170, 170], ['A', 'B'])
+        result = exp.run_goodness_of_fit_test(observed_freqs, expected_freqs)
         self.assertEqual(result[0], True)
         self.assertAlmostEqual(result[1], 1.0)
-        self.assertAlmostEqual(result[2], 0.0)
-        self.assertEqual(result[3]['A'], 17)
-        self.assertEqual(result[4]['A'], 17)
-        self.assertEqual(result[3]['B'], 17)
-        self.assertEqual(result[4]['B'], 17)
 
-    def test_chi_square_test_result_and_statistics_NaN_data(self):
+    def test_run_goodness_of_fit_test_NaN_data(self):
         exp = self.getExperiment()
-        data = ['A'] * 17 + [np.nan] * 17
-        weights = {'A': 0.5, 'B': 0.5}
-        with self.assertRaises(ValueError):
-            exp.chi_square_test_result_and_statistics(data, weights)
+        observed_freqs = pd.Series([170, 170], ['A', np.nan])
+        expected_freqs = pd.Series([170, 170], ['A', 'B'])
+        with self.assertRaisesRegexp(ValueError, "Variant split check was cancelled since observed "
+                                                 "or expected frequencies are less than 2."):
+            exp.run_goodness_of_fit_test(observed_freqs, expected_freqs)
 
-    def test_chi_square_test_result_and_statistics_counts_less_5_2_categories(self):
+    def test_run_goodness_of_fit_test_1_variant_after_filter(self):
         exp = self.getExperiment()
-        data = ['A'] * 17 + ['B'] * 2 + ['C'] * 3
-        weights = {'A': 0.33, 'B': 0.33, 'C': 0.33}
-        with self.assertRaises(ValueError):
-            exp.chi_square_test_result_and_statistics(data, weights)
+        observed_freqs = pd.Series([180, 1, 2], ['A', 'B', 'C'])
+        expected_freqs = pd.Series([170, 61, 61], ['A', 'B', 'C'])
+        with self.assertRaisesRegexp(ValueError, "Variant split check was cancelled since observed "
+                                                 "or expected frequencies are less than 2."):
+            exp.run_goodness_of_fit_test(observed_freqs, expected_freqs)
 
-    def test_chi_square_test_result_and_statistics_counts_less_5_1_category(self):
+    def test_test_run_goodness_of_fit_test_2_variants_after_filter(self):
         exp = self.getExperiment()
-        data = ['A'] * 8 + ['B'] * 2 + ['C'] * 14
-        weights = {'A': 0.33, 'B': 0.33, 'C': 0.33}
-        with self.assertRaises(ValueError):
-            exp.chi_square_test_result_and_statistics(data, weights)
+        observed_freqs = pd.Series([180, 184, 2], ['A', 'B', 'C'])
+        expected_freqs = pd.Series([122, 122, 122], ['A', 'B', 'C'])
+        result = exp.run_goodness_of_fit_test(observed_freqs, expected_freqs)
+        self.assertEqual(result[0], False)
+        self.assertAlmostEqual(result[1], 1.5123889771594655e-14)
 
-    def test_chi_square_test_result_and_statistics_one_category(self):
+    def test_test_run_goodness_of_fit_test_one_valid_variant(self):
         exp = self.getExperiment()
-        data = ['A'] * 16
-        weights = {'A': 0.5}
-        with self.assertRaises(ValueError):
-            exp.chi_square_test_result_and_statistics(data, weights)
+        observed_freqs = pd.Series([170, 170], ['A', 'C'])
+        expected_freqs = pd.Series([170, 170], ['A', 'B'])
+        with self.assertRaisesRegexp(ValueError, "Variant split check was cancelled since observed "
+                                                 "or expected frequencies are less than 2."):
+            exp.run_goodness_of_fit_test(observed_freqs, expected_freqs)
 
-    def test_chi_square_test_result_and_statistics_empty_weights(self):
+    def test_test_run_goodness_of_fit_test_one_variant(self):
         exp = self.getExperiment()
-        data = ['A'] * 16
-        weights = {}
-        with self.assertRaises(ValueError):
-            exp.chi_square_test_result_and_statistics(data, weights)
+        observed_freqs = pd.Series([180], ['A'])
+        expected_freqs = pd.Series([170], ['A'])
+        with self.assertRaisesRegexp(ValueError, "Variant split check was cancelled since observed "
+                                                 "or expected frequencies are less than 2."):
+            exp.run_goodness_of_fit_test(observed_freqs, expected_freqs)
 
-    def test_chi_square_test_result_and_statistics_no_categories(self):
+    def test_test_run_goodness_of_fit_test_no_expected_freqs(self):
         exp = self.getExperiment()
-        data = []
-        weights = {'A': 0.5}
-        with self.assertRaises(ValueError):
-            exp.chi_square_test_result_and_statistics(data, weights)
+        observed_freqs = pd.Series([180], ['A'])
+        expected_freqs = pd.Series([], [])
+        with self.assertRaisesRegexp(ValueError, "Variant split check was cancelled since expected "
+                                                 "or observed frequencies are empty."):
+            exp.run_goodness_of_fit_test(observed_freqs, expected_freqs)
 
-    def test_chi_square_test_result_and_statistics_insufficient_weights(self):
+    def test_test_run_goodness_of_fit_test_unequal_expected_observed_freqs(self):
         exp = self.getExperiment()
-        data = ['A'] * 16 + ['B'] * 15
-        weights = {'A': 0.5}
-        with self.assertRaises(ValueError):
-            exp.chi_square_test_result_and_statistics(data, weights)
+        observed_freqs = pd.Series([180, 170], ['A', 'B'])
+        expected_freqs = pd.Series([180], ['A'])
+        with self.assertRaisesRegexp(ValueError, "Variant split check was cancelled since observed "
+                                                 "or expected frequencies are less than 2."):
+            exp.run_goodness_of_fit_test(observed_freqs, expected_freqs)
 
-    def test_chi_square_test_result_and_statistics_None_inputs(self):
+    def test_test_run_goodness_of_fit_test_None_freqs(self):
         exp = self.getExperiment()
-        data = None
-        weights = None
-        with self.assertRaises(ValueError):
-            exp.chi_square_test_result_and_statistics(data, weights)
+        observed_freqs = None
+        expected_freqs = None
+        with self.assertRaisesRegexp(ValueError, "Observed and expected frequencies should be of type Series."):
+            exp.run_goodness_of_fit_test(observed_freqs, expected_freqs)
 
-    def test_chi_square_test_result_and_statistics_empty_inputs(self):
+    def test_test_run_goodness_of_fit_test_empty_input(self):
         exp = self.getExperiment()
-        data = []
-        weights = {}
-        with self.assertRaises(ValueError):
-            exp.chi_square_test_result_and_statistics(data, weights)
+        observed_freqs = pd.Series()
+        expected_freqs = pd.Series()
+        with self.assertRaisesRegexp(ValueError, "Variant split check was cancelled since expected "
+                                                 "or observed frequencies are empty."):
+            exp.run_goodness_of_fit_test(observed_freqs, expected_freqs)
+
+    def test_test_run_goodness_of_fit_test_incorrect_input_1(self):
+        exp = self.getExperiment()
+        observed_freqs = {'A': 45, 'B': 35}
+        expected_freqs = {'A': 45, 'B': 35}
+        with self.assertRaisesRegexp(ValueError, "Observed and expected frequencies should be of type Series."):
+            exp.run_goodness_of_fit_test(observed_freqs, expected_freqs)
+
+    def test_test_run_goodness_of_fit_test_incorrect_input_2(self):
+        exp = self.getExperiment()
+        observed_freqs = {'A': 45, 'B': 35}
+        expected_freqs = ['A', 'B', 'C']
+        with self.assertRaisesRegexp(ValueError, "Observed and expected frequencies should be of type Series."):
+            exp.run_goodness_of_fit_test(observed_freqs, expected_freqs)
+
 
 class HelperMethodsTestCases(ExperimentTestCase):
     """ Test other helper methods. """