Skip to content

Commit

Permalink
Merge pull request #240 from zalando/chi-square-restructured
Browse files Browse the repository at this point in the history
Changed chi-square test, removed frequencies computation
  • Loading branch information
daryadedik committed Sep 10, 2018
2 parents 92e7fc7 + 3c39814 commit 19b91f3
Show file tree
Hide file tree
Showing 3 changed files with 120 additions and 126 deletions.
71 changes: 29 additions & 42 deletions expan/core/experiment.py
Original file line number Diff line number Diff line change
Expand Up @@ -306,49 +306,36 @@ def _quantile_filtering(self, data, kpis, percentile, threshold_type):
flags = flags | data[column].apply(method_table[threshold_type])
return flags


def chi_square_test_result_and_statistics(self, variant_column, weights, min_counts=5, alpha=0.05):
""" Tests the consistency of variant split with the hypothesized distribution.
:param variant_column: variant column from the input data frame
:param weights: dict with variant names as keys, weights as values
({<variant_name>: <weight>, ...})
:param min_counts: minimum number of observed and expected frequencies (should be at least 5), see
http://docs.scipy.org/doc/scipy-0.16.1/reference/generated/scipy.stats.chisquare.html
:param alpha: significance level, 0.05 by default
:return: True(if split is consistent with the given split) or
False(if split is not consistent with the given split)
:rtype: Boolean, float, float, pd.Series, pd.Series
def run_goodness_of_fit_test(self, observed_freqs, expected_freqs, alpha=0.01, min_counts=5):
""" Checks the validity of observed and expected counts and runs chi-square test for goodness of fit.
:param observed_freqs: observed frequencies
:type observed_freqs: pd.Series
:param expected_freqs: expected frequencies
:type expected_freqs: pd.Series
:param alpha: significance level
:type alpha: float
:param min_counts: minimum number of observations to run chi-square test
:type min_counts: int
:return split_is_unbiased: False if the split is biased and True if the split is correct
p_value: corresponding chi-square p-value
:rtype: bool, float
"""
if not hasattr(variant_column, '__len__'):
raise ValueError("Variant split check was cancelled since input variant column is empty or doesn't exist.")
if not hasattr(weights, '__len__'):
raise ValueError("Variant split check was cancelled since input weights are empty or doesn't exist.")
if len(weights) <= 1 or len(variant_column) <= 1:
raise ValueError("Variant split check was cancelled since input weights or the number if categories "
"is less than 2.")

# Count number of observations per each variant
variant_column = pd.Series(variant_column).dropna(axis=0)
observed_freqs = variant_column.value_counts()

if not isinstance(observed_freqs, pd.Series) or not isinstance(expected_freqs, pd.Series):
raise ValueError("Observed and expected frequencies should be of type Series.")
if observed_freqs.empty or expected_freqs.empty:
raise ValueError("Variant split check was cancelled since expected or observed frequencies are empty.")

# Ensure a frequency of at least min_counts in every category of observed_freqs.
# It's recommended not to conduct the test if the frequency in any category is less than min_counts.
if len(observed_freqs[observed_freqs < min_counts]) >= 1:
raise ValueError("Chi-square test is not valid for small expected or observed frequencies.")

# If there are less than 2 categories left after dropping counts less than 5 we can't conduct the test.
if len(observed_freqs) < 2:
raise ValueError("If the number of categories is less than 2 Chi-square test is not applicable.")

# Calculate expected counts given corresponding weights,
# weights are filtered out of categories which were dropped before.
total_count = observed_freqs.sum()
weights = {k: v for (k, v) in weights.items() if k in observed_freqs.index.values}
expected_freqs = pd.Series(weights)
expected_freqs *= total_count

# Compute chi-square and p-value statistics
chi_square_val, p_val = statx.chi_square(observed_freqs.sort_index(), expected_freqs.sort_index())

return p_val >= alpha, p_val, chi_square_val, observed_freqs, expected_freqs
valid_observed_freqs = observed_freqs[observed_freqs > min_counts]
valid_expected_freqs = expected_freqs.filter(valid_observed_freqs.keys())

if len(valid_observed_freqs) == len(valid_expected_freqs) and len(valid_observed_freqs) >= 2:
_, p_value = statx.chi_square(valid_observed_freqs.sort_index(), valid_expected_freqs.sort_index())
split_is_unbiased = p_value >= alpha
else:
raise ValueError("Variant split check was cancelled since observed or expected frequencies "
"are less than 2.")
return split_is_unbiased, p_value
18 changes: 9 additions & 9 deletions expan/core/statistics.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@

import numpy as np
import pandas as pd
from scipy import stats
import scipy
from expan.core.results import BaseTestStatistics, SampleStatistics, SimpleTestStatistics

logger = logging.getLogger(__name__)
Expand Down Expand Up @@ -210,7 +210,7 @@ def estimate_sample_size(x, mde, r, alpha=0.05, beta=0.2):
if r <= 0:
raise ValueError("Variant split ratio needs to be higher than 0.")

ppf = stats.norm.ppf
ppf = scipy.stats.norm.ppf
c1 = (ppf(1.0 - alpha/2.0) - ppf(beta))**2
c2 = (1.0 + r) * c1 * (1.0 + 1.0 / r)
return c2 * x.var() / (mde * x.mean())**2
Expand Down Expand Up @@ -485,9 +485,9 @@ def normal_difference(mean1, std1, n1, mean2, std2, n2, percentiles=[2.5, 97.5],

# Mapping percentiles via standard error
if relative:
return dict([(round(p, 5), stats.t.ppf(p / 100.0, df=d_free) * st_error) for p in percentiles])
return dict([(round(p, 5), scipy.stats.t.ppf(p / 100.0, df=d_free) * st_error) for p in percentiles])
else:
return dict([(round(p, 5), mean + stats.t.ppf(p / 100.0, df=d_free) * st_error) for p in percentiles])
return dict([(round(p, 5), mean + scipy.stats.t.ppf(p / 100.0, df=d_free) * st_error) for p in percentiles])


def compute_statistical_power_from_samples(x, y, alpha=0.05):
Expand All @@ -503,7 +503,7 @@ def compute_statistical_power_from_samples(x, y, alpha=0.05):
:return: statistical power---the probability of a test to detect an effect if the effect actually exists
:rtype: float
"""
z_1_minus_alpha = stats.norm.ppf(1 - alpha/2.)
z_1_minus_alpha = scipy.stats.norm.ppf(1 - alpha/2.)
_x = np.array(x, dtype=float)
_x = _x[~np.isnan(_x)]
_y = np.array(y, dtype=float)
Expand Down Expand Up @@ -553,7 +553,7 @@ def compute_statistical_power(mean1, std1, n1, mean2, std2, n2, z_1_minus_alpha)

tmp = (n1 * n2 * effect_size**2) / ((n1 + n2) * std**2)
z_beta = z_1_minus_alpha - np.sqrt(tmp)
beta = stats.norm.cdf(z_beta)
beta = scipy.stats.norm.cdf(z_beta)
power = 1 - beta
return power

Expand Down Expand Up @@ -618,12 +618,12 @@ def compute_p_value(mean1, std1, n1, mean2, std2, n2):
t = np.sign(mean_diff) * 1000
else:
t = mean_diff / st_error
p = stats.t.cdf(-abs(t), df=d_free) * 2
p = scipy.stats.t.cdf(-abs(t), df=d_free) * 2
return p


def chi_square(observed_freqs, expected_freqs, ddof=0):
""" Compute chi-square statistics and p-values given observed and expected frequencies and degrees of freedom.
""" Computes chi-square statistics and p-values given observed and expected frequencies and degrees of freedom.
:param observed_freqs: observed frequencies
:type observed_freqs: pd.Series or array-like
Expand All @@ -635,6 +635,6 @@ def chi_square(observed_freqs, expected_freqs, ddof=0):
:return: chi-square statistics and p-value
:rtype: float, float
"""
chi_square_val, p_val = stats.chisquare(f_obs=observed_freqs, f_exp=expected_freqs, ddof=ddof, axis=None)
chi_square_val, p_val = scipy.stats.chisquare(f_obs=observed_freqs, f_exp=expected_freqs, ddof=ddof, axis=None)

return chi_square_val, p_val
157 changes: 82 additions & 75 deletions tests/tests_core/test_experiment.py
Original file line number Diff line number Diff line change
Expand Up @@ -360,108 +360,115 @@ def test_outlier_filtering_derived_kpi(self):
)
self.assertIn('derived_kpi', data.columns)

def test_chi_square_test_result_and_statistics_same_weights(self):
def test_run_goodness_of_fit_test_true(self):
exp = self.getExperiment()
data = ['A'] * 23 + ['B'] * 18 + ['C'] * 17 + ['D'] * 19 + ['E'] * 23
weights = {'A': 0.2, 'B': 0.2, 'C': 0.2, 'D': 0.2, 'E': 0.2}
result = exp.chi_square_test_result_and_statistics(data, weights)
observed_freqs = pd.Series([230, 190, 190, 190, 230], ['A', 'B', 'C', 'D', 'E'])
expected_freqs = pd.Series([206, 206, 206, 206, 206], ['A', 'B', 'C', 'D', 'E'])
result = exp.run_goodness_of_fit_test(observed_freqs, expected_freqs)
self.assertEqual(result[0], True)
self.assertAlmostEqual(result[1], 0.8087921354109989)
self.assertAlmostEqual(result[2], 1.6)
self.assertEqual(result[3]['A'], 23)
self.assertEqual(result[3]['B'], 18)
self.assertEqual(result[3]['C'], 17)
self.assertEqual(result[3]['D'], 19)
self.assertEqual(result[3]['E'], 23)
self.assertAlmostEqual(result[1], 0.05357161207695437)

self.assertTrue(all([val == 20 for val in result[4]]))

def test_chi_square_test_result_and_statistics_different_weights(self):
def test_run_goodness_of_fit_test_false(self):
exp = self.getExperiment()
data = ['A'] * 23 + ['B'] * 18 + ['C'] * 17 + ['D'] * 19 + ['E'] * 23
weights = {'A': 0.25, 'B': 0.15, 'C': 0.10, 'D': 0.40, 'E': 0.10}
result = exp.chi_square_test_result_and_statistics(data, weights)
observed_freqs = pd.Series([230, 180, 170, 190, 230], ['A', 'B', 'C', 'D', 'E'])
expected_freqs = pd.Series([250, 150, 100, 400, 100], ['A', 'B', 'C', 'D', 'E'])
result = exp.run_goodness_of_fit_test(observed_freqs, expected_freqs)
self.assertEqual(result[0], False)
self.assertAlmostEqual(result[1], 9.064563321754584e-07)
self.assertAlmostEqual(result[2], 33.585)
self.assertEqual(len(result[3]), 5)
self.assertEqual(len(result[4]), 5)
self.assertAlmostEqual(result[1], 1.5123889771594655e-147)

def test_chi_square_test_result_and_statistics_2_categories(self):
def test_run_goodness_of_fit_test_2_variants(self):
exp = self.getExperiment()
data = ['A'] * 17 + ['B'] * 17
weights = {'A': 0.5, 'B': 0.5}
result = exp.chi_square_test_result_and_statistics(data, weights)
observed_freqs = pd.Series([170, 170], ['A', 'B'])
expected_freqs = pd.Series([170, 170], ['A', 'B'])
result = exp.run_goodness_of_fit_test(observed_freqs, expected_freqs)
self.assertEqual(result[0], True)
self.assertAlmostEqual(result[1], 1.0)
self.assertAlmostEqual(result[2], 0.0)
self.assertEqual(result[3]['A'], 17)
self.assertEqual(result[4]['A'], 17)
self.assertEqual(result[3]['B'], 17)
self.assertEqual(result[4]['B'], 17)

def test_chi_square_test_result_and_statistics_NaN_data(self):
def test_run_goodness_of_fit_test_NaN_data(self):
exp = self.getExperiment()
data = ['A'] * 17 + [np.nan] * 17
weights = {'A': 0.5, 'B': 0.5}
with self.assertRaises(ValueError):
exp.chi_square_test_result_and_statistics(data, weights)
observed_freqs = pd.Series([170, 170], ['A', np.nan])
expected_freqs = pd.Series([170, 170], ['A', 'B'])
with self.assertRaisesRegexp(ValueError, "Variant split check was cancelled since observed "
"or expected frequencies are less than 2."):
exp.run_goodness_of_fit_test(observed_freqs, expected_freqs)

def test_chi_square_test_result_and_statistics_counts_less_5_2_categories(self):
def test_run_goodness_of_fit_test_1_variant_after_filter(self):
exp = self.getExperiment()
data = ['A'] * 17 + ['B'] * 2 + ['C'] * 3
weights = {'A': 0.33, 'B': 0.33, 'C': 0.33}
with self.assertRaises(ValueError):
exp.chi_square_test_result_and_statistics(data, weights)
observed_freqs = pd.Series([180, 1, 2], ['A', 'B', 'C'])
expected_freqs = pd.Series([170, 61, 61], ['A', 'B', 'C'])
with self.assertRaisesRegexp(ValueError, "Variant split check was cancelled since observed "
"or expected frequencies are less than 2."):
exp.run_goodness_of_fit_test(observed_freqs, expected_freqs)

def test_chi_square_test_result_and_statistics_counts_less_5_1_category(self):
def test_test_run_goodness_of_fit_test_2_variants_after_filter(self):
exp = self.getExperiment()
data = ['A'] * 8 + ['B'] * 2 + ['C'] * 14
weights = {'A': 0.33, 'B': 0.33, 'C': 0.33}
with self.assertRaises(ValueError):
exp.chi_square_test_result_and_statistics(data, weights)
observed_freqs = pd.Series([180, 184, 2], ['A', 'B', 'C'])
expected_freqs = pd.Series([122, 122, 122], ['A', 'B', 'C'])
result = exp.run_goodness_of_fit_test(observed_freqs, expected_freqs)
self.assertEqual(result[0], False)
self.assertAlmostEqual(result[1], 1.5123889771594655e-14)

def test_chi_square_test_result_and_statistics_one_category(self):
def test_test_run_goodness_of_fit_test_one_valid_variant(self):
exp = self.getExperiment()
data = ['A'] * 16
weights = {'A': 0.5}
with self.assertRaises(ValueError):
exp.chi_square_test_result_and_statistics(data, weights)
observed_freqs = pd.Series([170, 170], ['A', 'C'])
expected_freqs = pd.Series([170, 170], ['A', 'B'])
with self.assertRaisesRegexp(ValueError, "Variant split check was cancelled since observed "
"or expected frequencies are less than 2."):
exp.run_goodness_of_fit_test(observed_freqs, expected_freqs)

def test_chi_square_test_result_and_statistics_empty_weights(self):
def test_test_run_goodness_of_fit_test_one_variant(self):
exp = self.getExperiment()
data = ['A'] * 16
weights = {}
with self.assertRaises(ValueError):
exp.chi_square_test_result_and_statistics(data, weights)
observed_freqs = pd.Series([180], ['A'])
expected_freqs = pd.Series([170], ['A'])
with self.assertRaisesRegexp(ValueError, "Variant split check was cancelled since observed "
"or expected frequencies are less than 2."):
exp.run_goodness_of_fit_test(observed_freqs, expected_freqs)

def test_chi_square_test_result_and_statistics_no_categories(self):
def test_test_run_goodness_of_fit_test_no_expected_freqs(self):
exp = self.getExperiment()
data = []
weights = {'A': 0.5}
with self.assertRaises(ValueError):
exp.chi_square_test_result_and_statistics(data, weights)
observed_freqs = pd.Series([180], ['A'])
expected_freqs = pd.Series([], [])
with self.assertRaisesRegexp(ValueError, "Variant split check was cancelled since expected "
"or observed frequencies are empty."):
exp.run_goodness_of_fit_test(observed_freqs, expected_freqs)

def test_chi_square_test_result_and_statistics_insufficient_weights(self):
def test_test_run_goodness_of_fit_test_unequal_expected_observed_freqs(self):
exp = self.getExperiment()
data = ['A'] * 16 + ['B'] * 15
weights = {'A': 0.5}
with self.assertRaises(ValueError):
exp.chi_square_test_result_and_statistics(data, weights)
observed_freqs = pd.Series([180, 170], ['A', 'B'])
expected_freqs = pd.Series([180], ['A'])
with self.assertRaisesRegexp(ValueError, "Variant split check was cancelled since observed "
"or expected frequencies are less than 2."):
exp.run_goodness_of_fit_test(observed_freqs, expected_freqs)

def test_chi_square_test_result_and_statistics_None_inputs(self):
def test_test_run_goodness_of_fit_test_None_freqs(self):
exp = self.getExperiment()
data = None
weights = None
with self.assertRaises(ValueError):
exp.chi_square_test_result_and_statistics(data, weights)
observed_freqs = None
expected_freqs = None
with self.assertRaisesRegexp(ValueError, "Observed and expected frequencies should be of type Series."):
exp.run_goodness_of_fit_test(observed_freqs, expected_freqs)

def test_chi_square_test_result_and_statistics_empty_inputs(self):
def test_test_run_goodness_of_fit_test_empty_input(self):
exp = self.getExperiment()
data = []
weights = {}
with self.assertRaises(ValueError):
exp.chi_square_test_result_and_statistics(data, weights)
observed_freqs = pd.Series()
expected_freqs = pd.Series()
with self.assertRaisesRegexp(ValueError, "Variant split check was cancelled since expected "
"or observed frequencies are empty."):
exp.run_goodness_of_fit_test(observed_freqs, expected_freqs)

def test_test_run_goodness_of_fit_test_incorrect_input_1(self):
exp = self.getExperiment()
observed_freqs = {'A': 45, 'B': 35}
expected_freqs = {'A': 45, 'B': 35}
with self.assertRaisesRegexp(ValueError, "Observed and expected frequencies should be of type Series."):
exp.run_goodness_of_fit_test(observed_freqs, expected_freqs)

def test_test_run_goodness_of_fit_test_incorrect_input_2(self):
exp = self.getExperiment()
observed_freqs = {'A': 45, 'B': 35}
expected_freqs = ['A', 'B', 'C']
with self.assertRaisesRegexp(ValueError, "Observed and expected frequencies should be of type Series."):
exp.run_goodness_of_fit_test(observed_freqs, expected_freqs)


class HelperMethodsTestCases(ExperimentTestCase):
""" Test other helper methods. """
Expand Down

0 comments on commit 19b91f3

Please sign in to comment.