Skip to content

Commit

Permalink
Fix KS test p-value computation, add condition in entropy
Browse files — browse the repository at this point in the history
  • Loading branch information
MilenaTrajanoska committed Feb 16, 2022
1 parent efb6105 commit ab2a002
Showing 1 changed file with 36 additions and 6 deletions.
42 changes: 36 additions & 6 deletions src/whylogs/core/summaryconverters.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -200,6 +200,8 @@ def entropy_from_column_summary(summary: ColumnSummary, histogram: datasketches.
total_count = summary.counters.count

if inferred_type == InferredType.Type.FRACTIONAL:
if histogram.get_min_value() == histogram.get_max_value() or histogram.get_n() <= 1:
return 0
bins = np.linspace(histogram.get_min_value(), histogram.get_max_value(), 100)
pmf = histogram.get_pmf(bins)
pmf = list(filter(lambda x: x > 0, pmf))
Expand Down Expand Up @@ -251,15 +253,43 @@ def ks_test_compute_p_value(target_distribution: kll_floats_sketch, reference_di
"""

D_max = 0
quantile_values = reference_distribution.get_quantiles(QUANTILES)
for quant in quantile_values:
cdf_target = target_distribution.get_cdf([quant])[0]
cdf_ref = reference_distribution.get_cdf([quant])[0]
target_quantile_values = target_distribution.get_quantiles(QUANTILES)
ref_quantile_values = reference_distribution.get_quantiles(QUANTILES)

num_quantiles = len(QUANTILES)
i, j = 0, 0
while i < num_quantiles and j < num_quantiles:

if target_quantile_values[i] < ref_quantile_values[j]:
current_quantile = target_quantile_values[i]
i += 1
else:
current_quantile = ref_quantile_values[j]
j += 1

cdf_target = target_distribution.get_cdf([current_quantile])[0]
cdf_ref = reference_distribution.get_cdf([current_quantile])[0]
D = abs(cdf_target - cdf_ref)
if D > D_max:
D_max = D

while i < num_quantiles:
cdf_target = target_distribution.get_cdf([target_quantile_values[i]])[0]
cdf_ref = reference_distribution.get_cdf([target_quantile_values[i]])[0]
D = abs(cdf_target - cdf_ref)
if D > D_max:
D_max = D
n_samples = min(target_distribution.get_n(), reference_distribution.get_n())
p_value = special.kolmogorov(np.sqrt(n_samples) * D_max)
i += 1

while j < num_quantiles:
cdf_target = target_distribution.get_cdf([ref_quantile_values[j]])[0]
cdf_ref = reference_distribution.get_cdf([ref_quantile_values[j]])[0]
D = abs(cdf_target - cdf_ref)
if D > D_max:
D_max = D
j += 1

p_value = special.kolmogorov(np.sqrt(num_quantiles) * D_max)
return type("Object", (), {"ks_test": p_value})


Expand Down

0 comments on commit ab2a002

Please sign in to comment.