From cc12a3d88219f34cc8957f75c4ef1b123d9c9f9b Mon Sep 17 00:00:00 2001
From: Jamie Broomall
Date: Wed, 6 Apr 2022 18:53:05 -0700
Subject: [PATCH] handle large magnitude negative floats consistently

---
 src/whylogs/core/summaryconverters.py |  7 +++++--
 tests/unit/app/test_session.py        | 14 ++++++++++++++
 2 files changed, 19 insertions(+), 2 deletions(-)

diff --git a/src/whylogs/core/summaryconverters.py b/src/whylogs/core/summaryconverters.py
index 1931ec6c47..ee5ab395e2 100644
--- a/src/whylogs/core/summaryconverters.py
+++ b/src/whylogs/core/summaryconverters.py
@@ -124,6 +124,9 @@ def single_quantile_from_sketch(sketch: kll_floats_sketch, quantile: float):
 def _calculate_bins(end: float, start: float, n: int, avg_per_bucket: float, max_buckets: int):
     # Include the max value in the right-most bin
     end += abs(end) * 1e-7
+    abs_end = abs(end)
+    abs_start = abs(start)
+    max_magnitude = max(abs_end, abs_start)
 
     # the kll_floats_sketch use 32bit floats, so we check precision against np.float32
     float_mantissa_bits = np.finfo(np.float32).nmant
@@ -133,7 +136,7 @@ def _calculate_bins(end: float, start: float, n: int, avg_per_bucket: float, max
     width = (end - start) / n_buckets
 
     # check for precision of width with respect to float_mantissa_bits and bin width:
-    bits_in_max = math.floor(math.log2(end))
+    bits_in_max = math.floor(math.log2(max_magnitude))
     width_bits = math.floor(math.log2((end - start) / n_buckets))
     logger.debug(f"bits_in_max is: {bits_in_max}")
     logger.debug(f"width_bits is: {width_bits}")
@@ -143,7 +146,7 @@ def _calculate_bins(end: float, start: float, n: int, avg_per_bucket: float, max
         logger.info(f"Width must be larger than {bits_in_width} bits.")
         new_buckets = math.floor((end - start) / math.pow(2, bits_in_width))
         logger.warn(f"Avoiding bin edge collisions by resizing to {new_buckets} buckets")
-        n_buckets = new_buckets
+        n_buckets = max(new_buckets, 1)
         width = (end - start) / n_buckets
 
     # Calculate histograms from the Probability Mass Function
diff --git a/tests/unit/app/test_session.py b/tests/unit/app/test_session.py
index 0483e37da8..cc8c2add04 100644
--- a/tests/unit/app/test_session.py
+++ b/tests/unit/app/test_session.py
@@ -1,5 +1,6 @@
 from logging import getLogger
 
+import pandas as pd
 import pytest
 
 from whylogs.app.config import SessionConfig
@@ -78,6 +79,19 @@ def test_session_profile_small(df_single):
     assert len(flat_summary) == 1
 
 
+def test_session_profile_negative_ints():
+    df = pd.DataFrame(range(-22335544310, -22335542310), columns=["negative"])
+    session = session_from_config(SessionConfig("default-project", "default-pipeline", [], False))
+    profile = session.log_dataframe(df)
+    TEST_LOGGER.debug(f"logged df: {df.shape} ")
+    TEST_LOGGER.debug(f"logged profile: {profile} ")
+    summary = profile.flat_summary()
+    TEST_LOGGER.debug(f"logged summary: {summary} ")
+    flat_summary = summary["summary"]
+    TEST_LOGGER.debug(f"logged flat_summary: {flat_summary} ")
+    assert len(flat_summary) == 1
+
+
 def test_session_profile_two_column(df_two_int_col):
     TEST_LOGGER.debug(f"About to log {df_two_int_col.describe()} with columns {df_two_int_col.columns}")
     session = session_from_config(SessionConfig("default-project", "default-pipeline", [], False))