Merge 39e68af into 72584f4
jamie256 authored Apr 7, 2022
2 parents 72584f4 + 39e68af commit 4fc08a2
Showing 3 changed files with 135 additions and 8 deletions.
50 changes: 43 additions & 7 deletions src/whylogs/core/summaryconverters.py
@@ -2,6 +2,7 @@
Library module defining functions for generating summaries
"""
import math
from logging import getLogger
from typing import Union

import datasketches
@@ -28,6 +29,8 @@
HIST_AVG_NUMBER_PER_BUCKET = 4.0
QUANTILES = [0.0, 0.01, 0.05, 0.25, 0.5, 0.75, 0.95, 0.99, 1.0]

logger = getLogger(__name__)


def from_sketch(sketch: update_theta_sketch, num_std_devs: float = 1):
"""
@@ -118,6 +121,45 @@ def single_quantile_from_sketch(sketch: kll_floats_sketch, quantile: float):
return type("Object", (), {"quantile": qval[0]})


def _calculate_bins(end: float, start: float, n: int, avg_per_bucket: float, max_buckets: int):
# Include the max value in the right-most bin
end += abs(end) * 1e-7
abs_end = abs(end)
abs_start = abs(start)
max_magnitude = max(abs_end, abs_start)

# the kll_floats_sketch uses 32-bit floats, so we check precision against np.float32
float_mantissa_bits = np.finfo(np.float32).nmant

# Include the right edge in the bin edges
n_buckets = min(math.ceil(n / avg_per_bucket), max_buckets)
width = (end - start) / n_buckets

# Figure out the floating point precision at the scale of the bin boundaries
# min_interval is the smallest difference between floats at this scale
log_min_interval = math.floor(math.log2(max_magnitude)) - float_mantissa_bits
min_interval = math.pow(2, log_min_interval)

# If the bin width is smaller than min_interval, we need bigger bins
if width < min_interval:
new_buckets = math.floor((end - start) / min_interval)
logger.warning(
f"A bin width of {width} won't work with values in the range [{start}, {end}] "
f"because numbers closer together than {min_interval} might not be distinct "
"when cast to float32; avoiding bin edge collisions by resizing from "
f"{n_buckets} to {new_buckets} histogram buckets in the summary."
)
n_buckets = max(new_buckets, 1)
width = (end - start) / n_buckets
logger.info(f"New bin widh is: {width} across {n_buckets} buckets")

# Calculate histograms from the Probability Mass Function
bins = [start + i * width for i in range(n_buckets + 1)]
logger.debug(f"about to get pmf using start: {start} end:{end} width:{width} and n_buckets:{n_buckets}")
logger.debug(f"bin: {bins}")
return bins, end, start
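
Reviewer note (not part of the commit): the precision arithmetic above can be sanity-checked directly. For a value of magnitude m, adjacent float32 values are 2^(floor(log2 m) - 23) apart, since np.finfo(np.float32).nmant == 23, so any bin narrower than that spacing can collapse when its edges are cast to float32. A minimal sketch, using a hypothetical helper float32_spacing:

import math

import numpy as np

def float32_spacing(magnitude: float) -> float:
    # Smallest gap between adjacent float32 values near `magnitude` (hypothetical helper)
    mantissa_bits = np.finfo(np.float32).nmant  # 23 for float32
    return math.pow(2, math.floor(math.log2(magnitude)) - mantissa_bits)

# Around 6.09e9 (the magnitude used in the new test fixtures) the spacing is 512.0,
# which numpy's own np.spacing confirms:
assert float32_spacing(6_093_000_284.0) == np.spacing(np.float32(6_093_000_284.0))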


def histogram_from_sketch(sketch: kll_floats_sketch, max_buckets: int = None, avg_per_bucket: int = None):
"""
Generate a summary of a kll_floats_sketch, including a histogram
@@ -151,13 +193,7 @@ def histogram_from_sketch(sketch: kll_floats_sketch, max_buckets: int = None, avg_per_bucket: int = None):
bins = [start, end]
counts = [n]
else:
-# Include the max value in the right-most bin
-end += abs(end) * 1e-7
-# Include the right edge in the bin edges
-n_buckets = min(math.ceil(n / avg_per_bucket), max_buckets)
-width = (end - start) / n_buckets
-# Calculate histograms from the Probability Mass Function
-bins = [start + i * width for i in range(n_buckets + 1)]
+bins, end, start = _calculate_bins(end, start, n, avg_per_bucket, max_buckets)
pmf = sketch.get_pmf(bins)
counts = [round(p * n) for p in pmf]
counts = counts[1:-1]
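Reviewer note: for context, a minimal usage sketch of the updated function, assuming the kll_floats_sketch class from the datasketches package and the whylogs import path shown in the diff (not part of the commit):

import numpy as np
from datasketches import kll_floats_sketch

from whylogs.core.summaryconverters import histogram_from_sketch

sketch = kll_floats_sketch(256)  # k=256 controls sketch accuracy
for value in np.random.randint(6_092_996_828, 6_093_000_284, size=100):
    sketch.update(float(value))

# With values this large, _calculate_bins widens the buckets so consecutive
# bin edges stay distinct as float32, instead of colliding.
summary = histogram_from_sketch(sketch)
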
10 changes: 10 additions & 0 deletions tests/conftest.py
@@ -80,6 +80,16 @@ def df():
return pd.DataFrame((np.random.rand(30, 4) - 0.5) * 3, columns=["A", "B", "C", "D"])


@pytest.fixture(scope="session")
def df_single():
return pd.DataFrame((np.random.randint(low=6092996828, high=6093000284, size=(100, 1))), columns=["id"], dtype=np.int64)


@pytest.fixture(scope="session")
def df_two_int_col():
return pd.DataFrame((np.random.randint(low=6092996828, high=6093000284, size=(100, 2))), columns=["id", "A"])
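
Reviewer note: the fixture bounds look deliberate. At a magnitude of ~6.09e9, adjacent float32 values are 512 apart, while the value range spans only 6093000284 - 6092996828 = 3456; with 100 rows at ~4 rows per bucket, the naive bin width (~138) falls below that spacing, so these fixtures exercise the new resizing path in _calculate_bins. A rough check (hypothetical, not in the commit):

import numpy as np

span = 6_093_000_284 - 6_092_996_828          # 3456
naive_width = span / 25                       # ~138, for ceil(100 / 4) = 25 buckets
assert naive_width < np.spacing(np.float32(6_093_000_284))  # float32 spacing here is 512.0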


@pytest.fixture(scope="session")
def test_data_path():

83 changes: 82 additions & 1 deletion tests/unit/app/test_session.py
@@ -1,3 +1,7 @@
from logging import getLogger

import numpy as np
import pandas as pd
import pytest

from whylogs.app.config import SessionConfig
@@ -9,6 +13,8 @@
session_from_config,
)

TEST_LOGGER = getLogger(__name__)


def test_get_global_session():
session = get_or_create_session()
@@ -27,7 +33,6 @@ def test_reset():


def test_session_log_dataframe(df):
-    pass

session = session_from_config(SessionConfig("default-project", "default-pipeline", [], False))
session.log_dataframe(df)
@@ -49,6 +54,82 @@ def test_session_profile(df):
assert len(flat_summary) == 4


def test_session_profile_single_column(df_single):
TEST_LOGGER.debug(f"About to log {df_single.describe()} with columns {df_single.columns}")
# session = session_from_config(SessionConfig("default-project", "default-pipeline", [], False))
session = get_or_create_session()
df = pd.DataFrame(range(22335544310, 22335545310), columns=["A"])
profile = session.log_dataframe(df) # df_single)
assert profile is not None

summary = profile.flat_summary()

flat_summary = summary["summary"]
assert len(flat_summary) == 1


def test_session_profile_small(df_single):
TEST_LOGGER.debug(f"About to log {df_single.describe()} with columns {df_single.columns}")
session = session_from_config(SessionConfig("default-project", "default-pipeline", [], False))
for i in range(1, 5):
profile = session.log_dataframe(df_single.head(i))
assert profile is not None

summary = profile.flat_summary()

flat_summary = summary["summary"]
TEST_LOGGER.info(f"logged {i} rows and summary is {flat_summary}")
assert len(flat_summary) == 1


def test_session_profile_negative_ints():
df = pd.DataFrame(range(-22335544310, -22335542310), columns=["negative"])
session = session_from_config(SessionConfig("default-project", "default-pipeline", [], False))
profile = session.log_dataframe(df)
TEST_LOGGER.debug(f"logged df: {df.shape} ")
summary = profile.flat_summary()
TEST_LOGGER.debug(f"logged summary: {summary} ")
flat_summary = summary["summary"]
TEST_LOGGER.debug(f"logged flat_summary: {flat_summary} ")
assert len(flat_summary) == 1


def test_session_profile_negative_to_zero():
df = pd.DataFrame(range(-1000, 1), columns=["negative_to_zero"])
session = session_from_config(SessionConfig("default-project", "default-pipeline", [], False))
profile = session.log_dataframe(df)
TEST_LOGGER.debug(f"logged df: {df.shape} ")
summary = profile.flat_summary()
TEST_LOGGER.debug(f"logged summary: {summary} ")
flat_summary = summary["summary"]
TEST_LOGGER.debug(f"logged flat_summary: {flat_summary} ")
assert len(flat_summary) == 1


def test_session_profile_all_close_to_zero():
df = pd.DataFrame(np.arange(-1.00000001e-20, -1e-20, 1e-30), columns=["close_to_zero"])
session = session_from_config(SessionConfig("default-project", "default-pipeline", [], False))
profile = session.log_dataframe(df)
TEST_LOGGER.debug(f"logged df: {df.shape} ")
summary = profile.flat_summary()
TEST_LOGGER.debug(f"logged summary: {summary} ")
flat_summary = summary["summary"]
TEST_LOGGER.debug(f"logged flat_summary: {flat_summary} ")
assert len(flat_summary) == 1
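
Reviewer note (rough arithmetic, not part of the commit): this test probes the opposite extreme from the big-integer cases. Near 1e-20 the float32 spacing is about 8.1e-28, and the right-edge epsilon (abs(end) * 1e-7 ≈ 1e-27) dominates the 1e-28 data range, so by the formula in _calculate_bins the histogram should collapse to a single bucket rather than emit colliding edges:

import math

import numpy as np

mantissa_bits = np.finfo(np.float32).nmant               # 23
magnitude = 1.00000001e-20
min_interval = 2.0 ** (math.floor(math.log2(magnitude)) - mantissa_bits)
span = 1e-28 + 1e-20 * 1e-7                              # data range plus the right-edge epsilon
print(min_interval)                                      # ~8.08e-28
print(max(math.floor(span / min_interval), 1))           # 1 -- a single bucket survives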


def test_session_profile_two_column(df_two_int_col):
TEST_LOGGER.debug(f"About to log {df_two_int_col.describe()} with columns {df_two_int_col.columns}")
session = session_from_config(SessionConfig("default-project", "default-pipeline", [], False))
profile = session.log_dataframe(df_two_int_col)
assert profile is not None

summary = profile.flat_summary()

flat_summary = summary["summary"]
assert len(flat_summary) == 2


def test_profile_df(df):
import datetime

