Merge branch 'whylabs:mainline' into segbug

whylabs · Apr 13, 2022 · 0f9035f · 0f9035f
2 parents f4e1e12 + 04f3c31
commit 0f9035f
Show file tree

Hide file tree

Showing 11 changed files with 163 additions and 33 deletions.
diff --git a/.bumpversion.cfg b/.bumpversion.cfg
@@ -1,5 +1,5 @@
 [bumpversion]
-current_version = 0.7.2
+current_version = 0.7.3
 tag = False
 parse = (?P<major>\d+)\.(?P<minor>\d+)\.(?P<patch>\d+)(\-(?P<release>[a-z]+)(?P<build>\d+))?
 serialize = 
@@ -27,4 +27,4 @@ replace = version := {new_version}
 
 [bumpversion:file:pyproject.toml]
 search = version = "{current_version}"
-replace = version = "{new_version}"
+replace = version = "{new_version}"
diff --git a/.github/workflows/python-continuous-integration.yml b/.github/workflows/python-continuous-integration.yml
@@ -32,13 +32,12 @@ jobs:
       with:
         version: '3.19.4'
         repo-token: ${{ secrets.GITHUB_TOKEN }}
-    - name: Cache Python dependencies
+    - name: Set up Poetry cache for Python dependencies
       uses: actions/cache@v2
       with:
-        path: ~/.cache/pip
-        key: ${{ runner.os }}-pip-${{ hashFiles('**/requirements-dev.txt') }}
-        restore-keys: |
-          ${{ runner.os }}-pip-
+        path: ~/.cache/pypoetry
+        key: ${{ runner.os }}-poetry-${{ hashFiles('**/poetry.lock') }}
+        restore-keys: ${{ runner.os }}-poetry-
     - name: Install dependencies
       run: make install
     - name: Run build, style, and lint checks

diff --git a/Makefile b/Makefile
@@ -5,7 +5,7 @@ src.python.pyc := $(shell find ./src -type f -name "*.pyc")
 src.proto.dir := ./proto/src
 src.proto := $(shell find $(src.proto.dir) -type f -name "*.proto")
 
-version := 0.7.2
+version := 0.7.3
 
 dist.dir := dist
 egg.dir := .eggs
@@ -140,6 +140,7 @@ test-notebooks: ## Run tests for the notebooks
 
 install: ## Install all dependencies with poetry.
 	@$(call i, Installing dependencies)
+# Each recipe line runs in its own shell, so sourcing ~/.poetry/env could not affect the next line; export PATH from make instead.
+	$(eval export PATH := $(HOME)/.poetry/bin:$(PATH))
 	poetry install
 
 coverage: ## Generate test coverage reports.

diff --git a/docs/conf.py b/docs/conf.py
@@ -101,7 +101,7 @@
 # built documents.
 #
 # The short X.Y version.
-version = "0.7.2"
+version = "0.7.3"
 # The full version, including alpha/beta/rc tags.
 release = ""  # Is set by calling `setup.py docs`
 

diff --git a/docs/getting_started.rst b/docs/getting_started.rst
@@ -18,13 +18,6 @@ Install our library in a Python 3.6+ environment.
 
    pip install whylogs
 
-Demo CLI
-#######################
-Our demo CLI will walk you through setting up a whylogs project.
-
-.. code-block:: bash
-
-   whylogs-demo init
 
 Configuration
 ===================
@@ -79,7 +72,7 @@ Loggers log statistical information about your data. They have the following par
 
 For more information, see the `documentation <https://whylogs.readthedocs.io/en/latest/autoapi/whylogs/app/logger/index.html>`_ for the logger class.
 
-`This example code <https://whylogs.readthedocs.io/en/latest/auto_examples/configure_logger.html>`_ uses logger options to control the output location. 
+`This example code <https://github.com/whylabs/whylogs/blob/mainline/examples/configure_logger.py>`_ uses logger options to control the output location.
 
 Configure a Writer
 ########################

diff --git a/pyproject.toml b/pyproject.toml
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "whylogs"
-version = "0.7.2"
+version = "0.7.3"
 description = "Profile and monitor your ML data pipeline end-to-end"
 authors = ["WhyLabs.ai <support@whylabs.ai>"]
 license = "Apache-2.0"

diff --git a/src/whylogs/_version.py b/src/whylogs/_version.py
@@ -1,3 +1,3 @@
 """WhyLabs version number."""
 
-__version__ = "0.7.2"
+__version__ = "0.7.3"
diff --git a/src/whylogs/core/summaryconverters.py b/src/whylogs/core/summaryconverters.py
@@ -2,6 +2,7 @@
 Library module defining function for generating summaries
 """
 import math
+from logging import getLogger
 from typing import Union
 
 import datasketches
@@ -28,6 +29,8 @@
 HIST_AVG_NUMBER_PER_BUCKET = 4.0
 QUANTILES = [0.0, 0.01, 0.05, 0.25, 0.5, 0.75, 0.95, 0.99, 1.0]
 
+logger = getLogger(__name__)
+
 
 def from_sketch(sketch: update_theta_sketch, num_std_devs: float = 1):
     """
@@ -118,6 +121,45 @@ def single_quantile_from_sketch(sketch: kll_floats_sketch, quantile: float):
     return type("Object", (), {"quantile": qval[0]})
 
 
+def _calculate_bins(end: float, start: float, n: int, avg_per_bucket: float, max_buckets: int):
+    """
+    Compute evenly spaced histogram bin edges that stay distinct at float32 precision.
+
+    The kll_floats_sketch works with 32-bit floats, so bin edges that are closer together
+    than the float32 spacing at the magnitude of the range could collide. When the
+    requested bin width is below that spacing, the bucket count is reduced.
+
+    Parameters
+    ----------
+    end : float
+        Upper end of the range; nudged upward by a relative 1e-7 so it falls inside the last bin.
+    start : float
+        Lower end of the range.
+    n : int
+        Item count used, together with ``avg_per_bucket``, to pick the bucket count.
+    avg_per_bucket : float
+        Target average number of items per bucket.
+    max_buckets : int
+        Upper bound on the number of buckets.
+
+    Returns
+    -------
+    bins : list of float
+        ``n_buckets + 1`` edges starting at ``start``.
+    end : float
+        The nudged upper end.
+    start : float
+        The lower end, unchanged.
+    """
+    # Include the max value in the right-most bin
+    end += abs(end) * 1e-7
+    max_magnitude = max(abs(end), abs(start))
+    if max_magnitude == 0:
+        # Both ends are zero: log2(0) is undefined and there is no range to split.
+        return [start, end], end, start
+
+    # Keep at least one bucket so the width division below is always defined
+    n_buckets = max(min(math.ceil(n / avg_per_bucket), max_buckets), 1)
+    width = (end - start) / n_buckets
+
+    # the kll_floats_sketch uses 32bit floats, so we check precision against np.float32
+    float_mantissa_bits = np.finfo(np.float32).nmant
+
+    # Figure out the floating point precision at the scale of the bin boundaries
+    # min_interval is the smallest difference between floats at this scale
+    log_min_interval = math.floor(math.log2(max_magnitude)) - float_mantissa_bits
+    min_interval = math.pow(2, log_min_interval)
+
+    # If the bin width is smaller than min_interval, we need bigger bins
+    if width < min_interval:
+        requested_buckets = n_buckets
+        n_buckets = max(math.floor((end - start) / min_interval), 1)
+        # Report min_interval untruncated (int() would print 0 for sub-unit spacings)
+        # and the bucket count that is actually used.
+        logger.warning(
+            f"A bin width of {width} won't work with values in range of [{start}, {end}] "
+            f"because numbers closer to each other than {min_interval} might not be distinct "
+            "when passed as float32: avoiding bin edge collisions by resizing from: "
+            f"{requested_buckets} to: {n_buckets} histogram buckets in summary."
+        )
+        width = (end - start) / n_buckets
+        logger.info(f"New bin width is: {width} across {n_buckets} buckets")
+
+    # Calculate histograms from the Probability Mass Function
+    bins = [start + i * width for i in range(n_buckets + 1)]
+    # Lazy %-formatting: the edge list is only rendered when debug logging is enabled
+    logger.debug("about to get pmf using start: %s end:%s width:%s and n_buckets:%s", start, end, width, n_buckets)
+    logger.debug("bin: %s", bins)
+    return bins, end, start
+
+
 def histogram_from_sketch(sketch: kll_floats_sketch, max_buckets: int = None, avg_per_bucket: int = None):
     """
     Generate a summary of a kll_floats_sketch, including a histogram
@@ -151,13 +193,7 @@ def histogram_from_sketch(sketch: kll_floats_sketch, max_buckets: int = None, av
         bins = [start, end]
         counts = [n]
     else:
-        # Include the max value in the right-most bin
-        end += abs(end) * 1e-7
-        # Include the right edge in the bin edges
-        n_buckets = min(math.ceil(n / avg_per_bucket), max_buckets)
-        width = (end - start) / n_buckets
-        # Calculate histograms from the Probability Mass Function
-        bins = [start + i * width for i in range(n_buckets + 1)]
+        bins, end, start = _calculate_bins(end, start, n, avg_per_bucket, max_buckets)
         pmf = sketch.get_pmf(bins)
         counts = [round(p * n) for p in pmf]
         counts = counts[1:-1]

diff --git a/src/whylogs/viewer/templates/index-hbs-cdn-all-in.html b/src/whylogs/viewer/templates/index-hbs-cdn-all-in.html
@@ -1776,6 +1776,10 @@ <h1 class="no-responsive__title">Hold on! :)</h1>
         return svgEl._groups[0][0].outerHTML;
       }
 
+      /**
+       * Build an array of `size` consecutive integers beginning at `startAt`.
+       * @param {number} size - how many integers to produce
+       * @param {number} [startAt=0] - first integer of the sequence
+       * @returns {number[]} startAt, startAt + 1, ..., startAt + size - 1
+       */
+      function range_arr(size, startAt = 0) {
+        // Array.from fills in one pass and yields [] for a negative size instead of throwing.
+        return Array.from({ length: size }, (_, offset) => offset + startAt);
+      }
+
       function generateBarChart(currentWidth, key, datas, referenceData) {
         let histogramData = [],
             overlappedHistogramData = [];
@@ -1786,10 +1790,13 @@ <h1 class="no-responsive__title">Hold on! :)</h1>
         }
         let yFormat,
             xFormat;
-        const data = histogramData.map((profile, index) => {
+
+
+        minArray = range_arr(Math.min(histogramData.length,overlappedHistogramData.length))
+        const data = minArray.map((profile, index) => {
           return {
             group: index,
-            profile: profile.axisY,
+            profile: histogramData[index].axisY,
             reference_profile: overlappedHistogramData[index].axisY
           }
         }).slice(0, 20)
@@ -1885,10 +1892,13 @@ <h1 class="no-responsive__title">Hold on! :)</h1>
           histogramData = chartData(datas)
           overlappedHistogramData = chartData(referenceData.columns[key.data.key])
         }
-        const data = histogramData.map((value, index) => {
-          const difference = value.axisY - overlappedHistogramData[index].axisY
+
+        minArray = range_arr(Math.min(histogramData.length,overlappedHistogramData.length))
+
+        const data = minArray.map((value, index) => {
+          const difference = histogramData[index].axisY - overlappedHistogramData[index].axisY
           const negativeValues = difference < 0 ? difference : 0
-          return [+value.axisY, negativeValues]
+          return [+histogramData[index].axisY, negativeValues]
         }).slice(0, 20).flat()
 
         let yFormat,
@@ -2666,7 +2676,7 @@ <h1 class="no-responsive__title">Hold on! :)</h1>
           ${frequentItemBoxElement('',chipElementTableData(items[item].value))}
         `
         referenceFrequentItemString += `
-          ${frequentItemBoxElement('',chipElementTableData(referenceItems[item].value))}
+          ${frequentItemBoxElement('',chipElementTableData(referenceItems[item]?.value ?? ''))}
         `
       }
     );

diff --git a/tests/conftest.py b/tests/conftest.py
@@ -80,6 +80,16 @@ def df():
     return pd.DataFrame((np.random.rand(30, 4) - 0.5) * 3, columns=["A", "B", "C", "D"])
 
 
+@pytest.fixture(scope="session")
+def df_single():
+    """Session-scoped frame with a single int64 ``id`` column holding 100 random integers above 6.09e9."""
+    # Request int64 from randint itself: its default integer type is 32-bit on some
+    # platforms (e.g. Windows with NumPy 1.x), where these bounds are out of range.
+    values = np.random.randint(low=6092996828, high=6093000284, size=(100, 1), dtype=np.int64)
+    return pd.DataFrame(values, columns=["id"], dtype=np.int64)
+
+
+@pytest.fixture(scope="session")
+def df_two_int_col():
+    """Session-scoped frame with two int64 columns (``id``, ``A``) of 100 random integers above 6.09e9."""
+    # Request int64 from randint itself: its default integer type is 32-bit on some
+    # platforms (e.g. Windows with NumPy 1.x), where these bounds are out of range.
+    values = np.random.randint(low=6092996828, high=6093000284, size=(100, 2), dtype=np.int64)
+    return pd.DataFrame(values, columns=["id", "A"])
+
+
 @pytest.fixture(scope="session")
 def test_data_path():
 

diff --git a/tests/unit/app/test_session.py b/tests/unit/app/test_session.py
@@ -1,3 +1,7 @@
+from logging import getLogger
+
+import numpy as np
+import pandas as pd
 import pytest
 
 from whylogs.app.config import SessionConfig
@@ -9,6 +13,8 @@
     session_from_config,
 )
 
+TEST_LOGGER = getLogger(__name__)
+
 
 def test_get_global_session():
     session = get_or_create_session()
@@ -27,7 +33,6 @@ def test_reset():
 
 
 def test_session_log_dataframe(df):
-    pass
 
     session = session_from_config(SessionConfig("default-project", "default-pipeline", [], False))
     session.log_dataframe(df)
@@ -49,6 +54,82 @@ def test_session_profile(df):
     assert len(flat_summary) == 4
 
 
+def test_session_profile_single_column(df_single):
+    """Profile 1000 consecutive integers near 2.2e10 via get_or_create_session and expect one summary entry."""
+    # NOTE(review): the df_single fixture is requested but not profiled here; the
+    # parameter is kept so the test signature is unchanged - confirm whether it is still needed.
+    session = get_or_create_session()
+    df = pd.DataFrame(range(22335544310, 22335545310), columns=["A"])
+    # Describe the frame that is actually logged, not the unused fixture
+    TEST_LOGGER.debug(f"About to log {df.describe()} with columns {df.columns}")
+    profile = session.log_dataframe(df)
+    assert profile is not None
+
+    flat_summary = profile.flat_summary()["summary"]
+    assert len(flat_summary) == 1
+
+
+def test_session_profile_small(df_single):
+    """Profile the first 1, 2, 3 and 4 rows of ``df_single`` and expect a single summary entry each time."""
+    TEST_LOGGER.debug(f"About to log {df_single.describe()} with columns {df_single.columns}")
+    config = SessionConfig("default-project", "default-pipeline", [], False)
+    session = session_from_config(config)
+    for i in (1, 2, 3, 4):
+        head_rows = df_single.head(i)
+        profile = session.log_dataframe(head_rows)
+        assert profile is not None
+
+        flat_summary = profile.flat_summary()["summary"]
+        TEST_LOGGER.info(f"logged {i} rows and summary is {flat_summary}")
+        assert len(flat_summary) == 1
+
+
+def _assert_single_entry_profile(df):
+    """Log ``df`` through a session built from the default test config and assert its flat summary has one entry."""
+    session = session_from_config(SessionConfig("default-project", "default-pipeline", [], False))
+    profile = session.log_dataframe(df)
+    TEST_LOGGER.debug(f"logged df: {df.shape} ")
+    summary = profile.flat_summary()
+    TEST_LOGGER.debug(f"logged summary: {summary} ")
+    flat_summary = summary["summary"]
+    TEST_LOGGER.debug(f"logged flat_summary: {flat_summary} ")
+    assert len(flat_summary) == 1
+
+
+def test_session_profile_negative_ints():
+    """A column of 2000 consecutive negative integers near -2.2e10 can be profiled."""
+    _assert_single_entry_profile(pd.DataFrame(range(-22335544310, -22335542310), columns=["negative"]))
+
+
+def test_session_profile_negative_to_zero():
+    """A column running from -1000 up to and including 0 can be profiled."""
+    _assert_single_entry_profile(pd.DataFrame(range(-1000, 1), columns=["negative_to_zero"]))
+
+
+def test_session_profile_all_close_to_zero():
+    """A column of tiny negative floats around -1e-20, spaced 1e-30 apart, can be profiled."""
+    _assert_single_entry_profile(pd.DataFrame(np.arange(-1.00000001e-20, -1e-20, 1e-30), columns=["close_to_zero"]))
+
+
+def test_session_profile_two_column(df_two_int_col):
+    """Profile the two-column integer fixture and expect two entries in the flat summary."""
+    TEST_LOGGER.debug(f"About to log {df_two_int_col.describe()} with columns {df_two_int_col.columns}")
+    config = SessionConfig("default-project", "default-pipeline", [], False)
+    profile = session_from_config(config).log_dataframe(df_two_int_col)
+    assert profile is not None
+
+    flat_summary = profile.flat_summary()["summary"]
+    assert len(flat_summary) == 2
+
+
 def test_profile_df(df):
     import datetime