Merge 39e68af into 72584f4
jamie256 authored Apr 7, 2022
2 parents 72584f4 + 39e68af commit 4fc08a2
Showing 3 changed files with 135 additions and 8 deletions.
50 changes: 43 additions & 7 deletions src/whylogs/core/summaryconverters.py
@@ -2,6 +2,7 @@
Library module defining functions for generating summaries
"""
import math
from logging import getLogger
from typing import Union

import datasketches
@@ -28,6 +29,8 @@
HIST_AVG_NUMBER_PER_BUCKET = 4.0
QUANTILES = [0.0, 0.01, 0.05, 0.25, 0.5, 0.75, 0.95, 0.99, 1.0]

logger = getLogger(__name__)


def from_sketch(sketch: update_theta_sketch, num_std_devs: float = 1):
"""
@@ -118,6 +121,45 @@ def single_quantile_from_sketch(sketch: kll_floats_sketch, quantile: float):
return type("Object", (), {"quantile": qval[0]})


def _calculate_bins(end: float, start: float, n: int, avg_per_bucket: float, max_buckets: int):
# Include the max value in the right-most bin
end += abs(end) * 1e-7
abs_end = abs(end)
abs_start = abs(start)
max_magnitude = max(abs_end, abs_start)

# the kll_floats_sketch uses 32-bit floats, so we check precision against np.float32
float_mantissa_bits = np.finfo(np.float32).nmant

# Include the right edge in the bin edges
n_buckets = min(math.ceil(n / avg_per_bucket), max_buckets)
width = (end - start) / n_buckets

# Figure out the floating point precision at the scale of the bin boundaries
# min_interval is the smallest difference between floats at this scale
log_min_interval = math.floor(math.log2(max_magnitude)) - float_mantissa_bits
min_interval = math.pow(2, log_min_interval)

# If the bin width is smaller than min_interval, we need bigger bins
if width < min_interval:
new_buckets = math.floor((end - start) / min_interval)
logger.warning(
f"A bin width of {width} won't work with values in the range [{start}, {end}] "
f"because numbers closer together than {min_interval} might not be distinct "
"when cast to float32; avoiding bin edge collisions by resizing from "
f"{n_buckets} to {new_buckets} histogram buckets in the summary."
)
n_buckets = max(new_buckets, 1)
width = (end - start) / n_buckets
logger.info(f"New bin widh is: {width} across {n_buckets} buckets")

# Calculate histograms from the Probability Mass Function
bins = [start + i * width for i in range(n_buckets + 1)]
logger.debug(f"about to get pmf using start: {start} end:{end} width:{width} and n_buckets:{n_buckets}")
logger.debug(f"bin: {bins}")
return bins, end, start
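
Reviewer note (not part of the commit): the precision arithmetic above can be sanity-checked directly. For a value of magnitude m, adjacent float32 values are 2^(floor(log2 m) - 23) apart, since np.finfo(np.float32).nmant == 23, so any bin narrower than that spacing can collapse when its edges are cast to float32. A minimal sketch, using a hypothetical helper float32_spacing:

import math

import numpy as np

def float32_spacing(magnitude: float) -> float:
    # Smallest gap between adjacent float32 values near `magnitude` (hypothetical helper)
    mantissa_bits = np.finfo(np.float32).nmant  # 23 for float32
    return math.pow(2, math.floor(math.log2(magnitude)) - mantissa_bits)

# Around 6.09e9 (the magnitude used in the new test fixtures) the spacing is 512.0,
# which numpy's own np.spacing confirms:
assert float32_spacing(6_093_000_284.0) == np.spacing(np.float32(6_093_000_284.0))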


def histogram_from_sketch(sketch: kll_floats_sketch, max_buckets: int = None, avg_per_bucket: int = None):
"""
Generate a summary of a kll_floats_sketch, including a histogram
@@ -151,13 +193,7 @@ def histogram_from_sketch(sketch: kll_floats_sketch, max_buckets: int = None, avg_per_bucket: int = None):
bins = [start, end]
counts = [n]
else:
-# Include the max value in the right-most bin
-end += abs(end) * 1e-7
-# Include the right edge in the bin edges
-n_buckets = min(math.ceil(n / avg_per_bucket), max_buckets)
-width = (end - start) / n_buckets
-# Calculate histograms from the Probability Mass Function
-bins = [start + i * width for i in range(n_buckets + 1)]
+bins, end, start = _calculate_bins(end, start, n, avg_per_bucket, max_buckets)
pmf = sketch.get_pmf(bins)
counts = [round(p * n) for p in pmf]
counts = counts[1:-1]
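Reviewer note: for context, a minimal usage sketch of the updated function, assuming the kll_floats_sketch class from the datasketches package and the whylogs import path shown in the diff (not part of the commit):

import numpy as np
from datasketches import kll_floats_sketch

from whylogs.core.summaryconverters import histogram_from_sketch

sketch = kll_floats_sketch(256)  # k=256 controls sketch accuracy
for value in np.random.randint(6_092_996_828, 6_093_000_284, size=100):
    sketch.update(float(value))

# With values this large, _calculate_bins widens the buckets so consecutive
# bin edges stay distinct as float32, instead of colliding.
summary = histogram_from_sketch(sketch)
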
10 changes: 10 additions & 0 deletions tests/conftest.py
@@ -80,6 +80,16 @@ def df():
return pd.DataFrame((np.random.rand(30, 4) - 0.5) * 3, columns=["A", "B", "C", "D"])


@pytest.fixture(scope="session")
def df_single():
return pd.DataFrame((np.random.randint(low=6092996828, high=6093000284, size=(100, 1))), columns=["id"], dtype=np.int64)


@pytest.fixture(scope="session")
def df_two_int_col():
return pd.DataFrame((np.random.randint(low=6092996828, high=6093000284, size=(100, 2))), columns=["id", "A"])
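
Reviewer note: the fixture bounds look deliberate. At a magnitude of ~6.09e9, adjacent float32 values are 512 apart, while the value range spans only 6093000284 - 6092996828 = 3456; with 100 rows at ~4 rows per bucket, the naive bin width (~138) falls below that spacing, so these fixtures exercise the new resizing path in _calculate_bins. A rough check (hypothetical, not in the commit):

import numpy as np

span = 6_093_000_284 - 6_092_996_828          # 3456
naive_width = span / 25                       # ~138, for ceil(100 / 4) = 25 buckets
assert naive_width < np.spacing(np.float32(6_093_000_284))  # float32 spacing here is 512.0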


@pytest.fixture(scope="session")
def test_data_path():

83 changes: 82 additions & 1 deletion tests/unit/app/test_session.py
@@ -1,3 +1,7 @@
from logging import getLogger

import numpy as np
import pandas as pd
import pytest

from whylogs.app.config import SessionConfig
@@ -9,6 +13,8 @@
session_from_config,
)

TEST_LOGGER = getLogger(__name__)


def test_get_global_session():
session = get_or_create_session()
@@ -27,7 +33,6 @@ def test_reset():


def test_session_log_dataframe(df):
-    pass

session = session_from_config(SessionConfig("default-project", "default-pipeline", [], False))
session.log_dataframe(df)
@@ -49,6 +54,82 @@ def test_session_profile(df):
assert len(flat_summary) == 4


def test_session_profile_single_column(df_single):
TEST_LOGGER.debug(f"About to log {df_single.describe()} with columns {df_single.columns}")
# session = session_from_config(SessionConfig("default-project", "default-pipeline", [], False))
session = get_or_create_session()
df = pd.DataFrame(range(22335544310, 22335545310), columns=["A"])
profile = session.log_dataframe(df) # df_single)
assert profile is not None

summary = profile.flat_summary()

flat_summary = summary["summary"]
assert len(flat_summary) == 1


def test_session_profile_small(df_single):
TEST_LOGGER.debug(f"About to log {df_single.describe()} with columns {df_single.columns}")
session = session_from_config(SessionConfig("default-project", "default-pipeline", [], False))
for i in range(1, 5):
profile = session.log_dataframe(df_single.head(i))
assert profile is not None

summary = profile.flat_summary()

flat_summary = summary["summary"]
TEST_LOGGER.info(f"logged {i} rows and summary is {flat_summary}")
assert len(flat_summary) == 1


def test_session_profile_negative_ints():
df = pd.DataFrame(range(-22335544310, -22335542310), columns=["negative"])
session = session_from_config(SessionConfig("default-project", "default-pipeline", [], False))
profile = session.log_dataframe(df)
TEST_LOGGER.debug(f"logged df: {df.shape} ")
summary = profile.flat_summary()
TEST_LOGGER.debug(f"logged summary: {summary} ")
flat_summary = summary["summary"]
TEST_LOGGER.debug(f"logged flat_summary: {flat_summary} ")
assert len(flat_summary) == 1


def test_session_profile_negative_to_zero():
df = pd.DataFrame(range(-1000, 1), columns=["negative_to_zero"])
session = session_from_config(SessionConfig("default-project", "default-pipeline", [], False))
profile = session.log_dataframe(df)
TEST_LOGGER.debug(f"logged df: {df.shape} ")
summary = profile.flat_summary()
TEST_LOGGER.debug(f"logged summary: {summary} ")
flat_summary = summary["summary"]
TEST_LOGGER.debug(f"logged flat_summary: {flat_summary} ")
assert len(flat_summary) == 1


def test_session_profile_all_close_to_zero():
df = pd.DataFrame(np.arange(-1.00000001e-20, -1e-20, 1e-30), columns=["close_to_zero"])
session = session_from_config(SessionConfig("default-project", "default-pipeline", [], False))
profile = session.log_dataframe(df)
TEST_LOGGER.debug(f"logged df: {df.shape} ")
summary = profile.flat_summary()
TEST_LOGGER.debug(f"logged summary: {summary} ")
flat_summary = summary["summary"]
TEST_LOGGER.debug(f"logged flat_summary: {flat_summary} ")
assert len(flat_summary) == 1
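
Reviewer note (rough arithmetic, not part of the commit): this test probes the opposite extreme from the big-integer cases. Near 1e-20 the float32 spacing is about 8.1e-28, and the right-edge epsilon (abs(end) * 1e-7 ≈ 1e-27) dominates the 1e-28 data range, so by the formula in _calculate_bins the histogram should collapse to a single bucket rather than emit colliding edges:

import math

import numpy as np

mantissa_bits = np.finfo(np.float32).nmant               # 23
magnitude = 1.00000001e-20
min_interval = 2.0 ** (math.floor(math.log2(magnitude)) - mantissa_bits)
span = 1e-28 + 1e-20 * 1e-7                              # data range plus the right-edge epsilon
print(min_interval)                                      # ~8.08e-28
print(max(math.floor(span / min_interval), 1))           # 1 -- a single bucket survives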


def test_session_profile_two_column(df_two_int_col):
TEST_LOGGER.debug(f"About to log {df_two_int_col.describe()} with columns {df_two_int_col.columns}")
session = session_from_config(SessionConfig("default-project", "default-pipeline", [], False))
profile = session.log_dataframe(df_two_int_col)
assert profile is not None

summary = profile.flat_summary()

flat_summary = summary["summary"]
assert len(flat_summary) == 2


def test_profile_df(df):
import datetime

