Skip to content

Commit

Permalink
Merge branch 'whylabs:mainline' into segbug
Browse files Browse the repository at this point in the history
  • Loading branch information
FelipeAdachi committed Apr 13, 2022
2 parents f4e1e12 + 04f3c31 commit 0f9035f
Show file tree
Hide file tree
Showing 11 changed files with 163 additions and 33 deletions.
4 changes: 2 additions & 2 deletions .bumpversion.cfg
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
[bumpversion]
current_version = 0.7.2
current_version = 0.7.3
tag = False
parse = (?P<major>\d+)\.(?P<minor>\d+)\.(?P<patch>\d+)(\-(?P<release>[a-z]+)(?P<build>\d+))?
serialize =
Expand Down Expand Up @@ -27,4 +27,4 @@ replace = version := {new_version}

[bumpversion:file:pyproject.toml]
search = version = "{current_version}"
replace = version = "{new_version}"
replace = version = "{new_version}"
9 changes: 4 additions & 5 deletions .github/workflows/python-continuous-integration.yml
Original file line number Diff line number Diff line change
Expand Up @@ -32,13 +32,12 @@ jobs:
with:
version: '3.19.4'
repo-token: ${{ secrets.GITHUB_TOKEN }}
- name: Cache Python dependencies
- name: Set up Poetry cache for Python dependencies
uses: actions/cache@v2
with:
path: ~/.cache/pip
key: ${{ runner.os }}-pip-${{ hashFiles('**/requirements-dev.txt') }}
restore-keys: |
${{ runner.os }}-pip-
path: ~/.cache/pypoetry
key: ${{ runner.os }}-poetry-${{ hashFiles('**/poetry.lock') }}
restore-keys: ${{ runner.os }}-poetry-
- name: Install dependencies
run: make install
- name: Run build, style, and lint checks
Expand Down
3 changes: 2 additions & 1 deletion Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@ src.python.pyc := $(shell find ./src -type f -name "*.pyc")
src.proto.dir := ./proto/src
src.proto := $(shell find $(src.proto.dir) -type f -name "*.proto")

version := 0.7.2
version := 0.7.3

dist.dir := dist
egg.dir := .eggs
Expand Down Expand Up @@ -140,6 +140,7 @@ test-notebooks: ## Run tests for the notebooks

# Each recipe line runs in its own shell, so the Poetry env file (when it
# exists) is sourced on the same line that invokes poetry. The previous
# `$(source HOME/.poetry/env)` was expanded by make as a reference to an
# undefined variable, i.e. an empty recipe line that had no effect.
install: ## Install all dependencies with poetry.
	@$(call i, Installing dependencies)
	if [ -f "$$HOME/.poetry/env" ]; then . "$$HOME/.poetry/env"; fi; poetry install

coverage: ## Generate test coverage reports.
Expand Down
2 changes: 1 addition & 1 deletion docs/conf.py
Original file line number Diff line number Diff line change
Expand Up @@ -101,7 +101,7 @@
# built documents.
#
# The short X.Y version.
version = "0.7.2"
version = "0.7.3"
# The full version, including alpha/beta/rc tags.
release = "" # Is set by calling `setup.py docs`

Expand Down
9 changes: 1 addition & 8 deletions docs/getting_started.rst
Original file line number Diff line number Diff line change
Expand Up @@ -18,13 +18,6 @@ Install our library in a Python 3.6+ environment.
pip install whylogs
Demo CLI
#######################
Our demo CLI will walk you through setting up a whylogs project.

.. code-block:: bash
whylogs-demo init
Configuration
===================
Expand Down Expand Up @@ -79,7 +72,7 @@ Loggers log statistical information about your data. They have the following par

For more information, see the `documentation <https://whylogs.readthedocs.io/en/latest/autoapi/whylogs/app/logger/index.html>`_ for the logger class.

`This example code <https://whylogs.readthedocs.io/en/latest/auto_examples/configure_logger.html>`_ uses logger options to control the output location.
`This example code <https://github.com/whylabs/whylogs/blob/mainline/examples/configure_logger.py>`_ uses logger options to control the output location.

Configure a Writer
########################
Expand Down
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[tool.poetry]
name = "whylogs"
version = "0.7.2"
version = "0.7.3"
description = "Profile and monitor your ML data pipeline end-to-end"
authors = ["WhyLabs.ai <support@whylabs.ai>"]
license = "Apache-2.0"
Expand Down
2 changes: 1 addition & 1 deletion src/whylogs/_version.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,3 @@
"""WhyLabs version number."""

__version__ = "0.7.2"
__version__ = "0.7.3"
50 changes: 43 additions & 7 deletions src/whylogs/core/summaryconverters.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
Library module defining function for generating summaries
"""
import math
from logging import getLogger
from typing import Union

import datasketches
Expand All @@ -28,6 +29,8 @@
HIST_AVG_NUMBER_PER_BUCKET = 4.0
QUANTILES = [0.0, 0.01, 0.05, 0.25, 0.5, 0.75, 0.95, 0.99, 1.0]

logger = getLogger(__name__)


def from_sketch(sketch: update_theta_sketch, num_std_devs: float = 1):
"""
Expand Down Expand Up @@ -118,6 +121,45 @@ def single_quantile_from_sketch(sketch: kll_floats_sketch, quantile: float):
return type("Object", (), {"quantile": qval[0]})


def _calculate_bins(end: float, start: float, n: int, avg_per_bucket: float, max_buckets: int):
    """
    Compute evenly spaced histogram bin edges that stay distinct as float32.

    Parameters
    ----------
    end : float
        Upper bound of the range. It is increased by a relative 1e-7 so the
        maximum value falls inside the right-most bin.
    start : float
        Lower bound of the range.
    n : int
        Item count; together with ``avg_per_bucket`` it sets the bucket count.
    avg_per_bucket : float
        Target average number of items per bucket.
    max_buckets : int
        Upper limit on the number of buckets.

    Returns
    -------
    bins : list of float
        ``n_buckets + 1`` edges, starting at ``start`` and spaced by the bin width.
    end : float
        The adjusted upper bound.
    start : float
        The lower bound, unchanged.
    """
    # Include the max value in the right-most bin
    end += abs(end) * 1e-7
    max_magnitude = max(abs(end), abs(start))

    # Always keep at least one bucket so the width below is well defined
    n_buckets = max(min(math.ceil(n / avg_per_bucket), max_buckets), 1)
    width = (end - start) / n_buckets

    # kll_floats_sketch uses 32-bit floats, so precision is checked against np.float32
    float_mantissa_bits = np.finfo(np.float32).nmant

    # min_interval is the smallest difference between floats at the scale of
    # the bin boundaries. log2 is undefined at 0, so a degenerate all-zero
    # range is treated as having no precision constraint.
    if max_magnitude > 0:
        log_min_interval = math.floor(math.log2(max_magnitude)) - float_mantissa_bits
        min_interval = math.pow(2, log_min_interval)
    else:
        min_interval = 0.0

    # If the bin width is smaller than min_interval, we need bigger (fewer) bins
    if width < min_interval:
        resized_buckets = max(math.floor((end - start) / min_interval), 1)
        # Report min_interval unrounded: it is below 1 for small-magnitude data,
        # and truncating it to an int would print a misleading 0.
        logger.warning(
            "A bin width of %s won't work with values in range of [%s, %s] "
            "because numbers closer to each other than %s might not be distinct "
            "when passed as float32: avoiding bin edge collisions by resizing from: "
            "%s to: %s histogram buckets in summary.",
            width,
            start,
            end,
            min_interval,
            n_buckets,
            resized_buckets,
        )
        n_buckets = resized_buckets
        width = (end - start) / n_buckets
        logger.info("New bin width is: %s across %s buckets", width, n_buckets)

    # Bin edges later passed to the sketch's Probability Mass Function
    bins = [start + i * width for i in range(n_buckets + 1)]
    logger.debug(
        "about to get pmf using start: %s end:%s width:%s and n_buckets:%s",
        start,
        end,
        width,
        n_buckets,
    )
    logger.debug("bin: %s", bins)
    return bins, end, start


def histogram_from_sketch(sketch: kll_floats_sketch, max_buckets: int = None, avg_per_bucket: int = None):
"""
Generate a summary of a kll_floats_sketch, including a histogram
Expand Down Expand Up @@ -151,13 +193,7 @@ def histogram_from_sketch(sketch: kll_floats_sketch, max_buckets: int = None, av
bins = [start, end]
counts = [n]
else:
# Include the max value in the right-most bin
end += abs(end) * 1e-7
# Include the right edge in the bin edges
n_buckets = min(math.ceil(n / avg_per_bucket), max_buckets)
width = (end - start) / n_buckets
# Calculate histograms from the Probability Mass Function
bins = [start + i * width for i in range(n_buckets + 1)]
bins, end, start = _calculate_bins(end, start, n, avg_per_bucket, max_buckets)
pmf = sketch.get_pmf(bins)
counts = [round(p * n) for p in pmf]
counts = counts[1:-1]
Expand Down
22 changes: 16 additions & 6 deletions src/whylogs/viewer/templates/index-hbs-cdn-all-in.html
Original file line number Diff line number Diff line change
Expand Up @@ -1776,6 +1776,10 @@ <h1 class="no-responsive__title">Hold on! :)</h1>
return svgEl._groups[0][0].outerHTML;
}

// Returns an array of `size` consecutive integers, beginning at `startAt` (default 0).
function range_arr(size, startAt = 0) {
  const indices = Array.from(Array(size).keys());
  return indices.map((offset) => offset + startAt);
}

function generateBarChart(currentWidth, key, datas, referenceData) {
let histogramData = [],
overlappedHistogramData = [];
Expand All @@ -1786,10 +1790,13 @@ <h1 class="no-responsive__title">Hold on! :)</h1>
}
let yFormat,
xFormat;
const data = histogramData.map((profile, index) => {


minArray = range_arr(Math.min(histogramData.length,overlappedHistogramData.length))
const data = minArray.map((profile, index) => {
return {
group: index,
profile: profile.axisY,
profile: histogramData[index].axisY,
reference_profile: overlappedHistogramData[index].axisY
}
}).slice(0, 20)
Expand Down Expand Up @@ -1885,10 +1892,13 @@ <h1 class="no-responsive__title">Hold on! :)</h1>
histogramData = chartData(datas)
overlappedHistogramData = chartData(referenceData.columns[key.data.key])
}
const data = histogramData.map((value, index) => {
const difference = value.axisY - overlappedHistogramData[index].axisY

minArray = range_arr(Math.min(histogramData.length,overlappedHistogramData.length))

const data = minArray.map((value, index) => {
const difference = histogramData[index].axisY - overlappedHistogramData[index].axisY
const negativeValues = difference < 0 ? difference : 0
return [+value.axisY, negativeValues]
return [+histogramData[index].axisY, negativeValues]
}).slice(0, 20).flat()

let yFormat,
Expand Down Expand Up @@ -2666,7 +2676,7 @@ <h1 class="no-responsive__title">Hold on! :)</h1>
${frequentItemBoxElement('',chipElementTableData(items[item].value))}
`
referenceFrequentItemString += `
${frequentItemBoxElement('',chipElementTableData(referenceItems[item].value))}
${frequentItemBoxElement('',chipElementTableData(referenceItems[item]?.value ?? ''))}
`
}
);
Expand Down
10 changes: 10 additions & 0 deletions tests/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -80,6 +80,16 @@ def df():
return pd.DataFrame((np.random.rand(30, 4) - 0.5) * 3, columns=["A", "B", "C", "D"])


@pytest.fixture(scope="session")
def df_single():
    """Session-scoped frame with one ``id`` column of 100 random large int64 values."""
    # The bounds exceed the int32 range, so request int64 from randint explicitly
    # rather than relying on the platform's default integer width.
    values = np.random.randint(low=6092996828, high=6093000284, size=(100, 1), dtype=np.int64)
    return pd.DataFrame(values, columns=["id"], dtype=np.int64)


@pytest.fixture(scope="session")
def df_two_int_col():
    """Session-scoped frame with columns ``id`` and ``A``, each 100 random large int64 values."""
    # The bounds exceed the int32 range, so request int64 from randint explicitly
    # rather than relying on the platform's default integer width.
    values = np.random.randint(low=6092996828, high=6093000284, size=(100, 2), dtype=np.int64)
    return pd.DataFrame(values, columns=["id", "A"])


@pytest.fixture(scope="session")
def test_data_path():

Expand Down
83 changes: 82 additions & 1 deletion tests/unit/app/test_session.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,7 @@
from logging import getLogger

import numpy as np
import pandas as pd
import pytest

from whylogs.app.config import SessionConfig
Expand All @@ -9,6 +13,8 @@
session_from_config,
)

TEST_LOGGER = getLogger(__name__)


def test_get_global_session():
session = get_or_create_session()
Expand All @@ -27,7 +33,6 @@ def test_reset():


def test_session_log_dataframe(df):
pass

session = session_from_config(SessionConfig("default-project", "default-pipeline", [], False))
session.log_dataframe(df)
Expand All @@ -49,6 +54,82 @@ def test_session_profile(df):
assert len(flat_summary) == 4


def test_session_profile_single_column(df_single):
    """Profile one column of 1000 consecutive large integers; expect one summary entry."""
    TEST_LOGGER.debug(f"About to log {df_single.describe()} with columns {df_single.columns}")
    # The df_single fixture is only logged above; the profiled frame is built inline
    # and logged through the global session.
    global_session = get_or_create_session()
    large_ints = pd.DataFrame(range(22335544310, 22335545310), columns=["A"])
    logged_profile = global_session.log_dataframe(large_ints)
    assert logged_profile is not None

    column_entries = logged_profile.flat_summary()["summary"]
    assert len(column_entries) == 1


def test_session_profile_small(df_single):
    """Log the first 1 to 4 rows of ``df_single``; each profile must have one summary entry."""
    TEST_LOGGER.debug(f"About to log {df_single.describe()} with columns {df_single.columns}")
    config = SessionConfig("default-project", "default-pipeline", [], False)
    small_session = session_from_config(config)
    for i in range(1, 5):
        first_rows = df_single.head(i)
        profile = small_session.log_dataframe(first_rows)
        assert profile is not None

        flat_summary = profile.flat_summary()["summary"]
        TEST_LOGGER.info(f"logged {i} rows and summary is {flat_summary}")
        assert len(flat_summary) == 1


def test_session_profile_negative_ints():
    """Profile 2000 consecutive large negative integers; expect one summary entry."""
    config = SessionConfig("default-project", "default-pipeline", [], False)
    session = session_from_config(config)
    df = pd.DataFrame(range(-22335544310, -22335542310), columns=["negative"])

    profile = session.log_dataframe(df)
    TEST_LOGGER.debug(f"logged df: {df.shape} ")

    summary = profile.flat_summary()
    TEST_LOGGER.debug(f"logged summary: {summary} ")

    flat_summary = summary["summary"]
    TEST_LOGGER.debug(f"logged flat_summary: {flat_summary} ")
    assert len(flat_summary) == 1


def test_session_profile_negative_to_zero():
    """Profile the integers from -1000 up to and including 0; expect one summary entry."""
    config = SessionConfig("default-project", "default-pipeline", [], False)
    session = session_from_config(config)
    df = pd.DataFrame(range(-1000, 1), columns=["negative_to_zero"])

    profile = session.log_dataframe(df)
    TEST_LOGGER.debug(f"logged df: {df.shape} ")

    summary = profile.flat_summary()
    TEST_LOGGER.debug(f"logged summary: {summary} ")

    flat_summary = summary["summary"]
    TEST_LOGGER.debug(f"logged flat_summary: {flat_summary} ")
    assert len(flat_summary) == 1


def test_session_profile_all_close_to_zero():
    """Profile tiny negative floats packed tightly around -1e-20; expect one summary entry."""
    config = SessionConfig("default-project", "default-pipeline", [], False)
    session = session_from_config(config)
    df = pd.DataFrame(np.arange(-1.00000001e-20, -1e-20, 1e-30), columns=["close_to_zero"])

    profile = session.log_dataframe(df)
    TEST_LOGGER.debug(f"logged df: {df.shape} ")

    summary = profile.flat_summary()
    TEST_LOGGER.debug(f"logged summary: {summary} ")

    flat_summary = summary["summary"]
    TEST_LOGGER.debug(f"logged flat_summary: {flat_summary} ")
    assert len(flat_summary) == 1


def test_session_profile_two_column(df_two_int_col):
    """Profile the two-column integer fixture; expect two summary entries."""
    TEST_LOGGER.debug(f"About to log {df_two_int_col.describe()} with columns {df_two_int_col.columns}")
    config = SessionConfig("default-project", "default-pipeline", [], False)
    two_col_session = session_from_config(config)
    logged_profile = two_col_session.log_dataframe(df_two_int_col)
    assert logged_profile is not None

    column_entries = logged_profile.flat_summary()["summary"]
    assert len(column_entries) == 2


def test_profile_df(df):
import datetime

Expand Down

0 comments on commit 0f9035f

Please sign in to comment.