Skip to content

Commit

Permalink
Merge 37c250f into 73abc56
Browse files Browse the repository at this point in the history
  • Loading branch information
andyndang committed Feb 24, 2021
2 parents 73abc56 + 37c250f commit 9e2f035
Show file tree
Hide file tree
Showing 6 changed files with 66 additions and 20 deletions.
2 changes: 1 addition & 1 deletion .bumpversion.cfg
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
[bumpversion]
current_version = 0.3.3-dev1
current_version = 0.3.3-dev2
commit = True
tag = False
parse = (?P<major>\d+)\.(?P<minor>\d+)\.(?P<patch>\d+)(\-(?P<release>[a-z]+)(?P<build>\d+))?
Expand Down
2 changes: 1 addition & 1 deletion docs/conf.py
Original file line number Diff line number Diff line change
Expand Up @@ -119,7 +119,7 @@ def setup(app):
# built documents.
#
# The short X.Y version.
version = "0.3.3-dev1"
version = "0.3.3-dev2"
# The full version, including alpha/beta/rc tags.
release = "" # Is set by calling `setup.py docs`

Expand Down
2 changes: 1 addition & 1 deletion setup.cfg
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@

[metadata]
name = whylogs
version = 0.3.3-dev1
version = 0.3.3-dev2
description = Profile and monitor your ML data pipeline end-to-end
author = WhyLabs.ai
author-email = support@whylabs.ai
Expand Down
2 changes: 1 addition & 1 deletion src/whylogs/_version.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,3 @@
"""WhyLabs version number."""

__version__ = "0.3.3-dev1"
__version__ = "0.3.3-dev2"
37 changes: 24 additions & 13 deletions src/whylogs/core/columnprofile.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,10 +6,9 @@
from whylogs.core.statistics.hllsketch import HllSketch
from whylogs.core.statistics.constraints import ValueConstraints, SummaryConstraints, SummaryConstraint
from whylogs.core.types import TypedDataConverter
from whylogs.proto import ColumnMessage, ColumnSummary, InferredType, Op
from whylogs.proto import ColumnMessage, ColumnSummary, InferredType, Op, UniqueCountSummary
from whylogs.util.dsketch import FrequentItemsSketch


import pandas as pd

_TYPES = InferredType.Type
Expand Down Expand Up @@ -48,15 +47,15 @@ class ColumnProfile:
"""

def __init__(
self,
name: str,
number_tracker: NumberTracker = None,
string_tracker: StringTracker = None,
schema_tracker: SchemaTracker = None,
counters: CountersTracker = None,
frequent_items: FrequentItemsSketch = None,
cardinality_tracker: HllSketch = None,
constraints: ValueConstraints = None,
self,
name: str,
number_tracker: NumberTracker = None,
string_tracker: StringTracker = None,
schema_tracker: SchemaTracker = None,
counters: CountersTracker = None,
frequent_items: FrequentItemsSketch = None,
cardinality_tracker: HllSketch = None,
constraints: ValueConstraints = None,
):
# Handle default values
if counters is None:
Expand Down Expand Up @@ -113,6 +112,19 @@ def track(self, value):

self.constraints.update(typed_data)

def _unique_count_summary(self) -> UniqueCountSummary:
    """
    Build the unique-count summary for this column.

    The summary produced by ``cardinality_tracker`` is preferred.  When
    that call yields a falsy result, the summary is taken from the theta
    sketch of the tracker matching the column's inferred type instead.

    Returns
    -------
    summary : UniqueCountSummary
        Summary from the cardinality tracker if it is truthy; otherwise
        the theta-sketch summary of the string tracker (inferred type is
        STRING) or of the number tracker (any other inferred type).
    """
    cardinality_summary = self.cardinality_tracker.to_summary(_UNIQUE_COUNT_BOUNDS_STD)
    if cardinality_summary:
        return cardinality_summary

    # Fallback path: pick the theta sketch that corresponds to the
    # inferred column type.  No diagnostic output is written here --
    # library code must not print to stdout while summarizing.
    inferred_type = self.schema_tracker.infer_type()
    if inferred_type.type == _TYPES.STRING:
        return self.string_tracker.theta_sketch.to_summary()

    # Every non-string inferred type falls back to the number tracker.
    return self.number_tracker.theta_sketch.to_summary()

def to_summary(self):
"""
Generate a summary of the statistics
Expand All @@ -129,8 +141,7 @@ def to_summary(self):
opts = dict(
counters=self.counters.to_protobuf(),
frequent_items=self.frequent_items.to_summary(),
unique_count=self.cardinality_tracker.to_summary(
_UNIQUE_COUNT_BOUNDS_STD),
unique_count=self._unique_count_summary(),
)
if self.string_tracker is not None and self.string_tracker.count > 0:
opts["string_summary"] = self.string_tracker.to_summary()
Expand Down
41 changes: 38 additions & 3 deletions tests/unit/core/test_columnprofile.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,13 +4,13 @@
from testutil import compare_frequent_items

from whylogs.core import ColumnProfile
from whylogs.core.statistics.hllsketch import HllSketch
from whylogs.util.protobuf import message_to_dict
import numpy as np
import pandas as pd


def test_all_numeric_types_get_tracked_by_number_tracker():

all_values = [
[1.0, 2.0, 3.0],
[1, 2, 3],
Expand Down Expand Up @@ -118,8 +118,10 @@ def test_summary():
"mean": 2.0,
"stddev": 1.0,
"isDiscrete": False,
"histogram": {"start": 1.0, "end": 3.0000003, "counts": ["3"], "max": 3.0, "min": 1.0, "bins": [1.0, 3.0000003], "n": "3", "width": 0.0, },
"quantiles": {"quantiles": [0.0, 0.01, 0.05, 0.25, 0.5, 0.75, 0.95, 0.99, 1.0], "quantileValues": [1.0, 1.0, 1.0, 1.0, 2.0, 3.0, 3.0, 3.0, 3.0], },
"histogram": {"start": 1.0, "end": 3.0000003, "counts": ["3"], "max": 3.0, "min": 1.0,
"bins": [1.0, 3.0000003], "n": "3", "width": 0.0, },
"quantiles": {"quantiles": [0.0, 0.01, 0.05, 0.25, 0.5, 0.75, 0.95, 0.99, 1.0],
"quantileValues": [1.0, 1.0, 1.0, 1.0, 2.0, 3.0, 3.0, 3.0, 3.0], },
"uniqueCount": {"estimate": 3.0, "upper": 3.0, "lower": 3.0},
},
}
Expand Down Expand Up @@ -179,3 +181,36 @@ def test_merge():
assert merged.number_tracker.ints.count == 0
assert merged.number_tracker.floats.count == 4
assert merged.string_tracker.count == 2


def test_fallback_number_counter():
    """
    With mostly numeric input and a replaced cardinality tracker, the
    column-level unique count is expected to equal the number summary's.
    """
    profile = ColumnProfile("test")
    for item in (1, 1.0, 2, 3, 4, 5, 6, 6.0, "text"):
        profile.track(item)

    # Swap in a fresh sketch that has seen none of the tracked values.
    profile.cardinality_tracker = HllSketch()

    result = profile.to_summary()
    expected = result.number_summary.unique_count.estimate
    assert result.unique_count.estimate == expected


def test_fallback_string_counter():
    """
    With mostly string input and a replaced cardinality tracker, the
    column-level unique count is expected to equal the string summary's.
    """
    profile = ColumnProfile("test")
    for item in ("a", "b", "c", "d", "e", "f", 1.0, 2.0):
        profile.track(item)

    # Swap in a fresh sketch that has seen none of the tracked values.
    profile.cardinality_tracker = HllSketch()

    result = profile.to_summary()
    expected = result.string_summary.unique_count.estimate
    assert result.unique_count.estimate == expected


def test_fallback_fallbacks_to_number_counter():
    """
    With an even split of strings and floats and a replaced cardinality
    tracker, the column-level unique count is expected to equal the
    number summary's.
    """
    profile = ColumnProfile("test")
    for item in ("a", "b", 1.0, 2.0):
        profile.track(item)

    # Swap in a fresh sketch that has seen none of the tracked values.
    profile.cardinality_tracker = HllSketch()

    result = profile.to_summary()
    expected = result.number_summary.unique_count.estimate
    assert result.unique_count.estimate == expected

0 comments on commit 9e2f035

Please sign in to comment.