Skip to content

Commit

Permalink
Merge pull request #117 from whylabs/WHY-2051
Browse files Browse the repository at this point in the history
track None datatypes as null
  • Loading branch information
lalmei committed Jan 5, 2021
2 parents 457130f + 2f10658 commit 242db7f
Show file tree
Hide file tree
Showing 2 changed files with 37 additions and 10 deletions.
32 changes: 22 additions & 10 deletions src/whylogs/core/datasetprofile.py
Original file line number Diff line number Diff line change
Expand Up @@ -64,7 +64,8 @@
),
),
schema=OrderedDict(
inferred_type=OrderedDict(type="inferred_dtype", ratio="dtype_fraction"),
inferred_type=OrderedDict(
type="inferred_dtype", ratio="dtype_fraction"),
type_counts=TYPENUM_COLUMN_NAMES,
),
string_summary=OrderedDict(
Expand Down Expand Up @@ -130,7 +131,7 @@ def __init__(
self._tags = dict(tags)
self._metadata = metadata.copy()
self.columns = columns

# Store Name attribute
self._tags["name"] = name

Expand Down Expand Up @@ -179,19 +180,26 @@ def track(self, columns, data=None):
Value to track. Specify if `columns` is a string.
"""
if data is not None:
if type(columns)!= str:
if type(columns) != str:
raise TypeError("Unambigious column to data mapping")
self.track_datum(columns, data)
else:
for column_name, data in columns.items():
self.track_datum(column_name, data)
if isinstance(columns, dict):
for column_name, data in columns.items():
self.track_datum(column_name, data)
elif isinstance(columns, str):
self.track_datum(columns, None)
else:
raise TypeError(" Data type of: {} not supported for tracking ".format(
columns.__class__.__name__))

def track_datum(self, column_name, data):
    """
    Track a single value for one column.

    The profile for ``column_name`` is looked up in ``self.columns``. When
    the lookup raises ``KeyError``, a new ``ColumnProfile`` is created for
    that name and stored in ``self.columns`` before the value is tracked.

    Parameters
    ----------
    column_name
        Key identifying the column in ``self.columns``.
    data
        Value handed unchanged to the column profile's ``track`` method.
    """
    try:
        column_profile = self.columns[column_name]
    except KeyError:
        # First value seen for this column: create and register its profile.
        column_profile = ColumnProfile(column_name)
        self.columns[column_name] = column_profile

    column_profile.track(data)

def track_array(self, x: np.ndarray, columns=None):
Expand Down Expand Up @@ -229,6 +237,7 @@ def track_dataframe(self, df: pd.DataFrame):
col_str = str(col)
x = df[col].values
for xi in x:

self.track(col_str, xi)

def to_properties(self):
Expand Down Expand Up @@ -303,7 +312,8 @@ def chunk_iterator(self):
properties = self.to_properties()

yield MessageSegment(
marker=marker, metadata=DatasetMetadataSegment(properties=properties,)
marker=marker, metadata=DatasetMetadataSegment(
properties=properties,)
)

chunked_columns = self._column_message_iterator()
Expand Down Expand Up @@ -348,7 +358,8 @@ def merge(self, other):
return self._do_merge(other)

def _do_merge(self, other):
columns_set = set(list(self.columns.keys()) + list(other.columns.keys()))
columns_set = set(list(self.columns.keys()) +
list(other.columns.keys()))
columns = {}
for col_name in columns_set:
empty_column = ColumnProfile(col_name)
Expand Down Expand Up @@ -456,7 +467,7 @@ def read_protobuf(protobuf_path: str, delimited_file: bool = True):
else:
msg_len = len(data)
new_pos = 0
msg_buf = data[new_pos : new_pos + msg_len]
msg_buf = data[new_pos: new_pos + msg_len]
return DatasetProfile.from_protobuf_string(msg_buf)

@staticmethod
Expand Down Expand Up @@ -533,7 +544,7 @@ def parse_delimited_single(data: bytes, pos=0):
"""
msg_len, new_pos = _DecodeVarint32(data, pos)
pos = new_pos
msg_buf = data[pos : pos + msg_len]
msg_buf = data[pos: pos + msg_len]
pos += msg_len
profile = DatasetProfile.from_protobuf_string(msg_buf)
return pos, profile
Expand Down Expand Up @@ -708,7 +719,8 @@ def flatten_dataset_frequent_strings(dataset_summary: DatasetSummary):

for col_name, col in dataset_summary.columns.items():
try:
item_summary = getter(getter(col, "string_summary"), "frequent").items
item_summary = getter(
getter(col, "string_summary"), "frequent").items
items = {}
for item in item_summary:
items[item.value] = int(item.estimate)
Expand Down
15 changes: 15 additions & 0 deletions tests/unit/core/test_datasetprofile_datatypes.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@


from whylogs.core.datasetprofile import DatasetProfile


def test_track_null_item():
    """Tracking ``None`` registers the column and increments its null count."""
    # Smoke check: tracking a regular (non-null) value must not raise.
    # Kept on a throwaway profile so it cannot influence the counts below.
    DatasetProfile("name").track("column_name", 1)

    prof = DatasetProfile("name")
    prof.track("column_name", None)
    summary = prof.flat_summary()["summary"]
    assert summary["column"][0] == "column_name"
    assert summary["null_count"][0] == 1

    # A second null for the same column is added to the existing count.
    prof.track("column_name", None)
    summary = prof.flat_summary()["summary"]
    assert summary["column"][0] == "column_name"
    assert summary["null_count"][0] == 2

0 comments on commit 242db7f

Please sign in to comment.