Skip to content

Commit

Permalink
Merge branch 'mainline' into update_release_version
Browse files Browse the repository at this point in the history
  • Loading branch information
jamie256 committed Jan 26, 2022
2 parents bcd5315 + 4271ed9 commit d76b8ed
Show file tree
Hide file tree
Showing 7 changed files with 871 additions and 19 deletions.
21 changes: 21 additions & 0 deletions proto/src/constraints.proto
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,8 @@ enum Op {
APPLY_FUNC = 13;
IN = 14;
CONTAIN = 15;
NOT_IN = 16;
SUM = 17;
}

/* Summary constraints specify a relationship between a summary field and a literal value,
Expand Down Expand Up @@ -95,8 +97,26 @@ message ValueConstraintMsg {
bool verbose = 4;
}

/* Serialized form of a value constraint that involves more than one column.
 * NOTE(review): the semantics described below are inferred from the field
 * names only -- confirm against the code that reads and writes this message.
 *
 * Field numbers are not in declaration order (1, 2, 3, 6, 7, 8, 4, 5, 9).
 * They are part of the wire format and must not be renumbered.
 */
message MultiColumnValueConstraintMsg {
// Name of the constraint.
string name = 1;
// The dependent side of the constraint: either a list of columns or a single
// column. As members of a oneof, at most one of the two is set.
oneof dependent {
google.protobuf.ListValue dependent_columns = 2;
string dependent_column = 3;
}

// The reference side of the constraint: a single number, a list of values,
// or a list of columns. As members of a oneof, at most one of the three is set.
oneof reference {
double value = 6;
google.protobuf.ListValue value_set = 7;
google.protobuf.ListValue reference_columns = 8;
}
// Operator of the constraint, taken from the Op enum.
Op op = 4;
// Verbosity flag for this constraint.
bool verbose = 5;
// NOTE(review): presumably an operator (e.g. SUM) applied across
// dependent_columns before `op` is evaluated -- confirm.
Op internal_dependent_columns_op = 9;
}

/* A collection of value constraints: zero or more ValueConstraintMsg entries
 * and zero or more MultiColumnValueConstraintMsg entries.
 */
message ValueConstraintMsgs {
repeated ValueConstraintMsg constraints = 1;
repeated MultiColumnValueConstraintMsg multi_column_constraints = 2;
}

message SummaryConstraintMsgs {
Expand All @@ -108,5 +128,6 @@ message DatasetConstraintMsg {
map<string, ValueConstraintMsgs> value_constraints = 2;
map<string, SummaryConstraintMsgs> summary_constraints = 3;
SummaryConstraintMsgs table_shape_constraints = 4;
map<string, ValueConstraintMsgs> multi_column_value_constraints = 5;
}

3 changes: 2 additions & 1 deletion src/whylogs/core/__init__.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,12 @@
from .annotation_profiling import BB_ATTRIBUTES, TrackBB
from .columnprofile import ColumnProfile
from .columnprofile import ColumnProfile, MultiColumnProfile
from .datasetprofile import DatasetProfile
from .image_profiling import _METADATA_DEFAULT_ATTRIBUTES as METADATA_DEFAULT_ATTRIBUTES
from .image_profiling import TrackImage

__ALL__ = [
ColumnProfile,
MultiColumnProfile,
DatasetProfile,
TrackImage,
METADATA_DEFAULT_ATTRIBUTES,
Expand Down
86 changes: 86 additions & 0 deletions src/whylogs/core/columnprofile.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
StringTracker,
)
from whylogs.core.statistics.constraints import (
MultiColumnValueConstraints,
SummaryConstraint,
SummaryConstraints,
ValueConstraints,
Expand Down Expand Up @@ -250,3 +251,88 @@ def from_protobuf(message):
frequent_items=FrequentItemsSketch.from_protobuf(message.frequent_items),
cardinality_tracker=HllSketch.from_protobuf(message.cardinality_tracker),
)


class MultiColumnProfile:
    """
    Statistics tracking across multiple columns (i.e. several features).

    For now the profile only holds multi-column value constraints and
    forwards every tracked record to them. Summaries and protobuf
    (de)serialization are not implemented yet.

    Parameters
    ----------
    constraints : MultiColumnValueConstraints
        Static assertions to be applied to data tracked between all columns.
        When omitted (or falsy) a new, empty ``MultiColumnValueConstraints``
        is created.
    """

    def __init__(
        self,
        constraints: MultiColumnValueConstraints = None,
    ):
        # Fall back to an empty constraint set when none (or a falsy one)
        # was supplied.
        if not constraints:
            constraints = MultiColumnValueConstraints()
        self.constraints = constraints

    def track(self, column_dict, character_list=None, token_method=None):
        """
        Pass one record to the multi-column constraints.

        TODO: Add `column_dict` to tracking statistics.

        Parameters
        ----------
        column_dict
            The record to check; handed unchanged to ``constraints.update``
            and then to ``constraints.update_typed``.
        character_list
            Accepted but not used by this method.
        token_method
            Accepted but not used by this method.
        """
        # TODO: update the MultiColumnTrackers code
        checker = self.constraints
        checker.update(column_dict)
        checker.update_typed(column_dict)

    def to_summary(self):
        """
        Generate a summary of the statistics.

        Not implemented yet.

        Raises
        ------
        NotImplementedError
            Always.
        """
        # TODO: summaries for the multi column trackers and statistics
        raise NotImplementedError()

    def merge(self, other) -> "MultiColumnProfile":
        """
        Merge this multi column profile with another.

        Parameters
        ----------
        other : MultiColumnProfile

        Returns
        -------
        merged : MultiColumnProfile
            A new profile built from the merge of both constraint sets.
        """
        merged_constraints = self.constraints.merge(other.constraints)
        return MultiColumnProfile(merged_constraints)

    def to_protobuf(self):
        """
        Return the object serialized as a protobuf message.

        Not implemented yet.

        Raises
        ------
        NotImplementedError
            Always.
        """
        # TODO: implement new type of multicolumn message
        raise NotImplementedError()

    @staticmethod
    def from_protobuf(message):
        """
        Load a multi column profile from a protobuf message.

        Not implemented yet.

        Raises
        ------
        NotImplementedError
            Always.
        """
        # TODO: implement new type of multicolumn message
        raise NotImplementedError()
38 changes: 28 additions & 10 deletions src/whylogs/core/datasetprofile.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,10 +13,14 @@
from google.protobuf.internal.encoder import _VarintBytes
from smart_open import open

from whylogs.core import ColumnProfile
from whylogs.core import ColumnProfile, MultiColumnProfile
from whylogs.core.flatten_datasetprofile import flatten_summary
from whylogs.core.model_profile import ModelProfile
from whylogs.core.statistics.constraints import DatasetConstraints, SummaryConstraints
from whylogs.core.statistics.constraints import (
DatasetConstraints,
MultiColumnValueConstraints,
SummaryConstraints,
)
from whylogs.core.summaryconverters import entropy_from_column_summary
from whylogs.core.types import TypedDataConverter
from whylogs.proto import (
Expand Down Expand Up @@ -86,6 +90,7 @@ def __init__(
dataset_timestamp: datetime.datetime = None,
session_timestamp: datetime.datetime = None,
columns: dict = None,
multi_columns: MultiColumnProfile = None,
tags: Dict[str, str] = None,
metadata: Dict[str, str] = None,
session_id: str = None,
Expand All @@ -95,6 +100,9 @@ def __init__(
# Default values
if columns is None:
columns = {}
if multi_columns is None:
multi_column_constraints = MultiColumnValueConstraints(constraints.multi_column_value_constraints) if constraints else None
multi_columns = MultiColumnProfile(multi_column_constraints)
if tags is None:
tags = {}
if metadata is None:
Expand All @@ -110,6 +118,7 @@ def __init__(
self._tags = dict(tags)
self._metadata = metadata.copy()
self.columns = columns
self.multi_columns = multi_columns
self.constraints = constraints

self.model_profile = model_profile
Expand Down Expand Up @@ -262,6 +271,10 @@ def track_datum(self, column_name, data, character_list=None, token_method=None)

prof.track(data, character_list=None, token_method=None)

def track_multi_column(self, columns):
    """
    Forward one record to this profile's multi-column profile.

    Parameters
    ----------
    columns
        Handed unchanged to ``self.multi_columns.track``.
    """
    self.multi_columns.track(columns)

def track_array(self, x: np.ndarray, columns=None):
"""
Track statistics for a numpy array
Expand Down Expand Up @@ -299,15 +312,20 @@ def track_dataframe(self, df: pd.DataFrame, character_list=None, token_method=No
if large_df:
logger.warning(f"About to log a dataframe with {element_count} elements, logging might take some time to complete.")
count = 0
for col in df.columns:
col_str = str(col)

x = df[col].values
for xi in x:
count = count + 1
columns_len = len(df.columns)
num_records = len(df)
for idx in range(num_records):
row_values = df.iloc[idx].values
count += 1
for col_idx in range(columns_len):
col = df.columns[col_idx]
col_str = str(col)
self.track(col_str, row_values[col_idx], character_list=None, token_method=None)
if large_df and (count % 200000 == 0):
logger.warning(f"Logged {count} elements out of {element_count}")
self.track(col_str, xi, character_list=None, token_method=None)

self.track_multi_column({str(col): val for col, val in zip(df.columns, row_values)})

def to_properties(self):
"""
Expand Down Expand Up @@ -729,7 +747,7 @@ def apply_summary_constraints(self, summary_constraints: Optional[Mapping[str, S
frequent_items_summ = colprof.frequent_items.to_summary(max_items=1, min_count=1)
most_common_val = frequent_items_summ.items[0].json_value if frequent_items_summ else None

update_dict = _create_column_profile_summary_object(
update_obj = _create_column_profile_summary_object(
number_summary=summ.number_summary,
distinct_column_values=distinct_column_values_dict,
quantile=colprof.number_tracker.histogram,
Expand All @@ -744,7 +762,7 @@ def apply_summary_constraints(self, summary_constraints: Optional[Mapping[str, S
chi_squared_test=chi_squared_summary,
)

constraints.update(update_dict)
constraints.update(update_obj)
else:
logger.debug(f"unkown feature '{feature_name}' in summary constraints")

Expand Down
Loading

0 comments on commit d76b8ed

Please sign in to comment.