Skip to content

Commit

Permalink
SummaryConstraint: Table shape constraints (#398)
Browse files Browse the repository at this point in the history
* Implement table shape constraints as SummaryConstraint; make the changes in datasetprofile needed for the implementation; modify DatasetConstraints accordingly; add tests for table shape constraints and DatasetConstraints serialization

* Add the table shape constraints report to the DatasetConstraints report. Modify the DatasetConstraints serialization test.

* Change OP CONTAINS_SET to CONTAIN_SET. Add function to simplify SummaryConstraint.from_protobuf.

* Change the approach for the table shape number of rows constraint. Tests for sequential logging. Parameterize set length.

Co-authored-by: pecop2 <petar@loka.com>
Co-authored-by: Jamie Broomall <88007022+jamie256@users.noreply.github.com>
  • Loading branch information
3 people committed Jan 25, 2022
1 parent 6bc0957 commit 7434e09
Show file tree
Hide file tree
Showing 4 changed files with 520 additions and 43 deletions.
4 changes: 3 additions & 1 deletion proto/src/constraints.proto
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@ enum Op {
EQ_SET = 12;
APPLY_FUNC = 13;
IN = 14;
CONTAIN = 15;
}

/* Summary constraints specify a relationship between a summary field and a literal value,
Expand All @@ -41,6 +42,7 @@ message SummaryConstraintMsg {
double value = 4;
SummaryBetweenConstraintMsg between = 7;
google.protobuf.ListValue reference_set = 8;
string value_str = 12;
}
Op op = 5;
bool verbose = 6;
Expand All @@ -51,7 +53,6 @@ message SummaryConstraintMsg {
}
}


/* Wraps a single KllFloatsSketchMessage (field 1). Judging by the name, it carries
   the reference distribution for continuous data -- confirm against usage. */
message ReferenceDistributionContinuousMessage {
KllFloatsSketchMessage sketch = 1;
}
Expand Down Expand Up @@ -106,5 +107,6 @@ message DatasetConstraintMsg {
DatasetProperties properties = 1;
map<string, ValueConstraintMsgs> value_constraints = 2;
map<string, SummaryConstraintMsgs> summary_constraints = 3;
SummaryConstraintMsgs table_shape_constraints = 4;
}

23 changes: 22 additions & 1 deletion src/whylogs/core/datasetprofile.py
Original file line number Diff line number Diff line change
Expand Up @@ -114,6 +114,8 @@ def __init__(

self.model_profile = model_profile

self.column_row_dict = dict()

# Store Name attribute
self._tags["name"] = name

Expand Down Expand Up @@ -154,6 +156,10 @@ def session_timestamp_ms(self):
"""
return time.to_utc_ms(self.session_timestamp)

@property
def total_row_number(self):
    """
    Largest number of values tracked for any single column.

    ``column_row_dict`` maps a column name to the number of values tracked
    for that column, so the maximum over its values is used as the row
    count of the table.

    Returns
    -------
    int
        The maximum per-column count, or 0 when no column has been tracked yet.
    """
    # ``max()`` raises ValueError on an empty iterable; a profile with no
    # tracked data has zero rows, so fall back to 0 instead of failing.
    return max(self.column_row_dict.values(), default=0)

def add_output_field(self, field: Union[str, List[str]]):
if self.model_profile is None:
self.model_profile = ModelProfile()
Expand Down Expand Up @@ -249,6 +255,11 @@ def track_datum(self, column_name, data, character_list=None, token_method=None)
prof = ColumnProfile(column_name, constraints=constraints)
self.columns[column_name] = prof

self.column_row_dict[column_name] = 0

# increment the number of values tracked for this column
self.column_row_dict[column_name] += 1

prof.track(data, character_list=None, token_method=None)

def track_array(self, x: np.ndarray, columns=None):
Expand Down Expand Up @@ -282,6 +293,7 @@ def track_dataframe(self, df: pd.DataFrame, character_list=None, token_method=No
# workaround for CUDF due to https://github.com/rapidsai/cudf/issues/6743
if cudfDataFrame is not None and isinstance(df, cudfDataFrame):
df = df.to_pandas()

element_count = df.size
large_df = element_count > 200000
if large_df:
Expand Down Expand Up @@ -738,6 +750,16 @@ def apply_summary_constraints(self, summary_constraints: Optional[Mapping[str, S

return [(k, s.report()) for k, s in summary_constraints.items()]

def apply_table_shape_constraints(self, table_shape_constraints: Optional[SummaryConstraints] = None):
    """
    Update the table shape constraints with this profile's shape and report on them.

    Parameters
    ----------
    table_shape_constraints : SummaryConstraints, optional
        Constraints to apply. When None, ``self.constraints.table_shape_constraints``
        is used instead.

    Returns
    -------
    The value returned by ``report()`` on the constraints after they have been updated.
    """
    shape_constraints = (
        table_shape_constraints if table_shape_constraints is not None else self.constraints.table_shape_constraints
    )

    # The summary object handed to the constraints carries the profile's
    # column names and its total row number alongside an empty NumberSummary.
    shape_summary = _create_column_profile_summary_object(
        NumberSummary(),
        columns=self.columns.keys(),
        total_row_number=self.total_row_number,
    )
    shape_constraints.update(shape_summary)

    return shape_constraints.report()


def columns_chunk_iterator(iterator, marker: str):
"""
Expand Down Expand Up @@ -848,7 +870,6 @@ def _create_column_profile_summary_object(number_summary: NumberSummary, **kwarg
Used to unpack the metrics as separate items in the dictionary
kwargs : Summary objects or datasketches objects
Used to update specific constraints that need additional calculations
Returns
-------
Anonymous object containing all of the metrics as fields with their corresponding values
Expand Down
Loading

0 comments on commit 7434e09

Please sign in to comment.