Skip to content

Commit

Permalink
Merge branch 'mainline' into dev/loka/multi_column_value_constraints
Browse files Browse the repository at this point in the history
# Conflicts:
#	proto/src/constraints.proto
#	src/whylogs/core/statistics/constraints.py
#	tests/unit/core/statistics/test_constraints.py
  • Loading branch information
MilenaTrajanoska committed Jan 26, 2022
2 parents c1c2967 + 7434e09 commit 1cc3074
Show file tree
Hide file tree
Showing 4 changed files with 523 additions and 44 deletions.
8 changes: 5 additions & 3 deletions proto/src/constraints.proto
Original file line number Diff line number Diff line change
Expand Up @@ -25,8 +25,9 @@ enum Op {
EQ_SET = 12;
APPLY_FUNC = 13;
IN = 14;
NOT_IN = 15;
SUM = 16;
CONTAIN = 15;
NOT_IN = 16;
SUM = 17;
}

/* Summary constraints specify a relationship between a summary field and a literal value,
Expand All @@ -43,6 +44,7 @@ message SummaryConstraintMsg {
double value = 4;
SummaryBetweenConstraintMsg between = 7;
google.protobuf.ListValue reference_set = 8;
string value_str = 12;
}
Op op = 5;
bool verbose = 6;
Expand All @@ -53,7 +55,6 @@ message SummaryConstraintMsg {
}
}


/* Wraps a single KLL floats sketch message (field 1).
 * NOTE(review): presumably this carries the reference distribution for a
 * continuous feature, as the message name suggests -- confirm against the
 * constraint code that populates it.
 */
message ReferenceDistributionContinuousMessage {
KllFloatsSketchMessage sketch = 1;
}
Expand Down Expand Up @@ -126,6 +127,7 @@ message DatasetConstraintMsg {
DatasetProperties properties = 1;
map<string, ValueConstraintMsgs> value_constraints = 2;
map<string, SummaryConstraintMsgs> summary_constraints = 3;
SummaryConstraintMsgs table_shape_constraints = 4;
map<string, ValueConstraintMsgs> multi_column_value_constraints = 5;
}

23 changes: 22 additions & 1 deletion src/whylogs/core/datasetprofile.py
Original file line number Diff line number Diff line change
Expand Up @@ -123,6 +123,8 @@ def __init__(

self.model_profile = model_profile

self.column_row_dict = dict()

# Store Name attribute
self._tags["name"] = name

Expand Down Expand Up @@ -163,6 +165,10 @@ def session_timestamp_ms(self):
"""
return time.to_utc_ms(self.session_timestamp)

@property
def total_row_number(self):
    """
    Largest number of values tracked for any single column.

    ``column_row_dict`` maps each column name to the count of values
    tracked for that column; the maximum of those counts is reported
    as the row total.

    Returns
    -------
    The maximum per-column count, or 0 when no column has been tracked
    yet (``max()`` on an empty collection would otherwise raise
    ``ValueError``).
    """
    return max(self.column_row_dict.values(), default=0)

def add_output_field(self, field: Union[str, List[str]]):
if self.model_profile is None:
self.model_profile = ModelProfile()
Expand Down Expand Up @@ -258,6 +264,11 @@ def track_datum(self, column_name, data, character_list=None, token_method=None)
prof = ColumnProfile(column_name, constraints=constraints)
self.columns[column_name] = prof

self.column_row_dict[column_name] = 0

# Increment the number of values tracked for this column.
self.column_row_dict[column_name] += 1

prof.track(data, character_list=None, token_method=None)

def track_multi_column(self, columns):
Expand Down Expand Up @@ -295,6 +306,7 @@ def track_dataframe(self, df: pd.DataFrame, character_list=None, token_method=No
# workaround for CUDF due to https://github.com/rapidsai/cudf/issues/6743
if cudfDataFrame is not None and isinstance(df, cudfDataFrame):
df = df.to_pandas()

element_count = df.size
large_df = element_count > 200000
if large_df:
Expand Down Expand Up @@ -756,6 +768,16 @@ def apply_summary_constraints(self, summary_constraints: Optional[Mapping[str, S

return [(k, s.report()) for k, s in summary_constraints.items()]

def apply_table_shape_constraints(self, table_shape_constraints: Optional[SummaryConstraints] = None):
    """
    Update the table shape constraints with this profile's shape and report them.

    Parameters
    ----------
    table_shape_constraints : SummaryConstraints, optional
        Constraints to evaluate. When omitted, the constraints stored on
        ``self.constraints.table_shape_constraints`` are used.

    Returns
    -------
    Whatever ``report()`` of the constraints object returns after the update.
    """
    shape_constraints = table_shape_constraints
    if shape_constraints is None:
        shape_constraints = self.constraints.table_shape_constraints

    # The summary object carries the column names and the total row count,
    # alongside the metrics unpacked from an empty NumberSummary.
    shape_summary = _create_column_profile_summary_object(
        NumberSummary(),
        columns=self.columns.keys(),
        total_row_number=self.total_row_number,
    )
    shape_constraints.update(shape_summary)

    return shape_constraints.report()


def columns_chunk_iterator(iterator, marker: str):
"""
Expand Down Expand Up @@ -866,7 +888,6 @@ def _create_column_profile_summary_object(number_summary: NumberSummary, **kwarg
Used to unpack the metrics as separate items in the dictionary
kwargs : Summary objects or datasketches objects
Used to update specific constraints that need additional calculations
Returns
-------
Anonymous object containing all of the metrics as fields with their corresponding values
Expand Down
Loading

0 comments on commit 1cc3074

Please sign in to comment.