diff --git a/.bumpversion.cfg b/.bumpversion.cfg index d6b05ceebf..6698cc5c47 100644 --- a/.bumpversion.cfg +++ b/.bumpversion.cfg @@ -1,5 +1,5 @@ [bumpversion] -current_version = 0.4.4-dev0 +current_version = 0.4.4-dev3 commit = True tag = False parse = (?P\d+)\.(?P\d+)\.(?P\d+)(\-(?P[a-z]+)(?P\d+))? diff --git a/.gitmodules b/.gitmodules deleted file mode 100644 index 99eef22c11..0000000000 --- a/.gitmodules +++ /dev/null @@ -1,3 +0,0 @@ -[submodule "proto"] - path = proto - url = https://github.com/whylabs/whylogs-proto.git diff --git a/docs/conf.py b/docs/conf.py index 2ce07541f3..9da96e1eec 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -101,7 +101,7 @@ # built documents. # # The short X.Y version. -version = "0.4.4-dev0" +version = "0.4.4-dev3" # The full version, including alpha/beta/rc tags. release = "" # Is set by calling `setup.py docs` diff --git a/proto b/proto deleted file mode 160000 index 083464b1e5..0000000000 --- a/proto +++ /dev/null @@ -1 +0,0 @@ -Subproject commit 083464b1e5fdc200b3118e8621b0e99346d045f9 diff --git a/proto/.gitignore b/proto/.gitignore new file mode 100644 index 0000000000..1eaf70ed43 --- /dev/null +++ b/proto/.gitignore @@ -0,0 +1,2 @@ +generated +generated/ \ No newline at end of file diff --git a/proto/src/constraints.proto b/proto/src/constraints.proto new file mode 100644 index 0000000000..412e080843 --- /dev/null +++ b/proto/src/constraints.proto @@ -0,0 +1,59 @@ +syntax = "proto3"; + +import "messages.proto"; + +option java_package = "com.whylogs.core.constraint"; +option java_outer_classname = "Constraints"; +option java_multiple_files = true; + +/* Constraints specify one of the following binary boolean relationships. */ +enum Op { + unused = 0; + LT = 1; + LE = 2; + EQ = 3; + NE = 4; + GE = 5; + GT = 6; + } + +/* Summary constraints specify a relationship between a summary field and a literal value, + or between two summary fields. + e.g. 
'min' < 6 + 'std_dev' < 2.17 + 'min' > 'avg' + */ +message SummaryConstraintMsg { + string name = 1; + string first_field = 2; + oneof second { + string second_field = 3; + double value = 4; + } + Op op = 5; + bool verbose = 6; +} + +/* ValueConstraints express a binary boolean relationship between an implied numeric value and a literal. + These are applied to every incoming value that is processed by whylogs. */ +message ValueConstraintMsg { + string name = 1; + double value = 2; + Op op = 3; + bool verbose = 4; +} + +message ValueConstraintMsgs { + repeated ValueConstraintMsg constraints = 1; +} + +message SummaryConstraintMsgs { + repeated SummaryConstraintMsg constraints = 1; +} + +message DatasetConstraintMsg { + DatasetProperties properties = 1; + map value_constraints = 2; + map summary_constraints = 3; +} + diff --git a/proto/src/messages.proto b/proto/src/messages.proto new file mode 100644 index 0000000000..9fbe93f827 --- /dev/null +++ b/proto/src/messages.proto @@ -0,0 +1,187 @@ +syntax = "proto3"; + +import "google/protobuf/wrappers.proto"; + +option java_package = "com.whylogs.core.message"; +option java_outer_classname = "Messages"; +option java_multiple_files = true; + +message Counters { + int64 count = 1; + + google.protobuf.Int64Value true_count = 2; + google.protobuf.Int64Value null_count = 3; +} + +message InferredType { + enum Type { + UNKNOWN = 0; + NULL = 1; + FRACTIONAL = 2; + INTEGRAL = 3; + BOOLEAN = 4; + STRING = 5; + } + + Type type = 1; + double ratio = 2; +} + +message DoublesMessage { + int64 count = 1; + double min = 2; + double max = 3; + double sum = 4; +} + +message LongsMessage { + int64 count = 1; + int64 min = 2; + int64 max = 3; + int64 sum = 4; +} + +message VarianceMessage { + int64 count = 1; + double sum = 2; // sample variance * (n-1) + double mean = 3; +} + +message FrequentNumbersSketchMessage { + bytes sketch = 1; + int32 lg_max_k = 2; +} + +message FrequentItemsSketchMessage { + bytes sketch = 1; + int32 lg_max_k = 2; 
+} + +message NumbersMessage { + VarianceMessage variance = 1; + oneof numbers { + DoublesMessage doubles = 2; + LongsMessage longs = 3; + } + + // sketches + bytes histogram = 4; + bytes theta = 5; + bytes compact_theta = 6; + FrequentNumbersSketchMessage frequent_numbers = 7; +} + +message StringsMessage { + int64 count = 1; + + // sketches + bytes theta = 2; + bytes items = 3; + bytes compact_theta = 4; +} + + +message SchemaMessage { + map typeCounts = 1; + InferredType inferred_type = 2; +} + +message ColumnMessage { + string name = 1; + Counters counters = 2; + SchemaMessage schema = 3; + NumbersMessage numbers = 4; + StringsMessage strings = 5; + InferredType inferred_type = 6; + FrequentItemsSketchMessage frequent_items = 7; + HllSketchMessage cardinality_tracker = 8; +} + +message DatasetProperties { + uint32 schema_major_version = 1; + uint32 schema_minor_version = 2; + + string session_id = 3; + int64 session_timestamp = 4; + int64 data_timestamp = 5; + map tags = 6; + map metadata = 7; + // TODO: store other configuration here +} + +message ScoreMatrixMessage { + repeated string labels = 1; + string prediction_field = 2; + string target_field = 3; + string score_field = 4; + + // a flattened NxN matrix (N = len(labels)) + repeated NumbersMessage scores = 10; +} + +message RegressionMetricsMessage{ + string prediction_field = 1; + string target_field = 2; + uint64 count = 3; + double sum_abs_diff = 4; + double sum_diff = 5; + double sum2_diff = 6; +} + +enum ModelType { + UNKNOWN = 0; + CLASSIFICATION = 1; + REGRESSION = 2; + EMBEDDINGS = 3; +} + +message ModelMetricsMessage { + ScoreMatrixMessage scoreMatrix = 1; + ModelType modelType = 2; + RegressionMetricsMessage regressionMetrics = 3; +} + +message ModelProfileMessage { + repeated string output_fields = 1; + // Reserving fields for ModelMessage + + ModelMetricsMessage metrics = 10; +} + +message DatasetProfileMessage { + DatasetProperties properties = 1; + map columns = 2; + // reserve other fields 
for dataset level data + ModelProfileMessage modeProfile = 10; +} + +/** + * The following section is for transmission and reconstruction of the dataset + * in the WhyLogs backend + */ +message ColumnsChunkSegment { + // UUID is required to aggregate to the original message + // This should map back to the original dataset + string marker = 1; + repeated ColumnMessage columns = 2; +} + +message DatasetMetadataSegment { + string marker = 1; + DatasetProperties properties = 2; +} + +// A segment of a dataset profile. This can be used to compose the +// original object back +message MessageSegment { + string marker = 1; + oneof item { + DatasetMetadataSegment metadata = 2; + ColumnsChunkSegment columns = 3; + } +} + +message HllSketchMessage { + bytes sketch = 1; + int32 lg_k = 2; +} diff --git a/proto/src/summaries.proto b/proto/src/summaries.proto new file mode 100644 index 0000000000..383ad606e8 --- /dev/null +++ b/proto/src/summaries.proto @@ -0,0 +1,105 @@ +syntax = "proto3"; + +import "messages.proto"; + +option java_package = "com.whylogs.core.message"; +option java_outer_classname = "Summaries"; +option java_multiple_files = true; + +message UniqueCountSummary { + double estimate = 1; + double upper = 2; + double lower = 3; +} + +message FrequentStringsSummary { + message FrequentItem { + string value = 1; + double estimate = 2; + } + repeated FrequentItem items = 1; +} + +message FrequentNumbersSummary { + message FrequentDoubleItem { + int64 estimate = 1; + double value = 2; + int32 rank = 3; + } + message FrequentLongItem { + int64 estimate = 1; + int64 value = 2; + int32 rank = 3; + } + repeated FrequentDoubleItem doubles = 1; + repeated FrequentLongItem longs = 2; +} + +message FrequentItemsSummary { + message FrequentItem { + int64 estimate = 1; + string json_value = 2; + } + repeated FrequentItem items = 1; +} + +message StringsSummary { + UniqueCountSummary unique_count = 1; + FrequentStringsSummary frequent = 2; +} + +message SchemaSummary { + InferredType 
inferred_type = 1; + map type_counts = 2; +} + +message HistogramSummary { + double start = 1; + double end = 2; + double width = 3; + repeated int64 counts = 4; + + double max = 5; + double min = 6; + repeated double bins = 7; + int64 n = 8; + +} + +message QuantileSummary { + repeated double quantiles = 1; + repeated double quantile_values = 2; +} + +message NumberSummary { + uint64 count = 1; + double min = 2; + double max = 3; + double mean = 4; + double stddev = 5; + + HistogramSummary histogram = 6; + UniqueCountSummary unique_count = 7; + QuantileSummary quantiles = 8; + FrequentNumbersSummary frequent_numbers = 9; + + bool is_discrete = 10; +} + +message ColumnSummary { + Counters counters = 1; + SchemaSummary schema = 2; + NumberSummary number_summary = 3; + StringsSummary string_summary = 4; + FrequentItemsSummary frequent_items = 5; + UniqueCountSummary unique_count = 6; +} + +message DatasetSummary { + DatasetProperties properties = 1; + map columns = 2; +} + +message DatasetSummaries { + map profiles = 1; +} \ No newline at end of file diff --git a/setup.cfg b/setup.cfg index 0808212ed8..2afb3cf018 100644 --- a/setup.cfg +++ b/setup.cfg @@ -4,7 +4,7 @@ [metadata] name = whylogs -version = 0.4.4-dev0 +version = 0.4.4-dev3 description = Profile and monitor your ML data pipeline end-to-end author = WhyLabs.ai author-email = support@whylabs.ai diff --git a/src/whylogs/_version.py b/src/whylogs/_version.py index 489d85aff6..d3fce8fed1 100644 --- a/src/whylogs/_version.py +++ b/src/whylogs/_version.py @@ -1,3 +1,3 @@ """WhyLabs version number.""" -__version__ = "0.4.4-dev0" +__version__ = "0.4.4-dev3"