Skip to content

Commit

Permalink
Merge 1229669 into dcdf98e
Browse files Browse the repository at this point in the history
  • Loading branch information
andyndang committed Apr 9, 2021
2 parents dcdf98e + 1229669 commit eaf9dc4
Show file tree
Hide file tree
Showing 10 changed files with 357 additions and 8 deletions.
2 changes: 1 addition & 1 deletion .bumpversion.cfg
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
[bumpversion]
current_version = 0.4.4-dev0
current_version = 0.4.4-dev3
commit = True
tag = False
parse = (?P<major>\d+)\.(?P<minor>\d+)\.(?P<patch>\d+)(\-(?P<release>[a-z]+)(?P<build>\d+))?
Expand Down
3 changes: 0 additions & 3 deletions .gitmodules

This file was deleted.

2 changes: 1 addition & 1 deletion docs/conf.py
Original file line number Diff line number Diff line change
Expand Up @@ -101,7 +101,7 @@
# built documents.
#
# The short X.Y version.
version = "0.4.4-dev0"
version = "0.4.4-dev3"
# The full version, including alpha/beta/rc tags.
release = "" # Is set by calling `setup.py docs`

Expand Down
1 change: 0 additions & 1 deletion proto
Submodule proto deleted from 083464
2 changes: 2 additions & 0 deletions proto/.gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
generated
generated/
59 changes: 59 additions & 0 deletions proto/src/constraints.proto
Original file line number Diff line number Diff line change
@@ -0,0 +1,59 @@
syntax = "proto3";

import "messages.proto";

option java_package = "com.whylogs.core.constraint";
option java_outer_classname = "Constraints";
option java_multiple_files = true;

// Binary boolean relationship that a constraint applies between its operands.
// NOTE(review): the mnemonics presumably follow the usual convention
// (LT "<", LE "<=", EQ "==", NE "!=", GE ">=", GT ">") — confirm against the
// code that evaluates constraints.
enum Op {
  // Zero value, which proto3 uses as the default when the field is not set.
  unused = 0;
  LT = 1;
  LE = 2;
  EQ = 3;
  NE = 4;
  GE = 5;
  GT = 6;
}

// A summary constraint relates a summary field either to a literal value or
// to a second summary field, e.g.
//   'min' < 6
//   'std_dev' < 2.17
//   'min' > 'avg'
message SummaryConstraintMsg {
  // Name of this constraint.
  string name = 1;

  // Summary field that forms the first operand of the relationship.
  string first_field = 2;

  // Second operand: exactly one of another summary field or a literal number.
  oneof second {
    string second_field = 3;
    double value = 4;
  }

  // Relationship applied between the two operands.
  Op op = 5;

  // Verbosity flag; its effect is decided by the consumer of this message.
  bool verbose = 6;
}

// A value constraint expresses a binary boolean relationship between an
// implied numeric value and a literal. Value constraints are applied to every
// incoming value that is processed by whylogs.
message ValueConstraintMsg {
  // Name of this constraint.
  string name = 1;

  // Literal that the incoming value is compared against.
  double value = 2;

  // Relationship applied between the incoming value and `value`.
  Op op = 3;

  // Verbosity flag; its effect is decided by the consumer of this message.
  bool verbose = 4;
}

// A list of value constraints. Wrapping the repeated field in a message lets
// the list be used as a map value (map values cannot be `repeated`).
message ValueConstraintMsgs {
  repeated ValueConstraintMsg constraints = 1;
}

// A list of summary constraints. Wrapping the repeated field in a message
// lets the list be used as a map value (map values cannot be `repeated`).
message SummaryConstraintMsgs {
  repeated SummaryConstraintMsg constraints = 1;
}

// All constraints attached to one dataset, together with the dataset's
// properties.
message DatasetConstraintMsg {
  // Properties of the dataset the constraints belong to.
  DatasetProperties properties = 1;

  // Value constraints grouped under a string key.
  // NOTE(review): the key is presumably a column name — confirm with callers.
  map<string, ValueConstraintMsgs> value_constraints = 2;

  // Summary constraints grouped under a string key (same keying assumption).
  map<string, SummaryConstraintMsgs> summary_constraints = 3;
}

187 changes: 187 additions & 0 deletions proto/src/messages.proto
Original file line number Diff line number Diff line change
@@ -0,0 +1,187 @@
syntax = "proto3";

import "google/protobuf/wrappers.proto";

option java_package = "com.whylogs.core.message";
option java_outer_classname = "Messages";
option java_multiple_files = true;

// Basic counters tracked for a column.
message Counters {
  // Total count.
  int64 count = 1;

  // Wrapper-typed counts: being message-typed, an unset value can be told
  // apart from an explicit zero.
  google.protobuf.Int64Value true_count = 2;
  google.protobuf.Int64Value null_count = 3;
}

// The data type inferred for a column, with an accompanying ratio.
message InferredType {
  // Candidate data types. UNKNOWN is the proto3 zero/default value.
  enum Type {
    UNKNOWN = 0;
    NULL = 1;
    FRACTIONAL = 2;
    INTEGRAL = 3;
    BOOLEAN = 4;
    STRING = 5;
  }

  // The inferred type.
  Type type = 1;

  // NOTE(review): presumably the fraction of observed values that matched
  // `type` — confirm against the code that fills this in.
  double ratio = 2;
}

// Count, minimum, maximum and sum for floating-point values.
message DoublesMessage {
  int64 count = 1;
  double min = 2;
  double max = 3;
  double sum = 4;
}

// Count, minimum, maximum and sum for 64-bit integer values.
message LongsMessage {
  int64 count = 1;
  int64 min = 2;
  int64 max = 3;
  int64 sum = 4;
}

// State of a variance tracker.
message VarianceMessage {
  // Number of values.
  int64 count = 1;

  // sample variance * (n-1)
  double sum = 2;

  // Mean of the values.
  double mean = 3;
}

// A serialized frequent-numbers sketch and its size parameter.
message FrequentNumbersSketchMessage {
  // Serialized sketch; the byte format is not defined in this schema.
  bytes sketch = 1;

  // NOTE(review): presumably log2 of the sketch's maximum map size — confirm
  // against the sketch library in use.
  int32 lg_max_k = 2;
}

// A serialized frequent-items sketch and its size parameter.
message FrequentItemsSketchMessage {
  // Serialized sketch; the byte format is not defined in this schema.
  bytes sketch = 1;

  // NOTE(review): presumably log2 of the sketch's maximum map size — confirm
  // against the sketch library in use.
  int32 lg_max_k = 2;
}

// Numeric tracking state for a column: variance, basic statistics and
// serialized sketches.
message NumbersMessage {
  VarianceMessage variance = 1;

  // Basic statistics, stored either as doubles or as longs (never both).
  oneof numbers {
    DoublesMessage doubles = 2;
    LongsMessage longs = 3;
  }

  // sketches (serialized; byte formats are not defined in this schema)
  bytes histogram = 4;
  bytes theta = 5;
  bytes compact_theta = 6;
  FrequentNumbersSketchMessage frequent_numbers = 7;
}

// String tracking state for a column: a count plus serialized sketches.
message StringsMessage {
  int64 count = 1;

  // sketches (serialized; byte formats are not defined in this schema)
  bytes theta = 2;
  bytes items = 3;
  bytes compact_theta = 4;
}


// Schema tracking state for a column.
message SchemaMessage {
  // Count per type.
  // NOTE(review): the int32 key is presumably the numeric value of
  // InferredType.Type — confirm against the writer.
  map<int32, int64> typeCounts = 1;

  // The type inferred for the column.
  InferredType inferred_type = 2;
}

// Everything tracked for a single column.
message ColumnMessage {
  // Column name.
  string name = 1;

  Counters counters = 2;
  SchemaMessage schema = 3;
  NumbersMessage numbers = 4;
  StringsMessage strings = 5;

  // Also carried inside `schema`; this message exposes it at top level too.
  InferredType inferred_type = 6;

  FrequentItemsSketchMessage frequent_items = 7;
  HllSketchMessage cardinality_tracker = 8;
}

// Identifying properties of a dataset: schema version, session information,
// timestamps and free-form key/value annotations.
message DatasetProperties {
  uint32 schema_major_version = 1;
  uint32 schema_minor_version = 2;

  string session_id = 3;

  // NOTE(review): the unit/epoch of the two timestamps is not stated in this
  // schema — confirm with the producers.
  int64 session_timestamp = 4;
  int64 data_timestamp = 5;

  map<string, string> tags = 6;
  map<string, string> metadata = 7;
  // TODO: store other configuration here
}

// A labelled score matrix together with the names of the fields involved.
message ScoreMatrixMessage {
  repeated string labels = 1;
  string prediction_field = 2;
  string target_field = 3;
  string score_field = 4;

  // a flattened NxN matrix (N = len(labels))
  // Field numbers 5-9 are left unused by this message.
  repeated NumbersMessage scores = 10;
}

// Running totals used for regression metrics.
message RegressionMetricsMessage {
  string prediction_field = 1;
  string target_field = 2;

  // Number of observations.
  uint64 count = 3;

  // NOTE(review): presumably sums over (prediction - target) of the absolute
  // difference, the difference, and the squared difference respectively —
  // confirm against the code that accumulates them.
  double sum_abs_diff = 4;
  double sum_diff = 5;
  double sum2_diff = 6;
}

// Kind of model that a set of metrics describes.
enum ModelType {
  // Zero value, which proto3 uses as the default when the field is not set.
  UNKNOWN = 0;
  CLASSIFICATION = 1;
  REGRESSION = 2;
  EMBEDDINGS = 3;
}

// Metrics for a model: its type plus score-matrix and regression metrics.
// The camelCase field names are kept as-is, since renaming them would change
// generated accessors and JSON names.
message ModelMetricsMessage {
  ScoreMatrixMessage scoreMatrix = 1;
  ModelType modelType = 2;
  RegressionMetricsMessage regressionMetrics = 3;
}

// Model-level profile: the model's output fields and its metrics.
message ModelProfileMessage {
  repeated string output_fields = 1;

  // Reserving fields for ModelMessage
  // (field numbers 2-9 are left unused here; `metrics` starts at 10).
  ModelMetricsMessage metrics = 10;
}

// A complete dataset profile: dataset properties, per-column data and an
// optional model profile.
message DatasetProfileMessage {
  DatasetProperties properties = 1;

  // Column data under a string key.
  // NOTE(review): the key is presumably the column name — confirm.
  map<string, ColumnMessage> columns = 2;

  // reserve other fields for dataset level data
  // NOTE(review): `modeProfile` looks like a misspelling of `modelProfile`;
  // it is kept because renaming changes generated accessors and JSON names.
  ModelProfileMessage modeProfile = 10;
}

/**
 * The following section is for transmission and reconstruction of the dataset
 * in the WhyLogs backend.
 */

// A chunk of columns belonging to a dataset profile.
message ColumnsChunkSegment {
  // UUID is required to aggregate to the original message.
  // This should map back to the original dataset.
  string marker = 1;

  // Columns carried by this chunk.
  repeated ColumnMessage columns = 2;
}

// The dataset properties of a profile, sent as a segment of its own.
message DatasetMetadataSegment {
  // NOTE(review): presumably the same identifier that ColumnsChunkSegment
  // uses to map back to the original dataset — confirm.
  string marker = 1;

  DatasetProperties properties = 2;
}

// A segment of a dataset profile. Segments can be used to compose the
// original object back.
message MessageSegment {
  string marker = 1;

  // Payload of the segment: either the dataset metadata or a chunk of columns.
  oneof item {
    DatasetMetadataSegment metadata = 2;
    ColumnsChunkSegment columns = 3;
  }
}

// A serialized HLL sketch and its size parameter.
message HllSketchMessage {
  // Serialized sketch; the byte format is not defined in this schema.
  bytes sketch = 1;

  // NOTE(review): presumably log2 of the sketch's bucket count (k) — confirm
  // against the sketch library in use.
  int32 lg_k = 2;
}
105 changes: 105 additions & 0 deletions proto/src/summaries.proto
Original file line number Diff line number Diff line change
@@ -0,0 +1,105 @@
syntax = "proto3";

import "messages.proto";

option java_package = "com.whylogs.core.message";
option java_outer_classname = "Summaries";
option java_multiple_files = true;

// Summary of a unique-count estimate: the estimate and its upper and lower
// values.
// NOTE(review): `upper`/`lower` are presumably the bounds of the estimate;
// the confidence level is not stated in this schema.
message UniqueCountSummary {
  double estimate = 1;
  double upper = 2;
  double lower = 3;
}

// Summary of frequent string values.
message FrequentStringsSummary {
  // One string value paired with its frequency estimate.
  message FrequentItem {
    string value = 1;
    double estimate = 2;
  }

  repeated FrequentItem items = 1;
}

// Summary of frequent numeric values, kept separately for doubles and longs.
message FrequentNumbersSummary {
  // A floating-point value with its frequency estimate and rank.
  message FrequentDoubleItem {
    int64 estimate = 1;
    double value = 2;
    int32 rank = 3;
  }

  // A 64-bit integer value with its frequency estimate and rank.
  message FrequentLongItem {
    int64 estimate = 1;
    int64 value = 2;
    int32 rank = 3;
  }

  repeated FrequentDoubleItem doubles = 1;
  repeated FrequentLongItem longs = 2;
}

// Summary of frequent items.
message FrequentItemsSummary {
  // One item with its frequency estimate.
  message FrequentItem {
    int64 estimate = 1;

    // NOTE(review): presumably the item's value encoded as JSON text —
    // confirm against the writer.
    string json_value = 2;
  }

  repeated FrequentItem items = 1;
}

// Summary of a column's string values: unique count and frequent strings.
message StringsSummary {
  UniqueCountSummary unique_count = 1;
  FrequentStringsSummary frequent = 2;
}

// Summary of a column's schema.
message SchemaSummary {
  // The type inferred for the column.
  InferredType inferred_type = 1;

  // Count per type, under a string key.
  // NOTE(review): the key is presumably a type name — confirm.
  map<string, int64> type_counts = 2;
}

// Summary of a histogram.
// NOTE(review): how `start`/`end`/`width` relate to `bins` is not stated in
// this schema — confirm against the code that produces the summary.
message HistogramSummary {
  double start = 1;
  double end = 2;
  double width = 3;
  repeated int64 counts = 4;

  double max = 5;
  double min = 6;
  repeated double bins = 7;
  int64 n = 8;
}

// Quantiles and their values, stored as two separate lists.
// NOTE(review): the two lists are presumably paired by index — confirm.
message QuantileSummary {
  repeated double quantiles = 1;
  repeated double quantile_values = 2;
}

// Summary of a column's numeric values: basic statistics plus nested
// summaries.
message NumberSummary {
  uint64 count = 1;
  double min = 2;
  double max = 3;
  double mean = 4;
  double stddev = 5;

  HistogramSummary histogram = 6;
  UniqueCountSummary unique_count = 7;
  QuantileSummary quantiles = 8;
  FrequentNumbersSummary frequent_numbers = 9;

  // How this flag is decided is not defined in this schema.
  bool is_discrete = 10;
}

// Summary of a single column.
message ColumnSummary {
  Counters counters = 1;
  SchemaSummary schema = 2;
  NumberSummary number_summary = 3;
  StringsSummary string_summary = 4;
  FrequentItemsSummary frequent_items = 5;
  UniqueCountSummary unique_count = 6;
}

// Summary of a dataset: its properties and one summary per column.
message DatasetSummary {
  DatasetProperties properties = 1;

  // Column summaries under a string key.
  // NOTE(review): the key is presumably the column name — confirm.
  map<string, ColumnSummary> columns = 2;
}

// A set of dataset summaries under a string key.
// NOTE(review): what the key identifies is not stated in this schema.
message DatasetSummaries {
  map<string, DatasetSummary> profiles = 1;
}
2 changes: 1 addition & 1 deletion setup.cfg
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@

[metadata]
name = whylogs
version = 0.4.4-dev0
version = 0.4.4-dev3
description = Profile and monitor your ML data pipeline end-to-end
author = WhyLabs.ai
author-email = support@whylabs.ai
Expand Down
2 changes: 1 addition & 1 deletion src/whylogs/_version.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,3 @@
"""WhyLabs version number."""

__version__ = "0.4.4-dev0"
__version__ = "0.4.4-dev3"

0 comments on commit eaf9dc4

Please sign in to comment.