// Copyright 2017 The TensorFlow Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// =============================================================================
syntax = "proto2";

package tensorflow.metadata.v0;

import "google/protobuf/any.proto";
import "tensorflow_metadata/proto/v0/derived_feature.proto";
import "tensorflow_metadata/proto/v0/path.proto";

// GOOGLE-LEGACY option jspb_use_correct_proto2_semantics = false;
option cc_enable_arenas = true;
option java_package = "org.tensorflow.metadata.v0";
option java_multiple_files = true;
// LifecycleStage. Only UNKNOWN_STAGE, BETA, PRODUCTION, and VALIDATION_DERIVED
// features are actually validated.
// PLANNED, ALPHA, DISABLED, and DEBUG_ONLY are treated as DEPRECATED.
enum LifecycleStage {
  // Unknown stage.
  UNKNOWN_STAGE = 0;

  // Planned feature, may not be created yet.
  PLANNED = 1;

  // Prototype feature, not used in experiments yet.
  ALPHA = 2;

  // Used in user-facing experiments.
  BETA = 3;

  // Used in a significant fraction of user traffic.
  PRODUCTION = 4;

  // No longer supported: do not use in new models.
  DEPRECATED = 5;

  // Only exists for debugging purposes.
  DEBUG_ONLY = 6;

  // Generic indication that feature is disabled / excluded
  // from models, regardless of specific reason.
  DISABLED = 7;

  // Indicates that this feature was derived from ordinary
  // features for the purposes of statistics generation or
  // validation. Consumers should expect that this feature
  // may be present in DatasetFeatureStatistics, but not in
  // input data.
  // Experimental and subject to change.
  VALIDATION_DERIVED = 9;

  // Value 8 is reserved and must not be assigned to a new stage.
  reserved 8;
}
// Message to represent schema information.
// NextID: 14
message Schema {
  // Features described in this schema.
  repeated Feature feature = 1;

  // Sparse features described in this schema.
  repeated SparseFeature sparse_feature = 6;

  // Weighted features described in this schema.
  repeated WeightedFeature weighted_feature = 12;

  // Use StructDomain instead.
  // Sequences described in this schema. A sequence may be described in terms of
  // several features. Any features appearing within a sequence must *not* be
  // declared as top-level features in <feature>.
  // Field number 2 is taken by this legacy field; do not reuse it.
  // GOOGLE-LEGACY repeated Sequence sequence = 2;

  // String domains referenced in the features.
  repeated StringDomain string_domain = 4;

  // Top-level float domains that can be reused by features.
  repeated FloatDomain float_domain = 9;

  // Top-level int domains that can be reused by features.
  repeated IntDomain int_domain = 10;

  // Default environments for each feature.
  // An environment represents both a type of location (e.g. a server or phone)
  // and a time (e.g. right before model X is run). In the standard scenario,
  // 99% of the features should be in the default environments TRAINING,
  // SERVING, and the LABEL (or labels) AND WEIGHT is only available at TRAINING
  // (not at serving).
  // Other possible variations:
  // 1. There may be TRAINING_MOBILE, SERVING_MOBILE, TRAINING_SERVICE,
  //    and SERVING_SERVICE.
  // 2. If one is ensembling three models, where the predictions of the first
  //    three models are available for the ensemble model, there may be
  //    TRAINING, SERVING_INITIAL, SERVING_ENSEMBLE.
  // See Feature::not_in_environment and Feature::in_environment.
  repeated string default_environment = 5;

  /* BEGIN GOOGLE-LEGACY
  // TODO(b/73109633): Change default to false, before removing this field.
  optional bool generate_legacy_feature_spec = 7 [default = true];
  END GOOGLE-LEGACY */

  // Additional information about the schema as a whole. Features may also
  // be annotated individually.
  optional Annotation annotation = 8;

  // Dataset-level constraints. This is currently used for specifying
  // information about changes in num_examples.
  optional DatasetConstraints dataset_constraints = 11;

  // TensorRepresentation groups. The keys are the names of the groups.
  // Key "" (empty string) denotes the "default" group, which is what should
  // be used when a group name is not provided.
  // See the documentation at TensorRepresentationGroup for more info.
  // Under development.
  map<string, TensorRepresentationGroup> tensor_representation_group = 13;
}
// A list of ValueCount constraints. Used by Feature.value_counts for features
// with nested values, where one ValueCount is provided for each nest level.
message ValueCountList {
  repeated ValueCount value_count = 1;
}
// Describes schema-level information about a specific feature.
// NextID: 35
message Feature {
  // The name of the feature.
  optional string name = 1;  // required

  // This field is no longer supported. Instead, use:
  // lifecycle_stage: DEPRECATED
  // TODO(b/111450258): remove this.
  optional bool deprecated = 2 [deprecated = true];

  // Comment field for a human readable description of the field.
  // Field number 3 is taken by this legacy field; do not reuse it.
  // TODO(b/123518108): remove this.
  // GOOGLE-LEGACY optional string comment = 3 [deprecated = true];

  oneof presence_constraints {
    // Constraints on the presence of this feature in the examples.
    FeaturePresence presence = 14;

    // Only used in the context of a "group" context, e.g., inside a sequence.
    FeaturePresenceWithinGroup group_presence = 17;
  }

  // The shape of the feature which governs the number of values that appear in
  // each example.
  oneof shape_type {
    // The feature has a fixed shape corresponding to a multi-dimensional
    // tensor.
    FixedShape shape = 23;

    // The feature doesn't have a well defined shape. All we know are limits on
    // the minimum and maximum number of values.
    ValueCount value_count = 5;

    // Captures the same information as value_count but for features with
    // nested values. A ValueCount is provided for each nest level.
    ValueCountList value_counts = 32;
  }

  // Physical type of the feature's values.
  // Note that you can have:
  // type: BYTES
  // int_domain: {
  //   min: 0
  //   max: 3
  // }
  // This would be a field that is syntactically BYTES (i.e. strings), but
  // semantically an int, i.e. it would be "0", "1", "2", or "3".
  optional FeatureType type = 6;

  // Domain for the values of the feature.
  oneof domain_info {
    // Reference to a domain defined at the schema level.
    string domain = 7;

    // Inline definitions of domains.
    IntDomain int_domain = 9;
    FloatDomain float_domain = 10;
    StringDomain string_domain = 11;
    BoolDomain bool_domain = 13;
    StructDomain struct_domain = 29;

    // Supported semantic domains.
    NaturalLanguageDomain natural_language_domain = 24;
    ImageDomain image_domain = 25;
    MIDDomain mid_domain = 26;
    URLDomain url_domain = 27;
    TimeDomain time_domain = 28;
    TimeOfDayDomain time_of_day_domain = 30;
  }

  // Constraints on the distribution of the feature values.
  // Only supported for StringDomains.
  optional DistributionConstraints distribution_constraints = 15;

  // Additional information about the feature for documentation purpose.
  optional Annotation annotation = 16;

  // Tests comparing the distribution to the associated serving data.
  optional FeatureComparator skew_comparator = 18;

  // Tests comparing the distribution between two consecutive spans (e.g. days).
  optional FeatureComparator drift_comparator = 21;

  // List of environments this feature is present in.
  // Should be disjoint from not_in_environment.
  // This feature is in environment "foo" if:
  // ("foo" is in in_environment or default_environment) AND
  // "foo" is not in not_in_environment.
  // See Schema::default_environment.
  repeated string in_environment = 20;

  // List of environments this feature is not present in.
  // Should be disjoint from in_environment.
  // See Schema::default_environment and in_environment.
  repeated string not_in_environment = 19;

  // The lifecycle stage of a feature. It can also apply to its descendants.
  // i.e., if a struct is DEPRECATED, its children are implicitly deprecated.
  optional LifecycleStage lifecycle_stage = 22;

  // Constraints on the number of unique values for a given feature.
  // This is supported for string and categorical features only.
  optional UniqueConstraints unique_constraints = 31;

  // If set, indicates that this feature is derived, and stores metadata
  // about its source. If this field is set, this feature should have
  // lifecycle_stage VALIDATION_DERIVED or DISABLED.
  // Experimental and subject to change.
  optional DerivedFeatureSource validation_derived_source = 34;

  // Field number 33 is reserved and must not be reused.
  reserved 33;
}
// Additional information about the schema or about a feature.
message Annotation {
  // Tags can be used to mark features. For example, tag on user_age feature can
  // be `user_feature`, tag on user_country feature can be `location_feature`,
  // `user_feature`.
  repeated string tag = 1;

  // Free-text comments. This can be used as a description of the feature,
  // developer notes etc.
  repeated string comment = 2;

  // Application-specific metadata may be attached here.
  repeated .google.protobuf.Any extra_metadata = 3;
}
// Checks that the ratio of the current value to the previous value is not below
// the min_fraction_threshold or above the max_fraction_threshold. That is,
// previous value * min_fraction_threshold <= current value <=
// previous value * max_fraction_threshold.
// To specify that the value cannot change, set both min_fraction_threshold and
// max_fraction_threshold to 1.0.
message NumericValueComparator {
  // Lower bound on (current value / previous value).
  optional double min_fraction_threshold = 1;

  // Upper bound on (current value / previous value).
  optional double max_fraction_threshold = 2;
}
// Constraints on the entire dataset.
message DatasetConstraints {
  // Tests differences in number of examples between the current data and the
  // previous span.
  optional NumericValueComparator num_examples_drift_comparator = 1;

  // Tests comparisons in number of examples between the current data and the
  // previous version of that data.
  optional NumericValueComparator num_examples_version_comparator = 2;

  // Minimum number of examples in the dataset.
  optional int64 min_examples_count = 3;

  // Maximum number of examples in the dataset.
  optional int64 max_examples_count = 4;
}
// Specifies a fixed shape for the feature's values. The immediate implication
// is that each feature has a fixed number of values. Moreover, these values
// can be parsed in a multi-dimensional tensor using the specified axis sizes.
// The FixedShape defines a lexicographical ordering of the data. For instance,
// if there is a FixedShape {
//   dim {size:3} dim {size:2}
// }
// then tensor[0][0]=field[0]
// then tensor[0][1]=field[1]
// then tensor[1][0]=field[2]
// then tensor[1][1]=field[3]
// then tensor[2][0]=field[4]
// then tensor[2][1]=field[5]
//
// The FixedShape message is identical with the TensorFlow TensorShape proto
// message.
message FixedShape {
  // The dimensions that define the shape. The total number of values in each
  // example is the product of sizes of each dimension.
  repeated Dim dim = 2;

  // An axis in a multi-dimensional feature representation.
  message Dim {
    // Number of values along this axis.
    optional int64 size = 1;

    // Optional name of the tensor dimension.
    optional string name = 2;
  }
}
// Limits on maximum and minimum number of values in a
// single example (when the feature is present). Use this when the minimum
// value count can be different than the maximum value count. Otherwise prefer
// FixedShape.
message ValueCount {
  // Minimum number of values in a single example.
  optional int64 min = 1;

  // Maximum number of values in a single example.
  optional int64 max = 2;
}
/* BEGIN GOOGLE-LEGACY
// Constraint on the number of elements in a sequence.
message LengthConstraint {
  optional int64 min = 1;
  optional int64 max = 2;
}

// A sequence is a logical feature that comprises several "raw" features that
// encode values at different "steps" within the sequence.
// TODO(b/110490010): Delete this. This is a special case of StructDomain.
message Sequence {
  // An optional name for this sequence. Used mostly for debugging and
  // presentation.
  optional string name = 1;

  // Features that comprise the sequence. These features are "zipped" together
  // to form the values for the sequence at different steps.
  // - Use group_presence within each feature to encode presence constraints
  //   within the sequence.
  // - If all features have the same value-count constraints then
  //   declare this once using the shape_constraint below.
  repeated Feature feature = 2;

  // Constraints on the presence of the sequence across all examples in the
  // dataset. The sequence is assumed to be present if at least one of its
  // features is present.
  optional FeaturePresence presence = 3;

  // Shape constraints that apply on all the features that comprise the
  // sequence. If this is set then the value_count in 'feature' is
  // ignored.
  // TODO(martinz): delete: there is no reason to believe the shape of the
  // fields in a sequence will be the same. Use the fields in Feature instead.
  oneof shape_constraint {
    ValueCount value_count = 4;
    FixedShape fixed_shape = 5;
  }

  // Constraint on the number of elements in a sequence.
  optional LengthConstraint length_constraint = 6;
}
END GOOGLE-LEGACY */
// Represents a weighted feature that is encoded as a combination of raw base
// features. The `weight_feature` should be a float feature with identical
// shape as the `feature`. This is useful for representing weights associated
// with categorical tokens (e.g. a TFIDF weight associated with each token).
// TODO(b/142122960): Handle WeightedCategorical end to end in TFX (validation,
// TFX Unit Testing, etc)
message WeightedFeature {
  // Name for the weighted feature. This should not clash with other features in
  // the same schema.
  optional string name = 1;  // required

  // Path of a base feature to be weighted. Required.
  optional Path feature = 2;

  // Path of weight feature to associate with the base feature. Must be same
  // shape as feature. Required.
  optional Path weight_feature = 3;

  // The lifecycle_stage determines where a feature is expected to be used,
  // and therefore how important issues with it are.
  optional LifecycleStage lifecycle_stage = 4;
}
// A sparse feature represents a sparse tensor that is encoded with a
// combination of raw features, namely index features and a value feature. Each
// index feature defines a list of indices in a different dimension.
message SparseFeature {
  // Field number 11 is reserved and must not be reused.
  reserved 11;

  // Name for the sparse feature. This should not clash with other features in
  // the same schema.
  optional string name = 1;  // required

  // This field is no longer supported. Instead, use:
  // lifecycle_stage: DEPRECATED
  // TODO(b/111450258): remove this.
  optional bool deprecated = 2 [deprecated = true];

  // The lifecycle_stage determines where a feature is expected to be used,
  // and therefore how important issues with it are.
  optional LifecycleStage lifecycle_stage = 7;

  // Comment field for a human readable description of the field.
  // Field number 3 is taken by this legacy field; do not reuse it.
  // TODO(martinz): delete, convert to annotation.
  // GOOGLE-LEGACY optional string comment = 3 [deprecated = true];

  // Constraints on the presence of this feature in examples.
  // Deprecated, this is inferred by the referred features.
  optional FeaturePresence presence = 4 [deprecated = true];

  // Shape of the sparse tensor that this SparseFeature represents.
  // Currently not supported.
  // TODO(b/109669962): Consider deriving this from the referred features.
  optional FixedShape dense_shape = 5;

  // Features that represent indexes. Should be integers >= 0.
  repeated IndexFeature index_feature = 6;  // at least one

  // Reference to a feature that supplies indices.
  message IndexFeature {
    // Name of the index-feature. This should be a reference to an existing
    // feature in the schema.
    optional string name = 1;
  }

  // If true then the index values are already sorted lexicographically.
  optional bool is_sorted = 8;

  // Feature that supplies the values of the sparse tensor.
  optional ValueFeature value_feature = 9;  // required

  // Reference to a feature that supplies values.
  message ValueFeature {
    // Name of the value-feature. This should be a reference to an existing
    // feature in the schema.
    optional string name = 1;
  }

  // Type of value feature.
  // Deprecated, this is inferred by the referred features.
  optional FeatureType type = 10 [deprecated = true];
}
// Models constraints on the distribution of a feature's values.
// TODO(martinz): replace min_domain_mass with max_off_domain (but slowly).
message DistributionConstraints {
  // The minimum fraction (in [0,1]) of values across all examples that
  // should come from the feature's domain, e.g.:
  //   1.0 => All values must come from the domain.
  //    .9 => At least 90% of the values must come from the domain.
  optional double min_domain_mass = 1 [default = 1.0];
}
// Encodes vocabulary coverage constraints.
message FeatureCoverageConstraints {
  // Fraction of feature values that map to a vocab entry (i.e. are not oov).
  optional float min_coverage = 1;

  // Average length of tokens. Used for cases such as wordpiece that fallback
  // to character-level tokenization.
  optional float min_avg_token_length = 2;

  // String tokens to exclude when calculating min_coverage and
  // min_avg_token_length. Useful for tokens such as [PAD].
  repeated string excluded_string_tokens = 3;

  // Integer tokens to exclude when calculating min_coverage and
  // min_avg_token_length.
  repeated int64 excluded_int_tokens = 4 [packed = true];

  // String tokens to treat as oov tokens (e.g. [UNK]). These tokens are also
  // excluded when calculating avg token length.
  repeated string oov_string_tokens = 5;
}
// Encodes constraints on specific values in sequences.
message SequenceValueConstraints {
  // The value which to express constraints for. Can be either an integer or
  // a string.
  oneof value {
    int64 int_value = 1;
    string string_value = 2;
  }

  // Min / max number of times the value can occur in a sequence.
  optional int64 min_per_sequence = 3;
  optional int64 max_per_sequence = 4;

  // Min / max fraction of sequences that must contain the value.
  optional float min_fraction_of_sequences = 5;
  optional float max_fraction_of_sequences = 6;
}
// Encodes constraints on sequence lengths.
message SequenceLengthConstraints {
  // Token values (int and string) that are excluded when calculating sequence
  // length.
  repeated int64 excluded_int_value = 1;
  repeated string excluded_string_value = 2;

  // Min / max sequence length.
  optional int64 min_sequence_length = 3;
  optional int64 max_sequence_length = 4;
}
// Encodes information for domains of integer values.
// Note that FeatureType could be either INT or BYTES.
message IntDomain {
  // Id of the domain. Required if the domain is defined at the schema level. If
  // so, then the name must be unique within the schema.
  optional string name = 1;

  // Min and max values for the domain.
  optional int64 min = 3;
  optional int64 max = 4;

  // If true then the domain encodes categorical values (i.e., ids) rather than
  // ordinal values.
  optional bool is_categorical = 5;
}
// Encodes information for domains of float values.
// Note that FeatureType could be either FLOAT or BYTES.
message FloatDomain {
  // Id of the domain. Required if the domain is defined at the schema level. If
  // so, then the name must be unique within the schema.
  optional string name = 1;

  // Min and max values of the domain.
  optional float min = 3;
  optional float max = 4;

  // If true, feature should not contain NaNs.
  optional bool disallow_nan = 5;

  // If true, feature should not contain Inf or -Inf.
  optional bool disallow_inf = 6;

  // If true, this indicates that the feature is semantically an embedding. This
  // can be useful for distinguishing fixed dimensional numeric features that
  // should be fed to a model unmodified.
  optional bool is_embedding = 7;

  // If true then the domain encodes categorical values (i.e., ids) rather than
  // continuous values.
  optional bool is_categorical = 8;
}
// Domain for a recursive struct.
// NOTE: If a feature with a StructDomain is deprecated, then all the
// child features (features and sparse_features of the StructDomain) are also
// considered to be deprecated. Similarly child features can only be in
// environments of the parent feature.
message StructDomain {
  // Child features of the struct.
  repeated Feature feature = 1;

  // Child sparse features of the struct.
  repeated SparseFeature sparse_feature = 2;
}
// Encodes information for domains of string values.
message StringDomain {
  // Id of the domain. Required if the domain is defined at the schema level. If
  // so, then the name must be unique within the schema.
  optional string name = 1;

  // The values appearing in the domain.
  repeated string value = 2;
}
// Encodes information about the domain of a boolean attribute that encodes its
// TRUE/FALSE values as strings, or 0=false, 1=true.
// Note that FeatureType could be either INT or BYTES.
message BoolDomain {
  // Id of the domain. Required if the domain is defined at the schema level. If
  // so, then the name must be unique within the schema.
  optional string name = 1;

  // Strings values for TRUE/FALSE.
  optional string true_value = 2;
  optional string false_value = 3;
}
// BEGIN SEMANTIC-TYPES-PROTOS
// Semantic domains are specialized feature domains. For example a string
// Feature might represent a Time of a specific format.
// Semantic domains are defined as protocol buffers to allow further sub-types /
// specialization, e.g: NaturalLanguageDomain can provide information on the
// language of the text.

// Natural language text.
message NaturalLanguageDomain {
  // Name of the vocabulary associated with the NaturalLanguageDomain.
  // When computing and validating stats using TFDV,
  // tfdv.StatsOptions.vocab_paths should map this name to a vocabulary file.
  optional string vocabulary = 1;

  // Vocabulary coverage constraints for the feature's tokens.
  optional FeatureCoverageConstraints coverage = 2;

  // Per-token occurrence constraints.
  repeated SequenceValueConstraints token_constraints = 3;

  // Constraints on the length of each sequence.
  optional SequenceLengthConstraints sequence_length_constraints = 5;

  // Specifies the location constraints as a function of the tokens specified
  // in token_constraints.
  //
  // String tokens will be specified by S_TOKEN_, (e.g. S_(PAD)_) and integer
  // tokens will be specified as I_#_ (e.g. I_123_).
  // A_T_ will match any token that has not been specified in token_constraints.
  // Parenthesis, +, and * are supported.
  // _ will be escapable with a \ for tokens containing it (e.g. FOO\_BAR).
  //
  // For example, a two-sequence BERT model may look as follows:
  // S_(CLS)_ A_T_+ S_(SEP)_ A_T_+ S_(SEP)_ S_(PAD)_*
  //
  // Note: Support for this field is not yet implemented. Please do not use.
  // TODO(b/188095987): Remove warning once field is implemented.
  optional string location_constraint_regex = 4;
}
// Image data.
message ImageDomain {
  // If set, at least this fraction of values should be TensorFlow supported
  // images.
  optional float minimum_supported_image_fraction = 1;

  // If set, image should have less than this value of undecoded byte size.
  optional int64 max_image_byte_size = 2;
}
// Knowledge graph ID, see: https://www.wikidata.org/wiki/Property:P646
// Intentionally has no fields; it only marks the feature's semantic type.
message MIDDomain {}
// A URL, see: https://en.wikipedia.org/wiki/URL
// Intentionally has no fields; it only marks the feature's semantic type.
message URLDomain {}
// Time or date representation.
message TimeDomain {
  // Encodings for times stored as integers.
  enum IntegerTimeFormat {
    FORMAT_UNKNOWN = 0;
    UNIX_DAYS = 5;  // Number of days since 1970-01-01.
    UNIX_SECONDS = 1;
    UNIX_MILLISECONDS = 2;
    UNIX_MICROSECONDS = 3;
    UNIX_NANOSECONDS = 4;
  }

  oneof format {
    // Expected format that contains a combination of regular characters and
    // special format specifiers. Format specifiers are a subset of the
    // strptime standard.
    string string_format = 1;

    // Expected format of integer times.
    IntegerTimeFormat integer_format = 2;
  }
}
// Time of day, without a particular date.
message TimeOfDayDomain {
  // Encodings for times of day stored as integers.
  enum IntegerTimeOfDayFormat {
    FORMAT_UNKNOWN = 0;

    // Time values, containing hour/minute/second/nanos, encoded into 8-byte
    // bit fields following the ZetaSQL convention:
    //        6         5         4         3         2         1
    // MSB 3210987654321098765432109876543210987654321098765432109876543210 LSB
    //                      | H ||  M ||  S ||---------- nanos -----------|
    PACKED_64_NANOS = 1;
  }

  oneof format {
    // Expected format that contains a combination of regular characters and
    // special format specifiers. Format specifiers are a subset of the
    // strptime standard.
    string string_format = 1;

    // Expected format of integer times.
    IntegerTimeOfDayFormat integer_format = 2;
  }
}
// END SEMANTIC-TYPES-PROTOS

// Describes the physical representation of a feature.
// It may be different than the logical representation, which
// is represented as a Domain.
enum FeatureType {
  TYPE_UNKNOWN = 0;
  BYTES = 1;
  INT = 2;
  FLOAT = 3;
  STRUCT = 4;
}
// Describes constraints on the presence of the feature in the data.
message FeaturePresence {
  // Minimum fraction of examples that have this feature.
  optional double min_fraction = 1;

  // Minimum number of examples that have this feature.
  optional int64 min_count = 2;
}
// Records constraints on the presence of a feature inside a "group" context
// (e.g., .presence inside a group of features that define a sequence).
message FeaturePresenceWithinGroup {
  // Whether the feature is required within the group.
  optional bool required = 1;
}
// Checks that the L-infinity norm is below a certain threshold between the
// two discrete distributions. Since this is applied to a FeatureNameStatistics,
// it only considers the top k.
// L_infty(p,q) = max_i |p_i-q_i|
message InfinityNorm {
  // The InfinityNorm is in the interval [0.0, 1.0] so sensible bounds should
  // be in the interval [0.0, 1.0).
  optional double threshold = 1;
}
// Checks that the approximate Jensen-Shannon Divergence is below a certain
// threshold between the two distributions.
message JensenShannonDivergence {
  // The JensenShannonDivergence will be in the interval [0.0, 1.0] so sensible
  // bounds should be in the interval [0.0, 1.0).
  optional double threshold = 1;
}
// Distribution-comparison checks for a feature. Used by
// Feature.skew_comparator and Feature.drift_comparator.
message FeatureComparator {
  // L-infinity norm check; see InfinityNorm.
  optional InfinityNorm infinity_norm = 1;

  // Approximate Jensen-Shannon divergence check; see JensenShannonDivergence.
  optional JensenShannonDivergence jensen_shannon_divergence = 2;
}
// Checks that the number of unique values is greater than or equal to the min,
// and less than or equal to the max.
message UniqueConstraints {
  // Minimum number of unique values (inclusive).
  optional int64 min = 1;

  // Maximum number of unique values (inclusive).
  optional int64 max = 2;
}
// A TensorRepresentation captures the intent for converting columns in a
// dataset to TensorFlow Tensors (or more generally, tf.CompositeTensors).
// Note that one tf.CompositeTensor may consist of data from multiple columns,
// for example, a N-dimensional tf.SparseTensor may need N + 1 columns to
// provide the sparse indices and values.
// Note that the "column name" that a TensorRepresentation needs is a
// string, not a Path -- it means that the column name identifies a top-level
// Feature in the schema (i.e. you cannot specify a Feature nested in a STRUCT
// Feature).
message TensorRepresentation {
  // Value used to fill in for missing data; exactly one kind may be set.
  message DefaultValue {
    oneof kind {
      double float_value = 1;

      // Note that the data column might be of a shorter integral type. It's the
      // user's responsibility to make sure the default value fits that type.
      int64 int_value = 2;

      bytes bytes_value = 3;

      // uint_value should only be used if the default value can't fit in a
      // int64 (`int_value`).
      uint64 uint_value = 4;
    }
  }

  // A tf.Tensor
  message DenseTensor {
    // Identifies the column in the dataset that provides the values of this
    // Tensor.
    optional string column_name = 1;

    // The shape of each row of the data (i.e. does not include the batch
    // dimension)
    optional FixedShape shape = 2;

    // If this column is missing values in a row, the default_value will be
    // used to fill that row.
    optional DefaultValue default_value = 3;
  }

  // A ragged tf.SparseTensor that models nested lists.
  message VarLenSparseTensor {
    // Identifies the column in the dataset that should be converted to the
    // VarLenSparseTensor.
    optional string column_name = 1;
  }

  // A tf.SparseTensor whose indices and values come from separate data columns.
  // This will replace Schema.sparse_feature eventually.
  // The index columns must be of INT type, and all the columns must co-occur
  // and have the same valency at the same row.
  message SparseTensor {
    // The dense shape of the resulting SparseTensor (does not include the batch
    // dimension).
    optional FixedShape dense_shape = 1;

    // The columns constitute the coordinates of the values.
    // indices_column[i][j] contains the coordinate of the i-th dimension of the
    // j-th value.
    repeated string index_column_names = 2;

    // The column that contains the values.
    optional string value_column_name = 3;
  }

  // A tf.RaggedTensor that models nested lists.
  // Currently there is no way for the user to specify the shape of the leaf
  // value (the innermost value tensor of the RaggedTensor). The leaf value will
  // always be a 1-D tensor.
  message RaggedTensor {
    // Identifies the leaf feature that provides values of the RaggedTensor.
    // The leaf may be a sub-field of STRUCT-type features:
    // the first step of the path refers to a top-level feature in the data. The
    // remaining steps refer to STRUCT features under the top-level feature,
    // recursively.
    // If the feature has N outer ragged lists, they will become the first
    // N dimensions of the resulting RaggedTensor and the contents will become
    // the flat_values.
    optional Path feature_path = 1;  // required.

    // Further partition of the feature values at the leaf level.
    message Partition {
      oneof kind {
        // If the final element(s) of partition are uniform_row_lengths [U0, U1,
        // ...] , then the result RaggedTensor will have their flat values (a
        // dense tensor) being of shape [U0, U1, ...]. Otherwise, a
        // uniform_row_length simply means a ragged dimension with row_lengths
        // [uniform_row_length]*nrows.
        int64 uniform_row_length = 1;

        // Identifies a leaf feature that shares the same parent as
        // feature_path and contains the partition row lengths.
        string row_length = 2;
      }
    }

    // The result RaggedTensor would be of shape:
    //   [B, D_0, D_1, ..., D_N, P_0, P_1, ..., P_M, U_0, U_1, ..., U_P]
    //
    // Where the dimensions belong to different categories:
    // * B: Batch size dimension
    // * D_n: Dimensions specified by the nested structure specified by the
    //   value path until the leaf node. n>=1.
    // * P_m: Dimensions specified by the partitions that do not define any
    //   fixed dimension size. m>=0.
    // * U_p: Dimensions specified by the latest partitions of type
    //   uniform_row_length that can define the fixed inner shape of the tensor.
    //   If iterating the partitions from the end to the beginning, these
    //   dimensions are defined by all the continuous uniform_row_length
    //   partitions present. p>=0.
    repeated Partition partition = 3;

    // The data type of the ragged tensor's row partitions. This will
    // default to INT64 if it is not specified.
    optional RowPartitionDType row_partition_dtype = 2;
  }

  // RaggedTensor consists of RowPartitions. This enum allows the user to
  // specify the dtype of those RowPartitions. If it is UNSPECIFIED, then we
  // default to INT64.
  enum RowPartitionDType {
    UNSPECIFIED = 0;
    INT64 = 1;
    INT32 = 2;
  }

  // The kind of tensor this representation produces; exactly one may be set.
  oneof kind {
    DenseTensor dense_tensor = 1;
    VarLenSparseTensor varlen_sparse_tensor = 2;
    SparseTensor sparse_tensor = 3;
    RaggedTensor ragged_tensor = 4;
  }
}
// A TensorRepresentationGroup is a collection of TensorRepresentations with
// names. These names may serve as identifiers when converting the dataset
// to a collection of Tensors or tf.CompositeTensors.
// For example, given the following group:
// {
//   key: "dense_tensor"
//   tensor_representation {
//     dense_tensor {
//       column_name: "univalent_feature"
//       shape {
//         dim {
//           size: 1
//         }
//       }
//       default_value {
//         float_value: 0
//       }
//     }
//   }
// }
// {
//   key: "varlen_sparse_tensor"
//   tensor_representation {
//     varlen_sparse_tensor {
//       column_name: "multivalent_feature"
//     }
//   }
// }
//
// Then the schema is expected to have feature "univalent_feature" and
// "multivalent_feature", and when a batch of data is converted to Tensors using
// this TensorRepresentationGroup, the result may be the following dict:
// {
//   "dense_tensor": tf.Tensor(...),
//   "varlen_sparse_tensor": tf.SparseTensor(...),
// }
message TensorRepresentationGroup {
  // Maps each output tensor name to the representation that produces it.
  map<string, TensorRepresentation> tensor_representation = 1;
}