// Copyright 2017 The TensorFlow Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// =============================================================================
syntax = "proto2";
package tensorflow.metadata.v0;
import "google/protobuf/any.proto";
import "tensorflow_metadata/proto/v0/path.proto";
option cc_enable_arenas = true;
option java_package = "org.tensorflow.metadata.v0";
option java_multiple_files = true;
// Lifecycle stage of a feature. Only features in the UNKNOWN_STAGE, BETA, and
// PRODUCTION stages are actually validated; PLANNED, ALPHA, and DEBUG_ONLY are
// treated as DEPRECATED.
enum LifecycleStage {
  // Unknown stage.
  UNKNOWN_STAGE = 0;
  // Planned feature, may not be created yet.
  PLANNED = 1;
  // Prototype feature, not used in experiments yet.
  ALPHA = 2;
  // Used in user-facing experiments.
  BETA = 3;
  // Used in a significant fraction of user traffic.
  PRODUCTION = 4;
  // No longer supported: do not use in new models.
  DEPRECATED = 5;
  // Only exists for debugging purposes.
  DEBUG_ONLY = 6;
}
//
// Message to represent schema information.
// NextID: 13
message Schema {
  // Field numbers of the legacy fields that only appear in comments below
  // (`sequence` = 2, `generate_legacy_feature_spec` = 7). They are reserved so
  // that they cannot be reused by accident.
  reserved 2, 7;

  // Features described in this schema.
  repeated Feature feature = 1;

  // Sparse features described in this schema.
  repeated SparseFeature sparse_feature = 6;

  // Weighted features described in this schema.
  repeated WeightedFeature weighted_feature = 12;

  // Legacy field; use StructDomain instead.
  // Sequences described in this schema. A sequence may be described in terms of
  // several features. Any features appearing within a sequence must *not* be
  // declared as top-level features in <feature>.
  // GOOGLE-LEGACY repeated Sequence sequence = 2;

  // String domains referenced in the features.
  repeated StringDomain string_domain = 4;

  // Top-level float domains that can be reused by features.
  repeated FloatDomain float_domain = 9;

  // Top-level int domains that can be reused by features.
  repeated IntDomain int_domain = 10;

  // Default environments for each feature.
  // An environment represents both a type of location (e.g. a server or phone)
  // and a time (e.g. right before model X is run). In the standard scenario,
  // 99% of the features should be in the default environments TRAINING and
  // SERVING, while the LABEL (or labels) and WEIGHT are only available at
  // TRAINING (not at serving).
  // Other possible variations:
  // 1. There may be TRAINING_MOBILE, SERVING_MOBILE, TRAINING_SERVICE,
  //    and SERVING_SERVICE.
  // 2. If one is ensembling three models, where the predictions of the first
  //    three models are available for the ensemble model, there may be
  //    TRAINING, SERVING_INITIAL, SERVING_ENSEMBLE.
  // See Feature::not_in_environment and Feature::in_environment.
  repeated string default_environment = 5;

  /* BEGIN GOOGLE-LEGACY
  // TODO(b/73109633): Change default to false, before removing this field.
  optional bool generate_legacy_feature_spec = 7 [default = true];
  END GOOGLE-LEGACY */

  // Additional information about the schema as a whole. Features may also
  // be annotated individually.
  optional Annotation annotation = 8;

  // Dataset-level constraints. This is currently used for specifying
  // information about changes in num_examples.
  optional DatasetConstraints dataset_constraints = 11;
}
// Describes schema-level information about a specific feature.
// NextID: 31
message Feature {
  // Field number of the legacy `comment` field that only appears in a comment
  // below. It is reserved so that it cannot be reused by accident.
  reserved 3;

  // The name of the feature.
  optional string name = 1;  // required

  // This field is no longer supported. Instead, use:
  //   lifecycle_stage: DEPRECATED
  // TODO(b/111450258): remove this.
  optional bool deprecated = 2 [deprecated = true];

  // Comment field for a human readable description of the field.
  // TODO(b/123518108): remove this.
  // GOOGLE-LEGACY optional string comment = 3 [deprecated = true];

  oneof presence_constraints {
    // Constraints on the presence of this feature in the examples.
    FeaturePresence presence = 14;
    // Only used in the context of a "group" context, e.g., inside a sequence.
    FeaturePresenceWithinGroup group_presence = 17;
  }

  // The shape of the feature which governs the number of values that appear in
  // each example.
  oneof shape_type {
    // The feature has a fixed shape corresponding to a multi-dimensional
    // tensor.
    FixedShape shape = 23;
    // The feature doesn't have a well defined shape. All we know are limits on
    // the minimum and maximum number of values.
    ValueCount value_count = 5;
  }

  // Physical type of the feature's values.
  // Note that you can have:
  //   type: BYTES
  //   int_domain: {
  //     min: 0
  //     max: 3
  //   }
  // This would be a field that is syntactically BYTES (i.e. strings), but
  // semantically an int, i.e. it would be "0", "1", "2", or "3".
  optional FeatureType type = 6;

  // Domain for the values of the feature.
  oneof domain_info {
    // Reference to a domain defined at the schema level.
    string domain = 7;

    // Inline definitions of domains.
    IntDomain int_domain = 9;
    FloatDomain float_domain = 10;
    StringDomain string_domain = 11;
    BoolDomain bool_domain = 13;
    StructDomain struct_domain = 29;

    // Supported semantic domains (Not supported in TFDV).
    NaturalLanguageDomain natural_language_domain = 24;
    ImageDomain image_domain = 25;
    MIDDomain mid_domain = 26;
    URLDomain url_domain = 27;
    TimeDomain time_domain = 28;
    TimeOfDayDomain time_of_day_domain = 30;
  }

  // Constraints on the distribution of the feature values.
  // Currently only supported for StringDomains.
  // TODO(b/69473628): Extend functionality to other domain types.
  optional DistributionConstraints distribution_constraints = 15;

  // Additional information about the feature for documentation purpose.
  optional Annotation annotation = 16;

  // Tests comparing the distribution to the associated serving data.
  optional FeatureComparator skew_comparator = 18;

  // Tests comparing the distribution between two consecutive spans (e.g. days).
  optional FeatureComparator drift_comparator = 21;

  // List of environments this feature is present in.
  // Should be disjoint from not_in_environment.
  // This feature is in environment "foo" if:
  //   ("foo" is in in_environment or default_environment) AND
  //   "foo" is not in not_in_environment.
  // See Schema::default_environment.
  repeated string in_environment = 20;

  // List of environments this feature is not present in.
  // Should be disjoint from in_environment.
  // See Schema::default_environment and in_environment.
  repeated string not_in_environment = 19;

  // The lifecycle stage of a feature. It can also apply to its descendants.
  // i.e., if a struct is DEPRECATED, its children are implicitly deprecated.
  optional LifecycleStage lifecycle_stage = 22;
}
// Extra, free-form information attached to a schema or to a single feature.
message Annotation {
  // Tags used to mark features. For example, the user_age feature could be
  // tagged `user_feature`, and the user_country feature could be tagged
  // `location_feature` and `user_feature`.
  repeated string tag = 1;

  // Free-text comments, e.g. a description of the feature or developer notes.
  repeated string comment = 2;

  // Attachment point for application-specific metadata.
  repeated .google.protobuf.Any extra_metadata = 3;
}
// Bounds the ratio of the current value to the previous value: the ratio must
// be neither below min_fraction_threshold nor above max_fraction_threshold.
// In other words:
//   previous value * min_fraction_threshold <= current value <=
//   previous value * max_fraction_threshold.
// Setting both thresholds to 1.0 states that the value cannot change.
message NumericValueComparator {
  // Lower bound on (current value / previous value).
  optional double min_fraction_threshold = 1;

  // Upper bound on (current value / previous value).
  optional double max_fraction_threshold = 2;
}
// Constraints that apply to the dataset as a whole.
message DatasetConstraints {
  // Tests differences in the number of examples between the current data and
  // the previous span.
  optional NumericValueComparator num_examples_drift_comparator = 1;

  // Tests comparisons in the number of examples between the current data and
  // the previous version of that data.
  optional NumericValueComparator num_examples_version_comparator = 2;

  // Minimum number of examples in the dataset.
  optional int64 min_examples_count = 3;
}
// Specifies a fixed shape for the feature's values. The immediate implication
// is that each feature has a fixed number of values. Moreover, these values
// can be parsed in a multi-dimensional tensor using the specified axis sizes.
// The FixedShape defines a lexicographical ordering of the data. For instance,
// if there is a FixedShape {
//   dim {size:3} dim {size:2}
// }
// then:
//   tensor[0][0]=field[0]
//   tensor[0][1]=field[1]
//   tensor[1][0]=field[2]
//   tensor[1][1]=field[3]
//   tensor[2][0]=field[4]
//   tensor[2][1]=field[5]
//
// The FixedShape message is identical with the TensorFlow TensorShape proto
// message.
message FixedShape {
  // An axis in a multi-dimensional feature representation.
  message Dim {
    // Size of this axis.
    optional int64 size = 1;

    // Optional name of the tensor dimension.
    optional string name = 2;
  }

  // The dimensions that define the shape. The total number of values in each
  // example is the product of sizes of each dimension.
  repeated Dim dim = 2;
}
// Bounds on the number of values in a single example (when the feature is
// present). Use this when the minimum value count can differ from the maximum
// value count; otherwise prefer FixedShape.
message ValueCount {
  // Minimum number of values.
  optional int64 min = 1;

  // Maximum number of values.
  optional int64 max = 2;
}
/* BEGIN GOOGLE-LEGACY
// Constraint on the number of elements in a sequence.
message LengthConstraint {
optional int64 min = 1;
optional int64 max = 2;
}
// A sequence is a logical feature that comprises several "raw" features that
// encode values at different "steps" within the sequence.
// TODO(b/110490010): Delete this. This is a special case of StructDomain.
message Sequence {
// An optional name for this sequence. Used mostly for debugging and
// presentation.
optional string name = 1;
// Features that comprise the sequence. These features are "zipped" together
// to form the values for the sequence at different steps.
// - Use group_presence within each feature to encode presence constraints
// within the sequence.
// - If all features have the same value-count constraints then
// declare this once using the shape_constraint below.
repeated Feature feature = 2;
// Constraints on the presence of the sequence across all examples in the
// dataset. The sequence is assumed to be present if at least one of its
// features is present.
optional FeaturePresence presence = 3;
// Shape constraints that apply on all the features that comprise the
// sequence. If this is set then the value_count in 'feature' is
// ignored.
// TODO(martinz): delete: there is no reason to believe the shape of the
// fields in a sequence will be the same. Use the fields in Feature instead.
oneof shape_constraint {
ValueCount value_count = 4;
FixedShape fixed_shape = 5;
}
// Constraint on the number of elements in a sequence.
optional LengthConstraint length_constraint = 6;
}
END GOOGLE-LEGACY */
// A weighted feature, encoded as a combination of raw base features. The
// `weight_feature` should be a float feature whose shape is identical to that
// of `feature`. This is useful for representing weights associated with
// categorical tokens (e.g. a TFIDF weight associated with each token).
// TODO(b/142122960): Handle WeightedCategorical end to end in TFX (validation,
// TFX Unit Testing, etc)
message WeightedFeature {
  // Name for the weighted feature. This should not clash with other features
  // in the same schema.
  optional string name = 1;  // required

  // Path of a base feature to be weighted. Required.
  optional Path feature = 2;

  // Path of weight feature to associate with the base feature. Must be same
  // shape as feature. Required.
  optional Path weight_feature = 3;

  // The lifecycle_stage determines where a feature is expected to be used,
  // and therefore how important issues with it are.
  optional LifecycleStage lifecycle_stage = 4;
}
// A sparse feature represents a sparse tensor that is encoded with a
// combination of raw features, namely index features and a value feature. Each
// index feature defines a list of indices in a different dimension.
message SparseFeature {
  // 3 is the number of the legacy `comment` field that only appears in a
  // comment below; it is reserved, like 11, so that it cannot be reused by
  // accident.
  reserved 3, 11;

  // Name for the sparse feature. This should not clash with other features in
  // the same schema.
  optional string name = 1;  // required

  // This field is no longer supported. Instead, use:
  //   lifecycle_stage: DEPRECATED
  // TODO(b/111450258): remove this.
  optional bool deprecated = 2 [deprecated = true];

  // The lifecycle_stage determines where a feature is expected to be used,
  // and therefore how important issues with it are.
  optional LifecycleStage lifecycle_stage = 7;

  // Comment field for a human readable description of the field.
  // TODO(martinz): delete, convert to annotation.
  // GOOGLE-LEGACY optional string comment = 3 [deprecated = true];

  // Constraints on the presence of this feature in examples.
  // Deprecated, this is inferred by the referred features.
  optional FeaturePresence presence = 4 [deprecated = true];

  // Shape of the sparse tensor that this SparseFeature represents.
  // Currently not supported.
  // TODO(b/109669962): Consider deriving this from the referred features.
  optional FixedShape dense_shape = 5;

  // Features that represent indexes. Should be integers >= 0.
  repeated IndexFeature index_feature = 6;  // at least one

  // Reference to a feature that holds indices.
  message IndexFeature {
    // Name of the index-feature. This should be a reference to an existing
    // feature in the schema.
    optional string name = 1;
  }

  // If true then the index values are already sorted lexicographically.
  optional bool is_sorted = 8;

  // The feature that holds the values of the sparse tensor.
  optional ValueFeature value_feature = 9;  // required

  // Reference to a feature that holds values.
  message ValueFeature {
    // Name of the value-feature. This should be a reference to an existing
    // feature in the schema.
    optional string name = 1;
  }

  // Type of value feature.
  // Deprecated, this is inferred by the referred features.
  optional FeatureType type = 10 [deprecated = true];
}
// Constraints on how the values of a feature are distributed.
// TODO(martinz): replace min_domain_mass with max_off_domain (but slowly).
message DistributionConstraints {
  // Minimum fraction (in [0,1]) of the values, across all examples, that
  // should come from the feature's domain. For example:
  //   1.0 => All values must come from the domain.
  //    .9 => At least 90% of the values must come from the domain.
  optional double min_domain_mass = 1 [default = 1.0];
}
// Domain of integer values.
// Note that the FeatureType could be either INT or BYTES.
message IntDomain {
  // Id of the domain. Required if the domain is defined at the schema level,
  // in which case the name must be unique within the schema.
  optional string name = 1;

  // Min and max values for the domain.
  optional int64 min = 3;
  optional int64 max = 4;

  // If true then the domain encodes categorical values (i.e., ids) rather than
  // ordinal values.
  optional bool is_categorical = 5;
}
// Domain of float values.
// Note that the FeatureType could be either FLOAT or BYTES.
message FloatDomain {
  // Id of the domain. Required if the domain is defined at the schema level,
  // in which case the name must be unique within the schema.
  optional string name = 1;

  // Min and max values of the domain.
  optional float min = 3;
  optional float max = 4;
}
// Domain for a recursive struct.
// NOTE: If a feature with a StructDomain is deprecated, then all of its child
// features (the features and sparse_features of the StructDomain) are also
// considered to be deprecated. Similarly, child features can only be in
// environments of the parent feature.
message StructDomain {
  // Child features of the struct.
  repeated Feature feature = 1;

  // Child sparse features of the struct.
  repeated SparseFeature sparse_feature = 2;
}
// Domain of string values.
message StringDomain {
  // Id of the domain. Required if the domain is defined at the schema level,
  // in which case the name must be unique within the schema.
  optional string name = 1;

  // The values appearing in the domain.
  repeated string value = 2;
}
// Domain of a boolean attribute whose TRUE/FALSE values are encoded as
// strings, or as 0=false, 1=true.
// Note that the FeatureType could be either INT or BYTES.
message BoolDomain {
  // Id of the domain. Required if the domain is defined at the schema level,
  // in which case the name must be unique within the schema.
  optional string name = 1;

  // String values for TRUE/FALSE.
  optional string true_value = 2;
  optional string false_value = 3;
}
// BEGIN SEMANTIC-TYPES-PROTOS
// Semantic domains are specialized feature domains. For example a string
// Feature might represent a Time of a specific format.
// Semantic domains are defined as protocol buffers to allow further sub-types /
// specialization, e.g: NaturalLanguageDomain can provide information on the
// language of the text.
// Semantic domain for natural language text. Declares no fields.
message NaturalLanguageDomain {
}
// Semantic domain for image data. Declares no fields.
message ImageDomain {
}
// Semantic domain for a knowledge graph ID, see:
// https://www.wikidata.org/wiki/Property:P646
// Declares no fields.
message MIDDomain {
}
// Semantic domain for a URL, see: https://en.wikipedia.org/wiki/URL
// Declares no fields.
message URLDomain {
}
// Semantic domain for a time or date representation.
message TimeDomain {
  // Encodings of a time as an integer.
  enum IntegerTimeFormat {
    FORMAT_UNKNOWN = 0;
    // Number of days since 1970-01-01.
    UNIX_DAYS = 5;
    UNIX_SECONDS = 1;
    UNIX_MILLISECONDS = 2;
    UNIX_MICROSECONDS = 3;
    UNIX_NANOSECONDS = 4;
  }

  oneof format {
    // Expected format that contains a combination of regular characters and
    // special format specifiers. Format specifiers are a subset of the
    // strptime standard.
    string string_format = 1;

    // Expected format of integer times.
    IntegerTimeFormat integer_format = 2;
  }
}
// Semantic domain for a time of day, without a particular date.
message TimeOfDayDomain {
  // Encodings of a time of day as an integer.
  enum IntegerTimeOfDayFormat {
    FORMAT_UNKNOWN = 0;
    // Time values, containing hour/minute/second/nanos, encoded into 8-byte
    // bit fields following the ZetaSQL convention:
    //        6         5         4         3         2         1
    // MSB 3210987654321098765432109876543210987654321098765432109876543210 LSB
    //                      | H ||  M ||  S ||---------- nanos -----------|
    PACKED_64_NANOS = 1;
  }

  oneof format {
    // Expected format that contains a combination of regular characters and
    // special format specifiers. Format specifiers are a subset of the
    // strptime standard.
    string string_format = 1;

    // Expected format of integer times.
    IntegerTimeOfDayFormat integer_format = 2;
  }
}
// END SEMANTIC-TYPES-PROTOS
// Physical representation of a feature. This may differ from the logical
// representation, which is expressed as a Domain.
enum FeatureType {
  TYPE_UNKNOWN = 0;
  BYTES = 1;
  INT = 2;
  FLOAT = 3;
  STRUCT = 4;
}
// Constraints on how often the feature must be present in the data.
message FeaturePresence {
  // Minimum fraction of examples that have this feature.
  optional double min_fraction = 1;

  // Minimum number of examples that have this feature.
  optional int64 min_count = 2;
}
// Constraints on the presence of a feature inside a "group" context (e.g.,
// .presence inside a group of features that define a sequence).
// TFDV does not support this feature yet.
message FeaturePresenceWithinGroup {
  // Whether the feature is required within the group.
  optional bool required = 1;
}
// Checks that the L-infinity norm between two discrete distributions is below
// a certain threshold. Since this is applied to a FeatureNameStatistics, it
// only considers the top k.
//   L_infty(p,q) = max_i |p_i-q_i|
message InfinityNorm {
  // The InfinityNorm is in the interval [0.0, 1.0], so sensible bounds should
  // be in the interval [0.0, 1.0).
  optional double threshold = 1;
}
// Comparator applied to a feature; holds the checks to run when two
// distributions of the feature are compared.
message FeatureComparator {
  // L-infinity norm check between the two distributions.
  optional InfinityNorm infinity_norm = 1;
}