// Copyright 2017 The TensorFlow Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// =============================================================================
syntax = "proto2";
package tensorflow.metadata.v0;
import "google/protobuf/any.proto";
import "tensorflow_metadata/proto/v0/derived_feature.proto";
import "tensorflow_metadata/proto/v0/path.proto";
// GOOGLE-LEGACY option jspb_use_correct_proto2_semantics = false;
option cc_enable_arenas = true;
option java_package = "org.tensorflow.metadata.v0";
option java_multiple_files = true;
// Lifecycle stage of a feature. Only features whose stage is UNKNOWN_STAGE,
// BETA, PRODUCTION, or VALIDATION_DERIVED are actually validated.
// PLANNED, ALPHA, DISABLED, and DEBUG_ONLY are treated as DEPRECATED.
enum LifecycleStage {
  // Number 8 is reserved and must not be assigned to a new value.
  reserved 8;

  // The stage is unknown.
  UNKNOWN_STAGE = 0;
  // The feature is planned and may not have been created yet.
  PLANNED = 1;
  // Prototype feature that is not yet used in experiments.
  ALPHA = 2;
  // The feature is used in user-facing experiments.
  BETA = 3;
  // The feature is used in a significant fraction of user traffic.
  PRODUCTION = 4;
  // The feature is no longer supported: do not use it in new models.
  DEPRECATED = 5;
  // The feature only exists for debugging purposes.
  DEBUG_ONLY = 6;
  // Generic indication that the feature is disabled / excluded from models,
  // regardless of the specific reason.
  DISABLED = 7;
  // The feature was derived from ordinary features for the purposes of
  // statistics generation or validation. Consumers should expect that this
  // feature may be present in DatasetFeatureStatistics, but not in input
  // data.
  // Experimental and subject to change.
  VALIDATION_DERIVED = 9;
}
// Represents schema information for a dataset: its features, the domains
// those features can reuse, the default environments, and dataset-level
// constraints.
// NextID: 14
message Schema {
  // Features described in this schema.
  repeated Feature feature = 1;

  // Sparse features described in this schema.
  repeated SparseFeature sparse_feature = 6;

  // Weighted features described in this schema.
  repeated WeightedFeature weighted_feature = 12;

  // Legacy field; use StructDomain instead.
  // Sequences described in this schema. A sequence may be described in terms
  // of several features. Any features appearing within a sequence must *not*
  // be declared as top-level features in <feature>.
  // GOOGLE-LEGACY repeated Sequence sequence = 2;

  // String domains referenced in the features.
  repeated StringDomain string_domain = 4;

  // Top-level float domains that can be reused by features.
  repeated FloatDomain float_domain = 9;

  // Top-level int domains that can be reused by features.
  repeated IntDomain int_domain = 10;

  // Default environments for each feature.
  // An environment represents both a type of location (e.g. a server or
  // phone) and a time (e.g. right before model X is run). In the standard
  // scenario, 99% of the features should be in the default environments
  // TRAINING and SERVING, while the LABEL (or labels) and WEIGHT are only
  // available at TRAINING (not at serving).
  // Other possible variations:
  // 1. There may be TRAINING_MOBILE, SERVING_MOBILE, TRAINING_SERVICE,
  //    and SERVING_SERVICE.
  // 2. If one is ensembling three models, where the predictions of the first
  //    three models are available for the ensemble model, there may be
  //    TRAINING, SERVING_INITIAL, SERVING_ENSEMBLE.
  // See Feature::not_in_environment and Feature::in_environment.
  repeated string default_environment = 5;

  /* BEGIN GOOGLE-LEGACY
  // TODO(b/73109633): Change default to false, before removing this field.
  optional bool generate_legacy_feature_spec = 7 [default = true];
  END GOOGLE-LEGACY */

  // Additional information about the schema as a whole. Features may also
  // be annotated individually.
  optional Annotation annotation = 8;

  // Dataset-level constraints. This is currently used for specifying
  // information about changes in num_examples.
  optional DatasetConstraints dataset_constraints = 11;

  // TensorRepresentation groups, keyed by group name.
  // The key "" (empty string) denotes the "default" group, which is what
  // should be used when a group name is not provided.
  // See the documentation at TensorRepresentationGroup for more info.
  // Under development.
  map<string, TensorRepresentationGroup> tensor_representation_group = 13;
}
// A list of ValueCount messages. Feature.value_counts uses it to provide one
// ValueCount for each nest level of a feature with nested values.
message ValueCountList {
  // The ValueCount entries in this list.
  repeated ValueCount value_count = 1;
}
// Schema-level description of a single feature.
// NextID: 35
message Feature {
  // Field number 33 is reserved and must not be reused.
  reserved 33;

  // The name of the feature.
  optional string name = 1;  // required

  // This field is no longer supported. Instead, use:
  //   lifecycle_stage: DEPRECATED
  // TODO(b/111450258): remove this.
  optional bool deprecated = 2 [deprecated = true];

  // Comment field for a human readable description of the field.
  // TODO(b/123518108): remove this.
  // GOOGLE-LEGACY optional string comment = 3 [deprecated = true];

  // How often the feature is expected to be present; at most one of the two
  // forms below can be set.
  oneof presence_constraints {
    // Constraints on the presence of this feature in the examples.
    FeaturePresence presence = 14;
    // Only used inside a "group" context, e.g., inside a sequence.
    FeaturePresenceWithinGroup group_presence = 17;
  }

  // The shape of the feature, which governs the number of values that appear
  // in each example.
  oneof shape_type {
    // The feature has a fixed shape corresponding to a multi-dimensional
    // tensor.
    FixedShape shape = 23;
    // The feature doesn't have a well defined shape. All we know are limits
    // on the minimum and maximum number of values.
    ValueCount value_count = 5;
    // Captures the same information as value_count but for features with
    // nested values. A ValueCount is provided for each nest level.
    ValueCountList value_counts = 32;
  }

  // Physical type of the feature's values.
  // Note that you can have:
  //   type: BYTES
  //   int_domain: {
  //     min: 0
  //     max: 3
  //   }
  // This would be a field that is syntactically BYTES (i.e. strings), but
  // semantically an int, i.e. it would be "0", "1", "2", or "3".
  optional FeatureType type = 6;

  // Domain for the values of the feature.
  oneof domain_info {
    // Reference to a domain defined at the schema level.
    string domain = 7;

    // Inline definitions of domains.
    IntDomain int_domain = 9;
    FloatDomain float_domain = 10;
    StringDomain string_domain = 11;
    BoolDomain bool_domain = 13;
    StructDomain struct_domain = 29;

    // Supported semantic domains.
    NaturalLanguageDomain natural_language_domain = 24;
    ImageDomain image_domain = 25;
    MIDDomain mid_domain = 26;
    URLDomain url_domain = 27;
    TimeDomain time_domain = 28;
    TimeOfDayDomain time_of_day_domain = 30;
  }

  // Constraints on the distribution of the feature values.
  // Only supported for StringDomains.
  optional DistributionConstraints distribution_constraints = 15;

  // Additional information about the feature for documentation purposes.
  optional Annotation annotation = 16;

  // Tests comparing the distribution to the associated serving data.
  optional FeatureComparator skew_comparator = 18;

  // Tests comparing the distribution between two consecutive spans (e.g.
  // days).
  optional FeatureComparator drift_comparator = 21;

  // List of environments this feature is present in.
  // Should be disjoint from not_in_environment.
  // This feature is in environment "foo" if:
  //   ("foo" is in in_environment or default_environment) AND
  //   "foo" is not in not_in_environment.
  // See Schema::default_environment.
  repeated string in_environment = 20;

  // List of environments this feature is not present in.
  // Should be disjoint from in_environment.
  // See Schema::default_environment and in_environment.
  repeated string not_in_environment = 19;

  // The lifecycle stage of a feature. It can also apply to its descendants,
  // i.e., if a struct is DEPRECATED, its children are implicitly deprecated.
  optional LifecycleStage lifecycle_stage = 22;

  // Constraints on the number of unique values for a given feature.
  // This is supported for string and categorical features only.
  optional UniqueConstraints unique_constraints = 31;

  // If set, indicates that this feature is derived, and stores metadata
  // about its source. If this field is set, this feature should have
  // lifecycle_stage VALIDATION_DERIVED or DISABLED.
  // Experimental and subject to change.
  optional DerivedFeatureSource validation_derived_source = 34;
}
// Extra, free-form information attached to the schema or to a feature.
message Annotation {
  // Tags can be used to mark features. For example, the tag on a user_age
  // feature can be `user_feature`, and the tags on a user_country feature can
  // be `location_feature`, `user_feature`.
  repeated string tag = 1;

  // Free-text comments. This can be used as a description of the feature,
  // developer notes etc.
  repeated string comment = 2;

  // Application-specific metadata may be attached here.
  repeated .google.protobuf.Any extra_metadata = 3;
}
// Checks that the ratio of the current value to the previous value is not
// below min_fraction_threshold or above max_fraction_threshold. That is,
//   previous value * min_fraction_threshold <= current value <=
//   previous value * max_fraction_threshold.
// To specify that the value cannot change, set both min_fraction_threshold
// and max_fraction_threshold to 1.0.
message NumericValueComparator {
  // Lower bound on the ratio (current value / previous value).
  optional double min_fraction_threshold = 1;
  // Upper bound on the ratio (current value / previous value).
  optional double max_fraction_threshold = 2;
}
// Constraints that apply to the dataset as a whole.
message DatasetConstraints {
  // Tests differences in the number of examples between the current data and
  // the previous span.
  optional NumericValueComparator num_examples_drift_comparator = 1;

  // Tests comparisons in the number of examples between the current data and
  // the previous version of that data.
  optional NumericValueComparator num_examples_version_comparator = 2;

  // Minimum number of examples in the dataset.
  optional int64 min_examples_count = 3;

  // Maximum number of examples in the dataset.
  optional int64 max_examples_count = 4;
}
// Specifies a fixed shape for the feature's values. The immediate implication
// is that each feature has a fixed number of values. Moreover, these values
// can be parsed in a multi-dimensional tensor using the specified axis sizes.
// The FixedShape defines a lexicographical ordering of the data. For
// instance, if there is a FixedShape {
//   dim {size:3} dim {size:2}
// }
// then tensor[0][0]=field[0]
// then tensor[0][1]=field[1]
// then tensor[1][0]=field[2]
// then tensor[1][1]=field[3]
// then tensor[2][0]=field[4]
// then tensor[2][1]=field[5]
//
// The FixedShape message is identical with the TensorFlow TensorShape proto
// message.
message FixedShape {
  // An axis in a multi-dimensional feature representation.
  message Dim {
    // Size of this axis.
    optional int64 size = 1;

    // Optional name of the tensor dimension.
    optional string name = 2;
  }

  // The dimensions that define the shape. The total number of values in each
  // example is the product of sizes of each dimension.
  repeated Dim dim = 2;
}
// Limits on the minimum and maximum number of values in a single example
// (when the feature is present). Use this when the minimum value count can
// be different than the maximum value count. Otherwise prefer FixedShape.
message ValueCount {
  // Minimum number of values.
  optional int64 min = 1;
  // Maximum number of values.
  optional int64 max = 2;
}
/* BEGIN GOOGLE-LEGACY
// Constraint on the number of elements in a sequence.
message LengthConstraint {
optional int64 min = 1;
optional int64 max = 2;
}
// A sequence is a logical feature that comprises several "raw" features that
// encode values at different "steps" within the sequence.
// TODO(b/110490010): Delete this. This is a special case of StructDomain.
message Sequence {
// An optional name for this sequence. Used mostly for debugging and
// presentation.
optional string name = 1;
// Features that comprise the sequence. These features are "zipped" together
// to form the values for the sequence at different steps.
// - Use group_presence within each feature to encode presence constraints
// within the sequence.
// - If all features have the same value-count constraints then
// declare this once using the shape_constraint below.
repeated Feature feature = 2;
// Constraints on the presence of the sequence across all examples in the
// dataset. The sequence is assumed to be present if at least one of its
// features is present.
optional FeaturePresence presence = 3;
// Shape constraints that apply on all the features that comprise the
// sequence. If this is set then the value_count in 'feature' is
// ignored.
// TODO(martinz): delete: there is no reason to believe the shape of the
// fields in a sequence will be the same. Use the fields in Feature instead.
oneof shape_constraint {
ValueCount value_count = 4;
FixedShape fixed_shape = 5;
}
// Constraint on the number of elements in a sequence.
optional LengthConstraint length_constraint = 6;
}
END GOOGLE-LEGACY */
// Represents a weighted feature that is encoded as a combination of raw base
// features. The `weight_feature` should be a float feature with identical
// shape as the `feature`. This is useful for representing weights associated
// with categorical tokens (e.g. a TFIDF weight associated with each token).
// TODO(b/142122960): Handle WeightedCategorical end to end in TFX (validation,
// TFX Unit Testing, etc)
message WeightedFeature {
  // Name for the weighted feature. This should not clash with other features
  // in the same schema.
  optional string name = 1;  // required

  // Path of a base feature to be weighted. Required.
  optional Path feature = 2;

  // Path of the weight feature to associate with the base feature. Must be
  // the same shape as feature. Required.
  optional Path weight_feature = 3;

  // The lifecycle_stage determines where a feature is expected to be used,
  // and therefore how important issues with it are.
  optional LifecycleStage lifecycle_stage = 4;
}
// A sparse feature represents a sparse tensor that is encoded with a
// combination of raw features, namely index features and a value feature.
// Each index feature defines a list of indices in a different dimension.
message SparseFeature {
  // Field number 11 is reserved and must not be reused.
  reserved 11;

  // Name for the sparse feature. This should not clash with other features in
  // the same schema.
  optional string name = 1;  // required

  // This field is no longer supported. Instead, use:
  //   lifecycle_stage: DEPRECATED
  // TODO(b/111450258): remove this.
  optional bool deprecated = 2 [deprecated = true];

  // The lifecycle_stage determines where a feature is expected to be used,
  // and therefore how important issues with it are.
  optional LifecycleStage lifecycle_stage = 7;

  // Comment field for a human readable description of the field.
  // TODO(martinz): delete, convert to annotation.
  // GOOGLE-LEGACY optional string comment = 3 [deprecated = true];

  // Constraints on the presence of this feature in examples.
  // Deprecated, this is inferred by the referred features.
  optional FeaturePresence presence = 4 [deprecated = true];

  // Shape of the sparse tensor that this SparseFeature represents.
  // Currently not supported.
  // TODO(b/109669962): Consider deriving this from the referred features.
  optional FixedShape dense_shape = 5;

  // Features that represent indexes. Should be integers >= 0.
  repeated IndexFeature index_feature = 6;  // at least one

  // Reference to a feature that provides indices.
  message IndexFeature {
    // Name of the index-feature. This should be a reference to an existing
    // feature in the schema.
    optional string name = 1;
  }

  // If true then the index values are already sorted lexicographically.
  optional bool is_sorted = 8;

  // The feature that provides the values.
  optional ValueFeature value_feature = 9;  // required

  // Reference to a feature that provides values.
  message ValueFeature {
    // Name of the value-feature. This should be a reference to an existing
    // feature in the schema.
    optional string name = 1;
  }

  // Type of the value feature.
  // Deprecated, this is inferred by the referred features.
  optional FeatureType type = 10 [deprecated = true];
}
// Models constraints on the distribution of a feature's values.
// TODO(martinz): replace min_domain_mass with max_off_domain (but slowly).
message DistributionConstraints {
  // The minimum fraction (in [0,1]) of values across all examples that
  // should come from the feature's domain, e.g.:
  //   1.0 => All values must come from the domain.
  //    .9 => At least 90% of the values must come from the domain.
  // Defaults to 1.0 when unset.
  optional double min_domain_mass = 1 [default = 1.0];
}
// Encodes vocabulary coverage constraints.
message FeatureCoverageConstraints {
  // Fraction of feature values that map to a vocab entry (i.e. are not oov).
  optional float min_coverage = 1;

  // Average length of tokens. Used for cases such as wordpiece that fall back
  // to character-level tokenization.
  optional float min_avg_token_length = 2;

  // String tokens to exclude when calculating min_coverage and
  // min_avg_token_length. Useful for tokens such as [PAD].
  repeated string excluded_string_tokens = 3;

  // Integer tokens to exclude when calculating min_coverage and
  // min_avg_token_length.
  repeated int64 excluded_int_tokens = 4 [packed = true];

  // String tokens to treat as oov tokens (e.g. [UNK]). These tokens are also
  // excluded when calculating avg token length.
  repeated string oov_string_tokens = 5;
}
// Encodes constraints on specific values in sequences.
message SequenceValueConstraints {
  // The value for which to express constraints. Can be either an integer or
  // a string.
  oneof value {
    int64 int_value = 1;
    string string_value = 2;
  }

  // Minimum number of times the value can occur in a sequence.
  optional int64 min_per_sequence = 3;
  // Maximum number of times the value can occur in a sequence.
  optional int64 max_per_sequence = 4;

  // Minimum fraction of sequences that must contain the value.
  optional float min_fraction_of_sequences = 5;
  // Maximum fraction of sequences that must contain the value.
  optional float max_fraction_of_sequences = 6;
}
// Encodes constraints on sequence lengths.
message SequenceLengthConstraints {
  // Integer token values that are excluded when calculating sequence length.
  repeated int64 excluded_int_value = 1;
  // String token values that are excluded when calculating sequence length.
  repeated string excluded_string_value = 2;

  // Minimum sequence length.
  optional int64 min_sequence_length = 3;
  // Maximum sequence length.
  optional int64 max_sequence_length = 4;
}
// Encodes information for domains of integer values.
// Note that FeatureType could be either INT or BYTES.
message IntDomain {
  // Id of the domain. Required if the domain is defined at the schema level.
  // If so, then the name must be unique within the schema.
  optional string name = 1;

  // Minimum value for the domain.
  optional int64 min = 3;
  // Maximum value for the domain.
  optional int64 max = 4;

  // If true then the domain encodes categorical values (i.e., ids) rather
  // than ordinal values.
  optional bool is_categorical = 5;
}
// Encodes information for domains of float values.
// NOTE(review): this comment previously stated that FeatureType could be
// either INT or BYTES, which reads like a copy of the IntDomain note;
// presumably FLOAT or BYTES is intended -- confirm.
message FloatDomain {
  // Id of the domain. Required if the domain is defined at the schema level.
  // If so, then the name must be unique within the schema.
  optional string name = 1;

  // Minimum value of the domain.
  optional float min = 3;
  // Maximum value of the domain.
  optional float max = 4;

  // If true, the feature should not contain NaNs.
  optional bool disallow_nan = 5;

  // If true, the feature should not contain Inf or -Inf.
  optional bool disallow_inf = 6;

  // If true, this indicates that the feature is semantically an embedding.
  // This can be useful for distinguishing fixed dimensional numeric features
  // that should be fed to a model unmodified.
  optional bool is_embedding = 7;

  // If true then the domain encodes categorical values (i.e., ids) rather
  // than continuous values.
  optional bool is_categorical = 8;
}
// Domain for a recursive struct.
// NOTE: If a feature with a StructDomain is deprecated, then all the
// child features (features and sparse_features of the StructDomain) are also
// considered to be deprecated. Similarly, child features can only be in
// environments of the parent feature.
message StructDomain {
  // Child features of the struct.
  repeated Feature feature = 1;

  // Child sparse features of the struct.
  repeated SparseFeature sparse_feature = 2;
}
// Encodes information for domains of string values.
message StringDomain {
  // Id of the domain. Required if the domain is defined at the schema level.
  // If so, then the name must be unique within the schema.
  optional string name = 1;

  // The values appearing in the domain.
  repeated string value = 2;
}
// Encodes information about the domain of a boolean attribute that encodes
// its TRUE/FALSE values as strings, or 0=false, 1=true.
// Note that FeatureType could be either INT or BYTES.
message BoolDomain {
  // Id of the domain. Required if the domain is defined at the schema level.
  // If so, then the name must be unique within the schema.
  optional string name = 1;

  // String value for TRUE.
  optional string true_value = 2;
  // String value for FALSE.
  optional string false_value = 3;
}
// BEGIN SEMANTIC-TYPES-PROTOS
// Semantic domains are specialized feature domains. For example a string
// Feature might represent a Time of a specific format.
// Semantic domains are defined as protocol buffers to allow further sub-types /
// specialization, e.g: NaturalLanguageDomain can provide information on the
// language of the text.
// Natural language text.
message NaturalLanguageDomain {
  // Name of the vocabulary associated with the NaturalLanguageDomain.
  // When computing and validating stats using TFDV,
  // tfdv.StatsOptions.vocab_paths should map this name to a vocabulary file.
  optional string vocabulary = 1;

  // Vocabulary coverage constraints.
  optional FeatureCoverageConstraints coverage = 2;

  // Constraints on specific token values in the sequences.
  repeated SequenceValueConstraints token_constraints = 3;

  // Constraints on sequence lengths.
  optional SequenceLengthConstraints sequence_length_constraints = 5;

  // Specifies the location constraints as a function of the tokens specified
  // in token_constraints.
  //
  // String tokens will be specified by S_TOKEN_, (e.g. S_(PAD)_) and integer
  // tokens will be specified as I_#_ (e.g. I_123_).
  // A_T_ will match any token that has not been specified in
  // token_constraints.
  // Parentheses, +, and * are supported.
  // _ will be escapable with a \ for tokens containing it (e.g. FOO\_BAR).
  //
  // For example, a two-sequence BERT model may look as follows:
  //   S_(CLS)_ A_T_+ S_(SEP)_ A_T_+ S_(SEP)_ S_(PAD)_*
  //
  // Note: Support for this field is not yet implemented. Please do not use.
  // TODO(b/188095987): Remove warning once field is implemented.
  optional string location_constraint_regex = 4;
}
// Image data.
message ImageDomain {
  // If set, at least this fraction of values should be TensorFlow supported
  // images.
  optional float minimum_supported_image_fraction = 1;

  // If set, an image should have less than this value of undecoded byte size.
  optional int64 max_image_byte_size = 2;
}
// Semantic domain for a knowledge graph ID, see:
// https://www.wikidata.org/wiki/Property:P646
// The message carries no fields.
message MIDDomain {}
// Semantic domain for a URL, see: https://en.wikipedia.org/wiki/URL
// The message carries no fields.
message URLDomain {}
// Time or date representation.
message TimeDomain {
  // Supported encodings for times stored as integers.
  enum IntegerTimeFormat {
    FORMAT_UNKNOWN = 0;
    UNIX_DAYS = 5;  // Number of days since 1970-01-01.
    UNIX_SECONDS = 1;
    UNIX_MILLISECONDS = 2;
    UNIX_MICROSECONDS = 3;
    UNIX_NANOSECONDS = 4;
  }

  // Expected format of the time values: either a string format or an integer
  // format.
  oneof format {
    // Expected format that contains a combination of regular characters and
    // special format specifiers. Format specifiers are a subset of the
    // strptime standard.
    string string_format = 1;

    // Expected format of integer times.
    IntegerTimeFormat integer_format = 2;
  }
}
// Time of day, without a particular date.
message TimeOfDayDomain {
  // Supported encodings for times of day stored as integers.
  enum IntegerTimeOfDayFormat {
    FORMAT_UNKNOWN = 0;
    // Time values, containing hour/minute/second/nanos, encoded into 8-byte
    // bit fields following the ZetaSQL convention:
    //        6         5         4         3         2         1
    // MSB 3210987654321098765432109876543210987654321098765432109876543210 LSB
    //                      | H ||  M ||  S ||---------- nanos -----------|
    PACKED_64_NANOS = 1;
  }

  // Expected format of the time-of-day values: either a string format or an
  // integer format.
  oneof format {
    // Expected format that contains a combination of regular characters and
    // special format specifiers. Format specifiers are a subset of the
    // strptime standard.
    string string_format = 1;

    // Expected format of integer times.
    IntegerTimeOfDayFormat integer_format = 2;
  }
}
// END SEMANTIC-TYPES-PROTOS
// Describes the physical representation of a feature.
// It may be different than the logical representation, which is represented
// as a Domain.
enum FeatureType {
  TYPE_UNKNOWN = 0;
  BYTES = 1;
  INT = 2;
  FLOAT = 3;
  STRUCT = 4;
}
// Describes constraints on the presence of the feature in the data.
message FeaturePresence {
  // Minimum fraction of examples that have this feature.
  optional double min_fraction = 1;

  // Minimum number of examples that have this feature.
  optional int64 min_count = 2;
}
// Records constraints on the presence of a feature inside a "group" context
// (e.g., .presence inside a group of features that define a sequence).
message FeaturePresenceWithinGroup {
  // Whether the feature is required within the group.
  optional bool required = 1;
}
// Checks that the L-infinity norm between the two discrete distributions is
// below a certain threshold. Since this is applied to a
// FeatureNameStatistics, it only considers the top k.
//   L_infty(p,q) = max_i |p_i-q_i|
message InfinityNorm {
  // The InfinityNorm is in the interval [0.0, 1.0] so sensible bounds should
  // be in the interval [0.0, 1.0).
  optional double threshold = 1;
}
// Checks that the approximate Jensen-Shannon Divergence between the two
// distributions is below a certain threshold.
message JensenShannonDivergence {
  // The JensenShannonDivergence will be in the interval [0.0, 1.0] so
  // sensible bounds should be in the interval [0.0, 1.0).
  optional double threshold = 1;
}
// Distribution comparison checks for a feature. Used by
// Feature.skew_comparator and Feature.drift_comparator.
message FeatureComparator {
  // Threshold check on the L-infinity norm; see InfinityNorm.
  optional InfinityNorm infinity_norm = 1;

  // Threshold check on the approximate Jensen-Shannon Divergence; see
  // JensenShannonDivergence.
  optional JensenShannonDivergence jensen_shannon_divergence = 2;
}
// Checks that the number of unique values is greater than or equal to the
// min, and less than or equal to the max.
message UniqueConstraints {
  // Minimum number of unique values (inclusive).
  optional int64 min = 1;
  // Maximum number of unique values (inclusive).
  optional int64 max = 2;
}
// A TensorRepresentation captures the intent for converting columns in a
// dataset to TensorFlow Tensors (or more generally, tf.CompositeTensors).
// Note that one tf.CompositeTensor may consist of data from multiple columns,
// for example, a N-dimensional tf.SparseTensor may need N + 1 columns to
// provide the sparse indices and values.
// Note that the "column name" that a TensorRepresentation needs is a
// string, not a Path -- it means that the column name identifies a top-level
// Feature in the schema (i.e. you cannot specify a Feature nested in a STRUCT
// Feature).
message TensorRepresentation {
  // Value used to fill in for missing data; exactly one kind can be set.
  message DefaultValue {
    oneof kind {
      double float_value = 1;
      // Note that the data column might be of a shorter integral type. It's
      // the user's responsibility to make sure the default value fits that
      // type.
      int64 int_value = 2;
      bytes bytes_value = 3;
      // uint_value should only be used if the default value can't fit in a
      // int64 (`int_value`).
      uint64 uint_value = 4;
    }
  }

  // A tf.Tensor
  message DenseTensor {
    // Identifies the column in the dataset that provides the values of this
    // Tensor.
    optional string column_name = 1;

    // The shape of each row of the data (i.e. does not include the batch
    // dimension).
    optional FixedShape shape = 2;

    // If this column is missing values in a row, the default_value will be
    // used to fill that row.
    optional DefaultValue default_value = 3;
  }

  // A ragged tf.SparseTensor that models nested lists.
  message VarLenSparseTensor {
    // Identifies the column in the dataset that should be converted to the
    // VarLenSparseTensor.
    optional string column_name = 1;
  }

  // A tf.SparseTensor whose indices and values come from separate data
  // columns.
  // This will replace Schema.sparse_feature eventually.
  // The index columns must be of INT type, and all the columns must co-occur
  // and have the same valency at the same row.
  message SparseTensor {
    // The dense shape of the resulting SparseTensor (does not include the
    // batch dimension).
    optional FixedShape dense_shape = 1;

    // The columns that constitute the coordinates of the values.
    // The j-th entry of the i-th index column contains the coordinate of the
    // i-th dimension of the j-th value.
    repeated string index_column_names = 2;

    // The column that contains the values.
    optional string value_column_name = 3;
  }

  // A tf.RaggedTensor that models nested lists.
  // Currently there is no way for the user to specify the shape of the leaf
  // value (the innermost value tensor of the RaggedTensor). The leaf value
  // will always be a 1-D tensor.
  message RaggedTensor {
    // Identifies the leaf feature that provides values of the RaggedTensor.
    // The first step of the path refers to a top-level feature in the data.
    // The remaining steps refer to STRUCT features under the top-level
    // feature, recursively.
    // If the feature has N outer ragged lists, they will become the first
    // N dimensions of the resulting RaggedTensor and the contents will become
    // the flat_values.
    optional Path feature_path = 1;  // required.

    // Further partition of the feature values at the leaf level.
    message Partition {
      oneof kind {
        // If the final element(s) of partition are uniform_row_lengths [U0,
        // U1, ...], then the result RaggedTensor will have their flat values
        // (a dense tensor) being of shape [U0, U1, ...]. Otherwise, a
        // uniform_row_length simply means a ragged dimension with row_lengths
        // [uniform_row_length]*nrows.
        int64 uniform_row_length = 1;

        // Identifies a leaf feature, sharing the same parent as the value
        // feature (feature_path), that contains the partition row lengths.
        string row_length = 2;
      }
    }

    // The result RaggedTensor would be of shape:
    //   [B, D_0, D_1, ..., D_N, P_0, P_1, ..., P_M, U_0, U_1, ..., U_P]
    //
    // Where the dimensions belong to different categories:
    // * B: Batch size dimension.
    // * D_n: Dimensions specified by the nested structure specified by the
    //   value path until the leaf node. n>=1.
    // * P_m: Dimensions specified by the partitions that do not define any
    //   fixed dimension size. m>=0.
    // * U_p: Dimensions specified by the latest partitions of type
    //   uniform_row_length that can define the fixed inner shape of the
    //   tensor. If iterating the partitions from the end to the beginning,
    //   these dimensions are defined by all the continuous uniform_row_length
    //   partitions present. p>=0.
    repeated Partition partition = 3;

    // The data type of the ragged tensor's row partitions. This will
    // default to INT64 if it is not specified.
    optional RowPartitionDType row_partition_dtype = 2;
  }

  // RaggedTensor consists of RowPartitions. This enum allows the user to
  // specify the dtype of those RowPartitions. If it is UNSPECIFIED, then we
  // default to INT64.
  enum RowPartitionDType {
    UNSPECIFIED = 0;
    INT64 = 1;
    INT32 = 2;
  }

  // The kind of tensor this representation produces.
  oneof kind {
    DenseTensor dense_tensor = 1;
    VarLenSparseTensor varlen_sparse_tensor = 2;
    SparseTensor sparse_tensor = 3;
    RaggedTensor ragged_tensor = 4;
  }
}
// A TensorRepresentationGroup is a collection of TensorRepresentations with
// names. These names may serve as identifiers when converting the dataset
// to a collection of Tensors or tf.CompositeTensors.
// For example, given the following group:
// {
//   key: "dense_tensor"
//   tensor_representation {
//     dense_tensor {
//       column_name: "univalent_feature"
//       shape {
//         dim {
//           size: 1
//         }
//       }
//       default_value {
//         float_value: 0
//       }
//     }
//   }
// }
// {
//   key: "varlen_sparse_tensor"
//   tensor_representation {
//     varlen_sparse_tensor {
//       column_name: "multivalent_feature"
//     }
//   }
// }
//
// Then the schema is expected to have feature "univalent_feature" and
// "multivalent_feature", and when a batch of data is converted to Tensors
// using this TensorRepresentationGroup, the result may be the following dict:
// {
//   "dense_tensor": tf.Tensor(...),
//   "varlen_sparse_tensor": tf.SparseTensor(...),
// }
message TensorRepresentationGroup {
  // The TensorRepresentations in this group, keyed by name.
  map<string, TensorRepresentation> tensor_representation = 1;
}