/
top_level_feature.py
121 lines (103 loc) · 4.2 KB
/
top_level_feature.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
# coding=utf-8
# Copyright 2021 The TensorFlow Datasets Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Wrapper around FeatureDict to allow better control over decoding.
"""
from tensorflow_datasets.core.features import feature as feature_lib
class TopLevelFeature(feature_lib.FeatureConnector):
  """Top-level `FeatureConnector` to manage decoding.

  Note that `FeatureConnector` which are declared as `TopLevelFeature` can be
  nested. However, only the top-level feature should be decoded.

  `TopLevelFeature` allows better control over the decoding, and
  eventually better support for augmentations.
  """

  def decode_example(self, serialized_example, decoders=None):
    # pylint: disable=line-too-long
    """Decode the serialized example.

    Args:
      serialized_example: Nested `dict` of `tf.Tensor`
      decoders: Nested dict of `Decoder` objects which allow to customize the
        decoding. The structure should match the feature structure, but only
        customized feature keys need to be present. See
        [the guide](https://github.com/tensorflow/datasets/tree/master/docs/decode.md)
        for more info.

    Returns:
      example: Nested `dict` containing the decoded nested examples.
    """
    # Step 1: Flatten the nested dict => []
    # The four flat lists are consumed in lockstep below, so index `i` of each
    # one is treated as describing the same leaf feature.
    flat_example = self._flatten(serialized_example)
    flat_features = self._flatten(self)
    flat_serialized_info = self._flatten(self.get_serialized_info())
    flat_decoders = self._flatten(decoders)

    # Step 2: Apply the decoding, one leaf at a time. `decoder` is forwarded
    # as-is; `_decode_feature` falls back to `feature` when it is `None`.
    flatten_decoded = [
        _decode_feature(  # pylint: disable=g-complex-comprehension
            feature=feature,
            example=example,
            serialized_info=serialized_info,
            decoder=decoder,
        )
        for feature, example, serialized_info, decoder in zip(
            flat_features,
            flat_example,
            flat_serialized_info,
            flat_decoders,
        )
    ]

    # Step 3: Restore nesting [] => {}
    return self._nest(flatten_decoded)
def _decode_feature(feature, example, serialized_info, decoder):
  """Decode a single feature.

  Args:
    feature: Feature connector for this leaf. It decodes the example itself
      when no `decoder` is given; otherwise it is handed to `decoder.setup`.
    example: Serialized value to decode.
    serialized_info: Serialized info of the feature (possibly a `dict` of
      them). Only used to determine the sequence rank.
    decoder: Optional custom decoder. When `None`, `feature` is used instead.

  Returns:
    The result of `decode_example` (sequence rank 0), `decode_batch_example`
    (rank 1) or `decode_ragged_example` (rank > 1).

  Raises:
    ValueError: If the sequence rank matches none of the cases above (e.g. it
      is negative), instead of silently returning `None`.
  """
  # TODO(tfds): Support decoders for tfds.features.Dataset
  # Eventually overwrite the default decoding
  if decoder is not None:
    decoder.setup(feature=feature)
  else:
    decoder = feature

  sequence_rank = _get_sequence_rank(serialized_info)
  if sequence_rank == 0:
    return decoder.decode_example(example)
  if sequence_rank == 1:
    # Return a batch of examples from a sequence
    return decoder.decode_batch_example(example)
  if sequence_rank > 1:
    # Use ragged tensor if the sequence rank is greater than one
    return decoder.decode_ragged_example(example)
  # A rank counts sequence dimensions, so anything reaching this point is
  # invalid. Fail loudly rather than emit `None` as the decoded feature.
  raise ValueError(
      'Unexpected sequence_rank for decoding: {}'.format(sequence_rank)
  )
def _get_sequence_rank(serialized_info):
  """Return the number of sequence dimensions of the feature.

  Args:
    serialized_info: Either a single object exposing the `dataset_lvl` and
      `sequence_rank` attributes, or a (possibly nested) `dict` whose values
      are such objects.

  Returns:
    `0` for a single info whose `dataset_lvl` is positive; otherwise the
    `sequence_rank` shared by all the infos.

  Raises:
    NotImplementedError: If the values of a `dict` do not all resolve to one
      and the same sequence rank.
  """
  if isinstance(serialized_info, dict):
    # If the element is a dictionary, it might correspond to a nested dataset
    # whose serialized_info is not flattened (so it might be a nested dict).
    sequence_ranks = {
        _get_sequence_rank(info) for info in serialized_info.values()
    }
  elif serialized_info.dataset_lvl > 0:
    # If this is a nested dataset, we ignore the sequence_rank. We will decode
    # the full dataset example with the Dataset decoder.
    return 0
  else:
    sequence_ranks = {serialized_info.sequence_rank}

  if len(sequence_ranks) != 1:
    raise NotImplementedError(
        'Decoding do not support mixing sequence and context features within a '
        'single FeatureConnector. Received inputs of different sequence_rank: '
        '{}'.format(sequence_ranks)
    )
  return next(iter(sequence_ranks))