/
component.py
93 lines (78 loc) · 3.8 KB
/
component.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
# Copyright 2019 Google LLC. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""TFX CsvExampleGen component definition."""
from typing import Optional, Union
from tfx.components.example_gen import component
from tfx.components.example_gen.csv_example_gen import executor
from tfx.dsl.components.base import executor_spec
from tfx.dsl.placeholder import placeholder
from tfx.orchestration import data_types
from tfx.proto import example_gen_pb2
from tfx.proto import range_config_pb2
class CsvExampleGen(component.FileBasedExampleGen):  # pylint: disable=protected-access
  """Official TFX CsvExampleGen component.

  The csv examplegen component takes csv data, and generates train
  and eval examples for downstream components.

  The csv examplegen encodes column values to tf.Example int/float/byte feature.
  For the case when there are missing cells, the csv examplegen uses:
  -- tf.train.Feature(`type`_list=tf.train.`type`List(value=[])), when the
     `type` can be inferred.
  -- tf.train.Feature() when it cannot infer the `type` from the column.

  Note that the type inferring will be per input split. If input isn't a single
  split, users need to ensure the column types align in each pre-splits.

  For example, given the following csv rows of a split:

    header:A,B,C,D
    row1:  1,,x,0.1
    row2:  2,,y,0.2
    row3:  3,,,0.3
    row4:

  The output example will be
    example1: 1(int), empty feature(no type), x(string), 0.1(float)
    example2: 2(int), empty feature(no type), y(string), 0.2(float)
    example3: 3(int), empty feature(no type), empty list(string), 0.3(float)

    Note that the empty feature is `tf.train.Feature()` while empty list string
    feature is `tf.train.Feature(bytes_list=tf.train.BytesList(value=[]))`.
    Column B is empty in every row, so no type can be inferred for it; column C
    is only empty in row3, so its string type is still known there.

  Component `outputs` contains:
   - `examples`: Channel of type `standard_artifacts.Examples` for output train
                 and eval examples.
  """

  # Run the CSV executor through the Beam-based executor spec.
  EXECUTOR_SPEC = executor_spec.BeamExecutorSpec(executor.Executor)

  def __init__(
      self,
      input_base: Optional[str] = None,
      input_config: Optional[Union[example_gen_pb2.Input,
                                   data_types.RuntimeParameter]] = None,
      output_config: Optional[Union[example_gen_pb2.Output,
                                    data_types.RuntimeParameter]] = None,
      range_config: Optional[Union[placeholder.Placeholder,
                                   range_config_pb2.RangeConfig,
                                   data_types.RuntimeParameter]] = None):
    """Construct a CsvExampleGen component.

    All arguments are forwarded unchanged to the `FileBasedExampleGen`
    constructor; this class only pins the CSV-specific executor.

    Args:
      input_base: an external directory containing the CSV files.
      input_config: An example_gen_pb2.Input instance (or a RuntimeParameter
        standing in for one), providing input configuration. If unset, the
        files under input_base will be treated as a single split.
      output_config: An example_gen_pb2.Output instance (or a RuntimeParameter
        standing in for one), providing output configuration. If unset,
        default splits will be 'train' and 'eval' with size 2:1.
      range_config: An optional range_config_pb2.RangeConfig instance (or a
        Placeholder / RuntimeParameter standing in for one), specifying the
        range of span values to consider. If unset, driver will default to
        searching for latest span with no restrictions.
    """
    super().__init__(
        input_base=input_base,
        input_config=input_config,
        output_config=output_config,
        range_config=range_config)