/
hadoop_dataset_ops.py
76 lines (63 loc) · 2.45 KB
/
hadoop_dataset_ops.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""SequenceFile Dataset."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import tensorflow as tf
from tensorflow import dtypes
from tensorflow.compat.v1 import data
from tensorflow_io.core.python.ops import _load_library
# Load the `_hadoop_ops.so` op library once at import time; the dataset class
# below calls `hadoop_ops.io_sequence_file_dataset` from it.
hadoop_ops = _load_library('_hadoop_ops.so')
class SequenceFileDataset(data.Dataset):
  """A Dataset of (key, value) string pairs read from hadoop sequence files."""

  def __init__(self, filenames):
    """Create a `SequenceFileDataset`.

    A hadoop sequence file stores (key, value) pairs one after another, and
    `SequenceFileDataset` lets a user read them. At the moment the only
    serialization type supported is `org.apache.hadoop.io.Text`, and there
    is no compression support.

    For example:

    ```python
    dataset = SequenceFileDataset("/foo/bar.seq")
    iterator = dataset.make_one_shot_iterator()
    next_element = iterator.get_next()
    # Prints the (key, value) pairs inside a hadoop sequence file.
    while True:
      try:
        print(sess.run(next_element))
      except tf.errors.OutOfRangeError:
        break
    ```

    Args:
      filenames: A `tf.string` tensor containing one or more filenames.
    """
    filenames_tensor = tf.convert_to_tensor(
        filenames, dtype=dtypes.string, name="filenames")
    # NOTE(review): the attribute is assigned before the base constructor
    # runs, as in the original ordering; presumably the base class may need
    # it during initialisation -- confirm before reordering.
    self._filenames = filenames_tensor
    super(SequenceFileDataset, self).__init__()

  def _inputs(self):
    """Return an empty list; no inputs are reported for this dataset."""
    return []

  def _as_variant_tensor(self):
    """Call the `io_sequence_file_dataset` op on the stored filenames."""
    # One string dtype for the key component and one for the value component.
    component_dtypes = (dtypes.string, dtypes.string)
    return hadoop_ops.io_sequence_file_dataset(
        self._filenames, component_dtypes)

  @property
  def output_classes(self):
    """A pair of classes, `tf.Tensor` for each of the two components."""
    return (tf.Tensor, tf.Tensor)

  @property
  def output_shapes(self):
    """A pair of `tf.TensorShape([])` shapes, one per component."""
    key_shape = tf.TensorShape([])
    value_shape = tf.TensorShape([])
    return (key_shape, value_shape)

  @property
  def output_types(self):
    """A pair of dtypes, `dtypes.string` for each of the two components."""
    return (dtypes.string, dtypes.string)