-
Notifications
You must be signed in to change notification settings - Fork 154
/
tariterators.py
277 lines (236 loc) · 8.27 KB
/
tariterators.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
#
# Copyright (c) 2017-2021 NVIDIA CORPORATION. All rights reserved.
# This file is part of the WebDataset library.
# See the LICENSE file for licensing terms (BSD-style).
#
"""Low level iteration functions for tar archives."""
from typing import Any, Callable, Dict, Iterable, Iterator, Optional, Set, Tuple
import random, re, tarfile
import braceexpand
from . import filters, gopen
from .handlers import reraise_exception
trace = False
meta_prefix = "__"
meta_suffix = "__"
def base_plus_ext(path):
    """Split a path into its base and the full chain of extensions.

    Args:
        path: file path, possibly containing multiple dot-separated extensions.

    Returns:
        (base, extensions) tuple; (None, None) if the final path component
        contains no extension.
    """
    parsed = re.match(r"^((?:.*/|)[^.]+)[.]([^/]*)$", path)
    if parsed is None:
        return None, None
    return parsed.group(1), parsed.group(2)
def valid_sample(sample: Dict[str, Any]) -> bool:
    """Determine whether a sample can be emitted downstream.

    A usable sample is a non-empty dict that has not been flagged with
    a truthy "__bad__" marker.

    Args:
        sample: candidate sample to check.

    Returns:
        boolean indicating whether the sample is valid.
    """
    if sample is None or not isinstance(sample, dict):
        return False
    if not sample:
        # Empty dicts carry no data and are dropped.
        return False
    return not sample.get("__bad__", False)
# FIXME: UNUSED
def shardlist(urls, *, shuffle=False):
    """Given a list of URLs, yield dict(url=...) records, possibly shuffled.

    Args:
        urls: a brace-expandable URL spec string, or an iterable of URLs.
        shuffle: if True, yield the URLs in random order.

    Yields:
        dict(url=...) for each URL.
    """
    if isinstance(urls, str):
        # braceexpand returns an iterator; materialize it so that
        # random.shuffle (which requires a mutable sequence) works.
        urls = list(braceexpand.braceexpand(urls))
    else:
        urls = list(urls)
    if shuffle:
        random.shuffle(urls)
    for url in urls:
        yield dict(url=url)
def url_opener(
    data: Iterable[Dict[str, Any]],
    handler: Callable[[Exception], bool] = reraise_exception,
    **kw: Dict[str, Any],
):
    """Open each URL in the incoming stream and attach the opened stream.

    Args:
        data: iterator over dict(url=...) samples.
        handler: exception handler; a truthy return skips the failed
            sample, a falsy return stops iteration.
        kw: keyword arguments forwarded to gopen.gopen.

    Yields:
        the input samples, each augmented with a "stream" entry.
    """
    for record in data:
        assert isinstance(record, dict), record
        assert "url" in record
        target = record["url"]
        try:
            opened = gopen.gopen(target, **kw)
            record.update(stream=opened)
            yield record
        except Exception as exn:
            # Tag the exception with the URL that failed before
            # consulting the handler.
            exn.args = exn.args + (target,)
            if handler(exn):
                continue
            else:
                break
def tar_file_iterator(
    fileobj: Any,
    skip_meta: Optional[str] = r"__[^/]*__($|/)",
    handler: Callable[[Exception], bool] = reraise_exception,
    select_files: Optional[Callable[[str], bool]] = None,
    rename_files: Optional[Callable[[str], str]] = None,
) -> Iterator[Dict[str, Any]]:
    """Iterate over a tar stream, yielding filename, content pairs.

    Args:
        fileobj: binary file-like object containing the tar archive;
            it is passed to tarfile.open(fileobj=...) in streaming mode,
            so it need not be seekable.
        skip_meta: regexp for keys that are skipped entirely.
            Defaults to r"__[^/]*__($|/)".
        handler: exception handler; a truthy return skips the entry,
            a falsy return stops iteration. Defaults to reraise_exception.
        select_files: predicate applied to the (possibly renamed) file
            name; entries for which it returns False are skipped.
        rename_files: optional function mapping file names to new names,
            applied before select_files.

    Yields:
        dict(fname=..., data=...) for each regular file in the archive.
    """
    # "r|*" = streaming read with transparent compression detection.
    stream = tarfile.open(fileobj=fileobj, mode="r|*")
    for tarinfo in stream:
        fname = tarinfo.name
        try:
            # Only regular files carry sample data.
            if not tarinfo.isreg():
                continue
            if fname is None:
                continue
            if (
                "/" not in fname
                and fname.startswith(meta_prefix)
                and fname.endswith(meta_suffix)
            ):
                # skipping metadata for now
                continue
            if skip_meta is not None and re.match(skip_meta, fname):
                continue
            if rename_files:
                fname = rename_files(fname)
            if select_files is not None and not select_files(fname):
                continue
            data = stream.extractfile(tarinfo).read()
            result = dict(fname=fname, data=data)
            yield result
            # Drop cached member info so memory stays bounded while
            # streaming large archives.
            stream.members = []
        except Exception as exn:
            # Annotate the exception with the source stream for debugging.
            if hasattr(exn, "args") and len(exn.args) > 0:
                exn.args = (str(exn.args[0]) + " @ " + str(fileobj),) + exn.args[1:]
            if handler(exn):
                continue
            else:
                break
    del stream
def tar_file_expander(
    data: Iterable[Dict[str, Any]],
    handler: Callable[[Exception], bool] = reraise_exception,
    select_files: Optional[Callable[[str], bool]] = None,
    rename_files: Optional[Callable[[str], str]] = None,
) -> Iterator[Dict[str, Any]]:
    """Expand opened tar file streams into a stream of file samples.

    Args:
        data: iterator over dict(url=..., stream=...) sources.
        handler: exception handler; a truthy return skips the source,
            a falsy return stops iteration.
        select_files: select files from tarfiles by name (permits skipping files).
        rename_files: function renaming files inside tarfiles.

    Yields:
        dict(fname=..., data=..., __url__=...) samples.
    """
    for source in data:
        try:
            assert isinstance(source, dict)
            assert "stream" in source
            # Access "url" inside the try block so that a malformed
            # source goes through the handler instead of raising an
            # uncaught KeyError.
            url = source["url"]
            for sample in tar_file_iterator(
                source["stream"],
                handler=handler,
                select_files=select_files,
                rename_files=rename_files,
            ):
                assert (
                    isinstance(sample, dict) and "data" in sample and "fname" in sample
                )
                # Record which shard the sample came from.
                sample["__url__"] = url
                yield sample
        except Exception as exn:
            exn.args = exn.args + (source.get("stream"), source.get("url"))
            if handler(exn):
                continue
            else:
                break
def group_by_keys(
    data: Iterable[Dict[str, Any]],
    keys: Callable[[str], Tuple[str, str]] = base_plus_ext,
    lcase: bool = True,
    suffixes: Optional[Set[str]] = None,
    handler: Callable[[Exception], bool] = reraise_exception,
) -> Iterator[Dict[str, Any]]:
    """Aggregate consecutive tarfile entries into per-key samples.

    Entries whose names share the same key (as computed by `keys`) are
    collected into one dict mapping suffix -> content.

    Args:
        data: iterator over dict(fname=..., data=...) tarfile contents.
        keys: function that takes a file name and returns a key and a suffix.
        lcase: whether to lowercase the suffix.
        suffixes: if given, only these suffixes are stored in samples.
        handler: exception handler; a truthy return skips the entry,
            a falsy return stops iteration.

    Raises:
        ValueError: raised if there are duplicate file names in the tar file.

    Yields:
        iterator over grouped samples (dicts with __key__ and __url__).
    """
    batch = None
    for entry in data:
        try:
            assert isinstance(entry, dict)
            name, content = entry["fname"], entry["data"]
            key, ext = keys(name)
            if trace:
                print(
                    key,
                    ext,
                    batch.keys() if isinstance(batch, dict) else None,
                )
            if key is None:
                # Name did not parse into key/suffix; skip it.
                continue
            if lcase:
                ext = ext.lower()
            if batch is None or key != batch["__key__"]:
                # Key changed: emit the finished sample and start a new one.
                if valid_sample(batch):
                    yield batch
                batch = dict(__key__=key, __url__=entry["__url__"])
            if ext in batch:
                raise ValueError(
                    f"{name}: duplicate file name in tar file {ext} {batch.keys()}"
                )
            if suffixes is None or ext in suffixes:
                batch[ext] = content
        except Exception as exn:
            exn.args = exn.args + (entry.get("stream"), entry.get("url"))
            if handler(exn):
                continue
            else:
                break
    # Flush the trailing sample.
    if valid_sample(batch):
        yield batch
def tarfile_samples(
    src: Iterable[Dict[str, Any]],
    handler: Callable[[Exception], bool] = reraise_exception,
    select_files: Optional[Callable[[str], bool]] = None,
    rename_files: Optional[Callable[[str], str]] = None,
) -> Iterable[Dict[str, Any]]:
    """Turn a stream of tar shard specs into a stream of grouped samples.

    Chains url_opener -> tar_file_expander -> group_by_keys.

    Args:
        src: stream of dict(url=...) tar shard specs.
        handler: exception handler shared by all pipeline stages.
        select_files: function that selects files to be included.
        rename_files: function renaming files inside tarfiles.

    Returns:
        stream of samples.
    """
    opened = url_opener(src, handler=handler)
    expanded = tar_file_expander(
        opened, handler=handler, select_files=select_files, rename_files=rename_files
    )
    return group_by_keys(expanded, handler=handler)
tarfile_to_samples = filters.pipelinefilter(tarfile_samples)