From 78bb6ee6db806325143afd18c4e5ee0ea197e9f4 Mon Sep 17 00:00:00 2001 From: Ryan Sepassi Date: Fri, 1 Feb 2019 14:37:25 -0800 Subject: [PATCH] Rename dataset_as_numpy as_numpy and update docs PiperOrigin-RevId: 232039691 --- README.md | 10 +- docs/api_docs/python/_toc.yaml | 4 + docs/api_docs/python/index.md | 2 + docs/api_docs/python/tfds.md | 4 +- docs/api_docs/python/tfds/Split.md | 2 + docs/api_docs/python/tfds/_api_cache.json | 57 +++- docs/api_docs/python/tfds/as_numpy.md | 34 ++ docs/api_docs/python/tfds/core.md | 2 + .../python/tfds/core/DatasetBuilder.md | 6 +- docs/api_docs/python/tfds/core/DatasetInfo.md | 17 +- .../python/tfds/core/GeneratorBasedBuilder.md | 4 +- docs/api_docs/python/tfds/core/NamedSplit.md | 2 +- docs/api_docs/python/tfds/core/SplitBase.md | 151 +++++++++ docs/api_docs/python/tfds/core/SplitDict.md | 9 + docs/api_docs/python/tfds/dataset_as_numpy.md | 21 +- .../python/tfds/download/DownloadManager.md | 7 +- docs/api_docs/python/tfds/features/Text.md | 4 +- .../tfds/features/text/SubwordTextEncoder.md | 4 +- .../tfds/features/text/TokenTextEncoder.md | 2 +- .../tfds/file_adapter/FileFormatAdapter.md | 2 +- docs/api_docs/python/tfds/load.md | 2 +- docs/datasets.md | 291 ++++++++++++++++-- .../core/dataset_builder_test.py | 18 +- tensorflow_datasets/core/dataset_info.py | 2 +- tensorflow_datasets/core/dataset_utils.py | 13 +- .../core/dataset_utils_test.py | 20 +- tensorflow_datasets/core/registered.py | 2 +- tensorflow_datasets/core/splits_test.py | 2 +- tensorflow_datasets/core/test_utils.py | 2 +- tensorflow_datasets/public_api.py | 2 + .../scripts/document_datasets.py | 2 +- .../testing/dataset_builder_testing.py | 2 +- tensorflow_datasets/testing/e2e_binary.py | 2 +- 33 files changed, 594 insertions(+), 110 deletions(-) create mode 100644 docs/api_docs/python/tfds/as_numpy.md create mode 100644 docs/api_docs/python/tfds/core/SplitBase.md diff --git a/README.md b/README.md index a433b2ab48e..67b4d8415fb 100644 --- a/README.md +++ b/README.md @@ -99,10 +99,10 @@ print(info) ) ``` -### NumPy Usage with `tfds.dataset_as_numpy` +### NumPy Usage with `tfds.as_numpy` As a convenience for users that want simple NumPy arrays in their programs, you -can use `tfds.dataset_as_numpy` to return a generator that yields NumPy array +can use `tfds.as_numpy` to return a generator that yields NumPy array records out of a `tf.data.Dataset`. This allows you to build high-performance input pipelines with `tf.data` but use whatever you'd like for your model components. @@ -110,16 +110,16 @@ components. 
 ```python
 train_ds = tfds.load("mnist", split=tfds.Split.TRAIN)
 train_ds = train_ds.shuffle(1024).batch(128).repeat(5).prefetch(10)
-for example in tfds.dataset_as_numpy(train_ds):
+for example in tfds.as_numpy(train_ds):
   numpy_images, numpy_labels = example["image"], example["label"]
 ```
 
-You can also use `tfds.dataset_as_numpy` in conjunction with `batch_size=-1` to
+You can also use `tfds.as_numpy` in conjunction with `batch_size=-1` to
 get the full dataset in NumPy arrays from the returned `tf.Tensor` object:
 
 ```python
 train_data = tfds.load("mnist", split=tfds.Split.TRAIN, batch_size=-1)
-numpy_data = tfds.dataset_as_numpy(train_data)
+numpy_data = tfds.as_numpy(train_data)
 numpy_images, numpy_labels = numpy_data["image"], numpy_data["label"]
 ```
 
diff --git a/docs/api_docs/python/_toc.yaml b/docs/api_docs/python/_toc.yaml
index 75ecaf0367b..c8f976dfc7d 100644
--- a/docs/api_docs/python/_toc.yaml
+++ b/docs/api_docs/python/_toc.yaml
@@ -4,6 +4,8 @@ toc:
   section:
   - title: Overview
     path: /datasets/api_docs/python/tfds
+  - title: as_numpy
+    path: /datasets/api_docs/python/tfds/as_numpy
   - title: builder
     path: /datasets/api_docs/python/tfds/builder
   - title: dataset_as_numpy
@@ -34,6 +36,8 @@ toc:
     path: /datasets/api_docs/python/tfds/core/lazy_imports
   - title: NamedSplit
     path: /datasets/api_docs/python/tfds/core/NamedSplit
+  - title: SplitBase
+    path: /datasets/api_docs/python/tfds/core/SplitBase
   - title: SplitDict
     path: /datasets/api_docs/python/tfds/core/SplitDict
   - title: SplitGenerator
diff --git a/docs/api_docs/python/index.md b/docs/api_docs/python/index.md
index 504a8d516e4..0a029fc6de3 100644
--- a/docs/api_docs/python/index.md
+++ b/docs/api_docs/python/index.md
@@ -3,6 +3,7 @@
 * tfds
 * tfds.GenerateMode
 * tfds.Split
+* tfds.as_numpy
 * tfds.builder
 * tfds.core
 * tfds.core.BuilderConfig
@@ -10,6 +11,7 @@
 * tfds.core.DatasetInfo
 * tfds.core.GeneratorBasedBuilder
 * tfds.core.NamedSplit
+* tfds.core.SplitBase
 * tfds.core.SplitDict
 * tfds.core.SplitGenerator
 * tfds.core.SplitInfo
diff --git a/docs/api_docs/python/tfds.md b/docs/api_docs/python/tfds.md
index 664eca14d67..9f63cb933a1 100644
--- a/docs/api_docs/python/tfds.md
+++ b/docs/api_docs/python/tfds.md
@@ -55,7 +55,9 @@ Documentation:
 
 ## Functions
 
-[`dataset_as_numpy(...)`](./tfds/dataset_as_numpy.md): Converts a `tf.data.Dataset` to an iterable of NumPy arrays.
+[`as_numpy(...)`](./tfds/as_numpy.md): Converts a `tf.data.Dataset` to an iterable of NumPy arrays.
+
+[`dataset_as_numpy(...)`](./tfds/dataset_as_numpy.md): DEPRECATED. Renamed tfds.as_numpy.
 
 [`builder(...)`](./tfds/builder.md): Fetches a tfds.core.DatasetBuilder by string name.
 
diff --git a/docs/api_docs/python/tfds/Split.md b/docs/api_docs/python/tfds/Split.md
index 305e6b3eb38..0c89f3eea1c 100644
--- a/docs/api_docs/python/tfds/Split.md
+++ b/docs/api_docs/python/tfds/Split.md
@@ -32,6 +32,8 @@ stages of training and evaluation.
 * `ALL`: Special value corresponding to all existing splits of a dataset
   merged together
 
+Note: All splits, including compositions, inherit from tfds.core.SplitBase
+
 See the
 [guide on splits](https://github.com/tensorflow/datasets/tree/master/docs/splits.md)
 for more information.
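As context for the `SplitBase` note above, here is a minimal sketch of split composition; the `mnist` name is illustrative only and the snippet assumes a working TFDS install:

```python
import tensorflow_datasets as tfds

# Compose a split tree (SplitBase.__add__ / subsplit) before it is resolved.
split = tfds.Split.TRAIN.subsplit(tfds.percent[:75]) + tfds.Split.TEST
ds = tfds.load("mnist", split=split)

# The composed split loads like any other tf.data.Dataset.
for example in tfds.as_numpy(ds.take(1)):
    print(example["image"].shape, example["label"])
```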
diff --git a/docs/api_docs/python/tfds/_api_cache.json b/docs/api_docs/python/tfds/_api_cache.json index 1ffb6e6502d..93c3e8a1d10 100644 --- a/docs/api_docs/python/tfds/_api_cache.json +++ b/docs/api_docs/python/tfds/_api_cache.json @@ -1,5 +1,5 @@ { - "current_doc_full_name": "tfds.core.DatasetBuilder.IN_DEVELOPMENT", + "current_doc_full_name": "tfds.features.text.TextEncoder.__hash__", "duplicate_of": { "tfds.GenerateMode": "tfds.download.GenerateMode", "tfds.GenerateMode.FORCE_REDOWNLOAD": "tfds.download.GenerateMode.FORCE_REDOWNLOAD", @@ -9,7 +9,7 @@ "tfds.Split.__format__": "tfds.core.BuilderConfig.__format__", "tfds.Split.__getattribute__": "tfds.core.BuilderConfig.__getattribute__", "tfds.Split.__hash__": "tfds.core.BuilderConfig.__hash__", - "tfds.Split.__init__": "tfds.core.Version.__init__", + "tfds.Split.__init__": "tfds.core.SplitBase.__init__", "tfds.Split.__reduce__": "tfds.core.BuilderConfig.__reduce__", "tfds.Split.__reduce_ex__": "tfds.core.BuilderConfig.__reduce_ex__", "tfds.Split.__repr__": "tfds.core.DatasetBuilder.__repr__", @@ -58,6 +58,18 @@ "tfds.core.NamedSplit.__reduce_ex__": "tfds.core.BuilderConfig.__reduce_ex__", "tfds.core.NamedSplit.__setattr__": "tfds.core.BuilderConfig.__setattr__", "tfds.core.NamedSplit.__sizeof__": "tfds.core.BuilderConfig.__sizeof__", + "tfds.core.NamedSplit.__weakref__": "tfds.core.SplitBase.__weakref__", + "tfds.core.SplitBase.__delattr__": "tfds.core.BuilderConfig.__delattr__", + "tfds.core.SplitBase.__format__": "tfds.core.BuilderConfig.__format__", + "tfds.core.SplitBase.__getattribute__": "tfds.core.BuilderConfig.__getattribute__", + "tfds.core.SplitBase.__hash__": "tfds.core.BuilderConfig.__hash__", + "tfds.core.SplitBase.__new__": "tfds.core.BuilderConfig.__new__", + "tfds.core.SplitBase.__reduce__": "tfds.core.BuilderConfig.__reduce__", + "tfds.core.SplitBase.__reduce_ex__": "tfds.core.BuilderConfig.__reduce_ex__", + "tfds.core.SplitBase.__repr__": "tfds.core.DatasetBuilder.__repr__", + "tfds.core.SplitBase.__setattr__": "tfds.core.BuilderConfig.__setattr__", + "tfds.core.SplitBase.__sizeof__": "tfds.core.BuilderConfig.__sizeof__", + "tfds.core.SplitBase.__str__": "tfds.core.BuilderConfig.__str__", "tfds.core.SplitDict.__delattr__": "tfds.core.BuilderConfig.__delattr__", "tfds.core.SplitDict.__format__": "tfds.core.BuilderConfig.__format__", "tfds.core.SplitDict.__reduce__": "tfds.core.BuilderConfig.__reduce__", @@ -87,6 +99,7 @@ "tfds.core.SplitInfo.__str__": "tfds.core.BuilderConfig.__str__", "tfds.core.Version.__delattr__": "tfds.core.BuilderConfig.__delattr__", "tfds.core.Version.__format__": "tfds.core.BuilderConfig.__format__", + "tfds.core.Version.__init__": "tfds.core.SplitBase.__init__", "tfds.core.Version.__reduce__": "tfds.core.BuilderConfig.__reduce__", "tfds.core.Version.__reduce_ex__": "tfds.core.BuilderConfig.__reduce_ex__", "tfds.core.Version.__setattr__": "tfds.core.BuilderConfig.__setattr__", @@ -150,7 +163,7 @@ "tfds.features.BBox.__getslice__": "tfds.core.Version.__getslice__", "tfds.features.BBox.__gt__": "tfds.core.Version.__gt__", "tfds.features.BBox.__hash__": "tfds.core.Version.__hash__", - "tfds.features.BBox.__init__": "tfds.core.Version.__init__", + "tfds.features.BBox.__init__": "tfds.core.SplitBase.__init__", "tfds.features.BBox.__iter__": "tfds.core.Version.__iter__", "tfds.features.BBox.__le__": "tfds.core.Version.__le__", "tfds.features.BBox.__len__": "tfds.core.Version.__len__", @@ -199,7 +212,7 @@ "tfds.features.FeatureConnector.__format__": "tfds.core.BuilderConfig.__format__", 
"tfds.features.FeatureConnector.__getattribute__": "tfds.core.BuilderConfig.__getattribute__", "tfds.features.FeatureConnector.__hash__": "tfds.core.BuilderConfig.__hash__", - "tfds.features.FeatureConnector.__init__": "tfds.core.Version.__init__", + "tfds.features.FeatureConnector.__init__": "tfds.core.SplitBase.__init__", "tfds.features.FeatureConnector.__new__": "tfds.core.BuilderConfig.__new__", "tfds.features.FeatureConnector.__reduce__": "tfds.core.BuilderConfig.__reduce__", "tfds.features.FeatureConnector.__reduce_ex__": "tfds.core.BuilderConfig.__reduce_ex__", @@ -292,7 +305,7 @@ "tfds.features.TensorInfo.__getslice__": "tfds.core.Version.__getslice__", "tfds.features.TensorInfo.__gt__": "tfds.core.Version.__gt__", "tfds.features.TensorInfo.__hash__": "tfds.core.Version.__hash__", - "tfds.features.TensorInfo.__init__": "tfds.core.Version.__init__", + "tfds.features.TensorInfo.__init__": "tfds.core.SplitBase.__init__", "tfds.features.TensorInfo.__iter__": "tfds.core.Version.__iter__", "tfds.features.TensorInfo.__le__": "tfds.core.Version.__le__", "tfds.features.TensorInfo.__len__": "tfds.core.Version.__len__", @@ -365,7 +378,7 @@ "tfds.features.text.TextEncoder.__format__": "tfds.core.BuilderConfig.__format__", "tfds.features.text.TextEncoder.__getattribute__": "tfds.core.BuilderConfig.__getattribute__", "tfds.features.text.TextEncoder.__hash__": "tfds.core.BuilderConfig.__hash__", - "tfds.features.text.TextEncoder.__init__": "tfds.core.Version.__init__", + "tfds.features.text.TextEncoder.__init__": "tfds.core.SplitBase.__init__", "tfds.features.text.TextEncoder.__new__": "tfds.core.BuilderConfig.__new__", "tfds.features.text.TextEncoder.__reduce__": "tfds.core.BuilderConfig.__reduce__", "tfds.features.text.TextEncoder.__reduce_ex__": "tfds.core.BuilderConfig.__reduce_ex__", @@ -423,7 +436,7 @@ "tfds.file_adapter.FileFormatAdapter.__format__": "tfds.core.BuilderConfig.__format__", "tfds.file_adapter.FileFormatAdapter.__getattribute__": "tfds.core.BuilderConfig.__getattribute__", "tfds.file_adapter.FileFormatAdapter.__hash__": "tfds.core.BuilderConfig.__hash__", - "tfds.file_adapter.FileFormatAdapter.__init__": "tfds.core.Version.__init__", + "tfds.file_adapter.FileFormatAdapter.__init__": "tfds.core.SplitBase.__init__", "tfds.file_adapter.FileFormatAdapter.__new__": "tfds.core.BuilderConfig.__new__", "tfds.file_adapter.FileFormatAdapter.__reduce__": "tfds.core.BuilderConfig.__reduce__", "tfds.file_adapter.FileFormatAdapter.__reduce_ex__": "tfds.core.BuilderConfig.__reduce_ex__", @@ -448,7 +461,7 @@ "tfds.percent.__format__": "tfds.core.BuilderConfig.__format__", "tfds.percent.__getattribute__": "tfds.core.BuilderConfig.__getattribute__", "tfds.percent.__hash__": "tfds.core.BuilderConfig.__hash__", - "tfds.percent.__init__": "tfds.core.Version.__init__", + "tfds.percent.__init__": "tfds.core.SplitBase.__init__", "tfds.percent.__new__": "tfds.core.BuilderConfig.__new__", "tfds.percent.__reduce__": "tfds.core.BuilderConfig.__reduce__", "tfds.percent.__reduce_ex__": "tfds.core.BuilderConfig.__reduce_ex__", @@ -488,6 +501,7 @@ "tfds.Split.__str__": true, "tfds.Split.__subclasshook__": true, "tfds.Split.__weakref__": true, + "tfds.as_numpy": false, "tfds.builder": false, "tfds.core": false, "tfds.core.BuilderConfig": false, @@ -564,10 +578,9 @@ "tfds.core.DatasetInfo.description": true, "tfds.core.DatasetInfo.download_checksums": true, "tfds.core.DatasetInfo.features": true, - "tfds.core.DatasetInfo.initialize_from_package_data": true, + "tfds.core.DatasetInfo.initialize_from_bucket": 
true, "tfds.core.DatasetInfo.initialized": true, "tfds.core.DatasetInfo.name": true, - "tfds.core.DatasetInfo.num_examples": true, "tfds.core.DatasetInfo.read_from_directory": true, "tfds.core.DatasetInfo.size_in_bytes": true, "tfds.core.DatasetInfo.splits": true, @@ -626,6 +639,29 @@ "tfds.core.NamedSplit.__weakref__": true, "tfds.core.NamedSplit.get_read_instruction": true, "tfds.core.NamedSplit.subsplit": true, + "tfds.core.SplitBase": false, + "tfds.core.SplitBase.__abstractmethods__": true, + "tfds.core.SplitBase.__add__": true, + "tfds.core.SplitBase.__delattr__": true, + "tfds.core.SplitBase.__dict__": true, + "tfds.core.SplitBase.__doc__": true, + "tfds.core.SplitBase.__eq__": true, + "tfds.core.SplitBase.__format__": true, + "tfds.core.SplitBase.__getattribute__": true, + "tfds.core.SplitBase.__hash__": true, + "tfds.core.SplitBase.__init__": true, + "tfds.core.SplitBase.__module__": true, + "tfds.core.SplitBase.__new__": true, + "tfds.core.SplitBase.__reduce__": true, + "tfds.core.SplitBase.__reduce_ex__": true, + "tfds.core.SplitBase.__repr__": true, + "tfds.core.SplitBase.__setattr__": true, + "tfds.core.SplitBase.__sizeof__": true, + "tfds.core.SplitBase.__str__": true, + "tfds.core.SplitBase.__subclasshook__": true, + "tfds.core.SplitBase.__weakref__": true, + "tfds.core.SplitBase.get_read_instruction": true, + "tfds.core.SplitBase.subsplit": true, "tfds.core.SplitDict": false, "tfds.core.SplitDict.__cmp__": true, "tfds.core.SplitDict.__contains__": true, @@ -673,6 +709,7 @@ "tfds.core.SplitDict.popitem": true, "tfds.core.SplitDict.setdefault": true, "tfds.core.SplitDict.to_proto": true, + "tfds.core.SplitDict.total_num_examples": true, "tfds.core.SplitDict.update": true, "tfds.core.SplitDict.values": true, "tfds.core.SplitDict.viewitems": true, diff --git a/docs/api_docs/python/tfds/as_numpy.md b/docs/api_docs/python/tfds/as_numpy.md new file mode 100644 index 00000000000..4370f221f5d --- /dev/null +++ b/docs/api_docs/python/tfds/as_numpy.md @@ -0,0 +1,34 @@ +
+<div itemscope itemtype="http://developers.google.com/ReferenceObject">
+<meta itemprop="name" content="tfds.as_numpy" />
+<meta itemprop="path" content="Stable" />
+</div>
+ +# tfds.as_numpy + +``` python +tfds.as_numpy( + dataset, + graph=None +) +``` + + + +Defined in [`core/dataset_utils.py`](https://github.com/tensorflow/datasets/tree/master/tensorflow_datasets/core/dataset_utils.py). + +Converts a `tf.data.Dataset` to an iterable of NumPy arrays. + +`as_numpy` converts a possibly nested structure of `tf.data.Dataset`s +and `tf.Tensor`s to iterables of NumPy arrays and NumPy arrays, respectively. + +#### Args: + +* `dataset`: a possibly nested structure of `tf.data.Dataset`s and/or + `tf.Tensor`s. +* `graph`: `tf.Graph`, optional, explicitly set the graph to use. + + +#### Returns: + +A structure matching `dataset` where `tf.data.Dataset`s are converted to +generators of NumPy arrays and `tf.Tensor`s are converted to NumPy arrays. \ No newline at end of file diff --git a/docs/api_docs/python/tfds/core.md b/docs/api_docs/python/tfds/core.md index ed0a3904155..95e183a4581 100644 --- a/docs/api_docs/python/tfds/core.md +++ b/docs/api_docs/python/tfds/core.md @@ -25,6 +25,8 @@ API to define datasets. [`class NamedSplit`](../tfds/core/NamedSplit.md): Descriptor corresponding to a named split (train, test, ...). +[`class SplitBase`](../tfds/core/SplitBase.md): Abstract base class for Split compositionality. + [`class SplitDict`](../tfds/core/SplitDict.md): Split info object. [`class SplitGenerator`](../tfds/core/SplitGenerator.md): Defines the split information for the generator. diff --git a/docs/api_docs/python/tfds/core/DatasetBuilder.md b/docs/api_docs/python/tfds/core/DatasetBuilder.md index 01fd31fdb9f..3b62a206d74 100644 --- a/docs/api_docs/python/tfds/core/DatasetBuilder.md +++ b/docs/api_docs/python/tfds/core/DatasetBuilder.md @@ -52,7 +52,7 @@ assert isinstance(train_dataset, tf.data.Dataset) # And then the rest of your input pipeline train_dataset = train_dataset.repeat().shuffle(1024).batch(128) train_dataset = train_dataset.prefetch(2) -features = train_dataset.make_one_shot_iterator().get_next() +features = tf.compat.v1.data.make_one_shot_iterator(train_dataset).get_next() image, label = features['image'], features['label'] ``` @@ -110,8 +110,8 @@ Callers must pass arguments as keyword arguments. #### Args: -* `split`: tfds.Split, which subset of the data to read. If None (default), - returns all splits in a dict +* `split`: tfds.core.SplitBase, which subset(s) of the data to read. If None + (default), returns all splits in a dict ``. * `batch_size`: `int`, batch size. Note that variable-length features will be 0-padded if `batch_size > 1`. Users that want more custom behavior diff --git a/docs/api_docs/python/tfds/core/DatasetInfo.md b/docs/api_docs/python/tfds/core/DatasetInfo.md index bf83a73547e..03677f8d768 100644 --- a/docs/api_docs/python/tfds/core/DatasetInfo.md +++ b/docs/api_docs/python/tfds/core/DatasetInfo.md @@ -9,7 +9,6 @@ - @@ -17,7 +16,7 @@ - + @@ -113,10 +112,6 @@ Whether DatasetInfo has been fully initialized. -

-<h3 id="num_examples"><code>num_examples</code></h3>

- - -

 <h3 id="size_in_bytes"><code>size_in_bytes</code></h3>

@@ -149,20 +144,20 @@ compute_dynamic_properties() -

-<h3 id="initialize_from_package_data"><code>initialize_from_package_data</code></h3>

+

+<h3 id="initialize_from_bucket"><code>initialize_from_bucket</code></h3>

``` python -initialize_from_package_data() +initialize_from_bucket() ``` -Initialize DatasetInfo from package data, returns True on success. +Initialize DatasetInfo from GCS bucket info files.
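As a rough sketch of how the renamed method might be invoked (the `mnist` builder and the presence of info files on GCS are assumptions, not something this diff guarantees):

```python
import tensorflow_datasets as tfds

builder = tfds.builder("mnist")
# Pull pre-computed metadata (splits, statistics) from the TFDS GCS bucket
# instead of computing it locally; this needs network access.
builder.info.initialize_from_bucket()
print(builder.info.splits)
```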

 <h3 id="read_from_directory"><code>read_from_directory</code></h3>

``` python read_from_directory( dataset_info_dir, - from_packaged_data=False + from_bucket=False ) ``` @@ -177,7 +172,7 @@ This will overwrite all previous metadata. * `dataset_info_dir`: `str` The directory containing the metadata file. This should be the root directory of a specific dataset version. -* `from_packaged_data`: `bool`, If data is restored from packaged data, +* `from_bucket`: `bool`, If data is restored from info files on GCS, then only the informations not defined in the code are updated diff --git a/docs/api_docs/python/tfds/core/GeneratorBasedBuilder.md b/docs/api_docs/python/tfds/core/GeneratorBasedBuilder.md index 20f150c05b4..a6c83a98d36 100644 --- a/docs/api_docs/python/tfds/core/GeneratorBasedBuilder.md +++ b/docs/api_docs/python/tfds/core/GeneratorBasedBuilder.md @@ -83,8 +83,8 @@ Callers must pass arguments as keyword arguments. #### Args: -* `split`: tfds.Split, which subset of the data to read. If None (default), - returns all splits in a dict +* `split`: tfds.core.SplitBase, which subset(s) of the data to read. If None + (default), returns all splits in a dict ``. * `batch_size`: `int`, batch size. Note that variable-length features will be 0-padded if `batch_size > 1`. Users that want more custom behavior diff --git a/docs/api_docs/python/tfds/core/NamedSplit.md b/docs/api_docs/python/tfds/core/NamedSplit.md index 9f317a2ef1c..dd228710c5e 100644 --- a/docs/api_docs/python/tfds/core/NamedSplit.md +++ b/docs/api_docs/python/tfds/core/NamedSplit.md @@ -12,7 +12,7 @@ ## Class `NamedSplit` - +Inherits From: [`SplitBase`](../../tfds/core/SplitBase.md) diff --git a/docs/api_docs/python/tfds/core/SplitBase.md b/docs/api_docs/python/tfds/core/SplitBase.md new file mode 100644 index 00000000000..7dd75110890 --- /dev/null +++ b/docs/api_docs/python/tfds/core/SplitBase.md @@ -0,0 +1,151 @@ +
+<div itemscope itemtype="http://developers.google.com/ReferenceObject">
+<meta itemprop="name" content="tfds.core.SplitBase" />
+<meta itemprop="path" content="Stable" />
+<meta itemprop="property" content="__add__"/>
+<meta itemprop="property" content="__eq__"/>
+<meta itemprop="property" content="get_read_instruction"/>
+<meta itemprop="property" content="subsplit"/>
+</div>
+ +# tfds.core.SplitBase + +## Class `SplitBase` + + + + + +Defined in [`core/splits.py`](https://github.com/tensorflow/datasets/tree/master/tensorflow_datasets/core/splits.py). + +Abstract base class for Split compositionality. + +See the +[guide on splits](https://github.com/tensorflow/datasets/tree/master/docs/splits.md) +for more information. + +There are three parts to the composition: + 1) The splits are composed (defined, merged, splitted,...) together before + calling the `.as_dataset()` function. This is done with the `__add__`, + `__getitem__`, which return a tree of `SplitBase` (whose leaf + are the `NamedSplit` objects) + + ``` + split = tfds.TRAIN + tfds.TEST.subsplit(tfds.percent[:50]) + ``` + + 2) The `SplitBase` is forwarded to the `.as_dataset()` function + to be resolved into actual read instruction. This is done by the + `.get_read_instruction()` method which takes the real dataset splits + (name, number of shards,...) and parse the tree to return a + `SplitReadInstruction()` object + + ``` + read_instruction = split.get_read_instruction(self.info.splits) + ``` + + 3) The `SplitReadInstruction` is then used in the `tf.data.Dataset` pipeline + to define which files to read and how to skip examples within file. + + ``` + files_to_read = read_instruction.split_info_list + slice_per_file = read_instruction.slice_list + ``` + +## Methods + +

+<h3 id="__add__"><code>__add__</code></h3>

+ +``` python +__add__(other) +``` + +Merging: tfds.Split.TRAIN + tfds.Split.TEST. + +
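For instance, a hypothetical merge of two named splits (the dataset name is illustrative):

```python
# __add__ builds a composed split that is only resolved at read time.
combined = tfds.Split.TRAIN + tfds.Split.TEST
ds = tfds.load("mnist", split=combined)
```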

+<h3 id="__eq__"><code>__eq__</code></h3>

+ +``` python +__eq__(other) +``` + +Equality: tfds.Split.TRAIN == 'train'. + +
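A quick sanity-check sketch of the equality behavior:

```python
# NamedSplit instances compare equal to their string names.
assert tfds.Split.TRAIN == "train"
assert tfds.Split.TEST == "test"
```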

+<h3 id="get_read_instruction"><code>get_read_instruction</code></h3>

+ +``` python +get_read_instruction(split_dict) +``` + +Parse the descriptor tree and compile all read instructions together. + +#### Args: + +* `split_dict`: `dict`, The `dict[split_name, SplitInfo]` of the dataset + + +#### Returns: + +* `split_read_instruction`: `SplitReadInstruction` + +
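A minimal sketch of the resolution step described above, assuming a prepared builder (`mnist` is a stand-in):

```python
import tensorflow_datasets as tfds

builder = tfds.builder("mnist")
split = tfds.Split.TRAIN.subsplit(tfds.percent[:50]) + tfds.Split.TEST
# Compile the split tree into concrete read instructions for this dataset.
read_instruction = split.get_read_instruction(builder.info.splits)
```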

+<h3 id="subsplit"><code>subsplit</code></h3>

+``` python
+subsplit(
+    arg=None,
+    k=None,
+    percent=None,
+    weighted=None
+)
+```
+
+Divides this split into subsplits.
+
+There are 3 ways to define subsplits, which correspond to the 3
+arguments `k` (get `k` even subsplits), `percent` (get a slice of the
+dataset with tfds.percent), and `weighted` (get subsplits with proportions
+specified by `weighted`).
+
+Examples:
+
+```
+# 50% train, 50% test
+train, test = split.subsplit(k=2)
+# 50% train, 25% test, 25% validation
+train, test, validation = split.subsplit(weighted=[2, 1, 1])
+# Extract last 20%
+subsplit = split.subsplit(tfds.percent[-20:])
+```
+
+Warning: k and weighted will be converted into percent which means that
+values below the percent will be rounded up or down. The final split may be
+bigger to deal with remainders. For instance:
+
+```
+train, test, valid = split.subsplit(k=3)  # 33%, 33%, 34%
+s1, s2, s3, s4 = split.subsplit(weighted=[2, 2, 1, 1])  # 33%, 33%, 16%, 18%
+```
+
+#### Args:
+
+* `arg`: If no kwargs are given, `arg` will be interpreted as one of
+  `k`, `percent`, or `weighted` depending on the type.
+  For example:
+  ```
+  split.subsplit(10)  # Equivalent to split.subsplit(k=10)
+  split.subsplit(tfds.percent[:-20])  # percent=tfds.percent[:-20]
+  split.subsplit([1, 1, 2])  # weighted=[1, 1, 2]
+  ```
+* `k`: `int` If set, subdivide the split into `k` equal parts.
+* `percent`: `tfds.percent slice`, return a single subsplit corresponding to
+  a slice of the original split. For example:
+  `split.subsplit(tfds.percent[-20:])  # Last 20% of the dataset`.
+* `weighted`: `list[int]`, return a list of subsplits whose proportions match
+  the normalized sum of the list. For example:
+  `split.subsplit(weighted=[1, 1, 2])  # 25%, 25%, 50%`.
+
+
+#### Returns:
+
+A subsplit or list of subsplits extracted from this split object.
+
+
+
diff --git a/docs/api_docs/python/tfds/core/SplitDict.md b/docs/api_docs/python/tfds/core/SplitDict.md
index 6fe76d3023d..a417a5f3d9e 100644
--- a/docs/api_docs/python/tfds/core/SplitDict.md
+++ b/docs/api_docs/python/tfds/core/SplitDict.md
@@ -1,6 +1,7 @@
+<meta itemprop="property" content="total_num_examples"/>

@@ -33,6 +34,14 @@ __init__()
 
 
 
+## Properties
+

+<h3 id="total_num_examples"><code>total_num_examples</code></h3>

+ +Return the total number of examples. + + + ## Methods

 <h3 id="__getitem__"><code>__getitem__</code></h3>

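The new `total_num_examples` property is reachable through `DatasetInfo.splits`; a minimal sketch, assuming the `mnist` dataset:

```python
import tensorflow_datasets as tfds

info = tfds.builder("mnist").info
# Sum of num_examples across all splits (70,000 for MNIST: 60k train + 10k test).
print(info.splits.total_num_examples)
```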
diff --git a/docs/api_docs/python/tfds/dataset_as_numpy.md b/docs/api_docs/python/tfds/dataset_as_numpy.md index aa513518323..30fd1019527 100644 --- a/docs/api_docs/python/tfds/dataset_as_numpy.md +++ b/docs/api_docs/python/tfds/dataset_as_numpy.md @@ -7,8 +7,8 @@ ``` python tfds.dataset_as_numpy( - dataset, - graph=None + *args, + **kwargs ) ``` @@ -16,19 +16,4 @@ tfds.dataset_as_numpy( Defined in [`core/dataset_utils.py`](https://github.com/tensorflow/datasets/tree/master/tensorflow_datasets/core/dataset_utils.py). -Converts a `tf.data.Dataset` to an iterable of NumPy arrays. - -`dataset_as_numpy` converts a possibly nested structure of `tf.data.Dataset`s -and `tf.Tensor`s to iterables of NumPy arrays and NumPy arrays, respectively. - -#### Args: - -* `dataset`: a possibly nested structure of `tf.data.Dataset`s and/or - `tf.Tensor`s. -* `graph`: `tf.Graph`, optional, explicitly set the graph to use. - - -#### Returns: - -A structure matching `dataset` where `tf.data.Dataset`s are converted to -generators of NumPy arrays and `tf.Tensor`s are converted to NumPy arrays. \ No newline at end of file +DEPRECATED. Renamed tfds.as_numpy. \ No newline at end of file diff --git a/docs/api_docs/python/tfds/download/DownloadManager.md b/docs/api_docs/python/tfds/download/DownloadManager.md index d1f8a97561f..bc1c10a078c 100644 --- a/docs/api_docs/python/tfds/download/DownloadManager.md +++ b/docs/api_docs/python/tfds/download/DownloadManager.md @@ -71,10 +71,10 @@ For more customization on the download/extraction (ex: passwords, output_name, ``` python __init__( - dataset_name, - download_dir=None, + download_dir, extract_dir=None, manual_dir=None, + dataset_name=None, checksums=None, force_download=False, force_extraction=False @@ -85,10 +85,11 @@ Download manager constructor. #### Args: -* `dataset_name`: `str`, name of dataset this instance will be used for. * `download_dir`: `str`, path to directory where downloads are stored. * `extract_dir`: `str`, path to directory where artifacts are extracted. * `manual_dir`: `str`, path to manually downloaded/extracted data directory. +* `dataset_name`: `str`, name of dataset this instance will be used for. If + provided, downloads will contain which datasets they were used for. * `checksums`: `dict`, url to sha256 of resource. Only URLs present are checked. If empty, checksum of (already) downloaded files is computed and can diff --git a/docs/api_docs/python/tfds/features/Text.md b/docs/api_docs/python/tfds/features/Text.md index 4690b9720cf..7ec116ae3e9 100644 --- a/docs/api_docs/python/tfds/features/Text.md +++ b/docs/api_docs/python/tfds/features/Text.md @@ -142,7 +142,7 @@ get_tensor_info() ints2str(int_values) ``` -Conversion string => encoded list[int]. +Conversion list[int] => decoded string.

 <h3 id="load_metadata"><code>load_metadata</code></h3>

@@ -191,7 +191,7 @@ save_metadata( str2ints(str_value) ``` -Conversion list[int] => decoded string. +Conversion string => encoded list[int]. diff --git a/docs/api_docs/python/tfds/features/text/SubwordTextEncoder.md b/docs/api_docs/python/tfds/features/text/SubwordTextEncoder.md index 5b6a45ab169..f691e494989 100644 --- a/docs/api_docs/python/tfds/features/text/SubwordTextEncoder.md +++ b/docs/api_docs/python/tfds/features/text/SubwordTextEncoder.md @@ -60,8 +60,8 @@ Note: To generate a vocabulary from a corpus, use * `vocab_list`: `list`, list of subwords for the vocabulary. Note that an underscore at the end of a subword indicates the end of the word (i.e. a space will be inserted afterwards when decoding). Underscores in the - interior are disallowed and should use the underscore escape sequence - "\u". + interior of subwords are disallowed and should use the underscore + escape sequence. diff --git a/docs/api_docs/python/tfds/features/text/TokenTextEncoder.md b/docs/api_docs/python/tfds/features/text/TokenTextEncoder.md index 73d9083f983..e47db9be34d 100644 --- a/docs/api_docs/python/tfds/features/text/TokenTextEncoder.md +++ b/docs/api_docs/python/tfds/features/text/TokenTextEncoder.md @@ -34,7 +34,7 @@ regex "\W+". __init__( vocab_list, oov_buckets=1, - oov_token=u'UNK', + oov_token='UNK', lowercase=False, tokenizer=None ) diff --git a/docs/api_docs/python/tfds/file_adapter/FileFormatAdapter.md b/docs/api_docs/python/tfds/file_adapter/FileFormatAdapter.md index 1583f3812c6..9a378757597 100644 --- a/docs/api_docs/python/tfds/file_adapter/FileFormatAdapter.md +++ b/docs/api_docs/python/tfds/file_adapter/FileFormatAdapter.md @@ -51,7 +51,7 @@ Write to files from generators_and_filenames. * `generator_fn`: returns generator yielding dictionaries of feature name to value. - output_files (list): output files to write records to. +* `output_files`: `list`, output files to write records to. diff --git a/docs/api_docs/python/tfds/load.md b/docs/api_docs/python/tfds/load.md index 10344725c99..4c62d7d7c02 100644 --- a/docs/api_docs/python/tfds/load.md +++ b/docs/api_docs/python/tfds/load.md @@ -46,7 +46,7 @@ return ds ``` If you'd like NumPy arrays instead of `tf.data.Dataset`s or `tf.Tensor`s, -you can pass the return value to tfds.dataset_as_numpy. +you can pass the return value to tfds.as_numpy. Callers must pass arguments as keyword arguments. 
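Putting the renamed entry point together with `tfds.load`, a short end-to-end sketch (assumes `mnist` has already been downloaded and prepared):

```python
import tensorflow_datasets as tfds

ds = tfds.load("mnist", split=tfds.Split.TRAIN)
# tfds.as_numpy wraps the tf.data.Dataset in a generator of NumPy records.
for example in tfds.as_numpy(ds.take(2)):
    print(example["image"].shape, example["label"])
```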
diff --git a/docs/datasets.md b/docs/datasets.md
index 0e4b241ba76..8cbfc71a24f 100644
--- a/docs/datasets.md
+++ b/docs/datasets.md
@@ -9,7 +9,7 @@ tfds.list_builders()
 
 # Load a given dataset by name, along with the DatasetInfo
 data, info = tfds.load("mnist", with_info=True)
-train_data, test_data = data['test'], data['train']
+train_data, test_data = data['train'], data['test']
 assert isinstance(train_data, tf.data.Dataset)
 assert info.features['label'].num_classes == 10
 assert info.splits['train'].num_examples == 60000
@@ -21,13 +21,15 @@ builder.download_and_prepare()
 datasets = builder.as_dataset()
 
 # If you need NumPy arrays
-np_datasets = tfds.dataset_as_numpy(datasets)
+np_datasets = tfds.as_numpy(datasets)
 ```
 
 ---
 
 # Datasets
 
+* [`audio`](#audio)
+  * [`"nsynth"`](#nsynth)
 * [`image`](#image)
   * [`"celeb_a"`](#celeb_a)
   * [`"cifar10"`](#cifar10)
@@ -40,6 +42,8 @@
   * [`"lsun"`](#lsun)
   * [`"mnist"`](#mnist)
   * [`"omniglot"`](#omniglot)
+  * [`"open_images_v4"`](#open_images_v4)
+  * [`"quickdraw_bitmap"`](#quickdraw_bitmap)
   * [`"svhn_cropped"`](#svhn_cropped)
 * [`text`](#text)
   * [`"imdb_reviews"`](#imdb_reviews)
@@ -54,6 +58,92 @@
 
 ---
 
+# [`audio`](#audio)
+
+## `"nsynth"`
+
+The NSynth Dataset is an audio dataset containing ~300k musical notes, each
+with a unique pitch, timbre, and envelope. Each note is annotated with three
+additional pieces of information based on a combination of human evaluation
+and heuristic algorithms:
+  -Source: The method of sound production for the note's instrument.
+  -Family: The high-level family of which the note's instrument is a member.
+  -Qualities: Sonic qualities of the note.
+
+The dataset is split into train, valid, and test sets, with no instruments
+overlapping between the train set and the valid/test sets.
+ + +* URL: [https://g.co/magenta/nsynth-dataset](https://g.co/magenta/nsynth-dataset) +* `DatasetBuilder`: [`tfds.audio.nsynth.Nsynth`](https://github.com/tensorflow/datasets/tree/master/tensorflow_datasets/audio/nsynth.py) +* Version: `v1.0.0` + +### Features +``` +FeaturesDict({ + 'audio': Tensor(shape=(64000,), dtype=tf.float32), + 'id': Tensor(shape=(), dtype=tf.string), + 'instrument': FeaturesDict({ + 'family': ClassLabel(shape=(), dtype=tf.int64, num_classes=11), + 'label': ClassLabel(shape=(), dtype=tf.int64, num_classes=1006), + 'source': ClassLabel(shape=(), dtype=tf.int64, num_classes=3), + }), + 'pitch': ClassLabel(shape=(), dtype=tf.int64, num_classes=128), + 'qualities': FeaturesDict({ + 'bright': Tensor(shape=(), dtype=tf.bool), + 'dark': Tensor(shape=(), dtype=tf.bool), + 'distortion': Tensor(shape=(), dtype=tf.bool), + 'fast_decay': Tensor(shape=(), dtype=tf.bool), + 'long_release': Tensor(shape=(), dtype=tf.bool), + 'multiphonic': Tensor(shape=(), dtype=tf.bool), + 'nonlinear_env': Tensor(shape=(), dtype=tf.bool), + 'percussive': Tensor(shape=(), dtype=tf.bool), + 'reverb': Tensor(shape=(), dtype=tf.bool), + 'tempo-synced': Tensor(shape=(), dtype=tf.bool), + }), + 'velocity': ClassLabel(shape=(), dtype=tf.int64, num_classes=128), +}) +``` + + +### Statistics +Split | Examples +:----- | ---: +ALL | 305,979 +TRAIN | 289,205 +VALID | 12,678 +TEST | 4,096 + + +### Urls + * [https://g.co/magenta/nsynth-dataset](https://g.co/magenta/nsynth-dataset) + +### Supervised keys (for `as_supervised=True`) +None + +### Citation +``` +@InProceedings{pmlr-v70-engel17a, + title = {Neural Audio Synthesis of Musical Notes with {W}ave{N}et Autoencoders}, + author = {Jesse Engel and Cinjon Resnick and Adam Roberts and Sander Dieleman and Mohammad Norouzi and Douglas Eck and Karen Simonyan}, + booktitle = {Proceedings of the 34th International Conference on Machine Learning}, + pages = {1068--1077}, + year = {2017}, + editor = {Doina Precup and Yee Whye Teh}, + volume = {70}, + series = {Proceedings of Machine Learning Research}, + address = {International Convention Centre, Sydney, Australia}, + month = {06--11 Aug}, + publisher = {PMLR}, + pdf = {http://proceedings.mlr.press/v70/engel17a/engel17a.pdf}, + url = {http://proceedings.mlr.press/v70/engel17a.html}, +} + +``` + +--- + + # [`image`](#image) ## `"celeb_a"` @@ -62,7 +152,7 @@ Large-scale CelebFaces Attributes, CelebA.Set of ~30k celebrities pictures. Thes * URL: [http://mmlab.ie.cuhk.edu.hk/projects/CelebA.html](http://mmlab.ie.cuhk.edu.hk/projects/CelebA.html) * `DatasetBuilder`: [`tfds.image.celeba.CelebA`](https://github.com/tensorflow/datasets/tree/master/tensorflow_datasets/image/celeba.py) -* Version: `v0.2.0` +* Version: `v0.3.0` ### Features ``` @@ -468,7 +558,7 @@ images for most of the concepts in the WordNet hierarchy. * URL: [http://image-net.org/](http://image-net.org/) * `DatasetBuilder`: [`tfds.image.imagenet.Imagenet2012`](https://github.com/tensorflow/datasets/tree/master/tensorflow_datasets/image/imagenet.py) -* Version: `v1.0.0` +* Version: `v2.0.0` ### Features ``` @@ -682,6 +772,143 @@ SMALL1 | 2,720 --- +## `"open_images_v4"` + +Open Images is a dataset of ~9M images that have been annotated with image-level + labels and object bounding boxes. + +The training set of V4 contains 14.6M bounding boxes for 600 object classes on +1.74M images, making it the largest existing dataset with object location +annotations. 
The boxes have been largely manually drawn by professional +annotators to ensure accuracy and consistency. The images are very diverse and +often contain complex scenes with several objects (8.4 per image on average). +Moreover, the dataset is annotated with image-level labels spanning thousands of +classes. + + +* URL: [https://storage.googleapis.com/openimages/web/index.html](https://storage.googleapis.com/openimages/web/index.html) +* `DatasetBuilder`: [`tfds.image.open_images.OpenImagesV4`](https://github.com/tensorflow/datasets/tree/master/tensorflow_datasets/image/open_images.py) +* Version: `v0.0.1` + +### Features +``` +FeaturesDict({ + 'image': Image(shape=(None, None, 3), dtype=tf.uint8), + 'image/filename': Text(shape=(), dtype=tf.string, encoder=None), + 'objects': SequenceDict({ + 'confidence': Tensor(shape=(), dtype=tf.int32), + 'label': ClassLabel(shape=(), dtype=tf.int64, num_classes=19995), + 'source': ClassLabel(shape=(), dtype=tf.int64, num_classes=3), + }), +}) +``` + + +### Statistics +Split | Examples +:----- | ---: +ALL | 1,910,098 +TRAIN | 1,743,042 +TEST | 125,436 +VALIDATION | 41,620 + + +### Urls + * [https://storage.googleapis.com/openimages/web/index.html](https://storage.googleapis.com/openimages/web/index.html) + +### Supervised keys (for `as_supervised=True`) +None + +### Citation +``` +@article{OpenImages, + author = {Alina Kuznetsova and + Hassan Rom and + Neil Alldrin and + Jasper Uijlings and + Ivan Krasin and + Jordi Pont-Tuset and + Shahab Kamali and + Stefan Popov and + Matteo Malloci and + Tom Duerig and + Vittorio Ferrari}, + title = {The Open Images Dataset V4: Unified image classification, + object detection, and visual relationship detection at scale}, + year = {2018}, + journal = {arXiv:1811.00982} +} +@article{OpenImages2, + author = {Krasin, Ivan and + Duerig, Tom and + Alldrin, Neil and + Ferrari, Vittorio + and Abu-El-Haija, Sami and + Kuznetsova, Alina and + Rom, Hassan and + Uijlings, Jasper and + Popov, Stefan and + Kamali, Shahab and + Malloci, Matteo and + Pont-Tuset, Jordi and + Veit, Andreas and + Belongie, Serge and + Gomes, Victor and + Gupta, Abhinav and + Sun, Chen and + Chechik, Gal and + Cai, David and + Feng, Zheyun and + Narayanan, Dhyanesh and + Murphy, Kevin}, + title = {OpenImages: A public dataset for large-scale multi-label and + multi-class image classification.}, + journal = {Dataset available from + https://storage.googleapis.com/openimages/web/index.html}, + year={2017} +} + +``` + +--- + +## `"quickdraw_bitmap"` + +The Quick Draw Dataset is a collection of 50 million drawings across 345 categories, contributed by players of the game Quick, Draw!. 
The bitmap dataset contains these drawings converted from vector format into 28x28 grayscale images + +* URL: [https://github.com/googlecreativelab/quickdraw-dataset](https://github.com/googlecreativelab/quickdraw-dataset) +* `DatasetBuilder`: [`tfds.image.quickdraw.QuickdrawBitmap`](https://github.com/tensorflow/datasets/tree/master/tensorflow_datasets/image/quickdraw.py) +* Version: `v1.0.0` + +### Features +``` +FeaturesDict({ + 'image': Image(shape=(28, 28, 1), dtype=tf.uint8), + 'label': ClassLabel(shape=(), dtype=tf.int64, num_classes=345), +}) +``` + + +### Statistics +Split | Examples +:----- | ---: +TRAIN | 50,426,266 +ALL | 50,426,266 + + +### Urls + * [https://github.com/googlecreativelab/quickdraw-dataset](https://github.com/googlecreativelab/quickdraw-dataset) + +### Supervised keys (for `as_supervised=True`) +(u'image', u'label') + +### Citation +``` +A Neural Representation of Sketch Drawings, D. Ha and D. Eck, arXiv:1704.03477v4, 2017. +``` + +--- + ## `"svhn_cropped"` The Street View House Numbers (SVHN) Dataset is an image digit recognition dataset of over 600,000 digit images coming from real world data. Images are cropped to 32x32. @@ -871,7 +1098,7 @@ FeaturesDict({ ``` FeaturesDict({ - 'text': Text(shape=(None,), dtype=tf.int64, encoder=), + 'text': Text(shape=(None,), dtype=tf.int64, encoder=), }) ``` @@ -881,7 +1108,7 @@ FeaturesDict({ ``` FeaturesDict({ - 'text': Text(shape=(None,), dtype=tf.int64, encoder=), + 'text': Text(shape=(None,), dtype=tf.int64, encoder=), }) ``` @@ -1141,8 +1368,8 @@ FeaturesDict({ ``` FeaturesDict({ - 'en': Text(shape=(), dtype=tf.string, encoder=None), - 'fr': Text(shape=(), dtype=tf.string, encoder=None), + 'en': Text(shape=(None,), dtype=tf.int64, encoder=), + 'fr': Text(shape=(None,), dtype=tf.int64, encoder=), }) ``` @@ -1163,8 +1390,8 @@ FeaturesDict({ ``` FeaturesDict({ - 'en': Text(shape=(), dtype=tf.string, encoder=None), - 'fr': Text(shape=(), dtype=tf.string, encoder=None), + 'en': Text(shape=(None,), dtype=tf.int64, encoder=), + 'fr': Text(shape=(None,), dtype=tf.int64, encoder=), }) ``` @@ -1172,7 +1399,12 @@ FeaturesDict({ ### Statistics -None computed +Split | Examples +:----- | ---: +ALL | 18,319,500 +TRAIN | 18,316,500 +VALIDATION | 3,000 + ### Urls * [http://www.statmt.org/wmt18/](http://www.statmt.org/wmt18/) @@ -1271,21 +1503,21 @@ This data set contains videos generated from Starcraft. `starcraft_video` is configured with `tfds.video.starcraft.StarcraftVideoConfig` and has the following configurations predefined (defaults to the first one): -* `"brawl_64"` (`v0.1.1`): Brawl map with 64x64 resolution. +* `"brawl_64"` (`v0.1.2`): Brawl map with 64x64 resolution. -* `"brawl_128"` (`v0.1.1`): Brawl map with 128x128 resolution. +* `"brawl_128"` (`v0.1.2`): Brawl map with 128x128 resolution. -* `"collect_mineral_shards_64"` (`v0.1.1`): CollectMineralShards map with 64x64 resolution. +* `"collect_mineral_shards_64"` (`v0.1.2`): CollectMineralShards map with 64x64 resolution. -* `"collect_mineral_shards_128"` (`v0.1.1`): CollectMineralShards map with 128x128 resolution. +* `"collect_mineral_shards_128"` (`v0.1.2`): CollectMineralShards map with 128x128 resolution. -* `"move_unit_to_border_64"` (`v0.1.1`): MoveUnitToBorder map with 64x64 resolution. +* `"move_unit_to_border_64"` (`v0.1.2`): MoveUnitToBorder map with 64x64 resolution. -* `"move_unit_to_border_128"` (`v0.1.1`): MoveUnitToBorder map with 128x128 resolution. +* `"move_unit_to_border_128"` (`v0.1.2`): MoveUnitToBorder map with 128x128 resolution. 
-* `"road_trip_with_medivac_64"` (`v0.1.1`): RoadTripWithMedivac map with 64x64 resolution. +* `"road_trip_with_medivac_64"` (`v0.1.2`): RoadTripWithMedivac map with 64x64 resolution. -* `"road_trip_with_medivac_128"` (`v0.1.1`): RoadTripWithMedivac map with 128x128 resolution. +* `"road_trip_with_medivac_128"` (`v0.1.2`): RoadTripWithMedivac map with 128x128 resolution. ### `"starcraft_video/brawl_64"` @@ -1386,7 +1618,26 @@ None ### Citation ``` -Towards Accurate Generative Models of Video: New Metrics & Challenges +@article{DBLP:journals/corr/abs-1812-01717, + author = {Thomas Unterthiner and + Sjoerd van Steenkiste and + Karol Kurach and + Rapha{"{e}}l Marinier and + Marcin Michalski and + Sylvain Gelly}, + title = {Towards Accurate Generative Models of Video: {A} New Metric and + Challenges}, + journal = {CoRR}, + volume = {abs/1812.01717}, + year = {2018}, + url = {http://arxiv.org/abs/1812.01717}, + archivePrefix = {arXiv}, + eprint = {1812.01717}, + timestamp = {Tue, 01 Jan 2019 15:01:25 +0100}, + biburl = {https://dblp.org/rec/bib/journals/corr/abs-1812-01717}, + bibsource = {dblp computer science bibliography, https://dblp.org} +} + ``` --- diff --git a/tensorflow_datasets/core/dataset_builder_test.py b/tensorflow_datasets/core/dataset_builder_test.py index b8fdd6843b3..be7200bd883 100644 --- a/tensorflow_datasets/core/dataset_builder_test.py +++ b/tensorflow_datasets/core/dataset_builder_test.py @@ -120,7 +120,7 @@ def test_shared_generator(self): ] train_data, test_data = [ [el["x"] for el in - dataset_utils.dataset_as_numpy(builder.as_dataset(split=split))] + dataset_utils.as_numpy(builder.as_dataset(split=split))] for split in splits_list ] @@ -144,7 +144,7 @@ def test_load(self): data_dir=tmp_dir, download=True, split=splits_lib.Split.TRAIN) - data = list(dataset_utils.dataset_as_numpy(dataset)) + data = list(dataset_utils.as_numpy(dataset)) self.assertEqual(20, len(data)) self.assertLess(data[0]["x"], 30) @@ -156,7 +156,7 @@ def test_determinism(self): data_dir=tmp_dir, split=splits_lib.Split.TRAIN, as_dataset_kwargs=dict(shuffle_files=False)) - ds_values = list(dataset_utils.dataset_as_numpy(ds)) + ds_values = list(dataset_utils.as_numpy(ds)) # Ensure determinism. 
If this test fail, this mean that numpy random # module isn't always determinist (maybe between version, architecture, @@ -184,10 +184,10 @@ def test_multi_split(self): split=[splits_lib.Split.TRAIN, splits_lib.Split.TEST], as_dataset_kwargs=dict(shuffle_files=False)) - data = list(dataset_utils.dataset_as_numpy(ds_train)) + data = list(dataset_utils.as_numpy(ds_train)) self.assertEqual(20, len(data)) - data = list(dataset_utils.dataset_as_numpy(ds_test)) + data = list(dataset_utils.as_numpy(ds_test)) self.assertEqual(10, len(data)) def test_build_data_dir(self): @@ -257,7 +257,7 @@ def test_with_configs(self): for builder, incr in [(builder1, 1), (builder2, 2)]: train_data, test_data = [ [el["x"] for el in - dataset_utils.dataset_as_numpy(builder.as_dataset(split=split))] + dataset_utils.as_numpy(builder.as_dataset(split=split))] for split in splits_list ] @@ -293,7 +293,7 @@ def setUp(self): @test_utils.run_in_graph_and_eager_modes() def test_all_splits(self): - splits = dataset_utils.dataset_as_numpy( + splits = dataset_utils.as_numpy( self.builder.as_dataset(batch_size=-1)) self.assertSetEqual(set(splits.keys()), set([splits_lib.Split.TRAIN, splits_lib.Split.TEST])) @@ -310,7 +310,7 @@ def test_all_splits(self): @test_utils.run_in_graph_and_eager_modes() def test_with_batch_size(self): - items = list(dataset_utils.dataset_as_numpy(self.builder.as_dataset( + items = list(dataset_utils.as_numpy(self.builder.as_dataset( split=splits_lib.Split.TRAIN + splits_lib.Split.TEST, batch_size=10))) # 3 batches of 10 self.assertEqual(3, len(items)) @@ -322,7 +322,7 @@ def test_with_batch_size(self): @test_utils.run_in_graph_and_eager_modes() def test_supervised_keys(self): - x, _ = dataset_utils.dataset_as_numpy(self.builder.as_dataset( + x, _ = dataset_utils.as_numpy(self.builder.as_dataset( split=splits_lib.Split.TRAIN, as_supervised=True, batch_size=-1)) self.assertEqual(x.shape[0], 20) diff --git a/tensorflow_datasets/core/dataset_info.py b/tensorflow_datasets/core/dataset_info.py index c63b742429e..bbbfd25c46f 100644 --- a/tensorflow_datasets/core/dataset_info.py +++ b/tensorflow_datasets/core/dataset_info.py @@ -410,7 +410,7 @@ def get_dataset_feature_statistics(builder, split): feature_to_min = {} feature_to_max = {} - np_dataset = dataset_utils.dataset_as_numpy(dataset) + np_dataset = dataset_utils.as_numpy(dataset) for example in tqdm.tqdm(np_dataset, unit=" examples"): statistics.num_examples += 1 diff --git a/tensorflow_datasets/core/dataset_utils.py b/tensorflow_datasets/core/dataset_utils.py index 8759ee6a890..978306f0469 100644 --- a/tensorflow_datasets/core/dataset_utils.py +++ b/tensorflow_datasets/core/dataset_utils.py @@ -98,11 +98,18 @@ def _graph_dataset_iterator(ds_item, graph=None): break +def dataset_as_numpy(*args, **kwargs): + """DEPRECATED. Renamed `tfds.as_numpy`.""" + del args, kwargs + raise AttributeError( + "tfds.dataset_as_numpy has been renamed to tfds.as_numpy.") + + @api_utils.disallow_positional_args(allowed=["dataset"]) -def dataset_as_numpy(dataset, graph=None): +def as_numpy(dataset, graph=None): """Converts a `tf.data.Dataset` to an iterable of NumPy arrays. - `dataset_as_numpy` converts a possibly nested structure of `tf.data.Dataset`s + `as_numpy` converts a possibly nested structure of `tf.data.Dataset`s and `tf.Tensor`s to iterables of NumPy arrays and NumPy arrays, respectively. 
Args: @@ -126,7 +133,7 @@ def dataset_as_numpy(dataset, graph=None): types = [type(el) for el in flat_ds] types = tf.nest.pack_sequence_as(nested_ds, types) if not isinstance(ds_el, (tf.Tensor, tf.data.Dataset)): - raise ValueError("Arguments to dataset_as_numpy must be tf.Tensors or " + raise ValueError("Arguments to as_numpy must be tf.Tensors or " "tf.data.Datasets. Got: %s" % types) if tf.executing_eagerly(): diff --git a/tensorflow_datasets/core/dataset_utils_test.py b/tensorflow_datasets/core/dataset_utils_test.py index 883e69b9d06..d9793c4d99e 100644 --- a/tensorflow_datasets/core/dataset_utils_test.py +++ b/tensorflow_datasets/core/dataset_utils_test.py @@ -37,7 +37,7 @@ class DatasetAsNumPyTest(tf.test.TestCase): @test_utils.run_in_graph_and_eager_modes() def test_singleton_tensor(self): t = tf.random.normal((10, 10)) - np_t = dataset_utils.dataset_as_numpy(t) + np_t = dataset_utils.as_numpy(t) self.assertEqual((10, 10), np_t.shape) self.assertEqual(np.float32, np_t.dtype) @@ -46,14 +46,14 @@ def test_nested_tensors(self): t1 = tf.random.normal((10, 10)) t2 = tf.random.normal((10, 20)) nest_tup = (t1, t2) - np_t1, np_t2 = dataset_utils.dataset_as_numpy(nest_tup) + np_t1, np_t2 = dataset_utils.as_numpy(nest_tup) self.assertEqual((10, 10), np_t1.shape) self.assertEqual(np.float32, np_t1.dtype) self.assertEqual((10, 20), np_t2.shape) self.assertEqual(np.float32, np_t2.dtype) nest_dict = {"foo": t1, "bar": {"zoo": t2}} - np_nest_dict = dataset_utils.dataset_as_numpy(nest_dict) + np_nest_dict = dataset_utils.as_numpy(nest_dict) np_t1 = np_nest_dict["foo"] np_t2 = np_nest_dict["bar"]["zoo"] self.assertEqual((10, 10), np_t1.shape) @@ -64,21 +64,21 @@ def test_nested_tensors(self): @test_utils.run_in_graph_and_eager_modes() def test_singleton_dataset(self): ds = _create_dataset(range(10)) - np_ds = dataset_utils.dataset_as_numpy(ds) + np_ds = dataset_utils.as_numpy(ds) self.assertEqual(list(range(10)), [int(el) for el in list(np_ds)]) def test_with_graph(self): with tf.Graph().as_default(): with tf.Graph().as_default() as g: ds = _create_dataset(range(10)) - np_ds = dataset_utils.dataset_as_numpy(ds, graph=g) + np_ds = dataset_utils.as_numpy(ds, graph=g) self.assertEqual(list(range(10)), [int(el) for el in list(np_ds)]) @test_utils.run_in_graph_and_eager_modes() def test_singleton_dataset_with_nested_elements(self): ds = _create_dataset(range(10)) ds = ds.map(lambda el: {"a": el, "b": el + 1, "c": (el + 2, el + 3)}) - np_ds = dataset_utils.dataset_as_numpy(ds) + np_ds = dataset_utils.as_numpy(ds) for i, el in enumerate(np_ds): self.assertEqual(i, el["a"]) self.assertEqual(i + 1, el["b"]) @@ -89,7 +89,7 @@ def test_singleton_dataset_with_nested_elements(self): def test_nested_dataset_sequential_access(self): ds1 = _create_dataset(range(10)) ds2 = _create_dataset(range(10, 20)) - np_ds = dataset_utils.dataset_as_numpy((ds1, {"a": ds2})) + np_ds = dataset_utils.as_numpy((ds1, {"a": ds2})) np_ds1 = np_ds[0] np_ds2 = np_ds[1]["a"] @@ -100,7 +100,7 @@ def test_nested_dataset_sequential_access(self): def test_nested_dataset_simultaneous_access(self): ds1 = _create_dataset(range(10)) ds2 = _create_dataset(range(10, 20)) - np_ds = dataset_utils.dataset_as_numpy((ds1, {"a": ds2})) + np_ds = dataset_utils.as_numpy((ds1, {"a": ds2})) np_ds1 = np_ds[0] np_ds2 = np_ds[1]["a"] @@ -112,7 +112,7 @@ def test_nested_dataset_nested_elements(self): ds1 = _create_dataset(range(10)) ds1 = ds1.map(lambda el: {"a": el, "b": el + 1, "c": (el + 2, el + 3)}) ds2 = _create_dataset(range(10, 20)) - np_ds = 
dataset_utils.dataset_as_numpy((ds1, {"a": ds2})) + np_ds = dataset_utils.as_numpy((ds1, {"a": ds2})) np_ds1 = np_ds[0] np_ds2 = np_ds[1]["a"] @@ -131,7 +131,7 @@ def test_tensors_match(self): dtype=tf.int32, ) - ds = dataset_utils.dataset_as_numpy({"a": t, "b": t}) + ds = dataset_utils.as_numpy({"a": t, "b": t}) # sess.run() should be called a single time for all input. Otherwise input # and target may not match self.assertAllEqual(ds["a"], ds["b"]) diff --git a/tensorflow_datasets/core/registered.py b/tensorflow_datasets/core/registered.py index 0b22aa90313..4bdde59d4e5 100644 --- a/tensorflow_datasets/core/registered.py +++ b/tensorflow_datasets/core/registered.py @@ -185,7 +185,7 @@ def load(name, ``` If you'd like NumPy arrays instead of `tf.data.Dataset`s or `tf.Tensor`s, - you can pass the return value to `tfds.dataset_as_numpy`. + you can pass the return value to `tfds.as_numpy`. Callers must pass arguments as keyword arguments. diff --git a/tensorflow_datasets/core/splits_test.py b/tensorflow_datasets/core/splits_test.py index 2d3bc665284..d01a10cb4e3 100644 --- a/tensorflow_datasets/core/splits_test.py +++ b/tensorflow_datasets/core/splits_test.py @@ -70,7 +70,7 @@ def _generate_examples(self, data): def values(self, split): return [int(v["value"]) for v in - tfds.dataset_as_numpy(self.as_dataset(split=split))] + tfds.as_numpy(self.as_dataset(split=split))] class SplitsUnitTest(tf.test.TestCase): diff --git a/tensorflow_datasets/core/test_utils.py b/tensorflow_datasets/core/test_utils.py index 8ad356bae47..87323aca8cf 100644 --- a/tensorflow_datasets/core/test_utils.py +++ b/tensorflow_datasets/core/test_utils.py @@ -299,7 +299,7 @@ def features_encode_decode(features_dict, example, as_tensor=False): dataset = dataset.map(features_dict.decode_example) if not as_tensor: # Evaluate to numpy array - for el in dataset_utils.dataset_as_numpy(dataset): + for el in dataset_utils.as_numpy(dataset): return el else: if tf.executing_eagerly(): diff --git a/tensorflow_datasets/public_api.py b/tensorflow_datasets/public_api.py index 0a9254abe49..b228ffe9007 100644 --- a/tensorflow_datasets/public_api.py +++ b/tensorflow_datasets/public_api.py @@ -22,6 +22,7 @@ from tensorflow_datasets.core import features from tensorflow_datasets.core import file_format_adapter as file_adapter from tensorflow_datasets.core import units +from tensorflow_datasets.core.dataset_utils import as_numpy from tensorflow_datasets.core.dataset_utils import dataset_as_numpy from tensorflow_datasets.core.download import GenerateMode from tensorflow_datasets.core.registered import builder @@ -32,6 +33,7 @@ __all__ = [ "core", + "as_numpy", "dataset_as_numpy", "download", "features", diff --git a/tensorflow_datasets/scripts/document_datasets.py b/tensorflow_datasets/scripts/document_datasets.py index 04e16f3b2db..9476e706539 100644 --- a/tensorflow_datasets/scripts/document_datasets.py +++ b/tensorflow_datasets/scripts/document_datasets.py @@ -62,7 +62,7 @@ datasets = builder.as_dataset() # If you need NumPy arrays -np_datasets = tfds.dataset_as_numpy(datasets) +np_datasets = tfds.as_numpy(datasets) ``` --- diff --git a/tensorflow_datasets/testing/dataset_builder_testing.py b/tensorflow_datasets/testing/dataset_builder_testing.py index a3536a047af..b2ab0b566ba 100644 --- a/tensorflow_datasets/testing/dataset_builder_testing.py +++ b/tensorflow_datasets/testing/dataset_builder_testing.py @@ -225,7 +225,7 @@ def _assertAsDataset(self, builder): dataset = builder.as_dataset(split=split_name) 
compare_shapes_and_types(builder.info.features.get_tensor_info(), dataset.output_types, dataset.output_shapes) - examples = list(dataset_utils.dataset_as_numpy( + examples = list(dataset_utils.as_numpy( builder.as_dataset(split=split_name))) split_to_checksums[split_name] = set(checksum(rec) for rec in examples) self.assertLen(examples, expected_examples_number) diff --git a/tensorflow_datasets/testing/e2e_binary.py b/tensorflow_datasets/testing/e2e_binary.py index e60f2f52696..8b93b322c81 100644 --- a/tensorflow_datasets/testing/e2e_binary.py +++ b/tensorflow_datasets/testing/e2e_binary.py @@ -43,7 +43,7 @@ def main(argv): print(i) cifar10, info = tfds.load('cifar10', with_info=True) print(cifar10, info) - cifar10_np = tfds.dataset_as_numpy(cifar10) + cifar10_np = tfds.as_numpy(cifar10) print(cifar10_np)