From e19a1c7a006d428b8e46234be5ac565ed6ae4fe6 Mon Sep 17 00:00:00 2001 From: "Mr. Outis" Date: Thu, 17 Oct 2019 01:37:18 -0500 Subject: [PATCH 01/19] s3: list_objects -> list_objects_api --- dvc/remote/s3.py | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/dvc/remote/s3.py b/dvc/remote/s3.py index 51f07f94df..1c33884db8 100644 --- a/dvc/remote/s3.py +++ b/dvc/remote/s3.py @@ -36,7 +36,10 @@ def __init__(self, repo, config): self.endpoint_url = config.get(Config.SECTION_AWS_ENDPOINT_URL) - self.list_objects = config.get(Config.SECTION_AWS_LIST_OBJECTS) + if config.get(Config.SECTION_AWS_LIST_OBJECTS): + self.list_objects_api = "list_objects" + else: + self.list_objects_api = "list_objects_v2" self.use_ssl = config.get(Config.SECTION_AWS_USE_SSL, True) @@ -183,11 +186,7 @@ def remove(self, path_info): def _list_paths(self, bucket, prefix): """ Read config for list object api, paginate through list objects.""" kwargs = {"Bucket": bucket, "Prefix": prefix} - if self.list_objects: - list_objects_api = "list_objects" - else: - list_objects_api = "list_objects_v2" - paginator = self.s3.get_paginator(list_objects_api) + paginator = self.s3.get_paginator(self.list_objects_api) for page in paginator.paginate(**kwargs): contents = page.get("Contents", None) if not contents: From f7093edf5115e38bd61861db12b10fcf9d8bd85d Mon Sep 17 00:00:00 2001 From: "Mr. Outis" Date: Thu, 17 Oct 2019 01:40:31 -0500 Subject: [PATCH 02/19] s3: use path_info for _list_paths --- dvc/remote/s3.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/dvc/remote/s3.py b/dvc/remote/s3.py index 1c33884db8..3b24174d96 100644 --- a/dvc/remote/s3.py +++ b/dvc/remote/s3.py @@ -183,9 +183,9 @@ def remove(self, path_info): logger.debug("Removing {}".format(path_info)) self.s3.delete_object(Bucket=path_info.bucket, Key=path_info.path) - def _list_paths(self, bucket, prefix): + def _list_paths(self, path_info): """ Read config for list object api, paginate through list objects.""" - kwargs = {"Bucket": bucket, "Prefix": prefix} + kwargs = {"Bucket": path_info.bucket, "Prefix": path_info.path} paginator = self.s3.get_paginator(self.list_objects_api) for page in paginator.paginate(**kwargs): contents = page.get("Contents", None) @@ -195,10 +195,10 @@ def _list_paths(self, bucket, prefix): yield item["Key"] def list_cache_paths(self): - return self._list_paths(self.path_info.bucket, self.path_info.path) + return self._list_paths(self.path_info) def exists(self, path_info): - paths = self._list_paths(path_info.bucket, path_info.path) + paths = self._list_paths(path_info) return any(path_info.path == path for path in paths) def _upload(self, from_file, to_info, name=None, no_progress_bar=False): From 19d3ea25d64d8e2f311a143fb10bfa01db22d845 Mon Sep 17 00:00:00 2001 From: "Mr. Outis" Date: Thu, 17 Oct 2019 01:42:42 -0500 Subject: [PATCH 03/19] s3: separate _list_paths from _list_objects --- dvc/remote/s3.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/dvc/remote/s3.py b/dvc/remote/s3.py index 3b24174d96..b655f4f7ca 100644 --- a/dvc/remote/s3.py +++ b/dvc/remote/s3.py @@ -183,7 +183,7 @@ def remove(self, path_info): logger.debug("Removing {}".format(path_info)) self.s3.delete_object(Bucket=path_info.bucket, Key=path_info.path) - def _list_paths(self, path_info): + def _list_objects(self, path_info): """ Read config for list object api, paginate through list objects.""" kwargs = {"Bucket": path_info.bucket, "Prefix": path_info.path} paginator = self.s3.get_paginator(self.list_objects_api) @@ -192,7 +192,10 @@ def _list_paths(self, path_info): if not contents: continue for item in contents: - yield item["Key"] + yield item + + def _list_paths(self, path_info): + return (item["Key"] for item in self._list_objects(path_info)) def list_cache_paths(self): return self._list_paths(self.path_info) From 8fa2a06ec6b66f63a7f8a9ba431d89be531d7b0c Mon Sep 17 00:00:00 2001 From: "Mr. Outis" Date: Thu, 17 Oct 2019 03:05:15 -0500 Subject: [PATCH 04/19] s3: support max items for listing objects/paths --- dvc/remote/s3.py | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/dvc/remote/s3.py b/dvc/remote/s3.py index b655f4f7ca..cb5dcba1d5 100644 --- a/dvc/remote/s3.py +++ b/dvc/remote/s3.py @@ -183,9 +183,13 @@ def remove(self, path_info): logger.debug("Removing {}".format(path_info)) self.s3.delete_object(Bucket=path_info.bucket, Key=path_info.path) - def _list_objects(self, path_info): + def _list_objects(self, path_info, max_items=None): """ Read config for list object api, paginate through list objects.""" - kwargs = {"Bucket": path_info.bucket, "Prefix": path_info.path} + kwargs = { + "Bucket": path_info.bucket, + "Prefix": path_info.path, + "PaginationConfig": {"MaxItems": max_items}, + } paginator = self.s3.get_paginator(self.list_objects_api) for page in paginator.paginate(**kwargs): contents = page.get("Contents", None) @@ -194,8 +198,10 @@ def _list_objects(self, path_info): for item in contents: yield item - def _list_paths(self, path_info): - return (item["Key"] for item in self._list_objects(path_info)) + def _list_paths(self, path_info, max_items=None): + return ( + item["Key"] for item in self._list_objects(path_info, max_items) + ) def list_cache_paths(self): return self._list_paths(self.path_info) From 44b9566093b91a405dc0122ef6e8ec5a323d33bc Mon Sep 17 00:00:00 2001 From: "Mr. Outis" Date: Thu, 17 Oct 2019 03:06:27 -0500 Subject: [PATCH 05/19] s3: take into account directories on `exists` --- dvc/remote/s3.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/dvc/remote/s3.py b/dvc/remote/s3.py index cb5dcba1d5..2dee0b6180 100644 --- a/dvc/remote/s3.py +++ b/dvc/remote/s3.py @@ -2,6 +2,7 @@ import os import logging +import posixpath from funcy import cached_property from dvc.progress import Tqdm @@ -207,8 +208,10 @@ def list_cache_paths(self): return self._list_paths(self.path_info) def exists(self, path_info): - paths = self._list_paths(path_info) - return any(path_info.path == path for path in paths) + dir_path = posixpath.join(path_info.path, "") + fname = next(self._list_paths(path_info, max_items=1), "") + return path_info.path == fname or dir_path in fname + def _upload(self, from_file, to_info, name=None, no_progress_bar=False): total = os.path.getsize(from_file) From 2075bc360ff5f115276ed24ecda1a697bba1e7c2 Mon Sep 17 00:00:00 2001 From: "Mr. Outis" Date: Thu, 17 Oct 2019 04:10:30 -0500 Subject: [PATCH 06/19] s3: overwrite `isdir` method --- dvc/remote/s3.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/dvc/remote/s3.py b/dvc/remote/s3.py index 2dee0b6180..836f7378e5 100644 --- a/dvc/remote/s3.py +++ b/dvc/remote/s3.py @@ -212,6 +212,10 @@ def exists(self, path_info): fname = next(self._list_paths(path_info, max_items=1), "") return path_info.path == fname or dir_path in fname + def isdir(self, path_info): + dir_path = posixpath.join(path_info.path, "") + fname = next(self._list_paths(path_info, max_items=1), "") + return dir_path in fname def _upload(self, from_file, to_info, name=None, no_progress_bar=False): total = os.path.getsize(from_file) From 0f1da32202bff4a5b80d912c60e16780954a59e8 Mon Sep 17 00:00:00 2001 From: "Mr. Outis" Date: Thu, 17 Oct 2019 04:12:45 -0500 Subject: [PATCH 07/19] s3: custom collect directory entry for s3 --- dvc/remote/s3.py | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/dvc/remote/s3.py b/dvc/remote/s3.py index 836f7378e5..f80cb602a3 100644 --- a/dvc/remote/s3.py +++ b/dvc/remote/s3.py @@ -249,3 +249,14 @@ def _generate_download_url(self, path_info, expires=3600): return self.s3.generate_presigned_url( ClientMethod="get_object", Params=params, ExpiresIn=int(expires) ) + + def _collect_dir(self, path_info): + root = path_info.path + + return [ + { + self.PARAM_CHECKSUM: entry["ETag"], + self.PARAM_RELPATH: os.path.relpath(entry["Key"], start=root), + } + for entry in self._list_objects(path_info) + ] From e46560a728faffc306c0b1cc1044a280c09a30c1 Mon Sep 17 00:00:00 2001 From: "Mr. Outis" Date: Mon, 21 Oct 2019 20:17:13 -0500 Subject: [PATCH 08/19] s3: strip checksum when collecting directories --- dvc/remote/s3.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dvc/remote/s3.py b/dvc/remote/s3.py index f80cb602a3..967cadb25a 100644 --- a/dvc/remote/s3.py +++ b/dvc/remote/s3.py @@ -255,7 +255,7 @@ def _collect_dir(self, path_info): return [ { - self.PARAM_CHECKSUM: entry["ETag"], + self.PARAM_CHECKSUM: entry["ETag"].strip('"'), self.PARAM_RELPATH: os.path.relpath(entry["Key"], start=root), } for entry in self._list_objects(path_info) From 0c11c14daec274b8b973e77ac745b2ece436108e Mon Sep 17 00:00:00 2001 From: "Mr. Outis" Date: Tue, 22 Oct 2019 23:40:40 -0500 Subject: [PATCH 09/19] remote: change walk implementation to walk_files --- dvc/remote/base.py | 19 ++++++++----------- dvc/remote/local.py | 6 +++--- dvc/remote/s3.py | 15 ++++----------- dvc/remote/ssh/__init__.py | 6 +++--- 4 files changed, 18 insertions(+), 28 deletions(-) diff --git a/dvc/remote/base.py b/dvc/remote/base.py index 5dd2ca1b6a..f576205d8b 100644 --- a/dvc/remote/base.py +++ b/dvc/remote/base.py @@ -2,6 +2,7 @@ from operator import itemgetter from multiprocessing import cpu_count + import json import logging import tempfile @@ -190,14 +191,13 @@ def _calculate_checksums(self, file_infos): return checksums def _collect_dir(self, path_info): - file_infos = set() - for root, _dirs, files in self.walk(path_info): - if DvcIgnore.DVCIGNORE_FILE in files: - raise DvcIgnoreInCollectedDirError(root) + for fname in self.walk_files(path_info): + if DvcIgnore.DVCIGNORE_FILE == fname.name: + raise DvcIgnoreInCollectedDirError(fname.parent) - file_infos.update(path_info / root / fname for fname in files) + file_infos.add(fname) checksums = {fi: self.state.get(fi) for fi in file_infos} not_in_state = { @@ -466,7 +466,8 @@ def isdir(self, path_info): """ return False - def walk(self, path_info): + def walk_files(self, path_info): + """Return a generator with `PathInfo`s to all the files""" raise NotImplementedError @staticmethod @@ -831,11 +832,7 @@ def _checkout_dir( self.state.save(path_info, checksum) def _remove_redundant_files(self, path_info, dir_info, force): - existing_files = set( - path_info / root / fname - for root, _, files in self.walk(path_info) - for fname in files - ) + existing_files = set(self.walk_files(path_info)) needed_files = { path_info / entry[self.PARAM_RELPATH] for entry in dir_info diff --git a/dvc/remote/local.py b/dvc/remote/local.py index e187f33b32..8a6d0cf5c5 100644 --- a/dvc/remote/local.py +++ b/dvc/remote/local.py @@ -25,7 +25,6 @@ file_md5, walk_files, relpath, - dvc_walk, makedirs, ) from dvc.config import Config @@ -144,8 +143,9 @@ def isdir(path_info): def getsize(path_info): return os.path.getsize(fspath_py35(path_info)) - def walk(self, path_info): - return dvc_walk(path_info, self.repo.dvcignore) + def walk_files(self, path_info): + for fname in walk_files(str(path_info), self.repo.dvcignore): + yield path_info / os.path.relpath(fname, path_info) def get_file_checksum(self, path_info): return file_md5(fspath_py35(path_info))[0] diff --git a/dvc/remote/s3.py b/dvc/remote/s3.py index 967cadb25a..b841814323 100644 --- a/dvc/remote/s3.py +++ b/dvc/remote/s3.py @@ -205,7 +205,7 @@ def _list_paths(self, path_info, max_items=None): ) def list_cache_paths(self): - return self._list_paths(self.path_info) + return self.walk_files(self.path_info) def exists(self, path_info): dir_path = posixpath.join(path_info.path, "") @@ -250,13 +250,6 @@ def _generate_download_url(self, path_info, expires=3600): ClientMethod="get_object", Params=params, ExpiresIn=int(expires) ) - def _collect_dir(self, path_info): - root = path_info.path - - return [ - { - self.PARAM_CHECKSUM: entry["ETag"].strip('"'), - self.PARAM_RELPATH: os.path.relpath(entry["Key"], start=root), - } - for entry in self._list_objects(path_info) - ] + def walk_files(self, path_info, max_items=None): + for fname in self._list_paths(path_info, max_items): + yield path_info / os.path.relpath(fname, path_info.path) diff --git a/dvc/remote/ssh/__init__.py b/dvc/remote/ssh/__init__.py index 75a2a6f980..14242a6a11 100644 --- a/dvc/remote/ssh/__init__.py +++ b/dvc/remote/ssh/__init__.py @@ -260,10 +260,10 @@ def list_cache_paths(self): with self.ssh(self.path_info) as ssh: return list(ssh.walk_files(self.path_info.path)) - def walk(self, path_info): + def walk_files(self, path_info): with self.ssh(path_info) as ssh: - for entry in ssh.walk(path_info.path): - yield entry + for fname in ssh.walk_files(path_info.path): + yield path_info / os.path.relpath(fname, path_info.path) def makedirs(self, path_info): with self.ssh(path_info) as ssh: From c50cba4e3eed57e4cc25f88a78c07cb3795e2d90 Mon Sep 17 00:00:00 2001 From: "Mr. Outis" Date: Wed, 23 Oct 2019 00:17:07 -0500 Subject: [PATCH 10/19] s3: add comment about isdir logic --- dvc/remote/s3.py | 25 ++++++++++++++++++++++--- 1 file changed, 22 insertions(+), 3 deletions(-) diff --git a/dvc/remote/s3.py b/dvc/remote/s3.py index b841814323..43155e3709 100644 --- a/dvc/remote/s3.py +++ b/dvc/remote/s3.py @@ -210,12 +210,31 @@ def list_cache_paths(self): def exists(self, path_info): dir_path = posixpath.join(path_info.path, "") fname = next(self._list_paths(path_info, max_items=1), "") - return path_info.path == fname or dir_path in fname + return path_info.path == fname or fname.startswith(dir_path) def isdir(self, path_info): + # S3 doesn't have a concept for directories. + # + # Using `head_object` with a path pointing to a directory + # will throw a 404 error. + # + # A reliable way to know if a given path is a directory is by + # checking if there are more files sharing the same prefix + # with a `list_objects` call. + # + # We need to make sure that the path ends with a forward slash, + # since we can end with false-positives like the following example: + # + # bucket + # └── data + # ├── alice + # └── alpha + # + # Using `data/al` as prefix will return `[data/alice, data/alpha]`, + # While `data/al/` will return nothing. + # dir_path = posixpath.join(path_info.path, "") - fname = next(self._list_paths(path_info, max_items=1), "") - return dir_path in fname + return bool(self._list_paths(path_info, max_items=1)) def _upload(self, from_file, to_info, name=None, no_progress_bar=False): total = os.path.getsize(from_file) From 793de2460a9b19d7923fa01b650b12fd38ae2962 Mon Sep 17 00:00:00 2001 From: "Mr. Outis" Date: Wed, 23 Oct 2019 02:40:29 -0500 Subject: [PATCH 11/19] s3: use `dir_path` correctly on `isdir` --- dvc/remote/s3.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/dvc/remote/s3.py b/dvc/remote/s3.py index 43155e3709..ee234eb80b 100644 --- a/dvc/remote/s3.py +++ b/dvc/remote/s3.py @@ -233,8 +233,8 @@ def isdir(self, path_info): # Using `data/al` as prefix will return `[data/alice, data/alpha]`, # While `data/al/` will return nothing. # - dir_path = posixpath.join(path_info.path, "") - return bool(self._list_paths(path_info, max_items=1)) + dir_path = path_info / "" + return bool(list(self._list_paths(dir_path, max_items=1))) def _upload(self, from_file, to_info, name=None, no_progress_bar=False): total = os.path.getsize(from_file) From 335ddcb8d5a720b4361a724dd7952545932535a6 Mon Sep 17 00:00:00 2001 From: "Mr. Outis" Date: Wed, 23 Oct 2019 02:40:51 -0500 Subject: [PATCH 12/19] tests: add a few unit tests for S3 --- tests/unit/remote/test_s3.py | 88 ++++++++++++++++++++++++++++++++++++ 1 file changed, 88 insertions(+) create mode 100644 tests/unit/remote/test_s3.py diff --git a/tests/unit/remote/test_s3.py b/tests/unit/remote/test_s3.py new file mode 100644 index 0000000000..1992393a2f --- /dev/null +++ b/tests/unit/remote/test_s3.py @@ -0,0 +1,88 @@ +import boto3 +import os +import pytest +from moto import mock_s3 + +from dvc.remote.s3 import RemoteS3 + + +@pytest.fixture +def aws_credentials(): + """Mocked AWS Credentials for moto.""" + os.environ["AWS_ACCESS_KEY_ID"] = "testing" + os.environ["AWS_SECRET_ACCESS_KEY"] = "testing" + os.environ["AWS_SECURITY_TOKEN"] = "testing" + os.environ["AWS_SESSION_TOKEN"] = "testing" + + +@pytest.fixture +def s3(aws_credentials): + """Returns a connection to a bucket with the following file structure: + + bucket + ├── data + │ ├── alice + │ ├── alpha + │ └── subdir + │ ├── 1 + │ ├── 2 + │ └── 3 + ├── empty_dir + ├── empty_file + └── foo + """ + with mock_s3(): + s3 = boto3.client("s3", region_name="us-east-1") + s3.create_bucket(Bucket="bucket") + + s3.put_object(Bucket="bucket", Key="empty_dir/") + s3.put_object(Bucket="bucket", Key="empty_file", Body=b"") + s3.put_object(Bucket="bucket", Key="foo", Body=b"foo") + s3.put_object(Bucket="bucket", Key="data/alice", Body=b"alice") + s3.put_object(Bucket="bucket", Key="data/alpha", Body=b"alpha") + s3.put_object(Bucket="bucket", Key="data/subdir/1", Body=b"1") + s3.put_object(Bucket="bucket", Key="data/subdir/2", Body=b"2") + s3.put_object(Bucket="bucket", Key="data/subdir/3", Body=b"3") + + yield s3 + + +@pytest.fixture +def remote(): + """Returns RemoteS3 instance to work with the `s3` fixture.""" + yield RemoteS3(None, {"url": "s3://bucket", "region": "us-east-1"}) + + +def test_isdir(s3, remote): + assert remote.isdir(remote.path_info / "data") + assert remote.isdir(remote.path_info / "data/") + assert remote.isdir(remote.path_info / "data/subdir") + assert remote.isdir(remote.path_info / "empty_dir") + assert not remote.isdir(remote.path_info / "foo") + assert not remote.isdir(remote.path_info / "data/alice") + assert not remote.isdir(remote.path_info / "data/al") + assert not remote.isdir(remote.path_info / "data/subdir/1") + + +def test_exists(s3, remote): + assert remote.exists(remote.path_info / "data") + assert remote.exists(remote.path_info / "data/") + assert remote.exists(remote.path_info / "data/subdir") + assert remote.exists(remote.path_info / "empty_dir") + assert remote.exists(remote.path_info / "empty_file") + assert remote.exists(remote.path_info / "foo") + assert remote.exists(remote.path_info / "data/alice") + assert remote.exists(remote.path_info / "data/subdir/1") + assert not remote.exists(remote.path_info / "data/al") + + +def test_walk_files(s3, remote): + files = [ + remote.path_info / "data/alice", + remote.path_info / "data/alpha", + remote.path_info / "data/subdir/1", + remote.path_info / "data/subdir/2", + remote.path_info / "data/subdir/3", + ] + + assert list(remote.walk_files(remote.path_info / "data")) == files From f188a8108c4223a22c2809816c6ab45b0a18ed28 Mon Sep 17 00:00:00 2001 From: "Mr. Outis" Date: Wed, 23 Oct 2019 02:49:51 -0500 Subject: [PATCH 13/19] s3: use path_info join instead of posixpath --- dvc/remote/s3.py | 5 ++--- tests/unit/remote/test_s3.py | 1 + 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/dvc/remote/s3.py b/dvc/remote/s3.py index ee234eb80b..d3e2544595 100644 --- a/dvc/remote/s3.py +++ b/dvc/remote/s3.py @@ -2,7 +2,6 @@ import os import logging -import posixpath from funcy import cached_property from dvc.progress import Tqdm @@ -208,9 +207,9 @@ def list_cache_paths(self): return self.walk_files(self.path_info) def exists(self, path_info): - dir_path = posixpath.join(path_info.path, "") + dir_path = path_info / "" fname = next(self._list_paths(path_info, max_items=1), "") - return path_info.path == fname or fname.startswith(dir_path) + return path_info.path == fname or fname.startswith(dir_path.path) def isdir(self, path_info): # S3 doesn't have a concept for directories. diff --git a/tests/unit/remote/test_s3.py b/tests/unit/remote/test_s3.py index 1992393a2f..c04c532730 100644 --- a/tests/unit/remote/test_s3.py +++ b/tests/unit/remote/test_s3.py @@ -74,6 +74,7 @@ def test_exists(s3, remote): assert remote.exists(remote.path_info / "data/alice") assert remote.exists(remote.path_info / "data/subdir/1") assert not remote.exists(remote.path_info / "data/al") + assert not remote.exists(remote.path_info / "foo/") def test_walk_files(s3, remote): From 2a1cd3151254922bd41914e976ae54e480cc19a9 Mon Sep 17 00:00:00 2001 From: "Mr. Outis" Date: Wed, 23 Oct 2019 17:21:52 -0500 Subject: [PATCH 14/19] tests: use parametrize for s3 tests --- tests/unit/remote/test_s3.py | 100 ++++++++++++++++++++--------------- 1 file changed, 56 insertions(+), 44 deletions(-) diff --git a/tests/unit/remote/test_s3.py b/tests/unit/remote/test_s3.py index c04c532730..d2615969d2 100644 --- a/tests/unit/remote/test_s3.py +++ b/tests/unit/remote/test_s3.py @@ -1,22 +1,14 @@ import boto3 import os import pytest +import mock from moto import mock_s3 from dvc.remote.s3 import RemoteS3 @pytest.fixture -def aws_credentials(): - """Mocked AWS Credentials for moto.""" - os.environ["AWS_ACCESS_KEY_ID"] = "testing" - os.environ["AWS_SECRET_ACCESS_KEY"] = "testing" - os.environ["AWS_SECURITY_TOKEN"] = "testing" - os.environ["AWS_SESSION_TOKEN"] = "testing" - - -@pytest.fixture -def s3(aws_credentials): +def s3(): """Returns a connection to a bucket with the following file structure: bucket @@ -31,20 +23,30 @@ def s3(aws_credentials): ├── empty_file └── foo """ - with mock_s3(): - s3 = boto3.client("s3", region_name="us-east-1") - s3.create_bucket(Bucket="bucket") + # Mocked AWS Credentials for moto. + aws_credentials = { + "AWS_ACCESS_KEY_ID": "testing", + "AWS_SECRET_ACCESS_KEY": "testing", + "AWS_SECURITY_TOKEN": "testing", + "AWS_SESSION_TOKEN": "testing", + } + - s3.put_object(Bucket="bucket", Key="empty_dir/") - s3.put_object(Bucket="bucket", Key="empty_file", Body=b"") - s3.put_object(Bucket="bucket", Key="foo", Body=b"foo") - s3.put_object(Bucket="bucket", Key="data/alice", Body=b"alice") - s3.put_object(Bucket="bucket", Key="data/alpha", Body=b"alpha") - s3.put_object(Bucket="bucket", Key="data/subdir/1", Body=b"1") - s3.put_object(Bucket="bucket", Key="data/subdir/2", Body=b"2") - s3.put_object(Bucket="bucket", Key="data/subdir/3", Body=b"3") + with mock.patch.dict(os.environ, aws_credentials): + with mock_s3(): + s3 = boto3.client("s3", region_name="us-east-1") + s3.create_bucket(Bucket="bucket") - yield s3 + s3.put_object(Bucket="bucket", Key="empty_dir/") + s3.put_object(Bucket="bucket", Key="empty_file", Body=b"") + s3.put_object(Bucket="bucket", Key="foo", Body=b"foo") + s3.put_object(Bucket="bucket", Key="data/alice", Body=b"alice") + s3.put_object(Bucket="bucket", Key="data/alpha", Body=b"alpha") + s3.put_object(Bucket="bucket", Key="data/subdir/1", Body=b"1") + s3.put_object(Bucket="bucket", Key="data/subdir/2", Body=b"2") + s3.put_object(Bucket="bucket", Key="data/subdir/3", Body=b"3") + + yield s3 @pytest.fixture @@ -53,28 +55,38 @@ def remote(): yield RemoteS3(None, {"url": "s3://bucket", "region": "us-east-1"}) -def test_isdir(s3, remote): - assert remote.isdir(remote.path_info / "data") - assert remote.isdir(remote.path_info / "data/") - assert remote.isdir(remote.path_info / "data/subdir") - assert remote.isdir(remote.path_info / "empty_dir") - assert not remote.isdir(remote.path_info / "foo") - assert not remote.isdir(remote.path_info / "data/alice") - assert not remote.isdir(remote.path_info / "data/al") - assert not remote.isdir(remote.path_info / "data/subdir/1") - - -def test_exists(s3, remote): - assert remote.exists(remote.path_info / "data") - assert remote.exists(remote.path_info / "data/") - assert remote.exists(remote.path_info / "data/subdir") - assert remote.exists(remote.path_info / "empty_dir") - assert remote.exists(remote.path_info / "empty_file") - assert remote.exists(remote.path_info / "foo") - assert remote.exists(remote.path_info / "data/alice") - assert remote.exists(remote.path_info / "data/subdir/1") - assert not remote.exists(remote.path_info / "data/al") - assert not remote.exists(remote.path_info / "foo/") +@pytest.mark.parametrize( + "path, result", [ + ("data", True), + ("data/", True), + ("data/subdir", True), + ("empty_dir", True), + ("foo", False), + ("data/alice", False), + ("data/al", False), + ("data/subdir/1", False), + ] +) +def test_isdir(s3, remote, path, result): + assert remote.isdir(remote.path_info / path) == result + + +@pytest.mark.parametrize( + "path, result", [ + ("data", True), + ("data/", True), + ("data/subdir", True), + ("empty_dir", True), + ("empty_file", True), + ("foo", True), + ("data/alice", True), + ("data/subdir/1", True), + ("data/al", False), + ("foo/", False), + ] +) +def test_exists(s3, remote, path, result): + assert remote.exists(remote.path_info / path) == result def test_walk_files(s3, remote): From c31d20c8ac245ccf6bcf2fff7497748678a05da2 Mon Sep 17 00:00:00 2001 From: "Mr. Outis" Date: Wed, 23 Oct 2019 17:26:36 -0500 Subject: [PATCH 15/19] remote: use posixpath accordingly --- dvc/remote/local.py | 4 ++-- dvc/remote/s3.py | 3 ++- dvc/remote/ssh/__init__.py | 3 ++- tests/unit/remote/test_s3.py | 43 ++++++++++++++++++------------------ 4 files changed, 28 insertions(+), 25 deletions(-) diff --git a/dvc/remote/local.py b/dvc/remote/local.py index 8a6d0cf5c5..dff4337566 100644 --- a/dvc/remote/local.py +++ b/dvc/remote/local.py @@ -144,8 +144,8 @@ def getsize(path_info): return os.path.getsize(fspath_py35(path_info)) def walk_files(self, path_info): - for fname in walk_files(str(path_info), self.repo.dvcignore): - yield path_info / os.path.relpath(fname, path_info) + for fname in walk_files(path_info, self.repo.dvcignore): + yield PathInfo(fname) def get_file_checksum(self, path_info): return file_md5(fspath_py35(path_info))[0] diff --git a/dvc/remote/s3.py b/dvc/remote/s3.py index d3e2544595..95e8d3318a 100644 --- a/dvc/remote/s3.py +++ b/dvc/remote/s3.py @@ -2,6 +2,7 @@ import os import logging +import posixpath from funcy import cached_property from dvc.progress import Tqdm @@ -270,4 +271,4 @@ def _generate_download_url(self, path_info, expires=3600): def walk_files(self, path_info, max_items=None): for fname in self._list_paths(path_info, max_items): - yield path_info / os.path.relpath(fname, path_info.path) + yield path_info / posixpath.relpath(fname, path_info.path) diff --git a/dvc/remote/ssh/__init__.py b/dvc/remote/ssh/__init__.py index 14242a6a11..41e35135fb 100644 --- a/dvc/remote/ssh/__init__.py +++ b/dvc/remote/ssh/__init__.py @@ -4,6 +4,7 @@ import itertools import io import os +import posixpath import getpass import logging import threading @@ -263,7 +264,7 @@ def list_cache_paths(self): def walk_files(self, path_info): with self.ssh(path_info) as ssh: for fname in ssh.walk_files(path_info.path): - yield path_info / os.path.relpath(fname, path_info.path) + yield path_info / posixpath.relpath(fname, path_info.path) def makedirs(self, path_info): with self.ssh(path_info) as ssh: diff --git a/tests/unit/remote/test_s3.py b/tests/unit/remote/test_s3.py index d2615969d2..ad3cc7dd23 100644 --- a/tests/unit/remote/test_s3.py +++ b/tests/unit/remote/test_s3.py @@ -31,7 +31,6 @@ def s3(): "AWS_SESSION_TOKEN": "testing", } - with mock.patch.dict(os.environ, aws_credentials): with mock_s3(): s3 = boto3.client("s3", region_name="us-east-1") @@ -56,34 +55,36 @@ def remote(): @pytest.mark.parametrize( - "path, result", [ - ("data", True), - ("data/", True), - ("data/subdir", True), - ("empty_dir", True), - ("foo", False), - ("data/alice", False), - ("data/al", False), + "path, result", + [ + ("data", True), + ("data/", True), + ("data/subdir", True), + ("empty_dir", True), + ("foo", False), + ("data/alice", False), + ("data/al", False), ("data/subdir/1", False), - ] + ], ) def test_isdir(s3, remote, path, result): assert remote.isdir(remote.path_info / path) == result @pytest.mark.parametrize( - "path, result", [ - ("data", True), - ("data/", True), - ("data/subdir", True), - ("empty_dir", True), - ("empty_file", True), - ("foo", True), - ("data/alice", True), + "path, result", + [ + ("data", True), + ("data/", True), + ("data/subdir", True), + ("empty_dir", True), + ("empty_file", True), + ("foo", True), + ("data/alice", True), ("data/subdir/1", True), - ("data/al", False), - ("foo/", False), - ] + ("data/al", False), + ("foo/", False), + ], ) def test_exists(s3, remote, path, result): assert remote.exists(remote.path_info / path) == result From 9c5484fd9b660d2d85da42a65b2018c9ac55c4fc Mon Sep 17 00:00:00 2001 From: "Mr. Outis" Date: Wed, 23 Oct 2019 20:44:57 -0500 Subject: [PATCH 16/19] tests: remove undeeded AWS variables from S3 test --- tests/unit/remote/test_s3.py | 37 ++++++++++++++---------------------- 1 file changed, 14 insertions(+), 23 deletions(-) diff --git a/tests/unit/remote/test_s3.py b/tests/unit/remote/test_s3.py index ad3cc7dd23..dd7032ff02 100644 --- a/tests/unit/remote/test_s3.py +++ b/tests/unit/remote/test_s3.py @@ -23,29 +23,20 @@ def s3(): ├── empty_file └── foo """ - # Mocked AWS Credentials for moto. - aws_credentials = { - "AWS_ACCESS_KEY_ID": "testing", - "AWS_SECRET_ACCESS_KEY": "testing", - "AWS_SECURITY_TOKEN": "testing", - "AWS_SESSION_TOKEN": "testing", - } - - with mock.patch.dict(os.environ, aws_credentials): - with mock_s3(): - s3 = boto3.client("s3", region_name="us-east-1") - s3.create_bucket(Bucket="bucket") - - s3.put_object(Bucket="bucket", Key="empty_dir/") - s3.put_object(Bucket="bucket", Key="empty_file", Body=b"") - s3.put_object(Bucket="bucket", Key="foo", Body=b"foo") - s3.put_object(Bucket="bucket", Key="data/alice", Body=b"alice") - s3.put_object(Bucket="bucket", Key="data/alpha", Body=b"alpha") - s3.put_object(Bucket="bucket", Key="data/subdir/1", Body=b"1") - s3.put_object(Bucket="bucket", Key="data/subdir/2", Body=b"2") - s3.put_object(Bucket="bucket", Key="data/subdir/3", Body=b"3") - - yield s3 + with mock_s3(): + s3 = boto3.client("s3", region_name="us-east-1") + s3.create_bucket(Bucket="bucket") + + s3.put_object(Bucket="bucket", Key="empty_dir/") + s3.put_object(Bucket="bucket", Key="empty_file", Body=b"") + s3.put_object(Bucket="bucket", Key="foo", Body=b"foo") + s3.put_object(Bucket="bucket", Key="data/alice", Body=b"alice") + s3.put_object(Bucket="bucket", Key="data/alpha", Body=b"alpha") + s3.put_object(Bucket="bucket", Key="data/subdir/1", Body=b"1") + s3.put_object(Bucket="bucket", Key="data/subdir/2", Body=b"2") + s3.put_object(Bucket="bucket", Key="data/subdir/3", Body=b"3") + + yield s3 @pytest.fixture From da28c5024a2df68f376ba503470828c1f60212cd Mon Sep 17 00:00:00 2001 From: "Mr. Outis" Date: Wed, 23 Oct 2019 20:45:24 -0500 Subject: [PATCH 17/19] tests: rearrange parametrize values for readability --- tests/unit/remote/test_s3.py | 42 +++++++++++++++++------------------- 1 file changed, 20 insertions(+), 22 deletions(-) diff --git a/tests/unit/remote/test_s3.py b/tests/unit/remote/test_s3.py index dd7032ff02..3b49204174 100644 --- a/tests/unit/remote/test_s3.py +++ b/tests/unit/remote/test_s3.py @@ -1,7 +1,5 @@ import boto3 -import os import pytest -import mock from moto import mock_s3 from dvc.remote.s3 import RemoteS3 @@ -46,16 +44,16 @@ def remote(): @pytest.mark.parametrize( - "path, result", + "result, path", [ - ("data", True), - ("data/", True), - ("data/subdir", True), - ("empty_dir", True), - ("foo", False), - ("data/alice", False), - ("data/al", False), - ("data/subdir/1", False), + (True, "data"), + (True, "data/"), + (True, "data/subdir"), + (True, "empty_dir"), + (False, "foo"), + (False, "data/alice"), + (False, "data/al"), + (False, "data/subdir/1"), ], ) def test_isdir(s3, remote, path, result): @@ -63,18 +61,18 @@ def test_isdir(s3, remote, path, result): @pytest.mark.parametrize( - "path, result", + "result, path", [ - ("data", True), - ("data/", True), - ("data/subdir", True), - ("empty_dir", True), - ("empty_file", True), - ("foo", True), - ("data/alice", True), - ("data/subdir/1", True), - ("data/al", False), - ("foo/", False), + (True, "data"), + (True, "data/"), + (True, "data/subdir"), + (True, "empty_dir"), + (True, "empty_file"), + (True, "foo"), + (True, "data/alice"), + (True, "data/subdir/1"), + (False, "data/al"), + (False, "foo/"), ], ) def test_exists(s3, remote, path, result): From 14835793f9f253f28f5b65864c7558d1d7907a40 Mon Sep 17 00:00:00 2001 From: "Mr. Outis" Date: Thu, 24 Oct 2019 16:21:47 -0500 Subject: [PATCH 18/19] tests: add encoding, use a single s3 instance, for loop instead of parameterize --- tests/unit/remote/test_s3.py | 48 ++++++++++++++++-------------------- 1 file changed, 21 insertions(+), 27 deletions(-) diff --git a/tests/unit/remote/test_s3.py b/tests/unit/remote/test_s3.py index 3b49204174..7861fb5a9a 100644 --- a/tests/unit/remote/test_s3.py +++ b/tests/unit/remote/test_s3.py @@ -1,4 +1,5 @@ -import boto3 +# -*- coding: utf-8 -*- + import pytest from moto import mock_s3 @@ -6,8 +7,8 @@ @pytest.fixture -def s3(): - """Returns a connection to a bucket with the following file structure: +def remote(): + """Returns a RemoteS3 connected to a bucket with the following structure: bucket ├── data @@ -22,9 +23,10 @@ def s3(): └── foo """ with mock_s3(): - s3 = boto3.client("s3", region_name="us-east-1") - s3.create_bucket(Bucket="bucket") + remote = RemoteS3(None, {"url": "s3://bucket", "region": "us-east-1"}) + s3 = remote.s3 + s3.create_bucket(Bucket="bucket") s3.put_object(Bucket="bucket", Key="empty_dir/") s3.put_object(Bucket="bucket", Key="empty_file", Body=b"") s3.put_object(Bucket="bucket", Key="foo", Body=b"foo") @@ -34,18 +36,11 @@ def s3(): s3.put_object(Bucket="bucket", Key="data/subdir/2", Body=b"2") s3.put_object(Bucket="bucket", Key="data/subdir/3", Body=b"3") - yield s3 - - -@pytest.fixture -def remote(): - """Returns RemoteS3 instance to work with the `s3` fixture.""" - yield RemoteS3(None, {"url": "s3://bucket", "region": "us-east-1"}) + yield remote -@pytest.mark.parametrize( - "result, path", - [ +def test_isdir(remote): + test_cases = [ (True, "data"), (True, "data/"), (True, "data/subdir"), @@ -54,15 +49,14 @@ def remote(): (False, "data/alice"), (False, "data/al"), (False, "data/subdir/1"), - ], -) -def test_isdir(s3, remote, path, result): - assert remote.isdir(remote.path_info / path) == result + ] + for expected, path in test_cases: + assert remote.isdir(remote.path_info / path) == expected -@pytest.mark.parametrize( - "result, path", - [ + +def test_exists(remote): + test_cases = [ (True, "data"), (True, "data/"), (True, "data/subdir"), @@ -73,13 +67,13 @@ def test_isdir(s3, remote, path, result): (True, "data/subdir/1"), (False, "data/al"), (False, "foo/"), - ], -) -def test_exists(s3, remote, path, result): - assert remote.exists(remote.path_info / path) == result + ] + + for expected, path in test_cases: + assert remote.exists(remote.path_info / path) == expected -def test_walk_files(s3, remote): +def test_walk_files(remote): files = [ remote.path_info / "data/alice", remote.path_info / "data/alpha", From d4cd49348574360e67e6fecc4201159e26b8d382 Mon Sep 17 00:00:00 2001 From: "Mr. Outis" Date: Thu, 24 Oct 2019 18:45:46 -0500 Subject: [PATCH 19/19] py2: fix encoding --- dvc/remote/s3.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/dvc/remote/s3.py b/dvc/remote/s3.py index 95e8d3318a..cef7f447c8 100644 --- a/dvc/remote/s3.py +++ b/dvc/remote/s3.py @@ -1,3 +1,5 @@ +# -*- coding: utf-8 -*- + from __future__ import unicode_literals import os