From af2f216c5e9b169ec728bbfd43fa3138b7b7e323 Mon Sep 17 00:00:00 2001 From: Dani Hodovic Date: Fri, 22 Nov 2019 20:40:46 +0100 Subject: [PATCH 01/23] get: copy regular files Allows `dvc get` to copy regular files or directories. fixes: #2515 --- dvc/repo/get.py | 52 ++++++++++++++++++++++++---------------- tests/basic_env.py | 4 +++- tests/func/test_get.py | 24 ++++++++++++++++++- tests/func/test_repro.py | 8 +++---- 4 files changed, 61 insertions(+), 27 deletions(-) diff --git a/dvc/repo/get.py b/dvc/repo/get.py index 21d01ed7e7..ee47656787 100644 --- a/dvc/repo/get.py +++ b/dvc/repo/get.py @@ -1,5 +1,6 @@ import logging import os +import shutil import shortuuid @@ -29,32 +30,41 @@ def get(url, path, out=None, rev=None): # reflink and/or hardlink. Not using tempfile.TemporaryDirectory # because it will create a symlink to tmpfs, which defeats the purpose # and won't work with reflink/hardlink. - dpath = os.path.dirname(os.path.abspath(out)) + abspath = os.path.abspath(out) + dpath = os.path.dirname(abspath) tmp_dir = os.path.join(dpath, "." + str(shortuuid.uuid())) try: with external_repo(cache_dir=tmp_dir, url=url, rev=rev) as repo: - # Note: we need to replace state, because in case of getting DVC - # dependency on CIFS or NFS filesystems, sqlite-based state - # will be unable to obtain lock - repo.state = StateNoop() + tmp_repo_path = os.path.join(repo.root_dir, path) + if os.path.exists(tmp_repo_path): + if os.path.isdir(tmp_repo_path): + shutil.copytree(tmp_repo_path, abspath) + else: + shutil.copy2(tmp_repo_path, abspath) + else: + # Note: we need to replace state, because in case of getting + # DVC dependency on CIFS or NFS filesystems, sqlite-based state + # will be unable to obtain lock + repo.state = StateNoop() - # Try any links possible to avoid data duplication. - # - # Not using symlink, because we need to remove cache after we are - # done, and to make that work we would have to copy data over - # anyway before removing the cache, so we might just copy it - # right away. - # - # Also, we can't use theoretical "move" link type here, because - # the same cache file might be used a few times in a directory. - repo.cache.local.cache_types = ["reflink", "hardlink", "copy"] + # Try any links possible to avoid data duplication. + # + # Not using symlink, because we need to remove cache after we + # are done, and to make that work we would have to copy data + # over anyway before removing the cache, so we might just copy + # it right away. + # + # Also, we can't use theoretical "move" link type here, because + # the same cache file might be used a few times in a directory. + repo.cache.local.cache_types = ["reflink", "hardlink", "copy"] - o = repo.find_out_by_relpath(path) - with repo.state: - repo.cloud.pull(o.get_used_cache()) - o.path_info = PathInfo(os.path.abspath(out)) - with o.repo.state: - o.checkout() + o = repo.find_out_by_relpath(path) + + with repo.state: + repo.cloud.pull(o.get_used_cache()) + o.path_info = PathInfo(os.path.abspath(out)) + with o.repo.state: + o.checkout() except NotDvcRepoError: raise UrlNotDvcRepoError(url) diff --git a/tests/basic_env.py b/tests/basic_env.py index 1fba67a550..a2f059df6b 100644 --- a/tests/basic_env.py +++ b/tests/basic_env.py @@ -41,7 +41,8 @@ class TestDirFixture(object): # in tests, we replace foo with bar, so we need to make sure that when we # modify a file in our tests, its content length changes. BAR_CONTENTS = BAR + "r" - CODE = "code.py" + CODE_DIR = "code" + CODE = "code/code.py" CODE_CONTENTS = ( "import sys\nimport shutil\n" "shutil.copyfile(sys.argv[1], sys.argv[2])" @@ -90,6 +91,7 @@ def setUp(self): self._pushd(self._root_dir) self.create(self.FOO, self.FOO_CONTENTS) self.create(self.BAR, self.BAR_CONTENTS) + os.mkdir(self.CODE_DIR) self.create(self.CODE, self.CODE_CONTENTS) os.mkdir(self.DATA_DIR) os.mkdir(self.DATA_SUB_DIR) diff --git a/tests/func/test_get.py b/tests/func/test_get.py index 07551c5f36..7c2fccfe47 100644 --- a/tests/func/test_get.py +++ b/tests/func/test_get.py @@ -38,7 +38,29 @@ def test_get_repo_dir(erepo): trees_equal(src, dst) -def test_cache_type_is_properly_overridden(erepo): +def test_get_regular_file(repo_dir, erepo): + src = erepo.CODE + dst = erepo.CODE + + Repo.get(erepo.root_dir, src, dst) + + assert os.path.exists(dst) + assert os.path.isfile(dst) + assert filecmp.cmp(repo_dir.CODE, dst, shallow=False) + + +def test_get_regular_dir(repo_dir, erepo): + src = erepo.CODE_DIR + dst = erepo.CODE_DIR + + Repo.get(erepo.root_dir, src, dst) + + assert os.path.exists(dst) + assert os.path.isdir(dst) + trees_equal(src, dst) + + +def test_cache_type_is_properly_overridden(repo_dir, erepo): erepo.dvc.config.set( Config.SECTION_CACHE, Config.SECTION_CACHE_TYPE, "symlink" ) diff --git a/tests/func/test_repro.py b/tests/func/test_repro.py index a85995fe9e..5caf5c40e4 100644 --- a/tests/func/test_repro.py +++ b/tests/func/test_repro.py @@ -738,9 +738,9 @@ def test(self): os.mkdir(dname) foo = os.path.join(dname, self.FOO) bar = os.path.join(dname, self.BAR) - code = os.path.join(dname, self.CODE) + code_dir = os.path.join(dname, self.CODE_DIR) shutil.copyfile(self.FOO, foo) - shutil.copyfile(self.CODE, code) + shutil.copytree(self.CODE_DIR, code_dir) ret = main( [ @@ -776,9 +776,9 @@ def test(self): os.mkdir(dname) foo = os.path.join(dname, self.FOO) bar = os.path.join(dname, self.BAR) - code = os.path.join(dname, self.CODE) + code_dir = os.path.join(dname, self.CODE_DIR) shutil.copyfile(self.FOO, foo) - shutil.copyfile(self.CODE, code) + shutil.copytree(self.CODE_DIR, code_dir) ret = main( [ From 4d142569957ac103d11302b08cba711e23c64563 Mon Sep 17 00:00:00 2001 From: Dani Hodovic Date: Sat, 23 Nov 2019 14:57:56 +0100 Subject: [PATCH 02/23] fixup! get: copy regular files --- tests/basic_env.py | 13 ++++++++++--- tests/func/test_get.py | 10 +++++----- tests/func/test_repro.py | 8 ++++---- tests/func/test_tree.py | 9 ++++++--- 4 files changed, 25 insertions(+), 15 deletions(-) diff --git a/tests/basic_env.py b/tests/basic_env.py index a2f059df6b..d27130fbf2 100644 --- a/tests/basic_env.py +++ b/tests/basic_env.py @@ -41,14 +41,19 @@ class TestDirFixture(object): # in tests, we replace foo with bar, so we need to make sure that when we # modify a file in our tests, its content length changes. BAR_CONTENTS = BAR + "r" - CODE_DIR = "code" - CODE = "code/code.py" + CODE = "code.py" CODE_CONTENTS = ( "import sys\nimport shutil\n" "shutil.copyfile(sys.argv[1], sys.argv[2])" ) UNICODE = "тест" UNICODE_CONTENTS = "проверка" + REGULAR_DIR = "lib" + REGULAR_FILE = os.path.join(REGULAR_DIR, "file.txt") + REGULAR_FILE_CONTENTS = """ + A file used to test retrieval of regular files tracked by git that + are not stored in a dvc remote backend. + """ def __init__(self): root_dir = self.mkdtemp() @@ -91,13 +96,14 @@ def setUp(self): self._pushd(self._root_dir) self.create(self.FOO, self.FOO_CONTENTS) self.create(self.BAR, self.BAR_CONTENTS) - os.mkdir(self.CODE_DIR) self.create(self.CODE, self.CODE_CONTENTS) os.mkdir(self.DATA_DIR) os.mkdir(self.DATA_SUB_DIR) self.create(self.DATA, self.DATA_CONTENTS) self.create(self.DATA_SUB, self.DATA_SUB_CONTENTS) self.create(self.UNICODE, self.UNICODE_CONTENTS) + os.mkdir(self.REGULAR_DIR) + self.create(self.REGULAR_FILE, self.REGULAR_FILE_CONTENTS) def tearDown(self): self._popd() @@ -139,6 +145,7 @@ def setUp(self): raise self.git.index.add([self.CODE]) + self.git.index.add([self.REGULAR_DIR]) self.git.index.commit("add code") def tearDown(self): diff --git a/tests/func/test_get.py b/tests/func/test_get.py index 7c2fccfe47..e66f0d96bb 100644 --- a/tests/func/test_get.py +++ b/tests/func/test_get.py @@ -39,19 +39,19 @@ def test_get_repo_dir(erepo): def test_get_regular_file(repo_dir, erepo): - src = erepo.CODE - dst = erepo.CODE + src = erepo.REGULAR_FILE + dst = erepo.REGULAR_FILE + "_imported" Repo.get(erepo.root_dir, src, dst) assert os.path.exists(dst) assert os.path.isfile(dst) - assert filecmp.cmp(repo_dir.CODE, dst, shallow=False) + assert filecmp.cmp(src, dst, shallow=False) def test_get_regular_dir(repo_dir, erepo): - src = erepo.CODE_DIR - dst = erepo.CODE_DIR + src = erepo.REGULAR_DIR + dst = erepo.REGULAR_DIR + "_imported" Repo.get(erepo.root_dir, src, dst) diff --git a/tests/func/test_repro.py b/tests/func/test_repro.py index 5caf5c40e4..a85995fe9e 100644 --- a/tests/func/test_repro.py +++ b/tests/func/test_repro.py @@ -738,9 +738,9 @@ def test(self): os.mkdir(dname) foo = os.path.join(dname, self.FOO) bar = os.path.join(dname, self.BAR) - code_dir = os.path.join(dname, self.CODE_DIR) + code = os.path.join(dname, self.CODE) shutil.copyfile(self.FOO, foo) - shutil.copytree(self.CODE_DIR, code_dir) + shutil.copyfile(self.CODE, code) ret = main( [ @@ -776,9 +776,9 @@ def test(self): os.mkdir(dname) foo = os.path.join(dname, self.FOO) bar = os.path.join(dname, self.BAR) - code_dir = os.path.join(dname, self.CODE_DIR) + code = os.path.join(dname, self.CODE) shutil.copyfile(self.FOO, foo) - shutil.copytree(self.CODE_DIR, code_dir) + shutil.copyfile(self.CODE, code) ret = main( [ diff --git a/tests/func/test_tree.py b/tests/func/test_tree.py index c2ae316c21..6563035250 100644 --- a/tests/func/test_tree.py +++ b/tests/func/test_tree.py @@ -115,9 +115,10 @@ def test(self): [ ( self._root_dir, - ["data_dir"], + ["lib", "data_dir"], ["code.py", "bar", "тест", "foo"], ), + (join(self._root_dir, "lib"), [], ["file.txt"]), (join(self._root_dir, "data_dir"), ["data_sub_dir"], ["data"]), ( join(self._root_dir, "data_dir", "data_sub_dir"), @@ -151,9 +152,10 @@ def test_nobranch(self): [ ( self._root_dir, - ["data_dir"], + ["lib", "data_dir"], ["bar", "тест", "code.py", "foo"], ), + (join(self._root_dir, "lib"), [], ["file.txt"]), (join(self._root_dir, "data_dir"), ["data_sub_dir"], ["data"]), ( join(self._root_dir, "data_dir", "data_sub_dir"), @@ -181,13 +183,14 @@ def test_branch(self): self.assertWalkEqual( tree.walk("."), [ - (self._root_dir, ["data_dir"], ["code.py"]), + (self._root_dir, ["data_dir", "lib"], ["code.py"]), (join(self._root_dir, "data_dir"), ["data_sub_dir"], []), ( join(self._root_dir, "data_dir", "data_sub_dir"), [], ["data_sub"], ), + (join(self._root_dir, "lib"), [], ["file.txt"]), ], ) self.assertWalkEqual( From 97862a3d5fa2b5a979e4c1c8fdde280ad7368880 Mon Sep 17 00:00:00 2001 From: Dani Hodovic Date: Sat, 23 Nov 2019 15:26:32 +0100 Subject: [PATCH 03/23] Make tree.walk() output deterministic --- dvc/utils/__init__.py | 5 +++++ tests/func/test_tree.py | 4 ++-- 2 files changed, 7 insertions(+), 2 deletions(-) diff --git a/dvc/utils/__init__.py b/dvc/utils/__init__.py index 68ba418778..3a24a4f562 100644 --- a/dvc/utils/__init__.py +++ b/dvc/utils/__init__.py @@ -296,6 +296,11 @@ def dvc_walk(top, dvcignore, topdown=True, onerror=None, followlinks=False): for root, dirs, files in os.walk( top, topdown=topdown, onerror=onerror, followlinks=followlinks ): + # Dirs are returned in arbitrary order on different platforms. Force a + # specific order so that we can make deterministic assumptions on the + # directory order. + # https://stackoverflow.com/a/54773660/2966951 + dirs.sort() if dvcignore: dirs[:], files[:] = dvcignore(root, dirs, files) diff --git a/tests/func/test_tree.py b/tests/func/test_tree.py index 6563035250..6aa2f58565 100644 --- a/tests/func/test_tree.py +++ b/tests/func/test_tree.py @@ -118,13 +118,13 @@ def test(self): ["lib", "data_dir"], ["code.py", "bar", "тест", "foo"], ), - (join(self._root_dir, "lib"), [], ["file.txt"]), (join(self._root_dir, "data_dir"), ["data_sub_dir"], ["data"]), ( join(self._root_dir, "data_dir", "data_sub_dir"), [], ["data_sub"], ), + (join(self._root_dir, "lib"), [], ["file.txt"]), ], ) @@ -155,13 +155,13 @@ def test_nobranch(self): ["lib", "data_dir"], ["bar", "тест", "code.py", "foo"], ), - (join(self._root_dir, "lib"), [], ["file.txt"]), (join(self._root_dir, "data_dir"), ["data_sub_dir"], ["data"]), ( join(self._root_dir, "data_dir", "data_sub_dir"), [], ["data_sub"], ), + (join(self._root_dir, "lib"), [], ["file.txt"]), ], ) self.assertWalkEqual( From 8c3ffe816da1cc740cb499370c1925a696e39766 Mon Sep 17 00:00:00 2001 From: Dani Hodovic Date: Tue, 26 Nov 2019 17:07:21 +0100 Subject: [PATCH 04/23] fixup! Make tree.walk() output deterministic --- dvc/utils/__init__.py | 5 ----- tests/basic_env.py | 9 --------- tests/func/test_get.py | 13 ++++++++----- tests/func/test_tree.py | 9 +++------ 4 files changed, 11 insertions(+), 25 deletions(-) diff --git a/dvc/utils/__init__.py b/dvc/utils/__init__.py index 3a24a4f562..68ba418778 100644 --- a/dvc/utils/__init__.py +++ b/dvc/utils/__init__.py @@ -296,11 +296,6 @@ def dvc_walk(top, dvcignore, topdown=True, onerror=None, followlinks=False): for root, dirs, files in os.walk( top, topdown=topdown, onerror=onerror, followlinks=followlinks ): - # Dirs are returned in arbitrary order on different platforms. Force a - # specific order so that we can make deterministic assumptions on the - # directory order. - # https://stackoverflow.com/a/54773660/2966951 - dirs.sort() if dvcignore: dirs[:], files[:] = dvcignore(root, dirs, files) diff --git a/tests/basic_env.py b/tests/basic_env.py index d27130fbf2..1fba67a550 100644 --- a/tests/basic_env.py +++ b/tests/basic_env.py @@ -48,12 +48,6 @@ class TestDirFixture(object): ) UNICODE = "тест" UNICODE_CONTENTS = "проверка" - REGULAR_DIR = "lib" - REGULAR_FILE = os.path.join(REGULAR_DIR, "file.txt") - REGULAR_FILE_CONTENTS = """ - A file used to test retrieval of regular files tracked by git that - are not stored in a dvc remote backend. - """ def __init__(self): root_dir = self.mkdtemp() @@ -102,8 +96,6 @@ def setUp(self): self.create(self.DATA, self.DATA_CONTENTS) self.create(self.DATA_SUB, self.DATA_SUB_CONTENTS) self.create(self.UNICODE, self.UNICODE_CONTENTS) - os.mkdir(self.REGULAR_DIR) - self.create(self.REGULAR_FILE, self.REGULAR_FILE_CONTENTS) def tearDown(self): self._popd() @@ -145,7 +137,6 @@ def setUp(self): raise self.git.index.add([self.CODE]) - self.git.index.add([self.REGULAR_DIR]) self.git.index.commit("add code") def tearDown(self): diff --git a/tests/func/test_get.py b/tests/func/test_get.py index e66f0d96bb..15034eef58 100644 --- a/tests/func/test_get.py +++ b/tests/func/test_get.py @@ -38,9 +38,10 @@ def test_get_repo_dir(erepo): trees_equal(src, dst) -def test_get_regular_file(repo_dir, erepo): - src = erepo.REGULAR_FILE - dst = erepo.REGULAR_FILE + "_imported" +def test_get_regular_file(erepo): + src = os.path.join(erepo.root_dir, "file") + dst = src + "_imported" + erepo.create(src, "hello") Repo.get(erepo.root_dir, src, dst) @@ -50,9 +51,11 @@ def test_get_regular_file(repo_dir, erepo): def test_get_regular_dir(repo_dir, erepo): - src = erepo.REGULAR_DIR - dst = erepo.REGULAR_DIR + "_imported" + src = os.path.join(erepo.root_dir, "directory") + dst = src + "_imported" + os.mkdir(src) + erepo.create(os.path.join(src, "file"), "hello") Repo.get(erepo.root_dir, src, dst) assert os.path.exists(dst) diff --git a/tests/func/test_tree.py b/tests/func/test_tree.py index 6aa2f58565..c2ae316c21 100644 --- a/tests/func/test_tree.py +++ b/tests/func/test_tree.py @@ -115,7 +115,7 @@ def test(self): [ ( self._root_dir, - ["lib", "data_dir"], + ["data_dir"], ["code.py", "bar", "тест", "foo"], ), (join(self._root_dir, "data_dir"), ["data_sub_dir"], ["data"]), @@ -124,7 +124,6 @@ def test(self): [], ["data_sub"], ), - (join(self._root_dir, "lib"), [], ["file.txt"]), ], ) @@ -152,7 +151,7 @@ def test_nobranch(self): [ ( self._root_dir, - ["lib", "data_dir"], + ["data_dir"], ["bar", "тест", "code.py", "foo"], ), (join(self._root_dir, "data_dir"), ["data_sub_dir"], ["data"]), @@ -161,7 +160,6 @@ def test_nobranch(self): [], ["data_sub"], ), - (join(self._root_dir, "lib"), [], ["file.txt"]), ], ) self.assertWalkEqual( @@ -183,14 +181,13 @@ def test_branch(self): self.assertWalkEqual( tree.walk("."), [ - (self._root_dir, ["data_dir", "lib"], ["code.py"]), + (self._root_dir, ["data_dir"], ["code.py"]), (join(self._root_dir, "data_dir"), ["data_sub_dir"], []), ( join(self._root_dir, "data_dir", "data_sub_dir"), [], ["data_sub"], ), - (join(self._root_dir, "lib"), [], ["file.txt"]), ], ) self.assertWalkEqual( From 26b62331c7ce591ad87fc76c1ddee1d2af588119 Mon Sep 17 00:00:00 2001 From: Dani Hodovic Date: Tue, 26 Nov 2019 17:51:47 +0100 Subject: [PATCH 05/23] CLI --help changes --- dvc/command/get.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/dvc/command/get.py b/dvc/command/get.py index 77235ecef1..0ec3321ce2 100644 --- a/dvc/command/get.py +++ b/dvc/command/get.py @@ -33,7 +33,7 @@ def run(self): def add_parser(subparsers, parent_parser): - GET_HELP = "Download data from DVC repository." + GET_HELP = "Download files or directories from DVC repository." get_parser = subparsers.add_parser( "get", parents=[parent_parser], @@ -44,10 +44,10 @@ def add_parser(subparsers, parent_parser): get_parser.add_argument( "url", help="URL of Git repository with DVC project to download from." ) - get_parser.add_argument("path", help="Path to data within DVC repository.") get_parser.add_argument( - "-o", "--out", nargs="?", help="Destination path to put data to." + "path", help="Path to a file or directory within a DVC repository." ) + get_parser.add_argument("-o", "--out", nargs="?", help="Destination path.") get_parser.add_argument( "--rev", nargs="?", help="DVC repository git revision." ) From 7e1423d13c06ad80ca92a5e8081b220bc966abe7 Mon Sep 17 00:00:00 2001 From: Dani Hodovic Date: Thu, 28 Nov 2019 15:08:01 +0100 Subject: [PATCH 06/23] fix tests --- tests/func/test_get.py | 24 ++++++++++++++---------- 1 file changed, 14 insertions(+), 10 deletions(-) diff --git a/tests/func/test_get.py b/tests/func/test_get.py index 15034eef58..f8143fc341 100644 --- a/tests/func/test_get.py +++ b/tests/func/test_get.py @@ -39,24 +39,28 @@ def test_get_repo_dir(erepo): def test_get_regular_file(erepo): - src = os.path.join(erepo.root_dir, "file") - dst = src + "_imported" - erepo.create(src, "hello") + src = os.path.join(erepo.root_dir, "some_file") + dst = os.path.join(os.getcwd(), "_imported") - Repo.get(erepo.root_dir, src, dst) + erepo.create(src, "hello") + erepo.dvc.scm.add([src]) + erepo.dvc.scm.commit("add a regular file") + Repo.get(erepo.root_dir, os.path.basename(src), os.path.basename(dst)) assert os.path.exists(dst) assert os.path.isfile(dst) assert filecmp.cmp(src, dst, shallow=False) -def test_get_regular_dir(repo_dir, erepo): - src = os.path.join(erepo.root_dir, "directory") - dst = src + "_imported" +def test_get_regular_dir(erepo): + src = os.path.join(erepo.root_dir, "some_directory") + src_file = os.path.join(src, "file.txt") + dst = os.path.join(os.getcwd(), "_imported") - os.mkdir(src) - erepo.create(os.path.join(src, "file"), "hello") - Repo.get(erepo.root_dir, src, dst) + erepo.create(src_file, "hello") + erepo.dvc.scm.add([src_file]) + erepo.dvc.scm.commit("add a regular dir") + Repo.get(erepo.root_dir, os.path.basename(src), os.path.basename(dst)) assert os.path.exists(dst) assert os.path.isdir(dst) From 1f5da3ffef8b2d6dde329e9826106db01bad706b Mon Sep 17 00:00:00 2001 From: Dani Hodovic Date: Thu, 28 Nov 2019 15:22:22 +0100 Subject: [PATCH 07/23] fix CLI docs --- dvc/command/get.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/dvc/command/get.py b/dvc/command/get.py index 0ec3321ce2..e3172782d6 100644 --- a/dvc/command/get.py +++ b/dvc/command/get.py @@ -33,7 +33,7 @@ def run(self): def add_parser(subparsers, parent_parser): - GET_HELP = "Download files or directories from DVC repository." + GET_HELP = "Download/copy files or directories from DVC repository." get_parser = subparsers.add_parser( "get", parents=[parent_parser], @@ -47,7 +47,12 @@ def add_parser(subparsers, parent_parser): get_parser.add_argument( "path", help="Path to a file or directory within a DVC repository." ) - get_parser.add_argument("-o", "--out", nargs="?", help="Destination path.") + get_parser.add_argument( + "-o", + "--out", + nargs="?", + help="Destination path to copy/download files to.", + ) get_parser.add_argument( "--rev", nargs="?", help="DVC repository git revision." ) From 74f385111797fcafa100df9ec1c3380bb3902ae2 Mon Sep 17 00:00:00 2001 From: Dani Hodovic Date: Mon, 2 Dec 2019 18:27:14 +0100 Subject: [PATCH 08/23] Disallow absolute paths in `dvc get` --- dvc/exceptions.py | 9 +++++++++ dvc/repo/get.py | 4 ++++ tests/func/test_get.py | 8 ++++++++ 3 files changed, 21 insertions(+) diff --git a/dvc/exceptions.py b/dvc/exceptions.py index e5fb6f00d5..1ced270321 100644 --- a/dvc/exceptions.py +++ b/dvc/exceptions.py @@ -268,6 +268,15 @@ def __init__(self, path, cause=None): ) +class NoAbsolutePathError(DvcException): + def __init__(self, path, cause=None): + super(NoAbsolutePathError, self).__init__( + "The path '{}' is an absolute path. ".format(path) + + "Provide a relative path.", + cause=cause, + ) + + class DvcIgnoreInCollectedDirError(DvcException): def __init__(self, ignore_dirname): super(DvcIgnoreInCollectedDirError, self).__init__( diff --git a/dvc/repo/get.py b/dvc/repo/get.py index ee47656787..bb2482302a 100644 --- a/dvc/repo/get.py +++ b/dvc/repo/get.py @@ -8,6 +8,7 @@ from dvc.exceptions import NotDvcRepoError from dvc.exceptions import OutputNotFoundError from dvc.exceptions import UrlNotDvcRepoError +from dvc.exceptions import NoAbsolutePathError from dvc.external_repo import external_repo from dvc.path_info import PathInfo from dvc.stage import Stage @@ -20,6 +21,9 @@ @staticmethod def get(url, path, out=None, rev=None): + if os.path.isabs(path): + raise NoAbsolutePathError(path) + out = resolve_output(path, out) if Stage.is_valid_filename(out): diff --git a/tests/func/test_get.py b/tests/func/test_get.py index f8143fc341..ce0a7ab420 100644 --- a/tests/func/test_get.py +++ b/tests/func/test_get.py @@ -9,6 +9,7 @@ from dvc.config import Config from dvc.exceptions import GetDVCFileError from dvc.exceptions import UrlNotDvcRepoError +from dvc.exceptions import NoAbsolutePathError from dvc.repo import Repo from dvc.system import System from dvc.utils import makedirs @@ -106,6 +107,13 @@ def test_get_a_dvc_file(erepo): Repo.get(erepo.root_dir, "some_file.dvc") +# Prevent `Repo.get()` from copying any file on the filesystem +# https://github.com/iterative/dvc/pull/2837#discussion_r352123053 +def test_fails_with_absolute_paths(erepo): + with pytest.raises(NoAbsolutePathError): + Repo.get(erepo.root_dir, "/root/") + + @pytest.mark.parametrize("dname", [".", "dir", "dir/subdir"]) def test_get_to_dir(dname, erepo): src = erepo.FOO From cadbf10e646b1ea6a2f13446228d27583595f5c2 Mon Sep 17 00:00:00 2001 From: Dani Hodovic Date: Mon, 2 Dec 2019 18:44:23 +0100 Subject: [PATCH 09/23] dont use absolute paths in tests --- tests/func/test_get.py | 27 ++++++++++++++------------- 1 file changed, 14 insertions(+), 13 deletions(-) diff --git a/tests/func/test_get.py b/tests/func/test_get.py index ce0a7ab420..c263acc3c2 100644 --- a/tests/func/test_get.py +++ b/tests/func/test_get.py @@ -40,32 +40,33 @@ def test_get_repo_dir(erepo): def test_get_regular_file(erepo): - src = os.path.join(erepo.root_dir, "some_file") - dst = os.path.join(os.getcwd(), "_imported") + src = "some_file" + dst = "some_file_imported" - erepo.create(src, "hello") - erepo.dvc.scm.add([src]) + src_path = os.path.join(erepo.root_dir, src) + erepo.create(src_path, "hello") + erepo.dvc.scm.add([src_path]) erepo.dvc.scm.commit("add a regular file") - Repo.get(erepo.root_dir, os.path.basename(src), os.path.basename(dst)) + Repo.get(erepo.root_dir, src, dst) assert os.path.exists(dst) assert os.path.isfile(dst) - assert filecmp.cmp(src, dst, shallow=False) + assert filecmp.cmp(src_path, dst, shallow=False) def test_get_regular_dir(erepo): - src = os.path.join(erepo.root_dir, "some_directory") - src_file = os.path.join(src, "file.txt") - dst = os.path.join(os.getcwd(), "_imported") + src = "some_directory" + dst = "some_directory_imported" - erepo.create(src_file, "hello") - erepo.dvc.scm.add([src_file]) + src_file_path = os.path.join(erepo.root_dir, src, "file.txt") + erepo.create(src_file_path, "hello") + erepo.dvc.scm.add([src_file_path]) erepo.dvc.scm.commit("add a regular dir") - Repo.get(erepo.root_dir, os.path.basename(src), os.path.basename(dst)) + Repo.get(erepo.root_dir, src, dst) assert os.path.exists(dst) assert os.path.isdir(dst) - trees_equal(src, dst) + trees_equal(os.path.join(erepo.root_dir, src), dst) def test_cache_type_is_properly_overridden(repo_dir, erepo): From ed104978546eb015becd65bdd06d382ffc3cdb84 Mon Sep 17 00:00:00 2001 From: Dani Hodovic Date: Mon, 2 Dec 2019 20:26:42 +0100 Subject: [PATCH 10/23] Remove unused var --- tests/func/test_get.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/func/test_get.py b/tests/func/test_get.py index c263acc3c2..11ab26fafc 100644 --- a/tests/func/test_get.py +++ b/tests/func/test_get.py @@ -69,7 +69,7 @@ def test_get_regular_dir(erepo): trees_equal(os.path.join(erepo.root_dir, src), dst) -def test_cache_type_is_properly_overridden(repo_dir, erepo): +def test_cache_type_is_properly_overridden(erepo): erepo.dvc.config.set( Config.SECTION_CACHE, Config.SECTION_CACHE_TYPE, "symlink" ) From 9ece4ed8add874d7a7d42bf8ae84bbdfcc0c2b19 Mon Sep 17 00:00:00 2001 From: Dani Hodovic Date: Tue, 3 Dec 2019 14:01:58 +0100 Subject: [PATCH 11/23] ensure absolute paths for dvc files work but not for git --- dvc/exceptions.py | 8 ++--- dvc/repo/get.py | 75 +++++++++++++++++++++++------------------- tests/func/test_get.py | 26 ++++++++++++--- 3 files changed, 66 insertions(+), 43 deletions(-) diff --git a/dvc/exceptions.py b/dvc/exceptions.py index 1ced270321..6909bcea08 100644 --- a/dvc/exceptions.py +++ b/dvc/exceptions.py @@ -268,12 +268,10 @@ def __init__(self, path, cause=None): ) -class NoAbsolutePathError(DvcException): +class FileOutsideRepoError(DvcException): def __init__(self, path, cause=None): - super(NoAbsolutePathError, self).__init__( - "The path '{}' is an absolute path. ".format(path) - + "Provide a relative path.", - cause=cause, + super(FileOutsideRepoError, self).__init__( + "The path '{}' is outside the target repository.".format(path) ) diff --git a/dvc/repo/get.py b/dvc/repo/get.py index bb2482302a..24d1ae81a1 100644 --- a/dvc/repo/get.py +++ b/dvc/repo/get.py @@ -8,7 +8,8 @@ from dvc.exceptions import NotDvcRepoError from dvc.exceptions import OutputNotFoundError from dvc.exceptions import UrlNotDvcRepoError -from dvc.exceptions import NoAbsolutePathError +from dvc.exceptions import NoOutputInExternalRepoError +from dvc.exceptions import FileOutsideRepoError from dvc.external_repo import external_repo from dvc.path_info import PathInfo from dvc.stage import Stage @@ -19,11 +20,24 @@ logger = logging.getLogger(__name__) +def copy_git_file(repo, src, dst): + try: + # Assert that the file does indeed exist in git before attempting to + # copy it. + repo.scm.repo.head.commit.tree[src] # noqa + except KeyError: + raise FileOutsideRepoError(src) + + src_full_path = os.path.join(repo.root_dir, src) + dst_full_path = os.path.abspath(dst) + if os.path.isdir(src_full_path): + shutil.copytree(src_full_path, dst_full_path) + else: + shutil.copy2(src_full_path, dst_full_path) + + @staticmethod def get(url, path, out=None, rev=None): - if os.path.isabs(path): - raise NoAbsolutePathError(path) - out = resolve_output(path, out) if Stage.is_valid_filename(out): @@ -34,42 +48,35 @@ def get(url, path, out=None, rev=None): # reflink and/or hardlink. Not using tempfile.TemporaryDirectory # because it will create a symlink to tmpfs, which defeats the purpose # and won't work with reflink/hardlink. - abspath = os.path.abspath(out) - dpath = os.path.dirname(abspath) + dpath = os.path.dirname(os.path.abspath(out)) tmp_dir = os.path.join(dpath, "." + str(shortuuid.uuid())) try: with external_repo(cache_dir=tmp_dir, url=url, rev=rev) as repo: - tmp_repo_path = os.path.join(repo.root_dir, path) - if os.path.exists(tmp_repo_path): - if os.path.isdir(tmp_repo_path): - shutil.copytree(tmp_repo_path, abspath) - else: - shutil.copy2(tmp_repo_path, abspath) - else: - # Note: we need to replace state, because in case of getting - # DVC dependency on CIFS or NFS filesystems, sqlite-based state - # will be unable to obtain lock - repo.state = StateNoop() - - # Try any links possible to avoid data duplication. - # - # Not using symlink, because we need to remove cache after we - # are done, and to make that work we would have to copy data - # over anyway before removing the cache, so we might just copy - # it right away. - # - # Also, we can't use theoretical "move" link type here, because - # the same cache file might be used a few times in a directory. - repo.cache.local.cache_types = ["reflink", "hardlink", "copy"] + # Note: we need to replace state, because in case of getting DVC + # dependency on CIFS or NFS filesystems, sqlite-based state + # will be unable to obtain lock + repo.state = StateNoop() - o = repo.find_out_by_relpath(path) + # Try any links possible to avoid data duplication. + # + # Not using symlink, because we need to remove cache after we are + # done, and to make that work we would have to copy data over + # anyway before removing the cache, so we might just copy it + # right away. + # + # Also, we can't use theoretical "move" link type here, because + # the same cache file might be used a few times in a directory. + repo.cache.local.cache_types = ["reflink", "hardlink", "copy"] - with repo.state: - repo.cloud.pull(o.get_used_cache()) - o.path_info = PathInfo(os.path.abspath(out)) - with o.repo.state: - o.checkout() + o = repo.find_out_by_relpath(path) + with repo.state: + repo.cloud.pull(o.get_used_cache()) + o.path_info = PathInfo(os.path.abspath(out)) + with o.repo.state: + o.checkout() + except NoOutputInExternalRepoError: + copy_git_file(repo, path, out) except NotDvcRepoError: raise UrlNotDvcRepoError(url) except OutputNotFoundError: diff --git a/tests/func/test_get.py b/tests/func/test_get.py index 11ab26fafc..cf35f4bc56 100644 --- a/tests/func/test_get.py +++ b/tests/func/test_get.py @@ -9,7 +9,7 @@ from dvc.config import Config from dvc.exceptions import GetDVCFileError from dvc.exceptions import UrlNotDvcRepoError -from dvc.exceptions import NoAbsolutePathError +from dvc.exceptions import FileOutsideRepoError from dvc.repo import Repo from dvc.system import System from dvc.utils import makedirs @@ -108,10 +108,28 @@ def test_get_a_dvc_file(erepo): Repo.get(erepo.root_dir, "some_file.dvc") -# Prevent `Repo.get()` from copying any file on the filesystem # https://github.com/iterative/dvc/pull/2837#discussion_r352123053 -def test_fails_with_absolute_paths(erepo): - with pytest.raises(NoAbsolutePathError): +def test_get_full_dvc_path(erepo): + external_data_dir = erepo.mkdtemp() + external_data = os.path.join(external_data_dir, "ext_data") + with open(external_data, "w+") as fobj: + fobj.write("ext_data") + + cur_dir = os.getcwd() + os.chdir(erepo.root_dir) + erepo.dvc.add(external_data) + erepo.dvc.scm.add(["ext_data.dvc"]) + erepo.dvc.scm.commit("add external data") + os.chdir(cur_dir) + + Repo.get(erepo.root_dir, external_data, "ext_data_imported") + assert os.path.isfile("ext_data_imported") + assert filecmp.cmp(external_data, "ext_data_imported", shallow=False) + + +# https://github.com/iterative/dvc/pull/2837#discussion_r352123053 +def test_fails_with_non_git_files(erepo): + with pytest.raises(FileOutsideRepoError): Repo.get(erepo.root_dir, "/root/") From f829338e1eecb94735bbb13f97661e07457428cb Mon Sep 17 00:00:00 2001 From: Dani Hodovic Date: Tue, 3 Dec 2019 14:06:49 +0100 Subject: [PATCH 12/23] if instead of try / catch --- dvc/repo/get.py | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/dvc/repo/get.py b/dvc/repo/get.py index 24d1ae81a1..d07251f82f 100644 --- a/dvc/repo/get.py +++ b/dvc/repo/get.py @@ -21,11 +21,9 @@ def copy_git_file(repo, src, dst): - try: - # Assert that the file does indeed exist in git before attempting to - # copy it. - repo.scm.repo.head.commit.tree[src] # noqa - except KeyError: + # Assert that the file does indeed exist in git before attempting to + # copy it. + if src not in repo.scm.repo.head.commit.tree: raise FileOutsideRepoError(src) src_full_path = os.path.join(repo.root_dir, src) From 97793b4fb3f1bb6563a9654799b4513e3db17221 Mon Sep 17 00:00:00 2001 From: Dani Hodovic Date: Tue, 3 Dec 2019 14:08:53 +0100 Subject: [PATCH 13/23] underscore private function --- dvc/repo/get.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/dvc/repo/get.py b/dvc/repo/get.py index d07251f82f..69ab2d5382 100644 --- a/dvc/repo/get.py +++ b/dvc/repo/get.py @@ -20,7 +20,7 @@ logger = logging.getLogger(__name__) -def copy_git_file(repo, src, dst): +def _copy_git_file(repo, src, dst): # Assert that the file does indeed exist in git before attempting to # copy it. if src not in repo.scm.repo.head.commit.tree: @@ -74,7 +74,7 @@ def get(url, path, out=None, rev=None): o.checkout() except NoOutputInExternalRepoError: - copy_git_file(repo, path, out) + _copy_git_file(repo, path, out) except NotDvcRepoError: raise UrlNotDvcRepoError(url) except OutputNotFoundError: From 9930e96390748e44b4c5d8de4a38e84ae66d5d48 Mon Sep 17 00:00:00 2001 From: Dani Hodovic Date: Tue, 3 Dec 2019 14:50:11 +0100 Subject: [PATCH 14/23] fixup! ensure absolute paths for dvc files work but not for git --- dvc/repo/get.py | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/dvc/repo/get.py b/dvc/repo/get.py index 69ab2d5382..7ea5998447 100644 --- a/dvc/repo/get.py +++ b/dvc/repo/get.py @@ -21,13 +21,12 @@ def _copy_git_file(repo, src, dst): - # Assert that the file does indeed exist in git before attempting to - # copy it. - if src not in repo.scm.repo.head.commit.tree: - raise FileOutsideRepoError(src) - src_full_path = os.path.join(repo.root_dir, src) dst_full_path = os.path.abspath(dst) + + if repo.root_dir not in src_full_path: + raise FileOutsideRepoError(src) + if os.path.isdir(src_full_path): shutil.copytree(src_full_path, dst_full_path) else: From 38bed227c65463550e3fd624d6ff2f60cb6606ef Mon Sep 17 00:00:00 2001 From: Dani Hodovic Date: Wed, 4 Dec 2019 17:28:01 +0100 Subject: [PATCH 15/23] fixup --- dvc/exceptions.py | 6 +++--- dvc/repo/get.py | 7 ++++--- tests/func/test_get.py | 4 ++-- 3 files changed, 9 insertions(+), 8 deletions(-) diff --git a/dvc/exceptions.py b/dvc/exceptions.py index 6909bcea08..237b0d2e5a 100644 --- a/dvc/exceptions.py +++ b/dvc/exceptions.py @@ -268,9 +268,9 @@ def __init__(self, path, cause=None): ) -class FileOutsideRepoError(DvcException): - def __init__(self, path, cause=None): - super(FileOutsideRepoError, self).__init__( +class PathOutsideRepoError(DvcException): + def __init__(self, path): + super(PathOutsideRepoError, self).__init__( "The path '{}' is outside the target repository.".format(path) ) diff --git a/dvc/repo/get.py b/dvc/repo/get.py index 7ea5998447..f986b31d15 100644 --- a/dvc/repo/get.py +++ b/dvc/repo/get.py @@ -9,13 +9,14 @@ from dvc.exceptions import OutputNotFoundError from dvc.exceptions import UrlNotDvcRepoError from dvc.exceptions import NoOutputInExternalRepoError -from dvc.exceptions import FileOutsideRepoError +from dvc.exceptions import PathOutsideRepoError from dvc.external_repo import external_repo from dvc.path_info import PathInfo from dvc.stage import Stage from dvc.state import StateNoop from dvc.utils import resolve_output from dvc.utils.fs import remove +from dvc.utils.fs import path_isin logger = logging.getLogger(__name__) @@ -24,8 +25,8 @@ def _copy_git_file(repo, src, dst): src_full_path = os.path.join(repo.root_dir, src) dst_full_path = os.path.abspath(dst) - if repo.root_dir not in src_full_path: - raise FileOutsideRepoError(src) + if not path_isin(src_full_path, repo.root_dir): + raise PathOutsideRepoError(src) if os.path.isdir(src_full_path): shutil.copytree(src_full_path, dst_full_path) diff --git a/tests/func/test_get.py b/tests/func/test_get.py index cf35f4bc56..5d519194db 100644 --- a/tests/func/test_get.py +++ b/tests/func/test_get.py @@ -9,7 +9,7 @@ from dvc.config import Config from dvc.exceptions import GetDVCFileError from dvc.exceptions import UrlNotDvcRepoError -from dvc.exceptions import FileOutsideRepoError +from dvc.exceptions import PathOutsideRepoError from dvc.repo import Repo from dvc.system import System from dvc.utils import makedirs @@ -129,7 +129,7 @@ def test_get_full_dvc_path(erepo): # https://github.com/iterative/dvc/pull/2837#discussion_r352123053 def test_fails_with_non_git_files(erepo): - with pytest.raises(FileOutsideRepoError): + with pytest.raises(PathOutsideRepoError): Repo.get(erepo.root_dir, "/root/") From 3c2633e33ffe6eee3217c321f142ed4a221309c4 Mon Sep 17 00:00:00 2001 From: Dani Hodovic Date: Wed, 4 Dec 2019 22:59:55 +0100 Subject: [PATCH 16/23] Pass all tests --- dvc/exceptions.py | 7 ------ dvc/repo/get.py | 52 ++++++++++++++++++++---------------------- tests/func/test_get.py | 20 ++++++++++++++-- 3 files changed, 43 insertions(+), 36 deletions(-) diff --git a/dvc/exceptions.py b/dvc/exceptions.py index 237b0d2e5a..e5fb6f00d5 100644 --- a/dvc/exceptions.py +++ b/dvc/exceptions.py @@ -268,13 +268,6 @@ def __init__(self, path, cause=None): ) -class PathOutsideRepoError(DvcException): - def __init__(self, path): - super(PathOutsideRepoError, self).__init__( - "The path '{}' is outside the target repository.".format(path) - ) - - class DvcIgnoreInCollectedDirError(DvcException): def __init__(self, ignore_dirname): super(DvcIgnoreInCollectedDirError, self).__init__( diff --git a/dvc/repo/get.py b/dvc/repo/get.py index f986b31d15..9409e2ad87 100644 --- a/dvc/repo/get.py +++ b/dvc/repo/get.py @@ -8,8 +8,6 @@ from dvc.exceptions import NotDvcRepoError from dvc.exceptions import OutputNotFoundError from dvc.exceptions import UrlNotDvcRepoError -from dvc.exceptions import NoOutputInExternalRepoError -from dvc.exceptions import PathOutsideRepoError from dvc.external_repo import external_repo from dvc.path_info import PathInfo from dvc.stage import Stage @@ -25,9 +23,6 @@ def _copy_git_file(repo, src, dst): src_full_path = os.path.join(repo.root_dir, src) dst_full_path = os.path.abspath(dst) - if not path_isin(src_full_path, repo.root_dir): - raise PathOutsideRepoError(src) - if os.path.isdir(src_full_path): shutil.copytree(src_full_path, dst_full_path) else: @@ -50,31 +45,34 @@ def get(url, path, out=None, rev=None): tmp_dir = os.path.join(dpath, "." + str(shortuuid.uuid())) try: with external_repo(cache_dir=tmp_dir, url=url, rev=rev) as repo: - # Note: we need to replace state, because in case of getting DVC - # dependency on CIFS or NFS filesystems, sqlite-based state - # will be unable to obtain lock - repo.state = StateNoop() + full_path = os.path.join(repo.root_dir, path) + path_in_repo = path_isin(full_path, repo.root_dir) + if os.path.exists(full_path) and path_in_repo: + _copy_git_file(repo, path, out) + else: + # Note: we need to replace state, because in case of getting + # DVC dependency on CIFS or NFS filesystems, sqlite-based state + # will be unable to obtain lock + repo.state = StateNoop() - # Try any links possible to avoid data duplication. - # - # Not using symlink, because we need to remove cache after we are - # done, and to make that work we would have to copy data over - # anyway before removing the cache, so we might just copy it - # right away. - # - # Also, we can't use theoretical "move" link type here, because - # the same cache file might be used a few times in a directory. - repo.cache.local.cache_types = ["reflink", "hardlink", "copy"] + # Try any links possible to avoid data duplication. + # + # Not using symlink, because we need to remove cache after we + # are done, and to make that work we would have to copy data + # over anyway before removing the cache, so we might just copy + # it right away. + # + # Also, we can't use theoretical "move" link type here, because + # the same cache file might be used a few times in a directory. + repo.cache.local.cache_types = ["reflink", "hardlink", "copy"] - o = repo.find_out_by_relpath(path) - with repo.state: - repo.cloud.pull(o.get_used_cache()) - o.path_info = PathInfo(os.path.abspath(out)) - with o.repo.state: - o.checkout() + o = repo.find_out_by_relpath(path) + with repo.state: + repo.cloud.pull(o.get_used_cache()) + o.path_info = PathInfo(os.path.abspath(out)) + with o.repo.state: + o.checkout() - except NoOutputInExternalRepoError: - _copy_git_file(repo, path, out) except NotDvcRepoError: raise UrlNotDvcRepoError(url) except OutputNotFoundError: diff --git a/tests/func/test_get.py b/tests/func/test_get.py index 5d519194db..d9e098c8e7 100644 --- a/tests/func/test_get.py +++ b/tests/func/test_get.py @@ -3,13 +3,14 @@ import filecmp import logging import os +import tempfile import pytest from dvc.config import Config from dvc.exceptions import GetDVCFileError from dvc.exceptions import UrlNotDvcRepoError -from dvc.exceptions import PathOutsideRepoError +from dvc.exceptions import NoOutputInExternalRepoError from dvc.repo import Repo from dvc.system import System from dvc.utils import makedirs @@ -127,9 +128,24 @@ def test_get_full_dvc_path(erepo): assert filecmp.cmp(external_data, "ext_data_imported", shallow=False) +def test_non_cached_output(request, erepo): + os.chdir(erepo.root_dir) + erepo.dvc.run( + outs_no_cache=["non_cached_file"], cmd="echo hello > non_cached_file" + ) + erepo.dvc.scm.add(["non_cached_file", "non_cached_file.dvc"]) + erepo.dvc.scm.commit("add non-cached output") + new_dir = tempfile.mkdtemp(prefix=request.node.name + "_") + os.chdir(new_dir) + Repo.get(erepo.root_dir, "non_cached_file") + assert os.path.isfile("non_cached_file") + src = os.path.join(erepo.root_dir, "non_cached_file") + assert filecmp.cmp(src, "non_cached_file", shallow=False) + + # https://github.com/iterative/dvc/pull/2837#discussion_r352123053 def test_fails_with_non_git_files(erepo): - with pytest.raises(PathOutsideRepoError): + with pytest.raises(NoOutputInExternalRepoError): Repo.get(erepo.root_dir, "/root/") From 20e7696da366218237b9b88667353ac0bec99623 Mon Sep 17 00:00:00 2001 From: Dani Hodovic Date: Wed, 4 Dec 2019 23:14:17 +0100 Subject: [PATCH 17/23] return early --- dvc/repo/get.py | 43 ++++++++++++++++++++++--------------------- 1 file changed, 22 insertions(+), 21 deletions(-) diff --git a/dvc/repo/get.py b/dvc/repo/get.py index 9409e2ad87..0219d41706 100644 --- a/dvc/repo/get.py +++ b/dvc/repo/get.py @@ -49,29 +49,30 @@ def get(url, path, out=None, rev=None): path_in_repo = path_isin(full_path, repo.root_dir) if os.path.exists(full_path) and path_in_repo: _copy_git_file(repo, path, out) - else: - # Note: we need to replace state, because in case of getting - # DVC dependency on CIFS or NFS filesystems, sqlite-based state - # will be unable to obtain lock - repo.state = StateNoop() + return - # Try any links possible to avoid data duplication. - # - # Not using symlink, because we need to remove cache after we - # are done, and to make that work we would have to copy data - # over anyway before removing the cache, so we might just copy - # it right away. - # - # Also, we can't use theoretical "move" link type here, because - # the same cache file might be used a few times in a directory. - repo.cache.local.cache_types = ["reflink", "hardlink", "copy"] + # Note: we need to replace state, because in case of getting DVC + # dependency on CIFS or NFS filesystems, sqlite-based state + # will be unable to obtain lock + repo.state = StateNoop() - o = repo.find_out_by_relpath(path) - with repo.state: - repo.cloud.pull(o.get_used_cache()) - o.path_info = PathInfo(os.path.abspath(out)) - with o.repo.state: - o.checkout() + # Try any links possible to avoid data duplication. + # + # Not using symlink, because we need to remove cache after we are + # done, and to make that work we would have to copy data over + # anyway before removing the cache, so we might just copy it + # right away. + # + # Also, we can't use theoretical "move" link type here, because + # the same cache file might be used a few times in a directory. + repo.cache.local.cache_types = ["reflink", "hardlink", "copy"] + + o = repo.find_out_by_relpath(path) + with repo.state: + repo.cloud.pull(o.get_used_cache()) + o.path_info = PathInfo(os.path.abspath(out)) + with o.repo.state: + o.checkout() except NotDvcRepoError: raise UrlNotDvcRepoError(url) From b17d1c5b04d1a1ad78f13f8fbc79e21e3d7fa611 Mon Sep 17 00:00:00 2001 From: Dani Hodovic Date: Thu, 5 Dec 2019 00:33:23 +0100 Subject: [PATCH 18/23] fixup --- dvc/repo/get.py | 4 +--- tests/func/test_get.py | 9 ++++----- 2 files changed, 5 insertions(+), 8 deletions(-) diff --git a/dvc/repo/get.py b/dvc/repo/get.py index 0219d41706..2d1e04c449 100644 --- a/dvc/repo/get.py +++ b/dvc/repo/get.py @@ -14,7 +14,6 @@ from dvc.state import StateNoop from dvc.utils import resolve_output from dvc.utils.fs import remove -from dvc.utils.fs import path_isin logger = logging.getLogger(__name__) @@ -46,8 +45,7 @@ def get(url, path, out=None, rev=None): try: with external_repo(cache_dir=tmp_dir, url=url, rev=rev) as repo: full_path = os.path.join(repo.root_dir, path) - path_in_repo = path_isin(full_path, repo.root_dir) - if os.path.exists(full_path) and path_in_repo: + if os.path.exists(full_path) and not os.path.isabs(path): _copy_git_file(repo, path, out) return diff --git a/tests/func/test_get.py b/tests/func/test_get.py index d9e098c8e7..2dbd62daf3 100644 --- a/tests/func/test_get.py +++ b/tests/func/test_get.py @@ -3,7 +3,6 @@ import filecmp import logging import os -import tempfile import pytest @@ -128,18 +127,18 @@ def test_get_full_dvc_path(erepo): assert filecmp.cmp(external_data, "ext_data_imported", shallow=False) -def test_non_cached_output(request, erepo): +def test_non_cached_output(tmp_path, erepo): os.chdir(erepo.root_dir) erepo.dvc.run( outs_no_cache=["non_cached_file"], cmd="echo hello > non_cached_file" ) erepo.dvc.scm.add(["non_cached_file", "non_cached_file.dvc"]) erepo.dvc.scm.commit("add non-cached output") - new_dir = tempfile.mkdtemp(prefix=request.node.name + "_") - os.chdir(new_dir) + os.chdir(tmp_path) Repo.get(erepo.root_dir, "non_cached_file") - assert os.path.isfile("non_cached_file") + src = os.path.join(erepo.root_dir, "non_cached_file") + assert os.path.isfile("non_cached_file") assert filecmp.cmp(src, "non_cached_file", shallow=False) From 9ae4ab88bf790d463c541d7fe8752bba131e758c Mon Sep 17 00:00:00 2001 From: Dani Hodovic Date: Thu, 5 Dec 2019 18:55:22 +0100 Subject: [PATCH 19/23] fix error message for git files --- dvc/repo/get.py | 26 ++++++++++++++++++++------ tests/func/test_get.py | 12 +++++++++++- 2 files changed, 31 insertions(+), 7 deletions(-) diff --git a/dvc/repo/get.py b/dvc/repo/get.py index 2d1e04c449..5902575c62 100644 --- a/dvc/repo/get.py +++ b/dvc/repo/get.py @@ -8,6 +8,7 @@ from dvc.exceptions import NotDvcRepoError from dvc.exceptions import OutputNotFoundError from dvc.exceptions import UrlNotDvcRepoError +from dvc.exceptions import FileMissingError from dvc.external_repo import external_repo from dvc.path_info import PathInfo from dvc.stage import Stage @@ -18,14 +19,28 @@ logger = logging.getLogger(__name__) +def _is_git_file(repo, path): + if not os.path.isabs(path): + try: + output = repo.find_out_by_relpath(path) + if not output.use_cache: + return True + except OutputNotFoundError: + return True + return False + + def _copy_git_file(repo, src, dst): src_full_path = os.path.join(repo.root_dir, src) dst_full_path = os.path.abspath(dst) - if os.path.isdir(src_full_path): - shutil.copytree(src_full_path, dst_full_path) - else: - shutil.copy2(src_full_path, dst_full_path) + try: + if os.path.isdir(src_full_path): + shutil.copytree(src_full_path, dst_full_path) + else: + shutil.copy2(src_full_path, dst_full_path) + except FileNotFoundError: + raise FileMissingError(src) @staticmethod @@ -44,8 +59,7 @@ def get(url, path, out=None, rev=None): tmp_dir = os.path.join(dpath, "." + str(shortuuid.uuid())) try: with external_repo(cache_dir=tmp_dir, url=url, rev=rev) as repo: - full_path = os.path.join(repo.root_dir, path) - if os.path.exists(full_path) and not os.path.isabs(path): + if _is_git_file(repo, path): _copy_git_file(repo, path, out) return diff --git a/tests/func/test_get.py b/tests/func/test_get.py index 2dbd62daf3..82c19b2171 100644 --- a/tests/func/test_get.py +++ b/tests/func/test_get.py @@ -10,6 +10,7 @@ from dvc.exceptions import GetDVCFileError from dvc.exceptions import UrlNotDvcRepoError from dvc.exceptions import NoOutputInExternalRepoError +from dvc.exceptions import FileMissingError from dvc.repo import Repo from dvc.system import System from dvc.utils import makedirs @@ -143,11 +144,20 @@ def test_non_cached_output(tmp_path, erepo): # https://github.com/iterative/dvc/pull/2837#discussion_r352123053 -def test_fails_with_non_git_files(erepo): +def test_fails_with_files_outside_repo(erepo): with pytest.raises(NoOutputInExternalRepoError): Repo.get(erepo.root_dir, "/root/") +def test_fails_with_non_existing_files(erepo): + fname = "file_does_not_exist" + with pytest.raises( + FileMissingError, + match="Can't find '{}' neither locally nor on remote".format(fname), + ): + Repo.get(erepo.root_dir, fname) + + @pytest.mark.parametrize("dname", [".", "dir", "dir/subdir"]) def test_get_to_dir(dname, erepo): src = erepo.FOO From 7debaca22e8d3e4b99c6ddd54a373ed6d99b3f9d Mon Sep 17 00:00:00 2001 From: Dani Hodovic Date: Thu, 5 Dec 2019 21:02:18 +0100 Subject: [PATCH 20/23] fixup! fix error message for git files --- dvc/repo/get.py | 1 + tests/func/test_get.py | 3 ++- 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/dvc/repo/get.py b/dvc/repo/get.py index 5902575c62..8f82fc8e90 100644 --- a/dvc/repo/get.py +++ b/dvc/repo/get.py @@ -15,6 +15,7 @@ from dvc.state import StateNoop from dvc.utils import resolve_output from dvc.utils.fs import remove +from dvc.utils.compat import FileNotFoundError logger = logging.getLogger(__name__) diff --git a/tests/func/test_get.py b/tests/func/test_get.py index 82c19b2171..930afd1550 100644 --- a/tests/func/test_get.py +++ b/tests/func/test_get.py @@ -15,6 +15,7 @@ from dvc.system import System from dvc.utils import makedirs from dvc.utils.compat import fspath +from dvc.utils import fspath_py35 from tests.utils import trees_equal @@ -135,7 +136,7 @@ def test_non_cached_output(tmp_path, erepo): ) erepo.dvc.scm.add(["non_cached_file", "non_cached_file.dvc"]) erepo.dvc.scm.commit("add non-cached output") - os.chdir(tmp_path) + os.chdir(fspath_py35(tmp_path)) Repo.get(erepo.root_dir, "non_cached_file") src = os.path.join(erepo.root_dir, "non_cached_file") From 811855ade22655d29fcc281914d7bb1c9983575f Mon Sep 17 00:00:00 2001 From: Dani Hodovic Date: Thu, 5 Dec 2019 22:54:10 +0100 Subject: [PATCH 21/23] fixes --- dvc/exceptions.py | 6 ++++++ dvc/repo/get.py | 47 +++++++++++++++++++++--------------------- tests/func/test_get.py | 10 ++++----- 3 files changed, 35 insertions(+), 28 deletions(-) diff --git a/dvc/exceptions.py b/dvc/exceptions.py index e5fb6f00d5..fbb8c7eaab 100644 --- a/dvc/exceptions.py +++ b/dvc/exceptions.py @@ -268,6 +268,12 @@ def __init__(self, path, cause=None): ) +class PathOutsideRepoError(DvcException): + def __init__(self, path, repo): + msg = "The path '{}' does not exist in the target repository '{}'." + super(PathOutsideRepoError, self).__init__(msg.format(path, repo)) + + class DvcIgnoreInCollectedDirError(DvcException): def __init__(self, ignore_dirname): super(DvcIgnoreInCollectedDirError, self).__init__( diff --git a/dvc/repo/get.py b/dvc/repo/get.py index 8f82fc8e90..9185a1ef75 100644 --- a/dvc/repo/get.py +++ b/dvc/repo/get.py @@ -8,7 +8,7 @@ from dvc.exceptions import NotDvcRepoError from dvc.exceptions import OutputNotFoundError from dvc.exceptions import UrlNotDvcRepoError -from dvc.exceptions import FileMissingError +from dvc.exceptions import PathOutsideRepoError from dvc.external_repo import external_repo from dvc.path_info import PathInfo from dvc.stage import Stage @@ -20,18 +20,7 @@ logger = logging.getLogger(__name__) -def _is_git_file(repo, path): - if not os.path.isabs(path): - try: - output = repo.find_out_by_relpath(path) - if not output.use_cache: - return True - except OutputNotFoundError: - return True - return False - - -def _copy_git_file(repo, src, dst): +def _copy_git_file(repo, src, dst, repo_url): src_full_path = os.path.join(repo.root_dir, src) dst_full_path = os.path.abspath(dst) @@ -41,7 +30,7 @@ def _copy_git_file(repo, src, dst): else: shutil.copy2(src_full_path, dst_full_path) except FileNotFoundError: - raise FileMissingError(src) + raise PathOutsideRepoError(src, repo_url) @staticmethod @@ -60,10 +49,6 @@ def get(url, path, out=None, rev=None): tmp_dir = os.path.join(dpath, "." + str(shortuuid.uuid())) try: with external_repo(cache_dir=tmp_dir, url=url, rev=rev) as repo: - if _is_git_file(repo, path): - _copy_git_file(repo, path, out) - return - # Note: we need to replace state, because in case of getting DVC # dependency on CIFS or NFS filesystems, sqlite-based state # will be unable to obtain lock @@ -80,12 +65,28 @@ def get(url, path, out=None, rev=None): # the same cache file might be used a few times in a directory. repo.cache.local.cache_types = ["reflink", "hardlink", "copy"] - o = repo.find_out_by_relpath(path) + output = None + output_error = None + + try: + output = repo.find_out_by_relpath(path) + except OutputNotFoundError as ex: + output_error = ex + + is_git_file = output_error and not os.path.isabs(path) + is_not_cached = output and not output.use_cache + + if is_git_file or is_not_cached: + return _copy_git_file(repo, path, out, url) + + if output_error: + raise output_error + with repo.state: - repo.cloud.pull(o.get_used_cache()) - o.path_info = PathInfo(os.path.abspath(out)) - with o.repo.state: - o.checkout() + repo.cloud.pull(output.get_used_cache()) + output.path_info = PathInfo(os.path.abspath(out)) + with output.repo.state: + output.checkout() except NotDvcRepoError: raise UrlNotDvcRepoError(url) diff --git a/tests/func/test_get.py b/tests/func/test_get.py index 930afd1550..f838985fba 100644 --- a/tests/func/test_get.py +++ b/tests/func/test_get.py @@ -10,7 +10,7 @@ from dvc.exceptions import GetDVCFileError from dvc.exceptions import UrlNotDvcRepoError from dvc.exceptions import NoOutputInExternalRepoError -from dvc.exceptions import FileMissingError +from dvc.exceptions import PathOutsideRepoError from dvc.repo import Repo from dvc.system import System from dvc.utils import makedirs @@ -152,10 +152,10 @@ def test_fails_with_files_outside_repo(erepo): def test_fails_with_non_existing_files(erepo): fname = "file_does_not_exist" - with pytest.raises( - FileMissingError, - match="Can't find '{}' neither locally nor on remote".format(fname), - ): + err = "The path '{}' does not exist in the target repository '{}'".format( + fname, erepo.root_dir + ) + with pytest.raises(PathOutsideRepoError, match=err): Repo.get(erepo.root_dir, fname) From d59bceb71afbe22c80102b5101a933235a19f42d Mon Sep 17 00:00:00 2001 From: Dani Hodovic Date: Fri, 6 Dec 2019 20:20:25 +0100 Subject: [PATCH 22/23] fixes --- dvc/repo/get.py | 13 ++++++------- tests/func/test_get.py | 4 ++-- 2 files changed, 8 insertions(+), 9 deletions(-) diff --git a/dvc/repo/get.py b/dvc/repo/get.py index 9185a1ef75..5340475041 100644 --- a/dvc/repo/get.py +++ b/dvc/repo/get.py @@ -24,11 +24,12 @@ def _copy_git_file(repo, src, dst, repo_url): src_full_path = os.path.join(repo.root_dir, src) dst_full_path = os.path.abspath(dst) + if os.path.isdir(src_full_path): + shutil.copytree(src_full_path, dst_full_path) + return + try: - if os.path.isdir(src_full_path): - shutil.copytree(src_full_path, dst_full_path) - else: - shutil.copy2(src_full_path, dst_full_path) + shutil.copy2(src_full_path, dst_full_path) except FileNotFoundError: raise PathOutsideRepoError(src, repo_url) @@ -80,7 +81,7 @@ def get(url, path, out=None, rev=None): return _copy_git_file(repo, path, out, url) if output_error: - raise output_error + raise OutputNotFoundError(path) with repo.state: repo.cloud.pull(output.get_used_cache()) @@ -90,7 +91,5 @@ def get(url, path, out=None, rev=None): except NotDvcRepoError: raise UrlNotDvcRepoError(url) - except OutputNotFoundError: - raise OutputNotFoundError(path) finally: remove(tmp_dir) diff --git a/tests/func/test_get.py b/tests/func/test_get.py index f838985fba..c7147447e4 100644 --- a/tests/func/test_get.py +++ b/tests/func/test_get.py @@ -9,7 +9,7 @@ from dvc.config import Config from dvc.exceptions import GetDVCFileError from dvc.exceptions import UrlNotDvcRepoError -from dvc.exceptions import NoOutputInExternalRepoError +from dvc.exceptions import OutputNotFoundError from dvc.exceptions import PathOutsideRepoError from dvc.repo import Repo from dvc.system import System @@ -146,7 +146,7 @@ def test_non_cached_output(tmp_path, erepo): # https://github.com/iterative/dvc/pull/2837#discussion_r352123053 def test_fails_with_files_outside_repo(erepo): - with pytest.raises(NoOutputInExternalRepoError): + with pytest.raises(OutputNotFoundError): Repo.get(erepo.root_dir, "/root/") From 29b34bacc2b963d9868cf0b84215f4ca49b67f9a Mon Sep 17 00:00:00 2001 From: Dani Hodovic Date: Fri, 6 Dec 2019 21:55:54 +0100 Subject: [PATCH 23/23] fixup! fixes --- dvc/repo/get.py | 3 ++- tests/func/test_get.py | 8 ++------ 2 files changed, 4 insertions(+), 7 deletions(-) diff --git a/dvc/repo/get.py b/dvc/repo/get.py index 5340475041..7fbe6a5d76 100644 --- a/dvc/repo/get.py +++ b/dvc/repo/get.py @@ -78,7 +78,8 @@ def get(url, path, out=None, rev=None): is_not_cached = output and not output.use_cache if is_git_file or is_not_cached: - return _copy_git_file(repo, path, out, url) + _copy_git_file(repo, path, out, url) + return if output_error: raise OutputNotFoundError(path) diff --git a/tests/func/test_get.py b/tests/func/test_get.py index c7147447e4..11f70ecc3f 100644 --- a/tests/func/test_get.py +++ b/tests/func/test_get.py @@ -151,12 +151,8 @@ def test_fails_with_files_outside_repo(erepo): def test_fails_with_non_existing_files(erepo): - fname = "file_does_not_exist" - err = "The path '{}' does not exist in the target repository '{}'".format( - fname, erepo.root_dir - ) - with pytest.raises(PathOutsideRepoError, match=err): - Repo.get(erepo.root_dir, fname) + with pytest.raises(PathOutsideRepoError): + Repo.get(erepo.root_dir, "file_does_not_exist") @pytest.mark.parametrize("dname", [".", "dir", "dir/subdir"])