From afc8886ffc6a9dd8906569d2a4dde65b1ac0a692 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?F=C3=A1bio=20Santos?= Date: Thu, 9 Jan 2020 18:15:25 +0000 Subject: [PATCH 01/13] get: handle non-DVC repositories Allows us to `dvc get` from non-DVC source repositories. Fixes #3089 --- dvc/exceptions.py | 5 ----- dvc/external_repo.py | 4 ++-- dvc/repo/get.py | 39 ++++++++++++++++++++++----------------- tests/func/test_get.py | 10 +++------- 4 files changed, 27 insertions(+), 31 deletions(-) diff --git a/dvc/exceptions.py b/dvc/exceptions.py index e48014e95a..a551e35db1 100644 --- a/dvc/exceptions.py +++ b/dvc/exceptions.py @@ -240,11 +240,6 @@ def __init__(self, ignore_dirname): ) -class UrlNotDvcRepoError(DvcException): - def __init__(self, url): - super().__init__("URL '{}' is not a dvc repository.".format(url)) - - class GitHookAlreadyExistsError(DvcException): def __init__(self, hook_name): super().__init__( diff --git a/dvc/external_repo.py b/dvc/external_repo.py index 9ff2f2a413..8f7cfaef55 100644 --- a/dvc/external_repo.py +++ b/dvc/external_repo.py @@ -33,7 +33,7 @@ def external_repo(url=None, rev=None, rev_lock=None, cache_dir=None): repo.close() -def cached_clone(url, rev=None, **_ignored_kwargs): +def cached_clone(url, rev=None, clone_path=None, **_ignored_kwargs): """Clone an external git repo to a temporary directory. Returns the path to a local temporary directory with the specified @@ -44,7 +44,7 @@ def cached_clone(url, rev=None, **_ignored_kwargs): """ - new_path = tempfile.mkdtemp("dvc-erepo") + new_path = clone_path or tempfile.mkdtemp("dvc-erepo") # Copy and adjust existing clean clone if (url, None, None) in REPO_CACHE: diff --git a/dvc/repo/get.py b/dvc/repo/get.py index 17607bbfae..58eddfcbe9 100644 --- a/dvc/repo/get.py +++ b/dvc/repo/get.py @@ -7,10 +7,9 @@ DvcException, NotDvcRepoError, OutputNotFoundError, - UrlNotDvcRepoError, PathMissingError, ) -from dvc.external_repo import external_repo +from dvc.external_repo import cached_clone from dvc.path_info import PathInfo from dvc.stage import Stage from dvc.utils import resolve_output @@ -28,8 +27,15 @@ def __init__(self): ) +# Dummy exception raised to signal a plain file copy is needed +class _DoPlainCopy(DvcException): + pass + + @staticmethod def get(url, path, out=None, rev=None): + from dvc.repo import Repo + out = resolve_output(path, out) if Stage.is_valid_filename(out): @@ -43,7 +49,8 @@ def get(url, path, out=None, rev=None): dpath = os.path.dirname(os.path.abspath(out)) tmp_dir = os.path.join(dpath, "." + str(shortuuid.uuid())) try: - with external_repo(cache_dir=tmp_dir, url=url, rev=rev) as repo: + cached_clone(url, rev=rev, clone_path=tmp_dir) + try: # Try any links possible to avoid data duplication. # # Not using symlink, because we need to remove cache after we are @@ -53,26 +60,24 @@ def get(url, path, out=None, rev=None): # # Also, we can't use theoretical "move" link type here, because # the same cache file might be used a few times in a directory. + repo = Repo(tmp_dir) repo.cache.local.cache_types = ["reflink", "hardlink", "copy"] + output = repo.find_out_by_relpath(path) + if not output.use_cache: + # Catch this below and go for a plain old fs_copy + raise _DoPlainCopy + _get_cached(repo, output, out) - try: - output = repo.find_out_by_relpath(path) - except OutputNotFoundError: - output = None - - if output and output.use_cache: - _get_cached(repo, output, out) - else: - # Either an uncached out with absolute path or a user error - if os.path.isabs(path): - raise FileNotFoundError + except (NotDvcRepoError, OutputNotFoundError, _DoPlainCopy): + # It's an uncached out with absolute path, a non-DVC repo, or a + # user error + if os.path.isabs(path): + raise FileNotFoundError - fs_copy(os.path.join(repo.root_dir, path), out) + fs_copy(os.path.join(tmp_dir, path), out) except (OutputNotFoundError, FileNotFoundError): raise PathMissingError(path, url) - except NotDvcRepoError: - raise UrlNotDvcRepoError(url) finally: remove(tmp_dir) diff --git a/tests/func/test_get.py b/tests/func/test_get.py index 99fd23840b..f988a5c286 100644 --- a/tests/func/test_get.py +++ b/tests/func/test_get.py @@ -5,7 +5,6 @@ from dvc.cache import Cache from dvc.config import Config -from dvc.exceptions import UrlNotDvcRepoError from dvc.repo.get import GetDVCFileError, PathMissingError from dvc.repo import Repo from dvc.system import System @@ -87,9 +86,10 @@ def test_get_repo_rev(tmp_dir, erepo_dir): def test_get_from_non_dvc_repo(tmp_dir, erepo_dir): erepo_dir.scm.repo.index.remove([erepo_dir.dvc.dvc_dir], r=True) erepo_dir.scm.commit("remove dvc") + erepo_dir.scm_gen({"some_file": "contents"}, commit="create file") - with pytest.raises(UrlNotDvcRepoError): - Repo.get(fspath(erepo_dir), "some_file.zip") + Repo.get(fspath(erepo_dir), "some_file", "file_imported") + assert (tmp_dir / "file_imported").read_text() == "contents" def test_get_a_dvc_file(tmp_dir, erepo_dir): @@ -164,10 +164,6 @@ def test_get_from_non_dvc_master(tmp_dir, erepo_dir, caplog): erepo_dir.dvc.scm.repo.index.remove([".dvc"], r=True) erepo_dir.dvc.scm.commit("remove .dvc") - # sanity check - with pytest.raises(UrlNotDvcRepoError): - Repo.get(fspath(erepo_dir), "some_file") - caplog.clear() dst = "file_imported" with caplog.at_level(logging.INFO, logger="dvc"): From d09285418ee2d8c4e8a0b51147ab607e6229d179 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?F=C3=A1bio=20Santos?= Date: Fri, 10 Jan 2020 14:00:26 +0000 Subject: [PATCH 02/13] style: improve code flow and move comments --- dvc/repo/get.py | 24 ++++++++++++++---------- 1 file changed, 14 insertions(+), 10 deletions(-) diff --git a/dvc/repo/get.py b/dvc/repo/get.py index 58eddfcbe9..785c388a4a 100644 --- a/dvc/repo/get.py +++ b/dvc/repo/get.py @@ -51,6 +51,8 @@ def get(url, path, out=None, rev=None): try: cached_clone(url, rev=rev, clone_path=tmp_dir) try: + repo = Repo(tmp_dir) + # Try any links possible to avoid data duplication. # # Not using symlink, because we need to remove cache after we are @@ -60,21 +62,23 @@ def get(url, path, out=None, rev=None): # # Also, we can't use theoretical "move" link type here, because # the same cache file might be used a few times in a directory. - repo = Repo(tmp_dir) repo.cache.local.cache_types = ["reflink", "hardlink", "copy"] + output = repo.find_out_by_relpath(path) - if not output.use_cache: + if output.use_cache: # Catch this below and go for a plain old fs_copy - raise _DoPlainCopy - _get_cached(repo, output, out) + _get_cached(repo, output, out) + return + + except (NotDvcRepoError, OutputNotFoundError): + pass - except (NotDvcRepoError, OutputNotFoundError, _DoPlainCopy): - # It's an uncached out with absolute path, a non-DVC repo, or a - # user error - if os.path.isabs(path): - raise FileNotFoundError + # It's an uncached out with absolute path, a non-DVC repo, or a + # user error + if os.path.isabs(path): + raise FileNotFoundError - fs_copy(os.path.join(tmp_dir, path), out) + fs_copy(os.path.join(tmp_dir, path), out) except (OutputNotFoundError, FileNotFoundError): raise PathMissingError(path, url) From 09bdc7325ed90dd15e8c00dd25ecab2d398ea3b9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?F=C3=A1bio=20Santos?= Date: Fri, 10 Jan 2020 14:10:35 +0000 Subject: [PATCH 03/13] doc: change get documentation so that it doesn't imply the target must be DVC enabled --- dvc/command/get.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/dvc/command/get.py b/dvc/command/get.py index 9aba35c0d8..492d7efba4 100644 --- a/dvc/command/get.py +++ b/dvc/command/get.py @@ -31,7 +31,7 @@ def run(self): def add_parser(subparsers, parent_parser): - GET_HELP = "Download/copy files or directories from DVC repository." + GET_HELP = "Download/copy files or directories from git repository." get_parser = subparsers.add_parser( "get", parents=[parent_parser], @@ -40,10 +40,10 @@ def add_parser(subparsers, parent_parser): formatter_class=argparse.RawDescriptionHelpFormatter, ) get_parser.add_argument( - "url", help="URL of Git repository with DVC project to download from." + "url", help="URL of Git repository to download from." ) get_parser.add_argument( - "path", help="Path to a file or directory within a DVC repository." + "path", help="Path to a file or directory within the repository." ) get_parser.add_argument( "-o", @@ -52,6 +52,6 @@ def add_parser(subparsers, parent_parser): help="Destination path to copy/download files to.", ) get_parser.add_argument( - "--rev", nargs="?", help="DVC repository git revision." + "--rev", nargs="?", help="Repository git revision." ) get_parser.set_defaults(func=CmdGet) From 8e4922800f6ce47210976764af1c05e31e90deac Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?F=C3=A1bio=20Santos?= Date: Fri, 10 Jan 2020 16:48:09 +0000 Subject: [PATCH 04/13] get: recoup cache optimal location --- dvc/external_repo.py | 4 ++-- dvc/repo/get.py | 11 ++++++++--- 2 files changed, 10 insertions(+), 5 deletions(-) diff --git a/dvc/external_repo.py b/dvc/external_repo.py index 8f7cfaef55..9ff2f2a413 100644 --- a/dvc/external_repo.py +++ b/dvc/external_repo.py @@ -33,7 +33,7 @@ def external_repo(url=None, rev=None, rev_lock=None, cache_dir=None): repo.close() -def cached_clone(url, rev=None, clone_path=None, **_ignored_kwargs): +def cached_clone(url, rev=None, **_ignored_kwargs): """Clone an external git repo to a temporary directory. Returns the path to a local temporary directory with the specified @@ -44,7 +44,7 @@ def cached_clone(url, rev=None, clone_path=None, **_ignored_kwargs): """ - new_path = clone_path or tempfile.mkdtemp("dvc-erepo") + new_path = tempfile.mkdtemp("dvc-erepo") # Copy and adjust existing clean clone if (url, None, None) in REPO_CACHE: diff --git a/dvc/repo/get.py b/dvc/repo/get.py index 785c388a4a..49bb4e9013 100644 --- a/dvc/repo/get.py +++ b/dvc/repo/get.py @@ -3,6 +3,8 @@ import shortuuid +from dvc.cache import CacheConfig +from dvc.config import Config from dvc.exceptions import ( DvcException, NotDvcRepoError, @@ -49,9 +51,11 @@ def get(url, path, out=None, rev=None): dpath = os.path.dirname(os.path.abspath(out)) tmp_dir = os.path.join(dpath, "." + str(shortuuid.uuid())) try: - cached_clone(url, rev=rev, clone_path=tmp_dir) + clone_dir = cached_clone(url, rev=rev) try: - repo = Repo(tmp_dir) + repo = Repo(clone_dir) + cache_config = CacheConfig(repo.config) + cache_config.set_dir(tmp_dir, level=Config.LEVEL_LOCAL) # Try any links possible to avoid data duplication. # @@ -78,12 +82,13 @@ def get(url, path, out=None, rev=None): if os.path.isabs(path): raise FileNotFoundError - fs_copy(os.path.join(tmp_dir, path), out) + fs_copy(os.path.join(clone_dir, path), out) except (OutputNotFoundError, FileNotFoundError): raise PathMissingError(path, url) finally: remove(tmp_dir) + remove(clone_dir) def _get_cached(repo, output, out): From 3e386d93ba8da28f7d16513187fe46623cd03e3a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?F=C3=A1bio=20Santos?= Date: Fri, 10 Jan 2020 16:58:53 +0000 Subject: [PATCH 05/13] Update dvc/repo/get.py Co-Authored-By: Ruslan Kuprieiev --- dvc/repo/get.py | 1 - 1 file changed, 1 deletion(-) diff --git a/dvc/repo/get.py b/dvc/repo/get.py index 49bb4e9013..4c1f554237 100644 --- a/dvc/repo/get.py +++ b/dvc/repo/get.py @@ -70,7 +70,6 @@ def get(url, path, out=None, rev=None): output = repo.find_out_by_relpath(path) if output.use_cache: - # Catch this below and go for a plain old fs_copy _get_cached(repo, output, out) return From bfe068809427a5b2313e0b7b5b1aaed8259d3fed Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?F=C3=A1bio=20Santos?= Date: Fri, 10 Jan 2020 16:59:40 +0000 Subject: [PATCH 06/13] Revert "get: recoup cache optimal location" This reverts commit 8e4922800f6ce47210976764af1c05e31e90deac. --- dvc/external_repo.py | 4 ++-- dvc/repo/get.py | 11 +++-------- 2 files changed, 5 insertions(+), 10 deletions(-) diff --git a/dvc/external_repo.py b/dvc/external_repo.py index 9ff2f2a413..8f7cfaef55 100644 --- a/dvc/external_repo.py +++ b/dvc/external_repo.py @@ -33,7 +33,7 @@ def external_repo(url=None, rev=None, rev_lock=None, cache_dir=None): repo.close() -def cached_clone(url, rev=None, **_ignored_kwargs): +def cached_clone(url, rev=None, clone_path=None, **_ignored_kwargs): """Clone an external git repo to a temporary directory. Returns the path to a local temporary directory with the specified @@ -44,7 +44,7 @@ def cached_clone(url, rev=None, **_ignored_kwargs): """ - new_path = tempfile.mkdtemp("dvc-erepo") + new_path = clone_path or tempfile.mkdtemp("dvc-erepo") # Copy and adjust existing clean clone if (url, None, None) in REPO_CACHE: diff --git a/dvc/repo/get.py b/dvc/repo/get.py index 4c1f554237..19028861c8 100644 --- a/dvc/repo/get.py +++ b/dvc/repo/get.py @@ -3,8 +3,6 @@ import shortuuid -from dvc.cache import CacheConfig -from dvc.config import Config from dvc.exceptions import ( DvcException, NotDvcRepoError, @@ -51,11 +49,9 @@ def get(url, path, out=None, rev=None): dpath = os.path.dirname(os.path.abspath(out)) tmp_dir = os.path.join(dpath, "." + str(shortuuid.uuid())) try: - clone_dir = cached_clone(url, rev=rev) + cached_clone(url, rev=rev, clone_path=tmp_dir) try: - repo = Repo(clone_dir) - cache_config = CacheConfig(repo.config) - cache_config.set_dir(tmp_dir, level=Config.LEVEL_LOCAL) + repo = Repo(tmp_dir) # Try any links possible to avoid data duplication. # @@ -81,13 +77,12 @@ def get(url, path, out=None, rev=None): if os.path.isabs(path): raise FileNotFoundError - fs_copy(os.path.join(clone_dir, path), out) + fs_copy(os.path.join(tmp_dir, path), out) except (OutputNotFoundError, FileNotFoundError): raise PathMissingError(path, url) finally: remove(tmp_dir) - remove(clone_dir) def _get_cached(repo, output, out): From d8621152852904d6caf9db2bf4487d7b1f521223 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?F=C3=A1bio=20Santos?= Date: Fri, 10 Jan 2020 17:00:04 +0000 Subject: [PATCH 07/13] remove unused exception class --- dvc/repo/get.py | 5 ----- 1 file changed, 5 deletions(-) diff --git a/dvc/repo/get.py b/dvc/repo/get.py index 19028861c8..0585cbfc97 100644 --- a/dvc/repo/get.py +++ b/dvc/repo/get.py @@ -27,11 +27,6 @@ def __init__(self): ) -# Dummy exception raised to signal a plain file copy is needed -class _DoPlainCopy(DvcException): - pass - - @staticmethod def get(url, path, out=None, rev=None): from dvc.repo import Repo From 0f7dad7b934f4414a6269d364a6e8ad97b429881 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?F=C3=A1bio=20Santos?= Date: Fri, 10 Jan 2020 17:29:21 +0000 Subject: [PATCH 08/13] change command doc string --- dvc/command/get.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dvc/command/get.py b/dvc/command/get.py index 492d7efba4..1b1e1bffab 100644 --- a/dvc/command/get.py +++ b/dvc/command/get.py @@ -31,7 +31,7 @@ def run(self): def add_parser(subparsers, parent_parser): - GET_HELP = "Download/copy files or directories from git repository." + GET_HELP = "Download/copy files or directories from Git repository." get_parser = subparsers.add_parser( "get", parents=[parent_parser], From cabbb32bb4435ecb5f01ae999d0f0247d1dbc620 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?F=C3=A1bio=20Santos?= Date: Fri, 10 Jan 2020 18:07:53 +0000 Subject: [PATCH 09/13] Update dvc/command/get.py Co-Authored-By: Jorge Orpinel --- dvc/command/get.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dvc/command/get.py b/dvc/command/get.py index 1b1e1bffab..90cac549c7 100644 --- a/dvc/command/get.py +++ b/dvc/command/get.py @@ -52,6 +52,6 @@ def add_parser(subparsers, parent_parser): help="Destination path to copy/download files to.", ) get_parser.add_argument( - "--rev", nargs="?", help="Repository git revision." + "--rev", nargs="?", help="Git revision (e.g. branch, tag, SHA)" ) get_parser.set_defaults(func=CmdGet) From 36bd848cb6b283e1c94e47f19b0f4fecfc40802a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?F=C3=A1bio=20Santos?= Date: Fri, 10 Jan 2020 18:08:07 +0000 Subject: [PATCH 10/13] Update dvc/command/get.py Co-Authored-By: Jorge Orpinel --- dvc/command/get.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dvc/command/get.py b/dvc/command/get.py index 90cac549c7..3774bd9cb9 100644 --- a/dvc/command/get.py +++ b/dvc/command/get.py @@ -43,7 +43,7 @@ def add_parser(subparsers, parent_parser): "url", help="URL of Git repository to download from." ) get_parser.add_argument( - "path", help="Path to a file or directory within the repository." + "path", help="Path to a file or directory within the project or repository" ) get_parser.add_argument( "-o", From c3314b5aa2314885530e68f193ac20d37f477bc3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?F=C3=A1bio=20Santos?= Date: Fri, 10 Jan 2020 18:08:33 +0000 Subject: [PATCH 11/13] Update dvc/command/get.py Co-Authored-By: Jorge Orpinel --- dvc/command/get.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dvc/command/get.py b/dvc/command/get.py index 3774bd9cb9..daae7fa266 100644 --- a/dvc/command/get.py +++ b/dvc/command/get.py @@ -40,7 +40,7 @@ def add_parser(subparsers, parent_parser): formatter_class=argparse.RawDescriptionHelpFormatter, ) get_parser.add_argument( - "url", help="URL of Git repository to download from." + "url", help="Location of DVC project or Git repository to download from" ) get_parser.add_argument( "path", help="Path to a file or directory within the project or repository" From 724283fa382843e6dc11c16045e966cacce6cd30 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?F=C3=A1bio=20Santos?= Date: Fri, 10 Jan 2020 18:08:46 +0000 Subject: [PATCH 12/13] Update dvc/command/get.py Co-Authored-By: Jorge Orpinel --- dvc/command/get.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dvc/command/get.py b/dvc/command/get.py index daae7fa266..7d5af18c79 100644 --- a/dvc/command/get.py +++ b/dvc/command/get.py @@ -31,7 +31,7 @@ def run(self): def add_parser(subparsers, parent_parser): - GET_HELP = "Download/copy files or directories from Git repository." + GET_HELP = "Download a file or directory from any DVC project or Git repository" get_parser = subparsers.add_parser( "get", parents=[parent_parser], From 9f4060690ca8c99fc06b534f4b447ec73f2e8c60 Mon Sep 17 00:00:00 2001 From: "Restyled.io" Date: Fri, 10 Jan 2020 18:08:53 +0000 Subject: [PATCH 13/13] Restyled by black --- dvc/command/get.py | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/dvc/command/get.py b/dvc/command/get.py index 7d5af18c79..30146010e5 100644 --- a/dvc/command/get.py +++ b/dvc/command/get.py @@ -31,7 +31,9 @@ def run(self): def add_parser(subparsers, parent_parser): - GET_HELP = "Download a file or directory from any DVC project or Git repository" + GET_HELP = ( + "Download a file or directory from any DVC project or Git repository" + ) get_parser = subparsers.add_parser( "get", parents=[parent_parser], @@ -40,10 +42,12 @@ def add_parser(subparsers, parent_parser): formatter_class=argparse.RawDescriptionHelpFormatter, ) get_parser.add_argument( - "url", help="Location of DVC project or Git repository to download from" + "url", + help="Location of DVC project or Git repository to download from", ) get_parser.add_argument( - "path", help="Path to a file or directory within the project or repository" + "path", + help="Path to a file or directory within the project or repository", ) get_parser.add_argument( "-o",