diff --git a/dvc/command/get.py b/dvc/command/get.py index 77235ecef1..e3172782d6 100644 --- a/dvc/command/get.py +++ b/dvc/command/get.py @@ -33,7 +33,7 @@ def run(self): def add_parser(subparsers, parent_parser): - GET_HELP = "Download data from DVC repository." + GET_HELP = "Download/copy files or directories from DVC repository." get_parser = subparsers.add_parser( "get", parents=[parent_parser], @@ -44,9 +44,14 @@ def add_parser(subparsers, parent_parser): get_parser.add_argument( "url", help="URL of Git repository with DVC project to download from." ) - get_parser.add_argument("path", help="Path to data within DVC repository.") get_parser.add_argument( - "-o", "--out", nargs="?", help="Destination path to put data to." + "path", help="Path to a file or directory within a DVC repository." + ) + get_parser.add_argument( + "-o", + "--out", + nargs="?", + help="Destination path to copy/download files to.", ) get_parser.add_argument( "--rev", nargs="?", help="DVC repository git revision." diff --git a/dvc/exceptions.py b/dvc/exceptions.py index e5fb6f00d5..fbb8c7eaab 100644 --- a/dvc/exceptions.py +++ b/dvc/exceptions.py @@ -268,6 +268,12 @@ def __init__(self, path, cause=None): ) +class PathOutsideRepoError(DvcException): + def __init__(self, path, repo): + msg = "The path '{}' does not exist in the target repository '{}'." + super(PathOutsideRepoError, self).__init__(msg.format(path, repo)) + + class DvcIgnoreInCollectedDirError(DvcException): def __init__(self, ignore_dirname): super(DvcIgnoreInCollectedDirError, self).__init__( diff --git a/dvc/repo/get.py b/dvc/repo/get.py index 21d01ed7e7..7fbe6a5d76 100644 --- a/dvc/repo/get.py +++ b/dvc/repo/get.py @@ -1,5 +1,6 @@ import logging import os +import shutil import shortuuid @@ -7,16 +8,32 @@ from dvc.exceptions import NotDvcRepoError from dvc.exceptions import OutputNotFoundError from dvc.exceptions import UrlNotDvcRepoError +from dvc.exceptions import PathOutsideRepoError from dvc.external_repo import external_repo from dvc.path_info import PathInfo from dvc.stage import Stage from dvc.state import StateNoop from dvc.utils import resolve_output from dvc.utils.fs import remove +from dvc.utils.compat import FileNotFoundError logger = logging.getLogger(__name__) +def _copy_git_file(repo, src, dst, repo_url): + src_full_path = os.path.join(repo.root_dir, src) + dst_full_path = os.path.abspath(dst) + + if os.path.isdir(src_full_path): + shutil.copytree(src_full_path, dst_full_path) + return + + try: + shutil.copy2(src_full_path, dst_full_path) + except FileNotFoundError: + raise PathOutsideRepoError(src, repo_url) + + @staticmethod def get(url, path, out=None, rev=None): out = resolve_output(path, out) @@ -49,16 +66,31 @@ def get(url, path, out=None, rev=None): # the same cache file might be used a few times in a directory. repo.cache.local.cache_types = ["reflink", "hardlink", "copy"] - o = repo.find_out_by_relpath(path) + output = None + output_error = None + + try: + output = repo.find_out_by_relpath(path) + except OutputNotFoundError as ex: + output_error = ex + + is_git_file = output_error and not os.path.isabs(path) + is_not_cached = output and not output.use_cache + + if is_git_file or is_not_cached: + _copy_git_file(repo, path, out, url) + return + + if output_error: + raise OutputNotFoundError(path) + with repo.state: - repo.cloud.pull(o.get_used_cache()) - o.path_info = PathInfo(os.path.abspath(out)) - with o.repo.state: - o.checkout() + repo.cloud.pull(output.get_used_cache()) + output.path_info = PathInfo(os.path.abspath(out)) + with output.repo.state: + output.checkout() except NotDvcRepoError: raise UrlNotDvcRepoError(url) - except OutputNotFoundError: - raise OutputNotFoundError(path) finally: remove(tmp_dir) diff --git a/tests/func/test_get.py b/tests/func/test_get.py index 07551c5f36..11f70ecc3f 100644 --- a/tests/func/test_get.py +++ b/tests/func/test_get.py @@ -9,10 +9,13 @@ from dvc.config import Config from dvc.exceptions import GetDVCFileError from dvc.exceptions import UrlNotDvcRepoError +from dvc.exceptions import OutputNotFoundError +from dvc.exceptions import PathOutsideRepoError from dvc.repo import Repo from dvc.system import System from dvc.utils import makedirs from dvc.utils.compat import fspath +from dvc.utils import fspath_py35 from tests.utils import trees_equal @@ -38,6 +41,36 @@ def test_get_repo_dir(erepo): trees_equal(src, dst) +def test_get_regular_file(erepo): + src = "some_file" + dst = "some_file_imported" + + src_path = os.path.join(erepo.root_dir, src) + erepo.create(src_path, "hello") + erepo.dvc.scm.add([src_path]) + erepo.dvc.scm.commit("add a regular file") + Repo.get(erepo.root_dir, src, dst) + + assert os.path.exists(dst) + assert os.path.isfile(dst) + assert filecmp.cmp(src_path, dst, shallow=False) + + +def test_get_regular_dir(erepo): + src = "some_directory" + dst = "some_directory_imported" + + src_file_path = os.path.join(erepo.root_dir, src, "file.txt") + erepo.create(src_file_path, "hello") + erepo.dvc.scm.add([src_file_path]) + erepo.dvc.scm.commit("add a regular dir") + Repo.get(erepo.root_dir, src, dst) + + assert os.path.exists(dst) + assert os.path.isdir(dst) + trees_equal(os.path.join(erepo.root_dir, src), dst) + + def test_cache_type_is_properly_overridden(erepo): erepo.dvc.config.set( Config.SECTION_CACHE, Config.SECTION_CACHE_TYPE, "symlink" @@ -77,6 +110,51 @@ def test_get_a_dvc_file(erepo): Repo.get(erepo.root_dir, "some_file.dvc") +# https://github.com/iterative/dvc/pull/2837#discussion_r352123053 +def test_get_full_dvc_path(erepo): + external_data_dir = erepo.mkdtemp() + external_data = os.path.join(external_data_dir, "ext_data") + with open(external_data, "w+") as fobj: + fobj.write("ext_data") + + cur_dir = os.getcwd() + os.chdir(erepo.root_dir) + erepo.dvc.add(external_data) + erepo.dvc.scm.add(["ext_data.dvc"]) + erepo.dvc.scm.commit("add external data") + os.chdir(cur_dir) + + Repo.get(erepo.root_dir, external_data, "ext_data_imported") + assert os.path.isfile("ext_data_imported") + assert filecmp.cmp(external_data, "ext_data_imported", shallow=False) + + +def test_non_cached_output(tmp_path, erepo): + os.chdir(erepo.root_dir) + erepo.dvc.run( + outs_no_cache=["non_cached_file"], cmd="echo hello > non_cached_file" + ) + erepo.dvc.scm.add(["non_cached_file", "non_cached_file.dvc"]) + erepo.dvc.scm.commit("add non-cached output") + os.chdir(fspath_py35(tmp_path)) + Repo.get(erepo.root_dir, "non_cached_file") + + src = os.path.join(erepo.root_dir, "non_cached_file") + assert os.path.isfile("non_cached_file") + assert filecmp.cmp(src, "non_cached_file", shallow=False) + + +# https://github.com/iterative/dvc/pull/2837#discussion_r352123053 +def test_fails_with_files_outside_repo(erepo): + with pytest.raises(OutputNotFoundError): + Repo.get(erepo.root_dir, "/root/") + + +def test_fails_with_non_existing_files(erepo): + with pytest.raises(PathOutsideRepoError): + Repo.get(erepo.root_dir, "file_does_not_exist") + + @pytest.mark.parametrize("dname", [".", "dir", "dir/subdir"]) def test_get_to_dir(dname, erepo): src = erepo.FOO