From fabff46ef0617ad0ab6eaf8aa77c7d062fbd5b16 Mon Sep 17 00:00:00 2001 From: "Mr. Outis" Date: Fri, 7 Feb 2020 14:26:52 -0600 Subject: [PATCH 1/2] get/import: retrieve files inside directory outs Close #2458 --- dvc/external_repo.py | 47 +++++++++++++++++++++++++++------------ dvc/output/base.py | 4 ++-- tests/func/test_get.py | 27 ++++++++++++++++++++++ tests/func/test_import.py | 36 ++++++++++++++++++++++++++++++ 4 files changed, 98 insertions(+), 16 deletions(-) diff --git a/dvc/external_repo.py b/dvc/external_repo.py index dc4c108e84..c2704b6ca0 100644 --- a/dvc/external_repo.py +++ b/dvc/external_repo.py @@ -7,6 +7,7 @@ from funcy import retry, suppress, wrap_with, cached_property +from dvc.path_info import PathInfo from dvc.compat import fspath from dvc.repo import Repo from dvc.config import Config, NoRemoteError, NotDvcRepoError @@ -14,7 +15,8 @@ from dvc.exceptions import OutputNotFoundError, NoOutputInExternalRepoError from dvc.exceptions import FileMissingError, PathMissingError from dvc.remote import RemoteConfig -from dvc.utils.fs import remove, fs_copy +from dvc.utils.fs import remove, fs_copy, move +from dvc.utils import tmp_fname from dvc.scm.git import Git @@ -68,32 +70,49 @@ def __init__(self, root_dir, url): self._set_upstream() def pull_to(self, path, to_info): - try: - out = None - with suppress(OutputNotFoundError): - out = self.find_out_by_relpath(path) + """ + Pull the corresponding file or directory specified by `path` and + checkout it into `to_info`. + + It works with files tracked by Git and DVC, and also local files + outside the repository. + """ + out = None + path_info = PathInfo(self.root_dir) / path + + with suppress(OutputNotFoundError): + (out,) = self.find_outs_by_path(path_info, strict=False) + try: if out and out.use_cache: - self._pull_cached(out, to_info) + self._pull_cached(out, path, to_info) return - # Git handled files can't have absolute path + # Check if it is handled by Git (it can't have an absolute path) if os.path.isabs(path): raise FileNotFoundError - fs_copy(os.path.join(self.root_dir, path), fspath(to_info)) + fs_copy(fspath(path_info), fspath(to_info)) except FileNotFoundError: raise PathMissingError(path, self.url) - def _pull_cached(self, out, to_info): + def _pull_cached(self, out, src, dest): with self.state: + tmp = PathInfo(tmp_fname(dest)) + target = (out.path_info.parent / src).relative_to(out.path_info) + src = tmp / target + + out.path_info = tmp + # Only pull unless all needed cache is present - if out.changed_cache(): - self.cloud.pull(out.get_used_cache()) + if out.changed_cache(filter_info=src): + self.cloud.pull(out.get_used_cache(filter_info=src)) + + failed = out.checkout(filter_info=src) + + move(src, dest) + remove(tmp) - out.path_info = to_info - failed = out.checkout() - # This might happen when pull haven't really pulled all the files if failed: raise FileNotFoundError diff --git a/dvc/output/base.py b/dvc/output/base.py index 7770c30cce..d59276da7a 100644 --- a/dvc/output/base.py +++ b/dvc/output/base.py @@ -162,11 +162,11 @@ def exists(self): def changed_checksum(self): return self.checksum != self.remote.get_checksum(self.path_info) - def changed_cache(self): + def changed_cache(self, filter_info=None): if not self.use_cache or not self.checksum: return True - return self.cache.changed_cache(self.checksum) + return self.cache.changed_cache(self.checksum, filter_info=filter_info) def status(self): if self.checksum and self.use_cache and self.changed_cache(): diff --git a/tests/func/test_get.py b/tests/func/test_get.py index aa6f576425..0d28208341 100644 --- a/tests/func/test_get.py +++ b/tests/func/test_get.py @@ -170,6 +170,33 @@ def test_get_from_non_dvc_master(tmp_dir, git_dir, caplog): assert (tmp_dir / "some_dst").read_text() == "some text" +def test_get_file_from_dir(tmp_dir, erepo_dir): + with erepo_dir.chdir(): + erepo_dir.dvc_gen( + { + "dir": { + "1": "1", + "2": "2", + "subdir": {"foo": "foo", "bar": "bar"} + } + }, + commit="create dir" + ) + + Repo.get(fspath(erepo_dir), os.path.join("dir", "1")) + assert (tmp_dir / "1").read_text() == "1" + + Repo.get(fspath(erepo_dir), os.path.join("dir", "2"), out="file") + assert (tmp_dir / "file").read_text() == "2" + + Repo.get(fspath(erepo_dir), os.path.join("dir", "subdir")) + assert (tmp_dir / "subdir" / "foo").read_text() == "foo" + assert (tmp_dir / "subdir" / "bar").read_text() == "bar" + + Repo.get(fspath(erepo_dir), os.path.join("dir", "subdir", "foo"), out="X") + assert (tmp_dir / "X").read_text() == "foo" + + def test_get_url_positive(tmp_dir, erepo_dir, caplog): with erepo_dir.chdir(): erepo_dir.dvc_gen("foo", "foo") diff --git a/tests/func/test_import.py b/tests/func/test_import.py index 21591351d0..4872bafaf0 100644 --- a/tests/func/test_import.py +++ b/tests/func/test_import.py @@ -102,6 +102,42 @@ def test_import_dir(tmp_dir, scm, dvc, erepo_dir): } +def test_get_file_from_dir(tmp_dir, scm, dvc, erepo_dir): + with erepo_dir.chdir(): + erepo_dir.dvc_gen( + { + "dir": { + "1": "1", + "2": "2", + "subdir": {"foo": "foo", "bar": "bar"} + } + }, + commit="create dir" + ) + + stage = dvc.imp(fspath(erepo_dir), os.path.join("dir", "1")) + + assert (tmp_dir / "1").read_text() == "1" + assert scm.repo.git.check_ignore("1") + assert stage.deps[0].def_repo == { + "url": fspath(erepo_dir), + "rev_lock": erepo_dir.scm.get_rev(), + } + + dvc.imp(fspath(erepo_dir), os.path.join("dir", "2"), out="file") + assert (tmp_dir / "file").read_text() == "2" + assert (tmp_dir / "file.dvc").exists() + + dvc.imp(fspath(erepo_dir), os.path.join("dir", "subdir")) + assert (tmp_dir / "subdir" / "foo").read_text() == "foo" + assert (tmp_dir / "subdir" / "bar").read_text() == "bar" + assert (tmp_dir / "subdir.dvc").exists() + + dvc.imp(fspath(erepo_dir), os.path.join("dir", "subdir", "foo"), out="X") + assert (tmp_dir / "X").read_text() == "foo" + assert (tmp_dir / "X.dvc").exists() + + def test_import_non_cached(erepo_dir, tmp_dir, dvc, scm): src = "non_cached_output" dst = src + "_imported" From 5200d6967d8a16153f3705f34241f53d317dbd9a Mon Sep 17 00:00:00 2001 From: "Restyled.io" Date: Wed, 12 Feb 2020 01:46:04 +0000 Subject: [PATCH 2/2] Restyled by black --- tests/func/test_get.py | 4 ++-- tests/func/test_import.py | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/tests/func/test_get.py b/tests/func/test_get.py index 0d28208341..ea7c3ffba0 100644 --- a/tests/func/test_get.py +++ b/tests/func/test_get.py @@ -177,10 +177,10 @@ def test_get_file_from_dir(tmp_dir, erepo_dir): "dir": { "1": "1", "2": "2", - "subdir": {"foo": "foo", "bar": "bar"} + "subdir": {"foo": "foo", "bar": "bar"}, } }, - commit="create dir" + commit="create dir", ) Repo.get(fspath(erepo_dir), os.path.join("dir", "1")) diff --git a/tests/func/test_import.py b/tests/func/test_import.py index 4872bafaf0..75acd380c7 100644 --- a/tests/func/test_import.py +++ b/tests/func/test_import.py @@ -109,10 +109,10 @@ def test_get_file_from_dir(tmp_dir, scm, dvc, erepo_dir): "dir": { "1": "1", "2": "2", - "subdir": {"foo": "foo", "bar": "bar"} + "subdir": {"foo": "foo", "bar": "bar"}, } }, - commit="create dir" + commit="create dir", ) stage = dvc.imp(fspath(erepo_dir), os.path.join("dir", "1"))