From f1ce7c214a62cb795ddd23998d45eafca06c9879 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Saugat=20Pachhai=20=28=E0=A4=B8=E0=A5=8C=E0=A4=97=E0=A4=BE?= =?UTF-8?q?=E0=A4=A4=29?= Date: Tue, 14 Jul 2020 17:34:52 +0545 Subject: [PATCH 1/8] get/list/import/api: subrepo support --- dvc/api.py | 36 ++-- dvc/dependency/repo.py | 27 +-- dvc/external_repo.py | 286 +++++++++++++++--------------- dvc/ignore.py | 6 +- dvc/repo/__init__.py | 46 ++--- dvc/repo/fetch.py | 7 +- dvc/repo/get.py | 26 ++- dvc/repo/ls.py | 14 +- dvc/repo/metrics/show.py | 2 +- dvc/repo/plots/__init__.py | 2 +- dvc/repo/tree.py | 186 ++++++++++++------- dvc/subrepos.py | 11 ++ dvc/tree/_debug.py | 21 +++ dvc/tree/git.py | 12 +- dvc/tree/local.py | 12 +- tests/dir_helpers.py | 16 +- tests/func/test_external_repo.py | 23 ++- tests/func/test_get.py | 47 +++++ tests/func/test_import.py | 47 ++++- tests/func/test_tree.py | 4 +- tests/unit/repo/test_repo_tree.py | 20 +-- 21 files changed, 515 insertions(+), 336 deletions(-) create mode 100644 dvc/subrepos.py create mode 100644 dvc/tree/_debug.py diff --git a/dvc/api.py b/dvc/api.py index a2d648aa17..84f857b68b 100644 --- a/dvc/api.py +++ b/dvc/api.py @@ -2,7 +2,12 @@ from contextlib import _GeneratorContextManager as GCM from contextlib import contextmanager -from dvc.exceptions import DvcException, NotDvcRepoError +from dvc.exceptions import ( + DvcException, + FileMissingError, + NotDvcRepoError, + PathMissingError, +) from dvc.external_repo import external_repo from dvc.repo import Repo @@ -26,10 +31,14 @@ def get_url(path, repo=None, rev=None, remote=None): directory in the remote storage. """ with _make_repo(repo, rev=rev) as _repo: - if not isinstance(_repo, Repo): - raise UrlNotDvcRepoError(_repo.url) # pylint: disable=no-member - out = _repo.find_out_by_relpath(path) - remote_obj = _repo.cloud.get_remote(remote) + # pylint: disable=no-member + path = os.path.join(_repo.root_dir, path) + is_erepo = not isinstance(_repo, Repo) + r = _repo.in_repo(path) if is_erepo else _repo + if is_erepo and not r: + raise UrlNotDvcRepoError(_repo.url) + out = r.find_out_by_relpath(path) + remote_obj = r.cloud.get_remote(remote) return str(remote_obj.tree.hash_to_path_info(out.checksum)) @@ -74,10 +83,17 @@ def __getattr__(self, name): def _open(path, repo=None, rev=None, remote=None, mode="r", encoding=None): with _make_repo(repo, rev=rev) as _repo: - with _repo.open_by_relpath( - path, remote=remote, mode=mode, encoding=encoding - ) as fd: - yield fd + is_erepo = not isinstance(_repo, Repo) + try: + with _repo.repo_tree.open_by_relpath( + path, remote=remote, mode=mode, encoding=encoding + ) as fd: + yield fd + except FileNotFoundError as exc: + if is_erepo: + # pylint: disable=no-member + raise PathMissingError(path, _repo.url) from exc + raise FileMissingError(path) from exc def read(path, repo=None, rev=None, remote=None, mode="r", encoding=None): @@ -101,5 +117,5 @@ def _make_repo(repo_url=None, rev=None): return except NotDvcRepoError: pass # fallthrough to external_repo - with external_repo(url=repo_url, rev=rev) as repo: + with external_repo(url=repo_url, rev=rev, stream=True) as repo: yield repo diff --git a/dvc/dependency/repo.py b/dvc/dependency/repo.py index eb3c58b5ce..fcc205b822 100644 --- a/dvc/dependency/repo.py +++ b/dvc/dependency/repo.py @@ -2,7 +2,6 @@ from voluptuous import Required -from dvc.exceptions import OutputNotFoundError from dvc.path_info import PathInfo from .local import LocalDependency @@ -42,30 +41,17 @@ def repo_pair(self): def __str__(self): return "{} ({})".format(self.def_path, self.def_repo[self.PARAM_URL]) - def _make_repo(self, *, locked=True): + def _make_repo(self, *, locked=True, **kwargs): from dvc.external_repo import external_repo d = self.def_repo rev = (d.get("rev_lock") if locked else None) or d.get("rev") - return external_repo(d["url"], rev=rev) + return external_repo(d["url"], rev=rev, **kwargs) def _get_checksum(self, locked=True): - from dvc.repo.tree import RepoTree - - with self._make_repo(locked=locked) as repo: - try: - return repo.find_out_by_relpath(self.def_path).info["md5"] - except OutputNotFoundError: - path = PathInfo(os.path.join(repo.root_dir, self.def_path)) - - # we want stream but not fetch, so DVC out directories are - # walked, but dir contents is not fetched - tree = RepoTree(repo, stream=True) - - # We are polluting our repo cache with some dir listing here - if tree.isdir(path): - return self.repo.cache.local.tree.get_hash(path, tree=tree) - return tree.get_file_hash(path) + with self._make_repo(locked=locked, stream=True) as repo: + path = PathInfo(os.path.join(repo.root_dir, self.def_path)) + return repo.get_checksum(path, self.repo.cache.local) def status(self): current_checksum = self._get_checksum(locked=True) @@ -88,8 +74,7 @@ def download(self, to): self.def_repo[self.PARAM_REV_LOCK] = repo.get_rev() cache = self.repo.cache.local - with repo.use_cache(cache): - _, _, cache_infos = repo.fetch_external([self.def_path]) + _, _, cache_infos = repo.fetch_external([self.def_path], cache) cache.checkout(to.path_info, cache_infos[0]) def update(self, rev=None): diff --git a/dvc/external_repo.py b/dvc/external_repo.py index 15332e7de1..766295691f 100644 --- a/dvc/external_repo.py +++ b/dvc/external_repo.py @@ -8,6 +8,7 @@ from funcy import cached_property, retry, wrap_with +from dvc import subrepos from dvc.config import NoRemoteError, NotDvcRepoError from dvc.exceptions import ( FileMissingError, @@ -21,14 +22,14 @@ from dvc.repo.tree import RepoTree from dvc.scm.base import CloneError from dvc.scm.git import Git -from dvc.tree.local import LocalTree +from dvc.tree import LocalTree from dvc.utils.fs import remove logger = logging.getLogger(__name__) @contextmanager -def external_repo(url, rev=None, for_write=False): +def external_repo(url, rev=None, for_write=False, **kwargs): logger.debug("Creating external repo %s@%s", url, rev) path = _cached_clone(url, rev, for_write=for_write) if not rev: @@ -36,11 +37,11 @@ def external_repo(url, rev=None, for_write=False): # (which may not be the default branch), use origin/HEAD here to get # the tip of the default branch rev = "refs/remotes/origin/HEAD" - try: - repo = ExternalRepo(path, url, rev, for_write=for_write) - except NotDvcRepoError: - repo = ExternalGitRepo(path, url, rev) + # TODO: What to do with for the `dvcx`? + # something like following, perhaps? + # repo = ExternalDVCRepo if _is_dvc_main_repo(path, rev) else ExternalRepo + repo = ExternalRepo(path, url, rev, for_write=for_write, **kwargs) try: yield repo except NoRemoteError: @@ -71,53 +72,124 @@ def clean_repos(): _remove(path) -class BaseExternalRepo: - # pylint: disable=no-member +def _fix_local_remote(orig_repo, src_repo, remote_name): + # If a remote URL is relative to the source repo, + # it will have changed upon config load and made + # relative to this new repo. Restore the old one here. + new_remote = orig_repo.config["remote"][remote_name] + old_remote = src_repo.config["remote"][remote_name] + if new_remote["url"] != old_remote["url"]: + new_remote["url"] = old_remote["url"] - _local_cache = None - def __str__(self): - return self.url +def _add_upstream(orig_repo, src_repo): + # Fill the empty upstream entry with a new remote pointing to the + # original repo's cache location. + orig_repo.config["remote"]["auto-generated-upstream"] = { + "url": src_repo.cache.local.cache_dir + } + orig_repo.config["core"]["remote"] = "auto-generated-upstream" - @property - def local_cache(self): - if hasattr(self, "cache"): - return self.cache.local - return self._local_cache - @contextmanager - def use_cache(self, cache): - """Use the specified cache in place of default tmpdir cache for - download operations. - """ - if hasattr(self, "cache"): - save_cache = self.cache.local - self.cache.local = cache - self._local_cache = cache +class ExternalRepo: + def __init__( + self, root_dir, url, rev, for_write=False, fetch=True, **kwargs + ): + self.root_dir = os.path.abspath(root_dir) + self.scm = Git(root_dir) + self.url = url + self.config = {"fetch": fetch, **kwargs} + self.for_write = for_write - yield + repo_kw = {} + if for_write: + tree = LocalTree(None, {"url": self.root_dir}) + else: + # .dvc folders are ignored by dvcignore which is required for + # `subrepos.find()`, hence using a separate tree + tree = self.scm.get_tree(rev) + repo_kw = dict(scm=self.scm, rev=rev) + + paths = subrepos.find(tree) + self.repos = [Repo(path, **repo_kw) for path in paths] + self._setup_cache_dir() + self.rev = rev - if hasattr(self, "cache"): - self.cache.local = save_cache - self._local_cache = None + for repo in self.repos: + repo.cache.local.cache_dir = self.cache_dir + if os.path.isdir(self.url): + self._fix_upstream(repo) + + @wrap_with(threading.Lock()) + def _setup_cache_dir(self): + # share same cache_dir among all subrepos + try: + self.cache_dir = CACHE_DIRS[self.url] + except KeyError: + self.cache_dir = CACHE_DIRS[self.url] = tempfile.mkdtemp( + "dvc-cache" + ) @cached_property - def repo_tree(self): - return RepoTree(self, fetch=True) + def tree(self): + kwargs = dict( + use_dvcignore=True, + dvcignore_root=self.root_dir, + ignore_subrepo=False, + ) + if self.for_write: + return LocalTree(None, {"url": self.root_dir}, **kwargs) + return self.scm.get_tree(rev=self.rev, **kwargs) - def get_rev(self): - if isinstance(self.tree, LocalTree): - return self.scm.get_rev() - return self.tree.rev + @cached_property + def repo_tree(self) -> "RepoTree": + return RepoTree(self.tree, subrepos=self.repos, **self.config) - def fetch_external(self, paths: Iterable, **kwargs): + @wrap_with(threading.Lock()) + @contextmanager + def with_cache(self, cache_dir, link_types=None): + for repo in self.repos: + repo.cache.local.cache_dir = cache_dir + if link_types: + # FIXME: not rolling back, should be fine for now + repo.cache.local.cache_types = link_types + yield + for repo in self.repos: + repo.cache.local.cache_dir = self.cache_dir + + def _fix_upstream(self, orig_repo): + try: + rel_path = os.path.relpath(orig_repo.root_dir, self.root_dir) + src_repo = Repo(PathInfo(self.url) / rel_path) + except NotDvcRepoError: + # If ExternalRepo does not throw NotDvcRepoError and Repo does, + # the self.url might be a bare git repo. + # NOTE: This will fail to resolve remote with relative path, + # same as if it was a remote DVC repo. + return + + try: + remote_name = orig_repo.config["core"].get("remote") + if remote_name: + _fix_local_remote(orig_repo, src_repo, remote_name) + else: + _add_upstream(orig_repo, src_repo) + finally: + src_repo.close() + + def close(self): + for repo in self.repos: + repo.close() + self.scm.close() + + def fetch_external(self, paths: Iterable, cache, **kwargs): """Fetch specified external repo paths into cache. - Returns 3-tuple in the form - (downloaded, failed, list(cache_infos)) - where cache_infos can be used as checkout targets for the - fetched paths. - """ + Returns 3-tuple in the form + (downloaded, failed, list(cache_infos)) + where cache_infos can be used as checkout targets for the + fetched paths. + """ download_results = [] failed = 0 @@ -130,120 +202,48 @@ def download_update(result): for path in paths: if not self.repo_tree.exists(path): raise PathMissingError(path, self.url) - save_info = self.local_cache.save( - path, - self.repo_tree, - None, - save_link=False, - download_callback=download_update, - ) + with self.with_cache(cache.cache_dir): + save_info = cache.save( + path, + self.repo_tree, + None, + save_link=False, + download_callback=download_update, + ) save_infos.append(save_info) return sum(download_results), failed, save_infos - def get_external(self, path, dest): + def get_external(self, src, dest): """Convenience wrapper for fetch_external and checkout.""" - if self.local_cache: - # fetch DVC and git files to tmpdir cache, then checkout - _, _, save_infos = self.fetch_external([path]) - self.local_cache.checkout(PathInfo(dest), save_infos[0]) + repo = self.in_repo(src) + if repo: + cache = repo.cache.local + _, _, save_infos = self.fetch_external([src], cache) + cache.checkout(PathInfo(dest), save_infos[0]) else: - # git-only erepo with no cache, just copy files directly - # to dest - path = PathInfo(self.root_dir) / path + path = PathInfo(self.root_dir) / src if not self.repo_tree.exists(path): - raise PathMissingError(path, self.url) + raise PathMissingError(src, self.url) self.repo_tree.copytree(path, dest) + def get_rev(self): + if isinstance(self.tree, LocalTree): + return self.scm.get_rev() + return self.tree.rev -class ExternalRepo(Repo, BaseExternalRepo): - def __init__(self, root_dir, url, rev, for_write=False): - if for_write: - super().__init__(root_dir) - else: - root_dir = os.path.realpath(root_dir) - super().__init__(root_dir, scm=Git(root_dir), rev=rev) - self.url = url - self._set_cache_dir() - self._fix_upstream() - - @wrap_with(threading.Lock()) - def _set_cache_dir(self): - try: - cache_dir = CACHE_DIRS[self.url] - except KeyError: - cache_dir = CACHE_DIRS[self.url] = tempfile.mkdtemp("dvc-cache") - - self.cache.local.cache_dir = cache_dir - self._local_cache = self.cache.local - - def _fix_upstream(self): - if not os.path.isdir(self.url): - return - - try: - src_repo = Repo(self.url) - except NotDvcRepoError: - # If ExternalRepo does not throw NotDvcRepoError and Repo does, - # the self.url might be a bare git repo. - # NOTE: This will fail to resolve remote with relative path, - # same as if it was a remote DVC repo. - return - - try: - remote_name = self.config["core"].get("remote") - if remote_name: - self._fix_local_remote(src_repo, remote_name) - else: - self._add_upstream(src_repo) - finally: - src_repo.close() - - def _fix_local_remote(self, src_repo, remote_name): - # If a remote URL is relative to the source repo, - # it will have changed upon config load and made - # relative to this new repo. Restore the old one here. - new_remote = self.config["remote"][remote_name] - old_remote = src_repo.config["remote"][remote_name] - if new_remote["url"] != old_remote["url"]: - new_remote["url"] = old_remote["url"] - - def _add_upstream(self, src_repo): - # Fill the empty upstream entry with a new remote pointing to the - # original repo's cache location. - cache_dir = src_repo.cache.local.cache_dir - self.config["remote"]["auto-generated-upstream"] = {"url": cache_dir} - self.config["core"]["remote"] = "auto-generated-upstream" - - -class ExternalGitRepo(BaseExternalRepo): - def __init__(self, root_dir, url, rev): - self.root_dir = os.path.realpath(root_dir) - self.url = url - self.tree = self.scm.get_tree(rev) - - @cached_property - def scm(self): - return Git(self.root_dir) - - def close(self): - if "scm" in self.__dict__: - self.scm.close() + def get_checksum(self, path_info, cache): + if self.repo_tree.isdir(path_info): + return cache.tree.get_hash(path_info, tree=self.repo_tree) + return self.repo_tree.get_file_hash(path_info) - def find_out_by_relpath(self, path): - raise OutputNotFoundError(path, self) + def in_repo(self, path): + tree = self.repo_tree.in_subtree(path) + return tree.repo if tree else None - @contextmanager - def open_by_relpath(self, path, mode="r", encoding=None, **kwargs): - """Opens a specified resource as a file object.""" - tree = RepoTree(self) - try: - with tree.open( - path, mode=mode, encoding=encoding, **kwargs - ) as fobj: - yield fobj - except FileNotFoundError: - raise PathMissingError(path, self.url) + @property + def main_repo(self): + return self.in_repo(os.curdir) def _cached_clone(url, rev, for_write=False): diff --git a/dvc/ignore.py b/dvc/ignore.py index 05463403cf..ebee0bf4fa 100644 --- a/dvc/ignore.py +++ b/dvc/ignore.py @@ -120,7 +120,7 @@ def _is_dvc_repo(root, directory): return os.path.isdir(os.path.join(root, directory, Repo.DVC_DIR)) - def __init__(self, tree, root_dir): + def __init__(self, tree, root_dir, ignore_subrepo=True): from dvc.repo import Repo default_ignore_patterns = [".hg/", ".git/", "{}/".format(Repo.DVC_DIR)] @@ -131,9 +131,11 @@ def __init__(self, tree, root_dir): self.ignores_trie_tree[root_dir] = DvcIgnorePatterns( default_ignore_patterns, root_dir ) + for root, dirs, _ in self.tree.walk(self.root_dir): self._update(root) - self._update_sub_repo(root, dirs) + if ignore_subrepo: + self._update_sub_repo(root, dirs) dirs[:], _ = self(root, dirs, []) def _update(self, dirname): diff --git a/dvc/repo/__init__.py b/dvc/repo/__init__.py index 6d98682727..f6aeb5f5b9 100644 --- a/dvc/repo/__init__.py +++ b/dvc/repo/__init__.py @@ -1,14 +1,11 @@ import logging import os -from contextlib import contextmanager from functools import wraps from funcy import cached_property, cat, first from dvc.config import Config from dvc.dvcfile import PIPELINE_FILE, Dvcfile, is_valid_filename -from dvc.exceptions import FileMissingError -from dvc.exceptions import IsADirectoryError as DvcIsADirectoryError from dvc.exceptions import ( NoOutputOrStageError, NotDvcRepoError, @@ -153,22 +150,29 @@ def tree(self, tree): def __repr__(self): return f"{self.__class__.__name__}: '{self.root_dir}'" + @cached_property + def repo_tree(self): + return RepoTree(self.tree, [self], stream=True) + @classmethod def find_root(cls, root=None, tree=None): + # TODO (@skshetry): Verify and refactor this root_dir = os.path.realpath(root or os.curdir) - if tree: - if tree.isdir(os.path.join(root_dir, cls.DVC_DIR)): - return root_dir - raise NotDvcRepoError(f"'{root}' does not contain DVC directory") - - if not os.path.isdir(root_dir): - raise NotDvcRepoError(f"directory '{root}' does not exist") + is_dir = tree.isdir if tree else os.path.isdir while True: dvc_dir = os.path.join(root_dir, cls.DVC_DIR) - if os.path.isdir(dvc_dir): + if is_dir(dvc_dir): return root_dir + if ( + tree + and os.path.dirname(os.path.abspath(tree.tree_root)) + == root_dir + ): + raise NotDvcRepoError( + f"'{root}' does not contain DVC directory" + ) if os.path.ismount(root_dir): break root_dir = os.path.dirname(root_dir) @@ -580,26 +584,6 @@ def is_dvc_internal(self, path): path_parts = os.path.normpath(path).split(os.path.sep) return self.DVC_DIR in path_parts - @contextmanager - def open_by_relpath(self, path, remote=None, mode="r", encoding=None): - """Opens a specified resource as a file descriptor""" - - tree = RepoTree(self, stream=True) - path = os.path.join(self.root_dir, path) - try: - with self.state: - with tree.open( - os.path.join(self.root_dir, path), - mode=mode, - encoding=encoding, - remote=remote, - ) as fobj: - yield fobj - except FileNotFoundError as exc: - raise FileMissingError(path) from exc - except IsADirectoryError as exc: - raise DvcIsADirectoryError(f"'{path}' is a directory") from exc - def close(self): self.scm.close() diff --git a/dvc/repo/fetch.py b/dvc/repo/fetch.py index fe3cd9c6f6..3814589445 100644 --- a/dvc/repo/fetch.py +++ b/dvc/repo/fetch.py @@ -81,10 +81,9 @@ def _fetch_external(self, repo_url, repo_rev, files, jobs): failed, downloaded = 0, 0 try: with external_repo(repo_url, repo_rev) as repo: - with repo.use_cache(self.cache.local): - d, f, _ = repo.fetch_external(files, jobs=jobs) - downloaded += d - failed += f + d, f, _ = repo.fetch_external(files, self.cache.local, jobs=jobs) + downloaded += d + failed += f except CloneError: failed += 1 logger.exception( diff --git a/dvc/repo/get.py b/dvc/repo/get.py index 1141b00772..4ae5c23f6c 100644 --- a/dvc/repo/get.py +++ b/dvc/repo/get.py @@ -37,19 +37,17 @@ def get(url, path, out=None, rev=None): tmp_dir = os.path.join(dpath, "." + str(shortuuid.uuid())) try: with external_repo(url=url, rev=rev) as repo: - if hasattr(repo, "cache"): - repo.cache.local.cache_dir = tmp_dir - - # Try any links possible to avoid data duplication. - # - # Not using symlink, because we need to remove cache after we - # are done, and to make that work we would have to copy data - # over anyway before removing the cache, so we might just copy - # it right away. - # - # Also, we can't use theoretical "move" link type here, because - # the same cache file might be used a few times in a directory. - repo.cache.local.cache_types = ["reflink", "hardlink", "copy"] - repo.get_external(path, out) + # Try any links possible to avoid data duplication. + # + # Not using symlink, because we need to remove cache after we + # are done, and to make that work we would have to copy data + # over anyway before removing the cache, so we might just copy + # it right away. + # + # Also, we can't use theoretical "move" link type here, because + # the same cache file might be used a few times in a directory. + link_types = ["reflink", "hardlink", "copy"] + with repo.with_cache(tmp_dir, link_types=link_types): + repo.get_external(path, out) finally: remove(tmp_dir) diff --git a/dvc/repo/ls.py b/dvc/repo/ls.py index e02d52116a..1f3ae17d60 100644 --- a/dvc/repo/ls.py +++ b/dvc/repo/ls.py @@ -29,7 +29,7 @@ def ls( """ from dvc.external_repo import external_repo - with external_repo(url, rev) as repo: + with external_repo(url, rev, stream=True) as repo: path_info = PathInfo(repo.root_dir) if path: path_info /= path @@ -48,15 +48,10 @@ def ls( def _ls(repo, path_info, recursive=None, dvc_only=False): - from dvc.repo.tree import RepoTree - def onerror(exc): raise exc - # use our own RepoTree instance instead of repo.repo_tree since we want to - # fetch directory listings, but don't want to fetch file contents. - tree = RepoTree(repo, stream=True) - + tree = repo.repo_tree ret = {} try: for root, dirs, files in tree.walk( @@ -76,9 +71,8 @@ def onerror(exc): if not recursive: for dname in dirs: info = PathInfo(root) / dname - if not dvc_only or ( - tree.dvctree and tree.dvctree.exists(info) - ): + subtree = tree.in_subtree(info) + if not dvc_only or (subtree and subtree.exists(info)): dvc = tree.isdvc(info) path = str(info.relative_to(path_info)) ret[path] = { diff --git a/dvc/repo/metrics/show.py b/dvc/repo/metrics/show.py index cdd787f005..e57c71c56d 100644 --- a/dvc/repo/metrics/show.py +++ b/dvc/repo/metrics/show.py @@ -53,7 +53,7 @@ def _extract_metrics(metrics): def _read_metrics(repo, metrics, rev): - tree = RepoTree(repo) + tree = RepoTree(repo.tree, [repo]) res = {} for metric in metrics: diff --git a/dvc/repo/plots/__init__.py b/dvc/repo/plots/__init__.py index 4386ca5613..8ce3585493 100644 --- a/dvc/repo/plots/__init__.py +++ b/dvc/repo/plots/__init__.py @@ -44,7 +44,7 @@ def collect(self, targets=None, revs=None): continue rev = rev or "workspace" - tree = RepoTree(self.repo) + tree = RepoTree(self.repo.tree, [self.repo]) plots = _collect_plots(self.repo, targets, rev) for path_info, props in plots.items(): datafile = relpath(path_info, self.repo.root_dir) diff --git a/dvc/repo/tree.py b/dvc/repo/tree.py index 2b7810c29d..df4a7c2b41 100644 --- a/dvc/repo/tree.py +++ b/dvc/repo/tree.py @@ -1,5 +1,8 @@ import logging import os +from typing import Optional + +from funcy import cached_property, post_processing from dvc.dvcfile import is_valid_filename from dvc.exceptions import OutputNotFoundError @@ -245,25 +248,36 @@ class RepoTree(BaseTree): # pylint:disable=abstract-method Any kwargs will be passed to `DvcTree()`. """ - def __init__(self, repo, **kwargs): - super().__init__(repo, {"url": repo.root_dir}) - if hasattr(repo, "dvc_dir"): - self.dvctree = DvcTree(repo, **kwargs) - else: - # git-only erepo's do not need dvctree - self.dvctree = None + def __init__( + self, tree, subrepos=None, **kwargs + ): # pylint: disable=super-init-not-called + subrepos = subrepos or [] + subrepos.sort(key=lambda r: len(r.root_dir), reverse=True) + dvctrees = [DvcTree(repo, **kwargs) for repo in subrepos] + self._kwargs = kwargs + self._dvctrees = { + os.path.abspath(tree.repo.root_dir): tree for tree in dvctrees + } + self.tree = tree @property def fetch(self): - if self.dvctree: - return self.dvctree.fetch - return False + return self._kwargs.get("fetch", False) @property def stream(self): - if self.dvctree: - return self.dvctree.stream - return False + return self._kwargs.get("stream", False) + + def _find_subtree(self, path_prefix) -> Optional[DvcTree]: + return self._find_subtree_with_prefix(path_prefix)[1] + + def _find_subtree_with_prefix(self, path): + # dvctrees is already ordered from low to high + path_prefix = os.path.abspath(path) + for pref, tree in self._dvctrees.items(): + if os.path.abspath(path_prefix).startswith(pref): + return pref, tree + return "", None def open( self, path, mode="r", encoding="utf-8", **kwargs @@ -271,52 +285,71 @@ def open( if "b" in mode: encoding = None - if self.dvctree and self.dvctree.exists(path): - return self.dvctree.open( - path, mode=mode, encoding=encoding, **kwargs - ) - return self.repo.tree.open(path, mode=mode, encoding=encoding) + subtree = self._find_subtree(path) + if subtree and subtree.exists(path): + return subtree.open(path, mode=mode, encoding=encoding, **kwargs) + return self.tree.open(path, mode=mode, encoding=encoding) - def exists( - self, path, use_dvcignore=True - ): # pylint: disable=arguments-differ - return self.repo.tree.exists(path) or ( - self.dvctree and self.dvctree.exists(path) - ) + def open_by_relpath(self, path, *args, **kwargs): + return self.open(PathInfo(self.tree.tree_root, path), *args, **kwargs) + + def exists(self, path): # pylint: disable=arguments-differ + subtree = self._find_subtree(path) + return self.tree.exists(path) or (subtree and subtree.exists(path)) def isdir(self, path): # pylint: disable=arguments-differ - return self.repo.tree.isdir(path) or ( - self.dvctree and self.dvctree.isdir(path) - ) + subtree = self._find_subtree(path) + return self.tree.isdir(path) or (subtree and subtree.isdir(path)) def isdvc(self, path, **kwargs): - return self.dvctree is not None and self.dvctree.isdvc(path, **kwargs) + subtree = self._find_subtree(path) + return subtree is not None and subtree.isdvc(path, **kwargs) def isfile(self, path): # pylint: disable=arguments-differ - return self.repo.tree.isfile(path) or ( - self.dvctree and self.dvctree.isfile(path) - ) + subtree = self._find_subtree(path) + return self.tree.isfile(path) or (subtree and subtree.isfile(path)) def isexec(self, path): - if self.dvctree and self.dvctree.exists(path): - return self.dvctree.isexec(path) - return self.repo.tree.isexec(path) + subtree = self._find_subtree(path) + if subtree and subtree.exists(path): + return subtree.isexec(path) + return self.tree.isexec(path) def stat(self, path): - return self.repo.tree.stat(path) + return self.tree.stat(path) - def _walk_one(self, walk): + def _dvc_walk(self, walk): try: root, dirs, files = next(walk) except StopIteration: return yield root, dirs, files for _ in dirs: - yield from self._walk_one(walk) - - def _walk(self, dvc_walk, repo_walk, dvcfiles=False): + yield from self._dvc_walk(walk) + + def _repo_walk(self, dir_path, walk, **kwargs): + assert os.path.isabs(dir_path) + subtree = self._dvctrees.get(dir_path) + if subtree: + dvc_walk = subtree.walk(dir_path, topdown=True) + yield from self._walk(walk, dvc_walk, **kwargs) + else: + try: + root, dirs, files = next(walk) + except StopIteration: + return + yield root, dirs, files + for dirname in dirs: + yield from self._repo_walk( + os.path.join(root, dirname), walk, **kwargs + ) + + def _walk(self, repo_walk, dvc_walk=None, dvcfiles=False): + assert repo_walk try: - _, dvc_dirs, dvc_fnames = next(dvc_walk) + _, dvc_dirs, dvc_fnames = ( + next(dvc_walk) if dvc_walk else (None, [], []) + ) repo_root, repo_dirs, repo_fnames = next(repo_walk) except StopIteration: return @@ -345,11 +378,15 @@ def _walk(self, dvc_walk, repo_walk, dvcfiles=False): for dirname in dirs: if dirname in shared: - yield from self._walk(dvc_walk, repo_walk, dvcfiles=dvcfiles) + yield from self._walk(repo_walk, dvc_walk, dvcfiles=dvcfiles) elif dirname in dvc_set: - yield from self._walk_one(dvc_walk) + yield from self._dvc_walk(dvc_walk) elif dirname in repo_set: - yield from self._walk_one(repo_walk) + yield from self._repo_walk( + os.path.join(repo_root, dirname), + repo_walk, + dvcfiles=dvcfiles, + ) def walk( self, top, topdown=True, onerror=None, dvcfiles=False, **kwargs @@ -379,22 +416,25 @@ def walk( onerror(NotADirectoryError(top)) return - dvc_exists = self.dvctree and self.dvctree.exists(top) - repo_exists = self.repo.tree.exists(top) + subtree = self._find_subtree(top) + dvc_exists = subtree and subtree.exists(top) + repo_exists = self.tree.exists(top) if dvc_exists and not repo_exists: - yield from self.dvctree.walk( + yield from subtree.walk( top, topdown=topdown, onerror=onerror, **kwargs ) - return - if repo_exists and not dvc_exists: - yield from self.repo.tree.walk( - top, topdown=topdown, onerror=onerror + if not dvc_exists and not repo_exists: + repo_walk = self.tree.walk(top, topdown=topdown, onerror=onerror) + yield from self._repo_walk( + os.path.abspath(top), repo_walk, dvcfiles=dvcfiles ) - return - - dvc_walk = self.dvctree.walk(top, topdown=topdown, **kwargs) - repo_walk = self.repo.tree.walk(top, topdown=topdown) - yield from self._walk(dvc_walk, repo_walk, dvcfiles=dvcfiles) + dvc_walk = ( + subtree.walk(top, topdown=topdown, **kwargs) + if dvc_exists + else None + ) + repo_walk = self.tree.walk(top, topdown=topdown) + yield from self._walk(repo_walk, dvc_walk, dvcfiles=dvcfiles) def walk_files(self, top, **kwargs): # pylint: disable=arguments-differ for root, _, files in self.walk(top, **kwargs): @@ -410,11 +450,10 @@ def get_file_hash(self, path_info): """ if not self.exists(path_info): raise FileNotFoundError - if self.dvctree and self.dvctree.exists(path_info): - try: - return self.dvctree.get_file_hash(path_info) - except OutputNotFoundError: - pass + subtree = self._find_subtree(path_info) + subtree_exists = subtree is not None and subtree.exists(path_info) + if subtree_exists: + return subtree.get_file_hash(path_info) return file_md5(path_info, self)[0] def copytree(self, top, dest): @@ -439,6 +478,27 @@ def copytree(self, top, dest): with self.open(src, mode="rb") as fobj: copy_fobj_to_file(fobj, dest_dir / fname) - @property - def hash_jobs(self): # pylint: disable=invalid-overridden-method - return self.repo.tree.hash_jobs + @cached_property + def hash_jobs(self): + return self.tree.hash_jobs + + def in_subtree(self, path): + return self._find_subtree(path) + + @post_processing("\r\n".join) + def _visualize(self, top, **kwargs): + """`tree`-like output, useful for debugging/visualizing""" + indent = 4 + spacing = " " * indent + tee = "├── " + last = "└── " + for root, _, files in self.walk(top, **kwargs): + level = root.replace(top, "").count(os.sep) + indent = spacing * level + yield "{}{}/".format(indent, os.path.basename(root)) + sub_indent = spacing * (level + 1) + length = len(files) + for i, f in enumerate(files): + yield "{}{}{}".format( + sub_indent, tee if i + 1 != length else last, f + ) diff --git a/dvc/subrepos.py b/dvc/subrepos.py new file mode 100644 index 0000000000..0460a054d7 --- /dev/null +++ b/dvc/subrepos.py @@ -0,0 +1,11 @@ +import os + +from funcy import collecting + + +@collecting +def find(tree, top=None): + top = top or tree.tree_root + for root, _, _ in tree.walk(top): + if tree.isdir(os.path.join(root, ".dvc")): + yield root diff --git a/dvc/tree/_debug.py b/dvc/tree/_debug.py new file mode 100644 index 0000000000..468969dac2 --- /dev/null +++ b/dvc/tree/_debug.py @@ -0,0 +1,21 @@ +import os +from funcy import post_processing + + +@post_processing("\r\n".join) +def visualize(tree, top, **kwargs): + """`tree`-like output, useful for debugging/visualizing, needs `walk()`""" + indent = 4 + spacing = " " * indent + tee = "├── " + last = "└── " + for root, _, files in tree.walk(top, **kwargs): + level = root.replace(top, "").count(os.sep) + indent = spacing * level + yield "{}{}/".format(indent, os.path.basename(root)) + sub_indent = spacing * (level + 1) + length = len(files) + for i, f in enumerate(files): + yield "{}{}{}".format( + sub_indent, tee if i + 1 != length else last, f + ) diff --git a/dvc/tree/git.py b/dvc/tree/git.py index ff5be8209f..2149b370f1 100644 --- a/dvc/tree/git.py +++ b/dvc/tree/git.py @@ -24,7 +24,14 @@ def _item_basename(item): class GitTree(BaseTree): # pylint:disable=abstract-method """Proxies the repo file access methods to Git objects""" - def __init__(self, git, rev, use_dvcignore=False, dvcignore_root=None): + def __init__( + self, + git, + rev, + use_dvcignore=False, + dvcignore_root=None, + ignore_subrepo=True, + ): """Create GitTree instance Args: @@ -36,6 +43,7 @@ def __init__(self, git, rev, use_dvcignore=False, dvcignore_root=None): self.rev = rev self.use_dvcignore = use_dvcignore self.dvcignore_root = dvcignore_root + self.ignore_subrepo = ignore_subrepo @property def tree_root(self): @@ -49,7 +57,7 @@ def dvcignore(self): if not self.use_dvcignore: return DvcIgnoreFilterNoop(self, root) self.use_dvcignore = False - ret = DvcIgnoreFilter(self, root) + ret = DvcIgnoreFilter(self, root, ignore_subrepo=self.ignore_subrepo) self.use_dvcignore = True return ret diff --git a/dvc/tree/local.py b/dvc/tree/local.py index 24361f4c8d..198cd4d65c 100644 --- a/dvc/tree/local.py +++ b/dvc/tree/local.py @@ -36,12 +36,20 @@ class LocalTree(BaseTree): CACHE_MODE = 0o444 SHARED_MODE_MAP = {None: (0o644, 0o755), "group": (0o664, 0o775)} - def __init__(self, repo, config, use_dvcignore=False, dvcignore_root=None): + def __init__( + self, + repo, + config, + use_dvcignore=False, + dvcignore_root=None, + ignore_subrepo=False, + ): super().__init__(repo, config) url = config.get("url") self.path_info = self.PATH_CLS(url) if url else None self.use_dvcignore = use_dvcignore self.dvcignore_root = dvcignore_root + self.ignore_subrepo = ignore_subrepo @property def tree_root(self): @@ -61,7 +69,7 @@ def dvcignore(self): if not self.use_dvcignore: return DvcIgnoreFilterNoop(self, root) self.use_dvcignore = False - ret = DvcIgnoreFilter(self, root) + ret = DvcIgnoreFilter(self, root, ignore_subrepo=self.ignore_subrepo) self.use_dvcignore = True return ret diff --git a/tests/dir_helpers.py b/tests/dir_helpers.py index 02ae154fd6..ec192a77ba 100644 --- a/tests/dir_helpers.py +++ b/tests/dir_helpers.py @@ -87,7 +87,7 @@ def __new__(cls, *args, **kwargs): self._init() return self - def init(self, *, scm=False, dvc=False): + def init(self, *, scm=False, dvc=False, subdir=False): from dvc.repo import Repo from dvc.scm.git import Git @@ -100,7 +100,9 @@ def init(self, *, scm=False, dvc=False): git_init(str_path) if dvc: self.dvc = Repo.init( - str_path, no_scm=not scm and not hasattr(self, "scm") + str_path, + no_scm=not scm and not hasattr(self, "scm"), + subdir=subdir, ) if scm: self.scm = self.dvc.scm if hasattr(self, "dvc") else Git(str_path) @@ -123,10 +125,10 @@ def gen(self, struct, text=""): if isinstance(struct, (str, bytes, pathlib.PurePath)): struct = {struct: text} - self._gen(struct) - return struct.keys() + return self._gen(struct) def _gen(self, struct, prefix=None): + paths = [] for name, contents in struct.items(): path = (prefix or self) / name @@ -141,6 +143,8 @@ def _gen(self, struct, prefix=None): path.write_bytes(contents) else: path.write_text(contents, encoding="utf-8") + paths.append(path) + return paths def dvc_gen(self, struct, text="", commit=None): paths = self.gen(struct, text) @@ -249,10 +253,10 @@ class PosixTmpDir(TmpDir, pathlib.PurePosixPath): @pytest.fixture(scope="session") def make_tmp_dir(tmp_path_factory, request): - def make(name, *, scm=False, dvc=False): + def make(name, *, scm=False, dvc=False, **kwargs): path = tmp_path_factory.mktemp(name) if isinstance(name, str) else name new_dir = TmpDir(path) - new_dir.init(scm=scm, dvc=dvc) + new_dir.init(scm=scm, dvc=dvc, **kwargs) request.addfinalizer(new_dir.close) return new_dir diff --git a/tests/func/test_external_repo.py b/tests/func/test_external_repo.py index 50b142dd48..9f51899af9 100644 --- a/tests/func/test_external_repo.py +++ b/tests/func/test_external_repo.py @@ -3,7 +3,6 @@ from mock import ANY, patch from dvc.external_repo import CLONES, external_repo -from dvc.path_info import PathInfo from dvc.scm.git import Git from dvc.tree.local import LocalTree from dvc.utils import relpath @@ -20,11 +19,11 @@ def test_external_repo(erepo_dir): with patch.object(Git, "clone", wraps=Git.clone) as mock: with external_repo(url) as repo: - with repo.open_by_relpath("file") as fd: + with repo.repo_tree.open_by_relpath("file") as fd: assert fd.read() == "master" with external_repo(url, rev="branch") as repo: - with repo.open_by_relpath("file") as fd: + with repo.repo_tree.open_by_relpath("file") as fd: assert fd.read() == "branch" assert mock.call_count == 1 @@ -43,7 +42,7 @@ def test_source_change(erepo_dir): assert old_rev != new_rev -def test_cache_reused(erepo_dir, mocker, local_cloud): +def test_cache_reused(tmp_dir, erepo_dir, mocker, local_cloud): erepo_dir.add_remote(config=local_cloud.config) with erepo_dir.chdir(): erepo_dir.dvc_gen("file", "text", commit="add file") @@ -54,13 +53,13 @@ def test_cache_reused(erepo_dir, mocker, local_cloud): # Use URL to prevent any fishy optimizations url = f"file://{erepo_dir}" with external_repo(url) as repo: - repo.fetch() + repo.get_external("file", tmp_dir / "file1") assert download_spy.mock.call_count == 1 # Should not download second time erepo_dir.scm.branch("branch") with external_repo(url, "branch") as repo: - repo.fetch() + repo.get_external("file", tmp_dir / "file2") assert download_spy.mock.call_count == 1 @@ -90,10 +89,7 @@ def test_pull_subdir_file(tmp_dir, erepo_dir): dest = tmp_dir / "file" with external_repo(os.fspath(erepo_dir)) as repo: - _, _, save_infos = repo.fetch_external( - [os.path.join("subdir", "file")] - ) - repo.cache.local.checkout(PathInfo(dest), save_infos[0]) + repo.get_external(os.path.join("subdir", "file"), dest) assert dest.is_file() assert dest.read_text() == "contents" @@ -117,9 +113,10 @@ def test_relative_remote(erepo_dir, tmp_dir): url = os.fspath(erepo_dir) with external_repo(url) as repo: - assert os.path.isabs(repo.config["remote"]["upstream"]["url"]) - assert os.path.isdir(repo.config["remote"]["upstream"]["url"]) - with repo.open_by_relpath("file") as fd: + for subrepo in repo.repos: + assert os.path.isabs(subrepo.config["remote"]["upstream"]["url"]) + assert os.path.isdir(subrepo.config["remote"]["upstream"]["url"]) + with repo.repo_tree.open_by_relpath("file") as fd: assert fd.read() == "contents" diff --git a/tests/func/test_get.py b/tests/func/test_get.py index 72736f8077..60b4270b46 100644 --- a/tests/func/test_get.py +++ b/tests/func/test_get.py @@ -252,3 +252,50 @@ def test_get_pipeline_tracked_outs( with git_dir.chdir(): Repo.get("file:///{}".format(os.fspath(tmp_dir)), "bar", out="baz") assert (git_dir / "baz").read_text() == "foo" + + +def make_subrepo(dir_, scm, config): + dir_.mkdir(parents=True) + with dir_.chdir(): + dir_.scm = scm + dir_.init(dvc=True, subdir=True) + dir_.add_remote(config=config) + + +@pytest.mark.parametrize( + "output", + [ + "foo", + {"foo": "foo", "bar": "bar"}, + {"subdir": {"foo": "foo", "bar": "bar"}}, + ], + ids=["file", "dir", "nested_dir"], +) +@pytest.mark.parametrize( + "erepo", [pytest.lazy_fixture("erepo_dir"), pytest.lazy_fixture("git_dir")] +) +@pytest.mark.parametrize( + "subrepo_paths", + [ + (os.path.join("sub", "subdir1"), os.path.join("sub", "subdir2")), + (os.path.join("sub"), os.path.join("sub", "subdir1")), + ], + ids=["isolated", "nested"], +) +def test_subrepo_multiple( + tmp_dir, scm, output, subrepo_paths, erepo, local_cloud +): + sub_repos = [erepo / path for path in subrepo_paths] + filename = "output" + for repo in sub_repos: + make_subrepo(repo, erepo.scm, local_cloud.config) + repo.dvc_gen({filename: output}, commit="add subrepo") + repo.dvc.push() + + for i, repo in enumerate(sub_repos): + Repo.get( + f"file:///{erepo}", + str((repo / filename).relative_to(erepo)), + out=f"{filename}-{i}", + ) + assert (tmp_dir / f"{filename}-{i}").read_text() == output diff --git a/tests/func/test_import.py b/tests/func/test_import.py index 4e38f70d89..d55dc0585c 100644 --- a/tests/func/test_import.py +++ b/tests/func/test_import.py @@ -9,8 +9,10 @@ from dvc.config import NoRemoteError from dvc.dvcfile import Dvcfile from dvc.exceptions import DownloadError, PathMissingError +from dvc.scm.base import CloneError from dvc.system import System from dvc.utils.fs import makedirs, remove +from tests.func.test_get import make_subrepo def test_import(tmp_dir, scm, dvc, erepo_dir): @@ -234,7 +236,8 @@ def test_download_error_pulling_imported_stage(tmp_dir, dvc, erepo_dir): remove(dst_cache) with patch( - "dvc.tree.local.LocalTree._download", side_effect=Exception + "dvc.external_repo.external_repo", + side_effect=CloneError(os.fspath(erepo_dir), "somewhere"), ), pytest.raises(DownloadError): dvc.pull(["foo_imported.dvc"]) @@ -345,3 +348,45 @@ def test_local_import(tmp_dir, dvc, scm): tmp_dir.dvc_gen("foo", "foo", commit="init") (tmp_dir / "outdir").mkdir() dvc.imp(".", "foo", out="outdir") + + +@pytest.mark.parametrize( + "output", + [ + "foo", + {"foo": "foo", "bar": "bar"}, + {"subdir": {"foo": "foo", "bar": "bar"}}, + ], + ids=["file", "dir", "nested_dir"], +) +@pytest.mark.parametrize( + "erepo", [pytest.lazy_fixture("erepo_dir"), pytest.lazy_fixture("git_dir")] +) +@pytest.mark.parametrize( + "subrepo_paths", + [ + (os.path.join("sub", "subdir1"), os.path.join("sub", "subdir2")), + (os.path.join("sub"), os.path.join("sub", "subdir1")), + ], + ids=["isolated", "nested"], +) +def test_subrepo_import( + tmp_dir, scm, dvc, output, subrepo_paths, erepo, local_cloud +): + sub_repos = [erepo / path for path in subrepo_paths] + filename = "output" + for repo in sub_repos: + make_subrepo(repo, erepo.scm, local_cloud.config) + repo.dvc_gen({filename: output}, commit="add subrepo") + repo.dvc.push() + + rev = erepo.scm.get_rev() + for i, repo in enumerate(sub_repos): + url = f"file:///{erepo}" + file = str((repo / filename).relative_to(erepo)) + out = f"{filename}-{i}" + + stage = dvc.imp(url, file, out=out) + + assert stage.deps[0].def_repo == {"url": url, "rev_lock": rev} + assert (tmp_dir / out).read_text() == output diff --git a/tests/func/test_tree.py b/tests/func/test_tree.py index 93b671bd75..74585c0e1d 100644 --- a/tests/func/test_tree.py +++ b/tests/func/test_tree.py @@ -184,7 +184,7 @@ def test_repotree_walk_fetch(tmp_dir, dvc, scm, local_remote): dvc.push() remove(dvc.cache.local.cache_dir) - tree = RepoTree(dvc, fetch=True) + tree = RepoTree(dvc.tree, [dvc], fetch=True) with dvc.state: for _, _, _ in tree.walk("dir"): pass @@ -208,7 +208,7 @@ def test_repotree_cache_save(tmp_dir, dvc, scm, erepo_dir, local_cloud): # # for this test, all file objects are being opened() and copied from tree # into dvc.cache, not fetched or streamed from a remote - tree = RepoTree(erepo_dir.dvc, stream=True) + tree = RepoTree(erepo_dir.dvc.tree, [erepo_dir.dvc], stream=True) expected = [ tree.get_file_hash(PathInfo(erepo_dir / path)) for path in ("dir/bar", "dir/subdir/foo") diff --git a/tests/unit/repo/test_repo_tree.py b/tests/unit/repo/test_repo_tree.py index 417b2c0b92..e1da5f4880 100644 --- a/tests/unit/repo/test_repo_tree.py +++ b/tests/unit/repo/test_repo_tree.py @@ -12,7 +12,7 @@ def test_exists(tmp_dir, dvc): dvc.add("foo") (tmp_dir / "foo").unlink() - tree = RepoTree(dvc) + tree = RepoTree(dvc.tree, [dvc]) assert tree.exists("foo") @@ -21,7 +21,7 @@ def test_open(tmp_dir, dvc): dvc.add("foo") (tmp_dir / "foo").unlink() - tree = RepoTree(dvc) + tree = RepoTree(dvc.tree, [dvc]) with dvc.state: with tree.open("foo", "r") as fobj: assert fobj.read() == "foo" @@ -31,7 +31,7 @@ def test_open_dirty_hash(tmp_dir, dvc): tmp_dir.dvc_gen("file", "file") (tmp_dir / "file").write_text("something") - tree = RepoTree(dvc) + tree = RepoTree(dvc.tree, [dvc]) with tree.open("file", "r") as fobj: assert fobj.read() == "something" @@ -40,7 +40,7 @@ def test_open_dirty_no_hash(tmp_dir, dvc): tmp_dir.gen("file", "file") (tmp_dir / "file.dvc").write_text("outs:\n- path: file\n") - tree = RepoTree(dvc) + tree = RepoTree(dvc.tree, [dvc]) with tree.open("file", "r") as fobj: assert fobj.read() == "file" @@ -60,7 +60,7 @@ def test_open_in_history(tmp_dir, scm, dvc): if rev == "workspace": continue - tree = RepoTree(dvc) + tree = RepoTree(dvc.tree, [dvc]) with tree.open("foo", "r") as fobj: assert fobj.read() == "foo" @@ -68,7 +68,7 @@ def test_open_in_history(tmp_dir, scm, dvc): def test_isdir_isfile(tmp_dir, dvc): tmp_dir.gen({"datafile": "data", "datadir": {"foo": "foo", "bar": "bar"}}) - tree = RepoTree(dvc) + tree = RepoTree(dvc.tree, [dvc]) assert tree.isdir("datadir") assert not tree.isfile("datadir") assert not tree.isdvc("datadir") @@ -93,7 +93,7 @@ def test_isdir_mixed(tmp_dir, dvc): dvc.add(str(tmp_dir / "dir" / "foo")) - tree = RepoTree(dvc) + tree = RepoTree(dvc.tree, [dvc]) assert tree.isdir("dir") assert not tree.isfile("dir") @@ -123,7 +123,7 @@ def test_walk(tmp_dir, dvc, dvcfiles, extra_expected): ) dvc.add(str(tmp_dir / "dir"), recursive=True) tmp_dir.gen({"dir": {"foo": "foo", "bar": "bar"}}) - tree = RepoTree(dvc) + tree = RepoTree(dvc.tree, [dvc]) expected = [ PathInfo("dir") / "subdir1", @@ -150,7 +150,7 @@ def onerror(exc): raise exc tmp_dir.dvc_gen("foo", "foo") - tree = RepoTree(dvc) + tree = RepoTree(dvc.tree, [dvc]) # path does not exist for _ in tree.walk("dir"): @@ -171,7 +171,7 @@ def test_isdvc(tmp_dir, dvc): tmp_dir.gen({"foo": "foo", "bar": "bar", "dir": {"baz": "baz"}}) dvc.add("foo") dvc.add("dir") - tree = RepoTree(dvc) + tree = RepoTree(dvc.tree, [dvc]) assert tree.isdvc("foo") assert not tree.isdvc("bar") assert tree.isdvc("dir") From 3ffafca72a349ac4d7c22c1b08966105bab20baf Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Saugat=20Pachhai=20=28=E0=A4=B8=E0=A5=8C=E0=A4=97=E0=A4=BE?= =?UTF-8?q?=E0=A4=A4=29?= Date: Tue, 21 Jul 2020 11:46:16 +0545 Subject: [PATCH 2/8] off-by-one --- dvc/tree/local.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dvc/tree/local.py b/dvc/tree/local.py index 198cd4d65c..e3dc717452 100644 --- a/dvc/tree/local.py +++ b/dvc/tree/local.py @@ -42,7 +42,7 @@ def __init__( config, use_dvcignore=False, dvcignore_root=None, - ignore_subrepo=False, + ignore_subrepo=True, ): super().__init__(repo, config) url = config.get("url") From 0b4660f6ad1ea84700cf202f9d9f2675d975d45a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Saugat=20Pachhai=20=28=E0=A4=B8=E0=A5=8C=E0=A4=97=E0=A4=BE?= =?UTF-8?q?=E0=A4=A4=29?= Date: Tue, 21 Jul 2020 11:46:45 +0545 Subject: [PATCH 3/8] remove test: get/import will not support absolute paths from now on --- tests/func/test_get.py | 16 ---------------- 1 file changed, 16 deletions(-) diff --git a/tests/func/test_get.py b/tests/func/test_get.py index 60b4270b46..d2e009aa27 100644 --- a/tests/func/test_get.py +++ b/tests/func/test_get.py @@ -98,22 +98,6 @@ def test_get_a_dvc_file(tmp_dir, erepo_dir): Repo.get(os.fspath(erepo_dir), "some_file.dvc") -# https://github.com/iterative/dvc/pull/2837#discussion_r352123053 -def test_get_full_dvc_path(tmp_dir, erepo_dir, tmp_path_factory): - path = tmp_path_factory.mktemp("ext") - external_data = path / "ext_data" - external_data.write_text("ext_data") - - with erepo_dir.chdir(): - erepo_dir.dvc.add(os.fspath(external_data), external=True) - erepo_dir.scm_add("ext_data.dvc", commit="add external data") - - Repo.get( - os.fspath(erepo_dir), os.fspath(external_data), "ext_data_imported" - ) - assert (tmp_dir / "ext_data_imported").read_text() == "ext_data" - - def test_non_cached_output(tmp_dir, erepo_dir): src = "non_cached_file" dst = src + "_imported" From d0c8c6669ef08cdd60b7ec8c4f011a35d65c3221 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Saugat=20Pachhai=20=28=E0=A4=B8=E0=A5=8C=E0=A4=97=E0=A4=BE?= =?UTF-8?q?=E0=A4=A4=29?= Date: Tue, 21 Jul 2020 16:01:35 +0545 Subject: [PATCH 4/8] fixup --- dvc/external_repo.py | 25 +++++++++++++++---------- dvc/repo/tree.py | 15 +++++++++------ 2 files changed, 24 insertions(+), 16 deletions(-) diff --git a/dvc/external_repo.py b/dvc/external_repo.py index 766295691f..57c33b8564 100644 --- a/dvc/external_repo.py +++ b/dvc/external_repo.py @@ -183,6 +183,12 @@ def close(self): self.scm.close() def fetch_external(self, paths: Iterable, cache, **kwargs): + # `RepoTree` will try downloading it to the repo's cache + # instead of `cache_dir`, need to change on all instances + with self.with_cache(cache.cache_dir): + return self._fetch_to_cache(paths, cache, **kwargs) + + def _fetch_to_cache(self, paths: Iterable, cache, **kwargs): """Fetch specified external repo paths into cache. Returns 3-tuple in the form @@ -202,14 +208,13 @@ def download_update(result): for path in paths: if not self.repo_tree.exists(path): raise PathMissingError(path, self.url) - with self.with_cache(cache.cache_dir): - save_info = cache.save( - path, - self.repo_tree, - None, - save_link=False, - download_callback=download_update, - ) + save_info = cache.save( + path, + self.repo_tree, + None, + save_link=False, + download_callback=download_update, + ) save_infos.append(save_info) return sum(download_results), failed, save_infos @@ -219,7 +224,7 @@ def get_external(self, src, dest): repo = self.in_repo(src) if repo: cache = repo.cache.local - _, _, save_infos = self.fetch_external([src], cache) + _, _, save_infos = self._fetch_to_cache([src], cache) cache.checkout(PathInfo(dest), save_infos[0]) else: path = PathInfo(self.root_dir) / src @@ -238,7 +243,7 @@ def get_checksum(self, path_info, cache): return self.repo_tree.get_file_hash(path_info) def in_repo(self, path): - tree = self.repo_tree.in_subtree(path) + tree = self.repo_tree.in_subtree(PathInfo(self.root_dir) / path) return tree.repo if tree else None @property diff --git a/dvc/repo/tree.py b/dvc/repo/tree.py index df4a7c2b41..e377bae229 100644 --- a/dvc/repo/tree.py +++ b/dvc/repo/tree.py @@ -253,12 +253,15 @@ def __init__( ): # pylint: disable=super-init-not-called subrepos = subrepos or [] subrepos.sort(key=lambda r: len(r.root_dir), reverse=True) - dvctrees = [DvcTree(repo, **kwargs) for repo in subrepos] - self._kwargs = kwargs - self._dvctrees = { - os.path.abspath(tree.repo.root_dir): tree for tree in dvctrees - } + dvctrees = [ + (os.path.abspath(repo.root_dir), DvcTree(repo, **kwargs)) + for repo in subrepos + ] + self._dvctrees = dict( + sorted(dvctrees, key=lambda v: len(v[0]), reverse=True) + ) self.tree = tree + self._kwargs = kwargs @property def fetch(self): @@ -275,7 +278,7 @@ def _find_subtree_with_prefix(self, path): # dvctrees is already ordered from low to high path_prefix = os.path.abspath(path) for pref, tree in self._dvctrees.items(): - if os.path.abspath(path_prefix).startswith(pref): + if path_prefix.startswith(pref): return pref, tree return "", None From a47b6aee4b4bed1a4bfd6666f7306c9cf846783d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Saugat=20Pachhai=20=28=E0=A4=B8=E0=A5=8C=E0=A4=97=E0=A4=BE?= =?UTF-8?q?=E0=A4=A4=29?= Date: Tue, 21 Jul 2020 20:39:46 +0545 Subject: [PATCH 5/8] fix tests failure on mac --- dvc/external_repo.py | 6 +++--- dvc/repo/tree.py | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/dvc/external_repo.py b/dvc/external_repo.py index 57c33b8564..00bfc25677 100644 --- a/dvc/external_repo.py +++ b/dvc/external_repo.py @@ -95,15 +95,15 @@ class ExternalRepo: def __init__( self, root_dir, url, rev, for_write=False, fetch=True, **kwargs ): - self.root_dir = os.path.abspath(root_dir) - self.scm = Git(root_dir) + self.root_dir = os.path.realpath(root_dir) + self.scm = Git(self.root_dir) self.url = url self.config = {"fetch": fetch, **kwargs} self.for_write = for_write repo_kw = {} if for_write: - tree = LocalTree(None, {"url": self.root_dir}) + tree = LocalTree(None, {"url": root_dir}) else: # .dvc folders are ignored by dvcignore which is required for # `subrepos.find()`, hence using a separate tree diff --git a/dvc/repo/tree.py b/dvc/repo/tree.py index e377bae229..84dc98da17 100644 --- a/dvc/repo/tree.py +++ b/dvc/repo/tree.py @@ -294,7 +294,7 @@ def open( return self.tree.open(path, mode=mode, encoding=encoding) def open_by_relpath(self, path, *args, **kwargs): - return self.open(PathInfo(self.tree.tree_root, path), *args, **kwargs) + return self.open(PathInfo(self.tree.tree_root) / path, *args, **kwargs) def exists(self, path): # pylint: disable=arguments-differ subtree = self._find_subtree(path) From 656fe46291286c981c05b9fb317a99573bdf1301 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Saugat=20Pachhai=20=28=E0=A4=B8=E0=A5=8C=E0=A4=97=E0=A4=BE?= =?UTF-8?q?=E0=A4=A4=29?= Date: Wed, 22 Jul 2020 18:10:46 +0545 Subject: [PATCH 6/8] Split ExternalRepo into two One having a top-level dvc and another without it --- dvc/external_repo.py | 168 ++++++++++++++++++++++--------- tests/func/test_external_repo.py | 8 +- 2 files changed, 126 insertions(+), 50 deletions(-) diff --git a/dvc/external_repo.py b/dvc/external_repo.py index 00bfc25677..dc43b39067 100644 --- a/dvc/external_repo.py +++ b/dvc/external_repo.py @@ -28,20 +28,36 @@ logger = logging.getLogger(__name__) +def _is_dvc_main_repo(scm, path, rev): + isdir = scm.get_tree(rev).isdir if scm else os.path.isdir + return isdir(os.path.join(path, Repo.DVC_DIR)) + + @contextmanager def external_repo(url, rev=None, for_write=False, **kwargs): logger.debug("Creating external repo %s@%s", url, rev) path = _cached_clone(url, rev, for_write=for_write) + path = os.path.realpath(path) + if not rev: # Local HEAD points to the tip of whatever branch we first cloned from # (which may not be the default branch), use origin/HEAD here to get # the tip of the default branch rev = "refs/remotes/origin/HEAD" + if for_write: + rev = None + scm = None + else: + scm = Git(path) + + erepo_cls = ( + ExternalDVCRepo + if _is_dvc_main_repo(scm, path, rev) + else ExternalGitRepo + ) + repo_kw = dict(scm=scm, rev=rev, for_write=for_write, url=url, **kwargs) + repo = erepo_cls(path, **repo_kw) - # TODO: What to do with for the `dvcx`? - # something like following, perhaps? - # repo = ExternalDVCRepo if _is_dvc_main_repo(path, rev) else ExternalRepo - repo = ExternalRepo(path, url, rev, for_write=for_write, **kwargs) try: yield repo except NoRemoteError: @@ -91,35 +107,19 @@ def _add_upstream(orig_repo, src_repo): orig_repo.config["core"]["remote"] = "auto-generated-upstream" -class ExternalRepo: - def __init__( - self, root_dir, url, rev, for_write=False, fetch=True, **kwargs - ): - self.root_dir = os.path.realpath(root_dir) - self.scm = Git(self.root_dir) - self.url = url - self.config = {"fetch": fetch, **kwargs} - self.for_write = for_write - - repo_kw = {} - if for_write: - tree = LocalTree(None, {"url": root_dir}) - else: - # .dvc folders are ignored by dvcignore which is required for - # `subrepos.find()`, hence using a separate tree - tree = self.scm.get_tree(rev) - repo_kw = dict(scm=self.scm, rev=rev) - - paths = subrepos.find(tree) - self.repos = [Repo(path, **repo_kw) for path in paths] +class BaseExternalMixin: + def _setup(self): self._setup_cache_dir() - self.rev = rev for repo in self.repos: repo.cache.local.cache_dir = self.cache_dir if os.path.isdir(self.url): self._fix_upstream(repo) + @property + def repos(self): + raise NotImplementedError + @wrap_with(threading.Lock()) def _setup_cache_dir(self): # share same cache_dir among all subrepos @@ -130,21 +130,6 @@ def _setup_cache_dir(self): "dvc-cache" ) - @cached_property - def tree(self): - kwargs = dict( - use_dvcignore=True, - dvcignore_root=self.root_dir, - ignore_subrepo=False, - ) - if self.for_write: - return LocalTree(None, {"url": self.root_dir}, **kwargs) - return self.scm.get_tree(rev=self.rev, **kwargs) - - @cached_property - def repo_tree(self) -> "RepoTree": - return RepoTree(self.tree, subrepos=self.repos, **self.config) - @wrap_with(threading.Lock()) @contextmanager def with_cache(self, cache_dir, link_types=None): @@ -178,9 +163,8 @@ def _fix_upstream(self, orig_repo): src_repo.close() def close(self): - for repo in self.repos: - repo.close() - self.scm.close() + if self.scm: + self.scm.close() def fetch_external(self, paths: Iterable, cache, **kwargs): # `RepoTree` will try downloading it to the repo's cache @@ -246,9 +230,101 @@ def in_repo(self, path): tree = self.repo_tree.in_subtree(PathInfo(self.root_dir) / path) return tree.repo if tree else None + def _find_subrepos(self): + repo_kw = {} + if self.for_write: + tree = LocalTree(None, {"url": self.root_dir}) + else: + assert self.url and self.scm and self.rev + # .dvc folders are ignored by dvcignore which is required for + # `subrepos.find()`, hence using a separate tree + tree = self.scm.get_tree(self.rev) + repo_kw = dict(scm=self.scm, rev=self.rev) + + paths = subrepos.find(tree) + return [Repo(path, **repo_kw) for path in paths] + + @cached_property + def main_tree(self): + # FIXME: Repo.tree has dvcignore embedded on it, which might ignore + # subrepos. Also, there might be unwanted side effects of using `tree` + # that does not ignore subrepos, so we create our own `master` tree, + # that speaks for the whole repository + # --- + # This is blocking implementation of subrepos inside `Repo`, + # as collecting `.dvcignore` twice is not an answer. + # we need very granular controls on dvcignore, such that we have + # per-ops controls and also be able to create a separate instance + # of tree that does have some attributes of dvcignore (en/dis)abled. + kwargs = dict( + use_dvcignore=True, + dvcignore_root=self.root_dir, + ignore_subrepo=False, + ) + if self.for_write: + return LocalTree(None, {"url": self.root_dir}, **kwargs) + return self.scm.get_tree(rev=self.rev, **kwargs) + + @cached_property + def repo_tree(self) -> "RepoTree": + return RepoTree( + self.main_tree, subrepos=self.repos, **self._tree_config + ) + + +class ExternalDVCRepo(BaseExternalMixin, Repo): + def __init__( + self, + root_dir, + scm=None, + rev=None, + for_write=False, + url=None, + fetch=True, + **kwargs + ): + super().__init__(root_dir, scm=scm, rev=rev) + + self.url = url + self.rev = rev + self.for_write = for_write + self._tree_config = {"fetch": fetch, **kwargs} + self.subrepos = self._find_subrepos() + self._setup() + @property - def main_repo(self): - return self.in_repo(os.curdir) + def repos(self): + return [self] + self.subrepos + + +class ExternalGitRepo(BaseExternalMixin): + def __init__( + self, + root_dir, + scm=None, + rev=None, + for_write=False, + url=None, + fetch=True, + **kwargs + ): + self.root_dir = root_dir + self.scm = scm + self.url = url + self.rev = rev + self.for_write = for_write + self._tree_config = {"fetch": fetch, **kwargs} + + self.subrepos = self._find_subrepos() + self._setup() + + @property + def repos(self): + return self.subrepos + + @property + def tree(self): + return self.main_tree def _cached_clone(url, rev, for_write=False): diff --git a/tests/func/test_external_repo.py b/tests/func/test_external_repo.py index 9f51899af9..624fa78ce8 100644 --- a/tests/func/test_external_repo.py +++ b/tests/func/test_external_repo.py @@ -130,7 +130,7 @@ def test_shallow_clone_branch(erepo_dir): with patch.object(Git, "clone", wraps=Git.clone) as mock_clone: with external_repo(url, rev="branch") as repo: - with repo.open_by_relpath("file") as fd: + with repo.repo_tree.open_by_relpath("file") as fd: assert fd.read() == "branch" mock_clone.assert_called_with(url, ANY, shallow_branch="branch") @@ -138,7 +138,7 @@ def test_shallow_clone_branch(erepo_dir): assert shallow with external_repo(url) as repo: - with repo.open_by_relpath("file") as fd: + with repo.repo_tree.open_by_relpath("file") as fd: assert fd.read() == "master" assert mock_clone.call_count == 1 @@ -156,7 +156,7 @@ def test_shallow_clone_tag(erepo_dir): with patch.object(Git, "clone", wraps=Git.clone) as mock_clone: with external_repo(url, rev="v1") as repo: - with repo.open_by_relpath("file") as fd: + with repo.repo_tree.open_by_relpath("file") as fd: assert fd.read() == "foo" mock_clone.assert_called_with(url, ANY, shallow_branch="v1") @@ -164,7 +164,7 @@ def test_shallow_clone_tag(erepo_dir): assert shallow with external_repo(url, rev="master") as repo: - with repo.open_by_relpath("file") as fd: + with repo.repo_tree.open_by_relpath("file") as fd: assert fd.read() == "bar" assert mock_clone.call_count == 1 From aafac24da1df79be7e8cd7f80a8e2702893e16fe Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Saugat=20Pachhai=20=28=E0=A4=B8=E0=A5=8C=E0=A4=97=E0=A4=BE?= =?UTF-8?q?=E0=A4=A4=29?= Date: Thu, 23 Jul 2020 14:01:47 +0545 Subject: [PATCH 7/8] fix issue on finding subtree --- dvc/repo/tree.py | 26 ++++++------------------ tests/func/test_get.py | 5 +++-- tests/unit/repo/test_repo_tree.py | 33 +++++++++++++++++++++++++++++++ 3 files changed, 42 insertions(+), 22 deletions(-) diff --git a/dvc/repo/tree.py b/dvc/repo/tree.py index 84dc98da17..419783ff5b 100644 --- a/dvc/repo/tree.py +++ b/dvc/repo/tree.py @@ -2,7 +2,7 @@ import os from typing import Optional -from funcy import cached_property, post_processing +from funcy import cached_property from dvc.dvcfile import is_valid_filename from dvc.exceptions import OutputNotFoundError @@ -277,8 +277,12 @@ def _find_subtree(self, path_prefix) -> Optional[DvcTree]: def _find_subtree_with_prefix(self, path): # dvctrees is already ordered from low to high path_prefix = os.path.abspath(path) + exact_match = self._dvctrees.get(path_prefix) + if exact_match: + return path_prefix, exact_match + for pref, tree in self._dvctrees.items(): - if path_prefix.startswith(pref): + if path_prefix.startswith(pref + os.sep): return pref, tree return "", None @@ -487,21 +491,3 @@ def hash_jobs(self): def in_subtree(self, path): return self._find_subtree(path) - - @post_processing("\r\n".join) - def _visualize(self, top, **kwargs): - """`tree`-like output, useful for debugging/visualizing""" - indent = 4 - spacing = " " * indent - tee = "├── " - last = "└── " - for root, _, files in self.walk(top, **kwargs): - level = root.replace(top, "").count(os.sep) - indent = spacing * level - yield "{}{}/".format(indent, os.path.basename(root)) - sub_indent = spacing * (level + 1) - length = len(files) - for i, f in enumerate(files): - yield "{}{}{}".format( - sub_indent, tee if i + 1 != length else last, f - ) diff --git a/tests/func/test_get.py b/tests/func/test_get.py index d2e009aa27..94b9352db1 100644 --- a/tests/func/test_get.py +++ b/tests/func/test_get.py @@ -238,12 +238,13 @@ def test_get_pipeline_tracked_outs( assert (git_dir / "baz").read_text() == "foo" -def make_subrepo(dir_, scm, config): +def make_subrepo(dir_, scm, config=None): dir_.mkdir(parents=True) with dir_.chdir(): dir_.scm = scm dir_.init(dvc=True, subdir=True) - dir_.add_remote(config=config) + if config: + dir_.add_remote(config=config) @pytest.mark.parametrize( diff --git a/tests/unit/repo/test_repo_tree.py b/tests/unit/repo/test_repo_tree.py index e1da5f4880..40966d8231 100644 --- a/tests/unit/repo/test_repo_tree.py +++ b/tests/unit/repo/test_repo_tree.py @@ -177,3 +177,36 @@ def test_isdvc(tmp_dir, dvc): assert tree.isdvc("dir") assert not tree.isdvc("dir/baz") assert tree.isdvc("dir/baz", recursive=True, strict=False) + + +def test_in_subtree(tmp_dir, scm, dvc): + from tests.func.test_get import make_subrepo + + subrepo1 = tmp_dir / "dir" / "repo" + subrepo2 = tmp_dir / "dir" / "repo2" + + for repo in [subrepo1, subrepo2]: + make_subrepo(repo, scm) + + (tmp_dir / "dir" / "repotxt").write_text("file to confuse RepoTree") + subrepo1.dvc_gen({"foo": "foo"}, commit="FOO") + subrepo2.dvc_gen({"bar": "bar"}, commit="BAR") + + # dvc.tree ignores subrepos by default, + # but we just want to test `in_subtree()`, which is purely lexical + tree = RepoTree(dvc.tree, [dvc, subrepo1.dvc, subrepo2.dvc]) + + assert tree.in_subtree(str(tmp_dir / "dir")).repo == dvc + assert tree.in_subtree(str(tmp_dir / "dir" / "re")).repo == dvc + assert tree.in_subtree(str(tmp_dir / "dir" / "repo")).repo == subrepo1.dvc + assert tree.in_subtree(str(tmp_dir / "dir" / "repotxt")).repo == dvc + assert tree.in_subtree(str(tmp_dir / "dir" / "repo2")).repo == subrepo2.dvc + + for repo in [tmp_dir, subrepo1, subrepo2]: + for path in ["", "foo", "something-that-does-not-exist"]: + p = os.path.join(repo, path) + subtree = tree.in_subtree(p) + assert subtree, f"subtree not found for path '{p}'" + assert ( + subtree.repo == repo.dvc + ), f"repo did not match for path '{p}'" From 0074efba9496195c497bbb13a58313b05f786a2a Mon Sep 17 00:00:00 2001 From: "Restyled.io" Date: Thu, 23 Jul 2020 08:29:05 +0000 Subject: [PATCH 8/8] Restyled by isort --- dvc/tree/_debug.py | 1 + 1 file changed, 1 insertion(+) diff --git a/dvc/tree/_debug.py b/dvc/tree/_debug.py index 468969dac2..1d86ff7aae 100644 --- a/dvc/tree/_debug.py +++ b/dvc/tree/_debug.py @@ -1,4 +1,5 @@ import os + from funcy import post_processing