diff --git a/dvc/api.py b/dvc/api.py index 1202ebec48..5bf1517afa 100644 --- a/dvc/api.py +++ b/dvc/api.py @@ -2,35 +2,40 @@ from contextlib import _GeneratorContextManager as GCM from contextlib import contextmanager -from dvc.exceptions import DvcException, NotDvcRepoError +from funcy import reraise + +from dvc.exceptions import ( + NotDvcRepoError, + OutputNotFoundError, + PathMissingError, +) from dvc.external_repo import external_repo +from dvc.path_info import PathInfo from dvc.repo import Repo -class UrlNotDvcRepoError(DvcException): - """Thrown if the given URL is not a DVC repository.""" - - def __init__(self, url): - super().__init__(f"'{url}' is not a DVC repository.") - - def get_url(path, repo=None, rev=None, remote=None): """ Returns the URL to the storage location of a data file or directory tracked in a DVC repo. For Git repos, HEAD is used unless a rev argument is supplied. The default remote is tried unless a remote argument is supplied. - Raises UrlNotDvcRepoError if repo is not a DVC project. + Raises OutputNotFoundError if the file is not a dvc-tracked file. NOTE: This function does not check for the actual existence of the file or directory in the remote storage. """ with _make_repo(repo, rev=rev) as _repo: - if not isinstance(_repo, Repo): - raise UrlNotDvcRepoError(_repo.url) # pylint: disable=no-member - out = _repo.find_out_by_relpath(path) - remote_obj = _repo.cloud.get_remote(remote) - return str(remote_obj.tree.hash_to_path_info(out.hash_info.value)) + path_info = PathInfo(_repo.root_dir) / path + with reraise(FileNotFoundError, PathMissingError(path, repo)): + metadata = _repo.repo_tree.metadata(path_info) + + if not metadata.is_dvc: + raise OutputNotFoundError(path, repo) + + cloud = metadata.repo.cloud + hash_info = _repo.repo_tree.get_hash(path_info) + return cloud.get_url_for(remote, checksum=hash_info.value) def open( # noqa, pylint: disable=redefined-builtin @@ -97,7 +102,7 @@ def _make_repo(repo_url=None, rev=None): repo_url = repo_url or os.getcwd() if rev is None and os.path.exists(repo_url): try: - yield Repo(repo_url) + yield Repo(repo_url, subrepos=True) return except NotDvcRepoError: pass # fallthrough to external_repo diff --git a/dvc/data_cloud.py b/dvc/data_cloud.py index eb9189e215..30f4a618d7 100644 --- a/dvc/data_cloud.py +++ b/dvc/data_cloud.py @@ -114,3 +114,7 @@ def status( show_checksums=show_checksums, log_missing=log_missing, ) + + def get_url_for(self, remote, checksum): + remote = self.get_remote(remote) + return str(remote.tree.hash_to_path_info(checksum)) diff --git a/dvc/dependency/repo.py b/dvc/dependency/repo.py index ce49247138..7eb2f87f0a 100644 --- a/dvc/dependency/repo.py +++ b/dvc/dependency/repo.py @@ -1,10 +1,5 @@ -import os - from voluptuous import Required -from dvc.exceptions import OutputNotFoundError -from dvc.path_info import PathInfo - from .local import LocalDependency @@ -42,30 +37,18 @@ def repo_pair(self): def __str__(self): return "{} ({})".format(self.def_path, self.def_repo[self.PARAM_URL]) - def _make_repo(self, *, locked=True): + def _make_repo(self, *, locked=True, **kwargs): from dvc.external_repo import external_repo d = self.def_repo rev = (d.get("rev_lock") if locked else None) or d.get("rev") - return external_repo(d["url"], rev=rev) + return external_repo(d["url"], rev=rev, **kwargs) def _get_hash(self, locked=True): - from dvc.tree.repo import RepoTree - - with self._make_repo(locked=locked) as repo: - try: - return repo.find_out_by_relpath(self.def_path).hash_info - except OutputNotFoundError: - 
path = PathInfo(os.path.join(repo.root_dir, self.def_path)) - - # we want stream but not fetch, so DVC out directories are - # walked, but dir contents is not fetched - tree = RepoTree(repo, stream=True) - - # We are polluting our repo cache with some dir listing here - if tree.isdir(path): - return self.repo.cache.local.tree.get_hash(path, tree=tree) - return tree.get_file_hash(path) + # we want stream but not fetch, so DVC out directories are + # walked, but dir contents are not fetched + with self._make_repo(locked=locked, fetch=False, stream=True) as repo: + return repo.get_checksum(self.def_path) def workspace_status(self): current = self._get_hash(locked=True) @@ -86,14 +69,15 @@ def dumpd(self): return {self.PARAM_PATH: self.def_path, self.PARAM_REPO: self.def_repo} def download(self, to): - with self._make_repo() as repo: + cache = self.repo.cache.local + + with self._make_repo(cache_dir=cache.cache_dir) as repo: if self.def_repo.get(self.PARAM_REV_LOCK) is None: self.def_repo[self.PARAM_REV_LOCK] = repo.get_rev() - cache = self.repo.cache.local - with repo.use_cache(cache): - _, _, cache_infos = repo.fetch_external([self.def_path]) - cache.checkout(to.path_info, cache_infos[0]) + _, _, cache_infos = repo.fetch_external([self.def_path]) + + cache.checkout(to.path_info, cache_infos[0]) def update(self, rev=None): if rev: diff --git a/dvc/external_repo.py b/dvc/external_repo.py index 7ab5e1e2b1..556648402e 100644 --- a/dvc/external_repo.py +++ b/dvc/external_repo.py @@ -6,10 +6,12 @@ from distutils.dir_util import copy_tree from typing import Iterable -from funcy import cached_property, retry, wrap_with +from funcy import cached_property, reraise, retry, wrap_with +from dvc.cache import Cache from dvc.config import NoRemoteError, NotDvcRepoError from dvc.exceptions import ( + DvcException, FileMissingError, NoOutputInExternalRepoError, NoRemoteInExternalRepoError, @@ -20,6 +22,7 @@ from dvc.repo import Repo from dvc.scm.base import CloneError from dvc.scm.git import Git +from dvc.state import StateNoop from dvc.tree.local import LocalTree from dvc.tree.repo import RepoTree from dvc.utils import relpath @@ -28,30 +31,45 @@ logger = logging.getLogger(__name__) +class IsADVCRepoError(DvcException): + """Raised when a path unexpectedly turns out to be a whole DVC repo.""" + + @contextmanager -def external_repo(url, rev=None, for_write=False): +def external_repo(url, rev=None, for_write=False, **kwargs): logger.debug("Creating external repo %s@%s", url, rev) path = _cached_clone(url, rev, for_write=for_write) - if not rev: - # Local HEAD points to the tip of whatever branch we first cloned from - # (which may not be the default branch), use origin/HEAD here to get - # the tip of the default branch - rev = "refs/remotes/origin/HEAD" + # Local HEAD points to the tip of whatever branch we first cloned from + # (which may not be the default branch), use origin/HEAD here to get + # the tip of the default branch + rev = rev or "refs/remotes/origin/HEAD" + + root_dir = path if for_write else os.path.realpath(path) + conf = dict( + root_dir=root_dir, + url=url, + scm=None if for_write else Git(root_dir), + rev=None if for_write else rev, + for_write=for_write, + **kwargs, + ) try: - repo = ExternalRepo(path, url, rev, for_write=for_write) + repo = ExternalRepo(**conf) except NotDvcRepoError: - repo = ExternalGitRepo(**conf) + repo = ExternalGitRepo(**conf) try: yield repo - except NoRemoteError: - raise NoRemoteInExternalRepoError(url) + except NoRemoteError as exc: + raise NoRemoteInExternalRepoError(url) 
from exc except OutputNotFoundError as exc: if exc.repo is repo: - raise NoOutputInExternalRepoError(exc.output, repo.root_dir, url) + raise NoOutputInExternalRepoError( + exc.output, repo.root_dir, url + ) from exc raise except FileMissingError as exc: - raise PathMissingError(exc.path, url) + raise PathMissingError(exc.path, url) from exc finally: repo.close() if for_write: @@ -75,53 +93,37 @@ def clean_repos(): class BaseExternalRepo: # pylint: disable=no-member - _local_cache = None - def __str__(self): return self.url - @property - def local_cache(self): - if hasattr(self, "cache"): - return self.cache.local - return self._local_cache - - @contextmanager - def use_cache(self, cache): - """Use the specified cache in place of default tmpdir cache for - download operations. - """ - has_cache = hasattr(self, "cache") - - if has_cache: - save_cache = self.cache.local # pylint: disable=E0203 - self.cache.local = cache # pylint: disable=E0203 - else: - from collections import namedtuple - - mock_cache = namedtuple("MockCache", ["local"])(local=cache) - self.cache = mock_cache # pylint: disable=W0201 - - self._local_cache = cache - - yield - - if has_cache: - self.cache.local = save_cache - else: - del self.cache - - self._local_cache = None - @cached_property def repo_tree(self): - return RepoTree(self, fetch=True) + return self._get_tree_for( + self, subrepos=not self.for_write, repo_factory=self.make_repo + ) def get_rev(self): + assert self.scm if isinstance(self.tree, LocalTree): return self.scm.get_rev() return self.tree.rev + def _fetch_to_cache(self, path_info, repo, callback): + # don't support subrepo traversal as it might fail due to difference + # in remotes + tree = self._get_tree_for(repo) + cache = repo.cache.local + + hash_info = tree.get_hash(path_info, download_callback=callback) + cache.save( + path_info, + tree, + hash_info, + save_link=False, + download_callback=callback, + ) + return hash_info + def fetch_external(self, paths: Iterable, **kwargs): """Fetch specified external repo paths into cache. 
@@ -132,117 +134,196 @@ def fetch_external(self, paths: Iterable, **kwargs): """ download_results = [] failed = 0 + root = PathInfo(self.root_dir) - paths = [PathInfo(self.root_dir) / path for path in paths] + paths = [root / path for path in paths] def download_update(result): download_results.append(result) hash_infos = [] for path in paths: - if not self.repo_tree.exists(path): - raise PathMissingError(path, self.url) - hash_info = self.repo_tree.get_hash( - path, download_callback=download_update - ) - self.local_cache.save( - path, - self.repo_tree, - hash_info, - save_link=False, - download_callback=download_update, - ) + with reraise(FileNotFoundError, PathMissingError(path, self.url)): + metadata = self.repo_tree.metadata(path) + + self._check_repo(path, metadata.repo) + repo = metadata.repo + hash_info = self._fetch_to_cache(path, repo, download_update) hash_infos.append(hash_info) return sum(download_results), failed, hash_infos + def _check_repo(self, path_info, repo): + if not repo: + return + + repo_path = PathInfo(repo.root_dir) + if path_info == repo_path and isinstance(repo, Repo): + message = "Cannot fetch a complete DVC repository" + if repo.root_dir != self.root_dir: + rel = relpath(repo.root_dir, self.root_dir) + message += f" '{rel}'" + raise IsADVCRepoError(message) + def get_external(self, path, dest): """Convenience wrapper for fetch_external and checkout.""" - if self.local_cache: + path_info = PathInfo(self.root_dir) / path + with reraise(FileNotFoundError, PathMissingError(path, self.url)): + metadata = self.repo_tree.metadata(path_info) + + self._check_repo(path_info, metadata.repo) + if metadata.output_exists: + repo = metadata.repo + cache = repo.cache.local # fetch DVC and git files to tmpdir cache, then checkout - _, _, save_infos = self.fetch_external([path]) - self.local_cache.checkout(PathInfo(dest), save_infos[0]) + save_info = self._fetch_to_cache(path_info, repo, None) + cache.checkout(PathInfo(dest), save_info) else: - # git-only erepo with no cache, just copy files directly - # to dest - path = PathInfo(self.root_dir) / path - if not self.repo_tree.exists(path): - raise PathMissingError(path, self.url) - self.repo_tree.copytree(path, dest) + # git-only folder, just copy files directly to dest + tree = self._get_tree_for(metadata.repo) # ignore subrepos + tree.copytree(path_info, dest) + def _get_tree_for(self, repo, **kwargs): + """ + Returns a combined DVC and git/local tree for a single repo. + """ + kw = {**self.tree_confs, **kwargs} + if "fetch" not in kw: + kw["fetch"] = True + return RepoTree(repo, **kw) + + def get_checksum(self, path): + path_info = PathInfo(self.root_dir) / path + with reraise(FileNotFoundError, PathMissingError(path, self.url)): + metadata = self.repo_tree.metadata(path_info) + + # don't traverse subrepos when computing the hash + tree = self._get_tree_for(metadata.repo) + return tree.get_hash(path_info) + + @staticmethod + def _fix_local_remote(orig_repo, src_repo, remote_name): + # If a remote URL is relative to the source repo, + # it will have changed upon config load and been made + # relative to this new repo. Restore the old one here. 
+ new_remote = orig_repo.config["remote"][remote_name] + old_remote = src_repo.config["remote"][remote_name] + if new_remote["url"] != old_remote["url"]: + new_remote["url"] = old_remote["url"] -class ExternalRepo(Repo, BaseExternalRepo): - def __init__(self, root_dir, url, rev, for_write=False): - if for_write: - super().__init__(root_dir) - else: - root_dir = os.path.realpath(root_dir) - super().__init__(root_dir, scm=Git(root_dir), rev=rev) - self.url = url - self._set_cache_dir() - self._fix_upstream() + @staticmethod + def _add_upstream(orig_repo, src_repo): + # Fill the empty upstream entry with a new remote pointing to the + # original repo's cache location. + cache_dir = src_repo.cache.local.cache_dir + orig_repo.config["remote"]["auto-generated-upstream"] = { + "url": cache_dir + } + orig_repo.config["core"]["remote"] = "auto-generated-upstream" - @wrap_with(threading.Lock()) - def _set_cache_dir(self): - try: - cache_dir = CACHE_DIRS[self.url] - except KeyError: - cache_dir = CACHE_DIRS[self.url] = tempfile.mkdtemp("dvc-cache") + def make_repo(self, path): + repo = Repo(path, scm=self.scm, rev=self.get_rev()) + + self._setup_cache(repo) + self._fix_upstream(repo) - self.cache.local.cache_dir = cache_dir - self._local_cache = self.cache.local + return repo - def _fix_upstream(self): + def _setup_cache(self, repo): + repo.cache.local.cache_dir = self.cache_dir + if self.cache_types: + repo.cache.local.cache_types = self.cache_types + + def _fix_upstream(self, repo): if not os.path.isdir(self.url): return try: - src_repo = Repo(self.url) + rel_path = os.path.relpath(repo.root_dir, self.root_dir) + src_repo = Repo(PathInfo(self.url) / rel_path) except NotDvcRepoError: - # If ExternalRepo does not throw NotDvcRepoError and Repo does, - # the self.url might be a bare git repo. - # NOTE: This will fail to resolve remote with relative path, - # same as if it was a remote DVC repo. return try: - remote_name = self.config["core"].get("remote") + remote_name = repo.config["core"].get("remote") if remote_name: - self._fix_local_remote(src_repo, remote_name) + self._fix_local_remote(repo, src_repo, remote_name) else: - self._add_upstream(src_repo) + self._add_upstream(repo, src_repo) finally: src_repo.close() - def _fix_local_remote(self, src_repo, remote_name): - # If a remote URL is relative to the source repo, - # it will have changed upon config load and made - # relative to this new repo. Restore the old one here. - new_remote = self.config["remote"][remote_name] - old_remote = src_repo.config["remote"][remote_name] - if new_remote["url"] != old_remote["url"]: - new_remote["url"] = old_remote["url"] + @wrap_with(threading.Lock()) + def _get_cache_dir(self): + try: + cache_dir = CACHE_DIRS[self.url] + except KeyError: + cache_dir = CACHE_DIRS[self.url] = tempfile.mkdtemp("dvc-cache") + return cache_dir + + +class ExternalRepo(BaseExternalRepo, Repo): + def __init__( + self, + root_dir, + url, + scm=None, + rev=None, + for_write=False, + cache_dir=None, + cache_types=None, + **kwargs, + ): + super().__init__(root_dir, scm=scm, rev=rev) - def _add_upstream(self, src_repo): - # Fill the empty upstream entry with a new remote pointing to the - # original repo's cache location. 
- cache_dir = src_repo.cache.local.cache_dir - self.config["remote"]["auto-generated-upstream"] = {"url": cache_dir} - self.config["core"]["remote"] = "auto-generated-upstream" + self.url = url + self.for_write = for_write + self.cache_dir = cache_dir or self._get_cache_dir() + self.cache_types = cache_types + + self._setup_cache(self) + self._fix_upstream(self) + self.tree_confs = kwargs class ExternalGitRepo(BaseExternalRepo): - def __init__(self, root_dir, url, rev): + def __init__( + self, + root_dir, + url, + scm=None, + rev=None, + for_write=False, + cache_dir=None, + cache_types=None, + **kwargs, + ): self.root_dir = os.path.realpath(root_dir) + self.scm = scm + self.url = url - self.tree = self.scm.get_tree(rev) + self.for_write = for_write + self.cache_dir = cache_dir or self._get_cache_dir() + self.cache_types = cache_types + + self.rev = rev + self.tree_confs = kwargs + + self.config = {"cache": {"dir": self.cache_dir}} + self.cache = Cache(self) + if cache_types: + self.cache.local.cache_types = cache_types + + self.state = StateNoop() @cached_property - def scm(self): - return Git(self.root_dir) + def tree(self): + if self.scm: + return self.scm.get_tree(self.rev) + return LocalTree(self, {"url": self.root_dir}) def close(self): - if "scm" in self.__dict__: + if self.scm: self.scm.close() def find_out_by_relpath(self, path): @@ -251,14 +332,14 @@ def find_out_by_relpath(self, path): @contextmanager def open_by_relpath(self, path, mode="r", encoding=None, **kwargs): """Opens a specified resource as a file object.""" - tree = RepoTree(self) + path_info = PathInfo(self.root_dir) / path try: - with tree.open( - path, mode=mode, encoding=encoding, **kwargs + with self.repo_tree.open( + path_info, mode=mode, encoding=encoding, **kwargs ) as fobj: yield fobj - except FileNotFoundError: - raise PathMissingError(path, self.url) + except FileNotFoundError as exc: + raise PathMissingError(path, self.url) from exc def _cached_clone(url, rev, for_write=False): diff --git a/dvc/repo/__init__.py b/dvc/repo/__init__.py index edebd74fb6..dbc80b5570 100644 --- a/dvc/repo/__init__.py +++ b/dvc/repo/__init__.py @@ -66,7 +66,7 @@ class Repo: from dvc.repo.status import status from dvc.repo.update import update - def __init__(self, root_dir=None, scm=None, rev=None): + def __init__(self, root_dir=None, scm=None, rev=None, subrepos=False): from dvc.cache import Cache from dvc.data_cloud import DataCloud from dvc.lock import make_lock @@ -116,6 +116,8 @@ def __init__(self, root_dir=None, scm=None, rev=None): hardlink_lock=hardlink_lock, friendly=True, ) + # used by RepoTree to determine if it should traverse subrepos + self.subrepos = subrepos self.cache = Cache(self) self.cloud = DataCloud(self) @@ -581,19 +583,20 @@ def is_dvc_internal(self, path): path_parts = os.path.normpath(path).split(os.path.sep) return self.DVC_DIR in path_parts + @cached_property + def repo_tree(self): + return RepoTree(self, subrepos=self.subrepos, fetch=True) + @contextmanager def open_by_relpath(self, path, remote=None, mode="r", encoding=None): """Opens a specified resource as a file descriptor""" - tree = RepoTree(self, stream=True) + tree = RepoTree(self, stream=True, subrepos=True) path = os.path.join(self.root_dir, path) try: with self.state: with tree.open( - os.path.join(self.root_dir, path), - mode=mode, - encoding=encoding, - remote=remote, + path, mode=mode, encoding=encoding, remote=remote, ) as fobj: yield fobj except FileNotFoundError as exc: diff --git a/dvc/repo/fetch.py b/dvc/repo/fetch.py index 
fe3cd9c6f6..3fe807e07a 100644 --- a/dvc/repo/fetch.py +++ b/dvc/repo/fetch.py @@ -79,12 +79,14 @@ def _fetch_external(self, repo_url, repo_rev, files, jobs): from dvc.external_repo import external_repo failed, downloaded = 0, 0 + cache = self.cache.local try: - with external_repo(repo_url, repo_rev) as repo: - with repo.use_cache(self.cache.local): - d, f, _ = repo.fetch_external(files, jobs=jobs) - downloaded += d - failed += f + with external_repo( + repo_url, repo_rev, cache_dir=cache.cache_dir + ) as repo: + d, f, _ = repo.fetch_external(files, jobs=jobs) + downloaded += d + failed += f except CloneError: failed += 1 logger.exception( diff --git a/dvc/repo/get.py b/dvc/repo/get.py index f14011f523..f2c62673cb 100644 --- a/dvc/repo/get.py +++ b/dvc/repo/get.py @@ -35,21 +35,21 @@ def get(url, path, out=None, rev=None): # and won't work with reflink/hardlink. dpath = os.path.dirname(os.path.abspath(out)) tmp_dir = os.path.join(dpath, "." + str(shortuuid.uuid())) + + # Try any links possible to avoid data duplication. + # + # Not using symlink, because we need to remove cache after we + # are done, and to make that work we would have to copy data + # over anyway before removing the cache, so we might just copy + # it right away. + # + # Also, we can't use theoretical "move" link type here, because + # the same cache file might be used a few times in a directory. + cache_types = ["reflink", "hardlink", "copy"] try: - with external_repo(url=url, rev=rev) as repo: - if hasattr(repo, "cache"): - repo.cache.local.cache_dir = tmp_dir - - # Try any links possible to avoid data duplication. - # - # Not using symlink, because we need to remove cache after we - # are done, and to make that work we would have to copy data - # over anyway before removing the cache, so we might just copy - # it right away. - # - # Also, we can't use theoretical "move" link type here, because - # the same cache file might be used a few times in a directory. - repo.cache.local.cache_types = ["reflink", "hardlink", "copy"] + with external_repo( + url=url, rev=rev, cache_dir=tmp_dir, cache_types=cache_types + ) as repo: repo.get_external(path, out) finally: remove(tmp_dir) diff --git a/dvc/repo/ls.py b/dvc/repo/ls.py index a761b91dd1..51878d349e 100644 --- a/dvc/repo/ls.py +++ b/dvc/repo/ls.py @@ -31,7 +31,9 @@ def ls( """ from dvc.external_repo import external_repo - with external_repo(url, rev) as repo: + # fetch=False and stream=True configure repo.repo_tree to stream directory + # listings without fetching file contents. + with external_repo(url, rev, fetch=False, stream=True) as repo: path_info = PathInfo(repo.root_dir) if path: path_info /= path @@ -50,14 +52,10 @@ def ls( def _ls(repo, path_info, recursive=None, dvc_only=False): - from dvc.tree.repo import RepoTree - def onerror(exc): raise exc - # use our own RepoTree instance instead of repo.repo_tree since we want to - # fetch directory listings, but don't want to fetch file contents. 
- tree = RepoTree(repo, stream=True) + tree = repo.repo_tree infos = [] try: diff --git a/dvc/tree/_metadata.py b/dvc/tree/_metadata.py index 325f1ce3a4..6ea8822a21 100644 --- a/dvc/tree/_metadata.py +++ b/dvc/tree/_metadata.py @@ -1,9 +1,12 @@ from dataclasses import dataclass, field -from typing import List +from typing import TYPE_CHECKING, List from dvc.output import BaseOutput from dvc.path_info import PathInfo +if TYPE_CHECKING: + from dvc.repo import Repo + @dataclass class Metadata: @@ -14,6 +17,7 @@ class Metadata: # required field path_info: PathInfo + repo: "Repo" # computed fields is_output: bool = field(init=False, default=False) # is it an output? diff --git a/dvc/tree/dvc.py b/dvc/tree/dvc.py index 3ae13493a8..520ee42bcc 100644 --- a/dvc/tree/dvc.py +++ b/dvc/tree/dvc.py @@ -268,6 +268,6 @@ def metadata(self, path_info): path_info = PathInfo(os.path.abspath(path_info)) outs = self._find_outs(path_info, strict=False, recursive=True) - meta = Metadata(path_info=path_info, outs=outs) + meta = Metadata(path_info=path_info, outs=outs, repo=self.repo) meta.isdir = meta.isdir or self.check_isdir(meta.path_info, meta.outs) return meta diff --git a/dvc/tree/git.py b/dvc/tree/git.py index 03b86c379d..25cc9d68fd 100644 --- a/dvc/tree/git.py +++ b/dvc/tree/git.py @@ -74,9 +74,10 @@ def dvcignore(self): return cls(self, root) def open( - self, path, mode="r", encoding="utf-8" + self, path, mode="r", encoding=None ): # pylint: disable=arguments-differ assert mode in {"r", "rb"} + encoding = encoding or "utf-8" relative_path = relpath(path, self.git.working_dir) diff --git a/dvc/tree/repo.py b/dvc/tree/repo.py index e736e55836..4861c068ea 100644 --- a/dvc/tree/repo.py +++ b/dvc/tree/repo.py @@ -364,7 +364,8 @@ def hash_jobs(self): # pylint: disable=invalid-overridden-method return self._main_repo.tree.hash_jobs def metadata(self, path): - path_info = PathInfo(os.path.abspath(path)) + abspath = os.path.abspath(path) + path_info = PathInfo(abspath) tree, dvc_tree = self._get_tree_pair(path_info) dvc_meta = None @@ -379,7 +380,10 @@ def metadata(self, path): if not stat_result and not dvc_meta: raise FileNotFoundError - meta = dvc_meta or Metadata(path_info=path_info) + meta = dvc_meta or Metadata( + path_info=path_info, + repo=self._get_repo(abspath) or self._main_repo, + ) isdir = bool(stat_result) and stat.S_ISDIR(stat_result.st_mode) meta.isdir = meta.isdir or isdir diff --git a/tests/func/test_api.py b/tests/func/test_api.py index 6920df5654..a736103a11 100644 --- a/tests/func/test_api.py +++ b/tests/func/test_api.py @@ -3,10 +3,10 @@ import pytest from dvc import api -from dvc.api import UrlNotDvcRepoError -from dvc.exceptions import FileMissingError +from dvc.exceptions import FileMissingError, OutputNotFoundError from dvc.path_info import URLInfo from dvc.utils.fs import remove +from tests.unit.tree.test_repo import make_subrepo cloud_names = [ "s3", @@ -47,10 +47,10 @@ def test_get_url_external(erepo_dir, cloud): def test_get_url_requires_dvc(tmp_dir, scm): tmp_dir.scm_gen({"foo": "foo"}, commit="initial") - with pytest.raises(UrlNotDvcRepoError, match="not a DVC repository"): + with pytest.raises(OutputNotFoundError, match="output 'foo'"): api.get_url("foo", repo=os.fspath(tmp_dir)) - with pytest.raises(UrlNotDvcRepoError): + with pytest.raises(OutputNotFoundError, match="output 'foo'"): api.get_url("foo", repo=f"file://{tmp_dir}") @@ -173,3 +173,66 @@ def test_open_not_cached(dvc): os.remove(metric_file) with pytest.raises(FileMissingError): api.read(metric_file) + + 
+@pytest.mark.parametrize("local_repo", [False, True]) +def test_read_with_subrepos(tmp_dir, scm, local_cloud, local_repo): + tmp_dir.scm_gen("foo.txt", "foo.txt", commit="add foo.txt") + subrepo = tmp_dir / "dir" / "subrepo" + make_subrepo(subrepo, scm, config=local_cloud.config) + with subrepo.chdir(): + subrepo.scm_gen({"lorem": "lorem"}, commit="add lorem") + subrepo.dvc_gen({"dir": {"file.txt": "file.txt"}}, commit="add dir") + subrepo.dvc_gen("dvc-file", "dvc-file", commit="add dir") + subrepo.dvc.push() + + repo_path = None if local_repo else f"file:///{tmp_dir}" + subrepo_path = os.path.join("dir", "subrepo") + + assert api.read("foo.txt", repo=repo_path) == "foo.txt" + assert ( + api.read(os.path.join(subrepo_path, "lorem"), repo=repo_path) + == "lorem" + ) + assert ( + api.read(os.path.join(subrepo_path, "dvc-file"), repo=repo_path) + == "dvc-file" + ) + assert ( + api.read(os.path.join(subrepo_path, "dir", "file.txt"), repo=repo_path) + == "file.txt" + ) + + +def test_get_url_granular(tmp_dir, dvc, s3): + tmp_dir.add_remote(config=s3.config) + tmp_dir.dvc_gen( + {"dir": {"foo": "foo", "bar": "bar", "nested": {"file": "file"}}} + ) + + expected_url = URLInfo(s3.url) / "ac/bd18db4cc2f85cedef654fccc4a4d8" + assert api.get_url("dir/foo") == expected_url + + expected_url = URLInfo(s3.url) / "37/b51d194a7513e45b56f6524f2d51f2" + assert api.get_url("dir/bar") == expected_url + + expected_url = URLInfo(s3.url) / "8c/7dd922ad47494fc02c388e12c00eac" + assert api.get_url(os.path.join("dir", "nested", "file")) == expected_url + + +def test_get_url_subrepos(tmp_dir, scm, local_cloud): + subrepo = tmp_dir / "subrepo" + make_subrepo(subrepo, scm, config=local_cloud.config) + with subrepo.chdir(): + subrepo.dvc_gen( + {"dir": {"foo": "foo"}, "bar": "bar"}, commit="add files" + ) + subrepo.dvc.push() + + path = os.path.relpath(local_cloud.config["url"]) + + expected_url = os.path.join(path, "ac", "bd18db4cc2f85cedef654fccc4a4d8") + assert api.get_url(os.path.join("subrepo", "dir", "foo")) == expected_url + + expected_url = os.path.join(path, "37", "b51d194a7513e45b56f6524f2d51f2") + assert api.get_url("subrepo/bar") == expected_url diff --git a/tests/func/test_external_repo.py b/tests/func/test_external_repo.py index 50b142dd48..7e1d53f481 100644 --- a/tests/func/test_external_repo.py +++ b/tests/func/test_external_repo.py @@ -3,11 +3,13 @@ from mock import ANY, patch from dvc.external_repo import CLONES, external_repo +from dvc.hash_info import HashInfo from dvc.path_info import PathInfo from dvc.scm.git import Git from dvc.tree.local import LocalTree from dvc.utils import relpath -from dvc.utils.fs import remove +from dvc.utils.fs import makedirs, remove +from tests.unit.tree.test_repo import make_subrepo def test_external_repo(erepo_dir): @@ -173,3 +175,52 @@ def test_shallow_clone_tag(erepo_dir): assert mock_clone.call_count == 1 _, shallow = CLONES[url] assert not shallow + + +def test_subrepos_are_ignored(tmp_dir, erepo_dir): + subrepo = erepo_dir / "dir" / "subrepo" + make_subrepo(subrepo, erepo_dir.scm) + with erepo_dir.chdir(): + erepo_dir.dvc_gen("dir/foo", "foo", commit="foo") + erepo_dir.scm_gen("dir/bar", "bar", commit="bar") + + with subrepo.chdir(): + subrepo.dvc_gen({"file": "file"}, commit="add files on subrepo") + + with external_repo(os.fspath(erepo_dir)) as repo: + repo.get_external("dir", "out") + expected_files = {"foo": "foo", "bar": "bar", ".gitignore": "/foo\n"} + assert (tmp_dir / "out").read_text() == expected_files + + expected_hash = HashInfo("md5", 
"e1d9e8eae5374860ae025ec84cfd85c7.dir") + assert ( + repo.get_checksum(os.path.join(repo.root_dir, "dir")) + == expected_hash + ) + + # clear cache to test `fetch_external` again + cache_dir = tmp_dir / repo.cache.local.cache_dir + remove(cache_dir) + makedirs(cache_dir) + + assert repo.fetch_external(["dir"]) == ( + len(expected_files), + 0, + [expected_hash], + ) + + +def test_subrepos_are_ignored_for_git_tracked_dirs(tmp_dir, erepo_dir): + subrepo = erepo_dir / "dir" / "subrepo" + make_subrepo(subrepo, erepo_dir.scm) + with erepo_dir.chdir(): + scm_files = {"foo": "foo", "bar": "bar", "subdir": {"lorem": "lorem"}} + erepo_dir.scm_gen({"dir": scm_files}, commit="add scm dir") + + with subrepo.chdir(): + subrepo.dvc_gen({"file": "file"}, commit="add files on subrepo") + + with external_repo(os.fspath(erepo_dir)) as repo: + repo.get_external("dir", "out") + # subrepo files should not be here + assert (tmp_dir / "out").read_text() == scm_files diff --git a/tests/func/test_get.py b/tests/func/test_get.py index a74ebdbca0..d3cbfbbd0e 100644 --- a/tests/func/test_get.py +++ b/tests/func/test_get.py @@ -5,11 +5,13 @@ from dvc.cache import Cache from dvc.exceptions import PathMissingError +from dvc.external_repo import IsADVCRepoError from dvc.main import main from dvc.repo import Repo from dvc.repo.get import GetDVCFileError from dvc.system import System from dvc.utils.fs import makedirs +from tests.unit.tree.test_repo import make_subrepo def test_get_repo_file(tmp_dir, erepo_dir): @@ -265,3 +267,45 @@ def test_get_mixed_dir(tmp_dir, erepo_dir): "foo": "foo", "bar": "bar", } + + +@pytest.mark.parametrize("is_dvc", [True, False]) +@pytest.mark.parametrize("files", [{"foo": "foo"}, {"dir": {"bar": "bar"}}]) +def test_get_from_subrepos(tmp_dir, erepo_dir, is_dvc, files): + subrepo = erepo_dir / "subrepo" + make_subrepo(subrepo, erepo_dir.scm) + gen = subrepo.dvc_gen if is_dvc else subrepo.scm_gen + with subrepo.chdir(): + gen(files, commit="add files in subrepo") + + key = next(iter(files)) + Repo.get(os.fspath(erepo_dir), f"subrepo/{key}", out="out") + + assert (tmp_dir / "out").read_text() == files[key] + + +def test_granular_get_from_subrepos(tmp_dir, erepo_dir): + subrepo = erepo_dir / "subrepo" + make_subrepo(subrepo, erepo_dir.scm) + with subrepo.chdir(): + subrepo.dvc_gen({"dir": {"bar": "bar"}}, commit="files in subrepo") + + path = os.path.join("subrepo", "dir", "bar") + Repo.get(os.fspath(erepo_dir), path, out="out") + assert (tmp_dir / "out").read_text() == "bar" + + +def test_try_to_get_complete_repo(tmp_dir, dvc, erepo_dir): + subrepo = erepo_dir / "subrepo" + make_subrepo(subrepo, erepo_dir.scm) + with subrepo.chdir(): + subrepo.dvc_gen({"dir": {"bar": "bar"}}, commit="files in subrepo") + + expected_message = "Cannot fetch a complete DVC repository" + with pytest.raises(IsADVCRepoError) as exc_info: + Repo.get(os.fspath(erepo_dir), "subrepo", out="out") + assert f"{expected_message} 'subrepo'" == str(exc_info.value) + + with pytest.raises(IsADVCRepoError) as exc_info: + Repo.get(os.fspath(erepo_dir), ".", out="out") + assert expected_message == str(exc_info.value) diff --git a/tests/func/test_import.py b/tests/func/test_import.py index f3cdc9184f..ca5e803655 100644 --- a/tests/func/test_import.py +++ b/tests/func/test_import.py @@ -2,6 +2,7 @@ import os import pytest +from funcy import first from mock import patch import dvc.data_cloud as cloud @@ -9,9 +10,11 @@ from dvc.config import NoRemoteError from dvc.dvcfile import Dvcfile from dvc.exceptions import DownloadError, 
PathMissingError +from dvc.external_repo import IsADVCRepoError from dvc.stage.exceptions import StagePathNotFoundError from dvc.system import System from dvc.utils.fs import makedirs, remove +from tests.unit.tree.test_repo import make_subrepo def test_import(tmp_dir, scm, dvc, erepo_dir): @@ -381,3 +384,84 @@ def test_import_mixed_dir(tmp_dir, dvc, erepo_dir): "foo": "foo", "bar": "bar", } + + +@pytest.mark.parametrize("is_dvc", [True, False]) +@pytest.mark.parametrize("files", [{"foo": "foo"}, {"dir": {"bar": "bar"}}]) +def test_import_subrepos(tmp_dir, erepo_dir, dvc, scm, is_dvc, files): + subrepo = erepo_dir / "subrepo" + make_subrepo(subrepo, erepo_dir.scm) + gen = subrepo.dvc_gen if is_dvc else subrepo.scm_gen + with subrepo.chdir(): + gen(files, commit="add files in subrepo") + + key = next(iter(files)) + path = str((subrepo / key).relative_to(erepo_dir)) + + stage = dvc.imp(os.fspath(erepo_dir), path, out="out",) + + assert (tmp_dir / "out").read_text() == files[key] + assert stage.deps[0].def_path == path + assert stage.deps[0].def_repo == { + "url": os.fspath(erepo_dir), + "rev_lock": erepo_dir.scm.get_rev(), + } + + +def test_granular_import_from_subrepos(tmp_dir, dvc, erepo_dir): + subrepo = erepo_dir / "subrepo" + make_subrepo(subrepo, erepo_dir.scm) + with subrepo.chdir(): + subrepo.dvc_gen({"dir": {"bar": "bar"}}, commit="files in subrepo") + + path = os.path.join("subrepo", "dir", "bar") + stage = dvc.imp(os.fspath(erepo_dir), path, out="out") + assert (tmp_dir / "out").read_text() == "bar" + assert stage.deps[0].def_path == path + assert stage.deps[0].def_repo == { + "url": os.fspath(erepo_dir), + "rev_lock": erepo_dir.scm.get_rev(), + } + + +@pytest.mark.parametrize("is_dvc", [True, False]) +@pytest.mark.parametrize("files", [{"foo": "foo"}, {"dir": {"bar": "bar"}}]) +def test_pull_imported_stage_from_subrepos( + tmp_dir, dvc, erepo_dir, is_dvc, files +): + subrepo = erepo_dir / "subrepo" + make_subrepo(subrepo, erepo_dir.scm) + gen = subrepo.dvc_gen if is_dvc else subrepo.scm_gen + with subrepo.chdir(): + gen(files, commit="files in subrepo") + + key = first(files) + path = os.path.join("subrepo", key) + dvc.imp(os.fspath(erepo_dir), path, out="out") + + # clean everything + remove(dvc.cache.local.cache_dir) + remove("out") + makedirs(dvc.cache.local.cache_dir) + + stats = dvc.pull(["out.dvc"]) + + expected = [f"out{os.sep}"] if isinstance(files[key], dict) else ["out"] + assert stats["added"] == expected + assert (tmp_dir / "out").read_text() == files[key] + + +def test_try_import_complete_repo(tmp_dir, dvc, erepo_dir): + subrepo = erepo_dir / "subrepo" + make_subrepo(subrepo, erepo_dir.scm) + with subrepo.chdir(): + subrepo.dvc_gen({"dir": {"bar": "bar"}}, commit="files in subrepo") + + expected_message = "Cannot fetch a complete DVC repository" + with pytest.raises(IsADVCRepoError) as exc_info: + dvc.imp(os.fspath(erepo_dir), "subrepo", out="out") + assert f"{expected_message} 'subrepo'" == str(exc_info.value) + + with pytest.raises(IsADVCRepoError) as exc_info: + dvc.imp(os.fspath(erepo_dir), os.curdir, out="out") + assert expected_message == str(exc_info.value) diff --git a/tests/func/test_ls.py b/tests/func/test_ls.py index 0f65c52f4a..ff189b7c24 100644 --- a/tests/func/test_ls.py +++ b/tests/func/test_ls.py @@ -1,6 +1,7 @@ import os import shutil import textwrap +from operator import itemgetter import pytest @@ -512,3 +513,46 @@ def _ls(path): {"isdir": False, "isexec": 0, "isout": False, "path": "bar"}, {"isdir": False, "isexec": 0, "isout": False, "path": 
"foo"}, ] + + +@pytest.mark.parametrize( + "dvc_top_level, erepo", + [ + (True, pytest.lazy_fixture("erepo_dir")), + (False, pytest.lazy_fixture("git_dir")), + ], +) +def test_subrepo(dvc_top_level, erepo): + from tests.func.test_get import make_subrepo + + dvc_files = {"foo.txt": "foo.txt", "dvc_dir": {"lorem": "lorem"}} + scm_files = {"bar.txt": "bar.txt", "scm_dir": {"ipsum": "ipsum"}} + subrepo = erepo / "subrepo" + make_subrepo(subrepo, erepo.scm) + + for repo in [erepo, subrepo]: + with repo.chdir(): + repo.scm_gen(scm_files, commit=f"scm track for top {repo}") + if hasattr(repo, "dvc"): + repo.dvc_gen(dvc_files, commit=f"dvc track for {repo}") + + def _list_files(path=None): + return set(map(itemgetter("path"), Repo.ls(os.fspath(erepo), path))) + + extras = {".dvcignore", ".gitignore"} + git_tracked_outputs = {"bar.txt", "scm_dir"} + dvc_files = {"dvc_dir", "foo.txt", "foo.txt.dvc", "dvc_dir.dvc"} + common_outputs = git_tracked_outputs | extras | dvc_files + + top_level_outputs = ( + common_outputs if dvc_top_level else git_tracked_outputs + ) + assert _list_files() == top_level_outputs | {"subrepo"} + assert _list_files("subrepo") == common_outputs + + assert _list_files("scm_dir") == {"ipsum"} + assert _list_files("subrepo/scm_dir") == {"ipsum"} + + if dvc_top_level: + assert _list_files("dvc_dir") == {"lorem"} + assert _list_files("subrepo/dvc_dir") == {"lorem"} diff --git a/tests/func/test_run_multistage.py b/tests/func/test_run_multistage.py index ec05642fa9..e7d49a63da 100644 --- a/tests/func/test_run_multistage.py +++ b/tests/func/test_run_multistage.py @@ -4,7 +4,6 @@ import pytest from dvc.exceptions import InvalidArgumentError -from dvc.repo import Repo from dvc.stage.exceptions import DuplicateStageName, InvalidStageName from dvc.utils.serialize import dump_yaml, parse_yaml_for_update @@ -306,9 +305,9 @@ def test_run_params_no_exec(tmp_dir, dvc): {"outs": ["foo"], "deps": ["bar"], "name": "copy-foo-bar"}, ], ) -def test_run_without_cmd(kwargs): +def test_run_without_cmd(tmp_dir, dvc, kwargs): with pytest.raises(InvalidArgumentError) as exc: - Repo().run(**kwargs) + dvc.run(**kwargs) assert "command is not specified" == str(exc.value) diff --git a/tests/func/test_update.py b/tests/func/test_update.py index 12d0c9d61c..6a21e4967d 100644 --- a/tests/func/test_update.py +++ b/tests/func/test_update.py @@ -3,6 +3,7 @@ import pytest from dvc.dvcfile import Dvcfile +from tests.unit.tree.test_repo import make_subrepo @pytest.mark.parametrize("cached", [True, False]) @@ -282,3 +283,32 @@ def test_update_recursive(tmp_dir, dvc, erepo_dir): assert stage1.deps[0].def_repo["rev_lock"] == new_rev assert stage2.deps[0].def_repo["rev_lock"] == new_rev assert stage3.deps[0].def_repo["rev_lock"] == new_rev + + +@pytest.mark.parametrize("is_dvc", [True, False]) +def test_update_from_subrepos(tmp_dir, dvc, erepo_dir, is_dvc): + subrepo = erepo_dir / "subrepo" + make_subrepo(subrepo, erepo_dir.scm) + gen = subrepo.dvc_gen if is_dvc else subrepo.scm_gen + with subrepo.chdir(): + gen("foo", "foo", commit="subrepo initial") + + path = os.path.join("subrepo", "foo") + repo_path = os.fspath(erepo_dir) + dvc.imp(repo_path, path, out="out") + assert dvc.status() == {} + + with subrepo.chdir(): + gen("foo", "foobar", commit="subrepo second commit") + + assert dvc.status()["out.dvc"][0]["changed deps"] == { + f"{path} ({repo_path})": "update available" + } + (stage,) = dvc.update(["out.dvc"]) + + assert (tmp_dir / "out").read_text() == "foobar" + assert stage.deps[0].def_path == 
os.path.join("subrepo", "foo") + assert stage.deps[0].def_repo == { + "url": repo_path, + "rev_lock": erepo_dir.scm.get_rev(), + } diff --git a/tests/unit/test_external_repo.py b/tests/unit/test_external_repo.py new file mode 100644 index 0000000000..948796c738 --- /dev/null +++ b/tests/unit/test_external_repo.py @@ -0,0 +1,79 @@ +import os +from unittest.mock import call + +import pytest + +from dvc.external_repo import external_repo +from tests.unit.tree.test_repo import make_subrepo + + +def test_hook_is_called(tmp_dir, erepo_dir, mocker): + subrepo_paths = [ + "subrepo1", + "subrepo2", + os.path.join("dir", "subrepo3"), + os.path.join("dir", "subrepo4"), + "subrepo5", + os.path.join("subrepo5", "subrepo6"), + ] + subrepos = [erepo_dir / path for path in subrepo_paths] + for repo in subrepos: + make_subrepo(repo, erepo_dir.scm) + + for repo in subrepos + [erepo_dir]: + with repo.chdir(): + repo.scm_gen("foo", "foo", commit=f"git add {repo}/foo") + repo.dvc_gen("bar", "bar", commit=f"dvc add {repo}/bar") + + with external_repo(str(erepo_dir)) as repo: + spy = mocker.spy(repo, "make_repo") + + list(repo.repo_tree.walk(repo.root_dir)) # drain + assert spy.call_count == len(subrepos) + + paths = [os.path.join(repo.root_dir, path) for path in subrepo_paths] + spy.assert_has_calls([call(path) for path in paths], any_order=True) + + +@pytest.mark.parametrize("root_is_dvc", [False, True]) +def test_subrepo_is_constructed_properly( + tmp_dir, scm, mocker, make_tmp_dir, root_is_dvc +): + if root_is_dvc: + make_subrepo(tmp_dir, scm) + + subrepo = tmp_dir / "subrepo" + make_subrepo(subrepo, scm) + local_cache = subrepo.dvc.cache.local.cache_dir + + tmp_dir.scm_gen("bar", "bar", commit="add bar") + subrepo.dvc_gen("foo", "foo", commit="add foo") + + cache_dir = make_tmp_dir("temp-cache") + with external_repo( + str(tmp_dir), cache_dir=str(cache_dir), cache_types=["symlink"] + ) as repo: + spy = mocker.spy(repo, "make_repo") + + list(repo.repo_tree.walk(repo.root_dir)) # drain + assert spy.call_count == 1 + subrepo = spy.return_value + + assert repo.url == str(tmp_dir) + assert repo.cache_dir == str(cache_dir) + assert repo.cache.local.cache_dir == str(cache_dir) + assert subrepo.cache.local.cache_dir == str(cache_dir) + + assert repo.cache_types == ["symlink"] + assert repo.cache.local.cache_types == ["symlink"] + assert subrepo.cache.local.cache_types == ["symlink"] + + assert ( + subrepo.config["remote"]["auto-generated-upstream"]["url"] + == local_cache + ) + if root_is_dvc: + main_cache = tmp_dir.dvc.cache.local.cache_dir + assert repo.config["remote"]["auto-generated-upstream"][ + "url" + ] == str(main_cache) diff --git a/tests/unit/tree/test_repo_metadata.py b/tests/unit/tree/test_repo_metadata.py index ef559c3abf..e14f8abccc 100644 --- a/tests/unit/tree/test_repo_metadata.py +++ b/tests/unit/tree/test_repo_metadata.py @@ -2,6 +2,7 @@ from dvc.path_info import PathInfo from dvc.tree.repo import RepoTree +from tests.unit.tree.test_repo import make_subrepo @pytest.fixture(scope="module") @@ -53,7 +54,7 @@ def repo_tree(temp_repo): temp_repo.scm_gen(fs_structure, commit="repo init") temp_repo.dvc_gen(dvc_structure, commit="use dvc") - yield RepoTree(temp_repo.dvc, fetch=True) + yield RepoTree(temp_repo.dvc, fetch=True, subrepos=True) def test_metadata_not_existing(repo_tree): @@ -78,6 +79,7 @@ def test_metadata_git_tracked_file(repo_tree, path): meta = repo_tree.metadata(path) assert meta.path_info == root / path + assert meta.repo.root_dir == repo_tree.root_dir assert not meta.is_output assert 
not meta.part_of_output assert not meta.contains_outputs @@ -116,6 +118,7 @@ def test_metadata_dvc_tracked_file(repo_tree, path, outs, is_output): meta = repo_tree.metadata(path) assert meta.path_info == root / path + assert meta.repo.root_dir == repo_tree.root_dir assert meta.is_output == is_output assert meta.part_of_output != is_output assert not meta.contains_outputs @@ -133,6 +136,7 @@ def test_metadata_git_only_dirs(repo_tree, path): meta = repo_tree.metadata(path) assert meta.path_info == root / path + assert meta.repo.root_dir == repo_tree.root_dir assert not meta.is_output assert not meta.part_of_output assert not meta.contains_outputs @@ -156,6 +160,7 @@ def test_metadata_git_dvc_mixed_dirs(repo_tree, path, expected_outs): meta = repo_tree.metadata(root / path) assert meta.path_info == root / path + assert meta.repo.root_dir == repo_tree.root_dir assert not meta.is_output assert not meta.part_of_output assert meta.contains_outputs @@ -184,6 +189,7 @@ def test_metadata_dvc_only_dirs(repo_tree, path, is_output): meta = repo_tree.metadata(root / path) assert meta.path_info == root / path + assert meta.repo.root_dir == repo_tree.root_dir assert meta.is_output == is_output assert meta.part_of_output != is_output assert not meta.contains_outputs @@ -193,3 +199,23 @@ def test_metadata_dvc_only_dirs(repo_tree, path, is_output): assert not meta.is_exec assert not meta.isfile assert {out.path_info for out in meta.outs} == {data} + + +def test_metadata_on_subrepos(make_tmp_dir, temp_repo, repo_tree): + subrepo = temp_repo / "subrepo" + make_subrepo(subrepo, temp_repo.scm) + subrepo.scm_gen("foo", "foo", commit="add foo on subrepo") + subrepo.dvc_gen("foobar", "foobar", commit="add foobar on subrepo") + + for path in ["subrepo", "subrepo/foo", "subrepo/foobar"]: + meta = repo_tree.metadata(temp_repo / path) + assert meta.repo.root_dir == str( + subrepo + ), f"repo root didn't match for {path}" + + # supports external outputs on top-level DVC repo + external_dir = make_tmp_dir("external-output") + external_dir.gen("bar", "bar") + temp_repo.dvc.add(str(external_dir / "bar"), external=True) + meta = repo_tree.metadata(external_dir / "bar") + assert meta.repo.root_dir == str(temp_repo)
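
For reference, a minimal usage sketch of the subrepo-aware API this diff enables. The repository URL and layout below are hypothetical (they mirror the test fixtures above); only `api.read` and `api.get_url` from this change set are assumed.

```python
import os

from dvc import api

# Hypothetical layout, mirroring the tests above: a git repo at `repo_url`
# containing a DVC subrepo at dir/subrepo that tracks a file named "lorem".
repo_url = "https://example.com/user/project.git"  # placeholder URL
path = os.path.join("dir", "subrepo", "lorem")

# Repos are now opened with subrepos enabled, so a path that lives inside a
# nested DVC repo resolves through that subrepo's own cache/remote config.
content = api.read(path, repo=repo_url)

# get_url() resolves the hash via the owning (sub)repo's cloud and raises
# OutputNotFoundError if the path is not DVC-tracked.
url = api.get_url(path, repo=repo_url)
print(content, url)
```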