From e190ba2d367b438bccc4e4ed8e9e93672e03e0e6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Saugat=20Pachhai=20=28=E0=A4=B8=E0=A5=8C=E0=A4=97=E0=A4=BE?= =?UTF-8?q?=E0=A4=A4=29?= Date: Mon, 24 Aug 2020 18:45:42 +0545 Subject: [PATCH] api: refactor to use metadata and RepoTree This does not yet support subrepos as it requires us to work on setting up cache correctly which is pending. --- dvc/api.py | 50 +++++++++++++++++++++++++++--------------- dvc/data_cloud.py | 8 +++++++ dvc/exceptions.py | 21 ++++++++---------- dvc/tree/_metadata.py | 6 ++++- dvc/tree/dvc.py | 5 +++-- tests/func/test_api.py | 13 +++++------ 6 files changed, 63 insertions(+), 40 deletions(-) diff --git a/dvc/api.py b/dvc/api.py index a2d648aa17..a2b255530e 100644 --- a/dvc/api.py +++ b/dvc/api.py @@ -2,16 +2,17 @@ from contextlib import _GeneratorContextManager as GCM from contextlib import contextmanager -from dvc.exceptions import DvcException, NotDvcRepoError +from funcy import reraise + +from dvc.exceptions import ( + NotDvcRepoError, + OutputNotFoundError, + PathMissingError, +) from dvc.external_repo import external_repo +from dvc.path_info import PathInfo from dvc.repo import Repo - - -class UrlNotDvcRepoError(DvcException): - """Thrown if the given URL is not a DVC repository.""" - - def __init__(self, url): - super().__init__(f"'{url}' is not a DVC repository.") +from dvc.tree.repo import RepoTree def get_url(path, repo=None, rev=None, remote=None): @@ -20,17 +21,26 @@ def get_url(path, repo=None, rev=None, remote=None): in a DVC repo. For Git repos, HEAD is used unless a rev argument is supplied. The default remote is tried unless a remote argument is supplied. - Raises UrlNotDvcRepoError if repo is not a DVC project. + Raises OutputNotFoundError if the file is not a dvc-tracked file. NOTE: This function does not check for the actual existence of the file or directory in the remote storage. """ with _make_repo(repo, rev=rev) as _repo: - if not isinstance(_repo, Repo): - raise UrlNotDvcRepoError(_repo.url) # pylint: disable=no-member - out = _repo.find_out_by_relpath(path) - remote_obj = _repo.cloud.get_remote(remote) - return str(remote_obj.tree.hash_to_path_info(out.checksum)) + tree = RepoTree(_repo, fetch=True) + path_info = PathInfo(_repo.root_dir) / path + + meta = tree.metadata(path_info) + exc = OutputNotFoundError(path) + if not meta.is_dvc: + raise exc + + with reraise(FileNotFoundError, exc): + _, hash_ = tree.get_hash(PathInfo(_repo.root_dir) / path) + + assert hash_ and meta.repo + cloud = meta.repo.cloud + return cloud.get_url_for(remote, checksum=hash_) def open( # noqa, pylint: disable=redefined-builtin @@ -74,10 +84,14 @@ def __getattr__(self, name): def _open(path, repo=None, rev=None, remote=None, mode="r", encoding=None): with _make_repo(repo, rev=rev) as _repo: - with _repo.open_by_relpath( - path, remote=remote, mode=mode, encoding=encoding - ) as fd: - yield fd + tree = RepoTree(_repo, stream=True, fetch=True) + + path = PathInfo(_repo.root_dir) / path + with reraise(FileNotFoundError, PathMissingError(path, repo)): + with tree.open( + path, remote=remote, mode=mode, encoding=encoding + ) as fd: + yield fd def read(path, repo=None, rev=None, remote=None, mode="r", encoding=None): diff --git a/dvc/data_cloud.py b/dvc/data_cloud.py index 5a3407759c..85b0dc12f4 100644 --- a/dvc/data_cloud.py +++ b/dvc/data_cloud.py @@ -116,3 +116,11 @@ def status(self, cache, jobs=None, remote=None, show_checksums=False): return self.repo.cache.local.status( cache, jobs=jobs, remote=remote, show_checksums=show_checksums ) + + def get_url_for(self, remote=None, checksum=None): + remote = self.get_remote(remote) + tree = remote.tree + + if checksum: + return tree.hash_to_path_info(checksum).url + return tree.path_info.url diff --git a/dvc/exceptions.py b/dvc/exceptions.py index 6e140b5f61..709c1d309b 100644 --- a/dvc/exceptions.py +++ b/dvc/exceptions.py @@ -291,19 +291,16 @@ def __init__(self, code, reason): class PathMissingError(DvcException): - default_msg = ( - "The path '{}' does not exist in the target repository '{}'" - " neither as a DVC output nor as a Git-tracked file." - ) - default_msg_dvc_only = ( - "The path '{}' does not exist in the target repository '{}'" - " as an DVC output." - ) - - def __init__(self, path, repo, dvc_only=False): - msg = self.default_msg if not dvc_only else self.default_msg_dvc_only - super().__init__(msg.format(path, repo)) + def __init__(self, path, repo=None, dvc_only=False): + msg = f"The path '{path}' does not exist in the" + msg += " repository" if not repo else f" target repository '{repo}'" + msg += " as a DVC output" + msg = f"{msg}." if dvc_only else f"{msg} nor as a Git-tracked file." + self.dvc_only = dvc_only + self.repo = repo + self.path = path + super().__init__(msg) class RemoteCacheRequiredError(DvcException): diff --git a/dvc/tree/_metadata.py b/dvc/tree/_metadata.py index 325f1ce3a4..f96be39a71 100644 --- a/dvc/tree/_metadata.py +++ b/dvc/tree/_metadata.py @@ -1,9 +1,12 @@ from dataclasses import dataclass, field -from typing import List +from typing import TYPE_CHECKING, List, Optional from dvc.output import BaseOutput from dvc.path_info import PathInfo +if TYPE_CHECKING: + from dvc.repo import Repo + @dataclass class Metadata: @@ -30,6 +33,7 @@ class Metadata: isdir: bool = False # is it a directory? is_exec: bool = False # is it an executable? outs: List[BaseOutput] = field(default_factory=list) # list of outputs + repo: Optional["Repo"] = None # the repo path falls in def __post_init__(self): self.output_exists = bool(self.outs) diff --git a/dvc/tree/dvc.py b/dvc/tree/dvc.py index 1ffc931a5e..3197c1035f 100644 --- a/dvc/tree/dvc.py +++ b/dvc/tree/dvc.py @@ -45,7 +45,8 @@ def _get_granular_checksum(self, path, out, remote=None): assert isinstance(path, PathInfo) if not self.fetch and not self.stream: raise FileNotFoundError - dir_cache = out.get_dir_cache(remote=remote) + with self.repo.state: + dir_cache = out.get_dir_cache(remote=remote) for entry in dir_cache: entry_relpath = entry[out.tree.PARAM_RELPATH] if os.name == "nt": @@ -240,6 +241,6 @@ def metadata(self, path_info): path_info = PathInfo(os.path.abspath(path_info)) outs = self._find_outs(path_info, strict=False, recursive=True) - meta = Metadata(path_info=path_info, outs=outs) + meta = Metadata(path_info=path_info, outs=outs, repo=self.repo) meta.isdir = meta.isdir or self.check_isdir(meta.path_info, meta.outs) return meta diff --git a/tests/func/test_api.py b/tests/func/test_api.py index 6920df5654..bd38d20589 100644 --- a/tests/func/test_api.py +++ b/tests/func/test_api.py @@ -3,8 +3,7 @@ import pytest from dvc import api -from dvc.api import UrlNotDvcRepoError -from dvc.exceptions import FileMissingError +from dvc.exceptions import OutputNotFoundError, PathMissingError from dvc.path_info import URLInfo from dvc.utils.fs import remove @@ -47,10 +46,10 @@ def test_get_url_external(erepo_dir, cloud): def test_get_url_requires_dvc(tmp_dir, scm): tmp_dir.scm_gen({"foo": "foo"}, commit="initial") - with pytest.raises(UrlNotDvcRepoError, match="not a DVC repository"): + with pytest.raises(OutputNotFoundError, match="with output 'foo'"): api.get_url("foo", repo=os.fspath(tmp_dir)) - with pytest.raises(UrlNotDvcRepoError): + with pytest.raises(OutputNotFoundError): api.get_url("foo", repo=f"file://{tmp_dir}") @@ -144,7 +143,7 @@ def test_missing(tmp_dir, dvc, remote): remove("foo") - with pytest.raises(FileMissingError): + with pytest.raises(PathMissingError, match="path 'foo' does not exist"): api.read("foo") @@ -164,12 +163,12 @@ def test_open_not_cached(dvc): dvc.run( single_stage=True, metrics_no_cache=[metric_file], - cmd=(f'python -c "{metric_code}"'), + cmd=f'python -c "{metric_code}"', ) with api.open(metric_file) as fd: assert fd.read() == metric_content os.remove(metric_file) - with pytest.raises(FileMissingError): + with pytest.raises(PathMissingError): api.read(metric_file)