Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
50 changes: 32 additions & 18 deletions dvc/api.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,16 +2,17 @@
from contextlib import _GeneratorContextManager as GCM
from contextlib import contextmanager

from dvc.exceptions import DvcException, NotDvcRepoError
from funcy import reraise

from dvc.exceptions import (
NotDvcRepoError,
OutputNotFoundError,
PathMissingError,
)
from dvc.external_repo import external_repo
from dvc.path_info import PathInfo
from dvc.repo import Repo


class UrlNotDvcRepoError(DvcException):
"""Thrown if the given URL is not a DVC repository."""

def __init__(self, url):
super().__init__(f"'{url}' is not a DVC repository.")
from dvc.tree.repo import RepoTree


def get_url(path, repo=None, rev=None, remote=None):
Expand All @@ -20,17 +21,26 @@ def get_url(path, repo=None, rev=None, remote=None):
in a DVC repo. For Git repos, HEAD is used unless a rev argument is
supplied. The default remote is tried unless a remote argument is supplied.

Raises UrlNotDvcRepoError if repo is not a DVC project.
Raises OutputNotFoundError if the file is not a dvc-tracked file.

NOTE: This function does not check for the actual existence of the file or
directory in the remote storage.
"""
with _make_repo(repo, rev=rev) as _repo:
if not isinstance(_repo, Repo):
raise UrlNotDvcRepoError(_repo.url) # pylint: disable=no-member
out = _repo.find_out_by_relpath(path)
remote_obj = _repo.cloud.get_remote(remote)
return str(remote_obj.tree.hash_to_path_info(out.checksum))
tree = RepoTree(_repo, fetch=True)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
tree = RepoTree(_repo, fetch=True)
tree = DvcTree(_repo, fetch=True)

Copy link
Collaborator Author

@skshetry skshetry Aug 24, 2020

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

DvcTree is an internal API, and does not support subrepos.

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Okay, I think I get why you are suggesting that. As DvcTree does not support subrepos, we need to implement RepoTree.get_dvctree(path) or RepoTree.get_repo(repo_path), as get_hash also hashes git-files.

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Or, get_hash(dvc_only=True)? Thoughts?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yeah, I suggest it because this is not meant to work with git files. Not a big fan of dvc_only=True though, but we could simply do:

if not tree.isdvc(path):
    raise ....
hash = tree.get_hash() 

🙂

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Theoretically we could've made the hash part of the metadata, but that might make it complicated for files inside dirs, as you'll need to parse the dir_cache (as we do in _get_granular_checksum, which is not that bad).

Copy link
Contributor

@efiop efiop Aug 24, 2020

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Btw, this will automatically close #3182 for files in the directory 🙂

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Okay, did as you recommended, but metadata/hash/RepoTree all get mixed together. It feels like this logic could be a part of RepoTree itself.

path_info = PathInfo(_repo.root_dir) / path

meta = tree.metadata(path_info)
exc = OutputNotFoundError(path)
if not meta.is_dvc:
raise exc

with reraise(FileNotFoundError, exc):
_, hash_ = tree.get_hash(PathInfo(_repo.root_dir) / path)

assert hash_ and meta.repo
cloud = meta.repo.cloud
return cloud.get_url_for(remote, checksum=hash_)


def open( # noqa, pylint: disable=redefined-builtin
Expand Down Expand Up @@ -74,10 +84,14 @@ def __getattr__(self, name):

def _open(path, repo=None, rev=None, remote=None, mode="r", encoding=None):
with _make_repo(repo, rev=rev) as _repo:
with _repo.open_by_relpath(
path, remote=remote, mode=mode, encoding=encoding
) as fd:
yield fd
tree = RepoTree(_repo, stream=True, fetch=True)

path = PathInfo(_repo.root_dir) / path
with reraise(FileNotFoundError, PathMissingError(path, repo)):
with tree.open(
path, remote=remote, mode=mode, encoding=encoding
) as fd:
yield fd
Comment on lines +87 to +94
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Doesn't look like there is a need for this change. Repo.open_by_relpath already does all of this.

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

open_by_relpath will most likely go away, as RepoTree supports subrepos directly.

Copy link
Contributor

@efiop efiop Aug 24, 2020

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@skshetry Agreed, it was only used for the API, so we could delete it in that case.

open_by_relpath also has some weird exceptions that need to be double checked. Maybe not worth messing with this right now, your call.

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The exceptions do not look good, I have to admit: Repo and ExternalRepo threw different exceptions before, but now I am throwing PathMissingError. Unified, but quite verbose.



def read(path, repo=None, rev=None, remote=None, mode="r", encoding=None):
Expand Down
8 changes: 8 additions & 0 deletions dvc/data_cloud.py
Original file line number Diff line number Diff line change
Expand Up @@ -116,3 +116,11 @@ def status(self, cache, jobs=None, remote=None, show_checksums=False):
return self.repo.cache.local.status(
cache, jobs=jobs, remote=remote, show_checksums=show_checksums
)

def get_url_for(self, remote=None, checksum=None):
    """Return a URL inside the storage tree of the selected remote.

    The remote is looked up with ``self.get_remote(remote)``. When
    ``checksum`` is truthy, the URL of the path that the remote's tree
    maps that checksum to is returned; otherwise the URL of the tree's
    own ``path_info`` is returned.
    """
    storage_tree = self.get_remote(remote).tree

    # A falsy checksum (None or "") means "no specific object".
    if not checksum:
        return storage_tree.path_info.url
    return storage_tree.hash_to_path_info(checksum).url
21 changes: 9 additions & 12 deletions dvc/exceptions.py
Original file line number Diff line number Diff line change
Expand Up @@ -291,19 +291,16 @@ def __init__(self, code, reason):


class PathMissingError(DvcException):
default_msg = (
"The path '{}' does not exist in the target repository '{}'"
" neither as a DVC output nor as a Git-tracked file."
)
default_msg_dvc_only = (
"The path '{}' does not exist in the target repository '{}'"
" as an DVC output."
)

def __init__(self, path, repo, dvc_only=False):
msg = self.default_msg if not dvc_only else self.default_msg_dvc_only
super().__init__(msg.format(path, repo))
def __init__(self, path, repo=None, dvc_only=False):
msg = f"The path '{path}' does not exist in the"
msg += " repository" if not repo else f" target repository '{repo}'"
msg += " as a DVC output"
msg = f"{msg}." if dvc_only else f"{msg} nor as a Git-tracked file."

self.dvc_only = dvc_only
self.repo = repo
self.path = path
super().__init__(msg)


class RemoteCacheRequiredError(DvcException):
Expand Down
6 changes: 5 additions & 1 deletion dvc/tree/_metadata.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,12 @@
from dataclasses import dataclass, field
from typing import List
from typing import TYPE_CHECKING, List, Optional

from dvc.output import BaseOutput
from dvc.path_info import PathInfo

if TYPE_CHECKING:
from dvc.repo import Repo


@dataclass
class Metadata:
Expand All @@ -30,6 +33,7 @@ class Metadata:
isdir: bool = False # is it a directory?
is_exec: bool = False # is it an executable?
outs: List[BaseOutput] = field(default_factory=list) # list of outputs
repo: Optional["Repo"] = None # the repo path falls in

def __post_init__(self):
self.output_exists = bool(self.outs)
Expand Down
5 changes: 3 additions & 2 deletions dvc/tree/dvc.py
Original file line number Diff line number Diff line change
Expand Up @@ -45,7 +45,8 @@ def _get_granular_checksum(self, path, out, remote=None):
assert isinstance(path, PathInfo)
if not self.fetch and not self.stream:
raise FileNotFoundError
dir_cache = out.get_dir_cache(remote=remote)
with self.repo.state:
dir_cache = out.get_dir_cache(remote=remote)
for entry in dir_cache:
entry_relpath = entry[out.tree.PARAM_RELPATH]
if os.name == "nt":
Expand Down Expand Up @@ -240,6 +241,6 @@ def metadata(self, path_info):
path_info = PathInfo(os.path.abspath(path_info))
outs = self._find_outs(path_info, strict=False, recursive=True)

meta = Metadata(path_info=path_info, outs=outs)
meta = Metadata(path_info=path_info, outs=outs, repo=self.repo)
meta.isdir = meta.isdir or self.check_isdir(meta.path_info, meta.outs)
return meta
13 changes: 6 additions & 7 deletions tests/func/test_api.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,8 +3,7 @@
import pytest

from dvc import api
from dvc.api import UrlNotDvcRepoError
from dvc.exceptions import FileMissingError
from dvc.exceptions import OutputNotFoundError, PathMissingError
from dvc.path_info import URLInfo
from dvc.utils.fs import remove

Expand Down Expand Up @@ -47,10 +46,10 @@ def test_get_url_external(erepo_dir, cloud):
def test_get_url_requires_dvc(tmp_dir, scm):
tmp_dir.scm_gen({"foo": "foo"}, commit="initial")

with pytest.raises(UrlNotDvcRepoError, match="not a DVC repository"):
with pytest.raises(OutputNotFoundError, match="with output 'foo'"):
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is changing the API, we shouldn't do it in this PR if we can avoid it.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Plus OutputNotFoundError is an internal thing, that shouldn't be exposed to api users.

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We do have to break things here, as it's not precise enough. OutputNotFoundError is the more accurate term here.

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Also, we already threw OutputNotFoundError. It was just not documented. The following threw OutputNotFoundError.

https://github.com/iterative/dvc/blob/334556f07dc511927543218d4a2a1a1c1c83ed65/dvc/api.py#L31

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@skshetry I think that one was caught by https://github.com/iterative/dvc/blob/master/dvc/external_repo.py#L51 and re-raised properly.

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Ahh missed that. So, we could raise the same exc here?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@skshetry But this particular test doesn't catch that one, but rather UrlNotDvcRepoError and it should stay that way. The issue is that we are using RepoTree instead of DvcTree right now, which works for git-only repos, when it shouldn't. Could do something simple like if not tree.dvc_tree: raise UrlNotDvcRepoError though, if subrepos are a concern.

api.get_url("foo", repo=os.fspath(tmp_dir))

with pytest.raises(UrlNotDvcRepoError):
with pytest.raises(OutputNotFoundError):
api.get_url("foo", repo=f"file://{tmp_dir}")


Expand Down Expand Up @@ -144,7 +143,7 @@ def test_missing(tmp_dir, dvc, remote):

remove("foo")

with pytest.raises(FileMissingError):
with pytest.raises(PathMissingError, match="path 'foo' does not exist"):
api.read("foo")


Expand All @@ -164,12 +163,12 @@ def test_open_not_cached(dvc):
dvc.run(
single_stage=True,
metrics_no_cache=[metric_file],
cmd=(f'python -c "{metric_code}"'),
cmd=f'python -c "{metric_code}"',
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

?

)

with api.open(metric_file) as fd:
assert fd.read() == metric_content

os.remove(metric_file)
with pytest.raises(FileMissingError):
with pytest.raises(PathMissingError):
api.read(metric_file)