From a5d039e79ca860b2b52f4172b853be323a81489d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Saugat=20Pachhai=20=28=E0=A4=B8=E0=A5=8C=E0=A4=97=E0=A4=BE?= =?UTF-8?q?=E0=A4=A4=29?= Date: Sat, 5 Dec 2020 09:14:09 +0545 Subject: [PATCH] Run graph checks on `collect`/`find_outs_by_path` We try to optimize `tree.exists` calls and probably few others in that they either look directly into the workspace or, to the cache without running graph checks. It does not seem to be possible just to run graph checks on `find_outs_by_path` due to those optimizations. So, that's why, the `collect` also does a graph check for this reason. Fixes #5027 Fixes #4010 --- dvc/repo/__init__.py | 4 +-- dvc/repo/collect.py | 2 +- tests/func/metrics/test_common.py | 47 ++++++++++++++++++++++++++++++- tests/unit/repo/test_repo.py | 11 ++++++++ 4 files changed, 60 insertions(+), 4 deletions(-) diff --git a/dvc/repo/__init__.py b/dvc/repo/__init__.py index 19b1e4f505..bd8b0d92bb 100644 --- a/dvc/repo/__init__.py +++ b/dvc/repo/__init__.py @@ -375,8 +375,8 @@ def stages(self): return self.stage.collect_repo(onerror=error_handler) def find_outs_by_path(self, path, outs=None, recursive=False, strict=True): - if not outs: - outs = [out for stage in self.stages for out in stage.outs] + # using `outs_graph` to ensure graph checks are run + outs = outs or self.outs_graph abs_path = os.path.abspath(path) path_info = PathInfo(abs_path) diff --git a/dvc/repo/collect.py b/dvc/repo/collect.py index 78ee4e1aa2..32dd924738 100644 --- a/dvc/repo/collect.py +++ b/dvc/repo/collect.py @@ -21,7 +21,7 @@ def _collect_outs( ) -> Outputs: outs = { out - for stage in repo.stages + for stage in repo.graph # using `graph` to ensure graph checks run for out in (stage.deps if deps else stage.outs) } return set(filter(output_filter, outs)) if output_filter else outs diff --git a/tests/func/metrics/test_common.py b/tests/func/metrics/test_common.py index 2ba3b6e54b..1a3d3735c8 100644 --- a/tests/func/metrics/test_common.py +++ b/tests/func/metrics/test_common.py @@ -2,7 +2,14 @@ import pytest -from dvc.exceptions import NoMetricsFoundError, NoMetricsParsedError +from dvc.exceptions import ( + NoMetricsFoundError, + NoMetricsParsedError, + OverlappingOutputPathsError, +) +from dvc.path_info import PathInfo +from dvc.utils.fs import remove +from dvc.utils.serialize import dump_yaml, modify_yaml from tests.func.metrics.utils import _write_json @@ -61,3 +68,41 @@ def test_show_malformed_metric( def test_show_no_metrics_files(tmp_dir, dvc, caplog, show): with pytest.raises(NoMetricsFoundError): show(dvc) + + +@pytest.mark.parametrize("clear_before_run", [True, False]) +@pytest.mark.parametrize("typ", ["metrics", "plots"]) +def test_metrics_show_overlap( + tmp_dir, dvc, run_copy_metrics, clear_before_run, typ +): + data_dir = PathInfo("data") + (tmp_dir / data_dir).mkdir() + + outs = {typ: [str(data_dir / "m1.yaml")]} + dump_yaml(data_dir / "m1_temp.yaml", {"a": {"b": {"c": 2, "d": 1}}}) + run_copy_metrics( + str(data_dir / "m1_temp.yaml"), + str(data_dir / "m1.yaml"), + single_stage=False, + commit=f"add m1 {typ}", + name="cp-m1", + **outs, + ) + with modify_yaml("dvc.yaml") as d: + # trying to make an output overlaps error + d["stages"]["corrupted-stage"] = { + "cmd": "mkdir data", + "outs": ["data"], + } + + # running by clearing and not clearing stuffs + # so as it works even for optimized cases + if clear_before_run: + remove(data_dir) + remove(dvc.cache.local.cache_dir) + + dvc._reset() + + show = dvc.metrics.show if typ == "metrics" else dvc.plots.show + with pytest.raises(OverlappingOutputPathsError): + show() diff --git a/tests/unit/repo/test_repo.py b/tests/unit/repo/test_repo.py index 272d5b78e2..5b9b6f5367 100644 --- a/tests/unit/repo/test_repo.py +++ b/tests/unit/repo/test_repo.py @@ -1,7 +1,9 @@ import os +import shutil import pytest +from dvc.exceptions import OutputDuplicationError from dvc.repo import NotDvcRepoError, Repo, locked from dvc.utils.fs import remove @@ -29,6 +31,15 @@ def test_find_outs_by_path(tmp_dir, dvc, path): assert outs[0].path_info == stage.outs[0].path_info +def test_find_outs_by_path_does_graph_checks(tmp_dir, dvc): + tmp_dir.dvc_gen("foo", "foo") + shutil.copyfile("foo.dvc", "foo-2.dvc") + + dvc._reset() + with pytest.raises(OutputDuplicationError): + dvc.find_outs_by_path("foo") + + @pytest.mark.parametrize( "path", [os.path.join("dir", "subdir", "file"), os.path.join("dir", "subdir")],