From 7cd8510207357858f2d87e57132badb406569ba2 Mon Sep 17 00:00:00 2001 From: Peter Rowlands Date: Wed, 21 Apr 2021 18:58:52 +0900 Subject: [PATCH 1/5] stage,commit: add support for only committing data stages from experiments --- dvc/repo/commit.py | 24 ++++++++++++++++++------ dvc/stage/__init__.py | 12 +++++++++--- 2 files changed, 27 insertions(+), 9 deletions(-) diff --git a/dvc/repo/commit.py b/dvc/repo/commit.py index a0405b61f0..d58b4775a2 100644 --- a/dvc/repo/commit.py +++ b/dvc/repo/commit.py @@ -37,20 +37,32 @@ def prompt_to_commit(stage, changes, force=False): @locked def commit( - self, target, with_deps=False, recursive=False, force=False, + self, + target, + with_deps=False, + recursive=False, + force=False, + allow_missing=False, + data_only=False, ): from dvc.dvcfile import Dvcfile - stages_info = self.stage.collect_granular( - target, with_deps=with_deps, recursive=recursive - ) + stages_info = [ + info + for info in self.stage.collect_granular( + target, with_deps=with_deps, recursive=recursive + ) + if not data_only or info.stage.is_data_source + ] for stage_info in stages_info: stage = stage_info.stage changes = stage.changed_entries() if any(changes): prompt_to_commit(stage, changes, force=force) - stage.save() - stage.commit(filter_info=stage_info.filter_info) + stage.save(allow_missing=allow_missing) + stage.commit( + filter_info=stage_info.filter_info, allow_missing=allow_missing + ) Dvcfile(self, stage.path).dump(stage, update_pipeline=False) return [s.stage for s in stages_info] diff --git a/dvc/stage/__init__.py b/dvc/stage/__init__.py index bbbed8beb4..9f60813490 100644 --- a/dvc/stage/__init__.py +++ b/dvc/stage/__init__.py @@ -433,15 +433,21 @@ def compute_md5(self): return m def save(self, allow_missing=False): - self.save_deps() + self.save_deps(allow_missing=allow_missing) self.save_outs(allow_missing=allow_missing) self.md5 = self.compute_md5() self.repo.stage_cache.save(self) - def save_deps(self): + def save_deps(self, allow_missing=False): + from dvc.dependency.base import DependencyDoesNotExistError + for dep in self.deps: - dep.save() + try: + dep.save() + except DependencyDoesNotExistError: + if not allow_missing: + raise def save_outs(self, allow_missing=False): from dvc.output.base import OutputDoesNotExistError From 692eca792d3a477c304afd90a73fbedd730c24c8 Mon Sep 17 00:00:00 2001 From: Peter Rowlands Date: Wed, 21 Apr 2021 18:59:29 +0900 Subject: [PATCH 2/5] exp run: `dvc commit` data deps when stashing exp changes --- dvc/repo/experiments/__init__.py | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) diff --git a/dvc/repo/experiments/__init__.py b/dvc/repo/experiments/__init__.py index 08451ed41c..af23926181 100644 --- a/dvc/repo/experiments/__init__.py +++ b/dvc/repo/experiments/__init__.py @@ -186,6 +186,10 @@ def _stash_exp( if params: self._update_params(params) + # DVC commit data deps to preserve state across workspace + # & tempdir runs + self._stash_commit_deps(*args, **kwargs) + if resume_rev: if branch: branch_name = ExpRefInfo.from_ref(branch).name @@ -242,6 +246,25 @@ def _stash_exp( return stash_rev + def _stash_commit_deps(self, *args, **kwargs): + if len(args): + targets = args[0] + else: + targets = kwargs.get("targets") + if targets is None: + targets = [None] + elif isinstance(targets, str): + targets = [targets] + for target in targets: + self.repo.commit( + target, + with_deps=True, + recursive=kwargs.get("recursive", False), + force=True, + allow_missing=True, + data_only=True, + ) + def _stash_msg( self, rev: str, From 06cb2939d80430aab5f76a73d9b7aee8cc96fd91 Mon Sep 17 00:00:00 2001 From: Peter Rowlands Date: Wed, 21 Apr 2021 19:00:26 +0900 Subject: [PATCH 3/5] add test for modified DVC-tracked data when generating experiment --- tests/func/experiments/test_experiments.py | 44 ++++++++++++++++++++++ 1 file changed, 44 insertions(+) diff --git a/tests/func/experiments/test_experiments.py b/tests/func/experiments/test_experiments.py index 9e3d6bac5c..e88f41f090 100644 --- a/tests/func/experiments/test_experiments.py +++ b/tests/func/experiments/test_experiments.py @@ -624,3 +624,47 @@ def test_fix_exp_head(tmp_dir, scm, tail): scm.set_ref(EXEC_BASELINE, "refs/heads/master") assert EXEC_BASELINE + tail == fix_exp_head(scm, head) assert "foo" + tail == fix_exp_head(scm, "foo" + tail) + + +@pytest.mark.parametrize("workspace", [True, False]) +def test_modified_data_dep(tmp_dir, scm, dvc, workspace): + tmp_dir.dvc_gen("data", "data") + tmp_dir.gen("copy.py", COPY_SCRIPT) + tmp_dir.gen("params.yaml", "foo: 1") + exp_stage = dvc.run( + cmd="python copy.py params.yaml metrics.yaml", + metrics_no_cache=["metrics.yaml"], + params=["foo"], + name="copy-file", + deps=["copy.py", "data"], + ) + scm.add( + [ + "dvc.yaml", + "dvc.lock", + "copy.py", + "params.yaml", + "metrics.yaml", + "data.dvc", + ".gitignore", + ] + ) + scm.commit("init") + + tmp_dir.gen("params.yaml", "foo: 2") + tmp_dir.gen("data", "modified") + + results = dvc.experiments.run(exp_stage.addressing, tmp_dir=not workspace) + exp = first(results) + + for rev in dvc.brancher(revs=[exp]): + if rev != exp: + continue + with dvc.repo_fs.open(tmp_dir / "metrics.yaml") as fobj: + assert fobj.read().strip() == "foo: 2" + with dvc.repo_fs.open(tmp_dir / "data") as fobj: + assert fobj.read().strip() == "modified" + + if workspace: + assert (tmp_dir / "metrics.yaml").read_text().strip() == "foo: 2" + assert (tmp_dir / "data").read_text().strip() == "modified" From a4439e3f95025a8c0892046dcd9ca9091a253732 Mon Sep 17 00:00:00 2001 From: Peter Rowlands Date: Thu, 22 Apr 2021 13:28:09 +0900 Subject: [PATCH 4/5] test with and without params change --- tests/func/experiments/test_experiments.py | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/tests/func/experiments/test_experiments.py b/tests/func/experiments/test_experiments.py index e88f41f090..6fcc3179f5 100644 --- a/tests/func/experiments/test_experiments.py +++ b/tests/func/experiments/test_experiments.py @@ -626,8 +626,11 @@ def test_fix_exp_head(tmp_dir, scm, tail): assert "foo" + tail == fix_exp_head(scm, "foo" + tail) -@pytest.mark.parametrize("workspace", [True, False]) -def test_modified_data_dep(tmp_dir, scm, dvc, workspace): +@pytest.mark.parametrize( + "workspace, params", + [(True, "foo: 1"), (True, "foo: 2"), (False, "foo: 1"), (False, "foo: 2")], +) +def test_modified_data_dep(tmp_dir, scm, dvc, workspace, params): tmp_dir.dvc_gen("data", "data") tmp_dir.gen("copy.py", COPY_SCRIPT) tmp_dir.gen("params.yaml", "foo: 1") @@ -651,7 +654,7 @@ def test_modified_data_dep(tmp_dir, scm, dvc, workspace): ) scm.commit("init") - tmp_dir.gen("params.yaml", "foo: 2") + tmp_dir.gen("params.yaml", params) tmp_dir.gen("data", "modified") results = dvc.experiments.run(exp_stage.addressing, tmp_dir=not workspace) @@ -661,10 +664,10 @@ def test_modified_data_dep(tmp_dir, scm, dvc, workspace): if rev != exp: continue with dvc.repo_fs.open(tmp_dir / "metrics.yaml") as fobj: - assert fobj.read().strip() == "foo: 2" + assert fobj.read().strip() == params with dvc.repo_fs.open(tmp_dir / "data") as fobj: assert fobj.read().strip() == "modified" if workspace: - assert (tmp_dir / "metrics.yaml").read_text().strip() == "foo: 2" + assert (tmp_dir / "metrics.yaml").read_text().strip() == params assert (tmp_dir / "data").read_text().strip() == "modified" From 2c63880db7d38a7178373b857c63eaa78c6d7e44 Mon Sep 17 00:00:00 2001 From: Peter Rowlands Date: Fri, 23 Apr 2021 08:32:12 +0900 Subject: [PATCH 5/5] handle run without target --- dvc/repo/experiments/__init__.py | 6 +++--- tests/func/experiments/test_experiments.py | 11 +++++++---- 2 files changed, 10 insertions(+), 7 deletions(-) diff --git a/dvc/repo/experiments/__init__.py b/dvc/repo/experiments/__init__.py index af23926181..cf84180c4e 100644 --- a/dvc/repo/experiments/__init__.py +++ b/dvc/repo/experiments/__init__.py @@ -251,10 +251,10 @@ def _stash_commit_deps(self, *args, **kwargs): targets = args[0] else: targets = kwargs.get("targets") - if targets is None: - targets = [None] - elif isinstance(targets, str): + if isinstance(targets, str): targets = [targets] + elif not targets: + targets = [None] for target in targets: self.repo.commit( target, diff --git a/tests/func/experiments/test_experiments.py b/tests/func/experiments/test_experiments.py index 6fcc3179f5..6b9dfa5a56 100644 --- a/tests/func/experiments/test_experiments.py +++ b/tests/func/experiments/test_experiments.py @@ -1,3 +1,4 @@ +import itertools import logging import os import stat @@ -627,10 +628,10 @@ def test_fix_exp_head(tmp_dir, scm, tail): @pytest.mark.parametrize( - "workspace, params", - [(True, "foo: 1"), (True, "foo: 2"), (False, "foo: 1"), (False, "foo: 2")], + "workspace, params, target", + itertools.product((True, False), ("foo: 1", "foo: 2"), (True, False)), ) -def test_modified_data_dep(tmp_dir, scm, dvc, workspace, params): +def test_modified_data_dep(tmp_dir, scm, dvc, workspace, params, target): tmp_dir.dvc_gen("data", "data") tmp_dir.gen("copy.py", COPY_SCRIPT) tmp_dir.gen("params.yaml", "foo: 1") @@ -657,7 +658,9 @@ def test_modified_data_dep(tmp_dir, scm, dvc, workspace, params): tmp_dir.gen("params.yaml", params) tmp_dir.gen("data", "modified") - results = dvc.experiments.run(exp_stage.addressing, tmp_dir=not workspace) + results = dvc.experiments.run( + exp_stage.addressing if target else None, tmp_dir=not workspace + ) exp = first(results) for rev in dvc.brancher(revs=[exp]):