From d43bed73023899f32acadd8c19653f64f7a55e7c Mon Sep 17 00:00:00 2001 From: dberenbaum Date: Mon, 11 Dec 2023 18:16:18 -0500 Subject: [PATCH 1/2] gc: add --skip-failed --- dvc/commands/gc.py | 7 +++++++ dvc/exceptions.py | 12 +++++++++++ dvc/repo/__init__.py | 38 +++++++++++++++++++++++------------ dvc/repo/gc.py | 2 ++ tests/func/test_gc.py | 12 ++++++++++- tests/unit/command/test_gc.py | 2 ++ 6 files changed, 59 insertions(+), 14 deletions(-) diff --git a/dvc/commands/gc.py b/dvc/commands/gc.py index 55984fe565..9bb0f8ded0 100644 --- a/dvc/commands/gc.py +++ b/dvc/commands/gc.py @@ -86,6 +86,7 @@ def run(self): # noqa: C901, PLR0912 num=self.args.num, not_in_remote=self.args.not_in_remote, dry=self.args.dry, + skip_failed=self.args.skip_failed, ) return 0 @@ -188,6 +189,12 @@ def add_parser(subparsers, parent_parser): help="Remote storage to collect garbage in", metavar="", ) + gc_parser.add_argument( + "--skip-failed", + action="store_true", + default=False, + help="Skip revisions that fail when collected.", + ) gc_parser.add_argument( "-f", "--force", diff --git a/dvc/exceptions.py b/dvc/exceptions.py index cc74310eb1..c79ee09b45 100644 --- a/dvc/exceptions.py +++ b/dvc/exceptions.py @@ -357,3 +357,15 @@ def __init__( desc = f" @ {stage or version}" if (stage or version) else "" super().__init__(f"Unable to find artifact '{name}{desc}'") + + +class RevCollectionError(DvcException): + """Thrown if a revision failed to be collected. + + Args: + rev (str): revision that failed (or "workspace"). + """ + + def __init__(self, rev): + self.rev = rev + super().__init__(f"Failed to collect '{rev}'") diff --git a/dvc/repo/__init__.py b/dvc/repo/__init__.py index 804d5a25b3..e715f5c461 100644 --- a/dvc/repo/__init__.py +++ b/dvc/repo/__init__.py @@ -12,7 +12,12 @@ Union, ) -from dvc.exceptions import NotDvcRepoError, OutputNotFoundError +from dvc.exceptions import ( + DvcException, + NotDvcRepoError, + OutputNotFoundError, + RevCollectionError, +) from dvc.ignore import DvcIgnoreFilter from dvc.log import logger from dvc.utils.objects import cached_property @@ -487,6 +492,7 @@ def used_objs( # noqa: PLR0913 revs=None, num=1, push: bool = False, + skip_failed: bool = False, ): """Get the stages related to the given target and collect the `info` of its outputs. @@ -505,7 +511,7 @@ def used_objs( # noqa: PLR0913 """ used = defaultdict(set) - for _ in self.brancher( + for rev in self.brancher( revs=revs, all_branches=all_branches, all_tags=all_tags, @@ -514,17 +520,23 @@ def used_objs( # noqa: PLR0913 commit_date=commit_date, num=num, ): - for odb, objs in self.index.used_objs( - targets, - remote=remote, - force=force, - jobs=jobs, - recursive=recursive, - with_deps=with_deps, - push=push, - ).items(): - used[odb].update(objs) - + try: + for odb, objs in self.index.used_objs( + targets, + remote=remote, + force=force, + jobs=jobs, + recursive=recursive, + with_deps=with_deps, + push=push, + ).items(): + used[odb].update(objs) + except DvcException as exc: + rev = rev or "workspace" + if skip_failed: + logger.warning("Failed to collect '%s', skipping", rev) + else: + raise RevCollectionError(rev) from exc if used_run_cache: for odb, objs in self.stage_cache.get_used_objs( used_run_cache, remote=remote, force=force, jobs=jobs diff --git a/dvc/repo/gc.py b/dvc/repo/gc.py index 5b1f9fc4f0..682d05a3f5 100644 --- a/dvc/repo/gc.py +++ b/dvc/repo/gc.py @@ -68,6 +68,7 @@ def gc( # noqa: PLR0913, C901 num: Optional[int] = None, not_in_remote: bool = False, dry: bool = False, + skip_failed: bool = False, ): # require `workspace` to be true to come into effect. # assume `workspace` to be enabled if any of `all_tags`, `all_commits`, @@ -113,6 +114,7 @@ def gc( # noqa: PLR0913, C901 jobs=jobs, revs=[rev] if rev else None, num=num or 1, + skip_failed=skip_failed, ).items(): if odb not in odb_to_obj_ids: odb_to_obj_ids[odb] = set() diff --git a/tests/func/test_gc.py b/tests/func/test_gc.py index f1e28465ac..b7cf26f5ba 100644 --- a/tests/func/test_gc.py +++ b/tests/func/test_gc.py @@ -7,7 +7,7 @@ import pytest from dvc.cli import main -from dvc.exceptions import CollectCacheError, InvalidArgumentError +from dvc.exceptions import CollectCacheError, InvalidArgumentError, RevCollectionError from dvc.fs import LocalFileSystem from dvc.utils.fs import remove from dvc_data.hashfile.db.local import LocalHashFileDB @@ -439,3 +439,13 @@ def test_gc_logging(caplog, dvc, good_and_bad_cache): assert "Removed 3 objects from repo cache." in caplog.text assert "No unused 'local' cache to remove." in caplog.text assert "No unused 'legacy' cache to remove." in caplog.text + + +def test_gc_skip_failed(tmp_dir, dvc): + with open("dvc.yaml", mode="w") as f: + f.write("\ninvalid") + + with pytest.raises(RevCollectionError): + dvc.gc(force=True, workspace=True) + + dvc.gc(force=True, workspace=True, skip_failed=True) diff --git a/tests/unit/command/test_gc.py b/tests/unit/command/test_gc.py index 5df8327674..98102a5802 100644 --- a/tests/unit/command/test_gc.py +++ b/tests/unit/command/test_gc.py @@ -26,6 +26,7 @@ def test_(dvc, scm, mocker): "--projects", "project1", "project2", + "--skip-failed", ] ) assert cli_args.func == CmdGC @@ -51,6 +52,7 @@ def test_(dvc, scm, mocker): num=None, not_in_remote=False, dry=True, + skip_failed=True, ) cli_args = parse_args(["gc"]) From 78b02b8493f4aaec18f2f8fa920323c241d02007 Mon Sep 17 00:00:00 2001 From: dberenbaum Date: Mon, 11 Dec 2023 19:53:28 -0500 Subject: [PATCH 2/2] gc: fix test_gc_no_dir_cache --- tests/func/test_gc.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tests/func/test_gc.py b/tests/func/test_gc.py index b7cf26f5ba..af42c0a1ac 100644 --- a/tests/func/test_gc.py +++ b/tests/func/test_gc.py @@ -111,8 +111,9 @@ def test_gc_no_dir_cache(tmp_dir, dvc): remove(dir_stage.outs[0].cache_path) - with pytest.raises(CollectCacheError): + with pytest.raises(RevCollectionError) as exc: dvc.gc(workspace=True) + assert type(exc.value.__cause__) == CollectCacheError assert _count_files(dvc.cache.local.path) == 4 dvc.gc(force=True, workspace=True)