From 24120164a93ba2c2016d8740a475d671da055f85 Mon Sep 17 00:00:00 2001 From: Saugat Pachhai Date: Tue, 21 Jan 2020 20:28:53 +0545 Subject: [PATCH 1/4] gc: change option to use single remove flags --- dvc/command/gc.py | 51 +++++++++++++++++++++-------------------------- dvc/repo/gc.py | 8 +++++--- 2 files changed, 28 insertions(+), 31 deletions(-) diff --git a/dvc/command/gc.py b/dvc/command/gc.py index 7cbffc537c..4aaf516423 100644 --- a/dvc/command/gc.py +++ b/dvc/command/gc.py @@ -9,19 +9,28 @@ logger = logging.getLogger(__name__) +supported_removal_filters = ("commits", "branches", "tags") + class CmdGC(CmdBase): def run(self): - msg = "This will remove all cache except items used in " + msg = "This will remove all cache except items used in " msg += "the working tree" - if self.args.all_commits: + + self.args.remove = self.args.remove or [] + + if "commits" not in self.args.remove: msg += " and all git commits" - elif self.args.all_branches and self.args.all_tags: + + if ( + "branches" not in self.args.remove + and "tags" not in self.args.remove + ): msg += " and all git branches and tags" - elif self.args.all_branches: + elif "branches" not in self.args.remove: msg += " and all git branches" - elif self.args.all_tags: + elif "tags" not in self.args.remove: msg += " and all git tags" if self.args.repos: @@ -39,9 +48,9 @@ def run(self): return 1 self.repo.gc( - all_branches=self.args.all_branches, - all_tags=self.args.all_tags, - all_commits=self.args.all_commits, + all_branches="branches" in self.args.remove, + all_tags="tags" in self.args.remove, + all_commits="commits" in self.args.remove, cloud=self.args.cloud, remote=self.args.remote, force=self.args.force, @@ -64,26 +73,6 @@ def add_parser(subparsers, parent_parser): help=GC_HELP, formatter_class=argparse.RawDescriptionHelpFormatter, ) - gc_parser.add_argument( - "-a", - "--all-branches", - action="store_true", - default=False, - help="Keep data files for the tips of all git branches.", - ) - gc_parser.add_argument( - "-T", - "--all-tags", - action="store_true", - default=False, - help="Keep data files for all git tags.", - ) - gc_parser.add_argument( - "--all-commits", - action="store_true", - default=False, - help=argparse.SUPPRESS, - ) gc_parser.add_argument( "-c", "--cloud", @@ -94,6 +83,12 @@ def add_parser(subparsers, parent_parser): gc_parser.add_argument( "-r", "--remote", help="Remote storage to collect garbage in." ) + gc_parser.add_argument( + "--remove", + nargs="*", + choices=supported_removal_filters, + help="Available choices for remove: %(choices)s", + ) gc_parser.add_argument( "-f", "--force", diff --git a/dvc/repo/gc.py b/dvc/repo/gc.py index e55a34dc3f..d732f44758 100644 --- a/dvc/repo/gc.py +++ b/dvc/repo/gc.py @@ -2,6 +2,7 @@ from . import locked from dvc.cache import NamedCache +from dvc.scm import NoSCM logger = logging.getLogger(__name__) @@ -41,12 +42,13 @@ def gc( used = NamedCache() for repo in all_repos + [self]: + has_scm = not isinstance(repo.scm, NoSCM) used.update( repo.used_cache( - all_branches=all_branches, + all_branches=not all_branches and has_scm, with_deps=with_deps, - all_tags=all_tags, - all_commits=all_commits, + all_tags=not all_tags and has_scm, + all_commits=not all_commits and has_scm, remote=remote, force=force, jobs=jobs, From b9838b76d836a2ef32c6ca1cf1701c60923cd371 Mon Sep 17 00:00:00 2001 From: Saugat Pachhai Date: Thu, 23 Jan 2020 10:14:46 +0545 Subject: [PATCH 2/4] fixup! brancher: do not get tree if there's no revs --- dvc/command/gc.py | 4 +--- dvc/repo/__init__.py | 2 ++ dvc/repo/brancher.py | 12 +++++++++++- dvc/repo/gc.py | 25 +++++++++++++++++-------- 4 files changed, 31 insertions(+), 12 deletions(-) diff --git a/dvc/command/gc.py b/dvc/command/gc.py index 4aaf516423..b43c86bf59 100644 --- a/dvc/command/gc.py +++ b/dvc/command/gc.py @@ -48,9 +48,7 @@ def run(self): return 1 self.repo.gc( - all_branches="branches" in self.args.remove, - all_tags="tags" in self.args.remove, - all_commits="commits" in self.args.remove, + remove=self.args.remove, cloud=self.args.cloud, remote=self.args.remote, force=self.args.force, diff --git a/dvc/repo/__init__.py b/dvc/repo/__init__.py index 9d0a506a87..1a2ee06df9 100644 --- a/dvc/repo/__init__.py +++ b/dvc/repo/__init__.py @@ -223,6 +223,7 @@ def used_cache( force=False, jobs=None, recursive=False, + exclude_revs=None, ): """Get the stages related to the given target and collect the `info` of its outputs. @@ -245,6 +246,7 @@ def used_cache( all_branches=all_branches, all_tags=all_tags, all_commits=all_commits, + exclude_revs=exclude_revs, ): targets = targets or [None] diff --git a/dvc/repo/brancher.py b/dvc/repo/brancher.py index 955b135135..bc83d98dc5 100644 --- a/dvc/repo/brancher.py +++ b/dvc/repo/brancher.py @@ -4,7 +4,12 @@ def brancher( # noqa: E302 - self, revs=None, all_branches=False, all_tags=False, all_commits=False + self, + revs=None, + all_branches=False, + all_tags=False, + all_commits=False, + exclude_revs=None, ): """Generator that iterates over specified revisions. @@ -26,6 +31,7 @@ def brancher( # noqa: E302 saved_tree = self.tree revs = revs or [] + exclude_revs = exclude_revs or [] scm = self.scm @@ -41,9 +47,13 @@ def brancher( # noqa: E302 if all_tags: revs.extend(scm.list_tags()) + excluded_shas = [scm.resolve_rev(rev) for rev in exclude_revs] + try: if revs: for sha, names in group_by(scm.resolve_rev, revs).items(): + if sha in excluded_shas: + continue self.tree = scm.get_tree(sha) yield ", ".join(names) finally: diff --git a/dvc/repo/gc.py b/dvc/repo/gc.py index d732f44758..9c57d419ae 100644 --- a/dvc/repo/gc.py +++ b/dvc/repo/gc.py @@ -2,7 +2,6 @@ from . import locked from dvc.cache import NamedCache -from dvc.scm import NoSCM logger = logging.getLogger(__name__) @@ -17,12 +16,10 @@ def _do_gc(typ, func, clist): @locked def gc( self, - all_branches=False, + remove=None, cloud=False, remote=None, with_deps=False, - all_tags=False, - all_commits=False, force=False, jobs=None, repos=None, @@ -31,6 +28,7 @@ def gc( from dvc.repo import Repo all_repos = [] + remove = remove or [] if repos: all_repos = [Repo(path) for path in repos] @@ -42,16 +40,27 @@ def gc( used = NamedCache() for repo in all_repos + [self]: - has_scm = not isinstance(repo.scm, NoSCM) + exclude_revs = ( + repo.scm.list_branches() + if {"commits", "branches"} - set(remove) + else [] + ) + exclude_revs += ( + repo.scm.list_tags() + if {"tags", "commits"} - set(remove) + else [] + ) + used.update( repo.used_cache( - all_branches=not all_branches and has_scm, + all_branches="branches" not in remove, with_deps=with_deps, - all_tags=not all_tags and has_scm, - all_commits=not all_commits and has_scm, + all_tags="tags" not in remove, + all_commits="commits" not in remove, remote=remote, force=force, jobs=jobs, + exclude_revs=exclude_revs, ) ) From 13c3338c3a56f8ea7dd56c32a941f0100592d328 Mon Sep 17 00:00:00 2001 From: Saugat Pachhai Date: Thu, 23 Jan 2020 17:10:20 +0545 Subject: [PATCH 3/4] temp: try to exclude some hashes --- dvc/repo/gc.py | 17 ++++++----------- 1 file changed, 6 insertions(+), 11 deletions(-) diff --git a/dvc/repo/gc.py b/dvc/repo/gc.py index 9c57d419ae..58ba4b51dd 100644 --- a/dvc/repo/gc.py +++ b/dvc/repo/gc.py @@ -41,22 +41,17 @@ def gc( used = NamedCache() for repo in all_repos + [self]: exclude_revs = ( - repo.scm.list_branches() - if {"commits", "branches"} - set(remove) - else [] - ) - exclude_revs += ( - repo.scm.list_tags() - if {"tags", "commits"} - set(remove) - else [] + repo.scm.list_branches() if "branches" in remove else [] ) + exclude_revs += repo.scm.list_tags() if "tags" in remove else [] + all_commits = "commits" not in remove and not bool(exclude_revs) used.update( repo.used_cache( - all_branches="branches" not in remove, + all_branches="branches" not in remove and not all_commits, with_deps=with_deps, - all_tags="tags" not in remove, - all_commits="commits" not in remove, + all_tags="tags" not in remove and not all_commits, + all_commits=all_commits, remote=remote, force=force, jobs=jobs, From 3c07c4248ec4e7b12901f19afa571523048f7e7c Mon Sep 17 00:00:00 2001 From: Ruslan Kuprieiev Date: Sun, 26 Jan 2020 21:32:38 +0200 Subject: [PATCH 4/4] gc: rework implementation --- dvc/command/gc.py | 56 +++++++++++++++++++---------------- dvc/repo/__init__.py | 2 -- dvc/repo/brancher.py | 17 ++++------- dvc/repo/gc.py | 18 ++++------- dvc/scm/base.py | 4 ++- dvc/scm/git/__init__.py | 14 +++++++-- scripts/completion/dvc.bash | 2 +- scripts/completion/dvc.zsh | 4 +-- tests/func/test_gc.py | 18 ++++++++--- tests/unit/command/test_gc.py | 39 ++++++++++++++++++++++++ tests/unit/scm/test_git.py | 13 ++++++++ 11 files changed, 126 insertions(+), 61 deletions(-) create mode 100644 tests/unit/command/test_gc.py diff --git a/dvc/command/gc.py b/dvc/command/gc.py index b43c86bf59..9e7f84680a 100644 --- a/dvc/command/gc.py +++ b/dvc/command/gc.py @@ -9,29 +9,21 @@ logger = logging.getLogger(__name__) -supported_removal_filters = ("commits", "branches", "tags") - class CmdGC(CmdBase): def run(self): - - msg = "This will remove all cache except items used in " - msg += "the working tree" - - self.args.remove = self.args.remove or [] - - if "commits" not in self.args.remove: - msg += " and all git commits" - - if ( - "branches" not in self.args.remove - and "tags" not in self.args.remove - ): - msg += " and all git branches and tags" - elif "branches" not in self.args.remove: - msg += " and all git branches" - elif "tags" not in self.args.remove: - msg += " and all git tags" + msg = ( + "This will remove all cache except items used in the working tree" + "{tags}{history}{branches}{history}" + ).format( + tags="" if self.args.remove_all_tags else "and all git tags", + branches="" + if self.args.remove_all_branches + else "and all git branches", + history="" + if self.args.remove_all_history + else "and their history", + ) if self.args.repos: msg += " of the current and the following repos:" @@ -48,7 +40,9 @@ def run(self): return 1 self.repo.gc( - remove=self.args.remove, + remove_all_tags=self.args.remove_all_tags, + remove_all_branches=self.args.remove_all_branches, + remove_all_history=self.args.remove_all_history, cloud=self.args.cloud, remote=self.args.remote, force=self.args.force, @@ -82,10 +76,22 @@ def add_parser(subparsers, parent_parser): "-r", "--remote", help="Remote storage to collect garbage in." ) gc_parser.add_argument( - "--remove", - nargs="*", - choices=supported_removal_filters, - help="Available choices for remove: %(choices)s", + "--remove-all-tags", + action="store_true", + default=False, + help="Remove cache for all git tags.", + ) + gc_parser.add_argument( + "--remove-all-branches", + action="store_true", + default=False, + help="Remove cache for all git branches.", + ) + gc_parser.add_argument( + "--remove-all-history", + action="store_true", + default=False, + help="Remove cache for all history of all branches and tags.", ) gc_parser.add_argument( "-f", diff --git a/dvc/repo/__init__.py b/dvc/repo/__init__.py index 1a2ee06df9..9d0a506a87 100644 --- a/dvc/repo/__init__.py +++ b/dvc/repo/__init__.py @@ -223,7 +223,6 @@ def used_cache( force=False, jobs=None, recursive=False, - exclude_revs=None, ): """Get the stages related to the given target and collect the `info` of its outputs. @@ -246,7 +245,6 @@ def used_cache( all_branches=all_branches, all_tags=all_tags, all_commits=all_commits, - exclude_revs=exclude_revs, ): targets = targets or [None] diff --git a/dvc/repo/brancher.py b/dvc/repo/brancher.py index bc83d98dc5..5a95257e24 100644 --- a/dvc/repo/brancher.py +++ b/dvc/repo/brancher.py @@ -4,12 +4,7 @@ def brancher( # noqa: E302 - self, - revs=None, - all_branches=False, - all_tags=False, - all_commits=False, - exclude_revs=None, + self, revs=None, all_branches=False, all_tags=False, all_commits=False ): """Generator that iterates over specified revisions. @@ -31,7 +26,6 @@ def brancher( # noqa: E302 saved_tree = self.tree revs = revs or [] - exclude_revs = exclude_revs or [] scm = self.scm @@ -39,7 +33,10 @@ def brancher( # noqa: E302 yield "working tree" if all_commits: - revs = scm.list_all_commits() + revs = scm.list_all_commits( + exclude_all_tags=not all_tags, + exclude_all_branches=not all_branches, + ) else: if all_branches: revs.extend(scm.list_branches()) @@ -47,13 +44,9 @@ def brancher( # noqa: E302 if all_tags: revs.extend(scm.list_tags()) - excluded_shas = [scm.resolve_rev(rev) for rev in exclude_revs] - try: if revs: for sha, names in group_by(scm.resolve_rev, revs).items(): - if sha in excluded_shas: - continue self.tree = scm.get_tree(sha) yield ", ".join(names) finally: diff --git a/dvc/repo/gc.py b/dvc/repo/gc.py index 58ba4b51dd..fc38bf147a 100644 --- a/dvc/repo/gc.py +++ b/dvc/repo/gc.py @@ -16,7 +16,9 @@ def _do_gc(typ, func, clist): @locked def gc( self, - remove=None, + remove_all_tags=False, + remove_all_branches=False, + remove_all_history=False, cloud=False, remote=None, with_deps=False, @@ -28,7 +30,6 @@ def gc( from dvc.repo import Repo all_repos = [] - remove = remove or [] if repos: all_repos = [Repo(path) for path in repos] @@ -40,22 +41,15 @@ def gc( used = NamedCache() for repo in all_repos + [self]: - exclude_revs = ( - repo.scm.list_branches() if "branches" in remove else [] - ) - exclude_revs += repo.scm.list_tags() if "tags" in remove else [] - all_commits = "commits" not in remove and not bool(exclude_revs) - used.update( repo.used_cache( - all_branches="branches" not in remove and not all_commits, with_deps=with_deps, - all_tags="tags" not in remove and not all_commits, - all_commits=all_commits, + all_branches=not remove_all_branches, + all_tags=not remove_all_tags, + all_commits=not remove_all_history, remote=remote, force=force, jobs=jobs, - exclude_revs=exclude_revs, ) ) diff --git a/dvc/scm/base.py b/dvc/scm/base.py index 412731ca89..34c72aba52 100644 --- a/dvc/scm/base.py +++ b/dvc/scm/base.py @@ -115,7 +115,9 @@ def list_tags(self): # pylint: disable=no-self-use """Returns a list of available tags in the repo.""" return [] - def list_all_commits(self): # pylint: disable=no-self-use + def list_all_commits( + self, exclude_all_tags=False, exclude_all_branches=False + ): # pylint: disable=no-self-use """Returns a list of commits in the repo.""" return [] diff --git a/dvc/scm/git/__init__.py b/dvc/scm/git/__init__.py index 3872c8e72e..8ffc842a2f 100644 --- a/dvc/scm/git/__init__.py +++ b/dvc/scm/git/__init__.py @@ -244,8 +244,18 @@ def list_branches(self): def list_tags(self): return [t.name for t in self.repo.tags] - def list_all_commits(self): - return [c.hexsha for c in self.repo.iter_commits("--all")] + def list_all_commits( + self, exclude_all_tags=False, exclude_all_branches=False + ): + args = [] + if exclude_all_tags: + args.extend(["--exclude", "refs/tags/*"]) + if exclude_all_branches: + args.extend(["--exclude", "refs/heads/*"]) + # NOTE: order matters, `--exclude` needs to be before `--all` in order + # to affect it. + args.append("--all") + return self.repo.git.rev_list(*args).splitlines() def _install_hook(self, name, cmd): command = ( diff --git a/scripts/completion/dvc.bash b/scripts/completion/dvc.bash index 2af965a67d..5c9ea3c4ff 100644 --- a/scripts/completion/dvc.bash +++ b/scripts/completion/dvc.bash @@ -26,7 +26,7 @@ _dvc_diff='-t --target' _dvc_fetch='-j --jobs -r --remote -a --all-branches -T --all-tags -d --with-deps -R --recursive $(compgen -G *.dvc)' _dvc_get_url='' _dvc_get='-o --out --rev --show-url' -_dvc_gc='-a --all-branches -T --all-tags -c --cloud -r --remote -f --force -p --projects -j --jobs' +_dvc_gc='--remove-all-branches --remove-all-tags --remove-all-history -c --cloud -r --remote -f --force -p --projects -j --jobs' _dvc_import_url='-f --file' _dvc_import='-o --out --rev' _dvc_init='--no-scm -f --force' diff --git a/scripts/completion/dvc.zsh b/scripts/completion/dvc.zsh index 32bdef47ff..3b6911c450 100644 --- a/scripts/completion/dvc.zsh +++ b/scripts/completion/dvc.zsh @@ -124,8 +124,8 @@ _dvc_get=( ) _dvc_gc=( - {-a,--all-branches}"[Keep data files for the tips of all git branches.]" - {-T,--all-tags}"[Keep data files for all git tags.]" + "--remove-all-branches[Keep data files for the tips of all git branches.]" + "--remove-all-tags[Keep data files for all git tags.]" {-c,--cloud}"[Collect garbage in remote repository.]" {-r,--remote}"[Remote storage to collect garbage in.]:Remote repository:" {-f,--force}"[Force garbage collection - automatically agree to all prompts.]:Repos:_files" diff --git a/tests/func/test_gc.py b/tests/func/test_gc.py index 41c4f2204a..fd29ea8ef8 100644 --- a/tests/func/test_gc.py +++ b/tests/func/test_gc.py @@ -92,16 +92,26 @@ def test(self): self._check_cache(4) - self.dvc.gc(all_tags=True, all_branches=True) + self.dvc.gc() + self._check_cache(4) + self.dvc.gc(remove_all_history=True) self._check_cache(3) - self.dvc.gc(all_tags=False, all_branches=True) + self.dvc.gc(remove_all_tags=True) + self._check_cache(3) + self.dvc.gc(remove_all_branches=True) self._check_cache(2) - self.dvc.gc(all_tags=True, all_branches=False) + self.dvc.gc(remove_all_branches=True, remove_all_tags=True) + self._check_cache(2) + self.dvc.gc( + remove_all_branches=True, + remove_all_tags=True, + remove_all_history=True, + ) self._check_cache(1) @@ -183,7 +193,7 @@ def test_all_commits(tmp_dir, scm, dvc): tmp_dir.dvc_gen("testfile", "workspace") n = _count_files(dvc.cache.local.cache_dir) - dvc.gc(all_commits=True) + dvc.gc() # Only one uncommitted file should go away assert _count_files(dvc.cache.local.cache_dir) == n - 1 diff --git a/tests/unit/command/test_gc.py b/tests/unit/command/test_gc.py new file mode 100644 index 0000000000..a390c8f29b --- /dev/null +++ b/tests/unit/command/test_gc.py @@ -0,0 +1,39 @@ +from dvc.cli import parse_args +from dvc.command.gc import CmdGC + + +def test_gc(mocker): + cli_args = parse_args( + [ + "gc", + "--remove-all-tags", + "--remove-all-branches", + "--remove-all-history", + "--cloud", + "--remote", + "myremote", + "--force", + "--jobs", + "10", + "--projects", + "/some/path", + "/some/other/path", + ] + ) + assert cli_args.func == CmdGC + + cmd = cli_args.func(cli_args) + m = mocker.patch("dvc.repo.Repo.gc") + + assert cmd.run() == 0 + + m.assert_called_once_with( + remove_all_tags=True, + remove_all_branches=True, + remove_all_history=True, + cloud=True, + remote="myremote", + force=True, + jobs=10, + repos=["/some/path", "/some/other/path"], + ) diff --git a/tests/unit/scm/test_git.py b/tests/unit/scm/test_git.py index 2874da16b5..40f08dab4c 100644 --- a/tests/unit/scm/test_git.py +++ b/tests/unit/scm/test_git.py @@ -15,3 +15,16 @@ def test_belongs_to_scm_true_on_git_internal(self): def test_belongs_to_scm_false(self): path = os.path.join("some", "non-.git", "file") self.assertFalse(self.dvc.scm.belongs_to_scm(path)) + + +def test_list_all_commits_detached_head(tmp_dir, scm): + tmp_dir.scm_gen({"first": "first"}, commit="first") + tmp_dir.scm_gen({"second": "second"}, commit="second") + scm.branch("branch_second") + scm.checkout("branch_third", create_new=True) + tmp_dir.scm_gen({"third": "third"}, commit="third") + scm.checkout("branch_second") + assert len(scm.list_all_commits()) == 3 + # deleting the branch so that `third` commit gets lost + scm.repo.git.branch("branch_third", D=True) + assert len(scm.list_all_commits()) == 2