From a69d0ea8fceacac2be734f8548b97dc09fe76cc6 Mon Sep 17 00:00:00 2001 From: Peter Rowlands Date: Mon, 20 Jul 2020 14:36:15 +0900 Subject: [PATCH 1/7] erepo: use shallow clones when rev is branch or tag name --- dvc/external_repo.py | 40 +++++++++++++++++++++++++++++----------- dvc/scm/git.py | 15 ++++++++++++--- 2 files changed, 41 insertions(+), 14 deletions(-) diff --git a/dvc/external_repo.py b/dvc/external_repo.py index 42df57c691..e8b3be5e74 100644 --- a/dvc/external_repo.py +++ b/dvc/external_repo.py @@ -19,6 +19,7 @@ from dvc.path_info import PathInfo from dvc.repo import Repo from dvc.repo.tree import RepoTree +from dvc.scm.base import CloneError from dvc.scm.git import Git from dvc.tree.local import LocalTree from dvc.utils.fs import remove @@ -59,7 +60,7 @@ def external_repo(url, rev=None, for_write=False): def clean_repos(): # Outside code should not see cache while we are removing - paths = list(CLONES.values()) + list(CACHE_DIRS.values()) + paths = [path for path, _ in CLONES.values()] + list(CACHE_DIRS.values()) CLONES.clear() CACHE_DIRS.clear() @@ -251,10 +252,10 @@ def _cached_clone(url, rev, for_write=False): """ # even if we have already cloned this repo, we may need to # fetch/fast-forward to get specified rev - clone_path = _clone_default_branch(url, rev) + clone_path, shallow = _clone_default_branch(url, rev, for_write=for_write) if not for_write and (url) in CLONES: - return CLONES[url] + return CLONES[url][0] # Copy to a new dir to keep the clone clean repo_path = tempfile.mkdtemp("dvc-erepo") @@ -265,17 +266,17 @@ def _cached_clone(url, rev, for_write=False): if for_write: _git_checkout(repo_path, rev) else: - CLONES[url] = repo_path + CLONES[url] = (repo_path, shallow) return repo_path @wrap_with(threading.Lock()) -def _clone_default_branch(url, rev): +def _clone_default_branch(url, rev, for_write=False): """Get or create a clean clone of the url. The cloned is reactualized with git pull unless rev is a known sha. """ - clone_path = CLONES.get(url) + clone_path, shallow = CLONES.get(url, (None, False)) git = None try: @@ -283,18 +284,35 @@ def _clone_default_branch(url, rev): git = Git(clone_path) # Do not pull for known shas, branches and tags might move if not Git.is_sha(rev) or not git.has_rev(rev): - logger.debug("erepo: git pull %s", url) + if shallow: + logger.debug("erepo: unshallowing clone for '%s'", url) + git.repo.git.fetch(unshallow=True) + shallow = False + CLONES[url] = (clone_path, shallow) + logger.debug("erepo: git pull '%s'", url) git.pull() else: - logger.debug("erepo: git clone %s to a temporary dir", url) + logger.debug("erepo: git clone '%s' to a temporary dir", url) clone_path = tempfile.mkdtemp("dvc-clone") - git = Git.clone(url, clone_path) - CLONES[url] = clone_path + if not for_write and rev and not Git.is_sha(rev): + # If rev is a tag or branch name try shallow clone first + try: + git = Git.clone(url, clone_path, shallow_branch=rev) + shallow = True + logger.debug( + "erepo: using shallow clone for branch '%s'", rev + ) + except CloneError: + pass + if not git: + git = Git.clone(url, clone_path) + shallow = False + CLONES[url] = (clone_path, shallow) finally: if git: git.close() - return clone_path + return clone_path, shallow def _git_checkout(repo_path, rev): diff --git a/dvc/scm/git.py b/dvc/scm/git.py index 30b5a4b495..b9854c9b93 100644 --- a/dvc/scm/git.py +++ b/dvc/scm/git.py @@ -5,7 +5,7 @@ import shlex import yaml -from funcy import cached_property +from funcy import cached_property, partial from pathspec.patterns import GitWildMatchPattern from dvc.exceptions import GitHookAlreadyExistsError @@ -92,7 +92,7 @@ def root_dir(self): return self.repo.working_tree_dir @staticmethod - def clone(url, to_path, rev=None): + def clone(url, to_path, rev=None, shallow_branch=None): import git ld_key = "LD_LIBRARY_PATH" @@ -109,14 +109,23 @@ def clone(url, to_path, rev=None): env[ld_key] = "" try: + if shallow_branch is not None and os.path.exists(url): + # git disables --depth for local clones unless file:// url + # scheme is used + url = f"file://{url}" with TqdmGit(desc="Cloning", unit="obj") as pbar: - tmp_repo = git.Repo.clone_from( + clone_from = partial( + git.Repo.clone_from, url, to_path, env=env, # needed before we can fix it in __init__ no_single_branch=True, progress=pbar.update_git, ) + if shallow_branch is None: + tmp_repo = clone_from() + else: + tmp_repo = clone_from(branch=shallow_branch, depth=1) tmp_repo.close() except git.exc.GitCommandError as exc: # pylint: disable=no-member raise CloneError(url, to_path) from exc From 20c293c51d65efb6a6560390b06406185f0b42e8 Mon Sep 17 00:00:00 2001 From: Peter Rowlands Date: Mon, 20 Jul 2020 16:04:03 +0900 Subject: [PATCH 2/7] tests: test erepo shallow clone behavior --- tests/func/test_external_repo.py | 56 ++++++++++++++++++++++++++++++-- 1 file changed, 54 insertions(+), 2 deletions(-) diff --git a/tests/func/test_external_repo.py b/tests/func/test_external_repo.py index d8c96fbcd8..253274a682 100644 --- a/tests/func/test_external_repo.py +++ b/tests/func/test_external_repo.py @@ -1,8 +1,8 @@ import os -from mock import patch +from mock import ANY, patch -from dvc.external_repo import external_repo +from dvc.external_repo import CLONES, external_repo from dvc.path_info import PathInfo from dvc.scm.git import Git from dvc.tree.local import LocalTree @@ -121,3 +121,55 @@ def test_relative_remote(erepo_dir, tmp_dir): assert os.path.isdir(repo.config["remote"]["upstream"]["url"]) with repo.open_by_relpath("file") as fd: assert fd.read() == "contents" + + +def test_shallow_clone_branch(erepo_dir): + with erepo_dir.chdir(): + with erepo_dir.branch("branch", new=True): + erepo_dir.dvc_gen("file", "branch", commit="create file on branch") + erepo_dir.dvc_gen("file", "master", commit="create file on master") + + url = os.fspath(erepo_dir) + + with patch.object(Git, "clone", wraps=Git.clone) as mock_clone: + with external_repo(url, rev="branch") as repo: + with repo.open_by_relpath("file") as fd: + assert fd.read() == "branch" + + mock_clone.assert_called_with(url, ANY, shallow_branch="branch") + _, shallow = CLONES[url] + assert shallow + + with external_repo(url, rev="master") as repo: + with repo.open_by_relpath("file") as fd: + assert fd.read() == "master" + + assert mock_clone.call_count == 1 + _, shallow = CLONES[url] + assert not shallow + + +def test_shallow_clone_tag(erepo_dir): + with erepo_dir.chdir(): + erepo_dir.dvc_gen("file", "foo", commit="init") + erepo_dir.scm.tag("v1") + erepo_dir.dvc_gen("file", "bar", commit="update file") + + url = os.fspath(erepo_dir) + + with patch.object(Git, "clone", wraps=Git.clone) as mock_clone: + with external_repo(url, rev="v1") as repo: + with repo.open_by_relpath("file") as fd: + assert fd.read() == "foo" + + mock_clone.assert_called_with(url, ANY, shallow_branch="v1") + _, shallow = CLONES[url] + assert shallow + + with external_repo(url, rev="master") as repo: + with repo.open_by_relpath("file") as fd: + assert fd.read() == "bar" + + assert mock_clone.call_count == 1 + _, shallow = CLONES[url] + assert not shallow From 7e282089a0c31b34a4344285d727723defc2229b Mon Sep 17 00:00:00 2001 From: Peter Rowlands Date: Mon, 20 Jul 2020 16:04:32 +0900 Subject: [PATCH 3/7] erepo: fix detached head after shallow cloning a tag --- dvc/external_repo.py | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/dvc/external_repo.py b/dvc/external_repo.py index e8b3be5e74..013d706a10 100644 --- a/dvc/external_repo.py +++ b/dvc/external_repo.py @@ -289,6 +289,16 @@ def _clone_default_branch(url, rev, for_write=False): git.repo.git.fetch(unshallow=True) shallow = False CLONES[url] = (clone_path, shallow) + if git.repo.head.is_detached: + # If this is a detached head (i.e. we shallow + # cloned a tag) checkout the default branch + origin_refs = git.repo.remotes["origin"].refs + ref = origin_refs["HEAD"].reference + branch_name = ref.name.split("/")[-1] + branch = git.repo.create_head(branch_name, ref) + branch.set_tracking_branch(ref) + branch.checkout() + logger.debug("erepo: git pull '%s'", url) git.pull() else: From ffdd55640f9a438b54409e9ca39379d5e11fb77a Mon Sep 17 00:00:00 2001 From: Peter Rowlands Date: Mon, 20 Jul 2020 16:25:22 +0900 Subject: [PATCH 4/7] move unshallow logic into its own function --- dvc/external_repo.py | 25 ++++++++++++++----------- 1 file changed, 14 insertions(+), 11 deletions(-) diff --git a/dvc/external_repo.py b/dvc/external_repo.py index 013d706a10..2803fcccf4 100644 --- a/dvc/external_repo.py +++ b/dvc/external_repo.py @@ -270,6 +270,19 @@ def _cached_clone(url, rev, for_write=False): return repo_path +def _unshallow(git): + git.repo.git.fetch(unshallow=True) + if git.repo.head.is_detached: + # If this is a detached head (i.e. we shallow cloned a tag) switch to + # the default branch + origin_refs = git.repo.remotes["origin"].refs + ref = origin_refs["HEAD"].reference + branch_name = ref.name.split("/")[-1] + branch = git.repo.create_head(branch_name, ref) + branch.set_tracking_branch(ref) + branch.checkout() + + @wrap_with(threading.Lock()) def _clone_default_branch(url, rev, for_write=False): """Get or create a clean clone of the url. @@ -286,19 +299,9 @@ def _clone_default_branch(url, rev, for_write=False): if not Git.is_sha(rev) or not git.has_rev(rev): if shallow: logger.debug("erepo: unshallowing clone for '%s'", url) - git.repo.git.fetch(unshallow=True) + _unshallow(git) shallow = False CLONES[url] = (clone_path, shallow) - if git.repo.head.is_detached: - # If this is a detached head (i.e. we shallow - # cloned a tag) checkout the default branch - origin_refs = git.repo.remotes["origin"].refs - ref = origin_refs["HEAD"].reference - branch_name = ref.name.split("/")[-1] - branch = git.repo.create_head(branch_name, ref) - branch.set_tracking_branch(ref) - branch.checkout() - logger.debug("erepo: git pull '%s'", url) git.pull() else: From 84e5f96cb638624f2a923a96c5c68b9c52b25fe0 Mon Sep 17 00:00:00 2001 From: Peter Rowlands Date: Mon, 20 Jul 2020 16:42:14 +0900 Subject: [PATCH 5/7] erepo: default rev should be origin HEAD not local HEAD --- dvc/external_repo.py | 2 +- tests/func/test_external_repo.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/dvc/external_repo.py b/dvc/external_repo.py index 2803fcccf4..57971943af 100644 --- a/dvc/external_repo.py +++ b/dvc/external_repo.py @@ -32,7 +32,7 @@ def external_repo(url, rev=None, for_write=False): logger.debug("Creating external repo %s@%s", url, rev) path = _cached_clone(url, rev, for_write=for_write) if not rev: - rev = "HEAD" + rev = "refs/remotes/origin/HEAD" try: repo = ExternalRepo(path, url, rev, for_write=for_write) except NotDvcRepoError: diff --git a/tests/func/test_external_repo.py b/tests/func/test_external_repo.py index 253274a682..50b142dd48 100644 --- a/tests/func/test_external_repo.py +++ b/tests/func/test_external_repo.py @@ -140,7 +140,7 @@ def test_shallow_clone_branch(erepo_dir): _, shallow = CLONES[url] assert shallow - with external_repo(url, rev="master") as repo: + with external_repo(url) as repo: with repo.open_by_relpath("file") as fd: assert fd.read() == "master" From e2844f1ae790502debd5af0dfe137a8071145659 Mon Sep 17 00:00:00 2001 From: Peter Rowlands Date: Mon, 20 Jul 2020 17:38:01 +0900 Subject: [PATCH 6/7] use functools.partial --- dvc/scm/git.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/dvc/scm/git.py b/dvc/scm/git.py index b9854c9b93..1766933ff2 100644 --- a/dvc/scm/git.py +++ b/dvc/scm/git.py @@ -3,9 +3,10 @@ import logging import os import shlex +from functools import partial import yaml -from funcy import cached_property, partial +from funcy import cached_property from pathspec.patterns import GitWildMatchPattern from dvc.exceptions import GitHookAlreadyExistsError From bc3948f9d63edacc59ad94465d7846788722c495 Mon Sep 17 00:00:00 2001 From: Peter Rowlands Date: Mon, 20 Jul 2020 21:17:18 +0900 Subject: [PATCH 7/7] review fixes * comment shallow clone behavior in erepo * don't duplicate unnecessary fetch/pull call --- dvc/external_repo.py | 40 +++++++++++++++++++++++++--------------- dvc/scm/git.py | 4 ++-- 2 files changed, 27 insertions(+), 17 deletions(-) diff --git a/dvc/external_repo.py b/dvc/external_repo.py index 57971943af..15332e7de1 100644 --- a/dvc/external_repo.py +++ b/dvc/external_repo.py @@ -32,6 +32,9 @@ def external_repo(url, rev=None, for_write=False): logger.debug("Creating external repo %s@%s", url, rev) path = _cached_clone(url, rev, for_write=for_write) if not rev: + # Local HEAD points to the tip of whatever branch we first cloned from + # (which may not be the default branch), use origin/HEAD here to get + # the tip of the default branch rev = "refs/remotes/origin/HEAD" try: repo = ExternalRepo(path, url, rev, for_write=for_write) @@ -270,19 +273,6 @@ def _cached_clone(url, rev, for_write=False): return repo_path -def _unshallow(git): - git.repo.git.fetch(unshallow=True) - if git.repo.head.is_detached: - # If this is a detached head (i.e. we shallow cloned a tag) switch to - # the default branch - origin_refs = git.repo.remotes["origin"].refs - ref = origin_refs["HEAD"].reference - branch_name = ref.name.split("/")[-1] - branch = git.repo.create_head(branch_name, ref) - branch.set_tracking_branch(ref) - branch.checkout() - - @wrap_with(threading.Lock()) def _clone_default_branch(url, rev, for_write=False): """Get or create a clean clone of the url. @@ -298,12 +288,19 @@ def _clone_default_branch(url, rev, for_write=False): # Do not pull for known shas, branches and tags might move if not Git.is_sha(rev) or not git.has_rev(rev): if shallow: + # If we are missing a rev in a shallow clone, fallback to + # a full (unshallowed) clone. Since fetching specific rev + # SHAs is only available in certain git versions, if we + # have need to reference multiple specific revs for a + # given repo URL it is easier/safer for us to work with + # full clones in this case. logger.debug("erepo: unshallowing clone for '%s'", url) _unshallow(git) shallow = False CLONES[url] = (clone_path, shallow) - logger.debug("erepo: git pull '%s'", url) - git.pull() + else: + logger.debug("erepo: git pull '%s'", url) + git.pull() else: logger.debug("erepo: git clone '%s' to a temporary dir", url) clone_path = tempfile.mkdtemp("dvc-clone") @@ -328,6 +325,19 @@ def _clone_default_branch(url, rev, for_write=False): return clone_path, shallow +def _unshallow(git): + if git.repo.head.is_detached: + # If this is a detached head (i.e. we shallow cloned a tag) switch to + # the default branch + origin_refs = git.repo.remotes["origin"].refs + ref = origin_refs["HEAD"].reference + branch_name = ref.name.split("/")[-1] + branch = git.repo.create_head(branch_name, ref) + branch.set_tracking_branch(ref) + branch.checkout() + git.pull(unshallow=True) + + def _git_checkout(repo_path, rev): logger.debug("erepo: git checkout %s@%s", repo_path, rev) git = Git(repo_path) diff --git a/dvc/scm/git.py b/dvc/scm/git.py index 1766933ff2..12b0b198cf 100644 --- a/dvc/scm/git.py +++ b/dvc/scm/git.py @@ -260,8 +260,8 @@ def checkout(self, branch, create_new=False): else: self.repo.git.checkout(branch) - def pull(self): - infos = self.repo.remote().pull() + def pull(self, **kwargs): + infos = self.repo.remote().pull(**kwargs) for info in infos: if info.flags & info.ERROR: raise SCMError(f"pull failed: {info.note}")