From 76e323e431cef76e2eda658b0ba2e26a279c428a Mon Sep 17 00:00:00 2001 From: joncrall Date: Mon, 13 May 2024 12:24:28 -0400 Subject: [PATCH 1/4] Explore skipping graph checks --- dvc/repo/__init__.py | 1 + dvc/repo/add.py | 4 ++++ 2 files changed, 5 insertions(+) diff --git a/dvc/repo/__init__.py b/dvc/repo/__init__.py index 8d67614e0a..ee9d5c2e64 100644 --- a/dvc/repo/__init__.py +++ b/dvc/repo/__init__.py @@ -284,6 +284,7 @@ def index(self) -> "Index": def check_graph( self, stages: Iterable["Stage"], callback: Optional[Callable] = None ) -> None: + self._skip_graph_checks = True if not getattr(self, "_skip_graph_checks", False): new = self.index.update(stages) if callable(callback): diff --git a/dvc/repo/add.py b/dvc/repo/add.py index 7cbacad793..9b4fc0f894 100644 --- a/dvc/repo/add.py +++ b/dvc/repo/add.py @@ -55,6 +55,10 @@ def get_or_create_stage( path, wdir, out = resolve_paths(repo, target, always_local=to_remote and not out) try: + # How best to disable this line? With Skip Graph Checks Flag? + repo._skip_graph_checks = True + if getattr(repo, "_skip_graph_checks", False): + print('todo: skip graph checks') (out_obj,) = repo.find_outs_by_path(target, strict=False) stage = out_obj.stage if not stage.is_data_source: From 2581ffce7c5b8fb0b1bca327d110939270f570c1 Mon Sep 17 00:00:00 2001 From: joncrall Date: Tue, 14 May 2024 15:24:17 -0400 Subject: [PATCH 2/4] Add skip_graph_checks as CLI argument --- dvc/commands/add.py | 11 +++++++++++ dvc/repo/__init__.py | 3 ++- dvc/repo/add.py | 38 +++++++++++++++++++++++++++++++++++--- 3 files changed, 48 insertions(+), 4 deletions(-) diff --git a/dvc/commands/add.py b/dvc/commands/add.py index 38146b30e1..471fea8716 100644 --- a/dvc/commands/add.py +++ b/dvc/commands/add.py @@ -52,6 +52,7 @@ def run(self): remote_jobs=self.args.remote_jobs, force=self.args.force, relink=self.args.relink, + skip_graph_checks=self.args.skip_graph_checks, ) except FileNotFoundError: logger.exception("") @@ -127,6 +128,16 @@ def add_parser(subparsers, parent_parser): help="Don't recreate links from cache to workspace.", ) parser.set_defaults(relink=True) + parser.add_argument( + # Do we want a short code here? + "--skip-graph-checks", + action="store_true", + help=( + "Can speed up simple add operations by avoiding graph checks. " + "Warning: partial or virtual will not work when enabled." + ), + ) + parser.add_argument( "targets", nargs="+", help="Input files/directories to add." ).complete = completion.FILE diff --git a/dvc/repo/__init__.py b/dvc/repo/__init__.py index ee9d5c2e64..2f5b70e886 100644 --- a/dvc/repo/__init__.py +++ b/dvc/repo/__init__.py @@ -284,12 +284,13 @@ def index(self) -> "Index": def check_graph( self, stages: Iterable["Stage"], callback: Optional[Callable] = None ) -> None: - self._skip_graph_checks = True if not getattr(self, "_skip_graph_checks", False): new = self.index.update(stages) if callable(callback): callback() new.check_graph() + else: + logger.warning("partial or virtual add does not work when --skip-graph-checks are enabled") @staticmethod def open(url: Optional[str], *args, **kwargs) -> "Repo": diff --git a/dvc/repo/add.py b/dvc/repo/add.py index 9b4fc0f894..3cc6b13e7f 100644 --- a/dvc/repo/add.py +++ b/dvc/repo/add.py @@ -56,9 +56,14 @@ def get_or_create_stage( try: # How best to disable this line? With Skip Graph Checks Flag? - repo._skip_graph_checks = True + # repo._skip_graph_checks = True if getattr(repo, "_skip_graph_checks", False): - print('todo: skip graph checks') + print("WARNING: partial or virtual add does not work when --skip-graph-checks are enabled") + # FIXME: this probably is not the correct implementation. when + # skip_graph_checks is enabled, we just want to avoid touching the + # graph. The output might already exist and need to be updated. + raise OutputNotFoundError(path) + (out_obj,) = repo.find_outs_by_path(target, strict=False) stage = out_obj.stage if not stage.is_data_source: @@ -191,6 +196,30 @@ def _add( stage.dump() +class _contextual_setattr: + """ + Sets an attribute on an object within the context and then restores it. + """ + def __init__(self, obj, attr_name, attr_value): + self.obj = obj + self.attr_name = attr_name + self.attr_value = attr_value + self._prev_value = None + self._had_prev_value = None + + def __enter__(self): + self._had_prev_value = hasattr(self.obj, self.attr_name) + if self._had_prev_value: + self._prev_value = getattr(self.obj, self.attr_name) + setattr(self.obj, self.attr_name, self.attr_value) + + def __exit__(self, ex_type, ex_value, ex_traceback): + if self._had_prev_value: + setattr(self.obj, self.attr_name, self._prev_value) + else: + delattr(self.obj, self.attr_name) + + @locked @scm_context def add( @@ -204,6 +233,7 @@ def add( remote_jobs: Optional[int] = None, force: bool = False, relink: bool = True, + skip_graph_checks: bool = False, ) -> list["Stage"]: add_targets = find_targets(targets, glob=glob) if not add_targets: @@ -220,9 +250,11 @@ def add( for target in add_targets } + attr_context = _contextual_setattr( + repo, "_skip_graph_checks", skip_graph_checks) stages = [stage for stage, _ in stages_with_targets.values()] msg = "Collecting stages from the workspace" - with translate_graph_error(stages), ui.status(msg) as st: + with attr_context, translate_graph_error(stages), ui.status(msg) as st: repo.check_graph(stages=stages, callback=lambda: st.update("Checking graph")) if to_remote or out: From a81b659c58edd2630473448d404dfcc79547d8b5 Mon Sep 17 00:00:00 2001 From: joncrall Date: Tue, 14 May 2024 19:16:29 -0400 Subject: [PATCH 3/4] Add temporary changes / notes --- dvc/repo/add.py | 46 +++++++++++++++++++++++++++++++++---------- dvc/stage/__init__.py | 7 +++++++ dvc/utils/__init__.py | 9 +++++++++ 3 files changed, 52 insertions(+), 10 deletions(-) diff --git a/dvc/repo/add.py b/dvc/repo/add.py index 3cc6b13e7f..3503e09780 100644 --- a/dvc/repo/add.py +++ b/dvc/repo/add.py @@ -20,6 +20,7 @@ from dvc.repo import Repo from dvc.stage import Stage from dvc.types import StrOrBytesPath + from dvc.output import Output class StageInfo(NamedTuple): @@ -50,6 +51,21 @@ def get_or_create_stage( to_remote: bool = False, force: bool = False, ) -> StageInfo: + """ + Adds a new tracked file or update an existing one. + + Used in the context of dvc-add. + + Args: + target : an expression that resolves to a ... + out : if specified, what does this to? + to_remote : if True, what does this to? + force : what does this to? + """ + + import xdev + xdev.embed() + if out: target = resolve_output(target, out, force=force) path, wdir, out = resolve_paths(repo, target, always_local=to_remote and not out) @@ -64,6 +80,7 @@ def get_or_create_stage( # graph. The output might already exist and need to be updated. raise OutputNotFoundError(path) + out_obj: Output (out_obj,) = repo.find_outs_by_path(target, strict=False) stage = out_obj.stage if not stage.is_data_source: @@ -239,23 +256,32 @@ def add( if not add_targets: return [] - stages_with_targets = { - target: get_or_create_stage( - repo, - target, - out=out, - to_remote=to_remote, - force=force, - ) - for target in add_targets - } + print('ABOUT TO GET OR CREATE STAGE') + attr_context = _contextual_setattr( + repo, "_skip_graph_checks", skip_graph_checks) + with attr_context: + stages_with_targets = { + target: get_or_create_stage( + repo, + target, + out=out, + to_remote=to_remote, + force=force, + ) + for target in add_targets + } + print(f'stages_with_targets={stages_with_targets}') + print('FINISHED GET OR CREATE STAGE') attr_context = _contextual_setattr( repo, "_skip_graph_checks", skip_graph_checks) stages = [stage for stage, _ in stages_with_targets.values()] msg = "Collecting stages from the workspace" + print('ABOUT TO ENTER CHECK THE GRAPH CONTEXT') with attr_context, translate_graph_error(stages), ui.status(msg) as st: + print('ABOUT TO CHECK THE GRAPH') repo.check_graph(stages=stages, callback=lambda: st.update("Checking graph")) + print('FINISHED CHECK THE GRAPH CONTEXT') if to_remote or out: assert len(stages_with_targets) == 1, "multiple targets are unsupported" diff --git a/dvc/stage/__init__.py b/dvc/stage/__init__.py index b9deed4646..730beb1073 100644 --- a/dvc/stage/__init__.py +++ b/dvc/stage/__init__.py @@ -144,6 +144,13 @@ def __init__( # noqa: PLR0913 desc: Optional[str] = None, meta=None, ): + """ + A stage represents a dvc file? + + Attributes: + path (str): the absolute path to a .dvc file + outs (List[Output]): the "outs" associated with this .dvc file + """ if deps is None: deps = [] if outs is None: diff --git a/dvc/utils/__init__.py b/dvc/utils/__init__.py index a43d11ac79..e95340d7d5 100644 --- a/dvc/utils/__init__.py +++ b/dvc/utils/__init__.py @@ -267,6 +267,15 @@ def resolve_output(inp: str, out: Optional[str], force=False) -> str: def resolve_paths(repo, out, always_local=False): + """ + Get the path to a DVC file that corresponds to a specific "out" in the repo. + + Returns: + Tuple[str, std, str]: + path - the path to the .dvc file + wdir - the directory containing the .dvc file + out - the name of the tracked file relative to wdir. + """ from urllib.parse import urlparse from dvc.dvcfile import DVC_FILE_SUFFIX From 38aceec7a7a8f3fe572365a4e106bcdf9fe3d6ee Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Tue, 14 May 2024 23:19:31 +0000 Subject: [PATCH 4/4] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- dvc/repo/__init__.py | 4 +++- dvc/repo/add.py | 26 ++++++++++++++------------ 2 files changed, 17 insertions(+), 13 deletions(-) diff --git a/dvc/repo/__init__.py b/dvc/repo/__init__.py index 2f5b70e886..54c3352209 100644 --- a/dvc/repo/__init__.py +++ b/dvc/repo/__init__.py @@ -290,7 +290,9 @@ def check_graph( callback() new.check_graph() else: - logger.warning("partial or virtual add does not work when --skip-graph-checks are enabled") + logger.warning( + "partial or virtual add does not work when --skip-graph-checks are enabled" + ) @staticmethod def open(url: Optional[str], *args, **kwargs) -> "Repo": diff --git a/dvc/repo/add.py b/dvc/repo/add.py index 3503e09780..871adcd008 100644 --- a/dvc/repo/add.py +++ b/dvc/repo/add.py @@ -17,10 +17,10 @@ from . import locked if TYPE_CHECKING: + from dvc.output import Output from dvc.repo import Repo from dvc.stage import Stage from dvc.types import StrOrBytesPath - from dvc.output import Output class StageInfo(NamedTuple): @@ -64,6 +64,7 @@ def get_or_create_stage( """ import xdev + xdev.embed() if out: @@ -74,7 +75,9 @@ def get_or_create_stage( # How best to disable this line? With Skip Graph Checks Flag? # repo._skip_graph_checks = True if getattr(repo, "_skip_graph_checks", False): - print("WARNING: partial or virtual add does not work when --skip-graph-checks are enabled") + print( + "WARNING: partial or virtual add does not work when --skip-graph-checks are enabled" + ) # FIXME: this probably is not the correct implementation. when # skip_graph_checks is enabled, we just want to avoid touching the # graph. The output might already exist and need to be updated. @@ -217,6 +220,7 @@ class _contextual_setattr: """ Sets an attribute on an object within the context and then restores it. """ + def __init__(self, obj, attr_name, attr_value): self.obj = obj self.attr_name = attr_name @@ -256,9 +260,8 @@ def add( if not add_targets: return [] - print('ABOUT TO GET OR CREATE STAGE') - attr_context = _contextual_setattr( - repo, "_skip_graph_checks", skip_graph_checks) + print("ABOUT TO GET OR CREATE STAGE") + attr_context = _contextual_setattr(repo, "_skip_graph_checks", skip_graph_checks) with attr_context: stages_with_targets = { target: get_or_create_stage( @@ -270,18 +273,17 @@ def add( ) for target in add_targets } - print(f'stages_with_targets={stages_with_targets}') - print('FINISHED GET OR CREATE STAGE') + print(f"stages_with_targets={stages_with_targets}") + print("FINISHED GET OR CREATE STAGE") - attr_context = _contextual_setattr( - repo, "_skip_graph_checks", skip_graph_checks) + attr_context = _contextual_setattr(repo, "_skip_graph_checks", skip_graph_checks) stages = [stage for stage, _ in stages_with_targets.values()] msg = "Collecting stages from the workspace" - print('ABOUT TO ENTER CHECK THE GRAPH CONTEXT') + print("ABOUT TO ENTER CHECK THE GRAPH CONTEXT") with attr_context, translate_graph_error(stages), ui.status(msg) as st: - print('ABOUT TO CHECK THE GRAPH') + print("ABOUT TO CHECK THE GRAPH") repo.check_graph(stages=stages, callback=lambda: st.update("Checking graph")) - print('FINISHED CHECK THE GRAPH CONTEXT') + print("FINISHED CHECK THE GRAPH CONTEXT") if to_remote or out: assert len(stages_with_targets) == 1, "multiple targets are unsupported"