From 58c9e5557ced5ae7e6c9c0a1643d84ea8e07ddcb Mon Sep 17 00:00:00 2001 From: Alexander Schepanovski Date: Mon, 11 Nov 2019 13:41:50 +0100 Subject: [PATCH 1/2] perf: optimize yaml parsing for stages --- dvc/stage.py | 33 ++++++++++++++++++++------------- dvc/utils/stage.py | 19 ++++++++++++++++--- setup.py | 1 + 3 files changed, 37 insertions(+), 16 deletions(-) diff --git a/dvc/stage.py b/dvc/stage.py index d408019dc9..32e8204bfb 100644 --- a/dvc/stage.py +++ b/dvc/stage.py @@ -1,6 +1,5 @@ from __future__ import unicode_literals -import copy import logging import os import re @@ -27,7 +26,8 @@ from dvc.utils.compat import str from dvc.utils.fs import contains_symlink_up_to from dvc.utils.stage import dump_stage_file -from dvc.utils.stage import load_stage_fd +from dvc.utils.stage import parse_stage +from dvc.utils.stage import parse_stage_for_update logger = logging.getLogger(__name__) @@ -170,8 +170,8 @@ def __init__( md5=None, locked=False, tag=None, - state=None, always_changed=False, + stage_text=None, ): if deps is None: deps = [] @@ -188,7 +188,7 @@ def __init__( self.locked = locked self.tag = tag self.always_changed = always_changed - self._state = state or {} + self._stage_text = stage_text def __repr__(self): return "Stage: '{path}'".format( @@ -613,10 +613,8 @@ def load(repo, fname): Stage._check_isfile(repo, fname) with repo.tree.open(fname) as fd: - d = load_stage_fd(fd, fname) - # Making a deepcopy since the original structure - # looses keys in deps and outs load - state = copy.deepcopy(d) + stage_text = fd.read() + d = parse_stage(stage_text, fname) Stage.validate(d, fname=relpath(fname)) path = os.path.abspath(fname) @@ -634,7 +632,8 @@ def load(repo, fname): locked=d.get(Stage.PARAM_LOCKED, False), tag=tag, always_changed=d.get(Stage.PARAM_ALWAYS_CHANGED, False), - state=state, + # We store stage text to apply updates to the same structure + stage_text=stage_text, ) stage.deps = dependency.loadd_from(stage, d.get(Stage.PARAM_DEPS, [])) @@ -657,7 +656,6 @@ def dumpd(self): Stage.PARAM_LOCKED: self.locked, Stage.PARAM_DEPS: [d.dumpd() for d in self.deps], Stage.PARAM_OUTS: [o.dumpd() for o in self.outs], - Stage.PARAM_META: self._state.get("meta"), Stage.PARAM_ALWAYS_CHANGED: self.always_changed, }.items() if value @@ -671,9 +669,18 @@ def dump(self): logger.debug( "Saving information to '{file}'.".format(file=relpath(fname)) ) - d = self.dumpd() - apply_diff(d, self._state) - dump_stage_file(fname, self._state) + state = self.dumpd() + + # If we have stage file we apply diffs instead of overwriting it + # to preserve comments, string formatting and meta key. + if self._stage_text is not None: + saved_state = parse_stage_for_update(self._stage_text, fname) + if "meta" in saved_state: + state["meta"] = saved_state["meta"] + apply_diff(state, saved_state) + state = saved_state + + dump_stage_file(fname, state) self.repo.scm.track_file(relpath(fname)) diff --git a/dvc/utils/stage.py b/dvc/utils/stage.py index 7d5eb8116d..2d3f31618a 100644 --- a/dvc/utils/stage.py +++ b/dvc/utils/stage.py @@ -1,19 +1,32 @@ +import yaml from ruamel.yaml import YAML from ruamel.yaml.error import YAMLError +try: + from yaml import CSafeLoader as SafeLoader +except ImportError: + from yaml import SafeLoader + from dvc.exceptions import StageFileCorruptedError from dvc.utils.compat import open def load_stage_file(path): with open(path, "r", encoding="utf-8") as fd: - return load_stage_fd(fd, path) + return parse_stage(fd.read(), path) + + +def parse_stage(text, path): + try: + return yaml.load(text, Loader=SafeLoader) or {} + except yaml.error.YAMLError as exc: + raise StageFileCorruptedError(path, cause=exc) -def load_stage_fd(fd, path): +def parse_stage_for_update(text, path): try: yaml = YAML() - return yaml.load(fd) or {} + return yaml.load(text) or {} except YAMLError as exc: raise StageFileCorruptedError(path, cause=exc) diff --git a/setup.py b/setup.py index 00bd03bf66..d8f08940a7 100644 --- a/setup.py +++ b/setup.py @@ -72,6 +72,7 @@ def run(self): "treelib>=1.5.5", "inflect>=2.1.0", "humanize>=0.5.1", + "PyYAML>=5.1.2", "ruamel.yaml>=0.16.1", "funcy>=1.12", "pathspec>=0.6.0", From 329d072c2241f9414b87f84d95b6b1e1a92d7313 Mon Sep 17 00:00:00 2001 From: Alexander Schepanovski Date: Wed, 13 Nov 2019 16:35:05 +0100 Subject: [PATCH 2/2] perf: explain why two yaml parsers are used --- dvc/stage.py | 10 ++++++++-- dvc/utils/stage.py | 8 ++++++++ 2 files changed, 16 insertions(+), 2 deletions(-) diff --git a/dvc/stage.py b/dvc/stage.py index 32e8204bfb..48820cd867 100644 --- a/dvc/stage.py +++ b/dvc/stage.py @@ -671,10 +671,16 @@ def dump(self): ) state = self.dumpd() - # If we have stage file we apply diffs instead of overwriting it - # to preserve comments, string formatting and meta key. + # When we load a stage we parse yaml with a fast parser, which strips + # off all the comments and formatting. To retain those on update we do + # a trick here: + # - reparse the same yaml text with a slow but smart ruamel yaml parser + # - apply changes to a returned structure + # - serialize it if self._stage_text is not None: saved_state = parse_stage_for_update(self._stage_text, fname) + # Stage doesn't work with meta in any way, so .dumpd() doesn't + # have it. We simply copy it over. if "meta" in saved_state: state["meta"] = saved_state["meta"] apply_diff(state, saved_state) diff --git a/dvc/utils/stage.py b/dvc/utils/stage.py index 2d3f31618a..c3a4b0e66e 100644 --- a/dvc/utils/stage.py +++ b/dvc/utils/stage.py @@ -24,6 +24,14 @@ def parse_stage(text, path): def parse_stage_for_update(text, path): + """Parses text into Python structure. + + Unlike `parse_stage()` this returns ordereddicts, values have special + attributes to store comments and line breaks. This allows us to preserve + all of those upon dump. + + This one is, however, several times slower than simple `parse_stage()`. + """ try: yaml = YAML() return yaml.load(text) or {}