-
Notifications
You must be signed in to change notification settings - Fork 1.3k
perf: optimize yaml parsing for stages #2775
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -1,6 +1,5 @@ | ||
| from __future__ import unicode_literals | ||
|
|
||
| import copy | ||
| import logging | ||
| import os | ||
| import re | ||
|
|
@@ -27,7 +26,8 @@ | |
| from dvc.utils.compat import str | ||
| from dvc.utils.fs import contains_symlink_up_to | ||
| from dvc.utils.stage import dump_stage_file | ||
| from dvc.utils.stage import load_stage_fd | ||
| from dvc.utils.stage import parse_stage | ||
| from dvc.utils.stage import parse_stage_for_update | ||
|
|
||
|
|
||
| logger = logging.getLogger(__name__) | ||
|
|
@@ -170,8 +170,8 @@ def __init__( | |
| md5=None, | ||
| locked=False, | ||
| tag=None, | ||
| state=None, | ||
| always_changed=False, | ||
| stage_text=None, | ||
| ): | ||
| if deps is None: | ||
| deps = [] | ||
|
|
@@ -188,7 +188,7 @@ def __init__( | |
| self.locked = locked | ||
| self.tag = tag | ||
| self.always_changed = always_changed | ||
| self._state = state or {} | ||
| self._stage_text = stage_text | ||
|
|
||
| def __repr__(self): | ||
| return "Stage: '{path}'".format( | ||
|
|
@@ -613,10 +613,8 @@ def load(repo, fname): | |
| Stage._check_isfile(repo, fname) | ||
|
|
||
| with repo.tree.open(fname) as fd: | ||
| d = load_stage_fd(fd, fname) | ||
| # Making a deepcopy since the original structure | ||
| # looses keys in deps and outs load | ||
| state = copy.deepcopy(d) | ||
| stage_text = fd.read() | ||
| d = parse_stage(stage_text, fname) | ||
|
|
||
| Stage.validate(d, fname=relpath(fname)) | ||
| path = os.path.abspath(fname) | ||
|
|
@@ -634,7 +632,8 @@ def load(repo, fname): | |
| locked=d.get(Stage.PARAM_LOCKED, False), | ||
| tag=tag, | ||
| always_changed=d.get(Stage.PARAM_ALWAYS_CHANGED, False), | ||
| state=state, | ||
| # We store stage text to apply updates to the same structure | ||
| stage_text=stage_text, | ||
| ) | ||
|
|
||
| stage.deps = dependency.loadd_from(stage, d.get(Stage.PARAM_DEPS, [])) | ||
|
|
@@ -657,7 +656,6 @@ def dumpd(self): | |
| Stage.PARAM_LOCKED: self.locked, | ||
| Stage.PARAM_DEPS: [d.dumpd() for d in self.deps], | ||
| Stage.PARAM_OUTS: [o.dumpd() for o in self.outs], | ||
| Stage.PARAM_META: self._state.get("meta"), | ||
| Stage.PARAM_ALWAYS_CHANGED: self.always_changed, | ||
| }.items() | ||
| if value | ||
|
|
@@ -671,9 +669,24 @@ def dump(self): | |
| logger.debug( | ||
| "Saving information to '{file}'.".format(file=relpath(fname)) | ||
| ) | ||
| d = self.dumpd() | ||
| apply_diff(d, self._state) | ||
| dump_stage_file(fname, self._state) | ||
| state = self.dumpd() | ||
|
|
||
| # When we load a stage we parse yaml with a fast parser, which strips | ||
| # off all the comments and formatting. To retain those on update we do | ||
| # a trick here: | ||
| # - reparse the same yaml text with a slow but smart ruamel yaml parser | ||
| # - apply changes to a returned structure | ||
| # - serialize it | ||
| if self._stage_text is not None: | ||
| saved_state = parse_stage_for_update(self._stage_text, fname) | ||
| # Stage doesn't work with meta in any way, so .dumpd() doesn't | ||
| # have it. We simply copy it over. | ||
| if "meta" in saved_state: | ||
| state["meta"] = saved_state["meta"] | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. why is some special handling required for meta here? (vs just a single
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Because
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. But why didn't we have the same (seemingly very ad-hoc and fragile logic) before? We were using dump before right? and were applying diff on top of it?
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I see. Before it was part of the Still looks fragile that we have one more place that manipulates with all these attributes. It feels that it would be better to follow the "regular" way in this file and make the Also, wondering if
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. The regular way now will mean we will need to store I am not sure that is a good idea, I don't want to add an another attribute, which is never used in fact. And even less I want to add another constructor param. |
||
| apply_diff(state, saved_state) | ||
| state = saved_state | ||
|
|
||
| dump_stage_file(fname, state) | ||
|
|
||
| self.repo.scm.track_file(relpath(fname)) | ||
|
|
||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -1,19 +1,40 @@ | ||
| import yaml | ||
| from ruamel.yaml import YAML | ||
| from ruamel.yaml.error import YAMLError | ||
|
|
||
| try: | ||
| from yaml import CSafeLoader as SafeLoader | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. is it installed automatically on most systems?
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. PyYAML for Windows has wheels, which include binaries. I didn't test how it works though. For other systems you will need sudo apt install libyaml-devBefore installing PyYAML, otherwise it won't be linked. Pure python PyYAML still works 2x fast as
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. That is quite a requirement, and won't be easy to explain to the users. This will result in dvc stage collection staying slow for pretty much everyone, except a few guys who will discover that they need to install libyam-dev. Though 2x is still pretty good.
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Poking PyYAML guy and helping him is still an option. Will work on top of this seamlessly.
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. 2x sounds good and we can def ping the guys (let's do this right now?) |
||
| except ImportError: | ||
| from yaml import SafeLoader | ||
|
|
||
| from dvc.exceptions import StageFileCorruptedError | ||
| from dvc.utils.compat import open | ||
|
|
||
|
|
||
| def load_stage_file(path): | ||
| with open(path, "r", encoding="utf-8") as fd: | ||
| return load_stage_fd(fd, path) | ||
| return parse_stage(fd.read(), path) | ||
|
|
||
|
|
||
| def parse_stage(text, path): | ||
| try: | ||
| return yaml.load(text, Loader=SafeLoader) or {} | ||
| except yaml.error.YAMLError as exc: | ||
| raise StageFileCorruptedError(path, cause=exc) | ||
|
|
||
|
|
||
| def parse_stage_for_update(text, path): | ||
| """Parses text into Python structure. | ||
|
|
||
| Unlike `parse_stage()` this returns ordereddicts, values have special | ||
| attributes to store comments and line breaks. This allows us to preserve | ||
| all of those upon dump. | ||
|
|
||
| def load_stage_fd(fd, path): | ||
| This one is, however, several times slower than simple `parse_stage()`. | ||
| """ | ||
| try: | ||
| yaml = YAML() | ||
| return yaml.load(fd) or {} | ||
| return yaml.load(text) or {} | ||
| except YAMLError as exc: | ||
| raise StageFileCorruptedError(path, cause=exc) | ||
|
|
||
|
|
||
Uh oh!
There was an error while loading. Please reload this page.