Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
39 changes: 26 additions & 13 deletions dvc/stage.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,5 @@
from __future__ import unicode_literals

import copy
import logging
import os
import re
Expand All @@ -27,7 +26,8 @@
from dvc.utils.compat import str
from dvc.utils.fs import contains_symlink_up_to
from dvc.utils.stage import dump_stage_file
from dvc.utils.stage import load_stage_fd
from dvc.utils.stage import parse_stage
from dvc.utils.stage import parse_stage_for_update


logger = logging.getLogger(__name__)
Expand Down Expand Up @@ -170,8 +170,8 @@ def __init__(
md5=None,
locked=False,
tag=None,
state=None,
always_changed=False,
stage_text=None,
):
if deps is None:
deps = []
Expand All @@ -188,7 +188,7 @@ def __init__(
self.locked = locked
self.tag = tag
self.always_changed = always_changed
self._state = state or {}
self._stage_text = stage_text

def __repr__(self):
return "Stage: '{path}'".format(
Expand Down Expand Up @@ -613,10 +613,8 @@ def load(repo, fname):
Stage._check_isfile(repo, fname)

with repo.tree.open(fname) as fd:
d = load_stage_fd(fd, fname)
# Making a deepcopy since the original structure
# looses keys in deps and outs load
state = copy.deepcopy(d)
stage_text = fd.read()
d = parse_stage(stage_text, fname)

Stage.validate(d, fname=relpath(fname))
path = os.path.abspath(fname)
Expand All @@ -634,7 +632,8 @@ def load(repo, fname):
locked=d.get(Stage.PARAM_LOCKED, False),
tag=tag,
always_changed=d.get(Stage.PARAM_ALWAYS_CHANGED, False),
state=state,
# We store stage text to apply updates to the same structure
stage_text=stage_text,
)

stage.deps = dependency.loadd_from(stage, d.get(Stage.PARAM_DEPS, []))
Expand All @@ -657,7 +656,6 @@ def dumpd(self):
Stage.PARAM_LOCKED: self.locked,
Stage.PARAM_DEPS: [d.dumpd() for d in self.deps],
Stage.PARAM_OUTS: [o.dumpd() for o in self.outs],
Stage.PARAM_META: self._state.get("meta"),
Stage.PARAM_ALWAYS_CHANGED: self.always_changed,
}.items()
if value
Expand All @@ -671,9 +669,24 @@ def dump(self):
logger.debug(
"Saving information to '{file}'.".format(file=relpath(fname))
)
d = self.dumpd()
apply_diff(d, self._state)
dump_stage_file(fname, self._state)
state = self.dumpd()

# When we load a stage we parse yaml with a fast parser, which strips
# off all the comments and formatting. To retain those on update we do
# a trick here:
# - reparse the same yaml text with a slow but smart ruamel yaml parser
# - apply changes to a returned structure
# - serialize it
if self._stage_text is not None:
saved_state = parse_stage_for_update(self._stage_text, fname)
# Stage doesn't work with meta in any way, so .dumpd() doesn't
# have it. We simply copy it over.
if "meta" in saved_state:
state["meta"] = saved_state["meta"]
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

why is some special handling required for meta here? (vs just a single apply_diff before)

Copy link
Contributor Author

@Suor Suor Nov 12, 2019

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Because state, which we get from self.dumpd(), does not have a meta key. So apply_diff() will remove it from the target structure. The alternative would be to store meta in some key and then include it into .dumpd(), which is more hassle.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

But why didn't we have the same (seemingly very ad-hoc and fragile) logic before? We were using dump before, right? And were applying the diff on top of it?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I see. Before it was part of the dumpd. I missed that a line was removed in dumpd as well.

Still, it looks fragile that we have one more place that manipulates all these attributes. It feels that it would be better to follow the "regular" way in this file and make meta a regular stage attribute, so that we don't have a separate path to handle just it.

Also, wondering if is_cached can be optimized considering that we are saving the state.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The regular way now will mean we will need to store meta on some attribute, then return it in .dumpd() then let it be applied to the same meta, but with comments etc inside apply_diff().

I am not sure that is a good idea. I don't want to add another attribute, which is in fact never used. And even less do I want to add another constructor param.

apply_diff(state, saved_state)
state = saved_state

dump_stage_file(fname, state)

self.repo.scm.track_file(relpath(fname))

Expand Down
27 changes: 24 additions & 3 deletions dvc/utils/stage.py
Original file line number Diff line number Diff line change
@@ -1,19 +1,40 @@
import yaml
from ruamel.yaml import YAML
from ruamel.yaml.error import YAMLError

try:
from yaml import CSafeLoader as SafeLoader
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

is it installed automatically on most systems?

Copy link
Contributor Author

@Suor Suor Nov 12, 2019

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

PyYAML for Windows has wheels, which include binaries. I didn't test how it works though. For other systems you will need libyaml-dev installed to use C. Was not installed in my Ubuntu, so one needs to:

sudo apt install libyaml-dev

Before installing PyYAML, otherwise it won't be linked.

Pure-Python PyYAML still works 2x as fast as ruamel.yaml. We might also poke the PyYAML author to create all the wheels.

Copy link
Contributor

@efiop efiop Nov 12, 2019

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

That is quite a requirement, and won't be easy to explain to the users. This will result in dvc stage collection staying slow for pretty much everyone, except a few guys who will discover that they need to install libyaml-dev. Though 2x is still pretty good.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Poking PyYAML guy and helping him is still an option. Will work on top of this seamlessly.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

2x sounds good and we can def ping the guys (let's do this right now?)

except ImportError:
from yaml import SafeLoader

from dvc.exceptions import StageFileCorruptedError
from dvc.utils.compat import open


def load_stage_file(path):
    """Read the stage file at `path` and parse it with the fast YAML loader.

    The file is opened as UTF-8 text and its whole content is handed to
    `parse_stage()`, which raises `StageFileCorruptedError` when the text
    is not valid YAML.

    Args:
        path: location of the stage file; also used in the error report.

    Returns:
        The parsed structure, or an empty dict for an empty document.
    """
    with open(path, "r", encoding="utf-8") as fd:
        return parse_stage(fd.read(), path)


def parse_stage(text, path):
    """Parse stage YAML `text` using the module's `SafeLoader`.

    Args:
        text: raw YAML content of a stage file.
        path: stage file location, only used to report a corrupted file.

    Returns:
        The parsed structure; an empty dict when the document parses to a
        falsy value (e.g. an empty file).

    Raises:
        StageFileCorruptedError: if the text cannot be parsed as YAML.
    """
    try:
        parsed = yaml.load(text, Loader=SafeLoader)
    except yaml.error.YAMLError as exc:
        raise StageFileCorruptedError(path, cause=exc)

    # An empty document loads as None; callers always expect a mapping.
    if not parsed:
        return {}
    return parsed


def parse_stage_for_update(text, path):
    """Parses text into Python structure.

    Unlike `parse_stage()` this returns ordereddicts, values have special
    attributes to store comments and line breaks. This allows us to preserve
    all of those upon dump.

    This one is, however, several times slower than simple `parse_stage()`.

    Args:
        text: raw YAML content of a stage file.
        path: stage file location, only used to report a corrupted file.

    Returns:
        The parsed structure, or an empty dict for an empty document.

    Raises:
        StageFileCorruptedError: if the text cannot be parsed as YAML.
    """
    try:
        # Named `parser` so the local does not shadow the module-level
        # `yaml` (PyYAML) import used by `parse_stage()`.
        parser = YAML()
        return parser.load(text) or {}
    except YAMLError as exc:
        raise StageFileCorruptedError(path, cause=exc)

Expand Down
1 change: 1 addition & 0 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -72,6 +72,7 @@ def run(self):
"treelib>=1.5.5",
"inflect>=2.1.0",
"humanize>=0.5.1",
"PyYAML>=5.1.2",
"ruamel.yaml>=0.16.1",
"funcy>=1.12",
"pathspec>=0.6.0",
Expand Down