From afa4510fdcfe9326aa023c72e506d593f106c101 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Daniele=20Trifir=C3=B2?= Date: Mon, 27 Jul 2020 16:11:49 +0200 Subject: [PATCH 1/2] workaround for #4281 --- dvc/utils/yaml.py | 1 + 1 file changed, 1 insertion(+) diff --git a/dvc/utils/yaml.py b/dvc/utils/yaml.py index 70fb46f31a..e43838e254 100644 --- a/dvc/utils/yaml.py +++ b/dvc/utils/yaml.py @@ -44,6 +44,7 @@ def parse_yaml_for_update(text, path): def dump_yaml(path, data): with open(path, "w", encoding="utf-8") as fd: yaml = YAML() + yaml.version = (1, 1) # Workaround for #4281 yaml.default_flow_style = False # tell Dumper to represent OrderedDict as # normal dict From cbe7fe6e30afc0767bae18fc98c5ae4b8f54ce7d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Saugat=20Pachhai=20=28=E0=A4=B8=E0=A5=8C=E0=A4=97=E0=A4=BE?= =?UTF-8?q?=E0=A4=A4=29?= Date: Mon, 10 Aug 2020 23:15:50 +0545 Subject: [PATCH 2/2] dump_yaml: dump as YAML 1.1 * only dump dvc.lock in 1.1, dump dvc.yaml in 1.2 --- dvc/dvcfile.py | 13 +++- dvc/utils/yaml.py | 125 +++++++++++++++++++++++++++------- tests/unit/utils/test_yaml.py | 82 ++++++++++++++++++++++ 3 files changed, 193 insertions(+), 27 deletions(-) create mode 100644 tests/unit/utils/test_yaml.py diff --git a/dvc/dvcfile.py b/dvc/dvcfile.py index 5b0f2f38c6..a6c4daed22 100644 --- a/dvc/dvcfile.py +++ b/dvc/dvcfile.py @@ -16,7 +16,12 @@ from dvc.stage.loader import SingleStageLoader, StageLoader from dvc.utils import relpath from dvc.utils.collections import apply_diff -from dvc.utils.yaml import dump_yaml, parse_yaml, parse_yaml_for_update +from dvc.utils.yaml import ( + YAMLVersion, + dump_yaml, + parse_yaml, + parse_yaml_for_update, +) logger = logging.getLogger(__name__) @@ -176,7 +181,9 @@ def _dump_pipeline_file(self, stage): data = {} if self.exists(): with open(self.path) as fd: - data = parse_yaml_for_update(fd.read(), self.path) + data = parse_yaml_for_update( + fd.read(), self.path, version=YAMLVersion.V12 + ) else: logger.info("Creating '%s'", self.relpath) open(self.path, "w+").close() @@ -196,7 +203,7 @@ def _dump_pipeline_file(self, stage): else: data["stages"].update(stage_data) - dump_yaml(self.path, data) + dump_yaml(self.path, data, version=YAMLVersion.V12) self.repo.scm.track_file(self.relpath) @property diff --git a/dvc/utils/yaml.py b/dvc/utils/yaml.py index e43838e254..15a421a645 100644 --- a/dvc/utils/yaml.py +++ b/dvc/utils/yaml.py @@ -1,7 +1,10 @@ from collections import OrderedDict +from funcy import reraise from ruamel.yaml import YAML +from ruamel.yaml.emitter import Emitter from ruamel.yaml.error import YAMLError +from ruamel.yaml.events import DocumentStartEvent from dvc.exceptions import YAMLFileCorruptedError @@ -11,21 +14,43 @@ from yaml import SafeLoader -def load_yaml(path): - with open(path, encoding="utf-8") as fd: - return parse_yaml(fd.read(), path) +class YAMLVersion: + V11 = (1, 1) + V12 = (1, 2) + + +def _ruamel_load(text, path, *, version=None, typ="safe"): + yaml = YAML(typ=typ) + if version in (None, YAMLVersion.V11): + yaml.version = YAMLVersion.V11 + with reraise(YAMLError, YAMLFileCorruptedError(path)): + return yaml.load(text) or {} -def parse_yaml(text, path): - try: - import yaml +def _parse_yaml_v1_1(text, path): + import yaml + with reraise(yaml.error.YAMLError, YAMLFileCorruptedError(path)): return yaml.load(text, Loader=SafeLoader) or {} - except yaml.error.YAMLError as exc: - raise YAMLFileCorruptedError(path) from exc -def parse_yaml_for_update(text, path): +def _parse_yaml_v1_2(text, path): + return _ruamel_load(text, path, version=YAMLVersion.V12) + + +def parse_yaml(text, path, *, version=None): + parser = _parse_yaml_v1_1 + if version == YAMLVersion.V12: + parser = _parse_yaml_v1_2 + return parser(text, path) + + +def load_yaml(path, *, version=None): + with open(path, encoding="utf-8") as fd: + return parse_yaml(fd.read(), path, version=version) + + +def parse_yaml_for_update(text, path, *, version=None): """Parses text into Python structure. Unlike `parse_yaml()` this returns ordered dicts, values have special @@ -34,21 +59,73 @@ def parse_yaml_for_update(text, path): This one is, however, several times slower than simple `parse_yaml()`. """ - try: - yaml = YAML() - return yaml.load(text) or {} - except YAMLError as exc: - raise YAMLFileCorruptedError(path) from exc + return _ruamel_load(text, path, version=version, typ="rt") + + +class _YAMLEmitterNoVersionDirective(Emitter): + """ + This emitter skips printing version directive when we set yaml version + on `dump_yaml()`. Also, ruamel.yaml will still try to add a document start + marker line (assuming version directive was written), for which we + need to find a _hack_ to ensure the marker line is not written to the + stream, as our dvcfiles and hopefully, params file are single document + YAML files. + + NOTE: do not use this emitter during load/parse, only when dump for 1.1 + """ + + MARKER_START_LINE = "---" + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + self._first_document = False + + def write_version_directive(self, version_text): + """Do not write version directive at all.""" + + def expect_first_document_start(self): + # as our yaml files are expected to only have a single document, + # this is not needed, just trying to make it a bit resilient, + # but it's not well-thought out. + self._first_document = True + ret = super().expect_first_document_start() + self._first_document = False + return ret + + # pylint: disable=signature-differs + def write_indicator(self, indicator, *args, **kwargs): + # NOTE: if the yaml file already have a directive, + # this will strip it + if isinstance(self.event, DocumentStartEvent): + skip_marker = ( + # see comments in _expect_first_document_start() + getattr(self, "_first_document", False) + and not self.event.explicit + and not self.canonical + and not self.event.tags + ) + if skip_marker and indicator == self.MARKER_START_LINE: + return + super().write_indicator(indicator, *args, **kwargs) + + +def _dump_yaml(data, stream, *, version=None, with_directive=False): + yaml = YAML() + if version in (None, YAMLVersion.V11): + yaml.version = YAMLVersion.V11 # Workaround for #4281 + if not with_directive: + yaml.Emitter = _YAMLEmitterNoVersionDirective + elif with_directive and version == YAMLVersion.V12: + # `ruamel.yaml` dumps in 1.2 by default + yaml.version = version + + yaml.default_flow_style = False + yaml.Representer.add_representer( + OrderedDict, yaml.Representer.represent_dict + ) + yaml.dump(data, stream) -def dump_yaml(path, data): +def dump_yaml(path, data, *, version=None, with_directive=False): with open(path, "w", encoding="utf-8") as fd: - yaml = YAML() - yaml.version = (1, 1) # Workaround for #4281 - yaml.default_flow_style = False - # tell Dumper to represent OrderedDict as - # normal dict - yaml.Representer.add_representer( - OrderedDict, yaml.Representer.represent_dict - ) - yaml.dump(data, fd) + _dump_yaml(data, fd, version=version, with_directive=with_directive) diff --git a/tests/unit/utils/test_yaml.py b/tests/unit/utils/test_yaml.py new file mode 100644 index 0000000000..361da53840 --- /dev/null +++ b/tests/unit/utils/test_yaml.py @@ -0,0 +1,82 @@ +import pytest + +from dvc.exceptions import YAMLFileCorruptedError +from dvc.utils.yaml import ( + YAMLVersion, + dump_yaml, + parse_yaml, + parse_yaml_for_update, +) + +V12 = YAMLVersion.V12 +V11 = YAMLVersion.V11 +V12_DIRECTIVE = "%YAML 1.2\n---\n" +V11_DIRECTIVE = "%YAML 1.1\n---\n" + + +@pytest.mark.parametrize("data", [{"x": 3e24}]) +@pytest.mark.parametrize("with_directive", [True, False]) +@pytest.mark.parametrize( + "ver, directive, expected", + [ + # dot before mantissa is not required in yaml1.2, + # whereas it's required in yaml1.1 + (V12, V12_DIRECTIVE, "x: 3e+24\n"), + (V11, V11_DIRECTIVE, "x: 3.0e+24\n"), + ], +) +def test_dump_yaml_with_directive( + tmp_dir, ver, directive, expected, with_directive, data +): + dump_yaml("data.yaml", data, version=ver, with_directive=with_directive) + actual = (tmp_dir / "data.yaml").read_text() + exp = expected if not with_directive else directive + expected + assert actual == exp + + +@pytest.mark.parametrize( + "parser, rt_parser", [(parse_yaml, False), (parse_yaml_for_update, True)] +) +def test_load_yaml(parser, rt_parser): + # ruamel.yaml.load() complains about dot before mantissa not allowed + # on 1.1 and goes on anyway to convert this to a float based on 1.2 spec + str_value = "3e24" + float_value = float(str_value) + yaml11_text = "x: 3.0e+24" # pyyaml parses as str if there's no +/- sign + # luckily, `ruamel.yaml` always dumps with sign + yaml12_text = f"x: {str_value}" + assert parser(yaml11_text, "data.yaml") == {"x": float_value} + assert parser(yaml12_text, "data.yaml") == { + "x": float_value if rt_parser else str_value + } + + assert parser(yaml11_text, "data.yaml", version=V12) == {"x": float_value} + assert parser(yaml12_text, "data.yaml", version=V12) == {"x": float_value} + + with pytest.raises(YAMLFileCorruptedError): + assert parser("invalid: '", "data.yaml") + + with pytest.raises(YAMLFileCorruptedError): + assert parser("invalid: '", "data.yaml", version=V12) + + +def test_comments_are_preserved_on_update_and_dump(tmp_dir): + text = "x: 3 # this is a comment" + d = parse_yaml_for_update(text, "data.yaml") + d["w"] = 7e24 + + dump_yaml("data.yaml", d) + assert (tmp_dir / "data.yaml").read_text() == f"{text}\nw: 7.0e+24\n" + + dump_yaml("data.yaml", d, with_directive=True) + assert ( + tmp_dir / "data.yaml" + ).read_text() == V11_DIRECTIVE + f"{text}\nw: 7.0e+24\n" + + dump_yaml("data.yaml", d, version=V12) + assert (tmp_dir / "data.yaml").read_text() == f"{text}\nw: 7e+24\n" + + dump_yaml("data.yaml", d, with_directive=True, version=V12) + assert ( + tmp_dir / "data.yaml" + ).read_text() == V12_DIRECTIVE + f"{text}\nw: 7e+24\n"