Skip to content

Commit

Permalink
Fix block scalar mangling bug lyz-code#231
Browse files Browse the repository at this point in the history
The regex based parsing for fixing comments was breaking block scalars.
By using the ruyaml round trip handler, instead the comment formatting
now can correctly identify block-scalars and avoid mangling them.
  • Loading branch information
wrouesnel committed Sep 28, 2023
1 parent 7e4aae6 commit 888d89d
Show file tree
Hide file tree
Showing 4 changed files with 209 additions and 30 deletions.
163 changes: 134 additions & 29 deletions src/yamlfix/adapters.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
"""Define adapter / helper classes to hide unrelated functionality in."""

import io
import logging
import re
from functools import partial
Expand All @@ -12,6 +12,7 @@
from ruyaml.tokens import CommentToken

from yamlfix.model import YamlfixConfig, YamlNodeStyle
from yamlfix.util import walk_object

log = logging.getLogger(__name__)

Expand Down Expand Up @@ -334,18 +335,12 @@ def __init__(self, yaml: Yaml, config: Optional[YamlfixConfig]) -> None:
self.yaml = yaml.yaml
self.config = config or YamlfixConfig()

def fix(self, source_code: str) -> str:
"""Run all yaml source code fixers.
Args:
source_code: Source code to be corrected.
Returns:
Corrected source code.
"""
log.debug("Running source code fixers...")

fixers = [
# The list of fixers to run, and the index of the currently called fixer.
# This allows fixers which might need to reinvoke other fixers to reinvoke the
# previous fixers (currently fix comments does this as it needs to re-emit the
# YAML).
self._fixer_idx = -1
self._fixers = [
self._fix_truthy_strings,
self._fix_jinja_variables,
self._ruamel_yaml_fixer,
Expand All @@ -359,8 +354,40 @@ def fix(self, source_code: str) -> str:
self._add_newline_at_end_of_file,
]

for fixer in fixers:
def _rerun_previous_fixers(self, source_code: str) -> str:
"""Run all the previous source code fixers except the currently running one.
This is re-entrant safe and will correctly restore the fixer idx once it's
complete.
Args:
source_code: Source code to be corrected.
Returns:
Corrected source code.
"""
cur_fixer_idx = self._fixer_idx
for fixer_idx, fixer in enumerate(self._fixers[:cur_fixer_idx]):
self._fixer_idx = fixer_idx
source_code = fixer(source_code)
# Restore fixer idx
self._fixer_idx = cur_fixer_idx
return source_code

def fix(self, source_code: str) -> str:
"""Run all yaml source code fixers.
Args:
source_code: Source code to be corrected.
Returns:
Corrected source code.
"""
log.debug("Running source code fixers...")

# Just use the re-run system do it.
self._fixer_idx = len(self._fixers)
source_code = self._rerun_previous_fixers(source_code)

return source_code

Expand Down Expand Up @@ -573,24 +600,102 @@ def _restore_truthy_strings(source_code: str) -> str:
def _fix_comments(self, source_code: str) -> str:
log.debug("Fixing comments...")
config = self.config
comment_start = " " * config.comments_min_spaces_from_content + "#"

fixed_source_lines = []
# We need the source lines for the comment fixers to analyze whitespace easily
source_lines = source_code.splitlines()

for line in source_code.splitlines():
# Comment at the start of the line
if config.comments_require_starting_space and re.search(r"(^|\s)#\w", line):
line = line.replace("#", "# ")
# Comment in the middle of the line, but it's not part of a string
if (
config.comments_min_spaces_from_content > 1
and " #" in line
and line[-1] not in ["'", '"']
):
line = re.sub(r"(.+\S)(\s+?)#", rf"\1{comment_start}", line)
fixed_source_lines.append(line)
yaml = YAML(typ="rt")
# Hijack config options from the regular fixer
yaml.explicit_start = self.yaml.explicit_start
yaml.width = self.yaml.width
# preserve_quotes however must always be true, otherwise we change output
# unexpectedly.
yaml.preserve_quotes = True # type: ignore
yaml_documents = list(yaml.load_all(source_code))

return "\n".join(fixed_source_lines)
handled_comments: List[CommentToken] = []

def _comment_token_cb(target: Any) -> None: # noqa:W0613,ANN401
"""This function is a callback for walk_object.
This implements the actual comment fixing, and is used because comment
objects are structured in nested-lists and can repeat.
"""
if not isinstance(target, CommentToken):
return
if any(target is e for e in handled_comments):
# This comment was handled at a higher level already.
return
if target.value is None:
return
comment_lines = target.value.split("\n")
fixed_comment_lines = []
for line in comment_lines:
if config.comments_require_starting_space and re.search(
r"(^|\s)#\w", line
):
line = line.replace("#", "# ")
fixed_comment_lines.append(line)

# Update the comment with the fixed lines
target.value = "\n".join(fixed_comment_lines)

if config.comments_min_spaces_from_content > 1:
# It's hard to reconstruct exactly where the content is, but since we
# have the line numbers what we do is lookup the literal source line
# here and check where the whitespace is compared to where we know the
# comment starts.
source_line = source_lines[target.start_mark.line]
content_part = source_line[0 : target.start_mark.column] # noqa: E203
# Find the non-whitespace position in the content part
rx_match = re.match(r"^.*\S", content_part)
if (
rx_match is not None
): # If no match then nothing to do - no content to be away from
_, content_end = rx_match.span()
# If there's less than min-spaces from content, we're going to add
# some.
if (
target.start_mark.column - content_end
< config.comments_min_spaces_from_content
):
# Handled
target.start_mark.column = (
content_end + config.comments_min_spaces_from_content
)
# Some ruyaml objects will return attached comments at multiple levels (but
# not all). Keep track of which comments we've already processed to avoid
# double processing them (important because we use raw source lines to
# determine content position above).
handled_comments.append(target)

def _comment_fixer(target: Any) -> None: # noqa:W0613,ANN401
"""This function is a callback for walk_object.
walk_object calls it for every object it finds, and then will walk the
mapping/sequence subvalues and call this function on those too. This gives
us direct access to all round tripped comments.
"""
if not hasattr(target, "ca"):
# Scalar or other object with no comment parameter.
return
# Find all comment tokens and fix them
walk_object(target.ca.comment, _comment_token_cb)
walk_object(target.ca.end, _comment_token_cb)
walk_object(target.ca.items, _comment_token_cb)
walk_object(target.ca.pre, _comment_token_cb)

# Walk the object and invoke the comment fixer
walk_object(yaml_documents, _comment_fixer)

# Dump out the YAML documents
stream = io.StringIO()
yaml.dump_all(yaml_documents, stream=stream)

fixed_source_code = stream.getvalue()
# Reinvoke the previous fixers to ensure we fix the new output we just created.
fixed_source_code = self._rerun_previous_fixers(fixed_source_code)
return fixed_source_code

def _fix_whitelines(self, source_code: str) -> str:
"""Fixes number of consecutive whitelines.
Expand Down
18 changes: 18 additions & 0 deletions src/yamlfix/util.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
"""Define utility helpers."""
from typing import Any, Callable, Iterable, Mapping


def walk_object(
target: Any, callback_fn: Callable[[Any], None] # noqa: ANN401
) -> None:
"""Walk a YAML/JSON-like object and call a function on all values."""
callback_fn(target) # Call the callback and whatever we received.

if isinstance(target, Mapping):
# Map type
for _, value in target.items():
walk_object(value, callback_fn)
elif isinstance(target, Iterable) and not isinstance(target, (bytes, str)):
# List type
for value in target:
walk_object(value, callback_fn)
48 changes: 48 additions & 0 deletions tests/unit/test_adapter_yaml.py
Original file line number Diff line number Diff line change
Expand Up @@ -1016,3 +1016,51 @@ def test_enforcing_flow_style_together_with_adjustable_newlines(self) -> None:
result = fix_code(source, config)

assert result == fixed_source

def test_block_scalar_whitespace_is_preserved(self) -> None:
"""Check that multi-line blocks with #'s in them are not treated as comments.
Regression test for https://github.com/lyz-code/yamlfix/issues/231
"""
source = dedent(
"""\
---
addn_doc_key: |-
#######################################
# This would also be broken #
#######################################
---
#Comment above the key
key: |-
###########################################
# Value with lots of whitespace #
# Some More Whitespace #
###########################################
#Comment below
#Comment with some whitespace below
""" # noqa: W293
)
fixed_source = dedent(
"""\
---
addn_doc_key: |-
#######################################
# This would also be broken #
#######################################
---
# Comment above the key
key: |-
###########################################
# Value with lots of whitespace #
# Some More Whitespace #
###########################################
# Comment below
# Comment with some whitespace below
""" # noqa: W293
)
config = YamlfixConfig()

result = fix_code(source, config)

assert result == fixed_source
10 changes: 9 additions & 1 deletion tests/unit/test_services.py
Original file line number Diff line number Diff line change
Expand Up @@ -469,7 +469,7 @@ def test_fix_code_functions_emit_debug_logs(

fix_code("") # act

expected_logs = {
expected_logs = { # noqa: W0130
"Setting up ruamel yaml 'quote simple values' configuration...",
"Setting up ruamel yaml 'sequence flow style' configuration...",
"Running ruamel yaml base configuration...",
Expand All @@ -481,6 +481,14 @@ def test_fix_code_functions_emit_debug_logs(
"Restoring jinja2 variables...",
"Restoring double exclamations...",
"Fixing comments...",
# Fixing comments causes a re-run of fixers, so we get duplicates from here
"Fixing truthy strings...",
"Fixing jinja2 variables...",
"Running ruamel yaml fixer...",
"Restoring truthy strings...",
"Restoring jinja2 variables...",
"Restoring double exclamations...",
# End fixing comments duplicates
"Fixing top level lists...",
"Fixing flow-style lists...",
}
Expand Down

0 comments on commit 888d89d

Please sign in to comment.