Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Special char escaping #98

Merged
merged 4 commits into from Nov 24, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
90 changes: 45 additions & 45 deletions BENCHMARK.md
@@ -1,6 +1,6 @@
Benchmark

Benchmark run from 2023-11-18 14:03:41.157149Z UTC
Benchmark run from 2023-11-23 17:53:11.749718Z UTC

## System

Expand Down Expand Up @@ -65,20 +65,20 @@ Run Time

<tr>
<td style="white-space: nowrap">Jason</td>
<td style="white-space: nowrap; text-align: right">267.76</td>
<td style="white-space: nowrap; text-align: right">3.73 ms</td>
<td style="white-space: nowrap; text-align: right">&plusmn;14.12%</td>
<td style="white-space: nowrap; text-align: right">3.45 ms</td>
<td style="white-space: nowrap; text-align: right">4.99 ms</td>
<td style="white-space: nowrap; text-align: right">248.88</td>
<td style="white-space: nowrap; text-align: right">4.02 ms</td>
<td style="white-space: nowrap; text-align: right">&plusmn;14.75%</td>
<td style="white-space: nowrap; text-align: right">3.81 ms</td>
<td style="white-space: nowrap; text-align: right">5.72 ms</td>
</tr>

<tr>
<td style="white-space: nowrap">Ymlr</td>
<td style="white-space: nowrap; text-align: right">22.80</td>
<td style="white-space: nowrap; text-align: right">43.87 ms</td>
<td style="white-space: nowrap; text-align: right">&plusmn;2.48%</td>
<td style="white-space: nowrap; text-align: right">43.77 ms</td>
<td style="white-space: nowrap; text-align: right">48.11 ms</td>
<td style="white-space: nowrap; text-align: right">3.27</td>
<td style="white-space: nowrap; text-align: right">305.72 ms</td>
<td style="white-space: nowrap; text-align: right">&plusmn;1.14%</td>
<td style="white-space: nowrap; text-align: right">305.02 ms</td>
<td style="white-space: nowrap; text-align: right">317.72 ms</td>
</tr>

</table>
Expand All @@ -93,14 +93,14 @@ Run Time Comparison
<th style="text-align: right">Slower</th>
<tr>
<td style="white-space: nowrap">Jason</td>
<td style="white-space: nowrap;text-align: right">267.76</td>
<td style="white-space: nowrap;text-align: right">248.88</td>
<td>&nbsp;</td>
</tr>

<tr>
<td style="white-space: nowrap">Ymlr</td>
<td style="white-space: nowrap; text-align: right">22.80</td>
<td style="white-space: nowrap; text-align: right">11.75x</td>
<td style="white-space: nowrap; text-align: right">3.27</td>
<td style="white-space: nowrap; text-align: right">76.09x</td>
</tr>

</table>
Expand All @@ -122,8 +122,8 @@ Memory Usage
</tr>
<tr>
<td style="white-space: nowrap">Ymlr</td>
<td style="white-space: nowrap">40.60 MB</td>
<td>8.45x</td>
<td style="white-space: nowrap">68.18 MB</td>
<td>14.19x</td>
</tr>
</table>

Expand All @@ -145,20 +145,20 @@ Run Time

<tr>
<td style="white-space: nowrap">Jason</td>
<td style="white-space: nowrap; text-align: right">121.07</td>
<td style="white-space: nowrap; text-align: right">8.26 ms</td>
<td style="white-space: nowrap; text-align: right">&plusmn;14.07%</td>
<td style="white-space: nowrap; text-align: right">7.82 ms</td>
<td style="white-space: nowrap; text-align: right">13.76 ms</td>
<td style="white-space: nowrap; text-align: right">121.34</td>
<td style="white-space: nowrap; text-align: right">8.24 ms</td>
<td style="white-space: nowrap; text-align: right">&plusmn;142.36%</td>
<td style="white-space: nowrap; text-align: right">7.35 ms</td>
<td style="white-space: nowrap; text-align: right">13.61 ms</td>
</tr>

<tr>
<td style="white-space: nowrap">Ymlr</td>
<td style="white-space: nowrap; text-align: right">22.00</td>
<td style="white-space: nowrap; text-align: right">45.45 ms</td>
<td style="white-space: nowrap; text-align: right">&plusmn;19.87%</td>
<td style="white-space: nowrap; text-align: right">45.58 ms</td>
<td style="white-space: nowrap; text-align: right">59.34 ms</td>
<td style="white-space: nowrap; text-align: right">21.10</td>
<td style="white-space: nowrap; text-align: right">47.39 ms</td>
<td style="white-space: nowrap; text-align: right">&plusmn;62.18%</td>
<td style="white-space: nowrap; text-align: right">43.84 ms</td>
<td style="white-space: nowrap; text-align: right">319.16 ms</td>
</tr>

</table>
Expand All @@ -173,14 +173,14 @@ Run Time Comparison
<th style="text-align: right">Slower</th>
<tr>
<td style="white-space: nowrap">Jason</td>
<td style="white-space: nowrap;text-align: right">121.07</td>
<td style="white-space: nowrap;text-align: right">121.34</td>
<td>&nbsp;</td>
</tr>

<tr>
<td style="white-space: nowrap">Ymlr</td>
<td style="white-space: nowrap; text-align: right">22.00</td>
<td style="white-space: nowrap; text-align: right">5.5x</td>
<td style="white-space: nowrap; text-align: right">21.10</td>
<td style="white-space: nowrap; text-align: right">5.75x</td>
</tr>

</table>
Expand All @@ -202,7 +202,7 @@ Memory Usage
</tr>
<tr>
<td style="white-space: nowrap">Ymlr</td>
<td style="white-space: nowrap">50.08 MB</td>
<td style="white-space: nowrap">50.09 MB</td>
<td>5.43x</td>
</tr>
</table>
Expand All @@ -225,20 +225,20 @@ Run Time

<tr>
<td style="white-space: nowrap">Jason</td>
<td style="white-space: nowrap; text-align: right">349.85</td>
<td style="white-space: nowrap; text-align: right">2.86 ms</td>
<td style="white-space: nowrap; text-align: right">&plusmn;6.74%</td>
<td style="white-space: nowrap; text-align: right">2.81 ms</td>
<td style="white-space: nowrap; text-align: right">3.49 ms</td>
<td style="white-space: nowrap; text-align: right">368.32</td>
<td style="white-space: nowrap; text-align: right">2.72 ms</td>
<td style="white-space: nowrap; text-align: right">&plusmn;251.44%</td>
<td style="white-space: nowrap; text-align: right">2.57 ms</td>
<td style="white-space: nowrap; text-align: right">3.29 ms</td>
</tr>

<tr>
<td style="white-space: nowrap">Ymlr</td>
<td style="white-space: nowrap; text-align: right">28.33</td>
<td style="white-space: nowrap; text-align: right">35.30 ms</td>
<td style="white-space: nowrap; text-align: right">&plusmn;4.73%</td>
<td style="white-space: nowrap; text-align: right">35.12 ms</td>
<td style="white-space: nowrap; text-align: right">45.67 ms</td>
<td style="white-space: nowrap; text-align: right">4.76</td>
<td style="white-space: nowrap; text-align: right">210.14 ms</td>
<td style="white-space: nowrap; text-align: right">&plusmn;0.45%</td>
<td style="white-space: nowrap; text-align: right">209.83 ms</td>
<td style="white-space: nowrap; text-align: right">211.96 ms</td>
</tr>

</table>
Expand All @@ -253,14 +253,14 @@ Run Time Comparison
<th style="text-align: right">Slower</th>
<tr>
<td style="white-space: nowrap">Jason</td>
<td style="white-space: nowrap;text-align: right">349.85</td>
<td style="white-space: nowrap;text-align: right">368.32</td>
<td>&nbsp;</td>
</tr>

<tr>
<td style="white-space: nowrap">Ymlr</td>
<td style="white-space: nowrap; text-align: right">28.33</td>
<td style="white-space: nowrap; text-align: right">12.35x</td>
<td style="white-space: nowrap; text-align: right">4.76</td>
<td style="white-space: nowrap; text-align: right">77.4x</td>
</tr>

</table>
Expand All @@ -282,7 +282,7 @@ Memory Usage
</tr>
<tr>
<td style="white-space: nowrap">Ymlr</td>
<td style="white-space: nowrap">47.48 MB</td>
<td>18.67x</td>
<td style="white-space: nowrap">65.86 MB</td>
<td>25.89x</td>
</tr>
</table>
4 changes: 4 additions & 0 deletions CHANGELOG.md
Expand Up @@ -10,6 +10,10 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/).

<!-- ### Added | Changed | Deprecated | Removed | Fixed | Security -->

### Added

- Support for escape and unicode characters [#98](https://github.com/ufirstgroup/ymlr/pull/98)

<!--------------------- Don't add new entries after this line --------------------->

### Added
Expand Down
143 changes: 138 additions & 5 deletions lib/ymlr/encode.ex
@@ -1,5 +1,67 @@
defmodule Ymlr.Encode do
@moduledoc false
@moduledoc ~S"""

This module implements the logic of encoding scalars.

## Strings and Characters

### Printable Characters

The YAML spec defines a set of printable characters `c-printable` (see
https://yaml.org/spec/1.2.2/#character-set). All these characters can
theoretically be left alone when encoding a string.

### Escape Characters

The YAML spec also defines a set of escape characters `c-ns-esc-char` (see
https://yaml.org/spec/1.2.2/#57-escaped-characters). Some of these chars are
also in the printable range `c-printable`. Being in `c-printable` means they
could be left alone. I.e. there would be no need to encode them as escape
chars. However, we think in certain cases, escape characters are more
reader friendly than the actual characters. An example is the "next line"
character (`U+0085` or `\N`). It is part of `c-printable`. However, on the
screen this character cannot be distinguished from a simple "line feed"
character (`U+000A` or `\n`). Therefore all characters in `c-ns-esc-char` with
the exception of `\n` and `\t` are always encoded using their escape character.

### Other 8-bit Unicode Characters

Any 8-bit unicode character that is neither a printable nor an escape character
has to be encoded using one of the three unicode escape characters \x, \u or
\U (i.e. \xXX, \u00XX or \U000000XX).

### Double Quotes for Escape Characters

Printable Characters can be encoded unquoted, single-quoted or double-quoted.
Escape characters require double quotes.

### Chars with Special Treatments

#### Chars `\n` and `\t`

These two characters are never converted to their escape characters.
One exception: If the given string is literally just a newline, we
encode it as "\n" (double quotes required for escape chars) rather than a
single newline.

#### Chars `"` and `\`

These two characters have escape characters (`\"` and `\\`) but they are also
part of the printable character range `c-printable` and they have a
well-defined presentation on the screen. Occurrence of these characters doesn't
enforce double-quotes but if they occur within a string that for other reasons
requires double-quotes, they need to be escaped.

### Implemented Decision Logic

First matching rule is applied:

1. Char is `\t` or `\n` => leave alone
1. Char is `"` or `\` => if within double quotes, escape. Otherwise leave alone.
1. Char has an escape character (i.e. is part of `c-ns-esc-char`) => force double quotes and encode as escape character
1. Char is a printable character => leave alone
1. Char is a non-printable character => force double quotes and encode as \xXX (only 8-bit supported for now)
"""

alias Ymlr.Encoder

Expand Down Expand Up @@ -38,6 +100,51 @@ defmodule Ymlr.Encode do
":"
]

# Escape chars that, if contained within, force the string to be double-quoted:
@escape_chars_forcing_double_quotes ~c"\a\b\e\f\r\v\0\u00a0\u0085\u2028\u2029"

# Chars that have to be escaped if within double quotes:
@escape_if_within_double_quotes @escape_chars_forcing_double_quotes ++ ~c"\"\\"

# mapping char => escape char.
@escape_if_within_double_quotes_mapping Enum.zip(
@escape_if_within_double_quotes,
~c"abefrv0_NLP\"\\"
)

# Printable Characters (the `c-printable` ranges up to 0x10FFFF; CR is deliberately left out, see below):
@printable_chars List.flatten([
# Tab (\t)
0x09,
# Line feed (LF \n)
0x0A,
# Carriage Return (CR \r)
# 0x0D, theoretically printable, seems to require double quotes.
# Next Line (NEL)
0x85,
# Printable ASCII
Enum.to_list(0x20..0x7E),
# Basic Multilingual Plane (BMP)
Enum.to_list(0xA0..0xD7FF),
Enum.to_list(0xE000..0xFFFD),
# 32 bit
Enum.to_list(0x010000..0x10FFFF)
])

@not_supported_by_elixir Enum.to_list(0xD800..0xDFFF)

# Non-Printable Characters - all code points minus the union of printable chars, escape chars and the surrogate range (not supported by Elixir):
@non_printable_chars Enum.to_list(0..0x10FFFF) --
(@printable_chars ++
@escape_if_within_double_quotes ++ @not_supported_by_elixir)

# Chars that, if contained within, force the string to be double-quoted:
@chars_forcing_double_quotes_strings Enum.map(
@non_printable_chars ++
@escape_chars_forcing_double_quotes,
&<<&1::utf8>>
)

@doc ~S"""
Encodes the given data as YAML string. Raises if it cannot be encoded.

Expand Down Expand Up @@ -146,6 +253,7 @@ defmodule Ymlr.Encode do
defp encode_binary(data, indent_level) do
cond do
data == "" -> ~S('')
data == "~" -> ~S('~')
data == "\n" -> ~S("\n")
data == "null" -> ~S('null')
data == "yes" -> ~S('yes')
Expand All @@ -155,7 +263,7 @@ defmodule Ymlr.Encode do
data == "True" -> ~S('True')
data == "False" -> ~S('False')
String.contains?(data, "\n") -> multiline(data, indent_level)
String.contains?(data, "\t") -> ~s("#{data}")
String.contains?(data, @chars_forcing_double_quotes_strings) -> with_double_quotes(data)
String.at(data, 0) in @quote_when_first -> with_quotes(data)
String.at(data, -1) in @quote_when_last -> with_quotes(data)
String.starts_with?(data, "- ") -> with_quotes(data)
Expand Down Expand Up @@ -187,16 +295,41 @@ defmodule Ymlr.Encode do

defp with_quotes(data) do
if String.contains?(data, "'") do
~s("#{escape(data)}")
with_double_quotes(data)
else
~s('#{data}')
with_single_quotes(data)
end
end

# Wraps `data` in double quotes after passing it through `escape/1`.
defp with_double_quotes(data) do
~s("#{escape(data)}")
end

# Wraps `data` in single quotes as-is; no escaping is applied here.
defp with_single_quotes(data), do: ~s('#{data}')

defp escape(data) do
data |> String.replace("\\", "\\\\") |> String.replace(~s("), ~s(\\"))
for <<char::utf8 <- data>> do
escape_char(char)
end
end

# Generates one `escape_char/1` clause per `{char, escaped}` pair at compile time.
# Each clause returns a two-byte binary: a backslash followed by the escape letter.
for {char, escaped} <- @escape_if_within_double_quotes_mapping do
defp escape_char(unquote(char)), do: <<?\\, unquote(escaped)>>
end

# Generates one `escape_char/1` clause per non-printable code point at compile
# time. Each clause returns the YAML unicode escape sequence for that code point,
# using the shortest form that fits:
#
#   * `\xXX`       (2 hex digits) for code points up to 0xFF
#   * `\uXXXX`     (4 hex digits) for code points up to 0xFFFF
#   * `\UXXXXXXXX` (8 hex digits) for everything above
#
# YAML requires exactly 8 hex digits after `\U`; a 6-digit sequence would not
# be a valid escape. The `~W.16.0B` directive prints the integer in base 16,
# left-padded with zeros to width W.
for uchar <- @non_printable_chars do
  unicode_sequence =
    cond do
      uchar <= 0xFF -> List.to_string(:io_lib.format("\\x~2.16.0B", [uchar]))
      uchar <= 0xFFFF -> List.to_string(:io_lib.format("\\u~4.16.0B", [uchar]))
      true -> List.to_string(:io_lib.format("\\U~8.16.0B", [uchar]))
    end

  defp escape_char(unquote(uchar)), do: unquote(unicode_sequence)
end

# Fallback clause: a code point matched by none of the generated clauses is
# returned unchanged.
defp escape_char(char), do: char

# for example for map keys
defp multiline(data, nil), do: inspect(data)
# see https://yaml-multiline.info/
Expand Down