diff --git a/BENCHMARK.md b/BENCHMARK.md index 8a71727..70d8678 100644 --- a/BENCHMARK.md +++ b/BENCHMARK.md @@ -1,6 +1,6 @@ Benchmark -Benchmark run from 2023-11-18 14:03:41.157149Z UTC +Benchmark run from 2023-11-23 17:53:11.749718Z UTC ## System @@ -65,20 +65,20 @@ Run Time Jason - 267.76 - 3.73 ms - ±14.12% - 3.45 ms - 4.99 ms + 248.88 + 4.02 ms + ±14.75% + 3.81 ms + 5.72 ms Ymlr - 22.80 - 43.87 ms - ±2.48% - 43.77 ms - 48.11 ms + 3.27 + 305.72 ms + ±1.14% + 305.02 ms + 317.72 ms @@ -93,14 +93,14 @@ Run Time Comparison Slower Jason - 267.76 + 248.88   Ymlr - 22.80 - 11.75x + 3.27 + 76.09x @@ -122,8 +122,8 @@ Memory Usage Ymlr - 40.60 MB - 8.45x + 68.18 MB + 14.19x @@ -145,20 +145,20 @@ Run Time Jason - 121.07 - 8.26 ms - ±14.07% - 7.82 ms - 13.76 ms + 121.34 + 8.24 ms + ±142.36% + 7.35 ms + 13.61 ms Ymlr - 22.00 - 45.45 ms - ±19.87% - 45.58 ms - 59.34 ms + 21.10 + 47.39 ms + ±62.18% + 43.84 ms + 319.16 ms @@ -173,14 +173,14 @@ Run Time Comparison Slower Jason - 121.07 + 121.34   Ymlr - 22.00 - 5.5x + 21.10 + 5.75x @@ -202,7 +202,7 @@ Memory Usage Ymlr - 50.08 MB + 50.09 MB 5.43x @@ -225,20 +225,20 @@ Run Time Jason - 349.85 - 2.86 ms - ±6.74% - 2.81 ms - 3.49 ms + 368.32 + 2.72 ms + ±251.44% + 2.57 ms + 3.29 ms Ymlr - 28.33 - 35.30 ms - ±4.73% - 35.12 ms - 45.67 ms + 4.76 + 210.14 ms + ±0.45% + 209.83 ms + 211.96 ms @@ -253,14 +253,14 @@ Run Time Comparison Slower Jason - 349.85 + 368.32   Ymlr - 28.33 - 12.35x + 4.76 + 77.4x @@ -282,7 +282,7 @@ Memory Usage Ymlr - 47.48 MB - 18.67x + 65.86 MB + 25.89x \ No newline at end of file diff --git a/CHANGELOG.md b/CHANGELOG.md index 971a4cc..3ebe587 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -10,6 +10,10 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/). 
+### Added + +- Support for escape and unicode characters [#98](https://github.com/ufirstgroup/ymlr/pull/98) + ### Added diff --git a/lib/ymlr/encode.ex b/lib/ymlr/encode.ex index 9c8ff17..a4c1ee5 100644 --- a/lib/ymlr/encode.ex +++ b/lib/ymlr/encode.ex @@ -1,5 +1,67 @@ defmodule Ymlr.Encode do - @moduledoc false + @moduledoc ~S""" + + This module implements the logic of encoding scalars. + + ## Strings and Characters + + ### Printable Characters + + The YAML spec defines a set of printable characters `c-printable` (see + https://yaml.org/spec/1.2.2/#character-set). All these characters can + theoretically be left alone when encoding a string. + + ### Escape Characters + + The YAML spec also defines a set of escape characters `c-ns-esc-char` (see + https://yaml.org/spec/1.2.2/#57-escaped-characters). Some of these chars are + also in the printable range `c-printable`. Being in `c-printable` means they + could be left alone. I.e. there would be no need to encode them as escape + chars. However, we think in certain cases, escape characters are more + reader friendly than the actual characters. An example is the "next line" + character (`U+0085` or `\N`). It is part of `c-printable`. However, on the + screen this character cannot be distinguished from a simple "line feed" + character (`U+000A` or `\n`). Therefore all characters in `c-ns-esc-char` with + the exception of `\n` and `\t` are always encoded using their escape character. + + ### Other 8-bit Unicode Characters + + Any 8-bit unicode character that is neither a printable nor an escape character + has to be encoded using one of the three unicode escape characters \x, \u or + \U (i.e. \xXX, \u00XX or \U000000XX). + + ### Double Quotes for Escape Characters + + Printable Characters can be encoded unquoted, single-quoted or double-quoted. + Escape characters require double quotes. 
+ + ### Chars with Special Treatments + + #### Chars `\n` and `\t` + + These two characters are never converted to their escape characters. + One exception: If the given string is literally just a newline, we + encode it as "\n" (double quotes required for escape chars) rather than a + single newline. + + #### Chars `"` and `\` + + These two characters have escape characters (`\"` and `\\`) but they are also + part of the printable character range `c-printable` and they have a + well-defined presentation on the screen. Occurrences of these characters don't + enforce double-quotes but if they occur within a string that for other reasons + requires double-quotes, they need to be escaped. + + ### Implemented Decision Logic + + First matching rule is applied: + + 1. Char is `\t` or `\n` => leave alone + 1. Char is `"` or `\` => if within double quotes, escape. Otherwise leave alone. + 1. Char has an escape character (i.e. is part of `c-ns-esc-char`) => force double quotes and encode as escape character + 1. Char is a printable character => leave alone + 1. Char is a non-printable character => force double quotes and encode as \xXX (only 8-bit supported for now) + """ alias Ymlr.Encoder @@ -38,6 +100,51 @@ defmodule Ymlr.Encode do ":" ] + # Escape chars that, if contained within, force the string to be double-quoted: + @escape_chars_forcing_double_quotes ~c"\a\b\e\f\r\v\0\u00a0\u0085\u2028\u2029" + + # Chars that have to be escaped if within double quotes: + @escape_if_within_double_quotes @escape_chars_forcing_double_quotes ++ ~c"\"\\" + + # mapping char => escape char. + @escape_if_within_double_quotes_mapping Enum.zip( + @escape_if_within_double_quotes, + ~c"abefrv0_NLP\"\\" + ) + + # Printable Characters (8-bit only for now): + @printable_chars List.flatten([ + # Tab (\t) + 0x09, + # Line feed (LF \n) + 0x0A, + # Carriage Return (CR \r) + # 0x0D, theoretically printable, seems to require double quotes. 
+ # Next Line (NEL) + 0x85, + # Printable ASCII + Enum.to_list(0x20..0x7E), + # Basic Multilingual Plane (BMP) + Enum.to_list(0xA0..0xD7FF), + Enum.to_list(0xE000..0xFFFD), + # 32 bit + Enum.to_list(0x010000..0x10FFFF) + ]) + + @not_supported_by_elixir Enum.to_list(0xD800..0xDFFF) + + # Non-Printable Characters (8-bit only for now) - all chars minus union of printable and escape chars: + @non_printable_chars Enum.to_list(0..0x10FFFF) -- + (@printable_chars ++ + @escape_if_within_double_quotes ++ @not_supported_by_elixir) + + # Chars that, if contained within, force the string to be double-quoted: + @chars_forcing_double_quotes_strings Enum.map( + @non_printable_chars ++ + @escape_chars_forcing_double_quotes, + &<<&1::utf8>> + ) + @doc ~S""" Encodes the given data as YAML string. Raises if it cannot be encoded. @@ -146,6 +253,7 @@ defmodule Ymlr.Encode do defp encode_binary(data, indent_level) do cond do data == "" -> ~S('') + data == "~" -> ~S('~') data == "\n" -> ~S("\n") data == "null" -> ~S('null') data == "yes" -> ~S('yes') @@ -155,7 +263,7 @@ defmodule Ymlr.Encode do data == "True" -> ~S('True') data == "False" -> ~S('False') String.contains?(data, "\n") -> multiline(data, indent_level) - String.contains?(data, "\t") -> ~s("#{data}") + String.contains?(data, @chars_forcing_double_quotes_strings) -> with_double_quotes(data) String.at(data, 0) in @quote_when_first -> with_quotes(data) String.at(data, -1) in @quote_when_last -> with_quotes(data) String.starts_with?(data, "- ") -> with_quotes(data) @@ -187,16 +295,41 @@ defmodule Ymlr.Encode do defp with_quotes(data) do if String.contains?(data, "'") do - ~s("#{escape(data)}") + with_double_quotes(data) else - ~s('#{data}') + with_single_quotes(data) end end + defp with_double_quotes(data) do + ~s("#{escape(data)}") + end + + defp with_single_quotes(data), do: ~s('#{data}') + defp escape(data) do - data |> String.replace("\\", "\\\\") |> String.replace(~s("), ~s(\\")) + for <> do + escape_char(char) + end + end + 
+ for {char, escaped} <- @escape_if_within_double_quotes_mapping do + defp escape_char(unquote(char)), do: <> end + for uchar <- @non_printable_chars do + unicode_sequence = + case uchar do + uchar when uchar <= 0xFF -> List.to_string(:io_lib.format("\\x~2.16.0B", [uchar])) + uchar when uchar <= 0xFFFF -> List.to_string(:io_lib.format("\\u~4.16.0B", [uchar])) + uchar -> List.to_string(:io_lib.format("\\U~6.16.0B", [uchar])) + end + + defp escape_char(unquote(uchar)), do: unquote(unicode_sequence) + end + + defp escape_char(char), do: char + # for example for map keys defp multiline(data, nil), do: inspect(data) # see https://yaml-multiline.info/ diff --git a/test/ymlr/encode_test.exs b/test/ymlr/encode_test.exs index 5eeb9da..a198484 100644 --- a/test/ymlr/encode_test.exs +++ b/test/ymlr/encode_test.exs @@ -15,8 +15,9 @@ defmodule Ymlr.EncodeTest do assert_identity_and_output("", "''") end - test "simple string" do + test "plain strings" do assert_identity_and_output("hello world", "hello world") + assert_identity_and_output("that's it", "that's it") end # see http://blogs.perl.org/users/tinita/2018/03/strings-in-yaml---to-quote-or-not-to-quote.html @@ -24,6 +25,7 @@ defmodule Ymlr.EncodeTest do test "quoted strings - avoid type confusion" do assert_identity_and_output("yes", ~S('yes')) assert_identity_and_output("no", ~S('no')) + assert_identity_and_output("~", "'~'") assert_identity_and_output("true", ~S('true')) assert_identity_and_output("false", ~S('false')) assert_identity_and_output("True", ~S('True')) @@ -78,9 +80,9 @@ defmodule Ymlr.EncodeTest do assert_identity_and_output("some:entry:", ~S('some:entry:')) end - test "quoted strings - escape seq forces double quotes (tab char)" do - assert_identity_and_output("a\tb", ~s("a\tb")) - assert_identity_and_output("!a\tb", ~s("!a\tb")) + test "quoted strings - tab char with and without quotes" do + assert_identity_and_output("a\tb", ~s(a\tb)) + assert_identity_and_output("!a\tb", ~s('!a\tb')) # Not for explicit 
backslash: assert_identity_and_output(~S(!a\tb), ~S('!a\tb')) end @@ -89,7 +91,7 @@ defmodule Ymlr.EncodeTest do # ... (prefer single quotes) assert_identity_and_output("[]", ~S('[]')) assert_identity_and_output(~S(["hello"]), ~S('["hello"]')) - assert_identity_and_output(~S(["he|\o"]), ~S('["he|\o"]')) + assert_identity_and_output(~S(["he|\o"]), ~s('["he|\\o"]')) assert_identity_and_output("{}", ~S('{}')) assert_identity_and_output("[{}]", ~S('[{}]')) # ... (use double quotes if string contains single quotes) @@ -111,27 +113,75 @@ defmodule Ymlr.EncodeTest do end end - @tag skip: "not sure about those => to be reviewed" # https://yaml.org/spec/1.2.2/#example-escaped-characters test "quoted strings - example-escaped-characters from 1.2.2 spec" do - assert_identity_and_output("Fun with \\", ~S("Fun with \\")) + assert_identity_and_output(~S(Fun with \\), ~S(Fun with \\)) assert_identity_and_output("\" \u0007 \b \u001b \f", ~S("\" \a \b \e \f")) - # assert_identity_and_output("\n \r \t \u000b \u0000", ~S("\n \r \t \v \0")) - # or we use | when string contains newlines => rewrite the example to: - assert_identity_and_output("\r \t \u000b \u0000", ~S("\r \t \v \0")) - assert_identity_and_output("\u0020 \u00a0 \u0085 \u2028 \u2029", ~S("\ \_ \N \L \P")) + # Line breaks inside scalar content must be normalized by the YAML processor. + # Each such line break must be parsed into a single line feed character. + # The original line break format is a presentation detail and must not be + # used to convey content information. + # I.e. the following cannot be tested for identity as \r will be parsed as \n. + assert_output("\n\r \t \u000b \u0000", "|-\n\n \r \t \v \0") + assert_identity_and_output("\r \t \u000b \u0000", ~s("\\r \t \\v \\0")) + + assert_identity_and_output( + "\u0020 \u00a0 \u0085 \u2028 \u2029", + ~S(" \_ \N \L \P") + ) + + # Possible formats: \x13 \u0013 \U00000013. 
+ assert_identity_and_output( + "\u0013\uFFFD\uFFFE\u{10FFFF}", + "\"\\x13\uFFFD\\uFFFE\u{10FFFF}\"" + ) end - @tag skip: "not sure about those => review the spec" test "quoted strings - in map key (requires escape char)" do - assert_identity_and_output(%{"a\tb" => "value"}, ~s("a\tb": value)) - assert_identity_and_output(%{"a\rb" => "value"}, ~s("a\rb": value)) + assert_identity_and_output(%{"a\tb" => "value"}, ~s(a\tb: value)) + assert_identity_and_output(%{"a\rb" => "value"}, ~s("a\\rb": value)) end test "newline in map key" do assert_identity_and_output(%{"a\nb" => "value"}, ~S("a\nb": value)) end + test "backslash" do + # in plain string + assert assert_identity_and_output(~S(a\b), ~S(a\b)) + # in single quote string + assert assert_identity_and_output(~S(!a\b), ~S('!a\b')) + # double quotes because of single quote + assert assert_identity_and_output(~s(!a'b\\c), ~S("!a'b\\c")) + # double quotes because of tab + assert assert_identity_and_output(~s(a\tb\\c), ~s(a\tb\\c)) + end + + test "backslash in map key" do + # in plain string + assert assert_identity_and_output(%{~S(a\b) => "value"}, ~S(a\b: value)) + # in single quote string + assert assert_identity_and_output(%{~S(!a\b) => "value"}, ~S('!a\b': value)) + # double quotes because of single quote + assert assert_identity_and_output(%{~s(a'b\\c) => "value"}, ~s(a'b\\c: value)) + # double quotes because of tab + assert assert_identity_and_output(%{~s(a\tb\\c) => "value"}, ~s(a\tb\\c: value)) + end + + test "tab" do + # would be plain string without the tab + assert assert_identity_and_output("a\tb", ~s(a\tb)) + # would be single quoted string without the tab + assert assert_identity_and_output("!a\tb", ~s('!a\tb')) + end + + test "tab in map key" do + # would be plain string without the tab + assert assert_identity_and_output(%{"a\tb" => "value"}, ~s(a\tb: value)) + # would be single quoted string without the tab + assert assert_identity_and_output(%{"!a\tb" => "value"}, ~s('!a\tb': value)) + end + test 
"integers" do assert_identity_and_output(1, "1") end @@ -275,6 +325,11 @@ defmodule Ymlr.EncodeTest do end # see https://yaml-multiline.info/ + @tag skip: "still buggy" + test "multiline strings - starting with spaces" do + assert_identity_and_output("\n abc", "|-\n\n abc") + assert_identity_and_output(" abc\nabc", "|-\n abc\n abc") + end test "multiline strings - base cases" do assert_identity_and_output("a\n b\nc", "|-\n a\n b\n c") @@ -406,6 +461,14 @@ defmodule Ymlr.EncodeTest do }) end + test "tab(s) and newline(s) in the same string" do + assert_identity_and_output("a\tb\nc", "|-\n a\tb\n c") + # with extra whitespaces around the newline + assert_identity_and_output("a\tb \n c", "|-\n a\tb \n c") + # with backslash + assert_identity_and_output(~s(a\tb\nc\\w), "|-\n a\tb\n c\\w") + end + test "date" do assert_output(~D[2016-05-24], "2016-05-24") end