From e0d3f3f11425edfadc0fe83c88c8f47b47a58754 Mon Sep 17 00:00:00 2001 From: Roman Tsisyk Date: Fri, 29 May 2015 17:58:25 +0300 Subject: [PATCH] Fix #354: yaml incorrectly escapces special characters in a string --- test/app/json.result | 6 +- test/app/lua/serializer_test.lua | 6 +- test/app/msgpack.result | 6 +- test/app/msgpackffi.result | 6 +- test/app/yaml.result | 10 +- test/app/yaml.test.lua | 8 +- test/box/net.box.result | 29 +-- third_party/lua-yaml/LICENSE | 1 + third_party/lua-yaml/lyaml.cc | 350 +++++++++++++++++++++---------- 9 files changed, 285 insertions(+), 137 deletions(-) diff --git a/test/app/json.result b/test/app/json.result index a899ccf6befc..a2458d809df1 100644 --- a/test/app/json.result +++ b/test/app/json.result @@ -203,11 +203,15 @@ ok - double # boolean: end ok - boolean # string - 1..4 + 1..8 ok - encode/decode for ok - encode/decode for abcde ok - encode/decode for Кудыкины горы ok - encode/decode for xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx + ok - encode/decode for $a $ + ok - encode/decode for $a $ + ok - encode/decode for $a\t $ + ok - encode/decode for $a\\t $ # string: end ok - string # nil diff --git a/test/app/lua/serializer_test.lua b/test/app/lua/serializer_test.lua index a5aed744031e..0226f1e23817 100644 --- a/test/app/lua/serializer_test.lua +++ b/test/app/lua/serializer_test.lua @@ -202,11 +202,15 @@ local function test_boolean(test, s) end local function test_string(test, s) - test:plan(4) + test:plan(8) rt(test, s, "") rt(test, s, "abcde") rt(test, s, "Кудыкины горы") -- utf-8 rt(test, s, string.rep("x", 33)) + rt(test, s, '$a\t $') + rt(test, s, '$a\t $') + rt(test, s, [[$a\t $]]) + rt(test, s, [[$a\\t $]]) end local function test_nil(test, s) diff --git a/test/app/msgpack.result b/test/app/msgpack.result index 986a733911f4..c6e2b38bc846 100644 --- a/test/app/msgpack.result +++ b/test/app/msgpack.result @@ -203,11 +203,15 @@ ok - double # boolean: end ok - boolean # string - 1..4 + 1..8 ok - encode/decode for ok - encode/decode for abcde ok - encode/decode for Кудыкины горы ok - encode/decode for xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx + ok - encode/decode for $a $ + ok - encode/decode for $a $ + ok - encode/decode for $a\t $ + ok - encode/decode for $a\\t $ # string: end ok - string # nil diff --git a/test/app/msgpackffi.result b/test/app/msgpackffi.result index 1a966dcc71c7..ab2f9c1a3f8d 100644 --- a/test/app/msgpackffi.result +++ b/test/app/msgpackffi.result @@ -197,11 +197,15 @@ ok - double # boolean: end ok - boolean # string - 1..4 + 1..8 ok - encode/decode for ok - encode/decode for abcde ok - encode/decode for Кудыкины горы ok - encode/decode for xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx + ok - encode/decode for $a $ + ok - encode/decode for $a $ + ok - encode/decode for $a\t $ + ok - encode/decode for $a\\t $ # string: end ok - string # nil diff --git a/test/app/yaml.result b/test/app/yaml.result index 872f589c6425..d0d1308c7bab 100644 --- a/test/app/yaml.result +++ b/test/app/yaml.result @@ -203,11 +203,15 @@ ok - double # boolean: end ok - boolean # string - 1..4 + 1..8 ok - encode/decode for ok - encode/decode for abcde ok - encode/decode for Кудыкины горы ok - encode/decode for xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx + ok - encode/decode for $a $ + ok - encode/decode for $a $ + ok - encode/decode for $a\t $ + ok - encode/decode for $a\\t $ # string: end ok - string # nil @@ -278,13 +282,15 @@ ok - ucdata # compact: end ok - compact # output - 1..6 + 1..8 ok - encode for true ok - decode for 'yes' ok - encode for false ok - decode for 'no' ok - encode for nil ok - decode for ~ + ok - encode for binary + ok - tutorial string # output: end ok - output # yaml: end diff --git a/test/app/yaml.test.lua b/test/app/yaml.test.lua index d339a41f68d4..c931a4eb1c52 100755 --- a/test/app/yaml.test.lua +++ b/test/app/yaml.test.lua @@ -34,7 +34,7 @@ local function test_compact(test, s) test:is(ss.encode(setmetatable({k = 'v'}, { __serialize="mapping"})), "---\nk: v\n...\n", "block map") test:is(ss.encode({setmetatable({k = 'v'}, { __serialize="map"})}), - "---\n- {k: v}\n...\n", "flow map") + "---\n- {'k': 'v'}\n...\n", "flow map") test:is(getmetatable(ss.decode(ss.encode({k = 'v'}))).__serialize, "map", "decoded __serialize is map") @@ -42,13 +42,17 @@ local function test_compact(test, s) end local function test_output(test, s) - test:plan(6) + test:plan(8) test:is(s.encode({true}), '---\n- true\n...\n', "encode for true") test:is(s.decode("---\nyes\n..."), true, "decode for 'yes'") test:is(s.encode({false}), '---\n- false\n...\n', "encode for false") test:is(s.decode("---\nno\n..."), false, "decode for 'no'") test:is(s.encode({s.NULL}), '---\n- null\n...\n', "encode for nil") test:is(s.decode("---\n~\n..."), s.NULL, "decode for ~") + test:is(s.encode("\x80\x92\xe8s\x16"), '--- !!binary gJLocxY=\n...\n', + "encode for binary") + test:is(s.encode("Tutorial -- Header\n====\n\nText"), + "--- |-\n Tutorial -- Header\n ====\n\n Text\n...\n", "tutorial string"); end tap.test("yaml", function(test) diff --git a/test/box/net.box.result b/test/box/net.box.result index 6f3f53d61eac..96c3d152c8d5 100644 --- a/test/box/net.box.result +++ b/test/box/net.box.result @@ -106,9 +106,9 @@ cn:eval('return ...', 1, 2, 3) ... cn:eval('return { k = "v1" }, true, { xx = 10, yy = 15 }, nil') --- -- {k: v1} +- {'k': 'v1'} - true -- {yy: 15, xx: 10} +- {'yy': 15, 'xx': 10} - null ... cn:eval('return nil') @@ -632,41 +632,28 @@ cnc.console ~= nil ... cnc:console('return 1, 2, 3, "string", nil') --- -- '--- - +- | + --- - 1 - - 2 - - 3 - - string - - null - ... - -' ... cnc:console('error("test")') --- -- '--- - +- | + --- - error: test - ... - -' ... cnc:console('a = {1, 2, 3, 4}; return a[3]') --- -- '--- - +- | + --- - 3 - ... - -' ... -- #545 user or password is not defined remote:new(LISTEN.host, LISTEN.service, { user = 'test' }) diff --git a/third_party/lua-yaml/LICENSE b/third_party/lua-yaml/LICENSE index c675c09aa835..97b5fcfa508a 100644 --- a/third_party/lua-yaml/LICENSE +++ b/third_party/lua-yaml/LICENSE @@ -1,4 +1,5 @@ Copyright (c) 2009, Andrew Danforth +Copyright (c) 2013-2015, Tarantool Authors Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal diff --git a/third_party/lua-yaml/lyaml.cc b/third_party/lua-yaml/lyaml.cc index 3530ed4acce8..ac7f59b0ab81 100644 --- a/third_party/lua-yaml/lyaml.cc +++ b/third_party/lua-yaml/lyaml.cc @@ -356,76 +356,6 @@ static int l_load(lua_State *L) { static int dump_node(struct lua_yaml_dumper *dumper); -/* - Copyright (c) 2013 Palard Julien. All rights reserved. - - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions - are met: - 1. Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - 2. Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in the - documentation and/or other materials provided with the distribution. - - THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND - ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE - FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS - OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY - OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - SUCH DAMAGE. - - Check if the given unsigned char * is a valid utf-8 sequence. - - Return value : - If the string is valid utf-8, 1 is returned. - Else, 0 is returned. - - Valid utf-8 sequences look like this : - 0xxxxxxx - 110xxxxx 10xxxxxx - 1110xxxx 10xxxxxx 10xxxxxx - 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx - 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx - 1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx -*/ -static int is_utf8(const char *data, size_t len) -{ - const unsigned char *str = (const unsigned char *) data; - size_t i = 0; - size_t continuation_bytes = 0; - - while (i < len) - { - if (str[i] <= 0x7F) - continuation_bytes = 0; - else if (str[i] >= 0xC0 /*11000000*/ && str[i] <= 0xDF /*11011111*/) - continuation_bytes = 1; - else if (str[i] >= 0xE0 /*11100000*/ && str[i] <= 0xEF /*11101111*/) - continuation_bytes = 2; - else if (str[i] >= 0xF0 /*11110000*/ && str[i] <= 0xF4 /* Cause of RFC 3629 */) - continuation_bytes = 3; - else - return 0; - i += 1; - while (i < len && continuation_bytes > 0 - && str[i] >= 0x80 - && str[i] <= 0xBF) - { - i += 1; - continuation_bytes -= 1; - } - if (continuation_bytes != 0) - return 0; - } - return 1; -} - static yaml_char_t *get_yaml_anchor(struct lua_yaml_dumper *dumper) { const char *s = ""; lua_pushvalue(dumper->L, -1); @@ -514,6 +444,237 @@ static int dump_null(struct lua_yaml_dumper *dumper) { return yaml_emitter_emit(&dumper->emitter, &ev); } +static yaml_scalar_style_t analyze_string(struct lua_yaml_dumper *dumper, + const char *str, size_t len, int *is_binary) +{ + *is_binary = 0; + + /** + * This function ported from PyYAML implementation. + * PyYAML has same authors and licence as LibYAML. See License.LibYaml + * https://bitbucket.org/xi/pyyaml/src/ddf211a41bb231c365fece5599b7e484e6dc33fc/lib/yaml/emitter.py?at=default#cl-629 + */ + + /* + * Fast checks + */ + /* Display empty scalar as plain */ + if (len == 0) + return YAML_PLAIN_SCALAR_STYLE; + + /* Special string values */ + if (len <= 5 && (!strcmp(str, "true") + || !strcmp(str, "false") + || !strcmp(str, "~") + || !strcmp(str, "null"))) { + return YAML_SINGLE_QUOTED_SCALAR_STYLE; + } + + /* Check document indicators. */ + if (len >= 3 && (memcmp(str, "---", 3) == 0 || memcmp(str, "...", 3) == 0)) + return YAML_LITERAL_SCALAR_STYLE; + + /* Allowed styles */ + bool allowPlain = true; + bool allowSingleQuoted = true; + + /* Indicators and special characters. */ + bool blockIndicators = false; + bool flowIndicators = false; + bool lineBreaks = false; + bool specialCharacters = false; + + /* Important whitespace combinations. */ + bool leadingSpace = false; + bool leadingBreak = false; + bool trailingSpace = false; + bool trailingBreak = false; + bool breakSpace = false; + bool spaceBreak = false; + bool emptyLines = false; + bool previousSpace = false; + bool previousBreak = false; + + const unsigned char *s = (const unsigned char *) str; + const unsigned char *p = s; + const unsigned char *e = s + len; + while (p < e) { + if (*p > 0x7F) { + /* UTF-8 */ + + int continuation_bytes = 0; + if (*p >= 0xC0 && *p <= 0xDF) { + continuation_bytes = 1; + } else if (*p >= 0xE0 && *p <= 0xEF /*11101111*/) { + continuation_bytes = 2; + } else if (*p >= 0xF0 && *p <= 0xF4) { + continuation_bytes = 3; + } else { + /* Invalid UTF-8 */ + *is_binary = 1; + return YAML_PLAIN_SCALAR_STYLE; + } + + ++p; + while (p < e && continuation_bytes > 0 && *p >= 0x80 && *p <= 0xBF) { + ++p; + continuation_bytes -= 1; + } + if (continuation_bytes != 0) { + /* Invalid UTF-8 */ + *is_binary = 1; + return YAML_PLAIN_SCALAR_STYLE; + } else { + continue; + } + } + + /* ASCII */ + + bool preceededByWhitespace = (p > s) && + strchr(" \t\r\n\x85", *(p - 1)) != NULL; + bool followedByWhitespace = (p + 1 >= s + len) || + strchr(" \t\r\n\x85", *(p + 1)) != NULL; + + /* Check for line breaks and special characters */ + bool isLineBreak = false; + if (*p == '\n' || *p == 0x85) { + lineBreaks = isLineBreak = true; + } else if (*p < 0x20) { + specialCharacters = true; + } + + /* Check for indicators. */ + if (p == s) { + /* Leading indicators are special characters. */ + if (strchr("#,[]{}&*!|>\'\"%@`", *p) != NULL) { + flowIndicators = true; + blockIndicators = true; + } + if (*p == '?' || *p == ':') { + flowIndicators = true; + if (followedByWhitespace) + blockIndicators = true; + } + if (*p == '-' && followedByWhitespace) { + flowIndicators = true; + blockIndicators = true; + } + } else { + if (isLineBreak && *(p - 1) == '\n') + emptyLines = true; + /* Some indicators cannot appear within a scalar as well. */ + if (strchr(",?[]{}", *p) != NULL) { + flowIndicators = true; + } + if (*p == ':') { + flowIndicators = true; + if (followedByWhitespace) { + blockIndicators = true; + } + } + if (*p == '#' && preceededByWhitespace) { + flowIndicators = true; + blockIndicators = true; + } + } + + /* Detect important whitespace combinations. */ + if (*p == ' ') { + if (p == s) + leadingSpace = true; + if (p == s + len - 1) + trailingSpace = true; + if (previousBreak) + breakSpace = true; + previousSpace = true; + previousBreak = false; + } else if (isLineBreak) { + if (p == s) + leadingBreak = true; + if (p == s + len - 1) + trailingBreak = true; + if (previousSpace) + spaceBreak = true; + previousSpace = false; + previousBreak = true; + } else { + previousSpace = false; + previousBreak = false; + } + + ++p; + } + + /* + * Tarantool-specific: use literal style for string with empty lines. + * Useful for tutorial(). + */ + if (emptyLines) + return YAML_LITERAL_SCALAR_STYLE; + + bool flowMode = false; + if (dumper->emitter.flow_level > 0) { + flowMode = true; + } else { + yaml_event_t *evp; + for (evp = dumper->emitter.events.head; + evp != dumper->emitter.events.tail; evp++) { + if ((evp->type == YAML_SEQUENCE_START_EVENT && + evp->data.sequence_start.style == YAML_FLOW_SEQUENCE_STYLE) || + (evp->type == YAML_MAPPING_START_EVENT && + evp->data.mapping_start.style == YAML_FLOW_MAPPING_STYLE)) { + flowMode = true; + break; + } + } + } + + /* Let's decide what styles are allowed. */ + + /* + * Spaces followed by breaks, as well as special character are only + * allowed for double quoted scalars. + */ + if (spaceBreak || specialCharacters) + return YAML_DOUBLE_QUOTED_SCALAR_STYLE; + + /* + * Spaces at the beginning of a new line are only acceptable for block + * scalars + */ + if (breakSpace) + allowPlain = allowSingleQuoted = false; + + /* Leading and trailing whitespaces are bad for plain scalars. */ + if (leadingSpace || leadingBreak || trailingSpace || trailingBreak) + allowPlain = false; + + if (flowMode) { + //if (flowMode && flowIndicators) + // allowPlain = false; + /* + * Tarantool-specific: always quote strings in FLOW SEQUENCE + * Flow: [1, 'a', 'testing'] + * Block: + * - 1 + * - a + * - testing + */ + allowPlain = false; + } else /* blockMode */ { + /* Block indicators are forbidden for block plain scalars. */ + if (blockIndicators) + allowPlain = false; + } + + if (allowPlain) + return YAML_PLAIN_SCALAR_STYLE; + else if (allowSingleQuoted) + return YAML_SINGLE_QUOTED_SCALAR_STYLE; + return YAML_DOUBLE_QUOTED_SCALAR_STYLE; +} + static int dump_node(struct lua_yaml_dumper *dumper) { size_t len; @@ -555,49 +716,22 @@ static int dump_node(struct lua_yaml_dumper *dumper) return dump_array(dumper, &field); case MP_MAP: return dump_table(dumper, &field); - case MP_BIN: case MP_STR: str = lua_tolstring(dumper->L, -1, &len); - if (field.type == MP_BIN || !is_utf8(str, len)) { - is_binary = 1; - tobase64(dumper->L, -1); - str = lua_tolstring(dumper->L, -1, &len); - tag = (yaml_char_t *) LUAYAML_TAG_PREFIX "binary"; - break; - } - - /* - * Always quote strings in FLOW SEQUENCE - * Flow: [1, 'a', 'testing'] - * Block: - * - 1 - * - a - * - testing - */ - if (dumper->emitter.flow_level > 0) { - style = YAML_SINGLE_QUOTED_SCALAR_STYLE; - break; - } - - for (evp = dumper->emitter.events.head; - evp != dumper->emitter.events.tail; evp++) { - if (evp->type == YAML_SEQUENCE_START_EVENT && - evp->data.sequence_start.style == YAML_FLOW_SEQUENCE_STYLE) { - style = YAML_SINGLE_QUOTED_SCALAR_STYLE; - goto strbreak; - } - } - - if (len <= 5 && (!strcmp(str, "true") - || !strcmp(str, "false") - || !strcmp(str, "~") - || !strcmp(str, "null"))) { - style = YAML_SINGLE_QUOTED_SCALAR_STYLE; - } else if (lua_isnumber(dumper->L, -1)) { + if (lua_isnumber(dumper->L, -1)) { /* string is convertible to number, quote it to preserve type */ style = YAML_SINGLE_QUOTED_SCALAR_STYLE; + break; } - strbreak: + style = analyze_string(dumper, str, len, &is_binary); + if (!is_binary) + break; + /* Fall through */ + case MP_BIN: + is_binary = 1; + tobase64(dumper->L, -1); + str = lua_tolstring(dumper->L, -1, &len); + tag = (yaml_char_t *) LUAYAML_TAG_PREFIX "binary"; break; case MP_BOOL: if (field.bval) {