parser: treat UTF-8 NBSP (U+00A0, bytes C2 A0) as white space

The scanner previously listed the single byte \xa0 as a blank, which is not a
valid UTF-8 encoding of NO-BREAK SPACE. This patch makes the two-byte sequence
C2 A0 a blank and removes exactly that sequence from the set of characters
allowed in labels. The byte A0 remains a legal continuation byte in every
other multi-byte sequence (for example U+4F60 = E4 BD A0), so such characters
are still accepted in labels.

NOTE(review): the `index` lines are carried over from the original patch; the
post-image hashes do not describe the amended content.
NOTE(review): line breaks and the whitespace inside context lines were
reconstructed from a collapsed copy of the patch - confirm them against the
target files before applying.

diff --git a/src/parser/scanner.lex b/src/parser/scanner.lex
index a6fedde4a68..cbc40b42b17 100644
--- a/src/parser/scanner.lex
+++ b/src/parser/scanner.lex
@@ -35,7 +35,9 @@ static constexpr size_t MAX_STRING = 4096;
 %x LB_STR
 %x COMMENT
 
-blank_without_newline   ([ \t\r\xa0])
+/* U+00A0 NO-BREAK SPACE is the two-byte sequence C2 A0 in UTF-8. */
+nbsp                    (\xc2\xa0)
+blank_without_newline   ([ \t\r]|{nbsp})
 blank                   ({blank_without_newline}|[\n])
 blanks                  ({blank}+)
 
@@ -57,17 +59,22 @@ HEX                     ([0-9a-fA-F])
 OCT                     ([0-7])
 IP_OCTET                ([0-9]|[1-9][0-9]|1[0-9][0-9]|2[0-4][0-9]|25[0-5])
 
+/* The lead byte C2 is split out of U2 so that C2 A0 (NBSP) is matched by
+ * {nbsp} only. A0 stays a valid continuation byte after every other lead
+ * byte, e.g. in E4 BD A0 (U+4F60). */
 U                       [\x80-\xbf]
-U2                      [\xc2-\xdf]
+U_NOT_A0                [\x80-\x9f\xa1-\xbf]
+U2                      [\xc3-\xdf]
+UC2                     \xc2
 U3                      [\xe0-\xee]
 U4                      [\xf0-\xf4]
-CHINESE                 {U2}{U}|{U3}{U}{U}|{U4}{U}{U}{U}
+CHINESE                 {UC2}{U_NOT_A0}|{U2}{U}|{U3}{U}{U}|{U4}{U}{U}{U}
 CN_EN                   {CHINESE}|[a-zA-Z]
 CN_EN_NUM               {CHINESE}|[_a-zA-Z0-9]
 LABEL                   {CN_EN}{CN_EN_NUM}*
 
 U3_FULL_WIDTH           [\xe0-\xef]
-CHINESE_FULL_WIDTH      {U2}{U}|{U3_FULL_WIDTH}{U}{U}|{U4}{U}{U}{U}
+CHINESE_FULL_WIDTH      {UC2}{U_NOT_A0}|{U2}{U}|{U3_FULL_WIDTH}{U}{U}|{U4}{U}{U}{U}
 CN_EN_FULL_WIDTH        {CHINESE_FULL_WIDTH}|[a-zA-Z]
 CN_EN_NUM_FULL_WIDTH    {CHINESE_FULL_WIDTH}|[_a-zA-Z0-9 ]
 LABEL_FULL_WIDTH        {CN_EN_FULL_WIDTH}{CN_EN_NUM_FULL_WIDTH}*
diff --git a/src/parser/test/ParserTest.cpp b/src/parser/test/ParserTest.cpp
index 0d25a607c65..9bc976cc1a0 100644
--- a/src/parser/test/ParserTest.cpp
+++ b/src/parser/test/ParserTest.cpp
@@ -3350,4 +3350,24 @@ TEST_F(ParserTest, TestShowSentenceWithPipe) {
     ASSERT_TRUE(result.ok()) << result.status();
   }
 }
+
+TEST_F(ParserTest, TestSpecialWhiteSpaceChar) {
+  {
+    std::string query = "SHOW\xC2\xA0SPACES";
+    auto result = parse(query);
+    ASSERT_TRUE(result.ok()) << result.status();
+  }
+  {
+    std::string query = "SHOW \xC2\xA0SPACES\xC2\xA0";
+    auto result = parse(query);
+    ASSERT_TRUE(result.ok()) << result.status();
+  }
+  {
+    // 0xA0 as a continuation byte (U+4F60 = E4 BD A0) must still be part of a label.
+    std::string query = "CREATE TAG \xE4\xBD\xA0\xE5\xA5\xBD(name string)";
+    auto result = parse(query);
+    ASSERT_TRUE(result.ok()) << result.status();
+  }
+}
+
 }  // namespace nebula
diff --git a/src/parser/test/ScannerTest.cpp b/src/parser/test/ScannerTest.cpp
index f1b0675166e..6e6c4a32e51 100644
--- a/src/parser/test/ScannerTest.cpp
+++ b/src/parser/test/ScannerTest.cpp
@@ -533,7 +533,7 @@ TEST(Scanner, Basic) {
       CHECK_SEMANTIC_VALUE("label", TokenType::LABEL, "label"),
       CHECK_SEMANTIC_VALUE("label123", TokenType::LABEL, "label123"),
-      // \xA0 is white space in UTF-8 too
-      CHECK_SEMANTIC_VALUE("\xA0"
+      // \xC2\xA0 (U+00A0, no-break space) is white space in UTF-8 too
+      CHECK_SEMANTIC_VALUE("\xC2\xA0"
                            "abc",
                            TokenType::LABEL,
                            "abc"),
diff --git a/tests/tck/features/basic/Parser.feature b/tests/tck/features/basic/Parser.feature
new file mode 100644
index 00000000000..021e2ffa475
--- /dev/null
+++ b/tests/tck/features/basic/Parser.feature
@@ -0,0 +1,16 @@
+# Copyright (c) 2022 vesoft inc. All rights reserved.
+#
+# This source code is licensed under Apache 2.0 License.
+Feature: Parser
+
+  Scenario: Test special white space character
+    When executing query:
+      """
+      SHOW  SPACES
+      """
+    Then the execution should be successful
+    When executing query:
+      """
+      RETURN  1
+      """
+    Then the execution should be successful