Skip to content

Commit

Permalink
Extend range of printable unicode characters
Browse files Browse the repository at this point in the history
Before the patch, the IS_PRINTABLE macro was used
to determine whether a given character is printable.
This macro did not take into account characters
encoded with 4 bytes.
After the patch, IS_PRINTABLE is replaced with a new
corresponding function. Now the range of printable
characters is: (libyaml old range) U (ICU range). This
new range includes characters encoded with 4 bytes.

Related to tarantool/tarantool#4090
  • Loading branch information
SudoBobo authored and kyukhin committed Jul 18, 2019
1 parent 6bd4be1 commit 74a00fa
Show file tree
Hide file tree
Showing 6 changed files with 120 additions and 25 deletions.
1 change: 0 additions & 1 deletion .gitignore
@@ -1,6 +1,5 @@
*.BAK
*.a
*.cmake
*.dll
*.exe
*.la
Expand Down
6 changes: 6 additions & 0 deletions CMakeLists.txt
Expand Up @@ -77,6 +77,12 @@ target_include_directories(yaml PUBLIC
$<INSTALL_INTERFACE:${INSTALL_INCLUDE_DIR}>
)


# ICU is a hard dependency: src/emitter.c includes <unicode/uchar.h> and
# <unicode/utf8.h> unconditionally.
#
# Put the bundled cmake/ directory on the module search path so that
# find_package() loads the project's own FindICU.cmake (CMAKE_MODULE_PATH is
# consulted before the modules shipped with CMake), and run the search once.
# REQUIRED turns a missing ICU into a configure-time error instead of a
# compile or link failure later on.
list(APPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_SOURCE_DIR}/cmake")
find_package(ICU REQUIRED)

# ICU headers are used only by the library's sources, not by its public
# headers, so both usage requirements stay PRIVATE.
target_include_directories(yaml PRIVATE ${ICU_INCLUDE_DIRS})
target_link_libraries(yaml PRIVATE ${ICU_LIBRARIES})

#
# Install rules
#
Expand Down
66 changes: 66 additions & 0 deletions cmake/FindICU.cmake
@@ -0,0 +1,66 @@
# - Find ICU header and library
# The module defines the following variables:
#
#  ICU_FOUND - true if ICU was found
#  ICU_INCLUDE_DIRS - the directory of the ICU headers
#  ICU_LIBRARIES - the ICU libraries needed for linking
#
# The module reads the following input variables:
#
#  ICU_ROOT - when defined, "${ICU_ROOT}/include" and "${ICU_ROOT}/lib" are
#             passed to the searches as HINTS
#  BUILD_STATIC - when true, the static archives (libicu*.a) are searched for
#                 instead of the plain library names
#

if(DEFINED ICU_ROOT)
  # An explicit root was requested: skip the CMake cache search variables
  # (NO_CMAKE_PATH) and the platform's system paths (NO_CMAKE_SYSTEM_PATH)
  # for the searches below.
  set(ICU_FIND_OPTS NO_CMAKE_PATH NO_CMAKE_SYSTEM_PATH)
  set(ICU_FIND_LIBRARY_HINTS "${ICU_ROOT}/lib")
  set(ICU_FIND_PATH_HINTS "${ICU_ROOT}/include")
else()
  set(ICU_FIND_OPTS)
  set(ICU_FIND_LIBRARY_HINTS)
  set(ICU_FIND_PATH_HINTS)
endif()

find_path(ICU_INCLUDE_DIR
  NAMES unicode/ucol.h
  HINTS ${ICU_FIND_PATH_HINTS}
  ${ICU_FIND_OPTS}
)

if(BUILD_STATIC)
  set(ICU_I18N_LIB_NAME libicui18n.a)
  set(ICU_UC_LIB_NAME libicuuc.a)
  set(ICU_DATA_LIB_NAME libicudata.a)
else()
  set(ICU_I18N_LIB_NAME icui18n)
  set(ICU_UC_LIB_NAME icuuc)
  set(ICU_DATA_LIB_NAME icudata)
endif()

find_library(ICU_LIBRARY_I18N
  NAMES ${ICU_I18N_LIB_NAME}
  HINTS ${ICU_FIND_LIBRARY_HINTS}
  ${ICU_FIND_OPTS}
)

find_library(ICU_LIBRARY_UC
  NAMES ${ICU_UC_LIB_NAME}
  HINTS ${ICU_FIND_LIBRARY_HINTS}
  ${ICU_FIND_OPTS}
)

# The data library is optional: it is not listed in REQUIRED_VARS below.
find_library(ICU_LIBRARY_DATA
  NAMES ${ICU_DATA_LIB_NAME}
  HINTS ${ICU_FIND_LIBRARY_HINTS}
  ${ICU_FIND_OPTS}
)

include(FindPackageHandleStandardArgs)
find_package_handle_standard_args(ICU
  REQUIRED_VARS ICU_INCLUDE_DIR ICU_LIBRARY_I18N ICU_LIBRARY_UC)

set(ICU_INCLUDE_DIRS ${ICU_INCLUDE_DIR})
set(ICU_LIBRARIES ${ICU_LIBRARY_I18N} ${ICU_LIBRARY_UC})
# Append the data library only when it was located, so that an
# "ICU_LIBRARY_DATA-NOTFOUND" placeholder never reaches the link line.
if(ICU_LIBRARY_DATA)
  list(APPEND ICU_LIBRARIES ${ICU_LIBRARY_DATA})
endif()

# Only the cache entries created by find_path()/find_library() are marked;
# ICU_INCLUDE_DIRS and ICU_LIBRARIES are ordinary variables.
mark_as_advanced(
  ICU_INCLUDE_DIR
  ICU_LIBRARY_I18N
  ICU_LIBRARY_UC
  ICU_LIBRARY_DATA
)
47 changes: 45 additions & 2 deletions src/emitter.c
@@ -1,6 +1,9 @@

#include "yaml_private.h"

#include <unicode/utf8.h>
#include <unicode/uchar.h>

/*
* Flush the buffer if needed.
*/
Expand Down Expand Up @@ -86,6 +89,9 @@ static int
yaml_emitter_increase_indent(yaml_emitter_t *emitter,
int flow, int indentless);

/* Check if the character at string.pointer can be printed unescaped. */
static inline int
yaml_emitter_is_printable(yaml_string_t string);

/*
* State functions.
*/
Expand Down Expand Up @@ -416,6 +422,43 @@ yaml_emitter_increase_indent(yaml_emitter_t *emitter,
return 1;
}

/*
 * Check whether the UTF-8 sequence starting at string.pointer encodes a
 * character that may be written unescaped.
 *
 * A character is printable if it lies in the range libyaml accepted before
 * (#x0A, #x20-#x7E, #xA0-#xD7FF, #xE000-#xFFFD except #xFEFF), or if ICU's
 * u_isprint() accepts the decoded code point.  The ICU check is what admits
 * characters encoded with 4 bytes.
 *
 * Returns 1 if the character is printable and 0 otherwise.  An empty string,
 * an invalid leading byte, or a sequence cut short by string.end is reported
 * as not printable, so no byte at or beyond string.end is ever read.
 */

static inline int
yaml_emitter_is_printable(yaml_string_t string)
{
    const unsigned char *bytes = (const unsigned char *)string.pointer;
    unsigned char lead;
    unsigned int width;
    unsigned int value;
    unsigned int k;

    if (string.pointer >= string.end)
        return 0;

    lead = bytes[0];

    /* Line feed and printable ASCII need no decoding. */
    if (lead == 0x0A || (lead >= 0x20 && lead <= 0x7E))
        return 1;

    /* Length of the sequence as announced by its leading byte. */
    width = (lead & 0x80) == 0x00 ? 1 :
            (lead & 0xE0) == 0xC0 ? 2 :
            (lead & 0xF0) == 0xE0 ? 3 :
            (lead & 0xF8) == 0xF0 ? 4 : 0;

    /* Reject invalid leading bytes and sequences that run past the end. */
    if (width == 0 || string.end - string.pointer < (int)width)
        return 0;

    /* The multi-byte part of the range accepted by the former macro. */
    if ((lead == 0xC2 && bytes[1] >= 0xA0)      /* #xA0 <= . <= #xD7FF */
            || (lead > 0xC2 && lead < 0xED)
            || (lead == 0xED && bytes[1] < 0xA0)
            || (lead == 0xEE)
            || (lead == 0xEF                    /* #xE000 <= . <= #xFFFD */
                && !(bytes[1] == 0xBB           /* && . != #xFEFF */
                    && bytes[2] == 0xBF)
                && !(bytes[1] == 0xBF
                    && (bytes[2] == 0xBE || bytes[2] == 0xBF))))
        return 1;

    /* Decode the code point and let ICU decide. */
    value = width == 1 ? (unsigned int)(lead & 0x7F) :
            width == 2 ? (unsigned int)(lead & 0x1F) :
            width == 3 ? (unsigned int)(lead & 0x0F) :
                         (unsigned int)(lead & 0x07);
    for (k = 1; k < width; k ++) {
        value = (value << 6) + (bytes[k] & 0x3F);
    }

    return u_isprint(value) ? 1 : 0;
}

/*
* State dispatcher.
*/
Expand Down Expand Up @@ -1569,7 +1612,7 @@ yaml_emitter_analyze_scalar(yaml_emitter_t *emitter,
}
}

if (!IS_PRINTABLE(string)
if (!yaml_emitter_is_printable(string)
|| (!IS_ASCII(string) && !emitter->unicode)) {
special_characters = 1;
}
Expand Down Expand Up @@ -2027,7 +2070,7 @@ yaml_emitter_write_double_quoted_scalar(yaml_emitter_t *emitter,

while (string.pointer != string.end)
{
if (!IS_PRINTABLE(string) || (!emitter->unicode && !IS_ASCII(string))
if (!yaml_emitter_is_printable(string) || (!emitter->unicode && !IS_ASCII(string))
|| IS_BOM(string) || IS_BREAK(string)
|| CHECK(string, '"') || CHECK(string, '\\'))
{
Expand Down
20 changes: 0 additions & 20 deletions src/yaml_private.h
Expand Up @@ -258,26 +258,6 @@ yaml_string_join(
* Check if the character can be printed unescaped.
*/

#define IS_PRINTABLE_AT(string,offset) \
(((string).pointer[offset] == 0x0A) /* . == #x0A */ \
|| ((string).pointer[offset] >= 0x20 /* #x20 <= . <= #x7E */ \
&& (string).pointer[offset] <= 0x7E) \
|| ((string).pointer[offset] == 0xC2 /* #0xA0 <= . <= #xD7FF */ \
&& (string).pointer[offset+1] >= 0xA0) \
|| ((string).pointer[offset] > 0xC2 \
&& (string).pointer[offset] < 0xED) \
|| ((string).pointer[offset] == 0xED \
&& (string).pointer[offset+1] < 0xA0) \
|| ((string).pointer[offset] == 0xEE) \
|| ((string).pointer[offset] == 0xEF /* #xE000 <= . <= #xFFFD */ \
&& !((string).pointer[offset+1] == 0xBB /* && . != #xFEFF */ \
&& (string).pointer[offset+2] == 0xBF) \
&& !((string).pointer[offset+1] == 0xBF \
&& ((string).pointer[offset+2] == 0xBE \
|| (string).pointer[offset+2] == 0xBF))))

#define IS_PRINTABLE(string) IS_PRINTABLE_AT((string),0)

/*
* Check if the character at the specified position is NUL.
*/
Expand Down
5 changes: 3 additions & 2 deletions tests/run-all-tests.sh
Expand Up @@ -5,14 +5,15 @@ set -e
main() {
# Autoconf based in-source build and tests
clean

export LDFLAGS="-L/usr/local/opt/icu4c/lib -licuuc"
export CPPFLAGS="-I/usr/local/opt/icu4c/include"
./bootstrap
./configure
make test-all

# CMake based in-source build and tests
clean

export CMAKE_PREFIX_PATH=/usr/local/opt/icu4c
cmake .
make
make test
Expand Down

0 comments on commit 74a00fa

Please sign in to comment.