Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Extend range of printable unicode characters #150

Open
wants to merge 1 commit into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
1 change: 0 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
@@ -1,6 +1,5 @@
*.BAK
*.a
*.cmake
*.dll
*.exe
*.la
Expand Down
6 changes: 6 additions & 0 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -78,6 +78,12 @@ target_include_directories(yaml PUBLIC
$<INSTALL_INTERFACE:${INSTALL_INCLUDE_DIR}>
)


# ICU provides the Unicode character classification used by the emitter
# (src/emitter.c includes <unicode/uchar.h> unconditionally), so it is a hard
# build dependency.  Putting the bundled module directory on CMAKE_MODULE_PATH
# lets find_package() run cmake/FindICU.cmake exactly once and honour REQUIRED,
# so a missing ICU stops the configure step with a clear message instead of
# surfacing later as a compile or link failure.
list(APPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_SOURCE_DIR}/cmake")
find_package(ICU REQUIRED)

# ICU is an implementation detail of the library: its headers are not exposed
# through the public yaml headers, hence PRIVATE on both commands.
target_include_directories(yaml PRIVATE ${ICU_INCLUDE_DIRS})
target_link_libraries(yaml PRIVATE ${ICU_LIBRARIES})

#
# Install rules
#
Expand Down
66 changes: 66 additions & 0 deletions cmake/FindICU.cmake
Original file line number Diff line number Diff line change
@@ -0,0 +1,66 @@
# - Find ICU header and library
# The module defines the following variables:
#
# ICU_FOUND - true if ICU was found
# ICU_INCLUDE_DIRS - the directory of the ICU headers
# ICU_LIBRARIES - the ICU libraries needed for linking
#

# Search configuration shared by the find_path()/find_library() calls below.
#
# When the caller pins an ICU installation through ICU_ROOT, its include/ and
# lib/ sub-directories are offered as HINTS and the CMake-specific and CMake
# system search locations are switched off.  Without ICU_ROOT the three
# variables are left unset and the default search applies.
if(DEFINED ICU_ROOT)
  # NO_CMAKE_PATH is the actual option name.  A bare "NO_CMAKE" is not a
  # find_* keyword; placed after HINTS it is consumed as one more hint
  # directory and disables nothing.
  set(ICU_FIND_OPTS NO_CMAKE_PATH NO_CMAKE_SYSTEM_PATH)
  set(ICU_FIND_LIBRARY_HINTS "${ICU_ROOT}/lib")
  set(ICU_FIND_PATH_HINTS "${ICU_ROOT}/include")
else()
  set(ICU_FIND_OPTS)
  set(ICU_FIND_LIBRARY_HINTS)
  set(ICU_FIND_PATH_HINTS)
endif()

# Locate the ICU include directory by probing for one of its public headers.
# The result is cached in ICU_INCLUDE_DIR.
find_path(
  ICU_INCLUDE_DIR
  NAMES unicode/ucol.h
  HINTS ${ICU_FIND_PATH_HINTS}
  ${ICU_FIND_OPTS}
)

# Names handed to find_library().  The plain names are the default; when
# BUILD_STATIC is true they are replaced by the exact static archive file
# names so that only the .a files can match.
set(ICU_I18N_LIB_NAME icui18n)
set(ICU_UC_LIB_NAME icuuc)
set(ICU_DATA_LIB_NAME icudata)

if(BUILD_STATIC)
  set(ICU_I18N_LIB_NAME libicui18n.a)
  set(ICU_UC_LIB_NAME libicuuc.a)
  set(ICU_DATA_LIB_NAME libicudata.a)
endif()

# Look up the three ICU libraries.  Every call shares the same hints and
# search options; each result is cached under its own ICU_LIBRARY_* entry.

# Internationalisation library.
find_library(
  ICU_LIBRARY_I18N
  NAMES ${ICU_I18N_LIB_NAME}
  HINTS ${ICU_FIND_LIBRARY_HINTS}
  ${ICU_FIND_OPTS}
)

# Common library.
find_library(
  ICU_LIBRARY_UC
  NAMES ${ICU_UC_LIB_NAME}
  HINTS ${ICU_FIND_LIBRARY_HINTS}
  ${ICU_FIND_OPTS}
)

# Data library.
find_library(
  ICU_LIBRARY_DATA
  NAMES ${ICU_DATA_LIB_NAME}
  HINTS ${ICU_FIND_LIBRARY_HINTS}
  ${ICU_FIND_OPTS}
)

# Report the outcome: ICU_FOUND is true only when the headers and both the
# i18n and common libraries were located.  Honours QUIET/REQUIRED when the
# module is run through find_package().
include(FindPackageHandleStandardArgs)
find_package_handle_standard_args(ICU
  REQUIRED_VARS ICU_INCLUDE_DIR ICU_LIBRARY_I18N ICU_LIBRARY_UC)

# Public result variables documented at the top of this module.
set(ICU_INCLUDE_DIRS ${ICU_INCLUDE_DIR})
set(ICU_LIBRARIES ${ICU_LIBRARY_I18N} ${ICU_LIBRARY_UC})

# The data library is not part of REQUIRED_VARS, so it may legitimately be
# missing.  Append it only when it was found; otherwise the literal
# "ICU_LIBRARY_DATA-NOTFOUND" would end up on the consumer's link line.
if(ICU_LIBRARY_DATA)
  list(APPEND ICU_LIBRARIES ${ICU_LIBRARY_DATA})
endif()

# Only the cache entries created by find_path()/find_library() are marked;
# ICU_INCLUDE_DIRS and ICU_LIBRARIES are ordinary variables, not cache entries.
mark_as_advanced(
  ICU_INCLUDE_DIR
  ICU_LIBRARY_I18N
  ICU_LIBRARY_UC
  ICU_LIBRARY_DATA
)

#
# This module performs no compile or link probe, so it deliberately leaves the
# CMAKE_REQUIRED_LIBRARIES / CMAKE_REQUIRED_INCLUDES / CMAKE_REQUIRED_FLAGS /
# CMAKE_REQUIRED_DEFINITIONS variables alone.  Assigning and then clearing
# them here without a check_* call in between would have no effect other than
# discarding values configured by the including project.
#
48 changes: 46 additions & 2 deletions src/emitter.c
Original file line number Diff line number Diff line change
@@ -1,6 +1,9 @@

#include "yaml_private.h"

#include <unicode/utf8.h>
#include <unicode/uchar.h>

/*
* Flush the buffer if needed.
*/
Expand Down Expand Up @@ -86,6 +89,9 @@ static int
yaml_emitter_increase_indent(yaml_emitter_t *emitter,
int flow, int indentless);

static inline int
yaml_emitter_is_printable(yaml_string_t string);

/*
* State functions.
*/
Expand Down Expand Up @@ -416,6 +422,44 @@ yaml_emitter_increase_indent(yaml_emitter_t *emitter,
return 1;
}

/*
 * Check if the UTF-8 encoded character at string.pointer can be written
 * unescaped.
 *
 * Returns 1 for a line feed, for #x20-#x7E, for the ranges #xA0-#xD7FF and
 * #xE000-#xFFFD except the byte order mark #xFEFF, and for any other code
 * point accepted by ICU's u_isprint(); returns 0 otherwise.
 *
 * A position at or beyond string.end, an invalid leading octet, or a
 * sequence that would extend past string.end is reported as not printable
 * without reading outside the buffer.
 */

static inline int
yaml_emitter_is_printable(yaml_string_t string)
{
    unsigned char lead;
    unsigned int width;
    unsigned int value;
    unsigned int k;

    if (string.pointer >= string.end)
        return 0;

    lead = string.pointer[0];

    /* Single octet: no decoding needed, the octet is the code point. */
    if ((lead & 0x80) == 0x00) {
        return (lead == 0x0A
                || (lead >= 0x20 && lead <= 0x7E)
                || u_isprint(lead));
    }

    width = (lead & 0xE0) == 0xC0 ? 2 :
            (lead & 0xF0) == 0xE0 ? 3 :
            (lead & 0xF8) == 0xF0 ? 4 : 0;

    /* Not a valid leading octet (e.g. a stray continuation octet). */
    if (!width)
        return 0;

    /* Never look at octets that lie outside the string. */
    if (string.end - string.pointer < (int)width)
        return 0;

    /*
     * Byte-level tests for the two- and three-octet ranges; they are decided
     * here so that ICU is consulted only for what they do not cover.
     */
    if ((lead == 0xC2 && string.pointer[1] >= 0xA0)     /* #xA0 <= . */
            || (lead > 0xC2 && lead < 0xED)
            || (lead == 0xED && string.pointer[1] < 0xA0)   /* . <= #xD7FF */
            || (lead == 0xEE)
            || (lead == 0xEF                    /* #xE000 <= . <= #xFFFD */
                && !(string.pointer[1] == 0xBB  /* && . != #xFEFF */
                    && string.pointer[2] == 0xBF)
                && !(string.pointer[1] == 0xBF
                    && (string.pointer[2] == 0xBE
                        || string.pointer[2] == 0xBF))))
        return 1;

    /* Decode the code point and let ICU classify everything else. */
    value = width == 2 ? (unsigned int)(lead & 0x1F) :
            width == 3 ? (unsigned int)(lead & 0x0F) :
                         (unsigned int)(lead & 0x07);
    for (k = 1; k < width; k ++) {
        value = (value << 6) + (string.pointer[k] & 0x3F);
    }

    return u_isprint(value) ? 1 : 0;
}

/*
* State dispatcher.
*/
Expand Down Expand Up @@ -1569,7 +1613,7 @@ yaml_emitter_analyze_scalar(yaml_emitter_t *emitter,
}
}

if (!IS_PRINTABLE(string)
if (!yaml_emitter_is_printable(string)
|| (!IS_ASCII(string) && !emitter->unicode)) {
special_characters = 1;
}
Expand Down Expand Up @@ -2027,7 +2071,7 @@ yaml_emitter_write_double_quoted_scalar(yaml_emitter_t *emitter,

while (string.pointer != string.end)
{
if (!IS_PRINTABLE(string) || (!emitter->unicode && !IS_ASCII(string))
if (!yaml_emitter_is_printable(string) || (!emitter->unicode && !IS_ASCII(string))
|| IS_BOM(string) || IS_BREAK(string)
|| CHECK(string, '"') || CHECK(string, '\\'))
{
Expand Down
20 changes: 0 additions & 20 deletions src/yaml_private.h
Original file line number Diff line number Diff line change
Expand Up @@ -258,26 +258,6 @@ yaml_string_join(
* Check if the character can be printed unescaped.
*/

/*
 * IS_PRINTABLE_AT(string, offset) inspects the UTF-8 octets starting at
 * (string).pointer[offset] and evaluates to non-zero when they fall into one
 * of the ranges noted beside the clauses below.
 *
 * Leading octets of 0xF0 and above match no clause, so four-octet sequences
 * are never reported as printable.  Octets at offset+1 and offset+2 are read
 * without consulting (string).end.  Both arguments are expanded several
 * times and therefore must not have side effects.
 */
#define IS_PRINTABLE_AT(string,offset) \
(((string).pointer[offset] == 0x0A) /* . == #x0A */ \
|| ((string).pointer[offset] >= 0x20 /* #x20 <= . <= #x7E */ \
&& (string).pointer[offset] <= 0x7E) \
|| ((string).pointer[offset] == 0xC2 /* #xA0 <= . <= #xD7FF */ \
&& (string).pointer[offset+1] >= 0xA0) \
|| ((string).pointer[offset] > 0xC2 \
&& (string).pointer[offset] < 0xED) \
|| ((string).pointer[offset] == 0xED \
&& (string).pointer[offset+1] < 0xA0) \
|| ((string).pointer[offset] == 0xEE) \
|| ((string).pointer[offset] == 0xEF /* #xE000 <= . <= #xFFFD */ \
&& !((string).pointer[offset+1] == 0xBB /* && . != #xFEFF */ \
&& (string).pointer[offset+2] == 0xBF) \
&& !((string).pointer[offset+1] == 0xBF \
&& ((string).pointer[offset+2] == 0xBE \
|| (string).pointer[offset+2] == 0xBF))))

/* Same test applied to the character at the current position (offset 0). */
#define IS_PRINTABLE(string) IS_PRINTABLE_AT((string),0)

/*
* Check if the character at the specified position is NUL.
*/
Expand Down
5 changes: 3 additions & 2 deletions tests/run-all-tests.sh
Original file line number Diff line number Diff line change
Expand Up @@ -5,14 +5,15 @@ set -e
main() {
# Autoconf based in-source build and tests
clean

export LDFLAGS="-L/usr/local/opt/icu4c/lib -licuuc"
export CPPFLAGS="-I/usr/local/opt/icu4c/include"
./bootstrap
./configure
make test-all

# CMake based in-source build and tests
clean

export CMAKE_PREFIX_PATH=/usr/local/opt/icu4c
cmake .
make
make test
Expand Down