From 607407d96ee7ade36f5f1cf8312321de7acee1b2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Motiejus=20Jak=C5=A1tys?= Date: Fri, 5 Oct 2012 17:45:38 +0100 Subject: [PATCH] FIX z_string:s_utf8/2 The current checker does not validate that code points are within the allowed range. For instance, <<2#110:3, 0:5, 2#10:2, 0:6>> (i.e. <<16#C080:16>>) is an overlong encoding of U+0000, yet it passes the current validator. The new checker takes into account both the UTF-8 byte syntax and the range of code points that each encoding length is allowed to represent. --- src/support/tests/z_string_tests.erl | 28 +++++++++++ src/support/z_string.erl | 72 +++++++++++++--------------- 2 files changed, 61 insertions(+), 39 deletions(-) diff --git a/src/support/tests/z_string_tests.erl b/src/support/tests/z_string_tests.erl index 977e3b8bf3..50f79537f5 100644 --- a/src/support/tests/z_string_tests.erl +++ b/src/support/tests/z_string_tests.erl @@ -41,3 +41,31 @@ ends_with_test() -> ?assert(z_string:ends_with(["is ", <<"text.">>], "This is text.")), ?assertNot(z_string:ends_with(["is ", <<"jpeg.">>], "This is text.")), ok. + +valid_utf8_test_() -> + [ + ?_assert(v_utf8(<<>>)), + ?_assert(v_utf8(<<127>>)), + ?_assert(v_utf8(<<2#11001111, 2#10000000>>)), + ?_assert(v_utf8(<<2#11011111, 2#10111111>>)), + + ?_assert(v_utf8(<<2#11101000, 2#10000000, 2#10000000>>)), + ?_assert(v_utf8(<<2#11101111, 2#10111111, 2#10111101>>)), + + ?_assert(v_utf8(<<2#11110100, 2#10000000, 2#10000000, 2#10000000>>)), + ?_assert(v_utf8(<<2#11110000, 2#10111111, 2#10111111, 2#10111111>>)), + + ?_assertNot(v_utf8(<<128>>)), + + ?_assertNot(v_utf8(<<2#11100000, 2#10000000>>)), + ?_assertNot(v_utf8(<<2#11000000, 2#11000000>>)), + + ?_assertNot(v_utf8(<<2#11110000, 2#10000000, 2#10000000>>)), + ?_assertNot(v_utf8(<<2#11100000, 2#11000000, 2#10000000>>)), + ?_assertNot(v_utf8(<<2#11100000, 2#10000000, 2#11000000>>)), + + ?_assertNot(v_utf8(<<2#11111000, 2#10000000, 2#10000000, 2#10000000>>)) + ]. + +v_utf8(Bin) -> + z_string:sanitize_utf8(Bin) =:= Bin.
diff --git a/src/support/z_string.erl b/src/support/z_string.erl index 8d8f898c43..b7c7c9ef05 100644 --- a/src/support/z_string.erl +++ b/src/support/z_string.erl @@ -608,45 +608,39 @@ replace(String, S1, S2) when is_list(String), is_list(S1), is_list(S2) -> sanitize_utf8(L) when is_list(L) -> sanitize_utf8(iolist_to_binary(L)); sanitize_utf8(B) when is_binary(B) -> s_utf8(B, <<>>). - s_utf8(<<>>, Acc) -> - Acc; - s_utf8(<<C, Rest/binary>>, Acc) - when C < 128 -> - s_utf8(Rest, <<Acc/binary, C>>); - s_utf8(<<X, A, Rest/binary>>, Acc) - when X >= 2#11000000, X =< 2#11011111, - A >= 2#10000000, A =< 2#10111111 -> - s_utf8(Rest, <<Acc/binary, X, A>>); - s_utf8(<<X, A, B, Rest/binary>>, Acc) - when X >= 2#11100000, X =< 2#11101111, - A >= 2#10000000, A =< 2#10111111, - B >= 2#10000000, B =< 2#10111111 -> - s_utf8(Rest, <<Acc/binary, X, A, B>>); - s_utf8(<<X, A, B, C, Rest/binary>>, Acc) - when X >= 2#11110000, X =< 2#11110111, - A >= 2#10000000, A =< 2#10111111, - B >= 2#10000000, B =< 2#10111111, - C >= 2#10000000, C =< 2#10111111 -> - s_utf8(Rest, <<Acc/binary, X, A, B, C>>); - s_utf8(<<X, A, B, C, D, Rest/binary>>, Acc) - when X >= 2#11111000, X =< 2#11111011, - A >= 2#10000000, A =< 2#10111111, - B >= 2#10000000, B =< 2#10111111, - C >= 2#10000000, C =< 2#10111111, - D >= 2#10000000, D =< 2#10111111 -> - s_utf8(Rest, <<Acc/binary, X, A, B, C, D>>); - s_utf8(<<X, A, B, C, D, E, Rest/binary>>, Acc) - when X >= 2#11111100, X =< 2#11111101, - A >= 2#10000000, A =< 2#10111111, - B >= 2#10000000, B =< 2#10111111, - C >= 2#10000000, C =< 2#10111111, - D >= 2#10000000, D =< 2#10111111, - E >= 2#10000000, E =< 2#10111111 -> - s_utf8(Rest, <<Acc/binary, X, A, B, C, D, E>>); - % Drop illegal utf-8 character. - s_utf8(<<_, Rest/binary>>, Acc) -> - s_utf8(Rest, Acc).
- +s_utf8(<<>>, Acc) -> + Acc; + +%% 1 byte +s_utf8(<<X, Rest/binary>>, Acc) when X < 128 -> + s_utf8(Rest, <<Acc/binary, X>>); + +%% 2 bytes +s_utf8(<<2#110:3, A:5, 2#10:2, B:6, Rest/binary>>, Acc) when + <<0:5, A:5, B:6>> >= <<16#80:16>>, + <<0:5, A:5, B:6>> =< <<16#7FF:16>> -> + s_utf8(Rest, <<Acc/binary, 2#110:3, A:5, 2#10:2, B:6>>); + +%% 3 bytes +s_utf8(<<2#1110:4, A:4, 2#10:2, B:6, 2#10:2, C:6, Rest/binary>>, Acc) when + <<0:7, A:5, B:6, C:6>> >= <<16#800:24>> andalso + <<0:7, A:5, B:6, C:6>> =< <<16#D7FF:24>> + orelse + <<0:7, A:5, B:6, C:6>> >= <<16#E000:24>> andalso + <<0:7, A:5, B:6, C:6>> =< <<16#FFFD:24>> -> + s_utf8(Rest, <<Acc/binary, 2#1110:4, A:4, 2#10:2, B:6, 2#10:2, C:6>>); + +%% 4 bytes +s_utf8(<<2#11110:5, A:3, 2#10:2, B:6, 2#10:2, C:6, 2#10:2, D:6, Rest/binary>>, + Acc) when + <<0:3, A:3, B:6, C:6, D:6>> >= <<16#10000:24>> andalso + <<0:3, A:3, B:6, C:6, D:6>> =< <<16#10FFFF:24>> -> + s_utf8(Rest, + <<Acc/binary, 2#11110:5, A:3, 2#10:2, B:6, 2#10:2, C:6, 2#10:2, D:6>>); + +%% Drop illegal utf-8 character. +s_utf8(<<_, Rest/binary>>, Acc) -> + s_utf8(Rest, Acc). %% @doc Truncate a string. Append the '...' character at the place of break off. %% @spec truncate(String, int()) -> String