Skip to content
Browse files

Merge pull request #439 from Motiejus/z_string

FIX z_string:s_utf8/2
  • Loading branch information...
2 parents 452e731 + 607407d commit 932e95f02e5b59d32a7ce1a79ab9242ef42305bb @mworrell mworrell committed Oct 9, 2012
Showing with 61 additions and 39 deletions.
  1. +28 −0 src/support/tests/z_string_tests.erl
  2. +33 −39 src/support/z_string.erl
View
28 src/support/tests/z_string_tests.erl
@@ -41,3 +41,31 @@ ends_with_test() ->
?assert(z_string:ends_with(["is ", <<"text.">>], "This is text.")),
?assertNot(z_string:ends_with(["is ", <<"jpeg.">>], "This is text.")),
ok.
+
+%% Well-formedness checks for z_string:sanitize_utf8/1. Input that is valid
+%% UTF-8 must come back unchanged (v_utf8/1 is true); anything else must come
+%% back altered (v_utf8/1 is false).
+valid_utf8_test_() ->
+    [
+     %% Empty input and 1-byte sequences.
+     ?_assert(v_utf8(<<>>)),
+     ?_assert(v_utf8(<<127>>)),
+
+     %% 2-byte sequences, including the lowest legal one (U+0080).
+     ?_assert(v_utf8(<<2#11000010, 2#10000000>>)),
+     ?_assert(v_utf8(<<2#11001111, 2#10000000>>)),
+     ?_assert(v_utf8(<<2#11011111, 2#10111111>>)),
+
+     %% 3-byte sequences: U+0800, both edges of the surrogate gap
+     %% (U+D7FF and U+E000) and U+FFFD.
+     ?_assert(v_utf8(<<2#11100000, 2#10100000, 2#10000000>>)),
+     ?_assert(v_utf8(<<2#11101000, 2#10000000, 2#10000000>>)),
+     ?_assert(v_utf8(<<2#11101101, 2#10011111, 2#10111111>>)),
+     ?_assert(v_utf8(<<2#11101110, 2#10000000, 2#10000000>>)),
+     ?_assert(v_utf8(<<2#11101111, 2#10111111, 2#10111101>>)),
+
+     %% 4-byte sequences: U+10000 up to U+10FFFF.
+     ?_assert(v_utf8(<<2#11110000, 2#10010000, 2#10000000, 2#10000000>>)),
+     ?_assert(v_utf8(<<2#11110100, 2#10000000, 2#10000000, 2#10000000>>)),
+     ?_assert(v_utf8(<<2#11110000, 2#10111111, 2#10111111, 2#10111111>>)),
+     ?_assert(v_utf8(<<2#11110100, 2#10001111, 2#10111111, 2#10111111>>)),
+
+     %% A lone continuation byte.
+     ?_assertNot(v_utf8(<<128>>)),
+
+     %% Truncated sequences and broken continuation bytes.
+     ?_assertNot(v_utf8(<<2#11100000, 2#10000000>>)),
+     ?_assertNot(v_utf8(<<2#11000000, 2#11000000>>)),
+
+     ?_assertNot(v_utf8(<<2#11110000, 2#10000000, 2#10000000>>)),
+     ?_assertNot(v_utf8(<<2#11100000, 2#11000000, 2#10000000>>)),
+     ?_assertNot(v_utf8(<<2#11100000, 2#10000000, 2#11000000>>)),
+
+     %% Overlong encodings: the code point would fit in a shorter sequence.
+     ?_assertNot(v_utf8(<<2#11000000, 2#10000000>>)),
+     ?_assertNot(v_utf8(<<2#11000001, 2#10111111>>)),
+     ?_assertNot(v_utf8(<<2#11100000, 2#10011111, 2#10111111>>)),
+     ?_assertNot(v_utf8(<<2#11110000, 2#10001111, 2#10111111, 2#10111111>>)),
+
+     %% UTF-16 surrogates (U+D800..U+DFFF) are not legal in UTF-8.
+     ?_assertNot(v_utf8(<<2#11101101, 2#10100000, 2#10000000>>)),
+     ?_assertNot(v_utf8(<<2#11101101, 2#10111111, 2#10111111>>)),
+
+     %% Code points above U+10FFFF.
+     ?_assertNot(v_utf8(<<2#11110100, 2#10010000, 2#10000000, 2#10000000>>)),
+
+     %% 5-byte lead byte.
+     ?_assertNot(v_utf8(<<2#11111000, 2#10000000, 2#10000000, 2#10000000>>)),
+
+     %% Only the offending bytes are dropped; valid neighbours survive,
+     %% for binary as well as iolist input.
+     ?_assertEqual(<<"ab">>, z_string:sanitize_utf8(<<$a, 128, $b>>)),
+     ?_assertEqual(<<"ab">>, z_string:sanitize_utf8([<<"a">>, 128, "b"])),
+     ?_assertEqual(<<"a">>, z_string:sanitize_utf8(<<$a, 2#11000011>>))
+    ].
+
+%% True when sanitizing returns exactly the binary that was passed in,
+%% i.e. z_string:sanitize_utf8/1 found nothing to drop.
+v_utf8(Bin) ->
+    Sanitized = z_string:sanitize_utf8(Bin),
+    Sanitized =:= Bin.
View
72 src/support/z_string.erl
@@ -608,45 +608,39 @@ replace(String, S1, S2) when is_list(String), is_list(S1), is_list(S2) ->
sanitize_utf8(Bin) when is_binary(Bin) ->
    %% Filtering starts from an empty accumulator.
    s_utf8(Bin, <<>>);
sanitize_utf8(IoList) when is_list(IoList) ->
    %% Flatten the iolist first so that s_utf8/2 only ever sees a binary.
    s_utf8(iolist_to_binary(IoList), <<>>).
- s_utf8(<<>>, Acc) ->
- Acc;
- s_utf8(<<C, Rest/binary>>, Acc)
- when C < 128 ->
- s_utf8(Rest, <<Acc/binary, C>>);
- s_utf8(<<X, A, Rest/binary>>, Acc)
- when X >= 2#11000000, X =< 2#11011111,
- A >= 2#10000000, A =< 2#10111111 ->
- s_utf8(Rest, <<Acc/binary, X, A>>);
- s_utf8(<<X, A, B, Rest/binary>>, Acc)
- when X >= 2#11100000, X =< 2#11101111,
- A >= 2#10000000, A =< 2#10111111,
- B >= 2#10000000, B =< 2#10111111 ->
- s_utf8(Rest, <<Acc/binary, X, A, B>>);
- s_utf8(<<X, A, B, C, Rest/binary>>, Acc)
- when X >= 2#11110000, X =< 2#11110111,
- A >= 2#10000000, A =< 2#10111111,
- B >= 2#10000000, B =< 2#10111111,
- C >= 2#10000000, C =< 2#10111111 ->
- s_utf8(Rest, <<Acc/binary, X, A, B, C>>);
- s_utf8(<<X, A, B, C, D, Rest/binary>>, Acc)
- when X >= 2#11111000, X =< 2#11111011,
- A >= 2#10000000, A =< 2#10111111,
- B >= 2#10000000, B =< 2#10111111,
- C >= 2#10000000, C =< 2#10111111,
- D >= 2#10000000, D =< 2#10111111 ->
- s_utf8(Rest, <<Acc/binary, X, A, B, C, D>>);
- s_utf8(<<X, A, B, C, D, E, Rest/binary>>, Acc)
- when X >= 2#11111100, X =< 2#11111101,
- A >= 2#10000000, A =< 2#10111111,
- B >= 2#10000000, B =< 2#10111111,
- C >= 2#10000000, C =< 2#10111111,
- D >= 2#10000000, D =< 2#10111111,
- E >= 2#10000000, E =< 2#10111111 ->
- s_utf8(Rest, <<Acc/binary, X, A, B, C, D, E>>);
- % Drop illegal utf-8 character.
- s_utf8(<<_, Rest/binary>>, Acc) ->
- s_utf8(Rest, Acc).
-
+%% Copy every accepted UTF-8 sequence from the input to the accumulator.
+%% Wherever no accepted sequence starts, skip a single byte and try again.
+s_utf8(<<>>, Out) ->
+    Out;
+
+%% 1 byte: 7-bit ASCII.
+s_utf8(<<Ascii, Tail/binary>>, Out) when Ascii =< 127 ->
+    s_utf8(Tail, <<Out/binary, Ascii>>);
+
+%% 2 bytes: code points 16#80..16#7FF (smaller values would be overlong).
+s_utf8(<<2#110:3, P:5, 2#10:2, Q:6, Tail/binary>>, Out) when
+        ((P bsl 6) bor Q) >= 16#80,
+        ((P bsl 6) bor Q) =< 16#7FF ->
+    s_utf8(Tail, <<Out/binary, 2#110:3, P:5, 2#10:2, Q:6>>);
+
+%% 3 bytes: code points 16#800..16#D7FF and 16#E000..16#FFFD. The gap is
+%% the UTF-16 surrogate range. The `;' separates two alternative guards.
+s_utf8(<<2#1110:4, P:4, 2#10:2, Q:6, 2#10:2, R:6, Tail/binary>>, Out) when
+        ((P bsl 12) bor (Q bsl 6) bor R) >= 16#800,
+        ((P bsl 12) bor (Q bsl 6) bor R) =< 16#D7FF;
+        ((P bsl 12) bor (Q bsl 6) bor R) >= 16#E000,
+        ((P bsl 12) bor (Q bsl 6) bor R) =< 16#FFFD ->
+    s_utf8(Tail, <<Out/binary, 2#1110:4, P:4, 2#10:2, Q:6, 2#10:2, R:6>>);
+
+%% 4 bytes: code points 16#10000..16#10FFFF.
+s_utf8(<<2#11110:5, P:3, 2#10:2, Q:6, 2#10:2, R:6, 2#10:2, S:6, Tail/binary>>,
+       Out) when
+        ((P bsl 18) bor (Q bsl 12) bor (R bsl 6) bor S) >= 16#10000,
+        ((P bsl 18) bor (Q bsl 12) bor (R bsl 6) bor S) =< 16#10FFFF ->
+    s_utf8(Tail,
+           <<Out/binary,
+             2#11110:5, P:3, 2#10:2, Q:6, 2#10:2, R:6, 2#10:2, S:6>>);
+
+%% Anything else: drop the offending byte and continue with the next one.
+s_utf8(<<_Bad, Tail/binary>>, Out) ->
+    s_utf8(Tail, Out).
%% @doc Truncate a string, appending the '...' character at the point where the string is cut off.
%% @spec truncate(String, int()) -> String

0 comments on commit 932e95f

Please sign in to comment.
Something went wrong with that request. Please try again.