Permalink
Browse files

Klarna fixes - in particular support for ISO 8859-15

  • Loading branch information...
1 parent 609ed2d commit 326e53e3bef353b093e89780a3388aa1840669fa @willemdj committed Sep 2, 2011
Showing with 3,133 additions and 970 deletions.
  1. +2 −0 .gitignore
  2. +1,415 −961 doc/erlsom.htm
  3. +6 −4 src/erlsom_lib.erl
  4. +2 −0 src/erlsom_sax.erl
  5. +62 −1 src/erlsom_sax_latin1.erl
  6. +1,398 −0 src/erlsom_sax_latin9.erl
  7. +62 −1 src/erlsom_sax_list.erl
  8. +62 −1 src/erlsom_sax_utf16be.erl
  9. +62 −1 src/erlsom_sax_utf16le.erl
  10. +62 −1 src/erlsom_sax_utf8.erl
View
@@ -1,3 +1,5 @@
+*.swp
+*.beam
config.log
config.status
include.mk
View

Large diffs are not rendered by default.

Oops, something went wrong.
View
@@ -589,6 +589,8 @@ detect_encoding3(Variables) ->
utf8;
'iso-8859-1' ->
iso_8859_1;
+ 'iso-8859-15' ->
+ iso_8859_15;
_ -> throw({error, "Encoding " ++ Encoding ++ " not supported"})
end;
_ ->
@@ -601,10 +603,10 @@ encoding_type(Cs) when is_list(Cs) ->
"iso_8859_1" -> 'iso-8859-1';
"iso_8859-1" -> 'iso-8859-1';
"iso8859-1" -> 'iso-8859-1';
- "iso-8859-15" -> 'iso-8859-1';
- "iso_8859_15" -> 'iso-8859-1';
- "iso_8859-15" -> 'iso-8859-1';
- "iso8859-15" -> 'iso-8859-1';
+ "iso-8859-15" -> 'iso-8859-15';
+ "iso_8859_15" -> 'iso-8859-15';
+ "iso_8859-15" -> 'iso-8859-15';
+ "iso8859-15" -> 'iso-8859-15';
"utf-8" -> 'utf-8';
"utf_8" -> 'utf-8';
_ -> false
View
@@ -176,6 +176,8 @@ parseDocumentBinary(Encoding, Xml, State) ->
erlsom_sax_latin1:parse(Xml, State);
'iso_8859_1' ->
erlsom_sax_latin1:parse(Xml, State);
+ 'iso_8859_15' ->
+ erlsom_sax_latin9:parse(Xml, State);
'list' ->
erlsom_sax_list:parse(Xml, State);
_ ->
View
@@ -25,7 +25,7 @@
%% this file exists several times, but with different names:
%% erlsom_sax_utf8, erlsom_sax_latin1 etc.
%% The only difference between the contents of these files is the definition below:
-%% it can be UTF8, LAT1, U16B or U16L. (The names have been chosen so that the
+%% it can be UTF8, LAT1, LAT9, U16B or U16L. (The names have been chosen so that the
%% number of bytes in the file will be the same in either case, so that it is
%% easy to see whether the files are the same, although this check is obviously
%% rather primitive.)
@@ -129,6 +129,30 @@
-define(BOM3, no_match2).
-endif.
+-ifdef(LAT9). %% definitions used when this source is compiled as the LAT9 (ISO 8859-15) variant
+-module(erlsom_sax_latin9).
+-define(BINARY, true). %% marks this variant as operating on binaries
+-define(STR1(X), <<X>>). %% STRn: a binary of exactly n single-byte segments
+-define(STR2(X1, X2), <<X1, X2>>).
+-define(STR3(X1, X2, X3), <<X1, X2, X3>>).
+-define(STR4(X1, X2, X3, X4), <<X1, X2, X3, X4>>).
+-define(STR5(X1, X2, X3, X4, X5), <<X1, X2, X3, X4, X5>>).
+-define(STR6(X1, X2, X3, X4, X5, X6), <<X1, X2, X3, X4, X5, X6>>).
+-define(STR7(X1, X2, X3, X4, X5, X6, X7), <<X1, X2, X3, X4, X5, X6, X7>>).
+-define(STR8(X1, X2, X3, X4, X5, X6, X7, X8), <<X1, X2, X3, X4, X5, X6, X7, X8>>).
+-define(DONTCARE_T(Y), <<_, Y/binary>>). %% any single byte, followed by the remaining binary Y
+-define(STR1_T(X, Y), <<X, Y/binary>>). %% STRn_T: n single-byte segments followed by the remaining binary Y
+-define(STR2_T(X1, X2, Y), <<X1, X2, Y/binary>>).
+-define(STR3_T(X1, X2, X3, Y), <<X1, X2, X3, Y/binary>>).
+-define(STR4_T(X1, X2, X3, X4, Y), <<X1, X2, X3, X4, Y/binary>>).
+-define(STR7_T(X1, X2, X3, X4, X5, X6, X7, Y), <<X1, X2, X3, X4, X5, X6, X7, Y/binary>>).
+-define(STR8_T(X1, X2, X3, X4, X5, X6, X7, X8, Y), <<X1, X2, X3, X4, X5, X6, X7, X8, Y/binary>>).
+-define(STR9_T(X1, X2, X3, X4, X5, X6, X7, X8, X9, Y), <<X1, X2, X3, X4, X5, X6, X7, X8, X9, Y/binary>>).
+-define(BOM1(X), [no_match | X]). %% NOTE(review): BOM macros are placeholder terms here; presumably they can never match binary input because this encoding has no byte order mark - confirm
+-define(BOM2, no_match).
+-define(BOM3, no_match2).
+-endif.
+
-ifdef(LIST).
-module(erlsom_sax_list).
-define(EMPTY, []).
@@ -352,6 +376,33 @@ decodeChar(Tail, State) ->
end.
-endif.
+-ifdef(LAT9).
+decodeChar(Tail, State) -> %% LAT9 variant: decode the character at the head of Tail
+ case Tail of
+ ?EMPTY -> ?CF3(Tail, State, fun decodeChar/2); %% no input left: delegate to ?CF3, passing decodeChar/2 (presumably to resume once more data is available - TODO confirm)
+ ?STR1_T(C, T) -> {latin9toUnicode(C), T, State} %% one byte is one character: returns {UnicodeCodePoint, RemainingInput, State}
+ end.
+
+%% Translate a single ISO 8859-15 (Latin-9) byte value to its Unicode code
+%% point. Only eight positions differ from the identity mapping; every other
+%% value is returned unchanged.
+latin9toUnicode(Byte) ->
+  case Byte of
+    16#A4 -> 16#20AC; % EURO SIGN
+    16#A6 -> 16#0160; % LATIN CAPITAL LETTER S WITH CARON
+    16#A8 -> 16#0161; % LATIN SMALL LETTER S WITH CARON
+    16#B4 -> 16#017D; % LATIN CAPITAL LETTER Z WITH CARON
+    16#B8 -> 16#017E; % LATIN SMALL LETTER Z WITH CARON
+    16#BC -> 16#0152; % LATIN CAPITAL LIGATURE OE
+    16#BD -> 16#0153; % LATIN SMALL LIGATURE OE
+    16#BE -> 16#0178; % LATIN CAPITAL LETTER Y WITH DIAERESIS
+    _ -> Byte
+  end.
+-endif.
+
-ifdef(LIST).
decodeChar(Tail, State) ->
case Tail of
@@ -1078,6 +1129,11 @@ encode(List) ->
list_to_binary(List).
-endif.
+-ifdef(LAT9).
+%% Encode a list of Unicode code points as an ISO 8859-15 (Latin-9) binary.
+%% The eight code points that latin9toUnicode/1 produces for the Latin-9
+%% specific byte positions are mapped back to those bytes, so that encode/1
+%% inverts the translation applied when decoding (previously e.g. the euro
+%% sign, 16#20AC, made list_to_binary/1 fail with badarg).
+%% All other elements are handed to list_to_binary/1 unchanged, as before.
+%% NOTE(review): assumes List is a flat list of code points - confirm
+%% against the callers.
+encode(List) ->
+  list_to_binary([unicodeToLatin9(Char) || Char <- List]).
+
+%% Map a Unicode code point to its Latin-9 byte value for the positions
+%% where Latin-9 differs from the identity mapping; anything else is
+%% returned as is.
+unicodeToLatin9(16#20AC) -> 16#A4; % EURO SIGN
+unicodeToLatin9(16#0160) -> 16#A6; % LATIN CAPITAL LETTER S WITH CARON
+unicodeToLatin9(16#0161) -> 16#A8; % LATIN SMALL LETTER S WITH CARON
+unicodeToLatin9(16#017D) -> 16#B4; % LATIN CAPITAL LETTER Z WITH CARON
+unicodeToLatin9(16#017E) -> 16#B8; % LATIN SMALL LETTER Z WITH CARON
+unicodeToLatin9(16#0152) -> 16#BC; % LATIN CAPITAL LIGATURE OE
+unicodeToLatin9(16#0153) -> 16#BD; % LATIN SMALL LIGATURE OE
+unicodeToLatin9(16#0178) -> 16#BE; % LATIN CAPITAL LETTER Y WITH DIAERESIS
+unicodeToLatin9(Char) -> Char.
+-endif.
+
-ifdef(LIST).
encode(List) ->
List.
@@ -1319,6 +1375,11 @@ decode(Bin) ->
Value.
-endif.
+-ifdef(LAT9).
+%% Turn a Latin-9 encoded binary into a list of Unicode code points by
+%% translating every byte with latin9toUnicode/1.
+decode(Bin) ->
+  Bytes = binary_to_list(Bin),
+  lists:map(fun latin9toUnicode/1, Bytes).
+-endif.
+
-ifdef(U16B).
decode(Bin) ->
{Value, _} = erlsom_ucs:from_utf16be(Bin),
Oops, something went wrong.

0 comments on commit 326e53e

Please sign in to comment.