Skip to content

Commit

Permalink
Added unicode encoding/decoding from Mojo::JSON
Browse files Browse the repository at this point in the history
  • Loading branch information
vti committed Apr 21, 2010
1 parent 50ac472 commit 738d829
Show file tree
Hide file tree
Showing 2 changed files with 90 additions and 47 deletions.
42 changes: 41 additions & 1 deletion lib/Mojo/JSON/XS.pm
Expand Up @@ -6,11 +6,35 @@ use warnings;
use base 'Mojo::Base';

use JSON::XS;
use Mojo::ByteStream 'b';

# Literal names
#
# Package-level objects standing in for the JSON literals "false" and "true".
# Each is a Mojo::JSON::XS::_Bool built from 0 or 1 respectively; the false()
# accessor further down hands back $FALSE.
our $FALSE = Mojo::JSON::XS::_Bool->new(0);
our $TRUE = Mojo::JSON::XS::_Bool->new(1);

# Byte order marks
#
# Precompiled pattern matching the byte order mark of each encoding handled
# by this module, written as octal byte escapes. decode() strips a leading
# match from its input before sniffing the encoding.
#
# The order of the alternatives matters: the UTF-32LE mark (\377\376\0\0)
# has to be tried before the UTF-16LE mark (\377\376), which is a prefix of
# it, otherwise only half of a UTF-32LE mark would be removed.
my $BOM_RE = qr/
(?:
\357\273\277 # UTF-8
|
\377\376\0\0 # UTF-32LE
|
\0\0\376\377 # UTF-32BE
|
\376\377 # UTF-16BE
|
\377\376 # UTF-16LE
)
/x;

# Unicode encoding detection
#
# Maps a regex fragment to the name of the encoding it indicates. decode()
# matches each key against the start of the BOM-stripped input and keeps
# 'UTF-8' when none of them match.
#
# Every key describes where NUL bytes fall within the first four bytes of the
# input. The four layouts are mutually exclusive, so at most one key can
# match and the unordered iteration over the hash keys is harmless.
my $UTF_PATTERNS = {
"\0\0\0[^\0]" => 'UTF-32BE',
"\0[^\0]\0[^\0]" => 'UTF-16BE',
"[^\0]\0\0\0" => 'UTF-32LE',
"[^\0]\0[^\0]\0" => 'UTF-16LE'
};

# Private '_jsonxs' attribute: its default comes from a callback that builds
# a JSON::XS object with convert_blessed switched on. Note that ->utf8 is not
# enabled on it.
__PACKAGE__->attr('_jsonxs' => sub { JSON::XS->new->convert_blessed(1) });

# 'error' attribute: decode() resets it to undef before doing any work.
# NOTE(review): presumably it carries the message of the last failed decode;
# the code that sets it is not visible in this hunk -- confirm.
__PACKAGE__->attr('error');

Expand All @@ -23,6 +47,19 @@ sub decode {
# Cleanup
$self->error(undef);

# Remove BOM
$string =~ s/^$BOM_RE//go;

# Detect and decode unicode
my $encoding = 'UTF-8';
for my $pattern (keys %$UTF_PATTERNS) {
if ($string =~ /^$pattern/) {
$encoding = $UTF_PATTERNS->{$pattern};
last;
}
}
$string = b($string)->decode($encoding)->to_string;

my $result;

eval {$result = $self->_jsonxs->decode($string);};
Expand All @@ -38,7 +75,10 @@ sub decode {
# Serialize a Perl data structure to JSON and return it UTF-8 encoded.
#
# Parameters:
#   $ref - the data structure handed to JSON::XS for serialization.
#
# Returns the JSON text after it has been run through
# Mojo::ByteStream's encode('UTF-8').
sub encode {
    my ($self, $ref) = @_;

    # The stale "return $self->_jsonxs->encode($ref);" that used to sit here
    # left before the encoding step below could run, so it has been dropped.
    my $string = $self->_jsonxs->encode($ref);

    # Unicode: the JSON::XS object is created without ->utf8, so (per the
    # JSON::XS documentation) its output is a character string that still
    # has to be turned into UTF-8 bytes.
    return b($string)->encode('UTF-8')->to_string;
}

# Accessor for the package-level object that represents the JSON literal
# "false".
sub false { return $FALSE }
Expand Down
95 changes: 49 additions & 46 deletions t/json.t
Expand Up @@ -9,7 +9,7 @@ use Test::More;

plan skip_all => 'JSON::XS is required for this test'
unless eval { require JSON::XS; 1 };
plan tests => 70;
plan tests => 82;

use Mojo::ByteStream 'b';

Expand Down Expand Up @@ -44,9 +44,9 @@ $array = $json->decode('[10e12 , [2 ]]');
#is_deeply($array, ['10e12', [2]], 'decode [10e12 , [2 ]]');
#$array = $json->decode('[37.7668 , [ 20 ]] ');
#is_deeply($array, [37.7668, [20]], 'decode [37.7668 , [ 20 ]] ');
#$array = $json->decode('[1e3]');
#isa_ok($array, 'ARRAY', 'decode [1e3]');
#cmp_ok($array->[0], '==', 1e3, 'value is 1e3');
$array = $json->decode('[1e3]');
isa_ok($array, 'ARRAY', 'decode [1e3]');
cmp_ok($array->[0], '==', 1e3, 'value is 1e3');

# Decode name
$array = $json->decode('[true]');
Expand Down Expand Up @@ -136,11 +136,11 @@ $string = $json->encode(["hello\nworld!"]);
is($string, '["hello\nworld!"]', 'encode ["hello\nworld!"]');
$string = $json->encode(["hello\t\"world!"]);
is($string, '["hello\t\"world!"]', 'encode ["hello\t\"world!"]');
#$string = $json->encode(["hello\x{0003}\x{0152}world\x{0152}!"]);
#is( b($string)->decode('UTF-8'),
# "[\"hello\\u0003\x{0152}world\x{0152}!\"]",
# 'encode ["hello\x{0003}\x{0152}world\x{0152}!"]'
#);
$string = $json->encode(["hello\x{0003}\x{0152}world\x{0152}!"]);
is( b($string)->decode('UTF-8'),
"[\"hello\\u0003\x{0152}world\x{0152}!\"]",
'encode ["hello\x{0003}\x{0152}world\x{0152}!"]'
);
$string = $json->encode(["123abc"]);
is($string, '["123abc"]', 'encode ["123abc"]');

Expand Down Expand Up @@ -182,56 +182,59 @@ is($string, '[37.7668,[20]]', 'encode [37.7668, [20]]');
#$array = $json->decode($string);
#is_deeply($array, ["\x{10346}"], 'successful roundtrip');

## Decode UTF-16LE
#$array = $json->decode(b("\x{feff}[true]")->encode('UTF-16LE'));
# Decode UTF-16LE
$array = $json->decode(b("\x{feff}[true]")->encode('UTF-16LE'));
#is_deeply($array, [$json->true], 'decode \x{feff}[true]');
#
## Decode UTF-16LE with faihu surrogate pair
#$array = $json->decode(b("\x{feff}[\"\\ud800\\udf46\"]")->encode('UTF-16LE'));
#is_deeply($array, ["\x{10346}"], 'decode \x{feff}[\"\\ud800\\udf46\"]');
#
## Decode UTF-16LE with faihu surrogate pair and BOM value
#$array = $json->decode(
# b("\x{feff}[\"\\ud800\\udf46\x{feff}\"]")->encode('UTF-16LE'));
#is_deeply($array, ["\x{10346}\x{feff}"],
# 'decode \x{feff}[\"\\ud800\\udf46\x{feff}\"]');
#
is($array->[0], $json->true);

# Decode UTF-16LE with faihu surrogate pair
$array = $json->decode(b("\x{feff}[\"\\ud800\\udf46\"]")->encode('UTF-16LE'));
is_deeply($array, ["\x{10346}"], 'decode \x{feff}[\"\\ud800\\udf46\"]');

# Decode UTF-16LE with faihu surrogate pair and BOM value
$array = $json->decode(
b("\x{feff}[\"\\ud800\\udf46\x{feff}\"]")->encode('UTF-16LE'));
is_deeply($array, ["\x{10346}\x{feff}"],
'decode \x{feff}[\"\\ud800\\udf46\x{feff}\"]');

## Decode UTF-16LE with missing low surrogate
#$array = $json->decode(b("\x{feff}[\"\\ud800\"]")->encode('UTF-16LE'));
#is_deeply($array, ['\ud800'], 'decode \x{feff}[\"\\ud800\"]');
#

## Decode UTF-16LE with missing high surrogate
#$array = $json->decode(b("\x{feff}[\"\\udf46\"]")->encode('UTF-16LE'));
#is_deeply($array, ['\udf46'], 'decode \x{feff}[\"\\udf46\"]');
#
## Decode UTF-16BE with faihu surrogate pair
#$array = $json->decode(b("\x{feff}[\"\\ud800\\udf46\"]")->encode('UTF-16BE'));
#is_deeply($array, ["\x{10346}"], 'decode \x{feff}[\"\\ud800\\udf46\"]');
#
## Decode UTF-32LE
#$array = $json->decode(b("\x{feff}[true]")->encode('UTF-32LE'));

# Decode UTF-32LE
$array = $json->decode(b("\x{feff}[true]")->encode('UTF-32LE'));
#is_deeply($array, [$json->true], 'decode \x{feff}[true]');
#
## Decode UTF-32BE
#$array = $json->decode(b("\x{feff}[true]")->encode('UTF-32BE'));
is($array->[0], $json->true);

# Decode UTF-32BE
$array = $json->decode(b("\x{feff}[true]")->encode('UTF-32BE'));
#is_deeply($array, [$json->true], 'decode \x{feff}[true]');
#
## Decode UTF-16LE without BOM
#$array = $json->decode(b("[\"\\ud800\\udf46\"]")->encode('UTF-16LE'));
#is_deeply($array, ["\x{10346}"], 'decode [\"\\ud800\\udf46\"]');
#
## Decode UTF-16BE without BOM
#$array = $json->decode(b("[\"\\ud800\\udf46\"]")->encode('UTF-16BE'));
#is_deeply($array, ["\x{10346}"], 'decode [\"\\ud800\\udf46\"]');
#
## Decode UTF-32LE without BOM
#$array = $json->decode(b("[\"\\ud800\\udf46\"]")->encode('UTF-32LE'));
#is_deeply($array, ["\x{10346}"], 'decode [\"\\ud800\\udf46\"]');
#
## Decode UTF-32BE without BOM
#$array = $json->decode(b("[\"\\ud800\\udf46\"]")->encode('UTF-32BE'));
#is_deeply($array, ["\x{10346}"], 'decode [\"\\ud800\\udf46\"]');
#
is($array->[0], $json->true);

# Decode UTF-16LE without BOM
$array = $json->decode(b("[\"\\ud800\\udf46\"]")->encode('UTF-16LE'));
is_deeply($array, ["\x{10346}"], 'decode [\"\\ud800\\udf46\"]');

# Decode UTF-16BE without BOM
$array = $json->decode(b("[\"\\ud800\\udf46\"]")->encode('UTF-16BE'));
is_deeply($array, ["\x{10346}"], 'decode [\"\\ud800\\udf46\"]');

# Decode UTF-32LE without BOM
$array = $json->decode(b("[\"\\ud800\\udf46\"]")->encode('UTF-32LE'));
is_deeply($array, ["\x{10346}"], 'decode [\"\\ud800\\udf46\"]');

# Decode UTF-32BE without BOM
$array = $json->decode(b("[\"\\ud800\\udf46\"]")->encode('UTF-32BE'));
is_deeply($array, ["\x{10346}"], 'decode [\"\\ud800\\udf46\"]');

# Complicated roundtrips
$string = '[null,false,true,"",0,1]';
$array = $json->decode($string);
Expand Down

0 comments on commit 738d829

Please sign in to comment.