Skip to content

Commit

Permalink
Merge pull request perl-pod#40 from grantm/encoding-heuristic
Browse files Browse the repository at this point in the history
Encoding heuristic should be applied to first non-ASCII byte sequence.
  • Loading branch information
theory committed Aug 28, 2012
2 parents 6b6793c + 097be32 commit 48d9e11
Show file tree
Hide file tree
Showing 2 changed files with 144 additions and 1 deletion.
2 changes: 1 addition & 1 deletion lib/Pod/Simple/BlackBox.pm
Expand Up @@ -410,7 +410,7 @@ sub _try_encoding_guess {

return unless $line =~ /[^\x00-\x7f]/; # Look for non-ASCII byte

- my $encoding = $line =~ /[\xC0-\xFD][\x80-\xBF]/ ? 'UTF-8' : 'ISO8859-1';
+ my $encoding = $line =~ /^[\x00-\x7f]*[\xC0-\xFD][\x80-\xBF]/ ? 'UTF-8' : 'ISO8859-1';
$self->_handle_encoding_line( "=encoding $encoding" );
$self->{'_transcoder'} && $self->{'_transcoder'}->($line);

Expand Down
143 changes: 143 additions & 0 deletions t/encod04.t
@@ -0,0 +1,143 @@
# The encoding detection heuristic will choose either UTF-8 or Latin-1
# (ISO8859-1). The current implementation will usually treat CP1252 (aka
# "Win-Latin-1") as Latin-1 but can be fooled into seeing it as UTF-8.
#
# Note 1: Neither guess is 'correct', since even if we choose Latin-1, all the
# smart quote symbols will be rendered as control characters.
#
# Note 2: The guess is only applied if the source POD omits =encoding, so
# CP1252 source will render correctly if properly declared.
#

# Compile-time setup: when the PERL_CORE environment variable is set
# (presumably by the perl core build -- confirm), switch into the t/
# directory and replace @INC so that modules are loaded from ../lib only.
BEGIN {
    if ( $ENV{PERL_CORE} ) {
        chdir 't';
        @INC = ('../lib');
    }
}

use strict;
use Test;
# Declare the number of tests at compile time, before any ok() call runs.
BEGIN { plan( tests => 5 ); }

# Trivial first test: passes as soon as the script has compiled and the plan
# has been declared.
ok(1);

use Pod::Simple::DumpAsXML;
use Pod::Simple::XMLOutStream;


# An initial, isolated non-ASCII byte (\x97) triggers the Latin-1 guess; the
# later \xC9\x92 pair, which resembles a multi-byte sequence, is not
# considered by the heuristic.

my @output_lines = split m/[\cm\cj]+/, Pod::Simple::XMLOutStream->_out( qq{
=head1 NAME
Em::Dash \x97 \x91CAF\xC9\x92
=cut
} );

# The guess is read back from a "Non-ASCII ... Assuming <encoding>" message
# expected somewhere in the parser output; $guess stays undefined when no
# such message is present.
my($guess) = "@output_lines" =~ m{Non-ASCII.*?Assuming ([\w-]+)};

if( !$guess ) {
    ok 0;
    print "# parser failed to detect non-ASCII bytes in input\n";
}
elsif( $guess ne 'ISO8859-1' ) {
    ok 0;
    print "# parser guessed wrong encoding expected 'ISO8859-1' got '$guess'\n";
}
elsif( grep m{Dash (?:\x97|&#x97;|&#151;)}, @output_lines ) {
    # Byte \x97 read as Latin-1 is a control character. Accept it either as
    # the raw byte or as a hexadecimal / decimal numeric character reference.
    ok 1;
}
else {
    ok 0;
    print "# failed to find expected control character in output\n";
}


# An initial smart-quote character (\x91) triggers the Latin-1 guess, as
# expected.

@output_lines = split m/[\cm\cj]+/, Pod::Simple::XMLOutStream->_out( qq{
=head1 NAME
Smart::Quote - \x91FUT\xC9\x92
=cut
} );

# Assign to the file-scoped $guess rather than redeclaring it: a second "my"
# in the same scope would only mask the earlier declaration. A failed match
# leaves $guess undefined.
($guess) = "@output_lines" =~ m{Non-ASCII.*?Assuming ([\w-]+)};

if( !$guess ) {
    ok 0;
    print "# parser failed to detect non-ASCII bytes in input\n";
}
elsif( $guess eq 'ISO8859-1' ) {
    ok 1;
}
else {
    ok 0;
    print "# parser guessed wrong encoding expected 'ISO8859-1' got '$guess'\n";
}


# An initial accented character (\xC9) followed by a 'smart' apostrophe
# (\x92) causes the heuristic to choose UTF-8 (a rather contrived example).

@output_lines = split m/[\cm\cj]+/, Pod::Simple::XMLOutStream->_out( qq{
=head1 NAME
Smart::Apostrophe::Fail - L\xC9\x92STRANGE
=cut
} );

# Assign to the file-scoped $guess rather than redeclaring it: a second "my"
# in the same scope would only mask the earlier declaration. A failed match
# leaves $guess undefined.
($guess) = "@output_lines" =~ m{Non-ASCII.*?Assuming ([\w-]+)};

if( !$guess ) {
    ok 0;
    print "# parser failed to detect non-ASCII bytes in input\n";
}
elsif( $guess eq 'UTF-8' ) {
    ok 1;
}
else {
    ok 0;
    print "# parser guessed wrong encoding expected 'UTF-8' got '$guess'\n";
}


# The previous example used a CP1252 byte sequence that also happened to be a
# valid UTF-8 byte sequence. In this example the heuristic also guesses
# 'wrong' even though the byte sequence is not valid UTF-8 (it is too short).
# This could arguably be 'fixed' by using a less naive regex.

@output_lines = split m/[\cm\cj]+/, Pod::Simple::XMLOutStream->_out( qq{
=head1 NAME
Smart::Apostrophe::Fail - L\xE9\x92Strange
=cut
} );

# Assign to the file-scoped $guess rather than redeclaring it: a second "my"
# in the same scope would only mask the earlier declaration. A failed match
# leaves $guess undefined.
($guess) = "@output_lines" =~ m{Non-ASCII.*?Assuming ([\w-]+)};

if( !$guess ) {
    ok 0;
    print "# parser failed to detect non-ASCII bytes in input\n";
}
elsif( $guess eq 'UTF-8' ) {
    # The test pins the current behaviour of the heuristic, not the
    # 'correct' answer.
    ok 1;
}
else {
    ok 0;
    print "# parser guessed wrong encoding expected 'UTF-8' got '$guess'\n";
}


exit;

0 comments on commit 48d9e11

Please sign in to comment.