Merge pull request perl-pod#40 from grantm/encoding-heuristic

Encoding heuristic should be applied to first non-ASCII byte sequence.
theory · Aug 28, 2012 · 48d9e11 · 48d9e11
2 parents 6b6793c + 097be32
commit 48d9e11
Show file tree

Hide file tree

Showing 2 changed files with 144 additions and 1 deletion.
diff --git a/lib/Pod/Simple/BlackBox.pm b/lib/Pod/Simple/BlackBox.pm
@@ -410,7 +410,7 @@ sub _try_encoding_guess {
 
   return unless $line =~ /[^\x00-\x7f]/;  # Look for non-ASCII byte
 
-  my $encoding = $line =~ /[\xC0-\xFD][\x80-\xBF]/ ? 'UTF-8' : 'ISO8859-1';
+  my $encoding = $line =~ /^[\x00-\x7f]*[\xC0-\xFD][\x80-\xBF]/ ? 'UTF-8' : 'ISO8859-1';
   $self->_handle_encoding_line( "=encoding $encoding" );
   $self->{'_transcoder'} && $self->{'_transcoder'}->($line);
 

diff --git a/t/encod04.t b/t/encod04.t
@@ -0,0 +1,143 @@
+# The encoding detection heuristic will choose UTF8 or Latin-1.  The current
+# implementation will usually treat CP1252 (aka "Win-Latin-1") as Latin-1 but
+# can be fooled into seeing it as UTF8.
+#
+# Note 1: Neither guess is 'correct' since even if we choose Latin-1, all the
+#         smart quote symbols will be rendered as control characters
+#
+# Note 2: the guess is only applied if the source POD omits =encoding, so
+#         CP1252 source will render correctly if properly declared
+#
+
+BEGIN {
+    if($ENV{PERL_CORE}) {
+        chdir 't';
+        @INC = '../lib';
+    }
+}
+
+use strict;
+use Test;
+BEGIN { plan tests => 5 };
+
+ok 1;
+
+use Pod::Simple::DumpAsXML;
+use Pod::Simple::XMLOutStream;
+
+
+# Initial, isolated, non-ASCII byte triggers Latin-1 guess and later
+# multi-byte sequence is not considered by heuristic.
+
+my @output_lines = split m/[\cm\cj]+/, Pod::Simple::XMLOutStream->_out( qq{
+
+=head1 NAME
+
+Em::Dash \x97 \x91CAF\xC9\x92
+
+=cut
+
+} );
+
+my($guess) = "@output_lines" =~ m{Non-ASCII.*?Assuming ([\w-]+)};
+if( $guess ) {
+  if( $guess eq 'ISO8859-1' ) {
+    if( grep m{Dash (\x97|&#x97;|&#151;)}, @output_lines ) {
+      ok 1;
+    } else {
+      ok 0;
+      print "# failed to find expected control character in output\n"
+    }
+  } else {
+    ok 0;
+    print "# parser guessed wrong encoding expected 'ISO8859-1' got '$guess'\n";
+  }
+} else {
+  ok 0;
+  print "# parser failed to detect non-ASCII bytes in input\n";
+}
+
+
+# Initial smart-quote character triggers Latin-1 guess as expected
+
+@output_lines = split m/[\cm\cj]+/, Pod::Simple::XMLOutStream->_out( qq{
+
+=head1 NAME
+
+Smart::Quote - \x91FUT\xC9\x92
+
+=cut
+
+} );
+
+my($guess) = "@output_lines" =~ m{Non-ASCII.*?Assuming ([\w-]+)};
+if( $guess ) {
+  if( $guess eq 'ISO8859-1' ) {
+    ok 1;
+  } else {
+    ok 0;
+    print "# parser guessed wrong encoding expected 'ISO8859-1' got '$guess'\n";
+  }
+} else {
+  ok 0;
+  print "# parser failed to detect non-ASCII bytes in input\n";
+}
+
+
+# Initial accented character followed by 'smart' apostrophe causes heuristic
+# to choose UTF8 (a rather contrived example)
+
+@output_lines = split m/[\cm\cj]+/, Pod::Simple::XMLOutStream->_out( qq{
+
+=head1 NAME
+
+Smart::Apostrophe::Fail - L\xC9\x92STRANGE
+
+=cut
+
+} );
+
+my($guess) = "@output_lines" =~ m{Non-ASCII.*?Assuming ([\w-]+)};
+if( $guess ) {
+  if( $guess eq 'UTF-8' ) {
+    ok 1;
+  } else {
+    ok 0;
+    print "# parser guessed wrong encoding expected 'UTF-8' got '$guess'\n";
+  }
+} else {
+  ok 0;
+  print "# parser failed to detect non-ASCII bytes in input\n";
+}
+
+
+# The previous example used a CP1252 byte sequence that also happened to be a
+# valid UTF8 byte sequence.  In this example the heuristic also guesses 'wrong'
+# despite the byte sequence not being valid UTF8 (it's too short).  This could
+# arguably be 'fixed' by using a less naive regex.
+
+@output_lines = split m/[\cm\cj]+/, Pod::Simple::XMLOutStream->_out( qq{
+
+=head1 NAME
+
+Smart::Apostrophe::Fail - L\xE9\x92Strange
+
+=cut
+
+} );
+
+my($guess) = "@output_lines" =~ m{Non-ASCII.*?Assuming ([\w-]+)};
+if( $guess ) {
+  if( $guess eq 'UTF-8' ) {
+    ok 1;
+  } else {
+    ok 0;
+    print "# parser guessed wrong encoding expected 'UTF-8' got '$guess'\n";
+  }
+} else {
+  ok 0;
+  print "# parser failed to detect non-ASCII bytes in input\n";
+}
+
+
+exit;