Permalink
Browse files

add tests for encoding heuristic corner cases

These tests use undeclared CP1252 since a plausible Latin-1 POD source
that can be detected as UTF-8 is much harder to contrive.  This also
documents our expectation that 'smart quote' symbols in CP1252 will be
rendered as control character if no =encoding was declared.
  • Loading branch information...
1 parent 1cca188 commit 097be327efce0d025196382b6d6950b342647db1 @grantm grantm committed Aug 27, 2012
Showing with 143 additions and 0 deletions.
  1. +143 −0 t/encod04.t
View
143 t/encod04.t
@@ -0,0 +1,143 @@
+# The encoding detection heuristic will choose UTF8 or Latin-1. The current
+# implementation will usually treat CP1252 (aka "Win-Latin-1") as Latin-1 but
+# can be fooled into seeing it as UTF8.
+#
+# Note 1: Neither guess is 'correct' since even if we choose Latin-1, all the
+# smart quote symbols will be rendered as control characters
+#
+# Note 2: the guess is only applied if the source POD omits =encoding, so
+# CP1252 source will render correctly if properly declared
+#
+
+BEGIN {
+ if($ENV{PERL_CORE}) {
+ chdir 't';
+ @INC = '../lib';
+ }
+}
+
+use strict;
+use Test;
+BEGIN { plan tests => 5 };
+
+ok 1;
+
+use Pod::Simple::DumpAsXML;
+use Pod::Simple::XMLOutStream;
+
+
+# Initial, isolated, non-ASCII byte triggers Latin-1 guess and later
+# multi-byte sequence is not considered by heuristic.
+
+my @output_lines = split m/[\cm\cj]+/, Pod::Simple::XMLOutStream->_out( qq{
+
+=head1 NAME
+
+Em::Dash \x97 \x91CAF\xC9\x92
+
+=cut
+
+} );
+
+my($guess) = "@output_lines" =~ m{Non-ASCII.*?Assuming ([\w-]+)};
+if( $guess ) {
+ if( $guess eq 'ISO8859-1' ) {
+ if( grep m{Dash (\x97|—|—)}, @output_lines ) {
+ ok 1;
+ } else {
+ ok 0;
+ print "# failed to find expected control character in output\n"
+ }
+ } else {
+ ok 0;
+ print "# parser guessed wrong encoding expected 'ISO8859-1' got '$guess'\n";
+ }
+} else {
+ ok 0;
+ print "# parser failed to detect non-ASCII bytes in input\n";
+}
+
+
+# Initial smart-quote character triggers Latin-1 guess as expected
+
+@output_lines = split m/[\cm\cj]+/, Pod::Simple::XMLOutStream->_out( qq{
+
+=head1 NAME
+
+Smart::Quote - \x91FUT\xC9\x92
+
+=cut
+
+} );
+
+my($guess) = "@output_lines" =~ m{Non-ASCII.*?Assuming ([\w-]+)};
+if( $guess ) {
+ if( $guess eq 'ISO8859-1' ) {
+ ok 1;
+ } else {
+ ok 0;
+ print "# parser guessed wrong encoding expected 'ISO8859-1' got '$guess'\n";
+ }
+} else {
+ ok 0;
+ print "# parser failed to detect non-ASCII bytes in input\n";
+}
+
+
+# Initial accented character followed by 'smart' apostrophe causes heuristic
+# to choose UTF8 (a rather contrived example)
+
+@output_lines = split m/[\cm\cj]+/, Pod::Simple::XMLOutStream->_out( qq{
+
+=head1 NAME
+
+Smart::Apostrophe::Fail - L\xC9\x92STRANGE
+
+=cut
+
+} );
+
+my($guess) = "@output_lines" =~ m{Non-ASCII.*?Assuming ([\w-]+)};
+if( $guess ) {
+ if( $guess eq 'UTF-8' ) {
+ ok 1;
+ } else {
+ ok 0;
+ print "# parser guessed wrong encoding expected 'UTF-8' got '$guess'\n";
+ }
+} else {
+ ok 0;
+ print "# parser failed to detect non-ASCII bytes in input\n";
+}
+
+
+# The previous example used a CP1252 byte sequence that also happened to be a
+# valid UTF8 byte sequence. In this example the heuristic also guesses 'wrong'
+# despite the byte sequence not being valid UTF8 (it's too short). This could
+# arguably be 'fixed' by using a less naive regex.
+
+@output_lines = split m/[\cm\cj]+/, Pod::Simple::XMLOutStream->_out( qq{
+
+=head1 NAME
+
+Smart::Apostrophe::Fail - L\xE9\x92Strange
+
+=cut
+
+} );
+
+my($guess) = "@output_lines" =~ m{Non-ASCII.*?Assuming ([\w-]+)};
+if( $guess ) {
+ if( $guess eq 'UTF-8' ) {
+ ok 1;
+ } else {
+ ok 0;
+ print "# parser guessed wrong encoding expected 'UTF-8' got '$guess'\n";
+ }
+} else {
+ ok 0;
+ print "# parser failed to detect non-ASCII bytes in input\n";
+}
+
+
+exit;

0 comments on commit 097be32

Please sign in to comment.