implement 'parse_characters' option

This option allows the user to supply POD source that has already been decoded to Perl's internal character format
theory · Aug 11, 2012 · 0425bad · 0425bad
1 parent 18c3e79
commit 0425bad
Show file tree

Hide file tree

Showing 4 changed files with 75 additions and 1 deletion.
diff --git a/lib/Pod/Simple.pm b/lib/Pod/Simple.pm
@@ -87,6 +87,8 @@ __PACKAGE__->_accessorize(
   'preserve_whitespace', # whether to try to keep whitespace as-is
   'strip_verbatim_indent', # What indent to strip from verbatim
 
+  'parse_characters',  # Whether parser should expect chars rather than octets
+
  'content_seen',      # whether we've seen any real Pod content
  'errors_seen',       # TODO: document.  whether we've seen any errors (fatal or not)
 

diff --git a/lib/Pod/Simple.pod b/lib/Pod/Simple.pod
@@ -123,6 +123,14 @@ most likely to use.
 
 =over
 
+=item C<< $parser->parse_characters( I<SOMEVALUE> ) >>
+
+The Pod parser normally expects to read octets and to convert those octets
+to characters based on the C<=encoding> declaration in the Pod source.  Set
+this option to a true value to indicate that the Pod source is already a Perl
+character stream.  This tells the parser to ignore any C<=encoding> command
+and to skip all the code paths involving decoding octets.
+
 =item C<< $parser->no_whining( I<SOMEVALUE> ) >>
 
 If you set this attribute to a true value, you will suppress the
@@ -335,6 +343,10 @@ attempt to guess the encoding (selecting one of UTF-8 or Latin-1) by examining
 the first non-ASCII bytes and applying the heuristic described in
 L<perlpodspec>.
 
+If you set the C<parse_characters> option to a true value, the parser will
+expect characters rather than octets, will ignore any C<=encoding>
+declaration, and will make no attempt to decode the input.
+
 =head1 CAVEATS
 
 This is just a beta release -- there are a good number of things still

diff --git a/lib/Pod/Simple/BlackBox.pm b/lib/Pod/Simple/BlackBox.pm
@@ -123,7 +123,7 @@ sub parse_lines {             # Usage: $parser->parse_lines(@lines)
       }
     }
 
-    if(!$self->{'encoding'}) {
+    if(!$self->parse_characters && !$self->{'encoding'}) {
       $self->_try_encoding_guess($line)
     }
 
@@ -272,6 +272,8 @@ sub parse_lines {             # Usage: $parser->parse_lines(@lines)
 sub _handle_encoding_line {
   my($self, $line) = @_;
 
+  return if $self->parse_characters;
+
   # The point of this routine is to set $self->{'_transcoder'} as indicated.
 
   return $line unless $line =~ m/^=encoding\s+(\S+)\s*$/s;

diff --git a/t/enc-chars.t b/t/enc-chars.t
@@ -0,0 +1,58 @@
+# Tell the parser that the source POD has already been decoded from bytes
+# to characters.  The =encoding line should then be ignored, and non-ASCII
+# characters should come through unscathed.
+
+BEGIN {
+    if($ENV{PERL_CORE}) {
+        chdir 't';
+        @INC = '../lib';
+    }
+}
+
+use strict;
+use Test;
+BEGIN { plan tests => 3 };
+
+use Pod::Simple::DumpAsXML;
+use Pod::Simple::XMLOutStream;
+
+
+my $parser = Pod::Simple::XMLOutStream->new;
+$parser->parse_characters(1);
+my $output = '';
+$parser->output_string( \$output );
+$parser->parse_string_document(qq{
+
+=encoding bogocode
+
+=head1 DESCRIPTION
+
+Confirm that if we tell the parser to expect character data, it avoids all
+the code paths that might attempt to decode the source from bytes to chars.
+
+The r\x{101}in in \x{15E}pain \x{FB02}oods the plain
+
+});
+
+ok(1); # parsed without exception
+
+# The bogus =encoding line must not have produced a "POD ERRORS" section.
+if($output =~ /POD ERRORS/) {
+  ok(0);
+}
+else {
+  ok(1); # no errors
+}
+
+# Turn decimal numeric character references (&#NNN;) in the output back
+# into characters so the output can be matched against the source text.
+$output =~ s{&#(\d+);}{chr($1)}eg;
+
+if($output =~ /The r\x{101}in in \x{15E}pain \x{FB02}oods the plain/) {
+  ok(1); # data was not messed up
+}
+else {
+  ok(0);
+}
+
+exit;