Permalink
Browse files

Merge pull request #37 from grantm/expect-chars

Skip decoding on strings with the utf8 flag set and add the 'parse_characters' option.
  • Loading branch information...
theory committed Aug 14, 2012
2 parents 29306b7 + 0425bad commit 80eec155fc64989d97928982d66f0f556f4fec77
Showing with 89 additions and 2 deletions.
  1. +2 −0 lib/Pod/Simple.pm
  2. +25 −0 lib/Pod/Simple.pod
  3. +3 −1 lib/Pod/Simple/BlackBox.pm
  4. +1 −1 lib/Pod/Simple/TranscodeSmart.pm
  5. +58 −0 t/enc-chars.t
View
@@ -87,6 +87,8 @@ __PACKAGE__->_accessorize(
'preserve_whitespace', # whether to try to keep whitespace as-is
'strip_verbatim_indent', # What indent to strip from verbatim
+ 'parse_characters', # Whether parser should expect chars rather than octets
+
'content_seen', # whether we've seen any real Pod content
'errors_seen', # TODO: document. whether we've seen any errors (fatal or not)
View
@@ -14,6 +14,8 @@ documentation") markup language that is typically used for writing
documentation for Perl and for Perl modules. The Pod format is explained in
L<perlpod>; the most common formatter is called C<perldoc>.
+Be sure to read L</ENCODING> if your Pod contains non-ASCII characters.
+
Pod formatters can use Pod::Simple to parse Pod documents and render them into
plain text, HTML, or any number of other formats. Typically, such formatters
will be subclasses of Pod::Simple, and so they will inherit its methods, like
@@ -121,6 +123,14 @@ most likely to use.
=over
+=item C<< $parser->parse_characters( I<SOMEVALUE> ) >>
+
+The Pod parser normally expects to read octets and to convert those octets
+to characters based on the C<=encoding> declaration in the Pod source. Set
+this option to a true value to indicate that the Pod source is already a Perl
+character stream. This tells the parser to ignore any C<=encoding> command
+and to skip all the code paths involving decoding octets.
+
=item C<< $parser->no_whining( I<SOMEVALUE> ) >>
If you set this attribute to a true value, you will suppress the
@@ -322,6 +332,21 @@ Log an error unless C<< $parser->no_whining( TRUE ); >>.
=back
+=head1 ENCODING
+
+The Pod::Simple parser expects to read B<octets>. The parser will decode the
+octets into Perl's internal character string representation using the value of
+the C<=encoding> declaration in the Pod source.
+
+If the Pod source does not include an C<=encoding> declaration, the parser will
+attempt to guess the encoding (selecting either UTF-8 or Latin-1) by examining
+the first non-ASCII bytes and applying the heuristic described in
+L<perlpodspec>.
+
+If you set the C<parse_characters> option to a true value, the parser will
+expect characters rather than octets, will ignore any C<=encoding>
+declaration, and will make no attempt to decode the input.
+
=head1 CAVEATS
This is just a beta release -- there are a good number of things still
@@ -123,7 +123,7 @@ sub parse_lines { # Usage: $parser->parse_lines(@lines)
}
}
- if(!$self->{'encoding'}) {
+ if(!$self->parse_characters && !$self->{'encoding'}) {
$self->_try_encoding_guess($line)
}
@@ -272,6 +272,8 @@ sub parse_lines { # Usage: $parser->parse_lines(@lines)
sub _handle_encoding_line {
my($self, $line) = @_;
+ return if $self->parse_characters;
+
# The point of this routine is to set $self->{'_transcoder'} as indicated.
return $line unless $line =~ m/^=encoding\s+(\S+)\s*$/s;
@@ -32,7 +32,7 @@ sub make_transcoder {
my $x;
return sub {
foreach $x (@_) {
- $x = Encode::decode($e, $x);
+ $x = Encode::decode($e, $x) unless Encode::is_utf8($x);
}
return;
};
View
@@ -0,0 +1,58 @@
+# tell parser the source POD has already been decoded from bytes to chars
+# =encoding line should be ignored
+# utf8 characters should come through unscathed
+
+BEGIN {
+  # When run from the Perl core tree, tests execute from t/ against ../lib.
+  if($ENV{PERL_CORE}) {
+    chdir 't';
+    @INC = '../lib';
+  }
+}
+
+use strict;
+use Test;
+BEGIN { plan tests => 3 };
+
+use Pod::Simple::DumpAsXML;
+use Pod::Simple::XMLOutStream;
+
+
+# Feed the parser a character string (note the \x{...} escapes above 0xFF)
+# with parse_characters enabled, collecting the XML output in $output.
+# The =encoding value is deliberately bogus: it must be ignored, not looked up.
+my $parser = Pod::Simple::XMLOutStream->new;
+$parser->parse_characters(1);
+my $output = '';
+$parser->output_string( \$output );
+$parser->parse_string_document(qq{
+
+=encoding bogocode
+
+=head1 DESCRIPTION
+
+Confirm that if we tell the parser to expect character data, it avoids all
+the code paths that might attempt to decode the source from bytes to chars.
+
+The r\x{101}in in \x{15E}pain \x{FB02}oods the plain
+
+});
+
+# Test 1: parsed without exception
+ok(1);
+
+# Test 2: no errors were reported for the bogus =encoding line.
+# The explicit ?: keeps the argument a plain scalar regardless of context.
+ok( $output =~ /POD ERRORS/ ? 0 : 1 );
+
+# The XML output stream emits non-ASCII characters as numeric entities;
+# turn them back into characters before comparing against the source text.
+$output =~ s{&#(\d+);}{chr($1)}eg;
+
+# Test 3: data was not messed up
+ok( $output =~ /The r\x{101}in in \x{15E}pain \x{FB02}oods the plain/ ? 1 : 0 );
+
+exit;

0 comments on commit 80eec15

Please sign in to comment.