Permalink
Switch branches/tags
Find file Copy path
Fetching contributors…
Cannot retrieve contributors at this time
executable file 3854 lines (3308 sloc) 125 KB
#!/usr/bin/perl -T
#
# W3C Markup Validation Service
# A CGI script to retrieve and validate a markup file
#
# Copyright 1995-2013 World Wide Web Consortium, (Massachusetts
# Institute of Technology, European Research Consortium for Informatics
# and Mathematics, Keio University). All Rights Reserved.
#
# Originally written by Gerald Oskoboiny <gerald@w3.org>
# for additional contributors, see
# http://dvcs.w3.org/hg/markup-validator/shortlog/tip and
# https://validator.w3.org/about.html#credits
#
# This source code is available under the license at:
# http://www.w3.org/Consortium/Legal/copyright-software
#
# We need Perl 5.8.0+.
use 5.008;
###############################################################################
#### Load modules. ############################################################
###############################################################################
#
# Pragmas.
use strict;
# enable warnings when not using in production
#use warnings;
use utf8;
package W3C::Validator::MarkupValidator;
#
# Modules. See also the BEGIN block further down below.
#
# Version numbers given where we absolutely need a minimum version of a given
# module (gives nicer error messages). By default, add an empty import list
# when loading modules to prevent non-OO or poorly written modules from
# polluting our namespace.
#
# Need 3.40 for query string and path info fixes, #4365
use CGI 3.40 qw(-newstyle_urls -private_tempfiles redirect);
use CGI::Carp qw(carp croak fatalsToBrowser);
use Config qw(%Config);
use Config::General 2.32 qw(); # Need 2.32 for <msg 0>, rt.cpan.org#17852
use Encode qw();
use Encode::Alias qw();
use Encode::HanExtra qw(); # for some chinese character encodings,
# e.g gb18030
use File::Spec::Functions qw(catfile rel2abs tmpdir);
use HTTP::Cookies qw();
use HTML::Encoding 0.52 qw();
use HTML::HeadParser 3.60 qw(); # Needed for HTML5 meta charset workaround
use HTML::Parser 3.24 qw(); # Need 3.24 for $p->parse($code_ref)
use HTML::Template qw(); # Need 2.6 for path param, other things.
# Specifying 2.6 would break with 2.10,
# rt.cpan.org#70190
use HTTP::Headers::Util qw();
use HTTP::Message 1.52 qw(); # Need 1.52 for decoded_content()
use HTTP::Request qw();
use HTTP::Headers::Auth qw(); # Needs to be imported after other HTTP::*.
use JSON 2.00 qw();
use SGML::Parser::OpenSP 0.991 qw();
use URI 1.53 qw(); # Need 1.53 for secure()
use URI::Escape qw(uri_escape);
use URI::file;
use URI::Heuristic qw();
###############################################################################
#### Constant definitions. ####################################################
###############################################################################
#
# Define global constants
use constant TRUE => 1;
use constant FALSE => 0;
#
# Tentative Validation Severities.
use constant T_WARN => 4; # 0000 0100
use constant T_ERROR => 8; # 0000 1000
#
# Define global variables.
use vars qw($DEBUG $CFG %RSRC $VERSION);
$VERSION = '1.3';
use constant IS_MODPERL2 =>
(exists($ENV{MOD_PERL_API_VERSION}) && $ENV{MOD_PERL_API_VERSION} >= 2);
#
# Things inside BEGIN don't happen on every request in persistent environments
# (such as mod_perl); so let's do the globals, eg. read config, here.
BEGIN {
my $base = $ENV{W3C_VALIDATOR_HOME} || '/usr/local/validator';
# Launder data for -T; -AutoLaunder doesn't catch this one.
if ($base =~ /^(.*)$/) {
$base = $1;
}
#
# Read Config Files.
eval {
my %config_opts = (
-ConfigFile =>
($ENV{W3C_VALIDATOR_CFG} || '/etc/w3c/validator.conf'),
-MergeDuplicateOptions => TRUE,
-MergeDuplicateBlocks => TRUE,
-SplitPolicy => 'equalsign',
-UseApacheInclude => TRUE,
-IncludeRelative => TRUE,
-InterPolateVars => TRUE,
-AutoLaunder => TRUE,
-AutoTrue => TRUE,
-CComments => FALSE,
-DefaultConfig => {
Protocols => {Allow => 'http,https'},
Paths => {
Base => $base,
Cache => '',
},
External => {HTML5 => FALSE,},
},
);
my %cfg = Config::General->new(%config_opts)->getall();
$CFG = \%cfg;
};
if ($@) {
die <<"EOF";
Could not read configuration. Set the W3C_VALIDATOR_CFG environment variable
or copy conf/* to /etc/w3c/. Make sure that the configuration file and all
included files are readable by the web server user. The error was:\n'$@'
EOF
}
#
# Check paths in config
# @@FIXME: This does not do a very good job error-message-wise if
# a path is missing...
{
my %paths = map { $_ => [-d $_, -r _] } $CFG->{Paths}->{Base},
$CFG->{Paths}->{Templates}, $CFG->{Paths}->{SGML}->{Library};
my @_d = grep { not $paths{$_}->[0] } keys %paths;
my @_r = grep { not $paths{$_}->[1] } keys %paths;
die "Does not exist or is not a directory: @_d\n" if scalar(@_d);
die "Directory not readable (permission denied): @_r\n" if scalar(@_r);
}
#
# Split allowed protocols into a list.
if (my $allowed = delete($CFG->{Protocols}->{Allow})) {
$CFG->{Protocols}->{Allow} = [split(/\s*,\s*/, $allowed)];
}
# Split available languages into a list
if (my $langs = delete($CFG->{Languages})) {
$CFG->{Languages} = [split(/\s+/, $langs)];
}
else {
# Default to english
$CFG->{Languages} = ["en"];
}
{ # Make types config indexed by FPI.
my $types = {};
while (my ($key, $value) = each %{$CFG->{Types}}) {
$types->{$CFG->{Types}->{$key}->{PubID}} = $value;
}
$CFG->{Types} = $types;
}
#
# Change strings to internal constants in MIME type mapping.
while (my ($key, $value) = each %{$CFG->{MIME}}) {
$CFG->{MIME}->{$key} = 'TBD'
unless ($value eq 'SGML' || $value eq 'XML');
}
#
# Register Encode aliases.
while (my ($key, $value) = each %{$CFG->{Charsets}}) {
Encode::Alias::define_alias($key, $1) if ($value =~ /^[AX] (\S+)/);
}
#
# Set debug flag.
if ($CFG->{'Allow Debug'}) {
$DEBUG = TRUE if $ENV{W3C_VALIDATOR_DEBUG} || $CFG->{'Enable Debug'};
}
else {
$DEBUG = FALSE;
}
# Read friendly error message file
# 'en_US' should be replaced by $lang for lang-neg
%RSRC = Config::General->new(
-MergeDuplicateBlocks => 1,
-ConfigFile =>
catfile($CFG->{Paths}->{Templates}, 'en_US', 'error_messages.cfg'),
)->getall();
eval {
local $SIG{__DIE__} = undef;
require Encode::JIS2K; # for optional extra Japanese encodings
};
# Tell libxml to load _only_ our XML catalog. This is because our entity
# load jailing may trap the libxml internal default catalog (which is
# automatically loaded). Preventing loading that from the input callback
# will cause libxml to not see the document content at all but to throw
# weird "Document is empty" errors, at least as of XML::LibXML 1.70 and
# libxml 2.7.7. XML_CATALOG_FILES needs to be in effect at XML::LibXML
# load time which is why we're using "require" here instead of pulling it
# in with "use" as usual. And finally, libxml should have support for
# SGML open catalogs but they don't seem to work (again as of 1.70 and
# 2.7.7); if we use xml.soc here, no entities seem to end up being resolved
# from it - so we use a (redundant) XML catalog which works.
# Note that setting XML_CATALOG_FILES here does not seem to work with
# mod_perl (it doesn't end up being used by XML::LibXML), therefore we do
# it in the mod_perl/startup.pl startup file for it too.
local $ENV{XML_CATALOG_FILES} =
catfile($CFG->{Paths}->{SGML}->{Library}, 'catalog.xml');
require XML::LibXML;
XML::LibXML->VERSION(1.73); # Need 1.73 for rt.cpan.org #66642
} # end of BEGIN block.
#
# Get rid of (possibly insecure) $PATH.
delete $ENV{PATH};
#@@DEBUG: Dump $CFG datastructure. Used only as a developer aid.
#use Data::Dumper qw(Dumper);
#print Dumper($CFG);
#exit;
#@@DEBUG;
###############################################################################
#### Process CGI variables and initialize. ####################################
###############################################################################
#
# Create a new CGI object.
my $q = CGI->new();
#
# The data structure that will hold all session data.
# @@FIXME This can't be my() as $File will sooner or
# later be undef and add_warning will cause the script
# to die. our() seems to work but has other problems.
# @@FIXME Apparently, this must be set to {} also,
# otherwise the script might pick up an old object
# after abort_if_error_flagged under mod_perl.
our $File = {};
#################################
# Initialize the datastructure. #
#################################
#
# Charset data (casing policy: lowercase early).
$File->{Charset}->{Use} = ''; # The charset used for validation.
$File->{Charset}->{Auto} = ''; # Autodetection using XML rules (Appendix F)
$File->{Charset}->{HTTP} = ''; # From HTTP's "charset" parameter.
$File->{Charset}->{META} = ''; # From HTML's <meta http-equiv>.
$File->{Charset}->{XML} = ''; # From the XML Declaration.
$File->{Charset}->{Override} = ''; # From CGI/user override.
#
# Misc simple types.
$File->{Mode} =
'DTD+SGML'; # Default parse mode is DTD validation in SGML mode.
# By default, perform validation (we may perform only xml-wf in some cases)
$File->{XMLWF_ONLY} = FALSE;
#
# Listrefs.
$File->{Warnings} = []; # Warnings...
$File->{Namespaces} = []; # Other (non-root) Namespaces.
$File->{Parsers} = []; # Parsers used {name, link, type, options}
# By default, doctype-less documents cannot be valid
$File->{"DOCTYPEless OK"} = FALSE;
$File->{"Default DOCTYPE"}->{"HTML"} = 'HTML 4.01 Transitional';
$File->{"Default DOCTYPE"}->{"XHTML"} = 'XHTML 1.0 Transitional';
###############################################################################
#### Generate Template for Result. ############################################
###############################################################################
# first we determine the chosen language based on
# 1) lang argument given as parameter (if this language is available)
# 2) HTTP language negotiation between variants available and user-agent choices
# 3) English by default
my $lang = $q->param('lang') || '';
my @localizations;
foreach my $lang_available (@{$CFG->{Languages}}) {
if ($lang eq $lang_available) {
# Requested language (from parameters) is available, just use it
undef @localizations;
last;
}
push @localizations,
[
$lang_available, 1, 'text/html', undef,
'utf-8', $lang_available, undef
];
}
# If language is not chosen yet, use HTTP-based negotiation
if (@localizations) {
require HTTP::Negotiate;
$lang = HTTP::Negotiate::choose(\@localizations);
}
# HTTP::Negotiate::choose may return undef e.g if sent Accept-Language: en;q=0
$lang ||= 'en_US';
if ($lang eq "en") {
$lang = 'en_US'; # legacy
}
$File->{Template_Defaults} = {
die_on_bad_params => FALSE,
loop_context_vars => TRUE,
global_vars => TRUE,
case_sensitive => TRUE,
path => [catfile($CFG->{Paths}->{Templates}, $lang)],
filter => sub { my $ref = shift; ${$ref} = Encode::decode_utf8(${$ref}); },
};
if (IS_MODPERL2()) {
$File->{Template_Defaults}->{cache} = TRUE;
}
elsif ($CFG->{Paths}->{Cache}) {
$File->{Template_Defaults}->{file_cache} = TRUE;
$File->{Template_Defaults}->{file_cache_dir} =
rel2abs($CFG->{Paths}->{Cache}, tmpdir());
}
undef $lang;
#########################################
# Populate $File->{Opt} -- CGI Options. #
#########################################
#
# Preprocess the CGI parameters.
$q = &prepCGI($File, $q);
#
# Set session switches.
$File->{Opt}->{Outline} = $q->param('outline') ? TRUE : FALSE;
$File->{Opt}->{'Show Source'} = $q->param('ss') ? TRUE : FALSE;
$File->{Opt}->{'Show Tidy'} = $q->param('st') ? TRUE : FALSE;
$File->{Opt}->{Verbose} = $q->param('verbose') ? TRUE : FALSE;
$File->{Opt}->{'Group Errors'} = $q->param('group') ? TRUE : FALSE;
$File->{Opt}->{Debug} = $q->param('debug') ? TRUE : FALSE;
$File->{Opt}->{No200} = $q->param('No200') ? TRUE : FALSE;
$File->{Opt}->{Prefill} = $q->param('prefill') ? TRUE : FALSE;
$File->{Opt}->{'Prefill Doctype'} = $q->param('prefill_doctype') || 'html401';
$File->{Opt}->{Charset} = lc($q->param('charset') || '');
$File->{Opt}->{DOCTYPE} = $q->param('doctype') || '';
$File->{Opt}->{'User Agent'} =
$q->param('user-agent') &&
$q->param('user-agent') ne "1" ? $q->param('user-agent') :
"W3C_Validator/$VERSION " . $CFG->{'User Agent Info'};
$File->{Opt}->{'User Agent'} =~ tr/\x00-\x09\x0b\x0c-\x1f//d;
if ($File->{Opt}->{'User Agent'} eq 'mobileok') {
$File->{Opt}->{'User Agent'} =
'W3C-mobileOK/DDC-1.0 (see http://www.w3.org/2006/07/mobileok-ddc)';
}
$File->{Opt}->{'Accept Header'} = $q->param('accept') || '';
$File->{Opt}->{'Accept-Language Header'} = $q->param('accept-language') || '';
$File->{Opt}->{'Accept-Charset Header'} = $q->param('accept-charset') || '';
$File->{Opt}->{$_} =~ tr/\x00-\x09\x0b\x0c-\x1f//d
for ('Accept Header', 'Accept-Language Header', 'Accept-Charset Header');
#
# "Fallback" info for Character Encoding (fbc), Content-Type (fbt),
# and DOCTYPE (fbd). If TRUE, the Override values are treated as
# Fallbacks instead of Overrides.
$File->{Opt}->{FB}->{Charset} = $q->param('fbc') ? TRUE : FALSE;
$File->{Opt}->{FB}->{Type} = $q->param('fbt') ? TRUE : FALSE;
$File->{Opt}->{FB}->{DOCTYPE} = $q->param('fbd') ? TRUE : FALSE;
#
# If ";debug" was given, let it overrule the value from the config file,
# regardless of whether it's "0" or "1" (on or off), but only if config
# allows the debugging options.
if ($CFG->{'Allow Debug'}) {
$DEBUG = $q->param('debug') if defined $q->param('debug');
$File->{Opt}->{Verbose} = TRUE if $DEBUG;
}
else {
$DEBUG = FALSE; # The default.
}
$File->{Opt}->{Debug} = $DEBUG;
&abort_if_error_flagged($File);
#
# Get the file and metadata.
if ($q->param('uploaded_file')) {
$File = &handle_file($q, $File);
}
elsif ($q->param('fragment')) {
$File = &handle_frag($q, $File);
}
elsif ($q->param('uri')) {
$File = &handle_uri($q, $File);
}
#
# Abort if an error was flagged during initialization.
&abort_if_error_flagged($File);
#
# Get rid of the CGI object.
undef $q;
#
# We don't need STDIN any more, so get rid of it to avoid getting clobbered
# by Apache::Registry's idiotic interference under mod_perl.
untie *STDIN;
###############################################################################
#### Output validation results. ###############################################
###############################################################################
if (!$File->{ContentType} && !$File->{'Direct Input'} && !$File->{'Is Upload'})
{
&add_warning('W08', {});
}
$File = find_encodings($File);
#
# Decide on a charset to use (first part)
#
if ($File->{Charset}->{HTTP}) { # HTTP, if given, is authoritative.
$File->{Charset}->{Use} = $File->{Charset}->{HTTP};
}
elsif ($File->{ContentType} =~ m(^text/(?:[-.a-zA-Z0-9]\+)?xml$)) {
# Act as if $http_charset was 'us-ascii'. (MIME rules)
$File->{Charset}->{Use} = 'us-ascii';
&add_warning(
'W01',
{ W01_upload => $File->{'Is Upload'},
W01_agent => $File->{Server},
W01_ct => $File->{ContentType},
}
);
}
elsif ($File->{Charset}->{XML}) {
$File->{Charset}->{Use} = $File->{Charset}->{XML};
}
elsif ($File->{BOM} &&
$File->{BOM} == 2 &&
$File->{Charset}->{Auto} =~ /^utf-16[bl]e$/)
{
$File->{Charset}->{Use} = 'utf-16';
}
elsif ($File->{ContentType} =~ m(^application/(?:[-.a-zA-Z0-9]+\+)?xml$)) {
$File->{Charset}->{Use} = "utf-8";
}
elsif (&is_xml($File) and not $File->{ContentType} =~ m(^text/)) {
$File->{Charset}->{Use} = 'utf-8'; # UTF-8 (image/svg+xml etc.)
}
$File->{Charset}->{Use} ||= $File->{Charset}->{META};
#
# Handle any Fallback or Override for the charset.
if (charset_not_equal($File->{Opt}->{Charset}, '(detect automatically)')) {
# charset=foo was given to the CGI and it wasn't "autodetect" or empty.
#
# Extract the user-requested charset from CGI param.
my ($override, undef) = split(/\s/, $File->{Opt}->{Charset}, 2);
$File->{Charset}->{Override} = lc($override);
if ($File->{Opt}->{FB}->{Charset}) { # charset fallback mode
unless ($File->{Charset}->{Use})
{ # no charset detected, actual fallback
&add_warning('W02', {W02_charset => $File->{Charset}->{Override}});
$File->{Tentative} |= T_ERROR; # Tag it as Invalid.
$File->{Charset}->{Use} = $File->{Charset}->{Override};
}
}
else { # charset "hard override" mode
if (!$File->{Charset}->{Use}) { # overriding "nothing"
&add_warning(
'W04',
{ W04_charset => $File->{Charset}->{Override},
W04_override => TRUE
}
);
$File->{Tentative} |= T_ERROR;
$File->{Charset}->{Use} = $File->{Charset}->{Override};
}
elsif ($File->{Charset}->{Override} ne $File->{Charset}->{Use}) {
# Actually overriding something; warn about override.
&add_warning(
'W03',
{ W03_use => $File->{Charset}->{Use},
W03_opt => $File->{Charset}->{Override}
}
);
$File->{Tentative} |= T_ERROR;
$File->{Charset}->{Use} = $File->{Charset}->{Override};
}
}
}
if ($File->{'Direct Input'}) { #explain why UTF-8 is forced
&add_warning('W28', {});
}
unless ($File->{Charset}->{XML} || $File->{Charset}->{META})
{ #suggest character encoding info within doc
&add_warning('W27', {});
}
#
# Abort if an error was flagged while finding the encoding.
&abort_if_error_flagged($File);
$File->{Charset}->{Default} = FALSE;
unless ($File->{Charset}->{Use}) { # No charset given...
$File->{Charset}->{Use} = 'utf-8';
$File->{Charset}->{Default} = TRUE;
$File->{Tentative} |= T_ERROR; # Can never be valid.
&add_warning('W04', {W04_charset => "UTF-8"});
}
# Always transcode, even if the content claims to be UTF-8
$File = transcode($File);
# Try guessing if it didn't work out
if ($File->{ContentType} eq 'text/html' && $File->{Charset}->{Default}) {
my $also_tried = 'UTF-8';
for my $cs (qw(windows-1252 iso-8859-1)) {
last unless $File->{'Error Flagged'};
$File->{'Error Flagged'} = FALSE; # reset
$File->{Charset}->{Use} = $cs;
&add_warning('W04',
{W04_charset => $cs, W04_also_tried => $also_tried});
$File = transcode($File);
$also_tried .= ", $cs";
}
}
# if it still does not work, we abandon hope here
&abort_if_error_flagged($File);
#
# Add a warning if doc is UTF-8 and contains a BOM.
if ($File->{Charset}->{Use} eq 'utf-8' &&
@{$File->{Content}} &&
$File->{Content}->[0] =~ m(^\x{FEFF}))
{
&add_warning('W21', {});
}
#
# Overall parsing algorithm for documents returned as text/html:
#
# For documents that come to us as text/html,
#
# 1. check if there's a doctype
# 2. if there is a doctype, parse/validate against that DTD
# 3. if no doctype, check for an xmlns= attribute on the first element, or
# XML declaration
# 4. if no doctype and XML mode, check for XML well-formedness
# 5. otherwise, punt.
#
#
# Override DOCTYPE if user asked for it.
if ($File->{Opt}->{DOCTYPE}) {
if ($File->{Opt}->{DOCTYPE} !~ /(?:Inline|detect)/i) {
$File = &override_doctype($File);
}
else {
# Get rid of inline|detect for easy truth value checking later
$File->{Opt}->{DOCTYPE} = '';
}
}
# Try to extract a DOCTYPE or xmlns.
$File = &preparse_doctype($File);
if ($File->{Opt}->{DOCTYPE} eq "HTML5") {
$File->{DOCTYPE} = "HTML5";
$File->{Version} = $File->{DOCTYPE};
}
set_parse_mode($File, $CFG);
#
# Sanity check Charset information and add any warnings necessary.
$File = &charset_conflicts($File);
# before we start the parsing, clean slate
$File->{'Is Valid'} = TRUE;
$File->{Errors} = [];
$File->{WF_Errors} = [];
if (($File->{DOCTYPE} eq "HTML5") or ($File->{DOCTYPE} eq "XHTML5")) {
if ($CFG->{External}->{HTML5}) {
$File = &html5_validate($File);
}
else {
$File->{'Error Flagged'} = TRUE;
my $tmpl = &get_error_template($File);
$tmpl->param(fatal_no_checker => TRUE);
$tmpl->param(fatal_missing_checker => 'HTML5 Validator');
}
}
elsif (($File->{DOCTYPE} eq '') and
(($File->{Root} eq "svg") or @{$File->{Namespaces}} > 1))
{
# we send doctypeless SVG, or any doctypeless XML document with multiple
# namespaces found, to a different engine. WARNING this is experimental.
if ($CFG->{External}->{CompoundXML}) {
$File = &compoundxml_validate($File);
}
}
else {
$File = &dtd_validate($File);
}
&abort_if_error_flagged($File);
if (&is_xml($File)) {
if ($File->{DOCTYPE} eq "HTML5") {
# $File->{DOCTYPE} = "XHTML5";
# $File->{Version} = "XHTML5";
}
else {
# XMLWF check can be slow, skip if we already know the doc can't pass.
# http://www.w3.org/Bugs/Public/show_bug.cgi?id=9899
$File = &xmlwf($File) if $File->{'Is Valid'};
}
&abort_if_error_flagged($File);
}
#
# Force "XML" if type is an XML type and an FPI was not found.
# Otherwise set the type to be the FPI.
if (&is_xml($File) and not $File->{DOCTYPE} and lc($File->{Root}) ne 'html') {
$File->{Version} = 'XML';
}
else {
$File->{Version} ||= $File->{DOCTYPE};
}
#
# Get the pretty text version of the FPI if a mapping exists.
if (my $prettyver = $CFG->{Types}->{$File->{Version}}->{Display}) {
$File->{Version} = $prettyver;
}
#
# check the received mime type against Allowed mime types
if ($File->{ContentType}) {
my @allowedMediaType =
split(/\s+/,
$CFG->{Types}->{$File->{DOCTYPE}}->{Types}->{Allowed} || '');
my $usedCTisAllowed;
if (scalar @allowedMediaType) {
$usedCTisAllowed = FALSE;
foreach (@allowedMediaType) {
$usedCTisAllowed = TRUE if ($_ eq $File->{ContentType});
}
}
else {
# wedon't know what media type is recommended, so better shut up
$usedCTisAllowed = TRUE;
}
if (!$usedCTisAllowed) {
&add_warning(
'W23',
{ W23_type => $File->{ContentType},
W23_type_pref =>
$CFG->{Types}->{$File->{DOCTYPE}}->{Types}->{Preferred},
w23_doctype => $File->{Version}
}
);
}
}
#
# Warn about unknown, incorrect, or missing Namespaces.
if ($File->{Namespace}) {
my $ns = $CFG->{Types}->{$File->{Version}}->{Namespace} || FALSE;
if (&is_xml($File)) {
if ($ns eq $File->{Namespace}) {
&add_warning(
'W10',
{ W10_ns => $File->{Namespace},
W10_type => $File->{Type},
}
);
}
}
elsif ($File->{DOCTYPE} ne 'HTML5') {
&add_warning(
'W11',
{ W11_ns => $File->{Namespace},
w11_doctype => $File->{DOCTYPE}
}
);
}
}
else {
if (&is_xml($File) and $CFG->{Types}->{$File->{Version}}->{Namespace}) {
&add_warning('W12', {});
}
}
## if invalid content, AND if requested, pass through tidy
if (!$File->{'Is Valid'} && $File->{Opt}->{'Show Tidy'}) {
eval {
local $SIG{__DIE__} = undef;
require HTML::Tidy;
my $tidy = HTML::Tidy->new({config_file => $CFG->{Paths}->{TidyConf}});
my $cleaned = $tidy->clean(join("\n", @{$File->{Content}}));
$cleaned = Encode::decode_utf8($cleaned);
$File->{Tidy} = $cleaned;
};
if ($@) {
(my $errmsg = $@) =~ s/ at .*//s;
&add_warning('W29', {W29_msg => $errmsg});
}
}
my %templates = (
earl => ['earl_xml.tmpl', default_escape => 'HTML'],
n3 => ['earl_n3.tmpl'],
json => ['json_output.tmpl'],
ucn => ['ucn_output.tmpl'],
);
my $template = $templates{$File->{Opt}->{Output}};
if ($template) {
my $tname = shift(@$template);
my $tmpl = &get_template($File, $tname, @$template);
$template = $tmpl;
}
elsif ($File->{Opt}->{Output} eq 'soap12') {
if ($CFG->{'Enable SOAP'} != 1) {
# API disabled - ideally this should have been sent before performing
# validation...
print CGI::header(
-status => 503,
-content_language => "en",
-type => "text/html",
-charset => "utf-8"
);
$template = &get_template($File, 'soap_disabled.tmpl');
}
else {
$template = &get_template($File, 'soap_output.tmpl');
}
}
else {
$template = &get_template($File, 'result.tmpl');
}
&prep_template($File, $template);
&fin_template($File, $template);
$template->param(tidy_output => $File->{Tidy});
$template->param(file_source => &source($File))
if ($template->param('opt_show_source') or
($File->{'Is Upload'}) or
($File->{'Direct Input'}));
if ($File->{Opt}->{Output} eq 'json') {
# No JSON escaping in HTML::Template (and "JS" is not the right thing here)
my $json = JSON->new();
$json->allow_nonref(TRUE);
if (my $msgs = $template->param("file_errors")) {
for my $msg (@$msgs) {
for my $key (qw(msg expl src)) {
$msg->{$key} = $json->encode($msg->{$key}) if $msg->{$key};
}
# Drop non-numeric char indicators from output, e.g.
# "> 80" for some XML parse error ones (see the non-structured
# XML::LibXML code branch in XML preparsing below).
if ($msg->{char} && $msg->{char} !~ /^\d+$/) {
delete($msg->{char});
}
}
}
# Escape params used in json_fault.tmpl
for my $key (qw(
fatal_missing_checker
fatal_checker_error
fatal_transcode_charset
fatal_transcode_errmsg
fatal_decode_uri
fatal_decode_cenc
fatal_decode_errmsg
fatal_byte_lines
fatal_byte_charset
fatal_uri_scheme
fatal_ip_host
fatal_http_code
fatal_http_msg
fatal_http_uri
fatal_http_warn
fatal_http_no200
fatal_mime_ct
fatal_parse_extid_msg
)) {
$template->param($key) = $json->encode($template->param($key)) if $template->param($key);
}
}
# transcode output from perl's internal to utf-8 and output
print Encode::encode('UTF-8', $template->output);
#
# Get rid of $File object and exit.
undef $File;
exit;
#############################################################################
# Subroutine definitions
#############################################################################
sub get_template ($$;@)
{
my ($File, $fname, @opts) = @_;
if (!$File->{_Templates}->{$fname}) {
my $tmpl = HTML::Template->new(
%{$File->{Template_Defaults}},
filename => $fname,
@opts
);
$tmpl->param(env_home_page => $File->{Env}->{'Home Page'});
$tmpl->param(validator_version => $VERSION);
$File->{_Templates}->{$fname} = $tmpl;
}
return $File->{_Templates}->{$fname};
}
sub get_error_template ($;@)
{
my ($File, @opts) = @_;
my $fname = 'fatal-error.tmpl';
if ($File->{Opt}->{Output} eq 'soap12') {
$fname = 'soap_fault.tmpl';
}
elsif ($File->{Opt}->{Output} eq 'ucn') {
$fname = 'ucn_fault.tmpl';
}
return &get_template($File, $fname, @opts);
}
# TODO: need to bring in fixes from html5_validate() here
sub compoundxml_validate (\$)
{
my $File = shift;
my $ua = W3C::Validator::UserAgent->new($CFG, $File);
$ua->cookie_jar({});
push(
@{$File->{Parsers}},
{ name => "Compound XML",
link => "http://qa-dev.w3.org/", # TODO?
type => "",
options => ""
}
);
my $url = URI->new($CFG->{External}->{CompoundXML});
$url->query("out=xml");
my $req = HTTP::Request->new(POST => $url);
if ($File->{Opt}->{DOCTYPE} || $File->{Charset}->{Override}) {
# Doctype or charset overridden, need to use $File->{Content} in UTF-8
# because $File->{Bytes} is not affected by the overrides. This will
# most likely be a source of errors about internal/actual charset
# differences as long as our transcoding process does not "fix" the
# charset info in XML declaration and meta http-equiv (any others?).
if ($File->{'Direct Input'})
{ # sane default when using html5 validator by direct input
$req->content_type("application/xml; charset=UTF-8");
}
else {
$req->content_type("$File->{ContentType}; charset=UTF-8");
}
$req->content(Encode::encode_utf8(join("\n", @{$File->{Content}})));
}
else {
# Pass original bytes, Content-Type and charset as-is.
# We trust that our and validator.nu's interpretation of line numbers
# is the same (regardless of EOL chars used in the document).
my @content_type = ($File->{ContentType} => undef);
push(@content_type, charset => $File->{Charset}->{HTTP})
if $File->{Charset}->{HTTP};
$req->content_type(
HTTP::Headers::Util::join_header_words(@content_type));
$req->content_ref(\$File->{Bytes});
}
$req->content_language($File->{ContentLang}) if $File->{ContentLang};
# Intentionally using direct header access instead of $req->last_modified
$req->header('Last-Modified', $File->{Modified}) if $File->{Modified};
# If not in debug mode, gzip the request (LWP >= 5.817)
eval { $req->encode("gzip"); } unless $File->{Opt}->{Debug};
my $res = $ua->request($req);
if (!$res->is_success()) {
$File->{'Error Flagged'} = TRUE;
my $tmpl = &get_error_template($File);
$tmpl->param(fatal_no_checker => TRUE);
$tmpl->param(fatal_missing_checker => 'HTML5 Validator');
$tmpl->param(fatal_checker_error => $res->status_line());
}
else {
my $content = &get_content($File, $res);
return $File if $File->{'Error Flagged'};
# and now we parse according to
# http://wiki.whatwg.org/wiki/Validator.nu_XML_Output
# I wish we could use XML::LibXML::Reader here. but SHAME on those
# major unix distributions still shipping with libxml2 2.6.16… 4 years
# after its release
# …and we could use now as we require libxml2 >= 2.6.21 anyway…
my $xml_reader = XML::LibXML->new();
$xml_reader->base_uri($res->base());
my $xmlDOM;
eval { $xmlDOM = $xml_reader->parse_string($content); };
if ($@) {
my $errmsg = $@;
$File->{'Error Flagged'} = TRUE;
my $tmpl = &get_error_template($File);
$tmpl->param(fatal_no_checker => TRUE);
$tmpl->param(fatal_missing_checker => 'HTML5 Validator');
$tmpl->param(fatal_checker_error => $errmsg);
return $File;
}
my @nodelist = $xmlDOM->getElementsByTagName("messages");
my $messages_node = $nodelist[0];
my @message_nodes = $messages_node->childNodes;
foreach my $message_node (@message_nodes) {
my $message_type = $message_node->localname;
my ($err, $xml_error_msg, $xml_error_expl);
if ($message_type eq "error") {
$err->{type} = "E";
$File->{'Is Valid'} = FALSE;
}
elsif ($message_type eq "info") {
# by default - we find warnings in the type attribute (below)
$err->{type} = "I";
}
if ($message_node->hasAttributes()) {
my @attributelist = $message_node->attributes();
foreach my $attribute (@attributelist) {
if ($attribute->name eq "type") {
if (($attribute->getValue() eq "warning") and
($message_type eq "info"))
{
$err->{type} = "W";
}
}
if ($attribute->name eq "last-column") {
$err->{char} = $attribute->getValue();
}
if ($attribute->name eq "last-line") {
$err->{line} = $attribute->getValue();
}
}
}
my @child_nodes = $message_node->childNodes;
foreach my $child_node (@child_nodes) {
if ($child_node->localname eq "message") {
$xml_error_msg = $child_node->toString();
$xml_error_msg =~ s,</?[^>]*>,,gsi;
}
if ($child_node->localname eq "elaboration") {
$xml_error_expl = $child_node->toString();
$xml_error_expl =~ s,</?elaboration>,,gi;
$xml_error_expl =
"\n<div class=\"ve xml\">$xml_error_expl</div>\n";
}
}
# formatting the error message for output
$err->{src} = "" if $err->{uri}; # TODO...
$err->{num} = 'validator.nu';
$err->{msg} = $xml_error_msg;
$err->{expl} = $xml_error_expl;
if ($err->{msg} =~
/Using the preset for (.*) based on the root namespace/)
{
$File->{DOCTYPE} = $1;
}
else {
push @{$File->{Errors}}, $err;
}
# @@ TODO message explanation / elaboration
}
}
return $File;
}
sub html5_validate (\$)
{
my $File = shift;
my $ua = W3C::Validator::UserAgent->new($CFG, $File);
$ua->cookie_jar({});
push(
@{$File->{Parsers}},
{ name => "validator.nu",
link => "http://validator.nu/",
type => "HTML5",
options => ""
}
);
my $url = URI->new($CFG->{External}->{HTML5});
$url->query("out=xml");
my $req = HTTP::Request->new(POST => $url);
my $ct = &is_xml($File) ? "application/xhtml+xml" : "text/html";
if ($File->{Opt}->{DOCTYPE} || $File->{Charset}->{Override} ||
$File->{'Direct Input'})
{
# Doctype or charset overridden, need to use $File->{Content} in UTF-8
# because $File->{Bytes} is not affected by the overrides. Note that
# direct input is always considered an override here.
&override_charset($File, "UTF-8");
$ct = $File->{ContentType} unless $File->{'Direct Input'};
my @ct = ($ct => undef, charset => "UTF-8");
$ct = HTTP::Headers::Util::join_header_words(@ct);
$req->content(Encode::encode_utf8(join("\n", @{$File->{Content}})));
}
else {
# Pass original bytes, Content-Type and charset as-is.
# We trust that our and validator.nu's interpretation of line numbers
# is the same later when displaying error contexts (regardless of EOL
# chars used in the document).
my @ct = ($File->{ContentType} => undef);
push(@ct, charset => $File->{Charset}->{HTTP})
if $File->{Charset}->{HTTP};
$ct = HTTP::Headers::Util::join_header_words(@ct);
$req->content_ref(\$File->{Bytes});
}
$req->content_type($ct);
$req->content_language($File->{ContentLang}) if $File->{ContentLang};
# Intentionally using direct header access instead of $req->last_modified
# (the latter takes seconds since epoch, but $File->{Modified} is an already
# formatted string).
$req->header('Last-Modified', $File->{Modified}) if $File->{Modified};
# Use gzip in non-debug, remote HTML5 validator mode (LWP >= 5.817).
if (!$File->{Opt}->{Debug} &&
$url->host() !~ /^(?:localhost|127(?:\.\d+){3}|.*\.localdomain)$/i)
{
eval { $req->encode("gzip"); };
}
else {
$req->header('Accept-Encoding', 'identity');
}
redirect_html5_requests();
my $res = $ua->request($req);
if (!$res->is_success()) {
$File->{'Error Flagged'} = TRUE;
my $tmpl = &get_error_template($File);
$tmpl->param(fatal_no_checker => TRUE);
$tmpl->param(fatal_missing_checker => 'HTML5 Validator');
$tmpl->param(fatal_checker_error => $res->status_line());
}
else {
my $content = &get_content($File, $res);
return $File if $File->{'Error Flagged'};
# and now we parse according to
# http://wiki.whatwg.org/wiki/Validator.nu_XML_Output
# I wish we could use XML::LibXML::Reader here. but SHAME on those
# major unix distributions still shipping with libxml2 2.6.16… 4 years
# after its release
my $xml_reader = XML::LibXML->new();
$xml_reader->base_uri($res->base());
my $xmlDOM;
eval { $xmlDOM = $xml_reader->parse_string($content); };
if ($@) {
my $errmsg = $@;
$File->{'Error Flagged'} = TRUE;
my $tmpl = &get_error_template($File);
$tmpl->param(fatal_no_checker => TRUE);
$tmpl->param(fatal_missing_checker => 'HTML5 Validator');
$tmpl->param(fatal_checker_error => $errmsg);
return $File;
}
my @nodelist = $xmlDOM->getElementsByTagName("messages");
my $messages_node = $nodelist[0];
my @message_nodes = $messages_node->childNodes;
foreach my $message_node (@message_nodes) {
my $message_type = $message_node->localname;
my ($html5_error_msg, $html5_error_expl);
my $err = {};
# TODO: non-document errors should receive different/better
# treatment, but this is better than hiding all problems for now
# (#6747)
if ($message_type eq "error" ||
$message_type eq "non-document-error")
{
$err->{type} = "E";
$File->{'Is Valid'} = FALSE;
}
elsif ($message_type eq "info") {
# by default - we find warnings in the type attribute (below)
$err->{type} = "I";
}
if ($message_node->hasAttributes()) {
my @attributelist = $message_node->attributes();
foreach my $attribute (@attributelist) {
if ($attribute->name eq "type") {
if (($attribute->getValue() eq "warning") and
($message_type eq "info"))
{
$err->{type} = "W";
}
}
elsif ($attribute->name eq "last-column") {
$err->{char} = $attribute->getValue();
}
elsif ($attribute->name eq "last-line") {
$err->{line} = $attribute->getValue();
}
elsif ($attribute->name eq "url") {
&set_error_uri($err, $attribute->getValue());
}
}
}
my @child_nodes = $message_node->childNodes;
foreach my $child_node (@child_nodes) {
if ($child_node->localname eq "message") {
$html5_error_msg = $child_node->textContent();
}
elsif ($child_node->localname eq "elaboration") {
$html5_error_expl = $child_node->toString();
$html5_error_expl =~ s,</?elaboration>,,gi;
$html5_error_expl =
"\n<div class=\"ve html5\">$html5_error_expl</div>\n";
}
}
# formatting the error message for output
# TODO: set $err->{src} from extract if we got an URI for the error:
# http://wiki.whatwg.org/wiki/Validator.nu_XML_Output#The_extract_Element
# For now, set it directly to empty to prevent report_errors() from
# trying to populate it from our doc.
$err->{src} = "" if $err->{uri};
$err->{num} = 'html5';
$err->{msg} = $html5_error_msg;
$err->{expl} = $html5_error_expl;
push @{$File->{Errors}}, $err;
# @@ TODO message explanation / elaboration
}
}
return $File;
}
sub dtd_validate (\$)
{
my $File = shift;
my $opensp = SGML::Parser::OpenSP->new();
#
# By default, use SGML catalog file and SGML Declaration.
my $catalog = catfile($CFG->{Paths}->{SGML}->{Library}, 'sgml.soc');
# default parsing options
my @spopt = qw(valid non-sgml-char-ref no-duplicate);
#
# Switch to XML semantics if file is XML.
if (&is_xml($File)) {
$catalog = catfile($CFG->{Paths}->{SGML}->{Library}, 'xml.soc');
push(@spopt, 'xml');
}
else {
# add warnings for shorttags
push(@spopt, 'min-tag');
}
push(
@{$File->{Parsers}},
{ name => "OpenSP",
link => "http://openjade.sourceforge.net/",
type => "SGML/XML",
options => join(" ", @spopt)
}
);
#
# Parser configuration
$opensp->search_dirs($CFG->{Paths}->{SGML}->{Library});
$opensp->catalogs($catalog);
$opensp->show_error_numbers(1);
$opensp->warnings(@spopt);
#
# Restricted file reading is disabled on Win32 for the time
# being since neither SGML::Parser::OpenSP nor check auto-
# magically set search_dirs to include the temp directory
# so restricted file reading would defunct the Validator.
$opensp->restrict_file_reading(1) unless $^O eq 'MSWin32';
my $h; # event handler
if ($File->{Opt}->{Outline}) {
$h = W3C::Validator::EventHandler::Outliner->new($opensp, $File, $CFG);
}
else {
$h = W3C::Validator::EventHandler->new($opensp, $File, $CFG);
}
$opensp->handler($h);
$opensp->parse_string(join "\n", @{$File->{Content}});
# Make sure there are no circular references, otherwise the script
# would leak memory until mod_perl unloads it which could take some
# time. @@FIXME It's probably overly careful though.
$opensp->handler(undef);
undef $h->{_parser};
undef $h->{_file};
undef $h;
undef $opensp;
#
# Set Version to be the FPI initially.
$File->{Version} = $File->{DOCTYPE};
return $File;
}
sub xmlwf (\$)
{
# we should really be using a SAX ErrorHandler, but I can't find a way to
# make it work with XML::LibXML::SAX::Parser... ** FIXME **
# ditto, we should try using W3C::Validator::EventHandler, but it's badly
# linked to opensp at the moment
my $File = shift;
my $xmlparser = XML::LibXML->new();
$xmlparser->line_numbers(1);
$xmlparser->validation(0);
$xmlparser->base_uri($File->{URI})
unless ($File->{'Direct Input'} || $File->{'Is Upload'});
push(
@{$File->{Parsers}},
{ name => "libxml2",
link => "http://xmlsoft.org/",
type => "XML",
options => ""
}
);
# Restrict file reading similar to what SGML::Parser::OpenSP does. Note
# that all inputs go through the callback so if we were passing a
# URI/filename to the parser, it would be affected as well and would break
# fetching the initial document. As long as we pass the doc as string,
# this should work.
my $cb = XML::LibXML::InputCallback->new();
$cb->register_callbacks([\&xml_jail_match, sub { }, sub { }, sub { }]);
$xmlparser->input_callbacks($cb);
&override_charset($File, "UTF-8");
eval { $xmlparser->parse_string(join("\n", @{$File->{Content}})); };
if (ref($@)) {
# handle a structured error (XML::LibXML::Error object)
my $err_obj = $@;
while ($err_obj) {
my $err = {};
&set_error_uri($err, $err_obj->file());
$err->{src} = &ent($err_obj->context()) if $err->{uri};
$err->{line} = $err_obj->line();
$err->{char} = $err_obj->column();
$err->{num} = "libxml2-" . $err_obj->code();
$err->{type} = "E";
$err->{msg} = $err_obj->message();
$err_obj = $err_obj->_prev();
unshift(@{$File->{WF_Errors}}, $err);
}
}
elsif ($@) {
my $xmlwf_errors = $@;
my $xmlwf_error_line = undef;
my $xmlwf_error_col = undef;
my $xmlwf_error_msg = undef;
my $got_error_message = undef;
my $got_quoted_line = undef;
foreach my $msg_line (split "\n", $xmlwf_errors) {
$msg_line =~ s{[^\x0d\x0a](:\d+:)}{\n$1}g;
$msg_line =~ s{[^\x0d\x0a]+[\x0d\x0a]$}{};
# first we get the actual error message
if (!$got_error_message &&
$msg_line =~ /^(:\d+:)( parser error : .*)/)
{
$xmlwf_error_line = $1;
$xmlwf_error_msg = $2;
$xmlwf_error_line =~ s/:(\d+):/$1/;
$xmlwf_error_msg =~ s/ parser error :/XML Parsing Error: /;
$got_error_message = 1;
}
# then we skip the second line, which shows the context
# (we don't use that)
elsif ($got_error_message && !$got_quoted_line) {
$got_quoted_line = 1;
}
# we now take the third line, with the pointer to the error's
# column
elsif (($msg_line =~ /(\s+)\^/) and
$got_error_message and
$got_quoted_line)
{
$xmlwf_error_col = length($1);
}
# cleanup for a number of bugs for the column number
if (defined($xmlwf_error_col)) {
if (( my $l =
length($File->{Content}->[$xmlwf_error_line - 1])
) < $xmlwf_error_col
)
{
# http://bugzilla.gnome.org/show_bug.cgi?id=434196
#warn("Warning: reported error column larger than line length " .
# "($xmlwf_error_col > $l) in $File->{URI} line " .
# "$xmlwf_error_line, libxml2 bug? Resetting to line length.");
$xmlwf_error_col = $l;
}
elsif ($xmlwf_error_col == 79) {
# working around an apparent odd limitation of libxml which
# only gives context for lines up to 80 chars
# http://www.w3.org/Bugs/Public/show_bug.cgi?id=4420
# http://bugzilla.gnome.org/show_bug.cgi?id=424017
$xmlwf_error_col = "> 80";
# non-int line number will trigger the proper behavior in
# report_error
}
}
# when we have all the info (one full error message), proceed
# and move on to the next error
if ((defined $xmlwf_error_line) and
(defined $xmlwf_error_col) and
(defined $xmlwf_error_msg))
{
# Reinitializing for the next batch of 3 lines
$got_error_message = undef;
$got_quoted_line = undef;
# formatting the error message for output
my $err = {};
# TODO: set_error_uri() (need test case)
$err->{src} = "" if $err->{uri}; # TODO...
$err->{line} = $xmlwf_error_line;
$err->{char} = $xmlwf_error_col;
$err->{num} = 'xmlwf';
$err->{type} = "E";
$err->{msg} = $xmlwf_error_msg;
push(@{$File->{WF_Errors}}, $err);
$xmlwf_error_line = undef;
$xmlwf_error_col = undef;
$xmlwf_error_msg = undef;
}
}
}
$File->{'Is Valid'} = FALSE if @{$File->{WF_Errors}};
return $File;
}
#
# Generate HTML report.
sub prep_template ($$)
{
my $File = shift;
my $T = shift;
#
# URL for Nu Html Checker
$T->param(htmlchecker_url => $CFG->{External}->{HTML5});
#
# XML mode...
$T->param(is_xml => &is_xml($File));
#
# Upload?
$T->param(is_upload => $File->{'Is Upload'});
#
# Direct Input?
$T->param(is_direct_input => $File->{'Direct Input'});
#
# The URI...
$T->param(file_uri => $File->{URI});
#
# HTTPS note?
$T->param(file_https_note => $File->{'Is Upload'} ||
$File->{'Direct Input'} ||
URI->new($File->{URI})->secure());
#
# Set URL for page title.
$T->param(page_title_url => $File->{URI});
#
# Metadata...
$T->param(file_modified => $File->{Modified});
$T->param(file_server => $File->{Server});
$T->param(file_size => $File->{Size});
$T->param(file_contenttype => $File->{ContentType});
$T->param(file_charset => $File->{Charset}->{Use});
$T->param(file_doctype => $File->{DOCTYPE});
#
# Output options...
$T->param(opt_show_source => $File->{Opt}->{'Show Source'});
$T->param(opt_show_tidy => $File->{Opt}->{'Show Tidy'});
$T->param(opt_show_outline => $File->{Opt}->{Outline});
$T->param(opt_verbose => $File->{Opt}->{Verbose});
$T->param(opt_group_errors => $File->{Opt}->{'Group Errors'});
$T->param(opt_no200 => $File->{Opt}->{No200});
# Root Element
$T->param(root_element => $File->{Root});
# Namespaces...
$T->param(file_namespace => $File->{Namespace});
# Non-root ones; unique, preserving occurrence order
my %seen_ns = ();
$seen_ns{$File->{Namespace}}++ if defined($File->{Namespace});
my @nss =
map { $seen_ns{$_}++ == 0 ? {uri => $_} : () } @{$File->{Namespaces}};
$T->param(file_namespaces => \@nss) if @nss;
if ($File->{Opt}->{DOCTYPE}) {
my $over_doctype_param = "override doctype $File->{Opt}->{DOCTYPE}";
$T->param($over_doctype_param => TRUE);
}
if ($File->{Opt}->{Charset}) {
my $over_charset_param = "override charset $File->{Opt}->{Charset}";
$T->param($over_charset_param => TRUE);
}
# Allow content-negotiation
if ($File->{Opt}->{'Accept Header'}) {
$T->param('accept' => $File->{Opt}->{'Accept Header'});
}
if ($File->{Opt}->{'Accept-Language Header'}) {
$T->param(
'accept-language' => $File->{Opt}->{'Accept-Language Header'});
}
if ($File->{Opt}->{'Accept-Charset Header'}) {
$T->param('accept-charset' => $File->{Opt}->{'Accept-Charset Header'});
}
if ($File->{Opt}->{'User Agent'}) {
$T->param('user-agent' => $File->{Opt}->{'User Agent'});
}
if ($File->{'Error Flagged'}) {
$T->param(fatal_error => TRUE);
}
}
sub fin_template ($$)
{
my $File = shift;
my $T = shift;
#
# Set debug info for HTML and SOAP reports.
if ($DEBUG) {
my @parsers;
for my $parser (@{$File->{Parsers}}) {
my $p = $parser->{name};
$p .= " (" . $parser->{options} . ")" if $parser->{options};
push(@parsers, $p);
}
$T->param(
debug => [
map({name => $_, value => $ENV{$_}},
qw(no_proxy http_proxy https_proxy ftp_proxy FTP_PASSIVE)),
{name => 'Content-Encoding', value => $File->{ContentEnc}},
{name => 'Content-Language', value => $File->{ContentLang}},
{name => 'Content-Location', value => $File->{ContentLoc}},
{name => 'Transfer-Encoding', value => $File->{TransferEnc}},
{name => 'Parse Mode', value => $File->{Mode}},
{name => 'Parse Mode Factor', value => $File->{ModeChoice}},
{name => 'Parsers Used', value => join(", ", @parsers)},
],
);
}
$T->param(parsers => $File->{Parsers});
if (!$File->{Doctype} &&
(!$File->{Version} ||
$File->{Version} eq 'unknown' ||
$File->{Version} eq 'SGML')
)
{
my $default_doctype =
$File->{"Default DOCTYPE"}->{&is_xml($File) ? "XHTML" : "HTML"};
$T->param(file_version => "$default_doctype");
}
else {
$T->param(file_version => $File->{Version});
if ($File->{Version} eq "HTML5") {
$T->param(is_html5 => TRUE);
}
}
my ($num_errors, $num_warnings, $num_info, $reported_errors) =
&report_errors($File);
if ($File->{Version} eq "HTML5") {
$num_warnings++;
}
if ($num_errors + $num_warnings > 0) {
$T->param(has_errors => 1);
}
$T->param(valid_errors_num => $num_errors);
$num_warnings += scalar @{$File->{Warnings}};
$T->param(valid_warnings_num => $num_warnings);
my $number_of_errors = ""; # textual form of $num_errors
my $number_of_warnings = ""; # textual form of $num_errors
# The following is a bit hack-ish, but will enable us to have some logic
# for a human-readable display of the number, with cases for 0, 1, 2 and
# above (the case of 2 appears to be useful for localization in some
# languages where the plural is different for 2, and above)
if ($num_errors > 1) {
$T->param(number_of_errors_is_0 => FALSE);
$T->param(number_of_errors_is_1 => FALSE);
if ($num_errors == 2) {
$T->param(number_of_errors_is_2 => TRUE);
}
else {
$T->param(number_of_errors_is_2 => FALSE);
}
$T->param(number_of_errors_is_plural => TRUE);
}
elsif ($num_errors == 1) {
$T->param(number_of_errors_is_0 => FALSE);
$T->param(number_of_errors_is_1 => TRUE);
$T->param(number_of_errors_is_2 => FALSE);
$T->param(number_of_errors_is_plural => FALSE);
}
else { # 0
$T->param(number_of_errors_is_0 => TRUE);
$T->param(number_of_errors_is_1 => FALSE);
$T->param(number_of_errors_is_2 => FALSE);
$T->param(number_of_errors_is_plural => FALSE);
}
if ($num_warnings > 1) {
$T->param(number_of_warnings_is_0 => FALSE);
$T->param(number_of_warnings_is_1 => FALSE);
if ($num_warnings == 2) {
$T->param(number_of_warnings_is_2 => TRUE);
}
else {
$T->param(number_of_warnings_is_2 => FALSE);
}
$T->param(number_of_warnings_is_plural => TRUE);
}
elsif ($num_warnings == 1) {
$T->param(number_of_warnings_is_0 => FALSE);
$T->param(number_of_warnings_is_1 => TRUE);
$T->param(number_of_warnings_is_2 => FALSE);
$T->param(number_of_warnings_is_plural => FALSE);
}
else { # 0
$T->param(number_of_warnings_is_0 => TRUE);
$T->param(number_of_warnings_is_1 => FALSE);
$T->param(number_of_warnings_is_2 => FALSE);
$T->param(number_of_warnings_is_plural => FALSE);
}
$T->param(file_outline => $File->{heading_outline})
if $File->{Opt}->{Outline};
$T->param(file_errors => $reported_errors);
if ($File->{'Is Valid'}) {
$T->param(VALID => TRUE);
$T->param(valid_status => 'Valid');
&report_valid($File, $T);
}
else {
$T->param(VALID => FALSE);
$T->param(valid_status => 'Invalid');
}
}
#
# Output "This page is Valid" report.
sub report_valid
{
my $File = shift;
my $T = shift;
unless ($File->{Version} eq 'unknown' or defined $File->{Tentative}) {
if (exists $CFG->{Types}->{$File->{DOCTYPE}}->{Badge}) {
my $cfg = $CFG->{Types}->{$File->{DOCTYPE}};
$T->param(badge_uri => $cfg->{Badge}->{URI});
$T->param(local_badge_uri => $cfg->{Badge}->{'Local URI'});
$T->param(badge_alt_uri => $cfg->{Badge}->{'Alt URI'});
$T->param(local_alt_badge_uri => $cfg->{Badge}->{'Local ALT URI'});
$T->param(badge_alt => $cfg->{Badge}->{Alt});
$T->param(badge_rdfa => $cfg->{Badge}->{RDFa});
$T->param(badge_h => $cfg->{Badge}->{Height});
$T->param(badge_w => $cfg->{Badge}->{Width});
$T->param(badge_onclick => $cfg->{Badge}->{OnClick});
$T->param(badge_tagc => $cfg->{'Parse Mode'} eq 'XML' ? ' /' : '');
}
}
elsif (defined $File->{Tentative}) {
$T->param(is_tentative => TRUE);
}
if ($File->{XMLWF_ONLY}) {
$T->param(xmlwf_only => TRUE);
}
my $thispage = self_url_file($File);
$T->param(file_thispage => $thispage);
}
#
# Add a warning message to the output.
sub add_warning ($$)
{
my $WID = shift;
my $params = shift;
push @{$File->{Warnings}}, $WID;
my %tmplparams = (
$WID => TRUE,
have_warnings => TRUE,
%$params,
);
for my $tmpl (qw(result fatal-error soap_output ucn_output)) {
&get_template($File, "$tmpl.tmpl")->param(%tmplparams);
}
}
#
# Proxy authentication requests.
# Note: expects the third argument to be a hash ref (see HTTP::Headers::Auth).
sub authenticate
{
my $File = shift;
my $resource = shift;
my $authHeader = shift || {};
my $realm = $resource;
$realm =~ s([^\w\d.-]*){}g;
while (my ($scheme, $header) = each %$authHeader) {
my $origrealm = $header->{realm};
if (not defined $origrealm or $scheme !~ /^(?:basic|digest)$/i) {
delete($authHeader->{$scheme});
next;
}
$header->{realm} = "$realm-$origrealm";
}
my $headers = HTTP::Headers->new(Connection => 'close');
$headers->www_authenticate(%$authHeader);
$headers = $headers->as_string();
chomp($headers);
my $tmpl = &get_template($File, 'http_401_authrequired.tmpl');
$tmpl->param(http_401_headers => $headers);
$tmpl->param(http_401_url => $resource);
print Encode::encode('UTF-8', $tmpl->output);
exit; # Further interaction will be a new HTTP request.
}
#
# Fetch an URL and return the content and selected meta-info.
sub handle_uri
{
my $q = shift; # The CGI object.
my $File = shift; # The master datastructure.
my $ua = W3C::Validator::UserAgent->new($CFG, $File);
$ua->cookie_jar({});
my $uri = URI->new(ref $q ? scalar $q->param('uri') : $q)->canonical();
$uri->fragment(undef);
if (!$uri->scheme()) {
local $ENV{URL_GUESS_PATTERN} = '';
my $guess = URI::Heuristic::uf_uri($uri);
if ($guess->scheme() && $ua->is_protocol_supported($guess)) {
$uri = $guess;
}
else {
$uri = URI->new("http://$uri");
}
}
unless ($ua->is_protocol_supported($uri)) {
$File->{'Error Flagged'} = TRUE;
my $tmpl = &get_error_template($File);
# If uri param is empty (also for empty direct or upload), it's been
# set to TRUE in sub prepCGI()
if ($uri->canonical() eq "1") {
$tmpl->param(fatal_no_content => TRUE);
}
else {
$tmpl->param(fatal_uri_error => TRUE);
$tmpl->param(fatal_uri_scheme => $uri->scheme());
}
return $File;
}
return $File unless $ua->uri_ok($uri);
my $req = HTTP::Request->new(GET => $uri);
# if one wants to use the accept, accept-charset and accept-language params
# in order to trigger specific negotiation
if ($File->{Opt}->{'Accept Header'}) {
$req->header(Accept => $File->{Opt}->{'Accept Header'});
}
if ($File->{Opt}->{'Accept-Language Header'}) {
$req->header(
Accept_Language => $File->{Opt}->{'Accept-Language Header'});
}
if ($File->{Opt}->{'Accept-Charset Header'}) {
$req->header(
Accept_Charset => $File->{Opt}->{'Accept-Charset Header'});
}
# All Apache configurations don't set HTTP_AUTHORIZATION for CGI scripts.
# If we're under mod_perl, there is a way around it...
my $http_auth = $ENV{HTTP_AUTHORIZATION};
eval {
local $SIG{__DIE__} = undef;
my $auth =
Apache2::RequestUtil->request()->headers_in()->{Authorization};
$http_auth = $auth if $auth;
} if (IS_MODPERL2() && !$http_auth);
# If we got a Authorization header, the client is back at it after being
# prompted for a password so we insert the header as is in the request.
$req->headers->header(Authorization => $http_auth) if $http_auth;
my $res = $ua->request($req);
return $File if $File->{'Error Flagged'}; # Redirect IP rejected?
unless ($res->code == 200 or $File->{Opt}->{'No200'}) {
if ($res->code == 401) {
my %auth = $res->www_authenticate(); # HTTP::Headers::Auth
&authenticate($File, $res->request->uri, \%auth);
}
else {
$File->{'Error Flagged'} = TRUE;
my $no200url = undef;
if (!$File->{Opt}->{No200}) {
# $File->{URI} not set yet; setting it non-local has side
# effects
local $File->{URI} = $uri->as_string;
local $File->{Opt}->{No200} = TRUE;
$no200url = &self_url_file($File);
}
my $warning = $res->header("Client-Warning");
if ($warning && $warning =~ /Internal response/i) {
# Response doc generated internally by LWP, no need to show
# that info nor to provide error doc validation link to it.
$warning = undef;
$no200url = undef;
}
my $tmpl = &get_error_template($File);
$tmpl->param(fatal_http_error => TRUE);
$tmpl->param(fatal_http_uri => $uri->as_string);
$tmpl->param(fatal_http_code => $res->code);
$tmpl->param(fatal_http_msg => $res->message);
$tmpl->param(fatal_http_warn => $warning);
$tmpl->param(fatal_http_no200 => $no200url);
$tmpl->param(fatal_http_dns => TRUE) if ($res->code == 500);
}
return $File;
}
#
# Enforce Max Recursion level.
&check_recursion($File, $res);
my ($mode, $ct, $charset) = &parse_content_type(
$File,
scalar($res->header('Content-Type')),
scalar($res->request->uri),
);
my $content = &get_content($File, $res);
return $File if $File->{'Error Flagged'};
$File->{Bytes} = $content;
$File->{Mode} = $mode;
$File->{ContentType} = $ct;
$File->{ContentEnc} = $res->content_encoding;
$File->{ContentLang} = $res->content_language;
$File->{ContentLoc} = $res->header('Content-Location');
$File->{TransferEnc} = $res->header('Client-Transfer-Encoding');
$File->{Charset}->{HTTP} = lc $charset if defined $charset;
$File->{Modified} = $res->header('Last-Modified');
$File->{Server} = scalar $res->server;
# TODO: Content-Length is not always set, so either this should
# be renamed to 'Content-Length' or it should consider more than
# the Content-Length header.
$File->{Size} = scalar $res->content_length;
$File->{URI} = scalar $res->request->uri->canonical;
$File->{'Is Upload'} = FALSE;
$File->{'Direct Input'} = FALSE;
return $File;
}
#
# Handle uploaded file and return the content and selected meta-info.
sub handle_file
{
my $q = shift; # The CGI object.
my $File = shift; # The master datastructure.
my $p = $q->param('uploaded_file');
my $f = $q->upload('uploaded_file');
if (!defined($f)) {
# Probably not an uploaded file as far as CGI is concerned,
# treat as a fragment.
$q->param('fragment', $p);
return &handle_frag($q, $File);
}
my $h = $q->uploadInfo($p);
local $/ = undef; # set line delimiter so that <> reads rest of file
my $file = <$f>;
my ($mode, $ct, $charset) =
&parse_content_type($File, $h->{'Content-Type'});
$File->{Bytes} = $file;
$File->{Mode} = $mode;
$File->{ContentType} = $ct;
$File->{Charset}->{HTTP} = lc $charset if defined $charset;
$File->{Modified} = $q->http('Last-Modified');
$File->{Server} = $q->http('User-Agent'); # Fake a "server". :-)
$File->{Size} = $q->http('Content-Length');
$File->{URI} = "$p";
$File->{'Is Upload'} = TRUE;
$File->{'Direct Input'} = FALSE;
return $File;
}
#
# Handle uploaded file and return the content and selected meta-info.
sub handle_frag
{
my $q = shift; # The CGI object.
my $File = shift; # The master datastructure.
$File->{Bytes} = $q->param('fragment');
$File->{Mode} = 'TBD';
$File->{Modified} = '';
$File->{Server} = '';
$File->{Size} = '';
$File->{ContentType} = ''; # @@TODO?
$File->{URI} = 'upload://Form Submission';
$File->{'Is Upload'} = FALSE;
$File->{'Direct Input'} = TRUE;
$File->{Charset}->{HTTP} =
"utf-8"; # by default, the form accepts utf-8 chars
if ($File->{Opt}->{Prefill}) {
# we surround the HTML fragment with some basic document structure
my $prefill_Template;
if ($File->{Opt}->{'Prefill Doctype'} eq 'html401') {
$prefill_Template = &get_template($File, 'prefill_html401.tmpl');
}
else {
$prefill_Template = &get_template($File, 'prefill_xhtml10.tmpl');
}
$prefill_Template->param(fragment => $File->{Bytes});
$File->{Bytes} = $prefill_Template->output();
# Let's force the view source so that the user knows what we've put
# around their code.
$File->{Opt}->{'Show Source'} = TRUE;
# Ignore doctype overrides (#5132).
$File->{Opt}->{DOCTYPE} = 'Inline';
}
return $File;
}
#
# Parse a Content-Type and parameters. Return document type and charset.
sub parse_content_type
{
my $File = shift;
my $Content_Type = shift;
my $url = shift;
my $charset = '';
my ($ct) = lc($Content_Type) =~ /^\s*([^\s;]*)/g;
my $mode = $CFG->{MIME}->{$ct} || $ct;
$charset = HTML::Encoding::encoding_from_content_type($Content_Type);
if (index($mode, '/') != -1) { # a "/" means it's unknown or we'd have a mode here.
if ($ct eq 'text/css' and defined $url) {
print redirect
'http://jigsaw.w3.org/css-validator/validator?uri=' .
uri_escape $url;
exit;
}
elsif ($ct eq 'application/atom+xml' and defined $url) {
print redirect 'https://validator.w3.org/feed/check.cgi?url=' .
uri_escape $url;
exit;
}
elsif ($ct =~ m(^application/.+\+xml$)) {
# unknown media types which should be XML - we give these a try
$mode = "XML";
}
else {
$File->{'Error Flagged'} = TRUE;
my $tmpl = &get_error_template($File);
$tmpl->param(fatal_mime_error => TRUE);
$tmpl->param(fatal_mime_ct => $ct);
}
}
return $mode, $ct, $charset;
}
#
# Get content with Content-Encodings decoded from a response.
sub get_content ($$)
{
my $File = shift;
my $res = shift;
my $content;
eval {
$content = $res->decoded_content(charset => 'none', raise_error => 1);
};
if ($@) {
(my $errmsg = $@) =~ s/ at .*//s;
my $cenc = $res->header("Content-Encoding");
my $uri = $res->request->uri;
$File->{'Error Flagged'} = TRUE;
my $tmpl = &get_error_template($File);
$tmpl->param(fatal_decode_error => TRUE);
$tmpl->param(fatal_decode_errmsg => $errmsg);
$tmpl->param(fatal_decode_cenc => $cenc);
# Include URI because it might be a subsystem (eg. HTML5 validator) one
$tmpl->param(fatal_decode_uri => $uri);
}
return $content;
}
#
# Check recursion level and enforce Max Recursion limit.
sub check_recursion ($$)
{
my $File = shift;
my $res = shift;
# Not looking at our own output.
return unless defined $res->header('X-W3C-Validator-Recursion');
my $lvl = $res->header('X-W3C-Validator-Recursion');
return unless $lvl =~ m/^\d+$/; # Non-digit, i.e. garbage, ignore.
if ($lvl >= $CFG->{'Max Recursion'}) {
print redirect $File->{Env}->{'Home Page'};
}
else {
# Increase recursion level in output.
&get_template($File, 'result.tmpl')->param(depth => $lvl++);
}
}
#
# XML::LibXML::InputCallback matcher using our SGML search path jail.
sub xml_jail_match
{
my $arg = shift;
# Ensure we have a file:// URI if we get a file.
my $uri = URI->new($arg);
if (!$uri->scheme()) {
$uri = URI::file->new_abs($arg);
}
$uri = $uri->canonical();
# Do not trap non-file URIs.
return 0 unless ($uri->scheme() eq "file");
# Do not trap file URIs within our jail.
for my $dir ($CFG->{Paths}->{SGML}->{Library},
split(/\Q$Config{path_sep}\E/o, $ENV{SGML_SEARCH_PATH} || ''))
{
next unless $dir;
my $dir_uri = URI::file->new_abs($dir)->canonical()->as_string();
$dir_uri =~ s|/*$|/|; # ensure it ends with a slash
return 0 if ($uri =~ /^\Q$dir_uri\E/);
}
# We have a match (a file outside the jail).
return 1;
}
#
# Escape text to be included in markup comment.
sub escape_comment
{
local $_ = shift;
return '' unless defined;
s/--/- /g;
return $_;
}
#
# Return $_[0] encoded for HTML entities (cribbed from merlyn).
#
# Note that this is used both for HTML and XML escaping (so e.g. no &apos;).
#
sub ent
{
my $str = shift;
return '' unless defined($str); # Eliminate warnings
# should switch to hex sooner or later
$str =~ s/&/&#38;/g;
$str =~ s/</&#60;/g;
$str =~ s/>/&#62;/g;
$str =~ s/"/&#34;/g;
$str =~ s/'/&#39;/g;
return $str;
}
#
# Truncate source lines for report.
# Expects 1-based column indexes.
sub truncate_line
{
my $line = shift;
my $col = shift;
my $maxlen = 80; # max line length to truncate to
my $diff = length($line) - $maxlen;
# Don't truncate at all if it fits.
return ($line, $col) if ($diff <= 0);
my $start = $col - int($maxlen / 2);
if ($start < 0) {
# Truncate only from end of line.
$start = 0;
$line = substr($line, $start, $maxlen - 1) . '';
}
elsif ($start > $diff) {
# Truncate only from beginning of line.
$start = $diff;
$line = '' . substr($line, $start + 1);
}
else {
# Truncate from both beginning and end of line.
$line = '' . substr($line, $start + 1, $maxlen - 2) . '';
}
# Shift column if we truncated from beginning of line.
$col -= $start;
return ($line, $col);
}
#
# Suppress any existing DOCTYPE by commenting it out.
sub override_doctype
{
my $File = shift;
my ($dt) =
grep { $_->{Display} eq $File->{Opt}->{DOCTYPE} }
values %{$CFG->{Types}};
# @@TODO: abort/whine about unrecognized doctype if $dt is undef.;
my $pubid = $dt->{PubID};
my $sysid = $dt->{SysID};
my $name = $dt->{Name};
# The HTML5 PubID is a fake, reset it out of the way.
$pubid = undef if ($pubid eq 'HTML5');
# We don't have public/system ids for all types.
my $dtd = "<!DOCTYPE $name";
if ($pubid) {
$dtd .= qq( PUBLIC "$pubid");
$dtd .= qq( "$sysid") if $sysid;
}
elsif ($sysid) {
$dtd .= qq( SYSTEM "$sysid");
}
$dtd .= '>';
my $org_dtd = '';
my $HTML = '';
my $seen_doctype = FALSE;
my $declaration = sub {
my ($tag, $text) = @_;
if ($seen_doctype || uc($tag) ne '!DOCTYPE') {
$HTML .= $text;
return;
}
$seen_doctype = TRUE;
$org_dtd = &ent($text);
($File->{Root}, undef, $File->{DOCTYPE}) = $text =~
/<!DOCTYPE\s+(\w[\w\.-]+)(?:\s+(?:PUBLIC|SYSTEM)\s+(['"])(.*?)\2)?\s*>/si;
$File->{DOCTYPE} = 'HTML5'
if (
lc($File->{Root} || '') eq 'html' &&
(!defined($File->{DOCTYPE}) ||
$File->{DOCTYPE} eq 'about:legacy-compat')
);
# No Override if Fallback was requested, or if override is the same as
# detected
my $known = $CFG->{Types}->{$File->{DOCTYPE}};
if ($File->{Opt}->{FB}->{DOCTYPE} or
($known && $File->{Opt}->{DOCTYPE} eq $known->{Display}))
{
$HTML .= $text; # Stash it as is...
}
else {
$HTML .= "$dtd<!-- " . &escape_comment($text) . " -->";
}
};
my $start_element = sub {
my $p = shift;
# Sneak chosen doctype before the root elt if none replaced thus far.
$HTML .= $dtd unless $seen_doctype;
$HTML .= shift;
# We're done with this handler.
$p->handler(start => undef);
};
HTML::Parser->new(
default_h => [sub { $HTML .= shift }, 'text'],
declaration_h => [$declaration, 'tag,text'],
start_h => [$start_element, 'self,text']
)->parse(join "\n", @{$File->{Content}})->eof();
$File->{Content} = [split /\n/, $HTML];
if ($seen_doctype) {
my $known = $CFG->{Types}->{$File->{DOCTYPE}};
unless ($File->{Opt}->{FB}->{DOCTYPE} or
($known && $File->{Opt}->{DOCTYPE} eq $known->{Display}))
{
&add_warning(
'W13',
{ W13_org => $org_dtd,
W13_new => $File->{Opt}->{DOCTYPE},
}
);
$File->{Tentative} |= T_ERROR; # Tag it as Invalid.
}
}
else {
if ($File->{"DOCTYPEless OK"}) {
&add_warning('W25', {W25_dtd => $File->{Opt}->{DOCTYPE}});
}
elsif ($File->{Opt}->{FB}->{DOCTYPE}) {
&add_warning('W16', {W16_dtd => $File->{Opt}->{DOCTYPE}});
$File->{Tentative} |= T_ERROR; # Tag it as Invalid.
}
else {
&add_warning('W15', {W15_dtd => $File->{Opt}->{DOCTYPE}});
$File->{Tentative} |= T_ERROR; # Tag it as Invalid.
}
}
return $File;
}
#
# Override inline charset declarations, for use e.g. when passing
# transcoded results to external parsers that use them.
sub override_charset ($$)
{
my ($File, $charset) = @_;
my $ws = qr/[\x20\x09\x0D\x0A]/o;
my $cs = qr/[A-Za-z][a-zA-Z0-9_-]+/o;
my $content = join("\n", @{$File->{Content}});
# Flatten newlines (so that we don't end up changing line numbers while
# overriding) and comment-escape a string.
sub escape_original ($)
{
my $str = shift;
$str =~ tr/\r\n/ /;
return &escape_comment($str);
}
# <?xml encoding="charset"?>
$content =~ s/(
(^<\?xml\b[^>]*?${ws}encoding${ws}*=${ws}*(["']))
(${cs})
(\3.*?\?>)
)/lc($4) eq lc($charset) ?
"$1" : "$2$charset$5<!-- " . &escape_original($1) . " -->"/esx;
# <meta charset="charset">
$content =~ s/(
(<meta\b[^>]*?${ws}charset${ws}*=${ws}*["']?${ws}*)
(${cs})
(.*?>)
)/lc($3) eq lc($charset) ?
"$1" : "$2$charset$4<!-- " . &escape_original($1) . " -->"/esix;
# <meta http-equiv="content-type" content="some/type; charset=charset">
$content =~ s/(
(<meta\b[^>]*${ws}
http-equiv${ws}*=${ws}*["']?${ws}*content-type\b[^>]*?${ws}
content${ws}*=${ws}*["']?[^"'>]+?;${ws}*charset${ws}*=${ws}*)
(${cs})
(.*?>)
)/lc($3) eq lc($charset) ?
"$1" : "$2$charset$4<!-- " . &escape_original($1) . " -->"/esix;
# <meta content="some/type; charset=charset" http-equiv="content-type">
$content =~ s/(
(<meta\b[^>]*${ws}
content${ws}*=${ws}*["']?[^"'>]+?;${ws}*charset${ws}*=${ws}*)
(${cs})
([^>]*?${ws}http-equiv${ws}*=${ws}*["']?${ws}*content-type\b.*?>)
)/lc($3) eq lc($charset) ?
"$1" : "$2$charset$4<!-- " . &escape_original($1) . " -->"/esix;
$File->{Content} = [split /\n/, $content];
}
sub set_error_uri ($$)
{
my ($err, $uri) = @_;
# We want errors in the doc that was validated to appear without
# $err->{uri}, and non-doc errors with it pointing to the external entity
# or the like where the error is. This usually works as long as we're
# passing docs to parsers as strings, but S::P::O (at least as of 0.994)
# seems to give us "3" as the FileName in those cases so we try to filter
# out everything that doesn't look like a useful URI.
if ($uri && index($uri, '/') != -1) {
# Mask local file paths
my $euri = URI->new($uri);
if (!$euri->scheme() || $euri->scheme() eq 'file') {
$err->{uri_is_file} = TRUE;
$err->{uri} = ($euri->path_segments())[-1];
}
else {
$err->{uri} = $euri->canonical();
}
}
}
#
# Generate a HTML report of detected errors.
sub report_errors ($)
{
my $File = shift;
my $Errors = [];
my %Errors_bytype;
my $number_of_errors = 0;
my $number_of_warnings = 0;
my $number_of_info = 0;
# for the sake of readability, at least until the xmlwf errors have
# explanations, we push the errors from the XML parser at the END of the
# error list.
push @{$File->{Errors}}, @{$File->{WF_Errors}};
if (scalar @{$File->{Errors}}) {
foreach my $err (@{$File->{Errors}}) {
my $col = 0;
# Populate source/context for errors in our doc that don't have it
# already. Checkers should always have populated $err->{src} with
# _something_ for non-doc errors.
if (!defined($err->{src})) {
my $line = undef;
# Avoid truncating lines that do not exist.
if (defined($err->{line}) &&
$File->{Content}->[$err->{line} - 1])
{
if (defined($err->{char}) && $err->{char} =~ /^[0-9]+$/) {
($line, $col) =
&truncate_line(
$File->{Content}->[$err->{line} - 1],
$err->{char});
# Skip &mark_error for JSON to make JSON output Nu Html Checker compatible.
if ($File->{Opt}->{Output} ne 'json') {
$line = &mark_error($line, $col);
}
}
elsif (defined($err->{line})) {
$col = length($File->{Content}->[$err->{line} - 1]);
$col = 80 if ($col > 80);
($line, $col) =
&truncate_line(
$File->{Content}->[$err->{line} - 1], $col);
$line = &ent($line);
$col = 0;
}
}
else {
$col = 0;
}
$err->{src} = $line;
}
my $explanation = "";
if ($err->{expl}) {
}
else {
if ($err->{num}) {
my $num = $err->{num};
$explanation .= Encode::decode_utf8(
"\n $RSRC{msg}->{$num}->{verbose}\n")
if exists $RSRC{msg}->{$num} &&
exists $RSRC{msg}->{$num}->{verbose};
my $_msg = $RSRC{msg}->{nomsg}->{verbose};
$_msg =~ s/<!--MID-->/$num/g;
if (($File->{'Is Upload'}) or ($File->{'Direct Input'})) {
$_msg =~ s/<!--URI-->//g;
}
else {
my $escaped_uri = uri_escape($File->{URI});
$_msg =~ s/<!--URI-->/$escaped_uri/g;
}
# The send feedback plea.
$explanation = " $_msg\n$explanation";
$explanation =~ s/<!--#echo\s+var="relroot"\s*-->//g;
}
$err->{expl} = $explanation;
}
$err->{col} = ' ' x $col;
if ($err->{type} eq 'I') {
$err->{class} = 'msg_info';
$err->{err_type_err} = 0;
$err->{err_type_warn} = 0;
$err->{err_type_info} = 1;
$number_of_info += 1;
}
elsif ($err->{type} eq 'E') {
$err->{class} = 'msg_err';
$err->{err_type_err} = 1;
$err->{err_type_warn} = 0;
$err->{err_type_info} = 0;
$number_of_errors += 1;
}
elsif (($err->{type} eq 'W') or ($err->{type} eq 'X')) {
$err->{class} = 'msg_warn';
$err->{err_type_err} = 0;
$err->{err_type_warn} = 1;
$err->{err_type_info} = 0;
$number_of_warnings += 1;
}
# TODO other classes for "X" etc? FIXME find all types of message.
push @{$Errors}, $err;
if (($File->{Opt}->{'Group Errors'}) and
(($err->{type} eq 'E') or
($err->{type} eq 'W') or
($err->{type} eq 'X'))
)
{
# index by num for errors and warnings only - info usually
# gives context of error or warning
if (!exists $Errors_bytype{$err->{num}}) {
$Errors_bytype{$err->{num}}->{instances} = [];
my $msg_text;
if ($err->{num} eq 'xmlwf') {
# FIXME need a catalog of errors from XML::LibXML
$msg_text = "XML Parsing Error";
}
elsif ($err->{num} eq 'html5') {
$msg_text = "HTML5 Validator Error";
}
else {
$msg_text = $RSRC{msg}->{$err->{num}}->{original};
$msg_text =~ s/%1/X/;
$msg_text =~ s/%2/Y/;
}
$Errors_bytype{$err->{num}}->{expl} = $err->{expl};
$Errors_bytype{$err->{num}}->{generic_msg} = $msg_text;
$Errors_bytype{$err->{num}}->{msg} = $err->{msg};
$Errors_bytype{$err->{num}}->{type} = $err->{type};
$Errors_bytype{$err->{num}}->{class} = $err->{class};
$Errors_bytype{$err->{num}}->{err_type_err} =
$err->{err_type_err};
$Errors_bytype{$err->{num}}->{err_type_warn} =
$err->{err_type_warn};
$Errors_bytype{$err->{num}}->{err_type_info} =
$err->{err_type_info};
}
push @{$Errors_bytype{$err->{num}}->{instances}}, $err;
}
}
}
@$Errors = values(%Errors_bytype) if $File->{Opt}->{'Group Errors'};
# we are not sorting errors by line, as it would break the position
# of auxiliary messages such as "start tag was here". We'll have to live
# with the fact that XML well-formedness errors are listed first, then
# validation errors
#else {
# sort error by lines
# @{$Errors} = sort {$a->{line} <=> $b->{line} } @{$Errors};
#}
return $number_of_errors, $number_of_warnings, $number_of_info, $Errors;
}
#
# Chop the source line into 3 pieces; the character at which the error
# was detected, and everything to the left and right of that position.
# That way we can add markup to the relevant char without breaking &ent().
# Expects 1-based column indexes.
sub mark_error ($$)
{
my $line = shift;
my $col = shift;
my $linelen = length($line);
# Coerce column into an index valid within the line.
if ($col < 1) {
$col = 1;
}
elsif ($col > $linelen) {
$col = $linelen;
}
$col--;
my $left = substr($line, 0, $col);
my $char = substr($line, $col, 1);
my $right = substr($line, $col + 1);
$char = &ent($char);
$char =
qq(<strong title="Position where error was detected.">$char</strong>);
$line = &ent($left) . $char . &ent($right);
return $line;
}
#
# Create a HTML representation of the document.
sub source
{
my $File = shift;
# Remove any BOM since we're not at BOT anymore...
$File->{Content}->[0] = substr($File->{Content}->[0], 1)
if ($File->{BOM} && scalar(@{$File->{Content}}));
my @source = map({file_source_line => $_}, @{$File->{Content}});
return \@source;
}
sub match_DTD_FPI_SI
{
my ($File, $FPI, $SI) = @_;
if ($CFG->{Types}->{$FPI}) {
if ($CFG->{Types}->{$FPI}->{SysID}) {
if ($SI ne $CFG->{Types}->{$FPI}->{SysID}) {
&add_warning(
'W26',
{ W26_dtd_pub => $FPI,
W26_dtd_pub_display =>
$CFG->{Types}->{$FPI}->{Display},
W26_dtd_sys => $SI,
W26_dtd_sys_recommend => $CFG->{Types}->{$FPI}->{SysID}
}
);
}
}
}
else { # FPI not known, checking if the SI is
while (my ($proper_FPI, $value) = each %{$CFG->{Types}}) {
if ($value->{SysID} && $value->{SysID} eq $SI) {
&add_warning(
'W26',
{ W26_dtd_pub => $FPI,
W26_dtd_pub_display => $value->{Display},
W26_dtd_sys => $SI,
W26_dtd_pub_recommend => $proper_FPI
}
);
}
}
}
}
#
# Do an initial parse of the Document Entity to extract FPI.
sub preparse_doctype
{
my $File = shift;
#
# Reset DOCTYPE, Root (for second invocation, probably not needed anymore).
$File->{DOCTYPE} = '';
$File->{Root} = '';
my $dtd = sub {
return if $File->{Root};
# TODO: The \s and \w are probably wrong now that the strings are
# utf8_on
my $declaration = shift;
my $doctype_type;
my $doctype_secondpart;
if ($declaration =~
/<!DOCTYPE\s+html(?:\s+SYSTEM\s+(['"])about:legacy-compat\1)?\s*>/si
)
{
$File->{Root} = "html";
$File->{DOCTYPE} = "HTML5";
}
elsif ($declaration =~
m(<!DOCTYPE\s+(\w[\w\.-]+)\s+(PUBLIC|SYSTEM)\s+(?:[\'\"])([^\"\']+)(?:[\"\'])(.*)>)si
)
{
( $File->{Root}, $doctype_type,
$File->{DOCTYPE}, $doctype_secondpart
) = ($1, $2, $3, $4);
if (($doctype_type eq "PUBLIC") and
(($doctype_secondpart) =
$doctype_secondpart =~
m(\s+(?:[\'\"])([^\"\']+)(?:[\"\']).*)si)
)
{
&match_DTD_FPI_SI($File, $File->{DOCTYPE},
$doctype_secondpart);
}
}
};
my $start = sub {
my ($p, $tag, $attr) = @_;
if ($File->{Root}) {
return unless $tag eq $File->{Root};
}
else {
$File->{Root} = $tag;
}
if ($attr->{xmlns}) {
$File->{Namespace} = $attr->{xmlns};
}
if ($attr->{version}) {
$File->{'Root Version'} = $attr->{version};
}
if ($attr->{baseProfile}) {
$File->{'Root BaseProfile'} = $attr->{baseProfile};
}
# We're done parsing.
$p->eof();
};
# we use HTML::Parser as pre-parser. May use html5lib or other in the future
my $p = HTML::Parser->new(api_version => 3);
# if content-type has shown we should pre-parse with XML mode, use that
# otherwise (mostly text/html cases) use default mode
$p->xml_mode(&is_xml($File));
$p->handler(declaration => $dtd, 'text');
$p->handler(start => $start, 'self,tag,attr');
my $line = 0;
my $max = scalar(@{$File->{Content}});
$p->parse(
sub {
return ($line < $max) ? $File->{Content}->[$line++] . "\n" : undef;
}
);
$p->eof();
# TODO: These \s here are probably wrong now that the strings are utf8_on
$File->{DOCTYPE} = '' unless defined $File->{DOCTYPE};
$File->{DOCTYPE} =~ s(^\s+){ }g;
$File->{DOCTYPE} =~ s(\s+$){ }g;
$File->{DOCTYPE} =~ s(\s+) { }g;
# Some document types actually need no doctype to be identified,
# root element and some version attribute is enough
# TODO applicable doctypes should be migrated to a config file?
# if (($File->{DOCTYPE} eq '') and ($File->{Root} eq "svg") ) {
# if (($File->{'Root Version'}) or ($File->{'Root BaseProfile'}))
# {
# if (! $File->{'Root Version'}) { $File->{'Root Version'} = "0"; }
# if (! $File->{'Root BaseProfile'}) { $File->{'Root BaseProfile'} = "0"; }
# if ($File->{'Root Version'} eq "1.0"){
# $File->{DOCTYPE} = "-//W3C//DTD SVG 1.0//EN";
# $File->{"DOCTYPEless OK"} = TRUE;
# $File->{Opt}->{DOCTYPE} = "SVG 1.0";
# }
# if ((($File->{'Root Version'} eq "1.1") or ($File->{'Root Version'} eq "0")) and ($File->{'Root BaseProfile'} eq "tiny")) {
# $File->{DOCTYPE} = "-//W3C//DTD SVG 1.1 Tiny//EN";
# $File->{"DOCTYPEless OK"} = TRUE;
# $File->{Opt}->{DOCTYPE} = "SVG 1.1 Tiny";
# }
# elsif ((($File->{'Root Version'} eq "1.1") or ($File->{'Root Version'} eq "0")) and ($File->{'Root BaseProfile'} eq "basic")) {
# $File->{DOCTYPE} = "-//W3C//DTD SVG 1.1 Basic//EN";
# $File->{Opt}->{DOCTYPE} = "SVG 1.1 Basic";
# $File->{"DOCTYPEless OK"} = TRUE;
# }
# elsif (($File->{'Root Version'} eq "1.1") and (!$File->{'Root BaseProfile'})) {
# $File->{DOCTYPE} = "-//W3C//DTD SVG 1.1//EN";
# $File->{Opt}->{DOCTYPE} = "SVG 1.1";
# $File->{"DOCTYPEless OK"} = TRUE;
# }
# if ($File->{'Root Version'} eq "0") { $File->{'Root Version'} = undef; }
# if ($File->{'Root BaseProfile'} eq "0") { $File->{'Root BaseProfile'} = undef; }
# }
# else {
# # by default for an svg root elt, we use SVG 1.1
# $File->{DOCTYPE} = "-//W3C//DTD SVG 1.1//EN";
# $File->{Opt}->{DOCTYPE} = "SVG 1.1";
# $File->{"DOCTYPEless OK"} = TRUE;
# }
# }
if (($File->{"DOCTYPEless OK"}) and ($File->{Opt}->{DOCTYPE})) {
# doctypeless document type found, we fake the override
# so that the parser will have something to validate against
$File = &override_doctype($File);
}
return $File;
}
#
# Preprocess CGI parameters.
sub prepCGI
{
my $File = shift;
my $q = shift;
# The URL to this CGI script.
$File->{Env}->{'Self URI'} = $q->url();
# Decode parameter values, set booleans the way we expect them.
foreach my $param ($q->param()) {
# 'uploaded_file' and 'fragment' contain data we treat as is.
next if ($param eq 'uploaded_file' || $param eq 'fragment');
# Decode all other defined values as UTF-8.
my @values = map { Encode::decode_utf8($_) } $q->multi_param($param);
$q->param($param, @values);
# Skip parameters that should not be treated as booleans.
next if $param =~ /^(?:accept(?:-(?:language|charset))?|ur[il])$/;
# Keep false-but-set params.
next if $q->param($param) eq '0';
# Parameters that are given to us without specifying a value get set
# to a true value.
$q->param($param, TRUE) unless $q->param($param);
}
$File->{Env}->{'Home Page'} =
URI->new_abs(".", $File->{Env}->{'Self URI'});
# Use "url" unless a "uri" was also given.
if ($q->param('url') and not $q->param('uri')) {
$q->param('uri', scalar $q->param('url'));
}
# Set output mode; needed in get_error_template if we end up there.
$File->{Opt}->{Output} = scalar $q->param('output') || 'html';
# Issue a redirect for uri=referer.
if ($q->param('uri') and $q->param('uri') eq 'referer') {
if ($q->referer) {
$q->param('uri', $q->referer);
$q->param('accept', $q->http('Accept')) if ($q->http('Accept'));
$q->param('accept-language', $q->http('Accept-Language'))
if ($q->http('Accept-Language'));
$q->param('accept-charset', $q->http('Accept-Charset'))
if ($q->http('Accept-Charset'));
print redirect(-uri => &self_url_q($q, $File), -vary => 'Referer');
exit;
}
else {
# No Referer header was found.
$File->{'Error Flagged'} = TRUE;
&get_error_template($File)->param(fatal_referer_error => TRUE);
}
}
# Supersede URL with an uploaded file.
if ($q->param('uploaded_file')) {
$q->param('uri', 'upload://' . scalar $q->param('uploaded_file'));
$File->{'Is Upload'} = TRUE; # Tag it for later use.
}
# Supersede URL with an uploaded fragment.
if ($q->param('fragment')) {
$q->param('uri', 'upload://Form Submission');
$File->{'Direct Input'} = TRUE; # Tag it for later use.
}
# Redirect to a GETable URL if method is POST without a file upload.
if (defined $q->request_method and
$q->request_method eq 'POST' and
not($File->{'Is Upload'} or $File->{'Direct Input'}))
{
my $thispage = &self_url_q($q, $File);
print redirect $thispage;
exit;
}
#
# Flag an error if we didn't get a file to validate.
unless ($q->param('uri')) {
$File->{'Error Flagged'} = TRUE;
my $tmpl = &get_error_template($File);
$tmpl->param(fatal_uri_error => TRUE);
$tmpl->param(fatal_uri_scheme => 'undefined');
}
return $q;
}
#
# Set parse mode (SGML or XML) based on a number of preparsed factors:
# * HTTP Content-Type
# * Doctype Declaration
# * XML Declaration
# * XML namespaces
sub set_parse_mode
{
my $File = shift;
my $CFG = shift;
my $fpi = $File->{DOCTYPE};
$File->{ModeChoice} = '';
my $parseModeFromDoctype = $CFG->{Types}->{$fpi}->{'Parse Mode'} || 'TBD';
my $xmlws = qr/[\x20\x09\x0D\x0A]/o;
# $File->{Mode} may have been set in parse_content_type
# and it would come from the Media Type
my $parseModeFromMimeType = $File->{Mode};
my $begincontent = join "\x20",
@{$File->{Content}}; # for the sake of xml decl detection,
# the 10 first lines should be safe
my $parseModeFromXMLDecl = (
$begincontent =~
/^ ${xmlws}* # whitespace before the decl should not be happening
# but we are greedy for the sake of detection, not validation
<\?xml ${xmlws}+ # start matching an XML Declaration
version ${xmlws}* = # for documents, version info is mandatory
${xmlws}* (["'])1.[01]\1 # hardcoding the existing XML versions.
# Maybe we should use \d\.\d
(?:${xmlws}+ encoding
${xmlws}* = ${xmlws}*
(["'])[A-Za-z][a-zA-Z0-9_-]+\2
)? # encoding info is optional
(?:${xmlws}+ standalone
${xmlws}* = ${xmlws}*
(["'])(?:yes|no)\3
)? # ditto standalone info, optional
${xmlws}* \?> # end of XML Declaration
/ox
?
'XML' :
'TBD'
);
my $parseModeFromNamespace = 'TBD';
# http://www.w3.org/Bugs/Public/show_bug.cgi?id=9967
$parseModeFromNamespace = 'XML'
if ($File->{Namespace} && $parseModeFromDoctype ne 'HTML5');
if (($parseModeFromMimeType eq 'TBD') and
($parseModeFromXMLDecl eq 'TBD') and
($parseModeFromNamespace eq 'TBD') and
(!exists $CFG->{Types}->{$fpi}))
{
# if the mime type is text/html (ambiguous, hence TBD mode)
# and the doctype isn't in the catalogue
# and XML prolog detection was unsuccessful
# and we found no namespace at the root
# ... throw in a warning
&add_warning(
'W06',
{ W06_mime => $File->{ContentType},
w06_doctype => $File->{DOCTYPE}
}
);
return;
}
$parseModeFromDoctype = 'TBD'
unless $parseModeFromDoctype eq 'SGML' or
$parseModeFromDoctype eq 'HTML5' or
$parseModeFromDoctype eq 'XML' or
$parseModeFromNamespace eq 'XML';
if (($parseModeFromDoctype eq 'TBD') and
($parseModeFromXMLDecl eq 'TBD') and
($parseModeFromMimeType eq 'TBD') and
($parseModeFromNamespace eq 'TBD'))
{
# if all factors are useless to give us a parse mode
# => we use SGML-based DTD validation as a default
$File->{Mode} = 'DTD+SGML';
$File->{ModeChoice} = 'Fallback';
# and send warning about the fallback
&add_warning(
'W06',
{ W06_mime => $File->{ContentType},
w06_doctype => $File->{DOCTYPE}
}
);
return;
}
if ($parseModeFromMimeType ne 'TBD') {
# if The mime type gives clear indication of whether the document is
# XML or not
if (($parseModeFromDoctype ne 'TBD') and
($parseModeFromDoctype ne 'HTML5') and
($parseModeFromMimeType ne $parseModeFromDoctype))
{
# if document-type recommended mode and content-type recommended
# mode clash, shoot a warning
# unknown doctypes will not trigger this
# neither will html5 documents, which can be XML or not
&add_warning(
'W07',
{ W07_mime => $File->{ContentType},
W07_ct => $parseModeFromMimeType,
W07_dtd => $parseModeFromDoctype,
}
);
}
# mime type has precedence, we stick to it
$File->{ModeChoice} = 'Mime';
if ($parseModeFromDoctype eq "HTML5") {
$File->{Mode} = 'HTML5+' . $File->{Mode};
}
else {
$File->{Mode} = 'DTD+' . $File->{Mode};
}
return;
}
if ($parseModeFromDoctype ne 'TBD') {
# the mime type is ambiguous (hence we didn't stop at the previous test)
# but by now we're sure that the document type is a good indication
# so we use that.
if ($parseModeFromDoctype eq "HTML5") {
if ($parseModeFromXMLDecl eq "XML" or
$parseModeFromNamespace eq "XML")
{
$File->{Mode} = "HTML5+XML";
}
else {
$File->{Mode} = "HTML5";
}
}
else { # not HTML5
$File->{Mode} = "DTD+" . $parseModeFromDoctype;
}
$File->{ModeChoice} = 'Doctype';
return;
}
if ($parseModeFromXMLDecl ne 'TBD') {
# the mime type is ambiguous (hence we didn't stop at the previous test)
# and so was the doctype
# but we found an XML declaration so we use that.
if ($File->{Mode} eq "") {
$File->{Mode} = "DTD+" . $parseModeFromXMLDecl;
}
elsif ((my $ix = index($File->{Mode}, '+')) != -1) {
substr($File->{Mode}, $ix + 1) = $parseModeFromXMLDecl;
}
else {
$File->{Mode} = $File->{Mode} . "+" . $parseModeFromXMLDecl;
}
$File->{ModeChoice} = 'XMLDecl';
return;
}
# this is the last case. We know that all modes are not TBD,
# yet mime type, doctype AND XML DECL tests have failed => we are saved
# by the presence of namespaces
if ($File->{Mode} eq "") {
$File->{Mode} = "DTD+" . $parseModeFromNamespace;
}
elsif ((my $ix = index($File->{Mode}, '+')) != -1) {
substr($File->{Mode}, $ix + 1) = $parseModeFromNamespace;
}
else {
$File->{Mode} = $File->{Mode} . "+" . $parseModeFromNamespace;
}
$File->{ModeChoice} = 'Namespace';
}
#
# Utility sub to tell if mode "is" XML.
sub is_xml
{
index(shift->{Mode}, 'XML') != -1;
}
#
# Check charset conflicts and add any warnings necessary.
sub charset_conflicts
{
my $File = shift;
#
# Handle the case where there was no charset to be found.
unless ($File->{Charset}->{Use}) {
&add_warning('W17', {});
$File->{Tentative} |= T_WARN;
}
#
# Add a warning if there was charset info conflict (HTTP header,
# XML declaration, or <meta> element).
# filtering out some of the warnings in direct input mode where HTTP
# encoding is a "fake"
if (( charset_not_equal(
$File->{Charset}->{HTTP},
$File->{Charset}->{XML}
)
) and
not($File->{'Direct Input'})
)
{
&add_warning(
'W18',
{ W18_http => $File->{Charset}->{HTTP},
W18_xml => $File->{Charset}->{XML},
W18_use => $File->{Charset}->{Use},
}
);
}
elsif (
charset_not_equal($File->{Charset}->{HTTP}, $File->{Charset}->{META})
and
not($File->{'Direct Input'}))
{
&add_warning(
'W19',
{ W19_http => $File->{Charset}->{HTTP},
W19_meta => $File->{Charset}->{META},
W19_use => $File->{Charset}->{Use},
}
);
}