diff --git a/src/main/java/com/adobe/epubcheck/ops/OPSHandler.java b/src/main/java/com/adobe/epubcheck/ops/OPSHandler.java index 68d8a4d5..f5c71e5d 100755 --- a/src/main/java/com/adobe/epubcheck/ops/OPSHandler.java +++ b/src/main/java/com/adobe/epubcheck/ops/OPSHandler.java @@ -22,8 +22,10 @@ package com.adobe.epubcheck.ops; +import java.net.MalformedURLException; import java.net.URI; import java.net.URISyntaxException; +import java.net.URL; import java.util.Locale; import java.util.Stack; @@ -226,15 +228,22 @@ else if (".".equals(href)) { report.info(path, FeatureEnum.REFERENCE, href); - /* - * #708 report invalid HTTP/HTTPS URLs - * uri.scheme may be correct, but missing a : or a / from the // - * leads to uri.getHost() == null - */ + // Report if the host part couldn't be parsed correctly + // (either due to missing slashes (issue #708) or invalid characters (issue #1034)) if (uri.getHost() == null) { - int missingSlashes = uri.getSchemeSpecificPart().startsWith("/") ? 1 : 2; - report.message(MessageId.RSC_023, parser.getLocation(), uri, missingSlashes, uri.getScheme()); + try + { + // if the URL contains underscore characters, try reparsing it without them, + // as underscores are accepted by browsers in the host part (even if it's disallowed) + // see issue #1079 + if (!href.contains("_") || new URI(href.replace('_', 'x')).getHost() == null) { + report.message(MessageId.RSC_023, parser.getLocation(), uri); + } + } catch (URISyntaxException ignored) + { + // ignored (well-formedness errors are caught earlier) + } } } diff --git a/src/main/resources/com/adobe/epubcheck/messages/MessageBundle.properties b/src/main/resources/com/adobe/epubcheck/messages/MessageBundle.properties index 6c6bfa53..316c39cb 100644 --- a/src/main/resources/com/adobe/epubcheck/messages/MessageBundle.properties +++ b/src/main/resources/com/adobe/epubcheck/messages/MessageBundle.properties @@ -326,7 +326,7 @@ RSC_019=EPUBs with Multiple Renditions should contain a META-INF/metadata.xml fi 
RSC_020='%1$s' is not a valid URI. RSC_021=A Search Key Map Document must point to Content Documents ('%1$s' was not found in the spine). RSC_022=Cannot check image details (requires Java version 7 or higher). -RSC_023=The URL '%1$s' is missing %2$d slash(es) '/' after the protocol '%3$s:' +RSC_023=Couldn't parse host of URL '%1$s' (probably due to disallowed characters or missing slashes after the protocol) #Scripting SCP_001=Use of Javascript eval() function in EPUB scripts is a security risk. diff --git a/src/test/java/com/adobe/epubcheck/ops/OPSCheckerTest.java b/src/test/java/com/adobe/epubcheck/ops/OPSCheckerTest.java index 5a2798e8..75ba890a 100644 --- a/src/test/java/com/adobe/epubcheck/ops/OPSCheckerTest.java +++ b/src/test/java/com/adobe/epubcheck/ops/OPSCheckerTest.java @@ -230,7 +230,7 @@ public void testValidateXHTMLLINKInvalid() public void testValidateXHTMLUrlChecksInvalid() { Collections.addAll(expectedErrors, MessageId.RSC_020); - Collections.addAll(expectedWarnings, MessageId.HTM_025, MessageId.RSC_023, MessageId.RSC_023); + Collections.addAll(expectedWarnings, MessageId.HTM_025, MessageId.RSC_023, MessageId.RSC_023, MessageId.RSC_023); testValidateDocument("xhtml/invalid/url-checks_issue-708.xhtml", "application/xhtml+xml", EPUBVersion.VERSION_3); } diff --git a/src/test/resources/30/single/xhtml/invalid/url-checks_issue-708.xhtml b/src/test/resources/30/single/xhtml/invalid/url-checks_issue-708.xhtml index 8e672dde..16e62c06 100644 --- a/src/test/resources/30/single/xhtml/invalid/url-checks_issue-708.xhtml +++ b/src/test/resources/30/single/xhtml/invalid/url-checks_issue-708.xhtml @@ -11,7 +11,9 @@ Unsupported URI scheme (HTM-025) URL is missing slashes after protocol (RSC-023) URL is missing slashes after protocol (RSC-023) - + Host contains an invalid character (RSC-023) + + Underscores in hosts are accepted in most browsers Valid URI Valid URI Valid URI