Skip to content

Commit

Permalink
feat: update the reporting of file encoding issues
Browse files Browse the repository at this point in the history
This commit changes the errors and warnings reported when EPUBCheck
detects an invalid or non-recommended file encoding.

The following errors codes are now reported:

- `RSC-027` (new): warning reported for XML documents encoded in UTF-16
- `RSC-028` (new): error reported for XML documents in an invalid encoding
- `HTM-058` (new): error reported for XHTML encoded in UTF-16
  (the HTML standard has an authoring requirement for UTF-8)
- `CSS-003` (updated): warning reported for CSS encoded in UTF-16
- `CSS-004` (updated): error reported for CSS in an invalid encoding

Note: previously both `CSS-003` and `CSS-004` were errors, reported for
a disallowed encoding. The only difference is that one was reported when
the encoding was detected from a BOM, the other from a `@charset`
declaration.
This commit repurposes `CSS-003` as the warning raised for UTF-16, and
`CSS-004` as the disallowed-encoding error.

Fixes #1245
  • Loading branch information
rdeltour committed Nov 17, 2022
1 parent 111e772 commit 0d6f927
Show file tree
Hide file tree
Showing 43 changed files with 318 additions and 27 deletions.
21 changes: 15 additions & 6 deletions src/main/java/com/adobe/epubcheck/css/CSSChecker.java
Original file line number Diff line number Diff line change
Expand Up @@ -138,17 +138,26 @@ CssSource getCssSource()
if (source.getInputStream().getBomCharset().isPresent())
{
charset = source.getInputStream().getBomCharset().get().toLowerCase(Locale.ROOT);
if (!charset.equals("utf-8") && !charset.startsWith("utf-16"))
if (!charset.equals("utf-8"))
{
report.message(MessageId.CSS_004, EPUBLocation.of(context), charset);
if (charset.startsWith("utf-16"))
{
report.message(MessageId.CSS_003, EPUBLocation.of(context), charset);
} else {
report.message(MessageId.CSS_004, EPUBLocation.of(context), charset);
}
}
}
if (source.getInputStream().getCssCharset().isPresent())
} else if (source.getInputStream().getCssCharset().isPresent())
{
charset = source.getInputStream().getCssCharset().get().toLowerCase(Locale.ROOT);
if (!charset.equals("utf-8") && !charset.startsWith("utf-16"))
if (!charset.equals("utf-8"))
{
report.message(MessageId.CSS_003, EPUBLocation.of(context), charset);
if (charset.startsWith("utf-16"))
{
report.message(MessageId.CSS_003, EPUBLocation.of(context), charset);
} else {
report.message(MessageId.CSS_004, EPUBLocation.of(context), charset);
}
}
}
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -69,7 +69,7 @@ private void initialize()
// CSS
severities.put(MessageId.CSS_001, Severity.ERROR);
severities.put(MessageId.CSS_002, Severity.ERROR);
severities.put(MessageId.CSS_003, Severity.ERROR);
severities.put(MessageId.CSS_003, Severity.WARNING);
severities.put(MessageId.CSS_004, Severity.ERROR);
severities.put(MessageId.CSS_005, Severity.ERROR);
severities.put(MessageId.CSS_006, Severity.USAGE);
Expand Down Expand Up @@ -141,6 +141,7 @@ private void initialize()
severities.put(MessageId.HTM_055, Severity.WARNING);
severities.put(MessageId.HTM_056, Severity.ERROR);
severities.put(MessageId.HTM_057, Severity.ERROR);
severities.put(MessageId.HTM_058, Severity.ERROR);

// Media
severities.put(MessageId.MED_001, Severity.ERROR);
Expand Down Expand Up @@ -337,6 +338,8 @@ private void initialize()
severities.put(MessageId.RSC_024, Severity.USAGE);
severities.put(MessageId.RSC_025, Severity.USAGE);
severities.put(MessageId.RSC_026, Severity.ERROR);
severities.put(MessageId.RSC_027, Severity.WARNING);
severities.put(MessageId.RSC_028, Severity.ERROR);

// Scripting
severities.put(MessageId.SCP_001, Severity.SUPPRESSED); // checking scripts is out of scope
Expand Down
3 changes: 3 additions & 0 deletions src/main/java/com/adobe/epubcheck/messages/MessageId.java
Original file line number Diff line number Diff line change
Expand Up @@ -135,6 +135,7 @@ public enum MessageId implements Comparable<MessageId>
HTM_055("HTM_055"),
HTM_056("HTM_056"),
HTM_057("HTM_057"),
HTM_058("HTM_058"),

// Messages associated with media (images, audio and video)
MED_001("MED-001"),
Expand Down Expand Up @@ -331,6 +332,8 @@ public enum MessageId implements Comparable<MessageId>
RSC_024("RSC-024"),
RSC_025("RSC-025"),
RSC_026("RSC-026"),
RSC_027("RSC-027"),
RSC_028("RSC-028"),

// Messages relating to scripting
SCP_001("SCP-001"),
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
import java.io.InputStream;
import java.util.Locale;

public final class EncodingSniffer
public final class XMLEncodingSniffer
{

private static final byte[][] UTF16_MAGIC = { { (byte) 0xFE, (byte) 0xFF },
Expand Down Expand Up @@ -108,7 +108,7 @@ public static String sniffEncoding(InputStream in)
return encoding.toUpperCase(Locale.ROOT);
}

private EncodingSniffer()
private XMLEncodingSniffer()
{
// Not instantiable.
}
Expand Down
30 changes: 26 additions & 4 deletions src/main/java/com/adobe/epubcheck/xml/XMLParser.java
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@
import javax.xml.parsers.SAXParser;
import javax.xml.parsers.SAXParserFactory;

import org.w3c.epubcheck.constants.MIMEType;
import org.xml.sax.ContentHandler;
import org.xml.sax.ErrorHandler;
import org.xml.sax.InputSource;
Expand Down Expand Up @@ -135,13 +136,33 @@ public void process()
}

// Check encoding
String encoding = EncodingSniffer.sniffEncoding(buffered);
if (encoding != null && !encoding.equals("UTF-8") && !encoding.equals("UTF-16"))
// If the result is null, the XML parser must parse it as UTF-8
String encoding = XMLEncodingSniffer.sniffEncoding(buffered);
if (encoding != null && !encoding.equals("UTF-8"))
{
report.message(MessageId.CSS_003, EPUBLocation.of(context), encoding);
if (encoding.equals("UTF-16"))
{
// XHTML requires UTF-8, UTF-16 is reported as an error
if (MIMEType.XHTML.is(context.mimeType))
{
report.message(MessageId.HTM_058, EPUBLocation.of(context));
}
// For other XML types, UTF-16 is reported as a warning
else
{
report.message(MessageId.RSC_027, EPUBLocation.of(context));
}
}
else
{
report.message(MessageId.RSC_028, EPUBLocation.of(context), encoding);
}
}

// Build the input source
// We do not set the source encoding name, but instead let the SAXParser
// apply its own encoding-sniffing logic, as it can report useful errors
// (for instance a mismatch between a BOM and the XML declaration)
InputSource source = new InputSource(buffered);
source.setSystemId(url.toString());

Expand All @@ -163,7 +184,8 @@ public void process()
} catch (SAXException e)
{
// All errors should have already been reported by the error handler
if (report.getFatalErrorCount() == 0) {
if (report.getFatalErrorCount() == 0)
{
report.message(MessageId.RSC_016, EPUBLocation.of(context), e.getMessage());
}
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -40,8 +40,8 @@ CHK_008=Error encountered while processing an item "%1$s"; skip other checks for
#CSS
CSS_001=The "%1$s" property must not be included in an EPUB Style Sheet.
CSS_002=Empty or NULL reference found.
CSS_003=Only UTF-8 and UTF-16 encodings are allowed, detected %1$s.
CSS_004=Only UTF-8 and UTF-16 encodings are allowed, detected %1$s BOM.
CSS_003=CSS document is encoded in UTF-16. It should be encoded in UTF-8 instead.
CSS_004=CSS documents must be encoded in UTF-8, detected %1$s.
CSS_005=Conflicting alternate style attributes found: %1$s.
CSS_006=CSS selector specifies fixed position.
CSS_007=Font-face reference "%1$s" refers to non-standard font type "%2$s".
Expand All @@ -68,7 +68,7 @@ CSS_025=CSS class Selector could not be found.
CSS_025_SUG=Check for typos or define a class selector to document the use of the class.
CSS_028=Use of Font-face declaration.
CSS_029=Found CSS class name "%1$s" but no "%2$s" property was declared in the package document.
CSS_030=The package document declares media overlays styling class names but no CSS was found in the content document.
CSS_030=The package document declares media overlays styling class names but no CSS was found in the content document.

#HTM - XHTML related messages
HTM_001=Any publication resource that is an XML-based media type must be a valid XML 1.0 document. XML version found: %1$s.
Expand Down Expand Up @@ -125,7 +125,8 @@ HTM_053=Found an external file link (file://) in file: "%1$s".
HTM_054=Custom attribute namespace ("%1$s") must not include the string "%2$s" in its domain.
HTM_055=The "%1$s" element should not be used (discouraged construct)
HTM_056=Viewport metadata has no "%1$s" dimension (both "width" and "height" properties are required)
HTM_057=Viewport "%1$s" value must be a positive number or the keyword "device-%1$s"
HTM_057=Viewport "%1$s" value must be a positive number or the keyword "device-%1$s"
HTM_058=HTML documents must be encoded in UTF-8, but UTF-16 was detected.

#media
MED_001=Video poster must have core media image type.
Expand Down Expand Up @@ -346,7 +347,9 @@ RSC_022=Cannot check image details (requires Java version 7 or higher).
RSC_023=Couldn’t parse host of URL "%1$s" (probably due to disallowed characters or missing slashes after the protocol)
RSC_024=Informative parsing warning: %1$s
RSC_025=Informative parsing error: %1$s
RSC_026=URL "%1$s" leaks outside the container (it is not a valid-relative-ocf-URL-with-fragment string)
RSC_026=URL "%1$s" leaks outside the container (it is not a valid-relative-ocf-URL-with-fragment string)
RSC_027=XML document is encoded in UTF-16. It should be encoded in UTF-8 instead.
RSC_028=XML documents must be encoded in UTF-8, but %1$s was detected.

#Scripting
SCP_001=Use of Javascript eval() function in EPUB scripts is a security risk.
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
<?xml version="1.0" encoding="ISO-8859-1"?>
<package xmlns="http://www.idpf.org/2007/opf" version="3.0" xml:lang="en" unique-identifier="q">
<metadata xmlns:dc="http://purl.org/dc/elements/1.1/">
<dc:title id="title">Minimal EPUB 3.0</dc:title>
<dc:language>en</dc:language>
<dc:identifier id="q">NOID</dc:identifier>
<meta property="dcterms:modified">2017-06-14T00:00:01Z</meta>
</metadata>
<manifest>
<item id="content_001" href="content_001.xhtml" media-type="application/xhtml+xml"/>
<item id="nav" href="nav.xhtml" media-type="application/xhtml+xml" properties="nav"/>
</manifest>
<spine>
<itemref idref="content_001" />
</spine>
</package>
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
<?xml version="1.0" encoding="unknown"?>
<package xmlns="http://www.idpf.org/2007/opf" version="3.0" xml:lang="en" unique-identifier="q">
<metadata xmlns:dc="http://purl.org/dc/elements/1.1/">
<dc:title id="title">Minimal EPUB 3.0</dc:title>
<dc:language>en</dc:language>
<dc:identifier id="q">NOID</dc:identifier>
<meta property="dcterms:modified">2017-06-14T00:00:01Z</meta>
</metadata>
<manifest>
<item id="content_001" href="content_001.xhtml" media-type="application/xhtml+xml"/>
<item id="nav" href="nav.xhtml" media-type="application/xhtml+xml" properties="nav"/>
</manifest>
<spine>
<itemref idref="content_001" />
</spine>
</package>
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
<?xml version="1.0" encoding="UTF-8"?>
<package xmlns="http://www.idpf.org/2007/opf" version="3.0" xml:lang="en" unique-identifier="q">
<metadata xmlns:dc="http://purl.org/dc/elements/1.1/">
<dc:title id="title">Minimal EPUB 3.0</dc:title>
<dc:language>en</dc:language>
<dc:identifier id="q">NOID</dc:identifier>
<meta property="dcterms:modified">2017-06-14T00:00:01Z</meta>
</metadata>
<manifest>
<item id="content_001" href="content_001.xhtml" media-type="application/xhtml+xml"/>
<item id="nav" href="nav.xhtml" media-type="application/xhtml+xml" properties="nav"/>
</manifest>
<spine>
<itemref idref="content_001" />
</spine>
</package>
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
<?xml version="1.0" encoding="UTF-8"?>
<package xmlns="http://www.idpf.org/2007/opf" version="3.0" xml:lang="en" unique-identifier="q">
<metadata xmlns:dc="http://purl.org/dc/elements/1.1/">
<dc:title id="title">Minimal EPUB 3.0</dc:title>
<dc:language>en</dc:language>
<dc:identifier id="q">NOID</dc:identifier>
<meta property="dcterms:modified">2017-06-14T00:00:01Z</meta>
</metadata>
<manifest>
<item id="content_001" href="content_001.xhtml" media-type="application/xhtml+xml"/>
<item id="nav" href="nav.xhtml" media-type="application/xhtml+xml" properties="nav"/>
</manifest>
<spine>
<itemref idref="content_001" />
</spine>
</package>
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
<package xmlns="http://www.idpf.org/2007/opf" version="3.0" xml:lang="en" unique-identifier="q">
<metadata xmlns:dc="http://purl.org/dc/elements/1.1/">
<dc:title id="title">Minimal EPUB 3.0</dc:title>
<dc:language>en</dc:language>
<dc:identifier id="q">NOID</dc:identifier>
<meta property="dcterms:modified">2017-06-14T00:00:01Z</meta>
</metadata>
<manifest>
<item id="content_001" href="content_001.xhtml" media-type="application/xhtml+xml"/>
<item id="nav" href="nav.xhtml" media-type="application/xhtml+xml" properties="nav"/>
</manifest>
<spine>
<itemref idref="content_001" />
</spine>
</package>
58 changes: 56 additions & 2 deletions src/test/resources/epub3/03-resources/resources.feature
Original file line number Diff line number Diff line change
Expand Up @@ -388,18 +388,72 @@

## 3.9 XML conformance

@spec @xref:sec-xml-constraint
@spec @xref:sec-xml-constraints
Scenario: an XML document encoded as UTF-8 with an encoding declaration is valid
When checking file 'xml-encoding-utf8-declared-valid.opf'
Then no errors or warnings are reported

@spec @xref:sec-xml-constraints
Scenario: an XML document encoded as UTF-8 with a BOM is valid
When checking file 'xml-encoding-utf8-BOM-valid.opf'
Then no errors or warnings are reported

@spec @xref:sec-xml-constraints
Scenario: an XML document encoded as UTF-8 with no encoding declaration is valid
When checking file 'xml-encoding-utf8-no-declaration-valid.opf'
Then no errors or warnings are reported

@spec @xref:sec-xml-constraints
Scenario: Warn about an XML document encoded as UTF-16 (with an encoding declaration)
When checking file 'xml-encoding-utf16-declared-warning.opf'
Then warning RSC-027 is reported
And no other errors or warnings are reported

@spec @xref:sec-xml-constraints
Scenario: Warn about an XML document encoded as UTF-16 (not declared but with a BOM)
When checking file 'xml-encoding-utf16-BOM-no-declaration-warning.opf'
Then warning RSC-027 is reported
And no other errors or warnings are reported

@spec @xref:sec-xml-constraints
Scenario: Warn about an XML document encoded as UTF-16 (even with an UTF-8 declaration)
When checking file 'xml-encoding-utf16-BOM-and-utf8-declaration-warning.opf'
Then warning RSC-027 is reported
And fatal error RSC-016 is reported (by the XML parser)
And no other errors or warnings are reported

@spec @xref:sec-xml-constraints
Scenario: Report an XML document encoded as ISO-8859-1 (detected in the encoding declaration)
When checking file 'xml-encoding-latin1-declaration-error.opf'
Then error RSC-028 is reported
And no other errors or warnings are reported

@spec @xref:sec-xml-constraints
Scenario: Report an XML document encoded as UCS-4 (detected with a BOM)
When checking file 'xml-encoding-utf32-BOM-error.opf'
Then error RSC-028 is reported
And no other errors or warnings are reported

@spec @xref:sec-xml-constraints
Scenario: Report an XML document declared with an unknown encoding name
When checking file 'xml-encoding-unknown-declared-error.opf'
Then error RSC-028 is reported
And fatal error RSC-016 is reported (by the XML parser)
And no other errors or warnings are reported

@spec @xref:sec-xml-constraints
Scenario: a not well-formed Package Document is reported
When checking file 'conformance-xml-malformed-error.opf'
Then fatal error RSC-016 is reported (parsing error)
And no other errors or warnings are reported

@spec @xref:sec-xml-constraint
@spec @xref:sec-xml-constraints
Scenario: using a not-declared namespace is not allowed
When checking file 'conformance-xml-undeclared-namespace-error.opf'
Then fatal error RSC-016 is reported (parsing error)
And no other errors or warnings are reported


Scenario: Verify an attribute value with leading/trailing whitespace is allowed (issue 332)
When checking EPUB 'conformance-xml-id-leading-trailing-spaces-valid'
Then no errors or warnings are reported
Original file line number Diff line number Diff line change
Expand Up @@ -38,15 +38,26 @@ Feature: EPUB 3 — Content Documents — CSS
##### Encoding

@spec @xref:sec-css-req
Scenario: Verify a CSS file with a `@charset` declaration and UTF8 encoding
See also issue #262
When checking EPUB 'content-css-charset-utf8-valid'
Scenario: Verify a CSS document encoded in UTF-8 (declared with `@charset`)
When checking EPUB 'content-css-encoding-utf8-declared-valid'
Then no errors or warnings are reported

@spec @xref:sec-css-req
Scenario: Report a CSS file with a `@charset` declaration that is not utf-8
When checking EPUB 'content-css-charset-enc-error'
Then error CSS-003 is reported
Scenario: Warn about a CSS document encoded in UTF-16 (declared with `@charset`)
When checking EPUB 'content-css-encoding-utf16-declared-warning'
Then warning CSS-003 is reported
And no other errors or warnings are reported

@spec @xref:sec-css-req
Scenario: Warn about a CSS document encoded in UTF-16 (and no `@charset` declaration)
When checking EPUB 'content-css-encoding-utf16-not-declared-warning'
Then warning CSS-003 is reported
And no other errors or warnings are reported

@spec @xref:sec-css-req
Scenario: Report a CSS document with a `@charset` declaration that is not utf-8 or utf-16
When checking EPUB 'content-css-encoding-latin1-error'
Then error CSS-004 is reported
And no other errors or warnings are reported

##### Resources and imports
Expand Down
Loading

0 comments on commit 0d6f927

Please sign in to comment.