From 52ebd8057779460e40570acccc7e4d945dda05d9 Mon Sep 17 00:00:00 2001 From: Romain Deltour Date: Sun, 14 Nov 2021 14:34:24 +0100 Subject: [PATCH] feat: new check (OPF-092) for language tags well-formedness MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit In the Package Document, the language tags appearing in the elements or attributes below MUST be well-formed according to BCP47: - `xml:lang` attribute - `hreflang` attribute - `dc:language` element For these values: - the schema now only does a basic datatype check (string, non-empty value when relevant) - the well-formedness is checked with Java’s Locale.Builder#setLanguageTag() API - a new check (OPF-092) is reported when an ill-formed value is found See https://docs.oracle.com/javase/8/docs/api/java/util/Locale.Builder.html#setLanguageTag-java.lang.String- Fix #1221 Close #702 --- .../epubcheck/messages/DefaultSeverities.java | 1 + .../adobe/epubcheck/messages/MessageId.java | 1 + .../com/adobe/epubcheck/opf/OPFHandler30.java | 37 +++++++++++++++++++ .../messages/MessageBundle.properties | 1 + .../adobe/epubcheck/schema/30/package-30.rnc | 4 +- .../attr-lang-not-well-formed-error.opf | 17 +++++++++ .../attr-lang-whitespace-error.opf | 17 +++++++++ .../link-hreflang-not-well-formed-error.opf | 17 +++++++++ .../link-hreflang-whitespace-error.opf | 17 +++++++++ .../metadata-language-empty-error.opf | 16 ++++++++ ...etadata-language-not-well-formed-error.opf | 16 ++++++++ .../resources/epub3/package-document.feature | 33 ++++++++++++++++- 12 files changed, 174 insertions(+), 3 deletions(-) create mode 100644 src/test/resources/epub3/files/package-document/attr-lang-not-well-formed-error.opf create mode 100644 src/test/resources/epub3/files/package-document/attr-lang-whitespace-error.opf create mode 100644 src/test/resources/epub3/files/package-document/link-hreflang-not-well-formed-error.opf create mode 100644 
src/test/resources/epub3/files/package-document/link-hreflang-whitespace-error.opf create mode 100644 src/test/resources/epub3/files/package-document/metadata-language-empty-error.opf create mode 100644 src/test/resources/epub3/files/package-document/metadata-language-not-well-formed-error.opf diff --git a/src/main/java/com/adobe/epubcheck/messages/DefaultSeverities.java b/src/main/java/com/adobe/epubcheck/messages/DefaultSeverities.java index 68b419eb4..312acb6f9 100644 --- a/src/main/java/com/adobe/epubcheck/messages/DefaultSeverities.java +++ b/src/main/java/com/adobe/epubcheck/messages/DefaultSeverities.java @@ -273,6 +273,7 @@ private void initialize() severities.put(MessageId.OPF_089, Severity.ERROR); severities.put(MessageId.OPF_090, Severity.USAGE); severities.put(MessageId.OPF_091, Severity.ERROR); + severities.put(MessageId.OPF_092, Severity.ERROR); // PKG severities.put(MessageId.PKG_001, Severity.WARNING); diff --git a/src/main/java/com/adobe/epubcheck/messages/MessageId.java b/src/main/java/com/adobe/epubcheck/messages/MessageId.java index 0d5003eda..15723233c 100644 --- a/src/main/java/com/adobe/epubcheck/messages/MessageId.java +++ b/src/main/java/com/adobe/epubcheck/messages/MessageId.java @@ -267,6 +267,7 @@ public enum MessageId implements Comparable OPF_089("OPF-089"), OPF_090("OPF-090"), OPF_091("OPF-091"), + OPF_092("OPF-092"), // Messages relating to the entire package PKG_001("PKG-001"), diff --git a/src/main/java/com/adobe/epubcheck/opf/OPFHandler30.java b/src/main/java/com/adobe/epubcheck/opf/OPFHandler30.java index 7c59bf13a..8dc2f597f 100644 --- a/src/main/java/com/adobe/epubcheck/opf/OPFHandler30.java +++ b/src/main/java/com/adobe/epubcheck/opf/OPFHandler30.java @@ -50,7 +50,9 @@ import java.net.URI; import java.net.URISyntaxException; import java.util.Deque; +import java.util.IllformedLocaleException; import java.util.List; +import java.util.Locale; import java.util.Map; import java.util.Set; @@ -166,6 +168,12 @@ public void 
startElement() XMLElement e = parser.getCurrentElement(); String name = e.getName(); + + // Check global attributes + String xmllang = e.getAttributeNS(EpubConstants.XmlNamespaceUri, "lang"); + if (xmllang != null && !xmllang.isEmpty()) { + checkLanguageTag(xmllang); + } if (EpubConstants.OpfNamespaceUri.equals(e.getNamespace())) { @@ -469,6 +477,11 @@ private void processLink(XMLElement e) .refines(e.getAttribute("refines")).build(); linkedResourcesBuilders.peekFirst().add(resource); } + + String hreflang = e.getAttribute("hreflang"); + if (hreflang != null && !hreflang.isEmpty()) { + checkLanguageTag(hreflang); + } } private void processItemrefProperties(OPFItem.Builder builder, String property) @@ -566,11 +579,22 @@ private void processDCElem(XMLElement e) { // get the property Optional prop = DCMESVocab.VOCAB.lookup(e.getName()); + // Add to the metadata model builder if (prop.isPresent() && !metadataBuilders.isEmpty()) { metadataBuilders.peekFirst().meta(e.getAttribute("id"), prop.get(), (String) e.getPrivateData(), null); } + // Check that dc:language is well-formed + if ("language".equals(e.getName())) + { + String language = (String) e.getPrivateData(); + // Empty dc:language is checked by the schema + if (language != null && !language.trim().isEmpty()) + { + checkLanguageTag(language.trim()); + } + } } private void processItemsInIndexCollection(ResourceCollection collection) @@ -592,6 +616,19 @@ private void processItemsInIndexCollection(ResourceCollection collection) } } } + + private void checkLanguageTag(String language) + { + try + { + new Locale.Builder().setLanguageTag(language); + } catch (IllformedLocaleException exception) + { + report.message(MessageId.OPF_092, + EPUBLocation.create(path, parser.getLineNumber(), parser.getColumnNumber()), language, + exception.getMessage()); + } + } protected void reportMetadata() { diff --git a/src/main/resources/com/adobe/epubcheck/messages/MessageBundle.properties 
b/src/main/resources/com/adobe/epubcheck/messages/MessageBundle.properties index c43aa9f42..2bcfb0b94 100644 --- a/src/main/resources/com/adobe/epubcheck/messages/MessageBundle.properties +++ b/src/main/resources/com/adobe/epubcheck/messages/MessageBundle.properties @@ -282,6 +282,7 @@ OPF_088=Unrecognized epub:type value "%1$s". OPF_089=The "alternate" link rel keyword cannot be paired with other keywords. OPF_090=It is encouraged to use MIME media type "%1$s" instead of "%2$s". OPF_091=The item href URL must not have a fragment identifier. +OPF_092=Language tag "%1$s" is not well-formed: %2$s #Package PKG_001=Validating the EPUB against version %1$s but detected version %2$s. diff --git a/src/main/resources/com/adobe/epubcheck/schema/30/package-30.rnc b/src/main/resources/com/adobe/epubcheck/schema/30/package-30.rnc index e3fbdb27a..ec9d24969 100644 --- a/src/main/resources/com/adobe/epubcheck/schema/30/package-30.rnc +++ b/src/main/resources/com/adobe/epubcheck/schema/30/package-30.rnc @@ -36,7 +36,7 @@ opf.dc.identifier = element dc:identifier { opf.id.attr? & datatype.string.nonempty } opf.dc.title = element dc:title { opf.dc.attlist & datatype.string.nonempty } - opf.dc.language = element dc:language { opf.id.attr? & datatype.languagecode } + opf.dc.language = element dc:language { opf.id.attr? & datatype.string.nonempty } opf.dc.date = element dc:date { opf.id.attr? & datatype.string.nonempty } opf.dc.source = element dc:source { opf.dc.attlist & datatype.string.nonempty } opf.dc.type = element dc:type { opf.id.attr? & datatype.string.nonempty } @@ -129,5 +129,5 @@ opf.href.attr = attribute href { datatype.URI } opf.id.attr = attribute id { datatype.ID } opf.i18n.attrs = opf.xml.lang.attr? & opf.dir.attr? 
- opf.xml.lang.attr = attribute xml:lang { "" | datatype.languagecode } + opf.xml.lang.attr = attribute xml:lang { "" | datatype.string.nonempty } opf.dir.attr = attribute dir { 'ltr' | 'rtl' } diff --git a/src/test/resources/epub3/files/package-document/attr-lang-not-well-formed-error.opf b/src/test/resources/epub3/files/package-document/attr-lang-not-well-formed-error.opf new file mode 100644 index 000000000..df837859d --- /dev/null +++ b/src/test/resources/epub3/files/package-document/attr-lang-not-well-formed-error.opf @@ -0,0 +1,17 @@ + + + + Title + en + NOID + 2019-01-01T12:00:00Z + Jane Doe + + + + + + + + diff --git a/src/test/resources/epub3/files/package-document/attr-lang-whitespace-error.opf b/src/test/resources/epub3/files/package-document/attr-lang-whitespace-error.opf new file mode 100644 index 000000000..f740561ae --- /dev/null +++ b/src/test/resources/epub3/files/package-document/attr-lang-whitespace-error.opf @@ -0,0 +1,17 @@ + + + + Title + en + NOID + 2019-01-01T12:00:00Z + Jane Doe + + + + + + + + diff --git a/src/test/resources/epub3/files/package-document/link-hreflang-not-well-formed-error.opf b/src/test/resources/epub3/files/package-document/link-hreflang-not-well-formed-error.opf new file mode 100644 index 000000000..0946a70d6 --- /dev/null +++ b/src/test/resources/epub3/files/package-document/link-hreflang-not-well-formed-error.opf @@ -0,0 +1,17 @@ + + + + Title + en + NOID + 2019-01-01T12:00:00Z + + + + + + + + + diff --git a/src/test/resources/epub3/files/package-document/link-hreflang-whitespace-error.opf b/src/test/resources/epub3/files/package-document/link-hreflang-whitespace-error.opf new file mode 100644 index 000000000..90dcc818a --- /dev/null +++ b/src/test/resources/epub3/files/package-document/link-hreflang-whitespace-error.opf @@ -0,0 +1,17 @@ + + + + Title + en + NOID + 2019-01-01T12:00:00Z + + + + + + + + + diff --git a/src/test/resources/epub3/files/package-document/metadata-language-empty-error.opf 
b/src/test/resources/epub3/files/package-document/metadata-language-empty-error.opf new file mode 100644 index 000000000..27d70a1ba --- /dev/null +++ b/src/test/resources/epub3/files/package-document/metadata-language-empty-error.opf @@ -0,0 +1,16 @@ + + + + Title + + NOID + 2019-01-01T12:00:00Z + + + + + + + + \ No newline at end of file diff --git a/src/test/resources/epub3/files/package-document/metadata-language-not-well-formed-error.opf b/src/test/resources/epub3/files/package-document/metadata-language-not-well-formed-error.opf new file mode 100644 index 000000000..d379d38ee --- /dev/null +++ b/src/test/resources/epub3/files/package-document/metadata-language-not-well-formed-error.opf @@ -0,0 +1,16 @@ + + + + Title + a-value + NOID + 2019-01-01T12:00:00Z + + + + + + + + \ No newline at end of file diff --git a/src/test/resources/epub3/package-document.feature b/src/test/resources/epub3/package-document.feature index cefa142c5..ffdccd656 100644 --- a/src/test/resources/epub3/package-document.feature +++ b/src/test/resources/epub3/package-document.feature @@ -90,6 +90,16 @@ Feature: EPUB 3 ▸ Packages ▸ Package Document Checks Scenario: the 'xml:lang' attribute can be empty When checking file 'attr-lang-empty-valid.opf' Then no other errors or warnings are reported + + Scenario: the 'xml:lang' language tag must not have leading/trailing whitespace + When checking file 'attr-lang-whitespace-error.opf' + Then error OPF-092 is reported + And no other errors or warnings are reported + + Scenario: the 'xml:lang' language tag must be well-formed + When checking file 'attr-lang-not-well-formed-error.opf' + Then error OPF-092 is reported + And no other errors or warnings are reported ## 3.4.3 Metadata ### 3.4.3 The metadata element @@ -106,7 +116,18 @@ Feature: EPUB 3 ▸ Packages ▸ Package Document Checks When checking file 'metadata-identifier-uuid-invalid-warning.opf' Then warning OPF-085 is reported And no other errors or warnings are reported - + + Scenario: 
'dc:language' must not be empty + When checking file 'metadata-language-empty-error.opf' + Then error RSC-005 is reported + And the message contains "must be a string with length at least 1" + And no other errors or warnings are reported + + Scenario: 'dc:language' must be well-formed + When checking file 'metadata-language-not-well-formed-error.opf' + Then error OPF-092 is reported + And no other errors or warnings are reported + Scenario: 'dc:modified' must be defined When checking file 'metadata-modified-missing-error.opf' Then error RSC-005 is reported @@ -245,6 +266,16 @@ Feature: EPUB 3 ▸ Packages ▸ Package Document Checks Scenario: the 'link' 'hreflang' attribute can be empty When checking file 'link-hreflang-empty-valid.opf' Then no other errors or warnings are reported + + Scenario: the 'link' 'hreflang' language tag must not have leading/trailing whitespace + When checking file 'link-hreflang-whitespace-error.opf' + Then error OPF-092 is reported + And no other errors or warnings are reported + + Scenario: the 'link' 'hreflang' language tag must be well-formed + When checking file 'link-hreflang-not-well-formed-error.opf' + Then error OPF-092 is reported + And no other errors or warnings are reported ### 3.4.4 Manifest