Skip to content

Commit

Permalink
feat: new check (OPF-092) for language tags well-formedness
Browse files Browse the repository at this point in the history
In the Package Document, the language tags appearing in the elements or attributes below
MUST be well-formed according to BCP 47:
 - `xml:lang` attribute
 - `hreflang` attribute
 - `dc:language` element

For these values:
- the schema now only does a basic datatype check (string, non-empty value when relevant)
- the well-formedness is checked with Java’s Locale.Builder#setLanguageTag() API
- a new check (OPF-092) is reported when an ill-formed value is found

See https://docs.oracle.com/javase/8/docs/api/java/util/Locale.Builder.html#setLanguageTag-java.lang.String-

Fix #1221
Close #702
  • Loading branch information
rdeltour committed Nov 14, 2021
1 parent e39a801 commit 52ebd80
Show file tree
Hide file tree
Showing 12 changed files with 174 additions and 3 deletions.
Expand Up @@ -273,6 +273,7 @@ private void initialize()
severities.put(MessageId.OPF_089, Severity.ERROR);
severities.put(MessageId.OPF_090, Severity.USAGE);
severities.put(MessageId.OPF_091, Severity.ERROR);
severities.put(MessageId.OPF_092, Severity.ERROR);

// PKG
severities.put(MessageId.PKG_001, Severity.WARNING);
Expand Down
1 change: 1 addition & 0 deletions src/main/java/com/adobe/epubcheck/messages/MessageId.java
Expand Up @@ -267,6 +267,7 @@ public enum MessageId implements Comparable<MessageId>
OPF_089("OPF-089"),
OPF_090("OPF-090"),
OPF_091("OPF-091"),
OPF_092("OPF-092"),

// Messages relating to the entire package
PKG_001("PKG-001"),
Expand Down
37 changes: 37 additions & 0 deletions src/main/java/com/adobe/epubcheck/opf/OPFHandler30.java
Expand Up @@ -50,7 +50,9 @@
import java.net.URI;
import java.net.URISyntaxException;
import java.util.Deque;
import java.util.IllformedLocaleException;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.Set;

Expand Down Expand Up @@ -166,6 +168,12 @@ public void startElement()

XMLElement e = parser.getCurrentElement();
String name = e.getName();

// Check global attributes
String xmllang = e.getAttributeNS(EpubConstants.XmlNamespaceUri, "lang");
if (xmllang != null && !xmllang.isEmpty()) {
checkLanguageTag(xmllang);
}

if (EpubConstants.OpfNamespaceUri.equals(e.getNamespace()))
{
Expand Down Expand Up @@ -469,6 +477,11 @@ private void processLink(XMLElement e)
.refines(e.getAttribute("refines")).build();
linkedResourcesBuilders.peekFirst().add(resource);
}

String hreflang = e.getAttribute("hreflang");
if (hreflang != null && !hreflang.isEmpty()) {
checkLanguageTag(hreflang);
}
}

private void processItemrefProperties(OPFItem.Builder builder, String property)
Expand Down Expand Up @@ -566,11 +579,22 @@ private void processDCElem(XMLElement e)
{
// Look up the element name in the DCMES vocabulary; the result is empty
// when the name is not a known property
Optional<Property> prop = DCMESVocab.VOCAB.lookup(e.getName());
// Add to the metadata model builder, but only when the lookup found a
// property and there is a metadata builder to add it to
if (prop.isPresent() && !metadataBuilders.isEmpty())
{
// The element's private data is cast to String and used as the value
// NOTE(review): presumably this is the element's text content — confirm
// against the code that sets the private data
metadataBuilders.peekFirst().meta(e.getAttribute("id"), prop.get(),
(String) e.getPrivateData(), null);
}
// Check that dc:language is well-formed
if ("language".equals(e.getName()))
{
String language = (String) e.getPrivateData();
// Empty dc:language is checked by the schema
// so null and whitespace-only values are skipped here
if (language != null && !language.trim().isEmpty())
{
// The value is trimmed first, so leading/trailing whitespace around the
// tag is not itself reported as ill-formed for dc:language
checkLanguageTag(language.trim());
}
}
}

private void processItemsInIndexCollection(ResourceCollection collection)
Expand All @@ -592,6 +616,19 @@ private void processItemsInIndexCollection(ResourceCollection collection)
}
}
}

/**
 * Verifies that a language tag is well-formed, as determined by
 * {@link Locale.Builder#setLanguageTag(String)}, and reports
 * {@code OPF-092} at the parser's current line and column when it is not.
 *
 * @param language
 *          the language tag to check; it is passed to the builder as-is,
 *          without any trimming
 */
private void checkLanguageTag(String language)
{
  final Locale.Builder tagParser = new Locale.Builder();
  try
  {
    // Only the well-formedness verdict matters; the builder is discarded
    tagParser.setLanguageTag(language);
  }
  catch (IllformedLocaleException illFormedTag)
  {
    // The exception message becomes the second argument of the report
    final String reason = illFormedTag.getMessage();
    report.message(MessageId.OPF_092,
        EPUBLocation.create(path, parser.getLineNumber(), parser.getColumnNumber()),
        language, reason);
  }
}

protected void reportMetadata()
{
Expand Down
Expand Up @@ -282,6 +282,7 @@ OPF_088=Unrecognized epub:type value "%1$s".
OPF_089=The "alternate" link rel keyword cannot be paired with other keywords.
OPF_090=It is encouraged to use MIME media type "%1$s" instead of "%2$s".
OPF_091=The item href URL must not have a fragment identifier.
OPF_092=Language tag "%1$s" is not well-formed: %2$s

#Package
PKG_001=Validating the EPUB against version %1$s but detected version %2$s.
Expand Down
Expand Up @@ -36,7 +36,7 @@

opf.dc.identifier = element dc:identifier { opf.id.attr? & datatype.string.nonempty }
opf.dc.title = element dc:title { opf.dc.attlist & datatype.string.nonempty }
opf.dc.language = element dc:language { opf.id.attr? & datatype.languagecode }
opf.dc.language = element dc:language { opf.id.attr? & datatype.string.nonempty }
opf.dc.date = element dc:date { opf.id.attr? & datatype.string.nonempty }
opf.dc.source = element dc:source { opf.dc.attlist & datatype.string.nonempty }
opf.dc.type = element dc:type { opf.id.attr? & datatype.string.nonempty }
Expand Down Expand Up @@ -129,5 +129,5 @@
opf.href.attr = attribute href { datatype.URI }
opf.id.attr = attribute id { datatype.ID }
opf.i18n.attrs = opf.xml.lang.attr? & opf.dir.attr?
opf.xml.lang.attr = attribute xml:lang { "" | datatype.languagecode }
opf.xml.lang.attr = attribute xml:lang { "" | datatype.string.nonempty }
opf.dir.attr = attribute dir { 'ltr' | 'rtl' }
@@ -0,0 +1,17 @@
<?xml version="1.0" encoding="UTF-8"?>
<package xmlns="http://www.idpf.org/2007/opf" version="3.0" unique-identifier="uid"
xmlns:dc="http://purl.org/dc/elements/1.1/">
<metadata>
<dc:title>Title</dc:title>
<dc:language>en</dc:language>
<dc:identifier id="uid">NOID</dc:identifier>
<meta property="dcterms:modified">2019-01-01T12:00:00Z</meta>
<dc:creator xml:lang="a-value">Jane Doe</dc:creator>
</metadata>
<manifest>
<item id="t001" href="contents.xhtml" properties="nav" media-type="application/xhtml+xml"/>
</manifest>
<spine>
<itemref idref="t001"/>
</spine>
</package>
@@ -0,0 +1,17 @@
<?xml version="1.0" encoding="UTF-8"?>
<package xmlns="http://www.idpf.org/2007/opf" version="3.0" unique-identifier="uid"
xmlns:dc="http://purl.org/dc/elements/1.1/">
<metadata>
<dc:title>Title</dc:title>
<dc:language>en</dc:language>
<dc:identifier id="uid">NOID</dc:identifier>
<meta property="dcterms:modified">2019-01-01T12:00:00Z</meta>
<dc:creator xml:lang=" en ">Jane Doe</dc:creator>
</metadata>
<manifest>
<item id="t001" href="contents.xhtml" properties="nav" media-type="application/xhtml+xml"/>
</manifest>
<spine>
<itemref idref="t001"/>
</spine>
</package>
@@ -0,0 +1,17 @@
<?xml version="1.0" encoding="UTF-8"?>
<package xmlns="http://www.idpf.org/2007/opf" version="3.0" unique-identifier="uid"
xmlns:dc="http://purl.org/dc/elements/1.1/">
<metadata>
<dc:title>Title</dc:title>
<dc:language>en</dc:language>
<dc:identifier id="uid">NOID</dc:identifier>
<meta property="dcterms:modified">2019-01-01T12:00:00Z</meta>
<link hreflang="a-value" rel="alternate" href="https://example.org/package.json" media-type="application/json-ld"/>
</metadata>
<manifest>
<item id="t001" href="contents.xhtml" properties="nav" media-type="application/xhtml+xml"/>
</manifest>
<spine>
<itemref idref="t001"/>
</spine>
</package>
@@ -0,0 +1,17 @@
<?xml version="1.0" encoding="UTF-8"?>
<package xmlns="http://www.idpf.org/2007/opf" version="3.0" unique-identifier="uid"
xmlns:dc="http://purl.org/dc/elements/1.1/">
<metadata>
<dc:title>Title</dc:title>
<dc:language>en</dc:language>
<dc:identifier id="uid">NOID</dc:identifier>
<meta property="dcterms:modified">2019-01-01T12:00:00Z</meta>
<link hreflang=" en " rel="alternate" href="https://example.org/package.json" media-type="application/json-ld"/>
</metadata>
<manifest>
<item id="t001" href="contents.xhtml" properties="nav" media-type="application/xhtml+xml"/>
</manifest>
<spine>
<itemref idref="t001"/>
</spine>
</package>
@@ -0,0 +1,16 @@
<?xml version="1.0" encoding="UTF-8"?>
<package xmlns="http://www.idpf.org/2007/opf" version="3.0" unique-identifier="uid"
xmlns:dc="http://purl.org/dc/elements/1.1/">
<metadata>
<dc:title>Title</dc:title>
<dc:language> </dc:language>
<dc:identifier id="uid">NOID</dc:identifier>
<meta property="dcterms:modified">2019-01-01T12:00:00Z</meta>
</metadata>
<manifest>
<item id="t001" href="contents.xhtml" properties="nav" media-type="application/xhtml+xml"/>
</manifest>
<spine>
<itemref idref="t001"/>
</spine>
</package>
@@ -0,0 +1,16 @@
<?xml version="1.0" encoding="UTF-8"?>
<package xmlns="http://www.idpf.org/2007/opf" version="3.0" unique-identifier="uid"
xmlns:dc="http://purl.org/dc/elements/1.1/">
<metadata>
<dc:title>Title</dc:title>
<dc:language>a-value</dc:language>
<dc:identifier id="uid">NOID</dc:identifier>
<meta property="dcterms:modified">2019-01-01T12:00:00Z</meta>
</metadata>
<manifest>
<item id="t001" href="contents.xhtml" properties="nav" media-type="application/xhtml+xml"/>
</manifest>
<spine>
<itemref idref="t001"/>
</spine>
</package>
33 changes: 32 additions & 1 deletion src/test/resources/epub3/package-document.feature
Expand Up @@ -90,6 +90,16 @@ Feature: EPUB 3 ▸ Packages ▸ Package Document Checks
Scenario: the 'xml:lang' attribute can be empty
When checking file 'attr-lang-empty-valid.opf'
Then no other errors or warnings are reported

Scenario: the 'xml:lang' language tag must not have leading/trailing whitespace
When checking file 'attr-lang-whitespace-error.opf'
Then error OPF-092 is reported
And no other errors or warnings are reported

Scenario: the 'xml:lang' language tag must be well-formed
When checking file 'attr-lang-not-well-formed-error.opf'
Then error OPF-092 is reported
And no other errors or warnings are reported

## 3.4.3 Metadata
### 3.4.3 The metadata element
Expand All @@ -106,7 +116,18 @@ Feature: EPUB 3 ▸ Packages ▸ Package Document Checks
When checking file 'metadata-identifier-uuid-invalid-warning.opf'
Then warning OPF-085 is reported
And no other errors or warnings are reported


Scenario: 'dc:language' must not be empty
When checking file 'metadata-language-empty-error.opf'
Then error RSC-005 is reported
And the message contains "must be a string with length at least 1"
And no other errors or warnings are reported

Scenario: 'dc:language' must be well-formed
When checking file 'metadata-language-not-well-formed-error.opf'
Then error OPF-092 is reported
And no other errors or warnings are reported

Scenario: 'dc:modified' must be defined
When checking file 'metadata-modified-missing-error.opf'
Then error RSC-005 is reported
Expand Down Expand Up @@ -245,6 +266,16 @@ Feature: EPUB 3 ▸ Packages ▸ Package Document Checks
Scenario: the 'link' 'hreflang' attribute can be empty
When checking file 'link-hreflang-empty-valid.opf'
Then no other errors or warnings are reported

Scenario: the 'link' 'hreflang' language tag must not have leading/trailing whitespace
When checking file 'link-hreflang-whitespace-error.opf'
Then error OPF-092 is reported
And no other errors or warnings are reported

Scenario: the 'link' 'hreflang' language tag must be well-formed
When checking file 'link-hreflang-not-well-formed-error.opf'
Then error OPF-092 is reported
And no other errors or warnings are reported

### 3.4.4 Manifest

Expand Down

0 comments on commit 52ebd80

Please sign in to comment.