Permalink
Browse files

Added HTML microdata typed items parsing capability.

This lets the HTML parser gather the item type URLs declared on HTML tags
with the itemscope and itemtype attributes (see the microdata
specification, https://www.w3.org/TR/microdata/ ), notably types from the
schema.org vocabulary, but also types/classes from any other vocabulary,
such as the common ones listed in the RDFa core context
( https://www.w3.org/2011/rdfa-context/rdfa-1.1.html ).
  • Loading branch information...
luccioman committed Feb 2, 2018
1 parent 80fb102 commit 58b98347296d6417d59c09efdc74b0695cb35ec8
@@ -191,6 +191,13 @@ public String toString() {
private final SizeLimitedMap<AnchorURL, EmbedEntry> embeds; // urlhash/embed relation
private final List<ImageEntry> images;
private final SizeLimitedSet<AnchorURL> script, frames, iframes;
/**
* URLs of linked data item types referenced from HTML content with standard
* annotations such as RDFa, microdata, microformats or JSON-LD
*/
private final SizeLimitedSet<DigestURL> linkedDataTypes;
private final SizeLimitedMap<String, String> metas;
private final SizeLimitedMap<String, DigestURL> hreflang, navigation;
private LinkedHashSet<String> titles;
@@ -260,6 +267,7 @@ public ContentScraper(final DigestURL root, final int maxAnchors, final int maxL
this.embeds = new SizeLimitedMap<AnchorURL, EmbedEntry>(maxLinks);
this.frames = new SizeLimitedSet<AnchorURL>(maxLinks);
this.iframes = new SizeLimitedSet<AnchorURL>(maxLinks);
this.linkedDataTypes = new SizeLimitedSet<>(maxLinks);
this.metas = new SizeLimitedMap<String, String>(maxLinks);
this.hreflang = new SizeLimitedMap<String, DigestURL>(maxLinks);
this.navigation = new SizeLimitedMap<String, DigestURL>(maxLinks);
@@ -543,12 +551,49 @@ private AnchorURL absolutePath(final String relativePath) {
}
}
private void checkOpts(Tag tag) {
/**
 * Extract the microdata item types declared on a tag: when the itemscope
 * attribute is present, each space-separated token of the itemtype attribute
 * that is a well-formed URL is collected.
 *
 * @param tagAttributes parsed HTML tag attributes, may be null.
 * @return a new set of URLs, empty when the attributes are null, when the
 *         itemscope attribute is absent, or when no itemtype token is a valid
 *         URL
 * @see <a href="https://www.w3.org/TR/microdata/#dfn-itemtype">itemtype
 *      definition at W3C</a>
 * @see <a href=
 *      "https://html.spec.whatwg.org/multipage/microdata.html#attr-itemtype">itemtype
 *      definition at WHATWG</a>
 */
private Set<DigestURL> parseMicrodataItemType(final Properties tagAttributes) {
    final Set<DigestURL> itemTypeUrls = new HashSet<>();

    /*
     * The itemtype attribute is only meaningful on elements that also carry the
     * itemscope attribute. Only the presence of itemscope is checked here: a
     * strictly conforming parser would also verify that this boolean attribute
     * has no value, an empty value or the value "itemscope".
     */
    if (tagAttributes == null || tagAttributes.getProperty("itemscope") == null) {
        return itemTypeUrls;
    }

    for (final String token : parseSpaceSeparatedTokens(tagAttributes.getProperty("itemtype"))) {
        try {
            itemTypeUrls.add(new DigestURL(token));
        } catch (final MalformedURLException ignored) {
            /* Each itemtype token must be a valid absolute URL: other tokens are skipped */
        }
    }
    return itemTypeUrls;
}
private void checkOpts(final Tag tag) {
// vocabulary classes
final String classprop = tag.opts.getProperty("class", EMPTY_STRING);
this.vocabularyScraper.check(this.root, classprop, tag.content);
// itemprop (schema.org)
// itemprop microdata property (standard definition at https://www.w3.org/TR/microdata/#dfn-attr-itemprop)
String itemprop = tag.opts.getProperty("itemprop");
if (itemprop != null) {
String propval = tag.opts.getProperty("content"); // value for <meta itemprop="" content=""> see https://html.spec.whatwg.org/multipage/microdata.html#values
@@ -620,7 +665,7 @@ private void checkOpts(Tag tag) {
* attribute string, may be null
* @return a set of tokens eventually empty
*/
public static Set<String> parseSpaceSeparatedTokens(String attr) {
public static Set<String> parseSpaceSeparatedTokens(final String attr) {
Set<String> tokens = new HashSet<>();
/* Check attr string is not empty to avoid adding a single empty string
* in result */
@@ -923,6 +968,22 @@ public void scrapeTag1(final Tag tag) {
this.fireScrapeTag1(tag.name, tag.opts, tag.content.getChars());
}
/**
 * Scraping operation applied each time a tag is opened, whether the tag is a
 * singleton or a paired one, and whether or not it is listed in
 * {@link ContentScraper#linkTags0} or {@link ContentScraper#linkTags1}.
 *
 * @param tagName       the tag name (not used by this implementation)
 * @param tagAttributes the parsed tag attributes; nothing is done when null
 */
@Override
public void scrapeAnyTagOpening(final String tagName, final Properties tagAttributes) {
    if (tagAttributes == null) {
        return;
    }
    /*
     * HTML microdata annotations may appear on any kind of tag, which is why this
     * processing is not limited to the tags in linkTags0 and linkTags1
     */
    final Set<DigestURL> itemTypes = parseMicrodataItemType(tagAttributes);
    this.linkedDataTypes.addAll(itemTypes);
}
/**
* Add an anchor to the anchors list, and trigger any eventual listener
* @param anchor anchor to add. Must not be null.
@@ -1092,6 +1153,14 @@ public String getText() {
// returns a url (String) / name (String) relation
return this.iframes;
}
/**
 * Gives access to the linked data item types gathered while scraping.
 * <p>
 * NOTE(review): in the code visible here only microdata itemtype values are
 * added to this set; confirm whether other annotation kinds are collected
 * elsewhere.
 * </p>
 *
 * @return URLs of linked data item types referenced from HTML content with standard
 *         annotations such as RDFa, microdata, microformats or JSON-LD. The
 *         returned set is the scraper's own instance, not a copy.
 */
public SizeLimitedSet<DigestURL> getLinkedDataTypes() {
return this.linkedDataTypes;
}
public Set<AnchorURL> getScript() {
return this.script;
@@ -1164,7 +1233,7 @@ public boolean isLimitsExceeded() {
return this.contentSizeLimitExceeded || this.maxAnchorsExceeded || this.css.isLimitExceeded()
|| this.rss.isLimitExceeded() || this.embeds.isLimitExceeded() || this.metas.isLimitExceeded()
|| this.hreflang.isLimitExceeded() || this.navigation.isLimitExceeded() || this.script.isLimitExceeded()
|| this.frames.isLimitExceeded() || this.iframes.isLimitExceeded();
|| this.frames.isLimitExceeded() || this.iframes.isLimitExceeded() || this.linkedDataTypes.isLimitExceeded();
}
/*
@@ -1384,6 +1453,7 @@ public void close() {
this.script.clear();
this.frames.clear();
this.iframes.clear();
this.linkedDataTypes.clear();
this.embeds.clear();
this.images.clear();
this.icons.clear();
@@ -24,17 +24,52 @@
package net.yacy.document.parser.html;
import java.util.Properties;
public interface Scraper {
/**
* @param tag
* a tag name
* @return true when the tag name belongs to the first category of tags
* according to the Scraper implementation, and is therefore candidate
* for processing by
* {@link #scrapeTag0(net.yacy.document.parser.html.ContentScraper.Tag)}
* implementation
*/
public boolean isTag0(String tag);
/**
 * @param tag
 *            a tag name
 * @return true when the tag name belongs to the second category of tags
 *         according to the Scraper implementation, and is therefore candidate
 *         for processing by
 *         {@link #scrapeTag1(net.yacy.document.parser.html.ContentScraper.Tag)}
 *         implementation
 */
public boolean isTag1(String tag);
public void scrapeText(char[] text, String insideTag);
/**
* Process a tag belonging to the first category of tags according to the Scraper implementation
* @param tag a parsed tag
*/
public void scrapeTag0(ContentScraper.Tag tag);
/**
* Process a tag belonging to the second category of tags according to the Scraper implementation
* @param tag a parsed tag
*/
public void scrapeTag1(ContentScraper.Tag tag);
/**
 * Processing applied to any kind of tag opening.
 * @param tagName the tag name
 * @param tagAttributes the attributes of the tag
 */
public void scrapeAnyTagOpening(String tagName, Properties tagAttributes);
public void scrapeComment(final char[] comment);
@@ -292,6 +292,10 @@ public TransformerWriter(
final CharBuffer charBuffer = new CharBuffer(ContentScraper.MAX_DOCSIZE, content);
ContentScraper.Tag tag = new ContentScraper.Tag(tagname, charBuffer.propParser());
charBuffer.close();
/*
 * Apply processing relevant for any kind of tag opening. The scraper is
 * optional, so it must be checked for null before being used, as is done for
 * the other scraper calls.
 */
if (this.scraper != null) {
    this.scraper.scrapeAnyTagOpening(tag.name, tag.opts);
}
if (this.scraper != null && this.scraper.isTag0(tagname)) {
// this single tag is collected at once here
this.scraper.scrapeTag0(tag);
@@ -29,8 +29,11 @@
import java.util.Calendar;
import java.util.Collection;
import java.util.Date;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import java.util.Set;
import org.junit.Assert;
@@ -338,5 +341,80 @@ public void testRemoveUnpairedBrackets() {
Assert.assertEquals("{abc}{def}", ContentScraper.removeUnpairedBrackets("{abc}{def}", '{', '}'));
Assert.assertEquals("{{abc}{def}}", ContentScraper.removeUnpairedBrackets("{{abc}{def}}", '{', '}'));
}
/**
 * Check that microdata item types are collected from the itemtype attribute of
 * tags carrying the itemscope attribute, using untyped, typed, multi-typed and
 * nested item samples.
 *
 * @throws IOException when an unexpected error occurs while processing a sample
 */
@Test
public void testParseMicroDataItemType() throws IOException {
    final String header = "<!DOCTYPE html><head><title>Test document</title></head>";
    final DigestURL documentUrl = new DigestURL("http://example.org/microdata.html");

    /* Basic microdata syntax example with no item type */
    final String untypedItem = header
            + "<div itemscope><p>My name is <span itemprop=\"name\">Elizabeth</span>.</p></div>";

    /* Nested items with no item type */
    final String nestedUntypedItems = "<div itemscope>\n" +
            " <p>Name: <span itemprop=\"name\">Amanda</span></p>\n" +
            " <p>Band: <span itemprop=\"band\" itemscope> <span itemprop=\"name\">Jazz Band</span> (<span itemprop=\"size\">12</span> players)</span></p>\n" +
            "</div>";

    /* One typed item */
    final String typedItem = header
            + "<div itemscope itemtype=\"https://schema.org/LocalBusiness\"><img itemprop=\"logo\" src=\"our-logo.png\" alt=\"Our Company\"></div>";

    /* more than one type per item */
    final String multiTypedItem = header
            + "<dl itemscope itemtype=\"https://md.example.com/loco https://md.example.com/lighting\">" +
            " <dt>Name:\n" +
            " <dd itemprop=\"name\">Tank Locomotive (DB 80)\n" +
            " <dt>Product code:\n" +
            " <dd itemprop=\"product-code\">33041\n" +
            " <dt>Scale:\n" +
            " <dd itemprop=\"scale\">HO\n" +
            " <dt>Digital:\n" +
            " <dd itemprop=\"digital\">Delta\n" +
            "</dl>";

    /* Nested typed items */
    final String nestedTypedItems = header + "<div itemscope itemtype=\"http://schema.org/Product\">\n" +
            " <span itemprop=\"name\">Panasonic White 60L Refrigerator</span>\n" +
            " <img src=\"panasonic-fridge-60l-white.jpg\" alt=\"\">\n" +
            " <div itemprop=\"aggregateRating\"\n" +
            " itemscope itemtype=\"http://schema.org/AggregateRating\">\n" +
            " <meter itemprop=\"ratingValue\" min=0 value=3.5 max=5>Rated 3.5/5</meter>\n" +
            " (based on <span itemprop=\"reviewCount\">11</span> customer reviews)\n" +
            " </div>\n" +
            "</div>";

    /* Each HTML sample is associated to the item type URLs expected to be extracted from it */
    final Map<String, String[]> samples = new HashMap<>();
    samples.put(untypedItem, new String[0]);
    samples.put(nestedUntypedItems, new String[0]);
    samples.put(typedItem, new String[] { "https://schema.org/LocalBusiness" });
    samples.put(multiTypedItem, new String[] { "https://md.example.com/loco", "https://md.example.com/lighting" });
    samples.put(nestedTypedItems, new String[] { "http://schema.org/Product", "http://schema.org/AggregateRating" });

    for (final Entry<String, String[]> sample : samples.entrySet()) {
        final Set<DigestURL> expectedTypes = new HashSet<>();
        for (final String typeUrl : sample.getValue()) {
            expectedTypes.add(new DigestURL(typeUrl));
        }

        final ContentScraper scraper = new ContentScraper(documentUrl, 10, new HashSet<String>(),
                new VocabularyScraper(), 0);
        try (final Writer writer = new TransformerWriter(null, null, scraper, null, false)) {
            FileUtils.copy(new StringReader(sample.getKey()), writer);

            /* Same size and inclusion in the expected set means both sets hold the same URLs */
            final Set<DigestURL> actualTypes = scraper.getLinkedDataTypes();
            Assert.assertEquals(expectedTypes.size(), actualTypes.size());
            Assert.assertTrue(expectedTypes.containsAll(actualTypes));
        } finally {
            scraper.close();
        }
    }
}
}

0 comments on commit 58b9834

Please sign in to comment.