Permalink
Browse files

Added basic support for autotagging microdata annotated item types.

With the appropriate vocabulary settings in the Vocabulary_p.html page, this
can produce Vocabulary search facets displaying item types referenced in
HTML documents by microdata annotation.
Tested notably with (but not limited to) vocabulary classes/types defined by
Schema.org and Dublin Core.
  • Loading branch information...
luccioman committed Feb 6, 2018
1 parent 5a14d34 commit 9412881230e6129474597e86f8a40801e30f2a21
@@ -279,6 +279,11 @@ surrogates.out = DATA/SURROGATES/out
# this directory also contains subdirectories for input sources, the did-you-mean function and other
dictionaries = DATA/DICTIONARIES
# Set of comma separated vocabulary names whose terms should only be matched
# from linked data types annotations in documents (with microdata, RDFa, microformats...)
# instead of cleartext words
vocabularies.matchLinkedData.names =
# a path to the classification directory
# each subdirectory is the name of a context (which becomes a navigator) with '.txt' files
# containing texts to teach a bayesian filter. One of the files must be named 'negative.txt'.
@@ -887,6 +892,7 @@ search.result.show.cache = true
search.result.show.proxy = false
search.result.show.hostbrowser = true
search.result.show.vocabulary = false
# Set of comma separated vocabulary names not to be used as search results facets
search.result.show.vocabulary.omit =
search.result.show.snapshots = false
# when true, display the raw ranking score value
@@ -162,6 +162,15 @@ <h2>Vocabulary Administration</h2>
<dt>Prefix</dt><dd>#[prefix]#</dd>
<dt>Objectspace</dt><dd>#(editable)##[objectspace]#::<input type="text" name="objectspace" value="#[objectspace]#" size="78" maxlength="128" /><br/>if set, uses the predicate <a href="#[objectspacepredicate]#" target="_blank">#[objectspacepredicate]#</a> for generated objects. Hint: use 'http://dbpedia.org/resource/' as default.#(/editable)#</dd>
<dt>Is Facet?</dt><dd><input type="checkbox" name="isFacet"#(isFacet)#:: checked="checked"#(/isFacet)#/> (If checked, this vocabulary is used for search facets. Not feasible for large vocabularies!)</dd>
<dt>Match terms from</dt>
<dd>
<label>
<input type="radio" name="vocabularies.matchLinkedData" value="false" #(vocabularies.matchLinkedData)#checked="checked"::#(/vocabularies.matchLinkedData)# />Cleartext
</label>
<label>
<input type="radio" name="vocabularies.matchLinkedData" value="true" #(vocabularies.matchLinkedData)#::checked="checked"#(/vocabularies.matchLinkedData)# />Linked data/Semantic web annotations
</label>
</dd>
</dl>
<table class="sortable" border="0">
<tr class="TableHeader" valign="bottom">
@@ -49,6 +49,7 @@
import net.yacy.kelondro.data.meta.URIMetadataNode;
import net.yacy.kelondro.util.FileUtils;
import net.yacy.search.Switchboard;
import net.yacy.search.SwitchboardConstants;
import net.yacy.search.index.Segment;
import net.yacy.server.serverObjects;
import net.yacy.server.serverSwitch;
@@ -238,13 +239,27 @@ public static serverObjects respond(@SuppressWarnings("unused") final RequestHea
}
}
// check the isFacet property
// check the isFacet and isMatchFromLinkedData properties
if (vocabulary != null && post.containsKey("set")) {
boolean isFacet = post.getBoolean("isFacet");
vocabulary.setFacet(isFacet);
Set<String> omit = env.getConfigSet("search.result.show.vocabulary.omit");
if (isFacet) omit.remove(vocabularyName); else omit.add(vocabularyName);
if (isFacet) {
omit.remove(vocabularyName);
} else {
omit.add(vocabularyName);
}
env.setConfig("search.result.show.vocabulary.omit", omit);
boolean isMatchFromLinkedData = post.getBoolean("vocabularies.matchLinkedData");
vocabulary.setMatchFromLinkedData(isMatchFromLinkedData);
final Set<String> matchLinkedDataVocs = env.getConfigSet(SwitchboardConstants.VOCABULARIES_MATCH_LINKED_DATA_NAMES);
if (isMatchFromLinkedData) {
matchLinkedDataVocs.add(vocabularyName);
} else {
matchLinkedDataVocs.remove(vocabularyName);
}
env.setConfig(SwitchboardConstants.VOCABULARIES_MATCH_LINKED_DATA_NAMES, matchLinkedDataVocs);
}
}
} catch (final IOException e) {
@@ -273,6 +288,7 @@ public static serverObjects respond(@SuppressWarnings("unused") final RequestHea
prop.putXML("edit_namexml", vocabulary.getName());
prop.putHTML("edit_namespace", vocabulary.getNamespace());
prop.put("edit_isFacet", vocabulary.isFacet() ? 1 : 0);
prop.put("edit_vocabularies.matchLinkedData", vocabulary.isMatchFromLinkedData());
prop.put("edit_size", vocabulary.size());
prop.putHTML("edit_predicate", vocabulary.getPredicate());
prop.putHTML("edit_prefix", Tagging.DEFAULT_PREFIX);
@@ -951,6 +951,11 @@ public String getProtocol() {
return this.protocol;
}
/**
 * Returns the fragment part of this URL, as held in the anchor field.
 *
 * @return this URL fragment, or null if it has no fragment
 * @see <a href="https://url.spec.whatwg.org/#concept-url-fragment">URL fragment concept at WHATWG</a>
 * @see <a href="https://tools.ietf.org/html/rfc3986#section-3.5">URL fragment section in RFC 3986</a>
 */
public String getRef() {
return this.anchor;
}
@@ -28,6 +28,9 @@
import java.util.Set;
import java.util.concurrent.ConcurrentHashMap;
import org.apache.commons.lang.StringUtils;
import net.yacy.cora.document.id.DigestURL;
import net.yacy.cora.geo.Locations;
import net.yacy.cora.lod.vocabulary.Tagging;
import net.yacy.cora.util.ConcurrentLog;
@@ -44,7 +47,8 @@
/** placeholder value stored for every key of {@link #allTags} */
private final static Object PRESENT = new Object();

/** parent directory used to resolve vocabulary files; null when the library is built from in-memory vocabularies */
private final File autotaggingPath;

/** mapping from vocabulary name to the tagging vocabulary */
private final Map<String, Tagging> vocabularies;

/** the tags of all loaded vocabularies, stored as keys (values are always PRESENT) */
private final Map<String, Object> allTags;
/**
@@ -81,6 +85,25 @@ public AutotaggingLibrary(final File autotaggingPath) {
}
}
}
/**
 * Builds an autotagging library directly from already loaded vocabularies,
 * without any backing directory (the autotagging path is left null). Can be
 * used for example for testing purposes.
 *
 * @param vocabularies
 *            mapping from vocabulary name to tagging vocabulary; when null, a
 *            new empty map is used instead
 */
protected AutotaggingLibrary(final Map<String, Tagging> vocabularies) {
    this.autotaggingPath = null;
    this.vocabularies = vocabularies != null ? vocabularies : new ConcurrentHashMap<String, Tagging>();
    this.allTags = new ConcurrentHashMap<String, Object>();
    /* register every tag of every provided vocabulary */
    for (final Tagging vocabulary : this.vocabularies.values()) {
        for (final String tag : vocabulary.tags()) {
            this.allTags.put(tag, PRESENT);
        }
    }
}
public File getVocabularyFile(String name) {
return new File(this.autotaggingPath, name + ".vocabulary");
@@ -152,19 +175,90 @@ public int getMaxWordsInTerm() {
return 4;
}
public Tagging.Metatag getTagFromTerm(Set<String> vocabularies, String term) {
/**
* Search a term in the given active vocabularies matching clear text words.
* @param vocabularies the vocabularies names to search for term
* @param term the word to search
* @return a instance of Metatag from the first matching vocabulary, or null when no one was found
*/
public Tagging.Metatag getTagFromTerm(final Set<String> vocabularies, String term) {
if (this.vocabularies.isEmpty()) return null;
Tagging.Metatag tag;
term = Tagging.normalizeTerm(term);
for (String vocabularyName: vocabularies) {
Tagging t = this.vocabularies.get(vocabularyName);
if (t != null) {
if (t != null && !t.isMatchFromLinkedData()) {
tag = t.getMetatagFromSynonym(term);
if (tag != null) return tag;
}
}
return null;
}
/**
 * Looks up, in the vocabularies configured to match linked data, the Metatag
 * entries whose objectspace + term correspond to the given term URL. At most
 * one Metatag instance is returned per vocabulary.
 *
 * @param termURL
 *            the vocabulary term identifier (an absolute URL) to search
 * @return the set of matching Metatag instances, possibly empty but never null
 */
public Set<Tagging.Metatag> getTagsFromTermURL(final DigestURL termURL) {
    final Set<Tagging.Metatag> matches = new HashSet<>();
    if (termURL == null || this.vocabularies.isEmpty()) {
        return matches;
    }

    final String normalizedURL = termURL.toNormalform(false);
    String namespace = null;
    String term = termURL.getRef();
    if (term != null) {
        /* The URL has a fragment : it is the term, and the namespace is everything up to and including '#' */
        final int hashPos = normalizedURL.indexOf("#");
        if (hashPos > 0) {
            namespace = normalizedURL.substring(0, hashPos + 1);
        }
    } else {
        /* No fragment : the term is the last path segment, and the namespace ends with the last '/' */
        term = termURL.getFileName();
        if (StringUtils.isNotEmpty(term)) {
            final int slashPos = normalizedURL.lastIndexOf("/");
            if (slashPos > 0) {
                namespace = normalizedURL.substring(0, slashPos + 1);
            }
        }
    }

    if (StringUtils.isEmpty(term) || namespace == null) {
        return matches;
    }

    /*
     * http://example.org/ and https://example.org/ are considered equivalent forms
     * for the namespace URL
     */
    String otherSchemeNamespace = null;
    if (termURL.isHTTP()) {
        otherSchemeNamespace = "https" + namespace.substring("http".length());
    } else if (termURL.isHTTPS()) {
        otherSchemeNamespace = "http" + namespace.substring("https".length());
    }

    for (final Tagging vocabulary : this.vocabularies.values()) {
        if (vocabulary == null || !vocabulary.isMatchFromLinkedData()) {
            continue;
        }
        final String objectspace = vocabulary.getObjectspace();
        final boolean namespaceMatches = namespace.equals(objectspace)
                || (otherSchemeNamespace != null && otherSchemeNamespace.equals(objectspace));
        if (namespaceMatches) {
            final Tagging.Metatag tag = vocabulary.getMetatagFromTerm(term);
            if (tag != null) {
                matches.add(tag);
            }
        }
    }
    return matches;
}
public Tagging.Metatag metatag(String vocName, String term) {
Tagging tagging = this.vocabularies.get(vocName);
@@ -47,6 +47,9 @@
public final static String DEFAULT_NAMESPACE= "http://yacy.net/autotagging#";
public final static String DEFAULT_PREFIX = "tags";
/** Default value for the property matchFromLinkedData */
public final static boolean DEFAULT_MATCH_FROM_LINKED_DATA = false;
private final String navigatorName;
private final Map<String, String> synonym2term;
@@ -55,7 +58,16 @@
/** vocabulary entries, keyed by term */
private final Map<String, TaggingEntry> term2entries;

/** the file returned by getFile(); null when the vocabulary is created from a name only */
private File propFile;

/** true if the vocabulary shall generate a navigation facet */
private boolean isFacet;

/**
 * True when this vocabulary terms should only be matched from linked data types
 * annotations (with microdata, RDFa, microformats...) instead of clear text
 * words
 */
private boolean matchFromLinkedData;

private String predicate, namespace, objectspace;
@@ -101,6 +113,7 @@ public Tagging(String name) {
this.objectspace = null;
this.propFile = null;
this.isFacet = true;
this.matchFromLinkedData = DEFAULT_MATCH_FROM_LINKED_DATA;
}
public Tagging(String name, File propFile) throws IOException {
@@ -285,6 +298,25 @@ public void setFacet(boolean isFacet) {
this.isFacet = isFacet;
}
/**
 * Tells whether the terms of this vocabulary are matched from linked data
 * annotations rather than from clear text words.
 *
 * @return true when this vocabulary's terms should be matched from linked data
 *         types annotations (with microdata, RDFa, microformats...) instead of
 *         clear text words
 */
public boolean isMatchFromLinkedData() {
return this.matchFromLinkedData;
}
/**
 * Sets whether the terms of this vocabulary are matched from linked data
 * annotations rather than from clear text words.
 *
 * @param facetFromLinkedData
 *            true when this vocabulary's terms should be matched from linked
 *            data types annotations (with microdata, RDFa, microformats...)
 *            instead of clear text words
 */
public void setMatchFromLinkedData(final boolean facetFromLinkedData) {
this.matchFromLinkedData = facetFromLinkedData;
}
/**
 * @return the number of entries in the term to entries mapping of this vocabulary
 */
public int size() {
return this.term2entries.size();
}
@@ -525,16 +557,41 @@ public File getFile() {
return this.propFile;
}
/**
 * Resolves a synonym to the term it stands for in this vocabulary.
 *
 * @param word
 *            a synonym to look for
 * @return a Metatag instance with the matching term, or null when the synonym
 *         is not in this vocabulary.
 */
public Metatag getMetatagFromSynonym(final String word) {
    final String matchingTerm = this.synonym2term.get(word);
    return matchingTerm == null ? null : new Metatag(matchingTerm);
}
/**
 * Looks for a term among the entries of this vocabulary.
 *
 * @param term
 *            a term to look for
 * @return a Metatag instance with the matching term, or null when it is not in
 *         this vocabulary.
 */
public Metatag getMetatagFromTerm(final String term) {
    /* only terms having an entry in this vocabulary produce a Metatag */
    return this.term2entries.get(term) != null ? new Metatag(term) : null;
}
public Metatag getMetatagFromTerm(final String word) {
/**
* @param word
* the object of the Metatag
* @return a new Metatag instance related to this vocabulary
*/
public Metatag buildMetatagFromTerm(final String word) {
return new Metatag(word);
}
/**
 * @return the key set of the synonym to term mapping of this vocabulary
 */
public Set<String> tags() {
return this.synonym2term.keySet();
}
Oops, something went wrong.

0 comments on commit 9412881

Please sign in to comment.