Added support for HTML OpenSearch results.

Many OpenSearch systems do not provide results as standard RSS/Atom feeds but only as HTML. This modification adds support for custom OpenSearch HTML results through mapping files (as already done for federated Solr search), which rely on CSS-like selectors to retrieve information from the HTML content. An example mapping file is provided to map results from the www.npmjs.com OpenSearch URL.
yacy · Feb 13, 2017 · bf16de2 · bf16de2 · reger24 · Feb 13, 2017
1 parent a79194a
commit bf16de2
Show file tree

Hide file tree

Showing 7 changed files with 475 additions and 74 deletions.
diff --git a/defaults/federatecfg/npmjs.html.map.properties b/defaults/federatecfg/npmjs.html.map.properties
@@ -0,0 +1,24 @@
+# www.npmjs.com HTML search results mapping
+# OpenSearch description : https://www.npmjs.com/opensearch.xml
+# OpenSearch template URL : https://www.npmjs.com/search?q={searchTerms}
+
+# This is an example mapping file for OpenSearch systems or search APIs providing results only as HTML
+# When possible, prefer an OpenSearch URL that provides results as a standard RSS or Atom feed: the feed mapping is generic, so no custom mapping file is needed
+# Selectors are using CSS or JQuery-like syntax, as described at https://jsoup.org/apidocs/org/jsoup/select/Selector.html
+# Standard Java properties file syntax is used here instead of usual YaCy Configuration syntax to easily allow '#' characters in values (example : _result=div#result li)
+# Character encoding is assumed to be ISO-8859-1 
+
+# Result node selector (required)
+# In this example, a list item such as : <li class="package-details css-ywvx7i" data-reactid="n">
+_result=.package-details
+
+# Result link selector relative to the selected result block (required)
+# In this example, a link such as <a href="https://www.npmjs.com/package/packageName" class="name css-1nx9rl1">packageName</a>
+_sku=.name
+
+# field mappings
+# YaCyFieldname = HTML text node selector, relative to the result block
+# In this example title is the text of the link so it has the same selector
+title=.name
+# In this example the description is in a paragraph tag such as <p class="description css-zqstoe">Package description</p>
+description_txt=.description
diff --git a/defaults/heuristicopensearch.conf b/defaults/heuristicopensearch.conf
@@ -12,11 +12,14 @@
 ## - all lines beginning with '#' and where the second character is not '#' are commented-out keyword lines
 ##
 
+## Additional mapping files for OpenSearch HTML results can be set in DATA/SETTINGS/federatecfg/[name].html.map.properties 
+
 #Faroo-News = http://www.faroo.com/api?q={searchTerms}&start={startIndex}&length=20&l=en&src=news&f=rss  # get results from Faroo news-search
 #WordPress.com = http://en.search.wordpress.com/?q={searchTerms}&f=feed&page={startPage?}  #Search WordPress.com Blogs
 #Sueddeutsche.de = http://suche.sueddeutsche.de/query/{searchTerms}?output=rss # Sueddeutsche Zeitung Artikel Archiv
 #Los Angeles Times = http://framework.latimes.com/?s={searchTerms}&feed=rss2
-#Archive-It = http://archive-it.org/seam/resource/opensearch?q={searchTerms}&n=20 # archiving cultural heritage on the web 
+#Archive-It = http://archive-it.org/seam/resource/opensearch?q={searchTerms}&n=20 # archiving cultural heritage on the web
+#npmjs = https://www.npmjs.com/search?q={searchTerms} # Search JavaScript packages from the npm repository 
 
 ## In addition to OpenSearch systems other connectors are available to query foreign systems
 ## the syntax is

diff --git a/source/net/yacy/cora/federate/AbstractFederateSearchConnector.java b/source/net/yacy/cora/federate/AbstractFederateSearchConnector.java
@@ -109,16 +109,20 @@ public void search(final SearchEvent theSearch) {
             @Override
             public void run() {
                 Thread.currentThread().setName("heuristic:" + instancename);
+                ConcurrentLog.info("YACY SEARCH (federated)", "Send search query to " +  instancename);
                 theSearch.oneFeederStarted();
                 List<URIMetadataNode> doclist = query(theSearch.getQuery());
                 if (doclist != null) {
+                    ConcurrentLog.info("YACY SEARCH (federated)", "Got " + doclist.size() + " documents from " +  instancename);
                     Map<String, LinkedHashSet<String>> snippets = new HashMap<String, LinkedHashSet<String>>(); // add nodes doesn't allow null
                     Map<String, ReversibleScoreMap<String>> facets = new HashMap<String, ReversibleScoreMap<String>>(); // add nodes doesn't allow null
                     theSearch.addNodes(doclist, facets, snippets, false, instancename, doclist.size());
 
                     for (URIMetadataNode doc : doclist) {
                         theSearch.addHeuristic(doc.hash(), instancename, false);
                     }
+                } else {
+                	ConcurrentLog.info("YACY SEARCH (federated)", "Got no results from " +  instancename);
                 }
                 // that's all we need to display serach result
                 theSearch.oneFeederTerminated();

diff --git a/source/net/yacy/cora/federate/FederateSearchManager.java b/source/net/yacy/cora/federate/FederateSearchManager.java
@@ -19,19 +19,22 @@
  */
 package net.yacy.cora.federate;
 
-import net.yacy.cora.federate.opensearch.OpenSearchConnector;
 import java.io.File;
 import java.io.IOException;
 import java.net.MalformedURLException;
 import java.net.URL;
 import java.util.ArrayList;
-
 import java.util.HashSet;
 import java.util.Iterator;
 import java.util.List;
 import java.util.Set;
+
+import org.apache.solr.common.SolrDocument;
+import org.apache.solr.common.SolrDocumentList;
+
 import net.yacy.cora.document.analysis.Classification;
 import net.yacy.cora.document.id.MultiProtocolURL;
+import net.yacy.cora.federate.opensearch.OpenSearchConnector;
 import net.yacy.cora.federate.solr.connector.SolrConnector;
 import net.yacy.cora.federate.yacy.CacheStrategy;
 import net.yacy.cora.storage.Configuration;
@@ -49,8 +52,6 @@
 import net.yacy.search.query.QueryParams;
 import net.yacy.search.query.SearchEvent;
 import net.yacy.search.schema.WebgraphSchema;
-import org.apache.solr.common.SolrDocument;
-import org.apache.solr.common.SolrDocumentList;
 
 /**
  * Handling of queries to configured remote OpenSearch systems.
@@ -107,8 +108,8 @@ public FederateSearchManager(Switchboard sb) {
                                 ConcurrentLog.config("FederateSearchManager", "Error in configuration of: " + url);
                             }
                         } else { // handle opensearch url template
-                            OpenSearchConnector osc = new OpenSearchConnector();
-                            if (osc.init(name, url)) {
+                            OpenSearchConnector osc = new OpenSearchConnector(url);
+                            if (osc.init(name, sb.getDataPath()+ "/DATA/SETTINGS/federatecfg/" + OpenSearchConnector.htmlMappingFileName(name))) {
                                 conlist.add(osc);
                             }
                         }
@@ -234,8 +235,13 @@ public boolean addOpenSearchTarget(String name, String urlTemplate, boolean acti
                 try {
                     conf.commit();
                     if (active) {
-                        OpenSearchConnector osd = new OpenSearchConnector();
-                        if (osd.init(name, urlTemplate)) {
+                        OpenSearchConnector osd = new OpenSearchConnector(urlTemplate);
+                        String htmlMappingFile = null;
+                        Switchboard sb = Switchboard.getSwitchboard();
+                        if(sb != null) {
+                        	htmlMappingFile = sb.getDataPath()+ "/DATA/SETTINGS/federatecfg/" + OpenSearchConnector.htmlMappingFileName(name);
+                        }
+                        if (osd.init(name, htmlMappingFile)) {
                             conlist.add(osd);
                         }
                     }
@@ -407,9 +413,8 @@ public boolean init(String cfgFileName) {
                                     ConcurrentLog.config("FederateSearchManager", "Init error in configuration of: " + url);
                                 }
                             } else { // handle opensearch url template
-                                OpenSearchConnector osd;
-                                osd = new OpenSearchConnector();
-                                if (osd.init(name, url)) {
+                                OpenSearchConnector osd = new OpenSearchConnector(url);
+                                if (osd.init(name, confFile.getParent()+"/federatecfg/" + OpenSearchConnector.htmlMappingFileName(name))) {
                                     conlist.add(osd);
                                 }
                             }