Permalink
Browse files

Added a new crawler document filter type using Solr syntax

This makes it possible to set up much more advanced document crawl filters,
by filtering on one or more document indexed fields before inserting in
the index.
  • Loading branch information...
luccioman committed Jun 19, 2018
1 parent 2c155ec commit cced94298ab946125bc29e58583431ac4dd6a426
@@ -32,6 +32,8 @@
<indexContentMustNotMatch>#[indexContentMustNotMatch]#</indexContentMustNotMatch>
<indexMediaTypeMustMatch>#[indexMediaTypeMustMatch]#</indexMediaTypeMustMatch>
<indexMediaTypeMustNotMatch>#[indexMediaTypeMustNotMatch]#</indexMediaTypeMustNotMatch>
<indexSolrQueryMustMatch>#[indexSolrQueryMustMatch]#</indexSolrQueryMustMatch>
<indexSolrQueryMustNotMatch>#[indexSolrQueryMustNotMatch]#</indexSolrQueryMustNotMatch>
<status>#(status)#terminated::active::system#(/status)#</status>
<crawlingDomFilterContent>
#{crawlingDomFilterContent}#
@@ -171,6 +171,8 @@
disableIf('indexmustnotmatch', defaultMatchNone);
disableIf('indexcontentmustnotmatch', defaultMatchNone);
disableIf('indexMediaTypeMustNotMatch', defaultMatchNone);
disableIf('indexSolrQueryMustMatch', "#[solrQueryMatchAllStr]#");
disableIf('indexSolrQueryMustNotMatch', "#[solrEmptyQueryStr]#");
// remove if MATCH_ALL_STRING
disableIf('mustmatch', defaultMatchAll);
@@ -369,7 +371,7 @@ <h2>Expert Crawl Start</h2>
<dt>Filter on Document Media Type (aka MIME type)</dt>
<dd>
<div class="info" style="float:right">
<img src="env/grafics/i16.gif" width="16" height="16" alt="Clean up search events cache info"/>
<img src="env/grafics/i16.gif" width="16" height="16" alt="Media Type filter info"/>
<span style="right:0px;" id="mediaTypeMustMatchInfo">
The filter is a <b><a href="https://docs.oracle.com/javase/8/docs/api/java/util/regex/Pattern.html" target="_blank">regular expression</a></b>
that <b>must match</b> with the document Media Type (also known as MIME Type) to allow the URL to be indexed.
@@ -388,6 +390,39 @@ <h2>Expert Crawl Start</h2>
</tr>
</table>
</dd>
<dt>Solr query filter on any active <a href="IndexSchema_p.html" target="_blank">indexed</a> field(s)</dt>
<dd>
<div class="info" style="float:right">
<img src="env/grafics/i16.gif" width="16" height="16" alt="Solr query filter info"/>
<span style="right:0px;" id="indexSolrQueryInfo">
Each parsed document is checked against the given Solr query before being added to the index.
The query must be written in respect to the <a href="https://lucene.apache.org/solr/guide/6_6/the-standard-query-parser.html#the-standard-query-parser" target="_blank">standard</a> Solr query syntax.
</span>
</div>
<table style="border-width: 0px" role="presentation">
#(embeddedSolrConnected)#
<tr>
<td>
<div class="info"><p>The embedded local Solr index must be connected to use this kind of filter.</p>
<p>You can configure this with the <a href="IndexFederated_p.html">Index Sources &amp; targets</a> page.</p></div>
</td>
</tr>
::
<tr>
<td style="width:110px"><img src="env/grafics/plus.gif" alt=""> must-match</td>
<td>
<input name="indexSolrQueryMustMatch" id="indexSolrQueryMustMatch" type="text" size="55" maxlength="100000" value="#[indexSolrQueryMustMatch]#" aria-describedby="indexSolrQueryInfo" />
</td>
</tr>
<tr>
<td style="width:110px"><img src="env/grafics/minus.gif" alt=""> must-not-match</td>
<td>
<input name="indexSolrQueryMustNotMatch" id="indexSolrQueryMustNotMatch" type="text" size="55" maxlength="100000" value="#[indexSolrQueryMustNotMatch]#" aria-describedby="indexSolrQueryInfo" />
</td>
</tr>
#(/embeddedSolrConnected)#
</table>
</dd>
</dl>
</fieldset>
<fieldset>
@@ -28,6 +28,9 @@
import java.util.Collection;
import java.util.List;
import org.apache.solr.core.SolrCore;
import net.yacy.cora.federate.solr.instance.EmbeddedInstance;
import net.yacy.cora.lod.vocabulary.Tagging;
import net.yacy.cora.protocol.ClientIdentification;
import net.yacy.cora.protocol.RequestHeader;
@@ -49,9 +52,11 @@ public static serverObjects respond(@SuppressWarnings("unused") final RequestHea
final serverObjects prop = new serverObjects();
final String defaultCollection = "user";
// javascript values
// javascript constants
prop.put("matchAllStr", CrawlProfile.MATCH_ALL_STRING);
prop.put("matchNoneStr", CrawlProfile.MATCH_NEVER_STRING);
prop.put("solrQueryMatchAllStr", CrawlProfile.SOLR_MATCH_ALL_QUERY);
prop.put("solrEmptyQueryStr", CrawlProfile.SOLR_EMPTY_QUERY);
prop.put("defaultCollection", defaultCollection);
// ---------- Start point
@@ -317,6 +322,29 @@ public static serverObjects respond(@SuppressWarnings("unused") final RequestHea
} else {
prop.put(CrawlAttribute.INDEXING_MEDIA_TYPE_MUSTNOTMATCH.key, CrawlProfile.MATCH_NEVER_STRING);
}
// Filter with a Solr syntax query
/* Check that the embedded local Solr index is connected, as its schema is required to apply the optional Solr filter query */
final EmbeddedInstance embeddedSolr = sb.index.fulltext().getEmbeddedInstance();
final SolrCore embeddedCore = embeddedSolr != null ? embeddedSolr.getDefaultCore() : null;
final boolean embeddedSolrConnected = embeddedSolr != null && embeddedCore != null;
prop.put("embeddedSolrConnected", embeddedSolrConnected);
if(embeddedSolrConnected) {
if (post != null && post.containsKey(CrawlAttribute.INDEXING_SOLR_QUERY_MUSTMATCH.key)) {
prop.put("embeddedSolrConnected_" + CrawlAttribute.INDEXING_SOLR_QUERY_MUSTMATCH.key,
post.get(CrawlAttribute.INDEXING_SOLR_QUERY_MUSTMATCH.key, CrawlProfile.SOLR_MATCH_ALL_QUERY).trim());
} else {
prop.put("embeddedSolrConnected_" + CrawlAttribute.INDEXING_SOLR_QUERY_MUSTMATCH.key, CrawlProfile.SOLR_MATCH_ALL_QUERY);
}
if (post != null && post.containsKey(CrawlAttribute.INDEXING_SOLR_QUERY_MUSTNOTMATCH.key)) {
prop.put("embeddedSolrConnected_" + CrawlAttribute.INDEXING_SOLR_QUERY_MUSTNOTMATCH.key,
post.get(CrawlAttribute.INDEXING_SOLR_QUERY_MUSTNOTMATCH.key, CrawlProfile.SOLR_EMPTY_QUERY).trim());
} else {
prop.put("embeddedSolrConnected_" + CrawlAttribute.INDEXING_SOLR_QUERY_MUSTNOTMATCH.key, CrawlProfile.SOLR_EMPTY_QUERY);
}
}
// ---------- Clean-Up before Crawl Start
@@ -216,9 +216,23 @@ <h2>Crawler</h2>
<!-- 8 -->
Crawling of "#[crawlingURL]#" started. <strong>Please wait some seconds,
it may take some seconds until the first result appears there.</strong>
If you crawl any un-wanted pages, you can delete them <a href="IndexCreateQueues_p.html?stack=LOCAL">here</a>.<br />
If you crawl any un-wanted pages, you can delete them <a href="IndexCreateQueues_p.html?stack=LOCAL">here</a>.<br />::
<!-- 9 -->
No embedded local Solr index is connected. This is required to use a Solr query filter.
You can configure this with the <a href="IndexFederated_p.html">Index Sources &amp; targets</a> page.::
<!-- 10 -->
The Solr filter query syntax is not valid : <code>#[solrQuery]#</code>::
<!-- 11 -->
Could not parse the Solr filter query : <code>#[solrQuery]#</code>
#(/info)#
</p>
<!-- #(noEmbeddedSolr)#::<div class="alert alert-error">No embedded local Solr index is connected. This is required to use the Solr filter query.
You can configure this with the <a href="IndexFederated_p.html">Index Sources &amp; targets</a> page.</div>
#(/noEmbeddedSolr)#
#(solrQuerySyntaxtError)#::<div class="alert alert-error">The Solr filter query syntax is not valid : #[solrQuery]#</div>
#(/solrQuerySyntaxtError)#-->
<!-- crawl queues -->
#(info-queue)#::<div class="alert alert-warning">#[message]#</div>#(/info-queue)#
Oops, something went wrong.

0 comments on commit cced942

Please sign in to comment.