Permalink
Browse files

Issue #156 : new option to clean up (or not) search cache on crawl start

Prevent also unnecessary search event cache clean-up on each access to
the crawl monitor page (Crawler_p.html).
  • Loading branch information...
luccioman committed Feb 16, 2018
1 parent eeb5fbb commit 519fc9a6009bd81182f9efb16d734f7a552669a7
@@ -381,6 +381,16 @@ <h2>Expert Crawl Start</h2>
<fieldset>
<legend>Clean-Up before Crawl Start</legend>
<dl>
<dt><label for="cleanSearchCache">Clean up search events cache</label></dt>
<dd>
<input type="checkbox" name="cleanSearchCache" id="cleanSearchCache" #(cleanSearchCacheChecked)#::checked="checked"#(/cleanSearchCacheChecked)# aria-describedby="cleanSearchCacheInfo"/>
<div class="info" style="float:right">
<img src="env/grafics/i16.gif" width="16" height="16" alt="Clean up search events cache info"/>
<span style="right:0px;" id="cleanSearchCacheInfo">
Check this option to be sure to get fresh search results including newly crawled documents. Beware that it will also interrupt any refreshing/resorting of search results currently requested from the browser side.
</span>
</div>
</dd>
<dt>No Deletion</dt>
<dd><span class="info" style="float:right"><img src="env/grafics/i16.gif" width="16" height="16" alt="info"/><span style="right:0px;">
After a crawl was done in the past, documents may become stale and eventually they are also deleted on the target host.
@@ -35,6 +35,7 @@
import net.yacy.crawler.data.CrawlProfile;
import net.yacy.document.LibraryProvider;
import net.yacy.search.Switchboard;
import net.yacy.search.SwitchboardConstants;
import net.yacy.search.schema.CollectionSchema;
import net.yacy.server.serverObjects;
import net.yacy.server.serverSwitch;
@@ -364,6 +365,19 @@ public static serverObjects respond(@SuppressWarnings("unused") final RequestHea
} else {
prop.put("deleteIfOlderUnitSelect_list_2_default", 1);
}
// clean up search events cache ?
if (post != null && post.containsKey("cleanSearchCache")) {
prop.put("cleanSearchCacheChecked", post.getBoolean("cleanSearchCache"));
} else {
/*
* no parameter passed : the checkbox is proposed unchecked
* when JavaScript search resort is enabled, as it heavily relies on search events cache
*/
prop.put("cleanSearchCacheChecked", !sb.getConfigBool(SwitchboardConstants.SEARCH_JS_RESORT,
SwitchboardConstants.SEARCH_JS_RESORT_DEFAULT));
}
// delete any document before the crawl is started?
if (post != null && post.containsKey("deleteold")) {
@@ -86,6 +86,7 @@ <h2>Site Crawling</h2>
<input type="hidden" name="recrawl" id="recrawl" value="reload" />
<input type="hidden" name="reloadIfOlderNumber" id="reloadIfOlderNumber" value="3" />
<input type="hidden" name="reloadIfOlderUnit" id="reloadIfOlderUnit" value="day" />
<input type="hidden" name="cleanSearchCache" id="cleanSearchCache" value=#(cleanSearchCacheChecked)#"off"::"on"#(/cleanSearchCacheChecked)# />
<input type="hidden" name="deleteold" id="deleteold" value="on" />
<input type="hidden" name="storeHTCache" id="storeHTCache" value="on" />
<input type="hidden" name="cachePolicy" id="cachePolicy" value="iffresh" />
@@ -74,20 +74,18 @@
import net.yacy.server.serverObjects;
import net.yacy.server.serverSwitch;
/**
 * This servlet does NOT create the Crawler servlet page content! It controls
 * web crawl starts and the crawl monitor page (Crawler_p.html). The interfaces for entering the web crawl parameters are
 * in CrawlStartSite.html and CrawlStartExpert.html.
 */
public class Crawler_p {
// this servlet does NOT create the Crawler servlet page content!
// this servlet starts a web crawl. The interfaces for entering the web crawl parameters are in CrawlStartSite.html and CrawlStartExpert.html
public static serverObjects respond(final RequestHeader header, final serverObjects post, final serverSwitch env) {
// return variable that accumulates replacements
final Switchboard sb = (Switchboard) env;
// clean up all search events
SearchEventCache.cleanupEvents(true);
sb.index.clearCaches(); // every time the ranking is changed we need to remove old orderings
// inital values for AJAX Elements (without JavaScript)
final serverObjects prop = new serverObjects();
prop.put("rejected", 0);
@@ -220,6 +218,12 @@ public static serverObjects respond(final RequestHeader header, final serverObje
if (sb.peers == null) {
prop.put("info", "3");
} else {
if(post.getBoolean("cleanSearchCache")) {
// clean up all search events
SearchEventCache.cleanupEvents(true);
sb.index.clearCaches(); // every time the ranking is changed we need to remove old orderings
}
// remove crawlingFileContent before we record the call
String crawlingFileName = post.get("crawlingFile");
@@ -34,6 +34,7 @@ <h3>Retrieval of Wiki Pages</h3>
<input type="hidden" name="mustnotmatch" value="(.*Recentchangeslinked.*)|(.*Whatlinkshere.*)|(.*MediaWiki.*)" />
<input type="hidden" name="range" value="subpath" />
<input type="hidden" name="crawlingIfOlderCheck" value="on"/>
<input type="hidden" name="cleanSearchCache" id="cleanSearchCache" value=#(cleanSearchCacheChecked)#"off"::"on"#(/cleanSearchCacheChecked)# />
<input type="hidden" name="crawlingIfOlderNumber" value="1" />
<input type="hidden" name="crawlingIfOlderUnit" value="day" />
<input type="hidden" name="crawlingDomFilterCheck" value="off" />
@@ -26,6 +26,7 @@
import net.yacy.cora.protocol.RequestHeader;
import net.yacy.search.Switchboard;
import net.yacy.search.SwitchboardConstants;
import net.yacy.server.serverObjects;
import net.yacy.server.serverSwitch;
@@ -45,6 +46,18 @@ public static serverObjects respond(@SuppressWarnings("unused") final RequestHea
}
prop.put("starturl", "http://");
prop.put("address", a);
// hidden form param : clean up search events cache ?
if (post != null && post.containsKey("cleanSearchCache")) {
prop.put("cleanSearchCacheChecked", post.getBoolean("cleanSearchCache"));
} else {
/*
* no parameter passed : no search event cache clean-up
* when JavaScript search resort is enabled, as it heavily relies on search events cache
*/
prop.put("cleanSearchCacheChecked", !sb.getConfigBool(SwitchboardConstants.SEARCH_JS_RESORT,
SwitchboardConstants.SEARCH_JS_RESORT_DEFAULT));
}
// return rewrite properties
return prop;
@@ -45,6 +45,7 @@ <h3>Retrieval of phpBB3 Forum Pages using a web crawl</h3>
<input type="hidden" name="mustmatch" value=".*" />
<input type="hidden" name="mustnotmatch" value=".*memberlist.*|.*previous.*|.*next.*|.*start=.*|.*p=.*" />
<input type="hidden" name="range" value="subpath" />
<input type="hidden" name="cleanSearchCache" id="cleanSearchCache" value=#(cleanSearchCacheChecked)#"off"::"on"#(/cleanSearchCacheChecked)# />
<input type="hidden" name="crawlingIfOlderCheck" value="on"/>
<input type="hidden" name="crawlingIfOlderNumber" value="1" />
<input type="hidden" name="crawlingIfOlderUnit" value="day" />
@@ -44,6 +44,18 @@ public static serverObjects respond(@SuppressWarnings("unused") final RequestHea
final String repository = "http://" + a + "/";
prop.put("starturl", (intranet) ? repository : "http://");
prop.put("address", a);
// hidden form param : clean up search events cache ?
if (post != null && post.containsKey("cleanSearchCache")) {
prop.put("cleanSearchCacheChecked", post.getBoolean("cleanSearchCache"));
} else {
/*
* no parameter passed : no search event cache clean-up
* when JavaScript search resort is enabled, as it heavily relies on search events cache
*/
prop.put("cleanSearchCacheChecked", !sb.getConfigBool(SwitchboardConstants.SEARCH_JS_RESORT,
SwitchboardConstants.SEARCH_JS_RESORT_DEFAULT));
}
// return rewrite properties
return prop;

0 comments on commit 519fc9a

Please sign in to comment.