Skip to content
Permalink
Browse files

Recrawl:

- set the chunksize to 100 to meet the max of the embedded solr
- re-enable sorting (the case where we switched it off should no longer occur)
- enable recrawling on remote-solr
  • Loading branch information...
sgaebel
sgaebel committed Jan 4, 2019
1 parent 8f58c1d commit 8d2e7262d9568658269ebe16b9cc28ac81dac6af
Showing with 50 additions and 55 deletions.
  1. +48 −52 htroot/IndexReIndexMonitor_p.java
  2. +2 −3 source/net/yacy/crawler/RecrawlBusyThread.java
@@ -140,58 +140,54 @@ public static serverObjects respond(final RequestHeader header, final serverObje
if (recrawlbt == null || recrawlbt.shutdownInProgress()) {
prop.put("recrawljobrunning_simulationResult", 0);
prop.put("recrawljobrunning_error", 0);
if(!sb.index.fulltext().connectedLocalSolr()) {
prop.put("recrawljobrunning_error", 1); // Re-crawl works only with an embedded local Solr index
} else {
if (post.containsKey("recrawlnow")) {
sb.deployThread(RecrawlBusyThread.THREAD_NAME, "ReCrawl", "recrawl existing documents", null,
new RecrawlBusyThread(Switchboard.getSwitchboard(), recrawlQuery, inclerrdoc), 1000);
recrawlbt = sb.getThread(RecrawlBusyThread.THREAD_NAME);

/* store this call as an api call for easy scheduling possibility */
if(sb.tables != null) {
/* We avoid creating a duplicate of any already recorded API call with the same parameters */
final Row lastExecutedCall = WorkTables
.selectLastExecutedApiCall(IndexReIndexMonitor_p.SERVLET_NAME, post, sb);
if (lastExecutedCall != null && !post.containsKey(WorkTables.TABLE_API_COL_APICALL_PK)) {
byte[] lastExecutedCallPk = lastExecutedCall.getPK();
if (lastExecutedCallPk != null) {
post.add(WorkTables.TABLE_API_COL_APICALL_PK, UTF8.String(lastExecutedCallPk));
}
}
sb.tables.recordAPICall(post, IndexReIndexMonitor_p.SERVLET_NAME, WorkTables.TABLE_API_TYPE_CRAWLER,
"Recrawl documents matching selection query : " + recrawlQuery);
}
} else if(post.containsKey("simulateRecrawl")) {
final SolrConnector solrConnector = sb.index.fulltext().getDefaultConnector();
if (solrConnector != null && !solrConnector.isClosed()) {
try {
/* Ensure indexed data is up-to-date */
solrConnector.commit(true);
// query all or only httpstatus=200 depending on includefailed flag
final String finalQuery = RecrawlBusyThread.buildSelectionQuery(recrawlQuery, inclerrdoc);
final long count = solrConnector.getCountByQuery(finalQuery);
prop.put("recrawljobrunning_simulationResult", 1);
prop.put("recrawljobrunning_simulationResult_docCount", count);
if(count > 0) {
/* Got some results : add a link to the related solr select URL for easily browsing results */
final int maxRows = 10;
final String solrSelectUrl = genLocalSolrSelectUrl(finalQuery, maxRows);
prop.put("recrawljobrunning_simulationResult_showSelectLink", 1);
prop.put("recrawljobrunning_simulationResult_showSelectLink_rows", maxRows);
prop.put("recrawljobrunning_simulationResult_showSelectLink_browseSelectedUrl", solrSelectUrl);
} else {
prop.put("recrawljobrunning_simulationResult_showSelectLink", 0);
}
} catch (final IOException e) {
prop.put("recrawljobrunning_simulationResult", 2);
ConcurrentLog.logException(e);
}
} else {
prop.put("recrawljobrunning_simulationResult", 3);
}
}
}
if (post.containsKey("recrawlnow")) {
sb.deployThread(RecrawlBusyThread.THREAD_NAME, "ReCrawl", "recrawl existing documents", null,
new RecrawlBusyThread(Switchboard.getSwitchboard(), recrawlQuery, inclerrdoc), 1000);
recrawlbt = sb.getThread(RecrawlBusyThread.THREAD_NAME);

/* store this call as an api call for easy scheduling possibility */
if(sb.tables != null) {
/* We avoid creating a duplicate of any already recorded API call with the same parameters */
final Row lastExecutedCall = WorkTables
.selectLastExecutedApiCall(IndexReIndexMonitor_p.SERVLET_NAME, post, sb);
if (lastExecutedCall != null && !post.containsKey(WorkTables.TABLE_API_COL_APICALL_PK)) {
byte[] lastExecutedCallPk = lastExecutedCall.getPK();
if (lastExecutedCallPk != null) {
post.add(WorkTables.TABLE_API_COL_APICALL_PK, UTF8.String(lastExecutedCallPk));
}
}
sb.tables.recordAPICall(post, IndexReIndexMonitor_p.SERVLET_NAME, WorkTables.TABLE_API_TYPE_CRAWLER,
"Recrawl documents matching selection query : " + recrawlQuery);
}
} else if(post.containsKey("simulateRecrawl")) {
final SolrConnector solrConnector = sb.index.fulltext().getDefaultConnector();
if (solrConnector != null && !solrConnector.isClosed()) {
try {
/* Ensure indexed data is up-to-date */
solrConnector.commit(true);
// query all or only httpstatus=200 depending on includefailed flag
final String finalQuery = RecrawlBusyThread.buildSelectionQuery(recrawlQuery, inclerrdoc);
final long count = solrConnector.getCountByQuery(finalQuery);
prop.put("recrawljobrunning_simulationResult", 1);
prop.put("recrawljobrunning_simulationResult_docCount", count);
if(count > 0) {
/* Got some results : add a link to the related solr select URL for easily browsing results */
final int maxRows = 10;
final String solrSelectUrl = genLocalSolrSelectUrl(finalQuery, maxRows);
prop.put("recrawljobrunning_simulationResult_showSelectLink", 1);
prop.put("recrawljobrunning_simulationResult_showSelectLink_rows", maxRows);
prop.put("recrawljobrunning_simulationResult_showSelectLink_browseSelectedUrl", solrSelectUrl);
} else {
prop.put("recrawljobrunning_simulationResult_showSelectLink", 0);
}
} catch (final IOException e) {
prop.put("recrawljobrunning_simulationResult", 2);
ConcurrentLog.logException(e);
}
} else {
prop.put("recrawljobrunning_simulationResult", 3);
}
}

if(post.containsKey("recrawlDefaults")) {
recrawlQuery = RecrawlBusyThread.DEFAULT_QUERY;
@@ -72,7 +72,7 @@
private boolean includefailed;

private int chunkstart = 0;
private final int chunksize;
private final int chunksize = 100;
private final Switchboard sb;

/** buffer of urls to recrawl */
@@ -129,8 +129,7 @@ public RecrawlBusyThread(final Switchboard xsb, final String query, final boolea
this.urlstack = new HashSet<DigestURL>();
// workaround to prevent solr exception on existing index (not fully reindexed) since intro of schema with docvalues
// org.apache.solr.core.SolrCore java.lang.IllegalStateException: unexpected docvalues type NONE for field 'load_date_dt' (expected=NUMERIC). Use UninvertingReader or index with docvalues.
this.solrSortBy = null; // CollectionSchema.load_date_dt.getSolrFieldName() + " asc";
this.chunksize = sb.getConfigInt(SwitchboardConstants.CRAWLER_THREADS_ACTIVE_MAX, 200);
solrSortBy = CollectionSchema.load_date_dt.getSolrFieldName() + " asc";

final SolrConnector solrConnector = this.sb.index.fulltext().getDefaultConnector();
if (solrConnector != null && !solrConnector.isClosed()) {

0 comments on commit 8d2e726

Please sign in to comment.
You can’t perform that action at this time.