Permalink
Browse files

Added a report info box about the last terminated recrawl job, if any

For easier monitoring of recrawls.
  • Loading branch information...
luccioman committed Jan 9, 2018
1 parent b2af25b commit 433e241e4f83c4d912f403fc1624c6ce43b719c9
@@ -62,7 +62,38 @@ <h2>Re-Crawl Index Documents</h2>
<p>Searches the local index and selects documents to add to the crawler (recrawl the document).
This runs transparent as background job. Documents are added to the crawler only if no other crawls are active
and are added in small chunks.</p>
<form action="IndexReIndexMonitor_p.html?setup=recrawljob" method="post" enctype="multipart/form-data" accept-charset="UTF-8">
<div class="container-fluid">
<div class="row">
#(recrawljobrunning)#
#(recrawlReport)#::
<div class="col-md-10 col-lg-6">
<div class="panel panel-info">
<div class="panel-heading">
<h3 class="panel-title">Last Re-Crawl job report</h3>
</div>
<div class="panel-body">
<table class="table">
<tbody>
<tr>
<th scope="row">Start time</th>
<td>#[startTime]#</td>
</tr>
<tr>
<th scope="row">End time</th>
<td>#[endTime]#</td>
</tr>
<tr>
<th scope="row">Count</th>
<td>#[recrawledUrlsCount]# URLs added to the crawler queue for recrawl</td>
</tr>
</tbody>
</table>
</div>
</div>
</div>
#(/recrawlReport)#
#(/recrawljobrunning)#
<form action="IndexReIndexMonitor_p.html?setup=recrawljob" method="post" enctype="multipart/form-data" accept-charset="UTF-8" class="col-md-10 col-lg-6">
<input type="hidden" name="transactionToken" value="#[transactionToken]#" />
<table><tr valign="top"><td>
<fieldset>
@@ -119,6 +150,8 @@ <h2>Re-Crawl Index Documents</h2>
</td>
</tr></table>
</form>
</div>
</div>
#%env/templates/footer.template%#
</body>
</html>
@@ -17,17 +17,21 @@
* along with this program in the file lgpl21.txt If not, see
* <http://www.gnu.org/licenses/>.
*/
import java.io.IOException;
import java.time.DateTimeException;
import java.time.LocalDateTime;
import java.time.format.DateTimeFormatter;
import java.time.format.FormatStyle;
import java.util.Locale;
import net.yacy.migration;
import net.yacy.cora.federate.solr.connector.SolrConnector;
import net.yacy.cora.protocol.RequestHeader;
import net.yacy.cora.sorting.OrderedScoreMap;
import net.yacy.cora.util.ConcurrentLog;
import net.yacy.kelondro.workflow.BusyThread;
import java.io.IOException;
import net.yacy.migration;
import net.yacy.crawler.RecrawlBusyThread;
import net.yacy.data.TransactionManager;
import net.yacy.kelondro.workflow.BusyThread;
import net.yacy.search.Switchboard;
import net.yacy.search.index.ReindexSolrBusyThread;
import net.yacy.server.serverObjects;
@@ -149,7 +153,9 @@ public static serverObjects respond(@SuppressWarnings("unused") final RequestHea
}
} else {
if (post.containsKey("stoprecrawl")) {
sb.terminateThread(RecrawlBusyThread.THREAD_NAME, false);
/* We do not remove the thread from the Switchboard worker threads using serverSwitch.terminateThread(String,boolean),
* because we want to be able to provide a report after its termination */
recrawlbt.terminate(false);
prop.put("recrawljobrunning", 0);
}
}
@@ -169,12 +175,62 @@ public static serverObjects respond(@SuppressWarnings("unused") final RequestHea
prop.put("recrawljobrunning_recrawlquerytext", ((RecrawlBusyThread) recrawlbt).getQuery());
prop.put("recrawljobrunning_includefailedurls", ((RecrawlBusyThread) recrawlbt).getIncludeFailed());
} else {
prop.put("recrawljobrunning", 0);
prop.put("recrawljobrunning", 0);
processRecrawlReport(header, sb, prop, (RecrawlBusyThread)recrawlbt);
prop.put("recrawljobrunning_recrawlquerytext", recrawlQuery);
prop.put("recrawljobrunning_includefailedurls", inclerrdoc);
}
// return rewrite properties
return prop;
}
/**
 * Fills the template properties describing the last terminated recrawl job, when there is one.
 * <p>
 * When no terminated job is available, only the report switch is set to 0.
 * Otherwise the switch is set to 1 and the job start time, end time and
 * number of URLs added to the crawler queue are written. Date/time values are
 * formatted with the locale given by the "locale.language" setting, or with
 * the request locale when that setting is "browser".
 * </p>
 * @param header current request header. Must not be null.
 * @param sb Switchboard instance holding server environment
 * @param prop this template result
 * @param recrawlbt the terminated recrawl thread. Can be null.
 */
private static void processRecrawlReport(final RequestHeader header, final Switchboard sb,
        final serverObjects prop, final RecrawlBusyThread recrawlbt) {
    if (recrawlbt == null) {
        /* No terminated job to report on */
        prop.put("recrawljobrunning_recrawlReport", 0);
        return;
    }
    prop.put("recrawljobrunning_recrawlReport", 1);

    final String configuredLanguage = sb.getConfig("locale.language", Locale.ENGLISH.getLanguage());
    /* The client locale is used only when locale.language is set to browser */
    final Locale reportLocale = "browser".equals(configuredLanguage) ? header.getLocale()
            : Locale.forLanguageTag(configuredLanguage);
    final DateTimeFormatter reportFormatter = DateTimeFormatter.ofLocalizedDateTime(FormatStyle.MEDIUM)
            .withLocale(reportLocale);

    final String formattedStart = formatDateTime(reportFormatter, recrawlbt.getStartTime());
    prop.put("recrawljobrunning_recrawlReport_startTime", formattedStart);
    final String formattedEnd = formatDateTime(reportFormatter, recrawlbt.getEndTime());
    prop.put("recrawljobrunning_recrawlReport_endTime", formattedEnd);
    prop.put("recrawljobrunning_recrawlReport_recrawledUrlsCount", recrawlbt.getRecrawledUrlsCount());
}
/**
 * Formats a date/time value for display, tolerating a missing value.
 * @param formatter the formatter to use. Must not be null.
 * @param time the date/time value to format. Can be null.
 * @return the formatted date/time, its ISO-8601 representation when the
 *         formatter can not handle the value, or an empty string when time is null.
 */
protected static String formatDateTime(final DateTimeFormatter formatter, final LocalDateTime time) {
    if (time == null) {
        /* Nothing to format */
        return "";
    }
    try {
        return time.format(formatter);
    } catch (final DateTimeException e) {
        /* The formatter failed on this value : fall back to ISO-8601 */
        return time.toString();
    }
}
}
@@ -25,8 +25,13 @@
import java.io.IOException;
import java.net.MalformedURLException;
import java.time.LocalDateTime;
import java.util.HashSet;
import java.util.Set;
import org.apache.solr.common.SolrDocument;
import org.apache.solr.common.SolrDocumentList;
import net.yacy.cora.document.id.DigestURL;
import net.yacy.cora.federate.solr.connector.SolrConnector;
import net.yacy.cora.util.ConcurrentLog;
@@ -37,8 +42,6 @@
import net.yacy.search.Switchboard;
import net.yacy.search.SwitchboardConstants;
import net.yacy.search.schema.CollectionSchema;
import org.apache.solr.common.SolrDocument;
import org.apache.solr.common.SolrDocumentList;
/**
* Selects documents by a query from the local index
@@ -74,10 +77,19 @@
/** The total number of candidate URLs found for recrawl */
private long urlsToRecrawl = 0;
/** Total number of URLs added to the crawler queue for recrawl */
private long recrawledUrlsCount = 0;
private String solrSortBy;
/** Set to true when more URLs are still to be processed */
private boolean moreToRecrawl = true;
/** The recrawl job start time */
private LocalDateTime startTime;
/** The recrawl job end time */
private LocalDateTime endTime;
/**
* @param xsb
@@ -117,7 +129,7 @@ public void setQuery(String q, boolean includefailedurls) {
this.chunkstart = 0;
}
public String getQuery () {
public String getQuery() {
return this.currentQuery;
}
@@ -178,6 +190,7 @@ private boolean feedToCrawler() {
ConcurrentLog.info(THREAD_NAME, "addToCrawler: failed to add " + url.toNormalform(true) + ": " + s);
} else {
added++;
this.recrawledUrlsCount++;
}
}
this.urlstack.clear();
@@ -212,7 +225,18 @@ public boolean job() {
didSomething = feedToCrawler();
}
return didSomething;
}
/**
 * Records the recrawl job start time, then delegates to the parent start().
 */
@Override
public synchronized void start() {
/* The start time is assigned before the parent start() is invoked */
this.startTime = LocalDateTime.now();
super.start();
}
/**
 * Delegates termination to the parent implementation, then records the job end time.
 * <p>
 * The end time is recorded on the first call only, so that calling this method
 * again on an already terminated job does not shift the reported end time.
 * </p>
 * @param waitFor forwarded unchanged to the parent terminate(boolean)
 */
@Override
public void terminate(boolean waitFor) {
    super.terminate(waitFor);
    if (this.endTime == null) {
        /* Keep the first recorded value : later calls must not overwrite it */
        this.endTime = LocalDateTime.now();
    }
}
/**
@@ -273,6 +297,23 @@ public long getUrlsToRecrawl() {
return this.urlsToRecrawl;
}
/**
 * @return the number of URLs this job has added so far to the crawler queue for recrawl
 */
public long getRecrawledUrlsCount() {
    return recrawledUrlsCount;
}
/**
 * @return the time recorded when start() was called, or null when it has not been called
 */
public LocalDateTime getStartTime() {
    return startTime;
}
/**
 * @return the time recorded when terminate(boolean) was called, or null when it has not been called
 */
public LocalDateTime getEndTime() {
    return endTime;
}
@Override
public void freemem() {
this.urlstack.clear();

0 comments on commit 433e241

Please sign in to comment.