Permalink
Browse files

Added more details to the recrawl job report

  • Loading branch information...
luccioman committed Jan 12, 2018
1 parent d95d393 commit 4e033356257b0ef969dabcc2a22460712add4a52
@@ -132,12 +132,17 @@ <h2>Re-Crawl Index Documents</h2>
<h3 class="panel-title">#(jobStatus)#::::Last #(/jobStatus)#Re-Crawl job report</h3>
</div>
<div class="panel-body">
#(error)#::<div class="alert alert-danger" role="alert">The job terminated early due to an error when requesting the Solr index.</div>#(/error)#
<table class="table">
<tbody>
<tr>
<th scope="row">Status</th>
<td>#(jobStatus)#Running::Shutdown in progress::Terminated#(/jobStatus)#</td>
</tr>
<tr>
<th scope="row">Query</th>
<td>#[recrawlquerytext]#</td>
</tr>
<tr>
<th scope="row">Start time</th>
<td>#[startTime]#</td>
@@ -147,8 +152,16 @@ <h3 class="panel-title">#(jobStatus)#::::Last #(/jobStatus)#Re-Crawl job report<
<td>#[endTime]#</td>
</tr>
<tr>
<th scope="row">Count</th>
<td>#[recrawledUrlsCount]# URLs added to the crawler queue for recrawl</td>
<th scope="row" title="URLs added to the crawler queue for recrawl">Recrawled URLs</th>
<td>#[recrawledUrlsCount]#</td>
</tr>
<tr>
<th scope="row" title="URLs rejected for some reason by the crawl stacker or the crawler queue. Please check the logs for more details.">Rejected URLs</th>
<td>#[rejectedUrlsCount]#</td>
</tr>
<tr>
<th scope="row">Malformed URLs</th>
<td title="#[malformedUrlsDeletedCount]# deleted from the index">#[malformedUrlsCount]#</td>
</tr>
</tbody>
</table>
@@ -257,6 +257,24 @@ private static void processRecrawlReport(final RequestHeader header, final Switc
final serverObjects prop, final RecrawlBusyThread recrawlbt) {
if (recrawlbt != null) {
prop.put("recrawlReport", 1);
prop.put("recrawlReport_error", recrawlbt.isTerminatedBySolrFailure());
int jobStatus;
if(recrawlbt.isAlive()) {
if(recrawlbt.shutdownInProgress()) {
jobStatus = 1; // Shutdown in progress
} else {
jobStatus = 0; // Running
}
} else {
jobStatus = 2; // Terminated
}
prop.put("recrawlReport_jobStatus", jobStatus);
prop.put("recrawlReport_recrawlquerytext", recrawlbt.getQuery());
Locale formatLocale;
if (sb != null) {
String lng = sb.getConfig("locale.language", Locale.ENGLISH.getLanguage());
@@ -272,20 +290,12 @@ private static void processRecrawlReport(final RequestHeader header, final Switc
}
final DateTimeFormatter formatter = DateTimeFormatter.ofLocalizedDateTime(FormatStyle.MEDIUM)
.withLocale(formatLocale);
int jobStatus;
if(recrawlbt.isAlive()) {
if(recrawlbt.shutdownInProgress()) {
jobStatus = 1; // Shutdown in progress
} else {
jobStatus = 0; // Running
}
} else {
jobStatus = 2; // Terminated
}
prop.put("recrawlReport_jobStatus", jobStatus);
prop.put("recrawlReport_startTime", formatDateTime(formatter, recrawlbt.getStartTime()));
prop.put("recrawlReport_endTime", formatDateTime(formatter, recrawlbt.getEndTime()));
prop.put("recrawlReport_recrawledUrlsCount", recrawlbt.getRecrawledUrlsCount());
prop.put("recrawlReport_rejectedUrlsCount", recrawlbt.getRejectedUrlsCount());
prop.put("recrawlReport_malformedUrlsCount", recrawlbt.getMalformedUrlsCount());
prop.put("recrawlReport_malformedUrlsDeletedCount", recrawlbt.getMalformedUrlsDeletedCount());
} else {
prop.put("recrawlReport", 0);
}
@@ -80,11 +80,23 @@
/** Total number of URLs added to the crawler queue for recrawl */
private long recrawledUrlsCount = 0;
/** Total number of URLs rejected for some reason by the crawl stacker or the crawler queue */
private long rejectedUrlsCount = 0;
/** Total number of malformed URLs found */
private long malformedUrlsCount = 0;
/** Total number of malformed URLs deleted from index */
private long malformedUrlsDeletedCount = 0;
private String solrSortBy;
/** Set to true when more URLs are still to be processed */
private boolean moreToRecrawl = true;
/** True when the job terminated early because an error occurred when requesting the Solr index, or the Solr index was closed */
private boolean terminatedBySolrFailure = false;
/** The recrawl job start time */
private LocalDateTime startTime;
@@ -173,20 +185,22 @@ private boolean feedToCrawler() {
if (!this.urlstack.isEmpty()) {
final CrawlProfile profile = sb.crawler.defaultTextSnippetGlobalProfile;
for (DigestURL url : this.urlstack) {
for (final DigestURL url : this.urlstack) {
final Request request = sb.loader.request(url, true, true);
String acceptedError = sb.crawlStacker.checkAcceptanceChangeable(url, profile, 0);
if (!includefailed && acceptedError == null) { // skip check if failed docs to be included
acceptedError = sb.crawlStacker.checkAcceptanceInitially(url, profile);
}
if (acceptedError != null) {
this.rejectedUrlsCount++;
ConcurrentLog.info(THREAD_NAME, "addToCrawler: cannot load " + url.toNormalform(true) + ": " + acceptedError);
continue;
}
final String s;
s = sb.crawlQueues.noticeURL.push(NoticedURL.StackType.LOCAL, request, profile, sb.robots);
if (s != null) {
this.rejectedUrlsCount++;
ConcurrentLog.info(THREAD_NAME, "addToCrawler: failed to add " + url.toNormalform(true) + ": " + s);
} else {
added++;
@@ -248,9 +262,10 @@ private boolean processSingleQuery() {
return true;
}
SolrDocumentList docList = null;
SolrConnector solrConnector = sb.index.fulltext().getDefaultConnector();
final SolrConnector solrConnector = sb.index.fulltext().getDefaultConnector();
if (solrConnector.isClosed()) {
this.urlsToRecrawl = 0;
this.terminatedBySolrFailure = true;
return false;
}
@@ -261,17 +276,20 @@ private boolean processSingleQuery() {
this.urlsToRecrawl = docList.getNumFound();
} catch (final Throwable e) {
this.urlsToRecrawl = 0;
this.terminatedBySolrFailure = true;
}
if (docList != null) {
for (final SolrDocument doc : docList) {
try {
this.urlstack.add(new DigestURL((String) doc.getFieldValue(CollectionSchema.sku.getSolrFieldName())));
} catch (MalformedURLException ex) {
} catch (final MalformedURLException ex) {
this.malformedUrlsCount++;
try { // if index entry hasn't a valid url (useless), delete it
solrConnector.deleteById((String) doc.getFieldValue(CollectionSchema.id.getSolrFieldName()));
this.malformedUrlsDeletedCount++;
ConcurrentLog.severe(THREAD_NAME, "deleted index document with invalid url " + (String) doc.getFieldValue(CollectionSchema.sku.getSolrFieldName()));
} catch (IOException ex1) {
} catch (final IOException ex1) {
ConcurrentLog.severe(THREAD_NAME, ex1.getMessage());
}
}
@@ -297,12 +315,42 @@ public long getUrlsToRecrawl() {
return this.urlsToRecrawl;
}
/**
* @return The total number of URLs added to the crawler queue for recrawl
*/
/**
 * Gives the running total of URLs that this job has added to the crawler
 * queue for recrawl.
 *
 * @return the current value of the recrawled URLs counter
 */
public long getRecrawledUrlsCount() {
return recrawledUrlsCount;
}
/**
 * Gives the running total of URLs that were rejected, either by the crawl
 * stacker acceptance checks or when pushing to the crawler queue.
 *
 * @return the current value of the rejected URLs counter
 */
public long getRejectedUrlsCount() {
return rejectedUrlsCount;
}
/**
 * Gives the running total of malformed URLs encountered by this job.
 *
 * @return the current value of the malformed URLs counter
 */
public long getMalformedUrlsCount() {
return malformedUrlsCount;
}
/**
 * Gives the running total of malformed URLs whose documents were deleted
 * from the index.
 *
 * @return the current value of the deleted malformed URLs counter
 */
public long getMalformedUrlsDeletedCount() {
return malformedUrlsDeletedCount;
}
/**
 * Tells whether the job stopped early because of a Solr problem: either an
 * error occurred when requesting the Solr index, or the Solr index was
 * closed.
 *
 * @return true when the job was terminated by such a Solr failure
 */
public boolean isTerminatedBySolrFailure() {
return terminatedBySolrFailure;
}
/** @return The recrawl job start time */
public LocalDateTime getStartTime() {

0 comments on commit 4e03335

Please sign in to comment.