Fixed exceeding max size of failreason_s Solr field on large link list

When using 'From Link-List of URL' as a crawl start, with lists on the order of a
thousand links or more, the maximum size of the failreason_s Solr field (32 KB)
was exceeded by the string representation of the URL must-match filter whenever a
crawled URL was rejected because it did not match that filter.
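
For illustration only, a minimal sketch (the MustMatchSizeDemo class below is hypothetical, not YaCy code) of how a must-match regex assembled from a link list of a few thousand URLs grows past the 32 KB field limit when rendered as a String:

    import java.util.regex.Pattern;
    import java.util.stream.Collectors;
    import java.util.stream.IntStream;

    public class MustMatchSizeDemo {
        public static void main(String[] args) {
            // Hypothetical: build an alternation of quoted URLs, similar in spirit
            // to a must-match filter derived from a 'From Link-List of URL' start.
            String mustMatch = IntStream.range(0, 2000)
                    .mapToObj(i -> Pattern.quote("https://example.org/page/" + i + ".html"))
                    .collect(Collectors.joining("|", "(", ")"));
            // Each quoted URL contributes roughly 40 characters, so 2000 links
            // already yield a pattern String well above 32 * 1024 characters.
            System.out.println("pattern length: " + mustMatch.length());
        }
    }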
luccioman committed Jul 11, 2018
1 parent f467601 commit dcad393fe592170b681958d9abe946ec3d0c1556
@@ -496,8 +496,11 @@ public String checkAcceptanceChangeable(final DigestURL url, final CrawlProfile

         // filter with must-match for URLs
         if ((depth > 0) && !profile.urlMustMatchPattern().matcher(urlstring).matches()) {
-            if (CrawlStacker.log.isFine()) CrawlStacker.log.fine("URL '" + urlstring + "' does not match must-match crawling filter '" + profile.urlMustMatchPattern().toString() + "'.");
-            return ERROR_NO_MATCH_MUST_MATCH_FILTER + profile.urlMustMatchPattern().toString();
+            final String patternStr = profile.formattedUrlMustMatchPattern();
+            if (CrawlStacker.log.isFine()) {
+                CrawlStacker.log.fine("URL '" + urlstring + "' does not match must-match crawling filter '" + patternStr + "'.");
+            }
+            return ERROR_NO_MATCH_MUST_MATCH_FILTER + patternStr;
         }
 
         // filter with must-not-match for URLs
@@ -467,6 +467,24 @@ public Pattern urlMustMatchPattern() {
         }
         return this.crawlerurlmustmatch;
     }
 
+    /**
+     * Render the urlMustMatchPattern as a String of limited size, suffixing it with
+     * "..." when it is truncated. Used to prevent unnecessary growth of the logs,
+     * and to prevent exceeding the field size limit for
+     * CollectionSchema.failreason_s (32k) when the pattern is present in a fail doc
+     * added to the Solr index.
+     *
+     * @return the urlMustMatchPattern formatted as a String of limited size
+     */
+    public String formattedUrlMustMatchPattern() {
+        String patternStr = urlMustMatchPattern().toString();
+        if(patternStr.length() > 1000) {
+            /* The pattern may be quite large when using the 'From Link-List of URL' crawl start point. */
+            patternStr = patternStr.substring(0, Math.min(patternStr.length(), 1000)) + "...";
+        }
+        return patternStr;
+    }
+
     /**
      * Gets the regex which must not be matched by URLs in order to be crawled.
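
As a rough standalone sketch of the truncation rule added above (the PatternFormatDemo class and its format helper are illustrative only, not part of this commit), a pattern longer than 1000 characters comes back as 1000 characters plus the "..." suffix:

    public class PatternFormatDemo {
        // Same rule as formattedUrlMustMatchPattern(): cut at 1000 characters
        // and append "..." so readers of the log or fail doc see it was shortened.
        static String format(String patternStr) {
            if (patternStr.length() > 1000) {
                patternStr = patternStr.substring(0, 1000) + "...";
            }
            return patternStr;
        }

        public static void main(String[] args) {
            StringBuilder longPattern = new StringBuilder();
            for (int i = 0; i < 5000; i++) {
                longPattern.append('x'); // stands in for a huge link-list regex
            }
            System.out.println(format(".*\\.html"));                     // printed unchanged
            System.out.println(format(longPattern.toString()).length()); // 1003 = 1000 + "...".length()
        }
    }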
@@ -371,7 +371,7 @@ private void load(final Request urlEntry, final String stats) {
+ ", crawlOrder=" + ((profile.remoteIndexing()) ? "true" : "false")
+ ", depth=" + urlEntry.depth()
+ ", crawlDepth=" + profile.depth()
+ ", must-match=" + profile.urlMustMatchPattern().toString()
+ ", must-match=" + profile.formattedUrlMustMatchPattern()
+ ", must-not-match=" + profile.urlMustNotMatchPattern().toString()
+ ", permission=" + ((this.sb.peers == null) ? "undefined" : (((this.sb.peers.mySeed().isSenior()) || (this.sb.peers.mySeed().isPrincipal())) ? "true" : "false")));
}
@@ -2992,7 +2992,7 @@ public IndexingQueueEntry parseDocument(final IndexingQueueEntry in) {
"processResourceStack processCase=" + processCase
+ ", depth=" + response.depth()
+ ", maxDepth=" + ((response.profile() == null) ? "null" : Integer.toString(response.profile().depth()))
+ ", must-match=" + ((response.profile() == null) ? "null" : response.profile().urlMustMatchPattern().toString())
+ ", must-match=" + ((response.profile() == null) ? "null" : response.profile().formattedUrlMustMatchPattern())
+ ", must-not-match=" + ((response.profile() == null) ? "null" : response.profile().urlMustNotMatchPattern().toString())
+ ", initiatorHash=" + ((response.initiator() == null) ? "null" : ASCII.String(response.initiator()))
+ ", url=" + response.url()); // DEBUG
