Use a constant for the crawler reject reason prefix that has specific processing
luccioman committed Jan 13, 2018
1 parent 4e03335 commit d47afe6
Showing 2 changed files with 7 additions and 4 deletions.
9 changes: 6 additions & 3 deletions source/net/yacy/crawler/CrawlStacker.java
@@ -66,6 +66,9 @@ public final class CrawlStacker {
public static String ERROR_NO_MATCH_MUST_MATCH_FILTER = "url does not match must-match filter ";
public static String ERROR_MATCH_WITH_MUST_NOT_MATCH_FILTER = "url matches must-not-match filter ";

+/** Crawl reject reason prefix having specific processing */
+public static final String CRAWL_REJECT_REASON_DOUBLE_IN_PREFIX = "double in";

private final static ConcurrentLog log = new ConcurrentLog("STACKCRAWL");

private final RobotsTxt robots;
@@ -135,7 +138,7 @@ public Request job(final Request entry) {
final String rejectReason = stackCrawl(entry);

// if the url was rejected we store it into the error URL db
-if (rejectReason != null && !rejectReason.startsWith("double in")) {
+if (rejectReason != null && !rejectReason.startsWith(CRAWL_REJECT_REASON_DOUBLE_IN_PREFIX)) {
final CrawlProfile profile = this.crawler.get(UTF8.getBytes(entry.profileHandle()));
this.nextQueue.errorURL.push(entry.url(), entry.depth(), profile, FailCategory.FINAL_LOAD_CONTEXT, rejectReason, -1);
}
@@ -411,7 +414,7 @@ public String checkAcceptanceInitially(final DigestURL url, final CrawlProfile p
// check if the url is double registered
final HarvestProcess dbocc = this.nextQueue.exists(url.hash()); // returns the name of the queue if entry exists
if (dbocc != null) {
return "double in: " + dbocc.name();
return CRAWL_REJECT_REASON_DOUBLE_IN_PREFIX + ": " + dbocc.name();
}
String urlhash = ASCII.String(url.hash());
LoadTimeURL oldEntry = null;
@@ -452,7 +455,7 @@ public String checkAcceptanceInitially(final DigestURL url, final CrawlProfile p
CrawlStacker.log.fine("RE-CRAWL of URL '" + urlstring + "': this url was crawled " +
((System.currentTimeMillis() - oldDate.longValue()) / 60000 / 60 / 24) + " days ago.");
} else {
return "double in: local index, oldDate = " + ISO8601Formatter.FORMATTER.format(new Date(oldDate));
return CRAWL_REJECT_REASON_DOUBLE_IN_PREFIX + ": local index, oldDate = " + ISO8601Formatter.FORMATTER.format(new Date(oldDate));
}

return null;
2 changes: 1 addition & 1 deletion source/net/yacy/search/Switchboard.java
@@ -2971,7 +2971,7 @@ private Document[] parseDocument(final Response response) throws InterruptedExce
newDocs.add(doc);
} else {
// we consider this as fail urls to have a tracking of the problem
-if (rejectReason != null && !rejectReason.startsWith("double in")) {
+if (rejectReason != null && !rejectReason.startsWith(CrawlStacker.CRAWL_REJECT_REASON_DOUBLE_IN_PREFIX)) {
this.crawlStacker.nextQueue.errorURL.push(response.url(), response.depth(), response.profile(), FailCategory.FINAL_LOAD_CONTEXT, rejectReason, -1);
}
}
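The commit replaces four occurrences of the literal "double in" — the two reject-reason producers in CrawlStacker.checkAcceptanceInitially and the two startsWith checks in CrawlStacker.job and Switchboard.parseDocument — with the new CRAWL_REJECT_REASON_DOUBLE_IN_PREFIX constant. As a rough illustration of the convention (not part of the commit; the class and method names below are hypothetical), a reject reason starting with this prefix marks a URL that is already queued or indexed, so it is skipped when pushing failures to the error URL database:

// Minimal, self-contained sketch of the "double in" prefix convention; not YaCy code.
public final class RejectReasonDemo {

    /** Same value as CrawlStacker.CRAWL_REJECT_REASON_DOUBLE_IN_PREFIX in the commit. */
    public static final String DOUBLE_IN_PREFIX = "double in";

    /** A reject reason is recorded as a crawl error only when it is non-null and not a "double in" duplicate. */
    static boolean isTrackableError(final String rejectReason) {
        return rejectReason != null && !rejectReason.startsWith(DOUBLE_IN_PREFIX);
    }

    public static void main(final String[] args) {
        System.out.println(isTrackableError("double in: LOADED"));                     // false: URL already known, not an error
        System.out.println(isTrackableError("url does not match must-match filter ")); // true: tracked as a crawl error
        System.out.println(isTrackableError(null));                                    // false: null means the URL was accepted
    }
}

Centralizing the prefix in one constant keeps the code that builds the reject reason and the code that tests for it from silently drifting apart if the wording ever changes.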
