fix for slow crawling and better logging in balancer
Orbiter committed Apr 29, 2014
1 parent 3acf416 commit c1c1be8
Showing 2 changed files with 20 additions and 11 deletions.
19 changes: 13 additions & 6 deletions source/net/yacy/crawler/HostBalancer.java
@@ -60,6 +60,7 @@
  */
 public class HostBalancer implements Balancer {
 
+    private final static ConcurrentLog log = new ConcurrentLog("HostBalancer");
     public final static HandleMap depthCache = new RowHandleMap(Word.commonHashLength, Word.commonHashOrder, 2, 8 * 1024 * 1024, "HostBalancer.DepthCache");
 
     private final File hostsPath;
@@ -258,17 +259,23 @@ public Request pop(boolean delay, CrawlSwitchboard cs, RobotsTxt robots) throws
                     String s = i.next();
                     HostQueue hq = this.queues.get(s);
                     if (hq == null) {i.remove(); continue smallstacks;}
-                    int delta = Latency.waitingRemainingGuessed(hq.getHost(), s, robots, ClientIdentification.yacyInternetCrawlerAgent);
-                    if (delta < 0) continue; // keep all non-waiting stacks; they are useful to speed up things
-                    // to protect all small stacks which have a fast throughput, remove all with long waiting time
-                    if (delta >= 1000) {i.remove(); continue smallstacks;}
                     int size = hq.size();
                     if (singletonStacksExist) {
-                        if (size != 1) {i.remove(); continue smallstacks;}
-                    } else {
-                        if (size > 10) {i.remove(); continue smallstacks;}
+                        if (size != 1) {i.remove(); continue smallstacks;} // remove all non-singletons
+                    } else /*smallStacksExist*/ {
+                        if (size > 10) {i.remove(); continue smallstacks;} // remove all large stacks
                     }
+                    // to protect all small stacks which have a fast throughput, remove all with long waiting time
+                    int delta = Latency.waitingRemainingGuessed(hq.getHost(), s, robots, ClientIdentification.yacyInternetCrawlerAgent);
+                    if (delta >= 1000) {i.remove();}
                 }
             }
+            if (this.roundRobinHostHashes.size() == 1) {
+                if (log.isFine()) log.fine("(re-)initialized the round-robin queue with one host");
+            } else {
+                log.info("(re-)initialized the round-robin queue; " + this.roundRobinHostHashes.size() + " hosts.");
+            }
         }
         if (this.roundRobinHostHashes.size() == 0) return null;

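Reading the HostBalancer hunk: the old code asked Latency.waitingRemainingGuessed() for every queue first, and its delta < 0 branch skipped the size filters entirely for non-waiting stacks. The new order runs the cheap size test first, then uses the latency estimate only as a final pruning step, which is what addresses the slow crawling named in the commit title. Below is a minimal, self-contained sketch of that filter ordering; the Stack class, field names, and thresholds are hypothetical stand-ins for YaCy's HostQueue, not the project's actual API.

```java
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;

// Hypothetical stand-in for a host queue: just the two properties the
// filter needs, a size and a guessed remaining politeness delay.
final class Stack {
    final int size;        // number of queued URLs
    final long waitMillis; // estimated remaining crawl-delay wait
    Stack(int size, long waitMillis) { this.size = size; this.waitMillis = waitMillis; }
}

public final class FilterOrderSketch {
    // Apply the cheap size test first; only then pay for the latency
    // estimate, mirroring the reordering in HostBalancer.pop().
    static void shrink(List<Stack> stacks, boolean singletonsExist) {
        Iterator<Stack> i = stacks.iterator();
        while (i.hasNext()) {
            Stack s = i.next();
            if (singletonsExist) {
                if (s.size != 1) { i.remove(); continue; } // remove all non-singletons
            } else {
                if (s.size > 10) { i.remove(); continue; } // remove all large stacks
            }
            if (s.waitMillis >= 1000) i.remove(); // drop long-waiting hosts last
        }
    }

    public static void main(String[] args) {
        List<Stack> stacks = new ArrayList<>();
        stacks.add(new Stack(1, 0));    // kept: singleton, no wait
        stacks.add(new Stack(1, 5000)); // removed: singleton but long wait
        stacks.add(new Stack(7, 0));    // removed: not a singleton
        shrink(stacks, true);
        System.out.println(stacks.size() + " stack(s) remain"); // prints: 1 stack(s) remain
    }
}
```

In this reading, dropping the old early `continue` for non-waiting stacks means every queue is subject to the size pruning, so the round-robin set stays small even when many fast hosts are present.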
12 changes: 7 additions & 5 deletions source/net/yacy/crawler/HostQueue.java
@@ -55,6 +55,8 @@

 public class HostQueue implements Balancer {
 
+    private final static ConcurrentLog log = new ConcurrentLog("HostQueue");
+
     public static final String indexSuffix = ".stack";
     private static final int EcoFSBufferSize = 1000;
     private static final int objectIndexBufferSize = 1000;
@@ -105,7 +107,7 @@ private final void init() {
         if (!(this.hostPath.exists())) this.hostPath.mkdirs();
         this.depthStacks = new TreeMap<Integer, Index>();
         int size = openAllStacks();
-        ConcurrentLog.info("Balancer", "opened HostQueue " + this.hostPath.getAbsolutePath() + " with " + size + " urls.");
+        if (log.isInfo()) log.info("opened HostQueue " + this.hostPath.getAbsolutePath() + " with " + size + " urls.");
     }
 
     public String getHost() {
@@ -406,15 +408,15 @@ public Request pop(boolean delay, CrawlSwitchboard cs, RobotsTxt robots) throws

         // check blacklist (again) because the user may have created blacklist entries after the queue has been filled
         if (Switchboard.urlBlacklist.isListed(BlacklistType.CRAWLER, crawlEntry.url())) {
-            ConcurrentLog.fine("CRAWLER", "URL '" + crawlEntry.url() + "' is in blacklist.");
+            if (log.isFine()) log.fine("URL '" + crawlEntry.url() + "' is in blacklist.");
             continue mainloop;
         }
 
         // at this point we must check if the crawlEntry has relevance because the crawl profile still exists
         // if not: return null. A calling method must handle the null value and try again
         profileEntry = cs.get(UTF8.getBytes(crawlEntry.profileHandle()));
         if (profileEntry == null) {
-            ConcurrentLog.fine("Balancer", "no profile entry for handle " + crawlEntry.profileHandle());
+            if (log.isFine()) log.fine("no profile entry for handle " + crawlEntry.profileHandle());
             continue mainloop;
         }

@@ -432,7 +434,7 @@ public Request pop(boolean delay, CrawlSwitchboard cs, RobotsTxt robots) throws
         // in best case, this should never happen if the balancer works properly
         // this is only a protection against the worst case, where the crawler could
         // behave in a DoS-manner
-        ConcurrentLog.info("BALANCER", "forcing crawl-delay of " + sleeptime + " milliseconds for " + crawlEntry.url().getHost() + ": " + Latency.waitingRemainingExplain(crawlEntry.url(), robots, agent));
+        if (log.isInfo()) log.info("forcing crawl-delay of " + sleeptime + " milliseconds for " + crawlEntry.url().getHost() + ": " + Latency.waitingRemainingExplain(crawlEntry.url(), robots, agent));
         long loops = sleeptime / 1000;
         long rest = sleeptime % 1000;
         if (loops < 3) {
@@ -444,7 +446,7 @@ public Request pop(boolean delay, CrawlSwitchboard cs, RobotsTxt robots) throws
             // must be synchronized here to avoid 'takeover' moves from other threads which then idle the same time which would not be enough
             if (rest > 0) {try {this.wait(rest);} catch (final InterruptedException e) {}}
             for (int i = 0; i < loops; i++) {
-                ConcurrentLog.info("BALANCER", "waiting for " + crawlEntry.url().getHost() + ": " + (loops - i) + " seconds remaining...");
+                if (log.isInfo()) log.info("waiting for " + crawlEntry.url().getHost() + ": " + (loops - i) + " seconds remaining...");
                 try {this.wait(1000); } catch (final InterruptedException e) {}
             }
         }
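The "better logging" half of the commit follows one pattern throughout HostQueue: replace static `ConcurrentLog.info("Balancer", ...)` calls with a per-class logger and guard each call with `isInfo()`/`isFine()`, so the message string is only concatenated when that level is actually enabled. ConcurrentLog is YaCy's own wrapper; the sketch below shows the same guard with the JDK's java.util.logging instead, and the class and method names are illustrative only.

```java
import java.util.logging.Level;
import java.util.logging.Logger;

public final class GuardedLoggingSketch {
    // One logger per class, as the commit introduces for HostQueue and HostBalancer.
    private static final Logger log = Logger.getLogger("HostQueue");

    static void popExample(String host, int seconds) {
        // Unguarded form: the argument string is built even when INFO is disabled.
        // log.info("waiting for " + host + ": " + seconds + " seconds remaining...");

        // Guarded form: the concatenation only happens when the level is enabled,
        // which matters inside a hot loop like the crawler's pop().
        if (log.isLoggable(Level.INFO)) {
            log.info("waiting for " + host + ": " + seconds + " seconds remaining...");
        }
    }

    public static void main(String[] args) {
        popExample("example.org", 3);
    }
}
```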
