Skip to content

Commit

Permalink
protection against crawl balancer failure:
Browse files Browse the repository at this point in the history
a minimum of 500 milliseconds distance between two accesses
to the same domain is now ensured

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@3354 6c8d7289-2bf4-0310-a012-ef5d649a1542
  • Loading branch information
orbiter committed Feb 9, 2007
1 parent 1f1f398 commit 8c1d2e0
Show file tree
Hide file tree
Showing 2 changed files with 28 additions and 6 deletions.
31 changes: 26 additions & 5 deletions source/de/anomic/plasma/plasmaCrawlBalancer.java
Expand Up @@ -58,10 +58,12 @@ public class plasmaCrawlBalancer {

// stack of url-hash rows, opened from the stack file in the constructor; get() pops its entries from here
private kelondroStack stack;
// NOTE(review): presumably holds per-domain stacks keyed by domain - confirm against add()/flushOnce();
// this map is also the monitor that get() synchronizes on
private HashMap domainStacks;
// maps a domain hash (String) to the time (Long, from System.currentTimeMillis()) at which get()
// last recorded an access for that domain
private HashMap domainAccess;

public plasmaCrawlBalancer(File stackFile) {
stack = kelondroStack.open(stackFile, new kelondroRow("byte[] urlhash-" + yacySeedDB.commonHashLength, kelondroBase64Order.enhancedCoder, 0));
domainStacks = new HashMap();
domainAccess = new HashMap();
}

public void close() {
Expand Down Expand Up @@ -135,17 +137,36 @@ public void add(String domain, byte[] hash) throws IOException {
}
}

public byte[] get() throws IOException {
public String get(long minimumDelta) throws IOException {
// returns an url-hash from the stack
synchronized (domainStacks) {
String entry = null;
if (stack.size() > 0) {
return stack.pop().getColBytes(0);
entry = new String(stack.pop().getColBytes(0));
} else if (domainStacks.size() > 0) {
flushOnce();
return stack.pop().getColBytes(0);
} else {
return null;
entry = new String(stack.pop().getColBytes(0));
}
if ((minimumDelta > 0) && (entry != null)) {
// check if the time after retrieval of last hash from same
// domain is not shorter than the minimumDelta
String domhash = entry.substring(6);
Long lastAccess = (Long) domainAccess.get(domhash);
if (lastAccess != null) {
// this is not the first access of the same domain
long la = lastAccess.longValue();
if (System.currentTimeMillis() - la > minimumDelta) {
// force a busy waiting here
// in best case, this should never happen if the balancer works propertly
// this is only to protect against the worst case, where the crawler could
// behave in a DoS-manner
long sleeptime = System.currentTimeMillis() - la - minimumDelta;
if (sleeptime > 0) try {this.wait(sleeptime);} catch (InterruptedException e) {}
}
}
domainAccess.put(domhash, new Long(System.currentTimeMillis()));
}
return entry;
}
}

Expand Down
3 changes: 2 additions & 1 deletion source/de/anomic/plasma/plasmaCrawlNURL.java
Expand Up @@ -76,6 +76,7 @@ public class plasmaCrawlNURL {
public static final int STACK_TYPE_MOVIE = 12; // put on movie stack
public static final int STACK_TYPE_MUSIC = 13; // put on music stack

// Minimum time in milliseconds between two accesses of the same domain;
// passed to plasmaCrawlBalancer.get(long) when an entry is popped from a balancer.
private static final long minimumDelta = 500; // the minimum time difference between access of the same domain
/**
* column length definition for the {@link plasmaURL#urlIndexFile} DB
*/
Expand Down Expand Up @@ -432,7 +433,7 @@ private Entry pop(kelondroStack stack) throws IOException {
private Entry pop(plasmaCrawlBalancer balancer) throws IOException {
// this is a filo - pop
if (balancer.size() > 0) {
String hash = new String(balancer.get());
String hash = balancer.get(minimumDelta);
if (hash == null) throw new IOException("hash is null");
Entry e = new Entry(hash);
stackIndex.remove(e.hash);
Expand Down

0 comments on commit 8c1d2e0

Please sign in to comment.