Skip to content

Commit

Permalink
- patch for bad web structure dumps
Browse files Browse the repository at this point in the history
- added automatic slowdown of accesses to specific domains when access to a web page fails

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@5765 6c8d7289-2bf4-0310-a012-ef5d649a1542
  • Loading branch information
orbiter committed Apr 1, 2009
1 parent 0139988 commit b6c2167
Show file tree
Hide file tree
Showing 3 changed files with 31 additions and 1 deletion.
16 changes: 16 additions & 0 deletions source/de/anomic/crawler/Latency.java
Expand Up @@ -45,6 +45,17 @@ public static void update(String hosthash, String host, long time) {
}
}

/**
 * Records a failed access for a host so that later accesses to it are slowed down.
 *
 * If the map already holds a {@code Host} entry for {@code hosthash}, that entry's
 * {@link Host#slowdown()} is invoked. Otherwise a new entry is created for
 * {@code host} and stored under {@code hosthash}.
 *
 * @param hosthash key of the host entry; asserted to be exactly 6 characters long
 * @param host     host name handed to the {@code Host} constructor when no entry exists yet
 */
public static void slowdown(String hosthash, String host) {
    assert hosthash.length() == 6;
    final Host known = map.get(hosthash);
    if (known != null) {
        // host is already tracked: let the entry penalize itself
        known.slowdown();
        return;
    }
    // first contact with this host is a failure: start with a fixed value of 3000
    // NOTE(review): presumably an initial latency in milliseconds - confirm against the Host constructor
    map.put(hosthash, new Host(host, 3000));
}

public static Host host(String hosthash) {
assert hosthash.length() == 6;
return map.get(hosthash);
Expand Down Expand Up @@ -172,6 +183,11 @@ public void update(long time) {
this.timeacc += time;
this.count++;
}
/**
 * Penalizes this host entry after a failed access.
 *
 * Stamps the last access with the current time, replaces the accumulated time
 * with five times the current {@code average()}, capped at 60000, and resets
 * the counter to 1.
 * NOTE(review): presumably the reset to 1 makes {@code average()} report the
 * penalized value directly - confirm against {@code average()}.
 */
public void slowdown() {
    lastacc = System.currentTimeMillis();
    // average() must be evaluated before count is reset below
    timeacc = Math.min(average() * 5, 60000);
    count = 1;
}
/**
 * Returns the current value of this entry's counter, which {@code update(long)}
 * increments and {@code slowdown()} resets to 1.
 *
 * @return the current count
 */
public int count() {
    return count;
}
Expand Down
8 changes: 8 additions & 0 deletions source/de/anomic/http/httpClient.java
Expand Up @@ -59,8 +59,10 @@
import org.apache.commons.httpclient.protocol.Protocol;
import org.apache.commons.httpclient.protocol.ProtocolSocketFactory;

import de.anomic.crawler.Latency;
import de.anomic.kelondro.order.Base64Order;
import de.anomic.kelondro.util.Log;
import de.anomic.yacy.yacyURL;

/**
* HttpClient implementation which uses Jakarta Commons HttpClient 3.x (<a href="http://hc.apache.org/httpclient-3.x/">http://hc.apache.org/httpclient-3.x/</a>)
Expand Down Expand Up @@ -448,14 +450,20 @@ private httpResponse execute(final HttpMethod method) throws IOException {
}
} catch (final IllegalThreadStateException e) {
// cleanUp statistics
yacyURL url = new yacyURL(method.getURI().toString(), null);
Latency.slowdown(url.hash().substring(6), url.getHost());
HttpConnectionInfo.removeConnection(generateConInfo(method));
throw e;
} catch (final IOException e) {
// cleanUp statistics
yacyURL url = new yacyURL(method.getURI().toString(), null);
Latency.slowdown(url.hash().substring(6), url.getHost());
HttpConnectionInfo.removeConnection(generateConInfo(method));
throw e;
} catch (final IllegalStateException e) {
// cleanUp statistics
yacyURL url = new yacyURL(method.getURI().toString(), null);
Latency.slowdown(url.hash().substring(6), url.getHost());
HttpConnectionInfo.removeConnection(generateConInfo(method));
throw new IOException(e.getMessage());
}
Expand Down
8 changes: 7 additions & 1 deletion source/de/anomic/plasma/plasmaWebStructure.java
Expand Up @@ -184,9 +184,15 @@ static Map<String, Integer> refstr2map(final String refs) {
final Map<String, Integer> map = new HashMap<String, Integer>();
String c;
final int refsc = refstr2count(refs);
int d;
for (int i = 0; i < refsc; i++) {
c = refs.substring(8 + i * 10, 8 + (i + 1) * 10);
map.put(c.substring(0, 6), Integer.valueOf(c.substring(6), 16));
try {
d = Integer.valueOf(c.substring(6), 16);
} catch (NumberFormatException e) {
d = 1;
}
map.put(c.substring(0, 6), d);
}
return map;
}
Expand Down

0 comments on commit b6c2167

Please sign in to comment.