Skip to content

Commit

Permalink
apply blacklist on rwis during dht receive
Browse files Browse the repository at this point in the history
very experimental!

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@1865 6c8d7289-2bf4-0310-a012-ef5d649a1542
  • Loading branch information
orbiter committed Mar 9, 2006
1 parent 915812f commit f188611
Show file tree
Hide file tree
Showing 4 changed files with 56 additions and 18 deletions.
11 changes: 5 additions & 6 deletions htroot/yacy/transferRWI.java
Expand Up @@ -126,19 +126,18 @@ public static serverObjects respond(httpHeader header, serverObjects post, serve
wordhashes[received] = wordHash;
entry = new plasmaWordIndexEntry(estring.substring(p));
sb.wordIndex.addEntry(wordHash, entry, System.currentTimeMillis(), true);
//sb.wordIndex.addEntries(plasmaWordIndexEntryContainer.instantContainer(wordHash, System.currentTimeMillis(), entry), System.currentTimeMillis(), true);
serverCore.checkInterruption();

urlHash = entry.getUrlHash();
try {
if (
(!(unknownURL.contains(urlHash))) &&
(!(sb.urlPool.loadedURL.exists(urlHash)))
) {
if ((!(unknownURL.contains(urlHash))) &&
(!(sb.urlPool.loadedURL.exists(urlHash)))) {
unknownURL.add(urlHash);
}
} catch (Exception ex) {
sb.getLog().logWarning("transferRWI: DB-Error while trying to determine if URL with hash '" + urlHash + "' is known.",ex);
sb.getLog().logWarning(
"transferRWI: DB-Error while trying to determine if URL with hash '" +
urlHash + "' is known.", ex);
unknownURL.add(urlHash);
}
received++;
Expand Down
15 changes: 8 additions & 7 deletions htroot/yacy/transferURL.java
Expand Up @@ -95,16 +95,17 @@ public static serverObjects respond(httpHeader header, serverObjects post, serve
} else {
lEntry = sb.urlPool.loadedURL.newEntry(urls, true);
if ((lEntry != null) && (lEntry.url() != null)) {
if (
(blockBlacklist) &&
(plasmaSwitchboard.urlBlacklist.isListed(lEntry.url().getHost().toLowerCase(), lEntry.url().getPath()))
){
yacyCore.log.logFine("transferURL: blocked blacklisted URL '" + lEntry.url() + "' from peer " + otherPeerName);
if ((blockBlacklist) &&
(plasmaSwitchboard.urlBlacklist.isListed( lEntry.url().getHost().toLowerCase(), lEntry.url().getPath()))) {
int deleted = sb.wordIndex.tryRemoveURLs(lEntry.hash());
yacyCore.log.logFine("transferURL: blocked blacklisted URL '" + lEntry.url() + "' from peer " + otherPeerName + "; deleted " + deleted + " URL entries from RWIs");
lEntry = null;
} else {
sb.urlPool.loadedURL.addEntry(lEntry, iam, iam, 3);
yacyCore.log.logFine("transferURL: received URL '" + lEntry.url() + "' from peer " + otherPeerName);
received++;
yacyCore.log.logFine("transferURL: received URL '"
+ lEntry.url() + "' from peer "
+ otherPeerName);
received++;
}
} else {
yacyCore.log.logWarning("transferURL: received invalid URL from peer " + otherPeerName +
Expand Down
8 changes: 8 additions & 0 deletions source/de/anomic/plasma/plasmaWordIndex.java
Expand Up @@ -358,6 +358,14 @@ public synchronized int removeEntries(String wordHash, String[] urlHashes, boole
return removed;
}

public synchronized int tryRemoveURLs(String urlHash) {
    // Best-effort removal of index references to the given URL hash.
    // The work is handed over to the RAM cache only, so a reference can
    // be removed only while its entry is still held there.
    // The value returned is the number of deletions the RAM cache reports.
    final int deletions = ramCache.tryRemoveURLs(urlHash);
    return deletions;
}

public static final int RL_RAMCACHE = 0;
public static final int RL_FILECACHE = 1;
public static final int RL_ASSORTMENTS = 2;
Expand Down
40 changes: 35 additions & 5 deletions source/de/anomic/plasma/plasmaWordIndexCache.java
Expand Up @@ -61,8 +61,9 @@ public final class plasmaWordIndexCache implements plasmaWordIndexInterface {
// environment constants
private static final String indexArrayFileName = "indexDump1.array";
public static final int ramCacheReferenceLimit = 50;
public static final long ramCacheAgeLimit = 60 * 60 * 2 * 1000; // milliseconds; 2 Hours

public static final long ramCacheMaxAge = 1000 * 60 * 60 * 2; // milliseconds; 2 Hours
public static final long ramCacheMinAge = 1000 * 60 * 2; // milliseconds; 2 Minutes (Karenz for DHT Receive)

// class variables
private final File databaseRoot;
private final TreeMap cache;
Expand Down Expand Up @@ -257,12 +258,13 @@ public String bestFlushWordHash() {
String hash = null;
int count = hashScore.getMaxScore();
if ((count > ramCacheReferenceLimit) &&
((hash = (String) hashScore.getMaxObject()) != null)) {
// flush high-score entries
((hash = (String) hashScore.getMaxObject()) != null) &&
(System.currentTimeMillis() - longEmit(hashDate.getScore(hash)) > ramCacheMinAge)) {
// flush high-score entries, but not if they are too 'young'
return hash;
}
long oldestTime = longEmit(hashDate.getMinScore());
if (((System.currentTimeMillis() - oldestTime) > ramCacheAgeLimit) &&
if (((System.currentTimeMillis() - oldestTime) > ramCacheMaxAge) &&
((hash = (String) hashDate.getMinObject()) != null)) {
// flush out-dated entries
return hash;
Expand All @@ -271,6 +273,10 @@ public String bestFlushWordHash() {
if (Runtime.getRuntime().freeMemory() < 10000000) {
// low-memory case
hash = (String) hashScore.getMaxObject(); // flush high-score entries (saves RAM)
if (System.currentTimeMillis() - longEmit(hashDate.getScore(hash)) < ramCacheMinAge) {
// too young, take it from the oldest entries
hash = (String) hashDate.getMinObject();
}
} else {
// not-efficient-so-far case
hash = (String) hashDate.getMinObject(); // flush oldest entries (makes indexing faster)
Expand Down Expand Up @@ -335,6 +341,30 @@ public int removeEntries(String wordHash, String[] urlHashes, boolean deleteComp
return count;
}

public synchronized int tryRemoveURLs(String urlHash) {
    // Tries to delete references to the given urlHash from the word
    // containers held in this cache. Only containers whose date score is
    // at most ramCacheMinAge milliseconds old are examined.
    // Returns the number of containers from which a reference was removed.
    //
    // NOTE(review): the early return below assumes hashDate.scores(false)
    // iterates from the newest entry to the oldest -- confirm against the
    // score-cluster implementation.
    Iterator i = hashDate.scores(false);
    String wordHash;
    long t;
    plasmaWordIndexEntryContainer c;
    int delCount = 0;
    while (i.hasNext()) {
        wordHash = (String) i.next();
        // check time: the first entry older than the minimum age ends the scan
        t = longEmit(hashDate.getScore(wordHash));
        if (System.currentTimeMillis() - t > ramCacheMinAge) return delCount;
        // get container; a word hash that is dated but has no container in
        // the cache is skipped instead of causing a NullPointerException
        c = (plasmaWordIndexEntryContainer) cache.get(wordHash);
        if (c == null) continue;
        if (c.remove(urlHash) != null) {
            // write the modified container back into the cache
            cache.put(wordHash, c);
            delCount++;
        }
    }
    return delCount;
}

public int addEntries(plasmaWordIndexEntryContainer container, long updateTime, boolean highPriority) {
// this puts the entries into the cache, not into the assortment directly

Expand Down

0 comments on commit f188611

Please sign in to comment.