From ea478f397595c9eef24e52eae2776fd96f49bd03 Mon Sep 17 00:00:00 2001 From: orbiter Date: Wed, 11 May 2005 23:42:40 +0000 Subject: [PATCH] enhanced indexing-caching git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@107 6c8d7289-2bf4-0310-a012-ef5d649a1542 --- htroot/Performance_p.html | 10 +++ htroot/Performance_p.java | 1 + .../kelondro/kelondroMScoreCluster.java | 36 ++++++++-- source/de/anomic/plasma/plasmaWordIndex.java | 6 +- .../anomic/plasma/plasmaWordIndexCache.java | 69 +++++++++++++------ source/de/anomic/yacy/yacySearch.java | 9 ++- yacy.parser | 2 +- 7 files changed, 100 insertions(+), 33 deletions(-) diff --git a/htroot/Performance_p.html b/htroot/Performance_p.html index 5159fe7217..9b965d9675 100644 --- a/htroot/Performance_p.html +++ b/htroot/Performance_p.html @@ -81,6 +81,13 @@

Performance

If this is a big number, it shows that the caching works efficiently. + + Singletons Cache Size: + #[singletonsSize]# + + The Singletons Cache is a database that holds words that occurred only once. + + Maximum number of Word Caches: @@ -90,6 +97,7 @@

Performance

flushed to disc; this may last some minutes. + Changes take effect immediately +

diff --git a/htroot/Performance_p.java b/htroot/Performance_p.java index e4b5775c86..85182f4c61 100644 --- a/htroot/Performance_p.java +++ b/htroot/Performance_p.java @@ -146,6 +146,7 @@ public static serverObjects respond(httpHeader header, serverObjects post, serve prop.put("maxURLinWordCache", "" + switchboard.wordIndex.maxURLinWordCache()); prop.put("maxWaitingWordFlush", switchboard.getConfig("maxWaitingWordFlush", "180")); prop.put("wordCacheMax", switchboard.getConfig("wordCacheMax", "10000")); + prop.put("singletonsSize", switchboard.wordIndex.singletonsSize()); // return rewrite values for templates return prop; diff --git a/source/de/anomic/kelondro/kelondroMScoreCluster.java b/source/de/anomic/kelondro/kelondroMScoreCluster.java index 21a787372e..9571942a11 100644 --- a/source/de/anomic/kelondro/kelondroMScoreCluster.java +++ b/source/de/anomic/kelondro/kelondroMScoreCluster.java @@ -44,6 +44,7 @@ import java.text.SimpleDateFormat; import java.util.Iterator; import java.util.TreeMap; +import java.util.Map; public class kelondroMScoreCluster { @@ -243,21 +244,22 @@ public Object[] getScores(int maxCount, boolean up, int minScore, int maxScore) } public Iterator scores(boolean up) { - return scores(up, Integer.MIN_VALUE, Integer.MAX_VALUE); + if (up) return new simpleScoreIterator(); + else return scores(false, Integer.MIN_VALUE, Integer.MAX_VALUE); } public Iterator scores(boolean up, int minScore, int maxScore) { - return new scoreIterator(up, minScore, maxScore); + return new komplexScoreIterator(up, minScore, maxScore); } - private class scoreIterator implements Iterator { + private class komplexScoreIterator implements Iterator { boolean up; TreeMap keyrefDBcopy; Object n; int min, max; - public scoreIterator(boolean up, int minScore, int maxScore) { + public komplexScoreIterator(boolean up, int minScore, int maxScore) { this.up = up; this.min = minScore; this.max = maxScore; @@ -299,7 +301,31 @@ public void remove() { } - public static void 
main(String[] args) { + private class simpleScoreIterator implements Iterator { + + Iterator ii; + Map.Entry entry; + + public simpleScoreIterator() { + ii = keyrefDB.entrySet().iterator(); + } + + public boolean hasNext() { + return ii.hasNext(); + } + + public Object next() { + entry = (Map.Entry) ii.next(); + return entry.getValue(); + } + + public void remove() { + ii.remove(); + } + + } + + public static void main(String[] args) { System.out.println("Test for Score: start"); kelondroMScoreCluster s = new kelondroMScoreCluster(); int c = 0; diff --git a/source/de/anomic/plasma/plasmaWordIndex.java b/source/de/anomic/plasma/plasmaWordIndex.java index fd7a70c7ac..373e14748a 100644 --- a/source/de/anomic/plasma/plasmaWordIndex.java +++ b/source/de/anomic/plasma/plasmaWordIndex.java @@ -65,7 +65,7 @@ public class plasmaWordIndex { public plasmaWordIndex(File databaseRoot, int bufferkb, serverLog log) throws IOException { this.databaseRoot = databaseRoot; plasmaWordIndexClassicDB fileDB = new plasmaWordIndexClassicDB(databaseRoot, log); - this.ramCache = new plasmaWordIndexCache(databaseRoot, fileDB, 1000000, log); + this.ramCache = new plasmaWordIndexCache(databaseRoot, fileDB, bufferkb, log); } public int maxURLinWordCache() { @@ -76,6 +76,10 @@ public int wordCacheRAMSize() { return ramCache.wordCacheRAMSize(); } + public int singletonsSize() { + return ramCache.singletonsSize(); + } + public void setMaxWords(int maxWords) { ramCache.setMaxWords(maxWords); } diff --git a/source/de/anomic/plasma/plasmaWordIndexCache.java b/source/de/anomic/plasma/plasmaWordIndexCache.java index 0d6da66fb4..32f43f2d17 100644 --- a/source/de/anomic/plasma/plasmaWordIndexCache.java +++ b/source/de/anomic/plasma/plasmaWordIndexCache.java @@ -82,11 +82,11 @@ public class plasmaWordIndexCache implements plasmaWordIndexInterface { for (int i = 0; i < yacySeedDB.commonHashLength; i++) maxKey += '-'; } - public plasmaWordIndexCache(File databaseRoot, plasmaWordIndexInterface backend, long 
singletonBufferSize, serverLog log) { + public plasmaWordIndexCache(File databaseRoot, plasmaWordIndexInterface backend, int singletonbufferkb, serverLog log) { // creates a new index cache // the cache has a back-end where indexes that do not fit in the cache are flushed this.databaseRoot = databaseRoot; - this.singletonBufferSize = singletonBufferSize; + this.singletonBufferSize = singletonbufferkb * 1024; this.cache = new TreeMap(); this.hashScore = new kelondroMScoreCluster(); this.hashDate = new HashMap(); @@ -132,7 +132,7 @@ private void dump(int waitingSeconds) throws IOException { long wordsPerSecond = 0, wordcount = 0, urlcount = 0; synchronized (cache) { //Iterator i = cache.entrySet().iterator(); - Iterator i = hashScore.scores(false); + Iterator i = hashScore.scores(true); //Map.Entry entry; String wordHash; plasmaWordIndexEntryContainer container; @@ -318,6 +318,10 @@ public int wordCacheRAMSize() { return cache.size(); } + public int singletonsSize() { + return singletons.size(); + } + public void setMaxWords(int maxWords) { this.maxWords = maxWords; } @@ -341,7 +345,14 @@ public Iterator wordHashes(String startWordHash, boolean up) { true); } - private int flushFromMem(String key) { + private int flushFromMem(String key, boolean reintegrate) { + // this method flushes indexes out from the ram to the disc. 
+ // at first we check the singleton database and act accordingly + // if we are to flush an index, but see also an entry in the singletons, we + // decide upon the 'reintegrate'-Flag: + // true: do not flush to disc, but re-Integrate the singleton to the RAM + // false: flush the singleton together with container to disc + plasmaWordIndexEntryContainer container = null; long time; synchronized (cache) { @@ -358,12 +369,13 @@ private int flushFromMem(String key) { // now decide where to flush that container Object[] singleton = readSingleton(key); if (singleton == null) { + // not found in singletons if (container.size() == 1) { - // store to singleton + // it is a singleton: store to singleton storeSingleton(key, container.getOne(), time); return 1; } else { - // store to back-end + // store to back-end; this should be a rare case return backend.addEntries(container, time); } } else { @@ -376,17 +388,28 @@ // it is superfluous to flush this, simple do nothing return 0; } else { - // we flush to the backend, but remove the entry from the singletons + // we flush to the backend, and remove the entry from the singletons removeSingleton(key); return backend.addEntries(container, java.lang.Math.max(time, oldTime)); } } else { - // now we have more than one entry, + // now we have more than one entry // we must remove the key from the singleton database removeSingleton(key); - // ..
and put it to the container container.add(oldEntry); - return backend.addEntries(container, java.lang.Math.max(time, oldTime)); + if (reintegrate) { + // put singleton together with container back to ram + synchronized (cache) { + cache.put(key, container); + hashScore.setScore(key, container.size()); + hashDate.put(key, new Long(time)); + } + return -1; + } else { + // add this to the backend + return backend.addEntries(container, java.lang.Math.max(time, oldTime)); + } } } } @@ -441,31 +464,35 @@ private int flushFromMemToLimit() { break; } //log.logDebug("flushing high-key " + key + ", count=" + count + ", cachesize=" + cache.size() + ", singleton-size=" + singletons.size()); - total += flushFromMem(key); + total += flushFromMem(key, false); } // flush singletons - while ((total < 200) && (hashScore.size() >= maxWords)) { - key = (String) hashScore.getMinObject(); + Iterator i = hashScore.scores(true); + ArrayList al = new ArrayList(); + while ((i.hasNext()) && (total < 200)) { + key = (String) i.next(); createTime = (Long) hashDate.get(key); count = hashScore.getScore(key); if (count > 1) { //log.logDebug("flush of singleton-key " + key + ": count too high (count=" + count + ")"); break; } - if ((createTime != null) && ((System.currentTimeMillis() - createTime.longValue()) < 9000)) { - //log.logDebug("singleton-key " + key + " is too fresh, interruptiong flush (count=" + count + ", cachesize=" + cache.size() + ", singleton-size=" + singletons.size() + ")"); - break; + if ((createTime != null) && ((System.currentTimeMillis() - createTime.longValue()) < 90000)) { + //log.logDebug("singleton-key " + key + " is too fresh, interrupting flush (count=" + count + ", cachesize=" + cache.size() + ", singleton-size=" + singletons.size() + ")"); + continue; } //log.logDebug("flushing singleton-key " + key + ", count=" + count + ", cachesize=" + cache.size() + ", singleton-size=" + singletons.size()); - total += flushFromMem(key); + al.add(key); + total++; } + for (int k = 
0; k < al.size(); k++) flushFromMem((String) al.get(k), true); } return total; } public plasmaWordIndexEntity getIndex(String wordHash, boolean deleteIfEmpty) { - flushFromMem(wordHash); + flushFromMem(wordHash, false); flushFromSingleton(wordHash); return backend.getIndex(wordHash, deleteIfEmpty); } @@ -486,13 +513,13 @@ public void deleteIndex(String wordHash) { backend.deleteIndex(wordHash); } - public int removeEntries(String wordHash, String[] urlHashes, boolean deleteComplete) { - flushFromMem(wordHash); + public synchronized int removeEntries(String wordHash, String[] urlHashes, boolean deleteComplete) { + flushFromMem(wordHash, false); flushFromSingleton(wordHash); return backend.removeEntries(wordHash, urlHashes, deleteComplete); } - public int addEntries(plasmaWordIndexEntryContainer container, long creationTime) { + public synchronized int addEntries(plasmaWordIndexEntryContainer container, long creationTime) { //serverLog.logDebug("PLASMA INDEXING", "addEntryToIndexMem: cache.size=" + cache.size() + "; hashScore.size=" + hashScore.size()); flushFromMemToLimit(); //if (flushc > 0) serverLog.logDebug("PLASMA INDEXING", "addEntryToIndexMem - flushed " + flushc + " entries"); diff --git a/source/de/anomic/yacy/yacySearch.java b/source/de/anomic/yacy/yacySearch.java index d4f005aa56..fe41674617 100644 --- a/source/de/anomic/yacy/yacySearch.java +++ b/source/de/anomic/yacy/yacySearch.java @@ -101,16 +101,15 @@ private static yacySeed[] selectPeers(Set wordhashes, int seedcount) { int c; while (i.hasNext()) { dhtEnum = yacyCore.dhtAgent.getDHTSeeds(true, (String) i.next()); - c = seedcount; - while ((dhtEnum.hasMoreElements()) && (c > 0)) { + c = 0; + while ((dhtEnum.hasMoreElements()) && (c < seedcount)) { seed = (yacySeed) dhtEnum.nextElement(); - ranking.addScore(seed.hash, c); - c--; + ranking.addScore(seed.hash, c++); } } if (ranking.size() < seedcount) seedcount = ranking.size(); yacySeed[] result = new yacySeed[seedcount]; - Iterator e = 
ranking.scores(false); + Iterator e = ranking.scores(true); c = 0; while ((e.hasNext()) && (c < result.length)) result[c++] = yacyCore.seedDB.getConnected((String) e.next()); diff --git a/yacy.parser b/yacy.parser index 8b244b04b2..1b369b90f3 100644 --- a/yacy.parser +++ b/yacy.parser @@ -1,2 +1,2 @@ #plasmaParser configuration file -#Wed May 11 17:48:25 CEST 2005 +#Thu May 12 01:40:28 CEST 2005