From ea478f397595c9eef24e52eae2776fd96f49bd03 Mon Sep 17 00:00:00 2001 From: orbiter Date: Wed, 11 May 2005 23:42:40 +0000 Subject: [PATCH] enhanced indexing-caching git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@107 6c8d7289-2bf4-0310-a012-ef5d649a1542 --- htroot/Performance_p.html | 10 +++ htroot/Performance_p.java | 1 + .../kelondro/kelondroMScoreCluster.java | 36 ++++++++-- source/de/anomic/plasma/plasmaWordIndex.java | 6 +- .../anomic/plasma/plasmaWordIndexCache.java | 69 +++++++++++++------ source/de/anomic/yacy/yacySearch.java | 9 ++- yacy.parser | 2 +- 7 files changed, 100 insertions(+), 33 deletions(-) diff --git a/htroot/Performance_p.html b/htroot/Performance_p.html index 5159fe7217..9b965d9675 100644 --- a/htroot/Performance_p.html +++ b/htroot/Performance_p.html @@ -81,6 +81,13 @@

Performance

If this is a big number, it shows that the caching works efficiently. + + Singletons Cache Size: + #[singletonsSize]# + + The Singletons Cache is a database that holds words that occurred only once. + + Maximum number of Word Caches: @@ -90,6 +97,7 @@

Performance

flushed to disc; this may last some minutes. + Changes take effect immediately +

diff --git a/htroot/Performance_p.java b/htroot/Performance_p.java index e4b5775c86..85182f4c61 100644 --- a/htroot/Performance_p.java +++ b/htroot/Performance_p.java @@ -146,6 +146,7 @@ public static serverObjects respond(httpHeader header, serverObjects post, serve prop.put("maxURLinWordCache", "" + switchboard.wordIndex.maxURLinWordCache()); prop.put("maxWaitingWordFlush", switchboard.getConfig("maxWaitingWordFlush", "180")); prop.put("wordCacheMax", switchboard.getConfig("wordCacheMax", "10000")); + prop.put("singletonsSize", switchboard.wordIndex.singletonsSize()); // return rewrite values for templates return prop; diff --git a/source/de/anomic/kelondro/kelondroMScoreCluster.java b/source/de/anomic/kelondro/kelondroMScoreCluster.java index 21a787372e..9571942a11 100644 --- a/source/de/anomic/kelondro/kelondroMScoreCluster.java +++ b/source/de/anomic/kelondro/kelondroMScoreCluster.java @@ -44,6 +44,7 @@ import java.text.SimpleDateFormat; import java.util.Iterator; import java.util.TreeMap; +import java.util.Map; public class kelondroMScoreCluster { @@ -243,21 +244,22 @@ public Object[] getScores(int maxCount, boolean up, int minScore, int maxScore) } public Iterator scores(boolean up) { - return scores(up, Integer.MIN_VALUE, Integer.MAX_VALUE); + if (up) return new simpleScoreIterator(); + else return scores(false, Integer.MIN_VALUE, Integer.MAX_VALUE); } public Iterator scores(boolean up, int minScore, int maxScore) { - return new scoreIterator(up, minScore, maxScore); + return new komplexScoreIterator(up, minScore, maxScore); } - private class scoreIterator implements Iterator { + private class komplexScoreIterator implements Iterator { boolean up; TreeMap keyrefDBcopy; Object n; int min, max; - public scoreIterator(boolean up, int minScore, int maxScore) { + public komplexScoreIterator(boolean up, int minScore, int maxScore) { this.up = up; this.min = minScore; this.max = maxScore; @@ -299,7 +301,31 @@ public void remove() { } - public static void 
main(String[] args) { + private class simpleScoreIterator implements Iterator { + + Iterator ii; + Map.Entry entry; + + public simpleScoreIterator() { + ii = keyrefDB.entrySet().iterator(); + } + + public boolean hasNext() { + return ii.hasNext(); + } + + public Object next() { + entry = (Map.Entry) ii.next(); + return entry.getValue(); + } + + public void remove() { + ii.remove(); + } + + } + + public static void main(String[] args) { System.out.println("Test for Score: start"); kelondroMScoreCluster s = new kelondroMScoreCluster(); int c = 0; diff --git a/source/de/anomic/plasma/plasmaWordIndex.java b/source/de/anomic/plasma/plasmaWordIndex.java index fd7a70c7ac..373e14748a 100644 --- a/source/de/anomic/plasma/plasmaWordIndex.java +++ b/source/de/anomic/plasma/plasmaWordIndex.java @@ -65,7 +65,7 @@ public class plasmaWordIndex { public plasmaWordIndex(File databaseRoot, int bufferkb, serverLog log) throws IOException { this.databaseRoot = databaseRoot; plasmaWordIndexClassicDB fileDB = new plasmaWordIndexClassicDB(databaseRoot, log); - this.ramCache = new plasmaWordIndexCache(databaseRoot, fileDB, 1000000, log); + this.ramCache = new plasmaWordIndexCache(databaseRoot, fileDB, bufferkb, log); } public int maxURLinWordCache() { @@ -76,6 +76,10 @@ public int wordCacheRAMSize() { return ramCache.wordCacheRAMSize(); } + public int singletonsSize() { + return ramCache.singletonsSize(); + } + public void setMaxWords(int maxWords) { ramCache.setMaxWords(maxWords); } diff --git a/source/de/anomic/plasma/plasmaWordIndexCache.java b/source/de/anomic/plasma/plasmaWordIndexCache.java index 0d6da66fb4..32f43f2d17 100644 --- a/source/de/anomic/plasma/plasmaWordIndexCache.java +++ b/source/de/anomic/plasma/plasmaWordIndexCache.java @@ -82,11 +82,11 @@ public class plasmaWordIndexCache implements plasmaWordIndexInterface { for (int i = 0; i < yacySeedDB.commonHashLength; i++) maxKey += '-'; } - public plasmaWordIndexCache(File databaseRoot, plasmaWordIndexInterface backend, long 
singletonBufferSize, serverLog log) { + public plasmaWordIndexCache(File databaseRoot, plasmaWordIndexInterface backend, int singletonbufferkb, serverLog log) { // creates a new index cache // the cache has a back-end where indexes that do not fit in the cache are flushed this.databaseRoot = databaseRoot; - this.singletonBufferSize = singletonBufferSize; + this.singletonBufferSize = singletonbufferkb * 1024; this.cache = new TreeMap(); this.hashScore = new kelondroMScoreCluster(); this.hashDate = new HashMap(); @@ -132,7 +132,7 @@ private void dump(int waitingSeconds) throws IOException { long wordsPerSecond = 0, wordcount = 0, urlcount = 0; synchronized (cache) { //Iterator i = cache.entrySet().iterator(); - Iterator i = hashScore.scores(false); + Iterator i = hashScore.scores(true); //Map.Entry entry; String wordHash; plasmaWordIndexEntryContainer container; @@ -318,6 +318,10 @@ public int wordCacheRAMSize() { return cache.size(); } + public int singletonsSize() { + return singletons.size(); + } + public void setMaxWords(int maxWords) { this.maxWords = maxWords; } @@ -341,7 +345,14 @@ public Iterator wordHashes(String startWordHash, boolean up) { true); } - private int flushFromMem(String key) { + private int flushFromMem(String key, boolean reintegrate) { + // this method flushes indexes out from the ram to the disc. 
+ // at first we check the singleton database and act accordingly + // if we are to flush an index, but see also an entry in the singletons, we + // decide upon the 'reintegrate'-Flag: + // true: do not flush to disc, but re-Integrate the singleton to the RAM + // false: flush the singleton together with container to disc + plasmaWordIndexEntryContainer container = null; long time; synchronized (cache) { @@ -358,12 +369,13 @@ private int flushFromMem(String key) { // now decide where to flush that container Object[] singleton = readSingleton(key); if (singleton == null) { + // not found in singletons if (container.size() == 1) { - // store to singleton + // it is a singleton: store to singleton storeSingleton(key, container.getOne(), time); return 1; } else { - // store to back-end + // store to back-end; this should be a rare case return backend.addEntries(container, time); } } else { @@ -376,17 +388,28 @@ // it is superfluous to flush this, simple do nothing return 0; } else { - // we flush to the backend, but remove the entry from the singletons + // we flush to the backend, and remove the entry from the singletons removeSingleton(key); return backend.addEntries(container, java.lang.Math.max(time, oldTime)); } } else { - // now we have more than one entry, + // now we have more than one entry // we must remove the key from the singleton database removeSingleton(key); - // ..
and put it to the container container.add(oldEntry); - return backend.addEntries(container, java.lang.Math.max(time, oldTime)); + if (reintegrate) { + // put singleton together with container back to ram + synchronized (cache) { + cache.put(key, container); + hashScore.setScore(key, container.size()); + hashDate.put(key, new Long(time)); + } + return -1; + } else { + // add this to the backend + return backend.addEntries(container, java.lang.Math.max(time, oldTime)); + } } } } @@ -441,31 +464,35 @@ private int flushFromMemToLimit() { break; } //log.logDebug("flushing high-key " + key + ", count=" + count + ", cachesize=" + cache.size() + ", singleton-size=" + singletons.size()); - total += flushFromMem(key); + total += flushFromMem(key, false); } // flush singletons - while ((total < 200) && (hashScore.size() >= maxWords)) { - key = (String) hashScore.getMinObject(); + Iterator i = hashScore.scores(true); + ArrayList al = new ArrayList(); + while ((i.hasNext()) && (total < 200)) { + key = (String) i.next(); createTime = (Long) hashDate.get(key); count = hashScore.getScore(key); if (count > 1) { //log.logDebug("flush of singleton-key " + key + ": count too high (count=" + count + ")"); break; } - if ((createTime != null) && ((System.currentTimeMillis() - createTime.longValue()) < 9000)) { - //log.logDebug("singleton-key " + key + " is too fresh, interruptiong flush (count=" + count + ", cachesize=" + cache.size() + ", singleton-size=" + singletons.size() + ")"); - break; + if ((createTime != null) && ((System.currentTimeMillis() - createTime.longValue()) < 90000)) { + //log.logDebug("singleton-key " + key + " is too fresh, interrupting flush (count=" + count + ", cachesize=" + cache.size() + ", singleton-size=" + singletons.size() + ")"); + continue; } //log.logDebug("flushing singleton-key " + key + ", count=" + count + ", cachesize=" + cache.size() + ", singleton-size=" + singletons.size()); - total += flushFromMem(key); + al.add(key); + total++; } + for (int k = 
0; k < al.size(); k++) flushFromMem((String) al.get(k), true); } return total; } public plasmaWordIndexEntity getIndex(String wordHash, boolean deleteIfEmpty) { - flushFromMem(wordHash); + flushFromMem(wordHash, false); flushFromSingleton(wordHash); return backend.getIndex(wordHash, deleteIfEmpty); } @@ -486,13 +513,13 @@ public void deleteIndex(String wordHash) { backend.deleteIndex(wordHash); } - public int removeEntries(String wordHash, String[] urlHashes, boolean deleteComplete) { - flushFromMem(wordHash); + public synchronized int removeEntries(String wordHash, String[] urlHashes, boolean deleteComplete) { + flushFromMem(wordHash, false); flushFromSingleton(wordHash); return backend.removeEntries(wordHash, urlHashes, deleteComplete); } - public int addEntries(plasmaWordIndexEntryContainer container, long creationTime) { + public synchronized int addEntries(plasmaWordIndexEntryContainer container, long creationTime) { //serverLog.logDebug("PLASMA INDEXING", "addEntryToIndexMem: cache.size=" + cache.size() + "; hashScore.size=" + hashScore.size()); flushFromMemToLimit(); //if (flushc > 0) serverLog.logDebug("PLASMA INDEXING", "addEntryToIndexMem - flushed " + flushc + " entries"); diff --git a/source/de/anomic/yacy/yacySearch.java b/source/de/anomic/yacy/yacySearch.java index d4f005aa56..fe41674617 100644 --- a/source/de/anomic/yacy/yacySearch.java +++ b/source/de/anomic/yacy/yacySearch.java @@ -101,16 +101,15 @@ private static yacySeed[] selectPeers(Set wordhashes, int seedcount) { int c; while (i.hasNext()) { dhtEnum = yacyCore.dhtAgent.getDHTSeeds(true, (String) i.next()); - c = seedcount; - while ((dhtEnum.hasMoreElements()) && (c > 0)) { + c = 0; + while ((dhtEnum.hasMoreElements()) && (c < seedcount)) { seed = (yacySeed) dhtEnum.nextElement(); - ranking.addScore(seed.hash, c); - c--; + ranking.addScore(seed.hash, c++); } } if (ranking.size() < seedcount) seedcount = ranking.size(); yacySeed[] result = new yacySeed[seedcount]; - Iterator e = 
ranking.scores(false); + Iterator e = ranking.scores(true); c = 0; while ((e.hasNext()) && (c < result.length)) result[c++] = yacyCore.seedDB.getConnected((String) e.next()); diff --git a/yacy.parser b/yacy.parser index 8b244b04b2..1b369b90f3 100644 --- a/yacy.parser +++ b/yacy.parser @@ -1,2 +1,2 @@ #plasmaParser configuration file -#Wed May 11 17:48:25 CEST 2005 +#Thu May 12 01:40:28 CEST 2005