Enhanced snippet generation (handles the case where a snippet is too long)

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@350 6c8d7289-2bf4-0310-a012-ef5d649a1542
yacy · Jun 30, 2005 · 75ebdbc · 75ebdbc
1 parent 35c7a58
commit 75ebdbc
Show file tree

Hide file tree

Showing 2 changed files with 57 additions and 16 deletions.
diff --git a/source/de/anomic/plasma/plasmaSnippetCache.java b/source/de/anomic/plasma/plasmaSnippetCache.java
@@ -155,7 +155,7 @@ public result retrieve(URL url, Set queryhashes, boolean fetchOnline) {
         }
 
         // we have found a parseable non-empty file: use the lines
-        line = computeSnippet(sentences, queryhashes, 12 * queryhashes.size(), 120);
+        line = computeSnippet(sentences, queryhashes, 8 + 6 * queryhashes.size(), 120);
         //System.out.println("loaded snippet for url " + url + ": " + line);
         if (line == null) return new result(null, ERROR_NO_MATCH, "no matching snippet found");
         if (line.length() > 120) line = line.substring(0, 120);
@@ -202,13 +202,19 @@ private String computeSnippet(String[] sentences, Set queryhashes, int minLength
         if ((queryhashes == null) || (queryhashes.size() == 0)) return null;
         kelondroMScoreCluster hitTable = new kelondroMScoreCluster();
         Iterator j;
-        HashSet hs;
+        HashMap hs;
+        String hash;
         for (int i = 0; i < sentences.length; i++) {
-            if ((sentences[i].length() > minLength) && (sentences[i].length() < maxLength)) {
+            System.out.println("Sentence " + i + ": " + sentences[i]);
+            if (sentences[i].length() > minLength) {
                 hs = hashSentence(sentences[i]);
                 j = queryhashes.iterator();
                 while (j.hasNext()) {
-                    if (hs.contains((String) j.next())) hitTable.incScore(new Integer(i));
+                    hash = (String) j.next();
+                    if (hs.containsKey(hash)) {
+                        System.out.println("hash " + hash + " appears in line " + i);
+			hitTable.incScore(new Integer(i));
+                    }
                 }
             }
         }
@@ -227,26 +233,59 @@ private String computeSnippet(String[] sentences, Set queryhashes, int minLength
         }
         // find a first result
         String result = sentences[shortLineIndex];
-        if (score == queryhashes.size()) return result;
-        // the result has not all words in it.
-        // find another sentence that represents the missing other words
-        // first remove all words that appear in the result from the queryhashes
+        // remove all hashes that appear in the result
         hs = hashSentence(result);
         j = queryhashes.iterator();
+        Integer pos;
+        int p, minpos = maxLength, maxpos = -1;
         while (j.hasNext()) {
-            if (hs.contains((String) j.next())) j.remove();
+            pos = (Integer) hs.get((String) j.next());
+            if (pos != null) {
+                j.remove();
+                p = pos.intValue();
+                if (p > maxpos) maxpos = p;
+                if (p < minpos) minpos = p;
+            }
+        }
+        // check result size
+        maxpos = maxpos + 10;
+        if (maxpos > result.length()) maxpos = result.length();
+        if (minpos < 0) minpos = 0;
+        // we have a result, but is it short enough?
+        if (result.length() > maxLength) {
+            // trim result, 1st step (cut at right side)
+            result = result.substring(0, maxpos).trim() + " [..]";
+        }
+        if (result.length() > maxLength) {
+            // trim result, 2nd step (cut at left side)
+            result = "[..] " + result.substring(minpos).trim();
+        }
+        if (result.length() > maxLength) {
+            // trim result, 3rd step (cut in the middle)
+            result = result.substring(6, 20).trim() + " [..] " + result.substring(result.length() - 26, result.length() - 6).trim();
         }
         if (queryhashes.size() == 0) return result;
-        // now find recursively more sentences
+        // the result has not all words in it.
+        // find another sentence that represents the missing other words
+        // and find recursively more sentences
+        maxLength = maxLength - result.length();
+        if (maxLength < 20) maxLength = 20;
         String nextSnippet = computeSnippet(sentences, queryhashes, minLength, maxLength);
-        return result + ((nextSnippet == null) ? "" : (" ... " + nextSnippet));
+        return result + ((nextSnippet == null) ? "" : (" / " + nextSnippet));
     }
 
-    private HashSet hashSentence(String sentence) {
-        HashSet set = new HashSet();
+    private HashMap hashSentence(String sentence) {
+        // generates a word-wordPos mapping
+        HashMap map = new HashMap();
         Enumeration words = plasmaCondenser.wordTokenizer(sentence);
-        while (words.hasMoreElements()) set.add(plasmaWordIndexEntry.word2hash((String) words.nextElement()));
-        return set;
+        int pos = 0;
+        String word;
+        while (words.hasMoreElements()) {
+            word = (String) words.nextElement();
+            map.put(plasmaWordIndexEntry.word2hash(word), new Integer(pos));
+            pos += word.length() + 1;
+        }
+        return map;
     }
 
     public plasmaParserDocument parseDocument(URL url, byte[] resource) {

diff --git a/source/de/anomic/plasma/plasmaSwitchboard.java b/source/de/anomic/plasma/plasmaSwitchboard.java
@@ -374,7 +374,9 @@ public plasmaSwitchboard(String rootPath, String initPath, String configPath) th
 
         // test routine for snippet fetch
         // url = /www.heise.de/mobil/newsticker/meldung/mail/54980
-        //Set query = new HashSet(); query.add("0OQUNU3JSs05"); // 'heise'
+        //Set query = new HashSet();
+        //query.add(plasmaWordIndexEntry.word2hash("Weitergabe"));
+        //query.add(plasmaWordIndexEntry.word2hash("Zahl"));
         //plasmaSnippetCache.result scr = snippetCache.retrieve(new URL("http://www.heise.de/mobil/newsticker/meldung/mail/54980"), query, true);
         //plasmaSnippetCache.result scr = snippetCache.retrieve(new URL("http://www.heise.de/security/news/foren/go.shtml?read=1&msg_id=7301419&forum_id=72721"), query, true);
     }