Skip to content

Commit

Permalink
enhanced snippet-generation (case where snippet is too long)
Browse files Browse the repository at this point in the history
git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@350 6c8d7289-2bf4-0310-a012-ef5d649a1542
  • Loading branch information
orbiter committed Jun 30, 2005
1 parent 35c7a58 commit 75ebdbc
Show file tree
Hide file tree
Showing 2 changed files with 57 additions and 16 deletions.
69 changes: 54 additions & 15 deletions source/de/anomic/plasma/plasmaSnippetCache.java
Expand Up @@ -155,7 +155,7 @@ public result retrieve(URL url, Set queryhashes, boolean fetchOnline) {
}

// we have found a parseable non-empty file: use the lines
line = computeSnippet(sentences, queryhashes, 12 * queryhashes.size(), 120);
line = computeSnippet(sentences, queryhashes, 8 + 6 * queryhashes.size(), 120);
//System.out.println("loaded snippet for url " + url + ": " + line);
if (line == null) return new result(null, ERROR_NO_MATCH, "no matching snippet found");
if (line.length() > 120) line = line.substring(0, 120);
Expand Down Expand Up @@ -202,13 +202,19 @@ private String computeSnippet(String[] sentences, Set queryhashes, int minLength
if ((queryhashes == null) || (queryhashes.size() == 0)) return null;
kelondroMScoreCluster hitTable = new kelondroMScoreCluster();
Iterator j;
HashSet hs;
HashMap hs;
String hash;
for (int i = 0; i < sentences.length; i++) {
if ((sentences[i].length() > minLength) && (sentences[i].length() < maxLength)) {
System.out.println("Sentence " + i + ": " + sentences[i]);
if (sentences[i].length() > minLength) {
hs = hashSentence(sentences[i]);
j = queryhashes.iterator();
while (j.hasNext()) {
if (hs.contains((String) j.next())) hitTable.incScore(new Integer(i));
hash = (String) j.next();
if (hs.containsKey(hash)) {
System.out.println("hash " + hash + " appears in line " + i);
hitTable.incScore(new Integer(i));
}
}
}
}
Expand All @@ -227,26 +233,59 @@ private String computeSnippet(String[] sentences, Set queryhashes, int minLength
}
// find a first result
String result = sentences[shortLineIndex];
if (score == queryhashes.size()) return result;
// the result has not all words in it.
// find another sentence that represents the missing other words
// first remove all words that appear in the result from the queryhashes
// remove all hashes that appear in the result
hs = hashSentence(result);
j = queryhashes.iterator();
Integer pos;
int p, minpos = maxLength, maxpos = -1;
while (j.hasNext()) {
if (hs.contains((String) j.next())) j.remove();
pos = (Integer) hs.get((String) j.next());
if (pos != null) {
j.remove();
p = pos.intValue();
if (p > maxpos) maxpos = p;
if (p < minpos) minpos = p;
}
}
// check result size
maxpos = maxpos + 10;
if (maxpos > result.length()) maxpos = result.length();
if (minpos < 0) minpos = 0;
// we have a result, but is it short enough?
if (result.length() > maxLength) {
// trim result, 1st step (cut at right side)
result = result.substring(0, maxpos).trim() + " [..]";
}
if (result.length() > maxLength) {
// trim result, 2nd step (cut at left side)
result = "[..] " + result.substring(minpos).trim();
}
if (result.length() > maxLength) {
// trim result, 3rd step (cut in the middle)
result = result.substring(6, 20).trim() + " [..] " + result.substring(result.length() - 26, result.length() - 6).trim();
}
if (queryhashes.size() == 0) return result;
// now find recursively more sentences
// the result does not contain all of the query words.
// find another sentence that covers the missing words
// and recursively find more sentences
maxLength = maxLength - result.length();
if (maxLength < 20) maxLength = 20;
String nextSnippet = computeSnippet(sentences, queryhashes, minLength, maxLength);
return result + ((nextSnippet == null) ? "" : (" ... " + nextSnippet));
return result + ((nextSnippet == null) ? "" : (" / " + nextSnippet));
}

private HashSet hashSentence(String sentence) {
HashSet set = new HashSet();
private HashMap hashSentence(String sentence) {
// generates a mapping from each word's hash to that word's position (running sum of the preceding word lengths, plus one per word)
HashMap map = new HashMap();
Enumeration words = plasmaCondenser.wordTokenizer(sentence);
while (words.hasMoreElements()) set.add(plasmaWordIndexEntry.word2hash((String) words.nextElement()));
return set;
int pos = 0;
String word;
while (words.hasMoreElements()) {
word = (String) words.nextElement();
map.put(plasmaWordIndexEntry.word2hash(word), new Integer(pos));
pos += word.length() + 1;
}
return map;
}

public plasmaParserDocument parseDocument(URL url, byte[] resource) {
Expand Down
4 changes: 3 additions & 1 deletion source/de/anomic/plasma/plasmaSwitchboard.java
Expand Up @@ -374,7 +374,9 @@ public plasmaSwitchboard(String rootPath, String initPath, String configPath) th

// test routine for snippet fetch
// url = /www.heise.de/mobil/newsticker/meldung/mail/54980
//Set query = new HashSet(); query.add("0OQUNU3JSs05"); // 'heise'
//Set query = new HashSet();
//query.add(plasmaWordIndexEntry.word2hash("Weitergabe"));
//query.add(plasmaWordIndexEntry.word2hash("Zahl"));
//plasmaSnippetCache.result scr = snippetCache.retrieve(new URL("http://www.heise.de/mobil/newsticker/meldung/mail/54980"), query, true);
//plasmaSnippetCache.result scr = snippetCache.retrieve(new URL("http://www.heise.de/security/news/foren/go.shtml?read=1&msg_id=7301419&forum_id=72721"), query, true);
}
Expand Down

0 comments on commit 75ebdbc

Please sign in to comment.