
Reduced text snippet extraction processing time.

By no longer generating MD5 hashes for every word of indexed texts, processing
time is reduced by 30% to 50% on indexed documents with more than 1 MB
of plain text.
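
For a sense of where the time went, here is a minimal standalone sketch (not YaCy code; the class and all names are invented) contrasting digest-based word matching with the plain lowercased-string comparison the snippet extractor now uses:

import java.nio.charset.StandardCharsets;
import java.security.MessageDigest;
import java.util.Arrays;

public class HashVsStringMatch {

    // Old style (sketch): derive an MD5 digest for every word before comparing.
    static byte[] hashWord(final String word) throws Exception {
        final MessageDigest md5 = MessageDigest.getInstance("MD5");
        return md5.digest(word.toLowerCase().getBytes(StandardCharsets.UTF_8));
    }

    public static void main(final String[] args) throws Exception {
        final String[] sentence = "the quick brown fox".split(" ");
        final String query = "fox";

        // Hash-based matching: one digest computation per indexed word.
        final byte[] queryHash = hashWord(query);
        for (final String word : sentence) {
            if (Arrays.equals(hashWord(word), queryHash)) System.out.println("hash match: " + word);
        }

        // String-based matching: a plain lowercase comparison, no digests at all.
        for (final String word : sentence) {
            if (word.toLowerCase().equals(query)) System.out.println("string match: " + word);
        }
    }
}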
luccioman committed May 11, 2018
1 parent 7525594 commit e115e57cc7a3bb34406656a701f33f013380691c
@@ -350,6 +350,7 @@ public static serverObjects respond(final RequestHeader header, final serverObje
TextSnippet snippet = new TextSnippet(
sb.loader,
urlEntry,
+ goal.getIncludeWordsSet(),
goal.getIncludeHashes(),
CacheStrategy.CACHEONLY,
false,
@@ -21,36 +21,34 @@
package net.yacy.document;

import java.util.Collection;
+ import java.util.HashSet;
import java.util.Iterator;
import java.util.Map;
+ import java.util.Set;
import java.util.SortedMap;
import java.util.TreeMap;
import java.util.TreeSet;

- import net.yacy.cora.storage.HandleSet;
- import net.yacy.cora.util.ConcurrentLog;
- import net.yacy.cora.util.SpaceExceededException;
- import net.yacy.kelondro.index.RowHandleSet;

public class SnippetExtractor {

- String snippetString;
- HandleSet remainingHashes;
+ private String snippetString;
+ private Set<String> remainingTerms;

- public SnippetExtractor(final Collection<StringBuilder> sentences, final HandleSet queryhashes, int maxLength) throws UnsupportedOperationException {
+ public SnippetExtractor(final Collection<StringBuilder> sentences, final Set<String> queryTerms, int maxLength) throws UnsupportedOperationException {
if (sentences == null) throw new UnsupportedOperationException("sentence == null");
- if (queryhashes == null || queryhashes.isEmpty()) throw new UnsupportedOperationException("queryhashes == null");
- SortedMap<byte[], Integer> hs;
+ if (queryTerms == null || queryTerms.isEmpty()) throw new UnsupportedOperationException("queryTerms == null");
+ SortedMap<String, Integer> hs;
final TreeMap<Long, StringBuilder> order = new TreeMap<Long, StringBuilder>();
long uniqCounter = 999L;
Integer pos;
TreeSet<Integer> positions;
int linenumber = 0;
int fullmatchcounter = 0;
lookup: for (final StringBuilder sentence: sentences) {
- hs = WordTokenizer.hashSentence(sentence.toString(), 100);
+ hs = WordTokenizer.tokenizeSentence(sentence.toString(), 100);
positions = new TreeSet<Integer>();
- for (final byte[] word: queryhashes) {
+ for (final String word: queryTerms) {
pos = hs.get(word);
if (pos != null) {
positions.add(pos);
@@ -65,7 +63,7 @@ public SnippetExtractor(final Collection<StringBuilder> sentences, final HandleS
if (!positions.isEmpty()) {
order.put(Long.valueOf(-100000000L * (linenumber == 0 ? 1 : 0) + 10000000L * positions.size() + 1000000L * worddistance + 100000L * linelengthKey(sentence.length(), maxLength) - 10000L * linenumber + uniqCounter--), sentence);
if (order.size() > 5) order.remove(order.firstEntry().getKey());
- if (positions.size() == queryhashes.size()) fullmatchcounter++;
+ if (positions.size() == queryTerms.size()) fullmatchcounter++;
if (fullmatchcounter >= 3) break lookup;
}
linenumber++;
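
An aside on the ranking key built in the order.put(...) call above: it packs several criteria into one long, one decimal band per criterion, and the comments below are my reading of the weights (the expression itself is unchanged from the code):

final long rankingKey =
        - 100000000L * (linenumber == 0 ? 1 : 0)                    // demote the first line, typically a title
        +  10000000L * positions.size()                             // number of query terms present in the sentence
        +   1000000L * worddistance                                 // spread between the matched term positions
        +    100000L * linelengthKey(sentence.length(), maxLength)  // closeness to the desired snippet length
        -     10000L * linenumber                                   // earlier sentences are preferred
        +              uniqCounter--;                               // decreasing tie-breaker, keeps map keys unique

As long as each criterion stays within its decimal band, a stronger criterion dominates the weaker ones combined; removing order.firstEntry() whenever the map grows past five entries keeps only the five best-scoring sentences.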
@@ -76,31 +74,31 @@ public SnippetExtractor(final Collection<StringBuilder> sentences, final HandleS
while (!order.isEmpty()) {
sentence = order.remove(order.lastKey()); // sentence with the biggest score
try {
- tsr = new SnippetExtractor(sentence.toString(), queryhashes, maxLength);
+ tsr = new SnippetExtractor(sentence.toString(), queryTerms, maxLength);
} catch (final UnsupportedOperationException e) {
continue;
}
this.snippetString = tsr.snippetString;
if (this.snippetString != null && this.snippetString.length() > 0) {
- this.remainingHashes = tsr.remainingHashes;
- if (this.remainingHashes.isEmpty()) {
+ this.remainingTerms = tsr.remainingTerms;
+ if (this.remainingTerms.isEmpty()) {
// we have found the snippet
return; // finished!
- } else if (this.remainingHashes.size() < queryhashes.size()) {
+ } else if (this.remainingTerms.size() < queryTerms.size()) {
// the result has not all words in it.
// find another sentence that represents the missing other words
// and find recursively more sentences
maxLength = maxLength - this.snippetString.length();
if (maxLength < 20) maxLength = 20;
try {
- tsr = new SnippetExtractor(order.values(), this.remainingHashes, maxLength);
+ tsr = new SnippetExtractor(order.values(), this.remainingTerms, maxLength);
} catch (final UnsupportedOperationException e) {
throw e;
}
final String nextSnippet = tsr.snippetString;
if (nextSnippet == null) return;
this.snippetString = this.snippetString + (" / " + nextSnippet);
- this.remainingHashes = tsr.remainingHashes;
+ this.remainingTerms = tsr.remainingTerms;
return;
} else {
// error
@@ -120,27 +118,24 @@ private static int linelengthKey(int givenlength, int maxlength) {
return 0;
}

- private SnippetExtractor(String sentence, final HandleSet queryhashes, final int maxLength) throws UnsupportedOperationException {
+ private SnippetExtractor(String sentence, final Set<String> queryTerms, final int maxLength) throws UnsupportedOperationException {
try {
if (sentence == null) throw new UnsupportedOperationException("no sentence given");
- if (queryhashes == null || queryhashes.isEmpty()) throw new UnsupportedOperationException("queryhashes == null");
- byte[] hash;
+ if (queryTerms == null || queryTerms.isEmpty()) throw new UnsupportedOperationException("queryTerms == null");
+ String term;

// find all hashes that appear in the sentence
- final Map<byte[], Integer> hs = WordTokenizer.hashSentence(sentence, 100);
- final Iterator<byte[]> j = queryhashes.iterator();
+ final Map<String, Integer> hs = WordTokenizer.tokenizeSentence(sentence, 100);
+ final Iterator<String> j = queryTerms.iterator();
Integer pos;
int p, minpos = sentence.length(), maxpos = -1;
- final HandleSet remainingHashes = new RowHandleSet(queryhashes.keylen(), queryhashes.comparator(), 0);
+ final Set<String> remainingTerms = new HashSet<>();
while (j.hasNext()) {
- hash = j.next();
- pos = hs.get(hash);
+ term = j.next();
+ pos = hs.get(term);
if (pos == null) {
- try {
- remainingHashes.put(hash);
- } catch (final SpaceExceededException e) {
- ConcurrentLog.logException(e);
- }
+ remainingTerms.add(term);
} else {
p = pos.intValue();
if (p > maxpos) maxpos = p;
@@ -185,7 +180,7 @@ private SnippetExtractor(String sentence, final HandleSet queryhashes, final int
sentence = sentence.substring(6, 20).trim() + " [..] " + sentence.substring(sentence.length() - 26, sentence.length() - 6).trim();
}
this.snippetString = sentence;
- this.remainingHashes = remainingHashes;
+ this.remainingTerms = remainingTerms;
} catch (final IndexOutOfBoundsException e) {
throw new UnsupportedOperationException(e.getMessage());
}
@@ -195,7 +190,7 @@ public String getSnippet() {
return this.snippetString;
}

- public HandleSet getRemainingWords() {
- return this.remainingHashes;
- }
+ public Set<String> getRemainingTerms() {
+ return this.remainingTerms;
+ }
}
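
A hypothetical usage of the refactored class (sentences and terms invented; query terms are given in lowercase because tokenizeSentence lowercases every word, and the usual java.util imports are assumed):

final Collection<StringBuilder> sentences = Arrays.asList(
        new StringBuilder("YaCy is a free distributed search engine."),
        new StringBuilder("Snippets show where the query terms occur."));
final Set<String> queryTerms = new HashSet<>(Arrays.asList("distributed", "snippets"));
try {
    final SnippetExtractor extractor = new SnippetExtractor(sentences, queryTerms, 120);
    System.out.println(extractor.getSnippet());        // best-scoring sentence(s), joined with " / " if recursion was needed
    System.out.println(extractor.getRemainingTerms()); // query terms that did not make it into the snippet
} catch (final UnsupportedOperationException e) {
    // thrown when no sentences or query terms are supplied
}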
@@ -27,6 +27,7 @@
import java.util.ArrayList;
import java.util.Enumeration;
import java.util.List;
+ import java.util.Locale;
import java.util.SortedMap;
import java.util.TreeMap;

@@ -210,4 +211,34 @@ public static StringBuilder trim(final StringBuilder sb) {
words = null;
}
}

+ /**
+ * Tokenize the given sentence and generate a word-wordPos mapping
+ * @param sentence the sentence to be tokenized
+ * @param maxlength the maximum number of words to process
+ * @return an ordered map containing words as keys and positions as values. The map is ordered by words.
+ */
+ public static SortedMap<String, Integer> tokenizeSentence(final String sentence, int maxlength) {
+ final SortedMap<String, Integer> map = new TreeMap<String, Integer>();
+ WordTokenizer words = new WordTokenizer(new SentenceReader(sentence), null);
+ try {
+ int pos = 0;
+ String word;
+ Integer oldpos;
+ while (words.hasMoreElements() && maxlength-- > 0) {
+ word = words.nextElement().toString().toLowerCase(Locale.ENGLISH);
+
+ // don't overwrite an existing position; that would lead to overly large word distances
+ oldpos = map.put(word, LargeNumberCache.valueOf(pos));
+ if (oldpos != null) {
+ map.put(word, oldpos);
+ }
+
+ pos += word.length() + 1;
+ }
+ return map;
+ } finally {
+ words.close();
+ words = null;
+ }
+ }
}
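
Illustrative behavior of the new method, assuming plain space-separated words (positions are cumulative character offsets, each word contributing word.length() + 1):

final SortedMap<String, Integer> map = WordTokenizer.tokenizeSentence("The cat saw the dog", 100);
// map -> {cat=4, dog=16, saw=8, the=0}
// Keys are lowercased and sorted; the second "the" keeps the position of the
// first occurrence because existing values are deliberately not overwritten.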
@@ -251,18 +251,32 @@ public int getExcludeSize() {
}

/**
- * @return a set of words to be included in the search result
+ * @return an iterator on the set of words to be included in the search result
*/
public Iterator<String> getIncludeWords() {
return this.include_words.iterator();
}

+ /**
+ * @return a copy of the set of words to be included in the search result
+ */
+ public Set<String> getIncludeWordsSet() {
+ return new NormalizedWords(this.include_words);
+ }

/**
- * @return a set of words to be excluded in the search result
+ * @return an iterator on the set of words to be excluded from the search result
*/
public Iterator<String> getExcludeWords() {
return this.exclude_words.iterator();
}

+ /**
+ * @return a copy of the set of words to be excluded from the search result
+ */
+ public Set<String> getExcludeWordsSet() {
+ return new NormalizedWords(this.exclude_words);
+ }

/**
* @return a list of include strings which reproduces the original order of the search words and quotation
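
The copy-returning accessors matter because callers mutate the result; a short sketch of the intended contract (goal stands for any QueryGoal instance):

final Set<String> words = goal.getIncludeWordsSet(); // defensive copy, wrapped as NormalizedWords
words.removeAll(Switchboard.stopwords);              // safe: the goal's own include_words is untouched

SearchEvent below relies on exactly this when it strips stopwords from its snippetFetchWords copy.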
@@ -183,6 +183,8 @@

/** a set of word hashes that are used to match with the snippets */
private final HandleSet snippetFetchWordHashes;
+ /** a set of words that are used to match with the snippets */
+ private final Set<String> snippetFetchWords;
private final boolean deleteIfSnippetFail;
private long urlRetrievalAllTime;
private long snippetComputationAllTime;
@@ -531,7 +533,7 @@ public void run() {
this.resultList = new WeakPriorityBlockingQueue<URIMetadataNode>(Math.max(max_results_node, 10 * query.itemsPerPage()), true); // this is the result, enriched with snippets, ranked and ordered by ranking

// snippets do not need to match with the complete query hashes,
// only with the query minus the stopwords which had not been used for the search
boolean filtered = false;
// check if query contains stopword
if (Switchboard.stopwordHashes != null) {
@@ -547,6 +549,10 @@ public void run() {
if (filtered) { // remove stopwords
this.snippetFetchWordHashes.excludeDestructive(Switchboard.stopwordHashes);
}

+ this.snippetFetchWords = query.getQueryGoal().getIncludeWordsSet();
+ // remove stopwords
+ this.snippetFetchWords.removeAll(Switchboard.stopwords);

// clean up events
SearchEventCache.cleanupEvents(false);
@@ -1877,6 +1883,7 @@ private boolean drainSolrStackToResult(boolean concurrentSnippetFetch) {
final TextSnippet solrsnippet = new TextSnippet(node.url(), OpensearchResponseWriter.getLargestSnippet(solrsnippetlines), true, ResultClass.SOURCE_SOLR, "");
final TextSnippet yacysnippet = new TextSnippet(this.loader,
node,
+ this.query.getQueryGoal().getIncludeWordsSet(),
this.query.getQueryGoal().getIncludeHashes(),
CacheStrategy.CACHEONLY,
false,
@@ -2000,6 +2007,7 @@ public URIMetadataNode getSnippet(URIMetadataNode page, final CacheStrategy cach
final TextSnippet snippet = new TextSnippet(
null,
page,
+ this.snippetFetchWords,
this.snippetFetchWordHashes,
null,
((this.query.constraint != null) && (this.query.constraint.get(Tokenizer.flag_cat_indexof))),
@@ -2016,6 +2024,7 @@ public URIMetadataNode getSnippet(URIMetadataNode page, final CacheStrategy cach
final TextSnippet snippet = new TextSnippet(
this.loader,
page,
+ this.snippetFetchWords,
this.snippetFetchWordHashes,
cacheStrategy,
((this.query.constraint != null) && (this.query.constraint.get(Tokenizer.flag_cat_indexof))),
@@ -2032,7 +2041,7 @@ public URIMetadataNode getSnippet(URIMetadataNode page, final CacheStrategy cach
return page.makeResultEntry(this.query.getSegment(), this.peers, null); // result without snippet
} else {
// problems with snippet fetch
- if (this.snippetFetchWordHashes.has(Segment.catchallHash)) {
+ if (this.snippetFetchWords.contains(Segment.catchallString)) {
// we accept that because the word cannot be on the page
return page.makeResultEntry(this.query.getSegment(), this.peers, null);
}
