Skip to content
Permalink
Browse files

Reduced memory footprint of text snippet extraction

Instead of first parsing and storing all sentences of a document,
sentences are now parsed on the fly, and only those necessary to compute the snippet.
  • Loading branch information...
luccioman committed May 13, 2018
1 parent e115e57 commit e357ade47d116c9b65b413666e467f9826dbeb9b
@@ -24,35 +24,67 @@

package net.yacy.document;

import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;

/**
* Read sentences from a given text.
* This enumerates StringBuilder objects.
*/
public class SentenceReader implements Iterator<StringBuilder>, Iterable<StringBuilder> {
// read sentences from a given input stream
// this enumerates StringBuilder objects

/** Holds the next element */
private StringBuilder buffer;

/** List of already parsed sentences, possibly in addition to those extracted from the main text. */
private List<StringBuilder> parsedSentences;

/** Current position in the parsedSentences list. */
private int sentencesPos;

/** The main text to parse for sentences */
private String text;

/** The current character position in the main text */
private int pos;

/** When true, sentences cannot include line break characters */
private boolean pre = false;

/**
 * Creates a reader over the given text, starting with an empty list of
 * already parsed sentences and with the {@code pre} flag set to false.
 *
 * @param text the main text to parse for sentences; must not be null
 */
public SentenceReader(final String text) {
    /*
     * An explicit constructor invocation must be the first statement of a
     * constructor body, so all field initialization (including reading ahead
     * the first sentence) is delegated to the three-argument constructor.
     */
    this(new ArrayList<>(), text, false);
}

/**
 * Creates a reader over the given text, starting with an empty list of
 * already parsed sentences.
 *
 * @param text the main text to parse for sentences; must not be null
 * @param pre when true, sentences can not include line break characters
 */
public SentenceReader(final String text, final boolean pre) {
    /*
     * Only one explicit constructor invocation is allowed, and it must come first.
     * Delegating to the three-argument constructor sets the pre flag before
     * the first sentence is read ahead into the buffer.
     */
    this(new ArrayList<>(), text, pre);
}

/**
 * Creates a reader that first serves the given already parsed sentences,
 * then the sentences extracted from the main text.
 *
 * @param parsedSentences already parsed sentences to serve first; when null an
 *            empty list is used. The given list instance is kept (not copied)
 *            and newly parsed sentences are appended to it.
 * @param text the main text to parse for sentences; must not be null
 * @param pre when true, sentences can not include line break characters
 */
public SentenceReader(final List<StringBuilder> parsedSentences, final String text, final boolean pre) {
    assert text != null;

    /* Sentences source : the provided list when available, otherwise a fresh one */
    this.parsedSentences = (parsedSentences != null) ? parsedSentences : new ArrayList<>();
    this.sentencesPos = 0;

    /* Main text parsing state */
    this.text = text;
    this.pos = 0;
    this.pre = pre;

    /* Read ahead the first element : must be done once all other fields are set */
    this.buffer = nextElement0();
}

/**
 * Sets the {@code pre} flag of this reader.
 * NOTE(review): this only assigns the field; the element already held in the
 * look-ahead buffer was read before this call, presumably with the previous
 * flag value - confirm against the parsing loop.
 *
 * @param x when true, sentences can not include line break characters
 */
public void pre(final boolean x) {
this.pre = x;
}

private StringBuilder nextElement0() {
if(this.sentencesPos < this.parsedSentences.size()) {
final StringBuilder element = this.parsedSentences.get(this.sentencesPos);
this.sentencesPos++;
return element;
}

final StringBuilder s = new StringBuilder(80);
int nextChar;
char c, lc = ' '; // starting with ' ' as last character prevents that the result string starts with a ' '
@@ -73,6 +105,9 @@ private StringBuilder nextElement0() {
s.trimToSize();
s.deleteCharAt(s.length() - 1);
}
/* Add to parsed sentences list for possible reuse after a reset */
this.parsedSentences.add(s);
this.sentencesPos++;
return s;
}

@@ -118,9 +153,19 @@ public void remove() {
/**
 * Returns this reader itself as the iterator: no new iterator instance is
 * created and the current position is not rewound. To iterate again from
 * the first sentence, call {@link #reset()} before.
 *
 * @return this instance
 */
public Iterator<StringBuilder> iterator() {
return this;
}

/**
 * Reset the iterator position to zero, so that the next iteration starts
 * again from the first sentence.
 * Only the position in the list of already parsed sentences is rewound: the
 * character position in the main text is left untouched, so sentences parsed
 * so far are served again from the list instead of being parsed a second time.
 */
public void reset() {
/* Reset only the sentences position to reuse already parsed sentences */
this.sentencesPos = 0;
/* Reload the look-ahead element from the new position */
this.buffer = nextElement0();
}

/**
 * Releases the references to the main text and to the list of parsed sentences.
 * NOTE(review): the look-ahead buffer is not cleared here, and calling
 * {@link #reset()} after this method would dereference the nulled sentences
 * list in nextElement0() - confirm that callers never reuse a closed reader.
 */
public synchronized void close() {
this.text = null;
this.parsedSentences = null;
}

public static void main(String[] args) {
@@ -20,7 +20,6 @@

package net.yacy.document;

import java.util.Collection;
import java.util.HashSet;
import java.util.Iterator;
import java.util.Map;
@@ -35,8 +34,8 @@
private Set<String> remainingTerms;


public SnippetExtractor(final Collection<StringBuilder> sentences, final Set<String> queryTerms, int maxLength) throws UnsupportedOperationException {
if (sentences == null) throw new UnsupportedOperationException("sentence == null");
public SnippetExtractor(final Iterable<StringBuilder> sentences, final Set<String> queryTerms, int maxLength) throws UnsupportedOperationException {
if (sentences == null) throw new UnsupportedOperationException("sentences == null");
if (queryTerms == null || queryTerms.isEmpty()) throw new UnsupportedOperationException("queryTerms == null");
SortedMap<String, Integer> hs;
final TreeMap<Long, StringBuilder> order = new TreeMap<Long, StringBuilder>();
@@ -45,7 +44,7 @@ public SnippetExtractor(final Collection<StringBuilder> sentences, final Set<Str
TreeSet<Integer> positions;
int linenumber = 0;
int fullmatchcounter = 0;
lookup: for (final StringBuilder sentence: sentences) {
lookup: for(final StringBuilder sentence : sentences) {
hs = WordTokenizer.tokenizeSentence(sentence.toString(), 100);
positions = new TreeSet<Integer>();
for (final String word: queryTerms) {
@@ -47,6 +47,7 @@
import net.yacy.crawler.retrieval.Response;
import net.yacy.document.Document;
import net.yacy.document.Parser;
import net.yacy.document.SentenceReader;
import net.yacy.document.SnippetExtractor;
import net.yacy.document.WordTokenizer;
import net.yacy.document.parser.html.CharacterCoding;
@@ -202,7 +203,8 @@ public TextSnippet(
// this requires that the document is parsed after loading
String textline = null;
Set<String> remainingTerms = new HashSet<>(queryTerms);
List<StringBuilder> sentences = null;
SentenceReader sentences = null;
List<StringBuilder> firstSentencesList = null;

// try to get the snippet from metadata
removeMatchingTerms(row.url().toTokens(), remainingTerms);
@@ -214,15 +216,17 @@ public TextSnippet(
// we did not find everything in the metadata, look further into the document itself.

// first acquire the sentences (from description/abstract or text):
ArrayList<String> solrdesc = row.getDescription();
final ArrayList<String> solrdesc = row.getDescription();
if (!solrdesc.isEmpty()) { // include description_txt (similar to solr highlighting config)
sentences = new ArrayList<StringBuilder>();
for (String s:solrdesc) sentences.add(new StringBuilder(s));
firstSentencesList = new ArrayList<>();
for (final String s : solrdesc) {
firstSentencesList.add(new StringBuilder(s));
}
}
final String solrText = row.getText();
if (solrText != null && solrText.length() > 0) { // TODO: instead of join with desc, we could check if snippet already complete and skip further computation
// compute sentences from solr query
if (sentences == null) sentences = row.getSentences(pre); else sentences.addAll(row.getSentences(pre));
sentences = new SentenceReader(firstSentencesList, solrText, pre);
} else if (net.yacy.crawler.data.Cache.has(url.hash())) {
// get the sentences from the cache
final Request request = loader == null ? null : loader.request(url, true, reindexing);
@@ -236,7 +240,7 @@ public TextSnippet(
if (response != null) {
try {
document = Document.mergeDocuments(response.url(), response.getMimeType(), response.parse());
sentences = document.getSentences(pre);
sentences = new SentenceReader(firstSentencesList, document.getTextString(), pre);
response = null;
document = null;
} catch (final Parser.Failure e) {
@@ -249,7 +253,7 @@ public TextSnippet(
return;
}

if (sentences.size() > 0) {
if (sentences.iterator().hasNext()) {
try {
final SnippetExtractor tsr = new SnippetExtractor(sentences, remainingTerms, snippetMaxLength);
textline = tsr.getSnippet();
@@ -265,30 +269,38 @@ public TextSnippet(
// we found the snippet or the query is fully included in the headline or url
if (textline == null || textline.length() == 0) {
// this is the case where we don't have a snippet because all search words are included in the headline or the url
String solrText = row.getText();
if (solrText != null && solrText.length() > 0) {
// compute sentences from solr query
sentences = row.getSentences(pre);
}
if (sentences == null || sentences.size() == 0) {
if(sentences == null) {
String solrText = row.getText();
if (solrText != null && solrText.length() > 0) {
// compute sentences from solr query
sentences = new SentenceReader(firstSentencesList, solrText, pre);
}
} else {
sentences.reset();
}
if (sentences == null || (!sentences.iterator().hasNext())) {
textline = row.dc_subject();
} else {
// use the first lines from the text after the h1 tag as snippet
// get first the h1 tag
List<String> h1 = row.h1();
if (h1 != null && h1.size() > 0 && sentences.size() > 2) {
if (h1 != null && h1.size() > 0) {
// find first appearance of first h1 in sentences and then take the next sentence
String h1s = h1.get(0);
if (h1s.length() > 0) {
solrsearch: for (int i = 0; i < sentences.size() - 2; i++) {
if (sentences.get(i).toString().startsWith(h1s)) {
textline = sentences.get(i + 1).toString();
String prevSentence = null, currentSentence;
solrsearch: for (final StringBuilder sentence: sentences) {
currentSentence = sentence.toString();
if (prevSentence != null && prevSentence.startsWith(h1s)) {
textline = currentSentence;
break solrsearch;
}
prevSentence = currentSentence;
}
}
}
if (textline == null) {
sentences.reset();
final StringBuilder s = new StringBuilder(snippetMaxLength);
for (final StringBuilder t: sentences) {
s.append(t).append(' ');
@@ -344,10 +356,10 @@ public TextSnippet(
}

// compute sentences from parsed document
sentences = document.getSentences(pre);
sentences = new SentenceReader(document.getTextString(), pre);
document.close();

if (sentences == null) {
if (!sentences.hasNext()) {
init(url, null, false, ResultClass.ERROR_PARSER_NO_LINES, "parser returned no sentences", beginTime);
return;
}
@@ -88,8 +88,30 @@ public void testTextSnippet() throws MalformedURLException {
for (String word : wordlist) {
assertTrue("testTextSnippet word included " + word, rstr.contains(word));
}

}

/**
 * Test snippet extraction when only document title matches searched terms:
 * the snippet line is then expected to be taken from the first lines of the text.
 * @throws MalformedURLException when the test document URL is malformed. Should not happen.
 */
@Test
public void testTextSnippetMatchTitle() throws MalformedURLException {
    final URIMetadataNode testDoc = new URIMetadataNode(doc);
    testDoc.addField(CollectionSchema.title.name(), "New test case title");
    testDoc.addField(CollectionSchema.keywords.name(), "junit");
    testDoc.addField(CollectionSchema.author.name(), "test author");
    testDoc.addField(CollectionSchema.text_t.name(),
            "A new testcase has been introduced. " + "It includes a few test lines but only title should match.");

    /* The searched word appears in the title only, not in the text */
    final String querywords = "title";
    final QueryGoal qg = new QueryGoal(querywords);

    final TextSnippet ts = new TextSnippet(null, testDoc, qg.getIncludeWordsSet(), qg.getIncludeHashes(),
            cacheStrategy, pre, snippetMaxLength, reindexing);

    /* Failure message names this test so that a failure is attributed to the right test case */
    assertEquals("testTextSnippetMatchTitle Error Code: ", "", ts.getError());

    /* Null-safe check : a missing snippet line must fail the assertion instead of raising a NullPointerException */
    final String line = ts.getLineRaw();
    assertTrue("Snippet line should be extracted from first text lines.",
            line != null && line.startsWith("A new testcase has been introduced."));
}

/**
* Test of getLineMarked method, of class TextSnippet.

0 comments on commit e357ade

Please sign in to comment.
You can’t perform that action at this time.