
Started support for partial parsing of large streamed resources.

This enables the getpageinfo_p API to return a result in a reasonable amount
of time on resources in the multi-megabyte size range.
Support is added first in the generic XML parser; for other formats the
regular crawler limits apply as usual.
luccioman committed Jul 8, 2017
1 parent 2a87b08 commit bf55f1d6e582eb126e74d18e0bea2be542efda68
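
For illustration, a request exercising the new limit might look like the following (an editorial sketch, not part of this commit: host, port and the target URL are placeholders, and the 'actions' parameter name is assumed from the servlet's usual query interface):

    http://localhost:8090/api/getpageinfo_p.xml?url=http://example.org/huge-feed.xml&actions=title&maxLinks=1000&maxBytes=1048576

When the 1 MB maxBytes limit truncates parsing, the servlet is expected to set its hasMoreLinks property to 1 (see the getpageinfo_p change below).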
@@ -87,7 +87,8 @@
* </ul>
* </li>
* <li>agentName (optional) : the string identifying the agent used to fetch the resource. Example : "YaCy Internet (cautious)"</li>
* <li>maxLinks (optional) : the maximum number of links, sitemap URLs or icons to return</li>
* <li>maxLinks (optional integer value) : the maximum number of links, sitemap URLs or icons to return on 'title' action</li>
* <li>maxBytes (optional long integer value) : the maximum number of bytes to load and parse from the url on 'title' action</li>
* </ul>
* @param env
* server environment
@@ -139,7 +140,17 @@ public static serverObjects respond(final RequestHeader header, final serverObje
net.yacy.document.Document scraper = null;
if (u != null) try {
ClientIdentification.Agent agent = ClientIdentification.getAgent(post.get("agentName", ClientIdentification.yacyInternetCrawlerAgentName));
scraper = sb.loader.loadDocumentAsStream(u, CacheStrategy.IFEXIST, BlacklistType.CRAWLER, agent);
if(post.containsKey("maxBytes")) {
/* A maxBytes limit is specified : let's try to parse only the amount of bytes given */
final long maxBytes = post.getLong("maxBytes", sb.loader.protocolMaxFileSize(u));
scraper = sb.loader.loadDocumentAsLimitedStream(u, CacheStrategy.IFEXIST, BlacklistType.CRAWLER, agent, maxLinks, maxBytes);
} else {
/* No maxBytes limit : apply regular parsing with default crawler limits.
* Any maxLinks limit will still apply after loading and parsing the document. */
scraper = sb.loader.loadDocumentAsStream(u, CacheStrategy.IFEXIST, BlacklistType.CRAWLER, agent);
}
} catch (final IOException e) {
ConcurrentLog.logException(e);
// bad things are possible, i.e. that the Server responds with "403 Bad Behavior"
@@ -151,7 +162,7 @@ public static serverObjects respond(final RequestHeader header, final serverObje
// put the icons that belong to the document
Set<DigestURL> iconURLs = scraper.getIcons().keySet();
int count = 0;
long count = 0;
for (DigestURL iconURL : iconURLs) {
if(count >= maxLinks) {
break;
@@ -199,7 +210,7 @@ public static serverObjects respond(final RequestHeader header, final serverObje
count++;
}
prop.put("links", count);
prop.put("hasMoreLinks", (count >= maxLinks && urisIt.hasNext()) ? "1" : "0");
prop.put("hasMoreLinks", scraper.isPartiallyParsed() || (count >= maxLinks && urisIt.hasNext()) ? "1" : "0");
prop.putXML("sitelist", links.length() > 0 ? links.substring(1) : "");
prop.putXML("filter", filter.length() > 0 ? filter.substring(1) : ".*");
}
@@ -116,5 +116,55 @@ public Response getResponse() {
}
}
/**
* Parse and close the content stream and return the parsed documents when
* possible.<br>
* Try to limit the parser processing with a maximum total number of detected
* links (anchors, image links, media links...) or a maximum amount of
* content bytes to parse.<br>
* Limits apply only when the available parsers for the resource media type
* support parsing within limits (see
* {@link Parser#isParseWithLimitsSupported()}). When available parsers do
* not support parsing within limits, an exception is thrown when
* content size is beyond maxBytes.
*
* @param maxLinks
* the maximum total number of links to parse and add to the
* result documents
* @param maxBytes
* the maximum number of content bytes to process
* @return the parsed documents or null when an error occurred
* @throws Parser.Failure
* when no parser supports the content, or an error occurred while parsing
*/
public Document[] parseWithLimits(final int maxLinks, final long maxBytes) throws Parser.Failure {
final String supportError = TextParser.supports(this.response.url(),
this.response.getResponseHeader() == null ? null : this.response.getResponseHeader().getContentType());
if (supportError != null) {
throw new Parser.Failure("no parser support:" + supportError, this.response.url());
}
try {
final String mimeType = this.response.getResponseHeader() == null ? null
: this.response.getResponseHeader().getContentType();
final String charsetName = this.response.getResponseHeader() == null ? StandardCharsets.UTF_8.name()
: this.response.getResponseHeader().getCharacterEncoding();
return TextParser.parseWithLimits(this.response.url(), mimeType, charsetName,
this.response.getRequest().timezoneOffset(), this.response.size(), this.contentStream, maxLinks,
maxBytes);
} catch (final Exception e) {
return null;
} finally {
if (this.contentStream != null) {
try {
this.contentStream.close();
} catch (IOException ignored) {
log.warn("Could not close content stream on url " + this.response.url());
}
}
}
}
}
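
As a usage illustration, a minimal caller sketch (not part of this commit: the streamResponse variable stands for an instance of the stream response class modified above, and the limit values are example assumptions):

    Document[] documents = null;
    try {
        // example limits: at most 1000 links and 1 MB of content
        documents = streamResponse.parseWithLimits(1000, 1024L * 1024L);
    } catch (final Parser.Failure e) {
        // no parser supports the content type of the resource
    }
    if (documents != null && documents.length > 0 && documents[0].isPartiallyParsed()) {
        // the limits truncated parsing: only partial documents are available
    }

Note that, as documented above, parseWithLimits() returns null on other parsing errors, so the null check is still required.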
@@ -23,12 +23,14 @@
package net.yacy.document;
import java.io.InputStream;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.LinkedHashSet;
import java.util.List;
import java.util.Set;
import net.yacy.cora.document.id.DigestURL;
import net.yacy.cora.util.ConcurrentLog;
public abstract class AbstractParser implements Parser {
@@ -98,5 +100,20 @@ public int hashCode() {
if (t != null) c.add(t);
return c;
}
@Override
public Document[] parseWithLimits(DigestURL url, String mimeType, String charset, VocabularyScraper scraper,
int timezoneOffset, InputStream source, int maxLinks, long maxBytes)
throws Failure, InterruptedException, UnsupportedOperationException {
/* Please override in subclasses when an implementation is possible */
throw new UnsupportedOperationException();
}
@Override
public boolean isParseWithLimitsSupported() {
/* Please override in subclasses when parseWithLimits is supported */
return false;
}
}
@@ -99,6 +99,9 @@
private final Map<String, Set<String>> generic_facets; // a map from vocabulary names to the set of tags for that vocabulary which apply for this document
private final Date lastModified; // creation or last modification date of the source document
private int crawldepth;
/** True when this document is the result of a partially parsed resource, for example due to resource content size exceeding a given limit */
private boolean partiallyParsed;
public Document(final DigestURL location, final String mimeType, final String charset,
final Parser parserObject,
@@ -152,6 +155,7 @@ public Document(final DigestURL location, final String mimeType, final String ch
this.lastModified = lastModified == null ? new Date() : lastModified;
this.crawldepth = 999; // unknown yet
this.scraperObject = null; // will be set by setScraperObject()
this.partiallyParsed = false;
}
/**
@@ -212,6 +216,20 @@ public String getFileName() {
return this.generic_facets;
}
/**
* @return true when this document is the result of a partially parsed resource, for example due to resource content size exceeding a given limit
*/
public boolean isPartiallyParsed() {
return this.partiallyParsed;
}
/**
* @param partiallyParsed set to true to indicate that this document is the result of a partially parsed resource, for example due to resource content size exceeding a given limit
*/
public void setPartiallyParsed(final boolean partiallyParsed) {
this.partiallyParsed = partiallyParsed;
}
/**
* compute a set of languages that this document contains
* the language is not computed using a statistical analysis of the content, only from given metadata that came with the document
@@ -47,12 +47,13 @@
* parse an input stream
* @param url the url of the source
* @param mimeType the mime type of the source, if known
* @param charset the charset of the source, if known
* @param charset the charset name of the source, if known
* @param scraper an entity scraper to detect facets from text annotation context
* @param timezoneOffset the local time zone offset
* @param source a input stream
* @return a list of documents that result from parsing the source
* @throws Parser.Failure
* @throws InterruptedException
* @throws Parser.Failure when the parser processing failed
* @throws InterruptedException when the processing was interrupted before termination
*/
public Document[] parse(
DigestURL url,
@@ -62,7 +63,55 @@
int timezoneOffset,
InputStream source
) throws Parser.Failure, InterruptedException;
/**
* Parse an input stream, possibly terminating processing early when a total of
* maxLinks URLs (anchors, image links, media links...) have been reached,
* or when maxBytes content bytes have been processed, thus potentially
* resulting in partially parsed documents (with
* {@link Document#isPartiallyParsed()} returning true). Some parser
* implementations will not support parsing within maxLinks or maxBytes
* limits: make sure to check this by first calling
* {@link #isParseWithLimitsSupported()}, or an UnsupportedOperationException
* could be thrown.
*
* @param url
* the URL of the source
* @param mimeType
* the mime type of the source, if known
* @param charset
* the charset name of the source, if known
* @param scraper
* an entity scraper to detect facets from text annotation
* context
* @param timezoneOffset
* the local time zone offset
* @param source
* an input stream
* @param maxLinks
* the maximum total number of links to parse and add to the
* result documents
* @param maxBytes
* the maximum number of content bytes to process
* @return a list of documents that result from parsing the source, with
* empty or null text.
* @throws Parser.Failure
* when the parser processing failed
* @throws InterruptedException
* when the processing was interrupted before termination
* @throws UnsupportedOperationException
* when the parser implementation doesn't support parsing within
* limits
*/
public Document[] parseWithLimits(DigestURL url, String mimeType, String charset, VocabularyScraper scraper,
int timezoneOffset, InputStream source, int maxLinks, long maxBytes)
throws Parser.Failure, InterruptedException, UnsupportedOperationException;
/**
* @return true when the parser implementation supports the
* parseWithLimits() operation.
*/
public boolean isParseWithLimitsSupported();
// methods to that shall make it possible to put Parser objects into a hashtable
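
A caller-side sketch of the pattern the Javadoc above describes (not from this commit: the parser, url, mimeType, charset, scraper, timezoneOffset, source, maxLinks and maxBytes variables are assumed to be in scope, and the enclosing method is assumed to declare the Parser.Failure and InterruptedException checked exceptions):

    Document[] docs;
    if (parser.isParseWithLimitsSupported()) {
        // the implementation can stop on its own once maxLinks or maxBytes is reached
        docs = parser.parseWithLimits(url, mimeType, charset, scraper, timezoneOffset, source, maxLinks, maxBytes);
    } else {
        // no partial parsing available: fall back to a full parse within the regular crawler limits
        docs = parser.parse(url, mimeType, charset, scraper, timezoneOffset, source);
    }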