Permalink
Browse files

Prevent unwanted cached bytes duplication on stream parsing.

  • Loading branch information...
luccioman committed Aug 12, 2017
1 parent ed67818 commit 8a94fef9e0594ed31632155ddeb7e6f90334a09f
Showing with 18 additions and 7 deletions.
  1. +18 −7 source/net/yacy/document/TextParser.java
@@ -271,32 +271,43 @@ private static void initParser(final Parser parser) {
canStream = true;
}
}
} else if(sourceStream instanceof ByteArrayInputStream) {
/* Also check if we have a ByteArrayInputStream as source to prevent useless bytes duplication in a new byte array */
canStream = true;
}
// if we do not have more than one non generic parser or the content size is over MaxInt (2GB) or is over the totally available memory
// if we do not have more than one non generic parser, or the content size is over MaxInt (2GB), or exceeds the total available memory,
// or the stream is already in memory as a ByteArrayInputStream
// then we use only stream-oriented parser.
if (canStream || contentLength > Integer.MAX_VALUE || contentLength > MemoryControl.available()) {
try {
/* The size of the buffer on the stream must be large enough to allow parser implementations to start parsing the resource
* and possibly fail, but must also be larger than any parser's internal buffers such as BufferedInputStream.DEFAULT_BUFFER_SIZE (8192 bytes) */
int rewindSize = 10 * 1024;
final BufferedInputStream bufferedStream = new BufferedInputStream(sourceStream, rewindSize);
final InputStream markableStream;
if(sourceStream instanceof ByteArrayInputStream) {
/* No need to use a wrapping buffered stream when the source is already entirely in memory.
* What's more, ByteArrayInputStream has no read limit when marking.*/
markableStream = sourceStream;
} else {
markableStream = new BufferedInputStream(sourceStream, rewindSize);
}
/* Mark now to allow resetting the buffered stream to the beginning of the stream */
bufferedStream.mark(rewindSize);
markableStream.mark(rewindSize);
/* Loop on parser : they are supposed to be sorted in order to start with the most specific and end with the most generic */
for(Parser parser : idioms) {
/* Wrap in a CloseShieldInputStream to prevent SAX parsers closing the sourceStream
* and so let us reuse the same open stream with other parsers if a parser fails */
CloseShieldInputStream nonCloseInputStream = new CloseShieldInputStream(bufferedStream);
CloseShieldInputStream nonCloseInputStream = new CloseShieldInputStream(markableStream);
try {
return parseSource(location, mimeType, parser, charset, scraper, timezoneOffset,
nonCloseInputStream, maxLinks, maxBytes);
} catch (Parser.Failure e) {
/* Try to reset the marked stream. If the failed parser has consumed too many bytes :
* too bad, the mark is invalid and the process fails now with an IOException */
bufferedStream.reset();
markableStream.reset();
if(parser instanceof gzipParser && e.getCause() instanceof GZIPOpeningStreamException
&& (idioms.size() == 1 || (idioms.size() == 2 && idioms.contains(genericIdiom)))) {
@@ -309,7 +320,7 @@ private static void initParser(final Parser parser) {
* (see RFC 7231 section 3.1.2.2 for "Content-Encoding" header specification https://tools.ietf.org/html/rfc7231#section-3.1.2.2)*/
gzipParser gzParser = (gzipParser)parser;
nonCloseInputStream = new CloseShieldInputStream(bufferedStream);
nonCloseInputStream = new CloseShieldInputStream(markableStream);
Document maindoc = gzipParser.createMainDocument(location, mimeType, charset, gzParser);
@@ -323,7 +334,7 @@ private static void initParser(final Parser parser) {
return new Document[] { maindoc };
} catch(Exception e1) {
/* Try again to reset the marked stream if the failed parser has not consumed too many bytes */
bufferedStream.reset();
markableStream.reset();
}
}
}

0 comments on commit 8a94fef

Please sign in to comment.