Permalink
Browse files

Support parsing gzip files from servers with redundant headers.

Some web servers provide both a 'Content-Encoding: "gzip"' and a
'Content-Type: "application/x-gzip"' HTTP header on their ".gz" files.
It was annoying to fail on such resources, which are not so uncommon,
even though they are non-conforming (see RFC 7231 section 3.1.2.2 for the
"Content-Encoding" header specification:
https://tools.ietf.org/html/rfc7231#section-3.1.2.2)
  • Loading branch information...
luccioman committed Jul 16, 2017
1 parent 11a7f92 commit 5a646540ccecd7e4d1386902b89e2348493d05ce
@@ -30,7 +30,6 @@
import net.yacy.document.Document;
import net.yacy.document.Parser;
import net.yacy.document.TextParser;
import net.yacy.document.VocabularyScraper;
/**
* A crawler load response, holding content as a stream.
@@ -90,31 +89,7 @@ public Response getResponse() {
* when no parser support the content
*/
/**
 * Parse the crawler response content.
 *
 * @return the parsed documents, or null when a non-fatal parsing exception occurred
 * @throws Parser.Failure when no parser supports the content, or when parsing failed
 */
public Document[] parse() throws Parser.Failure {
	/* Delegate to parseWithLimits with maximum values : same processing path,
	 * but without any effective limit on the number of links or content bytes parsed.
	 * (The rendered diff had left the removed pre-commit body fused with this new
	 * one-line body; only the post-commit delegation is kept here.) */
	return parseWithLimits(Integer.MAX_VALUE, Long.MAX_VALUE);
}
/**
@@ -151,9 +126,11 @@ public Response getResponse() {
: this.response.getResponseHeader().getCharacterEncoding();
return TextParser.parseWithLimits(this.response.url(), mimeType, charsetName,
this.response.getRequest().timezoneOffset(), this.response.size(), this.contentStream, maxLinks,
maxBytes);
} catch (final Exception e) {
this.response.getRequest().timezoneOffset(), this.response.getRequest().depth(),
this.response.size(), this.contentStream, maxLinks, maxBytes);
} catch(Parser.Failure e) {
throw e;
}catch (final Exception e) {
return null;
} finally {
if (this.contentStream != null) {
@@ -49,6 +49,7 @@
import net.yacy.document.parser.docParser;
import net.yacy.document.parser.genericParser;
import net.yacy.document.parser.gzipParser;
import net.yacy.document.parser.gzipParser.GZIPOpeningStreamException;
import net.yacy.document.parser.htmlParser;
import net.yacy.document.parser.linkScraperParser;
import net.yacy.document.parser.mmParser;
@@ -296,6 +297,35 @@ private static void initParser(final Parser parser) {
/* Try to reset the marked stream. If the failed parser has consumed too many bytes :
* too bad, the marks is invalid and process fails now with an IOException */
bufferedStream.reset();
if(parser instanceof gzipParser && e.getCause() instanceof GZIPOpeningStreamException
&& (idioms.size() == 1 || (idioms.size() == 2 && idioms.contains(genericIdiom)))) {
/* The gzip parser failed directly when opening the content stream : before falling back to the generic parser,
* let's have a chance to parse the stream as uncompressed. */
/* Indeed, this can be a case of misconfigured web server, providing both headers "Content-Encoding" with value "gzip",
* and "Content-type" with value such as "application/gzip".
* In that case our HTTP client (see GzipResponseInterceptor) is already uncompressing the stream on the fly,
* that's why the gzipparser fails opening the stream.
* (see RFC 7231 section 3.1.2.2 for "Content-Encoding" header specification https://tools.ietf.org/html/rfc7231#section-3.1.2.2)*/
gzipParser gzParser = (gzipParser)parser;
nonCloseInputStream = new CloseShieldInputStream(bufferedStream);
Document maindoc = gzipParser.createMainDocument(location, mimeType, charset, gzParser);
try {
Document[] docs = gzParser.parseCompressedInputStream(location,
charset, timezoneOffset, depth,
nonCloseInputStream, maxLinks, maxBytes);
if (docs != null) {
maindoc.addSubDocuments(docs);
}
return new Document[] { maindoc };
} catch(Exception e1) {
/* Try again to reset the marked stream if the failed parser has not consumed too many bytes */
bufferedStream.reset();
}
}
}
}
} catch (IOException e) {
@@ -345,6 +375,7 @@ private static void initParser(final Parser parser) {
* @param mimeType the mime type of the source, if known
* @param charset the charset name of the source, if known
* @param timezoneOffset the local time zone offset
* @param depth the current depth of the crawl
* @param contentLength the length of the source, if known (else -1 should be used)
* @param source a input stream
* @param maxLinks the maximum total number of links to parse and add to the result documents
@@ -353,9 +384,9 @@ private static void initParser(final Parser parser) {
* @throws Parser.Failure when the parser processing failed
*/
public static Document[] parseWithLimits(final DigestURL location, String mimeType, final String charset,
final int timezoneOffset, final long contentLength, final InputStream sourceStream, int maxLinks,
final int timezoneOffset, final int depth, final long contentLength, final InputStream sourceStream, int maxLinks,
long maxBytes) throws Parser.Failure{
return parseSource(location, mimeType, charset, new VocabularyScraper(), timezoneOffset, 0, contentLength,
return parseSource(location, mimeType, charset, new VocabularyScraper(), timezoneOffset, depth, contentLength,
sourceStream, maxLinks, maxBytes);
}
@@ -400,6 +431,8 @@ private static void initParser(final Parser parser) {
docs = parser.parse(location, mimeType, documentCharset, scraper, timezoneOffset, limitedSource);
}
return docs;
} catch(Parser.Failure e) {
throw e;
} catch (final Exception e) {
throw new Parser.Failure("parser failed: " + parser.getName(), location);
}
@@ -460,8 +493,38 @@ private static void initParser(final Parser parser) {
docs = parser.parse(location, mimeType, documentCharset, scraper, timezoneOffset, bis);
}
} catch (final Parser.Failure e) {
failedParser.put(parser, e);
//log.logWarning("tried parser '" + parser.getName() + "' to parse " + location.toNormalform(true, false) + " but failed: " + e.getMessage(), e);
if(parser instanceof gzipParser && e.getCause() instanceof GZIPOpeningStreamException &&
(parsers.size() == 1 || (parsers.size() == 2 && parsers.contains(genericIdiom)))) {
/* The gzip parser failed directly when opening the content stream : before falling back to the generic parser,
* let's have a chance to parse the stream as uncompressed. */
/* Indeed, this can be a case of misconfigured web server, providing both headers "Content-Encoding" with value "gzip",
* and "Content-type" with value such as "application/gzip".
* In that case our HTTP client (see GzipResponseInterceptor) is already uncompressing the stream on the fly,
* that's why the gzipparser fails opening the stream.
* (see RFC 7231 section 3.1.2.2 for "Content-Encoding" header specification https://tools.ietf.org/html/rfc7231#section-3.1.2.2)*/
gzipParser gzParser = (gzipParser)parser;
bis = new ByteArrayInputStream(sourceArray);
Document maindoc = gzipParser.createMainDocument(location, mimeType, charset, gzParser);
try {
docs = gzParser.parseCompressedInputStream(location,
charset, timezoneOffset, depth,
bis, maxLinks, maxBytes);
if (docs != null) {
maindoc.addSubDocuments(docs);
}
docs = new Document[] { maindoc };
break;
} catch(Parser.Failure e1) {
failedParser.put(parser, e1);
} catch(Exception e2) {
failedParser.put(parser, new Parser.Failure(e2.getMessage(), location));
}
} else {
failedParser.put(parser, e);
}
} catch (final Exception e) {
failedParser.put(parser, new Parser.Failure(e.getMessage(), location));
//log.logWarning("tried parser '" + parser.getName() + "' to parse " + location.toNormalform(true, false) + " but failed: " + e.getMessage(), e);
@@ -638,8 +701,21 @@ public static String mimeOf(final String ext) {
return ext2mime.get(ext.toLowerCase(Locale.ROOT));
}
private static String normalizeMimeType(String mimeType) {
if (mimeType == null) return "application/octet-stream";
/**
* Normalize a media type information string (can be a HTTP "Content-Type"
* response header) : convert to lower case, remove any supplementary
* parameters such as the encoding (charset name), and provide a default
* value when null.
*
* @param mimeType
* raw information about media type, eventually provided by a
* HTTP "Content-Type" response header
* @return a non null media type in lower case
*/
public static String normalizeMimeType(String mimeType) {
if (mimeType == null) {
return "application/octet-stream";
}
mimeType = mimeType.toLowerCase(Locale.ROOT);
final int pos = mimeType.indexOf(';');
return ((pos < 0) ? mimeType.trim() : mimeType.substring(0, pos).trim());
@@ -31,9 +31,12 @@
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.net.MalformedURLException;
import java.util.Date;
import java.util.zip.GZIPInputStream;
import org.apache.commons.compress.compressors.gzip.GzipUtils;
import net.yacy.cora.document.id.DigestURL;
import net.yacy.cora.document.id.MultiProtocolURL;
import net.yacy.document.AbstractParser;
@@ -42,13 +45,14 @@
import net.yacy.document.TextParser;
import net.yacy.document.VocabularyScraper;
import net.yacy.kelondro.util.FileUtils;
import org.apache.commons.compress.compressors.gzip.GzipUtils;
/**
* Parses a gz archive.
* Unzips and parses the content and adds it to the created main document
*/
public class gzipParser extends AbstractParser implements Parser {
private static final int DEFAULT_DEPTH = 999;
public gzipParser() {
super("GNU Zip Compressed Archive Parser");
@@ -75,12 +79,18 @@ public gzipParser() {
Document maindoc = null;
GZIPInputStream zippedContent = null;
FileOutputStream out = null;
try {
zippedContent = new GZIPInputStream(source);
} catch(IOException e) {
/* Use a GZIPOpeningStreamException to signal the caller the error occurred directly on stream opening
* and eventually apply special error handling */
throw new Parser.Failure("Unexpected error while parsing gzip file. " + e.getMessage(), location,
new GZIPOpeningStreamException());
}
try {
int read = 0;
final byte[] data = new byte[1024];
zippedContent = new GZIPInputStream(source);
tempFile = File.createTempFile("gunzip","tmp");
// creating a temp file to store the uncompressed data
@@ -112,11 +122,11 @@ public gzipParser() {
}
}
try {
maindoc = createMainDocument(location, mimeType, charset);
maindoc = createMainDocument(location, mimeType, charset, this);
// creating a new parser class to parse the unzipped content
final String contentfilename = GzipUtils.getUncompressedFilename(location.getFileName());
final String mime = TextParser.mimeOf(MultiProtocolURL.getFileExtension(contentfilename));
Document[] docs = TextParser.parseSource(location, mime, null, scraper, timezoneOffset, 999, tempFile);
Document[] docs = TextParser.parseSource(location, mime, null, scraper, timezoneOffset, DEFAULT_DEPTH, tempFile);
if (docs != null) maindoc.addSubDocuments(docs);
} catch (final Exception e) {
if (e instanceof InterruptedException) throw (InterruptedException) e;
@@ -134,15 +144,16 @@ public gzipParser() {
* @param location the parsed resource URL
* @param mimeType the media type of the resource
* @param charset the charset name if known
* @param an instance of gzipParser that is registered as the parser origin of the document
* @return a Document instance
*/
private Document createMainDocument(final DigestURL location, final String mimeType, final String charset) {
public static Document createMainDocument(final DigestURL location, final String mimeType, final String charset, final gzipParser parser) {
final String filename = location.getFileName();
Document maindoc = new Document(
location,
mimeType,
charset,
this,
parser,
null,
null,
AbstractParser.singleList(filename.isEmpty() ? location.toTokens() : MultiProtocolURL.unescape(filename)), // title
@@ -159,6 +170,41 @@ private Document createMainDocument(final DigestURL location, final String mimeT
new Date());
return maindoc;
}
/**
 * Parse content in an open stream uncompressing on the fly a gzipped resource.
 * The stream is handed to the sub parsers under a rewritten URL using the
 * uncompressed file name, so that the gzipParser is not selected again.
 *
 * @param location the URL of the gzipped resource
 * @param charset the charset name if known (may be null)
 * @param timezoneOffset the local time zone offset
 * @param depth the current crawl depth, propagated to the sub parsers
 * @param compressedInStream an open stream uncompressing on the fly the compressed content
 * @param maxLinks
 *            the maximum total number of links to parse and add to the
 *            result documents
 * @param maxBytes
 *            the maximum number of content bytes to process
 * @return a list of documents that result from parsing the source, with
 *         empty or null text.
 * @throws Parser.Failure
 *             when the parser processing failed
 */
public Document[] parseCompressedInputStream(final DigestURL location, final String charset, final int timezoneOffset, final int depth,
final InputStream compressedInStream, final int maxLinks, final long maxBytes) throws Failure {
// creating a new parser class to parse the unzipped content
final String compressedFileName = location.getFileName();
final String contentfilename = GzipUtils.getUncompressedFilename(compressedFileName);
final String mime = TextParser.mimeOf(MultiProtocolURL.getFileExtension(contentfilename));
try {
/* Use the uncompressed file name for sub parsers to not unnecessarily use again the gzipparser */
final String locationPath = location.getPath();
final String contentPath = locationPath.substring(0, locationPath.length() - compressedFileName.length()) + contentfilename;
final DigestURL contentLocation = new DigestURL(location.getProtocol(), location.getHost(), location.getPort(), contentPath);
/* Rely on the supporting parsers to respect the maxLinks and maxBytes limits on compressed content.
 * contentLength is passed as -1 : the uncompressed size is unknown at this point. */
return TextParser.parseWithLimits(contentLocation, mime, charset, timezoneOffset, depth, -1, compressedInStream, maxLinks, maxBytes);
} catch (MalformedURLException e) {
/* NOTE(review): the original cause 'e' is not chained into the Failure — consider passing it along */
throw new Parser.Failure("Unexpected error while parsing gzip file. " + e.getMessage(), location);
}
}
@Override
public boolean isParseWithLimitsSupported() {
@@ -177,21 +223,38 @@ public boolean isParseWithLimitsSupported() {
* before an eventual OutOfMemory occurs */
zippedContent = new GZIPInputStream(source);
} catch(IOException e) {
throw new Parser.Failure("Unexpected error while parsing gzip file. " + e.getMessage(), location);
/* Use a GZIPOpeningStreamException to signal the caller the error occurred directly on stream opening
* and eventually apply special error handling */
throw new Parser.Failure("Unexpected error while parsing gzip file. " + e.getMessage(), location,
new GZIPOpeningStreamException());
}
try {
maindoc = createMainDocument(location, mimeType, charset);
// creating a new parser class to parse the unzipped content
final String contentfilename = GzipUtils.getUncompressedFilename(location.getFileName());
final String mime = TextParser.mimeOf(MultiProtocolURL.getFileExtension(contentfilename));
maindoc = createMainDocument(location, mimeType, charset, this);
/* Rely on the supporting parsers to respect the maxLinks and maxBytes limits on compressed content */
Document[] docs = TextParser.parseWithLimits(location, mime, charset, timezoneOffset, -1, zippedContent, maxLinks, maxBytes);
if (docs != null) maindoc.addSubDocuments(docs);
Document[] docs = parseCompressedInputStream(location, charset, timezoneOffset, DEFAULT_DEPTH, zippedContent, maxLinks, maxBytes);
if (docs != null) {
maindoc.addSubDocuments(docs);
}
} catch (final Exception e) {
throw new Parser.Failure("Unexpected error while parsing gzip file. " + e.getMessage(),location);
}
return maindoc == null ? null : new Document[]{maindoc};
}
/**
 * Used to signal an error occurred when opening a gzipped input stream
 * (as opposed to failing later while reading it). Callers detect this as
 * the cause of a Parser.Failure and can then fall back to parsing the
 * stream as uncompressed content.
 */
/* NOTE(review): as a non-static inner class this captures a reference to the
 * enclosing gzipParser instance, which is questionable for a Serializable
 * exception type — consider declaring it static. */
public class GZIPOpeningStreamException extends Exception {

/** The serialization ID */
private static final long serialVersionUID = 2824038185373304636L;

/** Creates the exception with no detail message. */
public GZIPOpeningStreamException() {
super();
}

/**
 * @param message the detail message describing the stream-opening error
 */
public GZIPOpeningStreamException(final String message) {
super(message);
}
}
}

0 comments on commit 5a64654

Please sign in to comment.