Skip to content
Permalink
Browse files

Added a parser for XZ compressed archives.

As suggested by LA_FORGE on mantis 781
(http://mantis.tokeek.de/view.php?id=781)
  • Loading branch information...
luccioman committed Aug 15, 2018
1 parent 8ce9c06 commit 685122363dc34a748834eaf567b5e7cb81f01634
@@ -23,6 +23,7 @@
<classpathentry kind="lib" path="lib/json-simple-1.1.1.jar"/>
<classpathentry kind="lib" path="lib/xml-apis.jar"/>
<classpathentry kind="lib" path="lib/commons-compress-1.17.jar"/>
<classpathentry kind="lib" path="lib/xz-1.8.jar"/>
<classpathentry kind="lib" path="lib/commons-lang-2.6.jar"/>
<classpathentry kind="lib" path="lib/commons-codec-1.11.jar"/>
<classpathentry kind="lib" path="lib/jcl-over-slf4j-1.7.25.jar"/>
@@ -179,6 +179,7 @@
<pathelement location="${lib}/common-lang-3.3.2.jar" />
<pathelement location="${lib}/commons-codec-1.11.jar" />
<pathelement location="${lib}/commons-compress-1.17.jar" />
<pathelement location="${lib}/xz-1.8.jar" />
<pathelement location="${lib}/commons-fileupload-1.3.3.jar" />
<pathelement location="${lib}/commons-io-2.6.jar" />
<pathelement location="${lib}/commons-jxpath-1.3.jar" />
@@ -0,0 +1,10 @@

Licensing of XZ for Java
========================

All the files in this package have been written by Lasse Collin
and/or Igor Pavlov. All these files have been put into the
public domain. You can do whatever you want with these files.

This software is provided "as is", without any warranty.

BIN +106 KB lib/xz-1.8.jar
Binary file not shown.
@@ -380,6 +380,12 @@
<version>1.17</version>
<type>jar</type>
</dependency>
<dependency>
<!-- Handle XZ compressed archives. It is an optional dependency of commons-compress. -->
<groupId>org.tukaani</groupId>
<artifactId>xz</artifactId>
<version>1.8</version>
</dependency>
<dependency>
<groupId>commons-fileupload</groupId>
<artifactId>commons-fileupload</artifactId>
@@ -42,6 +42,7 @@
import net.yacy.cora.util.CommonPattern;
import net.yacy.cora.util.StrictLimitInputStream;
import net.yacy.document.parser.GenericXMLParser;
import net.yacy.document.parser.XZParser;
import net.yacy.document.parser.apkParser;
import net.yacy.document.parser.audioTagParser;
import net.yacy.document.parser.bzipParser;
@@ -93,6 +94,7 @@
static {
initParser(new apkParser());
initParser(new bzipParser());
initParser(new XZParser());
initParser(new csvParser());
initParser(new docParser());
initParser(new gzipParser());
@@ -380,6 +382,32 @@ private static void initParser(final Parser parser) {
Integer.MAX_VALUE, Long.MAX_VALUE);
}

/**
 * Try to limit the parser processing with a maximum total number of links detection (anchors, images links, media links...)
 * or a maximum amount of content bytes to parse. Limits apply only when the available parsers for the resource media type support parsing within limits
 * (see {@link Parser#isParseWithLimitsSupported()}. When available parsers do
 * not support parsing within limits, an exception is thrown when
 * content size is beyond maxBytes.
 * @param location the URL of the source
 * @param mimeType the mime type of the source, if known
 * @param charset the charset name of the source, if known
 * @param ignoreClassNames an eventual set of CSS class names whose matching html elements content should be ignored
 * @param timezoneOffset the local time zone offset
 * @param depth the current depth of the crawl
 * @param contentLength the length of the source, if known (else -1 should be used)
 * @param sourceStream an open input stream on the source content
 * @param maxLinks the maximum total number of links to parse and add to the result documents
 * @param maxBytes the maximum number of content bytes to process
 * @return a list of documents that result from parsing the source, with empty or null text.
 * @throws Parser.Failure when the parser processing failed
 */
public static Document[] parseWithLimits(final DigestURL location, String mimeType, final String charset, final Set<String> ignoreClassNames,
final int timezoneOffset, final int depth, final long contentLength, final InputStream sourceStream, int maxLinks,
long maxBytes) throws Parser.Failure{
// delegate to the general parsing entry point; a throw-away VocabularyScraper is used as no vocabulary scraping is requested here
return parseSource(location, mimeType, charset, ignoreClassNames, new VocabularyScraper(), timezoneOffset, depth, contentLength,
sourceStream, maxLinks, maxBytes);
}

/**
* Try to limit the parser processing with a maximum total number of links detection (anchors, images links, media links...)
* or a maximum amount of content bytes to parse. Limits apply only when the available parsers for the resource media type support parsing within limits
@@ -0,0 +1,187 @@
// AbstractCompressorParser.java
// ---------------------------
// Copyright 2018 by luccioman; https://github.com/luccioman
//
// This is a part of YaCy, a peer-to-peer based web search engine
//
// LICENSE
//
// This program is free software; you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation; either version 2 of the License, or
// (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU General Public License
// along with this program; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA

package net.yacy.document.parser;

import java.io.IOException;
import java.io.InputStream;
import java.net.MalformedURLException;
import java.util.Date;
import java.util.Set;

import org.apache.commons.compress.compressors.CompressorInputStream;

import net.yacy.cora.document.id.DigestURL;
import net.yacy.cora.document.id.MultiProtocolURL;
import net.yacy.document.AbstractParser;
import net.yacy.document.Document;
import net.yacy.document.Parser;
import net.yacy.document.TextParser;
import net.yacy.document.VocabularyScraper;

/**
 * Base class for parsing compressed files relying on Apache commons-compress
 * tools. Subclasses provide the concrete decompressing stream implementation
 * ({@link #createDecompressStream(InputStream)}) and the mapping from a
 * compressed file name to its uncompressed equivalent
 * ({@link #getUncompressedFilename(String)}).
 */
public abstract class AbstractCompressorParser extends AbstractParser implements Parser {

/** Crawl depth applied when parsing internal compressed content.
 * NOTE(review): a deliberately high value passed down to the sub parsers —
 * presumably so that the uncompressed content is parsed in place rather than
 * treated as a shallow crawl level; confirm against TextParser depth handling. */
protected static final int DEFAULT_DEPTH = 999;

/**
 * @param name the human readable name of the parser
 */
public AbstractCompressorParser(final String name) {
super(name);
}

/**
 * @param source an open input stream on a compressed source
 * @return a sub class of CompressorInputStream capable of uncompressing the source
 * on the fly
 * @throws IOException when an error occurred when trying to open the compressed
 * stream
 */
protected abstract CompressorInputStream createDecompressStream(final InputStream source) throws IOException;

/**
 * Maps the given name of a compressed file to the name that the
 * file should have after uncompression. For example, for "file.txt.xz", "file.txt" is returned.
 *
 * @param filename name of a compressed file
 * @return name of the corresponding uncompressed file
 */
protected abstract String getUncompressedFilename(final String filename);

/**
 * Parses the compressed source without any processing limits, by delegating
 * to {@link #parseWithLimits} with maximum links and bytes values.
 */
@Override
public Document[] parse(final DigestURL location, final String mimeType, final String charset,
final Set<String> ignoreClassNames, final VocabularyScraper scraper, final int timezoneOffset,
final InputStream source) throws Parser.Failure, InterruptedException {

return parseWithLimits(location, mimeType, charset, scraper, timezoneOffset, source, Integer.MAX_VALUE,
Long.MAX_VALUE);
}

/**
 * Parses the compressed source, uncompressing on the fly, and returns a single
 * main document holding the parsed inner content as sub documents.
 * Any error (including unexpected runtime errors from the decompressor) is
 * wrapped into a {@link Parser.Failure} carrying the resource location.
 * NOTE(review): the decompressing stream is intentionally not closed here, as
 * closing it would also close the supplied source stream which is assumed to
 * be managed by the caller — confirm against other YaCy parser implementations.
 */
@Override
public Document[] parseWithLimits(final DigestURL location, final String mimeType, final String charset,
final Set<String> ignoreClassNames, final VocabularyScraper scraper, final int timezoneOffset,
final InputStream source, final int maxLinks, final long maxBytes) throws Parser.Failure {
Document maindoc;
final CompressorInputStream compressedInStream;
try {
compressedInStream = createDecompressStream(source);
} catch (final IOException | RuntimeException e) {
throw new Parser.Failure("Unexpected error while parsing compressed file. " + e.getMessage(), location);
}

try {
// create maindoc for this archive, register with supplied url & mime
maindoc = AbstractCompressorParser.createMainDocument(location, mimeType, charset, this);

// parse the uncompressed content and attach the resulting documents to the main one
final Document[] docs = this.parseCompressedInputStream(location, null, ignoreClassNames, timezoneOffset,
AbstractCompressorParser.DEFAULT_DEPTH, compressedInStream, maxLinks, maxBytes);
if (docs != null) {
maindoc.addSubDocuments(docs);
// propagate the partially-parsed flag so callers know limits were hit
if (docs.length > 0 && docs[0].isPartiallyParsed()) {
maindoc.setPartiallyParsed(true);
}
}
} catch (final Parser.Failure e) {
// already a properly located parser failure: rethrow as-is
throw e;
} catch (final IOException | RuntimeException e) {
throw new Parser.Failure("Unexpected error while parsing compressed file. " + e.getMessage(), location);
}
return new Document[] { maindoc };
}

/**
 * Create the main parsed document for the compressed document at the given URL
 * and Media type
 *
 * @param location the parsed resource URL
 * @param mimeType the media type of the resource
 * @param charset the charset name if known
 * @param parser an instance of CompressorParser that is registered as the
 * parser origin of the document
 * @return a Document instance
 */
protected static Document createMainDocument(final DigestURL location, final String mimeType, final String charset,
final AbstractCompressorParser parser) {
final String filename = location.getFileName();
// when the URL has no file name, fall back to URL tokens as the document title
return new Document(location, mimeType, charset, parser, null, null,
AbstractParser
.singleList(filename.isEmpty() ? location.toTokens() : MultiProtocolURL.unescape(filename)), // title
null, null, null, null, 0.0d, 0.0d, (Object) null, null, null, null, false, new Date());
}

/**
 * Parse content in an open stream uncompressing on the fly a compressed
 * resource.
 *
 * @param location the URL of the compressed resource
 * @param charset the charset name if known
 * @param ignoreClassNames an eventual set of CSS class names whose matching
 * html elements content should be ignored
 * @param timezoneOffset the local time zone offset
 * @param depth the crawl depth to apply when parsing the uncompressed content
 * @param compressedInStream an open stream uncompressing on the fly the
 * compressed content
 * @param maxLinks the maximum total number of links to parse and add
 * to the result documents
 * @param maxBytes the maximum number of content bytes to process
 * @return a list of documents that result from parsing the source, with empty
 * or null text.
 * @throws Parser.Failure when the parser processing failed
 */
protected Document[] parseCompressedInputStream(final DigestURL location, final String charset,
final Set<String> ignoreClassNames, final int timezoneOffset, final int depth,
final CompressorInputStream compressedInStream, final int maxLinks, final long maxBytes) throws Failure {
final String compressedFileName = location.getFileName();
final String contentfilename = getUncompressedFilename(compressedFileName);
final String mime = TextParser.mimeOf(MultiProtocolURL.getFileExtension(contentfilename));
try {
/*
 * Use the uncompressed file name for sub parsers to not unnecessarily use again
 * this same uncompressing parser
 */
final String locationPath = location.getPath();
final String contentPath = locationPath.substring(0, locationPath.length() - compressedFileName.length())
+ contentfilename;
final DigestURL contentLocation = new DigestURL(location.getProtocol(), location.getHost(),
location.getPort(), contentPath);

/*
 * Rely on the supporting parsers to respect the maxLinks and maxBytes limits on
 * compressed content
 */
return TextParser.parseWithLimits(contentLocation, mime, charset, ignoreClassNames, timezoneOffset, depth,
-1, compressedInStream, maxLinks, maxBytes);
} catch (final MalformedURLException e) {
throw new Parser.Failure("Unexpected error while parsing compressed file. " + e.getMessage(), location);
}
}

/**
 * @return true: this parser can apply maxLinks and maxBytes limits by
 * delegating them to the sub parsers of the uncompressed content
 */
@Override
public boolean isParseWithLimitsSupported() {
return true;
}

}
@@ -0,0 +1,66 @@
// XZParser.java
// ---------------------------
// Copyright 2018 by luccioman; https://github.com/luccioman
//
// This is a part of YaCy, a peer-to-peer based web search engine
//
// LICENSE
//
// This program is free software; you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation; either version 2 of the License, or
// (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU General Public License
// along with this program; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA

package net.yacy.document.parser;

import java.io.IOException;
import java.io.InputStream;

import org.apache.commons.compress.compressors.CompressorInputStream;
import org.apache.commons.compress.compressors.xz.XZCompressorInputStream;
import org.apache.commons.compress.compressors.xz.XZUtils;

import net.yacy.kelondro.util.MemoryControl;

/**
 * Parser for xz compressed archives. The content is uncompressed on the fly,
 * parsed with the appropriate sub parser, and attached to the created main
 * parsed document.
 *
 * @see <a href="https://tukaani.org/xz/format.html">xz file format website</a>
 */
public class XZParser extends AbstractCompressorParser {

	public XZParser() {
		super("XZ Compressed Archive Parser");
		// register the file extensions and media type this parser handles
		SUPPORTED_EXTENSIONS.add("xz");
		SUPPORTED_EXTENSIONS.add("txz");
		SUPPORTED_MIME_TYPES.add("application/x-xz");
	}

	@Override
	protected CompressorInputStream createDecompressStream(final InputStream source) throws IOException {
		/*
		 * Cap the memory dedicated to reading compressed blocks at a quarter (25%) of
		 * the currently available memory, expressed in kilobytes as required by the
		 * XZCompressorInputStream constructor. Eventual stricter limits should be
		 * handled by the caller (see for example the crawler.[protocol].maxFileSize
		 * configuration setting).
		 */
		final long quarterOfFreeMemoryKb = (long) (MemoryControl.available() * 0.25 / 1024.0);
		final int memoryLimitKb = (int) Math.min(Integer.MAX_VALUE, quarterOfFreeMemoryKb);
		return new XZCompressorInputStream(source, false, memoryLimitKb);
	}

	@Override
	protected String getUncompressedFilename(final String filename) {
		// delegate the ".xz"/".txz" suffix stripping to the XZ for Java utility
		return XZUtils.getUncompressedFilename(filename);
	}

}

0 comments on commit 6851223

Please sign in to comment.
You can’t perform that action at this time.