Skip to content

Commit

Permalink
Support loading local files with a per request specified maximum size.
Browse files Browse the repository at this point in the history
Consistent with the HTTP loader implementation.
  • Loading branch information
luccioman committed Jul 11, 2017
1 parent f369679 commit 1e84956
Show file tree
Hide file tree
Showing 2 changed files with 46 additions and 10 deletions.
42 changes: 36 additions & 6 deletions source/net/yacy/crawler/retrieval/FileLoader.java
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,8 @@
import java.util.Date;
import java.util.List;

import org.apache.commons.fileupload.util.LimitedInputStream;

import net.yacy.cora.document.analysis.Classification;
import net.yacy.cora.document.encoding.ASCII;
import net.yacy.cora.document.encoding.UTF8;
Expand All @@ -48,6 +50,9 @@
import net.yacy.search.Switchboard;

public class FileLoader {

/** Default maximum file size allowed for the crawler */
public static final int DEFAULT_MAXFILESIZE = 100000000;

private final Switchboard sb;
private final ConcurrentLog log;
Expand All @@ -56,7 +61,7 @@ public class FileLoader {
/**
 * Creates a file loader that reads its maximum allowed file size from the
 * "crawler.file.maxFileSize" configuration entry.
 *
 * @param sb the Switchboard instance providing the configuration
 * @param log logger for crawler events
 */
public FileLoader(final Switchboard sb, final ConcurrentLog log) {
this.sb = sb;
this.log = log;
// NOTE(review): the next two lines are the removed/added pair of this diff view —
// after this commit only the getConfigInt line (with the DEFAULT_MAXFILESIZE
// fallback instead of -1 meaning "no limit") remains in the file.
this.maxFileSize = (int) sb.getConfigLong("crawler.file.maxFileSize", -1l);
this.maxFileSize = sb.getConfigInt("crawler.file.maxFileSize", DEFAULT_MAXFILESIZE);
}

/**
Expand All @@ -77,13 +82,14 @@ public Response load(final Request request, boolean acceptOnlyParseable) throws
}

/**
* Open a stream on the requested file
* Open a stream on the requested file. When actual file size is over maxBytes, return a stream on metadata only (URL tokens).
*
* @param request the request to process
* @param acceptOnlyParseable when true and no parser can be found to handle the detected MIME type, open a stream on the URL tokens
* @param maxBytes max file size to load. -1 means no limit.
* @return a response with full metadata, embedding an open input stream on the content. Don't forget to close the stream.
*/
public StreamResponse openInputStream(final Request request, final boolean acceptOnlyParseable) throws IOException {
public StreamResponse openInputStream(final Request request, final boolean acceptOnlyParseable, final int maxBytes) throws IOException {
DigestURL url = request.url();
if (!url.getProtocol().equals("file")) throw new IOException("wrong protocol for FileLoader: " + url.getProtocol());

Expand Down Expand Up @@ -134,12 +140,13 @@ public StreamResponse openInputStream(final Request request, final boolean accep
long size;
try {
size = url.length();
responseHeader.put(HeaderFramework.CONTENT_LENGTH, Long.toString(size));
} catch (final Exception e) {
size = -1;
}
String parserError = null;
if ((acceptOnlyParseable && (parserError = TextParser.supports(url, mime)) != null) ||
(size > this.maxFileSize && this.maxFileSize >= 0)) {
(size > maxBytes && maxBytes >= 0)) {
// we know that we cannot process that file before loading
// only the metadata is returned

Expand All @@ -163,9 +170,21 @@ public StreamResponse openInputStream(final Request request, final boolean accep
}

// load the resource
final InputStream is = url.getInputStream(ClientIdentification.yacyInternetCrawlerAgent);
InputStream is = url.getInputStream(ClientIdentification.yacyInternetCrawlerAgent);

if(size < 0 && maxBytes >= 0) {
/* If the content length is unknown for some reason, apply the size restriction now, if any */
is = new LimitedInputStream(is, maxBytes) {

@Override
protected void raiseError(long pSizeMax, long pCount) throws IOException {
throw new IOException(
"Too big file in File crawler for URL " + request.url().toString());
}
};
}

// create response with loaded content
// create response with stream open on content
final CrawlProfile profile = this.sb.crawler.get(ASCII.getBytes(request.profileHandle()));
Response response = new Response(
request,
Expand All @@ -176,4 +195,15 @@ public StreamResponse openInputStream(final Request request, final boolean accep
null);
return new StreamResponse(response, is);
}

/**
* Open a stream on the requested file, applying this loader's configured
* maximum file size (read from "crawler.file.maxFileSize" at construction
* time). Convenience overload delegating to
* {@link #openInputStream(Request, boolean, int)}.
*
* @param request the request to process
* @param acceptOnlyParseable when true and no parser can be found to handle the detected MIME type, open a stream on the URL tokens
* @return a response with full metadata, embedding an open input stream on the content. Don't forget to close the stream.
* @throws IOException when the stream could not be opened
*/
public StreamResponse openInputStream(final Request request, final boolean acceptOnlyParseable) throws IOException {
return openInputStream(request, acceptOnlyParseable, this.maxFileSize);
}
}
14 changes: 10 additions & 4 deletions source/net/yacy/repository/LoaderDispatcher.java
Original file line number Diff line number Diff line change
Expand Up @@ -400,7 +400,7 @@ private StreamResponse openInputStreamInternal(final Request request, CacheStrat
} else if (protocol.equals("smb")) {
response = this.smbLoader.openInputStream(request, true);
} else if (protocol.equals("file")) {
response = this.fileLoader.openInputStream(request, true);
response = this.fileLoader.openInputStream(request, true, maxFileSize);
} else {
throw new IOException("Unsupported protocol '" + protocol + "' in url " + url);
}
Expand Down Expand Up @@ -444,12 +444,18 @@ private void checkAccessTime(ClientIdentification.Agent agent, final DigestURL u
* @return the crawler configured maximum size allowed to load for the protocol of the URL
*/
// Returns the crawler-configured maximum file size (in bytes) for the
// protocol of the given URL; Integer.MAX_VALUE when no limit is configured
// for that protocol.
// NOTE(review): this span is a unified-diff rendering — each brace-less "if"
// line is the pre-change version of the braced line that follows it; after
// this commit only the braced form (plus the new isFile branch) exists.
public int protocolMaxFileSize(final DigestURL url) {
if (url.isHTTP() || url.isHTTPS())
if (url.isHTTP() || url.isHTTPS()) {
return this.sb.getConfigInt("crawler.http.maxFileSize", HTTPLoader.DEFAULT_MAXFILESIZE);
if (url.isFTP())
}
if (url.isFTP()) {
return this.sb.getConfigInt("crawler.ftp.maxFileSize", (int) FTPLoader.DEFAULT_MAXFILESIZE);
if (url.isSMB())
}
if (url.isSMB()) {
return this.sb.getConfigInt("crawler.smb.maxFileSize", (int) SMBLoader.DEFAULT_MAXFILESIZE);
}
if(url.isFile()) {
return this.sb.getConfigInt("crawler.file.maxFileSize", FileLoader.DEFAULT_MAXFILESIZE);
}
return Integer.MAX_VALUE;
}

Expand Down

0 comments on commit 1e84956

Please sign in to comment.