
Respect the maxFileSize limit also when streaming HTTP content, where relevant.

The constraint is now applied consistently with the case where the HTTP
content is fully loaded into a byte array.
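For reference, a minimal standalone sketch of the pattern this commit applies: a LimitedInputStream wrapper that enforces the size limit while the body is read. The class and method names below are illustrative, not part of the commit; only org.apache.commons.fileupload.util.LimitedInputStream is the actual dependency the commit uses.

import java.io.IOException;
import java.io.InputStream;
import org.apache.commons.fileupload.util.LimitedInputStream;

public class SizeLimitedStreams {

    /* Wraps a raw content stream so that reading more than maxBytes
     * raises an IOException; a negative limit means "no limit". */
    public static InputStream limit(final InputStream raw, final long maxBytes) {
        if (maxBytes < 0) {
            return raw;
        }
        return new LimitedInputStream(raw, maxBytes) {
            @Override
            protected void raiseError(final long pSizeMax, final long pCount) throws IOException {
                throw new IOException(
                        "Content to download exceeds the maximum allowed size of " + pSizeMax + " bytes");
            }
        };
    }
}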
luccioman committed Jun 29, 2017
1 parent 4b72b29 commit 433bdb7c0dfe18a5c67952d957e188fa6e302f2d
@@ -28,6 +28,7 @@
import java.io.IOException;
import java.io.InputStream;
import org.apache.commons.fileupload.util.LimitedInputStream;
import org.apache.http.HttpStatus;
import org.apache.http.StatusLine;
@@ -45,6 +46,7 @@
import net.yacy.crawler.data.CrawlProfile;
import net.yacy.crawler.data.Latency;
import net.yacy.kelondro.io.ByteCount;
import net.yacy.kelondro.util.Formatter;
import net.yacy.repository.Blacklist.BlacklistType;
import net.yacy.search.Switchboard;
import net.yacy.search.SwitchboardConstants;
@@ -209,7 +211,7 @@ public StreamResponse openInputStream(final Request request, CrawlProfile profil
* When the content is not too large (less than Response.CRAWLER_MAX_SIZE_TO_CACHE), it is better to cache it if the cache is enabled and the URL is not local
*/
long contentLength = client.getHttpResponse().getEntity().getContentLength();
final InputStream contentStream;
InputStream contentStream;
if (profile != null && profile.storeHTCache() && contentLength > 0 && contentLength < (Response.CRAWLER_MAX_SIZE_TO_CACHE) && !url.isLocal()) {
byte[] content = null;
try {
@@ -223,12 +225,29 @@ public StreamResponse openInputStream(final Request request, CrawlProfile profil
contentStream = new ByteArrayInputStream(content);
} else {
/*
* The content length may already be known at this point: check it before opening a stream
*/
if (maxFileSize >= 0 && contentLength > maxFileSize) {
throw new IOException("Content to download exceeds the maximum allowed size of " + maxFileSize + " bytes");
}
/*
* Create an HTTPInputStream delegating to
* client.getContentstream(). Its close method will ensure the
* client is properly closed.
*/
contentStream = new HTTPInputStream(client);
/* The announced content length may not be known yet, or may be incorrect: apply now the same content size restriction, if any, as when loading into a byte array */
if (maxFileSize >= 0) {
contentStream = new LimitedInputStream(contentStream, maxFileSize) {
@Override
protected void raiseError(long pSizeMax, long pCount) throws IOException {
throw new IOException(
"Content to download exceeds the maximum allowed size of " + Formatter.bytesToString(pSizeMax));
}
};
}
}
return new StreamResponse(new Response(request, requestHeader, responseHeader, profile, false, null), contentStream);
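The limit is thus enforced in two tiers: the declared Content-Length is checked before the stream is opened, and the LimitedInputStream wrapper guards against a missing or understated length header while the body is actually read. A hypothetical caller sketch (variable names are illustrative, not from the commit):

try (InputStream in = streamResponse.getContentStream()) {
    final byte[] buffer = new byte[8192];
    while (in.read(buffer) != -1) {
        /* consume the chunk; the IOException thrown by raiseError()
         * surfaces here once more than maxFileSize bytes have been read */
    }
}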
@@ -469,12 +469,12 @@ private int protocolMaxFileSize(final DigestURL url) {
* @param cacheStrategy cache strategy to use
* @param blacklistType black list
* @param agent agent identification for HTTP requests
* @param maxFileSize max file size to load. -1 means no limit.
* @return a response with full meta data, embedding an open input stream on the content. Don't forget to close the stream.
* @throws IOException when url is malformed or blacklisted
*/
public StreamResponse openInputStream(final Request request, final CacheStrategy cacheStrategy,
BlacklistType blacklistType, final ClientIdentification.Agent agent) throws IOException {
final int maxFileSize = protocolMaxFileSize(request.url());
BlacklistType blacklistType, final ClientIdentification.Agent agent, final int maxFileSize) throws IOException {
StreamResponse response;
Semaphore check = this.loaderSteering.get(request.url());
@@ -509,6 +509,21 @@ public StreamResponse openInputStream(final Request request, final CacheStrategy
return response;
}
/**
* Open the URL as an InputStream from the web or the cache, applying the default per-protocol configured maximum file size limit.
* @param request must be not null
* @param cacheStrategy cache strategy to use
* @param blacklistType black list
* @param agent agent identification for HTTP requests
* @return a response with full meta data, embedding an open input stream on the content. Don't forget to close the stream.
* @throws IOException when url is malformed or blacklisted
*/
public StreamResponse openInputStream(final Request request, final CacheStrategy cacheStrategy,
BlacklistType blacklistType, final ClientIdentification.Agent agent) throws IOException {
final int maxFileSize = protocolMaxFileSize(request.url());
return this.openInputStream(request, cacheStrategy, blacklistType, agent, maxFileSize);
}
public Document[] loadDocuments(final Request request, final CacheStrategy cacheStrategy, final int maxFileSize, BlacklistType blacklistType, final ClientIdentification.Agent agent) throws IOException, Parser.Failure {
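The pre-existing signature now delegates to the new overload, so existing call sites keep the per-protocol default while new callers can override it; per the javadoc above, -1 disables the limit. A hypothetical usage sketch:

// use the per-protocol configured default limit
StreamResponse limited = loader.openInputStream(request, CacheStrategy.IFEXIST,
        BlacklistType.SEARCH, agent);

// explicitly disable any size limit (-1 means no limit)
StreamResponse unlimited = loader.openInputStream(request, CacheStrategy.IFEXIST,
        BlacklistType.SEARCH, agent, -1);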
@@ -564,7 +579,8 @@ public Document loadDocument(final DigestURL location, final CacheStrategy cache
* @return the parsed document or null when an error occurred while parsing
* @throws IOException when the content can not be fetched or no parser support it
*/
public Document loadDocumentAsStream(final DigestURL location, final CacheStrategy cachePolicy, BlacklistType blacklistType, final ClientIdentification.Agent agent) throws IOException {
public Document loadDocumentAsStream(final DigestURL location, final CacheStrategy cachePolicy,
BlacklistType blacklistType, final ClientIdentification.Agent agent) throws IOException {
// load resource
Request request = request(location, true, false);
final StreamResponse streamResponse = this.openInputStream(request, cachePolicy, blacklistType, agent);
@@ -113,7 +113,7 @@ public DigestURL parseURL(final serverObjects post, final boolean auth)
* image url.
* @return an open input stream instance (don't forget to close it).
* @throws IOException
* when a read/write error occured.
* when a read/write error occurred.
*/
public InputStream openInputStream(final serverObjects post, final LoaderDispatcher loader,
final boolean auth, DigestURL url) throws IOException {
@@ -123,8 +123,10 @@ public InputStream openInputStream(final serverObjects post, final LoaderDispatc
String agentName = post.get("agentName", auth ? ClientIdentification.yacyIntranetCrawlerAgentName
: ClientIdentification.yacyInternetCrawlerAgentName);
ClientIdentification.Agent agent = ClientIdentification.getAgent(agentName);
/* We do not apply the crawler max file size limit here,
* as this stream is not meant to be parsed and indexed but to be directly rendered */
final StreamResponse response = loader.openInputStream(loader.request(url, false, true), CacheStrategy.IFEXIST,
BlacklistType.SEARCH, agent);
BlacklistType.SEARCH, agent, -1);
inStream = response.getContentStream();
} catch (final IOException e) {
/** No need to log the full stack trace (in most cases the resource is not available because of a network error) */
