
Improved consistency between loader openInputStream and load functions

luccioman committed Jun 1, 2017
1 parent cbccf97 commit a9cb083fa135819b56c98561d60f2ab39d2fd8f6
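
The hunks below construct and consume a StreamResponse wrapper whose own source file is not part of this excerpt. Judging only from the calls made in this commit (a (Response, InputStream) constructor plus the getResponse() and getContentStream() accessors), a minimal sketch of its shape could look like this; the real class may carry more state and helpers:

    import java.io.InputStream;

    // Hypothetical sketch of StreamResponse, inferred from its usage in this commit.
    public class StreamResponse {

        private final Response response;          // full crawler meta data
        private final InputStream contentStream;  // open stream on the content; the caller must close it

        public StreamResponse(final Response response, final InputStream contentStream) {
            this.response = response;
            this.contentStream = contentStream;
        }

        public Response getResponse() {
            return this.response;
        }

        public InputStream getContentStream() {
            return this.contentStream;
        }
    }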
FTPLoader.java
@@ -27,6 +27,7 @@
package net.yacy.crawler.retrieval;
import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.io.PrintStream;
@@ -166,6 +167,29 @@ public Response load(final Request request, final boolean acceptOnlyParseable) throws IOException {
Latency.updateAfterLoad(request.url(), System.currentTimeMillis() - start);
return response;
}
/**
* Open a stream on the entry content from an FTP server
*
* @param request the request to process
* @param acceptOnlyParseable when true and no parser can be found to handle the detected MIME type, open a stream on the URL tokens
* @return a response with full meta data and an open input stream on the content. Don't forget to close the stream.
*/
public StreamResponse openInputStream(final Request request, final boolean acceptOnlyParseable) throws IOException {
final Response response = load(request, acceptOnlyParseable);
// TODO implement a true ftp content stream instead of a simple ByteArrayInputStream encapsulation
final StreamResponse streamResponse;
if(response.getContent() != null) {
streamResponse = new StreamResponse(response,
new ByteArrayInputStream(response.getContent()));
} else {
/* content can be null when no parser can handle it: then return the URL tokens as content */
streamResponse = new StreamResponse(response,
new ByteArrayInputStream(UTF8.getBytes(request.url().toTokens())));
}
return streamResponse;
}
/**
* @param ftpClient
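
Per the new Javadoc, closing the returned stream is the caller's job. A hedged usage sketch for the FTP variant above, where ftpLoader and request are placeholder names rather than identifiers from this commit:

    // Hypothetical call site; ftpLoader and request are assumed to exist.
    final StreamResponse streamResponse = ftpLoader.openInputStream(request, true);
    try (final InputStream contentStream = streamResponse.getContentStream()) {
        // consume contentStream together with the meta data in
        // streamResponse.getResponse() (response header, MIME type, ...)
    }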
FileLoader.java
@@ -24,6 +24,7 @@
package net.yacy.crawler.retrieval;
import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.util.ArrayList;
@@ -58,7 +59,31 @@ public FileLoader(final Switchboard sb, final ConcurrentLog log) {
this.maxFileSize = (int) sb.getConfigLong("crawler.file.maxFileSize", -1l);
}
/**
* Fully load the requested file into a byte buffer
*
* @param request the request to process
* @param acceptOnlyParseable when true and no parser can be found to handle the detected MIME type, the response content buffer contains only URL tokens
* @return a response with full meta data and the content embedded as a byte buffer
*/
public Response load(final Request request, boolean acceptOnlyParseable) throws IOException {
StreamResponse streamResponse = openInputStream(request, acceptOnlyParseable);
/* Fully read the stream and update the response */
byte[] content = FileUtils.read(streamResponse.getContentStream());
Response response = streamResponse.getResponse();
response.setContent(content);
return response;
}
/**
* Open a stream on the requested file
*
* @param request the request to process
* @param acceptOnlyParseable when true and no parser can be found to handle the detected MIME type, open a stream on the URL tokens
* @return a response with full meta data and an open input stream on the content. Don't forget to close the stream.
*/
public StreamResponse openInputStream(final Request request, final boolean acceptOnlyParseable) throws IOException {
DigestURL url = request.url();
if (!url.getProtocol().equals("file")) throw new IOException("wrong protocol for FileLoader: " + url.getProtocol());
@@ -93,9 +118,9 @@ public Response load(final Request request, boolean acceptOnlyParseable) throws
responseHeader,
profile,
false,
- UTF8.getBytes(content.toString()));
+ null);
- return response;
+ return new StreamResponse(response, new ByteArrayInputStream(UTF8.getBytes(content.toString())));
}
// create response header
@@ -133,13 +158,12 @@ public Response load(final Request request, boolean acceptOnlyParseable) throws
responseHeader,
profile,
false,
- UTF8.getBytes(url.toTokens()));
- return response;
+ null);
+ return new StreamResponse(response, new ByteArrayInputStream(UTF8.getBytes(url.toTokens())));
}
// load the resource
- InputStream is = url.getInputStream(ClientIdentification.yacyInternetCrawlerAgent);
- byte[] b = FileUtils.read(is);
+ final InputStream is = url.getInputStream(ClientIdentification.yacyInternetCrawlerAgent);
// create response with loaded content
final CrawlProfile profile = this.sb.crawler.get(ASCII.getBytes(request.profileHandle()));
@@ -149,7 +173,7 @@ public Response load(final Request request, boolean acceptOnlyParseable) throws
responseHeader,
profile,
false,
- b);
- return response;
+ null);
+ return new StreamResponse(response, is);
}
}
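
Since FileLoader.load() is now just openInputStream() plus FileUtils.read(), both entry points should produce identical bytes. A hypothetical sanity check of that invariant, not part of the commit (fileLoader and request are placeholders):

    // Hypothetical consistency check between the two code paths.
    final Response viaLoad = fileLoader.load(request, true);
    final StreamResponse viaStream = fileLoader.openInputStream(request, true);
    try {
        final byte[] streamed = FileUtils.read(viaStream.getContentStream());
        if (!java.util.Arrays.equals(viaLoad.getContent(), streamed)) {
            throw new IllegalStateException("load() and openInputStream() disagree");
        }
    } finally {
        viaStream.getContentStream().close();
    }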
HTTPLoader.java
@@ -28,6 +28,7 @@
import java.io.IOException;
import java.io.InputStream;
import org.apache.http.HttpStatus;
import org.apache.http.StatusLine;
import net.yacy.cora.document.id.DigestURL;
@@ -82,18 +83,19 @@ public Response load(final Request entry, CrawlProfile profile, final int maxFil
return doc;
}
- /**
- * Open input stream on a requested HTTP resource. When resource is small, fully load it and returns a ByteArrayInputStream instance.
+ /**
+ * Open an input stream on a requested HTTP resource. When the resource content size is small
+ * (lower than {@link Response#CRAWLER_MAX_SIZE_TO_CACHE}), fully load it and use a ByteArrayInputStream instance.
* @param request
* @param profile crawl profile
* @param retryCount remaining redirect retries count
* @param maxFileSize max file size to load. -1 means no limit.
* @param blacklistType blacklist type to use
* @param agent agent identifier
- * @return an open input stream. Don't forget to close it.
- * @throws IOException when an error occured
+ * @return a response with full meta data and an open input stream on the content. Don't forget to close the stream.
+ * @throws IOException when an error occurred
*/
- public InputStream openInputStream(final Request request, CrawlProfile profile, final int retryCount,
+ public StreamResponse openInputStream(final Request request, CrawlProfile profile, final int retryCount,
final int maxFileSize, final BlacklistType blacklistType, final ClientIdentification.Agent agent)
throws IOException {
if (retryCount < 0) {
@@ -200,13 +202,14 @@ public InputStream openInputStream(final Request request, CrawlProfile profile,
FailCategory.FINAL_PROCESS_CONTEXT, "redirection not wanted", statusCode);
throw new IOException("REJECTED UNWANTED REDIRECTION '" + statusline
+ "' for URL '" + requestURLString + "'$");
- } else if (statusCode == 200 || statusCode == 203) {
+ } else if (statusCode == HttpStatus.SC_OK || statusCode == HttpStatus.SC_NON_AUTHORITATIVE_INFORMATION) {
// the transfer is ok
/*
- * When content is not large (less than 1MB), we have better cache it if cache is enabled and url is not local
+ * When the content is not large (less than Response.CRAWLER_MAX_SIZE_TO_CACHE), we had better cache it if the cache is enabled and the url is not local
*/
long contentLength = client.getHttpResponse().getEntity().getContentLength();
final InputStream contentStream;
if (profile != null && profile.storeHTCache() && contentLength > 0 && contentLength < (Response.CRAWLER_MAX_SIZE_TO_CACHE) && !url.isLocal()) {
byte[] content = null;
try {
@@ -218,14 +221,17 @@ public InputStream openInputStream(final Request request, CrawlProfile profile,
client.finish();
}
- return new ByteArrayInputStream(content);
+ contentStream = new ByteArrayInputStream(content);
} else {
/*
* Create an HTTPInputStream delegating to
* client.getContentstream(). Its close() method will ensure the
* client is properly closed.
*/
contentStream = new HTTPInputStream(client);
}
- /*
- * Returns a HTTPInputStream delegating to
- * client.getContentstream(). Close method will ensure client is
- * properly closed.
- */
- return new HTTPInputStream(client);
+ return new StreamResponse(new Response(request, requestHeader, responseHeader, profile, false, null), contentStream);
} else {
client.finish();
// if the response has not the right response type then reject file
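
Because the HTTP openInputStream() signature changed from InputStream to StreamResponse, existing callers need a small adaptation. A hedged sketch of a call site (httpLoader, request, profile, blacklistType and agent are placeholders; 2 redirect retries is an arbitrary example, and -1 means no file size limit per the Javadoc):

    // Hypothetical call site; all variable names are assumed.
    final StreamResponse sr = httpLoader.openInputStream(request, profile, 2, -1, blacklistType, agent);
    try (final InputStream stream = sr.getContentStream()) {
        // the response meta data now travels with the stream
        final ResponseHeader header = sr.getResponse().getResponseHeader();
        // ... consume stream, using header for Content-Type, size, etc. ...
    }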
Response.java
@@ -225,10 +225,21 @@ public Response(final Request request, final CrawlProfile profile) {
public void updateStatus(final int newStatus) {
this.status = newStatus;
}
/**
* @return the original request that produced this response
*/
public Request getRequest() {
return request;
}
public ResponseHeader getResponseHeader() {
return this.responseHeader;
}
public RequestHeader getRequestHeader() {
return this.requestHeader;
}
public boolean fromCache() {
return this.fromCache;
SMBLoader.java
@@ -27,6 +27,7 @@
package net.yacy.crawler.retrieval;
import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.net.MalformedURLException;
@@ -69,7 +70,31 @@ public SMBLoader(final Switchboard sb, final ConcurrentLog log) {
}
/**
* Fully load the requested file into a byte buffer
*
* @param request the request to process
* @param acceptOnlyParseable when true and no parser can be found to handle the detected MIME type, the response content buffer contains only URL tokens
* @return a response with full meta data and the content embedded as a byte buffer
*/
public Response load(final Request request, boolean acceptOnlyParseable) throws IOException {
StreamResponse streamResponse = openInputStream(request, acceptOnlyParseable);
/* Fully read the stream and update the response */
byte[] content = FileUtils.read(streamResponse.getContentStream());
Response response = streamResponse.getResponse();
response.setContent(content);
return response;
}
/**
* Open a stream on the requested file
*
* @param request the request to process
* @param acceptOnlyParseable when true, do not open a stream on content when no parser can be found to handle the detected MIME type
* @return a response with full meta data and an open input stream on the content. Don't forget to close the stream.
*/
public StreamResponse openInputStream(final Request request, final boolean acceptOnlyParseable) throws IOException {
DigestURL url = request.url();
if (!url.getProtocol().equals("smb")) throw new IOException("wrong loader for SMBLoader: " + url.getProtocol());
@@ -111,9 +136,9 @@ public Response load(final Request request, boolean acceptOnlyParseable) throws
responseHeader,
profile,
false,
- UTF8.getBytes(content.toString()));
+ null);
- return response;
+ return new StreamResponse(response, new ByteArrayInputStream(UTF8.getBytes(content.toString())));
}
// create response header
@@ -151,13 +176,12 @@ public Response load(final Request request, boolean acceptOnlyParseable) throws
responseHeader,
profile,
false,
- url.toTokens().getBytes());
- return response;
+ null);
+ return new StreamResponse(response, new ByteArrayInputStream(url.toTokens().getBytes()));
}
// load the resource
InputStream is = url.getInputStream(ClientIdentification.yacyInternetCrawlerAgent);
- byte[] b = FileUtils.read(is);
// create response with loaded content
final CrawlProfile profile = this.sb.crawler.get(request.profileHandle().getBytes());
@@ -167,8 +191,8 @@ public Response load(final Request request, boolean acceptOnlyParseable) throws
responseHeader,
profile,
false,
- b);
- return response;
+ null);
+ return new StreamResponse(response, is);
}
public static void main(String[] args) {