Skip to content

Commit

Permalink
partial fix (images,audio,video) for proxy and content-type problem h…
Browse files Browse the repository at this point in the history
  • Loading branch information
danielr committed Aug 26, 2008
1 parent 0df2e47 commit 9ff4fc1
Show file tree
Hide file tree
Showing 6 changed files with 186 additions and 37 deletions.
2 changes: 1 addition & 1 deletion source/de/anomic/crawler/CrawlEntry.java
Expand Up @@ -104,7 +104,7 @@ public CrawlEntry(
assert appdate != null;
assert url != null;
assert initiator != null;
assert initiator.length() > 0;
assert initiator.length() > 0 : "initiator of '"+ url +"' is empty";
assert referrerhash != null;
this.initiator = initiator;
this.url = url;
Expand Down
3 changes: 3 additions & 0 deletions source/de/anomic/crawler/CrawlStacker.java
Expand Up @@ -220,6 +220,9 @@ public void enqueueEntry(
final CrawlProfile.entry profile) {
if (profile == null) return;

// DEBUG
log.logFinest("ENQUEUE "+ nexturl +", referer="+referrerhash +", initiator="+initiatorHash +", name="+name +", load="+loadDate +", depth="+currentdepth);

// check first before we create a big object
if (this.urlEntryCache.has(nexturl.hash().getBytes())) return;

Expand Down
9 changes: 9 additions & 0 deletions source/de/anomic/crawler/IndexingStack.java
Expand Up @@ -625,5 +625,14 @@ public final String shallIndexCacheForCrawler() {

return null;
}

/**
 * Builds a human-readable summary of this queue entry: the URL followed
 * by its crawl metadata (referrer hash, initiator, flags, anchor name).
 *
 * @see java.lang.Object#toString()
 */
@Override
public String toString() {
    final StringBuilder description = new StringBuilder("QueueEntry of ");
    description.append(url.toString());
    description.append(", ref=").append(referrerHash);
    description.append(", initiator=").append(initiator);
    description.append(", flags=").append(flags);
    description.append(", anchor=").append(anchorName);
    return description.toString();
}
} // class Entry
}
44 changes: 44 additions & 0 deletions source/de/anomic/http/MultiOutputStream.java
@@ -0,0 +1,44 @@
/**
* MultiOutputStream.java
* @since 26.08.2008
*/
package de.anomic.http;

import java.io.IOException;
import java.io.OutputStream;

/**
 * An {@link OutputStream} decorator that duplicates every write onto several
 * underlying streams in parallel (comparable to the Unix <code>tee</code> command).
 *
 * @author daniel
 *
 */
class MultiOutputStream extends OutputStream {

    /** defensive copy of the target streams; never exposed to callers */
    private final OutputStream[] streams;

    /**
     * creates a new MultiOutputStream writing to all given streams
     *
     * @param streams the targets every byte is duplicated to
     */
    public MultiOutputStream(final OutputStream[] streams) {
        super();
        // make a copy to avoid external modifications
        this.streams = new OutputStream[streams.length];
        System.arraycopy(streams, 0, this.streams, 0, streams.length);
    }

    /**
     * writes the byte to each of the streams
     *
     * @see java.io.OutputStream#write(int)
     */
    @Override
    public void write(final int b) throws IOException {
        for (final OutputStream stream : streams) {
            stream.write(b);
        }
    }

    /**
     * writes the given buffer slice to each of the streams in one call,
     * avoiding the byte-by-byte fallback of the inherited implementation
     *
     * @see java.io.OutputStream#write(byte[], int, int)
     */
    @Override
    public void write(final byte[] b, final int off, final int len) throws IOException {
        for (final OutputStream stream : streams) {
            stream.write(b, off, len);
        }
    }

    /**
     * flushes all underlying streams so buffered data reaches every target
     *
     * @see java.io.OutputStream#flush()
     */
    @Override
    public void flush() throws IOException {
        for (final OutputStream stream : streams) {
            stream.flush();
        }
    }

    /**
     * closes all underlying streams; every stream is attempted even if an
     * earlier one fails, the first error is rethrown afterwards
     *
     * @see java.io.OutputStream#close()
     */
    @Override
    public void close() throws IOException {
        IOException firstError = null;
        for (final OutputStream stream : streams) {
            try {
                stream.close();
            } catch (final IOException e) {
                if (firstError == null) firstError = e;
            }
        }
        if (firstError != null) throw firstError;
    }

}
155 changes: 120 additions & 35 deletions source/de/anomic/http/httpdProxyHandler.java
Expand Up @@ -55,7 +55,6 @@
import java.io.OutputStream;
import java.io.OutputStreamWriter;
import java.io.PrintWriter;
import java.io.UnsupportedEncodingException;
import java.io.Writer;
import java.net.BindException;
import java.net.ConnectException;
Expand All @@ -71,6 +70,7 @@
import java.util.HashSet;
import java.util.Iterator;
import java.util.Properties;
import java.util.Set;
import java.util.logging.FileHandler;
import java.util.logging.Level;
import java.util.logging.LogManager;
Expand Down Expand Up @@ -177,7 +177,10 @@ public final class httpdProxyHandler {

// create a htRootPath: system pages
htRootPath = new File(switchboard.getRootPath(), switchboard.getConfig("htRootPath","htroot"));
if (!(htRootPath.exists())) htRootPath.mkdir();
if (!(htRootPath.exists())) {
if(!htRootPath.mkdir())
serverLog.logSevere("PROXY", "could not create htRoot "+ htRootPath);
}

// load a transformer
transformer = new htmlFilterContentTransformer();
Expand Down Expand Up @@ -224,6 +227,27 @@ public final class httpdProxyHandler {
*/
private static final StringBuffer userAgentStr = new StringBuffer();

/**
 * A Set of media types which are known to only contain binary data (no readable text).
 * Each entry is only the first part ("type") of the content-type field, without a
 * subtype — e.g. "image" matches "image/png", "image/gif", ...
 */
private static final Set<String> binaryTypes = new HashSet<String>();

/**
 * A Set of content-types which are known to only contain binary data (no readable text).
 * Each entry is a complete "type/subtype" content-type header field, without
 * parameters such as "; charset=...".
 */
private static final Set<String> binaryContent = new HashSet<String>();
static {
// all Strings must be lower case!!
// RFC 2045: "Matching of media type and subtype is ALWAYS case-insensitive."
// discrete types: top-level types whose subtypes are all binary
binaryTypes.add("image");
binaryTypes.add("audio");
binaryTypes.add("video");

binaryContent.add("application/octet-stream");
}

public static void handleOutgoingCookies(final httpRequestHeader requestHeader, final String targethost, final String clienthost) {
/*
Expand Down Expand Up @@ -449,7 +473,7 @@ public static void doGet(final Properties conProp, final httpRequestHeader reque
private static void fulfillRequestFromWeb(final Properties conProp, final yacyURL url,final String ext, final httpRequestHeader requestHeader, final httpResponseHeader cachedResponseHeader, final File cacheFile, final OutputStream respond) {

final GZIPOutputStream gzippedOut = null;
Writer hfos = null;
Writer textOutput = null;

JakartaCommonsHttpResponse res = null;
try {
Expand Down Expand Up @@ -532,19 +556,25 @@ private static void fulfillRequestFromWeb(final Properties conProp, final yacyUR

// handle file types and make (possibly transforming) output stream
final OutputStream outStream = (gzippedOut != null) ? gzippedOut : ((chunkedOut != null)? chunkedOut : respond);
if (
(!transformer.isIdentityTransformer()) &&
(plasmaParser.supportedHTMLContent(url,responseHeader.mime()))
) {
// make a transformer
theLogger.logFine(reqID +" create transformer for URL " + url);
//hfos = new htmlFilterOutputStream((gzippedOut != null) ? gzippedOut : ((chunkedOut != null)? chunkedOut : respond), null, transformer, (ext.length() == 0));
final Charset charSet = responseHeader.getCharSet();
hfos = new htmlFilterWriter(outStream,charSet, null, transformer, (ext.length() == 0));
final boolean isBinary = isBinary(responseHeader);
if(isBinary) {
theLogger.logFine(reqID +" create direct passthrough for URL " + url + ", extension '" + ext + "', mime-type '" + responseHeader.mime() + "'");
} else {
// simply pass through without parsing
theLogger.logFine(reqID +" create passthrough for URL " + url + ", extension '" + ext + "', mime-type '" + responseHeader.mime() + "'");
hfos = new OutputStreamWriter(outStream, responseHeader.getCharSet());
// handle text stuff (encoding and so on)
if (
(!transformer.isIdentityTransformer()) &&
(plasmaParser.supportedHTMLContent(url,responseHeader.mime()))
) {
// make a transformer
theLogger.logFine(reqID +" create transformer for URL " + url);
//hfos = new htmlFilterOutputStream((gzippedOut != null) ? gzippedOut : ((chunkedOut != null)? chunkedOut : respond), null, transformer, (ext.length() == 0));
final Charset charSet = responseHeader.getCharSet();
textOutput = new htmlFilterWriter(outStream,charSet, null, transformer, (ext.length() == 0));
} else {
// simply pass through without parsing
theLogger.logFine(reqID +" create text passthrough for URL " + url + ", extension '" + ext + "', mime-type '" + responseHeader.mime() + "'");
textOutput = new OutputStreamWriter(outStream, responseHeader.getCharSet());
}
}

// handle incoming cookies
Expand Down Expand Up @@ -587,7 +617,12 @@ private static void fulfillRequestFromWeb(final Properties conProp, final yacyUR
{
// ok, we don't write actually into a file, only to RAM, and schedule writing the file.
final ByteArrayOutputStream byteStream = new ByteArrayOutputStream();
writeContent(res, new BufferedWriter(hfos), byteStream);
if(isBinary) {
final OutputStream toClientAndMemory = new MultiOutputStream(new OutputStream[] {outStream, byteStream});
serverFileUtils.copy(res.getDataAsStream(), toClientAndMemory);
} else {
writeTextContent(res, new BufferedWriter(textOutput), byteStream);
}
// cached bytes
byte[] cacheArray;
if(byteStream.size() > 0) {
Expand All @@ -597,7 +632,7 @@ private static void fulfillRequestFromWeb(final Properties conProp, final yacyUR
}
theLogger.logFine(reqID +" writeContent of " + url + " produced cacheArray = " + ((cacheArray == null) ? "null" : ("size=" + cacheArray.length)));

if (hfos instanceof htmlFilterWriter) ((htmlFilterWriter) hfos).close();
if (textOutput instanceof htmlFilterWriter) ((htmlFilterWriter) textOutput).close();

if (sizeBeforeDelete == -1) {
// totally fresh file
Expand All @@ -622,8 +657,14 @@ private static void fulfillRequestFromWeb(final Properties conProp, final yacyUR
// the file is too big to cache it in the ram, or the size is unknown
// write to file right here.
cacheFile.getParentFile().mkdirs();
writeContent(res, new BufferedWriter(hfos), new FileOutputStream(cacheFile));
if (hfos instanceof htmlFilterWriter) ((htmlFilterWriter) hfos).close();
final OutputStream fileStream = new FileOutputStream(cacheFile);
if(isBinary) {
OutputStream toClientAndFile = new MultiOutputStream(new OutputStream[] {outStream, fileStream});
serverFileUtils.copy(res.getDataAsStream(), toClientAndFile);
} else {
writeTextContent(res, new BufferedWriter(textOutput), fileStream);
}
if (textOutput instanceof htmlFilterWriter) ((htmlFilterWriter) textOutput).close();
theLogger.logFine(reqID +" for write-file of " + url + ": contentLength = " + contentLength + ", sizeBeforeDelete = " + sizeBeforeDelete);
plasmaHTCache.writeFileAnnouncement(cacheFile);
if (sizeBeforeDelete == -1) {
Expand Down Expand Up @@ -652,8 +693,14 @@ private static void fulfillRequestFromWeb(final Properties conProp, final yacyUR
" StoreHTCache=" + storeHTCache +
" SupportetContent=" + isSupportedContent);

writeContent(res, new BufferedWriter(hfos));
if (hfos instanceof htmlFilterWriter) ((htmlFilterWriter) hfos).close();
if(isBinary) {
// directly pass bytes to client
serverFileUtils.copy(res.getDataAsStream(), outStream);
} else {
// read data with specified encoding and send it as character stream
writeTextContent(res, new BufferedWriter(textOutput));
}
if (textOutput instanceof htmlFilterWriter) ((htmlFilterWriter) textOutput).close();
/*if (sizeBeforeDelete == -1) {
// no old file and no load. just data passing
//cacheEntry.status = plasmaHTCache.CACHE_PASSING;
Expand Down Expand Up @@ -701,7 +748,7 @@ private static void fulfillRequestFromCache(

final httpChunkedOutputStream chunkedOut = null;
final GZIPOutputStream gzippedOut = null;
Object hfos = null;
Writer textOutput = null;

// we respond on the request by using the cache, the cache is fresh
try {
Expand Down Expand Up @@ -754,20 +801,19 @@ private static void fulfillRequestFromCache(
if (( !transformer.isIdentityTransformer()) &&
(ext == null || !plasmaParser.supportedHTMLFileExtContains(url)) &&
(plasmaParser.HTMLParsableMimeTypesContains(cachedResponseHeader.mime()))) {
hfos = new htmlFilterWriter(outStream, charSet, null, transformer, (ext == null || ext.length() == 0));
} else {
hfos = outStream;
textOutput = new htmlFilterWriter(outStream, charSet, null, transformer, (ext == null || ext.length() == 0));
}

// send also the complete body now from the cache
// simply read the file and transfer to out socket
if (hfos instanceof OutputStream) {
serverFileUtils.copy(cacheFile,(OutputStream)hfos);
} else if (hfos instanceof Writer) {
serverFileUtils.copy(cacheFile,charSet,(Writer)hfos);
if(textOutput != null && !isBinary(cachedResponseHeader)) {
// send as encoded text
serverFileUtils.copy(cacheFile, charSet, textOutput);
} else {
serverFileUtils.copy(cacheFile, outStream);
}

if (hfos instanceof htmlFilterWriter) ((htmlFilterWriter) hfos).close();
if (textOutput != null) textOutput.close();
if (gzippedOut != null) gzippedOut.finish();
if (chunkedOut != null) chunkedOut.finish();
}
Expand All @@ -787,24 +833,63 @@ private static void fulfillRequestFromCache(
return;
}

public static void writeContent(final JakartaCommonsHttpResponse res, final BufferedWriter hfos) throws IOException, UnsupportedEncodingException {
/**
 * Determines from the Content-Type header whether the body is binary data
 * (images, audio, video, octet-streams) rather than readable text.
 *
 * @param responseHeader header of the response to inspect; mime() must not be null
 * @return true if the content-type denotes a known binary format
 */
private static boolean isBinary(final httpResponseHeader responseHeader) {
    // RFC 2045: "Matching of media type and subtype is ALWAYS case-insensitive."
    // Locale.ROOT avoids locale-dependent case folding (e.g. the Turkish dotless i),
    // so "IMAGE/PNG" is matched correctly regardless of the JVM's default locale.
    String mime = responseHeader.mime().toLowerCase(java.util.Locale.ROOT);
    // cut off parameters such as "; charset=UTF-8"
    final int paramsStart = mime.indexOf(';');
    if (paramsStart >= 0) {
        mime = mime.substring(0, paramsStart);
    }
    // complete "type/subtype" match, e.g. "application/octet-stream"
    if (binaryContent.contains(mime)) {
        return true;
    }
    // top-level type match, e.g. "image" of "image/png"
    final int slash = mime.indexOf('/');
    final String type = (slash >= 0) ? mime.substring(0, slash) : mime;
    return binaryTypes.contains(type);
}

/**
 * reads the body of res with the response's charset and writes it as a
 * character stream to output; the response stream is always closed afterwards
 *
 * @param res the HTTP response whose body is transferred
 * @param output character target the decoded body is written to
 * @throws IOException if reading the body or writing to output fails
 */
public static void writeTextContent(final JakartaCommonsHttpResponse res, final BufferedWriter output) throws IOException {
    try {
        final InputStream data = res.getDataAsStream();
        // a response without a body (e.g. 304) produces no output at all
        if (data == null) return;
        final Charset charSet = res.getResponseHeader().getCharSet();
        serverFileUtils.copyToWriter(new BufferedInputStream(data), output, charSet);
    } finally {
        // release the underlying connection/stream even when no body was written
        res.closeStream();
    }
}

public static void writeContent(final JakartaCommonsHttpResponse res, final BufferedWriter hfos, final OutputStream byteStream) throws IOException, UnsupportedEncodingException {
/**
* ready the body of res with charSet and write it to output and parallel encoded with charSet to byteStream
*
* @param res
* @param output
* @param byteStream
* @throws IOException
*/
public static void writeTextContent(final JakartaCommonsHttpResponse res, final BufferedWriter output, final OutputStream byteStream) throws IOException {
assert byteStream != null;
try {
final InputStream data = res.getDataAsStream();
if (data == null) return;
final Charset charSet = res.getResponseHeader().getCharSet();
serverFileUtils.copyToWriters(new BufferedInputStream(data), hfos, new BufferedWriter(new OutputStreamWriter(byteStream, charSet)) , charSet);
serverFileUtils.copyToWriters(new BufferedInputStream(data), output, new BufferedWriter(new OutputStreamWriter(byteStream, charSet)) , charSet);
} finally {
res.closeStream();
}
Expand Down Expand Up @@ -1031,7 +1116,7 @@ public static void doPost(final Properties conProp, final httpRequestHeader requ
}
if (chunked != null) chunked.finish();
*/
writeContent(res, new BufferedWriter(new OutputStreamWriter((chunked != null) ? chunked : countedRespond)));
writeTextContent(res, new BufferedWriter(new OutputStreamWriter((chunked != null) ? chunked : countedRespond)));

countedRespond.flush();
} finally {
Expand Down
10 changes: 9 additions & 1 deletion source/de/anomic/plasma/plasmaSwitchboard.java
Expand Up @@ -980,7 +980,7 @@ public boolean htEntryStoreProcess(final indexDocumentMetadata entry) {
* Testing if the content type is supported by the available parsers
* ========================================================================= */
final boolean isSupportedContent = plasmaParser.supportedContent(entry.url(),entry.getMimeType());
log.logFinest(entry.url() +" content of type "+ entry.getMimeType() +" is supported: "+ isSupportedContent);
log.logFinest("STORE "+ entry.url() +" content of type "+ entry.getMimeType() +" is supported: "+ isSupportedContent);

/* =========================================================================
* INDEX CONTROL HEADER
Expand Down Expand Up @@ -1516,6 +1516,10 @@ public boolean crawlJobIsPaused(final String jobType) {

public indexingQueueEntry parseDocument(final indexingQueueEntry in) {
in.queueEntry.updateStatus(IndexingStack.QUEUE_STATE_PARSING);

// debug
log.logFinest("PARSE "+ in.queueEntry.toString());

plasmaParserDocument document = null;
try {
document = parseDocument(in.queueEntry);
Expand Down Expand Up @@ -1595,6 +1599,10 @@ private plasmaParserDocument parseDocument(final IndexingStack.QueueEntry entry)

public indexingQueueEntry condenseDocument(final indexingQueueEntry in) {
in.queueEntry.updateStatus(IndexingStack.QUEUE_STATE_CONDENSING);

// debug
log.logFinest("CONDENSE "+ in.queueEntry.toString());

plasmaCondenser condenser = null;
try {
condenser = condenseDocument(in.queueEntry, in.document);
Expand Down

0 comments on commit 9ff4fc1

Please sign in to comment.