Skip to content

Commit

Permalink
- enhanced logging and exception details for parsers
Browse files Browse the repository at this point in the history
- removed inconsistencies in mime type declaration (one mime type should only appear once in all parsers)

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@6192 6c8d7289-2bf4-0310-a012-ef5d649a1542
  • Loading branch information
orbiter committed Jul 10, 2009
1 parent 4b74ad0 commit caedd72
Show file tree
Hide file tree
Showing 15 changed files with 61 additions and 93 deletions.
27 changes: 14 additions & 13 deletions source/de/anomic/document/Parser.java
Expand Up @@ -81,7 +81,7 @@ public final class Parser {
initParser(new docParser());
initParser(new gzipParser());
initParser(new htmlParser());
initParser(new mimeTypeParser());
//initParser(new mimeTypeParser()); // what does that thing do?
initParser(new odtParser());
initParser(new pdfParser());
initParser(new pptParser());
Expand All @@ -107,9 +107,9 @@ public static Set<Idiom> idioms() {
private static void initParser(Idiom parser) {
for (Map.Entry<String, String> e: parser.getSupportedMimeTypes().entrySet()) {
// process the mime types
final String mimeType = e.getKey();
final String mimeType = normalizeMimeType(e.getKey());
Idiom p0 = mime2parser.get(mimeType);
if (p0 != null) log.logSevere("parser for mime '" + mimeType + "' was set to '" + p0.getName() + "', overwriting with new parser.");
if (p0 != null) log.logSevere("parser for mime '" + mimeType + "' was set to '" + p0.getName() + "', overwriting with new parser '" + parser.getName() + "'.");
mime2parser.put(mimeType, parser);
Log.logInfo("PARSER", "Parser for mime type '" + mimeType + "': " + parser.getName());

Expand All @@ -134,15 +134,15 @@ public static Document parseSource(final yacyURL location,
if (sourceArray == null || sourceArray.length == 0) {
final String errorMsg = "No resource content available (1) " + (((sourceArray == null) ? "source == null" : "source.length() == 0") + ", url = " + location.toNormalform(true, false));
log.logInfo("Unable to parse '" + location + "'. " + errorMsg);
throw new ParserException(errorMsg, location, errorMsg);
throw new ParserException(errorMsg, location);
}
byteIn = new ByteArrayInputStream(sourceArray);
return parseSource(location, mimeType, charset, sourceArray.length, byteIn);
} catch (final Exception e) {
if (e instanceof InterruptedException) throw (InterruptedException) e;
if (e instanceof ParserException) throw (ParserException) e;
log.logSevere("Unexpected exception in parseSource from byte-array: " + e.getMessage(), e);
throw new ParserException("Unexpected exception while parsing " + location, location, e);
throw new ParserException("Unexpected exception: " + e.getMessage(), location);
} finally {
if (byteIn != null) try {
byteIn.close();
Expand All @@ -160,15 +160,15 @@ public static Document parseSource(final yacyURL location,
if (!(sourceFile.exists() && sourceFile.canRead() && sourceFile.length() > 0)) {
final String errorMsg = sourceFile.exists() ? "Empty resource file." : "No resource content available (2).";
log.logInfo("Unable to parse '" + location + "'. " + errorMsg);
throw new ParserException(errorMsg, location, "document has no content");
throw new ParserException(errorMsg, location);
}
sourceStream = new BufferedInputStream(new FileInputStream(sourceFile));
return parseSource(location, mimeType, charset, sourceFile.length(), sourceStream);
} catch (final Exception e) {
if (e instanceof InterruptedException) throw (InterruptedException) e;
if (e instanceof ParserException) throw (ParserException) e;
log.logSevere("Unexpected exception in parseSource from File: " + e.getMessage(), e);
throw new ParserException("Unexpected exception while parsing " + location, location, e);
throw new ParserException("Unexpected exception: " + e.getMessage(), location);
} finally {
if (sourceStream != null)try {
sourceStream.close();
Expand All @@ -188,12 +188,12 @@ public static Document parseSource(final yacyURL location,
if (!supportsMime(mimeType)) {
final String errorMsg = "No parser available to parse mimetype '" + mimeType + "'";
log.logInfo("Unable to parse '" + location + "'. " + errorMsg);
throw new ParserException(errorMsg, location, "wrong mime type");
throw new ParserException(errorMsg, location);
}
if (!supportsExtension(location)) {
final String errorMsg = "No parser available to parse extension of url path";
log.logInfo("Unable to parse '" + location + "'. " + errorMsg);
throw new ParserException(errorMsg, location, "wrong extension");
throw new ParserException(errorMsg, location);
}
if (log.isFine()) log.logInfo("Parsing " + location + " with mimeType '" + mimeType + "' and file extension '" + fileExt + "'.");
Idiom parser = mime2parser.get(normalizeMimeType(mimeType));
Expand All @@ -204,7 +204,7 @@ public static Document parseSource(final yacyURL location,
} else {
final String errorMsg = "No parser available to parse mimetype '" + mimeType + "' (2)";
log.logInfo("Unable to parse '" + location + "'. " + errorMsg);
throw new ParserException(errorMsg, location, "wrong mime type or wrong extension");
throw new ParserException(errorMsg, location);
}
if (doc == null) {
final String errorMsg = "Unexpected error. Parser returned null.";
Expand All @@ -217,11 +217,12 @@ public static Document parseSource(final yacyURL location,
if (e instanceof ParserException) throw (ParserException) e;
final String errorMsg = "Unexpected exception. " + e.getMessage();
log.logSevere("Unable to parse '" + location + "'. " + errorMsg, e);
throw new ParserException(errorMsg, location, e);
throw new ParserException(errorMsg, location);
}
}

/**
 * Tells whether a document with the given mime type can be handed to a parser.
 * The mime type is passed through normalizeMimeType before any lookup.
 *
 * @param mimeType the mime type to check
 * @return true if the normalized mime type is not contained in denyMime and
 *         mime2parser has an entry for it; false otherwise
 */
public static boolean supportsMime(String mimeType) {
final String normalized = normalizeMimeType(mimeType);
// a denied mime type is rejected before the parser table is consulted
if (denyMime.contains(normalized)) return false;
return mime2parser.containsKey(normalizeMimeType(normalized));
}

Expand Down Expand Up @@ -249,7 +250,7 @@ private static String normalizeMimeType(String mimeType) {

public static void setDenyMime(String denyList) {
denyMime.clear();
for (String s: denyList.split(",")) denyMime.add(s);
for (String s: denyList.split(",")) denyMime.add(normalizeMimeType(s));
}

public static String getDenyMime() {
Expand All @@ -260,6 +261,6 @@ public static String getDenyMime() {
}

public static void grantMime(String mime, boolean grant) {
if (grant) denyMime.remove(mime); else denyMime.add(mime);
if (grant) denyMime.remove(normalizeMimeType(mime)); else denyMime.add(normalizeMimeType(mime));
}
}
25 changes: 2 additions & 23 deletions source/de/anomic/document/ParserException.java
Expand Up @@ -26,9 +26,7 @@

import de.anomic.yacy.yacyURL;

public class ParserException extends Exception
{
private String errorCode = null;
public class ParserException extends Exception {
private yacyURL url = null;

private static final long serialVersionUID = 1L;
Expand All @@ -38,28 +36,9 @@ public ParserException() {
}

public ParserException(final String message, final yacyURL url) {
this(message,url, "parser error for url " + url.toString());
}

public ParserException(final String message, final yacyURL url, final String errorCode) {
super(message);
this.errorCode = errorCode;
this.url = url;
}

public ParserException(final String message, final yacyURL url, final Throwable cause) {
this(message,url,cause, "parser error for url " + url.toString());
}

public ParserException(final String message, final yacyURL url, final Throwable cause, final String errorCode) {
super(message, cause);
this.errorCode = errorCode;
super(message + "; url = " + url.toNormalform(true, false));
this.url = url;
}

public String getErrorCode() {
return this.errorCode;
}

public yacyURL getURL() {
return this.url;
Expand Down
4 changes: 1 addition & 3 deletions source/de/anomic/document/parser/bzipParser.java
Expand Up @@ -52,10 +52,8 @@ public class bzipParser extends AbstractParser implements Idiom {
static {
SUPPORTED_MIME_TYPES.put("application/x-bzip2",fileExtensions);
SUPPORTED_MIME_TYPES.put("application/bzip2", fileExtensions);
SUPPORTED_MIME_TYPES.put("application/x-bz2", fileExtensions);
SUPPORTED_MIME_TYPES.put("application/octet-stream",fileExtensions);
SUPPORTED_MIME_TYPES.put("application/x-bz2", fileExtensions);
SUPPORTED_MIME_TYPES.put("application/x-bzip",fileExtensions);
SUPPORTED_MIME_TYPES.put("application/x-compressed",fileExtensions);
SUPPORTED_MIME_TYPES.put("application/x-stuffit",fileExtensions);
}

Expand Down
58 changes: 32 additions & 26 deletions source/de/anomic/document/parser/docParser.java
Expand Up @@ -27,7 +27,9 @@

package de.anomic.document.parser;

import java.io.IOException;
import java.io.InputStream;
import java.io.UnsupportedEncodingException;
import java.util.HashMap;
import org.textmining.extraction.TextExtractor;
import org.textmining.extraction.word.WordTextExtractorFactory;
Expand Down Expand Up @@ -62,23 +64,31 @@ public docParser() {
super("Word Document Parser");
}

public Document parse(final yacyURL location, final String mimeType, final String charset,
final InputStream source) throws ParserException, InterruptedException {


try {
final WordTextExtractorFactory extractorFactory = new WordTextExtractorFactory();
final TextExtractor extractor = extractorFactory.textExtractor(source);
final String contents = extractor.getText().trim();
String title = contents.replaceAll("\r"," ").replaceAll("\n"," ").replaceAll("\t"," ").trim();
if (title.length() > 80) title = title.substring(0, 80);
int l = title.length();
while (true) {
title = title.replaceAll(" ", " ");
if (title.length() == l) break;
l = title.length();
}
final Document theDoc = new Document(
public Document parse(final yacyURL location, final String mimeType, final String charset, final InputStream source) throws ParserException, InterruptedException {
final WordTextExtractorFactory extractorFactory = new WordTextExtractorFactory();
TextExtractor extractor = null;
try {
extractor = extractorFactory.textExtractor(source);
} catch (Exception e) {
throw new ParserException("error in docParser, WordTextExtractorFactory: " + e.getMessage(), location);
}
String contents = null;
try {
contents = extractor.getText().trim();
} catch (IOException e) {
throw new ParserException("error in docParser, getText: " + e.getMessage(), location);
}
String title = contents.replaceAll("\r"," ").replaceAll("\n"," ").replaceAll("\t"," ").trim();
if (title.length() > 80) title = title.substring(0, 80);
int l = title.length();
while (true) {
title = title.replaceAll(" ", " ");
if (title.length() == l) break;
l = title.length();
}
Document theDoc;
try {
theDoc = new Document(
location,
mimeType,
"UTF-8",
Expand All @@ -91,15 +101,11 @@ public Document parse(final yacyURL location, final String mimeType, final Strin
contents.getBytes("UTF-8"),
null,
null);

return theDoc;
} catch (final Exception e) {
e.printStackTrace();
if (e instanceof InterruptedException) throw (InterruptedException) e;
if (e instanceof ParserException) throw (ParserException) e;

throw new ParserException("Unexpected error while parsing doc file. " + e.getMessage(),location);
}
} catch (UnsupportedEncodingException e) {
throw new ParserException("error in docParser, getBytes: " + e.getMessage(), location);
}

return theDoc;
}

public HashMap<String, String> getSupportedMimeTypes() {
Expand Down
1 change: 0 additions & 1 deletion source/de/anomic/document/parser/gzipParser.java
Expand Up @@ -59,7 +59,6 @@ public class gzipParser extends AbstractParser implements Idiom {
SUPPORTED_MIME_TYPES.put("application/x-compress",ext);
SUPPORTED_MIME_TYPES.put("gzip/document",ext);
SUPPORTED_MIME_TYPES.put("application/octet-stream",ext);
SUPPORTED_MIME_TYPES.put("application/x-tar",ext);
}

public gzipParser() {
Expand Down
2 changes: 1 addition & 1 deletion source/de/anomic/document/parser/htmlParser.java
Expand Up @@ -50,7 +50,7 @@ public class htmlParser extends AbstractParser implements Idiom {
*/
public static final HashMap<String, String> SUPPORTED_MIME_TYPES = new HashMap<String, String>();
static {
String ext = "htm,html,shtml,xhtml,php,asp,aspx,txt,jsp";
String ext = "htm,html,shtml,xhtml,php,asp,aspx,txt,jsp,csv";
SUPPORTED_MIME_TYPES.put("application/xhtml+xml", ext);
SUPPORTED_MIME_TYPES.put("text/html", ext);
SUPPORTED_MIME_TYPES.put("text/plain", ext);
Expand Down
3 changes: 0 additions & 3 deletions source/de/anomic/document/parser/mimeTypeParser.java
Expand Up @@ -60,9 +60,6 @@ public class mimeTypeParser extends AbstractParser implements Idiom {
SUPPORTED_MIME_TYPES.put("text/xml","xml");
SUPPORTED_MIME_TYPES.put("application/xml","xml");
SUPPORTED_MIME_TYPES.put("application/x-xml","xml");
SUPPORTED_MIME_TYPES.put("application/octet-stream","xml");
SUPPORTED_MIME_TYPES.put("application/x-compress","xml");
SUPPORTED_MIME_TYPES.put("application/x-compressed","xml");
}

/**
Expand Down
4 changes: 2 additions & 2 deletions source/de/anomic/document/parser/odtParser.java
Expand Up @@ -65,8 +65,8 @@ public class odtParser extends AbstractParser implements Idiom {
*/
public static final HashMap<String, String> SUPPORTED_MIME_TYPES = new HashMap<String, String>();
static {
SUPPORTED_MIME_TYPES.put("application/vnd.oasis.opendocument.text","odt");
SUPPORTED_MIME_TYPES.put("application/x-vnd.oasis.opendocument.text","odt");
SUPPORTED_MIME_TYPES.put("application/vnd.oasis.opendocument.text","odt,ods,odp");
SUPPORTED_MIME_TYPES.put("application/x-vnd.oasis.opendocument.text","odt,ods,odp");
}

public odtParser() {
Expand Down
2 changes: 1 addition & 1 deletion source/de/anomic/document/parser/pdfParser.java
Expand Up @@ -107,7 +107,7 @@ public Document parse(final yacyURL location, final String mimeType, final Strin
theDocument.openProtection(new StandardDecryptionMaterial(""));
final AccessPermission perm = theDocument.getCurrentAccessPermission();
if (perm == null || !perm.canExtractContent())
throw new ParserException("Document is encrypted",location, "document is exncrypted");
throw new ParserException("Document is encrypted", location);
}

// extracting some metadata
Expand Down
2 changes: 1 addition & 1 deletion source/de/anomic/document/parser/pptParser.java
Expand Up @@ -45,7 +45,7 @@ public class pptParser extends AbstractParser implements Idiom {
* @see #getSupportedMimeTypes()
*/
public static final HashMap<String, String> SUPPORTED_MIME_TYPES = new HashMap<String, String>();
static final String ext = "ppt,pps";
static final String ext = "ppt,pptx,pps";
static {
SUPPORTED_MIME_TYPES.put("application/mspowerpoint",ext);
SUPPORTED_MIME_TYPES.put("application/powerpoint",ext);
Expand Down
2 changes: 1 addition & 1 deletion source/de/anomic/document/parser/psParser.java
Expand Up @@ -282,7 +282,7 @@ public Document parse(final yacyURL location, final String mimeType, final Strin
if (e instanceof InterruptedException) throw (InterruptedException) e;
if (e instanceof ParserException) throw (ParserException) e;

throw new ParserException("Unable to parse the ps file. " + e.getMessage(),location, e);
throw new ParserException("Unable to parse the ps file. " + e.getMessage(), location);
} finally {
if (tempFile != null) FileUtils.deletedelete(tempFile);
}
Expand Down
2 changes: 0 additions & 2 deletions source/de/anomic/document/parser/rtfParser.java
Expand Up @@ -50,8 +50,6 @@ public class rtfParser extends AbstractParser implements Idiom {
SUPPORTED_MIME_TYPES.put("text/rtf","rtf");
SUPPORTED_MIME_TYPES.put("application/x-rtf","rtf");
SUPPORTED_MIME_TYPES.put("text/richtext","rtf");
SUPPORTED_MIME_TYPES.put("application/msword","rtf");
SUPPORTED_MIME_TYPES.put("application/doc","rtf");
SUPPORTED_MIME_TYPES.put("application/x-soffice","rtf");
}

Expand Down
10 changes: 5 additions & 5 deletions source/de/anomic/document/parser/sevenzipParser.java
Expand Up @@ -72,7 +72,7 @@ public Document parse(final yacyURL location, final String mimeType, final Strin
try {
archive = new Handler(source);
} catch (final IOException e) {
throw new ParserException("error opening 7zip archive", location, e);
throw new ParserException("error opening 7zip archive: " + e.getMessage(), location);
}
checkInterruption();
final SZParserExtractCallback aec = new SZParserExtractCallback(super.theLogger, archive,
Expand All @@ -87,8 +87,8 @@ public Document parse(final yacyURL location, final String mimeType, final Strin
if (e.getCause() instanceof ParserException)
throw (ParserException)e.getCause();
throw new ParserException(
"error processing 7zip archive at internal file: " + aec.getCurrentFilePath(),
location, e);
"error processing 7zip archive at internal file " + aec.getCurrentFilePath() + ": " + e.getMessage(),
location);
} finally {
try { archive.close(); } catch (final IOException e) { }
}
Expand All @@ -106,7 +106,7 @@ public Document parse(final yacyURL location, final String mimeType, final Strin
try {
return parse(location, mimeType, charset, new MyRandomAccessFile(sourceFile, "r"), Idiom.MAX_KEEP_IN_MEMORY_SIZE);
} catch (final IOException e) {
throw new ParserException("error processing 7zip archive", location, e);
throw new ParserException("error processing 7zip archive: " + e.getMessage(), location);
}
}

Expand All @@ -120,7 +120,7 @@ public Document parse(final yacyURL location, final String mimeType, final Strin
}
return parse(location, mimeType, charset, cfos.getContentBAOS());
} catch (final IOException e) {
throw new ParserException("error processing 7zip archive", location, e);
throw new ParserException("error processing 7zip archive: " + e.getMessage(), location);
}
}

Expand Down
2 changes: 0 additions & 2 deletions source/de/anomic/document/parser/tarParser.java
Expand Up @@ -64,8 +64,6 @@ public class tarParser extends AbstractParser implements Idiom {
SUPPORTED_MIME_TYPES.put("application/tar","tar");
SUPPORTED_MIME_TYPES.put("applicaton/x-gtar","tar");
SUPPORTED_MIME_TYPES.put("multipart/x-tar","tar");
SUPPORTED_MIME_TYPES.put("application/x-compress","tar");
SUPPORTED_MIME_TYPES.put("application/x-compressed","tar");
}

public tarParser() {
Expand Down

0 comments on commit caedd72

Please sign in to comment.