Skip to content

Commit

Permalink
another update to the pdf parser
Browse files (browse the repository at this point in the history)
git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@6463 6c8d7289-2bf4-0310-a012-ef5d649a1542
  • Loading branch information
orbiter committed Nov 6, 2009
1 parent 54c54fb commit 08f1cbb
Showing 1 changed file with 10 additions and 20 deletions.
30 changes: 10 additions & 20 deletions source/net/yacy/document/parser/pdfParser.java
Expand Up @@ -88,31 +88,22 @@ public Set<String> supportedExtensions() {

public Document parse(final DigestURI location, final String mimeType, final String charset, final InputStream source) throws ParserException, InterruptedException {

PDDocument theDocument = null;
Writer writer = null;
File writerFile = null;

String docTitle = null, docSubject = null, docAuthor = null, docKeywordStr = null;

// check for interruption
checkInterruption();

// creating a pdf parser
// create a pdf parser
final PDDocument theDocument;
final PDFParser parser;
final PDFTextStripper stripper;
try {
Thread.currentThread().setPriority(Thread.MIN_PRIORITY);
parser = new PDFParser(source);
parser.parse();
checkInterruption();
stripper = new PDFTextStripper();
theDocument = parser.getPDDocument();
} catch (IOException e) {
Log.logException(e);
throw new ParserException(e.getMessage(), location);
} finally {
Thread.currentThread().setPriority(Thread.NORM_PRIORITY);
}

checkInterruption();

if (theDocument.isEncrypted()) {
try {
Expand All @@ -134,13 +125,16 @@ public Document parse(final DigestURI location, final String mimeType, final Str

// extracting some metadata
final PDDocumentInformation theDocInfo = theDocument.getDocumentInformation();
String docTitle = null, docSubject = null, docAuthor = null, docKeywordStr = null;
if (theDocInfo != null) {
docTitle = theDocInfo.getTitle();
docSubject = theDocInfo.getSubject();
docAuthor = theDocInfo.getAuthor();
docKeywordStr = theDocInfo.getKeywords();
}

Writer writer = null;
File writerFile = null;
try {
// creating a writer for output
if ((this.contentLength == -1) || (this.contentLength > Idiom.MAX_KEEP_IN_MEMORY_SIZE)) {
Expand All @@ -149,13 +143,9 @@ public Document parse(final DigestURI location, final String mimeType, final Str
} else {
writer = new CharBuffer();
}
try {
stripper.writeText(theDocument, writer ); // may throw a NPE
} catch (Exception e) {
Log.logException(e);
Log.logWarning("pdfParser", e.getMessage());
}
theDocument.close(); theDocument = null;
final PDFTextStripper stripper = new PDFTextStripper();
stripper.writeText(theDocument, writer); // may throw a NPE
theDocument.close();
writer.close();
} catch (IOException e) {
Log.logException(e);
Expand Down

0 comments on commit 08f1cbb

Please sign in to comment.