Skip to content

Commit

Permalink
Updated pdf cache clear steps consistently with current pdfbox version
Browse files Browse the repository at this point in the history
- Removed calls to clearResources functions (on the PDFont class and its
children) that no longer exist since the upgrade to pdfbox 2.n.n
- Removed hacky usage of protected internal ClassLoader function. This
removes the warnings displayed when running with JDK9 or JDK10 :

     [java] WARNING: Illegal reflective access by
net.yacy.document.parser.pdfParser$ResourceCleaner (file:<path>) to
method java.lang.ClassLoader.findLoadedClass(java.lang.String)
     [java] WARNING: Please consider reporting this to the maintainers
of net.yacy.document.parser.pdfParser$ResourceCleaner
     [java] WARNING: Use --illegal-access=warn to enable warnings of
further illegal reflective access operations
     [java] WARNING: All illegal access operations will be denied in a
future release

Crawling thousands of pdf documents from various sources after applying
these modifications revealed no new memory leak related to pdfbox
(measurements done with JVisualVM).
  • Loading branch information
luccioman committed Aug 16, 2018
1 parent 6851223 commit 54fbe16
Show file tree
Hide file tree
Showing 2 changed files with 31 additions and 64 deletions.
93 changes: 30 additions & 63 deletions source/net/yacy/document/parser/pdfParser.java
Expand Up @@ -32,14 +32,14 @@
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStream;
import java.lang.reflect.Method;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Date;
import java.util.HashSet;
import java.util.List;

import org.apache.pdfbox.cos.COSName;
import org.apache.pdfbox.io.MemoryUsageSetting;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.PDDocumentInformation;
Expand Down Expand Up @@ -81,10 +81,6 @@ public pdfParser() {
this.SUPPORTED_MIME_TYPES.add("text/x-pdf");
}

// Static initializer: runs the PDFBox cache clean-up once when this class is initialized.
static {
clean_up_idiotic_PDFParser_font_cache_which_eats_up_tons_of_megabytes(); // must be called here to get that into the class loader; it will block other threads otherwise;
}

@Override
public Document[] parse(
final DigestURL location,
Expand Down Expand Up @@ -249,18 +245,9 @@ public void run() {
try {pdfDoc.close();} catch (final Throwable e) {}
}

// clear resources in pdfbox. they say that is resolved but it's not. see:
// https://issues.apache.org/jira/browse/PDFBOX-313
// https://issues.apache.org/jira/browse/PDFBOX-351
// https://issues.apache.org/jira/browse/PDFBOX-441
// pdfbox still generates an enormous number of object allocations and doesn't delete them
// the following objects are statically stored and never flushed:
// COSFloat, COSArray, COSInteger, COSObjectKey, COSObject, COSDictionary,
// COSStream, COSString, COSName, COSDocument, COSInteger[], COSNull
// the great number of these objects can easily be seen in Java Visual VM
// we try to get this shit out of the memory here by forced clear calls, hope the best the rubbish gets out.
// clear cached resources in pdfbox.
pdfDoc = null;
clean_up_idiotic_PDFParser_font_cache_which_eats_up_tons_of_megabytes();
clearPdfBoxCaches();

return result;
}
Expand Down Expand Up @@ -295,55 +282,35 @@ private List<Collection<AnchorURL>> extractPdfLinks(final PDDocument pdf) {
return linkCollections;
}

public static void clean_up_idiotic_PDFParser_font_cache_which_eats_up_tons_of_megabytes() {
// thank you very much, PDFParser hackers, this font cache will occupy >80MB RAM for a single pdf and then stays forever
// AND I DO NOT EVEN NEED A FONT HERE TO PARSE THE TEXT!
// Don't be so ignorant, just google once "PDFParser OutOfMemoryError" to feel the pain.
ResourceCleaner cl = new ResourceCleaner();
cl.clearClassResources("org.apache.pdfbox.cos.COSName");
cl.clearClassResources("org.apache.pdfbox.pdmodel.font.PDFont");
cl.clearClassResources("org.apache.pdfbox.pdmodel.font.PDType1Font");
cl.clearClassResources("org.apache.pdfbox.pdmodel.font.PDTrueTypeFont");
cl.clearClassResources("org.apache.pdfbox.pdmodel.font.PDType0Font");
cl.clearClassResources("org.apache.pdfbox.pdmodel.font.PDType1AfmPfbFont");
cl.clearClassResources("org.apache.pdfbox.pdmodel.font.PDType3Font");
cl.clearClassResources("org.apache.pdfbox.pdmodel.font.PDType1CFont");
cl.clearClassResources("org.apache.pdfbox.pdmodel.font.PDCIDFont");
cl.clearClassResources("org.apache.pdfbox.pdmodel.font.PDCIDFontType0Font");
cl.clearClassResources("org.apache.pdfbox.pdmodel.font.PDCIDFontType2Font");
cl.clearClassResources("org.apache.pdfbox.pdmodel.font.PDMMType1Font");
cl.clearClassResources("org.apache.pdfbox.pdmodel.font.PDSimpleFont");
/**
* Clean up cache resources allocated by PDFBox that would otherwise not be released.
* <p>
* Currently this only calls {@code COSName.clearResources()}.
*/
public static void clearPdfBoxCaches() {
/*
* Prior to pdfbox 2.0.0, the font cache occupied > 80MB RAM for a single pdf
* and then stayed forever (detected in YaCy with pdfbox version 1.2.1). The
* situation is now far better, but one (unnecessary?) cache structure in
* the COSName class still needs to be explicitly cleared.
*/

// History of related issues:
// http://markmail.org/thread/quk5odee4hbsauhu
// https://issues.apache.org/jira/browse/PDFBOX-313
// https://issues.apache.org/jira/browse/PDFBOX-351
// https://issues.apache.org/jira/browse/PDFBOX-441
// https://issues.apache.org/jira/browse/PDFBOX-2200
// https://issues.apache.org/jira/browse/PDFBOX-2149

COSName.clearResources();

/*
* Prior to PDFBox 2.0.0, the clearResources() function had to be called on the
* org.apache.pdfbox.pdmodel.font.PDFont class and its children. Since version
* 2.0.0, the PDFont class no longer has such a function, as the font cache is
* handled differently and hopefully more properly.
*/
}

/**
* Best-effort helper that calls the static {@code clearResources()} method of
* classes already loaded by the system class loader, located by class name.
* <p>
* It relies on reflective access to the protected
* {@code ClassLoader.findLoadedClass(String)} method. If that access cannot be
* obtained, every call to {@link #clearClassResources(String)} is a no-op.
*/
@SuppressWarnings({ "unchecked", "rawtypes" })
private static class ResourceCleaner {
// Reflective handle on ClassLoader.findLoadedClass(String); null when the lookup failed.
Method findLoadedClass;
// System class loader that findLoadedClass is invoked on; null when the lookup failed.
private ClassLoader sys;
/**
* Looks up {@code ClassLoader.findLoadedClass(String)}, makes it accessible and
* stores the system class loader. On any failure the stack trace is printed and
* both fields are reset to null.
*/
public ResourceCleaner() {
try {
this.findLoadedClass = ClassLoader.class.getDeclaredMethod("findLoadedClass", new Class[] { String.class });
this.findLoadedClass.setAccessible(true);
this.sys = ClassLoader.getSystemClassLoader();
} catch (Throwable e) {
e.printStackTrace();
this.findLoadedClass = null;
this.sys = null;
}
}
/**
* Invokes the static no-argument {@code clearResources()} method declared by the
* named class, provided the system class loader reports that class as loaded.
* Does nothing when the reflective handle is unavailable or the class is not
* loaded; any error raised along the way is silently ignored.
*
* @param name fully qualified name of the class whose resources should be cleared
*/
public void clearClassResources(String name) {
// reflective setup failed in the constructor: nothing can be done
if (this.findLoadedClass == null) return;
try {
// only look at classes that are already loaded; this never triggers class loading
Object pdfparserpainclass = this.findLoadedClass.invoke(this.sys, name);
if (pdfparserpainclass != null) {
Method clearResources = ((Class) pdfparserpainclass).getDeclaredMethod("clearResources", new Class[] {});
// NOTE(review): getDeclaredMethod throws NoSuchMethodException instead of returning null, so this check looks redundant
if (clearResources != null) clearResources.invoke(null);
}
} catch (Throwable e) {
// deliberately ignored: clean-up is best effort (e.g. the class has no clearResources method)
//e.printStackTrace();
}
}
}

/**
* test
* @param args
Expand Down
2 changes: 1 addition & 1 deletion source/net/yacy/search/Switchboard.java
Expand Up @@ -2454,7 +2454,7 @@ public boolean surrogateProcess() {

public static void clearCaches() {
// flush caches in used libraries
pdfParser.clean_up_idiotic_PDFParser_font_cache_which_eats_up_tons_of_megabytes(); // eats up megabytes, see http://markmail.org/thread/quk5odee4hbsauhu
pdfParser.clearPdfBoxCaches();

// clear caches
if (WordCache.sizeCommonWords() > 1000) WordCache.clearCommonWords();
Expand Down

0 comments on commit 54fbe16

Please sign in to comment.