
upgraded pdfbox to 3.0.0
Orbiter committed Oct 27, 2023
1 parent c10944b commit 5ba5fb5
Showing 3 changed files with 46 additions and 43 deletions.
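The substantive change behind this commit is an API migration: PDFBox 3.0 removes the static PDDocument.load(...) factory methods and makes org.apache.pdfbox.Loader the single entry point for opening documents. A minimal sketch of the new pattern (illustrative only, not code from this repository; the file name is a placeholder):

import java.io.File;
import java.io.IOException;

import org.apache.pdfbox.Loader;
import org.apache.pdfbox.pdmodel.PDDocument;

public class LoaderSketch {
    public static void main(final String[] args) throws IOException {
        // PDFBox 2.x: PDDocument doc = PDDocument.load(new File("example.pdf"));
        // PDFBox 3.x: documents are opened through the Loader facade instead.
        try (final PDDocument doc = Loader.loadPDF(new File("example.pdf"))) {
            System.out.println("pages: " + doc.getNumberOfPages());
        }
    }
}

The three files below follow this pattern: the Ivy dependency is bumped, and both call sites (Html2Image and pdfParser) are ported to the Loader API.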
2 changes: 1 addition & 1 deletion ivy.xml
@@ -55,7 +55,7 @@
<dependency org="org.apache.lucene" name="lucene-queryparser" rev="8.11.2" conf="compile->master"/>
<dependency org="org.apache.lucene" name="lucene-spatial-extras" rev="8.11.2" conf="compile->master"/>
<dependency org="org.apache.lucene" name="lucene-suggest" rev="8.11.2"/>
<dependency org="org.apache.pdfbox" name="pdfbox" rev="2.0.29" />
<dependency org="org.apache.pdfbox" name="pdfbox" rev="3.0.0" />
<dependency org="org.apache.poi" name="poi" rev="3.17" />
<dependency org="org.apache.poi" name="poi-scratchpad" rev="3.17" />
<dependency org="org.apache.solr" name="solr-core" rev="8.11.2" conf="compile->master"/>
13 changes: 7 additions & 6 deletions source/net/yacy/cora/util/Html2Image.java
@@ -43,6 +43,7 @@
import javax.swing.text.html.HTMLEditorKit;
import javax.swing.text.html.ImageView;

import org.apache.pdfbox.Loader;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.rendering.ImageType;
import org.apache.pdfbox.rendering.PDFRenderer;
@@ -211,7 +212,7 @@ private static boolean convertAvailableInPath() {
* call termination. Beyond this limit the process is killed.
* @return true when the destination file was successfully written
*/
public static boolean writeWkhtmltopdf(String url, String proxy, String userAgent, final String acceptLanguage, final File destination, final long maxSeconds) {
public static boolean writeWkhtmltopdf(final String url, final String proxy, final String userAgent, final String acceptLanguage, final File destination, final long maxSeconds) {
boolean success = false;
for (final boolean ignoreErrors: new boolean[]{false, true}) {
success = writeWkhtmltopdfInternal(url, proxy, destination, userAgent, acceptLanguage, ignoreErrors, maxSeconds);
@@ -352,7 +353,7 @@ public static boolean pdf2image(final File pdf, final File image, final int widt

// convert pdf to jpg using internal pdfbox capability
if (convertCmd == null) {
try (final PDDocument pdoc = PDDocument.load(pdf);) {
try (final PDDocument pdoc = Loader.loadPDF(pdf);) {

final BufferedImage bi = new PDFRenderer(pdoc).renderImageWithDPI(0, density, ImageType.RGB);

@@ -432,7 +433,7 @@ public static boolean pdf2image(final File pdf, final File image, final int widt
* @param size
* @throws IOException
*/
public static void writeSwingImage(String url, Dimension size, File destination) throws IOException {
public static void writeSwingImage(final String url, final Dimension size, final File destination) throws IOException {

// set up a pane for rendering
final JEditorPane htmlPane = new JEditorPane();
@@ -453,7 +454,7 @@ public Document createDefaultDocument() {
public ViewFactory getViewFactory() {
return new HTMLFactory() {
@Override
public View create(Element elem) {
public View create(final Element elem) {
final View view = super.create(elem);
if (view instanceof ImageView) {
((ImageView) view).setLoadsSynchronously(true);
@@ -467,7 +468,7 @@ public View create(Element elem) {
htmlPane.setContentType("text/html");
htmlPane.addPropertyChangeListener(new PropertyChangeListener() {
@Override
public void propertyChange(PropertyChangeEvent evt) {
public void propertyChange(final PropertyChangeEvent evt) {
}
});

@@ -501,7 +502,7 @@ public void propertyChange(PropertyChangeEvent evt) {
* </li>
* </ol>
*/
public static void main(String[] args) {
public static void main(final String[] args) {
final String usageMessage = "Usage : java " + Html2Image.class.getName()
+ " <url> <target-file[.pdf|.jpg|.png]> [wkhtmltopdf|swing]";
int exitStatus = 0;
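For reference, the load-and-render path that pdf2image switches to above, reduced to a self-contained sketch (file names and the 300 DPI value are placeholders, not values taken from YaCy):

import java.awt.image.BufferedImage;
import java.io.File;
import java.io.IOException;

import javax.imageio.ImageIO;

import org.apache.pdfbox.Loader;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.rendering.ImageType;
import org.apache.pdfbox.rendering.PDFRenderer;

public class Pdf2ImageSketch {
    public static void main(final String[] args) throws IOException {
        final File pdf = new File("example.pdf");   // placeholder input
        final File image = new File("example.jpg"); // placeholder output
        // Loader.loadPDF(File) replaces PDDocument.load(File) from PDFBox 2.x.
        try (final PDDocument doc = Loader.loadPDF(pdf)) {
            // Render the first page (index 0) at 300 DPI as an RGB image.
            final BufferedImage bi = new PDFRenderer(doc).renderImageWithDPI(0, 300, ImageType.RGB);
            ImageIO.write(bi, "jpg", image);
        }
    }
}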
74 changes: 38 additions & 36 deletions source/net/yacy/document/parser/pdfParser.java
@@ -39,8 +39,10 @@
import java.util.HashSet;
import java.util.List;

import org.apache.pdfbox.Loader;
import org.apache.pdfbox.cos.COSName;
import org.apache.pdfbox.io.MemoryUsageSetting;
import org.apache.pdfbox.io.RandomAccessRead;
import org.apache.pdfbox.io.RandomAccessReadBuffer;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.PDDocumentInformation;
import org.apache.pdfbox.pdmodel.PDPage;
@@ -69,7 +71,7 @@ public class pdfParser extends AbstractParser implements Parser {

public static boolean individualPages = false;
public static String individualPagePropertyname = "page";

public pdfParser() {
super("Acrobat Portable Document Parser");
this.SUPPORTED_EXTENSIONS.add("pdf");
@@ -86,7 +88,7 @@ public Document[] parse(
final DigestURL location,
final String mimeType,
final String charset,
final VocabularyScraper scraper,
final VocabularyScraper scraper,
final int timezoneOffset,
final InputStream source) throws Parser.Failure, InterruptedException {

@@ -98,8 +100,8 @@ public Document[] parse(
PDDocument pdfDoc;
try {
Thread.currentThread().setPriority(Thread.MIN_PRIORITY); // the pdfparser is a big pain
MemoryUsageSetting mus = MemoryUsageSetting.setupMixed(200*1024*1024);
pdfDoc = PDDocument.load(source, mus);
final RandomAccessRead readBuffer = new RandomAccessReadBuffer(source);
pdfDoc = Loader.loadPDF(readBuffer);
} catch (final IOException e) {
throw new Parser.Failure(e.getMessage(), location);
} finally {
@@ -141,34 +143,34 @@
if (docKeywordStr != null) {
docKeywords = docKeywordStr.split(" |,");
}

Document[] result = null;
try {
// get the links
final List<Collection<AnchorURL>> pdflinks = extractPdfLinks(pdfDoc);

// get the fulltext (either per document or for each page)
final PDFTextStripper stripper = new PDFTextStripper(/*StandardCharsets.UTF_8.name()*/);

if (individualPages) {
// this is a hack which stores individual pages of the source pdf into individual index documents
// the new documents will get a virtual link with a post argument page=X appended to the original url

// collect text
int pagecount = pdfDoc.getNumberOfPages();
String[] pages = new String[pagecount];
final int pagecount = pdfDoc.getNumberOfPages();
final String[] pages = new String[pagecount];
for (int page = 1; page <= pagecount; page++) {
stripper.setStartPage(page);
stripper.setEndPage(page);
pages[page - 1] = stripper.getText(pdfDoc);
//System.out.println("PAGE " + page + ": " + pages[page - 1]);
}

// create individual documents for each page
assert pages.length == pdflinks.size() : "pages.length = " + pages.length + ", pdflinks.length = " + pdflinks.size();
result = new Document[Math.min(pages.length, pdflinks.size())];
String loc = location.toNormalform(true);
for (int page = 0; page < result.length; page++) {
final String loc = location.toNormalform(true);
for (int page = 0; page < result.length; page++) {
result[page] = new Document(
new AnchorURL(loc + (loc.indexOf('?') > 0 ? '&' : '?') + individualPagePropertyname + '=' + (page + 1)), // these are virtual new pages; we cannot combine them with '#' as that would be removed when computing the urlhash
mimeType,
@@ -216,9 +218,9 @@ public void run() {
contentBytes = writer.getBytes(); // get final text before closing writer
writer.close(); // free writer resources
}
Collection<AnchorURL> pdflinksCombined = new HashSet<AnchorURL>();
for (Collection<AnchorURL> pdflinksx: pdflinks) if (pdflinksx != null) pdflinksCombined.addAll(pdflinksx);

final Collection<AnchorURL> pdflinksCombined = new HashSet<>();
for (final Collection<AnchorURL> pdflinksx: pdflinks) if (pdflinksx != null) pdflinksCombined.addAll(pdflinksx);
result = new Document[]{new Document(
location,
mimeType,
Expand All @@ -238,7 +240,7 @@ public void run() {
null,
false,
docDate)};
}
}
} catch (final Throwable e) {
//throw new Parser.Failure(e.getMessage(), location);
} finally {
@@ -248,7 +250,7 @@ public void run() {
// clear cached resources in pdfbox.
pdfDoc = null;
clearPdfBoxCaches();

return result;
}

@@ -258,25 +260,25 @@ public void run() {
* @return all detected links
*/
private List<Collection<AnchorURL>> extractPdfLinks(final PDDocument pdf) {
List<Collection<AnchorURL>> linkCollections = new ArrayList<>(pdf.getNumberOfPages());
for (PDPage page : pdf.getPages()) {
final Collection<AnchorURL> pdflinks = new ArrayList<AnchorURL>();
final List<Collection<AnchorURL>> linkCollections = new ArrayList<>(pdf.getNumberOfPages());
for (final PDPage page : pdf.getPages()) {
final Collection<AnchorURL> pdflinks = new ArrayList<>();
try {
List<PDAnnotation> annotations = page.getAnnotations();
final List<PDAnnotation> annotations = page.getAnnotations();
if (annotations != null) {
for (PDAnnotation pdfannotation : annotations) {
for (final PDAnnotation pdfannotation : annotations) {
if (pdfannotation instanceof PDAnnotationLink) {
PDAction link = ((PDAnnotationLink)pdfannotation).getAction();
final PDAction link = ((PDAnnotationLink)pdfannotation).getAction();
if (link != null && link instanceof PDActionURI) {
PDActionURI pdflinkuri = (PDActionURI) link;
String uristr = pdflinkuri.getURI();
AnchorURL url = new AnchorURL(uristr);
final PDActionURI pdflinkuri = (PDActionURI) link;
final String uristr = pdflinkuri.getURI();
final AnchorURL url = new AnchorURL(uristr);
pdflinks.add(url);
}
}
}
}
} catch (IOException ex) {}
} catch (final IOException ex) {}
linkCollections.add(pdflinks);
}
return linkCollections;
@@ -292,17 +294,17 @@ public static void clearPdfBoxCaches() {
* situation is now far better, but one (unnecessary?) cache structure in
* the COSName class still needs to be explicitly cleared.
*/

// History of related issues :
// http://markmail.org/thread/quk5odee4hbsauhu
// https://issues.apache.org/jira/browse/PDFBOX-313
// https://issues.apache.org/jira/browse/PDFBOX-313
// https://issues.apache.org/jira/browse/PDFBOX-351
// https://issues.apache.org/jira/browse/PDFBOX-441
// https://issues.apache.org/jira/browse/PDFBOX-2200
// https://issues.apache.org/jira/browse/PDFBOX-2149

COSName.clearResources();

/*
* Prior to PDFBox 2.0.0, clearResources() function had to be called on the
* org.apache.pdfbox.pdmodel.font.PDFont class and its children. After version
@@ -327,7 +329,7 @@ public static void main(final String[] args) {
// parse
final AbstractParser parser = new pdfParser();
Document document = null;
FileInputStream inStream = null;
FileInputStream inStream = null;
try {
inStream = new FileInputStream(pdfFile);
document = Document.mergeDocuments(null, "application/pdf", parser.parse(null, "application/pdf", null, new VocabularyScraper(), 0, inStream));
@@ -345,7 +347,7 @@ public static void main(final String[] args) {
if(inStream != null) {
try {
inStream.close();
} catch(IOException e) {
} catch(final IOException e) {
System.err.println("Could not close input stream on file " + pdfFile);
}
}
@@ -359,7 +361,7 @@ public static void main(final String[] args) {
System.out.println("\t!!!Parsing without result!!!");
} else {
System.out.println("\tParsed text with " + document.getTextLength() + " chars of text and " + document.getAnchors().size() + " anchors");
InputStream textStream = document.getTextStream();
final InputStream textStream = document.getTextStream();
try {
// write file
FileUtils.copy(textStream, new File("parsedPdf.txt"));
@@ -372,7 +374,7 @@
/* textStream can be a FileInputStream : we must close it to ensure releasing system resource */
textStream.close();
}
} catch (IOException e) {
} catch (final IOException e) {
ConcurrentLog.warn("PDFPARSER", "Could not close text input stream");
}
}
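For completeness, the per-page extraction pattern from the individualPages branch above, reduced to a self-contained sketch on top of the new Loader API (the file name is a placeholder):

import java.io.File;
import java.io.IOException;

import org.apache.pdfbox.Loader;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.text.PDFTextStripper;

public class PerPageTextSketch {
    public static void main(final String[] args) throws IOException {
        try (final PDDocument doc = Loader.loadPDF(new File("example.pdf"))) {
            final PDFTextStripper stripper = new PDFTextStripper();
            final int pagecount = doc.getNumberOfPages();
            final String[] pages = new String[pagecount];
            for (int page = 1; page <= pagecount; page++) {
                // PDFTextStripper page numbers are 1-based.
                stripper.setStartPage(page);
                stripper.setEndPage(page);
                pages[page - 1] = stripper.getText(doc);
            }
            System.out.println("extracted " + pages.length + " pages");
        }
    }
}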
