diff --git a/tika-gui-app/examples/example-digipres-metadata-mappings-jdbc.csv b/tika-gui-app/examples/example-digipres-metadata-mappings-jdbc.csv index 1b07c1d..cbda45a 100644 --- a/tika-gui-app/examples/example-digipres-metadata-mappings-jdbc.csv +++ b/tika-gui-app/examples/example-digipres-metadata-mappings-jdbc.csv @@ -22,6 +22,8 @@ tika-eval:numAlphaTokens,num_alpha_tokens,BIGINT tika-eval:lang,detected_lang,VARCHAR(12) tika-eval:oov,oov,FLOAT xmpTPg:NPages,num_pages,INTEGER +pdf:incrementalUpdateCount,pdf_incremental_updates,INTEGER +X-TIKA:versionNumber,version_number,INTEGER pdf:totalUnmappedUnicodeChars,pdf_num_unmapped_unicode,INTEGER pdf:overallPercentageUnmappedUnicodeChars,pdf_percent_unmapped_unicode,FLOAT pdf:containsNonEmbeddedFont,pdf_non_embedded_font,BOOLEAN diff --git a/tika-gui-app/examples/tika-parsers.xml b/tika-gui-app/examples/tika-parsers.xml index c5f9559..6eb050c 100644 --- a/tika-gui-app/examples/tika-parsers.xml +++ b/tika-gui-app/examples/tika-parsers.xml @@ -13,13 +13,13 @@ 0.3 true true - false + true true true 2.5 true - false + true true false false diff --git a/tika-gui-app/src/main/java/org/tallison/tika/app/fx/config/TikaConfigWriter.java b/tika-gui-app/src/main/java/org/tallison/tika/app/fx/config/TikaConfigWriter.java index 3098bd0..4e9e81b 100644 --- a/tika-gui-app/src/main/java/org/tallison/tika/app/fx/config/TikaConfigWriter.java +++ b/tika-gui-app/src/main/java/org/tallison/tika/app/fx/config/TikaConfigWriter.java @@ -142,9 +142,9 @@ private void appendDetectors(BatchProcessConfig batchProcessConfig, DomWriter wr //don't do anything, right(?) } else { //this is a total hack. - try { - XMLStringToDOM.write(writer, properties, - batchProcessConfig.getDetectorConfig().get().getPath()); + try (InputStream is = Files.newInputStream( + batchProcessConfig.getDetectorConfig().get().getPath().get())) { + XMLStringToDOM.write(writer, properties, is); } catch (TikaException | IOException | SAXException e) { LOGGER.warn("couldn't write dom"); } @@ -155,28 +155,20 @@ private void appendParsers(BatchProcessConfig batchProcessConfig, DomWriter writ Element properties) throws XMLStreamException { if (batchProcessConfig.getParserConfig().isEmpty() || batchProcessConfig.getParserConfig().get().getPath().isEmpty()) { - Element parsers = writer.createAndGetElement(properties, "parsers"); - Element defaultParser = writer.createAndGetElement(parsers, "parser"); - defaultParser.setAttribute("class", "org.apache.tika.parser.DefaultParser"); - excludeParsers(writer, defaultParser, "org.apache.tika.parser.ocr.TesseractOCRParser", - "org.apache.tika.parser.pdf.PDFParser", - "org.apache.tika.parser.microsoft.ooxml.OOXMLParser", - "org.apache.tika.parser.microsoft.OfficeParser"); - addLegacyParams(writer, parsers, "parser", "org.apache.tika.parser.pdf.PDFParser", - "extractActions", "bool", "true"); - - addLegacyParams(writer, parsers, "parser", - "org.apache.tika.parser.microsoft.ooxml.OOXMLParser", "extractMacros", "bool", - "true", "includeDeletedContent", "bool", "true", "includeMoveFromContent", - "bool", "true"); - - addLegacyParams(writer, parsers, "parser", - "org.apache.tika.parser.microsoft.OfficeParser", "extractMacros", "bool", "true"); + //this is a total hack. + try (InputStream is = + TikaConfigWriter.class.getResourceAsStream("/default_parsers.xml")) { + XMLStringToDOM.write(writer, properties, is); + } catch (TikaException | IOException | SAXException e) { + LOGGER.warn("couldn't write dom"); + } } else { //this is a total hack. - try { - XMLStringToDOM.write(writer, properties, - batchProcessConfig.getParserConfig().get().getPath()); + try (InputStream is = + Files.newInputStream( + batchProcessConfig.getParserConfig().get().getPath().get())) { + + XMLStringToDOM.write(writer, properties, is); } catch (TikaException | IOException | SAXException e) { LOGGER.warn("couldn't write dom"); } diff --git a/tika-gui-app/src/main/java/org/tallison/tika/app/fx/sax/XMLStringToDOM.java b/tika-gui-app/src/main/java/org/tallison/tika/app/fx/sax/XMLStringToDOM.java index cba7031..4b7cd84 100644 --- a/tika-gui-app/src/main/java/org/tallison/tika/app/fx/sax/XMLStringToDOM.java +++ b/tika-gui-app/src/main/java/org/tallison/tika/app/fx/sax/XMLStringToDOM.java @@ -17,8 +17,7 @@ package org.tallison.tika.app.fx.sax; import java.io.IOException; -import java.nio.file.Path; -import java.util.Optional; +import java.io.InputStream; import org.w3c.dom.Document; import org.w3c.dom.Element; @@ -29,9 +28,9 @@ public class XMLStringToDOM { - public static void write(DomWriter writer, Element writerRoot, Optional path) + public static void write(DomWriter writer, Element writerRoot, InputStream is) throws TikaException, IOException, SAXException { - Document document = XMLReaderUtils.buildDOM(path.get()); + Document document = XMLReaderUtils.buildDOM(is); Element documentRoot = document.getDocumentElement(); writer.appendChild(writerRoot, documentRoot); } diff --git a/tika-gui-app/src/main/resources/default_parsers.xml b/tika-gui-app/src/main/resources/default_parsers.xml new file mode 100644 index 0000000..6947fb8 --- /dev/null +++ b/tika-gui-app/src/main/resources/default_parsers.xml @@ -0,0 +1,77 @@ + + + + + + + + + + + + + true + 0.3 + true + true + true + + true + true + 2.5 + true + true + true + false + false + true + false + -1 + + 10000 + 300 + png + 1.0 + ALL + auto + better + gray + + false + false + false + 0.5 + false + + + + + true + true + true + + + + + true + + + \ No newline at end of file