Skip to content

Commit

Permalink
improve default_parsers.xml and update example-digipres-metadata-mappings-jdbc.csv
Browse files Browse the repository at this point in the history
  • Loading branch information
tballison committed May 17, 2023
1 parent eb09273 commit b135851
Show file tree
Hide file tree
Showing 5 changed files with 99 additions and 29 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,8 @@ tika-eval:numAlphaTokens,num_alpha_tokens,BIGINT
tika-eval:lang,detected_lang,VARCHAR(12)
tika-eval:oov,oov,FLOAT
xmpTPg:NPages,num_pages,INTEGER
pdf:incrementalUpdateCount,pdf_incremental_updates,INTEGER
X-TIKA:versionNumber,version_number,INTEGER
pdf:totalUnmappedUnicodeChars,pdf_num_unmapped_unicode,INTEGER
pdf:overallPercentageUnmappedUnicodeChars,pdf_percent_unmapped_unicode,FLOAT
pdf:containsNonEmbeddedFont,pdf_non_embedded_font,BOOLEAN
Expand Down
4 changes: 2 additions & 2 deletions tika-gui-app/examples/tika-parsers.xml
Original file line number Diff line number Diff line change
Expand Up @@ -13,13 +13,13 @@
<param name="averageCharTolerance" type="float">0.3</param>
<param name="detectAngles" type="bool">true</param>
<param name="extractAcroFormContent" type="bool">true</param>
<param name="extractActions" type="bool">false</param>
<param name="extractActions" type="bool">true</param>
<!-- as of 2.8.0 -->
<param name="extractIncrementalUpdateInfo" type="bool">true</param>
<param name="catchIntermediateIOExceptions" type="bool">true</param>
<param name="dropThreshold" type="float">2.5</param>
<param name="enableAutoSpace" type="bool">true</param>
<param name="extractAnnotationText" type="bool">false</param>
<param name="extractAnnotationText" type="bool">true</param>
<param name="extractBookmarksText" type="bool">true</param>
<param name="extractFontNames" type="bool">false</param>
<param name="extractInlineImages" type="bool">false</param>
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -142,9 +142,9 @@ private void appendDetectors(BatchProcessConfig batchProcessConfig, DomWriter wr
//don't do anything, right(?)
} else {
//this is a total hack.
try {
XMLStringToDOM.write(writer, properties,
batchProcessConfig.getDetectorConfig().get().getPath());
try (InputStream is = Files.newInputStream(
batchProcessConfig.getDetectorConfig().get().getPath().get())) {
XMLStringToDOM.write(writer, properties, is);
} catch (TikaException | IOException | SAXException e) {
LOGGER.warn("couldn't write dom");
}
Expand All @@ -155,28 +155,20 @@ private void appendParsers(BatchProcessConfig batchProcessConfig, DomWriter writ
Element properties) throws XMLStreamException {
if (batchProcessConfig.getParserConfig().isEmpty() ||
batchProcessConfig.getParserConfig().get().getPath().isEmpty()) {
Element parsers = writer.createAndGetElement(properties, "parsers");
Element defaultParser = writer.createAndGetElement(parsers, "parser");
defaultParser.setAttribute("class", "org.apache.tika.parser.DefaultParser");
excludeParsers(writer, defaultParser, "org.apache.tika.parser.ocr.TesseractOCRParser",
"org.apache.tika.parser.pdf.PDFParser",
"org.apache.tika.parser.microsoft.ooxml.OOXMLParser",
"org.apache.tika.parser.microsoft.OfficeParser");
addLegacyParams(writer, parsers, "parser", "org.apache.tika.parser.pdf.PDFParser",
"extractActions", "bool", "true");

addLegacyParams(writer, parsers, "parser",
"org.apache.tika.parser.microsoft.ooxml.OOXMLParser", "extractMacros", "bool",
"true", "includeDeletedContent", "bool", "true", "includeMoveFromContent",
"bool", "true");

addLegacyParams(writer, parsers, "parser",
"org.apache.tika.parser.microsoft.OfficeParser", "extractMacros", "bool", "true");
//this is a total hack.
try (InputStream is =
TikaConfigWriter.class.getResourceAsStream("/default_parsers.xml")) {
XMLStringToDOM.write(writer, properties, is);
} catch (TikaException | IOException | SAXException e) {
LOGGER.warn("couldn't write dom");
}
} else {
//this is a total hack.
try {
XMLStringToDOM.write(writer, properties,
batchProcessConfig.getParserConfig().get().getPath());
try (InputStream is =
Files.newInputStream(
batchProcessConfig.getParserConfig().get().getPath().get())) {

XMLStringToDOM.write(writer, properties, is);
} catch (TikaException | IOException | SAXException e) {
LOGGER.warn("couldn't write dom");
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -17,8 +17,7 @@
package org.tallison.tika.app.fx.sax;

import java.io.IOException;
import java.nio.file.Path;
import java.util.Optional;
import java.io.InputStream;

import org.w3c.dom.Document;
import org.w3c.dom.Element;
Expand All @@ -29,9 +28,9 @@

public class XMLStringToDOM {

public static void write(DomWriter writer, Element writerRoot, Optional<Path> path)
public static void write(DomWriter writer, Element writerRoot, InputStream is)
throws TikaException, IOException, SAXException {
Document document = XMLReaderUtils.buildDOM(path.get());
Document document = XMLReaderUtils.buildDOM(is);
Element documentRoot = document.getDocumentElement();
writer.appendChild(writerRoot, documentRoot);
}
Expand Down
77 changes: 77 additions & 0 deletions tika-gui-app/src/main/resources/default_parsers.xml
Original file line number Diff line number Diff line change
@@ -0,0 +1,77 @@
<!--
Licensed to the Apache Software Foundation (ASF) under one
or more contributor license agreements. See the NOTICE file
distributed with this work for additional information
regarding copyright ownership. The ASF licenses this file
to you under the Apache License, Version 2.0 (the
"License"); you may not use this file except in compliance
with the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing,
software distributed under the License is distributed on an
"AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
KIND, either express or implied. See the License for the
specific language governing permissions and limitations
under the License.
-->
<!-- Bundled fallback parser settings: TikaConfigWriter reads this resource
(/default_parsers.xml) when the batch config supplies no parser config path. -->
<parsers>
<parser class="org.apache.tika.parser.DefaultParser">
<!-- turn off tesseract for now -->
<parser-exclude class="org.apache.tika.parser.ocr.TesseractOCRParser"/>
<!-- the next three parsers are excluded from the DefaultParser because each one
is declared again below with its own explicit params -->
<parser-exclude class="org.apache.tika.parser.pdf.PDFParser"/>
<parser-exclude class="org.apache.tika.parser.microsoft.ooxml.OOXMLParser"/>
<parser-exclude class="org.apache.tika.parser.microsoft.OfficeParser"/>
</parser>
<!-- PDF parser, configured explicitly instead of through the DefaultParser -->
<parser class="org.apache.tika.parser.pdf.PDFParser">
<params>
<!-- see https://cwiki.apache.org/confluence/pages/viewpage.action?pageId=109454066
for the most recent settings for the PDF Parser -->
<param name="allowExtractionForAccessibility" type="bool">true</param>
<param name="averageCharTolerance" type="float">0.3</param>
<param name="detectAngles" type="bool">true</param>
<param name="extractAcroFormContent" type="bool">true</param>
<param name="extractActions" type="bool">true</param>
<!-- as of 2.8.0 -->
<param name="extractIncrementalUpdateInfo" type="bool">true</param>
<param name="catchIntermediateIOExceptions" type="bool">true</param>
<param name="dropThreshold" type="float">2.5</param>
<param name="enableAutoSpace" type="bool">true</param>
<param name="extractAnnotationText" type="bool">true</param>
<param name="extractBookmarksText" type="bool">true</param>
<param name="extractFontNames" type="bool">false</param>
<param name="extractInlineImages" type="bool">false</param>
<param name="extractUniqueInlineImagesOnly" type="bool">true</param>
<param name="ifXFAExtractOnlyXFA" type="bool">false</param>
<param name="maxMainMemoryBytes" type="long">-1</param>
<!-- as of 2.8.0 -->
<param name="maxIncrementalUpdates" type="int">10000</param>
<!-- NOTE(review): the ocr* settings below are still listed although
TesseractOCRParser is excluded above; presumably they have no effect
until OCR is turned back on. TODO confirm. -->
<param name="ocrDPI" type="int">300</param>
<param name="ocrImageFormatName" type="string">png</param>
<param name="ocrImageQuality" type="float">1.0</param>
<param name="ocrRenderingStrategy" type="string">ALL</param>
<param name="ocrStrategy" type="string">auto</param>
<param name="ocrStrategyAuto" type="string">better</param>
<param name="ocrImageType" type="string">gray</param>
<!-- as of 2.8.0 -->
<param name="parseIncrementalUpdates" type="bool">false</param>
<param name="setKCMS" type="bool">false</param>
<param name="sortByPosition" type="bool">false</param>
<param name="spacingTolerance" type="float">0.5</param>
<param name="suppressDuplicateOverlappingText" type="bool">false</param>
</params>
</parser>
<!-- OOXML parser: deleted content, move-from content and macro extraction
are all switched on -->
<parser class="org.apache.tika.parser.microsoft.ooxml.OOXMLParser">
<params>
<param name="includeDeletedContent" type="bool">true</param>
<param name="includeMoveFromContent" type="bool">true</param>
<param name="extractMacros" type="bool">true</param>
</params>
</parser>
<!-- OfficeParser: only macro extraction is switched on here -->
<parser class="org.apache.tika.parser.microsoft.OfficeParser">
<params>
<param name="extractMacros" type="bool">true</param>
</params>
</parser>
</parsers>

0 comments on commit b135851

Please sign in to comment.