Skip to content

Commit

Permalink
bug fix on detectors in TikaConfigWriter and add tika-parsers.xml exa…
Browse files Browse the repository at this point in the history
…mple
  • Loading branch information
tballison committed May 17, 2023
1 parent 30d0195 commit eb09273
Show file tree
Hide file tree
Showing 2 changed files with 60 additions and 2 deletions.
58 changes: 58 additions & 0 deletions tika-gui-app/examples/tika-parsers.xml
Original file line number Diff line number Diff line change
@@ -0,0 +1,58 @@
<!--
Example parsers configuration.
The DefaultParser is declared with four parsers excluded; three of them
(PDFParser, OOXMLParser, OfficeParser) are declared again below with
explicit parameters. TesseractOCRParser is excluded and is not declared
again anywhere in this file.
NOTE(review): presumably the exclusions exist so that the explicitly
configured parser entries below are the ones used for those formats; confirm
against the Tika configuration documentation.
-->
<parsers>
<parser class="org.apache.tika.parser.DefaultParser">
<parser-exclude class="org.apache.tika.parser.ocr.TesseractOCRParser"/>
<parser-exclude class="org.apache.tika.parser.pdf.PDFParser"/>
<parser-exclude class="org.apache.tika.parser.microsoft.ooxml.OOXMLParser"/>
<parser-exclude class="org.apache.tika.parser.microsoft.OfficeParser"/>
</parser>
<!-- PDFParser declared on its own so the params below can be set on it. -->
<parser class="org.apache.tika.parser.pdf.PDFParser">
<params>
<!-- see https://cwiki.apache.org/confluence/pages/viewpage.action?pageId=109454066
for the most recent settings for the PDF Parser -->
<param name="allowExtractionForAccessibility" type="bool">true</param>
<param name="averageCharTolerance" type="float">0.3</param>
<param name="detectAngles" type="bool">true</param>
<param name="extractAcroFormContent" type="bool">true</param>
<param name="extractActions" type="bool">false</param>
<!-- as of 2.8.0 -->
<param name="extractIncrementalUpdateInfo" type="bool">true</param>
<param name="catchIntermediateIOExceptions" type="bool">true</param>
<param name="dropThreshold" type="float">2.5</param>
<param name="enableAutoSpace" type="bool">true</param>
<param name="extractAnnotationText" type="bool">false</param>
<param name="extractBookmarksText" type="bool">true</param>
<param name="extractFontNames" type="bool">false</param>
<param name="extractInlineImages" type="bool">false</param>
<param name="extractUniqueInlineImagesOnly" type="bool">true</param>
<param name="ifXFAExtractOnlyXFA" type="bool">false</param>
<param name="maxMainMemoryBytes" type="long">-1</param>
<!-- as of 2.8.0 -->
<param name="maxIncrementalUpdates" type="int">10000</param>
<!-- OCR-related settings for the PDF parser (the ocr* params). -->
<param name="ocrDPI" type="int">300</param>
<param name="ocrImageFormatName" type="string">png</param>
<param name="ocrImageQuality" type="float">1.0</param>
<param name="ocrRenderingStrategy" type="string">ALL</param>
<param name="ocrStrategy" type="string">auto</param>
<param name="ocrStrategyAuto" type="string">better</param>
<param name="ocrImageType" type="string">gray</param>
<!-- as of 2.8.0 -->
<param name="parseIncrementalUpdates" type="bool">false</param>
<param name="setKCMS" type="bool">false</param>
<param name="sortByPosition" type="bool">false</param>
<param name="spacingTolerance" type="float">0.5</param>
<param name="suppressDuplicateOverlappingText" type="bool">false</param>
</params>
</parser>
<!-- OOXMLParser with includeDeletedContent, includeMoveFromContent and extractMacros all set to true. -->
<parser class="org.apache.tika.parser.microsoft.ooxml.OOXMLParser">
<params>
<param name="includeDeletedContent" type="bool">true</param>
<param name="includeMoveFromContent" type="bool">true</param>
<param name="extractMacros" type="bool">true</param>
</params>
</parser>
<!-- OfficeParser with extractMacros set to true. -->
<parser class="org.apache.tika.parser.microsoft.OfficeParser">
<params>
<param name="extractMacros" type="bool">true</param>
</params>
</parser>
</parsers>
Original file line number Diff line number Diff line change
Expand Up @@ -137,8 +137,8 @@ public Path writeConfig(BatchProcessConfig batchProcessConfig, Path workingDir)

private void appendDetectors(BatchProcessConfig batchProcessConfig, DomWriter writer,
Element properties) throws XMLStreamException {
if (batchProcessConfig.getParserConfig().isEmpty() ||
batchProcessConfig.getParserConfig().get().getPath().isEmpty()) {
if (batchProcessConfig.getDetectorConfig().isEmpty() ||
batchProcessConfig.getDetectorConfig().get().getPath().isEmpty()) {
//don't do anything, right(?)
} else {
//this is a total hack.
Expand Down

0 comments on commit eb09273

Please sign in to comment.