Skip to content

HTTPS clone URL

Subversion checkout URL

You can clone with HTTPS or Subversion.

Download ZIP
Browse files

Adding explicit runtime dependency checking, warning, and fallback behavior for non-ocr text extraction.
  • Loading branch information...
commit 3cb21b7149c8021a4ee2d868d50305c3e3f2c4e0 1 parent 10378a7
Jeremy Ashkenas jashkenas authored
Showing with 17 additions and 1 deletion.
  1. +14 −0 lib/docsplit.rb
  2. +3 −1 lib/docsplit/text_extractor.rb
14 lib/docsplit.rb
View
@@ -13,6 +13,20 @@ module Docsplit
METADATA_KEYS = [:author, :date, :creator, :keywords, :producer, :subject, :title, :length]
+ DEPENDENCIES = {:java => false, :gm => false, :pdftotext => false, :pdftk => false, :tesseract => false}
+
+ # Check for all dependencies, and warn of their absence.
+ dirs = ENV['PATH'].split(File::PATH_SEPARATOR)
+ DEPENDENCIES.each_key do |dep|
+ dirs.each do |dir|
+ if File.executable?(File.join(dir, dep.to_s))
+ DEPENDENCIES[dep] = true
+ break
+ end
+ end
+ warn "Warning: Docsplit dependency #{dep} not found." if !DEPENDENCIES[dep]
+ end
+
# Raise an ExtractionFailed exception when the PDF is encrypted or otherwise
# broken.
class ExtractionFailed < StandardError; end
4 lib/docsplit/text_extractor.rb
View
@@ -23,7 +23,9 @@ def extract(pdfs, opts)
extract_from_ocr(pdf, pages)
else
extract_from_pdf(pdf, pages)
- extract_from_ocr(pdf, @pages_to_ocr) if !@forbid_ocr && !@pages_to_ocr.empty?
+ if !@forbid_ocr && DEPENDENCIES[:tesseract] && !@pages_to_ocr.empty?
+ extract_from_ocr(pdf, @pages_to_ocr)
+ end
end
end
FileUtils.remove_entry_secure @tempdir if @tempdir
Please sign in to comment.
Something went wrong with that request. Please try again.