Permalink
Browse files

Adding explicit runtime dependency checking, warning, and fallback behavior for non-ocr text extraction.
  • Loading branch information...
1 parent 10378a7 commit 3cb21b7149c8021a4ee2d868d50305c3e3f2c4e0 @jashkenas jashkenas committed Aug 5, 2010
Showing with 17 additions and 1 deletion.
  1. +14 −0 lib/docsplit.rb
  2. +3 −1 lib/docsplit/text_extractor.rb
View
@@ -13,6 +13,20 @@ module Docsplit
METADATA_KEYS = [:author, :date, :creator, :keywords, :producer, :subject, :title, :length]
+ DEPENDENCIES = {:java => false, :gm => false, :pdftotext => false, :pdftk => false, :tesseract => false}
+
+ # Check for all dependencies, and warn of their absence.
+ dirs = ENV['PATH'].split(File::PATH_SEPARATOR)
+ DEPENDENCIES.each_key do |dep|
+ dirs.each do |dir|
+ if File.executable?(File.join(dir, dep.to_s))
+ DEPENDENCIES[dep] = true
+ break
+ end
+ end
+ warn "Warning: Docsplit dependency #{dep} not found." if !DEPENDENCIES[dep]
+ end
+
# Raise an ExtractionFailed exception when the PDF is encrypted, or otherwise
# broke.
class ExtractionFailed < StandardError; end
@@ -23,7 +23,9 @@ def extract(pdfs, opts)
extract_from_ocr(pdf, pages)
else
extract_from_pdf(pdf, pages)
- extract_from_ocr(pdf, @pages_to_ocr) if !@forbid_ocr && !@pages_to_ocr.empty?
+ if !@forbid_ocr && DEPENDENCIES[:tesseract] && !@pages_to_ocr.empty?
+ extract_from_ocr(pdf, @pages_to_ocr)
+ end
end
end
FileUtils.remove_entry_secure @tempdir if @tempdir

0 comments on commit 3cb21b7

Please sign in to comment.