Skip to content
Browse files

Forcing GraphicsMagick to always observe strict memory limits, even for generating TIFFs for OCR.
  • Loading branch information...
1 parent 71269a8 commit d126c92fed1c667e068b5acfa078681326feedc8 @jashkenas jashkenas committed Aug 9, 2010
Showing 1 changed file with 4 additions and 3 deletions.
  1. +4 −3 lib/docsplit/text_extractor.rb
View
7 lib/docsplit/text_extractor.rb
@@ -16,7 +16,8 @@ class TextExtractor
NO_TEXT_DETECTED = /---------\n\Z/
- OCR_FLAGS = '-density 200x200 -colorspace GRAY'
+ OCR_FLAGS = '-density 200x200 -colorspace GRAY'
+ MEMORY_ARGS = '-limit memory 128MiB -limit map 256MiB'
MIN_TEXT_PER_PAGE = 100 # in bytes
@@ -61,14 +62,14 @@ def extract_from_ocr(pdf, pages)
@tempdir ||= Dir.mktmpdir
base_path = File.join(@output, @pdf_name)
if pages
- run "gm convert +adjoin #{OCR_FLAGS} #{pdf} #{@tempdir}/#{@pdf_name}_%d.tif 2>&1" unless @tiffs_generated
+ run "OMP_NUM_THREADS=2 gm convert +adjoin #{MEMORY_ARGS} #{OCR_FLAGS} #{pdf} #{@tempdir}/#{@pdf_name}_%d.tif 2>&1" unless @tiffs_generated
@tiffs_generated = true
pages.each do |page|
run "tesseract #{@tempdir}/#{@pdf_name}_#{page - 1}.tif #{base_path}_#{page} 2>&1"
end
else
tiff = "#{@tempdir}/#{@pdf_name}.tif"
- run "gm convert #{OCR_FLAGS} #{pdf} #{tiff} 2>&1"
+ run "OMP_NUM_THREADS=2 gm convert #{MEMORY_ARGS} #{OCR_FLAGS} #{pdf} #{tiff} 2>&1"
run "tesseract #{tiff} #{base_path} -l eng 2>&1"
end
end

0 comments on commit d126c92

Please sign in to comment.
Something went wrong with that request. Please try again.