Permalink
Browse files

Despeckle images before OCR.

  • Loading branch information...
1 parent 513ad5c commit f3e9fa1d5095d00e71e145d44f1548e7bf1ba3d6 @jashkenas jashkenas committed May 13, 2011
Showing 1 changed file with 2 additions and 2 deletions.
  1. +2 −2 lib/docsplit/text_extractor.rb
@@ -63,14 +63,14 @@ def extract_from_ocr(pdf, pages)
pages.each do |page|
tiff = "#{tempdir}/#{@pdf_name}_#{page}.tif"
file = "#{base_path}_#{page}"
- run "MAGICK_TMPDIR=#{tempdir} OMP_NUM_THREADS=2 gm convert +adjoin #{MEMORY_ARGS} #{OCR_FLAGS} #{pdf}[#{page - 1}] #{tiff} 2>&1"
+ run "MAGICK_TMPDIR=#{tempdir} OMP_NUM_THREADS=2 gm convert -despeckle +adjoin #{MEMORY_ARGS} #{OCR_FLAGS} #{pdf}[#{page - 1}] #{tiff} 2>&1"
run "tesseract #{tiff} #{file} -l eng 2>&1"
clean_text(file + '.txt') if @clean_ocr
FileUtils.remove_entry_secure tiff
end
else
tiff = "#{tempdir}/#{@pdf_name}.tif"
- run "MAGICK_TMPDIR=#{tempdir} OMP_NUM_THREADS=2 gm convert #{MEMORY_ARGS} #{OCR_FLAGS} #{pdf} #{tiff} 2>&1"
+ run "MAGICK_TMPDIR=#{tempdir} OMP_NUM_THREADS=2 gm convert -despeckle #{MEMORY_ARGS} #{OCR_FLAGS} #{pdf} #{tiff} 2>&1"
run "tesseract #{tiff} #{base_path} -l eng 2>&1"
clean_text(base_path + '.txt') if @clean_ocr
end

0 comments on commit f3e9fa1

Please sign in to comment.