Skip to content
Browse files

further tweaks to image/text extraction.

  • Loading branch information...
1 parent 8901441 commit 5efc6761540fc87942bd497c1367fa98430e55b9 @jashkenas jashkenas committed Aug 20, 2010
Showing with 11 additions and 11 deletions.
  1. +2 −2 docsplit.gemspec
  2. +1 −1 index.html
  3. +1 −1 lib/docsplit.rb
  4. +1 −1 lib/docsplit/image_extractor.rb
  5. +6 −6 lib/docsplit/text_extractor.rb
View
4 docsplit.gemspec
@@ -1,7 +1,7 @@
Gem::Specification.new do |s|
s.name = 'docsplit'
- s.version = '0.3.3' # Keep version in sync with docsplit.rb
- s.date = '2010-8-17'
+ s.version = '0.3.4' # Keep version in sync with docsplit.rb
+ s.date = '2010-8-20'
s.homepage = "http://documentcloud.github.com/docsplit/"
s.summary = "Break Apart Documents into Images, Text, Pages and PDFs"
View
2 index.html
@@ -98,7 +98,7 @@
(title, author, number of pages...)
</p>
- <p>Docsplit is currently at <a href="http://rubygems.org/gems/docsplit">version 0.3.3</a>.</p>
+ <p>Docsplit is currently at <a href="http://rubygems.org/gems/docsplit">version 0.3.4</a>.</p>
<p>
<i>Docsplit is an open-source component of <a href="http://documentcloud.org/">DocumentCloud</a>.</i>
View
2 lib/docsplit.rb
@@ -1,7 +1,7 @@
# The Docsplit module delegates to the Java PDF extractors.
module Docsplit
- VERSION = '0.3.3' # Keep in sync with gemspec.
+ VERSION = '0.3.4' # Keep in sync with gemspec.
ROOT = File.expand_path(File.dirname(__FILE__) + '/..')
View
2 lib/docsplit/image_extractor.rb
@@ -5,7 +5,7 @@ module Docsplit
class ImageExtractor
DENSITY_ARG = "-density 150"
- MEMORY_ARGS = "-limit memory 128MiB -limit map 256MiB"
+ MEMORY_ARGS = "-limit memory 256MiB -limit map 512MiB"
DEFAULT_FORMAT = :png
# Extract a list of PDFs as rasterized page images, according to the
View
12 lib/docsplit/text_extractor.rb
@@ -17,13 +17,12 @@ class TextExtractor
NO_TEXT_DETECTED = /---------\n\Z/
OCR_FLAGS = '-density 200x200 -colorspace GRAY'
- MEMORY_ARGS = '-limit memory 128MiB -limit map 256MiB'
+ MEMORY_ARGS = '-limit memory 256MiB -limit map 512MiB'
MIN_TEXT_PER_PAGE = 100 # in bytes
def initialize
- @tiffs_generated = false
- @pages_to_ocr = []
+ @pages_to_ocr = []
end
# Extract text from a list of PDFs.
@@ -61,10 +60,11 @@ def extract_from_ocr(pdf, pages)
tempdir = Dir.mktmpdir
base_path = File.join(@output, @pdf_name)
if pages
- run "MAGICK_TMPDIR=#{tempdir} OMP_NUM_THREADS=2 gm convert +adjoin #{MEMORY_ARGS} #{OCR_FLAGS} #{pdf} #{tempdir}/#{@pdf_name}_%d.tif 2>&1" unless @tiffs_generated
- @tiffs_generated = true
pages.each do |page|
- run "tesseract #{tempdir}/#{@pdf_name}_#{page - 1}.tif #{base_path}_#{page} 2>&1"
+ tiff = "#{tempdir}/#{@pdf_name}_#{page}.tif"
+ run "MAGICK_TMPDIR=#{tempdir} OMP_NUM_THREADS=2 gm convert +adjoin #{MEMORY_ARGS} #{OCR_FLAGS} #{pdf}[#{page - 1}] #{tiff} 2>&1"
+ run "tesseract #{tiff} #{base_path}_#{page} 2>&1"
+ FileUtils.remove_entry_secure tiff
end
else
tiff = "#{tempdir}/#{@pdf_name}.tif"

0 comments on commit 5efc676

Please sign in to comment.
Something went wrong with that request. Please try again.