Permalink
Browse files

Use the MAGICK_TMPDIR environment variable for all 'gm' calls.

  • Loading branch information...
1 parent ab22053 commit 342a4ecb927f8fc157a2b0cb0110c78a6335f296 @jashkenas jashkenas committed Aug 17, 2010
Showing with 7 additions and 5 deletions.
  1. +4 −2 lib/docsplit/image_extractor.rb
  2. +3 −3 lib/docsplit/text_extractor.rb
@@ -11,6 +11,7 @@ class ImageExtractor
# Extract a list of PDFs as rasterized page images, according to the
# configuration in options.
def extract(pdfs, options)
+ @tempdir = Dir.mktmpdir
@pdfs = [pdfs].flatten
extract_options(options)
@pdfs.each do |pdf|
@@ -20,6 +21,7 @@ def extract(pdfs, options)
previous = size if @rolling
end
end
+ FileUtils.remove_entry_secure @tempdir if File.exists?(@tempdir)
end
# Convert a single PDF into page images at the specified size and format.
@@ -31,9 +33,9 @@ def convert(pdf, size, format, previous=nil)
common = "#{MEMORY_ARGS} #{DENSITY_ARG} #{resize_arg(size)} #{quality_arg(format)}"
if previous
FileUtils.cp(Dir[directory_for(previous) + '/*'], directory)
- cmd = "OMP_NUM_THREADS=2 gm mogrify #{common} -unsharp 0x0.5+0.75 \"#{directory}/*.#{format}\" 2>&1"
+ cmd = "MAGICK_TMPDIR=#{@tempdir} OMP_NUM_THREADS=2 gm mogrify #{common} -unsharp 0x0.5+0.75 \"#{directory}/*.#{format}\" 2>&1"
else
- cmd = "OMP_NUM_THREADS=2 gm convert +adjoin #{common} \"#{pdf}#{pages_arg}\" \"#{out_file}\" 2>&1"
+ cmd = "MAGICK_TMPDIR=#{@tempdir} OMP_NUM_THREADS=2 gm convert +adjoin #{common} \"#{pdf}#{pages_arg}\" \"#{out_file}\" 2>&1"
end
result = `#{cmd}`.chomp
raise ExtractionFailed, result if $? != 0
@@ -42,7 +42,7 @@ def extract(pdfs, opts)
end
end
end
- FileUtils.remove_entry_secure @tempdir if @tempdir
+ FileUtils.remove_entry_secure @tempdir if @tempdir && File.exists?(@tempdir)
end
# Does a PDF have any text embedded?
@@ -62,14 +62,14 @@ def extract_from_ocr(pdf, pages)
@tempdir ||= Dir.mktmpdir
base_path = File.join(@output, @pdf_name)
if pages
- run "OMP_NUM_THREADS=2 gm convert +adjoin #{MEMORY_ARGS} #{OCR_FLAGS} #{pdf} #{@tempdir}/#{@pdf_name}_%d.tif 2>&1" unless @tiffs_generated
+ run "MAGICK_TMPDIR=#{@tempdir} OMP_NUM_THREADS=2 gm convert +adjoin #{MEMORY_ARGS} #{OCR_FLAGS} #{pdf} #{@tempdir}/#{@pdf_name}_%d.tif 2>&1" unless @tiffs_generated
@tiffs_generated = true
pages.each do |page|
run "tesseract #{@tempdir}/#{@pdf_name}_#{page - 1}.tif #{base_path}_#{page} 2>&1"
end
else
tiff = "#{@tempdir}/#{@pdf_name}.tif"
- run "OMP_NUM_THREADS=2 gm convert #{MEMORY_ARGS} #{OCR_FLAGS} #{pdf} #{tiff} 2>&1"
+ run "MAGICK_TMPDIR=#{@tempdir} OMP_NUM_THREADS=2 gm convert #{MEMORY_ARGS} #{OCR_FLAGS} #{pdf} #{tiff} 2>&1"
run "tesseract #{tiff} #{base_path} -l eng 2>&1"
end
end

0 comments on commit 342a4ec

Please sign in to comment.