Browse files

be more aggressive about using temp directories for every MAGICK_TMPDIR call.
  • Loading branch information...
1 parent 05142cb commit ba2e98b6ccb94a2bfd19d1b2f86bfbc3fd2ad606 @jashkenas jashkenas committed Aug 17, 2010
Showing with 13 additions and 13 deletions.
  1. +4 −4 lib/docsplit/image_extractor.rb
  2. +6 −6 lib/docsplit/text_extractor.rb
  3. +3 −3 lib/docsplit/transparent_pdfs.rb
View
8 lib/docsplit/image_extractor.rb
@@ -11,7 +11,6 @@ class ImageExtractor
# Extract a list of PDFs as rasterized page images, according to the
# configuration in options.
def extract(pdfs, options)
- @tempdir = Dir.mktmpdir
@pdfs = [pdfs].flatten
extract_options(options)
@pdfs.each do |pdf|
@@ -21,25 +20,26 @@ def extract(pdfs, options)
previous = size if @rolling
end
end
- FileUtils.remove_entry_secure @tempdir if File.exists?(@tempdir)
end
# Convert a single PDF into page images at the specified size and format.
def convert(pdf, size, format, previous=nil)
+ tempdir = Dir.mktmpdir
basename = File.basename(pdf, File.extname(pdf))
directory = directory_for(size)
FileUtils.mkdir_p(directory) unless File.exists?(directory)
out_file = File.join(directory, "#{basename}_%05d.#{format}")
common = "#{MEMORY_ARGS} #{DENSITY_ARG} #{resize_arg(size)} #{quality_arg(format)}"
if previous
FileUtils.cp(Dir[directory_for(previous) + '/*'], directory)
- cmd = "MAGICK_TMPDIR=#{@tempdir} OMP_NUM_THREADS=2 gm mogrify #{common} -unsharp 0x0.5+0.75 \"#{directory}/*.#{format}\" 2>&1"
+ cmd = "MAGICK_TMPDIR=#{tempdir} OMP_NUM_THREADS=2 gm mogrify #{common} -unsharp 0x0.5+0.75 \"#{directory}/*.#{format}\" 2>&1"
else
- cmd = "MAGICK_TMPDIR=#{@tempdir} OMP_NUM_THREADS=2 gm convert +adjoin #{common} \"#{pdf}#{pages_arg}\" \"#{out_file}\" 2>&1"
+ cmd = "MAGICK_TMPDIR=#{tempdir} OMP_NUM_THREADS=2 gm convert +adjoin #{common} \"#{pdf}#{pages_arg}\" \"#{out_file}\" 2>&1"
end
result = `#{cmd}`.chomp
raise ExtractionFailed, result if $? != 0
renumber_images(out_file, format)
+ FileUtils.remove_entry_secure tempdir if File.exists?(tempdir)
end
View
12 lib/docsplit/text_extractor.rb
@@ -42,7 +42,6 @@ def extract(pdfs, opts)
end
end
end
- FileUtils.remove_entry_secure @tempdir if @tempdir && File.exists?(@tempdir)
end
# Does a PDF have any text embedded?
@@ -59,19 +58,20 @@ def extract_from_pdf(pdf, pages)
# Extract a page range worth of text from a PDF via OCR.
def extract_from_ocr(pdf, pages)
- @tempdir ||= Dir.mktmpdir
+ tempdir = Dir.mktmpdir
base_path = File.join(@output, @pdf_name)
if pages
- run "MAGICK_TMPDIR=#{@tempdir} OMP_NUM_THREADS=2 gm convert +adjoin #{MEMORY_ARGS} #{OCR_FLAGS} #{pdf} #{@tempdir}/#{@pdf_name}_%d.tif 2>&1" unless @tiffs_generated
+ run "MAGICK_TMPDIR=#{tempdir} OMP_NUM_THREADS=2 gm convert +adjoin #{MEMORY_ARGS} #{OCR_FLAGS} #{pdf} #{tempdir}/#{@pdf_name}_%d.tif 2>&1" unless @tiffs_generated
@tiffs_generated = true
pages.each do |page|
- run "tesseract #{@tempdir}/#{@pdf_name}_#{page - 1}.tif #{base_path}_#{page} 2>&1"
+ run "tesseract #{tempdir}/#{@pdf_name}_#{page - 1}.tif #{base_path}_#{page} 2>&1"
end
else
- tiff = "#{@tempdir}/#{@pdf_name}.tif"
- run "MAGICK_TMPDIR=#{@tempdir} OMP_NUM_THREADS=2 gm convert #{MEMORY_ARGS} #{OCR_FLAGS} #{pdf} #{tiff} 2>&1"
+ tiff = "#{tempdir}/#{@pdf_name}.tif"
+ run "MAGICK_TMPDIR=#{tempdir} OMP_NUM_THREADS=2 gm convert #{MEMORY_ARGS} #{OCR_FLAGS} #{pdf} #{tiff} 2>&1"
run "tesseract #{tiff} #{base_path} -l eng 2>&1"
end
+ FileUtils.remove_entry_secure tempdir if File.exists?(tempdir)
end
View
6 lib/docsplit/transparent_pdfs.rb
@@ -12,9 +12,9 @@ def ensure_pdfs(docs)
if ext.downcase == '.pdf'
doc
else
- @tempdir ||= File.join(Dir.tmpdir, 'docsplit')
- extract_pdf([doc], {:output => @tempdir})
- File.join(@tempdir, File.basename(doc, ext) + '.pdf')
+ tempdir = File.join(Dir.tmpdir, 'docsplit')
+ extract_pdf([doc], {:output => tempdir})
+ File.join(tempdir, File.basename(doc, ext) + '.pdf')
end
end
end

0 comments on commit ba2e98b

Please sign in to comment.