Refactoring the text_extractor in anticipation of OCR.

tzuryby · Aug 4, 2010 · dc5fdd2 · dc5fdd2
1 parent 9931267
commit dc5fdd2
Show file tree

Hide file tree

Showing 4 changed files with 28 additions and 22 deletions.
diff --git a/LICENSE b/LICENSE
@@ -1,5 +1,4 @@
 JODConverter is licensed under the LGPL: gnu.org/licenses/lgpl.html
-PDFBox is licensed under the Apache 2 License: apache.org/licenses/LICENSE-2.0
 
 Copyright (c) 2009 Jeremy Ashkenas, DocumentCloud
 

diff --git a/index.html b/index.html
@@ -148,8 +148,8 @@ <h2 id="installation">Installation &amp; Dependencies</h2>
     </ol>
 
     <p><i>
-        Note: the gem will take a minute to download &mdash; the combined
-        PDFBox and JODConverter jar files tip the scales at 8MB.
+        Note: the gem will take a minute to download &mdash; the
+        JODConverter jar file tips the scales at 2MB.
     </i></p>
 
     <h2 id="usage">Usage</h2>
@@ -237,11 +237,12 @@ <h2 id="internals">Internals</h2>
 
     <p>
       Under the hood, Docsplit is a thin wrapper around the excellent 
-      <a href="http://pdfbox.apache.org/">PDFBox</a>, 
-      <a href="http://www.graphicsmagick.org/">GraphicsMagick</a>, and
+      <a href="http://www.graphicsmagick.org/">GraphicsMagick</a>,
+      <a href="http://poppler.freedesktop.org/">Poppler</a>,
+      <a href="http://www.accesspdf.com/pdftk/">PDFTK</a>, and
       <a href="http://artofsolving.com/opensource/jodconverter">JODConverter</a>
-      libraries. PDFBox is used to extract text and metadata from PDF documents, 
-      as well as to split them apart into pages. GraphicsMagick is used to generate
+      libraries. Poppler is used to extract text and metadata from PDF documents, 
+      PDFTK is used to split them apart into pages, and GraphicsMagick is used to generate
       the page images (internally, it's rendering them with 
       <a href="http://pages.cs.wisc.edu/~ghost/doc/GPL/index.htm">GhostScript</a>).
       JODConverter communicates with OpenOffice to perform the PDF conversions.

diff --git a/lib/docsplit/command_line.rb b/lib/docsplit/command_line.rb
@@ -8,7 +8,7 @@ class CommandLine
 
     BANNER = <<-EOS
 docsplit breaks apart documents into images, text, or individual pages.
-It wraps PDFBox, GraphicsMagick, and JODConverter.
+It wraps GraphicsMagick, Poppler, PDFTK, and JODConverter.
 
 Usage:
   docsplit COMMAND [OPTIONS] path/to/doc.pdf
@@ -71,7 +71,7 @@ def usage
     # Use the OptionParser library to parse out all supported options. Return
     # options formatted for the Ruby API.
     def parse_options
-      @options = {}
+      @options = {:ocr => :default}
       @option_parser = OptionParser.new do |opts|
         opts.on('-o', '--output [DIR]', 'set the directory for all output') do |d|
           @options[:output] = d
@@ -85,6 +85,9 @@ def parse_options
         opts.on('-f', '--format [FORMAT]', 'set image format (pdf, jpg, gif...)') do |t|
           @options[:format] = t.split(',')
         end
+        opts.on('--[no-]ocr', 'force OCR to be used, or disable OCR') do |o|
+          @options[:ocr] = o
+        end
         opts.on('-r', '--rolling', 'generate images from each previous image') do |r|
           @options[:rolling] = true
         end

diff --git a/lib/docsplit/text_extractor.rb b/lib/docsplit/text_extractor.rb
@@ -4,33 +4,36 @@ class TextExtractor
 
     def extract(pdfs, opts)
       extract_options opts
+      FileUtils.mkdir_p @output unless File.exists?(@output)
       [pdfs].flatten.each do |pdf|
-        pdf_name = File.basename(pdf, File.extname(pdf))
-        text_path = File.join(@output, "#{pdf_name}.txt")
-        FileUtils.mkdir_p @output unless File.exists?(@output)
         if @pages
           pages = (@pages == 'all') ? 1..Docsplit.extract_length(pdf) : @pages
-          pages.each do |page|
-            extract_page pdf, page, pdf_name
-          end
+          pages.each {|page| extract_page(pdf, page) }
         else
-          cmd = "pdftotext -enc UTF-8 #{pdf} #{text_path} 2>&1"
-          result = `#{cmd}`.chomp
-          raise ExtractionFailed, result if $? != 0
+          extract_full(pdf)
         end
       end
     end
 
-    def extract_page(pdf, page, pdf_name)
+
+    private
+
+    def extract_full(pdf)
+      pdf_name = File.basename(pdf, File.extname(pdf))
+      text_path = File.join(@output, "#{pdf_name}.txt")
+      cmd = "pdftotext -enc UTF-8 #{pdf} #{text_path} 2>&1"
+      result = `#{cmd}`.chomp
+      raise ExtractionFailed, result if $? != 0
+    end
+
+    def extract_page(pdf, page)
+      pdf_name = File.basename(pdf, File.extname(pdf))
       text_path = File.join(@output, "#{pdf_name}_#{page}.txt")
       cmd = "pdftotext -enc UTF-8 -f #{page} -l #{page} #{pdf} #{text_path} 2>&1"
       result = `#{cmd}`.chomp
       raise ExtractionFailed, result if $? != 0
-      result
     end
 
-    private
-
     def extract_options(options)
       @output  = options[:output] || '.'
       @pages   = options[:pages]