Permalink
Browse files

Commenting throughout.

  • Loading branch information...
1 parent 3cb21b7 commit 7ecff43aa5bd1a253285f5881df29ce83f26ee02 @jashkenas jashkenas committed Aug 5, 2010
@@ -1,31 +0,0 @@
module Docsplit

  # Helpers that turn a Ruby options hash into a command-line argument string.
  module ArgumentParser

    # Flatten an options hash into an arguments string suitable for the command
    # line: every key becomes a `--key` flag followed by its normalized value,
    # and all pieces are joined with single spaces.
    def parse_options(opts)
      opts.map {|k, v| ["--#{k}", normalize_value(v)] }.flatten.join(' ')
    end

    # Normalize a value in an options hash for the command line.
    # Ranges look like: 1-10, Arrays like: 1,2,3 (Ranges nested inside an Array
    # are serialized as well). Anything else is converted with `to_s`.
    def normalize_value(value)
      case value
      when Range then normalize_range(value)
      # Use the non-destructive `map` so the caller's array is left untouched.
      when Array then value.map {|v| v.is_a?(Range) ? normalize_range(v) : v }.join(',')
      else            value.to_s
      end
    end

    # Serialize a Ruby range into its command-line equivalent.
    # An empty range collapses to just its first element; otherwise the result
    # is "first-last", where last is the final element the range yields (so an
    # exclusive range such as 1...5 becomes "1-4").
    def normalize_range(range)
      arr = range.to_a
      arr.empty? ? range.first.to_s : "#{range.first}-#{arr.last}"
    end

  end

  extend ArgumentParser

end
@@ -53,6 +53,8 @@ def extract_options(options)
@rolling = !!options[:rolling]
end
+ # If there's only one size requested, generate the images directly into
+ # the output directory. Multiple sizes each get a directory of their own.
def directory_for(size)
path = @sizes.length == 1 ? @output : File.join(@output, size)
File.expand_path(path)
@@ -3,6 +3,7 @@ module Docsplit
# Delegates to **pdfinfo** in order to extract information about a PDF file.
class InfoExtractor
+ # Regex matchers for different bits of information.
MATCHERS = {
:author => /^Author:\s+([^\n]+)/,
:date => /^CreationDate:\s+([^\n]+)/,
@@ -14,6 +15,7 @@ class InfoExtractor
:length => /^Pages:\s+([^\n]+)/,
}
+ # Pull out a single datum from a pdf.
def extract(key, pdfs, opts)
pdf = [pdfs].flatten.first
cmd = "pdfinfo #{pdf} 2>&1"
@@ -4,6 +4,7 @@ module Docsplit
# a PDF document.
class PageExtractor
+ # Burst a list of pdfs into single pages, as `pdfname_pagenumber.pdf`.
def extract(pdfs, opts)
extract_options opts
[pdfs].flatten.each do |pdf|
@@ -22,7 +23,7 @@ def extract(pdfs, opts)
private
def extract_options(options)
- @output = options[:output] || '.'
+ @output = options[:output] || '.'
end
end
@@ -1,18 +1,31 @@
module Docsplit
+ # Delegates to **pdftotext** and **tesseract** in order to extract text from
+ # PDF documents. The `--ocr` and `--no-ocr` flags can be used to force or
+ # forbid OCR extraction, but by default the heuristic works like this:
+ #
+ # * Check for the presence of fonts in the PDF. If no fonts are detected,
+ # OCR is used automatically.
+ # * Extract the text of each page with **pdftotext**. If the page has fewer
+ # than 100 bytes of text (a scanned image page, or a page that just
+ # contains a filename and a page number), add it to the list of
+ # `@pages_to_ocr`.
+ # * OCR each page in the `@pages_to_ocr` list at the end.
+ #
class TextExtractor
NO_TEXT_DETECTED = /---------\n\Z/
OCR_FLAGS = '-density 200x200 -colorspace GRAY'
- MIN_TEXT_PER_PAGE = 100 # bytes
+ MIN_TEXT_PER_PAGE = 100 # in bytes
def initialize
  # Start with an empty list of pages awaiting OCR and record that no TIFFs
  # have been generated yet.
  @pages_to_ocr    = []
  @tiffs_generated = false
end
+ # Extract text from a list of PDFs.
def extract(pdfs, opts)
extract_options opts
FileUtils.mkdir_p @output unless File.exists?(@output)
@@ -31,16 +44,19 @@ def extract(pdfs, opts)
FileUtils.remove_entry_secure @tempdir if @tempdir
end
+ # Does a PDF have any text embedded?
def contains_text?(pdf)
  # Ask pdffonts for the PDF's font listing (stderr folded into stdout); the
  # PDF counts as containing text unless that output matches NO_TEXT_DETECTED.
  # NOTE(review): `pdf` is interpolated into the shell command unescaped.
  font_report = `pdffonts #{pdf} 2>&1`
  font_report !~ NO_TEXT_DETECTED
end
+ # Extract a page range worth of text from a PDF, directly.
def extract_from_pdf(pdf, pages)
  # With an explicit page list, extract each requested page on its own;
  # without one, fall back to extracting the whole document in a single pass.
  if pages
    pages.each {|number| extract_page(pdf, number) }
  else
    extract_full(pdf)
  end
end
+ # Extract a page range worth of text from a PDF via OCR.
def extract_from_ocr(pdf, pages)
@tempdir ||= Dir.mktmpdir
base_path = File.join(@output, @pdf_name)
@@ -60,17 +76,21 @@ def extract_from_ocr(pdf, pages)
private
+ # Run an external process and raise an exception if it fails.
def run(command)
  # Execute the command in a subshell, capturing its standard output.
  output = `#{command}`
  # An unsuccessful exit is reported as ExtractionFailed, carrying the
  # captured output as the exception message.
  raise ExtractionFailed, output unless $?.success?
  output
end
+ # Extract the full contents of a pdf as a single file, directly.
def extract_full(pdf)
  # Write the whole document's text to "<@pdf_name>.txt" inside @output.
  # NOTE(review): `pdf` and the destination path are interpolated into the
  # shell command unescaped.
  destination = File.join(@output, "#{@pdf_name}.txt")
  command     = "pdftotext -enc UTF-8 #{pdf} #{destination} 2>&1"
  run command
end
+ # Extract the contents of a single page of text, directly, adding it to
+ # the `@pages_to_ocr` list if the text length is inadequate.
def extract_page(pdf, page)
text_path = File.join(@output, "#{@pdf_name}_#{page}.txt")
run "pdftotext -enc UTF-8 -f #{page} -l #{page} #{pdf} #{text_path} 2>&1"

0 comments on commit 7ecff43

Please sign in to comment.