forked from documentcloud/docsplit
/
text_extractor.rb
112 lines (95 loc) · 3.77 KB
/
text_extractor.rb
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
require 'fileutils'
require 'shellwords'
require 'tmpdir'

module Docsplit

  # Delegates to **pdftotext** and **tesseract** in order to extract text from
  # PDF documents. The `--ocr` and `--no-ocr` flags can be used to force or
  # forbid OCR extraction, but by default the heuristic works like this:
  #
  #  * Check for the presence of fonts in the PDF. If no fonts are detected,
  #    OCR is used automatically.
  #  * Extract the text of each page with **pdftotext**. If a page yields
  #    fewer than `MIN_TEXT_PER_PAGE` bytes of text (a scanned image page, or
  #    a page that just contains a filename and a page number), add it to the
  #    list of `@pages_to_ocr`.
  #  * Re-OCR each page in the `@pages_to_ocr` list at the end.
  #
  # `ExtractionFailed`, `DEPENDENCIES` and `Docsplit.extract_length` are
  # referenced here but defined elsewhere in the Docsplit namespace.
  class TextExtractor

    # Matched against the tail of the `pdffonts` output; a match is treated as
    # "no fonts, hence no embedded text" (presumably the dashed table header
    # with no font rows after it -- confirm against pdffonts output).
    NO_TEXT_DETECTED  = /---------\n\Z/
    # Flags passed to `gm convert` when rendering pages to TIFF for OCR.
    OCR_FLAGS         = '-density 200x200 -colorspace GRAY'.freeze
    # Resource limits passed to `gm convert`.
    MEMORY_ARGS       = '-limit memory 128MiB -limit map 256MiB'.freeze
    MIN_TEXT_PER_PAGE = 100 # in bytes

    def initialize
      @tiffs_generated = false
      @pages_to_ocr    = []
    end

    # Extract text from a list of PDFs.
    #
    # pdfs - a single PDF path or an (arbitrarily nested) array of paths.
    # opts - options hash; reads `:output` (directory, defaults to '.'),
    #        `:pages` ('all', an enumerable of 1-based page numbers, or nil
    #        for one text file per document) and `:ocr` (true forces OCR,
    #        false forbids it, anything else lets the heuristic decide).
    #
    # Returns nil. Raises ExtractionFailed when an external command fails.
    def extract(pdfs, opts)
      extract_options opts
      # mkdir_p is a no-op for an existing directory, so no existence check
      # is needed (File.exists? no longer exists on Ruby >= 3.2).
      FileUtils.mkdir_p @output
      [pdfs].flatten.each do |pdf|
        # Per-document state: TIFF names embed @pdf_name, and the pages that
        # need OCR differ for every document, so neither may carry over.
        @tiffs_generated = false
        @pages_to_ocr    = []
        @pdf_name = File.basename(pdf, File.extname(pdf))
        pages = (@pages == 'all') ? 1..Docsplit.extract_length(pdf) : @pages
        if @force_ocr || (!@forbid_ocr && !contains_text?(pdf))
          extract_from_ocr(pdf, pages)
        else
          extract_from_pdf(pdf, pages)
          if !@forbid_ocr && DEPENDENCIES[:tesseract] && !@pages_to_ocr.empty?
            extract_from_ocr(pdf, @pages_to_ocr)
          end
        end
      end
      nil
    ensure
      # Clean up even when a command raised part-way through.
      remove_tempdir
    end

    # Does a PDF have any text embedded?
    #
    # Runs `pdffonts` on the file and returns false only when its output
    # matches NO_TEXT_DETECTED. The exit status is not inspected, so any
    # other output (including an error message) counts as "has text".
    def contains_text?(pdf)
      fonts = `pdffonts #{Shellwords.escape(pdf)} 2>&1`
      fonts !~ NO_TEXT_DETECTED
    end

    # Extract a page range worth of text from a PDF, directly.
    # With `pages` nil the whole document is written to a single text file.
    def extract_from_pdf(pdf, pages)
      return extract_full(pdf) unless pages
      pages.each {|page| extract_page(pdf, page) }
    end

    # Extract a page range worth of text from a PDF via OCR.
    #
    # With `pages` given, the PDF is rendered once into one TIFF per page
    # (skipped when @tiffs_generated is already set) and tesseract is run on
    # each requested page. With `pages` nil, the document is rendered into a
    # single TIFF and OCR'd as a whole.
    def extract_from_ocr(pdf, pages)
      @tempdir  ||= Dir.mktmpdir
      base_path   = File.join(@output, @pdf_name)
      escaped_pdf = Shellwords.escape(pdf)
      if pages
        unless @tiffs_generated
          tiff_pattern = Shellwords.escape(File.join(@tempdir, "#{@pdf_name}_%d.tif"))
          run "OMP_NUM_THREADS=2 gm convert +adjoin #{MEMORY_ARGS} #{OCR_FLAGS} #{escaped_pdf} #{tiff_pattern} 2>&1"
          @tiffs_generated = true
        end
        pages.each do |page|
          # Page numbers are 1-based; the TIFF for a page carries index
          # page - 1 (the `%d` frames are presumably numbered from zero).
          tiff        = Shellwords.escape(File.join(@tempdir, "#{@pdf_name}_#{page - 1}.tif"))
          output_base = Shellwords.escape("#{base_path}_#{page}")
          run "tesseract #{tiff} #{output_base} 2>&1"
        end
      else
        tiff = Shellwords.escape(File.join(@tempdir, "#{@pdf_name}.tif"))
        run "OMP_NUM_THREADS=2 gm convert #{MEMORY_ARGS} #{OCR_FLAGS} #{escaped_pdf} #{tiff} 2>&1"
        run "tesseract #{tiff} #{Shellwords.escape(base_path)} -l eng 2>&1"
      end
    end


    private

    # Run an external process and raise an exception if it fails.
    # Returns the command's captured output; on failure that output becomes
    # the ExtractionFailed message.
    def run(command)
      result = `#{command}`
      raise ExtractionFailed, result unless $?.success?
      result
    end

    # Extract the full contents of a pdf as a single file, directly.
    def extract_full(pdf)
      text_path = File.join(@output, "#{@pdf_name}.txt")
      run "pdftotext -enc UTF-8 #{Shellwords.escape(pdf)} #{Shellwords.escape(text_path)} 2>&1"
    end

    # Extract the contents of a single page of text, directly, adding it to
    # the `@pages_to_ocr` list if the text length is inadequate.
    def extract_page(pdf, page)
      text_path = File.join(@output, "#{@pdf_name}_#{page}.txt")
      run "pdftotext -enc UTF-8 -f #{page} -l #{page} #{Shellwords.escape(pdf)} #{Shellwords.escape(text_path)} 2>&1"
      return if @forbid_ocr
      # The threshold is defined in bytes, so measure the file on disk rather
      # than the character count of the decoded string.
      @pages_to_ocr.push(page) if File.size(text_path) < MIN_TEXT_PER_PAGE
    end

    # Copy the supported options into instance state.
    def extract_options(options)
      @output     = options[:output] || '.'
      @pages      = options[:pages]
      # `:ocr` is tri-state: true forces OCR, false forbids it, and any other
      # value (typically nil) leaves both flags off so the heuristic decides.
      @force_ocr  = options[:ocr] == true
      @forbid_ocr = options[:ocr] == false
    end

    # Delete the OCR scratch directory, if one was created, and forget it so
    # that a later extraction on this instance allocates a fresh one.
    def remove_tempdir
      return unless @tempdir
      FileUtils.remove_entry_secure(@tempdir) if File.exist?(@tempdir)
      @tempdir = nil
    end

  end

end