forked from documentcloud/docsplit
-
Notifications
You must be signed in to change notification settings - Fork 0
/
docsplit.rb
executable file
·94 lines (75 loc) · 3.23 KB
/
docsplit.rb
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
# The Docsplit module delegates to the Java PDF extractors.
#
# Loading this file defines the module's constants, probes PATH for the
# external executables listed in DEPENDENCIES (warning about each one that is
# missing), and defines the public `Docsplit.extract_*` entry points. The
# extractor classes and the helpers `ensure_pdfs`, `normalize_value` and
# `parse_options` are not defined in this file.
module Docsplit

  VERSION       = '0.2.1' # Keep in sync with gemspec.

  # Directory one level above the directory containing this file.
  ROOT          = File.expand_path(File.dirname(__FILE__) + '/..')

  # Java classpath. The wildcard is single-quoted because this string is
  # interpolated into a shell command by `run`; the quotes keep the shell from
  # expanding it.
  CLASSPATH     = "#{ROOT}/build#{File::PATH_SEPARATOR}#{ROOT}/vendor/'*'"

  LOGGING       = "-Djava.util.logging.config.file=#{ROOT}/vendor/logging.properties"

  HEADLESS      = "-Djava.awt.headless=true"

  # One `Docsplit.extract_<key>` method is generated for each of these.
  METADATA_KEYS = [:author, :date, :creator, :keywords, :producer, :subject, :title, :length]

  # Maps each external executable to whether it was found on PATH. The values
  # are filled in by the check immediately below, at load time.
  DEPENDENCIES  = {:java => false, :gm => false, :pdftotext => false, :pdftk => false, :tesseract => false}

  # Check for all dependencies, and warn of their absence.
  # `to_s` guards against an unset PATH, which would otherwise raise here and
  # abort loading of the library.
  search_dirs = ENV['PATH'].to_s.split(File::PATH_SEPARATOR)
  DEPENDENCIES.each_key do |dep|
    DEPENDENCIES[dep] = search_dirs.any? { |dir| File.executable?(File.join(dir, dep.to_s)) }
    warn "Warning: Docsplit dependency #{dep} not found." unless DEPENDENCIES[dep]
  end

  # Raised by `run` when the underlying command exits unsuccessfully, e.g.
  # when the PDF is encrypted or otherwise broken. The message is the
  # command's combined stdout/stderr output.
  class ExtractionFailed < StandardError; end

  # Burst PDFs into single pages.
  #
  # pdfs - a path or list of paths; passed through `ensure_pdfs` first.
  # opts - options hash handed unchanged to PageExtractor#extract.
  #
  # Returns whatever PageExtractor#extract returns.
  def self.extract_pages(pdfs, opts={})
    pdfs = ensure_pdfs(pdfs)
    PageExtractor.new.extract(pdfs, opts)
  end

  # Write out the text of PDFs.
  #
  # pdfs - a path or list of paths; passed through `ensure_pdfs` first.
  # opts - options hash handed unchanged to TextExtractor#extract.
  #
  # Returns whatever TextExtractor#extract returns.
  def self.extract_text(pdfs, opts={})
    pdfs = ensure_pdfs(pdfs)
    TextExtractor.new.extract(pdfs, opts)
  end

  # Rasterize PDFs into images.
  #
  # pdfs - a path or list of paths; passed through `ensure_pdfs` first.
  # opts - options hash. When `opts[:pages]` is set it is run through
  #        `normalize_value` before being handed to ImageExtractor#extract.
  #
  # Returns whatever ImageExtractor#extract returns.
  def self.extract_images(pdfs, opts={})
    pdfs = ensure_pdfs(pdfs)
    # Build a new hash rather than assigning into `opts`, so the caller's
    # hash is not modified as a side effect of this call.
    opts = opts.merge(:pages => normalize_value(opts[:pages])) if opts[:pages]
    ImageExtractor.new.extract(pdfs, opts)
  end

  # Use JODConverter to convert documents to PDFs.
  #
  # docs - a path or (possibly nested) list of paths to convert.
  # opts - options hash; `opts[:output]` is the destination directory and
  #        defaults to '.'.
  #
  # Each PDF is written as "<output>/<basename of doc without extension>.pdf".
  # Returns the flattened list of docs. Raises ExtractionFailed (via `run`)
  # when a conversion command fails.
  def self.extract_pdf(docs, opts={})
    out_dir = opts[:output] || '.'
    [docs].flatten.each do |doc|
      basename = File.basename(doc, File.extname(doc))
      target   = "#{out_dir}/#{basename}.pdf"
      # Double-quote both paths, the same way `run` quotes its pdfs, so that
      # paths containing spaces reach the converter as single arguments.
      run "-jar #{ROOT}/vendor/jodconverter/jodconverter-cli-2.2.2.jar \"#{doc}\" \"#{target}\"", [], {}
    end
  end

  # Define custom methods for each of the metadata keys that we support:
  # `Docsplit.extract_author`, `Docsplit.extract_title`, and so on. Each one
  # passes its pdfs through `ensure_pdfs` and returns the result of
  # InfoExtractor#extract for that key.
  METADATA_KEYS.each do |key|
    # Passing __FILE__/__LINE__ makes backtraces from the generated methods
    # point at this file instead of "(eval)".
    instance_eval <<-EOS, __FILE__, __LINE__ + 1
      def self.extract_#{key}(pdfs, opts={})
        pdfs = ensure_pdfs(pdfs)
        InfoExtractor.new.extract(:#{key}, pdfs, opts)
      end
    EOS
  end

  # NOTE(review): a bare `private` does not apply to methods defined with
  # `def self.`, so `Docsplit.run` below remains publicly callable. It is left
  # that way on purpose here, because code outside this file may call it --
  # confirm before tightening with `private_class_method`.
  private

  # Runs a Java command, with quieted logging, and the classpath set properly.
  #
  # command       - the arguments placed after the classpath on the java
  #                 command line.
  # pdfs          - a path or list of paths; each is double-quoted and
  #                 appended to the command line.
  # opts          - options hash, converted to command-line arguments by
  #                 `parse_options`.
  # return_output - when true, return the command's output instead of `true`.
  #
  # stderr is merged into stdout (`2>&1`), so the captured output and the
  # ExtractionFailed message contain both streams.
  #
  # Returns true, or, with return_output, the chomped output (nil if empty).
  # Raises ExtractionFailed if the command does not exit successfully.
  def self.run(command, pdfs, opts, return_output=false)
    quoted_pdfs = [pdfs].flatten.map { |pdf| "\"#{pdf}\"" }.join(' ')
    args        = parse_options(opts)
    cmd         = "java #{HEADLESS} #{LOGGING} -cp #{CLASSPATH} #{command} #{args} #{quoted_pdfs} 2>&1"
    result      = `#{cmd}`.chomp
    raise ExtractionFailed, result unless $?.success?
    return true unless return_output
    result.empty? ? nil : result
  end

end
require 'tmpdir'
require 'fileutils'
require "#{Docsplit::ROOT}/lib/docsplit/image_extractor"
require "#{Docsplit::ROOT}/lib/docsplit/argument_parser"
require "#{Docsplit::ROOT}/lib/docsplit/transparent_pdfs"
require "#{Docsplit::ROOT}/lib/docsplit/text_extractor"
require "#{Docsplit::ROOT}/lib/docsplit/page_extractor"
require "#{Docsplit::ROOT}/lib/docsplit/info_extractor"