-
Notifications
You must be signed in to change notification settings - Fork 267
/
reader.rb
182 lines (165 loc) · 5.71 KB
/
reader.rb
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
################################################################################
#
# Copyright (C) 2006 Peter J Jones (pjones@pmade.com)
#
# Permission is hereby granted, free of charge, to any person obtaining
# a copy of this software and associated documentation files (the
# "Software"), to deal in the Software without restriction, including
# without limitation the rights to use, copy, modify, merge, publish,
# distribute, sublicense, and/or sell copies of the Software, and to
# permit persons to whom the Software is furnished to do so, subject to
# the following conditions:
#
# The above copyright notice and this permission notice shall be
# included in all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
# LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
# OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
# WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
#
################################################################################
require 'stringio'
require 'zlib'
require 'ascii85'
module PDF
################################################################################
# The Reader class serves as an entry point for parsing a PDF file. There are three
# ways to kick off processing - which one you pick will be based on personal preference
# and the situation.
#
# For all examples, assume the receiver variable contains an object that will respond
# to various callbacks. Refer to the README and PDF::Reader::Content for more information
# on receivers.
#
# = Parsing a file
#
# PDF::Reader.file("somefile.pdf", receiver)
#
# = Parsing a String
#
# This is useful for processing a PDF that is already in memory
#
# PDF::Reader.string(pdf_string, receiver)
#
# = Parsing an IO object
#
# This can be a useful alternative to the first 2 options in some situations
#
# pdf = PDF::Reader.new
# pdf.parse(File.new("somefile.pdf"), receiver)
#
# = Parsing parts of a file
#
# PDF::Reader.file, PDF::Reader.string and PDF::Reader#parse all accept a
# final options argument that specifies which parts of the file to process.
# By default :metadata and :pages are enabled and :raw_text is disabled, so
# this can be useful to cut down processing time if you're only interested
# in, say, metadata.
#
# As an example, the following call will disable parsing the contents of
# pages in the file, but explicitly enables processing metadata.
#
# PDF::Reader.file("somefile.pdf", receiver, {:metadata => true, :pages => false})
#
# Available options are currently:
#
# :metadata
# :pages
# :raw_text
#
# = Processing with multiple receivers
#
# If you wish to parse a PDF file with multiple simultaneous receivers, just
# pass an array of receivers as the second argument:
#
# pdf = PDF::Reader.new
# pdf.parse(File.new("somefile.pdf"), [receiver_one, receiver_two])
#
# This saves a significant amount of time by limiting the work to a single pass
# over the source file.
#
class Reader
  # Parse the file with the given name, sending events to the given
  # receiver(s).
  #
  # name      - path of the PDF file; it is opened in binary mode ("rb")
  # receivers - forwarded unchanged to #parse
  # opts      - forwarded unchanged to #parse
  #
  # Returns whatever #parse returns (the freshly created Reader).
  #
  def self.file(name, receivers, opts = {})
    File.open(name, "rb") do |f|
      new.parse(f, receivers, opts)
    end
  end

  # Parse the given string, sending events to the given receiver(s).
  #
  # str       - a String holding PDF data; it is wrapped in a StringIO
  # receivers - forwarded unchanged to #parse
  # opts      - forwarded unchanged to #parse
  #
  # Returns whatever #parse returns (the freshly created Reader).
  #
  def self.string(str, receivers, opts = {})
    StringIO.open(str) do |s|
      new.parse(s, receivers, opts)
    end
  end

  # Open the file with the given name and return the Ruby representation
  # of a single PDF object from it.
  #
  # name - path of the PDF file; it is opened in binary mode ("rb")
  # id   - object id, converted with #to_i
  # gen  - generation number, converted with #to_i (defaults to 0)
  #
  # Returns whatever #object returns for that reference.
  #
  def self.object_file(name, id, gen = 0)
    File.open(name, "rb") do |f|
      new.object(f, id.to_i, gen.to_i)
    end
  end

  # Read the given string as PDF data and return the Ruby representation
  # of a single PDF object from it.
  #
  # str - a String holding PDF data; it is wrapped in a StringIO
  # id  - object id, converted with #to_i
  # gen - generation number, converted with #to_i (defaults to 0)
  #
  # Returns whatever #object returns for that reference.
  #
  def self.object_string(str, id, gen = 0)
    StringIO.open(str) do |s|
      new.object(s, id.to_i, gen.to_i)
    end
  end

  # Given an IO object that contains PDF data, parse it.
  #
  # io        - IO-like object handed to ObjectHash.new
  # receivers - handed as-is to every strategy (the class documentation
  #             describes a single receiver or an array of receivers)
  # opts      - Hash of overrides merged over the defaults
  #             {:pages => true, :raw_text => false, :metadata => true};
  #             nil is treated the same as an empty Hash
  #
  # Raises PDF::Reader::UnsupportedFeatureError when the file's trailer
  # has an :Encrypt entry.
  #
  # Returns self.
  #
  def parse(io, receivers, opts = {})
    ohash = ObjectHash.new(io)

    if ohash.trailer[:Encrypt]
      raise ::PDF::Reader::UnsupportedFeatureError, 'PDF::Reader cannot read encrypted PDF files'
    end

    defaults = {:pages => true, :raw_text => false, :metadata => true}
    # A caller may pass nil explicitly; treat that as "no overrides"
    # instead of failing inside the merge.
    options = defaults.merge(opts || {})

    # Every strategy is given the same object hash, receivers and options.
    strategies.each do |strategy|
      strategy.new(ohash, receivers, options).process
    end

    self
  end

  # Given an IO object that contains PDF data, return the contents of a
  # single object.
  #
  # io  - IO-like object handed to ObjectHash.new
  # id  - object id used to build the Reference
  # gen - generation number used to build the Reference
  #
  # The ObjectHash built here is retained on the instance as @ohash.
  #
  def object(io, id, gen)
    @ohash = ObjectHash.new(io)
    @ohash.object(Reference.new(id, gen))
  end

  private

  # The strategy classes that #parse instantiates and runs, in order.
  # The list is built once per Reader instance and then reused.
  def strategies
    @strategies ||= [
      ::PDF::Reader::MetadataStrategy,
      ::PDF::Reader::PagesStrategy
    ]
  end
end
end
################################################################################
require 'pdf/reader/abstract_strategy'
require 'pdf/reader/buffer'
require 'pdf/reader/cmap'
require 'pdf/reader/encoding'
require 'pdf/reader/error'
require 'pdf/reader/filter'
require 'pdf/reader/font'
require 'pdf/reader/lzw'
require 'pdf/reader/metadata_strategy'
require 'pdf/reader/object_hash'
require 'pdf/reader/object_stream'
require 'pdf/reader/pages_strategy'
require 'pdf/reader/parser'
require 'pdf/reader/print_receiver'
require 'pdf/reader/reference'
require 'pdf/reader/register_receiver'
require 'pdf/reader/stream'
require 'pdf/reader/text_receiver'
require 'pdf/reader/token'
require 'pdf/reader/xref'
require 'pdf/hash'