From d46165cacba88867517b1661474d0265e10693eb Mon Sep 17 00:00:00 2001 From: James Healy Date: Wed, 6 Jul 2011 19:51:20 +1000 Subject: [PATCH] support multiple receivers in a single parse of a source PDF * In cases where multiple receivers are needed, this saves a *huge* amount of time * before this, the source file was completely parsed and extracted N times for N receivers * now it's parsed and extracted 1 time for N receivers --- CHANGELOG | 4 ++++ lib/pdf/reader.rb | 23 +++++++++++++++++------ lib/pdf/reader/abstract_strategy.rb | 18 ++++++++++-------- spec/pages_strategy_spec.rb | 18 ++++++++++++++++++ 4 files changed, 49 insertions(+), 14 deletions(-) diff --git a/CHANGELOG b/CHANGELOG index c81e6611..6a2e120c 100644 --- a/CHANGELOG +++ b/CHANGELOG @@ -1,3 +1,7 @@ +v0.9.4 (XXX) +- support multiple receivers within a single pass over a source file + - massive time saving when dealing with multiple receivers + v0.9.3 (2nd July 2011) - add PDF::Reader::Reference#hash method - improves behaviour of Reference objects when tehy're used as Hash keys diff --git a/lib/pdf/reader.rb b/lib/pdf/reader.rb index 0278983f..fdea1130 100644 --- a/lib/pdf/reader.rb +++ b/lib/pdf/reader.rb @@ -73,21 +73,32 @@ module PDF # :pages # :raw_text # + # = Processing with multiple receivers + # + # If you wish to parse a PDF file with multiple simultaneous receivers, just + # pass an array of receivers as the second argument: + # + # pdf = PDF::Reader.new + # pdf.parse(File.new("somefile.pdf"), [receiver_one, receiver_two]) + # + # This saves a significant amount of time by limiting the work to a single pass + # over the source file. + # class Reader # Parse the file with the given name, sending events to the given receiver. # - def self.file(name, receiver, opts = {}) + def self.file(name, receivers, opts = {}) File.open(name,"rb") do |f| - new.parse(f, receiver, opts) + new.parse(f, receivers, opts) end end # Parse the given string, sending events to the given receiver. 
# - def self.string(str, receiver, opts = {}) + def self.string(str, receivers, opts = {}) StringIO.open(str) do |s| - new.parse(s, receiver, opts) + new.parse(s, receivers, opts) end end @@ -111,7 +122,7 @@ def self.object_string(str, id, gen = 0) # Given an IO object that contains PDF data, parse it. # - def parse(io, receiver, opts = {}) + def parse(io, receivers, opts = {}) ohash = ObjectHash.new(io) if ohash.trailer[:Encrypt] @@ -122,7 +133,7 @@ def parse(io, receiver, opts = {}) options.merge!(opts) strategies.each do |s| - s.new(ohash, receiver, options).process + s.new(ohash, receivers, options).process end self diff --git a/lib/pdf/reader/abstract_strategy.rb b/lib/pdf/reader/abstract_strategy.rb index 83bff559..06826eb8 100644 --- a/lib/pdf/reader/abstract_strategy.rb +++ b/lib/pdf/reader/abstract_strategy.rb @@ -4,8 +4,13 @@ class PDF::Reader class AbstractStrategy # :nodoc: - def initialize(ohash, receiver, options = {}) - @ohash, @receiver, @options = ohash, receiver, options + def initialize(ohash, receivers, options = {}) + @ohash, @options = ohash, options + if receivers.is_a?(Array) + @receivers = receivers + else + @receivers = [receivers] + end end private @@ -17,7 +22,9 @@ def options # calls the name callback method on the receiver class with params as the arguments # def callback (name, params=[]) - receiver.send(name, *params) if receiver.respond_to?(name) + @receivers.each do |receiver| + receiver.send(name, *params) if receiver.respond_to?(name) + end end # strings outside of page content should be in either PDFDocEncoding or UTF-16. @@ -56,10 +63,6 @@ def pages? pages ? 
true : false end - def receiver - @receiver - end - def root ohash.object(trailer[:Root]) end @@ -74,4 +77,3 @@ def trailer end end - diff --git a/spec/pages_strategy_spec.rb b/spec/pages_strategy_spec.rb index 5d4fb2a8..74cfba16 100644 --- a/spec/pages_strategy_spec.rb +++ b/spec/pages_strategy_spec.rb @@ -87,4 +87,22 @@ class PDF::Reader::PagesStrategy text_callbacks[0][:args].should eql([["My name is"]]) text_callbacks[1][:args].should eql([["James Healy"]]) end + + it "should send the correct callbacks when using more than one receiver" do + + # mock up an object that will be called with callbacks. This will test that + # the content class correctly recognises all instructions + one = mock("receiver_one") + one.should_receive(:move_text_position).once # Td + + two = mock("receiver_two") + two.should_receive(:move_text_position).once # Td + + # The instructions to test with + instructions = "36.000 794.330 Td" + + # process the instructions + content = PDF::Reader::PagesStrategy.new(nil, [one, two]) + content.content_stream(instructions) + end end