Browse files

extract lower-level PDF routines to pdf-reader-turtletext gem

  • Loading branch information...
1 parent 09ac749 commit 7643fae550b3c927d28f325ba5a01098cc4500a0 @tardate committed Jul 22, 2012
View
3 Gemfile
@@ -1,12 +1,11 @@
source "http://rubygems.org"
-gem 'pdf-reader', '1.1.1'
+gem 'pdf-reader-turtletext', '~> 0.1.0'
gem 'getoptions', '~> 0.3'
group :development do
gem 'bundler', '~> 1.1.4'
gem 'jeweler', '~> 1.6.4'
- gem 'rcov', '>= 0'
end
group :development, :test do
View
10 Gemfile.lock
@@ -3,7 +3,7 @@ GEM
specs:
Ascii85 (1.0.1)
diff-lcs (1.1.3)
- ffi (1.0.11)
+ ffi (1.1.0)
getoptions (0.3)
git (1.2.5)
guard (1.2.3)
@@ -15,21 +15,22 @@ GEM
bundler (~> 1.0)
git (>= 1.2.5)
rake
- json (1.6.4)
+ json (1.7.3)
listen (0.4.7)
rb-fchange (~> 0.0.5)
rb-fsevent (~> 0.9.1)
rb-inotify (~> 0.8.8)
pdf-reader (1.1.1)
Ascii85 (~> 1.0.0)
ruby-rc4
+ pdf-reader-turtletext (0.1.0)
+ pdf-reader (= 1.1.1)
rake (0.9.2.2)
rb-fchange (0.0.5)
ffi
rb-fsevent (0.9.1)
rb-inotify (0.8.8)
ffi (>= 0.5.0)
- rcov (0.9.11)
rdoc (3.12)
json (~> 1.4)
rspec (2.8.0)
@@ -51,8 +52,7 @@ DEPENDENCIES
getoptions (~> 0.3)
guard-rspec
jeweler (~> 1.6.4)
- pdf-reader (= 1.1.1)
+ pdf-reader-turtletext (~> 0.1.0)
rake (~> 0.9.2.2)
- rcov
rdoc (~> 3.11)
rspec (~> 2.8.0)
View
46 lib/pdf/object_hash.rb
@@ -1,46 +0,0 @@
-# This monkey-patches pdf-reader to allow it to read downloaded SP Services PDF.
-# The patch caters for junk characters that appear in the file before the start of the PDF stream.
-# (it's an HTML head block actually - I suspect a bug in the Adobe software used to serve the bills)
-#
-# The patch has been contributed back to the pdf-reader project (https://github.com/yob/pdf-reader/pull/54)
-# and has already been merged on master. When it shows up in a release of the pdf-reader gem
-# we can trash this patch.
-#
-class PDF::Reader::ObjectHash
-
- def extract_io_from(input)
- if input.respond_to?(:seek) && input.respond_to?(:read)
- input
- elsif File.file?(input.to_s)
- read_with_quirks(input)
- else
- raise ArgumentError, "input must be an IO-like object or a filename"
- end
- end
-
- # Load file as a StringIO stream, accounting for invalid format
- # where additional characters exist in the file before the %PDF start of file
- def read_with_quirks(input)
- stream = File.open(input.to_s, "rb")
- if ofs = pdf_offset(stream)
- stream.seek(ofs)
- StringIO.new(stream.read)
- else
- raise ArgumentError, "invalid file format"
- end
- end
- private :read_with_quirks
-
- # Returns the offset of the PDF document in the +stream+.
- # Checks up to 50 chars into the file, returns nil if no PDF stream detected.
- def pdf_offset(stream)
- stream.rewind
- ofs = stream.pos
- until (c = stream.readchar) == '%' || c == 37 || ofs > 50
- ofs += 1
- end
- ofs < 50 ? ofs : nil
- end
- private :pdf_offset
-
-end
View
17 lib/pdf/positional_text_receiver.rb
@@ -1,17 +0,0 @@
-# Class for collating positional text content from a PDF
-class PDF::Reader::PositionalTextReceiver < PDF::Reader::PageTextReceiver
-
- # record text that is drawn on the page
- def show_text(string) # Tj
- raise PDF::Reader::MalformedPDFError, "current font is invalid" if @state.current_font.nil?
- newx, newy = @state.trm_transform(0,0)
- @content[newy] ||= {}
- @content[newy][newx] = @state.current_font.to_utf8(string)
- end
-
- # override content accessor
- def content
- @content
- end
-
-end
View
108 lib/pdf/structured_reader.rb
@@ -1,108 +0,0 @@
-# Class for reading structured text content
-# This is the one that is a bit hairy - specifically check the fuzzed_y usage
-# which attempts to align text content in the PDF so it can be extracted
-# with correct alignment.
-#
-class PDF::StructuredReader
- attr_reader :reader
-
- # +source+ is a file name or stream-like object
- def initialize(source)
- @reader = PDF::Reader.new(source)
- end
-
- # Returns positional (with fuzzed y positioning) text content collection as a hash:
- # { y_position: { x_position: content}}
- def content(page=1)
- @content ||= []
- if @content[page]
- @content[page]
- else
- @content[page] = fuzzed_y(precise_content(page))
- end
- end
-
- # Returns a hash with fuzzed y positioning:
- # { fuzzed_y_position: { x_position: content}}
- # Given +input+ as a hash:
- # { y_position: { x_position: content}}
- # y values that fall within +precision+ points of another will be clustered
- def fuzzed_y(input,precision=3)
- output = {}
- input.keys.sort.each do |precise_y|
- # matching_y = (precise_y / 5.0).truncate * 5.0
- matching_y = output.keys.select{|new_y| (new_y - precise_y).abs < precision }.first || precise_y
- output[matching_y] ||= {}
- output[matching_y].merge!(input[precise_y])
- end
- output
- end
-
- # Returns positional text content collection as a hash with precise x,y positioning:
- # { y_position: { x_position: content}}
- def precise_content(page=1)
- @precise_content ||= []
- if @precise_content[page]
- @precise_content[page]
- else
- @precise_content[page] = load_content(page)
- end
- end
-
- # Returns an array of text elements in the bounding box
- def text_in_rect(xmin,xmax,ymin,ymax,page=1)
- text_map = content(page)
- box = []
- text_map.keys.sort.reverse.each do |y|
- if y >= ymin && y<= ymax
- row = []
- text_map[y].keys.sort.each do |x|
- if x >= xmin && x<= xmax
- row << text_map[y][x]
- end
- end
- box << row unless row.empty?
- end
- end
- box
- end
-
- # Returns the position {x: val, y: val } of +text+ on +page+
- # +text+ may be a string (exact match required) or a Regexp
- def text_position(text,page=1)
- item = if text.class <= Regexp
- content(page).map {|k,v| if x = v.reduce(nil){|memo,vv| memo = (vv[1] =~ text) ? vv[0] : memo } ; [k,x] ; end }
- else
- content(page).map {|k,v| if x = v.rassoc(text) ; [k,x] ; end }
- end
- item = item.compact.flatten
- unless item.empty?
- { :x => item[1], :y => item[0] }
- end
- end
-
- # WIP - not using Textangle yet for text extraction.
- # Ideal usage is something like this:
- #
- # textangle = reader.bounding_box do
- # page 1
- # below "Electricity Services"
- # above "Gas Services by City Gas Pte Ltd"
- # right_of 240.0
- # left_of "Total ($)"
- # end
- # textangle.text
- #
- def bounding_box(&block)
- PDF::Reader::Textangle.new(self,&block)
- end
-
- private
-
- def load_content(page)
- receiver = PDF::Reader::PositionalTextReceiver.new
- reader.page(page).walk(receiver)
- receiver.content
- end
-
-end
View
27 lib/pdf/textangle.rb
@@ -1,27 +0,0 @@
-# A DSL syntax for text extraction.
-# WIP - not using this yet
-#
-# textangle = PDF::Reader::Textangle.new(reader) do
-# page 1
-# below "Electricity Services"
-# above "Gas Services by City Gas Pte Ltd"
-# right_of 240.0
-# left_of "Total ($)"
-# end
-# textangle.text
-#
-class PDF::Reader::Textangle
- attr_reader :reader
- attr_writer :page,:above,:below,:left_of,:right_of
-
- # +structured_reader+ is a PDF::StructuredReader
- def initialize(structured_reader,&block)
- @reader = structured_reader
- instance_eval( &block ) if block
- end
-
- def text
- # TODO
- end
-
-end
View
6 lib/sps_bill.rb
@@ -1,8 +1,4 @@
-require 'pdf-reader'
-require 'pdf/object_hash'
-require 'pdf/positional_text_receiver'
-require 'pdf/textangle'
-require 'pdf/structured_reader'
+require 'pdf-reader-turtletext'
module SpsBill
end
View
2 lib/sps_bill/bill.rb
@@ -31,7 +31,7 @@ def initialize(source)
# Returns the PDF reader instance
def reader
- @reader ||= PDF::StructuredReader.new(source_file) if source_file
+ @reader ||= PDF::Reader::Turtletext.new(source_file) if source_file
end
# Return a pretty(-ish) text format of the core bill details
View
16 lib/sps_bill/bill_parser.rb
@@ -26,30 +26,30 @@ def do_complete_parse
# Command: extracts the account number
def parse_account_number
- @account_number = reader.text_in_rect(383.0,999.0,785.0,790.0,1).flatten.join('')
+ @account_number = reader.text_in_region(383.0,999.0,785.0,790.0,1).flatten.join('')
end
# Command: extracts the total amount due for the current month
def parse_total_amount
@total_amount = if ref = reader.text_position(/^Total Current Charges due on/)
- total_parts = reader.text_in_rect(ref[:x] + 1,400.0,ref[:y] - 1,ref[:y] + 1,1)
+ total_parts = reader.text_in_region(ref[:x] + 1,400.0,ref[:y] - 1,ref[:y] + 1,1)
total_parts.flatten.first.to_f
end
end
# Command: extracts the invoice date
def parse_invoice_date
@invoice_date = if ref = reader.text_position("Dated")
- date_parts = reader.text_in_rect(ref[:x] + 1,999.0,ref[:y] - 1,ref[:y] + 1,1)
+ date_parts = reader.text_in_region(ref[:x] + 1,999.0,ref[:y] - 1,ref[:y] + 1,1)
Date.parse(date_parts.first.join('-'))
end
end
# Command: extracts the invoice month (as Date, set to 1st of the month)
def parse_invoice_month
@invoice_month = if ref = reader.text_position("Dated")
- date_parts = reader.text_in_rect(ref[:x] + 1,999.0,ref[:y] - 1,ref[:y] + 1,1)
- m_parts = reader.text_in_rect(ref[:x]-200,ref[:x]-1,ref[:y] - 1,ref[:y] + 1,1)
+ date_parts = reader.text_in_region(ref[:x] + 1,999.0,ref[:y] - 1,ref[:y] + 1,1)
+ m_parts = reader.text_in_region(ref[:x]-200,ref[:x]-1,ref[:y] - 1,ref[:y] + 1,1)
Date.parse("#{date_parts.first.last}-#{m_parts.first.first}-01")
end
end
@@ -61,7 +61,7 @@ def parse_electricity_usage
lower_ref = reader.text_position(GAS_SERVICE_HEAD)
lower_ref ||= reader.text_position(WATER_SERVICE_HEAD)
if lower_ref
- raw_data = reader.text_in_rect(240.0,450.0,lower_ref[:y]+1,upper_ref[:y],1)
+ raw_data = reader.text_in_region(240.0,450.0,lower_ref[:y]+1,upper_ref[:y],1)
raw_data.map{|l| {:kwh => l[0].gsub(/kwh/i,'').to_f, :rate => l[1].to_f, :amount => l[2].to_f} }
end
end
@@ -72,7 +72,7 @@ def parse_electricity_usage
def parse_gas_usage
@gas_usage = if upper_ref = reader.text_position(GAS_SERVICE_HEAD)
if lower_ref = reader.text_position(WATER_SERVICE_HEAD)
- raw_data = reader.text_in_rect(240.0,450.0,lower_ref[:y]+1,upper_ref[:y],1)
+ raw_data = reader.text_in_region(240.0,450.0,lower_ref[:y]+1,upper_ref[:y],1)
raw_data.map{|l| {:kwh => l[0].gsub(/kwh/i,'').to_f, :rate => l[1].to_f, :amount => l[2].to_f} }
end
end
@@ -83,7 +83,7 @@ def parse_gas_usage
def parse_water_usage
@water_usage = if upper_ref = reader.text_position(WATER_SERVICE_HEAD)
if lower_ref = reader.text_position("Waterborne Fee")
- raw_data = reader.text_in_rect(240.0,450.0,lower_ref[:y]+1,upper_ref[:y],1)
+ raw_data = reader.text_in_region(240.0,450.0,lower_ref[:y]+1,upper_ref[:y],1)
raw_data.map{|l| {:cubic_m => l[0].gsub(/cu m/i,'').to_f, :rate => l[1].to_f, :amount => l[2].to_f} }
end
end
View
BIN spec/fixtures/pdf_samples/junk_prefix.pdf
Binary file not shown.
View
2 spec/support/bill_examples.rb
@@ -5,7 +5,7 @@
let(:resource) { eval "#{resource_key}" }
let(:reader) { resource.reader }
subject { reader }
- it { should be_a(PDF::StructuredReader) }
+ it { should be_a(PDF::Reader::Turtletext) }
end
end
View
15 spec/unit/pdf/object_hash_spec.rb
@@ -1,15 +0,0 @@
-require 'spec_helper'
-include PdfSamplesHelper
-
-describe PDF::Reader::ObjectHash do
-
- context "when there is a junk prefix" do
- let(:sample_name) { junk_prefix_pdf_sample_name }
- let(:object_hash) { PDF::Reader::ObjectHash.new(sample_name) }
- let(:stream) { object_hash.instance_variable_get(:@io) }
- before { stream.rewind }
- subject { stream.read(4) }
- it { should eql("%PDF") }
- end
-
-end

0 comments on commit 7643fae

Please sign in to comment.