Permalink
Browse files

experimenting with using a new object to wrap logic of extracting page data

* very incomplete
  • Loading branch information...
1 parent 6890cac commit 582336a3c06252f0632f9e57102d82d7c67d5da1 @yob committed Jun 10, 2010
Showing with 200 additions and 97 deletions.
  1. +1 −1 TODO
  2. +1 −0 lib/pdf/reader.rb
  3. +87 −0 lib/pdf/reader/page.rb
  4. +20 −96 lib/pdf/reader/pages_visitor.rb
  5. +91 −0 specs/page_spec.rb
View
2 TODO
@@ -14,7 +14,7 @@ v0.8
- detect when a font's encoding is a CMap (generally used for pre-Unicode, multibyte asian encodings), and display a user friendly error
- Improve interpretation of non content stream data (ie metadata). recognise dates, etc
- Fix inheritance of page attributes. Resources has been done, but plenty of other attributes
- are inheritable. See table 3.2.7 in the spec
+ are inheritable. See table 3.7.2 in the spec
v0.9
- Add a way to extract raster images
View
1 lib/pdf/reader.rb
@@ -157,6 +157,7 @@ def visitors
require 'pdf/reader/metadata_visitor'
require 'pdf/reader/object_hash'
require 'pdf/reader/object_stream'
+require 'pdf/reader/page'
require 'pdf/reader/pages_visitor'
require 'pdf/reader/parser'
require 'pdf/reader/print_receiver'
View
87 lib/pdf/reader/page.rb
@@ -0,0 +1,87 @@
+# coding: utf-8
+
+
+class PDF::Reader
+ class Page
+ def initialize(ohash, page_dict)
+ @ohash, @page_dict = ohash, page_dict
+ end
+
+ def ohash
+ @ohash
+ end
+
+ def page_dict
+ @page_dict
+ end
+
+ def content_streams
+ return [] if page_dict[:Contents].nil?
+
+ if ohash.object(page_dict[:Contents]).kind_of?(Array)
+ contents = @ohash.object(page_dict[:Contents])
+ else
+ contents = [page_dict[:Contents]]
+ end
+ contents.map { |content| @ohash.object(content) }
+ #content_stream(direct_contents, fonts)
+ end
+
+ def fonts
+ return {} if resources[:Font].nil?
+
+ fonts = {}
+ ohash.object(resources[:Font]).each do |label, desc|
+ desc = @ohash.object(desc)
+ fonts[label] = PDF::Reader::Font.new
+ fonts[label].label = label
+ fonts[label].subtype = desc[:Subtype] if desc[:Subtype]
+ fonts[label].basefont = desc[:BaseFont] if desc[:BaseFont]
+ fonts[label].encoding = PDF::Reader::Encoding.new(@ohash.object(desc[:Encoding]))
+ fonts[label].descendantfonts = desc[:DescendantFonts] if desc[:DescendantFonts]
+ if desc[:ToUnicode]
+ # this stream is a cmap
+ stream = desc[:ToUnicode]
+ fonts[label].tounicode = PDF::Reader::CMap.new(stream.unfiltered_data)
+ end
+ end
+ fonts
+ end
+
+ def parent(dict)
+ if dict.nil? || dict[:Parent].nil?
+ return []
+ else
+ return parent(ohash.object(dict[:Parent])) + [ohash.object(dict[:Parent])]
+ end
+ end
+
+ def parents
+ @parents ||= parent(page_dict)
+ end
+
+ def resources
+ resources = {}
+ parents.reverse.each do |parent|
+ resources.merge!(ohash.object(parent[:Resources])) if ohash.object(parent[:Resources])
+ end
+ resources.merge!(ohash.object(page_dict[:Resources])) if ohash.object(page_dict[:Resources])
+ resolve_references(resources)
+ end
+ ################################################################################
+ # Convert any PDF::Reader::Reference objects into a real object
+ def resolve_references(obj)
+ case obj
+ when PDF::Reader::Stream then
+ obj.hash = resolve_references(obj.hash)
+ obj
+ when PDF::Reader::Reference then
+ resolve_references(@ohash.object(obj))
+ when Hash then obj.each { |key,val| obj[key] = resolve_references(val) }
+ when Array then obj.collect { |item| resolve_references(item) }
+ else
+ obj
+ end
+ end
+ end
+end
View
116 lib/pdf/reader/pages_visitor.rb
@@ -250,46 +250,28 @@ def self.to_sym
# Begin processing the document
def process
callback(:begin_document, [root])
- walk_pages(@ohash.object(root[:Pages]))
+ raw_pages(pages).each do |page|
+ process_page(page)
+ end
callback(:end_document)
end
- private
- ################################################################################
- # Walk over all pages in the PDF file, calling the appropriate callbacks for each page and all
- # its content
- def walk_pages (page)
-
- # extract page content
- if page[:Type] == :Pages
- callback(:begin_page_container, [page])
- res = @ohash.object(page[:Resources])
- resources.push res if res
- @ohash.object(page[:Kids]).each {|child| walk_pages(@ohash.object(child))}
- resources.pop if res
- callback(:end_page_container)
- elsif page[:Type] == :Page
- callback(:begin_page, [page])
- res = @ohash.object(page[:Resources])
- resources.push res if res
- walk_resources(current_resources)
-
- if @ohash.object(page[:Contents]).kind_of?(Array)
- contents = @ohash.object(page[:Contents])
- else
- contents = [page[:Contents]]
- end
- fonts = font_hash_from_resources(current_resources)
-
- if page.has_key?(:Contents) and page[:Contents]
- direct_contents = contents.map { |content| @ohash.object(content) }
- content_stream(direct_contents, fonts)
- end
+ private
- resources.pop if res
- callback(:end_page)
+ def raw_pages(pdf_page)
+ if pdf_page[:Type] == :Pages
+ @ohash.object(pdf_page[:Kids]).map { |child| raw_pages(@ohash.object(child))}.flatten
+ elsif pdf_page[:Type] == :Page
+ PDF::Reader::Page.new(@ohash, pdf_page)
end
end
+ def process_page(page)
+ #callback(:begin_page, [page])
+ callback(:begin_page)
+ walk_resources(page)
+ content_stream(page.content_streams, page.fonts)
+ callback(:end_page)
+ end
################################################################################
# Retrieve the XObject for the supplied label and if it's a Form, walk it
# like a regular page content stream.
@@ -309,16 +291,6 @@ def walk_xobject_form(label)
end
################################################################################
- # Return a merged hash of all resources that are current. Pages, page and xobject
- #
- def current_resources
- hash = {}
- resources.each do |res|
- hash.merge!(res)
- end
- hash
- end
- ################################################################################
# Reads a PDF content stream and calls all the appropriate callback methods for the operators
# it contains
#
@@ -359,7 +331,7 @@ def content_stream (instructions, fonts = {})
if OPERATORS[token] == :invoke_xobject
xobject_label = params.first
params.clear
- walk_xobject_form(xobject_label)
+ # walk_xobject_form(xobject_label)
else
params.clear
end
@@ -371,10 +343,8 @@ def content_stream (instructions, fonts = {})
raise MalformedPDFError, "End Of File while processing a content stream"
end
################################################################################
- def walk_resources(resources)
- return unless resources.respond_to?(:[])
-
- resources = resolve_references(resources)
+ def walk_resources(page)
+ resources = page.resources
# extract any procset information
if resources[:ProcSet]
@@ -411,57 +381,11 @@ def walk_resources(resources)
# extract any font information
if resources[:Font]
- fonts = font_hash_from_resources(resources)
- fonts.each do |label, font|
+ page.fonts.each do |label, font|
callback(:resource_font, [label, font])
end
end
end
- ################################################################################
- # Convert any PDF::Reader::Resource objects into a real object
- def resolve_references(obj)
- case obj
- when PDF::Reader::Stream then
- obj.hash = resolve_references(obj.hash)
- obj
- when PDF::Reader::Reference then
- resolve_references(@ohash.object(obj))
- when Hash then obj.each { |key,val| obj[key] = resolve_references(val) }
- when Array then obj.collect { |item| resolve_references(item) }
- else
- obj
- end
- end
- ################################################################################
- ################################################################################
- def font_hash_from_resources(resources)
- return {} unless resources.respond_to?(:[])
-
- fonts = {}
- resources = @ohash.object(resources[:Font]) || {}
- resources.each do |label, desc|
- desc = @ohash.object(desc)
- fonts[label] = PDF::Reader::Font.new
- fonts[label].label = label
- fonts[label].subtype = desc[:Subtype] if desc[:Subtype]
- fonts[label].basefont = desc[:BaseFont] if desc[:BaseFont]
- fonts[label].encoding = PDF::Reader::Encoding.new(@ohash.object(desc[:Encoding]))
- fonts[label].descendantfonts = desc[:DescendantFonts] if desc[:DescendantFonts]
- if desc[:ToUnicode]
- # this stream is a cmap
- begin
- stream = desc[:ToUnicode]
- fonts[label].tounicode = PDF::Reader::CMap.new(stream.unfiltered_data)
- rescue
- # if the CMap fails to parse, don't worry too much. Means we can't translate the text properly
- end
- end
- end
- fonts
- end
- def resources
- @resources ||= []
- end
end
################################################################################
end
View
91 specs/page_spec.rb
@@ -0,0 +1,91 @@
+# coding: utf-8
+
+require File.dirname(__FILE__) + "/spec_helper"
+
+context PDF::Reader::Page, "content_streams method" do
+
+ it "should return an array of Stream objects for page 1 of cairo-basic.pdf" do
+ @filename = File.dirname(__FILE__) + "/data/cairo-basic.pdf"
+ @ohash = PDF::Reader::ObjectHash.new(@filename)
+ @dict = @ohash[4]
+ @page = PDF::Reader::Page.new(@ohash, @dict)
+
+ @page.content_streams.should be_a(Array)
+ @page.content_streams.size.should eql(1)
+ @page.content_streams.each do |obj|
+ obj.should be_a(PDF::Reader::Stream)
+ end
+ end
+
+ it "should return an array of Stream objects for page 1 of split_params_and_operator.pdf" do
+ @filename = File.dirname(__FILE__) + "/data/split_params_and_operator.pdf"
+ @ohash = PDF::Reader::ObjectHash.new(@filename)
+ @dict = @ohash[5]
+ @page = PDF::Reader::Page.new(@ohash, @dict)
+
+ @page.content_streams.should be_a(Array)
+ @page.content_streams.size.should eql(2)
+ @page.content_streams.each do |obj|
+ obj.should be_a(PDF::Reader::Stream)
+ end
+ end
+end
+
+context PDF::Reader::Page, "parents method" do
+
+ it "should return an array of PDF Page objects for page 1 of cairo-basic.pdf" do
+ @filename = File.dirname(__FILE__) + "/data/cairo-basic.pdf"
+ @ohash = PDF::Reader::ObjectHash.new(@filename)
+ @dict = @ohash[4]
+ @page = PDF::Reader::Page.new(@ohash, @dict)
+
+ @page.parents.should be_a(Array)
+ @page.parents.size.should eql(1)
+ @page.parents.each do |obj|
+ obj.should be_a(Hash)
+ obj[:Type].should eql(:Pages)
+ end
+ end
+end
+
+context PDF::Reader::Page, "resources method" do
+
+ it "should return a hash of Resources applicable to page 1 of cairo-basic.pdf" do
+ @filename = File.dirname(__FILE__) + "/data/cairo-basic.pdf"
+ @ohash = PDF::Reader::ObjectHash.new(@filename)
+ @dict = @ohash[4]
+ @page = PDF::Reader::Page.new(@ohash, @dict)
+
+ @page.resources.should be_a(Hash)
+ @page.resources.size.should eql(2)
+ @page.resources.keys.include?(:Font).should be_true
+ @page.resources.keys.include?(:ExtGState).should be_true
+ end
+
+ it "should return a hash of Resources applicable to page 1 of split_params_and_operator.pdf" do
+ @filename = File.dirname(__FILE__) + "/data/split_params_and_operator.pdf"
+ @ohash = PDF::Reader::ObjectHash.new(@filename)
+ @dict = @ohash[5]
+ @page = PDF::Reader::Page.new(@ohash, @dict)
+
+ @page.resources.should be_a(Hash)
+ @page.resources.size.should eql(2)
+ @page.resources.keys.include?(:Font).should be_true
+ @page.resources.keys.include?(:ProcSet).should be_true
+ end
+end
+
+context PDF::Reader::Page, "fonts method" do
+
+ it "should return a hash of Fonts applicable to page 1 of cairo-basic.pdf" do
+ @filename = File.dirname(__FILE__) + "/data/cairo-basic.pdf"
+ @ohash = PDF::Reader::ObjectHash.new(@filename)
+ @dict = @ohash[4]
+ @page = PDF::Reader::Page.new(@ohash, @dict)
+
+ @page.fonts.should be_a(Hash)
+ @page.fonts.size.should eql(1)
+ @page.fonts.keys[0].should eql(:"CairoFont-0-0")
+ @page.fonts.values[0].should be_a(PDF::Reader::Font)
+ end
+end

0 comments on commit 582336a

Please sign in to comment.