Skip to content

HTTPS clone URL

Subversion checkout URL

You can clone with HTTPS or Subversion.
Download ZIP
Browse files

fix text extraction to work with non-A4 page sizes

  • Loading branch information...
commit ad9da7e3c837e0232b5b17239ef93e2e603d295b 1 parent b53321a
@yob authored
View
9 lib/pdf/reader/page_layout.rb
@@ -4,13 +4,16 @@ class PDF::Reader
# Takes a collection of TextRun objects and renders them into a single
# string that best approximates the way they'd appear on a rendered PDF page.
+ #
+ # mediabox should be a 4-number array that describes the dimensions of the
+ # page to be rendered, as given by the page's MediaBox attribute
class PageLayout
- def initialize(runs)
+ def initialize(runs, mediabox)
@runs = merge_runs(runs)
@mean_font_size = mean(@runs.map(&:font_size)) || 0
@mean_glyph_width = mean(@runs.map(&:mean_character_width)) || 0
- @page_width = 595.28
- @page_height = 841.89
+ @page_width = mediabox[2] - mediabox[0]
+ @page_height = mediabox[3] - mediabox[1]
@x_offset = @runs.map(&:x).sort.first
@current_platform_is_rbx_19 = RUBY_DESCRIPTION =~ /\Arubinius 2.0.0/ &&
RUBY_VERSION >= "1.9.0"
View
3  lib/pdf/reader/page_text_receiver.rb
@@ -42,10 +42,11 @@ def page=(page)
@state = PageState.new(page)
@content = []
@characters = []
+ @mediabox = page.attributes[:MediaBox]
end
def content
- PageLayout.new(@characters).to_s
+ PageLayout.new(@characters, @mediabox).to_s
end
#####################################################
View
230 spec/page_layout_spec.rb
@@ -3,119 +3,123 @@
require "spec_helper"
describe PDF::Reader::PageLayout, "#to_s" do
- context "with no words" do
- subject { PDF::Reader::PageLayout.new([])}
-
- it "should return a correct string" do
- subject.to_s.should == ""
- end
- end
- context "with one word" do
- let!(:runs) do
- [
- PDF::Reader::TextRun.new(30, 700, 50, 12, "Hello")
- ]
- end
- subject { PDF::Reader::PageLayout.new(runs)}
-
- it "should return a correct string" do
- subject.to_s.should == "Hello"
- end
- end
- context "with one run directly below another" do
- let!(:runs) do
- [
- PDF::Reader::TextRun.new(30, 700, 50, 12, "Hello"),
- PDF::Reader::TextRun.new(30, 687, 50, 12, "World"),
- ]
- end
- subject { PDF::Reader::PageLayout.new(runs)}
-
- it "should return a correct string" do
- subject.to_s.should == "Hello\nWorld"
- end
- end
- context "with one two words on one line, separated by a font size gap" do
- let!(:runs) do
- [
- PDF::Reader::TextRun.new(30, 700, 50, 12, "Hello"),
- PDF::Reader::TextRun.new(92, 700, 50, 12, "World"),
- ]
- end
- subject { PDF::Reader::PageLayout.new(runs)}
-
- it "should return a correct string" do
- subject.to_s.should == "Hello World"
- end
- end
-
- context "with two words on one line, separated just over the mean glyph width" do
- let!(:runs) do
- [
- PDF::Reader::TextRun.new(30, 700, 50, 12, "Hello"),
- PDF::Reader::TextRun.new(91, 700, 50, 12, "World"),
- ]
- end
- subject { PDF::Reader::PageLayout.new(runs)}
-
- it "should return a correct string" do
- subject.to_s.should == "Hello World"
- end
- end
-
- context "with one two words on one line, separated just over 2x the mean glyph width" do
- let!(:runs) do
- [
- PDF::Reader::TextRun.new(30, 700, 50, 12, "Hello"),
- PDF::Reader::TextRun.new(101, 700, 50, 12, "World"),
- ]
- end
- subject { PDF::Reader::PageLayout.new(runs)}
-
- it "should return a correct string" do
- subject.to_s.should == "Hello World"
- end
- end
-
- context "with one run directly below another and indented by just over 1 font size gap" do
- let!(:runs) do
- [
- PDF::Reader::TextRun.new(30, 700, 50, 12, "Hello"),
- PDF::Reader::TextRun.new(43, 687, 50, 12, "World"),
- ]
- end
- subject { PDF::Reader::PageLayout.new(runs)}
-
- it "should return a correct string" do
- subject.to_s.should == "Hello\n World"
- end
- end
-
- context "with one run directly below another and the first indented by just over 1x fs gap" do
- let!(:runs) do
- [
- PDF::Reader::TextRun.new(43, 700, 50, 12, "Hello"),
- PDF::Reader::TextRun.new(30, 687, 50, 12, "World"),
- ]
- end
- subject { PDF::Reader::PageLayout.new(runs)}
-
- it "should return a correct string" do
- subject.to_s.should == " Hello\nWorld"
- end
- end
-
- context "with one run directly below another with 1 font size gap" do
- let!(:runs) do
- [
- PDF::Reader::TextRun.new(30, 700, 50, 12, "Hello"),
- PDF::Reader::TextRun.new(30, 676, 50, 12, "World"),
- ]
- end
- subject { PDF::Reader::PageLayout.new(runs)}
-
- it "should return a correct string" do
- subject.to_s.should == "Hello\n\nWorld"
+ context "with an A4 page" do
+ let(:mediabox) { [0, 0, 595.28, 841.89 ]}
+
+ context "with no words" do
+ subject { PDF::Reader::PageLayout.new([], mediabox)}
+
+ it "should return a correct string" do
+ subject.to_s.should == ""
+ end
+ end
+ context "with one word" do
+ let!(:runs) do
+ [
+ PDF::Reader::TextRun.new(30, 700, 50, 12, "Hello")
+ ]
+ end
+ subject { PDF::Reader::PageLayout.new(runs, mediabox)}
+
+ it "should return a correct string" do
+ subject.to_s.should == "Hello"
+ end
+ end
+ context "with one run directly below another" do
+ let!(:runs) do
+ [
+ PDF::Reader::TextRun.new(30, 700, 50, 12, "Hello"),
+ PDF::Reader::TextRun.new(30, 687, 50, 12, "World"),
+ ]
+ end
+ subject { PDF::Reader::PageLayout.new(runs, mediabox)}
+
+ it "should return a correct string" do
+ subject.to_s.should == "Hello\nWorld"
+ end
+ end
+ context "with one two words on one line, separated by a font size gap" do
+ let!(:runs) do
+ [
+ PDF::Reader::TextRun.new(30, 700, 50, 12, "Hello"),
+ PDF::Reader::TextRun.new(92, 700, 50, 12, "World"),
+ ]
+ end
+ subject { PDF::Reader::PageLayout.new(runs, mediabox)}
+
+ it "should return a correct string" do
+ subject.to_s.should == "Hello World"
+ end
+ end
+
+ context "with two words on one line, separated just over the mean glyph width" do
+ let!(:runs) do
+ [
+ PDF::Reader::TextRun.new(30, 700, 50, 12, "Hello"),
+ PDF::Reader::TextRun.new(91, 700, 50, 12, "World"),
+ ]
+ end
+ subject { PDF::Reader::PageLayout.new(runs, mediabox)}
+
+ it "should return a correct string" do
+ subject.to_s.should == "Hello World"
+ end
+ end
+
+ context "with one two words on one line, separated just over 2x the mean glyph width" do
+ let!(:runs) do
+ [
+ PDF::Reader::TextRun.new(30, 700, 50, 12, "Hello"),
+ PDF::Reader::TextRun.new(101, 700, 50, 12, "World"),
+ ]
+ end
+ subject { PDF::Reader::PageLayout.new(runs, mediabox)}
+
+ it "should return a correct string" do
+ subject.to_s.should == "Hello World"
+ end
+ end
+
+ context "with one run directly below another and indented by just over 1 font size gap" do
+ let!(:runs) do
+ [
+ PDF::Reader::TextRun.new(30, 700, 50, 12, "Hello"),
+ PDF::Reader::TextRun.new(43, 687, 50, 12, "World"),
+ ]
+ end
+ subject { PDF::Reader::PageLayout.new(runs, mediabox)}
+
+ it "should return a correct string" do
+ subject.to_s.should == "Hello\n World"
+ end
+ end
+
+ context "with one run directly below another and the first indented by just over 1x fs gap" do
+ let!(:runs) do
+ [
+ PDF::Reader::TextRun.new(43, 700, 50, 12, "Hello"),
+ PDF::Reader::TextRun.new(30, 687, 50, 12, "World"),
+ ]
+ end
+ subject { PDF::Reader::PageLayout.new(runs, mediabox)}
+
+ it "should return a correct string" do
+ subject.to_s.should == " Hello\nWorld"
+ end
+ end
+
+ context "with one run directly below another with 1 font size gap" do
+ let!(:runs) do
+ [
+ PDF::Reader::TextRun.new(30, 700, 50, 12, "Hello"),
+ PDF::Reader::TextRun.new(30, 676, 50, 12, "World"),
+ ]
+ end
+ subject { PDF::Reader::PageLayout.new(runs, mediabox)}
+
+ it "should return a correct string" do
+ subject.to_s.should == "Hello\n\nWorld"
+ end
end
end
end
Please sign in to comment.
Something went wrong with that request. Please try again.