Permalink
Browse files

rough experiments in parsing speed with treetop

  • Loading branch information...
1 parent 0b64809 commit 32671686b72582888ab98b9fbd7dadfb47e8763f @yob committed Jun 9, 2012
Showing with 39 additions and 4 deletions.
  1. +16 −3 lib/pdf/reader/new_parser.rb
  2. +1 −1 lib/pdf/reader/pdf.treetop
  3. +22 −0 tools/bench2.rb
@@ -90,7 +90,7 @@ def to_ruby
class NewParser
Treetop.load(File.join(File.dirname(__FILE__), 'pdf.treetop'))
- attr_reader :pos
+ attr_reader :pos, :count
def initialize(data)
@data = data
@@ -99,22 +99,35 @@ def initialize(data)
@parser.root = :content_stream
@pos = 0
@tokens = []
+ @count = 0
end
def next_token
- 100.times { prepare_tokens } if @tokens.size <= 3
+ 100.times { prepare_tokens } if @tokens.size <= 3 && @pos < @data.bytesize
@tokens.shift
end
def all_tokens
- 100.times { prepare_tokens } if @tokens.size <= 3
+ #100.times { prepare_tokens } if @tokens.size <= 3 && @pos < @data.bytesize
+ @parser.consume_all_input = true
+ tree = @parser.parse(@data, index: @pos)
+ if tree
+ @tokens = tree.elements.select { |obj|
+ obj.respond_to?(:to_ruby)
+ }.map(&:to_ruby)
+ else
+ # If the AST is nil, there was an error during parsing, so
+ # we need to report a simple error message to help the user.
+ raise Exception, "Parse error at offset: #{@parser.index}"
+ end
@tokens
end
private
def prepare_tokens
return if @pos >= @data.bytesize
+ @count += 1
token = @parser.parse(@data, index: @pos)
@pos = @parser.index
@@ -13,7 +13,7 @@ grammar Pdf
end
rule content_stream
- (comment / base_object / operator / separator)
+ (comment / base_object / operator / separator)*
end
#---------------------------------------------
View
@@ -0,0 +1,22 @@
+# coding: utf-8
+
+
+require 'pdf/reader'
+require 'benchmark'
+require 'stringio'
+
+Benchmark.bm(7) do |x|
+ x.report("Parser") do
+ 1000.times do
+ buf = PDF::Reader::Buffer.new(StringIO.new("1 q Q"))
+ PDF::Reader::Parser.new(buf).parse_token
+ end
+ end
+ x.report("NewParser") do
+ 1000.times do
+ parser = PDF::Reader::NewParser.new("1 q Q")
+ #parser.next_token
+ parser.all_tokens
+ end
+ end
+end

0 comments on commit 3267168

Please sign in to comment.