Permalink
Browse files

Auto-detect record-separator to handle different encodings (\r, \n, a…

…nd \r\n); auto-join CONT tags to previous tag
  • Loading branch information...
1 parent 3c1f7e9 commit 343317b1fd003b036b7839e39619f5d73f006214 @jslade jslade committed Dec 11, 2008
Showing with 100 additions and 5 deletions.
  1. +64 −3 lib/gedcom.rb
  2. +36 −2 tests/parser_spec.rb
View
@@ -18,8 +18,10 @@
# -------------------------------------------------------------------------
require 'gedcom_date'
+require 'stringio'
module GEDCOM
+ attr_accessor :auto_concat
# Possibly a better way to do this?
VERSION = "0.2.1"
@@ -31,6 +33,9 @@ def initialize &block
@ctxStack = []
@dataStack = []
@curlvl = -1
+
+ @auto_concat = true
+
instance_eval(&block) if block_given?
end
@@ -47,7 +52,11 @@ def after tag, proc=nil, &block
def parse( file )
case file
when String
- parse_file(file)
+ if file =~ /\n/mo
+ parse_string(file)
+ else
+ parse_file(file)
+ end
when IO
parse_io(file)
else
@@ -76,10 +85,22 @@ def parse_file(file)
end
end
+ def parse_string(str)
+ parse_io(StringIO.new(str))
+ end
+
def parse_io(io)
- io.each_line do |line|
+ rs = detect_rs(io)
+ io.each_line(rs) do |line|
level, tag, rest = line.chop.split( ' ', 3 )
+ next if level.nil? or tag.nil?
level = level.to_i
+
+ if tag == 'CONT' and @auto_concat
+ concat_data rest
+ next
+ end
+
unwind_to level
tag, rest = rest, tag if tag =~ /@.*@/
@@ -94,14 +115,26 @@ def parse_io(io)
end
def unwind_to level
- while level <= @curlvl
+ while @curlvl >= level
do_after @ctxStack, @dataStack.last
@ctxStack.pop
@dataStack.pop
@curlvl -= 1
end
end
+ def concat_data rest
+ if @dataStack[-1].nil?
+ @dataStack[-1] = rest
+ else
+ if @ctxStack[-1] == 'BLOB'
+ @dataStack[-1] << rest
+ else
+ @dataStack[-1] << "\n" + rest
+ end
+ end
+ end
+
def do_before tag, data
if proc = @before[tag]
proc.call data
@@ -119,6 +152,34 @@ def do_after tag, data
end
ANY = [:any]
+
+ # Valid GEDCOM may use either \r or \r\n as the record separator.
+ # Just in case, a bare \n is also detected as the separator.
+ # Detects the record separator (rs) for this IO by scanning ahead to the
+ # first occurrence of either \r or \n and, for \r, checking the character after it.
+ def detect_rs io
+ rs = "\x0d"
+ mark = io.pos
+ begin
+ while ch = io.readchar
+ case ch
+ when 0x0d
+ ch2 = io.readchar
+ if ch2 == 0x0a
+ rs = "\x0d\x0a"
+ end
+ break
+ when 0x0a
+ rs = "\x0a"
+ break
+ end
+ end
+ ensure
+ io.pos = mark
+ end
+ rs
+ end
+
end #/ Parser
end #/ GEDCOM
View
@@ -37,6 +37,11 @@
count_after.should == 3
end
+ it "should auto-concatenate text" do
+ @parser.after %w(SUBM NAME ADDR) do |text|
+ text.should == "Submitters address\naddress continued here"
+ end
+ end
it "should unwind all the way" do
after_trlr = false
@@ -50,15 +55,44 @@
it "should use :any as default" do
@parser.parse SIMPLE
- @tag_count[:all].should == 48
+ @tag_count[:all].should == 47
@tag_count['INDI'].should == 3
@tag_count['FAM'].should == 1
@tag_count['FAM_MARR_DATE'].should == 1
end
+ it "should handle empty gedcom" do
+ @parser.parse "\n"
+ @tag_count[:all].should == 0
+ end
+
+ it "should parse TGC551.ged (\\r)" do
+ @parser.parse "#{GEDCOMS}/TGC551.ged"
+ @tag_count[:all].should == 1653
+ end
- it "should parse torture-test cases okay" do
+ it "should parse TGC551LF.ged (\\r\\n)" do
+ @parser.parse "#{GEDCOMS}/TGC551LF.ged"
+ @tag_count[:all].should == 1653
end
+ it "should parse TGC55C.ged (\\r)" do
+ @parser.parse "#{GEDCOMS}/TGC55C.ged"
+ @tag_count[:all].should == 1684
+ end
+
+ it "should parse TGC55CLF.ged (\\r\\n) with auto-concat" do
+ @parser.parse "#{GEDCOMS}/TGC55CLF.ged"
+ @parser.after %w(OBJE BLOB) do |data|
+ data.size.should == 458
+ end
+ @tag_count[:all].should == 1684
+ end
+
+ it "should parse TGC55CLF.ged (\\r\\n) without auto-concat" do
+ @parser.auto_concat = false
+ @parser.parse "#{GEDCOMS}/TGC55CLF.ged"
+ @tag_count[:all].should == 2197
+ end
end

0 comments on commit 343317b

Please sign in to comment.