Permalink
Browse files

spec behaviour of ONIX::Reader when no encoding declaration exists in an ISO-8859-1 file

* Nokogiri prints debugging info to stderr and then raises an exception.
* This isn't ideal. I'd much prefer to give the user an option to
  replace invalid bytes with a valid character, but I haven't worked
  out how to do that yet.
  • Loading branch information...
1 parent 1d637d3 commit 00e1596a77a2aa96bc385e1b1a12645c2d85e8b6 @yob committed Apr 14, 2011
Showing with 112 additions and 11 deletions.
  1. +96 −0 data/aau.xml
  2. +16 −11 spec/reader_spec.rb
View
96 data/aau.xml
@@ -0,0 +1,96 @@
+<?xml version="1.0" ?>
+<!DOCTYPE ONIXMessage SYSTEM "http://www.editeur.org/onix/2.1/02/reference/onix-international.dtd">
+<ONIXMessage>
+ <Header>
+ <FromCompany>AllenandUnwin</FromCompany>
+ <SentDate>20110411</SentDate>
+ </Header>
+ <Product>
+ <RecordReference>9780747595175</RecordReference>
+ <NotificationType></NotificationType>
+ <ProductIdentifier>
+ <ProductIDType>03</ProductIDType>
+ <IDValue>9780747595175</IDValue>
+ </ProductIdentifier>
+ <ProductIdentifier>
+ <ProductIDType>15</ProductIDType>
+ <IDValue>9780747595175</IDValue>
+ </ProductIdentifier>
+ <ProductForm>BC</ProductForm>
+ <ProductFormDetail>B106</ProductFormDetail>
+ <Title>
+ <TitleType>01</TitleType>
+ <TitleText>Lost World</TitleText>
+ <TitlePrefix></TitlePrefix>
+ <TitleWithoutPrefix>Lost World</TitleWithoutPrefix>
+ </Title>
+ <Website>
+ <WebsiteLink>http://www.allenandunwin.com/default.aspx?page=94&amp;book=9780747595175</WebsiteLink>
+ </Website>
+ <MediaFile>
+ <MediaFileTypeCode>04</MediaFileTypeCode>
+ <MediaFileLinkTypeCode>01</MediaFileLinkTypeCode>
+ <MediaFileLink>http://www.allenandunwin.com/BookCovers/resized_9780747595175_224_297_FitSquare.jpg</MediaFileLink>
+ </MediaFile>
+ <Contributor>
+ <SequenceNumber>001</SequenceNumber>
+ <ContributorRole>A01</ContributorRole>
+ <PersonName></PersonName>
+ <PersonNameInverted>Melo,Patr¡cia</PersonNameInverted>
+ </Contributor>
+ <EditionNumber>1</EditionNumber>
+ <NumberOfPages>224</NumberOfPages>
+ <BICMainSubject>FH</BICMainSubject>
+ <AudienceCode></AudienceCode>
+ <OtherText>
+ <TextTypeCode>01</TextTypeCode>
+ <Text>Maiquel is an ex-contract killer who's been a fugitive for ten years - ever since his girlfriend Erica ran off with his daughter Samantha, took up with an evangelical pastor and disappeared as completely from his life as Maiquel himself has disappeared from the front pages of the Brazilian newspapers. Then his aunt dies, leaving him a house and a savings account and a fresh chance to find the lost world of his onetime family. Converting his new assets to cash and breaking all the rules in the book (including his own), Maiquel sets out to find the man who stole his girlfriend and daughter.</Text>
+ </OtherText>
+ <Imprint>
+ <ImprintName>Bloomsbury</ImprintName>
+ </Imprint>
+ <Publisher>
+ <PublishingRole>01</PublishingRole>
+ <PublisherName>Allen &amp; Unwin</PublisherName>
+ </Publisher>
+ <PublishingStatus>07</PublishingStatus>
+ <PublicationDate>20091201</PublicationDate>
+ <YearFirstPublished>2009</YearFirstPublished>
+ <Measure>
+ <MeasureTypeCode>01</MeasureTypeCode>
+ <Measurement>214</Measurement>
+ <MeasureUnitCode>mm</MeasureUnitCode>
+ </Measure>
+ <Measure>
+ <MeasureTypeCode>02</MeasureTypeCode>
+ <Measurement>135</Measurement>
+ <MeasureUnitCode>mm</MeasureUnitCode>
+ </Measure>
+ <Measure>
+ <MeasureTypeCode>08</MeasureTypeCode>
+ <Measurement>246</Measurement>
+ <MeasureUnitCode>gr</MeasureUnitCode>
+ </Measure>
+ <SupplyDetail>
+ <SupplierName>United Book Distributors</SupplierName>
+ <SupplierRole>02</SupplierRole>
+ <ProductAvailability>40</ProductAvailability>
+ <ExpectedShipDate></ExpectedShipDate>
+ <PackQuantity>21</PackQuantity>
+ <Price>
+ <PriceTypeCode>02</PriceTypeCode>
+ <PriceAmount>32.99</PriceAmount>
+ </Price>
+ </SupplyDetail>
+ <MarketRepresentation>
+ <AgentName>Allen &amp; Unwin</AgentName>
+ <AgentRole>07</AgentRole>
+ <MarketCountry>AU</MarketCountry>
+ <MarketPublishingStatus>07</MarketPublishingStatus>
+ <MarketDate>
+ <MarketDateRole>01</MarketDateRole>
+ <Date>20091201</Date>
+ </MarketDate>
+ </MarketRepresentation>
+ </Product>
+</ONIXMessage>
View
27 spec/reader_spec.rb
@@ -12,6 +12,7 @@
@entity_file = File.join(@data_path, "entities.xml")
@utf_16_file = File.join(@data_path, "utf_16.xml")
@iso_8859_1_file = File.join(@data_path, "iso_8859_1.xml")
+ @no_encoding_decl_file = File.join(@data_path, "aau.xml")
end
it "should initialize with a filename" do
@@ -90,19 +91,24 @@
it "should transparently convert a iso-8859-1 file to utf-8" do
reader = ONIX::Reader.new(@iso_8859_1_file)
- product = nil
- reader.each do |p|
- product = p
- end
+ reader.each do |product|
+ if RUBY_VERSION >= "1.9"
+ utf8 = Encoding.find("utf-8")
+ product.contributors[0].person_name_inverted.encoding.should eql(utf8)
+ end
- # ROXML appears to munge the string encodings
- if RUBY_VERSION >= "1.9"
- utf8 = Encoding.find("utf-8")
- product.contributors[0].person_name_inverted.encoding.should eql(utf8)
+ product.contributors[0].person_name_inverted.should eql("Küng, Hans")
end
+ end
- product.contributors[0].person_name_inverted.should eql("Küng, Hans")
-
+ # This isn't ideal behaviour, but I'm somewhat hamstrung by the Nokogiri API. It'd
+ # be nice to have the option to replace unrecognised bytes with a valid char.
+ it "should raise an exception when an iso-8859-1 file isn't declared as such" do
+ reader = ONIX::Reader.new(@no_encoding_decl_file)
+ lambda {
+ reader.each do |product|
+ end
+ }.should raise_error(Nokogiri::XML::SyntaxError)
end
it "should transparently convert a utf-16 file to utf-8" do
@@ -119,6 +125,5 @@
end
product.contributors[0].person_name_inverted.should eql("Küng, Hans")
-
end
end

0 comments on commit 00e1596

Please sign in to comment.