Permalink
Browse files

expand ONIX::Normaliser to strip control chars

  • Loading branch information...
1 parent c551b84 commit d1c032b4d62f5bc3a0b02faa62bbbc5ef23f6869 @yob committed Sep 2, 2009
Showing with 112 additions and 0 deletions.
  1. +74 −0 data/control_chars.xml
  2. +14 −0 lib/onix/normaliser.rb
  3. +24 −0 spec/normaliser_spec.rb
View
@@ -0,0 +1,74 @@
+<?xml version="1.0" encoding="utf-8"?>
+<!DOCTYPE ONIXMessage SYSTEM "http://www.editeur.org/onix/2.1/reference/onix-international.dtd">
+<ONIXMessage>
+ <Header>
+ <FromCompany>TitlePage</FromCompany>
+ <FromPerson>TitlePage 02 92819788</FromPerson>
+ <FromEmail>titlepage@publishers.asn.au</FromEmail>
+ <SentDate>20080519</SentDate>
+ <MessageNote>This data is copyright to TitlePage. TitlePage makes no guarantee of the accuracy or the timeliness of production. It is supplied for your exclusive use as our customer and is only to be used for advising on the pricing and availability of publications (Authorised Purpose). You must not charge a fee for this service. TitlePage and its content may not be used for any other purpose. While it may be copied once for the authorised purpose, written permission from TitlePage must be obtained for any other use. If you were not an intended recipient, you must notify the sender and delete all copies.</MessageNote>
+ </Header>
+ <Product>
+ <RecordReference>365-9780194351898</RecordReference>
+ <NotificationType>03</NotificationType>
+ <ProductIdentifier>
+ <ProductIDType>02</ProductIDType>
+ <IDValue>0194351890</IDValue>
+ </ProductIdentifier>
+ <ProductIdentifier>
+ <ProductIDType>03</ProductIDType>
+ <IDValue>9780194351898</IDValue>
+ </ProductIdentifier>
+ <ProductIdentifier>
+ <ProductIDType>15</ProductIDType>
+ <IDValue>9780194351898</IDValue>
+ </ProductIdentifier>
+ <ProductForm>BC</ProductForm>
+ <Series>
+ <TitleOfSeries>DICTIONARIES BEGINNER TO PRE-INTERMED</TitleOfSeries>
+ </Series>
+ <Title>
+ <TitleType>01</TitleType>
+ <TitleText>OXFORD PICTURE DICTIONARY CHINESE</TitleText>
+ </Title>
+ <Website>
+ <WebsiteLink>http://www.oup.com.au/searchbuy/product.asp?ISBN=9780194351898</WebsiteLink>
+ </Website>
+ <Contributor>
+ <ContributorRole>A01</ContributorRole>
+ <PersonNameInverted>SHAPIRO</PersonNameInverted>
+ </Contributor>
+ <BICMainSubject>EB</BICMainSubject>
+ <Subject>
+ <SubjectSchemeIdentifier>14</SubjectSchemeIdentifier>
+ <SubjectCode>2ABM</SubjectCode>
+ </Subject>
+ <AudienceCode>07</AudienceCode>
+ <Imprint>
+ <ImprintName>Oxford University Press UK</ImprintName>
+ </Imprint>
+ <Publisher>
+ <PublishingRole>01</PublishingRole>
+ <PublisherName>Oxford University Press</PublisherName>
+ </Publisher>
+ <PublishingStatus>04</PublishingStatus>
+ <PublicationDate>19980901</PublicationDate>
+ <YearFirstPublished>1998</YearFirstPublished>
+ <Measure>
+ <MeasureTypeCode>08</MeasureTypeCode>
+ <Measurement>720 </Measurement>
+ <MeasureUnitCode>gr</MeasureUnitCode>
+ </Measure>
+ <SupplyDetail>
+ <SupplierName>Oxford University Press Australia and New Zealand</SupplierName>
+ <ProductAvailability>21</ProductAvailability>
+ <Stock>
+ <OnHand>70</OnHand>
+ </Stock>
+ <Price>
+ <PriceTypeCode>02</PriceTypeCode>
+ <PriceAmount>59.95</PriceAmount>
+ </Price>
+ </SupplyDetail>
+ </Product>
+</ONIXMessage>
View
@@ -43,6 +43,7 @@ def initialize(oldfile, newfile)
raise "isutf8 app not found" unless app_available?("isutf8")
raise "iconv app not found" unless app_available?("iconv")
raise "sed app not found" unless app_available?("sed")
+ raise "tr app not found" unless app_available?("tr")
@oldfile = oldfile
@newfile = newfile
@@ -64,6 +65,11 @@ def run
to_utf8(@curfile, dest)
@curfile = dest
+ # remove control chars
+ dest = next_tempfile
+ remove_control_chars(@curfile, dest)
+ @curfile = dest
+
# remove entities
replace_named_entities(@curfile)
@@ -137,6 +143,14 @@ def to_utf8(src, dest)
end
end
# XML files shouldn't contain low ASCII control chars. Strip them.
#
# Copies +src+ to +dest+ with bytes 0x00-0x08, 0x0B, 0x0C and 0x0E-0x1F
# deleted. TAB (0x09), LF (0x0A) and CR (0x0D) are left in place.
#
# src  - path of the file to read
# dest - path of the file to write (created or truncated)
#
# Raises a RuntimeError if tr cannot be started or exits with a non-zero
# status, so a broken intermediate file is never passed on silently.
#
def remove_control_chars(src, dest)
  inpath  = File.expand_path(src)
  outpath = File.expand_path(dest)

  # Run tr directly with stdin/stdout redirected to the files rather than
  # building a shell pipeline: no shell ever parses the paths, so spaces
  # and metacharacters in them are harmless, and no extra cat is spawned.
  # The single-quoted set reaches tr verbatim, backslashes included.
  stripped = system("tr", "-d", '\000-\010\013\014\016-\037',
                    :in => inpath, :out => outpath)
  raise "tr failed to remove control chars from #{inpath}" unless stripped
end
+
# replace all named entities in the specified file with
# numeric entities.
#
View
@@ -99,3 +99,27 @@
content.include?("encoding=\"UTF-8\"").should be_true
end
end
+
# Exercises ONIX::Normaliser.process against data/control_chars.xml and
# checks the normalised copy written next to it (same name plus ".new").
context "ONIX::Normaliser", "with a utf8 file that has illegal control chars" do

  before(:each) do
    @data_path = File.join(File.dirname(__FILE__), "..", "data")
    @filename  = File.join(@data_path, "control_chars.xml")
    @outfile   = @filename + ".new"
  end

  # remove the generated output so every example starts from a clean slate
  after(:each) do
    File.unlink(@outfile) if File.file?(@outfile)
  end

  # the normalised output must contain no low ASCII control chars other
  # than TAB, LF and CR
  specify "should remove all control chars except LF, CR and TAB" do
    ONIX::Normaliser.process(@filename, @outfile)

    File.file?(@outfile).should be_true
    content = File.read(@outfile)

    # the title comes out with the two words joined, nothing between them
    content.include?("<TitleText>OXFORDPICTURE DICTIONARY CHINESE</TitleText>").should be_true

    # and no forbidden control byte survives anywhere else in the file
    (content =~ /[\x00-\x08\x0B\x0C\x0E-\x1F]/).should be_nil
  end
end

0 comments on commit d1c032b

Please sign in to comment.